Stewart Slocum
Add fine-tuned model
67b9c0e
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 713,
"global_step": 713,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001402524544179523,
"grad_norm": 0.89453125,
"learning_rate": 1e-05,
"loss": 2.048,
"step": 1
},
{
"epoch": 0.002805049088359046,
"grad_norm": 0.765625,
"learning_rate": 9.985974754558206e-06,
"loss": 2.0929,
"step": 2
},
{
"epoch": 0.004207573632538569,
"grad_norm": 0.84765625,
"learning_rate": 9.97194950911641e-06,
"loss": 1.8896,
"step": 3
},
{
"epoch": 0.005610098176718092,
"grad_norm": 0.7734375,
"learning_rate": 9.957924263674615e-06,
"loss": 2.0957,
"step": 4
},
{
"epoch": 0.0070126227208976155,
"grad_norm": 0.79296875,
"learning_rate": 9.94389901823282e-06,
"loss": 1.8981,
"step": 5
},
{
"epoch": 0.008415147265077139,
"grad_norm": 0.66015625,
"learning_rate": 9.929873772791025e-06,
"loss": 2.0162,
"step": 6
},
{
"epoch": 0.009817671809256662,
"grad_norm": 0.67578125,
"learning_rate": 9.915848527349229e-06,
"loss": 1.7155,
"step": 7
},
{
"epoch": 0.011220196353436185,
"grad_norm": 0.6875,
"learning_rate": 9.901823281907434e-06,
"loss": 2.0333,
"step": 8
},
{
"epoch": 0.012622720897615708,
"grad_norm": 0.462890625,
"learning_rate": 9.887798036465639e-06,
"loss": 1.6522,
"step": 9
},
{
"epoch": 0.014025245441795231,
"grad_norm": 1.765625,
"learning_rate": 9.873772791023844e-06,
"loss": 1.8498,
"step": 10
},
{
"epoch": 0.015427769985974754,
"grad_norm": 0.5546875,
"learning_rate": 9.859747545582048e-06,
"loss": 1.9811,
"step": 11
},
{
"epoch": 0.016830294530154277,
"grad_norm": 0.67578125,
"learning_rate": 9.845722300140253e-06,
"loss": 1.6495,
"step": 12
},
{
"epoch": 0.0182328190743338,
"grad_norm": 0.37890625,
"learning_rate": 9.831697054698458e-06,
"loss": 1.8677,
"step": 13
},
{
"epoch": 0.019635343618513323,
"grad_norm": 0.4765625,
"learning_rate": 9.817671809256662e-06,
"loss": 1.998,
"step": 14
},
{
"epoch": 0.021037868162692847,
"grad_norm": 0.380859375,
"learning_rate": 9.803646563814867e-06,
"loss": 1.8926,
"step": 15
},
{
"epoch": 0.02244039270687237,
"grad_norm": 0.36328125,
"learning_rate": 9.789621318373072e-06,
"loss": 1.7889,
"step": 16
},
{
"epoch": 0.023842917251051893,
"grad_norm": 0.30859375,
"learning_rate": 9.775596072931277e-06,
"loss": 1.7603,
"step": 17
},
{
"epoch": 0.025245441795231416,
"grad_norm": 0.4609375,
"learning_rate": 9.761570827489481e-06,
"loss": 2.0804,
"step": 18
},
{
"epoch": 0.02664796633941094,
"grad_norm": 0.265625,
"learning_rate": 9.747545582047686e-06,
"loss": 1.8,
"step": 19
},
{
"epoch": 0.028050490883590462,
"grad_norm": 0.2578125,
"learning_rate": 9.733520336605891e-06,
"loss": 1.5292,
"step": 20
},
{
"epoch": 0.029453015427769985,
"grad_norm": 0.2734375,
"learning_rate": 9.719495091164096e-06,
"loss": 1.5226,
"step": 21
},
{
"epoch": 0.030855539971949508,
"grad_norm": 0.28125,
"learning_rate": 9.7054698457223e-06,
"loss": 1.8072,
"step": 22
},
{
"epoch": 0.03225806451612903,
"grad_norm": 0.34765625,
"learning_rate": 9.691444600280505e-06,
"loss": 1.8934,
"step": 23
},
{
"epoch": 0.033660589060308554,
"grad_norm": 0.248046875,
"learning_rate": 9.67741935483871e-06,
"loss": 1.4903,
"step": 24
},
{
"epoch": 0.03506311360448808,
"grad_norm": 0.1728515625,
"learning_rate": 9.663394109396916e-06,
"loss": 1.4165,
"step": 25
},
{
"epoch": 0.0364656381486676,
"grad_norm": 0.19921875,
"learning_rate": 9.649368863955119e-06,
"loss": 1.5423,
"step": 26
},
{
"epoch": 0.037868162692847124,
"grad_norm": 0.287109375,
"learning_rate": 9.635343618513324e-06,
"loss": 1.6054,
"step": 27
},
{
"epoch": 0.03927068723702665,
"grad_norm": 0.328125,
"learning_rate": 9.62131837307153e-06,
"loss": 1.9075,
"step": 28
},
{
"epoch": 0.04067321178120617,
"grad_norm": 0.298828125,
"learning_rate": 9.607293127629735e-06,
"loss": 1.7197,
"step": 29
},
{
"epoch": 0.04207573632538569,
"grad_norm": 0.1884765625,
"learning_rate": 9.593267882187938e-06,
"loss": 1.52,
"step": 30
},
{
"epoch": 0.043478260869565216,
"grad_norm": 0.240234375,
"learning_rate": 9.579242636746143e-06,
"loss": 1.5502,
"step": 31
},
{
"epoch": 0.04488078541374474,
"grad_norm": 0.275390625,
"learning_rate": 9.565217391304349e-06,
"loss": 1.7388,
"step": 32
},
{
"epoch": 0.04628330995792426,
"grad_norm": 0.25390625,
"learning_rate": 9.551192145862554e-06,
"loss": 1.6555,
"step": 33
},
{
"epoch": 0.047685834502103785,
"grad_norm": 0.1904296875,
"learning_rate": 9.537166900420757e-06,
"loss": 1.5854,
"step": 34
},
{
"epoch": 0.04908835904628331,
"grad_norm": 0.19140625,
"learning_rate": 9.523141654978963e-06,
"loss": 1.6306,
"step": 35
},
{
"epoch": 0.05049088359046283,
"grad_norm": 0.1416015625,
"learning_rate": 9.509116409537168e-06,
"loss": 1.3931,
"step": 36
},
{
"epoch": 0.051893408134642355,
"grad_norm": 0.3046875,
"learning_rate": 9.495091164095373e-06,
"loss": 1.8825,
"step": 37
},
{
"epoch": 0.05329593267882188,
"grad_norm": 0.1748046875,
"learning_rate": 9.481065918653577e-06,
"loss": 1.1516,
"step": 38
},
{
"epoch": 0.0546984572230014,
"grad_norm": 0.203125,
"learning_rate": 9.467040673211782e-06,
"loss": 1.5705,
"step": 39
},
{
"epoch": 0.056100981767180924,
"grad_norm": 0.1591796875,
"learning_rate": 9.453015427769987e-06,
"loss": 1.4537,
"step": 40
},
{
"epoch": 0.05750350631136045,
"grad_norm": 0.197265625,
"learning_rate": 9.438990182328192e-06,
"loss": 1.3941,
"step": 41
},
{
"epoch": 0.05890603085553997,
"grad_norm": 0.2431640625,
"learning_rate": 9.424964936886396e-06,
"loss": 1.7377,
"step": 42
},
{
"epoch": 0.06030855539971949,
"grad_norm": 0.201171875,
"learning_rate": 9.410939691444601e-06,
"loss": 1.5313,
"step": 43
},
{
"epoch": 0.061711079943899017,
"grad_norm": 0.1962890625,
"learning_rate": 9.396914446002806e-06,
"loss": 1.451,
"step": 44
},
{
"epoch": 0.06311360448807854,
"grad_norm": 0.171875,
"learning_rate": 9.382889200561011e-06,
"loss": 1.5415,
"step": 45
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.294921875,
"learning_rate": 9.368863955119215e-06,
"loss": 1.6808,
"step": 46
},
{
"epoch": 0.06591865357643759,
"grad_norm": 0.1708984375,
"learning_rate": 9.35483870967742e-06,
"loss": 1.5434,
"step": 47
},
{
"epoch": 0.06732117812061711,
"grad_norm": 0.1728515625,
"learning_rate": 9.340813464235625e-06,
"loss": 1.5043,
"step": 48
},
{
"epoch": 0.06872370266479663,
"grad_norm": 0.203125,
"learning_rate": 9.32678821879383e-06,
"loss": 1.6039,
"step": 49
},
{
"epoch": 0.07012622720897616,
"grad_norm": 0.2314453125,
"learning_rate": 9.312762973352034e-06,
"loss": 1.5649,
"step": 50
},
{
"epoch": 0.07152875175315568,
"grad_norm": 0.2080078125,
"learning_rate": 9.298737727910239e-06,
"loss": 1.5116,
"step": 51
},
{
"epoch": 0.0729312762973352,
"grad_norm": 0.1650390625,
"learning_rate": 9.284712482468444e-06,
"loss": 1.469,
"step": 52
},
{
"epoch": 0.07433380084151472,
"grad_norm": 0.14453125,
"learning_rate": 9.27068723702665e-06,
"loss": 1.4499,
"step": 53
},
{
"epoch": 0.07573632538569425,
"grad_norm": 0.201171875,
"learning_rate": 9.256661991584853e-06,
"loss": 1.5955,
"step": 54
},
{
"epoch": 0.07713884992987377,
"grad_norm": 0.2314453125,
"learning_rate": 9.242636746143058e-06,
"loss": 1.5542,
"step": 55
},
{
"epoch": 0.0785413744740533,
"grad_norm": 0.322265625,
"learning_rate": 9.228611500701263e-06,
"loss": 1.3431,
"step": 56
},
{
"epoch": 0.07994389901823282,
"grad_norm": 0.1396484375,
"learning_rate": 9.214586255259467e-06,
"loss": 1.5128,
"step": 57
},
{
"epoch": 0.08134642356241234,
"grad_norm": 0.12060546875,
"learning_rate": 9.200561009817672e-06,
"loss": 1.3224,
"step": 58
},
{
"epoch": 0.08274894810659186,
"grad_norm": 0.1474609375,
"learning_rate": 9.186535764375877e-06,
"loss": 1.4161,
"step": 59
},
{
"epoch": 0.08415147265077139,
"grad_norm": 0.265625,
"learning_rate": 9.172510518934083e-06,
"loss": 1.4785,
"step": 60
},
{
"epoch": 0.08555399719495091,
"grad_norm": 0.298828125,
"learning_rate": 9.158485273492286e-06,
"loss": 1.555,
"step": 61
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.126953125,
"learning_rate": 9.144460028050491e-06,
"loss": 1.3373,
"step": 62
},
{
"epoch": 0.08835904628330996,
"grad_norm": 0.169921875,
"learning_rate": 9.130434782608697e-06,
"loss": 1.4302,
"step": 63
},
{
"epoch": 0.08976157082748948,
"grad_norm": 0.1904296875,
"learning_rate": 9.116409537166902e-06,
"loss": 1.4033,
"step": 64
},
{
"epoch": 0.091164095371669,
"grad_norm": 0.138671875,
"learning_rate": 9.102384291725105e-06,
"loss": 1.3426,
"step": 65
},
{
"epoch": 0.09256661991584852,
"grad_norm": 0.216796875,
"learning_rate": 9.08835904628331e-06,
"loss": 1.5167,
"step": 66
},
{
"epoch": 0.09396914446002805,
"grad_norm": 0.154296875,
"learning_rate": 9.074333800841516e-06,
"loss": 1.3468,
"step": 67
},
{
"epoch": 0.09537166900420757,
"grad_norm": 0.1748046875,
"learning_rate": 9.060308555399721e-06,
"loss": 1.4049,
"step": 68
},
{
"epoch": 0.0967741935483871,
"grad_norm": 0.1513671875,
"learning_rate": 9.046283309957924e-06,
"loss": 1.4426,
"step": 69
},
{
"epoch": 0.09817671809256662,
"grad_norm": 0.138671875,
"learning_rate": 9.03225806451613e-06,
"loss": 1.3865,
"step": 70
},
{
"epoch": 0.09957924263674614,
"grad_norm": 0.197265625,
"learning_rate": 9.018232819074335e-06,
"loss": 1.5355,
"step": 71
},
{
"epoch": 0.10098176718092566,
"grad_norm": 0.1875,
"learning_rate": 9.00420757363254e-06,
"loss": 1.757,
"step": 72
},
{
"epoch": 0.10238429172510519,
"grad_norm": 0.1484375,
"learning_rate": 8.990182328190744e-06,
"loss": 1.5123,
"step": 73
},
{
"epoch": 0.10378681626928471,
"grad_norm": 0.19921875,
"learning_rate": 8.976157082748949e-06,
"loss": 1.4946,
"step": 74
},
{
"epoch": 0.10518934081346423,
"grad_norm": 0.28125,
"learning_rate": 8.962131837307152e-06,
"loss": 1.3608,
"step": 75
},
{
"epoch": 0.10659186535764376,
"grad_norm": 0.1005859375,
"learning_rate": 8.94810659186536e-06,
"loss": 1.1957,
"step": 76
},
{
"epoch": 0.10799438990182328,
"grad_norm": 0.125,
"learning_rate": 8.934081346423563e-06,
"loss": 1.2561,
"step": 77
},
{
"epoch": 0.1093969144460028,
"grad_norm": 0.11865234375,
"learning_rate": 8.920056100981768e-06,
"loss": 1.2952,
"step": 78
},
{
"epoch": 0.11079943899018233,
"grad_norm": 0.1513671875,
"learning_rate": 8.906030855539971e-06,
"loss": 1.2421,
"step": 79
},
{
"epoch": 0.11220196353436185,
"grad_norm": 0.125,
"learning_rate": 8.892005610098178e-06,
"loss": 1.4074,
"step": 80
},
{
"epoch": 0.11360448807854137,
"grad_norm": 0.123046875,
"learning_rate": 8.877980364656382e-06,
"loss": 1.2973,
"step": 81
},
{
"epoch": 0.1150070126227209,
"grad_norm": 0.11083984375,
"learning_rate": 8.863955119214587e-06,
"loss": 1.3492,
"step": 82
},
{
"epoch": 0.11640953716690042,
"grad_norm": 0.1455078125,
"learning_rate": 8.84992987377279e-06,
"loss": 1.2249,
"step": 83
},
{
"epoch": 0.11781206171107994,
"grad_norm": 0.1220703125,
"learning_rate": 8.835904628330997e-06,
"loss": 1.4241,
"step": 84
},
{
"epoch": 0.11921458625525946,
"grad_norm": 0.12255859375,
"learning_rate": 8.821879382889201e-06,
"loss": 1.3361,
"step": 85
},
{
"epoch": 0.12061711079943899,
"grad_norm": 0.11474609375,
"learning_rate": 8.807854137447406e-06,
"loss": 1.4971,
"step": 86
},
{
"epoch": 0.12201963534361851,
"grad_norm": 0.109375,
"learning_rate": 8.79382889200561e-06,
"loss": 1.1059,
"step": 87
},
{
"epoch": 0.12342215988779803,
"grad_norm": 0.2099609375,
"learning_rate": 8.779803646563817e-06,
"loss": 1.5296,
"step": 88
},
{
"epoch": 0.12482468443197756,
"grad_norm": 0.18359375,
"learning_rate": 8.76577840112202e-06,
"loss": 1.5274,
"step": 89
},
{
"epoch": 0.12622720897615708,
"grad_norm": 0.162109375,
"learning_rate": 8.751753155680225e-06,
"loss": 1.3704,
"step": 90
},
{
"epoch": 0.1276297335203366,
"grad_norm": 0.15625,
"learning_rate": 8.737727910238429e-06,
"loss": 1.3388,
"step": 91
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.1298828125,
"learning_rate": 8.723702664796636e-06,
"loss": 1.3987,
"step": 92
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.1279296875,
"learning_rate": 8.70967741935484e-06,
"loss": 1.2487,
"step": 93
},
{
"epoch": 0.13183730715287517,
"grad_norm": 0.181640625,
"learning_rate": 8.695652173913044e-06,
"loss": 1.345,
"step": 94
},
{
"epoch": 0.1332398316970547,
"grad_norm": 0.125,
"learning_rate": 8.681626928471248e-06,
"loss": 1.3105,
"step": 95
},
{
"epoch": 0.13464235624123422,
"grad_norm": 0.1513671875,
"learning_rate": 8.667601683029455e-06,
"loss": 1.3739,
"step": 96
},
{
"epoch": 0.13604488078541374,
"grad_norm": 0.1123046875,
"learning_rate": 8.653576437587658e-06,
"loss": 1.2664,
"step": 97
},
{
"epoch": 0.13744740532959326,
"grad_norm": 0.185546875,
"learning_rate": 8.639551192145864e-06,
"loss": 1.2584,
"step": 98
},
{
"epoch": 0.1388499298737728,
"grad_norm": 0.1865234375,
"learning_rate": 8.625525946704067e-06,
"loss": 1.356,
"step": 99
},
{
"epoch": 0.1402524544179523,
"grad_norm": 0.14453125,
"learning_rate": 8.611500701262272e-06,
"loss": 1.3034,
"step": 100
},
{
"epoch": 0.14165497896213183,
"grad_norm": 0.201171875,
"learning_rate": 8.597475455820477e-06,
"loss": 1.3498,
"step": 101
},
{
"epoch": 0.14305750350631136,
"grad_norm": 0.1748046875,
"learning_rate": 8.583450210378683e-06,
"loss": 1.3575,
"step": 102
},
{
"epoch": 0.14446002805049088,
"grad_norm": 0.1259765625,
"learning_rate": 8.569424964936886e-06,
"loss": 1.3232,
"step": 103
},
{
"epoch": 0.1458625525946704,
"grad_norm": 0.1328125,
"learning_rate": 8.555399719495091e-06,
"loss": 1.3514,
"step": 104
},
{
"epoch": 0.14726507713884993,
"grad_norm": 0.142578125,
"learning_rate": 8.541374474053297e-06,
"loss": 1.2854,
"step": 105
},
{
"epoch": 0.14866760168302945,
"grad_norm": 0.134765625,
"learning_rate": 8.527349228611502e-06,
"loss": 1.365,
"step": 106
},
{
"epoch": 0.15007012622720897,
"grad_norm": 0.1474609375,
"learning_rate": 8.513323983169705e-06,
"loss": 1.2432,
"step": 107
},
{
"epoch": 0.1514726507713885,
"grad_norm": 0.1708984375,
"learning_rate": 8.49929873772791e-06,
"loss": 1.417,
"step": 108
},
{
"epoch": 0.15287517531556802,
"grad_norm": 0.1357421875,
"learning_rate": 8.485273492286116e-06,
"loss": 1.1604,
"step": 109
},
{
"epoch": 0.15427769985974754,
"grad_norm": 0.11474609375,
"learning_rate": 8.471248246844321e-06,
"loss": 1.4095,
"step": 110
},
{
"epoch": 0.15568022440392706,
"grad_norm": 0.1494140625,
"learning_rate": 8.457223001402524e-06,
"loss": 1.429,
"step": 111
},
{
"epoch": 0.1570827489481066,
"grad_norm": 0.1416015625,
"learning_rate": 8.44319775596073e-06,
"loss": 1.3476,
"step": 112
},
{
"epoch": 0.1584852734922861,
"grad_norm": 0.10546875,
"learning_rate": 8.429172510518935e-06,
"loss": 1.2188,
"step": 113
},
{
"epoch": 0.15988779803646563,
"grad_norm": 0.0888671875,
"learning_rate": 8.41514726507714e-06,
"loss": 1.1952,
"step": 114
},
{
"epoch": 0.16129032258064516,
"grad_norm": 0.11962890625,
"learning_rate": 8.401122019635344e-06,
"loss": 1.1998,
"step": 115
},
{
"epoch": 0.16269284712482468,
"grad_norm": 0.11572265625,
"learning_rate": 8.387096774193549e-06,
"loss": 1.2351,
"step": 116
},
{
"epoch": 0.1640953716690042,
"grad_norm": 0.166015625,
"learning_rate": 8.373071528751754e-06,
"loss": 1.2987,
"step": 117
},
{
"epoch": 0.16549789621318373,
"grad_norm": 0.138671875,
"learning_rate": 8.35904628330996e-06,
"loss": 1.5069,
"step": 118
},
{
"epoch": 0.16690042075736325,
"grad_norm": 0.12060546875,
"learning_rate": 8.345021037868163e-06,
"loss": 1.3131,
"step": 119
},
{
"epoch": 0.16830294530154277,
"grad_norm": 0.138671875,
"learning_rate": 8.330995792426368e-06,
"loss": 1.3799,
"step": 120
},
{
"epoch": 0.1697054698457223,
"grad_norm": 0.140625,
"learning_rate": 8.316970546984573e-06,
"loss": 1.3526,
"step": 121
},
{
"epoch": 0.17110799438990182,
"grad_norm": 0.1318359375,
"learning_rate": 8.302945301542777e-06,
"loss": 1.3056,
"step": 122
},
{
"epoch": 0.17251051893408134,
"grad_norm": 0.1259765625,
"learning_rate": 8.288920056100982e-06,
"loss": 1.244,
"step": 123
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.1494140625,
"learning_rate": 8.274894810659187e-06,
"loss": 1.3172,
"step": 124
},
{
"epoch": 0.1753155680224404,
"grad_norm": 0.1767578125,
"learning_rate": 8.260869565217392e-06,
"loss": 1.3831,
"step": 125
},
{
"epoch": 0.1767180925666199,
"grad_norm": 0.10302734375,
"learning_rate": 8.246844319775596e-06,
"loss": 1.1699,
"step": 126
},
{
"epoch": 0.17812061711079943,
"grad_norm": 0.10791015625,
"learning_rate": 8.232819074333801e-06,
"loss": 1.3282,
"step": 127
},
{
"epoch": 0.17952314165497896,
"grad_norm": 0.119140625,
"learning_rate": 8.218793828892006e-06,
"loss": 1.122,
"step": 128
},
{
"epoch": 0.18092566619915848,
"grad_norm": 0.158203125,
"learning_rate": 8.204768583450211e-06,
"loss": 1.1407,
"step": 129
},
{
"epoch": 0.182328190743338,
"grad_norm": 0.130859375,
"learning_rate": 8.190743338008415e-06,
"loss": 1.3206,
"step": 130
},
{
"epoch": 0.18373071528751753,
"grad_norm": 0.1533203125,
"learning_rate": 8.17671809256662e-06,
"loss": 1.5346,
"step": 131
},
{
"epoch": 0.18513323983169705,
"grad_norm": 0.19140625,
"learning_rate": 8.162692847124825e-06,
"loss": 1.4225,
"step": 132
},
{
"epoch": 0.18653576437587657,
"grad_norm": 0.11962890625,
"learning_rate": 8.14866760168303e-06,
"loss": 1.3242,
"step": 133
},
{
"epoch": 0.1879382889200561,
"grad_norm": 0.1533203125,
"learning_rate": 8.134642356241234e-06,
"loss": 1.3555,
"step": 134
},
{
"epoch": 0.18934081346423562,
"grad_norm": 0.130859375,
"learning_rate": 8.12061711079944e-06,
"loss": 1.1939,
"step": 135
},
{
"epoch": 0.19074333800841514,
"grad_norm": 0.1494140625,
"learning_rate": 8.106591865357644e-06,
"loss": 1.2565,
"step": 136
},
{
"epoch": 0.19214586255259467,
"grad_norm": 0.10595703125,
"learning_rate": 8.09256661991585e-06,
"loss": 1.3652,
"step": 137
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.142578125,
"learning_rate": 8.078541374474053e-06,
"loss": 1.1999,
"step": 138
},
{
"epoch": 0.1949509116409537,
"grad_norm": 0.1611328125,
"learning_rate": 8.064516129032258e-06,
"loss": 1.2727,
"step": 139
},
{
"epoch": 0.19635343618513323,
"grad_norm": 0.119140625,
"learning_rate": 8.050490883590464e-06,
"loss": 1.1687,
"step": 140
},
{
"epoch": 0.19775596072931276,
"grad_norm": 0.11767578125,
"learning_rate": 8.036465638148669e-06,
"loss": 1.3706,
"step": 141
},
{
"epoch": 0.19915848527349228,
"grad_norm": 0.12255859375,
"learning_rate": 8.022440392706872e-06,
"loss": 1.2393,
"step": 142
},
{
"epoch": 0.2005610098176718,
"grad_norm": 0.14453125,
"learning_rate": 8.008415147265078e-06,
"loss": 1.2912,
"step": 143
},
{
"epoch": 0.20196353436185133,
"grad_norm": 0.1318359375,
"learning_rate": 7.994389901823283e-06,
"loss": 1.2348,
"step": 144
},
{
"epoch": 0.20336605890603085,
"grad_norm": 0.181640625,
"learning_rate": 7.980364656381488e-06,
"loss": 1.4441,
"step": 145
},
{
"epoch": 0.20476858345021037,
"grad_norm": 0.10546875,
"learning_rate": 7.966339410939691e-06,
"loss": 1.2566,
"step": 146
},
{
"epoch": 0.2061711079943899,
"grad_norm": 0.12255859375,
"learning_rate": 7.952314165497897e-06,
"loss": 1.4199,
"step": 147
},
{
"epoch": 0.20757363253856942,
"grad_norm": 0.2265625,
"learning_rate": 7.938288920056102e-06,
"loss": 1.2723,
"step": 148
},
{
"epoch": 0.20897615708274894,
"grad_norm": 0.36328125,
"learning_rate": 7.924263674614307e-06,
"loss": 1.7504,
"step": 149
},
{
"epoch": 0.21037868162692847,
"grad_norm": 0.1611328125,
"learning_rate": 7.91023842917251e-06,
"loss": 1.2681,
"step": 150
},
{
"epoch": 0.211781206171108,
"grad_norm": 0.1572265625,
"learning_rate": 7.896213183730716e-06,
"loss": 1.3545,
"step": 151
},
{
"epoch": 0.2131837307152875,
"grad_norm": 0.12158203125,
"learning_rate": 7.882187938288921e-06,
"loss": 1.3314,
"step": 152
},
{
"epoch": 0.21458625525946703,
"grad_norm": 0.28125,
"learning_rate": 7.868162692847126e-06,
"loss": 1.2628,
"step": 153
},
{
"epoch": 0.21598877980364656,
"grad_norm": 0.12255859375,
"learning_rate": 7.85413744740533e-06,
"loss": 1.3766,
"step": 154
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.142578125,
"learning_rate": 7.840112201963535e-06,
"loss": 1.5394,
"step": 155
},
{
"epoch": 0.2187938288920056,
"grad_norm": 0.166015625,
"learning_rate": 7.82608695652174e-06,
"loss": 1.4721,
"step": 156
},
{
"epoch": 0.22019635343618513,
"grad_norm": 0.1630859375,
"learning_rate": 7.812061711079945e-06,
"loss": 1.4292,
"step": 157
},
{
"epoch": 0.22159887798036465,
"grad_norm": 0.126953125,
"learning_rate": 7.798036465638149e-06,
"loss": 1.2383,
"step": 158
},
{
"epoch": 0.22300140252454417,
"grad_norm": 0.1376953125,
"learning_rate": 7.784011220196354e-06,
"loss": 1.3895,
"step": 159
},
{
"epoch": 0.2244039270687237,
"grad_norm": 0.158203125,
"learning_rate": 7.76998597475456e-06,
"loss": 1.3189,
"step": 160
},
{
"epoch": 0.22580645161290322,
"grad_norm": 0.158203125,
"learning_rate": 7.755960729312764e-06,
"loss": 1.3514,
"step": 161
},
{
"epoch": 0.22720897615708274,
"grad_norm": 0.1435546875,
"learning_rate": 7.741935483870968e-06,
"loss": 1.2598,
"step": 162
},
{
"epoch": 0.22861150070126227,
"grad_norm": 0.12890625,
"learning_rate": 7.727910238429173e-06,
"loss": 1.2271,
"step": 163
},
{
"epoch": 0.2300140252454418,
"grad_norm": 0.107421875,
"learning_rate": 7.713884992987378e-06,
"loss": 1.1796,
"step": 164
},
{
"epoch": 0.2314165497896213,
"grad_norm": 0.212890625,
"learning_rate": 7.699859747545582e-06,
"loss": 1.3644,
"step": 165
},
{
"epoch": 0.23281907433380084,
"grad_norm": 0.1376953125,
"learning_rate": 7.685834502103787e-06,
"loss": 1.31,
"step": 166
},
{
"epoch": 0.23422159887798036,
"grad_norm": 0.12353515625,
"learning_rate": 7.671809256661992e-06,
"loss": 1.1865,
"step": 167
},
{
"epoch": 0.23562412342215988,
"grad_norm": 0.154296875,
"learning_rate": 7.657784011220198e-06,
"loss": 1.2996,
"step": 168
},
{
"epoch": 0.2370266479663394,
"grad_norm": 0.1337890625,
"learning_rate": 7.643758765778401e-06,
"loss": 1.3056,
"step": 169
},
{
"epoch": 0.23842917251051893,
"grad_norm": 0.1533203125,
"learning_rate": 7.629733520336606e-06,
"loss": 1.321,
"step": 170
},
{
"epoch": 0.23983169705469845,
"grad_norm": 0.205078125,
"learning_rate": 7.615708274894811e-06,
"loss": 1.2381,
"step": 171
},
{
"epoch": 0.24123422159887797,
"grad_norm": 0.1357421875,
"learning_rate": 7.601683029453017e-06,
"loss": 1.4478,
"step": 172
},
{
"epoch": 0.2426367461430575,
"grad_norm": 0.158203125,
"learning_rate": 7.587657784011221e-06,
"loss": 1.4235,
"step": 173
},
{
"epoch": 0.24403927068723702,
"grad_norm": 0.10400390625,
"learning_rate": 7.573632538569425e-06,
"loss": 1.1866,
"step": 174
},
{
"epoch": 0.24544179523141654,
"grad_norm": 0.2177734375,
"learning_rate": 7.55960729312763e-06,
"loss": 1.3728,
"step": 175
},
{
"epoch": 0.24684431977559607,
"grad_norm": 0.2138671875,
"learning_rate": 7.545582047685836e-06,
"loss": 1.2539,
"step": 176
},
{
"epoch": 0.2482468443197756,
"grad_norm": 0.1298828125,
"learning_rate": 7.53155680224404e-06,
"loss": 1.3318,
"step": 177
},
{
"epoch": 0.2496493688639551,
"grad_norm": 0.119140625,
"learning_rate": 7.5175315568022445e-06,
"loss": 1.4789,
"step": 178
},
{
"epoch": 0.25105189340813466,
"grad_norm": 0.10498046875,
"learning_rate": 7.503506311360449e-06,
"loss": 1.2135,
"step": 179
},
{
"epoch": 0.25245441795231416,
"grad_norm": 0.1787109375,
"learning_rate": 7.489481065918655e-06,
"loss": 1.2406,
"step": 180
},
{
"epoch": 0.2538569424964937,
"grad_norm": 0.2041015625,
"learning_rate": 7.475455820476859e-06,
"loss": 1.1215,
"step": 181
},
{
"epoch": 0.2552594670406732,
"grad_norm": 0.1875,
"learning_rate": 7.461430575035064e-06,
"loss": 1.1114,
"step": 182
},
{
"epoch": 0.25666199158485276,
"grad_norm": 0.1220703125,
"learning_rate": 7.447405329593268e-06,
"loss": 1.3997,
"step": 183
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.2080078125,
"learning_rate": 7.433380084151473e-06,
"loss": 1.2383,
"step": 184
},
{
"epoch": 0.2594670406732118,
"grad_norm": 0.1337890625,
"learning_rate": 7.4193548387096784e-06,
"loss": 1.2564,
"step": 185
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.1572265625,
"learning_rate": 7.405329593267883e-06,
"loss": 1.3568,
"step": 186
},
{
"epoch": 0.26227208976157085,
"grad_norm": 0.18359375,
"learning_rate": 7.391304347826087e-06,
"loss": 1.4266,
"step": 187
},
{
"epoch": 0.26367461430575034,
"grad_norm": 0.1533203125,
"learning_rate": 7.377279102384292e-06,
"loss": 1.3553,
"step": 188
},
{
"epoch": 0.2650771388499299,
"grad_norm": 0.1572265625,
"learning_rate": 7.3632538569424976e-06,
"loss": 1.4127,
"step": 189
},
{
"epoch": 0.2664796633941094,
"grad_norm": 0.125,
"learning_rate": 7.349228611500702e-06,
"loss": 1.3274,
"step": 190
},
{
"epoch": 0.26788218793828894,
"grad_norm": 0.1591796875,
"learning_rate": 7.335203366058906e-06,
"loss": 1.2343,
"step": 191
},
{
"epoch": 0.26928471248246844,
"grad_norm": 0.177734375,
"learning_rate": 7.3211781206171115e-06,
"loss": 1.1226,
"step": 192
},
{
"epoch": 0.270687237026648,
"grad_norm": 0.09521484375,
"learning_rate": 7.307152875175316e-06,
"loss": 1.2653,
"step": 193
},
{
"epoch": 0.2720897615708275,
"grad_norm": 0.1416015625,
"learning_rate": 7.293127629733521e-06,
"loss": 1.1759,
"step": 194
},
{
"epoch": 0.27349228611500703,
"grad_norm": 0.134765625,
"learning_rate": 7.2791023842917254e-06,
"loss": 1.3378,
"step": 195
},
{
"epoch": 0.27489481065918653,
"grad_norm": 0.12353515625,
"learning_rate": 7.265077138849931e-06,
"loss": 1.1542,
"step": 196
},
{
"epoch": 0.2762973352033661,
"grad_norm": 0.1650390625,
"learning_rate": 7.251051893408135e-06,
"loss": 1.3159,
"step": 197
},
{
"epoch": 0.2776998597475456,
"grad_norm": 0.1044921875,
"learning_rate": 7.23702664796634e-06,
"loss": 1.2158,
"step": 198
},
{
"epoch": 0.2791023842917251,
"grad_norm": 0.15234375,
"learning_rate": 7.2230014025245446e-06,
"loss": 1.2818,
"step": 199
},
{
"epoch": 0.2805049088359046,
"grad_norm": 0.212890625,
"learning_rate": 7.20897615708275e-06,
"loss": 1.1737,
"step": 200
},
{
"epoch": 0.28190743338008417,
"grad_norm": 0.185546875,
"learning_rate": 7.194950911640954e-06,
"loss": 1.2758,
"step": 201
},
{
"epoch": 0.28330995792426367,
"grad_norm": 0.1494140625,
"learning_rate": 7.1809256661991585e-06,
"loss": 1.5019,
"step": 202
},
{
"epoch": 0.2847124824684432,
"grad_norm": 0.140625,
"learning_rate": 7.166900420757364e-06,
"loss": 1.3002,
"step": 203
},
{
"epoch": 0.2861150070126227,
"grad_norm": 0.1669921875,
"learning_rate": 7.152875175315569e-06,
"loss": 1.1012,
"step": 204
},
{
"epoch": 0.28751753155680226,
"grad_norm": 0.1298828125,
"learning_rate": 7.138849929873773e-06,
"loss": 1.3503,
"step": 205
},
{
"epoch": 0.28892005610098176,
"grad_norm": 0.189453125,
"learning_rate": 7.124824684431978e-06,
"loss": 1.2954,
"step": 206
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.14453125,
"learning_rate": 7.110799438990183e-06,
"loss": 1.3856,
"step": 207
},
{
"epoch": 0.2917251051893408,
"grad_norm": 0.15625,
"learning_rate": 7.096774193548388e-06,
"loss": 1.1842,
"step": 208
},
{
"epoch": 0.29312762973352036,
"grad_norm": 0.1484375,
"learning_rate": 7.082748948106592e-06,
"loss": 1.3386,
"step": 209
},
{
"epoch": 0.29453015427769985,
"grad_norm": 0.1748046875,
"learning_rate": 7.068723702664797e-06,
"loss": 1.1526,
"step": 210
},
{
"epoch": 0.2959326788218794,
"grad_norm": 0.181640625,
"learning_rate": 7.054698457223001e-06,
"loss": 1.3657,
"step": 211
},
{
"epoch": 0.2973352033660589,
"grad_norm": 0.1259765625,
"learning_rate": 7.040673211781207e-06,
"loss": 1.2629,
"step": 212
},
{
"epoch": 0.29873772791023845,
"grad_norm": 0.1748046875,
"learning_rate": 7.0266479663394115e-06,
"loss": 1.2934,
"step": 213
},
{
"epoch": 0.30014025245441794,
"grad_norm": 0.130859375,
"learning_rate": 7.012622720897616e-06,
"loss": 1.3227,
"step": 214
},
{
"epoch": 0.3015427769985975,
"grad_norm": 0.2001953125,
"learning_rate": 6.99859747545582e-06,
"loss": 1.2734,
"step": 215
},
{
"epoch": 0.302945301542777,
"grad_norm": 0.09716796875,
"learning_rate": 6.984572230014026e-06,
"loss": 1.3514,
"step": 216
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.1787109375,
"learning_rate": 6.970546984572231e-06,
"loss": 1.1325,
"step": 217
},
{
"epoch": 0.30575035063113604,
"grad_norm": 0.134765625,
"learning_rate": 6.956521739130435e-06,
"loss": 1.3717,
"step": 218
},
{
"epoch": 0.3071528751753156,
"grad_norm": 0.1630859375,
"learning_rate": 6.942496493688639e-06,
"loss": 1.4719,
"step": 219
},
{
"epoch": 0.3085553997194951,
"grad_norm": 0.107421875,
"learning_rate": 6.9284712482468454e-06,
"loss": 1.3102,
"step": 220
},
{
"epoch": 0.30995792426367463,
"grad_norm": 0.125,
"learning_rate": 6.91444600280505e-06,
"loss": 1.1821,
"step": 221
},
{
"epoch": 0.31136044880785413,
"grad_norm": 0.12451171875,
"learning_rate": 6.900420757363254e-06,
"loss": 1.2056,
"step": 222
},
{
"epoch": 0.3127629733520337,
"grad_norm": 0.2275390625,
"learning_rate": 6.8863955119214585e-06,
"loss": 1.2066,
"step": 223
},
{
"epoch": 0.3141654978962132,
"grad_norm": 0.1328125,
"learning_rate": 6.8723702664796646e-06,
"loss": 1.4078,
"step": 224
},
{
"epoch": 0.3155680224403927,
"grad_norm": 0.1572265625,
"learning_rate": 6.858345021037869e-06,
"loss": 1.1525,
"step": 225
},
{
"epoch": 0.3169705469845722,
"grad_norm": 0.11767578125,
"learning_rate": 6.844319775596073e-06,
"loss": 1.2254,
"step": 226
},
{
"epoch": 0.31837307152875177,
"grad_norm": 0.185546875,
"learning_rate": 6.830294530154278e-06,
"loss": 1.2657,
"step": 227
},
{
"epoch": 0.31977559607293127,
"grad_norm": 0.10400390625,
"learning_rate": 6.816269284712484e-06,
"loss": 1.2751,
"step": 228
},
{
"epoch": 0.3211781206171108,
"grad_norm": 0.1220703125,
"learning_rate": 6.802244039270688e-06,
"loss": 1.3515,
"step": 229
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.134765625,
"learning_rate": 6.788218793828892e-06,
"loss": 1.5331,
"step": 230
},
{
"epoch": 0.32398316970546986,
"grad_norm": 0.1962890625,
"learning_rate": 6.774193548387097e-06,
"loss": 1.2843,
"step": 231
},
{
"epoch": 0.32538569424964936,
"grad_norm": 0.1435546875,
"learning_rate": 6.760168302945303e-06,
"loss": 1.2427,
"step": 232
},
{
"epoch": 0.3267882187938289,
"grad_norm": 0.1826171875,
"learning_rate": 6.746143057503507e-06,
"loss": 1.174,
"step": 233
},
{
"epoch": 0.3281907433380084,
"grad_norm": 0.12255859375,
"learning_rate": 6.7321178120617116e-06,
"loss": 1.2616,
"step": 234
},
{
"epoch": 0.32959326788218796,
"grad_norm": 0.2392578125,
"learning_rate": 6.718092566619916e-06,
"loss": 1.3096,
"step": 235
},
{
"epoch": 0.33099579242636745,
"grad_norm": 0.08447265625,
"learning_rate": 6.704067321178121e-06,
"loss": 1.1866,
"step": 236
},
{
"epoch": 0.332398316970547,
"grad_norm": 0.1298828125,
"learning_rate": 6.690042075736326e-06,
"loss": 1.1565,
"step": 237
},
{
"epoch": 0.3338008415147265,
"grad_norm": 0.14453125,
"learning_rate": 6.676016830294531e-06,
"loss": 1.442,
"step": 238
},
{
"epoch": 0.33520336605890605,
"grad_norm": 0.134765625,
"learning_rate": 6.661991584852735e-06,
"loss": 1.2511,
"step": 239
},
{
"epoch": 0.33660589060308554,
"grad_norm": 0.193359375,
"learning_rate": 6.64796633941094e-06,
"loss": 1.1966,
"step": 240
},
{
"epoch": 0.3380084151472651,
"grad_norm": 0.12158203125,
"learning_rate": 6.6339410939691455e-06,
"loss": 1.1705,
"step": 241
},
{
"epoch": 0.3394109396914446,
"grad_norm": 0.1123046875,
"learning_rate": 6.61991584852735e-06,
"loss": 1.3132,
"step": 242
},
{
"epoch": 0.34081346423562414,
"grad_norm": 0.1845703125,
"learning_rate": 6.605890603085554e-06,
"loss": 1.1844,
"step": 243
},
{
"epoch": 0.34221598877980364,
"grad_norm": 0.1943359375,
"learning_rate": 6.591865357643759e-06,
"loss": 1.4392,
"step": 244
},
{
"epoch": 0.3436185133239832,
"grad_norm": 0.1416015625,
"learning_rate": 6.577840112201964e-06,
"loss": 1.2875,
"step": 245
},
{
"epoch": 0.3450210378681627,
"grad_norm": 0.12890625,
"learning_rate": 6.563814866760169e-06,
"loss": 1.1475,
"step": 246
},
{
"epoch": 0.34642356241234223,
"grad_norm": 0.1845703125,
"learning_rate": 6.549789621318373e-06,
"loss": 1.4414,
"step": 247
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.1875,
"learning_rate": 6.5357643758765785e-06,
"loss": 1.2128,
"step": 248
},
{
"epoch": 0.3492286115007013,
"grad_norm": 0.10693359375,
"learning_rate": 6.521739130434783e-06,
"loss": 1.2818,
"step": 249
},
{
"epoch": 0.3506311360448808,
"grad_norm": 0.1533203125,
"learning_rate": 6.507713884992988e-06,
"loss": 1.3934,
"step": 250
},
{
"epoch": 0.3520336605890603,
"grad_norm": 0.1669921875,
"learning_rate": 6.4936886395511925e-06,
"loss": 1.2186,
"step": 251
},
{
"epoch": 0.3534361851332398,
"grad_norm": 0.162109375,
"learning_rate": 6.479663394109398e-06,
"loss": 1.2839,
"step": 252
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.1435546875,
"learning_rate": 6.465638148667602e-06,
"loss": 1.1664,
"step": 253
},
{
"epoch": 0.35624123422159887,
"grad_norm": 0.1064453125,
"learning_rate": 6.451612903225806e-06,
"loss": 1.2641,
"step": 254
},
{
"epoch": 0.3576437587657784,
"grad_norm": 0.1787109375,
"learning_rate": 6.437587657784012e-06,
"loss": 1.204,
"step": 255
},
{
"epoch": 0.3590462833099579,
"grad_norm": 0.11767578125,
"learning_rate": 6.423562412342217e-06,
"loss": 1.3466,
"step": 256
},
{
"epoch": 0.36044880785413747,
"grad_norm": 0.1552734375,
"learning_rate": 6.409537166900421e-06,
"loss": 1.5334,
"step": 257
},
{
"epoch": 0.36185133239831696,
"grad_norm": 0.1396484375,
"learning_rate": 6.3955119214586255e-06,
"loss": 1.2872,
"step": 258
},
{
"epoch": 0.3632538569424965,
"grad_norm": 0.1806640625,
"learning_rate": 6.381486676016831e-06,
"loss": 1.3031,
"step": 259
},
{
"epoch": 0.364656381486676,
"grad_norm": 0.1396484375,
"learning_rate": 6.367461430575036e-06,
"loss": 1.1709,
"step": 260
},
{
"epoch": 0.36605890603085556,
"grad_norm": 0.1650390625,
"learning_rate": 6.35343618513324e-06,
"loss": 1.2545,
"step": 261
},
{
"epoch": 0.36746143057503505,
"grad_norm": 0.1416015625,
"learning_rate": 6.339410939691445e-06,
"loss": 1.2764,
"step": 262
},
{
"epoch": 0.3688639551192146,
"grad_norm": 0.11572265625,
"learning_rate": 6.32538569424965e-06,
"loss": 1.3391,
"step": 263
},
{
"epoch": 0.3702664796633941,
"grad_norm": 0.1103515625,
"learning_rate": 6.311360448807855e-06,
"loss": 1.3695,
"step": 264
},
{
"epoch": 0.37166900420757365,
"grad_norm": 0.248046875,
"learning_rate": 6.297335203366059e-06,
"loss": 1.2392,
"step": 265
},
{
"epoch": 0.37307152875175315,
"grad_norm": 0.1630859375,
"learning_rate": 6.283309957924264e-06,
"loss": 1.1389,
"step": 266
},
{
"epoch": 0.3744740532959327,
"grad_norm": 0.134765625,
"learning_rate": 6.269284712482468e-06,
"loss": 1.2203,
"step": 267
},
{
"epoch": 0.3758765778401122,
"grad_norm": 0.130859375,
"learning_rate": 6.255259467040674e-06,
"loss": 1.2082,
"step": 268
},
{
"epoch": 0.37727910238429174,
"grad_norm": 0.126953125,
"learning_rate": 6.2412342215988786e-06,
"loss": 1.3453,
"step": 269
},
{
"epoch": 0.37868162692847124,
"grad_norm": 0.1396484375,
"learning_rate": 6.227208976157083e-06,
"loss": 1.1697,
"step": 270
},
{
"epoch": 0.3800841514726508,
"grad_norm": 0.11328125,
"learning_rate": 6.213183730715287e-06,
"loss": 1.2059,
"step": 271
},
{
"epoch": 0.3814866760168303,
"grad_norm": 0.14453125,
"learning_rate": 6.199158485273493e-06,
"loss": 1.5541,
"step": 272
},
{
"epoch": 0.38288920056100983,
"grad_norm": 0.2890625,
"learning_rate": 6.185133239831698e-06,
"loss": 1.2391,
"step": 273
},
{
"epoch": 0.38429172510518933,
"grad_norm": 0.138671875,
"learning_rate": 6.171107994389902e-06,
"loss": 1.4042,
"step": 274
},
{
"epoch": 0.3856942496493689,
"grad_norm": 0.1845703125,
"learning_rate": 6.157082748948106e-06,
"loss": 1.149,
"step": 275
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.205078125,
"learning_rate": 6.1430575035063125e-06,
"loss": 1.209,
"step": 276
},
{
"epoch": 0.3884992987377279,
"grad_norm": 0.11865234375,
"learning_rate": 6.129032258064517e-06,
"loss": 1.102,
"step": 277
},
{
"epoch": 0.3899018232819074,
"grad_norm": 0.19140625,
"learning_rate": 6.115007012622721e-06,
"loss": 1.2143,
"step": 278
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.1484375,
"learning_rate": 6.1009817671809255e-06,
"loss": 1.1747,
"step": 279
},
{
"epoch": 0.39270687237026647,
"grad_norm": 0.1943359375,
"learning_rate": 6.086956521739132e-06,
"loss": 1.1295,
"step": 280
},
{
"epoch": 0.394109396914446,
"grad_norm": 0.150390625,
"learning_rate": 6.072931276297336e-06,
"loss": 1.1241,
"step": 281
},
{
"epoch": 0.3955119214586255,
"grad_norm": 0.1376953125,
"learning_rate": 6.05890603085554e-06,
"loss": 1.1516,
"step": 282
},
{
"epoch": 0.39691444600280507,
"grad_norm": 0.251953125,
"learning_rate": 6.044880785413745e-06,
"loss": 1.1896,
"step": 283
},
{
"epoch": 0.39831697054698456,
"grad_norm": 0.185546875,
"learning_rate": 6.030855539971951e-06,
"loss": 1.2702,
"step": 284
},
{
"epoch": 0.3997194950911641,
"grad_norm": 0.189453125,
"learning_rate": 6.016830294530155e-06,
"loss": 1.2896,
"step": 285
},
{
"epoch": 0.4011220196353436,
"grad_norm": 0.15625,
"learning_rate": 6.0028050490883594e-06,
"loss": 1.1695,
"step": 286
},
{
"epoch": 0.40252454417952316,
"grad_norm": 0.15234375,
"learning_rate": 5.988779803646564e-06,
"loss": 1.3022,
"step": 287
},
{
"epoch": 0.40392706872370265,
"grad_norm": 0.1640625,
"learning_rate": 5.97475455820477e-06,
"loss": 1.4232,
"step": 288
},
{
"epoch": 0.4053295932678822,
"grad_norm": 0.1650390625,
"learning_rate": 5.960729312762974e-06,
"loss": 1.1834,
"step": 289
},
{
"epoch": 0.4067321178120617,
"grad_norm": 0.2216796875,
"learning_rate": 5.946704067321179e-06,
"loss": 1.2576,
"step": 290
},
{
"epoch": 0.40813464235624125,
"grad_norm": 0.146484375,
"learning_rate": 5.932678821879383e-06,
"loss": 1.1872,
"step": 291
},
{
"epoch": 0.40953716690042075,
"grad_norm": 0.1767578125,
"learning_rate": 5.918653576437588e-06,
"loss": 1.3551,
"step": 292
},
{
"epoch": 0.4109396914446003,
"grad_norm": 0.1865234375,
"learning_rate": 5.904628330995793e-06,
"loss": 1.2281,
"step": 293
},
{
"epoch": 0.4123422159887798,
"grad_norm": 0.1865234375,
"learning_rate": 5.890603085553998e-06,
"loss": 1.225,
"step": 294
},
{
"epoch": 0.41374474053295934,
"grad_norm": 0.2138671875,
"learning_rate": 5.876577840112202e-06,
"loss": 1.2137,
"step": 295
},
{
"epoch": 0.41514726507713884,
"grad_norm": 0.1396484375,
"learning_rate": 5.862552594670407e-06,
"loss": 1.139,
"step": 296
},
{
"epoch": 0.4165497896213184,
"grad_norm": 0.1806640625,
"learning_rate": 5.8485273492286125e-06,
"loss": 1.1809,
"step": 297
},
{
"epoch": 0.4179523141654979,
"grad_norm": 0.181640625,
"learning_rate": 5.834502103786817e-06,
"loss": 1.4761,
"step": 298
},
{
"epoch": 0.41935483870967744,
"grad_norm": 0.140625,
"learning_rate": 5.820476858345021e-06,
"loss": 1.5597,
"step": 299
},
{
"epoch": 0.42075736325385693,
"grad_norm": 0.15234375,
"learning_rate": 5.806451612903226e-06,
"loss": 1.2376,
"step": 300
},
{
"epoch": 0.4221598877980365,
"grad_norm": 0.2119140625,
"learning_rate": 5.792426367461431e-06,
"loss": 1.3761,
"step": 301
},
{
"epoch": 0.423562412342216,
"grad_norm": 0.1767578125,
"learning_rate": 5.778401122019636e-06,
"loss": 1.0822,
"step": 302
},
{
"epoch": 0.42496493688639553,
"grad_norm": 0.1591796875,
"learning_rate": 5.76437587657784e-06,
"loss": 1.3255,
"step": 303
},
{
"epoch": 0.426367461430575,
"grad_norm": 0.185546875,
"learning_rate": 5.7503506311360456e-06,
"loss": 1.2672,
"step": 304
},
{
"epoch": 0.4277699859747546,
"grad_norm": 0.134765625,
"learning_rate": 5.73632538569425e-06,
"loss": 1.0457,
"step": 305
},
{
"epoch": 0.42917251051893407,
"grad_norm": 0.150390625,
"learning_rate": 5.722300140252455e-06,
"loss": 1.4726,
"step": 306
},
{
"epoch": 0.4305750350631136,
"grad_norm": 0.162109375,
"learning_rate": 5.7082748948106595e-06,
"loss": 1.2004,
"step": 307
},
{
"epoch": 0.4319775596072931,
"grad_norm": 0.15234375,
"learning_rate": 5.694249649368865e-06,
"loss": 1.2963,
"step": 308
},
{
"epoch": 0.43338008415147267,
"grad_norm": 0.1904296875,
"learning_rate": 5.680224403927069e-06,
"loss": 1.0556,
"step": 309
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.1640625,
"learning_rate": 5.666199158485273e-06,
"loss": 1.0575,
"step": 310
},
{
"epoch": 0.4361851332398317,
"grad_norm": 0.1572265625,
"learning_rate": 5.652173913043479e-06,
"loss": 1.2175,
"step": 311
},
{
"epoch": 0.4375876577840112,
"grad_norm": 0.1416015625,
"learning_rate": 5.638148667601684e-06,
"loss": 1.4628,
"step": 312
},
{
"epoch": 0.43899018232819076,
"grad_norm": 0.1328125,
"learning_rate": 5.624123422159888e-06,
"loss": 1.1847,
"step": 313
},
{
"epoch": 0.44039270687237025,
"grad_norm": 0.1796875,
"learning_rate": 5.6100981767180925e-06,
"loss": 1.1012,
"step": 314
},
{
"epoch": 0.4417952314165498,
"grad_norm": 0.12060546875,
"learning_rate": 5.596072931276298e-06,
"loss": 1.14,
"step": 315
},
{
"epoch": 0.4431977559607293,
"grad_norm": 0.1484375,
"learning_rate": 5.582047685834503e-06,
"loss": 1.2151,
"step": 316
},
{
"epoch": 0.44460028050490885,
"grad_norm": 0.1455078125,
"learning_rate": 5.568022440392707e-06,
"loss": 1.2165,
"step": 317
},
{
"epoch": 0.44600280504908835,
"grad_norm": 0.169921875,
"learning_rate": 5.553997194950912e-06,
"loss": 1.2602,
"step": 318
},
{
"epoch": 0.4474053295932679,
"grad_norm": 0.2177734375,
"learning_rate": 5.539971949509116e-06,
"loss": 1.2089,
"step": 319
},
{
"epoch": 0.4488078541374474,
"grad_norm": 0.232421875,
"learning_rate": 5.525946704067322e-06,
"loss": 1.0351,
"step": 320
},
{
"epoch": 0.45021037868162694,
"grad_norm": 0.21875,
"learning_rate": 5.5119214586255264e-06,
"loss": 1.3343,
"step": 321
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.1513671875,
"learning_rate": 5.497896213183731e-06,
"loss": 1.1131,
"step": 322
},
{
"epoch": 0.453015427769986,
"grad_norm": 0.1650390625,
"learning_rate": 5.483870967741935e-06,
"loss": 1.0596,
"step": 323
},
{
"epoch": 0.4544179523141655,
"grad_norm": 0.2041015625,
"learning_rate": 5.469845722300141e-06,
"loss": 1.2091,
"step": 324
},
{
"epoch": 0.45582047685834504,
"grad_norm": 0.1826171875,
"learning_rate": 5.455820476858346e-06,
"loss": 1.2544,
"step": 325
},
{
"epoch": 0.45722300140252453,
"grad_norm": 0.181640625,
"learning_rate": 5.44179523141655e-06,
"loss": 1.1604,
"step": 326
},
{
"epoch": 0.4586255259467041,
"grad_norm": 0.2333984375,
"learning_rate": 5.427769985974754e-06,
"loss": 1.0818,
"step": 327
},
{
"epoch": 0.4600280504908836,
"grad_norm": 0.203125,
"learning_rate": 5.41374474053296e-06,
"loss": 1.1955,
"step": 328
},
{
"epoch": 0.46143057503506313,
"grad_norm": 0.1953125,
"learning_rate": 5.399719495091165e-06,
"loss": 1.1787,
"step": 329
},
{
"epoch": 0.4628330995792426,
"grad_norm": 0.2099609375,
"learning_rate": 5.385694249649369e-06,
"loss": 1.187,
"step": 330
},
{
"epoch": 0.4642356241234222,
"grad_norm": 0.171875,
"learning_rate": 5.3716690042075734e-06,
"loss": 1.1248,
"step": 331
},
{
"epoch": 0.46563814866760167,
"grad_norm": 0.2275390625,
"learning_rate": 5.3576437587657795e-06,
"loss": 1.0849,
"step": 332
},
{
"epoch": 0.4670406732117812,
"grad_norm": 0.216796875,
"learning_rate": 5.343618513323984e-06,
"loss": 1.2235,
"step": 333
},
{
"epoch": 0.4684431977559607,
"grad_norm": 0.154296875,
"learning_rate": 5.329593267882188e-06,
"loss": 1.0148,
"step": 334
},
{
"epoch": 0.46984572230014027,
"grad_norm": 0.1435546875,
"learning_rate": 5.3155680224403926e-06,
"loss": 1.2177,
"step": 335
},
{
"epoch": 0.47124824684431976,
"grad_norm": 0.1669921875,
"learning_rate": 5.301542776998599e-06,
"loss": 1.3309,
"step": 336
},
{
"epoch": 0.4726507713884993,
"grad_norm": 0.2119140625,
"learning_rate": 5.287517531556803e-06,
"loss": 1.457,
"step": 337
},
{
"epoch": 0.4740532959326788,
"grad_norm": 0.173828125,
"learning_rate": 5.273492286115007e-06,
"loss": 1.0875,
"step": 338
},
{
"epoch": 0.47545582047685836,
"grad_norm": 0.2197265625,
"learning_rate": 5.259467040673212e-06,
"loss": 1.1302,
"step": 339
},
{
"epoch": 0.47685834502103785,
"grad_norm": 0.095703125,
"learning_rate": 5.245441795231418e-06,
"loss": 1.2536,
"step": 340
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.1650390625,
"learning_rate": 5.231416549789622e-06,
"loss": 1.3286,
"step": 341
},
{
"epoch": 0.4796633941093969,
"grad_norm": 0.2060546875,
"learning_rate": 5.2173913043478265e-06,
"loss": 1.2036,
"step": 342
},
{
"epoch": 0.48106591865357645,
"grad_norm": 0.2578125,
"learning_rate": 5.203366058906031e-06,
"loss": 1.2194,
"step": 343
},
{
"epoch": 0.48246844319775595,
"grad_norm": 0.2001953125,
"learning_rate": 5.189340813464236e-06,
"loss": 1.2176,
"step": 344
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.1982421875,
"learning_rate": 5.175315568022441e-06,
"loss": 1.1685,
"step": 345
},
{
"epoch": 0.485273492286115,
"grad_norm": 0.2041015625,
"learning_rate": 5.161290322580646e-06,
"loss": 1.1327,
"step": 346
},
{
"epoch": 0.48667601683029454,
"grad_norm": 0.1630859375,
"learning_rate": 5.14726507713885e-06,
"loss": 1.2903,
"step": 347
},
{
"epoch": 0.48807854137447404,
"grad_norm": 0.140625,
"learning_rate": 5.133239831697055e-06,
"loss": 1.2387,
"step": 348
},
{
"epoch": 0.4894810659186536,
"grad_norm": 0.177734375,
"learning_rate": 5.11921458625526e-06,
"loss": 1.1081,
"step": 349
},
{
"epoch": 0.4908835904628331,
"grad_norm": 0.1982421875,
"learning_rate": 5.105189340813465e-06,
"loss": 1.15,
"step": 350
},
{
"epoch": 0.49228611500701264,
"grad_norm": 0.1630859375,
"learning_rate": 5.091164095371669e-06,
"loss": 1.2073,
"step": 351
},
{
"epoch": 0.49368863955119213,
"grad_norm": 0.1484375,
"learning_rate": 5.077138849929874e-06,
"loss": 1.2808,
"step": 352
},
{
"epoch": 0.4950911640953717,
"grad_norm": 0.12255859375,
"learning_rate": 5.063113604488079e-06,
"loss": 1.2547,
"step": 353
},
{
"epoch": 0.4964936886395512,
"grad_norm": 0.142578125,
"learning_rate": 5.049088359046284e-06,
"loss": 1.1674,
"step": 354
},
{
"epoch": 0.49789621318373073,
"grad_norm": 0.173828125,
"learning_rate": 5.035063113604488e-06,
"loss": 1.26,
"step": 355
},
{
"epoch": 0.4992987377279102,
"grad_norm": 0.2451171875,
"learning_rate": 5.0210378681626934e-06,
"loss": 1.3142,
"step": 356
},
{
"epoch": 0.5007012622720898,
"grad_norm": 0.16015625,
"learning_rate": 5.007012622720898e-06,
"loss": 1.2545,
"step": 357
},
{
"epoch": 0.5021037868162693,
"grad_norm": 0.1826171875,
"learning_rate": 4.992987377279103e-06,
"loss": 1.2984,
"step": 358
},
{
"epoch": 0.5035063113604488,
"grad_norm": 0.2353515625,
"learning_rate": 4.978962131837307e-06,
"loss": 1.1791,
"step": 359
},
{
"epoch": 0.5049088359046283,
"grad_norm": 0.171875,
"learning_rate": 4.964936886395513e-06,
"loss": 1.2114,
"step": 360
},
{
"epoch": 0.5063113604488079,
"grad_norm": 0.2333984375,
"learning_rate": 4.950911640953717e-06,
"loss": 1.1979,
"step": 361
},
{
"epoch": 0.5077138849929874,
"grad_norm": 0.19921875,
"learning_rate": 4.936886395511922e-06,
"loss": 1.4286,
"step": 362
},
{
"epoch": 0.5091164095371669,
"grad_norm": 0.15234375,
"learning_rate": 4.9228611500701265e-06,
"loss": 1.1784,
"step": 363
},
{
"epoch": 0.5105189340813464,
"grad_norm": 0.17578125,
"learning_rate": 4.908835904628331e-06,
"loss": 1.2079,
"step": 364
},
{
"epoch": 0.511921458625526,
"grad_norm": 0.1640625,
"learning_rate": 4.894810659186536e-06,
"loss": 1.3151,
"step": 365
},
{
"epoch": 0.5133239831697055,
"grad_norm": 0.283203125,
"learning_rate": 4.8807854137447404e-06,
"loss": 1.1817,
"step": 366
},
{
"epoch": 0.514726507713885,
"grad_norm": 0.1650390625,
"learning_rate": 4.866760168302946e-06,
"loss": 1.0681,
"step": 367
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.25,
"learning_rate": 4.85273492286115e-06,
"loss": 1.2658,
"step": 368
},
{
"epoch": 0.517531556802244,
"grad_norm": 0.1455078125,
"learning_rate": 4.838709677419355e-06,
"loss": 1.2459,
"step": 369
},
{
"epoch": 0.5189340813464236,
"grad_norm": 0.1748046875,
"learning_rate": 4.8246844319775596e-06,
"loss": 1.265,
"step": 370
},
{
"epoch": 0.520336605890603,
"grad_norm": 0.2138671875,
"learning_rate": 4.810659186535765e-06,
"loss": 1.0783,
"step": 371
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.21875,
"learning_rate": 4.796633941093969e-06,
"loss": 1.2522,
"step": 372
},
{
"epoch": 0.5231416549789621,
"grad_norm": 0.1884765625,
"learning_rate": 4.782608695652174e-06,
"loss": 1.2196,
"step": 373
},
{
"epoch": 0.5245441795231417,
"grad_norm": 0.1591796875,
"learning_rate": 4.768583450210379e-06,
"loss": 1.1709,
"step": 374
},
{
"epoch": 0.5259467040673211,
"grad_norm": 0.2080078125,
"learning_rate": 4.754558204768584e-06,
"loss": 1.0803,
"step": 375
},
{
"epoch": 0.5273492286115007,
"grad_norm": 0.138671875,
"learning_rate": 4.740532959326788e-06,
"loss": 1.4023,
"step": 376
},
{
"epoch": 0.5287517531556802,
"grad_norm": 0.1875,
"learning_rate": 4.7265077138849935e-06,
"loss": 1.225,
"step": 377
},
{
"epoch": 0.5301542776998598,
"grad_norm": 0.2373046875,
"learning_rate": 4.712482468443198e-06,
"loss": 1.1343,
"step": 378
},
{
"epoch": 0.5315568022440392,
"grad_norm": 0.1669921875,
"learning_rate": 4.698457223001403e-06,
"loss": 1.1392,
"step": 379
},
{
"epoch": 0.5329593267882188,
"grad_norm": 0.1748046875,
"learning_rate": 4.684431977559607e-06,
"loss": 1.1653,
"step": 380
},
{
"epoch": 0.5343618513323983,
"grad_norm": 0.1748046875,
"learning_rate": 4.670406732117813e-06,
"loss": 1.3631,
"step": 381
},
{
"epoch": 0.5357643758765779,
"grad_norm": 0.21875,
"learning_rate": 4.656381486676017e-06,
"loss": 1.201,
"step": 382
},
{
"epoch": 0.5371669004207573,
"grad_norm": 0.1298828125,
"learning_rate": 4.642356241234222e-06,
"loss": 1.2728,
"step": 383
},
{
"epoch": 0.5385694249649369,
"grad_norm": 0.21875,
"learning_rate": 4.6283309957924265e-06,
"loss": 1.1816,
"step": 384
},
{
"epoch": 0.5399719495091164,
"grad_norm": 0.212890625,
"learning_rate": 4.614305750350632e-06,
"loss": 1.1313,
"step": 385
},
{
"epoch": 0.541374474053296,
"grad_norm": 0.166015625,
"learning_rate": 4.600280504908836e-06,
"loss": 1.2561,
"step": 386
},
{
"epoch": 0.5427769985974754,
"grad_norm": 0.193359375,
"learning_rate": 4.586255259467041e-06,
"loss": 1.324,
"step": 387
},
{
"epoch": 0.544179523141655,
"grad_norm": 0.1259765625,
"learning_rate": 4.572230014025246e-06,
"loss": 1.2157,
"step": 388
},
{
"epoch": 0.5455820476858345,
"grad_norm": 0.1943359375,
"learning_rate": 4.558204768583451e-06,
"loss": 1.1258,
"step": 389
},
{
"epoch": 0.5469845722300141,
"grad_norm": 0.1513671875,
"learning_rate": 4.544179523141655e-06,
"loss": 1.0754,
"step": 390
},
{
"epoch": 0.5483870967741935,
"grad_norm": 0.1787109375,
"learning_rate": 4.5301542776998604e-06,
"loss": 1.1802,
"step": 391
},
{
"epoch": 0.5497896213183731,
"grad_norm": 0.21484375,
"learning_rate": 4.516129032258065e-06,
"loss": 1.194,
"step": 392
},
{
"epoch": 0.5511921458625526,
"grad_norm": 0.13671875,
"learning_rate": 4.50210378681627e-06,
"loss": 1.2355,
"step": 393
},
{
"epoch": 0.5525946704067322,
"grad_norm": 0.1455078125,
"learning_rate": 4.488078541374474e-06,
"loss": 1.1846,
"step": 394
},
{
"epoch": 0.5539971949509116,
"grad_norm": 0.10693359375,
"learning_rate": 4.47405329593268e-06,
"loss": 1.3068,
"step": 395
},
{
"epoch": 0.5553997194950911,
"grad_norm": 0.1357421875,
"learning_rate": 4.460028050490884e-06,
"loss": 1.455,
"step": 396
},
{
"epoch": 0.5568022440392707,
"grad_norm": 0.1513671875,
"learning_rate": 4.446002805049089e-06,
"loss": 1.2467,
"step": 397
},
{
"epoch": 0.5582047685834503,
"grad_norm": 0.1611328125,
"learning_rate": 4.4319775596072935e-06,
"loss": 1.1788,
"step": 398
},
{
"epoch": 0.5596072931276297,
"grad_norm": 0.1865234375,
"learning_rate": 4.417952314165499e-06,
"loss": 1.21,
"step": 399
},
{
"epoch": 0.5610098176718092,
"grad_norm": 0.1943359375,
"learning_rate": 4.403927068723703e-06,
"loss": 1.2269,
"step": 400
},
{
"epoch": 0.5624123422159888,
"grad_norm": 0.2001953125,
"learning_rate": 4.389901823281908e-06,
"loss": 1.1959,
"step": 401
},
{
"epoch": 0.5638148667601683,
"grad_norm": 0.234375,
"learning_rate": 4.375876577840113e-06,
"loss": 1.23,
"step": 402
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.1376953125,
"learning_rate": 4.361851332398318e-06,
"loss": 1.2101,
"step": 403
},
{
"epoch": 0.5666199158485273,
"grad_norm": 0.203125,
"learning_rate": 4.347826086956522e-06,
"loss": 1.1597,
"step": 404
},
{
"epoch": 0.5680224403927069,
"grad_norm": 0.2451171875,
"learning_rate": 4.333800841514727e-06,
"loss": 1.1738,
"step": 405
},
{
"epoch": 0.5694249649368864,
"grad_norm": 0.1806640625,
"learning_rate": 4.319775596072932e-06,
"loss": 1.4462,
"step": 406
},
{
"epoch": 0.5708274894810659,
"grad_norm": 0.1689453125,
"learning_rate": 4.305750350631136e-06,
"loss": 1.2194,
"step": 407
},
{
"epoch": 0.5722300140252454,
"grad_norm": 0.166015625,
"learning_rate": 4.291725105189341e-06,
"loss": 1.1773,
"step": 408
},
{
"epoch": 0.573632538569425,
"grad_norm": 0.263671875,
"learning_rate": 4.277699859747546e-06,
"loss": 1.1419,
"step": 409
},
{
"epoch": 0.5750350631136045,
"grad_norm": 0.1708984375,
"learning_rate": 4.263674614305751e-06,
"loss": 1.3138,
"step": 410
},
{
"epoch": 0.576437587657784,
"grad_norm": 0.146484375,
"learning_rate": 4.249649368863955e-06,
"loss": 1.2294,
"step": 411
},
{
"epoch": 0.5778401122019635,
"grad_norm": 0.1728515625,
"learning_rate": 4.2356241234221605e-06,
"loss": 1.0948,
"step": 412
},
{
"epoch": 0.5792426367461431,
"grad_norm": 0.2080078125,
"learning_rate": 4.221598877980365e-06,
"loss": 1.2114,
"step": 413
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.205078125,
"learning_rate": 4.20757363253857e-06,
"loss": 1.1497,
"step": 414
},
{
"epoch": 0.5820476858345021,
"grad_norm": 0.14453125,
"learning_rate": 4.193548387096774e-06,
"loss": 1.1986,
"step": 415
},
{
"epoch": 0.5834502103786816,
"grad_norm": 0.197265625,
"learning_rate": 4.17952314165498e-06,
"loss": 1.0188,
"step": 416
},
{
"epoch": 0.5848527349228612,
"grad_norm": 0.1513671875,
"learning_rate": 4.165497896213184e-06,
"loss": 1.1713,
"step": 417
},
{
"epoch": 0.5862552594670407,
"grad_norm": 0.2373046875,
"learning_rate": 4.151472650771388e-06,
"loss": 1.1821,
"step": 418
},
{
"epoch": 0.5876577840112202,
"grad_norm": 0.10546875,
"learning_rate": 4.1374474053295935e-06,
"loss": 1.2692,
"step": 419
},
{
"epoch": 0.5890603085553997,
"grad_norm": 0.12890625,
"learning_rate": 4.123422159887798e-06,
"loss": 1.4345,
"step": 420
},
{
"epoch": 0.5904628330995793,
"grad_norm": 0.1865234375,
"learning_rate": 4.109396914446003e-06,
"loss": 1.1865,
"step": 421
},
{
"epoch": 0.5918653576437588,
"grad_norm": 0.10693359375,
"learning_rate": 4.0953716690042075e-06,
"loss": 1.2786,
"step": 422
},
{
"epoch": 0.5932678821879382,
"grad_norm": 0.130859375,
"learning_rate": 4.081346423562413e-06,
"loss": 1.4287,
"step": 423
},
{
"epoch": 0.5946704067321178,
"grad_norm": 0.2578125,
"learning_rate": 4.067321178120617e-06,
"loss": 1.3566,
"step": 424
},
{
"epoch": 0.5960729312762973,
"grad_norm": 0.2060546875,
"learning_rate": 4.053295932678822e-06,
"loss": 1.0893,
"step": 425
},
{
"epoch": 0.5974754558204769,
"grad_norm": 0.2109375,
"learning_rate": 4.039270687237027e-06,
"loss": 1.3028,
"step": 426
},
{
"epoch": 0.5988779803646563,
"grad_norm": 0.21875,
"learning_rate": 4.025245441795232e-06,
"loss": 1.1084,
"step": 427
},
{
"epoch": 0.6002805049088359,
"grad_norm": 0.1767578125,
"learning_rate": 4.011220196353436e-06,
"loss": 1.31,
"step": 428
},
{
"epoch": 0.6016830294530154,
"grad_norm": 0.1650390625,
"learning_rate": 3.997194950911641e-06,
"loss": 1.2634,
"step": 429
},
{
"epoch": 0.603085553997195,
"grad_norm": 0.2421875,
"learning_rate": 3.983169705469846e-06,
"loss": 1.1716,
"step": 430
},
{
"epoch": 0.6044880785413744,
"grad_norm": 0.212890625,
"learning_rate": 3.969144460028051e-06,
"loss": 1.246,
"step": 431
},
{
"epoch": 0.605890603085554,
"grad_norm": 0.1640625,
"learning_rate": 3.955119214586255e-06,
"loss": 1.2066,
"step": 432
},
{
"epoch": 0.6072931276297335,
"grad_norm": 0.21875,
"learning_rate": 3.9410939691444605e-06,
"loss": 1.3996,
"step": 433
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.1796875,
"learning_rate": 3.927068723702665e-06,
"loss": 1.2979,
"step": 434
},
{
"epoch": 0.6100981767180925,
"grad_norm": 0.1728515625,
"learning_rate": 3.91304347826087e-06,
"loss": 1.2356,
"step": 435
},
{
"epoch": 0.6115007012622721,
"grad_norm": 0.15625,
"learning_rate": 3.8990182328190744e-06,
"loss": 1.2458,
"step": 436
},
{
"epoch": 0.6129032258064516,
"grad_norm": 0.177734375,
"learning_rate": 3.88499298737728e-06,
"loss": 1.399,
"step": 437
},
{
"epoch": 0.6143057503506312,
"grad_norm": 0.208984375,
"learning_rate": 3.870967741935484e-06,
"loss": 1.2904,
"step": 438
},
{
"epoch": 0.6157082748948106,
"grad_norm": 0.1064453125,
"learning_rate": 3.856942496493689e-06,
"loss": 1.1914,
"step": 439
},
{
"epoch": 0.6171107994389902,
"grad_norm": 0.20703125,
"learning_rate": 3.8429172510518936e-06,
"loss": 1.3626,
"step": 440
},
{
"epoch": 0.6185133239831697,
"grad_norm": 0.265625,
"learning_rate": 3.828892005610099e-06,
"loss": 1.1886,
"step": 441
},
{
"epoch": 0.6199158485273493,
"grad_norm": 0.19140625,
"learning_rate": 3.814866760168303e-06,
"loss": 1.1162,
"step": 442
},
{
"epoch": 0.6213183730715287,
"grad_norm": 0.134765625,
"learning_rate": 3.8008415147265083e-06,
"loss": 1.1404,
"step": 443
},
{
"epoch": 0.6227208976157083,
"grad_norm": 0.1630859375,
"learning_rate": 3.7868162692847127e-06,
"loss": 1.1815,
"step": 444
},
{
"epoch": 0.6241234221598878,
"grad_norm": 0.1572265625,
"learning_rate": 3.772791023842918e-06,
"loss": 1.2118,
"step": 445
},
{
"epoch": 0.6255259467040674,
"grad_norm": 0.20703125,
"learning_rate": 3.7587657784011223e-06,
"loss": 1.2007,
"step": 446
},
{
"epoch": 0.6269284712482468,
"grad_norm": 0.1796875,
"learning_rate": 3.7447405329593275e-06,
"loss": 1.1749,
"step": 447
},
{
"epoch": 0.6283309957924264,
"grad_norm": 0.166015625,
"learning_rate": 3.730715287517532e-06,
"loss": 1.0124,
"step": 448
},
{
"epoch": 0.6297335203366059,
"grad_norm": 0.296875,
"learning_rate": 3.7166900420757366e-06,
"loss": 1.1405,
"step": 449
},
{
"epoch": 0.6311360448807855,
"grad_norm": 0.138671875,
"learning_rate": 3.7026647966339414e-06,
"loss": 1.2862,
"step": 450
},
{
"epoch": 0.6325385694249649,
"grad_norm": 0.1650390625,
"learning_rate": 3.688639551192146e-06,
"loss": 1.2599,
"step": 451
},
{
"epoch": 0.6339410939691444,
"grad_norm": 0.216796875,
"learning_rate": 3.674614305750351e-06,
"loss": 1.1407,
"step": 452
},
{
"epoch": 0.635343618513324,
"grad_norm": 0.1474609375,
"learning_rate": 3.6605890603085557e-06,
"loss": 1.2165,
"step": 453
},
{
"epoch": 0.6367461430575035,
"grad_norm": 0.263671875,
"learning_rate": 3.6465638148667605e-06,
"loss": 1.1954,
"step": 454
},
{
"epoch": 0.638148667601683,
"grad_norm": 0.251953125,
"learning_rate": 3.6325385694249653e-06,
"loss": 1.2646,
"step": 455
},
{
"epoch": 0.6395511921458625,
"grad_norm": 0.197265625,
"learning_rate": 3.61851332398317e-06,
"loss": 1.2184,
"step": 456
},
{
"epoch": 0.6409537166900421,
"grad_norm": 0.22265625,
"learning_rate": 3.604488078541375e-06,
"loss": 1.121,
"step": 457
},
{
"epoch": 0.6423562412342216,
"grad_norm": 0.2119140625,
"learning_rate": 3.5904628330995792e-06,
"loss": 1.2237,
"step": 458
},
{
"epoch": 0.6437587657784011,
"grad_norm": 0.203125,
"learning_rate": 3.5764375876577844e-06,
"loss": 1.2567,
"step": 459
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.1416015625,
"learning_rate": 3.562412342215989e-06,
"loss": 1.3652,
"step": 460
},
{
"epoch": 0.6465638148667602,
"grad_norm": 0.1923828125,
"learning_rate": 3.548387096774194e-06,
"loss": 1.2201,
"step": 461
},
{
"epoch": 0.6479663394109397,
"grad_norm": 0.177734375,
"learning_rate": 3.5343618513323984e-06,
"loss": 1.1577,
"step": 462
},
{
"epoch": 0.6493688639551192,
"grad_norm": 0.173828125,
"learning_rate": 3.5203366058906036e-06,
"loss": 1.3246,
"step": 463
},
{
"epoch": 0.6507713884992987,
"grad_norm": 0.15234375,
"learning_rate": 3.506311360448808e-06,
"loss": 1.1457,
"step": 464
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.1708984375,
"learning_rate": 3.492286115007013e-06,
"loss": 1.2854,
"step": 465
},
{
"epoch": 0.6535764375876578,
"grad_norm": 0.1728515625,
"learning_rate": 3.4782608695652175e-06,
"loss": 1.1192,
"step": 466
},
{
"epoch": 0.6549789621318373,
"grad_norm": 0.171875,
"learning_rate": 3.4642356241234227e-06,
"loss": 1.3773,
"step": 467
},
{
"epoch": 0.6563814866760168,
"grad_norm": 0.12255859375,
"learning_rate": 3.450210378681627e-06,
"loss": 1.1959,
"step": 468
},
{
"epoch": 0.6577840112201964,
"grad_norm": 0.1630859375,
"learning_rate": 3.4361851332398323e-06,
"loss": 1.3236,
"step": 469
},
{
"epoch": 0.6591865357643759,
"grad_norm": 0.1875,
"learning_rate": 3.4221598877980366e-06,
"loss": 1.2094,
"step": 470
},
{
"epoch": 0.6605890603085554,
"grad_norm": 0.1669921875,
"learning_rate": 3.408134642356242e-06,
"loss": 1.2986,
"step": 471
},
{
"epoch": 0.6619915848527349,
"grad_norm": 0.255859375,
"learning_rate": 3.394109396914446e-06,
"loss": 1.1477,
"step": 472
},
{
"epoch": 0.6633941093969145,
"grad_norm": 0.158203125,
"learning_rate": 3.3800841514726514e-06,
"loss": 1.2415,
"step": 473
},
{
"epoch": 0.664796633941094,
"grad_norm": 0.1650390625,
"learning_rate": 3.3660589060308558e-06,
"loss": 1.1927,
"step": 474
},
{
"epoch": 0.6661991584852734,
"grad_norm": 0.1572265625,
"learning_rate": 3.3520336605890606e-06,
"loss": 1.1582,
"step": 475
},
{
"epoch": 0.667601683029453,
"grad_norm": 0.1396484375,
"learning_rate": 3.3380084151472653e-06,
"loss": 1.0954,
"step": 476
},
{
"epoch": 0.6690042075736325,
"grad_norm": 0.1552734375,
"learning_rate": 3.32398316970547e-06,
"loss": 1.1115,
"step": 477
},
{
"epoch": 0.6704067321178121,
"grad_norm": 0.1630859375,
"learning_rate": 3.309957924263675e-06,
"loss": 1.1437,
"step": 478
},
{
"epoch": 0.6718092566619915,
"grad_norm": 0.140625,
"learning_rate": 3.2959326788218797e-06,
"loss": 1.517,
"step": 479
},
{
"epoch": 0.6732117812061711,
"grad_norm": 0.1376953125,
"learning_rate": 3.2819074333800845e-06,
"loss": 1.168,
"step": 480
},
{
"epoch": 0.6746143057503506,
"grad_norm": 0.1962890625,
"learning_rate": 3.2678821879382893e-06,
"loss": 1.2514,
"step": 481
},
{
"epoch": 0.6760168302945302,
"grad_norm": 0.1943359375,
"learning_rate": 3.253856942496494e-06,
"loss": 1.1855,
"step": 482
},
{
"epoch": 0.6774193548387096,
"grad_norm": 0.2333984375,
"learning_rate": 3.239831697054699e-06,
"loss": 1.139,
"step": 483
},
{
"epoch": 0.6788218793828892,
"grad_norm": 0.2021484375,
"learning_rate": 3.225806451612903e-06,
"loss": 1.2044,
"step": 484
},
{
"epoch": 0.6802244039270687,
"grad_norm": 0.1650390625,
"learning_rate": 3.2117812061711084e-06,
"loss": 1.2088,
"step": 485
},
{
"epoch": 0.6816269284712483,
"grad_norm": 0.201171875,
"learning_rate": 3.1977559607293128e-06,
"loss": 1.0997,
"step": 486
},
{
"epoch": 0.6830294530154277,
"grad_norm": 0.234375,
"learning_rate": 3.183730715287518e-06,
"loss": 1.1307,
"step": 487
},
{
"epoch": 0.6844319775596073,
"grad_norm": 0.1455078125,
"learning_rate": 3.1697054698457223e-06,
"loss": 1.2425,
"step": 488
},
{
"epoch": 0.6858345021037868,
"grad_norm": 0.17578125,
"learning_rate": 3.1556802244039275e-06,
"loss": 1.1604,
"step": 489
},
{
"epoch": 0.6872370266479664,
"grad_norm": 0.2470703125,
"learning_rate": 3.141654978962132e-06,
"loss": 1.1059,
"step": 490
},
{
"epoch": 0.6886395511921458,
"grad_norm": 0.2197265625,
"learning_rate": 3.127629733520337e-06,
"loss": 1.169,
"step": 491
},
{
"epoch": 0.6900420757363254,
"grad_norm": 0.19921875,
"learning_rate": 3.1136044880785415e-06,
"loss": 1.3918,
"step": 492
},
{
"epoch": 0.6914446002805049,
"grad_norm": 0.26171875,
"learning_rate": 3.0995792426367467e-06,
"loss": 1.0892,
"step": 493
},
{
"epoch": 0.6928471248246845,
"grad_norm": 0.19140625,
"learning_rate": 3.085553997194951e-06,
"loss": 1.2275,
"step": 494
},
{
"epoch": 0.6942496493688639,
"grad_norm": 0.142578125,
"learning_rate": 3.0715287517531562e-06,
"loss": 1.497,
"step": 495
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.251953125,
"learning_rate": 3.0575035063113606e-06,
"loss": 1.1735,
"step": 496
},
{
"epoch": 0.697054698457223,
"grad_norm": 0.1591796875,
"learning_rate": 3.043478260869566e-06,
"loss": 1.1378,
"step": 497
},
{
"epoch": 0.6984572230014026,
"grad_norm": 0.255859375,
"learning_rate": 3.02945301542777e-06,
"loss": 1.2152,
"step": 498
},
{
"epoch": 0.699859747545582,
"grad_norm": 0.1748046875,
"learning_rate": 3.0154277699859754e-06,
"loss": 1.3393,
"step": 499
},
{
"epoch": 0.7012622720897616,
"grad_norm": 0.2177734375,
"learning_rate": 3.0014025245441797e-06,
"loss": 1.1096,
"step": 500
},
{
"epoch": 0.7026647966339411,
"grad_norm": 0.1669921875,
"learning_rate": 2.987377279102385e-06,
"loss": 1.2146,
"step": 501
},
{
"epoch": 0.7040673211781207,
"grad_norm": 0.146484375,
"learning_rate": 2.9733520336605893e-06,
"loss": 1.3882,
"step": 502
},
{
"epoch": 0.7054698457223001,
"grad_norm": 0.185546875,
"learning_rate": 2.959326788218794e-06,
"loss": 1.3228,
"step": 503
},
{
"epoch": 0.7068723702664796,
"grad_norm": 0.2119140625,
"learning_rate": 2.945301542776999e-06,
"loss": 1.1921,
"step": 504
},
{
"epoch": 0.7082748948106592,
"grad_norm": 0.16796875,
"learning_rate": 2.9312762973352036e-06,
"loss": 1.2217,
"step": 505
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.1181640625,
"learning_rate": 2.9172510518934084e-06,
"loss": 1.4375,
"step": 506
},
{
"epoch": 0.7110799438990182,
"grad_norm": 0.1796875,
"learning_rate": 2.903225806451613e-06,
"loss": 1.1508,
"step": 507
},
{
"epoch": 0.7124824684431977,
"grad_norm": 0.25390625,
"learning_rate": 2.889200561009818e-06,
"loss": 1.1197,
"step": 508
},
{
"epoch": 0.7138849929873773,
"grad_norm": 0.1845703125,
"learning_rate": 2.8751753155680228e-06,
"loss": 1.391,
"step": 509
},
{
"epoch": 0.7152875175315568,
"grad_norm": 0.1806640625,
"learning_rate": 2.8611500701262276e-06,
"loss": 1.2597,
"step": 510
},
{
"epoch": 0.7166900420757363,
"grad_norm": 0.1669921875,
"learning_rate": 2.8471248246844323e-06,
"loss": 1.3079,
"step": 511
},
{
"epoch": 0.7180925666199158,
"grad_norm": 0.2490234375,
"learning_rate": 2.8330995792426367e-06,
"loss": 1.2022,
"step": 512
},
{
"epoch": 0.7194950911640954,
"grad_norm": 0.205078125,
"learning_rate": 2.819074333800842e-06,
"loss": 1.094,
"step": 513
},
{
"epoch": 0.7208976157082749,
"grad_norm": 0.21484375,
"learning_rate": 2.8050490883590463e-06,
"loss": 1.1829,
"step": 514
},
{
"epoch": 0.7223001402524544,
"grad_norm": 0.1416015625,
"learning_rate": 2.7910238429172515e-06,
"loss": 1.2382,
"step": 515
},
{
"epoch": 0.7237026647966339,
"grad_norm": 0.2578125,
"learning_rate": 2.776998597475456e-06,
"loss": 1.1327,
"step": 516
},
{
"epoch": 0.7251051893408135,
"grad_norm": 0.18359375,
"learning_rate": 2.762973352033661e-06,
"loss": 1.3151,
"step": 517
},
{
"epoch": 0.726507713884993,
"grad_norm": 0.1669921875,
"learning_rate": 2.7489481065918654e-06,
"loss": 1.3314,
"step": 518
},
{
"epoch": 0.7279102384291725,
"grad_norm": 0.271484375,
"learning_rate": 2.7349228611500706e-06,
"loss": 1.2279,
"step": 519
},
{
"epoch": 0.729312762973352,
"grad_norm": 0.2314453125,
"learning_rate": 2.720897615708275e-06,
"loss": 1.1031,
"step": 520
},
{
"epoch": 0.7307152875175316,
"grad_norm": 0.142578125,
"learning_rate": 2.70687237026648e-06,
"loss": 1.3399,
"step": 521
},
{
"epoch": 0.7321178120617111,
"grad_norm": 0.20703125,
"learning_rate": 2.6928471248246845e-06,
"loss": 1.0524,
"step": 522
},
{
"epoch": 0.7335203366058906,
"grad_norm": 0.236328125,
"learning_rate": 2.6788218793828897e-06,
"loss": 1.1273,
"step": 523
},
{
"epoch": 0.7349228611500701,
"grad_norm": 0.1875,
"learning_rate": 2.664796633941094e-06,
"loss": 1.4101,
"step": 524
},
{
"epoch": 0.7363253856942497,
"grad_norm": 0.25,
"learning_rate": 2.6507713884992993e-06,
"loss": 0.9936,
"step": 525
},
{
"epoch": 0.7377279102384292,
"grad_norm": 0.224609375,
"learning_rate": 2.6367461430575037e-06,
"loss": 1.1749,
"step": 526
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.1669921875,
"learning_rate": 2.622720897615709e-06,
"loss": 1.344,
"step": 527
},
{
"epoch": 0.7405329593267882,
"grad_norm": 0.169921875,
"learning_rate": 2.6086956521739132e-06,
"loss": 1.3,
"step": 528
},
{
"epoch": 0.7419354838709677,
"grad_norm": 0.2197265625,
"learning_rate": 2.594670406732118e-06,
"loss": 1.1268,
"step": 529
},
{
"epoch": 0.7433380084151473,
"grad_norm": 0.1611328125,
"learning_rate": 2.580645161290323e-06,
"loss": 1.3111,
"step": 530
},
{
"epoch": 0.7447405329593267,
"grad_norm": 0.1357421875,
"learning_rate": 2.5666199158485276e-06,
"loss": 1.2274,
"step": 531
},
{
"epoch": 0.7461430575035063,
"grad_norm": 0.25390625,
"learning_rate": 2.5525946704067324e-06,
"loss": 1.1316,
"step": 532
},
{
"epoch": 0.7475455820476858,
"grad_norm": 0.2734375,
"learning_rate": 2.538569424964937e-06,
"loss": 1.2295,
"step": 533
},
{
"epoch": 0.7489481065918654,
"grad_norm": 0.181640625,
"learning_rate": 2.524544179523142e-06,
"loss": 1.0851,
"step": 534
},
{
"epoch": 0.7503506311360448,
"grad_norm": 0.1767578125,
"learning_rate": 2.5105189340813467e-06,
"loss": 1.1677,
"step": 535
},
{
"epoch": 0.7517531556802244,
"grad_norm": 0.244140625,
"learning_rate": 2.4964936886395515e-06,
"loss": 1.2814,
"step": 536
},
{
"epoch": 0.7531556802244039,
"grad_norm": 0.2158203125,
"learning_rate": 2.4824684431977563e-06,
"loss": 1.1817,
"step": 537
},
{
"epoch": 0.7545582047685835,
"grad_norm": 0.2578125,
"learning_rate": 2.468443197755961e-06,
"loss": 1.2785,
"step": 538
},
{
"epoch": 0.7559607293127629,
"grad_norm": 0.2080078125,
"learning_rate": 2.4544179523141654e-06,
"loss": 1.2524,
"step": 539
},
{
"epoch": 0.7573632538569425,
"grad_norm": 0.1953125,
"learning_rate": 2.4403927068723702e-06,
"loss": 1.2224,
"step": 540
},
{
"epoch": 0.758765778401122,
"grad_norm": 0.240234375,
"learning_rate": 2.426367461430575e-06,
"loss": 1.2381,
"step": 541
},
{
"epoch": 0.7601683029453016,
"grad_norm": 0.205078125,
"learning_rate": 2.4123422159887798e-06,
"loss": 1.1549,
"step": 542
},
{
"epoch": 0.761570827489481,
"grad_norm": 0.22265625,
"learning_rate": 2.3983169705469846e-06,
"loss": 0.9989,
"step": 543
},
{
"epoch": 0.7629733520336606,
"grad_norm": 0.185546875,
"learning_rate": 2.3842917251051894e-06,
"loss": 1.3427,
"step": 544
},
{
"epoch": 0.7643758765778401,
"grad_norm": 0.2294921875,
"learning_rate": 2.370266479663394e-06,
"loss": 1.2504,
"step": 545
},
{
"epoch": 0.7657784011220197,
"grad_norm": 0.1767578125,
"learning_rate": 2.356241234221599e-06,
"loss": 1.1086,
"step": 546
},
{
"epoch": 0.7671809256661991,
"grad_norm": 0.2158203125,
"learning_rate": 2.3422159887798037e-06,
"loss": 1.4594,
"step": 547
},
{
"epoch": 0.7685834502103787,
"grad_norm": 0.251953125,
"learning_rate": 2.3281907433380085e-06,
"loss": 1.1633,
"step": 548
},
{
"epoch": 0.7699859747545582,
"grad_norm": 0.2001953125,
"learning_rate": 2.3141654978962133e-06,
"loss": 1.1294,
"step": 549
},
{
"epoch": 0.7713884992987378,
"grad_norm": 0.236328125,
"learning_rate": 2.300140252454418e-06,
"loss": 1.3159,
"step": 550
},
{
"epoch": 0.7727910238429172,
"grad_norm": 0.1669921875,
"learning_rate": 2.286115007012623e-06,
"loss": 1.2132,
"step": 551
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.177734375,
"learning_rate": 2.2720897615708276e-06,
"loss": 1.0724,
"step": 552
},
{
"epoch": 0.7755960729312763,
"grad_norm": 0.18359375,
"learning_rate": 2.2580645161290324e-06,
"loss": 1.2799,
"step": 553
},
{
"epoch": 0.7769985974754559,
"grad_norm": 0.236328125,
"learning_rate": 2.244039270687237e-06,
"loss": 1.2397,
"step": 554
},
{
"epoch": 0.7784011220196353,
"grad_norm": 0.236328125,
"learning_rate": 2.230014025245442e-06,
"loss": 1.258,
"step": 555
},
{
"epoch": 0.7798036465638148,
"grad_norm": 0.23046875,
"learning_rate": 2.2159887798036468e-06,
"loss": 1.0467,
"step": 556
},
{
"epoch": 0.7812061711079944,
"grad_norm": 0.212890625,
"learning_rate": 2.2019635343618515e-06,
"loss": 1.2859,
"step": 557
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.291015625,
"learning_rate": 2.1879382889200563e-06,
"loss": 1.2433,
"step": 558
},
{
"epoch": 0.7840112201963534,
"grad_norm": 0.1982421875,
"learning_rate": 2.173913043478261e-06,
"loss": 1.2297,
"step": 559
},
{
"epoch": 0.7854137447405329,
"grad_norm": 0.169921875,
"learning_rate": 2.159887798036466e-06,
"loss": 1.3596,
"step": 560
},
{
"epoch": 0.7868162692847125,
"grad_norm": 0.2431640625,
"learning_rate": 2.1458625525946707e-06,
"loss": 1.2091,
"step": 561
},
{
"epoch": 0.788218793828892,
"grad_norm": 0.2099609375,
"learning_rate": 2.1318373071528755e-06,
"loss": 1.176,
"step": 562
},
{
"epoch": 0.7896213183730715,
"grad_norm": 0.208984375,
"learning_rate": 2.1178120617110802e-06,
"loss": 1.211,
"step": 563
},
{
"epoch": 0.791023842917251,
"grad_norm": 0.220703125,
"learning_rate": 2.103786816269285e-06,
"loss": 1.1723,
"step": 564
},
{
"epoch": 0.7924263674614306,
"grad_norm": 0.1484375,
"learning_rate": 2.08976157082749e-06,
"loss": 1.1888,
"step": 565
},
{
"epoch": 0.7938288920056101,
"grad_norm": 0.3359375,
"learning_rate": 2.075736325385694e-06,
"loss": 1.1476,
"step": 566
},
{
"epoch": 0.7952314165497896,
"grad_norm": 0.2734375,
"learning_rate": 2.061711079943899e-06,
"loss": 1.1118,
"step": 567
},
{
"epoch": 0.7966339410939691,
"grad_norm": 0.2001953125,
"learning_rate": 2.0476858345021037e-06,
"loss": 1.1394,
"step": 568
},
{
"epoch": 0.7980364656381487,
"grad_norm": 0.2177734375,
"learning_rate": 2.0336605890603085e-06,
"loss": 1.2972,
"step": 569
},
{
"epoch": 0.7994389901823282,
"grad_norm": 0.169921875,
"learning_rate": 2.0196353436185133e-06,
"loss": 1.3216,
"step": 570
},
{
"epoch": 0.8008415147265077,
"grad_norm": 0.1630859375,
"learning_rate": 2.005610098176718e-06,
"loss": 1.2028,
"step": 571
},
{
"epoch": 0.8022440392706872,
"grad_norm": 0.232421875,
"learning_rate": 1.991584852734923e-06,
"loss": 1.2873,
"step": 572
},
{
"epoch": 0.8036465638148668,
"grad_norm": 0.2431640625,
"learning_rate": 1.9775596072931276e-06,
"loss": 1.0877,
"step": 573
},
{
"epoch": 0.8050490883590463,
"grad_norm": 0.328125,
"learning_rate": 1.9635343618513324e-06,
"loss": 1.4381,
"step": 574
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.1689453125,
"learning_rate": 1.9495091164095372e-06,
"loss": 1.1963,
"step": 575
},
{
"epoch": 0.8078541374474053,
"grad_norm": 0.28125,
"learning_rate": 1.935483870967742e-06,
"loss": 1.1772,
"step": 576
},
{
"epoch": 0.8092566619915849,
"grad_norm": 0.177734375,
"learning_rate": 1.9214586255259468e-06,
"loss": 1.3152,
"step": 577
},
{
"epoch": 0.8106591865357644,
"grad_norm": 0.2265625,
"learning_rate": 1.9074333800841516e-06,
"loss": 1.2125,
"step": 578
},
{
"epoch": 0.8120617110799438,
"grad_norm": 0.203125,
"learning_rate": 1.8934081346423563e-06,
"loss": 1.0921,
"step": 579
},
{
"epoch": 0.8134642356241234,
"grad_norm": 0.205078125,
"learning_rate": 1.8793828892005611e-06,
"loss": 1.118,
"step": 580
},
{
"epoch": 0.814866760168303,
"grad_norm": 0.208984375,
"learning_rate": 1.865357643758766e-06,
"loss": 1.1351,
"step": 581
},
{
"epoch": 0.8162692847124825,
"grad_norm": 0.1630859375,
"learning_rate": 1.8513323983169707e-06,
"loss": 1.1744,
"step": 582
},
{
"epoch": 0.8176718092566619,
"grad_norm": 0.11328125,
"learning_rate": 1.8373071528751755e-06,
"loss": 1.3168,
"step": 583
},
{
"epoch": 0.8190743338008415,
"grad_norm": 0.2470703125,
"learning_rate": 1.8232819074333803e-06,
"loss": 1.1647,
"step": 584
},
{
"epoch": 0.820476858345021,
"grad_norm": 0.25,
"learning_rate": 1.809256661991585e-06,
"loss": 1.0769,
"step": 585
},
{
"epoch": 0.8218793828892006,
"grad_norm": 0.2197265625,
"learning_rate": 1.7952314165497896e-06,
"loss": 1.2422,
"step": 586
},
{
"epoch": 0.82328190743338,
"grad_norm": 0.263671875,
"learning_rate": 1.7812061711079944e-06,
"loss": 1.1405,
"step": 587
},
{
"epoch": 0.8246844319775596,
"grad_norm": 0.2236328125,
"learning_rate": 1.7671809256661992e-06,
"loss": 1.1655,
"step": 588
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.1923828125,
"learning_rate": 1.753155680224404e-06,
"loss": 1.3527,
"step": 589
},
{
"epoch": 0.8274894810659187,
"grad_norm": 0.1787109375,
"learning_rate": 1.7391304347826088e-06,
"loss": 1.2396,
"step": 590
},
{
"epoch": 0.8288920056100981,
"grad_norm": 0.1357421875,
"learning_rate": 1.7251051893408135e-06,
"loss": 1.4179,
"step": 591
},
{
"epoch": 0.8302945301542777,
"grad_norm": 0.2138671875,
"learning_rate": 1.7110799438990183e-06,
"loss": 1.1096,
"step": 592
},
{
"epoch": 0.8316970546984572,
"grad_norm": 0.15234375,
"learning_rate": 1.697054698457223e-06,
"loss": 1.2698,
"step": 593
},
{
"epoch": 0.8330995792426368,
"grad_norm": 0.1435546875,
"learning_rate": 1.6830294530154279e-06,
"loss": 1.3,
"step": 594
},
{
"epoch": 0.8345021037868162,
"grad_norm": 0.1845703125,
"learning_rate": 1.6690042075736327e-06,
"loss": 1.1469,
"step": 595
},
{
"epoch": 0.8359046283309958,
"grad_norm": 0.21875,
"learning_rate": 1.6549789621318375e-06,
"loss": 1.1219,
"step": 596
},
{
"epoch": 0.8373071528751753,
"grad_norm": 0.224609375,
"learning_rate": 1.6409537166900422e-06,
"loss": 1.1502,
"step": 597
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.1376953125,
"learning_rate": 1.626928471248247e-06,
"loss": 1.3427,
"step": 598
},
{
"epoch": 0.8401122019635343,
"grad_norm": 0.2333984375,
"learning_rate": 1.6129032258064516e-06,
"loss": 1.2831,
"step": 599
},
{
"epoch": 0.8415147265077139,
"grad_norm": 0.197265625,
"learning_rate": 1.5988779803646564e-06,
"loss": 1.1839,
"step": 600
},
{
"epoch": 0.8429172510518934,
"grad_norm": 0.17578125,
"learning_rate": 1.5848527349228612e-06,
"loss": 1.204,
"step": 601
},
{
"epoch": 0.844319775596073,
"grad_norm": 0.244140625,
"learning_rate": 1.570827489481066e-06,
"loss": 1.0692,
"step": 602
},
{
"epoch": 0.8457223001402524,
"grad_norm": 0.185546875,
"learning_rate": 1.5568022440392707e-06,
"loss": 1.2152,
"step": 603
},
{
"epoch": 0.847124824684432,
"grad_norm": 0.22265625,
"learning_rate": 1.5427769985974755e-06,
"loss": 1.2749,
"step": 604
},
{
"epoch": 0.8485273492286115,
"grad_norm": 0.1787109375,
"learning_rate": 1.5287517531556803e-06,
"loss": 1.4086,
"step": 605
},
{
"epoch": 0.8499298737727911,
"grad_norm": 0.267578125,
"learning_rate": 1.514726507713885e-06,
"loss": 1.2094,
"step": 606
},
{
"epoch": 0.8513323983169705,
"grad_norm": 0.2353515625,
"learning_rate": 1.5007012622720899e-06,
"loss": 1.1288,
"step": 607
},
{
"epoch": 0.85273492286115,
"grad_norm": 0.1376953125,
"learning_rate": 1.4866760168302946e-06,
"loss": 1.307,
"step": 608
},
{
"epoch": 0.8541374474053296,
"grad_norm": 0.19140625,
"learning_rate": 1.4726507713884994e-06,
"loss": 1.1327,
"step": 609
},
{
"epoch": 0.8555399719495091,
"grad_norm": 0.208984375,
"learning_rate": 1.4586255259467042e-06,
"loss": 1.2754,
"step": 610
},
{
"epoch": 0.8569424964936886,
"grad_norm": 0.216796875,
"learning_rate": 1.444600280504909e-06,
"loss": 1.2639,
"step": 611
},
{
"epoch": 0.8583450210378681,
"grad_norm": 0.1826171875,
"learning_rate": 1.4305750350631138e-06,
"loss": 1.2885,
"step": 612
},
{
"epoch": 0.8597475455820477,
"grad_norm": 0.19921875,
"learning_rate": 1.4165497896213184e-06,
"loss": 1.2376,
"step": 613
},
{
"epoch": 0.8611500701262272,
"grad_norm": 0.2275390625,
"learning_rate": 1.4025245441795231e-06,
"loss": 1.1711,
"step": 614
},
{
"epoch": 0.8625525946704067,
"grad_norm": 0.205078125,
"learning_rate": 1.388499298737728e-06,
"loss": 1.194,
"step": 615
},
{
"epoch": 0.8639551192145862,
"grad_norm": 0.162109375,
"learning_rate": 1.3744740532959327e-06,
"loss": 1.1591,
"step": 616
},
{
"epoch": 0.8653576437587658,
"grad_norm": 0.1796875,
"learning_rate": 1.3604488078541375e-06,
"loss": 1.1594,
"step": 617
},
{
"epoch": 0.8667601683029453,
"grad_norm": 0.267578125,
"learning_rate": 1.3464235624123423e-06,
"loss": 1.3947,
"step": 618
},
{
"epoch": 0.8681626928471248,
"grad_norm": 0.259765625,
"learning_rate": 1.332398316970547e-06,
"loss": 1.1734,
"step": 619
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.119140625,
"learning_rate": 1.3183730715287518e-06,
"loss": 1.3298,
"step": 620
},
{
"epoch": 0.8709677419354839,
"grad_norm": 0.1884765625,
"learning_rate": 1.3043478260869566e-06,
"loss": 1.1371,
"step": 621
},
{
"epoch": 0.8723702664796634,
"grad_norm": 0.13671875,
"learning_rate": 1.2903225806451614e-06,
"loss": 1.1397,
"step": 622
},
{
"epoch": 0.8737727910238429,
"grad_norm": 0.244140625,
"learning_rate": 1.2762973352033662e-06,
"loss": 1.0592,
"step": 623
},
{
"epoch": 0.8751753155680224,
"grad_norm": 0.162109375,
"learning_rate": 1.262272089761571e-06,
"loss": 1.1607,
"step": 624
},
{
"epoch": 0.876577840112202,
"grad_norm": 0.1689453125,
"learning_rate": 1.2482468443197758e-06,
"loss": 1.1171,
"step": 625
},
{
"epoch": 0.8779803646563815,
"grad_norm": 0.310546875,
"learning_rate": 1.2342215988779805e-06,
"loss": 1.1599,
"step": 626
},
{
"epoch": 0.879382889200561,
"grad_norm": 0.2109375,
"learning_rate": 1.2201963534361851e-06,
"loss": 1.1901,
"step": 627
},
{
"epoch": 0.8807854137447405,
"grad_norm": 0.30859375,
"learning_rate": 1.2061711079943899e-06,
"loss": 1.5766,
"step": 628
},
{
"epoch": 0.8821879382889201,
"grad_norm": 0.27734375,
"learning_rate": 1.1921458625525947e-06,
"loss": 1.1917,
"step": 629
},
{
"epoch": 0.8835904628330996,
"grad_norm": 0.1494140625,
"learning_rate": 1.1781206171107995e-06,
"loss": 1.6177,
"step": 630
},
{
"epoch": 0.884992987377279,
"grad_norm": 0.189453125,
"learning_rate": 1.1640953716690042e-06,
"loss": 1.3351,
"step": 631
},
{
"epoch": 0.8863955119214586,
"grad_norm": 0.1376953125,
"learning_rate": 1.150070126227209e-06,
"loss": 1.1969,
"step": 632
},
{
"epoch": 0.8877980364656382,
"grad_norm": 0.171875,
"learning_rate": 1.1360448807854138e-06,
"loss": 1.2267,
"step": 633
},
{
"epoch": 0.8892005610098177,
"grad_norm": 0.1640625,
"learning_rate": 1.1220196353436186e-06,
"loss": 1.2762,
"step": 634
},
{
"epoch": 0.8906030855539971,
"grad_norm": 0.193359375,
"learning_rate": 1.1079943899018234e-06,
"loss": 1.4256,
"step": 635
},
{
"epoch": 0.8920056100981767,
"grad_norm": 0.1669921875,
"learning_rate": 1.0939691444600282e-06,
"loss": 1.1813,
"step": 636
},
{
"epoch": 0.8934081346423562,
"grad_norm": 0.154296875,
"learning_rate": 1.079943899018233e-06,
"loss": 1.3887,
"step": 637
},
{
"epoch": 0.8948106591865358,
"grad_norm": 0.19140625,
"learning_rate": 1.0659186535764377e-06,
"loss": 1.2293,
"step": 638
},
{
"epoch": 0.8962131837307152,
"grad_norm": 0.14453125,
"learning_rate": 1.0518934081346425e-06,
"loss": 1.2268,
"step": 639
},
{
"epoch": 0.8976157082748948,
"grad_norm": 0.234375,
"learning_rate": 1.037868162692847e-06,
"loss": 1.2701,
"step": 640
},
{
"epoch": 0.8990182328190743,
"grad_norm": 0.1943359375,
"learning_rate": 1.0238429172510519e-06,
"loss": 1.2166,
"step": 641
},
{
"epoch": 0.9004207573632539,
"grad_norm": 0.1748046875,
"learning_rate": 1.0098176718092566e-06,
"loss": 1.195,
"step": 642
},
{
"epoch": 0.9018232819074333,
"grad_norm": 0.1298828125,
"learning_rate": 9.957924263674614e-07,
"loss": 1.2161,
"step": 643
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.1806640625,
"learning_rate": 9.817671809256662e-07,
"loss": 1.1612,
"step": 644
},
{
"epoch": 0.9046283309957924,
"grad_norm": 0.2333984375,
"learning_rate": 9.67741935483871e-07,
"loss": 1.1929,
"step": 645
},
{
"epoch": 0.906030855539972,
"grad_norm": 0.25390625,
"learning_rate": 9.537166900420758e-07,
"loss": 1.2325,
"step": 646
},
{
"epoch": 0.9074333800841514,
"grad_norm": 0.189453125,
"learning_rate": 9.396914446002806e-07,
"loss": 1.2358,
"step": 647
},
{
"epoch": 0.908835904628331,
"grad_norm": 0.1748046875,
"learning_rate": 9.256661991584853e-07,
"loss": 1.3516,
"step": 648
},
{
"epoch": 0.9102384291725105,
"grad_norm": 0.203125,
"learning_rate": 9.116409537166901e-07,
"loss": 1.3763,
"step": 649
},
{
"epoch": 0.9116409537166901,
"grad_norm": 0.1650390625,
"learning_rate": 8.976157082748948e-07,
"loss": 1.3796,
"step": 650
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.2431640625,
"learning_rate": 8.835904628330996e-07,
"loss": 1.1251,
"step": 651
},
{
"epoch": 0.9144460028050491,
"grad_norm": 0.1591796875,
"learning_rate": 8.695652173913044e-07,
"loss": 1.2083,
"step": 652
},
{
"epoch": 0.9158485273492286,
"grad_norm": 0.162109375,
"learning_rate": 8.555399719495092e-07,
"loss": 1.2821,
"step": 653
},
{
"epoch": 0.9172510518934082,
"grad_norm": 0.11669921875,
"learning_rate": 8.415147265077139e-07,
"loss": 1.2481,
"step": 654
},
{
"epoch": 0.9186535764375876,
"grad_norm": 0.1572265625,
"learning_rate": 8.274894810659187e-07,
"loss": 1.1164,
"step": 655
},
{
"epoch": 0.9200561009817672,
"grad_norm": 0.27734375,
"learning_rate": 8.134642356241235e-07,
"loss": 1.2441,
"step": 656
},
{
"epoch": 0.9214586255259467,
"grad_norm": 0.2197265625,
"learning_rate": 7.994389901823282e-07,
"loss": 1.1084,
"step": 657
},
{
"epoch": 0.9228611500701263,
"grad_norm": 0.22265625,
"learning_rate": 7.85413744740533e-07,
"loss": 1.1713,
"step": 658
},
{
"epoch": 0.9242636746143057,
"grad_norm": 0.1591796875,
"learning_rate": 7.713884992987378e-07,
"loss": 1.377,
"step": 659
},
{
"epoch": 0.9256661991584852,
"grad_norm": 0.162109375,
"learning_rate": 7.573632538569425e-07,
"loss": 1.2492,
"step": 660
},
{
"epoch": 0.9270687237026648,
"grad_norm": 0.271484375,
"learning_rate": 7.433380084151473e-07,
"loss": 1.1586,
"step": 661
},
{
"epoch": 0.9284712482468443,
"grad_norm": 0.1591796875,
"learning_rate": 7.293127629733521e-07,
"loss": 1.2767,
"step": 662
},
{
"epoch": 0.9298737727910238,
"grad_norm": 0.2138671875,
"learning_rate": 7.152875175315569e-07,
"loss": 1.1998,
"step": 663
},
{
"epoch": 0.9312762973352033,
"grad_norm": 0.251953125,
"learning_rate": 7.012622720897616e-07,
"loss": 1.1143,
"step": 664
},
{
"epoch": 0.9326788218793829,
"grad_norm": 0.1865234375,
"learning_rate": 6.872370266479664e-07,
"loss": 1.2119,
"step": 665
},
{
"epoch": 0.9340813464235624,
"grad_norm": 0.2490234375,
"learning_rate": 6.732117812061711e-07,
"loss": 1.1035,
"step": 666
},
{
"epoch": 0.9354838709677419,
"grad_norm": 0.1728515625,
"learning_rate": 6.591865357643759e-07,
"loss": 1.3081,
"step": 667
},
{
"epoch": 0.9368863955119214,
"grad_norm": 0.1923828125,
"learning_rate": 6.451612903225807e-07,
"loss": 1.2938,
"step": 668
},
{
"epoch": 0.938288920056101,
"grad_norm": 0.25,
"learning_rate": 6.311360448807855e-07,
"loss": 1.2476,
"step": 669
},
{
"epoch": 0.9396914446002805,
"grad_norm": 0.1328125,
"learning_rate": 6.171107994389903e-07,
"loss": 1.3178,
"step": 670
},
{
"epoch": 0.94109396914446,
"grad_norm": 0.30078125,
"learning_rate": 6.030855539971949e-07,
"loss": 1.2007,
"step": 671
},
{
"epoch": 0.9424964936886395,
"grad_norm": 0.212890625,
"learning_rate": 5.890603085553997e-07,
"loss": 1.1024,
"step": 672
},
{
"epoch": 0.9438990182328191,
"grad_norm": 0.1630859375,
"learning_rate": 5.750350631136045e-07,
"loss": 1.2341,
"step": 673
},
{
"epoch": 0.9453015427769986,
"grad_norm": 0.2041015625,
"learning_rate": 5.610098176718093e-07,
"loss": 1.1421,
"step": 674
},
{
"epoch": 0.9467040673211781,
"grad_norm": 0.259765625,
"learning_rate": 5.469845722300141e-07,
"loss": 1.1908,
"step": 675
},
{
"epoch": 0.9481065918653576,
"grad_norm": 0.193359375,
"learning_rate": 5.329593267882189e-07,
"loss": 1.0408,
"step": 676
},
{
"epoch": 0.9495091164095372,
"grad_norm": 0.1474609375,
"learning_rate": 5.189340813464235e-07,
"loss": 1.1853,
"step": 677
},
{
"epoch": 0.9509116409537167,
"grad_norm": 0.146484375,
"learning_rate": 5.049088359046283e-07,
"loss": 1.2057,
"step": 678
},
{
"epoch": 0.9523141654978962,
"grad_norm": 0.1630859375,
"learning_rate": 4.908835904628331e-07,
"loss": 1.0716,
"step": 679
},
{
"epoch": 0.9537166900420757,
"grad_norm": 0.1884765625,
"learning_rate": 4.768583450210379e-07,
"loss": 1.2057,
"step": 680
},
{
"epoch": 0.9551192145862553,
"grad_norm": 0.208984375,
"learning_rate": 4.628330995792427e-07,
"loss": 1.3381,
"step": 681
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.1923828125,
"learning_rate": 4.488078541374474e-07,
"loss": 1.209,
"step": 682
},
{
"epoch": 0.9579242636746143,
"grad_norm": 0.1435546875,
"learning_rate": 4.347826086956522e-07,
"loss": 1.2604,
"step": 683
},
{
"epoch": 0.9593267882187938,
"grad_norm": 0.232421875,
"learning_rate": 4.2075736325385697e-07,
"loss": 1.099,
"step": 684
},
{
"epoch": 0.9607293127629734,
"grad_norm": 0.23828125,
"learning_rate": 4.0673211781206176e-07,
"loss": 1.1337,
"step": 685
},
{
"epoch": 0.9621318373071529,
"grad_norm": 0.181640625,
"learning_rate": 3.927068723702665e-07,
"loss": 1.1812,
"step": 686
},
{
"epoch": 0.9635343618513323,
"grad_norm": 0.1875,
"learning_rate": 3.7868162692847127e-07,
"loss": 1.2327,
"step": 687
},
{
"epoch": 0.9649368863955119,
"grad_norm": 0.16015625,
"learning_rate": 3.6465638148667605e-07,
"loss": 1.1862,
"step": 688
},
{
"epoch": 0.9663394109396914,
"grad_norm": 0.19140625,
"learning_rate": 3.506311360448808e-07,
"loss": 1.2904,
"step": 689
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.2333984375,
"learning_rate": 3.3660589060308557e-07,
"loss": 1.2837,
"step": 690
},
{
"epoch": 0.9691444600280504,
"grad_norm": 0.1669921875,
"learning_rate": 3.2258064516129035e-07,
"loss": 1.0958,
"step": 691
},
{
"epoch": 0.97054698457223,
"grad_norm": 0.1826171875,
"learning_rate": 3.0855539971949513e-07,
"loss": 1.3348,
"step": 692
},
{
"epoch": 0.9719495091164095,
"grad_norm": 0.1708984375,
"learning_rate": 2.9453015427769986e-07,
"loss": 1.2333,
"step": 693
},
{
"epoch": 0.9733520336605891,
"grad_norm": 0.1669921875,
"learning_rate": 2.8050490883590465e-07,
"loss": 1.1464,
"step": 694
},
{
"epoch": 0.9747545582047685,
"grad_norm": 0.2275390625,
"learning_rate": 2.6647966339410943e-07,
"loss": 1.29,
"step": 695
},
{
"epoch": 0.9761570827489481,
"grad_norm": 0.1533203125,
"learning_rate": 2.5245441795231416e-07,
"loss": 1.2551,
"step": 696
},
{
"epoch": 0.9775596072931276,
"grad_norm": 0.1728515625,
"learning_rate": 2.3842917251051895e-07,
"loss": 1.206,
"step": 697
},
{
"epoch": 0.9789621318373072,
"grad_norm": 0.2216796875,
"learning_rate": 2.244039270687237e-07,
"loss": 1.1592,
"step": 698
},
{
"epoch": 0.9803646563814866,
"grad_norm": 0.19921875,
"learning_rate": 2.1037868162692849e-07,
"loss": 1.1878,
"step": 699
},
{
"epoch": 0.9817671809256662,
"grad_norm": 0.259765625,
"learning_rate": 1.9635343618513324e-07,
"loss": 0.9784,
"step": 700
},
{
"epoch": 0.9831697054698457,
"grad_norm": 0.18359375,
"learning_rate": 1.8232819074333803e-07,
"loss": 1.1988,
"step": 701
},
{
"epoch": 0.9845722300140253,
"grad_norm": 0.2197265625,
"learning_rate": 1.6830294530154278e-07,
"loss": 1.0429,
"step": 702
},
{
"epoch": 0.9859747545582047,
"grad_norm": 0.27734375,
"learning_rate": 1.5427769985974757e-07,
"loss": 1.4367,
"step": 703
},
{
"epoch": 0.9873772791023843,
"grad_norm": 0.203125,
"learning_rate": 1.4025245441795232e-07,
"loss": 1.2093,
"step": 704
},
{
"epoch": 0.9887798036465638,
"grad_norm": 0.193359375,
"learning_rate": 1.2622720897615708e-07,
"loss": 1.1985,
"step": 705
},
{
"epoch": 0.9901823281907434,
"grad_norm": 0.1806640625,
"learning_rate": 1.1220196353436185e-07,
"loss": 1.2731,
"step": 706
},
{
"epoch": 0.9915848527349228,
"grad_norm": 0.2470703125,
"learning_rate": 9.817671809256662e-08,
"loss": 1.3363,
"step": 707
},
{
"epoch": 0.9929873772791024,
"grad_norm": 0.2001953125,
"learning_rate": 8.415147265077139e-08,
"loss": 1.2392,
"step": 708
},
{
"epoch": 0.9943899018232819,
"grad_norm": 0.2275390625,
"learning_rate": 7.012622720897616e-08,
"loss": 1.19,
"step": 709
},
{
"epoch": 0.9957924263674615,
"grad_norm": 0.1337890625,
"learning_rate": 5.6100981767180926e-08,
"loss": 1.3113,
"step": 710
},
{
"epoch": 0.9971949509116409,
"grad_norm": 0.2275390625,
"learning_rate": 4.2075736325385696e-08,
"loss": 1.0762,
"step": 711
},
{
"epoch": 0.9985974754558204,
"grad_norm": 0.2041015625,
"learning_rate": 2.8050490883590463e-08,
"loss": 1.2402,
"step": 712
},
{
"epoch": 1.0,
"grad_norm": 0.30859375,
"learning_rate": 1.4025245441795231e-08,
"loss": 1.3883,
"step": 713
},
{
"epoch": 1.0,
"eval_loss": 1.2593402862548828,
"eval_runtime": 12.6935,
"eval_samples_per_second": 2.836,
"eval_steps_per_second": 0.394,
"step": 713
}
],
"logging_steps": 1.0,
"max_steps": 713,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4643205711485993e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}