{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 797,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012547051442910915,
"grad_norm": 0.4921875,
"learning_rate": 9.98745294855709e-06,
"loss": 1.8811,
"step": 1
},
{
"epoch": 0.002509410288582183,
"grad_norm": 0.50390625,
"learning_rate": 9.974905897114179e-06,
"loss": 1.9066,
"step": 2
},
{
"epoch": 0.0037641154328732747,
"grad_norm": 0.48046875,
"learning_rate": 9.962358845671269e-06,
"loss": 1.838,
"step": 3
},
{
"epoch": 0.005018820577164366,
"grad_norm": 0.474609375,
"learning_rate": 9.949811794228357e-06,
"loss": 1.9337,
"step": 4
},
{
"epoch": 0.006273525721455458,
"grad_norm": 0.453125,
"learning_rate": 9.937264742785447e-06,
"loss": 1.7792,
"step": 5
},
{
"epoch": 0.0075282308657465494,
"grad_norm": 0.486328125,
"learning_rate": 9.924717691342535e-06,
"loss": 1.8373,
"step": 6
},
{
"epoch": 0.00878293601003764,
"grad_norm": 0.4375,
"learning_rate": 9.912170639899625e-06,
"loss": 1.8733,
"step": 7
},
{
"epoch": 0.010037641154328732,
"grad_norm": 0.4296875,
"learning_rate": 9.899623588456713e-06,
"loss": 1.9003,
"step": 8
},
{
"epoch": 0.011292346298619825,
"grad_norm": 0.427734375,
"learning_rate": 9.887076537013803e-06,
"loss": 1.839,
"step": 9
},
{
"epoch": 0.012547051442910916,
"grad_norm": 0.390625,
"learning_rate": 9.874529485570891e-06,
"loss": 1.8247,
"step": 10
},
{
"epoch": 0.013801756587202008,
"grad_norm": 0.353515625,
"learning_rate": 9.861982434127981e-06,
"loss": 1.7166,
"step": 11
},
{
"epoch": 0.015056461731493099,
"grad_norm": 0.365234375,
"learning_rate": 9.849435382685069e-06,
"loss": 1.7286,
"step": 12
},
{
"epoch": 0.01631116687578419,
"grad_norm": 0.38671875,
"learning_rate": 9.836888331242159e-06,
"loss": 1.7676,
"step": 13
},
{
"epoch": 0.01756587202007528,
"grad_norm": 0.353515625,
"learning_rate": 9.824341279799247e-06,
"loss": 1.7642,
"step": 14
},
{
"epoch": 0.018820577164366373,
"grad_norm": 0.357421875,
"learning_rate": 9.811794228356337e-06,
"loss": 1.6793,
"step": 15
},
{
"epoch": 0.020075282308657464,
"grad_norm": 0.31640625,
"learning_rate": 9.799247176913425e-06,
"loss": 1.5586,
"step": 16
},
{
"epoch": 0.02132998745294856,
"grad_norm": 0.326171875,
"learning_rate": 9.786700125470515e-06,
"loss": 1.6833,
"step": 17
},
{
"epoch": 0.02258469259723965,
"grad_norm": 0.302734375,
"learning_rate": 9.774153074027605e-06,
"loss": 1.5879,
"step": 18
},
{
"epoch": 0.02383939774153074,
"grad_norm": 0.306640625,
"learning_rate": 9.761606022584693e-06,
"loss": 1.5705,
"step": 19
},
{
"epoch": 0.025094102885821833,
"grad_norm": 0.29296875,
"learning_rate": 9.749058971141783e-06,
"loss": 1.5896,
"step": 20
},
{
"epoch": 0.026348808030112924,
"grad_norm": 0.31640625,
"learning_rate": 9.736511919698871e-06,
"loss": 1.6392,
"step": 21
},
{
"epoch": 0.027603513174404015,
"grad_norm": 0.435546875,
"learning_rate": 9.723964868255961e-06,
"loss": 1.5772,
"step": 22
},
{
"epoch": 0.028858218318695106,
"grad_norm": 0.265625,
"learning_rate": 9.711417816813051e-06,
"loss": 1.5175,
"step": 23
},
{
"epoch": 0.030112923462986198,
"grad_norm": 0.25390625,
"learning_rate": 9.69887076537014e-06,
"loss": 1.5083,
"step": 24
},
{
"epoch": 0.03136762860727729,
"grad_norm": 0.25,
"learning_rate": 9.686323713927227e-06,
"loss": 1.5575,
"step": 25
},
{
"epoch": 0.03262233375156838,
"grad_norm": 0.251953125,
"learning_rate": 9.673776662484317e-06,
"loss": 1.549,
"step": 26
},
{
"epoch": 0.033877038895859475,
"grad_norm": 0.244140625,
"learning_rate": 9.661229611041405e-06,
"loss": 1.5112,
"step": 27
},
{
"epoch": 0.03513174404015056,
"grad_norm": 0.25390625,
"learning_rate": 9.648682559598495e-06,
"loss": 1.5569,
"step": 28
},
{
"epoch": 0.03638644918444166,
"grad_norm": 0.2412109375,
"learning_rate": 9.636135508155583e-06,
"loss": 1.4399,
"step": 29
},
{
"epoch": 0.037641154328732745,
"grad_norm": 0.2314453125,
"learning_rate": 9.623588456712673e-06,
"loss": 1.4761,
"step": 30
},
{
"epoch": 0.03889585947302384,
"grad_norm": 0.224609375,
"learning_rate": 9.611041405269761e-06,
"loss": 1.4108,
"step": 31
},
{
"epoch": 0.04015056461731493,
"grad_norm": 0.234375,
"learning_rate": 9.598494353826851e-06,
"loss": 1.5262,
"step": 32
},
{
"epoch": 0.04140526976160602,
"grad_norm": 0.232421875,
"learning_rate": 9.585947302383941e-06,
"loss": 1.4889,
"step": 33
},
{
"epoch": 0.04265997490589712,
"grad_norm": 0.228515625,
"learning_rate": 9.57340025094103e-06,
"loss": 1.456,
"step": 34
},
{
"epoch": 0.043914680050188205,
"grad_norm": 0.2265625,
"learning_rate": 9.56085319949812e-06,
"loss": 1.4607,
"step": 35
},
{
"epoch": 0.0451693851944793,
"grad_norm": 0.240234375,
"learning_rate": 9.548306148055207e-06,
"loss": 1.4645,
"step": 36
},
{
"epoch": 0.04642409033877039,
"grad_norm": 0.1953125,
"learning_rate": 9.535759096612297e-06,
"loss": 1.342,
"step": 37
},
{
"epoch": 0.04767879548306148,
"grad_norm": 0.2109375,
"learning_rate": 9.523212045169386e-06,
"loss": 1.4266,
"step": 38
},
{
"epoch": 0.04893350062735257,
"grad_norm": 0.1884765625,
"learning_rate": 9.510664993726475e-06,
"loss": 1.3732,
"step": 39
},
{
"epoch": 0.050188205771643665,
"grad_norm": 0.2333984375,
"learning_rate": 9.498117942283565e-06,
"loss": 1.5087,
"step": 40
},
{
"epoch": 0.05144291091593475,
"grad_norm": 0.2275390625,
"learning_rate": 9.485570890840653e-06,
"loss": 1.4479,
"step": 41
},
{
"epoch": 0.05269761606022585,
"grad_norm": 0.197265625,
"learning_rate": 9.473023839397743e-06,
"loss": 1.4427,
"step": 42
},
{
"epoch": 0.053952321204516936,
"grad_norm": 0.1904296875,
"learning_rate": 9.460476787954832e-06,
"loss": 1.3319,
"step": 43
},
{
"epoch": 0.05520702634880803,
"grad_norm": 0.2451171875,
"learning_rate": 9.44792973651192e-06,
"loss": 1.4298,
"step": 44
},
{
"epoch": 0.056461731493099125,
"grad_norm": 0.1875,
"learning_rate": 9.43538268506901e-06,
"loss": 1.3957,
"step": 45
},
{
"epoch": 0.05771643663739021,
"grad_norm": 0.1669921875,
"learning_rate": 9.422835633626098e-06,
"loss": 1.3846,
"step": 46
},
{
"epoch": 0.05897114178168131,
"grad_norm": 0.1708984375,
"learning_rate": 9.410288582183188e-06,
"loss": 1.2841,
"step": 47
},
{
"epoch": 0.060225846925972396,
"grad_norm": 0.185546875,
"learning_rate": 9.397741530740276e-06,
"loss": 1.3474,
"step": 48
},
{
"epoch": 0.06148055207026349,
"grad_norm": 0.1669921875,
"learning_rate": 9.385194479297366e-06,
"loss": 1.3209,
"step": 49
},
{
"epoch": 0.06273525721455459,
"grad_norm": 0.1962890625,
"learning_rate": 9.372647427854456e-06,
"loss": 1.3605,
"step": 50
},
{
"epoch": 0.06398996235884567,
"grad_norm": 0.1728515625,
"learning_rate": 9.360100376411544e-06,
"loss": 1.284,
"step": 51
},
{
"epoch": 0.06524466750313676,
"grad_norm": 0.19140625,
"learning_rate": 9.347553324968634e-06,
"loss": 1.3667,
"step": 52
},
{
"epoch": 0.06649937264742785,
"grad_norm": 0.177734375,
"learning_rate": 9.335006273525722e-06,
"loss": 1.3511,
"step": 53
},
{
"epoch": 0.06775407779171895,
"grad_norm": 0.166015625,
"learning_rate": 9.322459222082812e-06,
"loss": 1.3237,
"step": 54
},
{
"epoch": 0.06900878293601004,
"grad_norm": 0.189453125,
"learning_rate": 9.309912170639902e-06,
"loss": 1.4086,
"step": 55
},
{
"epoch": 0.07026348808030113,
"grad_norm": 0.1689453125,
"learning_rate": 9.29736511919699e-06,
"loss": 1.2681,
"step": 56
},
{
"epoch": 0.07151819322459223,
"grad_norm": 0.1640625,
"learning_rate": 9.28481806775408e-06,
"loss": 1.2862,
"step": 57
},
{
"epoch": 0.07277289836888332,
"grad_norm": 0.1767578125,
"learning_rate": 9.272271016311168e-06,
"loss": 1.2984,
"step": 58
},
{
"epoch": 0.0740276035131744,
"grad_norm": 0.1650390625,
"learning_rate": 9.259723964868258e-06,
"loss": 1.2213,
"step": 59
},
{
"epoch": 0.07528230865746549,
"grad_norm": 0.1591796875,
"learning_rate": 9.247176913425346e-06,
"loss": 1.2776,
"step": 60
},
{
"epoch": 0.07653701380175659,
"grad_norm": 0.1611328125,
"learning_rate": 9.234629861982434e-06,
"loss": 1.2724,
"step": 61
},
{
"epoch": 0.07779171894604768,
"grad_norm": 0.1591796875,
"learning_rate": 9.222082810539524e-06,
"loss": 1.2754,
"step": 62
},
{
"epoch": 0.07904642409033877,
"grad_norm": 0.1572265625,
"learning_rate": 9.209535759096612e-06,
"loss": 1.2553,
"step": 63
},
{
"epoch": 0.08030112923462986,
"grad_norm": 0.1552734375,
"learning_rate": 9.196988707653702e-06,
"loss": 1.2712,
"step": 64
},
{
"epoch": 0.08155583437892096,
"grad_norm": 0.17578125,
"learning_rate": 9.18444165621079e-06,
"loss": 1.3042,
"step": 65
},
{
"epoch": 0.08281053952321205,
"grad_norm": 0.2431640625,
"learning_rate": 9.17189460476788e-06,
"loss": 1.3251,
"step": 66
},
{
"epoch": 0.08406524466750313,
"grad_norm": 0.1845703125,
"learning_rate": 9.15934755332497e-06,
"loss": 1.2872,
"step": 67
},
{
"epoch": 0.08531994981179424,
"grad_norm": 0.1806640625,
"learning_rate": 9.146800501882058e-06,
"loss": 1.3151,
"step": 68
},
{
"epoch": 0.08657465495608532,
"grad_norm": 0.146484375,
"learning_rate": 9.134253450439148e-06,
"loss": 1.2665,
"step": 69
},
{
"epoch": 0.08782936010037641,
"grad_norm": 0.1513671875,
"learning_rate": 9.121706398996236e-06,
"loss": 1.253,
"step": 70
},
{
"epoch": 0.0890840652446675,
"grad_norm": 0.1533203125,
"learning_rate": 9.109159347553326e-06,
"loss": 1.2587,
"step": 71
},
{
"epoch": 0.0903387703889586,
"grad_norm": 0.150390625,
"learning_rate": 9.096612296110416e-06,
"loss": 1.266,
"step": 72
},
{
"epoch": 0.09159347553324969,
"grad_norm": 0.154296875,
"learning_rate": 9.084065244667504e-06,
"loss": 1.2809,
"step": 73
},
{
"epoch": 0.09284818067754078,
"grad_norm": 0.154296875,
"learning_rate": 9.071518193224594e-06,
"loss": 1.2475,
"step": 74
},
{
"epoch": 0.09410288582183186,
"grad_norm": 0.1552734375,
"learning_rate": 9.058971141781682e-06,
"loss": 1.2244,
"step": 75
},
{
"epoch": 0.09535759096612297,
"grad_norm": 0.1611328125,
"learning_rate": 9.046424090338772e-06,
"loss": 1.315,
"step": 76
},
{
"epoch": 0.09661229611041405,
"grad_norm": 0.162109375,
"learning_rate": 9.03387703889586e-06,
"loss": 1.2624,
"step": 77
},
{
"epoch": 0.09786700125470514,
"grad_norm": 0.1494140625,
"learning_rate": 9.02132998745295e-06,
"loss": 1.2259,
"step": 78
},
{
"epoch": 0.09912170639899624,
"grad_norm": 0.1494140625,
"learning_rate": 9.008782936010038e-06,
"loss": 1.2157,
"step": 79
},
{
"epoch": 0.10037641154328733,
"grad_norm": 0.1611328125,
"learning_rate": 8.996235884567126e-06,
"loss": 1.2593,
"step": 80
},
{
"epoch": 0.10163111668757842,
"grad_norm": 0.17578125,
"learning_rate": 8.983688833124216e-06,
"loss": 1.2992,
"step": 81
},
{
"epoch": 0.1028858218318695,
"grad_norm": 0.15625,
"learning_rate": 8.971141781681304e-06,
"loss": 1.2249,
"step": 82
},
{
"epoch": 0.10414052697616061,
"grad_norm": 0.15234375,
"learning_rate": 8.958594730238394e-06,
"loss": 1.2319,
"step": 83
},
{
"epoch": 0.1053952321204517,
"grad_norm": 0.154296875,
"learning_rate": 8.946047678795484e-06,
"loss": 1.2112,
"step": 84
},
{
"epoch": 0.10664993726474278,
"grad_norm": 0.1640625,
"learning_rate": 8.933500627352572e-06,
"loss": 1.264,
"step": 85
},
{
"epoch": 0.10790464240903387,
"grad_norm": 0.173828125,
"learning_rate": 8.920953575909662e-06,
"loss": 1.2381,
"step": 86
},
{
"epoch": 0.10915934755332497,
"grad_norm": 0.15625,
"learning_rate": 8.90840652446675e-06,
"loss": 1.2084,
"step": 87
},
{
"epoch": 0.11041405269761606,
"grad_norm": 0.16015625,
"learning_rate": 8.89585947302384e-06,
"loss": 1.2211,
"step": 88
},
{
"epoch": 0.11166875784190715,
"grad_norm": 0.1923828125,
"learning_rate": 8.88331242158093e-06,
"loss": 1.2351,
"step": 89
},
{
"epoch": 0.11292346298619825,
"grad_norm": 0.16015625,
"learning_rate": 8.870765370138018e-06,
"loss": 1.2459,
"step": 90
},
{
"epoch": 0.11417816813048934,
"grad_norm": 0.1640625,
"learning_rate": 8.858218318695108e-06,
"loss": 1.2756,
"step": 91
},
{
"epoch": 0.11543287327478043,
"grad_norm": 0.244140625,
"learning_rate": 8.845671267252196e-06,
"loss": 1.1769,
"step": 92
},
{
"epoch": 0.11668757841907151,
"grad_norm": 0.166015625,
"learning_rate": 8.833124215809286e-06,
"loss": 1.2025,
"step": 93
},
{
"epoch": 0.11794228356336262,
"grad_norm": 0.177734375,
"learning_rate": 8.820577164366374e-06,
"loss": 1.1827,
"step": 94
},
{
"epoch": 0.1191969887076537,
"grad_norm": 0.166015625,
"learning_rate": 8.808030112923464e-06,
"loss": 1.2374,
"step": 95
},
{
"epoch": 0.12045169385194479,
"grad_norm": 0.1572265625,
"learning_rate": 8.795483061480552e-06,
"loss": 1.2126,
"step": 96
},
{
"epoch": 0.12170639899623588,
"grad_norm": 0.1513671875,
"learning_rate": 8.782936010037642e-06,
"loss": 1.2181,
"step": 97
},
{
"epoch": 0.12296110414052698,
"grad_norm": 0.15234375,
"learning_rate": 8.77038895859473e-06,
"loss": 1.1734,
"step": 98
},
{
"epoch": 0.12421580928481807,
"grad_norm": 0.2314453125,
"learning_rate": 8.75784190715182e-06,
"loss": 1.1591,
"step": 99
},
{
"epoch": 0.12547051442910917,
"grad_norm": 0.1796875,
"learning_rate": 8.745294855708909e-06,
"loss": 1.1996,
"step": 100
},
{
"epoch": 0.12672521957340024,
"grad_norm": 0.1572265625,
"learning_rate": 8.732747804265998e-06,
"loss": 1.2239,
"step": 101
},
{
"epoch": 0.12797992471769135,
"grad_norm": 0.1533203125,
"learning_rate": 8.720200752823087e-06,
"loss": 1.1277,
"step": 102
},
{
"epoch": 0.12923462986198245,
"grad_norm": 0.1669921875,
"learning_rate": 8.707653701380176e-06,
"loss": 1.1678,
"step": 103
},
{
"epoch": 0.13048933500627352,
"grad_norm": 0.251953125,
"learning_rate": 8.695106649937265e-06,
"loss": 1.1846,
"step": 104
},
{
"epoch": 0.13174404015056462,
"grad_norm": 0.1591796875,
"learning_rate": 8.682559598494355e-06,
"loss": 1.164,
"step": 105
},
{
"epoch": 0.1329987452948557,
"grad_norm": 0.1513671875,
"learning_rate": 8.670012547051444e-06,
"loss": 1.1754,
"step": 106
},
{
"epoch": 0.1342534504391468,
"grad_norm": 0.1748046875,
"learning_rate": 8.657465495608533e-06,
"loss": 1.2038,
"step": 107
},
{
"epoch": 0.1355081555834379,
"grad_norm": 0.1728515625,
"learning_rate": 8.644918444165622e-06,
"loss": 1.2131,
"step": 108
},
{
"epoch": 0.13676286072772897,
"grad_norm": 0.1640625,
"learning_rate": 8.63237139272271e-06,
"loss": 1.2102,
"step": 109
},
{
"epoch": 0.13801756587202008,
"grad_norm": 0.158203125,
"learning_rate": 8.6198243412798e-06,
"loss": 1.2427,
"step": 110
},
{
"epoch": 0.13927227101631118,
"grad_norm": 0.158203125,
"learning_rate": 8.607277289836889e-06,
"loss": 1.1832,
"step": 111
},
{
"epoch": 0.14052697616060225,
"grad_norm": 0.16015625,
"learning_rate": 8.594730238393979e-06,
"loss": 1.2198,
"step": 112
},
{
"epoch": 0.14178168130489335,
"grad_norm": 0.16796875,
"learning_rate": 8.582183186951067e-06,
"loss": 1.117,
"step": 113
},
{
"epoch": 0.14303638644918445,
"grad_norm": 0.1591796875,
"learning_rate": 8.569636135508157e-06,
"loss": 1.166,
"step": 114
},
{
"epoch": 0.14429109159347553,
"grad_norm": 0.1552734375,
"learning_rate": 8.557089084065245e-06,
"loss": 1.1694,
"step": 115
},
{
"epoch": 0.14554579673776663,
"grad_norm": 0.1513671875,
"learning_rate": 8.544542032622335e-06,
"loss": 1.1551,
"step": 116
},
{
"epoch": 0.1468005018820577,
"grad_norm": 0.1572265625,
"learning_rate": 8.531994981179423e-06,
"loss": 1.1501,
"step": 117
},
{
"epoch": 0.1480552070263488,
"grad_norm": 0.171875,
"learning_rate": 8.519447929736513e-06,
"loss": 1.1958,
"step": 118
},
{
"epoch": 0.1493099121706399,
"grad_norm": 0.169921875,
"learning_rate": 8.506900878293601e-06,
"loss": 1.1376,
"step": 119
},
{
"epoch": 0.15056461731493098,
"grad_norm": 0.1708984375,
"learning_rate": 8.49435382685069e-06,
"loss": 1.2132,
"step": 120
},
{
"epoch": 0.15181932245922208,
"grad_norm": 0.1708984375,
"learning_rate": 8.48180677540778e-06,
"loss": 1.181,
"step": 121
},
{
"epoch": 0.15307402760351319,
"grad_norm": 0.203125,
"learning_rate": 8.469259723964869e-06,
"loss": 1.2168,
"step": 122
},
{
"epoch": 0.15432873274780426,
"grad_norm": 0.1640625,
"learning_rate": 8.456712672521959e-06,
"loss": 1.165,
"step": 123
},
{
"epoch": 0.15558343789209536,
"grad_norm": 0.181640625,
"learning_rate": 8.444165621079047e-06,
"loss": 1.1765,
"step": 124
},
{
"epoch": 0.15683814303638646,
"grad_norm": 0.1611328125,
"learning_rate": 8.431618569636137e-06,
"loss": 1.1852,
"step": 125
},
{
"epoch": 0.15809284818067754,
"grad_norm": 0.1689453125,
"learning_rate": 8.419071518193225e-06,
"loss": 1.1537,
"step": 126
},
{
"epoch": 0.15934755332496864,
"grad_norm": 0.1640625,
"learning_rate": 8.406524466750315e-06,
"loss": 1.1517,
"step": 127
},
{
"epoch": 0.1606022584692597,
"grad_norm": 0.15234375,
"learning_rate": 8.393977415307403e-06,
"loss": 1.1485,
"step": 128
},
{
"epoch": 0.1618569636135508,
"grad_norm": 0.1640625,
"learning_rate": 8.381430363864493e-06,
"loss": 1.1894,
"step": 129
},
{
"epoch": 0.16311166875784192,
"grad_norm": 0.1650390625,
"learning_rate": 8.368883312421581e-06,
"loss": 1.1028,
"step": 130
},
{
"epoch": 0.164366373902133,
"grad_norm": 0.255859375,
"learning_rate": 8.356336260978671e-06,
"loss": 1.1623,
"step": 131
},
{
"epoch": 0.1656210790464241,
"grad_norm": 0.17578125,
"learning_rate": 8.343789209535759e-06,
"loss": 1.1596,
"step": 132
},
{
"epoch": 0.1668757841907152,
"grad_norm": 0.1640625,
"learning_rate": 8.331242158092849e-06,
"loss": 1.1794,
"step": 133
},
{
"epoch": 0.16813048933500627,
"grad_norm": 0.162109375,
"learning_rate": 8.318695106649937e-06,
"loss": 1.1584,
"step": 134
},
{
"epoch": 0.16938519447929737,
"grad_norm": 0.1591796875,
"learning_rate": 8.306148055207027e-06,
"loss": 1.1652,
"step": 135
},
{
"epoch": 0.17063989962358847,
"grad_norm": 0.154296875,
"learning_rate": 8.293601003764115e-06,
"loss": 1.1535,
"step": 136
},
{
"epoch": 0.17189460476787954,
"grad_norm": 0.2021484375,
"learning_rate": 8.281053952321205e-06,
"loss": 1.1373,
"step": 137
},
{
"epoch": 0.17314930991217065,
"grad_norm": 0.1640625,
"learning_rate": 8.268506900878295e-06,
"loss": 1.1891,
"step": 138
},
{
"epoch": 0.17440401505646172,
"grad_norm": 0.166015625,
"learning_rate": 8.255959849435383e-06,
"loss": 1.1435,
"step": 139
},
{
"epoch": 0.17565872020075282,
"grad_norm": 0.1826171875,
"learning_rate": 8.243412797992473e-06,
"loss": 1.2206,
"step": 140
},
{
"epoch": 0.17691342534504392,
"grad_norm": 0.1728515625,
"learning_rate": 8.230865746549561e-06,
"loss": 1.1836,
"step": 141
},
{
"epoch": 0.178168130489335,
"grad_norm": 0.216796875,
"learning_rate": 8.218318695106651e-06,
"loss": 1.163,
"step": 142
},
{
"epoch": 0.1794228356336261,
"grad_norm": 0.16796875,
"learning_rate": 8.205771643663741e-06,
"loss": 1.1532,
"step": 143
},
{
"epoch": 0.1806775407779172,
"grad_norm": 0.185546875,
"learning_rate": 8.193224592220829e-06,
"loss": 1.1273,
"step": 144
},
{
"epoch": 0.18193224592220827,
"grad_norm": 0.1728515625,
"learning_rate": 8.180677540777919e-06,
"loss": 1.1758,
"step": 145
},
{
"epoch": 0.18318695106649938,
"grad_norm": 0.15625,
"learning_rate": 8.168130489335007e-06,
"loss": 1.1591,
"step": 146
},
{
"epoch": 0.18444165621079048,
"grad_norm": 0.166015625,
"learning_rate": 8.155583437892095e-06,
"loss": 1.1518,
"step": 147
},
{
"epoch": 0.18569636135508155,
"grad_norm": 0.1748046875,
"learning_rate": 8.143036386449185e-06,
"loss": 1.083,
"step": 148
},
{
"epoch": 0.18695106649937265,
"grad_norm": 0.17578125,
"learning_rate": 8.130489335006273e-06,
"loss": 1.2089,
"step": 149
},
{
"epoch": 0.18820577164366373,
"grad_norm": 0.171875,
"learning_rate": 8.117942283563363e-06,
"loss": 1.1721,
"step": 150
},
{
"epoch": 0.18946047678795483,
"grad_norm": 0.2060546875,
"learning_rate": 8.105395232120451e-06,
"loss": 1.0729,
"step": 151
},
{
"epoch": 0.19071518193224593,
"grad_norm": 0.1640625,
"learning_rate": 8.092848180677541e-06,
"loss": 1.1459,
"step": 152
},
{
"epoch": 0.191969887076537,
"grad_norm": 0.88671875,
"learning_rate": 8.08030112923463e-06,
"loss": 1.0966,
"step": 153
},
{
"epoch": 0.1932245922208281,
"grad_norm": 0.1748046875,
"learning_rate": 8.06775407779172e-06,
"loss": 1.1786,
"step": 154
},
{
"epoch": 0.1944792973651192,
"grad_norm": 0.1728515625,
"learning_rate": 8.05520702634881e-06,
"loss": 1.0695,
"step": 155
},
{
"epoch": 0.19573400250941028,
"grad_norm": 0.1708984375,
"learning_rate": 8.042659974905897e-06,
"loss": 1.1277,
"step": 156
},
{
"epoch": 0.19698870765370138,
"grad_norm": 0.181640625,
"learning_rate": 8.030112923462987e-06,
"loss": 1.1343,
"step": 157
},
{
"epoch": 0.19824341279799249,
"grad_norm": 0.1728515625,
"learning_rate": 8.017565872020076e-06,
"loss": 1.1395,
"step": 158
},
{
"epoch": 0.19949811794228356,
"grad_norm": 0.310546875,
"learning_rate": 8.005018820577165e-06,
"loss": 1.1521,
"step": 159
},
{
"epoch": 0.20075282308657466,
"grad_norm": 0.166015625,
"learning_rate": 7.992471769134255e-06,
"loss": 1.1283,
"step": 160
},
{
"epoch": 0.20200752823086573,
"grad_norm": 0.205078125,
"learning_rate": 7.979924717691343e-06,
"loss": 1.2036,
"step": 161
},
{
"epoch": 0.20326223337515684,
"grad_norm": 0.1953125,
"learning_rate": 7.967377666248433e-06,
"loss": 1.0924,
"step": 162
},
{
"epoch": 0.20451693851944794,
"grad_norm": 0.1875,
"learning_rate": 7.954830614805521e-06,
"loss": 1.1782,
"step": 163
},
{
"epoch": 0.205771643663739,
"grad_norm": 0.16796875,
"learning_rate": 7.942283563362611e-06,
"loss": 1.1477,
"step": 164
},
{
"epoch": 0.20702634880803011,
"grad_norm": 0.21875,
"learning_rate": 7.9297365119197e-06,
"loss": 1.0975,
"step": 165
},
{
"epoch": 0.20828105395232122,
"grad_norm": 0.173828125,
"learning_rate": 7.917189460476788e-06,
"loss": 1.1117,
"step": 166
},
{
"epoch": 0.2095357590966123,
"grad_norm": 0.2060546875,
"learning_rate": 7.904642409033878e-06,
"loss": 1.1155,
"step": 167
},
{
"epoch": 0.2107904642409034,
"grad_norm": 0.177734375,
"learning_rate": 7.892095357590966e-06,
"loss": 1.1577,
"step": 168
},
{
"epoch": 0.2120451693851945,
"grad_norm": 0.1806640625,
"learning_rate": 7.879548306148056e-06,
"loss": 1.0996,
"step": 169
},
{
"epoch": 0.21329987452948557,
"grad_norm": 0.16796875,
"learning_rate": 7.867001254705144e-06,
"loss": 1.1129,
"step": 170
},
{
"epoch": 0.21455457967377667,
"grad_norm": 0.177734375,
"learning_rate": 7.854454203262234e-06,
"loss": 1.1069,
"step": 171
},
{
"epoch": 0.21580928481806774,
"grad_norm": 0.1708984375,
"learning_rate": 7.841907151819324e-06,
"loss": 1.112,
"step": 172
},
{
"epoch": 0.21706398996235884,
"grad_norm": 0.171875,
"learning_rate": 7.829360100376412e-06,
"loss": 1.1032,
"step": 173
},
{
"epoch": 0.21831869510664995,
"grad_norm": 0.208984375,
"learning_rate": 7.816813048933502e-06,
"loss": 1.1991,
"step": 174
},
{
"epoch": 0.21957340025094102,
"grad_norm": 0.3359375,
"learning_rate": 7.80426599749059e-06,
"loss": 1.1184,
"step": 175
},
{
"epoch": 0.22082810539523212,
"grad_norm": 0.18359375,
"learning_rate": 7.79171894604768e-06,
"loss": 1.0521,
"step": 176
},
{
"epoch": 0.22208281053952322,
"grad_norm": 0.1728515625,
"learning_rate": 7.77917189460477e-06,
"loss": 1.1179,
"step": 177
},
{
"epoch": 0.2233375156838143,
"grad_norm": 0.1787109375,
"learning_rate": 7.766624843161858e-06,
"loss": 1.1509,
"step": 178
},
{
"epoch": 0.2245922208281054,
"grad_norm": 0.2734375,
"learning_rate": 7.754077791718948e-06,
"loss": 1.1356,
"step": 179
},
{
"epoch": 0.2258469259723965,
"grad_norm": 0.2138671875,
"learning_rate": 7.741530740276036e-06,
"loss": 1.1048,
"step": 180
},
{
"epoch": 0.22710163111668757,
"grad_norm": 0.228515625,
"learning_rate": 7.728983688833126e-06,
"loss": 1.1204,
"step": 181
},
{
"epoch": 0.22835633626097868,
"grad_norm": 0.2109375,
"learning_rate": 7.716436637390214e-06,
"loss": 1.1351,
"step": 182
},
{
"epoch": 0.22961104140526975,
"grad_norm": 0.203125,
"learning_rate": 7.703889585947302e-06,
"loss": 1.14,
"step": 183
},
{
"epoch": 0.23086574654956085,
"grad_norm": 0.16015625,
"learning_rate": 7.691342534504392e-06,
"loss": 1.0908,
"step": 184
},
{
"epoch": 0.23212045169385195,
"grad_norm": 0.185546875,
"learning_rate": 7.67879548306148e-06,
"loss": 1.073,
"step": 185
},
{
"epoch": 0.23337515683814303,
"grad_norm": 0.1748046875,
"learning_rate": 7.66624843161857e-06,
"loss": 1.1242,
"step": 186
},
{
"epoch": 0.23462986198243413,
"grad_norm": 0.177734375,
"learning_rate": 7.65370138017566e-06,
"loss": 1.0843,
"step": 187
},
{
"epoch": 0.23588456712672523,
"grad_norm": 0.1767578125,
"learning_rate": 7.641154328732748e-06,
"loss": 1.1049,
"step": 188
},
{
"epoch": 0.2371392722710163,
"grad_norm": 0.1884765625,
"learning_rate": 7.628607277289838e-06,
"loss": 1.1389,
"step": 189
},
{
"epoch": 0.2383939774153074,
"grad_norm": 0.1650390625,
"learning_rate": 7.616060225846926e-06,
"loss": 1.1214,
"step": 190
},
{
"epoch": 0.2396486825595985,
"grad_norm": 0.171875,
"learning_rate": 7.603513174404016e-06,
"loss": 1.1295,
"step": 191
},
{
"epoch": 0.24090338770388958,
"grad_norm": 0.1689453125,
"learning_rate": 7.590966122961104e-06,
"loss": 1.1018,
"step": 192
},
{
"epoch": 0.24215809284818068,
"grad_norm": 0.1708984375,
"learning_rate": 7.578419071518194e-06,
"loss": 1.0607,
"step": 193
},
{
"epoch": 0.24341279799247176,
"grad_norm": 0.1884765625,
"learning_rate": 7.565872020075283e-06,
"loss": 1.2116,
"step": 194
},
{
"epoch": 0.24466750313676286,
"grad_norm": 0.1767578125,
"learning_rate": 7.553324968632372e-06,
"loss": 1.0891,
"step": 195
},
{
"epoch": 0.24592220828105396,
"grad_norm": 0.18359375,
"learning_rate": 7.540777917189461e-06,
"loss": 1.217,
"step": 196
},
{
"epoch": 0.24717691342534504,
"grad_norm": 0.1708984375,
"learning_rate": 7.52823086574655e-06,
"loss": 1.1137,
"step": 197
},
{
"epoch": 0.24843161856963614,
"grad_norm": 0.1796875,
"learning_rate": 7.515683814303639e-06,
"loss": 1.0802,
"step": 198
},
{
"epoch": 0.24968632371392724,
"grad_norm": 0.1806640625,
"learning_rate": 7.503136762860729e-06,
"loss": 1.0763,
"step": 199
},
{
"epoch": 0.25094102885821834,
"grad_norm": 0.1943359375,
"learning_rate": 7.490589711417817e-06,
"loss": 1.0652,
"step": 200
},
{
"epoch": 0.2521957340025094,
"grad_norm": 0.1787109375,
"learning_rate": 7.478042659974907e-06,
"loss": 1.0527,
"step": 201
},
{
"epoch": 0.2534504391468005,
"grad_norm": 0.189453125,
"learning_rate": 7.465495608531995e-06,
"loss": 1.1129,
"step": 202
},
{
"epoch": 0.2547051442910916,
"grad_norm": 0.189453125,
"learning_rate": 7.452948557089085e-06,
"loss": 1.0906,
"step": 203
},
{
"epoch": 0.2559598494353827,
"grad_norm": 0.181640625,
"learning_rate": 7.440401505646174e-06,
"loss": 1.0946,
"step": 204
},
{
"epoch": 0.2572145545796738,
"grad_norm": 0.169921875,
"learning_rate": 7.427854454203262e-06,
"loss": 1.0753,
"step": 205
},
{
"epoch": 0.2584692597239649,
"grad_norm": 0.189453125,
"learning_rate": 7.415307402760352e-06,
"loss": 1.143,
"step": 206
},
{
"epoch": 0.25972396486825594,
"grad_norm": 0.185546875,
"learning_rate": 7.40276035131744e-06,
"loss": 1.0865,
"step": 207
},
{
"epoch": 0.26097867001254704,
"grad_norm": 0.1845703125,
"learning_rate": 7.39021329987453e-06,
"loss": 1.1062,
"step": 208
},
{
"epoch": 0.26223337515683814,
"grad_norm": 0.220703125,
"learning_rate": 7.37766624843162e-06,
"loss": 1.0765,
"step": 209
},
{
"epoch": 0.26348808030112925,
"grad_norm": 0.267578125,
"learning_rate": 7.365119196988708e-06,
"loss": 1.0659,
"step": 210
},
{
"epoch": 0.26474278544542035,
"grad_norm": 0.205078125,
"learning_rate": 7.352572145545797e-06,
"loss": 1.0919,
"step": 211
},
{
"epoch": 0.2659974905897114,
"grad_norm": 0.1884765625,
"learning_rate": 7.340025094102886e-06,
"loss": 1.0782,
"step": 212
},
{
"epoch": 0.2672521957340025,
"grad_norm": 0.1826171875,
"learning_rate": 7.327478042659975e-06,
"loss": 1.0881,
"step": 213
},
{
"epoch": 0.2685069008782936,
"grad_norm": 0.1767578125,
"learning_rate": 7.314930991217064e-06,
"loss": 1.1003,
"step": 214
},
{
"epoch": 0.2697616060225847,
"grad_norm": 0.203125,
"learning_rate": 7.302383939774153e-06,
"loss": 1.1336,
"step": 215
},
{
"epoch": 0.2710163111668758,
"grad_norm": 0.1796875,
"learning_rate": 7.289836888331243e-06,
"loss": 1.0792,
"step": 216
},
{
"epoch": 0.2722710163111669,
"grad_norm": 0.1689453125,
"learning_rate": 7.2772898368883315e-06,
"loss": 1.0272,
"step": 217
},
{
"epoch": 0.27352572145545795,
"grad_norm": 0.283203125,
"learning_rate": 7.264742785445421e-06,
"loss": 1.0626,
"step": 218
},
{
"epoch": 0.27478042659974905,
"grad_norm": 0.1865234375,
"learning_rate": 7.2521957340025095e-06,
"loss": 1.0846,
"step": 219
},
{
"epoch": 0.27603513174404015,
"grad_norm": 0.181640625,
"learning_rate": 7.239648682559599e-06,
"loss": 1.0261,
"step": 220
},
{
"epoch": 0.27728983688833125,
"grad_norm": 0.1845703125,
"learning_rate": 7.2271016311166884e-06,
"loss": 1.0981,
"step": 221
},
{
"epoch": 0.27854454203262236,
"grad_norm": 0.1943359375,
"learning_rate": 7.2145545796737775e-06,
"loss": 1.1451,
"step": 222
},
{
"epoch": 0.2797992471769134,
"grad_norm": 0.1787109375,
"learning_rate": 7.2020075282308665e-06,
"loss": 1.0633,
"step": 223
},
{
"epoch": 0.2810539523212045,
"grad_norm": 0.1806640625,
"learning_rate": 7.189460476787955e-06,
"loss": 1.0107,
"step": 224
},
{
"epoch": 0.2823086574654956,
"grad_norm": 0.232421875,
"learning_rate": 7.1769134253450445e-06,
"loss": 1.077,
"step": 225
},
{
"epoch": 0.2835633626097867,
"grad_norm": 0.177734375,
"learning_rate": 7.164366373902134e-06,
"loss": 1.0807,
"step": 226
},
{
"epoch": 0.2848180677540778,
"grad_norm": 0.2080078125,
"learning_rate": 7.151819322459223e-06,
"loss": 1.0934,
"step": 227
},
{
"epoch": 0.2860727728983689,
"grad_norm": 0.1923828125,
"learning_rate": 7.1392722710163125e-06,
"loss": 1.06,
"step": 228
},
{
"epoch": 0.28732747804265996,
"grad_norm": 0.1689453125,
"learning_rate": 7.126725219573401e-06,
"loss": 1.0939,
"step": 229
},
{
"epoch": 0.28858218318695106,
"grad_norm": 0.1982421875,
"learning_rate": 7.11417816813049e-06,
"loss": 1.099,
"step": 230
},
{
"epoch": 0.28983688833124216,
"grad_norm": 0.1904296875,
"learning_rate": 7.1016311166875795e-06,
"loss": 1.1347,
"step": 231
},
{
"epoch": 0.29109159347553326,
"grad_norm": 0.1884765625,
"learning_rate": 7.089084065244668e-06,
"loss": 1.1201,
"step": 232
},
{
"epoch": 0.29234629861982436,
"grad_norm": 0.212890625,
"learning_rate": 7.076537013801758e-06,
"loss": 1.1353,
"step": 233
},
{
"epoch": 0.2936010037641154,
"grad_norm": 0.1953125,
"learning_rate": 7.063989962358846e-06,
"loss": 1.107,
"step": 234
},
{
"epoch": 0.2948557089084065,
"grad_norm": 0.2119140625,
"learning_rate": 7.051442910915936e-06,
"loss": 1.0844,
"step": 235
},
{
"epoch": 0.2961104140526976,
"grad_norm": 0.330078125,
"learning_rate": 7.038895859473024e-06,
"loss": 1.1499,
"step": 236
},
{
"epoch": 0.2973651191969887,
"grad_norm": 0.201171875,
"learning_rate": 7.026348808030114e-06,
"loss": 1.1032,
"step": 237
},
{
"epoch": 0.2986198243412798,
"grad_norm": 0.1923828125,
"learning_rate": 7.013801756587203e-06,
"loss": 1.0374,
"step": 238
},
{
"epoch": 0.2998745294855709,
"grad_norm": 0.1923828125,
"learning_rate": 7.001254705144292e-06,
"loss": 1.0793,
"step": 239
},
{
"epoch": 0.30112923462986196,
"grad_norm": 0.181640625,
"learning_rate": 6.988707653701381e-06,
"loss": 1.0437,
"step": 240
},
{
"epoch": 0.30238393977415307,
"grad_norm": 0.201171875,
"learning_rate": 6.976160602258469e-06,
"loss": 1.056,
"step": 241
},
{
"epoch": 0.30363864491844417,
"grad_norm": 0.212890625,
"learning_rate": 6.963613550815559e-06,
"loss": 1.0532,
"step": 242
},
{
"epoch": 0.30489335006273527,
"grad_norm": 0.1787109375,
"learning_rate": 6.951066499372649e-06,
"loss": 1.0622,
"step": 243
},
{
"epoch": 0.30614805520702637,
"grad_norm": 0.1826171875,
"learning_rate": 6.938519447929737e-06,
"loss": 1.0593,
"step": 244
},
{
"epoch": 0.3074027603513174,
"grad_norm": 0.2158203125,
"learning_rate": 6.925972396486827e-06,
"loss": 1.143,
"step": 245
},
{
"epoch": 0.3086574654956085,
"grad_norm": 0.20703125,
"learning_rate": 6.913425345043915e-06,
"loss": 1.1176,
"step": 246
},
{
"epoch": 0.3099121706398996,
"grad_norm": 0.2060546875,
"learning_rate": 6.900878293601004e-06,
"loss": 1.0449,
"step": 247
},
{
"epoch": 0.3111668757841907,
"grad_norm": 0.197265625,
"learning_rate": 6.888331242158094e-06,
"loss": 1.1068,
"step": 248
},
{
"epoch": 0.3124215809284818,
"grad_norm": 0.203125,
"learning_rate": 6.875784190715182e-06,
"loss": 1.0431,
"step": 249
},
{
"epoch": 0.3136762860727729,
"grad_norm": 0.1962890625,
"learning_rate": 6.863237139272272e-06,
"loss": 1.0766,
"step": 250
},
{
"epoch": 0.31493099121706397,
"grad_norm": 0.22265625,
"learning_rate": 6.85069008782936e-06,
"loss": 1.1234,
"step": 251
},
{
"epoch": 0.3161856963613551,
"grad_norm": 0.19140625,
"learning_rate": 6.83814303638645e-06,
"loss": 1.0631,
"step": 252
},
{
"epoch": 0.3174404015056462,
"grad_norm": 0.185546875,
"learning_rate": 6.825595984943539e-06,
"loss": 1.0232,
"step": 253
},
{
"epoch": 0.3186951066499373,
"grad_norm": 0.216796875,
"learning_rate": 6.813048933500628e-06,
"loss": 1.0325,
"step": 254
},
{
"epoch": 0.3199498117942284,
"grad_norm": 0.1806640625,
"learning_rate": 6.800501882057717e-06,
"loss": 1.0789,
"step": 255
},
{
"epoch": 0.3212045169385194,
"grad_norm": 0.1796875,
"learning_rate": 6.787954830614806e-06,
"loss": 1.0603,
"step": 256
},
{
"epoch": 0.3224592220828105,
"grad_norm": 0.173828125,
"learning_rate": 6.775407779171895e-06,
"loss": 1.0139,
"step": 257
},
{
"epoch": 0.3237139272271016,
"grad_norm": 0.177734375,
"learning_rate": 6.762860727728984e-06,
"loss": 1.074,
"step": 258
},
{
"epoch": 0.32496863237139273,
"grad_norm": 0.193359375,
"learning_rate": 6.750313676286073e-06,
"loss": 1.1028,
"step": 259
},
{
"epoch": 0.32622333751568383,
"grad_norm": 0.19140625,
"learning_rate": 6.737766624843163e-06,
"loss": 1.0827,
"step": 260
},
{
"epoch": 0.32747804265997493,
"grad_norm": 0.19921875,
"learning_rate": 6.725219573400251e-06,
"loss": 1.125,
"step": 261
},
{
"epoch": 0.328732747804266,
"grad_norm": 0.189453125,
"learning_rate": 6.712672521957341e-06,
"loss": 1.0249,
"step": 262
},
{
"epoch": 0.3299874529485571,
"grad_norm": 0.197265625,
"learning_rate": 6.700125470514429e-06,
"loss": 1.1003,
"step": 263
},
{
"epoch": 0.3312421580928482,
"grad_norm": 0.224609375,
"learning_rate": 6.687578419071519e-06,
"loss": 1.1257,
"step": 264
},
{
"epoch": 0.3324968632371393,
"grad_norm": 0.1826171875,
"learning_rate": 6.675031367628608e-06,
"loss": 1.0234,
"step": 265
},
{
"epoch": 0.3337515683814304,
"grad_norm": 0.19921875,
"learning_rate": 6.662484316185696e-06,
"loss": 1.0556,
"step": 266
},
{
"epoch": 0.33500627352572143,
"grad_norm": 0.197265625,
"learning_rate": 6.649937264742786e-06,
"loss": 1.0651,
"step": 267
},
{
"epoch": 0.33626097867001253,
"grad_norm": 0.1845703125,
"learning_rate": 6.637390213299874e-06,
"loss": 1.0517,
"step": 268
},
{
"epoch": 0.33751568381430364,
"grad_norm": 0.1884765625,
"learning_rate": 6.624843161856964e-06,
"loss": 1.1119,
"step": 269
},
{
"epoch": 0.33877038895859474,
"grad_norm": 0.2109375,
"learning_rate": 6.612296110414054e-06,
"loss": 1.0818,
"step": 270
},
{
"epoch": 0.34002509410288584,
"grad_norm": 0.18359375,
"learning_rate": 6.599749058971142e-06,
"loss": 1.0845,
"step": 271
},
{
"epoch": 0.34127979924717694,
"grad_norm": 0.203125,
"learning_rate": 6.587202007528231e-06,
"loss": 1.0434,
"step": 272
},
{
"epoch": 0.342534504391468,
"grad_norm": 0.1865234375,
"learning_rate": 6.57465495608532e-06,
"loss": 1.0789,
"step": 273
},
{
"epoch": 0.3437892095357591,
"grad_norm": 0.2177734375,
"learning_rate": 6.562107904642409e-06,
"loss": 1.1049,
"step": 274
},
{
"epoch": 0.3450439146800502,
"grad_norm": 0.1904296875,
"learning_rate": 6.549560853199499e-06,
"loss": 1.0722,
"step": 275
},
{
"epoch": 0.3462986198243413,
"grad_norm": 0.1904296875,
"learning_rate": 6.5370138017565874e-06,
"loss": 1.0139,
"step": 276
},
{
"epoch": 0.3475533249686324,
"grad_norm": 0.326171875,
"learning_rate": 6.524466750313677e-06,
"loss": 1.061,
"step": 277
},
{
"epoch": 0.34880803011292344,
"grad_norm": 0.1884765625,
"learning_rate": 6.5119196988707655e-06,
"loss": 1.0736,
"step": 278
},
{
"epoch": 0.35006273525721454,
"grad_norm": 0.296875,
"learning_rate": 6.499372647427855e-06,
"loss": 1.0735,
"step": 279
},
{
"epoch": 0.35131744040150564,
"grad_norm": 0.2177734375,
"learning_rate": 6.4868255959849435e-06,
"loss": 1.1039,
"step": 280
},
{
"epoch": 0.35257214554579674,
"grad_norm": 0.3203125,
"learning_rate": 6.474278544542033e-06,
"loss": 1.0102,
"step": 281
},
{
"epoch": 0.35382685069008785,
"grad_norm": 0.1943359375,
"learning_rate": 6.4617314930991224e-06,
"loss": 1.1021,
"step": 282
},
{
"epoch": 0.35508155583437895,
"grad_norm": 0.1982421875,
"learning_rate": 6.4491844416562115e-06,
"loss": 1.0137,
"step": 283
},
{
"epoch": 0.35633626097867,
"grad_norm": 0.1845703125,
"learning_rate": 6.4366373902133005e-06,
"loss": 1.1087,
"step": 284
},
{
"epoch": 0.3575909661229611,
"grad_norm": 0.1982421875,
"learning_rate": 6.424090338770389e-06,
"loss": 1.0688,
"step": 285
},
{
"epoch": 0.3588456712672522,
"grad_norm": 0.193359375,
"learning_rate": 6.4115432873274786e-06,
"loss": 1.087,
"step": 286
},
{
"epoch": 0.3601003764115433,
"grad_norm": 0.3359375,
"learning_rate": 6.3989962358845684e-06,
"loss": 1.1308,
"step": 287
},
{
"epoch": 0.3613550815558344,
"grad_norm": 0.208984375,
"learning_rate": 6.386449184441657e-06,
"loss": 1.1111,
"step": 288
},
{
"epoch": 0.36260978670012545,
"grad_norm": 0.24609375,
"learning_rate": 6.373902132998746e-06,
"loss": 1.0542,
"step": 289
},
{
"epoch": 0.36386449184441655,
"grad_norm": 0.20703125,
"learning_rate": 6.361355081555835e-06,
"loss": 1.0971,
"step": 290
},
{
"epoch": 0.36511919698870765,
"grad_norm": 0.1982421875,
"learning_rate": 6.348808030112924e-06,
"loss": 1.0325,
"step": 291
},
{
"epoch": 0.36637390213299875,
"grad_norm": 0.1962890625,
"learning_rate": 6.3362609786700136e-06,
"loss": 1.044,
"step": 292
},
{
"epoch": 0.36762860727728985,
"grad_norm": 0.7265625,
"learning_rate": 6.323713927227102e-06,
"loss": 1.1222,
"step": 293
},
{
"epoch": 0.36888331242158096,
"grad_norm": 0.18359375,
"learning_rate": 6.311166875784192e-06,
"loss": 1.0249,
"step": 294
},
{
"epoch": 0.370138017565872,
"grad_norm": 0.1904296875,
"learning_rate": 6.29861982434128e-06,
"loss": 1.0361,
"step": 295
},
{
"epoch": 0.3713927227101631,
"grad_norm": 0.19140625,
"learning_rate": 6.28607277289837e-06,
"loss": 1.1183,
"step": 296
},
{
"epoch": 0.3726474278544542,
"grad_norm": 0.220703125,
"learning_rate": 6.273525721455459e-06,
"loss": 1.0808,
"step": 297
},
{
"epoch": 0.3739021329987453,
"grad_norm": 0.1953125,
"learning_rate": 6.260978670012548e-06,
"loss": 1.0988,
"step": 298
},
{
"epoch": 0.3751568381430364,
"grad_norm": 0.1904296875,
"learning_rate": 6.248431618569637e-06,
"loss": 1.0883,
"step": 299
},
{
"epoch": 0.37641154328732745,
"grad_norm": 0.185546875,
"learning_rate": 6.235884567126726e-06,
"loss": 1.053,
"step": 300
},
{
"epoch": 0.37766624843161856,
"grad_norm": 0.25,
"learning_rate": 6.223337515683815e-06,
"loss": 1.1065,
"step": 301
},
{
"epoch": 0.37892095357590966,
"grad_norm": 0.1953125,
"learning_rate": 6.210790464240903e-06,
"loss": 1.0535,
"step": 302
},
{
"epoch": 0.38017565872020076,
"grad_norm": 0.22265625,
"learning_rate": 6.198243412797993e-06,
"loss": 1.1443,
"step": 303
},
{
"epoch": 0.38143036386449186,
"grad_norm": 0.18359375,
"learning_rate": 6.185696361355083e-06,
"loss": 1.0134,
"step": 304
},
{
"epoch": 0.38268506900878296,
"grad_norm": 0.2109375,
"learning_rate": 6.173149309912171e-06,
"loss": 1.0806,
"step": 305
},
{
"epoch": 0.383939774153074,
"grad_norm": 0.2041015625,
"learning_rate": 6.160602258469261e-06,
"loss": 1.0563,
"step": 306
},
{
"epoch": 0.3851944792973651,
"grad_norm": 0.1904296875,
"learning_rate": 6.148055207026349e-06,
"loss": 1.0994,
"step": 307
},
{
"epoch": 0.3864491844416562,
"grad_norm": 0.203125,
"learning_rate": 6.135508155583438e-06,
"loss": 1.0843,
"step": 308
},
{
"epoch": 0.3877038895859473,
"grad_norm": 0.248046875,
"learning_rate": 6.122961104140528e-06,
"loss": 0.9856,
"step": 309
},
{
"epoch": 0.3889585947302384,
"grad_norm": 0.357421875,
"learning_rate": 6.110414052697616e-06,
"loss": 1.0472,
"step": 310
},
{
"epoch": 0.39021329987452946,
"grad_norm": 0.1923828125,
"learning_rate": 6.097867001254706e-06,
"loss": 1.0711,
"step": 311
},
{
"epoch": 0.39146800501882056,
"grad_norm": 0.197265625,
"learning_rate": 6.085319949811794e-06,
"loss": 1.1034,
"step": 312
},
{
"epoch": 0.39272271016311167,
"grad_norm": 0.2060546875,
"learning_rate": 6.072772898368884e-06,
"loss": 1.0568,
"step": 313
},
{
"epoch": 0.39397741530740277,
"grad_norm": 0.2373046875,
"learning_rate": 6.060225846925973e-06,
"loss": 1.0413,
"step": 314
},
{
"epoch": 0.39523212045169387,
"grad_norm": 0.2314453125,
"learning_rate": 6.047678795483062e-06,
"loss": 1.0047,
"step": 315
},
{
"epoch": 0.39648682559598497,
"grad_norm": 0.1923828125,
"learning_rate": 6.035131744040151e-06,
"loss": 1.0121,
"step": 316
},
{
"epoch": 0.397741530740276,
"grad_norm": 0.2470703125,
"learning_rate": 6.02258469259724e-06,
"loss": 1.0732,
"step": 317
},
{
"epoch": 0.3989962358845671,
"grad_norm": 0.1982421875,
"learning_rate": 6.010037641154329e-06,
"loss": 1.092,
"step": 318
},
{
"epoch": 0.4002509410288582,
"grad_norm": 0.19140625,
"learning_rate": 5.997490589711419e-06,
"loss": 1.0383,
"step": 319
},
{
"epoch": 0.4015056461731493,
"grad_norm": 0.2119140625,
"learning_rate": 5.984943538268507e-06,
"loss": 1.069,
"step": 320
},
{
"epoch": 0.4027603513174404,
"grad_norm": 0.1953125,
"learning_rate": 5.972396486825597e-06,
"loss": 0.9865,
"step": 321
},
{
"epoch": 0.40401505646173147,
"grad_norm": 0.1904296875,
"learning_rate": 5.959849435382685e-06,
"loss": 1.0732,
"step": 322
},
{
"epoch": 0.40526976160602257,
"grad_norm": 0.2158203125,
"learning_rate": 5.947302383939775e-06,
"loss": 1.1649,
"step": 323
},
{
"epoch": 0.4065244667503137,
"grad_norm": 0.1884765625,
"learning_rate": 5.934755332496863e-06,
"loss": 1.0262,
"step": 324
},
{
"epoch": 0.4077791718946048,
"grad_norm": 0.1923828125,
"learning_rate": 5.922208281053953e-06,
"loss": 1.0525,
"step": 325
},
{
"epoch": 0.4090338770388959,
"grad_norm": 0.1845703125,
"learning_rate": 5.909661229611042e-06,
"loss": 1.0554,
"step": 326
},
{
"epoch": 0.410288582183187,
"grad_norm": 0.1943359375,
"learning_rate": 5.89711417816813e-06,
"loss": 1.0965,
"step": 327
},
{
"epoch": 0.411543287327478,
"grad_norm": 0.2021484375,
"learning_rate": 5.88456712672522e-06,
"loss": 1.0792,
"step": 328
},
{
"epoch": 0.4127979924717691,
"grad_norm": 0.1953125,
"learning_rate": 5.872020075282308e-06,
"loss": 1.056,
"step": 329
},
{
"epoch": 0.41405269761606023,
"grad_norm": 0.19140625,
"learning_rate": 5.859473023839398e-06,
"loss": 1.0459,
"step": 330
},
{
"epoch": 0.41530740276035133,
"grad_norm": 0.19921875,
"learning_rate": 5.846925972396488e-06,
"loss": 1.0796,
"step": 331
},
{
"epoch": 0.41656210790464243,
"grad_norm": 0.21875,
"learning_rate": 5.834378920953576e-06,
"loss": 1.0094,
"step": 332
},
{
"epoch": 0.4178168130489335,
"grad_norm": 0.208984375,
"learning_rate": 5.821831869510665e-06,
"loss": 1.0475,
"step": 333
},
{
"epoch": 0.4190715181932246,
"grad_norm": 0.32421875,
"learning_rate": 5.809284818067754e-06,
"loss": 1.0524,
"step": 334
},
{
"epoch": 0.4203262233375157,
"grad_norm": 0.1845703125,
"learning_rate": 5.796737766624843e-06,
"loss": 1.0325,
"step": 335
},
{
"epoch": 0.4215809284818068,
"grad_norm": 0.203125,
"learning_rate": 5.784190715181933e-06,
"loss": 1.0461,
"step": 336
},
{
"epoch": 0.4228356336260979,
"grad_norm": 0.2265625,
"learning_rate": 5.7716436637390215e-06,
"loss": 1.0828,
"step": 337
},
{
"epoch": 0.424090338770389,
"grad_norm": 0.2060546875,
"learning_rate": 5.759096612296111e-06,
"loss": 1.119,
"step": 338
},
{
"epoch": 0.42534504391468003,
"grad_norm": 0.1962890625,
"learning_rate": 5.7465495608531995e-06,
"loss": 1.025,
"step": 339
},
{
"epoch": 0.42659974905897113,
"grad_norm": 0.20703125,
"learning_rate": 5.734002509410289e-06,
"loss": 1.0919,
"step": 340
},
{
"epoch": 0.42785445420326224,
"grad_norm": 0.2373046875,
"learning_rate": 5.721455457967378e-06,
"loss": 1.081,
"step": 341
},
{
"epoch": 0.42910915934755334,
"grad_norm": 0.287109375,
"learning_rate": 5.7089084065244674e-06,
"loss": 1.0661,
"step": 342
},
{
"epoch": 0.43036386449184444,
"grad_norm": 0.1923828125,
"learning_rate": 5.6963613550815565e-06,
"loss": 1.059,
"step": 343
},
{
"epoch": 0.4316185696361355,
"grad_norm": 0.1982421875,
"learning_rate": 5.683814303638645e-06,
"loss": 1.0253,
"step": 344
},
{
"epoch": 0.4328732747804266,
"grad_norm": 0.201171875,
"learning_rate": 5.6712672521957345e-06,
"loss": 1.0943,
"step": 345
},
{
"epoch": 0.4341279799247177,
"grad_norm": 0.2236328125,
"learning_rate": 5.658720200752823e-06,
"loss": 1.0899,
"step": 346
},
{
"epoch": 0.4353826850690088,
"grad_norm": 0.2333984375,
"learning_rate": 5.6461731493099126e-06,
"loss": 1.0111,
"step": 347
},
{
"epoch": 0.4366373902132999,
"grad_norm": 0.1953125,
"learning_rate": 5.6336260978670024e-06,
"loss": 1.0214,
"step": 348
},
{
"epoch": 0.437892095357591,
"grad_norm": 0.1982421875,
"learning_rate": 5.621079046424091e-06,
"loss": 1.1007,
"step": 349
},
{
"epoch": 0.43914680050188204,
"grad_norm": 0.197265625,
"learning_rate": 5.60853199498118e-06,
"loss": 1.1226,
"step": 350
},
{
"epoch": 0.44040150564617314,
"grad_norm": 0.197265625,
"learning_rate": 5.595984943538269e-06,
"loss": 1.0233,
"step": 351
},
{
"epoch": 0.44165621079046424,
"grad_norm": 0.201171875,
"learning_rate": 5.583437892095358e-06,
"loss": 1.0142,
"step": 352
},
{
"epoch": 0.44291091593475534,
"grad_norm": 0.2314453125,
"learning_rate": 5.5708908406524476e-06,
"loss": 1.0528,
"step": 353
},
{
"epoch": 0.44416562107904645,
"grad_norm": 0.234375,
"learning_rate": 5.558343789209536e-06,
"loss": 1.0586,
"step": 354
},
{
"epoch": 0.4454203262233375,
"grad_norm": 0.22265625,
"learning_rate": 5.545796737766626e-06,
"loss": 1.0455,
"step": 355
},
{
"epoch": 0.4466750313676286,
"grad_norm": 0.1943359375,
"learning_rate": 5.533249686323714e-06,
"loss": 1.0394,
"step": 356
},
{
"epoch": 0.4479297365119197,
"grad_norm": 0.1953125,
"learning_rate": 5.520702634880804e-06,
"loss": 1.055,
"step": 357
},
{
"epoch": 0.4491844416562108,
"grad_norm": 0.1953125,
"learning_rate": 5.508155583437893e-06,
"loss": 1.0775,
"step": 358
},
{
"epoch": 0.4504391468005019,
"grad_norm": 0.2001953125,
"learning_rate": 5.495608531994982e-06,
"loss": 1.0259,
"step": 359
},
{
"epoch": 0.451693851944793,
"grad_norm": 0.2041015625,
"learning_rate": 5.483061480552071e-06,
"loss": 1.0071,
"step": 360
},
{
"epoch": 0.45294855708908405,
"grad_norm": 0.203125,
"learning_rate": 5.47051442910916e-06,
"loss": 1.0958,
"step": 361
},
{
"epoch": 0.45420326223337515,
"grad_norm": 0.1875,
"learning_rate": 5.457967377666249e-06,
"loss": 1.0126,
"step": 362
},
{
"epoch": 0.45545796737766625,
"grad_norm": 0.19921875,
"learning_rate": 5.445420326223339e-06,
"loss": 1.0417,
"step": 363
},
{
"epoch": 0.45671267252195735,
"grad_norm": 0.2197265625,
"learning_rate": 5.432873274780427e-06,
"loss": 1.1278,
"step": 364
},
{
"epoch": 0.45796737766624845,
"grad_norm": 0.236328125,
"learning_rate": 5.420326223337517e-06,
"loss": 1.0332,
"step": 365
},
{
"epoch": 0.4592220828105395,
"grad_norm": 0.1923828125,
"learning_rate": 5.407779171894605e-06,
"loss": 1.0645,
"step": 366
},
{
"epoch": 0.4604767879548306,
"grad_norm": 0.2333984375,
"learning_rate": 5.395232120451695e-06,
"loss": 1.0541,
"step": 367
},
{
"epoch": 0.4617314930991217,
"grad_norm": 0.2041015625,
"learning_rate": 5.382685069008783e-06,
"loss": 1.05,
"step": 368
},
{
"epoch": 0.4629861982434128,
"grad_norm": 0.193359375,
"learning_rate": 5.370138017565872e-06,
"loss": 1.0079,
"step": 369
},
{
"epoch": 0.4642409033877039,
"grad_norm": 0.1943359375,
"learning_rate": 5.357590966122962e-06,
"loss": 1.0665,
"step": 370
},
{
"epoch": 0.465495608531995,
"grad_norm": 0.2080078125,
"learning_rate": 5.34504391468005e-06,
"loss": 1.0672,
"step": 371
},
{
"epoch": 0.46675031367628605,
"grad_norm": 0.1982421875,
"learning_rate": 5.33249686323714e-06,
"loss": 1.0404,
"step": 372
},
{
"epoch": 0.46800501882057716,
"grad_norm": 0.19140625,
"learning_rate": 5.319949811794228e-06,
"loss": 1.049,
"step": 373
},
{
"epoch": 0.46925972396486826,
"grad_norm": 0.2021484375,
"learning_rate": 5.307402760351318e-06,
"loss": 1.0869,
"step": 374
},
{
"epoch": 0.47051442910915936,
"grad_norm": 0.259765625,
"learning_rate": 5.294855708908407e-06,
"loss": 1.0311,
"step": 375
},
{
"epoch": 0.47176913425345046,
"grad_norm": 0.203125,
"learning_rate": 5.282308657465496e-06,
"loss": 0.9855,
"step": 376
},
{
"epoch": 0.4730238393977415,
"grad_norm": 0.2080078125,
"learning_rate": 5.269761606022585e-06,
"loss": 1.1071,
"step": 377
},
{
"epoch": 0.4742785445420326,
"grad_norm": 0.2158203125,
"learning_rate": 5.257214554579674e-06,
"loss": 1.0345,
"step": 378
},
{
"epoch": 0.4755332496863237,
"grad_norm": 0.205078125,
"learning_rate": 5.244667503136763e-06,
"loss": 1.0342,
"step": 379
},
{
"epoch": 0.4767879548306148,
"grad_norm": 0.21484375,
"learning_rate": 5.232120451693853e-06,
"loss": 1.1377,
"step": 380
},
{
"epoch": 0.4780426599749059,
"grad_norm": 0.1943359375,
"learning_rate": 5.219573400250941e-06,
"loss": 1.0074,
"step": 381
},
{
"epoch": 0.479297365119197,
"grad_norm": 0.20703125,
"learning_rate": 5.207026348808031e-06,
"loss": 1.1016,
"step": 382
},
{
"epoch": 0.48055207026348806,
"grad_norm": 0.212890625,
"learning_rate": 5.194479297365119e-06,
"loss": 0.9935,
"step": 383
},
{
"epoch": 0.48180677540777916,
"grad_norm": 0.357421875,
"learning_rate": 5.181932245922209e-06,
"loss": 0.9686,
"step": 384
},
{
"epoch": 0.48306148055207027,
"grad_norm": 0.2060546875,
"learning_rate": 5.169385194479298e-06,
"loss": 1.0541,
"step": 385
},
{
"epoch": 0.48431618569636137,
"grad_norm": 0.2080078125,
"learning_rate": 5.156838143036387e-06,
"loss": 1.0873,
"step": 386
},
{
"epoch": 0.48557089084065247,
"grad_norm": 0.2060546875,
"learning_rate": 5.144291091593476e-06,
"loss": 1.0733,
"step": 387
},
{
"epoch": 0.4868255959849435,
"grad_norm": 0.2119140625,
"learning_rate": 5.131744040150564e-06,
"loss": 1.0454,
"step": 388
},
{
"epoch": 0.4880803011292346,
"grad_norm": 0.2138671875,
"learning_rate": 5.119196988707654e-06,
"loss": 1.0617,
"step": 389
},
{
"epoch": 0.4893350062735257,
"grad_norm": 0.1982421875,
"learning_rate": 5.106649937264742e-06,
"loss": 1.0762,
"step": 390
},
{
"epoch": 0.4905897114178168,
"grad_norm": 0.2060546875,
"learning_rate": 5.094102885821832e-06,
"loss": 1.0593,
"step": 391
},
{
"epoch": 0.4918444165621079,
"grad_norm": 0.1923828125,
"learning_rate": 5.081555834378922e-06,
"loss": 1.0697,
"step": 392
},
{
"epoch": 0.493099121706399,
"grad_norm": 0.19921875,
"learning_rate": 5.06900878293601e-06,
"loss": 1.0755,
"step": 393
},
{
"epoch": 0.49435382685069007,
"grad_norm": 0.2041015625,
"learning_rate": 5.056461731493099e-06,
"loss": 1.0863,
"step": 394
},
{
"epoch": 0.49560853199498117,
"grad_norm": 0.201171875,
"learning_rate": 5.043914680050188e-06,
"loss": 1.075,
"step": 395
},
{
"epoch": 0.4968632371392723,
"grad_norm": 0.2119140625,
"learning_rate": 5.031367628607277e-06,
"loss": 1.0718,
"step": 396
},
{
"epoch": 0.4981179422835634,
"grad_norm": 0.2421875,
"learning_rate": 5.018820577164367e-06,
"loss": 1.0541,
"step": 397
},
{
"epoch": 0.4993726474278545,
"grad_norm": 0.2001953125,
"learning_rate": 5.0062735257214555e-06,
"loss": 1.11,
"step": 398
},
{
"epoch": 0.5006273525721455,
"grad_norm": 0.203125,
"learning_rate": 4.993726474278545e-06,
"loss": 1.0366,
"step": 399
},
{
"epoch": 0.5018820577164367,
"grad_norm": 0.263671875,
"learning_rate": 4.981179422835634e-06,
"loss": 1.0649,
"step": 400
},
{
"epoch": 0.5031367628607277,
"grad_norm": 0.1923828125,
"learning_rate": 4.968632371392723e-06,
"loss": 1.0277,
"step": 401
},
{
"epoch": 0.5043914680050188,
"grad_norm": 0.2177734375,
"learning_rate": 4.9560853199498124e-06,
"loss": 1.0862,
"step": 402
},
{
"epoch": 0.5056461731493099,
"grad_norm": 0.2109375,
"learning_rate": 4.9435382685069015e-06,
"loss": 1.0414,
"step": 403
},
{
"epoch": 0.506900878293601,
"grad_norm": 0.259765625,
"learning_rate": 4.9309912170639905e-06,
"loss": 1.033,
"step": 404
},
{
"epoch": 0.5081555834378921,
"grad_norm": 0.20703125,
"learning_rate": 4.9184441656210795e-06,
"loss": 1.0621,
"step": 405
},
{
"epoch": 0.5094102885821832,
"grad_norm": 0.21484375,
"learning_rate": 4.9058971141781685e-06,
"loss": 1.0961,
"step": 406
},
{
"epoch": 0.5106649937264742,
"grad_norm": 0.2373046875,
"learning_rate": 4.8933500627352576e-06,
"loss": 1.0633,
"step": 407
},
{
"epoch": 0.5119196988707654,
"grad_norm": 0.2001953125,
"learning_rate": 4.880803011292347e-06,
"loss": 1.0757,
"step": 408
},
{
"epoch": 0.5131744040150564,
"grad_norm": 0.255859375,
"learning_rate": 4.868255959849436e-06,
"loss": 1.0588,
"step": 409
},
{
"epoch": 0.5144291091593476,
"grad_norm": 0.2197265625,
"learning_rate": 4.8557089084065255e-06,
"loss": 1.0291,
"step": 410
},
{
"epoch": 0.5156838143036386,
"grad_norm": 0.302734375,
"learning_rate": 4.843161856963614e-06,
"loss": 1.0779,
"step": 411
},
{
"epoch": 0.5169385194479298,
"grad_norm": 0.205078125,
"learning_rate": 4.830614805520703e-06,
"loss": 1.0104,
"step": 412
},
{
"epoch": 0.5181932245922208,
"grad_norm": 0.2021484375,
"learning_rate": 4.818067754077792e-06,
"loss": 1.0218,
"step": 413
},
{
"epoch": 0.5194479297365119,
"grad_norm": 0.197265625,
"learning_rate": 4.805520702634881e-06,
"loss": 1.0131,
"step": 414
},
{
"epoch": 0.520702634880803,
"grad_norm": 0.197265625,
"learning_rate": 4.792973651191971e-06,
"loss": 1.0428,
"step": 415
},
{
"epoch": 0.5219573400250941,
"grad_norm": 0.296875,
"learning_rate": 4.78042659974906e-06,
"loss": 1.0789,
"step": 416
},
{
"epoch": 0.5232120451693852,
"grad_norm": 0.2080078125,
"learning_rate": 4.767879548306149e-06,
"loss": 1.0057,
"step": 417
},
{
"epoch": 0.5244667503136763,
"grad_norm": 0.259765625,
"learning_rate": 4.755332496863238e-06,
"loss": 1.0565,
"step": 418
},
{
"epoch": 0.5257214554579673,
"grad_norm": 0.2177734375,
"learning_rate": 4.742785445420327e-06,
"loss": 1.105,
"step": 419
},
{
"epoch": 0.5269761606022585,
"grad_norm": 0.3125,
"learning_rate": 4.730238393977416e-06,
"loss": 1.0279,
"step": 420
},
{
"epoch": 0.5282308657465495,
"grad_norm": 0.2041015625,
"learning_rate": 4.717691342534505e-06,
"loss": 1.0793,
"step": 421
},
{
"epoch": 0.5294855708908407,
"grad_norm": 0.2080078125,
"learning_rate": 4.705144291091594e-06,
"loss": 1.0478,
"step": 422
},
{
"epoch": 0.5307402760351317,
"grad_norm": 0.20703125,
"learning_rate": 4.692597239648683e-06,
"loss": 1.0389,
"step": 423
},
{
"epoch": 0.5319949811794228,
"grad_norm": 0.255859375,
"learning_rate": 4.680050188205772e-06,
"loss": 1.0813,
"step": 424
},
{
"epoch": 0.533249686323714,
"grad_norm": 0.2001953125,
"learning_rate": 4.667503136762861e-06,
"loss": 1.0383,
"step": 425
},
{
"epoch": 0.534504391468005,
"grad_norm": 0.2099609375,
"learning_rate": 4.654956085319951e-06,
"loss": 1.0483,
"step": 426
},
{
"epoch": 0.5357590966122961,
"grad_norm": 0.1962890625,
"learning_rate": 4.64240903387704e-06,
"loss": 1.06,
"step": 427
},
{
"epoch": 0.5370138017565872,
"grad_norm": 0.19921875,
"learning_rate": 4.629861982434129e-06,
"loss": 1.0391,
"step": 428
},
{
"epoch": 0.5382685069008782,
"grad_norm": 0.2060546875,
"learning_rate": 4.617314930991217e-06,
"loss": 1.0405,
"step": 429
},
{
"epoch": 0.5395232120451694,
"grad_norm": 0.2158203125,
"learning_rate": 4.604767879548306e-06,
"loss": 1.0204,
"step": 430
},
{
"epoch": 0.5407779171894604,
"grad_norm": 0.2060546875,
"learning_rate": 4.592220828105395e-06,
"loss": 1.0687,
"step": 431
},
{
"epoch": 0.5420326223337516,
"grad_norm": 0.193359375,
"learning_rate": 4.579673776662485e-06,
"loss": 1.0194,
"step": 432
},
{
"epoch": 0.5432873274780426,
"grad_norm": 0.2021484375,
"learning_rate": 4.567126725219574e-06,
"loss": 1.1113,
"step": 433
},
{
"epoch": 0.5445420326223338,
"grad_norm": 0.259765625,
"learning_rate": 4.554579673776663e-06,
"loss": 1.0448,
"step": 434
},
{
"epoch": 0.5457967377666249,
"grad_norm": 0.19921875,
"learning_rate": 4.542032622333752e-06,
"loss": 1.0532,
"step": 435
},
{
"epoch": 0.5470514429109159,
"grad_norm": 0.203125,
"learning_rate": 4.529485570890841e-06,
"loss": 1.0412,
"step": 436
},
{
"epoch": 0.548306148055207,
"grad_norm": 0.1953125,
"learning_rate": 4.51693851944793e-06,
"loss": 1.0333,
"step": 437
},
{
"epoch": 0.5495608531994981,
"grad_norm": 0.1923828125,
"learning_rate": 4.504391468005019e-06,
"loss": 1.0322,
"step": 438
},
{
"epoch": 0.5508155583437893,
"grad_norm": 0.1982421875,
"learning_rate": 4.491844416562108e-06,
"loss": 1.0335,
"step": 439
},
{
"epoch": 0.5520702634880803,
"grad_norm": 0.20703125,
"learning_rate": 4.479297365119197e-06,
"loss": 0.9846,
"step": 440
},
{
"epoch": 0.5533249686323714,
"grad_norm": 0.19921875,
"learning_rate": 4.466750313676286e-06,
"loss": 1.0333,
"step": 441
},
{
"epoch": 0.5545796737766625,
"grad_norm": 0.2099609375,
"learning_rate": 4.454203262233375e-06,
"loss": 0.9969,
"step": 442
},
{
"epoch": 0.5558343789209536,
"grad_norm": 0.2255859375,
"learning_rate": 4.441656210790465e-06,
"loss": 1.0541,
"step": 443
},
{
"epoch": 0.5570890840652447,
"grad_norm": 0.2158203125,
"learning_rate": 4.429109159347554e-06,
"loss": 1.0073,
"step": 444
},
{
"epoch": 0.5583437892095358,
"grad_norm": 0.1943359375,
"learning_rate": 4.416562107904643e-06,
"loss": 0.9851,
"step": 445
},
{
"epoch": 0.5595984943538268,
"grad_norm": 0.2001953125,
"learning_rate": 4.404015056461732e-06,
"loss": 1.0577,
"step": 446
},
{
"epoch": 0.560853199498118,
"grad_norm": 0.2236328125,
"learning_rate": 4.391468005018821e-06,
"loss": 1.0376,
"step": 447
},
{
"epoch": 0.562107904642409,
"grad_norm": 0.2294921875,
"learning_rate": 4.37892095357591e-06,
"loss": 1.0417,
"step": 448
},
{
"epoch": 0.5633626097867002,
"grad_norm": 0.21875,
"learning_rate": 4.366373902132999e-06,
"loss": 1.018,
"step": 449
},
{
"epoch": 0.5646173149309912,
"grad_norm": 0.208984375,
"learning_rate": 4.353826850690088e-06,
"loss": 1.0216,
"step": 450
},
{
"epoch": 0.5658720200752823,
"grad_norm": 0.197265625,
"learning_rate": 4.341279799247177e-06,
"loss": 1.0128,
"step": 451
},
{
"epoch": 0.5671267252195734,
"grad_norm": 0.1962890625,
"learning_rate": 4.328732747804266e-06,
"loss": 1.043,
"step": 452
},
{
"epoch": 0.5683814303638645,
"grad_norm": 0.20703125,
"learning_rate": 4.316185696361355e-06,
"loss": 1.0468,
"step": 453
},
{
"epoch": 0.5696361355081556,
"grad_norm": 0.20703125,
"learning_rate": 4.303638644918444e-06,
"loss": 1.1028,
"step": 454
},
{
"epoch": 0.5708908406524467,
"grad_norm": 0.2138671875,
"learning_rate": 4.291091593475533e-06,
"loss": 1.0701,
"step": 455
},
{
"epoch": 0.5721455457967378,
"grad_norm": 0.2099609375,
"learning_rate": 4.278544542032622e-06,
"loss": 1.0275,
"step": 456
},
{
"epoch": 0.5734002509410289,
"grad_norm": 0.203125,
"learning_rate": 4.2659974905897114e-06,
"loss": 1.0543,
"step": 457
},
{
"epoch": 0.5746549560853199,
"grad_norm": 0.205078125,
"learning_rate": 4.2534504391468005e-06,
"loss": 1.0299,
"step": 458
},
{
"epoch": 0.5759096612296111,
"grad_norm": 0.2060546875,
"learning_rate": 4.24090338770389e-06,
"loss": 1.0248,
"step": 459
},
{
"epoch": 0.5771643663739021,
"grad_norm": 0.19921875,
"learning_rate": 4.228356336260979e-06,
"loss": 1.084,
"step": 460
},
{
"epoch": 0.5784190715181933,
"grad_norm": 0.2177734375,
"learning_rate": 4.215809284818068e-06,
"loss": 1.0025,
"step": 461
},
{
"epoch": 0.5796737766624843,
"grad_norm": 0.2373046875,
"learning_rate": 4.203262233375157e-06,
"loss": 1.0101,
"step": 462
},
{
"epoch": 0.5809284818067754,
"grad_norm": 0.220703125,
"learning_rate": 4.1907151819322464e-06,
"loss": 1.1198,
"step": 463
},
{
"epoch": 0.5821831869510665,
"grad_norm": 0.2119140625,
"learning_rate": 4.1781681304893355e-06,
"loss": 1.0278,
"step": 464
},
{
"epoch": 0.5834378920953576,
"grad_norm": 0.19921875,
"learning_rate": 4.1656210790464245e-06,
"loss": 1.0495,
"step": 465
},
{
"epoch": 0.5846925972396487,
"grad_norm": 0.212890625,
"learning_rate": 4.1530740276035135e-06,
"loss": 1.0356,
"step": 466
},
{
"epoch": 0.5859473023839398,
"grad_norm": 0.2041015625,
"learning_rate": 4.1405269761606026e-06,
"loss": 1.0484,
"step": 467
},
{
"epoch": 0.5872020075282308,
"grad_norm": 0.2236328125,
"learning_rate": 4.127979924717692e-06,
"loss": 1.0519,
"step": 468
},
{
"epoch": 0.588456712672522,
"grad_norm": 0.203125,
"learning_rate": 4.115432873274781e-06,
"loss": 1.0387,
"step": 469
},
{
"epoch": 0.589711417816813,
"grad_norm": 0.21484375,
"learning_rate": 4.1028858218318705e-06,
"loss": 1.0847,
"step": 470
},
{
"epoch": 0.5909661229611042,
"grad_norm": 0.201171875,
"learning_rate": 4.0903387703889595e-06,
"loss": 1.0396,
"step": 471
},
{
"epoch": 0.5922208281053952,
"grad_norm": 0.2119140625,
"learning_rate": 4.077791718946048e-06,
"loss": 1.0742,
"step": 472
},
{
"epoch": 0.5934755332496863,
"grad_norm": 0.2470703125,
"learning_rate": 4.065244667503137e-06,
"loss": 1.0868,
"step": 473
},
{
"epoch": 0.5947302383939774,
"grad_norm": 0.267578125,
"learning_rate": 4.052697616060226e-06,
"loss": 1.0198,
"step": 474
},
{
"epoch": 0.5959849435382685,
"grad_norm": 0.2099609375,
"learning_rate": 4.040150564617315e-06,
"loss": 1.0183,
"step": 475
},
{
"epoch": 0.5972396486825596,
"grad_norm": 0.2236328125,
"learning_rate": 4.027603513174405e-06,
"loss": 0.9979,
"step": 476
},
{
"epoch": 0.5984943538268507,
"grad_norm": 0.2099609375,
"learning_rate": 4.015056461731494e-06,
"loss": 1.0757,
"step": 477
},
{
"epoch": 0.5997490589711418,
"grad_norm": 0.1962890625,
"learning_rate": 4.002509410288583e-06,
"loss": 1.0298,
"step": 478
},
{
"epoch": 0.6010037641154329,
"grad_norm": 0.1884765625,
"learning_rate": 3.989962358845672e-06,
"loss": 1.0244,
"step": 479
},
{
"epoch": 0.6022584692597239,
"grad_norm": 0.212890625,
"learning_rate": 3.977415307402761e-06,
"loss": 1.1217,
"step": 480
},
{
"epoch": 0.6035131744040151,
"grad_norm": 0.201171875,
"learning_rate": 3.96486825595985e-06,
"loss": 1.0211,
"step": 481
},
{
"epoch": 0.6047678795483061,
"grad_norm": 0.2021484375,
"learning_rate": 3.952321204516939e-06,
"loss": 1.0317,
"step": 482
},
{
"epoch": 0.6060225846925973,
"grad_norm": 0.193359375,
"learning_rate": 3.939774153074028e-06,
"loss": 1.0053,
"step": 483
},
{
"epoch": 0.6072772898368883,
"grad_norm": 0.203125,
"learning_rate": 3.927227101631117e-06,
"loss": 1.0749,
"step": 484
},
{
"epoch": 0.6085319949811794,
"grad_norm": 0.2099609375,
"learning_rate": 3.914680050188206e-06,
"loss": 1.0847,
"step": 485
},
{
"epoch": 0.6097867001254705,
"grad_norm": 0.30078125,
"learning_rate": 3.902132998745295e-06,
"loss": 1.0502,
"step": 486
},
{
"epoch": 0.6110414052697616,
"grad_norm": 0.1943359375,
"learning_rate": 3.889585947302385e-06,
"loss": 0.9839,
"step": 487
},
{
"epoch": 0.6122961104140527,
"grad_norm": 0.208984375,
"learning_rate": 3.877038895859474e-06,
"loss": 1.0372,
"step": 488
},
{
"epoch": 0.6135508155583438,
"grad_norm": 0.197265625,
"learning_rate": 3.864491844416563e-06,
"loss": 1.025,
"step": 489
},
{
"epoch": 0.6148055207026348,
"grad_norm": 0.1923828125,
"learning_rate": 3.851944792973651e-06,
"loss": 1.0403,
"step": 490
},
{
"epoch": 0.616060225846926,
"grad_norm": 0.208984375,
"learning_rate": 3.83939774153074e-06,
"loss": 0.9886,
"step": 491
},
{
"epoch": 0.617314930991217,
"grad_norm": 0.2001953125,
"learning_rate": 3.82685069008783e-06,
"loss": 1.0598,
"step": 492
},
{
"epoch": 0.6185696361355082,
"grad_norm": 0.2109375,
"learning_rate": 3.814303638644919e-06,
"loss": 0.9854,
"step": 493
},
{
"epoch": 0.6198243412797992,
"grad_norm": 0.1923828125,
"learning_rate": 3.801756587202008e-06,
"loss": 1.0404,
"step": 494
},
{
"epoch": 0.6210790464240903,
"grad_norm": 0.19921875,
"learning_rate": 3.789209535759097e-06,
"loss": 1.0512,
"step": 495
},
{
"epoch": 0.6223337515683814,
"grad_norm": 0.2265625,
"learning_rate": 3.776662484316186e-06,
"loss": 1.0205,
"step": 496
},
{
"epoch": 0.6235884567126725,
"grad_norm": 0.197265625,
"learning_rate": 3.764115432873275e-06,
"loss": 1.0123,
"step": 497
},
{
"epoch": 0.6248431618569636,
"grad_norm": 0.205078125,
"learning_rate": 3.7515683814303645e-06,
"loss": 1.0438,
"step": 498
},
{
"epoch": 0.6260978670012547,
"grad_norm": 0.2109375,
"learning_rate": 3.7390213299874535e-06,
"loss": 0.9792,
"step": 499
},
{
"epoch": 0.6273525721455459,
"grad_norm": 0.1982421875,
"learning_rate": 3.7264742785445425e-06,
"loss": 1.057,
"step": 500
},
{
"epoch": 0.6286072772898369,
"grad_norm": 0.2109375,
"learning_rate": 3.713927227101631e-06,
"loss": 1.0221,
"step": 501
},
{
"epoch": 0.6298619824341279,
"grad_norm": 0.21875,
"learning_rate": 3.70138017565872e-06,
"loss": 1.0133,
"step": 502
},
{
"epoch": 0.6311166875784191,
"grad_norm": 0.2021484375,
"learning_rate": 3.68883312421581e-06,
"loss": 1.0232,
"step": 503
},
{
"epoch": 0.6323713927227101,
"grad_norm": 0.2080078125,
"learning_rate": 3.6762860727728987e-06,
"loss": 1.0509,
"step": 504
},
{
"epoch": 0.6336260978670013,
"grad_norm": 0.2392578125,
"learning_rate": 3.6637390213299877e-06,
"loss": 0.9949,
"step": 505
},
{
"epoch": 0.6348808030112923,
"grad_norm": 0.203125,
"learning_rate": 3.6511919698870767e-06,
"loss": 1.0611,
"step": 506
},
{
"epoch": 0.6361355081555834,
"grad_norm": 0.2177734375,
"learning_rate": 3.6386449184441657e-06,
"loss": 1.0192,
"step": 507
},
{
"epoch": 0.6373902132998746,
"grad_norm": 0.2119140625,
"learning_rate": 3.6260978670012548e-06,
"loss": 1.0595,
"step": 508
},
{
"epoch": 0.6386449184441656,
"grad_norm": 0.203125,
"learning_rate": 3.6135508155583442e-06,
"loss": 1.004,
"step": 509
},
{
"epoch": 0.6398996235884568,
"grad_norm": 0.2109375,
"learning_rate": 3.6010037641154332e-06,
"loss": 1.0088,
"step": 510
},
{
"epoch": 0.6411543287327478,
"grad_norm": 0.2099609375,
"learning_rate": 3.5884567126725223e-06,
"loss": 1.0476,
"step": 511
},
{
"epoch": 0.6424090338770388,
"grad_norm": 0.2138671875,
"learning_rate": 3.5759096612296113e-06,
"loss": 1.0425,
"step": 512
},
{
"epoch": 0.64366373902133,
"grad_norm": 0.296875,
"learning_rate": 3.5633626097867003e-06,
"loss": 1.0592,
"step": 513
},
{
"epoch": 0.644918444165621,
"grad_norm": 0.22265625,
"learning_rate": 3.5508155583437898e-06,
"loss": 0.9867,
"step": 514
},
{
"epoch": 0.6461731493099122,
"grad_norm": 0.205078125,
"learning_rate": 3.538268506900879e-06,
"loss": 1.0498,
"step": 515
},
{
"epoch": 0.6474278544542033,
"grad_norm": 0.2119140625,
"learning_rate": 3.525721455457968e-06,
"loss": 1.0369,
"step": 516
},
{
"epoch": 0.6486825595984943,
"grad_norm": 0.22265625,
"learning_rate": 3.513174404015057e-06,
"loss": 1.0112,
"step": 517
},
{
"epoch": 0.6499372647427855,
"grad_norm": 0.21484375,
"learning_rate": 3.500627352572146e-06,
"loss": 1.0099,
"step": 518
},
{
"epoch": 0.6511919698870765,
"grad_norm": 0.2197265625,
"learning_rate": 3.4880803011292345e-06,
"loss": 1.0667,
"step": 519
},
{
"epoch": 0.6524466750313677,
"grad_norm": 0.2109375,
"learning_rate": 3.4755332496863244e-06,
"loss": 1.0835,
"step": 520
},
{
"epoch": 0.6537013801756587,
"grad_norm": 0.205078125,
"learning_rate": 3.4629861982434134e-06,
"loss": 1.0616,
"step": 521
},
{
"epoch": 0.6549560853199499,
"grad_norm": 0.212890625,
"learning_rate": 3.450439146800502e-06,
"loss": 1.0777,
"step": 522
},
{
"epoch": 0.6562107904642409,
"grad_norm": 0.20703125,
"learning_rate": 3.437892095357591e-06,
"loss": 1.0241,
"step": 523
},
{
"epoch": 0.657465495608532,
"grad_norm": 0.208984375,
"learning_rate": 3.42534504391468e-06,
"loss": 1.0476,
"step": 524
},
{
"epoch": 0.6587202007528231,
"grad_norm": 0.2353515625,
"learning_rate": 3.4127979924717695e-06,
"loss": 1.0474,
"step": 525
},
{
"epoch": 0.6599749058971142,
"grad_norm": 0.20703125,
"learning_rate": 3.4002509410288585e-06,
"loss": 1.0541,
"step": 526
},
{
"epoch": 0.6612296110414053,
"grad_norm": 0.2119140625,
"learning_rate": 3.3877038895859475e-06,
"loss": 1.0815,
"step": 527
},
{
"epoch": 0.6624843161856964,
"grad_norm": 0.2041015625,
"learning_rate": 3.3751568381430366e-06,
"loss": 1.0137,
"step": 528
},
{
"epoch": 0.6637390213299874,
"grad_norm": 0.2041015625,
"learning_rate": 3.3626097867001256e-06,
"loss": 1.0295,
"step": 529
},
{
"epoch": 0.6649937264742786,
"grad_norm": 0.2041015625,
"learning_rate": 3.3500627352572146e-06,
"loss": 1.0171,
"step": 530
},
{
"epoch": 0.6662484316185696,
"grad_norm": 0.220703125,
"learning_rate": 3.337515683814304e-06,
"loss": 1.0739,
"step": 531
},
{
"epoch": 0.6675031367628608,
"grad_norm": 0.345703125,
"learning_rate": 3.324968632371393e-06,
"loss": 1.0432,
"step": 532
},
{
"epoch": 0.6687578419071518,
"grad_norm": 0.232421875,
"learning_rate": 3.312421580928482e-06,
"loss": 1.0359,
"step": 533
},
{
"epoch": 0.6700125470514429,
"grad_norm": 0.2080078125,
"learning_rate": 3.299874529485571e-06,
"loss": 1.0066,
"step": 534
},
{
"epoch": 0.671267252195734,
"grad_norm": 0.2197265625,
"learning_rate": 3.28732747804266e-06,
"loss": 1.0246,
"step": 535
},
{
"epoch": 0.6725219573400251,
"grad_norm": 0.244140625,
"learning_rate": 3.2747804265997496e-06,
"loss": 1.0027,
"step": 536
},
{
"epoch": 0.6737766624843162,
"grad_norm": 0.203125,
"learning_rate": 3.2622333751568387e-06,
"loss": 1.0538,
"step": 537
},
{
"epoch": 0.6750313676286073,
"grad_norm": 0.2109375,
"learning_rate": 3.2496863237139277e-06,
"loss": 1.0506,
"step": 538
},
{
"epoch": 0.6762860727728983,
"grad_norm": 0.2099609375,
"learning_rate": 3.2371392722710167e-06,
"loss": 0.9803,
"step": 539
},
{
"epoch": 0.6775407779171895,
"grad_norm": 0.2041015625,
"learning_rate": 3.2245922208281057e-06,
"loss": 1.0177,
"step": 540
},
{
"epoch": 0.6787954830614805,
"grad_norm": 0.2021484375,
"learning_rate": 3.2120451693851943e-06,
"loss": 1.0454,
"step": 541
},
{
"epoch": 0.6800501882057717,
"grad_norm": 0.21875,
"learning_rate": 3.1994981179422842e-06,
"loss": 1.0216,
"step": 542
},
{
"epoch": 0.6813048933500627,
"grad_norm": 0.208984375,
"learning_rate": 3.186951066499373e-06,
"loss": 1.0246,
"step": 543
},
{
"epoch": 0.6825595984943539,
"grad_norm": 0.21484375,
"learning_rate": 3.174404015056462e-06,
"loss": 0.9709,
"step": 544
},
{
"epoch": 0.6838143036386449,
"grad_norm": 0.1953125,
"learning_rate": 3.161856963613551e-06,
"loss": 0.9927,
"step": 545
},
{
"epoch": 0.685069008782936,
"grad_norm": 0.208984375,
"learning_rate": 3.14930991217064e-06,
"loss": 1.0039,
"step": 546
},
{
"epoch": 0.6863237139272271,
"grad_norm": 0.2080078125,
"learning_rate": 3.1367628607277293e-06,
"loss": 1.0182,
"step": 547
},
{
"epoch": 0.6875784190715182,
"grad_norm": 0.2041015625,
"learning_rate": 3.1242158092848184e-06,
"loss": 1.0093,
"step": 548
},
{
"epoch": 0.6888331242158093,
"grad_norm": 0.220703125,
"learning_rate": 3.1116687578419074e-06,
"loss": 1.037,
"step": 549
},
{
"epoch": 0.6900878293601004,
"grad_norm": 0.201171875,
"learning_rate": 3.0991217063989964e-06,
"loss": 0.9981,
"step": 550
},
{
"epoch": 0.6913425345043914,
"grad_norm": 0.232421875,
"learning_rate": 3.0865746549560855e-06,
"loss": 1.0494,
"step": 551
},
{
"epoch": 0.6925972396486826,
"grad_norm": 0.2119140625,
"learning_rate": 3.0740276035131745e-06,
"loss": 1.028,
"step": 552
},
{
"epoch": 0.6938519447929736,
"grad_norm": 0.2109375,
"learning_rate": 3.061480552070264e-06,
"loss": 1.0407,
"step": 553
},
{
"epoch": 0.6951066499372648,
"grad_norm": 0.201171875,
"learning_rate": 3.048933500627353e-06,
"loss": 1.0021,
"step": 554
},
{
"epoch": 0.6963613550815558,
"grad_norm": 0.203125,
"learning_rate": 3.036386449184442e-06,
"loss": 1.042,
"step": 555
},
{
"epoch": 0.6976160602258469,
"grad_norm": 0.2490234375,
"learning_rate": 3.023839397741531e-06,
"loss": 1.0004,
"step": 556
},
{
"epoch": 0.698870765370138,
"grad_norm": 0.2119140625,
"learning_rate": 3.01129234629862e-06,
"loss": 1.0578,
"step": 557
},
{
"epoch": 0.7001254705144291,
"grad_norm": 0.2109375,
"learning_rate": 2.9987452948557095e-06,
"loss": 1.0015,
"step": 558
},
{
"epoch": 0.7013801756587202,
"grad_norm": 0.203125,
"learning_rate": 2.9861982434127985e-06,
"loss": 0.9725,
"step": 559
},
{
"epoch": 0.7026348808030113,
"grad_norm": 0.203125,
"learning_rate": 2.9736511919698875e-06,
"loss": 1.0287,
"step": 560
},
{
"epoch": 0.7038895859473023,
"grad_norm": 0.212890625,
"learning_rate": 2.9611041405269766e-06,
"loss": 0.9984,
"step": 561
},
{
"epoch": 0.7051442910915935,
"grad_norm": 0.2353515625,
"learning_rate": 2.948557089084065e-06,
"loss": 1.0141,
"step": 562
},
{
"epoch": 0.7063989962358845,
"grad_norm": 0.1962890625,
"learning_rate": 2.936010037641154e-06,
"loss": 0.9633,
"step": 563
},
{
"epoch": 0.7076537013801757,
"grad_norm": 0.205078125,
"learning_rate": 2.923462986198244e-06,
"loss": 1.0358,
"step": 564
},
{
"epoch": 0.7089084065244667,
"grad_norm": 0.22265625,
"learning_rate": 2.9109159347553327e-06,
"loss": 1.052,
"step": 565
},
{
"epoch": 0.7101631116687579,
"grad_norm": 0.20703125,
"learning_rate": 2.8983688833124217e-06,
"loss": 1.0014,
"step": 566
},
{
"epoch": 0.7114178168130489,
"grad_norm": 0.228515625,
"learning_rate": 2.8858218318695107e-06,
"loss": 1.0039,
"step": 567
},
{
"epoch": 0.71267252195734,
"grad_norm": 0.21875,
"learning_rate": 2.8732747804265998e-06,
"loss": 1.0154,
"step": 568
},
{
"epoch": 0.7139272271016311,
"grad_norm": 0.2578125,
"learning_rate": 2.860727728983689e-06,
"loss": 1.0154,
"step": 569
},
{
"epoch": 0.7151819322459222,
"grad_norm": 0.2001953125,
"learning_rate": 2.8481806775407782e-06,
"loss": 0.993,
"step": 570
},
{
"epoch": 0.7164366373902133,
"grad_norm": 0.2158203125,
"learning_rate": 2.8356336260978673e-06,
"loss": 1.0333,
"step": 571
},
{
"epoch": 0.7176913425345044,
"grad_norm": 0.2119140625,
"learning_rate": 2.8230865746549563e-06,
"loss": 1.0426,
"step": 572
},
{
"epoch": 0.7189460476787954,
"grad_norm": 0.2177734375,
"learning_rate": 2.8105395232120453e-06,
"loss": 1.0889,
"step": 573
},
{
"epoch": 0.7202007528230866,
"grad_norm": 0.2099609375,
"learning_rate": 2.7979924717691343e-06,
"loss": 1.0128,
"step": 574
},
{
"epoch": 0.7214554579673776,
"grad_norm": 0.21484375,
"learning_rate": 2.7854454203262238e-06,
"loss": 1.0716,
"step": 575
},
{
"epoch": 0.7227101631116688,
"grad_norm": 0.2021484375,
"learning_rate": 2.772898368883313e-06,
"loss": 1.0305,
"step": 576
},
{
"epoch": 0.7239648682559598,
"grad_norm": 0.2021484375,
"learning_rate": 2.760351317440402e-06,
"loss": 0.9754,
"step": 577
},
{
"epoch": 0.7252195734002509,
"grad_norm": 0.2158203125,
"learning_rate": 2.747804265997491e-06,
"loss": 1.0193,
"step": 578
},
{
"epoch": 0.726474278544542,
"grad_norm": 0.224609375,
"learning_rate": 2.73525721455458e-06,
"loss": 1.0029,
"step": 579
},
{
"epoch": 0.7277289836888331,
"grad_norm": 0.212890625,
"learning_rate": 2.7227101631116693e-06,
"loss": 1.0143,
"step": 580
},
{
"epoch": 0.7289836888331243,
"grad_norm": 0.21484375,
"learning_rate": 2.7101631116687584e-06,
"loss": 1.1078,
"step": 581
},
{
"epoch": 0.7302383939774153,
"grad_norm": 0.220703125,
"learning_rate": 2.6976160602258474e-06,
"loss": 0.995,
"step": 582
},
{
"epoch": 0.7314930991217063,
"grad_norm": 0.2138671875,
"learning_rate": 2.685069008782936e-06,
"loss": 1.0182,
"step": 583
},
{
"epoch": 0.7327478042659975,
"grad_norm": 0.228515625,
"learning_rate": 2.672521957340025e-06,
"loss": 1.0709,
"step": 584
},
{
"epoch": 0.7340025094102886,
"grad_norm": 0.20703125,
"learning_rate": 2.659974905897114e-06,
"loss": 0.9936,
"step": 585
},
{
"epoch": 0.7352572145545797,
"grad_norm": 0.2197265625,
"learning_rate": 2.6474278544542035e-06,
"loss": 1.0406,
"step": 586
},
{
"epoch": 0.7365119196988708,
"grad_norm": 0.2138671875,
"learning_rate": 2.6348808030112925e-06,
"loss": 1.0011,
"step": 587
},
{
"epoch": 0.7377666248431619,
"grad_norm": 0.2138671875,
"learning_rate": 2.6223337515683816e-06,
"loss": 1.0088,
"step": 588
},
{
"epoch": 0.739021329987453,
"grad_norm": 0.1982421875,
"learning_rate": 2.6097867001254706e-06,
"loss": 1.033,
"step": 589
},
{
"epoch": 0.740276035131744,
"grad_norm": 0.2099609375,
"learning_rate": 2.5972396486825596e-06,
"loss": 0.9935,
"step": 590
},
{
"epoch": 0.7415307402760352,
"grad_norm": 0.2060546875,
"learning_rate": 2.584692597239649e-06,
"loss": 1.025,
"step": 591
},
{
"epoch": 0.7427854454203262,
"grad_norm": 0.216796875,
"learning_rate": 2.572145545796738e-06,
"loss": 1.042,
"step": 592
},
{
"epoch": 0.7440401505646174,
"grad_norm": 0.197265625,
"learning_rate": 2.559598494353827e-06,
"loss": 0.9731,
"step": 593
},
{
"epoch": 0.7452948557089084,
"grad_norm": 0.205078125,
"learning_rate": 2.547051442910916e-06,
"loss": 0.9729,
"step": 594
},
{
"epoch": 0.7465495608531995,
"grad_norm": 0.205078125,
"learning_rate": 2.534504391468005e-06,
"loss": 1.03,
"step": 595
},
{
"epoch": 0.7478042659974906,
"grad_norm": 0.208984375,
"learning_rate": 2.521957340025094e-06,
"loss": 1.0294,
"step": 596
},
{
"epoch": 0.7490589711417817,
"grad_norm": 0.275390625,
"learning_rate": 2.5094102885821836e-06,
"loss": 1.076,
"step": 597
},
{
"epoch": 0.7503136762860728,
"grad_norm": 0.263671875,
"learning_rate": 2.4968632371392727e-06,
"loss": 1.0643,
"step": 598
},
{
"epoch": 0.7515683814303639,
"grad_norm": 0.2392578125,
"learning_rate": 2.4843161856963617e-06,
"loss": 1.0265,
"step": 599
},
{
"epoch": 0.7528230865746549,
"grad_norm": 0.2373046875,
"learning_rate": 2.4717691342534507e-06,
"loss": 1.0221,
"step": 600
},
{
"epoch": 0.7540777917189461,
"grad_norm": 0.201171875,
"learning_rate": 2.4592220828105398e-06,
"loss": 1.0333,
"step": 601
},
{
"epoch": 0.7553324968632371,
"grad_norm": 0.220703125,
"learning_rate": 2.4466750313676288e-06,
"loss": 0.9877,
"step": 602
},
{
"epoch": 0.7565872020075283,
"grad_norm": 0.275390625,
"learning_rate": 2.434127979924718e-06,
"loss": 1.0679,
"step": 603
},
{
"epoch": 0.7578419071518193,
"grad_norm": 0.216796875,
"learning_rate": 2.421580928481807e-06,
"loss": 1.066,
"step": 604
},
{
"epoch": 0.7590966122961104,
"grad_norm": 0.25390625,
"learning_rate": 2.409033877038896e-06,
"loss": 1.0166,
"step": 605
},
{
"epoch": 0.7603513174404015,
"grad_norm": 0.2080078125,
"learning_rate": 2.3964868255959853e-06,
"loss": 1.0421,
"step": 606
},
{
"epoch": 0.7616060225846926,
"grad_norm": 0.203125,
"learning_rate": 2.3839397741530743e-06,
"loss": 1.0617,
"step": 607
},
{
"epoch": 0.7628607277289837,
"grad_norm": 0.23828125,
"learning_rate": 2.3713927227101634e-06,
"loss": 0.9719,
"step": 608
},
{
"epoch": 0.7641154328732748,
"grad_norm": 0.25,
"learning_rate": 2.3588456712672524e-06,
"loss": 1.0267,
"step": 609
},
{
"epoch": 0.7653701380175659,
"grad_norm": 0.25,
"learning_rate": 2.3462986198243414e-06,
"loss": 1.0067,
"step": 610
},
{
"epoch": 0.766624843161857,
"grad_norm": 0.21875,
"learning_rate": 2.3337515683814304e-06,
"loss": 1.0346,
"step": 611
},
{
"epoch": 0.767879548306148,
"grad_norm": 0.232421875,
"learning_rate": 2.32120451693852e-06,
"loss": 0.9765,
"step": 612
},
{
"epoch": 0.7691342534504392,
"grad_norm": 0.2001953125,
"learning_rate": 2.3086574654956085e-06,
"loss": 1.0025,
"step": 613
},
{
"epoch": 0.7703889585947302,
"grad_norm": 0.2119140625,
"learning_rate": 2.2961104140526975e-06,
"loss": 1.0229,
"step": 614
},
{
"epoch": 0.7716436637390214,
"grad_norm": 0.1982421875,
"learning_rate": 2.283563362609787e-06,
"loss": 0.9822,
"step": 615
},
{
"epoch": 0.7728983688833124,
"grad_norm": 0.25,
"learning_rate": 2.271016311166876e-06,
"loss": 1.077,
"step": 616
},
{
"epoch": 0.7741530740276035,
"grad_norm": 0.20703125,
"learning_rate": 2.258469259723965e-06,
"loss": 1.044,
"step": 617
},
{
"epoch": 0.7754077791718946,
"grad_norm": 0.201171875,
"learning_rate": 2.245922208281054e-06,
"loss": 1.0348,
"step": 618
},
{
"epoch": 0.7766624843161857,
"grad_norm": 0.2158203125,
"learning_rate": 2.233375156838143e-06,
"loss": 0.9906,
"step": 619
},
{
"epoch": 0.7779171894604768,
"grad_norm": 0.1962890625,
"learning_rate": 2.2208281053952325e-06,
"loss": 1.0023,
"step": 620
},
{
"epoch": 0.7791718946047679,
"grad_norm": 0.2060546875,
"learning_rate": 2.2082810539523216e-06,
"loss": 1.0452,
"step": 621
},
{
"epoch": 0.7804265997490589,
"grad_norm": 0.2080078125,
"learning_rate": 2.1957340025094106e-06,
"loss": 1.0391,
"step": 622
},
{
"epoch": 0.7816813048933501,
"grad_norm": 0.197265625,
"learning_rate": 2.1831869510664996e-06,
"loss": 1.0097,
"step": 623
},
{
"epoch": 0.7829360100376411,
"grad_norm": 0.2197265625,
"learning_rate": 2.1706398996235886e-06,
"loss": 1.0256,
"step": 624
},
{
"epoch": 0.7841907151819323,
"grad_norm": 0.251953125,
"learning_rate": 2.1580928481806777e-06,
"loss": 1.0621,
"step": 625
},
{
"epoch": 0.7854454203262233,
"grad_norm": 0.2099609375,
"learning_rate": 2.1455457967377667e-06,
"loss": 1.0201,
"step": 626
},
{
"epoch": 0.7867001254705144,
"grad_norm": 0.203125,
"learning_rate": 2.1329987452948557e-06,
"loss": 1.0165,
"step": 627
},
{
"epoch": 0.7879548306148055,
"grad_norm": 0.208984375,
"learning_rate": 2.120451693851945e-06,
"loss": 1.0409,
"step": 628
},
{
"epoch": 0.7892095357590966,
"grad_norm": 0.203125,
"learning_rate": 2.107904642409034e-06,
"loss": 1.0204,
"step": 629
},
{
"epoch": 0.7904642409033877,
"grad_norm": 0.2109375,
"learning_rate": 2.0953575909661232e-06,
"loss": 1.0584,
"step": 630
},
{
"epoch": 0.7917189460476788,
"grad_norm": 0.2158203125,
"learning_rate": 2.0828105395232122e-06,
"loss": 1.0017,
"step": 631
},
{
"epoch": 0.7929736511919699,
"grad_norm": 0.21484375,
"learning_rate": 2.0702634880803013e-06,
"loss": 1.016,
"step": 632
},
{
"epoch": 0.794228356336261,
"grad_norm": 0.2158203125,
"learning_rate": 2.0577164366373903e-06,
"loss": 1.0895,
"step": 633
},
{
"epoch": 0.795483061480552,
"grad_norm": 0.2314453125,
"learning_rate": 2.0451693851944798e-06,
"loss": 1.0615,
"step": 634
},
{
"epoch": 0.7967377666248432,
"grad_norm": 0.2158203125,
"learning_rate": 2.0326223337515684e-06,
"loss": 1.0618,
"step": 635
},
{
"epoch": 0.7979924717691342,
"grad_norm": 0.2109375,
"learning_rate": 2.0200752823086574e-06,
"loss": 1.0634,
"step": 636
},
{
"epoch": 0.7992471769134254,
"grad_norm": 0.2236328125,
"learning_rate": 2.007528230865747e-06,
"loss": 1.0281,
"step": 637
},
{
"epoch": 0.8005018820577164,
"grad_norm": 0.2158203125,
"learning_rate": 1.994981179422836e-06,
"loss": 0.971,
"step": 638
},
{
"epoch": 0.8017565872020075,
"grad_norm": 0.2080078125,
"learning_rate": 1.982434127979925e-06,
"loss": 1.0409,
"step": 639
},
{
"epoch": 0.8030112923462986,
"grad_norm": 0.216796875,
"learning_rate": 1.969887076537014e-06,
"loss": 1.0139,
"step": 640
},
{
"epoch": 0.8042659974905897,
"grad_norm": 0.2001953125,
"learning_rate": 1.957340025094103e-06,
"loss": 1.0208,
"step": 641
},
{
"epoch": 0.8055207026348808,
"grad_norm": 0.2099609375,
"learning_rate": 1.9447929736511924e-06,
"loss": 1.0015,
"step": 642
},
{
"epoch": 0.8067754077791719,
"grad_norm": 0.2021484375,
"learning_rate": 1.9322459222082814e-06,
"loss": 0.989,
"step": 643
},
{
"epoch": 0.8080301129234629,
"grad_norm": 0.2060546875,
"learning_rate": 1.91969887076537e-06,
"loss": 1.0381,
"step": 644
},
{
"epoch": 0.8092848180677541,
"grad_norm": 0.228515625,
"learning_rate": 1.9071518193224595e-06,
"loss": 1.0115,
"step": 645
},
{
"epoch": 0.8105395232120451,
"grad_norm": 0.2041015625,
"learning_rate": 1.8946047678795485e-06,
"loss": 1.0244,
"step": 646
},
{
"epoch": 0.8117942283563363,
"grad_norm": 0.2099609375,
"learning_rate": 1.8820577164366375e-06,
"loss": 0.9951,
"step": 647
},
{
"epoch": 0.8130489335006273,
"grad_norm": 0.2216796875,
"learning_rate": 1.8695106649937268e-06,
"loss": 1.0914,
"step": 648
},
{
"epoch": 0.8143036386449184,
"grad_norm": 0.208984375,
"learning_rate": 1.8569636135508156e-06,
"loss": 1.0711,
"step": 649
},
{
"epoch": 0.8155583437892095,
"grad_norm": 0.21875,
"learning_rate": 1.844416562107905e-06,
"loss": 1.0151,
"step": 650
},
{
"epoch": 0.8168130489335006,
"grad_norm": 0.2060546875,
"learning_rate": 1.8318695106649938e-06,
"loss": 0.9684,
"step": 651
},
{
"epoch": 0.8180677540777918,
"grad_norm": 0.361328125,
"learning_rate": 1.8193224592220829e-06,
"loss": 0.9752,
"step": 652
},
{
"epoch": 0.8193224592220828,
"grad_norm": 0.20703125,
"learning_rate": 1.8067754077791721e-06,
"loss": 0.9948,
"step": 653
},
{
"epoch": 0.820577164366374,
"grad_norm": 0.240234375,
"learning_rate": 1.7942283563362611e-06,
"loss": 0.9991,
"step": 654
},
{
"epoch": 0.821831869510665,
"grad_norm": 0.2080078125,
"learning_rate": 1.7816813048933502e-06,
"loss": 0.9931,
"step": 655
},
{
"epoch": 0.823086574654956,
"grad_norm": 0.216796875,
"learning_rate": 1.7691342534504394e-06,
"loss": 1.0357,
"step": 656
},
{
"epoch": 0.8243412797992472,
"grad_norm": 0.2294921875,
"learning_rate": 1.7565872020075284e-06,
"loss": 1.0357,
"step": 657
},
{
"epoch": 0.8255959849435383,
"grad_norm": 0.21484375,
"learning_rate": 1.7440401505646172e-06,
"loss": 1.0378,
"step": 658
},
{
"epoch": 0.8268506900878294,
"grad_norm": 0.197265625,
"learning_rate": 1.7314930991217067e-06,
"loss": 1.0116,
"step": 659
},
{
"epoch": 0.8281053952321205,
"grad_norm": 0.2158203125,
"learning_rate": 1.7189460476787955e-06,
"loss": 0.9911,
"step": 660
},
{
"epoch": 0.8293601003764115,
"grad_norm": 0.1943359375,
"learning_rate": 1.7063989962358847e-06,
"loss": 0.978,
"step": 661
},
{
"epoch": 0.8306148055207027,
"grad_norm": 0.2158203125,
"learning_rate": 1.6938519447929738e-06,
"loss": 1.0247,
"step": 662
},
{
"epoch": 0.8318695106649937,
"grad_norm": 0.2041015625,
"learning_rate": 1.6813048933500628e-06,
"loss": 1.0207,
"step": 663
},
{
"epoch": 0.8331242158092849,
"grad_norm": 0.1953125,
"learning_rate": 1.668757841907152e-06,
"loss": 0.9531,
"step": 664
},
{
"epoch": 0.8343789209535759,
"grad_norm": 0.2353515625,
"learning_rate": 1.656210790464241e-06,
"loss": 0.9747,
"step": 665
},
{
"epoch": 0.835633626097867,
"grad_norm": 0.203125,
"learning_rate": 1.64366373902133e-06,
"loss": 1.0085,
"step": 666
},
{
"epoch": 0.8368883312421581,
"grad_norm": 0.216796875,
"learning_rate": 1.6311166875784193e-06,
"loss": 0.979,
"step": 667
},
{
"epoch": 0.8381430363864492,
"grad_norm": 0.2392578125,
"learning_rate": 1.6185696361355084e-06,
"loss": 1.0111,
"step": 668
},
{
"epoch": 0.8393977415307403,
"grad_norm": 0.22265625,
"learning_rate": 1.6060225846925972e-06,
"loss": 1.0272,
"step": 669
},
{
"epoch": 0.8406524466750314,
"grad_norm": 0.2138671875,
"learning_rate": 1.5934755332496864e-06,
"loss": 1.028,
"step": 670
},
{
"epoch": 0.8419071518193224,
"grad_norm": 0.2138671875,
"learning_rate": 1.5809284818067754e-06,
"loss": 1.066,
"step": 671
},
{
"epoch": 0.8431618569636136,
"grad_norm": 0.203125,
"learning_rate": 1.5683814303638647e-06,
"loss": 1.0011,
"step": 672
},
{
"epoch": 0.8444165621079046,
"grad_norm": 0.2197265625,
"learning_rate": 1.5558343789209537e-06,
"loss": 1.0091,
"step": 673
},
{
"epoch": 0.8456712672521958,
"grad_norm": 0.2080078125,
"learning_rate": 1.5432873274780427e-06,
"loss": 0.9805,
"step": 674
},
{
"epoch": 0.8469259723964868,
"grad_norm": 0.2177734375,
"learning_rate": 1.530740276035132e-06,
"loss": 1.05,
"step": 675
},
{
"epoch": 0.848180677540778,
"grad_norm": 0.2099609375,
"learning_rate": 1.518193224592221e-06,
"loss": 1.0343,
"step": 676
},
{
"epoch": 0.849435382685069,
"grad_norm": 0.205078125,
"learning_rate": 1.50564617314931e-06,
"loss": 1.058,
"step": 677
},
{
"epoch": 0.8506900878293601,
"grad_norm": 0.25390625,
"learning_rate": 1.4930991217063993e-06,
"loss": 1.0374,
"step": 678
},
{
"epoch": 0.8519447929736512,
"grad_norm": 0.2216796875,
"learning_rate": 1.4805520702634883e-06,
"loss": 1.0525,
"step": 679
},
{
"epoch": 0.8531994981179423,
"grad_norm": 0.22265625,
"learning_rate": 1.468005018820577e-06,
"loss": 0.9897,
"step": 680
},
{
"epoch": 0.8544542032622334,
"grad_norm": 0.201171875,
"learning_rate": 1.4554579673776663e-06,
"loss": 0.9919,
"step": 681
},
{
"epoch": 0.8557089084065245,
"grad_norm": 0.244140625,
"learning_rate": 1.4429109159347554e-06,
"loss": 0.9903,
"step": 682
},
{
"epoch": 0.8569636135508155,
"grad_norm": 0.2119140625,
"learning_rate": 1.4303638644918446e-06,
"loss": 1.0151,
"step": 683
},
{
"epoch": 0.8582183186951067,
"grad_norm": 0.240234375,
"learning_rate": 1.4178168130489336e-06,
"loss": 1.0299,
"step": 684
},
{
"epoch": 0.8594730238393977,
"grad_norm": 0.2177734375,
"learning_rate": 1.4052697616060227e-06,
"loss": 1.0317,
"step": 685
},
{
"epoch": 0.8607277289836889,
"grad_norm": 0.2138671875,
"learning_rate": 1.3927227101631119e-06,
"loss": 1.0427,
"step": 686
},
{
"epoch": 0.8619824341279799,
"grad_norm": 0.201171875,
"learning_rate": 1.380175658720201e-06,
"loss": 1.0358,
"step": 687
},
{
"epoch": 0.863237139272271,
"grad_norm": 0.203125,
"learning_rate": 1.36762860727729e-06,
"loss": 1.0435,
"step": 688
},
{
"epoch": 0.8644918444165621,
"grad_norm": 0.21484375,
"learning_rate": 1.3550815558343792e-06,
"loss": 1.0404,
"step": 689
},
{
"epoch": 0.8657465495608532,
"grad_norm": 0.208984375,
"learning_rate": 1.342534504391468e-06,
"loss": 1.0497,
"step": 690
},
{
"epoch": 0.8670012547051443,
"grad_norm": 0.20703125,
"learning_rate": 1.329987452948557e-06,
"loss": 0.9816,
"step": 691
},
{
"epoch": 0.8682559598494354,
"grad_norm": 0.2255859375,
"learning_rate": 1.3174404015056463e-06,
"loss": 0.9882,
"step": 692
},
{
"epoch": 0.8695106649937264,
"grad_norm": 0.2021484375,
"learning_rate": 1.3048933500627353e-06,
"loss": 1.0454,
"step": 693
},
{
"epoch": 0.8707653701380176,
"grad_norm": 0.2080078125,
"learning_rate": 1.2923462986198245e-06,
"loss": 1.0426,
"step": 694
},
{
"epoch": 0.8720200752823086,
"grad_norm": 0.2099609375,
"learning_rate": 1.2797992471769136e-06,
"loss": 1.0392,
"step": 695
},
{
"epoch": 0.8732747804265998,
"grad_norm": 0.19921875,
"learning_rate": 1.2672521957340026e-06,
"loss": 1.0051,
"step": 696
},
{
"epoch": 0.8745294855708908,
"grad_norm": 0.2021484375,
"learning_rate": 1.2547051442910918e-06,
"loss": 1.0095,
"step": 697
},
{
"epoch": 0.875784190715182,
"grad_norm": 0.2109375,
"learning_rate": 1.2421580928481808e-06,
"loss": 1.0611,
"step": 698
},
{
"epoch": 0.877038895859473,
"grad_norm": 0.2216796875,
"learning_rate": 1.2296110414052699e-06,
"loss": 1.0009,
"step": 699
},
{
"epoch": 0.8782936010037641,
"grad_norm": 0.212890625,
"learning_rate": 1.217063989962359e-06,
"loss": 1.0206,
"step": 700
},
{
"epoch": 0.8795483061480552,
"grad_norm": 0.2119140625,
"learning_rate": 1.204516938519448e-06,
"loss": 1.0475,
"step": 701
},
{
"epoch": 0.8808030112923463,
"grad_norm": 0.201171875,
"learning_rate": 1.1919698870765372e-06,
"loss": 0.9919,
"step": 702
},
{
"epoch": 0.8820577164366374,
"grad_norm": 0.259765625,
"learning_rate": 1.1794228356336262e-06,
"loss": 1.0449,
"step": 703
},
{
"epoch": 0.8833124215809285,
"grad_norm": 0.21875,
"learning_rate": 1.1668757841907152e-06,
"loss": 1.0439,
"step": 704
},
{
"epoch": 0.8845671267252195,
"grad_norm": 0.201171875,
"learning_rate": 1.1543287327478042e-06,
"loss": 1.0178,
"step": 705
},
{
"epoch": 0.8858218318695107,
"grad_norm": 0.216796875,
"learning_rate": 1.1417816813048935e-06,
"loss": 1.0516,
"step": 706
},
{
"epoch": 0.8870765370138017,
"grad_norm": 0.220703125,
"learning_rate": 1.1292346298619825e-06,
"loss": 1.1199,
"step": 707
},
{
"epoch": 0.8883312421580929,
"grad_norm": 0.21875,
"learning_rate": 1.1166875784190715e-06,
"loss": 1.0612,
"step": 708
},
{
"epoch": 0.8895859473023839,
"grad_norm": 0.208984375,
"learning_rate": 1.1041405269761608e-06,
"loss": 1.0453,
"step": 709
},
{
"epoch": 0.890840652446675,
"grad_norm": 0.203125,
"learning_rate": 1.0915934755332498e-06,
"loss": 0.9853,
"step": 710
},
{
"epoch": 0.8920953575909661,
"grad_norm": 0.203125,
"learning_rate": 1.0790464240903388e-06,
"loss": 1.0574,
"step": 711
},
{
"epoch": 0.8933500627352572,
"grad_norm": 0.24609375,
"learning_rate": 1.0664993726474279e-06,
"loss": 1.0094,
"step": 712
},
{
"epoch": 0.8946047678795483,
"grad_norm": 0.2060546875,
"learning_rate": 1.053952321204517e-06,
"loss": 1.0675,
"step": 713
},
{
"epoch": 0.8958594730238394,
"grad_norm": 0.2294921875,
"learning_rate": 1.0414052697616061e-06,
"loss": 1.0285,
"step": 714
},
{
"epoch": 0.8971141781681304,
"grad_norm": 0.1982421875,
"learning_rate": 1.0288582183186952e-06,
"loss": 1.065,
"step": 715
},
{
"epoch": 0.8983688833124216,
"grad_norm": 0.201171875,
"learning_rate": 1.0163111668757842e-06,
"loss": 1.0149,
"step": 716
},
{
"epoch": 0.8996235884567126,
"grad_norm": 0.2109375,
"learning_rate": 1.0037641154328734e-06,
"loss": 1.0481,
"step": 717
},
{
"epoch": 0.9008782936010038,
"grad_norm": 0.21484375,
"learning_rate": 9.912170639899624e-07,
"loss": 1.0658,
"step": 718
},
{
"epoch": 0.9021329987452948,
"grad_norm": 0.2119140625,
"learning_rate": 9.786700125470515e-07,
"loss": 1.0236,
"step": 719
},
{
"epoch": 0.903387703889586,
"grad_norm": 0.205078125,
"learning_rate": 9.661229611041407e-07,
"loss": 1.0063,
"step": 720
},
{
"epoch": 0.904642409033877,
"grad_norm": 0.19921875,
"learning_rate": 9.535759096612297e-07,
"loss": 1.0015,
"step": 721
},
{
"epoch": 0.9058971141781681,
"grad_norm": 0.201171875,
"learning_rate": 9.410288582183188e-07,
"loss": 0.9525,
"step": 722
},
{
"epoch": 0.9071518193224593,
"grad_norm": 0.23828125,
"learning_rate": 9.284818067754078e-07,
"loss": 1.0432,
"step": 723
},
{
"epoch": 0.9084065244667503,
"grad_norm": 0.275390625,
"learning_rate": 9.159347553324969e-07,
"loss": 1.0255,
"step": 724
},
{
"epoch": 0.9096612296110415,
"grad_norm": 0.203125,
"learning_rate": 9.033877038895861e-07,
"loss": 1.0301,
"step": 725
},
{
"epoch": 0.9109159347553325,
"grad_norm": 0.205078125,
"learning_rate": 8.908406524466751e-07,
"loss": 1.0571,
"step": 726
},
{
"epoch": 0.9121706398996235,
"grad_norm": 0.2041015625,
"learning_rate": 8.782936010037642e-07,
"loss": 1.0558,
"step": 727
},
{
"epoch": 0.9134253450439147,
"grad_norm": 0.205078125,
"learning_rate": 8.657465495608533e-07,
"loss": 1.045,
"step": 728
},
{
"epoch": 0.9146800501882058,
"grad_norm": 0.259765625,
"learning_rate": 8.531994981179424e-07,
"loss": 1.0567,
"step": 729
},
{
"epoch": 0.9159347553324969,
"grad_norm": 0.455078125,
"learning_rate": 8.406524466750314e-07,
"loss": 1.0789,
"step": 730
},
{
"epoch": 0.917189460476788,
"grad_norm": 0.2138671875,
"learning_rate": 8.281053952321205e-07,
"loss": 1.0029,
"step": 731
},
{
"epoch": 0.918444165621079,
"grad_norm": 0.2255859375,
"learning_rate": 8.155583437892097e-07,
"loss": 1.0012,
"step": 732
},
{
"epoch": 0.9196988707653702,
"grad_norm": 0.2119140625,
"learning_rate": 8.030112923462986e-07,
"loss": 1.0303,
"step": 733
},
{
"epoch": 0.9209535759096612,
"grad_norm": 0.2119140625,
"learning_rate": 7.904642409033877e-07,
"loss": 1.081,
"step": 734
},
{
"epoch": 0.9222082810539524,
"grad_norm": 0.197265625,
"learning_rate": 7.779171894604768e-07,
"loss": 0.973,
"step": 735
},
{
"epoch": 0.9234629861982434,
"grad_norm": 0.291015625,
"learning_rate": 7.65370138017566e-07,
"loss": 1.0737,
"step": 736
},
{
"epoch": 0.9247176913425345,
"grad_norm": 0.201171875,
"learning_rate": 7.52823086574655e-07,
"loss": 1.0283,
"step": 737
},
{
"epoch": 0.9259723964868256,
"grad_norm": 0.2080078125,
"learning_rate": 7.402760351317441e-07,
"loss": 0.9872,
"step": 738
},
{
"epoch": 0.9272271016311167,
"grad_norm": 0.2138671875,
"learning_rate": 7.277289836888332e-07,
"loss": 1.0571,
"step": 739
},
{
"epoch": 0.9284818067754078,
"grad_norm": 0.1982421875,
"learning_rate": 7.151819322459223e-07,
"loss": 0.9945,
"step": 740
},
{
"epoch": 0.9297365119196989,
"grad_norm": 0.2080078125,
"learning_rate": 7.026348808030113e-07,
"loss": 1.0531,
"step": 741
},
{
"epoch": 0.93099121706399,
"grad_norm": 0.2041015625,
"learning_rate": 6.900878293601005e-07,
"loss": 1.014,
"step": 742
},
{
"epoch": 0.9322459222082811,
"grad_norm": 0.2080078125,
"learning_rate": 6.775407779171896e-07,
"loss": 0.9942,
"step": 743
},
{
"epoch": 0.9335006273525721,
"grad_norm": 0.216796875,
"learning_rate": 6.649937264742785e-07,
"loss": 1.0488,
"step": 744
},
{
"epoch": 0.9347553324968633,
"grad_norm": 0.1982421875,
"learning_rate": 6.524466750313676e-07,
"loss": 1.0216,
"step": 745
},
{
"epoch": 0.9360100376411543,
"grad_norm": 0.2080078125,
"learning_rate": 6.398996235884568e-07,
"loss": 1.0371,
"step": 746
},
{
"epoch": 0.9372647427854455,
"grad_norm": 0.205078125,
"learning_rate": 6.273525721455459e-07,
"loss": 1.0918,
"step": 747
},
{
"epoch": 0.9385194479297365,
"grad_norm": 0.30859375,
"learning_rate": 6.148055207026349e-07,
"loss": 1.0376,
"step": 748
},
{
"epoch": 0.9397741530740276,
"grad_norm": 0.2021484375,
"learning_rate": 6.02258469259724e-07,
"loss": 1.013,
"step": 749
},
{
"epoch": 0.9410288582183187,
"grad_norm": 0.2158203125,
"learning_rate": 5.897114178168131e-07,
"loss": 1.0821,
"step": 750
},
{
"epoch": 0.9422835633626098,
"grad_norm": 0.21875,
"learning_rate": 5.771643663739021e-07,
"loss": 1.0363,
"step": 751
},
{
"epoch": 0.9435382685069009,
"grad_norm": 0.203125,
"learning_rate": 5.646173149309913e-07,
"loss": 1.0259,
"step": 752
},
{
"epoch": 0.944792973651192,
"grad_norm": 0.2177734375,
"learning_rate": 5.520702634880804e-07,
"loss": 1.0375,
"step": 753
},
{
"epoch": 0.946047678795483,
"grad_norm": 0.20703125,
"learning_rate": 5.395232120451694e-07,
"loss": 0.9884,
"step": 754
},
{
"epoch": 0.9473023839397742,
"grad_norm": 0.208984375,
"learning_rate": 5.269761606022585e-07,
"loss": 0.9806,
"step": 755
},
{
"epoch": 0.9485570890840652,
"grad_norm": 0.2138671875,
"learning_rate": 5.144291091593476e-07,
"loss": 1.0393,
"step": 756
},
{
"epoch": 0.9498117942283564,
"grad_norm": 0.2265625,
"learning_rate": 5.018820577164367e-07,
"loss": 1.0035,
"step": 757
},
{
"epoch": 0.9510664993726474,
"grad_norm": 0.2265625,
"learning_rate": 4.893350062735257e-07,
"loss": 1.0313,
"step": 758
},
{
"epoch": 0.9523212045169385,
"grad_norm": 0.2255859375,
"learning_rate": 4.7678795483061487e-07,
"loss": 1.0483,
"step": 759
},
{
"epoch": 0.9535759096612296,
"grad_norm": 0.2158203125,
"learning_rate": 4.642409033877039e-07,
"loss": 1.026,
"step": 760
},
{
"epoch": 0.9548306148055207,
"grad_norm": 0.2109375,
"learning_rate": 4.5169385194479303e-07,
"loss": 1.0188,
"step": 761
},
{
"epoch": 0.9560853199498118,
"grad_norm": 0.2060546875,
"learning_rate": 4.391468005018821e-07,
"loss": 1.0507,
"step": 762
},
{
"epoch": 0.9573400250941029,
"grad_norm": 0.2060546875,
"learning_rate": 4.265997490589712e-07,
"loss": 1.003,
"step": 763
},
{
"epoch": 0.958594730238394,
"grad_norm": 0.2236328125,
"learning_rate": 4.1405269761606027e-07,
"loss": 1.0485,
"step": 764
},
{
"epoch": 0.9598494353826851,
"grad_norm": 0.2451171875,
"learning_rate": 4.015056461731493e-07,
"loss": 1.1192,
"step": 765
},
{
"epoch": 0.9611041405269761,
"grad_norm": 0.2109375,
"learning_rate": 3.889585947302384e-07,
"loss": 0.9953,
"step": 766
},
{
"epoch": 0.9623588456712673,
"grad_norm": 0.216796875,
"learning_rate": 3.764115432873275e-07,
"loss": 1.0077,
"step": 767
},
{
"epoch": 0.9636135508155583,
"grad_norm": 0.2109375,
"learning_rate": 3.638644918444166e-07,
"loss": 1.0298,
"step": 768
},
{
"epoch": 0.9648682559598495,
"grad_norm": 0.21484375,
"learning_rate": 3.5131744040150566e-07,
"loss": 1.0681,
"step": 769
},
{
"epoch": 0.9661229611041405,
"grad_norm": 0.2080078125,
"learning_rate": 3.387703889585948e-07,
"loss": 1.0081,
"step": 770
},
{
"epoch": 0.9673776662484316,
"grad_norm": 0.2060546875,
"learning_rate": 3.262233375156838e-07,
"loss": 1.0426,
"step": 771
},
{
"epoch": 0.9686323713927227,
"grad_norm": 0.2177734375,
"learning_rate": 3.1367628607277296e-07,
"loss": 1.0218,
"step": 772
},
{
"epoch": 0.9698870765370138,
"grad_norm": 0.205078125,
"learning_rate": 3.01129234629862e-07,
"loss": 1.0126,
"step": 773
},
{
"epoch": 0.9711417816813049,
"grad_norm": 0.208984375,
"learning_rate": 2.8858218318695106e-07,
"loss": 1.0691,
"step": 774
},
{
"epoch": 0.972396486825596,
"grad_norm": 0.240234375,
"learning_rate": 2.760351317440402e-07,
"loss": 0.963,
"step": 775
},
{
"epoch": 0.973651191969887,
"grad_norm": 0.2333984375,
"learning_rate": 2.634880803011293e-07,
"loss": 1.0191,
"step": 776
},
{
"epoch": 0.9749058971141782,
"grad_norm": 0.283203125,
"learning_rate": 2.5094102885821835e-07,
"loss": 1.0333,
"step": 777
},
{
"epoch": 0.9761606022584692,
"grad_norm": 0.248046875,
"learning_rate": 2.3839397741530743e-07,
"loss": 1.0624,
"step": 778
},
{
"epoch": 0.9774153074027604,
"grad_norm": 0.2021484375,
"learning_rate": 2.2584692597239651e-07,
"loss": 1.0546,
"step": 779
},
{
"epoch": 0.9786700125470514,
"grad_norm": 0.21875,
"learning_rate": 2.132998745294856e-07,
"loss": 1.0582,
"step": 780
},
{
"epoch": 0.9799247176913425,
"grad_norm": 0.19921875,
"learning_rate": 2.0075282308657465e-07,
"loss": 1.0658,
"step": 781
},
{
"epoch": 0.9811794228356336,
"grad_norm": 0.2080078125,
"learning_rate": 1.8820577164366375e-07,
"loss": 1.0176,
"step": 782
},
{
"epoch": 0.9824341279799247,
"grad_norm": 0.2080078125,
"learning_rate": 1.7565872020075283e-07,
"loss": 1.0045,
"step": 783
},
{
"epoch": 0.9836888331242158,
"grad_norm": 0.2099609375,
"learning_rate": 1.631116687578419e-07,
"loss": 1.0487,
"step": 784
},
{
"epoch": 0.9849435382685069,
"grad_norm": 0.275390625,
"learning_rate": 1.50564617314931e-07,
"loss": 0.9917,
"step": 785
},
{
"epoch": 0.986198243412798,
"grad_norm": 0.21875,
"learning_rate": 1.380175658720201e-07,
"loss": 1.0878,
"step": 786
},
{
"epoch": 0.9874529485570891,
"grad_norm": 0.33203125,
"learning_rate": 1.2547051442910918e-07,
"loss": 1.0832,
"step": 787
},
{
"epoch": 0.9887076537013801,
"grad_norm": 0.232421875,
"learning_rate": 1.1292346298619826e-07,
"loss": 1.0357,
"step": 788
},
{
"epoch": 0.9899623588456713,
"grad_norm": 0.2001953125,
"learning_rate": 1.0037641154328732e-07,
"loss": 0.9572,
"step": 789
},
{
"epoch": 0.9912170639899623,
"grad_norm": 0.2041015625,
"learning_rate": 8.782936010037642e-08,
"loss": 1.0341,
"step": 790
},
{
"epoch": 0.9924717691342535,
"grad_norm": 0.2119140625,
"learning_rate": 7.52823086574655e-08,
"loss": 1.0047,
"step": 791
},
{
"epoch": 0.9937264742785445,
"grad_norm": 0.205078125,
"learning_rate": 6.273525721455459e-08,
"loss": 0.9957,
"step": 792
},
{
"epoch": 0.9949811794228356,
"grad_norm": 0.2333984375,
"learning_rate": 5.018820577164366e-08,
"loss": 1.0317,
"step": 793
},
{
"epoch": 0.9962358845671268,
"grad_norm": 0.203125,
"learning_rate": 3.764115432873275e-08,
"loss": 1.0379,
"step": 794
},
{
"epoch": 0.9974905897114178,
"grad_norm": 0.20703125,
"learning_rate": 2.509410288582183e-08,
"loss": 0.9841,
"step": 795
},
{
"epoch": 0.998745294855709,
"grad_norm": 0.205078125,
"learning_rate": 1.2547051442910915e-08,
"loss": 1.0625,
"step": 796
},
{
"epoch": 1.0,
"grad_norm": 0.208984375,
"learning_rate": 0.0,
"loss": 0.9426,
"step": 797
}
],
"logging_steps": 1.0,
"max_steps": 797,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5590252305802854e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}