{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2002136426064398,
"eval_steps": 500,
"global_step": 656,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003052037234854265,
"grad_norm": 19.476922880741295,
"learning_rate": 1.0101010101010103e-07,
"loss": 1.1728,
"step": 1
},
{
"epoch": 0.000610407446970853,
"grad_norm": 29.879020388476594,
"learning_rate": 2.0202020202020205e-07,
"loss": 1.0955,
"step": 2
},
{
"epoch": 0.0009156111704562796,
"grad_norm": 24.931945947136526,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.9541,
"step": 3
},
{
"epoch": 0.001220814893941706,
"grad_norm": 27.83214939667906,
"learning_rate": 4.040404040404041e-07,
"loss": 1.0735,
"step": 4
},
{
"epoch": 0.0015260186174271325,
"grad_norm": 21.219233961021736,
"learning_rate": 5.05050505050505e-07,
"loss": 1.0455,
"step": 5
},
{
"epoch": 0.0018312223409125592,
"grad_norm": 20.022707446211225,
"learning_rate": 6.060606060606061e-07,
"loss": 0.9675,
"step": 6
},
{
"epoch": 0.0021364260643979855,
"grad_norm": 26.532427830157193,
"learning_rate": 7.070707070707071e-07,
"loss": 1.1393,
"step": 7
},
{
"epoch": 0.002441629787883412,
"grad_norm": 27.89728780710031,
"learning_rate": 8.080808080808082e-07,
"loss": 1.0952,
"step": 8
},
{
"epoch": 0.0027468335113688385,
"grad_norm": 20.346264005570532,
"learning_rate": 9.090909090909091e-07,
"loss": 0.9626,
"step": 9
},
{
"epoch": 0.003052037234854265,
"grad_norm": 18.804489508720884,
"learning_rate": 1.01010101010101e-06,
"loss": 1.0255,
"step": 10
},
{
"epoch": 0.003357240958339692,
"grad_norm": 19.776534785573535,
"learning_rate": 1.111111111111111e-06,
"loss": 0.7399,
"step": 11
},
{
"epoch": 0.0036624446818251184,
"grad_norm": 21.16130386460154,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.5413,
"step": 12
},
{
"epoch": 0.0039676484053105445,
"grad_norm": 16.482713371526263,
"learning_rate": 1.3131313131313134e-06,
"loss": 0.5773,
"step": 13
},
{
"epoch": 0.004272852128795971,
"grad_norm": 10.780528168770594,
"learning_rate": 1.4141414141414143e-06,
"loss": 0.6782,
"step": 14
},
{
"epoch": 0.0045780558522813975,
"grad_norm": 7.0900135030469915,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.9153,
"step": 15
},
{
"epoch": 0.004883259575766824,
"grad_norm": 8.490445320662754,
"learning_rate": 1.6161616161616164e-06,
"loss": 0.4798,
"step": 16
},
{
"epoch": 0.0051884632992522505,
"grad_norm": 6.677142812986669,
"learning_rate": 1.7171717171717173e-06,
"loss": 0.4782,
"step": 17
},
{
"epoch": 0.005493667022737677,
"grad_norm": 5.9204247946017485,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.3191,
"step": 18
},
{
"epoch": 0.0057988707462231035,
"grad_norm": 5.012462343754674,
"learning_rate": 1.9191919191919192e-06,
"loss": 0.4115,
"step": 19
},
{
"epoch": 0.00610407446970853,
"grad_norm": 3.9095937836899113,
"learning_rate": 2.02020202020202e-06,
"loss": 0.6158,
"step": 20
},
{
"epoch": 0.006409278193193957,
"grad_norm": 4.438163815129716,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.7388,
"step": 21
},
{
"epoch": 0.006714481916679384,
"grad_norm": 3.62875198348435,
"learning_rate": 2.222222222222222e-06,
"loss": 0.2875,
"step": 22
},
{
"epoch": 0.00701968564016481,
"grad_norm": 4.963543929599541,
"learning_rate": 2.3232323232323234e-06,
"loss": 0.4662,
"step": 23
},
{
"epoch": 0.007324889363650237,
"grad_norm": 4.274904100558248,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.5171,
"step": 24
},
{
"epoch": 0.007630093087135663,
"grad_norm": 2.670885047669819,
"learning_rate": 2.5252525252525258e-06,
"loss": 0.4488,
"step": 25
},
{
"epoch": 0.007935296810621089,
"grad_norm": 2.6864388610994014,
"learning_rate": 2.6262626262626267e-06,
"loss": 0.372,
"step": 26
},
{
"epoch": 0.008240500534106516,
"grad_norm": 3.804357369452407,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.2646,
"step": 27
},
{
"epoch": 0.008545704257591942,
"grad_norm": 4.059008227452532,
"learning_rate": 2.8282828282828286e-06,
"loss": 0.5907,
"step": 28
},
{
"epoch": 0.008850907981077369,
"grad_norm": 4.9062443629918855,
"learning_rate": 2.9292929292929295e-06,
"loss": 0.2972,
"step": 29
},
{
"epoch": 0.009156111704562795,
"grad_norm": 3.5391495380267064,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.3821,
"step": 30
},
{
"epoch": 0.009461315428048222,
"grad_norm": 2.5896920322264854,
"learning_rate": 3.131313131313132e-06,
"loss": 0.4164,
"step": 31
},
{
"epoch": 0.009766519151533648,
"grad_norm": 3.0230775761822937,
"learning_rate": 3.232323232323233e-06,
"loss": 0.4237,
"step": 32
},
{
"epoch": 0.010071722875019075,
"grad_norm": 2.8417717057519423,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.3353,
"step": 33
},
{
"epoch": 0.010376926598504501,
"grad_norm": 2.5789157463945878,
"learning_rate": 3.4343434343434347e-06,
"loss": 0.3769,
"step": 34
},
{
"epoch": 0.010682130321989928,
"grad_norm": 2.5222241581850096,
"learning_rate": 3.5353535353535356e-06,
"loss": 0.519,
"step": 35
},
{
"epoch": 0.010987334045475354,
"grad_norm": 2.8704682168269127,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.2829,
"step": 36
},
{
"epoch": 0.01129253776896078,
"grad_norm": 3.24684532820184,
"learning_rate": 3.737373737373738e-06,
"loss": 0.3586,
"step": 37
},
{
"epoch": 0.011597741492446207,
"grad_norm": 5.24792475783676,
"learning_rate": 3.8383838383838385e-06,
"loss": 0.402,
"step": 38
},
{
"epoch": 0.011902945215931634,
"grad_norm": 3.111184671834165,
"learning_rate": 3.93939393939394e-06,
"loss": 0.466,
"step": 39
},
{
"epoch": 0.01220814893941706,
"grad_norm": 3.165565566985893,
"learning_rate": 4.04040404040404e-06,
"loss": 0.2678,
"step": 40
},
{
"epoch": 0.012513352662902488,
"grad_norm": 2.5486933296193257,
"learning_rate": 4.141414141414142e-06,
"loss": 0.5457,
"step": 41
},
{
"epoch": 0.012818556386387915,
"grad_norm": 3.4373721012250438,
"learning_rate": 4.242424242424243e-06,
"loss": 0.3862,
"step": 42
},
{
"epoch": 0.013123760109873341,
"grad_norm": 2.863317221380458,
"learning_rate": 4.343434343434344e-06,
"loss": 0.3601,
"step": 43
},
{
"epoch": 0.013428963833358768,
"grad_norm": 2.1041128573446035,
"learning_rate": 4.444444444444444e-06,
"loss": 0.3693,
"step": 44
},
{
"epoch": 0.013734167556844194,
"grad_norm": 2.286990324679626,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2513,
"step": 45
},
{
"epoch": 0.01403937128032962,
"grad_norm": 8.793466778432636,
"learning_rate": 4.646464646464647e-06,
"loss": 0.4343,
"step": 46
},
{
"epoch": 0.014344575003815047,
"grad_norm": 1.8648737533834159,
"learning_rate": 4.747474747474748e-06,
"loss": 0.2631,
"step": 47
},
{
"epoch": 0.014649778727300474,
"grad_norm": 2.3081781364995324,
"learning_rate": 4.848484848484849e-06,
"loss": 0.2755,
"step": 48
},
{
"epoch": 0.0149549824507859,
"grad_norm": 2.284005369243557,
"learning_rate": 4.94949494949495e-06,
"loss": 0.4186,
"step": 49
},
{
"epoch": 0.015260186174271327,
"grad_norm": 2.6759709423238096,
"learning_rate": 5.0505050505050515e-06,
"loss": 0.6459,
"step": 50
},
{
"epoch": 0.015565389897756753,
"grad_norm": 2.8773749120652523,
"learning_rate": 5.151515151515152e-06,
"loss": 0.3324,
"step": 51
},
{
"epoch": 0.015870593621242178,
"grad_norm": 2.8060164424498786,
"learning_rate": 5.252525252525253e-06,
"loss": 0.3608,
"step": 52
},
{
"epoch": 0.016175797344727606,
"grad_norm": 2.3060494229726793,
"learning_rate": 5.353535353535354e-06,
"loss": 0.3818,
"step": 53
},
{
"epoch": 0.01648100106821303,
"grad_norm": 2.073464811557714,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.2667,
"step": 54
},
{
"epoch": 0.01678620479169846,
"grad_norm": 2.3474749655399245,
"learning_rate": 5.555555555555557e-06,
"loss": 0.35,
"step": 55
},
{
"epoch": 0.017091408515183884,
"grad_norm": 3.6988890036672086,
"learning_rate": 5.656565656565657e-06,
"loss": 0.284,
"step": 56
},
{
"epoch": 0.017396612238669312,
"grad_norm": 2.313501192849839,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.3308,
"step": 57
},
{
"epoch": 0.017701815962154737,
"grad_norm": 2.411936098122121,
"learning_rate": 5.858585858585859e-06,
"loss": 0.3982,
"step": 58
},
{
"epoch": 0.018007019685640165,
"grad_norm": 2.724660127775508,
"learning_rate": 5.95959595959596e-06,
"loss": 0.3587,
"step": 59
},
{
"epoch": 0.01831222340912559,
"grad_norm": 3.130895013540925,
"learning_rate": 6.060606060606061e-06,
"loss": 0.3427,
"step": 60
},
{
"epoch": 0.01861742713261102,
"grad_norm": 3.4261489723004614,
"learning_rate": 6.1616161616161615e-06,
"loss": 0.4578,
"step": 61
},
{
"epoch": 0.018922630856096443,
"grad_norm": 2.413871881063889,
"learning_rate": 6.262626262626264e-06,
"loss": 0.2067,
"step": 62
},
{
"epoch": 0.01922783457958187,
"grad_norm": 2.0941348505038366,
"learning_rate": 6.363636363636364e-06,
"loss": 0.27,
"step": 63
},
{
"epoch": 0.019533038303067296,
"grad_norm": 2.2153240133926153,
"learning_rate": 6.464646464646466e-06,
"loss": 0.3298,
"step": 64
},
{
"epoch": 0.019838242026552724,
"grad_norm": 2.422022070572305,
"learning_rate": 6.565656565656566e-06,
"loss": 0.4894,
"step": 65
},
{
"epoch": 0.02014344575003815,
"grad_norm": 2.45442660843552,
"learning_rate": 6.666666666666667e-06,
"loss": 0.3684,
"step": 66
},
{
"epoch": 0.020448649473523577,
"grad_norm": 3.5398238081108304,
"learning_rate": 6.767676767676769e-06,
"loss": 0.4233,
"step": 67
},
{
"epoch": 0.020753853197009002,
"grad_norm": 2.530397719080883,
"learning_rate": 6.868686868686869e-06,
"loss": 0.2676,
"step": 68
},
{
"epoch": 0.02105905692049443,
"grad_norm": 2.259346305696615,
"learning_rate": 6.969696969696971e-06,
"loss": 0.4409,
"step": 69
},
{
"epoch": 0.021364260643979855,
"grad_norm": 2.3339543424453764,
"learning_rate": 7.070707070707071e-06,
"loss": 0.3882,
"step": 70
},
{
"epoch": 0.021669464367465283,
"grad_norm": 2.348843038116063,
"learning_rate": 7.171717171717172e-06,
"loss": 0.3904,
"step": 71
},
{
"epoch": 0.021974668090950708,
"grad_norm": 2.7011363922899965,
"learning_rate": 7.272727272727273e-06,
"loss": 0.3586,
"step": 72
},
{
"epoch": 0.022279871814436136,
"grad_norm": 2.6923381814173486,
"learning_rate": 7.373737373737374e-06,
"loss": 0.4331,
"step": 73
},
{
"epoch": 0.02258507553792156,
"grad_norm": 2.0435337430530924,
"learning_rate": 7.474747474747476e-06,
"loss": 0.2739,
"step": 74
},
{
"epoch": 0.02289027926140699,
"grad_norm": 2.257183264462076,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.4554,
"step": 75
},
{
"epoch": 0.023195482984892414,
"grad_norm": 2.5384248372961626,
"learning_rate": 7.676767676767677e-06,
"loss": 0.4934,
"step": 76
},
{
"epoch": 0.023500686708377842,
"grad_norm": 2.1578730127908488,
"learning_rate": 7.77777777777778e-06,
"loss": 0.3519,
"step": 77
},
{
"epoch": 0.023805890431863267,
"grad_norm": 2.1316764516757476,
"learning_rate": 7.87878787878788e-06,
"loss": 0.3268,
"step": 78
},
{
"epoch": 0.024111094155348695,
"grad_norm": 2.095996278024237,
"learning_rate": 7.97979797979798e-06,
"loss": 0.3318,
"step": 79
},
{
"epoch": 0.02441629787883412,
"grad_norm": 1.9985574049541877,
"learning_rate": 8.08080808080808e-06,
"loss": 0.1852,
"step": 80
},
{
"epoch": 0.02472150160231955,
"grad_norm": 1.7092921737326583,
"learning_rate": 8.181818181818183e-06,
"loss": 0.2412,
"step": 81
},
{
"epoch": 0.025026705325804977,
"grad_norm": 1.9609482601524066,
"learning_rate": 8.282828282828283e-06,
"loss": 0.3349,
"step": 82
},
{
"epoch": 0.0253319090492904,
"grad_norm": 2.5619254980161412,
"learning_rate": 8.383838383838384e-06,
"loss": 0.3327,
"step": 83
},
{
"epoch": 0.02563711277277583,
"grad_norm": 2.1734116421771827,
"learning_rate": 8.484848484848486e-06,
"loss": 0.5005,
"step": 84
},
{
"epoch": 0.025942316496261254,
"grad_norm": 2.4612836321871785,
"learning_rate": 8.585858585858587e-06,
"loss": 0.5919,
"step": 85
},
{
"epoch": 0.026247520219746683,
"grad_norm": 2.050264187978962,
"learning_rate": 8.686868686868687e-06,
"loss": 0.2654,
"step": 86
},
{
"epoch": 0.026552723943232107,
"grad_norm": 1.7466792206761999,
"learning_rate": 8.787878787878788e-06,
"loss": 0.2875,
"step": 87
},
{
"epoch": 0.026857927666717536,
"grad_norm": 1.9114055019911376,
"learning_rate": 8.888888888888888e-06,
"loss": 0.3317,
"step": 88
},
{
"epoch": 0.02716313139020296,
"grad_norm": 2.136028617695754,
"learning_rate": 8.98989898989899e-06,
"loss": 0.4322,
"step": 89
},
{
"epoch": 0.02746833511368839,
"grad_norm": 2.0559196693817303,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3372,
"step": 90
},
{
"epoch": 0.027773538837173813,
"grad_norm": 1.6053810559753854,
"learning_rate": 9.191919191919193e-06,
"loss": 0.2833,
"step": 91
},
{
"epoch": 0.02807874256065924,
"grad_norm": 1.9190338968500587,
"learning_rate": 9.292929292929294e-06,
"loss": 0.2358,
"step": 92
},
{
"epoch": 0.028383946284144666,
"grad_norm": 1.7424429804531956,
"learning_rate": 9.393939393939396e-06,
"loss": 0.2805,
"step": 93
},
{
"epoch": 0.028689150007630095,
"grad_norm": 1.5616301594921251,
"learning_rate": 9.494949494949497e-06,
"loss": 0.326,
"step": 94
},
{
"epoch": 0.02899435373111552,
"grad_norm": 2.6517363851490297,
"learning_rate": 9.595959595959597e-06,
"loss": 0.5839,
"step": 95
},
{
"epoch": 0.029299557454600948,
"grad_norm": 1.9068377479857994,
"learning_rate": 9.696969696969698e-06,
"loss": 0.4213,
"step": 96
},
{
"epoch": 0.029604761178086372,
"grad_norm": 2.147263972819766,
"learning_rate": 9.797979797979798e-06,
"loss": 0.3776,
"step": 97
},
{
"epoch": 0.0299099649015718,
"grad_norm": 2.3466004395170685,
"learning_rate": 9.8989898989899e-06,
"loss": 0.4828,
"step": 98
},
{
"epoch": 0.030215168625057225,
"grad_norm": 1.9328188798162316,
"learning_rate": 1e-05,
"loss": 0.3816,
"step": 99
},
{
"epoch": 0.030520372348542654,
"grad_norm": 2.120656679761712,
"learning_rate": 9.999997555414177e-06,
"loss": 0.287,
"step": 100
},
{
"epoch": 0.03082557607202808,
"grad_norm": 1.8272767014289886,
"learning_rate": 9.999990221659095e-06,
"loss": 0.2529,
"step": 101
},
{
"epoch": 0.031130779795513507,
"grad_norm": 2.108876035097533,
"learning_rate": 9.999977998741925e-06,
"loss": 0.4,
"step": 102
},
{
"epoch": 0.031435983518998935,
"grad_norm": 2.611227326027621,
"learning_rate": 9.999960886674623e-06,
"loss": 0.5577,
"step": 103
},
{
"epoch": 0.031741187242484356,
"grad_norm": 2.012760226088087,
"learning_rate": 9.999938885473916e-06,
"loss": 0.2397,
"step": 104
},
{
"epoch": 0.032046390965969784,
"grad_norm": 3.4069313977643088,
"learning_rate": 9.999911995161323e-06,
"loss": 0.3074,
"step": 105
},
{
"epoch": 0.03235159468945521,
"grad_norm": 1.5281487804348939,
"learning_rate": 9.999880215763133e-06,
"loss": 0.306,
"step": 106
},
{
"epoch": 0.03265679841294064,
"grad_norm": 1.5733903167529437,
"learning_rate": 9.999843547310427e-06,
"loss": 0.3123,
"step": 107
},
{
"epoch": 0.03296200213642606,
"grad_norm": 2.2084260837102776,
"learning_rate": 9.999801989839055e-06,
"loss": 0.2686,
"step": 108
},
{
"epoch": 0.03326720585991149,
"grad_norm": 2.0235527329790477,
"learning_rate": 9.999755543389658e-06,
"loss": 0.362,
"step": 109
},
{
"epoch": 0.03357240958339692,
"grad_norm": 1.4126246608311444,
"learning_rate": 9.999704208007647e-06,
"loss": 0.1868,
"step": 110
},
{
"epoch": 0.03387761330688235,
"grad_norm": 1.9363750145032863,
"learning_rate": 9.999647983743227e-06,
"loss": 0.4674,
"step": 111
},
{
"epoch": 0.03418281703036777,
"grad_norm": 2.306492812857686,
"learning_rate": 9.999586870651372e-06,
"loss": 0.7454,
"step": 112
},
{
"epoch": 0.034488020753853196,
"grad_norm": 1.9927578577114744,
"learning_rate": 9.999520868791839e-06,
"loss": 0.2964,
"step": 113
},
{
"epoch": 0.034793224477338625,
"grad_norm": 2.897230200199283,
"learning_rate": 9.99944997822917e-06,
"loss": 0.3507,
"step": 114
},
{
"epoch": 0.03509842820082405,
"grad_norm": 1.7040567211820554,
"learning_rate": 9.999374199032682e-06,
"loss": 0.358,
"step": 115
},
{
"epoch": 0.035403631924309474,
"grad_norm": 1.7684725864001616,
"learning_rate": 9.999293531276475e-06,
"loss": 0.469,
"step": 116
},
{
"epoch": 0.0357088356477949,
"grad_norm": 2.151331613378997,
"learning_rate": 9.999207975039429e-06,
"loss": 0.4007,
"step": 117
},
{
"epoch": 0.03601403937128033,
"grad_norm": 2.1827006415812678,
"learning_rate": 9.999117530405205e-06,
"loss": 0.373,
"step": 118
},
{
"epoch": 0.03631924309476576,
"grad_norm": 2.0424756244526283,
"learning_rate": 9.99902219746224e-06,
"loss": 0.4664,
"step": 119
},
{
"epoch": 0.03662444681825118,
"grad_norm": 2.4438750213097014,
"learning_rate": 9.998921976303757e-06,
"loss": 0.5884,
"step": 120
},
{
"epoch": 0.03692965054173661,
"grad_norm": 1.6168805259489245,
"learning_rate": 9.998816867027753e-06,
"loss": 0.3874,
"step": 121
},
{
"epoch": 0.03723485426522204,
"grad_norm": 2.4836564854380914,
"learning_rate": 9.99870686973701e-06,
"loss": 0.3865,
"step": 122
},
{
"epoch": 0.037540057988707465,
"grad_norm": 2.187549263535683,
"learning_rate": 9.998591984539085e-06,
"loss": 0.4419,
"step": 123
},
{
"epoch": 0.037845261712192886,
"grad_norm": 2.3145724108896366,
"learning_rate": 9.998472211546317e-06,
"loss": 0.5048,
"step": 124
},
{
"epoch": 0.038150465435678314,
"grad_norm": 2.6043824271784377,
"learning_rate": 9.998347550875825e-06,
"loss": 0.4323,
"step": 125
},
{
"epoch": 0.03845566915916374,
"grad_norm": 1.7266964407358079,
"learning_rate": 9.998218002649507e-06,
"loss": 0.3093,
"step": 126
},
{
"epoch": 0.03876087288264917,
"grad_norm": 2.3091863655820397,
"learning_rate": 9.99808356699404e-06,
"loss": 0.5394,
"step": 127
},
{
"epoch": 0.03906607660613459,
"grad_norm": 2.178584103245907,
"learning_rate": 9.997944244040877e-06,
"loss": 0.562,
"step": 128
},
{
"epoch": 0.03937128032962002,
"grad_norm": 1.4762803065381216,
"learning_rate": 9.997800033926252e-06,
"loss": 0.3012,
"step": 129
},
{
"epoch": 0.03967648405310545,
"grad_norm": 1.6768704233807339,
"learning_rate": 9.997650936791183e-06,
"loss": 0.3314,
"step": 130
},
{
"epoch": 0.03998168777659088,
"grad_norm": 1.8423584681568375,
"learning_rate": 9.997496952781461e-06,
"loss": 0.5373,
"step": 131
},
{
"epoch": 0.0402868915000763,
"grad_norm": 1.4926628434179245,
"learning_rate": 9.997338082047656e-06,
"loss": 0.1992,
"step": 132
},
{
"epoch": 0.040592095223561726,
"grad_norm": 1.6323074947028773,
"learning_rate": 9.997174324745117e-06,
"loss": 0.4872,
"step": 133
},
{
"epoch": 0.040897298947047155,
"grad_norm": 2.159688005520465,
"learning_rate": 9.997005681033973e-06,
"loss": 0.5076,
"step": 134
},
{
"epoch": 0.04120250267053258,
"grad_norm": 2.207163038792008,
"learning_rate": 9.996832151079127e-06,
"loss": 0.2677,
"step": 135
},
{
"epoch": 0.041507706394018004,
"grad_norm": 1.3990677420334965,
"learning_rate": 9.996653735050265e-06,
"loss": 0.2526,
"step": 136
},
{
"epoch": 0.04181291011750343,
"grad_norm": 1.7368886105229604,
"learning_rate": 9.996470433121847e-06,
"loss": 0.2874,
"step": 137
},
{
"epoch": 0.04211811384098886,
"grad_norm": 1.8138446424045762,
"learning_rate": 9.996282245473113e-06,
"loss": 0.2986,
"step": 138
},
{
"epoch": 0.04242331756447429,
"grad_norm": 1.8564789601928355,
"learning_rate": 9.996089172288078e-06,
"loss": 0.3954,
"step": 139
},
{
"epoch": 0.04272852128795971,
"grad_norm": 1.9085920361180522,
"learning_rate": 9.995891213755536e-06,
"loss": 0.2739,
"step": 140
},
{
"epoch": 0.04303372501144514,
"grad_norm": 1.8924678931794556,
"learning_rate": 9.99568837006906e-06,
"loss": 0.2766,
"step": 141
},
{
"epoch": 0.04333892873493057,
"grad_norm": 1.8418836037208652,
"learning_rate": 9.995480641426992e-06,
"loss": 0.488,
"step": 142
},
{
"epoch": 0.043644132458415995,
"grad_norm": 1.6305125707231247,
"learning_rate": 9.99526802803246e-06,
"loss": 0.3045,
"step": 143
},
{
"epoch": 0.043949336181901416,
"grad_norm": 2.143051665423358,
"learning_rate": 9.995050530093366e-06,
"loss": 0.3567,
"step": 144
},
{
"epoch": 0.044254539905386844,
"grad_norm": 1.994194545633334,
"learning_rate": 9.994828147822387e-06,
"loss": 0.3655,
"step": 145
},
{
"epoch": 0.04455974362887227,
"grad_norm": 1.8553346605537173,
"learning_rate": 9.994600881436972e-06,
"loss": 0.3249,
"step": 146
},
{
"epoch": 0.0448649473523577,
"grad_norm": 2.1613773805709857,
"learning_rate": 9.994368731159351e-06,
"loss": 0.4863,
"step": 147
},
{
"epoch": 0.04517015107584312,
"grad_norm": 2.199571706523493,
"learning_rate": 9.99413169721653e-06,
"loss": 0.465,
"step": 148
},
{
"epoch": 0.04547535479932855,
"grad_norm": 1.681707967900651,
"learning_rate": 9.99388977984029e-06,
"loss": 0.3472,
"step": 149
},
{
"epoch": 0.04578055852281398,
"grad_norm": 1.6586587053140593,
"learning_rate": 9.993642979267184e-06,
"loss": 0.3626,
"step": 150
},
{
"epoch": 0.04608576224629941,
"grad_norm": 2.12592721793332,
"learning_rate": 9.993391295738542e-06,
"loss": 0.3218,
"step": 151
},
{
"epoch": 0.04639096596978483,
"grad_norm": 1.6765944279655143,
"learning_rate": 9.99313472950047e-06,
"loss": 0.3402,
"step": 152
},
{
"epoch": 0.046696169693270256,
"grad_norm": 1.6019038139070678,
"learning_rate": 9.992873280803848e-06,
"loss": 0.4554,
"step": 153
},
{
"epoch": 0.047001373416755685,
"grad_norm": 1.6429860881882794,
"learning_rate": 9.99260694990433e-06,
"loss": 0.4086,
"step": 154
},
{
"epoch": 0.04730657714024111,
"grad_norm": 1.98592334325083,
"learning_rate": 9.992335737062338e-06,
"loss": 0.5733,
"step": 155
},
{
"epoch": 0.047611780863726534,
"grad_norm": 1.5624846648417388,
"learning_rate": 9.992059642543076e-06,
"loss": 0.2524,
"step": 156
},
{
"epoch": 0.04791698458721196,
"grad_norm": 1.4438198320418865,
"learning_rate": 9.991778666616523e-06,
"loss": 0.1756,
"step": 157
},
{
"epoch": 0.04822218831069739,
"grad_norm": 1.6284817295660008,
"learning_rate": 9.991492809557424e-06,
"loss": 0.4144,
"step": 158
},
{
"epoch": 0.04852739203418282,
"grad_norm": 1.2236340789910145,
"learning_rate": 9.991202071645298e-06,
"loss": 0.1664,
"step": 159
},
{
"epoch": 0.04883259575766824,
"grad_norm": 1.4874398163232816,
"learning_rate": 9.99090645316444e-06,
"loss": 0.3323,
"step": 160
},
{
"epoch": 0.04913779948115367,
"grad_norm": 2.5394515927833403,
"learning_rate": 9.990605954403917e-06,
"loss": 0.27,
"step": 161
},
{
"epoch": 0.0494430032046391,
"grad_norm": 1.7966332314422868,
"learning_rate": 9.990300575657565e-06,
"loss": 0.4453,
"step": 162
},
{
"epoch": 0.049748206928124525,
"grad_norm": 1.825976682624809,
"learning_rate": 9.989990317223995e-06,
"loss": 0.2646,
"step": 163
},
{
"epoch": 0.05005341065160995,
"grad_norm": 1.6554541925183588,
"learning_rate": 9.989675179406588e-06,
"loss": 0.445,
"step": 164
},
{
"epoch": 0.050358614375095374,
"grad_norm": 1.6711133844293076,
"learning_rate": 9.989355162513496e-06,
"loss": 0.3685,
"step": 165
},
{
"epoch": 0.0506638180985808,
"grad_norm": 1.8033315345252203,
"learning_rate": 9.989030266857644e-06,
"loss": 0.2566,
"step": 166
},
{
"epoch": 0.05096902182206623,
"grad_norm": 1.6879852444966537,
"learning_rate": 9.988700492756726e-06,
"loss": 0.4086,
"step": 167
},
{
"epoch": 0.05127422554555166,
"grad_norm": 1.6855038740169574,
"learning_rate": 9.988365840533204e-06,
"loss": 0.3081,
"step": 168
},
{
"epoch": 0.05157942926903708,
"grad_norm": 2.245121010490438,
"learning_rate": 9.988026310514316e-06,
"loss": 0.5646,
"step": 169
},
{
"epoch": 0.05188463299252251,
"grad_norm": 1.531117336209479,
"learning_rate": 9.987681903032065e-06,
"loss": 0.3598,
"step": 170
},
{
"epoch": 0.05218983671600794,
"grad_norm": 1.4368727600956301,
"learning_rate": 9.987332618423221e-06,
"loss": 0.3864,
"step": 171
},
{
"epoch": 0.052495040439493365,
"grad_norm": 2.039026486601271,
"learning_rate": 9.98697845702933e-06,
"loss": 0.2728,
"step": 172
},
{
"epoch": 0.052800244162978786,
"grad_norm": 1.5481974795842472,
"learning_rate": 9.986619419196704e-06,
"loss": 0.2376,
"step": 173
},
{
"epoch": 0.053105447886464215,
"grad_norm": 1.583025735121783,
"learning_rate": 9.986255505276418e-06,
"loss": 0.3941,
"step": 174
},
{
"epoch": 0.05341065160994964,
"grad_norm": 2.025610033619695,
"learning_rate": 9.985886715624326e-06,
"loss": 0.432,
"step": 175
},
{
"epoch": 0.05371585533343507,
"grad_norm": 1.9370365819159912,
"learning_rate": 9.985513050601037e-06,
"loss": 0.3311,
"step": 176
},
{
"epoch": 0.05402105905692049,
"grad_norm": 1.534591376747653,
"learning_rate": 9.985134510571936e-06,
"loss": 0.3804,
"step": 177
},
{
"epoch": 0.05432626278040592,
"grad_norm": 1.5627980520171343,
"learning_rate": 9.984751095907175e-06,
"loss": 0.3991,
"step": 178
},
{
"epoch": 0.05463146650389135,
"grad_norm": 1.858760828475349,
"learning_rate": 9.984362806981665e-06,
"loss": 0.4124,
"step": 179
},
{
"epoch": 0.05493667022737678,
"grad_norm": 1.4922057145689682,
"learning_rate": 9.983969644175092e-06,
"loss": 0.2571,
"step": 180
},
{
"epoch": 0.0552418739508622,
"grad_norm": 1.4358215484460224,
"learning_rate": 9.983571607871903e-06,
"loss": 0.3351,
"step": 181
},
{
"epoch": 0.05554707767434763,
"grad_norm": 1.7105120125454414,
"learning_rate": 9.983168698461312e-06,
"loss": 0.4374,
"step": 182
},
{
"epoch": 0.055852281397833055,
"grad_norm": 1.4100459259074987,
"learning_rate": 9.982760916337296e-06,
"loss": 0.3958,
"step": 183
},
{
"epoch": 0.05615748512131848,
"grad_norm": 1.667173817085955,
"learning_rate": 9.982348261898598e-06,
"loss": 0.2867,
"step": 184
},
{
"epoch": 0.056462688844803904,
"grad_norm": 1.8278737995984025,
"learning_rate": 9.981930735548731e-06,
"loss": 0.3738,
"step": 185
},
{
"epoch": 0.05676789256828933,
"grad_norm": 1.806852289121097,
"learning_rate": 9.98150833769596e-06,
"loss": 0.5608,
"step": 186
},
{
"epoch": 0.05707309629177476,
"grad_norm": 1.6986308867720055,
"learning_rate": 9.981081068753324e-06,
"loss": 0.4253,
"step": 187
},
{
"epoch": 0.05737830001526019,
"grad_norm": 1.6392088091109513,
"learning_rate": 9.98064892913862e-06,
"loss": 0.2444,
"step": 188
},
{
"epoch": 0.05768350373874561,
"grad_norm": 1.7762995408711126,
"learning_rate": 9.980211919274407e-06,
"loss": 0.3866,
"step": 189
},
{
"epoch": 0.05798870746223104,
"grad_norm": 1.7144647062044762,
"learning_rate": 9.979770039588013e-06,
"loss": 0.4504,
"step": 190
},
{
"epoch": 0.05829391118571647,
"grad_norm": 1.9069269572943617,
"learning_rate": 9.979323290511517e-06,
"loss": 0.4972,
"step": 191
},
{
"epoch": 0.058599114909201895,
"grad_norm": 1.831943664409223,
"learning_rate": 9.978871672481774e-06,
"loss": 0.3884,
"step": 192
},
{
"epoch": 0.058904318632687316,
"grad_norm": 1.60483584957947,
"learning_rate": 9.978415185940383e-06,
"loss": 0.3366,
"step": 193
},
{
"epoch": 0.059209522356172745,
"grad_norm": 2.041633475935638,
"learning_rate": 9.977953831333718e-06,
"loss": 0.4928,
"step": 194
},
{
"epoch": 0.05951472607965817,
"grad_norm": 2.1574861604284243,
"learning_rate": 9.977487609112904e-06,
"loss": 0.7092,
"step": 195
},
{
"epoch": 0.0598199298031436,
"grad_norm": 1.5382345073334531,
"learning_rate": 9.97701651973383e-06,
"loss": 0.2236,
"step": 196
},
{
"epoch": 0.06012513352662902,
"grad_norm": 2.1479787995768014,
"learning_rate": 9.976540563657143e-06,
"loss": 0.5182,
"step": 197
},
{
"epoch": 0.06043033725011445,
"grad_norm": 1.8579437774142544,
"learning_rate": 9.976059741348252e-06,
"loss": 0.3093,
"step": 198
},
{
"epoch": 0.06073554097359988,
"grad_norm": 1.5409701380525285,
"learning_rate": 9.975574053277317e-06,
"loss": 0.2877,
"step": 199
},
{
"epoch": 0.06104074469708531,
"grad_norm": 1.5474598097011698,
"learning_rate": 9.975083499919264e-06,
"loss": 0.2981,
"step": 200
},
{
"epoch": 0.06134594842057073,
"grad_norm": 1.9202152932180157,
"learning_rate": 9.974588081753773e-06,
"loss": 0.5369,
"step": 201
},
{
"epoch": 0.06165115214405616,
"grad_norm": 1.4598442515817716,
"learning_rate": 9.974087799265279e-06,
"loss": 0.3696,
"step": 202
},
{
"epoch": 0.061956355867541585,
"grad_norm": 1.48078814360119,
"learning_rate": 9.973582652942975e-06,
"loss": 0.284,
"step": 203
},
{
"epoch": 0.06226155959102701,
"grad_norm": 2.100326004155181,
"learning_rate": 9.973072643280813e-06,
"loss": 0.5681,
"step": 204
},
{
"epoch": 0.06256676331451244,
"grad_norm": 1.976128330719915,
"learning_rate": 9.972557770777496e-06,
"loss": 0.3655,
"step": 205
},
{
"epoch": 0.06287196703799787,
"grad_norm": 1.2103730393566896,
"learning_rate": 9.972038035936483e-06,
"loss": 0.2471,
"step": 206
},
{
"epoch": 0.06317717076148328,
"grad_norm": 1.670449906238349,
"learning_rate": 9.971513439265992e-06,
"loss": 0.2184,
"step": 207
},
{
"epoch": 0.06348237448496871,
"grad_norm": 1.5020544764497652,
"learning_rate": 9.970983981278989e-06,
"loss": 0.3196,
"step": 208
},
{
"epoch": 0.06378757820845414,
"grad_norm": 1.7833251911345853,
"learning_rate": 9.970449662493195e-06,
"loss": 0.4122,
"step": 209
},
{
"epoch": 0.06409278193193957,
"grad_norm": 1.4149595334362772,
"learning_rate": 9.96991048343109e-06,
"loss": 0.2947,
"step": 210
},
{
"epoch": 0.064397985655425,
"grad_norm": 1.5991867680932033,
"learning_rate": 9.969366444619898e-06,
"loss": 0.1902,
"step": 211
},
{
"epoch": 0.06470318937891043,
"grad_norm": 1.4132064841734169,
"learning_rate": 9.968817546591601e-06,
"loss": 0.3389,
"step": 212
},
{
"epoch": 0.06500839310239585,
"grad_norm": 1.7671902900221814,
"learning_rate": 9.968263789882926e-06,
"loss": 0.4294,
"step": 213
},
{
"epoch": 0.06531359682588128,
"grad_norm": 1.5709821497329826,
"learning_rate": 9.96770517503536e-06,
"loss": 0.2765,
"step": 214
},
{
"epoch": 0.0656188005493667,
"grad_norm": 1.5211731343844295,
"learning_rate": 9.967141702595134e-06,
"loss": 0.387,
"step": 215
},
{
"epoch": 0.06592400427285212,
"grad_norm": 1.5499265222668686,
"learning_rate": 9.96657337311323e-06,
"loss": 0.4535,
"step": 216
},
{
"epoch": 0.06622920799633755,
"grad_norm": 1.4736546539447488,
"learning_rate": 9.966000187145383e-06,
"loss": 0.3834,
"step": 217
},
{
"epoch": 0.06653441171982298,
"grad_norm": 1.3306288958233108,
"learning_rate": 9.965422145252072e-06,
"loss": 0.3172,
"step": 218
},
{
"epoch": 0.06683961544330841,
"grad_norm": 1.5745937005003143,
"learning_rate": 9.964839247998524e-06,
"loss": 0.2725,
"step": 219
},
{
"epoch": 0.06714481916679384,
"grad_norm": 1.7546511557153388,
"learning_rate": 9.96425149595472e-06,
"loss": 0.3577,
"step": 220
},
{
"epoch": 0.06745002289027927,
"grad_norm": 2.0422588449754286,
"learning_rate": 9.96365888969538e-06,
"loss": 0.4976,
"step": 221
},
{
"epoch": 0.0677552266137647,
"grad_norm": 1.4661824124133862,
"learning_rate": 9.963061429799979e-06,
"loss": 0.3672,
"step": 222
},
{
"epoch": 0.06806043033725011,
"grad_norm": 2.0959067552369666,
"learning_rate": 9.96245911685273e-06,
"loss": 0.5381,
"step": 223
},
{
"epoch": 0.06836563406073554,
"grad_norm": 1.3296813372997014,
"learning_rate": 9.961851951442599e-06,
"loss": 0.2799,
"step": 224
},
{
"epoch": 0.06867083778422096,
"grad_norm": 1.7385807765114274,
"learning_rate": 9.96123993416329e-06,
"loss": 0.5183,
"step": 225
},
{
"epoch": 0.06897604150770639,
"grad_norm": 1.5190119701865645,
"learning_rate": 9.960623065613254e-06,
"loss": 0.4608,
"step": 226
},
{
"epoch": 0.06928124523119182,
"grad_norm": 1.4393894383331207,
"learning_rate": 9.96000134639569e-06,
"loss": 0.3455,
"step": 227
},
{
"epoch": 0.06958644895467725,
"grad_norm": 1.7132863682619555,
"learning_rate": 9.959374777118533e-06,
"loss": 0.316,
"step": 228
},
{
"epoch": 0.06989165267816268,
"grad_norm": 1.3227120889592454,
"learning_rate": 9.958743358394464e-06,
"loss": 0.2467,
"step": 229
},
{
"epoch": 0.0701968564016481,
"grad_norm": 1.5331153407144422,
"learning_rate": 9.95810709084091e-06,
"loss": 0.3138,
"step": 230
},
{
"epoch": 0.07050206012513352,
"grad_norm": 1.7990748995190806,
"learning_rate": 9.957465975080031e-06,
"loss": 0.4747,
"step": 231
},
{
"epoch": 0.07080726384861895,
"grad_norm": 1.1638981235859056,
"learning_rate": 9.956820011738736e-06,
"loss": 0.2265,
"step": 232
},
{
"epoch": 0.07111246757210438,
"grad_norm": 1.5739388418179414,
"learning_rate": 9.956169201448665e-06,
"loss": 0.5066,
"step": 233
},
{
"epoch": 0.0714176712955898,
"grad_norm": 1.6803933013620869,
"learning_rate": 9.955513544846205e-06,
"loss": 0.4415,
"step": 234
},
{
"epoch": 0.07172287501907523,
"grad_norm": 1.4014872110785643,
"learning_rate": 9.954853042572479e-06,
"loss": 0.3271,
"step": 235
},
{
"epoch": 0.07202807874256066,
"grad_norm": 1.5310222689941932,
"learning_rate": 9.954187695273352e-06,
"loss": 0.3289,
"step": 236
},
{
"epoch": 0.07233328246604609,
"grad_norm": 2.166268226472017,
"learning_rate": 9.953517503599419e-06,
"loss": 0.622,
"step": 237
},
{
"epoch": 0.07263848618953152,
"grad_norm": 2.258081862277545,
"learning_rate": 9.952842468206019e-06,
"loss": 0.5071,
"step": 238
},
{
"epoch": 0.07294368991301693,
"grad_norm": 1.7322119894263104,
"learning_rate": 9.952162589753224e-06,
"loss": 0.5097,
"step": 239
},
{
"epoch": 0.07324889363650236,
"grad_norm": 1.9966284228033864,
"learning_rate": 9.951477868905843e-06,
"loss": 0.2263,
"step": 240
},
{
"epoch": 0.07355409735998779,
"grad_norm": 1.6793267860774614,
"learning_rate": 9.95078830633342e-06,
"loss": 0.2065,
"step": 241
},
{
"epoch": 0.07385930108347322,
"grad_norm": 2.122564153881175,
"learning_rate": 9.95009390271023e-06,
"loss": 0.2665,
"step": 242
},
{
"epoch": 0.07416450480695864,
"grad_norm": 1.5852282963187305,
"learning_rate": 9.949394658715289e-06,
"loss": 0.4453,
"step": 243
},
{
"epoch": 0.07446970853044407,
"grad_norm": 1.7534712016120517,
"learning_rate": 9.948690575032338e-06,
"loss": 0.3628,
"step": 244
},
{
"epoch": 0.0747749122539295,
"grad_norm": 1.351810586905304,
"learning_rate": 9.947981652349854e-06,
"loss": 0.3984,
"step": 245
},
{
"epoch": 0.07508011597741493,
"grad_norm": 1.8377506474408298,
"learning_rate": 9.947267891361051e-06,
"loss": 0.3677,
"step": 246
},
{
"epoch": 0.07538531970090036,
"grad_norm": 1.4655632998364951,
"learning_rate": 9.946549292763865e-06,
"loss": 0.3516,
"step": 247
},
{
"epoch": 0.07569052342438577,
"grad_norm": 3.240838121636416,
"learning_rate": 9.945825857260967e-06,
"loss": 0.2627,
"step": 248
},
{
"epoch": 0.0759957271478712,
"grad_norm": 1.4085823215183912,
"learning_rate": 9.945097585559757e-06,
"loss": 0.2716,
"step": 249
},
{
"epoch": 0.07630093087135663,
"grad_norm": 1.6361471921651585,
"learning_rate": 9.944364478372364e-06,
"loss": 0.3595,
"step": 250
},
{
"epoch": 0.07660613459484206,
"grad_norm": 1.0912978886499554,
"learning_rate": 9.943626536415647e-06,
"loss": 0.1968,
"step": 251
},
{
"epoch": 0.07691133831832749,
"grad_norm": 1.9515717700893849,
"learning_rate": 9.942883760411188e-06,
"loss": 0.374,
"step": 252
},
{
"epoch": 0.07721654204181291,
"grad_norm": 1.5560755068838334,
"learning_rate": 9.942136151085302e-06,
"loss": 0.44,
"step": 253
},
{
"epoch": 0.07752174576529834,
"grad_norm": 1.4843235207715992,
"learning_rate": 9.941383709169024e-06,
"loss": 0.3175,
"step": 254
},
{
"epoch": 0.07782694948878377,
"grad_norm": 1.5210960196158274,
"learning_rate": 9.94062643539812e-06,
"loss": 0.3722,
"step": 255
},
{
"epoch": 0.07813215321226918,
"grad_norm": 1.6656094376801425,
"learning_rate": 9.939864330513079e-06,
"loss": 0.3511,
"step": 256
},
{
"epoch": 0.07843735693575461,
"grad_norm": 1.2732857455769802,
"learning_rate": 9.939097395259108e-06,
"loss": 0.2619,
"step": 257
},
{
"epoch": 0.07874256065924004,
"grad_norm": 1.8947301386622588,
"learning_rate": 9.938325630386149e-06,
"loss": 0.3933,
"step": 258
},
{
"epoch": 0.07904776438272547,
"grad_norm": 1.5625416559388712,
"learning_rate": 9.937549036648857e-06,
"loss": 0.4491,
"step": 259
},
{
"epoch": 0.0793529681062109,
"grad_norm": 1.5125179888703784,
"learning_rate": 9.936767614806612e-06,
"loss": 0.3674,
"step": 260
},
{
"epoch": 0.07965817182969633,
"grad_norm": 1.5026525250547669,
"learning_rate": 9.935981365623516e-06,
"loss": 0.4103,
"step": 261
},
{
"epoch": 0.07996337555318175,
"grad_norm": 2.3948536293362115,
"learning_rate": 9.93519028986839e-06,
"loss": 0.4009,
"step": 262
},
{
"epoch": 0.08026857927666718,
"grad_norm": 2.416554371647352,
"learning_rate": 9.934394388314775e-06,
"loss": 0.4265,
"step": 263
},
{
"epoch": 0.0805737830001526,
"grad_norm": 1.560923734953618,
"learning_rate": 9.933593661740933e-06,
"loss": 0.303,
"step": 264
},
{
"epoch": 0.08087898672363802,
"grad_norm": 1.6053945705234087,
"learning_rate": 9.932788110929837e-06,
"loss": 0.3295,
"step": 265
},
{
"epoch": 0.08118419044712345,
"grad_norm": 1.7775437462596928,
"learning_rate": 9.931977736669185e-06,
"loss": 0.2197,
"step": 266
},
{
"epoch": 0.08148939417060888,
"grad_norm": 1.701318325041301,
"learning_rate": 9.931162539751392e-06,
"loss": 0.3581,
"step": 267
},
{
"epoch": 0.08179459789409431,
"grad_norm": 1.5974548511363529,
"learning_rate": 9.93034252097358e-06,
"loss": 0.3432,
"step": 268
},
{
"epoch": 0.08209980161757974,
"grad_norm": 1.8669593065073864,
"learning_rate": 9.929517681137594e-06,
"loss": 0.4133,
"step": 269
},
{
"epoch": 0.08240500534106517,
"grad_norm": 1.4895827642408586,
"learning_rate": 9.928688021049991e-06,
"loss": 0.3111,
"step": 270
},
{
"epoch": 0.0827102090645506,
"grad_norm": 1.4317804244871846,
"learning_rate": 9.927853541522041e-06,
"loss": 0.2915,
"step": 271
},
{
"epoch": 0.08301541278803601,
"grad_norm": 1.252478145781798,
"learning_rate": 9.927014243369727e-06,
"loss": 0.2794,
"step": 272
},
{
"epoch": 0.08332061651152144,
"grad_norm": 1.6973954865497314,
"learning_rate": 9.926170127413743e-06,
"loss": 0.6183,
"step": 273
},
{
"epoch": 0.08362582023500686,
"grad_norm": 1.4723277244112698,
"learning_rate": 9.925321194479494e-06,
"loss": 0.2815,
"step": 274
},
{
"epoch": 0.08393102395849229,
"grad_norm": 1.7075555550514414,
"learning_rate": 9.924467445397097e-06,
"loss": 0.4178,
"step": 275
},
{
"epoch": 0.08423622768197772,
"grad_norm": 1.5354808046910606,
"learning_rate": 9.923608881001377e-06,
"loss": 0.2355,
"step": 276
},
{
"epoch": 0.08454143140546315,
"grad_norm": 1.1795750747565834,
"learning_rate": 9.922745502131865e-06,
"loss": 0.3404,
"step": 277
},
{
"epoch": 0.08484663512894858,
"grad_norm": 1.427067758888222,
"learning_rate": 9.921877309632805e-06,
"loss": 0.3141,
"step": 278
},
{
"epoch": 0.085151838852434,
"grad_norm": 1.3691564278772157,
"learning_rate": 9.921004304353147e-06,
"loss": 0.287,
"step": 279
},
{
"epoch": 0.08545704257591942,
"grad_norm": 1.9220775714586407,
"learning_rate": 9.920126487146544e-06,
"loss": 0.6617,
"step": 280
},
{
"epoch": 0.08576224629940485,
"grad_norm": 1.6761030408371134,
"learning_rate": 9.919243858871355e-06,
"loss": 0.466,
"step": 281
},
{
"epoch": 0.08606745002289028,
"grad_norm": 1.6120747264173168,
"learning_rate": 9.918356420390645e-06,
"loss": 0.5351,
"step": 282
},
{
"epoch": 0.0863726537463757,
"grad_norm": 1.5236961732014556,
"learning_rate": 9.91746417257218e-06,
"loss": 0.33,
"step": 283
},
{
"epoch": 0.08667785746986113,
"grad_norm": 1.6328635321860312,
"learning_rate": 9.916567116288434e-06,
"loss": 0.4301,
"step": 284
},
{
"epoch": 0.08698306119334656,
"grad_norm": 1.4120804188821041,
"learning_rate": 9.915665252416577e-06,
"loss": 0.3025,
"step": 285
},
{
"epoch": 0.08728826491683199,
"grad_norm": 1.8410843798908767,
"learning_rate": 9.914758581838482e-06,
"loss": 0.5415,
"step": 286
},
{
"epoch": 0.08759346864031742,
"grad_norm": 1.1807475096034001,
"learning_rate": 9.913847105440725e-06,
"loss": 0.3184,
"step": 287
},
{
"epoch": 0.08789867236380283,
"grad_norm": 1.52681276111022,
"learning_rate": 9.912930824114577e-06,
"loss": 0.4266,
"step": 288
},
{
"epoch": 0.08820387608728826,
"grad_norm": 1.4904538614169496,
"learning_rate": 9.91200973875601e-06,
"loss": 0.3404,
"step": 289
},
{
"epoch": 0.08850907981077369,
"grad_norm": 1.7385111110311349,
"learning_rate": 9.911083850265692e-06,
"loss": 0.3371,
"step": 290
},
{
"epoch": 0.08881428353425912,
"grad_norm": 1.6013762575114376,
"learning_rate": 9.91015315954899e-06,
"loss": 0.4475,
"step": 291
},
{
"epoch": 0.08911948725774455,
"grad_norm": 1.5474202900018152,
"learning_rate": 9.909217667515964e-06,
"loss": 0.4162,
"step": 292
},
{
"epoch": 0.08942469098122997,
"grad_norm": 1.875769203080621,
"learning_rate": 9.908277375081371e-06,
"loss": 0.4446,
"step": 293
},
{
"epoch": 0.0897298947047154,
"grad_norm": 1.4914731218024286,
"learning_rate": 9.907332283164663e-06,
"loss": 0.4274,
"step": 294
},
{
"epoch": 0.09003509842820083,
"grad_norm": 1.6551811079983538,
"learning_rate": 9.90638239268998e-06,
"loss": 0.4883,
"step": 295
},
{
"epoch": 0.09034030215168624,
"grad_norm": 1.645510927644492,
"learning_rate": 9.905427704586158e-06,
"loss": 0.4885,
"step": 296
},
{
"epoch": 0.09064550587517167,
"grad_norm": 1.6759165462483547,
"learning_rate": 9.904468219786727e-06,
"loss": 0.3878,
"step": 297
},
{
"epoch": 0.0909507095986571,
"grad_norm": 1.596800484010474,
"learning_rate": 9.903503939229901e-06,
"loss": 0.2725,
"step": 298
},
{
"epoch": 0.09125591332214253,
"grad_norm": 1.4035704196730787,
"learning_rate": 9.902534863858588e-06,
"loss": 0.2147,
"step": 299
},
{
"epoch": 0.09156111704562796,
"grad_norm": 1.7460761357385464,
"learning_rate": 9.90156099462038e-06,
"loss": 0.3495,
"step": 300
},
{
"epoch": 0.09186632076911339,
"grad_norm": 1.3373562156184522,
"learning_rate": 9.900582332467566e-06,
"loss": 0.342,
"step": 301
},
{
"epoch": 0.09217152449259881,
"grad_norm": 1.1466755748188362,
"learning_rate": 9.89959887835711e-06,
"loss": 0.1737,
"step": 302
},
{
"epoch": 0.09247672821608424,
"grad_norm": 1.8078659273922337,
"learning_rate": 9.898610633250669e-06,
"loss": 0.3111,
"step": 303
},
{
"epoch": 0.09278193193956966,
"grad_norm": 1.5400638324339648,
"learning_rate": 9.897617598114584e-06,
"loss": 0.4746,
"step": 304
},
{
"epoch": 0.09308713566305508,
"grad_norm": 1.558728128630052,
"learning_rate": 9.896619773919878e-06,
"loss": 0.3085,
"step": 305
},
{
"epoch": 0.09339233938654051,
"grad_norm": 4.094736926672729,
"learning_rate": 9.895617161642257e-06,
"loss": 0.4664,
"step": 306
},
{
"epoch": 0.09369754311002594,
"grad_norm": 1.63116898024897,
"learning_rate": 9.89460976226211e-06,
"loss": 0.3878,
"step": 307
},
{
"epoch": 0.09400274683351137,
"grad_norm": 1.7238364123731507,
"learning_rate": 9.893597576764508e-06,
"loss": 0.2989,
"step": 308
},
{
"epoch": 0.0943079505569968,
"grad_norm": 1.2496662648050174,
"learning_rate": 9.8925806061392e-06,
"loss": 0.3054,
"step": 309
},
{
"epoch": 0.09461315428048223,
"grad_norm": 0.8807197003313585,
"learning_rate": 9.891558851380614e-06,
"loss": 0.1904,
"step": 310
},
{
"epoch": 0.09491835800396765,
"grad_norm": 1.5076918479598347,
"learning_rate": 9.890532313487858e-06,
"loss": 0.2679,
"step": 311
},
{
"epoch": 0.09522356172745307,
"grad_norm": 1.8465691043660122,
"learning_rate": 9.889500993464716e-06,
"loss": 0.5002,
"step": 312
},
{
"epoch": 0.0955287654509385,
"grad_norm": 1.9183643810942494,
"learning_rate": 9.888464892319647e-06,
"loss": 0.4869,
"step": 313
},
{
"epoch": 0.09583396917442392,
"grad_norm": 1.6515373264151805,
"learning_rate": 9.887424011065788e-06,
"loss": 0.4507,
"step": 314
},
{
"epoch": 0.09613917289790935,
"grad_norm": 1.6223391241834122,
"learning_rate": 9.886378350720945e-06,
"loss": 0.3445,
"step": 315
},
{
"epoch": 0.09644437662139478,
"grad_norm": 1.4416645097808285,
"learning_rate": 9.885327912307604e-06,
"loss": 0.2808,
"step": 316
},
{
"epoch": 0.09674958034488021,
"grad_norm": 1.4777192121308136,
"learning_rate": 9.88427269685292e-06,
"loss": 0.4335,
"step": 317
},
{
"epoch": 0.09705478406836564,
"grad_norm": 1.6934694740555867,
"learning_rate": 9.883212705388715e-06,
"loss": 0.4299,
"step": 318
},
{
"epoch": 0.09735998779185107,
"grad_norm": 1.9031284601590377,
"learning_rate": 9.882147938951489e-06,
"loss": 0.5364,
"step": 319
},
{
"epoch": 0.09766519151533648,
"grad_norm": 1.990035566558448,
"learning_rate": 9.881078398582406e-06,
"loss": 0.6476,
"step": 320
},
{
"epoch": 0.09797039523882191,
"grad_norm": 1.4458600630840748,
"learning_rate": 9.8800040853273e-06,
"loss": 0.268,
"step": 321
},
{
"epoch": 0.09827559896230734,
"grad_norm": 1.473557254783057,
"learning_rate": 9.878925000236667e-06,
"loss": 0.3889,
"step": 322
},
{
"epoch": 0.09858080268579276,
"grad_norm": 1.429462352597184,
"learning_rate": 9.877841144365681e-06,
"loss": 0.3348,
"step": 323
},
{
"epoch": 0.0988860064092782,
"grad_norm": 1.9126483909533352,
"learning_rate": 9.876752518774167e-06,
"loss": 0.5004,
"step": 324
},
{
"epoch": 0.09919121013276362,
"grad_norm": 1.528278815830415,
"learning_rate": 9.875659124526622e-06,
"loss": 0.1931,
"step": 325
},
{
"epoch": 0.09949641385624905,
"grad_norm": 1.6064809314060318,
"learning_rate": 9.874560962692207e-06,
"loss": 0.2627,
"step": 326
},
{
"epoch": 0.09980161757973448,
"grad_norm": 1.8583002911468363,
"learning_rate": 9.873458034344741e-06,
"loss": 0.4795,
"step": 327
},
{
"epoch": 0.1001068213032199,
"grad_norm": 2.180040993961252,
"learning_rate": 9.872350340562704e-06,
"loss": 0.3502,
"step": 328
},
{
"epoch": 0.10041202502670532,
"grad_norm": 2.2760944374886334,
"learning_rate": 9.871237882429237e-06,
"loss": 0.5504,
"step": 329
},
{
"epoch": 0.10071722875019075,
"grad_norm": 1.599604903553732,
"learning_rate": 9.87012066103214e-06,
"loss": 0.4642,
"step": 330
},
{
"epoch": 0.10102243247367618,
"grad_norm": 1.165592902920284,
"learning_rate": 9.868998677463874e-06,
"loss": 0.2118,
"step": 331
},
{
"epoch": 0.1013276361971616,
"grad_norm": 1.562687671834839,
"learning_rate": 9.867871932821549e-06,
"loss": 0.3389,
"step": 332
},
{
"epoch": 0.10163283992064703,
"grad_norm": 1.5690938291630006,
"learning_rate": 9.866740428206935e-06,
"loss": 0.3084,
"step": 333
},
{
"epoch": 0.10193804364413246,
"grad_norm": 1.6392469885959746,
"learning_rate": 9.865604164726456e-06,
"loss": 0.3935,
"step": 334
},
{
"epoch": 0.10224324736761789,
"grad_norm": 1.4807871775535164,
"learning_rate": 9.864463143491192e-06,
"loss": 0.4445,
"step": 335
},
{
"epoch": 0.10254845109110332,
"grad_norm": 1.1926217304533164,
"learning_rate": 9.86331736561687e-06,
"loss": 0.2623,
"step": 336
},
{
"epoch": 0.10285365481458873,
"grad_norm": 1.5461262133304665,
"learning_rate": 9.862166832223871e-06,
"loss": 0.4161,
"step": 337
},
{
"epoch": 0.10315885853807416,
"grad_norm": 1.5135662608063911,
"learning_rate": 9.861011544437226e-06,
"loss": 0.2864,
"step": 338
},
{
"epoch": 0.10346406226155959,
"grad_norm": 1.2723458882899108,
"learning_rate": 9.85985150338662e-06,
"loss": 0.3208,
"step": 339
},
{
"epoch": 0.10376926598504502,
"grad_norm": 1.5555768560283307,
"learning_rate": 9.858686710206373e-06,
"loss": 0.3341,
"step": 340
},
{
"epoch": 0.10407446970853045,
"grad_norm": 1.7562240544367693,
"learning_rate": 9.857517166035466e-06,
"loss": 0.4637,
"step": 341
},
{
"epoch": 0.10437967343201587,
"grad_norm": 3.9759170817857044,
"learning_rate": 9.856342872017515e-06,
"loss": 0.6559,
"step": 342
},
{
"epoch": 0.1046848771555013,
"grad_norm": 1.6864581247305628,
"learning_rate": 9.855163829300789e-06,
"loss": 0.4889,
"step": 343
},
{
"epoch": 0.10499008087898673,
"grad_norm": 1.5660173118774432,
"learning_rate": 9.853980039038193e-06,
"loss": 0.2512,
"step": 344
},
{
"epoch": 0.10529528460247214,
"grad_norm": 1.747919617181712,
"learning_rate": 9.85279150238728e-06,
"loss": 0.4796,
"step": 345
},
{
"epoch": 0.10560048832595757,
"grad_norm": 2.0064830609007496,
"learning_rate": 9.85159822051024e-06,
"loss": 0.6876,
"step": 346
},
{
"epoch": 0.105905692049443,
"grad_norm": 3.232705832740987,
"learning_rate": 9.850400194573908e-06,
"loss": 0.5043,
"step": 347
},
{
"epoch": 0.10621089577292843,
"grad_norm": 1.6750365694523466,
"learning_rate": 9.849197425749753e-06,
"loss": 0.4426,
"step": 348
},
{
"epoch": 0.10651609949641386,
"grad_norm": 1.2407353388150502,
"learning_rate": 9.847989915213883e-06,
"loss": 0.2728,
"step": 349
},
{
"epoch": 0.10682130321989929,
"grad_norm": 1.145770915094197,
"learning_rate": 9.846777664147046e-06,
"loss": 0.1828,
"step": 350
},
{
"epoch": 0.10712650694338471,
"grad_norm": 1.0259526919339048,
"learning_rate": 9.845560673734617e-06,
"loss": 0.1815,
"step": 351
},
{
"epoch": 0.10743171066687014,
"grad_norm": 1.5927868344920564,
"learning_rate": 9.844338945166619e-06,
"loss": 0.4563,
"step": 352
},
{
"epoch": 0.10773691439035556,
"grad_norm": 1.862511817843432,
"learning_rate": 9.843112479637692e-06,
"loss": 0.4873,
"step": 353
},
{
"epoch": 0.10804211811384098,
"grad_norm": 1.3346380863318066,
"learning_rate": 9.841881278347122e-06,
"loss": 0.344,
"step": 354
},
{
"epoch": 0.10834732183732641,
"grad_norm": 1.2558059799685006,
"learning_rate": 9.840645342498817e-06,
"loss": 0.3489,
"step": 355
},
{
"epoch": 0.10865252556081184,
"grad_norm": 1.734496310003891,
"learning_rate": 9.839404673301318e-06,
"loss": 0.4161,
"step": 356
},
{
"epoch": 0.10895772928429727,
"grad_norm": 1.5538063811233147,
"learning_rate": 9.838159271967795e-06,
"loss": 0.4971,
"step": 357
},
{
"epoch": 0.1092629330077827,
"grad_norm": 1.51967394631939,
"learning_rate": 9.836909139716044e-06,
"loss": 0.3422,
"step": 358
},
{
"epoch": 0.10956813673126813,
"grad_norm": 1.8153898943745974,
"learning_rate": 9.835654277768487e-06,
"loss": 0.5826,
"step": 359
},
{
"epoch": 0.10987334045475355,
"grad_norm": 1.3956832615282058,
"learning_rate": 9.834394687352168e-06,
"loss": 0.3444,
"step": 360
},
{
"epoch": 0.11017854417823897,
"grad_norm": 1.7028097158357791,
"learning_rate": 9.833130369698764e-06,
"loss": 0.4653,
"step": 361
},
{
"epoch": 0.1104837479017244,
"grad_norm": 1.984086744142622,
"learning_rate": 9.831861326044564e-06,
"loss": 0.7268,
"step": 362
},
{
"epoch": 0.11078895162520982,
"grad_norm": 1.6010962575202456,
"learning_rate": 9.830587557630481e-06,
"loss": 0.4979,
"step": 363
},
{
"epoch": 0.11109415534869525,
"grad_norm": 1.163967768763629,
"learning_rate": 9.829309065702054e-06,
"loss": 0.2721,
"step": 364
},
{
"epoch": 0.11139935907218068,
"grad_norm": 1.5116526665647354,
"learning_rate": 9.828025851509433e-06,
"loss": 0.4504,
"step": 365
},
{
"epoch": 0.11170456279566611,
"grad_norm": 1.3126613426949747,
"learning_rate": 9.82673791630739e-06,
"loss": 0.2915,
"step": 366
},
{
"epoch": 0.11200976651915154,
"grad_norm": 1.2919896962667308,
"learning_rate": 9.825445261355313e-06,
"loss": 0.2862,
"step": 367
},
{
"epoch": 0.11231497024263697,
"grad_norm": 1.7465438774619029,
"learning_rate": 9.824147887917201e-06,
"loss": 0.5347,
"step": 368
},
{
"epoch": 0.11262017396612238,
"grad_norm": 1.5226131458552274,
"learning_rate": 9.822845797261676e-06,
"loss": 0.4764,
"step": 369
},
{
"epoch": 0.11292537768960781,
"grad_norm": 1.157991129368488,
"learning_rate": 9.82153899066196e-06,
"loss": 0.2002,
"step": 370
},
{
"epoch": 0.11323058141309324,
"grad_norm": 0.9301400886155634,
"learning_rate": 9.8202274693959e-06,
"loss": 0.1684,
"step": 371
},
{
"epoch": 0.11353578513657867,
"grad_norm": 1.502487900950231,
"learning_rate": 9.818911234745942e-06,
"loss": 0.3093,
"step": 372
},
{
"epoch": 0.1138409888600641,
"grad_norm": 1.9784385010881411,
"learning_rate": 9.817590287999149e-06,
"loss": 0.4803,
"step": 373
},
{
"epoch": 0.11414619258354952,
"grad_norm": 0.9195545540370847,
"learning_rate": 9.816264630447186e-06,
"loss": 0.2159,
"step": 374
},
{
"epoch": 0.11445139630703495,
"grad_norm": 1.9332444000616873,
"learning_rate": 9.814934263386324e-06,
"loss": 0.704,
"step": 375
},
{
"epoch": 0.11475660003052038,
"grad_norm": 1.2982271338494382,
"learning_rate": 9.813599188117447e-06,
"loss": 0.2579,
"step": 376
},
{
"epoch": 0.11506180375400579,
"grad_norm": 1.4400207145843447,
"learning_rate": 9.812259405946033e-06,
"loss": 0.2581,
"step": 377
},
{
"epoch": 0.11536700747749122,
"grad_norm": 1.6214016909138647,
"learning_rate": 9.810914918182168e-06,
"loss": 0.3348,
"step": 378
},
{
"epoch": 0.11567221120097665,
"grad_norm": 1.3037284021243603,
"learning_rate": 9.80956572614054e-06,
"loss": 0.2009,
"step": 379
},
{
"epoch": 0.11597741492446208,
"grad_norm": 1.4597999101205241,
"learning_rate": 9.808211831140434e-06,
"loss": 0.4417,
"step": 380
},
{
"epoch": 0.1162826186479475,
"grad_norm": 1.3133108268461724,
"learning_rate": 9.806853234505736e-06,
"loss": 0.3305,
"step": 381
},
{
"epoch": 0.11658782237143293,
"grad_norm": 1.455145466929522,
"learning_rate": 9.805489937564926e-06,
"loss": 0.4611,
"step": 382
},
{
"epoch": 0.11689302609491836,
"grad_norm": 1.3560358010182432,
"learning_rate": 9.804121941651085e-06,
"loss": 0.2624,
"step": 383
},
{
"epoch": 0.11719822981840379,
"grad_norm": 1.5730489544680661,
"learning_rate": 9.802749248101885e-06,
"loss": 0.5959,
"step": 384
},
{
"epoch": 0.1175034335418892,
"grad_norm": 1.6080011966995038,
"learning_rate": 9.801371858259594e-06,
"loss": 0.3077,
"step": 385
},
{
"epoch": 0.11780863726537463,
"grad_norm": 1.3452073998773173,
"learning_rate": 9.799989773471071e-06,
"loss": 0.3877,
"step": 386
},
{
"epoch": 0.11811384098886006,
"grad_norm": 1.521531541360139,
"learning_rate": 9.798602995087764e-06,
"loss": 0.2978,
"step": 387
},
{
"epoch": 0.11841904471234549,
"grad_norm": 1.705542362457564,
"learning_rate": 9.797211524465715e-06,
"loss": 0.4298,
"step": 388
},
{
"epoch": 0.11872424843583092,
"grad_norm": 2.110762528312395,
"learning_rate": 9.79581536296555e-06,
"loss": 0.4766,
"step": 389
},
{
"epoch": 0.11902945215931635,
"grad_norm": 1.8442245131411212,
"learning_rate": 9.794414511952484e-06,
"loss": 0.2988,
"step": 390
},
{
"epoch": 0.11933465588280177,
"grad_norm": 1.2950053698748747,
"learning_rate": 9.793008972796318e-06,
"loss": 0.2907,
"step": 391
},
{
"epoch": 0.1196398596062872,
"grad_norm": 1.1692504994324242,
"learning_rate": 9.791598746871438e-06,
"loss": 0.2584,
"step": 392
},
{
"epoch": 0.11994506332977262,
"grad_norm": 1.7847769770487698,
"learning_rate": 9.790183835556806e-06,
"loss": 0.4874,
"step": 393
},
{
"epoch": 0.12025026705325804,
"grad_norm": 1.2562325547558533,
"learning_rate": 9.788764240235976e-06,
"loss": 0.3739,
"step": 394
},
{
"epoch": 0.12055547077674347,
"grad_norm": 1.340465345857484,
"learning_rate": 9.787339962297076e-06,
"loss": 0.1911,
"step": 395
},
{
"epoch": 0.1208606745002289,
"grad_norm": 1.5667496682899116,
"learning_rate": 9.785911003132811e-06,
"loss": 0.4243,
"step": 396
},
{
"epoch": 0.12116587822371433,
"grad_norm": 1.81118055527733,
"learning_rate": 9.78447736414047e-06,
"loss": 0.5246,
"step": 397
},
{
"epoch": 0.12147108194719976,
"grad_norm": 1.4053388398772566,
"learning_rate": 9.783039046721912e-06,
"loss": 0.2964,
"step": 398
},
{
"epoch": 0.12177628567068519,
"grad_norm": 1.5253960954907508,
"learning_rate": 9.781596052283573e-06,
"loss": 0.3939,
"step": 399
},
{
"epoch": 0.12208148939417061,
"grad_norm": 1.4250712259795966,
"learning_rate": 9.780148382236465e-06,
"loss": 0.2338,
"step": 400
},
{
"epoch": 0.12238669311765603,
"grad_norm": 1.6445234212439845,
"learning_rate": 9.778696037996167e-06,
"loss": 0.6218,
"step": 401
},
{
"epoch": 0.12269189684114146,
"grad_norm": 1.625655408880631,
"learning_rate": 9.777239020982834e-06,
"loss": 0.4565,
"step": 402
},
{
"epoch": 0.12299710056462689,
"grad_norm": 1.2734470484088918,
"learning_rate": 9.775777332621184e-06,
"loss": 0.3673,
"step": 403
},
{
"epoch": 0.12330230428811231,
"grad_norm": 1.5775928166525761,
"learning_rate": 9.774310974340506e-06,
"loss": 0.3673,
"step": 404
},
{
"epoch": 0.12360750801159774,
"grad_norm": 1.2252627733176171,
"learning_rate": 9.772839947574658e-06,
"loss": 0.4064,
"step": 405
},
{
"epoch": 0.12391271173508317,
"grad_norm": 1.7189364245488037,
"learning_rate": 9.77136425376206e-06,
"loss": 0.4633,
"step": 406
},
{
"epoch": 0.1242179154585686,
"grad_norm": 1.7558994988767551,
"learning_rate": 9.769883894345693e-06,
"loss": 0.5655,
"step": 407
},
{
"epoch": 0.12452311918205403,
"grad_norm": 1.4802133948901528,
"learning_rate": 9.768398870773108e-06,
"loss": 0.4175,
"step": 408
},
{
"epoch": 0.12482832290553945,
"grad_norm": 1.3302751078782091,
"learning_rate": 9.766909184496408e-06,
"loss": 0.3468,
"step": 409
},
{
"epoch": 0.12513352662902488,
"grad_norm": 1.704721594870751,
"learning_rate": 9.765414836972262e-06,
"loss": 0.4898,
"step": 410
},
{
"epoch": 0.1254387303525103,
"grad_norm": 1.2131375907547226,
"learning_rate": 9.763915829661891e-06,
"loss": 0.3886,
"step": 411
},
{
"epoch": 0.12574393407599574,
"grad_norm": 1.3401723742285034,
"learning_rate": 9.76241216403108e-06,
"loss": 0.4102,
"step": 412
},
{
"epoch": 0.12604913779948115,
"grad_norm": 1.6513004120286505,
"learning_rate": 9.760903841550164e-06,
"loss": 0.433,
"step": 413
},
{
"epoch": 0.12635434152296657,
"grad_norm": 2.2390646637016327,
"learning_rate": 9.75939086369403e-06,
"loss": 0.4213,
"step": 414
},
{
"epoch": 0.126659545246452,
"grad_norm": 1.585281461881457,
"learning_rate": 9.757873231942122e-06,
"loss": 0.2145,
"step": 415
},
{
"epoch": 0.12696474896993742,
"grad_norm": 1.616278056338578,
"learning_rate": 9.756350947778431e-06,
"loss": 0.3786,
"step": 416
},
{
"epoch": 0.12726995269342287,
"grad_norm": 1.3896314522086528,
"learning_rate": 9.754824012691499e-06,
"loss": 0.3886,
"step": 417
},
{
"epoch": 0.12757515641690828,
"grad_norm": 1.5388381717065547,
"learning_rate": 9.753292428174416e-06,
"loss": 0.4901,
"step": 418
},
{
"epoch": 0.12788036014039372,
"grad_norm": 1.2099556172325527,
"learning_rate": 9.75175619572482e-06,
"loss": 0.2379,
"step": 419
},
{
"epoch": 0.12818556386387914,
"grad_norm": 2.578673148403812,
"learning_rate": 9.750215316844886e-06,
"loss": 0.3093,
"step": 420
},
{
"epoch": 0.12849076758736458,
"grad_norm": 1.3557950817043143,
"learning_rate": 9.748669793041345e-06,
"loss": 0.2901,
"step": 421
},
{
"epoch": 0.12879597131085,
"grad_norm": 1.8058846206435177,
"learning_rate": 9.747119625825459e-06,
"loss": 0.5222,
"step": 422
},
{
"epoch": 0.1291011750343354,
"grad_norm": 1.449090093366816,
"learning_rate": 9.745564816713034e-06,
"loss": 0.3738,
"step": 423
},
{
"epoch": 0.12940637875782085,
"grad_norm": 1.3181509799847857,
"learning_rate": 9.74400536722442e-06,
"loss": 0.2624,
"step": 424
},
{
"epoch": 0.12971158248130626,
"grad_norm": 2.1599699357377675,
"learning_rate": 9.742441278884496e-06,
"loss": 0.4838,
"step": 425
},
{
"epoch": 0.1300167862047917,
"grad_norm": 1.6258096878519581,
"learning_rate": 9.740872553222685e-06,
"loss": 0.4999,
"step": 426
},
{
"epoch": 0.13032198992827712,
"grad_norm": 1.5046387312958875,
"learning_rate": 9.739299191772937e-06,
"loss": 0.3095,
"step": 427
},
{
"epoch": 0.13062719365176256,
"grad_norm": 3.2565177704876653,
"learning_rate": 9.737721196073742e-06,
"loss": 0.4886,
"step": 428
},
{
"epoch": 0.13093239737524798,
"grad_norm": 5.072257540244327,
"learning_rate": 9.736138567668117e-06,
"loss": 0.2433,
"step": 429
},
{
"epoch": 0.1312376010987334,
"grad_norm": 1.5984597272623309,
"learning_rate": 9.734551308103607e-06,
"loss": 0.3274,
"step": 430
},
{
"epoch": 0.13154280482221883,
"grad_norm": 1.9766310728941525,
"learning_rate": 9.732959418932297e-06,
"loss": 0.5601,
"step": 431
},
{
"epoch": 0.13184800854570425,
"grad_norm": 1.8500535183975242,
"learning_rate": 9.731362901710783e-06,
"loss": 0.8055,
"step": 432
},
{
"epoch": 0.1321532122691897,
"grad_norm": 1.5082490121163843,
"learning_rate": 9.7297617580002e-06,
"loss": 0.3715,
"step": 433
},
{
"epoch": 0.1324584159926751,
"grad_norm": 1.3484742708817405,
"learning_rate": 9.728155989366198e-06,
"loss": 0.2316,
"step": 434
},
{
"epoch": 0.13276361971616055,
"grad_norm": 1.4064281774599339,
"learning_rate": 9.726545597378953e-06,
"loss": 0.2529,
"step": 435
},
{
"epoch": 0.13306882343964596,
"grad_norm": 1.3619428913058238,
"learning_rate": 9.724930583613164e-06,
"loss": 0.2675,
"step": 436
},
{
"epoch": 0.1333740271631314,
"grad_norm": 1.8072201350967787,
"learning_rate": 9.723310949648044e-06,
"loss": 0.4832,
"step": 437
},
{
"epoch": 0.13367923088661682,
"grad_norm": 1.376837369774298,
"learning_rate": 9.721686697067328e-06,
"loss": 0.3855,
"step": 438
},
{
"epoch": 0.13398443461010223,
"grad_norm": 1.577084491012061,
"learning_rate": 9.720057827459264e-06,
"loss": 0.3307,
"step": 439
},
{
"epoch": 0.13428963833358767,
"grad_norm": 1.7335930513549882,
"learning_rate": 9.718424342416619e-06,
"loss": 0.3794,
"step": 440
},
{
"epoch": 0.1345948420570731,
"grad_norm": 1.3979759520207362,
"learning_rate": 9.716786243536672e-06,
"loss": 0.17,
"step": 441
},
{
"epoch": 0.13490004578055853,
"grad_norm": 1.5166020341173607,
"learning_rate": 9.715143532421208e-06,
"loss": 0.2782,
"step": 442
},
{
"epoch": 0.13520524950404395,
"grad_norm": 1.5547284451560954,
"learning_rate": 9.71349621067653e-06,
"loss": 0.2398,
"step": 443
},
{
"epoch": 0.1355104532275294,
"grad_norm": 1.4706416022210265,
"learning_rate": 9.711844279913443e-06,
"loss": 0.3486,
"step": 444
},
{
"epoch": 0.1358156569510148,
"grad_norm": 1.7726813901729983,
"learning_rate": 9.710187741747264e-06,
"loss": 0.4938,
"step": 445
},
{
"epoch": 0.13612086067450022,
"grad_norm": 1.3682846898358798,
"learning_rate": 9.708526597797812e-06,
"loss": 0.3399,
"step": 446
},
{
"epoch": 0.13642606439798566,
"grad_norm": 1.173206742476641,
"learning_rate": 9.70686084968941e-06,
"loss": 0.209,
"step": 447
},
{
"epoch": 0.13673126812147107,
"grad_norm": 1.5006667880365117,
"learning_rate": 9.705190499050885e-06,
"loss": 0.3918,
"step": 448
},
{
"epoch": 0.13703647184495651,
"grad_norm": 1.4081281623516984,
"learning_rate": 9.70351554751556e-06,
"loss": 0.3478,
"step": 449
},
{
"epoch": 0.13734167556844193,
"grad_norm": 1.3521857616183288,
"learning_rate": 9.701835996721267e-06,
"loss": 0.2527,
"step": 450
},
{
"epoch": 0.13764687929192737,
"grad_norm": 1.609701105059192,
"learning_rate": 9.70015184831032e-06,
"loss": 0.4019,
"step": 451
},
{
"epoch": 0.13795208301541279,
"grad_norm": 1.5247656094886828,
"learning_rate": 9.698463103929542e-06,
"loss": 0.3393,
"step": 452
},
{
"epoch": 0.13825728673889823,
"grad_norm": 1.7521263555747508,
"learning_rate": 9.696769765230244e-06,
"loss": 0.4711,
"step": 453
},
{
"epoch": 0.13856249046238364,
"grad_norm": 1.1750952661731386,
"learning_rate": 9.695071833868233e-06,
"loss": 0.3209,
"step": 454
},
{
"epoch": 0.13886769418586906,
"grad_norm": 1.1097718724387464,
"learning_rate": 9.693369311503801e-06,
"loss": 0.2077,
"step": 455
},
{
"epoch": 0.1391728979093545,
"grad_norm": 1.3477077308441543,
"learning_rate": 9.691662199801735e-06,
"loss": 0.3387,
"step": 456
},
{
"epoch": 0.1394781016328399,
"grad_norm": 1.5926509134118427,
"learning_rate": 9.689950500431306e-06,
"loss": 0.4737,
"step": 457
},
{
"epoch": 0.13978330535632535,
"grad_norm": 1.238821976588628,
"learning_rate": 9.688234215066274e-06,
"loss": 0.2941,
"step": 458
},
{
"epoch": 0.14008850907981077,
"grad_norm": 1.283103486116252,
"learning_rate": 9.68651334538488e-06,
"loss": 0.2298,
"step": 459
},
{
"epoch": 0.1403937128032962,
"grad_norm": 1.3964480095243228,
"learning_rate": 9.684787893069852e-06,
"loss": 0.2755,
"step": 460
},
{
"epoch": 0.14069891652678163,
"grad_norm": 4.5768637107742185,
"learning_rate": 9.683057859808394e-06,
"loss": 0.3969,
"step": 461
},
{
"epoch": 0.14100412025026704,
"grad_norm": 1.238974584705195,
"learning_rate": 9.681323247292193e-06,
"loss": 0.302,
"step": 462
},
{
"epoch": 0.14130932397375248,
"grad_norm": 1.484367852444757,
"learning_rate": 9.679584057217412e-06,
"loss": 0.3771,
"step": 463
},
{
"epoch": 0.1416145276972379,
"grad_norm": 1.8892302609803566,
"learning_rate": 9.677840291284693e-06,
"loss": 0.4296,
"step": 464
},
{
"epoch": 0.14191973142072334,
"grad_norm": 1.9264735588835349,
"learning_rate": 9.676091951199147e-06,
"loss": 0.1789,
"step": 465
},
{
"epoch": 0.14222493514420875,
"grad_norm": 1.6779527799580054,
"learning_rate": 9.674339038670362e-06,
"loss": 0.3394,
"step": 466
},
{
"epoch": 0.1425301388676942,
"grad_norm": 1.4269803792009585,
"learning_rate": 9.672581555412396e-06,
"loss": 0.3436,
"step": 467
},
{
"epoch": 0.1428353425911796,
"grad_norm": 1.6997561953129157,
"learning_rate": 9.67081950314378e-06,
"loss": 0.4244,
"step": 468
},
{
"epoch": 0.14314054631466505,
"grad_norm": 1.5847658235861504,
"learning_rate": 9.669052883587503e-06,
"loss": 0.4061,
"step": 469
},
{
"epoch": 0.14344575003815047,
"grad_norm": 1.5000132719088555,
"learning_rate": 9.667281698471032e-06,
"loss": 0.2349,
"step": 470
},
{
"epoch": 0.14375095376163588,
"grad_norm": 1.4700398564287758,
"learning_rate": 9.665505949526288e-06,
"loss": 0.4265,
"step": 471
},
{
"epoch": 0.14405615748512132,
"grad_norm": 1.1929108671224367,
"learning_rate": 9.663725638489662e-06,
"loss": 0.3107,
"step": 472
},
{
"epoch": 0.14436136120860674,
"grad_norm": 1.1577689014622543,
"learning_rate": 9.661940767102001e-06,
"loss": 0.1722,
"step": 473
},
{
"epoch": 0.14466656493209218,
"grad_norm": 2.093772204836488,
"learning_rate": 9.660151337108617e-06,
"loss": 0.4278,
"step": 474
},
{
"epoch": 0.1449717686555776,
"grad_norm": 1.6654665758895533,
"learning_rate": 9.658357350259274e-06,
"loss": 0.3394,
"step": 475
},
{
"epoch": 0.14527697237906304,
"grad_norm": 1.2202451289258887,
"learning_rate": 9.656558808308193e-06,
"loss": 0.3433,
"step": 476
},
{
"epoch": 0.14558217610254845,
"grad_norm": 1.5418473816537024,
"learning_rate": 9.654755713014052e-06,
"loss": 0.4099,
"step": 477
},
{
"epoch": 0.14588737982603386,
"grad_norm": 1.867172943441559,
"learning_rate": 9.652948066139978e-06,
"loss": 0.4445,
"step": 478
},
{
"epoch": 0.1461925835495193,
"grad_norm": 1.2274320399473075,
"learning_rate": 9.651135869453552e-06,
"loss": 0.3687,
"step": 479
},
{
"epoch": 0.14649778727300472,
"grad_norm": 1.3632570280967384,
"learning_rate": 9.649319124726799e-06,
"loss": 0.3554,
"step": 480
},
{
"epoch": 0.14680299099649016,
"grad_norm": 1.9368328668689925,
"learning_rate": 9.647497833736197e-06,
"loss": 0.433,
"step": 481
},
{
"epoch": 0.14710819471997558,
"grad_norm": 1.7750440538339176,
"learning_rate": 9.645671998262668e-06,
"loss": 0.4275,
"step": 482
},
{
"epoch": 0.14741339844346102,
"grad_norm": 1.7620981827052555,
"learning_rate": 9.643841620091572e-06,
"loss": 0.4327,
"step": 483
},
{
"epoch": 0.14771860216694643,
"grad_norm": 1.705808013569081,
"learning_rate": 9.642006701012719e-06,
"loss": 0.2914,
"step": 484
},
{
"epoch": 0.14802380589043188,
"grad_norm": 1.4490666476731855,
"learning_rate": 9.640167242820356e-06,
"loss": 0.4171,
"step": 485
},
{
"epoch": 0.1483290096139173,
"grad_norm": 1.4017898188106575,
"learning_rate": 9.638323247313167e-06,
"loss": 0.3609,
"step": 486
},
{
"epoch": 0.1486342133374027,
"grad_norm": 1.273585773097182,
"learning_rate": 9.636474716294275e-06,
"loss": 0.2781,
"step": 487
},
{
"epoch": 0.14893941706088815,
"grad_norm": 1.4426438722313946,
"learning_rate": 9.634621651571235e-06,
"loss": 0.3446,
"step": 488
},
{
"epoch": 0.14924462078437356,
"grad_norm": 1.3396568501630033,
"learning_rate": 9.632764054956042e-06,
"loss": 0.3209,
"step": 489
},
{
"epoch": 0.149549824507859,
"grad_norm": 1.869987463373602,
"learning_rate": 9.630901928265113e-06,
"loss": 0.6676,
"step": 490
},
{
"epoch": 0.14985502823134442,
"grad_norm": 1.3995020559881168,
"learning_rate": 9.6290352733193e-06,
"loss": 0.443,
"step": 491
},
{
"epoch": 0.15016023195482986,
"grad_norm": 1.5646801044524754,
"learning_rate": 9.627164091943886e-06,
"loss": 0.4808,
"step": 492
},
{
"epoch": 0.15046543567831527,
"grad_norm": 1.4064811516565765,
"learning_rate": 9.625288385968572e-06,
"loss": 0.3281,
"step": 493
},
{
"epoch": 0.15077063940180072,
"grad_norm": 1.3603824959747954,
"learning_rate": 9.623408157227493e-06,
"loss": 0.4203,
"step": 494
},
{
"epoch": 0.15107584312528613,
"grad_norm": 1.4648922518012057,
"learning_rate": 9.621523407559193e-06,
"loss": 0.4691,
"step": 495
},
{
"epoch": 0.15138104684877154,
"grad_norm": 1.2898880598011702,
"learning_rate": 9.619634138806653e-06,
"loss": 0.22,
"step": 496
},
{
"epoch": 0.151686250572257,
"grad_norm": 1.629312149082348,
"learning_rate": 9.61774035281726e-06,
"loss": 0.3852,
"step": 497
},
{
"epoch": 0.1519914542957424,
"grad_norm": 1.4209940770003642,
"learning_rate": 9.615842051442825e-06,
"loss": 0.3434,
"step": 498
},
{
"epoch": 0.15229665801922784,
"grad_norm": 1.5981471419786573,
"learning_rate": 9.613939236539571e-06,
"loss": 0.4405,
"step": 499
},
{
"epoch": 0.15260186174271326,
"grad_norm": 1.6345273007096384,
"learning_rate": 9.612031909968138e-06,
"loss": 0.4621,
"step": 500
},
{
"epoch": 0.1529070654661987,
"grad_norm": 1.7946527422515466,
"learning_rate": 9.610120073593574e-06,
"loss": 0.4215,
"step": 501
},
{
"epoch": 0.1532122691896841,
"grad_norm": 1.9696007964079152,
"learning_rate": 9.608203729285337e-06,
"loss": 0.2416,
"step": 502
},
{
"epoch": 0.15351747291316953,
"grad_norm": 1.186483550480542,
"learning_rate": 9.606282878917296e-06,
"loss": 0.1656,
"step": 503
},
{
"epoch": 0.15382267663665497,
"grad_norm": 1.3709781521921298,
"learning_rate": 9.604357524367723e-06,
"loss": 0.3374,
"step": 504
},
{
"epoch": 0.15412788036014038,
"grad_norm": 1.4744363645402312,
"learning_rate": 9.602427667519297e-06,
"loss": 0.3472,
"step": 505
},
{
"epoch": 0.15443308408362583,
"grad_norm": 1.7032963377834875,
"learning_rate": 9.600493310259098e-06,
"loss": 0.4352,
"step": 506
},
{
"epoch": 0.15473828780711124,
"grad_norm": 1.487020684601837,
"learning_rate": 9.598554454478608e-06,
"loss": 0.2661,
"step": 507
},
{
"epoch": 0.15504349153059668,
"grad_norm": 1.2499312381905126,
"learning_rate": 9.596611102073703e-06,
"loss": 0.2785,
"step": 508
},
{
"epoch": 0.1553486952540821,
"grad_norm": 1.529878897767237,
"learning_rate": 9.594663254944664e-06,
"loss": 0.3768,
"step": 509
},
{
"epoch": 0.15565389897756754,
"grad_norm": 1.5214931502474798,
"learning_rate": 9.592710914996162e-06,
"loss": 0.5126,
"step": 510
},
{
"epoch": 0.15595910270105295,
"grad_norm": 1.3836939529329817,
"learning_rate": 9.590754084137259e-06,
"loss": 0.3011,
"step": 511
},
{
"epoch": 0.15626430642453837,
"grad_norm": 1.4833094737816435,
"learning_rate": 9.588792764281413e-06,
"loss": 0.4761,
"step": 512
},
{
"epoch": 0.1565695101480238,
"grad_norm": 1.2346664760598378,
"learning_rate": 9.586826957346473e-06,
"loss": 0.2454,
"step": 513
},
{
"epoch": 0.15687471387150922,
"grad_norm": 1.6476636760719772,
"learning_rate": 9.584856665254667e-06,
"loss": 0.2867,
"step": 514
},
{
"epoch": 0.15717991759499467,
"grad_norm": 1.5637210652382973,
"learning_rate": 9.58288188993262e-06,
"loss": 0.2899,
"step": 515
},
{
"epoch": 0.15748512131848008,
"grad_norm": 1.3205463270086828,
"learning_rate": 9.580902633311331e-06,
"loss": 0.3756,
"step": 516
},
{
"epoch": 0.15779032504196552,
"grad_norm": 1.3975127161911243,
"learning_rate": 9.578918897326186e-06,
"loss": 0.5111,
"step": 517
},
{
"epoch": 0.15809552876545094,
"grad_norm": 1.4459383872321914,
"learning_rate": 9.57693068391695e-06,
"loss": 0.4283,
"step": 518
},
{
"epoch": 0.15840073248893635,
"grad_norm": 1.7061785001760192,
"learning_rate": 9.574937995027767e-06,
"loss": 0.3702,
"step": 519
},
{
"epoch": 0.1587059362124218,
"grad_norm": 1.822247438656905,
"learning_rate": 9.572940832607157e-06,
"loss": 0.3406,
"step": 520
},
{
"epoch": 0.1590111399359072,
"grad_norm": 2.44932543751886,
"learning_rate": 9.570939198608013e-06,
"loss": 0.3118,
"step": 521
},
{
"epoch": 0.15931634365939265,
"grad_norm": 1.6119202421775476,
"learning_rate": 9.5689330949876e-06,
"loss": 0.3903,
"step": 522
},
{
"epoch": 0.15962154738287807,
"grad_norm": 1.4848330945324375,
"learning_rate": 9.56692252370756e-06,
"loss": 0.2336,
"step": 523
},
{
"epoch": 0.1599267511063635,
"grad_norm": 1.410632362194396,
"learning_rate": 9.564907486733893e-06,
"loss": 0.2749,
"step": 524
},
{
"epoch": 0.16023195482984892,
"grad_norm": 1.596052638125191,
"learning_rate": 9.562887986036975e-06,
"loss": 0.4752,
"step": 525
},
{
"epoch": 0.16053715855333436,
"grad_norm": 1.7668740909494465,
"learning_rate": 9.560864023591541e-06,
"loss": 0.4457,
"step": 526
},
{
"epoch": 0.16084236227681978,
"grad_norm": 1.4009268145182425,
"learning_rate": 9.558835601376692e-06,
"loss": 0.2615,
"step": 527
},
{
"epoch": 0.1611475660003052,
"grad_norm": 1.7299333351168085,
"learning_rate": 9.55680272137589e-06,
"loss": 0.5216,
"step": 528
},
{
"epoch": 0.16145276972379063,
"grad_norm": 1.398003196407042,
"learning_rate": 9.554765385576951e-06,
"loss": 0.2917,
"step": 529
},
{
"epoch": 0.16175797344727605,
"grad_norm": 1.4037115710357768,
"learning_rate": 9.552723595972055e-06,
"loss": 0.2794,
"step": 530
},
{
"epoch": 0.1620631771707615,
"grad_norm": 1.4104804936912443,
"learning_rate": 9.550677354557734e-06,
"loss": 0.3294,
"step": 531
},
{
"epoch": 0.1623683808942469,
"grad_norm": 1.3043707731550427,
"learning_rate": 9.548626663334872e-06,
"loss": 0.3542,
"step": 532
},
{
"epoch": 0.16267358461773235,
"grad_norm": 1.4523817232860987,
"learning_rate": 9.546571524308707e-06,
"loss": 0.4394,
"step": 533
},
{
"epoch": 0.16297878834121776,
"grad_norm": 1.2378417959119585,
"learning_rate": 9.544511939488823e-06,
"loss": 0.2859,
"step": 534
},
{
"epoch": 0.16328399206470318,
"grad_norm": 1.1623856416463947,
"learning_rate": 9.542447910889152e-06,
"loss": 0.2682,
"step": 535
},
{
"epoch": 0.16358919578818862,
"grad_norm": 1.503292443987416,
"learning_rate": 9.540379440527974e-06,
"loss": 0.4513,
"step": 536
},
{
"epoch": 0.16389439951167403,
"grad_norm": 1.278183220840744,
"learning_rate": 9.538306530427908e-06,
"loss": 0.2486,
"step": 537
},
{
"epoch": 0.16419960323515947,
"grad_norm": 1.477438530587252,
"learning_rate": 9.536229182615919e-06,
"loss": 0.4748,
"step": 538
},
{
"epoch": 0.1645048069586449,
"grad_norm": 1.161000468008389,
"learning_rate": 9.534147399123308e-06,
"loss": 0.3166,
"step": 539
},
{
"epoch": 0.16481001068213033,
"grad_norm": 1.3151690275104762,
"learning_rate": 9.532061181985713e-06,
"loss": 0.3547,
"step": 540
},
{
"epoch": 0.16511521440561575,
"grad_norm": 1.750297725419665,
"learning_rate": 9.529970533243112e-06,
"loss": 0.4156,
"step": 541
},
{
"epoch": 0.1654204181291012,
"grad_norm": 1.3465531852012238,
"learning_rate": 9.52787545493981e-06,
"loss": 0.3366,
"step": 542
},
{
"epoch": 0.1657256218525866,
"grad_norm": 1.5440141830188223,
"learning_rate": 9.525775949124447e-06,
"loss": 0.3376,
"step": 543
},
{
"epoch": 0.16603082557607202,
"grad_norm": 1.5415130315298482,
"learning_rate": 9.523672017849995e-06,
"loss": 0.541,
"step": 544
},
{
"epoch": 0.16633602929955746,
"grad_norm": 1.7579856956776627,
"learning_rate": 9.521563663173746e-06,
"loss": 0.4806,
"step": 545
},
{
"epoch": 0.16664123302304287,
"grad_norm": 1.7818866729621587,
"learning_rate": 9.519450887157324e-06,
"loss": 0.5464,
"step": 546
},
{
"epoch": 0.16694643674652831,
"grad_norm": 1.6064088023016758,
"learning_rate": 9.517333691866672e-06,
"loss": 0.43,
"step": 547
},
{
"epoch": 0.16725164047001373,
"grad_norm": 1.4778043177300115,
"learning_rate": 9.515212079372059e-06,
"loss": 0.4399,
"step": 548
},
{
"epoch": 0.16755684419349917,
"grad_norm": 2.027608100840915,
"learning_rate": 9.513086051748069e-06,
"loss": 0.4069,
"step": 549
},
{
"epoch": 0.16786204791698459,
"grad_norm": 1.5850802204517391,
"learning_rate": 9.510955611073605e-06,
"loss": 0.3827,
"step": 550
},
{
"epoch": 0.16816725164047,
"grad_norm": 1.293889481401633,
"learning_rate": 9.508820759431883e-06,
"loss": 0.2572,
"step": 551
},
{
"epoch": 0.16847245536395544,
"grad_norm": 1.842230801320139,
"learning_rate": 9.506681498910437e-06,
"loss": 0.5275,
"step": 552
},
{
"epoch": 0.16877765908744086,
"grad_norm": 1.1873629641518748,
"learning_rate": 9.50453783160111e-06,
"loss": 0.3282,
"step": 553
},
{
"epoch": 0.1690828628109263,
"grad_norm": 1.6093453710106354,
"learning_rate": 9.50238975960005e-06,
"loss": 0.5784,
"step": 554
},
{
"epoch": 0.1693880665344117,
"grad_norm": 1.8900657646403543,
"learning_rate": 9.500237285007719e-06,
"loss": 0.5224,
"step": 555
},
{
"epoch": 0.16969327025789716,
"grad_norm": 1.4803576264157936,
"learning_rate": 9.498080409928878e-06,
"loss": 0.3726,
"step": 556
},
{
"epoch": 0.16999847398138257,
"grad_norm": 1.7675886459458987,
"learning_rate": 9.495919136472595e-06,
"loss": 0.4656,
"step": 557
},
{
"epoch": 0.170303677704868,
"grad_norm": 1.7910944298366285,
"learning_rate": 9.493753466752236e-06,
"loss": 0.8076,
"step": 558
},
{
"epoch": 0.17060888142835343,
"grad_norm": 1.4678825901279975,
"learning_rate": 9.49158340288547e-06,
"loss": 0.3575,
"step": 559
},
{
"epoch": 0.17091408515183884,
"grad_norm": 1.411596350121475,
"learning_rate": 9.489408946994256e-06,
"loss": 0.3316,
"step": 560
},
{
"epoch": 0.17121928887532428,
"grad_norm": 1.2961064094904746,
"learning_rate": 9.487230101204855e-06,
"loss": 0.3634,
"step": 561
},
{
"epoch": 0.1715244925988097,
"grad_norm": 1.2525457208629842,
"learning_rate": 9.485046867647816e-06,
"loss": 0.368,
"step": 562
},
{
"epoch": 0.17182969632229514,
"grad_norm": 1.5857071732762902,
"learning_rate": 9.48285924845798e-06,
"loss": 0.4546,
"step": 563
},
{
"epoch": 0.17213490004578055,
"grad_norm": 1.4510716523054648,
"learning_rate": 9.480667245774474e-06,
"loss": 0.2739,
"step": 564
},
{
"epoch": 0.172440103769266,
"grad_norm": 1.7890268513821783,
"learning_rate": 9.478470861740716e-06,
"loss": 0.4085,
"step": 565
},
{
"epoch": 0.1727453074927514,
"grad_norm": 1.2143129127325427,
"learning_rate": 9.476270098504405e-06,
"loss": 0.2669,
"step": 566
},
{
"epoch": 0.17305051121623685,
"grad_norm": 1.4725539275626134,
"learning_rate": 9.474064958217524e-06,
"loss": 0.3474,
"step": 567
},
{
"epoch": 0.17335571493972227,
"grad_norm": 1.6648347880329453,
"learning_rate": 9.471855443036333e-06,
"loss": 0.3059,
"step": 568
},
{
"epoch": 0.17366091866320768,
"grad_norm": 1.4826208089202084,
"learning_rate": 9.469641555121372e-06,
"loss": 0.3309,
"step": 569
},
{
"epoch": 0.17396612238669312,
"grad_norm": 1.2087588985869038,
"learning_rate": 9.467423296637458e-06,
"loss": 0.2765,
"step": 570
},
{
"epoch": 0.17427132611017854,
"grad_norm": 1.5271503296745377,
"learning_rate": 9.465200669753678e-06,
"loss": 0.4388,
"step": 571
},
{
"epoch": 0.17457652983366398,
"grad_norm": 1.720167996940521,
"learning_rate": 9.462973676643395e-06,
"loss": 0.4693,
"step": 572
},
{
"epoch": 0.1748817335571494,
"grad_norm": 1.4666930059033054,
"learning_rate": 9.46074231948424e-06,
"loss": 0.354,
"step": 573
},
{
"epoch": 0.17518693728063484,
"grad_norm": 1.8597430337784902,
"learning_rate": 9.458506600458106e-06,
"loss": 0.3892,
"step": 574
},
{
"epoch": 0.17549214100412025,
"grad_norm": 0.9696512337091734,
"learning_rate": 9.456266521751162e-06,
"loss": 0.2294,
"step": 575
},
{
"epoch": 0.17579734472760566,
"grad_norm": 1.781498807963985,
"learning_rate": 9.454022085553829e-06,
"loss": 0.4873,
"step": 576
},
{
"epoch": 0.1761025484510911,
"grad_norm": 1.3337716733106453,
"learning_rate": 9.451773294060797e-06,
"loss": 0.3031,
"step": 577
},
{
"epoch": 0.17640775217457652,
"grad_norm": 1.7055986970891146,
"learning_rate": 9.449520149471008e-06,
"loss": 0.6405,
"step": 578
},
{
"epoch": 0.17671295589806196,
"grad_norm": 1.5188407211523098,
"learning_rate": 9.447262653987668e-06,
"loss": 0.3739,
"step": 579
},
{
"epoch": 0.17701815962154738,
"grad_norm": 0.9308569789252137,
"learning_rate": 9.445000809818231e-06,
"loss": 0.2505,
"step": 580
},
{
"epoch": 0.17732336334503282,
"grad_norm": 1.5000700139554115,
"learning_rate": 9.442734619174408e-06,
"loss": 0.4158,
"step": 581
},
{
"epoch": 0.17762856706851823,
"grad_norm": 1.3548714588751447,
"learning_rate": 9.440464084272157e-06,
"loss": 0.3911,
"step": 582
},
{
"epoch": 0.17793377079200368,
"grad_norm": 1.3241166157002833,
"learning_rate": 9.438189207331684e-06,
"loss": 0.371,
"step": 583
},
{
"epoch": 0.1782389745154891,
"grad_norm": 1.251287417238196,
"learning_rate": 9.435909990577442e-06,
"loss": 0.4543,
"step": 584
},
{
"epoch": 0.1785441782389745,
"grad_norm": 1.6243898273186124,
"learning_rate": 9.433626436238128e-06,
"loss": 0.3955,
"step": 585
},
{
"epoch": 0.17884938196245995,
"grad_norm": 1.5933520250274826,
"learning_rate": 9.43133854654668e-06,
"loss": 0.4232,
"step": 586
},
{
"epoch": 0.17915458568594536,
"grad_norm": 1.271604048902552,
"learning_rate": 9.429046323740275e-06,
"loss": 0.186,
"step": 587
},
{
"epoch": 0.1794597894094308,
"grad_norm": 1.7480757897399084,
"learning_rate": 9.426749770060325e-06,
"loss": 0.3198,
"step": 588
},
{
"epoch": 0.17976499313291622,
"grad_norm": 1.6582908292655634,
"learning_rate": 9.424448887752485e-06,
"loss": 0.4497,
"step": 589
},
{
"epoch": 0.18007019685640166,
"grad_norm": 1.514808198729056,
"learning_rate": 9.42214367906663e-06,
"loss": 0.3135,
"step": 590
},
{
"epoch": 0.18037540057988707,
"grad_norm": 1.33735933544563,
"learning_rate": 9.419834146256875e-06,
"loss": 0.1512,
"step": 591
},
{
"epoch": 0.1806806043033725,
"grad_norm": 1.7983955915325747,
"learning_rate": 9.417520291581562e-06,
"loss": 0.397,
"step": 592
},
{
"epoch": 0.18098580802685793,
"grad_norm": 1.8357965942853254,
"learning_rate": 9.415202117303253e-06,
"loss": 0.3479,
"step": 593
},
{
"epoch": 0.18129101175034334,
"grad_norm": 2.1098502294891084,
"learning_rate": 9.412879625688742e-06,
"loss": 0.6081,
"step": 594
},
{
"epoch": 0.1815962154738288,
"grad_norm": 1.7002717361934219,
"learning_rate": 9.410552819009041e-06,
"loss": 0.2335,
"step": 595
},
{
"epoch": 0.1819014191973142,
"grad_norm": 1.6858392243118179,
"learning_rate": 9.408221699539381e-06,
"loss": 0.3502,
"step": 596
},
{
"epoch": 0.18220662292079964,
"grad_norm": 1.9779389304442994,
"learning_rate": 9.40588626955921e-06,
"loss": 0.5023,
"step": 597
},
{
"epoch": 0.18251182664428506,
"grad_norm": 1.984831269441273,
"learning_rate": 9.403546531352192e-06,
"loss": 0.1808,
"step": 598
},
{
"epoch": 0.1828170303677705,
"grad_norm": 1.7825668553552305,
"learning_rate": 9.401202487206205e-06,
"loss": 0.2451,
"step": 599
},
{
"epoch": 0.18312223409125591,
"grad_norm": 1.7395479880147604,
"learning_rate": 9.398854139413332e-06,
"loss": 0.4586,
"step": 600
},
{
"epoch": 0.18342743781474133,
"grad_norm": 1.7910877075998561,
"learning_rate": 9.396501490269871e-06,
"loss": 0.4334,
"step": 601
},
{
"epoch": 0.18373264153822677,
"grad_norm": 1.671153260357796,
"learning_rate": 9.394144542076321e-06,
"loss": 0.3457,
"step": 602
},
{
"epoch": 0.18403784526171219,
"grad_norm": 2.2927735747628057,
"learning_rate": 9.391783297137392e-06,
"loss": 0.5006,
"step": 603
},
{
"epoch": 0.18434304898519763,
"grad_norm": 1.4375155904173251,
"learning_rate": 9.389417757761983e-06,
"loss": 0.3412,
"step": 604
},
{
"epoch": 0.18464825270868304,
"grad_norm": 1.0876679937459988,
"learning_rate": 9.387047926263205e-06,
"loss": 0.2323,
"step": 605
},
{
"epoch": 0.18495345643216848,
"grad_norm": 1.5691166969698962,
"learning_rate": 9.384673804958357e-06,
"loss": 0.3929,
"step": 606
},
{
"epoch": 0.1852586601556539,
"grad_norm": 1.2508041656129554,
"learning_rate": 9.38229539616894e-06,
"loss": 0.2123,
"step": 607
},
{
"epoch": 0.1855638638791393,
"grad_norm": 1.6014009719897135,
"learning_rate": 9.379912702220641e-06,
"loss": 0.234,
"step": 608
},
{
"epoch": 0.18586906760262475,
"grad_norm": 1.4804611004553776,
"learning_rate": 9.377525725443341e-06,
"loss": 0.3951,
"step": 609
},
{
"epoch": 0.18617427132611017,
"grad_norm": 1.4589508747304376,
"learning_rate": 9.375134468171108e-06,
"loss": 0.2887,
"step": 610
},
{
"epoch": 0.1864794750495956,
"grad_norm": 1.9199770728371568,
"learning_rate": 9.372738932742193e-06,
"loss": 0.5627,
"step": 611
},
{
"epoch": 0.18678467877308103,
"grad_norm": 1.5903295576095668,
"learning_rate": 9.370339121499039e-06,
"loss": 0.4379,
"step": 612
},
{
"epoch": 0.18708988249656647,
"grad_norm": 1.6986262549100166,
"learning_rate": 9.367935036788257e-06,
"loss": 0.4873,
"step": 613
},
{
"epoch": 0.18739508622005188,
"grad_norm": 1.2194659257752518,
"learning_rate": 9.365526680960645e-06,
"loss": 0.3571,
"step": 614
},
{
"epoch": 0.18770028994353732,
"grad_norm": 1.3049138038493902,
"learning_rate": 9.363114056371178e-06,
"loss": 0.3114,
"step": 615
},
{
"epoch": 0.18800549366702274,
"grad_norm": 1.468685879220778,
"learning_rate": 9.360697165379004e-06,
"loss": 0.5043,
"step": 616
},
{
"epoch": 0.18831069739050815,
"grad_norm": 2.131454426646245,
"learning_rate": 9.35827601034744e-06,
"loss": 0.5165,
"step": 617
},
{
"epoch": 0.1886159011139936,
"grad_norm": 1.5563065422847613,
"learning_rate": 9.355850593643974e-06,
"loss": 0.4707,
"step": 618
},
{
"epoch": 0.188921104837479,
"grad_norm": 1.3992788671446874,
"learning_rate": 9.353420917640264e-06,
"loss": 0.3905,
"step": 619
},
{
"epoch": 0.18922630856096445,
"grad_norm": 1.2635210704880713,
"learning_rate": 9.350986984712128e-06,
"loss": 0.2399,
"step": 620
},
{
"epoch": 0.18953151228444987,
"grad_norm": 1.4071894580574145,
"learning_rate": 9.348548797239551e-06,
"loss": 0.3689,
"step": 621
},
{
"epoch": 0.1898367160079353,
"grad_norm": 1.8460324916935194,
"learning_rate": 9.346106357606675e-06,
"loss": 0.3337,
"step": 622
},
{
"epoch": 0.19014191973142072,
"grad_norm": 1.2199756132877848,
"learning_rate": 9.343659668201803e-06,
"loss": 0.3707,
"step": 623
},
{
"epoch": 0.19044712345490614,
"grad_norm": 1.3352934416971625,
"learning_rate": 9.34120873141739e-06,
"loss": 0.3332,
"step": 624
},
{
"epoch": 0.19075232717839158,
"grad_norm": 1.5847999740161538,
"learning_rate": 9.33875354965005e-06,
"loss": 0.4658,
"step": 625
},
{
"epoch": 0.191057530901877,
"grad_norm": 1.59267718540602,
"learning_rate": 9.336294125300538e-06,
"loss": 0.5138,
"step": 626
},
{
"epoch": 0.19136273462536244,
"grad_norm": 1.0558131110089597,
"learning_rate": 9.333830460773767e-06,
"loss": 0.2512,
"step": 627
},
{
"epoch": 0.19166793834884785,
"grad_norm": 1.6912848096424418,
"learning_rate": 9.331362558478793e-06,
"loss": 0.3129,
"step": 628
},
{
"epoch": 0.1919731420723333,
"grad_norm": 1.7565199783626735,
"learning_rate": 9.328890420828817e-06,
"loss": 0.2625,
"step": 629
},
{
"epoch": 0.1922783457958187,
"grad_norm": 1.6287483120468187,
"learning_rate": 9.326414050241176e-06,
"loss": 0.4631,
"step": 630
},
{
"epoch": 0.19258354951930415,
"grad_norm": 1.5343127080699748,
"learning_rate": 9.323933449137353e-06,
"loss": 0.35,
"step": 631
},
{
"epoch": 0.19288875324278956,
"grad_norm": 1.2901421851525343,
"learning_rate": 9.321448619942963e-06,
"loss": 0.3191,
"step": 632
},
{
"epoch": 0.19319395696627498,
"grad_norm": 1.3651871352420322,
"learning_rate": 9.318959565087761e-06,
"loss": 0.3063,
"step": 633
},
{
"epoch": 0.19349916068976042,
"grad_norm": 1.4900191765389657,
"learning_rate": 9.316466287005625e-06,
"loss": 0.3621,
"step": 634
},
{
"epoch": 0.19380436441324583,
"grad_norm": 1.836926327897149,
"learning_rate": 9.313968788134572e-06,
"loss": 0.6273,
"step": 635
},
{
"epoch": 0.19410956813673128,
"grad_norm": 1.6283757285797815,
"learning_rate": 9.311467070916743e-06,
"loss": 0.3191,
"step": 636
},
{
"epoch": 0.1944147718602167,
"grad_norm": 1.7047955780313857,
"learning_rate": 9.308961137798398e-06,
"loss": 0.5581,
"step": 637
},
{
"epoch": 0.19471997558370213,
"grad_norm": 1.4826549000090183,
"learning_rate": 9.306450991229927e-06,
"loss": 0.3157,
"step": 638
},
{
"epoch": 0.19502517930718755,
"grad_norm": 1.435361017145943,
"learning_rate": 9.30393663366584e-06,
"loss": 0.3084,
"step": 639
},
{
"epoch": 0.19533038303067296,
"grad_norm": 1.402358583674702,
"learning_rate": 9.301418067564758e-06,
"loss": 0.3351,
"step": 640
},
{
"epoch": 0.1956355867541584,
"grad_norm": 1.2933654176691274,
"learning_rate": 9.298895295389423e-06,
"loss": 0.2585,
"step": 641
},
{
"epoch": 0.19594079047764382,
"grad_norm": 1.5346090103364156,
"learning_rate": 9.29636831960669e-06,
"loss": 0.33,
"step": 642
},
{
"epoch": 0.19624599420112926,
"grad_norm": 1.3251725534327445,
"learning_rate": 9.293837142687522e-06,
"loss": 0.2104,
"step": 643
},
{
"epoch": 0.19655119792461467,
"grad_norm": 1.583285740923444,
"learning_rate": 9.291301767106986e-06,
"loss": 0.4326,
"step": 644
},
{
"epoch": 0.19685640164810012,
"grad_norm": 1.0905371753559963,
"learning_rate": 9.288762195344266e-06,
"loss": 0.274,
"step": 645
},
{
"epoch": 0.19716160537158553,
"grad_norm": 1.7263838041187525,
"learning_rate": 9.28621842988264e-06,
"loss": 0.5011,
"step": 646
},
{
"epoch": 0.19746680909507097,
"grad_norm": 1.4838510492072716,
"learning_rate": 9.283670473209488e-06,
"loss": 0.1956,
"step": 647
},
{
"epoch": 0.1977720128185564,
"grad_norm": 1.2036114489558822,
"learning_rate": 9.28111832781629e-06,
"loss": 0.2346,
"step": 648
},
{
"epoch": 0.1980772165420418,
"grad_norm": 1.3853687861302435,
"learning_rate": 9.278561996198622e-06,
"loss": 0.2313,
"step": 649
},
{
"epoch": 0.19838242026552724,
"grad_norm": 1.2708592907281826,
"learning_rate": 9.276001480856152e-06,
"loss": 0.3717,
"step": 650
},
{
"epoch": 0.19868762398901266,
"grad_norm": 1.6407943638167721,
"learning_rate": 9.273436784292641e-06,
"loss": 0.5533,
"step": 651
},
{
"epoch": 0.1989928277124981,
"grad_norm": 1.9614104046455092,
"learning_rate": 9.270867909015936e-06,
"loss": 0.4552,
"step": 652
},
{
"epoch": 0.1992980314359835,
"grad_norm": 1.6038724290911757,
"learning_rate": 9.268294857537973e-06,
"loss": 0.477,
"step": 653
},
{
"epoch": 0.19960323515946896,
"grad_norm": 1.9838192379267598,
"learning_rate": 9.26571763237477e-06,
"loss": 0.7118,
"step": 654
},
{
"epoch": 0.19990843888295437,
"grad_norm": 1.4066170955047037,
"learning_rate": 9.263136236046422e-06,
"loss": 0.4222,
"step": 655
},
{
"epoch": 0.2002136426064398,
"grad_norm": 1.800111592330467,
"learning_rate": 9.260550671077113e-06,
"loss": 0.4969,
"step": 656
}
],
"logging_steps": 1.0,
"max_steps": 3276,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 328,
"total_flos": 81687288602624.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}