9b-89 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
bf5688c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 2008,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00398406374501992,
"grad_norm": 4.348448276519775,
"learning_rate": 5.940594059405941e-08,
"loss": 2.1171607971191406,
"step": 2
},
{
"epoch": 0.00796812749003984,
"grad_norm": 2.55696177482605,
"learning_rate": 1.782178217821782e-07,
"loss": 2.068465232849121,
"step": 4
},
{
"epoch": 0.01195219123505976,
"grad_norm": 3.159899950027466,
"learning_rate": 2.9702970297029703e-07,
"loss": 2.136167287826538,
"step": 6
},
{
"epoch": 0.01593625498007968,
"grad_norm": 2.0796260833740234,
"learning_rate": 4.158415841584159e-07,
"loss": 1.8786698579788208,
"step": 8
},
{
"epoch": 0.0199203187250996,
"grad_norm": 5.41955041885376,
"learning_rate": 5.346534653465346e-07,
"loss": 1.9257912635803223,
"step": 10
},
{
"epoch": 0.02390438247011952,
"grad_norm": 11.406185150146484,
"learning_rate": 6.534653465346535e-07,
"loss": 2.368868827819824,
"step": 12
},
{
"epoch": 0.027888446215139442,
"grad_norm": 1.901093602180481,
"learning_rate": 7.722772277227723e-07,
"loss": 1.9428346157073975,
"step": 14
},
{
"epoch": 0.03187250996015936,
"grad_norm": 1.393601894378662,
"learning_rate": 8.910891089108911e-07,
"loss": 1.7873543500900269,
"step": 16
},
{
"epoch": 0.035856573705179286,
"grad_norm": 1.4436230659484863,
"learning_rate": 1.00990099009901e-06,
"loss": 1.2166668176651,
"step": 18
},
{
"epoch": 0.0398406374501992,
"grad_norm": 1.8145285844802856,
"learning_rate": 1.1287128712871288e-06,
"loss": 1.6057647466659546,
"step": 20
},
{
"epoch": 0.043824701195219126,
"grad_norm": 1.2188401222229004,
"learning_rate": 1.2475247524752474e-06,
"loss": 1.7550266981124878,
"step": 22
},
{
"epoch": 0.04780876494023904,
"grad_norm": 1.664843201637268,
"learning_rate": 1.3663366336633665e-06,
"loss": 1.5540839433670044,
"step": 24
},
{
"epoch": 0.05179282868525897,
"grad_norm": 9.42098617553711,
"learning_rate": 1.4851485148514852e-06,
"loss": 2.344756841659546,
"step": 26
},
{
"epoch": 0.055776892430278883,
"grad_norm": 2.532942771911621,
"learning_rate": 1.603960396039604e-06,
"loss": 1.6220295429229736,
"step": 28
},
{
"epoch": 0.05976095617529881,
"grad_norm": 14.82198429107666,
"learning_rate": 1.7227722772277227e-06,
"loss": 1.83803129196167,
"step": 30
},
{
"epoch": 0.06374501992031872,
"grad_norm": 4.736570358276367,
"learning_rate": 1.8415841584158415e-06,
"loss": 0.9668034315109253,
"step": 32
},
{
"epoch": 0.06772908366533864,
"grad_norm": 1.077216386795044,
"learning_rate": 1.9603960396039604e-06,
"loss": 1.5424432754516602,
"step": 34
},
{
"epoch": 0.07171314741035857,
"grad_norm": 2.908050060272217,
"learning_rate": 2.079207920792079e-06,
"loss": 1.578654408454895,
"step": 36
},
{
"epoch": 0.07569721115537849,
"grad_norm": 1.1227400302886963,
"learning_rate": 2.198019801980198e-06,
"loss": 1.4881534576416016,
"step": 38
},
{
"epoch": 0.0796812749003984,
"grad_norm": 1.1037142276763916,
"learning_rate": 2.316831683168317e-06,
"loss": 1.4990899562835693,
"step": 40
},
{
"epoch": 0.08366533864541832,
"grad_norm": 11.898150444030762,
"learning_rate": 2.4356435643564358e-06,
"loss": 0.6503542065620422,
"step": 42
},
{
"epoch": 0.08764940239043825,
"grad_norm": 1.8804869651794434,
"learning_rate": 2.5544554455445544e-06,
"loss": 1.5055851936340332,
"step": 44
},
{
"epoch": 0.09163346613545817,
"grad_norm": 1.0558547973632812,
"learning_rate": 2.6732673267326735e-06,
"loss": 1.4046030044555664,
"step": 46
},
{
"epoch": 0.09561752988047809,
"grad_norm": 1.6279054880142212,
"learning_rate": 2.792079207920792e-06,
"loss": 1.228537678718567,
"step": 48
},
{
"epoch": 0.099601593625498,
"grad_norm": 9.176322937011719,
"learning_rate": 2.9108910891089108e-06,
"loss": 1.4280906915664673,
"step": 50
},
{
"epoch": 0.10358565737051793,
"grad_norm": 0.7322860956192017,
"learning_rate": 3.02970297029703e-06,
"loss": 1.3861970901489258,
"step": 52
},
{
"epoch": 0.10756972111553785,
"grad_norm": 1.9037761688232422,
"learning_rate": 3.148514851485149e-06,
"loss": 1.8983885049819946,
"step": 54
},
{
"epoch": 0.11155378486055777,
"grad_norm": 2.3619227409362793,
"learning_rate": 3.2673267326732676e-06,
"loss": 1.265608549118042,
"step": 56
},
{
"epoch": 0.11553784860557768,
"grad_norm": 1.079222559928894,
"learning_rate": 3.3861386138613858e-06,
"loss": 1.3720718622207642,
"step": 58
},
{
"epoch": 0.11952191235059761,
"grad_norm": 2.0091183185577393,
"learning_rate": 3.504950495049505e-06,
"loss": 1.221197485923767,
"step": 60
},
{
"epoch": 0.12350597609561753,
"grad_norm": 1.6702687740325928,
"learning_rate": 3.623762376237624e-06,
"loss": 1.3659617900848389,
"step": 62
},
{
"epoch": 0.12749003984063745,
"grad_norm": 1.4772052764892578,
"learning_rate": 3.7425742574257425e-06,
"loss": 0.9504954218864441,
"step": 64
},
{
"epoch": 0.13147410358565736,
"grad_norm": 1.1839888095855713,
"learning_rate": 3.861386138613861e-06,
"loss": 1.3058485984802246,
"step": 66
},
{
"epoch": 0.13545816733067728,
"grad_norm": 2.274195671081543,
"learning_rate": 3.98019801980198e-06,
"loss": 1.142093300819397,
"step": 68
},
{
"epoch": 0.1394422310756972,
"grad_norm": 1.23581063747406,
"learning_rate": 4.099009900990099e-06,
"loss": 1.3501553535461426,
"step": 70
},
{
"epoch": 0.14342629482071714,
"grad_norm": 0.9609231352806091,
"learning_rate": 4.2178217821782175e-06,
"loss": 1.394975185394287,
"step": 72
},
{
"epoch": 0.14741035856573706,
"grad_norm": 1.0941250324249268,
"learning_rate": 4.336633663366337e-06,
"loss": 1.3037439584732056,
"step": 74
},
{
"epoch": 0.15139442231075698,
"grad_norm": 2.8824353218078613,
"learning_rate": 4.455445544554456e-06,
"loss": 1.0416653156280518,
"step": 76
},
{
"epoch": 0.1553784860557769,
"grad_norm": 0.9365191459655762,
"learning_rate": 4.574257425742575e-06,
"loss": 1.2687112092971802,
"step": 78
},
{
"epoch": 0.1593625498007968,
"grad_norm": 1.5261850357055664,
"learning_rate": 4.693069306930693e-06,
"loss": 1.317929744720459,
"step": 80
},
{
"epoch": 0.16334661354581673,
"grad_norm": 1.2774893045425415,
"learning_rate": 4.811881188118812e-06,
"loss": 1.0010876655578613,
"step": 82
},
{
"epoch": 0.16733067729083664,
"grad_norm": 2.2119550704956055,
"learning_rate": 4.93069306930693e-06,
"loss": 0.4978000223636627,
"step": 84
},
{
"epoch": 0.17131474103585656,
"grad_norm": 0.8584203124046326,
"learning_rate": 5.049504950495049e-06,
"loss": 0.5417638421058655,
"step": 86
},
{
"epoch": 0.1752988047808765,
"grad_norm": 1.1234948635101318,
"learning_rate": 5.168316831683168e-06,
"loss": 1.3661960363388062,
"step": 88
},
{
"epoch": 0.17928286852589642,
"grad_norm": 1.9819002151489258,
"learning_rate": 5.2871287128712874e-06,
"loss": 0.8402650952339172,
"step": 90
},
{
"epoch": 0.18326693227091634,
"grad_norm": 9.981027603149414,
"learning_rate": 5.4059405940594065e-06,
"loss": 1.0407862663269043,
"step": 92
},
{
"epoch": 0.18725099601593626,
"grad_norm": 5.5226335525512695,
"learning_rate": 5.524752475247525e-06,
"loss": 1.3026604652404785,
"step": 94
},
{
"epoch": 0.19123505976095617,
"grad_norm": 2.536931037902832,
"learning_rate": 5.643564356435644e-06,
"loss": 1.050534963607788,
"step": 96
},
{
"epoch": 0.1952191235059761,
"grad_norm": 2.8480377197265625,
"learning_rate": 5.762376237623762e-06,
"loss": 1.2697981595993042,
"step": 98
},
{
"epoch": 0.199203187250996,
"grad_norm": 1.2788549661636353,
"learning_rate": 5.881188118811881e-06,
"loss": 1.3857513666152954,
"step": 100
},
{
"epoch": 0.20318725099601595,
"grad_norm": 4.938348293304443,
"learning_rate": 6e-06,
"loss": 1.2503310441970825,
"step": 102
},
{
"epoch": 0.20717131474103587,
"grad_norm": 1.3763278722763062,
"learning_rate": 5.99998534480079e-06,
"loss": 1.316627025604248,
"step": 104
},
{
"epoch": 0.21115537848605578,
"grad_norm": 1.2016820907592773,
"learning_rate": 5.9999413793622525e-06,
"loss": 1.3336181640625,
"step": 106
},
{
"epoch": 0.2151394422310757,
"grad_norm": 2.7037742137908936,
"learning_rate": 5.9998681041616624e-06,
"loss": 0.848972737789154,
"step": 108
},
{
"epoch": 0.21912350597609562,
"grad_norm": 2.082820177078247,
"learning_rate": 5.999765519994475e-06,
"loss": 1.1773113012313843,
"step": 110
},
{
"epoch": 0.22310756972111553,
"grad_norm": 1.349158525466919,
"learning_rate": 5.999633627974312e-06,
"loss": 1.838499903678894,
"step": 112
},
{
"epoch": 0.22709163346613545,
"grad_norm": 1.0457987785339355,
"learning_rate": 5.9994724295329546e-06,
"loss": 1.2931954860687256,
"step": 114
},
{
"epoch": 0.23107569721115537,
"grad_norm": 1.0233925580978394,
"learning_rate": 5.999281926420326e-06,
"loss": 1.3657619953155518,
"step": 116
},
{
"epoch": 0.2350597609561753,
"grad_norm": 1.456226110458374,
"learning_rate": 5.999062120704471e-06,
"loss": 0.39271149039268494,
"step": 118
},
{
"epoch": 0.23904382470119523,
"grad_norm": 1.1591161489486694,
"learning_rate": 5.998813014771534e-06,
"loss": 1.283569097518921,
"step": 120
},
{
"epoch": 0.24302788844621515,
"grad_norm": 1.4893031120300293,
"learning_rate": 5.998534611325737e-06,
"loss": 1.3696374893188477,
"step": 122
},
{
"epoch": 0.24701195219123506,
"grad_norm": 1.0916317701339722,
"learning_rate": 5.998226913389344e-06,
"loss": 1.2977485656738281,
"step": 124
},
{
"epoch": 0.250996015936255,
"grad_norm": 1.5058797597885132,
"learning_rate": 5.997889924302632e-06,
"loss": 1.2800962924957275,
"step": 126
},
{
"epoch": 0.2549800796812749,
"grad_norm": 2.89294695854187,
"learning_rate": 5.997523647723856e-06,
"loss": 0.9177144169807434,
"step": 128
},
{
"epoch": 0.2589641434262948,
"grad_norm": 2.416161060333252,
"learning_rate": 5.997128087629205e-06,
"loss": 1.280983567237854,
"step": 130
},
{
"epoch": 0.26294820717131473,
"grad_norm": 1.2975496053695679,
"learning_rate": 5.996703248312762e-06,
"loss": 1.2503688335418701,
"step": 132
},
{
"epoch": 0.26693227091633465,
"grad_norm": 0.9795719385147095,
"learning_rate": 5.996249134386455e-06,
"loss": 1.2679003477096558,
"step": 134
},
{
"epoch": 0.27091633466135456,
"grad_norm": 1.4742954969406128,
"learning_rate": 5.995765750780013e-06,
"loss": 0.5531994700431824,
"step": 136
},
{
"epoch": 0.2749003984063745,
"grad_norm": 2.563380241394043,
"learning_rate": 5.995253102740903e-06,
"loss": 1.901612401008606,
"step": 138
},
{
"epoch": 0.2788844621513944,
"grad_norm": 1.4704535007476807,
"learning_rate": 5.994711195834279e-06,
"loss": 1.1717365980148315,
"step": 140
},
{
"epoch": 0.28286852589641437,
"grad_norm": 1.1811615228652954,
"learning_rate": 5.994140035942923e-06,
"loss": 0.7471544742584229,
"step": 142
},
{
"epoch": 0.2868525896414343,
"grad_norm": 1.6094988584518433,
"learning_rate": 5.993539629267178e-06,
"loss": 0.9018757939338684,
"step": 144
},
{
"epoch": 0.2908366533864542,
"grad_norm": 2.305218458175659,
"learning_rate": 5.992909982324879e-06,
"loss": 1.277273178100586,
"step": 146
},
{
"epoch": 0.2948207171314741,
"grad_norm": 3.697319746017456,
"learning_rate": 5.992251101951287e-06,
"loss": 1.0025593042373657,
"step": 148
},
{
"epoch": 0.29880478087649404,
"grad_norm": 1.539844036102295,
"learning_rate": 5.991562995299011e-06,
"loss": 1.3024755716323853,
"step": 150
},
{
"epoch": 0.30278884462151395,
"grad_norm": 1.0249600410461426,
"learning_rate": 5.990845669837933e-06,
"loss": 1.5959429740905762,
"step": 152
},
{
"epoch": 0.30677290836653387,
"grad_norm": 0.8561967015266418,
"learning_rate": 5.990099133355126e-06,
"loss": 1.2801433801651,
"step": 154
},
{
"epoch": 0.3107569721115538,
"grad_norm": 4.086156845092773,
"learning_rate": 5.989323393954767e-06,
"loss": 0.4956245422363281,
"step": 156
},
{
"epoch": 0.3147410358565737,
"grad_norm": 3.771010398864746,
"learning_rate": 5.988518460058054e-06,
"loss": 0.4668130576610565,
"step": 158
},
{
"epoch": 0.3187250996015936,
"grad_norm": 1.3703054189682007,
"learning_rate": 5.9876843404031096e-06,
"loss": 1.2212884426116943,
"step": 160
},
{
"epoch": 0.32270916334661354,
"grad_norm": 1.210668921470642,
"learning_rate": 5.986821044044889e-06,
"loss": 1.7916109561920166,
"step": 162
},
{
"epoch": 0.32669322709163345,
"grad_norm": 1.0227242708206177,
"learning_rate": 5.985928580355082e-06,
"loss": 0.8739029765129089,
"step": 164
},
{
"epoch": 0.33067729083665337,
"grad_norm": 2.860746383666992,
"learning_rate": 5.985006959022008e-06,
"loss": 0.4693869352340698,
"step": 166
},
{
"epoch": 0.3346613545816733,
"grad_norm": 1.755257487297058,
"learning_rate": 5.984056190050517e-06,
"loss": 1.324602723121643,
"step": 168
},
{
"epoch": 0.3386454183266932,
"grad_norm": 7.148312568664551,
"learning_rate": 5.983076283761872e-06,
"loss": 1.3821817636489868,
"step": 170
},
{
"epoch": 0.3426294820717131,
"grad_norm": 1.2952216863632202,
"learning_rate": 5.982067250793646e-06,
"loss": 1.2612062692642212,
"step": 172
},
{
"epoch": 0.3466135458167331,
"grad_norm": 1.727574348449707,
"learning_rate": 5.981029102099601e-06,
"loss": 1.341408133506775,
"step": 174
},
{
"epoch": 0.350597609561753,
"grad_norm": 2.543426513671875,
"learning_rate": 5.979961848949572e-06,
"loss": 0.5157387852668762,
"step": 176
},
{
"epoch": 0.3545816733067729,
"grad_norm": 1.489472508430481,
"learning_rate": 5.978865502929343e-06,
"loss": 1.3691034317016602,
"step": 178
},
{
"epoch": 0.35856573705179284,
"grad_norm": 3.3407742977142334,
"learning_rate": 5.977740075940517e-06,
"loss": 1.2798420190811157,
"step": 180
},
{
"epoch": 0.36254980079681276,
"grad_norm": 0.7936763763427734,
"learning_rate": 5.976585580200399e-06,
"loss": 1.2865771055221558,
"step": 182
},
{
"epoch": 0.3665338645418327,
"grad_norm": 1.722764492034912,
"learning_rate": 5.9754020282418505e-06,
"loss": 0.9274950623512268,
"step": 184
},
{
"epoch": 0.3705179282868526,
"grad_norm": 1.4277971982955933,
"learning_rate": 5.974189432913161e-06,
"loss": 1.2118057012557983,
"step": 186
},
{
"epoch": 0.3745019920318725,
"grad_norm": 0.7755621671676636,
"learning_rate": 5.972947807377905e-06,
"loss": 1.262542724609375,
"step": 188
},
{
"epoch": 0.3784860557768924,
"grad_norm": 2.0006139278411865,
"learning_rate": 5.971677165114801e-06,
"loss": 1.1163339614868164,
"step": 190
},
{
"epoch": 0.38247011952191234,
"grad_norm": 1.9247850179672241,
"learning_rate": 5.970377519917563e-06,
"loss": 1.0671018362045288,
"step": 192
},
{
"epoch": 0.38645418326693226,
"grad_norm": 1.1371593475341797,
"learning_rate": 5.969048885894754e-06,
"loss": 1.2458205223083496,
"step": 194
},
{
"epoch": 0.3904382470119522,
"grad_norm": 1.5814062356948853,
"learning_rate": 5.967691277469631e-06,
"loss": 1.2479208707809448,
"step": 196
},
{
"epoch": 0.3944223107569721,
"grad_norm": 1.3527947664260864,
"learning_rate": 5.9663047093799874e-06,
"loss": 0.46853581070899963,
"step": 198
},
{
"epoch": 0.398406374501992,
"grad_norm": 0.9908071160316467,
"learning_rate": 5.964889196677996e-06,
"loss": 1.2344821691513062,
"step": 200
},
{
"epoch": 0.40239043824701193,
"grad_norm": 0.9923727512359619,
"learning_rate": 5.9634447547300415e-06,
"loss": 1.2732172012329102,
"step": 202
},
{
"epoch": 0.4063745019920319,
"grad_norm": 2.537524700164795,
"learning_rate": 5.961971399216556e-06,
"loss": 1.234106183052063,
"step": 204
},
{
"epoch": 0.4103585657370518,
"grad_norm": 3.067852735519409,
"learning_rate": 5.960469146131851e-06,
"loss": 0.38716864585876465,
"step": 206
},
{
"epoch": 0.41434262948207173,
"grad_norm": 0.8039565086364746,
"learning_rate": 5.95893801178394e-06,
"loss": 1.223067045211792,
"step": 208
},
{
"epoch": 0.41832669322709165,
"grad_norm": 1.5125787258148193,
"learning_rate": 5.957378012794361e-06,
"loss": 0.698806881904602,
"step": 210
},
{
"epoch": 0.42231075697211157,
"grad_norm": 1.2418526411056519,
"learning_rate": 5.955789166098002e-06,
"loss": 0.7970227599143982,
"step": 212
},
{
"epoch": 0.4262948207171315,
"grad_norm": 2.7106666564941406,
"learning_rate": 5.954171488942911e-06,
"loss": 0.8325067758560181,
"step": 214
},
{
"epoch": 0.4302788844621514,
"grad_norm": 3.5096561908721924,
"learning_rate": 5.952524998890109e-06,
"loss": 1.1556031703948975,
"step": 216
},
{
"epoch": 0.4342629482071713,
"grad_norm": 1.513983130455017,
"learning_rate": 5.950849713813405e-06,
"loss": 1.263627529144287,
"step": 218
},
{
"epoch": 0.43824701195219123,
"grad_norm": 0.7860940098762512,
"learning_rate": 5.949145651899196e-06,
"loss": 1.2762495279312134,
"step": 220
},
{
"epoch": 0.44223107569721115,
"grad_norm": 1.6819899082183838,
"learning_rate": 5.947412831646271e-06,
"loss": 0.5981872081756592,
"step": 222
},
{
"epoch": 0.44621513944223107,
"grad_norm": 1.2630786895751953,
"learning_rate": 5.945651271865616e-06,
"loss": 1.120012879371643,
"step": 224
},
{
"epoch": 0.450199203187251,
"grad_norm": 0.9950310587882996,
"learning_rate": 5.943860991680195e-06,
"loss": 1.2754716873168945,
"step": 226
},
{
"epoch": 0.4541832669322709,
"grad_norm": 1.6684496402740479,
"learning_rate": 5.942042010524764e-06,
"loss": 0.9846575856208801,
"step": 228
},
{
"epoch": 0.4581673306772908,
"grad_norm": 1.4847872257232666,
"learning_rate": 5.9401943481456386e-06,
"loss": 1.2583152055740356,
"step": 230
},
{
"epoch": 0.46215139442231074,
"grad_norm": 0.9578908681869507,
"learning_rate": 5.9383180246004935e-06,
"loss": 1.2739794254302979,
"step": 232
},
{
"epoch": 0.46613545816733065,
"grad_norm": 1.1821162700653076,
"learning_rate": 5.936413060258143e-06,
"loss": 1.4074854850769043,
"step": 234
},
{
"epoch": 0.4701195219123506,
"grad_norm": 0.8178677558898926,
"learning_rate": 5.9344794757983115e-06,
"loss": 1.2413185834884644,
"step": 236
},
{
"epoch": 0.47410358565737054,
"grad_norm": 2.4166979789733887,
"learning_rate": 5.932517292211418e-06,
"loss": 1.1744059324264526,
"step": 238
},
{
"epoch": 0.47808764940239046,
"grad_norm": 1.1220707893371582,
"learning_rate": 5.930526530798347e-06,
"loss": 1.2574900388717651,
"step": 240
},
{
"epoch": 0.4820717131474104,
"grad_norm": 0.7189679741859436,
"learning_rate": 5.928507213170211e-06,
"loss": 1.2059662342071533,
"step": 242
},
{
"epoch": 0.4860557768924303,
"grad_norm": 1.4799033403396606,
"learning_rate": 5.926459361248125e-06,
"loss": 0.7257046103477478,
"step": 244
},
{
"epoch": 0.4900398406374502,
"grad_norm": 8.812633514404297,
"learning_rate": 5.9243829972629584e-06,
"loss": 1.0781515836715698,
"step": 246
},
{
"epoch": 0.4940239043824701,
"grad_norm": 2.5435431003570557,
"learning_rate": 5.922278143755105e-06,
"loss": 0.9890032410621643,
"step": 248
},
{
"epoch": 0.49800796812749004,
"grad_norm": 1.1066993474960327,
"learning_rate": 5.920144823574229e-06,
"loss": 1.275596261024475,
"step": 250
},
{
"epoch": 0.50199203187251,
"grad_norm": 3.8385164737701416,
"learning_rate": 5.917983059879021e-06,
"loss": 0.5777413249015808,
"step": 252
},
{
"epoch": 0.5059760956175299,
"grad_norm": 2.5549728870391846,
"learning_rate": 5.915792876136944e-06,
"loss": 1.2903834581375122,
"step": 254
},
{
"epoch": 0.5099601593625498,
"grad_norm": 1.1752848625183105,
"learning_rate": 5.913574296123985e-06,
"loss": 1.2607370615005493,
"step": 256
},
{
"epoch": 0.5139442231075697,
"grad_norm": 3.4985756874084473,
"learning_rate": 5.9113273439243885e-06,
"loss": 0.6077223420143127,
"step": 258
},
{
"epoch": 0.5179282868525896,
"grad_norm": 0.8346880674362183,
"learning_rate": 5.909052043930402e-06,
"loss": 1.2486491203308105,
"step": 260
},
{
"epoch": 0.5219123505976095,
"grad_norm": 1.6400198936462402,
"learning_rate": 5.9067484208420046e-06,
"loss": 0.3859616219997406,
"step": 262
},
{
"epoch": 0.5258964143426295,
"grad_norm": 2.0709147453308105,
"learning_rate": 5.904416499666646e-06,
"loss": 1.250545620918274,
"step": 264
},
{
"epoch": 0.5298804780876494,
"grad_norm": 3.2738661766052246,
"learning_rate": 5.902056305718969e-06,
"loss": 0.5132614970207214,
"step": 266
},
{
"epoch": 0.5338645418326693,
"grad_norm": 1.4471163749694824,
"learning_rate": 5.89966786462054e-06,
"loss": 1.2536060810089111,
"step": 268
},
{
"epoch": 0.5378486055776892,
"grad_norm": 2.023653030395508,
"learning_rate": 5.897251202299566e-06,
"loss": 1.7837636470794678,
"step": 270
},
{
"epoch": 0.5418326693227091,
"grad_norm": 0.7867792248725891,
"learning_rate": 5.894806344990614e-06,
"loss": 0.7907792329788208,
"step": 272
},
{
"epoch": 0.545816733067729,
"grad_norm": 0.9616872072219849,
"learning_rate": 5.892333319234332e-06,
"loss": 1.240364670753479,
"step": 274
},
{
"epoch": 0.549800796812749,
"grad_norm": 1.5364048480987549,
"learning_rate": 5.889832151877152e-06,
"loss": 0.6271519064903259,
"step": 276
},
{
"epoch": 0.5537848605577689,
"grad_norm": 1.9956889152526855,
"learning_rate": 5.887302870071004e-06,
"loss": 1.354748010635376,
"step": 278
},
{
"epoch": 0.5577689243027888,
"grad_norm": 3.179105043411255,
"learning_rate": 5.88474550127302e-06,
"loss": 0.7769224047660828,
"step": 280
},
{
"epoch": 0.5617529880478087,
"grad_norm": 2.1050288677215576,
"learning_rate": 5.882160073245238e-06,
"loss": 0.7815161347389221,
"step": 282
},
{
"epoch": 0.5657370517928287,
"grad_norm": 1.0835380554199219,
"learning_rate": 5.879546614054295e-06,
"loss": 1.2420227527618408,
"step": 284
},
{
"epoch": 0.5697211155378487,
"grad_norm": 0.9784935712814331,
"learning_rate": 5.876905152071131e-06,
"loss": 1.2437528371810913,
"step": 286
},
{
"epoch": 0.5737051792828686,
"grad_norm": 1.059682011604309,
"learning_rate": 5.874235715970671e-06,
"loss": 1.1747212409973145,
"step": 288
},
{
"epoch": 0.5776892430278885,
"grad_norm": 1.0844000577926636,
"learning_rate": 5.87153833473152e-06,
"loss": 1.2218478918075562,
"step": 290
},
{
"epoch": 0.5816733067729084,
"grad_norm": 1.2831990718841553,
"learning_rate": 5.868813037635649e-06,
"loss": 1.1690454483032227,
"step": 292
},
{
"epoch": 0.5856573705179283,
"grad_norm": 2.694718360900879,
"learning_rate": 5.866059854268076e-06,
"loss": 0.49895596504211426,
"step": 294
},
{
"epoch": 0.5896414342629482,
"grad_norm": 1.1014599800109863,
"learning_rate": 5.863278814516539e-06,
"loss": 1.4519755840301514,
"step": 296
},
{
"epoch": 0.5936254980079682,
"grad_norm": 6.0046305656433105,
"learning_rate": 5.860469948571181e-06,
"loss": 0.6872335076332092,
"step": 298
},
{
"epoch": 0.5976095617529881,
"grad_norm": 1.493370771408081,
"learning_rate": 5.857633286924219e-06,
"loss": 1.241629958152771,
"step": 300
},
{
"epoch": 0.601593625498008,
"grad_norm": 1.3740859031677246,
"learning_rate": 5.854768860369607e-06,
"loss": 1.0279847383499146,
"step": 302
},
{
"epoch": 0.6055776892430279,
"grad_norm": 4.5894083976745605,
"learning_rate": 5.85187670000271e-06,
"loss": 0.8594214916229248,
"step": 304
},
{
"epoch": 0.6095617529880478,
"grad_norm": 1.9348714351654053,
"learning_rate": 5.848956837219964e-06,
"loss": 1.1640937328338623,
"step": 306
},
{
"epoch": 0.6135458167330677,
"grad_norm": 3.6650631427764893,
"learning_rate": 5.846009303718529e-06,
"loss": 1.083706259727478,
"step": 308
},
{
"epoch": 0.6175298804780877,
"grad_norm": 0.8985078930854797,
"learning_rate": 5.8430341314959565e-06,
"loss": 1.2840549945831299,
"step": 310
},
{
"epoch": 0.6215139442231076,
"grad_norm": 3.3366034030914307,
"learning_rate": 5.840031352849833e-06,
"loss": 0.6729341149330139,
"step": 312
},
{
"epoch": 0.6254980079681275,
"grad_norm": 0.5400150418281555,
"learning_rate": 5.83700100037743e-06,
"loss": 0.9031069874763489,
"step": 314
},
{
"epoch": 0.6294820717131474,
"grad_norm": 0.8818338513374329,
"learning_rate": 5.833943106975355e-06,
"loss": 1.403872013092041,
"step": 316
},
{
"epoch": 0.6334661354581673,
"grad_norm": 0.9534677267074585,
"learning_rate": 5.830857705839191e-06,
"loss": 0.7257641553878784,
"step": 318
},
{
"epoch": 0.6374501992031872,
"grad_norm": 1.2703937292099,
"learning_rate": 5.8277448304631385e-06,
"loss": 1.2789297103881836,
"step": 320
},
{
"epoch": 0.6414342629482072,
"grad_norm": 2.5597033500671387,
"learning_rate": 5.824604514639647e-06,
"loss": 0.5666279792785645,
"step": 322
},
{
"epoch": 0.6454183266932271,
"grad_norm": 1.932152509689331,
"learning_rate": 5.8214367924590515e-06,
"loss": 0.9416989088058472,
"step": 324
},
{
"epoch": 0.649402390438247,
"grad_norm": 2.5085222721099854,
"learning_rate": 5.818241698309205e-06,
"loss": 0.9871986508369446,
"step": 326
},
{
"epoch": 0.6533864541832669,
"grad_norm": 0.8283513784408569,
"learning_rate": 5.8150192668751015e-06,
"loss": 1.2529672384262085,
"step": 328
},
{
"epoch": 0.6573705179282868,
"grad_norm": 7.669778347015381,
"learning_rate": 5.811769533138499e-06,
"loss": 0.46496719121932983,
"step": 330
},
{
"epoch": 0.6613545816733067,
"grad_norm": 3.1111960411071777,
"learning_rate": 5.808492532377542e-06,
"loss": 1.1308894157409668,
"step": 332
},
{
"epoch": 0.6653386454183267,
"grad_norm": 1.0599477291107178,
"learning_rate": 5.805188300166379e-06,
"loss": 1.1927093267440796,
"step": 334
},
{
"epoch": 0.6693227091633466,
"grad_norm": 0.7919442653656006,
"learning_rate": 5.801856872374772e-06,
"loss": 1.2229902744293213,
"step": 336
},
{
"epoch": 0.6733067729083665,
"grad_norm": 0.874751627445221,
"learning_rate": 5.798498285167714e-06,
"loss": 1.239054560661316,
"step": 338
},
{
"epoch": 0.6772908366533864,
"grad_norm": 3.267413854598999,
"learning_rate": 5.795112575005031e-06,
"loss": 0.5422060489654541,
"step": 340
},
{
"epoch": 0.6812749003984063,
"grad_norm": 0.603284478187561,
"learning_rate": 5.791699778640985e-06,
"loss": 0.5057201385498047,
"step": 342
},
{
"epoch": 0.6852589641434262,
"grad_norm": 1.073237419128418,
"learning_rate": 5.788259933123882e-06,
"loss": 1.212401270866394,
"step": 344
},
{
"epoch": 0.6892430278884463,
"grad_norm": 0.9039257168769836,
"learning_rate": 5.7847930757956626e-06,
"loss": 1.2373487949371338,
"step": 346
},
{
"epoch": 0.6932270916334662,
"grad_norm": 0.6864405870437622,
"learning_rate": 5.7812992442915016e-06,
"loss": 1.1827311515808105,
"step": 348
},
{
"epoch": 0.6972111553784861,
"grad_norm": 1.7330577373504639,
"learning_rate": 5.777778476539397e-06,
"loss": 0.7856748104095459,
"step": 350
},
{
"epoch": 0.701195219123506,
"grad_norm": 4.816940784454346,
"learning_rate": 5.774230810759756e-06,
"loss": 0.7216228246688843,
"step": 352
},
{
"epoch": 0.7051792828685259,
"grad_norm": 2.1332626342773438,
"learning_rate": 5.7706562854649866e-06,
"loss": 0.49049532413482666,
"step": 354
},
{
"epoch": 0.7091633466135459,
"grad_norm": 2.8059940338134766,
"learning_rate": 5.767054939459075e-06,
"loss": 1.3019351959228516,
"step": 356
},
{
"epoch": 0.7131474103585658,
"grad_norm": 4.427498817443848,
"learning_rate": 5.763426811837164e-06,
"loss": 0.48208871483802795,
"step": 358
},
{
"epoch": 0.7171314741035857,
"grad_norm": 4.743298530578613,
"learning_rate": 5.759771941985128e-06,
"loss": 1.6483818292617798,
"step": 360
},
{
"epoch": 0.7211155378486056,
"grad_norm": 0.8030229210853577,
"learning_rate": 5.75609036957915e-06,
"loss": 0.7936917543411255,
"step": 362
},
{
"epoch": 0.7250996015936255,
"grad_norm": 4.138736248016357,
"learning_rate": 5.752382134585289e-06,
"loss": 0.19702184200286865,
"step": 364
},
{
"epoch": 0.7290836653386454,
"grad_norm": 0.7204448580741882,
"learning_rate": 5.748647277259041e-06,
"loss": 1.3097480535507202,
"step": 366
},
{
"epoch": 0.7330677290836654,
"grad_norm": 0.6811744570732117,
"learning_rate": 5.744885838144908e-06,
"loss": 1.282241702079773,
"step": 368
},
{
"epoch": 0.7370517928286853,
"grad_norm": 1.3216296434402466,
"learning_rate": 5.741097858075958e-06,
"loss": 1.1899917125701904,
"step": 370
},
{
"epoch": 0.7410358565737052,
"grad_norm": 0.7291891574859619,
"learning_rate": 5.737283378173377e-06,
"loss": 1.289171576499939,
"step": 372
},
{
"epoch": 0.7450199203187251,
"grad_norm": 1.4926878213882446,
"learning_rate": 5.733442439846028e-06,
"loss": 0.9133517742156982,
"step": 374
},
{
"epoch": 0.749003984063745,
"grad_norm": 1.1999213695526123,
"learning_rate": 5.729575084789995e-06,
"loss": 1.2485815286636353,
"step": 376
},
{
"epoch": 0.7529880478087649,
"grad_norm": 0.4571026563644409,
"learning_rate": 5.725681354988137e-06,
"loss": 0.41173255443573,
"step": 378
},
{
"epoch": 0.7569721115537849,
"grad_norm": 0.9662789106369019,
"learning_rate": 5.72176129270963e-06,
"loss": 1.3222002983093262,
"step": 380
},
{
"epoch": 0.7609561752988048,
"grad_norm": 0.8864423036575317,
"learning_rate": 5.717814940509503e-06,
"loss": 1.2533366680145264,
"step": 382
},
{
"epoch": 0.7649402390438247,
"grad_norm": 1.8013001680374146,
"learning_rate": 5.713842341228187e-06,
"loss": 1.132637858390808,
"step": 384
},
{
"epoch": 0.7689243027888446,
"grad_norm": 1.4815607070922852,
"learning_rate": 5.70984353799104e-06,
"loss": 0.28086692094802856,
"step": 386
},
{
"epoch": 0.7729083665338645,
"grad_norm": 0.8467429280281067,
"learning_rate": 5.705818574207883e-06,
"loss": 1.4608538150787354,
"step": 388
},
{
"epoch": 0.7768924302788844,
"grad_norm": 2.4864161014556885,
"learning_rate": 5.701767493572526e-06,
"loss": 0.7464155554771423,
"step": 390
},
{
"epoch": 0.7808764940239044,
"grad_norm": 2.4926576614379883,
"learning_rate": 5.6976903400623e-06,
"loss": 0.5242215991020203,
"step": 392
},
{
"epoch": 0.7848605577689243,
"grad_norm": 3.3884170055389404,
"learning_rate": 5.693587157937572e-06,
"loss": 0.7744420766830444,
"step": 394
},
{
"epoch": 0.7888446215139442,
"grad_norm": 1.3466330766677856,
"learning_rate": 5.689457991741267e-06,
"loss": 0.8062616586685181,
"step": 396
},
{
"epoch": 0.7928286852589641,
"grad_norm": 0.8415664434432983,
"learning_rate": 5.685302886298392e-06,
"loss": 0.9788842797279358,
"step": 398
},
{
"epoch": 0.796812749003984,
"grad_norm": 1.0375547409057617,
"learning_rate": 5.681121886715534e-06,
"loss": 1.068263053894043,
"step": 400
},
{
"epoch": 0.8007968127490039,
"grad_norm": 1.184495210647583,
"learning_rate": 5.676915038380384e-06,
"loss": 0.7641897797584534,
"step": 402
},
{
"epoch": 0.8047808764940239,
"grad_norm": 0.5623915195465088,
"learning_rate": 5.67268238696124e-06,
"loss": 1.194584846496582,
"step": 404
},
{
"epoch": 0.8087649402390438,
"grad_norm": 1.6544809341430664,
"learning_rate": 5.668423978406509e-06,
"loss": 1.8557928800582886,
"step": 406
},
{
"epoch": 0.8127490039840638,
"grad_norm": 0.9776933193206787,
"learning_rate": 5.664139858944209e-06,
"loss": 1.157083511352539,
"step": 408
},
{
"epoch": 0.8167330677290837,
"grad_norm": 0.9368433356285095,
"learning_rate": 5.65983007508147e-06,
"loss": 1.1894208192825317,
"step": 410
},
{
"epoch": 0.8207171314741036,
"grad_norm": 1.024929165840149,
"learning_rate": 5.655494673604024e-06,
"loss": 1.2211333513259888,
"step": 412
},
{
"epoch": 0.8247011952191236,
"grad_norm": 0.9331441521644592,
"learning_rate": 5.651133701575706e-06,
"loss": 0.9813644289970398,
"step": 414
},
{
"epoch": 0.8286852589641435,
"grad_norm": 0.43455296754837036,
"learning_rate": 5.64674720633793e-06,
"loss": 0.2262841910123825,
"step": 416
},
{
"epoch": 0.8326693227091634,
"grad_norm": 0.9842036366462708,
"learning_rate": 5.642335235509189e-06,
"loss": 1.2737834453582764,
"step": 418
},
{
"epoch": 0.8366533864541833,
"grad_norm": 1.0286755561828613,
"learning_rate": 5.637897836984526e-06,
"loss": 1.2228126525878906,
"step": 420
},
{
"epoch": 0.8406374501992032,
"grad_norm": 0.8756253123283386,
"learning_rate": 5.633435058935023e-06,
"loss": 1.1928170919418335,
"step": 422
},
{
"epoch": 0.8446215139442231,
"grad_norm": 0.758901834487915,
"learning_rate": 5.628946949807274e-06,
"loss": 1.1966356039047241,
"step": 424
},
{
"epoch": 0.848605577689243,
"grad_norm": 2.6789400577545166,
"learning_rate": 5.624433558322859e-06,
"loss": 0.7115716338157654,
"step": 426
},
{
"epoch": 0.852589641434263,
"grad_norm": 1.1329255104064941,
"learning_rate": 5.619894933477816e-06,
"loss": 1.2351547479629517,
"step": 428
},
{
"epoch": 0.8565737051792829,
"grad_norm": 0.8669703602790833,
"learning_rate": 5.615331124542109e-06,
"loss": 1.0460853576660156,
"step": 430
},
{
"epoch": 0.8605577689243028,
"grad_norm": 1.4718725681304932,
"learning_rate": 5.610742181059092e-06,
"loss": 1.8136500120162964,
"step": 432
},
{
"epoch": 0.8645418326693227,
"grad_norm": 1.955024003982544,
"learning_rate": 5.606128152844975e-06,
"loss": 1.2090433835983276,
"step": 434
},
{
"epoch": 0.8685258964143426,
"grad_norm": 2.959174156188965,
"learning_rate": 5.601489089988277e-06,
"loss": 0.4959055483341217,
"step": 436
},
{
"epoch": 0.8725099601593626,
"grad_norm": 0.8022291660308838,
"learning_rate": 5.596825042849287e-06,
"loss": 1.2489244937896729,
"step": 438
},
{
"epoch": 0.8764940239043825,
"grad_norm": 0.867755651473999,
"learning_rate": 5.592136062059517e-06,
"loss": 1.187935709953308,
"step": 440
},
{
"epoch": 0.8804780876494024,
"grad_norm": 2.0213284492492676,
"learning_rate": 5.587422198521149e-06,
"loss": 1.6624571084976196,
"step": 442
},
{
"epoch": 0.8844621513944223,
"grad_norm": 1.8472967147827148,
"learning_rate": 5.582683503406488e-06,
"loss": 1.3048073053359985,
"step": 444
},
{
"epoch": 0.8884462151394422,
"grad_norm": 0.8281286954879761,
"learning_rate": 5.5779200281574e-06,
"loss": 1.043340802192688,
"step": 446
},
{
"epoch": 0.8924302788844621,
"grad_norm": 1.8063609600067139,
"learning_rate": 5.573131824484758e-06,
"loss": 0.371786892414093,
"step": 448
},
{
"epoch": 0.896414342629482,
"grad_norm": 0.8337019681930542,
"learning_rate": 5.56831894436788e-06,
"loss": 1.1593928337097168,
"step": 450
},
{
"epoch": 0.900398406374502,
"grad_norm": 0.808246374130249,
"learning_rate": 5.563481440053964e-06,
"loss": 0.8130660057067871,
"step": 452
},
{
"epoch": 0.9043824701195219,
"grad_norm": 0.7648867964744568,
"learning_rate": 5.55861936405752e-06,
"loss": 1.2445188760757446,
"step": 454
},
{
"epoch": 0.9083665338645418,
"grad_norm": 4.679040431976318,
"learning_rate": 5.5537327691598026e-06,
"loss": 0.9090757966041565,
"step": 456
},
{
"epoch": 0.9123505976095617,
"grad_norm": 0.8703306317329407,
"learning_rate": 5.548821708408234e-06,
"loss": 1.2912606000900269,
"step": 458
},
{
"epoch": 0.9163346613545816,
"grad_norm": 3.33894681930542,
"learning_rate": 5.543886235115832e-06,
"loss": 1.0427659749984741,
"step": 460
},
{
"epoch": 0.9203187250996016,
"grad_norm": 1.598880410194397,
"learning_rate": 5.538926402860631e-06,
"loss": 1.2816940546035767,
"step": 462
},
{
"epoch": 0.9243027888446215,
"grad_norm": 1.35460364818573,
"learning_rate": 5.533942265485095e-06,
"loss": 1.3399840593338013,
"step": 464
},
{
"epoch": 0.9282868525896414,
"grad_norm": 7.064363956451416,
"learning_rate": 5.528933877095541e-06,
"loss": 0.40876510739326477,
"step": 466
},
{
"epoch": 0.9322709163346613,
"grad_norm": 0.7858706712722778,
"learning_rate": 5.523901292061547e-06,
"loss": 1.1805975437164307,
"step": 468
},
{
"epoch": 0.9362549800796812,
"grad_norm": 8.24327278137207,
"learning_rate": 5.518844565015361e-06,
"loss": 0.38794469833374023,
"step": 470
},
{
"epoch": 0.9402390438247012,
"grad_norm": 0.7928199768066406,
"learning_rate": 5.51376375085131e-06,
"loss": 1.2316607236862183,
"step": 472
},
{
"epoch": 0.9442231075697212,
"grad_norm": 4.031145095825195,
"learning_rate": 5.508658904725206e-06,
"loss": 0.5695405602455139,
"step": 474
},
{
"epoch": 0.9482071713147411,
"grad_norm": 2.9237377643585205,
"learning_rate": 5.503530082053741e-06,
"loss": 0.338968962430954,
"step": 476
},
{
"epoch": 0.952191235059761,
"grad_norm": 0.8833221793174744,
"learning_rate": 5.498377338513894e-06,
"loss": 1.2102028131484985,
"step": 478
},
{
"epoch": 0.9561752988047809,
"grad_norm": 25.611223220825195,
"learning_rate": 5.493200730042317e-06,
"loss": 0.4739567041397095,
"step": 480
},
{
"epoch": 0.9601593625498008,
"grad_norm": 5.376172065734863,
"learning_rate": 5.488000312834735e-06,
"loss": 0.9883483648300171,
"step": 482
},
{
"epoch": 0.9641434262948207,
"grad_norm": 1.7662686109542847,
"learning_rate": 5.482776143345333e-06,
"loss": 1.2430894374847412,
"step": 484
},
{
"epoch": 0.9681274900398407,
"grad_norm": 2.5627293586730957,
"learning_rate": 5.477528278286145e-06,
"loss": 1.2240179777145386,
"step": 486
},
{
"epoch": 0.9721115537848606,
"grad_norm": 0.8417234420776367,
"learning_rate": 5.472256774626435e-06,
"loss": 1.1680150032043457,
"step": 488
},
{
"epoch": 0.9760956175298805,
"grad_norm": 0.8709147572517395,
"learning_rate": 5.4669616895920826e-06,
"loss": 1.2006162405014038,
"step": 490
},
{
"epoch": 0.9800796812749004,
"grad_norm": 5.11852502822876,
"learning_rate": 5.46164308066496e-06,
"loss": 0.7005679607391357,
"step": 492
},
{
"epoch": 0.9840637450199203,
"grad_norm": 2.7665576934814453,
"learning_rate": 5.456301005582304e-06,
"loss": 0.7001307606697083,
"step": 494
},
{
"epoch": 0.9880478087649402,
"grad_norm": 0.8219811320304871,
"learning_rate": 5.4509355223360956e-06,
"loss": 1.254296898841858,
"step": 496
},
{
"epoch": 0.9920318725099602,
"grad_norm": 1.0245788097381592,
"learning_rate": 5.445546689172432e-06,
"loss": 1.267047643661499,
"step": 498
},
{
"epoch": 0.9960159362549801,
"grad_norm": 1.1505917310714722,
"learning_rate": 5.440134564590883e-06,
"loss": 0.7141546010971069,
"step": 500
},
{
"epoch": 1.0,
"grad_norm": 6.24027681350708,
"learning_rate": 5.434699207343867e-06,
"loss": 1.0391122102737427,
"step": 502
},
{
"epoch": 1.00398406374502,
"grad_norm": 1.2134792804718018,
"learning_rate": 5.429240676436008e-06,
"loss": 0.7802969217300415,
"step": 504
},
{
"epoch": 1.0079681274900398,
"grad_norm": 1.5164703130722046,
"learning_rate": 5.423759031123498e-06,
"loss": 0.31817543506622314,
"step": 506
},
{
"epoch": 1.0119521912350598,
"grad_norm": 0.6141365170478821,
"learning_rate": 5.41825433091345e-06,
"loss": 1.0097558498382568,
"step": 508
},
{
"epoch": 1.0159362549800797,
"grad_norm": 0.8733232021331787,
"learning_rate": 5.4127266355632575e-06,
"loss": 1.0352897644042969,
"step": 510
},
{
"epoch": 1.0199203187250996,
"grad_norm": 2.5583245754241943,
"learning_rate": 5.407176005079938e-06,
"loss": 1.0885701179504395,
"step": 512
},
{
"epoch": 1.0239043824701195,
"grad_norm": 1.0007575750350952,
"learning_rate": 5.401602499719488e-06,
"loss": 1.0486167669296265,
"step": 514
},
{
"epoch": 1.0278884462151394,
"grad_norm": 1.1661553382873535,
"learning_rate": 5.396006179986228e-06,
"loss": 1.0347387790679932,
"step": 516
},
{
"epoch": 1.0318725099601593,
"grad_norm": 0.8863986134529114,
"learning_rate": 5.390387106632143e-06,
"loss": 1.0672526359558105,
"step": 518
},
{
"epoch": 1.0358565737051793,
"grad_norm": 2.13053035736084,
"learning_rate": 5.384745340656227e-06,
"loss": 0.8640899062156677,
"step": 520
},
{
"epoch": 1.0398406374501992,
"grad_norm": 2.6343281269073486,
"learning_rate": 5.379080943303814e-06,
"loss": 0.943762481212616,
"step": 522
},
{
"epoch": 1.043824701195219,
"grad_norm": 1.45510733127594,
"learning_rate": 5.373393976065921e-06,
"loss": 0.9649692177772522,
"step": 524
},
{
"epoch": 1.047808764940239,
"grad_norm": 1.4119848012924194,
"learning_rate": 5.367684500678576e-06,
"loss": 1.1445621252059937,
"step": 526
},
{
"epoch": 1.051792828685259,
"grad_norm": 1.0543644428253174,
"learning_rate": 5.361952579122149e-06,
"loss": 0.9114750027656555,
"step": 528
},
{
"epoch": 1.0557768924302788,
"grad_norm": 1.5039920806884766,
"learning_rate": 5.356198273620678e-06,
"loss": 0.8998257517814636,
"step": 530
},
{
"epoch": 1.0597609561752988,
"grad_norm": 2.6351239681243896,
"learning_rate": 5.350421646641195e-06,
"loss": 0.3897404074668884,
"step": 532
},
{
"epoch": 1.0637450199203187,
"grad_norm": 1.1779015064239502,
"learning_rate": 5.344622760893049e-06,
"loss": 1.2084486484527588,
"step": 534
},
{
"epoch": 1.0677290836653386,
"grad_norm": 0.50465989112854,
"learning_rate": 5.338801679327221e-06,
"loss": 0.48134946823120117,
"step": 536
},
{
"epoch": 1.0717131474103585,
"grad_norm": 6.834875106811523,
"learning_rate": 5.332958465135645e-06,
"loss": 0.8534721732139587,
"step": 538
},
{
"epoch": 1.0756972111553784,
"grad_norm": 0.8775362372398376,
"learning_rate": 5.327093181750519e-06,
"loss": 0.1745588630437851,
"step": 540
},
{
"epoch": 1.0796812749003983,
"grad_norm": 0.8401792049407959,
"learning_rate": 5.3212058928436175e-06,
"loss": 1.0862375497817993,
"step": 542
},
{
"epoch": 1.0836653386454183,
"grad_norm": 1.2075270414352417,
"learning_rate": 5.3152966623256026e-06,
"loss": 1.2837507724761963,
"step": 544
},
{
"epoch": 1.0876494023904382,
"grad_norm": 3.44868803024292,
"learning_rate": 5.309365554345325e-06,
"loss": 0.4348865747451782,
"step": 546
},
{
"epoch": 1.091633466135458,
"grad_norm": 1.060323715209961,
"learning_rate": 5.303412633289133e-06,
"loss": 0.7609821557998657,
"step": 548
},
{
"epoch": 1.095617529880478,
"grad_norm": 0.48030683398246765,
"learning_rate": 5.297437963780171e-06,
"loss": 0.5199949741363525,
"step": 550
},
{
"epoch": 1.099601593625498,
"grad_norm": 0.8254769444465637,
"learning_rate": 5.2914416106776745e-06,
"loss": 1.0883558988571167,
"step": 552
},
{
"epoch": 1.1035856573705178,
"grad_norm": 2.637892246246338,
"learning_rate": 5.2854236390762755e-06,
"loss": 0.48916831612586975,
"step": 554
},
{
"epoch": 1.1075697211155378,
"grad_norm": 1.684272050857544,
"learning_rate": 5.2793841143052855e-06,
"loss": 1.0254663228988647,
"step": 556
},
{
"epoch": 1.1115537848605577,
"grad_norm": 2.17739200592041,
"learning_rate": 5.273323101927994e-06,
"loss": 0.9679847359657288,
"step": 558
},
{
"epoch": 1.1155378486055776,
"grad_norm": 5.525514125823975,
"learning_rate": 5.26724066774095e-06,
"loss": 0.9007784128189087,
"step": 560
},
{
"epoch": 1.1195219123505975,
"grad_norm": 1.1246291399002075,
"learning_rate": 5.261136877773254e-06,
"loss": 1.0599032640457153,
"step": 562
},
{
"epoch": 1.1235059760956174,
"grad_norm": 1.811063289642334,
"learning_rate": 5.255011798285838e-06,
"loss": 1.053318738937378,
"step": 564
},
{
"epoch": 1.1274900398406373,
"grad_norm": 1.0067085027694702,
"learning_rate": 5.248865495770747e-06,
"loss": 1.0161441564559937,
"step": 566
},
{
"epoch": 1.1314741035856573,
"grad_norm": 1.653944730758667,
"learning_rate": 5.242698036950416e-06,
"loss": 1.211927890777588,
"step": 568
},
{
"epoch": 1.1354581673306772,
"grad_norm": 5.520211219787598,
"learning_rate": 5.236509488776946e-06,
"loss": 0.2512112259864807,
"step": 570
},
{
"epoch": 1.139442231075697,
"grad_norm": 0.6854221224784851,
"learning_rate": 5.230299918431381e-06,
"loss": 0.20837584137916565,
"step": 572
},
{
"epoch": 1.1434262948207172,
"grad_norm": 1.0965662002563477,
"learning_rate": 5.224069393322971e-06,
"loss": 0.8550689220428467,
"step": 574
},
{
"epoch": 1.1474103585657371,
"grad_norm": 0.5142279863357544,
"learning_rate": 5.2178179810884465e-06,
"loss": 0.5071516633033752,
"step": 576
},
{
"epoch": 1.151394422310757,
"grad_norm": 1.3928073644638062,
"learning_rate": 5.211545749591285e-06,
"loss": 1.1629210710525513,
"step": 578
},
{
"epoch": 1.155378486055777,
"grad_norm": 4.516799449920654,
"learning_rate": 5.205252766920967e-06,
"loss": 0.615897536277771,
"step": 580
},
{
"epoch": 1.159362549800797,
"grad_norm": 1.9076368808746338,
"learning_rate": 5.198939101392247e-06,
"loss": 0.6484902501106262,
"step": 582
},
{
"epoch": 1.1633466135458168,
"grad_norm": 2.9412710666656494,
"learning_rate": 5.192604821544402e-06,
"loss": 0.22438056766986847,
"step": 584
},
{
"epoch": 1.1673306772908367,
"grad_norm": 0.8736124038696289,
"learning_rate": 5.186249996140492e-06,
"loss": 1.1574631929397583,
"step": 586
},
{
"epoch": 1.1713147410358566,
"grad_norm": 1.56623375415802,
"learning_rate": 5.179874694166617e-06,
"loss": 1.0566999912261963,
"step": 588
},
{
"epoch": 1.1752988047808766,
"grad_norm": 3.406691551208496,
"learning_rate": 5.1734789848311635e-06,
"loss": 1.28257417678833,
"step": 590
},
{
"epoch": 1.1792828685258965,
"grad_norm": 1.163465976715088,
"learning_rate": 5.16706293756405e-06,
"loss": 1.0826280117034912,
"step": 592
},
{
"epoch": 1.1832669322709164,
"grad_norm": 3.0535504817962646,
"learning_rate": 5.160626622015983e-06,
"loss": 1.4529417753219604,
"step": 594
},
{
"epoch": 1.1872509960159363,
"grad_norm": 0.8099126815795898,
"learning_rate": 5.154170108057693e-06,
"loss": 1.1337939500808716,
"step": 596
},
{
"epoch": 1.1912350597609562,
"grad_norm": 3.8160228729248047,
"learning_rate": 5.147693465779179e-06,
"loss": 0.3046616017818451,
"step": 598
},
{
"epoch": 1.1952191235059761,
"grad_norm": 1.2103179693222046,
"learning_rate": 5.141196765488946e-06,
"loss": 0.8739789724349976,
"step": 600
},
{
"epoch": 1.199203187250996,
"grad_norm": 3.3165013790130615,
"learning_rate": 5.134680077713244e-06,
"loss": 0.5771604776382446,
"step": 602
},
{
"epoch": 1.203187250996016,
"grad_norm": 1.3412213325500488,
"learning_rate": 5.1281434731953e-06,
"loss": 1.1980223655700684,
"step": 604
},
{
"epoch": 1.207171314741036,
"grad_norm": 14.288922309875488,
"learning_rate": 5.121587022894554e-06,
"loss": 0.4752068817615509,
"step": 606
},
{
"epoch": 1.2111553784860558,
"grad_norm": 0.9397494196891785,
"learning_rate": 5.115010797985882e-06,
"loss": 0.5870952010154724,
"step": 608
},
{
"epoch": 1.2151394422310757,
"grad_norm": 0.735195517539978,
"learning_rate": 5.108414869858831e-06,
"loss": 1.0899227857589722,
"step": 610
},
{
"epoch": 1.2191235059760956,
"grad_norm": 0.9480123519897461,
"learning_rate": 5.1017993101168374e-06,
"loss": 1.1740434169769287,
"step": 612
},
{
"epoch": 1.2231075697211156,
"grad_norm": 1.5338431596755981,
"learning_rate": 5.095164190576452e-06,
"loss": 1.4396584033966064,
"step": 614
},
{
"epoch": 1.2270916334661355,
"grad_norm": 11.36307144165039,
"learning_rate": 5.0885095832665666e-06,
"loss": 0.3999689817428589,
"step": 616
},
{
"epoch": 1.2310756972111554,
"grad_norm": 1.546046495437622,
"learning_rate": 5.081835560427619e-06,
"loss": 0.9995384812355042,
"step": 618
},
{
"epoch": 1.2350597609561753,
"grad_norm": 1.254744291305542,
"learning_rate": 5.075142194510823e-06,
"loss": 1.0542714595794678,
"step": 620
},
{
"epoch": 1.2390438247011952,
"grad_norm": 2.047104597091675,
"learning_rate": 5.068429558177369e-06,
"loss": 0.9798321723937988,
"step": 622
},
{
"epoch": 1.2430278884462151,
"grad_norm": 1.0986047983169556,
"learning_rate": 5.061697724297646e-06,
"loss": 1.068199872970581,
"step": 624
},
{
"epoch": 1.247011952191235,
"grad_norm": 1.8080114126205444,
"learning_rate": 5.054946765950443e-06,
"loss": 0.9513214230537415,
"step": 626
},
{
"epoch": 1.250996015936255,
"grad_norm": 1.3059947490692139,
"learning_rate": 5.048176756422159e-06,
"loss": 0.7849744558334351,
"step": 628
},
{
"epoch": 1.254980079681275,
"grad_norm": 0.7330244779586792,
"learning_rate": 5.041387769206009e-06,
"loss": 1.0498535633087158,
"step": 630
},
{
"epoch": 1.2589641434262948,
"grad_norm": 5.962719440460205,
"learning_rate": 5.034579878001222e-06,
"loss": 0.2894093096256256,
"step": 632
},
{
"epoch": 1.2629482071713147,
"grad_norm": 4.925858974456787,
"learning_rate": 5.027753156712246e-06,
"loss": 0.36715632677078247,
"step": 634
},
{
"epoch": 1.2669322709163346,
"grad_norm": 3.4104573726654053,
"learning_rate": 5.020907679447936e-06,
"loss": 0.844882071018219,
"step": 636
},
{
"epoch": 1.2709163346613546,
"grad_norm": 1.9961673021316528,
"learning_rate": 5.0140435205207636e-06,
"loss": 0.8165204524993896,
"step": 638
},
{
"epoch": 1.2749003984063745,
"grad_norm": 2.4332053661346436,
"learning_rate": 5.007160754446002e-06,
"loss": 0.3054620623588562,
"step": 640
},
{
"epoch": 1.2788844621513944,
"grad_norm": 0.6446577906608582,
"learning_rate": 5.000259455940913e-06,
"loss": 0.9809127449989319,
"step": 642
},
{
"epoch": 1.2828685258964143,
"grad_norm": 1.2125827074050903,
"learning_rate": 4.9933396999239455e-06,
"loss": 0.7705118060112,
"step": 644
},
{
"epoch": 1.2868525896414342,
"grad_norm": 0.7487397193908691,
"learning_rate": 4.986401561513917e-06,
"loss": 1.0824811458587646,
"step": 646
},
{
"epoch": 1.2908366533864541,
"grad_norm": 1.9600952863693237,
"learning_rate": 4.979445116029199e-06,
"loss": 0.6253088116645813,
"step": 648
},
{
"epoch": 1.294820717131474,
"grad_norm": 1.7079068422317505,
"learning_rate": 4.972470438986896e-06,
"loss": 1.5013655424118042,
"step": 650
},
{
"epoch": 1.298804780876494,
"grad_norm": 1.1496132612228394,
"learning_rate": 4.965477606102033e-06,
"loss": 0.8948485255241394,
"step": 652
},
{
"epoch": 1.302788844621514,
"grad_norm": 1.8034613132476807,
"learning_rate": 4.9584666932867285e-06,
"loss": 0.24509888887405396,
"step": 654
},
{
"epoch": 1.3067729083665338,
"grad_norm": 0.6996963620185852,
"learning_rate": 4.951437776649368e-06,
"loss": 1.0769448280334473,
"step": 656
},
{
"epoch": 1.3107569721115537,
"grad_norm": 0.571880578994751,
"learning_rate": 4.944390932493787e-06,
"loss": 0.8138774633407593,
"step": 658
},
{
"epoch": 1.3147410358565736,
"grad_norm": 0.9483959674835205,
"learning_rate": 4.937326237318431e-06,
"loss": 0.6459387540817261,
"step": 660
},
{
"epoch": 1.3187250996015936,
"grad_norm": 0.9495901465415955,
"learning_rate": 4.930243767815534e-06,
"loss": 1.1829910278320312,
"step": 662
},
{
"epoch": 1.3227091633466135,
"grad_norm": 1.2907254695892334,
"learning_rate": 4.923143600870284e-06,
"loss": 0.5661064386367798,
"step": 664
},
{
"epoch": 1.3266932270916334,
"grad_norm": 1.5633907318115234,
"learning_rate": 4.916025813559983e-06,
"loss": 0.8189319372177124,
"step": 666
},
{
"epoch": 1.3306772908366533,
"grad_norm": 1.9113082885742188,
"learning_rate": 4.908890483153218e-06,
"loss": 0.38532766699790955,
"step": 668
},
{
"epoch": 1.3346613545816732,
"grad_norm": 0.9342731237411499,
"learning_rate": 4.901737687109019e-06,
"loss": 1.0321613550186157,
"step": 670
},
{
"epoch": 1.3386454183266931,
"grad_norm": 3.1048390865325928,
"learning_rate": 4.894567503076014e-06,
"loss": 0.5770927667617798,
"step": 672
},
{
"epoch": 1.342629482071713,
"grad_norm": 0.820324182510376,
"learning_rate": 4.887380008891593e-06,
"loss": 1.0886192321777344,
"step": 674
},
{
"epoch": 1.3466135458167332,
"grad_norm": 1.3751561641693115,
"learning_rate": 4.880175282581059e-06,
"loss": 0.97751384973526,
"step": 676
},
{
"epoch": 1.3505976095617531,
"grad_norm": 0.7426400184631348,
"learning_rate": 4.872953402356782e-06,
"loss": 1.076625943183899,
"step": 678
},
{
"epoch": 1.354581673306773,
"grad_norm": 1.1565395593643188,
"learning_rate": 4.86571444661735e-06,
"loss": 1.0121248960494995,
"step": 680
},
{
"epoch": 1.358565737051793,
"grad_norm": 0.7444704174995422,
"learning_rate": 4.858458493946716e-06,
"loss": 1.0811046361923218,
"step": 682
},
{
"epoch": 1.3625498007968129,
"grad_norm": 1.0144495964050293,
"learning_rate": 4.851185623113349e-06,
"loss": 1.1279915571212769,
"step": 684
},
{
"epoch": 1.3665338645418328,
"grad_norm": 0.7559702396392822,
"learning_rate": 4.843895913069377e-06,
"loss": 1.0942429304122925,
"step": 686
},
{
"epoch": 1.3705179282868527,
"grad_norm": 0.8456003069877625,
"learning_rate": 4.836589442949727e-06,
"loss": 1.0091909170150757,
"step": 688
},
{
"epoch": 1.3745019920318726,
"grad_norm": 0.7402591705322266,
"learning_rate": 4.829266292071268e-06,
"loss": 0.9695682525634766,
"step": 690
},
{
"epoch": 1.3784860557768925,
"grad_norm": 1.815006136894226,
"learning_rate": 4.821926539931952e-06,
"loss": 0.3355652689933777,
"step": 692
},
{
"epoch": 1.3824701195219125,
"grad_norm": 1.0571285486221313,
"learning_rate": 4.814570266209952e-06,
"loss": 1.1081352233886719,
"step": 694
},
{
"epoch": 1.3864541832669324,
"grad_norm": 1.3027758598327637,
"learning_rate": 4.80719755076279e-06,
"loss": 1.0507612228393555,
"step": 696
},
{
"epoch": 1.3904382470119523,
"grad_norm": 0.9322640299797058,
"learning_rate": 4.799808473626476e-06,
"loss": 1.1305720806121826,
"step": 698
},
{
"epoch": 1.3944223107569722,
"grad_norm": 1.1364309787750244,
"learning_rate": 4.792403115014637e-06,
"loss": 0.1400398164987564,
"step": 700
},
{
"epoch": 1.3984063745019921,
"grad_norm": 1.2325326204299927,
"learning_rate": 4.7849815553176476e-06,
"loss": 1.1220163106918335,
"step": 702
},
{
"epoch": 1.402390438247012,
"grad_norm": 1.0282156467437744,
"learning_rate": 4.777543875101757e-06,
"loss": 1.0591614246368408,
"step": 704
},
{
"epoch": 1.406374501992032,
"grad_norm": 0.7515193223953247,
"learning_rate": 4.770090155108215e-06,
"loss": 1.1357749700546265,
"step": 706
},
{
"epoch": 1.4103585657370519,
"grad_norm": 1.05164635181427,
"learning_rate": 4.7626204762523905e-06,
"loss": 0.9992522597312927,
"step": 708
},
{
"epoch": 1.4143426294820718,
"grad_norm": 0.7848185896873474,
"learning_rate": 4.755134919622901e-06,
"loss": 1.0771911144256592,
"step": 710
},
{
"epoch": 1.4183266932270917,
"grad_norm": 2.0036990642547607,
"learning_rate": 4.747633566480726e-06,
"loss": 0.6499975323677063,
"step": 712
},
{
"epoch": 1.4223107569721116,
"grad_norm": 1.088212251663208,
"learning_rate": 4.740116498258328e-06,
"loss": 1.0736567974090576,
"step": 714
},
{
"epoch": 1.4262948207171315,
"grad_norm": 1.0202051401138306,
"learning_rate": 4.73258379655877e-06,
"loss": 1.1317867040634155,
"step": 716
},
{
"epoch": 1.4302788844621515,
"grad_norm": 0.6986392140388489,
"learning_rate": 4.7250355431548244e-06,
"loss": 0.1079653948545456,
"step": 718
},
{
"epoch": 1.4342629482071714,
"grad_norm": 1.2315129041671753,
"learning_rate": 4.717471819988088e-06,
"loss": 1.070616364479065,
"step": 720
},
{
"epoch": 1.4382470119521913,
"grad_norm": 2.786571502685547,
"learning_rate": 4.709892709168096e-06,
"loss": 0.2563188672065735,
"step": 722
},
{
"epoch": 1.4422310756972112,
"grad_norm": 0.634524941444397,
"learning_rate": 4.702298292971422e-06,
"loss": 1.0500552654266357,
"step": 724
},
{
"epoch": 1.4462151394422311,
"grad_norm": 0.7324956059455872,
"learning_rate": 4.6946886538407975e-06,
"loss": 1.092575192451477,
"step": 726
},
{
"epoch": 1.450199203187251,
"grad_norm": 1.8564890623092651,
"learning_rate": 4.687063874384204e-06,
"loss": 0.8989277482032776,
"step": 728
},
{
"epoch": 1.454183266932271,
"grad_norm": 0.6646371483802795,
"learning_rate": 4.679424037373984e-06,
"loss": 1.0014073848724365,
"step": 730
},
{
"epoch": 1.4581673306772909,
"grad_norm": 2.136218786239624,
"learning_rate": 4.671769225745939e-06,
"loss": 1.0647640228271484,
"step": 732
},
{
"epoch": 1.4621513944223108,
"grad_norm": 0.5179296135902405,
"learning_rate": 4.664099522598432e-06,
"loss": 0.12710000574588776,
"step": 734
},
{
"epoch": 1.4661354581673307,
"grad_norm": 0.8502590656280518,
"learning_rate": 4.656415011191484e-06,
"loss": 1.085228681564331,
"step": 736
},
{
"epoch": 1.4701195219123506,
"grad_norm": 1.1160621643066406,
"learning_rate": 4.648715774945869e-06,
"loss": 1.1700797080993652,
"step": 738
},
{
"epoch": 1.4741035856573705,
"grad_norm": 4.530128002166748,
"learning_rate": 4.641001897442209e-06,
"loss": 0.19807864725589752,
"step": 740
},
{
"epoch": 1.4780876494023905,
"grad_norm": 1.182551383972168,
"learning_rate": 4.633273462420069e-06,
"loss": 1.2210465669631958,
"step": 742
},
{
"epoch": 1.4820717131474104,
"grad_norm": 7.367408752441406,
"learning_rate": 4.625530553777045e-06,
"loss": 1.2010120153427124,
"step": 744
},
{
"epoch": 1.4860557768924303,
"grad_norm": 0.8875226378440857,
"learning_rate": 4.617773255567855e-06,
"loss": 1.0283279418945312,
"step": 746
},
{
"epoch": 1.4900398406374502,
"grad_norm": 1.780938744544983,
"learning_rate": 4.610001652003426e-06,
"loss": 1.0667709112167358,
"step": 748
},
{
"epoch": 1.4940239043824701,
"grad_norm": 1.2433035373687744,
"learning_rate": 4.602215827449976e-06,
"loss": 1.0492123365402222,
"step": 750
},
{
"epoch": 1.49800796812749,
"grad_norm": 0.8798750638961792,
"learning_rate": 4.594415866428108e-06,
"loss": 1.0049997568130493,
"step": 752
},
{
"epoch": 1.50199203187251,
"grad_norm": 1.146921992301941,
"learning_rate": 4.586601853611882e-06,
"loss": 0.994334876537323,
"step": 754
},
{
"epoch": 1.5059760956175299,
"grad_norm": 3.869616746902466,
"learning_rate": 4.578773873827901e-06,
"loss": 0.7532044053077698,
"step": 756
},
{
"epoch": 1.5099601593625498,
"grad_norm": 1.7733598947525024,
"learning_rate": 4.57093201205439e-06,
"loss": 1.0711463689804077,
"step": 758
},
{
"epoch": 1.5139442231075697,
"grad_norm": 4.040090560913086,
"learning_rate": 4.563076353420272e-06,
"loss": 1.1239742040634155,
"step": 760
},
{
"epoch": 1.5179282868525896,
"grad_norm": 1.1118268966674805,
"learning_rate": 4.5552069832042455e-06,
"loss": 0.22398273646831512,
"step": 762
},
{
"epoch": 1.5219123505976095,
"grad_norm": 0.8436402678489685,
"learning_rate": 4.547323986833857e-06,
"loss": 1.0367255210876465,
"step": 764
},
{
"epoch": 1.5258964143426295,
"grad_norm": 1.7664424180984497,
"learning_rate": 4.539427449884576e-06,
"loss": 0.7687526941299438,
"step": 766
},
{
"epoch": 1.5298804780876494,
"grad_norm": 1.0416488647460938,
"learning_rate": 4.53151745807886e-06,
"loss": 0.5652468204498291,
"step": 768
},
{
"epoch": 1.5338645418326693,
"grad_norm": 1.3710383176803589,
"learning_rate": 4.523594097285234e-06,
"loss": 1.0875599384307861,
"step": 770
},
{
"epoch": 1.5378486055776892,
"grad_norm": 1.310120701789856,
"learning_rate": 4.51565745351735e-06,
"loss": 0.8149851560592651,
"step": 772
},
{
"epoch": 1.5418326693227091,
"grad_norm": 1.0462884902954102,
"learning_rate": 4.507707612933059e-06,
"loss": 1.044182300567627,
"step": 774
},
{
"epoch": 1.545816733067729,
"grad_norm": 2.2944624423980713,
"learning_rate": 4.4997446618334664e-06,
"loss": 1.1731159687042236,
"step": 776
},
{
"epoch": 1.549800796812749,
"grad_norm": 6.394598960876465,
"learning_rate": 4.491768686662005e-06,
"loss": 0.5516869425773621,
"step": 778
},
{
"epoch": 1.5537848605577689,
"grad_norm": 2.329699754714966,
"learning_rate": 4.483779774003498e-06,
"loss": 0.5405542850494385,
"step": 780
},
{
"epoch": 1.5577689243027888,
"grad_norm": 0.42006587982177734,
"learning_rate": 4.475778010583205e-06,
"loss": 0.20549674332141876,
"step": 782
},
{
"epoch": 1.5617529880478087,
"grad_norm": 2.271444082260132,
"learning_rate": 4.467763483265897e-06,
"loss": 0.9095351696014404,
"step": 784
},
{
"epoch": 1.5657370517928286,
"grad_norm": 1.6157774925231934,
"learning_rate": 4.459736279054901e-06,
"loss": 1.3291853666305542,
"step": 786
},
{
"epoch": 1.5697211155378485,
"grad_norm": 4.978515625,
"learning_rate": 4.451696485091164e-06,
"loss": 0.7586594223976135,
"step": 788
},
{
"epoch": 1.5737051792828685,
"grad_norm": 1.2765519618988037,
"learning_rate": 4.4436441886523025e-06,
"loss": 1.1358023881912231,
"step": 790
},
{
"epoch": 1.5776892430278884,
"grad_norm": 8.105411529541016,
"learning_rate": 4.435579477151655e-06,
"loss": 0.8000907897949219,
"step": 792
},
{
"epoch": 1.5816733067729083,
"grad_norm": 0.7435089349746704,
"learning_rate": 4.427502438137337e-06,
"loss": 1.073531150817871,
"step": 794
},
{
"epoch": 1.5856573705179282,
"grad_norm": 0.9908289313316345,
"learning_rate": 4.419413159291284e-06,
"loss": 1.011960744857788,
"step": 796
},
{
"epoch": 1.5896414342629481,
"grad_norm": 1.1573151350021362,
"learning_rate": 4.411311728428307e-06,
"loss": 0.8743354082107544,
"step": 798
},
{
"epoch": 1.593625498007968,
"grad_norm": 6.756656646728516,
"learning_rate": 4.403198233495133e-06,
"loss": 0.32545700669288635,
"step": 800
},
{
"epoch": 1.597609561752988,
"grad_norm": 1.2311936616897583,
"learning_rate": 4.395072762569457e-06,
"loss": 0.9778568744659424,
"step": 802
},
{
"epoch": 1.6015936254980079,
"grad_norm": 3.5830166339874268,
"learning_rate": 4.386935403858977e-06,
"loss": 1.0981725454330444,
"step": 804
},
{
"epoch": 1.6055776892430278,
"grad_norm": 0.9334324598312378,
"learning_rate": 4.378786245700443e-06,
"loss": 1.3115934133529663,
"step": 806
},
{
"epoch": 1.6095617529880477,
"grad_norm": 0.8329153656959534,
"learning_rate": 4.370625376558698e-06,
"loss": 1.028051733970642,
"step": 808
},
{
"epoch": 1.6135458167330676,
"grad_norm": 1.030179500579834,
"learning_rate": 4.362452885025713e-06,
"loss": 0.9735574722290039,
"step": 810
},
{
"epoch": 1.6175298804780875,
"grad_norm": 6.181675434112549,
"learning_rate": 4.35426885981963e-06,
"loss": 0.42590758204460144,
"step": 812
},
{
"epoch": 1.6215139442231075,
"grad_norm": 3.902128219604492,
"learning_rate": 4.346073389783799e-06,
"loss": 0.7486605048179626,
"step": 814
},
{
"epoch": 1.6254980079681274,
"grad_norm": 0.6811983585357666,
"learning_rate": 4.337866563885808e-06,
"loss": 0.2310914248228073,
"step": 816
},
{
"epoch": 1.6294820717131473,
"grad_norm": 0.7712540030479431,
"learning_rate": 4.329648471216523e-06,
"loss": 1.112511157989502,
"step": 818
},
{
"epoch": 1.6334661354581672,
"grad_norm": 1.0290017127990723,
"learning_rate": 4.321419200989117e-06,
"loss": 0.287282794713974,
"step": 820
},
{
"epoch": 1.6374501992031871,
"grad_norm": 2.3703389167785645,
"learning_rate": 4.313178842538107e-06,
"loss": 0.7247891426086426,
"step": 822
},
{
"epoch": 1.641434262948207,
"grad_norm": 1.919006586074829,
"learning_rate": 4.304927485318375e-06,
"loss": 0.21648265421390533,
"step": 824
},
{
"epoch": 1.645418326693227,
"grad_norm": 1.1350631713867188,
"learning_rate": 4.296665218904207e-06,
"loss": 1.0472216606140137,
"step": 826
},
{
"epoch": 1.6494023904382469,
"grad_norm": 0.42043375968933105,
"learning_rate": 4.288392132988313e-06,
"loss": 0.40000608563423157,
"step": 828
},
{
"epoch": 1.6533864541832668,
"grad_norm": 1.6645681858062744,
"learning_rate": 4.280108317380859e-06,
"loss": 0.4568580985069275,
"step": 830
},
{
"epoch": 1.6573705179282867,
"grad_norm": 1.5291117429733276,
"learning_rate": 4.27181386200849e-06,
"loss": 0.9923895597457886,
"step": 832
},
{
"epoch": 1.6613545816733066,
"grad_norm": 1.294873833656311,
"learning_rate": 4.263508856913346e-06,
"loss": 0.994326651096344,
"step": 834
},
{
"epoch": 1.6653386454183265,
"grad_norm": 2.7709615230560303,
"learning_rate": 4.2551933922521e-06,
"loss": 0.8918184041976929,
"step": 836
},
{
"epoch": 1.6693227091633465,
"grad_norm": 1.2106887102127075,
"learning_rate": 4.246867558294967e-06,
"loss": 1.1439393758773804,
"step": 838
},
{
"epoch": 1.6733067729083664,
"grad_norm": 1.091464877128601,
"learning_rate": 4.2385314454247275e-06,
"loss": 1.0264958143234253,
"step": 840
},
{
"epoch": 1.6772908366533863,
"grad_norm": 1.5609543323516846,
"learning_rate": 4.230185144135749e-06,
"loss": 0.8460158109664917,
"step": 842
},
{
"epoch": 1.6812749003984062,
"grad_norm": 0.8120943903923035,
"learning_rate": 4.221828745033002e-06,
"loss": 1.0981191396713257,
"step": 844
},
{
"epoch": 1.6852589641434261,
"grad_norm": 1.0494468212127686,
"learning_rate": 4.2134623388310706e-06,
"loss": 0.3851274847984314,
"step": 846
},
{
"epoch": 1.6892430278884463,
"grad_norm": 1.039975643157959,
"learning_rate": 4.20508601635318e-06,
"loss": 0.7145401239395142,
"step": 848
},
{
"epoch": 1.6932270916334662,
"grad_norm": 1.385925054550171,
"learning_rate": 4.1966998685302e-06,
"loss": 1.1264657974243164,
"step": 850
},
{
"epoch": 1.697211155378486,
"grad_norm": 0.7857804894447327,
"learning_rate": 4.18830398639966e-06,
"loss": 1.1105672121047974,
"step": 852
},
{
"epoch": 1.701195219123506,
"grad_norm": 1.1625089645385742,
"learning_rate": 4.179898461104764e-06,
"loss": 1.078861117362976,
"step": 854
},
{
"epoch": 1.705179282868526,
"grad_norm": 0.9041614532470703,
"learning_rate": 4.1714833838934006e-06,
"loss": 1.0313189029693604,
"step": 856
},
{
"epoch": 1.7091633466135459,
"grad_norm": 0.8065091967582703,
"learning_rate": 4.163058846117148e-06,
"loss": 0.34671998023986816,
"step": 858
},
{
"epoch": 1.7131474103585658,
"grad_norm": 1.2888925075531006,
"learning_rate": 4.154624939230289e-06,
"loss": 1.031374454498291,
"step": 860
},
{
"epoch": 1.7171314741035857,
"grad_norm": 0.8425755500793457,
"learning_rate": 4.146181754788813e-06,
"loss": 1.0426599979400635,
"step": 862
},
{
"epoch": 1.7211155378486056,
"grad_norm": 1.4209198951721191,
"learning_rate": 4.13772938444942e-06,
"loss": 0.6024843454360962,
"step": 864
},
{
"epoch": 1.7250996015936255,
"grad_norm": 1.0409010648727417,
"learning_rate": 4.129267919968536e-06,
"loss": 0.4379670023918152,
"step": 866
},
{
"epoch": 1.7290836653386454,
"grad_norm": 1.4887381792068481,
"learning_rate": 4.120797453201309e-06,
"loss": 0.8161473274230957,
"step": 868
},
{
"epoch": 1.7330677290836654,
"grad_norm": 12.129778861999512,
"learning_rate": 4.112318076100608e-06,
"loss": 0.22986909747123718,
"step": 870
},
{
"epoch": 1.7370517928286853,
"grad_norm": 2.050231456756592,
"learning_rate": 4.103829880716036e-06,
"loss": 0.5155397057533264,
"step": 872
},
{
"epoch": 1.7410358565737052,
"grad_norm": 3.127119541168213,
"learning_rate": 4.0953329591929204e-06,
"loss": 0.42298442125320435,
"step": 874
},
{
"epoch": 1.745019920318725,
"grad_norm": 1.210281491279602,
"learning_rate": 4.08682740377132e-06,
"loss": 1.0322401523590088,
"step": 876
},
{
"epoch": 1.749003984063745,
"grad_norm": 0.7078624367713928,
"learning_rate": 4.0783133067850185e-06,
"loss": 1.0741485357284546,
"step": 878
},
{
"epoch": 1.752988047808765,
"grad_norm": 0.9627106189727783,
"learning_rate": 4.069790760660525e-06,
"loss": 0.08892940729856491,
"step": 880
},
{
"epoch": 1.7569721115537849,
"grad_norm": 2.872758388519287,
"learning_rate": 4.06125985791607e-06,
"loss": 1.2808747291564941,
"step": 882
},
{
"epoch": 1.7609561752988048,
"grad_norm": 1.4781732559204102,
"learning_rate": 4.0527206911606025e-06,
"loss": 1.6314507722854614,
"step": 884
},
{
"epoch": 1.7649402390438247,
"grad_norm": 0.4292491674423218,
"learning_rate": 4.044173353092779e-06,
"loss": 0.2118670642375946,
"step": 886
},
{
"epoch": 1.7689243027888446,
"grad_norm": 1.0890276432037354,
"learning_rate": 4.035617936499967e-06,
"loss": 1.1356523036956787,
"step": 888
},
{
"epoch": 1.7729083665338645,
"grad_norm": 1.0168540477752686,
"learning_rate": 4.0270545342572265e-06,
"loss": 0.9910404086112976,
"step": 890
},
{
"epoch": 1.7768924302788844,
"grad_norm": 0.8853142261505127,
"learning_rate": 4.018483239326312e-06,
"loss": 0.9891409277915955,
"step": 892
},
{
"epoch": 1.7808764940239044,
"grad_norm": 0.7593168020248413,
"learning_rate": 4.009904144754655e-06,
"loss": 1.1023067235946655,
"step": 894
},
{
"epoch": 1.7848605577689243,
"grad_norm": 3.0125675201416016,
"learning_rate": 4.00131734367436e-06,
"loss": 0.9771660566329956,
"step": 896
},
{
"epoch": 1.7888446215139442,
"grad_norm": 1.7285772562026978,
"learning_rate": 3.99272292930119e-06,
"loss": 0.5689830780029297,
"step": 898
},
{
"epoch": 1.792828685258964,
"grad_norm": 0.7325118184089661,
"learning_rate": 3.984120994933558e-06,
"loss": 1.026572823524475,
"step": 900
},
{
"epoch": 1.796812749003984,
"grad_norm": 1.3268436193466187,
"learning_rate": 3.975511633951506e-06,
"loss": 0.5517056584358215,
"step": 902
},
{
"epoch": 1.800796812749004,
"grad_norm": 0.8117510676383972,
"learning_rate": 3.966894939815702e-06,
"loss": 0.3609198033809662,
"step": 904
},
{
"epoch": 1.8047808764940239,
"grad_norm": 1.122198224067688,
"learning_rate": 3.958271006066421e-06,
"loss": 0.9236494898796082,
"step": 906
},
{
"epoch": 1.8087649402390438,
"grad_norm": 2.9102554321289062,
"learning_rate": 3.949639926322527e-06,
"loss": 0.8726416230201721,
"step": 908
},
{
"epoch": 1.812749003984064,
"grad_norm": 13.756661415100098,
"learning_rate": 3.941001794280458e-06,
"loss": 1.0099586248397827,
"step": 910
},
{
"epoch": 1.8167330677290838,
"grad_norm": 3.1848342418670654,
"learning_rate": 3.932356703713212e-06,
"loss": 0.25727564096450806,
"step": 912
},
{
"epoch": 1.8207171314741037,
"grad_norm": 1.389024019241333,
"learning_rate": 3.923704748469326e-06,
"loss": 1.0060839653015137,
"step": 914
},
{
"epoch": 1.8247011952191237,
"grad_norm": 0.8609137535095215,
"learning_rate": 3.915046022471857e-06,
"loss": 1.0158603191375732,
"step": 916
},
{
"epoch": 1.8286852589641436,
"grad_norm": 0.8087533116340637,
"learning_rate": 3.906380619717363e-06,
"loss": 1.0479439496994019,
"step": 918
},
{
"epoch": 1.8326693227091635,
"grad_norm": 3.3105380535125732,
"learning_rate": 3.897708634274886e-06,
"loss": 0.36958053708076477,
"step": 920
},
{
"epoch": 1.8366533864541834,
"grad_norm": 1.9331108331680298,
"learning_rate": 3.889030160284922e-06,
"loss": 0.35556235909461975,
"step": 922
},
{
"epoch": 1.8406374501992033,
"grad_norm": 0.7566105723381042,
"learning_rate": 3.88034529195841e-06,
"loss": 1.1607534885406494,
"step": 924
},
{
"epoch": 1.8446215139442232,
"grad_norm": 0.2870655953884125,
"learning_rate": 3.871654123575704e-06,
"loss": 0.14478978514671326,
"step": 926
},
{
"epoch": 1.8486055776892432,
"grad_norm": 0.3280292749404907,
"learning_rate": 3.8629567494855445e-06,
"loss": 0.0896715372800827,
"step": 928
},
{
"epoch": 1.852589641434263,
"grad_norm": 1.354030728340149,
"learning_rate": 3.854253264104045e-06,
"loss": 1.078214168548584,
"step": 930
},
{
"epoch": 1.856573705179283,
"grad_norm": 1.015066146850586,
"learning_rate": 3.845543761913657e-06,
"loss": 1.114577293395996,
"step": 932
},
{
"epoch": 1.860557768924303,
"grad_norm": 0.39395958185195923,
"learning_rate": 3.836828337462152e-06,
"loss": 0.5930612087249756,
"step": 934
},
{
"epoch": 1.8645418326693228,
"grad_norm": 3.372042417526245,
"learning_rate": 3.82810708536159e-06,
"loss": 0.34988486766815186,
"step": 936
},
{
"epoch": 1.8685258964143427,
"grad_norm": 1.3925652503967285,
"learning_rate": 3.819380100287294e-06,
"loss": 1.0657780170440674,
"step": 938
},
{
"epoch": 1.8725099601593627,
"grad_norm": 1.6448031663894653,
"learning_rate": 3.810647476976824e-06,
"loss": 1.0907565355300903,
"step": 940
},
{
"epoch": 1.8764940239043826,
"grad_norm": 0.7891445159912109,
"learning_rate": 3.801909310228945e-06,
"loss": 0.35766711831092834,
"step": 942
},
{
"epoch": 1.8804780876494025,
"grad_norm": 1.724031686782837,
"learning_rate": 3.7931656949026028e-06,
"loss": 1.7528119087219238,
"step": 944
},
{
"epoch": 1.8844621513944224,
"grad_norm": 1.0190646648406982,
"learning_rate": 3.784416725915887e-06,
"loss": 0.706551194190979,
"step": 946
},
{
"epoch": 1.8884462151394423,
"grad_norm": 3.7524330615997314,
"learning_rate": 3.7756624982450105e-06,
"loss": 1.3365905284881592,
"step": 948
},
{
"epoch": 1.8924302788844622,
"grad_norm": 1.1480021476745605,
"learning_rate": 3.7669031069232684e-06,
"loss": 0.7811166048049927,
"step": 950
},
{
"epoch": 1.8964143426294822,
"grad_norm": 0.7147510647773743,
"learning_rate": 3.7581386470400106e-06,
"loss": 1.0117745399475098,
"step": 952
},
{
"epoch": 1.900398406374502,
"grad_norm": 2.004282236099243,
"learning_rate": 3.7493692137396153e-06,
"loss": 0.5164535045623779,
"step": 954
},
{
"epoch": 1.904382470119522,
"grad_norm": 0.7438123822212219,
"learning_rate": 3.7405949022204435e-06,
"loss": 1.0378838777542114,
"step": 956
},
{
"epoch": 1.908366533864542,
"grad_norm": 3.5988733768463135,
"learning_rate": 3.731815807733818e-06,
"loss": 0.6023346781730652,
"step": 958
},
{
"epoch": 1.9123505976095618,
"grad_norm": 2.4353888034820557,
"learning_rate": 3.723032025582982e-06,
"loss": 0.5875221490859985,
"step": 960
},
{
"epoch": 1.9163346613545817,
"grad_norm": 1.3933720588684082,
"learning_rate": 3.7142436511220676e-06,
"loss": 0.1774052381515503,
"step": 962
},
{
"epoch": 1.9203187250996017,
"grad_norm": 2.9852864742279053,
"learning_rate": 3.7054507797550564e-06,
"loss": 1.3314721584320068,
"step": 964
},
{
"epoch": 1.9243027888446216,
"grad_norm": 0.7507312893867493,
"learning_rate": 3.6966535069347523e-06,
"loss": 1.0096935033798218,
"step": 966
},
{
"epoch": 1.9282868525896415,
"grad_norm": 1.7996251583099365,
"learning_rate": 3.6878519281617354e-06,
"loss": 1.0307931900024414,
"step": 968
},
{
"epoch": 1.9322709163346614,
"grad_norm": 1.16811203956604,
"learning_rate": 3.6790461389833317e-06,
"loss": 0.9180192351341248,
"step": 970
},
{
"epoch": 1.9362549800796813,
"grad_norm": 0.7789274454116821,
"learning_rate": 3.670236234992576e-06,
"loss": 1.1056816577911377,
"step": 972
},
{
"epoch": 1.9402390438247012,
"grad_norm": 0.8071714639663696,
"learning_rate": 3.661422311827169e-06,
"loss": 1.061263084411621,
"step": 974
},
{
"epoch": 1.9442231075697212,
"grad_norm": 2.5436365604400635,
"learning_rate": 3.652604465168444e-06,
"loss": 0.9830687642097473,
"step": 976
},
{
"epoch": 1.948207171314741,
"grad_norm": 0.7201181054115295,
"learning_rate": 3.6437827907403273e-06,
"loss": 1.0000416040420532,
"step": 978
},
{
"epoch": 1.952191235059761,
"grad_norm": 0.7345990538597107,
"learning_rate": 3.6349573843082966e-06,
"loss": 1.0285298824310303,
"step": 980
},
{
"epoch": 1.956175298804781,
"grad_norm": 0.6029013395309448,
"learning_rate": 3.6261283416783447e-06,
"loss": 0.3689904808998108,
"step": 982
},
{
"epoch": 1.9601593625498008,
"grad_norm": 5.31935977935791,
"learning_rate": 3.6172957586959372e-06,
"loss": 1.075624704360962,
"step": 984
},
{
"epoch": 1.9641434262948207,
"grad_norm": 2.391829252243042,
"learning_rate": 3.6084597312449725e-06,
"loss": 0.8474624156951904,
"step": 986
},
{
"epoch": 1.9681274900398407,
"grad_norm": 5.1822967529296875,
"learning_rate": 3.599620355246742e-06,
"loss": 0.31603577733039856,
"step": 988
},
{
"epoch": 1.9721115537848606,
"grad_norm": 1.8022582530975342,
"learning_rate": 3.5907777266588856e-06,
"loss": 0.911726713180542,
"step": 990
},
{
"epoch": 1.9760956175298805,
"grad_norm": 0.7391871213912964,
"learning_rate": 3.5819319414743555e-06,
"loss": 1.0421473979949951,
"step": 992
},
{
"epoch": 1.9800796812749004,
"grad_norm": 1.211188554763794,
"learning_rate": 3.573083095720369e-06,
"loss": 1.0375580787658691,
"step": 994
},
{
"epoch": 1.9840637450199203,
"grad_norm": 6.231225967407227,
"learning_rate": 3.5642312854573686e-06,
"loss": 0.5392568707466125,
"step": 996
},
{
"epoch": 1.9880478087649402,
"grad_norm": 1.1782855987548828,
"learning_rate": 3.5553766067779785e-06,
"loss": 1.188450813293457,
"step": 998
},
{
"epoch": 1.9920318725099602,
"grad_norm": 0.6256092190742493,
"learning_rate": 3.546519155805962e-06,
"loss": 1.0698131322860718,
"step": 1000
},
{
"epoch": 1.99601593625498,
"grad_norm": 0.89486163854599,
"learning_rate": 3.5376590286951774e-06,
"loss": 1.02101469039917,
"step": 1002
},
{
"epoch": 2.0,
"grad_norm": 0.5744116902351379,
"learning_rate": 3.5287963216285337e-06,
"loss": 0.08481757342815399,
"step": 1004
},
{
"epoch": 2.00398406374502,
"grad_norm": 0.4444674849510193,
"learning_rate": 3.519931130816947e-06,
"loss": 0.14744052290916443,
"step": 1006
},
{
"epoch": 2.00796812749004,
"grad_norm": 1.0349431037902832,
"learning_rate": 3.511063552498299e-06,
"loss": 0.894745945930481,
"step": 1008
},
{
"epoch": 2.0119521912350598,
"grad_norm": 0.5005489587783813,
"learning_rate": 3.502193682936385e-06,
"loss": 0.29803839325904846,
"step": 1010
},
{
"epoch": 2.0159362549800797,
"grad_norm": 1.0027674436569214,
"learning_rate": 3.493321618419877e-06,
"loss": 0.6132505536079407,
"step": 1012
},
{
"epoch": 2.0199203187250996,
"grad_norm": 0.722247302532196,
"learning_rate": 3.484447455261272e-06,
"loss": 0.8650059700012207,
"step": 1014
},
{
"epoch": 2.0239043824701195,
"grad_norm": 0.1125183254480362,
"learning_rate": 3.4755712897958524e-06,
"loss": 0.06626415252685547,
"step": 1016
},
{
"epoch": 2.0278884462151394,
"grad_norm": 2.244713306427002,
"learning_rate": 3.4666932183806345e-06,
"loss": 0.6729474663734436,
"step": 1018
},
{
"epoch": 2.0318725099601593,
"grad_norm": 0.8710299730300903,
"learning_rate": 3.4578133373933263e-06,
"loss": 0.8701741099357605,
"step": 1020
},
{
"epoch": 2.0358565737051793,
"grad_norm": 0.8872413635253906,
"learning_rate": 3.4489317432312796e-06,
"loss": 0.8716042041778564,
"step": 1022
},
{
"epoch": 2.039840637450199,
"grad_norm": 1.219373106956482,
"learning_rate": 3.4400485323104426e-06,
"loss": 0.34580960869789124,
"step": 1024
},
{
"epoch": 2.043824701195219,
"grad_norm": 1.7070385217666626,
"learning_rate": 3.431163801064317e-06,
"loss": 0.3066391348838806,
"step": 1026
},
{
"epoch": 2.047808764940239,
"grad_norm": 3.4397644996643066,
"learning_rate": 3.422277645942907e-06,
"loss": 0.3099243938922882,
"step": 1028
},
{
"epoch": 2.051792828685259,
"grad_norm": 20.93805694580078,
"learning_rate": 3.413390163411675e-06,
"loss": 0.6691966652870178,
"step": 1030
},
{
"epoch": 2.055776892430279,
"grad_norm": 1.0854685306549072,
"learning_rate": 3.4045014499504923e-06,
"loss": 0.8780809640884399,
"step": 1032
},
{
"epoch": 2.0597609561752988,
"grad_norm": 11.395671844482422,
"learning_rate": 3.3956116020525924e-06,
"loss": 0.2683337926864624,
"step": 1034
},
{
"epoch": 2.0637450199203187,
"grad_norm": 2.4742014408111572,
"learning_rate": 3.3867207162235272e-06,
"loss": 0.7748890519142151,
"step": 1036
},
{
"epoch": 2.0677290836653386,
"grad_norm": 2.432234525680542,
"learning_rate": 3.377828888980112e-06,
"loss": 0.8894884586334229,
"step": 1038
},
{
"epoch": 2.0717131474103585,
"grad_norm": 2.468468427658081,
"learning_rate": 3.3689362168493844e-06,
"loss": 0.6649755239486694,
"step": 1040
},
{
"epoch": 2.0756972111553784,
"grad_norm": 0.6127830147743225,
"learning_rate": 3.3600427963675516e-06,
"loss": 0.8452335596084595,
"step": 1042
},
{
"epoch": 2.0796812749003983,
"grad_norm": 1.180112361907959,
"learning_rate": 3.3511487240789483e-06,
"loss": 0.929725170135498,
"step": 1044
},
{
"epoch": 2.0836653386454183,
"grad_norm": 0.738735020160675,
"learning_rate": 3.3422540965349806e-06,
"loss": 0.8923982381820679,
"step": 1046
},
{
"epoch": 2.087649402390438,
"grad_norm": 3.025284767150879,
"learning_rate": 3.333359010293085e-06,
"loss": 0.9607875347137451,
"step": 1048
},
{
"epoch": 2.091633466135458,
"grad_norm": 0.7996847033500671,
"learning_rate": 3.3244635619156786e-06,
"loss": 0.4797319769859314,
"step": 1050
},
{
"epoch": 2.095617529880478,
"grad_norm": 10.094463348388672,
"learning_rate": 3.315567847969106e-06,
"loss": 0.2578115165233612,
"step": 1052
},
{
"epoch": 2.099601593625498,
"grad_norm": 0.6219993233680725,
"learning_rate": 3.306671965022598e-06,
"loss": 0.315256267786026,
"step": 1054
},
{
"epoch": 2.103585657370518,
"grad_norm": 1.1088297367095947,
"learning_rate": 3.2977760096472184e-06,
"loss": 0.9286193251609802,
"step": 1056
},
{
"epoch": 2.1075697211155378,
"grad_norm": 1.1025009155273438,
"learning_rate": 3.2888800784148174e-06,
"loss": 0.7976268529891968,
"step": 1058
},
{
"epoch": 2.1115537848605577,
"grad_norm": 0.7398043274879456,
"learning_rate": 3.2799842678969835e-06,
"loss": 0.3379042148590088,
"step": 1060
},
{
"epoch": 2.1155378486055776,
"grad_norm": 1.8223795890808105,
"learning_rate": 3.2710886746639964e-06,
"loss": 0.29785844683647156,
"step": 1062
},
{
"epoch": 2.1195219123505975,
"grad_norm": 0.9167846441268921,
"learning_rate": 3.262193395283773e-06,
"loss": 0.10107379406690598,
"step": 1064
},
{
"epoch": 2.1235059760956174,
"grad_norm": 6.6176300048828125,
"learning_rate": 3.2532985263208266e-06,
"loss": 0.4440305829048157,
"step": 1066
},
{
"epoch": 2.1274900398406373,
"grad_norm": 0.8213241696357727,
"learning_rate": 3.244404164335213e-06,
"loss": 0.8258364796638489,
"step": 1068
},
{
"epoch": 2.1314741035856573,
"grad_norm": 2.339560031890869,
"learning_rate": 3.2355104058814874e-06,
"loss": 0.9001627564430237,
"step": 1070
},
{
"epoch": 2.135458167330677,
"grad_norm": 1.07158625125885,
"learning_rate": 3.226617347507649e-06,
"loss": 0.3943869471549988,
"step": 1072
},
{
"epoch": 2.139442231075697,
"grad_norm": 0.9587336182594299,
"learning_rate": 3.2177250857541007e-06,
"loss": 1.0341042280197144,
"step": 1074
},
{
"epoch": 2.143426294820717,
"grad_norm": 0.8883066773414612,
"learning_rate": 3.208833717152594e-06,
"loss": 0.19238322973251343,
"step": 1076
},
{
"epoch": 2.147410358565737,
"grad_norm": 1.4621644020080566,
"learning_rate": 3.199943338225189e-06,
"loss": 0.7075263261795044,
"step": 1078
},
{
"epoch": 2.151394422310757,
"grad_norm": 0.9659390449523926,
"learning_rate": 3.1910540454832e-06,
"loss": 0.9844989776611328,
"step": 1080
},
{
"epoch": 2.1553784860557768,
"grad_norm": 0.9126376509666443,
"learning_rate": 3.1821659354261478e-06,
"loss": 0.8773077130317688,
"step": 1082
},
{
"epoch": 2.1593625498007967,
"grad_norm": 1.5047764778137207,
"learning_rate": 3.173279104540719e-06,
"loss": 0.7283194065093994,
"step": 1084
},
{
"epoch": 2.1633466135458166,
"grad_norm": 2.4488370418548584,
"learning_rate": 3.164393649299711e-06,
"loss": 1.0191715955734253,
"step": 1086
},
{
"epoch": 2.1673306772908365,
"grad_norm": 0.6298505663871765,
"learning_rate": 3.155509666160986e-06,
"loss": 0.19404178857803345,
"step": 1088
},
{
"epoch": 2.1713147410358564,
"grad_norm": 3.298346519470215,
"learning_rate": 3.1466272515664287e-06,
"loss": 0.4330817759037018,
"step": 1090
},
{
"epoch": 2.1752988047808763,
"grad_norm": 1.4736095666885376,
"learning_rate": 3.137746501940894e-06,
"loss": 0.8412344455718994,
"step": 1092
},
{
"epoch": 2.1792828685258963,
"grad_norm": 1.3612383604049683,
"learning_rate": 3.1288675136911653e-06,
"loss": 0.7719582915306091,
"step": 1094
},
{
"epoch": 2.183266932270916,
"grad_norm": 1.6760456562042236,
"learning_rate": 3.1199903832049025e-06,
"loss": 0.8681936264038086,
"step": 1096
},
{
"epoch": 2.187250996015936,
"grad_norm": 0.9944242238998413,
"learning_rate": 3.1111152068495982e-06,
"loss": 0.8590313196182251,
"step": 1098
},
{
"epoch": 2.191235059760956,
"grad_norm": 1.1411633491516113,
"learning_rate": 3.102242080971531e-06,
"loss": 0.8502429723739624,
"step": 1100
},
{
"epoch": 2.195219123505976,
"grad_norm": 1.0093145370483398,
"learning_rate": 3.0933711018947217e-06,
"loss": 0.8326080441474915,
"step": 1102
},
{
"epoch": 2.199203187250996,
"grad_norm": 1.3518801927566528,
"learning_rate": 3.084502365919887e-06,
"loss": 0.31851112842559814,
"step": 1104
},
{
"epoch": 2.2031872509960158,
"grad_norm": 0.8486732840538025,
"learning_rate": 3.0756359693233897e-06,
"loss": 0.12462817877531052,
"step": 1106
},
{
"epoch": 2.2071713147410357,
"grad_norm": 3.158237934112549,
"learning_rate": 3.066772008356201e-06,
"loss": 0.7065569162368774,
"step": 1108
},
{
"epoch": 2.2111553784860556,
"grad_norm": 1.6595673561096191,
"learning_rate": 3.057910579242848e-06,
"loss": 0.32911333441734314,
"step": 1110
},
{
"epoch": 2.2151394422310755,
"grad_norm": 0.9766960740089417,
"learning_rate": 3.0490517781803748e-06,
"loss": 0.8282409906387329,
"step": 1112
},
{
"epoch": 2.2191235059760954,
"grad_norm": 2.551868438720703,
"learning_rate": 3.040195701337296e-06,
"loss": 0.8591130971908569,
"step": 1114
},
{
"epoch": 2.2231075697211153,
"grad_norm": 2.4142255783081055,
"learning_rate": 3.0313424448525513e-06,
"loss": 0.6863746643066406,
"step": 1116
},
{
"epoch": 2.2270916334661353,
"grad_norm": 1.8660197257995605,
"learning_rate": 3.022492104834467e-06,
"loss": 0.867939829826355,
"step": 1118
},
{
"epoch": 2.231075697211155,
"grad_norm": 1.012052297592163,
"learning_rate": 3.013644777359706e-06,
"loss": 0.862476110458374,
"step": 1120
},
{
"epoch": 2.235059760956175,
"grad_norm": 1.3242058753967285,
"learning_rate": 3.004800558472228e-06,
"loss": 0.8478327393531799,
"step": 1122
},
{
"epoch": 2.239043824701195,
"grad_norm": 1.5202715396881104,
"learning_rate": 2.995959544182248e-06,
"loss": 0.8780950307846069,
"step": 1124
},
{
"epoch": 2.243027888446215,
"grad_norm": 1.5164873600006104,
"learning_rate": 2.9871218304651926e-06,
"loss": 0.8773269653320312,
"step": 1126
},
{
"epoch": 2.247011952191235,
"grad_norm": 12.062283515930176,
"learning_rate": 2.9782875132606573e-06,
"loss": 0.5782788991928101,
"step": 1128
},
{
"epoch": 2.2509960159362548,
"grad_norm": 0.4626627266407013,
"learning_rate": 2.969456688471368e-06,
"loss": 0.17795492708683014,
"step": 1130
},
{
"epoch": 2.2549800796812747,
"grad_norm": 8.622909545898438,
"learning_rate": 2.960629451962137e-06,
"loss": 0.876864492893219,
"step": 1132
},
{
"epoch": 2.2589641434262946,
"grad_norm": 2.5603370666503906,
"learning_rate": 2.9518058995588217e-06,
"loss": 0.5039679408073425,
"step": 1134
},
{
"epoch": 2.2629482071713145,
"grad_norm": 1.9047883749008179,
"learning_rate": 2.9429861270472884e-06,
"loss": 0.8298702836036682,
"step": 1136
},
{
"epoch": 2.2669322709163344,
"grad_norm": 1.333377480506897,
"learning_rate": 2.9341702301723704e-06,
"loss": 0.8177191019058228,
"step": 1138
},
{
"epoch": 2.2709163346613543,
"grad_norm": 0.8072558641433716,
"learning_rate": 2.9253583046368243e-06,
"loss": 0.8483671545982361,
"step": 1140
},
{
"epoch": 2.2749003984063743,
"grad_norm": 1.162376046180725,
"learning_rate": 2.916550446100299e-06,
"loss": 0.8442429900169373,
"step": 1142
},
{
"epoch": 2.278884462151394,
"grad_norm": 2.1500282287597656,
"learning_rate": 2.907746750178293e-06,
"loss": 0.40876924991607666,
"step": 1144
},
{
"epoch": 2.2828685258964145,
"grad_norm": 1.5930662155151367,
"learning_rate": 2.8989473124411136e-06,
"loss": 0.3929884433746338,
"step": 1146
},
{
"epoch": 2.2868525896414345,
"grad_norm": 0.9812231659889221,
"learning_rate": 2.8901522284128454e-06,
"loss": 0.8924030661582947,
"step": 1148
},
{
"epoch": 2.2908366533864544,
"grad_norm": 4.809815883636475,
"learning_rate": 2.881361593570308e-06,
"loss": 0.412593275308609,
"step": 1150
},
{
"epoch": 2.2948207171314743,
"grad_norm": 0.34295371174812317,
"learning_rate": 2.872575503342027e-06,
"loss": 0.07170237600803375,
"step": 1152
},
{
"epoch": 2.298804780876494,
"grad_norm": 2.6662888526916504,
"learning_rate": 2.8637940531071856e-06,
"loss": 0.9125880599021912,
"step": 1154
},
{
"epoch": 2.302788844621514,
"grad_norm": 1.016099214553833,
"learning_rate": 2.8550173381946035e-06,
"loss": 0.20460867881774902,
"step": 1156
},
{
"epoch": 2.306772908366534,
"grad_norm": 1.2535561323165894,
"learning_rate": 2.84624545388169e-06,
"loss": 0.18213213980197906,
"step": 1158
},
{
"epoch": 2.310756972111554,
"grad_norm": 5.914939880371094,
"learning_rate": 2.837478495393418e-06,
"loss": 1.015434980392456,
"step": 1160
},
{
"epoch": 2.314741035856574,
"grad_norm": 3.516514539718628,
"learning_rate": 2.828716557901286e-06,
"loss": 0.4791782796382904,
"step": 1162
},
{
"epoch": 2.318725099601594,
"grad_norm": 1.2415333986282349,
"learning_rate": 2.819959736522286e-06,
"loss": 0.6430278420448303,
"step": 1164
},
{
"epoch": 2.3227091633466137,
"grad_norm": 6.374106407165527,
"learning_rate": 2.8112081263178727e-06,
"loss": 0.7340620756149292,
"step": 1166
},
{
"epoch": 2.3266932270916336,
"grad_norm": 0.7349236011505127,
"learning_rate": 2.8024618222929257e-06,
"loss": 0.8904776573181152,
"step": 1168
},
{
"epoch": 2.3306772908366535,
"grad_norm": 3.1692311763763428,
"learning_rate": 2.793720919394726e-06,
"loss": 0.3335300385951996,
"step": 1170
},
{
"epoch": 2.3346613545816735,
"grad_norm": 1.9627305269241333,
"learning_rate": 2.7849855125119204e-06,
"loss": 0.9338223338127136,
"step": 1172
},
{
"epoch": 2.3386454183266934,
"grad_norm": 1.715811014175415,
"learning_rate": 2.7762556964734925e-06,
"loss": 0.8548279404640198,
"step": 1174
},
{
"epoch": 2.3426294820717133,
"grad_norm": 1.2761598825454712,
"learning_rate": 2.7675315660477342e-06,
"loss": 0.6551219820976257,
"step": 1176
},
{
"epoch": 2.346613545816733,
"grad_norm": 0.5829970836639404,
"learning_rate": 2.7588132159412153e-06,
"loss": 0.8633916974067688,
"step": 1178
},
{
"epoch": 2.350597609561753,
"grad_norm": 0.8791594505310059,
"learning_rate": 2.7501007407977554e-06,
"loss": 0.8312200903892517,
"step": 1180
},
{
"epoch": 2.354581673306773,
"grad_norm": 0.8145209550857544,
"learning_rate": 2.7413942351973994e-06,
"loss": 0.8451777696609497,
"step": 1182
},
{
"epoch": 2.358565737051793,
"grad_norm": 0.8338920474052429,
"learning_rate": 2.7326937936553845e-06,
"loss": 0.9415311813354492,
"step": 1184
},
{
"epoch": 2.362549800796813,
"grad_norm": 0.9346828460693359,
"learning_rate": 2.7239995106211244e-06,
"loss": 0.8471455574035645,
"step": 1186
},
{
"epoch": 2.366533864541833,
"grad_norm": 1.4322340488433838,
"learning_rate": 2.715311480477173e-06,
"loss": 0.30060604214668274,
"step": 1188
},
{
"epoch": 2.3705179282868527,
"grad_norm": 1.1024688482284546,
"learning_rate": 2.7066297975382065e-06,
"loss": 0.7530568838119507,
"step": 1190
},
{
"epoch": 2.3745019920318726,
"grad_norm": 0.5967240333557129,
"learning_rate": 2.697954556049997e-06,
"loss": 0.867277204990387,
"step": 1192
},
{
"epoch": 2.3784860557768925,
"grad_norm": 0.9026405811309814,
"learning_rate": 2.689285850188391e-06,
"loss": 0.9335858225822449,
"step": 1194
},
{
"epoch": 2.3824701195219125,
"grad_norm": 0.48514679074287415,
"learning_rate": 2.6806237740582855e-06,
"loss": 0.2793917655944824,
"step": 1196
},
{
"epoch": 2.3864541832669324,
"grad_norm": 2.9039154052734375,
"learning_rate": 2.671968421692607e-06,
"loss": 1.4733071327209473,
"step": 1198
},
{
"epoch": 2.3904382470119523,
"grad_norm": 3.6072850227355957,
"learning_rate": 2.6633198870512927e-06,
"loss": 0.3655731976032257,
"step": 1200
},
{
"epoch": 2.394422310756972,
"grad_norm": 0.6584874391555786,
"learning_rate": 2.6546782640202666e-06,
"loss": 0.8660189509391785,
"step": 1202
},
{
"epoch": 2.398406374501992,
"grad_norm": 0.5407839417457581,
"learning_rate": 2.6460436464104216e-06,
"loss": 0.848800003528595,
"step": 1204
},
{
"epoch": 2.402390438247012,
"grad_norm": 1.0635416507720947,
"learning_rate": 2.6374161279566035e-06,
"loss": 0.9516815543174744,
"step": 1206
},
{
"epoch": 2.406374501992032,
"grad_norm": 0.41980046033859253,
"learning_rate": 2.628795802316591e-06,
"loss": 0.120535708963871,
"step": 1208
},
{
"epoch": 2.410358565737052,
"grad_norm": 0.3191829323768616,
"learning_rate": 2.620182763070081e-06,
"loss": 0.023226367309689522,
"step": 1210
},
{
"epoch": 2.414342629482072,
"grad_norm": 1.4996663331985474,
"learning_rate": 2.61157710371767e-06,
"loss": 0.45069432258605957,
"step": 1212
},
{
"epoch": 2.4183266932270917,
"grad_norm": 1.0962636470794678,
"learning_rate": 2.6029789176798417e-06,
"loss": 0.6983217000961304,
"step": 1214
},
{
"epoch": 2.4223107569721116,
"grad_norm": 0.8529632091522217,
"learning_rate": 2.594388298295949e-06,
"loss": 0.17169800400733948,
"step": 1216
},
{
"epoch": 2.4262948207171315,
"grad_norm": 0.9947030544281006,
"learning_rate": 2.585805338823208e-06,
"loss": 0.8718166947364807,
"step": 1218
},
{
"epoch": 2.4302788844621515,
"grad_norm": 0.39905738830566406,
"learning_rate": 2.577230132435678e-06,
"loss": 0.5236790776252747,
"step": 1220
},
{
"epoch": 2.4342629482071714,
"grad_norm": 1.6986416578292847,
"learning_rate": 2.5686627722232518e-06,
"loss": 0.4206949770450592,
"step": 1222
},
{
"epoch": 2.4382470119521913,
"grad_norm": 0.8914661407470703,
"learning_rate": 2.560103351190651e-06,
"loss": 0.8530100584030151,
"step": 1224
},
{
"epoch": 2.442231075697211,
"grad_norm": 1.940697193145752,
"learning_rate": 2.5515519622564086e-06,
"loss": 0.03098766878247261,
"step": 1226
},
{
"epoch": 2.446215139442231,
"grad_norm": 0.740294873714447,
"learning_rate": 2.543008698251863e-06,
"loss": 0.8904476165771484,
"step": 1228
},
{
"epoch": 2.450199203187251,
"grad_norm": 1.2256784439086914,
"learning_rate": 2.534473651920153e-06,
"loss": 0.6660670042037964,
"step": 1230
},
{
"epoch": 2.454183266932271,
"grad_norm": 1.3577665090560913,
"learning_rate": 2.5259469159152063e-06,
"loss": 0.8957257270812988,
"step": 1232
},
{
"epoch": 2.458167330677291,
"grad_norm": 5.5895209312438965,
"learning_rate": 2.5174285828007387e-06,
"loss": 0.4879809319972992,
"step": 1234
},
{
"epoch": 2.462151394422311,
"grad_norm": 1.602962851524353,
"learning_rate": 2.5089187450492464e-06,
"loss": 0.8527651429176331,
"step": 1236
},
{
"epoch": 2.4661354581673307,
"grad_norm": 1.6139048337936401,
"learning_rate": 2.5004174950409996e-06,
"loss": 0.814254641532898,
"step": 1238
},
{
"epoch": 2.4701195219123506,
"grad_norm": 2.1591413021087646,
"learning_rate": 2.4919249250630463e-06,
"loss": 0.620861828327179,
"step": 1240
},
{
"epoch": 2.4741035856573705,
"grad_norm": 2.2499430179595947,
"learning_rate": 2.483441127308202e-06,
"loss": 0.622882068157196,
"step": 1242
},
{
"epoch": 2.4780876494023905,
"grad_norm": 0.8735558390617371,
"learning_rate": 2.47496619387406e-06,
"loss": 0.8819273114204407,
"step": 1244
},
{
"epoch": 2.4820717131474104,
"grad_norm": 1.0973459482192993,
"learning_rate": 2.4665002167619798e-06,
"loss": 0.85080885887146,
"step": 1246
},
{
"epoch": 2.4860557768924303,
"grad_norm": 1.19606351852417,
"learning_rate": 2.4580432878760968e-06,
"loss": 0.5080418586730957,
"step": 1248
},
{
"epoch": 2.49003984063745,
"grad_norm": 0.36084145307540894,
"learning_rate": 2.449595499022318e-06,
"loss": 0.3111553192138672,
"step": 1250
},
{
"epoch": 2.49402390438247,
"grad_norm": 0.7546538710594177,
"learning_rate": 2.441156941907333e-06,
"loss": 0.6624001264572144,
"step": 1252
},
{
"epoch": 2.49800796812749,
"grad_norm": 0.7720620632171631,
"learning_rate": 2.432727708137612e-06,
"loss": 0.7852078676223755,
"step": 1254
},
{
"epoch": 2.50199203187251,
"grad_norm": 2.640068292617798,
"learning_rate": 2.424307889218414e-06,
"loss": 0.9888243079185486,
"step": 1256
},
{
"epoch": 2.50597609561753,
"grad_norm": 0.47891512513160706,
"learning_rate": 2.415897576552795e-06,
"loss": 0.11806351691484451,
"step": 1258
},
{
"epoch": 2.50996015936255,
"grad_norm": 1.773125171661377,
"learning_rate": 2.407496861440611e-06,
"loss": 0.712026834487915,
"step": 1260
},
{
"epoch": 2.5139442231075697,
"grad_norm": 0.8916162848472595,
"learning_rate": 2.3991058350775316e-06,
"loss": 0.27510854601860046,
"step": 1262
},
{
"epoch": 2.5179282868525896,
"grad_norm": 2.915144205093384,
"learning_rate": 2.3907245885540473e-06,
"loss": 0.5907682180404663,
"step": 1264
},
{
"epoch": 2.5219123505976095,
"grad_norm": 0.7523391842842102,
"learning_rate": 2.382353212854483e-06,
"loss": 0.875799298286438,
"step": 1266
},
{
"epoch": 2.5258964143426295,
"grad_norm": 0.7640947699546814,
"learning_rate": 2.373991798856008e-06,
"loss": 0.8100597858428955,
"step": 1268
},
{
"epoch": 2.5298804780876494,
"grad_norm": 0.9602063894271851,
"learning_rate": 2.3656404373276496e-06,
"loss": 0.8617823719978333,
"step": 1270
},
{
"epoch": 2.5338645418326693,
"grad_norm": 1.0857386589050293,
"learning_rate": 2.35729921892931e-06,
"loss": 0.7695320248603821,
"step": 1272
},
{
"epoch": 2.537848605577689,
"grad_norm": 2.655921220779419,
"learning_rate": 2.3489682342107787e-06,
"loss": 1.0393037796020508,
"step": 1274
},
{
"epoch": 2.541832669322709,
"grad_norm": 1.602705478668213,
"learning_rate": 2.3406475736107537e-06,
"loss": 0.8128276467323303,
"step": 1276
},
{
"epoch": 2.545816733067729,
"grad_norm": 1.7629623413085938,
"learning_rate": 2.332337327455856e-06,
"loss": 0.8416529893875122,
"step": 1278
},
{
"epoch": 2.549800796812749,
"grad_norm": 0.3072420656681061,
"learning_rate": 2.3240375859596493e-06,
"loss": 0.21107147634029388,
"step": 1280
},
{
"epoch": 2.553784860557769,
"grad_norm": 0.7584460973739624,
"learning_rate": 2.3157484392216645e-06,
"loss": 0.7613718509674072,
"step": 1282
},
{
"epoch": 2.557768924302789,
"grad_norm": 0.7467636466026306,
"learning_rate": 2.3074699772264184e-06,
"loss": 0.9068883657455444,
"step": 1284
},
{
"epoch": 2.5617529880478087,
"grad_norm": 2.827934503555298,
"learning_rate": 2.2992022898424358e-06,
"loss": 0.9814170002937317,
"step": 1286
},
{
"epoch": 2.5657370517928286,
"grad_norm": 0.6314749717712402,
"learning_rate": 2.2909454668212763e-06,
"loss": 0.9777659177780151,
"step": 1288
},
{
"epoch": 2.5697211155378485,
"grad_norm": 1.5785683393478394,
"learning_rate": 2.2826995977965586e-06,
"loss": 0.14857736229896545,
"step": 1290
},
{
"epoch": 2.5737051792828685,
"grad_norm": 0.8036978244781494,
"learning_rate": 2.27446477228299e-06,
"loss": 0.9405508041381836,
"step": 1292
},
{
"epoch": 2.5776892430278884,
"grad_norm": 0.7155508399009705,
"learning_rate": 2.2662410796753924e-06,
"loss": 0.8522077202796936,
"step": 1294
},
{
"epoch": 2.5816733067729083,
"grad_norm": 1.1586476564407349,
"learning_rate": 2.2580286092477285e-06,
"loss": 0.8515244722366333,
"step": 1296
},
{
"epoch": 2.585657370517928,
"grad_norm": 1.105276346206665,
"learning_rate": 2.2498274501521414e-06,
"loss": 0.8348259925842285,
"step": 1298
},
{
"epoch": 2.589641434262948,
"grad_norm": 0.5298115611076355,
"learning_rate": 2.2416376914179776e-06,
"loss": 0.37851282954216003,
"step": 1300
},
{
"epoch": 2.593625498007968,
"grad_norm": 0.8865681290626526,
"learning_rate": 2.2334594219508283e-06,
"loss": 0.493791401386261,
"step": 1302
},
{
"epoch": 2.597609561752988,
"grad_norm": 0.8937894105911255,
"learning_rate": 2.2252927305315587e-06,
"loss": 0.768490731716156,
"step": 1304
},
{
"epoch": 2.601593625498008,
"grad_norm": 2.249807119369507,
"learning_rate": 2.2171377058153465e-06,
"loss": 0.28239089250564575,
"step": 1306
},
{
"epoch": 2.605577689243028,
"grad_norm": 0.7723252773284912,
"learning_rate": 2.2089944363307165e-06,
"loss": 0.8856875896453857,
"step": 1308
},
{
"epoch": 2.6095617529880477,
"grad_norm": 0.43645548820495605,
"learning_rate": 2.2008630104785874e-06,
"loss": 0.352665513753891,
"step": 1310
},
{
"epoch": 2.6135458167330676,
"grad_norm": 2.615204095840454,
"learning_rate": 2.1927435165313036e-06,
"loss": 0.1691545695066452,
"step": 1312
},
{
"epoch": 2.6175298804780875,
"grad_norm": 0.7458433508872986,
"learning_rate": 2.184636042631679e-06,
"loss": 0.06585448980331421,
"step": 1314
},
{
"epoch": 2.6215139442231075,
"grad_norm": 1.3437604904174805,
"learning_rate": 2.176540676792046e-06,
"loss": 0.956698477268219,
"step": 1316
},
{
"epoch": 2.6254980079681274,
"grad_norm": 2.3479928970336914,
"learning_rate": 2.168457506893292e-06,
"loss": 0.669885516166687,
"step": 1318
},
{
"epoch": 2.6294820717131473,
"grad_norm": 0.6726356744766235,
"learning_rate": 2.1603866206839074e-06,
"loss": 0.9108378887176514,
"step": 1320
},
{
"epoch": 2.633466135458167,
"grad_norm": 0.6728199124336243,
"learning_rate": 2.152328105779041e-06,
"loss": 0.46163687109947205,
"step": 1322
},
{
"epoch": 2.637450199203187,
"grad_norm": 3.6970763206481934,
"learning_rate": 2.1442820496595337e-06,
"loss": 1.0799225568771362,
"step": 1324
},
{
"epoch": 2.641434262948207,
"grad_norm": 2.347198009490967,
"learning_rate": 2.1362485396709847e-06,
"loss": 0.2297479808330536,
"step": 1326
},
{
"epoch": 2.645418326693227,
"grad_norm": 1.014694094657898,
"learning_rate": 2.128227663022794e-06,
"loss": 0.7543836832046509,
"step": 1328
},
{
"epoch": 2.649402390438247,
"grad_norm": 1.9803884029388428,
"learning_rate": 2.1202195067872153e-06,
"loss": 0.8650748133659363,
"step": 1330
},
{
"epoch": 2.653386454183267,
"grad_norm": 1.038819432258606,
"learning_rate": 2.112224157898416e-06,
"loss": 0.7467201352119446,
"step": 1332
},
{
"epoch": 2.6573705179282867,
"grad_norm": 4.248292922973633,
"learning_rate": 2.1042417031515303e-06,
"loss": 1.0267494916915894,
"step": 1334
},
{
"epoch": 2.6613545816733066,
"grad_norm": 0.40952640771865845,
"learning_rate": 2.096272229201716e-06,
"loss": 0.06949189305305481,
"step": 1336
},
{
"epoch": 2.6653386454183265,
"grad_norm": 1.2858881950378418,
"learning_rate": 2.0883158225632168e-06,
"loss": 0.9944968223571777,
"step": 1338
},
{
"epoch": 2.6693227091633465,
"grad_norm": 1.2663077116012573,
"learning_rate": 2.0803725696084224e-06,
"loss": 0.32381299138069153,
"step": 1340
},
{
"epoch": 2.6733067729083664,
"grad_norm": 2.5092110633850098,
"learning_rate": 2.072442556566928e-06,
"loss": 0.5067175626754761,
"step": 1342
},
{
"epoch": 2.6772908366533863,
"grad_norm": 0.4816880226135254,
"learning_rate": 2.0645258695245993e-06,
"loss": 0.06836852431297302,
"step": 1344
},
{
"epoch": 2.681274900398406,
"grad_norm": 0.8811363577842712,
"learning_rate": 2.0566225944226414e-06,
"loss": 0.8118082284927368,
"step": 1346
},
{
"epoch": 2.685258964143426,
"grad_norm": 0.7595816850662231,
"learning_rate": 2.0487328170566643e-06,
"loss": 0.833029568195343,
"step": 1348
},
{
"epoch": 2.6892430278884465,
"grad_norm": 0.9555457830429077,
"learning_rate": 2.0408566230757465e-06,
"loss": 0.8837859034538269,
"step": 1350
},
{
"epoch": 2.6932270916334664,
"grad_norm": 2.7736618518829346,
"learning_rate": 2.0329940979815116e-06,
"loss": 0.3744777739048004,
"step": 1352
},
{
"epoch": 2.6972111553784863,
"grad_norm": 1.4651148319244385,
"learning_rate": 2.0251453271272e-06,
"loss": 0.3069399297237396,
"step": 1354
},
{
"epoch": 2.7011952191235062,
"grad_norm": 1.0298899412155151,
"learning_rate": 2.0173103957167367e-06,
"loss": 0.8419727087020874,
"step": 1356
},
{
"epoch": 2.705179282868526,
"grad_norm": 1.365960955619812,
"learning_rate": 2.009489388803809e-06,
"loss": 0.8394007682800293,
"step": 1358
},
{
"epoch": 2.709163346613546,
"grad_norm": 0.9906344413757324,
"learning_rate": 2.0016823912909486e-06,
"loss": 0.8413975238800049,
"step": 1360
},
{
"epoch": 2.713147410358566,
"grad_norm": 0.6724693775177002,
"learning_rate": 1.9938894879286024e-06,
"loss": 0.8469905853271484,
"step": 1362
},
{
"epoch": 2.717131474103586,
"grad_norm": 1.9248793125152588,
"learning_rate": 1.9861107633142155e-06,
"loss": 0.8509299755096436,
"step": 1364
},
{
"epoch": 2.721115537848606,
"grad_norm": 1.4797543287277222,
"learning_rate": 1.978346301891312e-06,
"loss": 0.35483643412590027,
"step": 1366
},
{
"epoch": 2.7250996015936257,
"grad_norm": 0.8299886584281921,
"learning_rate": 1.9705961879485813e-06,
"loss": 0.8987928628921509,
"step": 1368
},
{
"epoch": 2.7290836653386457,
"grad_norm": 1.4776321649551392,
"learning_rate": 1.962860505618958e-06,
"loss": 0.6491652131080627,
"step": 1370
},
{
"epoch": 2.7330677290836656,
"grad_norm": 6.724909782409668,
"learning_rate": 1.955139338878714e-06,
"loss": 0.19401389360427856,
"step": 1372
},
{
"epoch": 2.7370517928286855,
"grad_norm": 0.943676233291626,
"learning_rate": 1.9474327715465444e-06,
"loss": 0.8299869894981384,
"step": 1374
},
{
"epoch": 2.7410358565737054,
"grad_norm": 1.2990317344665527,
"learning_rate": 1.9397408872826545e-06,
"loss": 0.871895968914032,
"step": 1376
},
{
"epoch": 2.7450199203187253,
"grad_norm": 1.9206279516220093,
"learning_rate": 1.9320637695878555e-06,
"loss": 0.30201855301856995,
"step": 1378
},
{
"epoch": 2.7490039840637452,
"grad_norm": 0.7692667841911316,
"learning_rate": 1.924401501802659e-06,
"loss": 0.6371020078659058,
"step": 1380
},
{
"epoch": 2.752988047808765,
"grad_norm": 0.8262352347373962,
"learning_rate": 1.9167541671063703e-06,
"loss": 0.9497525691986084,
"step": 1382
},
{
"epoch": 2.756972111553785,
"grad_norm": 1.0128363370895386,
"learning_rate": 1.9091218485161824e-06,
"loss": 0.9976522922515869,
"step": 1384
},
{
"epoch": 2.760956175298805,
"grad_norm": 0.8022831082344055,
"learning_rate": 1.9015046288862815e-06,
"loss": 0.8430491089820862,
"step": 1386
},
{
"epoch": 2.764940239043825,
"grad_norm": 1.4386292695999146,
"learning_rate": 1.893902590906943e-06,
"loss": 0.6075490117073059,
"step": 1388
},
{
"epoch": 2.768924302788845,
"grad_norm": 1.3775461912155151,
"learning_rate": 1.8863158171036336e-06,
"loss": 0.12825116515159607,
"step": 1390
},
{
"epoch": 2.7729083665338647,
"grad_norm": 1.3699278831481934,
"learning_rate": 1.8787443898361158e-06,
"loss": 1.1316020488739014,
"step": 1392
},
{
"epoch": 2.7768924302788847,
"grad_norm": 0.8569239377975464,
"learning_rate": 1.8711883912975575e-06,
"loss": 0.655997633934021,
"step": 1394
},
{
"epoch": 2.7808764940239046,
"grad_norm": 0.7035950422286987,
"learning_rate": 1.8636479035136368e-06,
"loss": 0.8871821165084839,
"step": 1396
},
{
"epoch": 2.7848605577689245,
"grad_norm": 0.7683161497116089,
"learning_rate": 1.8561230083416488e-06,
"loss": 0.9570977687835693,
"step": 1398
},
{
"epoch": 2.7888446215139444,
"grad_norm": 0.8087801337242126,
"learning_rate": 1.8486137874696223e-06,
"loss": 0.8703477382659912,
"step": 1400
},
{
"epoch": 2.7928286852589643,
"grad_norm": 0.9088819622993469,
"learning_rate": 1.8411203224154289e-06,
"loss": 0.8619301915168762,
"step": 1402
},
{
"epoch": 2.7968127490039842,
"grad_norm": 0.3485574424266815,
"learning_rate": 1.833642694525902e-06,
"loss": 0.13462619483470917,
"step": 1404
},
{
"epoch": 2.800796812749004,
"grad_norm": 0.9604331851005554,
"learning_rate": 1.826180984975948e-06,
"loss": 0.8676316142082214,
"step": 1406
},
{
"epoch": 2.804780876494024,
"grad_norm": 1.302273154258728,
"learning_rate": 1.8187352747676718e-06,
"loss": 1.241036295890808,
"step": 1408
},
{
"epoch": 2.808764940239044,
"grad_norm": 1.2466564178466797,
"learning_rate": 1.8113056447294936e-06,
"loss": 1.0569744110107422,
"step": 1410
},
{
"epoch": 2.812749003984064,
"grad_norm": 0.9512035846710205,
"learning_rate": 1.8038921755152704e-06,
"loss": 0.8206438422203064,
"step": 1412
},
{
"epoch": 2.816733067729084,
"grad_norm": 1.0051904916763306,
"learning_rate": 1.7964949476034223e-06,
"loss": 0.9369583129882812,
"step": 1414
},
{
"epoch": 2.8207171314741037,
"grad_norm": 3.8374409675598145,
"learning_rate": 1.7891140412960615e-06,
"loss": 1.116792917251587,
"step": 1416
},
{
"epoch": 2.8247011952191237,
"grad_norm": 1.1146875619888306,
"learning_rate": 1.7817495367181132e-06,
"loss": 0.8257051110267639,
"step": 1418
},
{
"epoch": 2.8286852589641436,
"grad_norm": 0.2130766063928604,
"learning_rate": 1.774401513816454e-06,
"loss": 0.08374066650867462,
"step": 1420
},
{
"epoch": 2.8326693227091635,
"grad_norm": 0.8484716415405273,
"learning_rate": 1.76707005235904e-06,
"loss": 0.9364421963691711,
"step": 1422
},
{
"epoch": 2.8366533864541834,
"grad_norm": 0.7365440130233765,
"learning_rate": 1.759755231934039e-06,
"loss": 0.9269137978553772,
"step": 1424
},
{
"epoch": 2.8406374501992033,
"grad_norm": 0.9674385190010071,
"learning_rate": 1.7524571319489695e-06,
"loss": 0.24093596637248993,
"step": 1426
},
{
"epoch": 2.8446215139442232,
"grad_norm": 0.8217137455940247,
"learning_rate": 1.7451758316298386e-06,
"loss": 0.8590070605278015,
"step": 1428
},
{
"epoch": 2.848605577689243,
"grad_norm": 0.818912148475647,
"learning_rate": 1.7379114100202824e-06,
"loss": 0.8883748650550842,
"step": 1430
},
{
"epoch": 2.852589641434263,
"grad_norm": 2.239244222640991,
"learning_rate": 1.7306639459807026e-06,
"loss": 0.8789231777191162,
"step": 1432
},
{
"epoch": 2.856573705179283,
"grad_norm": 1.3130366802215576,
"learning_rate": 1.7234335181874197e-06,
"loss": 0.41715553402900696,
"step": 1434
},
{
"epoch": 2.860557768924303,
"grad_norm": 2.1881866455078125,
"learning_rate": 1.7162202051318092e-06,
"loss": 0.8317433595657349,
"step": 1436
},
{
"epoch": 2.864541832669323,
"grad_norm": 0.4997340440750122,
"learning_rate": 1.7090240851194576e-06,
"loss": 0.06248881667852402,
"step": 1438
},
{
"epoch": 2.8685258964143427,
"grad_norm": 0.7684650421142578,
"learning_rate": 1.7018452362693062e-06,
"loss": 0.9771674871444702,
"step": 1440
},
{
"epoch": 2.8725099601593627,
"grad_norm": 2.6358094215393066,
"learning_rate": 1.694683736512807e-06,
"loss": 0.4274534285068512,
"step": 1442
},
{
"epoch": 2.8764940239043826,
"grad_norm": 3.7041735649108887,
"learning_rate": 1.6875396635930767e-06,
"loss": 0.8502193689346313,
"step": 1444
},
{
"epoch": 2.8804780876494025,
"grad_norm": 1.7656716108322144,
"learning_rate": 1.6804130950640492e-06,
"loss": 0.2269526571035385,
"step": 1446
},
{
"epoch": 2.8844621513944224,
"grad_norm": 0.9704077839851379,
"learning_rate": 1.6733041082896355e-06,
"loss": 0.9017117619514465,
"step": 1448
},
{
"epoch": 2.8884462151394423,
"grad_norm": 1.1423131227493286,
"learning_rate": 1.666212780442887e-06,
"loss": 0.7310890555381775,
"step": 1450
},
{
"epoch": 2.8924302788844622,
"grad_norm": 0.8818380832672119,
"learning_rate": 1.659139188505152e-06,
"loss": 0.9649314880371094,
"step": 1452
},
{
"epoch": 2.896414342629482,
"grad_norm": 0.9627234935760498,
"learning_rate": 1.652083409265246e-06,
"loss": 0.1323651671409607,
"step": 1454
},
{
"epoch": 2.900398406374502,
"grad_norm": 0.625633955001831,
"learning_rate": 1.6450455193186137e-06,
"loss": 0.8300275206565857,
"step": 1456
},
{
"epoch": 2.904382470119522,
"grad_norm": 1.691175103187561,
"learning_rate": 1.638025595066499e-06,
"loss": 0.7612891793251038,
"step": 1458
},
{
"epoch": 2.908366533864542,
"grad_norm": 0.9278882145881653,
"learning_rate": 1.6310237127151137e-06,
"loss": 0.9076191782951355,
"step": 1460
},
{
"epoch": 2.912350597609562,
"grad_norm": 2.7954494953155518,
"learning_rate": 1.624039948274815e-06,
"loss": 0.37150129675865173,
"step": 1462
},
{
"epoch": 2.9163346613545817,
"grad_norm": 0.423910528421402,
"learning_rate": 1.6170743775592773e-06,
"loss": 0.20058873295783997,
"step": 1464
},
{
"epoch": 2.9203187250996017,
"grad_norm": 0.9244667887687683,
"learning_rate": 1.610127076184667e-06,
"loss": 0.8625198602676392,
"step": 1466
},
{
"epoch": 2.9243027888446216,
"grad_norm": 0.8803090453147888,
"learning_rate": 1.6031981195688252e-06,
"loss": 0.9291595816612244,
"step": 1468
},
{
"epoch": 2.9282868525896415,
"grad_norm": 1.0361244678497314,
"learning_rate": 1.59628758293045e-06,
"loss": 0.23180729150772095,
"step": 1470
},
{
"epoch": 2.9322709163346614,
"grad_norm": 5.147000789642334,
"learning_rate": 1.5893955412882733e-06,
"loss": 0.5987867712974548,
"step": 1472
},
{
"epoch": 2.9362549800796813,
"grad_norm": 0.5982325673103333,
"learning_rate": 1.582522069460253e-06,
"loss": 0.8363850116729736,
"step": 1474
},
{
"epoch": 2.9402390438247012,
"grad_norm": 3.7226884365081787,
"learning_rate": 1.5756672420627596e-06,
"loss": 0.8606371283531189,
"step": 1476
},
{
"epoch": 2.944223107569721,
"grad_norm": 1.0484495162963867,
"learning_rate": 1.5688311335097646e-06,
"loss": 0.9633500576019287,
"step": 1478
},
{
"epoch": 2.948207171314741,
"grad_norm": 0.7016828656196594,
"learning_rate": 1.5620138180120331e-06,
"loss": 0.8571369647979736,
"step": 1480
},
{
"epoch": 2.952191235059761,
"grad_norm": 2.1188414096832275,
"learning_rate": 1.5552153695763156e-06,
"loss": 0.44183531403541565,
"step": 1482
},
{
"epoch": 2.956175298804781,
"grad_norm": 2.2254960536956787,
"learning_rate": 1.5484358620045534e-06,
"loss": 0.28760015964508057,
"step": 1484
},
{
"epoch": 2.960159362549801,
"grad_norm": 2.748490333557129,
"learning_rate": 1.5416753688930654e-06,
"loss": 0.6493697166442871,
"step": 1486
},
{
"epoch": 2.9641434262948207,
"grad_norm": 1.3967127799987793,
"learning_rate": 1.5349339636317584e-06,
"loss": 0.8622140288352966,
"step": 1488
},
{
"epoch": 2.9681274900398407,
"grad_norm": 1.959518313407898,
"learning_rate": 1.528211719403328e-06,
"loss": 0.722124457359314,
"step": 1490
},
{
"epoch": 2.9721115537848606,
"grad_norm": 1.3386509418487549,
"learning_rate": 1.521508709182461e-06,
"loss": 0.9694193601608276,
"step": 1492
},
{
"epoch": 2.9760956175298805,
"grad_norm": 0.9864974617958069,
"learning_rate": 1.514825005735045e-06,
"loss": 0.8088407516479492,
"step": 1494
},
{
"epoch": 2.9800796812749004,
"grad_norm": 2.115551471710205,
"learning_rate": 1.5081606816173814e-06,
"loss": 0.12242338061332703,
"step": 1496
},
{
"epoch": 2.9840637450199203,
"grad_norm": 0.75198894739151,
"learning_rate": 1.5015158091753958e-06,
"loss": 0.1432493031024933,
"step": 1498
},
{
"epoch": 2.9880478087649402,
"grad_norm": 1.4102544784545898,
"learning_rate": 1.4948904605438477e-06,
"loss": 0.0790117010474205,
"step": 1500
},
{
"epoch": 2.99203187250996,
"grad_norm": 0.6461302638053894,
"learning_rate": 1.488284707645557e-06,
"loss": 0.7927932739257812,
"step": 1502
},
{
"epoch": 2.99601593625498,
"grad_norm": 0.9944819211959839,
"learning_rate": 1.4816986221906159e-06,
"loss": 0.8774588704109192,
"step": 1504
},
{
"epoch": 3.0,
"grad_norm": 2.3869407176971436,
"learning_rate": 1.4751322756756127e-06,
"loss": 0.23395386338233948,
"step": 1506
},
{
"epoch": 3.00398406374502,
"grad_norm": 0.6929567456245422,
"learning_rate": 1.4685857393828543e-06,
"loss": 0.6813750267028809,
"step": 1508
},
{
"epoch": 3.00796812749004,
"grad_norm": 1.4428455829620361,
"learning_rate": 1.4620590843795967e-06,
"loss": 0.27471280097961426,
"step": 1510
},
{
"epoch": 3.0119521912350598,
"grad_norm": 1.1208453178405762,
"learning_rate": 1.4555523815172693e-06,
"loss": 0.7926130294799805,
"step": 1512
},
{
"epoch": 3.0159362549800797,
"grad_norm": 1.4112131595611572,
"learning_rate": 1.449065701430705e-06,
"loss": 0.3855717182159424,
"step": 1514
},
{
"epoch": 3.0199203187250996,
"grad_norm": 7.652811527252197,
"learning_rate": 1.4425991145373788e-06,
"loss": 0.1316222846508026,
"step": 1516
},
{
"epoch": 3.0239043824701195,
"grad_norm": 1.6621893644332886,
"learning_rate": 1.4361526910366368e-06,
"loss": 0.2520155906677246,
"step": 1518
},
{
"epoch": 3.0278884462151394,
"grad_norm": 0.8125709891319275,
"learning_rate": 1.4297265009089397e-06,
"loss": 0.7272902727127075,
"step": 1520
},
{
"epoch": 3.0318725099601593,
"grad_norm": 1.4255092144012451,
"learning_rate": 1.423320613915099e-06,
"loss": 0.5655202865600586,
"step": 1522
},
{
"epoch": 3.0358565737051793,
"grad_norm": 1.9694007635116577,
"learning_rate": 1.416935099595522e-06,
"loss": 0.21059830486774445,
"step": 1524
},
{
"epoch": 3.039840637450199,
"grad_norm": 0.7592612504959106,
"learning_rate": 1.4105700272694578e-06,
"loss": 0.6575446724891663,
"step": 1526
},
{
"epoch": 3.043824701195219,
"grad_norm": 1.133392572402954,
"learning_rate": 1.4042254660342408e-06,
"loss": 0.9429333209991455,
"step": 1528
},
{
"epoch": 3.047808764940239,
"grad_norm": 1.231631875038147,
"learning_rate": 1.3979014847645435e-06,
"loss": 0.2242284119129181,
"step": 1530
},
{
"epoch": 3.051792828685259,
"grad_norm": 1.1999961137771606,
"learning_rate": 1.391598152111631e-06,
"loss": 0.15949700772762299,
"step": 1532
},
{
"epoch": 3.055776892430279,
"grad_norm": 1.6939618587493896,
"learning_rate": 1.385315536502609e-06,
"loss": 0.21413640677928925,
"step": 1534
},
{
"epoch": 3.0597609561752988,
"grad_norm": 1.3219988346099854,
"learning_rate": 1.3790537061396887e-06,
"loss": 0.6202045679092407,
"step": 1536
},
{
"epoch": 3.0637450199203187,
"grad_norm": 0.998444676399231,
"learning_rate": 1.372812728999442e-06,
"loss": 0.7671471238136292,
"step": 1538
},
{
"epoch": 3.0677290836653386,
"grad_norm": 1.4698975086212158,
"learning_rate": 1.3665926728320632e-06,
"loss": 0.47750726342201233,
"step": 1540
},
{
"epoch": 3.0717131474103585,
"grad_norm": 0.9587137699127197,
"learning_rate": 1.3603936051606346e-06,
"loss": 0.7269394397735596,
"step": 1542
},
{
"epoch": 3.0756972111553784,
"grad_norm": 2.3286054134368896,
"learning_rate": 1.3542155932803954e-06,
"loss": 0.7805855870246887,
"step": 1544
},
{
"epoch": 3.0796812749003983,
"grad_norm": 0.7439804077148438,
"learning_rate": 1.3480587042580092e-06,
"loss": 0.6787388324737549,
"step": 1546
},
{
"epoch": 3.0836653386454183,
"grad_norm": 1.8882228136062622,
"learning_rate": 1.3419230049308333e-06,
"loss": 0.6134771108627319,
"step": 1548
},
{
"epoch": 3.087649402390438,
"grad_norm": 1.0494561195373535,
"learning_rate": 1.3358085619062003e-06,
"loss": 0.7737662196159363,
"step": 1550
},
{
"epoch": 3.091633466135458,
"grad_norm": 0.31838488578796387,
"learning_rate": 1.3297154415606864e-06,
"loss": 0.034840308129787445,
"step": 1552
},
{
"epoch": 3.095617529880478,
"grad_norm": 1.5378990173339844,
"learning_rate": 1.3236437100393992e-06,
"loss": 0.21899044513702393,
"step": 1554
},
{
"epoch": 3.099601593625498,
"grad_norm": 0.9580462574958801,
"learning_rate": 1.3175934332552511e-06,
"loss": 0.635277271270752,
"step": 1556
},
{
"epoch": 3.103585657370518,
"grad_norm": 1.2689288854599,
"learning_rate": 1.3115646768882522e-06,
"loss": 0.6710810661315918,
"step": 1558
},
{
"epoch": 3.1075697211155378,
"grad_norm": 0.9133360385894775,
"learning_rate": 1.3055575063847923e-06,
"loss": 0.7197314500808716,
"step": 1560
},
{
"epoch": 3.1115537848605577,
"grad_norm": 3.067455768585205,
"learning_rate": 1.29957198695693e-06,
"loss": 0.21895435452461243,
"step": 1562
},
{
"epoch": 3.1155378486055776,
"grad_norm": 0.27349138259887695,
"learning_rate": 1.2936081835816867e-06,
"loss": 0.19600287079811096,
"step": 1564
},
{
"epoch": 3.1195219123505975,
"grad_norm": 1.1419686079025269,
"learning_rate": 1.2876661610003428e-06,
"loss": 0.7878577709197998,
"step": 1566
},
{
"epoch": 3.1235059760956174,
"grad_norm": 1.1395351886749268,
"learning_rate": 1.2817459837177298e-06,
"loss": 0.7802326679229736,
"step": 1568
},
{
"epoch": 3.1274900398406373,
"grad_norm": 1.9237797260284424,
"learning_rate": 1.2758477160015355e-06,
"loss": 0.5069929361343384,
"step": 1570
},
{
"epoch": 3.1314741035856573,
"grad_norm": 0.7889575958251953,
"learning_rate": 1.2699714218816036e-06,
"loss": 0.6714158654212952,
"step": 1572
},
{
"epoch": 3.135458167330677,
"grad_norm": 0.9449037313461304,
"learning_rate": 1.2641171651492383e-06,
"loss": 0.6565294861793518,
"step": 1574
},
{
"epoch": 3.139442231075697,
"grad_norm": 1.7222603559494019,
"learning_rate": 1.2582850093565115e-06,
"loss": 0.2423674762248993,
"step": 1576
},
{
"epoch": 3.143426294820717,
"grad_norm": 0.8361628651618958,
"learning_rate": 1.2524750178155762e-06,
"loss": 0.6483781933784485,
"step": 1578
},
{
"epoch": 3.147410358565737,
"grad_norm": 0.4106227159500122,
"learning_rate": 1.2466872535979755e-06,
"loss": 0.06941226869821548,
"step": 1580
},
{
"epoch": 3.151394422310757,
"grad_norm": 1.131303071975708,
"learning_rate": 1.2409217795339592e-06,
"loss": 0.6722179651260376,
"step": 1582
},
{
"epoch": 3.1553784860557768,
"grad_norm": 1.3526575565338135,
"learning_rate": 1.2351786582118018e-06,
"loss": 0.37432199716567993,
"step": 1584
},
{
"epoch": 3.1593625498007967,
"grad_norm": 1.5046707391738892,
"learning_rate": 1.2294579519771246e-06,
"loss": 0.36908501386642456,
"step": 1586
},
{
"epoch": 3.1633466135458166,
"grad_norm": 0.14365744590759277,
"learning_rate": 1.2237597229322155e-06,
"loss": 0.01732539013028145,
"step": 1588
},
{
"epoch": 3.1673306772908365,
"grad_norm": 0.7536062598228455,
"learning_rate": 1.2180840329353564e-06,
"loss": 0.2823001444339752,
"step": 1590
},
{
"epoch": 3.1713147410358564,
"grad_norm": 6.318256855010986,
"learning_rate": 1.2124309436001533e-06,
"loss": 0.5411125421524048,
"step": 1592
},
{
"epoch": 3.1752988047808763,
"grad_norm": 1.1654754877090454,
"learning_rate": 1.2068005162948668e-06,
"loss": 0.7602944374084473,
"step": 1594
},
{
"epoch": 3.1792828685258963,
"grad_norm": 2.5576841831207275,
"learning_rate": 1.2011928121417431e-06,
"loss": 0.1262691169977188,
"step": 1596
},
{
"epoch": 3.183266932270916,
"grad_norm": 1.2924350500106812,
"learning_rate": 1.195607892016354e-06,
"loss": 0.6975268721580505,
"step": 1598
},
{
"epoch": 3.187250996015936,
"grad_norm": 2.0278656482696533,
"learning_rate": 1.1900458165469345e-06,
"loss": 0.5072341561317444,
"step": 1600
},
{
"epoch": 3.191235059760956,
"grad_norm": 2.13330078125,
"learning_rate": 1.184506646113724e-06,
"loss": 0.7287152409553528,
"step": 1602
},
{
"epoch": 3.195219123505976,
"grad_norm": 0.19735604524612427,
"learning_rate": 1.1789904408483123e-06,
"loss": 0.20490704476833344,
"step": 1604
},
{
"epoch": 3.199203187250996,
"grad_norm": 2.342869997024536,
"learning_rate": 1.1734972606329874e-06,
"loss": 0.6201443076133728,
"step": 1606
},
{
"epoch": 3.2031872509960158,
"grad_norm": 1.9951808452606201,
"learning_rate": 1.1680271651000819e-06,
"loss": 0.2740911543369293,
"step": 1608
},
{
"epoch": 3.2071713147410357,
"grad_norm": 1.075411319732666,
"learning_rate": 1.162580213631328e-06,
"loss": 0.6568232774734497,
"step": 1610
},
{
"epoch": 3.2111553784860556,
"grad_norm": 2.3391730785369873,
"learning_rate": 1.1571564653572148e-06,
"loss": 1.0995919704437256,
"step": 1612
},
{
"epoch": 3.2151394422310755,
"grad_norm": 0.11555808782577515,
"learning_rate": 1.1517559791563439e-06,
"loss": 0.003191891126334667,
"step": 1614
},
{
"epoch": 3.2191235059760954,
"grad_norm": 2.371424674987793,
"learning_rate": 1.1463788136547887e-06,
"loss": 0.396582692861557,
"step": 1616
},
{
"epoch": 3.2231075697211153,
"grad_norm": 1.8076469898223877,
"learning_rate": 1.141025027225463e-06,
"loss": 0.3241533637046814,
"step": 1618
},
{
"epoch": 3.2270916334661353,
"grad_norm": 0.9942080974578857,
"learning_rate": 1.1356946779874825e-06,
"loss": 0.6740264296531677,
"step": 1620
},
{
"epoch": 3.231075697211155,
"grad_norm": 1.624965786933899,
"learning_rate": 1.1303878238055357e-06,
"loss": 0.44572022557258606,
"step": 1622
},
{
"epoch": 3.235059760956175,
"grad_norm": 1.6572600603103638,
"learning_rate": 1.1251045222892553e-06,
"loss": 0.21951913833618164,
"step": 1624
},
{
"epoch": 3.239043824701195,
"grad_norm": 1.5844409465789795,
"learning_rate": 1.119844830792595e-06,
"loss": 0.7072573900222778,
"step": 1626
},
{
"epoch": 3.243027888446215,
"grad_norm": 1.0160541534423828,
"learning_rate": 1.1146088064132052e-06,
"loss": 0.6218189001083374,
"step": 1628
},
{
"epoch": 3.247011952191235,
"grad_norm": 0.6660611033439636,
"learning_rate": 1.10939650599181e-06,
"loss": 0.15160006284713745,
"step": 1630
},
{
"epoch": 3.2509960159362548,
"grad_norm": 4.854979038238525,
"learning_rate": 1.1042079861115967e-06,
"loss": 0.4013654887676239,
"step": 1632
},
{
"epoch": 3.2549800796812747,
"grad_norm": 1.7456501722335815,
"learning_rate": 1.099043303097596e-06,
"loss": 0.6942977905273438,
"step": 1634
},
{
"epoch": 3.2589641434262946,
"grad_norm": 0.6688535809516907,
"learning_rate": 1.0939025130160743e-06,
"loss": 0.7660707831382751,
"step": 1636
},
{
"epoch": 3.2629482071713145,
"grad_norm": 1.3489729166030884,
"learning_rate": 1.088785671673921e-06,
"loss": 0.4087866544723511,
"step": 1638
},
{
"epoch": 3.2669322709163344,
"grad_norm": 3.7537801265716553,
"learning_rate": 1.0836928346180481e-06,
"loss": 0.26779600977897644,
"step": 1640
},
{
"epoch": 3.2709163346613543,
"grad_norm": 1.0913664102554321,
"learning_rate": 1.0786240571347827e-06,
"loss": 0.11661072820425034,
"step": 1642
},
{
"epoch": 3.2749003984063743,
"grad_norm": 1.3544014692306519,
"learning_rate": 1.0735793942492676e-06,
"loss": 0.9415394067764282,
"step": 1644
},
{
"epoch": 3.278884462151394,
"grad_norm": 1.880513072013855,
"learning_rate": 1.068558900724865e-06,
"loss": 0.6600284576416016,
"step": 1646
},
{
"epoch": 3.2828685258964145,
"grad_norm": 2.517366647720337,
"learning_rate": 1.0635626310625637e-06,
"loss": 0.3240680694580078,
"step": 1648
},
{
"epoch": 3.2868525896414345,
"grad_norm": 0.825859010219574,
"learning_rate": 1.058590639500382e-06,
"loss": 0.6646403074264526,
"step": 1650
},
{
"epoch": 3.2908366533864544,
"grad_norm": 0.9859835505485535,
"learning_rate": 1.0536429800127851e-06,
"loss": 0.642147958278656,
"step": 1652
},
{
"epoch": 3.2948207171314743,
"grad_norm": 1.7152155637741089,
"learning_rate": 1.0487197063100961e-06,
"loss": 0.7060829401016235,
"step": 1654
},
{
"epoch": 3.298804780876494,
"grad_norm": 1.7756178379058838,
"learning_rate": 1.0438208718379124e-06,
"loss": 0.7361951470375061,
"step": 1656
},
{
"epoch": 3.302788844621514,
"grad_norm": 1.7107096910476685,
"learning_rate": 1.0389465297765253e-06,
"loss": 0.6126337647438049,
"step": 1658
},
{
"epoch": 3.306772908366534,
"grad_norm": 1.4858530759811401,
"learning_rate": 1.0340967330403468e-06,
"loss": 0.614052414894104,
"step": 1660
},
{
"epoch": 3.310756972111554,
"grad_norm": 6.398506164550781,
"learning_rate": 1.02927153427733e-06,
"loss": 0.6388739347457886,
"step": 1662
},
{
"epoch": 3.314741035856574,
"grad_norm": 4.903992652893066,
"learning_rate": 1.0244709858683996e-06,
"loss": 0.1377391517162323,
"step": 1664
},
{
"epoch": 3.318725099601594,
"grad_norm": 1.644950270652771,
"learning_rate": 1.0196951399268847e-06,
"loss": 0.3214379549026489,
"step": 1666
},
{
"epoch": 3.3227091633466137,
"grad_norm": 6.5153608322143555,
"learning_rate": 1.0149440482979503e-06,
"loss": 0.23638975620269775,
"step": 1668
},
{
"epoch": 3.3266932270916336,
"grad_norm": 1.4857839345932007,
"learning_rate": 1.0102177625580375e-06,
"loss": 0.22218865156173706,
"step": 1670
},
{
"epoch": 3.3306772908366535,
"grad_norm": 8.828252792358398,
"learning_rate": 1.0055163340143e-06,
"loss": 0.6645467877388,
"step": 1672
},
{
"epoch": 3.3346613545816735,
"grad_norm": 1.0999014377593994,
"learning_rate": 1.0008398137040507e-06,
"loss": 0.5620592832565308,
"step": 1674
},
{
"epoch": 3.3386454183266934,
"grad_norm": 2.528717279434204,
"learning_rate": 9.961882523942068e-07,
"loss": 0.6080818176269531,
"step": 1676
},
{
"epoch": 3.3426294820717133,
"grad_norm": 0.1366569846868515,
"learning_rate": 9.915617005807357e-07,
"loss": 0.01138792559504509,
"step": 1678
},
{
"epoch": 3.346613545816733,
"grad_norm": 5.231603622436523,
"learning_rate": 9.869602084881103e-07,
"loss": 0.29557374119758606,
"step": 1680
},
{
"epoch": 3.350597609561753,
"grad_norm": 0.9051138758659363,
"learning_rate": 9.823838260687635e-07,
"loss": 0.41544756293296814,
"step": 1682
},
{
"epoch": 3.354581673306773,
"grad_norm": 1.6163842678070068,
"learning_rate": 9.778326030025432e-07,
"loss": 0.45938849449157715,
"step": 1684
},
{
"epoch": 3.358565737051793,
"grad_norm": 7.820988178253174,
"learning_rate": 9.733065886961764e-07,
"loss": 0.4935106337070465,
"step": 1686
},
{
"epoch": 3.362549800796813,
"grad_norm": 1.3769513368606567,
"learning_rate": 9.688058322827313e-07,
"loss": 0.5252028107643127,
"step": 1688
},
{
"epoch": 3.366533864541833,
"grad_norm": 1.0140272378921509,
"learning_rate": 9.643303826210824e-07,
"loss": 0.7207529544830322,
"step": 1690
},
{
"epoch": 3.3705179282868527,
"grad_norm": 1.3448855876922607,
"learning_rate": 9.598802882953828e-07,
"loss": 0.7529066205024719,
"step": 1692
},
{
"epoch": 3.3745019920318726,
"grad_norm": 1.0818604230880737,
"learning_rate": 9.554555976145349e-07,
"loss": 0.6526249647140503,
"step": 1694
},
{
"epoch": 3.3784860557768925,
"grad_norm": 0.858180046081543,
"learning_rate": 9.510563586116686e-07,
"loss": 0.6609078645706177,
"step": 1696
},
{
"epoch": 3.3824701195219125,
"grad_norm": 1.1475756168365479,
"learning_rate": 9.466826190436147e-07,
"loss": 0.7812352180480957,
"step": 1698
},
{
"epoch": 3.3864541832669324,
"grad_norm": 2.1600332260131836,
"learning_rate": 9.423344263903926e-07,
"loss": 0.7400810122489929,
"step": 1700
},
{
"epoch": 3.3904382470119523,
"grad_norm": 1.9892051219940186,
"learning_rate": 9.380118278546906e-07,
"loss": 0.6348077654838562,
"step": 1702
},
{
"epoch": 3.394422310756972,
"grad_norm": 0.9929773211479187,
"learning_rate": 9.337148703613554e-07,
"loss": 0.6541098356246948,
"step": 1704
},
{
"epoch": 3.398406374501992,
"grad_norm": 5.213384628295898,
"learning_rate": 9.29443600556881e-07,
"loss": 0.20520062744617462,
"step": 1706
},
{
"epoch": 3.402390438247012,
"grad_norm": 1.8277703523635864,
"learning_rate": 9.251980648089045e-07,
"loss": 0.596899688243866,
"step": 1708
},
{
"epoch": 3.406374501992032,
"grad_norm": 0.9781650304794312,
"learning_rate": 9.209783092057025e-07,
"loss": 0.7202063202857971,
"step": 1710
},
{
"epoch": 3.410358565737052,
"grad_norm": 1.2887661457061768,
"learning_rate": 9.16784379555688e-07,
"loss": 0.668391764163971,
"step": 1712
},
{
"epoch": 3.414342629482072,
"grad_norm": 1.2524248361587524,
"learning_rate": 9.126163213869171e-07,
"loss": 0.6738901138305664,
"step": 1714
},
{
"epoch": 3.4183266932270917,
"grad_norm": 0.8974006772041321,
"learning_rate": 9.084741799465915e-07,
"loss": 0.6369835734367371,
"step": 1716
},
{
"epoch": 3.4223107569721116,
"grad_norm": 0.9271976351737976,
"learning_rate": 9.043580002005681e-07,
"loss": 0.7468122839927673,
"step": 1718
},
{
"epoch": 3.4262948207171315,
"grad_norm": 0.9398600459098816,
"learning_rate": 9.002678268328732e-07,
"loss": 0.6316313743591309,
"step": 1720
},
{
"epoch": 3.4302788844621515,
"grad_norm": 3.112898111343384,
"learning_rate": 8.962037042452146e-07,
"loss": 0.3467191755771637,
"step": 1722
},
{
"epoch": 3.4342629482071714,
"grad_norm": 0.8903955817222595,
"learning_rate": 8.921656765564998e-07,
"loss": 0.5496594309806824,
"step": 1724
},
{
"epoch": 3.4382470119521913,
"grad_norm": 2.7363598346710205,
"learning_rate": 8.881537876023597e-07,
"loss": 0.6846615076065063,
"step": 1726
},
{
"epoch": 3.442231075697211,
"grad_norm": 1.7913397550582886,
"learning_rate": 8.841680809346684e-07,
"loss": 0.4614332914352417,
"step": 1728
},
{
"epoch": 3.446215139442231,
"grad_norm": 2.286719560623169,
"learning_rate": 8.802085998210754e-07,
"loss": 0.6514830589294434,
"step": 1730
},
{
"epoch": 3.450199203187251,
"grad_norm": 1.2754535675048828,
"learning_rate": 8.762753872445316e-07,
"loss": 0.6596709489822388,
"step": 1732
},
{
"epoch": 3.454183266932271,
"grad_norm": 3.1973865032196045,
"learning_rate": 8.723684859028244e-07,
"loss": 0.3601575791835785,
"step": 1734
},
{
"epoch": 3.458167330677291,
"grad_norm": 1.0521482229232788,
"learning_rate": 8.684879382081163e-07,
"loss": 0.6533339023590088,
"step": 1736
},
{
"epoch": 3.462151394422311,
"grad_norm": 1.2476742267608643,
"learning_rate": 8.646337862864804e-07,
"loss": 0.7225340604782104,
"step": 1738
},
{
"epoch": 3.4661354581673307,
"grad_norm": 8.218664169311523,
"learning_rate": 8.608060719774452e-07,
"loss": 0.14243163168430328,
"step": 1740
},
{
"epoch": 3.4701195219123506,
"grad_norm": 0.8877552151679993,
"learning_rate": 8.570048368335411e-07,
"loss": 0.7387225031852722,
"step": 1742
},
{
"epoch": 3.4741035856573705,
"grad_norm": 0.26608389616012573,
"learning_rate": 8.532301221198491e-07,
"loss": 0.060973528772592545,
"step": 1744
},
{
"epoch": 3.4780876494023905,
"grad_norm": 1.655069351196289,
"learning_rate": 8.494819688135502e-07,
"loss": 0.6722233891487122,
"step": 1746
},
{
"epoch": 3.4820717131474104,
"grad_norm": 0.3229190409183502,
"learning_rate": 8.457604176034851e-07,
"loss": 0.16490302979946136,
"step": 1748
},
{
"epoch": 3.4860557768924303,
"grad_norm": 0.3072760999202728,
"learning_rate": 8.42065508889708e-07,
"loss": 0.06224316358566284,
"step": 1750
},
{
"epoch": 3.49003984063745,
"grad_norm": 1.0425161123275757,
"learning_rate": 8.383972827830517e-07,
"loss": 0.6595985293388367,
"step": 1752
},
{
"epoch": 3.49402390438247,
"grad_norm": 1.6916478872299194,
"learning_rate": 8.347557791046892e-07,
"loss": 0.18403995037078857,
"step": 1754
},
{
"epoch": 3.49800796812749,
"grad_norm": 0.8162530064582825,
"learning_rate": 8.311410373857033e-07,
"loss": 0.6693860292434692,
"step": 1756
},
{
"epoch": 3.50199203187251,
"grad_norm": 3.898818254470825,
"learning_rate": 8.275530968666578e-07,
"loss": 0.5436112880706787,
"step": 1758
},
{
"epoch": 3.50597609561753,
"grad_norm": 0.576738178730011,
"learning_rate": 8.239919964971689e-07,
"loss": 0.1252291202545166,
"step": 1760
},
{
"epoch": 3.50996015936255,
"grad_norm": 0.9629335403442383,
"learning_rate": 8.20457774935485e-07,
"loss": 0.2324841022491455,
"step": 1762
},
{
"epoch": 3.5139442231075697,
"grad_norm": 1.051251769065857,
"learning_rate": 8.16950470548067e-07,
"loss": 0.5175900459289551,
"step": 1764
},
{
"epoch": 3.5179282868525896,
"grad_norm": 5.374156951904297,
"learning_rate": 8.134701214091691e-07,
"loss": 0.19936859607696533,
"step": 1766
},
{
"epoch": 3.5219123505976095,
"grad_norm": 1.134244680404663,
"learning_rate": 8.100167653004285e-07,
"loss": 0.09222012758255005,
"step": 1768
},
{
"epoch": 3.5258964143426295,
"grad_norm": 1.0654293298721313,
"learning_rate": 8.065904397104543e-07,
"loss": 0.6717595458030701,
"step": 1770
},
{
"epoch": 3.5298804780876494,
"grad_norm": 2.4975504875183105,
"learning_rate": 8.031911818344201e-07,
"loss": 0.5180625915527344,
"step": 1772
},
{
"epoch": 3.5338645418326693,
"grad_norm": 0.9296510219573975,
"learning_rate": 7.998190285736589e-07,
"loss": 0.6407575607299805,
"step": 1774
},
{
"epoch": 3.537848605577689,
"grad_norm": 2.6143455505371094,
"learning_rate": 7.964740165352664e-07,
"loss": 0.6667947769165039,
"step": 1776
},
{
"epoch": 3.541832669322709,
"grad_norm": 0.19827701151371002,
"learning_rate": 7.931561820317005e-07,
"loss": 0.023438258096575737,
"step": 1778
},
{
"epoch": 3.545816733067729,
"grad_norm": 1.148992657661438,
"learning_rate": 7.898655610803869e-07,
"loss": 0.6734960675239563,
"step": 1780
},
{
"epoch": 3.549800796812749,
"grad_norm": 1.8085567951202393,
"learning_rate": 7.866021894033296e-07,
"loss": 0.6972249150276184,
"step": 1782
},
{
"epoch": 3.553784860557769,
"grad_norm": 2.9096920490264893,
"learning_rate": 7.833661024267235e-07,
"loss": 0.6476399302482605,
"step": 1784
},
{
"epoch": 3.557768924302789,
"grad_norm": 0.7224079966545105,
"learning_rate": 7.80157335280568e-07,
"loss": 0.9946411848068237,
"step": 1786
},
{
"epoch": 3.5617529880478087,
"grad_norm": 1.2070460319519043,
"learning_rate": 7.769759227982855e-07,
"loss": 0.711801290512085,
"step": 1788
},
{
"epoch": 3.5657370517928286,
"grad_norm": 2.714474678039551,
"learning_rate": 7.738218995163462e-07,
"loss": 0.15059031546115875,
"step": 1790
},
{
"epoch": 3.5697211155378485,
"grad_norm": 1.3999918699264526,
"learning_rate": 7.70695299673891e-07,
"loss": 0.139665424823761,
"step": 1792
},
{
"epoch": 3.5737051792828685,
"grad_norm": 0.37299129366874695,
"learning_rate": 7.67596157212359e-07,
"loss": 0.11374976485967636,
"step": 1794
},
{
"epoch": 3.5776892430278884,
"grad_norm": 0.8067252039909363,
"learning_rate": 7.645245057751201e-07,
"loss": 0.6304631233215332,
"step": 1796
},
{
"epoch": 3.5816733067729083,
"grad_norm": 1.578432559967041,
"learning_rate": 7.614803787071115e-07,
"loss": 0.22770892083644867,
"step": 1798
},
{
"epoch": 3.585657370517928,
"grad_norm": 3.3027656078338623,
"learning_rate": 7.584638090544717e-07,
"loss": 0.20699705183506012,
"step": 1800
},
{
"epoch": 3.589641434262948,
"grad_norm": 0.14634272456169128,
"learning_rate": 7.554748295641862e-07,
"loss": 0.055411506444215775,
"step": 1802
},
{
"epoch": 3.593625498007968,
"grad_norm": 1.2589038610458374,
"learning_rate": 7.525134726837289e-07,
"loss": 0.15108336508274078,
"step": 1804
},
{
"epoch": 3.597609561752988,
"grad_norm": 1.8965911865234375,
"learning_rate": 7.49579770560711e-07,
"loss": 0.4452376961708069,
"step": 1806
},
{
"epoch": 3.601593625498008,
"grad_norm": 1.1629970073699951,
"learning_rate": 7.46673755042531e-07,
"loss": 0.6423868536949158,
"step": 1808
},
{
"epoch": 3.605577689243028,
"grad_norm": 0.5293740630149841,
"learning_rate": 7.437954576760312e-07,
"loss": 0.21336103975772858,
"step": 1810
},
{
"epoch": 3.6095617529880477,
"grad_norm": 1.164920449256897,
"learning_rate": 7.409449097071536e-07,
"loss": 0.5466434359550476,
"step": 1812
},
{
"epoch": 3.6135458167330676,
"grad_norm": 1.1033563613891602,
"learning_rate": 7.381221420805999e-07,
"loss": 0.6399943232536316,
"step": 1814
},
{
"epoch": 3.6175298804780875,
"grad_norm": 1.056943416595459,
"learning_rate": 7.353271854394979e-07,
"loss": 0.5917325019836426,
"step": 1816
},
{
"epoch": 3.6215139442231075,
"grad_norm": 0.9444670677185059,
"learning_rate": 7.325600701250674e-07,
"loss": 0.7685708403587341,
"step": 1818
},
{
"epoch": 3.6254980079681274,
"grad_norm": 1.8602865934371948,
"learning_rate": 7.298208261762906e-07,
"loss": 0.45633015036582947,
"step": 1820
},
{
"epoch": 3.6294820717131473,
"grad_norm": 0.10787267237901688,
"learning_rate": 7.271094833295859e-07,
"loss": 0.011536069214344025,
"step": 1822
},
{
"epoch": 3.633466135458167,
"grad_norm": 0.2886284291744232,
"learning_rate": 7.244260710184868e-07,
"loss": 0.024275042116642,
"step": 1824
},
{
"epoch": 3.637450199203187,
"grad_norm": 0.6795600652694702,
"learning_rate": 7.21770618373321e-07,
"loss": 0.45940348505973816,
"step": 1826
},
{
"epoch": 3.641434262948207,
"grad_norm": 2.2104618549346924,
"learning_rate": 7.191431542208935e-07,
"loss": 0.6470014452934265,
"step": 1828
},
{
"epoch": 3.645418326693227,
"grad_norm": 1.12752103805542,
"learning_rate": 7.165437070841758e-07,
"loss": 0.7721574902534485,
"step": 1830
},
{
"epoch": 3.649402390438247,
"grad_norm": 6.11736536026001,
"learning_rate": 7.139723051819938e-07,
"loss": 0.5740348696708679,
"step": 1832
},
{
"epoch": 3.653386454183267,
"grad_norm": 0.4044356048107147,
"learning_rate": 7.114289764287227e-07,
"loss": 0.05502355471253395,
"step": 1834
},
{
"epoch": 3.6573705179282867,
"grad_norm": 4.303436279296875,
"learning_rate": 7.08913748433985e-07,
"loss": 0.17597807943820953,
"step": 1836
},
{
"epoch": 3.6613545816733066,
"grad_norm": 1.0884654521942139,
"learning_rate": 7.064266485023493e-07,
"loss": 0.6930414438247681,
"step": 1838
},
{
"epoch": 3.6653386454183265,
"grad_norm": 2.256512403488159,
"learning_rate": 7.039677036330331e-07,
"loss": 0.6587978601455688,
"step": 1840
},
{
"epoch": 3.6693227091633465,
"grad_norm": 0.19702738523483276,
"learning_rate": 7.015369405196132e-07,
"loss": 0.016245799139142036,
"step": 1842
},
{
"epoch": 3.6733067729083664,
"grad_norm": 0.9400996565818787,
"learning_rate": 6.991343855497312e-07,
"loss": 0.15207843482494354,
"step": 1844
},
{
"epoch": 3.6772908366533863,
"grad_norm": 1.0055437088012695,
"learning_rate": 6.967600648048113e-07,
"loss": 0.6164069175720215,
"step": 1846
},
{
"epoch": 3.681274900398406,
"grad_norm": 1.8582080602645874,
"learning_rate": 6.944140040597742e-07,
"loss": 0.7226882576942444,
"step": 1848
},
{
"epoch": 3.685258964143426,
"grad_norm": 1.656290054321289,
"learning_rate": 6.920962287827587e-07,
"loss": 0.07943466305732727,
"step": 1850
},
{
"epoch": 3.6892430278884465,
"grad_norm": 1.666813611984253,
"learning_rate": 6.898067641348459e-07,
"loss": 0.30842339992523193,
"step": 1852
},
{
"epoch": 3.6932270916334664,
"grad_norm": 0.8802257776260376,
"learning_rate": 6.875456349697834e-07,
"loss": 0.6316725611686707,
"step": 1854
},
{
"epoch": 3.6972111553784863,
"grad_norm": 2.5803232192993164,
"learning_rate": 6.853128658337188e-07,
"loss": 0.09659645706415176,
"step": 1856
},
{
"epoch": 3.7011952191235062,
"grad_norm": 1.351311206817627,
"learning_rate": 6.831084809649302e-07,
"loss": 0.6809911131858826,
"step": 1858
},
{
"epoch": 3.705179282868526,
"grad_norm": 1.1612941026687622,
"learning_rate": 6.809325042935666e-07,
"loss": 0.3540644943714142,
"step": 1860
},
{
"epoch": 3.709163346613546,
"grad_norm": 0.9889734387397766,
"learning_rate": 6.787849594413833e-07,
"loss": 0.6793351173400879,
"step": 1862
},
{
"epoch": 3.713147410358566,
"grad_norm": 1.0778642892837524,
"learning_rate": 6.766658697214906e-07,
"loss": 0.6664227247238159,
"step": 1864
},
{
"epoch": 3.717131474103586,
"grad_norm": 2.6285629272460938,
"learning_rate": 6.745752581380965e-07,
"loss": 0.33559897541999817,
"step": 1866
},
{
"epoch": 3.721115537848606,
"grad_norm": 1.0389450788497925,
"learning_rate": 6.72513147386261e-07,
"loss": 0.5156994462013245,
"step": 1868
},
{
"epoch": 3.7250996015936257,
"grad_norm": 0.9331614375114441,
"learning_rate": 6.704795598516451e-07,
"loss": 0.5414950251579285,
"step": 1870
},
{
"epoch": 3.7290836653386457,
"grad_norm": 1.0866365432739258,
"learning_rate": 6.684745176102714e-07,
"loss": 0.735094428062439,
"step": 1872
},
{
"epoch": 3.7330677290836656,
"grad_norm": 1.4017014503479004,
"learning_rate": 6.664980424282842e-07,
"loss": 0.2802731692790985,
"step": 1874
},
{
"epoch": 3.7370517928286855,
"grad_norm": 2.2784199714660645,
"learning_rate": 6.645501557617104e-07,
"loss": 0.5592929124832153,
"step": 1876
},
{
"epoch": 3.7410358565737054,
"grad_norm": 4.115759372711182,
"learning_rate": 6.626308787562294e-07,
"loss": 0.41764435172080994,
"step": 1878
},
{
"epoch": 3.7450199203187253,
"grad_norm": 0.9289363622665405,
"learning_rate": 6.607402322469429e-07,
"loss": 0.6480333209037781,
"step": 1880
},
{
"epoch": 3.7490039840637452,
"grad_norm": 2.0568838119506836,
"learning_rate": 6.588782367581475e-07,
"loss": 0.773093581199646,
"step": 1882
},
{
"epoch": 3.752988047808765,
"grad_norm": 3.918016195297241,
"learning_rate": 6.570449125031144e-07,
"loss": 0.5592324137687683,
"step": 1884
},
{
"epoch": 3.756972111553785,
"grad_norm": 0.8172755241394043,
"learning_rate": 6.552402793838667e-07,
"loss": 0.6393176913261414,
"step": 1886
},
{
"epoch": 3.760956175298805,
"grad_norm": 0.3844411075115204,
"learning_rate": 6.534643569909665e-07,
"loss": 0.08161535859107971,
"step": 1888
},
{
"epoch": 3.764940239043825,
"grad_norm": 2.660936117172241,
"learning_rate": 6.517171646032988e-07,
"loss": 0.7531623244285583,
"step": 1890
},
{
"epoch": 3.768924302788845,
"grad_norm": 2.1934661865234375,
"learning_rate": 6.499987211878666e-07,
"loss": 0.6893159747123718,
"step": 1892
},
{
"epoch": 3.7729083665338647,
"grad_norm": 1.1734172105789185,
"learning_rate": 6.483090453995811e-07,
"loss": 0.09743469953536987,
"step": 1894
},
{
"epoch": 3.7768924302788847,
"grad_norm": 1.5317673683166504,
"learning_rate": 6.466481555810608e-07,
"loss": 0.6921253204345703,
"step": 1896
},
{
"epoch": 3.7808764940239046,
"grad_norm": 0.8458757996559143,
"learning_rate": 6.450160697624327e-07,
"loss": 0.6649323105812073,
"step": 1898
},
{
"epoch": 3.7848605577689245,
"grad_norm": 1.0291515588760376,
"learning_rate": 6.434128056611361e-07,
"loss": 0.6685061454772949,
"step": 1900
},
{
"epoch": 3.7888446215139444,
"grad_norm": 0.8199694156646729,
"learning_rate": 6.418383806817298e-07,
"loss": 0.7103414535522461,
"step": 1902
},
{
"epoch": 3.7928286852589643,
"grad_norm": 0.8696004748344421,
"learning_rate": 6.40292811915704e-07,
"loss": 0.6235980987548828,
"step": 1904
},
{
"epoch": 3.7968127490039842,
"grad_norm": 2.7558107376098633,
"learning_rate": 6.387761161412942e-07,
"loss": 0.14641408622264862,
"step": 1906
},
{
"epoch": 3.800796812749004,
"grad_norm": 0.8049102425575256,
"learning_rate": 6.372883098232999e-07,
"loss": 0.6313645839691162,
"step": 1908
},
{
"epoch": 3.804780876494024,
"grad_norm": 1.0484040975570679,
"learning_rate": 6.358294091129044e-07,
"loss": 0.689453661441803,
"step": 1910
},
{
"epoch": 3.808764940239044,
"grad_norm": 1.3624324798583984,
"learning_rate": 6.34399429847501e-07,
"loss": 0.4293438196182251,
"step": 1912
},
{
"epoch": 3.812749003984064,
"grad_norm": 2.118128538131714,
"learning_rate": 6.329983875505202e-07,
"loss": 0.7885560989379883,
"step": 1914
},
{
"epoch": 3.816733067729084,
"grad_norm": 1.88889479637146,
"learning_rate": 6.316262974312607e-07,
"loss": 0.12458698451519012,
"step": 1916
},
{
"epoch": 3.8207171314741037,
"grad_norm": 2.0474905967712402,
"learning_rate": 6.302831743847255e-07,
"loss": 0.7278786897659302,
"step": 1918
},
{
"epoch": 3.8247011952191237,
"grad_norm": 1.8699114322662354,
"learning_rate": 6.289690329914599e-07,
"loss": 0.10339318215847015,
"step": 1920
},
{
"epoch": 3.8286852589641436,
"grad_norm": 0.9766838550567627,
"learning_rate": 6.276838875173931e-07,
"loss": 0.7524492144584656,
"step": 1922
},
{
"epoch": 3.8326693227091635,
"grad_norm": 0.34323349595069885,
"learning_rate": 6.264277519136821e-07,
"loss": 0.051684651523828506,
"step": 1924
},
{
"epoch": 3.8366533864541834,
"grad_norm": 1.1233506202697754,
"learning_rate": 6.252006398165622e-07,
"loss": 0.7036517262458801,
"step": 1926
},
{
"epoch": 3.8406374501992033,
"grad_norm": 1.529929757118225,
"learning_rate": 6.240025645471986e-07,
"loss": 0.8575693368911743,
"step": 1928
},
{
"epoch": 3.8446215139442232,
"grad_norm": 0.11210882663726807,
"learning_rate": 6.228335391115402e-07,
"loss": 0.02451253868639469,
"step": 1930
},
{
"epoch": 3.848605577689243,
"grad_norm": 1.864715576171875,
"learning_rate": 6.216935762001803e-07,
"loss": 0.5305463671684265,
"step": 1932
},
{
"epoch": 3.852589641434263,
"grad_norm": 1.8157854080200195,
"learning_rate": 6.205826881882179e-07,
"loss": 0.13252875208854675,
"step": 1934
},
{
"epoch": 3.856573705179283,
"grad_norm": 0.9740794897079468,
"learning_rate": 6.195008871351232e-07,
"loss": 0.7859750986099243,
"step": 1936
},
{
"epoch": 3.860557768924303,
"grad_norm": 1.070713758468628,
"learning_rate": 6.184481847846074e-07,
"loss": 0.7027934789657593,
"step": 1938
},
{
"epoch": 3.864541832669323,
"grad_norm": 1.440918207168579,
"learning_rate": 6.174245925644948e-07,
"loss": 0.30577710270881653,
"step": 1940
},
{
"epoch": 3.8685258964143427,
"grad_norm": 2.0320322513580322,
"learning_rate": 6.164301215865982e-07,
"loss": 0.9369683265686035,
"step": 1942
},
{
"epoch": 3.8725099601593627,
"grad_norm": 0.6125801801681519,
"learning_rate": 6.154647826465999e-07,
"loss": 0.03845952823758125,
"step": 1944
},
{
"epoch": 3.8764940239043826,
"grad_norm": 3.9984986782073975,
"learning_rate": 6.145285862239327e-07,
"loss": 0.6496099233627319,
"step": 1946
},
{
"epoch": 3.8804780876494025,
"grad_norm": 0.08795814216136932,
"learning_rate": 6.136215424816668e-07,
"loss": 0.04779617115855217,
"step": 1948
},
{
"epoch": 3.8844621513944224,
"grad_norm": 0.9127535820007324,
"learning_rate": 6.127436612664e-07,
"loss": 0.6776239275932312,
"step": 1950
},
{
"epoch": 3.8884462151394423,
"grad_norm": 1.5462641716003418,
"learning_rate": 6.118949521081495e-07,
"loss": 0.7221356630325317,
"step": 1952
},
{
"epoch": 3.8924302788844622,
"grad_norm": 0.6864924430847168,
"learning_rate": 6.11075424220251e-07,
"loss": 0.6018074154853821,
"step": 1954
},
{
"epoch": 3.896414342629482,
"grad_norm": 8.130626678466797,
"learning_rate": 6.102850864992553e-07,
"loss": 0.15544459223747253,
"step": 1956
},
{
"epoch": 3.900398406374502,
"grad_norm": 1.5887444019317627,
"learning_rate": 6.095239475248345e-07,
"loss": 0.5947393178939819,
"step": 1958
},
{
"epoch": 3.904382470119522,
"grad_norm": 0.9882814288139343,
"learning_rate": 6.087920155596867e-07,
"loss": 0.016275843605399132,
"step": 1960
},
{
"epoch": 3.908366533864542,
"grad_norm": 0.3859656751155853,
"learning_rate": 6.080892985494482e-07,
"loss": 0.04228988662362099,
"step": 1962
},
{
"epoch": 3.912350597609562,
"grad_norm": 1.2562545537948608,
"learning_rate": 6.074158041226068e-07,
"loss": 0.6111615300178528,
"step": 1964
},
{
"epoch": 3.9163346613545817,
"grad_norm": 3.6256649494171143,
"learning_rate": 6.067715395904173e-07,
"loss": 0.6986129283905029,
"step": 1966
},
{
"epoch": 3.9203187250996017,
"grad_norm": 1.0995627641677856,
"learning_rate": 6.061565119468247e-07,
"loss": 0.7141016125679016,
"step": 1968
},
{
"epoch": 3.9243027888446216,
"grad_norm": 2.30956768989563,
"learning_rate": 6.055707278683863e-07,
"loss": 0.22550952434539795,
"step": 1970
},
{
"epoch": 3.9282868525896415,
"grad_norm": 1.4764176607131958,
"learning_rate": 6.050141937142003e-07,
"loss": 0.1283264309167862,
"step": 1972
},
{
"epoch": 3.9322709163346614,
"grad_norm": 0.9012427926063538,
"learning_rate": 6.04486915525836e-07,
"loss": 0.8311380743980408,
"step": 1974
},
{
"epoch": 3.9362549800796813,
"grad_norm": 1.559435486793518,
"learning_rate": 6.039888990272691e-07,
"loss": 0.1916397362947464,
"step": 1976
},
{
"epoch": 3.9402390438247012,
"grad_norm": 0.8929998874664307,
"learning_rate": 6.035201496248188e-07,
"loss": 0.6807030439376831,
"step": 1978
},
{
"epoch": 3.944223107569721,
"grad_norm": 0.25589969754219055,
"learning_rate": 6.030806724070893e-07,
"loss": 0.07943480461835861,
"step": 1980
},
{
"epoch": 3.948207171314741,
"grad_norm": 1.3471908569335938,
"learning_rate": 6.026704721449152e-07,
"loss": 0.805228590965271,
"step": 1982
},
{
"epoch": 3.952191235059761,
"grad_norm": 0.9127321243286133,
"learning_rate": 6.022895532913081e-07,
"loss": 0.6197107434272766,
"step": 1984
},
{
"epoch": 3.956175298804781,
"grad_norm": 2.661827802658081,
"learning_rate": 6.019379199814108e-07,
"loss": 0.49690714478492737,
"step": 1986
},
{
"epoch": 3.960159362549801,
"grad_norm": 0.08383038640022278,
"learning_rate": 6.016155760324495e-07,
"loss": 0.00437126774340868,
"step": 1988
},
{
"epoch": 3.9641434262948207,
"grad_norm": 0.9041069746017456,
"learning_rate": 6.013225249436945e-07,
"loss": 0.7191581726074219,
"step": 1990
},
{
"epoch": 3.9681274900398407,
"grad_norm": 1.6254363059997559,
"learning_rate": 6.010587698964216e-07,
"loss": 0.5217870473861694,
"step": 1992
},
{
"epoch": 3.9721115537848606,
"grad_norm": 1.7610574960708618,
"learning_rate": 6.008243137538774e-07,
"loss": 0.7896353006362915,
"step": 1994
},
{
"epoch": 3.9760956175298805,
"grad_norm": 0.506505012512207,
"learning_rate": 6.006191590612478e-07,
"loss": 0.06072104722261429,
"step": 1996
},
{
"epoch": 3.9800796812749004,
"grad_norm": 1.679490566253662,
"learning_rate": 6.004433080456312e-07,
"loss": 0.0873764306306839,
"step": 1998
},
{
"epoch": 3.9840637450199203,
"grad_norm": 1.07437002658844,
"learning_rate": 6.002967626160147e-07,
"loss": 0.6510695219039917,
"step": 2000
},
{
"epoch": 3.9880478087649402,
"grad_norm": 1.063508152961731,
"learning_rate": 6.001795243632514e-07,
"loss": 0.6352625489234924,
"step": 2002
},
{
"epoch": 3.99203187250996,
"grad_norm": 0.9537666440010071,
"learning_rate": 6.00091594560045e-07,
"loss": 0.7177177667617798,
"step": 2004
},
{
"epoch": 3.99601593625498,
"grad_norm": 4.541738986968994,
"learning_rate": 6.000329741609355e-07,
"loss": 0.23844213783740997,
"step": 2006
},
{
"epoch": 4.0,
"grad_norm": 0.5011924505233765,
"learning_rate": 6.000036638022886e-07,
"loss": 0.15317194163799286,
"step": 2008
},
{
"epoch": 4.0,
"step": 2008,
"total_flos": 3.519329208629199e+18,
"train_loss": 0.7788769946752703,
"train_runtime": 8944.5824,
"train_samples_per_second": 6.735,
"train_steps_per_second": 0.224
}
],
"logging_steps": 2,
"max_steps": 2008,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.519329208629199e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}