{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8472906403940885, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003082297338949964, "grad_norm": 0.5791122913360596, "learning_rate": 1.2325390304026295e-05, "loss": 11.5149, "step": 3 }, { "epoch": 0.0006164594677899928, "grad_norm": 1.7733268737792969, "learning_rate": 2.465078060805259e-05, "loss": 11.504, "step": 6 }, { "epoch": 0.0009246892016849893, "grad_norm": 3.047468423843384, "learning_rate": 3.697617091207888e-05, "loss": 11.4391, "step": 9 }, { "epoch": 0.0012329189355799856, "grad_norm": 3.346104621887207, "learning_rate": 4.930156121610518e-05, "loss": 11.2635, "step": 12 }, { "epoch": 0.001541148669474982, "grad_norm": 3.2847304344177246, "learning_rate": 6.162695152013147e-05, "loss": 10.9864, "step": 15 }, { "epoch": 0.0018493784033699785, "grad_norm": 3.2979729175567627, "learning_rate": 7.395234182415776e-05, "loss": 10.6487, "step": 18 }, { "epoch": 0.0021576081372649747, "grad_norm": 3.445396661758423, "learning_rate": 8.627773212818406e-05, "loss": 10.2667, "step": 21 }, { "epoch": 0.0024658378711599712, "grad_norm": 3.620011568069458, "learning_rate": 9.860312243221036e-05, "loss": 9.8679, "step": 24 }, { "epoch": 0.002774067605054968, "grad_norm": 3.66719913482666, "learning_rate": 0.00011092851273623665, "loss": 9.4532, "step": 27 }, { "epoch": 0.003082297338949964, "grad_norm": 3.709792375564575, "learning_rate": 0.00012325390304026294, "loss": 9.0393, "step": 30 }, { "epoch": 0.0033905270728449605, "grad_norm": 3.5757813453674316, "learning_rate": 0.00013557929334428925, "loss": 8.6449, "step": 33 }, { "epoch": 0.003698756806739957, "grad_norm": 3.200732707977295, "learning_rate": 0.00014790468364831553, "loss": 8.2932, "step": 36 }, { "epoch": 0.004006986540634953, "grad_norm": 2.7463502883911133, "learning_rate": 0.0001602300739523418, "loss": 8.0151, "step": 39 }, { "epoch": 0.004315216274529949, "grad_norm": 1.961776852607727, "learning_rate": 0.00017255546425636812, "loss": 7.7988, "step": 42 }, { "epoch": 0.004623446008424946, "grad_norm": 1.5326848030090332, "learning_rate": 0.0001848808545603944, "loss": 7.6688, "step": 45 }, { "epoch": 0.0049316757423199424, "grad_norm": 2.266148090362549, "learning_rate": 0.00019720624486442071, "loss": 7.6276, "step": 48 }, { "epoch": 0.005239905476214939, "grad_norm": 2.563206672668457, "learning_rate": 0.000209531635168447, "loss": 7.5954, "step": 51 }, { "epoch": 0.005548135210109936, "grad_norm": 1.9326294660568237, "learning_rate": 0.0002218570254724733, "loss": 7.5136, "step": 54 }, { "epoch": 0.005856364944004932, "grad_norm": 1.0131096839904785, "learning_rate": 0.0002341824157764996, "loss": 7.4167, "step": 57 }, { "epoch": 0.006164594677899928, "grad_norm": 1.1923354864120483, "learning_rate": 0.00024650780608052587, "loss": 7.3273, "step": 60 }, { "epoch": 0.006472824411794925, "grad_norm": 1.2690105438232422, "learning_rate": 0.00025883319638455215, "loss": 7.2615, "step": 63 }, { "epoch": 0.006781054145689921, "grad_norm": 0.8375588059425354, "learning_rate": 0.0002711585866885785, "loss": 7.2088, "step": 66 }, { "epoch": 0.007089283879584917, "grad_norm": 0.719011127948761, "learning_rate": 0.0002834839769926048, "loss": 7.1474, "step": 69 }, { "epoch": 0.007397513613479914, "grad_norm": 0.7095780372619629, "learning_rate": 0.00029580936729663106, "loss": 7.0923, "step": 72 }, { "epoch": 0.00770574334737491, "grad_norm": 0.5094121098518372, "learning_rate": 0.00030813475760065734, "loss": 7.0406, "step": 75 }, { "epoch": 0.008013973081269906, "grad_norm": 0.4085525572299957, "learning_rate": 0.0003204601479046836, "loss": 6.9586, "step": 78 }, { "epoch": 0.008322202815164903, "grad_norm": 0.4122146666049957, "learning_rate": 0.0003327855382087099, "loss": 6.9295, "step": 81 }, { "epoch": 0.008630432549059899, "grad_norm": 0.41669413447380066, "learning_rate": 0.00034511092851273624, "loss": 6.8629, "step": 84 }, { "epoch": 0.008938662282954896, "grad_norm": 0.40379494428634644, "learning_rate": 0.0003574363188167625, "loss": 6.7987, "step": 87 }, { "epoch": 0.009246892016849893, "grad_norm": 0.4202142059803009, "learning_rate": 0.0003697617091207888, "loss": 6.7779, "step": 90 }, { "epoch": 0.009555121750744888, "grad_norm": 0.3880140781402588, "learning_rate": 0.0003820870994248151, "loss": 6.7204, "step": 93 }, { "epoch": 0.009863351484639885, "grad_norm": 0.419883668422699, "learning_rate": 0.00039441248972884143, "loss": 6.6553, "step": 96 }, { "epoch": 0.010171581218534882, "grad_norm": 0.4570387601852417, "learning_rate": 0.0004067378800328677, "loss": 6.5936, "step": 99 }, { "epoch": 0.010479810952429877, "grad_norm": 0.5889524221420288, "learning_rate": 0.000419063270336894, "loss": 6.5271, "step": 102 }, { "epoch": 0.010788040686324874, "grad_norm": 0.5578189492225647, "learning_rate": 0.0004313886606409203, "loss": 6.4701, "step": 105 }, { "epoch": 0.011096270420219871, "grad_norm": 0.4710846245288849, "learning_rate": 0.0004437140509449466, "loss": 6.4254, "step": 108 }, { "epoch": 0.011404500154114866, "grad_norm": 0.30558162927627563, "learning_rate": 0.0004560394412489729, "loss": 6.4217, "step": 111 }, { "epoch": 0.011712729888009863, "grad_norm": 0.3634621500968933, "learning_rate": 0.0004683648315529992, "loss": 6.3422, "step": 114 }, { "epoch": 0.01202095962190486, "grad_norm": 0.369437575340271, "learning_rate": 0.00048069022185702546, "loss": 6.303, "step": 117 }, { "epoch": 0.012329189355799856, "grad_norm": 0.7418856024742126, "learning_rate": 0.0004930156121610517, "loss": 6.2535, "step": 120 }, { "epoch": 0.012637419089694853, "grad_norm": 0.855678915977478, "learning_rate": 0.000505341002465078, "loss": 6.2037, "step": 123 }, { "epoch": 0.01294564882358985, "grad_norm": 0.42414671182632446, "learning_rate": 0.0005176663927691043, "loss": 6.175, "step": 126 }, { "epoch": 0.013253878557484845, "grad_norm": 0.44572392106056213, "learning_rate": 0.0005299917830731307, "loss": 6.1548, "step": 129 }, { "epoch": 0.013562108291379842, "grad_norm": 0.24437515437602997, "learning_rate": 0.000542317173377157, "loss": 6.0967, "step": 132 }, { "epoch": 0.013870338025274839, "grad_norm": 0.4750615954399109, "learning_rate": 0.0005546425636811833, "loss": 6.0776, "step": 135 }, { "epoch": 0.014178567759169834, "grad_norm": 0.9504273533821106, "learning_rate": 0.0005669679539852095, "loss": 6.0471, "step": 138 }, { "epoch": 0.014486797493064831, "grad_norm": 0.5452646613121033, "learning_rate": 0.0005792933442892358, "loss": 5.9987, "step": 141 }, { "epoch": 0.014795027226959828, "grad_norm": 0.4314074218273163, "learning_rate": 0.0005916187345932621, "loss": 5.9874, "step": 144 }, { "epoch": 0.015103256960854823, "grad_norm": 0.48243793845176697, "learning_rate": 0.0006039441248972884, "loss": 5.962, "step": 147 }, { "epoch": 0.01541148669474982, "grad_norm": 0.7570855617523193, "learning_rate": 0.0006162695152013147, "loss": 5.921, "step": 150 }, { "epoch": 0.015719716428644816, "grad_norm": 0.4731276333332062, "learning_rate": 0.000628594905505341, "loss": 5.9032, "step": 153 }, { "epoch": 0.016027946162539813, "grad_norm": 0.3078465759754181, "learning_rate": 0.0006409202958093672, "loss": 5.8932, "step": 156 }, { "epoch": 0.01633617589643481, "grad_norm": 0.3441495895385742, "learning_rate": 0.0006532456861133935, "loss": 5.8695, "step": 159 }, { "epoch": 0.016644405630329807, "grad_norm": 0.43056854605674744, "learning_rate": 0.0006655710764174198, "loss": 5.8642, "step": 162 }, { "epoch": 0.016952635364224804, "grad_norm": 1.3783445358276367, "learning_rate": 0.0006778964667214461, "loss": 5.845, "step": 165 }, { "epoch": 0.017260865098119797, "grad_norm": 0.77984619140625, "learning_rate": 0.0006902218570254725, "loss": 5.8216, "step": 168 }, { "epoch": 0.017569094832014794, "grad_norm": 0.2990098297595978, "learning_rate": 0.0007025472473294988, "loss": 5.7801, "step": 171 }, { "epoch": 0.01787732456590979, "grad_norm": 0.29485219717025757, "learning_rate": 0.000714872637633525, "loss": 5.7757, "step": 174 }, { "epoch": 0.018185554299804788, "grad_norm": 0.4363936483860016, "learning_rate": 0.0007271980279375513, "loss": 5.7598, "step": 177 }, { "epoch": 0.018493784033699785, "grad_norm": 0.8902605175971985, "learning_rate": 0.0007395234182415776, "loss": 5.7719, "step": 180 }, { "epoch": 0.018802013767594782, "grad_norm": 0.7133249044418335, "learning_rate": 0.0007518488085456039, "loss": 5.7568, "step": 183 }, { "epoch": 0.019110243501489776, "grad_norm": 0.9105846881866455, "learning_rate": 0.0007641741988496302, "loss": 5.7191, "step": 186 }, { "epoch": 0.019418473235384773, "grad_norm": 0.9680726528167725, "learning_rate": 0.0007764995891536565, "loss": 5.7077, "step": 189 }, { "epoch": 0.01972670296927977, "grad_norm": 0.535446047782898, "learning_rate": 0.0007888249794576829, "loss": 5.6982, "step": 192 }, { "epoch": 0.020034932703174767, "grad_norm": 0.7894541621208191, "learning_rate": 0.0008011503697617091, "loss": 5.668, "step": 195 }, { "epoch": 0.020343162437069764, "grad_norm": 0.6975138187408447, "learning_rate": 0.0008134757600657354, "loss": 5.6432, "step": 198 }, { "epoch": 0.02065139217096476, "grad_norm": 0.6306262016296387, "learning_rate": 0.0008258011503697617, "loss": 5.6647, "step": 201 }, { "epoch": 0.020959621904859754, "grad_norm": 0.5615081787109375, "learning_rate": 0.000838126540673788, "loss": 5.6208, "step": 204 }, { "epoch": 0.02126785163875475, "grad_norm": 0.6468993425369263, "learning_rate": 0.0008504519309778143, "loss": 5.6272, "step": 207 }, { "epoch": 0.02157608137264975, "grad_norm": 0.8359414339065552, "learning_rate": 0.0008627773212818406, "loss": 5.6114, "step": 210 }, { "epoch": 0.021884311106544745, "grad_norm": 0.8909689784049988, "learning_rate": 0.0008751027115858668, "loss": 5.5957, "step": 213 }, { "epoch": 0.022192540840439742, "grad_norm": 0.39673465490341187, "learning_rate": 0.0008874281018898932, "loss": 5.568, "step": 216 }, { "epoch": 0.022500770574334736, "grad_norm": 0.9037743806838989, "learning_rate": 0.0008997534921939195, "loss": 5.5746, "step": 219 }, { "epoch": 0.022809000308229733, "grad_norm": 0.6929497122764587, "learning_rate": 0.0009120788824979458, "loss": 5.5482, "step": 222 }, { "epoch": 0.02311723004212473, "grad_norm": 0.5773665308952332, "learning_rate": 0.0009244042728019721, "loss": 5.5389, "step": 225 }, { "epoch": 0.023425459776019727, "grad_norm": 0.9532020092010498, "learning_rate": 0.0009367296631059984, "loss": 5.5369, "step": 228 }, { "epoch": 0.023733689509914724, "grad_norm": 1.2347012758255005, "learning_rate": 0.0009490550534100246, "loss": 5.5251, "step": 231 }, { "epoch": 0.02404191924380972, "grad_norm": 1.2062091827392578, "learning_rate": 0.0009613804437140509, "loss": 5.5042, "step": 234 }, { "epoch": 0.024350148977704714, "grad_norm": 1.4920969009399414, "learning_rate": 0.0009737058340180772, "loss": 5.4851, "step": 237 }, { "epoch": 0.02465837871159971, "grad_norm": 0.5619600415229797, "learning_rate": 0.0009860312243221035, "loss": 5.4889, "step": 240 }, { "epoch": 0.02496660844549471, "grad_norm": 0.8607615828514099, "learning_rate": 0.0009983566146261299, "loss": 5.4747, "step": 243 }, { "epoch": 0.025274838179389705, "grad_norm": 0.6228588223457336, "learning_rate": 0.001010682004930156, "loss": 5.4502, "step": 246 }, { "epoch": 0.025583067913284702, "grad_norm": 1.1925005912780762, "learning_rate": 0.0010230073952341824, "loss": 5.4449, "step": 249 }, { "epoch": 0.0258912976471797, "grad_norm": 0.7956414818763733, "learning_rate": 0.0010353327855382086, "loss": 5.4623, "step": 252 }, { "epoch": 0.026199527381074693, "grad_norm": 0.654242992401123, "learning_rate": 0.001047658175842235, "loss": 5.4287, "step": 255 }, { "epoch": 0.02650775711496969, "grad_norm": 0.592880368232727, "learning_rate": 0.0010599835661462614, "loss": 5.3891, "step": 258 }, { "epoch": 0.026815986848864687, "grad_norm": 0.9015865921974182, "learning_rate": 0.0010723089564502876, "loss": 5.4127, "step": 261 }, { "epoch": 0.027124216582759684, "grad_norm": 0.593488335609436, "learning_rate": 0.001084634346754314, "loss": 5.3887, "step": 264 }, { "epoch": 0.02743244631665468, "grad_norm": 0.7008156180381775, "learning_rate": 0.0010969597370583401, "loss": 5.386, "step": 267 }, { "epoch": 0.027740676050549678, "grad_norm": 0.32653194665908813, "learning_rate": 0.0011092851273623665, "loss": 5.3479, "step": 270 }, { "epoch": 0.02804890578444467, "grad_norm": 0.551142692565918, "learning_rate": 0.0011216105176663927, "loss": 5.3613, "step": 273 }, { "epoch": 0.02835713551833967, "grad_norm": 2.3521084785461426, "learning_rate": 0.001133935907970419, "loss": 5.3653, "step": 276 }, { "epoch": 0.028665365252234665, "grad_norm": 1.3452407121658325, "learning_rate": 0.0011462612982744455, "loss": 5.3559, "step": 279 }, { "epoch": 0.028973594986129662, "grad_norm": 1.0670260190963745, "learning_rate": 0.0011585866885784717, "loss": 5.3299, "step": 282 }, { "epoch": 0.02928182472002466, "grad_norm": 0.7768902778625488, "learning_rate": 0.001170912078882498, "loss": 5.3346, "step": 285 }, { "epoch": 0.029590054453919656, "grad_norm": 0.48641496896743774, "learning_rate": 0.0011832374691865242, "loss": 5.3178, "step": 288 }, { "epoch": 0.02989828418781465, "grad_norm": 0.5284126400947571, "learning_rate": 0.0011955628594905506, "loss": 5.3061, "step": 291 }, { "epoch": 0.030206513921709647, "grad_norm": 0.9099608659744263, "learning_rate": 0.0012078882497945768, "loss": 5.2764, "step": 294 }, { "epoch": 0.030514743655604644, "grad_norm": 0.7352691888809204, "learning_rate": 0.0012202136400986032, "loss": 5.2853, "step": 297 }, { "epoch": 0.03082297338949964, "grad_norm": 0.8361043334007263, "learning_rate": 0.0012325390304026294, "loss": 5.2838, "step": 300 }, { "epoch": 0.031131203123394638, "grad_norm": 1.525067925453186, "learning_rate": 0.0012448644207066558, "loss": 5.2612, "step": 303 }, { "epoch": 0.03143943285728963, "grad_norm": 0.6117688417434692, "learning_rate": 0.001257189811010682, "loss": 5.2488, "step": 306 }, { "epoch": 0.03174766259118463, "grad_norm": 0.9976358413696289, "learning_rate": 0.0012695152013147083, "loss": 5.2327, "step": 309 }, { "epoch": 0.032055892325079625, "grad_norm": 0.8152816891670227, "learning_rate": 0.0012818405916187345, "loss": 5.2095, "step": 312 }, { "epoch": 0.03236412205897462, "grad_norm": 0.8640046715736389, "learning_rate": 0.0012941659819227609, "loss": 5.1932, "step": 315 }, { "epoch": 0.03267235179286962, "grad_norm": 0.9461572170257568, "learning_rate": 0.001306491372226787, "loss": 5.1822, "step": 318 }, { "epoch": 0.03298058152676461, "grad_norm": 0.7717807292938232, "learning_rate": 0.0013188167625308134, "loss": 5.183, "step": 321 }, { "epoch": 0.03328881126065961, "grad_norm": 0.9057526588439941, "learning_rate": 0.0013311421528348396, "loss": 5.1686, "step": 324 }, { "epoch": 0.03359704099455461, "grad_norm": 0.5352618098258972, "learning_rate": 0.001343467543138866, "loss": 5.1378, "step": 327 }, { "epoch": 0.03390527072844961, "grad_norm": 1.2399810552597046, "learning_rate": 0.0013557929334428922, "loss": 5.1436, "step": 330 }, { "epoch": 0.0342135004623446, "grad_norm": 0.6678963303565979, "learning_rate": 0.0013681183237469186, "loss": 5.1488, "step": 333 }, { "epoch": 0.034521730196239594, "grad_norm": 0.6166791915893555, "learning_rate": 0.001380443714050945, "loss": 5.1239, "step": 336 }, { "epoch": 0.034829959930134595, "grad_norm": 1.1305850744247437, "learning_rate": 0.0013927691043549711, "loss": 5.1145, "step": 339 }, { "epoch": 0.03513818966402959, "grad_norm": 0.46510085463523865, "learning_rate": 0.0014050944946589975, "loss": 5.1041, "step": 342 }, { "epoch": 0.03544641939792459, "grad_norm": 0.4835362136363983, "learning_rate": 0.0014174198849630237, "loss": 5.0699, "step": 345 }, { "epoch": 0.03575464913181958, "grad_norm": 0.6595330238342285, "learning_rate": 0.00142974527526705, "loss": 5.0744, "step": 348 }, { "epoch": 0.036062878865714576, "grad_norm": 0.7306437492370605, "learning_rate": 0.0014420706655710763, "loss": 5.0703, "step": 351 }, { "epoch": 0.036371108599609576, "grad_norm": 0.5263068079948425, "learning_rate": 0.0014543960558751027, "loss": 5.0666, "step": 354 }, { "epoch": 0.03667933833350457, "grad_norm": 0.5896726250648499, "learning_rate": 0.001466721446179129, "loss": 5.0326, "step": 357 }, { "epoch": 0.03698756806739957, "grad_norm": 0.9357500672340393, "learning_rate": 0.0014790468364831552, "loss": 5.0363, "step": 360 }, { "epoch": 0.037295797801294564, "grad_norm": 0.7629897594451904, "learning_rate": 0.0014913722267871816, "loss": 5.0304, "step": 363 }, { "epoch": 0.037604027535189564, "grad_norm": 0.6347280144691467, "learning_rate": 0.0015036976170912078, "loss": 4.9962, "step": 366 }, { "epoch": 0.03791225726908456, "grad_norm": 0.4810947775840759, "learning_rate": 0.0015160230073952342, "loss": 4.9856, "step": 369 }, { "epoch": 0.03822048700297955, "grad_norm": 0.5907162427902222, "learning_rate": 0.0015283483976992604, "loss": 4.9712, "step": 372 }, { "epoch": 0.03852871673687455, "grad_norm": 0.5781192183494568, "learning_rate": 0.0015406737880032868, "loss": 5.0078, "step": 375 }, { "epoch": 0.038836946470769546, "grad_norm": 0.6017566323280334, "learning_rate": 0.001552999178307313, "loss": 4.9563, "step": 378 }, { "epoch": 0.039145176204664546, "grad_norm": 1.208348035812378, "learning_rate": 0.0015653245686113393, "loss": 4.9696, "step": 381 }, { "epoch": 0.03945340593855954, "grad_norm": 0.6113926768302917, "learning_rate": 0.0015776499589153657, "loss": 4.9461, "step": 384 }, { "epoch": 0.03976163567245453, "grad_norm": 0.6794010996818542, "learning_rate": 0.0015899753492193919, "loss": 4.9668, "step": 387 }, { "epoch": 0.040069865406349534, "grad_norm": 0.4383271038532257, "learning_rate": 0.0016023007395234183, "loss": 4.9283, "step": 390 }, { "epoch": 0.04037809514024453, "grad_norm": 0.9564613699913025, "learning_rate": 0.0016146261298274444, "loss": 4.8814, "step": 393 }, { "epoch": 0.04068632487413953, "grad_norm": 0.6730177402496338, "learning_rate": 0.0016269515201314708, "loss": 4.9158, "step": 396 }, { "epoch": 0.04099455460803452, "grad_norm": 0.5306158661842346, "learning_rate": 0.001639276910435497, "loss": 4.904, "step": 399 }, { "epoch": 0.04130278434192952, "grad_norm": 0.48708540201187134, "learning_rate": 0.0016516023007395234, "loss": 4.9002, "step": 402 }, { "epoch": 0.041611014075824515, "grad_norm": 0.4917944371700287, "learning_rate": 0.0016639276910435496, "loss": 4.8913, "step": 405 }, { "epoch": 0.04191924380971951, "grad_norm": 1.0929678678512573, "learning_rate": 0.001676253081347576, "loss": 4.8986, "step": 408 }, { "epoch": 0.04222747354361451, "grad_norm": 0.5417898297309875, "learning_rate": 0.0016885784716516024, "loss": 4.8702, "step": 411 }, { "epoch": 0.0425357032775095, "grad_norm": 1.1427472829818726, "learning_rate": 0.0017009038619556285, "loss": 4.8396, "step": 414 }, { "epoch": 0.0428439330114045, "grad_norm": 0.8225170969963074, "learning_rate": 0.001713229252259655, "loss": 4.8439, "step": 417 }, { "epoch": 0.0431521627452995, "grad_norm": 0.5638198256492615, "learning_rate": 0.001725554642563681, "loss": 4.8271, "step": 420 }, { "epoch": 0.04346039247919449, "grad_norm": 0.3389821946620941, "learning_rate": 0.0017378800328677075, "loss": 4.8207, "step": 423 }, { "epoch": 0.04376862221308949, "grad_norm": 0.38620057702064514, "learning_rate": 0.0017502054231717337, "loss": 4.8082, "step": 426 }, { "epoch": 0.044076851946984484, "grad_norm": 1.1568442583084106, "learning_rate": 0.00176253081347576, "loss": 4.7946, "step": 429 }, { "epoch": 0.044385081680879485, "grad_norm": 0.650175154209137, "learning_rate": 0.0017748562037797865, "loss": 4.7798, "step": 432 }, { "epoch": 0.04469331141477448, "grad_norm": 0.5364396572113037, "learning_rate": 0.0017871815940838126, "loss": 4.7732, "step": 435 }, { "epoch": 0.04500154114866947, "grad_norm": 0.7013806700706482, "learning_rate": 0.001799506984387839, "loss": 4.7733, "step": 438 }, { "epoch": 0.04530977088256447, "grad_norm": 0.4559784233570099, "learning_rate": 0.0018118323746918652, "loss": 4.7789, "step": 441 }, { "epoch": 0.045618000616459466, "grad_norm": 0.3456243872642517, "learning_rate": 0.0018241577649958916, "loss": 4.7456, "step": 444 }, { "epoch": 0.045926230350354466, "grad_norm": 0.6245532631874084, "learning_rate": 0.0018364831552999178, "loss": 4.7408, "step": 447 }, { "epoch": 0.04623446008424946, "grad_norm": 1.1933598518371582, "learning_rate": 0.0018488085456039441, "loss": 4.7728, "step": 450 }, { "epoch": 0.04654268981814446, "grad_norm": 0.8743248581886292, "learning_rate": 0.0018611339359079703, "loss": 4.7595, "step": 453 }, { "epoch": 0.046850919552039454, "grad_norm": 0.4980567693710327, "learning_rate": 0.0018734593262119967, "loss": 4.7222, "step": 456 }, { "epoch": 0.04715914928593445, "grad_norm": 0.6380690932273865, "learning_rate": 0.001885784716516023, "loss": 4.7175, "step": 459 }, { "epoch": 0.04746737901982945, "grad_norm": 0.3606894612312317, "learning_rate": 0.0018981101068200493, "loss": 4.7075, "step": 462 }, { "epoch": 0.04777560875372444, "grad_norm": 0.5618919730186462, "learning_rate": 0.0019104354971240757, "loss": 4.6939, "step": 465 }, { "epoch": 0.04808383848761944, "grad_norm": 0.639410138130188, "learning_rate": 0.0019227608874281018, "loss": 4.6748, "step": 468 }, { "epoch": 0.048392068221514435, "grad_norm": 0.7849680185317993, "learning_rate": 0.0019350862777321282, "loss": 4.6895, "step": 471 }, { "epoch": 0.04870029795540943, "grad_norm": 0.5419800877571106, "learning_rate": 0.0019474116680361544, "loss": 4.64, "step": 474 }, { "epoch": 0.04900852768930443, "grad_norm": 0.40359726548194885, "learning_rate": 0.001959737058340181, "loss": 4.6564, "step": 477 }, { "epoch": 0.04931675742319942, "grad_norm": 0.742076575756073, "learning_rate": 0.001972062448644207, "loss": 4.6434, "step": 480 }, { "epoch": 0.04962498715709442, "grad_norm": 0.620801568031311, "learning_rate": 0.0019843878389482336, "loss": 4.6509, "step": 483 }, { "epoch": 0.04993321689098942, "grad_norm": 0.5293563008308411, "learning_rate": 0.0019967132292522598, "loss": 4.6459, "step": 486 }, { "epoch": 0.05024144662488442, "grad_norm": 0.7527710795402527, "learning_rate": 0.002009038619556286, "loss": 4.6557, "step": 489 }, { "epoch": 0.05054967635877941, "grad_norm": 0.47365424036979675, "learning_rate": 0.002021364009860312, "loss": 4.6223, "step": 492 }, { "epoch": 0.050857906092674404, "grad_norm": 0.5232967734336853, "learning_rate": 0.0020336894001643387, "loss": 4.6186, "step": 495 }, { "epoch": 0.051166135826569405, "grad_norm": 0.40717506408691406, "learning_rate": 0.002046014790468365, "loss": 4.6125, "step": 498 }, { "epoch": 0.0514743655604644, "grad_norm": 0.5403701066970825, "learning_rate": 0.002058340180772391, "loss": 4.6143, "step": 501 }, { "epoch": 0.0517825952943594, "grad_norm": 0.7209203839302063, "learning_rate": 0.0020706655710764172, "loss": 4.5713, "step": 504 }, { "epoch": 0.05209082502825439, "grad_norm": 0.6991008520126343, "learning_rate": 0.002082990961380444, "loss": 4.6044, "step": 507 }, { "epoch": 0.052399054762149386, "grad_norm": 0.7478086352348328, "learning_rate": 0.00209531635168447, "loss": 4.5685, "step": 510 }, { "epoch": 0.052707284496044386, "grad_norm": 0.5864932537078857, "learning_rate": 0.002107641741988496, "loss": 4.588, "step": 513 }, { "epoch": 0.05301551422993938, "grad_norm": 0.44748950004577637, "learning_rate": 0.002119967132292523, "loss": 4.5823, "step": 516 }, { "epoch": 0.05332374396383438, "grad_norm": 0.32787564396858215, "learning_rate": 0.002132292522596549, "loss": 4.5522, "step": 519 }, { "epoch": 0.053631973697729374, "grad_norm": 0.30747687816619873, "learning_rate": 0.002144617912900575, "loss": 4.5429, "step": 522 }, { "epoch": 0.05394020343162437, "grad_norm": 0.3548784554004669, "learning_rate": 0.0021569433032046013, "loss": 4.5207, "step": 525 }, { "epoch": 0.05424843316551937, "grad_norm": 0.6617491841316223, "learning_rate": 0.002169268693508628, "loss": 4.5373, "step": 528 }, { "epoch": 0.05455666289941436, "grad_norm": 0.9917429089546204, "learning_rate": 0.002181594083812654, "loss": 4.5504, "step": 531 }, { "epoch": 0.05486489263330936, "grad_norm": 0.6506537795066833, "learning_rate": 0.0021939194741166803, "loss": 4.5385, "step": 534 }, { "epoch": 0.055173122367204355, "grad_norm": 0.3738003075122833, "learning_rate": 0.002206244864420707, "loss": 4.5169, "step": 537 }, { "epoch": 0.055481352101099356, "grad_norm": 0.3488200008869171, "learning_rate": 0.002218570254724733, "loss": 4.5119, "step": 540 }, { "epoch": 0.05578958183499435, "grad_norm": 0.31217944622039795, "learning_rate": 0.0022308956450287592, "loss": 4.4796, "step": 543 }, { "epoch": 0.05609781156888934, "grad_norm": 0.26770153641700745, "learning_rate": 0.0022432210353327854, "loss": 4.4699, "step": 546 }, { "epoch": 0.05640604130278434, "grad_norm": 0.3656662702560425, "learning_rate": 0.002255546425636812, "loss": 4.4817, "step": 549 }, { "epoch": 0.05671427103667934, "grad_norm": 0.5845988392829895, "learning_rate": 0.002267871815940838, "loss": 4.4525, "step": 552 }, { "epoch": 0.05702250077057434, "grad_norm": 0.41006627678871155, "learning_rate": 0.0022801972062448644, "loss": 4.4649, "step": 555 }, { "epoch": 0.05733073050446933, "grad_norm": 1.2013694047927856, "learning_rate": 0.002292522596548891, "loss": 4.4666, "step": 558 }, { "epoch": 0.057638960238364324, "grad_norm": 0.6116489171981812, "learning_rate": 0.002304847986852917, "loss": 4.4853, "step": 561 }, { "epoch": 0.057947189972259325, "grad_norm": 0.30115845799446106, "learning_rate": 0.0023171733771569433, "loss": 4.4409, "step": 564 }, { "epoch": 0.05825541970615432, "grad_norm": 0.2863396108150482, "learning_rate": 0.0023294987674609695, "loss": 4.4358, "step": 567 }, { "epoch": 0.05856364944004932, "grad_norm": 0.3191300928592682, "learning_rate": 0.002341824157764996, "loss": 4.454, "step": 570 }, { "epoch": 0.05887187917394431, "grad_norm": 0.4280944764614105, "learning_rate": 0.0023541495480690223, "loss": 4.3943, "step": 573 }, { "epoch": 0.05918010890783931, "grad_norm": 0.49310484528541565, "learning_rate": 0.0023664749383730485, "loss": 4.4097, "step": 576 }, { "epoch": 0.059488338641734306, "grad_norm": 0.4923991858959198, "learning_rate": 0.002378800328677075, "loss": 4.4454, "step": 579 }, { "epoch": 0.0597965683756293, "grad_norm": 0.5043625235557556, "learning_rate": 0.0023911257189811012, "loss": 4.3975, "step": 582 }, { "epoch": 0.0601047981095243, "grad_norm": 0.5404270887374878, "learning_rate": 0.0024034511092851274, "loss": 4.3957, "step": 585 }, { "epoch": 0.060413027843419294, "grad_norm": 0.9954332709312439, "learning_rate": 0.0024157764995891536, "loss": 4.3864, "step": 588 }, { "epoch": 0.060721257577314294, "grad_norm": 0.3632584512233734, "learning_rate": 0.00242810188989318, "loss": 4.38, "step": 591 }, { "epoch": 0.06102948731120929, "grad_norm": 0.2620343267917633, "learning_rate": 0.0024404272801972064, "loss": 4.3538, "step": 594 }, { "epoch": 0.06133771704510428, "grad_norm": 0.25050923228263855, "learning_rate": 0.0024527526705012325, "loss": 4.351, "step": 597 }, { "epoch": 0.06164594677899928, "grad_norm": 0.27279627323150635, "learning_rate": 0.0024650780608052587, "loss": 4.3335, "step": 600 }, { "epoch": 0.061954176512894275, "grad_norm": 0.6038771271705627, "learning_rate": 0.0024774034511092853, "loss": 4.3409, "step": 603 }, { "epoch": 0.062262406246789276, "grad_norm": 0.6948337554931641, "learning_rate": 0.0024897288414133115, "loss": 4.3555, "step": 606 }, { "epoch": 0.06257063598068427, "grad_norm": 0.5086238980293274, "learning_rate": 0.0025020542317173377, "loss": 4.3491, "step": 609 }, { "epoch": 0.06287886571457926, "grad_norm": 0.475999116897583, "learning_rate": 0.002514379622021364, "loss": 4.3412, "step": 612 }, { "epoch": 0.06318709544847426, "grad_norm": 0.3968357741832733, "learning_rate": 0.0025267050123253905, "loss": 4.3139, "step": 615 }, { "epoch": 0.06349532518236926, "grad_norm": 0.6681760549545288, "learning_rate": 0.0025390304026294166, "loss": 4.2999, "step": 618 }, { "epoch": 0.06380355491626426, "grad_norm": 0.3453294634819031, "learning_rate": 0.002551355792933443, "loss": 4.2873, "step": 621 }, { "epoch": 0.06411178465015925, "grad_norm": 0.3346744775772095, "learning_rate": 0.002563681183237469, "loss": 4.2868, "step": 624 }, { "epoch": 0.06442001438405424, "grad_norm": 0.39689645171165466, "learning_rate": 0.0025760065735414956, "loss": 4.2846, "step": 627 }, { "epoch": 0.06472824411794924, "grad_norm": 0.4017212688922882, "learning_rate": 0.0025883319638455218, "loss": 4.2625, "step": 630 }, { "epoch": 0.06503647385184425, "grad_norm": 0.3414025902748108, "learning_rate": 0.0026006573541495484, "loss": 4.2657, "step": 633 }, { "epoch": 0.06534470358573924, "grad_norm": 0.4091610312461853, "learning_rate": 0.002612982744453574, "loss": 4.2414, "step": 636 }, { "epoch": 0.06565293331963423, "grad_norm": 0.3916926085948944, "learning_rate": 0.0026253081347576007, "loss": 4.1801, "step": 639 }, { "epoch": 0.06596116305352923, "grad_norm": 1.0324465036392212, "learning_rate": 0.002637633525061627, "loss": 4.2162, "step": 642 }, { "epoch": 0.06626939278742423, "grad_norm": 0.4595172107219696, "learning_rate": 0.0026499589153656535, "loss": 4.2352, "step": 645 }, { "epoch": 0.06657762252131923, "grad_norm": 0.3215947151184082, "learning_rate": 0.0026622843056696792, "loss": 4.1865, "step": 648 }, { "epoch": 0.06688585225521422, "grad_norm": 0.2739149034023285, "learning_rate": 0.002674609695973706, "loss": 4.1644, "step": 651 }, { "epoch": 0.06719408198910921, "grad_norm": 0.250794917345047, "learning_rate": 0.002686935086277732, "loss": 4.1164, "step": 654 }, { "epoch": 0.06750231172300421, "grad_norm": 0.38465654850006104, "learning_rate": 0.0026992604765817586, "loss": 4.0844, "step": 657 }, { "epoch": 0.06781054145689921, "grad_norm": 0.5341691970825195, "learning_rate": 0.0027115858668857844, "loss": 4.1149, "step": 660 }, { "epoch": 0.06811877119079421, "grad_norm": 0.3479110896587372, "learning_rate": 0.002723911257189811, "loss": 4.1186, "step": 663 }, { "epoch": 0.0684270009246892, "grad_norm": 1.026038646697998, "learning_rate": 0.002736236647493837, "loss": 4.1293, "step": 666 }, { "epoch": 0.0687352306585842, "grad_norm": 0.445689857006073, "learning_rate": 0.0027485620377978638, "loss": 4.1319, "step": 669 }, { "epoch": 0.06904346039247919, "grad_norm": 0.3061058819293976, "learning_rate": 0.00276088742810189, "loss": 4.0338, "step": 672 }, { "epoch": 0.0693516901263742, "grad_norm": 0.26792746782302856, "learning_rate": 0.002773212818405916, "loss": 4.0154, "step": 675 }, { "epoch": 0.06965991986026919, "grad_norm": 0.2843894064426422, "learning_rate": 0.0027855382087099423, "loss": 4.0364, "step": 678 }, { "epoch": 0.06996814959416418, "grad_norm": 0.3073459565639496, "learning_rate": 0.002797863599013969, "loss": 3.9836, "step": 681 }, { "epoch": 0.07027637932805918, "grad_norm": 0.5893545746803284, "learning_rate": 0.002810188989317995, "loss": 4.0062, "step": 684 }, { "epoch": 0.07058460906195417, "grad_norm": 0.5386547446250916, "learning_rate": 0.0028225143796220217, "loss": 4.0066, "step": 687 }, { "epoch": 0.07089283879584918, "grad_norm": 0.7944250106811523, "learning_rate": 0.0028348397699260474, "loss": 3.9816, "step": 690 }, { "epoch": 0.07120106852974417, "grad_norm": 0.32200196385383606, "learning_rate": 0.002847165160230074, "loss": 3.9551, "step": 693 }, { "epoch": 0.07150929826363916, "grad_norm": 0.28814995288848877, "learning_rate": 0.0028594905505341, "loss": 3.912, "step": 696 }, { "epoch": 0.07181752799753416, "grad_norm": 0.2727998197078705, "learning_rate": 0.002871815940838127, "loss": 3.9203, "step": 699 }, { "epoch": 0.07212575773142915, "grad_norm": 0.2785607576370239, "learning_rate": 0.0028841413311421525, "loss": 3.8865, "step": 702 }, { "epoch": 0.07243398746532416, "grad_norm": 0.4318368136882782, "learning_rate": 0.002896466721446179, "loss": 3.8364, "step": 705 }, { "epoch": 0.07274221719921915, "grad_norm": 0.5888954997062683, "learning_rate": 0.0029087921117502053, "loss": 3.8772, "step": 708 }, { "epoch": 0.07305044693311415, "grad_norm": 0.5866847634315491, "learning_rate": 0.002921117502054232, "loss": 3.9039, "step": 711 }, { "epoch": 0.07335867666700914, "grad_norm": 0.40300968289375305, "learning_rate": 0.002933442892358258, "loss": 3.8332, "step": 714 }, { "epoch": 0.07366690640090415, "grad_norm": 0.2894107401371002, "learning_rate": 0.0029457682826622843, "loss": 3.8533, "step": 717 }, { "epoch": 0.07397513613479914, "grad_norm": 0.2637479901313782, "learning_rate": 0.0029580936729663105, "loss": 3.7962, "step": 720 }, { "epoch": 0.07428336586869413, "grad_norm": 0.5004228353500366, "learning_rate": 0.002970419063270337, "loss": 3.7759, "step": 723 }, { "epoch": 0.07459159560258913, "grad_norm": 0.30835986137390137, "learning_rate": 0.0029827444535743632, "loss": 3.7819, "step": 726 }, { "epoch": 0.07489982533648412, "grad_norm": 0.5601911544799805, "learning_rate": 0.00299506984387839, "loss": 3.7716, "step": 729 }, { "epoch": 0.07520805507037913, "grad_norm": 0.48242396116256714, "learning_rate": 0.0030073952341824156, "loss": 3.803, "step": 732 }, { "epoch": 0.07551628480427412, "grad_norm": 0.355916827917099, "learning_rate": 0.003019720624486442, "loss": 3.7532, "step": 735 }, { "epoch": 0.07582451453816912, "grad_norm": 0.4205069839954376, "learning_rate": 0.0030320460147904684, "loss": 3.7657, "step": 738 }, { "epoch": 0.07613274427206411, "grad_norm": 0.35680562257766724, "learning_rate": 0.003044371405094495, "loss": 3.7348, "step": 741 }, { "epoch": 0.0764409740059591, "grad_norm": 0.36372673511505127, "learning_rate": 0.0030566967953985207, "loss": 3.7569, "step": 744 }, { "epoch": 0.07674920373985411, "grad_norm": 0.2887914776802063, "learning_rate": 0.0030690221857025473, "loss": 3.7704, "step": 747 }, { "epoch": 0.0770574334737491, "grad_norm": 0.255290687084198, "learning_rate": 0.0030813475760065735, "loss": 3.7054, "step": 750 }, { "epoch": 0.0773656632076441, "grad_norm": 0.2969897389411926, "learning_rate": 0.0030936729663106, "loss": 3.708, "step": 753 }, { "epoch": 0.07767389294153909, "grad_norm": 0.491763710975647, "learning_rate": 0.003105998356614626, "loss": 3.7279, "step": 756 }, { "epoch": 0.07798212267543408, "grad_norm": 0.6437285542488098, "learning_rate": 0.0031183237469186525, "loss": 3.7129, "step": 759 }, { "epoch": 0.07829035240932909, "grad_norm": 0.3605806827545166, "learning_rate": 0.0031306491372226786, "loss": 3.6754, "step": 762 }, { "epoch": 0.07859858214322409, "grad_norm": 0.26162126660346985, "learning_rate": 0.0031429745275267052, "loss": 3.6869, "step": 765 }, { "epoch": 0.07890681187711908, "grad_norm": 0.3107220530509949, "learning_rate": 0.0031552999178307314, "loss": 3.6278, "step": 768 }, { "epoch": 0.07921504161101407, "grad_norm": 0.30417200922966003, "learning_rate": 0.0031676253081347576, "loss": 3.6046, "step": 771 }, { "epoch": 0.07952327134490907, "grad_norm": 0.5612326860427856, "learning_rate": 0.0031799506984387838, "loss": 3.6578, "step": 774 }, { "epoch": 0.07983150107880407, "grad_norm": 0.6136355996131897, "learning_rate": 0.0031922760887428104, "loss": 3.6897, "step": 777 }, { "epoch": 0.08013973081269907, "grad_norm": 0.4560060501098633, "learning_rate": 0.0032046014790468366, "loss": 3.6976, "step": 780 }, { "epoch": 0.08044796054659406, "grad_norm": 0.23871034383773804, "learning_rate": 0.003216926869350863, "loss": 3.657, "step": 783 }, { "epoch": 0.08075619028048905, "grad_norm": 0.17063000798225403, "learning_rate": 0.003229252259654889, "loss": 3.5905, "step": 786 }, { "epoch": 0.08106442001438405, "grad_norm": 0.35351842641830444, "learning_rate": 0.0032415776499589155, "loss": 3.603, "step": 789 }, { "epoch": 0.08137264974827906, "grad_norm": 0.340762197971344, "learning_rate": 0.0032539030402629417, "loss": 3.5978, "step": 792 }, { "epoch": 0.08168087948217405, "grad_norm": 0.22542034089565277, "learning_rate": 0.0032662284305669683, "loss": 3.5821, "step": 795 }, { "epoch": 0.08198910921606904, "grad_norm": 0.25130555033683777, "learning_rate": 0.003278553820870994, "loss": 3.5491, "step": 798 }, { "epoch": 0.08229733894996404, "grad_norm": 0.5155714750289917, "learning_rate": 0.0032908792111750206, "loss": 3.5605, "step": 801 }, { "epoch": 0.08260556868385904, "grad_norm": 0.3964254856109619, "learning_rate": 0.003303204601479047, "loss": 3.581, "step": 804 }, { "epoch": 0.08291379841775404, "grad_norm": 0.27110666036605835, "learning_rate": 0.0033155299917830734, "loss": 3.5995, "step": 807 }, { "epoch": 0.08322202815164903, "grad_norm": 0.38535767793655396, "learning_rate": 0.003327855382087099, "loss": 3.6029, "step": 810 }, { "epoch": 0.08353025788554402, "grad_norm": 0.6176712512969971, "learning_rate": 0.0033401807723911258, "loss": 3.5694, "step": 813 }, { "epoch": 0.08383848761943902, "grad_norm": 0.33828550577163696, "learning_rate": 0.003352506162695152, "loss": 3.5507, "step": 816 }, { "epoch": 0.08414671735333402, "grad_norm": 0.2286808043718338, "learning_rate": 0.0033648315529991786, "loss": 3.5345, "step": 819 }, { "epoch": 0.08445494708722902, "grad_norm": 0.30232542753219604, "learning_rate": 0.0033771569433032047, "loss": 3.5154, "step": 822 }, { "epoch": 0.08476317682112401, "grad_norm": 0.31767842173576355, "learning_rate": 0.0033894823336072313, "loss": 3.5442, "step": 825 }, { "epoch": 0.085071406555019, "grad_norm": 0.4275444746017456, "learning_rate": 0.003401807723911257, "loss": 3.5715, "step": 828 }, { "epoch": 0.085379636288914, "grad_norm": 0.3426364064216614, "learning_rate": 0.0034141331142152837, "loss": 3.5224, "step": 831 }, { "epoch": 0.085687866022809, "grad_norm": 0.33871403336524963, "learning_rate": 0.00342645850451931, "loss": 3.5119, "step": 834 }, { "epoch": 0.085996095756704, "grad_norm": 0.2641143202781677, "learning_rate": 0.0034387838948233365, "loss": 3.5179, "step": 837 }, { "epoch": 0.086304325490599, "grad_norm": 0.22955679893493652, "learning_rate": 0.003451109285127362, "loss": 3.4807, "step": 840 }, { "epoch": 0.08661255522449399, "grad_norm": 0.3795819878578186, "learning_rate": 0.003463434675431389, "loss": 3.4916, "step": 843 }, { "epoch": 0.08692078495838898, "grad_norm": 0.2942325174808502, "learning_rate": 0.003475760065735415, "loss": 3.4601, "step": 846 }, { "epoch": 0.08722901469228399, "grad_norm": 0.49732574820518494, "learning_rate": 0.0034880854560394416, "loss": 3.5021, "step": 849 }, { "epoch": 0.08753724442617898, "grad_norm": 0.4395911991596222, "learning_rate": 0.0035004108463434673, "loss": 3.4976, "step": 852 }, { "epoch": 0.08784547416007397, "grad_norm": 0.24201816320419312, "learning_rate": 0.003512736236647494, "loss": 3.4648, "step": 855 }, { "epoch": 0.08815370389396897, "grad_norm": 0.32818078994750977, "learning_rate": 0.00352506162695152, "loss": 3.4826, "step": 858 }, { "epoch": 0.08846193362786396, "grad_norm": 0.4433400630950928, "learning_rate": 0.0035373870172555467, "loss": 3.4763, "step": 861 }, { "epoch": 0.08877016336175897, "grad_norm": 0.2911035716533661, "learning_rate": 0.003549712407559573, "loss": 3.5024, "step": 864 }, { "epoch": 0.08907839309565396, "grad_norm": 0.27419009804725647, "learning_rate": 0.003562037797863599, "loss": 3.4382, "step": 867 }, { "epoch": 0.08938662282954896, "grad_norm": 0.2970244586467743, "learning_rate": 0.0035743631881676253, "loss": 3.4362, "step": 870 }, { "epoch": 0.08969485256344395, "grad_norm": 0.34221401810646057, "learning_rate": 0.003586688578471652, "loss": 3.4469, "step": 873 }, { "epoch": 0.09000308229733894, "grad_norm": 0.31807199120521545, "learning_rate": 0.003599013968775678, "loss": 3.3974, "step": 876 }, { "epoch": 0.09031131203123395, "grad_norm": 0.31519362330436707, "learning_rate": 0.0036113393590797046, "loss": 3.4275, "step": 879 }, { "epoch": 0.09061954176512894, "grad_norm": 0.5152423977851868, "learning_rate": 0.0036236647493837304, "loss": 3.4468, "step": 882 }, { "epoch": 0.09092777149902394, "grad_norm": 0.32447418570518494, "learning_rate": 0.003635990139687757, "loss": 3.4505, "step": 885 }, { "epoch": 0.09123600123291893, "grad_norm": 0.19884614646434784, "learning_rate": 0.003648315529991783, "loss": 3.4228, "step": 888 }, { "epoch": 0.09154423096681394, "grad_norm": 0.2726935148239136, "learning_rate": 0.0036606409202958098, "loss": 3.3957, "step": 891 }, { "epoch": 0.09185246070070893, "grad_norm": 0.29470425844192505, "learning_rate": 0.0036729663105998355, "loss": 3.3813, "step": 894 }, { "epoch": 0.09216069043460393, "grad_norm": 0.27806392312049866, "learning_rate": 0.003685291700903862, "loss": 3.3871, "step": 897 }, { "epoch": 0.09246892016849892, "grad_norm": 0.23773950338363647, "learning_rate": 0.0036976170912078883, "loss": 3.3941, "step": 900 }, { "epoch": 0.09277714990239391, "grad_norm": 0.45804303884506226, "learning_rate": 0.003709942481511915, "loss": 3.3752, "step": 903 }, { "epoch": 0.09308537963628892, "grad_norm": 0.45320865511894226, "learning_rate": 0.0037222678718159406, "loss": 3.4068, "step": 906 }, { "epoch": 0.09339360937018391, "grad_norm": 0.277089387178421, "learning_rate": 0.0037345932621199673, "loss": 3.4052, "step": 909 }, { "epoch": 0.09370183910407891, "grad_norm": 0.26548513770103455, "learning_rate": 0.0037469186524239934, "loss": 3.3753, "step": 912 }, { "epoch": 0.0940100688379739, "grad_norm": 0.24219335615634918, "learning_rate": 0.00375924404272802, "loss": 3.3913, "step": 915 }, { "epoch": 0.0943182985718689, "grad_norm": 0.2855617105960846, "learning_rate": 0.003771569433032046, "loss": 3.3636, "step": 918 }, { "epoch": 0.0946265283057639, "grad_norm": 0.35244864225387573, "learning_rate": 0.003783894823336073, "loss": 3.3603, "step": 921 }, { "epoch": 0.0949347580396589, "grad_norm": 0.3226896822452545, "learning_rate": 0.0037962202136400986, "loss": 3.3267, "step": 924 }, { "epoch": 0.09524298777355389, "grad_norm": 0.279863178730011, "learning_rate": 0.003808545603944125, "loss": 3.3192, "step": 927 }, { "epoch": 0.09555121750744888, "grad_norm": 0.35309404134750366, "learning_rate": 0.0038208709942481513, "loss": 3.2978, "step": 930 }, { "epoch": 0.09585944724134388, "grad_norm": 0.2359645515680313, "learning_rate": 0.003833196384552178, "loss": 3.3627, "step": 933 }, { "epoch": 0.09616767697523888, "grad_norm": 0.22583429515361786, "learning_rate": 0.0038455217748562037, "loss": 3.2669, "step": 936 }, { "epoch": 0.09647590670913388, "grad_norm": 0.2914174199104309, "learning_rate": 0.0038578471651602303, "loss": 3.3238, "step": 939 }, { "epoch": 0.09678413644302887, "grad_norm": 0.37748411297798157, "learning_rate": 0.0038701725554642565, "loss": 3.3232, "step": 942 }, { "epoch": 0.09709236617692386, "grad_norm": 0.28686878085136414, "learning_rate": 0.003882497945768283, "loss": 3.3143, "step": 945 }, { "epoch": 0.09740059591081886, "grad_norm": 0.22591544687747955, "learning_rate": 0.003894823336072309, "loss": 3.3285, "step": 948 }, { "epoch": 0.09770882564471386, "grad_norm": 0.24365665018558502, "learning_rate": 0.003907148726376336, "loss": 3.2799, "step": 951 }, { "epoch": 0.09801705537860886, "grad_norm": 0.3929263651371002, "learning_rate": 0.003919474116680362, "loss": 3.29, "step": 954 }, { "epoch": 0.09832528511250385, "grad_norm": 0.20268237590789795, "learning_rate": 0.003931799506984388, "loss": 3.2412, "step": 957 }, { "epoch": 0.09863351484639885, "grad_norm": 0.3333010673522949, "learning_rate": 0.003944124897288414, "loss": 3.2523, "step": 960 }, { "epoch": 0.09894174458029384, "grad_norm": 0.32760193943977356, "learning_rate": 0.0039564502875924406, "loss": 3.2958, "step": 963 }, { "epoch": 0.09924997431418885, "grad_norm": 0.27670565247535706, "learning_rate": 0.003968775677896467, "loss": 3.2683, "step": 966 }, { "epoch": 0.09955820404808384, "grad_norm": 0.32110410928726196, "learning_rate": 0.003981101068200493, "loss": 3.2576, "step": 969 }, { "epoch": 0.09986643378197883, "grad_norm": 0.43541696667671204, "learning_rate": 0.0039934264585045195, "loss": 3.2924, "step": 972 }, { "epoch": 0.10017466351587383, "grad_norm": 0.3483084738254547, "learning_rate": 0.004005751848808546, "loss": 3.2936, "step": 975 }, { "epoch": 0.10048289324976883, "grad_norm": 0.29586124420166016, "learning_rate": 0.004018077239112572, "loss": 3.2511, "step": 978 }, { "epoch": 0.10079112298366383, "grad_norm": 0.21434040367603302, "learning_rate": 0.0040304026294165985, "loss": 3.242, "step": 981 }, { "epoch": 0.10109935271755882, "grad_norm": 0.35204213857650757, "learning_rate": 0.004042728019720624, "loss": 3.2156, "step": 984 }, { "epoch": 0.10140758245145381, "grad_norm": 0.25223758816719055, "learning_rate": 0.004055053410024651, "loss": 3.257, "step": 987 }, { "epoch": 0.10171581218534881, "grad_norm": 0.2969653010368347, "learning_rate": 0.004067378800328677, "loss": 3.2576, "step": 990 }, { "epoch": 0.10202404191924382, "grad_norm": 0.26683250069618225, "learning_rate": 0.004079704190632704, "loss": 3.1998, "step": 993 }, { "epoch": 0.10233227165313881, "grad_norm": 0.26404044032096863, "learning_rate": 0.00409202958093673, "loss": 3.2303, "step": 996 }, { "epoch": 0.1026405013870338, "grad_norm": 0.2442736029624939, "learning_rate": 0.004104354971240756, "loss": 3.2428, "step": 999 }, { "epoch": 0.1029487311209288, "grad_norm": 0.2192964255809784, "learning_rate": 0.004116680361544782, "loss": 3.2661, "step": 1002 }, { "epoch": 0.10325696085482379, "grad_norm": 0.21057608723640442, "learning_rate": 0.004129005751848809, "loss": 3.1995, "step": 1005 }, { "epoch": 0.1035651905887188, "grad_norm": 0.3122745454311371, "learning_rate": 0.0041413311421528345, "loss": 3.2104, "step": 1008 }, { "epoch": 0.10387342032261379, "grad_norm": 0.643337607383728, "learning_rate": 0.004153656532456861, "loss": 3.2196, "step": 1011 }, { "epoch": 0.10418165005650878, "grad_norm": 0.265302449464798, "learning_rate": 0.004165981922760888, "loss": 3.2163, "step": 1014 }, { "epoch": 0.10448987979040378, "grad_norm": 0.27250421047210693, "learning_rate": 0.004178307313064914, "loss": 3.1781, "step": 1017 }, { "epoch": 0.10479810952429877, "grad_norm": 0.3951704800128937, "learning_rate": 0.00419063270336894, "loss": 3.2405, "step": 1020 }, { "epoch": 0.10510633925819378, "grad_norm": 0.20837850868701935, "learning_rate": 0.004202958093672967, "loss": 3.2269, "step": 1023 }, { "epoch": 0.10541456899208877, "grad_norm": 0.3887670338153839, "learning_rate": 0.004215283483976992, "loss": 3.219, "step": 1026 }, { "epoch": 0.10572279872598377, "grad_norm": 0.18901754915714264, "learning_rate": 0.004227608874281019, "loss": 3.1759, "step": 1029 }, { "epoch": 0.10603102845987876, "grad_norm": 0.3570176362991333, "learning_rate": 0.004239934264585046, "loss": 3.1544, "step": 1032 }, { "epoch": 0.10633925819377375, "grad_norm": 0.2346538007259369, "learning_rate": 0.004252259654889072, "loss": 3.1834, "step": 1035 }, { "epoch": 0.10664748792766876, "grad_norm": 0.1956055760383606, "learning_rate": 0.004264585045193098, "loss": 3.1597, "step": 1038 }, { "epoch": 0.10695571766156375, "grad_norm": 0.19475719332695007, "learning_rate": 0.0042769104354971246, "loss": 3.1818, "step": 1041 }, { "epoch": 0.10726394739545875, "grad_norm": 0.20991206169128418, "learning_rate": 0.00428923582580115, "loss": 3.148, "step": 1044 }, { "epoch": 0.10757217712935374, "grad_norm": 0.45754027366638184, "learning_rate": 0.004301561216105177, "loss": 3.1838, "step": 1047 }, { "epoch": 0.10788040686324873, "grad_norm": 0.2500004470348358, "learning_rate": 0.004313886606409203, "loss": 3.158, "step": 1050 }, { "epoch": 0.10818863659714374, "grad_norm": 0.29174116253852844, "learning_rate": 0.004326211996713229, "loss": 3.1619, "step": 1053 }, { "epoch": 0.10849686633103874, "grad_norm": 0.1642913520336151, "learning_rate": 0.004338537387017256, "loss": 3.1313, "step": 1056 }, { "epoch": 0.10880509606493373, "grad_norm": 0.20638629794120789, "learning_rate": 0.004350862777321282, "loss": 3.1553, "step": 1059 }, { "epoch": 0.10911332579882872, "grad_norm": 0.2534577548503876, "learning_rate": 0.004363188167625308, "loss": 3.146, "step": 1062 }, { "epoch": 0.10942155553272373, "grad_norm": 0.3894107937812805, "learning_rate": 0.004375513557929334, "loss": 3.1702, "step": 1065 }, { "epoch": 0.10972978526661872, "grad_norm": 0.18316411972045898, "learning_rate": 0.0043878389482333606, "loss": 3.1306, "step": 1068 }, { "epoch": 0.11003801500051372, "grad_norm": 0.22901946306228638, "learning_rate": 0.004400164338537387, "loss": 3.1012, "step": 1071 }, { "epoch": 0.11034624473440871, "grad_norm": 0.3013692796230316, "learning_rate": 0.004412489728841414, "loss": 3.1266, "step": 1074 }, { "epoch": 0.1106544744683037, "grad_norm": 0.26568275690078735, "learning_rate": 0.0044248151191454395, "loss": 3.1161, "step": 1077 }, { "epoch": 0.11096270420219871, "grad_norm": 0.23559318482875824, "learning_rate": 0.004437140509449466, "loss": 3.125, "step": 1080 }, { "epoch": 0.1112709339360937, "grad_norm": 0.29804936051368713, "learning_rate": 0.004449465899753492, "loss": 3.1212, "step": 1083 }, { "epoch": 0.1115791636699887, "grad_norm": 0.2965604066848755, "learning_rate": 0.0044617912900575185, "loss": 3.1435, "step": 1086 }, { "epoch": 0.11188739340388369, "grad_norm": 0.22977206110954285, "learning_rate": 0.004474116680361544, "loss": 3.1355, "step": 1089 }, { "epoch": 0.11219562313777869, "grad_norm": 0.2511363923549652, "learning_rate": 0.004486442070665571, "loss": 3.1041, "step": 1092 }, { "epoch": 0.11250385287167369, "grad_norm": 0.13533104956150055, "learning_rate": 0.004498767460969597, "loss": 3.1006, "step": 1095 }, { "epoch": 0.11281208260556869, "grad_norm": 0.1323193609714508, "learning_rate": 0.004511092851273624, "loss": 3.0623, "step": 1098 }, { "epoch": 0.11312031233946368, "grad_norm": 0.24355067312717438, "learning_rate": 0.00452341824157765, "loss": 3.109, "step": 1101 }, { "epoch": 0.11342854207335867, "grad_norm": 0.45989617705345154, "learning_rate": 0.004535743631881676, "loss": 3.1102, "step": 1104 }, { "epoch": 0.11373677180725367, "grad_norm": 0.27389761805534363, "learning_rate": 0.004548069022185702, "loss": 3.1058, "step": 1107 }, { "epoch": 0.11404500154114867, "grad_norm": 0.3120715320110321, "learning_rate": 0.004560394412489729, "loss": 3.0936, "step": 1110 }, { "epoch": 0.11435323127504367, "grad_norm": 0.3641244173049927, "learning_rate": 0.004572719802793755, "loss": 3.0895, "step": 1113 }, { "epoch": 0.11466146100893866, "grad_norm": 0.16439078748226166, "learning_rate": 0.004585045193097782, "loss": 3.0697, "step": 1116 }, { "epoch": 0.11496969074283366, "grad_norm": 0.21766935288906097, "learning_rate": 0.004597370583401808, "loss": 3.0952, "step": 1119 }, { "epoch": 0.11527792047672865, "grad_norm": 0.1682632714509964, "learning_rate": 0.004609695973705834, "loss": 3.0644, "step": 1122 }, { "epoch": 0.11558615021062366, "grad_norm": 0.18391060829162598, "learning_rate": 0.00462202136400986, "loss": 3.0565, "step": 1125 }, { "epoch": 0.11589437994451865, "grad_norm": 0.2503467798233032, "learning_rate": 0.004634346754313887, "loss": 3.0798, "step": 1128 }, { "epoch": 0.11620260967841364, "grad_norm": 0.3139159083366394, "learning_rate": 0.004646672144617912, "loss": 3.0784, "step": 1131 }, { "epoch": 0.11651083941230864, "grad_norm": 0.2205217182636261, "learning_rate": 0.004658997534921939, "loss": 3.0696, "step": 1134 }, { "epoch": 0.11681906914620364, "grad_norm": 0.322355180978775, "learning_rate": 0.004671322925225966, "loss": 3.0811, "step": 1137 }, { "epoch": 0.11712729888009864, "grad_norm": 0.27023863792419434, "learning_rate": 0.004683648315529992, "loss": 3.0955, "step": 1140 }, { "epoch": 0.11743552861399363, "grad_norm": 0.2672137916088104, "learning_rate": 0.004695973705834018, "loss": 3.0584, "step": 1143 }, { "epoch": 0.11774375834788862, "grad_norm": 0.271323561668396, "learning_rate": 0.0047082990961380446, "loss": 3.0483, "step": 1146 }, { "epoch": 0.11805198808178362, "grad_norm": 0.1428508758544922, "learning_rate": 0.00472062448644207, "loss": 3.0661, "step": 1149 }, { "epoch": 0.11836021781567863, "grad_norm": 0.29395970702171326, "learning_rate": 0.004732949876746097, "loss": 3.0391, "step": 1152 }, { "epoch": 0.11866844754957362, "grad_norm": 0.22083403170108795, "learning_rate": 0.0047452752670501235, "loss": 3.0579, "step": 1155 }, { "epoch": 0.11897667728346861, "grad_norm": 0.2015424370765686, "learning_rate": 0.00475760065735415, "loss": 3.0356, "step": 1158 }, { "epoch": 0.1192849070173636, "grad_norm": 0.21997034549713135, "learning_rate": 0.004769926047658176, "loss": 3.0301, "step": 1161 }, { "epoch": 0.1195931367512586, "grad_norm": 0.16206422448158264, "learning_rate": 0.0047822514379622025, "loss": 3.0407, "step": 1164 }, { "epoch": 0.11990136648515361, "grad_norm": 0.22591377794742584, "learning_rate": 0.004794576828266228, "loss": 3.0414, "step": 1167 }, { "epoch": 0.1202095962190486, "grad_norm": 0.2582632601261139, "learning_rate": 0.004806902218570255, "loss": 3.0148, "step": 1170 }, { "epoch": 0.1205178259529436, "grad_norm": 0.273416131734848, "learning_rate": 0.004819227608874281, "loss": 3.0023, "step": 1173 }, { "epoch": 0.12082605568683859, "grad_norm": 0.16373753547668457, "learning_rate": 0.004831552999178307, "loss": 3.0127, "step": 1176 }, { "epoch": 0.12113428542073358, "grad_norm": 0.2623594105243683, "learning_rate": 0.004843878389482334, "loss": 3.0635, "step": 1179 }, { "epoch": 0.12144251515462859, "grad_norm": 0.34809616208076477, "learning_rate": 0.00485620377978636, "loss": 3.0222, "step": 1182 }, { "epoch": 0.12175074488852358, "grad_norm": 0.23841938376426697, "learning_rate": 0.004868529170090386, "loss": 3.019, "step": 1185 }, { "epoch": 0.12205897462241858, "grad_norm": 0.2161986231803894, "learning_rate": 0.004880854560394413, "loss": 2.9934, "step": 1188 }, { "epoch": 0.12236720435631357, "grad_norm": 0.2870507836341858, "learning_rate": 0.0048931799506984385, "loss": 3.0438, "step": 1191 }, { "epoch": 0.12267543409020856, "grad_norm": 0.20796675980091095, "learning_rate": 0.004905505341002465, "loss": 2.9947, "step": 1194 }, { "epoch": 0.12298366382410357, "grad_norm": 0.1762983798980713, "learning_rate": 0.004917830731306492, "loss": 2.9729, "step": 1197 }, { "epoch": 0.12329189355799856, "grad_norm": 0.1240881159901619, "learning_rate": 0.0049301561216105174, "loss": 3.0149, "step": 1200 }, { "epoch": 0.12360012329189356, "grad_norm": 0.16968263685703278, "learning_rate": 0.004942481511914544, "loss": 2.9944, "step": 1203 }, { "epoch": 0.12390835302578855, "grad_norm": 0.1743592470884323, "learning_rate": 0.004954806902218571, "loss": 2.9947, "step": 1206 }, { "epoch": 0.12421658275968354, "grad_norm": 0.29677319526672363, "learning_rate": 0.004967132292522596, "loss": 2.9922, "step": 1209 }, { "epoch": 0.12452481249357855, "grad_norm": 0.273882657289505, "learning_rate": 0.004979457682826623, "loss": 2.9698, "step": 1212 }, { "epoch": 0.12483304222747355, "grad_norm": 0.3060019910335541, "learning_rate": 0.004991783073130649, "loss": 2.9925, "step": 1215 }, { "epoch": 0.12514127196136854, "grad_norm": 0.13856515288352966, "learning_rate": 0.005004108463434675, "loss": 3.0212, "step": 1218 }, { "epoch": 0.12544950169526353, "grad_norm": 0.12940354645252228, "learning_rate": 0.005016433853738702, "loss": 2.9472, "step": 1221 }, { "epoch": 0.12575773142915853, "grad_norm": 0.15493866801261902, "learning_rate": 0.005028759244042728, "loss": 2.9859, "step": 1224 }, { "epoch": 0.12606596116305352, "grad_norm": 0.4994816184043884, "learning_rate": 0.005041084634346754, "loss": 2.949, "step": 1227 }, { "epoch": 0.1263741908969485, "grad_norm": 0.37235137820243835, "learning_rate": 0.005053410024650781, "loss": 3.006, "step": 1230 }, { "epoch": 0.12668242063084353, "grad_norm": 0.24599948525428772, "learning_rate": 0.0050657354149548075, "loss": 2.9954, "step": 1233 }, { "epoch": 0.12699065036473853, "grad_norm": 0.1838703751564026, "learning_rate": 0.005078060805258833, "loss": 2.9886, "step": 1236 }, { "epoch": 0.12729888009863352, "grad_norm": 0.19366377592086792, "learning_rate": 0.005090386195562859, "loss": 2.9715, "step": 1239 }, { "epoch": 0.12760710983252851, "grad_norm": 0.11911759525537491, "learning_rate": 0.005102711585866886, "loss": 2.965, "step": 1242 }, { "epoch": 0.1279153395664235, "grad_norm": 0.12456653267145157, "learning_rate": 0.005115036976170912, "loss": 2.9343, "step": 1245 }, { "epoch": 0.1282235693003185, "grad_norm": 0.322380393743515, "learning_rate": 0.005127362366474938, "loss": 2.9604, "step": 1248 }, { "epoch": 0.1285317990342135, "grad_norm": 0.40975773334503174, "learning_rate": 0.005139687756778965, "loss": 2.9386, "step": 1251 }, { "epoch": 0.1288400287681085, "grad_norm": 0.2045045793056488, "learning_rate": 0.005152013147082991, "loss": 2.9459, "step": 1254 }, { "epoch": 0.12914825850200348, "grad_norm": 0.20005717873573303, "learning_rate": 0.005164338537387018, "loss": 2.9631, "step": 1257 }, { "epoch": 0.12945648823589848, "grad_norm": 0.18930204212665558, "learning_rate": 0.0051766639276910435, "loss": 2.9014, "step": 1260 }, { "epoch": 0.1297647179697935, "grad_norm": 0.3180810213088989, "learning_rate": 0.00518898931799507, "loss": 2.9242, "step": 1263 }, { "epoch": 0.1300729477036885, "grad_norm": 0.17843572795391083, "learning_rate": 0.005201314708299097, "loss": 2.9063, "step": 1266 }, { "epoch": 0.13038117743758348, "grad_norm": 0.12591248750686646, "learning_rate": 0.005213640098603123, "loss": 2.9095, "step": 1269 }, { "epoch": 0.13068940717147848, "grad_norm": 0.17976878583431244, "learning_rate": 0.005225965488907148, "loss": 2.928, "step": 1272 }, { "epoch": 0.13099763690537347, "grad_norm": 0.16759532690048218, "learning_rate": 0.005238290879211175, "loss": 2.9202, "step": 1275 }, { "epoch": 0.13130586663926846, "grad_norm": 0.27441859245300293, "learning_rate": 0.0052506162695152014, "loss": 2.9242, "step": 1278 }, { "epoch": 0.13161409637316346, "grad_norm": 0.23654502630233765, "learning_rate": 0.005262941659819228, "loss": 2.9175, "step": 1281 }, { "epoch": 0.13192232610705845, "grad_norm": 0.3399145007133484, "learning_rate": 0.005275267050123254, "loss": 2.9277, "step": 1284 }, { "epoch": 0.13223055584095345, "grad_norm": 0.199320450425148, "learning_rate": 0.00528759244042728, "loss": 2.9184, "step": 1287 }, { "epoch": 0.13253878557484847, "grad_norm": 0.16563403606414795, "learning_rate": 0.005299917830731307, "loss": 2.9166, "step": 1290 }, { "epoch": 0.13284701530874346, "grad_norm": 0.18119758367538452, "learning_rate": 0.005312243221035334, "loss": 2.9239, "step": 1293 }, { "epoch": 0.13315524504263845, "grad_norm": 0.1558375358581543, "learning_rate": 0.0053245686113393585, "loss": 2.9028, "step": 1296 }, { "epoch": 0.13346347477653345, "grad_norm": 0.36665746569633484, "learning_rate": 0.005336894001643385, "loss": 2.9081, "step": 1299 }, { "epoch": 0.13377170451042844, "grad_norm": 0.186012864112854, "learning_rate": 0.005349219391947412, "loss": 2.8836, "step": 1302 }, { "epoch": 0.13407993424432343, "grad_norm": 0.14102259278297424, "learning_rate": 0.005361544782251438, "loss": 2.8906, "step": 1305 }, { "epoch": 0.13438816397821843, "grad_norm": 0.12519022822380066, "learning_rate": 0.005373870172555464, "loss": 2.9148, "step": 1308 }, { "epoch": 0.13469639371211342, "grad_norm": 0.14027029275894165, "learning_rate": 0.005386195562859491, "loss": 2.9108, "step": 1311 }, { "epoch": 0.13500462344600841, "grad_norm": 0.2553085684776306, "learning_rate": 0.005398520953163517, "loss": 2.8837, "step": 1314 }, { "epoch": 0.1353128531799034, "grad_norm": 0.2809675335884094, "learning_rate": 0.005410846343467544, "loss": 2.8795, "step": 1317 }, { "epoch": 0.13562108291379843, "grad_norm": 0.19451378285884857, "learning_rate": 0.005423171733771569, "loss": 2.8648, "step": 1320 }, { "epoch": 0.13592931264769342, "grad_norm": 0.22285006940364838, "learning_rate": 0.005435497124075595, "loss": 2.8994, "step": 1323 }, { "epoch": 0.13623754238158842, "grad_norm": 0.14703693985939026, "learning_rate": 0.005447822514379622, "loss": 2.8984, "step": 1326 }, { "epoch": 0.1365457721154834, "grad_norm": 0.23260341584682465, "learning_rate": 0.005460147904683649, "loss": 2.863, "step": 1329 }, { "epoch": 0.1368540018493784, "grad_norm": 0.16448146104812622, "learning_rate": 0.005472473294987674, "loss": 2.8895, "step": 1332 }, { "epoch": 0.1371622315832734, "grad_norm": 0.1994483470916748, "learning_rate": 0.005484798685291701, "loss": 2.9012, "step": 1335 }, { "epoch": 0.1374704613171684, "grad_norm": 0.2786753177642822, "learning_rate": 0.0054971240755957275, "loss": 2.8753, "step": 1338 }, { "epoch": 0.13777869105106338, "grad_norm": 0.13169367611408234, "learning_rate": 0.005509449465899754, "loss": 2.8567, "step": 1341 }, { "epoch": 0.13808692078495838, "grad_norm": 0.21205192804336548, "learning_rate": 0.00552177485620378, "loss": 2.8523, "step": 1344 }, { "epoch": 0.1383951505188534, "grad_norm": 0.3462331295013428, "learning_rate": 0.0055341002465078065, "loss": 2.881, "step": 1347 }, { "epoch": 0.1387033802527484, "grad_norm": 0.26768332719802856, "learning_rate": 0.005546425636811832, "loss": 2.8803, "step": 1350 }, { "epoch": 0.1390116099866434, "grad_norm": 0.22518084943294525, "learning_rate": 0.005558751027115859, "loss": 2.874, "step": 1353 }, { "epoch": 0.13931983972053838, "grad_norm": 0.1767919361591339, "learning_rate": 0.005571076417419885, "loss": 2.8593, "step": 1356 }, { "epoch": 0.13962806945443337, "grad_norm": 0.14405187964439392, "learning_rate": 0.005583401807723911, "loss": 2.8576, "step": 1359 }, { "epoch": 0.13993629918832837, "grad_norm": 0.15364724397659302, "learning_rate": 0.005595727198027938, "loss": 2.856, "step": 1362 }, { "epoch": 0.14024452892222336, "grad_norm": 0.26737314462661743, "learning_rate": 0.005608052588331964, "loss": 2.8225, "step": 1365 }, { "epoch": 0.14055275865611835, "grad_norm": 0.14594382047653198, "learning_rate": 0.00562037797863599, "loss": 2.8397, "step": 1368 }, { "epoch": 0.14086098839001335, "grad_norm": 0.1974790245294571, "learning_rate": 0.005632703368940017, "loss": 2.8294, "step": 1371 }, { "epoch": 0.14116921812390834, "grad_norm": 0.12267682701349258, "learning_rate": 0.005645028759244043, "loss": 2.8543, "step": 1374 }, { "epoch": 0.14147744785780336, "grad_norm": 0.14111129939556122, "learning_rate": 0.00565735414954807, "loss": 2.8181, "step": 1377 }, { "epoch": 0.14178567759169836, "grad_norm": 0.1846015751361847, "learning_rate": 0.005669679539852095, "loss": 2.8272, "step": 1380 }, { "epoch": 0.14209390732559335, "grad_norm": 0.26931676268577576, "learning_rate": 0.0056820049301561214, "loss": 2.8286, "step": 1383 }, { "epoch": 0.14240213705948834, "grad_norm": 0.17969557642936707, "learning_rate": 0.005694330320460148, "loss": 2.8315, "step": 1386 }, { "epoch": 0.14271036679338334, "grad_norm": 0.2056432068347931, "learning_rate": 0.005706655710764175, "loss": 2.835, "step": 1389 }, { "epoch": 0.14301859652727833, "grad_norm": 0.29306477308273315, "learning_rate": 0.0057189811010682, "loss": 2.8294, "step": 1392 }, { "epoch": 0.14332682626117332, "grad_norm": 0.1792561262845993, "learning_rate": 0.005731306491372227, "loss": 2.8321, "step": 1395 }, { "epoch": 0.14363505599506832, "grad_norm": 0.11323501914739609, "learning_rate": 0.005743631881676254, "loss": 2.83, "step": 1398 }, { "epoch": 0.1439432857289633, "grad_norm": 0.2804841101169586, "learning_rate": 0.00575595727198028, "loss": 2.8271, "step": 1401 }, { "epoch": 0.1442515154628583, "grad_norm": 0.33056163787841797, "learning_rate": 0.005768282662284305, "loss": 2.7976, "step": 1404 }, { "epoch": 0.14455974519675333, "grad_norm": 0.12834665179252625, "learning_rate": 0.005780608052588332, "loss": 2.8169, "step": 1407 }, { "epoch": 0.14486797493064832, "grad_norm": 0.15917035937309265, "learning_rate": 0.005792933442892358, "loss": 2.8124, "step": 1410 }, { "epoch": 0.1451762046645433, "grad_norm": 0.28015008568763733, "learning_rate": 0.005805258833196385, "loss": 2.8019, "step": 1413 }, { "epoch": 0.1454844343984383, "grad_norm": 0.16829009354114532, "learning_rate": 0.005817584223500411, "loss": 2.8357, "step": 1416 }, { "epoch": 0.1457926641323333, "grad_norm": 0.14804339408874512, "learning_rate": 0.005829909613804437, "loss": 2.8102, "step": 1419 }, { "epoch": 0.1461008938662283, "grad_norm": 0.20360830426216125, "learning_rate": 0.005842235004108464, "loss": 2.8211, "step": 1422 }, { "epoch": 0.1464091236001233, "grad_norm": 0.22152036428451538, "learning_rate": 0.0058545603944124905, "loss": 2.8103, "step": 1425 }, { "epoch": 0.14671735333401828, "grad_norm": 0.20746375620365143, "learning_rate": 0.005866885784716516, "loss": 2.7994, "step": 1428 }, { "epoch": 0.14702558306791327, "grad_norm": 0.16845661401748657, "learning_rate": 0.005879211175020542, "loss": 2.8286, "step": 1431 }, { "epoch": 0.1473338128018083, "grad_norm": 0.1094370111823082, "learning_rate": 0.005891536565324569, "loss": 2.7888, "step": 1434 }, { "epoch": 0.1476420425357033, "grad_norm": 0.14844520390033722, "learning_rate": 0.005903861955628595, "loss": 2.8035, "step": 1437 }, { "epoch": 0.14795027226959828, "grad_norm": 0.12289691716432571, "learning_rate": 0.005916187345932621, "loss": 2.7852, "step": 1440 }, { "epoch": 0.14825850200349328, "grad_norm": 0.1203322485089302, "learning_rate": 0.0059285127362366475, "loss": 2.8101, "step": 1443 }, { "epoch": 0.14856673173738827, "grad_norm": 0.1871965080499649, "learning_rate": 0.005940838126540674, "loss": 2.7485, "step": 1446 }, { "epoch": 0.14887496147128326, "grad_norm": 0.1567300707101822, "learning_rate": 0.005953163516844701, "loss": 2.8097, "step": 1449 }, { "epoch": 0.14918319120517826, "grad_norm": 0.18046674132347107, "learning_rate": 0.0059654889071487265, "loss": 2.8118, "step": 1452 }, { "epoch": 0.14949142093907325, "grad_norm": 0.23180244863033295, "learning_rate": 0.005977814297452753, "loss": 2.7836, "step": 1455 }, { "epoch": 0.14979965067296824, "grad_norm": 0.2300175577402115, "learning_rate": 0.00599013968775678, "loss": 2.7675, "step": 1458 }, { "epoch": 0.15010788040686324, "grad_norm": 0.11340396106243134, "learning_rate": 0.006002465078060806, "loss": 2.8012, "step": 1461 }, { "epoch": 0.15041611014075826, "grad_norm": 0.10667074471712112, "learning_rate": 0.006014790468364831, "loss": 2.8154, "step": 1464 }, { "epoch": 0.15072433987465325, "grad_norm": 0.10800652205944061, "learning_rate": 0.006027115858668858, "loss": 2.7646, "step": 1467 }, { "epoch": 0.15103256960854824, "grad_norm": 0.2588643431663513, "learning_rate": 0.006039441248972884, "loss": 2.7912, "step": 1470 }, { "epoch": 0.15134079934244324, "grad_norm": 0.32462435960769653, "learning_rate": 0.006051766639276911, "loss": 2.7666, "step": 1473 }, { "epoch": 0.15164902907633823, "grad_norm": 0.23754975199699402, "learning_rate": 0.006064092029580937, "loss": 2.7694, "step": 1476 }, { "epoch": 0.15195725881023323, "grad_norm": 0.14895015954971313, "learning_rate": 0.006076417419884963, "loss": 2.7678, "step": 1479 }, { "epoch": 0.15226548854412822, "grad_norm": 0.3228299021720886, "learning_rate": 0.00608874281018899, "loss": 2.7786, "step": 1482 }, { "epoch": 0.1525737182780232, "grad_norm": 0.15597562491893768, "learning_rate": 0.006101068200493017, "loss": 2.7967, "step": 1485 }, { "epoch": 0.1528819480119182, "grad_norm": 0.09748488664627075, "learning_rate": 0.0061133935907970414, "loss": 2.7673, "step": 1488 }, { "epoch": 0.1531901777458132, "grad_norm": 0.12523339688777924, "learning_rate": 0.006125718981101068, "loss": 2.7391, "step": 1491 }, { "epoch": 0.15349840747970822, "grad_norm": 0.16529253125190735, "learning_rate": 0.006138044371405095, "loss": 2.7642, "step": 1494 }, { "epoch": 0.15380663721360321, "grad_norm": 0.2083311527967453, "learning_rate": 0.006150369761709121, "loss": 2.764, "step": 1497 }, { "epoch": 0.1541148669474982, "grad_norm": 0.13263079524040222, "learning_rate": 0.006162695152013147, "loss": 2.7828, "step": 1500 }, { "epoch": 0.1544230966813932, "grad_norm": 0.1473417580127716, "learning_rate": 0.006175020542317174, "loss": 2.7574, "step": 1503 }, { "epoch": 0.1547313264152882, "grad_norm": 0.22629734873771667, "learning_rate": 0.0061873459326212, "loss": 2.7792, "step": 1506 }, { "epoch": 0.1550395561491832, "grad_norm": 0.21652548015117645, "learning_rate": 0.006199671322925227, "loss": 2.7785, "step": 1509 }, { "epoch": 0.15534778588307818, "grad_norm": 0.1948641836643219, "learning_rate": 0.006211996713229252, "loss": 2.7969, "step": 1512 }, { "epoch": 0.15565601561697318, "grad_norm": 0.13890105485916138, "learning_rate": 0.006224322103533278, "loss": 2.7856, "step": 1515 }, { "epoch": 0.15596424535086817, "grad_norm": 0.09859870374202728, "learning_rate": 0.006236647493837305, "loss": 2.7523, "step": 1518 }, { "epoch": 0.1562724750847632, "grad_norm": 0.10258977860212326, "learning_rate": 0.0062489728841413315, "loss": 2.7466, "step": 1521 }, { "epoch": 0.15658070481865818, "grad_norm": 0.11476584523916245, "learning_rate": 0.006261298274445357, "loss": 2.7314, "step": 1524 }, { "epoch": 0.15688893455255318, "grad_norm": 0.1920320987701416, "learning_rate": 0.006273623664749384, "loss": 2.7647, "step": 1527 }, { "epoch": 0.15719716428644817, "grad_norm": 0.18576020002365112, "learning_rate": 0.0062859490550534105, "loss": 2.7632, "step": 1530 }, { "epoch": 0.15750539402034316, "grad_norm": 0.128046452999115, "learning_rate": 0.006298274445357437, "loss": 2.7237, "step": 1533 }, { "epoch": 0.15781362375423816, "grad_norm": 0.30617430806159973, "learning_rate": 0.006310599835661463, "loss": 2.7907, "step": 1536 }, { "epoch": 0.15812185348813315, "grad_norm": 0.140928253531456, "learning_rate": 0.0063229252259654894, "loss": 2.7879, "step": 1539 }, { "epoch": 0.15843008322202815, "grad_norm": 0.2537645399570465, "learning_rate": 0.006335250616269515, "loss": 2.7513, "step": 1542 }, { "epoch": 0.15873831295592314, "grad_norm": 0.40944191813468933, "learning_rate": 0.006347576006573542, "loss": 2.7418, "step": 1545 }, { "epoch": 0.15904654268981813, "grad_norm": 0.1284068077802658, "learning_rate": 0.0063599013968775675, "loss": 2.7235, "step": 1548 }, { "epoch": 0.15935477242371315, "grad_norm": 0.08984164893627167, "learning_rate": 0.006372226787181594, "loss": 2.7414, "step": 1551 }, { "epoch": 0.15966300215760815, "grad_norm": 0.13366155326366425, "learning_rate": 0.006384552177485621, "loss": 2.7456, "step": 1554 }, { "epoch": 0.15997123189150314, "grad_norm": 0.1179983913898468, "learning_rate": 0.006396877567789647, "loss": 2.7313, "step": 1557 }, { "epoch": 0.16027946162539813, "grad_norm": 0.15718503296375275, "learning_rate": 0.006409202958093673, "loss": 2.7315, "step": 1560 }, { "epoch": 0.16058769135929313, "grad_norm": 0.14405110478401184, "learning_rate": 0.0064215283483977, "loss": 2.7275, "step": 1563 }, { "epoch": 0.16089592109318812, "grad_norm": 0.13050544261932373, "learning_rate": 0.006433853738701726, "loss": 2.6935, "step": 1566 }, { "epoch": 0.16120415082708311, "grad_norm": 0.2343079298734665, "learning_rate": 0.006446179129005751, "loss": 2.6932, "step": 1569 }, { "epoch": 0.1615123805609781, "grad_norm": 0.2493698000907898, "learning_rate": 0.006458504519309778, "loss": 2.7414, "step": 1572 }, { "epoch": 0.1618206102948731, "grad_norm": 0.17371931672096252, "learning_rate": 0.006470829909613804, "loss": 2.7522, "step": 1575 }, { "epoch": 0.1621288400287681, "grad_norm": 0.16282691061496735, "learning_rate": 0.006483155299917831, "loss": 2.7659, "step": 1578 }, { "epoch": 0.16243706976266312, "grad_norm": 0.12791027128696442, "learning_rate": 0.006495480690221857, "loss": 2.7077, "step": 1581 }, { "epoch": 0.1627452994965581, "grad_norm": 0.09789251536130905, "learning_rate": 0.006507806080525883, "loss": 2.7041, "step": 1584 }, { "epoch": 0.1630535292304531, "grad_norm": 0.10156393051147461, "learning_rate": 0.00652013147082991, "loss": 2.685, "step": 1587 }, { "epoch": 0.1633617589643481, "grad_norm": 0.1974211484193802, "learning_rate": 0.006532456861133937, "loss": 2.7183, "step": 1590 }, { "epoch": 0.1636699886982431, "grad_norm": 0.1420728713274002, "learning_rate": 0.0065447822514379615, "loss": 2.7095, "step": 1593 }, { "epoch": 0.16397821843213808, "grad_norm": 0.3637617528438568, "learning_rate": 0.006557107641741988, "loss": 2.7578, "step": 1596 }, { "epoch": 0.16428644816603308, "grad_norm": 0.09830935299396515, "learning_rate": 0.006569433032046015, "loss": 2.6937, "step": 1599 }, { "epoch": 0.16459467789992807, "grad_norm": 0.15821218490600586, "learning_rate": 0.006581758422350041, "loss": 2.7031, "step": 1602 }, { "epoch": 0.16490290763382306, "grad_norm": 0.17226357758045197, "learning_rate": 0.006594083812654067, "loss": 2.6702, "step": 1605 }, { "epoch": 0.16521113736771809, "grad_norm": 0.21252015233039856, "learning_rate": 0.006606409202958094, "loss": 2.6893, "step": 1608 }, { "epoch": 0.16551936710161308, "grad_norm": 0.11433108150959015, "learning_rate": 0.00661873459326212, "loss": 2.6852, "step": 1611 }, { "epoch": 0.16582759683550807, "grad_norm": 0.15884144604206085, "learning_rate": 0.006631059983566147, "loss": 2.7164, "step": 1614 }, { "epoch": 0.16613582656940307, "grad_norm": 0.1429038643836975, "learning_rate": 0.006643385373870173, "loss": 2.6976, "step": 1617 }, { "epoch": 0.16644405630329806, "grad_norm": 0.09187953919172287, "learning_rate": 0.006655710764174198, "loss": 2.7134, "step": 1620 }, { "epoch": 0.16675228603719305, "grad_norm": 0.13670755922794342, "learning_rate": 0.006668036154478225, "loss": 2.6951, "step": 1623 }, { "epoch": 0.16706051577108805, "grad_norm": 0.17965632677078247, "learning_rate": 0.0066803615447822515, "loss": 2.6911, "step": 1626 }, { "epoch": 0.16736874550498304, "grad_norm": 0.21141032874584198, "learning_rate": 0.006692686935086277, "loss": 2.67, "step": 1629 }, { "epoch": 0.16767697523887803, "grad_norm": 0.30064719915390015, "learning_rate": 0.006705012325390304, "loss": 2.6837, "step": 1632 }, { "epoch": 0.16798520497277303, "grad_norm": 0.11874115467071533, "learning_rate": 0.0067173377156943305, "loss": 2.6968, "step": 1635 }, { "epoch": 0.16829343470666805, "grad_norm": 0.10265806317329407, "learning_rate": 0.006729663105998357, "loss": 2.6632, "step": 1638 }, { "epoch": 0.16860166444056304, "grad_norm": 0.10916320979595184, "learning_rate": 0.006741988496302383, "loss": 2.6749, "step": 1641 }, { "epoch": 0.16890989417445804, "grad_norm": 0.2549231946468353, "learning_rate": 0.0067543138866064095, "loss": 2.636, "step": 1644 }, { "epoch": 0.16921812390835303, "grad_norm": 0.15071339905261993, "learning_rate": 0.006766639276910436, "loss": 2.6933, "step": 1647 }, { "epoch": 0.16952635364224802, "grad_norm": 0.1088666021823883, "learning_rate": 0.006778964667214463, "loss": 2.6477, "step": 1650 }, { "epoch": 0.16983458337614302, "grad_norm": 0.0984036773443222, "learning_rate": 0.0067912900575184875, "loss": 2.6801, "step": 1653 }, { "epoch": 0.170142813110038, "grad_norm": 0.15402089059352875, "learning_rate": 0.006803615447822514, "loss": 2.6877, "step": 1656 }, { "epoch": 0.170451042843933, "grad_norm": 0.1299775093793869, "learning_rate": 0.006815940838126541, "loss": 2.6717, "step": 1659 }, { "epoch": 0.170759272577828, "grad_norm": 0.15615323185920715, "learning_rate": 0.006828266228430567, "loss": 2.6578, "step": 1662 }, { "epoch": 0.171067502311723, "grad_norm": 0.122567318379879, "learning_rate": 0.006840591618734593, "loss": 2.6959, "step": 1665 }, { "epoch": 0.171375732045618, "grad_norm": 0.1386043280363083, "learning_rate": 0.00685291700903862, "loss": 2.6491, "step": 1668 }, { "epoch": 0.171683961779513, "grad_norm": 0.1900375783443451, "learning_rate": 0.006865242399342646, "loss": 2.6643, "step": 1671 }, { "epoch": 0.171992191513408, "grad_norm": 0.1118064671754837, "learning_rate": 0.006877567789646673, "loss": 2.6496, "step": 1674 }, { "epoch": 0.172300421247303, "grad_norm": 0.1593448519706726, "learning_rate": 0.006889893179950698, "loss": 2.6833, "step": 1677 }, { "epoch": 0.172608650981198, "grad_norm": 0.17275281250476837, "learning_rate": 0.006902218570254724, "loss": 2.6909, "step": 1680 }, { "epoch": 0.17291688071509298, "grad_norm": 0.13396479189395905, "learning_rate": 0.006914543960558751, "loss": 2.692, "step": 1683 }, { "epoch": 0.17322511044898797, "grad_norm": 0.09812068939208984, "learning_rate": 0.006926869350862778, "loss": 2.6939, "step": 1686 }, { "epoch": 0.17353334018288297, "grad_norm": 0.08181022852659225, "learning_rate": 0.006939194741166803, "loss": 2.6408, "step": 1689 }, { "epoch": 0.17384156991677796, "grad_norm": 0.15573051571846008, "learning_rate": 0.00695152013147083, "loss": 2.6647, "step": 1692 }, { "epoch": 0.17414979965067298, "grad_norm": 0.2834240198135376, "learning_rate": 0.006963845521774857, "loss": 2.6585, "step": 1695 }, { "epoch": 0.17445802938456798, "grad_norm": 0.23794801533222198, "learning_rate": 0.006976170912078883, "loss": 2.6559, "step": 1698 }, { "epoch": 0.17476625911846297, "grad_norm": 0.1332167536020279, "learning_rate": 0.006988496302382908, "loss": 2.6695, "step": 1701 }, { "epoch": 0.17507448885235796, "grad_norm": 0.09555593878030777, "learning_rate": 0.007000821692686935, "loss": 2.6811, "step": 1704 }, { "epoch": 0.17538271858625296, "grad_norm": 0.10987939685583115, "learning_rate": 0.007013147082990961, "loss": 2.6524, "step": 1707 }, { "epoch": 0.17569094832014795, "grad_norm": 0.11458218097686768, "learning_rate": 0.007025472473294988, "loss": 2.6085, "step": 1710 }, { "epoch": 0.17599917805404294, "grad_norm": 0.12646709382534027, "learning_rate": 0.007037797863599014, "loss": 2.6561, "step": 1713 }, { "epoch": 0.17630740778793794, "grad_norm": 0.15338967740535736, "learning_rate": 0.00705012325390304, "loss": 2.6471, "step": 1716 }, { "epoch": 0.17661563752183293, "grad_norm": 0.14660318195819855, "learning_rate": 0.007062448644207067, "loss": 2.6532, "step": 1719 }, { "epoch": 0.17692386725572792, "grad_norm": 0.2730877995491028, "learning_rate": 0.0070747740345110935, "loss": 2.6565, "step": 1722 }, { "epoch": 0.17723209698962294, "grad_norm": 0.26743727922439575, "learning_rate": 0.007087099424815119, "loss": 2.6707, "step": 1725 }, { "epoch": 0.17754032672351794, "grad_norm": 0.13842618465423584, "learning_rate": 0.007099424815119146, "loss": 2.6652, "step": 1728 }, { "epoch": 0.17784855645741293, "grad_norm": 0.15871621668338776, "learning_rate": 0.0071117502054231715, "loss": 2.6464, "step": 1731 }, { "epoch": 0.17815678619130793, "grad_norm": 0.11526347696781158, "learning_rate": 0.007124075595727198, "loss": 2.662, "step": 1734 }, { "epoch": 0.17846501592520292, "grad_norm": 0.21620534360408783, "learning_rate": 0.007136400986031224, "loss": 2.6603, "step": 1737 }, { "epoch": 0.1787732456590979, "grad_norm": 0.0905444398522377, "learning_rate": 0.0071487263763352505, "loss": 2.6523, "step": 1740 }, { "epoch": 0.1790814753929929, "grad_norm": 0.28233054280281067, "learning_rate": 0.007161051766639277, "loss": 2.6597, "step": 1743 }, { "epoch": 0.1793897051268879, "grad_norm": 0.2363336831331253, "learning_rate": 0.007173377156943304, "loss": 2.6483, "step": 1746 }, { "epoch": 0.1796979348607829, "grad_norm": 0.11012139916419983, "learning_rate": 0.0071857025472473295, "loss": 2.6513, "step": 1749 }, { "epoch": 0.1800061645946779, "grad_norm": 0.09720948338508606, "learning_rate": 0.007198027937551356, "loss": 2.6511, "step": 1752 }, { "epoch": 0.1803143943285729, "grad_norm": 0.13130852580070496, "learning_rate": 0.007210353327855383, "loss": 2.6509, "step": 1755 }, { "epoch": 0.1806226240624679, "grad_norm": 0.14865098893642426, "learning_rate": 0.007222678718159409, "loss": 2.6253, "step": 1758 }, { "epoch": 0.1809308537963629, "grad_norm": 0.20482710003852844, "learning_rate": 0.007235004108463434, "loss": 2.6312, "step": 1761 }, { "epoch": 0.1812390835302579, "grad_norm": 0.12063097953796387, "learning_rate": 0.007247329498767461, "loss": 2.6007, "step": 1764 }, { "epoch": 0.18154731326415288, "grad_norm": 0.23084934055805206, "learning_rate": 0.007259654889071487, "loss": 2.6129, "step": 1767 }, { "epoch": 0.18185554299804788, "grad_norm": 0.10387217253446579, "learning_rate": 0.007271980279375514, "loss": 2.6309, "step": 1770 }, { "epoch": 0.18216377273194287, "grad_norm": 0.14229682087898254, "learning_rate": 0.00728430566967954, "loss": 2.6074, "step": 1773 }, { "epoch": 0.18247200246583786, "grad_norm": 0.12009115517139435, "learning_rate": 0.007296631059983566, "loss": 2.6407, "step": 1776 }, { "epoch": 0.18278023219973286, "grad_norm": 0.15677185356616974, "learning_rate": 0.007308956450287593, "loss": 2.6268, "step": 1779 }, { "epoch": 0.18308846193362788, "grad_norm": 0.13304303586483002, "learning_rate": 0.0073212818405916195, "loss": 2.6463, "step": 1782 }, { "epoch": 0.18339669166752287, "grad_norm": 0.15444768965244293, "learning_rate": 0.007333607230895644, "loss": 2.6218, "step": 1785 }, { "epoch": 0.18370492140141786, "grad_norm": 0.1738140732049942, "learning_rate": 0.007345932621199671, "loss": 2.6525, "step": 1788 }, { "epoch": 0.18401315113531286, "grad_norm": 0.13087227940559387, "learning_rate": 0.007358258011503698, "loss": 2.6266, "step": 1791 }, { "epoch": 0.18432138086920785, "grad_norm": 0.1026511862874031, "learning_rate": 0.007370583401807724, "loss": 2.6017, "step": 1794 }, { "epoch": 0.18462961060310285, "grad_norm": 0.11183813214302063, "learning_rate": 0.00738290879211175, "loss": 2.5966, "step": 1797 }, { "epoch": 0.18493784033699784, "grad_norm": 0.12239934504032135, "learning_rate": 0.007395234182415777, "loss": 2.6205, "step": 1800 }, { "epoch": 0.18524607007089283, "grad_norm": 0.2630854845046997, "learning_rate": 0.007407559572719803, "loss": 2.609, "step": 1803 }, { "epoch": 0.18555429980478783, "grad_norm": 0.24282613396644592, "learning_rate": 0.00741988496302383, "loss": 2.6405, "step": 1806 }, { "epoch": 0.18586252953868282, "grad_norm": 0.2825084328651428, "learning_rate": 0.007432210353327855, "loss": 2.5933, "step": 1809 }, { "epoch": 0.18617075927257784, "grad_norm": 0.26462721824645996, "learning_rate": 0.007444535743631881, "loss": 2.6021, "step": 1812 }, { "epoch": 0.18647898900647283, "grad_norm": 0.11797992140054703, "learning_rate": 0.007456861133935908, "loss": 2.6246, "step": 1815 }, { "epoch": 0.18678721874036783, "grad_norm": 0.14044708013534546, "learning_rate": 0.0074691865242399345, "loss": 2.6028, "step": 1818 }, { "epoch": 0.18709544847426282, "grad_norm": 0.1374548226594925, "learning_rate": 0.00748151191454396, "loss": 2.6092, "step": 1821 }, { "epoch": 0.18740367820815781, "grad_norm": 0.10084279626607895, "learning_rate": 0.007493837304847987, "loss": 2.6162, "step": 1824 }, { "epoch": 0.1877119079420528, "grad_norm": 0.1052001565694809, "learning_rate": 0.0075061626951520135, "loss": 2.5742, "step": 1827 }, { "epoch": 0.1880201376759478, "grad_norm": 0.11738535761833191, "learning_rate": 0.00751848808545604, "loss": 2.5715, "step": 1830 }, { "epoch": 0.1883283674098428, "grad_norm": 0.10453224182128906, "learning_rate": 0.007530813475760066, "loss": 2.5896, "step": 1833 }, { "epoch": 0.1886365971437378, "grad_norm": 0.10509374737739563, "learning_rate": 0.007543138866064092, "loss": 2.6047, "step": 1836 }, { "epoch": 0.18894482687763278, "grad_norm": 0.11291799694299698, "learning_rate": 0.007555464256368119, "loss": 2.6062, "step": 1839 }, { "epoch": 0.1892530566115278, "grad_norm": 0.11998583376407623, "learning_rate": 0.007567789646672146, "loss": 2.629, "step": 1842 }, { "epoch": 0.1895612863454228, "grad_norm": 0.21776226162910461, "learning_rate": 0.0075801150369761705, "loss": 2.5847, "step": 1845 }, { "epoch": 0.1898695160793178, "grad_norm": 0.210985466837883, "learning_rate": 0.007592440427280197, "loss": 2.5901, "step": 1848 }, { "epoch": 0.19017774581321278, "grad_norm": 0.11799308657646179, "learning_rate": 0.007604765817584224, "loss": 2.5893, "step": 1851 }, { "epoch": 0.19048597554710778, "grad_norm": 0.10019934922456741, "learning_rate": 0.00761709120788825, "loss": 2.6327, "step": 1854 }, { "epoch": 0.19079420528100277, "grad_norm": 0.07964596897363663, "learning_rate": 0.007629416598192276, "loss": 2.5921, "step": 1857 }, { "epoch": 0.19110243501489776, "grad_norm": 0.16393065452575684, "learning_rate": 0.007641741988496303, "loss": 2.5912, "step": 1860 }, { "epoch": 0.19141066474879276, "grad_norm": 0.324639230966568, "learning_rate": 0.007654067378800329, "loss": 2.5998, "step": 1863 }, { "epoch": 0.19171889448268775, "grad_norm": 0.14071421325206757, "learning_rate": 0.007666392769104356, "loss": 2.5803, "step": 1866 }, { "epoch": 0.19202712421658277, "grad_norm": 0.20063026249408722, "learning_rate": 0.007678718159408381, "loss": 2.6019, "step": 1869 }, { "epoch": 0.19233535395047777, "grad_norm": 0.11311519891023636, "learning_rate": 0.007691043549712407, "loss": 2.5645, "step": 1872 }, { "epoch": 0.19264358368437276, "grad_norm": 0.08542342483997345, "learning_rate": 0.007703368940016434, "loss": 2.6122, "step": 1875 }, { "epoch": 0.19295181341826775, "grad_norm": 0.08306868374347687, "learning_rate": 0.007715694330320461, "loss": 2.5859, "step": 1878 }, { "epoch": 0.19326004315216275, "grad_norm": 0.11635984480381012, "learning_rate": 0.007728019720624486, "loss": 2.5855, "step": 1881 }, { "epoch": 0.19356827288605774, "grad_norm": 0.08945252746343613, "learning_rate": 0.007740345110928513, "loss": 2.5509, "step": 1884 }, { "epoch": 0.19387650261995273, "grad_norm": 0.19044962525367737, "learning_rate": 0.0077526705012325395, "loss": 2.559, "step": 1887 }, { "epoch": 0.19418473235384773, "grad_norm": 0.1462780088186264, "learning_rate": 0.007764995891536566, "loss": 2.5749, "step": 1890 }, { "epoch": 0.19449296208774272, "grad_norm": 0.15944691002368927, "learning_rate": 0.007777321281840591, "loss": 2.5801, "step": 1893 }, { "epoch": 0.19480119182163771, "grad_norm": 0.10125305503606796, "learning_rate": 0.007789646672144618, "loss": 2.5821, "step": 1896 }, { "epoch": 0.19510942155553274, "grad_norm": 0.17344938218593597, "learning_rate": 0.007801972062448644, "loss": 2.5905, "step": 1899 }, { "epoch": 0.19541765128942773, "grad_norm": 0.16651591658592224, "learning_rate": 0.007814297452752672, "loss": 2.5668, "step": 1902 }, { "epoch": 0.19572588102332272, "grad_norm": 0.17417702078819275, "learning_rate": 0.007826622843056696, "loss": 2.568, "step": 1905 }, { "epoch": 0.19603411075721772, "grad_norm": 0.11182334274053574, "learning_rate": 0.007838948233360723, "loss": 2.5547, "step": 1908 }, { "epoch": 0.1963423404911127, "grad_norm": 0.23256631195545197, "learning_rate": 0.007851273623664749, "loss": 2.5722, "step": 1911 }, { "epoch": 0.1966505702250077, "grad_norm": 0.18180392682552338, "learning_rate": 0.007863599013968776, "loss": 2.558, "step": 1914 }, { "epoch": 0.1969587999589027, "grad_norm": 0.12168890237808228, "learning_rate": 0.007875924404272802, "loss": 2.5977, "step": 1917 }, { "epoch": 0.1972670296927977, "grad_norm": 0.11032187938690186, "learning_rate": 0.007888249794576828, "loss": 2.5846, "step": 1920 }, { "epoch": 0.19757525942669268, "grad_norm": 0.0740116760134697, "learning_rate": 0.007900575184880855, "loss": 2.5824, "step": 1923 }, { "epoch": 0.19788348916058768, "grad_norm": 0.05902474746108055, "learning_rate": 0.007912900575184881, "loss": 2.5497, "step": 1926 }, { "epoch": 0.1981917188944827, "grad_norm": 0.09003309905529022, "learning_rate": 0.007925225965488907, "loss": 2.5523, "step": 1929 }, { "epoch": 0.1984999486283777, "grad_norm": 0.4191035330295563, "learning_rate": 0.007937551355792934, "loss": 2.6223, "step": 1932 }, { "epoch": 0.1988081783622727, "grad_norm": 0.17093214392662048, "learning_rate": 0.00794987674609696, "loss": 2.5647, "step": 1935 }, { "epoch": 0.19911640809616768, "grad_norm": 0.0921127051115036, "learning_rate": 0.007962202136400986, "loss": 2.564, "step": 1938 }, { "epoch": 0.19942463783006267, "grad_norm": 0.14204134047031403, "learning_rate": 0.007974527526705012, "loss": 2.5972, "step": 1941 }, { "epoch": 0.19973286756395767, "grad_norm": 0.07556895911693573, "learning_rate": 0.007986852917009039, "loss": 2.5796, "step": 1944 }, { "epoch": 0.20004109729785266, "grad_norm": 0.07290320843458176, "learning_rate": 0.007999178307313065, "loss": 2.5564, "step": 1947 }, { "epoch": 0.20034932703174765, "grad_norm": 0.1624913364648819, "learning_rate": 0.008011503697617092, "loss": 2.5849, "step": 1950 }, { "epoch": 0.20065755676564265, "grad_norm": 0.11839967221021652, "learning_rate": 0.008023829087921118, "loss": 2.5611, "step": 1953 }, { "epoch": 0.20096578649953767, "grad_norm": 0.14280788600444794, "learning_rate": 0.008036154478225144, "loss": 2.5289, "step": 1956 }, { "epoch": 0.20127401623343266, "grad_norm": 0.11515247821807861, "learning_rate": 0.008048479868529171, "loss": 2.5678, "step": 1959 }, { "epoch": 0.20158224596732766, "grad_norm": 0.1147715225815773, "learning_rate": 0.008060805258833197, "loss": 2.5452, "step": 1962 }, { "epoch": 0.20189047570122265, "grad_norm": 0.09767001122236252, "learning_rate": 0.008073130649137223, "loss": 2.6023, "step": 1965 }, { "epoch": 0.20219870543511764, "grad_norm": 0.0866391509771347, "learning_rate": 0.008085456039441248, "loss": 2.5518, "step": 1968 }, { "epoch": 0.20250693516901264, "grad_norm": 0.1610632985830307, "learning_rate": 0.008097781429745276, "loss": 2.5271, "step": 1971 }, { "epoch": 0.20281516490290763, "grad_norm": 0.20238341391086578, "learning_rate": 0.008110106820049302, "loss": 2.5597, "step": 1974 }, { "epoch": 0.20312339463680262, "grad_norm": 0.11807162314653397, "learning_rate": 0.008122432210353327, "loss": 2.5663, "step": 1977 }, { "epoch": 0.20343162437069762, "grad_norm": 0.14654900133609772, "learning_rate": 0.008134757600657355, "loss": 2.5729, "step": 1980 }, { "epoch": 0.2037398541045926, "grad_norm": 0.17804567515850067, "learning_rate": 0.00814708299096138, "loss": 2.5658, "step": 1983 }, { "epoch": 0.20404808383848763, "grad_norm": 0.12376303225755692, "learning_rate": 0.008159408381265408, "loss": 2.5703, "step": 1986 }, { "epoch": 0.20435631357238263, "grad_norm": 0.1248418316245079, "learning_rate": 0.008171733771569432, "loss": 2.5328, "step": 1989 }, { "epoch": 0.20466454330627762, "grad_norm": 0.08159278333187103, "learning_rate": 0.00818405916187346, "loss": 2.5349, "step": 1992 }, { "epoch": 0.2049727730401726, "grad_norm": 0.11184779554605484, "learning_rate": 0.008196384552177485, "loss": 2.5557, "step": 1995 }, { "epoch": 0.2052810027740676, "grad_norm": 0.09568610787391663, "learning_rate": 0.008208709942481513, "loss": 2.5415, "step": 1998 }, { "epoch": 0.2055892325079626, "grad_norm": 0.08708583563566208, "learning_rate": 0.008221035332785539, "loss": 2.5369, "step": 2001 }, { "epoch": 0.2058974622418576, "grad_norm": 0.11849135160446167, "learning_rate": 0.008233360723089564, "loss": 2.5617, "step": 2004 }, { "epoch": 0.2062056919757526, "grad_norm": 0.1407340168952942, "learning_rate": 0.008245686113393592, "loss": 2.5374, "step": 2007 }, { "epoch": 0.20651392170964758, "grad_norm": 0.13198955357074738, "learning_rate": 0.008258011503697617, "loss": 2.57, "step": 2010 }, { "epoch": 0.20682215144354257, "grad_norm": 0.12408044934272766, "learning_rate": 0.008270336894001643, "loss": 2.5344, "step": 2013 }, { "epoch": 0.2071303811774376, "grad_norm": 0.149169921875, "learning_rate": 0.008282662284305669, "loss": 2.5357, "step": 2016 }, { "epoch": 0.2074386109113326, "grad_norm": 0.10010293871164322, "learning_rate": 0.008294987674609696, "loss": 2.5166, "step": 2019 }, { "epoch": 0.20774684064522758, "grad_norm": 0.17650344967842102, "learning_rate": 0.008307313064913722, "loss": 2.5664, "step": 2022 }, { "epoch": 0.20805507037912258, "grad_norm": 0.09946206212043762, "learning_rate": 0.008319638455217748, "loss": 2.5378, "step": 2025 }, { "epoch": 0.20836330011301757, "grad_norm": 0.07705225795507431, "learning_rate": 0.008331963845521775, "loss": 2.5088, "step": 2028 }, { "epoch": 0.20867152984691256, "grad_norm": 0.18174925446510315, "learning_rate": 0.008344289235825801, "loss": 2.5264, "step": 2031 }, { "epoch": 0.20897975958080756, "grad_norm": 0.14415894448757172, "learning_rate": 0.008356614626129829, "loss": 2.5549, "step": 2034 }, { "epoch": 0.20928798931470255, "grad_norm": 0.17721933126449585, "learning_rate": 0.008368940016433854, "loss": 2.5476, "step": 2037 }, { "epoch": 0.20959621904859754, "grad_norm": 0.1727544367313385, "learning_rate": 0.00838126540673788, "loss": 2.5809, "step": 2040 }, { "epoch": 0.20990444878249256, "grad_norm": 0.20624054968357086, "learning_rate": 0.008393590797041908, "loss": 2.5256, "step": 2043 }, { "epoch": 0.21021267851638756, "grad_norm": 0.08070924133062363, "learning_rate": 0.008405916187345933, "loss": 2.5537, "step": 2046 }, { "epoch": 0.21052090825028255, "grad_norm": 0.07868220657110214, "learning_rate": 0.008418241577649959, "loss": 2.5266, "step": 2049 }, { "epoch": 0.21082913798417754, "grad_norm": 0.19941876828670502, "learning_rate": 0.008430566967953985, "loss": 2.5344, "step": 2052 }, { "epoch": 0.21113736771807254, "grad_norm": 0.08758697658777237, "learning_rate": 0.008442892358258012, "loss": 2.5409, "step": 2055 }, { "epoch": 0.21144559745196753, "grad_norm": 0.11635969579219818, "learning_rate": 0.008455217748562038, "loss": 2.5497, "step": 2058 }, { "epoch": 0.21175382718586253, "grad_norm": 0.16910326480865479, "learning_rate": 0.008467543138866064, "loss": 2.5509, "step": 2061 }, { "epoch": 0.21206205691975752, "grad_norm": 0.14605827629566193, "learning_rate": 0.008479868529170091, "loss": 2.5589, "step": 2064 }, { "epoch": 0.2123702866536525, "grad_norm": 0.18890123069286346, "learning_rate": 0.008492193919474117, "loss": 2.5454, "step": 2067 }, { "epoch": 0.2126785163875475, "grad_norm": 0.09277717024087906, "learning_rate": 0.008504519309778144, "loss": 2.4984, "step": 2070 }, { "epoch": 0.21298674612144253, "grad_norm": 0.07268327474594116, "learning_rate": 0.008516844700082168, "loss": 2.5323, "step": 2073 }, { "epoch": 0.21329497585533752, "grad_norm": 0.0807403028011322, "learning_rate": 0.008529170090386196, "loss": 2.5083, "step": 2076 }, { "epoch": 0.21360320558923251, "grad_norm": 0.12681947648525238, "learning_rate": 0.008541495480690222, "loss": 2.5386, "step": 2079 }, { "epoch": 0.2139114353231275, "grad_norm": 0.25378334522247314, "learning_rate": 0.008553820870994249, "loss": 2.5188, "step": 2082 }, { "epoch": 0.2142196650570225, "grad_norm": 0.15101733803749084, "learning_rate": 0.008566146261298275, "loss": 2.5457, "step": 2085 }, { "epoch": 0.2145278947909175, "grad_norm": 0.17336703836917877, "learning_rate": 0.0085784716516023, "loss": 2.5206, "step": 2088 }, { "epoch": 0.2148361245248125, "grad_norm": 0.07735245674848557, "learning_rate": 0.008590797041906328, "loss": 2.5297, "step": 2091 }, { "epoch": 0.21514435425870748, "grad_norm": 0.15841136872768402, "learning_rate": 0.008603122432210354, "loss": 2.5086, "step": 2094 }, { "epoch": 0.21545258399260248, "grad_norm": 0.15941859781742096, "learning_rate": 0.00861544782251438, "loss": 2.5316, "step": 2097 }, { "epoch": 0.21576081372649747, "grad_norm": 0.13837756216526031, "learning_rate": 0.008627773212818405, "loss": 2.4818, "step": 2100 }, { "epoch": 0.2160690434603925, "grad_norm": 0.14743675291538239, "learning_rate": 0.008640098603122433, "loss": 2.5351, "step": 2103 }, { "epoch": 0.21637727319428748, "grad_norm": 0.15961112082004547, "learning_rate": 0.008652423993426459, "loss": 2.4916, "step": 2106 }, { "epoch": 0.21668550292818248, "grad_norm": 0.16091223061084747, "learning_rate": 0.008664749383730484, "loss": 2.5026, "step": 2109 }, { "epoch": 0.21699373266207747, "grad_norm": 0.1695915311574936, "learning_rate": 0.008677074774034512, "loss": 2.4994, "step": 2112 }, { "epoch": 0.21730196239597246, "grad_norm": 0.1457175761461258, "learning_rate": 0.008689400164338537, "loss": 2.5225, "step": 2115 }, { "epoch": 0.21761019212986746, "grad_norm": 0.0995342880487442, "learning_rate": 0.008701725554642563, "loss": 2.5373, "step": 2118 }, { "epoch": 0.21791842186376245, "grad_norm": 0.11527393013238907, "learning_rate": 0.00871405094494659, "loss": 2.5207, "step": 2121 }, { "epoch": 0.21822665159765745, "grad_norm": 0.07951527088880539, "learning_rate": 0.008726376335250616, "loss": 2.4868, "step": 2124 }, { "epoch": 0.21853488133155244, "grad_norm": 0.11319970339536667, "learning_rate": 0.008738701725554644, "loss": 2.4965, "step": 2127 }, { "epoch": 0.21884311106544746, "grad_norm": 0.14932893216609955, "learning_rate": 0.008751027115858668, "loss": 2.5164, "step": 2130 }, { "epoch": 0.21915134079934245, "grad_norm": 0.1703396886587143, "learning_rate": 0.008763352506162695, "loss": 2.5175, "step": 2133 }, { "epoch": 0.21945957053323745, "grad_norm": 0.2208787351846695, "learning_rate": 0.008775677896466721, "loss": 2.521, "step": 2136 }, { "epoch": 0.21976780026713244, "grad_norm": 0.0884699895977974, "learning_rate": 0.008788003286770749, "loss": 2.5356, "step": 2139 }, { "epoch": 0.22007603000102743, "grad_norm": 0.06739311665296555, "learning_rate": 0.008800328677074774, "loss": 2.5102, "step": 2142 }, { "epoch": 0.22038425973492243, "grad_norm": 0.09653139859437943, "learning_rate": 0.0088126540673788, "loss": 2.5047, "step": 2145 }, { "epoch": 0.22069248946881742, "grad_norm": 0.11972832679748535, "learning_rate": 0.008824979457682828, "loss": 2.5086, "step": 2148 }, { "epoch": 0.22100071920271241, "grad_norm": 0.13725396990776062, "learning_rate": 0.008837304847986853, "loss": 2.5034, "step": 2151 }, { "epoch": 0.2213089489366074, "grad_norm": 0.09293966740369797, "learning_rate": 0.008849630238290879, "loss": 2.5004, "step": 2154 }, { "epoch": 0.2216171786705024, "grad_norm": 0.07625159621238708, "learning_rate": 0.008861955628594905, "loss": 2.508, "step": 2157 }, { "epoch": 0.22192540840439742, "grad_norm": 0.08581928163766861, "learning_rate": 0.008874281018898932, "loss": 2.4973, "step": 2160 }, { "epoch": 0.22223363813829242, "grad_norm": 0.12700457870960236, "learning_rate": 0.008886606409202958, "loss": 2.5174, "step": 2163 }, { "epoch": 0.2225418678721874, "grad_norm": 0.17155064642429352, "learning_rate": 0.008898931799506984, "loss": 2.4969, "step": 2166 }, { "epoch": 0.2228500976060824, "grad_norm": 0.13356278836727142, "learning_rate": 0.008911257189811011, "loss": 2.4876, "step": 2169 }, { "epoch": 0.2231583273399774, "grad_norm": 0.07805536687374115, "learning_rate": 0.008923582580115037, "loss": 2.5151, "step": 2172 }, { "epoch": 0.2234665570738724, "grad_norm": 0.10661714524030685, "learning_rate": 0.008935907970419064, "loss": 2.4607, "step": 2175 }, { "epoch": 0.22377478680776738, "grad_norm": 0.15095242857933044, "learning_rate": 0.008948233360723088, "loss": 2.5358, "step": 2178 }, { "epoch": 0.22408301654166238, "grad_norm": 0.11287077516317368, "learning_rate": 0.008960558751027116, "loss": 2.5289, "step": 2181 }, { "epoch": 0.22439124627555737, "grad_norm": 0.16408318281173706, "learning_rate": 0.008972884141331142, "loss": 2.5256, "step": 2184 }, { "epoch": 0.22469947600945236, "grad_norm": 0.1227622851729393, "learning_rate": 0.008985209531635169, "loss": 2.5091, "step": 2187 }, { "epoch": 0.22500770574334739, "grad_norm": 0.06549924612045288, "learning_rate": 0.008997534921939195, "loss": 2.4908, "step": 2190 }, { "epoch": 0.22531593547724238, "grad_norm": 0.09310626983642578, "learning_rate": 0.00900986031224322, "loss": 2.4903, "step": 2193 }, { "epoch": 0.22562416521113737, "grad_norm": 0.12637357413768768, "learning_rate": 0.009022185702547248, "loss": 2.5089, "step": 2196 }, { "epoch": 0.22593239494503237, "grad_norm": 0.1691301167011261, "learning_rate": 0.009034511092851274, "loss": 2.4984, "step": 2199 }, { "epoch": 0.22624062467892736, "grad_norm": 0.18173068761825562, "learning_rate": 0.0090468364831553, "loss": 2.4552, "step": 2202 }, { "epoch": 0.22654885441282235, "grad_norm": 0.19549600780010223, "learning_rate": 0.009059161873459327, "loss": 2.4642, "step": 2205 }, { "epoch": 0.22685708414671735, "grad_norm": 0.09038446098566055, "learning_rate": 0.009071487263763353, "loss": 2.5017, "step": 2208 }, { "epoch": 0.22716531388061234, "grad_norm": 0.07959726452827454, "learning_rate": 0.009083812654067379, "loss": 2.4934, "step": 2211 }, { "epoch": 0.22747354361450733, "grad_norm": 0.07991699874401093, "learning_rate": 0.009096138044371404, "loss": 2.498, "step": 2214 }, { "epoch": 0.22778177334840236, "grad_norm": 0.09022307395935059, "learning_rate": 0.009108463434675432, "loss": 2.4832, "step": 2217 }, { "epoch": 0.22809000308229735, "grad_norm": 0.11399543285369873, "learning_rate": 0.009120788824979457, "loss": 2.4929, "step": 2220 }, { "epoch": 0.22839823281619234, "grad_norm": 0.10349836200475693, "learning_rate": 0.009133114215283485, "loss": 2.4622, "step": 2223 }, { "epoch": 0.22870646255008734, "grad_norm": 0.17096632719039917, "learning_rate": 0.00914543960558751, "loss": 2.5103, "step": 2226 }, { "epoch": 0.22901469228398233, "grad_norm": 0.13803228735923767, "learning_rate": 0.009157764995891536, "loss": 2.5034, "step": 2229 }, { "epoch": 0.22932292201787732, "grad_norm": 0.16332487761974335, "learning_rate": 0.009170090386195564, "loss": 2.5051, "step": 2232 }, { "epoch": 0.22963115175177232, "grad_norm": 0.12147244811058044, "learning_rate": 0.00918241577649959, "loss": 2.4794, "step": 2235 }, { "epoch": 0.2299393814856673, "grad_norm": 0.08943907916545868, "learning_rate": 0.009194741166803615, "loss": 2.5331, "step": 2238 }, { "epoch": 0.2302476112195623, "grad_norm": 0.08069117367267609, "learning_rate": 0.009207066557107641, "loss": 2.4807, "step": 2241 }, { "epoch": 0.2305558409534573, "grad_norm": 0.11125557869672775, "learning_rate": 0.009219391947411669, "loss": 2.4567, "step": 2244 }, { "epoch": 0.23086407068735232, "grad_norm": 0.2825096547603607, "learning_rate": 0.009231717337715694, "loss": 2.5101, "step": 2247 }, { "epoch": 0.2311723004212473, "grad_norm": 0.10534384101629257, "learning_rate": 0.00924404272801972, "loss": 2.5272, "step": 2250 }, { "epoch": 0.2314805301551423, "grad_norm": 0.07159514725208282, "learning_rate": 0.009256368118323748, "loss": 2.4707, "step": 2253 }, { "epoch": 0.2317887598890373, "grad_norm": 0.06435802578926086, "learning_rate": 0.009268693508627773, "loss": 2.4788, "step": 2256 }, { "epoch": 0.2320969896229323, "grad_norm": 0.09402693063020706, "learning_rate": 0.0092810188989318, "loss": 2.4639, "step": 2259 }, { "epoch": 0.2324052193568273, "grad_norm": 0.18836408853530884, "learning_rate": 0.009293344289235825, "loss": 2.4747, "step": 2262 }, { "epoch": 0.23271344909072228, "grad_norm": 0.09705471992492676, "learning_rate": 0.009305669679539852, "loss": 2.5041, "step": 2265 }, { "epoch": 0.23302167882461727, "grad_norm": 0.09185091406106949, "learning_rate": 0.009317995069843878, "loss": 2.4625, "step": 2268 }, { "epoch": 0.23332990855851227, "grad_norm": 0.0848812386393547, "learning_rate": 0.009330320460147905, "loss": 2.4876, "step": 2271 }, { "epoch": 0.2336381382924073, "grad_norm": 0.07989475131034851, "learning_rate": 0.009342645850451931, "loss": 2.4697, "step": 2274 }, { "epoch": 0.23394636802630228, "grad_norm": 0.09660454094409943, "learning_rate": 0.009354971240755957, "loss": 2.4917, "step": 2277 }, { "epoch": 0.23425459776019728, "grad_norm": 0.09550273418426514, "learning_rate": 0.009367296631059984, "loss": 2.4806, "step": 2280 }, { "epoch": 0.23456282749409227, "grad_norm": 0.16650651395320892, "learning_rate": 0.00937962202136401, "loss": 2.4424, "step": 2283 }, { "epoch": 0.23487105722798726, "grad_norm": 0.1455817073583603, "learning_rate": 0.009391947411668036, "loss": 2.4907, "step": 2286 }, { "epoch": 0.23517928696188226, "grad_norm": 0.075865738093853, "learning_rate": 0.009404272801972062, "loss": 2.5004, "step": 2289 }, { "epoch": 0.23548751669577725, "grad_norm": 0.188491553068161, "learning_rate": 0.009416598192276089, "loss": 2.5111, "step": 2292 }, { "epoch": 0.23579574642967224, "grad_norm": 0.07567702233791351, "learning_rate": 0.009428923582580115, "loss": 2.4966, "step": 2295 }, { "epoch": 0.23610397616356724, "grad_norm": 0.0682358667254448, "learning_rate": 0.00944124897288414, "loss": 2.4781, "step": 2298 }, { "epoch": 0.23641220589746223, "grad_norm": 0.173895925283432, "learning_rate": 0.009453574363188168, "loss": 2.4471, "step": 2301 }, { "epoch": 0.23672043563135725, "grad_norm": 0.15088587999343872, "learning_rate": 0.009465899753492194, "loss": 2.4783, "step": 2304 }, { "epoch": 0.23702866536525224, "grad_norm": 0.09947361797094345, "learning_rate": 0.009478225143796221, "loss": 2.4757, "step": 2307 }, { "epoch": 0.23733689509914724, "grad_norm": 0.0709480568766594, "learning_rate": 0.009490550534100247, "loss": 2.4617, "step": 2310 }, { "epoch": 0.23764512483304223, "grad_norm": 0.11335324496030807, "learning_rate": 0.009502875924404273, "loss": 2.4506, "step": 2313 }, { "epoch": 0.23795335456693723, "grad_norm": 0.10329569876194, "learning_rate": 0.0095152013147083, "loss": 2.4444, "step": 2316 }, { "epoch": 0.23826158430083222, "grad_norm": 0.18935157358646393, "learning_rate": 0.009527526705012326, "loss": 2.4739, "step": 2319 }, { "epoch": 0.2385698140347272, "grad_norm": 0.10977230221033096, "learning_rate": 0.009539852095316352, "loss": 2.4849, "step": 2322 }, { "epoch": 0.2388780437686222, "grad_norm": 0.1623351126909256, "learning_rate": 0.009552177485620377, "loss": 2.4856, "step": 2325 }, { "epoch": 0.2391862735025172, "grad_norm": 0.12067209929227829, "learning_rate": 0.009564502875924405, "loss": 2.427, "step": 2328 }, { "epoch": 0.2394945032364122, "grad_norm": 0.12578649818897247, "learning_rate": 0.00957682826622843, "loss": 2.4719, "step": 2331 }, { "epoch": 0.23980273297030721, "grad_norm": 0.09442924708127975, "learning_rate": 0.009589153656532456, "loss": 2.475, "step": 2334 }, { "epoch": 0.2401109627042022, "grad_norm": 0.06693053990602493, "learning_rate": 0.009601479046836484, "loss": 2.4949, "step": 2337 }, { "epoch": 0.2404191924380972, "grad_norm": 0.09371168911457062, "learning_rate": 0.00961380443714051, "loss": 2.4611, "step": 2340 }, { "epoch": 0.2407274221719922, "grad_norm": 0.11009377986192703, "learning_rate": 0.009626129827444537, "loss": 2.4998, "step": 2343 }, { "epoch": 0.2410356519058872, "grad_norm": 0.08789053559303284, "learning_rate": 0.009638455217748561, "loss": 2.4891, "step": 2346 }, { "epoch": 0.24134388163978218, "grad_norm": 0.2513992488384247, "learning_rate": 0.009650780608052589, "loss": 2.4613, "step": 2349 }, { "epoch": 0.24165211137367718, "grad_norm": 0.09223336726427078, "learning_rate": 0.009663105998356614, "loss": 2.4874, "step": 2352 }, { "epoch": 0.24196034110757217, "grad_norm": 0.08941586315631866, "learning_rate": 0.009675431388660642, "loss": 2.4777, "step": 2355 }, { "epoch": 0.24226857084146716, "grad_norm": 0.09664765000343323, "learning_rate": 0.009687756778964668, "loss": 2.4728, "step": 2358 }, { "epoch": 0.24257680057536218, "grad_norm": 0.08079587668180466, "learning_rate": 0.009700082169268693, "loss": 2.4621, "step": 2361 }, { "epoch": 0.24288503030925718, "grad_norm": 0.07663597911596298, "learning_rate": 0.00971240755957272, "loss": 2.487, "step": 2364 }, { "epoch": 0.24319326004315217, "grad_norm": 0.07564109563827515, "learning_rate": 0.009724732949876747, "loss": 2.4123, "step": 2367 }, { "epoch": 0.24350148977704716, "grad_norm": 0.1025756299495697, "learning_rate": 0.009737058340180772, "loss": 2.4669, "step": 2370 }, { "epoch": 0.24380971951094216, "grad_norm": 0.1370251476764679, "learning_rate": 0.009749383730484798, "loss": 2.4664, "step": 2373 }, { "epoch": 0.24411794924483715, "grad_norm": 0.11926325410604477, "learning_rate": 0.009761709120788825, "loss": 2.4483, "step": 2376 }, { "epoch": 0.24442617897873214, "grad_norm": 0.16847510635852814, "learning_rate": 0.009774034511092851, "loss": 2.4421, "step": 2379 }, { "epoch": 0.24473440871262714, "grad_norm": 0.14343461394309998, "learning_rate": 0.009786359901396877, "loss": 2.452, "step": 2382 }, { "epoch": 0.24504263844652213, "grad_norm": 0.0658588707447052, "learning_rate": 0.009798685291700904, "loss": 2.4717, "step": 2385 }, { "epoch": 0.24535086818041713, "grad_norm": 0.09394209086894989, "learning_rate": 0.00981101068200493, "loss": 2.4467, "step": 2388 }, { "epoch": 0.24565909791431215, "grad_norm": 0.0717134177684784, "learning_rate": 0.009823336072308958, "loss": 2.4505, "step": 2391 }, { "epoch": 0.24596732764820714, "grad_norm": 0.07518400996923447, "learning_rate": 0.009835661462612983, "loss": 2.431, "step": 2394 }, { "epoch": 0.24627555738210213, "grad_norm": 0.10242413729429245, "learning_rate": 0.00984798685291701, "loss": 2.451, "step": 2397 }, { "epoch": 0.24658378711599713, "grad_norm": 0.11668457090854645, "learning_rate": 0.009860312243221035, "loss": 2.4574, "step": 2400 }, { "epoch": 0.24689201684989212, "grad_norm": 0.1074887290596962, "learning_rate": 0.009872637633525062, "loss": 2.4688, "step": 2403 }, { "epoch": 0.24720024658378711, "grad_norm": 0.143118217587471, "learning_rate": 0.009884963023829088, "loss": 2.4614, "step": 2406 }, { "epoch": 0.2475084763176821, "grad_norm": 0.08865509182214737, "learning_rate": 0.009897288414133114, "loss": 2.4768, "step": 2409 }, { "epoch": 0.2478167060515771, "grad_norm": 0.10735021531581879, "learning_rate": 0.009909613804437141, "loss": 2.457, "step": 2412 }, { "epoch": 0.2481249357854721, "grad_norm": 0.11766096949577332, "learning_rate": 0.009921939194741167, "loss": 2.4661, "step": 2415 }, { "epoch": 0.2484331655193671, "grad_norm": 0.11476657539606094, "learning_rate": 0.009934264585045193, "loss": 2.4488, "step": 2418 }, { "epoch": 0.2487413952532621, "grad_norm": 0.05828983336687088, "learning_rate": 0.00994658997534922, "loss": 2.4167, "step": 2421 }, { "epoch": 0.2490496249871571, "grad_norm": 0.05311143398284912, "learning_rate": 0.009958915365653246, "loss": 2.451, "step": 2424 }, { "epoch": 0.2493578547210521, "grad_norm": 0.14447921514511108, "learning_rate": 0.009971240755957273, "loss": 2.4448, "step": 2427 }, { "epoch": 0.2496660844549471, "grad_norm": 0.178679421544075, "learning_rate": 0.009983566146261297, "loss": 2.4577, "step": 2430 }, { "epoch": 0.24997431418884208, "grad_norm": 0.18707922101020813, "learning_rate": 0.009995891536565325, "loss": 2.4544, "step": 2433 }, { "epoch": 0.2502825439227371, "grad_norm": 0.11012792587280273, "learning_rate": 0.01, "loss": 2.4636, "step": 2436 }, { "epoch": 0.25059077365663207, "grad_norm": 0.1133418157696724, "learning_rate": 0.01, "loss": 2.4694, "step": 2439 }, { "epoch": 0.25089900339052706, "grad_norm": 0.09263787418603897, "learning_rate": 0.01, "loss": 2.4174, "step": 2442 }, { "epoch": 0.25120723312442206, "grad_norm": 0.07637537270784378, "learning_rate": 0.01, "loss": 2.4546, "step": 2445 }, { "epoch": 0.25151546285831705, "grad_norm": 0.05083318054676056, "learning_rate": 0.01, "loss": 2.4517, "step": 2448 }, { "epoch": 0.25182369259221205, "grad_norm": 0.11429949849843979, "learning_rate": 0.01, "loss": 2.3998, "step": 2451 }, { "epoch": 0.25213192232610704, "grad_norm": 0.0740060955286026, "learning_rate": 0.01, "loss": 2.4572, "step": 2454 }, { "epoch": 0.25244015206000203, "grad_norm": 0.23151956498622894, "learning_rate": 0.01, "loss": 2.4507, "step": 2457 }, { "epoch": 0.252748381793897, "grad_norm": 0.09557089954614639, "learning_rate": 0.01, "loss": 2.438, "step": 2460 }, { "epoch": 0.2530566115277921, "grad_norm": 0.06453042477369308, "learning_rate": 0.01, "loss": 2.4444, "step": 2463 }, { "epoch": 0.25336484126168707, "grad_norm": 0.06805883347988129, "learning_rate": 0.01, "loss": 2.4333, "step": 2466 }, { "epoch": 0.25367307099558206, "grad_norm": 0.12063002586364746, "learning_rate": 0.01, "loss": 2.4349, "step": 2469 }, { "epoch": 0.25398130072947706, "grad_norm": 0.12683679163455963, "learning_rate": 0.01, "loss": 2.4615, "step": 2472 }, { "epoch": 0.25428953046337205, "grad_norm": 0.19388514757156372, "learning_rate": 0.01, "loss": 2.4251, "step": 2475 }, { "epoch": 0.25459776019726704, "grad_norm": 0.15118692815303802, "learning_rate": 0.01, "loss": 2.4493, "step": 2478 }, { "epoch": 0.25490598993116204, "grad_norm": 0.0716528594493866, "learning_rate": 0.01, "loss": 2.4177, "step": 2481 }, { "epoch": 0.25521421966505703, "grad_norm": 0.06410454958677292, "learning_rate": 0.01, "loss": 2.4472, "step": 2484 }, { "epoch": 0.255522449398952, "grad_norm": 0.0613977424800396, "learning_rate": 0.01, "loss": 2.4374, "step": 2487 }, { "epoch": 0.255830679132847, "grad_norm": 0.18522503972053528, "learning_rate": 0.01, "loss": 2.4237, "step": 2490 }, { "epoch": 0.256138908866742, "grad_norm": 0.10789433866739273, "learning_rate": 0.01, "loss": 2.4035, "step": 2493 }, { "epoch": 0.256447138600637, "grad_norm": 0.17734338343143463, "learning_rate": 0.01, "loss": 2.4809, "step": 2496 }, { "epoch": 0.256755368334532, "grad_norm": 0.09952409565448761, "learning_rate": 0.01, "loss": 2.4586, "step": 2499 }, { "epoch": 0.257063598068427, "grad_norm": 0.15578734874725342, "learning_rate": 0.01, "loss": 2.4451, "step": 2502 }, { "epoch": 0.257371827802322, "grad_norm": 0.11684698611497879, "learning_rate": 0.01, "loss": 2.4422, "step": 2505 }, { "epoch": 0.257680057536217, "grad_norm": 0.06539366394281387, "learning_rate": 0.01, "loss": 2.4369, "step": 2508 }, { "epoch": 0.25798828727011197, "grad_norm": 0.15363268554210663, "learning_rate": 0.01, "loss": 2.4307, "step": 2511 }, { "epoch": 0.25829651700400696, "grad_norm": 0.07657501846551895, "learning_rate": 0.01, "loss": 2.4287, "step": 2514 }, { "epoch": 0.25860474673790196, "grad_norm": 0.11238528788089752, "learning_rate": 0.01, "loss": 2.415, "step": 2517 }, { "epoch": 0.25891297647179695, "grad_norm": 0.08362044394016266, "learning_rate": 0.01, "loss": 2.4603, "step": 2520 }, { "epoch": 0.259221206205692, "grad_norm": 0.07373514771461487, "learning_rate": 0.01, "loss": 2.3937, "step": 2523 }, { "epoch": 0.259529435939587, "grad_norm": 0.062842458486557, "learning_rate": 0.01, "loss": 2.4096, "step": 2526 }, { "epoch": 0.259837665673482, "grad_norm": 0.12551096081733704, "learning_rate": 0.01, "loss": 2.4379, "step": 2529 }, { "epoch": 0.260145895407377, "grad_norm": 0.06409156322479248, "learning_rate": 0.01, "loss": 2.4212, "step": 2532 }, { "epoch": 0.260454125141272, "grad_norm": 0.10057753324508667, "learning_rate": 0.01, "loss": 2.4349, "step": 2535 }, { "epoch": 0.26076235487516697, "grad_norm": 0.1575561910867691, "learning_rate": 0.01, "loss": 2.44, "step": 2538 }, { "epoch": 0.26107058460906196, "grad_norm": 0.25684165954589844, "learning_rate": 0.01, "loss": 2.4308, "step": 2541 }, { "epoch": 0.26137881434295696, "grad_norm": 0.07472192496061325, "learning_rate": 0.01, "loss": 2.4065, "step": 2544 }, { "epoch": 0.26168704407685195, "grad_norm": 0.060896482318639755, "learning_rate": 0.01, "loss": 2.4347, "step": 2547 }, { "epoch": 0.26199527381074694, "grad_norm": 0.12883131206035614, "learning_rate": 0.01, "loss": 2.42, "step": 2550 }, { "epoch": 0.26230350354464194, "grad_norm": 0.10772990435361862, "learning_rate": 0.01, "loss": 2.3982, "step": 2553 }, { "epoch": 0.26261173327853693, "grad_norm": 0.20955395698547363, "learning_rate": 0.01, "loss": 2.4204, "step": 2556 }, { "epoch": 0.2629199630124319, "grad_norm": 0.08120223879814148, "learning_rate": 0.01, "loss": 2.4192, "step": 2559 }, { "epoch": 0.2632281927463269, "grad_norm": 0.059099119156599045, "learning_rate": 0.01, "loss": 2.4252, "step": 2562 }, { "epoch": 0.2635364224802219, "grad_norm": 0.08729352802038193, "learning_rate": 0.01, "loss": 2.4227, "step": 2565 }, { "epoch": 0.2638446522141169, "grad_norm": 0.1920178234577179, "learning_rate": 0.01, "loss": 2.4017, "step": 2568 }, { "epoch": 0.2641528819480119, "grad_norm": 0.15997105836868286, "learning_rate": 0.01, "loss": 2.444, "step": 2571 }, { "epoch": 0.2644611116819069, "grad_norm": 0.12249890714883804, "learning_rate": 0.01, "loss": 2.3957, "step": 2574 }, { "epoch": 0.2647693414158019, "grad_norm": 0.05974414199590683, "learning_rate": 0.01, "loss": 2.421, "step": 2577 }, { "epoch": 0.26507757114969693, "grad_norm": 0.13711535930633545, "learning_rate": 0.01, "loss": 2.4234, "step": 2580 }, { "epoch": 0.2653858008835919, "grad_norm": 0.15437988936901093, "learning_rate": 0.01, "loss": 2.4216, "step": 2583 }, { "epoch": 0.2656940306174869, "grad_norm": 0.10766157507896423, "learning_rate": 0.01, "loss": 2.4086, "step": 2586 }, { "epoch": 0.2660022603513819, "grad_norm": 0.0736764669418335, "learning_rate": 0.01, "loss": 2.4227, "step": 2589 }, { "epoch": 0.2663104900852769, "grad_norm": 0.06279190629720688, "learning_rate": 0.01, "loss": 2.4371, "step": 2592 }, { "epoch": 0.2666187198191719, "grad_norm": 0.11150863766670227, "learning_rate": 0.01, "loss": 2.3941, "step": 2595 }, { "epoch": 0.2669269495530669, "grad_norm": 0.1527506411075592, "learning_rate": 0.01, "loss": 2.4287, "step": 2598 }, { "epoch": 0.2672351792869619, "grad_norm": 0.13321219384670258, "learning_rate": 0.01, "loss": 2.3995, "step": 2601 }, { "epoch": 0.2675434090208569, "grad_norm": 0.1157502606511116, "learning_rate": 0.01, "loss": 2.4284, "step": 2604 }, { "epoch": 0.2678516387547519, "grad_norm": 0.10027257353067398, "learning_rate": 0.01, "loss": 2.3877, "step": 2607 }, { "epoch": 0.26815986848864687, "grad_norm": 0.10909545421600342, "learning_rate": 0.01, "loss": 2.4134, "step": 2610 }, { "epoch": 0.26846809822254186, "grad_norm": 0.09810952842235565, "learning_rate": 0.01, "loss": 2.4231, "step": 2613 }, { "epoch": 0.26877632795643686, "grad_norm": 0.06906435638666153, "learning_rate": 0.01, "loss": 2.3989, "step": 2616 }, { "epoch": 0.26908455769033185, "grad_norm": 0.10627961158752441, "learning_rate": 0.01, "loss": 2.4333, "step": 2619 }, { "epoch": 0.26939278742422684, "grad_norm": 0.10462147742509842, "learning_rate": 0.01, "loss": 2.394, "step": 2622 }, { "epoch": 0.26970101715812184, "grad_norm": 0.10885953158140182, "learning_rate": 0.01, "loss": 2.4172, "step": 2625 }, { "epoch": 0.27000924689201683, "grad_norm": 0.0981958881020546, "learning_rate": 0.01, "loss": 2.4112, "step": 2628 }, { "epoch": 0.2703174766259118, "grad_norm": 0.14177650213241577, "learning_rate": 0.01, "loss": 2.3748, "step": 2631 }, { "epoch": 0.2706257063598068, "grad_norm": 0.06374615430831909, "learning_rate": 0.01, "loss": 2.4054, "step": 2634 }, { "epoch": 0.27093393609370187, "grad_norm": 0.23363849520683289, "learning_rate": 0.01, "loss": 2.4194, "step": 2637 }, { "epoch": 0.27124216582759686, "grad_norm": 0.10294153541326523, "learning_rate": 0.01, "loss": 2.384, "step": 2640 }, { "epoch": 0.27155039556149185, "grad_norm": 0.17200984060764313, "learning_rate": 0.01, "loss": 2.4122, "step": 2643 }, { "epoch": 0.27185862529538685, "grad_norm": 0.06513970345258713, "learning_rate": 0.01, "loss": 2.4152, "step": 2646 }, { "epoch": 0.27216685502928184, "grad_norm": 0.08533628284931183, "learning_rate": 0.01, "loss": 2.4508, "step": 2649 }, { "epoch": 0.27247508476317683, "grad_norm": 0.07299966365098953, "learning_rate": 0.01, "loss": 2.4091, "step": 2652 }, { "epoch": 0.2727833144970718, "grad_norm": 0.06617329269647598, "learning_rate": 0.01, "loss": 2.4, "step": 2655 }, { "epoch": 0.2730915442309668, "grad_norm": 0.07062381505966187, "learning_rate": 0.01, "loss": 2.4002, "step": 2658 }, { "epoch": 0.2733997739648618, "grad_norm": 0.11162712424993515, "learning_rate": 0.01, "loss": 2.386, "step": 2661 }, { "epoch": 0.2737080036987568, "grad_norm": 0.07827174663543701, "learning_rate": 0.01, "loss": 2.4111, "step": 2664 }, { "epoch": 0.2740162334326518, "grad_norm": 0.07248109579086304, "learning_rate": 0.01, "loss": 2.3968, "step": 2667 }, { "epoch": 0.2743244631665468, "grad_norm": 0.1251075118780136, "learning_rate": 0.01, "loss": 2.4216, "step": 2670 }, { "epoch": 0.2746326929004418, "grad_norm": 0.1280512660741806, "learning_rate": 0.01, "loss": 2.4233, "step": 2673 }, { "epoch": 0.2749409226343368, "grad_norm": 0.06290891766548157, "learning_rate": 0.01, "loss": 2.412, "step": 2676 }, { "epoch": 0.2752491523682318, "grad_norm": 0.09324091672897339, "learning_rate": 0.01, "loss": 2.4025, "step": 2679 }, { "epoch": 0.27555738210212677, "grad_norm": 0.06253890693187714, "learning_rate": 0.01, "loss": 2.4197, "step": 2682 }, { "epoch": 0.27586561183602176, "grad_norm": 0.10279545187950134, "learning_rate": 0.01, "loss": 2.4099, "step": 2685 }, { "epoch": 0.27617384156991676, "grad_norm": 0.07942310720682144, "learning_rate": 0.01, "loss": 2.4052, "step": 2688 }, { "epoch": 0.27648207130381175, "grad_norm": 0.10373161733150482, "learning_rate": 0.01, "loss": 2.3899, "step": 2691 }, { "epoch": 0.2767903010377068, "grad_norm": 0.312575101852417, "learning_rate": 0.01, "loss": 2.4243, "step": 2694 }, { "epoch": 0.2770985307716018, "grad_norm": 0.07417728751897812, "learning_rate": 0.01, "loss": 2.3604, "step": 2697 }, { "epoch": 0.2774067605054968, "grad_norm": 0.09007294476032257, "learning_rate": 0.01, "loss": 2.3863, "step": 2700 }, { "epoch": 0.2777149902393918, "grad_norm": 0.10452757775783539, "learning_rate": 0.01, "loss": 2.41, "step": 2703 }, { "epoch": 0.2780232199732868, "grad_norm": 0.09276364743709564, "learning_rate": 0.01, "loss": 2.3878, "step": 2706 }, { "epoch": 0.27833144970718177, "grad_norm": 0.08949960023164749, "learning_rate": 0.01, "loss": 2.3823, "step": 2709 }, { "epoch": 0.27863967944107676, "grad_norm": 0.0589129813015461, "learning_rate": 0.01, "loss": 2.401, "step": 2712 }, { "epoch": 0.27894790917497175, "grad_norm": 0.08298425376415253, "learning_rate": 0.01, "loss": 2.4, "step": 2715 }, { "epoch": 0.27925613890886675, "grad_norm": 0.07719019800424576, "learning_rate": 0.01, "loss": 2.3726, "step": 2718 }, { "epoch": 0.27956436864276174, "grad_norm": 0.09369128197431564, "learning_rate": 0.01, "loss": 2.3893, "step": 2721 }, { "epoch": 0.27987259837665673, "grad_norm": 0.11461931467056274, "learning_rate": 0.01, "loss": 2.4017, "step": 2724 }, { "epoch": 0.2801808281105517, "grad_norm": 0.050078991800546646, "learning_rate": 0.01, "loss": 2.3852, "step": 2727 }, { "epoch": 0.2804890578444467, "grad_norm": 0.08188966661691666, "learning_rate": 0.01, "loss": 2.3469, "step": 2730 }, { "epoch": 0.2807972875783417, "grad_norm": 0.0805756077170372, "learning_rate": 0.01, "loss": 2.3632, "step": 2733 }, { "epoch": 0.2811055173122367, "grad_norm": 0.07377249747514725, "learning_rate": 0.01, "loss": 2.3852, "step": 2736 }, { "epoch": 0.2814137470461317, "grad_norm": 0.17040085792541504, "learning_rate": 0.01, "loss": 2.3904, "step": 2739 }, { "epoch": 0.2817219767800267, "grad_norm": 0.1419583261013031, "learning_rate": 0.01, "loss": 2.3735, "step": 2742 }, { "epoch": 0.2820302065139217, "grad_norm": 0.13182134926319122, "learning_rate": 0.01, "loss": 2.3904, "step": 2745 }, { "epoch": 0.2823384362478167, "grad_norm": 0.1058223620057106, "learning_rate": 0.01, "loss": 2.3922, "step": 2748 }, { "epoch": 0.2826466659817117, "grad_norm": 0.08037062734365463, "learning_rate": 0.01, "loss": 2.3692, "step": 2751 }, { "epoch": 0.2829548957156067, "grad_norm": 0.10247037559747696, "learning_rate": 0.01, "loss": 2.3712, "step": 2754 }, { "epoch": 0.2832631254495017, "grad_norm": 0.09925279021263123, "learning_rate": 0.01, "loss": 2.3632, "step": 2757 }, { "epoch": 0.2835713551833967, "grad_norm": 0.05111562833189964, "learning_rate": 0.01, "loss": 2.3622, "step": 2760 }, { "epoch": 0.2838795849172917, "grad_norm": 0.060480840504169464, "learning_rate": 0.01, "loss": 2.3592, "step": 2763 }, { "epoch": 0.2841878146511867, "grad_norm": 0.13488496840000153, "learning_rate": 0.01, "loss": 2.3822, "step": 2766 }, { "epoch": 0.2844960443850817, "grad_norm": 0.08369171619415283, "learning_rate": 0.01, "loss": 2.3922, "step": 2769 }, { "epoch": 0.2848042741189767, "grad_norm": 0.19474861025810242, "learning_rate": 0.01, "loss": 2.387, "step": 2772 }, { "epoch": 0.2851125038528717, "grad_norm": 0.17801512777805328, "learning_rate": 0.01, "loss": 2.3745, "step": 2775 }, { "epoch": 0.2854207335867667, "grad_norm": 0.0658038854598999, "learning_rate": 0.01, "loss": 2.3857, "step": 2778 }, { "epoch": 0.28572896332066167, "grad_norm": 0.0510118305683136, "learning_rate": 0.01, "loss": 2.3735, "step": 2781 }, { "epoch": 0.28603719305455666, "grad_norm": 0.0649714320898056, "learning_rate": 0.01, "loss": 2.4002, "step": 2784 }, { "epoch": 0.28634542278845165, "grad_norm": 0.11462211608886719, "learning_rate": 0.01, "loss": 2.3642, "step": 2787 }, { "epoch": 0.28665365252234665, "grad_norm": 0.0745900496840477, "learning_rate": 0.01, "loss": 2.4058, "step": 2790 }, { "epoch": 0.28696188225624164, "grad_norm": 0.2475040704011917, "learning_rate": 0.01, "loss": 2.3778, "step": 2793 }, { "epoch": 0.28727011199013663, "grad_norm": 0.08792129158973694, "learning_rate": 0.01, "loss": 2.3932, "step": 2796 }, { "epoch": 0.2875783417240316, "grad_norm": 0.04952983185648918, "learning_rate": 0.01, "loss": 2.3631, "step": 2799 }, { "epoch": 0.2878865714579266, "grad_norm": 0.053665559738874435, "learning_rate": 0.01, "loss": 2.3673, "step": 2802 }, { "epoch": 0.2881948011918216, "grad_norm": 0.0579262301325798, "learning_rate": 0.01, "loss": 2.3234, "step": 2805 }, { "epoch": 0.2885030309257166, "grad_norm": 0.13837358355522156, "learning_rate": 0.01, "loss": 2.3854, "step": 2808 }, { "epoch": 0.28881126065961166, "grad_norm": 0.09924750030040741, "learning_rate": 0.01, "loss": 2.3819, "step": 2811 }, { "epoch": 0.28911949039350665, "grad_norm": 0.14742402732372284, "learning_rate": 0.01, "loss": 2.3853, "step": 2814 }, { "epoch": 0.28942772012740164, "grad_norm": 0.11731177568435669, "learning_rate": 0.01, "loss": 2.4082, "step": 2817 }, { "epoch": 0.28973594986129664, "grad_norm": 0.16812686622142792, "learning_rate": 0.01, "loss": 2.3855, "step": 2820 }, { "epoch": 0.29004417959519163, "grad_norm": 0.06864415854215622, "learning_rate": 0.01, "loss": 2.3911, "step": 2823 }, { "epoch": 0.2903524093290866, "grad_norm": 0.050597239285707474, "learning_rate": 0.01, "loss": 2.3627, "step": 2826 }, { "epoch": 0.2906606390629816, "grad_norm": 0.06927742809057236, "learning_rate": 0.01, "loss": 2.3653, "step": 2829 }, { "epoch": 0.2909688687968766, "grad_norm": 0.049216922372579575, "learning_rate": 0.01, "loss": 2.3527, "step": 2832 }, { "epoch": 0.2912770985307716, "grad_norm": 0.06790090352296829, "learning_rate": 0.01, "loss": 2.4087, "step": 2835 }, { "epoch": 0.2915853282646666, "grad_norm": 0.14112398028373718, "learning_rate": 0.01, "loss": 2.3777, "step": 2838 }, { "epoch": 0.2918935579985616, "grad_norm": 0.07459170371294022, "learning_rate": 0.01, "loss": 2.3776, "step": 2841 }, { "epoch": 0.2922017877324566, "grad_norm": 0.05480146035552025, "learning_rate": 0.01, "loss": 2.3831, "step": 2844 }, { "epoch": 0.2925100174663516, "grad_norm": 0.11372058093547821, "learning_rate": 0.01, "loss": 2.3667, "step": 2847 }, { "epoch": 0.2928182472002466, "grad_norm": 0.05589181184768677, "learning_rate": 0.01, "loss": 2.3744, "step": 2850 }, { "epoch": 0.29312647693414157, "grad_norm": 0.10505107790231705, "learning_rate": 0.01, "loss": 2.3461, "step": 2853 }, { "epoch": 0.29343470666803656, "grad_norm": 0.06241190806031227, "learning_rate": 0.01, "loss": 2.3616, "step": 2856 }, { "epoch": 0.29374293640193155, "grad_norm": 0.2687353193759918, "learning_rate": 0.01, "loss": 2.3728, "step": 2859 }, { "epoch": 0.29405116613582655, "grad_norm": 0.13569511473178864, "learning_rate": 0.01, "loss": 2.3758, "step": 2862 }, { "epoch": 0.29435939586972154, "grad_norm": 0.08852502703666687, "learning_rate": 0.01, "loss": 2.3805, "step": 2865 }, { "epoch": 0.2946676256036166, "grad_norm": 0.0690246969461441, "learning_rate": 0.01, "loss": 2.3694, "step": 2868 }, { "epoch": 0.2949758553375116, "grad_norm": 0.13508114218711853, "learning_rate": 0.01, "loss": 2.3626, "step": 2871 }, { "epoch": 0.2952840850714066, "grad_norm": 0.06574945896863937, "learning_rate": 0.01, "loss": 2.3661, "step": 2874 }, { "epoch": 0.29559231480530157, "grad_norm": 0.08492054790258408, "learning_rate": 0.01, "loss": 2.3737, "step": 2877 }, { "epoch": 0.29590054453919656, "grad_norm": 0.11930177360773087, "learning_rate": 0.01, "loss": 2.3684, "step": 2880 }, { "epoch": 0.29620877427309156, "grad_norm": 0.06913982331752777, "learning_rate": 0.01, "loss": 2.3597, "step": 2883 }, { "epoch": 0.29651700400698655, "grad_norm": 0.1508978009223938, "learning_rate": 0.01, "loss": 2.3807, "step": 2886 }, { "epoch": 0.29682523374088154, "grad_norm": 0.059416841715574265, "learning_rate": 0.01, "loss": 2.3672, "step": 2889 }, { "epoch": 0.29713346347477654, "grad_norm": 0.07864934206008911, "learning_rate": 0.01, "loss": 2.3542, "step": 2892 }, { "epoch": 0.29744169320867153, "grad_norm": 0.15172207355499268, "learning_rate": 0.01, "loss": 2.3816, "step": 2895 }, { "epoch": 0.2977499229425665, "grad_norm": 0.08946362882852554, "learning_rate": 0.01, "loss": 2.3854, "step": 2898 }, { "epoch": 0.2980581526764615, "grad_norm": 0.06231836602091789, "learning_rate": 0.01, "loss": 2.3803, "step": 2901 }, { "epoch": 0.2983663824103565, "grad_norm": 0.06673764437437057, "learning_rate": 0.01, "loss": 2.3506, "step": 2904 }, { "epoch": 0.2986746121442515, "grad_norm": 0.11514609307050705, "learning_rate": 0.01, "loss": 2.3345, "step": 2907 }, { "epoch": 0.2989828418781465, "grad_norm": 0.05702753737568855, "learning_rate": 0.01, "loss": 2.353, "step": 2910 }, { "epoch": 0.2992910716120415, "grad_norm": 0.09202984720468521, "learning_rate": 0.01, "loss": 2.3978, "step": 2913 }, { "epoch": 0.2995993013459365, "grad_norm": 0.09088042378425598, "learning_rate": 0.01, "loss": 2.3508, "step": 2916 }, { "epoch": 0.2999075310798315, "grad_norm": 0.09106214344501495, "learning_rate": 0.01, "loss": 2.3695, "step": 2919 }, { "epoch": 0.3002157608137265, "grad_norm": 0.12793834507465363, "learning_rate": 0.01, "loss": 2.3585, "step": 2922 }, { "epoch": 0.30052399054762147, "grad_norm": 0.16437458992004395, "learning_rate": 0.01, "loss": 2.3708, "step": 2925 }, { "epoch": 0.3008322202815165, "grad_norm": 0.10168170928955078, "learning_rate": 0.01, "loss": 2.3839, "step": 2928 }, { "epoch": 0.3011404500154115, "grad_norm": 0.12716282904148102, "learning_rate": 0.01, "loss": 2.3653, "step": 2931 }, { "epoch": 0.3014486797493065, "grad_norm": 0.05094976723194122, "learning_rate": 0.01, "loss": 2.3315, "step": 2934 }, { "epoch": 0.3017569094832015, "grad_norm": 0.11750750988721848, "learning_rate": 0.01, "loss": 2.3544, "step": 2937 }, { "epoch": 0.3020651392170965, "grad_norm": 0.0688977912068367, "learning_rate": 0.01, "loss": 2.3485, "step": 2940 }, { "epoch": 0.3023733689509915, "grad_norm": 0.09537909924983978, "learning_rate": 0.01, "loss": 2.3357, "step": 2943 }, { "epoch": 0.3026815986848865, "grad_norm": 0.15028056502342224, "learning_rate": 0.01, "loss": 2.3029, "step": 2946 }, { "epoch": 0.30298982841878147, "grad_norm": 0.2069140523672104, "learning_rate": 0.01, "loss": 2.3658, "step": 2949 }, { "epoch": 0.30329805815267646, "grad_norm": 0.04774792492389679, "learning_rate": 0.01, "loss": 2.3488, "step": 2952 }, { "epoch": 0.30360628788657146, "grad_norm": 0.04033259302377701, "learning_rate": 0.01, "loss": 2.3536, "step": 2955 }, { "epoch": 0.30391451762046645, "grad_norm": 0.04587483033537865, "learning_rate": 0.01, "loss": 2.3377, "step": 2958 }, { "epoch": 0.30422274735436144, "grad_norm": 0.08392881602048874, "learning_rate": 0.01, "loss": 2.3323, "step": 2961 }, { "epoch": 0.30453097708825644, "grad_norm": 0.16665025055408478, "learning_rate": 0.01, "loss": 2.3763, "step": 2964 }, { "epoch": 0.30483920682215143, "grad_norm": 0.19268077611923218, "learning_rate": 0.01, "loss": 2.3545, "step": 2967 }, { "epoch": 0.3051474365560464, "grad_norm": 0.14428319036960602, "learning_rate": 0.01, "loss": 2.3481, "step": 2970 }, { "epoch": 0.3054556662899414, "grad_norm": 0.08958342671394348, "learning_rate": 0.01, "loss": 2.3704, "step": 2973 }, { "epoch": 0.3057638960238364, "grad_norm": 0.06964152306318283, "learning_rate": 0.01, "loss": 2.3649, "step": 2976 }, { "epoch": 0.3060721257577314, "grad_norm": 0.1336866170167923, "learning_rate": 0.01, "loss": 2.3426, "step": 2979 }, { "epoch": 0.3063803554916264, "grad_norm": 0.06913724541664124, "learning_rate": 0.01, "loss": 2.363, "step": 2982 }, { "epoch": 0.30668858522552145, "grad_norm": 0.0705854743719101, "learning_rate": 0.01, "loss": 2.327, "step": 2985 }, { "epoch": 0.30699681495941644, "grad_norm": 0.06596222519874573, "learning_rate": 0.01, "loss": 2.3669, "step": 2988 }, { "epoch": 0.30730504469331144, "grad_norm": 0.12716993689537048, "learning_rate": 0.01, "loss": 2.3395, "step": 2991 }, { "epoch": 0.30761327442720643, "grad_norm": 0.09933049976825714, "learning_rate": 0.01, "loss": 2.3532, "step": 2994 }, { "epoch": 0.3079215041611014, "grad_norm": 0.19280697405338287, "learning_rate": 0.01, "loss": 2.3513, "step": 2997 }, { "epoch": 0.3082297338949964, "grad_norm": 0.08448618650436401, "learning_rate": 0.01, "loss": 2.3828, "step": 3000 }, { "epoch": 0.3085379636288914, "grad_norm": 0.14882349967956543, "learning_rate": 0.01, "loss": 2.3398, "step": 3003 }, { "epoch": 0.3088461933627864, "grad_norm": 0.08360068500041962, "learning_rate": 0.01, "loss": 2.3414, "step": 3006 }, { "epoch": 0.3091544230966814, "grad_norm": 0.1378074288368225, "learning_rate": 0.01, "loss": 2.3461, "step": 3009 }, { "epoch": 0.3094626528305764, "grad_norm": 0.13160692155361176, "learning_rate": 0.01, "loss": 2.3517, "step": 3012 }, { "epoch": 0.3097708825644714, "grad_norm": 0.0702040046453476, "learning_rate": 0.01, "loss": 2.3524, "step": 3015 }, { "epoch": 0.3100791122983664, "grad_norm": 0.06959223002195358, "learning_rate": 0.01, "loss": 2.3398, "step": 3018 }, { "epoch": 0.31038734203226137, "grad_norm": 0.10830830782651901, "learning_rate": 0.01, "loss": 2.3437, "step": 3021 }, { "epoch": 0.31069557176615636, "grad_norm": 0.09298605471849442, "learning_rate": 0.01, "loss": 2.3473, "step": 3024 }, { "epoch": 0.31100380150005136, "grad_norm": 0.06620427966117859, "learning_rate": 0.01, "loss": 2.3341, "step": 3027 }, { "epoch": 0.31131203123394635, "grad_norm": 0.21722812950611115, "learning_rate": 0.01, "loss": 2.3281, "step": 3030 }, { "epoch": 0.31162026096784134, "grad_norm": 0.1475544422864914, "learning_rate": 0.01, "loss": 2.3383, "step": 3033 }, { "epoch": 0.31192849070173634, "grad_norm": 0.13449987769126892, "learning_rate": 0.01, "loss": 2.314, "step": 3036 }, { "epoch": 0.31223672043563133, "grad_norm": 0.06219559907913208, "learning_rate": 0.01, "loss": 2.3526, "step": 3039 }, { "epoch": 0.3125449501695264, "grad_norm": 0.05337538942694664, "learning_rate": 0.01, "loss": 2.3386, "step": 3042 }, { "epoch": 0.3128531799034214, "grad_norm": 0.11457488685846329, "learning_rate": 0.01, "loss": 2.3261, "step": 3045 }, { "epoch": 0.31316140963731637, "grad_norm": 0.23809069395065308, "learning_rate": 0.01, "loss": 2.3411, "step": 3048 }, { "epoch": 0.31346963937121136, "grad_norm": 0.11100046336650848, "learning_rate": 0.01, "loss": 2.3269, "step": 3051 }, { "epoch": 0.31377786910510636, "grad_norm": 0.05229029804468155, "learning_rate": 0.01, "loss": 2.3339, "step": 3054 }, { "epoch": 0.31408609883900135, "grad_norm": 0.05956039950251579, "learning_rate": 0.01, "loss": 2.3566, "step": 3057 }, { "epoch": 0.31439432857289634, "grad_norm": 0.13084881007671356, "learning_rate": 0.01, "loss": 2.332, "step": 3060 }, { "epoch": 0.31470255830679134, "grad_norm": 0.13889305293560028, "learning_rate": 0.01, "loss": 2.3118, "step": 3063 }, { "epoch": 0.31501078804068633, "grad_norm": 0.10443049669265747, "learning_rate": 0.01, "loss": 2.3246, "step": 3066 }, { "epoch": 0.3153190177745813, "grad_norm": 0.04321267828345299, "learning_rate": 0.01, "loss": 2.3353, "step": 3069 }, { "epoch": 0.3156272475084763, "grad_norm": 0.046873319894075394, "learning_rate": 0.01, "loss": 2.3144, "step": 3072 }, { "epoch": 0.3159354772423713, "grad_norm": 0.06548158824443817, "learning_rate": 0.01, "loss": 2.3285, "step": 3075 }, { "epoch": 0.3162437069762663, "grad_norm": 0.19105824828147888, "learning_rate": 0.01, "loss": 2.349, "step": 3078 }, { "epoch": 0.3165519367101613, "grad_norm": 0.14477142691612244, "learning_rate": 0.01, "loss": 2.3074, "step": 3081 }, { "epoch": 0.3168601664440563, "grad_norm": 0.08536936342716217, "learning_rate": 0.01, "loss": 2.3462, "step": 3084 }, { "epoch": 0.3171683961779513, "grad_norm": 0.0595535933971405, "learning_rate": 0.01, "loss": 2.3522, "step": 3087 }, { "epoch": 0.3174766259118463, "grad_norm": 0.058548733592033386, "learning_rate": 0.01, "loss": 2.3314, "step": 3090 }, { "epoch": 0.31778485564574127, "grad_norm": 0.04651311784982681, "learning_rate": 0.01, "loss": 2.3058, "step": 3093 }, { "epoch": 0.31809308537963626, "grad_norm": 0.0516805462539196, "learning_rate": 0.01, "loss": 2.3243, "step": 3096 }, { "epoch": 0.31840131511353126, "grad_norm": 0.16851970553398132, "learning_rate": 0.01, "loss": 2.3315, "step": 3099 }, { "epoch": 0.3187095448474263, "grad_norm": 0.08350600302219391, "learning_rate": 0.01, "loss": 2.315, "step": 3102 }, { "epoch": 0.3190177745813213, "grad_norm": 0.08899964392185211, "learning_rate": 0.01, "loss": 2.3218, "step": 3105 }, { "epoch": 0.3193260043152163, "grad_norm": 0.2424800843000412, "learning_rate": 0.01, "loss": 2.3207, "step": 3108 }, { "epoch": 0.3196342340491113, "grad_norm": 0.213782399892807, "learning_rate": 0.01, "loss": 2.3728, "step": 3111 }, { "epoch": 0.3199424637830063, "grad_norm": 0.15629780292510986, "learning_rate": 0.01, "loss": 2.3453, "step": 3114 }, { "epoch": 0.3202506935169013, "grad_norm": 0.06920924782752991, "learning_rate": 0.01, "loss": 2.3111, "step": 3117 }, { "epoch": 0.32055892325079627, "grad_norm": 0.04514181613922119, "learning_rate": 0.01, "loss": 2.33, "step": 3120 }, { "epoch": 0.32086715298469126, "grad_norm": 0.05500979721546173, "learning_rate": 0.01, "loss": 2.3078, "step": 3123 }, { "epoch": 0.32117538271858626, "grad_norm": 0.09148071706295013, "learning_rate": 0.01, "loss": 2.3457, "step": 3126 }, { "epoch": 0.32148361245248125, "grad_norm": 0.10582035779953003, "learning_rate": 0.01, "loss": 2.3114, "step": 3129 }, { "epoch": 0.32179184218637624, "grad_norm": 0.1557345986366272, "learning_rate": 0.01, "loss": 2.3334, "step": 3132 }, { "epoch": 0.32210007192027124, "grad_norm": 0.11304829269647598, "learning_rate": 0.01, "loss": 2.2797, "step": 3135 }, { "epoch": 0.32240830165416623, "grad_norm": 0.08236223459243774, "learning_rate": 0.01, "loss": 2.3357, "step": 3138 }, { "epoch": 0.3227165313880612, "grad_norm": 0.09718946367502213, "learning_rate": 0.01, "loss": 2.3096, "step": 3141 }, { "epoch": 0.3230247611219562, "grad_norm": 0.07455772161483765, "learning_rate": 0.01, "loss": 2.3127, "step": 3144 }, { "epoch": 0.3233329908558512, "grad_norm": 0.0556890033185482, "learning_rate": 0.01, "loss": 2.3088, "step": 3147 }, { "epoch": 0.3236412205897462, "grad_norm": 0.07595494389533997, "learning_rate": 0.01, "loss": 2.3159, "step": 3150 }, { "epoch": 0.3239494503236412, "grad_norm": 0.07064896821975708, "learning_rate": 0.01, "loss": 2.3336, "step": 3153 }, { "epoch": 0.3242576800575362, "grad_norm": 0.06646276265382767, "learning_rate": 0.01, "loss": 2.324, "step": 3156 }, { "epoch": 0.32456590979143124, "grad_norm": 0.08837945014238358, "learning_rate": 0.01, "loss": 2.3191, "step": 3159 }, { "epoch": 0.32487413952532623, "grad_norm": 0.13228796422481537, "learning_rate": 0.01, "loss": 2.3231, "step": 3162 }, { "epoch": 0.3251823692592212, "grad_norm": 0.1080455407500267, "learning_rate": 0.01, "loss": 2.3341, "step": 3165 }, { "epoch": 0.3254905989931162, "grad_norm": 0.1073957234621048, "learning_rate": 0.01, "loss": 2.3237, "step": 3168 }, { "epoch": 0.3257988287270112, "grad_norm": 0.12472347915172577, "learning_rate": 0.01, "loss": 2.3315, "step": 3171 }, { "epoch": 0.3261070584609062, "grad_norm": 0.09123571217060089, "learning_rate": 0.01, "loss": 2.3588, "step": 3174 }, { "epoch": 0.3264152881948012, "grad_norm": 0.07830306142568588, "learning_rate": 0.01, "loss": 2.3273, "step": 3177 }, { "epoch": 0.3267235179286962, "grad_norm": 0.11552650481462479, "learning_rate": 0.01, "loss": 2.3407, "step": 3180 }, { "epoch": 0.3270317476625912, "grad_norm": 0.13251489400863647, "learning_rate": 0.01, "loss": 2.3241, "step": 3183 }, { "epoch": 0.3273399773964862, "grad_norm": 0.12775808572769165, "learning_rate": 0.01, "loss": 2.331, "step": 3186 }, { "epoch": 0.3276482071303812, "grad_norm": 0.12069859355688095, "learning_rate": 0.01, "loss": 2.3486, "step": 3189 }, { "epoch": 0.32795643686427617, "grad_norm": 0.059109434485435486, "learning_rate": 0.01, "loss": 2.2969, "step": 3192 }, { "epoch": 0.32826466659817116, "grad_norm": 0.12731850147247314, "learning_rate": 0.01, "loss": 2.3269, "step": 3195 }, { "epoch": 0.32857289633206616, "grad_norm": 0.15247757732868195, "learning_rate": 0.01, "loss": 2.3312, "step": 3198 }, { "epoch": 0.32888112606596115, "grad_norm": 0.128463476896286, "learning_rate": 0.01, "loss": 2.3275, "step": 3201 }, { "epoch": 0.32918935579985614, "grad_norm": 0.09406638145446777, "learning_rate": 0.01, "loss": 2.3205, "step": 3204 }, { "epoch": 0.32949758553375114, "grad_norm": 0.10524141043424606, "learning_rate": 0.01, "loss": 2.3423, "step": 3207 }, { "epoch": 0.32980581526764613, "grad_norm": 0.11357913911342621, "learning_rate": 0.01, "loss": 2.3071, "step": 3210 }, { "epoch": 0.3301140450015411, "grad_norm": 0.06979521363973618, "learning_rate": 0.01, "loss": 2.3319, "step": 3213 }, { "epoch": 0.33042227473543617, "grad_norm": 0.07000034302473068, "learning_rate": 0.01, "loss": 2.3523, "step": 3216 }, { "epoch": 0.33073050446933117, "grad_norm": 0.07495003193616867, "learning_rate": 0.01, "loss": 2.305, "step": 3219 }, { "epoch": 0.33103873420322616, "grad_norm": 0.07131810486316681, "learning_rate": 0.01, "loss": 2.2896, "step": 3222 }, { "epoch": 0.33134696393712115, "grad_norm": 0.051389019936323166, "learning_rate": 0.01, "loss": 2.2974, "step": 3225 }, { "epoch": 0.33165519367101615, "grad_norm": 0.05159701779484749, "learning_rate": 0.01, "loss": 2.3344, "step": 3228 }, { "epoch": 0.33196342340491114, "grad_norm": 0.07632975280284882, "learning_rate": 0.01, "loss": 2.3091, "step": 3231 }, { "epoch": 0.33227165313880613, "grad_norm": 0.08053800463676453, "learning_rate": 0.01, "loss": 2.298, "step": 3234 }, { "epoch": 0.3325798828727011, "grad_norm": 0.1371622234582901, "learning_rate": 0.01, "loss": 2.3095, "step": 3237 }, { "epoch": 0.3328881126065961, "grad_norm": 0.11367069184780121, "learning_rate": 0.01, "loss": 2.3212, "step": 3240 }, { "epoch": 0.3331963423404911, "grad_norm": 0.13252900540828705, "learning_rate": 0.01, "loss": 2.3238, "step": 3243 }, { "epoch": 0.3335045720743861, "grad_norm": 0.15517258644104004, "learning_rate": 0.01, "loss": 2.3263, "step": 3246 }, { "epoch": 0.3338128018082811, "grad_norm": 0.14029370248317719, "learning_rate": 0.01, "loss": 2.3457, "step": 3249 }, { "epoch": 0.3341210315421761, "grad_norm": 0.105759397149086, "learning_rate": 0.01, "loss": 2.3008, "step": 3252 }, { "epoch": 0.3344292612760711, "grad_norm": 0.04762979596853256, "learning_rate": 0.01, "loss": 2.3306, "step": 3255 }, { "epoch": 0.3347374910099661, "grad_norm": 0.12065446376800537, "learning_rate": 0.01, "loss": 2.2904, "step": 3258 }, { "epoch": 0.3350457207438611, "grad_norm": 0.08886688947677612, "learning_rate": 0.01, "loss": 2.3243, "step": 3261 }, { "epoch": 0.33535395047775607, "grad_norm": 0.08021339774131775, "learning_rate": 0.01, "loss": 2.3313, "step": 3264 }, { "epoch": 0.33566218021165106, "grad_norm": 0.04490290582180023, "learning_rate": 0.01, "loss": 2.2888, "step": 3267 }, { "epoch": 0.33597040994554606, "grad_norm": 0.061480812728405, "learning_rate": 0.01, "loss": 2.2898, "step": 3270 }, { "epoch": 0.33627863967944105, "grad_norm": 0.04230419546365738, "learning_rate": 0.01, "loss": 2.3062, "step": 3273 }, { "epoch": 0.3365868694133361, "grad_norm": 0.12344948202371597, "learning_rate": 0.01, "loss": 2.3105, "step": 3276 }, { "epoch": 0.3368950991472311, "grad_norm": 0.13087160885334015, "learning_rate": 0.01, "loss": 2.3388, "step": 3279 }, { "epoch": 0.3372033288811261, "grad_norm": 0.06671308726072311, "learning_rate": 0.01, "loss": 2.3062, "step": 3282 }, { "epoch": 0.3375115586150211, "grad_norm": 0.055828843265771866, "learning_rate": 0.01, "loss": 2.3227, "step": 3285 }, { "epoch": 0.3378197883489161, "grad_norm": 0.07760481536388397, "learning_rate": 0.01, "loss": 2.307, "step": 3288 }, { "epoch": 0.33812801808281107, "grad_norm": 0.08074722439050674, "learning_rate": 0.01, "loss": 2.3363, "step": 3291 }, { "epoch": 0.33843624781670606, "grad_norm": 0.046514566987752914, "learning_rate": 0.01, "loss": 2.3152, "step": 3294 }, { "epoch": 0.33874447755060105, "grad_norm": 0.15358585119247437, "learning_rate": 0.01, "loss": 2.3114, "step": 3297 }, { "epoch": 0.33905270728449605, "grad_norm": 0.09048300981521606, "learning_rate": 0.01, "loss": 2.3218, "step": 3300 }, { "epoch": 0.33936093701839104, "grad_norm": 0.08199465274810791, "learning_rate": 0.01, "loss": 2.3133, "step": 3303 }, { "epoch": 0.33966916675228603, "grad_norm": 0.13738159835338593, "learning_rate": 0.01, "loss": 2.3108, "step": 3306 }, { "epoch": 0.339977396486181, "grad_norm": 0.11493804305791855, "learning_rate": 0.01, "loss": 2.2996, "step": 3309 }, { "epoch": 0.340285626220076, "grad_norm": 0.06872740387916565, "learning_rate": 0.01, "loss": 2.306, "step": 3312 }, { "epoch": 0.340593855953971, "grad_norm": 0.055139992386102676, "learning_rate": 0.01, "loss": 2.3129, "step": 3315 }, { "epoch": 0.340902085687866, "grad_norm": 0.16477546095848083, "learning_rate": 0.01, "loss": 2.3138, "step": 3318 }, { "epoch": 0.341210315421761, "grad_norm": 0.06387230008840561, "learning_rate": 0.01, "loss": 2.3025, "step": 3321 }, { "epoch": 0.341518545155656, "grad_norm": 0.1657593995332718, "learning_rate": 0.01, "loss": 2.3255, "step": 3324 }, { "epoch": 0.341826774889551, "grad_norm": 0.08980764448642731, "learning_rate": 0.01, "loss": 2.3024, "step": 3327 }, { "epoch": 0.342135004623446, "grad_norm": 0.05479981005191803, "learning_rate": 0.01, "loss": 2.2955, "step": 3330 }, { "epoch": 0.34244323435734103, "grad_norm": 0.05986113101243973, "learning_rate": 0.01, "loss": 2.3078, "step": 3333 }, { "epoch": 0.342751464091236, "grad_norm": 0.1339874267578125, "learning_rate": 0.01, "loss": 2.2974, "step": 3336 }, { "epoch": 0.343059693825131, "grad_norm": 0.11250229179859161, "learning_rate": 0.01, "loss": 2.3162, "step": 3339 }, { "epoch": 0.343367923559026, "grad_norm": 0.12179972976446152, "learning_rate": 0.01, "loss": 2.2746, "step": 3342 }, { "epoch": 0.343676153292921, "grad_norm": 0.10306143760681152, "learning_rate": 0.01, "loss": 2.3008, "step": 3345 }, { "epoch": 0.343984383026816, "grad_norm": 0.08372616767883301, "learning_rate": 0.01, "loss": 2.2962, "step": 3348 }, { "epoch": 0.344292612760711, "grad_norm": 0.05286876857280731, "learning_rate": 0.01, "loss": 2.3067, "step": 3351 }, { "epoch": 0.344600842494606, "grad_norm": 0.06248036026954651, "learning_rate": 0.01, "loss": 2.3432, "step": 3354 }, { "epoch": 0.344909072228501, "grad_norm": 0.1287723332643509, "learning_rate": 0.01, "loss": 2.3064, "step": 3357 }, { "epoch": 0.345217301962396, "grad_norm": 0.08843682706356049, "learning_rate": 0.01, "loss": 2.3059, "step": 3360 }, { "epoch": 0.34552553169629097, "grad_norm": 0.07060680538415909, "learning_rate": 0.01, "loss": 2.2627, "step": 3363 }, { "epoch": 0.34583376143018596, "grad_norm": 0.10443838685750961, "learning_rate": 0.01, "loss": 2.2876, "step": 3366 }, { "epoch": 0.34614199116408095, "grad_norm": 0.06748315691947937, "learning_rate": 0.01, "loss": 2.3182, "step": 3369 }, { "epoch": 0.34645022089797595, "grad_norm": 0.06599223613739014, "learning_rate": 0.01, "loss": 2.2997, "step": 3372 }, { "epoch": 0.34675845063187094, "grad_norm": 0.08530016988515854, "learning_rate": 0.01, "loss": 2.2959, "step": 3375 }, { "epoch": 0.34706668036576593, "grad_norm": 0.10694181174039841, "learning_rate": 0.01, "loss": 2.3248, "step": 3378 }, { "epoch": 0.3473749100996609, "grad_norm": 0.06598237156867981, "learning_rate": 0.01, "loss": 2.2837, "step": 3381 }, { "epoch": 0.3476831398335559, "grad_norm": 0.0782204419374466, "learning_rate": 0.01, "loss": 2.2926, "step": 3384 }, { "epoch": 0.3479913695674509, "grad_norm": 0.09585436433553696, "learning_rate": 0.01, "loss": 2.2984, "step": 3387 }, { "epoch": 0.34829959930134596, "grad_norm": 0.061477720737457275, "learning_rate": 0.01, "loss": 2.2693, "step": 3390 }, { "epoch": 0.34860782903524096, "grad_norm": 0.104725681245327, "learning_rate": 0.01, "loss": 2.2887, "step": 3393 }, { "epoch": 0.34891605876913595, "grad_norm": 0.12205322831869125, "learning_rate": 0.01, "loss": 2.3052, "step": 3396 }, { "epoch": 0.34922428850303094, "grad_norm": 0.16279913485050201, "learning_rate": 0.01, "loss": 2.2771, "step": 3399 }, { "epoch": 0.34953251823692594, "grad_norm": 0.059565551578998566, "learning_rate": 0.01, "loss": 2.3027, "step": 3402 }, { "epoch": 0.34984074797082093, "grad_norm": 0.06318376958370209, "learning_rate": 0.01, "loss": 2.3131, "step": 3405 }, { "epoch": 0.3501489777047159, "grad_norm": 0.05476443096995354, "learning_rate": 0.01, "loss": 2.2953, "step": 3408 }, { "epoch": 0.3504572074386109, "grad_norm": 0.07989142090082169, "learning_rate": 0.01, "loss": 2.31, "step": 3411 }, { "epoch": 0.3507654371725059, "grad_norm": 0.15566086769104004, "learning_rate": 0.01, "loss": 2.2839, "step": 3414 }, { "epoch": 0.3510736669064009, "grad_norm": 0.060441337525844574, "learning_rate": 0.01, "loss": 2.2952, "step": 3417 }, { "epoch": 0.3513818966402959, "grad_norm": 0.06277213245630264, "learning_rate": 0.01, "loss": 2.32, "step": 3420 }, { "epoch": 0.3516901263741909, "grad_norm": 0.04959907755255699, "learning_rate": 0.01, "loss": 2.3116, "step": 3423 }, { "epoch": 0.3519983561080859, "grad_norm": 0.06766139715909958, "learning_rate": 0.01, "loss": 2.3201, "step": 3426 }, { "epoch": 0.3523065858419809, "grad_norm": 0.053323931992053986, "learning_rate": 0.01, "loss": 2.3271, "step": 3429 }, { "epoch": 0.3526148155758759, "grad_norm": 0.06396596878767014, "learning_rate": 0.01, "loss": 2.2929, "step": 3432 }, { "epoch": 0.35292304530977087, "grad_norm": 0.07360636442899704, "learning_rate": 0.01, "loss": 2.2918, "step": 3435 }, { "epoch": 0.35323127504366586, "grad_norm": 0.10262563079595566, "learning_rate": 0.01, "loss": 2.2871, "step": 3438 }, { "epoch": 0.35353950477756085, "grad_norm": 0.09783780574798584, "learning_rate": 0.01, "loss": 2.3229, "step": 3441 }, { "epoch": 0.35384773451145585, "grad_norm": 0.08542583137750626, "learning_rate": 0.01, "loss": 2.2887, "step": 3444 }, { "epoch": 0.35415596424535084, "grad_norm": 0.11864805966615677, "learning_rate": 0.01, "loss": 2.2848, "step": 3447 }, { "epoch": 0.3544641939792459, "grad_norm": 0.10997387021780014, "learning_rate": 0.01, "loss": 2.2897, "step": 3450 }, { "epoch": 0.3547724237131409, "grad_norm": 0.10915081202983856, "learning_rate": 0.01, "loss": 2.3114, "step": 3453 }, { "epoch": 0.3550806534470359, "grad_norm": 0.15109725296497345, "learning_rate": 0.01, "loss": 2.2933, "step": 3456 }, { "epoch": 0.35538888318093087, "grad_norm": 0.04911811649799347, "learning_rate": 0.01, "loss": 2.3035, "step": 3459 }, { "epoch": 0.35569711291482586, "grad_norm": 0.12352598458528519, "learning_rate": 0.01, "loss": 2.2897, "step": 3462 }, { "epoch": 0.35600534264872086, "grad_norm": 0.10834213346242905, "learning_rate": 0.01, "loss": 2.2879, "step": 3465 }, { "epoch": 0.35631357238261585, "grad_norm": 0.10665787756443024, "learning_rate": 0.01, "loss": 2.2614, "step": 3468 }, { "epoch": 0.35662180211651084, "grad_norm": 0.0898185670375824, "learning_rate": 0.01, "loss": 2.2943, "step": 3471 }, { "epoch": 0.35693003185040584, "grad_norm": 0.07015782594680786, "learning_rate": 0.01, "loss": 2.298, "step": 3474 }, { "epoch": 0.35723826158430083, "grad_norm": 0.1292288452386856, "learning_rate": 0.01, "loss": 2.3122, "step": 3477 }, { "epoch": 0.3575464913181958, "grad_norm": 0.09300121665000916, "learning_rate": 0.01, "loss": 2.2769, "step": 3480 }, { "epoch": 0.3578547210520908, "grad_norm": 0.0449809767305851, "learning_rate": 0.01, "loss": 2.2564, "step": 3483 }, { "epoch": 0.3581629507859858, "grad_norm": 0.051362160593271255, "learning_rate": 0.01, "loss": 2.2739, "step": 3486 }, { "epoch": 0.3584711805198808, "grad_norm": 0.12473469972610474, "learning_rate": 0.01, "loss": 2.2844, "step": 3489 }, { "epoch": 0.3587794102537758, "grad_norm": 0.0925057902932167, "learning_rate": 0.01, "loss": 2.2618, "step": 3492 }, { "epoch": 0.3590876399876708, "grad_norm": 0.1026608943939209, "learning_rate": 0.01, "loss": 2.2814, "step": 3495 }, { "epoch": 0.3593958697215658, "grad_norm": 0.0995681881904602, "learning_rate": 0.01, "loss": 2.2861, "step": 3498 }, { "epoch": 0.3597040994554608, "grad_norm": 0.06513385474681854, "learning_rate": 0.01, "loss": 2.2827, "step": 3501 }, { "epoch": 0.3600123291893558, "grad_norm": 0.06724824756383896, "learning_rate": 0.01, "loss": 2.2799, "step": 3504 }, { "epoch": 0.3603205589232508, "grad_norm": 0.06367610394954681, "learning_rate": 0.01, "loss": 2.2846, "step": 3507 }, { "epoch": 0.3606287886571458, "grad_norm": 0.07489916682243347, "learning_rate": 0.01, "loss": 2.2816, "step": 3510 }, { "epoch": 0.3609370183910408, "grad_norm": 0.11221667379140854, "learning_rate": 0.01, "loss": 2.2869, "step": 3513 }, { "epoch": 0.3612452481249358, "grad_norm": 0.09854032099246979, "learning_rate": 0.01, "loss": 2.2646, "step": 3516 }, { "epoch": 0.3615534778588308, "grad_norm": 0.09218656271696091, "learning_rate": 0.01, "loss": 2.2844, "step": 3519 }, { "epoch": 0.3618617075927258, "grad_norm": 0.1531379073858261, "learning_rate": 0.01, "loss": 2.279, "step": 3522 }, { "epoch": 0.3621699373266208, "grad_norm": 0.07070820778608322, "learning_rate": 0.01, "loss": 2.2747, "step": 3525 }, { "epoch": 0.3624781670605158, "grad_norm": 0.1057102233171463, "learning_rate": 0.01, "loss": 2.275, "step": 3528 }, { "epoch": 0.36278639679441077, "grad_norm": 0.049471016973257065, "learning_rate": 0.01, "loss": 2.3013, "step": 3531 }, { "epoch": 0.36309462652830576, "grad_norm": 0.08196526020765305, "learning_rate": 0.01, "loss": 2.2571, "step": 3534 }, { "epoch": 0.36340285626220076, "grad_norm": 0.09507983922958374, "learning_rate": 0.01, "loss": 2.3196, "step": 3537 }, { "epoch": 0.36371108599609575, "grad_norm": 0.089228555560112, "learning_rate": 0.01, "loss": 2.2539, "step": 3540 }, { "epoch": 0.36401931572999074, "grad_norm": 0.0866270586848259, "learning_rate": 0.01, "loss": 2.284, "step": 3543 }, { "epoch": 0.36432754546388574, "grad_norm": 0.13805072009563446, "learning_rate": 0.01, "loss": 2.2723, "step": 3546 }, { "epoch": 0.36463577519778073, "grad_norm": 0.09308724105358124, "learning_rate": 0.01, "loss": 2.2969, "step": 3549 }, { "epoch": 0.3649440049316757, "grad_norm": 0.07004178315401077, "learning_rate": 0.01, "loss": 2.2959, "step": 3552 }, { "epoch": 0.3652522346655707, "grad_norm": 0.09345975518226624, "learning_rate": 0.01, "loss": 2.2656, "step": 3555 }, { "epoch": 0.3655604643994657, "grad_norm": 0.07694482058286667, "learning_rate": 0.01, "loss": 2.2921, "step": 3558 }, { "epoch": 0.3658686941333607, "grad_norm": 0.05591150000691414, "learning_rate": 0.01, "loss": 2.2869, "step": 3561 }, { "epoch": 0.36617692386725575, "grad_norm": 0.06863993406295776, "learning_rate": 0.01, "loss": 2.2897, "step": 3564 }, { "epoch": 0.36648515360115075, "grad_norm": 0.06258527934551239, "learning_rate": 0.01, "loss": 2.2994, "step": 3567 }, { "epoch": 0.36679338333504574, "grad_norm": 0.1049329936504364, "learning_rate": 0.01, "loss": 2.2794, "step": 3570 }, { "epoch": 0.36710161306894074, "grad_norm": 0.1229025200009346, "learning_rate": 0.01, "loss": 2.2949, "step": 3573 }, { "epoch": 0.36740984280283573, "grad_norm": 0.13274389505386353, "learning_rate": 0.01, "loss": 2.2791, "step": 3576 }, { "epoch": 0.3677180725367307, "grad_norm": 0.09388844668865204, "learning_rate": 0.01, "loss": 2.3067, "step": 3579 }, { "epoch": 0.3680263022706257, "grad_norm": 0.05375714227557182, "learning_rate": 0.01, "loss": 2.2946, "step": 3582 }, { "epoch": 0.3683345320045207, "grad_norm": 0.059105634689331055, "learning_rate": 0.01, "loss": 2.2821, "step": 3585 }, { "epoch": 0.3686427617384157, "grad_norm": 0.055578552186489105, "learning_rate": 0.01, "loss": 2.2694, "step": 3588 }, { "epoch": 0.3689509914723107, "grad_norm": 0.08778764307498932, "learning_rate": 0.01, "loss": 2.2712, "step": 3591 }, { "epoch": 0.3692592212062057, "grad_norm": 0.1044803187251091, "learning_rate": 0.01, "loss": 2.2797, "step": 3594 }, { "epoch": 0.3695674509401007, "grad_norm": 0.15398399531841278, "learning_rate": 0.01, "loss": 2.3042, "step": 3597 }, { "epoch": 0.3698756806739957, "grad_norm": 0.11562564969062805, "learning_rate": 0.01, "loss": 2.2609, "step": 3600 }, { "epoch": 0.37018391040789067, "grad_norm": 0.060630831867456436, "learning_rate": 0.01, "loss": 2.2663, "step": 3603 }, { "epoch": 0.37049214014178566, "grad_norm": 0.0576477013528347, "learning_rate": 0.01, "loss": 2.2974, "step": 3606 }, { "epoch": 0.37080036987568066, "grad_norm": 0.059915438294410706, "learning_rate": 0.01, "loss": 2.3031, "step": 3609 }, { "epoch": 0.37110859960957565, "grad_norm": 0.10807155817747116, "learning_rate": 0.01, "loss": 2.2739, "step": 3612 }, { "epoch": 0.37141682934347064, "grad_norm": 0.09196165949106216, "learning_rate": 0.01, "loss": 2.3054, "step": 3615 }, { "epoch": 0.37172505907736564, "grad_norm": 0.07379795610904694, "learning_rate": 0.01, "loss": 2.2805, "step": 3618 }, { "epoch": 0.3720332888112607, "grad_norm": 0.06034912168979645, "learning_rate": 0.01, "loss": 2.2549, "step": 3621 }, { "epoch": 0.3723415185451557, "grad_norm": 0.13983361423015594, "learning_rate": 0.01, "loss": 2.269, "step": 3624 }, { "epoch": 0.3726497482790507, "grad_norm": 0.11592069268226624, "learning_rate": 0.01, "loss": 2.2903, "step": 3627 }, { "epoch": 0.37295797801294567, "grad_norm": 0.15428505837917328, "learning_rate": 0.01, "loss": 2.2918, "step": 3630 }, { "epoch": 0.37326620774684066, "grad_norm": 0.19936774671077728, "learning_rate": 0.01, "loss": 2.2782, "step": 3633 }, { "epoch": 0.37357443748073565, "grad_norm": 0.15364627540111542, "learning_rate": 0.01, "loss": 2.2736, "step": 3636 }, { "epoch": 0.37388266721463065, "grad_norm": 0.047554273158311844, "learning_rate": 0.01, "loss": 2.3172, "step": 3639 }, { "epoch": 0.37419089694852564, "grad_norm": 0.0555570051074028, "learning_rate": 0.01, "loss": 2.2731, "step": 3642 }, { "epoch": 0.37449912668242064, "grad_norm": 0.052204012870788574, "learning_rate": 0.01, "loss": 2.281, "step": 3645 }, { "epoch": 0.37480735641631563, "grad_norm": 0.09206510335206985, "learning_rate": 0.01, "loss": 2.2639, "step": 3648 }, { "epoch": 0.3751155861502106, "grad_norm": 0.1199311912059784, "learning_rate": 0.01, "loss": 2.2873, "step": 3651 }, { "epoch": 0.3754238158841056, "grad_norm": 0.08949270099401474, "learning_rate": 0.01, "loss": 2.2668, "step": 3654 }, { "epoch": 0.3757320456180006, "grad_norm": 0.08521883934736252, "learning_rate": 0.01, "loss": 2.247, "step": 3657 }, { "epoch": 0.3760402753518956, "grad_norm": 0.07689694315195084, "learning_rate": 0.01, "loss": 2.2904, "step": 3660 }, { "epoch": 0.3763485050857906, "grad_norm": 0.08761987835168839, "learning_rate": 0.01, "loss": 2.2761, "step": 3663 }, { "epoch": 0.3766567348196856, "grad_norm": 0.056420013308525085, "learning_rate": 0.01, "loss": 2.259, "step": 3666 }, { "epoch": 0.3769649645535806, "grad_norm": 0.06192856654524803, "learning_rate": 0.01, "loss": 2.2294, "step": 3669 }, { "epoch": 0.3772731942874756, "grad_norm": 0.1021333709359169, "learning_rate": 0.01, "loss": 2.2649, "step": 3672 }, { "epoch": 0.37758142402137057, "grad_norm": 0.10071670264005661, "learning_rate": 0.01, "loss": 2.2584, "step": 3675 }, { "epoch": 0.37788965375526556, "grad_norm": 0.05968625843524933, "learning_rate": 0.01, "loss": 2.2699, "step": 3678 }, { "epoch": 0.3781978834891606, "grad_norm": 0.07489661872386932, "learning_rate": 0.01, "loss": 2.2663, "step": 3681 }, { "epoch": 0.3785061132230556, "grad_norm": 0.07880943268537521, "learning_rate": 0.01, "loss": 2.2709, "step": 3684 }, { "epoch": 0.3788143429569506, "grad_norm": 0.055632054805755615, "learning_rate": 0.01, "loss": 2.272, "step": 3687 }, { "epoch": 0.3791225726908456, "grad_norm": 0.05365302786231041, "learning_rate": 0.01, "loss": 2.2268, "step": 3690 }, { "epoch": 0.3794308024247406, "grad_norm": 0.0802481397986412, "learning_rate": 0.01, "loss": 2.2631, "step": 3693 }, { "epoch": 0.3797390321586356, "grad_norm": 0.1312764585018158, "learning_rate": 0.01, "loss": 2.2985, "step": 3696 }, { "epoch": 0.3800472618925306, "grad_norm": 0.14543971419334412, "learning_rate": 0.01, "loss": 2.25, "step": 3699 }, { "epoch": 0.38035549162642557, "grad_norm": 0.05727002024650574, "learning_rate": 0.01, "loss": 2.2556, "step": 3702 }, { "epoch": 0.38066372136032056, "grad_norm": 0.07309607416391373, "learning_rate": 0.01, "loss": 2.2574, "step": 3705 }, { "epoch": 0.38097195109421556, "grad_norm": 0.03849095106124878, "learning_rate": 0.01, "loss": 2.2501, "step": 3708 }, { "epoch": 0.38128018082811055, "grad_norm": 0.0623021237552166, "learning_rate": 0.01, "loss": 2.2672, "step": 3711 }, { "epoch": 0.38158841056200554, "grad_norm": 0.08916610479354858, "learning_rate": 0.01, "loss": 2.2683, "step": 3714 }, { "epoch": 0.38189664029590054, "grad_norm": 0.08126388490200043, "learning_rate": 0.01, "loss": 2.2574, "step": 3717 }, { "epoch": 0.38220487002979553, "grad_norm": 0.07121114432811737, "learning_rate": 0.01, "loss": 2.2358, "step": 3720 }, { "epoch": 0.3825130997636905, "grad_norm": 0.07406505942344666, "learning_rate": 0.01, "loss": 2.2736, "step": 3723 }, { "epoch": 0.3828213294975855, "grad_norm": 0.13355331122875214, "learning_rate": 0.01, "loss": 2.2685, "step": 3726 }, { "epoch": 0.3831295592314805, "grad_norm": 0.05672430619597435, "learning_rate": 0.01, "loss": 2.2913, "step": 3729 }, { "epoch": 0.3834377889653755, "grad_norm": 0.047647468745708466, "learning_rate": 0.01, "loss": 2.2533, "step": 3732 }, { "epoch": 0.3837460186992705, "grad_norm": 0.059008341282606125, "learning_rate": 0.01, "loss": 2.2867, "step": 3735 }, { "epoch": 0.38405424843316555, "grad_norm": 0.06551840156316757, "learning_rate": 0.01, "loss": 2.2742, "step": 3738 }, { "epoch": 0.38436247816706054, "grad_norm": 0.08781883865594864, "learning_rate": 0.01, "loss": 2.2427, "step": 3741 }, { "epoch": 0.38467070790095553, "grad_norm": 0.06808102875947952, "learning_rate": 0.01, "loss": 2.2493, "step": 3744 }, { "epoch": 0.3849789376348505, "grad_norm": 0.06570697575807571, "learning_rate": 0.01, "loss": 2.2445, "step": 3747 }, { "epoch": 0.3852871673687455, "grad_norm": 0.08742080628871918, "learning_rate": 0.01, "loss": 2.2576, "step": 3750 }, { "epoch": 0.3855953971026405, "grad_norm": 0.1518019735813141, "learning_rate": 0.01, "loss": 2.2819, "step": 3753 }, { "epoch": 0.3859036268365355, "grad_norm": 0.10349754244089127, "learning_rate": 0.01, "loss": 2.2465, "step": 3756 }, { "epoch": 0.3862118565704305, "grad_norm": 0.06008581072092056, "learning_rate": 0.01, "loss": 2.2817, "step": 3759 }, { "epoch": 0.3865200863043255, "grad_norm": 0.0450257770717144, "learning_rate": 0.01, "loss": 2.2585, "step": 3762 }, { "epoch": 0.3868283160382205, "grad_norm": 0.04145176708698273, "learning_rate": 0.01, "loss": 2.2634, "step": 3765 }, { "epoch": 0.3871365457721155, "grad_norm": 0.17084141075611115, "learning_rate": 0.01, "loss": 2.2355, "step": 3768 }, { "epoch": 0.3874447755060105, "grad_norm": 0.06679602712392807, "learning_rate": 0.01, "loss": 2.2737, "step": 3771 }, { "epoch": 0.38775300523990547, "grad_norm": 0.05363382026553154, "learning_rate": 0.01, "loss": 2.244, "step": 3774 }, { "epoch": 0.38806123497380046, "grad_norm": 0.05722133815288544, "learning_rate": 0.01, "loss": 2.2515, "step": 3777 }, { "epoch": 0.38836946470769546, "grad_norm": 0.06288215517997742, "learning_rate": 0.01, "loss": 2.2625, "step": 3780 }, { "epoch": 0.38867769444159045, "grad_norm": 0.05087801814079285, "learning_rate": 0.01, "loss": 2.2883, "step": 3783 }, { "epoch": 0.38898592417548544, "grad_norm": 0.08160998672246933, "learning_rate": 0.01, "loss": 2.2462, "step": 3786 }, { "epoch": 0.38929415390938044, "grad_norm": 0.22291240096092224, "learning_rate": 0.01, "loss": 2.2613, "step": 3789 }, { "epoch": 0.38960238364327543, "grad_norm": 0.11482773721218109, "learning_rate": 0.01, "loss": 2.2633, "step": 3792 }, { "epoch": 0.3899106133771705, "grad_norm": 0.056299589574337006, "learning_rate": 0.01, "loss": 2.2896, "step": 3795 }, { "epoch": 0.39021884311106547, "grad_norm": 0.04524017125368118, "learning_rate": 0.01, "loss": 2.2543, "step": 3798 }, { "epoch": 0.39052707284496047, "grad_norm": 0.0903107225894928, "learning_rate": 0.01, "loss": 2.2801, "step": 3801 }, { "epoch": 0.39083530257885546, "grad_norm": 0.0645504966378212, "learning_rate": 0.01, "loss": 2.2628, "step": 3804 }, { "epoch": 0.39114353231275045, "grad_norm": 0.06752094626426697, "learning_rate": 0.01, "loss": 2.2732, "step": 3807 }, { "epoch": 0.39145176204664545, "grad_norm": 0.04459339380264282, "learning_rate": 0.01, "loss": 2.2601, "step": 3810 }, { "epoch": 0.39175999178054044, "grad_norm": 0.07300913333892822, "learning_rate": 0.01, "loss": 2.2437, "step": 3813 }, { "epoch": 0.39206822151443543, "grad_norm": 0.16804097592830658, "learning_rate": 0.01, "loss": 2.26, "step": 3816 }, { "epoch": 0.3923764512483304, "grad_norm": 0.10682248324155807, "learning_rate": 0.01, "loss": 2.2764, "step": 3819 }, { "epoch": 0.3926846809822254, "grad_norm": 0.046895258128643036, "learning_rate": 0.01, "loss": 2.2654, "step": 3822 }, { "epoch": 0.3929929107161204, "grad_norm": 0.05799179524183273, "learning_rate": 0.01, "loss": 2.2254, "step": 3825 }, { "epoch": 0.3933011404500154, "grad_norm": 0.0474528968334198, "learning_rate": 0.01, "loss": 2.2604, "step": 3828 }, { "epoch": 0.3936093701839104, "grad_norm": 0.1437537968158722, "learning_rate": 0.01, "loss": 2.2532, "step": 3831 }, { "epoch": 0.3939175999178054, "grad_norm": 0.06202014535665512, "learning_rate": 0.01, "loss": 2.2486, "step": 3834 }, { "epoch": 0.3942258296517004, "grad_norm": 0.09379147738218307, "learning_rate": 0.01, "loss": 2.2602, "step": 3837 }, { "epoch": 0.3945340593855954, "grad_norm": 0.07898830622434616, "learning_rate": 0.01, "loss": 2.2605, "step": 3840 }, { "epoch": 0.3948422891194904, "grad_norm": 0.10186600685119629, "learning_rate": 0.01, "loss": 2.2807, "step": 3843 }, { "epoch": 0.39515051885338537, "grad_norm": 0.08611535280942917, "learning_rate": 0.01, "loss": 2.2571, "step": 3846 }, { "epoch": 0.39545874858728036, "grad_norm": 0.10435480624437332, "learning_rate": 0.01, "loss": 2.2721, "step": 3849 }, { "epoch": 0.39576697832117536, "grad_norm": 0.11543019860982895, "learning_rate": 0.01, "loss": 2.2598, "step": 3852 }, { "epoch": 0.3960752080550704, "grad_norm": 0.11996404081583023, "learning_rate": 0.01, "loss": 2.2536, "step": 3855 }, { "epoch": 0.3963834377889654, "grad_norm": 0.05615765228867531, "learning_rate": 0.01, "loss": 2.2637, "step": 3858 }, { "epoch": 0.3966916675228604, "grad_norm": 0.06568838655948639, "learning_rate": 0.01, "loss": 2.2756, "step": 3861 }, { "epoch": 0.3969998972567554, "grad_norm": 0.07747132331132889, "learning_rate": 0.01, "loss": 2.2816, "step": 3864 }, { "epoch": 0.3973081269906504, "grad_norm": 0.057373497635126114, "learning_rate": 0.01, "loss": 2.255, "step": 3867 }, { "epoch": 0.3976163567245454, "grad_norm": 0.11501277983188629, "learning_rate": 0.01, "loss": 2.2494, "step": 3870 }, { "epoch": 0.39792458645844037, "grad_norm": 0.07761958241462708, "learning_rate": 0.01, "loss": 2.2459, "step": 3873 }, { "epoch": 0.39823281619233536, "grad_norm": 0.06263428926467896, "learning_rate": 0.01, "loss": 2.2649, "step": 3876 }, { "epoch": 0.39854104592623035, "grad_norm": 0.04552373290061951, "learning_rate": 0.01, "loss": 2.2578, "step": 3879 }, { "epoch": 0.39884927566012535, "grad_norm": 0.0631655901670456, "learning_rate": 0.01, "loss": 2.2648, "step": 3882 }, { "epoch": 0.39915750539402034, "grad_norm": 0.06519417464733124, "learning_rate": 0.01, "loss": 2.2438, "step": 3885 }, { "epoch": 0.39946573512791533, "grad_norm": 0.10446424037218094, "learning_rate": 0.01, "loss": 2.2815, "step": 3888 }, { "epoch": 0.3997739648618103, "grad_norm": 0.07533372938632965, "learning_rate": 0.01, "loss": 2.272, "step": 3891 }, { "epoch": 0.4000821945957053, "grad_norm": 0.05748215690255165, "learning_rate": 0.01, "loss": 2.2971, "step": 3894 }, { "epoch": 0.4003904243296003, "grad_norm": 0.051343973726034164, "learning_rate": 0.01, "loss": 2.2316, "step": 3897 }, { "epoch": 0.4006986540634953, "grad_norm": 0.04799075797200203, "learning_rate": 0.01, "loss": 2.2333, "step": 3900 }, { "epoch": 0.4010068837973903, "grad_norm": 0.12885436415672302, "learning_rate": 0.01, "loss": 2.247, "step": 3903 }, { "epoch": 0.4013151135312853, "grad_norm": 0.07175249606370926, "learning_rate": 0.01, "loss": 2.2407, "step": 3906 }, { "epoch": 0.4016233432651803, "grad_norm": 0.10784266144037247, "learning_rate": 0.01, "loss": 2.2458, "step": 3909 }, { "epoch": 0.40193157299907534, "grad_norm": 0.08646712452173233, "learning_rate": 0.01, "loss": 2.2571, "step": 3912 }, { "epoch": 0.40223980273297033, "grad_norm": 0.05365338176488876, "learning_rate": 0.01, "loss": 2.2585, "step": 3915 }, { "epoch": 0.4025480324668653, "grad_norm": 0.07037780433893204, "learning_rate": 0.01, "loss": 2.2277, "step": 3918 }, { "epoch": 0.4028562622007603, "grad_norm": 0.040290024131536484, "learning_rate": 0.01, "loss": 2.2508, "step": 3921 }, { "epoch": 0.4031644919346553, "grad_norm": 0.050338853150606155, "learning_rate": 0.01, "loss": 2.2356, "step": 3924 }, { "epoch": 0.4034727216685503, "grad_norm": 0.1420246660709381, "learning_rate": 0.01, "loss": 2.2531, "step": 3927 }, { "epoch": 0.4037809514024453, "grad_norm": 0.07432923465967178, "learning_rate": 0.01, "loss": 2.2766, "step": 3930 }, { "epoch": 0.4040891811363403, "grad_norm": 0.04954257979989052, "learning_rate": 0.01, "loss": 2.2825, "step": 3933 }, { "epoch": 0.4043974108702353, "grad_norm": 0.05988876149058342, "learning_rate": 0.01, "loss": 2.2342, "step": 3936 }, { "epoch": 0.4047056406041303, "grad_norm": 0.09800540655851364, "learning_rate": 0.01, "loss": 2.2268, "step": 3939 }, { "epoch": 0.4050138703380253, "grad_norm": 0.09171874821186066, "learning_rate": 0.01, "loss": 2.2648, "step": 3942 }, { "epoch": 0.40532210007192027, "grad_norm": 0.07430606335401535, "learning_rate": 0.01, "loss": 2.2523, "step": 3945 }, { "epoch": 0.40563032980581526, "grad_norm": 0.043649185448884964, "learning_rate": 0.01, "loss": 2.2303, "step": 3948 }, { "epoch": 0.40593855953971025, "grad_norm": 0.04120480641722679, "learning_rate": 0.01, "loss": 2.2299, "step": 3951 }, { "epoch": 0.40624678927360525, "grad_norm": 0.0692945346236229, "learning_rate": 0.01, "loss": 2.2466, "step": 3954 }, { "epoch": 0.40655501900750024, "grad_norm": 0.08884318917989731, "learning_rate": 0.01, "loss": 2.2802, "step": 3957 }, { "epoch": 0.40686324874139523, "grad_norm": 0.05542384088039398, "learning_rate": 0.01, "loss": 2.2303, "step": 3960 }, { "epoch": 0.4071714784752902, "grad_norm": 0.08013599365949631, "learning_rate": 0.01, "loss": 2.2361, "step": 3963 }, { "epoch": 0.4074797082091852, "grad_norm": 0.15963242948055267, "learning_rate": 0.01, "loss": 2.2608, "step": 3966 }, { "epoch": 0.40778793794308027, "grad_norm": 0.05428241938352585, "learning_rate": 0.01, "loss": 2.2415, "step": 3969 }, { "epoch": 0.40809616767697526, "grad_norm": 0.09297880530357361, "learning_rate": 0.01, "loss": 2.2804, "step": 3972 }, { "epoch": 0.40840439741087026, "grad_norm": 0.11259882897138596, "learning_rate": 0.01, "loss": 2.2562, "step": 3975 }, { "epoch": 0.40871262714476525, "grad_norm": 0.0546397790312767, "learning_rate": 0.01, "loss": 2.2423, "step": 3978 }, { "epoch": 0.40902085687866024, "grad_norm": 0.13870957493782043, "learning_rate": 0.01, "loss": 2.2431, "step": 3981 }, { "epoch": 0.40932908661255524, "grad_norm": 0.05527504161000252, "learning_rate": 0.01, "loss": 2.2649, "step": 3984 }, { "epoch": 0.40963731634645023, "grad_norm": 0.08060980588197708, "learning_rate": 0.01, "loss": 2.2708, "step": 3987 }, { "epoch": 0.4099455460803452, "grad_norm": 0.05611690506339073, "learning_rate": 0.01, "loss": 2.2683, "step": 3990 }, { "epoch": 0.4102537758142402, "grad_norm": 0.08760816603899002, "learning_rate": 0.01, "loss": 2.2392, "step": 3993 }, { "epoch": 0.4105620055481352, "grad_norm": 0.07327746599912643, "learning_rate": 0.01, "loss": 2.2587, "step": 3996 }, { "epoch": 0.4108702352820302, "grad_norm": 0.05924748629331589, "learning_rate": 0.01, "loss": 2.2435, "step": 3999 }, { "epoch": 0.4111784650159252, "grad_norm": 0.08269370347261429, "learning_rate": 0.01, "loss": 2.2365, "step": 4002 }, { "epoch": 0.4114866947498202, "grad_norm": 0.06834371387958527, "learning_rate": 0.01, "loss": 2.2579, "step": 4005 }, { "epoch": 0.4117949244837152, "grad_norm": 0.06737885624170303, "learning_rate": 0.01, "loss": 2.2585, "step": 4008 }, { "epoch": 0.4121031542176102, "grad_norm": 0.0919148176908493, "learning_rate": 0.01, "loss": 2.2524, "step": 4011 }, { "epoch": 0.4124113839515052, "grad_norm": 0.0744348093867302, "learning_rate": 0.01, "loss": 2.2328, "step": 4014 }, { "epoch": 0.41271961368540017, "grad_norm": 0.08952994644641876, "learning_rate": 0.01, "loss": 2.2556, "step": 4017 }, { "epoch": 0.41302784341929516, "grad_norm": 0.054230738431215286, "learning_rate": 0.01, "loss": 2.2559, "step": 4020 }, { "epoch": 0.41333607315319015, "grad_norm": 0.11185753345489502, "learning_rate": 0.01, "loss": 2.2599, "step": 4023 }, { "epoch": 0.41364430288708515, "grad_norm": 0.11211541295051575, "learning_rate": 0.01, "loss": 2.2456, "step": 4026 }, { "epoch": 0.4139525326209802, "grad_norm": 0.08211257308721542, "learning_rate": 0.01, "loss": 2.2636, "step": 4029 }, { "epoch": 0.4142607623548752, "grad_norm": 0.07233046740293503, "learning_rate": 0.01, "loss": 2.2148, "step": 4032 }, { "epoch": 0.4145689920887702, "grad_norm": 0.1062379851937294, "learning_rate": 0.01, "loss": 2.2382, "step": 4035 }, { "epoch": 0.4148772218226652, "grad_norm": 0.07079877704381943, "learning_rate": 0.01, "loss": 2.2462, "step": 4038 }, { "epoch": 0.41518545155656017, "grad_norm": 0.04237307608127594, "learning_rate": 0.01, "loss": 2.2523, "step": 4041 }, { "epoch": 0.41549368129045516, "grad_norm": 0.12513239681720734, "learning_rate": 0.01, "loss": 2.2614, "step": 4044 }, { "epoch": 0.41580191102435016, "grad_norm": 0.07134360820055008, "learning_rate": 0.01, "loss": 2.2533, "step": 4047 }, { "epoch": 0.41611014075824515, "grad_norm": 0.07371515780687332, "learning_rate": 0.01, "loss": 2.2333, "step": 4050 }, { "epoch": 0.41641837049214014, "grad_norm": 0.05744464695453644, "learning_rate": 0.01, "loss": 2.2292, "step": 4053 }, { "epoch": 0.41672660022603514, "grad_norm": 0.0790088102221489, "learning_rate": 0.01, "loss": 2.2217, "step": 4056 }, { "epoch": 0.41703482995993013, "grad_norm": 0.12540112435817719, "learning_rate": 0.01, "loss": 2.2367, "step": 4059 }, { "epoch": 0.4173430596938251, "grad_norm": 0.06895852833986282, "learning_rate": 0.01, "loss": 2.2354, "step": 4062 }, { "epoch": 0.4176512894277201, "grad_norm": 0.09068478643894196, "learning_rate": 0.01, "loss": 2.2605, "step": 4065 }, { "epoch": 0.4179595191616151, "grad_norm": 0.051881443709135056, "learning_rate": 0.01, "loss": 2.2501, "step": 4068 }, { "epoch": 0.4182677488955101, "grad_norm": 0.20433951914310455, "learning_rate": 0.01, "loss": 2.2582, "step": 4071 }, { "epoch": 0.4185759786294051, "grad_norm": 0.08301309496164322, "learning_rate": 0.01, "loss": 2.2424, "step": 4074 }, { "epoch": 0.4188842083633001, "grad_norm": 0.07062964886426926, "learning_rate": 0.01, "loss": 2.2345, "step": 4077 }, { "epoch": 0.4191924380971951, "grad_norm": 0.09770773351192474, "learning_rate": 0.01, "loss": 2.264, "step": 4080 }, { "epoch": 0.4195006678310901, "grad_norm": 0.0847458690404892, "learning_rate": 0.01, "loss": 2.2329, "step": 4083 }, { "epoch": 0.41980889756498513, "grad_norm": 0.06491915881633759, "learning_rate": 0.01, "loss": 2.2174, "step": 4086 }, { "epoch": 0.4201171272988801, "grad_norm": 0.11355047672986984, "learning_rate": 0.01, "loss": 2.2653, "step": 4089 }, { "epoch": 0.4204253570327751, "grad_norm": 0.10509520024061203, "learning_rate": 0.01, "loss": 2.2435, "step": 4092 }, { "epoch": 0.4207335867666701, "grad_norm": 0.07456620037555695, "learning_rate": 0.01, "loss": 2.2348, "step": 4095 }, { "epoch": 0.4210418165005651, "grad_norm": 0.07531027495861053, "learning_rate": 0.01, "loss": 2.2524, "step": 4098 }, { "epoch": 0.4213500462344601, "grad_norm": 0.06129564717411995, "learning_rate": 0.01, "loss": 2.2577, "step": 4101 }, { "epoch": 0.4216582759683551, "grad_norm": 0.03984616696834564, "learning_rate": 0.01, "loss": 2.2354, "step": 4104 }, { "epoch": 0.4219665057022501, "grad_norm": 0.1273418813943863, "learning_rate": 0.01, "loss": 2.2478, "step": 4107 }, { "epoch": 0.4222747354361451, "grad_norm": 0.08859774470329285, "learning_rate": 0.01, "loss": 2.2504, "step": 4110 }, { "epoch": 0.42258296517004007, "grad_norm": 0.10512147098779678, "learning_rate": 0.01, "loss": 2.2435, "step": 4113 }, { "epoch": 0.42289119490393506, "grad_norm": 0.11181578040122986, "learning_rate": 0.01, "loss": 2.2396, "step": 4116 }, { "epoch": 0.42319942463783006, "grad_norm": 0.07474307715892792, "learning_rate": 0.01, "loss": 2.2518, "step": 4119 }, { "epoch": 0.42350765437172505, "grad_norm": 0.07233690470457077, "learning_rate": 0.01, "loss": 2.2283, "step": 4122 }, { "epoch": 0.42381588410562004, "grad_norm": 0.06051602587103844, "learning_rate": 0.01, "loss": 2.2429, "step": 4125 }, { "epoch": 0.42412411383951504, "grad_norm": 0.0492120198905468, "learning_rate": 0.01, "loss": 2.2312, "step": 4128 }, { "epoch": 0.42443234357341003, "grad_norm": 0.07249493151903152, "learning_rate": 0.01, "loss": 2.244, "step": 4131 }, { "epoch": 0.424740573307305, "grad_norm": 0.0993468165397644, "learning_rate": 0.01, "loss": 2.2441, "step": 4134 }, { "epoch": 0.4250488030412, "grad_norm": 0.07051920145750046, "learning_rate": 0.01, "loss": 2.2188, "step": 4137 }, { "epoch": 0.425357032775095, "grad_norm": 0.08267249912023544, "learning_rate": 0.01, "loss": 2.2472, "step": 4140 }, { "epoch": 0.42566526250899006, "grad_norm": 0.1307336390018463, "learning_rate": 0.01, "loss": 2.2359, "step": 4143 }, { "epoch": 0.42597349224288505, "grad_norm": 0.09383214265108109, "learning_rate": 0.01, "loss": 2.2519, "step": 4146 }, { "epoch": 0.42628172197678005, "grad_norm": 0.08928582817316055, "learning_rate": 0.01, "loss": 2.2322, "step": 4149 }, { "epoch": 0.42658995171067504, "grad_norm": 0.10554556548595428, "learning_rate": 0.01, "loss": 2.2219, "step": 4152 }, { "epoch": 0.42689818144457004, "grad_norm": 0.06501816213130951, "learning_rate": 0.01, "loss": 2.2351, "step": 4155 }, { "epoch": 0.42720641117846503, "grad_norm": 0.10736589878797531, "learning_rate": 0.01, "loss": 2.2327, "step": 4158 }, { "epoch": 0.42751464091236, "grad_norm": 0.11834681034088135, "learning_rate": 0.01, "loss": 2.2617, "step": 4161 }, { "epoch": 0.427822870646255, "grad_norm": 0.07011161744594574, "learning_rate": 0.01, "loss": 2.2218, "step": 4164 }, { "epoch": 0.42813110038015, "grad_norm": 0.0653071179986, "learning_rate": 0.01, "loss": 2.2115, "step": 4167 }, { "epoch": 0.428439330114045, "grad_norm": 0.057517893612384796, "learning_rate": 0.01, "loss": 2.2533, "step": 4170 }, { "epoch": 0.42874755984794, "grad_norm": 0.060261376202106476, "learning_rate": 0.01, "loss": 2.2199, "step": 4173 }, { "epoch": 0.429055789581835, "grad_norm": 0.12384762614965439, "learning_rate": 0.01, "loss": 2.2124, "step": 4176 }, { "epoch": 0.42936401931573, "grad_norm": 0.06436473876237869, "learning_rate": 0.01, "loss": 2.2558, "step": 4179 }, { "epoch": 0.429672249049625, "grad_norm": 0.049704987555742264, "learning_rate": 0.01, "loss": 2.2434, "step": 4182 }, { "epoch": 0.42998047878351997, "grad_norm": 0.0809103325009346, "learning_rate": 0.01, "loss": 2.2461, "step": 4185 }, { "epoch": 0.43028870851741496, "grad_norm": 0.04888701066374779, "learning_rate": 0.01, "loss": 2.2342, "step": 4188 }, { "epoch": 0.43059693825130996, "grad_norm": 0.04951067641377449, "learning_rate": 0.01, "loss": 2.2292, "step": 4191 }, { "epoch": 0.43090516798520495, "grad_norm": 0.13740333914756775, "learning_rate": 0.01, "loss": 2.2243, "step": 4194 }, { "epoch": 0.43121339771909994, "grad_norm": 0.09912848472595215, "learning_rate": 0.01, "loss": 2.2065, "step": 4197 }, { "epoch": 0.43152162745299494, "grad_norm": 0.1031954362988472, "learning_rate": 0.01, "loss": 2.2247, "step": 4200 }, { "epoch": 0.43182985718689, "grad_norm": 0.04378229379653931, "learning_rate": 0.01, "loss": 2.2485, "step": 4203 }, { "epoch": 0.432138086920785, "grad_norm": 0.05430865287780762, "learning_rate": 0.01, "loss": 2.2178, "step": 4206 }, { "epoch": 0.43244631665468, "grad_norm": 0.05675321817398071, "learning_rate": 0.01, "loss": 2.2568, "step": 4209 }, { "epoch": 0.43275454638857497, "grad_norm": 0.07637004554271698, "learning_rate": 0.01, "loss": 2.2567, "step": 4212 }, { "epoch": 0.43306277612246996, "grad_norm": 0.06263475120067596, "learning_rate": 0.01, "loss": 2.2597, "step": 4215 }, { "epoch": 0.43337100585636495, "grad_norm": 0.09689760208129883, "learning_rate": 0.01, "loss": 2.2376, "step": 4218 }, { "epoch": 0.43367923559025995, "grad_norm": 0.13923399150371552, "learning_rate": 0.01, "loss": 2.2394, "step": 4221 }, { "epoch": 0.43398746532415494, "grad_norm": 0.0607299767434597, "learning_rate": 0.01, "loss": 2.2366, "step": 4224 }, { "epoch": 0.43429569505804994, "grad_norm": 0.05221550166606903, "learning_rate": 0.01, "loss": 2.2587, "step": 4227 }, { "epoch": 0.43460392479194493, "grad_norm": 0.05556831881403923, "learning_rate": 0.01, "loss": 2.2422, "step": 4230 }, { "epoch": 0.4349121545258399, "grad_norm": 0.0843261182308197, "learning_rate": 0.01, "loss": 2.2399, "step": 4233 }, { "epoch": 0.4352203842597349, "grad_norm": 0.08864692598581314, "learning_rate": 0.01, "loss": 2.2155, "step": 4236 }, { "epoch": 0.4355286139936299, "grad_norm": 0.11530198156833649, "learning_rate": 0.01, "loss": 2.2612, "step": 4239 }, { "epoch": 0.4358368437275249, "grad_norm": 0.11549337208271027, "learning_rate": 0.01, "loss": 2.2233, "step": 4242 }, { "epoch": 0.4361450734614199, "grad_norm": 0.11105350404977798, "learning_rate": 0.01, "loss": 2.2426, "step": 4245 }, { "epoch": 0.4364533031953149, "grad_norm": 0.1190980076789856, "learning_rate": 0.01, "loss": 2.2353, "step": 4248 }, { "epoch": 0.4367615329292099, "grad_norm": 0.08560021221637726, "learning_rate": 0.01, "loss": 2.2542, "step": 4251 }, { "epoch": 0.4370697626631049, "grad_norm": 0.05514337494969368, "learning_rate": 0.01, "loss": 2.2171, "step": 4254 }, { "epoch": 0.43737799239699987, "grad_norm": 0.06764981150627136, "learning_rate": 0.01, "loss": 2.2363, "step": 4257 }, { "epoch": 0.4376862221308949, "grad_norm": 0.04801105335354805, "learning_rate": 0.01, "loss": 2.2352, "step": 4260 }, { "epoch": 0.4379944518647899, "grad_norm": 0.04782482981681824, "learning_rate": 0.01, "loss": 2.2458, "step": 4263 }, { "epoch": 0.4383026815986849, "grad_norm": 0.12880820035934448, "learning_rate": 0.01, "loss": 2.2384, "step": 4266 }, { "epoch": 0.4386109113325799, "grad_norm": 0.06714754551649094, "learning_rate": 0.01, "loss": 2.2214, "step": 4269 }, { "epoch": 0.4389191410664749, "grad_norm": 0.08878037333488464, "learning_rate": 0.01, "loss": 2.2597, "step": 4272 }, { "epoch": 0.4392273708003699, "grad_norm": 0.051335882395505905, "learning_rate": 0.01, "loss": 2.2065, "step": 4275 }, { "epoch": 0.4395356005342649, "grad_norm": 0.058174654841423035, "learning_rate": 0.01, "loss": 2.246, "step": 4278 }, { "epoch": 0.4398438302681599, "grad_norm": 0.053695593029260635, "learning_rate": 0.01, "loss": 2.2406, "step": 4281 }, { "epoch": 0.44015206000205487, "grad_norm": 0.07685926556587219, "learning_rate": 0.01, "loss": 2.2212, "step": 4284 }, { "epoch": 0.44046028973594986, "grad_norm": 0.13495223224163055, "learning_rate": 0.01, "loss": 2.2486, "step": 4287 }, { "epoch": 0.44076851946984485, "grad_norm": 0.0707453116774559, "learning_rate": 0.01, "loss": 2.247, "step": 4290 }, { "epoch": 0.44107674920373985, "grad_norm": 0.04909240081906319, "learning_rate": 0.01, "loss": 2.2528, "step": 4293 }, { "epoch": 0.44138497893763484, "grad_norm": 0.06148238107562065, "learning_rate": 0.01, "loss": 2.2462, "step": 4296 }, { "epoch": 0.44169320867152984, "grad_norm": 0.07306285202503204, "learning_rate": 0.01, "loss": 2.199, "step": 4299 }, { "epoch": 0.44200143840542483, "grad_norm": 0.12965865433216095, "learning_rate": 0.01, "loss": 2.2156, "step": 4302 }, { "epoch": 0.4423096681393198, "grad_norm": 0.059606775641441345, "learning_rate": 0.01, "loss": 2.2209, "step": 4305 }, { "epoch": 0.4426178978732148, "grad_norm": 0.06866457313299179, "learning_rate": 0.01, "loss": 2.2508, "step": 4308 }, { "epoch": 0.4429261276071098, "grad_norm": 0.08940677344799042, "learning_rate": 0.01, "loss": 2.2244, "step": 4311 }, { "epoch": 0.4432343573410048, "grad_norm": 0.10428988933563232, "learning_rate": 0.01, "loss": 2.2106, "step": 4314 }, { "epoch": 0.44354258707489985, "grad_norm": 0.1565064787864685, "learning_rate": 0.01, "loss": 2.2745, "step": 4317 }, { "epoch": 0.44385081680879485, "grad_norm": 0.11433500796556473, "learning_rate": 0.01, "loss": 2.2655, "step": 4320 }, { "epoch": 0.44415904654268984, "grad_norm": 0.07315809279680252, "learning_rate": 0.01, "loss": 2.2523, "step": 4323 }, { "epoch": 0.44446727627658483, "grad_norm": 0.048583708703517914, "learning_rate": 0.01, "loss": 2.2345, "step": 4326 }, { "epoch": 0.4447755060104798, "grad_norm": 0.03422848507761955, "learning_rate": 0.01, "loss": 2.1883, "step": 4329 }, { "epoch": 0.4450837357443748, "grad_norm": 0.05057518929243088, "learning_rate": 0.01, "loss": 2.2288, "step": 4332 }, { "epoch": 0.4453919654782698, "grad_norm": 0.10407044738531113, "learning_rate": 0.01, "loss": 2.1974, "step": 4335 }, { "epoch": 0.4457001952121648, "grad_norm": 0.06545260548591614, "learning_rate": 0.01, "loss": 2.2121, "step": 4338 }, { "epoch": 0.4460084249460598, "grad_norm": 0.09442485123872757, "learning_rate": 0.01, "loss": 2.2145, "step": 4341 }, { "epoch": 0.4463166546799548, "grad_norm": 0.11353209614753723, "learning_rate": 0.01, "loss": 2.227, "step": 4344 }, { "epoch": 0.4466248844138498, "grad_norm": 0.11243279278278351, "learning_rate": 0.01, "loss": 2.242, "step": 4347 }, { "epoch": 0.4469331141477448, "grad_norm": 0.14264856278896332, "learning_rate": 0.01, "loss": 2.2405, "step": 4350 }, { "epoch": 0.4472413438816398, "grad_norm": 0.048186566680669785, "learning_rate": 0.01, "loss": 2.1921, "step": 4353 }, { "epoch": 0.44754957361553477, "grad_norm": 0.0693448930978775, "learning_rate": 0.01, "loss": 2.2404, "step": 4356 }, { "epoch": 0.44785780334942976, "grad_norm": 0.04426461458206177, "learning_rate": 0.01, "loss": 2.2114, "step": 4359 }, { "epoch": 0.44816603308332476, "grad_norm": 0.06392990797758102, "learning_rate": 0.01, "loss": 2.224, "step": 4362 }, { "epoch": 0.44847426281721975, "grad_norm": 0.16224262118339539, "learning_rate": 0.01, "loss": 2.261, "step": 4365 }, { "epoch": 0.44878249255111474, "grad_norm": 0.06382444500923157, "learning_rate": 0.01, "loss": 2.2067, "step": 4368 }, { "epoch": 0.44909072228500974, "grad_norm": 0.09267281740903854, "learning_rate": 0.01, "loss": 2.2403, "step": 4371 }, { "epoch": 0.44939895201890473, "grad_norm": 0.09785914421081543, "learning_rate": 0.01, "loss": 2.2276, "step": 4374 }, { "epoch": 0.4497071817527998, "grad_norm": 0.06673259288072586, "learning_rate": 0.01, "loss": 2.214, "step": 4377 }, { "epoch": 0.45001541148669477, "grad_norm": 0.05463524907827377, "learning_rate": 0.01, "loss": 2.2048, "step": 4380 }, { "epoch": 0.45032364122058977, "grad_norm": 0.05466567724943161, "learning_rate": 0.01, "loss": 2.2062, "step": 4383 }, { "epoch": 0.45063187095448476, "grad_norm": 0.07413290441036224, "learning_rate": 0.01, "loss": 2.2178, "step": 4386 }, { "epoch": 0.45094010068837975, "grad_norm": 0.06564678996801376, "learning_rate": 0.01, "loss": 2.2304, "step": 4389 }, { "epoch": 0.45124833042227475, "grad_norm": 0.12468644231557846, "learning_rate": 0.01, "loss": 2.2301, "step": 4392 }, { "epoch": 0.45155656015616974, "grad_norm": 0.06898069381713867, "learning_rate": 0.01, "loss": 2.2255, "step": 4395 }, { "epoch": 0.45186478989006473, "grad_norm": 0.13579058647155762, "learning_rate": 0.01, "loss": 2.2021, "step": 4398 }, { "epoch": 0.4521730196239597, "grad_norm": 0.07980421930551529, "learning_rate": 0.01, "loss": 2.2598, "step": 4401 }, { "epoch": 0.4524812493578547, "grad_norm": 0.07771994173526764, "learning_rate": 0.01, "loss": 2.2166, "step": 4404 }, { "epoch": 0.4527894790917497, "grad_norm": 0.08967602998018265, "learning_rate": 0.01, "loss": 2.2095, "step": 4407 }, { "epoch": 0.4530977088256447, "grad_norm": 0.10909977555274963, "learning_rate": 0.01, "loss": 2.2064, "step": 4410 }, { "epoch": 0.4534059385595397, "grad_norm": 0.11167363077402115, "learning_rate": 0.01, "loss": 2.2021, "step": 4413 }, { "epoch": 0.4537141682934347, "grad_norm": 0.10310694575309753, "learning_rate": 0.01, "loss": 2.2582, "step": 4416 }, { "epoch": 0.4540223980273297, "grad_norm": 0.06411474943161011, "learning_rate": 0.01, "loss": 2.2203, "step": 4419 }, { "epoch": 0.4543306277612247, "grad_norm": 0.11141805350780487, "learning_rate": 0.01, "loss": 2.2163, "step": 4422 }, { "epoch": 0.4546388574951197, "grad_norm": 0.09054200351238251, "learning_rate": 0.01, "loss": 2.2054, "step": 4425 }, { "epoch": 0.45494708722901467, "grad_norm": 0.06952405720949173, "learning_rate": 0.01, "loss": 2.2488, "step": 4428 }, { "epoch": 0.45525531696290966, "grad_norm": 0.08597440272569656, "learning_rate": 0.01, "loss": 2.2044, "step": 4431 }, { "epoch": 0.4555635466968047, "grad_norm": 0.06718187034130096, "learning_rate": 0.01, "loss": 2.2419, "step": 4434 }, { "epoch": 0.4558717764306997, "grad_norm": 0.0558515265583992, "learning_rate": 0.01, "loss": 2.2102, "step": 4437 }, { "epoch": 0.4561800061645947, "grad_norm": 0.0560682937502861, "learning_rate": 0.01, "loss": 2.2324, "step": 4440 }, { "epoch": 0.4564882358984897, "grad_norm": 0.058881547302007675, "learning_rate": 0.01, "loss": 2.1966, "step": 4443 }, { "epoch": 0.4567964656323847, "grad_norm": 0.07034582644701004, "learning_rate": 0.01, "loss": 2.2021, "step": 4446 }, { "epoch": 0.4571046953662797, "grad_norm": 0.09703799337148666, "learning_rate": 0.01, "loss": 2.21, "step": 4449 }, { "epoch": 0.45741292510017467, "grad_norm": 0.06268820911645889, "learning_rate": 0.01, "loss": 2.2237, "step": 4452 }, { "epoch": 0.45772115483406967, "grad_norm": 0.123359814286232, "learning_rate": 0.01, "loss": 2.2063, "step": 4455 }, { "epoch": 0.45802938456796466, "grad_norm": 0.0536644384264946, "learning_rate": 0.01, "loss": 2.2002, "step": 4458 }, { "epoch": 0.45833761430185965, "grad_norm": 0.0957527682185173, "learning_rate": 0.01, "loss": 2.2484, "step": 4461 }, { "epoch": 0.45864584403575465, "grad_norm": 0.12607458233833313, "learning_rate": 0.01, "loss": 2.2241, "step": 4464 }, { "epoch": 0.45895407376964964, "grad_norm": 0.07415255159139633, "learning_rate": 0.01, "loss": 2.2083, "step": 4467 }, { "epoch": 0.45926230350354463, "grad_norm": 0.10248073190450668, "learning_rate": 0.01, "loss": 2.2253, "step": 4470 }, { "epoch": 0.4595705332374396, "grad_norm": 0.05264243111014366, "learning_rate": 0.01, "loss": 2.2166, "step": 4473 }, { "epoch": 0.4598787629713346, "grad_norm": 0.0557783767580986, "learning_rate": 0.01, "loss": 2.2213, "step": 4476 }, { "epoch": 0.4601869927052296, "grad_norm": 0.06835830211639404, "learning_rate": 0.01, "loss": 2.2255, "step": 4479 }, { "epoch": 0.4604952224391246, "grad_norm": 0.12045460939407349, "learning_rate": 0.01, "loss": 2.2331, "step": 4482 }, { "epoch": 0.4608034521730196, "grad_norm": 0.11495090276002884, "learning_rate": 0.01, "loss": 2.2191, "step": 4485 }, { "epoch": 0.4611116819069146, "grad_norm": 0.07859046757221222, "learning_rate": 0.01, "loss": 2.2282, "step": 4488 }, { "epoch": 0.46141991164080964, "grad_norm": 0.03789819777011871, "learning_rate": 0.01, "loss": 2.2188, "step": 4491 }, { "epoch": 0.46172814137470464, "grad_norm": 0.03617655113339424, "learning_rate": 0.01, "loss": 2.2496, "step": 4494 }, { "epoch": 0.46203637110859963, "grad_norm": 0.06894705444574356, "learning_rate": 0.01, "loss": 2.2007, "step": 4497 }, { "epoch": 0.4623446008424946, "grad_norm": 0.1143706887960434, "learning_rate": 0.01, "loss": 2.2247, "step": 4500 }, { "epoch": 0.4626528305763896, "grad_norm": 0.10069230943918228, "learning_rate": 0.01, "loss": 2.2114, "step": 4503 }, { "epoch": 0.4629610603102846, "grad_norm": 0.10068007558584213, "learning_rate": 0.01, "loss": 2.2438, "step": 4506 }, { "epoch": 0.4632692900441796, "grad_norm": 0.05319290608167648, "learning_rate": 0.01, "loss": 2.2422, "step": 4509 }, { "epoch": 0.4635775197780746, "grad_norm": 0.06933122128248215, "learning_rate": 0.01, "loss": 2.2059, "step": 4512 }, { "epoch": 0.4638857495119696, "grad_norm": 0.11921056360006332, "learning_rate": 0.01, "loss": 2.2137, "step": 4515 }, { "epoch": 0.4641939792458646, "grad_norm": 0.06092121824622154, "learning_rate": 0.01, "loss": 2.1941, "step": 4518 }, { "epoch": 0.4645022089797596, "grad_norm": 0.06017937511205673, "learning_rate": 0.01, "loss": 2.2539, "step": 4521 }, { "epoch": 0.4648104387136546, "grad_norm": 0.05721915140748024, "learning_rate": 0.01, "loss": 2.2348, "step": 4524 }, { "epoch": 0.46511866844754957, "grad_norm": 0.07706714421510696, "learning_rate": 0.01, "loss": 2.2169, "step": 4527 }, { "epoch": 0.46542689818144456, "grad_norm": 0.07279779762029648, "learning_rate": 0.01, "loss": 2.2163, "step": 4530 }, { "epoch": 0.46573512791533955, "grad_norm": 0.06781268864870071, "learning_rate": 0.01, "loss": 2.1682, "step": 4533 }, { "epoch": 0.46604335764923455, "grad_norm": 0.0807657316327095, "learning_rate": 0.01, "loss": 2.2123, "step": 4536 }, { "epoch": 0.46635158738312954, "grad_norm": 0.06467099487781525, "learning_rate": 0.01, "loss": 2.2152, "step": 4539 }, { "epoch": 0.46665981711702453, "grad_norm": 0.10680168867111206, "learning_rate": 0.01, "loss": 2.2062, "step": 4542 }, { "epoch": 0.4669680468509195, "grad_norm": 0.11668167263269424, "learning_rate": 0.01, "loss": 2.206, "step": 4545 }, { "epoch": 0.4672762765848146, "grad_norm": 0.06468226760625839, "learning_rate": 0.01, "loss": 2.2011, "step": 4548 }, { "epoch": 0.46758450631870957, "grad_norm": 0.07668601721525192, "learning_rate": 0.01, "loss": 2.2128, "step": 4551 }, { "epoch": 0.46789273605260456, "grad_norm": 0.05631673336029053, "learning_rate": 0.01, "loss": 2.1812, "step": 4554 }, { "epoch": 0.46820096578649956, "grad_norm": 0.12898530066013336, "learning_rate": 0.01, "loss": 2.2312, "step": 4557 }, { "epoch": 0.46850919552039455, "grad_norm": 0.07105603069067001, "learning_rate": 0.01, "loss": 2.1949, "step": 4560 }, { "epoch": 0.46881742525428954, "grad_norm": 0.07172367721796036, "learning_rate": 0.01, "loss": 2.2509, "step": 4563 }, { "epoch": 0.46912565498818454, "grad_norm": 0.1219574511051178, "learning_rate": 0.01, "loss": 2.2147, "step": 4566 }, { "epoch": 0.46943388472207953, "grad_norm": 0.05777307227253914, "learning_rate": 0.01, "loss": 2.2071, "step": 4569 }, { "epoch": 0.4697421144559745, "grad_norm": 0.12805253267288208, "learning_rate": 0.01, "loss": 2.2166, "step": 4572 }, { "epoch": 0.4700503441898695, "grad_norm": 0.11360877752304077, "learning_rate": 0.01, "loss": 2.1827, "step": 4575 }, { "epoch": 0.4703585739237645, "grad_norm": 0.07203348726034164, "learning_rate": 0.01, "loss": 2.2378, "step": 4578 }, { "epoch": 0.4706668036576595, "grad_norm": 0.05645303055644035, "learning_rate": 0.01, "loss": 2.2044, "step": 4581 }, { "epoch": 0.4709750333915545, "grad_norm": 0.06103040650486946, "learning_rate": 0.01, "loss": 2.2302, "step": 4584 }, { "epoch": 0.4712832631254495, "grad_norm": 0.0621771402657032, "learning_rate": 0.01, "loss": 2.2147, "step": 4587 }, { "epoch": 0.4715914928593445, "grad_norm": 0.08458666503429413, "learning_rate": 0.01, "loss": 2.1781, "step": 4590 }, { "epoch": 0.4718997225932395, "grad_norm": 0.092729851603508, "learning_rate": 0.01, "loss": 2.2326, "step": 4593 }, { "epoch": 0.4722079523271345, "grad_norm": 0.09255766123533249, "learning_rate": 0.01, "loss": 2.2082, "step": 4596 }, { "epoch": 0.47251618206102947, "grad_norm": 0.11929985135793686, "learning_rate": 0.01, "loss": 2.2064, "step": 4599 }, { "epoch": 0.47282441179492446, "grad_norm": 0.12234004586935043, "learning_rate": 0.01, "loss": 2.1513, "step": 4602 }, { "epoch": 0.47313264152881945, "grad_norm": 0.07648742944002151, "learning_rate": 0.01, "loss": 2.2376, "step": 4605 }, { "epoch": 0.4734408712627145, "grad_norm": 0.05717691034078598, "learning_rate": 0.01, "loss": 2.231, "step": 4608 }, { "epoch": 0.4737491009966095, "grad_norm": 0.048224568367004395, "learning_rate": 0.01, "loss": 2.2126, "step": 4611 }, { "epoch": 0.4740573307305045, "grad_norm": 0.07530826330184937, "learning_rate": 0.01, "loss": 2.2155, "step": 4614 }, { "epoch": 0.4743655604643995, "grad_norm": 0.08617862313985825, "learning_rate": 0.01, "loss": 2.2286, "step": 4617 }, { "epoch": 0.4746737901982945, "grad_norm": 0.10041820257902145, "learning_rate": 0.01, "loss": 2.1917, "step": 4620 }, { "epoch": 0.47498201993218947, "grad_norm": 0.04470205307006836, "learning_rate": 0.01, "loss": 2.2188, "step": 4623 }, { "epoch": 0.47529024966608446, "grad_norm": 0.060269374400377274, "learning_rate": 0.01, "loss": 2.2267, "step": 4626 }, { "epoch": 0.47559847939997946, "grad_norm": 0.06320520490407944, "learning_rate": 0.01, "loss": 2.2054, "step": 4629 }, { "epoch": 0.47590670913387445, "grad_norm": 0.05642838776111603, "learning_rate": 0.01, "loss": 2.2062, "step": 4632 }, { "epoch": 0.47621493886776944, "grad_norm": 0.064301997423172, "learning_rate": 0.01, "loss": 2.2296, "step": 4635 }, { "epoch": 0.47652316860166444, "grad_norm": 0.07448214292526245, "learning_rate": 0.01, "loss": 2.197, "step": 4638 }, { "epoch": 0.47683139833555943, "grad_norm": 0.08586326986551285, "learning_rate": 0.01, "loss": 2.1743, "step": 4641 }, { "epoch": 0.4771396280694544, "grad_norm": 0.13179326057434082, "learning_rate": 0.01, "loss": 2.2299, "step": 4644 }, { "epoch": 0.4774478578033494, "grad_norm": 0.1163720041513443, "learning_rate": 0.01, "loss": 2.2089, "step": 4647 }, { "epoch": 0.4777560875372444, "grad_norm": 0.04846031963825226, "learning_rate": 0.01, "loss": 2.1564, "step": 4650 }, { "epoch": 0.4780643172711394, "grad_norm": 0.13724131882190704, "learning_rate": 0.01, "loss": 2.2078, "step": 4653 }, { "epoch": 0.4783725470050344, "grad_norm": 0.062840536236763, "learning_rate": 0.01, "loss": 2.2252, "step": 4656 }, { "epoch": 0.4786807767389294, "grad_norm": 0.06721820682287216, "learning_rate": 0.01, "loss": 2.1781, "step": 4659 }, { "epoch": 0.4789890064728244, "grad_norm": 0.09086044877767563, "learning_rate": 0.01, "loss": 2.2179, "step": 4662 }, { "epoch": 0.47929723620671943, "grad_norm": 0.07732655107975006, "learning_rate": 0.01, "loss": 2.2334, "step": 4665 }, { "epoch": 0.47960546594061443, "grad_norm": 0.04763714596629143, "learning_rate": 0.01, "loss": 2.2262, "step": 4668 }, { "epoch": 0.4799136956745094, "grad_norm": 0.09649144858121872, "learning_rate": 0.01, "loss": 2.2141, "step": 4671 }, { "epoch": 0.4802219254084044, "grad_norm": 0.05458167567849159, "learning_rate": 0.01, "loss": 2.1967, "step": 4674 }, { "epoch": 0.4805301551422994, "grad_norm": 0.08577650040388107, "learning_rate": 0.01, "loss": 2.2183, "step": 4677 }, { "epoch": 0.4808383848761944, "grad_norm": 0.0733698159456253, "learning_rate": 0.01, "loss": 2.2185, "step": 4680 }, { "epoch": 0.4811466146100894, "grad_norm": 0.06648692488670349, "learning_rate": 0.01, "loss": 2.1904, "step": 4683 }, { "epoch": 0.4814548443439844, "grad_norm": 0.08376996219158173, "learning_rate": 0.01, "loss": 2.2097, "step": 4686 }, { "epoch": 0.4817630740778794, "grad_norm": 0.05270134285092354, "learning_rate": 0.01, "loss": 2.2304, "step": 4689 }, { "epoch": 0.4820713038117744, "grad_norm": 0.05531509965658188, "learning_rate": 0.01, "loss": 2.2039, "step": 4692 }, { "epoch": 0.48237953354566937, "grad_norm": 0.05848492309451103, "learning_rate": 0.01, "loss": 2.2113, "step": 4695 }, { "epoch": 0.48268776327956436, "grad_norm": 0.06692120432853699, "learning_rate": 0.01, "loss": 2.1972, "step": 4698 }, { "epoch": 0.48299599301345936, "grad_norm": 0.07243851572275162, "learning_rate": 0.01, "loss": 2.223, "step": 4701 }, { "epoch": 0.48330422274735435, "grad_norm": 0.06565523892641068, "learning_rate": 0.01, "loss": 2.1913, "step": 4704 }, { "epoch": 0.48361245248124934, "grad_norm": 0.04595122113823891, "learning_rate": 0.01, "loss": 2.1782, "step": 4707 }, { "epoch": 0.48392068221514434, "grad_norm": 0.06658844649791718, "learning_rate": 0.01, "loss": 2.224, "step": 4710 }, { "epoch": 0.48422891194903933, "grad_norm": 0.0807071253657341, "learning_rate": 0.01, "loss": 2.217, "step": 4713 }, { "epoch": 0.4845371416829343, "grad_norm": 0.0562782846391201, "learning_rate": 0.01, "loss": 2.2033, "step": 4716 }, { "epoch": 0.4848453714168293, "grad_norm": 0.07851718366146088, "learning_rate": 0.01, "loss": 2.1847, "step": 4719 }, { "epoch": 0.48515360115072437, "grad_norm": 0.07649900764226913, "learning_rate": 0.01, "loss": 2.2222, "step": 4722 }, { "epoch": 0.48546183088461936, "grad_norm": 0.07279150187969208, "learning_rate": 0.01, "loss": 2.1951, "step": 4725 }, { "epoch": 0.48577006061851435, "grad_norm": 0.053628645837306976, "learning_rate": 0.01, "loss": 2.1681, "step": 4728 }, { "epoch": 0.48607829035240935, "grad_norm": 0.09401357173919678, "learning_rate": 0.01, "loss": 2.1943, "step": 4731 }, { "epoch": 0.48638652008630434, "grad_norm": 0.1156088337302208, "learning_rate": 0.01, "loss": 2.2317, "step": 4734 }, { "epoch": 0.48669474982019933, "grad_norm": 0.12672138214111328, "learning_rate": 0.01, "loss": 2.2085, "step": 4737 }, { "epoch": 0.48700297955409433, "grad_norm": 0.06799574196338654, "learning_rate": 0.01, "loss": 2.2161, "step": 4740 }, { "epoch": 0.4873112092879893, "grad_norm": 0.06479325145483017, "learning_rate": 0.01, "loss": 2.1663, "step": 4743 }, { "epoch": 0.4876194390218843, "grad_norm": 0.09143824130296707, "learning_rate": 0.01, "loss": 2.2193, "step": 4746 }, { "epoch": 0.4879276687557793, "grad_norm": 0.09262688457965851, "learning_rate": 0.01, "loss": 2.218, "step": 4749 }, { "epoch": 0.4882358984896743, "grad_norm": 0.11519678682088852, "learning_rate": 0.01, "loss": 2.1937, "step": 4752 }, { "epoch": 0.4885441282235693, "grad_norm": 0.07646415382623672, "learning_rate": 0.01, "loss": 2.2133, "step": 4755 }, { "epoch": 0.4888523579574643, "grad_norm": 0.08090809732675552, "learning_rate": 0.01, "loss": 2.193, "step": 4758 }, { "epoch": 0.4891605876913593, "grad_norm": 0.08812209218740463, "learning_rate": 0.01, "loss": 2.2215, "step": 4761 }, { "epoch": 0.4894688174252543, "grad_norm": 0.14427846670150757, "learning_rate": 0.01, "loss": 2.2115, "step": 4764 }, { "epoch": 0.48977704715914927, "grad_norm": 0.08065719902515411, "learning_rate": 0.01, "loss": 2.1861, "step": 4767 }, { "epoch": 0.49008527689304426, "grad_norm": 0.04888691008090973, "learning_rate": 0.01, "loss": 2.1911, "step": 4770 }, { "epoch": 0.49039350662693926, "grad_norm": 0.04742259159684181, "learning_rate": 0.01, "loss": 2.2152, "step": 4773 }, { "epoch": 0.49070173636083425, "grad_norm": 0.061714138835668564, "learning_rate": 0.01, "loss": 2.2009, "step": 4776 }, { "epoch": 0.49100996609472924, "grad_norm": 0.07582443952560425, "learning_rate": 0.01, "loss": 2.2189, "step": 4779 }, { "epoch": 0.4913181958286243, "grad_norm": 0.1390780359506607, "learning_rate": 0.01, "loss": 2.211, "step": 4782 }, { "epoch": 0.4916264255625193, "grad_norm": 0.03784565255045891, "learning_rate": 0.01, "loss": 2.2011, "step": 4785 }, { "epoch": 0.4919346552964143, "grad_norm": 0.07413594424724579, "learning_rate": 0.01, "loss": 2.2103, "step": 4788 }, { "epoch": 0.4922428850303093, "grad_norm": 0.09402404725551605, "learning_rate": 0.01, "loss": 2.1912, "step": 4791 }, { "epoch": 0.49255111476420427, "grad_norm": 0.0717400312423706, "learning_rate": 0.01, "loss": 2.1868, "step": 4794 }, { "epoch": 0.49285934449809926, "grad_norm": 0.05179424583911896, "learning_rate": 0.01, "loss": 2.2298, "step": 4797 }, { "epoch": 0.49316757423199425, "grad_norm": 0.12123433500528336, "learning_rate": 0.01, "loss": 2.2005, "step": 4800 }, { "epoch": 0.49347580396588925, "grad_norm": 0.04941033944487572, "learning_rate": 0.01, "loss": 2.2113, "step": 4803 }, { "epoch": 0.49378403369978424, "grad_norm": 0.10987304151058197, "learning_rate": 0.01, "loss": 2.209, "step": 4806 }, { "epoch": 0.49409226343367924, "grad_norm": 0.09235193580389023, "learning_rate": 0.01, "loss": 2.1967, "step": 4809 }, { "epoch": 0.49440049316757423, "grad_norm": 0.057354703545570374, "learning_rate": 0.01, "loss": 2.219, "step": 4812 }, { "epoch": 0.4947087229014692, "grad_norm": 0.04692654311656952, "learning_rate": 0.01, "loss": 2.173, "step": 4815 }, { "epoch": 0.4950169526353642, "grad_norm": 0.09447453171014786, "learning_rate": 0.01, "loss": 2.1806, "step": 4818 }, { "epoch": 0.4953251823692592, "grad_norm": 0.09967079013586044, "learning_rate": 0.01, "loss": 2.1809, "step": 4821 }, { "epoch": 0.4956334121031542, "grad_norm": 0.06462189555168152, "learning_rate": 0.01, "loss": 2.1922, "step": 4824 }, { "epoch": 0.4959416418370492, "grad_norm": 0.038030870258808136, "learning_rate": 0.01, "loss": 2.2239, "step": 4827 }, { "epoch": 0.4962498715709442, "grad_norm": 0.06828872114419937, "learning_rate": 0.01, "loss": 2.1881, "step": 4830 }, { "epoch": 0.4965581013048392, "grad_norm": 0.10087070614099503, "learning_rate": 0.01, "loss": 2.22, "step": 4833 }, { "epoch": 0.4968663310387342, "grad_norm": 0.07630455493927002, "learning_rate": 0.01, "loss": 2.188, "step": 4836 }, { "epoch": 0.4971745607726292, "grad_norm": 0.05040668696165085, "learning_rate": 0.01, "loss": 2.2012, "step": 4839 }, { "epoch": 0.4974827905065242, "grad_norm": 0.05160282924771309, "learning_rate": 0.01, "loss": 2.2119, "step": 4842 }, { "epoch": 0.4977910202404192, "grad_norm": 0.04949258640408516, "learning_rate": 0.01, "loss": 2.1959, "step": 4845 }, { "epoch": 0.4980992499743142, "grad_norm": 0.07766029983758926, "learning_rate": 0.01, "loss": 2.1896, "step": 4848 }, { "epoch": 0.4984074797082092, "grad_norm": 0.06274580955505371, "learning_rate": 0.01, "loss": 2.2014, "step": 4851 }, { "epoch": 0.4987157094421042, "grad_norm": 0.1071280762553215, "learning_rate": 0.01, "loss": 2.2045, "step": 4854 }, { "epoch": 0.4990239391759992, "grad_norm": 0.10645020008087158, "learning_rate": 0.01, "loss": 2.1895, "step": 4857 }, { "epoch": 0.4993321689098942, "grad_norm": 0.1151091679930687, "learning_rate": 0.01, "loss": 2.1954, "step": 4860 }, { "epoch": 0.4996403986437892, "grad_norm": 0.09699530899524689, "learning_rate": 0.01, "loss": 2.1833, "step": 4863 }, { "epoch": 0.49994862837768417, "grad_norm": 0.06568959355354309, "learning_rate": 0.01, "loss": 2.1862, "step": 4866 }, { "epoch": 0.5002568581115792, "grad_norm": 0.0421447716653347, "learning_rate": 0.01, "loss": 2.1819, "step": 4869 }, { "epoch": 0.5005650878454742, "grad_norm": 0.04529868811368942, "learning_rate": 0.01, "loss": 2.1852, "step": 4872 }, { "epoch": 0.5008733175793691, "grad_norm": 0.059541650116443634, "learning_rate": 0.01, "loss": 2.1955, "step": 4875 }, { "epoch": 0.5011815473132641, "grad_norm": 0.061823770403862, "learning_rate": 0.01, "loss": 2.2039, "step": 4878 }, { "epoch": 0.5014897770471591, "grad_norm": 0.05892050638794899, "learning_rate": 0.01, "loss": 2.178, "step": 4881 }, { "epoch": 0.5017980067810541, "grad_norm": 0.04842402786016464, "learning_rate": 0.01, "loss": 2.1948, "step": 4884 }, { "epoch": 0.5021062365149491, "grad_norm": 0.05962050333619118, "learning_rate": 0.01, "loss": 2.1932, "step": 4887 }, { "epoch": 0.5024144662488441, "grad_norm": 0.056295089423656464, "learning_rate": 0.01, "loss": 2.1757, "step": 4890 }, { "epoch": 0.5027226959827391, "grad_norm": 0.07448049634695053, "learning_rate": 0.01, "loss": 2.2181, "step": 4893 }, { "epoch": 0.5030309257166341, "grad_norm": 0.07998815923929214, "learning_rate": 0.01, "loss": 2.1714, "step": 4896 }, { "epoch": 0.5033391554505291, "grad_norm": 0.08058517426252365, "learning_rate": 0.01, "loss": 2.2131, "step": 4899 }, { "epoch": 0.5036473851844241, "grad_norm": 0.07899410277605057, "learning_rate": 0.01, "loss": 2.1727, "step": 4902 }, { "epoch": 0.5039556149183191, "grad_norm": 0.05830831080675125, "learning_rate": 0.01, "loss": 2.176, "step": 4905 }, { "epoch": 0.5042638446522141, "grad_norm": 0.05831579118967056, "learning_rate": 0.01, "loss": 2.1754, "step": 4908 }, { "epoch": 0.5045720743861091, "grad_norm": 0.052614904940128326, "learning_rate": 0.01, "loss": 2.1935, "step": 4911 }, { "epoch": 0.5048803041200041, "grad_norm": 0.0830332413315773, "learning_rate": 0.01, "loss": 2.2274, "step": 4914 }, { "epoch": 0.5051885338538991, "grad_norm": 0.1138230562210083, "learning_rate": 0.01, "loss": 2.1976, "step": 4917 }, { "epoch": 0.505496763587794, "grad_norm": 0.07024016976356506, "learning_rate": 0.01, "loss": 2.1969, "step": 4920 }, { "epoch": 0.505804993321689, "grad_norm": 0.07235170155763626, "learning_rate": 0.01, "loss": 2.2163, "step": 4923 }, { "epoch": 0.5061132230555841, "grad_norm": 0.06894835084676743, "learning_rate": 0.01, "loss": 2.2232, "step": 4926 }, { "epoch": 0.5064214527894791, "grad_norm": 0.0825890600681305, "learning_rate": 0.01, "loss": 2.1896, "step": 4929 }, { "epoch": 0.5067296825233741, "grad_norm": 0.05901159718632698, "learning_rate": 0.01, "loss": 2.1988, "step": 4932 }, { "epoch": 0.5070379122572691, "grad_norm": 0.048157334327697754, "learning_rate": 0.01, "loss": 2.1904, "step": 4935 }, { "epoch": 0.5073461419911641, "grad_norm": 0.10036749392747879, "learning_rate": 0.01, "loss": 2.1977, "step": 4938 }, { "epoch": 0.5076543717250591, "grad_norm": 0.10984963923692703, "learning_rate": 0.01, "loss": 2.1957, "step": 4941 }, { "epoch": 0.5079626014589541, "grad_norm": 0.09587367624044418, "learning_rate": 0.01, "loss": 2.2, "step": 4944 }, { "epoch": 0.5082708311928491, "grad_norm": 0.06347552686929703, "learning_rate": 0.01, "loss": 2.1918, "step": 4947 }, { "epoch": 0.5085790609267441, "grad_norm": 0.0658629834651947, "learning_rate": 0.01, "loss": 2.1945, "step": 4950 }, { "epoch": 0.5088872906606391, "grad_norm": 0.045971643179655075, "learning_rate": 0.01, "loss": 2.2114, "step": 4953 }, { "epoch": 0.5091955203945341, "grad_norm": 0.04058291018009186, "learning_rate": 0.01, "loss": 2.2066, "step": 4956 }, { "epoch": 0.5095037501284291, "grad_norm": 0.052851296961307526, "learning_rate": 0.01, "loss": 2.1884, "step": 4959 }, { "epoch": 0.5098119798623241, "grad_norm": 0.033158350735902786, "learning_rate": 0.01, "loss": 2.2078, "step": 4962 }, { "epoch": 0.5101202095962191, "grad_norm": 0.05409036949276924, "learning_rate": 0.01, "loss": 2.181, "step": 4965 }, { "epoch": 0.5104284393301141, "grad_norm": 0.0731736570596695, "learning_rate": 0.01, "loss": 2.1825, "step": 4968 }, { "epoch": 0.510736669064009, "grad_norm": 0.05854470282793045, "learning_rate": 0.01, "loss": 2.2119, "step": 4971 }, { "epoch": 0.511044898797904, "grad_norm": 0.05071520060300827, "learning_rate": 0.01, "loss": 2.1886, "step": 4974 }, { "epoch": 0.511353128531799, "grad_norm": 0.060792725533246994, "learning_rate": 0.01, "loss": 2.2066, "step": 4977 }, { "epoch": 0.511661358265694, "grad_norm": 0.0910191684961319, "learning_rate": 0.01, "loss": 2.209, "step": 4980 }, { "epoch": 0.511969587999589, "grad_norm": 0.12366749346256256, "learning_rate": 0.01, "loss": 2.2263, "step": 4983 }, { "epoch": 0.512277817733484, "grad_norm": 0.11254429817199707, "learning_rate": 0.01, "loss": 2.175, "step": 4986 }, { "epoch": 0.512586047467379, "grad_norm": 0.11091643571853638, "learning_rate": 0.01, "loss": 2.2046, "step": 4989 }, { "epoch": 0.512894277201274, "grad_norm": 0.045611754059791565, "learning_rate": 0.01, "loss": 2.1861, "step": 4992 }, { "epoch": 0.513202506935169, "grad_norm": 0.09836157411336899, "learning_rate": 0.01, "loss": 2.1931, "step": 4995 }, { "epoch": 0.513510736669064, "grad_norm": 0.11932815611362457, "learning_rate": 0.01, "loss": 2.2087, "step": 4998 }, { "epoch": 0.513818966402959, "grad_norm": 0.10955359041690826, "learning_rate": 0.01, "loss": 2.19, "step": 5001 }, { "epoch": 0.514127196136854, "grad_norm": 0.09804633259773254, "learning_rate": 0.01, "loss": 2.1572, "step": 5004 }, { "epoch": 0.514435425870749, "grad_norm": 0.04835839942097664, "learning_rate": 0.01, "loss": 2.2115, "step": 5007 }, { "epoch": 0.514743655604644, "grad_norm": 0.04645110294222832, "learning_rate": 0.01, "loss": 2.2012, "step": 5010 }, { "epoch": 0.515051885338539, "grad_norm": 0.05947386845946312, "learning_rate": 0.01, "loss": 2.2039, "step": 5013 }, { "epoch": 0.515360115072434, "grad_norm": 0.05693971738219261, "learning_rate": 0.01, "loss": 2.1733, "step": 5016 }, { "epoch": 0.515668344806329, "grad_norm": 0.0724320039153099, "learning_rate": 0.01, "loss": 2.1944, "step": 5019 }, { "epoch": 0.5159765745402239, "grad_norm": 0.06627337634563446, "learning_rate": 0.01, "loss": 2.1932, "step": 5022 }, { "epoch": 0.5162848042741189, "grad_norm": 0.10879958420991898, "learning_rate": 0.01, "loss": 2.2024, "step": 5025 }, { "epoch": 0.5165930340080139, "grad_norm": 0.12266898900270462, "learning_rate": 0.01, "loss": 2.1938, "step": 5028 }, { "epoch": 0.5169012637419089, "grad_norm": 0.06240540370345116, "learning_rate": 0.01, "loss": 2.18, "step": 5031 }, { "epoch": 0.5172094934758039, "grad_norm": 0.05043266713619232, "learning_rate": 0.01, "loss": 2.1936, "step": 5034 }, { "epoch": 0.5175177232096989, "grad_norm": 0.052652738988399506, "learning_rate": 0.01, "loss": 2.1631, "step": 5037 }, { "epoch": 0.5178259529435939, "grad_norm": 0.04598904401063919, "learning_rate": 0.01, "loss": 2.2067, "step": 5040 }, { "epoch": 0.518134182677489, "grad_norm": 0.07040087133646011, "learning_rate": 0.01, "loss": 2.1737, "step": 5043 }, { "epoch": 0.518442412411384, "grad_norm": 0.04827702417969704, "learning_rate": 0.01, "loss": 2.2128, "step": 5046 }, { "epoch": 0.518750642145279, "grad_norm": 0.09803622215986252, "learning_rate": 0.01, "loss": 2.1851, "step": 5049 }, { "epoch": 0.519058871879174, "grad_norm": 0.1019926443696022, "learning_rate": 0.01, "loss": 2.181, "step": 5052 }, { "epoch": 0.519367101613069, "grad_norm": 0.08847504109144211, "learning_rate": 0.01, "loss": 2.1874, "step": 5055 }, { "epoch": 0.519675331346964, "grad_norm": 0.06151921674609184, "learning_rate": 0.01, "loss": 2.1785, "step": 5058 }, { "epoch": 0.519983561080859, "grad_norm": 0.04823022335767746, "learning_rate": 0.01, "loss": 2.1662, "step": 5061 }, { "epoch": 0.520291790814754, "grad_norm": 0.12454935908317566, "learning_rate": 0.01, "loss": 2.1864, "step": 5064 }, { "epoch": 0.520600020548649, "grad_norm": 0.0716002956032753, "learning_rate": 0.01, "loss": 2.1866, "step": 5067 }, { "epoch": 0.520908250282544, "grad_norm": 0.055079616606235504, "learning_rate": 0.01, "loss": 2.2137, "step": 5070 }, { "epoch": 0.5212164800164389, "grad_norm": 0.05969909206032753, "learning_rate": 0.01, "loss": 2.1972, "step": 5073 }, { "epoch": 0.5215247097503339, "grad_norm": 0.07373122125864029, "learning_rate": 0.01, "loss": 2.2312, "step": 5076 }, { "epoch": 0.5218329394842289, "grad_norm": 0.1899929642677307, "learning_rate": 0.01, "loss": 2.2141, "step": 5079 }, { "epoch": 0.5221411692181239, "grad_norm": 0.05221979692578316, "learning_rate": 0.01, "loss": 2.1899, "step": 5082 }, { "epoch": 0.5224493989520189, "grad_norm": 0.04537337273359299, "learning_rate": 0.01, "loss": 2.1571, "step": 5085 }, { "epoch": 0.5227576286859139, "grad_norm": 0.05490431934595108, "learning_rate": 0.01, "loss": 2.1705, "step": 5088 }, { "epoch": 0.5230658584198089, "grad_norm": 0.03813198208808899, "learning_rate": 0.01, "loss": 2.1773, "step": 5091 }, { "epoch": 0.5233740881537039, "grad_norm": 0.045411352068185806, "learning_rate": 0.01, "loss": 2.2065, "step": 5094 }, { "epoch": 0.5236823178875989, "grad_norm": 0.05433456227183342, "learning_rate": 0.01, "loss": 2.1901, "step": 5097 }, { "epoch": 0.5239905476214939, "grad_norm": 0.10771681368350983, "learning_rate": 0.01, "loss": 2.171, "step": 5100 }, { "epoch": 0.5242987773553889, "grad_norm": 0.06446761637926102, "learning_rate": 0.01, "loss": 2.2033, "step": 5103 }, { "epoch": 0.5246070070892839, "grad_norm": 0.06428392231464386, "learning_rate": 0.01, "loss": 2.1877, "step": 5106 }, { "epoch": 0.5249152368231789, "grad_norm": 0.0525304451584816, "learning_rate": 0.01, "loss": 2.206, "step": 5109 }, { "epoch": 0.5252234665570739, "grad_norm": 0.07332491129636765, "learning_rate": 0.01, "loss": 2.1992, "step": 5112 }, { "epoch": 0.5255316962909689, "grad_norm": 0.1738174557685852, "learning_rate": 0.01, "loss": 2.176, "step": 5115 }, { "epoch": 0.5258399260248638, "grad_norm": 0.08102334290742874, "learning_rate": 0.01, "loss": 2.2067, "step": 5118 }, { "epoch": 0.5261481557587588, "grad_norm": 0.06945500522851944, "learning_rate": 0.01, "loss": 2.206, "step": 5121 }, { "epoch": 0.5264563854926538, "grad_norm": 0.07017000019550323, "learning_rate": 0.01, "loss": 2.2002, "step": 5124 }, { "epoch": 0.5267646152265488, "grad_norm": 0.03883346915245056, "learning_rate": 0.01, "loss": 2.1608, "step": 5127 }, { "epoch": 0.5270728449604438, "grad_norm": 0.050974566489458084, "learning_rate": 0.01, "loss": 2.1973, "step": 5130 }, { "epoch": 0.5273810746943388, "grad_norm": 0.0665312334895134, "learning_rate": 0.01, "loss": 2.2064, "step": 5133 }, { "epoch": 0.5276893044282338, "grad_norm": 0.03946761414408684, "learning_rate": 0.01, "loss": 2.1794, "step": 5136 }, { "epoch": 0.5279975341621288, "grad_norm": 0.046160902827978134, "learning_rate": 0.01, "loss": 2.1919, "step": 5139 }, { "epoch": 0.5283057638960238, "grad_norm": 0.046186063438653946, "learning_rate": 0.01, "loss": 2.1687, "step": 5142 }, { "epoch": 0.5286139936299188, "grad_norm": 0.07073774188756943, "learning_rate": 0.01, "loss": 2.2154, "step": 5145 }, { "epoch": 0.5289222233638138, "grad_norm": 0.08319617807865143, "learning_rate": 0.01, "loss": 2.2104, "step": 5148 }, { "epoch": 0.5292304530977088, "grad_norm": 0.08551453799009323, "learning_rate": 0.01, "loss": 2.1743, "step": 5151 }, { "epoch": 0.5295386828316038, "grad_norm": 0.06613323837518692, "learning_rate": 0.01, "loss": 2.1774, "step": 5154 }, { "epoch": 0.5298469125654989, "grad_norm": 0.0601351298391819, "learning_rate": 0.01, "loss": 2.1831, "step": 5157 }, { "epoch": 0.5301551422993939, "grad_norm": 0.04963411018252373, "learning_rate": 0.01, "loss": 2.1915, "step": 5160 }, { "epoch": 0.5304633720332889, "grad_norm": 0.06755329668521881, "learning_rate": 0.01, "loss": 2.163, "step": 5163 }, { "epoch": 0.5307716017671839, "grad_norm": 0.04755258187651634, "learning_rate": 0.01, "loss": 2.1518, "step": 5166 }, { "epoch": 0.5310798315010788, "grad_norm": 0.15916316211223602, "learning_rate": 0.01, "loss": 2.1835, "step": 5169 }, { "epoch": 0.5313880612349738, "grad_norm": 0.0807122215628624, "learning_rate": 0.01, "loss": 2.2193, "step": 5172 }, { "epoch": 0.5316962909688688, "grad_norm": 0.05207689106464386, "learning_rate": 0.01, "loss": 2.1754, "step": 5175 }, { "epoch": 0.5320045207027638, "grad_norm": 0.045082803815603256, "learning_rate": 0.01, "loss": 2.1525, "step": 5178 }, { "epoch": 0.5323127504366588, "grad_norm": 0.07747700810432434, "learning_rate": 0.01, "loss": 2.1438, "step": 5181 }, { "epoch": 0.5326209801705538, "grad_norm": 0.13366450369358063, "learning_rate": 0.01, "loss": 2.1904, "step": 5184 }, { "epoch": 0.5329292099044488, "grad_norm": 0.06902889162302017, "learning_rate": 0.01, "loss": 2.1786, "step": 5187 }, { "epoch": 0.5332374396383438, "grad_norm": 0.04604712501168251, "learning_rate": 0.01, "loss": 2.1848, "step": 5190 }, { "epoch": 0.5335456693722388, "grad_norm": 0.08803047984838486, "learning_rate": 0.01, "loss": 2.1798, "step": 5193 }, { "epoch": 0.5338538991061338, "grad_norm": 0.08366485685110092, "learning_rate": 0.01, "loss": 2.2008, "step": 5196 }, { "epoch": 0.5341621288400288, "grad_norm": 0.06176333501935005, "learning_rate": 0.01, "loss": 2.1722, "step": 5199 }, { "epoch": 0.5344703585739238, "grad_norm": 0.0837249681353569, "learning_rate": 0.01, "loss": 2.1783, "step": 5202 }, { "epoch": 0.5347785883078188, "grad_norm": 0.1322035938501358, "learning_rate": 0.01, "loss": 2.1948, "step": 5205 }, { "epoch": 0.5350868180417138, "grad_norm": 0.11094444990158081, "learning_rate": 0.01, "loss": 2.1937, "step": 5208 }, { "epoch": 0.5353950477756088, "grad_norm": 0.05182232707738876, "learning_rate": 0.01, "loss": 2.1945, "step": 5211 }, { "epoch": 0.5357032775095038, "grad_norm": 0.08261944353580475, "learning_rate": 0.01, "loss": 2.1855, "step": 5214 }, { "epoch": 0.5360115072433987, "grad_norm": 0.11097295582294464, "learning_rate": 0.01, "loss": 2.1902, "step": 5217 }, { "epoch": 0.5363197369772937, "grad_norm": 0.05825675278902054, "learning_rate": 0.01, "loss": 2.1984, "step": 5220 }, { "epoch": 0.5366279667111887, "grad_norm": 0.11612821370363235, "learning_rate": 0.01, "loss": 2.1679, "step": 5223 }, { "epoch": 0.5369361964450837, "grad_norm": 0.09120064228773117, "learning_rate": 0.01, "loss": 2.1586, "step": 5226 }, { "epoch": 0.5372444261789787, "grad_norm": 0.055181995034217834, "learning_rate": 0.01, "loss": 2.2169, "step": 5229 }, { "epoch": 0.5375526559128737, "grad_norm": 0.055780068039894104, "learning_rate": 0.01, "loss": 2.1748, "step": 5232 }, { "epoch": 0.5378608856467687, "grad_norm": 0.06303024291992188, "learning_rate": 0.01, "loss": 2.1782, "step": 5235 }, { "epoch": 0.5381691153806637, "grad_norm": 0.10456321388483047, "learning_rate": 0.01, "loss": 2.1975, "step": 5238 }, { "epoch": 0.5384773451145587, "grad_norm": 0.054177962243556976, "learning_rate": 0.01, "loss": 2.1641, "step": 5241 }, { "epoch": 0.5387855748484537, "grad_norm": 0.06265738606452942, "learning_rate": 0.01, "loss": 2.183, "step": 5244 }, { "epoch": 0.5390938045823487, "grad_norm": 0.13720418512821198, "learning_rate": 0.01, "loss": 2.1698, "step": 5247 }, { "epoch": 0.5394020343162437, "grad_norm": 0.04917861148715019, "learning_rate": 0.01, "loss": 2.1692, "step": 5250 }, { "epoch": 0.5397102640501387, "grad_norm": 0.04919945448637009, "learning_rate": 0.01, "loss": 2.1652, "step": 5253 }, { "epoch": 0.5400184937840337, "grad_norm": 0.06462734192609787, "learning_rate": 0.01, "loss": 2.1987, "step": 5256 }, { "epoch": 0.5403267235179287, "grad_norm": 0.05275480076670647, "learning_rate": 0.01, "loss": 2.1955, "step": 5259 }, { "epoch": 0.5406349532518236, "grad_norm": 0.12235717475414276, "learning_rate": 0.01, "loss": 2.1937, "step": 5262 }, { "epoch": 0.5409431829857186, "grad_norm": 0.05300014466047287, "learning_rate": 0.01, "loss": 2.1589, "step": 5265 }, { "epoch": 0.5412514127196136, "grad_norm": 0.0429493710398674, "learning_rate": 0.01, "loss": 2.1618, "step": 5268 }, { "epoch": 0.5415596424535086, "grad_norm": 0.07041259855031967, "learning_rate": 0.01, "loss": 2.1661, "step": 5271 }, { "epoch": 0.5418678721874037, "grad_norm": 0.05304478853940964, "learning_rate": 0.01, "loss": 2.183, "step": 5274 }, { "epoch": 0.5421761019212987, "grad_norm": 0.12009457498788834, "learning_rate": 0.01, "loss": 2.1862, "step": 5277 }, { "epoch": 0.5424843316551937, "grad_norm": 0.11629784107208252, "learning_rate": 0.01, "loss": 2.1897, "step": 5280 }, { "epoch": 0.5427925613890887, "grad_norm": 0.07305426150560379, "learning_rate": 0.01, "loss": 2.1832, "step": 5283 }, { "epoch": 0.5431007911229837, "grad_norm": 0.0855623185634613, "learning_rate": 0.01, "loss": 2.1884, "step": 5286 }, { "epoch": 0.5434090208568787, "grad_norm": 0.04178578779101372, "learning_rate": 0.01, "loss": 2.1872, "step": 5289 }, { "epoch": 0.5437172505907737, "grad_norm": 0.05382310971617699, "learning_rate": 0.01, "loss": 2.1901, "step": 5292 }, { "epoch": 0.5440254803246687, "grad_norm": 0.10682760924100876, "learning_rate": 0.01, "loss": 2.1957, "step": 5295 }, { "epoch": 0.5443337100585637, "grad_norm": 0.15037471055984497, "learning_rate": 0.01, "loss": 2.2085, "step": 5298 }, { "epoch": 0.5446419397924587, "grad_norm": 0.08333491533994675, "learning_rate": 0.01, "loss": 2.1964, "step": 5301 }, { "epoch": 0.5449501695263537, "grad_norm": 0.08964785933494568, "learning_rate": 0.01, "loss": 2.1613, "step": 5304 }, { "epoch": 0.5452583992602487, "grad_norm": 0.06194687634706497, "learning_rate": 0.01, "loss": 2.1711, "step": 5307 }, { "epoch": 0.5455666289941437, "grad_norm": 0.047254305332899094, "learning_rate": 0.01, "loss": 2.1956, "step": 5310 }, { "epoch": 0.5458748587280386, "grad_norm": 0.052926719188690186, "learning_rate": 0.01, "loss": 2.1767, "step": 5313 }, { "epoch": 0.5461830884619336, "grad_norm": 0.08765383809804916, "learning_rate": 0.01, "loss": 2.1782, "step": 5316 }, { "epoch": 0.5464913181958286, "grad_norm": 0.0749160572886467, "learning_rate": 0.01, "loss": 2.1875, "step": 5319 }, { "epoch": 0.5467995479297236, "grad_norm": 0.09781020879745483, "learning_rate": 0.01, "loss": 2.1748, "step": 5322 }, { "epoch": 0.5471077776636186, "grad_norm": 0.04605260491371155, "learning_rate": 0.01, "loss": 2.145, "step": 5325 }, { "epoch": 0.5474160073975136, "grad_norm": 0.13507331907749176, "learning_rate": 0.01, "loss": 2.1769, "step": 5328 }, { "epoch": 0.5477242371314086, "grad_norm": 0.05028709024190903, "learning_rate": 0.01, "loss": 2.1925, "step": 5331 }, { "epoch": 0.5480324668653036, "grad_norm": 0.08754327893257141, "learning_rate": 0.01, "loss": 2.159, "step": 5334 }, { "epoch": 0.5483406965991986, "grad_norm": 0.10449190437793732, "learning_rate": 0.01, "loss": 2.1898, "step": 5337 }, { "epoch": 0.5486489263330936, "grad_norm": 0.10263057053089142, "learning_rate": 0.01, "loss": 2.1776, "step": 5340 }, { "epoch": 0.5489571560669886, "grad_norm": 0.0547097772359848, "learning_rate": 0.01, "loss": 2.1941, "step": 5343 }, { "epoch": 0.5492653858008836, "grad_norm": 0.06682941317558289, "learning_rate": 0.01, "loss": 2.1901, "step": 5346 }, { "epoch": 0.5495736155347786, "grad_norm": 0.06421027332544327, "learning_rate": 0.01, "loss": 2.1981, "step": 5349 }, { "epoch": 0.5498818452686736, "grad_norm": 0.041218411177396774, "learning_rate": 0.01, "loss": 2.1844, "step": 5352 }, { "epoch": 0.5501900750025686, "grad_norm": 0.042902372777462006, "learning_rate": 0.01, "loss": 2.1981, "step": 5355 }, { "epoch": 0.5504983047364636, "grad_norm": 0.05338321253657341, "learning_rate": 0.01, "loss": 2.168, "step": 5358 }, { "epoch": 0.5508065344703585, "grad_norm": 0.06692427396774292, "learning_rate": 0.01, "loss": 2.1891, "step": 5361 }, { "epoch": 0.5511147642042535, "grad_norm": 0.07927200943231583, "learning_rate": 0.01, "loss": 2.1853, "step": 5364 }, { "epoch": 0.5514229939381485, "grad_norm": 0.05655739828944206, "learning_rate": 0.01, "loss": 2.1838, "step": 5367 }, { "epoch": 0.5517312236720435, "grad_norm": 0.04488144442439079, "learning_rate": 0.01, "loss": 2.1754, "step": 5370 }, { "epoch": 0.5520394534059385, "grad_norm": 0.09253795444965363, "learning_rate": 0.01, "loss": 2.1742, "step": 5373 }, { "epoch": 0.5523476831398335, "grad_norm": 0.07396019250154495, "learning_rate": 0.01, "loss": 2.1582, "step": 5376 }, { "epoch": 0.5526559128737285, "grad_norm": 0.053663600236177444, "learning_rate": 0.01, "loss": 2.1508, "step": 5379 }, { "epoch": 0.5529641426076235, "grad_norm": 0.062076181173324585, "learning_rate": 0.01, "loss": 2.1772, "step": 5382 }, { "epoch": 0.5532723723415185, "grad_norm": 0.08481581509113312, "learning_rate": 0.01, "loss": 2.1836, "step": 5385 }, { "epoch": 0.5535806020754136, "grad_norm": 0.08981155604124069, "learning_rate": 0.01, "loss": 2.204, "step": 5388 }, { "epoch": 0.5538888318093086, "grad_norm": 0.10067261755466461, "learning_rate": 0.01, "loss": 2.1527, "step": 5391 }, { "epoch": 0.5541970615432036, "grad_norm": 0.06231047958135605, "learning_rate": 0.01, "loss": 2.194, "step": 5394 }, { "epoch": 0.5545052912770986, "grad_norm": 0.049111973494291306, "learning_rate": 0.01, "loss": 2.1889, "step": 5397 }, { "epoch": 0.5548135210109936, "grad_norm": 0.06446948647499084, "learning_rate": 0.01, "loss": 2.2103, "step": 5400 }, { "epoch": 0.5551217507448886, "grad_norm": 0.051946625113487244, "learning_rate": 0.01, "loss": 2.1977, "step": 5403 }, { "epoch": 0.5554299804787836, "grad_norm": 0.1369265466928482, "learning_rate": 0.01, "loss": 2.1771, "step": 5406 }, { "epoch": 0.5557382102126786, "grad_norm": 0.08489779382944107, "learning_rate": 0.01, "loss": 2.1782, "step": 5409 }, { "epoch": 0.5560464399465735, "grad_norm": 0.10673670470714569, "learning_rate": 0.01, "loss": 2.173, "step": 5412 }, { "epoch": 0.5563546696804685, "grad_norm": 0.055250637233257294, "learning_rate": 0.01, "loss": 2.1539, "step": 5415 }, { "epoch": 0.5566628994143635, "grad_norm": 0.05136672407388687, "learning_rate": 0.01, "loss": 2.2035, "step": 5418 }, { "epoch": 0.5569711291482585, "grad_norm": 0.040590591728687286, "learning_rate": 0.01, "loss": 2.1778, "step": 5421 }, { "epoch": 0.5572793588821535, "grad_norm": 0.048333633691072464, "learning_rate": 0.01, "loss": 2.191, "step": 5424 }, { "epoch": 0.5575875886160485, "grad_norm": 0.0582728311419487, "learning_rate": 0.01, "loss": 2.1734, "step": 5427 }, { "epoch": 0.5578958183499435, "grad_norm": 0.05272262915968895, "learning_rate": 0.01, "loss": 2.1714, "step": 5430 }, { "epoch": 0.5582040480838385, "grad_norm": 0.08472342789173126, "learning_rate": 0.01, "loss": 2.1624, "step": 5433 }, { "epoch": 0.5585122778177335, "grad_norm": 0.10869960486888885, "learning_rate": 0.01, "loss": 2.164, "step": 5436 }, { "epoch": 0.5588205075516285, "grad_norm": 0.0569114163517952, "learning_rate": 0.01, "loss": 2.1933, "step": 5439 }, { "epoch": 0.5591287372855235, "grad_norm": 0.14485467970371246, "learning_rate": 0.01, "loss": 2.1779, "step": 5442 }, { "epoch": 0.5594369670194185, "grad_norm": 0.08184878528118134, "learning_rate": 0.01, "loss": 2.1779, "step": 5445 }, { "epoch": 0.5597451967533135, "grad_norm": 0.06575775146484375, "learning_rate": 0.01, "loss": 2.136, "step": 5448 }, { "epoch": 0.5600534264872085, "grad_norm": 0.08628299832344055, "learning_rate": 0.01, "loss": 2.1696, "step": 5451 }, { "epoch": 0.5603616562211035, "grad_norm": 0.06078352406620979, "learning_rate": 0.01, "loss": 2.1865, "step": 5454 }, { "epoch": 0.5606698859549984, "grad_norm": 0.05207353085279465, "learning_rate": 0.01, "loss": 2.167, "step": 5457 }, { "epoch": 0.5609781156888934, "grad_norm": 0.059535857290029526, "learning_rate": 0.01, "loss": 2.1977, "step": 5460 }, { "epoch": 0.5612863454227884, "grad_norm": 0.05342729389667511, "learning_rate": 0.01, "loss": 2.1823, "step": 5463 }, { "epoch": 0.5615945751566834, "grad_norm": 0.04207632318139076, "learning_rate": 0.01, "loss": 2.1849, "step": 5466 }, { "epoch": 0.5619028048905784, "grad_norm": 0.1334255486726761, "learning_rate": 0.01, "loss": 2.1886, "step": 5469 }, { "epoch": 0.5622110346244734, "grad_norm": 0.06532323360443115, "learning_rate": 0.01, "loss": 2.1979, "step": 5472 }, { "epoch": 0.5625192643583684, "grad_norm": 0.0793483555316925, "learning_rate": 0.01, "loss": 2.188, "step": 5475 }, { "epoch": 0.5628274940922634, "grad_norm": 0.04637480154633522, "learning_rate": 0.01, "loss": 2.1562, "step": 5478 }, { "epoch": 0.5631357238261584, "grad_norm": 0.0482000894844532, "learning_rate": 0.01, "loss": 2.1587, "step": 5481 }, { "epoch": 0.5634439535600534, "grad_norm": 0.06253401190042496, "learning_rate": 0.01, "loss": 2.1978, "step": 5484 }, { "epoch": 0.5637521832939484, "grad_norm": 0.15622715651988983, "learning_rate": 0.01, "loss": 2.171, "step": 5487 }, { "epoch": 0.5640604130278434, "grad_norm": 0.10941077768802643, "learning_rate": 0.01, "loss": 2.1952, "step": 5490 }, { "epoch": 0.5643686427617384, "grad_norm": 0.08030713349580765, "learning_rate": 0.01, "loss": 2.1948, "step": 5493 }, { "epoch": 0.5646768724956334, "grad_norm": 0.13679014146327972, "learning_rate": 0.01, "loss": 2.1767, "step": 5496 }, { "epoch": 0.5649851022295284, "grad_norm": 0.04662426933646202, "learning_rate": 0.01, "loss": 2.1926, "step": 5499 }, { "epoch": 0.5652933319634234, "grad_norm": 0.05347858741879463, "learning_rate": 0.01, "loss": 2.1825, "step": 5502 }, { "epoch": 0.5656015616973185, "grad_norm": 0.06205238029360771, "learning_rate": 0.01, "loss": 2.1537, "step": 5505 }, { "epoch": 0.5659097914312134, "grad_norm": 0.05525955557823181, "learning_rate": 0.01, "loss": 2.1802, "step": 5508 }, { "epoch": 0.5662180211651084, "grad_norm": 0.055693045258522034, "learning_rate": 0.01, "loss": 2.1727, "step": 5511 }, { "epoch": 0.5665262508990034, "grad_norm": 0.051134396344423294, "learning_rate": 0.01, "loss": 2.1695, "step": 5514 }, { "epoch": 0.5668344806328984, "grad_norm": 0.05469521880149841, "learning_rate": 0.01, "loss": 2.1664, "step": 5517 }, { "epoch": 0.5671427103667934, "grad_norm": 0.039416272193193436, "learning_rate": 0.01, "loss": 2.1708, "step": 5520 }, { "epoch": 0.5674509401006884, "grad_norm": 0.10661659389734268, "learning_rate": 0.01, "loss": 2.1753, "step": 5523 }, { "epoch": 0.5677591698345834, "grad_norm": 0.07567829638719559, "learning_rate": 0.01, "loss": 2.1733, "step": 5526 }, { "epoch": 0.5680673995684784, "grad_norm": 0.06030309572815895, "learning_rate": 0.01, "loss": 2.1795, "step": 5529 }, { "epoch": 0.5683756293023734, "grad_norm": 0.07429811358451843, "learning_rate": 0.01, "loss": 2.1936, "step": 5532 }, { "epoch": 0.5686838590362684, "grad_norm": 0.08618849515914917, "learning_rate": 0.01, "loss": 2.2009, "step": 5535 }, { "epoch": 0.5689920887701634, "grad_norm": 0.04969833791255951, "learning_rate": 0.01, "loss": 2.1711, "step": 5538 }, { "epoch": 0.5693003185040584, "grad_norm": 0.11154712736606598, "learning_rate": 0.01, "loss": 2.1802, "step": 5541 }, { "epoch": 0.5696085482379534, "grad_norm": 0.07754155993461609, "learning_rate": 0.01, "loss": 2.164, "step": 5544 }, { "epoch": 0.5699167779718484, "grad_norm": 0.04600273445248604, "learning_rate": 0.01, "loss": 2.1918, "step": 5547 }, { "epoch": 0.5702250077057434, "grad_norm": 0.06788063049316406, "learning_rate": 0.01, "loss": 2.1477, "step": 5550 }, { "epoch": 0.5705332374396384, "grad_norm": 0.11349419504404068, "learning_rate": 0.01, "loss": 2.1603, "step": 5553 }, { "epoch": 0.5708414671735333, "grad_norm": 0.11178430914878845, "learning_rate": 0.01, "loss": 2.1439, "step": 5556 }, { "epoch": 0.5711496969074283, "grad_norm": 0.050257500261068344, "learning_rate": 0.01, "loss": 2.1851, "step": 5559 }, { "epoch": 0.5714579266413233, "grad_norm": 0.08327756822109222, "learning_rate": 0.01, "loss": 2.158, "step": 5562 }, { "epoch": 0.5717661563752183, "grad_norm": 0.06866388767957687, "learning_rate": 0.01, "loss": 2.1584, "step": 5565 }, { "epoch": 0.5720743861091133, "grad_norm": 0.1139674037694931, "learning_rate": 0.01, "loss": 2.1897, "step": 5568 }, { "epoch": 0.5723826158430083, "grad_norm": 0.07029612362384796, "learning_rate": 0.01, "loss": 2.1723, "step": 5571 }, { "epoch": 0.5726908455769033, "grad_norm": 0.10171212255954742, "learning_rate": 0.01, "loss": 2.1788, "step": 5574 }, { "epoch": 0.5729990753107983, "grad_norm": 0.11364202946424484, "learning_rate": 0.01, "loss": 2.1659, "step": 5577 }, { "epoch": 0.5733073050446933, "grad_norm": 0.08066857606172562, "learning_rate": 0.01, "loss": 2.1902, "step": 5580 }, { "epoch": 0.5736155347785883, "grad_norm": 0.09207342565059662, "learning_rate": 0.01, "loss": 2.1519, "step": 5583 }, { "epoch": 0.5739237645124833, "grad_norm": 0.06945987790822983, "learning_rate": 0.01, "loss": 2.1677, "step": 5586 }, { "epoch": 0.5742319942463783, "grad_norm": 0.05137445777654648, "learning_rate": 0.01, "loss": 2.1686, "step": 5589 }, { "epoch": 0.5745402239802733, "grad_norm": 0.10192268341779709, "learning_rate": 0.01, "loss": 2.1758, "step": 5592 }, { "epoch": 0.5748484537141683, "grad_norm": 0.056787896901369095, "learning_rate": 0.01, "loss": 2.1642, "step": 5595 }, { "epoch": 0.5751566834480633, "grad_norm": 0.07727455347776413, "learning_rate": 0.01, "loss": 2.1662, "step": 5598 }, { "epoch": 0.5754649131819582, "grad_norm": 0.1311456710100174, "learning_rate": 0.01, "loss": 2.1713, "step": 5601 }, { "epoch": 0.5757731429158532, "grad_norm": 0.1014258936047554, "learning_rate": 0.01, "loss": 2.1751, "step": 5604 }, { "epoch": 0.5760813726497482, "grad_norm": 0.06325560063123703, "learning_rate": 0.01, "loss": 2.1757, "step": 5607 }, { "epoch": 0.5763896023836432, "grad_norm": 0.07262448221445084, "learning_rate": 0.01, "loss": 2.1575, "step": 5610 }, { "epoch": 0.5766978321175382, "grad_norm": 0.07160039991140366, "learning_rate": 0.01, "loss": 2.1706, "step": 5613 }, { "epoch": 0.5770060618514332, "grad_norm": 0.050024017691612244, "learning_rate": 0.01, "loss": 2.1811, "step": 5616 }, { "epoch": 0.5773142915853282, "grad_norm": 0.09685138612985611, "learning_rate": 0.01, "loss": 2.1549, "step": 5619 }, { "epoch": 0.5776225213192233, "grad_norm": 0.058329988270998, "learning_rate": 0.01, "loss": 2.1813, "step": 5622 }, { "epoch": 0.5779307510531183, "grad_norm": 0.06637705117464066, "learning_rate": 0.01, "loss": 2.1717, "step": 5625 }, { "epoch": 0.5782389807870133, "grad_norm": 0.0906175896525383, "learning_rate": 0.01, "loss": 2.1677, "step": 5628 }, { "epoch": 0.5785472105209083, "grad_norm": 0.06751519441604614, "learning_rate": 0.01, "loss": 2.1584, "step": 5631 }, { "epoch": 0.5788554402548033, "grad_norm": 0.04437318444252014, "learning_rate": 0.01, "loss": 2.2013, "step": 5634 }, { "epoch": 0.5791636699886983, "grad_norm": 0.04365368187427521, "learning_rate": 0.01, "loss": 2.1746, "step": 5637 }, { "epoch": 0.5794718997225933, "grad_norm": 0.04844829812645912, "learning_rate": 0.01, "loss": 2.1818, "step": 5640 }, { "epoch": 0.5797801294564883, "grad_norm": 0.04154437035322189, "learning_rate": 0.01, "loss": 2.1536, "step": 5643 }, { "epoch": 0.5800883591903833, "grad_norm": 0.07691098004579544, "learning_rate": 0.01, "loss": 2.1883, "step": 5646 }, { "epoch": 0.5803965889242783, "grad_norm": 0.07065980136394501, "learning_rate": 0.01, "loss": 2.154, "step": 5649 }, { "epoch": 0.5807048186581732, "grad_norm": 0.1051129475235939, "learning_rate": 0.01, "loss": 2.1447, "step": 5652 }, { "epoch": 0.5810130483920682, "grad_norm": 0.10921964794397354, "learning_rate": 0.01, "loss": 2.1777, "step": 5655 }, { "epoch": 0.5813212781259632, "grad_norm": 0.1120898649096489, "learning_rate": 0.01, "loss": 2.1617, "step": 5658 }, { "epoch": 0.5816295078598582, "grad_norm": 0.09593590348958969, "learning_rate": 0.01, "loss": 2.1457, "step": 5661 }, { "epoch": 0.5819377375937532, "grad_norm": 0.054108936339616776, "learning_rate": 0.01, "loss": 2.1793, "step": 5664 }, { "epoch": 0.5822459673276482, "grad_norm": 0.07890141755342484, "learning_rate": 0.01, "loss": 2.1676, "step": 5667 }, { "epoch": 0.5825541970615432, "grad_norm": 0.07864063233137131, "learning_rate": 0.01, "loss": 2.1766, "step": 5670 }, { "epoch": 0.5828624267954382, "grad_norm": 0.08160068094730377, "learning_rate": 0.01, "loss": 2.166, "step": 5673 }, { "epoch": 0.5831706565293332, "grad_norm": 0.08126121759414673, "learning_rate": 0.01, "loss": 2.1691, "step": 5676 }, { "epoch": 0.5834788862632282, "grad_norm": 0.05922897160053253, "learning_rate": 0.01, "loss": 2.1854, "step": 5679 }, { "epoch": 0.5837871159971232, "grad_norm": 0.1024496778845787, "learning_rate": 0.01, "loss": 2.1818, "step": 5682 }, { "epoch": 0.5840953457310182, "grad_norm": 0.08880037069320679, "learning_rate": 0.01, "loss": 2.2054, "step": 5685 }, { "epoch": 0.5844035754649132, "grad_norm": 0.04404953494668007, "learning_rate": 0.01, "loss": 2.1524, "step": 5688 }, { "epoch": 0.5847118051988082, "grad_norm": 0.05817687511444092, "learning_rate": 0.01, "loss": 2.1813, "step": 5691 }, { "epoch": 0.5850200349327032, "grad_norm": 0.047581762075424194, "learning_rate": 0.01, "loss": 2.1545, "step": 5694 }, { "epoch": 0.5853282646665982, "grad_norm": 0.11034911125898361, "learning_rate": 0.01, "loss": 2.1803, "step": 5697 }, { "epoch": 0.5856364944004931, "grad_norm": 0.05118945613503456, "learning_rate": 0.01, "loss": 2.1314, "step": 5700 }, { "epoch": 0.5859447241343881, "grad_norm": 0.048316795378923416, "learning_rate": 0.01, "loss": 2.1711, "step": 5703 }, { "epoch": 0.5862529538682831, "grad_norm": 0.12578584253787994, "learning_rate": 0.01, "loss": 2.1636, "step": 5706 }, { "epoch": 0.5865611836021781, "grad_norm": 0.06594375520944595, "learning_rate": 0.01, "loss": 2.1977, "step": 5709 }, { "epoch": 0.5868694133360731, "grad_norm": 0.060622964054346085, "learning_rate": 0.01, "loss": 2.1408, "step": 5712 }, { "epoch": 0.5871776430699681, "grad_norm": 0.10055366903543472, "learning_rate": 0.01, "loss": 2.1999, "step": 5715 }, { "epoch": 0.5874858728038631, "grad_norm": 0.10235504060983658, "learning_rate": 0.01, "loss": 2.1337, "step": 5718 }, { "epoch": 0.5877941025377581, "grad_norm": 0.07707791030406952, "learning_rate": 0.01, "loss": 2.1387, "step": 5721 }, { "epoch": 0.5881023322716531, "grad_norm": 0.05508594587445259, "learning_rate": 0.01, "loss": 2.1494, "step": 5724 }, { "epoch": 0.5884105620055481, "grad_norm": 0.06580860912799835, "learning_rate": 0.01, "loss": 2.1598, "step": 5727 }, { "epoch": 0.5887187917394431, "grad_norm": 0.07102775573730469, "learning_rate": 0.01, "loss": 2.1618, "step": 5730 }, { "epoch": 0.5890270214733381, "grad_norm": 0.06750554591417313, "learning_rate": 0.01, "loss": 2.1782, "step": 5733 }, { "epoch": 0.5893352512072332, "grad_norm": 0.07100195437669754, "learning_rate": 0.01, "loss": 2.1456, "step": 5736 }, { "epoch": 0.5896434809411282, "grad_norm": 0.10585575550794601, "learning_rate": 0.01, "loss": 2.1751, "step": 5739 }, { "epoch": 0.5899517106750232, "grad_norm": 0.055082373321056366, "learning_rate": 0.01, "loss": 2.1808, "step": 5742 }, { "epoch": 0.5902599404089182, "grad_norm": 0.06285014003515244, "learning_rate": 0.01, "loss": 2.1588, "step": 5745 }, { "epoch": 0.5905681701428132, "grad_norm": 0.13328112661838531, "learning_rate": 0.01, "loss": 2.177, "step": 5748 }, { "epoch": 0.5908763998767081, "grad_norm": 0.08568006008863449, "learning_rate": 0.01, "loss": 2.1559, "step": 5751 }, { "epoch": 0.5911846296106031, "grad_norm": 0.07850711792707443, "learning_rate": 0.01, "loss": 2.2047, "step": 5754 }, { "epoch": 0.5914928593444981, "grad_norm": 0.07706760615110397, "learning_rate": 0.01, "loss": 2.1602, "step": 5757 }, { "epoch": 0.5918010890783931, "grad_norm": 0.07512292265892029, "learning_rate": 0.01, "loss": 2.1871, "step": 5760 }, { "epoch": 0.5921093188122881, "grad_norm": 0.059620197862386703, "learning_rate": 0.01, "loss": 2.1484, "step": 5763 }, { "epoch": 0.5924175485461831, "grad_norm": 0.04021789878606796, "learning_rate": 0.01, "loss": 2.1651, "step": 5766 }, { "epoch": 0.5927257782800781, "grad_norm": 0.050683967769145966, "learning_rate": 0.01, "loss": 2.1693, "step": 5769 }, { "epoch": 0.5930340080139731, "grad_norm": 0.07091210782527924, "learning_rate": 0.01, "loss": 2.1851, "step": 5772 }, { "epoch": 0.5933422377478681, "grad_norm": 0.09877889603376389, "learning_rate": 0.01, "loss": 2.1642, "step": 5775 }, { "epoch": 0.5936504674817631, "grad_norm": 0.08692251145839691, "learning_rate": 0.01, "loss": 2.1755, "step": 5778 }, { "epoch": 0.5939586972156581, "grad_norm": 0.06255677342414856, "learning_rate": 0.01, "loss": 2.1634, "step": 5781 }, { "epoch": 0.5942669269495531, "grad_norm": 0.05615478754043579, "learning_rate": 0.01, "loss": 2.1909, "step": 5784 }, { "epoch": 0.5945751566834481, "grad_norm": 0.04576956480741501, "learning_rate": 0.01, "loss": 2.1519, "step": 5787 }, { "epoch": 0.5948833864173431, "grad_norm": 0.044911667704582214, "learning_rate": 0.01, "loss": 2.1697, "step": 5790 }, { "epoch": 0.5951916161512381, "grad_norm": 0.07787128537893295, "learning_rate": 0.01, "loss": 2.1611, "step": 5793 }, { "epoch": 0.595499845885133, "grad_norm": 0.06199866533279419, "learning_rate": 0.01, "loss": 2.1576, "step": 5796 }, { "epoch": 0.595808075619028, "grad_norm": 0.07048948854207993, "learning_rate": 0.01, "loss": 2.1721, "step": 5799 }, { "epoch": 0.596116305352923, "grad_norm": 0.1173306256532669, "learning_rate": 0.01, "loss": 2.1573, "step": 5802 }, { "epoch": 0.596424535086818, "grad_norm": 0.06866045296192169, "learning_rate": 0.01, "loss": 2.1606, "step": 5805 }, { "epoch": 0.596732764820713, "grad_norm": 0.06821485608816147, "learning_rate": 0.01, "loss": 2.1842, "step": 5808 }, { "epoch": 0.597040994554608, "grad_norm": 0.09566816687583923, "learning_rate": 0.01, "loss": 2.1569, "step": 5811 }, { "epoch": 0.597349224288503, "grad_norm": 0.1130233108997345, "learning_rate": 0.01, "loss": 2.1649, "step": 5814 }, { "epoch": 0.597657454022398, "grad_norm": 0.07310149073600769, "learning_rate": 0.01, "loss": 2.1798, "step": 5817 }, { "epoch": 0.597965683756293, "grad_norm": 0.04523763060569763, "learning_rate": 0.01, "loss": 2.1515, "step": 5820 }, { "epoch": 0.598273913490188, "grad_norm": 0.05843660235404968, "learning_rate": 0.01, "loss": 2.1403, "step": 5823 }, { "epoch": 0.598582143224083, "grad_norm": 0.03981595113873482, "learning_rate": 0.01, "loss": 2.1598, "step": 5826 }, { "epoch": 0.598890372957978, "grad_norm": 0.057108644396066666, "learning_rate": 0.01, "loss": 2.1619, "step": 5829 }, { "epoch": 0.599198602691873, "grad_norm": 0.12298591434955597, "learning_rate": 0.01, "loss": 2.125, "step": 5832 }, { "epoch": 0.599506832425768, "grad_norm": 0.06120186299085617, "learning_rate": 0.01, "loss": 2.181, "step": 5835 }, { "epoch": 0.599815062159663, "grad_norm": 0.05780164897441864, "learning_rate": 0.01, "loss": 2.1555, "step": 5838 }, { "epoch": 0.600123291893558, "grad_norm": 0.0962534248828888, "learning_rate": 0.01, "loss": 2.1626, "step": 5841 }, { "epoch": 0.600431521627453, "grad_norm": 0.07417720556259155, "learning_rate": 0.01, "loss": 2.1996, "step": 5844 }, { "epoch": 0.6007397513613479, "grad_norm": 0.08221522718667984, "learning_rate": 0.01, "loss": 2.1562, "step": 5847 }, { "epoch": 0.6010479810952429, "grad_norm": 0.11511900275945663, "learning_rate": 0.01, "loss": 2.196, "step": 5850 }, { "epoch": 0.601356210829138, "grad_norm": 0.1331305354833603, "learning_rate": 0.01, "loss": 2.1649, "step": 5853 }, { "epoch": 0.601664440563033, "grad_norm": 0.07239941507577896, "learning_rate": 0.01, "loss": 2.1695, "step": 5856 }, { "epoch": 0.601972670296928, "grad_norm": 0.05865192040801048, "learning_rate": 0.01, "loss": 2.1808, "step": 5859 }, { "epoch": 0.602280900030823, "grad_norm": 0.047268107533454895, "learning_rate": 0.01, "loss": 2.176, "step": 5862 }, { "epoch": 0.602589129764718, "grad_norm": 0.046770863234996796, "learning_rate": 0.01, "loss": 2.1689, "step": 5865 }, { "epoch": 0.602897359498613, "grad_norm": 0.04817832633852959, "learning_rate": 0.01, "loss": 2.1566, "step": 5868 }, { "epoch": 0.603205589232508, "grad_norm": 0.05692889541387558, "learning_rate": 0.01, "loss": 2.1564, "step": 5871 }, { "epoch": 0.603513818966403, "grad_norm": 0.056694116443395615, "learning_rate": 0.01, "loss": 2.1591, "step": 5874 }, { "epoch": 0.603822048700298, "grad_norm": 0.08296339213848114, "learning_rate": 0.01, "loss": 2.1695, "step": 5877 }, { "epoch": 0.604130278434193, "grad_norm": 0.0934629738330841, "learning_rate": 0.01, "loss": 2.1472, "step": 5880 }, { "epoch": 0.604438508168088, "grad_norm": 0.10192359238862991, "learning_rate": 0.01, "loss": 2.1441, "step": 5883 }, { "epoch": 0.604746737901983, "grad_norm": 0.04818946123123169, "learning_rate": 0.01, "loss": 2.1747, "step": 5886 }, { "epoch": 0.605054967635878, "grad_norm": 0.10131523758172989, "learning_rate": 0.01, "loss": 2.1546, "step": 5889 }, { "epoch": 0.605363197369773, "grad_norm": 0.07115977257490158, "learning_rate": 0.01, "loss": 2.1597, "step": 5892 }, { "epoch": 0.605671427103668, "grad_norm": 0.03929082304239273, "learning_rate": 0.01, "loss": 2.171, "step": 5895 }, { "epoch": 0.6059796568375629, "grad_norm": 0.04109720513224602, "learning_rate": 0.01, "loss": 2.134, "step": 5898 }, { "epoch": 0.6062878865714579, "grad_norm": 0.05026080831885338, "learning_rate": 0.01, "loss": 2.1491, "step": 5901 }, { "epoch": 0.6065961163053529, "grad_norm": 0.08281126618385315, "learning_rate": 0.01, "loss": 2.1732, "step": 5904 }, { "epoch": 0.6069043460392479, "grad_norm": 0.04994012042880058, "learning_rate": 0.01, "loss": 2.1664, "step": 5907 }, { "epoch": 0.6072125757731429, "grad_norm": 0.06299131363630295, "learning_rate": 0.01, "loss": 2.1669, "step": 5910 }, { "epoch": 0.6075208055070379, "grad_norm": 0.059428080916404724, "learning_rate": 0.01, "loss": 2.1731, "step": 5913 }, { "epoch": 0.6078290352409329, "grad_norm": 0.07036252319812775, "learning_rate": 0.01, "loss": 2.1787, "step": 5916 }, { "epoch": 0.6081372649748279, "grad_norm": 0.04721888527274132, "learning_rate": 0.01, "loss": 2.1531, "step": 5919 }, { "epoch": 0.6084454947087229, "grad_norm": 0.06953759491443634, "learning_rate": 0.01, "loss": 2.1573, "step": 5922 }, { "epoch": 0.6087537244426179, "grad_norm": 0.11679168790578842, "learning_rate": 0.01, "loss": 2.155, "step": 5925 }, { "epoch": 0.6090619541765129, "grad_norm": 0.09196575731039047, "learning_rate": 0.01, "loss": 2.1574, "step": 5928 }, { "epoch": 0.6093701839104079, "grad_norm": 0.05219469591975212, "learning_rate": 0.01, "loss": 2.1605, "step": 5931 }, { "epoch": 0.6096784136443029, "grad_norm": 0.09352759271860123, "learning_rate": 0.01, "loss": 2.1456, "step": 5934 }, { "epoch": 0.6099866433781979, "grad_norm": 0.07393237948417664, "learning_rate": 0.01, "loss": 2.1611, "step": 5937 }, { "epoch": 0.6102948731120929, "grad_norm": 0.06727741658687592, "learning_rate": 0.01, "loss": 2.1599, "step": 5940 }, { "epoch": 0.6106031028459878, "grad_norm": 0.09024669975042343, "learning_rate": 0.01, "loss": 2.1621, "step": 5943 }, { "epoch": 0.6109113325798828, "grad_norm": 0.04514656960964203, "learning_rate": 0.01, "loss": 2.1809, "step": 5946 }, { "epoch": 0.6112195623137778, "grad_norm": 0.04011565446853638, "learning_rate": 0.01, "loss": 2.1715, "step": 5949 }, { "epoch": 0.6115277920476728, "grad_norm": 0.04640655592083931, "learning_rate": 0.01, "loss": 2.15, "step": 5952 }, { "epoch": 0.6118360217815678, "grad_norm": 0.0471080057322979, "learning_rate": 0.01, "loss": 2.1805, "step": 5955 }, { "epoch": 0.6121442515154628, "grad_norm": 0.17398513853549957, "learning_rate": 0.01, "loss": 2.1497, "step": 5958 }, { "epoch": 0.6124524812493578, "grad_norm": 0.06299551576375961, "learning_rate": 0.01, "loss": 2.1387, "step": 5961 }, { "epoch": 0.6127607109832528, "grad_norm": 0.07517322897911072, "learning_rate": 0.01, "loss": 2.1348, "step": 5964 }, { "epoch": 0.6130689407171478, "grad_norm": 0.050515878945589066, "learning_rate": 0.01, "loss": 2.1725, "step": 5967 }, { "epoch": 0.6133771704510429, "grad_norm": 0.04682675376534462, "learning_rate": 0.01, "loss": 2.1759, "step": 5970 }, { "epoch": 0.6136854001849379, "grad_norm": 0.05297816917300224, "learning_rate": 0.01, "loss": 2.1491, "step": 5973 }, { "epoch": 0.6139936299188329, "grad_norm": 0.07467235624790192, "learning_rate": 0.01, "loss": 2.1556, "step": 5976 }, { "epoch": 0.6143018596527279, "grad_norm": 0.06621374934911728, "learning_rate": 0.01, "loss": 2.1498, "step": 5979 }, { "epoch": 0.6146100893866229, "grad_norm": 0.0538405105471611, "learning_rate": 0.01, "loss": 2.1694, "step": 5982 }, { "epoch": 0.6149183191205179, "grad_norm": 0.09891212731599808, "learning_rate": 0.01, "loss": 2.1598, "step": 5985 }, { "epoch": 0.6152265488544129, "grad_norm": 0.042064208537340164, "learning_rate": 0.01, "loss": 2.1375, "step": 5988 }, { "epoch": 0.6155347785883079, "grad_norm": 0.06750064343214035, "learning_rate": 0.01, "loss": 2.1371, "step": 5991 }, { "epoch": 0.6158430083222028, "grad_norm": 0.0626809298992157, "learning_rate": 0.01, "loss": 2.1455, "step": 5994 }, { "epoch": 0.6161512380560978, "grad_norm": 0.04291335120797157, "learning_rate": 0.01, "loss": 2.1397, "step": 5997 }, { "epoch": 0.6164594677899928, "grad_norm": 0.05945251137018204, "learning_rate": 0.01, "loss": 2.1346, "step": 6000 }, { "epoch": 0.6167676975238878, "grad_norm": 0.15699933469295502, "learning_rate": 0.01, "loss": 2.1384, "step": 6003 }, { "epoch": 0.6170759272577828, "grad_norm": 0.06863987445831299, "learning_rate": 0.01, "loss": 2.1401, "step": 6006 }, { "epoch": 0.6173841569916778, "grad_norm": 0.04850529506802559, "learning_rate": 0.01, "loss": 2.1637, "step": 6009 }, { "epoch": 0.6176923867255728, "grad_norm": 0.05660491809248924, "learning_rate": 0.01, "loss": 2.1721, "step": 6012 }, { "epoch": 0.6180006164594678, "grad_norm": 0.050568364560604095, "learning_rate": 0.01, "loss": 2.1676, "step": 6015 }, { "epoch": 0.6183088461933628, "grad_norm": 0.060765717178583145, "learning_rate": 0.01, "loss": 2.127, "step": 6018 }, { "epoch": 0.6186170759272578, "grad_norm": 0.0731448233127594, "learning_rate": 0.01, "loss": 2.1531, "step": 6021 }, { "epoch": 0.6189253056611528, "grad_norm": 0.055431608110666275, "learning_rate": 0.01, "loss": 2.1662, "step": 6024 }, { "epoch": 0.6192335353950478, "grad_norm": 0.05376220867037773, "learning_rate": 0.01, "loss": 2.1465, "step": 6027 }, { "epoch": 0.6195417651289428, "grad_norm": 0.09729186445474625, "learning_rate": 0.01, "loss": 2.161, "step": 6030 }, { "epoch": 0.6198499948628378, "grad_norm": 0.08046093583106995, "learning_rate": 0.01, "loss": 2.1435, "step": 6033 }, { "epoch": 0.6201582245967328, "grad_norm": 0.09514495730400085, "learning_rate": 0.01, "loss": 2.1511, "step": 6036 }, { "epoch": 0.6204664543306277, "grad_norm": 0.056993287056684494, "learning_rate": 0.01, "loss": 2.1439, "step": 6039 }, { "epoch": 0.6207746840645227, "grad_norm": 0.06429582834243774, "learning_rate": 0.01, "loss": 2.1393, "step": 6042 }, { "epoch": 0.6210829137984177, "grad_norm": 0.1299380660057068, "learning_rate": 0.01, "loss": 2.1831, "step": 6045 }, { "epoch": 0.6213911435323127, "grad_norm": 0.13815906643867493, "learning_rate": 0.01, "loss": 2.1645, "step": 6048 }, { "epoch": 0.6216993732662077, "grad_norm": 0.056314874440431595, "learning_rate": 0.01, "loss": 2.1417, "step": 6051 }, { "epoch": 0.6220076030001027, "grad_norm": 0.06146218627691269, "learning_rate": 0.01, "loss": 2.1418, "step": 6054 }, { "epoch": 0.6223158327339977, "grad_norm": 0.062167149037122726, "learning_rate": 0.01, "loss": 2.1778, "step": 6057 }, { "epoch": 0.6226240624678927, "grad_norm": 0.059581879526376724, "learning_rate": 0.01, "loss": 2.1725, "step": 6060 }, { "epoch": 0.6229322922017877, "grad_norm": 0.044389910995960236, "learning_rate": 0.01, "loss": 2.1553, "step": 6063 }, { "epoch": 0.6232405219356827, "grad_norm": 0.036525238305330276, "learning_rate": 0.01, "loss": 2.1545, "step": 6066 }, { "epoch": 0.6235487516695777, "grad_norm": 0.0995573177933693, "learning_rate": 0.01, "loss": 2.1566, "step": 6069 }, { "epoch": 0.6238569814034727, "grad_norm": 0.10412520170211792, "learning_rate": 0.01, "loss": 2.1525, "step": 6072 }, { "epoch": 0.6241652111373677, "grad_norm": 0.10417335480451584, "learning_rate": 0.01, "loss": 2.1535, "step": 6075 }, { "epoch": 0.6244734408712627, "grad_norm": 0.09024351090192795, "learning_rate": 0.01, "loss": 2.1551, "step": 6078 }, { "epoch": 0.6247816706051577, "grad_norm": 0.04889573156833649, "learning_rate": 0.01, "loss": 2.1549, "step": 6081 }, { "epoch": 0.6250899003390528, "grad_norm": 0.05154373124241829, "learning_rate": 0.01, "loss": 2.1461, "step": 6084 }, { "epoch": 0.6253981300729478, "grad_norm": 0.04337237402796745, "learning_rate": 0.01, "loss": 2.1733, "step": 6087 }, { "epoch": 0.6257063598068427, "grad_norm": 0.06173473224043846, "learning_rate": 0.01, "loss": 2.1657, "step": 6090 }, { "epoch": 0.6260145895407377, "grad_norm": 0.06174352392554283, "learning_rate": 0.01, "loss": 2.1528, "step": 6093 }, { "epoch": 0.6263228192746327, "grad_norm": 0.07301110029220581, "learning_rate": 0.01, "loss": 2.1489, "step": 6096 }, { "epoch": 0.6266310490085277, "grad_norm": 0.04265190288424492, "learning_rate": 0.01, "loss": 2.1624, "step": 6099 }, { "epoch": 0.6269392787424227, "grad_norm": 0.056723251938819885, "learning_rate": 0.01, "loss": 2.1624, "step": 6102 }, { "epoch": 0.6272475084763177, "grad_norm": 0.06809309124946594, "learning_rate": 0.01, "loss": 2.1525, "step": 6105 }, { "epoch": 0.6275557382102127, "grad_norm": 0.06820474565029144, "learning_rate": 0.01, "loss": 2.1472, "step": 6108 }, { "epoch": 0.6278639679441077, "grad_norm": 0.05961904302239418, "learning_rate": 0.01, "loss": 2.1561, "step": 6111 }, { "epoch": 0.6281721976780027, "grad_norm": 0.04617665335536003, "learning_rate": 0.01, "loss": 2.1475, "step": 6114 }, { "epoch": 0.6284804274118977, "grad_norm": 0.040670618414878845, "learning_rate": 0.01, "loss": 2.153, "step": 6117 }, { "epoch": 0.6287886571457927, "grad_norm": 0.09909021109342575, "learning_rate": 0.01, "loss": 2.141, "step": 6120 }, { "epoch": 0.6290968868796877, "grad_norm": 0.04966261237859726, "learning_rate": 0.01, "loss": 2.1264, "step": 6123 }, { "epoch": 0.6294051166135827, "grad_norm": 0.0570046491920948, "learning_rate": 0.01, "loss": 2.1572, "step": 6126 }, { "epoch": 0.6297133463474777, "grad_norm": 0.10374405980110168, "learning_rate": 0.01, "loss": 2.149, "step": 6129 }, { "epoch": 0.6300215760813727, "grad_norm": 0.061325542628765106, "learning_rate": 0.01, "loss": 2.1521, "step": 6132 }, { "epoch": 0.6303298058152677, "grad_norm": 0.16151310503482819, "learning_rate": 0.01, "loss": 2.1825, "step": 6135 }, { "epoch": 0.6306380355491626, "grad_norm": 0.0921199768781662, "learning_rate": 0.01, "loss": 2.1773, "step": 6138 }, { "epoch": 0.6309462652830576, "grad_norm": 0.05603238567709923, "learning_rate": 0.01, "loss": 2.1452, "step": 6141 }, { "epoch": 0.6312544950169526, "grad_norm": 0.12173126637935638, "learning_rate": 0.01, "loss": 2.1713, "step": 6144 }, { "epoch": 0.6315627247508476, "grad_norm": 0.04609265923500061, "learning_rate": 0.01, "loss": 2.1518, "step": 6147 }, { "epoch": 0.6318709544847426, "grad_norm": 0.06445127725601196, "learning_rate": 0.01, "loss": 2.162, "step": 6150 }, { "epoch": 0.6321791842186376, "grad_norm": 0.05396106466650963, "learning_rate": 0.01, "loss": 2.1248, "step": 6153 }, { "epoch": 0.6324874139525326, "grad_norm": 0.06955734640359879, "learning_rate": 0.01, "loss": 2.1497, "step": 6156 }, { "epoch": 0.6327956436864276, "grad_norm": 0.04371445253491402, "learning_rate": 0.01, "loss": 2.1167, "step": 6159 }, { "epoch": 0.6331038734203226, "grad_norm": 0.07146921008825302, "learning_rate": 0.01, "loss": 2.1633, "step": 6162 }, { "epoch": 0.6334121031542176, "grad_norm": 0.08056561648845673, "learning_rate": 0.01, "loss": 2.1506, "step": 6165 }, { "epoch": 0.6337203328881126, "grad_norm": 0.08875605463981628, "learning_rate": 0.01, "loss": 2.1834, "step": 6168 }, { "epoch": 0.6340285626220076, "grad_norm": 0.05090434104204178, "learning_rate": 0.01, "loss": 2.1514, "step": 6171 }, { "epoch": 0.6343367923559026, "grad_norm": 0.11710961163043976, "learning_rate": 0.01, "loss": 2.1589, "step": 6174 }, { "epoch": 0.6346450220897976, "grad_norm": 0.04704523831605911, "learning_rate": 0.01, "loss": 2.1469, "step": 6177 }, { "epoch": 0.6349532518236926, "grad_norm": 0.045143596827983856, "learning_rate": 0.01, "loss": 2.1311, "step": 6180 }, { "epoch": 0.6352614815575875, "grad_norm": 0.04246919974684715, "learning_rate": 0.01, "loss": 2.1481, "step": 6183 }, { "epoch": 0.6355697112914825, "grad_norm": 0.04303867742419243, "learning_rate": 0.01, "loss": 2.1557, "step": 6186 }, { "epoch": 0.6358779410253775, "grad_norm": 0.17376503348350525, "learning_rate": 0.01, "loss": 2.1616, "step": 6189 }, { "epoch": 0.6361861707592725, "grad_norm": 0.11983154714107513, "learning_rate": 0.01, "loss": 2.1569, "step": 6192 }, { "epoch": 0.6364944004931675, "grad_norm": 0.0443497858941555, "learning_rate": 0.01, "loss": 2.1454, "step": 6195 }, { "epoch": 0.6368026302270625, "grad_norm": 0.04241250827908516, "learning_rate": 0.01, "loss": 2.1409, "step": 6198 }, { "epoch": 0.6371108599609576, "grad_norm": 0.07058902829885483, "learning_rate": 0.01, "loss": 2.1246, "step": 6201 }, { "epoch": 0.6374190896948526, "grad_norm": 0.060852985829114914, "learning_rate": 0.01, "loss": 2.1512, "step": 6204 }, { "epoch": 0.6377273194287476, "grad_norm": 0.058703117072582245, "learning_rate": 0.01, "loss": 2.1114, "step": 6207 }, { "epoch": 0.6380355491626426, "grad_norm": 0.08501632511615753, "learning_rate": 0.01, "loss": 2.1818, "step": 6210 }, { "epoch": 0.6383437788965376, "grad_norm": 0.07715412974357605, "learning_rate": 0.01, "loss": 2.1661, "step": 6213 }, { "epoch": 0.6386520086304326, "grad_norm": 0.06822165101766586, "learning_rate": 0.01, "loss": 2.1652, "step": 6216 }, { "epoch": 0.6389602383643276, "grad_norm": 0.048459213227033615, "learning_rate": 0.01, "loss": 2.1311, "step": 6219 }, { "epoch": 0.6392684680982226, "grad_norm": 0.08208850026130676, "learning_rate": 0.01, "loss": 2.1316, "step": 6222 }, { "epoch": 0.6395766978321176, "grad_norm": 0.06399821490049362, "learning_rate": 0.01, "loss": 2.1354, "step": 6225 }, { "epoch": 0.6398849275660126, "grad_norm": 0.12036826461553574, "learning_rate": 0.01, "loss": 2.1509, "step": 6228 }, { "epoch": 0.6401931572999076, "grad_norm": 0.08180755376815796, "learning_rate": 0.01, "loss": 2.1571, "step": 6231 }, { "epoch": 0.6405013870338025, "grad_norm": 0.053771521896123886, "learning_rate": 0.01, "loss": 2.1485, "step": 6234 }, { "epoch": 0.6408096167676975, "grad_norm": 0.042291607707738876, "learning_rate": 0.01, "loss": 2.1606, "step": 6237 }, { "epoch": 0.6411178465015925, "grad_norm": 0.044655315577983856, "learning_rate": 0.01, "loss": 2.1592, "step": 6240 }, { "epoch": 0.6414260762354875, "grad_norm": 0.07763859629631042, "learning_rate": 0.01, "loss": 2.1543, "step": 6243 }, { "epoch": 0.6417343059693825, "grad_norm": 0.055368274450302124, "learning_rate": 0.01, "loss": 2.1643, "step": 6246 }, { "epoch": 0.6420425357032775, "grad_norm": 0.047774944454431534, "learning_rate": 0.01, "loss": 2.1542, "step": 6249 }, { "epoch": 0.6423507654371725, "grad_norm": 0.06478223204612732, "learning_rate": 0.01, "loss": 2.1501, "step": 6252 }, { "epoch": 0.6426589951710675, "grad_norm": 0.03782160207629204, "learning_rate": 0.01, "loss": 2.1455, "step": 6255 }, { "epoch": 0.6429672249049625, "grad_norm": 0.11297930777072906, "learning_rate": 0.01, "loss": 2.1667, "step": 6258 }, { "epoch": 0.6432754546388575, "grad_norm": 0.09408997744321823, "learning_rate": 0.01, "loss": 2.146, "step": 6261 }, { "epoch": 0.6435836843727525, "grad_norm": 0.06677352637052536, "learning_rate": 0.01, "loss": 2.1723, "step": 6264 }, { "epoch": 0.6438919141066475, "grad_norm": 0.08687873184680939, "learning_rate": 0.01, "loss": 2.1517, "step": 6267 }, { "epoch": 0.6442001438405425, "grad_norm": 0.06850516051054001, "learning_rate": 0.01, "loss": 2.148, "step": 6270 }, { "epoch": 0.6445083735744375, "grad_norm": 0.07705084979534149, "learning_rate": 0.01, "loss": 2.1567, "step": 6273 }, { "epoch": 0.6448166033083325, "grad_norm": 0.1622423529624939, "learning_rate": 0.01, "loss": 2.1676, "step": 6276 }, { "epoch": 0.6451248330422275, "grad_norm": 0.11197759211063385, "learning_rate": 0.01, "loss": 2.1376, "step": 6279 }, { "epoch": 0.6454330627761224, "grad_norm": 0.06562814861536026, "learning_rate": 0.01, "loss": 2.1652, "step": 6282 }, { "epoch": 0.6457412925100174, "grad_norm": 0.0867902860045433, "learning_rate": 0.01, "loss": 2.1733, "step": 6285 }, { "epoch": 0.6460495222439124, "grad_norm": 0.08153738081455231, "learning_rate": 0.01, "loss": 2.1442, "step": 6288 }, { "epoch": 0.6463577519778074, "grad_norm": 0.09800709784030914, "learning_rate": 0.01, "loss": 2.1262, "step": 6291 }, { "epoch": 0.6466659817117024, "grad_norm": 0.07728230953216553, "learning_rate": 0.01, "loss": 2.139, "step": 6294 }, { "epoch": 0.6469742114455974, "grad_norm": 0.09658671170473099, "learning_rate": 0.01, "loss": 2.1421, "step": 6297 }, { "epoch": 0.6472824411794924, "grad_norm": 0.0448787659406662, "learning_rate": 0.01, "loss": 2.1415, "step": 6300 }, { "epoch": 0.6475906709133874, "grad_norm": 0.03848707675933838, "learning_rate": 0.01, "loss": 2.1209, "step": 6303 }, { "epoch": 0.6478989006472824, "grad_norm": 0.07465004920959473, "learning_rate": 0.01, "loss": 2.1395, "step": 6306 }, { "epoch": 0.6482071303811774, "grad_norm": 0.060424912720918655, "learning_rate": 0.01, "loss": 2.1806, "step": 6309 }, { "epoch": 0.6485153601150724, "grad_norm": 0.05204974114894867, "learning_rate": 0.01, "loss": 2.1287, "step": 6312 }, { "epoch": 0.6488235898489675, "grad_norm": 0.06045055389404297, "learning_rate": 0.01, "loss": 2.1727, "step": 6315 }, { "epoch": 0.6491318195828625, "grad_norm": 0.04978582262992859, "learning_rate": 0.01, "loss": 2.1264, "step": 6318 }, { "epoch": 0.6494400493167575, "grad_norm": 0.08131048828363419, "learning_rate": 0.01, "loss": 2.137, "step": 6321 }, { "epoch": 0.6497482790506525, "grad_norm": 0.09749994426965714, "learning_rate": 0.01, "loss": 2.1557, "step": 6324 }, { "epoch": 0.6500565087845475, "grad_norm": 0.06079535186290741, "learning_rate": 0.01, "loss": 2.1432, "step": 6327 }, { "epoch": 0.6503647385184425, "grad_norm": 0.08241060376167297, "learning_rate": 0.01, "loss": 2.1551, "step": 6330 }, { "epoch": 0.6506729682523374, "grad_norm": 0.12339378148317337, "learning_rate": 0.01, "loss": 2.1216, "step": 6333 }, { "epoch": 0.6509811979862324, "grad_norm": 0.0660511702299118, "learning_rate": 0.01, "loss": 2.1156, "step": 6336 }, { "epoch": 0.6512894277201274, "grad_norm": 0.06279938668012619, "learning_rate": 0.01, "loss": 2.1778, "step": 6339 }, { "epoch": 0.6515976574540224, "grad_norm": 0.068712018430233, "learning_rate": 0.01, "loss": 2.1348, "step": 6342 }, { "epoch": 0.6519058871879174, "grad_norm": 0.05808734893798828, "learning_rate": 0.01, "loss": 2.135, "step": 6345 }, { "epoch": 0.6522141169218124, "grad_norm": 0.044942643493413925, "learning_rate": 0.01, "loss": 2.1613, "step": 6348 }, { "epoch": 0.6525223466557074, "grad_norm": 0.11666214466094971, "learning_rate": 0.01, "loss": 2.1399, "step": 6351 }, { "epoch": 0.6528305763896024, "grad_norm": 0.06776747852563858, "learning_rate": 0.01, "loss": 2.1369, "step": 6354 }, { "epoch": 0.6531388061234974, "grad_norm": 0.10171874612569809, "learning_rate": 0.01, "loss": 2.1273, "step": 6357 }, { "epoch": 0.6534470358573924, "grad_norm": 0.04611232131719589, "learning_rate": 0.01, "loss": 2.1482, "step": 6360 }, { "epoch": 0.6537552655912874, "grad_norm": 0.042139992117881775, "learning_rate": 0.01, "loss": 2.1235, "step": 6363 }, { "epoch": 0.6540634953251824, "grad_norm": 0.057816632091999054, "learning_rate": 0.01, "loss": 2.1449, "step": 6366 }, { "epoch": 0.6543717250590774, "grad_norm": 0.11400949209928513, "learning_rate": 0.01, "loss": 2.1566, "step": 6369 }, { "epoch": 0.6546799547929724, "grad_norm": 0.07320736348628998, "learning_rate": 0.01, "loss": 2.1682, "step": 6372 }, { "epoch": 0.6549881845268674, "grad_norm": 0.07262291014194489, "learning_rate": 0.01, "loss": 2.1514, "step": 6375 }, { "epoch": 0.6552964142607623, "grad_norm": 0.05559679865837097, "learning_rate": 0.01, "loss": 2.1347, "step": 6378 }, { "epoch": 0.6556046439946573, "grad_norm": 0.049424149096012115, "learning_rate": 0.01, "loss": 2.1423, "step": 6381 }, { "epoch": 0.6559128737285523, "grad_norm": 0.05457301065325737, "learning_rate": 0.01, "loss": 2.1425, "step": 6384 }, { "epoch": 0.6562211034624473, "grad_norm": 0.058564141392707825, "learning_rate": 0.01, "loss": 2.1158, "step": 6387 }, { "epoch": 0.6565293331963423, "grad_norm": 0.10944786667823792, "learning_rate": 0.01, "loss": 2.146, "step": 6390 }, { "epoch": 0.6568375629302373, "grad_norm": 0.07760695368051529, "learning_rate": 0.01, "loss": 2.176, "step": 6393 }, { "epoch": 0.6571457926641323, "grad_norm": 0.07621042430400848, "learning_rate": 0.01, "loss": 2.1779, "step": 6396 }, { "epoch": 0.6574540223980273, "grad_norm": 0.09723789244890213, "learning_rate": 0.01, "loss": 2.1455, "step": 6399 }, { "epoch": 0.6577622521319223, "grad_norm": 0.05648832768201828, "learning_rate": 0.01, "loss": 2.154, "step": 6402 }, { "epoch": 0.6580704818658173, "grad_norm": 0.04370080679655075, "learning_rate": 0.01, "loss": 2.1374, "step": 6405 }, { "epoch": 0.6583787115997123, "grad_norm": 0.03729141131043434, "learning_rate": 0.01, "loss": 2.1275, "step": 6408 }, { "epoch": 0.6586869413336073, "grad_norm": 0.055584125220775604, "learning_rate": 0.01, "loss": 2.1442, "step": 6411 }, { "epoch": 0.6589951710675023, "grad_norm": 0.07981918007135391, "learning_rate": 0.01, "loss": 2.1618, "step": 6414 }, { "epoch": 0.6593034008013973, "grad_norm": 0.09241674095392227, "learning_rate": 0.01, "loss": 2.1519, "step": 6417 }, { "epoch": 0.6596116305352923, "grad_norm": 0.10454630106687546, "learning_rate": 0.01, "loss": 2.1309, "step": 6420 }, { "epoch": 0.6599198602691873, "grad_norm": 0.08674053847789764, "learning_rate": 0.01, "loss": 2.1617, "step": 6423 }, { "epoch": 0.6602280900030822, "grad_norm": 0.06003529578447342, "learning_rate": 0.01, "loss": 2.1475, "step": 6426 }, { "epoch": 0.6605363197369772, "grad_norm": 0.07370956987142563, "learning_rate": 0.01, "loss": 2.1466, "step": 6429 }, { "epoch": 0.6608445494708723, "grad_norm": 0.05090004578232765, "learning_rate": 0.01, "loss": 2.1506, "step": 6432 }, { "epoch": 0.6611527792047673, "grad_norm": 0.06062362715601921, "learning_rate": 0.01, "loss": 2.1601, "step": 6435 }, { "epoch": 0.6614610089386623, "grad_norm": 0.05484107881784439, "learning_rate": 0.01, "loss": 2.1452, "step": 6438 }, { "epoch": 0.6617692386725573, "grad_norm": 0.1367156058549881, "learning_rate": 0.01, "loss": 2.1586, "step": 6441 }, { "epoch": 0.6620774684064523, "grad_norm": 0.05140338093042374, "learning_rate": 0.01, "loss": 2.1463, "step": 6444 }, { "epoch": 0.6623856981403473, "grad_norm": 0.09168683737516403, "learning_rate": 0.01, "loss": 2.1467, "step": 6447 }, { "epoch": 0.6626939278742423, "grad_norm": 0.04098822921514511, "learning_rate": 0.01, "loss": 2.1648, "step": 6450 }, { "epoch": 0.6630021576081373, "grad_norm": 0.049763478338718414, "learning_rate": 0.01, "loss": 2.1289, "step": 6453 }, { "epoch": 0.6633103873420323, "grad_norm": 0.060069404542446136, "learning_rate": 0.01, "loss": 2.1467, "step": 6456 }, { "epoch": 0.6636186170759273, "grad_norm": 0.06611450761556625, "learning_rate": 0.01, "loss": 2.1599, "step": 6459 }, { "epoch": 0.6639268468098223, "grad_norm": 0.04955270141363144, "learning_rate": 0.01, "loss": 2.136, "step": 6462 }, { "epoch": 0.6642350765437173, "grad_norm": 0.04004522040486336, "learning_rate": 0.01, "loss": 2.1457, "step": 6465 }, { "epoch": 0.6645433062776123, "grad_norm": 0.06539756804704666, "learning_rate": 0.01, "loss": 2.1458, "step": 6468 }, { "epoch": 0.6648515360115073, "grad_norm": 0.10684728622436523, "learning_rate": 0.01, "loss": 2.1279, "step": 6471 }, { "epoch": 0.6651597657454023, "grad_norm": 0.09936464577913284, "learning_rate": 0.01, "loss": 2.1767, "step": 6474 }, { "epoch": 0.6654679954792972, "grad_norm": 0.04908827692270279, "learning_rate": 0.01, "loss": 2.1259, "step": 6477 }, { "epoch": 0.6657762252131922, "grad_norm": 0.048053622245788574, "learning_rate": 0.01, "loss": 2.1718, "step": 6480 }, { "epoch": 0.6660844549470872, "grad_norm": 0.05524458363652229, "learning_rate": 0.01, "loss": 2.1673, "step": 6483 }, { "epoch": 0.6663926846809822, "grad_norm": 0.05107030272483826, "learning_rate": 0.01, "loss": 2.13, "step": 6486 }, { "epoch": 0.6667009144148772, "grad_norm": 0.12472579628229141, "learning_rate": 0.01, "loss": 2.149, "step": 6489 }, { "epoch": 0.6670091441487722, "grad_norm": 0.05257454514503479, "learning_rate": 0.01, "loss": 2.1343, "step": 6492 }, { "epoch": 0.6673173738826672, "grad_norm": 0.05986837297677994, "learning_rate": 0.01, "loss": 2.1265, "step": 6495 }, { "epoch": 0.6676256036165622, "grad_norm": 0.08322940021753311, "learning_rate": 0.01, "loss": 2.1317, "step": 6498 }, { "epoch": 0.6679338333504572, "grad_norm": 0.0466473363339901, "learning_rate": 0.01, "loss": 2.1235, "step": 6501 }, { "epoch": 0.6682420630843522, "grad_norm": 0.05092160776257515, "learning_rate": 0.01, "loss": 2.1672, "step": 6504 }, { "epoch": 0.6685502928182472, "grad_norm": 0.08392294496297836, "learning_rate": 0.01, "loss": 2.1473, "step": 6507 }, { "epoch": 0.6688585225521422, "grad_norm": 0.042165517807006836, "learning_rate": 0.01, "loss": 2.1181, "step": 6510 }, { "epoch": 0.6691667522860372, "grad_norm": 0.06214481219649315, "learning_rate": 0.01, "loss": 2.138, "step": 6513 }, { "epoch": 0.6694749820199322, "grad_norm": 0.06087846681475639, "learning_rate": 0.01, "loss": 2.15, "step": 6516 }, { "epoch": 0.6697832117538272, "grad_norm": 0.047256652265787125, "learning_rate": 0.01, "loss": 2.1433, "step": 6519 }, { "epoch": 0.6700914414877222, "grad_norm": 0.10626421123743057, "learning_rate": 0.01, "loss": 2.156, "step": 6522 }, { "epoch": 0.6703996712216171, "grad_norm": 0.09426552802324295, "learning_rate": 0.01, "loss": 2.1472, "step": 6525 }, { "epoch": 0.6707079009555121, "grad_norm": 0.0632442831993103, "learning_rate": 0.01, "loss": 2.1536, "step": 6528 }, { "epoch": 0.6710161306894071, "grad_norm": 0.07149971276521683, "learning_rate": 0.01, "loss": 2.1694, "step": 6531 }, { "epoch": 0.6713243604233021, "grad_norm": 0.04060966521501541, "learning_rate": 0.01, "loss": 2.164, "step": 6534 }, { "epoch": 0.6716325901571971, "grad_norm": 0.20043891668319702, "learning_rate": 0.01, "loss": 2.125, "step": 6537 }, { "epoch": 0.6719408198910921, "grad_norm": 0.06755783408880234, "learning_rate": 0.01, "loss": 2.15, "step": 6540 }, { "epoch": 0.6722490496249871, "grad_norm": 0.0509268082678318, "learning_rate": 0.01, "loss": 2.1405, "step": 6543 }, { "epoch": 0.6725572793588821, "grad_norm": 0.04033916816115379, "learning_rate": 0.01, "loss": 2.136, "step": 6546 }, { "epoch": 0.6728655090927772, "grad_norm": 0.04707946255803108, "learning_rate": 0.01, "loss": 2.1514, "step": 6549 }, { "epoch": 0.6731737388266722, "grad_norm": 0.04360898956656456, "learning_rate": 0.01, "loss": 2.1518, "step": 6552 }, { "epoch": 0.6734819685605672, "grad_norm": 0.11959343403577805, "learning_rate": 0.01, "loss": 2.1377, "step": 6555 }, { "epoch": 0.6737901982944622, "grad_norm": 0.06620760262012482, "learning_rate": 0.01, "loss": 2.1419, "step": 6558 }, { "epoch": 0.6740984280283572, "grad_norm": 0.056747015565633774, "learning_rate": 0.01, "loss": 2.138, "step": 6561 }, { "epoch": 0.6744066577622522, "grad_norm": 0.05230560526251793, "learning_rate": 0.01, "loss": 2.1335, "step": 6564 }, { "epoch": 0.6747148874961472, "grad_norm": 0.0526299811899662, "learning_rate": 0.01, "loss": 2.131, "step": 6567 }, { "epoch": 0.6750231172300422, "grad_norm": 0.15683774650096893, "learning_rate": 0.01, "loss": 2.1167, "step": 6570 }, { "epoch": 0.6753313469639372, "grad_norm": 0.10133557766675949, "learning_rate": 0.01, "loss": 2.1219, "step": 6573 }, { "epoch": 0.6756395766978321, "grad_norm": 0.06826774775981903, "learning_rate": 0.01, "loss": 2.1416, "step": 6576 }, { "epoch": 0.6759478064317271, "grad_norm": 0.046236682683229446, "learning_rate": 0.01, "loss": 2.1704, "step": 6579 }, { "epoch": 0.6762560361656221, "grad_norm": 0.07654762268066406, "learning_rate": 0.01, "loss": 2.1411, "step": 6582 }, { "epoch": 0.6765642658995171, "grad_norm": 0.07760706543922424, "learning_rate": 0.01, "loss": 2.168, "step": 6585 }, { "epoch": 0.6768724956334121, "grad_norm": 0.04213540256023407, "learning_rate": 0.01, "loss": 2.1899, "step": 6588 }, { "epoch": 0.6771807253673071, "grad_norm": 0.0517420619726181, "learning_rate": 0.01, "loss": 2.1561, "step": 6591 }, { "epoch": 0.6774889551012021, "grad_norm": 0.04073292762041092, "learning_rate": 0.01, "loss": 2.1475, "step": 6594 }, { "epoch": 0.6777971848350971, "grad_norm": 0.11223835498094559, "learning_rate": 0.01, "loss": 2.1102, "step": 6597 }, { "epoch": 0.6781054145689921, "grad_norm": 0.08094224333763123, "learning_rate": 0.01, "loss": 2.1537, "step": 6600 }, { "epoch": 0.6784136443028871, "grad_norm": 0.036313675343990326, "learning_rate": 0.01, "loss": 2.1471, "step": 6603 }, { "epoch": 0.6787218740367821, "grad_norm": 0.09553749114274979, "learning_rate": 0.01, "loss": 2.1445, "step": 6606 }, { "epoch": 0.6790301037706771, "grad_norm": 0.07334265112876892, "learning_rate": 0.01, "loss": 2.1594, "step": 6609 }, { "epoch": 0.6793383335045721, "grad_norm": 0.12031051516532898, "learning_rate": 0.01, "loss": 2.1321, "step": 6612 }, { "epoch": 0.6796465632384671, "grad_norm": 0.08834968507289886, "learning_rate": 0.01, "loss": 2.1474, "step": 6615 }, { "epoch": 0.679954792972362, "grad_norm": 0.05016850307583809, "learning_rate": 0.01, "loss": 2.1582, "step": 6618 }, { "epoch": 0.680263022706257, "grad_norm": 0.039213377982378006, "learning_rate": 0.01, "loss": 2.1461, "step": 6621 }, { "epoch": 0.680571252440152, "grad_norm": 0.035611145198345184, "learning_rate": 0.01, "loss": 2.137, "step": 6624 }, { "epoch": 0.680879482174047, "grad_norm": 0.09345167875289917, "learning_rate": 0.01, "loss": 2.1357, "step": 6627 }, { "epoch": 0.681187711907942, "grad_norm": 0.04311450198292732, "learning_rate": 0.01, "loss": 2.1413, "step": 6630 }, { "epoch": 0.681495941641837, "grad_norm": 0.040315765887498856, "learning_rate": 0.01, "loss": 2.1091, "step": 6633 }, { "epoch": 0.681804171375732, "grad_norm": 0.11044291406869888, "learning_rate": 0.01, "loss": 2.1392, "step": 6636 }, { "epoch": 0.682112401109627, "grad_norm": 0.1288553774356842, "learning_rate": 0.01, "loss": 2.1129, "step": 6639 }, { "epoch": 0.682420630843522, "grad_norm": 0.0698169469833374, "learning_rate": 0.01, "loss": 2.137, "step": 6642 }, { "epoch": 0.682728860577417, "grad_norm": 0.037890784442424774, "learning_rate": 0.01, "loss": 2.1195, "step": 6645 }, { "epoch": 0.683037090311312, "grad_norm": 0.07425201684236526, "learning_rate": 0.01, "loss": 2.1194, "step": 6648 }, { "epoch": 0.683345320045207, "grad_norm": 0.058168716728687286, "learning_rate": 0.01, "loss": 2.1371, "step": 6651 }, { "epoch": 0.683653549779102, "grad_norm": 0.05515358969569206, "learning_rate": 0.01, "loss": 2.137, "step": 6654 }, { "epoch": 0.683961779512997, "grad_norm": 0.0501445047557354, "learning_rate": 0.01, "loss": 2.1539, "step": 6657 }, { "epoch": 0.684270009246892, "grad_norm": 0.06167145445942879, "learning_rate": 0.01, "loss": 2.1413, "step": 6660 }, { "epoch": 0.6845782389807871, "grad_norm": 0.0841723158955574, "learning_rate": 0.01, "loss": 2.1194, "step": 6663 }, { "epoch": 0.6848864687146821, "grad_norm": 0.06027607619762421, "learning_rate": 0.01, "loss": 2.158, "step": 6666 }, { "epoch": 0.685194698448577, "grad_norm": 0.1187741607427597, "learning_rate": 0.01, "loss": 2.1651, "step": 6669 }, { "epoch": 0.685502928182472, "grad_norm": 0.10789939761161804, "learning_rate": 0.01, "loss": 2.1465, "step": 6672 }, { "epoch": 0.685811157916367, "grad_norm": 0.06254967302083969, "learning_rate": 0.01, "loss": 2.1639, "step": 6675 }, { "epoch": 0.686119387650262, "grad_norm": 0.04242802783846855, "learning_rate": 0.01, "loss": 2.1563, "step": 6678 }, { "epoch": 0.686427617384157, "grad_norm": 0.03538980334997177, "learning_rate": 0.01, "loss": 2.1373, "step": 6681 }, { "epoch": 0.686735847118052, "grad_norm": 0.04609490931034088, "learning_rate": 0.01, "loss": 2.1345, "step": 6684 }, { "epoch": 0.687044076851947, "grad_norm": 0.1298975795507431, "learning_rate": 0.01, "loss": 2.1446, "step": 6687 }, { "epoch": 0.687352306585842, "grad_norm": 0.10049281269311905, "learning_rate": 0.01, "loss": 2.1432, "step": 6690 }, { "epoch": 0.687660536319737, "grad_norm": 0.05908266827464104, "learning_rate": 0.01, "loss": 2.1288, "step": 6693 }, { "epoch": 0.687968766053632, "grad_norm": 0.0546141043305397, "learning_rate": 0.01, "loss": 2.1086, "step": 6696 }, { "epoch": 0.688276995787527, "grad_norm": 0.04135862737894058, "learning_rate": 0.01, "loss": 2.1187, "step": 6699 }, { "epoch": 0.688585225521422, "grad_norm": 0.03824761137366295, "learning_rate": 0.01, "loss": 2.1162, "step": 6702 }, { "epoch": 0.688893455255317, "grad_norm": 0.041454900056123734, "learning_rate": 0.01, "loss": 2.1304, "step": 6705 }, { "epoch": 0.689201684989212, "grad_norm": 0.08948934823274612, "learning_rate": 0.01, "loss": 2.1538, "step": 6708 }, { "epoch": 0.689509914723107, "grad_norm": 0.07379783689975739, "learning_rate": 0.01, "loss": 2.145, "step": 6711 }, { "epoch": 0.689818144457002, "grad_norm": 0.0833912044763565, "learning_rate": 0.01, "loss": 2.1218, "step": 6714 }, { "epoch": 0.690126374190897, "grad_norm": 0.05899098515510559, "learning_rate": 0.01, "loss": 2.1516, "step": 6717 }, { "epoch": 0.690434603924792, "grad_norm": 0.06462058424949646, "learning_rate": 0.01, "loss": 2.1496, "step": 6720 }, { "epoch": 0.6907428336586869, "grad_norm": 0.04040443152189255, "learning_rate": 0.01, "loss": 2.1311, "step": 6723 }, { "epoch": 0.6910510633925819, "grad_norm": 0.05336814373731613, "learning_rate": 0.01, "loss": 2.1227, "step": 6726 }, { "epoch": 0.6913592931264769, "grad_norm": 0.05057406798005104, "learning_rate": 0.01, "loss": 2.1281, "step": 6729 }, { "epoch": 0.6916675228603719, "grad_norm": 0.08063513040542603, "learning_rate": 0.01, "loss": 2.1318, "step": 6732 }, { "epoch": 0.6919757525942669, "grad_norm": 0.08304840326309204, "learning_rate": 0.01, "loss": 2.1179, "step": 6735 }, { "epoch": 0.6922839823281619, "grad_norm": 0.04266434162855148, "learning_rate": 0.01, "loss": 2.1447, "step": 6738 }, { "epoch": 0.6925922120620569, "grad_norm": 0.07502007484436035, "learning_rate": 0.01, "loss": 2.1173, "step": 6741 }, { "epoch": 0.6929004417959519, "grad_norm": 0.10870220512151718, "learning_rate": 0.01, "loss": 2.1555, "step": 6744 }, { "epoch": 0.6932086715298469, "grad_norm": 0.15824924409389496, "learning_rate": 0.01, "loss": 2.1668, "step": 6747 }, { "epoch": 0.6935169012637419, "grad_norm": 0.06319935619831085, "learning_rate": 0.01, "loss": 2.1788, "step": 6750 }, { "epoch": 0.6938251309976369, "grad_norm": 0.06392507255077362, "learning_rate": 0.01, "loss": 2.1398, "step": 6753 }, { "epoch": 0.6941333607315319, "grad_norm": 0.044481996446847916, "learning_rate": 0.01, "loss": 2.147, "step": 6756 }, { "epoch": 0.6944415904654269, "grad_norm": 0.09093592315912247, "learning_rate": 0.01, "loss": 2.1399, "step": 6759 }, { "epoch": 0.6947498201993219, "grad_norm": 0.09249415248632431, "learning_rate": 0.01, "loss": 2.1274, "step": 6762 }, { "epoch": 0.6950580499332168, "grad_norm": 0.06134162098169327, "learning_rate": 0.01, "loss": 2.142, "step": 6765 }, { "epoch": 0.6953662796671118, "grad_norm": 0.048883359879255295, "learning_rate": 0.01, "loss": 2.1357, "step": 6768 }, { "epoch": 0.6956745094010068, "grad_norm": 0.04553356394171715, "learning_rate": 0.01, "loss": 2.132, "step": 6771 }, { "epoch": 0.6959827391349018, "grad_norm": 0.10365505516529083, "learning_rate": 0.01, "loss": 2.1568, "step": 6774 }, { "epoch": 0.6962909688687968, "grad_norm": 0.07474958896636963, "learning_rate": 0.01, "loss": 2.1209, "step": 6777 }, { "epoch": 0.6965991986026919, "grad_norm": 0.11140461266040802, "learning_rate": 0.01, "loss": 2.1585, "step": 6780 }, { "epoch": 0.6969074283365869, "grad_norm": 0.0529690645635128, "learning_rate": 0.01, "loss": 2.1228, "step": 6783 }, { "epoch": 0.6972156580704819, "grad_norm": 0.06484264135360718, "learning_rate": 0.01, "loss": 2.117, "step": 6786 }, { "epoch": 0.6975238878043769, "grad_norm": 0.0467400886118412, "learning_rate": 0.01, "loss": 2.1367, "step": 6789 }, { "epoch": 0.6978321175382719, "grad_norm": 0.09690822660923004, "learning_rate": 0.01, "loss": 2.1275, "step": 6792 }, { "epoch": 0.6981403472721669, "grad_norm": 0.053299982100725174, "learning_rate": 0.01, "loss": 2.1557, "step": 6795 }, { "epoch": 0.6984485770060619, "grad_norm": 0.08451724797487259, "learning_rate": 0.01, "loss": 2.1235, "step": 6798 }, { "epoch": 0.6987568067399569, "grad_norm": 0.11180119216442108, "learning_rate": 0.01, "loss": 2.1389, "step": 6801 }, { "epoch": 0.6990650364738519, "grad_norm": 0.04366112872958183, "learning_rate": 0.01, "loss": 2.1345, "step": 6804 }, { "epoch": 0.6993732662077469, "grad_norm": 0.057021014392375946, "learning_rate": 0.01, "loss": 2.145, "step": 6807 }, { "epoch": 0.6996814959416419, "grad_norm": 0.050035975873470306, "learning_rate": 0.01, "loss": 2.1245, "step": 6810 }, { "epoch": 0.6999897256755369, "grad_norm": 0.16434957087039948, "learning_rate": 0.01, "loss": 2.099, "step": 6813 }, { "epoch": 0.7002979554094318, "grad_norm": 0.0473979152739048, "learning_rate": 0.01, "loss": 2.124, "step": 6816 }, { "epoch": 0.7006061851433268, "grad_norm": 0.06207640469074249, "learning_rate": 0.01, "loss": 2.1528, "step": 6819 }, { "epoch": 0.7009144148772218, "grad_norm": 0.09829109162092209, "learning_rate": 0.01, "loss": 2.1359, "step": 6822 }, { "epoch": 0.7012226446111168, "grad_norm": 0.0563257597386837, "learning_rate": 0.01, "loss": 2.1639, "step": 6825 }, { "epoch": 0.7015308743450118, "grad_norm": 0.12371699512004852, "learning_rate": 0.01, "loss": 2.1479, "step": 6828 }, { "epoch": 0.7018391040789068, "grad_norm": 0.07342347502708435, "learning_rate": 0.01, "loss": 2.1786, "step": 6831 }, { "epoch": 0.7021473338128018, "grad_norm": 0.05420146882534027, "learning_rate": 0.01, "loss": 2.1261, "step": 6834 }, { "epoch": 0.7024555635466968, "grad_norm": 0.04500873014330864, "learning_rate": 0.01, "loss": 2.1356, "step": 6837 }, { "epoch": 0.7027637932805918, "grad_norm": 0.10648415237665176, "learning_rate": 0.01, "loss": 2.1205, "step": 6840 }, { "epoch": 0.7030720230144868, "grad_norm": 0.05089351162314415, "learning_rate": 0.01, "loss": 2.1403, "step": 6843 }, { "epoch": 0.7033802527483818, "grad_norm": 0.10011807084083557, "learning_rate": 0.01, "loss": 2.1508, "step": 6846 }, { "epoch": 0.7036884824822768, "grad_norm": 0.06787194311618805, "learning_rate": 0.01, "loss": 2.1391, "step": 6849 }, { "epoch": 0.7039967122161718, "grad_norm": 0.08248817175626755, "learning_rate": 0.01, "loss": 2.1782, "step": 6852 }, { "epoch": 0.7043049419500668, "grad_norm": 0.04949905723333359, "learning_rate": 0.01, "loss": 2.1401, "step": 6855 }, { "epoch": 0.7046131716839618, "grad_norm": 0.043910931795835495, "learning_rate": 0.01, "loss": 2.108, "step": 6858 }, { "epoch": 0.7049214014178568, "grad_norm": 0.05133078247308731, "learning_rate": 0.01, "loss": 2.1088, "step": 6861 }, { "epoch": 0.7052296311517517, "grad_norm": 0.11582443863153458, "learning_rate": 0.01, "loss": 2.1301, "step": 6864 }, { "epoch": 0.7055378608856467, "grad_norm": 0.04287354275584221, "learning_rate": 0.01, "loss": 2.124, "step": 6867 }, { "epoch": 0.7058460906195417, "grad_norm": 0.09393726289272308, "learning_rate": 0.01, "loss": 2.1326, "step": 6870 }, { "epoch": 0.7061543203534367, "grad_norm": 0.1286250203847885, "learning_rate": 0.01, "loss": 2.1292, "step": 6873 }, { "epoch": 0.7064625500873317, "grad_norm": 0.14816388487815857, "learning_rate": 0.01, "loss": 2.1439, "step": 6876 }, { "epoch": 0.7067707798212267, "grad_norm": 0.062444012612104416, "learning_rate": 0.01, "loss": 2.1421, "step": 6879 }, { "epoch": 0.7070790095551217, "grad_norm": 0.053750455379486084, "learning_rate": 0.01, "loss": 2.1185, "step": 6882 }, { "epoch": 0.7073872392890167, "grad_norm": 0.051356710493564606, "learning_rate": 0.01, "loss": 2.1298, "step": 6885 }, { "epoch": 0.7076954690229117, "grad_norm": 0.061504025012254715, "learning_rate": 0.01, "loss": 2.1132, "step": 6888 }, { "epoch": 0.7080036987568067, "grad_norm": 0.056496761739254, "learning_rate": 0.01, "loss": 2.1019, "step": 6891 }, { "epoch": 0.7083119284907017, "grad_norm": 0.048710647970438004, "learning_rate": 0.01, "loss": 2.126, "step": 6894 }, { "epoch": 0.7086201582245968, "grad_norm": 0.06260757148265839, "learning_rate": 0.01, "loss": 2.1534, "step": 6897 }, { "epoch": 0.7089283879584918, "grad_norm": 0.06622278690338135, "learning_rate": 0.01, "loss": 2.1247, "step": 6900 }, { "epoch": 0.7092366176923868, "grad_norm": 0.0810452550649643, "learning_rate": 0.01, "loss": 2.1336, "step": 6903 }, { "epoch": 0.7095448474262818, "grad_norm": 0.04692875221371651, "learning_rate": 0.01, "loss": 2.1096, "step": 6906 }, { "epoch": 0.7098530771601768, "grad_norm": 0.04757360368967056, "learning_rate": 0.01, "loss": 2.1181, "step": 6909 }, { "epoch": 0.7101613068940718, "grad_norm": 0.05597659945487976, "learning_rate": 0.01, "loss": 2.1425, "step": 6912 }, { "epoch": 0.7104695366279667, "grad_norm": 0.051605843007564545, "learning_rate": 0.01, "loss": 2.1118, "step": 6915 }, { "epoch": 0.7107777663618617, "grad_norm": 0.06179991737008095, "learning_rate": 0.01, "loss": 2.1362, "step": 6918 }, { "epoch": 0.7110859960957567, "grad_norm": 0.05455191805958748, "learning_rate": 0.01, "loss": 2.1279, "step": 6921 }, { "epoch": 0.7113942258296517, "grad_norm": 0.11560655385255814, "learning_rate": 0.01, "loss": 2.1316, "step": 6924 }, { "epoch": 0.7117024555635467, "grad_norm": 0.12203246355056763, "learning_rate": 0.01, "loss": 2.1173, "step": 6927 }, { "epoch": 0.7120106852974417, "grad_norm": 0.07024069130420685, "learning_rate": 0.01, "loss": 2.1395, "step": 6930 }, { "epoch": 0.7123189150313367, "grad_norm": 0.04773107171058655, "learning_rate": 0.01, "loss": 2.1455, "step": 6933 }, { "epoch": 0.7126271447652317, "grad_norm": 0.06106821820139885, "learning_rate": 0.01, "loss": 2.1352, "step": 6936 }, { "epoch": 0.7129353744991267, "grad_norm": 0.11438222974538803, "learning_rate": 0.01, "loss": 2.149, "step": 6939 }, { "epoch": 0.7132436042330217, "grad_norm": 0.07224932312965393, "learning_rate": 0.01, "loss": 2.1234, "step": 6942 }, { "epoch": 0.7135518339669167, "grad_norm": 0.06790932267904282, "learning_rate": 0.01, "loss": 2.1222, "step": 6945 }, { "epoch": 0.7138600637008117, "grad_norm": 0.12322958558797836, "learning_rate": 0.01, "loss": 2.106, "step": 6948 }, { "epoch": 0.7141682934347067, "grad_norm": 0.07186157256364822, "learning_rate": 0.01, "loss": 2.1365, "step": 6951 }, { "epoch": 0.7144765231686017, "grad_norm": 0.05366130173206329, "learning_rate": 0.01, "loss": 2.1264, "step": 6954 }, { "epoch": 0.7147847529024967, "grad_norm": 0.06682512164115906, "learning_rate": 0.01, "loss": 2.1163, "step": 6957 }, { "epoch": 0.7150929826363916, "grad_norm": 0.04629479721188545, "learning_rate": 0.01, "loss": 2.126, "step": 6960 }, { "epoch": 0.7154012123702866, "grad_norm": 0.053164754062891006, "learning_rate": 0.01, "loss": 2.1262, "step": 6963 }, { "epoch": 0.7157094421041816, "grad_norm": 0.08918699622154236, "learning_rate": 0.01, "loss": 2.157, "step": 6966 }, { "epoch": 0.7160176718380766, "grad_norm": 0.06226164847612381, "learning_rate": 0.01, "loss": 2.1391, "step": 6969 }, { "epoch": 0.7163259015719716, "grad_norm": 0.08120178431272507, "learning_rate": 0.01, "loss": 2.1118, "step": 6972 }, { "epoch": 0.7166341313058666, "grad_norm": 0.06390135735273361, "learning_rate": 0.01, "loss": 2.1302, "step": 6975 }, { "epoch": 0.7169423610397616, "grad_norm": 0.039068643003702164, "learning_rate": 0.01, "loss": 2.1304, "step": 6978 }, { "epoch": 0.7172505907736566, "grad_norm": 0.05006824806332588, "learning_rate": 0.01, "loss": 2.1352, "step": 6981 }, { "epoch": 0.7175588205075516, "grad_norm": 0.03946538642048836, "learning_rate": 0.01, "loss": 2.1513, "step": 6984 }, { "epoch": 0.7178670502414466, "grad_norm": 0.05072702839970589, "learning_rate": 0.01, "loss": 2.1298, "step": 6987 }, { "epoch": 0.7181752799753416, "grad_norm": 0.06457548588514328, "learning_rate": 0.01, "loss": 2.1276, "step": 6990 }, { "epoch": 0.7184835097092366, "grad_norm": 0.05759236589074135, "learning_rate": 0.01, "loss": 2.1198, "step": 6993 }, { "epoch": 0.7187917394431316, "grad_norm": 0.1151571124792099, "learning_rate": 0.01, "loss": 2.1217, "step": 6996 }, { "epoch": 0.7190999691770266, "grad_norm": 0.04867241531610489, "learning_rate": 0.01, "loss": 2.1343, "step": 6999 }, { "epoch": 0.7194081989109216, "grad_norm": 0.074817955493927, "learning_rate": 0.01, "loss": 2.1474, "step": 7002 }, { "epoch": 0.7197164286448166, "grad_norm": 0.04749060794711113, "learning_rate": 0.01, "loss": 2.1403, "step": 7005 }, { "epoch": 0.7200246583787115, "grad_norm": 0.04965493455529213, "learning_rate": 0.01, "loss": 2.142, "step": 7008 }, { "epoch": 0.7203328881126067, "grad_norm": 0.044914234429597855, "learning_rate": 0.01, "loss": 2.1397, "step": 7011 }, { "epoch": 0.7206411178465016, "grad_norm": 0.06727777421474457, "learning_rate": 0.01, "loss": 2.1443, "step": 7014 }, { "epoch": 0.7209493475803966, "grad_norm": 0.10670837014913559, "learning_rate": 0.01, "loss": 2.1316, "step": 7017 }, { "epoch": 0.7212575773142916, "grad_norm": 0.05047740787267685, "learning_rate": 0.01, "loss": 2.1268, "step": 7020 }, { "epoch": 0.7215658070481866, "grad_norm": 0.055116791278123856, "learning_rate": 0.01, "loss": 2.1194, "step": 7023 }, { "epoch": 0.7218740367820816, "grad_norm": 0.04873311519622803, "learning_rate": 0.01, "loss": 2.1122, "step": 7026 }, { "epoch": 0.7221822665159766, "grad_norm": 0.0893159881234169, "learning_rate": 0.01, "loss": 2.1413, "step": 7029 }, { "epoch": 0.7224904962498716, "grad_norm": 0.07278893142938614, "learning_rate": 0.01, "loss": 2.1394, "step": 7032 }, { "epoch": 0.7227987259837666, "grad_norm": 0.09431196749210358, "learning_rate": 0.01, "loss": 2.1489, "step": 7035 }, { "epoch": 0.7231069557176616, "grad_norm": 0.03588537499308586, "learning_rate": 0.01, "loss": 2.1585, "step": 7038 }, { "epoch": 0.7234151854515566, "grad_norm": 0.044003136456012726, "learning_rate": 0.01, "loss": 2.1442, "step": 7041 }, { "epoch": 0.7237234151854516, "grad_norm": 0.10805044323205948, "learning_rate": 0.01, "loss": 2.127, "step": 7044 }, { "epoch": 0.7240316449193466, "grad_norm": 0.06328746676445007, "learning_rate": 0.01, "loss": 2.1166, "step": 7047 }, { "epoch": 0.7243398746532416, "grad_norm": 0.08782347291707993, "learning_rate": 0.01, "loss": 2.1474, "step": 7050 }, { "epoch": 0.7246481043871366, "grad_norm": 0.06585227698087692, "learning_rate": 0.01, "loss": 2.1228, "step": 7053 }, { "epoch": 0.7249563341210316, "grad_norm": 0.06324558705091476, "learning_rate": 0.01, "loss": 2.1313, "step": 7056 }, { "epoch": 0.7252645638549265, "grad_norm": 0.057287219911813736, "learning_rate": 0.01, "loss": 2.1241, "step": 7059 }, { "epoch": 0.7255727935888215, "grad_norm": 0.07684747129678726, "learning_rate": 0.01, "loss": 2.1299, "step": 7062 }, { "epoch": 0.7258810233227165, "grad_norm": 0.10347555577754974, "learning_rate": 0.01, "loss": 2.12, "step": 7065 }, { "epoch": 0.7261892530566115, "grad_norm": 0.06019530072808266, "learning_rate": 0.01, "loss": 2.138, "step": 7068 }, { "epoch": 0.7264974827905065, "grad_norm": 0.04816723242402077, "learning_rate": 0.01, "loss": 2.1161, "step": 7071 }, { "epoch": 0.7268057125244015, "grad_norm": 0.05839864909648895, "learning_rate": 0.01, "loss": 2.136, "step": 7074 }, { "epoch": 0.7271139422582965, "grad_norm": 0.061795271933078766, "learning_rate": 0.01, "loss": 2.1315, "step": 7077 }, { "epoch": 0.7274221719921915, "grad_norm": 0.05736471712589264, "learning_rate": 0.01, "loss": 2.1403, "step": 7080 }, { "epoch": 0.7277304017260865, "grad_norm": 0.059238459914922714, "learning_rate": 0.01, "loss": 2.1101, "step": 7083 }, { "epoch": 0.7280386314599815, "grad_norm": 0.10844148695468903, "learning_rate": 0.01, "loss": 2.1454, "step": 7086 }, { "epoch": 0.7283468611938765, "grad_norm": 0.047568898648023605, "learning_rate": 0.01, "loss": 2.1183, "step": 7089 }, { "epoch": 0.7286550909277715, "grad_norm": 0.05178900063037872, "learning_rate": 0.01, "loss": 2.1346, "step": 7092 }, { "epoch": 0.7289633206616665, "grad_norm": 0.04113532230257988, "learning_rate": 0.01, "loss": 2.0915, "step": 7095 }, { "epoch": 0.7292715503955615, "grad_norm": 0.10488615930080414, "learning_rate": 0.01, "loss": 2.1239, "step": 7098 }, { "epoch": 0.7295797801294565, "grad_norm": 0.13013161718845367, "learning_rate": 0.01, "loss": 2.1251, "step": 7101 }, { "epoch": 0.7298880098633515, "grad_norm": 0.10956915467977524, "learning_rate": 0.01, "loss": 2.1113, "step": 7104 }, { "epoch": 0.7301962395972464, "grad_norm": 0.06996689736843109, "learning_rate": 0.01, "loss": 2.118, "step": 7107 }, { "epoch": 0.7305044693311414, "grad_norm": 0.07773365080356598, "learning_rate": 0.01, "loss": 2.1144, "step": 7110 }, { "epoch": 0.7308126990650364, "grad_norm": 0.06922838091850281, "learning_rate": 0.01, "loss": 2.1148, "step": 7113 }, { "epoch": 0.7311209287989314, "grad_norm": 0.08941454440355301, "learning_rate": 0.01, "loss": 2.1493, "step": 7116 }, { "epoch": 0.7314291585328264, "grad_norm": 0.04264171048998833, "learning_rate": 0.01, "loss": 2.136, "step": 7119 }, { "epoch": 0.7317373882667214, "grad_norm": 0.04473461955785751, "learning_rate": 0.01, "loss": 2.1294, "step": 7122 }, { "epoch": 0.7320456180006164, "grad_norm": 0.0396125465631485, "learning_rate": 0.01, "loss": 2.1439, "step": 7125 }, { "epoch": 0.7323538477345115, "grad_norm": 0.04613679647445679, "learning_rate": 0.01, "loss": 2.1503, "step": 7128 }, { "epoch": 0.7326620774684065, "grad_norm": 0.04897918924689293, "learning_rate": 0.01, "loss": 2.1214, "step": 7131 }, { "epoch": 0.7329703072023015, "grad_norm": 0.05057375133037567, "learning_rate": 0.01, "loss": 2.1112, "step": 7134 }, { "epoch": 0.7332785369361965, "grad_norm": 0.05711055174469948, "learning_rate": 0.01, "loss": 2.102, "step": 7137 }, { "epoch": 0.7335867666700915, "grad_norm": 0.08658434450626373, "learning_rate": 0.01, "loss": 2.1574, "step": 7140 }, { "epoch": 0.7338949964039865, "grad_norm": 0.07044188678264618, "learning_rate": 0.01, "loss": 2.1037, "step": 7143 }, { "epoch": 0.7342032261378815, "grad_norm": 0.03941315785050392, "learning_rate": 0.01, "loss": 2.1369, "step": 7146 }, { "epoch": 0.7345114558717765, "grad_norm": 0.04527783393859863, "learning_rate": 0.01, "loss": 2.1212, "step": 7149 }, { "epoch": 0.7348196856056715, "grad_norm": 0.07909847050905228, "learning_rate": 0.01, "loss": 2.1316, "step": 7152 }, { "epoch": 0.7351279153395665, "grad_norm": 0.12793006002902985, "learning_rate": 0.01, "loss": 2.1254, "step": 7155 }, { "epoch": 0.7354361450734614, "grad_norm": 0.0639350563287735, "learning_rate": 0.01, "loss": 2.1319, "step": 7158 }, { "epoch": 0.7357443748073564, "grad_norm": 0.0342305451631546, "learning_rate": 0.01, "loss": 2.1386, "step": 7161 }, { "epoch": 0.7360526045412514, "grad_norm": 0.049001939594745636, "learning_rate": 0.01, "loss": 2.1485, "step": 7164 }, { "epoch": 0.7363608342751464, "grad_norm": 0.047717638313770294, "learning_rate": 0.01, "loss": 2.1368, "step": 7167 }, { "epoch": 0.7366690640090414, "grad_norm": 0.04402822256088257, "learning_rate": 0.01, "loss": 2.1162, "step": 7170 }, { "epoch": 0.7369772937429364, "grad_norm": 0.06922505795955658, "learning_rate": 0.01, "loss": 2.1279, "step": 7173 }, { "epoch": 0.7372855234768314, "grad_norm": 0.06231709569692612, "learning_rate": 0.01, "loss": 2.0946, "step": 7176 }, { "epoch": 0.7375937532107264, "grad_norm": 0.11480400711297989, "learning_rate": 0.01, "loss": 2.146, "step": 7179 }, { "epoch": 0.7379019829446214, "grad_norm": 0.05144179239869118, "learning_rate": 0.01, "loss": 2.1128, "step": 7182 }, { "epoch": 0.7382102126785164, "grad_norm": 0.05130591616034508, "learning_rate": 0.01, "loss": 2.0964, "step": 7185 }, { "epoch": 0.7385184424124114, "grad_norm": 0.0549122579395771, "learning_rate": 0.01, "loss": 2.165, "step": 7188 }, { "epoch": 0.7388266721463064, "grad_norm": 0.1378844678401947, "learning_rate": 0.01, "loss": 2.1367, "step": 7191 }, { "epoch": 0.7391349018802014, "grad_norm": 0.06231486052274704, "learning_rate": 0.01, "loss": 2.1341, "step": 7194 }, { "epoch": 0.7394431316140964, "grad_norm": 0.10189559310674667, "learning_rate": 0.01, "loss": 2.1161, "step": 7197 }, { "epoch": 0.7397513613479914, "grad_norm": 0.053364284336566925, "learning_rate": 0.01, "loss": 2.1043, "step": 7200 }, { "epoch": 0.7400595910818863, "grad_norm": 0.046057943254709244, "learning_rate": 0.01, "loss": 2.1011, "step": 7203 }, { "epoch": 0.7403678208157813, "grad_norm": 0.04084615036845207, "learning_rate": 0.01, "loss": 2.1253, "step": 7206 }, { "epoch": 0.7406760505496763, "grad_norm": 0.04594961181282997, "learning_rate": 0.01, "loss": 2.1228, "step": 7209 }, { "epoch": 0.7409842802835713, "grad_norm": 0.06608622521162033, "learning_rate": 0.01, "loss": 2.1188, "step": 7212 }, { "epoch": 0.7412925100174663, "grad_norm": 0.125398188829422, "learning_rate": 0.01, "loss": 2.1057, "step": 7215 }, { "epoch": 0.7416007397513613, "grad_norm": 0.08068963885307312, "learning_rate": 0.01, "loss": 2.0947, "step": 7218 }, { "epoch": 0.7419089694852563, "grad_norm": 0.07993921637535095, "learning_rate": 0.01, "loss": 2.1214, "step": 7221 }, { "epoch": 0.7422171992191513, "grad_norm": 0.04969675466418266, "learning_rate": 0.01, "loss": 2.1099, "step": 7224 }, { "epoch": 0.7425254289530463, "grad_norm": 0.054677605628967285, "learning_rate": 0.01, "loss": 2.1229, "step": 7227 }, { "epoch": 0.7428336586869413, "grad_norm": 0.04562999680638313, "learning_rate": 0.01, "loss": 2.1409, "step": 7230 }, { "epoch": 0.7431418884208363, "grad_norm": 0.07618910074234009, "learning_rate": 0.01, "loss": 2.0924, "step": 7233 }, { "epoch": 0.7434501181547313, "grad_norm": 0.14368098974227905, "learning_rate": 0.01, "loss": 2.1348, "step": 7236 }, { "epoch": 0.7437583478886263, "grad_norm": 0.05517590045928955, "learning_rate": 0.01, "loss": 2.116, "step": 7239 }, { "epoch": 0.7440665776225214, "grad_norm": 0.17316390573978424, "learning_rate": 0.01, "loss": 2.1363, "step": 7242 }, { "epoch": 0.7443748073564164, "grad_norm": 0.15268415212631226, "learning_rate": 0.01, "loss": 2.1033, "step": 7245 }, { "epoch": 0.7446830370903114, "grad_norm": 0.06212317943572998, "learning_rate": 0.01, "loss": 2.0971, "step": 7248 }, { "epoch": 0.7449912668242064, "grad_norm": 0.04282272607088089, "learning_rate": 0.01, "loss": 2.1434, "step": 7251 }, { "epoch": 0.7452994965581013, "grad_norm": 0.04305952787399292, "learning_rate": 0.01, "loss": 2.1406, "step": 7254 }, { "epoch": 0.7456077262919963, "grad_norm": 0.048668697476387024, "learning_rate": 0.01, "loss": 2.1303, "step": 7257 }, { "epoch": 0.7459159560258913, "grad_norm": 0.05524542182683945, "learning_rate": 0.01, "loss": 2.1331, "step": 7260 }, { "epoch": 0.7462241857597863, "grad_norm": 0.0438026525080204, "learning_rate": 0.01, "loss": 2.109, "step": 7263 }, { "epoch": 0.7465324154936813, "grad_norm": 0.08154566586017609, "learning_rate": 0.01, "loss": 2.1053, "step": 7266 }, { "epoch": 0.7468406452275763, "grad_norm": 0.11754357814788818, "learning_rate": 0.01, "loss": 2.1298, "step": 7269 }, { "epoch": 0.7471488749614713, "grad_norm": 0.06593465805053711, "learning_rate": 0.01, "loss": 2.1323, "step": 7272 }, { "epoch": 0.7474571046953663, "grad_norm": 0.08065393567085266, "learning_rate": 0.01, "loss": 2.1297, "step": 7275 }, { "epoch": 0.7477653344292613, "grad_norm": 0.10624121129512787, "learning_rate": 0.01, "loss": 2.1175, "step": 7278 }, { "epoch": 0.7480735641631563, "grad_norm": 0.06357972323894501, "learning_rate": 0.01, "loss": 2.164, "step": 7281 }, { "epoch": 0.7483817938970513, "grad_norm": 0.03753754869103432, "learning_rate": 0.01, "loss": 2.1519, "step": 7284 }, { "epoch": 0.7486900236309463, "grad_norm": 0.04756931588053703, "learning_rate": 0.01, "loss": 2.0901, "step": 7287 }, { "epoch": 0.7489982533648413, "grad_norm": 0.0494108609855175, "learning_rate": 0.01, "loss": 2.1474, "step": 7290 }, { "epoch": 0.7493064830987363, "grad_norm": 0.063727006316185, "learning_rate": 0.01, "loss": 2.1425, "step": 7293 }, { "epoch": 0.7496147128326313, "grad_norm": 0.06327082961797714, "learning_rate": 0.01, "loss": 2.1346, "step": 7296 }, { "epoch": 0.7499229425665263, "grad_norm": 0.10383486747741699, "learning_rate": 0.01, "loss": 2.1245, "step": 7299 }, { "epoch": 0.7502311723004212, "grad_norm": 0.10473886877298355, "learning_rate": 0.01, "loss": 2.1302, "step": 7302 }, { "epoch": 0.7505394020343162, "grad_norm": 0.04905236139893532, "learning_rate": 0.01, "loss": 2.119, "step": 7305 }, { "epoch": 0.7508476317682112, "grad_norm": 0.04571664705872536, "learning_rate": 0.01, "loss": 2.1505, "step": 7308 }, { "epoch": 0.7511558615021062, "grad_norm": 0.06305412203073502, "learning_rate": 0.01, "loss": 2.1389, "step": 7311 }, { "epoch": 0.7514640912360012, "grad_norm": 0.05825283005833626, "learning_rate": 0.01, "loss": 2.1361, "step": 7314 }, { "epoch": 0.7517723209698962, "grad_norm": 0.059476301074028015, "learning_rate": 0.01, "loss": 2.1191, "step": 7317 }, { "epoch": 0.7520805507037912, "grad_norm": 0.042396873235702515, "learning_rate": 0.01, "loss": 2.1309, "step": 7320 }, { "epoch": 0.7523887804376862, "grad_norm": 0.04611228406429291, "learning_rate": 0.01, "loss": 2.1438, "step": 7323 }, { "epoch": 0.7526970101715812, "grad_norm": 0.09147686511278152, "learning_rate": 0.01, "loss": 2.1288, "step": 7326 }, { "epoch": 0.7530052399054762, "grad_norm": 0.08085332810878754, "learning_rate": 0.01, "loss": 2.1427, "step": 7329 }, { "epoch": 0.7533134696393712, "grad_norm": 0.03873496130108833, "learning_rate": 0.01, "loss": 2.1257, "step": 7332 }, { "epoch": 0.7536216993732662, "grad_norm": 0.05457824096083641, "learning_rate": 0.01, "loss": 2.1373, "step": 7335 }, { "epoch": 0.7539299291071612, "grad_norm": 0.049249522387981415, "learning_rate": 0.01, "loss": 2.1185, "step": 7338 }, { "epoch": 0.7542381588410562, "grad_norm": 0.07082841545343399, "learning_rate": 0.01, "loss": 2.1157, "step": 7341 }, { "epoch": 0.7545463885749512, "grad_norm": 0.046108178794384, "learning_rate": 0.01, "loss": 2.1238, "step": 7344 }, { "epoch": 0.7548546183088461, "grad_norm": 0.05572620406746864, "learning_rate": 0.01, "loss": 2.1445, "step": 7347 }, { "epoch": 0.7551628480427411, "grad_norm": 0.1091703474521637, "learning_rate": 0.01, "loss": 2.1281, "step": 7350 }, { "epoch": 0.7554710777766361, "grad_norm": 0.09372757375240326, "learning_rate": 0.01, "loss": 2.1231, "step": 7353 }, { "epoch": 0.7557793075105311, "grad_norm": 0.0482059009373188, "learning_rate": 0.01, "loss": 2.1003, "step": 7356 }, { "epoch": 0.7560875372444262, "grad_norm": 0.041941821575164795, "learning_rate": 0.01, "loss": 2.1382, "step": 7359 }, { "epoch": 0.7563957669783212, "grad_norm": 0.07122782617807388, "learning_rate": 0.01, "loss": 2.1419, "step": 7362 }, { "epoch": 0.7567039967122162, "grad_norm": 0.06854265183210373, "learning_rate": 0.01, "loss": 2.1328, "step": 7365 }, { "epoch": 0.7570122264461112, "grad_norm": 0.10073423385620117, "learning_rate": 0.01, "loss": 2.1322, "step": 7368 }, { "epoch": 0.7573204561800062, "grad_norm": 0.038869407027959824, "learning_rate": 0.01, "loss": 2.1273, "step": 7371 }, { "epoch": 0.7576286859139012, "grad_norm": 0.09483812749385834, "learning_rate": 0.01, "loss": 2.1465, "step": 7374 }, { "epoch": 0.7579369156477962, "grad_norm": 0.07226487994194031, "learning_rate": 0.01, "loss": 2.1386, "step": 7377 }, { "epoch": 0.7582451453816912, "grad_norm": 0.05041668191552162, "learning_rate": 0.01, "loss": 2.1249, "step": 7380 }, { "epoch": 0.7585533751155862, "grad_norm": 0.03839525580406189, "learning_rate": 0.01, "loss": 2.1125, "step": 7383 }, { "epoch": 0.7588616048494812, "grad_norm": 0.047746479511260986, "learning_rate": 0.01, "loss": 2.1027, "step": 7386 }, { "epoch": 0.7591698345833762, "grad_norm": 0.05524810031056404, "learning_rate": 0.01, "loss": 2.166, "step": 7389 }, { "epoch": 0.7594780643172712, "grad_norm": 0.050045181065797806, "learning_rate": 0.01, "loss": 2.1411, "step": 7392 }, { "epoch": 0.7597862940511662, "grad_norm": 0.09187906980514526, "learning_rate": 0.01, "loss": 2.13, "step": 7395 }, { "epoch": 0.7600945237850611, "grad_norm": 0.15085643529891968, "learning_rate": 0.01, "loss": 2.1198, "step": 7398 }, { "epoch": 0.7604027535189561, "grad_norm": 0.05295104160904884, "learning_rate": 0.01, "loss": 2.1067, "step": 7401 }, { "epoch": 0.7607109832528511, "grad_norm": 0.03696104511618614, "learning_rate": 0.01, "loss": 2.1159, "step": 7404 }, { "epoch": 0.7610192129867461, "grad_norm": 0.04209265485405922, "learning_rate": 0.01, "loss": 2.1243, "step": 7407 }, { "epoch": 0.7613274427206411, "grad_norm": 0.056943077594041824, "learning_rate": 0.01, "loss": 2.1038, "step": 7410 }, { "epoch": 0.7616356724545361, "grad_norm": 0.12749402225017548, "learning_rate": 0.01, "loss": 2.1087, "step": 7413 }, { "epoch": 0.7619439021884311, "grad_norm": 0.09119253605604172, "learning_rate": 0.01, "loss": 2.1252, "step": 7416 }, { "epoch": 0.7622521319223261, "grad_norm": 0.04251190647482872, "learning_rate": 0.01, "loss": 2.1384, "step": 7419 }, { "epoch": 0.7625603616562211, "grad_norm": 0.04010685533285141, "learning_rate": 0.01, "loss": 2.1449, "step": 7422 }, { "epoch": 0.7628685913901161, "grad_norm": 0.05524475499987602, "learning_rate": 0.01, "loss": 2.0841, "step": 7425 }, { "epoch": 0.7631768211240111, "grad_norm": 0.10250036418437958, "learning_rate": 0.01, "loss": 2.0827, "step": 7428 }, { "epoch": 0.7634850508579061, "grad_norm": 0.0748668685555458, "learning_rate": 0.01, "loss": 2.128, "step": 7431 }, { "epoch": 0.7637932805918011, "grad_norm": 0.08616036176681519, "learning_rate": 0.01, "loss": 2.1087, "step": 7434 }, { "epoch": 0.7641015103256961, "grad_norm": 0.09491308033466339, "learning_rate": 0.01, "loss": 2.1247, "step": 7437 }, { "epoch": 0.7644097400595911, "grad_norm": 0.08575759083032608, "learning_rate": 0.01, "loss": 2.1419, "step": 7440 }, { "epoch": 0.764717969793486, "grad_norm": 0.04314613714814186, "learning_rate": 0.01, "loss": 2.1462, "step": 7443 }, { "epoch": 0.765026199527381, "grad_norm": 0.035719119012355804, "learning_rate": 0.01, "loss": 2.1337, "step": 7446 }, { "epoch": 0.765334429261276, "grad_norm": 0.04597650095820427, "learning_rate": 0.01, "loss": 2.1258, "step": 7449 }, { "epoch": 0.765642658995171, "grad_norm": 0.10039210319519043, "learning_rate": 0.01, "loss": 2.1238, "step": 7452 }, { "epoch": 0.765950888729066, "grad_norm": 0.07157409191131592, "learning_rate": 0.01, "loss": 2.1349, "step": 7455 }, { "epoch": 0.766259118462961, "grad_norm": 0.09058292210102081, "learning_rate": 0.01, "loss": 2.1232, "step": 7458 }, { "epoch": 0.766567348196856, "grad_norm": 0.06009940057992935, "learning_rate": 0.01, "loss": 2.1386, "step": 7461 }, { "epoch": 0.766875577930751, "grad_norm": 0.1165439561009407, "learning_rate": 0.01, "loss": 2.1293, "step": 7464 }, { "epoch": 0.767183807664646, "grad_norm": 0.06138407811522484, "learning_rate": 0.01, "loss": 2.1066, "step": 7467 }, { "epoch": 0.767492037398541, "grad_norm": 0.06058945506811142, "learning_rate": 0.01, "loss": 2.1044, "step": 7470 }, { "epoch": 0.767800267132436, "grad_norm": 0.06741827726364136, "learning_rate": 0.01, "loss": 2.1191, "step": 7473 }, { "epoch": 0.7681084968663311, "grad_norm": 0.047926925122737885, "learning_rate": 0.01, "loss": 2.1333, "step": 7476 }, { "epoch": 0.7684167266002261, "grad_norm": 0.06450969725847244, "learning_rate": 0.01, "loss": 2.1061, "step": 7479 }, { "epoch": 0.7687249563341211, "grad_norm": 0.11133641749620438, "learning_rate": 0.01, "loss": 2.1129, "step": 7482 }, { "epoch": 0.7690331860680161, "grad_norm": 0.049795158207416534, "learning_rate": 0.01, "loss": 2.136, "step": 7485 }, { "epoch": 0.7693414158019111, "grad_norm": 0.06083859130740166, "learning_rate": 0.01, "loss": 2.1459, "step": 7488 }, { "epoch": 0.7696496455358061, "grad_norm": 0.04686833918094635, "learning_rate": 0.01, "loss": 2.1073, "step": 7491 }, { "epoch": 0.769957875269701, "grad_norm": 0.05475611612200737, "learning_rate": 0.01, "loss": 2.1167, "step": 7494 }, { "epoch": 0.770266105003596, "grad_norm": 0.04683786630630493, "learning_rate": 0.01, "loss": 2.1491, "step": 7497 }, { "epoch": 0.770574334737491, "grad_norm": 0.10841275751590729, "learning_rate": 0.01, "loss": 2.0967, "step": 7500 }, { "epoch": 0.770882564471386, "grad_norm": 0.09716581553220749, "learning_rate": 0.01, "loss": 2.1133, "step": 7503 }, { "epoch": 0.771190794205281, "grad_norm": 0.04913085699081421, "learning_rate": 0.01, "loss": 2.1401, "step": 7506 }, { "epoch": 0.771499023939176, "grad_norm": 0.04710682854056358, "learning_rate": 0.01, "loss": 2.109, "step": 7509 }, { "epoch": 0.771807253673071, "grad_norm": 0.054945673793554306, "learning_rate": 0.01, "loss": 2.1169, "step": 7512 }, { "epoch": 0.772115483406966, "grad_norm": 0.04265155643224716, "learning_rate": 0.01, "loss": 2.1156, "step": 7515 }, { "epoch": 0.772423713140861, "grad_norm": 0.03544042259454727, "learning_rate": 0.01, "loss": 2.1172, "step": 7518 }, { "epoch": 0.772731942874756, "grad_norm": 0.05048484355211258, "learning_rate": 0.01, "loss": 2.1015, "step": 7521 }, { "epoch": 0.773040172608651, "grad_norm": 0.14160272479057312, "learning_rate": 0.01, "loss": 2.1475, "step": 7524 }, { "epoch": 0.773348402342546, "grad_norm": 0.08693049848079681, "learning_rate": 0.01, "loss": 2.1266, "step": 7527 }, { "epoch": 0.773656632076441, "grad_norm": 0.06437800824642181, "learning_rate": 0.01, "loss": 2.1273, "step": 7530 }, { "epoch": 0.773964861810336, "grad_norm": 0.04450656846165657, "learning_rate": 0.01, "loss": 2.1192, "step": 7533 }, { "epoch": 0.774273091544231, "grad_norm": 0.05369933694601059, "learning_rate": 0.01, "loss": 2.1264, "step": 7536 }, { "epoch": 0.774581321278126, "grad_norm": 0.04080953076481819, "learning_rate": 0.01, "loss": 2.1319, "step": 7539 }, { "epoch": 0.774889551012021, "grad_norm": 0.03433745354413986, "learning_rate": 0.01, "loss": 2.1024, "step": 7542 }, { "epoch": 0.7751977807459159, "grad_norm": 0.1574896275997162, "learning_rate": 0.01, "loss": 2.1182, "step": 7545 }, { "epoch": 0.7755060104798109, "grad_norm": 0.1207810789346695, "learning_rate": 0.01, "loss": 2.1052, "step": 7548 }, { "epoch": 0.7758142402137059, "grad_norm": 0.07270894944667816, "learning_rate": 0.01, "loss": 2.1331, "step": 7551 }, { "epoch": 0.7761224699476009, "grad_norm": 0.07062831521034241, "learning_rate": 0.01, "loss": 2.099, "step": 7554 }, { "epoch": 0.7764306996814959, "grad_norm": 0.04142964631319046, "learning_rate": 0.01, "loss": 2.1192, "step": 7557 }, { "epoch": 0.7767389294153909, "grad_norm": 0.04645151272416115, "learning_rate": 0.01, "loss": 2.1117, "step": 7560 }, { "epoch": 0.7770471591492859, "grad_norm": 0.046251073479652405, "learning_rate": 0.01, "loss": 2.1399, "step": 7563 }, { "epoch": 0.7773553888831809, "grad_norm": 0.07185769826173782, "learning_rate": 0.01, "loss": 2.1261, "step": 7566 }, { "epoch": 0.7776636186170759, "grad_norm": 0.045216575264930725, "learning_rate": 0.01, "loss": 2.1302, "step": 7569 }, { "epoch": 0.7779718483509709, "grad_norm": 0.04923580586910248, "learning_rate": 0.01, "loss": 2.1482, "step": 7572 }, { "epoch": 0.7782800780848659, "grad_norm": 0.06434139609336853, "learning_rate": 0.01, "loss": 2.1325, "step": 7575 }, { "epoch": 0.7785883078187609, "grad_norm": 0.11186740547418594, "learning_rate": 0.01, "loss": 2.1168, "step": 7578 }, { "epoch": 0.7788965375526559, "grad_norm": 0.06694278120994568, "learning_rate": 0.01, "loss": 2.15, "step": 7581 }, { "epoch": 0.7792047672865509, "grad_norm": 0.05431769788265228, "learning_rate": 0.01, "loss": 2.1156, "step": 7584 }, { "epoch": 0.7795129970204459, "grad_norm": 0.05853963643312454, "learning_rate": 0.01, "loss": 2.1145, "step": 7587 }, { "epoch": 0.779821226754341, "grad_norm": 0.04059399664402008, "learning_rate": 0.01, "loss": 2.1373, "step": 7590 }, { "epoch": 0.780129456488236, "grad_norm": 0.06444236636161804, "learning_rate": 0.01, "loss": 2.1167, "step": 7593 }, { "epoch": 0.7804376862221309, "grad_norm": 0.09885245561599731, "learning_rate": 0.01, "loss": 2.1146, "step": 7596 }, { "epoch": 0.7807459159560259, "grad_norm": 0.08536794036626816, "learning_rate": 0.01, "loss": 2.1282, "step": 7599 }, { "epoch": 0.7810541456899209, "grad_norm": 0.04299011081457138, "learning_rate": 0.01, "loss": 2.103, "step": 7602 }, { "epoch": 0.7813623754238159, "grad_norm": 0.060757700353860855, "learning_rate": 0.01, "loss": 2.0923, "step": 7605 }, { "epoch": 0.7816706051577109, "grad_norm": 0.037401244044303894, "learning_rate": 0.01, "loss": 2.1343, "step": 7608 }, { "epoch": 0.7819788348916059, "grad_norm": 0.12264932692050934, "learning_rate": 0.01, "loss": 2.1193, "step": 7611 }, { "epoch": 0.7822870646255009, "grad_norm": 0.052691470831632614, "learning_rate": 0.01, "loss": 2.1097, "step": 7614 }, { "epoch": 0.7825952943593959, "grad_norm": 0.05509025603532791, "learning_rate": 0.01, "loss": 2.1208, "step": 7617 }, { "epoch": 0.7829035240932909, "grad_norm": 0.10352631658315659, "learning_rate": 0.01, "loss": 2.1277, "step": 7620 }, { "epoch": 0.7832117538271859, "grad_norm": 0.05865751951932907, "learning_rate": 0.01, "loss": 2.138, "step": 7623 }, { "epoch": 0.7835199835610809, "grad_norm": 0.09445837885141373, "learning_rate": 0.01, "loss": 2.1131, "step": 7626 }, { "epoch": 0.7838282132949759, "grad_norm": 0.11066542565822601, "learning_rate": 0.01, "loss": 2.1315, "step": 7629 }, { "epoch": 0.7841364430288709, "grad_norm": 0.05489170923829079, "learning_rate": 0.01, "loss": 2.1264, "step": 7632 }, { "epoch": 0.7844446727627659, "grad_norm": 0.06804061681032181, "learning_rate": 0.01, "loss": 2.1491, "step": 7635 }, { "epoch": 0.7847529024966609, "grad_norm": 0.07411237061023712, "learning_rate": 0.01, "loss": 2.126, "step": 7638 }, { "epoch": 0.7850611322305558, "grad_norm": 0.050356972962617874, "learning_rate": 0.01, "loss": 2.1237, "step": 7641 }, { "epoch": 0.7853693619644508, "grad_norm": 0.06125912442803383, "learning_rate": 0.01, "loss": 2.1328, "step": 7644 }, { "epoch": 0.7856775916983458, "grad_norm": 0.05983618274331093, "learning_rate": 0.01, "loss": 2.1152, "step": 7647 }, { "epoch": 0.7859858214322408, "grad_norm": 0.04065684601664543, "learning_rate": 0.01, "loss": 2.1213, "step": 7650 }, { "epoch": 0.7862940511661358, "grad_norm": 0.05535745993256569, "learning_rate": 0.01, "loss": 2.1106, "step": 7653 }, { "epoch": 0.7866022809000308, "grad_norm": 0.09727519005537033, "learning_rate": 0.01, "loss": 2.1202, "step": 7656 }, { "epoch": 0.7869105106339258, "grad_norm": 0.07764584571123123, "learning_rate": 0.01, "loss": 2.1181, "step": 7659 }, { "epoch": 0.7872187403678208, "grad_norm": 0.04933121055364609, "learning_rate": 0.01, "loss": 2.1217, "step": 7662 }, { "epoch": 0.7875269701017158, "grad_norm": 0.12199501693248749, "learning_rate": 0.01, "loss": 2.1412, "step": 7665 }, { "epoch": 0.7878351998356108, "grad_norm": 0.14431309700012207, "learning_rate": 0.01, "loss": 2.1249, "step": 7668 }, { "epoch": 0.7881434295695058, "grad_norm": 0.07583998888731003, "learning_rate": 0.01, "loss": 2.139, "step": 7671 }, { "epoch": 0.7884516593034008, "grad_norm": 0.10426465421915054, "learning_rate": 0.01, "loss": 2.1053, "step": 7674 }, { "epoch": 0.7887598890372958, "grad_norm": 0.06411170959472656, "learning_rate": 0.01, "loss": 2.1105, "step": 7677 }, { "epoch": 0.7890681187711908, "grad_norm": 0.07436025142669678, "learning_rate": 0.01, "loss": 2.1301, "step": 7680 }, { "epoch": 0.7893763485050858, "grad_norm": 0.10409426689147949, "learning_rate": 0.01, "loss": 2.1319, "step": 7683 }, { "epoch": 0.7896845782389807, "grad_norm": 0.05232664570212364, "learning_rate": 0.01, "loss": 2.1458, "step": 7686 }, { "epoch": 0.7899928079728757, "grad_norm": 0.06705309450626373, "learning_rate": 0.01, "loss": 2.1231, "step": 7689 }, { "epoch": 0.7903010377067707, "grad_norm": 0.04422546550631523, "learning_rate": 0.01, "loss": 2.0836, "step": 7692 }, { "epoch": 0.7906092674406657, "grad_norm": 0.04316714033484459, "learning_rate": 0.01, "loss": 2.1117, "step": 7695 }, { "epoch": 0.7909174971745607, "grad_norm": 0.058282140642404556, "learning_rate": 0.01, "loss": 2.0904, "step": 7698 }, { "epoch": 0.7912257269084557, "grad_norm": 0.07676571607589722, "learning_rate": 0.01, "loss": 2.1402, "step": 7701 }, { "epoch": 0.7915339566423507, "grad_norm": 0.07258665561676025, "learning_rate": 0.01, "loss": 2.1458, "step": 7704 }, { "epoch": 0.7918421863762458, "grad_norm": 0.04850257560610771, "learning_rate": 0.01, "loss": 2.0886, "step": 7707 }, { "epoch": 0.7921504161101408, "grad_norm": 0.05658482015132904, "learning_rate": 0.01, "loss": 2.1174, "step": 7710 }, { "epoch": 0.7924586458440358, "grad_norm": 0.06475166231393814, "learning_rate": 0.01, "loss": 2.0995, "step": 7713 }, { "epoch": 0.7927668755779308, "grad_norm": 0.10428962856531143, "learning_rate": 0.01, "loss": 2.109, "step": 7716 }, { "epoch": 0.7930751053118258, "grad_norm": 0.04227283224463463, "learning_rate": 0.01, "loss": 2.1124, "step": 7719 }, { "epoch": 0.7933833350457208, "grad_norm": 0.0594823881983757, "learning_rate": 0.01, "loss": 2.0944, "step": 7722 }, { "epoch": 0.7936915647796158, "grad_norm": 0.08695527911186218, "learning_rate": 0.01, "loss": 2.1077, "step": 7725 }, { "epoch": 0.7939997945135108, "grad_norm": 0.06003952398896217, "learning_rate": 0.01, "loss": 2.088, "step": 7728 }, { "epoch": 0.7943080242474058, "grad_norm": 0.058509476482868195, "learning_rate": 0.01, "loss": 2.1471, "step": 7731 }, { "epoch": 0.7946162539813008, "grad_norm": 0.048057131469249725, "learning_rate": 0.01, "loss": 2.1252, "step": 7734 }, { "epoch": 0.7949244837151958, "grad_norm": 0.11144626140594482, "learning_rate": 0.01, "loss": 2.1209, "step": 7737 }, { "epoch": 0.7952327134490907, "grad_norm": 0.041008081287145615, "learning_rate": 0.01, "loss": 2.1139, "step": 7740 }, { "epoch": 0.7955409431829857, "grad_norm": 0.04088988155126572, "learning_rate": 0.01, "loss": 2.0927, "step": 7743 }, { "epoch": 0.7958491729168807, "grad_norm": 0.1495555192232132, "learning_rate": 0.01, "loss": 2.0977, "step": 7746 }, { "epoch": 0.7961574026507757, "grad_norm": 0.042645204812288284, "learning_rate": 0.01, "loss": 2.1021, "step": 7749 }, { "epoch": 0.7964656323846707, "grad_norm": 0.04671596363186836, "learning_rate": 0.01, "loss": 2.1015, "step": 7752 }, { "epoch": 0.7967738621185657, "grad_norm": 0.07249152660369873, "learning_rate": 0.01, "loss": 2.1278, "step": 7755 }, { "epoch": 0.7970820918524607, "grad_norm": 0.05848756060004234, "learning_rate": 0.01, "loss": 2.1168, "step": 7758 }, { "epoch": 0.7973903215863557, "grad_norm": 0.05428781732916832, "learning_rate": 0.01, "loss": 2.1228, "step": 7761 }, { "epoch": 0.7976985513202507, "grad_norm": 0.04751111939549446, "learning_rate": 0.01, "loss": 2.1178, "step": 7764 }, { "epoch": 0.7980067810541457, "grad_norm": 0.08653240650892258, "learning_rate": 0.01, "loss": 2.1081, "step": 7767 }, { "epoch": 0.7983150107880407, "grad_norm": 0.04038892313838005, "learning_rate": 0.01, "loss": 2.1028, "step": 7770 }, { "epoch": 0.7986232405219357, "grad_norm": 0.05703849345445633, "learning_rate": 0.01, "loss": 2.1249, "step": 7773 }, { "epoch": 0.7989314702558307, "grad_norm": 0.06425055861473083, "learning_rate": 0.01, "loss": 2.1291, "step": 7776 }, { "epoch": 0.7992396999897257, "grad_norm": 0.05537475273013115, "learning_rate": 0.01, "loss": 2.1122, "step": 7779 }, { "epoch": 0.7995479297236207, "grad_norm": 0.05172963812947273, "learning_rate": 0.01, "loss": 2.1218, "step": 7782 }, { "epoch": 0.7998561594575156, "grad_norm": 0.05907023698091507, "learning_rate": 0.01, "loss": 2.1041, "step": 7785 }, { "epoch": 0.8001643891914106, "grad_norm": 0.10618621110916138, "learning_rate": 0.01, "loss": 2.1266, "step": 7788 }, { "epoch": 0.8004726189253056, "grad_norm": 0.06189849600195885, "learning_rate": 0.01, "loss": 2.1327, "step": 7791 }, { "epoch": 0.8007808486592006, "grad_norm": 0.10624901950359344, "learning_rate": 0.01, "loss": 2.0943, "step": 7794 }, { "epoch": 0.8010890783930956, "grad_norm": 0.04061825945973396, "learning_rate": 0.01, "loss": 2.0859, "step": 7797 }, { "epoch": 0.8013973081269906, "grad_norm": 0.04402461647987366, "learning_rate": 0.01, "loss": 2.1303, "step": 7800 }, { "epoch": 0.8017055378608856, "grad_norm": 0.05029004439711571, "learning_rate": 0.01, "loss": 2.1224, "step": 7803 }, { "epoch": 0.8020137675947806, "grad_norm": 0.055786702781915665, "learning_rate": 0.01, "loss": 2.1296, "step": 7806 }, { "epoch": 0.8023219973286756, "grad_norm": 0.11740477383136749, "learning_rate": 0.01, "loss": 2.1222, "step": 7809 }, { "epoch": 0.8026302270625706, "grad_norm": 0.10261218994855881, "learning_rate": 0.01, "loss": 2.1405, "step": 7812 }, { "epoch": 0.8029384567964656, "grad_norm": 0.05233708769083023, "learning_rate": 0.01, "loss": 2.1118, "step": 7815 }, { "epoch": 0.8032466865303606, "grad_norm": 0.04390858858823776, "learning_rate": 0.01, "loss": 2.1299, "step": 7818 }, { "epoch": 0.8035549162642556, "grad_norm": 0.05893026292324066, "learning_rate": 0.01, "loss": 2.1184, "step": 7821 }, { "epoch": 0.8038631459981507, "grad_norm": 0.06398338079452515, "learning_rate": 0.01, "loss": 2.1057, "step": 7824 }, { "epoch": 0.8041713757320457, "grad_norm": 0.07129772752523422, "learning_rate": 0.01, "loss": 2.1056, "step": 7827 }, { "epoch": 0.8044796054659407, "grad_norm": 0.07481534779071808, "learning_rate": 0.01, "loss": 2.1272, "step": 7830 }, { "epoch": 0.8047878351998357, "grad_norm": 0.049200594425201416, "learning_rate": 0.01, "loss": 2.0942, "step": 7833 }, { "epoch": 0.8050960649337306, "grad_norm": 0.05124384164810181, "learning_rate": 0.01, "loss": 2.0859, "step": 7836 }, { "epoch": 0.8054042946676256, "grad_norm": 0.07997792959213257, "learning_rate": 0.01, "loss": 2.1412, "step": 7839 }, { "epoch": 0.8057125244015206, "grad_norm": 0.12280064076185226, "learning_rate": 0.01, "loss": 2.0826, "step": 7842 }, { "epoch": 0.8060207541354156, "grad_norm": 0.05292202904820442, "learning_rate": 0.01, "loss": 2.0965, "step": 7845 }, { "epoch": 0.8063289838693106, "grad_norm": 0.04903187230229378, "learning_rate": 0.01, "loss": 2.0911, "step": 7848 }, { "epoch": 0.8066372136032056, "grad_norm": 0.06882268935441971, "learning_rate": 0.01, "loss": 2.108, "step": 7851 }, { "epoch": 0.8069454433371006, "grad_norm": 0.06937083601951599, "learning_rate": 0.01, "loss": 2.1234, "step": 7854 }, { "epoch": 0.8072536730709956, "grad_norm": 0.10075647383928299, "learning_rate": 0.01, "loss": 2.0983, "step": 7857 }, { "epoch": 0.8075619028048906, "grad_norm": 0.07185733318328857, "learning_rate": 0.01, "loss": 2.0998, "step": 7860 }, { "epoch": 0.8078701325387856, "grad_norm": 0.07266184687614441, "learning_rate": 0.01, "loss": 2.1056, "step": 7863 }, { "epoch": 0.8081783622726806, "grad_norm": 0.05049808695912361, "learning_rate": 0.01, "loss": 2.1126, "step": 7866 }, { "epoch": 0.8084865920065756, "grad_norm": 0.07260838896036148, "learning_rate": 0.01, "loss": 2.1311, "step": 7869 }, { "epoch": 0.8087948217404706, "grad_norm": 0.0659325122833252, "learning_rate": 0.01, "loss": 2.1317, "step": 7872 }, { "epoch": 0.8091030514743656, "grad_norm": 0.056960709393024445, "learning_rate": 0.01, "loss": 2.0988, "step": 7875 }, { "epoch": 0.8094112812082606, "grad_norm": 0.1266620010137558, "learning_rate": 0.01, "loss": 2.1274, "step": 7878 }, { "epoch": 0.8097195109421556, "grad_norm": 0.05951874330639839, "learning_rate": 0.01, "loss": 2.1342, "step": 7881 }, { "epoch": 0.8100277406760505, "grad_norm": 0.06081915274262428, "learning_rate": 0.01, "loss": 2.1036, "step": 7884 }, { "epoch": 0.8103359704099455, "grad_norm": 0.07136547565460205, "learning_rate": 0.01, "loss": 2.1067, "step": 7887 }, { "epoch": 0.8106442001438405, "grad_norm": 0.08835722506046295, "learning_rate": 0.01, "loss": 2.1123, "step": 7890 }, { "epoch": 0.8109524298777355, "grad_norm": 0.04469553008675575, "learning_rate": 0.01, "loss": 2.1117, "step": 7893 }, { "epoch": 0.8112606596116305, "grad_norm": 0.042171087116003036, "learning_rate": 0.01, "loss": 2.0875, "step": 7896 }, { "epoch": 0.8115688893455255, "grad_norm": 0.0847015529870987, "learning_rate": 0.01, "loss": 2.0998, "step": 7899 }, { "epoch": 0.8118771190794205, "grad_norm": 0.09157509356737137, "learning_rate": 0.01, "loss": 2.121, "step": 7902 }, { "epoch": 0.8121853488133155, "grad_norm": 0.06001126766204834, "learning_rate": 0.01, "loss": 2.1441, "step": 7905 }, { "epoch": 0.8124935785472105, "grad_norm": 0.03552449122071266, "learning_rate": 0.01, "loss": 2.1371, "step": 7908 }, { "epoch": 0.8128018082811055, "grad_norm": 0.034304428845644, "learning_rate": 0.01, "loss": 2.1066, "step": 7911 }, { "epoch": 0.8131100380150005, "grad_norm": 0.04897907376289368, "learning_rate": 0.01, "loss": 2.1054, "step": 7914 }, { "epoch": 0.8134182677488955, "grad_norm": 0.06674344837665558, "learning_rate": 0.01, "loss": 2.1156, "step": 7917 }, { "epoch": 0.8137264974827905, "grad_norm": 0.06437379866838455, "learning_rate": 0.01, "loss": 2.1189, "step": 7920 }, { "epoch": 0.8140347272166855, "grad_norm": 0.06402087956666946, "learning_rate": 0.01, "loss": 2.1111, "step": 7923 }, { "epoch": 0.8143429569505805, "grad_norm": 0.11063557863235474, "learning_rate": 0.01, "loss": 2.1131, "step": 7926 }, { "epoch": 0.8146511866844754, "grad_norm": 0.10625256597995758, "learning_rate": 0.01, "loss": 2.113, "step": 7929 }, { "epoch": 0.8149594164183704, "grad_norm": 0.0682268813252449, "learning_rate": 0.01, "loss": 2.0929, "step": 7932 }, { "epoch": 0.8152676461522654, "grad_norm": 0.08721883594989777, "learning_rate": 0.01, "loss": 2.0878, "step": 7935 }, { "epoch": 0.8155758758861605, "grad_norm": 0.07372716814279556, "learning_rate": 0.01, "loss": 2.1173, "step": 7938 }, { "epoch": 0.8158841056200555, "grad_norm": 0.049299102276563644, "learning_rate": 0.01, "loss": 2.1172, "step": 7941 }, { "epoch": 0.8161923353539505, "grad_norm": 0.06552339345216751, "learning_rate": 0.01, "loss": 2.1035, "step": 7944 }, { "epoch": 0.8165005650878455, "grad_norm": 0.08362871408462524, "learning_rate": 0.01, "loss": 2.0942, "step": 7947 }, { "epoch": 0.8168087948217405, "grad_norm": 0.07610680162906647, "learning_rate": 0.01, "loss": 2.1026, "step": 7950 }, { "epoch": 0.8171170245556355, "grad_norm": 0.058830149471759796, "learning_rate": 0.01, "loss": 2.121, "step": 7953 }, { "epoch": 0.8174252542895305, "grad_norm": 0.10281010717153549, "learning_rate": 0.01, "loss": 2.1084, "step": 7956 }, { "epoch": 0.8177334840234255, "grad_norm": 0.04509102553129196, "learning_rate": 0.01, "loss": 2.0917, "step": 7959 }, { "epoch": 0.8180417137573205, "grad_norm": 0.034059979021549225, "learning_rate": 0.01, "loss": 2.1286, "step": 7962 }, { "epoch": 0.8183499434912155, "grad_norm": 0.09370562434196472, "learning_rate": 0.01, "loss": 2.1298, "step": 7965 }, { "epoch": 0.8186581732251105, "grad_norm": 0.09386254847049713, "learning_rate": 0.01, "loss": 2.1018, "step": 7968 }, { "epoch": 0.8189664029590055, "grad_norm": 0.1801362931728363, "learning_rate": 0.01, "loss": 2.1125, "step": 7971 }, { "epoch": 0.8192746326929005, "grad_norm": 0.12590090930461884, "learning_rate": 0.01, "loss": 2.1145, "step": 7974 }, { "epoch": 0.8195828624267955, "grad_norm": 0.09913074970245361, "learning_rate": 0.01, "loss": 2.1049, "step": 7977 }, { "epoch": 0.8198910921606904, "grad_norm": 0.05249069631099701, "learning_rate": 0.01, "loss": 2.1348, "step": 7980 }, { "epoch": 0.8201993218945854, "grad_norm": 0.05334639549255371, "learning_rate": 0.01, "loss": 2.0952, "step": 7983 }, { "epoch": 0.8205075516284804, "grad_norm": 0.03963373601436615, "learning_rate": 0.01, "loss": 2.1133, "step": 7986 }, { "epoch": 0.8208157813623754, "grad_norm": 0.03334924206137657, "learning_rate": 0.01, "loss": 2.1156, "step": 7989 }, { "epoch": 0.8211240110962704, "grad_norm": 0.0628419816493988, "learning_rate": 0.01, "loss": 2.1298, "step": 7992 }, { "epoch": 0.8214322408301654, "grad_norm": 0.07143758237361908, "learning_rate": 0.01, "loss": 2.0891, "step": 7995 }, { "epoch": 0.8217404705640604, "grad_norm": 0.06662650406360626, "learning_rate": 0.01, "loss": 2.0976, "step": 7998 }, { "epoch": 0.8220487002979554, "grad_norm": 0.10575726628303528, "learning_rate": 0.01, "loss": 2.0946, "step": 8001 }, { "epoch": 0.8223569300318504, "grad_norm": 0.056455157697200775, "learning_rate": 0.01, "loss": 2.0924, "step": 8004 }, { "epoch": 0.8226651597657454, "grad_norm": 0.10326797515153885, "learning_rate": 0.01, "loss": 2.0823, "step": 8007 }, { "epoch": 0.8229733894996404, "grad_norm": 0.08464314043521881, "learning_rate": 0.01, "loss": 2.1274, "step": 8010 }, { "epoch": 0.8232816192335354, "grad_norm": 0.052144117653369904, "learning_rate": 0.01, "loss": 2.0952, "step": 8013 }, { "epoch": 0.8235898489674304, "grad_norm": 0.05464213341474533, "learning_rate": 0.01, "loss": 2.1117, "step": 8016 }, { "epoch": 0.8238980787013254, "grad_norm": 0.06700276583433151, "learning_rate": 0.01, "loss": 2.1289, "step": 8019 }, { "epoch": 0.8242063084352204, "grad_norm": 0.05322539806365967, "learning_rate": 0.01, "loss": 2.1346, "step": 8022 }, { "epoch": 0.8245145381691154, "grad_norm": 0.040953267365694046, "learning_rate": 0.01, "loss": 2.1035, "step": 8025 }, { "epoch": 0.8248227679030103, "grad_norm": 0.043644580990076065, "learning_rate": 0.01, "loss": 2.1238, "step": 8028 }, { "epoch": 0.8251309976369053, "grad_norm": 0.060951683670282364, "learning_rate": 0.01, "loss": 2.1169, "step": 8031 }, { "epoch": 0.8254392273708003, "grad_norm": 0.11269400268793106, "learning_rate": 0.01, "loss": 2.1184, "step": 8034 }, { "epoch": 0.8257474571046953, "grad_norm": 0.05554080754518509, "learning_rate": 0.01, "loss": 2.112, "step": 8037 }, { "epoch": 0.8260556868385903, "grad_norm": 0.08155755698680878, "learning_rate": 0.01, "loss": 2.096, "step": 8040 }, { "epoch": 0.8263639165724853, "grad_norm": 0.0661015510559082, "learning_rate": 0.01, "loss": 2.0943, "step": 8043 }, { "epoch": 0.8266721463063803, "grad_norm": 0.06186169385910034, "learning_rate": 0.01, "loss": 2.1225, "step": 8046 }, { "epoch": 0.8269803760402753, "grad_norm": 0.06658541411161423, "learning_rate": 0.01, "loss": 2.1189, "step": 8049 }, { "epoch": 0.8272886057741703, "grad_norm": 0.09120085090398788, "learning_rate": 0.01, "loss": 2.1181, "step": 8052 }, { "epoch": 0.8275968355080654, "grad_norm": 0.059662993997335434, "learning_rate": 0.01, "loss": 2.1257, "step": 8055 }, { "epoch": 0.8279050652419604, "grad_norm": 0.08305416256189346, "learning_rate": 0.01, "loss": 2.1108, "step": 8058 }, { "epoch": 0.8282132949758554, "grad_norm": 0.047110967338085175, "learning_rate": 0.01, "loss": 2.0786, "step": 8061 }, { "epoch": 0.8285215247097504, "grad_norm": 0.042120445519685745, "learning_rate": 0.01, "loss": 2.1081, "step": 8064 }, { "epoch": 0.8288297544436454, "grad_norm": 0.04596862941980362, "learning_rate": 0.01, "loss": 2.1025, "step": 8067 }, { "epoch": 0.8291379841775404, "grad_norm": 0.055216096341609955, "learning_rate": 0.01, "loss": 2.1416, "step": 8070 }, { "epoch": 0.8294462139114354, "grad_norm": 0.05959683656692505, "learning_rate": 0.01, "loss": 2.0832, "step": 8073 }, { "epoch": 0.8297544436453304, "grad_norm": 0.045481909066438675, "learning_rate": 0.01, "loss": 2.1391, "step": 8076 }, { "epoch": 0.8300626733792253, "grad_norm": 0.049282100051641464, "learning_rate": 0.01, "loss": 2.1227, "step": 8079 }, { "epoch": 0.8303709031131203, "grad_norm": 0.058084890246391296, "learning_rate": 0.01, "loss": 2.1211, "step": 8082 }, { "epoch": 0.8306791328470153, "grad_norm": 0.11113768070936203, "learning_rate": 0.01, "loss": 2.132, "step": 8085 }, { "epoch": 0.8309873625809103, "grad_norm": 0.07015852630138397, "learning_rate": 0.01, "loss": 2.0988, "step": 8088 }, { "epoch": 0.8312955923148053, "grad_norm": 0.09509722143411636, "learning_rate": 0.01, "loss": 2.1064, "step": 8091 }, { "epoch": 0.8316038220487003, "grad_norm": 0.03616593778133392, "learning_rate": 0.01, "loss": 2.1078, "step": 8094 }, { "epoch": 0.8319120517825953, "grad_norm": 0.0486396960914135, "learning_rate": 0.01, "loss": 2.1279, "step": 8097 }, { "epoch": 0.8322202815164903, "grad_norm": 0.050647489726543427, "learning_rate": 0.01, "loss": 2.0808, "step": 8100 }, { "epoch": 0.8325285112503853, "grad_norm": 0.08125802874565125, "learning_rate": 0.01, "loss": 2.1099, "step": 8103 }, { "epoch": 0.8328367409842803, "grad_norm": 0.08078313618898392, "learning_rate": 0.01, "loss": 2.1162, "step": 8106 }, { "epoch": 0.8331449707181753, "grad_norm": 0.06157573312520981, "learning_rate": 0.01, "loss": 2.1288, "step": 8109 }, { "epoch": 0.8334532004520703, "grad_norm": 0.057771824300289154, "learning_rate": 0.01, "loss": 2.1136, "step": 8112 }, { "epoch": 0.8337614301859653, "grad_norm": 0.06634260714054108, "learning_rate": 0.01, "loss": 2.1029, "step": 8115 }, { "epoch": 0.8340696599198603, "grad_norm": 0.12186034023761749, "learning_rate": 0.01, "loss": 2.1236, "step": 8118 }, { "epoch": 0.8343778896537553, "grad_norm": 0.03940106928348541, "learning_rate": 0.01, "loss": 2.1169, "step": 8121 }, { "epoch": 0.8346861193876502, "grad_norm": 0.06003478914499283, "learning_rate": 0.01, "loss": 2.1108, "step": 8124 }, { "epoch": 0.8349943491215452, "grad_norm": 0.04963524639606476, "learning_rate": 0.01, "loss": 2.0893, "step": 8127 }, { "epoch": 0.8353025788554402, "grad_norm": 0.04543556645512581, "learning_rate": 0.01, "loss": 2.1062, "step": 8130 }, { "epoch": 0.8356108085893352, "grad_norm": 0.05210501328110695, "learning_rate": 0.01, "loss": 2.1065, "step": 8133 }, { "epoch": 0.8359190383232302, "grad_norm": 0.10866094380617142, "learning_rate": 0.01, "loss": 2.1154, "step": 8136 }, { "epoch": 0.8362272680571252, "grad_norm": 0.07595928758382797, "learning_rate": 0.01, "loss": 2.1079, "step": 8139 }, { "epoch": 0.8365354977910202, "grad_norm": 0.04948664829134941, "learning_rate": 0.01, "loss": 2.0955, "step": 8142 }, { "epoch": 0.8368437275249152, "grad_norm": 0.11719872057437897, "learning_rate": 0.01, "loss": 2.1106, "step": 8145 }, { "epoch": 0.8371519572588102, "grad_norm": 0.04469067603349686, "learning_rate": 0.01, "loss": 2.1145, "step": 8148 }, { "epoch": 0.8374601869927052, "grad_norm": 0.038385387510061264, "learning_rate": 0.01, "loss": 2.0933, "step": 8151 }, { "epoch": 0.8377684167266002, "grad_norm": 0.04040665924549103, "learning_rate": 0.01, "loss": 2.1119, "step": 8154 }, { "epoch": 0.8380766464604952, "grad_norm": 0.042900413274765015, "learning_rate": 0.01, "loss": 2.1053, "step": 8157 }, { "epoch": 0.8383848761943902, "grad_norm": 0.06709393113851547, "learning_rate": 0.01, "loss": 2.0858, "step": 8160 }, { "epoch": 0.8386931059282852, "grad_norm": 0.08388926833868027, "learning_rate": 0.01, "loss": 2.1287, "step": 8163 }, { "epoch": 0.8390013356621802, "grad_norm": 0.0701015368103981, "learning_rate": 0.01, "loss": 2.0965, "step": 8166 }, { "epoch": 0.8393095653960753, "grad_norm": 0.0841115415096283, "learning_rate": 0.01, "loss": 2.1136, "step": 8169 }, { "epoch": 0.8396177951299703, "grad_norm": 0.08542285114526749, "learning_rate": 0.01, "loss": 2.1166, "step": 8172 }, { "epoch": 0.8399260248638653, "grad_norm": 0.046626705676317215, "learning_rate": 0.01, "loss": 2.1318, "step": 8175 }, { "epoch": 0.8402342545977602, "grad_norm": 0.08752947300672531, "learning_rate": 0.01, "loss": 2.0873, "step": 8178 }, { "epoch": 0.8405424843316552, "grad_norm": 0.04728331416845322, "learning_rate": 0.01, "loss": 2.0951, "step": 8181 }, { "epoch": 0.8408507140655502, "grad_norm": 0.04881293699145317, "learning_rate": 0.01, "loss": 2.1011, "step": 8184 }, { "epoch": 0.8411589437994452, "grad_norm": 0.049758728593587875, "learning_rate": 0.01, "loss": 2.1206, "step": 8187 }, { "epoch": 0.8414671735333402, "grad_norm": 0.037589117884635925, "learning_rate": 0.01, "loss": 2.085, "step": 8190 }, { "epoch": 0.8417754032672352, "grad_norm": 0.11785265803337097, "learning_rate": 0.01, "loss": 2.1195, "step": 8193 }, { "epoch": 0.8420836330011302, "grad_norm": 0.09815037995576859, "learning_rate": 0.01, "loss": 2.065, "step": 8196 }, { "epoch": 0.8423918627350252, "grad_norm": 0.07950727641582489, "learning_rate": 0.01, "loss": 2.1081, "step": 8199 }, { "epoch": 0.8427000924689202, "grad_norm": 0.04057254642248154, "learning_rate": 0.01, "loss": 2.0882, "step": 8202 }, { "epoch": 0.8430083222028152, "grad_norm": 0.07260222733020782, "learning_rate": 0.01, "loss": 2.1018, "step": 8205 }, { "epoch": 0.8433165519367102, "grad_norm": 0.057693734765052795, "learning_rate": 0.01, "loss": 2.112, "step": 8208 }, { "epoch": 0.8436247816706052, "grad_norm": 0.039680637419223785, "learning_rate": 0.01, "loss": 2.0974, "step": 8211 }, { "epoch": 0.8439330114045002, "grad_norm": 0.07584577798843384, "learning_rate": 0.01, "loss": 2.129, "step": 8214 }, { "epoch": 0.8442412411383952, "grad_norm": 0.044016819447278976, "learning_rate": 0.01, "loss": 2.1139, "step": 8217 }, { "epoch": 0.8445494708722902, "grad_norm": 0.04449582099914551, "learning_rate": 0.01, "loss": 2.1085, "step": 8220 }, { "epoch": 0.8448577006061851, "grad_norm": 0.044676005840301514, "learning_rate": 0.01, "loss": 2.1081, "step": 8223 }, { "epoch": 0.8451659303400801, "grad_norm": 0.04926025867462158, "learning_rate": 0.01, "loss": 2.1187, "step": 8226 }, { "epoch": 0.8454741600739751, "grad_norm": 0.10964366793632507, "learning_rate": 0.01, "loss": 2.0898, "step": 8229 }, { "epoch": 0.8457823898078701, "grad_norm": 0.09405852109193802, "learning_rate": 0.01, "loss": 2.1117, "step": 8232 }, { "epoch": 0.8460906195417651, "grad_norm": 0.09241268038749695, "learning_rate": 0.01, "loss": 2.1225, "step": 8235 }, { "epoch": 0.8463988492756601, "grad_norm": 0.05887102335691452, "learning_rate": 0.01, "loss": 2.0944, "step": 8238 }, { "epoch": 0.8467070790095551, "grad_norm": 0.13513131439685822, "learning_rate": 0.01, "loss": 2.1217, "step": 8241 }, { "epoch": 0.8470153087434501, "grad_norm": 0.06370443850755692, "learning_rate": 0.01, "loss": 2.121, "step": 8244 }, { "epoch": 0.8473235384773451, "grad_norm": 0.0426030196249485, "learning_rate": 0.01, "loss": 2.0937, "step": 8247 }, { "epoch": 0.8476317682112401, "grad_norm": 0.049412764608860016, "learning_rate": 0.01, "loss": 2.108, "step": 8250 }, { "epoch": 0.8479399979451351, "grad_norm": 0.061407607048749924, "learning_rate": 0.01, "loss": 2.1009, "step": 8253 }, { "epoch": 0.8482482276790301, "grad_norm": 0.12416908144950867, "learning_rate": 0.01, "loss": 2.0819, "step": 8256 }, { "epoch": 0.8485564574129251, "grad_norm": 0.05728744715452194, "learning_rate": 0.01, "loss": 2.1132, "step": 8259 }, { "epoch": 0.8488646871468201, "grad_norm": 0.06724981963634491, "learning_rate": 0.01, "loss": 2.0932, "step": 8262 }, { "epoch": 0.8491729168807151, "grad_norm": 0.055260930210351944, "learning_rate": 0.01, "loss": 2.0933, "step": 8265 }, { "epoch": 0.84948114661461, "grad_norm": 0.04230106249451637, "learning_rate": 0.01, "loss": 2.1135, "step": 8268 }, { "epoch": 0.849789376348505, "grad_norm": 0.04593104496598244, "learning_rate": 0.01, "loss": 2.1091, "step": 8271 }, { "epoch": 0.8500976060824, "grad_norm": 0.09625285863876343, "learning_rate": 0.01, "loss": 2.0946, "step": 8274 }, { "epoch": 0.850405835816295, "grad_norm": 0.04556501284241676, "learning_rate": 0.01, "loss": 2.0975, "step": 8277 }, { "epoch": 0.85071406555019, "grad_norm": 0.09413543343544006, "learning_rate": 0.01, "loss": 2.0863, "step": 8280 }, { "epoch": 0.851022295284085, "grad_norm": 0.08400101214647293, "learning_rate": 0.01, "loss": 2.1045, "step": 8283 }, { "epoch": 0.8513305250179801, "grad_norm": 0.06278138607740402, "learning_rate": 0.01, "loss": 2.116, "step": 8286 }, { "epoch": 0.8516387547518751, "grad_norm": 0.04442959651350975, "learning_rate": 0.01, "loss": 2.0796, "step": 8289 }, { "epoch": 0.8519469844857701, "grad_norm": 0.045807912945747375, "learning_rate": 0.01, "loss": 2.0823, "step": 8292 }, { "epoch": 0.8522552142196651, "grad_norm": 0.0426551029086113, "learning_rate": 0.01, "loss": 2.1109, "step": 8295 }, { "epoch": 0.8525634439535601, "grad_norm": 0.12200357019901276, "learning_rate": 0.01, "loss": 2.1146, "step": 8298 }, { "epoch": 0.8528716736874551, "grad_norm": 0.04152747616171837, "learning_rate": 0.01, "loss": 2.1204, "step": 8301 }, { "epoch": 0.8531799034213501, "grad_norm": 0.08464021235704422, "learning_rate": 0.01, "loss": 2.085, "step": 8304 }, { "epoch": 0.8534881331552451, "grad_norm": 0.050391390919685364, "learning_rate": 0.01, "loss": 2.0957, "step": 8307 }, { "epoch": 0.8537963628891401, "grad_norm": 0.08581732958555222, "learning_rate": 0.01, "loss": 2.1015, "step": 8310 }, { "epoch": 0.8541045926230351, "grad_norm": 0.10992308706045151, "learning_rate": 0.01, "loss": 2.0939, "step": 8313 }, { "epoch": 0.8544128223569301, "grad_norm": 0.053225912153720856, "learning_rate": 0.01, "loss": 2.1106, "step": 8316 }, { "epoch": 0.854721052090825, "grad_norm": 0.06759096682071686, "learning_rate": 0.01, "loss": 2.1035, "step": 8319 }, { "epoch": 0.85502928182472, "grad_norm": 0.058069922029972076, "learning_rate": 0.01, "loss": 2.0833, "step": 8322 }, { "epoch": 0.855337511558615, "grad_norm": 0.0657680481672287, "learning_rate": 0.01, "loss": 2.0918, "step": 8325 }, { "epoch": 0.85564574129251, "grad_norm": 0.1428556591272354, "learning_rate": 0.01, "loss": 2.0895, "step": 8328 }, { "epoch": 0.855953971026405, "grad_norm": 0.10311869531869888, "learning_rate": 0.01, "loss": 2.1041, "step": 8331 }, { "epoch": 0.8562622007603, "grad_norm": 0.12024179100990295, "learning_rate": 0.01, "loss": 2.1179, "step": 8334 }, { "epoch": 0.856570430494195, "grad_norm": 0.08294446766376495, "learning_rate": 0.01, "loss": 2.1042, "step": 8337 }, { "epoch": 0.85687866022809, "grad_norm": 0.05203935503959656, "learning_rate": 0.01, "loss": 2.1275, "step": 8340 }, { "epoch": 0.857186889961985, "grad_norm": 0.061564356088638306, "learning_rate": 0.01, "loss": 2.0951, "step": 8343 }, { "epoch": 0.85749511969588, "grad_norm": 0.043616339564323425, "learning_rate": 0.01, "loss": 2.1082, "step": 8346 }, { "epoch": 0.857803349429775, "grad_norm": 0.0885004997253418, "learning_rate": 0.01, "loss": 2.1071, "step": 8349 }, { "epoch": 0.85811157916367, "grad_norm": 0.06275481730699539, "learning_rate": 0.01, "loss": 2.137, "step": 8352 }, { "epoch": 0.858419808897565, "grad_norm": 0.054776523262262344, "learning_rate": 0.01, "loss": 2.1117, "step": 8355 }, { "epoch": 0.85872803863146, "grad_norm": 0.07782801240682602, "learning_rate": 0.01, "loss": 2.0822, "step": 8358 }, { "epoch": 0.859036268365355, "grad_norm": 0.12301263958215714, "learning_rate": 0.01, "loss": 2.1126, "step": 8361 }, { "epoch": 0.85934449809925, "grad_norm": 0.07181745767593384, "learning_rate": 0.01, "loss": 2.1359, "step": 8364 }, { "epoch": 0.859652727833145, "grad_norm": 0.07232604175806046, "learning_rate": 0.01, "loss": 2.0849, "step": 8367 }, { "epoch": 0.8599609575670399, "grad_norm": 0.06810937821865082, "learning_rate": 0.01, "loss": 2.1088, "step": 8370 }, { "epoch": 0.8602691873009349, "grad_norm": 0.048163384199142456, "learning_rate": 0.01, "loss": 2.1173, "step": 8373 }, { "epoch": 0.8605774170348299, "grad_norm": 0.05688156560063362, "learning_rate": 0.01, "loss": 2.114, "step": 8376 }, { "epoch": 0.8608856467687249, "grad_norm": 0.065540611743927, "learning_rate": 0.01, "loss": 2.0989, "step": 8379 }, { "epoch": 0.8611938765026199, "grad_norm": 0.09561596065759659, "learning_rate": 0.01, "loss": 2.0894, "step": 8382 }, { "epoch": 0.8615021062365149, "grad_norm": 0.06719313561916351, "learning_rate": 0.01, "loss": 2.0928, "step": 8385 }, { "epoch": 0.8618103359704099, "grad_norm": 0.05895761027932167, "learning_rate": 0.01, "loss": 2.1037, "step": 8388 }, { "epoch": 0.8621185657043049, "grad_norm": 0.09232669323682785, "learning_rate": 0.01, "loss": 2.1272, "step": 8391 }, { "epoch": 0.8624267954381999, "grad_norm": 0.06715840846300125, "learning_rate": 0.01, "loss": 2.072, "step": 8394 }, { "epoch": 0.8627350251720949, "grad_norm": 0.04794420674443245, "learning_rate": 0.01, "loss": 2.1087, "step": 8397 }, { "epoch": 0.8630432549059899, "grad_norm": 0.037383124232292175, "learning_rate": 0.01, "loss": 2.0761, "step": 8400 }, { "epoch": 0.863351484639885, "grad_norm": 0.05601905286312103, "learning_rate": 0.01, "loss": 2.0926, "step": 8403 }, { "epoch": 0.86365971437378, "grad_norm": 0.0839313194155693, "learning_rate": 0.01, "loss": 2.0887, "step": 8406 }, { "epoch": 0.863967944107675, "grad_norm": 0.07600929588079453, "learning_rate": 0.01, "loss": 2.1143, "step": 8409 }, { "epoch": 0.86427617384157, "grad_norm": 0.06851659715175629, "learning_rate": 0.01, "loss": 2.0921, "step": 8412 }, { "epoch": 0.864584403575465, "grad_norm": 0.05021858587861061, "learning_rate": 0.01, "loss": 2.0903, "step": 8415 }, { "epoch": 0.86489263330936, "grad_norm": 0.04881426692008972, "learning_rate": 0.01, "loss": 2.1047, "step": 8418 }, { "epoch": 0.8652008630432549, "grad_norm": 0.04262546822428703, "learning_rate": 0.01, "loss": 2.0852, "step": 8421 }, { "epoch": 0.8655090927771499, "grad_norm": 0.050467535853385925, "learning_rate": 0.01, "loss": 2.1045, "step": 8424 }, { "epoch": 0.8658173225110449, "grad_norm": 0.0725008100271225, "learning_rate": 0.01, "loss": 2.077, "step": 8427 }, { "epoch": 0.8661255522449399, "grad_norm": 0.07234456390142441, "learning_rate": 0.01, "loss": 2.13, "step": 8430 }, { "epoch": 0.8664337819788349, "grad_norm": 0.060751501470804214, "learning_rate": 0.01, "loss": 2.0948, "step": 8433 }, { "epoch": 0.8667420117127299, "grad_norm": 0.058911584317684174, "learning_rate": 0.01, "loss": 2.0908, "step": 8436 }, { "epoch": 0.8670502414466249, "grad_norm": 0.08380532264709473, "learning_rate": 0.01, "loss": 2.1438, "step": 8439 }, { "epoch": 0.8673584711805199, "grad_norm": 0.058240536600351334, "learning_rate": 0.01, "loss": 2.1384, "step": 8442 }, { "epoch": 0.8676667009144149, "grad_norm": 0.0422792062163353, "learning_rate": 0.01, "loss": 2.0926, "step": 8445 }, { "epoch": 0.8679749306483099, "grad_norm": 0.07096652686595917, "learning_rate": 0.01, "loss": 2.1195, "step": 8448 }, { "epoch": 0.8682831603822049, "grad_norm": 0.13370642066001892, "learning_rate": 0.01, "loss": 2.1367, "step": 8451 }, { "epoch": 0.8685913901160999, "grad_norm": 0.0597628615796566, "learning_rate": 0.01, "loss": 2.087, "step": 8454 }, { "epoch": 0.8688996198499949, "grad_norm": 0.039561979472637177, "learning_rate": 0.01, "loss": 2.112, "step": 8457 }, { "epoch": 0.8692078495838899, "grad_norm": 0.04080485925078392, "learning_rate": 0.01, "loss": 2.1024, "step": 8460 }, { "epoch": 0.8695160793177849, "grad_norm": 0.05293022468686104, "learning_rate": 0.01, "loss": 2.0731, "step": 8463 }, { "epoch": 0.8698243090516798, "grad_norm": 0.06960830092430115, "learning_rate": 0.01, "loss": 2.1255, "step": 8466 }, { "epoch": 0.8701325387855748, "grad_norm": 0.09768849611282349, "learning_rate": 0.01, "loss": 2.1217, "step": 8469 }, { "epoch": 0.8704407685194698, "grad_norm": 0.11970885097980499, "learning_rate": 0.01, "loss": 2.0932, "step": 8472 }, { "epoch": 0.8707489982533648, "grad_norm": 0.12014521658420563, "learning_rate": 0.01, "loss": 2.1009, "step": 8475 }, { "epoch": 0.8710572279872598, "grad_norm": 0.04288540408015251, "learning_rate": 0.01, "loss": 2.1111, "step": 8478 }, { "epoch": 0.8713654577211548, "grad_norm": 0.033004507422447205, "learning_rate": 0.01, "loss": 2.1029, "step": 8481 }, { "epoch": 0.8716736874550498, "grad_norm": 0.03685779869556427, "learning_rate": 0.01, "loss": 2.1077, "step": 8484 }, { "epoch": 0.8719819171889448, "grad_norm": 0.06450948119163513, "learning_rate": 0.01, "loss": 2.102, "step": 8487 }, { "epoch": 0.8722901469228398, "grad_norm": 0.04806706681847572, "learning_rate": 0.01, "loss": 2.1056, "step": 8490 }, { "epoch": 0.8725983766567348, "grad_norm": 0.05847964435815811, "learning_rate": 0.01, "loss": 2.095, "step": 8493 }, { "epoch": 0.8729066063906298, "grad_norm": 0.11569567024707794, "learning_rate": 0.01, "loss": 2.1058, "step": 8496 }, { "epoch": 0.8732148361245248, "grad_norm": 0.04440119490027428, "learning_rate": 0.01, "loss": 2.1127, "step": 8499 }, { "epoch": 0.8735230658584198, "grad_norm": 0.13856938481330872, "learning_rate": 0.01, "loss": 2.1072, "step": 8502 }, { "epoch": 0.8738312955923148, "grad_norm": 0.06448937207460403, "learning_rate": 0.01, "loss": 2.0813, "step": 8505 }, { "epoch": 0.8741395253262098, "grad_norm": 0.05872811749577522, "learning_rate": 0.01, "loss": 2.1227, "step": 8508 }, { "epoch": 0.8744477550601047, "grad_norm": 0.06387540698051453, "learning_rate": 0.01, "loss": 2.099, "step": 8511 }, { "epoch": 0.8747559847939997, "grad_norm": 0.044399481266736984, "learning_rate": 0.01, "loss": 2.0989, "step": 8514 }, { "epoch": 0.8750642145278948, "grad_norm": 0.118850938975811, "learning_rate": 0.01, "loss": 2.1261, "step": 8517 }, { "epoch": 0.8753724442617898, "grad_norm": 0.05479248985648155, "learning_rate": 0.01, "loss": 2.0701, "step": 8520 }, { "epoch": 0.8756806739956848, "grad_norm": 0.06442543119192123, "learning_rate": 0.01, "loss": 2.0844, "step": 8523 }, { "epoch": 0.8759889037295798, "grad_norm": 0.054294027388095856, "learning_rate": 0.01, "loss": 2.1051, "step": 8526 }, { "epoch": 0.8762971334634748, "grad_norm": 0.04776893928647041, "learning_rate": 0.01, "loss": 2.1056, "step": 8529 }, { "epoch": 0.8766053631973698, "grad_norm": 0.06740310043096542, "learning_rate": 0.01, "loss": 2.0956, "step": 8532 }, { "epoch": 0.8769135929312648, "grad_norm": 0.048034511506557465, "learning_rate": 0.01, "loss": 2.1223, "step": 8535 }, { "epoch": 0.8772218226651598, "grad_norm": 0.05819391459226608, "learning_rate": 0.01, "loss": 2.1133, "step": 8538 }, { "epoch": 0.8775300523990548, "grad_norm": 0.06093437224626541, "learning_rate": 0.01, "loss": 2.0889, "step": 8541 }, { "epoch": 0.8778382821329498, "grad_norm": 0.04628787562251091, "learning_rate": 0.01, "loss": 2.1202, "step": 8544 }, { "epoch": 0.8781465118668448, "grad_norm": 0.0903085321187973, "learning_rate": 0.01, "loss": 2.0495, "step": 8547 }, { "epoch": 0.8784547416007398, "grad_norm": 0.06924945116043091, "learning_rate": 0.01, "loss": 2.1004, "step": 8550 }, { "epoch": 0.8787629713346348, "grad_norm": 0.04104374721646309, "learning_rate": 0.01, "loss": 2.0954, "step": 8553 }, { "epoch": 0.8790712010685298, "grad_norm": 0.11671441793441772, "learning_rate": 0.01, "loss": 2.1027, "step": 8556 }, { "epoch": 0.8793794308024248, "grad_norm": 0.10247964411973953, "learning_rate": 0.01, "loss": 2.0861, "step": 8559 }, { "epoch": 0.8796876605363197, "grad_norm": 0.03979288041591644, "learning_rate": 0.01, "loss": 2.1307, "step": 8562 }, { "epoch": 0.8799958902702147, "grad_norm": 0.0406351312994957, "learning_rate": 0.01, "loss": 2.0868, "step": 8565 }, { "epoch": 0.8803041200041097, "grad_norm": 0.04127006232738495, "learning_rate": 0.01, "loss": 2.0899, "step": 8568 }, { "epoch": 0.8806123497380047, "grad_norm": 0.04559047520160675, "learning_rate": 0.01, "loss": 2.1071, "step": 8571 }, { "epoch": 0.8809205794718997, "grad_norm": 0.12507610023021698, "learning_rate": 0.01, "loss": 2.0944, "step": 8574 }, { "epoch": 0.8812288092057947, "grad_norm": 0.042683400213718414, "learning_rate": 0.01, "loss": 2.078, "step": 8577 }, { "epoch": 0.8815370389396897, "grad_norm": 0.04022818058729172, "learning_rate": 0.01, "loss": 2.0797, "step": 8580 }, { "epoch": 0.8818452686735847, "grad_norm": 0.0382862351834774, "learning_rate": 0.01, "loss": 2.0859, "step": 8583 }, { "epoch": 0.8821534984074797, "grad_norm": 0.05260771885514259, "learning_rate": 0.01, "loss": 2.0832, "step": 8586 }, { "epoch": 0.8824617281413747, "grad_norm": 0.05381648615002632, "learning_rate": 0.01, "loss": 2.1211, "step": 8589 }, { "epoch": 0.8827699578752697, "grad_norm": 0.055818814784288406, "learning_rate": 0.01, "loss": 2.1108, "step": 8592 }, { "epoch": 0.8830781876091647, "grad_norm": 0.16680215299129486, "learning_rate": 0.01, "loss": 2.0961, "step": 8595 }, { "epoch": 0.8833864173430597, "grad_norm": 0.10034742951393127, "learning_rate": 0.01, "loss": 2.1187, "step": 8598 }, { "epoch": 0.8836946470769547, "grad_norm": 0.0827341303229332, "learning_rate": 0.01, "loss": 2.1112, "step": 8601 }, { "epoch": 0.8840028768108497, "grad_norm": 0.07657956331968307, "learning_rate": 0.01, "loss": 2.0711, "step": 8604 }, { "epoch": 0.8843111065447447, "grad_norm": 0.036220960319042206, "learning_rate": 0.01, "loss": 2.1097, "step": 8607 }, { "epoch": 0.8846193362786396, "grad_norm": 0.04672658443450928, "learning_rate": 0.01, "loss": 2.1099, "step": 8610 }, { "epoch": 0.8849275660125346, "grad_norm": 0.04827800393104553, "learning_rate": 0.01, "loss": 2.1081, "step": 8613 }, { "epoch": 0.8852357957464296, "grad_norm": 0.04962724447250366, "learning_rate": 0.01, "loss": 2.0895, "step": 8616 }, { "epoch": 0.8855440254803246, "grad_norm": 0.03474809601902962, "learning_rate": 0.01, "loss": 2.0942, "step": 8619 }, { "epoch": 0.8858522552142196, "grad_norm": 0.07395246624946594, "learning_rate": 0.01, "loss": 2.1145, "step": 8622 }, { "epoch": 0.8861604849481146, "grad_norm": 0.09853484481573105, "learning_rate": 0.01, "loss": 2.0991, "step": 8625 }, { "epoch": 0.8864687146820096, "grad_norm": 0.11892013251781464, "learning_rate": 0.01, "loss": 2.0968, "step": 8628 }, { "epoch": 0.8867769444159046, "grad_norm": 0.12780621647834778, "learning_rate": 0.01, "loss": 2.1154, "step": 8631 }, { "epoch": 0.8870851741497997, "grad_norm": 0.04470033943653107, "learning_rate": 0.01, "loss": 2.1027, "step": 8634 }, { "epoch": 0.8873934038836947, "grad_norm": 0.054323747754096985, "learning_rate": 0.01, "loss": 2.0952, "step": 8637 }, { "epoch": 0.8877016336175897, "grad_norm": 0.08175788819789886, "learning_rate": 0.01, "loss": 2.0882, "step": 8640 }, { "epoch": 0.8880098633514847, "grad_norm": 0.07456079125404358, "learning_rate": 0.01, "loss": 2.141, "step": 8643 }, { "epoch": 0.8883180930853797, "grad_norm": 0.055910736322402954, "learning_rate": 0.01, "loss": 2.1102, "step": 8646 }, { "epoch": 0.8886263228192747, "grad_norm": 0.05231192335486412, "learning_rate": 0.01, "loss": 2.1026, "step": 8649 }, { "epoch": 0.8889345525531697, "grad_norm": 0.05306578800082207, "learning_rate": 0.01, "loss": 2.1051, "step": 8652 }, { "epoch": 0.8892427822870647, "grad_norm": 0.05569072067737579, "learning_rate": 0.01, "loss": 2.0835, "step": 8655 }, { "epoch": 0.8895510120209597, "grad_norm": 0.050971515476703644, "learning_rate": 0.01, "loss": 2.0718, "step": 8658 }, { "epoch": 0.8898592417548546, "grad_norm": 0.061436936259269714, "learning_rate": 0.01, "loss": 2.1167, "step": 8661 }, { "epoch": 0.8901674714887496, "grad_norm": 0.04307536780834198, "learning_rate": 0.01, "loss": 2.0972, "step": 8664 }, { "epoch": 0.8904757012226446, "grad_norm": 0.1459832638502121, "learning_rate": 0.01, "loss": 2.1306, "step": 8667 }, { "epoch": 0.8907839309565396, "grad_norm": 0.05527958646416664, "learning_rate": 0.01, "loss": 2.0974, "step": 8670 }, { "epoch": 0.8910921606904346, "grad_norm": 0.1319393813610077, "learning_rate": 0.01, "loss": 2.1259, "step": 8673 }, { "epoch": 0.8914003904243296, "grad_norm": 0.06124665215611458, "learning_rate": 0.01, "loss": 2.0997, "step": 8676 }, { "epoch": 0.8917086201582246, "grad_norm": 0.08667455613613129, "learning_rate": 0.01, "loss": 2.0941, "step": 8679 }, { "epoch": 0.8920168498921196, "grad_norm": 0.06631213426589966, "learning_rate": 0.01, "loss": 2.1196, "step": 8682 }, { "epoch": 0.8923250796260146, "grad_norm": 0.060188647359609604, "learning_rate": 0.01, "loss": 2.0971, "step": 8685 }, { "epoch": 0.8926333093599096, "grad_norm": 0.039312943816185, "learning_rate": 0.01, "loss": 2.1119, "step": 8688 }, { "epoch": 0.8929415390938046, "grad_norm": 0.03959662839770317, "learning_rate": 0.01, "loss": 2.0897, "step": 8691 }, { "epoch": 0.8932497688276996, "grad_norm": 0.09711046516895294, "learning_rate": 0.01, "loss": 2.1133, "step": 8694 }, { "epoch": 0.8935579985615946, "grad_norm": 0.07965920865535736, "learning_rate": 0.01, "loss": 2.0635, "step": 8697 }, { "epoch": 0.8938662282954896, "grad_norm": 0.08770687133073807, "learning_rate": 0.01, "loss": 2.0885, "step": 8700 }, { "epoch": 0.8941744580293846, "grad_norm": 0.04591045528650284, "learning_rate": 0.01, "loss": 2.0926, "step": 8703 }, { "epoch": 0.8944826877632795, "grad_norm": 0.09602218866348267, "learning_rate": 0.01, "loss": 2.0856, "step": 8706 }, { "epoch": 0.8947909174971745, "grad_norm": 0.09482742100954056, "learning_rate": 0.01, "loss": 2.0966, "step": 8709 }, { "epoch": 0.8950991472310695, "grad_norm": 0.03937089815735817, "learning_rate": 0.01, "loss": 2.1043, "step": 8712 }, { "epoch": 0.8954073769649645, "grad_norm": 0.056832704693078995, "learning_rate": 0.01, "loss": 2.1165, "step": 8715 }, { "epoch": 0.8957156066988595, "grad_norm": 0.06370353698730469, "learning_rate": 0.01, "loss": 2.1144, "step": 8718 }, { "epoch": 0.8960238364327545, "grad_norm": 0.06752549856901169, "learning_rate": 0.01, "loss": 2.1026, "step": 8721 }, { "epoch": 0.8963320661666495, "grad_norm": 0.13301892578601837, "learning_rate": 0.01, "loss": 2.11, "step": 8724 }, { "epoch": 0.8966402959005445, "grad_norm": 0.05210836976766586, "learning_rate": 0.01, "loss": 2.0925, "step": 8727 }, { "epoch": 0.8969485256344395, "grad_norm": 0.03570270165801048, "learning_rate": 0.01, "loss": 2.0809, "step": 8730 }, { "epoch": 0.8972567553683345, "grad_norm": 0.05898820236325264, "learning_rate": 0.01, "loss": 2.0786, "step": 8733 }, { "epoch": 0.8975649851022295, "grad_norm": 0.05087563395500183, "learning_rate": 0.01, "loss": 2.1071, "step": 8736 }, { "epoch": 0.8978732148361245, "grad_norm": 0.09473355114459991, "learning_rate": 0.01, "loss": 2.103, "step": 8739 }, { "epoch": 0.8981814445700195, "grad_norm": 0.09793075919151306, "learning_rate": 0.01, "loss": 2.0972, "step": 8742 }, { "epoch": 0.8984896743039145, "grad_norm": 0.05115204304456711, "learning_rate": 0.01, "loss": 2.0979, "step": 8745 }, { "epoch": 0.8987979040378095, "grad_norm": 0.057413987815380096, "learning_rate": 0.01, "loss": 2.1156, "step": 8748 }, { "epoch": 0.8991061337717046, "grad_norm": 0.04136224836111069, "learning_rate": 0.01, "loss": 2.1269, "step": 8751 }, { "epoch": 0.8994143635055996, "grad_norm": 0.06866753846406937, "learning_rate": 0.01, "loss": 2.1092, "step": 8754 }, { "epoch": 0.8997225932394946, "grad_norm": 0.0757627934217453, "learning_rate": 0.01, "loss": 2.0933, "step": 8757 }, { "epoch": 0.9000308229733895, "grad_norm": 0.08082983642816544, "learning_rate": 0.01, "loss": 2.1124, "step": 8760 }, { "epoch": 0.9003390527072845, "grad_norm": 0.046828944236040115, "learning_rate": 0.01, "loss": 2.0978, "step": 8763 }, { "epoch": 0.9006472824411795, "grad_norm": 0.11039458215236664, "learning_rate": 0.01, "loss": 2.0989, "step": 8766 }, { "epoch": 0.9009555121750745, "grad_norm": 0.048537638038396835, "learning_rate": 0.01, "loss": 2.0946, "step": 8769 }, { "epoch": 0.9012637419089695, "grad_norm": 0.06700310111045837, "learning_rate": 0.01, "loss": 2.1184, "step": 8772 }, { "epoch": 0.9015719716428645, "grad_norm": 0.044369909912347794, "learning_rate": 0.01, "loss": 2.1026, "step": 8775 }, { "epoch": 0.9018802013767595, "grad_norm": 0.041071876883506775, "learning_rate": 0.01, "loss": 2.0774, "step": 8778 }, { "epoch": 0.9021884311106545, "grad_norm": 0.04735315591096878, "learning_rate": 0.01, "loss": 2.0812, "step": 8781 }, { "epoch": 0.9024966608445495, "grad_norm": 0.11621284484863281, "learning_rate": 0.01, "loss": 2.0766, "step": 8784 }, { "epoch": 0.9028048905784445, "grad_norm": 0.11453153938055038, "learning_rate": 0.01, "loss": 2.0866, "step": 8787 }, { "epoch": 0.9031131203123395, "grad_norm": 0.057418763637542725, "learning_rate": 0.01, "loss": 2.081, "step": 8790 }, { "epoch": 0.9034213500462345, "grad_norm": 0.041579000651836395, "learning_rate": 0.01, "loss": 2.1154, "step": 8793 }, { "epoch": 0.9037295797801295, "grad_norm": 0.045673951506614685, "learning_rate": 0.01, "loss": 2.1241, "step": 8796 }, { "epoch": 0.9040378095140245, "grad_norm": 0.05963718518614769, "learning_rate": 0.01, "loss": 2.0955, "step": 8799 }, { "epoch": 0.9043460392479195, "grad_norm": 0.04776541888713837, "learning_rate": 0.01, "loss": 2.1138, "step": 8802 }, { "epoch": 0.9046542689818144, "grad_norm": 0.09103482216596603, "learning_rate": 0.01, "loss": 2.1192, "step": 8805 }, { "epoch": 0.9049624987157094, "grad_norm": 0.09218809008598328, "learning_rate": 0.01, "loss": 2.0985, "step": 8808 }, { "epoch": 0.9052707284496044, "grad_norm": 0.10253725945949554, "learning_rate": 0.01, "loss": 2.1189, "step": 8811 }, { "epoch": 0.9055789581834994, "grad_norm": 0.09638465940952301, "learning_rate": 0.01, "loss": 2.1008, "step": 8814 }, { "epoch": 0.9058871879173944, "grad_norm": 0.0947449579834938, "learning_rate": 0.01, "loss": 2.1222, "step": 8817 }, { "epoch": 0.9061954176512894, "grad_norm": 0.04588090255856514, "learning_rate": 0.01, "loss": 2.1198, "step": 8820 }, { "epoch": 0.9065036473851844, "grad_norm": 0.05041109770536423, "learning_rate": 0.01, "loss": 2.0843, "step": 8823 }, { "epoch": 0.9068118771190794, "grad_norm": 0.038898076862096786, "learning_rate": 0.01, "loss": 2.125, "step": 8826 }, { "epoch": 0.9071201068529744, "grad_norm": 0.03356321156024933, "learning_rate": 0.01, "loss": 2.0985, "step": 8829 }, { "epoch": 0.9074283365868694, "grad_norm": 0.04668448120355606, "learning_rate": 0.01, "loss": 2.1071, "step": 8832 }, { "epoch": 0.9077365663207644, "grad_norm": 0.051277391612529755, "learning_rate": 0.01, "loss": 2.0702, "step": 8835 }, { "epoch": 0.9080447960546594, "grad_norm": 0.049883171916007996, "learning_rate": 0.01, "loss": 2.1111, "step": 8838 }, { "epoch": 0.9083530257885544, "grad_norm": 0.04149313643574715, "learning_rate": 0.01, "loss": 2.0991, "step": 8841 }, { "epoch": 0.9086612555224494, "grad_norm": 0.09206261485815048, "learning_rate": 0.01, "loss": 2.0961, "step": 8844 }, { "epoch": 0.9089694852563444, "grad_norm": 0.1830751895904541, "learning_rate": 0.01, "loss": 2.1093, "step": 8847 }, { "epoch": 0.9092777149902393, "grad_norm": 0.0757865458726883, "learning_rate": 0.01, "loss": 2.115, "step": 8850 }, { "epoch": 0.9095859447241343, "grad_norm": 0.06030673533678055, "learning_rate": 0.01, "loss": 2.0874, "step": 8853 }, { "epoch": 0.9098941744580293, "grad_norm": 0.03440079465508461, "learning_rate": 0.01, "loss": 2.0997, "step": 8856 }, { "epoch": 0.9102024041919243, "grad_norm": 0.040004558861255646, "learning_rate": 0.01, "loss": 2.0767, "step": 8859 }, { "epoch": 0.9105106339258193, "grad_norm": 0.033261023461818695, "learning_rate": 0.01, "loss": 2.0834, "step": 8862 }, { "epoch": 0.9108188636597144, "grad_norm": 0.04814066365361214, "learning_rate": 0.01, "loss": 2.0868, "step": 8865 }, { "epoch": 0.9111270933936094, "grad_norm": 0.04939806088805199, "learning_rate": 0.01, "loss": 2.0944, "step": 8868 }, { "epoch": 0.9114353231275044, "grad_norm": 0.05242007225751877, "learning_rate": 0.01, "loss": 2.1035, "step": 8871 }, { "epoch": 0.9117435528613994, "grad_norm": 0.04576495289802551, "learning_rate": 0.01, "loss": 2.0881, "step": 8874 }, { "epoch": 0.9120517825952944, "grad_norm": 0.0369776152074337, "learning_rate": 0.01, "loss": 2.1017, "step": 8877 }, { "epoch": 0.9123600123291894, "grad_norm": 0.08296829462051392, "learning_rate": 0.01, "loss": 2.1199, "step": 8880 }, { "epoch": 0.9126682420630844, "grad_norm": 0.07186676561832428, "learning_rate": 0.01, "loss": 2.0906, "step": 8883 }, { "epoch": 0.9129764717969794, "grad_norm": 0.06849399209022522, "learning_rate": 0.01, "loss": 2.0944, "step": 8886 }, { "epoch": 0.9132847015308744, "grad_norm": 0.1285102367401123, "learning_rate": 0.01, "loss": 2.0959, "step": 8889 }, { "epoch": 0.9135929312647694, "grad_norm": 0.045700203627347946, "learning_rate": 0.01, "loss": 2.0924, "step": 8892 }, { "epoch": 0.9139011609986644, "grad_norm": 0.04561945050954819, "learning_rate": 0.01, "loss": 2.1126, "step": 8895 }, { "epoch": 0.9142093907325594, "grad_norm": 0.0417817123234272, "learning_rate": 0.01, "loss": 2.0692, "step": 8898 }, { "epoch": 0.9145176204664544, "grad_norm": 0.07923369109630585, "learning_rate": 0.01, "loss": 2.1059, "step": 8901 }, { "epoch": 0.9148258502003493, "grad_norm": 0.052836060523986816, "learning_rate": 0.01, "loss": 2.1089, "step": 8904 }, { "epoch": 0.9151340799342443, "grad_norm": 0.04591790586709976, "learning_rate": 0.01, "loss": 2.1007, "step": 8907 }, { "epoch": 0.9154423096681393, "grad_norm": 0.09871240705251694, "learning_rate": 0.01, "loss": 2.0718, "step": 8910 }, { "epoch": 0.9157505394020343, "grad_norm": 0.044554613530635834, "learning_rate": 0.01, "loss": 2.0956, "step": 8913 }, { "epoch": 0.9160587691359293, "grad_norm": 0.10009585320949554, "learning_rate": 0.01, "loss": 2.0838, "step": 8916 }, { "epoch": 0.9163669988698243, "grad_norm": 0.07252159714698792, "learning_rate": 0.01, "loss": 2.0973, "step": 8919 }, { "epoch": 0.9166752286037193, "grad_norm": 0.09162852168083191, "learning_rate": 0.01, "loss": 2.0961, "step": 8922 }, { "epoch": 0.9169834583376143, "grad_norm": 0.06149733439087868, "learning_rate": 0.01, "loss": 2.1377, "step": 8925 }, { "epoch": 0.9172916880715093, "grad_norm": 0.09315814077854156, "learning_rate": 0.01, "loss": 2.0901, "step": 8928 }, { "epoch": 0.9175999178054043, "grad_norm": 0.056877728551626205, "learning_rate": 0.01, "loss": 2.0934, "step": 8931 }, { "epoch": 0.9179081475392993, "grad_norm": 0.0976705476641655, "learning_rate": 0.01, "loss": 2.0791, "step": 8934 }, { "epoch": 0.9182163772731943, "grad_norm": 0.0493176206946373, "learning_rate": 0.01, "loss": 2.0937, "step": 8937 }, { "epoch": 0.9185246070070893, "grad_norm": 0.06268187612295151, "learning_rate": 0.01, "loss": 2.1053, "step": 8940 }, { "epoch": 0.9188328367409843, "grad_norm": 0.049251820892095566, "learning_rate": 0.01, "loss": 2.1104, "step": 8943 }, { "epoch": 0.9191410664748793, "grad_norm": 0.05342431366443634, "learning_rate": 0.01, "loss": 2.1005, "step": 8946 }, { "epoch": 0.9194492962087742, "grad_norm": 0.036090634763240814, "learning_rate": 0.01, "loss": 2.0815, "step": 8949 }, { "epoch": 0.9197575259426692, "grad_norm": 0.0320359505712986, "learning_rate": 0.01, "loss": 2.0704, "step": 8952 }, { "epoch": 0.9200657556765642, "grad_norm": 0.03514352813363075, "learning_rate": 0.01, "loss": 2.1046, "step": 8955 }, { "epoch": 0.9203739854104592, "grad_norm": 0.06132291629910469, "learning_rate": 0.01, "loss": 2.0887, "step": 8958 }, { "epoch": 0.9206822151443542, "grad_norm": 0.07312822341918945, "learning_rate": 0.01, "loss": 2.1079, "step": 8961 }, { "epoch": 0.9209904448782492, "grad_norm": 0.09670150279998779, "learning_rate": 0.01, "loss": 2.1195, "step": 8964 }, { "epoch": 0.9212986746121442, "grad_norm": 0.1106385663151741, "learning_rate": 0.01, "loss": 2.0809, "step": 8967 }, { "epoch": 0.9216069043460392, "grad_norm": 0.05964332073926926, "learning_rate": 0.01, "loss": 2.1108, "step": 8970 }, { "epoch": 0.9219151340799342, "grad_norm": 0.05584556236863136, "learning_rate": 0.01, "loss": 2.1274, "step": 8973 }, { "epoch": 0.9222233638138292, "grad_norm": 0.04485652595758438, "learning_rate": 0.01, "loss": 2.0627, "step": 8976 }, { "epoch": 0.9225315935477242, "grad_norm": 0.07286686450242996, "learning_rate": 0.01, "loss": 2.1087, "step": 8979 }, { "epoch": 0.9228398232816193, "grad_norm": 0.10815869271755219, "learning_rate": 0.01, "loss": 2.1057, "step": 8982 }, { "epoch": 0.9231480530155143, "grad_norm": 0.1037832722067833, "learning_rate": 0.01, "loss": 2.0836, "step": 8985 }, { "epoch": 0.9234562827494093, "grad_norm": 0.08297618478536606, "learning_rate": 0.01, "loss": 2.1181, "step": 8988 }, { "epoch": 0.9237645124833043, "grad_norm": 0.04203306511044502, "learning_rate": 0.01, "loss": 2.1112, "step": 8991 }, { "epoch": 0.9240727422171993, "grad_norm": 0.06641580909490585, "learning_rate": 0.01, "loss": 2.1004, "step": 8994 }, { "epoch": 0.9243809719510943, "grad_norm": 0.04921744763851166, "learning_rate": 0.01, "loss": 2.1116, "step": 8997 }, { "epoch": 0.9246892016849892, "grad_norm": 0.03472235053777695, "learning_rate": 0.01, "loss": 2.0777, "step": 9000 }, { "epoch": 0.9249974314188842, "grad_norm": 0.03650922700762749, "learning_rate": 0.01, "loss": 2.0802, "step": 9003 }, { "epoch": 0.9253056611527792, "grad_norm": 0.04657342657446861, "learning_rate": 0.01, "loss": 2.0773, "step": 9006 }, { "epoch": 0.9256138908866742, "grad_norm": 0.05943501368165016, "learning_rate": 0.01, "loss": 2.0753, "step": 9009 }, { "epoch": 0.9259221206205692, "grad_norm": 0.04763554409146309, "learning_rate": 0.01, "loss": 2.0959, "step": 9012 }, { "epoch": 0.9262303503544642, "grad_norm": 0.1267511248588562, "learning_rate": 0.01, "loss": 2.0971, "step": 9015 }, { "epoch": 0.9265385800883592, "grad_norm": 0.055529460310935974, "learning_rate": 0.01, "loss": 2.1327, "step": 9018 }, { "epoch": 0.9268468098222542, "grad_norm": 0.15508927404880524, "learning_rate": 0.01, "loss": 2.0947, "step": 9021 }, { "epoch": 0.9271550395561492, "grad_norm": 0.0593777671456337, "learning_rate": 0.01, "loss": 2.1171, "step": 9024 }, { "epoch": 0.9274632692900442, "grad_norm": 0.08907107263803482, "learning_rate": 0.01, "loss": 2.093, "step": 9027 }, { "epoch": 0.9277714990239392, "grad_norm": 0.07041808217763901, "learning_rate": 0.01, "loss": 2.0676, "step": 9030 }, { "epoch": 0.9280797287578342, "grad_norm": 0.03434208780527115, "learning_rate": 0.01, "loss": 2.0928, "step": 9033 }, { "epoch": 0.9283879584917292, "grad_norm": 0.07591548562049866, "learning_rate": 0.01, "loss": 2.0857, "step": 9036 }, { "epoch": 0.9286961882256242, "grad_norm": 0.08999443799257278, "learning_rate": 0.01, "loss": 2.0984, "step": 9039 }, { "epoch": 0.9290044179595192, "grad_norm": 0.11046464741230011, "learning_rate": 0.01, "loss": 2.1009, "step": 9042 }, { "epoch": 0.9293126476934142, "grad_norm": 0.08271370083093643, "learning_rate": 0.01, "loss": 2.1027, "step": 9045 }, { "epoch": 0.9296208774273091, "grad_norm": 0.046337299048900604, "learning_rate": 0.01, "loss": 2.0826, "step": 9048 }, { "epoch": 0.9299291071612041, "grad_norm": 0.037284769117832184, "learning_rate": 0.01, "loss": 2.1015, "step": 9051 }, { "epoch": 0.9302373368950991, "grad_norm": 0.04956496134400368, "learning_rate": 0.01, "loss": 2.1036, "step": 9054 }, { "epoch": 0.9305455666289941, "grad_norm": 0.12329571694135666, "learning_rate": 0.01, "loss": 2.0917, "step": 9057 }, { "epoch": 0.9308537963628891, "grad_norm": 0.06971380859613419, "learning_rate": 0.01, "loss": 2.114, "step": 9060 }, { "epoch": 0.9311620260967841, "grad_norm": 0.06084508076310158, "learning_rate": 0.01, "loss": 2.1122, "step": 9063 }, { "epoch": 0.9314702558306791, "grad_norm": 0.049602411687374115, "learning_rate": 0.01, "loss": 2.1268, "step": 9066 }, { "epoch": 0.9317784855645741, "grad_norm": 0.05200349539518356, "learning_rate": 0.01, "loss": 2.0979, "step": 9069 }, { "epoch": 0.9320867152984691, "grad_norm": 0.05793909728527069, "learning_rate": 0.01, "loss": 2.096, "step": 9072 }, { "epoch": 0.9323949450323641, "grad_norm": 0.10819883644580841, "learning_rate": 0.01, "loss": 2.1096, "step": 9075 }, { "epoch": 0.9327031747662591, "grad_norm": 0.07809442281723022, "learning_rate": 0.01, "loss": 2.0968, "step": 9078 }, { "epoch": 0.9330114045001541, "grad_norm": 0.09595733880996704, "learning_rate": 0.01, "loss": 2.0769, "step": 9081 }, { "epoch": 0.9333196342340491, "grad_norm": 0.11658616364002228, "learning_rate": 0.01, "loss": 2.0945, "step": 9084 }, { "epoch": 0.9336278639679441, "grad_norm": 0.07642678171396255, "learning_rate": 0.01, "loss": 2.0811, "step": 9087 }, { "epoch": 0.933936093701839, "grad_norm": 0.03174865245819092, "learning_rate": 0.01, "loss": 2.1017, "step": 9090 }, { "epoch": 0.934244323435734, "grad_norm": 0.05137626454234123, "learning_rate": 0.01, "loss": 2.0878, "step": 9093 }, { "epoch": 0.9345525531696292, "grad_norm": 0.05306951329112053, "learning_rate": 0.01, "loss": 2.1163, "step": 9096 }, { "epoch": 0.9348607829035241, "grad_norm": 0.0716642439365387, "learning_rate": 0.01, "loss": 2.0903, "step": 9099 }, { "epoch": 0.9351690126374191, "grad_norm": 0.10328514873981476, "learning_rate": 0.01, "loss": 2.0789, "step": 9102 }, { "epoch": 0.9354772423713141, "grad_norm": 0.04914560168981552, "learning_rate": 0.01, "loss": 2.0963, "step": 9105 }, { "epoch": 0.9357854721052091, "grad_norm": 0.04810576140880585, "learning_rate": 0.01, "loss": 2.1119, "step": 9108 }, { "epoch": 0.9360937018391041, "grad_norm": 0.05689787119626999, "learning_rate": 0.01, "loss": 2.0955, "step": 9111 }, { "epoch": 0.9364019315729991, "grad_norm": 0.06455382704734802, "learning_rate": 0.01, "loss": 2.0894, "step": 9114 }, { "epoch": 0.9367101613068941, "grad_norm": 0.044911760836839676, "learning_rate": 0.01, "loss": 2.0967, "step": 9117 }, { "epoch": 0.9370183910407891, "grad_norm": 0.06244887784123421, "learning_rate": 0.01, "loss": 2.0921, "step": 9120 }, { "epoch": 0.9373266207746841, "grad_norm": 0.052621614187955856, "learning_rate": 0.01, "loss": 2.1296, "step": 9123 }, { "epoch": 0.9376348505085791, "grad_norm": 0.05098232626914978, "learning_rate": 0.01, "loss": 2.0807, "step": 9126 }, { "epoch": 0.9379430802424741, "grad_norm": 0.058582011610269547, "learning_rate": 0.01, "loss": 2.0973, "step": 9129 }, { "epoch": 0.9382513099763691, "grad_norm": 0.10984500497579575, "learning_rate": 0.01, "loss": 2.0789, "step": 9132 }, { "epoch": 0.9385595397102641, "grad_norm": 0.045173123478889465, "learning_rate": 0.01, "loss": 2.0937, "step": 9135 }, { "epoch": 0.9388677694441591, "grad_norm": 0.06749478727579117, "learning_rate": 0.01, "loss": 2.1051, "step": 9138 }, { "epoch": 0.939175999178054, "grad_norm": 0.06236808001995087, "learning_rate": 0.01, "loss": 2.1099, "step": 9141 }, { "epoch": 0.939484228911949, "grad_norm": 0.06205837428569794, "learning_rate": 0.01, "loss": 2.0893, "step": 9144 }, { "epoch": 0.939792458645844, "grad_norm": 0.0742972195148468, "learning_rate": 0.01, "loss": 2.1034, "step": 9147 }, { "epoch": 0.940100688379739, "grad_norm": 0.06998419016599655, "learning_rate": 0.01, "loss": 2.0558, "step": 9150 }, { "epoch": 0.940408918113634, "grad_norm": 0.04214362055063248, "learning_rate": 0.01, "loss": 2.0968, "step": 9153 }, { "epoch": 0.940717147847529, "grad_norm": 0.055913276970386505, "learning_rate": 0.01, "loss": 2.0736, "step": 9156 }, { "epoch": 0.941025377581424, "grad_norm": 0.0941486805677414, "learning_rate": 0.01, "loss": 2.1038, "step": 9159 }, { "epoch": 0.941333607315319, "grad_norm": 0.05609782040119171, "learning_rate": 0.01, "loss": 2.096, "step": 9162 }, { "epoch": 0.941641837049214, "grad_norm": 0.05714662745594978, "learning_rate": 0.01, "loss": 2.0939, "step": 9165 }, { "epoch": 0.941950066783109, "grad_norm": 0.05364496633410454, "learning_rate": 0.01, "loss": 2.0838, "step": 9168 }, { "epoch": 0.942258296517004, "grad_norm": 0.050090350210666656, "learning_rate": 0.01, "loss": 2.087, "step": 9171 }, { "epoch": 0.942566526250899, "grad_norm": 0.07287559658288956, "learning_rate": 0.01, "loss": 2.098, "step": 9174 }, { "epoch": 0.942874755984794, "grad_norm": 0.04061901941895485, "learning_rate": 0.01, "loss": 2.0677, "step": 9177 }, { "epoch": 0.943182985718689, "grad_norm": 0.10750306397676468, "learning_rate": 0.01, "loss": 2.1105, "step": 9180 }, { "epoch": 0.943491215452584, "grad_norm": 0.10353365540504456, "learning_rate": 0.01, "loss": 2.0712, "step": 9183 }, { "epoch": 0.943799445186479, "grad_norm": 0.07502592355012894, "learning_rate": 0.01, "loss": 2.1115, "step": 9186 }, { "epoch": 0.944107674920374, "grad_norm": 0.046962104737758636, "learning_rate": 0.01, "loss": 2.0937, "step": 9189 }, { "epoch": 0.944415904654269, "grad_norm": 0.05084332078695297, "learning_rate": 0.01, "loss": 2.0943, "step": 9192 }, { "epoch": 0.9447241343881639, "grad_norm": 0.0458371527493, "learning_rate": 0.01, "loss": 2.0967, "step": 9195 }, { "epoch": 0.9450323641220589, "grad_norm": 0.040458545088768005, "learning_rate": 0.01, "loss": 2.0949, "step": 9198 }, { "epoch": 0.9453405938559539, "grad_norm": 0.046158358454704285, "learning_rate": 0.01, "loss": 2.0912, "step": 9201 }, { "epoch": 0.9456488235898489, "grad_norm": 0.10080043226480484, "learning_rate": 0.01, "loss": 2.1, "step": 9204 }, { "epoch": 0.9459570533237439, "grad_norm": 0.07679333537817001, "learning_rate": 0.01, "loss": 2.1013, "step": 9207 }, { "epoch": 0.9462652830576389, "grad_norm": 0.07189175486564636, "learning_rate": 0.01, "loss": 2.1008, "step": 9210 }, { "epoch": 0.946573512791534, "grad_norm": 0.07828579097986221, "learning_rate": 0.01, "loss": 2.1063, "step": 9213 }, { "epoch": 0.946881742525429, "grad_norm": 0.07649674266576767, "learning_rate": 0.01, "loss": 2.1146, "step": 9216 }, { "epoch": 0.947189972259324, "grad_norm": 0.06558651477098465, "learning_rate": 0.01, "loss": 2.0705, "step": 9219 }, { "epoch": 0.947498201993219, "grad_norm": 0.03276702016592026, "learning_rate": 0.01, "loss": 2.1065, "step": 9222 }, { "epoch": 0.947806431727114, "grad_norm": 0.03779645636677742, "learning_rate": 0.01, "loss": 2.0924, "step": 9225 }, { "epoch": 0.948114661461009, "grad_norm": 0.048466913402080536, "learning_rate": 0.01, "loss": 2.1037, "step": 9228 }, { "epoch": 0.948422891194904, "grad_norm": 0.04391203075647354, "learning_rate": 0.01, "loss": 2.0722, "step": 9231 }, { "epoch": 0.948731120928799, "grad_norm": 0.11353743076324463, "learning_rate": 0.01, "loss": 2.113, "step": 9234 }, { "epoch": 0.949039350662694, "grad_norm": 0.045930709689855576, "learning_rate": 0.01, "loss": 2.0699, "step": 9237 }, { "epoch": 0.949347580396589, "grad_norm": 0.06440164893865585, "learning_rate": 0.01, "loss": 2.0786, "step": 9240 }, { "epoch": 0.949655810130484, "grad_norm": 0.08666238933801651, "learning_rate": 0.01, "loss": 2.1049, "step": 9243 }, { "epoch": 0.9499640398643789, "grad_norm": 0.11012524366378784, "learning_rate": 0.01, "loss": 2.1018, "step": 9246 }, { "epoch": 0.9502722695982739, "grad_norm": 0.047307875007390976, "learning_rate": 0.01, "loss": 2.0943, "step": 9249 }, { "epoch": 0.9505804993321689, "grad_norm": 0.04565277695655823, "learning_rate": 0.01, "loss": 2.1174, "step": 9252 }, { "epoch": 0.9508887290660639, "grad_norm": 0.03389623388648033, "learning_rate": 0.01, "loss": 2.0896, "step": 9255 }, { "epoch": 0.9511969587999589, "grad_norm": 0.04582008346915245, "learning_rate": 0.01, "loss": 2.0888, "step": 9258 }, { "epoch": 0.9515051885338539, "grad_norm": 0.07722247391939163, "learning_rate": 0.01, "loss": 2.0843, "step": 9261 }, { "epoch": 0.9518134182677489, "grad_norm": 0.03505149856209755, "learning_rate": 0.01, "loss": 2.0903, "step": 9264 }, { "epoch": 0.9521216480016439, "grad_norm": 0.08010539412498474, "learning_rate": 0.01, "loss": 2.1249, "step": 9267 }, { "epoch": 0.9524298777355389, "grad_norm": 0.0723007321357727, "learning_rate": 0.01, "loss": 2.0951, "step": 9270 }, { "epoch": 0.9527381074694339, "grad_norm": 0.05629736930131912, "learning_rate": 0.01, "loss": 2.0948, "step": 9273 }, { "epoch": 0.9530463372033289, "grad_norm": 0.05514506623148918, "learning_rate": 0.01, "loss": 2.1214, "step": 9276 }, { "epoch": 0.9533545669372239, "grad_norm": 0.1107834130525589, "learning_rate": 0.01, "loss": 2.0876, "step": 9279 }, { "epoch": 0.9536627966711189, "grad_norm": 0.046309590339660645, "learning_rate": 0.01, "loss": 2.0669, "step": 9282 }, { "epoch": 0.9539710264050139, "grad_norm": 0.06956466287374496, "learning_rate": 0.01, "loss": 2.0903, "step": 9285 }, { "epoch": 0.9542792561389088, "grad_norm": 0.086011603474617, "learning_rate": 0.01, "loss": 2.0896, "step": 9288 }, { "epoch": 0.9545874858728038, "grad_norm": 0.04768074303865433, "learning_rate": 0.01, "loss": 2.0923, "step": 9291 }, { "epoch": 0.9548957156066988, "grad_norm": 0.0958017110824585, "learning_rate": 0.01, "loss": 2.1134, "step": 9294 }, { "epoch": 0.9552039453405938, "grad_norm": 0.06098558008670807, "learning_rate": 0.01, "loss": 2.0775, "step": 9297 }, { "epoch": 0.9555121750744888, "grad_norm": 0.05258086323738098, "learning_rate": 0.01, "loss": 2.0998, "step": 9300 }, { "epoch": 0.9558204048083838, "grad_norm": 0.06664231419563293, "learning_rate": 0.01, "loss": 2.1215, "step": 9303 }, { "epoch": 0.9561286345422788, "grad_norm": 0.05491488054394722, "learning_rate": 0.01, "loss": 2.0837, "step": 9306 }, { "epoch": 0.9564368642761738, "grad_norm": 0.0436725877225399, "learning_rate": 0.01, "loss": 2.1268, "step": 9309 }, { "epoch": 0.9567450940100688, "grad_norm": 0.08737560361623764, "learning_rate": 0.01, "loss": 2.0901, "step": 9312 }, { "epoch": 0.9570533237439638, "grad_norm": 0.08130110800266266, "learning_rate": 0.01, "loss": 2.0766, "step": 9315 }, { "epoch": 0.9573615534778588, "grad_norm": 0.07826768606901169, "learning_rate": 0.01, "loss": 2.0836, "step": 9318 }, { "epoch": 0.9576697832117538, "grad_norm": 0.09330857545137405, "learning_rate": 0.01, "loss": 2.0794, "step": 9321 }, { "epoch": 0.9579780129456488, "grad_norm": 0.03914652019739151, "learning_rate": 0.01, "loss": 2.102, "step": 9324 }, { "epoch": 0.9582862426795438, "grad_norm": 0.03853154182434082, "learning_rate": 0.01, "loss": 2.0915, "step": 9327 }, { "epoch": 0.9585944724134389, "grad_norm": 0.07349935919046402, "learning_rate": 0.01, "loss": 2.0856, "step": 9330 }, { "epoch": 0.9589027021473339, "grad_norm": 0.1473885178565979, "learning_rate": 0.01, "loss": 2.0904, "step": 9333 }, { "epoch": 0.9592109318812289, "grad_norm": 0.11091527342796326, "learning_rate": 0.01, "loss": 2.0934, "step": 9336 }, { "epoch": 0.9595191616151239, "grad_norm": 0.0400085523724556, "learning_rate": 0.01, "loss": 2.0924, "step": 9339 }, { "epoch": 0.9598273913490188, "grad_norm": 0.05025499314069748, "learning_rate": 0.01, "loss": 2.0845, "step": 9342 }, { "epoch": 0.9601356210829138, "grad_norm": 0.03745681792497635, "learning_rate": 0.01, "loss": 2.1006, "step": 9345 }, { "epoch": 0.9604438508168088, "grad_norm": 0.05147318169474602, "learning_rate": 0.01, "loss": 2.0912, "step": 9348 }, { "epoch": 0.9607520805507038, "grad_norm": 0.06338364630937576, "learning_rate": 0.01, "loss": 2.1169, "step": 9351 }, { "epoch": 0.9610603102845988, "grad_norm": 0.09458258748054504, "learning_rate": 0.01, "loss": 2.1005, "step": 9354 }, { "epoch": 0.9613685400184938, "grad_norm": 0.09883291274309158, "learning_rate": 0.01, "loss": 2.0934, "step": 9357 }, { "epoch": 0.9616767697523888, "grad_norm": 0.048908524215221405, "learning_rate": 0.01, "loss": 2.0863, "step": 9360 }, { "epoch": 0.9619849994862838, "grad_norm": 0.11762084811925888, "learning_rate": 0.01, "loss": 2.1182, "step": 9363 }, { "epoch": 0.9622932292201788, "grad_norm": 0.0835133045911789, "learning_rate": 0.01, "loss": 2.0728, "step": 9366 }, { "epoch": 0.9626014589540738, "grad_norm": 0.0580466203391552, "learning_rate": 0.01, "loss": 2.0756, "step": 9369 }, { "epoch": 0.9629096886879688, "grad_norm": 0.051043394953012466, "learning_rate": 0.01, "loss": 2.0936, "step": 9372 }, { "epoch": 0.9632179184218638, "grad_norm": 0.1081843450665474, "learning_rate": 0.01, "loss": 2.107, "step": 9375 }, { "epoch": 0.9635261481557588, "grad_norm": 0.04656577482819557, "learning_rate": 0.01, "loss": 2.1084, "step": 9378 }, { "epoch": 0.9638343778896538, "grad_norm": 0.03988798335194588, "learning_rate": 0.01, "loss": 2.1015, "step": 9381 }, { "epoch": 0.9641426076235488, "grad_norm": 0.07686126232147217, "learning_rate": 0.01, "loss": 2.1417, "step": 9384 }, { "epoch": 0.9644508373574437, "grad_norm": 0.057407401502132416, "learning_rate": 0.01, "loss": 2.1191, "step": 9387 }, { "epoch": 0.9647590670913387, "grad_norm": 0.0947386845946312, "learning_rate": 0.01, "loss": 2.0796, "step": 9390 }, { "epoch": 0.9650672968252337, "grad_norm": 0.05064699798822403, "learning_rate": 0.01, "loss": 2.1001, "step": 9393 }, { "epoch": 0.9653755265591287, "grad_norm": 0.04948986694216728, "learning_rate": 0.01, "loss": 2.0736, "step": 9396 }, { "epoch": 0.9656837562930237, "grad_norm": 0.10736438632011414, "learning_rate": 0.01, "loss": 2.0939, "step": 9399 }, { "epoch": 0.9659919860269187, "grad_norm": 0.039317477494478226, "learning_rate": 0.01, "loss": 2.1077, "step": 9402 }, { "epoch": 0.9663002157608137, "grad_norm": 0.06933067739009857, "learning_rate": 0.01, "loss": 2.1056, "step": 9405 }, { "epoch": 0.9666084454947087, "grad_norm": 0.03649623692035675, "learning_rate": 0.01, "loss": 2.0838, "step": 9408 }, { "epoch": 0.9669166752286037, "grad_norm": 0.09309684485197067, "learning_rate": 0.01, "loss": 2.0913, "step": 9411 }, { "epoch": 0.9672249049624987, "grad_norm": 0.11532922834157944, "learning_rate": 0.01, "loss": 2.1127, "step": 9414 }, { "epoch": 0.9675331346963937, "grad_norm": 0.053582970052957535, "learning_rate": 0.01, "loss": 2.0812, "step": 9417 }, { "epoch": 0.9678413644302887, "grad_norm": 0.07581201195716858, "learning_rate": 0.01, "loss": 2.1148, "step": 9420 }, { "epoch": 0.9681495941641837, "grad_norm": 0.051002178341150284, "learning_rate": 0.01, "loss": 2.0834, "step": 9423 }, { "epoch": 0.9684578238980787, "grad_norm": 0.06385383754968643, "learning_rate": 0.01, "loss": 2.0826, "step": 9426 }, { "epoch": 0.9687660536319737, "grad_norm": 0.10576994717121124, "learning_rate": 0.01, "loss": 2.0768, "step": 9429 }, { "epoch": 0.9690742833658686, "grad_norm": 0.054983410984277725, "learning_rate": 0.01, "loss": 2.0604, "step": 9432 }, { "epoch": 0.9693825130997636, "grad_norm": 0.09159716218709946, "learning_rate": 0.01, "loss": 2.0613, "step": 9435 }, { "epoch": 0.9696907428336586, "grad_norm": 0.07718406617641449, "learning_rate": 0.01, "loss": 2.1132, "step": 9438 }, { "epoch": 0.9699989725675536, "grad_norm": 0.0788009986281395, "learning_rate": 0.01, "loss": 2.0887, "step": 9441 }, { "epoch": 0.9703072023014487, "grad_norm": 0.040717653930187225, "learning_rate": 0.01, "loss": 2.084, "step": 9444 }, { "epoch": 0.9706154320353437, "grad_norm": 0.09677381813526154, "learning_rate": 0.01, "loss": 2.0903, "step": 9447 }, { "epoch": 0.9709236617692387, "grad_norm": 0.0706525593996048, "learning_rate": 0.01, "loss": 2.0647, "step": 9450 }, { "epoch": 0.9712318915031337, "grad_norm": 0.04624510183930397, "learning_rate": 0.01, "loss": 2.0818, "step": 9453 }, { "epoch": 0.9715401212370287, "grad_norm": 0.04585500434041023, "learning_rate": 0.01, "loss": 2.0927, "step": 9456 }, { "epoch": 0.9718483509709237, "grad_norm": 0.03468145430088043, "learning_rate": 0.01, "loss": 2.0759, "step": 9459 }, { "epoch": 0.9721565807048187, "grad_norm": 0.06956649571657181, "learning_rate": 0.01, "loss": 2.092, "step": 9462 }, { "epoch": 0.9724648104387137, "grad_norm": 0.04509080946445465, "learning_rate": 0.01, "loss": 2.1095, "step": 9465 }, { "epoch": 0.9727730401726087, "grad_norm": 0.09959586709737778, "learning_rate": 0.01, "loss": 2.0921, "step": 9468 }, { "epoch": 0.9730812699065037, "grad_norm": 0.08427727967500687, "learning_rate": 0.01, "loss": 2.1031, "step": 9471 }, { "epoch": 0.9733894996403987, "grad_norm": 0.14798741042613983, "learning_rate": 0.01, "loss": 2.091, "step": 9474 }, { "epoch": 0.9736977293742937, "grad_norm": 0.057735662907361984, "learning_rate": 0.01, "loss": 2.0701, "step": 9477 }, { "epoch": 0.9740059591081887, "grad_norm": 0.04484837129712105, "learning_rate": 0.01, "loss": 2.1015, "step": 9480 }, { "epoch": 0.9743141888420837, "grad_norm": 0.04166285693645477, "learning_rate": 0.01, "loss": 2.1021, "step": 9483 }, { "epoch": 0.9746224185759786, "grad_norm": 0.05640358105301857, "learning_rate": 0.01, "loss": 2.0925, "step": 9486 }, { "epoch": 0.9749306483098736, "grad_norm": 0.040314216166734695, "learning_rate": 0.01, "loss": 2.0797, "step": 9489 }, { "epoch": 0.9752388780437686, "grad_norm": 0.04522860422730446, "learning_rate": 0.01, "loss": 2.0935, "step": 9492 }, { "epoch": 0.9755471077776636, "grad_norm": 0.03492886200547218, "learning_rate": 0.01, "loss": 2.0968, "step": 9495 }, { "epoch": 0.9758553375115586, "grad_norm": 0.03252703323960304, "learning_rate": 0.01, "loss": 2.125, "step": 9498 }, { "epoch": 0.9761635672454536, "grad_norm": 0.04002056270837784, "learning_rate": 0.01, "loss": 2.0651, "step": 9501 }, { "epoch": 0.9764717969793486, "grad_norm": 0.07364718616008759, "learning_rate": 0.01, "loss": 2.0629, "step": 9504 }, { "epoch": 0.9767800267132436, "grad_norm": 0.05577448755502701, "learning_rate": 0.01, "loss": 2.0726, "step": 9507 }, { "epoch": 0.9770882564471386, "grad_norm": 0.13259132206439972, "learning_rate": 0.01, "loss": 2.1075, "step": 9510 }, { "epoch": 0.9773964861810336, "grad_norm": 0.06911557912826538, "learning_rate": 0.01, "loss": 2.0887, "step": 9513 }, { "epoch": 0.9777047159149286, "grad_norm": 0.10592345148324966, "learning_rate": 0.01, "loss": 2.0982, "step": 9516 }, { "epoch": 0.9780129456488236, "grad_norm": 0.05682144686579704, "learning_rate": 0.01, "loss": 2.0961, "step": 9519 }, { "epoch": 0.9783211753827186, "grad_norm": 0.07456633448600769, "learning_rate": 0.01, "loss": 2.0983, "step": 9522 }, { "epoch": 0.9786294051166136, "grad_norm": 0.062031425535678864, "learning_rate": 0.01, "loss": 2.1163, "step": 9525 }, { "epoch": 0.9789376348505086, "grad_norm": 0.0570233091711998, "learning_rate": 0.01, "loss": 2.1046, "step": 9528 }, { "epoch": 0.9792458645844035, "grad_norm": 0.04668619483709335, "learning_rate": 0.01, "loss": 2.081, "step": 9531 }, { "epoch": 0.9795540943182985, "grad_norm": 0.04718153178691864, "learning_rate": 0.01, "loss": 2.0678, "step": 9534 }, { "epoch": 0.9798623240521935, "grad_norm": 0.054066251963377, "learning_rate": 0.01, "loss": 2.0911, "step": 9537 }, { "epoch": 0.9801705537860885, "grad_norm": 0.1274210512638092, "learning_rate": 0.01, "loss": 2.097, "step": 9540 }, { "epoch": 0.9804787835199835, "grad_norm": 0.07543773949146271, "learning_rate": 0.01, "loss": 2.0824, "step": 9543 }, { "epoch": 0.9807870132538785, "grad_norm": 0.07845018804073334, "learning_rate": 0.01, "loss": 2.0749, "step": 9546 }, { "epoch": 0.9810952429877735, "grad_norm": 0.08444254100322723, "learning_rate": 0.01, "loss": 2.1019, "step": 9549 }, { "epoch": 0.9814034727216685, "grad_norm": 0.07719142735004425, "learning_rate": 0.01, "loss": 2.0811, "step": 9552 }, { "epoch": 0.9817117024555635, "grad_norm": 0.05624673515558243, "learning_rate": 0.01, "loss": 2.0752, "step": 9555 }, { "epoch": 0.9820199321894585, "grad_norm": 0.0419309176504612, "learning_rate": 0.01, "loss": 2.0812, "step": 9558 }, { "epoch": 0.9823281619233536, "grad_norm": 0.0343257375061512, "learning_rate": 0.01, "loss": 2.0694, "step": 9561 }, { "epoch": 0.9826363916572486, "grad_norm": 0.059452395886182785, "learning_rate": 0.01, "loss": 2.0521, "step": 9564 }, { "epoch": 0.9829446213911436, "grad_norm": 0.09073518216609955, "learning_rate": 0.01, "loss": 2.0636, "step": 9567 }, { "epoch": 0.9832528511250386, "grad_norm": 0.10660509765148163, "learning_rate": 0.01, "loss": 2.0796, "step": 9570 }, { "epoch": 0.9835610808589336, "grad_norm": 0.04380667209625244, "learning_rate": 0.01, "loss": 2.0992, "step": 9573 }, { "epoch": 0.9838693105928286, "grad_norm": 0.06383811682462692, "learning_rate": 0.01, "loss": 2.0722, "step": 9576 }, { "epoch": 0.9841775403267236, "grad_norm": 0.07926032692193985, "learning_rate": 0.01, "loss": 2.0571, "step": 9579 }, { "epoch": 0.9844857700606185, "grad_norm": 0.05310386046767235, "learning_rate": 0.01, "loss": 2.0739, "step": 9582 }, { "epoch": 0.9847939997945135, "grad_norm": 0.03591843321919441, "learning_rate": 0.01, "loss": 2.0757, "step": 9585 }, { "epoch": 0.9851022295284085, "grad_norm": 0.04773431271314621, "learning_rate": 0.01, "loss": 2.0525, "step": 9588 }, { "epoch": 0.9854104592623035, "grad_norm": 0.04679710045456886, "learning_rate": 0.01, "loss": 2.0771, "step": 9591 }, { "epoch": 0.9857186889961985, "grad_norm": 0.05671774223446846, "learning_rate": 0.01, "loss": 2.1106, "step": 9594 }, { "epoch": 0.9860269187300935, "grad_norm": 0.049488577991724014, "learning_rate": 0.01, "loss": 2.0695, "step": 9597 }, { "epoch": 0.9863351484639885, "grad_norm": 0.04207129031419754, "learning_rate": 0.01, "loss": 2.0903, "step": 9600 }, { "epoch": 0.9866433781978835, "grad_norm": 0.10019747167825699, "learning_rate": 0.01, "loss": 2.073, "step": 9603 }, { "epoch": 0.9869516079317785, "grad_norm": 0.051381729543209076, "learning_rate": 0.01, "loss": 2.0626, "step": 9606 }, { "epoch": 0.9872598376656735, "grad_norm": 0.13477744162082672, "learning_rate": 0.01, "loss": 2.1098, "step": 9609 }, { "epoch": 0.9875680673995685, "grad_norm": 0.09002148360013962, "learning_rate": 0.01, "loss": 2.0927, "step": 9612 }, { "epoch": 0.9878762971334635, "grad_norm": 0.05230112001299858, "learning_rate": 0.01, "loss": 2.0902, "step": 9615 }, { "epoch": 0.9881845268673585, "grad_norm": 0.0639885738492012, "learning_rate": 0.01, "loss": 2.1179, "step": 9618 }, { "epoch": 0.9884927566012535, "grad_norm": 0.0553070530295372, "learning_rate": 0.01, "loss": 2.0923, "step": 9621 }, { "epoch": 0.9888009863351485, "grad_norm": 0.04541468620300293, "learning_rate": 0.01, "loss": 2.0965, "step": 9624 }, { "epoch": 0.9891092160690435, "grad_norm": 0.08656930178403854, "learning_rate": 0.01, "loss": 2.1038, "step": 9627 }, { "epoch": 0.9894174458029384, "grad_norm": 0.04954921826720238, "learning_rate": 0.01, "loss": 2.0759, "step": 9630 }, { "epoch": 0.9897256755368334, "grad_norm": 0.07971720397472382, "learning_rate": 0.01, "loss": 2.0837, "step": 9633 }, { "epoch": 0.9900339052707284, "grad_norm": 0.12388944625854492, "learning_rate": 0.01, "loss": 2.1181, "step": 9636 }, { "epoch": 0.9903421350046234, "grad_norm": 0.040693242102861404, "learning_rate": 0.01, "loss": 2.0806, "step": 9639 }, { "epoch": 0.9906503647385184, "grad_norm": 0.032711997628211975, "learning_rate": 0.01, "loss": 2.0925, "step": 9642 }, { "epoch": 0.9909585944724134, "grad_norm": 0.04089382663369179, "learning_rate": 0.01, "loss": 2.0841, "step": 9645 }, { "epoch": 0.9912668242063084, "grad_norm": 0.05480481684207916, "learning_rate": 0.01, "loss": 2.0769, "step": 9648 }, { "epoch": 0.9915750539402034, "grad_norm": 0.04627472907304764, "learning_rate": 0.01, "loss": 2.094, "step": 9651 }, { "epoch": 0.9918832836740984, "grad_norm": 0.0517272874712944, "learning_rate": 0.01, "loss": 2.1181, "step": 9654 }, { "epoch": 0.9921915134079934, "grad_norm": 0.051012761890888214, "learning_rate": 0.01, "loss": 2.0985, "step": 9657 }, { "epoch": 0.9924997431418884, "grad_norm": 0.08666348457336426, "learning_rate": 0.01, "loss": 2.0875, "step": 9660 }, { "epoch": 0.9928079728757834, "grad_norm": 0.0972173810005188, "learning_rate": 0.01, "loss": 2.0995, "step": 9663 }, { "epoch": 0.9931162026096784, "grad_norm": 0.0765865370631218, "learning_rate": 0.01, "loss": 2.0729, "step": 9666 }, { "epoch": 0.9934244323435734, "grad_norm": 0.04532674700021744, "learning_rate": 0.01, "loss": 2.0656, "step": 9669 }, { "epoch": 0.9937326620774684, "grad_norm": 0.08642619848251343, "learning_rate": 0.01, "loss": 2.1036, "step": 9672 }, { "epoch": 0.9940408918113633, "grad_norm": 0.04758689925074577, "learning_rate": 0.01, "loss": 2.0683, "step": 9675 }, { "epoch": 0.9943491215452585, "grad_norm": 0.07701463252305984, "learning_rate": 0.01, "loss": 2.0898, "step": 9678 }, { "epoch": 0.9946573512791534, "grad_norm": 0.05999990925192833, "learning_rate": 0.01, "loss": 2.0694, "step": 9681 }, { "epoch": 0.9949655810130484, "grad_norm": 0.08793257176876068, "learning_rate": 0.01, "loss": 2.0689, "step": 9684 }, { "epoch": 0.9952738107469434, "grad_norm": 0.06139199063181877, "learning_rate": 0.01, "loss": 2.0801, "step": 9687 }, { "epoch": 0.9955820404808384, "grad_norm": 0.09202239662408829, "learning_rate": 0.01, "loss": 2.0837, "step": 9690 }, { "epoch": 0.9958902702147334, "grad_norm": 0.09284163266420364, "learning_rate": 0.01, "loss": 2.107, "step": 9693 }, { "epoch": 0.9961984999486284, "grad_norm": 0.08113729953765869, "learning_rate": 0.01, "loss": 2.076, "step": 9696 }, { "epoch": 0.9965067296825234, "grad_norm": 0.10663104802370071, "learning_rate": 0.01, "loss": 2.0973, "step": 9699 }, { "epoch": 0.9968149594164184, "grad_norm": 0.11791951954364777, "learning_rate": 0.01, "loss": 2.0885, "step": 9702 }, { "epoch": 0.9971231891503134, "grad_norm": 0.09039194136857986, "learning_rate": 0.01, "loss": 2.0957, "step": 9705 }, { "epoch": 0.9974314188842084, "grad_norm": 0.08142858743667603, "learning_rate": 0.01, "loss": 2.0721, "step": 9708 }, { "epoch": 0.9977396486181034, "grad_norm": 0.07347192615270615, "learning_rate": 0.01, "loss": 2.0985, "step": 9711 }, { "epoch": 0.9980478783519984, "grad_norm": 0.04449746012687683, "learning_rate": 0.01, "loss": 2.0728, "step": 9714 }, { "epoch": 0.9983561080858934, "grad_norm": 0.040178634226322174, "learning_rate": 0.01, "loss": 2.0773, "step": 9717 }, { "epoch": 0.9986643378197884, "grad_norm": 0.0577414333820343, "learning_rate": 0.01, "loss": 2.0854, "step": 9720 }, { "epoch": 0.9989725675536834, "grad_norm": 0.07444582879543304, "learning_rate": 0.01, "loss": 2.0834, "step": 9723 }, { "epoch": 0.9992807972875783, "grad_norm": 0.10387948155403137, "learning_rate": 0.01, "loss": 2.0698, "step": 9726 }, { "epoch": 0.9995890270214733, "grad_norm": 0.11066528409719467, "learning_rate": 0.01, "loss": 2.1035, "step": 9729 }, { "epoch": 0.9998972567553683, "grad_norm": 0.06454616039991379, "learning_rate": 0.01, "loss": 2.0692, "step": 9732 }, { "epoch": 0.9990763546798029, "grad_norm": 0.048325520008802414, "learning_rate": 0.01, "loss": 2.1225, "step": 9735 }, { "epoch": 0.999384236453202, "grad_norm": 0.03542228788137436, "learning_rate": 0.01, "loss": 2.1024, "step": 9738 }, { "epoch": 0.999692118226601, "grad_norm": 0.042020559310913086, "learning_rate": 0.01, "loss": 2.0968, "step": 9741 }, { "epoch": 1.0, "grad_norm": 0.04916913062334061, "learning_rate": 0.01, "loss": 2.1244, "step": 9744 }, { "epoch": 1.000307881773399, "grad_norm": 0.08905553072690964, "learning_rate": 0.01, "loss": 2.0867, "step": 9747 }, { "epoch": 1.000615763546798, "grad_norm": 0.07140953093767166, "learning_rate": 0.01, "loss": 2.0863, "step": 9750 }, { "epoch": 1.000923645320197, "grad_norm": 0.05284767597913742, "learning_rate": 0.01, "loss": 2.1131, "step": 9753 }, { "epoch": 1.001231527093596, "grad_norm": 0.1293289214372635, "learning_rate": 0.01, "loss": 2.1036, "step": 9756 }, { "epoch": 1.001539408866995, "grad_norm": 0.06052086502313614, "learning_rate": 0.01, "loss": 2.1189, "step": 9759 }, { "epoch": 1.0018472906403941, "grad_norm": 0.07361391931772232, "learning_rate": 0.01, "loss": 2.0962, "step": 9762 }, { "epoch": 1.0021551724137931, "grad_norm": 0.06513562798500061, "learning_rate": 0.01, "loss": 2.129, "step": 9765 }, { "epoch": 1.0024630541871922, "grad_norm": 0.036649156361818314, "learning_rate": 0.01, "loss": 2.0964, "step": 9768 }, { "epoch": 1.0027709359605912, "grad_norm": 0.05371764674782753, "learning_rate": 0.01, "loss": 2.0976, "step": 9771 }, { "epoch": 1.0030788177339902, "grad_norm": 0.06316730380058289, "learning_rate": 0.01, "loss": 2.097, "step": 9774 }, { "epoch": 1.0033866995073892, "grad_norm": 0.03097986802458763, "learning_rate": 0.01, "loss": 2.1128, "step": 9777 }, { "epoch": 1.0036945812807883, "grad_norm": 0.046021945774555206, "learning_rate": 0.01, "loss": 2.1296, "step": 9780 }, { "epoch": 1.0040024630541873, "grad_norm": 0.06580191850662231, "learning_rate": 0.01, "loss": 2.1106, "step": 9783 }, { "epoch": 1.0043103448275863, "grad_norm": 0.054073531180620193, "learning_rate": 0.01, "loss": 2.0986, "step": 9786 }, { "epoch": 1.0046182266009853, "grad_norm": 0.10088641196489334, "learning_rate": 0.01, "loss": 2.1301, "step": 9789 }, { "epoch": 1.0049261083743843, "grad_norm": 0.03944807127118111, "learning_rate": 0.01, "loss": 2.1337, "step": 9792 }, { "epoch": 1.0052339901477831, "grad_norm": 0.07183028757572174, "learning_rate": 0.01, "loss": 2.1272, "step": 9795 }, { "epoch": 1.0055418719211822, "grad_norm": 0.13821956515312195, "learning_rate": 0.01, "loss": 2.1016, "step": 9798 }, { "epoch": 1.0058497536945812, "grad_norm": 0.14031893014907837, "learning_rate": 0.01, "loss": 2.0924, "step": 9801 }, { "epoch": 1.0061576354679802, "grad_norm": 0.06494525820016861, "learning_rate": 0.01, "loss": 2.128, "step": 9804 }, { "epoch": 1.0064655172413792, "grad_norm": 0.05946667864918709, "learning_rate": 0.01, "loss": 2.1335, "step": 9807 }, { "epoch": 1.0067733990147782, "grad_norm": 0.05583272874355316, "learning_rate": 0.01, "loss": 2.1186, "step": 9810 }, { "epoch": 1.0070812807881773, "grad_norm": 0.06858284026384354, "learning_rate": 0.01, "loss": 2.1207, "step": 9813 }, { "epoch": 1.0073891625615763, "grad_norm": 0.05864641070365906, "learning_rate": 0.01, "loss": 2.0869, "step": 9816 }, { "epoch": 1.0076970443349753, "grad_norm": 0.043661102652549744, "learning_rate": 0.01, "loss": 2.1067, "step": 9819 }, { "epoch": 1.0080049261083743, "grad_norm": 0.07878375053405762, "learning_rate": 0.01, "loss": 2.1149, "step": 9822 }, { "epoch": 1.0083128078817734, "grad_norm": 0.04246210679411888, "learning_rate": 0.01, "loss": 2.1241, "step": 9825 }, { "epoch": 1.0086206896551724, "grad_norm": 0.06508597731590271, "learning_rate": 0.01, "loss": 2.1232, "step": 9828 }, { "epoch": 1.0089285714285714, "grad_norm": 0.07472758740186691, "learning_rate": 0.01, "loss": 2.0893, "step": 9831 }, { "epoch": 1.0092364532019704, "grad_norm": 0.13144147396087646, "learning_rate": 0.01, "loss": 2.1194, "step": 9834 }, { "epoch": 1.0095443349753694, "grad_norm": 0.08961367607116699, "learning_rate": 0.01, "loss": 2.1215, "step": 9837 }, { "epoch": 1.0098522167487685, "grad_norm": 0.053439076989889145, "learning_rate": 0.01, "loss": 2.1173, "step": 9840 }, { "epoch": 1.0101600985221675, "grad_norm": 0.03234443441033363, "learning_rate": 0.01, "loss": 2.104, "step": 9843 }, { "epoch": 1.0104679802955665, "grad_norm": 0.07516933977603912, "learning_rate": 0.01, "loss": 2.1186, "step": 9846 }, { "epoch": 1.0107758620689655, "grad_norm": 0.12221794575452805, "learning_rate": 0.01, "loss": 2.0934, "step": 9849 }, { "epoch": 1.0110837438423645, "grad_norm": 0.08198120445013046, "learning_rate": 0.01, "loss": 2.1495, "step": 9852 }, { "epoch": 1.0113916256157636, "grad_norm": 0.058380696922540665, "learning_rate": 0.01, "loss": 2.1234, "step": 9855 }, { "epoch": 1.0116995073891626, "grad_norm": 0.04831172525882721, "learning_rate": 0.01, "loss": 2.0977, "step": 9858 }, { "epoch": 1.0120073891625616, "grad_norm": 0.045920804142951965, "learning_rate": 0.01, "loss": 2.0842, "step": 9861 }, { "epoch": 1.0123152709359606, "grad_norm": 0.12969541549682617, "learning_rate": 0.01, "loss": 2.1005, "step": 9864 }, { "epoch": 1.0126231527093597, "grad_norm": 0.09659627079963684, "learning_rate": 0.01, "loss": 2.1126, "step": 9867 }, { "epoch": 1.0129310344827587, "grad_norm": 0.033160608261823654, "learning_rate": 0.01, "loss": 2.1244, "step": 9870 }, { "epoch": 1.0132389162561577, "grad_norm": 0.03523699939250946, "learning_rate": 0.01, "loss": 2.1009, "step": 9873 }, { "epoch": 1.0135467980295567, "grad_norm": 0.04670235142111778, "learning_rate": 0.01, "loss": 2.1107, "step": 9876 }, { "epoch": 1.0138546798029557, "grad_norm": 0.05278048664331436, "learning_rate": 0.01, "loss": 2.133, "step": 9879 }, { "epoch": 1.0141625615763548, "grad_norm": 0.1409105509519577, "learning_rate": 0.01, "loss": 2.105, "step": 9882 }, { "epoch": 1.0144704433497538, "grad_norm": 0.08208174258470535, "learning_rate": 0.01, "loss": 2.1202, "step": 9885 }, { "epoch": 1.0147783251231528, "grad_norm": 0.052980221807956696, "learning_rate": 0.01, "loss": 2.1108, "step": 9888 }, { "epoch": 1.0150862068965518, "grad_norm": 0.03402642160654068, "learning_rate": 0.01, "loss": 2.1058, "step": 9891 }, { "epoch": 1.0153940886699508, "grad_norm": 0.05165582895278931, "learning_rate": 0.01, "loss": 2.0962, "step": 9894 }, { "epoch": 1.0157019704433496, "grad_norm": 0.0488906130194664, "learning_rate": 0.01, "loss": 2.1157, "step": 9897 }, { "epoch": 1.0160098522167487, "grad_norm": 0.06578544527292252, "learning_rate": 0.01, "loss": 2.0783, "step": 9900 }, { "epoch": 1.0163177339901477, "grad_norm": 0.05930023267865181, "learning_rate": 0.01, "loss": 2.115, "step": 9903 }, { "epoch": 1.0166256157635467, "grad_norm": 0.07461842894554138, "learning_rate": 0.01, "loss": 2.0833, "step": 9906 }, { "epoch": 1.0169334975369457, "grad_norm": 0.04523751139640808, "learning_rate": 0.01, "loss": 2.1062, "step": 9909 }, { "epoch": 1.0172413793103448, "grad_norm": 0.05342249572277069, "learning_rate": 0.01, "loss": 2.1127, "step": 9912 }, { "epoch": 1.0175492610837438, "grad_norm": 0.040748368948698044, "learning_rate": 0.01, "loss": 2.0893, "step": 9915 }, { "epoch": 1.0178571428571428, "grad_norm": 0.03435824438929558, "learning_rate": 0.01, "loss": 2.1151, "step": 9918 }, { "epoch": 1.0181650246305418, "grad_norm": 0.04769265651702881, "learning_rate": 0.01, "loss": 2.0984, "step": 9921 }, { "epoch": 1.0184729064039408, "grad_norm": 0.07814217358827591, "learning_rate": 0.01, "loss": 2.1038, "step": 9924 }, { "epoch": 1.0187807881773399, "grad_norm": 0.12953363358974457, "learning_rate": 0.01, "loss": 2.1051, "step": 9927 }, { "epoch": 1.0190886699507389, "grad_norm": 0.11376773566007614, "learning_rate": 0.01, "loss": 2.0989, "step": 9930 }, { "epoch": 1.019396551724138, "grad_norm": 0.05323106423020363, "learning_rate": 0.01, "loss": 2.1135, "step": 9933 }, { "epoch": 1.019704433497537, "grad_norm": 0.07705114781856537, "learning_rate": 0.01, "loss": 2.1046, "step": 9936 }, { "epoch": 1.020012315270936, "grad_norm": 0.05934451147913933, "learning_rate": 0.01, "loss": 2.1207, "step": 9939 }, { "epoch": 1.020320197044335, "grad_norm": 0.10474961996078491, "learning_rate": 0.01, "loss": 2.1134, "step": 9942 }, { "epoch": 1.020628078817734, "grad_norm": 0.05283385515213013, "learning_rate": 0.01, "loss": 2.1085, "step": 9945 }, { "epoch": 1.020935960591133, "grad_norm": 0.043369196355342865, "learning_rate": 0.01, "loss": 2.1265, "step": 9948 }, { "epoch": 1.021243842364532, "grad_norm": 0.0366055853664875, "learning_rate": 0.01, "loss": 2.1214, "step": 9951 }, { "epoch": 1.021551724137931, "grad_norm": 0.06200672313570976, "learning_rate": 0.01, "loss": 2.0943, "step": 9954 }, { "epoch": 1.02185960591133, "grad_norm": 0.06652572005987167, "learning_rate": 0.01, "loss": 2.1139, "step": 9957 }, { "epoch": 1.022167487684729, "grad_norm": 0.04040740057826042, "learning_rate": 0.01, "loss": 2.0894, "step": 9960 }, { "epoch": 1.0224753694581281, "grad_norm": 0.049162358045578, "learning_rate": 0.01, "loss": 2.0955, "step": 9963 }, { "epoch": 1.0227832512315271, "grad_norm": 0.05465700104832649, "learning_rate": 0.01, "loss": 2.1109, "step": 9966 }, { "epoch": 1.0230911330049262, "grad_norm": 0.0575067512691021, "learning_rate": 0.01, "loss": 2.0956, "step": 9969 }, { "epoch": 1.0233990147783252, "grad_norm": 0.14622198045253754, "learning_rate": 0.01, "loss": 2.1031, "step": 9972 }, { "epoch": 1.0237068965517242, "grad_norm": 0.04765618219971657, "learning_rate": 0.01, "loss": 2.0834, "step": 9975 }, { "epoch": 1.0240147783251232, "grad_norm": 0.04039911553263664, "learning_rate": 0.01, "loss": 2.0933, "step": 9978 }, { "epoch": 1.0243226600985222, "grad_norm": 0.06009029969573021, "learning_rate": 0.01, "loss": 2.115, "step": 9981 }, { "epoch": 1.0246305418719213, "grad_norm": 0.06187298893928528, "learning_rate": 0.01, "loss": 2.079, "step": 9984 }, { "epoch": 1.0249384236453203, "grad_norm": 0.05368026718497276, "learning_rate": 0.01, "loss": 2.0875, "step": 9987 }, { "epoch": 1.0252463054187193, "grad_norm": 0.051921263337135315, "learning_rate": 0.01, "loss": 2.1243, "step": 9990 }, { "epoch": 1.0255541871921183, "grad_norm": 0.09820009768009186, "learning_rate": 0.01, "loss": 2.0983, "step": 9993 }, { "epoch": 1.0258620689655173, "grad_norm": 0.10601375997066498, "learning_rate": 0.01, "loss": 2.1288, "step": 9996 }, { "epoch": 1.0261699507389161, "grad_norm": 0.05488260090351105, "learning_rate": 0.01, "loss": 2.1033, "step": 9999 }, { "epoch": 1.0264778325123152, "grad_norm": 0.07482553273439407, "learning_rate": 0.01, "loss": 2.1181, "step": 10002 }, { "epoch": 1.0267857142857142, "grad_norm": 0.044733475893735886, "learning_rate": 0.01, "loss": 2.1237, "step": 10005 }, { "epoch": 1.0270935960591132, "grad_norm": 0.04775967076420784, "learning_rate": 0.01, "loss": 2.1288, "step": 10008 }, { "epoch": 1.0274014778325122, "grad_norm": 0.05972621962428093, "learning_rate": 0.01, "loss": 2.0878, "step": 10011 }, { "epoch": 1.0277093596059113, "grad_norm": 0.12219330668449402, "learning_rate": 0.01, "loss": 2.1034, "step": 10014 }, { "epoch": 1.0280172413793103, "grad_norm": 0.05171920731663704, "learning_rate": 0.01, "loss": 2.0925, "step": 10017 }, { "epoch": 1.0283251231527093, "grad_norm": 0.04166760668158531, "learning_rate": 0.01, "loss": 2.0928, "step": 10020 }, { "epoch": 1.0286330049261083, "grad_norm": 0.05231022089719772, "learning_rate": 0.01, "loss": 2.0945, "step": 10023 }, { "epoch": 1.0289408866995073, "grad_norm": 0.1091604232788086, "learning_rate": 0.01, "loss": 2.0878, "step": 10026 }, { "epoch": 1.0292487684729064, "grad_norm": 0.07104350626468658, "learning_rate": 0.01, "loss": 2.1125, "step": 10029 }, { "epoch": 1.0295566502463054, "grad_norm": 0.0466371588408947, "learning_rate": 0.01, "loss": 2.0973, "step": 10032 }, { "epoch": 1.0298645320197044, "grad_norm": 0.05548730120062828, "learning_rate": 0.01, "loss": 2.0846, "step": 10035 }, { "epoch": 1.0301724137931034, "grad_norm": 0.06483764201402664, "learning_rate": 0.01, "loss": 2.105, "step": 10038 }, { "epoch": 1.0304802955665024, "grad_norm": 0.05243910476565361, "learning_rate": 0.01, "loss": 2.1011, "step": 10041 }, { "epoch": 1.0307881773399015, "grad_norm": 0.09996815025806427, "learning_rate": 0.01, "loss": 2.1389, "step": 10044 }, { "epoch": 1.0310960591133005, "grad_norm": 0.04864559695124626, "learning_rate": 0.01, "loss": 2.1069, "step": 10047 }, { "epoch": 1.0314039408866995, "grad_norm": 0.14447607100009918, "learning_rate": 0.01, "loss": 2.1075, "step": 10050 }, { "epoch": 1.0317118226600985, "grad_norm": 0.050261352211236954, "learning_rate": 0.01, "loss": 2.1147, "step": 10053 }, { "epoch": 1.0320197044334976, "grad_norm": 0.07719244807958603, "learning_rate": 0.01, "loss": 2.1061, "step": 10056 }, { "epoch": 1.0323275862068966, "grad_norm": 0.10620381683111191, "learning_rate": 0.01, "loss": 2.1129, "step": 10059 }, { "epoch": 1.0326354679802956, "grad_norm": 0.05358508229255676, "learning_rate": 0.01, "loss": 2.1156, "step": 10062 }, { "epoch": 1.0329433497536946, "grad_norm": 0.04341145232319832, "learning_rate": 0.01, "loss": 2.1046, "step": 10065 }, { "epoch": 1.0332512315270936, "grad_norm": 0.04785105213522911, "learning_rate": 0.01, "loss": 2.0804, "step": 10068 }, { "epoch": 1.0335591133004927, "grad_norm": 0.04886849224567413, "learning_rate": 0.01, "loss": 2.0691, "step": 10071 }, { "epoch": 1.0338669950738917, "grad_norm": 0.03917735815048218, "learning_rate": 0.01, "loss": 2.0808, "step": 10074 }, { "epoch": 1.0341748768472907, "grad_norm": 0.10696244239807129, "learning_rate": 0.01, "loss": 2.085, "step": 10077 }, { "epoch": 1.0344827586206897, "grad_norm": 0.14525163173675537, "learning_rate": 0.01, "loss": 2.1246, "step": 10080 }, { "epoch": 1.0347906403940887, "grad_norm": 0.06464140862226486, "learning_rate": 0.01, "loss": 2.1088, "step": 10083 }, { "epoch": 1.0350985221674878, "grad_norm": 0.055628299713134766, "learning_rate": 0.01, "loss": 2.1013, "step": 10086 }, { "epoch": 1.0354064039408868, "grad_norm": 0.0457589291036129, "learning_rate": 0.01, "loss": 2.118, "step": 10089 }, { "epoch": 1.0357142857142858, "grad_norm": 0.07108809798955917, "learning_rate": 0.01, "loss": 2.0882, "step": 10092 }, { "epoch": 1.0360221674876848, "grad_norm": 0.07304032146930695, "learning_rate": 0.01, "loss": 2.1632, "step": 10095 }, { "epoch": 1.0363300492610836, "grad_norm": 0.04778844490647316, "learning_rate": 0.01, "loss": 2.1076, "step": 10098 }, { "epoch": 1.0366379310344827, "grad_norm": 0.0444946251809597, "learning_rate": 0.01, "loss": 2.1092, "step": 10101 }, { "epoch": 1.0369458128078817, "grad_norm": 0.03863450884819031, "learning_rate": 0.01, "loss": 2.0973, "step": 10104 }, { "epoch": 1.0372536945812807, "grad_norm": 0.11049003899097443, "learning_rate": 0.01, "loss": 2.1069, "step": 10107 }, { "epoch": 1.0375615763546797, "grad_norm": 0.055413637310266495, "learning_rate": 0.01, "loss": 2.0935, "step": 10110 }, { "epoch": 1.0378694581280787, "grad_norm": 0.1212301105260849, "learning_rate": 0.01, "loss": 2.1033, "step": 10113 }, { "epoch": 1.0381773399014778, "grad_norm": 0.06444283574819565, "learning_rate": 0.01, "loss": 2.0821, "step": 10116 }, { "epoch": 1.0384852216748768, "grad_norm": 0.048522353172302246, "learning_rate": 0.01, "loss": 2.1129, "step": 10119 }, { "epoch": 1.0387931034482758, "grad_norm": 0.03755674138665199, "learning_rate": 0.01, "loss": 2.0773, "step": 10122 }, { "epoch": 1.0391009852216748, "grad_norm": 0.03873259574174881, "learning_rate": 0.01, "loss": 2.0877, "step": 10125 }, { "epoch": 1.0394088669950738, "grad_norm": 0.062387898564338684, "learning_rate": 0.01, "loss": 2.1119, "step": 10128 }, { "epoch": 1.0397167487684729, "grad_norm": 0.037559203803539276, "learning_rate": 0.01, "loss": 2.1165, "step": 10131 }, { "epoch": 1.0400246305418719, "grad_norm": 0.0703917145729065, "learning_rate": 0.01, "loss": 2.0877, "step": 10134 }, { "epoch": 1.040332512315271, "grad_norm": 0.05063795670866966, "learning_rate": 0.01, "loss": 2.1282, "step": 10137 }, { "epoch": 1.04064039408867, "grad_norm": 0.08476493507623672, "learning_rate": 0.01, "loss": 2.1217, "step": 10140 }, { "epoch": 1.040948275862069, "grad_norm": 0.09482383728027344, "learning_rate": 0.01, "loss": 2.1002, "step": 10143 }, { "epoch": 1.041256157635468, "grad_norm": 0.1094396710395813, "learning_rate": 0.01, "loss": 2.1138, "step": 10146 }, { "epoch": 1.041564039408867, "grad_norm": 0.17252720892429352, "learning_rate": 0.01, "loss": 2.1079, "step": 10149 }, { "epoch": 1.041871921182266, "grad_norm": 0.11076754331588745, "learning_rate": 0.01, "loss": 2.1198, "step": 10152 }, { "epoch": 1.042179802955665, "grad_norm": 0.06879215687513351, "learning_rate": 0.01, "loss": 2.0878, "step": 10155 }, { "epoch": 1.042487684729064, "grad_norm": 0.07402212172746658, "learning_rate": 0.01, "loss": 2.0869, "step": 10158 }, { "epoch": 1.042795566502463, "grad_norm": 0.04562051594257355, "learning_rate": 0.01, "loss": 2.1139, "step": 10161 }, { "epoch": 1.043103448275862, "grad_norm": 0.04578396677970886, "learning_rate": 0.01, "loss": 2.0974, "step": 10164 }, { "epoch": 1.0434113300492611, "grad_norm": 0.051678020507097244, "learning_rate": 0.01, "loss": 2.0995, "step": 10167 }, { "epoch": 1.0437192118226601, "grad_norm": 0.03445015102624893, "learning_rate": 0.01, "loss": 2.106, "step": 10170 }, { "epoch": 1.0440270935960592, "grad_norm": 0.03868851810693741, "learning_rate": 0.01, "loss": 2.0732, "step": 10173 }, { "epoch": 1.0443349753694582, "grad_norm": 0.058904558420181274, "learning_rate": 0.01, "loss": 2.085, "step": 10176 }, { "epoch": 1.0446428571428572, "grad_norm": 0.10729484260082245, "learning_rate": 0.01, "loss": 2.0909, "step": 10179 }, { "epoch": 1.0449507389162562, "grad_norm": 0.10037554055452347, "learning_rate": 0.01, "loss": 2.0945, "step": 10182 }, { "epoch": 1.0452586206896552, "grad_norm": 0.07336730509996414, "learning_rate": 0.01, "loss": 2.0885, "step": 10185 }, { "epoch": 1.0455665024630543, "grad_norm": 0.11717227101325989, "learning_rate": 0.01, "loss": 2.1019, "step": 10188 }, { "epoch": 1.0458743842364533, "grad_norm": 0.06263696402311325, "learning_rate": 0.01, "loss": 2.1113, "step": 10191 }, { "epoch": 1.0461822660098523, "grad_norm": 0.07939436286687851, "learning_rate": 0.01, "loss": 2.0803, "step": 10194 }, { "epoch": 1.0464901477832513, "grad_norm": 0.05761004984378815, "learning_rate": 0.01, "loss": 2.1155, "step": 10197 }, { "epoch": 1.0467980295566504, "grad_norm": 0.04293765127658844, "learning_rate": 0.01, "loss": 2.0944, "step": 10200 }, { "epoch": 1.0471059113300492, "grad_norm": 0.04638001322746277, "learning_rate": 0.01, "loss": 2.113, "step": 10203 }, { "epoch": 1.0474137931034482, "grad_norm": 0.047882046550512314, "learning_rate": 0.01, "loss": 2.0733, "step": 10206 }, { "epoch": 1.0477216748768472, "grad_norm": 0.07461071759462357, "learning_rate": 0.01, "loss": 2.107, "step": 10209 }, { "epoch": 1.0480295566502462, "grad_norm": 0.10987289249897003, "learning_rate": 0.01, "loss": 2.105, "step": 10212 }, { "epoch": 1.0483374384236452, "grad_norm": 0.04183235019445419, "learning_rate": 0.01, "loss": 2.0953, "step": 10215 }, { "epoch": 1.0486453201970443, "grad_norm": 0.049700990319252014, "learning_rate": 0.01, "loss": 2.1067, "step": 10218 }, { "epoch": 1.0489532019704433, "grad_norm": 0.08448828011751175, "learning_rate": 0.01, "loss": 2.1113, "step": 10221 }, { "epoch": 1.0492610837438423, "grad_norm": 0.05486508831381798, "learning_rate": 0.01, "loss": 2.1156, "step": 10224 }, { "epoch": 1.0495689655172413, "grad_norm": 0.057925377041101456, "learning_rate": 0.01, "loss": 2.127, "step": 10227 }, { "epoch": 1.0498768472906403, "grad_norm": 0.05322302505373955, "learning_rate": 0.01, "loss": 2.0861, "step": 10230 }, { "epoch": 1.0501847290640394, "grad_norm": 0.046823181211948395, "learning_rate": 0.01, "loss": 2.089, "step": 10233 }, { "epoch": 1.0504926108374384, "grad_norm": 0.05037027224898338, "learning_rate": 0.01, "loss": 2.0841, "step": 10236 }, { "epoch": 1.0508004926108374, "grad_norm": 0.05172303318977356, "learning_rate": 0.01, "loss": 2.105, "step": 10239 }, { "epoch": 1.0511083743842364, "grad_norm": 0.07993052154779434, "learning_rate": 0.01, "loss": 2.1097, "step": 10242 }, { "epoch": 1.0514162561576355, "grad_norm": 0.039322953671216965, "learning_rate": 0.01, "loss": 2.0951, "step": 10245 }, { "epoch": 1.0517241379310345, "grad_norm": 0.05829343572258949, "learning_rate": 0.01, "loss": 2.1257, "step": 10248 }, { "epoch": 1.0520320197044335, "grad_norm": 0.12303601950407028, "learning_rate": 0.01, "loss": 2.1143, "step": 10251 }, { "epoch": 1.0523399014778325, "grad_norm": 0.07176418602466583, "learning_rate": 0.01, "loss": 2.1297, "step": 10254 }, { "epoch": 1.0526477832512315, "grad_norm": 0.05229344964027405, "learning_rate": 0.01, "loss": 2.0934, "step": 10257 }, { "epoch": 1.0529556650246306, "grad_norm": 0.041665658354759216, "learning_rate": 0.01, "loss": 2.116, "step": 10260 }, { "epoch": 1.0532635467980296, "grad_norm": 0.04542261362075806, "learning_rate": 0.01, "loss": 2.1277, "step": 10263 }, { "epoch": 1.0535714285714286, "grad_norm": 0.0501495897769928, "learning_rate": 0.01, "loss": 2.0911, "step": 10266 }, { "epoch": 1.0538793103448276, "grad_norm": 0.06474924832582474, "learning_rate": 0.01, "loss": 2.1254, "step": 10269 }, { "epoch": 1.0541871921182266, "grad_norm": 0.0736108273267746, "learning_rate": 0.01, "loss": 2.0685, "step": 10272 }, { "epoch": 1.0544950738916257, "grad_norm": 0.07487022131681442, "learning_rate": 0.01, "loss": 2.113, "step": 10275 }, { "epoch": 1.0548029556650247, "grad_norm": 0.04876410961151123, "learning_rate": 0.01, "loss": 2.1051, "step": 10278 }, { "epoch": 1.0551108374384237, "grad_norm": 0.056595779955387115, "learning_rate": 0.01, "loss": 2.0864, "step": 10281 }, { "epoch": 1.0554187192118227, "grad_norm": 0.06958241015672684, "learning_rate": 0.01, "loss": 2.1, "step": 10284 }, { "epoch": 1.0557266009852218, "grad_norm": 0.08811846375465393, "learning_rate": 0.01, "loss": 2.1021, "step": 10287 }, { "epoch": 1.0560344827586208, "grad_norm": 0.061557747423648834, "learning_rate": 0.01, "loss": 2.1063, "step": 10290 }, { "epoch": 1.0563423645320198, "grad_norm": 0.07043389976024628, "learning_rate": 0.01, "loss": 2.106, "step": 10293 }, { "epoch": 1.0566502463054188, "grad_norm": 0.0916379988193512, "learning_rate": 0.01, "loss": 2.0851, "step": 10296 }, { "epoch": 1.0569581280788178, "grad_norm": 0.050577979534864426, "learning_rate": 0.01, "loss": 2.0966, "step": 10299 }, { "epoch": 1.0572660098522166, "grad_norm": 0.06576110422611237, "learning_rate": 0.01, "loss": 2.1038, "step": 10302 }, { "epoch": 1.0575738916256157, "grad_norm": 0.09315023571252823, "learning_rate": 0.01, "loss": 2.1341, "step": 10305 }, { "epoch": 1.0578817733990147, "grad_norm": 0.0649820864200592, "learning_rate": 0.01, "loss": 2.1064, "step": 10308 }, { "epoch": 1.0581896551724137, "grad_norm": 0.07930494844913483, "learning_rate": 0.01, "loss": 2.107, "step": 10311 }, { "epoch": 1.0584975369458127, "grad_norm": 0.09142257273197174, "learning_rate": 0.01, "loss": 2.1162, "step": 10314 }, { "epoch": 1.0588054187192117, "grad_norm": 0.05011974647641182, "learning_rate": 0.01, "loss": 2.0686, "step": 10317 }, { "epoch": 1.0591133004926108, "grad_norm": 0.1002635508775711, "learning_rate": 0.01, "loss": 2.138, "step": 10320 }, { "epoch": 1.0594211822660098, "grad_norm": 0.07570278644561768, "learning_rate": 0.01, "loss": 2.0693, "step": 10323 }, { "epoch": 1.0597290640394088, "grad_norm": 0.05086719989776611, "learning_rate": 0.01, "loss": 2.0991, "step": 10326 }, { "epoch": 1.0600369458128078, "grad_norm": 0.03596855327486992, "learning_rate": 0.01, "loss": 2.1038, "step": 10329 }, { "epoch": 1.0603448275862069, "grad_norm": 0.05059434473514557, "learning_rate": 0.01, "loss": 2.1, "step": 10332 }, { "epoch": 1.0606527093596059, "grad_norm": 0.058818116784095764, "learning_rate": 0.01, "loss": 2.0855, "step": 10335 }, { "epoch": 1.060960591133005, "grad_norm": 0.14139403402805328, "learning_rate": 0.01, "loss": 2.0755, "step": 10338 }, { "epoch": 1.061268472906404, "grad_norm": 0.12123113870620728, "learning_rate": 0.01, "loss": 2.0896, "step": 10341 }, { "epoch": 1.061576354679803, "grad_norm": 0.04767270013689995, "learning_rate": 0.01, "loss": 2.11, "step": 10344 }, { "epoch": 1.061884236453202, "grad_norm": 0.03506815433502197, "learning_rate": 0.01, "loss": 2.0953, "step": 10347 }, { "epoch": 1.062192118226601, "grad_norm": 0.08807789534330368, "learning_rate": 0.01, "loss": 2.0903, "step": 10350 }, { "epoch": 1.0625, "grad_norm": 0.1130862608551979, "learning_rate": 0.01, "loss": 2.0888, "step": 10353 }, { "epoch": 1.062807881773399, "grad_norm": 0.05720696598291397, "learning_rate": 0.01, "loss": 2.0904, "step": 10356 }, { "epoch": 1.063115763546798, "grad_norm": 0.057933416217565536, "learning_rate": 0.01, "loss": 2.1138, "step": 10359 }, { "epoch": 1.063423645320197, "grad_norm": 0.056713253259658813, "learning_rate": 0.01, "loss": 2.0965, "step": 10362 }, { "epoch": 1.063731527093596, "grad_norm": 0.05062280595302582, "learning_rate": 0.01, "loss": 2.1058, "step": 10365 }, { "epoch": 1.064039408866995, "grad_norm": 0.03439073637127876, "learning_rate": 0.01, "loss": 2.0945, "step": 10368 }, { "epoch": 1.0643472906403941, "grad_norm": 0.10244173556566238, "learning_rate": 0.01, "loss": 2.0916, "step": 10371 }, { "epoch": 1.0646551724137931, "grad_norm": 0.04706069454550743, "learning_rate": 0.01, "loss": 2.103, "step": 10374 }, { "epoch": 1.0649630541871922, "grad_norm": 0.11580058932304382, "learning_rate": 0.01, "loss": 2.0995, "step": 10377 }, { "epoch": 1.0652709359605912, "grad_norm": 0.044736508280038834, "learning_rate": 0.01, "loss": 2.0906, "step": 10380 }, { "epoch": 1.0655788177339902, "grad_norm": 0.08990567922592163, "learning_rate": 0.01, "loss": 2.1197, "step": 10383 }, { "epoch": 1.0658866995073892, "grad_norm": 0.06923419237136841, "learning_rate": 0.01, "loss": 2.0997, "step": 10386 }, { "epoch": 1.0661945812807883, "grad_norm": 0.059495240449905396, "learning_rate": 0.01, "loss": 2.1106, "step": 10389 }, { "epoch": 1.0665024630541873, "grad_norm": 0.07906550914049149, "learning_rate": 0.01, "loss": 2.1196, "step": 10392 }, { "epoch": 1.0668103448275863, "grad_norm": 0.08792297542095184, "learning_rate": 0.01, "loss": 2.0985, "step": 10395 }, { "epoch": 1.0671182266009853, "grad_norm": 0.06077072396874428, "learning_rate": 0.01, "loss": 2.088, "step": 10398 }, { "epoch": 1.0674261083743843, "grad_norm": 0.03865751996636391, "learning_rate": 0.01, "loss": 2.0894, "step": 10401 }, { "epoch": 1.0677339901477834, "grad_norm": 0.03158612549304962, "learning_rate": 0.01, "loss": 2.0861, "step": 10404 }, { "epoch": 1.0680418719211822, "grad_norm": 0.03455328568816185, "learning_rate": 0.01, "loss": 2.0819, "step": 10407 }, { "epoch": 1.0683497536945812, "grad_norm": 0.062100328505039215, "learning_rate": 0.01, "loss": 2.0967, "step": 10410 }, { "epoch": 1.0686576354679802, "grad_norm": 0.10934283584356308, "learning_rate": 0.01, "loss": 2.1135, "step": 10413 }, { "epoch": 1.0689655172413792, "grad_norm": 0.07184179127216339, "learning_rate": 0.01, "loss": 2.0949, "step": 10416 }, { "epoch": 1.0692733990147782, "grad_norm": 0.06610151380300522, "learning_rate": 0.01, "loss": 2.1001, "step": 10419 }, { "epoch": 1.0695812807881773, "grad_norm": 0.06064629554748535, "learning_rate": 0.01, "loss": 2.0835, "step": 10422 }, { "epoch": 1.0698891625615763, "grad_norm": 0.0531432181596756, "learning_rate": 0.01, "loss": 2.1105, "step": 10425 }, { "epoch": 1.0701970443349753, "grad_norm": 0.056448470801115036, "learning_rate": 0.01, "loss": 2.0724, "step": 10428 }, { "epoch": 1.0705049261083743, "grad_norm": 0.03736816346645355, "learning_rate": 0.01, "loss": 2.1243, "step": 10431 }, { "epoch": 1.0708128078817734, "grad_norm": 0.12693117558956146, "learning_rate": 0.01, "loss": 2.1156, "step": 10434 }, { "epoch": 1.0711206896551724, "grad_norm": 0.0428193174302578, "learning_rate": 0.01, "loss": 2.1025, "step": 10437 }, { "epoch": 1.0714285714285714, "grad_norm": 0.0464596189558506, "learning_rate": 0.01, "loss": 2.1067, "step": 10440 }, { "epoch": 1.0717364532019704, "grad_norm": 0.07535267621278763, "learning_rate": 0.01, "loss": 2.0785, "step": 10443 }, { "epoch": 1.0720443349753694, "grad_norm": 0.0537327378988266, "learning_rate": 0.01, "loss": 2.0775, "step": 10446 }, { "epoch": 1.0723522167487685, "grad_norm": 0.03783145919442177, "learning_rate": 0.01, "loss": 2.0921, "step": 10449 }, { "epoch": 1.0726600985221675, "grad_norm": 0.052689142525196075, "learning_rate": 0.01, "loss": 2.116, "step": 10452 }, { "epoch": 1.0729679802955665, "grad_norm": 0.1437288373708725, "learning_rate": 0.01, "loss": 2.071, "step": 10455 }, { "epoch": 1.0732758620689655, "grad_norm": 0.07633062452077866, "learning_rate": 0.01, "loss": 2.088, "step": 10458 }, { "epoch": 1.0735837438423645, "grad_norm": 0.061189718544483185, "learning_rate": 0.01, "loss": 2.0796, "step": 10461 }, { "epoch": 1.0738916256157636, "grad_norm": 0.06256800144910812, "learning_rate": 0.01, "loss": 2.1056, "step": 10464 }, { "epoch": 1.0741995073891626, "grad_norm": 0.0745188519358635, "learning_rate": 0.01, "loss": 2.0782, "step": 10467 }, { "epoch": 1.0745073891625616, "grad_norm": 0.0663486197590828, "learning_rate": 0.01, "loss": 2.0704, "step": 10470 }, { "epoch": 1.0748152709359606, "grad_norm": 0.05472427234053612, "learning_rate": 0.01, "loss": 2.082, "step": 10473 }, { "epoch": 1.0751231527093597, "grad_norm": 0.10171230137348175, "learning_rate": 0.01, "loss": 2.1135, "step": 10476 }, { "epoch": 1.0754310344827587, "grad_norm": 0.05689026787877083, "learning_rate": 0.01, "loss": 2.0748, "step": 10479 }, { "epoch": 1.0757389162561577, "grad_norm": 0.0593440905213356, "learning_rate": 0.01, "loss": 2.0922, "step": 10482 }, { "epoch": 1.0760467980295567, "grad_norm": 0.07408995181322098, "learning_rate": 0.01, "loss": 2.0781, "step": 10485 }, { "epoch": 1.0763546798029557, "grad_norm": 0.05688070133328438, "learning_rate": 0.01, "loss": 2.1085, "step": 10488 }, { "epoch": 1.0766625615763548, "grad_norm": 0.05378828942775726, "learning_rate": 0.01, "loss": 2.1084, "step": 10491 }, { "epoch": 1.0769704433497538, "grad_norm": 0.057735592126846313, "learning_rate": 0.01, "loss": 2.1023, "step": 10494 }, { "epoch": 1.0772783251231528, "grad_norm": 0.0586666576564312, "learning_rate": 0.01, "loss": 2.1003, "step": 10497 }, { "epoch": 1.0775862068965518, "grad_norm": 0.12087473273277283, "learning_rate": 0.01, "loss": 2.0974, "step": 10500 }, { "epoch": 1.0778940886699506, "grad_norm": 0.07307861000299454, "learning_rate": 0.01, "loss": 2.0913, "step": 10503 }, { "epoch": 1.0782019704433496, "grad_norm": 0.06621012091636658, "learning_rate": 0.01, "loss": 2.1173, "step": 10506 }, { "epoch": 1.0785098522167487, "grad_norm": 0.0647876039147377, "learning_rate": 0.01, "loss": 2.1006, "step": 10509 }, { "epoch": 1.0788177339901477, "grad_norm": 0.06163914501667023, "learning_rate": 0.01, "loss": 2.0892, "step": 10512 }, { "epoch": 1.0791256157635467, "grad_norm": 0.04312353581190109, "learning_rate": 0.01, "loss": 2.0901, "step": 10515 }, { "epoch": 1.0794334975369457, "grad_norm": 0.0760812908411026, "learning_rate": 0.01, "loss": 2.0995, "step": 10518 }, { "epoch": 1.0797413793103448, "grad_norm": 0.0802140161395073, "learning_rate": 0.01, "loss": 2.0905, "step": 10521 }, { "epoch": 1.0800492610837438, "grad_norm": 0.09008529782295227, "learning_rate": 0.01, "loss": 2.08, "step": 10524 }, { "epoch": 1.0803571428571428, "grad_norm": 0.07469696551561356, "learning_rate": 0.01, "loss": 2.0725, "step": 10527 }, { "epoch": 1.0806650246305418, "grad_norm": 0.08821582794189453, "learning_rate": 0.01, "loss": 2.1086, "step": 10530 }, { "epoch": 1.0809729064039408, "grad_norm": 0.04690997302532196, "learning_rate": 0.01, "loss": 2.1095, "step": 10533 }, { "epoch": 1.0812807881773399, "grad_norm": 0.04316158965229988, "learning_rate": 0.01, "loss": 2.0818, "step": 10536 }, { "epoch": 1.0815886699507389, "grad_norm": 0.06996279209852219, "learning_rate": 0.01, "loss": 2.0993, "step": 10539 }, { "epoch": 1.081896551724138, "grad_norm": 0.10073279589414597, "learning_rate": 0.01, "loss": 2.112, "step": 10542 }, { "epoch": 1.082204433497537, "grad_norm": 0.0448322668671608, "learning_rate": 0.01, "loss": 2.0834, "step": 10545 }, { "epoch": 1.082512315270936, "grad_norm": 0.11411638557910919, "learning_rate": 0.01, "loss": 2.1082, "step": 10548 }, { "epoch": 1.082820197044335, "grad_norm": 0.10779088735580444, "learning_rate": 0.01, "loss": 2.0702, "step": 10551 }, { "epoch": 1.083128078817734, "grad_norm": 0.041448626667261124, "learning_rate": 0.01, "loss": 2.1041, "step": 10554 }, { "epoch": 1.083435960591133, "grad_norm": 0.07522560656070709, "learning_rate": 0.01, "loss": 2.0794, "step": 10557 }, { "epoch": 1.083743842364532, "grad_norm": 0.048221901059150696, "learning_rate": 0.01, "loss": 2.0936, "step": 10560 }, { "epoch": 1.084051724137931, "grad_norm": 0.05512038618326187, "learning_rate": 0.01, "loss": 2.0898, "step": 10563 }, { "epoch": 1.08435960591133, "grad_norm": 0.07599300891160965, "learning_rate": 0.01, "loss": 2.1246, "step": 10566 }, { "epoch": 1.084667487684729, "grad_norm": 0.06631644070148468, "learning_rate": 0.01, "loss": 2.0861, "step": 10569 }, { "epoch": 1.0849753694581281, "grad_norm": 0.04972488060593605, "learning_rate": 0.01, "loss": 2.11, "step": 10572 }, { "epoch": 1.0852832512315271, "grad_norm": 0.08250217139720917, "learning_rate": 0.01, "loss": 2.1142, "step": 10575 }, { "epoch": 1.0855911330049262, "grad_norm": 0.09104974567890167, "learning_rate": 0.01, "loss": 2.0822, "step": 10578 }, { "epoch": 1.0858990147783252, "grad_norm": 0.057310063391923904, "learning_rate": 0.01, "loss": 2.0819, "step": 10581 }, { "epoch": 1.0862068965517242, "grad_norm": 0.08102291077375412, "learning_rate": 0.01, "loss": 2.0931, "step": 10584 }, { "epoch": 1.0865147783251232, "grad_norm": 0.045641325414180756, "learning_rate": 0.01, "loss": 2.1096, "step": 10587 }, { "epoch": 1.0868226600985222, "grad_norm": 0.05350523442029953, "learning_rate": 0.01, "loss": 2.1151, "step": 10590 }, { "epoch": 1.0871305418719213, "grad_norm": 0.045734379440546036, "learning_rate": 0.01, "loss": 2.1043, "step": 10593 }, { "epoch": 1.0874384236453203, "grad_norm": 0.044645924121141434, "learning_rate": 0.01, "loss": 2.0882, "step": 10596 }, { "epoch": 1.0877463054187193, "grad_norm": 0.046704743057489395, "learning_rate": 0.01, "loss": 2.0823, "step": 10599 }, { "epoch": 1.0880541871921183, "grad_norm": 0.09600807726383209, "learning_rate": 0.01, "loss": 2.09, "step": 10602 }, { "epoch": 1.0883620689655173, "grad_norm": 0.062323443591594696, "learning_rate": 0.01, "loss": 2.091, "step": 10605 }, { "epoch": 1.0886699507389164, "grad_norm": 0.08459887653589249, "learning_rate": 0.01, "loss": 2.094, "step": 10608 }, { "epoch": 1.0889778325123152, "grad_norm": 0.0621943362057209, "learning_rate": 0.01, "loss": 2.0735, "step": 10611 }, { "epoch": 1.0892857142857142, "grad_norm": 0.10963741689920425, "learning_rate": 0.01, "loss": 2.0769, "step": 10614 }, { "epoch": 1.0895935960591132, "grad_norm": 0.07325689494609833, "learning_rate": 0.01, "loss": 2.0905, "step": 10617 }, { "epoch": 1.0899014778325122, "grad_norm": 0.08307964354753494, "learning_rate": 0.01, "loss": 2.0977, "step": 10620 }, { "epoch": 1.0902093596059113, "grad_norm": 0.18072094023227692, "learning_rate": 0.01, "loss": 2.1096, "step": 10623 }, { "epoch": 1.0905172413793103, "grad_norm": 0.10427471250295639, "learning_rate": 0.01, "loss": 2.0761, "step": 10626 }, { "epoch": 1.0908251231527093, "grad_norm": 0.0732191875576973, "learning_rate": 0.01, "loss": 2.1031, "step": 10629 }, { "epoch": 1.0911330049261083, "grad_norm": 0.03703717514872551, "learning_rate": 0.01, "loss": 2.0734, "step": 10632 }, { "epoch": 1.0914408866995073, "grad_norm": 0.04907006770372391, "learning_rate": 0.01, "loss": 2.096, "step": 10635 }, { "epoch": 1.0917487684729064, "grad_norm": 0.04126304015517235, "learning_rate": 0.01, "loss": 2.0824, "step": 10638 }, { "epoch": 1.0920566502463054, "grad_norm": 0.04017401486635208, "learning_rate": 0.01, "loss": 2.0694, "step": 10641 }, { "epoch": 1.0923645320197044, "grad_norm": 0.036132264882326126, "learning_rate": 0.01, "loss": 2.0792, "step": 10644 }, { "epoch": 1.0926724137931034, "grad_norm": 0.06275150179862976, "learning_rate": 0.01, "loss": 2.1172, "step": 10647 }, { "epoch": 1.0929802955665024, "grad_norm": 0.08319203555583954, "learning_rate": 0.01, "loss": 2.0868, "step": 10650 }, { "epoch": 1.0932881773399015, "grad_norm": 0.08663000166416168, "learning_rate": 0.01, "loss": 2.0834, "step": 10653 }, { "epoch": 1.0935960591133005, "grad_norm": 0.10765951871871948, "learning_rate": 0.01, "loss": 2.0891, "step": 10656 }, { "epoch": 1.0939039408866995, "grad_norm": 0.035412587225437164, "learning_rate": 0.01, "loss": 2.0912, "step": 10659 }, { "epoch": 1.0942118226600985, "grad_norm": 0.051735054701566696, "learning_rate": 0.01, "loss": 2.0986, "step": 10662 }, { "epoch": 1.0945197044334976, "grad_norm": 0.04320614039897919, "learning_rate": 0.01, "loss": 2.0912, "step": 10665 }, { "epoch": 1.0948275862068966, "grad_norm": 0.03285462409257889, "learning_rate": 0.01, "loss": 2.0957, "step": 10668 }, { "epoch": 1.0951354679802956, "grad_norm": 0.05172726511955261, "learning_rate": 0.01, "loss": 2.0706, "step": 10671 }, { "epoch": 1.0954433497536946, "grad_norm": 0.04941645637154579, "learning_rate": 0.01, "loss": 2.1018, "step": 10674 }, { "epoch": 1.0957512315270936, "grad_norm": 0.04746576398611069, "learning_rate": 0.01, "loss": 2.1002, "step": 10677 }, { "epoch": 1.0960591133004927, "grad_norm": 0.10900839418172836, "learning_rate": 0.01, "loss": 2.1188, "step": 10680 }, { "epoch": 1.0963669950738917, "grad_norm": 0.06924229860305786, "learning_rate": 0.01, "loss": 2.097, "step": 10683 }, { "epoch": 1.0966748768472907, "grad_norm": 0.11047599464654922, "learning_rate": 0.01, "loss": 2.0607, "step": 10686 }, { "epoch": 1.0969827586206897, "grad_norm": 0.10662158578634262, "learning_rate": 0.01, "loss": 2.078, "step": 10689 }, { "epoch": 1.0972906403940887, "grad_norm": 0.07408568263053894, "learning_rate": 0.01, "loss": 2.0918, "step": 10692 }, { "epoch": 1.0975985221674878, "grad_norm": 0.0471009686589241, "learning_rate": 0.01, "loss": 2.1248, "step": 10695 }, { "epoch": 1.0979064039408868, "grad_norm": 0.049591194838285446, "learning_rate": 0.01, "loss": 2.082, "step": 10698 }, { "epoch": 1.0982142857142858, "grad_norm": 0.0919683426618576, "learning_rate": 0.01, "loss": 2.1229, "step": 10701 }, { "epoch": 1.0985221674876848, "grad_norm": 0.05292963236570358, "learning_rate": 0.01, "loss": 2.1097, "step": 10704 }, { "epoch": 1.0988300492610836, "grad_norm": 0.053880974650382996, "learning_rate": 0.01, "loss": 2.0787, "step": 10707 }, { "epoch": 1.0991379310344827, "grad_norm": 0.05608196556568146, "learning_rate": 0.01, "loss": 2.0735, "step": 10710 }, { "epoch": 1.0994458128078817, "grad_norm": 0.06456641852855682, "learning_rate": 0.01, "loss": 2.1148, "step": 10713 }, { "epoch": 1.0997536945812807, "grad_norm": 0.08165917545557022, "learning_rate": 0.01, "loss": 2.1199, "step": 10716 }, { "epoch": 1.1000615763546797, "grad_norm": 0.0773044228553772, "learning_rate": 0.01, "loss": 2.0972, "step": 10719 }, { "epoch": 1.1003694581280787, "grad_norm": 0.07669848203659058, "learning_rate": 0.01, "loss": 2.101, "step": 10722 }, { "epoch": 1.1006773399014778, "grad_norm": 0.0773942843079567, "learning_rate": 0.01, "loss": 2.0573, "step": 10725 }, { "epoch": 1.1009852216748768, "grad_norm": 0.06698640435934067, "learning_rate": 0.01, "loss": 2.1189, "step": 10728 }, { "epoch": 1.1012931034482758, "grad_norm": 0.098200224339962, "learning_rate": 0.01, "loss": 2.0739, "step": 10731 }, { "epoch": 1.1016009852216748, "grad_norm": 0.06676481664180756, "learning_rate": 0.01, "loss": 2.097, "step": 10734 }, { "epoch": 1.1019088669950738, "grad_norm": 0.03925321251153946, "learning_rate": 0.01, "loss": 2.0904, "step": 10737 }, { "epoch": 1.1022167487684729, "grad_norm": 0.08387935161590576, "learning_rate": 0.01, "loss": 2.1069, "step": 10740 }, { "epoch": 1.1025246305418719, "grad_norm": 0.06382130831480026, "learning_rate": 0.01, "loss": 2.091, "step": 10743 }, { "epoch": 1.102832512315271, "grad_norm": 0.04457903653383255, "learning_rate": 0.01, "loss": 2.074, "step": 10746 }, { "epoch": 1.10314039408867, "grad_norm": 0.057858239859342575, "learning_rate": 0.01, "loss": 2.1021, "step": 10749 }, { "epoch": 1.103448275862069, "grad_norm": 0.055992983281612396, "learning_rate": 0.01, "loss": 2.0894, "step": 10752 }, { "epoch": 1.103756157635468, "grad_norm": 0.10200835764408112, "learning_rate": 0.01, "loss": 2.0948, "step": 10755 }, { "epoch": 1.104064039408867, "grad_norm": 0.11163626611232758, "learning_rate": 0.01, "loss": 2.0963, "step": 10758 }, { "epoch": 1.104371921182266, "grad_norm": 0.11462046951055527, "learning_rate": 0.01, "loss": 2.0808, "step": 10761 }, { "epoch": 1.104679802955665, "grad_norm": 0.08823121339082718, "learning_rate": 0.01, "loss": 2.1136, "step": 10764 }, { "epoch": 1.104987684729064, "grad_norm": 0.08843538910150528, "learning_rate": 0.01, "loss": 2.0767, "step": 10767 }, { "epoch": 1.105295566502463, "grad_norm": 0.05961614102125168, "learning_rate": 0.01, "loss": 2.1067, "step": 10770 }, { "epoch": 1.105603448275862, "grad_norm": 0.08095360547304153, "learning_rate": 0.01, "loss": 2.066, "step": 10773 }, { "epoch": 1.1059113300492611, "grad_norm": 0.08094312995672226, "learning_rate": 0.01, "loss": 2.0849, "step": 10776 }, { "epoch": 1.1062192118226601, "grad_norm": 0.05718453973531723, "learning_rate": 0.01, "loss": 2.1097, "step": 10779 }, { "epoch": 1.1065270935960592, "grad_norm": 0.0537499338388443, "learning_rate": 0.01, "loss": 2.082, "step": 10782 }, { "epoch": 1.1068349753694582, "grad_norm": 0.06437748670578003, "learning_rate": 0.01, "loss": 2.0982, "step": 10785 }, { "epoch": 1.1071428571428572, "grad_norm": 0.03420199081301689, "learning_rate": 0.01, "loss": 2.0919, "step": 10788 }, { "epoch": 1.1074507389162562, "grad_norm": 0.049510665237903595, "learning_rate": 0.01, "loss": 2.0777, "step": 10791 }, { "epoch": 1.1077586206896552, "grad_norm": 0.044145356863737106, "learning_rate": 0.01, "loss": 2.0994, "step": 10794 }, { "epoch": 1.1080665024630543, "grad_norm": 0.0494622103869915, "learning_rate": 0.01, "loss": 2.103, "step": 10797 }, { "epoch": 1.1083743842364533, "grad_norm": 0.039029188454151154, "learning_rate": 0.01, "loss": 2.1026, "step": 10800 }, { "epoch": 1.1086822660098523, "grad_norm": 0.05786842480301857, "learning_rate": 0.01, "loss": 2.0767, "step": 10803 }, { "epoch": 1.1089901477832513, "grad_norm": 0.07576561719179153, "learning_rate": 0.01, "loss": 2.1078, "step": 10806 }, { "epoch": 1.1092980295566504, "grad_norm": 0.084762342274189, "learning_rate": 0.01, "loss": 2.1027, "step": 10809 }, { "epoch": 1.1096059113300494, "grad_norm": 0.05042179673910141, "learning_rate": 0.01, "loss": 2.0742, "step": 10812 }, { "epoch": 1.1099137931034482, "grad_norm": 0.07194402068853378, "learning_rate": 0.01, "loss": 2.0985, "step": 10815 }, { "epoch": 1.1102216748768472, "grad_norm": 0.13966146111488342, "learning_rate": 0.01, "loss": 2.0924, "step": 10818 }, { "epoch": 1.1105295566502462, "grad_norm": 0.060582250356674194, "learning_rate": 0.01, "loss": 2.1039, "step": 10821 }, { "epoch": 1.1108374384236452, "grad_norm": 0.03663609176874161, "learning_rate": 0.01, "loss": 2.0731, "step": 10824 }, { "epoch": 1.1111453201970443, "grad_norm": 0.09468091279268265, "learning_rate": 0.01, "loss": 2.0961, "step": 10827 }, { "epoch": 1.1114532019704433, "grad_norm": 0.07199615240097046, "learning_rate": 0.01, "loss": 2.0834, "step": 10830 }, { "epoch": 1.1117610837438423, "grad_norm": 0.06624965369701385, "learning_rate": 0.01, "loss": 2.1286, "step": 10833 }, { "epoch": 1.1120689655172413, "grad_norm": 0.0414128340780735, "learning_rate": 0.01, "loss": 2.0922, "step": 10836 }, { "epoch": 1.1123768472906403, "grad_norm": 0.06416642665863037, "learning_rate": 0.01, "loss": 2.0908, "step": 10839 }, { "epoch": 1.1126847290640394, "grad_norm": 0.05309692397713661, "learning_rate": 0.01, "loss": 2.117, "step": 10842 }, { "epoch": 1.1129926108374384, "grad_norm": 0.04576392099261284, "learning_rate": 0.01, "loss": 2.0801, "step": 10845 }, { "epoch": 1.1133004926108374, "grad_norm": 0.0887250304222107, "learning_rate": 0.01, "loss": 2.0815, "step": 10848 }, { "epoch": 1.1136083743842364, "grad_norm": 0.061223480850458145, "learning_rate": 0.01, "loss": 2.0607, "step": 10851 }, { "epoch": 1.1139162561576355, "grad_norm": 0.12983545660972595, "learning_rate": 0.01, "loss": 2.0882, "step": 10854 }, { "epoch": 1.1142241379310345, "grad_norm": 0.09382637590169907, "learning_rate": 0.01, "loss": 2.0838, "step": 10857 }, { "epoch": 1.1145320197044335, "grad_norm": 0.04275491461157799, "learning_rate": 0.01, "loss": 2.0905, "step": 10860 }, { "epoch": 1.1148399014778325, "grad_norm": 0.044315680861473083, "learning_rate": 0.01, "loss": 2.0924, "step": 10863 }, { "epoch": 1.1151477832512315, "grad_norm": 0.05177663639187813, "learning_rate": 0.01, "loss": 2.0985, "step": 10866 }, { "epoch": 1.1154556650246306, "grad_norm": 0.08161107450723648, "learning_rate": 0.01, "loss": 2.105, "step": 10869 }, { "epoch": 1.1157635467980296, "grad_norm": 0.08273576200008392, "learning_rate": 0.01, "loss": 2.0991, "step": 10872 }, { "epoch": 1.1160714285714286, "grad_norm": 0.04973771795630455, "learning_rate": 0.01, "loss": 2.0849, "step": 10875 }, { "epoch": 1.1163793103448276, "grad_norm": 0.036696773022413254, "learning_rate": 0.01, "loss": 2.0651, "step": 10878 }, { "epoch": 1.1166871921182266, "grad_norm": 0.03647401183843613, "learning_rate": 0.01, "loss": 2.0772, "step": 10881 }, { "epoch": 1.1169950738916257, "grad_norm": 0.03360895812511444, "learning_rate": 0.01, "loss": 2.0952, "step": 10884 }, { "epoch": 1.1173029556650247, "grad_norm": 0.037918057292699814, "learning_rate": 0.01, "loss": 2.0776, "step": 10887 }, { "epoch": 1.1176108374384237, "grad_norm": 0.10544890910387039, "learning_rate": 0.01, "loss": 2.1079, "step": 10890 }, { "epoch": 1.1179187192118227, "grad_norm": 0.15091745555400848, "learning_rate": 0.01, "loss": 2.1231, "step": 10893 }, { "epoch": 1.1182266009852218, "grad_norm": 0.07386527210474014, "learning_rate": 0.01, "loss": 2.0922, "step": 10896 }, { "epoch": 1.1185344827586208, "grad_norm": 0.04889804869890213, "learning_rate": 0.01, "loss": 2.1016, "step": 10899 }, { "epoch": 1.1188423645320198, "grad_norm": 0.04805940017104149, "learning_rate": 0.01, "loss": 2.0833, "step": 10902 }, { "epoch": 1.1191502463054188, "grad_norm": 0.040073320269584656, "learning_rate": 0.01, "loss": 2.0943, "step": 10905 }, { "epoch": 1.1194581280788178, "grad_norm": 0.046124961227178574, "learning_rate": 0.01, "loss": 2.0891, "step": 10908 }, { "epoch": 1.1197660098522166, "grad_norm": 0.04982076957821846, "learning_rate": 0.01, "loss": 2.0595, "step": 10911 }, { "epoch": 1.1200738916256157, "grad_norm": 0.036569107323884964, "learning_rate": 0.01, "loss": 2.0602, "step": 10914 }, { "epoch": 1.1203817733990147, "grad_norm": 0.033519893884658813, "learning_rate": 0.01, "loss": 2.1026, "step": 10917 }, { "epoch": 1.1206896551724137, "grad_norm": 0.0513744130730629, "learning_rate": 0.01, "loss": 2.1119, "step": 10920 }, { "epoch": 1.1209975369458127, "grad_norm": 0.08677095174789429, "learning_rate": 0.01, "loss": 2.0791, "step": 10923 }, { "epoch": 1.1213054187192117, "grad_norm": 0.1263512223958969, "learning_rate": 0.01, "loss": 2.0912, "step": 10926 }, { "epoch": 1.1216133004926108, "grad_norm": 0.0737731009721756, "learning_rate": 0.01, "loss": 2.1193, "step": 10929 }, { "epoch": 1.1219211822660098, "grad_norm": 0.045122213661670685, "learning_rate": 0.01, "loss": 2.1029, "step": 10932 }, { "epoch": 1.1222290640394088, "grad_norm": 0.04616571217775345, "learning_rate": 0.01, "loss": 2.062, "step": 10935 }, { "epoch": 1.1225369458128078, "grad_norm": 0.03985420614480972, "learning_rate": 0.01, "loss": 2.0868, "step": 10938 }, { "epoch": 1.1228448275862069, "grad_norm": 0.11042526364326477, "learning_rate": 0.01, "loss": 2.1057, "step": 10941 }, { "epoch": 1.1231527093596059, "grad_norm": 0.08071359992027283, "learning_rate": 0.01, "loss": 2.0796, "step": 10944 }, { "epoch": 1.123460591133005, "grad_norm": 0.049534909427165985, "learning_rate": 0.01, "loss": 2.1055, "step": 10947 }, { "epoch": 1.123768472906404, "grad_norm": 0.08341135829687119, "learning_rate": 0.01, "loss": 2.0948, "step": 10950 }, { "epoch": 1.124076354679803, "grad_norm": 0.03842156007885933, "learning_rate": 0.01, "loss": 2.1051, "step": 10953 }, { "epoch": 1.124384236453202, "grad_norm": 0.04978267103433609, "learning_rate": 0.01, "loss": 2.0927, "step": 10956 }, { "epoch": 1.124692118226601, "grad_norm": 0.04545191302895546, "learning_rate": 0.01, "loss": 2.0847, "step": 10959 }, { "epoch": 1.125, "grad_norm": 0.10103368014097214, "learning_rate": 0.01, "loss": 2.105, "step": 10962 }, { "epoch": 1.125307881773399, "grad_norm": 0.05956938862800598, "learning_rate": 0.01, "loss": 2.1043, "step": 10965 }, { "epoch": 1.125615763546798, "grad_norm": 0.048797741532325745, "learning_rate": 0.01, "loss": 2.1044, "step": 10968 }, { "epoch": 1.125923645320197, "grad_norm": 0.041901495307683945, "learning_rate": 0.01, "loss": 2.0847, "step": 10971 }, { "epoch": 1.126231527093596, "grad_norm": 0.14950989186763763, "learning_rate": 0.01, "loss": 2.0919, "step": 10974 }, { "epoch": 1.126539408866995, "grad_norm": 0.049760669469833374, "learning_rate": 0.01, "loss": 2.0707, "step": 10977 }, { "epoch": 1.1268472906403941, "grad_norm": 0.07016187906265259, "learning_rate": 0.01, "loss": 2.0709, "step": 10980 }, { "epoch": 1.1271551724137931, "grad_norm": 0.057528458535671234, "learning_rate": 0.01, "loss": 2.0759, "step": 10983 }, { "epoch": 1.1274630541871922, "grad_norm": 0.06690733879804611, "learning_rate": 0.01, "loss": 2.102, "step": 10986 }, { "epoch": 1.1277709359605912, "grad_norm": 0.05225450173020363, "learning_rate": 0.01, "loss": 2.0675, "step": 10989 }, { "epoch": 1.1280788177339902, "grad_norm": 0.048363544046878815, "learning_rate": 0.01, "loss": 2.0634, "step": 10992 }, { "epoch": 1.1283866995073892, "grad_norm": 0.05356382206082344, "learning_rate": 0.01, "loss": 2.1003, "step": 10995 }, { "epoch": 1.1286945812807883, "grad_norm": 0.06921149045228958, "learning_rate": 0.01, "loss": 2.0934, "step": 10998 }, { "epoch": 1.1290024630541873, "grad_norm": 0.04210525006055832, "learning_rate": 0.01, "loss": 2.096, "step": 11001 }, { "epoch": 1.1293103448275863, "grad_norm": 0.11790584027767181, "learning_rate": 0.01, "loss": 2.0746, "step": 11004 }, { "epoch": 1.1296182266009853, "grad_norm": 0.08045307546854019, "learning_rate": 0.01, "loss": 2.0929, "step": 11007 }, { "epoch": 1.1299261083743843, "grad_norm": 0.10474243015050888, "learning_rate": 0.01, "loss": 2.1418, "step": 11010 }, { "epoch": 1.1302339901477834, "grad_norm": 0.06073759123682976, "learning_rate": 0.01, "loss": 2.1089, "step": 11013 }, { "epoch": 1.1305418719211824, "grad_norm": 0.057685475796461105, "learning_rate": 0.01, "loss": 2.0959, "step": 11016 }, { "epoch": 1.1308497536945814, "grad_norm": 0.04218476638197899, "learning_rate": 0.01, "loss": 2.0834, "step": 11019 }, { "epoch": 1.1311576354679802, "grad_norm": 0.04814853519201279, "learning_rate": 0.01, "loss": 2.1134, "step": 11022 }, { "epoch": 1.1314655172413792, "grad_norm": 0.1344536989927292, "learning_rate": 0.01, "loss": 2.1121, "step": 11025 }, { "epoch": 1.1317733990147782, "grad_norm": 0.057088855654001236, "learning_rate": 0.01, "loss": 2.0978, "step": 11028 }, { "epoch": 1.1320812807881773, "grad_norm": 0.04567364603281021, "learning_rate": 0.01, "loss": 2.0837, "step": 11031 }, { "epoch": 1.1323891625615763, "grad_norm": 0.07506916671991348, "learning_rate": 0.01, "loss": 2.0926, "step": 11034 }, { "epoch": 1.1326970443349753, "grad_norm": 0.05837171897292137, "learning_rate": 0.01, "loss": 2.0961, "step": 11037 }, { "epoch": 1.1330049261083743, "grad_norm": 0.0457015223801136, "learning_rate": 0.01, "loss": 2.101, "step": 11040 }, { "epoch": 1.1333128078817734, "grad_norm": 0.061310991644859314, "learning_rate": 0.01, "loss": 2.1128, "step": 11043 }, { "epoch": 1.1336206896551724, "grad_norm": 0.05517786741256714, "learning_rate": 0.01, "loss": 2.0844, "step": 11046 }, { "epoch": 1.1339285714285714, "grad_norm": 0.07835637778043747, "learning_rate": 0.01, "loss": 2.0996, "step": 11049 }, { "epoch": 1.1342364532019704, "grad_norm": 0.05821641907095909, "learning_rate": 0.01, "loss": 2.1074, "step": 11052 }, { "epoch": 1.1345443349753694, "grad_norm": 0.04394884407520294, "learning_rate": 0.01, "loss": 2.0799, "step": 11055 }, { "epoch": 1.1348522167487685, "grad_norm": 0.05148720741271973, "learning_rate": 0.01, "loss": 2.0856, "step": 11058 }, { "epoch": 1.1351600985221675, "grad_norm": 0.05766841769218445, "learning_rate": 0.01, "loss": 2.0973, "step": 11061 }, { "epoch": 1.1354679802955665, "grad_norm": 0.09894710779190063, "learning_rate": 0.01, "loss": 2.0831, "step": 11064 }, { "epoch": 1.1357758620689655, "grad_norm": 0.11916875094175339, "learning_rate": 0.01, "loss": 2.1044, "step": 11067 }, { "epoch": 1.1360837438423645, "grad_norm": 0.03926829248666763, "learning_rate": 0.01, "loss": 2.0866, "step": 11070 }, { "epoch": 1.1363916256157636, "grad_norm": 0.05105220153927803, "learning_rate": 0.01, "loss": 2.0911, "step": 11073 }, { "epoch": 1.1366995073891626, "grad_norm": 0.04516123607754707, "learning_rate": 0.01, "loss": 2.0693, "step": 11076 }, { "epoch": 1.1370073891625616, "grad_norm": 0.046173594892024994, "learning_rate": 0.01, "loss": 2.092, "step": 11079 }, { "epoch": 1.1373152709359606, "grad_norm": 0.05173357576131821, "learning_rate": 0.01, "loss": 2.1007, "step": 11082 }, { "epoch": 1.1376231527093597, "grad_norm": 0.06486919522285461, "learning_rate": 0.01, "loss": 2.0775, "step": 11085 }, { "epoch": 1.1379310344827587, "grad_norm": 0.09763675928115845, "learning_rate": 0.01, "loss": 2.0942, "step": 11088 }, { "epoch": 1.1382389162561577, "grad_norm": 0.1281820833683014, "learning_rate": 0.01, "loss": 2.0903, "step": 11091 }, { "epoch": 1.1385467980295567, "grad_norm": 0.05734977498650551, "learning_rate": 0.01, "loss": 2.0891, "step": 11094 }, { "epoch": 1.1388546798029557, "grad_norm": 0.06809762120246887, "learning_rate": 0.01, "loss": 2.0855, "step": 11097 }, { "epoch": 1.1391625615763548, "grad_norm": 0.05105281621217728, "learning_rate": 0.01, "loss": 2.0975, "step": 11100 }, { "epoch": 1.1394704433497538, "grad_norm": 0.07381090521812439, "learning_rate": 0.01, "loss": 2.0762, "step": 11103 }, { "epoch": 1.1397783251231528, "grad_norm": 0.050722070038318634, "learning_rate": 0.01, "loss": 2.0778, "step": 11106 }, { "epoch": 1.1400862068965516, "grad_norm": 0.03850618377327919, "learning_rate": 0.01, "loss": 2.0626, "step": 11109 }, { "epoch": 1.1403940886699506, "grad_norm": 0.08264841884374619, "learning_rate": 0.01, "loss": 2.1258, "step": 11112 }, { "epoch": 1.1407019704433496, "grad_norm": 0.06493505835533142, "learning_rate": 0.01, "loss": 2.0961, "step": 11115 }, { "epoch": 1.1410098522167487, "grad_norm": 0.06895186007022858, "learning_rate": 0.01, "loss": 2.1207, "step": 11118 }, { "epoch": 1.1413177339901477, "grad_norm": 0.042232003062963486, "learning_rate": 0.01, "loss": 2.0724, "step": 11121 }, { "epoch": 1.1416256157635467, "grad_norm": 0.10296539217233658, "learning_rate": 0.01, "loss": 2.1126, "step": 11124 }, { "epoch": 1.1419334975369457, "grad_norm": 0.043095991015434265, "learning_rate": 0.01, "loss": 2.0928, "step": 11127 }, { "epoch": 1.1422413793103448, "grad_norm": 0.046020470559597015, "learning_rate": 0.01, "loss": 2.092, "step": 11130 }, { "epoch": 1.1425492610837438, "grad_norm": 0.04754204675555229, "learning_rate": 0.01, "loss": 2.126, "step": 11133 }, { "epoch": 1.1428571428571428, "grad_norm": 0.03732014447450638, "learning_rate": 0.01, "loss": 2.0637, "step": 11136 }, { "epoch": 1.1431650246305418, "grad_norm": 0.039080169051885605, "learning_rate": 0.01, "loss": 2.0877, "step": 11139 }, { "epoch": 1.1434729064039408, "grad_norm": 0.04575611278414726, "learning_rate": 0.01, "loss": 2.0646, "step": 11142 }, { "epoch": 1.1437807881773399, "grad_norm": 0.11764515191316605, "learning_rate": 0.01, "loss": 2.0964, "step": 11145 }, { "epoch": 1.1440886699507389, "grad_norm": 0.10087098181247711, "learning_rate": 0.01, "loss": 2.069, "step": 11148 }, { "epoch": 1.144396551724138, "grad_norm": 0.05462269112467766, "learning_rate": 0.01, "loss": 2.0902, "step": 11151 }, { "epoch": 1.144704433497537, "grad_norm": 0.06296168267726898, "learning_rate": 0.01, "loss": 2.1257, "step": 11154 }, { "epoch": 1.145012315270936, "grad_norm": 0.041026949882507324, "learning_rate": 0.01, "loss": 2.0672, "step": 11157 }, { "epoch": 1.145320197044335, "grad_norm": 0.05761269852519035, "learning_rate": 0.01, "loss": 2.0833, "step": 11160 }, { "epoch": 1.145628078817734, "grad_norm": 0.12491103261709213, "learning_rate": 0.01, "loss": 2.0681, "step": 11163 }, { "epoch": 1.145935960591133, "grad_norm": 0.09269531071186066, "learning_rate": 0.01, "loss": 2.0618, "step": 11166 }, { "epoch": 1.146243842364532, "grad_norm": 0.05623659864068031, "learning_rate": 0.01, "loss": 2.0861, "step": 11169 }, { "epoch": 1.146551724137931, "grad_norm": 0.04075248911976814, "learning_rate": 0.01, "loss": 2.0513, "step": 11172 }, { "epoch": 1.14685960591133, "grad_norm": 0.04557061940431595, "learning_rate": 0.01, "loss": 2.0876, "step": 11175 }, { "epoch": 1.147167487684729, "grad_norm": 0.05686535686254501, "learning_rate": 0.01, "loss": 2.0746, "step": 11178 }, { "epoch": 1.1474753694581281, "grad_norm": 0.04164785146713257, "learning_rate": 0.01, "loss": 2.0684, "step": 11181 }, { "epoch": 1.1477832512315271, "grad_norm": 0.05453825742006302, "learning_rate": 0.01, "loss": 2.0809, "step": 11184 }, { "epoch": 1.1480911330049262, "grad_norm": 0.15215769410133362, "learning_rate": 0.01, "loss": 2.0806, "step": 11187 }, { "epoch": 1.1483990147783252, "grad_norm": 0.14634385704994202, "learning_rate": 0.01, "loss": 2.11, "step": 11190 }, { "epoch": 1.1487068965517242, "grad_norm": 0.06655893474817276, "learning_rate": 0.01, "loss": 2.0953, "step": 11193 }, { "epoch": 1.1490147783251232, "grad_norm": 0.07074826955795288, "learning_rate": 0.01, "loss": 2.0984, "step": 11196 }, { "epoch": 1.1493226600985222, "grad_norm": 0.044581200927495956, "learning_rate": 0.01, "loss": 2.0816, "step": 11199 }, { "epoch": 1.1496305418719213, "grad_norm": 0.06769565492868423, "learning_rate": 0.01, "loss": 2.0809, "step": 11202 }, { "epoch": 1.1499384236453203, "grad_norm": 0.11437363177537918, "learning_rate": 0.01, "loss": 2.0761, "step": 11205 }, { "epoch": 1.1502463054187193, "grad_norm": 0.040699586272239685, "learning_rate": 0.01, "loss": 2.1028, "step": 11208 }, { "epoch": 1.1505541871921183, "grad_norm": 0.06996385753154755, "learning_rate": 0.01, "loss": 2.0755, "step": 11211 }, { "epoch": 1.1508620689655173, "grad_norm": 0.04621044918894768, "learning_rate": 0.01, "loss": 2.11, "step": 11214 }, { "epoch": 1.1511699507389164, "grad_norm": 0.07714984565973282, "learning_rate": 0.01, "loss": 2.1008, "step": 11217 }, { "epoch": 1.1514778325123154, "grad_norm": 0.11194620281457901, "learning_rate": 0.01, "loss": 2.093, "step": 11220 }, { "epoch": 1.1517857142857142, "grad_norm": 0.08710391074419022, "learning_rate": 0.01, "loss": 2.1017, "step": 11223 }, { "epoch": 1.1520935960591132, "grad_norm": 0.06653989106416702, "learning_rate": 0.01, "loss": 2.1084, "step": 11226 }, { "epoch": 1.1524014778325122, "grad_norm": 0.056591540575027466, "learning_rate": 0.01, "loss": 2.107, "step": 11229 }, { "epoch": 1.1527093596059113, "grad_norm": 0.056475669145584106, "learning_rate": 0.01, "loss": 2.0774, "step": 11232 }, { "epoch": 1.1530172413793103, "grad_norm": 0.08408259600400925, "learning_rate": 0.01, "loss": 2.1193, "step": 11235 }, { "epoch": 1.1533251231527093, "grad_norm": 0.06853178143501282, "learning_rate": 0.01, "loss": 2.0796, "step": 11238 }, { "epoch": 1.1536330049261083, "grad_norm": 0.11699818074703217, "learning_rate": 0.01, "loss": 2.0932, "step": 11241 }, { "epoch": 1.1539408866995073, "grad_norm": 0.06365542113780975, "learning_rate": 0.01, "loss": 2.079, "step": 11244 }, { "epoch": 1.1542487684729064, "grad_norm": 0.040505316108465195, "learning_rate": 0.01, "loss": 2.0688, "step": 11247 }, { "epoch": 1.1545566502463054, "grad_norm": 0.06958888471126556, "learning_rate": 0.01, "loss": 2.0882, "step": 11250 }, { "epoch": 1.1548645320197044, "grad_norm": 0.08968232572078705, "learning_rate": 0.01, "loss": 2.064, "step": 11253 }, { "epoch": 1.1551724137931034, "grad_norm": 0.05143412947654724, "learning_rate": 0.01, "loss": 2.0682, "step": 11256 }, { "epoch": 1.1554802955665024, "grad_norm": 0.1219927966594696, "learning_rate": 0.01, "loss": 2.1038, "step": 11259 }, { "epoch": 1.1557881773399015, "grad_norm": 0.11974184960126877, "learning_rate": 0.01, "loss": 2.1099, "step": 11262 }, { "epoch": 1.1560960591133005, "grad_norm": 0.09557066112756729, "learning_rate": 0.01, "loss": 2.0976, "step": 11265 }, { "epoch": 1.1564039408866995, "grad_norm": 0.10169565677642822, "learning_rate": 0.01, "loss": 2.0904, "step": 11268 }, { "epoch": 1.1567118226600985, "grad_norm": 0.0586596317589283, "learning_rate": 0.01, "loss": 2.0787, "step": 11271 }, { "epoch": 1.1570197044334976, "grad_norm": 0.0423048697412014, "learning_rate": 0.01, "loss": 2.1084, "step": 11274 }, { "epoch": 1.1573275862068966, "grad_norm": 0.07394707947969437, "learning_rate": 0.01, "loss": 2.0802, "step": 11277 }, { "epoch": 1.1576354679802956, "grad_norm": 0.05507487431168556, "learning_rate": 0.01, "loss": 2.0777, "step": 11280 }, { "epoch": 1.1579433497536946, "grad_norm": 0.03251083940267563, "learning_rate": 0.01, "loss": 2.1128, "step": 11283 }, { "epoch": 1.1582512315270936, "grad_norm": 0.04480468109250069, "learning_rate": 0.01, "loss": 2.1162, "step": 11286 }, { "epoch": 1.1585591133004927, "grad_norm": 0.08275749534368515, "learning_rate": 0.01, "loss": 2.0808, "step": 11289 }, { "epoch": 1.1588669950738917, "grad_norm": 0.08199379593133926, "learning_rate": 0.01, "loss": 2.1006, "step": 11292 }, { "epoch": 1.1591748768472907, "grad_norm": 0.1015235111117363, "learning_rate": 0.01, "loss": 2.0926, "step": 11295 }, { "epoch": 1.1594827586206897, "grad_norm": 0.08872919529676437, "learning_rate": 0.01, "loss": 2.0739, "step": 11298 }, { "epoch": 1.1597906403940887, "grad_norm": 0.04640579596161842, "learning_rate": 0.01, "loss": 2.09, "step": 11301 }, { "epoch": 1.1600985221674878, "grad_norm": 0.044142261147499084, "learning_rate": 0.01, "loss": 2.115, "step": 11304 }, { "epoch": 1.1604064039408868, "grad_norm": 0.1030762568116188, "learning_rate": 0.01, "loss": 2.1358, "step": 11307 }, { "epoch": 1.1607142857142858, "grad_norm": 0.06712359189987183, "learning_rate": 0.01, "loss": 2.0504, "step": 11310 }, { "epoch": 1.1610221674876846, "grad_norm": 0.05579240992665291, "learning_rate": 0.01, "loss": 2.0915, "step": 11313 }, { "epoch": 1.1613300492610836, "grad_norm": 0.04237228259444237, "learning_rate": 0.01, "loss": 2.0514, "step": 11316 }, { "epoch": 1.1616379310344827, "grad_norm": 0.08990642428398132, "learning_rate": 0.01, "loss": 2.0753, "step": 11319 }, { "epoch": 1.1619458128078817, "grad_norm": 0.09788185358047485, "learning_rate": 0.01, "loss": 2.0907, "step": 11322 }, { "epoch": 1.1622536945812807, "grad_norm": 0.07207074761390686, "learning_rate": 0.01, "loss": 2.0858, "step": 11325 }, { "epoch": 1.1625615763546797, "grad_norm": 0.07704904675483704, "learning_rate": 0.01, "loss": 2.0938, "step": 11328 }, { "epoch": 1.1628694581280787, "grad_norm": 0.07003269344568253, "learning_rate": 0.01, "loss": 2.1031, "step": 11331 }, { "epoch": 1.1631773399014778, "grad_norm": 0.05584646388888359, "learning_rate": 0.01, "loss": 2.0852, "step": 11334 }, { "epoch": 1.1634852216748768, "grad_norm": 0.10223853588104248, "learning_rate": 0.01, "loss": 2.072, "step": 11337 }, { "epoch": 1.1637931034482758, "grad_norm": 0.13336369395256042, "learning_rate": 0.01, "loss": 2.0902, "step": 11340 }, { "epoch": 1.1641009852216748, "grad_norm": 0.04471458122134209, "learning_rate": 0.01, "loss": 2.0884, "step": 11343 }, { "epoch": 1.1644088669950738, "grad_norm": 0.04342315346002579, "learning_rate": 0.01, "loss": 2.0885, "step": 11346 }, { "epoch": 1.1647167487684729, "grad_norm": 0.05172869563102722, "learning_rate": 0.01, "loss": 2.069, "step": 11349 }, { "epoch": 1.1650246305418719, "grad_norm": 0.04304314777255058, "learning_rate": 0.01, "loss": 2.0915, "step": 11352 }, { "epoch": 1.165332512315271, "grad_norm": 0.14592792093753815, "learning_rate": 0.01, "loss": 2.0841, "step": 11355 }, { "epoch": 1.16564039408867, "grad_norm": 0.05238402634859085, "learning_rate": 0.01, "loss": 2.0772, "step": 11358 }, { "epoch": 1.165948275862069, "grad_norm": 0.04800669103860855, "learning_rate": 0.01, "loss": 2.0739, "step": 11361 }, { "epoch": 1.166256157635468, "grad_norm": 0.06648313999176025, "learning_rate": 0.01, "loss": 2.0956, "step": 11364 }, { "epoch": 1.166564039408867, "grad_norm": 0.03402326628565788, "learning_rate": 0.01, "loss": 2.0847, "step": 11367 }, { "epoch": 1.166871921182266, "grad_norm": 0.05076766759157181, "learning_rate": 0.01, "loss": 2.0833, "step": 11370 }, { "epoch": 1.167179802955665, "grad_norm": 0.07221470773220062, "learning_rate": 0.01, "loss": 2.0778, "step": 11373 }, { "epoch": 1.167487684729064, "grad_norm": 0.04556736722588539, "learning_rate": 0.01, "loss": 2.0957, "step": 11376 }, { "epoch": 1.167795566502463, "grad_norm": 0.03702834993600845, "learning_rate": 0.01, "loss": 2.0843, "step": 11379 }, { "epoch": 1.168103448275862, "grad_norm": 0.046527571976184845, "learning_rate": 0.01, "loss": 2.084, "step": 11382 }, { "epoch": 1.1684113300492611, "grad_norm": 0.09520363062620163, "learning_rate": 0.01, "loss": 2.0924, "step": 11385 }, { "epoch": 1.1687192118226601, "grad_norm": 0.12759263813495636, "learning_rate": 0.01, "loss": 2.1265, "step": 11388 }, { "epoch": 1.1690270935960592, "grad_norm": 0.03981192037463188, "learning_rate": 0.01, "loss": 2.076, "step": 11391 }, { "epoch": 1.1693349753694582, "grad_norm": 0.04739897698163986, "learning_rate": 0.01, "loss": 2.1013, "step": 11394 }, { "epoch": 1.1696428571428572, "grad_norm": 0.04937390610575676, "learning_rate": 0.01, "loss": 2.0832, "step": 11397 }, { "epoch": 1.1699507389162562, "grad_norm": 0.07097122073173523, "learning_rate": 0.01, "loss": 2.0699, "step": 11400 }, { "epoch": 1.1702586206896552, "grad_norm": 0.07773783802986145, "learning_rate": 0.01, "loss": 2.1215, "step": 11403 }, { "epoch": 1.1705665024630543, "grad_norm": 0.04591994732618332, "learning_rate": 0.01, "loss": 2.0657, "step": 11406 }, { "epoch": 1.1708743842364533, "grad_norm": 0.08724237233400345, "learning_rate": 0.01, "loss": 2.0817, "step": 11409 }, { "epoch": 1.1711822660098523, "grad_norm": 0.06528041511774063, "learning_rate": 0.01, "loss": 2.0962, "step": 11412 }, { "epoch": 1.1714901477832513, "grad_norm": 0.0660424679517746, "learning_rate": 0.01, "loss": 2.0904, "step": 11415 }, { "epoch": 1.1717980295566504, "grad_norm": 0.08304266631603241, "learning_rate": 0.01, "loss": 2.0912, "step": 11418 }, { "epoch": 1.1721059113300494, "grad_norm": 0.05073266103863716, "learning_rate": 0.01, "loss": 2.0747, "step": 11421 }, { "epoch": 1.1724137931034484, "grad_norm": 0.07305736094713211, "learning_rate": 0.01, "loss": 2.0852, "step": 11424 }, { "epoch": 1.1727216748768472, "grad_norm": 0.09365279227495193, "learning_rate": 0.01, "loss": 2.0883, "step": 11427 }, { "epoch": 1.1730295566502462, "grad_norm": 0.12279067188501358, "learning_rate": 0.01, "loss": 2.0999, "step": 11430 }, { "epoch": 1.1733374384236452, "grad_norm": 0.0589769072830677, "learning_rate": 0.01, "loss": 2.0948, "step": 11433 }, { "epoch": 1.1736453201970443, "grad_norm": 0.06621567159891129, "learning_rate": 0.01, "loss": 2.0823, "step": 11436 }, { "epoch": 1.1739532019704433, "grad_norm": 0.051341816782951355, "learning_rate": 0.01, "loss": 2.0622, "step": 11439 }, { "epoch": 1.1742610837438423, "grad_norm": 0.06027314066886902, "learning_rate": 0.01, "loss": 2.0798, "step": 11442 }, { "epoch": 1.1745689655172413, "grad_norm": 0.10131573677062988, "learning_rate": 0.01, "loss": 2.0882, "step": 11445 }, { "epoch": 1.1748768472906403, "grad_norm": 0.08082377910614014, "learning_rate": 0.01, "loss": 2.0949, "step": 11448 }, { "epoch": 1.1751847290640394, "grad_norm": 0.07095243781805038, "learning_rate": 0.01, "loss": 2.0655, "step": 11451 }, { "epoch": 1.1754926108374384, "grad_norm": 0.07132910192012787, "learning_rate": 0.01, "loss": 2.0903, "step": 11454 }, { "epoch": 1.1758004926108374, "grad_norm": 0.10488838702440262, "learning_rate": 0.01, "loss": 2.0639, "step": 11457 }, { "epoch": 1.1761083743842364, "grad_norm": 0.12755680084228516, "learning_rate": 0.01, "loss": 2.0989, "step": 11460 }, { "epoch": 1.1764162561576355, "grad_norm": 0.12174911797046661, "learning_rate": 0.01, "loss": 2.1026, "step": 11463 }, { "epoch": 1.1767241379310345, "grad_norm": 0.07873964309692383, "learning_rate": 0.01, "loss": 2.0908, "step": 11466 }, { "epoch": 1.1770320197044335, "grad_norm": 0.04275409132242203, "learning_rate": 0.01, "loss": 2.0899, "step": 11469 }, { "epoch": 1.1773399014778325, "grad_norm": 0.046134103089571, "learning_rate": 0.01, "loss": 2.1064, "step": 11472 }, { "epoch": 1.1776477832512315, "grad_norm": 0.07631804049015045, "learning_rate": 0.01, "loss": 2.0811, "step": 11475 }, { "epoch": 1.1779556650246306, "grad_norm": 0.04843062162399292, "learning_rate": 0.01, "loss": 2.1078, "step": 11478 }, { "epoch": 1.1782635467980296, "grad_norm": 0.04664747416973114, "learning_rate": 0.01, "loss": 2.0807, "step": 11481 }, { "epoch": 1.1785714285714286, "grad_norm": 0.042328983545303345, "learning_rate": 0.01, "loss": 2.0898, "step": 11484 }, { "epoch": 1.1788793103448276, "grad_norm": 0.04443054646253586, "learning_rate": 0.01, "loss": 2.092, "step": 11487 }, { "epoch": 1.1791871921182266, "grad_norm": 0.03439139202237129, "learning_rate": 0.01, "loss": 2.0438, "step": 11490 }, { "epoch": 1.1794950738916257, "grad_norm": 0.1651001274585724, "learning_rate": 0.01, "loss": 2.1068, "step": 11493 }, { "epoch": 1.1798029556650247, "grad_norm": 0.04535198211669922, "learning_rate": 0.01, "loss": 2.0951, "step": 11496 }, { "epoch": 1.1801108374384237, "grad_norm": 0.04346736520528793, "learning_rate": 0.01, "loss": 2.1032, "step": 11499 }, { "epoch": 1.1804187192118227, "grad_norm": 0.08131600171327591, "learning_rate": 0.01, "loss": 2.0709, "step": 11502 }, { "epoch": 1.1807266009852218, "grad_norm": 0.0638989582657814, "learning_rate": 0.01, "loss": 2.0676, "step": 11505 }, { "epoch": 1.1810344827586208, "grad_norm": 0.06305757910013199, "learning_rate": 0.01, "loss": 2.0636, "step": 11508 }, { "epoch": 1.1813423645320198, "grad_norm": 0.048068735748529434, "learning_rate": 0.01, "loss": 2.0803, "step": 11511 }, { "epoch": 1.1816502463054186, "grad_norm": 0.1859566867351532, "learning_rate": 0.01, "loss": 2.1206, "step": 11514 }, { "epoch": 1.1819581280788176, "grad_norm": 0.13449640572071075, "learning_rate": 0.01, "loss": 2.1047, "step": 11517 }, { "epoch": 1.1822660098522166, "grad_norm": 0.09624927490949631, "learning_rate": 0.01, "loss": 2.1338, "step": 11520 }, { "epoch": 1.1825738916256157, "grad_norm": 0.04823729023337364, "learning_rate": 0.01, "loss": 2.1485, "step": 11523 }, { "epoch": 1.1828817733990147, "grad_norm": 0.04005538672208786, "learning_rate": 0.01, "loss": 2.1062, "step": 11526 }, { "epoch": 1.1831896551724137, "grad_norm": 0.035647980868816376, "learning_rate": 0.01, "loss": 2.1002, "step": 11529 }, { "epoch": 1.1834975369458127, "grad_norm": 0.03485687077045441, "learning_rate": 0.01, "loss": 2.085, "step": 11532 }, { "epoch": 1.1838054187192117, "grad_norm": 0.03271855041384697, "learning_rate": 0.01, "loss": 2.1083, "step": 11535 }, { "epoch": 1.1841133004926108, "grad_norm": 0.14434824883937836, "learning_rate": 0.01, "loss": 2.0917, "step": 11538 }, { "epoch": 1.1844211822660098, "grad_norm": 0.16373054683208466, "learning_rate": 0.01, "loss": 2.0889, "step": 11541 }, { "epoch": 1.1847290640394088, "grad_norm": 0.1128426045179367, "learning_rate": 0.01, "loss": 2.0605, "step": 11544 }, { "epoch": 1.1850369458128078, "grad_norm": 0.03807492554187775, "learning_rate": 0.01, "loss": 2.0792, "step": 11547 }, { "epoch": 1.1853448275862069, "grad_norm": 0.0678490698337555, "learning_rate": 0.01, "loss": 2.072, "step": 11550 }, { "epoch": 1.1856527093596059, "grad_norm": 0.04688851907849312, "learning_rate": 0.01, "loss": 2.0648, "step": 11553 }, { "epoch": 1.185960591133005, "grad_norm": 0.03682132810354233, "learning_rate": 0.01, "loss": 2.096, "step": 11556 }, { "epoch": 1.186268472906404, "grad_norm": 0.0797944962978363, "learning_rate": 0.01, "loss": 2.0632, "step": 11559 }, { "epoch": 1.186576354679803, "grad_norm": 0.09593506157398224, "learning_rate": 0.01, "loss": 2.0923, "step": 11562 }, { "epoch": 1.186884236453202, "grad_norm": 0.10455387830734253, "learning_rate": 0.01, "loss": 2.0927, "step": 11565 }, { "epoch": 1.187192118226601, "grad_norm": 0.05642642080783844, "learning_rate": 0.01, "loss": 2.0688, "step": 11568 }, { "epoch": 1.1875, "grad_norm": 0.09467128664255142, "learning_rate": 0.01, "loss": 2.0561, "step": 11571 }, { "epoch": 1.187807881773399, "grad_norm": 0.061598166823387146, "learning_rate": 0.01, "loss": 2.0666, "step": 11574 }, { "epoch": 1.188115763546798, "grad_norm": 0.0875246673822403, "learning_rate": 0.01, "loss": 2.0753, "step": 11577 }, { "epoch": 1.188423645320197, "grad_norm": 0.05889583006501198, "learning_rate": 0.01, "loss": 2.0705, "step": 11580 }, { "epoch": 1.188731527093596, "grad_norm": 0.0796559602022171, "learning_rate": 0.01, "loss": 2.0939, "step": 11583 }, { "epoch": 1.189039408866995, "grad_norm": 0.04127117991447449, "learning_rate": 0.01, "loss": 2.0905, "step": 11586 }, { "epoch": 1.1893472906403941, "grad_norm": 0.06161842495203018, "learning_rate": 0.01, "loss": 2.0954, "step": 11589 }, { "epoch": 1.1896551724137931, "grad_norm": 0.05344879627227783, "learning_rate": 0.01, "loss": 2.0808, "step": 11592 }, { "epoch": 1.1899630541871922, "grad_norm": 0.03660701587796211, "learning_rate": 0.01, "loss": 2.0748, "step": 11595 }, { "epoch": 1.1902709359605912, "grad_norm": 0.04792351648211479, "learning_rate": 0.01, "loss": 2.0833, "step": 11598 }, { "epoch": 1.1905788177339902, "grad_norm": 0.04336618259549141, "learning_rate": 0.01, "loss": 2.1017, "step": 11601 }, { "epoch": 1.1908866995073892, "grad_norm": 0.0654226765036583, "learning_rate": 0.01, "loss": 2.0922, "step": 11604 }, { "epoch": 1.1911945812807883, "grad_norm": 0.08879897743463516, "learning_rate": 0.01, "loss": 2.0842, "step": 11607 }, { "epoch": 1.1915024630541873, "grad_norm": 0.15568268299102783, "learning_rate": 0.01, "loss": 2.0947, "step": 11610 }, { "epoch": 1.1918103448275863, "grad_norm": 0.11712448298931122, "learning_rate": 0.01, "loss": 2.0955, "step": 11613 }, { "epoch": 1.1921182266009853, "grad_norm": 0.04966702312231064, "learning_rate": 0.01, "loss": 2.1093, "step": 11616 }, { "epoch": 1.1924261083743843, "grad_norm": 0.04304838925600052, "learning_rate": 0.01, "loss": 2.0828, "step": 11619 }, { "epoch": 1.1927339901477834, "grad_norm": 0.04981999844312668, "learning_rate": 0.01, "loss": 2.0964, "step": 11622 }, { "epoch": 1.1930418719211824, "grad_norm": 0.045383159071207047, "learning_rate": 0.01, "loss": 2.0967, "step": 11625 }, { "epoch": 1.1933497536945814, "grad_norm": 0.0348484069108963, "learning_rate": 0.01, "loss": 2.0685, "step": 11628 }, { "epoch": 1.1936576354679802, "grad_norm": 0.04802081733942032, "learning_rate": 0.01, "loss": 2.0904, "step": 11631 }, { "epoch": 1.1939655172413792, "grad_norm": 0.0615711510181427, "learning_rate": 0.01, "loss": 2.1153, "step": 11634 }, { "epoch": 1.1942733990147782, "grad_norm": 0.15608100593090057, "learning_rate": 0.01, "loss": 2.0753, "step": 11637 }, { "epoch": 1.1945812807881773, "grad_norm": 0.10449741780757904, "learning_rate": 0.01, "loss": 2.1083, "step": 11640 }, { "epoch": 1.1948891625615763, "grad_norm": 0.062145963311195374, "learning_rate": 0.01, "loss": 2.1104, "step": 11643 }, { "epoch": 1.1951970443349753, "grad_norm": 0.04744469001889229, "learning_rate": 0.01, "loss": 2.0742, "step": 11646 }, { "epoch": 1.1955049261083743, "grad_norm": 0.036814477294683456, "learning_rate": 0.01, "loss": 2.0641, "step": 11649 }, { "epoch": 1.1958128078817734, "grad_norm": 0.037870246917009354, "learning_rate": 0.01, "loss": 2.0906, "step": 11652 }, { "epoch": 1.1961206896551724, "grad_norm": 0.17372412979602814, "learning_rate": 0.01, "loss": 2.1027, "step": 11655 }, { "epoch": 1.1964285714285714, "grad_norm": 0.04681265726685524, "learning_rate": 0.01, "loss": 2.1029, "step": 11658 }, { "epoch": 1.1967364532019704, "grad_norm": 0.058284103870391846, "learning_rate": 0.01, "loss": 2.0829, "step": 11661 }, { "epoch": 1.1970443349753694, "grad_norm": 0.07531574368476868, "learning_rate": 0.01, "loss": 2.0602, "step": 11664 }, { "epoch": 1.1973522167487685, "grad_norm": 0.053437430411577225, "learning_rate": 0.01, "loss": 2.0726, "step": 11667 }, { "epoch": 1.1976600985221675, "grad_norm": 0.047438427805900574, "learning_rate": 0.01, "loss": 2.107, "step": 11670 }, { "epoch": 1.1979679802955665, "grad_norm": 0.04474404826760292, "learning_rate": 0.01, "loss": 2.08, "step": 11673 }, { "epoch": 1.1982758620689655, "grad_norm": 0.15452256798744202, "learning_rate": 0.01, "loss": 2.0788, "step": 11676 }, { "epoch": 1.1985837438423645, "grad_norm": 0.05446213111281395, "learning_rate": 0.01, "loss": 2.0696, "step": 11679 }, { "epoch": 1.1988916256157636, "grad_norm": 0.059178926050662994, "learning_rate": 0.01, "loss": 2.0867, "step": 11682 }, { "epoch": 1.1991995073891626, "grad_norm": 0.05807918682694435, "learning_rate": 0.01, "loss": 2.0675, "step": 11685 }, { "epoch": 1.1995073891625616, "grad_norm": 0.046843890100717545, "learning_rate": 0.01, "loss": 2.0729, "step": 11688 }, { "epoch": 1.1998152709359606, "grad_norm": 0.042494870722293854, "learning_rate": 0.01, "loss": 2.079, "step": 11691 }, { "epoch": 1.2001231527093597, "grad_norm": 0.04506772756576538, "learning_rate": 0.01, "loss": 2.0862, "step": 11694 }, { "epoch": 1.2004310344827587, "grad_norm": 0.04237942770123482, "learning_rate": 0.01, "loss": 2.0712, "step": 11697 }, { "epoch": 1.2007389162561577, "grad_norm": 0.03488307446241379, "learning_rate": 0.01, "loss": 2.1051, "step": 11700 }, { "epoch": 1.2010467980295567, "grad_norm": 0.03693482652306557, "learning_rate": 0.01, "loss": 2.045, "step": 11703 }, { "epoch": 1.2013546798029557, "grad_norm": 0.10410935431718826, "learning_rate": 0.01, "loss": 2.0884, "step": 11706 }, { "epoch": 1.2016625615763548, "grad_norm": 0.11816459894180298, "learning_rate": 0.01, "loss": 2.0687, "step": 11709 }, { "epoch": 1.2019704433497538, "grad_norm": 0.06567800790071487, "learning_rate": 0.01, "loss": 2.063, "step": 11712 }, { "epoch": 1.2022783251231528, "grad_norm": 0.06639432907104492, "learning_rate": 0.01, "loss": 2.078, "step": 11715 }, { "epoch": 1.2025862068965516, "grad_norm": 0.05059380456805229, "learning_rate": 0.01, "loss": 2.1163, "step": 11718 }, { "epoch": 1.2028940886699506, "grad_norm": 0.04076917842030525, "learning_rate": 0.01, "loss": 2.0784, "step": 11721 }, { "epoch": 1.2032019704433496, "grad_norm": 0.05994633212685585, "learning_rate": 0.01, "loss": 2.0819, "step": 11724 }, { "epoch": 1.2035098522167487, "grad_norm": 0.05682201310992241, "learning_rate": 0.01, "loss": 2.0625, "step": 11727 }, { "epoch": 1.2038177339901477, "grad_norm": 0.05393010750412941, "learning_rate": 0.01, "loss": 2.072, "step": 11730 }, { "epoch": 1.2041256157635467, "grad_norm": 0.04697128012776375, "learning_rate": 0.01, "loss": 2.0796, "step": 11733 }, { "epoch": 1.2044334975369457, "grad_norm": 0.04945002868771553, "learning_rate": 0.01, "loss": 2.0666, "step": 11736 }, { "epoch": 1.2047413793103448, "grad_norm": 0.06519649177789688, "learning_rate": 0.01, "loss": 2.0873, "step": 11739 }, { "epoch": 1.2050492610837438, "grad_norm": 0.1188720241189003, "learning_rate": 0.01, "loss": 2.0967, "step": 11742 }, { "epoch": 1.2053571428571428, "grad_norm": 0.1045864149928093, "learning_rate": 0.01, "loss": 2.0834, "step": 11745 }, { "epoch": 1.2056650246305418, "grad_norm": 0.04561993479728699, "learning_rate": 0.01, "loss": 2.0872, "step": 11748 }, { "epoch": 1.2059729064039408, "grad_norm": 0.04972228407859802, "learning_rate": 0.01, "loss": 2.0599, "step": 11751 }, { "epoch": 1.2062807881773399, "grad_norm": 0.05342618376016617, "learning_rate": 0.01, "loss": 2.0606, "step": 11754 }, { "epoch": 1.2065886699507389, "grad_norm": 0.05637587606906891, "learning_rate": 0.01, "loss": 2.0836, "step": 11757 }, { "epoch": 1.206896551724138, "grad_norm": 0.11595457047224045, "learning_rate": 0.01, "loss": 2.0904, "step": 11760 }, { "epoch": 1.207204433497537, "grad_norm": 0.11803465336561203, "learning_rate": 0.01, "loss": 2.0741, "step": 11763 }, { "epoch": 1.207512315270936, "grad_norm": 0.045427508652210236, "learning_rate": 0.01, "loss": 2.0721, "step": 11766 }, { "epoch": 1.207820197044335, "grad_norm": 0.0365883894264698, "learning_rate": 0.01, "loss": 2.0599, "step": 11769 }, { "epoch": 1.208128078817734, "grad_norm": 0.03729262575507164, "learning_rate": 0.01, "loss": 2.087, "step": 11772 }, { "epoch": 1.208435960591133, "grad_norm": 0.05842882767319679, "learning_rate": 0.01, "loss": 2.0762, "step": 11775 }, { "epoch": 1.208743842364532, "grad_norm": 0.07687997072935104, "learning_rate": 0.01, "loss": 2.0691, "step": 11778 }, { "epoch": 1.209051724137931, "grad_norm": 0.06832735985517502, "learning_rate": 0.01, "loss": 2.0974, "step": 11781 }, { "epoch": 1.20935960591133, "grad_norm": 0.10200455039739609, "learning_rate": 0.01, "loss": 2.0866, "step": 11784 }, { "epoch": 1.209667487684729, "grad_norm": 0.10769661515951157, "learning_rate": 0.01, "loss": 2.0761, "step": 11787 }, { "epoch": 1.2099753694581281, "grad_norm": 0.12233126908540726, "learning_rate": 0.01, "loss": 2.0911, "step": 11790 }, { "epoch": 1.2102832512315271, "grad_norm": 0.046646762639284134, "learning_rate": 0.01, "loss": 2.0546, "step": 11793 }, { "epoch": 1.2105911330049262, "grad_norm": 0.030627859756350517, "learning_rate": 0.01, "loss": 2.0805, "step": 11796 }, { "epoch": 1.2108990147783252, "grad_norm": 0.03977693244814873, "learning_rate": 0.01, "loss": 2.0465, "step": 11799 }, { "epoch": 1.2112068965517242, "grad_norm": 0.06213162839412689, "learning_rate": 0.01, "loss": 2.1123, "step": 11802 }, { "epoch": 1.2115147783251232, "grad_norm": 0.04708317294716835, "learning_rate": 0.01, "loss": 2.1315, "step": 11805 }, { "epoch": 1.2118226600985222, "grad_norm": 0.10807790607213974, "learning_rate": 0.01, "loss": 2.0825, "step": 11808 }, { "epoch": 1.2121305418719213, "grad_norm": 0.08579280972480774, "learning_rate": 0.01, "loss": 2.1366, "step": 11811 }, { "epoch": 1.2124384236453203, "grad_norm": 0.05783751606941223, "learning_rate": 0.01, "loss": 2.0682, "step": 11814 }, { "epoch": 1.2127463054187193, "grad_norm": 0.04836808145046234, "learning_rate": 0.01, "loss": 2.0606, "step": 11817 }, { "epoch": 1.2130541871921183, "grad_norm": 0.04026191681623459, "learning_rate": 0.01, "loss": 2.0637, "step": 11820 }, { "epoch": 1.2133620689655173, "grad_norm": 0.03309273347258568, "learning_rate": 0.01, "loss": 2.08, "step": 11823 }, { "epoch": 1.2136699507389164, "grad_norm": 0.10611164569854736, "learning_rate": 0.01, "loss": 2.0787, "step": 11826 }, { "epoch": 1.2139778325123154, "grad_norm": 0.0817422866821289, "learning_rate": 0.01, "loss": 2.069, "step": 11829 }, { "epoch": 1.2142857142857142, "grad_norm": 0.05528125911951065, "learning_rate": 0.01, "loss": 2.0408, "step": 11832 }, { "epoch": 1.2145935960591132, "grad_norm": 0.05016999691724777, "learning_rate": 0.01, "loss": 2.0794, "step": 11835 }, { "epoch": 1.2149014778325122, "grad_norm": 0.06376414746046066, "learning_rate": 0.01, "loss": 2.1097, "step": 11838 }, { "epoch": 1.2152093596059113, "grad_norm": 0.06668904423713684, "learning_rate": 0.01, "loss": 2.0745, "step": 11841 }, { "epoch": 1.2155172413793103, "grad_norm": 0.046260587871074677, "learning_rate": 0.01, "loss": 2.0711, "step": 11844 }, { "epoch": 1.2158251231527093, "grad_norm": 0.039374321699142456, "learning_rate": 0.01, "loss": 2.087, "step": 11847 }, { "epoch": 1.2161330049261083, "grad_norm": 0.10993562638759613, "learning_rate": 0.01, "loss": 2.0615, "step": 11850 }, { "epoch": 1.2164408866995073, "grad_norm": 0.03676668182015419, "learning_rate": 0.01, "loss": 2.0717, "step": 11853 }, { "epoch": 1.2167487684729064, "grad_norm": 0.10777715593576431, "learning_rate": 0.01, "loss": 2.1016, "step": 11856 }, { "epoch": 1.2170566502463054, "grad_norm": 0.07948705554008484, "learning_rate": 0.01, "loss": 2.083, "step": 11859 }, { "epoch": 1.2173645320197044, "grad_norm": 0.11646637320518494, "learning_rate": 0.01, "loss": 2.0552, "step": 11862 }, { "epoch": 1.2176724137931034, "grad_norm": 0.07525186985731125, "learning_rate": 0.01, "loss": 2.0877, "step": 11865 }, { "epoch": 1.2179802955665024, "grad_norm": 0.048124101012945175, "learning_rate": 0.01, "loss": 2.0652, "step": 11868 }, { "epoch": 1.2182881773399015, "grad_norm": 0.04603361710906029, "learning_rate": 0.01, "loss": 2.0922, "step": 11871 }, { "epoch": 1.2185960591133005, "grad_norm": 0.07067687064409256, "learning_rate": 0.01, "loss": 2.0946, "step": 11874 }, { "epoch": 1.2189039408866995, "grad_norm": 0.0959327220916748, "learning_rate": 0.01, "loss": 2.096, "step": 11877 }, { "epoch": 1.2192118226600985, "grad_norm": 0.08565320819616318, "learning_rate": 0.01, "loss": 2.09, "step": 11880 }, { "epoch": 1.2195197044334976, "grad_norm": 0.06728377193212509, "learning_rate": 0.01, "loss": 2.0801, "step": 11883 }, { "epoch": 1.2198275862068966, "grad_norm": 0.03809618949890137, "learning_rate": 0.01, "loss": 2.0668, "step": 11886 }, { "epoch": 1.2201354679802956, "grad_norm": 0.049925826489925385, "learning_rate": 0.01, "loss": 2.0625, "step": 11889 }, { "epoch": 1.2204433497536946, "grad_norm": 0.05949478596448898, "learning_rate": 0.01, "loss": 2.0687, "step": 11892 }, { "epoch": 1.2207512315270936, "grad_norm": 0.08161807060241699, "learning_rate": 0.01, "loss": 2.0789, "step": 11895 }, { "epoch": 1.2210591133004927, "grad_norm": 0.05829952284693718, "learning_rate": 0.01, "loss": 2.0846, "step": 11898 }, { "epoch": 1.2213669950738917, "grad_norm": 0.05801619216799736, "learning_rate": 0.01, "loss": 2.1014, "step": 11901 }, { "epoch": 1.2216748768472907, "grad_norm": 0.04123099148273468, "learning_rate": 0.01, "loss": 2.0867, "step": 11904 }, { "epoch": 1.2219827586206897, "grad_norm": 0.05088057741522789, "learning_rate": 0.01, "loss": 2.0751, "step": 11907 }, { "epoch": 1.2222906403940887, "grad_norm": 0.07357197254896164, "learning_rate": 0.01, "loss": 2.1023, "step": 11910 }, { "epoch": 1.2225985221674878, "grad_norm": 0.060078103095293045, "learning_rate": 0.01, "loss": 2.0784, "step": 11913 }, { "epoch": 1.2229064039408868, "grad_norm": 0.12629617750644684, "learning_rate": 0.01, "loss": 2.0767, "step": 11916 }, { "epoch": 1.2232142857142858, "grad_norm": 0.07202067971229553, "learning_rate": 0.01, "loss": 2.0796, "step": 11919 }, { "epoch": 1.2235221674876846, "grad_norm": 0.06407934427261353, "learning_rate": 0.01, "loss": 2.0926, "step": 11922 }, { "epoch": 1.2238300492610836, "grad_norm": 0.053789231926202774, "learning_rate": 0.01, "loss": 2.092, "step": 11925 }, { "epoch": 1.2241379310344827, "grad_norm": 0.04130502790212631, "learning_rate": 0.01, "loss": 2.051, "step": 11928 }, { "epoch": 1.2244458128078817, "grad_norm": 0.05235166475176811, "learning_rate": 0.01, "loss": 2.0829, "step": 11931 }, { "epoch": 1.2247536945812807, "grad_norm": 0.04508119449019432, "learning_rate": 0.01, "loss": 2.0857, "step": 11934 }, { "epoch": 1.2250615763546797, "grad_norm": 0.03570512309670448, "learning_rate": 0.01, "loss": 2.0734, "step": 11937 }, { "epoch": 1.2253694581280787, "grad_norm": 0.04690218344330788, "learning_rate": 0.01, "loss": 2.0988, "step": 11940 }, { "epoch": 1.2256773399014778, "grad_norm": 0.10231764614582062, "learning_rate": 0.01, "loss": 2.0716, "step": 11943 }, { "epoch": 1.2259852216748768, "grad_norm": 0.05221893638372421, "learning_rate": 0.01, "loss": 2.0693, "step": 11946 }, { "epoch": 1.2262931034482758, "grad_norm": 0.0647406056523323, "learning_rate": 0.01, "loss": 2.0809, "step": 11949 }, { "epoch": 1.2266009852216748, "grad_norm": 0.06388009339570999, "learning_rate": 0.01, "loss": 2.0968, "step": 11952 }, { "epoch": 1.2269088669950738, "grad_norm": 0.06904192268848419, "learning_rate": 0.01, "loss": 2.0917, "step": 11955 }, { "epoch": 1.2272167487684729, "grad_norm": 0.08780385553836823, "learning_rate": 0.01, "loss": 2.0881, "step": 11958 }, { "epoch": 1.2275246305418719, "grad_norm": 0.037958092987537384, "learning_rate": 0.01, "loss": 2.0813, "step": 11961 }, { "epoch": 1.227832512315271, "grad_norm": 0.04035305231809616, "learning_rate": 0.01, "loss": 2.0718, "step": 11964 }, { "epoch": 1.22814039408867, "grad_norm": 0.056451354175806046, "learning_rate": 0.01, "loss": 2.0653, "step": 11967 }, { "epoch": 1.228448275862069, "grad_norm": 0.06248374283313751, "learning_rate": 0.01, "loss": 2.0794, "step": 11970 }, { "epoch": 1.228756157635468, "grad_norm": 0.05662978067994118, "learning_rate": 0.01, "loss": 2.0873, "step": 11973 }, { "epoch": 1.229064039408867, "grad_norm": 0.06416438519954681, "learning_rate": 0.01, "loss": 2.0933, "step": 11976 }, { "epoch": 1.229371921182266, "grad_norm": 0.04529969021677971, "learning_rate": 0.01, "loss": 2.0892, "step": 11979 }, { "epoch": 1.229679802955665, "grad_norm": 0.03636370226740837, "learning_rate": 0.01, "loss": 2.0968, "step": 11982 }, { "epoch": 1.229987684729064, "grad_norm": 0.03992651402950287, "learning_rate": 0.01, "loss": 2.0606, "step": 11985 }, { "epoch": 1.230295566502463, "grad_norm": 0.19436125457286835, "learning_rate": 0.01, "loss": 2.0779, "step": 11988 }, { "epoch": 1.230603448275862, "grad_norm": 0.15459048748016357, "learning_rate": 0.01, "loss": 2.0801, "step": 11991 }, { "epoch": 1.2309113300492611, "grad_norm": 0.11131371557712555, "learning_rate": 0.01, "loss": 2.0629, "step": 11994 }, { "epoch": 1.2312192118226601, "grad_norm": 0.06876586377620697, "learning_rate": 0.01, "loss": 2.1081, "step": 11997 }, { "epoch": 1.2315270935960592, "grad_norm": 0.03379599004983902, "learning_rate": 0.01, "loss": 2.0977, "step": 12000 }, { "epoch": 1.2318349753694582, "grad_norm": 0.06905510276556015, "learning_rate": 0.01, "loss": 2.0545, "step": 12003 }, { "epoch": 1.2321428571428572, "grad_norm": 0.05859539657831192, "learning_rate": 0.01, "loss": 2.0751, "step": 12006 }, { "epoch": 1.2324507389162562, "grad_norm": 0.1200842559337616, "learning_rate": 0.01, "loss": 2.1045, "step": 12009 }, { "epoch": 1.2327586206896552, "grad_norm": 0.09969060868024826, "learning_rate": 0.01, "loss": 2.0641, "step": 12012 }, { "epoch": 1.2330665024630543, "grad_norm": 0.08915867656469345, "learning_rate": 0.01, "loss": 2.0585, "step": 12015 }, { "epoch": 1.2333743842364533, "grad_norm": 0.10951671004295349, "learning_rate": 0.01, "loss": 2.1007, "step": 12018 }, { "epoch": 1.2336822660098523, "grad_norm": 0.15262556076049805, "learning_rate": 0.01, "loss": 2.0643, "step": 12021 }, { "epoch": 1.2339901477832513, "grad_norm": 0.05622226372361183, "learning_rate": 0.01, "loss": 2.0796, "step": 12024 }, { "epoch": 1.2342980295566504, "grad_norm": 0.05918841436505318, "learning_rate": 0.01, "loss": 2.0911, "step": 12027 }, { "epoch": 1.2346059113300494, "grad_norm": 0.04867622256278992, "learning_rate": 0.01, "loss": 2.0872, "step": 12030 }, { "epoch": 1.2349137931034484, "grad_norm": 0.04389597848057747, "learning_rate": 0.01, "loss": 2.0882, "step": 12033 }, { "epoch": 1.2352216748768472, "grad_norm": 0.1209108904004097, "learning_rate": 0.01, "loss": 2.0394, "step": 12036 }, { "epoch": 1.2355295566502462, "grad_norm": 0.08446931838989258, "learning_rate": 0.01, "loss": 2.0744, "step": 12039 }, { "epoch": 1.2358374384236452, "grad_norm": 0.07141686230897903, "learning_rate": 0.01, "loss": 2.1099, "step": 12042 }, { "epoch": 1.2361453201970443, "grad_norm": 0.1216801181435585, "learning_rate": 0.01, "loss": 2.0754, "step": 12045 }, { "epoch": 1.2364532019704433, "grad_norm": 0.10539086163043976, "learning_rate": 0.01, "loss": 2.0887, "step": 12048 }, { "epoch": 1.2367610837438423, "grad_norm": 0.09336747229099274, "learning_rate": 0.01, "loss": 2.0637, "step": 12051 }, { "epoch": 1.2370689655172413, "grad_norm": 0.091059610247612, "learning_rate": 0.01, "loss": 2.057, "step": 12054 }, { "epoch": 1.2373768472906403, "grad_norm": 0.08291159570217133, "learning_rate": 0.01, "loss": 2.1039, "step": 12057 }, { "epoch": 1.2376847290640394, "grad_norm": 0.07626821845769882, "learning_rate": 0.01, "loss": 2.0784, "step": 12060 }, { "epoch": 1.2379926108374384, "grad_norm": 0.05197496339678764, "learning_rate": 0.01, "loss": 2.0968, "step": 12063 }, { "epoch": 1.2383004926108374, "grad_norm": 0.061275266110897064, "learning_rate": 0.01, "loss": 2.095, "step": 12066 }, { "epoch": 1.2386083743842364, "grad_norm": 0.04282483085989952, "learning_rate": 0.01, "loss": 2.083, "step": 12069 }, { "epoch": 1.2389162561576355, "grad_norm": 0.037066422402858734, "learning_rate": 0.01, "loss": 2.0653, "step": 12072 }, { "epoch": 1.2392241379310345, "grad_norm": 0.0467105396091938, "learning_rate": 0.01, "loss": 2.0913, "step": 12075 }, { "epoch": 1.2395320197044335, "grad_norm": 0.053995974361896515, "learning_rate": 0.01, "loss": 2.0754, "step": 12078 }, { "epoch": 1.2398399014778325, "grad_norm": 0.08583737164735794, "learning_rate": 0.01, "loss": 2.089, "step": 12081 }, { "epoch": 1.2401477832512315, "grad_norm": 0.07264076173305511, "learning_rate": 0.01, "loss": 2.07, "step": 12084 }, { "epoch": 1.2404556650246306, "grad_norm": 0.062001802027225494, "learning_rate": 0.01, "loss": 2.0969, "step": 12087 }, { "epoch": 1.2407635467980296, "grad_norm": 0.05311381444334984, "learning_rate": 0.01, "loss": 2.0833, "step": 12090 }, { "epoch": 1.2410714285714286, "grad_norm": 0.04272656887769699, "learning_rate": 0.01, "loss": 2.0883, "step": 12093 }, { "epoch": 1.2413793103448276, "grad_norm": 0.10696172714233398, "learning_rate": 0.01, "loss": 2.0925, "step": 12096 }, { "epoch": 1.2416871921182266, "grad_norm": 0.08625493943691254, "learning_rate": 0.01, "loss": 2.0616, "step": 12099 }, { "epoch": 1.2419950738916257, "grad_norm": 0.06818173080682755, "learning_rate": 0.01, "loss": 2.0943, "step": 12102 }, { "epoch": 1.2423029556650247, "grad_norm": 0.050731681287288666, "learning_rate": 0.01, "loss": 2.0802, "step": 12105 }, { "epoch": 1.2426108374384237, "grad_norm": 0.08426883816719055, "learning_rate": 0.01, "loss": 2.0825, "step": 12108 }, { "epoch": 1.2429187192118227, "grad_norm": 0.09432832151651382, "learning_rate": 0.01, "loss": 2.0834, "step": 12111 }, { "epoch": 1.2432266009852218, "grad_norm": 0.06951441615819931, "learning_rate": 0.01, "loss": 2.0951, "step": 12114 }, { "epoch": 1.2435344827586208, "grad_norm": 0.06427393108606339, "learning_rate": 0.01, "loss": 2.0706, "step": 12117 }, { "epoch": 1.2438423645320198, "grad_norm": 0.03967609629034996, "learning_rate": 0.01, "loss": 2.0897, "step": 12120 }, { "epoch": 1.2441502463054186, "grad_norm": 0.036665160208940506, "learning_rate": 0.01, "loss": 2.0791, "step": 12123 }, { "epoch": 1.2444581280788176, "grad_norm": 0.072290800511837, "learning_rate": 0.01, "loss": 2.0889, "step": 12126 }, { "epoch": 1.2447660098522166, "grad_norm": 0.07136868685483932, "learning_rate": 0.01, "loss": 2.0688, "step": 12129 }, { "epoch": 1.2450738916256157, "grad_norm": 0.15400493144989014, "learning_rate": 0.01, "loss": 2.0821, "step": 12132 }, { "epoch": 1.2453817733990147, "grad_norm": 0.07114578038454056, "learning_rate": 0.01, "loss": 2.0719, "step": 12135 }, { "epoch": 1.2456896551724137, "grad_norm": 0.043961767107248306, "learning_rate": 0.01, "loss": 2.0877, "step": 12138 }, { "epoch": 1.2459975369458127, "grad_norm": 0.056267060339450836, "learning_rate": 0.01, "loss": 2.0688, "step": 12141 }, { "epoch": 1.2463054187192117, "grad_norm": 0.035889722406864166, "learning_rate": 0.01, "loss": 2.0783, "step": 12144 }, { "epoch": 1.2466133004926108, "grad_norm": 0.1781640499830246, "learning_rate": 0.01, "loss": 2.0574, "step": 12147 }, { "epoch": 1.2469211822660098, "grad_norm": 0.0891503393650055, "learning_rate": 0.01, "loss": 2.0957, "step": 12150 }, { "epoch": 1.2472290640394088, "grad_norm": 0.047431472688913345, "learning_rate": 0.01, "loss": 2.0696, "step": 12153 }, { "epoch": 1.2475369458128078, "grad_norm": 0.04693286865949631, "learning_rate": 0.01, "loss": 2.0438, "step": 12156 }, { "epoch": 1.2478448275862069, "grad_norm": 0.0382777564227581, "learning_rate": 0.01, "loss": 2.0851, "step": 12159 }, { "epoch": 1.2481527093596059, "grad_norm": 0.04085429012775421, "learning_rate": 0.01, "loss": 2.0846, "step": 12162 }, { "epoch": 1.248460591133005, "grad_norm": 0.05329781025648117, "learning_rate": 0.01, "loss": 2.0732, "step": 12165 }, { "epoch": 1.248768472906404, "grad_norm": 0.06961992383003235, "learning_rate": 0.01, "loss": 2.0372, "step": 12168 }, { "epoch": 1.249076354679803, "grad_norm": 0.05290938913822174, "learning_rate": 0.01, "loss": 2.0692, "step": 12171 }, { "epoch": 1.249384236453202, "grad_norm": 0.1247674971818924, "learning_rate": 0.01, "loss": 2.0679, "step": 12174 }, { "epoch": 1.249692118226601, "grad_norm": 0.04983863607048988, "learning_rate": 0.01, "loss": 2.0611, "step": 12177 }, { "epoch": 1.25, "grad_norm": 0.08552074432373047, "learning_rate": 0.01, "loss": 2.1084, "step": 12180 }, { "epoch": 1.250307881773399, "grad_norm": 0.1376069337129593, "learning_rate": 0.01, "loss": 2.0997, "step": 12183 }, { "epoch": 1.250615763546798, "grad_norm": 0.07097752392292023, "learning_rate": 0.01, "loss": 2.0761, "step": 12186 }, { "epoch": 1.250923645320197, "grad_norm": 0.03953644260764122, "learning_rate": 0.01, "loss": 2.0675, "step": 12189 }, { "epoch": 1.251231527093596, "grad_norm": 0.04611526057124138, "learning_rate": 0.01, "loss": 2.0717, "step": 12192 }, { "epoch": 1.251539408866995, "grad_norm": 0.07573895156383514, "learning_rate": 0.01, "loss": 2.0785, "step": 12195 }, { "epoch": 1.2518472906403941, "grad_norm": 0.07144660502672195, "learning_rate": 0.01, "loss": 2.0815, "step": 12198 }, { "epoch": 1.2521551724137931, "grad_norm": 0.05297645181417465, "learning_rate": 0.01, "loss": 2.093, "step": 12201 }, { "epoch": 1.2524630541871922, "grad_norm": 0.044887710362672806, "learning_rate": 0.01, "loss": 2.0752, "step": 12204 }, { "epoch": 1.2527709359605912, "grad_norm": 0.04305564984679222, "learning_rate": 0.01, "loss": 2.0596, "step": 12207 }, { "epoch": 1.2530788177339902, "grad_norm": 0.057785287499427795, "learning_rate": 0.01, "loss": 2.094, "step": 12210 }, { "epoch": 1.2533866995073892, "grad_norm": 0.04404570534825325, "learning_rate": 0.01, "loss": 2.1037, "step": 12213 }, { "epoch": 1.2536945812807883, "grad_norm": 0.055468104779720306, "learning_rate": 0.01, "loss": 2.0851, "step": 12216 }, { "epoch": 1.2540024630541873, "grad_norm": 0.17121906578540802, "learning_rate": 0.01, "loss": 2.081, "step": 12219 }, { "epoch": 1.2543103448275863, "grad_norm": 0.09411416202783585, "learning_rate": 0.01, "loss": 2.0974, "step": 12222 }, { "epoch": 1.2546182266009853, "grad_norm": 0.07855021953582764, "learning_rate": 0.01, "loss": 2.0733, "step": 12225 }, { "epoch": 1.2549261083743843, "grad_norm": 0.052616432309150696, "learning_rate": 0.01, "loss": 2.0372, "step": 12228 }, { "epoch": 1.2552339901477834, "grad_norm": 0.047992121428251266, "learning_rate": 0.01, "loss": 2.0918, "step": 12231 }, { "epoch": 1.2555418719211824, "grad_norm": 0.04336715489625931, "learning_rate": 0.01, "loss": 2.0511, "step": 12234 }, { "epoch": 1.2558497536945814, "grad_norm": 0.03128316253423691, "learning_rate": 0.01, "loss": 2.0882, "step": 12237 }, { "epoch": 1.2561576354679804, "grad_norm": 0.06315557658672333, "learning_rate": 0.01, "loss": 2.0918, "step": 12240 }, { "epoch": 1.2564655172413794, "grad_norm": 0.0528687946498394, "learning_rate": 0.01, "loss": 2.0556, "step": 12243 }, { "epoch": 1.2567733990147782, "grad_norm": 0.17166706919670105, "learning_rate": 0.01, "loss": 2.068, "step": 12246 }, { "epoch": 1.2570812807881773, "grad_norm": 0.11394128203392029, "learning_rate": 0.01, "loss": 2.1179, "step": 12249 }, { "epoch": 1.2573891625615763, "grad_norm": 0.08554805815219879, "learning_rate": 0.01, "loss": 2.0857, "step": 12252 }, { "epoch": 1.2576970443349753, "grad_norm": 0.05203767865896225, "learning_rate": 0.01, "loss": 2.0975, "step": 12255 }, { "epoch": 1.2580049261083743, "grad_norm": 0.06072428077459335, "learning_rate": 0.01, "loss": 2.0692, "step": 12258 }, { "epoch": 1.2583128078817734, "grad_norm": 0.044136617332696915, "learning_rate": 0.01, "loss": 2.0458, "step": 12261 }, { "epoch": 1.2586206896551724, "grad_norm": 0.038774993270635605, "learning_rate": 0.01, "loss": 2.0835, "step": 12264 }, { "epoch": 1.2589285714285714, "grad_norm": 0.03669529780745506, "learning_rate": 0.01, "loss": 2.0949, "step": 12267 }, { "epoch": 1.2592364532019704, "grad_norm": 0.050722066313028336, "learning_rate": 0.01, "loss": 2.0772, "step": 12270 }, { "epoch": 1.2595443349753694, "grad_norm": 0.12684300541877747, "learning_rate": 0.01, "loss": 2.0814, "step": 12273 }, { "epoch": 1.2598522167487685, "grad_norm": 0.07431039214134216, "learning_rate": 0.01, "loss": 2.1001, "step": 12276 }, { "epoch": 1.2601600985221675, "grad_norm": 0.07050034403800964, "learning_rate": 0.01, "loss": 2.0507, "step": 12279 }, { "epoch": 1.2604679802955665, "grad_norm": 0.0553620308637619, "learning_rate": 0.01, "loss": 2.0757, "step": 12282 }, { "epoch": 1.2607758620689655, "grad_norm": 0.10946903377771378, "learning_rate": 0.01, "loss": 2.0659, "step": 12285 }, { "epoch": 1.2610837438423645, "grad_norm": 0.06346802413463593, "learning_rate": 0.01, "loss": 2.0871, "step": 12288 }, { "epoch": 1.2613916256157636, "grad_norm": 0.09386668354272842, "learning_rate": 0.01, "loss": 2.0702, "step": 12291 }, { "epoch": 1.2616995073891626, "grad_norm": 0.05672946944832802, "learning_rate": 0.01, "loss": 2.0812, "step": 12294 }, { "epoch": 1.2620073891625616, "grad_norm": 0.1079539805650711, "learning_rate": 0.01, "loss": 2.0851, "step": 12297 }, { "epoch": 1.2623152709359606, "grad_norm": 0.043078985065221786, "learning_rate": 0.01, "loss": 2.0846, "step": 12300 }, { "epoch": 1.2626231527093597, "grad_norm": 0.043098706752061844, "learning_rate": 0.01, "loss": 2.0541, "step": 12303 }, { "epoch": 1.2629310344827587, "grad_norm": 0.04908977448940277, "learning_rate": 0.01, "loss": 2.0801, "step": 12306 }, { "epoch": 1.2632389162561577, "grad_norm": 0.09897246211767197, "learning_rate": 0.01, "loss": 2.0899, "step": 12309 }, { "epoch": 1.2635467980295567, "grad_norm": 0.0861278846859932, "learning_rate": 0.01, "loss": 2.0613, "step": 12312 }, { "epoch": 1.2638546798029557, "grad_norm": 0.06973280757665634, "learning_rate": 0.01, "loss": 2.0598, "step": 12315 }, { "epoch": 1.2641625615763548, "grad_norm": 0.048658501356840134, "learning_rate": 0.01, "loss": 2.0838, "step": 12318 }, { "epoch": 1.2644704433497536, "grad_norm": 0.05053295940160751, "learning_rate": 0.01, "loss": 2.0665, "step": 12321 }, { "epoch": 1.2647783251231526, "grad_norm": 0.0536380410194397, "learning_rate": 0.01, "loss": 2.0696, "step": 12324 }, { "epoch": 1.2650862068965516, "grad_norm": 0.13452892005443573, "learning_rate": 0.01, "loss": 2.0928, "step": 12327 }, { "epoch": 1.2653940886699506, "grad_norm": 0.056635159999132156, "learning_rate": 0.01, "loss": 2.0903, "step": 12330 }, { "epoch": 1.2657019704433496, "grad_norm": 0.09460306912660599, "learning_rate": 0.01, "loss": 2.0616, "step": 12333 }, { "epoch": 1.2660098522167487, "grad_norm": 0.09019794315099716, "learning_rate": 0.01, "loss": 2.0808, "step": 12336 }, { "epoch": 1.2663177339901477, "grad_norm": 0.04020017758011818, "learning_rate": 0.01, "loss": 2.0613, "step": 12339 }, { "epoch": 1.2666256157635467, "grad_norm": 0.05063892900943756, "learning_rate": 0.01, "loss": 2.0846, "step": 12342 }, { "epoch": 1.2669334975369457, "grad_norm": 0.06472054123878479, "learning_rate": 0.01, "loss": 2.0884, "step": 12345 }, { "epoch": 1.2672413793103448, "grad_norm": 0.0523315854370594, "learning_rate": 0.01, "loss": 2.0756, "step": 12348 }, { "epoch": 1.2675492610837438, "grad_norm": 0.040240950882434845, "learning_rate": 0.01, "loss": 2.0646, "step": 12351 }, { "epoch": 1.2678571428571428, "grad_norm": 0.061988551169633865, "learning_rate": 0.01, "loss": 2.0899, "step": 12354 }, { "epoch": 1.2681650246305418, "grad_norm": 0.03831657022237778, "learning_rate": 0.01, "loss": 2.0633, "step": 12357 }, { "epoch": 1.2684729064039408, "grad_norm": 0.105617955327034, "learning_rate": 0.01, "loss": 2.0553, "step": 12360 }, { "epoch": 1.2687807881773399, "grad_norm": 0.09372366219758987, "learning_rate": 0.01, "loss": 2.0522, "step": 12363 }, { "epoch": 1.2690886699507389, "grad_norm": 0.10305638611316681, "learning_rate": 0.01, "loss": 2.0543, "step": 12366 }, { "epoch": 1.269396551724138, "grad_norm": 0.07187418639659882, "learning_rate": 0.01, "loss": 2.0775, "step": 12369 }, { "epoch": 1.269704433497537, "grad_norm": 0.03744306415319443, "learning_rate": 0.01, "loss": 2.0771, "step": 12372 }, { "epoch": 1.270012315270936, "grad_norm": 0.03059488907456398, "learning_rate": 0.01, "loss": 2.067, "step": 12375 }, { "epoch": 1.270320197044335, "grad_norm": 0.09767211973667145, "learning_rate": 0.01, "loss": 2.0889, "step": 12378 }, { "epoch": 1.270628078817734, "grad_norm": 0.05093003436923027, "learning_rate": 0.01, "loss": 2.0852, "step": 12381 }, { "epoch": 1.270935960591133, "grad_norm": 0.04648155719041824, "learning_rate": 0.01, "loss": 2.0775, "step": 12384 }, { "epoch": 1.271243842364532, "grad_norm": 0.04745417460799217, "learning_rate": 0.01, "loss": 2.0888, "step": 12387 }, { "epoch": 1.271551724137931, "grad_norm": 0.05171092227101326, "learning_rate": 0.01, "loss": 2.075, "step": 12390 }, { "epoch": 1.27185960591133, "grad_norm": 0.05051850154995918, "learning_rate": 0.01, "loss": 2.0754, "step": 12393 }, { "epoch": 1.272167487684729, "grad_norm": 0.06048591807484627, "learning_rate": 0.01, "loss": 2.1025, "step": 12396 }, { "epoch": 1.2724753694581281, "grad_norm": 0.09673628211021423, "learning_rate": 0.01, "loss": 2.0866, "step": 12399 }, { "epoch": 1.2727832512315271, "grad_norm": 0.11826295405626297, "learning_rate": 0.01, "loss": 2.0757, "step": 12402 }, { "epoch": 1.2730911330049262, "grad_norm": 0.06271769106388092, "learning_rate": 0.01, "loss": 2.0797, "step": 12405 }, { "epoch": 1.2733990147783252, "grad_norm": 0.0463738851249218, "learning_rate": 0.01, "loss": 2.054, "step": 12408 }, { "epoch": 1.2737068965517242, "grad_norm": 0.035649579018354416, "learning_rate": 0.01, "loss": 2.0747, "step": 12411 }, { "epoch": 1.2740147783251232, "grad_norm": 0.08586122840642929, "learning_rate": 0.01, "loss": 2.0692, "step": 12414 }, { "epoch": 1.2743226600985222, "grad_norm": 0.08538860082626343, "learning_rate": 0.01, "loss": 2.0617, "step": 12417 }, { "epoch": 1.2746305418719213, "grad_norm": 0.05974709242582321, "learning_rate": 0.01, "loss": 2.0527, "step": 12420 }, { "epoch": 1.2749384236453203, "grad_norm": 0.057023100554943085, "learning_rate": 0.01, "loss": 2.0751, "step": 12423 }, { "epoch": 1.2752463054187193, "grad_norm": 0.062105972319841385, "learning_rate": 0.01, "loss": 2.0946, "step": 12426 }, { "epoch": 1.2755541871921183, "grad_norm": 0.048221688717603683, "learning_rate": 0.01, "loss": 2.0662, "step": 12429 }, { "epoch": 1.2758620689655173, "grad_norm": 0.04440610110759735, "learning_rate": 0.01, "loss": 2.0852, "step": 12432 }, { "epoch": 1.2761699507389164, "grad_norm": 0.06074054539203644, "learning_rate": 0.01, "loss": 2.0667, "step": 12435 }, { "epoch": 1.2764778325123154, "grad_norm": 0.09477720409631729, "learning_rate": 0.01, "loss": 2.0815, "step": 12438 }, { "epoch": 1.2767857142857144, "grad_norm": 0.08470610529184341, "learning_rate": 0.01, "loss": 2.1023, "step": 12441 }, { "epoch": 1.2770935960591134, "grad_norm": 0.09079881012439728, "learning_rate": 0.01, "loss": 2.0761, "step": 12444 }, { "epoch": 1.2774014778325122, "grad_norm": 0.06132930517196655, "learning_rate": 0.01, "loss": 2.1027, "step": 12447 }, { "epoch": 1.2777093596059113, "grad_norm": 0.04702606052160263, "learning_rate": 0.01, "loss": 2.1051, "step": 12450 }, { "epoch": 1.2780172413793103, "grad_norm": 0.06621818989515305, "learning_rate": 0.01, "loss": 2.0835, "step": 12453 }, { "epoch": 1.2783251231527093, "grad_norm": 0.047375794500112534, "learning_rate": 0.01, "loss": 2.0982, "step": 12456 }, { "epoch": 1.2786330049261083, "grad_norm": 0.03766035661101341, "learning_rate": 0.01, "loss": 2.0468, "step": 12459 }, { "epoch": 1.2789408866995073, "grad_norm": 0.10769324004650116, "learning_rate": 0.01, "loss": 2.0848, "step": 12462 }, { "epoch": 1.2792487684729064, "grad_norm": 0.06848851591348648, "learning_rate": 0.01, "loss": 2.0664, "step": 12465 }, { "epoch": 1.2795566502463054, "grad_norm": 0.09864836931228638, "learning_rate": 0.01, "loss": 2.0719, "step": 12468 }, { "epoch": 1.2798645320197044, "grad_norm": 0.04042387008666992, "learning_rate": 0.01, "loss": 2.0672, "step": 12471 }, { "epoch": 1.2801724137931034, "grad_norm": 0.09212526679039001, "learning_rate": 0.01, "loss": 2.0789, "step": 12474 }, { "epoch": 1.2804802955665024, "grad_norm": 0.08713985234498978, "learning_rate": 0.01, "loss": 2.0765, "step": 12477 }, { "epoch": 1.2807881773399015, "grad_norm": 0.04133505001664162, "learning_rate": 0.01, "loss": 2.0809, "step": 12480 }, { "epoch": 1.2810960591133005, "grad_norm": 0.07466418296098709, "learning_rate": 0.01, "loss": 2.0499, "step": 12483 }, { "epoch": 1.2814039408866995, "grad_norm": 0.08685484528541565, "learning_rate": 0.01, "loss": 2.0689, "step": 12486 }, { "epoch": 1.2817118226600985, "grad_norm": 0.13300663232803345, "learning_rate": 0.01, "loss": 2.0761, "step": 12489 }, { "epoch": 1.2820197044334976, "grad_norm": 0.1024186760187149, "learning_rate": 0.01, "loss": 2.0763, "step": 12492 }, { "epoch": 1.2823275862068966, "grad_norm": 0.05908042937517166, "learning_rate": 0.01, "loss": 2.0485, "step": 12495 }, { "epoch": 1.2826354679802956, "grad_norm": 0.03715427219867706, "learning_rate": 0.01, "loss": 2.0925, "step": 12498 }, { "epoch": 1.2829433497536946, "grad_norm": 0.05033004283905029, "learning_rate": 0.01, "loss": 2.0875, "step": 12501 }, { "epoch": 1.2832512315270936, "grad_norm": 0.04851456359028816, "learning_rate": 0.01, "loss": 2.0547, "step": 12504 }, { "epoch": 1.2835591133004927, "grad_norm": 0.08153282105922699, "learning_rate": 0.01, "loss": 2.0828, "step": 12507 }, { "epoch": 1.2838669950738917, "grad_norm": 0.07549238950014114, "learning_rate": 0.01, "loss": 2.0681, "step": 12510 }, { "epoch": 1.2841748768472907, "grad_norm": 0.08571973443031311, "learning_rate": 0.01, "loss": 2.0556, "step": 12513 }, { "epoch": 1.2844827586206897, "grad_norm": 0.1036754697561264, "learning_rate": 0.01, "loss": 2.0712, "step": 12516 }, { "epoch": 1.2847906403940887, "grad_norm": 0.04329349100589752, "learning_rate": 0.01, "loss": 2.0711, "step": 12519 }, { "epoch": 1.2850985221674878, "grad_norm": 0.03718428686261177, "learning_rate": 0.01, "loss": 2.0812, "step": 12522 }, { "epoch": 1.2854064039408866, "grad_norm": 0.07977878302335739, "learning_rate": 0.01, "loss": 2.0562, "step": 12525 }, { "epoch": 1.2857142857142856, "grad_norm": 0.11540202796459198, "learning_rate": 0.01, "loss": 2.0666, "step": 12528 }, { "epoch": 1.2860221674876846, "grad_norm": 0.07631656527519226, "learning_rate": 0.01, "loss": 2.0694, "step": 12531 }, { "epoch": 1.2863300492610836, "grad_norm": 0.051738426089286804, "learning_rate": 0.01, "loss": 2.0862, "step": 12534 }, { "epoch": 1.2866379310344827, "grad_norm": 0.05070396512746811, "learning_rate": 0.01, "loss": 2.0538, "step": 12537 }, { "epoch": 1.2869458128078817, "grad_norm": 0.039027947932481766, "learning_rate": 0.01, "loss": 2.0487, "step": 12540 }, { "epoch": 1.2872536945812807, "grad_norm": 0.052646003663539886, "learning_rate": 0.01, "loss": 2.0745, "step": 12543 }, { "epoch": 1.2875615763546797, "grad_norm": 0.06035429984331131, "learning_rate": 0.01, "loss": 2.0775, "step": 12546 }, { "epoch": 1.2878694581280787, "grad_norm": 0.0818098783493042, "learning_rate": 0.01, "loss": 2.0685, "step": 12549 }, { "epoch": 1.2881773399014778, "grad_norm": 0.030897030606865883, "learning_rate": 0.01, "loss": 2.0701, "step": 12552 }, { "epoch": 1.2884852216748768, "grad_norm": 0.042795561254024506, "learning_rate": 0.01, "loss": 2.0673, "step": 12555 }, { "epoch": 1.2887931034482758, "grad_norm": 0.11560031026601791, "learning_rate": 0.01, "loss": 2.0637, "step": 12558 }, { "epoch": 1.2891009852216748, "grad_norm": 0.03919963166117668, "learning_rate": 0.01, "loss": 2.0547, "step": 12561 }, { "epoch": 1.2894088669950738, "grad_norm": 0.03816407918930054, "learning_rate": 0.01, "loss": 2.0576, "step": 12564 }, { "epoch": 1.2897167487684729, "grad_norm": 0.04641805216670036, "learning_rate": 0.01, "loss": 2.0779, "step": 12567 }, { "epoch": 1.2900246305418719, "grad_norm": 0.16832102835178375, "learning_rate": 0.01, "loss": 2.0658, "step": 12570 }, { "epoch": 1.290332512315271, "grad_norm": 0.06651032716035843, "learning_rate": 0.01, "loss": 2.0602, "step": 12573 }, { "epoch": 1.29064039408867, "grad_norm": 0.05333925411105156, "learning_rate": 0.01, "loss": 2.0842, "step": 12576 }, { "epoch": 1.290948275862069, "grad_norm": 0.03173685073852539, "learning_rate": 0.01, "loss": 2.0834, "step": 12579 }, { "epoch": 1.291256157635468, "grad_norm": 0.0836583599448204, "learning_rate": 0.01, "loss": 2.0605, "step": 12582 }, { "epoch": 1.291564039408867, "grad_norm": 0.09913724660873413, "learning_rate": 0.01, "loss": 2.0577, "step": 12585 }, { "epoch": 1.291871921182266, "grad_norm": 0.04268624261021614, "learning_rate": 0.01, "loss": 2.1027, "step": 12588 }, { "epoch": 1.292179802955665, "grad_norm": 0.03744608163833618, "learning_rate": 0.01, "loss": 2.0506, "step": 12591 }, { "epoch": 1.292487684729064, "grad_norm": 0.11833969503641129, "learning_rate": 0.01, "loss": 2.0606, "step": 12594 }, { "epoch": 1.292795566502463, "grad_norm": 0.11814229190349579, "learning_rate": 0.01, "loss": 2.0641, "step": 12597 }, { "epoch": 1.293103448275862, "grad_norm": 0.04583375155925751, "learning_rate": 0.01, "loss": 2.0898, "step": 12600 }, { "epoch": 1.2934113300492611, "grad_norm": 0.11559072881937027, "learning_rate": 0.01, "loss": 2.0657, "step": 12603 }, { "epoch": 1.2937192118226601, "grad_norm": 0.050608474761247635, "learning_rate": 0.01, "loss": 2.0879, "step": 12606 }, { "epoch": 1.2940270935960592, "grad_norm": 0.04284593090415001, "learning_rate": 0.01, "loss": 2.0545, "step": 12609 }, { "epoch": 1.2943349753694582, "grad_norm": 0.042168114334344864, "learning_rate": 0.01, "loss": 2.0645, "step": 12612 }, { "epoch": 1.2946428571428572, "grad_norm": 0.06251826882362366, "learning_rate": 0.01, "loss": 2.0621, "step": 12615 }, { "epoch": 1.2949507389162562, "grad_norm": 0.11159554868936539, "learning_rate": 0.01, "loss": 2.0718, "step": 12618 }, { "epoch": 1.2952586206896552, "grad_norm": 0.0715017095208168, "learning_rate": 0.01, "loss": 2.0844, "step": 12621 }, { "epoch": 1.2955665024630543, "grad_norm": 0.12078917771577835, "learning_rate": 0.01, "loss": 2.0604, "step": 12624 }, { "epoch": 1.2958743842364533, "grad_norm": 0.11613093316555023, "learning_rate": 0.01, "loss": 2.0753, "step": 12627 }, { "epoch": 1.2961822660098523, "grad_norm": 0.053543299436569214, "learning_rate": 0.01, "loss": 2.0736, "step": 12630 }, { "epoch": 1.2964901477832513, "grad_norm": 0.05464399605989456, "learning_rate": 0.01, "loss": 2.0845, "step": 12633 }, { "epoch": 1.2967980295566504, "grad_norm": 0.0487944521009922, "learning_rate": 0.01, "loss": 2.0859, "step": 12636 }, { "epoch": 1.2971059113300494, "grad_norm": 0.05266605690121651, "learning_rate": 0.01, "loss": 2.0532, "step": 12639 }, { "epoch": 1.2974137931034484, "grad_norm": 0.07863074541091919, "learning_rate": 0.01, "loss": 2.0496, "step": 12642 }, { "epoch": 1.2977216748768474, "grad_norm": 0.05371072143316269, "learning_rate": 0.01, "loss": 2.1016, "step": 12645 }, { "epoch": 1.2980295566502464, "grad_norm": 0.05592924728989601, "learning_rate": 0.01, "loss": 2.0832, "step": 12648 }, { "epoch": 1.2983374384236452, "grad_norm": 0.0675397738814354, "learning_rate": 0.01, "loss": 2.0784, "step": 12651 }, { "epoch": 1.2986453201970443, "grad_norm": 0.04399113729596138, "learning_rate": 0.01, "loss": 2.0567, "step": 12654 }, { "epoch": 1.2989532019704433, "grad_norm": 0.04609301686286926, "learning_rate": 0.01, "loss": 2.0683, "step": 12657 }, { "epoch": 1.2992610837438423, "grad_norm": 0.05637912079691887, "learning_rate": 0.01, "loss": 2.0655, "step": 12660 }, { "epoch": 1.2995689655172413, "grad_norm": 0.1115126758813858, "learning_rate": 0.01, "loss": 2.0643, "step": 12663 }, { "epoch": 1.2998768472906403, "grad_norm": 0.07762522250413895, "learning_rate": 0.01, "loss": 2.0853, "step": 12666 }, { "epoch": 1.3001847290640394, "grad_norm": 0.06717406958341599, "learning_rate": 0.01, "loss": 2.0881, "step": 12669 }, { "epoch": 1.3004926108374384, "grad_norm": 0.0727803036570549, "learning_rate": 0.01, "loss": 2.0606, "step": 12672 }, { "epoch": 1.3008004926108374, "grad_norm": 0.06588024646043777, "learning_rate": 0.01, "loss": 2.0508, "step": 12675 }, { "epoch": 1.3011083743842364, "grad_norm": 0.09718842804431915, "learning_rate": 0.01, "loss": 2.0793, "step": 12678 }, { "epoch": 1.3014162561576355, "grad_norm": 0.05804411321878433, "learning_rate": 0.01, "loss": 2.0825, "step": 12681 }, { "epoch": 1.3017241379310345, "grad_norm": 0.07549803704023361, "learning_rate": 0.01, "loss": 2.0546, "step": 12684 }, { "epoch": 1.3020320197044335, "grad_norm": 0.04496621713042259, "learning_rate": 0.01, "loss": 2.0483, "step": 12687 }, { "epoch": 1.3023399014778325, "grad_norm": 0.1283668428659439, "learning_rate": 0.01, "loss": 2.0639, "step": 12690 }, { "epoch": 1.3026477832512315, "grad_norm": 0.1276516169309616, "learning_rate": 0.01, "loss": 2.062, "step": 12693 }, { "epoch": 1.3029556650246306, "grad_norm": 0.12865981459617615, "learning_rate": 0.01, "loss": 2.0697, "step": 12696 }, { "epoch": 1.3032635467980296, "grad_norm": 0.05869213864207268, "learning_rate": 0.01, "loss": 2.0955, "step": 12699 }, { "epoch": 1.3035714285714286, "grad_norm": 0.042082637548446655, "learning_rate": 0.01, "loss": 2.0847, "step": 12702 }, { "epoch": 1.3038793103448276, "grad_norm": 0.11474558711051941, "learning_rate": 0.01, "loss": 2.0767, "step": 12705 }, { "epoch": 1.3041871921182266, "grad_norm": 0.12426330894231796, "learning_rate": 0.01, "loss": 2.0787, "step": 12708 }, { "epoch": 1.3044950738916257, "grad_norm": 0.06731969118118286, "learning_rate": 0.01, "loss": 2.064, "step": 12711 }, { "epoch": 1.3048029556650247, "grad_norm": 0.049505047500133514, "learning_rate": 0.01, "loss": 2.0848, "step": 12714 }, { "epoch": 1.3051108374384237, "grad_norm": 0.04460617154836655, "learning_rate": 0.01, "loss": 2.0958, "step": 12717 }, { "epoch": 1.3054187192118227, "grad_norm": 0.05461740121245384, "learning_rate": 0.01, "loss": 2.0724, "step": 12720 }, { "epoch": 1.3057266009852218, "grad_norm": 0.04074997082352638, "learning_rate": 0.01, "loss": 2.0453, "step": 12723 }, { "epoch": 1.3060344827586206, "grad_norm": 0.04925557225942612, "learning_rate": 0.01, "loss": 2.0707, "step": 12726 }, { "epoch": 1.3063423645320196, "grad_norm": 0.06570678949356079, "learning_rate": 0.01, "loss": 2.0703, "step": 12729 }, { "epoch": 1.3066502463054186, "grad_norm": 0.076143778860569, "learning_rate": 0.01, "loss": 2.0639, "step": 12732 }, { "epoch": 1.3069581280788176, "grad_norm": 0.06691130995750427, "learning_rate": 0.01, "loss": 2.0858, "step": 12735 }, { "epoch": 1.3072660098522166, "grad_norm": 0.06733332574367523, "learning_rate": 0.01, "loss": 2.0622, "step": 12738 }, { "epoch": 1.3075738916256157, "grad_norm": 0.08008868247270584, "learning_rate": 0.01, "loss": 2.085, "step": 12741 }, { "epoch": 1.3078817733990147, "grad_norm": 0.10713468492031097, "learning_rate": 0.01, "loss": 2.0642, "step": 12744 }, { "epoch": 1.3081896551724137, "grad_norm": 0.101436547935009, "learning_rate": 0.01, "loss": 2.0835, "step": 12747 }, { "epoch": 1.3084975369458127, "grad_norm": 0.0552450455725193, "learning_rate": 0.01, "loss": 2.0553, "step": 12750 }, { "epoch": 1.3088054187192117, "grad_norm": 0.08755996823310852, "learning_rate": 0.01, "loss": 2.0564, "step": 12753 }, { "epoch": 1.3091133004926108, "grad_norm": 0.03980748727917671, "learning_rate": 0.01, "loss": 2.084, "step": 12756 }, { "epoch": 1.3094211822660098, "grad_norm": 0.0971774309873581, "learning_rate": 0.01, "loss": 2.0858, "step": 12759 }, { "epoch": 1.3097290640394088, "grad_norm": 0.05845404043793678, "learning_rate": 0.01, "loss": 2.0792, "step": 12762 }, { "epoch": 1.3100369458128078, "grad_norm": 0.08498022705316544, "learning_rate": 0.01, "loss": 2.0932, "step": 12765 }, { "epoch": 1.3103448275862069, "grad_norm": 0.05135398730635643, "learning_rate": 0.01, "loss": 2.0796, "step": 12768 }, { "epoch": 1.3106527093596059, "grad_norm": 0.048872511833906174, "learning_rate": 0.01, "loss": 2.0675, "step": 12771 }, { "epoch": 1.310960591133005, "grad_norm": 0.0529983788728714, "learning_rate": 0.01, "loss": 2.0573, "step": 12774 }, { "epoch": 1.311268472906404, "grad_norm": 0.03580658137798309, "learning_rate": 0.01, "loss": 2.0741, "step": 12777 }, { "epoch": 1.311576354679803, "grad_norm": 0.03904234617948532, "learning_rate": 0.01, "loss": 2.0691, "step": 12780 }, { "epoch": 1.311884236453202, "grad_norm": 0.04073551669716835, "learning_rate": 0.01, "loss": 2.0694, "step": 12783 }, { "epoch": 1.312192118226601, "grad_norm": 0.058973487466573715, "learning_rate": 0.01, "loss": 2.0381, "step": 12786 }, { "epoch": 1.3125, "grad_norm": 0.05673586577177048, "learning_rate": 0.01, "loss": 2.0691, "step": 12789 }, { "epoch": 1.312807881773399, "grad_norm": 0.13680770993232727, "learning_rate": 0.01, "loss": 2.0673, "step": 12792 }, { "epoch": 1.313115763546798, "grad_norm": 0.0454241968691349, "learning_rate": 0.01, "loss": 2.0766, "step": 12795 }, { "epoch": 1.313423645320197, "grad_norm": 0.04074293375015259, "learning_rate": 0.01, "loss": 2.0593, "step": 12798 }, { "epoch": 1.313731527093596, "grad_norm": 0.04174893721938133, "learning_rate": 0.01, "loss": 2.0537, "step": 12801 }, { "epoch": 1.314039408866995, "grad_norm": 0.06716062128543854, "learning_rate": 0.01, "loss": 2.0642, "step": 12804 }, { "epoch": 1.3143472906403941, "grad_norm": 0.09866861253976822, "learning_rate": 0.01, "loss": 2.0889, "step": 12807 }, { "epoch": 1.3146551724137931, "grad_norm": 0.13097235560417175, "learning_rate": 0.01, "loss": 2.0744, "step": 12810 }, { "epoch": 1.3149630541871922, "grad_norm": 0.07859724014997482, "learning_rate": 0.01, "loss": 2.0899, "step": 12813 }, { "epoch": 1.3152709359605912, "grad_norm": 0.06141912192106247, "learning_rate": 0.01, "loss": 2.0666, "step": 12816 }, { "epoch": 1.3155788177339902, "grad_norm": 0.05112985149025917, "learning_rate": 0.01, "loss": 2.0658, "step": 12819 }, { "epoch": 1.3158866995073892, "grad_norm": 0.03482404351234436, "learning_rate": 0.01, "loss": 2.0802, "step": 12822 }, { "epoch": 1.3161945812807883, "grad_norm": 0.05562854930758476, "learning_rate": 0.01, "loss": 2.0659, "step": 12825 }, { "epoch": 1.3165024630541873, "grad_norm": 0.04841645434498787, "learning_rate": 0.01, "loss": 2.058, "step": 12828 }, { "epoch": 1.3168103448275863, "grad_norm": 0.0756571963429451, "learning_rate": 0.01, "loss": 2.0283, "step": 12831 }, { "epoch": 1.3171182266009853, "grad_norm": 0.09575197845697403, "learning_rate": 0.01, "loss": 2.0709, "step": 12834 }, { "epoch": 1.3174261083743843, "grad_norm": 0.07003197073936462, "learning_rate": 0.01, "loss": 2.0409, "step": 12837 }, { "epoch": 1.3177339901477834, "grad_norm": 0.12592460215091705, "learning_rate": 0.01, "loss": 2.051, "step": 12840 }, { "epoch": 1.3180418719211824, "grad_norm": 0.07086621969938278, "learning_rate": 0.01, "loss": 2.0856, "step": 12843 }, { "epoch": 1.3183497536945814, "grad_norm": 0.07367062568664551, "learning_rate": 0.01, "loss": 2.0759, "step": 12846 }, { "epoch": 1.3186576354679804, "grad_norm": 0.06731852889060974, "learning_rate": 0.01, "loss": 2.077, "step": 12849 }, { "epoch": 1.3189655172413794, "grad_norm": 0.07499510049819946, "learning_rate": 0.01, "loss": 2.072, "step": 12852 }, { "epoch": 1.3192733990147782, "grad_norm": 0.07604499906301498, "learning_rate": 0.01, "loss": 2.0717, "step": 12855 }, { "epoch": 1.3195812807881773, "grad_norm": 0.0770401731133461, "learning_rate": 0.01, "loss": 2.0894, "step": 12858 }, { "epoch": 1.3198891625615763, "grad_norm": 0.06753168255090714, "learning_rate": 0.01, "loss": 2.0546, "step": 12861 }, { "epoch": 1.3201970443349753, "grad_norm": 0.14175836741924286, "learning_rate": 0.01, "loss": 2.0811, "step": 12864 }, { "epoch": 1.3205049261083743, "grad_norm": 0.04258207604289055, "learning_rate": 0.01, "loss": 2.0831, "step": 12867 }, { "epoch": 1.3208128078817734, "grad_norm": 0.03372815623879433, "learning_rate": 0.01, "loss": 2.0827, "step": 12870 }, { "epoch": 1.3211206896551724, "grad_norm": 0.040649689733982086, "learning_rate": 0.01, "loss": 2.067, "step": 12873 }, { "epoch": 1.3214285714285714, "grad_norm": 0.060684412717819214, "learning_rate": 0.01, "loss": 2.0701, "step": 12876 }, { "epoch": 1.3217364532019704, "grad_norm": 0.036452196538448334, "learning_rate": 0.01, "loss": 2.0726, "step": 12879 }, { "epoch": 1.3220443349753694, "grad_norm": 0.09457682818174362, "learning_rate": 0.01, "loss": 2.0709, "step": 12882 }, { "epoch": 1.3223522167487685, "grad_norm": 0.049776870757341385, "learning_rate": 0.01, "loss": 2.0354, "step": 12885 }, { "epoch": 1.3226600985221675, "grad_norm": 0.05794042348861694, "learning_rate": 0.01, "loss": 2.0781, "step": 12888 }, { "epoch": 1.3229679802955665, "grad_norm": 0.052100833505392075, "learning_rate": 0.01, "loss": 2.0801, "step": 12891 }, { "epoch": 1.3232758620689655, "grad_norm": 0.06558441370725632, "learning_rate": 0.01, "loss": 2.076, "step": 12894 }, { "epoch": 1.3235837438423645, "grad_norm": 0.1107778549194336, "learning_rate": 0.01, "loss": 2.0859, "step": 12897 }, { "epoch": 1.3238916256157636, "grad_norm": 0.052982959896326065, "learning_rate": 0.01, "loss": 2.0749, "step": 12900 }, { "epoch": 1.3241995073891626, "grad_norm": 0.0976911410689354, "learning_rate": 0.01, "loss": 2.067, "step": 12903 }, { "epoch": 1.3245073891625616, "grad_norm": 0.05620484799146652, "learning_rate": 0.01, "loss": 2.0892, "step": 12906 }, { "epoch": 1.3248152709359606, "grad_norm": 0.12753431499004364, "learning_rate": 0.01, "loss": 2.073, "step": 12909 }, { "epoch": 1.3251231527093597, "grad_norm": 0.14311249554157257, "learning_rate": 0.01, "loss": 2.0641, "step": 12912 }, { "epoch": 1.3254310344827587, "grad_norm": 0.049242276698350906, "learning_rate": 0.01, "loss": 2.0701, "step": 12915 }, { "epoch": 1.3257389162561577, "grad_norm": 0.07875422388315201, "learning_rate": 0.01, "loss": 2.0889, "step": 12918 }, { "epoch": 1.3260467980295567, "grad_norm": 0.036451369524002075, "learning_rate": 0.01, "loss": 2.0634, "step": 12921 }, { "epoch": 1.3263546798029557, "grad_norm": 0.03659353405237198, "learning_rate": 0.01, "loss": 2.0725, "step": 12924 }, { "epoch": 1.3266625615763548, "grad_norm": 0.049371130764484406, "learning_rate": 0.01, "loss": 2.0531, "step": 12927 }, { "epoch": 1.3269704433497536, "grad_norm": 0.05998126044869423, "learning_rate": 0.01, "loss": 2.0894, "step": 12930 }, { "epoch": 1.3272783251231526, "grad_norm": 0.08878383040428162, "learning_rate": 0.01, "loss": 2.0627, "step": 12933 }, { "epoch": 1.3275862068965516, "grad_norm": 0.059183619916439056, "learning_rate": 0.01, "loss": 2.0829, "step": 12936 }, { "epoch": 1.3278940886699506, "grad_norm": 0.05783310905098915, "learning_rate": 0.01, "loss": 2.0972, "step": 12939 }, { "epoch": 1.3282019704433496, "grad_norm": 0.07206647843122482, "learning_rate": 0.01, "loss": 2.0667, "step": 12942 }, { "epoch": 1.3285098522167487, "grad_norm": 0.07303550839424133, "learning_rate": 0.01, "loss": 2.0745, "step": 12945 }, { "epoch": 1.3288177339901477, "grad_norm": 0.04319525510072708, "learning_rate": 0.01, "loss": 2.0673, "step": 12948 }, { "epoch": 1.3291256157635467, "grad_norm": 0.044222913682460785, "learning_rate": 0.01, "loss": 2.0856, "step": 12951 }, { "epoch": 1.3294334975369457, "grad_norm": 0.08791283518075943, "learning_rate": 0.01, "loss": 2.0841, "step": 12954 }, { "epoch": 1.3297413793103448, "grad_norm": 0.05172525718808174, "learning_rate": 0.01, "loss": 2.0828, "step": 12957 }, { "epoch": 1.3300492610837438, "grad_norm": 0.053524646908044815, "learning_rate": 0.01, "loss": 2.0504, "step": 12960 }, { "epoch": 1.3303571428571428, "grad_norm": 0.03858666867017746, "learning_rate": 0.01, "loss": 2.1045, "step": 12963 }, { "epoch": 1.3306650246305418, "grad_norm": 0.037037089467048645, "learning_rate": 0.01, "loss": 2.0742, "step": 12966 }, { "epoch": 1.3309729064039408, "grad_norm": 0.04609520733356476, "learning_rate": 0.01, "loss": 2.0474, "step": 12969 }, { "epoch": 1.3312807881773399, "grad_norm": 0.03773853927850723, "learning_rate": 0.01, "loss": 2.0573, "step": 12972 }, { "epoch": 1.3315886699507389, "grad_norm": 0.040222879499197006, "learning_rate": 0.01, "loss": 2.073, "step": 12975 }, { "epoch": 1.331896551724138, "grad_norm": 0.11522398144006729, "learning_rate": 0.01, "loss": 2.0523, "step": 12978 }, { "epoch": 1.332204433497537, "grad_norm": 0.0496886670589447, "learning_rate": 0.01, "loss": 2.0873, "step": 12981 }, { "epoch": 1.332512315270936, "grad_norm": 0.0955866202712059, "learning_rate": 0.01, "loss": 2.0735, "step": 12984 }, { "epoch": 1.332820197044335, "grad_norm": 0.06260306388139725, "learning_rate": 0.01, "loss": 2.0696, "step": 12987 }, { "epoch": 1.333128078817734, "grad_norm": 0.049784719944000244, "learning_rate": 0.01, "loss": 2.0807, "step": 12990 }, { "epoch": 1.333435960591133, "grad_norm": 0.04571852833032608, "learning_rate": 0.01, "loss": 2.0648, "step": 12993 }, { "epoch": 1.333743842364532, "grad_norm": 0.11032246053218842, "learning_rate": 0.01, "loss": 2.0777, "step": 12996 }, { "epoch": 1.334051724137931, "grad_norm": 0.04030182585120201, "learning_rate": 0.01, "loss": 2.0491, "step": 12999 }, { "epoch": 1.33435960591133, "grad_norm": 0.06681946665048599, "learning_rate": 0.01, "loss": 2.051, "step": 13002 }, { "epoch": 1.334667487684729, "grad_norm": 0.04532696306705475, "learning_rate": 0.01, "loss": 2.0322, "step": 13005 }, { "epoch": 1.3349753694581281, "grad_norm": 0.03890594094991684, "learning_rate": 0.01, "loss": 2.0792, "step": 13008 }, { "epoch": 1.3352832512315271, "grad_norm": 0.08290864527225494, "learning_rate": 0.01, "loss": 2.0903, "step": 13011 }, { "epoch": 1.3355911330049262, "grad_norm": 0.09402919560670853, "learning_rate": 0.01, "loss": 2.0678, "step": 13014 }, { "epoch": 1.3358990147783252, "grad_norm": 0.06945643573999405, "learning_rate": 0.01, "loss": 2.0823, "step": 13017 }, { "epoch": 1.3362068965517242, "grad_norm": 0.0540471225976944, "learning_rate": 0.01, "loss": 2.0705, "step": 13020 }, { "epoch": 1.3365147783251232, "grad_norm": 0.04104168713092804, "learning_rate": 0.01, "loss": 2.0808, "step": 13023 }, { "epoch": 1.3368226600985222, "grad_norm": 0.04578167945146561, "learning_rate": 0.01, "loss": 2.0821, "step": 13026 }, { "epoch": 1.3371305418719213, "grad_norm": 0.07289981096982956, "learning_rate": 0.01, "loss": 2.0686, "step": 13029 }, { "epoch": 1.3374384236453203, "grad_norm": 0.09114310890436172, "learning_rate": 0.01, "loss": 2.0435, "step": 13032 }, { "epoch": 1.3377463054187193, "grad_norm": 0.06305088102817535, "learning_rate": 0.01, "loss": 2.0593, "step": 13035 }, { "epoch": 1.3380541871921183, "grad_norm": 0.06495746225118637, "learning_rate": 0.01, "loss": 2.0728, "step": 13038 }, { "epoch": 1.3383620689655173, "grad_norm": 0.05586539953947067, "learning_rate": 0.01, "loss": 2.0747, "step": 13041 }, { "epoch": 1.3386699507389164, "grad_norm": 0.04566524177789688, "learning_rate": 0.01, "loss": 2.0576, "step": 13044 }, { "epoch": 1.3389778325123154, "grad_norm": 0.060839373618364334, "learning_rate": 0.01, "loss": 2.0717, "step": 13047 }, { "epoch": 1.3392857142857144, "grad_norm": 0.11224903166294098, "learning_rate": 0.01, "loss": 2.0878, "step": 13050 }, { "epoch": 1.3395935960591134, "grad_norm": 0.09223728626966476, "learning_rate": 0.01, "loss": 2.0531, "step": 13053 }, { "epoch": 1.3399014778325122, "grad_norm": 0.0413731187582016, "learning_rate": 0.01, "loss": 2.0495, "step": 13056 }, { "epoch": 1.3402093596059113, "grad_norm": 0.050937358289957047, "learning_rate": 0.01, "loss": 2.0774, "step": 13059 }, { "epoch": 1.3405172413793103, "grad_norm": 0.0407971516251564, "learning_rate": 0.01, "loss": 2.0766, "step": 13062 }, { "epoch": 1.3408251231527093, "grad_norm": 0.0623883455991745, "learning_rate": 0.01, "loss": 2.0691, "step": 13065 }, { "epoch": 1.3411330049261083, "grad_norm": 0.09325427561998367, "learning_rate": 0.01, "loss": 2.0731, "step": 13068 }, { "epoch": 1.3414408866995073, "grad_norm": 0.06965765357017517, "learning_rate": 0.01, "loss": 2.0653, "step": 13071 }, { "epoch": 1.3417487684729064, "grad_norm": 0.12671297788619995, "learning_rate": 0.01, "loss": 2.1028, "step": 13074 }, { "epoch": 1.3420566502463054, "grad_norm": 0.04154878482222557, "learning_rate": 0.01, "loss": 2.0783, "step": 13077 }, { "epoch": 1.3423645320197044, "grad_norm": 0.04698561131954193, "learning_rate": 0.01, "loss": 2.0799, "step": 13080 }, { "epoch": 1.3426724137931034, "grad_norm": 0.031127501279115677, "learning_rate": 0.01, "loss": 2.0756, "step": 13083 }, { "epoch": 1.3429802955665024, "grad_norm": 0.05258537083864212, "learning_rate": 0.01, "loss": 2.0821, "step": 13086 }, { "epoch": 1.3432881773399015, "grad_norm": 0.06848637759685516, "learning_rate": 0.01, "loss": 2.0629, "step": 13089 }, { "epoch": 1.3435960591133005, "grad_norm": 0.07738485932350159, "learning_rate": 0.01, "loss": 2.0674, "step": 13092 }, { "epoch": 1.3439039408866995, "grad_norm": 0.09635680168867111, "learning_rate": 0.01, "loss": 2.0782, "step": 13095 }, { "epoch": 1.3442118226600985, "grad_norm": 0.04388611391186714, "learning_rate": 0.01, "loss": 2.0575, "step": 13098 }, { "epoch": 1.3445197044334976, "grad_norm": 0.0776490792632103, "learning_rate": 0.01, "loss": 2.0753, "step": 13101 }, { "epoch": 1.3448275862068966, "grad_norm": 0.11331035196781158, "learning_rate": 0.01, "loss": 2.053, "step": 13104 }, { "epoch": 1.3451354679802956, "grad_norm": 0.04267279431223869, "learning_rate": 0.01, "loss": 2.0812, "step": 13107 }, { "epoch": 1.3454433497536946, "grad_norm": 0.05454112961888313, "learning_rate": 0.01, "loss": 2.0711, "step": 13110 }, { "epoch": 1.3457512315270936, "grad_norm": 0.07470305263996124, "learning_rate": 0.01, "loss": 2.0878, "step": 13113 }, { "epoch": 1.3460591133004927, "grad_norm": 0.057337477803230286, "learning_rate": 0.01, "loss": 2.0607, "step": 13116 }, { "epoch": 1.3463669950738917, "grad_norm": 0.09155120700597763, "learning_rate": 0.01, "loss": 2.0898, "step": 13119 }, { "epoch": 1.3466748768472907, "grad_norm": 0.09644894301891327, "learning_rate": 0.01, "loss": 2.0905, "step": 13122 }, { "epoch": 1.3469827586206897, "grad_norm": 0.0579628124833107, "learning_rate": 0.01, "loss": 2.0834, "step": 13125 }, { "epoch": 1.3472906403940887, "grad_norm": 0.09968624264001846, "learning_rate": 0.01, "loss": 2.0797, "step": 13128 }, { "epoch": 1.3475985221674878, "grad_norm": 0.04834052175283432, "learning_rate": 0.01, "loss": 2.0552, "step": 13131 }, { "epoch": 1.3479064039408866, "grad_norm": 0.05561887100338936, "learning_rate": 0.01, "loss": 2.0564, "step": 13134 }, { "epoch": 1.3482142857142856, "grad_norm": 0.14990638196468353, "learning_rate": 0.01, "loss": 2.0738, "step": 13137 }, { "epoch": 1.3485221674876846, "grad_norm": 0.07530777156352997, "learning_rate": 0.01, "loss": 2.0765, "step": 13140 }, { "epoch": 1.3488300492610836, "grad_norm": 0.09080106765031815, "learning_rate": 0.01, "loss": 2.083, "step": 13143 }, { "epoch": 1.3491379310344827, "grad_norm": 0.042014699429273605, "learning_rate": 0.01, "loss": 2.0608, "step": 13146 }, { "epoch": 1.3494458128078817, "grad_norm": 0.08905219286680222, "learning_rate": 0.01, "loss": 2.0657, "step": 13149 }, { "epoch": 1.3497536945812807, "grad_norm": 0.1093059629201889, "learning_rate": 0.01, "loss": 2.102, "step": 13152 }, { "epoch": 1.3500615763546797, "grad_norm": 0.09834617376327515, "learning_rate": 0.01, "loss": 2.0601, "step": 13155 }, { "epoch": 1.3503694581280787, "grad_norm": 0.0754542201757431, "learning_rate": 0.01, "loss": 2.103, "step": 13158 }, { "epoch": 1.3506773399014778, "grad_norm": 0.09639342129230499, "learning_rate": 0.01, "loss": 2.0743, "step": 13161 }, { "epoch": 1.3509852216748768, "grad_norm": 0.05084405094385147, "learning_rate": 0.01, "loss": 2.0898, "step": 13164 }, { "epoch": 1.3512931034482758, "grad_norm": 0.04796381667256355, "learning_rate": 0.01, "loss": 2.0788, "step": 13167 }, { "epoch": 1.3516009852216748, "grad_norm": 0.05373486131429672, "learning_rate": 0.01, "loss": 2.0632, "step": 13170 }, { "epoch": 1.3519088669950738, "grad_norm": 0.05145580321550369, "learning_rate": 0.01, "loss": 2.0383, "step": 13173 }, { "epoch": 1.3522167487684729, "grad_norm": 0.03861214593052864, "learning_rate": 0.01, "loss": 2.0678, "step": 13176 }, { "epoch": 1.3525246305418719, "grad_norm": 0.04394346475601196, "learning_rate": 0.01, "loss": 2.0601, "step": 13179 }, { "epoch": 1.352832512315271, "grad_norm": 0.08851804584264755, "learning_rate": 0.01, "loss": 2.0374, "step": 13182 }, { "epoch": 1.35314039408867, "grad_norm": 0.059799451380968094, "learning_rate": 0.01, "loss": 2.0735, "step": 13185 }, { "epoch": 1.353448275862069, "grad_norm": 0.13764169812202454, "learning_rate": 0.01, "loss": 2.1001, "step": 13188 }, { "epoch": 1.353756157635468, "grad_norm": 0.05652278661727905, "learning_rate": 0.01, "loss": 2.0771, "step": 13191 }, { "epoch": 1.354064039408867, "grad_norm": 0.04755775257945061, "learning_rate": 0.01, "loss": 2.0467, "step": 13194 }, { "epoch": 1.354371921182266, "grad_norm": 0.058131635189056396, "learning_rate": 0.01, "loss": 2.0884, "step": 13197 }, { "epoch": 1.354679802955665, "grad_norm": 0.041266053915023804, "learning_rate": 0.01, "loss": 2.0684, "step": 13200 }, { "epoch": 1.354987684729064, "grad_norm": 0.034990034997463226, "learning_rate": 0.01, "loss": 2.0576, "step": 13203 }, { "epoch": 1.355295566502463, "grad_norm": 0.13107064366340637, "learning_rate": 0.01, "loss": 2.0616, "step": 13206 }, { "epoch": 1.355603448275862, "grad_norm": 0.05397200584411621, "learning_rate": 0.01, "loss": 2.0953, "step": 13209 }, { "epoch": 1.3559113300492611, "grad_norm": 0.04137737303972244, "learning_rate": 0.01, "loss": 2.0538, "step": 13212 }, { "epoch": 1.3562192118226601, "grad_norm": 0.05001407861709595, "learning_rate": 0.01, "loss": 2.0809, "step": 13215 }, { "epoch": 1.3565270935960592, "grad_norm": 0.10387953370809555, "learning_rate": 0.01, "loss": 2.0818, "step": 13218 }, { "epoch": 1.3568349753694582, "grad_norm": 0.052998363971710205, "learning_rate": 0.01, "loss": 2.0711, "step": 13221 }, { "epoch": 1.3571428571428572, "grad_norm": 0.06805765628814697, "learning_rate": 0.01, "loss": 2.0604, "step": 13224 }, { "epoch": 1.3574507389162562, "grad_norm": 0.06597940623760223, "learning_rate": 0.01, "loss": 2.0701, "step": 13227 }, { "epoch": 1.3577586206896552, "grad_norm": 0.10083628445863724, "learning_rate": 0.01, "loss": 2.086, "step": 13230 }, { "epoch": 1.3580665024630543, "grad_norm": 0.05467986315488815, "learning_rate": 0.01, "loss": 2.072, "step": 13233 }, { "epoch": 1.3583743842364533, "grad_norm": 0.08951261639595032, "learning_rate": 0.01, "loss": 2.1256, "step": 13236 }, { "epoch": 1.3586822660098523, "grad_norm": 0.052532244473695755, "learning_rate": 0.01, "loss": 2.0555, "step": 13239 }, { "epoch": 1.3589901477832513, "grad_norm": 0.038159146904945374, "learning_rate": 0.01, "loss": 2.062, "step": 13242 }, { "epoch": 1.3592980295566504, "grad_norm": 0.09895820915699005, "learning_rate": 0.01, "loss": 2.0559, "step": 13245 }, { "epoch": 1.3596059113300494, "grad_norm": 0.07522387057542801, "learning_rate": 0.01, "loss": 2.0634, "step": 13248 }, { "epoch": 1.3599137931034484, "grad_norm": 0.04762687534093857, "learning_rate": 0.01, "loss": 2.0848, "step": 13251 }, { "epoch": 1.3602216748768474, "grad_norm": 0.0716032013297081, "learning_rate": 0.01, "loss": 2.0783, "step": 13254 }, { "epoch": 1.3605295566502464, "grad_norm": 0.08518968522548676, "learning_rate": 0.01, "loss": 2.0818, "step": 13257 }, { "epoch": 1.3608374384236452, "grad_norm": 0.06927520781755447, "learning_rate": 0.01, "loss": 2.0728, "step": 13260 }, { "epoch": 1.3611453201970443, "grad_norm": 0.10368376970291138, "learning_rate": 0.01, "loss": 2.0419, "step": 13263 }, { "epoch": 1.3614532019704433, "grad_norm": 0.04249117895960808, "learning_rate": 0.01, "loss": 2.0695, "step": 13266 }, { "epoch": 1.3617610837438423, "grad_norm": 0.06504488736391068, "learning_rate": 0.01, "loss": 2.0658, "step": 13269 }, { "epoch": 1.3620689655172413, "grad_norm": 0.03990466147661209, "learning_rate": 0.01, "loss": 2.0796, "step": 13272 }, { "epoch": 1.3623768472906403, "grad_norm": 0.042559072375297546, "learning_rate": 0.01, "loss": 2.0643, "step": 13275 }, { "epoch": 1.3626847290640394, "grad_norm": 0.046650230884552, "learning_rate": 0.01, "loss": 2.0756, "step": 13278 }, { "epoch": 1.3629926108374384, "grad_norm": 0.08641167730093002, "learning_rate": 0.01, "loss": 2.0638, "step": 13281 }, { "epoch": 1.3633004926108374, "grad_norm": 0.11438708007335663, "learning_rate": 0.01, "loss": 2.0559, "step": 13284 }, { "epoch": 1.3636083743842364, "grad_norm": 0.05360870808362961, "learning_rate": 0.01, "loss": 2.0534, "step": 13287 }, { "epoch": 1.3639162561576355, "grad_norm": 0.07226021587848663, "learning_rate": 0.01, "loss": 2.0569, "step": 13290 }, { "epoch": 1.3642241379310345, "grad_norm": 0.04532739892601967, "learning_rate": 0.01, "loss": 2.0504, "step": 13293 }, { "epoch": 1.3645320197044335, "grad_norm": 0.06119906157255173, "learning_rate": 0.01, "loss": 2.0507, "step": 13296 }, { "epoch": 1.3648399014778325, "grad_norm": 0.05576052516698837, "learning_rate": 0.01, "loss": 2.0454, "step": 13299 }, { "epoch": 1.3651477832512315, "grad_norm": 0.038748834282159805, "learning_rate": 0.01, "loss": 2.0632, "step": 13302 }, { "epoch": 1.3654556650246306, "grad_norm": 0.09733711183071136, "learning_rate": 0.01, "loss": 2.0733, "step": 13305 }, { "epoch": 1.3657635467980296, "grad_norm": 0.043375931680202484, "learning_rate": 0.01, "loss": 2.0629, "step": 13308 }, { "epoch": 1.3660714285714286, "grad_norm": 0.11930018663406372, "learning_rate": 0.01, "loss": 2.0931, "step": 13311 }, { "epoch": 1.3663793103448276, "grad_norm": 0.06754540652036667, "learning_rate": 0.01, "loss": 2.0575, "step": 13314 }, { "epoch": 1.3666871921182266, "grad_norm": 0.07226148992776871, "learning_rate": 0.01, "loss": 2.0786, "step": 13317 }, { "epoch": 1.3669950738916257, "grad_norm": 0.08159705251455307, "learning_rate": 0.01, "loss": 2.0594, "step": 13320 }, { "epoch": 1.3673029556650247, "grad_norm": 0.044994477182626724, "learning_rate": 0.01, "loss": 2.0782, "step": 13323 }, { "epoch": 1.3676108374384237, "grad_norm": 0.05308050662279129, "learning_rate": 0.01, "loss": 2.0645, "step": 13326 }, { "epoch": 1.3679187192118227, "grad_norm": 0.09141236543655396, "learning_rate": 0.01, "loss": 2.0801, "step": 13329 }, { "epoch": 1.3682266009852218, "grad_norm": 0.040702883154153824, "learning_rate": 0.01, "loss": 2.0497, "step": 13332 }, { "epoch": 1.3685344827586206, "grad_norm": 0.06111524999141693, "learning_rate": 0.01, "loss": 2.0633, "step": 13335 }, { "epoch": 1.3688423645320196, "grad_norm": 0.10802275687456131, "learning_rate": 0.01, "loss": 2.0541, "step": 13338 }, { "epoch": 1.3691502463054186, "grad_norm": 0.09049344807863235, "learning_rate": 0.01, "loss": 2.0636, "step": 13341 }, { "epoch": 1.3694581280788176, "grad_norm": 0.055894456803798676, "learning_rate": 0.01, "loss": 2.0675, "step": 13344 }, { "epoch": 1.3697660098522166, "grad_norm": 0.054729919880628586, "learning_rate": 0.01, "loss": 2.0674, "step": 13347 }, { "epoch": 1.3700738916256157, "grad_norm": 0.05745011568069458, "learning_rate": 0.01, "loss": 2.0707, "step": 13350 }, { "epoch": 1.3703817733990147, "grad_norm": 0.06573651731014252, "learning_rate": 0.01, "loss": 2.0493, "step": 13353 }, { "epoch": 1.3706896551724137, "grad_norm": 0.20495210587978363, "learning_rate": 0.01, "loss": 2.0798, "step": 13356 }, { "epoch": 1.3709975369458127, "grad_norm": 0.10678639262914658, "learning_rate": 0.01, "loss": 2.0499, "step": 13359 }, { "epoch": 1.3713054187192117, "grad_norm": 0.10948281735181808, "learning_rate": 0.01, "loss": 2.087, "step": 13362 }, { "epoch": 1.3716133004926108, "grad_norm": 0.07788719981908798, "learning_rate": 0.01, "loss": 2.0745, "step": 13365 }, { "epoch": 1.3719211822660098, "grad_norm": 0.04947768524289131, "learning_rate": 0.01, "loss": 2.0608, "step": 13368 }, { "epoch": 1.3722290640394088, "grad_norm": 0.04789843037724495, "learning_rate": 0.01, "loss": 2.0607, "step": 13371 }, { "epoch": 1.3725369458128078, "grad_norm": 0.05217898637056351, "learning_rate": 0.01, "loss": 2.058, "step": 13374 }, { "epoch": 1.3728448275862069, "grad_norm": 0.04018987715244293, "learning_rate": 0.01, "loss": 2.0782, "step": 13377 }, { "epoch": 1.3731527093596059, "grad_norm": 0.035446904599666595, "learning_rate": 0.01, "loss": 2.0668, "step": 13380 }, { "epoch": 1.373460591133005, "grad_norm": 0.038600854575634, "learning_rate": 0.01, "loss": 2.0865, "step": 13383 }, { "epoch": 1.373768472906404, "grad_norm": 0.055341143161058426, "learning_rate": 0.01, "loss": 2.0591, "step": 13386 }, { "epoch": 1.374076354679803, "grad_norm": 0.11673317849636078, "learning_rate": 0.01, "loss": 2.0897, "step": 13389 }, { "epoch": 1.374384236453202, "grad_norm": 0.06797752529382706, "learning_rate": 0.01, "loss": 2.0676, "step": 13392 }, { "epoch": 1.374692118226601, "grad_norm": 0.041885413229465485, "learning_rate": 0.01, "loss": 2.0245, "step": 13395 }, { "epoch": 1.375, "grad_norm": 0.07391481846570969, "learning_rate": 0.01, "loss": 2.067, "step": 13398 }, { "epoch": 1.375307881773399, "grad_norm": 0.07959283888339996, "learning_rate": 0.01, "loss": 2.0192, "step": 13401 }, { "epoch": 1.375615763546798, "grad_norm": 0.09504754096269608, "learning_rate": 0.01, "loss": 2.0841, "step": 13404 }, { "epoch": 1.375923645320197, "grad_norm": 0.08874071389436722, "learning_rate": 0.01, "loss": 2.0473, "step": 13407 }, { "epoch": 1.376231527093596, "grad_norm": 0.05350710079073906, "learning_rate": 0.01, "loss": 2.0763, "step": 13410 }, { "epoch": 1.376539408866995, "grad_norm": 0.06738609820604324, "learning_rate": 0.01, "loss": 2.0782, "step": 13413 }, { "epoch": 1.3768472906403941, "grad_norm": 0.04335073009133339, "learning_rate": 0.01, "loss": 2.0601, "step": 13416 }, { "epoch": 1.3771551724137931, "grad_norm": 0.056045398116111755, "learning_rate": 0.01, "loss": 2.0588, "step": 13419 }, { "epoch": 1.3774630541871922, "grad_norm": 0.06593155860900879, "learning_rate": 0.01, "loss": 2.0637, "step": 13422 }, { "epoch": 1.3777709359605912, "grad_norm": 0.08942624181509018, "learning_rate": 0.01, "loss": 2.0724, "step": 13425 }, { "epoch": 1.3780788177339902, "grad_norm": 0.10098058730363846, "learning_rate": 0.01, "loss": 2.064, "step": 13428 }, { "epoch": 1.3783866995073892, "grad_norm": 0.04804680123925209, "learning_rate": 0.01, "loss": 2.0684, "step": 13431 }, { "epoch": 1.3786945812807883, "grad_norm": 0.08567364513874054, "learning_rate": 0.01, "loss": 2.0617, "step": 13434 }, { "epoch": 1.3790024630541873, "grad_norm": 0.06027091667056084, "learning_rate": 0.01, "loss": 2.0666, "step": 13437 }, { "epoch": 1.3793103448275863, "grad_norm": 0.08809462189674377, "learning_rate": 0.01, "loss": 2.0862, "step": 13440 }, { "epoch": 1.3796182266009853, "grad_norm": 0.053466469049453735, "learning_rate": 0.01, "loss": 2.0497, "step": 13443 }, { "epoch": 1.3799261083743843, "grad_norm": 0.033619511872529984, "learning_rate": 0.01, "loss": 2.0298, "step": 13446 }, { "epoch": 1.3802339901477834, "grad_norm": 0.04768878221511841, "learning_rate": 0.01, "loss": 2.054, "step": 13449 }, { "epoch": 1.3805418719211824, "grad_norm": 0.07854757457971573, "learning_rate": 0.01, "loss": 2.0538, "step": 13452 }, { "epoch": 1.3808497536945814, "grad_norm": 0.05409559607505798, "learning_rate": 0.01, "loss": 2.082, "step": 13455 }, { "epoch": 1.3811576354679804, "grad_norm": 0.057855118066072464, "learning_rate": 0.01, "loss": 2.0814, "step": 13458 }, { "epoch": 1.3814655172413794, "grad_norm": 0.047502126544713974, "learning_rate": 0.01, "loss": 2.0955, "step": 13461 }, { "epoch": 1.3817733990147782, "grad_norm": 0.040939487516880035, "learning_rate": 0.01, "loss": 2.0431, "step": 13464 }, { "epoch": 1.3820812807881773, "grad_norm": 0.1307850480079651, "learning_rate": 0.01, "loss": 2.0801, "step": 13467 }, { "epoch": 1.3823891625615763, "grad_norm": 0.04386845603585243, "learning_rate": 0.01, "loss": 2.0662, "step": 13470 }, { "epoch": 1.3826970443349753, "grad_norm": 0.08174968510866165, "learning_rate": 0.01, "loss": 2.0544, "step": 13473 }, { "epoch": 1.3830049261083743, "grad_norm": 0.1113237589597702, "learning_rate": 0.01, "loss": 2.0411, "step": 13476 }, { "epoch": 1.3833128078817734, "grad_norm": 0.06756308674812317, "learning_rate": 0.01, "loss": 2.0813, "step": 13479 }, { "epoch": 1.3836206896551724, "grad_norm": 0.05931835621595383, "learning_rate": 0.01, "loss": 2.0544, "step": 13482 }, { "epoch": 1.3839285714285714, "grad_norm": 0.043539129197597504, "learning_rate": 0.01, "loss": 2.0528, "step": 13485 }, { "epoch": 1.3842364532019704, "grad_norm": 0.0510721355676651, "learning_rate": 0.01, "loss": 2.075, "step": 13488 }, { "epoch": 1.3845443349753694, "grad_norm": 0.10811501741409302, "learning_rate": 0.01, "loss": 2.0626, "step": 13491 }, { "epoch": 1.3848522167487685, "grad_norm": 0.08322811871767044, "learning_rate": 0.01, "loss": 2.0882, "step": 13494 }, { "epoch": 1.3851600985221675, "grad_norm": 0.05101979896426201, "learning_rate": 0.01, "loss": 2.0677, "step": 13497 }, { "epoch": 1.3854679802955665, "grad_norm": 0.036535851657390594, "learning_rate": 0.01, "loss": 2.0791, "step": 13500 }, { "epoch": 1.3857758620689655, "grad_norm": 0.05161239951848984, "learning_rate": 0.01, "loss": 2.0513, "step": 13503 }, { "epoch": 1.3860837438423645, "grad_norm": 0.06677153706550598, "learning_rate": 0.01, "loss": 2.0581, "step": 13506 }, { "epoch": 1.3863916256157636, "grad_norm": 0.04239841178059578, "learning_rate": 0.01, "loss": 2.0383, "step": 13509 }, { "epoch": 1.3866995073891626, "grad_norm": 0.04252205416560173, "learning_rate": 0.01, "loss": 2.0875, "step": 13512 }, { "epoch": 1.3870073891625616, "grad_norm": 0.10147176682949066, "learning_rate": 0.01, "loss": 2.0631, "step": 13515 }, { "epoch": 1.3873152709359606, "grad_norm": 0.046371154487133026, "learning_rate": 0.01, "loss": 2.0784, "step": 13518 }, { "epoch": 1.3876231527093597, "grad_norm": 0.0997064933180809, "learning_rate": 0.01, "loss": 2.0578, "step": 13521 }, { "epoch": 1.3879310344827587, "grad_norm": 0.05733582749962807, "learning_rate": 0.01, "loss": 2.0742, "step": 13524 }, { "epoch": 1.3882389162561577, "grad_norm": 0.05061260983347893, "learning_rate": 0.01, "loss": 2.0448, "step": 13527 }, { "epoch": 1.3885467980295567, "grad_norm": 0.04336051642894745, "learning_rate": 0.01, "loss": 2.0846, "step": 13530 }, { "epoch": 1.3888546798029557, "grad_norm": 0.04115337133407593, "learning_rate": 0.01, "loss": 2.0577, "step": 13533 }, { "epoch": 1.3891625615763548, "grad_norm": 0.04914069175720215, "learning_rate": 0.01, "loss": 2.0815, "step": 13536 }, { "epoch": 1.3894704433497536, "grad_norm": 0.0677042305469513, "learning_rate": 0.01, "loss": 2.0754, "step": 13539 }, { "epoch": 1.3897783251231526, "grad_norm": 0.04985208064317703, "learning_rate": 0.01, "loss": 2.0633, "step": 13542 }, { "epoch": 1.3900862068965516, "grad_norm": 0.08042199164628983, "learning_rate": 0.01, "loss": 2.0828, "step": 13545 }, { "epoch": 1.3903940886699506, "grad_norm": 0.03648814186453819, "learning_rate": 0.01, "loss": 2.0669, "step": 13548 }, { "epoch": 1.3907019704433496, "grad_norm": 0.03645399957895279, "learning_rate": 0.01, "loss": 2.0495, "step": 13551 }, { "epoch": 1.3910098522167487, "grad_norm": 0.04866683483123779, "learning_rate": 0.01, "loss": 2.0531, "step": 13554 }, { "epoch": 1.3913177339901477, "grad_norm": 0.07728299498558044, "learning_rate": 0.01, "loss": 2.0671, "step": 13557 }, { "epoch": 1.3916256157635467, "grad_norm": 0.12097810208797455, "learning_rate": 0.01, "loss": 2.0935, "step": 13560 }, { "epoch": 1.3919334975369457, "grad_norm": 0.12485776096582413, "learning_rate": 0.01, "loss": 2.0712, "step": 13563 }, { "epoch": 1.3922413793103448, "grad_norm": 0.053524475544691086, "learning_rate": 0.01, "loss": 2.0595, "step": 13566 }, { "epoch": 1.3925492610837438, "grad_norm": 0.04277713969349861, "learning_rate": 0.01, "loss": 2.0791, "step": 13569 }, { "epoch": 1.3928571428571428, "grad_norm": 0.09847384691238403, "learning_rate": 0.01, "loss": 2.0754, "step": 13572 }, { "epoch": 1.3931650246305418, "grad_norm": 0.03410463035106659, "learning_rate": 0.01, "loss": 2.0556, "step": 13575 }, { "epoch": 1.3934729064039408, "grad_norm": 0.10606678575277328, "learning_rate": 0.01, "loss": 2.0637, "step": 13578 }, { "epoch": 1.3937807881773399, "grad_norm": 0.06554549187421799, "learning_rate": 0.01, "loss": 2.0549, "step": 13581 }, { "epoch": 1.3940886699507389, "grad_norm": 0.07487329095602036, "learning_rate": 0.01, "loss": 2.0645, "step": 13584 }, { "epoch": 1.394396551724138, "grad_norm": 0.07526996731758118, "learning_rate": 0.01, "loss": 2.0733, "step": 13587 }, { "epoch": 1.394704433497537, "grad_norm": 0.0581665076315403, "learning_rate": 0.01, "loss": 2.0491, "step": 13590 }, { "epoch": 1.395012315270936, "grad_norm": 0.057513732463121414, "learning_rate": 0.01, "loss": 2.087, "step": 13593 }, { "epoch": 1.395320197044335, "grad_norm": 0.037192508578300476, "learning_rate": 0.01, "loss": 2.0487, "step": 13596 }, { "epoch": 1.395628078817734, "grad_norm": 0.0965125560760498, "learning_rate": 0.01, "loss": 2.0549, "step": 13599 }, { "epoch": 1.395935960591133, "grad_norm": 0.04594407603144646, "learning_rate": 0.01, "loss": 2.067, "step": 13602 }, { "epoch": 1.396243842364532, "grad_norm": 0.08442319929599762, "learning_rate": 0.01, "loss": 2.0627, "step": 13605 }, { "epoch": 1.396551724137931, "grad_norm": 0.08673713356256485, "learning_rate": 0.01, "loss": 2.054, "step": 13608 }, { "epoch": 1.39685960591133, "grad_norm": 0.07299968600273132, "learning_rate": 0.01, "loss": 2.0672, "step": 13611 }, { "epoch": 1.397167487684729, "grad_norm": 0.052630744874477386, "learning_rate": 0.01, "loss": 2.079, "step": 13614 }, { "epoch": 1.3974753694581281, "grad_norm": 0.0626215934753418, "learning_rate": 0.01, "loss": 2.0732, "step": 13617 }, { "epoch": 1.3977832512315271, "grad_norm": 0.0866907387971878, "learning_rate": 0.01, "loss": 2.0743, "step": 13620 }, { "epoch": 1.3980911330049262, "grad_norm": 0.05650071054697037, "learning_rate": 0.01, "loss": 2.0856, "step": 13623 }, { "epoch": 1.3983990147783252, "grad_norm": 0.07526635378599167, "learning_rate": 0.01, "loss": 2.0707, "step": 13626 }, { "epoch": 1.3987068965517242, "grad_norm": 0.07472112774848938, "learning_rate": 0.01, "loss": 2.0608, "step": 13629 }, { "epoch": 1.3990147783251232, "grad_norm": 0.07251216471195221, "learning_rate": 0.01, "loss": 2.086, "step": 13632 }, { "epoch": 1.3993226600985222, "grad_norm": 0.08701921999454498, "learning_rate": 0.01, "loss": 2.0382, "step": 13635 }, { "epoch": 1.3996305418719213, "grad_norm": 0.033323436975479126, "learning_rate": 0.01, "loss": 2.0716, "step": 13638 }, { "epoch": 1.3999384236453203, "grad_norm": 0.04960713908076286, "learning_rate": 0.01, "loss": 2.0557, "step": 13641 }, { "epoch": 1.4002463054187193, "grad_norm": 0.1418198049068451, "learning_rate": 0.01, "loss": 2.0528, "step": 13644 }, { "epoch": 1.4005541871921183, "grad_norm": 0.06056910380721092, "learning_rate": 0.01, "loss": 2.0825, "step": 13647 }, { "epoch": 1.4008620689655173, "grad_norm": 0.062474992126226425, "learning_rate": 0.01, "loss": 2.0195, "step": 13650 }, { "epoch": 1.4011699507389164, "grad_norm": 0.05380658805370331, "learning_rate": 0.01, "loss": 2.0644, "step": 13653 }, { "epoch": 1.4014778325123154, "grad_norm": 0.046230513602495193, "learning_rate": 0.01, "loss": 2.062, "step": 13656 }, { "epoch": 1.4017857142857144, "grad_norm": 0.05238807573914528, "learning_rate": 0.01, "loss": 2.0976, "step": 13659 }, { "epoch": 1.4020935960591134, "grad_norm": 0.045423392206430435, "learning_rate": 0.01, "loss": 2.0729, "step": 13662 }, { "epoch": 1.4024014778325122, "grad_norm": 0.1077577993273735, "learning_rate": 0.01, "loss": 2.0481, "step": 13665 }, { "epoch": 1.4027093596059113, "grad_norm": 0.04370421916246414, "learning_rate": 0.01, "loss": 2.0634, "step": 13668 }, { "epoch": 1.4030172413793103, "grad_norm": 0.061496302485466, "learning_rate": 0.01, "loss": 2.0449, "step": 13671 }, { "epoch": 1.4033251231527093, "grad_norm": 0.048742834478616714, "learning_rate": 0.01, "loss": 2.0683, "step": 13674 }, { "epoch": 1.4036330049261083, "grad_norm": 0.14942513406276703, "learning_rate": 0.01, "loss": 2.0616, "step": 13677 }, { "epoch": 1.4039408866995073, "grad_norm": 0.04235846549272537, "learning_rate": 0.01, "loss": 2.0611, "step": 13680 }, { "epoch": 1.4042487684729064, "grad_norm": 0.05509978160262108, "learning_rate": 0.01, "loss": 2.0645, "step": 13683 }, { "epoch": 1.4045566502463054, "grad_norm": 0.09692233055830002, "learning_rate": 0.01, "loss": 2.0703, "step": 13686 }, { "epoch": 1.4048645320197044, "grad_norm": 0.11141908913850784, "learning_rate": 0.01, "loss": 2.0724, "step": 13689 }, { "epoch": 1.4051724137931034, "grad_norm": 0.06601562350988388, "learning_rate": 0.01, "loss": 2.0719, "step": 13692 }, { "epoch": 1.4054802955665024, "grad_norm": 0.04997260868549347, "learning_rate": 0.01, "loss": 2.056, "step": 13695 }, { "epoch": 1.4057881773399015, "grad_norm": 0.07198163866996765, "learning_rate": 0.01, "loss": 2.0546, "step": 13698 }, { "epoch": 1.4060960591133005, "grad_norm": 0.03802650794386864, "learning_rate": 0.01, "loss": 2.0696, "step": 13701 }, { "epoch": 1.4064039408866995, "grad_norm": 0.06259030848741531, "learning_rate": 0.01, "loss": 2.0459, "step": 13704 }, { "epoch": 1.4067118226600985, "grad_norm": 0.09554235637187958, "learning_rate": 0.01, "loss": 2.0476, "step": 13707 }, { "epoch": 1.4070197044334976, "grad_norm": 0.056935038417577744, "learning_rate": 0.01, "loss": 2.0522, "step": 13710 }, { "epoch": 1.4073275862068966, "grad_norm": 0.11038411408662796, "learning_rate": 0.01, "loss": 2.0567, "step": 13713 }, { "epoch": 1.4076354679802956, "grad_norm": 0.05257488042116165, "learning_rate": 0.01, "loss": 2.09, "step": 13716 }, { "epoch": 1.4079433497536946, "grad_norm": 0.0573866032063961, "learning_rate": 0.01, "loss": 2.0538, "step": 13719 }, { "epoch": 1.4082512315270936, "grad_norm": 0.04933631047606468, "learning_rate": 0.01, "loss": 2.0435, "step": 13722 }, { "epoch": 1.4085591133004927, "grad_norm": 0.05909980088472366, "learning_rate": 0.01, "loss": 2.0554, "step": 13725 }, { "epoch": 1.4088669950738917, "grad_norm": 0.09598751366138458, "learning_rate": 0.01, "loss": 2.0718, "step": 13728 }, { "epoch": 1.4091748768472907, "grad_norm": 0.05608231574296951, "learning_rate": 0.01, "loss": 2.0621, "step": 13731 }, { "epoch": 1.4094827586206897, "grad_norm": 0.08262834697961807, "learning_rate": 0.01, "loss": 2.0661, "step": 13734 }, { "epoch": 1.4097906403940887, "grad_norm": 0.041144959628582, "learning_rate": 0.01, "loss": 2.0646, "step": 13737 }, { "epoch": 1.4100985221674878, "grad_norm": 0.03748650476336479, "learning_rate": 0.01, "loss": 2.0558, "step": 13740 }, { "epoch": 1.4104064039408866, "grad_norm": 0.04054822400212288, "learning_rate": 0.01, "loss": 2.0564, "step": 13743 }, { "epoch": 1.4107142857142856, "grad_norm": 0.07961263507604599, "learning_rate": 0.01, "loss": 2.0513, "step": 13746 }, { "epoch": 1.4110221674876846, "grad_norm": 0.049971841275691986, "learning_rate": 0.01, "loss": 2.033, "step": 13749 }, { "epoch": 1.4113300492610836, "grad_norm": 0.040059734135866165, "learning_rate": 0.01, "loss": 2.0791, "step": 13752 }, { "epoch": 1.4116379310344827, "grad_norm": 0.0400179885327816, "learning_rate": 0.01, "loss": 2.0496, "step": 13755 }, { "epoch": 1.4119458128078817, "grad_norm": 0.04587862268090248, "learning_rate": 0.01, "loss": 2.0572, "step": 13758 }, { "epoch": 1.4122536945812807, "grad_norm": 0.08982817828655243, "learning_rate": 0.01, "loss": 2.0864, "step": 13761 }, { "epoch": 1.4125615763546797, "grad_norm": 0.05488836392760277, "learning_rate": 0.01, "loss": 2.0529, "step": 13764 }, { "epoch": 1.4128694581280787, "grad_norm": 0.06559593975543976, "learning_rate": 0.01, "loss": 2.0564, "step": 13767 }, { "epoch": 1.4131773399014778, "grad_norm": 0.10647718608379364, "learning_rate": 0.01, "loss": 2.0404, "step": 13770 }, { "epoch": 1.4134852216748768, "grad_norm": 0.05944173410534859, "learning_rate": 0.01, "loss": 2.053, "step": 13773 }, { "epoch": 1.4137931034482758, "grad_norm": 0.05548718199133873, "learning_rate": 0.01, "loss": 2.0534, "step": 13776 }, { "epoch": 1.4141009852216748, "grad_norm": 0.0694265142083168, "learning_rate": 0.01, "loss": 2.0647, "step": 13779 }, { "epoch": 1.4144088669950738, "grad_norm": 0.10526683181524277, "learning_rate": 0.01, "loss": 2.0768, "step": 13782 }, { "epoch": 1.4147167487684729, "grad_norm": 0.08820123970508575, "learning_rate": 0.01, "loss": 2.0693, "step": 13785 }, { "epoch": 1.4150246305418719, "grad_norm": 0.04513731971383095, "learning_rate": 0.01, "loss": 2.0596, "step": 13788 }, { "epoch": 1.415332512315271, "grad_norm": 0.05737076327204704, "learning_rate": 0.01, "loss": 2.0698, "step": 13791 }, { "epoch": 1.41564039408867, "grad_norm": 0.0431799478828907, "learning_rate": 0.01, "loss": 2.0603, "step": 13794 }, { "epoch": 1.415948275862069, "grad_norm": 0.09012471139431, "learning_rate": 0.01, "loss": 2.0634, "step": 13797 }, { "epoch": 1.416256157635468, "grad_norm": 0.05895904824137688, "learning_rate": 0.01, "loss": 2.0516, "step": 13800 }, { "epoch": 1.416564039408867, "grad_norm": 0.1610986888408661, "learning_rate": 0.01, "loss": 2.0743, "step": 13803 }, { "epoch": 1.416871921182266, "grad_norm": 0.07852904498577118, "learning_rate": 0.01, "loss": 2.064, "step": 13806 }, { "epoch": 1.417179802955665, "grad_norm": 0.06620481610298157, "learning_rate": 0.01, "loss": 2.0688, "step": 13809 }, { "epoch": 1.417487684729064, "grad_norm": 0.033222537487745285, "learning_rate": 0.01, "loss": 2.0702, "step": 13812 }, { "epoch": 1.417795566502463, "grad_norm": 0.02942623570561409, "learning_rate": 0.01, "loss": 2.0654, "step": 13815 }, { "epoch": 1.418103448275862, "grad_norm": 0.03543059900403023, "learning_rate": 0.01, "loss": 2.0776, "step": 13818 }, { "epoch": 1.4184113300492611, "grad_norm": 0.13414567708969116, "learning_rate": 0.01, "loss": 2.0621, "step": 13821 }, { "epoch": 1.4187192118226601, "grad_norm": 0.06474481523036957, "learning_rate": 0.01, "loss": 2.0675, "step": 13824 }, { "epoch": 1.4190270935960592, "grad_norm": 0.11285994201898575, "learning_rate": 0.01, "loss": 2.0533, "step": 13827 }, { "epoch": 1.4193349753694582, "grad_norm": 0.05104577913880348, "learning_rate": 0.01, "loss": 2.0835, "step": 13830 }, { "epoch": 1.4196428571428572, "grad_norm": 0.05463656783103943, "learning_rate": 0.01, "loss": 2.0788, "step": 13833 }, { "epoch": 1.4199507389162562, "grad_norm": 0.07886187732219696, "learning_rate": 0.01, "loss": 2.0538, "step": 13836 }, { "epoch": 1.4202586206896552, "grad_norm": 0.06960279494524002, "learning_rate": 0.01, "loss": 2.0927, "step": 13839 }, { "epoch": 1.4205665024630543, "grad_norm": 0.07481426745653152, "learning_rate": 0.01, "loss": 2.0421, "step": 13842 }, { "epoch": 1.4208743842364533, "grad_norm": 0.04317006468772888, "learning_rate": 0.01, "loss": 2.0665, "step": 13845 }, { "epoch": 1.4211822660098523, "grad_norm": 0.10644064098596573, "learning_rate": 0.01, "loss": 2.0434, "step": 13848 }, { "epoch": 1.4214901477832513, "grad_norm": 0.09246213734149933, "learning_rate": 0.01, "loss": 2.0487, "step": 13851 }, { "epoch": 1.4217980295566504, "grad_norm": 0.05824518948793411, "learning_rate": 0.01, "loss": 2.0633, "step": 13854 }, { "epoch": 1.4221059113300494, "grad_norm": 0.06316854059696198, "learning_rate": 0.01, "loss": 2.076, "step": 13857 }, { "epoch": 1.4224137931034484, "grad_norm": 0.058339640498161316, "learning_rate": 0.01, "loss": 2.0702, "step": 13860 }, { "epoch": 1.4227216748768474, "grad_norm": 0.05458427220582962, "learning_rate": 0.01, "loss": 2.0585, "step": 13863 }, { "epoch": 1.4230295566502464, "grad_norm": 0.041047900915145874, "learning_rate": 0.01, "loss": 2.0793, "step": 13866 }, { "epoch": 1.4233374384236452, "grad_norm": 0.04485390707850456, "learning_rate": 0.01, "loss": 2.0723, "step": 13869 }, { "epoch": 1.4236453201970443, "grad_norm": 0.13340137898921967, "learning_rate": 0.01, "loss": 2.0555, "step": 13872 }, { "epoch": 1.4239532019704433, "grad_norm": 0.05519254505634308, "learning_rate": 0.01, "loss": 2.0931, "step": 13875 }, { "epoch": 1.4242610837438423, "grad_norm": 0.07989728450775146, "learning_rate": 0.01, "loss": 2.0435, "step": 13878 }, { "epoch": 1.4245689655172413, "grad_norm": 0.09447802603244781, "learning_rate": 0.01, "loss": 2.061, "step": 13881 }, { "epoch": 1.4248768472906403, "grad_norm": 0.05240226909518242, "learning_rate": 0.01, "loss": 2.0643, "step": 13884 }, { "epoch": 1.4251847290640394, "grad_norm": 0.07171013206243515, "learning_rate": 0.01, "loss": 2.0829, "step": 13887 }, { "epoch": 1.4254926108374384, "grad_norm": 0.05098895728588104, "learning_rate": 0.01, "loss": 2.0716, "step": 13890 }, { "epoch": 1.4258004926108374, "grad_norm": 0.08569507300853729, "learning_rate": 0.01, "loss": 2.0699, "step": 13893 }, { "epoch": 1.4261083743842364, "grad_norm": 0.09055166691541672, "learning_rate": 0.01, "loss": 2.0843, "step": 13896 }, { "epoch": 1.4264162561576355, "grad_norm": 0.03242780640721321, "learning_rate": 0.01, "loss": 2.0433, "step": 13899 }, { "epoch": 1.4267241379310345, "grad_norm": 0.04612202197313309, "learning_rate": 0.01, "loss": 2.0327, "step": 13902 }, { "epoch": 1.4270320197044335, "grad_norm": 0.05800663307309151, "learning_rate": 0.01, "loss": 2.0362, "step": 13905 }, { "epoch": 1.4273399014778325, "grad_norm": 0.04150572046637535, "learning_rate": 0.01, "loss": 2.0504, "step": 13908 }, { "epoch": 1.4276477832512315, "grad_norm": 0.08542584627866745, "learning_rate": 0.01, "loss": 2.0406, "step": 13911 }, { "epoch": 1.4279556650246306, "grad_norm": 0.11966803669929504, "learning_rate": 0.01, "loss": 2.0829, "step": 13914 }, { "epoch": 1.4282635467980296, "grad_norm": 0.12066449970006943, "learning_rate": 0.01, "loss": 2.0657, "step": 13917 }, { "epoch": 1.4285714285714286, "grad_norm": 0.04103751480579376, "learning_rate": 0.01, "loss": 2.0496, "step": 13920 }, { "epoch": 1.4288793103448276, "grad_norm": 0.05432034656405449, "learning_rate": 0.01, "loss": 2.0563, "step": 13923 }, { "epoch": 1.4291871921182266, "grad_norm": 0.03935731574892998, "learning_rate": 0.01, "loss": 2.0463, "step": 13926 }, { "epoch": 1.4294950738916257, "grad_norm": 0.1475706547498703, "learning_rate": 0.01, "loss": 2.0641, "step": 13929 }, { "epoch": 1.4298029556650247, "grad_norm": 0.06562622636556625, "learning_rate": 0.01, "loss": 2.0536, "step": 13932 }, { "epoch": 1.4301108374384237, "grad_norm": 0.051726315170526505, "learning_rate": 0.01, "loss": 2.0506, "step": 13935 }, { "epoch": 1.4304187192118227, "grad_norm": 0.0998329371213913, "learning_rate": 0.01, "loss": 2.0521, "step": 13938 }, { "epoch": 1.4307266009852218, "grad_norm": 0.04965333640575409, "learning_rate": 0.01, "loss": 2.0568, "step": 13941 }, { "epoch": 1.4310344827586206, "grad_norm": 0.04430006071925163, "learning_rate": 0.01, "loss": 2.0735, "step": 13944 }, { "epoch": 1.4313423645320196, "grad_norm": 0.05260150134563446, "learning_rate": 0.01, "loss": 2.0887, "step": 13947 }, { "epoch": 1.4316502463054186, "grad_norm": 0.04135138541460037, "learning_rate": 0.01, "loss": 2.055, "step": 13950 }, { "epoch": 1.4319581280788176, "grad_norm": 0.08347123116254807, "learning_rate": 0.01, "loss": 2.0826, "step": 13953 }, { "epoch": 1.4322660098522166, "grad_norm": 0.12328385561704636, "learning_rate": 0.01, "loss": 2.0787, "step": 13956 }, { "epoch": 1.4325738916256157, "grad_norm": 0.05809056758880615, "learning_rate": 0.01, "loss": 2.0798, "step": 13959 }, { "epoch": 1.4328817733990147, "grad_norm": 0.038590408861637115, "learning_rate": 0.01, "loss": 2.0699, "step": 13962 }, { "epoch": 1.4331896551724137, "grad_norm": 0.11158851534128189, "learning_rate": 0.01, "loss": 2.0707, "step": 13965 }, { "epoch": 1.4334975369458127, "grad_norm": 0.0880589634180069, "learning_rate": 0.01, "loss": 2.0425, "step": 13968 }, { "epoch": 1.4338054187192117, "grad_norm": 0.059966232627630234, "learning_rate": 0.01, "loss": 2.0659, "step": 13971 }, { "epoch": 1.4341133004926108, "grad_norm": 0.04661833122372627, "learning_rate": 0.01, "loss": 2.0736, "step": 13974 }, { "epoch": 1.4344211822660098, "grad_norm": 0.04508896544575691, "learning_rate": 0.01, "loss": 2.0856, "step": 13977 }, { "epoch": 1.4347290640394088, "grad_norm": 0.051987554877996445, "learning_rate": 0.01, "loss": 2.0449, "step": 13980 }, { "epoch": 1.4350369458128078, "grad_norm": 0.04814029112458229, "learning_rate": 0.01, "loss": 2.0587, "step": 13983 }, { "epoch": 1.4353448275862069, "grad_norm": 0.09631717205047607, "learning_rate": 0.01, "loss": 2.0747, "step": 13986 }, { "epoch": 1.4356527093596059, "grad_norm": 0.06581971794366837, "learning_rate": 0.01, "loss": 2.0456, "step": 13989 }, { "epoch": 1.435960591133005, "grad_norm": 0.06483247131109238, "learning_rate": 0.01, "loss": 2.0627, "step": 13992 }, { "epoch": 1.436268472906404, "grad_norm": 0.1000155657529831, "learning_rate": 0.01, "loss": 2.0655, "step": 13995 }, { "epoch": 1.436576354679803, "grad_norm": 0.07297492027282715, "learning_rate": 0.01, "loss": 2.0686, "step": 13998 }, { "epoch": 1.436884236453202, "grad_norm": 0.054907578974962234, "learning_rate": 0.01, "loss": 2.0351, "step": 14001 }, { "epoch": 1.437192118226601, "grad_norm": 0.051127828657627106, "learning_rate": 0.01, "loss": 2.0583, "step": 14004 }, { "epoch": 1.4375, "grad_norm": 0.04157300665974617, "learning_rate": 0.01, "loss": 2.0548, "step": 14007 }, { "epoch": 1.437807881773399, "grad_norm": 0.07996746897697449, "learning_rate": 0.01, "loss": 2.0625, "step": 14010 }, { "epoch": 1.438115763546798, "grad_norm": 0.0764036774635315, "learning_rate": 0.01, "loss": 2.076, "step": 14013 }, { "epoch": 1.438423645320197, "grad_norm": 0.10736891627311707, "learning_rate": 0.01, "loss": 2.0682, "step": 14016 }, { "epoch": 1.438731527093596, "grad_norm": 0.0598980113863945, "learning_rate": 0.01, "loss": 2.0774, "step": 14019 }, { "epoch": 1.439039408866995, "grad_norm": 0.10858605802059174, "learning_rate": 0.01, "loss": 2.0708, "step": 14022 }, { "epoch": 1.4393472906403941, "grad_norm": 0.03999786823987961, "learning_rate": 0.01, "loss": 2.0521, "step": 14025 }, { "epoch": 1.4396551724137931, "grad_norm": 0.053138673305511475, "learning_rate": 0.01, "loss": 2.0547, "step": 14028 }, { "epoch": 1.4399630541871922, "grad_norm": 0.06477091461420059, "learning_rate": 0.01, "loss": 2.0371, "step": 14031 }, { "epoch": 1.4402709359605912, "grad_norm": 0.035987384617328644, "learning_rate": 0.01, "loss": 2.0897, "step": 14034 }, { "epoch": 1.4405788177339902, "grad_norm": 0.06938667595386505, "learning_rate": 0.01, "loss": 2.0895, "step": 14037 }, { "epoch": 1.4408866995073892, "grad_norm": 0.041746124625205994, "learning_rate": 0.01, "loss": 2.0705, "step": 14040 }, { "epoch": 1.4411945812807883, "grad_norm": 0.04503123462200165, "learning_rate": 0.01, "loss": 2.0528, "step": 14043 }, { "epoch": 1.4415024630541873, "grad_norm": 0.04153100401163101, "learning_rate": 0.01, "loss": 2.0596, "step": 14046 }, { "epoch": 1.4418103448275863, "grad_norm": 0.04992615804076195, "learning_rate": 0.01, "loss": 2.0527, "step": 14049 }, { "epoch": 1.4421182266009853, "grad_norm": 0.0718725174665451, "learning_rate": 0.01, "loss": 2.049, "step": 14052 }, { "epoch": 1.4424261083743843, "grad_norm": 0.13080431520938873, "learning_rate": 0.01, "loss": 2.0457, "step": 14055 }, { "epoch": 1.4427339901477834, "grad_norm": 0.04203762486577034, "learning_rate": 0.01, "loss": 2.0713, "step": 14058 }, { "epoch": 1.4430418719211824, "grad_norm": 0.04111120104789734, "learning_rate": 0.01, "loss": 2.0845, "step": 14061 }, { "epoch": 1.4433497536945814, "grad_norm": 0.044398125261068344, "learning_rate": 0.01, "loss": 2.0679, "step": 14064 }, { "epoch": 1.4436576354679804, "grad_norm": 0.031682152301073074, "learning_rate": 0.01, "loss": 2.0855, "step": 14067 }, { "epoch": 1.4439655172413794, "grad_norm": 0.07848865538835526, "learning_rate": 0.01, "loss": 2.0607, "step": 14070 }, { "epoch": 1.4442733990147782, "grad_norm": 0.08814079314470291, "learning_rate": 0.01, "loss": 2.0468, "step": 14073 }, { "epoch": 1.4445812807881773, "grad_norm": 0.05344429612159729, "learning_rate": 0.01, "loss": 2.0499, "step": 14076 }, { "epoch": 1.4448891625615763, "grad_norm": 0.05509471520781517, "learning_rate": 0.01, "loss": 2.0662, "step": 14079 }, { "epoch": 1.4451970443349753, "grad_norm": 0.08177798241376877, "learning_rate": 0.01, "loss": 2.0509, "step": 14082 }, { "epoch": 1.4455049261083743, "grad_norm": 0.07953787595033646, "learning_rate": 0.01, "loss": 2.0501, "step": 14085 }, { "epoch": 1.4458128078817734, "grad_norm": 0.06984551250934601, "learning_rate": 0.01, "loss": 2.0599, "step": 14088 }, { "epoch": 1.4461206896551724, "grad_norm": 0.07923319190740585, "learning_rate": 0.01, "loss": 2.06, "step": 14091 }, { "epoch": 1.4464285714285714, "grad_norm": 0.04370349645614624, "learning_rate": 0.01, "loss": 2.0839, "step": 14094 }, { "epoch": 1.4467364532019704, "grad_norm": 0.045787643641233444, "learning_rate": 0.01, "loss": 2.0512, "step": 14097 }, { "epoch": 1.4470443349753694, "grad_norm": 0.04126288741827011, "learning_rate": 0.01, "loss": 2.0603, "step": 14100 }, { "epoch": 1.4473522167487685, "grad_norm": 0.039805784821510315, "learning_rate": 0.01, "loss": 2.0561, "step": 14103 }, { "epoch": 1.4476600985221675, "grad_norm": 0.038430992513895035, "learning_rate": 0.01, "loss": 2.0697, "step": 14106 }, { "epoch": 1.4479679802955665, "grad_norm": 0.07664498686790466, "learning_rate": 0.01, "loss": 2.0381, "step": 14109 }, { "epoch": 1.4482758620689655, "grad_norm": 0.04592788219451904, "learning_rate": 0.01, "loss": 2.042, "step": 14112 }, { "epoch": 1.4485837438423645, "grad_norm": 0.06161922961473465, "learning_rate": 0.01, "loss": 2.0446, "step": 14115 }, { "epoch": 1.4488916256157636, "grad_norm": 0.07906373590230942, "learning_rate": 0.01, "loss": 2.0758, "step": 14118 }, { "epoch": 1.4491995073891626, "grad_norm": 0.09529503434896469, "learning_rate": 0.01, "loss": 2.0512, "step": 14121 }, { "epoch": 1.4495073891625616, "grad_norm": 0.05416659638285637, "learning_rate": 0.01, "loss": 2.0484, "step": 14124 }, { "epoch": 1.4498152709359606, "grad_norm": 0.07085006684064865, "learning_rate": 0.01, "loss": 2.0293, "step": 14127 }, { "epoch": 1.4501231527093597, "grad_norm": 0.07722880691289902, "learning_rate": 0.01, "loss": 2.0578, "step": 14130 }, { "epoch": 1.4504310344827587, "grad_norm": 0.06599342823028564, "learning_rate": 0.01, "loss": 2.0556, "step": 14133 }, { "epoch": 1.4507389162561577, "grad_norm": 0.11217498779296875, "learning_rate": 0.01, "loss": 2.068, "step": 14136 }, { "epoch": 1.4510467980295567, "grad_norm": 0.13082845509052277, "learning_rate": 0.01, "loss": 2.0677, "step": 14139 }, { "epoch": 1.4513546798029557, "grad_norm": 0.06812801957130432, "learning_rate": 0.01, "loss": 2.0668, "step": 14142 }, { "epoch": 1.4516625615763548, "grad_norm": 0.043554674834012985, "learning_rate": 0.01, "loss": 2.0537, "step": 14145 }, { "epoch": 1.4519704433497536, "grad_norm": 0.04550860822200775, "learning_rate": 0.01, "loss": 2.0415, "step": 14148 }, { "epoch": 1.4522783251231526, "grad_norm": 0.04838492348790169, "learning_rate": 0.01, "loss": 2.0642, "step": 14151 }, { "epoch": 1.4525862068965516, "grad_norm": 0.08433537930250168, "learning_rate": 0.01, "loss": 2.0549, "step": 14154 }, { "epoch": 1.4528940886699506, "grad_norm": 0.06986009329557419, "learning_rate": 0.01, "loss": 2.057, "step": 14157 }, { "epoch": 1.4532019704433496, "grad_norm": 0.06860263645648956, "learning_rate": 0.01, "loss": 2.0581, "step": 14160 }, { "epoch": 1.4535098522167487, "grad_norm": 0.06218327581882477, "learning_rate": 0.01, "loss": 2.0482, "step": 14163 }, { "epoch": 1.4538177339901477, "grad_norm": 0.10177832096815109, "learning_rate": 0.01, "loss": 2.059, "step": 14166 }, { "epoch": 1.4541256157635467, "grad_norm": 0.047695957124233246, "learning_rate": 0.01, "loss": 2.0372, "step": 14169 }, { "epoch": 1.4544334975369457, "grad_norm": 0.09761510044336319, "learning_rate": 0.01, "loss": 2.0671, "step": 14172 }, { "epoch": 1.4547413793103448, "grad_norm": 0.050296783447265625, "learning_rate": 0.01, "loss": 2.0536, "step": 14175 }, { "epoch": 1.4550492610837438, "grad_norm": 0.13070064783096313, "learning_rate": 0.01, "loss": 2.0579, "step": 14178 }, { "epoch": 1.4553571428571428, "grad_norm": 0.1080620214343071, "learning_rate": 0.01, "loss": 2.0455, "step": 14181 }, { "epoch": 1.4556650246305418, "grad_norm": 0.06132792308926582, "learning_rate": 0.01, "loss": 2.062, "step": 14184 }, { "epoch": 1.4559729064039408, "grad_norm": 0.07258635014295578, "learning_rate": 0.01, "loss": 2.0666, "step": 14187 }, { "epoch": 1.4562807881773399, "grad_norm": 0.05423443764448166, "learning_rate": 0.01, "loss": 2.0676, "step": 14190 }, { "epoch": 1.4565886699507389, "grad_norm": 0.06038088724017143, "learning_rate": 0.01, "loss": 2.0539, "step": 14193 }, { "epoch": 1.456896551724138, "grad_norm": 0.043958742171525955, "learning_rate": 0.01, "loss": 2.0739, "step": 14196 }, { "epoch": 1.457204433497537, "grad_norm": 0.03985238075256348, "learning_rate": 0.01, "loss": 2.0323, "step": 14199 }, { "epoch": 1.457512315270936, "grad_norm": 0.05626663193106651, "learning_rate": 0.01, "loss": 2.0518, "step": 14202 }, { "epoch": 1.457820197044335, "grad_norm": 0.06365952640771866, "learning_rate": 0.01, "loss": 2.042, "step": 14205 }, { "epoch": 1.458128078817734, "grad_norm": 0.0698857232928276, "learning_rate": 0.01, "loss": 2.0676, "step": 14208 }, { "epoch": 1.458435960591133, "grad_norm": 0.08149702101945877, "learning_rate": 0.01, "loss": 2.0659, "step": 14211 }, { "epoch": 1.458743842364532, "grad_norm": 0.055818330496549606, "learning_rate": 0.01, "loss": 2.0633, "step": 14214 }, { "epoch": 1.459051724137931, "grad_norm": 0.046251330524683, "learning_rate": 0.01, "loss": 2.0598, "step": 14217 }, { "epoch": 1.45935960591133, "grad_norm": 0.10986322909593582, "learning_rate": 0.01, "loss": 2.0768, "step": 14220 }, { "epoch": 1.459667487684729, "grad_norm": 0.06735626608133316, "learning_rate": 0.01, "loss": 2.0709, "step": 14223 }, { "epoch": 1.4599753694581281, "grad_norm": 0.1051633432507515, "learning_rate": 0.01, "loss": 2.0807, "step": 14226 }, { "epoch": 1.4602832512315271, "grad_norm": 0.0956743136048317, "learning_rate": 0.01, "loss": 2.0377, "step": 14229 }, { "epoch": 1.4605911330049262, "grad_norm": 0.04349840059876442, "learning_rate": 0.01, "loss": 2.0723, "step": 14232 }, { "epoch": 1.4608990147783252, "grad_norm": 0.0388668030500412, "learning_rate": 0.01, "loss": 2.0612, "step": 14235 }, { "epoch": 1.4612068965517242, "grad_norm": 0.04311763867735863, "learning_rate": 0.01, "loss": 2.0553, "step": 14238 }, { "epoch": 1.4615147783251232, "grad_norm": 0.09116464853286743, "learning_rate": 0.01, "loss": 2.0537, "step": 14241 }, { "epoch": 1.4618226600985222, "grad_norm": 0.08582088351249695, "learning_rate": 0.01, "loss": 2.0588, "step": 14244 }, { "epoch": 1.4621305418719213, "grad_norm": 0.0435602031648159, "learning_rate": 0.01, "loss": 2.0303, "step": 14247 }, { "epoch": 1.4624384236453203, "grad_norm": 0.067762091755867, "learning_rate": 0.01, "loss": 2.0453, "step": 14250 }, { "epoch": 1.4627463054187193, "grad_norm": 0.03980677202343941, "learning_rate": 0.01, "loss": 2.0488, "step": 14253 }, { "epoch": 1.4630541871921183, "grad_norm": 0.08521614968776703, "learning_rate": 0.01, "loss": 2.0617, "step": 14256 }, { "epoch": 1.4633620689655173, "grad_norm": 0.06770948320627213, "learning_rate": 0.01, "loss": 2.0535, "step": 14259 }, { "epoch": 1.4636699507389164, "grad_norm": 0.053458839654922485, "learning_rate": 0.01, "loss": 2.0594, "step": 14262 }, { "epoch": 1.4639778325123154, "grad_norm": 0.06733859330415726, "learning_rate": 0.01, "loss": 2.0422, "step": 14265 }, { "epoch": 1.4642857142857144, "grad_norm": 0.08033892512321472, "learning_rate": 0.01, "loss": 2.0685, "step": 14268 }, { "epoch": 1.4645935960591134, "grad_norm": 0.0832366794347763, "learning_rate": 0.01, "loss": 2.0867, "step": 14271 }, { "epoch": 1.4649014778325122, "grad_norm": 0.055291153490543365, "learning_rate": 0.01, "loss": 2.0618, "step": 14274 }, { "epoch": 1.4652093596059113, "grad_norm": 0.07180801033973694, "learning_rate": 0.01, "loss": 2.0569, "step": 14277 }, { "epoch": 1.4655172413793103, "grad_norm": 0.048950713127851486, "learning_rate": 0.01, "loss": 2.0433, "step": 14280 }, { "epoch": 1.4658251231527093, "grad_norm": 0.05428025498986244, "learning_rate": 0.01, "loss": 2.052, "step": 14283 }, { "epoch": 1.4661330049261083, "grad_norm": 0.06631309539079666, "learning_rate": 0.01, "loss": 2.0615, "step": 14286 }, { "epoch": 1.4664408866995073, "grad_norm": 0.06743253022432327, "learning_rate": 0.01, "loss": 2.0524, "step": 14289 }, { "epoch": 1.4667487684729064, "grad_norm": 0.10901882499456406, "learning_rate": 0.01, "loss": 2.076, "step": 14292 }, { "epoch": 1.4670566502463054, "grad_norm": 0.08234187960624695, "learning_rate": 0.01, "loss": 2.0432, "step": 14295 }, { "epoch": 1.4673645320197044, "grad_norm": 0.07249965518712997, "learning_rate": 0.01, "loss": 2.057, "step": 14298 }, { "epoch": 1.4676724137931034, "grad_norm": 0.0705137550830841, "learning_rate": 0.01, "loss": 2.0616, "step": 14301 }, { "epoch": 1.4679802955665024, "grad_norm": 0.10730472952127457, "learning_rate": 0.01, "loss": 2.0575, "step": 14304 }, { "epoch": 1.4682881773399015, "grad_norm": 0.048364557325839996, "learning_rate": 0.01, "loss": 2.0719, "step": 14307 }, { "epoch": 1.4685960591133005, "grad_norm": 0.03604978322982788, "learning_rate": 0.01, "loss": 2.0608, "step": 14310 }, { "epoch": 1.4689039408866995, "grad_norm": 0.09732489287853241, "learning_rate": 0.01, "loss": 2.0627, "step": 14313 }, { "epoch": 1.4692118226600985, "grad_norm": 0.06590714305639267, "learning_rate": 0.01, "loss": 2.0775, "step": 14316 }, { "epoch": 1.4695197044334976, "grad_norm": 0.075086310505867, "learning_rate": 0.01, "loss": 2.0633, "step": 14319 }, { "epoch": 1.4698275862068966, "grad_norm": 0.10288450121879578, "learning_rate": 0.01, "loss": 2.0607, "step": 14322 }, { "epoch": 1.4701354679802956, "grad_norm": 0.0535271093249321, "learning_rate": 0.01, "loss": 2.0702, "step": 14325 }, { "epoch": 1.4704433497536946, "grad_norm": 0.04609391465783119, "learning_rate": 0.01, "loss": 2.049, "step": 14328 }, { "epoch": 1.4707512315270936, "grad_norm": 0.044252909719944, "learning_rate": 0.01, "loss": 2.0586, "step": 14331 }, { "epoch": 1.4710591133004927, "grad_norm": 0.07837995141744614, "learning_rate": 0.01, "loss": 2.0541, "step": 14334 }, { "epoch": 1.4713669950738917, "grad_norm": 0.06548511236906052, "learning_rate": 0.01, "loss": 2.0575, "step": 14337 }, { "epoch": 1.4716748768472907, "grad_norm": 0.09237763285636902, "learning_rate": 0.01, "loss": 2.0607, "step": 14340 }, { "epoch": 1.4719827586206897, "grad_norm": 0.04163452237844467, "learning_rate": 0.01, "loss": 2.0604, "step": 14343 }, { "epoch": 1.4722906403940887, "grad_norm": 0.13814504444599152, "learning_rate": 0.01, "loss": 2.0667, "step": 14346 }, { "epoch": 1.4725985221674878, "grad_norm": 0.054490260779857635, "learning_rate": 0.01, "loss": 2.0754, "step": 14349 }, { "epoch": 1.4729064039408866, "grad_norm": 0.07470995932817459, "learning_rate": 0.01, "loss": 2.0731, "step": 14352 }, { "epoch": 1.4732142857142856, "grad_norm": 0.12089511752128601, "learning_rate": 0.01, "loss": 2.0671, "step": 14355 }, { "epoch": 1.4735221674876846, "grad_norm": 0.05507595092058182, "learning_rate": 0.01, "loss": 2.0278, "step": 14358 }, { "epoch": 1.4738300492610836, "grad_norm": 0.05130109563469887, "learning_rate": 0.01, "loss": 2.0491, "step": 14361 }, { "epoch": 1.4741379310344827, "grad_norm": 0.04399503022432327, "learning_rate": 0.01, "loss": 2.0669, "step": 14364 }, { "epoch": 1.4744458128078817, "grad_norm": 0.06161755695939064, "learning_rate": 0.01, "loss": 2.0458, "step": 14367 }, { "epoch": 1.4747536945812807, "grad_norm": 0.045603156089782715, "learning_rate": 0.01, "loss": 2.0477, "step": 14370 }, { "epoch": 1.4750615763546797, "grad_norm": 0.06444186717271805, "learning_rate": 0.01, "loss": 2.0514, "step": 14373 }, { "epoch": 1.4753694581280787, "grad_norm": 0.07450753450393677, "learning_rate": 0.01, "loss": 2.0327, "step": 14376 }, { "epoch": 1.4756773399014778, "grad_norm": 0.10367168486118317, "learning_rate": 0.01, "loss": 2.0453, "step": 14379 }, { "epoch": 1.4759852216748768, "grad_norm": 0.08999089151620865, "learning_rate": 0.01, "loss": 2.0588, "step": 14382 }, { "epoch": 1.4762931034482758, "grad_norm": 0.09803617745637894, "learning_rate": 0.01, "loss": 2.0972, "step": 14385 }, { "epoch": 1.4766009852216748, "grad_norm": 0.042447153478860855, "learning_rate": 0.01, "loss": 2.0295, "step": 14388 }, { "epoch": 1.4769088669950738, "grad_norm": 0.04479740187525749, "learning_rate": 0.01, "loss": 2.0545, "step": 14391 }, { "epoch": 1.4772167487684729, "grad_norm": 0.04113270714879036, "learning_rate": 0.01, "loss": 2.0522, "step": 14394 }, { "epoch": 1.4775246305418719, "grad_norm": 0.1087644025683403, "learning_rate": 0.01, "loss": 2.0668, "step": 14397 }, { "epoch": 1.477832512315271, "grad_norm": 0.05737099424004555, "learning_rate": 0.01, "loss": 2.0509, "step": 14400 }, { "epoch": 1.47814039408867, "grad_norm": 0.11025606095790863, "learning_rate": 0.01, "loss": 2.067, "step": 14403 }, { "epoch": 1.478448275862069, "grad_norm": 0.06662195175886154, "learning_rate": 0.01, "loss": 2.0554, "step": 14406 }, { "epoch": 1.478756157635468, "grad_norm": 0.05261904001235962, "learning_rate": 0.01, "loss": 2.0562, "step": 14409 }, { "epoch": 1.479064039408867, "grad_norm": 0.048272691667079926, "learning_rate": 0.01, "loss": 2.0858, "step": 14412 }, { "epoch": 1.479371921182266, "grad_norm": 0.048300545662641525, "learning_rate": 0.01, "loss": 2.0736, "step": 14415 }, { "epoch": 1.479679802955665, "grad_norm": 0.08697368204593658, "learning_rate": 0.01, "loss": 2.0425, "step": 14418 }, { "epoch": 1.479987684729064, "grad_norm": 0.07993713021278381, "learning_rate": 0.01, "loss": 2.0511, "step": 14421 }, { "epoch": 1.480295566502463, "grad_norm": 0.10037390887737274, "learning_rate": 0.01, "loss": 2.0584, "step": 14424 }, { "epoch": 1.480603448275862, "grad_norm": 0.06048484891653061, "learning_rate": 0.01, "loss": 2.0589, "step": 14427 }, { "epoch": 1.4809113300492611, "grad_norm": 0.08982612937688828, "learning_rate": 0.01, "loss": 2.0575, "step": 14430 }, { "epoch": 1.4812192118226601, "grad_norm": 0.06678975373506546, "learning_rate": 0.01, "loss": 2.0544, "step": 14433 }, { "epoch": 1.4815270935960592, "grad_norm": 0.07890944927930832, "learning_rate": 0.01, "loss": 2.0352, "step": 14436 }, { "epoch": 1.4818349753694582, "grad_norm": 0.05838685482740402, "learning_rate": 0.01, "loss": 2.0481, "step": 14439 }, { "epoch": 1.4821428571428572, "grad_norm": 0.06483394652605057, "learning_rate": 0.01, "loss": 2.0425, "step": 14442 }, { "epoch": 1.4824507389162562, "grad_norm": 0.07320713996887207, "learning_rate": 0.01, "loss": 2.0524, "step": 14445 }, { "epoch": 1.4827586206896552, "grad_norm": 0.07484092563390732, "learning_rate": 0.01, "loss": 2.0709, "step": 14448 }, { "epoch": 1.4830665024630543, "grad_norm": 0.07702804356813431, "learning_rate": 0.01, "loss": 2.0483, "step": 14451 }, { "epoch": 1.4833743842364533, "grad_norm": 0.05470692366361618, "learning_rate": 0.01, "loss": 2.0767, "step": 14454 }, { "epoch": 1.4836822660098523, "grad_norm": 0.055773910135030746, "learning_rate": 0.01, "loss": 2.0692, "step": 14457 }, { "epoch": 1.4839901477832513, "grad_norm": 0.03712743893265724, "learning_rate": 0.01, "loss": 2.0723, "step": 14460 }, { "epoch": 1.4842980295566504, "grad_norm": 0.035963475704193115, "learning_rate": 0.01, "loss": 2.0526, "step": 14463 }, { "epoch": 1.4846059113300494, "grad_norm": 0.08578921854496002, "learning_rate": 0.01, "loss": 2.0631, "step": 14466 }, { "epoch": 1.4849137931034484, "grad_norm": 0.08239159733057022, "learning_rate": 0.01, "loss": 2.0654, "step": 14469 }, { "epoch": 1.4852216748768474, "grad_norm": 0.05097891017794609, "learning_rate": 0.01, "loss": 2.0837, "step": 14472 }, { "epoch": 1.4855295566502464, "grad_norm": 0.051847904920578, "learning_rate": 0.01, "loss": 2.059, "step": 14475 }, { "epoch": 1.4858374384236452, "grad_norm": 0.04754810780286789, "learning_rate": 0.01, "loss": 2.0658, "step": 14478 }, { "epoch": 1.4861453201970443, "grad_norm": 0.046647075563669205, "learning_rate": 0.01, "loss": 2.0423, "step": 14481 }, { "epoch": 1.4864532019704433, "grad_norm": 0.06013277545571327, "learning_rate": 0.01, "loss": 2.0254, "step": 14484 }, { "epoch": 1.4867610837438423, "grad_norm": 0.13433513045310974, "learning_rate": 0.01, "loss": 2.0775, "step": 14487 }, { "epoch": 1.4870689655172413, "grad_norm": 0.046518564224243164, "learning_rate": 0.01, "loss": 2.0434, "step": 14490 }, { "epoch": 1.4873768472906403, "grad_norm": 0.09483514726161957, "learning_rate": 0.01, "loss": 2.0839, "step": 14493 }, { "epoch": 1.4876847290640394, "grad_norm": 0.07147302478551865, "learning_rate": 0.01, "loss": 2.0741, "step": 14496 }, { "epoch": 1.4879926108374384, "grad_norm": 0.12423846870660782, "learning_rate": 0.01, "loss": 2.045, "step": 14499 }, { "epoch": 1.4883004926108374, "grad_norm": 0.07726770639419556, "learning_rate": 0.01, "loss": 2.0578, "step": 14502 }, { "epoch": 1.4886083743842364, "grad_norm": 0.059802982956171036, "learning_rate": 0.01, "loss": 2.0526, "step": 14505 }, { "epoch": 1.4889162561576355, "grad_norm": 0.050745993852615356, "learning_rate": 0.01, "loss": 2.049, "step": 14508 }, { "epoch": 1.4892241379310345, "grad_norm": 0.052064161747694016, "learning_rate": 0.01, "loss": 2.0575, "step": 14511 }, { "epoch": 1.4895320197044335, "grad_norm": 0.06646674871444702, "learning_rate": 0.01, "loss": 2.0689, "step": 14514 }, { "epoch": 1.4898399014778325, "grad_norm": 0.043484605848789215, "learning_rate": 0.01, "loss": 2.0655, "step": 14517 }, { "epoch": 1.4901477832512315, "grad_norm": 0.14452145993709564, "learning_rate": 0.01, "loss": 2.0599, "step": 14520 }, { "epoch": 1.4904556650246306, "grad_norm": 0.08289093524217606, "learning_rate": 0.01, "loss": 2.0654, "step": 14523 }, { "epoch": 1.4907635467980296, "grad_norm": 0.05047908052802086, "learning_rate": 0.01, "loss": 2.0409, "step": 14526 }, { "epoch": 1.4910714285714286, "grad_norm": 0.04830252006649971, "learning_rate": 0.01, "loss": 2.0529, "step": 14529 }, { "epoch": 1.4913793103448276, "grad_norm": 0.0430610254406929, "learning_rate": 0.01, "loss": 2.0242, "step": 14532 }, { "epoch": 1.4916871921182266, "grad_norm": 0.04282008111476898, "learning_rate": 0.01, "loss": 2.0494, "step": 14535 }, { "epoch": 1.4919950738916257, "grad_norm": 0.037373676896095276, "learning_rate": 0.01, "loss": 2.0528, "step": 14538 }, { "epoch": 1.4923029556650247, "grad_norm": 0.04186755418777466, "learning_rate": 0.01, "loss": 2.058, "step": 14541 }, { "epoch": 1.4926108374384237, "grad_norm": 0.05514196678996086, "learning_rate": 0.01, "loss": 2.0647, "step": 14544 }, { "epoch": 1.4929187192118227, "grad_norm": 0.07391703873872757, "learning_rate": 0.01, "loss": 2.0812, "step": 14547 }, { "epoch": 1.4932266009852218, "grad_norm": 0.1295444518327713, "learning_rate": 0.01, "loss": 2.0571, "step": 14550 }, { "epoch": 1.4935344827586206, "grad_norm": 0.06389490514993668, "learning_rate": 0.01, "loss": 2.0756, "step": 14553 }, { "epoch": 1.4938423645320196, "grad_norm": 0.09335155785083771, "learning_rate": 0.01, "loss": 2.0904, "step": 14556 }, { "epoch": 1.4941502463054186, "grad_norm": 0.059700366109609604, "learning_rate": 0.01, "loss": 2.0598, "step": 14559 }, { "epoch": 1.4944581280788176, "grad_norm": 0.07785683870315552, "learning_rate": 0.01, "loss": 2.0567, "step": 14562 }, { "epoch": 1.4947660098522166, "grad_norm": 0.11935362964868546, "learning_rate": 0.01, "loss": 2.0536, "step": 14565 }, { "epoch": 1.4950738916256157, "grad_norm": 0.06188122183084488, "learning_rate": 0.01, "loss": 2.0436, "step": 14568 }, { "epoch": 1.4953817733990147, "grad_norm": 0.05302932485938072, "learning_rate": 0.01, "loss": 2.0704, "step": 14571 }, { "epoch": 1.4956896551724137, "grad_norm": 0.03871694207191467, "learning_rate": 0.01, "loss": 2.062, "step": 14574 }, { "epoch": 1.4959975369458127, "grad_norm": 0.03942064568400383, "learning_rate": 0.01, "loss": 2.0725, "step": 14577 }, { "epoch": 1.4963054187192117, "grad_norm": 0.05354088917374611, "learning_rate": 0.01, "loss": 2.054, "step": 14580 }, { "epoch": 1.4966133004926108, "grad_norm": 0.07863521575927734, "learning_rate": 0.01, "loss": 2.0577, "step": 14583 }, { "epoch": 1.4969211822660098, "grad_norm": 0.0440685860812664, "learning_rate": 0.01, "loss": 2.0422, "step": 14586 }, { "epoch": 1.4972290640394088, "grad_norm": 0.0724552571773529, "learning_rate": 0.01, "loss": 2.0707, "step": 14589 }, { "epoch": 1.4975369458128078, "grad_norm": 0.06099352613091469, "learning_rate": 0.01, "loss": 2.0567, "step": 14592 }, { "epoch": 1.4978448275862069, "grad_norm": 0.05534674599766731, "learning_rate": 0.01, "loss": 2.0343, "step": 14595 }, { "epoch": 1.4981527093596059, "grad_norm": 0.07876823097467422, "learning_rate": 0.01, "loss": 2.0686, "step": 14598 }, { "epoch": 1.498460591133005, "grad_norm": 0.07860377430915833, "learning_rate": 0.01, "loss": 2.0506, "step": 14601 }, { "epoch": 1.498768472906404, "grad_norm": 0.054005399346351624, "learning_rate": 0.01, "loss": 2.0429, "step": 14604 }, { "epoch": 1.499076354679803, "grad_norm": 0.10550951957702637, "learning_rate": 0.01, "loss": 2.0407, "step": 14607 }, { "epoch": 1.499384236453202, "grad_norm": 0.056426841765642166, "learning_rate": 0.01, "loss": 2.0589, "step": 14610 }, { "epoch": 1.499692118226601, "grad_norm": 0.09640904515981674, "learning_rate": 0.01, "loss": 2.0556, "step": 14613 }, { "epoch": 1.5, "grad_norm": 0.0822538211941719, "learning_rate": 0.01, "loss": 2.0684, "step": 14616 }, { "epoch": 1.500307881773399, "grad_norm": 0.05105495825409889, "learning_rate": 0.01, "loss": 2.0588, "step": 14619 }, { "epoch": 1.500615763546798, "grad_norm": 0.07851336896419525, "learning_rate": 0.01, "loss": 2.0635, "step": 14622 }, { "epoch": 1.500923645320197, "grad_norm": 0.051046207547187805, "learning_rate": 0.01, "loss": 2.0418, "step": 14625 }, { "epoch": 1.501231527093596, "grad_norm": 0.12335740774869919, "learning_rate": 0.01, "loss": 2.0483, "step": 14628 }, { "epoch": 1.501539408866995, "grad_norm": 0.04044636711478233, "learning_rate": 0.01, "loss": 2.0635, "step": 14631 }, { "epoch": 1.5018472906403941, "grad_norm": 0.0532059408724308, "learning_rate": 0.01, "loss": 2.0567, "step": 14634 }, { "epoch": 1.5021551724137931, "grad_norm": 0.0446847639977932, "learning_rate": 0.01, "loss": 2.0531, "step": 14637 }, { "epoch": 1.5024630541871922, "grad_norm": 0.05464153364300728, "learning_rate": 0.01, "loss": 2.0464, "step": 14640 }, { "epoch": 1.5027709359605912, "grad_norm": 0.08923088759183884, "learning_rate": 0.01, "loss": 2.0787, "step": 14643 }, { "epoch": 1.5030788177339902, "grad_norm": 0.06256496161222458, "learning_rate": 0.01, "loss": 2.0417, "step": 14646 }, { "epoch": 1.5033866995073892, "grad_norm": 0.05338229984045029, "learning_rate": 0.01, "loss": 2.053, "step": 14649 }, { "epoch": 1.5036945812807883, "grad_norm": 0.04416535049676895, "learning_rate": 0.01, "loss": 2.0532, "step": 14652 }, { "epoch": 1.5040024630541873, "grad_norm": 0.07076221704483032, "learning_rate": 0.01, "loss": 2.0677, "step": 14655 }, { "epoch": 1.5043103448275863, "grad_norm": 0.08566464483737946, "learning_rate": 0.01, "loss": 2.0499, "step": 14658 }, { "epoch": 1.5046182266009853, "grad_norm": 0.049552109092473984, "learning_rate": 0.01, "loss": 2.0549, "step": 14661 }, { "epoch": 1.5049261083743843, "grad_norm": 0.11802852898836136, "learning_rate": 0.01, "loss": 2.0657, "step": 14664 }, { "epoch": 1.5052339901477834, "grad_norm": 0.05280107632279396, "learning_rate": 0.01, "loss": 2.0722, "step": 14667 }, { "epoch": 1.5055418719211824, "grad_norm": 0.036458853632211685, "learning_rate": 0.01, "loss": 2.0357, "step": 14670 }, { "epoch": 1.5058497536945814, "grad_norm": 0.0465536043047905, "learning_rate": 0.01, "loss": 2.0765, "step": 14673 }, { "epoch": 1.5061576354679804, "grad_norm": 0.09052444994449615, "learning_rate": 0.01, "loss": 2.0677, "step": 14676 }, { "epoch": 1.5064655172413794, "grad_norm": 0.08750707656145096, "learning_rate": 0.01, "loss": 2.0719, "step": 14679 }, { "epoch": 1.5067733990147785, "grad_norm": 0.07876972109079361, "learning_rate": 0.01, "loss": 2.0539, "step": 14682 }, { "epoch": 1.5070812807881775, "grad_norm": 0.045561011880636215, "learning_rate": 0.01, "loss": 2.0656, "step": 14685 }, { "epoch": 1.5073891625615765, "grad_norm": 0.04548237472772598, "learning_rate": 0.01, "loss": 2.0811, "step": 14688 }, { "epoch": 1.5076970443349755, "grad_norm": 0.04897540062665939, "learning_rate": 0.01, "loss": 2.046, "step": 14691 }, { "epoch": 1.5080049261083743, "grad_norm": 0.08820399641990662, "learning_rate": 0.01, "loss": 2.0521, "step": 14694 }, { "epoch": 1.5083128078817734, "grad_norm": 0.0701432004570961, "learning_rate": 0.01, "loss": 2.0644, "step": 14697 }, { "epoch": 1.5086206896551724, "grad_norm": 0.10921904444694519, "learning_rate": 0.01, "loss": 2.0617, "step": 14700 }, { "epoch": 1.5089285714285714, "grad_norm": 0.08308566361665726, "learning_rate": 0.01, "loss": 2.101, "step": 14703 }, { "epoch": 1.5092364532019704, "grad_norm": 0.12545743584632874, "learning_rate": 0.01, "loss": 2.0495, "step": 14706 }, { "epoch": 1.5095443349753694, "grad_norm": 0.11245466768741608, "learning_rate": 0.01, "loss": 2.0264, "step": 14709 }, { "epoch": 1.5098522167487685, "grad_norm": 0.10128718614578247, "learning_rate": 0.01, "loss": 2.0413, "step": 14712 }, { "epoch": 1.5101600985221675, "grad_norm": 0.07226911187171936, "learning_rate": 0.01, "loss": 2.0487, "step": 14715 }, { "epoch": 1.5104679802955665, "grad_norm": 0.056605782359838486, "learning_rate": 0.01, "loss": 2.0686, "step": 14718 }, { "epoch": 1.5107758620689655, "grad_norm": 0.08795683085918427, "learning_rate": 0.01, "loss": 2.053, "step": 14721 }, { "epoch": 1.5110837438423645, "grad_norm": 0.07311341166496277, "learning_rate": 0.01, "loss": 2.0164, "step": 14724 }, { "epoch": 1.5113916256157636, "grad_norm": 0.07164688408374786, "learning_rate": 0.01, "loss": 2.048, "step": 14727 }, { "epoch": 1.5116995073891626, "grad_norm": 0.04577312618494034, "learning_rate": 0.01, "loss": 2.0523, "step": 14730 }, { "epoch": 1.5120073891625616, "grad_norm": 0.043186552822589874, "learning_rate": 0.01, "loss": 2.0694, "step": 14733 }, { "epoch": 1.5123152709359606, "grad_norm": 0.06101042777299881, "learning_rate": 0.01, "loss": 2.072, "step": 14736 }, { "epoch": 1.5126231527093597, "grad_norm": 0.12651608884334564, "learning_rate": 0.01, "loss": 2.0914, "step": 14739 }, { "epoch": 1.5129310344827587, "grad_norm": 0.039495594799518585, "learning_rate": 0.01, "loss": 2.0331, "step": 14742 }, { "epoch": 1.5132389162561575, "grad_norm": 0.10854054987430573, "learning_rate": 0.01, "loss": 2.0585, "step": 14745 }, { "epoch": 1.5135467980295565, "grad_norm": 0.142778679728508, "learning_rate": 0.01, "loss": 2.0433, "step": 14748 }, { "epoch": 1.5138546798029555, "grad_norm": 0.06473375856876373, "learning_rate": 0.01, "loss": 2.075, "step": 14751 }, { "epoch": 1.5141625615763545, "grad_norm": 0.050436701625585556, "learning_rate": 0.01, "loss": 2.0823, "step": 14754 }, { "epoch": 1.5144704433497536, "grad_norm": 0.057088159024715424, "learning_rate": 0.01, "loss": 2.0578, "step": 14757 }, { "epoch": 1.5147783251231526, "grad_norm": 0.051623161882162094, "learning_rate": 0.01, "loss": 2.0816, "step": 14760 }, { "epoch": 1.5150862068965516, "grad_norm": 0.0770149901509285, "learning_rate": 0.01, "loss": 2.0583, "step": 14763 }, { "epoch": 1.5153940886699506, "grad_norm": 0.06827536970376968, "learning_rate": 0.01, "loss": 2.0782, "step": 14766 }, { "epoch": 1.5157019704433496, "grad_norm": 0.06987358629703522, "learning_rate": 0.01, "loss": 2.0618, "step": 14769 }, { "epoch": 1.5160098522167487, "grad_norm": 0.05388219282031059, "learning_rate": 0.01, "loss": 2.084, "step": 14772 }, { "epoch": 1.5163177339901477, "grad_norm": 0.11866139620542526, "learning_rate": 0.01, "loss": 2.0622, "step": 14775 }, { "epoch": 1.5166256157635467, "grad_norm": 0.12754911184310913, "learning_rate": 0.01, "loss": 2.0387, "step": 14778 }, { "epoch": 1.5169334975369457, "grad_norm": 0.03591502830386162, "learning_rate": 0.01, "loss": 2.043, "step": 14781 }, { "epoch": 1.5172413793103448, "grad_norm": 0.09142038226127625, "learning_rate": 0.01, "loss": 2.05, "step": 14784 }, { "epoch": 1.5175492610837438, "grad_norm": 0.13140954077243805, "learning_rate": 0.01, "loss": 2.0302, "step": 14787 }, { "epoch": 1.5178571428571428, "grad_norm": 0.08330459147691727, "learning_rate": 0.01, "loss": 2.0461, "step": 14790 }, { "epoch": 1.5181650246305418, "grad_norm": 0.0779498815536499, "learning_rate": 0.01, "loss": 2.0662, "step": 14793 }, { "epoch": 1.5184729064039408, "grad_norm": 0.05396762117743492, "learning_rate": 0.01, "loss": 2.0586, "step": 14796 }, { "epoch": 1.5187807881773399, "grad_norm": 0.06744614988565445, "learning_rate": 0.01, "loss": 2.0578, "step": 14799 }, { "epoch": 1.5190886699507389, "grad_norm": 0.04777420684695244, "learning_rate": 0.01, "loss": 2.0645, "step": 14802 }, { "epoch": 1.519396551724138, "grad_norm": 0.044643301516771317, "learning_rate": 0.01, "loss": 2.0691, "step": 14805 }, { "epoch": 1.519704433497537, "grad_norm": 0.05263877660036087, "learning_rate": 0.01, "loss": 2.0475, "step": 14808 }, { "epoch": 1.520012315270936, "grad_norm": 0.07794903963804245, "learning_rate": 0.01, "loss": 2.0685, "step": 14811 }, { "epoch": 1.520320197044335, "grad_norm": 0.03846001625061035, "learning_rate": 0.01, "loss": 2.0547, "step": 14814 }, { "epoch": 1.520628078817734, "grad_norm": 0.03806301951408386, "learning_rate": 0.01, "loss": 2.0701, "step": 14817 }, { "epoch": 1.520935960591133, "grad_norm": 0.08289408683776855, "learning_rate": 0.01, "loss": 2.0712, "step": 14820 }, { "epoch": 1.521243842364532, "grad_norm": 0.04307285323739052, "learning_rate": 0.01, "loss": 2.0926, "step": 14823 }, { "epoch": 1.521551724137931, "grad_norm": 0.04523704573512077, "learning_rate": 0.01, "loss": 2.0613, "step": 14826 }, { "epoch": 1.52185960591133, "grad_norm": 0.0813162624835968, "learning_rate": 0.01, "loss": 2.0516, "step": 14829 }, { "epoch": 1.522167487684729, "grad_norm": 0.08958449214696884, "learning_rate": 0.01, "loss": 2.0534, "step": 14832 }, { "epoch": 1.5224753694581281, "grad_norm": 0.1036042720079422, "learning_rate": 0.01, "loss": 2.051, "step": 14835 }, { "epoch": 1.5227832512315271, "grad_norm": 0.06528764218091965, "learning_rate": 0.01, "loss": 2.0389, "step": 14838 }, { "epoch": 1.5230911330049262, "grad_norm": 0.04857415333390236, "learning_rate": 0.01, "loss": 2.0215, "step": 14841 }, { "epoch": 1.5233990147783252, "grad_norm": 0.11137302964925766, "learning_rate": 0.01, "loss": 2.0706, "step": 14844 }, { "epoch": 1.5237068965517242, "grad_norm": 0.05258537456393242, "learning_rate": 0.01, "loss": 2.053, "step": 14847 }, { "epoch": 1.5240147783251232, "grad_norm": 0.05203690007328987, "learning_rate": 0.01, "loss": 2.0719, "step": 14850 }, { "epoch": 1.5243226600985222, "grad_norm": 0.036557264626026154, "learning_rate": 0.01, "loss": 2.0827, "step": 14853 }, { "epoch": 1.5246305418719213, "grad_norm": 0.05553048849105835, "learning_rate": 0.01, "loss": 2.0526, "step": 14856 }, { "epoch": 1.5249384236453203, "grad_norm": 0.07551626116037369, "learning_rate": 0.01, "loss": 2.0696, "step": 14859 }, { "epoch": 1.5252463054187193, "grad_norm": 0.09335839748382568, "learning_rate": 0.01, "loss": 2.069, "step": 14862 }, { "epoch": 1.5255541871921183, "grad_norm": 0.07123745232820511, "learning_rate": 0.01, "loss": 2.048, "step": 14865 }, { "epoch": 1.5258620689655173, "grad_norm": 0.06792188435792923, "learning_rate": 0.01, "loss": 2.059, "step": 14868 }, { "epoch": 1.5261699507389164, "grad_norm": 0.035666827112436295, "learning_rate": 0.01, "loss": 2.0585, "step": 14871 }, { "epoch": 1.5264778325123154, "grad_norm": 0.039600104093551636, "learning_rate": 0.01, "loss": 2.0447, "step": 14874 }, { "epoch": 1.5267857142857144, "grad_norm": 0.03983796760439873, "learning_rate": 0.01, "loss": 2.0607, "step": 14877 }, { "epoch": 1.5270935960591134, "grad_norm": 0.07013492286205292, "learning_rate": 0.01, "loss": 2.0502, "step": 14880 }, { "epoch": 1.5274014778325125, "grad_norm": 0.07064792513847351, "learning_rate": 0.01, "loss": 2.0328, "step": 14883 }, { "epoch": 1.5277093596059115, "grad_norm": 0.10502810031175613, "learning_rate": 0.01, "loss": 2.0494, "step": 14886 }, { "epoch": 1.5280172413793105, "grad_norm": 0.050288375467061996, "learning_rate": 0.01, "loss": 2.0316, "step": 14889 }, { "epoch": 1.5283251231527095, "grad_norm": 0.07382049411535263, "learning_rate": 0.01, "loss": 2.0414, "step": 14892 }, { "epoch": 1.5286330049261085, "grad_norm": 0.08693026751279831, "learning_rate": 0.01, "loss": 2.0428, "step": 14895 }, { "epoch": 1.5289408866995073, "grad_norm": 0.04283773526549339, "learning_rate": 0.01, "loss": 2.068, "step": 14898 }, { "epoch": 1.5292487684729064, "grad_norm": 0.1044667437672615, "learning_rate": 0.01, "loss": 2.0583, "step": 14901 }, { "epoch": 1.5295566502463054, "grad_norm": 0.06316410005092621, "learning_rate": 0.01, "loss": 2.0372, "step": 14904 }, { "epoch": 1.5298645320197044, "grad_norm": 0.04687780514359474, "learning_rate": 0.01, "loss": 2.0575, "step": 14907 }, { "epoch": 1.5301724137931034, "grad_norm": 0.04785927012562752, "learning_rate": 0.01, "loss": 2.0435, "step": 14910 }, { "epoch": 1.5304802955665024, "grad_norm": 0.03788747265934944, "learning_rate": 0.01, "loss": 2.0483, "step": 14913 }, { "epoch": 1.5307881773399015, "grad_norm": 0.05618858337402344, "learning_rate": 0.01, "loss": 2.031, "step": 14916 }, { "epoch": 1.5310960591133005, "grad_norm": 0.10059016942977905, "learning_rate": 0.01, "loss": 2.0608, "step": 14919 }, { "epoch": 1.5314039408866995, "grad_norm": 0.06718064099550247, "learning_rate": 0.01, "loss": 2.0676, "step": 14922 }, { "epoch": 1.5317118226600985, "grad_norm": 0.09006398916244507, "learning_rate": 0.01, "loss": 2.059, "step": 14925 }, { "epoch": 1.5320197044334976, "grad_norm": 0.036577485501766205, "learning_rate": 0.01, "loss": 2.0425, "step": 14928 }, { "epoch": 1.5323275862068966, "grad_norm": 0.07979925721883774, "learning_rate": 0.01, "loss": 2.0387, "step": 14931 }, { "epoch": 1.5326354679802956, "grad_norm": 0.08286473900079727, "learning_rate": 0.01, "loss": 2.0523, "step": 14934 }, { "epoch": 1.5329433497536946, "grad_norm": 0.042596206068992615, "learning_rate": 0.01, "loss": 2.0692, "step": 14937 }, { "epoch": 1.5332512315270936, "grad_norm": 0.043695200234651566, "learning_rate": 0.01, "loss": 2.0429, "step": 14940 }, { "epoch": 1.5335591133004927, "grad_norm": 0.13041776418685913, "learning_rate": 0.01, "loss": 2.0966, "step": 14943 }, { "epoch": 1.5338669950738915, "grad_norm": 0.06076871603727341, "learning_rate": 0.01, "loss": 2.081, "step": 14946 }, { "epoch": 1.5341748768472905, "grad_norm": 0.08744147419929504, "learning_rate": 0.01, "loss": 2.0698, "step": 14949 }, { "epoch": 1.5344827586206895, "grad_norm": 0.041992440819740295, "learning_rate": 0.01, "loss": 2.0735, "step": 14952 }, { "epoch": 1.5347906403940885, "grad_norm": 0.0773782953619957, "learning_rate": 0.01, "loss": 2.0501, "step": 14955 }, { "epoch": 1.5350985221674875, "grad_norm": 0.04371657967567444, "learning_rate": 0.01, "loss": 2.043, "step": 14958 }, { "epoch": 1.5354064039408866, "grad_norm": 0.04753347858786583, "learning_rate": 0.01, "loss": 2.0399, "step": 14961 }, { "epoch": 1.5357142857142856, "grad_norm": 0.10644037276506424, "learning_rate": 0.01, "loss": 2.0838, "step": 14964 }, { "epoch": 1.5360221674876846, "grad_norm": 0.037067610770463943, "learning_rate": 0.01, "loss": 2.0677, "step": 14967 }, { "epoch": 1.5363300492610836, "grad_norm": 0.06745267659425735, "learning_rate": 0.01, "loss": 2.0464, "step": 14970 }, { "epoch": 1.5366379310344827, "grad_norm": 0.10053039342164993, "learning_rate": 0.01, "loss": 2.0696, "step": 14973 }, { "epoch": 1.5369458128078817, "grad_norm": 0.08785562962293625, "learning_rate": 0.01, "loss": 2.059, "step": 14976 }, { "epoch": 1.5372536945812807, "grad_norm": 0.12240536510944366, "learning_rate": 0.01, "loss": 2.0477, "step": 14979 }, { "epoch": 1.5375615763546797, "grad_norm": 0.08541588485240936, "learning_rate": 0.01, "loss": 2.0581, "step": 14982 }, { "epoch": 1.5378694581280787, "grad_norm": 0.0469081737101078, "learning_rate": 0.01, "loss": 2.0581, "step": 14985 }, { "epoch": 1.5381773399014778, "grad_norm": 0.04396476596593857, "learning_rate": 0.01, "loss": 2.0504, "step": 14988 }, { "epoch": 1.5384852216748768, "grad_norm": 0.033920448273420334, "learning_rate": 0.01, "loss": 2.0513, "step": 14991 }, { "epoch": 1.5387931034482758, "grad_norm": 0.035798102617263794, "learning_rate": 0.01, "loss": 2.0583, "step": 14994 }, { "epoch": 1.5391009852216748, "grad_norm": 0.030788132920861244, "learning_rate": 0.01, "loss": 2.0856, "step": 14997 }, { "epoch": 1.5394088669950738, "grad_norm": 0.06127138063311577, "learning_rate": 0.01, "loss": 2.0485, "step": 15000 }, { "epoch": 1.5397167487684729, "grad_norm": 0.07792042940855026, "learning_rate": 0.01, "loss": 2.0473, "step": 15003 }, { "epoch": 1.5400246305418719, "grad_norm": 0.1915716826915741, "learning_rate": 0.01, "loss": 2.06, "step": 15006 }, { "epoch": 1.540332512315271, "grad_norm": 0.13130734860897064, "learning_rate": 0.01, "loss": 2.0458, "step": 15009 }, { "epoch": 1.54064039408867, "grad_norm": 0.08750183880329132, "learning_rate": 0.01, "loss": 2.0569, "step": 15012 }, { "epoch": 1.540948275862069, "grad_norm": 0.0678631141781807, "learning_rate": 0.01, "loss": 2.0256, "step": 15015 }, { "epoch": 1.541256157635468, "grad_norm": 0.04352593049407005, "learning_rate": 0.01, "loss": 2.0541, "step": 15018 }, { "epoch": 1.541564039408867, "grad_norm": 0.059398628771305084, "learning_rate": 0.01, "loss": 2.0542, "step": 15021 }, { "epoch": 1.541871921182266, "grad_norm": 0.09179355949163437, "learning_rate": 0.01, "loss": 2.0623, "step": 15024 }, { "epoch": 1.542179802955665, "grad_norm": 0.08243024349212646, "learning_rate": 0.01, "loss": 2.0616, "step": 15027 }, { "epoch": 1.542487684729064, "grad_norm": 0.05103360861539841, "learning_rate": 0.01, "loss": 2.0498, "step": 15030 }, { "epoch": 1.542795566502463, "grad_norm": 0.04206395894289017, "learning_rate": 0.01, "loss": 2.0676, "step": 15033 }, { "epoch": 1.543103448275862, "grad_norm": 0.03659799322485924, "learning_rate": 0.01, "loss": 2.0376, "step": 15036 }, { "epoch": 1.5434113300492611, "grad_norm": 0.1279965043067932, "learning_rate": 0.01, "loss": 2.043, "step": 15039 }, { "epoch": 1.5437192118226601, "grad_norm": 0.09509512782096863, "learning_rate": 0.01, "loss": 2.0233, "step": 15042 }, { "epoch": 1.5440270935960592, "grad_norm": 0.07963217794895172, "learning_rate": 0.01, "loss": 2.0632, "step": 15045 }, { "epoch": 1.5443349753694582, "grad_norm": 0.06425557285547256, "learning_rate": 0.01, "loss": 2.0454, "step": 15048 }, { "epoch": 1.5446428571428572, "grad_norm": 0.1166144609451294, "learning_rate": 0.01, "loss": 2.0675, "step": 15051 }, { "epoch": 1.5449507389162562, "grad_norm": 0.0558270663022995, "learning_rate": 0.01, "loss": 2.0495, "step": 15054 }, { "epoch": 1.5452586206896552, "grad_norm": 0.05666494369506836, "learning_rate": 0.01, "loss": 2.0417, "step": 15057 }, { "epoch": 1.5455665024630543, "grad_norm": 0.048931702971458435, "learning_rate": 0.01, "loss": 2.0503, "step": 15060 }, { "epoch": 1.5458743842364533, "grad_norm": 0.10072410106658936, "learning_rate": 0.01, "loss": 2.0432, "step": 15063 }, { "epoch": 1.5461822660098523, "grad_norm": 0.06339754164218903, "learning_rate": 0.01, "loss": 2.048, "step": 15066 }, { "epoch": 1.5464901477832513, "grad_norm": 0.04913650080561638, "learning_rate": 0.01, "loss": 2.07, "step": 15069 }, { "epoch": 1.5467980295566504, "grad_norm": 0.1012924313545227, "learning_rate": 0.01, "loss": 2.0423, "step": 15072 }, { "epoch": 1.5471059113300494, "grad_norm": 0.048015668988227844, "learning_rate": 0.01, "loss": 2.0356, "step": 15075 }, { "epoch": 1.5474137931034484, "grad_norm": 0.09666754305362701, "learning_rate": 0.01, "loss": 2.0701, "step": 15078 }, { "epoch": 1.5477216748768474, "grad_norm": 0.07722094655036926, "learning_rate": 0.01, "loss": 2.0223, "step": 15081 }, { "epoch": 1.5480295566502464, "grad_norm": 0.06525082141160965, "learning_rate": 0.01, "loss": 2.056, "step": 15084 }, { "epoch": 1.5483374384236455, "grad_norm": 0.04979628697037697, "learning_rate": 0.01, "loss": 2.0662, "step": 15087 }, { "epoch": 1.5486453201970445, "grad_norm": 0.05903888866305351, "learning_rate": 0.01, "loss": 2.0551, "step": 15090 }, { "epoch": 1.5489532019704435, "grad_norm": 0.09098793566226959, "learning_rate": 0.01, "loss": 2.0758, "step": 15093 }, { "epoch": 1.5492610837438425, "grad_norm": 0.08262350410223007, "learning_rate": 0.01, "loss": 2.0548, "step": 15096 }, { "epoch": 1.5495689655172413, "grad_norm": 0.057414278388023376, "learning_rate": 0.01, "loss": 2.0887, "step": 15099 }, { "epoch": 1.5498768472906403, "grad_norm": 0.06718642264604568, "learning_rate": 0.01, "loss": 2.0731, "step": 15102 }, { "epoch": 1.5501847290640394, "grad_norm": 0.07351098954677582, "learning_rate": 0.01, "loss": 2.0589, "step": 15105 }, { "epoch": 1.5504926108374384, "grad_norm": 0.03318174555897713, "learning_rate": 0.01, "loss": 2.0545, "step": 15108 }, { "epoch": 1.5508004926108374, "grad_norm": 0.11198091506958008, "learning_rate": 0.01, "loss": 2.0306, "step": 15111 }, { "epoch": 1.5511083743842364, "grad_norm": 0.056512147188186646, "learning_rate": 0.01, "loss": 2.0724, "step": 15114 }, { "epoch": 1.5514162561576355, "grad_norm": 0.08460327982902527, "learning_rate": 0.01, "loss": 2.0537, "step": 15117 }, { "epoch": 1.5517241379310345, "grad_norm": 0.08536583930253983, "learning_rate": 0.01, "loss": 2.0696, "step": 15120 }, { "epoch": 1.5520320197044335, "grad_norm": 0.10857357084751129, "learning_rate": 0.01, "loss": 2.0645, "step": 15123 }, { "epoch": 1.5523399014778325, "grad_norm": 0.04923904314637184, "learning_rate": 0.01, "loss": 2.0339, "step": 15126 }, { "epoch": 1.5526477832512315, "grad_norm": 0.05313669145107269, "learning_rate": 0.01, "loss": 2.044, "step": 15129 }, { "epoch": 1.5529556650246306, "grad_norm": 0.058348409831523895, "learning_rate": 0.01, "loss": 2.0264, "step": 15132 }, { "epoch": 1.5532635467980296, "grad_norm": 0.04621830955147743, "learning_rate": 0.01, "loss": 2.0543, "step": 15135 }, { "epoch": 1.5535714285714286, "grad_norm": 0.079473577439785, "learning_rate": 0.01, "loss": 2.0494, "step": 15138 }, { "epoch": 1.5538793103448276, "grad_norm": 0.10176654160022736, "learning_rate": 0.01, "loss": 2.0569, "step": 15141 }, { "epoch": 1.5541871921182266, "grad_norm": 0.048997897654771805, "learning_rate": 0.01, "loss": 2.0832, "step": 15144 }, { "epoch": 1.5544950738916257, "grad_norm": 0.06281887739896774, "learning_rate": 0.01, "loss": 2.0446, "step": 15147 }, { "epoch": 1.5548029556650245, "grad_norm": 0.07801464200019836, "learning_rate": 0.01, "loss": 2.0608, "step": 15150 }, { "epoch": 1.5551108374384235, "grad_norm": 0.08535271883010864, "learning_rate": 0.01, "loss": 2.0597, "step": 15153 }, { "epoch": 1.5554187192118225, "grad_norm": 0.06536438316106796, "learning_rate": 0.01, "loss": 2.0491, "step": 15156 }, { "epoch": 1.5557266009852215, "grad_norm": 0.045388113707304, "learning_rate": 0.01, "loss": 2.0243, "step": 15159 }, { "epoch": 1.5560344827586206, "grad_norm": 0.041893068701028824, "learning_rate": 0.01, "loss": 2.0373, "step": 15162 }, { "epoch": 1.5563423645320196, "grad_norm": 0.04261694848537445, "learning_rate": 0.01, "loss": 2.028, "step": 15165 }, { "epoch": 1.5566502463054186, "grad_norm": 0.03123985230922699, "learning_rate": 0.01, "loss": 2.0653, "step": 15168 }, { "epoch": 1.5569581280788176, "grad_norm": 0.048562515527009964, "learning_rate": 0.01, "loss": 2.0249, "step": 15171 }, { "epoch": 1.5572660098522166, "grad_norm": 0.1343316286802292, "learning_rate": 0.01, "loss": 2.062, "step": 15174 }, { "epoch": 1.5575738916256157, "grad_norm": 0.10992839932441711, "learning_rate": 0.01, "loss": 2.0374, "step": 15177 }, { "epoch": 1.5578817733990147, "grad_norm": 0.09098651260137558, "learning_rate": 0.01, "loss": 2.0394, "step": 15180 }, { "epoch": 1.5581896551724137, "grad_norm": 0.05405926704406738, "learning_rate": 0.01, "loss": 2.0491, "step": 15183 }, { "epoch": 1.5584975369458127, "grad_norm": 0.04776093736290932, "learning_rate": 0.01, "loss": 2.0775, "step": 15186 }, { "epoch": 1.5588054187192117, "grad_norm": 0.04614724963903427, "learning_rate": 0.01, "loss": 2.0339, "step": 15189 }, { "epoch": 1.5591133004926108, "grad_norm": 0.05032865330576897, "learning_rate": 0.01, "loss": 2.0516, "step": 15192 }, { "epoch": 1.5594211822660098, "grad_norm": 0.051392171531915665, "learning_rate": 0.01, "loss": 2.0405, "step": 15195 }, { "epoch": 1.5597290640394088, "grad_norm": 0.10255944728851318, "learning_rate": 0.01, "loss": 2.0615, "step": 15198 }, { "epoch": 1.5600369458128078, "grad_norm": 0.04222560301423073, "learning_rate": 0.01, "loss": 2.0527, "step": 15201 }, { "epoch": 1.5603448275862069, "grad_norm": 0.045385826379060745, "learning_rate": 0.01, "loss": 2.0556, "step": 15204 }, { "epoch": 1.5606527093596059, "grad_norm": 0.04241577908396721, "learning_rate": 0.01, "loss": 2.0727, "step": 15207 }, { "epoch": 1.560960591133005, "grad_norm": 0.12387125194072723, "learning_rate": 0.01, "loss": 2.0337, "step": 15210 }, { "epoch": 1.561268472906404, "grad_norm": 0.14704599976539612, "learning_rate": 0.01, "loss": 2.0602, "step": 15213 }, { "epoch": 1.561576354679803, "grad_norm": 0.049915559589862823, "learning_rate": 0.01, "loss": 2.0203, "step": 15216 }, { "epoch": 1.561884236453202, "grad_norm": 0.057630617171525955, "learning_rate": 0.01, "loss": 2.0503, "step": 15219 }, { "epoch": 1.562192118226601, "grad_norm": 0.04765351861715317, "learning_rate": 0.01, "loss": 2.0763, "step": 15222 }, { "epoch": 1.5625, "grad_norm": 0.06213679164648056, "learning_rate": 0.01, "loss": 2.0831, "step": 15225 }, { "epoch": 1.562807881773399, "grad_norm": 0.07990710437297821, "learning_rate": 0.01, "loss": 2.0458, "step": 15228 }, { "epoch": 1.563115763546798, "grad_norm": 0.0683673620223999, "learning_rate": 0.01, "loss": 2.0294, "step": 15231 }, { "epoch": 1.563423645320197, "grad_norm": 0.12503905594348907, "learning_rate": 0.01, "loss": 2.0543, "step": 15234 }, { "epoch": 1.563731527093596, "grad_norm": 0.03973531350493431, "learning_rate": 0.01, "loss": 2.0474, "step": 15237 }, { "epoch": 1.564039408866995, "grad_norm": 0.07055282592773438, "learning_rate": 0.01, "loss": 2.0607, "step": 15240 }, { "epoch": 1.5643472906403941, "grad_norm": 0.06088467687368393, "learning_rate": 0.01, "loss": 2.0643, "step": 15243 }, { "epoch": 1.5646551724137931, "grad_norm": 0.06393450498580933, "learning_rate": 0.01, "loss": 2.0369, "step": 15246 }, { "epoch": 1.5649630541871922, "grad_norm": 0.08600255101919174, "learning_rate": 0.01, "loss": 2.072, "step": 15249 }, { "epoch": 1.5652709359605912, "grad_norm": 0.07075429707765579, "learning_rate": 0.01, "loss": 2.0509, "step": 15252 }, { "epoch": 1.5655788177339902, "grad_norm": 0.058057527989149094, "learning_rate": 0.01, "loss": 2.0373, "step": 15255 }, { "epoch": 1.5658866995073892, "grad_norm": 0.04670482128858566, "learning_rate": 0.01, "loss": 2.0447, "step": 15258 }, { "epoch": 1.5661945812807883, "grad_norm": 0.08971681445837021, "learning_rate": 0.01, "loss": 2.0599, "step": 15261 }, { "epoch": 1.5665024630541873, "grad_norm": 0.12580984830856323, "learning_rate": 0.01, "loss": 2.0402, "step": 15264 }, { "epoch": 1.5668103448275863, "grad_norm": 0.05133863538503647, "learning_rate": 0.01, "loss": 2.056, "step": 15267 }, { "epoch": 1.5671182266009853, "grad_norm": 0.07821512222290039, "learning_rate": 0.01, "loss": 2.0458, "step": 15270 }, { "epoch": 1.5674261083743843, "grad_norm": 0.07024712860584259, "learning_rate": 0.01, "loss": 2.0496, "step": 15273 }, { "epoch": 1.5677339901477834, "grad_norm": 0.09332927316427231, "learning_rate": 0.01, "loss": 2.0358, "step": 15276 }, { "epoch": 1.5680418719211824, "grad_norm": 0.06875135749578476, "learning_rate": 0.01, "loss": 2.0459, "step": 15279 }, { "epoch": 1.5683497536945814, "grad_norm": 0.08868546783924103, "learning_rate": 0.01, "loss": 2.0641, "step": 15282 }, { "epoch": 1.5686576354679804, "grad_norm": 0.07729046791791916, "learning_rate": 0.01, "loss": 2.0333, "step": 15285 }, { "epoch": 1.5689655172413794, "grad_norm": 0.07686775177717209, "learning_rate": 0.01, "loss": 2.0446, "step": 15288 }, { "epoch": 1.5692733990147785, "grad_norm": 0.0839667096734047, "learning_rate": 0.01, "loss": 2.0428, "step": 15291 }, { "epoch": 1.5695812807881775, "grad_norm": 0.0704370066523552, "learning_rate": 0.01, "loss": 2.0638, "step": 15294 }, { "epoch": 1.5698891625615765, "grad_norm": 0.05312497168779373, "learning_rate": 0.01, "loss": 2.029, "step": 15297 }, { "epoch": 1.5701970443349755, "grad_norm": 0.049166906625032425, "learning_rate": 0.01, "loss": 2.0544, "step": 15300 }, { "epoch": 1.5705049261083743, "grad_norm": 0.041398897767066956, "learning_rate": 0.01, "loss": 2.0652, "step": 15303 }, { "epoch": 1.5708128078817734, "grad_norm": 0.08617027848958969, "learning_rate": 0.01, "loss": 2.0675, "step": 15306 }, { "epoch": 1.5711206896551724, "grad_norm": 0.0348927266895771, "learning_rate": 0.01, "loss": 2.043, "step": 15309 }, { "epoch": 1.5714285714285714, "grad_norm": 0.060787077993154526, "learning_rate": 0.01, "loss": 2.0439, "step": 15312 }, { "epoch": 1.5717364532019704, "grad_norm": 0.050898227840662, "learning_rate": 0.01, "loss": 2.0475, "step": 15315 }, { "epoch": 1.5720443349753694, "grad_norm": 0.04594309255480766, "learning_rate": 0.01, "loss": 2.0352, "step": 15318 }, { "epoch": 1.5723522167487685, "grad_norm": 0.1161418929696083, "learning_rate": 0.01, "loss": 2.0232, "step": 15321 }, { "epoch": 1.5726600985221675, "grad_norm": 0.05419136583805084, "learning_rate": 0.01, "loss": 2.0417, "step": 15324 }, { "epoch": 1.5729679802955665, "grad_norm": 0.07661257684230804, "learning_rate": 0.01, "loss": 2.0548, "step": 15327 }, { "epoch": 1.5732758620689655, "grad_norm": 0.09760436415672302, "learning_rate": 0.01, "loss": 2.0539, "step": 15330 }, { "epoch": 1.5735837438423645, "grad_norm": 0.07211121916770935, "learning_rate": 0.01, "loss": 2.037, "step": 15333 }, { "epoch": 1.5738916256157636, "grad_norm": 0.08360971510410309, "learning_rate": 0.01, "loss": 2.0465, "step": 15336 }, { "epoch": 1.5741995073891626, "grad_norm": 0.05901337414979935, "learning_rate": 0.01, "loss": 2.0581, "step": 15339 }, { "epoch": 1.5745073891625616, "grad_norm": 0.07543767988681793, "learning_rate": 0.01, "loss": 2.0437, "step": 15342 }, { "epoch": 1.5748152709359606, "grad_norm": 0.04725690186023712, "learning_rate": 0.01, "loss": 2.0751, "step": 15345 }, { "epoch": 1.5751231527093597, "grad_norm": 0.051067035645246506, "learning_rate": 0.01, "loss": 2.0434, "step": 15348 }, { "epoch": 1.5754310344827587, "grad_norm": 0.03145357966423035, "learning_rate": 0.01, "loss": 2.0511, "step": 15351 }, { "epoch": 1.5757389162561575, "grad_norm": 0.09291981905698776, "learning_rate": 0.01, "loss": 2.0604, "step": 15354 }, { "epoch": 1.5760467980295565, "grad_norm": 0.03409574180841446, "learning_rate": 0.01, "loss": 2.0052, "step": 15357 }, { "epoch": 1.5763546798029555, "grad_norm": 0.045993223786354065, "learning_rate": 0.01, "loss": 2.0442, "step": 15360 }, { "epoch": 1.5766625615763545, "grad_norm": 0.03174202889204025, "learning_rate": 0.01, "loss": 2.0525, "step": 15363 }, { "epoch": 1.5769704433497536, "grad_norm": 0.057013627141714096, "learning_rate": 0.01, "loss": 2.0699, "step": 15366 }, { "epoch": 1.5772783251231526, "grad_norm": 0.05778640881180763, "learning_rate": 0.01, "loss": 2.0641, "step": 15369 }, { "epoch": 1.5775862068965516, "grad_norm": 0.11259103566408157, "learning_rate": 0.01, "loss": 2.0496, "step": 15372 }, { "epoch": 1.5778940886699506, "grad_norm": 0.05728684365749359, "learning_rate": 0.01, "loss": 2.0646, "step": 15375 }, { "epoch": 1.5782019704433496, "grad_norm": 0.07037964463233948, "learning_rate": 0.01, "loss": 2.072, "step": 15378 }, { "epoch": 1.5785098522167487, "grad_norm": 0.05147834122180939, "learning_rate": 0.01, "loss": 2.0339, "step": 15381 }, { "epoch": 1.5788177339901477, "grad_norm": 0.0663742870092392, "learning_rate": 0.01, "loss": 2.0534, "step": 15384 }, { "epoch": 1.5791256157635467, "grad_norm": 0.04665178433060646, "learning_rate": 0.01, "loss": 2.0698, "step": 15387 }, { "epoch": 1.5794334975369457, "grad_norm": 0.037410903722047806, "learning_rate": 0.01, "loss": 2.0529, "step": 15390 }, { "epoch": 1.5797413793103448, "grad_norm": 0.03849703446030617, "learning_rate": 0.01, "loss": 2.0484, "step": 15393 }, { "epoch": 1.5800492610837438, "grad_norm": 0.10119396448135376, "learning_rate": 0.01, "loss": 2.0752, "step": 15396 }, { "epoch": 1.5803571428571428, "grad_norm": 0.03294401615858078, "learning_rate": 0.01, "loss": 2.0218, "step": 15399 }, { "epoch": 1.5806650246305418, "grad_norm": 0.06310781091451645, "learning_rate": 0.01, "loss": 2.0311, "step": 15402 }, { "epoch": 1.5809729064039408, "grad_norm": 0.050191253423690796, "learning_rate": 0.01, "loss": 2.0408, "step": 15405 }, { "epoch": 1.5812807881773399, "grad_norm": 0.09952792525291443, "learning_rate": 0.01, "loss": 2.0622, "step": 15408 }, { "epoch": 1.5815886699507389, "grad_norm": 0.12618017196655273, "learning_rate": 0.01, "loss": 2.0696, "step": 15411 }, { "epoch": 1.581896551724138, "grad_norm": 0.19296571612358093, "learning_rate": 0.01, "loss": 2.072, "step": 15414 }, { "epoch": 1.582204433497537, "grad_norm": 0.1124732494354248, "learning_rate": 0.01, "loss": 2.0451, "step": 15417 }, { "epoch": 1.582512315270936, "grad_norm": 0.06556060910224915, "learning_rate": 0.01, "loss": 2.0325, "step": 15420 }, { "epoch": 1.582820197044335, "grad_norm": 0.05607735365629196, "learning_rate": 0.01, "loss": 2.0663, "step": 15423 }, { "epoch": 1.583128078817734, "grad_norm": 0.04731295630335808, "learning_rate": 0.01, "loss": 2.0559, "step": 15426 }, { "epoch": 1.583435960591133, "grad_norm": 0.060452669858932495, "learning_rate": 0.01, "loss": 2.0498, "step": 15429 }, { "epoch": 1.583743842364532, "grad_norm": 0.056996360421180725, "learning_rate": 0.01, "loss": 2.0752, "step": 15432 }, { "epoch": 1.584051724137931, "grad_norm": 0.11382313817739487, "learning_rate": 0.01, "loss": 2.0723, "step": 15435 }, { "epoch": 1.58435960591133, "grad_norm": 0.05176008865237236, "learning_rate": 0.01, "loss": 2.0706, "step": 15438 }, { "epoch": 1.584667487684729, "grad_norm": 0.05329972878098488, "learning_rate": 0.01, "loss": 2.0473, "step": 15441 }, { "epoch": 1.5849753694581281, "grad_norm": 0.05416284501552582, "learning_rate": 0.01, "loss": 2.0585, "step": 15444 }, { "epoch": 1.5852832512315271, "grad_norm": 0.052267350256443024, "learning_rate": 0.01, "loss": 2.0314, "step": 15447 }, { "epoch": 1.5855911330049262, "grad_norm": 0.1025579646229744, "learning_rate": 0.01, "loss": 2.0412, "step": 15450 }, { "epoch": 1.5858990147783252, "grad_norm": 0.06416940689086914, "learning_rate": 0.01, "loss": 2.0312, "step": 15453 }, { "epoch": 1.5862068965517242, "grad_norm": 0.05699596926569939, "learning_rate": 0.01, "loss": 2.0678, "step": 15456 }, { "epoch": 1.5865147783251232, "grad_norm": 0.036711812019348145, "learning_rate": 0.01, "loss": 2.0251, "step": 15459 }, { "epoch": 1.5868226600985222, "grad_norm": 0.1025582030415535, "learning_rate": 0.01, "loss": 2.0387, "step": 15462 }, { "epoch": 1.5871305418719213, "grad_norm": 0.03923096880316734, "learning_rate": 0.01, "loss": 2.067, "step": 15465 }, { "epoch": 1.5874384236453203, "grad_norm": 0.08267144113779068, "learning_rate": 0.01, "loss": 2.0583, "step": 15468 }, { "epoch": 1.5877463054187193, "grad_norm": 0.15374930202960968, "learning_rate": 0.01, "loss": 2.0384, "step": 15471 }, { "epoch": 1.5880541871921183, "grad_norm": 0.10246127098798752, "learning_rate": 0.01, "loss": 2.0696, "step": 15474 }, { "epoch": 1.5883620689655173, "grad_norm": 0.09784136712551117, "learning_rate": 0.01, "loss": 2.0546, "step": 15477 }, { "epoch": 1.5886699507389164, "grad_norm": 0.08747786283493042, "learning_rate": 0.01, "loss": 2.0754, "step": 15480 }, { "epoch": 1.5889778325123154, "grad_norm": 0.0755406990647316, "learning_rate": 0.01, "loss": 2.0477, "step": 15483 }, { "epoch": 1.5892857142857144, "grad_norm": 0.05593521520495415, "learning_rate": 0.01, "loss": 2.0485, "step": 15486 }, { "epoch": 1.5895935960591134, "grad_norm": 0.04462866857647896, "learning_rate": 0.01, "loss": 2.056, "step": 15489 }, { "epoch": 1.5899014778325125, "grad_norm": 0.040571678429841995, "learning_rate": 0.01, "loss": 2.0906, "step": 15492 }, { "epoch": 1.5902093596059115, "grad_norm": 0.038100458681583405, "learning_rate": 0.01, "loss": 2.0735, "step": 15495 }, { "epoch": 1.5905172413793105, "grad_norm": 0.04336906597018242, "learning_rate": 0.01, "loss": 2.0716, "step": 15498 }, { "epoch": 1.5908251231527095, "grad_norm": 0.1424836665391922, "learning_rate": 0.01, "loss": 2.0424, "step": 15501 }, { "epoch": 1.5911330049261085, "grad_norm": 0.09235331416130066, "learning_rate": 0.01, "loss": 2.0461, "step": 15504 }, { "epoch": 1.5914408866995073, "grad_norm": 0.07932816445827484, "learning_rate": 0.01, "loss": 2.0595, "step": 15507 }, { "epoch": 1.5917487684729064, "grad_norm": 0.057297565042972565, "learning_rate": 0.01, "loss": 2.0407, "step": 15510 }, { "epoch": 1.5920566502463054, "grad_norm": 0.05323542281985283, "learning_rate": 0.01, "loss": 2.0568, "step": 15513 }, { "epoch": 1.5923645320197044, "grad_norm": 0.039465416222810745, "learning_rate": 0.01, "loss": 2.0455, "step": 15516 }, { "epoch": 1.5926724137931034, "grad_norm": 0.044614970684051514, "learning_rate": 0.01, "loss": 2.0784, "step": 15519 }, { "epoch": 1.5929802955665024, "grad_norm": 0.044074248522520065, "learning_rate": 0.01, "loss": 2.0391, "step": 15522 }, { "epoch": 1.5932881773399015, "grad_norm": 0.04098647087812424, "learning_rate": 0.01, "loss": 2.0512, "step": 15525 }, { "epoch": 1.5935960591133005, "grad_norm": 0.18400810658931732, "learning_rate": 0.01, "loss": 2.0623, "step": 15528 }, { "epoch": 1.5939039408866995, "grad_norm": 0.10264160484075546, "learning_rate": 0.01, "loss": 2.0546, "step": 15531 }, { "epoch": 1.5942118226600985, "grad_norm": 0.10086511820554733, "learning_rate": 0.01, "loss": 2.0671, "step": 15534 }, { "epoch": 1.5945197044334976, "grad_norm": 0.03823179379105568, "learning_rate": 0.01, "loss": 2.0496, "step": 15537 }, { "epoch": 1.5948275862068966, "grad_norm": 0.0449577271938324, "learning_rate": 0.01, "loss": 2.0635, "step": 15540 }, { "epoch": 1.5951354679802956, "grad_norm": 0.04791559278964996, "learning_rate": 0.01, "loss": 2.0596, "step": 15543 }, { "epoch": 1.5954433497536946, "grad_norm": 0.04523475095629692, "learning_rate": 0.01, "loss": 2.0457, "step": 15546 }, { "epoch": 1.5957512315270936, "grad_norm": 0.10654012113809586, "learning_rate": 0.01, "loss": 2.0172, "step": 15549 }, { "epoch": 1.5960591133004927, "grad_norm": 0.06602972745895386, "learning_rate": 0.01, "loss": 2.0565, "step": 15552 }, { "epoch": 1.5963669950738915, "grad_norm": 0.10605626553297043, "learning_rate": 0.01, "loss": 2.0792, "step": 15555 }, { "epoch": 1.5966748768472905, "grad_norm": 0.05995124578475952, "learning_rate": 0.01, "loss": 2.0367, "step": 15558 }, { "epoch": 1.5969827586206895, "grad_norm": 0.05426995828747749, "learning_rate": 0.01, "loss": 2.0458, "step": 15561 }, { "epoch": 1.5972906403940885, "grad_norm": 0.08749561756849289, "learning_rate": 0.01, "loss": 2.0509, "step": 15564 }, { "epoch": 1.5975985221674875, "grad_norm": 0.0735105574131012, "learning_rate": 0.01, "loss": 2.0548, "step": 15567 }, { "epoch": 1.5979064039408866, "grad_norm": 0.05417585372924805, "learning_rate": 0.01, "loss": 2.0304, "step": 15570 }, { "epoch": 1.5982142857142856, "grad_norm": 0.04170646145939827, "learning_rate": 0.01, "loss": 2.0751, "step": 15573 }, { "epoch": 1.5985221674876846, "grad_norm": 0.05886775627732277, "learning_rate": 0.01, "loss": 2.0419, "step": 15576 }, { "epoch": 1.5988300492610836, "grad_norm": 0.04231201857328415, "learning_rate": 0.01, "loss": 2.0531, "step": 15579 }, { "epoch": 1.5991379310344827, "grad_norm": 0.06620585918426514, "learning_rate": 0.01, "loss": 2.043, "step": 15582 }, { "epoch": 1.5994458128078817, "grad_norm": 0.10536913573741913, "learning_rate": 0.01, "loss": 2.0327, "step": 15585 }, { "epoch": 1.5997536945812807, "grad_norm": 0.14467884600162506, "learning_rate": 0.01, "loss": 2.0491, "step": 15588 }, { "epoch": 1.6000615763546797, "grad_norm": 0.11715273559093475, "learning_rate": 0.01, "loss": 2.0362, "step": 15591 }, { "epoch": 1.6003694581280787, "grad_norm": 0.04978121817111969, "learning_rate": 0.01, "loss": 2.0444, "step": 15594 }, { "epoch": 1.6006773399014778, "grad_norm": 0.06248803436756134, "learning_rate": 0.01, "loss": 2.0527, "step": 15597 }, { "epoch": 1.6009852216748768, "grad_norm": 0.05408048257231712, "learning_rate": 0.01, "loss": 2.0519, "step": 15600 }, { "epoch": 1.6012931034482758, "grad_norm": 0.05805948004126549, "learning_rate": 0.01, "loss": 2.0539, "step": 15603 }, { "epoch": 1.6016009852216748, "grad_norm": 0.03809194639325142, "learning_rate": 0.01, "loss": 2.0515, "step": 15606 }, { "epoch": 1.6019088669950738, "grad_norm": 0.07981141656637192, "learning_rate": 0.01, "loss": 2.0238, "step": 15609 }, { "epoch": 1.6022167487684729, "grad_norm": 0.04769575223326683, "learning_rate": 0.01, "loss": 2.0703, "step": 15612 }, { "epoch": 1.6025246305418719, "grad_norm": 0.09913644194602966, "learning_rate": 0.01, "loss": 2.074, "step": 15615 }, { "epoch": 1.602832512315271, "grad_norm": 0.12298569083213806, "learning_rate": 0.01, "loss": 2.0662, "step": 15618 }, { "epoch": 1.60314039408867, "grad_norm": 0.0525309219956398, "learning_rate": 0.01, "loss": 2.0411, "step": 15621 }, { "epoch": 1.603448275862069, "grad_norm": 0.07430320978164673, "learning_rate": 0.01, "loss": 2.0465, "step": 15624 }, { "epoch": 1.603756157635468, "grad_norm": 0.036753058433532715, "learning_rate": 0.01, "loss": 2.0408, "step": 15627 }, { "epoch": 1.604064039408867, "grad_norm": 0.04560523107647896, "learning_rate": 0.01, "loss": 2.074, "step": 15630 }, { "epoch": 1.604371921182266, "grad_norm": 0.07089810073375702, "learning_rate": 0.01, "loss": 2.0599, "step": 15633 }, { "epoch": 1.604679802955665, "grad_norm": 0.10833004862070084, "learning_rate": 0.01, "loss": 2.066, "step": 15636 }, { "epoch": 1.604987684729064, "grad_norm": 0.06033416837453842, "learning_rate": 0.01, "loss": 2.0893, "step": 15639 }, { "epoch": 1.605295566502463, "grad_norm": 0.06819162517786026, "learning_rate": 0.01, "loss": 2.0432, "step": 15642 }, { "epoch": 1.605603448275862, "grad_norm": 0.08949002623558044, "learning_rate": 0.01, "loss": 2.0891, "step": 15645 }, { "epoch": 1.6059113300492611, "grad_norm": 0.04749004542827606, "learning_rate": 0.01, "loss": 2.0434, "step": 15648 }, { "epoch": 1.6062192118226601, "grad_norm": 0.06903103739023209, "learning_rate": 0.01, "loss": 2.0379, "step": 15651 }, { "epoch": 1.6065270935960592, "grad_norm": 0.10074819624423981, "learning_rate": 0.01, "loss": 2.0657, "step": 15654 }, { "epoch": 1.6068349753694582, "grad_norm": 0.0390753298997879, "learning_rate": 0.01, "loss": 2.0186, "step": 15657 }, { "epoch": 1.6071428571428572, "grad_norm": 0.04776669666171074, "learning_rate": 0.01, "loss": 2.0435, "step": 15660 }, { "epoch": 1.6074507389162562, "grad_norm": 0.1191340908408165, "learning_rate": 0.01, "loss": 2.0519, "step": 15663 }, { "epoch": 1.6077586206896552, "grad_norm": 0.08326657861471176, "learning_rate": 0.01, "loss": 2.0443, "step": 15666 }, { "epoch": 1.6080665024630543, "grad_norm": 0.044734589755535126, "learning_rate": 0.01, "loss": 2.0626, "step": 15669 }, { "epoch": 1.6083743842364533, "grad_norm": 0.047262392938137054, "learning_rate": 0.01, "loss": 2.0351, "step": 15672 }, { "epoch": 1.6086822660098523, "grad_norm": 0.0908563882112503, "learning_rate": 0.01, "loss": 2.0655, "step": 15675 }, { "epoch": 1.6089901477832513, "grad_norm": 0.06681264191865921, "learning_rate": 0.01, "loss": 2.0606, "step": 15678 }, { "epoch": 1.6092980295566504, "grad_norm": 0.09569018334150314, "learning_rate": 0.01, "loss": 2.0454, "step": 15681 }, { "epoch": 1.6096059113300494, "grad_norm": 0.04303963482379913, "learning_rate": 0.01, "loss": 2.0296, "step": 15684 }, { "epoch": 1.6099137931034484, "grad_norm": 0.09924867749214172, "learning_rate": 0.01, "loss": 2.0578, "step": 15687 }, { "epoch": 1.6102216748768474, "grad_norm": 0.041328392922878265, "learning_rate": 0.01, "loss": 2.0538, "step": 15690 }, { "epoch": 1.6105295566502464, "grad_norm": 0.056367840617895126, "learning_rate": 0.01, "loss": 2.0746, "step": 15693 }, { "epoch": 1.6108374384236455, "grad_norm": 0.06074264645576477, "learning_rate": 0.01, "loss": 2.0707, "step": 15696 }, { "epoch": 1.6111453201970445, "grad_norm": 0.06541740894317627, "learning_rate": 0.01, "loss": 2.0615, "step": 15699 }, { "epoch": 1.6114532019704435, "grad_norm": 0.06279835850000381, "learning_rate": 0.01, "loss": 2.0484, "step": 15702 }, { "epoch": 1.6117610837438425, "grad_norm": 0.03825109452009201, "learning_rate": 0.01, "loss": 2.0317, "step": 15705 }, { "epoch": 1.6120689655172413, "grad_norm": 0.03792817145586014, "learning_rate": 0.01, "loss": 2.0246, "step": 15708 }, { "epoch": 1.6123768472906403, "grad_norm": 0.05229473114013672, "learning_rate": 0.01, "loss": 2.0765, "step": 15711 }, { "epoch": 1.6126847290640394, "grad_norm": 0.11285384744405746, "learning_rate": 0.01, "loss": 2.0392, "step": 15714 }, { "epoch": 1.6129926108374384, "grad_norm": 0.07333546876907349, "learning_rate": 0.01, "loss": 2.05, "step": 15717 }, { "epoch": 1.6133004926108374, "grad_norm": 0.07698936760425568, "learning_rate": 0.01, "loss": 2.0757, "step": 15720 }, { "epoch": 1.6136083743842364, "grad_norm": 0.06517963856458664, "learning_rate": 0.01, "loss": 2.032, "step": 15723 }, { "epoch": 1.6139162561576355, "grad_norm": 0.07242800295352936, "learning_rate": 0.01, "loss": 2.0398, "step": 15726 }, { "epoch": 1.6142241379310345, "grad_norm": 0.03956649452447891, "learning_rate": 0.01, "loss": 2.0437, "step": 15729 }, { "epoch": 1.6145320197044335, "grad_norm": 0.10249898582696915, "learning_rate": 0.01, "loss": 2.0716, "step": 15732 }, { "epoch": 1.6148399014778325, "grad_norm": 0.09716839343309402, "learning_rate": 0.01, "loss": 2.0288, "step": 15735 }, { "epoch": 1.6151477832512315, "grad_norm": 0.0809134840965271, "learning_rate": 0.01, "loss": 2.0553, "step": 15738 }, { "epoch": 1.6154556650246306, "grad_norm": 0.07891330122947693, "learning_rate": 0.01, "loss": 2.0622, "step": 15741 }, { "epoch": 1.6157635467980296, "grad_norm": 0.06289231032133102, "learning_rate": 0.01, "loss": 2.0382, "step": 15744 }, { "epoch": 1.6160714285714286, "grad_norm": 0.032251689583063126, "learning_rate": 0.01, "loss": 2.049, "step": 15747 }, { "epoch": 1.6163793103448276, "grad_norm": 0.032203931361436844, "learning_rate": 0.01, "loss": 2.0474, "step": 15750 }, { "epoch": 1.6166871921182266, "grad_norm": 0.042572617530822754, "learning_rate": 0.01, "loss": 2.0604, "step": 15753 }, { "epoch": 1.6169950738916257, "grad_norm": 0.06869769096374512, "learning_rate": 0.01, "loss": 2.0643, "step": 15756 }, { "epoch": 1.6173029556650245, "grad_norm": 0.09649953991174698, "learning_rate": 0.01, "loss": 2.0811, "step": 15759 }, { "epoch": 1.6176108374384235, "grad_norm": 0.060255225747823715, "learning_rate": 0.01, "loss": 2.0543, "step": 15762 }, { "epoch": 1.6179187192118225, "grad_norm": 0.0548517182469368, "learning_rate": 0.01, "loss": 2.0427, "step": 15765 }, { "epoch": 1.6182266009852215, "grad_norm": 0.09392546862363815, "learning_rate": 0.01, "loss": 2.0722, "step": 15768 }, { "epoch": 1.6185344827586206, "grad_norm": 0.052100926637649536, "learning_rate": 0.01, "loss": 2.0469, "step": 15771 }, { "epoch": 1.6188423645320196, "grad_norm": 0.05099212005734444, "learning_rate": 0.01, "loss": 2.0589, "step": 15774 }, { "epoch": 1.6191502463054186, "grad_norm": 0.0486266165971756, "learning_rate": 0.01, "loss": 2.0308, "step": 15777 }, { "epoch": 1.6194581280788176, "grad_norm": 0.044072605669498444, "learning_rate": 0.01, "loss": 2.0877, "step": 15780 }, { "epoch": 1.6197660098522166, "grad_norm": 0.09196856617927551, "learning_rate": 0.01, "loss": 2.0224, "step": 15783 }, { "epoch": 1.6200738916256157, "grad_norm": 0.05948984995484352, "learning_rate": 0.01, "loss": 2.0425, "step": 15786 }, { "epoch": 1.6203817733990147, "grad_norm": 0.043075162917375565, "learning_rate": 0.01, "loss": 2.058, "step": 15789 }, { "epoch": 1.6206896551724137, "grad_norm": 0.06739038228988647, "learning_rate": 0.01, "loss": 2.0356, "step": 15792 }, { "epoch": 1.6209975369458127, "grad_norm": 0.05961238220334053, "learning_rate": 0.01, "loss": 2.0492, "step": 15795 }, { "epoch": 1.6213054187192117, "grad_norm": 0.06527238339185715, "learning_rate": 0.01, "loss": 2.0227, "step": 15798 }, { "epoch": 1.6216133004926108, "grad_norm": 0.09234929084777832, "learning_rate": 0.01, "loss": 2.039, "step": 15801 }, { "epoch": 1.6219211822660098, "grad_norm": 0.08050446212291718, "learning_rate": 0.01, "loss": 2.0769, "step": 15804 }, { "epoch": 1.6222290640394088, "grad_norm": 0.06419754028320312, "learning_rate": 0.01, "loss": 2.0613, "step": 15807 }, { "epoch": 1.6225369458128078, "grad_norm": 0.06302323937416077, "learning_rate": 0.01, "loss": 2.0535, "step": 15810 }, { "epoch": 1.6228448275862069, "grad_norm": 0.051602717489004135, "learning_rate": 0.01, "loss": 2.0462, "step": 15813 }, { "epoch": 1.6231527093596059, "grad_norm": 0.12424405664205551, "learning_rate": 0.01, "loss": 2.0562, "step": 15816 }, { "epoch": 1.623460591133005, "grad_norm": 0.10444232821464539, "learning_rate": 0.01, "loss": 2.0527, "step": 15819 }, { "epoch": 1.623768472906404, "grad_norm": 0.06170908361673355, "learning_rate": 0.01, "loss": 2.0456, "step": 15822 }, { "epoch": 1.624076354679803, "grad_norm": 0.05145244672894478, "learning_rate": 0.01, "loss": 2.0368, "step": 15825 }, { "epoch": 1.624384236453202, "grad_norm": 0.0459282286465168, "learning_rate": 0.01, "loss": 2.0547, "step": 15828 }, { "epoch": 1.624692118226601, "grad_norm": 0.05250949412584305, "learning_rate": 0.01, "loss": 2.0475, "step": 15831 }, { "epoch": 1.625, "grad_norm": 0.03222022205591202, "learning_rate": 0.01, "loss": 2.0347, "step": 15834 }, { "epoch": 1.625307881773399, "grad_norm": 0.05849120765924454, "learning_rate": 0.01, "loss": 2.0351, "step": 15837 }, { "epoch": 1.625615763546798, "grad_norm": 0.04638088122010231, "learning_rate": 0.01, "loss": 2.0222, "step": 15840 }, { "epoch": 1.625923645320197, "grad_norm": 0.046597037464380264, "learning_rate": 0.01, "loss": 2.0577, "step": 15843 }, { "epoch": 1.626231527093596, "grad_norm": 0.10477445274591446, "learning_rate": 0.01, "loss": 2.0528, "step": 15846 }, { "epoch": 1.626539408866995, "grad_norm": 0.03439783677458763, "learning_rate": 0.01, "loss": 2.0631, "step": 15849 }, { "epoch": 1.6268472906403941, "grad_norm": 0.05810544639825821, "learning_rate": 0.01, "loss": 2.0531, "step": 15852 }, { "epoch": 1.6271551724137931, "grad_norm": 0.04557522386312485, "learning_rate": 0.01, "loss": 2.0582, "step": 15855 }, { "epoch": 1.6274630541871922, "grad_norm": 0.04236530885100365, "learning_rate": 0.01, "loss": 2.0531, "step": 15858 }, { "epoch": 1.6277709359605912, "grad_norm": 0.04338948428630829, "learning_rate": 0.01, "loss": 2.0584, "step": 15861 }, { "epoch": 1.6280788177339902, "grad_norm": 0.039782583713531494, "learning_rate": 0.01, "loss": 2.0699, "step": 15864 }, { "epoch": 1.6283866995073892, "grad_norm": 0.06858891248703003, "learning_rate": 0.01, "loss": 2.0477, "step": 15867 }, { "epoch": 1.6286945812807883, "grad_norm": 0.0510399155318737, "learning_rate": 0.01, "loss": 2.0692, "step": 15870 }, { "epoch": 1.6290024630541873, "grad_norm": 0.12568604946136475, "learning_rate": 0.01, "loss": 2.0597, "step": 15873 }, { "epoch": 1.6293103448275863, "grad_norm": 0.09245727956295013, "learning_rate": 0.01, "loss": 2.0365, "step": 15876 }, { "epoch": 1.6296182266009853, "grad_norm": 0.05763734132051468, "learning_rate": 0.01, "loss": 2.0787, "step": 15879 }, { "epoch": 1.6299261083743843, "grad_norm": 0.06099852919578552, "learning_rate": 0.01, "loss": 2.0603, "step": 15882 }, { "epoch": 1.6302339901477834, "grad_norm": 0.05738021805882454, "learning_rate": 0.01, "loss": 2.0405, "step": 15885 }, { "epoch": 1.6305418719211824, "grad_norm": 0.04953853040933609, "learning_rate": 0.01, "loss": 2.0622, "step": 15888 }, { "epoch": 1.6308497536945814, "grad_norm": 0.08572196215391159, "learning_rate": 0.01, "loss": 2.0618, "step": 15891 }, { "epoch": 1.6311576354679804, "grad_norm": 0.09245479106903076, "learning_rate": 0.01, "loss": 2.0453, "step": 15894 }, { "epoch": 1.6314655172413794, "grad_norm": 0.057964712381362915, "learning_rate": 0.01, "loss": 2.0186, "step": 15897 }, { "epoch": 1.6317733990147785, "grad_norm": 0.05189305916428566, "learning_rate": 0.01, "loss": 2.0615, "step": 15900 }, { "epoch": 1.6320812807881775, "grad_norm": 0.07327884435653687, "learning_rate": 0.01, "loss": 2.0175, "step": 15903 }, { "epoch": 1.6323891625615765, "grad_norm": 0.07089177519083023, "learning_rate": 0.01, "loss": 2.0475, "step": 15906 }, { "epoch": 1.6326970443349755, "grad_norm": 0.09783073514699936, "learning_rate": 0.01, "loss": 2.051, "step": 15909 }, { "epoch": 1.6330049261083743, "grad_norm": 0.06617991626262665, "learning_rate": 0.01, "loss": 2.0408, "step": 15912 }, { "epoch": 1.6333128078817734, "grad_norm": 0.10033921152353287, "learning_rate": 0.01, "loss": 2.0308, "step": 15915 }, { "epoch": 1.6336206896551724, "grad_norm": 0.054432835429906845, "learning_rate": 0.01, "loss": 2.0404, "step": 15918 }, { "epoch": 1.6339285714285714, "grad_norm": 0.056940387934446335, "learning_rate": 0.01, "loss": 2.0566, "step": 15921 }, { "epoch": 1.6342364532019704, "grad_norm": 0.12047278136014938, "learning_rate": 0.01, "loss": 2.0464, "step": 15924 }, { "epoch": 1.6345443349753694, "grad_norm": 0.04637087881565094, "learning_rate": 0.01, "loss": 2.0514, "step": 15927 }, { "epoch": 1.6348522167487685, "grad_norm": 0.03925006836652756, "learning_rate": 0.01, "loss": 2.0536, "step": 15930 }, { "epoch": 1.6351600985221675, "grad_norm": 0.04180562496185303, "learning_rate": 0.01, "loss": 2.0206, "step": 15933 }, { "epoch": 1.6354679802955665, "grad_norm": 0.08031155914068222, "learning_rate": 0.01, "loss": 2.0509, "step": 15936 }, { "epoch": 1.6357758620689655, "grad_norm": 0.0812869518995285, "learning_rate": 0.01, "loss": 2.021, "step": 15939 }, { "epoch": 1.6360837438423645, "grad_norm": 0.07887094467878342, "learning_rate": 0.01, "loss": 2.0554, "step": 15942 }, { "epoch": 1.6363916256157636, "grad_norm": 0.12604457139968872, "learning_rate": 0.01, "loss": 2.0236, "step": 15945 }, { "epoch": 1.6366995073891626, "grad_norm": 0.1262006163597107, "learning_rate": 0.01, "loss": 2.0806, "step": 15948 }, { "epoch": 1.6370073891625616, "grad_norm": 0.07335629314184189, "learning_rate": 0.01, "loss": 2.0339, "step": 15951 }, { "epoch": 1.6373152709359606, "grad_norm": 0.043172985315322876, "learning_rate": 0.01, "loss": 2.0455, "step": 15954 }, { "epoch": 1.6376231527093597, "grad_norm": 0.07475942373275757, "learning_rate": 0.01, "loss": 2.0842, "step": 15957 }, { "epoch": 1.6379310344827587, "grad_norm": 0.06113087013363838, "learning_rate": 0.01, "loss": 2.0526, "step": 15960 }, { "epoch": 1.6382389162561575, "grad_norm": 0.08709672093391418, "learning_rate": 0.01, "loss": 2.0886, "step": 15963 }, { "epoch": 1.6385467980295565, "grad_norm": 0.05810529738664627, "learning_rate": 0.01, "loss": 2.043, "step": 15966 }, { "epoch": 1.6388546798029555, "grad_norm": 0.0831620916724205, "learning_rate": 0.01, "loss": 2.0599, "step": 15969 }, { "epoch": 1.6391625615763545, "grad_norm": 0.040577951818704605, "learning_rate": 0.01, "loss": 2.0676, "step": 15972 }, { "epoch": 1.6394704433497536, "grad_norm": 0.03428385406732559, "learning_rate": 0.01, "loss": 2.0491, "step": 15975 }, { "epoch": 1.6397783251231526, "grad_norm": 0.04800771176815033, "learning_rate": 0.01, "loss": 2.0371, "step": 15978 }, { "epoch": 1.6400862068965516, "grad_norm": 0.05769934877753258, "learning_rate": 0.01, "loss": 2.0468, "step": 15981 }, { "epoch": 1.6403940886699506, "grad_norm": 0.08842117339372635, "learning_rate": 0.01, "loss": 2.037, "step": 15984 }, { "epoch": 1.6407019704433496, "grad_norm": 0.09740414470434189, "learning_rate": 0.01, "loss": 2.0423, "step": 15987 }, { "epoch": 1.6410098522167487, "grad_norm": 0.11128890514373779, "learning_rate": 0.01, "loss": 2.0423, "step": 15990 }, { "epoch": 1.6413177339901477, "grad_norm": 0.03690354898571968, "learning_rate": 0.01, "loss": 2.0412, "step": 15993 }, { "epoch": 1.6416256157635467, "grad_norm": 0.07311075925827026, "learning_rate": 0.01, "loss": 2.036, "step": 15996 }, { "epoch": 1.6419334975369457, "grad_norm": 0.045825451612472534, "learning_rate": 0.01, "loss": 2.0491, "step": 15999 }, { "epoch": 1.6422413793103448, "grad_norm": 0.09123300760984421, "learning_rate": 0.01, "loss": 2.0574, "step": 16002 }, { "epoch": 1.6425492610837438, "grad_norm": 0.0702185183763504, "learning_rate": 0.01, "loss": 2.0715, "step": 16005 }, { "epoch": 1.6428571428571428, "grad_norm": 0.0355604812502861, "learning_rate": 0.01, "loss": 2.0461, "step": 16008 }, { "epoch": 1.6431650246305418, "grad_norm": 0.03151632100343704, "learning_rate": 0.01, "loss": 2.0205, "step": 16011 }, { "epoch": 1.6434729064039408, "grad_norm": 0.04302441328763962, "learning_rate": 0.01, "loss": 2.0824, "step": 16014 }, { "epoch": 1.6437807881773399, "grad_norm": 0.06012306734919548, "learning_rate": 0.01, "loss": 2.0494, "step": 16017 }, { "epoch": 1.6440886699507389, "grad_norm": 0.04698712378740311, "learning_rate": 0.01, "loss": 2.0364, "step": 16020 }, { "epoch": 1.644396551724138, "grad_norm": 0.03930363059043884, "learning_rate": 0.01, "loss": 2.0612, "step": 16023 }, { "epoch": 1.644704433497537, "grad_norm": 0.0881473496556282, "learning_rate": 0.01, "loss": 2.0724, "step": 16026 }, { "epoch": 1.645012315270936, "grad_norm": 0.04207085818052292, "learning_rate": 0.01, "loss": 2.0524, "step": 16029 }, { "epoch": 1.645320197044335, "grad_norm": 0.04729215428233147, "learning_rate": 0.01, "loss": 2.0538, "step": 16032 }, { "epoch": 1.645628078817734, "grad_norm": 0.050990305840969086, "learning_rate": 0.01, "loss": 2.0473, "step": 16035 }, { "epoch": 1.645935960591133, "grad_norm": 0.05049813538789749, "learning_rate": 0.01, "loss": 2.0609, "step": 16038 }, { "epoch": 1.646243842364532, "grad_norm": 0.07787630707025528, "learning_rate": 0.01, "loss": 2.0505, "step": 16041 }, { "epoch": 1.646551724137931, "grad_norm": 0.06656081229448318, "learning_rate": 0.01, "loss": 2.0365, "step": 16044 }, { "epoch": 1.64685960591133, "grad_norm": 0.08293109387159348, "learning_rate": 0.01, "loss": 2.062, "step": 16047 }, { "epoch": 1.647167487684729, "grad_norm": 0.0775810182094574, "learning_rate": 0.01, "loss": 2.0466, "step": 16050 }, { "epoch": 1.6474753694581281, "grad_norm": 0.07917825132608414, "learning_rate": 0.01, "loss": 2.067, "step": 16053 }, { "epoch": 1.6477832512315271, "grad_norm": 0.07658717036247253, "learning_rate": 0.01, "loss": 2.0323, "step": 16056 }, { "epoch": 1.6480911330049262, "grad_norm": 0.07735300809144974, "learning_rate": 0.01, "loss": 2.0481, "step": 16059 }, { "epoch": 1.6483990147783252, "grad_norm": 0.07964644581079483, "learning_rate": 0.01, "loss": 2.0469, "step": 16062 }, { "epoch": 1.6487068965517242, "grad_norm": 0.0601799339056015, "learning_rate": 0.01, "loss": 2.0453, "step": 16065 }, { "epoch": 1.6490147783251232, "grad_norm": 0.1039920225739479, "learning_rate": 0.01, "loss": 2.0474, "step": 16068 }, { "epoch": 1.6493226600985222, "grad_norm": 0.055755455046892166, "learning_rate": 0.01, "loss": 2.0615, "step": 16071 }, { "epoch": 1.6496305418719213, "grad_norm": 0.0998646542429924, "learning_rate": 0.01, "loss": 2.0675, "step": 16074 }, { "epoch": 1.6499384236453203, "grad_norm": 0.04582648724317551, "learning_rate": 0.01, "loss": 2.0277, "step": 16077 }, { "epoch": 1.6502463054187193, "grad_norm": 0.08638078719377518, "learning_rate": 0.01, "loss": 2.0473, "step": 16080 }, { "epoch": 1.6505541871921183, "grad_norm": 0.053813617676496506, "learning_rate": 0.01, "loss": 2.0488, "step": 16083 }, { "epoch": 1.6508620689655173, "grad_norm": 0.08186789602041245, "learning_rate": 0.01, "loss": 2.07, "step": 16086 }, { "epoch": 1.6511699507389164, "grad_norm": 0.037794895470142365, "learning_rate": 0.01, "loss": 2.0554, "step": 16089 }, { "epoch": 1.6514778325123154, "grad_norm": 0.1052238717675209, "learning_rate": 0.01, "loss": 2.0614, "step": 16092 }, { "epoch": 1.6517857142857144, "grad_norm": 0.07596205919981003, "learning_rate": 0.01, "loss": 2.0358, "step": 16095 }, { "epoch": 1.6520935960591134, "grad_norm": 0.047295670956373215, "learning_rate": 0.01, "loss": 2.0488, "step": 16098 }, { "epoch": 1.6524014778325125, "grad_norm": 0.05572659894824028, "learning_rate": 0.01, "loss": 2.0468, "step": 16101 }, { "epoch": 1.6527093596059115, "grad_norm": 0.0429069958627224, "learning_rate": 0.01, "loss": 2.0681, "step": 16104 }, { "epoch": 1.6530172413793105, "grad_norm": 0.055060967803001404, "learning_rate": 0.01, "loss": 2.0347, "step": 16107 }, { "epoch": 1.6533251231527095, "grad_norm": 0.05243745073676109, "learning_rate": 0.01, "loss": 2.0696, "step": 16110 }, { "epoch": 1.6536330049261085, "grad_norm": 0.052228983491659164, "learning_rate": 0.01, "loss": 2.06, "step": 16113 }, { "epoch": 1.6539408866995073, "grad_norm": 0.065925233066082, "learning_rate": 0.01, "loss": 2.0707, "step": 16116 }, { "epoch": 1.6542487684729064, "grad_norm": 0.05819106101989746, "learning_rate": 0.01, "loss": 2.0137, "step": 16119 }, { "epoch": 1.6545566502463054, "grad_norm": 0.04320794716477394, "learning_rate": 0.01, "loss": 2.0691, "step": 16122 }, { "epoch": 1.6548645320197044, "grad_norm": 0.04202846437692642, "learning_rate": 0.01, "loss": 2.0456, "step": 16125 }, { "epoch": 1.6551724137931034, "grad_norm": 0.12747296690940857, "learning_rate": 0.01, "loss": 2.0419, "step": 16128 }, { "epoch": 1.6554802955665024, "grad_norm": 0.07199030369520187, "learning_rate": 0.01, "loss": 2.0347, "step": 16131 }, { "epoch": 1.6557881773399015, "grad_norm": 0.085335373878479, "learning_rate": 0.01, "loss": 2.0383, "step": 16134 }, { "epoch": 1.6560960591133005, "grad_norm": 0.061818841844797134, "learning_rate": 0.01, "loss": 2.0631, "step": 16137 }, { "epoch": 1.6564039408866995, "grad_norm": 0.06255804747343063, "learning_rate": 0.01, "loss": 2.0492, "step": 16140 }, { "epoch": 1.6567118226600985, "grad_norm": 0.08308485150337219, "learning_rate": 0.01, "loss": 2.0814, "step": 16143 }, { "epoch": 1.6570197044334976, "grad_norm": 0.06358073651790619, "learning_rate": 0.01, "loss": 2.048, "step": 16146 }, { "epoch": 1.6573275862068966, "grad_norm": 0.085427425801754, "learning_rate": 0.01, "loss": 2.0433, "step": 16149 }, { "epoch": 1.6576354679802956, "grad_norm": 0.043243568390607834, "learning_rate": 0.01, "loss": 2.0432, "step": 16152 }, { "epoch": 1.6579433497536946, "grad_norm": 0.06593325734138489, "learning_rate": 0.01, "loss": 2.0469, "step": 16155 }, { "epoch": 1.6582512315270936, "grad_norm": 0.14644569158554077, "learning_rate": 0.01, "loss": 2.0689, "step": 16158 }, { "epoch": 1.6585591133004927, "grad_norm": 0.1211152896285057, "learning_rate": 0.01, "loss": 2.0505, "step": 16161 }, { "epoch": 1.6588669950738915, "grad_norm": 0.11020830273628235, "learning_rate": 0.01, "loss": 2.0572, "step": 16164 }, { "epoch": 1.6591748768472905, "grad_norm": 0.08850467950105667, "learning_rate": 0.01, "loss": 2.0424, "step": 16167 }, { "epoch": 1.6594827586206895, "grad_norm": 0.050562698394060135, "learning_rate": 0.01, "loss": 2.0542, "step": 16170 }, { "epoch": 1.6597906403940885, "grad_norm": 0.048076871782541275, "learning_rate": 0.01, "loss": 2.0433, "step": 16173 }, { "epoch": 1.6600985221674875, "grad_norm": 0.03727034851908684, "learning_rate": 0.01, "loss": 2.0333, "step": 16176 }, { "epoch": 1.6604064039408866, "grad_norm": 0.048614371567964554, "learning_rate": 0.01, "loss": 2.0552, "step": 16179 }, { "epoch": 1.6607142857142856, "grad_norm": 0.05649641901254654, "learning_rate": 0.01, "loss": 2.0536, "step": 16182 }, { "epoch": 1.6610221674876846, "grad_norm": 0.05329003930091858, "learning_rate": 0.01, "loss": 2.0386, "step": 16185 }, { "epoch": 1.6613300492610836, "grad_norm": 0.06444583833217621, "learning_rate": 0.01, "loss": 2.055, "step": 16188 }, { "epoch": 1.6616379310344827, "grad_norm": 0.045777902007102966, "learning_rate": 0.01, "loss": 2.0476, "step": 16191 }, { "epoch": 1.6619458128078817, "grad_norm": 0.04831868037581444, "learning_rate": 0.01, "loss": 2.0582, "step": 16194 }, { "epoch": 1.6622536945812807, "grad_norm": 0.10648196935653687, "learning_rate": 0.01, "loss": 2.0579, "step": 16197 }, { "epoch": 1.6625615763546797, "grad_norm": 0.08369257301092148, "learning_rate": 0.01, "loss": 2.0505, "step": 16200 }, { "epoch": 1.6628694581280787, "grad_norm": 0.13716475665569305, "learning_rate": 0.01, "loss": 2.0383, "step": 16203 }, { "epoch": 1.6631773399014778, "grad_norm": 0.05025027319788933, "learning_rate": 0.01, "loss": 2.0549, "step": 16206 }, { "epoch": 1.6634852216748768, "grad_norm": 0.03850054368376732, "learning_rate": 0.01, "loss": 2.0412, "step": 16209 }, { "epoch": 1.6637931034482758, "grad_norm": 0.046656832098960876, "learning_rate": 0.01, "loss": 2.0595, "step": 16212 }, { "epoch": 1.6641009852216748, "grad_norm": 0.03826647624373436, "learning_rate": 0.01, "loss": 2.0352, "step": 16215 }, { "epoch": 1.6644088669950738, "grad_norm": 0.061087023466825485, "learning_rate": 0.01, "loss": 2.0357, "step": 16218 }, { "epoch": 1.6647167487684729, "grad_norm": 0.03787006065249443, "learning_rate": 0.01, "loss": 2.0226, "step": 16221 }, { "epoch": 1.6650246305418719, "grad_norm": 0.09619265049695969, "learning_rate": 0.01, "loss": 2.0399, "step": 16224 }, { "epoch": 1.665332512315271, "grad_norm": 0.04012330621480942, "learning_rate": 0.01, "loss": 2.044, "step": 16227 }, { "epoch": 1.66564039408867, "grad_norm": 0.062126293778419495, "learning_rate": 0.01, "loss": 2.0726, "step": 16230 }, { "epoch": 1.665948275862069, "grad_norm": 0.050277624279260635, "learning_rate": 0.01, "loss": 2.0219, "step": 16233 }, { "epoch": 1.666256157635468, "grad_norm": 0.03983129933476448, "learning_rate": 0.01, "loss": 2.0554, "step": 16236 }, { "epoch": 1.666564039408867, "grad_norm": 0.13119915127754211, "learning_rate": 0.01, "loss": 2.0682, "step": 16239 }, { "epoch": 1.666871921182266, "grad_norm": 0.0525536946952343, "learning_rate": 0.01, "loss": 2.0524, "step": 16242 }, { "epoch": 1.667179802955665, "grad_norm": 0.056762780994176865, "learning_rate": 0.01, "loss": 2.0293, "step": 16245 }, { "epoch": 1.667487684729064, "grad_norm": 0.08652041852474213, "learning_rate": 0.01, "loss": 2.0574, "step": 16248 }, { "epoch": 1.667795566502463, "grad_norm": 0.14455944299697876, "learning_rate": 0.01, "loss": 2.0406, "step": 16251 }, { "epoch": 1.668103448275862, "grad_norm": 0.03951118513941765, "learning_rate": 0.01, "loss": 2.0368, "step": 16254 }, { "epoch": 1.6684113300492611, "grad_norm": 0.040585123002529144, "learning_rate": 0.01, "loss": 2.017, "step": 16257 }, { "epoch": 1.6687192118226601, "grad_norm": 0.05393810570240021, "learning_rate": 0.01, "loss": 2.0679, "step": 16260 }, { "epoch": 1.6690270935960592, "grad_norm": 0.050093088299036026, "learning_rate": 0.01, "loss": 2.0546, "step": 16263 }, { "epoch": 1.6693349753694582, "grad_norm": 0.04196159914135933, "learning_rate": 0.01, "loss": 2.0488, "step": 16266 }, { "epoch": 1.6696428571428572, "grad_norm": 0.03978092968463898, "learning_rate": 0.01, "loss": 2.0453, "step": 16269 }, { "epoch": 1.6699507389162562, "grad_norm": 0.05054232105612755, "learning_rate": 0.01, "loss": 2.0337, "step": 16272 }, { "epoch": 1.6702586206896552, "grad_norm": 0.0746975764632225, "learning_rate": 0.01, "loss": 2.0636, "step": 16275 }, { "epoch": 1.6705665024630543, "grad_norm": 0.05685516446828842, "learning_rate": 0.01, "loss": 2.0591, "step": 16278 }, { "epoch": 1.6708743842364533, "grad_norm": 0.031971871852874756, "learning_rate": 0.01, "loss": 2.0657, "step": 16281 }, { "epoch": 1.6711822660098523, "grad_norm": 0.03947863727807999, "learning_rate": 0.01, "loss": 2.0333, "step": 16284 }, { "epoch": 1.6714901477832513, "grad_norm": 0.11271070688962936, "learning_rate": 0.01, "loss": 2.0421, "step": 16287 }, { "epoch": 1.6717980295566504, "grad_norm": 0.05308755114674568, "learning_rate": 0.01, "loss": 2.0536, "step": 16290 }, { "epoch": 1.6721059113300494, "grad_norm": 0.042826078832149506, "learning_rate": 0.01, "loss": 2.0694, "step": 16293 }, { "epoch": 1.6724137931034484, "grad_norm": 0.0458630695939064, "learning_rate": 0.01, "loss": 2.0312, "step": 16296 }, { "epoch": 1.6727216748768474, "grad_norm": 0.05401900038123131, "learning_rate": 0.01, "loss": 2.0613, "step": 16299 }, { "epoch": 1.6730295566502464, "grad_norm": 0.05380195751786232, "learning_rate": 0.01, "loss": 2.0336, "step": 16302 }, { "epoch": 1.6733374384236455, "grad_norm": 0.038716450333595276, "learning_rate": 0.01, "loss": 2.0548, "step": 16305 }, { "epoch": 1.6736453201970445, "grad_norm": 0.04034694656729698, "learning_rate": 0.01, "loss": 2.0421, "step": 16308 }, { "epoch": 1.6739532019704435, "grad_norm": 0.06753403693437576, "learning_rate": 0.01, "loss": 2.0324, "step": 16311 }, { "epoch": 1.6742610837438425, "grad_norm": 0.10001173615455627, "learning_rate": 0.01, "loss": 2.0467, "step": 16314 }, { "epoch": 1.6745689655172413, "grad_norm": 0.04366351664066315, "learning_rate": 0.01, "loss": 2.0622, "step": 16317 }, { "epoch": 1.6748768472906403, "grad_norm": 0.07137630879878998, "learning_rate": 0.01, "loss": 2.0333, "step": 16320 }, { "epoch": 1.6751847290640394, "grad_norm": 0.049938492476940155, "learning_rate": 0.01, "loss": 2.0426, "step": 16323 }, { "epoch": 1.6754926108374384, "grad_norm": 0.03337172046303749, "learning_rate": 0.01, "loss": 2.0462, "step": 16326 }, { "epoch": 1.6758004926108374, "grad_norm": 0.07407473772764206, "learning_rate": 0.01, "loss": 2.0705, "step": 16329 }, { "epoch": 1.6761083743842364, "grad_norm": 0.07006946206092834, "learning_rate": 0.01, "loss": 2.0383, "step": 16332 }, { "epoch": 1.6764162561576355, "grad_norm": 0.05342825874686241, "learning_rate": 0.01, "loss": 2.0481, "step": 16335 }, { "epoch": 1.6767241379310345, "grad_norm": 0.052405234426259995, "learning_rate": 0.01, "loss": 2.0138, "step": 16338 }, { "epoch": 1.6770320197044335, "grad_norm": 0.20231324434280396, "learning_rate": 0.01, "loss": 2.0472, "step": 16341 }, { "epoch": 1.6773399014778325, "grad_norm": 0.07893595844507217, "learning_rate": 0.01, "loss": 2.0415, "step": 16344 }, { "epoch": 1.6776477832512315, "grad_norm": 0.06872416287660599, "learning_rate": 0.01, "loss": 2.0376, "step": 16347 }, { "epoch": 1.6779556650246306, "grad_norm": 0.041687123477458954, "learning_rate": 0.01, "loss": 2.0442, "step": 16350 }, { "epoch": 1.6782635467980296, "grad_norm": 0.04184873029589653, "learning_rate": 0.01, "loss": 2.0769, "step": 16353 }, { "epoch": 1.6785714285714286, "grad_norm": 0.036598458886146545, "learning_rate": 0.01, "loss": 2.0255, "step": 16356 }, { "epoch": 1.6788793103448276, "grad_norm": 0.062203384935855865, "learning_rate": 0.01, "loss": 2.0582, "step": 16359 }, { "epoch": 1.6791871921182266, "grad_norm": 0.04513971135020256, "learning_rate": 0.01, "loss": 2.0475, "step": 16362 }, { "epoch": 1.6794950738916257, "grad_norm": 0.043875399976968765, "learning_rate": 0.01, "loss": 2.0455, "step": 16365 }, { "epoch": 1.6798029556650245, "grad_norm": 0.030207300558686256, "learning_rate": 0.01, "loss": 2.0733, "step": 16368 }, { "epoch": 1.6801108374384235, "grad_norm": 0.07749854028224945, "learning_rate": 0.01, "loss": 2.0402, "step": 16371 }, { "epoch": 1.6804187192118225, "grad_norm": 0.10269973427057266, "learning_rate": 0.01, "loss": 2.0342, "step": 16374 }, { "epoch": 1.6807266009852215, "grad_norm": 0.043558500707149506, "learning_rate": 0.01, "loss": 2.0429, "step": 16377 }, { "epoch": 1.6810344827586206, "grad_norm": 0.0490686409175396, "learning_rate": 0.01, "loss": 2.0381, "step": 16380 }, { "epoch": 1.6813423645320196, "grad_norm": 0.062107689678668976, "learning_rate": 0.01, "loss": 2.0592, "step": 16383 }, { "epoch": 1.6816502463054186, "grad_norm": 0.0856776013970375, "learning_rate": 0.01, "loss": 2.0666, "step": 16386 }, { "epoch": 1.6819581280788176, "grad_norm": 0.11694356054067612, "learning_rate": 0.01, "loss": 2.0545, "step": 16389 }, { "epoch": 1.6822660098522166, "grad_norm": 0.07279752194881439, "learning_rate": 0.01, "loss": 2.0348, "step": 16392 }, { "epoch": 1.6825738916256157, "grad_norm": 0.06813056766986847, "learning_rate": 0.01, "loss": 2.0549, "step": 16395 }, { "epoch": 1.6828817733990147, "grad_norm": 0.045916907489299774, "learning_rate": 0.01, "loss": 2.0714, "step": 16398 }, { "epoch": 1.6831896551724137, "grad_norm": 0.04464447498321533, "learning_rate": 0.01, "loss": 2.0655, "step": 16401 }, { "epoch": 1.6834975369458127, "grad_norm": 0.04815223440527916, "learning_rate": 0.01, "loss": 2.0633, "step": 16404 }, { "epoch": 1.6838054187192117, "grad_norm": 0.06025001034140587, "learning_rate": 0.01, "loss": 2.0325, "step": 16407 }, { "epoch": 1.6841133004926108, "grad_norm": 0.05691540613770485, "learning_rate": 0.01, "loss": 2.07, "step": 16410 }, { "epoch": 1.6844211822660098, "grad_norm": 0.04643694683909416, "learning_rate": 0.01, "loss": 2.0478, "step": 16413 }, { "epoch": 1.6847290640394088, "grad_norm": 0.03540325164794922, "learning_rate": 0.01, "loss": 2.0739, "step": 16416 }, { "epoch": 1.6850369458128078, "grad_norm": 0.034472569823265076, "learning_rate": 0.01, "loss": 2.0441, "step": 16419 }, { "epoch": 1.6853448275862069, "grad_norm": 0.04316902533173561, "learning_rate": 0.01, "loss": 2.0422, "step": 16422 }, { "epoch": 1.6856527093596059, "grad_norm": 0.04943558946251869, "learning_rate": 0.01, "loss": 2.0377, "step": 16425 }, { "epoch": 1.685960591133005, "grad_norm": 0.11482315510511398, "learning_rate": 0.01, "loss": 2.0668, "step": 16428 }, { "epoch": 1.686268472906404, "grad_norm": 0.10594377666711807, "learning_rate": 0.01, "loss": 2.0513, "step": 16431 }, { "epoch": 1.686576354679803, "grad_norm": 0.09860610961914062, "learning_rate": 0.01, "loss": 2.0456, "step": 16434 }, { "epoch": 1.686884236453202, "grad_norm": 0.06849053502082825, "learning_rate": 0.01, "loss": 2.0645, "step": 16437 }, { "epoch": 1.687192118226601, "grad_norm": 0.05089464411139488, "learning_rate": 0.01, "loss": 2.0383, "step": 16440 }, { "epoch": 1.6875, "grad_norm": 0.04762034863233566, "learning_rate": 0.01, "loss": 2.0443, "step": 16443 }, { "epoch": 1.687807881773399, "grad_norm": 0.09014497697353363, "learning_rate": 0.01, "loss": 2.0736, "step": 16446 }, { "epoch": 1.688115763546798, "grad_norm": 0.06832917779684067, "learning_rate": 0.01, "loss": 2.0677, "step": 16449 }, { "epoch": 1.688423645320197, "grad_norm": 0.0529920794069767, "learning_rate": 0.01, "loss": 2.0423, "step": 16452 }, { "epoch": 1.688731527093596, "grad_norm": 0.03208652511239052, "learning_rate": 0.01, "loss": 2.0561, "step": 16455 }, { "epoch": 1.689039408866995, "grad_norm": 0.13702784478664398, "learning_rate": 0.01, "loss": 2.0393, "step": 16458 }, { "epoch": 1.6893472906403941, "grad_norm": 0.05972970649600029, "learning_rate": 0.01, "loss": 2.0795, "step": 16461 }, { "epoch": 1.6896551724137931, "grad_norm": 0.043536797165870667, "learning_rate": 0.01, "loss": 2.0622, "step": 16464 }, { "epoch": 1.6899630541871922, "grad_norm": 0.0556536540389061, "learning_rate": 0.01, "loss": 2.0523, "step": 16467 }, { "epoch": 1.6902709359605912, "grad_norm": 0.06583042442798615, "learning_rate": 0.01, "loss": 2.0568, "step": 16470 }, { "epoch": 1.6905788177339902, "grad_norm": 0.0535028837621212, "learning_rate": 0.01, "loss": 2.0427, "step": 16473 }, { "epoch": 1.6908866995073892, "grad_norm": 0.09974632412195206, "learning_rate": 0.01, "loss": 2.0589, "step": 16476 }, { "epoch": 1.6911945812807883, "grad_norm": 0.058350350707769394, "learning_rate": 0.01, "loss": 2.0392, "step": 16479 }, { "epoch": 1.6915024630541873, "grad_norm": 0.10049036890268326, "learning_rate": 0.01, "loss": 2.0643, "step": 16482 }, { "epoch": 1.6918103448275863, "grad_norm": 0.061119675636291504, "learning_rate": 0.01, "loss": 2.0455, "step": 16485 }, { "epoch": 1.6921182266009853, "grad_norm": 0.07189033925533295, "learning_rate": 0.01, "loss": 2.0629, "step": 16488 }, { "epoch": 1.6924261083743843, "grad_norm": 0.08962611109018326, "learning_rate": 0.01, "loss": 2.0586, "step": 16491 }, { "epoch": 1.6927339901477834, "grad_norm": 0.05600450560450554, "learning_rate": 0.01, "loss": 2.0434, "step": 16494 }, { "epoch": 1.6930418719211824, "grad_norm": 0.1281098574399948, "learning_rate": 0.01, "loss": 2.0241, "step": 16497 }, { "epoch": 1.6933497536945814, "grad_norm": 0.036696117371320724, "learning_rate": 0.01, "loss": 2.065, "step": 16500 }, { "epoch": 1.6936576354679804, "grad_norm": 0.12428770959377289, "learning_rate": 0.01, "loss": 2.0479, "step": 16503 }, { "epoch": 1.6939655172413794, "grad_norm": 0.07593953609466553, "learning_rate": 0.01, "loss": 2.0357, "step": 16506 }, { "epoch": 1.6942733990147785, "grad_norm": 0.0686376765370369, "learning_rate": 0.01, "loss": 2.05, "step": 16509 }, { "epoch": 1.6945812807881775, "grad_norm": 0.044805269688367844, "learning_rate": 0.01, "loss": 2.0514, "step": 16512 }, { "epoch": 1.6948891625615765, "grad_norm": 0.04698259010910988, "learning_rate": 0.01, "loss": 2.0505, "step": 16515 }, { "epoch": 1.6951970443349755, "grad_norm": 0.04546966403722763, "learning_rate": 0.01, "loss": 2.0265, "step": 16518 }, { "epoch": 1.6955049261083743, "grad_norm": 0.07239431142807007, "learning_rate": 0.01, "loss": 2.0403, "step": 16521 }, { "epoch": 1.6958128078817734, "grad_norm": 0.08790195733308792, "learning_rate": 0.01, "loss": 2.0721, "step": 16524 }, { "epoch": 1.6961206896551724, "grad_norm": 0.05445432290434837, "learning_rate": 0.01, "loss": 2.039, "step": 16527 }, { "epoch": 1.6964285714285714, "grad_norm": 0.048141270875930786, "learning_rate": 0.01, "loss": 2.0191, "step": 16530 }, { "epoch": 1.6967364532019704, "grad_norm": 0.05230564624071121, "learning_rate": 0.01, "loss": 2.0646, "step": 16533 }, { "epoch": 1.6970443349753694, "grad_norm": 0.1007009968161583, "learning_rate": 0.01, "loss": 2.0751, "step": 16536 }, { "epoch": 1.6973522167487685, "grad_norm": 0.03878286853432655, "learning_rate": 0.01, "loss": 2.0257, "step": 16539 }, { "epoch": 1.6976600985221675, "grad_norm": 0.08503543585538864, "learning_rate": 0.01, "loss": 2.0516, "step": 16542 }, { "epoch": 1.6979679802955665, "grad_norm": 0.06239473819732666, "learning_rate": 0.01, "loss": 2.057, "step": 16545 }, { "epoch": 1.6982758620689655, "grad_norm": 0.06893055140972137, "learning_rate": 0.01, "loss": 2.0435, "step": 16548 }, { "epoch": 1.6985837438423645, "grad_norm": 0.08434829860925674, "learning_rate": 0.01, "loss": 2.0319, "step": 16551 }, { "epoch": 1.6988916256157636, "grad_norm": 0.031773362308740616, "learning_rate": 0.01, "loss": 2.0585, "step": 16554 }, { "epoch": 1.6991995073891626, "grad_norm": 0.11598584800958633, "learning_rate": 0.01, "loss": 2.0423, "step": 16557 }, { "epoch": 1.6995073891625616, "grad_norm": 0.07008111476898193, "learning_rate": 0.01, "loss": 2.0787, "step": 16560 }, { "epoch": 1.6998152709359606, "grad_norm": 0.03940622881054878, "learning_rate": 0.01, "loss": 2.0525, "step": 16563 }, { "epoch": 1.7001231527093597, "grad_norm": 0.05206933617591858, "learning_rate": 0.01, "loss": 2.0671, "step": 16566 }, { "epoch": 1.7004310344827587, "grad_norm": 0.04568307474255562, "learning_rate": 0.01, "loss": 2.0413, "step": 16569 }, { "epoch": 1.7007389162561575, "grad_norm": 0.031628433614969254, "learning_rate": 0.01, "loss": 2.0323, "step": 16572 }, { "epoch": 1.7010467980295565, "grad_norm": 0.05636722221970558, "learning_rate": 0.01, "loss": 2.0403, "step": 16575 }, { "epoch": 1.7013546798029555, "grad_norm": 0.11134552955627441, "learning_rate": 0.01, "loss": 2.034, "step": 16578 }, { "epoch": 1.7016625615763545, "grad_norm": 0.06964823603630066, "learning_rate": 0.01, "loss": 2.0701, "step": 16581 }, { "epoch": 1.7019704433497536, "grad_norm": 0.041148003190755844, "learning_rate": 0.01, "loss": 2.0693, "step": 16584 }, { "epoch": 1.7022783251231526, "grad_norm": 0.03673578426241875, "learning_rate": 0.01, "loss": 2.0232, "step": 16587 }, { "epoch": 1.7025862068965516, "grad_norm": 0.03659043833613396, "learning_rate": 0.01, "loss": 2.0257, "step": 16590 }, { "epoch": 1.7028940886699506, "grad_norm": 0.03824566677212715, "learning_rate": 0.01, "loss": 2.0587, "step": 16593 }, { "epoch": 1.7032019704433496, "grad_norm": 0.07334180176258087, "learning_rate": 0.01, "loss": 2.0626, "step": 16596 }, { "epoch": 1.7035098522167487, "grad_norm": 0.055927857756614685, "learning_rate": 0.01, "loss": 2.0311, "step": 16599 }, { "epoch": 1.7038177339901477, "grad_norm": 0.07610691338777542, "learning_rate": 0.01, "loss": 2.0419, "step": 16602 }, { "epoch": 1.7041256157635467, "grad_norm": 0.06405298411846161, "learning_rate": 0.01, "loss": 2.0693, "step": 16605 }, { "epoch": 1.7044334975369457, "grad_norm": 0.06193486601114273, "learning_rate": 0.01, "loss": 2.0442, "step": 16608 }, { "epoch": 1.7047413793103448, "grad_norm": 0.12181366235017776, "learning_rate": 0.01, "loss": 2.0324, "step": 16611 }, { "epoch": 1.7050492610837438, "grad_norm": 0.049060508608818054, "learning_rate": 0.01, "loss": 2.044, "step": 16614 }, { "epoch": 1.7053571428571428, "grad_norm": 0.05021090805530548, "learning_rate": 0.01, "loss": 2.0501, "step": 16617 }, { "epoch": 1.7056650246305418, "grad_norm": 0.045171257108449936, "learning_rate": 0.01, "loss": 2.0418, "step": 16620 }, { "epoch": 1.7059729064039408, "grad_norm": 0.04944808408617973, "learning_rate": 0.01, "loss": 2.0576, "step": 16623 }, { "epoch": 1.7062807881773399, "grad_norm": 0.03556932508945465, "learning_rate": 0.01, "loss": 2.0405, "step": 16626 }, { "epoch": 1.7065886699507389, "grad_norm": 0.10005172342061996, "learning_rate": 0.01, "loss": 2.0422, "step": 16629 }, { "epoch": 1.706896551724138, "grad_norm": 0.04088572412729263, "learning_rate": 0.01, "loss": 2.0526, "step": 16632 }, { "epoch": 1.707204433497537, "grad_norm": 0.04937949404120445, "learning_rate": 0.01, "loss": 2.0656, "step": 16635 }, { "epoch": 1.707512315270936, "grad_norm": 0.07822302728891373, "learning_rate": 0.01, "loss": 2.0545, "step": 16638 }, { "epoch": 1.707820197044335, "grad_norm": 0.05767158418893814, "learning_rate": 0.01, "loss": 2.0515, "step": 16641 }, { "epoch": 1.708128078817734, "grad_norm": 0.08512212336063385, "learning_rate": 0.01, "loss": 2.0579, "step": 16644 }, { "epoch": 1.708435960591133, "grad_norm": 0.06758993119001389, "learning_rate": 0.01, "loss": 2.0307, "step": 16647 }, { "epoch": 1.708743842364532, "grad_norm": 0.08142005652189255, "learning_rate": 0.01, "loss": 2.0531, "step": 16650 }, { "epoch": 1.709051724137931, "grad_norm": 0.06357218325138092, "learning_rate": 0.01, "loss": 2.0475, "step": 16653 }, { "epoch": 1.70935960591133, "grad_norm": 0.10591546446084976, "learning_rate": 0.01, "loss": 2.0447, "step": 16656 }, { "epoch": 1.709667487684729, "grad_norm": 0.06571496278047562, "learning_rate": 0.01, "loss": 2.0176, "step": 16659 }, { "epoch": 1.7099753694581281, "grad_norm": 0.049450814723968506, "learning_rate": 0.01, "loss": 2.0522, "step": 16662 }, { "epoch": 1.7102832512315271, "grad_norm": 0.11850273609161377, "learning_rate": 0.01, "loss": 2.0575, "step": 16665 }, { "epoch": 1.7105911330049262, "grad_norm": 0.0952281728386879, "learning_rate": 0.01, "loss": 2.0442, "step": 16668 }, { "epoch": 1.7108990147783252, "grad_norm": 0.09431217610836029, "learning_rate": 0.01, "loss": 2.0403, "step": 16671 }, { "epoch": 1.7112068965517242, "grad_norm": 0.07080823183059692, "learning_rate": 0.01, "loss": 2.0111, "step": 16674 }, { "epoch": 1.7115147783251232, "grad_norm": 0.049033813178539276, "learning_rate": 0.01, "loss": 2.0489, "step": 16677 }, { "epoch": 1.7118226600985222, "grad_norm": 0.04356718435883522, "learning_rate": 0.01, "loss": 2.035, "step": 16680 }, { "epoch": 1.7121305418719213, "grad_norm": 0.03276592493057251, "learning_rate": 0.01, "loss": 2.0396, "step": 16683 }, { "epoch": 1.7124384236453203, "grad_norm": 0.04438839852809906, "learning_rate": 0.01, "loss": 2.0441, "step": 16686 }, { "epoch": 1.7127463054187193, "grad_norm": 0.07276454567909241, "learning_rate": 0.01, "loss": 2.0513, "step": 16689 }, { "epoch": 1.7130541871921183, "grad_norm": 0.11324001848697662, "learning_rate": 0.01, "loss": 2.0482, "step": 16692 }, { "epoch": 1.7133620689655173, "grad_norm": 0.14715081453323364, "learning_rate": 0.01, "loss": 2.048, "step": 16695 }, { "epoch": 1.7136699507389164, "grad_norm": 0.07661852240562439, "learning_rate": 0.01, "loss": 2.0396, "step": 16698 }, { "epoch": 1.7139778325123154, "grad_norm": 0.05308947339653969, "learning_rate": 0.01, "loss": 2.0601, "step": 16701 }, { "epoch": 1.7142857142857144, "grad_norm": 0.06816977262496948, "learning_rate": 0.01, "loss": 2.0456, "step": 16704 }, { "epoch": 1.7145935960591134, "grad_norm": 0.05123249441385269, "learning_rate": 0.01, "loss": 2.0532, "step": 16707 }, { "epoch": 1.7149014778325125, "grad_norm": 0.05118009075522423, "learning_rate": 0.01, "loss": 2.0659, "step": 16710 }, { "epoch": 1.7152093596059115, "grad_norm": 0.03276235982775688, "learning_rate": 0.01, "loss": 2.0718, "step": 16713 }, { "epoch": 1.7155172413793105, "grad_norm": 0.049824655055999756, "learning_rate": 0.01, "loss": 2.0388, "step": 16716 }, { "epoch": 1.7158251231527095, "grad_norm": 0.1416471302509308, "learning_rate": 0.01, "loss": 2.0526, "step": 16719 }, { "epoch": 1.7161330049261085, "grad_norm": 0.04109251871705055, "learning_rate": 0.01, "loss": 2.0329, "step": 16722 }, { "epoch": 1.7164408866995073, "grad_norm": 0.08853971213102341, "learning_rate": 0.01, "loss": 2.0506, "step": 16725 }, { "epoch": 1.7167487684729064, "grad_norm": 0.05180136114358902, "learning_rate": 0.01, "loss": 2.0608, "step": 16728 }, { "epoch": 1.7170566502463054, "grad_norm": 0.0667758584022522, "learning_rate": 0.01, "loss": 2.0347, "step": 16731 }, { "epoch": 1.7173645320197044, "grad_norm": 0.039203155785799026, "learning_rate": 0.01, "loss": 2.0331, "step": 16734 }, { "epoch": 1.7176724137931034, "grad_norm": 0.05210564285516739, "learning_rate": 0.01, "loss": 2.0666, "step": 16737 }, { "epoch": 1.7179802955665024, "grad_norm": 0.0668390691280365, "learning_rate": 0.01, "loss": 2.0365, "step": 16740 }, { "epoch": 1.7182881773399015, "grad_norm": 0.05041831359267235, "learning_rate": 0.01, "loss": 2.0261, "step": 16743 }, { "epoch": 1.7185960591133005, "grad_norm": 0.04496284946799278, "learning_rate": 0.01, "loss": 2.0182, "step": 16746 }, { "epoch": 1.7189039408866995, "grad_norm": 0.08660906553268433, "learning_rate": 0.01, "loss": 2.0434, "step": 16749 }, { "epoch": 1.7192118226600985, "grad_norm": 0.054843079298734665, "learning_rate": 0.01, "loss": 2.0522, "step": 16752 }, { "epoch": 1.7195197044334976, "grad_norm": 0.05377354100346565, "learning_rate": 0.01, "loss": 2.0263, "step": 16755 }, { "epoch": 1.7198275862068966, "grad_norm": 0.060547634959220886, "learning_rate": 0.01, "loss": 2.0165, "step": 16758 }, { "epoch": 1.7201354679802956, "grad_norm": 0.06253104656934738, "learning_rate": 0.01, "loss": 2.0479, "step": 16761 }, { "epoch": 1.7204433497536946, "grad_norm": 0.052225805819034576, "learning_rate": 0.01, "loss": 2.03, "step": 16764 }, { "epoch": 1.7207512315270936, "grad_norm": 0.04988449066877365, "learning_rate": 0.01, "loss": 2.0498, "step": 16767 }, { "epoch": 1.7210591133004927, "grad_norm": 0.049726665019989014, "learning_rate": 0.01, "loss": 2.0588, "step": 16770 }, { "epoch": 1.7213669950738915, "grad_norm": 0.05053321272134781, "learning_rate": 0.01, "loss": 2.0518, "step": 16773 }, { "epoch": 1.7216748768472905, "grad_norm": 0.060702208429574966, "learning_rate": 0.01, "loss": 2.0448, "step": 16776 }, { "epoch": 1.7219827586206895, "grad_norm": 0.06695824861526489, "learning_rate": 0.01, "loss": 2.0361, "step": 16779 }, { "epoch": 1.7222906403940885, "grad_norm": 0.07498425990343094, "learning_rate": 0.01, "loss": 2.0652, "step": 16782 }, { "epoch": 1.7225985221674875, "grad_norm": 0.04544251784682274, "learning_rate": 0.01, "loss": 2.0294, "step": 16785 }, { "epoch": 1.7229064039408866, "grad_norm": 0.09211461246013641, "learning_rate": 0.01, "loss": 2.0261, "step": 16788 }, { "epoch": 1.7232142857142856, "grad_norm": 0.19020068645477295, "learning_rate": 0.01, "loss": 2.0459, "step": 16791 }, { "epoch": 1.7235221674876846, "grad_norm": 0.08979224413633347, "learning_rate": 0.01, "loss": 2.052, "step": 16794 }, { "epoch": 1.7238300492610836, "grad_norm": 0.06724744290113449, "learning_rate": 0.01, "loss": 2.0463, "step": 16797 }, { "epoch": 1.7241379310344827, "grad_norm": 0.05009309947490692, "learning_rate": 0.01, "loss": 2.0338, "step": 16800 }, { "epoch": 1.7244458128078817, "grad_norm": 0.04953375831246376, "learning_rate": 0.01, "loss": 2.0554, "step": 16803 }, { "epoch": 1.7247536945812807, "grad_norm": 0.12439639121294022, "learning_rate": 0.01, "loss": 2.0363, "step": 16806 }, { "epoch": 1.7250615763546797, "grad_norm": 0.126046821475029, "learning_rate": 0.01, "loss": 2.0435, "step": 16809 }, { "epoch": 1.7253694581280787, "grad_norm": 0.06693071871995926, "learning_rate": 0.01, "loss": 2.058, "step": 16812 }, { "epoch": 1.7256773399014778, "grad_norm": 0.04883793368935585, "learning_rate": 0.01, "loss": 2.0768, "step": 16815 }, { "epoch": 1.7259852216748768, "grad_norm": 0.06895219534635544, "learning_rate": 0.01, "loss": 2.057, "step": 16818 }, { "epoch": 1.7262931034482758, "grad_norm": 0.06979874521493912, "learning_rate": 0.01, "loss": 2.0641, "step": 16821 }, { "epoch": 1.7266009852216748, "grad_norm": 0.06453370302915573, "learning_rate": 0.01, "loss": 2.025, "step": 16824 }, { "epoch": 1.7269088669950738, "grad_norm": 0.11405997723340988, "learning_rate": 0.01, "loss": 2.0469, "step": 16827 }, { "epoch": 1.7272167487684729, "grad_norm": 0.10612863302230835, "learning_rate": 0.01, "loss": 2.0825, "step": 16830 }, { "epoch": 1.7275246305418719, "grad_norm": 0.09805281460285187, "learning_rate": 0.01, "loss": 2.0256, "step": 16833 }, { "epoch": 1.727832512315271, "grad_norm": 0.08369138836860657, "learning_rate": 0.01, "loss": 2.0307, "step": 16836 }, { "epoch": 1.72814039408867, "grad_norm": 0.08893299847841263, "learning_rate": 0.01, "loss": 2.0362, "step": 16839 }, { "epoch": 1.728448275862069, "grad_norm": 0.06902442872524261, "learning_rate": 0.01, "loss": 2.0562, "step": 16842 }, { "epoch": 1.728756157635468, "grad_norm": 0.058703918009996414, "learning_rate": 0.01, "loss": 2.0406, "step": 16845 }, { "epoch": 1.729064039408867, "grad_norm": 0.03979343920946121, "learning_rate": 0.01, "loss": 2.0636, "step": 16848 }, { "epoch": 1.729371921182266, "grad_norm": 0.08049800246953964, "learning_rate": 0.01, "loss": 2.0483, "step": 16851 }, { "epoch": 1.729679802955665, "grad_norm": 0.07980278134346008, "learning_rate": 0.01, "loss": 2.0471, "step": 16854 }, { "epoch": 1.729987684729064, "grad_norm": 0.051650844514369965, "learning_rate": 0.01, "loss": 2.0322, "step": 16857 }, { "epoch": 1.730295566502463, "grad_norm": 0.06828897446393967, "learning_rate": 0.01, "loss": 2.0456, "step": 16860 }, { "epoch": 1.730603448275862, "grad_norm": 0.10987185686826706, "learning_rate": 0.01, "loss": 2.0334, "step": 16863 }, { "epoch": 1.7309113300492611, "grad_norm": 0.055677276104688644, "learning_rate": 0.01, "loss": 2.018, "step": 16866 }, { "epoch": 1.7312192118226601, "grad_norm": 0.056478437036275864, "learning_rate": 0.01, "loss": 2.0241, "step": 16869 }, { "epoch": 1.7315270935960592, "grad_norm": 0.1247091144323349, "learning_rate": 0.01, "loss": 2.0128, "step": 16872 }, { "epoch": 1.7318349753694582, "grad_norm": 0.06603918969631195, "learning_rate": 0.01, "loss": 2.0456, "step": 16875 }, { "epoch": 1.7321428571428572, "grad_norm": 0.04174170270562172, "learning_rate": 0.01, "loss": 2.0349, "step": 16878 }, { "epoch": 1.7324507389162562, "grad_norm": 0.03841250762343407, "learning_rate": 0.01, "loss": 2.0365, "step": 16881 }, { "epoch": 1.7327586206896552, "grad_norm": 0.03429241105914116, "learning_rate": 0.01, "loss": 2.038, "step": 16884 }, { "epoch": 1.7330665024630543, "grad_norm": 0.05175672471523285, "learning_rate": 0.01, "loss": 2.0583, "step": 16887 }, { "epoch": 1.7333743842364533, "grad_norm": 0.06318958103656769, "learning_rate": 0.01, "loss": 2.0398, "step": 16890 }, { "epoch": 1.7336822660098523, "grad_norm": 0.08888188004493713, "learning_rate": 0.01, "loss": 2.0447, "step": 16893 }, { "epoch": 1.7339901477832513, "grad_norm": 0.04479747265577316, "learning_rate": 0.01, "loss": 2.0671, "step": 16896 }, { "epoch": 1.7342980295566504, "grad_norm": 0.05286455899477005, "learning_rate": 0.01, "loss": 2.0342, "step": 16899 }, { "epoch": 1.7346059113300494, "grad_norm": 0.04719952121376991, "learning_rate": 0.01, "loss": 2.0251, "step": 16902 }, { "epoch": 1.7349137931034484, "grad_norm": 0.06204066798090935, "learning_rate": 0.01, "loss": 2.0412, "step": 16905 }, { "epoch": 1.7352216748768474, "grad_norm": 0.1099078357219696, "learning_rate": 0.01, "loss": 2.008, "step": 16908 }, { "epoch": 1.7355295566502464, "grad_norm": 0.08311746269464493, "learning_rate": 0.01, "loss": 2.0248, "step": 16911 }, { "epoch": 1.7358374384236455, "grad_norm": 0.09649046510457993, "learning_rate": 0.01, "loss": 2.0585, "step": 16914 }, { "epoch": 1.7361453201970445, "grad_norm": 0.06716254353523254, "learning_rate": 0.01, "loss": 2.0128, "step": 16917 }, { "epoch": 1.7364532019704435, "grad_norm": 0.05400918424129486, "learning_rate": 0.01, "loss": 2.035, "step": 16920 }, { "epoch": 1.7367610837438425, "grad_norm": 0.06857965141534805, "learning_rate": 0.01, "loss": 2.0394, "step": 16923 }, { "epoch": 1.7370689655172413, "grad_norm": 0.05842095986008644, "learning_rate": 0.01, "loss": 2.0418, "step": 16926 }, { "epoch": 1.7373768472906403, "grad_norm": 0.06279005855321884, "learning_rate": 0.01, "loss": 2.0528, "step": 16929 }, { "epoch": 1.7376847290640394, "grad_norm": 0.04209805652499199, "learning_rate": 0.01, "loss": 2.0374, "step": 16932 }, { "epoch": 1.7379926108374384, "grad_norm": 0.07557601481676102, "learning_rate": 0.01, "loss": 2.0351, "step": 16935 }, { "epoch": 1.7383004926108374, "grad_norm": 0.05998634174466133, "learning_rate": 0.01, "loss": 2.0283, "step": 16938 }, { "epoch": 1.7386083743842364, "grad_norm": 0.05477887764573097, "learning_rate": 0.01, "loss": 2.0659, "step": 16941 }, { "epoch": 1.7389162561576355, "grad_norm": 0.09489591419696808, "learning_rate": 0.01, "loss": 2.0313, "step": 16944 }, { "epoch": 1.7392241379310345, "grad_norm": 0.13861320912837982, "learning_rate": 0.01, "loss": 2.0778, "step": 16947 }, { "epoch": 1.7395320197044335, "grad_norm": 0.1302078664302826, "learning_rate": 0.01, "loss": 2.024, "step": 16950 }, { "epoch": 1.7398399014778325, "grad_norm": 0.08335380256175995, "learning_rate": 0.01, "loss": 2.0612, "step": 16953 }, { "epoch": 1.7401477832512315, "grad_norm": 0.06901963800191879, "learning_rate": 0.01, "loss": 2.0357, "step": 16956 }, { "epoch": 1.7404556650246306, "grad_norm": 0.06619597226381302, "learning_rate": 0.01, "loss": 2.0207, "step": 16959 }, { "epoch": 1.7407635467980296, "grad_norm": 0.045459166169166565, "learning_rate": 0.01, "loss": 2.071, "step": 16962 }, { "epoch": 1.7410714285714286, "grad_norm": 0.06302090734243393, "learning_rate": 0.01, "loss": 2.0311, "step": 16965 }, { "epoch": 1.7413793103448276, "grad_norm": 0.07184530049562454, "learning_rate": 0.01, "loss": 2.0309, "step": 16968 }, { "epoch": 1.7416871921182266, "grad_norm": 0.07319154590368271, "learning_rate": 0.01, "loss": 2.0316, "step": 16971 }, { "epoch": 1.7419950738916257, "grad_norm": 0.0667872503399849, "learning_rate": 0.01, "loss": 2.0406, "step": 16974 }, { "epoch": 1.7423029556650245, "grad_norm": 0.0988057404756546, "learning_rate": 0.01, "loss": 2.0466, "step": 16977 }, { "epoch": 1.7426108374384235, "grad_norm": 0.09142420440912247, "learning_rate": 0.01, "loss": 2.0503, "step": 16980 }, { "epoch": 1.7429187192118225, "grad_norm": 0.07581423968076706, "learning_rate": 0.01, "loss": 2.0004, "step": 16983 }, { "epoch": 1.7432266009852215, "grad_norm": 0.05793863534927368, "learning_rate": 0.01, "loss": 2.0278, "step": 16986 }, { "epoch": 1.7435344827586206, "grad_norm": 0.0717446580529213, "learning_rate": 0.01, "loss": 2.061, "step": 16989 }, { "epoch": 1.7438423645320196, "grad_norm": 0.03572774678468704, "learning_rate": 0.01, "loss": 2.0663, "step": 16992 }, { "epoch": 1.7441502463054186, "grad_norm": 0.09349701553583145, "learning_rate": 0.01, "loss": 2.0433, "step": 16995 }, { "epoch": 1.7444581280788176, "grad_norm": 0.08374779671430588, "learning_rate": 0.01, "loss": 2.0228, "step": 16998 }, { "epoch": 1.7447660098522166, "grad_norm": 0.045029062777757645, "learning_rate": 0.01, "loss": 2.0425, "step": 17001 }, { "epoch": 1.7450738916256157, "grad_norm": 0.06359710544347763, "learning_rate": 0.01, "loss": 2.073, "step": 17004 }, { "epoch": 1.7453817733990147, "grad_norm": 0.0948299989104271, "learning_rate": 0.01, "loss": 2.0326, "step": 17007 }, { "epoch": 1.7456896551724137, "grad_norm": 0.05561600998044014, "learning_rate": 0.01, "loss": 2.0702, "step": 17010 }, { "epoch": 1.7459975369458127, "grad_norm": 0.05672406032681465, "learning_rate": 0.01, "loss": 2.0591, "step": 17013 }, { "epoch": 1.7463054187192117, "grad_norm": 0.047100841999053955, "learning_rate": 0.01, "loss": 2.0597, "step": 17016 }, { "epoch": 1.7466133004926108, "grad_norm": 0.09477028995752335, "learning_rate": 0.01, "loss": 2.046, "step": 17019 }, { "epoch": 1.7469211822660098, "grad_norm": 0.07235399633646011, "learning_rate": 0.01, "loss": 2.0778, "step": 17022 }, { "epoch": 1.7472290640394088, "grad_norm": 0.08015649020671844, "learning_rate": 0.01, "loss": 2.0219, "step": 17025 }, { "epoch": 1.7475369458128078, "grad_norm": 0.07459922134876251, "learning_rate": 0.01, "loss": 2.0278, "step": 17028 }, { "epoch": 1.7478448275862069, "grad_norm": 0.10745642334222794, "learning_rate": 0.01, "loss": 2.0536, "step": 17031 }, { "epoch": 1.7481527093596059, "grad_norm": 0.049479395151138306, "learning_rate": 0.01, "loss": 2.0477, "step": 17034 }, { "epoch": 1.748460591133005, "grad_norm": 0.03935602307319641, "learning_rate": 0.01, "loss": 2.0495, "step": 17037 }, { "epoch": 1.748768472906404, "grad_norm": 0.05755804106593132, "learning_rate": 0.01, "loss": 2.0186, "step": 17040 }, { "epoch": 1.749076354679803, "grad_norm": 0.07461614906787872, "learning_rate": 0.01, "loss": 2.0577, "step": 17043 }, { "epoch": 1.749384236453202, "grad_norm": 0.07078621536493301, "learning_rate": 0.01, "loss": 2.0159, "step": 17046 }, { "epoch": 1.749692118226601, "grad_norm": 0.12035417556762695, "learning_rate": 0.01, "loss": 2.0549, "step": 17049 }, { "epoch": 1.75, "grad_norm": 0.054343827068805695, "learning_rate": 0.01, "loss": 2.0394, "step": 17052 }, { "epoch": 1.750307881773399, "grad_norm": 0.056529540568590164, "learning_rate": 0.01, "loss": 2.0604, "step": 17055 }, { "epoch": 1.750615763546798, "grad_norm": 0.09392616152763367, "learning_rate": 0.01, "loss": 2.0233, "step": 17058 }, { "epoch": 1.750923645320197, "grad_norm": 0.0874391570687294, "learning_rate": 0.01, "loss": 2.0733, "step": 17061 }, { "epoch": 1.751231527093596, "grad_norm": 0.03889552876353264, "learning_rate": 0.01, "loss": 2.0099, "step": 17064 }, { "epoch": 1.751539408866995, "grad_norm": 0.06299902498722076, "learning_rate": 0.01, "loss": 2.01, "step": 17067 }, { "epoch": 1.7518472906403941, "grad_norm": 0.05655315890908241, "learning_rate": 0.01, "loss": 2.0571, "step": 17070 }, { "epoch": 1.7521551724137931, "grad_norm": 0.04646646976470947, "learning_rate": 0.01, "loss": 2.0401, "step": 17073 }, { "epoch": 1.7524630541871922, "grad_norm": 0.04910219460725784, "learning_rate": 0.01, "loss": 2.0527, "step": 17076 }, { "epoch": 1.7527709359605912, "grad_norm": 0.03616022691130638, "learning_rate": 0.01, "loss": 2.0152, "step": 17079 }, { "epoch": 1.7530788177339902, "grad_norm": 0.05539899319410324, "learning_rate": 0.01, "loss": 2.029, "step": 17082 }, { "epoch": 1.7533866995073892, "grad_norm": 0.09409206360578537, "learning_rate": 0.01, "loss": 2.0621, "step": 17085 }, { "epoch": 1.7536945812807883, "grad_norm": 0.048882581293582916, "learning_rate": 0.01, "loss": 2.057, "step": 17088 }, { "epoch": 1.7540024630541873, "grad_norm": 0.09687826037406921, "learning_rate": 0.01, "loss": 2.0472, "step": 17091 }, { "epoch": 1.7543103448275863, "grad_norm": 0.06646592915058136, "learning_rate": 0.01, "loss": 2.0469, "step": 17094 }, { "epoch": 1.7546182266009853, "grad_norm": 0.07316572964191437, "learning_rate": 0.01, "loss": 2.0427, "step": 17097 }, { "epoch": 1.7549261083743843, "grad_norm": 0.058037593960762024, "learning_rate": 0.01, "loss": 2.0462, "step": 17100 }, { "epoch": 1.7552339901477834, "grad_norm": 0.05486461892724037, "learning_rate": 0.01, "loss": 2.0426, "step": 17103 }, { "epoch": 1.7555418719211824, "grad_norm": 0.0397610180079937, "learning_rate": 0.01, "loss": 2.0397, "step": 17106 }, { "epoch": 1.7558497536945814, "grad_norm": 0.11639061570167542, "learning_rate": 0.01, "loss": 2.0248, "step": 17109 }, { "epoch": 1.7561576354679804, "grad_norm": 0.04207361862063408, "learning_rate": 0.01, "loss": 2.064, "step": 17112 }, { "epoch": 1.7564655172413794, "grad_norm": 0.09336013346910477, "learning_rate": 0.01, "loss": 2.0399, "step": 17115 }, { "epoch": 1.7567733990147785, "grad_norm": 0.059693820774555206, "learning_rate": 0.01, "loss": 2.0347, "step": 17118 }, { "epoch": 1.7570812807881775, "grad_norm": 0.05269778147339821, "learning_rate": 0.01, "loss": 2.0222, "step": 17121 }, { "epoch": 1.7573891625615765, "grad_norm": 0.0548628568649292, "learning_rate": 0.01, "loss": 2.0269, "step": 17124 }, { "epoch": 1.7576970443349755, "grad_norm": 0.049783483147621155, "learning_rate": 0.01, "loss": 2.0469, "step": 17127 }, { "epoch": 1.7580049261083743, "grad_norm": 0.11240525543689728, "learning_rate": 0.01, "loss": 2.0405, "step": 17130 }, { "epoch": 1.7583128078817734, "grad_norm": 0.04133368283510208, "learning_rate": 0.01, "loss": 2.064, "step": 17133 }, { "epoch": 1.7586206896551724, "grad_norm": 0.042926132678985596, "learning_rate": 0.01, "loss": 2.0685, "step": 17136 }, { "epoch": 1.7589285714285714, "grad_norm": 0.053613241761922836, "learning_rate": 0.01, "loss": 2.0241, "step": 17139 }, { "epoch": 1.7592364532019704, "grad_norm": 0.03950737044215202, "learning_rate": 0.01, "loss": 2.0662, "step": 17142 }, { "epoch": 1.7595443349753694, "grad_norm": 0.045378413051366806, "learning_rate": 0.01, "loss": 2.0401, "step": 17145 }, { "epoch": 1.7598522167487685, "grad_norm": 0.036304816603660583, "learning_rate": 0.01, "loss": 2.0789, "step": 17148 }, { "epoch": 1.7601600985221675, "grad_norm": 0.03886290267109871, "learning_rate": 0.01, "loss": 2.0516, "step": 17151 }, { "epoch": 1.7604679802955665, "grad_norm": 0.0885484591126442, "learning_rate": 0.01, "loss": 2.0333, "step": 17154 }, { "epoch": 1.7607758620689655, "grad_norm": 0.06733599305152893, "learning_rate": 0.01, "loss": 2.039, "step": 17157 }, { "epoch": 1.7610837438423645, "grad_norm": 0.10319662094116211, "learning_rate": 0.01, "loss": 2.0629, "step": 17160 }, { "epoch": 1.7613916256157636, "grad_norm": 0.047492869198322296, "learning_rate": 0.01, "loss": 2.0726, "step": 17163 }, { "epoch": 1.7616995073891626, "grad_norm": 0.04345547780394554, "learning_rate": 0.01, "loss": 2.0259, "step": 17166 }, { "epoch": 1.7620073891625616, "grad_norm": 0.0452197827398777, "learning_rate": 0.01, "loss": 2.0393, "step": 17169 }, { "epoch": 1.7623152709359606, "grad_norm": 0.0703844428062439, "learning_rate": 0.01, "loss": 2.0458, "step": 17172 }, { "epoch": 1.7626231527093597, "grad_norm": 0.07864879071712494, "learning_rate": 0.01, "loss": 2.047, "step": 17175 }, { "epoch": 1.7629310344827587, "grad_norm": 0.1282995641231537, "learning_rate": 0.01, "loss": 2.0226, "step": 17178 }, { "epoch": 1.7632389162561575, "grad_norm": 0.0837298184633255, "learning_rate": 0.01, "loss": 2.0483, "step": 17181 }, { "epoch": 1.7635467980295565, "grad_norm": 0.05081562697887421, "learning_rate": 0.01, "loss": 2.0656, "step": 17184 }, { "epoch": 1.7638546798029555, "grad_norm": 0.07952243834733963, "learning_rate": 0.01, "loss": 2.0561, "step": 17187 }, { "epoch": 1.7641625615763545, "grad_norm": 0.06592147797346115, "learning_rate": 0.01, "loss": 2.0311, "step": 17190 }, { "epoch": 1.7644704433497536, "grad_norm": 0.04341195523738861, "learning_rate": 0.01, "loss": 2.0413, "step": 17193 }, { "epoch": 1.7647783251231526, "grad_norm": 0.04649266228079796, "learning_rate": 0.01, "loss": 2.0338, "step": 17196 }, { "epoch": 1.7650862068965516, "grad_norm": 0.04569242149591446, "learning_rate": 0.01, "loss": 2.0428, "step": 17199 }, { "epoch": 1.7653940886699506, "grad_norm": 0.040291350334882736, "learning_rate": 0.01, "loss": 2.0165, "step": 17202 }, { "epoch": 1.7657019704433496, "grad_norm": 0.05328141525387764, "learning_rate": 0.01, "loss": 2.0384, "step": 17205 }, { "epoch": 1.7660098522167487, "grad_norm": 0.04405885562300682, "learning_rate": 0.01, "loss": 2.0566, "step": 17208 }, { "epoch": 1.7663177339901477, "grad_norm": 0.06635614484548569, "learning_rate": 0.01, "loss": 2.0397, "step": 17211 }, { "epoch": 1.7666256157635467, "grad_norm": 0.09231774508953094, "learning_rate": 0.01, "loss": 2.0247, "step": 17214 }, { "epoch": 1.7669334975369457, "grad_norm": 0.056320998817682266, "learning_rate": 0.01, "loss": 2.066, "step": 17217 }, { "epoch": 1.7672413793103448, "grad_norm": 0.049784105271101, "learning_rate": 0.01, "loss": 2.0175, "step": 17220 }, { "epoch": 1.7675492610837438, "grad_norm": 0.03728071227669716, "learning_rate": 0.01, "loss": 2.0403, "step": 17223 }, { "epoch": 1.7678571428571428, "grad_norm": 0.06607525050640106, "learning_rate": 0.01, "loss": 2.0364, "step": 17226 }, { "epoch": 1.7681650246305418, "grad_norm": 0.07367686927318573, "learning_rate": 0.01, "loss": 2.0478, "step": 17229 }, { "epoch": 1.7684729064039408, "grad_norm": 0.039499782025814056, "learning_rate": 0.01, "loss": 2.0508, "step": 17232 }, { "epoch": 1.7687807881773399, "grad_norm": 0.04863186180591583, "learning_rate": 0.01, "loss": 2.0595, "step": 17235 }, { "epoch": 1.7690886699507389, "grad_norm": 0.03877348452806473, "learning_rate": 0.01, "loss": 2.0608, "step": 17238 }, { "epoch": 1.769396551724138, "grad_norm": 0.049965135753154755, "learning_rate": 0.01, "loss": 2.0521, "step": 17241 }, { "epoch": 1.769704433497537, "grad_norm": 0.0697547048330307, "learning_rate": 0.01, "loss": 2.0191, "step": 17244 }, { "epoch": 1.770012315270936, "grad_norm": 0.0562531016767025, "learning_rate": 0.01, "loss": 2.0581, "step": 17247 }, { "epoch": 1.770320197044335, "grad_norm": 0.12931805849075317, "learning_rate": 0.01, "loss": 2.072, "step": 17250 }, { "epoch": 1.770628078817734, "grad_norm": 0.06590058654546738, "learning_rate": 0.01, "loss": 2.0487, "step": 17253 }, { "epoch": 1.770935960591133, "grad_norm": 0.045246463268995285, "learning_rate": 0.01, "loss": 2.0424, "step": 17256 }, { "epoch": 1.771243842364532, "grad_norm": 0.03972258046269417, "learning_rate": 0.01, "loss": 2.043, "step": 17259 }, { "epoch": 1.771551724137931, "grad_norm": 0.030682874843478203, "learning_rate": 0.01, "loss": 2.0665, "step": 17262 }, { "epoch": 1.77185960591133, "grad_norm": 0.08989464491605759, "learning_rate": 0.01, "loss": 2.042, "step": 17265 }, { "epoch": 1.772167487684729, "grad_norm": 0.05595966801047325, "learning_rate": 0.01, "loss": 2.0399, "step": 17268 }, { "epoch": 1.7724753694581281, "grad_norm": 0.16923703253269196, "learning_rate": 0.01, "loss": 2.0161, "step": 17271 }, { "epoch": 1.7727832512315271, "grad_norm": 0.08722022920846939, "learning_rate": 0.01, "loss": 2.0379, "step": 17274 }, { "epoch": 1.7730911330049262, "grad_norm": 0.0741046667098999, "learning_rate": 0.01, "loss": 2.0512, "step": 17277 }, { "epoch": 1.7733990147783252, "grad_norm": 0.06061973422765732, "learning_rate": 0.01, "loss": 2.0318, "step": 17280 }, { "epoch": 1.7737068965517242, "grad_norm": 0.036843594163656235, "learning_rate": 0.01, "loss": 2.056, "step": 17283 }, { "epoch": 1.7740147783251232, "grad_norm": 0.03937767818570137, "learning_rate": 0.01, "loss": 2.019, "step": 17286 }, { "epoch": 1.7743226600985222, "grad_norm": 0.03801162540912628, "learning_rate": 0.01, "loss": 2.04, "step": 17289 }, { "epoch": 1.7746305418719213, "grad_norm": 0.045572392642498016, "learning_rate": 0.01, "loss": 2.0665, "step": 17292 }, { "epoch": 1.7749384236453203, "grad_norm": 0.06430240720510483, "learning_rate": 0.01, "loss": 2.0357, "step": 17295 }, { "epoch": 1.7752463054187193, "grad_norm": 0.09266401827335358, "learning_rate": 0.01, "loss": 2.0474, "step": 17298 }, { "epoch": 1.7755541871921183, "grad_norm": 0.09686179459095001, "learning_rate": 0.01, "loss": 2.0224, "step": 17301 }, { "epoch": 1.7758620689655173, "grad_norm": 0.04640132188796997, "learning_rate": 0.01, "loss": 2.0611, "step": 17304 }, { "epoch": 1.7761699507389164, "grad_norm": 0.03891894221305847, "learning_rate": 0.01, "loss": 2.0529, "step": 17307 }, { "epoch": 1.7764778325123154, "grad_norm": 0.06023077294230461, "learning_rate": 0.01, "loss": 2.0282, "step": 17310 }, { "epoch": 1.7767857142857144, "grad_norm": 0.12215135246515274, "learning_rate": 0.01, "loss": 2.0472, "step": 17313 }, { "epoch": 1.7770935960591134, "grad_norm": 0.04197768121957779, "learning_rate": 0.01, "loss": 2.038, "step": 17316 }, { "epoch": 1.7774014778325125, "grad_norm": 0.0429445244371891, "learning_rate": 0.01, "loss": 2.0291, "step": 17319 }, { "epoch": 1.7777093596059115, "grad_norm": 0.04674970358610153, "learning_rate": 0.01, "loss": 2.0493, "step": 17322 }, { "epoch": 1.7780172413793105, "grad_norm": 0.11712675541639328, "learning_rate": 0.01, "loss": 2.0421, "step": 17325 }, { "epoch": 1.7783251231527095, "grad_norm": 0.04812907800078392, "learning_rate": 0.01, "loss": 2.0395, "step": 17328 }, { "epoch": 1.7786330049261085, "grad_norm": 0.04147825017571449, "learning_rate": 0.01, "loss": 2.0057, "step": 17331 }, { "epoch": 1.7789408866995073, "grad_norm": 0.07262876629829407, "learning_rate": 0.01, "loss": 2.0383, "step": 17334 }, { "epoch": 1.7792487684729064, "grad_norm": 0.08528011292219162, "learning_rate": 0.01, "loss": 2.0151, "step": 17337 }, { "epoch": 1.7795566502463054, "grad_norm": 0.046615902334451675, "learning_rate": 0.01, "loss": 2.0368, "step": 17340 }, { "epoch": 1.7798645320197044, "grad_norm": 0.06018273904919624, "learning_rate": 0.01, "loss": 2.0411, "step": 17343 }, { "epoch": 1.7801724137931034, "grad_norm": 0.07272887974977493, "learning_rate": 0.01, "loss": 2.026, "step": 17346 }, { "epoch": 1.7804802955665024, "grad_norm": 0.07152794301509857, "learning_rate": 0.01, "loss": 2.0631, "step": 17349 }, { "epoch": 1.7807881773399015, "grad_norm": 0.07950329035520554, "learning_rate": 0.01, "loss": 2.0435, "step": 17352 }, { "epoch": 1.7810960591133005, "grad_norm": 0.040778059512376785, "learning_rate": 0.01, "loss": 2.0089, "step": 17355 }, { "epoch": 1.7814039408866995, "grad_norm": 0.06180460751056671, "learning_rate": 0.01, "loss": 2.0183, "step": 17358 }, { "epoch": 1.7817118226600985, "grad_norm": 0.06950334459543228, "learning_rate": 0.01, "loss": 2.0352, "step": 17361 }, { "epoch": 1.7820197044334976, "grad_norm": 0.037724483758211136, "learning_rate": 0.01, "loss": 2.0324, "step": 17364 }, { "epoch": 1.7823275862068966, "grad_norm": 0.05991238355636597, "learning_rate": 0.01, "loss": 2.053, "step": 17367 }, { "epoch": 1.7826354679802956, "grad_norm": 0.047278665006160736, "learning_rate": 0.01, "loss": 2.0427, "step": 17370 }, { "epoch": 1.7829433497536946, "grad_norm": 0.05376293137669563, "learning_rate": 0.01, "loss": 2.0315, "step": 17373 }, { "epoch": 1.7832512315270936, "grad_norm": 0.04049403965473175, "learning_rate": 0.01, "loss": 2.0483, "step": 17376 }, { "epoch": 1.7835591133004927, "grad_norm": 0.04954640567302704, "learning_rate": 0.01, "loss": 2.0393, "step": 17379 }, { "epoch": 1.7838669950738915, "grad_norm": 0.049089133739471436, "learning_rate": 0.01, "loss": 2.0633, "step": 17382 }, { "epoch": 1.7841748768472905, "grad_norm": 0.0531185045838356, "learning_rate": 0.01, "loss": 2.0474, "step": 17385 }, { "epoch": 1.7844827586206895, "grad_norm": 0.060973040759563446, "learning_rate": 0.01, "loss": 2.0219, "step": 17388 }, { "epoch": 1.7847906403940885, "grad_norm": 0.044274650514125824, "learning_rate": 0.01, "loss": 2.0403, "step": 17391 }, { "epoch": 1.7850985221674875, "grad_norm": 0.08154580742120743, "learning_rate": 0.01, "loss": 2.011, "step": 17394 }, { "epoch": 1.7854064039408866, "grad_norm": 0.05253531411290169, "learning_rate": 0.01, "loss": 2.0352, "step": 17397 }, { "epoch": 1.7857142857142856, "grad_norm": 0.056620582938194275, "learning_rate": 0.01, "loss": 2.04, "step": 17400 }, { "epoch": 1.7860221674876846, "grad_norm": 0.069371297955513, "learning_rate": 0.01, "loss": 2.0456, "step": 17403 }, { "epoch": 1.7863300492610836, "grad_norm": 0.04726189747452736, "learning_rate": 0.01, "loss": 2.018, "step": 17406 }, { "epoch": 1.7866379310344827, "grad_norm": 0.11150949448347092, "learning_rate": 0.01, "loss": 2.0503, "step": 17409 }, { "epoch": 1.7869458128078817, "grad_norm": 0.07482532411813736, "learning_rate": 0.01, "loss": 2.0361, "step": 17412 }, { "epoch": 1.7872536945812807, "grad_norm": 0.03803645819425583, "learning_rate": 0.01, "loss": 2.0555, "step": 17415 }, { "epoch": 1.7875615763546797, "grad_norm": 0.08635829389095306, "learning_rate": 0.01, "loss": 2.0551, "step": 17418 }, { "epoch": 1.7878694581280787, "grad_norm": 0.08558929711580276, "learning_rate": 0.01, "loss": 2.0611, "step": 17421 }, { "epoch": 1.7881773399014778, "grad_norm": 0.051051054149866104, "learning_rate": 0.01, "loss": 2.0375, "step": 17424 }, { "epoch": 1.7884852216748768, "grad_norm": 0.0584864616394043, "learning_rate": 0.01, "loss": 2.0131, "step": 17427 }, { "epoch": 1.7887931034482758, "grad_norm": 0.04015490040183067, "learning_rate": 0.01, "loss": 2.0559, "step": 17430 }, { "epoch": 1.7891009852216748, "grad_norm": 0.0499749630689621, "learning_rate": 0.01, "loss": 2.0611, "step": 17433 }, { "epoch": 1.7894088669950738, "grad_norm": 0.08796360343694687, "learning_rate": 0.01, "loss": 2.0538, "step": 17436 }, { "epoch": 1.7897167487684729, "grad_norm": 0.08200754970312119, "learning_rate": 0.01, "loss": 2.0459, "step": 17439 }, { "epoch": 1.7900246305418719, "grad_norm": 0.09300393611192703, "learning_rate": 0.01, "loss": 2.051, "step": 17442 }, { "epoch": 1.790332512315271, "grad_norm": 0.08223576098680496, "learning_rate": 0.01, "loss": 2.0194, "step": 17445 }, { "epoch": 1.79064039408867, "grad_norm": 0.05235210806131363, "learning_rate": 0.01, "loss": 2.0421, "step": 17448 }, { "epoch": 1.790948275862069, "grad_norm": 0.047677502036094666, "learning_rate": 0.01, "loss": 2.043, "step": 17451 }, { "epoch": 1.791256157635468, "grad_norm": 0.044341955333948135, "learning_rate": 0.01, "loss": 2.0381, "step": 17454 }, { "epoch": 1.791564039408867, "grad_norm": 0.09555595368146896, "learning_rate": 0.01, "loss": 2.0224, "step": 17457 }, { "epoch": 1.791871921182266, "grad_norm": 0.05652477219700813, "learning_rate": 0.01, "loss": 2.0318, "step": 17460 }, { "epoch": 1.792179802955665, "grad_norm": 0.0979117676615715, "learning_rate": 0.01, "loss": 2.0747, "step": 17463 }, { "epoch": 1.792487684729064, "grad_norm": 0.0674947127699852, "learning_rate": 0.01, "loss": 2.0723, "step": 17466 }, { "epoch": 1.792795566502463, "grad_norm": 0.05617907643318176, "learning_rate": 0.01, "loss": 2.0444, "step": 17469 }, { "epoch": 1.793103448275862, "grad_norm": 0.10979234427213669, "learning_rate": 0.01, "loss": 2.0638, "step": 17472 }, { "epoch": 1.7934113300492611, "grad_norm": 0.056006476283073425, "learning_rate": 0.01, "loss": 2.0396, "step": 17475 }, { "epoch": 1.7937192118226601, "grad_norm": 0.10030517727136612, "learning_rate": 0.01, "loss": 2.0379, "step": 17478 }, { "epoch": 1.7940270935960592, "grad_norm": 0.042350657284259796, "learning_rate": 0.01, "loss": 2.0319, "step": 17481 }, { "epoch": 1.7943349753694582, "grad_norm": 0.03725098446011543, "learning_rate": 0.01, "loss": 2.0537, "step": 17484 }, { "epoch": 1.7946428571428572, "grad_norm": 0.09215757250785828, "learning_rate": 0.01, "loss": 2.0247, "step": 17487 }, { "epoch": 1.7949507389162562, "grad_norm": 0.08012344688177109, "learning_rate": 0.01, "loss": 2.0428, "step": 17490 }, { "epoch": 1.7952586206896552, "grad_norm": 0.128404900431633, "learning_rate": 0.01, "loss": 2.038, "step": 17493 }, { "epoch": 1.7955665024630543, "grad_norm": 0.08718766272068024, "learning_rate": 0.01, "loss": 2.0557, "step": 17496 }, { "epoch": 1.7958743842364533, "grad_norm": 0.030426733195781708, "learning_rate": 0.01, "loss": 2.0192, "step": 17499 }, { "epoch": 1.7961822660098523, "grad_norm": 0.03950949385762215, "learning_rate": 0.01, "loss": 2.0228, "step": 17502 }, { "epoch": 1.7964901477832513, "grad_norm": 0.049466658383607864, "learning_rate": 0.01, "loss": 2.0514, "step": 17505 }, { "epoch": 1.7967980295566504, "grad_norm": 0.06188172101974487, "learning_rate": 0.01, "loss": 2.0512, "step": 17508 }, { "epoch": 1.7971059113300494, "grad_norm": 0.06420351564884186, "learning_rate": 0.01, "loss": 2.0365, "step": 17511 }, { "epoch": 1.7974137931034484, "grad_norm": 0.04329871013760567, "learning_rate": 0.01, "loss": 2.0511, "step": 17514 }, { "epoch": 1.7977216748768474, "grad_norm": 0.04420280084013939, "learning_rate": 0.01, "loss": 2.0481, "step": 17517 }, { "epoch": 1.7980295566502464, "grad_norm": 0.04043954238295555, "learning_rate": 0.01, "loss": 2.0184, "step": 17520 }, { "epoch": 1.7983374384236455, "grad_norm": 0.049305226653814316, "learning_rate": 0.01, "loss": 2.0353, "step": 17523 }, { "epoch": 1.7986453201970445, "grad_norm": 0.1928088515996933, "learning_rate": 0.01, "loss": 2.0869, "step": 17526 }, { "epoch": 1.7989532019704435, "grad_norm": 0.12283357232809067, "learning_rate": 0.01, "loss": 2.0378, "step": 17529 }, { "epoch": 1.7992610837438425, "grad_norm": 0.07897382229566574, "learning_rate": 0.01, "loss": 2.045, "step": 17532 }, { "epoch": 1.7995689655172413, "grad_norm": 0.0749836266040802, "learning_rate": 0.01, "loss": 2.0388, "step": 17535 }, { "epoch": 1.7998768472906403, "grad_norm": 0.06578727811574936, "learning_rate": 0.01, "loss": 2.0575, "step": 17538 }, { "epoch": 1.8001847290640394, "grad_norm": 0.06609571725130081, "learning_rate": 0.01, "loss": 2.0448, "step": 17541 }, { "epoch": 1.8004926108374384, "grad_norm": 0.047696053981781006, "learning_rate": 0.01, "loss": 2.0574, "step": 17544 }, { "epoch": 1.8008004926108374, "grad_norm": 0.05110754072666168, "learning_rate": 0.01, "loss": 2.0191, "step": 17547 }, { "epoch": 1.8011083743842364, "grad_norm": 0.03783520683646202, "learning_rate": 0.01, "loss": 2.0328, "step": 17550 }, { "epoch": 1.8014162561576355, "grad_norm": 0.03145405650138855, "learning_rate": 0.01, "loss": 2.0373, "step": 17553 }, { "epoch": 1.8017241379310345, "grad_norm": 0.09492892026901245, "learning_rate": 0.01, "loss": 2.0173, "step": 17556 }, { "epoch": 1.8020320197044335, "grad_norm": 0.06920488178730011, "learning_rate": 0.01, "loss": 2.0809, "step": 17559 }, { "epoch": 1.8023399014778325, "grad_norm": 0.0583655945956707, "learning_rate": 0.01, "loss": 2.0259, "step": 17562 }, { "epoch": 1.8026477832512315, "grad_norm": 0.08449242264032364, "learning_rate": 0.01, "loss": 2.0205, "step": 17565 }, { "epoch": 1.8029556650246306, "grad_norm": 0.12186135351657867, "learning_rate": 0.01, "loss": 2.0076, "step": 17568 }, { "epoch": 1.8032635467980296, "grad_norm": 0.09926268458366394, "learning_rate": 0.01, "loss": 2.0444, "step": 17571 }, { "epoch": 1.8035714285714286, "grad_norm": 0.06820474565029144, "learning_rate": 0.01, "loss": 2.0211, "step": 17574 }, { "epoch": 1.8038793103448276, "grad_norm": 0.050847604870796204, "learning_rate": 0.01, "loss": 2.0377, "step": 17577 }, { "epoch": 1.8041871921182266, "grad_norm": 0.053053803741931915, "learning_rate": 0.01, "loss": 2.0462, "step": 17580 }, { "epoch": 1.8044950738916257, "grad_norm": 0.047114890068769455, "learning_rate": 0.01, "loss": 2.0171, "step": 17583 }, { "epoch": 1.8048029556650245, "grad_norm": 0.05182573199272156, "learning_rate": 0.01, "loss": 2.0396, "step": 17586 }, { "epoch": 1.8051108374384235, "grad_norm": 0.12609605491161346, "learning_rate": 0.01, "loss": 2.053, "step": 17589 }, { "epoch": 1.8054187192118225, "grad_norm": 0.0496569462120533, "learning_rate": 0.01, "loss": 2.0418, "step": 17592 }, { "epoch": 1.8057266009852215, "grad_norm": 0.0490572527050972, "learning_rate": 0.01, "loss": 2.0199, "step": 17595 }, { "epoch": 1.8060344827586206, "grad_norm": 0.038300756365060806, "learning_rate": 0.01, "loss": 2.0337, "step": 17598 }, { "epoch": 1.8063423645320196, "grad_norm": 0.03666609153151512, "learning_rate": 0.01, "loss": 2.0392, "step": 17601 }, { "epoch": 1.8066502463054186, "grad_norm": 0.036330632865428925, "learning_rate": 0.01, "loss": 2.0319, "step": 17604 }, { "epoch": 1.8069581280788176, "grad_norm": 0.0605342797935009, "learning_rate": 0.01, "loss": 2.0356, "step": 17607 }, { "epoch": 1.8072660098522166, "grad_norm": 0.04346880316734314, "learning_rate": 0.01, "loss": 2.0188, "step": 17610 }, { "epoch": 1.8075738916256157, "grad_norm": 0.06400660425424576, "learning_rate": 0.01, "loss": 2.0342, "step": 17613 }, { "epoch": 1.8078817733990147, "grad_norm": 0.0812198668718338, "learning_rate": 0.01, "loss": 2.0622, "step": 17616 }, { "epoch": 1.8081896551724137, "grad_norm": 0.06756972521543503, "learning_rate": 0.01, "loss": 2.0396, "step": 17619 }, { "epoch": 1.8084975369458127, "grad_norm": 0.05277147516608238, "learning_rate": 0.01, "loss": 2.0149, "step": 17622 }, { "epoch": 1.8088054187192117, "grad_norm": 0.07904385775327682, "learning_rate": 0.01, "loss": 2.0393, "step": 17625 }, { "epoch": 1.8091133004926108, "grad_norm": 0.06955704092979431, "learning_rate": 0.01, "loss": 2.0433, "step": 17628 }, { "epoch": 1.8094211822660098, "grad_norm": 0.06605497002601624, "learning_rate": 0.01, "loss": 2.0439, "step": 17631 }, { "epoch": 1.8097290640394088, "grad_norm": 0.03861093521118164, "learning_rate": 0.01, "loss": 2.03, "step": 17634 }, { "epoch": 1.8100369458128078, "grad_norm": 0.04323074221611023, "learning_rate": 0.01, "loss": 2.0444, "step": 17637 }, { "epoch": 1.8103448275862069, "grad_norm": 0.03443233296275139, "learning_rate": 0.01, "loss": 2.0466, "step": 17640 }, { "epoch": 1.8106527093596059, "grad_norm": 0.04190131649374962, "learning_rate": 0.01, "loss": 2.0307, "step": 17643 }, { "epoch": 1.810960591133005, "grad_norm": 0.09095717966556549, "learning_rate": 0.01, "loss": 2.0529, "step": 17646 }, { "epoch": 1.811268472906404, "grad_norm": 0.05452005937695503, "learning_rate": 0.01, "loss": 2.0337, "step": 17649 }, { "epoch": 1.811576354679803, "grad_norm": 0.05032350867986679, "learning_rate": 0.01, "loss": 2.0398, "step": 17652 }, { "epoch": 1.811884236453202, "grad_norm": 0.05733015760779381, "learning_rate": 0.01, "loss": 2.0573, "step": 17655 }, { "epoch": 1.812192118226601, "grad_norm": 0.09373817592859268, "learning_rate": 0.01, "loss": 2.0278, "step": 17658 }, { "epoch": 1.8125, "grad_norm": 0.07385890185832977, "learning_rate": 0.01, "loss": 2.032, "step": 17661 }, { "epoch": 1.812807881773399, "grad_norm": 0.08643963187932968, "learning_rate": 0.01, "loss": 2.0351, "step": 17664 }, { "epoch": 1.813115763546798, "grad_norm": 0.09909530729055405, "learning_rate": 0.01, "loss": 2.0562, "step": 17667 }, { "epoch": 1.813423645320197, "grad_norm": 0.04600978642702103, "learning_rate": 0.01, "loss": 2.0219, "step": 17670 }, { "epoch": 1.813731527093596, "grad_norm": 0.033060222864151, "learning_rate": 0.01, "loss": 2.0479, "step": 17673 }, { "epoch": 1.814039408866995, "grad_norm": 0.03789517655968666, "learning_rate": 0.01, "loss": 2.0242, "step": 17676 }, { "epoch": 1.8143472906403941, "grad_norm": 0.0502844899892807, "learning_rate": 0.01, "loss": 2.0519, "step": 17679 }, { "epoch": 1.8146551724137931, "grad_norm": 0.0627695843577385, "learning_rate": 0.01, "loss": 2.0327, "step": 17682 }, { "epoch": 1.8149630541871922, "grad_norm": 0.15737055242061615, "learning_rate": 0.01, "loss": 2.0572, "step": 17685 }, { "epoch": 1.8152709359605912, "grad_norm": 0.09944868832826614, "learning_rate": 0.01, "loss": 2.0462, "step": 17688 }, { "epoch": 1.8155788177339902, "grad_norm": 0.12345952540636063, "learning_rate": 0.01, "loss": 2.0447, "step": 17691 }, { "epoch": 1.8158866995073892, "grad_norm": 0.06330909579992294, "learning_rate": 0.01, "loss": 2.0511, "step": 17694 }, { "epoch": 1.8161945812807883, "grad_norm": 0.0584748238325119, "learning_rate": 0.01, "loss": 2.0164, "step": 17697 }, { "epoch": 1.8165024630541873, "grad_norm": 0.07284627109766006, "learning_rate": 0.01, "loss": 2.0308, "step": 17700 }, { "epoch": 1.8168103448275863, "grad_norm": 0.07302995771169662, "learning_rate": 0.01, "loss": 2.0347, "step": 17703 }, { "epoch": 1.8171182266009853, "grad_norm": 0.06292667984962463, "learning_rate": 0.01, "loss": 2.026, "step": 17706 }, { "epoch": 1.8174261083743843, "grad_norm": 0.04821958392858505, "learning_rate": 0.01, "loss": 2.033, "step": 17709 }, { "epoch": 1.8177339901477834, "grad_norm": 0.03572079911828041, "learning_rate": 0.01, "loss": 2.047, "step": 17712 }, { "epoch": 1.8180418719211824, "grad_norm": 0.12643416225910187, "learning_rate": 0.01, "loss": 2.0621, "step": 17715 }, { "epoch": 1.8183497536945814, "grad_norm": 0.08803770691156387, "learning_rate": 0.01, "loss": 2.0422, "step": 17718 }, { "epoch": 1.8186576354679804, "grad_norm": 0.061583418399095535, "learning_rate": 0.01, "loss": 1.9895, "step": 17721 }, { "epoch": 1.8189655172413794, "grad_norm": 0.04947415366768837, "learning_rate": 0.01, "loss": 2.0249, "step": 17724 }, { "epoch": 1.8192733990147785, "grad_norm": 0.06042906641960144, "learning_rate": 0.01, "loss": 2.0563, "step": 17727 }, { "epoch": 1.8195812807881775, "grad_norm": 0.03236406669020653, "learning_rate": 0.01, "loss": 2.0482, "step": 17730 }, { "epoch": 1.8198891625615765, "grad_norm": 0.05975859984755516, "learning_rate": 0.01, "loss": 2.0353, "step": 17733 }, { "epoch": 1.8201970443349755, "grad_norm": 0.11028258502483368, "learning_rate": 0.01, "loss": 2.0654, "step": 17736 }, { "epoch": 1.8205049261083743, "grad_norm": 0.055842846632003784, "learning_rate": 0.01, "loss": 2.0589, "step": 17739 }, { "epoch": 1.8208128078817734, "grad_norm": 0.09189102053642273, "learning_rate": 0.01, "loss": 2.0444, "step": 17742 }, { "epoch": 1.8211206896551724, "grad_norm": 0.07795927673578262, "learning_rate": 0.01, "loss": 2.0628, "step": 17745 }, { "epoch": 1.8214285714285714, "grad_norm": 0.06452701985836029, "learning_rate": 0.01, "loss": 2.0415, "step": 17748 }, { "epoch": 1.8217364532019704, "grad_norm": 0.056360337883234024, "learning_rate": 0.01, "loss": 2.0159, "step": 17751 }, { "epoch": 1.8220443349753694, "grad_norm": 0.08861987292766571, "learning_rate": 0.01, "loss": 2.0325, "step": 17754 }, { "epoch": 1.8223522167487685, "grad_norm": 0.07276416569948196, "learning_rate": 0.01, "loss": 2.022, "step": 17757 }, { "epoch": 1.8226600985221675, "grad_norm": 0.07501320540904999, "learning_rate": 0.01, "loss": 2.0592, "step": 17760 }, { "epoch": 1.8229679802955665, "grad_norm": 0.08408310264348984, "learning_rate": 0.01, "loss": 2.0353, "step": 17763 }, { "epoch": 1.8232758620689655, "grad_norm": 0.039008378982543945, "learning_rate": 0.01, "loss": 2.0541, "step": 17766 }, { "epoch": 1.8235837438423645, "grad_norm": 0.05153367295861244, "learning_rate": 0.01, "loss": 2.0614, "step": 17769 }, { "epoch": 1.8238916256157636, "grad_norm": 0.05685068294405937, "learning_rate": 0.01, "loss": 2.0603, "step": 17772 }, { "epoch": 1.8241995073891626, "grad_norm": 0.10836745798587799, "learning_rate": 0.01, "loss": 2.0293, "step": 17775 }, { "epoch": 1.8245073891625616, "grad_norm": 0.13855011761188507, "learning_rate": 0.01, "loss": 2.044, "step": 17778 }, { "epoch": 1.8248152709359606, "grad_norm": 0.07912803441286087, "learning_rate": 0.01, "loss": 2.062, "step": 17781 }, { "epoch": 1.8251231527093597, "grad_norm": 0.065729521214962, "learning_rate": 0.01, "loss": 2.0416, "step": 17784 }, { "epoch": 1.8254310344827587, "grad_norm": 0.04546307399868965, "learning_rate": 0.01, "loss": 2.0291, "step": 17787 }, { "epoch": 1.8257389162561575, "grad_norm": 0.03415641188621521, "learning_rate": 0.01, "loss": 2.0391, "step": 17790 }, { "epoch": 1.8260467980295565, "grad_norm": 0.038325123488903046, "learning_rate": 0.01, "loss": 2.0249, "step": 17793 }, { "epoch": 1.8263546798029555, "grad_norm": 0.057417213916778564, "learning_rate": 0.01, "loss": 2.0465, "step": 17796 }, { "epoch": 1.8266625615763545, "grad_norm": 0.07312962412834167, "learning_rate": 0.01, "loss": 2.009, "step": 17799 }, { "epoch": 1.8269704433497536, "grad_norm": 0.06465096771717072, "learning_rate": 0.01, "loss": 2.0582, "step": 17802 }, { "epoch": 1.8272783251231526, "grad_norm": 0.049065001308918, "learning_rate": 0.01, "loss": 2.0466, "step": 17805 }, { "epoch": 1.8275862068965516, "grad_norm": 0.05004505068063736, "learning_rate": 0.01, "loss": 2.0368, "step": 17808 }, { "epoch": 1.8278940886699506, "grad_norm": 0.12177273631095886, "learning_rate": 0.01, "loss": 2.0299, "step": 17811 }, { "epoch": 1.8282019704433496, "grad_norm": 0.09006219357252121, "learning_rate": 0.01, "loss": 2.0442, "step": 17814 }, { "epoch": 1.8285098522167487, "grad_norm": 0.07000398635864258, "learning_rate": 0.01, "loss": 2.0346, "step": 17817 }, { "epoch": 1.8288177339901477, "grad_norm": 0.03561507910490036, "learning_rate": 0.01, "loss": 2.022, "step": 17820 }, { "epoch": 1.8291256157635467, "grad_norm": 0.050965216010808945, "learning_rate": 0.01, "loss": 2.0577, "step": 17823 }, { "epoch": 1.8294334975369457, "grad_norm": 0.04437123239040375, "learning_rate": 0.01, "loss": 2.0204, "step": 17826 }, { "epoch": 1.8297413793103448, "grad_norm": 0.046157170087099075, "learning_rate": 0.01, "loss": 2.0315, "step": 17829 }, { "epoch": 1.8300492610837438, "grad_norm": 0.0641985610127449, "learning_rate": 0.01, "loss": 2.0619, "step": 17832 }, { "epoch": 1.8303571428571428, "grad_norm": 0.10295763611793518, "learning_rate": 0.01, "loss": 2.0142, "step": 17835 }, { "epoch": 1.8306650246305418, "grad_norm": 0.08395816385746002, "learning_rate": 0.01, "loss": 2.0388, "step": 17838 }, { "epoch": 1.8309729064039408, "grad_norm": 0.07087874412536621, "learning_rate": 0.01, "loss": 2.0458, "step": 17841 }, { "epoch": 1.8312807881773399, "grad_norm": 0.04754515364766121, "learning_rate": 0.01, "loss": 2.0305, "step": 17844 }, { "epoch": 1.8315886699507389, "grad_norm": 0.042998362332582474, "learning_rate": 0.01, "loss": 2.0334, "step": 17847 }, { "epoch": 1.831896551724138, "grad_norm": 0.044786881655454636, "learning_rate": 0.01, "loss": 2.0545, "step": 17850 }, { "epoch": 1.832204433497537, "grad_norm": 0.05035366117954254, "learning_rate": 0.01, "loss": 2.0346, "step": 17853 }, { "epoch": 1.832512315270936, "grad_norm": 0.08760454505681992, "learning_rate": 0.01, "loss": 2.0407, "step": 17856 }, { "epoch": 1.832820197044335, "grad_norm": 0.07182349264621735, "learning_rate": 0.01, "loss": 2.0617, "step": 17859 }, { "epoch": 1.833128078817734, "grad_norm": 0.0653420239686966, "learning_rate": 0.01, "loss": 2.02, "step": 17862 }, { "epoch": 1.833435960591133, "grad_norm": 0.07664595544338226, "learning_rate": 0.01, "loss": 2.0453, "step": 17865 }, { "epoch": 1.833743842364532, "grad_norm": 0.052884750068187714, "learning_rate": 0.01, "loss": 2.0433, "step": 17868 }, { "epoch": 1.834051724137931, "grad_norm": 0.049432456493377686, "learning_rate": 0.01, "loss": 2.0392, "step": 17871 }, { "epoch": 1.83435960591133, "grad_norm": 0.10208621621131897, "learning_rate": 0.01, "loss": 2.0425, "step": 17874 }, { "epoch": 1.834667487684729, "grad_norm": 0.0663546770811081, "learning_rate": 0.01, "loss": 2.0276, "step": 17877 }, { "epoch": 1.8349753694581281, "grad_norm": 0.0952199399471283, "learning_rate": 0.01, "loss": 2.0273, "step": 17880 }, { "epoch": 1.8352832512315271, "grad_norm": 0.04969238117337227, "learning_rate": 0.01, "loss": 2.0227, "step": 17883 }, { "epoch": 1.8355911330049262, "grad_norm": 0.05101123824715614, "learning_rate": 0.01, "loss": 2.0642, "step": 17886 }, { "epoch": 1.8358990147783252, "grad_norm": 0.1026005819439888, "learning_rate": 0.01, "loss": 2.0118, "step": 17889 }, { "epoch": 1.8362068965517242, "grad_norm": 0.06481184810400009, "learning_rate": 0.01, "loss": 2.0457, "step": 17892 }, { "epoch": 1.8365147783251232, "grad_norm": 0.0684402734041214, "learning_rate": 0.01, "loss": 2.0364, "step": 17895 }, { "epoch": 1.8368226600985222, "grad_norm": 0.1051085963845253, "learning_rate": 0.01, "loss": 2.0178, "step": 17898 }, { "epoch": 1.8371305418719213, "grad_norm": 0.06582857668399811, "learning_rate": 0.01, "loss": 2.0409, "step": 17901 }, { "epoch": 1.8374384236453203, "grad_norm": 0.05665391683578491, "learning_rate": 0.01, "loss": 2.04, "step": 17904 }, { "epoch": 1.8377463054187193, "grad_norm": 0.06239892914891243, "learning_rate": 0.01, "loss": 2.0199, "step": 17907 }, { "epoch": 1.8380541871921183, "grad_norm": 0.08531507849693298, "learning_rate": 0.01, "loss": 2.0429, "step": 17910 }, { "epoch": 1.8383620689655173, "grad_norm": 0.07379250973463058, "learning_rate": 0.01, "loss": 2.0226, "step": 17913 }, { "epoch": 1.8386699507389164, "grad_norm": 0.052789974957704544, "learning_rate": 0.01, "loss": 2.0198, "step": 17916 }, { "epoch": 1.8389778325123154, "grad_norm": 0.09525316208600998, "learning_rate": 0.01, "loss": 2.0423, "step": 17919 }, { "epoch": 1.8392857142857144, "grad_norm": 0.05700648948550224, "learning_rate": 0.01, "loss": 2.0332, "step": 17922 }, { "epoch": 1.8395935960591134, "grad_norm": 0.061519671231508255, "learning_rate": 0.01, "loss": 2.038, "step": 17925 }, { "epoch": 1.8399014778325125, "grad_norm": 0.05594256520271301, "learning_rate": 0.01, "loss": 2.0247, "step": 17928 }, { "epoch": 1.8402093596059115, "grad_norm": 0.06823567301034927, "learning_rate": 0.01, "loss": 2.0319, "step": 17931 }, { "epoch": 1.8405172413793105, "grad_norm": 0.061398666352033615, "learning_rate": 0.01, "loss": 2.038, "step": 17934 }, { "epoch": 1.8408251231527095, "grad_norm": 0.10590513050556183, "learning_rate": 0.01, "loss": 2.0336, "step": 17937 }, { "epoch": 1.8411330049261085, "grad_norm": 0.0579022578895092, "learning_rate": 0.01, "loss": 2.0249, "step": 17940 }, { "epoch": 1.8414408866995073, "grad_norm": 0.07047640532255173, "learning_rate": 0.01, "loss": 2.0147, "step": 17943 }, { "epoch": 1.8417487684729064, "grad_norm": 0.07486578077077866, "learning_rate": 0.01, "loss": 2.0413, "step": 17946 }, { "epoch": 1.8420566502463054, "grad_norm": 0.057884715497493744, "learning_rate": 0.01, "loss": 2.0357, "step": 17949 }, { "epoch": 1.8423645320197044, "grad_norm": 0.10381656140089035, "learning_rate": 0.01, "loss": 2.0382, "step": 17952 }, { "epoch": 1.8426724137931034, "grad_norm": 0.041863467544317245, "learning_rate": 0.01, "loss": 2.0345, "step": 17955 }, { "epoch": 1.8429802955665024, "grad_norm": 0.10012530535459518, "learning_rate": 0.01, "loss": 2.0648, "step": 17958 }, { "epoch": 1.8432881773399015, "grad_norm": 0.05597177520394325, "learning_rate": 0.01, "loss": 2.0513, "step": 17961 }, { "epoch": 1.8435960591133005, "grad_norm": 0.05338521674275398, "learning_rate": 0.01, "loss": 2.0287, "step": 17964 }, { "epoch": 1.8439039408866995, "grad_norm": 0.049141060560941696, "learning_rate": 0.01, "loss": 2.0486, "step": 17967 }, { "epoch": 1.8442118226600985, "grad_norm": 0.0784049779176712, "learning_rate": 0.01, "loss": 2.0176, "step": 17970 }, { "epoch": 1.8445197044334976, "grad_norm": 0.038596317172050476, "learning_rate": 0.01, "loss": 2.0167, "step": 17973 }, { "epoch": 1.8448275862068966, "grad_norm": 0.08521022647619247, "learning_rate": 0.01, "loss": 2.0364, "step": 17976 }, { "epoch": 1.8451354679802956, "grad_norm": 0.05890432372689247, "learning_rate": 0.01, "loss": 2.062, "step": 17979 }, { "epoch": 1.8454433497536946, "grad_norm": 0.09090931713581085, "learning_rate": 0.01, "loss": 2.0514, "step": 17982 }, { "epoch": 1.8457512315270936, "grad_norm": 0.06019595265388489, "learning_rate": 0.01, "loss": 2.0463, "step": 17985 }, { "epoch": 1.8460591133004927, "grad_norm": 0.07712443917989731, "learning_rate": 0.01, "loss": 2.0466, "step": 17988 }, { "epoch": 1.8463669950738915, "grad_norm": 0.06155428662896156, "learning_rate": 0.01, "loss": 2.0224, "step": 17991 }, { "epoch": 1.8466748768472905, "grad_norm": 0.07221681624650955, "learning_rate": 0.01, "loss": 2.0128, "step": 17994 }, { "epoch": 1.8469827586206895, "grad_norm": 0.056776583194732666, "learning_rate": 0.01, "loss": 2.0156, "step": 17997 }, { "epoch": 1.8472906403940885, "grad_norm": 0.12099254876375198, "learning_rate": 0.01, "loss": 2.0522, "step": 18000 } ], "logging_steps": 3, "max_steps": 19488, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.11481979646509e+20, "train_batch_size": 18, "trial_name": null, "trial_params": null }