{ "best_metric": 0.02353788217437277, "best_model_checkpoint": "./results-cc/code-t5/codet5_qlora_official_0.0005/checkpoint-14718", "epoch": 1.0, "eval_steps": 500, "global_step": 14718, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003397200706617747, "grad_norm": 2.2353625297546387, "learning_rate": 0.0004999787674955837, "loss": 9.2314, "step": 5 }, { "epoch": 0.0006794401413235494, "grad_norm": 2.8416965007781982, "learning_rate": 0.0004999575349911673, "loss": 6.5213, "step": 10 }, { "epoch": 0.0010191602119853241, "grad_norm": 2.026658058166504, "learning_rate": 0.0004999363024867509, "loss": 5.5628, "step": 15 }, { "epoch": 0.001358880282647099, "grad_norm": 1.2800122499465942, "learning_rate": 0.0004999150699823346, "loss": 4.4513, "step": 20 }, { "epoch": 0.0016986003533088735, "grad_norm": 0.7811825275421143, "learning_rate": 0.0004998938374779182, "loss": 4.4534, "step": 25 }, { "epoch": 0.0020383204239706482, "grad_norm": 0.9920962452888489, "learning_rate": 0.0004998726049735018, "loss": 4.1556, "step": 30 }, { "epoch": 0.002378040494632423, "grad_norm": 3.779524803161621, "learning_rate": 0.0004998513724690855, "loss": 3.974, "step": 35 }, { "epoch": 0.002717760565294198, "grad_norm": 0.6368366479873657, "learning_rate": 0.0004998301399646691, "loss": 4.1671, "step": 40 }, { "epoch": 0.0030574806359559724, "grad_norm": 0.7747367024421692, "learning_rate": 0.0004998089074602528, "loss": 4.3581, "step": 45 }, { "epoch": 0.003397200706617747, "grad_norm": 0.7359101176261902, "learning_rate": 0.0004997876749558364, "loss": 3.9709, "step": 50 }, { "epoch": 0.0037369207772795215, "grad_norm": 0.9446442127227783, "learning_rate": 0.00049976644245142, "loss": 4.0363, "step": 55 }, { "epoch": 0.0040766408479412965, "grad_norm": 0.971868097782135, "learning_rate": 0.0004997452099470037, "loss": 4.015, "step": 60 }, { "epoch": 0.0044163609186030715, "grad_norm": 0.554659366607666, "learning_rate": 0.0004997239774425872, "loss": 4.0286, "step": 65 }, { "epoch": 0.004756080989264846, "grad_norm": 0.5678468346595764, "learning_rate": 0.000499702744938171, "loss": 4.2355, "step": 70 }, { "epoch": 0.005095801059926621, "grad_norm": 0.6306894421577454, "learning_rate": 0.0004996815124337546, "loss": 4.107, "step": 75 }, { "epoch": 0.005435521130588396, "grad_norm": 0.6269946098327637, "learning_rate": 0.0004996602799293383, "loss": 3.876, "step": 80 }, { "epoch": 0.00577524120125017, "grad_norm": 0.9158003926277161, "learning_rate": 0.0004996390474249219, "loss": 3.8671, "step": 85 }, { "epoch": 0.006114961271911945, "grad_norm": 0.6720385551452637, "learning_rate": 0.0004996178149205055, "loss": 4.0011, "step": 90 }, { "epoch": 0.006454681342573719, "grad_norm": 0.5876797437667847, "learning_rate": 0.0004995965824160891, "loss": 4.2092, "step": 95 }, { "epoch": 0.006794401413235494, "grad_norm": 13.15256118774414, "learning_rate": 0.0004995753499116728, "loss": 4.0583, "step": 100 }, { "epoch": 0.007134121483897269, "grad_norm": 0.5245499014854431, "learning_rate": 0.0004995541174072565, "loss": 3.6267, "step": 105 }, { "epoch": 0.007473841554559043, "grad_norm": 0.5815557837486267, "learning_rate": 0.00049953288490284, "loss": 4.0553, "step": 110 }, { "epoch": 0.007813561625220818, "grad_norm": 0.5052971839904785, "learning_rate": 0.0004995116523984237, "loss": 4.0417, "step": 115 }, { "epoch": 0.008153281695882593, "grad_norm": 0.7701963782310486, "learning_rate": 0.0004994904198940074, "loss": 4.2447, "step": 120 }, { "epoch": 0.008493001766544368, "grad_norm": 0.6031837463378906, "learning_rate": 0.0004994691873895909, "loss": 4.015, "step": 125 }, { "epoch": 0.008832721837206143, "grad_norm": 0.7434197664260864, "learning_rate": 0.0004994479548851746, "loss": 3.8354, "step": 130 }, { "epoch": 0.009172441907867916, "grad_norm": 0.761620044708252, "learning_rate": 0.0004994267223807583, "loss": 3.8388, "step": 135 }, { "epoch": 0.009512161978529691, "grad_norm": 0.7527087330818176, "learning_rate": 0.0004994054898763419, "loss": 3.8212, "step": 140 }, { "epoch": 0.009851882049191466, "grad_norm": 0.6151653528213501, "learning_rate": 0.0004993842573719256, "loss": 3.8239, "step": 145 }, { "epoch": 0.010191602119853241, "grad_norm": 0.7286589741706848, "learning_rate": 0.0004993630248675092, "loss": 4.0361, "step": 150 }, { "epoch": 0.010531322190515016, "grad_norm": 0.7502281069755554, "learning_rate": 0.0004993417923630928, "loss": 3.8669, "step": 155 }, { "epoch": 0.010871042261176791, "grad_norm": 0.701413094997406, "learning_rate": 0.0004993205598586765, "loss": 3.96, "step": 160 }, { "epoch": 0.011210762331838564, "grad_norm": 0.7256927490234375, "learning_rate": 0.00049929932735426, "loss": 3.9702, "step": 165 }, { "epoch": 0.01155048240250034, "grad_norm": 1.1132928133010864, "learning_rate": 0.0004992780948498437, "loss": 3.8704, "step": 170 }, { "epoch": 0.011890202473162114, "grad_norm": 0.674567461013794, "learning_rate": 0.0004992568623454274, "loss": 3.8778, "step": 175 }, { "epoch": 0.01222992254382389, "grad_norm": 0.8263887763023376, "learning_rate": 0.000499235629841011, "loss": 3.9284, "step": 180 }, { "epoch": 0.012569642614485664, "grad_norm": 3.578502655029297, "learning_rate": 0.0004992143973365947, "loss": 3.9699, "step": 185 }, { "epoch": 0.012909362685147438, "grad_norm": 0.6257832050323486, "learning_rate": 0.0004991931648321783, "loss": 3.8505, "step": 190 }, { "epoch": 0.013249082755809213, "grad_norm": 0.7396682500839233, "learning_rate": 0.0004991719323277619, "loss": 4.1419, "step": 195 }, { "epoch": 0.013588802826470988, "grad_norm": 0.6713815927505493, "learning_rate": 0.0004991506998233456, "loss": 3.9628, "step": 200 }, { "epoch": 0.013928522897132763, "grad_norm": 0.6354101300239563, "learning_rate": 0.0004991294673189292, "loss": 4.0602, "step": 205 }, { "epoch": 0.014268242967794538, "grad_norm": 0.791304349899292, "learning_rate": 0.0004991082348145128, "loss": 3.7822, "step": 210 }, { "epoch": 0.014607963038456313, "grad_norm": 0.776104211807251, "learning_rate": 0.0004990870023100965, "loss": 3.8612, "step": 215 }, { "epoch": 0.014947683109118086, "grad_norm": 0.5513173937797546, "learning_rate": 0.0004990657698056802, "loss": 3.998, "step": 220 }, { "epoch": 0.015287403179779861, "grad_norm": 0.7635427117347717, "learning_rate": 0.0004990445373012637, "loss": 3.86, "step": 225 }, { "epoch": 0.015627123250441636, "grad_norm": 0.5227985382080078, "learning_rate": 0.0004990233047968474, "loss": 3.8407, "step": 230 }, { "epoch": 0.01596684332110341, "grad_norm": 0.5980375409126282, "learning_rate": 0.0004990020722924311, "loss": 3.8813, "step": 235 }, { "epoch": 0.016306563391765186, "grad_norm": 0.8161038160324097, "learning_rate": 0.0004989808397880146, "loss": 3.811, "step": 240 }, { "epoch": 0.01664628346242696, "grad_norm": 0.6680299043655396, "learning_rate": 0.0004989596072835984, "loss": 3.8233, "step": 245 }, { "epoch": 0.016986003533088736, "grad_norm": 0.5727167129516602, "learning_rate": 0.000498938374779182, "loss": 3.9787, "step": 250 }, { "epoch": 0.01732572360375051, "grad_norm": 0.7090482711791992, "learning_rate": 0.0004989171422747656, "loss": 3.842, "step": 255 }, { "epoch": 0.017665443674412286, "grad_norm": 0.8892368674278259, "learning_rate": 0.0004988959097703493, "loss": 3.9685, "step": 260 }, { "epoch": 0.01800516374507406, "grad_norm": 0.7880306839942932, "learning_rate": 0.0004988746772659329, "loss": 4.0332, "step": 265 }, { "epoch": 0.018344883815735832, "grad_norm": 2.765075445175171, "learning_rate": 0.0004988534447615165, "loss": 3.8118, "step": 270 }, { "epoch": 0.01868460388639761, "grad_norm": 0.9651147127151489, "learning_rate": 0.0004988322122571002, "loss": 3.7532, "step": 275 }, { "epoch": 0.019024323957059382, "grad_norm": 0.5852766036987305, "learning_rate": 0.0004988109797526839, "loss": 3.8462, "step": 280 }, { "epoch": 0.01936404402772116, "grad_norm": 0.5143536329269409, "learning_rate": 0.0004987897472482674, "loss": 3.909, "step": 285 }, { "epoch": 0.019703764098382932, "grad_norm": 0.8465588688850403, "learning_rate": 0.0004987685147438511, "loss": 3.921, "step": 290 }, { "epoch": 0.020043484169044706, "grad_norm": 0.5650240182876587, "learning_rate": 0.0004987472822394347, "loss": 3.8594, "step": 295 }, { "epoch": 0.020383204239706482, "grad_norm": 0.6958757042884827, "learning_rate": 0.0004987260497350183, "loss": 3.6721, "step": 300 }, { "epoch": 0.020722924310368256, "grad_norm": 0.833765983581543, "learning_rate": 0.000498704817230602, "loss": 3.8761, "step": 305 }, { "epoch": 0.021062644381030032, "grad_norm": 0.6201319098472595, "learning_rate": 0.0004986835847261856, "loss": 3.6919, "step": 310 }, { "epoch": 0.021402364451691806, "grad_norm": 0.8628581762313843, "learning_rate": 0.0004986623522217693, "loss": 4.0967, "step": 315 }, { "epoch": 0.021742084522353582, "grad_norm": 0.764646589756012, "learning_rate": 0.000498641119717353, "loss": 3.9165, "step": 320 }, { "epoch": 0.022081804593015356, "grad_norm": 1.6150736808776855, "learning_rate": 0.0004986198872129365, "loss": 3.6974, "step": 325 }, { "epoch": 0.02242152466367713, "grad_norm": 0.5592671632766724, "learning_rate": 0.0004985986547085202, "loss": 3.5593, "step": 330 }, { "epoch": 0.022761244734338906, "grad_norm": 0.5588498711585999, "learning_rate": 0.0004985774222041039, "loss": 4.03, "step": 335 }, { "epoch": 0.02310096480500068, "grad_norm": 0.7053913474082947, "learning_rate": 0.0004985561896996874, "loss": 3.9243, "step": 340 }, { "epoch": 0.023440684875662456, "grad_norm": 0.8493243455886841, "learning_rate": 0.0004985349571952711, "loss": 3.8934, "step": 345 }, { "epoch": 0.02378040494632423, "grad_norm": 1.0158051252365112, "learning_rate": 0.0004985137246908548, "loss": 3.6815, "step": 350 }, { "epoch": 0.024120125016986002, "grad_norm": 0.7580332159996033, "learning_rate": 0.0004984924921864384, "loss": 4.1274, "step": 355 }, { "epoch": 0.02445984508764778, "grad_norm": 0.6487094759941101, "learning_rate": 0.000498471259682022, "loss": 3.9643, "step": 360 }, { "epoch": 0.024799565158309552, "grad_norm": 1.426354169845581, "learning_rate": 0.0004984500271776057, "loss": 3.8503, "step": 365 }, { "epoch": 0.02513928522897133, "grad_norm": 0.651797890663147, "learning_rate": 0.0004984287946731893, "loss": 4.0105, "step": 370 }, { "epoch": 0.025479005299633102, "grad_norm": 0.7262148857116699, "learning_rate": 0.0004984075621687729, "loss": 3.7927, "step": 375 }, { "epoch": 0.025818725370294875, "grad_norm": 3.5581183433532715, "learning_rate": 0.0004983863296643565, "loss": 4.0209, "step": 380 }, { "epoch": 0.026158445440956652, "grad_norm": 0.7328428626060486, "learning_rate": 0.0004983650971599402, "loss": 3.9751, "step": 385 }, { "epoch": 0.026498165511618425, "grad_norm": 0.5681852698326111, "learning_rate": 0.0004983438646555239, "loss": 3.7243, "step": 390 }, { "epoch": 0.026837885582280202, "grad_norm": 0.6385235786437988, "learning_rate": 0.0004983226321511075, "loss": 3.4724, "step": 395 }, { "epoch": 0.027177605652941975, "grad_norm": 0.630611002445221, "learning_rate": 0.0004983013996466911, "loss": 3.7743, "step": 400 }, { "epoch": 0.027517325723603752, "grad_norm": 0.9358086585998535, "learning_rate": 0.0004982801671422748, "loss": 3.5933, "step": 405 }, { "epoch": 0.027857045794265525, "grad_norm": 6.729480266571045, "learning_rate": 0.0004982589346378584, "loss": 3.9168, "step": 410 }, { "epoch": 0.0281967658649273, "grad_norm": 0.9895272254943848, "learning_rate": 0.000498237702133442, "loss": 4.046, "step": 415 }, { "epoch": 0.028536485935589075, "grad_norm": 0.9009426236152649, "learning_rate": 0.0004982164696290257, "loss": 3.7989, "step": 420 }, { "epoch": 0.02887620600625085, "grad_norm": 0.6883906126022339, "learning_rate": 0.0004981952371246093, "loss": 4.0833, "step": 425 }, { "epoch": 0.029215926076912625, "grad_norm": 0.7698491811752319, "learning_rate": 0.000498174004620193, "loss": 3.8872, "step": 430 }, { "epoch": 0.0295556461475744, "grad_norm": 0.6823627948760986, "learning_rate": 0.0004981527721157766, "loss": 3.9594, "step": 435 }, { "epoch": 0.029895366218236172, "grad_norm": 0.7193058133125305, "learning_rate": 0.0004981315396113602, "loss": 3.8547, "step": 440 }, { "epoch": 0.03023508628889795, "grad_norm": 1.7814664840698242, "learning_rate": 0.0004981103071069439, "loss": 3.7917, "step": 445 }, { "epoch": 0.030574806359559722, "grad_norm": 0.7801713943481445, "learning_rate": 0.0004980890746025275, "loss": 3.5918, "step": 450 }, { "epoch": 0.0309145264302215, "grad_norm": 0.8810610175132751, "learning_rate": 0.0004980678420981112, "loss": 3.8365, "step": 455 }, { "epoch": 0.03125424650088327, "grad_norm": 0.6745365262031555, "learning_rate": 0.0004980466095936948, "loss": 4.2267, "step": 460 }, { "epoch": 0.03159396657154505, "grad_norm": 0.8766850233078003, "learning_rate": 0.0004980253770892785, "loss": 3.856, "step": 465 }, { "epoch": 0.03193368664220682, "grad_norm": 0.6430597901344299, "learning_rate": 0.0004980041445848621, "loss": 3.8495, "step": 470 }, { "epoch": 0.032273406712868595, "grad_norm": 0.65704745054245, "learning_rate": 0.0004979829120804457, "loss": 3.9977, "step": 475 }, { "epoch": 0.03261312678353037, "grad_norm": 0.7645652890205383, "learning_rate": 0.0004979616795760293, "loss": 3.6759, "step": 480 }, { "epoch": 0.03295284685419215, "grad_norm": 0.908518373966217, "learning_rate": 0.000497940447071613, "loss": 3.6175, "step": 485 }, { "epoch": 0.03329256692485392, "grad_norm": 0.7490617632865906, "learning_rate": 0.0004979192145671967, "loss": 3.7913, "step": 490 }, { "epoch": 0.033632286995515695, "grad_norm": 0.8596498370170593, "learning_rate": 0.0004978979820627802, "loss": 3.9233, "step": 495 }, { "epoch": 0.03397200706617747, "grad_norm": 0.6238325238227844, "learning_rate": 0.0004978767495583639, "loss": 3.7352, "step": 500 }, { "epoch": 0.03431172713683924, "grad_norm": 2.2616941928863525, "learning_rate": 0.0004978555170539476, "loss": 3.8424, "step": 505 }, { "epoch": 0.03465144720750102, "grad_norm": 0.7381628751754761, "learning_rate": 0.0004978342845495311, "loss": 3.8248, "step": 510 }, { "epoch": 0.034991167278162795, "grad_norm": 0.777956485748291, "learning_rate": 0.0004978130520451148, "loss": 3.7334, "step": 515 }, { "epoch": 0.03533088734882457, "grad_norm": 1.6913574934005737, "learning_rate": 0.0004977918195406985, "loss": 3.9404, "step": 520 }, { "epoch": 0.03567060741948634, "grad_norm": 0.541786789894104, "learning_rate": 0.0004977705870362821, "loss": 3.7141, "step": 525 }, { "epoch": 0.03601032749014812, "grad_norm": 0.7132254242897034, "learning_rate": 0.0004977493545318658, "loss": 3.5903, "step": 530 }, { "epoch": 0.036350047560809895, "grad_norm": 0.7625871896743774, "learning_rate": 0.0004977281220274494, "loss": 3.825, "step": 535 }, { "epoch": 0.036689767631471665, "grad_norm": 1.4251145124435425, "learning_rate": 0.000497706889523033, "loss": 3.8064, "step": 540 }, { "epoch": 0.03702948770213344, "grad_norm": 0.8219479322433472, "learning_rate": 0.0004976856570186167, "loss": 3.7593, "step": 545 }, { "epoch": 0.03736920777279522, "grad_norm": 0.6071698665618896, "learning_rate": 0.0004976644245142003, "loss": 4.1131, "step": 550 }, { "epoch": 0.03770892784345699, "grad_norm": 0.7802040576934814, "learning_rate": 0.000497643192009784, "loss": 3.8296, "step": 555 }, { "epoch": 0.038048647914118765, "grad_norm": 1.3066742420196533, "learning_rate": 0.0004976219595053676, "loss": 3.8682, "step": 560 }, { "epoch": 0.03838836798478054, "grad_norm": 0.7376226782798767, "learning_rate": 0.0004976007270009513, "loss": 3.8694, "step": 565 }, { "epoch": 0.03872808805544232, "grad_norm": 0.68521648645401, "learning_rate": 0.0004975794944965349, "loss": 4.0428, "step": 570 }, { "epoch": 0.03906780812610409, "grad_norm": 1.752078652381897, "learning_rate": 0.0004975582619921185, "loss": 3.9359, "step": 575 }, { "epoch": 0.039407528196765865, "grad_norm": 1.1276261806488037, "learning_rate": 0.0004975370294877021, "loss": 3.7383, "step": 580 }, { "epoch": 0.03974724826742764, "grad_norm": 0.8473337292671204, "learning_rate": 0.0004975157969832858, "loss": 3.7056, "step": 585 }, { "epoch": 0.04008696833808941, "grad_norm": 0.7733874320983887, "learning_rate": 0.0004974945644788695, "loss": 3.622, "step": 590 }, { "epoch": 0.04042668840875119, "grad_norm": 0.6503618359565735, "learning_rate": 0.000497473331974453, "loss": 3.6731, "step": 595 }, { "epoch": 0.040766408479412965, "grad_norm": 0.7035828232765198, "learning_rate": 0.0004974520994700367, "loss": 3.8401, "step": 600 }, { "epoch": 0.04110612855007474, "grad_norm": 0.7617110013961792, "learning_rate": 0.0004974308669656204, "loss": 3.7335, "step": 605 }, { "epoch": 0.04144584862073651, "grad_norm": 0.6059450507164001, "learning_rate": 0.0004974096344612039, "loss": 3.8843, "step": 610 }, { "epoch": 0.04178556869139829, "grad_norm": 0.7100998163223267, "learning_rate": 0.0004973884019567876, "loss": 3.7792, "step": 615 }, { "epoch": 0.042125288762060065, "grad_norm": 0.6805708408355713, "learning_rate": 0.0004973671694523713, "loss": 3.6027, "step": 620 }, { "epoch": 0.042465008832721834, "grad_norm": 0.5260924100875854, "learning_rate": 0.0004973459369479549, "loss": 3.372, "step": 625 }, { "epoch": 0.04280472890338361, "grad_norm": 0.6643473505973816, "learning_rate": 0.0004973247044435386, "loss": 3.5743, "step": 630 }, { "epoch": 0.04314444897404539, "grad_norm": 0.6520421504974365, "learning_rate": 0.0004973034719391222, "loss": 3.7033, "step": 635 }, { "epoch": 0.043484169044707165, "grad_norm": 0.6301407217979431, "learning_rate": 0.0004972822394347058, "loss": 3.6671, "step": 640 }, { "epoch": 0.043823889115368934, "grad_norm": 0.6322331428527832, "learning_rate": 0.0004972610069302895, "loss": 4.0283, "step": 645 }, { "epoch": 0.04416360918603071, "grad_norm": 0.5669398903846741, "learning_rate": 0.000497239774425873, "loss": 3.9287, "step": 650 }, { "epoch": 0.04450332925669249, "grad_norm": 0.5742112994194031, "learning_rate": 0.0004972185419214567, "loss": 3.7538, "step": 655 }, { "epoch": 0.04484304932735426, "grad_norm": 0.7042970657348633, "learning_rate": 0.0004971973094170404, "loss": 3.6544, "step": 660 }, { "epoch": 0.045182769398016034, "grad_norm": 0.6912941336631775, "learning_rate": 0.0004971760769126241, "loss": 3.6851, "step": 665 }, { "epoch": 0.04552248946867781, "grad_norm": 5.583874702453613, "learning_rate": 0.0004971548444082076, "loss": 3.9047, "step": 670 }, { "epoch": 0.04586220953933958, "grad_norm": 0.749489426612854, "learning_rate": 0.0004971336119037913, "loss": 3.6825, "step": 675 }, { "epoch": 0.04620192961000136, "grad_norm": 0.7657076120376587, "learning_rate": 0.000497112379399375, "loss": 3.7969, "step": 680 }, { "epoch": 0.046541649680663134, "grad_norm": 10.631937026977539, "learning_rate": 0.0004970911468949585, "loss": 3.884, "step": 685 }, { "epoch": 0.04688136975132491, "grad_norm": 0.6928240656852722, "learning_rate": 0.0004970699143905422, "loss": 3.8932, "step": 690 }, { "epoch": 0.04722108982198668, "grad_norm": 0.630998432636261, "learning_rate": 0.0004970486818861258, "loss": 3.8042, "step": 695 }, { "epoch": 0.04756080989264846, "grad_norm": 0.5372489094734192, "learning_rate": 0.0004970274493817095, "loss": 3.9514, "step": 700 }, { "epoch": 0.047900529963310234, "grad_norm": 0.7061190009117126, "learning_rate": 0.0004970062168772932, "loss": 3.8645, "step": 705 }, { "epoch": 0.048240250033972004, "grad_norm": 0.6694191098213196, "learning_rate": 0.0004969849843728767, "loss": 3.8167, "step": 710 }, { "epoch": 0.04857997010463378, "grad_norm": 0.5248123407363892, "learning_rate": 0.0004969637518684604, "loss": 3.6327, "step": 715 }, { "epoch": 0.04891969017529556, "grad_norm": 0.5669602155685425, "learning_rate": 0.0004969425193640441, "loss": 3.8064, "step": 720 }, { "epoch": 0.049259410245957334, "grad_norm": 0.704684853553772, "learning_rate": 0.0004969212868596276, "loss": 3.6741, "step": 725 }, { "epoch": 0.049599130316619104, "grad_norm": 0.7057446241378784, "learning_rate": 0.0004969000543552113, "loss": 3.7326, "step": 730 }, { "epoch": 0.04993885038728088, "grad_norm": 0.7873995304107666, "learning_rate": 0.000496878821850795, "loss": 3.6522, "step": 735 }, { "epoch": 0.05027857045794266, "grad_norm": 0.7866527438163757, "learning_rate": 0.0004968575893463786, "loss": 3.9666, "step": 740 }, { "epoch": 0.05061829052860443, "grad_norm": 0.6742914319038391, "learning_rate": 0.0004968363568419622, "loss": 3.5888, "step": 745 }, { "epoch": 0.050958010599266204, "grad_norm": 0.7856414914131165, "learning_rate": 0.0004968151243375459, "loss": 3.8428, "step": 750 }, { "epoch": 0.05129773066992798, "grad_norm": 0.7090799808502197, "learning_rate": 0.0004967938918331295, "loss": 3.8336, "step": 755 }, { "epoch": 0.05163745074058975, "grad_norm": 0.7570465207099915, "learning_rate": 0.0004967726593287131, "loss": 3.9292, "step": 760 }, { "epoch": 0.05197717081125153, "grad_norm": 0.7134966850280762, "learning_rate": 0.0004967514268242969, "loss": 3.6571, "step": 765 }, { "epoch": 0.052316890881913304, "grad_norm": 0.7797589898109436, "learning_rate": 0.0004967301943198804, "loss": 3.7122, "step": 770 }, { "epoch": 0.05265661095257508, "grad_norm": 0.6913097500801086, "learning_rate": 0.0004967089618154641, "loss": 3.8454, "step": 775 }, { "epoch": 0.05299633102323685, "grad_norm": 0.8691904544830322, "learning_rate": 0.0004966877293110477, "loss": 3.8969, "step": 780 }, { "epoch": 0.05333605109389863, "grad_norm": 2.292844295501709, "learning_rate": 0.0004966664968066313, "loss": 3.6637, "step": 785 }, { "epoch": 0.053675771164560404, "grad_norm": 0.6494849324226379, "learning_rate": 0.000496645264302215, "loss": 3.8422, "step": 790 }, { "epoch": 0.054015491235222174, "grad_norm": 0.7740452885627747, "learning_rate": 0.0004966240317977986, "loss": 3.6909, "step": 795 }, { "epoch": 0.05435521130588395, "grad_norm": 0.7435924410820007, "learning_rate": 0.0004966027992933823, "loss": 3.9746, "step": 800 }, { "epoch": 0.05469493137654573, "grad_norm": 0.6132620573043823, "learning_rate": 0.0004965815667889659, "loss": 4.0076, "step": 805 }, { "epoch": 0.055034651447207504, "grad_norm": 0.5930618047714233, "learning_rate": 0.0004965603342845495, "loss": 3.9272, "step": 810 }, { "epoch": 0.055374371517869274, "grad_norm": 0.9302070140838623, "learning_rate": 0.0004965391017801332, "loss": 3.8387, "step": 815 }, { "epoch": 0.05571409158853105, "grad_norm": 0.8607485890388489, "learning_rate": 0.0004965178692757168, "loss": 3.7476, "step": 820 }, { "epoch": 0.05605381165919283, "grad_norm": 0.7967957854270935, "learning_rate": 0.0004964966367713004, "loss": 3.659, "step": 825 }, { "epoch": 0.0563935317298546, "grad_norm": 0.6653069257736206, "learning_rate": 0.0004964754042668841, "loss": 3.6523, "step": 830 }, { "epoch": 0.056733251800516374, "grad_norm": 0.9697920083999634, "learning_rate": 0.0004964541717624678, "loss": 3.7826, "step": 835 }, { "epoch": 0.05707297187117815, "grad_norm": 0.6360482573509216, "learning_rate": 0.0004964329392580514, "loss": 4.0557, "step": 840 }, { "epoch": 0.05741269194183993, "grad_norm": 0.7159570455551147, "learning_rate": 0.000496411706753635, "loss": 3.9723, "step": 845 }, { "epoch": 0.0577524120125017, "grad_norm": 0.7310690879821777, "learning_rate": 0.0004963904742492187, "loss": 3.844, "step": 850 }, { "epoch": 0.058092132083163474, "grad_norm": 0.6378340125083923, "learning_rate": 0.0004963692417448023, "loss": 3.6883, "step": 855 }, { "epoch": 0.05843185215382525, "grad_norm": 0.6603284478187561, "learning_rate": 0.0004963480092403859, "loss": 3.4889, "step": 860 }, { "epoch": 0.05877157222448702, "grad_norm": 0.7654943466186523, "learning_rate": 0.0004963267767359695, "loss": 3.8108, "step": 865 }, { "epoch": 0.0591112922951488, "grad_norm": 0.7190312147140503, "learning_rate": 0.0004963055442315532, "loss": 3.8286, "step": 870 }, { "epoch": 0.059451012365810574, "grad_norm": 0.6845301985740662, "learning_rate": 0.0004962843117271369, "loss": 3.5416, "step": 875 }, { "epoch": 0.059790732436472344, "grad_norm": 0.7921355962753296, "learning_rate": 0.0004962630792227205, "loss": 3.7017, "step": 880 }, { "epoch": 0.06013045250713412, "grad_norm": 0.7875977754592896, "learning_rate": 0.0004962418467183041, "loss": 3.7386, "step": 885 }, { "epoch": 0.0604701725777959, "grad_norm": 1.0727277994155884, "learning_rate": 0.0004962206142138878, "loss": 3.9105, "step": 890 }, { "epoch": 0.060809892648457674, "grad_norm": 0.6379724144935608, "learning_rate": 0.0004961993817094714, "loss": 3.8322, "step": 895 }, { "epoch": 0.061149612719119444, "grad_norm": 1.005824089050293, "learning_rate": 0.000496178149205055, "loss": 3.8712, "step": 900 }, { "epoch": 0.06148933278978122, "grad_norm": 0.7985267639160156, "learning_rate": 0.0004961569167006387, "loss": 3.7004, "step": 905 }, { "epoch": 0.061829052860443, "grad_norm": 0.7272144556045532, "learning_rate": 0.0004961356841962223, "loss": 3.6402, "step": 910 }, { "epoch": 0.06216877293110477, "grad_norm": 0.796370267868042, "learning_rate": 0.000496114451691806, "loss": 3.9339, "step": 915 }, { "epoch": 0.06250849300176654, "grad_norm": 0.6934226155281067, "learning_rate": 0.0004960932191873896, "loss": 3.7635, "step": 920 }, { "epoch": 0.06284821307242831, "grad_norm": 0.762178897857666, "learning_rate": 0.0004960719866829732, "loss": 3.8155, "step": 925 }, { "epoch": 0.0631879331430901, "grad_norm": 0.6808033585548401, "learning_rate": 0.0004960507541785569, "loss": 3.8366, "step": 930 }, { "epoch": 0.06352765321375187, "grad_norm": 0.6125309467315674, "learning_rate": 0.0004960295216741405, "loss": 3.9713, "step": 935 }, { "epoch": 0.06386737328441364, "grad_norm": 0.5628134608268738, "learning_rate": 0.0004960082891697242, "loss": 3.5922, "step": 940 }, { "epoch": 0.06420709335507542, "grad_norm": 0.663544237613678, "learning_rate": 0.0004959870566653078, "loss": 3.6954, "step": 945 }, { "epoch": 0.06454681342573719, "grad_norm": 0.6304472088813782, "learning_rate": 0.0004959658241608915, "loss": 3.7937, "step": 950 }, { "epoch": 0.06488653349639897, "grad_norm": 0.7555712461471558, "learning_rate": 0.0004959445916564751, "loss": 3.7296, "step": 955 }, { "epoch": 0.06522625356706074, "grad_norm": 0.7580234408378601, "learning_rate": 0.0004959233591520587, "loss": 3.8804, "step": 960 }, { "epoch": 0.06556597363772251, "grad_norm": 0.6481313109397888, "learning_rate": 0.0004959021266476423, "loss": 3.7493, "step": 965 }, { "epoch": 0.0659056937083843, "grad_norm": 1.0130565166473389, "learning_rate": 0.000495880894143226, "loss": 4.031, "step": 970 }, { "epoch": 0.06624541377904607, "grad_norm": 0.6825687885284424, "learning_rate": 0.0004958596616388097, "loss": 3.9392, "step": 975 }, { "epoch": 0.06658513384970784, "grad_norm": 0.624462902545929, "learning_rate": 0.0004958384291343932, "loss": 3.7824, "step": 980 }, { "epoch": 0.06692485392036962, "grad_norm": 0.7183303236961365, "learning_rate": 0.0004958171966299769, "loss": 3.8403, "step": 985 }, { "epoch": 0.06726457399103139, "grad_norm": 0.5774573683738708, "learning_rate": 0.0004957959641255606, "loss": 3.814, "step": 990 }, { "epoch": 0.06760429406169316, "grad_norm": 0.5939727425575256, "learning_rate": 0.0004957747316211441, "loss": 3.6691, "step": 995 }, { "epoch": 0.06794401413235494, "grad_norm": 0.8072649240493774, "learning_rate": 0.0004957534991167278, "loss": 3.7187, "step": 1000 }, { "epoch": 0.06828373420301671, "grad_norm": 0.8438680171966553, "learning_rate": 0.0004957322666123115, "loss": 3.7391, "step": 1005 }, { "epoch": 0.06862345427367848, "grad_norm": 0.6754587888717651, "learning_rate": 0.0004957110341078951, "loss": 3.5287, "step": 1010 }, { "epoch": 0.06896317434434027, "grad_norm": 0.7326998114585876, "learning_rate": 0.0004956898016034788, "loss": 3.8609, "step": 1015 }, { "epoch": 0.06930289441500204, "grad_norm": 0.6357223987579346, "learning_rate": 0.0004956685690990624, "loss": 3.6868, "step": 1020 }, { "epoch": 0.0696426144856638, "grad_norm": 0.6292480826377869, "learning_rate": 0.000495647336594646, "loss": 3.4447, "step": 1025 }, { "epoch": 0.06998233455632559, "grad_norm": 0.6179589629173279, "learning_rate": 0.0004956261040902297, "loss": 3.9628, "step": 1030 }, { "epoch": 0.07032205462698736, "grad_norm": 1.4641858339309692, "learning_rate": 0.0004956048715858133, "loss": 3.688, "step": 1035 }, { "epoch": 0.07066177469764914, "grad_norm": 0.9063203930854797, "learning_rate": 0.0004955836390813969, "loss": 3.7617, "step": 1040 }, { "epoch": 0.07100149476831091, "grad_norm": 0.8638570308685303, "learning_rate": 0.0004955624065769806, "loss": 3.7329, "step": 1045 }, { "epoch": 0.07134121483897268, "grad_norm": 0.704713761806488, "learning_rate": 0.0004955411740725643, "loss": 4.0072, "step": 1050 }, { "epoch": 0.07168093490963447, "grad_norm": 0.8378854393959045, "learning_rate": 0.0004955199415681478, "loss": 3.502, "step": 1055 }, { "epoch": 0.07202065498029624, "grad_norm": 0.6968774199485779, "learning_rate": 0.0004954987090637315, "loss": 3.5703, "step": 1060 }, { "epoch": 0.072360375050958, "grad_norm": 0.6384235620498657, "learning_rate": 0.0004954774765593151, "loss": 3.5478, "step": 1065 }, { "epoch": 0.07270009512161979, "grad_norm": 7.466708660125732, "learning_rate": 0.0004954562440548987, "loss": 3.9254, "step": 1070 }, { "epoch": 0.07303981519228156, "grad_norm": 1.0240126848220825, "learning_rate": 0.0004954350115504824, "loss": 3.5888, "step": 1075 }, { "epoch": 0.07337953526294333, "grad_norm": 0.8846316933631897, "learning_rate": 0.000495413779046066, "loss": 3.9182, "step": 1080 }, { "epoch": 0.07371925533360511, "grad_norm": 0.6337124109268188, "learning_rate": 0.0004953925465416497, "loss": 3.635, "step": 1085 }, { "epoch": 0.07405897540426688, "grad_norm": 0.7934439182281494, "learning_rate": 0.0004953713140372334, "loss": 3.768, "step": 1090 }, { "epoch": 0.07439869547492865, "grad_norm": 0.729831874370575, "learning_rate": 0.0004953500815328169, "loss": 3.8689, "step": 1095 }, { "epoch": 0.07473841554559044, "grad_norm": 0.6589436531066895, "learning_rate": 0.0004953288490284006, "loss": 3.3484, "step": 1100 }, { "epoch": 0.0750781356162522, "grad_norm": 0.6940023303031921, "learning_rate": 0.0004953076165239843, "loss": 3.9618, "step": 1105 }, { "epoch": 0.07541785568691398, "grad_norm": 0.7714236974716187, "learning_rate": 0.0004952863840195678, "loss": 3.686, "step": 1110 }, { "epoch": 0.07575757575757576, "grad_norm": 0.6663824319839478, "learning_rate": 0.0004952651515151515, "loss": 3.8182, "step": 1115 }, { "epoch": 0.07609729582823753, "grad_norm": 0.5791345834732056, "learning_rate": 0.0004952439190107352, "loss": 3.7071, "step": 1120 }, { "epoch": 0.07643701589889931, "grad_norm": 0.5998448729515076, "learning_rate": 0.0004952226865063188, "loss": 3.8312, "step": 1125 }, { "epoch": 0.07677673596956108, "grad_norm": 0.8122233748435974, "learning_rate": 0.0004952014540019024, "loss": 3.6049, "step": 1130 }, { "epoch": 0.07711645604022285, "grad_norm": 0.8393744826316833, "learning_rate": 0.0004951802214974861, "loss": 3.7622, "step": 1135 }, { "epoch": 0.07745617611088464, "grad_norm": 0.8642823100090027, "learning_rate": 0.0004951589889930697, "loss": 3.4399, "step": 1140 }, { "epoch": 0.0777958961815464, "grad_norm": 0.9607694149017334, "learning_rate": 0.0004951377564886533, "loss": 3.7056, "step": 1145 }, { "epoch": 0.07813561625220818, "grad_norm": 0.8687605857849121, "learning_rate": 0.0004951165239842371, "loss": 3.6516, "step": 1150 }, { "epoch": 0.07847533632286996, "grad_norm": 1.4810094833374023, "learning_rate": 0.0004950952914798206, "loss": 3.7821, "step": 1155 }, { "epoch": 0.07881505639353173, "grad_norm": 0.6449587941169739, "learning_rate": 0.0004950740589754043, "loss": 3.5837, "step": 1160 }, { "epoch": 0.0791547764641935, "grad_norm": 0.8072591423988342, "learning_rate": 0.000495052826470988, "loss": 3.833, "step": 1165 }, { "epoch": 0.07949449653485528, "grad_norm": 0.6176637411117554, "learning_rate": 0.0004950315939665715, "loss": 3.8064, "step": 1170 }, { "epoch": 0.07983421660551705, "grad_norm": 0.7759962677955627, "learning_rate": 0.0004950103614621552, "loss": 3.5918, "step": 1175 }, { "epoch": 0.08017393667617882, "grad_norm": 0.9783099889755249, "learning_rate": 0.0004949891289577388, "loss": 4.0571, "step": 1180 }, { "epoch": 0.0805136567468406, "grad_norm": 0.6491831541061401, "learning_rate": 0.0004949678964533225, "loss": 3.7901, "step": 1185 }, { "epoch": 0.08085337681750238, "grad_norm": 0.6656786799430847, "learning_rate": 0.0004949466639489061, "loss": 3.876, "step": 1190 }, { "epoch": 0.08119309688816416, "grad_norm": 0.6393648982048035, "learning_rate": 0.0004949254314444897, "loss": 3.4593, "step": 1195 }, { "epoch": 0.08153281695882593, "grad_norm": 0.8196907043457031, "learning_rate": 0.0004949041989400734, "loss": 3.8083, "step": 1200 }, { "epoch": 0.0818725370294877, "grad_norm": 1.0592135190963745, "learning_rate": 0.0004948829664356571, "loss": 3.9438, "step": 1205 }, { "epoch": 0.08221225710014948, "grad_norm": 0.6675680875778198, "learning_rate": 0.0004948617339312406, "loss": 3.5887, "step": 1210 }, { "epoch": 0.08255197717081125, "grad_norm": 0.6723314523696899, "learning_rate": 0.0004948405014268243, "loss": 3.652, "step": 1215 }, { "epoch": 0.08289169724147302, "grad_norm": 0.6910496354103088, "learning_rate": 0.000494819268922408, "loss": 3.6221, "step": 1220 }, { "epoch": 0.0832314173121348, "grad_norm": 0.6084297895431519, "learning_rate": 0.0004947980364179916, "loss": 3.6253, "step": 1225 }, { "epoch": 0.08357113738279658, "grad_norm": 0.6817795634269714, "learning_rate": 0.0004947768039135752, "loss": 3.8498, "step": 1230 }, { "epoch": 0.08391085745345835, "grad_norm": 0.7761330604553223, "learning_rate": 0.0004947555714091589, "loss": 3.6473, "step": 1235 }, { "epoch": 0.08425057752412013, "grad_norm": 0.7130883932113647, "learning_rate": 0.0004947343389047425, "loss": 3.7136, "step": 1240 }, { "epoch": 0.0845902975947819, "grad_norm": 0.7707271575927734, "learning_rate": 0.0004947131064003261, "loss": 3.447, "step": 1245 }, { "epoch": 0.08493001766544367, "grad_norm": 0.7748117446899414, "learning_rate": 0.0004946918738959099, "loss": 3.7931, "step": 1250 }, { "epoch": 0.08526973773610545, "grad_norm": 0.6777352094650269, "learning_rate": 0.0004946706413914934, "loss": 3.87, "step": 1255 }, { "epoch": 0.08560945780676722, "grad_norm": 0.759692907333374, "learning_rate": 0.0004946494088870771, "loss": 3.5758, "step": 1260 }, { "epoch": 0.08594917787742899, "grad_norm": 0.7235810160636902, "learning_rate": 0.0004946281763826608, "loss": 3.6186, "step": 1265 }, { "epoch": 0.08628889794809078, "grad_norm": 3.614327907562256, "learning_rate": 0.0004946069438782443, "loss": 3.6723, "step": 1270 }, { "epoch": 0.08662861801875255, "grad_norm": 0.7959243655204773, "learning_rate": 0.000494585711373828, "loss": 3.711, "step": 1275 }, { "epoch": 0.08696833808941433, "grad_norm": 8.740386009216309, "learning_rate": 0.0004945644788694116, "loss": 3.8968, "step": 1280 }, { "epoch": 0.0873080581600761, "grad_norm": 0.6901029944419861, "learning_rate": 0.0004945432463649953, "loss": 4.2531, "step": 1285 }, { "epoch": 0.08764777823073787, "grad_norm": 0.6305824518203735, "learning_rate": 0.0004945220138605789, "loss": 3.7051, "step": 1290 }, { "epoch": 0.08798749830139965, "grad_norm": 0.6807816624641418, "learning_rate": 0.0004945007813561625, "loss": 3.6892, "step": 1295 }, { "epoch": 0.08832721837206142, "grad_norm": 0.685790479183197, "learning_rate": 0.0004944795488517462, "loss": 3.9398, "step": 1300 }, { "epoch": 0.08866693844272319, "grad_norm": 0.7629333734512329, "learning_rate": 0.0004944583163473298, "loss": 3.7518, "step": 1305 }, { "epoch": 0.08900665851338498, "grad_norm": 0.6062626242637634, "learning_rate": 0.0004944370838429134, "loss": 3.7814, "step": 1310 }, { "epoch": 0.08934637858404675, "grad_norm": 0.8778780698776245, "learning_rate": 0.0004944158513384971, "loss": 3.6566, "step": 1315 }, { "epoch": 0.08968609865470852, "grad_norm": 0.6427554488182068, "learning_rate": 0.0004943946188340808, "loss": 3.3958, "step": 1320 }, { "epoch": 0.0900258187253703, "grad_norm": 0.734228253364563, "learning_rate": 0.0004943733863296644, "loss": 3.4477, "step": 1325 }, { "epoch": 0.09036553879603207, "grad_norm": 1.4460126161575317, "learning_rate": 0.000494352153825248, "loss": 3.7512, "step": 1330 }, { "epoch": 0.09070525886669384, "grad_norm": 1.9652591943740845, "learning_rate": 0.0004943309213208317, "loss": 3.7457, "step": 1335 }, { "epoch": 0.09104497893735562, "grad_norm": 0.823245644569397, "learning_rate": 0.0004943096888164153, "loss": 3.9511, "step": 1340 }, { "epoch": 0.09138469900801739, "grad_norm": 2.768824577331543, "learning_rate": 0.0004942884563119989, "loss": 3.4773, "step": 1345 }, { "epoch": 0.09172441907867916, "grad_norm": 0.9435215592384338, "learning_rate": 0.0004942672238075825, "loss": 3.6055, "step": 1350 }, { "epoch": 0.09206413914934095, "grad_norm": 0.8539856672286987, "learning_rate": 0.0004942459913031662, "loss": 3.7878, "step": 1355 }, { "epoch": 0.09240385922000272, "grad_norm": 0.8908941149711609, "learning_rate": 0.0004942247587987499, "loss": 3.6779, "step": 1360 }, { "epoch": 0.0927435792906645, "grad_norm": 0.6632312536239624, "learning_rate": 0.0004942035262943334, "loss": 3.6784, "step": 1365 }, { "epoch": 0.09308329936132627, "grad_norm": 1.356061577796936, "learning_rate": 0.0004941822937899171, "loss": 3.8055, "step": 1370 }, { "epoch": 0.09342301943198804, "grad_norm": 0.7378327250480652, "learning_rate": 0.0004941610612855008, "loss": 3.8091, "step": 1375 }, { "epoch": 0.09376273950264982, "grad_norm": 0.7094233632087708, "learning_rate": 0.0004941398287810843, "loss": 3.6226, "step": 1380 }, { "epoch": 0.09410245957331159, "grad_norm": 0.9981195330619812, "learning_rate": 0.000494118596276668, "loss": 3.751, "step": 1385 }, { "epoch": 0.09444217964397336, "grad_norm": 0.7891575694084167, "learning_rate": 0.0004940973637722517, "loss": 3.6728, "step": 1390 }, { "epoch": 0.09478189971463515, "grad_norm": 0.7278838753700256, "learning_rate": 0.0004940761312678353, "loss": 3.6828, "step": 1395 }, { "epoch": 0.09512161978529692, "grad_norm": 0.6019279360771179, "learning_rate": 0.000494054898763419, "loss": 3.7486, "step": 1400 }, { "epoch": 0.09546133985595869, "grad_norm": 0.5767970681190491, "learning_rate": 0.0004940336662590026, "loss": 3.7599, "step": 1405 }, { "epoch": 0.09580105992662047, "grad_norm": 0.6330364942550659, "learning_rate": 0.0004940124337545862, "loss": 3.8582, "step": 1410 }, { "epoch": 0.09614077999728224, "grad_norm": 0.694765567779541, "learning_rate": 0.0004939912012501699, "loss": 3.7821, "step": 1415 }, { "epoch": 0.09648050006794401, "grad_norm": 0.5734444856643677, "learning_rate": 0.0004939699687457535, "loss": 3.863, "step": 1420 }, { "epoch": 0.09682022013860579, "grad_norm": 0.7648805379867554, "learning_rate": 0.0004939487362413371, "loss": 3.7629, "step": 1425 }, { "epoch": 0.09715994020926756, "grad_norm": 0.5738037824630737, "learning_rate": 0.0004939275037369208, "loss": 3.6925, "step": 1430 }, { "epoch": 0.09749966027992933, "grad_norm": 0.7418702244758606, "learning_rate": 0.0004939062712325045, "loss": 3.6394, "step": 1435 }, { "epoch": 0.09783938035059112, "grad_norm": 0.8075767159461975, "learning_rate": 0.000493885038728088, "loss": 3.757, "step": 1440 }, { "epoch": 0.09817910042125289, "grad_norm": 0.6612007021903992, "learning_rate": 0.0004938638062236717, "loss": 3.6277, "step": 1445 }, { "epoch": 0.09851882049191467, "grad_norm": 0.6218631267547607, "learning_rate": 0.0004938425737192554, "loss": 3.717, "step": 1450 }, { "epoch": 0.09885854056257644, "grad_norm": 0.5911751389503479, "learning_rate": 0.0004938213412148389, "loss": 3.764, "step": 1455 }, { "epoch": 0.09919826063323821, "grad_norm": 0.6588653922080994, "learning_rate": 0.0004938001087104227, "loss": 3.8142, "step": 1460 }, { "epoch": 0.09953798070389999, "grad_norm": 0.9852591753005981, "learning_rate": 0.0004937788762060062, "loss": 3.6802, "step": 1465 }, { "epoch": 0.09987770077456176, "grad_norm": 1.2164267301559448, "learning_rate": 0.0004937576437015899, "loss": 3.8939, "step": 1470 }, { "epoch": 0.10021742084522353, "grad_norm": 0.6507337689399719, "learning_rate": 0.0004937364111971736, "loss": 3.4385, "step": 1475 }, { "epoch": 0.10055714091588532, "grad_norm": 0.9051929712295532, "learning_rate": 0.0004937151786927571, "loss": 3.7934, "step": 1480 }, { "epoch": 0.10089686098654709, "grad_norm": 0.6509424448013306, "learning_rate": 0.0004936939461883408, "loss": 3.5869, "step": 1485 }, { "epoch": 0.10123658105720885, "grad_norm": 0.6682103872299194, "learning_rate": 0.0004936727136839245, "loss": 3.6773, "step": 1490 }, { "epoch": 0.10157630112787064, "grad_norm": 0.7837647795677185, "learning_rate": 0.0004936514811795081, "loss": 3.6688, "step": 1495 }, { "epoch": 0.10191602119853241, "grad_norm": 0.7083595395088196, "learning_rate": 0.0004936302486750917, "loss": 3.8284, "step": 1500 }, { "epoch": 0.10225574126919418, "grad_norm": 1.9503401517868042, "learning_rate": 0.0004936090161706754, "loss": 3.761, "step": 1505 }, { "epoch": 0.10259546133985596, "grad_norm": 0.7743245959281921, "learning_rate": 0.000493587783666259, "loss": 3.6484, "step": 1510 }, { "epoch": 0.10293518141051773, "grad_norm": 0.6606540083885193, "learning_rate": 0.0004935665511618426, "loss": 3.7411, "step": 1515 }, { "epoch": 0.1032749014811795, "grad_norm": 0.6457474231719971, "learning_rate": 0.0004935453186574263, "loss": 3.5751, "step": 1520 }, { "epoch": 0.10361462155184128, "grad_norm": 0.7263528108596802, "learning_rate": 0.0004935240861530099, "loss": 3.8199, "step": 1525 }, { "epoch": 0.10395434162250305, "grad_norm": 0.6847793459892273, "learning_rate": 0.0004935028536485936, "loss": 3.7104, "step": 1530 }, { "epoch": 0.10429406169316484, "grad_norm": 2.655897855758667, "learning_rate": 0.0004934816211441773, "loss": 3.8885, "step": 1535 }, { "epoch": 0.10463378176382661, "grad_norm": 1.3483306169509888, "learning_rate": 0.0004934603886397608, "loss": 3.7366, "step": 1540 }, { "epoch": 0.10497350183448838, "grad_norm": 0.7469624876976013, "learning_rate": 0.0004934391561353445, "loss": 3.7408, "step": 1545 }, { "epoch": 0.10531322190515016, "grad_norm": 4.933581829071045, "learning_rate": 0.0004934179236309282, "loss": 3.6813, "step": 1550 }, { "epoch": 0.10565294197581193, "grad_norm": 0.7338085174560547, "learning_rate": 0.0004933966911265117, "loss": 3.6169, "step": 1555 }, { "epoch": 0.1059926620464737, "grad_norm": 0.7613298892974854, "learning_rate": 0.0004933754586220954, "loss": 3.6983, "step": 1560 }, { "epoch": 0.10633238211713548, "grad_norm": 0.9123542904853821, "learning_rate": 0.000493354226117679, "loss": 3.8113, "step": 1565 }, { "epoch": 0.10667210218779725, "grad_norm": 0.7202011346817017, "learning_rate": 0.0004933329936132627, "loss": 3.788, "step": 1570 }, { "epoch": 0.10701182225845902, "grad_norm": 0.7221234440803528, "learning_rate": 0.0004933117611088464, "loss": 3.8585, "step": 1575 }, { "epoch": 0.10735154232912081, "grad_norm": 0.8098806738853455, "learning_rate": 0.0004932905286044299, "loss": 3.8228, "step": 1580 }, { "epoch": 0.10769126239978258, "grad_norm": 0.6708495020866394, "learning_rate": 0.0004932692961000136, "loss": 3.68, "step": 1585 }, { "epoch": 0.10803098247044435, "grad_norm": 0.7177237868309021, "learning_rate": 0.0004932480635955973, "loss": 3.6198, "step": 1590 }, { "epoch": 0.10837070254110613, "grad_norm": 0.6466559767723083, "learning_rate": 0.0004932268310911808, "loss": 3.8878, "step": 1595 }, { "epoch": 0.1087104226117679, "grad_norm": 0.5524064302444458, "learning_rate": 0.0004932055985867645, "loss": 3.7217, "step": 1600 }, { "epoch": 0.10905014268242967, "grad_norm": 0.7076518535614014, "learning_rate": 0.0004931843660823482, "loss": 3.7129, "step": 1605 }, { "epoch": 0.10938986275309145, "grad_norm": 0.8710231184959412, "learning_rate": 0.0004931631335779318, "loss": 3.849, "step": 1610 }, { "epoch": 0.10972958282375322, "grad_norm": 0.7474174499511719, "learning_rate": 0.0004931419010735154, "loss": 3.7025, "step": 1615 }, { "epoch": 0.11006930289441501, "grad_norm": 0.6682606339454651, "learning_rate": 0.0004931206685690991, "loss": 3.5968, "step": 1620 }, { "epoch": 0.11040902296507678, "grad_norm": 0.6666601896286011, "learning_rate": 0.0004930994360646827, "loss": 3.6451, "step": 1625 }, { "epoch": 0.11074874303573855, "grad_norm": 0.7721078991889954, "learning_rate": 0.0004930782035602663, "loss": 3.7859, "step": 1630 }, { "epoch": 0.11108846310640033, "grad_norm": 0.7380674481391907, "learning_rate": 0.0004930569710558501, "loss": 3.7488, "step": 1635 }, { "epoch": 0.1114281831770621, "grad_norm": 0.7582091093063354, "learning_rate": 0.0004930357385514336, "loss": 3.8163, "step": 1640 }, { "epoch": 0.11176790324772387, "grad_norm": 0.6996340155601501, "learning_rate": 0.0004930145060470173, "loss": 3.8697, "step": 1645 }, { "epoch": 0.11210762331838565, "grad_norm": 0.743719756603241, "learning_rate": 0.000492993273542601, "loss": 3.604, "step": 1650 }, { "epoch": 0.11244734338904742, "grad_norm": 0.8432236313819885, "learning_rate": 0.0004929720410381845, "loss": 3.8508, "step": 1655 }, { "epoch": 0.1127870634597092, "grad_norm": 0.787841260433197, "learning_rate": 0.0004929508085337682, "loss": 3.7154, "step": 1660 }, { "epoch": 0.11312678353037098, "grad_norm": 0.8492522239685059, "learning_rate": 0.0004929295760293518, "loss": 3.6814, "step": 1665 }, { "epoch": 0.11346650360103275, "grad_norm": 0.7513903379440308, "learning_rate": 0.0004929083435249355, "loss": 3.6743, "step": 1670 }, { "epoch": 0.11380622367169452, "grad_norm": 0.6839136481285095, "learning_rate": 0.0004928871110205191, "loss": 3.8208, "step": 1675 }, { "epoch": 0.1141459437423563, "grad_norm": 0.7681917548179626, "learning_rate": 0.0004928658785161027, "loss": 3.7546, "step": 1680 }, { "epoch": 0.11448566381301807, "grad_norm": 0.6509638428688049, "learning_rate": 0.0004928446460116864, "loss": 3.6313, "step": 1685 }, { "epoch": 0.11482538388367985, "grad_norm": 0.6916874051094055, "learning_rate": 0.00049282341350727, "loss": 3.4296, "step": 1690 }, { "epoch": 0.11516510395434162, "grad_norm": 0.618182897567749, "learning_rate": 0.0004928021810028536, "loss": 3.6354, "step": 1695 }, { "epoch": 0.1155048240250034, "grad_norm": 0.8055250644683838, "learning_rate": 0.0004927809484984373, "loss": 3.7107, "step": 1700 }, { "epoch": 0.11584454409566518, "grad_norm": 0.587020754814148, "learning_rate": 0.000492759715994021, "loss": 3.4652, "step": 1705 }, { "epoch": 0.11618426416632695, "grad_norm": 0.5841749310493469, "learning_rate": 0.0004927384834896046, "loss": 3.8803, "step": 1710 }, { "epoch": 0.11652398423698872, "grad_norm": 0.9043471217155457, "learning_rate": 0.0004927172509851882, "loss": 3.5268, "step": 1715 }, { "epoch": 0.1168637043076505, "grad_norm": 0.7280998229980469, "learning_rate": 0.0004926960184807719, "loss": 3.7785, "step": 1720 }, { "epoch": 0.11720342437831227, "grad_norm": 1.102638840675354, "learning_rate": 0.0004926747859763555, "loss": 3.5181, "step": 1725 }, { "epoch": 0.11754314444897404, "grad_norm": 0.7297567129135132, "learning_rate": 0.0004926535534719391, "loss": 3.5923, "step": 1730 }, { "epoch": 0.11788286451963582, "grad_norm": 1.837306261062622, "learning_rate": 0.0004926323209675228, "loss": 3.6265, "step": 1735 }, { "epoch": 0.1182225845902976, "grad_norm": 0.791614830493927, "learning_rate": 0.0004926110884631064, "loss": 3.8197, "step": 1740 }, { "epoch": 0.11856230466095936, "grad_norm": 0.7620996832847595, "learning_rate": 0.0004925898559586901, "loss": 3.9394, "step": 1745 }, { "epoch": 0.11890202473162115, "grad_norm": 1.4246779680252075, "learning_rate": 0.0004925686234542736, "loss": 3.7734, "step": 1750 }, { "epoch": 0.11924174480228292, "grad_norm": 0.7304078936576843, "learning_rate": 0.0004925473909498573, "loss": 3.7981, "step": 1755 }, { "epoch": 0.11958146487294469, "grad_norm": 0.6806043982505798, "learning_rate": 0.000492526158445441, "loss": 3.8596, "step": 1760 }, { "epoch": 0.11992118494360647, "grad_norm": 0.754202127456665, "learning_rate": 0.0004925049259410245, "loss": 3.7412, "step": 1765 }, { "epoch": 0.12026090501426824, "grad_norm": 0.6883541941642761, "learning_rate": 0.0004924836934366083, "loss": 3.7953, "step": 1770 }, { "epoch": 0.12060062508493002, "grad_norm": 0.7564325928688049, "learning_rate": 0.0004924624609321919, "loss": 3.696, "step": 1775 }, { "epoch": 0.1209403451555918, "grad_norm": 1.191343069076538, "learning_rate": 0.0004924412284277755, "loss": 3.8015, "step": 1780 }, { "epoch": 0.12128006522625356, "grad_norm": 0.7389299869537354, "learning_rate": 0.0004924199959233592, "loss": 3.8348, "step": 1785 }, { "epoch": 0.12161978529691535, "grad_norm": 0.9015017747879028, "learning_rate": 0.0004923987634189428, "loss": 3.7917, "step": 1790 }, { "epoch": 0.12195950536757712, "grad_norm": 0.7832557559013367, "learning_rate": 0.0004923775309145264, "loss": 3.5608, "step": 1795 }, { "epoch": 0.12229922543823889, "grad_norm": 0.8731859922409058, "learning_rate": 0.0004923562984101101, "loss": 3.5859, "step": 1800 }, { "epoch": 0.12263894550890067, "grad_norm": 0.8186085820198059, "learning_rate": 0.0004923350659056938, "loss": 3.7167, "step": 1805 }, { "epoch": 0.12297866557956244, "grad_norm": 0.6881691813468933, "learning_rate": 0.0004923138334012773, "loss": 3.6401, "step": 1810 }, { "epoch": 0.12331838565022421, "grad_norm": 0.7310374975204468, "learning_rate": 0.000492292600896861, "loss": 3.6683, "step": 1815 }, { "epoch": 0.123658105720886, "grad_norm": 0.8642333149909973, "learning_rate": 0.0004922713683924447, "loss": 3.6405, "step": 1820 }, { "epoch": 0.12399782579154776, "grad_norm": 1.0276823043823242, "learning_rate": 0.0004922501358880282, "loss": 3.6092, "step": 1825 }, { "epoch": 0.12433754586220953, "grad_norm": 1.0267727375030518, "learning_rate": 0.0004922289033836119, "loss": 3.7603, "step": 1830 }, { "epoch": 0.12467726593287132, "grad_norm": 0.7356189489364624, "learning_rate": 0.0004922076708791956, "loss": 3.8069, "step": 1835 }, { "epoch": 0.1250169860035331, "grad_norm": 0.8401626348495483, "learning_rate": 0.0004921864383747792, "loss": 3.7748, "step": 1840 }, { "epoch": 0.12535670607419486, "grad_norm": 4.10971736907959, "learning_rate": 0.0004921652058703629, "loss": 3.6353, "step": 1845 }, { "epoch": 0.12569642614485663, "grad_norm": 0.8535498380661011, "learning_rate": 0.0004921439733659464, "loss": 3.7994, "step": 1850 }, { "epoch": 0.12603614621551842, "grad_norm": 0.7281075716018677, "learning_rate": 0.0004921227408615301, "loss": 3.741, "step": 1855 }, { "epoch": 0.1263758662861802, "grad_norm": 0.8881121277809143, "learning_rate": 0.0004921015083571138, "loss": 3.6808, "step": 1860 }, { "epoch": 0.12671558635684196, "grad_norm": 0.6927809715270996, "learning_rate": 0.0004920802758526973, "loss": 3.4848, "step": 1865 }, { "epoch": 0.12705530642750373, "grad_norm": 0.6604318618774414, "learning_rate": 0.000492059043348281, "loss": 3.821, "step": 1870 }, { "epoch": 0.1273950264981655, "grad_norm": 0.6511644721031189, "learning_rate": 0.0004920378108438647, "loss": 3.6034, "step": 1875 }, { "epoch": 0.12773474656882727, "grad_norm": 0.8243928551673889, "learning_rate": 0.0004920165783394483, "loss": 3.6491, "step": 1880 }, { "epoch": 0.12807446663948907, "grad_norm": 0.6462376713752747, "learning_rate": 0.000491995345835032, "loss": 3.7792, "step": 1885 }, { "epoch": 0.12841418671015084, "grad_norm": 0.8133718371391296, "learning_rate": 0.0004919741133306156, "loss": 3.2494, "step": 1890 }, { "epoch": 0.1287539067808126, "grad_norm": 0.8045417070388794, "learning_rate": 0.0004919528808261992, "loss": 3.7627, "step": 1895 }, { "epoch": 0.12909362685147438, "grad_norm": 0.7382818460464478, "learning_rate": 0.0004919316483217829, "loss": 3.6232, "step": 1900 }, { "epoch": 0.12943334692213615, "grad_norm": 0.6837460398674011, "learning_rate": 0.0004919104158173665, "loss": 3.4738, "step": 1905 }, { "epoch": 0.12977306699279795, "grad_norm": 0.7448748350143433, "learning_rate": 0.0004918891833129501, "loss": 3.7658, "step": 1910 }, { "epoch": 0.13011278706345972, "grad_norm": 0.6722131371498108, "learning_rate": 0.0004918679508085338, "loss": 3.8337, "step": 1915 }, { "epoch": 0.1304525071341215, "grad_norm": 0.7082023620605469, "learning_rate": 0.0004918467183041175, "loss": 3.4263, "step": 1920 }, { "epoch": 0.13079222720478326, "grad_norm": 0.8400975465774536, "learning_rate": 0.000491825485799701, "loss": 3.8012, "step": 1925 }, { "epoch": 0.13113194727544503, "grad_norm": 1.0848486423492432, "learning_rate": 0.0004918042532952847, "loss": 3.6599, "step": 1930 }, { "epoch": 0.1314716673461068, "grad_norm": 1.9695230722427368, "learning_rate": 0.0004917830207908684, "loss": 3.6611, "step": 1935 }, { "epoch": 0.1318113874167686, "grad_norm": 0.6733298897743225, "learning_rate": 0.0004917617882864519, "loss": 3.3162, "step": 1940 }, { "epoch": 0.13215110748743036, "grad_norm": 1.1085222959518433, "learning_rate": 0.0004917405557820357, "loss": 3.6175, "step": 1945 }, { "epoch": 0.13249082755809213, "grad_norm": 0.6675096750259399, "learning_rate": 0.0004917193232776192, "loss": 3.8169, "step": 1950 }, { "epoch": 0.1328305476287539, "grad_norm": 0.6941584348678589, "learning_rate": 0.0004916980907732029, "loss": 3.6589, "step": 1955 }, { "epoch": 0.13317026769941567, "grad_norm": 0.9514299631118774, "learning_rate": 0.0004916768582687866, "loss": 3.6998, "step": 1960 }, { "epoch": 0.13350998777007744, "grad_norm": 0.6938250064849854, "learning_rate": 0.0004916556257643701, "loss": 3.4415, "step": 1965 }, { "epoch": 0.13384970784073924, "grad_norm": 0.9484620690345764, "learning_rate": 0.0004916343932599538, "loss": 3.5786, "step": 1970 }, { "epoch": 0.134189427911401, "grad_norm": 0.7143712639808655, "learning_rate": 0.0004916131607555375, "loss": 3.8956, "step": 1975 }, { "epoch": 0.13452914798206278, "grad_norm": 0.7624014019966125, "learning_rate": 0.0004915919282511211, "loss": 3.546, "step": 1980 }, { "epoch": 0.13486886805272455, "grad_norm": 0.6610897183418274, "learning_rate": 0.0004915706957467047, "loss": 3.7742, "step": 1985 }, { "epoch": 0.13520858812338632, "grad_norm": 0.7593246698379517, "learning_rate": 0.0004915494632422884, "loss": 3.6491, "step": 1990 }, { "epoch": 0.13554830819404812, "grad_norm": 0.7027103304862976, "learning_rate": 0.000491528230737872, "loss": 3.7248, "step": 1995 }, { "epoch": 0.1358880282647099, "grad_norm": 0.7899437546730042, "learning_rate": 0.0004915069982334556, "loss": 3.6596, "step": 2000 }, { "epoch": 0.13622774833537166, "grad_norm": 0.8627012372016907, "learning_rate": 0.0004914857657290393, "loss": 3.5732, "step": 2005 }, { "epoch": 0.13656746840603343, "grad_norm": 1.0442651510238647, "learning_rate": 0.0004914645332246229, "loss": 3.9656, "step": 2010 }, { "epoch": 0.1369071884766952, "grad_norm": 0.5874420404434204, "learning_rate": 0.0004914433007202066, "loss": 3.7071, "step": 2015 }, { "epoch": 0.13724690854735697, "grad_norm": 0.8090359568595886, "learning_rate": 0.0004914220682157903, "loss": 3.861, "step": 2020 }, { "epoch": 0.13758662861801876, "grad_norm": 0.7161468267440796, "learning_rate": 0.0004914008357113738, "loss": 3.5639, "step": 2025 }, { "epoch": 0.13792634868868053, "grad_norm": 0.8931761384010315, "learning_rate": 0.0004913796032069575, "loss": 3.6262, "step": 2030 }, { "epoch": 0.1382660687593423, "grad_norm": 0.7959179878234863, "learning_rate": 0.0004913583707025412, "loss": 3.8592, "step": 2035 }, { "epoch": 0.13860578883000407, "grad_norm": 0.6171568036079407, "learning_rate": 0.0004913371381981247, "loss": 3.7405, "step": 2040 }, { "epoch": 0.13894550890066584, "grad_norm": 0.6043805480003357, "learning_rate": 0.0004913159056937084, "loss": 3.8385, "step": 2045 }, { "epoch": 0.1392852289713276, "grad_norm": 0.6656317710876465, "learning_rate": 0.000491294673189292, "loss": 3.656, "step": 2050 }, { "epoch": 0.1396249490419894, "grad_norm": 0.751796543598175, "learning_rate": 0.0004912734406848757, "loss": 3.748, "step": 2055 }, { "epoch": 0.13996466911265118, "grad_norm": 0.7975092530250549, "learning_rate": 0.0004912522081804593, "loss": 3.7599, "step": 2060 }, { "epoch": 0.14030438918331295, "grad_norm": 0.8609049320220947, "learning_rate": 0.0004912309756760429, "loss": 3.9297, "step": 2065 }, { "epoch": 0.14064410925397472, "grad_norm": 0.7666730284690857, "learning_rate": 0.0004912097431716266, "loss": 3.5912, "step": 2070 }, { "epoch": 0.1409838293246365, "grad_norm": 0.7912937998771667, "learning_rate": 0.0004911885106672102, "loss": 3.5584, "step": 2075 }, { "epoch": 0.1413235493952983, "grad_norm": 0.5775641202926636, "learning_rate": 0.0004911672781627938, "loss": 3.8099, "step": 2080 }, { "epoch": 0.14166326946596006, "grad_norm": 0.9047433137893677, "learning_rate": 0.0004911460456583775, "loss": 3.8254, "step": 2085 }, { "epoch": 0.14200298953662183, "grad_norm": 0.6990176439285278, "learning_rate": 0.0004911248131539612, "loss": 3.6656, "step": 2090 }, { "epoch": 0.1423427096072836, "grad_norm": 0.7259061336517334, "learning_rate": 0.0004911035806495448, "loss": 3.7406, "step": 2095 }, { "epoch": 0.14268242967794537, "grad_norm": 1.213557243347168, "learning_rate": 0.0004910823481451284, "loss": 3.64, "step": 2100 }, { "epoch": 0.14302214974860714, "grad_norm": 0.7467264533042908, "learning_rate": 0.0004910611156407121, "loss": 3.8154, "step": 2105 }, { "epoch": 0.14336186981926893, "grad_norm": 0.8508780002593994, "learning_rate": 0.0004910398831362957, "loss": 3.7988, "step": 2110 }, { "epoch": 0.1437015898899307, "grad_norm": 0.8289774060249329, "learning_rate": 0.0004910186506318793, "loss": 3.3569, "step": 2115 }, { "epoch": 0.14404130996059247, "grad_norm": 0.6189928650856018, "learning_rate": 0.000490997418127463, "loss": 3.8346, "step": 2120 }, { "epoch": 0.14438103003125424, "grad_norm": 0.7897703051567078, "learning_rate": 0.0004909761856230466, "loss": 3.6508, "step": 2125 }, { "epoch": 0.144720750101916, "grad_norm": 0.9006664156913757, "learning_rate": 0.0004909549531186303, "loss": 3.6285, "step": 2130 }, { "epoch": 0.14506047017257778, "grad_norm": 0.9556710720062256, "learning_rate": 0.0004909337206142138, "loss": 3.6623, "step": 2135 }, { "epoch": 0.14540019024323958, "grad_norm": 0.6847245693206787, "learning_rate": 0.0004909124881097975, "loss": 3.6669, "step": 2140 }, { "epoch": 0.14573991031390135, "grad_norm": 0.6775925755500793, "learning_rate": 0.0004908912556053812, "loss": 3.797, "step": 2145 }, { "epoch": 0.14607963038456312, "grad_norm": 6.741119384765625, "learning_rate": 0.0004908700231009647, "loss": 3.5778, "step": 2150 }, { "epoch": 0.1464193504552249, "grad_norm": 0.7240384221076965, "learning_rate": 0.0004908487905965485, "loss": 4.0842, "step": 2155 }, { "epoch": 0.14675907052588666, "grad_norm": 0.6737907528877258, "learning_rate": 0.0004908275580921321, "loss": 3.6925, "step": 2160 }, { "epoch": 0.14709879059654846, "grad_norm": 1.4479137659072876, "learning_rate": 0.0004908063255877157, "loss": 3.5953, "step": 2165 }, { "epoch": 0.14743851066721023, "grad_norm": 0.7829978466033936, "learning_rate": 0.0004907850930832994, "loss": 3.4532, "step": 2170 }, { "epoch": 0.147778230737872, "grad_norm": 0.6856682300567627, "learning_rate": 0.000490763860578883, "loss": 3.8842, "step": 2175 }, { "epoch": 0.14811795080853377, "grad_norm": 0.7501214146614075, "learning_rate": 0.0004907426280744666, "loss": 3.5752, "step": 2180 }, { "epoch": 0.14845767087919554, "grad_norm": 1.3756369352340698, "learning_rate": 0.0004907213955700503, "loss": 3.7354, "step": 2185 }, { "epoch": 0.1487973909498573, "grad_norm": 0.7789353728294373, "learning_rate": 0.000490700163065634, "loss": 3.6695, "step": 2190 }, { "epoch": 0.1491371110205191, "grad_norm": 0.7474542856216431, "learning_rate": 0.0004906789305612175, "loss": 3.7037, "step": 2195 }, { "epoch": 0.14947683109118087, "grad_norm": 0.749650776386261, "learning_rate": 0.0004906576980568012, "loss": 3.7329, "step": 2200 }, { "epoch": 0.14981655116184264, "grad_norm": 0.7057173252105713, "learning_rate": 0.0004906364655523849, "loss": 3.6687, "step": 2205 }, { "epoch": 0.1501562712325044, "grad_norm": 0.7271165251731873, "learning_rate": 0.0004906152330479684, "loss": 3.7925, "step": 2210 }, { "epoch": 0.15049599130316618, "grad_norm": 3.804062604904175, "learning_rate": 0.0004905940005435521, "loss": 3.6531, "step": 2215 }, { "epoch": 0.15083571137382795, "grad_norm": 0.8345944285392761, "learning_rate": 0.0004905727680391358, "loss": 3.7783, "step": 2220 }, { "epoch": 0.15117543144448975, "grad_norm": 0.6936637163162231, "learning_rate": 0.0004905515355347194, "loss": 3.8569, "step": 2225 }, { "epoch": 0.15151515151515152, "grad_norm": 0.8015171885490417, "learning_rate": 0.0004905303030303031, "loss": 3.4573, "step": 2230 }, { "epoch": 0.1518548715858133, "grad_norm": 0.6555799245834351, "learning_rate": 0.0004905090705258866, "loss": 3.8286, "step": 2235 }, { "epoch": 0.15219459165647506, "grad_norm": 0.7007747888565063, "learning_rate": 0.0004904878380214703, "loss": 3.8175, "step": 2240 }, { "epoch": 0.15253431172713683, "grad_norm": 0.6038751006126404, "learning_rate": 0.000490466605517054, "loss": 3.8134, "step": 2245 }, { "epoch": 0.15287403179779863, "grad_norm": 0.8329855799674988, "learning_rate": 0.0004904453730126375, "loss": 3.5291, "step": 2250 }, { "epoch": 0.1532137518684604, "grad_norm": 0.9038205146789551, "learning_rate": 0.0004904241405082213, "loss": 3.7092, "step": 2255 }, { "epoch": 0.15355347193912217, "grad_norm": 1.1969314813613892, "learning_rate": 0.0004904029080038049, "loss": 3.8153, "step": 2260 }, { "epoch": 0.15389319200978394, "grad_norm": 0.7161705493927002, "learning_rate": 0.0004903816754993885, "loss": 3.6924, "step": 2265 }, { "epoch": 0.1542329120804457, "grad_norm": 0.6664339303970337, "learning_rate": 0.0004903604429949722, "loss": 3.7695, "step": 2270 }, { "epoch": 0.15457263215110748, "grad_norm": 0.8777261972427368, "learning_rate": 0.0004903392104905558, "loss": 3.6233, "step": 2275 }, { "epoch": 0.15491235222176927, "grad_norm": 0.8664095401763916, "learning_rate": 0.0004903179779861394, "loss": 3.5797, "step": 2280 }, { "epoch": 0.15525207229243104, "grad_norm": 0.8031551241874695, "learning_rate": 0.0004902967454817231, "loss": 3.6172, "step": 2285 }, { "epoch": 0.1555917923630928, "grad_norm": 0.6813163757324219, "learning_rate": 0.0004902755129773068, "loss": 3.5929, "step": 2290 }, { "epoch": 0.15593151243375458, "grad_norm": 0.8453793525695801, "learning_rate": 0.0004902542804728903, "loss": 3.5075, "step": 2295 }, { "epoch": 0.15627123250441635, "grad_norm": 0.8528273105621338, "learning_rate": 0.000490233047968474, "loss": 3.666, "step": 2300 }, { "epoch": 0.15661095257507815, "grad_norm": 0.7870190143585205, "learning_rate": 0.0004902118154640577, "loss": 3.8238, "step": 2305 }, { "epoch": 0.15695067264573992, "grad_norm": 0.9592297673225403, "learning_rate": 0.0004901905829596412, "loss": 3.4855, "step": 2310 }, { "epoch": 0.1572903927164017, "grad_norm": 0.6766757965087891, "learning_rate": 0.0004901693504552249, "loss": 3.7462, "step": 2315 }, { "epoch": 0.15763011278706346, "grad_norm": 1.0727345943450928, "learning_rate": 0.0004901481179508086, "loss": 3.7272, "step": 2320 }, { "epoch": 0.15796983285772523, "grad_norm": 1.4382566213607788, "learning_rate": 0.0004901268854463922, "loss": 3.8211, "step": 2325 }, { "epoch": 0.158309552928387, "grad_norm": 0.6565765738487244, "learning_rate": 0.0004901056529419759, "loss": 3.6314, "step": 2330 }, { "epoch": 0.1586492729990488, "grad_norm": 0.6315727233886719, "learning_rate": 0.0004900844204375594, "loss": 3.6725, "step": 2335 }, { "epoch": 0.15898899306971057, "grad_norm": 0.9041573405265808, "learning_rate": 0.0004900631879331431, "loss": 3.7394, "step": 2340 }, { "epoch": 0.15932871314037234, "grad_norm": 0.7126863598823547, "learning_rate": 0.0004900419554287268, "loss": 3.7313, "step": 2345 }, { "epoch": 0.1596684332110341, "grad_norm": 0.8121592402458191, "learning_rate": 0.0004900207229243103, "loss": 3.8544, "step": 2350 }, { "epoch": 0.16000815328169588, "grad_norm": 0.6411019563674927, "learning_rate": 0.000489999490419894, "loss": 3.6979, "step": 2355 }, { "epoch": 0.16034787335235764, "grad_norm": 0.7645266056060791, "learning_rate": 0.0004899782579154777, "loss": 3.7905, "step": 2360 }, { "epoch": 0.16068759342301944, "grad_norm": 0.675037682056427, "learning_rate": 0.0004899570254110613, "loss": 3.4693, "step": 2365 }, { "epoch": 0.1610273134936812, "grad_norm": 1.146195411682129, "learning_rate": 0.0004899357929066449, "loss": 3.6358, "step": 2370 }, { "epoch": 0.16136703356434298, "grad_norm": 0.6876041889190674, "learning_rate": 0.0004899145604022286, "loss": 3.7075, "step": 2375 }, { "epoch": 0.16170675363500475, "grad_norm": 1.0540108680725098, "learning_rate": 0.0004898933278978122, "loss": 3.5627, "step": 2380 }, { "epoch": 0.16204647370566652, "grad_norm": 0.7455466389656067, "learning_rate": 0.0004898720953933958, "loss": 3.6721, "step": 2385 }, { "epoch": 0.16238619377632832, "grad_norm": 0.7870174050331116, "learning_rate": 0.0004898508628889795, "loss": 3.8299, "step": 2390 }, { "epoch": 0.1627259138469901, "grad_norm": 1.2914894819259644, "learning_rate": 0.0004898296303845631, "loss": 3.4942, "step": 2395 }, { "epoch": 0.16306563391765186, "grad_norm": 0.6331920027732849, "learning_rate": 0.0004898083978801468, "loss": 3.6977, "step": 2400 }, { "epoch": 0.16340535398831363, "grad_norm": 0.5884405970573425, "learning_rate": 0.0004897871653757305, "loss": 3.5073, "step": 2405 }, { "epoch": 0.1637450740589754, "grad_norm": 0.5583176016807556, "learning_rate": 0.000489765932871314, "loss": 3.8462, "step": 2410 }, { "epoch": 0.16408479412963717, "grad_norm": 0.8851171731948853, "learning_rate": 0.0004897447003668977, "loss": 3.8369, "step": 2415 }, { "epoch": 0.16442451420029897, "grad_norm": 0.9939813017845154, "learning_rate": 0.0004897234678624814, "loss": 3.6561, "step": 2420 }, { "epoch": 0.16476423427096074, "grad_norm": 0.7610264420509338, "learning_rate": 0.0004897022353580649, "loss": 3.7041, "step": 2425 }, { "epoch": 0.1651039543416225, "grad_norm": 0.8472675681114197, "learning_rate": 0.0004896810028536486, "loss": 3.6896, "step": 2430 }, { "epoch": 0.16544367441228428, "grad_norm": 0.720535933971405, "learning_rate": 0.0004896597703492322, "loss": 3.8384, "step": 2435 }, { "epoch": 0.16578339448294604, "grad_norm": 0.680776059627533, "learning_rate": 0.0004896385378448159, "loss": 3.7183, "step": 2440 }, { "epoch": 0.16612311455360781, "grad_norm": 1.0876171588897705, "learning_rate": 0.0004896173053403995, "loss": 3.7391, "step": 2445 }, { "epoch": 0.1664628346242696, "grad_norm": 0.6896780133247375, "learning_rate": 0.0004895960728359831, "loss": 3.6119, "step": 2450 }, { "epoch": 0.16680255469493138, "grad_norm": 1.0084868669509888, "learning_rate": 0.0004895748403315668, "loss": 3.6559, "step": 2455 }, { "epoch": 0.16714227476559315, "grad_norm": 0.8299074172973633, "learning_rate": 0.0004895536078271504, "loss": 3.6931, "step": 2460 }, { "epoch": 0.16748199483625492, "grad_norm": 0.8245034217834473, "learning_rate": 0.0004895323753227341, "loss": 3.5678, "step": 2465 }, { "epoch": 0.1678217149069167, "grad_norm": 0.9635615944862366, "learning_rate": 0.0004895111428183177, "loss": 3.8111, "step": 2470 }, { "epoch": 0.1681614349775785, "grad_norm": 0.9036959409713745, "learning_rate": 0.0004894899103139014, "loss": 3.7715, "step": 2475 }, { "epoch": 0.16850115504824026, "grad_norm": 0.7289747595787048, "learning_rate": 0.000489468677809485, "loss": 3.7065, "step": 2480 }, { "epoch": 0.16884087511890203, "grad_norm": 0.6939671635627747, "learning_rate": 0.0004894474453050686, "loss": 3.7439, "step": 2485 }, { "epoch": 0.1691805951895638, "grad_norm": 0.6586594581604004, "learning_rate": 0.0004894262128006523, "loss": 3.7673, "step": 2490 }, { "epoch": 0.16952031526022557, "grad_norm": 0.7694540619850159, "learning_rate": 0.0004894049802962359, "loss": 3.6815, "step": 2495 }, { "epoch": 0.16986003533088734, "grad_norm": 0.6492583751678467, "learning_rate": 0.0004893837477918196, "loss": 3.8308, "step": 2500 }, { "epoch": 0.17019975540154914, "grad_norm": 0.6619965434074402, "learning_rate": 0.0004893625152874032, "loss": 3.8354, "step": 2505 }, { "epoch": 0.1705394754722109, "grad_norm": 0.6523855924606323, "learning_rate": 0.0004893412827829868, "loss": 3.8809, "step": 2510 }, { "epoch": 0.17087919554287267, "grad_norm": 0.6041701436042786, "learning_rate": 0.0004893200502785705, "loss": 3.6803, "step": 2515 }, { "epoch": 0.17121891561353444, "grad_norm": 0.607407808303833, "learning_rate": 0.000489298817774154, "loss": 3.8188, "step": 2520 }, { "epoch": 0.17155863568419621, "grad_norm": 0.6803737282752991, "learning_rate": 0.0004892775852697377, "loss": 3.8566, "step": 2525 }, { "epoch": 0.17189835575485798, "grad_norm": 0.745516836643219, "learning_rate": 0.0004892563527653214, "loss": 3.6868, "step": 2530 }, { "epoch": 0.17223807582551978, "grad_norm": 0.8180308938026428, "learning_rate": 0.000489235120260905, "loss": 3.8913, "step": 2535 }, { "epoch": 0.17257779589618155, "grad_norm": 0.6997405886650085, "learning_rate": 0.0004892138877564887, "loss": 3.5939, "step": 2540 }, { "epoch": 0.17291751596684332, "grad_norm": 0.7282498478889465, "learning_rate": 0.0004891926552520723, "loss": 3.6258, "step": 2545 }, { "epoch": 0.1732572360375051, "grad_norm": 3.796626091003418, "learning_rate": 0.0004891714227476559, "loss": 3.7793, "step": 2550 }, { "epoch": 0.17359695610816686, "grad_norm": 0.6571747064590454, "learning_rate": 0.0004891501902432396, "loss": 3.5836, "step": 2555 }, { "epoch": 0.17393667617882866, "grad_norm": 0.9613381624221802, "learning_rate": 0.0004891289577388232, "loss": 3.4951, "step": 2560 }, { "epoch": 0.17427639624949043, "grad_norm": 1.0400304794311523, "learning_rate": 0.0004891077252344068, "loss": 3.6592, "step": 2565 }, { "epoch": 0.1746161163201522, "grad_norm": 1.0958144664764404, "learning_rate": 0.0004890864927299905, "loss": 3.7557, "step": 2570 }, { "epoch": 0.17495583639081397, "grad_norm": 1.0287315845489502, "learning_rate": 0.0004890652602255742, "loss": 3.8605, "step": 2575 }, { "epoch": 0.17529555646147574, "grad_norm": 0.8766208291053772, "learning_rate": 0.0004890440277211578, "loss": 3.8341, "step": 2580 }, { "epoch": 0.1756352765321375, "grad_norm": 0.6588528156280518, "learning_rate": 0.0004890227952167414, "loss": 3.5882, "step": 2585 }, { "epoch": 0.1759749966027993, "grad_norm": 0.789517879486084, "learning_rate": 0.0004890015627123251, "loss": 3.5365, "step": 2590 }, { "epoch": 0.17631471667346107, "grad_norm": 0.8558750748634338, "learning_rate": 0.0004889803302079087, "loss": 3.6243, "step": 2595 }, { "epoch": 0.17665443674412284, "grad_norm": 0.7893194556236267, "learning_rate": 0.0004889590977034923, "loss": 3.4765, "step": 2600 }, { "epoch": 0.17699415681478461, "grad_norm": 1.8730924129486084, "learning_rate": 0.000488937865199076, "loss": 3.9838, "step": 2605 }, { "epoch": 0.17733387688544638, "grad_norm": 0.7545998692512512, "learning_rate": 0.0004889166326946596, "loss": 3.6587, "step": 2610 }, { "epoch": 0.17767359695610815, "grad_norm": 0.6626622676849365, "learning_rate": 0.0004888954001902433, "loss": 3.3692, "step": 2615 }, { "epoch": 0.17801331702676995, "grad_norm": 0.7592429518699646, "learning_rate": 0.0004888741676858268, "loss": 3.6259, "step": 2620 }, { "epoch": 0.17835303709743172, "grad_norm": 0.7772994041442871, "learning_rate": 0.0004888529351814105, "loss": 3.5855, "step": 2625 }, { "epoch": 0.1786927571680935, "grad_norm": 0.8997565507888794, "learning_rate": 0.0004888317026769942, "loss": 3.5332, "step": 2630 }, { "epoch": 0.17903247723875526, "grad_norm": 0.6598993539810181, "learning_rate": 0.0004888104701725777, "loss": 3.6894, "step": 2635 }, { "epoch": 0.17937219730941703, "grad_norm": 0.724504828453064, "learning_rate": 0.0004887892376681615, "loss": 3.7929, "step": 2640 }, { "epoch": 0.17971191738007883, "grad_norm": 0.6006979942321777, "learning_rate": 0.0004887680051637451, "loss": 3.6706, "step": 2645 }, { "epoch": 0.1800516374507406, "grad_norm": 0.7377782464027405, "learning_rate": 0.0004887467726593287, "loss": 3.8517, "step": 2650 }, { "epoch": 0.18039135752140237, "grad_norm": 0.8025123476982117, "learning_rate": 0.0004887255401549124, "loss": 3.7842, "step": 2655 }, { "epoch": 0.18073107759206414, "grad_norm": 0.6233127117156982, "learning_rate": 0.000488704307650496, "loss": 3.4837, "step": 2660 }, { "epoch": 0.1810707976627259, "grad_norm": 0.7821118235588074, "learning_rate": 0.0004886830751460796, "loss": 3.6342, "step": 2665 }, { "epoch": 0.18141051773338768, "grad_norm": 0.7989711761474609, "learning_rate": 0.0004886618426416633, "loss": 3.6085, "step": 2670 }, { "epoch": 0.18175023780404947, "grad_norm": 1.2865526676177979, "learning_rate": 0.000488640610137247, "loss": 3.6428, "step": 2675 }, { "epoch": 0.18208995787471124, "grad_norm": 0.8912000060081482, "learning_rate": 0.0004886193776328305, "loss": 3.7833, "step": 2680 }, { "epoch": 0.18242967794537301, "grad_norm": 0.5777593851089478, "learning_rate": 0.0004885981451284142, "loss": 3.6923, "step": 2685 }, { "epoch": 0.18276939801603478, "grad_norm": 0.8895307183265686, "learning_rate": 0.0004885769126239979, "loss": 3.6709, "step": 2690 }, { "epoch": 0.18310911808669655, "grad_norm": 10.587468147277832, "learning_rate": 0.0004885556801195814, "loss": 3.271, "step": 2695 }, { "epoch": 0.18344883815735832, "grad_norm": 0.7461752891540527, "learning_rate": 0.0004885344476151651, "loss": 3.8479, "step": 2700 }, { "epoch": 0.18378855822802012, "grad_norm": 0.7754172086715698, "learning_rate": 0.0004885132151107488, "loss": 3.5703, "step": 2705 }, { "epoch": 0.1841282782986819, "grad_norm": 0.7618787884712219, "learning_rate": 0.0004884919826063324, "loss": 3.6141, "step": 2710 }, { "epoch": 0.18446799836934366, "grad_norm": 0.7424732446670532, "learning_rate": 0.0004884707501019161, "loss": 3.7161, "step": 2715 }, { "epoch": 0.18480771844000543, "grad_norm": 0.871410071849823, "learning_rate": 0.0004884495175974996, "loss": 3.7112, "step": 2720 }, { "epoch": 0.1851474385106672, "grad_norm": 0.7123041749000549, "learning_rate": 0.0004884282850930833, "loss": 3.7676, "step": 2725 }, { "epoch": 0.185487158581329, "grad_norm": 0.7749680280685425, "learning_rate": 0.000488407052588667, "loss": 3.9219, "step": 2730 }, { "epoch": 0.18582687865199077, "grad_norm": 0.7869135737419128, "learning_rate": 0.0004883858200842505, "loss": 3.7653, "step": 2735 }, { "epoch": 0.18616659872265254, "grad_norm": 0.6588122844696045, "learning_rate": 0.0004883645875798342, "loss": 3.4825, "step": 2740 }, { "epoch": 0.1865063187933143, "grad_norm": 0.5788915157318115, "learning_rate": 0.0004883433550754179, "loss": 3.5408, "step": 2745 }, { "epoch": 0.18684603886397608, "grad_norm": 0.8431991338729858, "learning_rate": 0.0004883221225710015, "loss": 3.7046, "step": 2750 }, { "epoch": 0.18718575893463785, "grad_norm": 0.917365550994873, "learning_rate": 0.0004883008900665851, "loss": 3.9465, "step": 2755 }, { "epoch": 0.18752547900529964, "grad_norm": 0.6573261022567749, "learning_rate": 0.0004882796575621688, "loss": 3.6875, "step": 2760 }, { "epoch": 0.18786519907596141, "grad_norm": 0.719317615032196, "learning_rate": 0.00048825842505775244, "loss": 3.7343, "step": 2765 }, { "epoch": 0.18820491914662318, "grad_norm": 0.6207064390182495, "learning_rate": 0.00048823719255333605, "loss": 3.7461, "step": 2770 }, { "epoch": 0.18854463921728495, "grad_norm": 0.818270742893219, "learning_rate": 0.0004882159600489197, "loss": 3.9241, "step": 2775 }, { "epoch": 0.18888435928794672, "grad_norm": 0.5928511023521423, "learning_rate": 0.00048819472754450333, "loss": 3.6758, "step": 2780 }, { "epoch": 0.1892240793586085, "grad_norm": 0.6911123394966125, "learning_rate": 0.00048817349504008694, "loss": 3.3732, "step": 2785 }, { "epoch": 0.1895637994292703, "grad_norm": 0.8556875586509705, "learning_rate": 0.0004881522625356706, "loss": 3.6977, "step": 2790 }, { "epoch": 0.18990351949993206, "grad_norm": 0.8025653958320618, "learning_rate": 0.0004881310300312543, "loss": 3.4866, "step": 2795 }, { "epoch": 0.19024323957059383, "grad_norm": 0.8513450622558594, "learning_rate": 0.0004881097975268379, "loss": 3.6009, "step": 2800 }, { "epoch": 0.1905829596412556, "grad_norm": 2.5956499576568604, "learning_rate": 0.00048808856502242156, "loss": 3.6464, "step": 2805 }, { "epoch": 0.19092267971191737, "grad_norm": 0.7112922072410583, "learning_rate": 0.00048806733251800517, "loss": 3.2374, "step": 2810 }, { "epoch": 0.19126239978257917, "grad_norm": 1.015148639678955, "learning_rate": 0.0004880461000135888, "loss": 3.6458, "step": 2815 }, { "epoch": 0.19160211985324094, "grad_norm": 0.886098325252533, "learning_rate": 0.00048802486750917245, "loss": 3.7923, "step": 2820 }, { "epoch": 0.1919418399239027, "grad_norm": 0.6111242771148682, "learning_rate": 0.00048800363500475606, "loss": 3.5841, "step": 2825 }, { "epoch": 0.19228155999456448, "grad_norm": 0.6688699126243591, "learning_rate": 0.00048798240250033973, "loss": 3.8184, "step": 2830 }, { "epoch": 0.19262128006522625, "grad_norm": 0.9176944494247437, "learning_rate": 0.0004879611699959234, "loss": 3.6452, "step": 2835 }, { "epoch": 0.19296100013588802, "grad_norm": 0.8387790322303772, "learning_rate": 0.000487939937491507, "loss": 3.6167, "step": 2840 }, { "epoch": 0.19330072020654981, "grad_norm": 0.5995345711708069, "learning_rate": 0.0004879187049870906, "loss": 3.801, "step": 2845 }, { "epoch": 0.19364044027721158, "grad_norm": 0.6525061726570129, "learning_rate": 0.0004878974724826743, "loss": 3.6712, "step": 2850 }, { "epoch": 0.19398016034787335, "grad_norm": 1.0600078105926514, "learning_rate": 0.0004878762399782579, "loss": 3.7473, "step": 2855 }, { "epoch": 0.19431988041853512, "grad_norm": 0.7363470792770386, "learning_rate": 0.0004878550074738415, "loss": 3.6191, "step": 2860 }, { "epoch": 0.1946596004891969, "grad_norm": 1.1621986627578735, "learning_rate": 0.00048783377496942524, "loss": 3.7329, "step": 2865 }, { "epoch": 0.19499932055985866, "grad_norm": 0.9590019583702087, "learning_rate": 0.00048781254246500885, "loss": 3.5139, "step": 2870 }, { "epoch": 0.19533904063052046, "grad_norm": 0.9282140135765076, "learning_rate": 0.00048779130996059246, "loss": 3.645, "step": 2875 }, { "epoch": 0.19567876070118223, "grad_norm": 0.696601152420044, "learning_rate": 0.00048777007745617613, "loss": 3.8607, "step": 2880 }, { "epoch": 0.196018480771844, "grad_norm": 0.7489016056060791, "learning_rate": 0.00048774884495175974, "loss": 3.6201, "step": 2885 }, { "epoch": 0.19635820084250577, "grad_norm": 0.7331322431564331, "learning_rate": 0.00048772761244734336, "loss": 3.305, "step": 2890 }, { "epoch": 0.19669792091316754, "grad_norm": 1.0016788244247437, "learning_rate": 0.000487706379942927, "loss": 3.3955, "step": 2895 }, { "epoch": 0.19703764098382934, "grad_norm": 1.2459478378295898, "learning_rate": 0.0004876851474385107, "loss": 3.7665, "step": 2900 }, { "epoch": 0.1973773610544911, "grad_norm": 0.8072220683097839, "learning_rate": 0.00048766391493409436, "loss": 3.404, "step": 2905 }, { "epoch": 0.19771708112515288, "grad_norm": 0.9906612634658813, "learning_rate": 0.00048764268242967797, "loss": 3.5161, "step": 2910 }, { "epoch": 0.19805680119581465, "grad_norm": 0.8436631560325623, "learning_rate": 0.0004876214499252616, "loss": 3.6928, "step": 2915 }, { "epoch": 0.19839652126647642, "grad_norm": 0.8599230647087097, "learning_rate": 0.00048760021742084525, "loss": 3.7027, "step": 2920 }, { "epoch": 0.1987362413371382, "grad_norm": 0.7203686237335205, "learning_rate": 0.00048757898491642886, "loss": 3.5902, "step": 2925 }, { "epoch": 0.19907596140779998, "grad_norm": 1.5745136737823486, "learning_rate": 0.0004875577524120125, "loss": 3.4799, "step": 2930 }, { "epoch": 0.19941568147846175, "grad_norm": 0.6809103488922119, "learning_rate": 0.0004875365199075962, "loss": 3.8251, "step": 2935 }, { "epoch": 0.19975540154912352, "grad_norm": 1.0106916427612305, "learning_rate": 0.0004875152874031798, "loss": 3.4818, "step": 2940 }, { "epoch": 0.2000951216197853, "grad_norm": 1.1775444746017456, "learning_rate": 0.0004874940548987634, "loss": 3.6687, "step": 2945 }, { "epoch": 0.20043484169044706, "grad_norm": 0.8074027895927429, "learning_rate": 0.0004874728223943471, "loss": 3.7417, "step": 2950 }, { "epoch": 0.20077456176110883, "grad_norm": 0.6968055367469788, "learning_rate": 0.0004874515898899307, "loss": 3.926, "step": 2955 }, { "epoch": 0.20111428183177063, "grad_norm": 0.7794591784477234, "learning_rate": 0.0004874303573855143, "loss": 3.7971, "step": 2960 }, { "epoch": 0.2014540019024324, "grad_norm": 0.6135337948799133, "learning_rate": 0.000487409124881098, "loss": 3.5821, "step": 2965 }, { "epoch": 0.20179372197309417, "grad_norm": 0.7302685976028442, "learning_rate": 0.00048738789237668165, "loss": 3.6496, "step": 2970 }, { "epoch": 0.20213344204375594, "grad_norm": 0.7609180808067322, "learning_rate": 0.00048736665987226526, "loss": 3.5834, "step": 2975 }, { "epoch": 0.2024731621144177, "grad_norm": 0.759857714176178, "learning_rate": 0.00048734542736784893, "loss": 3.6578, "step": 2980 }, { "epoch": 0.2028128821850795, "grad_norm": 0.8223823308944702, "learning_rate": 0.00048732419486343254, "loss": 3.5164, "step": 2985 }, { "epoch": 0.20315260225574128, "grad_norm": 0.6405567526817322, "learning_rate": 0.00048730296235901616, "loss": 3.6724, "step": 2990 }, { "epoch": 0.20349232232640305, "grad_norm": 1.040215015411377, "learning_rate": 0.0004872817298545998, "loss": 3.71, "step": 2995 }, { "epoch": 0.20383204239706482, "grad_norm": 0.6048935651779175, "learning_rate": 0.00048726049735018344, "loss": 3.6026, "step": 3000 }, { "epoch": 0.2041717624677266, "grad_norm": 1.0037267208099365, "learning_rate": 0.0004872392648457671, "loss": 3.6082, "step": 3005 }, { "epoch": 0.20451148253838836, "grad_norm": 0.7137578129768372, "learning_rate": 0.00048721803234135077, "loss": 3.7791, "step": 3010 }, { "epoch": 0.20485120260905015, "grad_norm": 0.6953353881835938, "learning_rate": 0.0004871967998369344, "loss": 3.7988, "step": 3015 }, { "epoch": 0.20519092267971192, "grad_norm": 0.7209901213645935, "learning_rate": 0.000487175567332518, "loss": 3.7794, "step": 3020 }, { "epoch": 0.2055306427503737, "grad_norm": 0.7369226217269897, "learning_rate": 0.00048715433482810166, "loss": 3.7009, "step": 3025 }, { "epoch": 0.20587036282103546, "grad_norm": 1.1090987920761108, "learning_rate": 0.0004871331023236853, "loss": 3.6596, "step": 3030 }, { "epoch": 0.20621008289169723, "grad_norm": 0.6461921334266663, "learning_rate": 0.0004871118698192689, "loss": 3.6205, "step": 3035 }, { "epoch": 0.206549802962359, "grad_norm": 0.8342731595039368, "learning_rate": 0.0004870906373148526, "loss": 3.6431, "step": 3040 }, { "epoch": 0.2068895230330208, "grad_norm": 0.8547791838645935, "learning_rate": 0.0004870694048104362, "loss": 3.555, "step": 3045 }, { "epoch": 0.20722924310368257, "grad_norm": 0.6723164319992065, "learning_rate": 0.00048704817230601984, "loss": 3.6104, "step": 3050 }, { "epoch": 0.20756896317434434, "grad_norm": 0.8235931396484375, "learning_rate": 0.0004870269398016035, "loss": 3.5132, "step": 3055 }, { "epoch": 0.2079086832450061, "grad_norm": 1.2890005111694336, "learning_rate": 0.0004870057072971871, "loss": 3.7192, "step": 3060 }, { "epoch": 0.20824840331566788, "grad_norm": 0.6429652571678162, "learning_rate": 0.00048698447479277073, "loss": 3.5215, "step": 3065 }, { "epoch": 0.20858812338632968, "grad_norm": 0.8776254057884216, "learning_rate": 0.0004869632422883544, "loss": 3.7688, "step": 3070 }, { "epoch": 0.20892784345699145, "grad_norm": 0.6718867421150208, "learning_rate": 0.00048694200978393806, "loss": 3.5297, "step": 3075 }, { "epoch": 0.20926756352765322, "grad_norm": 0.8000648021697998, "learning_rate": 0.0004869207772795217, "loss": 3.5619, "step": 3080 }, { "epoch": 0.209607283598315, "grad_norm": 0.9478837251663208, "learning_rate": 0.00048689954477510535, "loss": 3.8867, "step": 3085 }, { "epoch": 0.20994700366897676, "grad_norm": 0.860708475112915, "learning_rate": 0.00048687831227068896, "loss": 3.6163, "step": 3090 }, { "epoch": 0.21028672373963853, "grad_norm": 0.7412290573120117, "learning_rate": 0.00048685707976627257, "loss": 3.7178, "step": 3095 }, { "epoch": 0.21062644381030032, "grad_norm": 0.8216627836227417, "learning_rate": 0.00048683584726185624, "loss": 3.8917, "step": 3100 }, { "epoch": 0.2109661638809621, "grad_norm": 0.8016996383666992, "learning_rate": 0.00048681461475743985, "loss": 3.6723, "step": 3105 }, { "epoch": 0.21130588395162386, "grad_norm": 0.8483715653419495, "learning_rate": 0.0004867933822530235, "loss": 3.8619, "step": 3110 }, { "epoch": 0.21164560402228563, "grad_norm": 0.70466548204422, "learning_rate": 0.0004867721497486072, "loss": 3.7307, "step": 3115 }, { "epoch": 0.2119853240929474, "grad_norm": 1.2524949312210083, "learning_rate": 0.0004867509172441908, "loss": 3.7063, "step": 3120 }, { "epoch": 0.21232504416360917, "grad_norm": 0.8419017195701599, "learning_rate": 0.0004867296847397744, "loss": 3.4099, "step": 3125 }, { "epoch": 0.21266476423427097, "grad_norm": 0.8310908079147339, "learning_rate": 0.0004867084522353581, "loss": 3.8564, "step": 3130 }, { "epoch": 0.21300448430493274, "grad_norm": 1.7952288389205933, "learning_rate": 0.0004866872197309417, "loss": 3.6431, "step": 3135 }, { "epoch": 0.2133442043755945, "grad_norm": 0.6984266042709351, "learning_rate": 0.0004866659872265253, "loss": 3.563, "step": 3140 }, { "epoch": 0.21368392444625628, "grad_norm": 0.9820377230644226, "learning_rate": 0.000486644754722109, "loss": 3.734, "step": 3145 }, { "epoch": 0.21402364451691805, "grad_norm": 1.277506947517395, "learning_rate": 0.00048662352221769264, "loss": 3.2841, "step": 3150 }, { "epoch": 0.21436336458757985, "grad_norm": 0.8898820281028748, "learning_rate": 0.00048660228971327625, "loss": 3.665, "step": 3155 }, { "epoch": 0.21470308465824162, "grad_norm": 0.6653764843940735, "learning_rate": 0.0004865810572088599, "loss": 3.6531, "step": 3160 }, { "epoch": 0.2150428047289034, "grad_norm": 0.5920128226280212, "learning_rate": 0.00048655982470444353, "loss": 3.6532, "step": 3165 }, { "epoch": 0.21538252479956516, "grad_norm": 1.0638370513916016, "learning_rate": 0.00048653859220002714, "loss": 3.4259, "step": 3170 }, { "epoch": 0.21572224487022693, "grad_norm": 0.7301346063613892, "learning_rate": 0.00048651735969561087, "loss": 3.6647, "step": 3175 }, { "epoch": 0.2160619649408887, "grad_norm": 0.7654249668121338, "learning_rate": 0.0004864961271911945, "loss": 3.6831, "step": 3180 }, { "epoch": 0.2164016850115505, "grad_norm": 0.7753859758377075, "learning_rate": 0.0004864748946867781, "loss": 3.6969, "step": 3185 }, { "epoch": 0.21674140508221226, "grad_norm": 1.042198896408081, "learning_rate": 0.00048645366218236176, "loss": 3.8803, "step": 3190 }, { "epoch": 0.21708112515287403, "grad_norm": 0.6874271631240845, "learning_rate": 0.00048643242967794537, "loss": 3.6421, "step": 3195 }, { "epoch": 0.2174208452235358, "grad_norm": 0.731442928314209, "learning_rate": 0.000486411197173529, "loss": 3.7864, "step": 3200 }, { "epoch": 0.21776056529419757, "grad_norm": 0.8510879278182983, "learning_rate": 0.00048638996466911265, "loss": 3.8627, "step": 3205 }, { "epoch": 0.21810028536485934, "grad_norm": 0.7212687134742737, "learning_rate": 0.0004863687321646963, "loss": 3.8258, "step": 3210 }, { "epoch": 0.21844000543552114, "grad_norm": 0.8341764807701111, "learning_rate": 0.00048634749966027993, "loss": 3.6854, "step": 3215 }, { "epoch": 0.2187797255061829, "grad_norm": 0.802669882774353, "learning_rate": 0.0004863262671558636, "loss": 3.5651, "step": 3220 }, { "epoch": 0.21911944557684468, "grad_norm": 0.7741901278495789, "learning_rate": 0.0004863050346514472, "loss": 3.5404, "step": 3225 }, { "epoch": 0.21945916564750645, "grad_norm": 1.0574969053268433, "learning_rate": 0.0004862838021470308, "loss": 3.6772, "step": 3230 }, { "epoch": 0.21979888571816822, "grad_norm": 0.8207195997238159, "learning_rate": 0.0004862625696426145, "loss": 3.6011, "step": 3235 }, { "epoch": 0.22013860578883002, "grad_norm": 0.8084278702735901, "learning_rate": 0.0004862413371381981, "loss": 3.6064, "step": 3240 }, { "epoch": 0.22047832585949179, "grad_norm": 0.6928495764732361, "learning_rate": 0.0004862201046337818, "loss": 3.5457, "step": 3245 }, { "epoch": 0.22081804593015356, "grad_norm": 0.7987727522850037, "learning_rate": 0.00048619887212936544, "loss": 3.6952, "step": 3250 }, { "epoch": 0.22115776600081533, "grad_norm": 0.679037868976593, "learning_rate": 0.00048617763962494905, "loss": 3.5564, "step": 3255 }, { "epoch": 0.2214974860714771, "grad_norm": 0.7609912157058716, "learning_rate": 0.0004861564071205327, "loss": 3.7163, "step": 3260 }, { "epoch": 0.22183720614213887, "grad_norm": 0.6696768403053284, "learning_rate": 0.00048613517461611633, "loss": 3.6897, "step": 3265 }, { "epoch": 0.22217692621280066, "grad_norm": 0.8879420161247253, "learning_rate": 0.00048611394211169995, "loss": 3.7869, "step": 3270 }, { "epoch": 0.22251664628346243, "grad_norm": 0.8048788905143738, "learning_rate": 0.0004860927096072836, "loss": 3.7734, "step": 3275 }, { "epoch": 0.2228563663541242, "grad_norm": 0.6440518498420715, "learning_rate": 0.0004860714771028673, "loss": 3.5731, "step": 3280 }, { "epoch": 0.22319608642478597, "grad_norm": 2.0527195930480957, "learning_rate": 0.0004860502445984509, "loss": 3.913, "step": 3285 }, { "epoch": 0.22353580649544774, "grad_norm": 1.093522071838379, "learning_rate": 0.00048602901209403456, "loss": 3.6741, "step": 3290 }, { "epoch": 0.22387552656610954, "grad_norm": 0.9258810877799988, "learning_rate": 0.00048600777958961817, "loss": 3.3879, "step": 3295 }, { "epoch": 0.2242152466367713, "grad_norm": 0.8076410293579102, "learning_rate": 0.0004859865470852018, "loss": 3.5226, "step": 3300 }, { "epoch": 0.22455496670743308, "grad_norm": 0.7001386880874634, "learning_rate": 0.00048596531458078545, "loss": 3.8246, "step": 3305 }, { "epoch": 0.22489468677809485, "grad_norm": 0.7621698975563049, "learning_rate": 0.00048594408207636907, "loss": 3.3878, "step": 3310 }, { "epoch": 0.22523440684875662, "grad_norm": 1.5306898355484009, "learning_rate": 0.00048592284957195273, "loss": 3.4739, "step": 3315 }, { "epoch": 0.2255741269194184, "grad_norm": 0.8297240734100342, "learning_rate": 0.0004859016170675364, "loss": 3.554, "step": 3320 }, { "epoch": 0.22591384699008019, "grad_norm": 0.7946100234985352, "learning_rate": 0.00048588038456312, "loss": 3.5967, "step": 3325 }, { "epoch": 0.22625356706074196, "grad_norm": 0.6882669925689697, "learning_rate": 0.0004858591520587036, "loss": 3.7607, "step": 3330 }, { "epoch": 0.22659328713140373, "grad_norm": 0.7105723023414612, "learning_rate": 0.0004858379195542873, "loss": 3.5037, "step": 3335 }, { "epoch": 0.2269330072020655, "grad_norm": 1.238961100578308, "learning_rate": 0.0004858166870498709, "loss": 3.3841, "step": 3340 }, { "epoch": 0.22727272727272727, "grad_norm": 0.798149824142456, "learning_rate": 0.0004857954545454545, "loss": 3.6754, "step": 3345 }, { "epoch": 0.22761244734338903, "grad_norm": 0.7487467527389526, "learning_rate": 0.00048577422204103824, "loss": 3.7449, "step": 3350 }, { "epoch": 0.22795216741405083, "grad_norm": 0.9356561899185181, "learning_rate": 0.00048575298953662185, "loss": 3.5337, "step": 3355 }, { "epoch": 0.2282918874847126, "grad_norm": 0.6142210364341736, "learning_rate": 0.00048573175703220547, "loss": 3.6479, "step": 3360 }, { "epoch": 0.22863160755537437, "grad_norm": 0.7560938000679016, "learning_rate": 0.00048571052452778913, "loss": 3.7783, "step": 3365 }, { "epoch": 0.22897132762603614, "grad_norm": 0.8618605732917786, "learning_rate": 0.00048568929202337275, "loss": 3.6776, "step": 3370 }, { "epoch": 0.2293110476966979, "grad_norm": 0.7192989587783813, "learning_rate": 0.00048566805951895636, "loss": 3.6851, "step": 3375 }, { "epoch": 0.2296507677673597, "grad_norm": 0.9149174690246582, "learning_rate": 0.00048564682701454, "loss": 3.5901, "step": 3380 }, { "epoch": 0.22999048783802148, "grad_norm": 0.7006362676620483, "learning_rate": 0.0004856255945101237, "loss": 3.6675, "step": 3385 }, { "epoch": 0.23033020790868325, "grad_norm": 0.6280655860900879, "learning_rate": 0.0004856043620057073, "loss": 3.5465, "step": 3390 }, { "epoch": 0.23066992797934502, "grad_norm": 0.7238929271697998, "learning_rate": 0.000485583129501291, "loss": 3.7762, "step": 3395 }, { "epoch": 0.2310096480500068, "grad_norm": 0.8637920022010803, "learning_rate": 0.0004855618969968746, "loss": 3.5099, "step": 3400 }, { "epoch": 0.23134936812066856, "grad_norm": 0.9875428080558777, "learning_rate": 0.0004855406644924582, "loss": 3.7442, "step": 3405 }, { "epoch": 0.23168908819133036, "grad_norm": 0.7269251346588135, "learning_rate": 0.00048551943198804187, "loss": 3.5832, "step": 3410 }, { "epoch": 0.23202880826199213, "grad_norm": 0.8643271923065186, "learning_rate": 0.0004854981994836255, "loss": 3.6639, "step": 3415 }, { "epoch": 0.2323685283326539, "grad_norm": 0.6912517547607422, "learning_rate": 0.00048547696697920915, "loss": 3.8002, "step": 3420 }, { "epoch": 0.23270824840331567, "grad_norm": 0.8600926995277405, "learning_rate": 0.0004854557344747928, "loss": 3.793, "step": 3425 }, { "epoch": 0.23304796847397743, "grad_norm": 0.7723545432090759, "learning_rate": 0.0004854345019703764, "loss": 3.7412, "step": 3430 }, { "epoch": 0.2333876885446392, "grad_norm": 0.6232338547706604, "learning_rate": 0.00048541326946596004, "loss": 3.7861, "step": 3435 }, { "epoch": 0.233727408615301, "grad_norm": 0.9125848412513733, "learning_rate": 0.0004853920369615437, "loss": 3.4142, "step": 3440 }, { "epoch": 0.23406712868596277, "grad_norm": 1.0677344799041748, "learning_rate": 0.0004853708044571273, "loss": 3.5497, "step": 3445 }, { "epoch": 0.23440684875662454, "grad_norm": 0.8621318340301514, "learning_rate": 0.00048534957195271093, "loss": 3.6456, "step": 3450 }, { "epoch": 0.2347465688272863, "grad_norm": 0.9317499995231628, "learning_rate": 0.00048532833944829465, "loss": 3.6702, "step": 3455 }, { "epoch": 0.23508628889794808, "grad_norm": 1.0324983596801758, "learning_rate": 0.00048530710694387827, "loss": 3.5273, "step": 3460 }, { "epoch": 0.23542600896860988, "grad_norm": 0.6323010325431824, "learning_rate": 0.0004852858744394619, "loss": 3.753, "step": 3465 }, { "epoch": 0.23576572903927165, "grad_norm": 0.8227546215057373, "learning_rate": 0.00048526464193504555, "loss": 3.5406, "step": 3470 }, { "epoch": 0.23610544910993342, "grad_norm": 0.9106676578521729, "learning_rate": 0.00048524340943062916, "loss": 3.5498, "step": 3475 }, { "epoch": 0.2364451691805952, "grad_norm": 0.9897709488868713, "learning_rate": 0.00048522217692621277, "loss": 3.7933, "step": 3480 }, { "epoch": 0.23678488925125696, "grad_norm": 0.769070029258728, "learning_rate": 0.00048520094442179644, "loss": 3.7391, "step": 3485 }, { "epoch": 0.23712460932191873, "grad_norm": 0.6835725903511047, "learning_rate": 0.0004851797119173801, "loss": 3.3834, "step": 3490 }, { "epoch": 0.23746432939258053, "grad_norm": 0.6993653774261475, "learning_rate": 0.0004851584794129637, "loss": 3.6203, "step": 3495 }, { "epoch": 0.2378040494632423, "grad_norm": 0.774703860282898, "learning_rate": 0.0004851372469085474, "loss": 3.814, "step": 3500 }, { "epoch": 0.23814376953390406, "grad_norm": 0.8045734763145447, "learning_rate": 0.000485116014404131, "loss": 3.7767, "step": 3505 }, { "epoch": 0.23848348960456583, "grad_norm": 0.9734402298927307, "learning_rate": 0.0004850947818997146, "loss": 3.5764, "step": 3510 }, { "epoch": 0.2388232096752276, "grad_norm": 0.9567587971687317, "learning_rate": 0.0004850735493952983, "loss": 3.6308, "step": 3515 }, { "epoch": 0.23916292974588937, "grad_norm": 0.8175399303436279, "learning_rate": 0.0004850523168908819, "loss": 3.5266, "step": 3520 }, { "epoch": 0.23950264981655117, "grad_norm": 0.7373331189155579, "learning_rate": 0.00048503108438646556, "loss": 3.7532, "step": 3525 }, { "epoch": 0.23984236988721294, "grad_norm": 0.7923753261566162, "learning_rate": 0.00048500985188204923, "loss": 3.6485, "step": 3530 }, { "epoch": 0.2401820899578747, "grad_norm": 0.8847214579582214, "learning_rate": 0.00048498861937763284, "loss": 3.3194, "step": 3535 }, { "epoch": 0.24052181002853648, "grad_norm": 1.0416938066482544, "learning_rate": 0.00048496738687321645, "loss": 3.6524, "step": 3540 }, { "epoch": 0.24086153009919825, "grad_norm": 0.92363041639328, "learning_rate": 0.0004849461543688001, "loss": 3.696, "step": 3545 }, { "epoch": 0.24120125016986005, "grad_norm": 0.8743930459022522, "learning_rate": 0.00048492492186438373, "loss": 3.8208, "step": 3550 }, { "epoch": 0.24154097024052182, "grad_norm": 0.6873185634613037, "learning_rate": 0.00048490368935996735, "loss": 3.7768, "step": 3555 }, { "epoch": 0.2418806903111836, "grad_norm": 0.9848188161849976, "learning_rate": 0.00048488245685555107, "loss": 3.4607, "step": 3560 }, { "epoch": 0.24222041038184536, "grad_norm": 0.8857882022857666, "learning_rate": 0.0004848612243511347, "loss": 3.6447, "step": 3565 }, { "epoch": 0.24256013045250713, "grad_norm": 0.8508902788162231, "learning_rate": 0.0004848399918467183, "loss": 3.3845, "step": 3570 }, { "epoch": 0.2428998505231689, "grad_norm": 0.8264239430427551, "learning_rate": 0.00048481875934230196, "loss": 3.7685, "step": 3575 }, { "epoch": 0.2432395705938307, "grad_norm": 0.7623831629753113, "learning_rate": 0.0004847975268378856, "loss": 3.7974, "step": 3580 }, { "epoch": 0.24357929066449246, "grad_norm": 0.6936399936676025, "learning_rate": 0.00048477629433346924, "loss": 3.8472, "step": 3585 }, { "epoch": 0.24391901073515423, "grad_norm": 0.8710620403289795, "learning_rate": 0.00048475506182905285, "loss": 3.506, "step": 3590 }, { "epoch": 0.244258730805816, "grad_norm": 0.7444889545440674, "learning_rate": 0.0004847338293246365, "loss": 3.6287, "step": 3595 }, { "epoch": 0.24459845087647777, "grad_norm": 0.8406118154525757, "learning_rate": 0.0004847125968202202, "loss": 3.591, "step": 3600 }, { "epoch": 0.24493817094713954, "grad_norm": 0.8526303172111511, "learning_rate": 0.0004846913643158038, "loss": 3.576, "step": 3605 }, { "epoch": 0.24527789101780134, "grad_norm": 1.0247424840927124, "learning_rate": 0.0004846701318113874, "loss": 3.6876, "step": 3610 }, { "epoch": 0.2456176110884631, "grad_norm": 0.6543959975242615, "learning_rate": 0.0004846488993069711, "loss": 3.7592, "step": 3615 }, { "epoch": 0.24595733115912488, "grad_norm": 0.7236579060554504, "learning_rate": 0.0004846276668025547, "loss": 3.429, "step": 3620 }, { "epoch": 0.24629705122978665, "grad_norm": 0.9447727203369141, "learning_rate": 0.0004846064342981383, "loss": 3.5872, "step": 3625 }, { "epoch": 0.24663677130044842, "grad_norm": 0.7765605449676514, "learning_rate": 0.00048458520179372203, "loss": 3.5134, "step": 3630 }, { "epoch": 0.24697649137111022, "grad_norm": 0.8949222564697266, "learning_rate": 0.00048456396928930564, "loss": 3.7557, "step": 3635 }, { "epoch": 0.247316211441772, "grad_norm": 0.7536894083023071, "learning_rate": 0.00048454273678488925, "loss": 3.7475, "step": 3640 }, { "epoch": 0.24765593151243376, "grad_norm": 0.848987340927124, "learning_rate": 0.0004845215042804729, "loss": 3.8577, "step": 3645 }, { "epoch": 0.24799565158309553, "grad_norm": 0.7066874504089355, "learning_rate": 0.00048450027177605653, "loss": 3.6131, "step": 3650 }, { "epoch": 0.2483353716537573, "grad_norm": 0.9029439687728882, "learning_rate": 0.00048447903927164015, "loss": 3.4538, "step": 3655 }, { "epoch": 0.24867509172441907, "grad_norm": 0.7957408428192139, "learning_rate": 0.0004844578067672238, "loss": 3.5731, "step": 3660 }, { "epoch": 0.24901481179508086, "grad_norm": 0.7397556900978088, "learning_rate": 0.0004844365742628075, "loss": 3.5947, "step": 3665 }, { "epoch": 0.24935453186574263, "grad_norm": 0.8360456824302673, "learning_rate": 0.0004844153417583911, "loss": 3.6685, "step": 3670 }, { "epoch": 0.2496942519364044, "grad_norm": 0.9851311445236206, "learning_rate": 0.00048439410925397476, "loss": 3.7746, "step": 3675 }, { "epoch": 0.2500339720070662, "grad_norm": 0.6956314444541931, "learning_rate": 0.0004843728767495584, "loss": 3.6863, "step": 3680 }, { "epoch": 0.25037369207772797, "grad_norm": 0.7546476721763611, "learning_rate": 0.000484351644245142, "loss": 3.5604, "step": 3685 }, { "epoch": 0.2507134121483897, "grad_norm": 0.6995735764503479, "learning_rate": 0.00048433041174072565, "loss": 3.4909, "step": 3690 }, { "epoch": 0.2510531322190515, "grad_norm": 0.7551071047782898, "learning_rate": 0.00048430917923630927, "loss": 3.769, "step": 3695 }, { "epoch": 0.25139285228971325, "grad_norm": 0.8459709882736206, "learning_rate": 0.00048428794673189293, "loss": 3.7546, "step": 3700 }, { "epoch": 0.25173257236037505, "grad_norm": 0.7258584499359131, "learning_rate": 0.0004842667142274766, "loss": 3.7213, "step": 3705 }, { "epoch": 0.25207229243103685, "grad_norm": 0.8113131523132324, "learning_rate": 0.0004842454817230602, "loss": 3.6381, "step": 3710 }, { "epoch": 0.2524120125016986, "grad_norm": 0.98420250415802, "learning_rate": 0.00048422424921864383, "loss": 3.5861, "step": 3715 }, { "epoch": 0.2527517325723604, "grad_norm": 0.6205715537071228, "learning_rate": 0.0004842030167142275, "loss": 3.571, "step": 3720 }, { "epoch": 0.25309145264302213, "grad_norm": 0.6189042925834656, "learning_rate": 0.0004841817842098111, "loss": 3.6757, "step": 3725 }, { "epoch": 0.2534311727136839, "grad_norm": 0.893584132194519, "learning_rate": 0.0004841605517053947, "loss": 3.8547, "step": 3730 }, { "epoch": 0.2537708927843457, "grad_norm": 0.7845825552940369, "learning_rate": 0.00048413931920097844, "loss": 3.5841, "step": 3735 }, { "epoch": 0.25411061285500747, "grad_norm": 0.7666778564453125, "learning_rate": 0.00048411808669656205, "loss": 3.6682, "step": 3740 }, { "epoch": 0.25445033292566926, "grad_norm": 0.9131804704666138, "learning_rate": 0.00048409685419214567, "loss": 3.4879, "step": 3745 }, { "epoch": 0.254790052996331, "grad_norm": 0.7596086859703064, "learning_rate": 0.00048407562168772933, "loss": 3.4484, "step": 3750 }, { "epoch": 0.2551297730669928, "grad_norm": 0.9272613525390625, "learning_rate": 0.00048405438918331295, "loss": 3.8015, "step": 3755 }, { "epoch": 0.25546949313765455, "grad_norm": 0.8632339239120483, "learning_rate": 0.00048403315667889656, "loss": 3.5859, "step": 3760 }, { "epoch": 0.25580921320831634, "grad_norm": 0.6889895796775818, "learning_rate": 0.0004840119241744803, "loss": 3.6401, "step": 3765 }, { "epoch": 0.25614893327897814, "grad_norm": 0.9269962906837463, "learning_rate": 0.0004839906916700639, "loss": 3.6752, "step": 3770 }, { "epoch": 0.2564886533496399, "grad_norm": 0.6652761101722717, "learning_rate": 0.0004839694591656475, "loss": 3.5509, "step": 3775 }, { "epoch": 0.2568283734203017, "grad_norm": 0.9104167819023132, "learning_rate": 0.0004839482266612312, "loss": 3.7699, "step": 3780 }, { "epoch": 0.2571680934909634, "grad_norm": 0.7714758515357971, "learning_rate": 0.0004839269941568148, "loss": 3.6917, "step": 3785 }, { "epoch": 0.2575078135616252, "grad_norm": 1.1537338495254517, "learning_rate": 0.0004839057616523984, "loss": 3.7344, "step": 3790 }, { "epoch": 0.257847533632287, "grad_norm": 0.8503426313400269, "learning_rate": 0.00048388452914798207, "loss": 3.6791, "step": 3795 }, { "epoch": 0.25818725370294876, "grad_norm": 0.7337886691093445, "learning_rate": 0.00048386329664356574, "loss": 3.5419, "step": 3800 }, { "epoch": 0.25852697377361056, "grad_norm": 1.0000195503234863, "learning_rate": 0.00048384206413914935, "loss": 3.6312, "step": 3805 }, { "epoch": 0.2588666938442723, "grad_norm": 0.8565915822982788, "learning_rate": 0.000483820831634733, "loss": 3.6489, "step": 3810 }, { "epoch": 0.2592064139149341, "grad_norm": 0.7048497200012207, "learning_rate": 0.00048379959913031663, "loss": 3.6835, "step": 3815 }, { "epoch": 0.2595461339855959, "grad_norm": 0.8437478542327881, "learning_rate": 0.00048377836662590024, "loss": 3.5275, "step": 3820 }, { "epoch": 0.25988585405625764, "grad_norm": 1.184874176979065, "learning_rate": 0.0004837571341214839, "loss": 3.7383, "step": 3825 }, { "epoch": 0.26022557412691943, "grad_norm": 1.3638712167739868, "learning_rate": 0.0004837359016170675, "loss": 3.4631, "step": 3830 }, { "epoch": 0.2605652941975812, "grad_norm": 0.8119879961013794, "learning_rate": 0.0004837146691126512, "loss": 3.5254, "step": 3835 }, { "epoch": 0.260905014268243, "grad_norm": 0.7116072177886963, "learning_rate": 0.00048369343660823486, "loss": 3.6486, "step": 3840 }, { "epoch": 0.2612447343389047, "grad_norm": 0.7034022808074951, "learning_rate": 0.00048367220410381847, "loss": 3.8244, "step": 3845 }, { "epoch": 0.2615844544095665, "grad_norm": 0.8977922201156616, "learning_rate": 0.0004836509715994021, "loss": 3.7492, "step": 3850 }, { "epoch": 0.2619241744802283, "grad_norm": 0.8031010031700134, "learning_rate": 0.00048362973909498575, "loss": 3.4764, "step": 3855 }, { "epoch": 0.26226389455089005, "grad_norm": 0.913953423500061, "learning_rate": 0.00048360850659056936, "loss": 3.731, "step": 3860 }, { "epoch": 0.26260361462155185, "grad_norm": 0.7621992230415344, "learning_rate": 0.000483587274086153, "loss": 3.589, "step": 3865 }, { "epoch": 0.2629433346922136, "grad_norm": 0.7147590517997742, "learning_rate": 0.0004835660415817367, "loss": 3.3976, "step": 3870 }, { "epoch": 0.2632830547628754, "grad_norm": 0.7871402502059937, "learning_rate": 0.0004835448090773203, "loss": 3.6492, "step": 3875 }, { "epoch": 0.2636227748335372, "grad_norm": 0.7609280347824097, "learning_rate": 0.0004835235765729039, "loss": 3.7411, "step": 3880 }, { "epoch": 0.26396249490419893, "grad_norm": 0.7509335279464722, "learning_rate": 0.0004835023440684876, "loss": 3.6565, "step": 3885 }, { "epoch": 0.2643022149748607, "grad_norm": 0.6095419526100159, "learning_rate": 0.0004834811115640712, "loss": 3.7334, "step": 3890 }, { "epoch": 0.26464193504552247, "grad_norm": 0.7659920454025269, "learning_rate": 0.0004834598790596548, "loss": 3.8384, "step": 3895 }, { "epoch": 0.26498165511618427, "grad_norm": 0.793908953666687, "learning_rate": 0.0004834386465552385, "loss": 3.4143, "step": 3900 }, { "epoch": 0.26532137518684606, "grad_norm": 0.8057821989059448, "learning_rate": 0.00048341741405082215, "loss": 3.9054, "step": 3905 }, { "epoch": 0.2656610952575078, "grad_norm": 0.9020659327507019, "learning_rate": 0.00048339618154640576, "loss": 3.5106, "step": 3910 }, { "epoch": 0.2660008153281696, "grad_norm": 0.8447952270507812, "learning_rate": 0.00048337494904198943, "loss": 3.8232, "step": 3915 }, { "epoch": 0.26634053539883135, "grad_norm": 0.8326380848884583, "learning_rate": 0.00048335371653757304, "loss": 3.8444, "step": 3920 }, { "epoch": 0.26668025546949314, "grad_norm": 0.8360494375228882, "learning_rate": 0.0004833324840331567, "loss": 3.6693, "step": 3925 }, { "epoch": 0.2670199755401549, "grad_norm": 0.691567063331604, "learning_rate": 0.0004833112515287403, "loss": 3.3521, "step": 3930 }, { "epoch": 0.2673596956108167, "grad_norm": 0.6922131180763245, "learning_rate": 0.00048329001902432393, "loss": 3.8021, "step": 3935 }, { "epoch": 0.2676994156814785, "grad_norm": 0.642247200012207, "learning_rate": 0.00048326878651990766, "loss": 3.7556, "step": 3940 }, { "epoch": 0.2680391357521402, "grad_norm": 0.8216326832771301, "learning_rate": 0.00048324755401549127, "loss": 3.4531, "step": 3945 }, { "epoch": 0.268378855822802, "grad_norm": 0.7916221618652344, "learning_rate": 0.0004832263215110749, "loss": 3.9269, "step": 3950 }, { "epoch": 0.26871857589346376, "grad_norm": 0.7789226770401001, "learning_rate": 0.00048320508900665855, "loss": 3.3995, "step": 3955 }, { "epoch": 0.26905829596412556, "grad_norm": 0.6534916162490845, "learning_rate": 0.00048318385650224216, "loss": 3.6836, "step": 3960 }, { "epoch": 0.26939801603478736, "grad_norm": 0.8812726140022278, "learning_rate": 0.0004831626239978258, "loss": 3.8118, "step": 3965 }, { "epoch": 0.2697377361054491, "grad_norm": 0.8422336578369141, "learning_rate": 0.00048314139149340944, "loss": 3.4812, "step": 3970 }, { "epoch": 0.2700774561761109, "grad_norm": 1.1279243230819702, "learning_rate": 0.0004831201589889931, "loss": 3.9252, "step": 3975 }, { "epoch": 0.27041717624677264, "grad_norm": 0.7473322749137878, "learning_rate": 0.0004830989264845767, "loss": 3.7224, "step": 3980 }, { "epoch": 0.27075689631743444, "grad_norm": 0.7213800549507141, "learning_rate": 0.0004830776939801604, "loss": 3.8375, "step": 3985 }, { "epoch": 0.27109661638809623, "grad_norm": 0.6523196697235107, "learning_rate": 0.000483056461475744, "loss": 3.6648, "step": 3990 }, { "epoch": 0.271436336458758, "grad_norm": 0.8023510575294495, "learning_rate": 0.0004830352289713276, "loss": 3.5617, "step": 3995 }, { "epoch": 0.2717760565294198, "grad_norm": 0.8581197261810303, "learning_rate": 0.0004830139964669113, "loss": 3.9142, "step": 4000 }, { "epoch": 0.2721157766000815, "grad_norm": 0.7087665796279907, "learning_rate": 0.0004829927639624949, "loss": 3.623, "step": 4005 }, { "epoch": 0.2724554966707433, "grad_norm": 0.6277762651443481, "learning_rate": 0.00048297153145807856, "loss": 3.5364, "step": 4010 }, { "epoch": 0.27279521674140506, "grad_norm": 0.8127524256706238, "learning_rate": 0.00048295029895366223, "loss": 3.9332, "step": 4015 }, { "epoch": 0.27313493681206685, "grad_norm": 0.760769248008728, "learning_rate": 0.00048292906644924584, "loss": 3.5127, "step": 4020 }, { "epoch": 0.27347465688272865, "grad_norm": 0.8011043667793274, "learning_rate": 0.00048290783394482946, "loss": 3.5923, "step": 4025 }, { "epoch": 0.2738143769533904, "grad_norm": 0.7419449090957642, "learning_rate": 0.0004828866014404131, "loss": 3.3976, "step": 4030 }, { "epoch": 0.2741540970240522, "grad_norm": 0.6964859366416931, "learning_rate": 0.00048286536893599674, "loss": 3.7356, "step": 4035 }, { "epoch": 0.27449381709471393, "grad_norm": 0.7274975180625916, "learning_rate": 0.00048284413643158035, "loss": 3.5262, "step": 4040 }, { "epoch": 0.27483353716537573, "grad_norm": 0.796216607093811, "learning_rate": 0.00048282290392716407, "loss": 3.8483, "step": 4045 }, { "epoch": 0.2751732572360375, "grad_norm": 0.9056466221809387, "learning_rate": 0.0004828016714227477, "loss": 3.7474, "step": 4050 }, { "epoch": 0.27551297730669927, "grad_norm": 0.6942521333694458, "learning_rate": 0.0004827804389183313, "loss": 3.6624, "step": 4055 }, { "epoch": 0.27585269737736107, "grad_norm": 0.8146530389785767, "learning_rate": 0.00048275920641391496, "loss": 3.61, "step": 4060 }, { "epoch": 0.2761924174480228, "grad_norm": 0.8415518999099731, "learning_rate": 0.0004827379739094986, "loss": 3.5323, "step": 4065 }, { "epoch": 0.2765321375186846, "grad_norm": 0.8651626706123352, "learning_rate": 0.0004827167414050822, "loss": 3.4547, "step": 4070 }, { "epoch": 0.2768718575893464, "grad_norm": 0.7274301052093506, "learning_rate": 0.00048269550890066586, "loss": 3.723, "step": 4075 }, { "epoch": 0.27721157766000815, "grad_norm": 0.6925188899040222, "learning_rate": 0.0004826742763962495, "loss": 3.6677, "step": 4080 }, { "epoch": 0.27755129773066994, "grad_norm": 0.9260522723197937, "learning_rate": 0.00048265304389183314, "loss": 3.3891, "step": 4085 }, { "epoch": 0.2778910178013317, "grad_norm": 0.705966591835022, "learning_rate": 0.0004826318113874168, "loss": 3.5809, "step": 4090 }, { "epoch": 0.2782307378719935, "grad_norm": 0.9213706254959106, "learning_rate": 0.0004826105788830004, "loss": 3.6467, "step": 4095 }, { "epoch": 0.2785704579426552, "grad_norm": 0.8179945945739746, "learning_rate": 0.00048258934637858403, "loss": 3.7312, "step": 4100 }, { "epoch": 0.278910178013317, "grad_norm": 0.6742517948150635, "learning_rate": 0.0004825681138741677, "loss": 3.4855, "step": 4105 }, { "epoch": 0.2792498980839788, "grad_norm": 0.7688722014427185, "learning_rate": 0.0004825468813697513, "loss": 3.7083, "step": 4110 }, { "epoch": 0.27958961815464056, "grad_norm": 0.748856782913208, "learning_rate": 0.000482525648865335, "loss": 3.6585, "step": 4115 }, { "epoch": 0.27992933822530236, "grad_norm": 0.6517595648765564, "learning_rate": 0.00048250441636091864, "loss": 3.6445, "step": 4120 }, { "epoch": 0.2802690582959641, "grad_norm": 0.804399847984314, "learning_rate": 0.00048248318385650226, "loss": 3.4798, "step": 4125 }, { "epoch": 0.2806087783666259, "grad_norm": 0.7903982400894165, "learning_rate": 0.00048246195135208587, "loss": 3.5769, "step": 4130 }, { "epoch": 0.2809484984372877, "grad_norm": 0.6469139456748962, "learning_rate": 0.00048244071884766954, "loss": 3.516, "step": 4135 }, { "epoch": 0.28128821850794944, "grad_norm": 0.6604697704315186, "learning_rate": 0.00048241948634325315, "loss": 3.569, "step": 4140 }, { "epoch": 0.28162793857861124, "grad_norm": 0.7627045512199402, "learning_rate": 0.00048239825383883676, "loss": 3.6477, "step": 4145 }, { "epoch": 0.281967658649273, "grad_norm": 0.9339775443077087, "learning_rate": 0.0004823770213344205, "loss": 3.9106, "step": 4150 }, { "epoch": 0.2823073787199348, "grad_norm": 0.7784827947616577, "learning_rate": 0.0004823557888300041, "loss": 3.7843, "step": 4155 }, { "epoch": 0.2826470987905966, "grad_norm": 0.8077957630157471, "learning_rate": 0.0004823345563255877, "loss": 3.7732, "step": 4160 }, { "epoch": 0.2829868188612583, "grad_norm": 0.7772462964057922, "learning_rate": 0.0004823133238211714, "loss": 3.4964, "step": 4165 }, { "epoch": 0.2833265389319201, "grad_norm": 0.6485260128974915, "learning_rate": 0.000482292091316755, "loss": 3.5484, "step": 4170 }, { "epoch": 0.28366625900258186, "grad_norm": 0.7990649342536926, "learning_rate": 0.0004822708588123386, "loss": 3.6797, "step": 4175 }, { "epoch": 0.28400597907324365, "grad_norm": 0.9598472714424133, "learning_rate": 0.00048224962630792227, "loss": 3.3642, "step": 4180 }, { "epoch": 0.2843456991439054, "grad_norm": 0.8991593718528748, "learning_rate": 0.00048222839380350594, "loss": 3.7384, "step": 4185 }, { "epoch": 0.2846854192145672, "grad_norm": 0.9232274889945984, "learning_rate": 0.00048220716129908955, "loss": 3.3407, "step": 4190 }, { "epoch": 0.285025139285229, "grad_norm": 0.826651930809021, "learning_rate": 0.0004821859287946732, "loss": 3.6956, "step": 4195 }, { "epoch": 0.28536485935589073, "grad_norm": 0.714325487613678, "learning_rate": 0.00048216469629025683, "loss": 3.6187, "step": 4200 }, { "epoch": 0.28570457942655253, "grad_norm": 0.6865731477737427, "learning_rate": 0.00048214346378584044, "loss": 3.6388, "step": 4205 }, { "epoch": 0.28604429949721427, "grad_norm": 0.8726068139076233, "learning_rate": 0.0004821222312814241, "loss": 3.6868, "step": 4210 }, { "epoch": 0.28638401956787607, "grad_norm": 0.8011283278465271, "learning_rate": 0.0004821009987770077, "loss": 3.6926, "step": 4215 }, { "epoch": 0.28672373963853787, "grad_norm": 0.8238186836242676, "learning_rate": 0.0004820797662725914, "loss": 3.8608, "step": 4220 }, { "epoch": 0.2870634597091996, "grad_norm": 0.8819950819015503, "learning_rate": 0.00048205853376817506, "loss": 3.8327, "step": 4225 }, { "epoch": 0.2874031797798614, "grad_norm": 0.789667546749115, "learning_rate": 0.00048203730126375867, "loss": 3.7689, "step": 4230 }, { "epoch": 0.28774289985052315, "grad_norm": 0.6413083672523499, "learning_rate": 0.0004820160687593423, "loss": 3.4773, "step": 4235 }, { "epoch": 0.28808261992118495, "grad_norm": 0.7390180826187134, "learning_rate": 0.00048199483625492595, "loss": 3.7863, "step": 4240 }, { "epoch": 0.28842233999184674, "grad_norm": 0.7742720246315002, "learning_rate": 0.00048197360375050956, "loss": 3.7631, "step": 4245 }, { "epoch": 0.2887620600625085, "grad_norm": 0.8900217413902283, "learning_rate": 0.0004819523712460932, "loss": 3.5438, "step": 4250 }, { "epoch": 0.2891017801331703, "grad_norm": 0.6175942420959473, "learning_rate": 0.0004819311387416769, "loss": 3.4446, "step": 4255 }, { "epoch": 0.289441500203832, "grad_norm": 0.9142879843711853, "learning_rate": 0.0004819099062372605, "loss": 3.8941, "step": 4260 }, { "epoch": 0.2897812202744938, "grad_norm": 0.8873006701469421, "learning_rate": 0.0004818886737328442, "loss": 3.5537, "step": 4265 }, { "epoch": 0.29012094034515556, "grad_norm": 0.7806867957115173, "learning_rate": 0.0004818674412284278, "loss": 3.6231, "step": 4270 }, { "epoch": 0.29046066041581736, "grad_norm": 0.8718128800392151, "learning_rate": 0.0004818462087240114, "loss": 3.689, "step": 4275 }, { "epoch": 0.29080038048647916, "grad_norm": 1.3413289785385132, "learning_rate": 0.00048182497621959507, "loss": 3.5595, "step": 4280 }, { "epoch": 0.2911401005571409, "grad_norm": 0.6965761780738831, "learning_rate": 0.00048180374371517874, "loss": 3.8374, "step": 4285 }, { "epoch": 0.2914798206278027, "grad_norm": 1.26924729347229, "learning_rate": 0.00048178251121076235, "loss": 3.5019, "step": 4290 }, { "epoch": 0.29181954069846444, "grad_norm": 0.8461579084396362, "learning_rate": 0.000481761278706346, "loss": 3.8744, "step": 4295 }, { "epoch": 0.29215926076912624, "grad_norm": 0.7742342948913574, "learning_rate": 0.00048174004620192963, "loss": 3.5087, "step": 4300 }, { "epoch": 0.29249898083978804, "grad_norm": 0.874099612236023, "learning_rate": 0.00048171881369751324, "loss": 3.3987, "step": 4305 }, { "epoch": 0.2928387009104498, "grad_norm": 0.8618926405906677, "learning_rate": 0.0004816975811930969, "loss": 3.7872, "step": 4310 }, { "epoch": 0.2931784209811116, "grad_norm": 0.9116448760032654, "learning_rate": 0.0004816763486886805, "loss": 3.4683, "step": 4315 }, { "epoch": 0.2935181410517733, "grad_norm": 0.8707708716392517, "learning_rate": 0.0004816551161842642, "loss": 3.6371, "step": 4320 }, { "epoch": 0.2938578611224351, "grad_norm": 6.824081897735596, "learning_rate": 0.00048163388367984786, "loss": 3.5199, "step": 4325 }, { "epoch": 0.2941975811930969, "grad_norm": 0.8056143522262573, "learning_rate": 0.00048161265117543147, "loss": 3.6584, "step": 4330 }, { "epoch": 0.29453730126375866, "grad_norm": 0.7707433104515076, "learning_rate": 0.0004815914186710151, "loss": 3.7459, "step": 4335 }, { "epoch": 0.29487702133442045, "grad_norm": 1.0196235179901123, "learning_rate": 0.00048157018616659875, "loss": 3.5936, "step": 4340 }, { "epoch": 0.2952167414050822, "grad_norm": 0.6951677203178406, "learning_rate": 0.00048154895366218236, "loss": 3.1561, "step": 4345 }, { "epoch": 0.295556461475744, "grad_norm": 0.8316264748573303, "learning_rate": 0.000481527721157766, "loss": 3.5745, "step": 4350 }, { "epoch": 0.29589618154640573, "grad_norm": 0.7993782758712769, "learning_rate": 0.0004815064886533497, "loss": 3.6649, "step": 4355 }, { "epoch": 0.29623590161706753, "grad_norm": 0.7784118056297302, "learning_rate": 0.0004814852561489333, "loss": 3.4844, "step": 4360 }, { "epoch": 0.29657562168772933, "grad_norm": 0.7311197519302368, "learning_rate": 0.0004814640236445169, "loss": 3.8244, "step": 4365 }, { "epoch": 0.29691534175839107, "grad_norm": 0.66410231590271, "learning_rate": 0.0004814427911401006, "loss": 3.7066, "step": 4370 }, { "epoch": 0.29725506182905287, "grad_norm": 0.8942224383354187, "learning_rate": 0.0004814215586356842, "loss": 3.4701, "step": 4375 }, { "epoch": 0.2975947818997146, "grad_norm": 1.1050529479980469, "learning_rate": 0.0004814003261312678, "loss": 3.6214, "step": 4380 }, { "epoch": 0.2979345019703764, "grad_norm": 0.6967983841896057, "learning_rate": 0.0004813790936268515, "loss": 3.6533, "step": 4385 }, { "epoch": 0.2982742220410382, "grad_norm": 1.0326764583587646, "learning_rate": 0.00048135786112243515, "loss": 3.8269, "step": 4390 }, { "epoch": 0.29861394211169995, "grad_norm": 1.0148727893829346, "learning_rate": 0.00048133662861801876, "loss": 3.8164, "step": 4395 }, { "epoch": 0.29895366218236175, "grad_norm": 0.8085352778434753, "learning_rate": 0.00048131539611360243, "loss": 3.5522, "step": 4400 }, { "epoch": 0.2992933822530235, "grad_norm": 0.9734207391738892, "learning_rate": 0.00048129416360918604, "loss": 3.5972, "step": 4405 }, { "epoch": 0.2996331023236853, "grad_norm": 0.8657002449035645, "learning_rate": 0.00048127293110476966, "loss": 3.8044, "step": 4410 }, { "epoch": 0.2999728223943471, "grad_norm": 0.9213194251060486, "learning_rate": 0.0004812516986003533, "loss": 3.7888, "step": 4415 }, { "epoch": 0.3003125424650088, "grad_norm": 0.9707303643226624, "learning_rate": 0.00048123046609593694, "loss": 3.645, "step": 4420 }, { "epoch": 0.3006522625356706, "grad_norm": 0.6366541385650635, "learning_rate": 0.0004812092335915206, "loss": 3.8108, "step": 4425 }, { "epoch": 0.30099198260633236, "grad_norm": 0.8692352175712585, "learning_rate": 0.00048118800108710427, "loss": 3.9025, "step": 4430 }, { "epoch": 0.30133170267699416, "grad_norm": 1.0752865076065063, "learning_rate": 0.0004811667685826879, "loss": 3.9031, "step": 4435 }, { "epoch": 0.3016714227476559, "grad_norm": 0.9077790975570679, "learning_rate": 0.0004811455360782715, "loss": 3.5453, "step": 4440 }, { "epoch": 0.3020111428183177, "grad_norm": 0.7622203826904297, "learning_rate": 0.00048112430357385516, "loss": 3.6509, "step": 4445 }, { "epoch": 0.3023508628889795, "grad_norm": 0.7324275374412537, "learning_rate": 0.0004811030710694388, "loss": 3.6734, "step": 4450 }, { "epoch": 0.30269058295964124, "grad_norm": 0.6474744081497192, "learning_rate": 0.0004810818385650224, "loss": 3.397, "step": 4455 }, { "epoch": 0.30303030303030304, "grad_norm": 0.7048881649971008, "learning_rate": 0.0004810606060606061, "loss": 3.5326, "step": 4460 }, { "epoch": 0.3033700231009648, "grad_norm": 0.7620943188667297, "learning_rate": 0.0004810393735561897, "loss": 3.6118, "step": 4465 }, { "epoch": 0.3037097431716266, "grad_norm": 0.719811737537384, "learning_rate": 0.00048101814105177334, "loss": 3.4042, "step": 4470 }, { "epoch": 0.3040494632422884, "grad_norm": 0.7009125351905823, "learning_rate": 0.000480996908547357, "loss": 3.6511, "step": 4475 }, { "epoch": 0.3043891833129501, "grad_norm": 0.8082932233810425, "learning_rate": 0.0004809756760429406, "loss": 3.8209, "step": 4480 }, { "epoch": 0.3047289033836119, "grad_norm": 0.6715877652168274, "learning_rate": 0.00048095444353852423, "loss": 3.5106, "step": 4485 }, { "epoch": 0.30506862345427366, "grad_norm": 0.6869513988494873, "learning_rate": 0.0004809332110341079, "loss": 3.9377, "step": 4490 }, { "epoch": 0.30540834352493546, "grad_norm": 1.1212314367294312, "learning_rate": 0.00048091197852969156, "loss": 3.6315, "step": 4495 }, { "epoch": 0.30574806359559725, "grad_norm": 0.8123836517333984, "learning_rate": 0.0004808907460252752, "loss": 3.7075, "step": 4500 }, { "epoch": 0.306087783666259, "grad_norm": 1.3415272235870361, "learning_rate": 0.00048086951352085884, "loss": 3.5013, "step": 4505 }, { "epoch": 0.3064275037369208, "grad_norm": 0.8197923898696899, "learning_rate": 0.00048084828101644246, "loss": 3.9872, "step": 4510 }, { "epoch": 0.30676722380758253, "grad_norm": 0.5721219182014465, "learning_rate": 0.00048082704851202607, "loss": 3.7569, "step": 4515 }, { "epoch": 0.30710694387824433, "grad_norm": 0.8310877680778503, "learning_rate": 0.00048080581600760974, "loss": 3.6973, "step": 4520 }, { "epoch": 0.3074466639489061, "grad_norm": 0.8872714042663574, "learning_rate": 0.00048078458350319335, "loss": 3.484, "step": 4525 }, { "epoch": 0.30778638401956787, "grad_norm": 1.09501051902771, "learning_rate": 0.000480763350998777, "loss": 3.7298, "step": 4530 }, { "epoch": 0.30812610409022967, "grad_norm": 0.8060715198516846, "learning_rate": 0.0004807421184943607, "loss": 3.4286, "step": 4535 }, { "epoch": 0.3084658241608914, "grad_norm": 0.8077582120895386, "learning_rate": 0.0004807208859899443, "loss": 3.5842, "step": 4540 }, { "epoch": 0.3088055442315532, "grad_norm": 1.4224853515625, "learning_rate": 0.0004806996534855279, "loss": 3.3354, "step": 4545 }, { "epoch": 0.30914526430221495, "grad_norm": 0.734748363494873, "learning_rate": 0.0004806784209811116, "loss": 3.4981, "step": 4550 }, { "epoch": 0.30948498437287675, "grad_norm": 0.6556529402732849, "learning_rate": 0.0004806571884766952, "loss": 3.7627, "step": 4555 }, { "epoch": 0.30982470444353855, "grad_norm": 0.9104928374290466, "learning_rate": 0.0004806359559722788, "loss": 3.6878, "step": 4560 }, { "epoch": 0.3101644245142003, "grad_norm": 0.8796806931495667, "learning_rate": 0.0004806147234678625, "loss": 3.7418, "step": 4565 }, { "epoch": 0.3105041445848621, "grad_norm": 0.8270484805107117, "learning_rate": 0.00048059349096344614, "loss": 3.7091, "step": 4570 }, { "epoch": 0.3108438646555238, "grad_norm": 0.7395673394203186, "learning_rate": 0.00048057225845902975, "loss": 3.45, "step": 4575 }, { "epoch": 0.3111835847261856, "grad_norm": 0.9820281267166138, "learning_rate": 0.0004805510259546134, "loss": 3.9158, "step": 4580 }, { "epoch": 0.3115233047968474, "grad_norm": 0.7750831842422485, "learning_rate": 0.00048052979345019703, "loss": 3.6638, "step": 4585 }, { "epoch": 0.31186302486750916, "grad_norm": 0.8329160809516907, "learning_rate": 0.00048050856094578064, "loss": 3.6631, "step": 4590 }, { "epoch": 0.31220274493817096, "grad_norm": 0.755611777305603, "learning_rate": 0.0004804873284413643, "loss": 3.6861, "step": 4595 }, { "epoch": 0.3125424650088327, "grad_norm": 0.7526485919952393, "learning_rate": 0.000480466095936948, "loss": 3.7472, "step": 4600 }, { "epoch": 0.3128821850794945, "grad_norm": 0.6721206903457642, "learning_rate": 0.00048044486343253165, "loss": 3.7505, "step": 4605 }, { "epoch": 0.3132219051501563, "grad_norm": 0.9076430201530457, "learning_rate": 0.00048042363092811526, "loss": 3.5978, "step": 4610 }, { "epoch": 0.31356162522081804, "grad_norm": 0.6644553542137146, "learning_rate": 0.00048040239842369887, "loss": 3.6, "step": 4615 }, { "epoch": 0.31390134529147984, "grad_norm": 0.7938026785850525, "learning_rate": 0.00048038116591928254, "loss": 3.8869, "step": 4620 }, { "epoch": 0.3142410653621416, "grad_norm": 0.7472580671310425, "learning_rate": 0.00048035993341486615, "loss": 3.6961, "step": 4625 }, { "epoch": 0.3145807854328034, "grad_norm": 0.649684727191925, "learning_rate": 0.00048033870091044976, "loss": 3.4756, "step": 4630 }, { "epoch": 0.3149205055034651, "grad_norm": 7.001923561096191, "learning_rate": 0.0004803174684060335, "loss": 3.5824, "step": 4635 }, { "epoch": 0.3152602255741269, "grad_norm": 0.7479925155639648, "learning_rate": 0.0004802962359016171, "loss": 3.7975, "step": 4640 }, { "epoch": 0.3155999456447887, "grad_norm": 0.7013113498687744, "learning_rate": 0.0004802750033972007, "loss": 3.6365, "step": 4645 }, { "epoch": 0.31593966571545046, "grad_norm": 0.6994428038597107, "learning_rate": 0.0004802537708927844, "loss": 3.6275, "step": 4650 }, { "epoch": 0.31627938578611225, "grad_norm": 1.0238444805145264, "learning_rate": 0.000480232538388368, "loss": 3.5469, "step": 4655 }, { "epoch": 0.316619105856774, "grad_norm": 0.7033113837242126, "learning_rate": 0.0004802113058839516, "loss": 3.8777, "step": 4660 }, { "epoch": 0.3169588259274358, "grad_norm": 0.9958891868591309, "learning_rate": 0.00048019007337953527, "loss": 3.6526, "step": 4665 }, { "epoch": 0.3172985459980976, "grad_norm": 0.8302141427993774, "learning_rate": 0.00048016884087511894, "loss": 3.8786, "step": 4670 }, { "epoch": 0.31763826606875933, "grad_norm": 0.7919300198554993, "learning_rate": 0.00048014760837070255, "loss": 3.6957, "step": 4675 }, { "epoch": 0.31797798613942113, "grad_norm": 0.7482283115386963, "learning_rate": 0.0004801263758662862, "loss": 3.5539, "step": 4680 }, { "epoch": 0.3183177062100829, "grad_norm": 1.7307075262069702, "learning_rate": 0.00048010514336186983, "loss": 3.772, "step": 4685 }, { "epoch": 0.31865742628074467, "grad_norm": 0.7306472659111023, "learning_rate": 0.00048008391085745345, "loss": 3.7914, "step": 4690 }, { "epoch": 0.31899714635140647, "grad_norm": 0.668624758720398, "learning_rate": 0.0004800626783530371, "loss": 3.7515, "step": 4695 }, { "epoch": 0.3193368664220682, "grad_norm": 0.7254446148872375, "learning_rate": 0.0004800414458486207, "loss": 3.6622, "step": 4700 }, { "epoch": 0.31967658649273, "grad_norm": 0.8652665615081787, "learning_rate": 0.0004800202133442044, "loss": 3.5185, "step": 4705 }, { "epoch": 0.32001630656339175, "grad_norm": 0.8680751323699951, "learning_rate": 0.00047999898083978806, "loss": 3.5405, "step": 4710 }, { "epoch": 0.32035602663405355, "grad_norm": 0.741763710975647, "learning_rate": 0.00047997774833537167, "loss": 3.6582, "step": 4715 }, { "epoch": 0.3206957467047153, "grad_norm": 0.7348916530609131, "learning_rate": 0.0004799565158309553, "loss": 3.7903, "step": 4720 }, { "epoch": 0.3210354667753771, "grad_norm": 0.6636877655982971, "learning_rate": 0.00047993528332653895, "loss": 3.7656, "step": 4725 }, { "epoch": 0.3213751868460389, "grad_norm": 0.7589821815490723, "learning_rate": 0.00047991405082212257, "loss": 3.5279, "step": 4730 }, { "epoch": 0.3217149069167006, "grad_norm": 0.7472919821739197, "learning_rate": 0.0004798928183177062, "loss": 3.2931, "step": 4735 }, { "epoch": 0.3220546269873624, "grad_norm": 0.9180131554603577, "learning_rate": 0.0004798715858132899, "loss": 3.5294, "step": 4740 }, { "epoch": 0.32239434705802417, "grad_norm": 0.7130644917488098, "learning_rate": 0.0004798503533088735, "loss": 3.433, "step": 4745 }, { "epoch": 0.32273406712868596, "grad_norm": 0.7428444027900696, "learning_rate": 0.0004798291208044571, "loss": 3.5556, "step": 4750 }, { "epoch": 0.32307378719934776, "grad_norm": 0.8839291334152222, "learning_rate": 0.0004798078883000408, "loss": 3.7989, "step": 4755 }, { "epoch": 0.3234135072700095, "grad_norm": 3.2159390449523926, "learning_rate": 0.0004797866557956244, "loss": 3.4385, "step": 4760 }, { "epoch": 0.3237532273406713, "grad_norm": 0.8282374739646912, "learning_rate": 0.000479765423291208, "loss": 3.6886, "step": 4765 }, { "epoch": 0.32409294741133304, "grad_norm": 0.8508703112602234, "learning_rate": 0.0004797441907867917, "loss": 3.6907, "step": 4770 }, { "epoch": 0.32443266748199484, "grad_norm": 0.6721722483634949, "learning_rate": 0.00047972295828237535, "loss": 3.511, "step": 4775 }, { "epoch": 0.32477238755265664, "grad_norm": 0.683131992816925, "learning_rate": 0.00047970172577795897, "loss": 3.7288, "step": 4780 }, { "epoch": 0.3251121076233184, "grad_norm": 0.7878881096839905, "learning_rate": 0.00047968049327354263, "loss": 3.6337, "step": 4785 }, { "epoch": 0.3254518276939802, "grad_norm": 0.7755454778671265, "learning_rate": 0.00047965926076912625, "loss": 3.4191, "step": 4790 }, { "epoch": 0.3257915477646419, "grad_norm": 0.7246046662330627, "learning_rate": 0.00047963802826470986, "loss": 3.6869, "step": 4795 }, { "epoch": 0.3261312678353037, "grad_norm": 0.8967646360397339, "learning_rate": 0.0004796167957602935, "loss": 3.4094, "step": 4800 }, { "epoch": 0.32647098790596546, "grad_norm": 1.1424338817596436, "learning_rate": 0.00047959556325587714, "loss": 3.4955, "step": 4805 }, { "epoch": 0.32681070797662726, "grad_norm": 0.796363353729248, "learning_rate": 0.0004795743307514608, "loss": 3.868, "step": 4810 }, { "epoch": 0.32715042804728905, "grad_norm": 0.9569473266601562, "learning_rate": 0.0004795530982470445, "loss": 3.408, "step": 4815 }, { "epoch": 0.3274901481179508, "grad_norm": 0.9653334021568298, "learning_rate": 0.0004795318657426281, "loss": 3.6839, "step": 4820 }, { "epoch": 0.3278298681886126, "grad_norm": 0.6405031085014343, "learning_rate": 0.0004795106332382117, "loss": 3.7346, "step": 4825 }, { "epoch": 0.32816958825927434, "grad_norm": 0.856133759021759, "learning_rate": 0.00047948940073379537, "loss": 3.6768, "step": 4830 }, { "epoch": 0.32850930832993613, "grad_norm": 0.8333121538162231, "learning_rate": 0.000479468168229379, "loss": 3.6077, "step": 4835 }, { "epoch": 0.32884902840059793, "grad_norm": 0.7456134557723999, "learning_rate": 0.0004794469357249626, "loss": 3.8651, "step": 4840 }, { "epoch": 0.3291887484712597, "grad_norm": 1.8287484645843506, "learning_rate": 0.0004794257032205463, "loss": 3.7255, "step": 4845 }, { "epoch": 0.32952846854192147, "grad_norm": 0.9683613777160645, "learning_rate": 0.0004794044707161299, "loss": 3.6639, "step": 4850 }, { "epoch": 0.3298681886125832, "grad_norm": 1.0168418884277344, "learning_rate": 0.00047938323821171354, "loss": 3.7501, "step": 4855 }, { "epoch": 0.330207908683245, "grad_norm": 0.9115661978721619, "learning_rate": 0.0004793620057072972, "loss": 3.3368, "step": 4860 }, { "epoch": 0.3305476287539068, "grad_norm": 0.8951810598373413, "learning_rate": 0.0004793407732028808, "loss": 3.7791, "step": 4865 }, { "epoch": 0.33088734882456855, "grad_norm": 0.7897176146507263, "learning_rate": 0.00047931954069846443, "loss": 3.8778, "step": 4870 }, { "epoch": 0.33122706889523035, "grad_norm": 0.7435681223869324, "learning_rate": 0.00047929830819404815, "loss": 3.3613, "step": 4875 }, { "epoch": 0.3315667889658921, "grad_norm": 0.8922587633132935, "learning_rate": 0.00047927707568963177, "loss": 3.322, "step": 4880 }, { "epoch": 0.3319065090365539, "grad_norm": 0.6422141790390015, "learning_rate": 0.0004792558431852154, "loss": 3.785, "step": 4885 }, { "epoch": 0.33224622910721563, "grad_norm": 0.9997301697731018, "learning_rate": 0.00047923461068079905, "loss": 3.6997, "step": 4890 }, { "epoch": 0.3325859491778774, "grad_norm": 0.6974994540214539, "learning_rate": 0.00047921337817638266, "loss": 3.9152, "step": 4895 }, { "epoch": 0.3329256692485392, "grad_norm": 0.8423810601234436, "learning_rate": 0.00047919214567196627, "loss": 3.6421, "step": 4900 }, { "epoch": 0.33326538931920097, "grad_norm": 1.04576575756073, "learning_rate": 0.00047917091316754994, "loss": 3.5639, "step": 4905 }, { "epoch": 0.33360510938986276, "grad_norm": 0.7683984041213989, "learning_rate": 0.0004791496806631336, "loss": 3.759, "step": 4910 }, { "epoch": 0.3339448294605245, "grad_norm": 0.9783744215965271, "learning_rate": 0.0004791284481587172, "loss": 3.729, "step": 4915 }, { "epoch": 0.3342845495311863, "grad_norm": 3.6808738708496094, "learning_rate": 0.0004791072156543009, "loss": 3.4735, "step": 4920 }, { "epoch": 0.3346242696018481, "grad_norm": 0.7737945318222046, "learning_rate": 0.0004790859831498845, "loss": 3.7283, "step": 4925 }, { "epoch": 0.33496398967250984, "grad_norm": 0.9220679998397827, "learning_rate": 0.0004790647506454681, "loss": 4.0945, "step": 4930 }, { "epoch": 0.33530370974317164, "grad_norm": 0.9171364307403564, "learning_rate": 0.0004790435181410518, "loss": 3.6421, "step": 4935 }, { "epoch": 0.3356434298138334, "grad_norm": 0.6645728945732117, "learning_rate": 0.0004790222856366354, "loss": 3.5152, "step": 4940 }, { "epoch": 0.3359831498844952, "grad_norm": 0.6760745048522949, "learning_rate": 0.0004790010531322191, "loss": 3.7532, "step": 4945 }, { "epoch": 0.336322869955157, "grad_norm": 0.8458096385002136, "learning_rate": 0.0004789798206278027, "loss": 3.6828, "step": 4950 }, { "epoch": 0.3366625900258187, "grad_norm": 0.6048572063446045, "learning_rate": 0.00047895858812338634, "loss": 3.7422, "step": 4955 }, { "epoch": 0.3370023100964805, "grad_norm": 0.6547191143035889, "learning_rate": 0.00047893735561897, "loss": 3.4891, "step": 4960 }, { "epoch": 0.33734203016714226, "grad_norm": 0.5992053747177124, "learning_rate": 0.0004789161231145536, "loss": 3.6281, "step": 4965 }, { "epoch": 0.33768175023780406, "grad_norm": 1.1365339756011963, "learning_rate": 0.00047889489061013723, "loss": 3.4569, "step": 4970 }, { "epoch": 0.3380214703084658, "grad_norm": 0.901555061340332, "learning_rate": 0.0004788736581057209, "loss": 3.5849, "step": 4975 }, { "epoch": 0.3383611903791276, "grad_norm": 1.0830271244049072, "learning_rate": 0.00047885242560130457, "loss": 3.677, "step": 4980 }, { "epoch": 0.3387009104497894, "grad_norm": 0.7836243510246277, "learning_rate": 0.0004788311930968882, "loss": 3.619, "step": 4985 }, { "epoch": 0.33904063052045114, "grad_norm": 1.1091669797897339, "learning_rate": 0.00047880996059247185, "loss": 3.6675, "step": 4990 }, { "epoch": 0.33938035059111293, "grad_norm": 0.7148205041885376, "learning_rate": 0.00047878872808805546, "loss": 3.8451, "step": 4995 }, { "epoch": 0.3397200706617747, "grad_norm": 0.8236558437347412, "learning_rate": 0.0004787674955836391, "loss": 3.5847, "step": 5000 }, { "epoch": 0.3400597907324365, "grad_norm": 0.7701603770256042, "learning_rate": 0.00047874626307922274, "loss": 3.7919, "step": 5005 }, { "epoch": 0.34039951080309827, "grad_norm": 0.9239928126335144, "learning_rate": 0.00047872503057480635, "loss": 3.5013, "step": 5010 }, { "epoch": 0.34073923087376, "grad_norm": 0.7570794224739075, "learning_rate": 0.00047870379807039, "loss": 3.4755, "step": 5015 }, { "epoch": 0.3410789509444218, "grad_norm": 0.9240731596946716, "learning_rate": 0.0004786825655659737, "loss": 3.4095, "step": 5020 }, { "epoch": 0.34141867101508355, "grad_norm": 0.8503997921943665, "learning_rate": 0.0004786613330615573, "loss": 3.4124, "step": 5025 }, { "epoch": 0.34175839108574535, "grad_norm": 0.927055299282074, "learning_rate": 0.0004786401005571409, "loss": 3.5115, "step": 5030 }, { "epoch": 0.34209811115640715, "grad_norm": 0.8962202072143555, "learning_rate": 0.0004786188680527246, "loss": 3.4231, "step": 5035 }, { "epoch": 0.3424378312270689, "grad_norm": 1.2349814176559448, "learning_rate": 0.0004785976355483082, "loss": 3.4782, "step": 5040 }, { "epoch": 0.3427775512977307, "grad_norm": 0.8308776021003723, "learning_rate": 0.0004785764030438918, "loss": 3.6806, "step": 5045 }, { "epoch": 0.34311727136839243, "grad_norm": 0.9998466968536377, "learning_rate": 0.00047855517053947553, "loss": 3.7318, "step": 5050 }, { "epoch": 0.3434569914390542, "grad_norm": 0.5975036025047302, "learning_rate": 0.00047853393803505914, "loss": 3.9021, "step": 5055 }, { "epoch": 0.34379671150971597, "grad_norm": 0.8478381633758545, "learning_rate": 0.00047851270553064275, "loss": 3.4273, "step": 5060 }, { "epoch": 0.34413643158037777, "grad_norm": 0.7502886056900024, "learning_rate": 0.0004784914730262264, "loss": 3.3833, "step": 5065 }, { "epoch": 0.34447615165103956, "grad_norm": 0.6711516976356506, "learning_rate": 0.00047847024052181003, "loss": 3.5518, "step": 5070 }, { "epoch": 0.3448158717217013, "grad_norm": 0.7953229546546936, "learning_rate": 0.00047844900801739365, "loss": 3.5321, "step": 5075 }, { "epoch": 0.3451555917923631, "grad_norm": 0.9766865372657776, "learning_rate": 0.0004784277755129773, "loss": 3.6692, "step": 5080 }, { "epoch": 0.34549531186302485, "grad_norm": 1.037738561630249, "learning_rate": 0.000478406543008561, "loss": 3.4833, "step": 5085 }, { "epoch": 0.34583503193368664, "grad_norm": 1.0632696151733398, "learning_rate": 0.0004783853105041446, "loss": 3.418, "step": 5090 }, { "epoch": 0.34617475200434844, "grad_norm": 0.7830853462219238, "learning_rate": 0.00047836407799972826, "loss": 3.7494, "step": 5095 }, { "epoch": 0.3465144720750102, "grad_norm": 0.8010627627372742, "learning_rate": 0.0004783428454953119, "loss": 3.4206, "step": 5100 }, { "epoch": 0.346854192145672, "grad_norm": 1.7987987995147705, "learning_rate": 0.0004783216129908955, "loss": 3.5808, "step": 5105 }, { "epoch": 0.3471939122163337, "grad_norm": 0.7215706706047058, "learning_rate": 0.00047830038048647915, "loss": 3.6996, "step": 5110 }, { "epoch": 0.3475336322869955, "grad_norm": 0.8076146245002747, "learning_rate": 0.00047827914798206277, "loss": 3.7066, "step": 5115 }, { "epoch": 0.3478733523576573, "grad_norm": 0.8525733351707458, "learning_rate": 0.00047825791547764643, "loss": 3.5997, "step": 5120 }, { "epoch": 0.34821307242831906, "grad_norm": 0.7029271125793457, "learning_rate": 0.0004782366829732301, "loss": 3.6964, "step": 5125 }, { "epoch": 0.34855279249898086, "grad_norm": 0.6801184415817261, "learning_rate": 0.0004782154504688137, "loss": 3.434, "step": 5130 }, { "epoch": 0.3488925125696426, "grad_norm": 0.7289472818374634, "learning_rate": 0.00047819421796439733, "loss": 3.4855, "step": 5135 }, { "epoch": 0.3492322326403044, "grad_norm": 0.6491520404815674, "learning_rate": 0.000478172985459981, "loss": 3.5981, "step": 5140 }, { "epoch": 0.34957195271096614, "grad_norm": 0.8545582890510559, "learning_rate": 0.0004781517529555646, "loss": 3.3472, "step": 5145 }, { "epoch": 0.34991167278162794, "grad_norm": 0.7962835431098938, "learning_rate": 0.0004781305204511482, "loss": 3.7371, "step": 5150 }, { "epoch": 0.35025139285228973, "grad_norm": 0.6469713449478149, "learning_rate": 0.00047810928794673194, "loss": 3.4657, "step": 5155 }, { "epoch": 0.3505911129229515, "grad_norm": 1.5668987035751343, "learning_rate": 0.00047808805544231555, "loss": 3.7571, "step": 5160 }, { "epoch": 0.3509308329936133, "grad_norm": 0.7383027076721191, "learning_rate": 0.00047806682293789917, "loss": 3.5903, "step": 5165 }, { "epoch": 0.351270553064275, "grad_norm": 0.6920024752616882, "learning_rate": 0.00047804559043348283, "loss": 3.5563, "step": 5170 }, { "epoch": 0.3516102731349368, "grad_norm": 0.888149619102478, "learning_rate": 0.00047802435792906645, "loss": 3.3678, "step": 5175 }, { "epoch": 0.3519499932055986, "grad_norm": 1.1554219722747803, "learning_rate": 0.00047800312542465006, "loss": 3.7081, "step": 5180 }, { "epoch": 0.35228971327626035, "grad_norm": 1.0039440393447876, "learning_rate": 0.00047798189292023373, "loss": 3.5869, "step": 5185 }, { "epoch": 0.35262943334692215, "grad_norm": 0.7622624635696411, "learning_rate": 0.0004779606604158174, "loss": 3.7244, "step": 5190 }, { "epoch": 0.3529691534175839, "grad_norm": 0.7621424198150635, "learning_rate": 0.000477939427911401, "loss": 3.6328, "step": 5195 }, { "epoch": 0.3533088734882457, "grad_norm": 0.8672890663146973, "learning_rate": 0.0004779181954069847, "loss": 3.6794, "step": 5200 }, { "epoch": 0.3536485935589075, "grad_norm": 0.9563678503036499, "learning_rate": 0.0004778969629025683, "loss": 3.4722, "step": 5205 }, { "epoch": 0.35398831362956923, "grad_norm": 1.0506001710891724, "learning_rate": 0.0004778757303981519, "loss": 3.6712, "step": 5210 }, { "epoch": 0.354328033700231, "grad_norm": 0.7335140109062195, "learning_rate": 0.00047785449789373557, "loss": 3.7578, "step": 5215 }, { "epoch": 0.35466775377089277, "grad_norm": 0.9774301648139954, "learning_rate": 0.0004778332653893192, "loss": 3.7592, "step": 5220 }, { "epoch": 0.35500747384155457, "grad_norm": 0.8464666604995728, "learning_rate": 0.00047781203288490285, "loss": 3.5108, "step": 5225 }, { "epoch": 0.3553471939122163, "grad_norm": 0.6741433143615723, "learning_rate": 0.0004777908003804865, "loss": 3.5612, "step": 5230 }, { "epoch": 0.3556869139828781, "grad_norm": 1.0230048894882202, "learning_rate": 0.00047776956787607013, "loss": 3.3934, "step": 5235 }, { "epoch": 0.3560266340535399, "grad_norm": 0.6649307608604431, "learning_rate": 0.00047774833537165374, "loss": 3.6771, "step": 5240 }, { "epoch": 0.35636635412420165, "grad_norm": 0.8333790302276611, "learning_rate": 0.0004777271028672374, "loss": 3.4916, "step": 5245 }, { "epoch": 0.35670607419486344, "grad_norm": 0.6966885924339294, "learning_rate": 0.000477705870362821, "loss": 3.8448, "step": 5250 }, { "epoch": 0.3570457942655252, "grad_norm": 0.6850246787071228, "learning_rate": 0.00047768463785840463, "loss": 3.4364, "step": 5255 }, { "epoch": 0.357385514336187, "grad_norm": 0.8747521042823792, "learning_rate": 0.00047766340535398836, "loss": 3.5386, "step": 5260 }, { "epoch": 0.3577252344068488, "grad_norm": 0.7856281399726868, "learning_rate": 0.00047764217284957197, "loss": 3.2962, "step": 5265 }, { "epoch": 0.3580649544775105, "grad_norm": 2.386702060699463, "learning_rate": 0.0004776209403451556, "loss": 3.5126, "step": 5270 }, { "epoch": 0.3584046745481723, "grad_norm": 0.7956517338752747, "learning_rate": 0.00047759970784073925, "loss": 3.7578, "step": 5275 }, { "epoch": 0.35874439461883406, "grad_norm": 0.7211008071899414, "learning_rate": 0.00047757847533632286, "loss": 3.5301, "step": 5280 }, { "epoch": 0.35908411468949586, "grad_norm": 0.9479743838310242, "learning_rate": 0.00047755724283190653, "loss": 3.7322, "step": 5285 }, { "epoch": 0.35942383476015766, "grad_norm": 0.9992902874946594, "learning_rate": 0.00047753601032749014, "loss": 3.6554, "step": 5290 }, { "epoch": 0.3597635548308194, "grad_norm": 0.916776180267334, "learning_rate": 0.0004775147778230738, "loss": 3.8296, "step": 5295 }, { "epoch": 0.3601032749014812, "grad_norm": 0.753250241279602, "learning_rate": 0.0004774935453186575, "loss": 3.7738, "step": 5300 }, { "epoch": 0.36044299497214294, "grad_norm": 0.6998841762542725, "learning_rate": 0.0004774723128142411, "loss": 3.8227, "step": 5305 }, { "epoch": 0.36078271504280474, "grad_norm": 0.636833906173706, "learning_rate": 0.0004774510803098247, "loss": 3.7575, "step": 5310 }, { "epoch": 0.3611224351134665, "grad_norm": 1.2870655059814453, "learning_rate": 0.00047742984780540837, "loss": 3.9224, "step": 5315 }, { "epoch": 0.3614621551841283, "grad_norm": 0.8046035766601562, "learning_rate": 0.000477408615300992, "loss": 3.4816, "step": 5320 }, { "epoch": 0.3618018752547901, "grad_norm": 0.7784758806228638, "learning_rate": 0.0004773873827965756, "loss": 3.8732, "step": 5325 }, { "epoch": 0.3621415953254518, "grad_norm": 0.6650167107582092, "learning_rate": 0.0004773661502921593, "loss": 3.6379, "step": 5330 }, { "epoch": 0.3624813153961136, "grad_norm": 1.0664547681808472, "learning_rate": 0.00047734491778774293, "loss": 3.9329, "step": 5335 }, { "epoch": 0.36282103546677535, "grad_norm": 0.9676411151885986, "learning_rate": 0.00047732368528332654, "loss": 3.5466, "step": 5340 }, { "epoch": 0.36316075553743715, "grad_norm": 0.8718355298042297, "learning_rate": 0.0004773024527789102, "loss": 3.5097, "step": 5345 }, { "epoch": 0.36350047560809895, "grad_norm": 1.108154296875, "learning_rate": 0.0004772812202744938, "loss": 3.6627, "step": 5350 }, { "epoch": 0.3638401956787607, "grad_norm": 0.7122775912284851, "learning_rate": 0.00047725998777007743, "loss": 3.2819, "step": 5355 }, { "epoch": 0.3641799157494225, "grad_norm": 0.8560763597488403, "learning_rate": 0.0004772387552656611, "loss": 3.4582, "step": 5360 }, { "epoch": 0.36451963582008423, "grad_norm": 1.0967005491256714, "learning_rate": 0.00047721752276124477, "loss": 3.7311, "step": 5365 }, { "epoch": 0.36485935589074603, "grad_norm": 0.9255049824714661, "learning_rate": 0.0004771962902568284, "loss": 3.4128, "step": 5370 }, { "epoch": 0.3651990759614078, "grad_norm": 0.7785241603851318, "learning_rate": 0.00047717505775241205, "loss": 3.5722, "step": 5375 }, { "epoch": 0.36553879603206957, "grad_norm": 0.8435749411582947, "learning_rate": 0.00047715382524799566, "loss": 3.496, "step": 5380 }, { "epoch": 0.36587851610273137, "grad_norm": 0.7113354802131653, "learning_rate": 0.0004771325927435793, "loss": 3.7313, "step": 5385 }, { "epoch": 0.3662182361733931, "grad_norm": 0.863036036491394, "learning_rate": 0.00047711136023916294, "loss": 3.5264, "step": 5390 }, { "epoch": 0.3665579562440549, "grad_norm": 0.7676143050193787, "learning_rate": 0.00047709012773474655, "loss": 3.4508, "step": 5395 }, { "epoch": 0.36689767631471665, "grad_norm": 0.7856766581535339, "learning_rate": 0.0004770688952303302, "loss": 3.6534, "step": 5400 }, { "epoch": 0.36723739638537845, "grad_norm": 0.8559523224830627, "learning_rate": 0.0004770476627259139, "loss": 3.4381, "step": 5405 }, { "epoch": 0.36757711645604024, "grad_norm": 0.8145678639411926, "learning_rate": 0.0004770264302214975, "loss": 3.4954, "step": 5410 }, { "epoch": 0.367916836526702, "grad_norm": 0.7190720438957214, "learning_rate": 0.0004770051977170811, "loss": 3.6721, "step": 5415 }, { "epoch": 0.3682565565973638, "grad_norm": 0.7940743565559387, "learning_rate": 0.0004769839652126648, "loss": 3.5641, "step": 5420 }, { "epoch": 0.3685962766680255, "grad_norm": 1.0296911001205444, "learning_rate": 0.0004769627327082484, "loss": 3.41, "step": 5425 }, { "epoch": 0.3689359967386873, "grad_norm": 0.7094640731811523, "learning_rate": 0.000476941500203832, "loss": 3.6582, "step": 5430 }, { "epoch": 0.3692757168093491, "grad_norm": 0.8812994956970215, "learning_rate": 0.00047692026769941573, "loss": 3.5231, "step": 5435 }, { "epoch": 0.36961543688001086, "grad_norm": 0.8800371289253235, "learning_rate": 0.00047689903519499934, "loss": 3.4122, "step": 5440 }, { "epoch": 0.36995515695067266, "grad_norm": 0.7810664176940918, "learning_rate": 0.00047687780269058296, "loss": 3.6776, "step": 5445 }, { "epoch": 0.3702948770213344, "grad_norm": 0.9753801226615906, "learning_rate": 0.0004768565701861666, "loss": 3.4405, "step": 5450 }, { "epoch": 0.3706345970919962, "grad_norm": 0.8043168783187866, "learning_rate": 0.00047683533768175024, "loss": 3.7575, "step": 5455 }, { "epoch": 0.370974317162658, "grad_norm": 0.790428876876831, "learning_rate": 0.00047681410517733385, "loss": 3.5254, "step": 5460 }, { "epoch": 0.37131403723331974, "grad_norm": 1.0281344652175903, "learning_rate": 0.00047679287267291757, "loss": 3.5038, "step": 5465 }, { "epoch": 0.37165375730398154, "grad_norm": 1.029961347579956, "learning_rate": 0.0004767716401685012, "loss": 3.756, "step": 5470 }, { "epoch": 0.3719934773746433, "grad_norm": 0.7984892725944519, "learning_rate": 0.0004767504076640848, "loss": 3.7737, "step": 5475 }, { "epoch": 0.3723331974453051, "grad_norm": 0.7657549381256104, "learning_rate": 0.00047672917515966846, "loss": 3.7329, "step": 5480 }, { "epoch": 0.3726729175159668, "grad_norm": 0.8582470417022705, "learning_rate": 0.0004767079426552521, "loss": 3.9438, "step": 5485 }, { "epoch": 0.3730126375866286, "grad_norm": 0.9439883232116699, "learning_rate": 0.0004766867101508357, "loss": 3.54, "step": 5490 }, { "epoch": 0.3733523576572904, "grad_norm": 0.7983023524284363, "learning_rate": 0.00047666547764641936, "loss": 3.5908, "step": 5495 }, { "epoch": 0.37369207772795215, "grad_norm": 0.805391252040863, "learning_rate": 0.000476644245142003, "loss": 3.7542, "step": 5500 }, { "epoch": 0.37403179779861395, "grad_norm": 0.8234259486198425, "learning_rate": 0.00047662301263758664, "loss": 3.7628, "step": 5505 }, { "epoch": 0.3743715178692757, "grad_norm": 0.742196798324585, "learning_rate": 0.0004766017801331703, "loss": 3.4926, "step": 5510 }, { "epoch": 0.3747112379399375, "grad_norm": 0.965373694896698, "learning_rate": 0.0004765805476287539, "loss": 3.7153, "step": 5515 }, { "epoch": 0.3750509580105993, "grad_norm": 0.7976765036582947, "learning_rate": 0.00047655931512433753, "loss": 3.3965, "step": 5520 }, { "epoch": 0.37539067808126103, "grad_norm": 0.9751296043395996, "learning_rate": 0.0004765380826199212, "loss": 3.4577, "step": 5525 }, { "epoch": 0.37573039815192283, "grad_norm": 0.8272245526313782, "learning_rate": 0.0004765168501155048, "loss": 3.5354, "step": 5530 }, { "epoch": 0.37607011822258457, "grad_norm": 0.6712712645530701, "learning_rate": 0.0004764956176110885, "loss": 3.9714, "step": 5535 }, { "epoch": 0.37640983829324637, "grad_norm": 0.7778457403182983, "learning_rate": 0.00047647438510667214, "loss": 3.6563, "step": 5540 }, { "epoch": 0.37674955836390817, "grad_norm": 0.761876106262207, "learning_rate": 0.00047645315260225576, "loss": 3.9385, "step": 5545 }, { "epoch": 0.3770892784345699, "grad_norm": 0.6820372343063354, "learning_rate": 0.00047643192009783937, "loss": 3.8156, "step": 5550 }, { "epoch": 0.3774289985052317, "grad_norm": 0.874921977519989, "learning_rate": 0.00047641068759342304, "loss": 3.828, "step": 5555 }, { "epoch": 0.37776871857589345, "grad_norm": 1.0497303009033203, "learning_rate": 0.00047638945508900665, "loss": 3.7877, "step": 5560 }, { "epoch": 0.37810843864655524, "grad_norm": 0.8545297384262085, "learning_rate": 0.00047636822258459026, "loss": 3.5341, "step": 5565 }, { "epoch": 0.378448158717217, "grad_norm": 0.8650474548339844, "learning_rate": 0.000476346990080174, "loss": 3.6787, "step": 5570 }, { "epoch": 0.3787878787878788, "grad_norm": 0.7940987944602966, "learning_rate": 0.0004763257575757576, "loss": 3.3933, "step": 5575 }, { "epoch": 0.3791275988585406, "grad_norm": 0.8770955204963684, "learning_rate": 0.0004763045250713412, "loss": 3.4656, "step": 5580 }, { "epoch": 0.3794673189292023, "grad_norm": 1.191408395767212, "learning_rate": 0.0004762832925669249, "loss": 3.539, "step": 5585 }, { "epoch": 0.3798070389998641, "grad_norm": 0.7783030271530151, "learning_rate": 0.0004762620600625085, "loss": 3.3821, "step": 5590 }, { "epoch": 0.38014675907052586, "grad_norm": 0.8220301866531372, "learning_rate": 0.0004762408275580921, "loss": 3.5904, "step": 5595 }, { "epoch": 0.38048647914118766, "grad_norm": 0.7856647968292236, "learning_rate": 0.00047621959505367577, "loss": 3.7831, "step": 5600 }, { "epoch": 0.38082619921184946, "grad_norm": 0.8593004941940308, "learning_rate": 0.00047619836254925944, "loss": 3.6786, "step": 5605 }, { "epoch": 0.3811659192825112, "grad_norm": 0.8484439849853516, "learning_rate": 0.00047617713004484305, "loss": 3.4759, "step": 5610 }, { "epoch": 0.381505639353173, "grad_norm": 0.8383198380470276, "learning_rate": 0.0004761558975404267, "loss": 3.5473, "step": 5615 }, { "epoch": 0.38184535942383474, "grad_norm": 0.7296786904335022, "learning_rate": 0.00047613466503601033, "loss": 3.6181, "step": 5620 }, { "epoch": 0.38218507949449654, "grad_norm": 0.9584680199623108, "learning_rate": 0.000476113432531594, "loss": 3.5407, "step": 5625 }, { "epoch": 0.38252479956515834, "grad_norm": 0.9258373379707336, "learning_rate": 0.0004760922000271776, "loss": 3.7191, "step": 5630 }, { "epoch": 0.3828645196358201, "grad_norm": 0.8593915104866028, "learning_rate": 0.0004760709675227612, "loss": 3.4695, "step": 5635 }, { "epoch": 0.3832042397064819, "grad_norm": 0.8026646375656128, "learning_rate": 0.00047604973501834494, "loss": 3.6504, "step": 5640 }, { "epoch": 0.3835439597771436, "grad_norm": 0.8611423969268799, "learning_rate": 0.00047602850251392856, "loss": 3.6976, "step": 5645 }, { "epoch": 0.3838836798478054, "grad_norm": 0.8760219216346741, "learning_rate": 0.00047600727000951217, "loss": 3.5011, "step": 5650 }, { "epoch": 0.38422339991846716, "grad_norm": 0.8296730518341064, "learning_rate": 0.00047598603750509584, "loss": 3.7668, "step": 5655 }, { "epoch": 0.38456311998912895, "grad_norm": 0.7009175419807434, "learning_rate": 0.00047596480500067945, "loss": 3.5878, "step": 5660 }, { "epoch": 0.38490284005979075, "grad_norm": 0.885664701461792, "learning_rate": 0.00047594357249626306, "loss": 3.5655, "step": 5665 }, { "epoch": 0.3852425601304525, "grad_norm": 9.279609680175781, "learning_rate": 0.00047592233999184673, "loss": 3.4924, "step": 5670 }, { "epoch": 0.3855822802011143, "grad_norm": 0.8264347314834595, "learning_rate": 0.0004759011074874304, "loss": 3.5151, "step": 5675 }, { "epoch": 0.38592200027177603, "grad_norm": 0.8974717855453491, "learning_rate": 0.000475879874983014, "loss": 3.5534, "step": 5680 }, { "epoch": 0.38626172034243783, "grad_norm": 0.7704117894172668, "learning_rate": 0.0004758586424785977, "loss": 3.5225, "step": 5685 }, { "epoch": 0.38660144041309963, "grad_norm": 0.9731960892677307, "learning_rate": 0.0004758374099741813, "loss": 3.7821, "step": 5690 }, { "epoch": 0.38694116048376137, "grad_norm": 0.7608122229576111, "learning_rate": 0.0004758161774697649, "loss": 3.491, "step": 5695 }, { "epoch": 0.38728088055442317, "grad_norm": 0.895907998085022, "learning_rate": 0.00047579494496534857, "loss": 3.6496, "step": 5700 }, { "epoch": 0.3876206006250849, "grad_norm": 0.7859179377555847, "learning_rate": 0.0004757737124609322, "loss": 3.5479, "step": 5705 }, { "epoch": 0.3879603206957467, "grad_norm": 1.314813494682312, "learning_rate": 0.00047575247995651585, "loss": 3.6492, "step": 5710 }, { "epoch": 0.3883000407664085, "grad_norm": 0.7609358429908752, "learning_rate": 0.0004757312474520995, "loss": 3.6726, "step": 5715 }, { "epoch": 0.38863976083707025, "grad_norm": 0.8617343306541443, "learning_rate": 0.00047571001494768313, "loss": 3.6437, "step": 5720 }, { "epoch": 0.38897948090773204, "grad_norm": 0.8536332845687866, "learning_rate": 0.00047568878244326674, "loss": 3.6948, "step": 5725 }, { "epoch": 0.3893192009783938, "grad_norm": 0.8236358761787415, "learning_rate": 0.0004756675499388504, "loss": 3.6206, "step": 5730 }, { "epoch": 0.3896589210490556, "grad_norm": 1.0281975269317627, "learning_rate": 0.000475646317434434, "loss": 3.6226, "step": 5735 }, { "epoch": 0.3899986411197173, "grad_norm": 0.7688621282577515, "learning_rate": 0.00047562508493001764, "loss": 3.8616, "step": 5740 }, { "epoch": 0.3903383611903791, "grad_norm": 0.6764592528343201, "learning_rate": 0.00047560385242560136, "loss": 3.6185, "step": 5745 }, { "epoch": 0.3906780812610409, "grad_norm": 0.8856603503227234, "learning_rate": 0.00047558261992118497, "loss": 3.6486, "step": 5750 }, { "epoch": 0.39101780133170266, "grad_norm": 0.8634997606277466, "learning_rate": 0.0004755613874167686, "loss": 3.7112, "step": 5755 }, { "epoch": 0.39135752140236446, "grad_norm": 0.9516596794128418, "learning_rate": 0.00047554015491235225, "loss": 3.7641, "step": 5760 }, { "epoch": 0.3916972414730262, "grad_norm": 0.880127489566803, "learning_rate": 0.00047551892240793586, "loss": 3.4168, "step": 5765 }, { "epoch": 0.392036961543688, "grad_norm": 0.8090914487838745, "learning_rate": 0.0004754976899035195, "loss": 3.8077, "step": 5770 }, { "epoch": 0.3923766816143498, "grad_norm": 0.8414747714996338, "learning_rate": 0.00047547645739910314, "loss": 3.8144, "step": 5775 }, { "epoch": 0.39271640168501154, "grad_norm": 0.8610857725143433, "learning_rate": 0.0004754552248946868, "loss": 3.3962, "step": 5780 }, { "epoch": 0.39305612175567334, "grad_norm": 0.8228824138641357, "learning_rate": 0.0004754339923902704, "loss": 3.7453, "step": 5785 }, { "epoch": 0.3933958418263351, "grad_norm": 0.9328849911689758, "learning_rate": 0.0004754127598858541, "loss": 3.7705, "step": 5790 }, { "epoch": 0.3937355618969969, "grad_norm": 0.7857221961021423, "learning_rate": 0.0004753915273814377, "loss": 3.6358, "step": 5795 }, { "epoch": 0.3940752819676587, "grad_norm": 1.0740424394607544, "learning_rate": 0.0004753702948770213, "loss": 3.7843, "step": 5800 }, { "epoch": 0.3944150020383204, "grad_norm": 1.853467345237732, "learning_rate": 0.000475349062372605, "loss": 3.7607, "step": 5805 }, { "epoch": 0.3947547221089822, "grad_norm": 0.7358279824256897, "learning_rate": 0.0004753278298681886, "loss": 3.6111, "step": 5810 }, { "epoch": 0.39509444217964396, "grad_norm": 1.3832118511199951, "learning_rate": 0.00047530659736377226, "loss": 3.2591, "step": 5815 }, { "epoch": 0.39543416225030575, "grad_norm": 0.7932572960853577, "learning_rate": 0.00047528536485935593, "loss": 3.8234, "step": 5820 }, { "epoch": 0.3957738823209675, "grad_norm": 0.7916043996810913, "learning_rate": 0.00047526413235493954, "loss": 3.7114, "step": 5825 }, { "epoch": 0.3961136023916293, "grad_norm": 0.9452283382415771, "learning_rate": 0.00047524289985052316, "loss": 3.386, "step": 5830 }, { "epoch": 0.3964533224622911, "grad_norm": 0.8155869841575623, "learning_rate": 0.0004752216673461068, "loss": 3.7662, "step": 5835 }, { "epoch": 0.39679304253295283, "grad_norm": 0.7999407649040222, "learning_rate": 0.00047520043484169044, "loss": 3.5615, "step": 5840 }, { "epoch": 0.39713276260361463, "grad_norm": 0.8765337467193604, "learning_rate": 0.00047517920233727405, "loss": 3.5406, "step": 5845 }, { "epoch": 0.3974724826742764, "grad_norm": 0.8865240216255188, "learning_rate": 0.00047515796983285777, "loss": 3.5173, "step": 5850 }, { "epoch": 0.39781220274493817, "grad_norm": 0.7923928499221802, "learning_rate": 0.0004751367373284414, "loss": 3.6563, "step": 5855 }, { "epoch": 0.39815192281559997, "grad_norm": 0.6422514319419861, "learning_rate": 0.000475115504824025, "loss": 3.6685, "step": 5860 }, { "epoch": 0.3984916428862617, "grad_norm": 0.7002764344215393, "learning_rate": 0.00047509427231960866, "loss": 3.8302, "step": 5865 }, { "epoch": 0.3988313629569235, "grad_norm": 1.531689167022705, "learning_rate": 0.0004750730398151923, "loss": 3.5438, "step": 5870 }, { "epoch": 0.39917108302758525, "grad_norm": 0.8903918266296387, "learning_rate": 0.0004750518073107759, "loss": 3.3243, "step": 5875 }, { "epoch": 0.39951080309824705, "grad_norm": 0.9965741038322449, "learning_rate": 0.00047503057480635956, "loss": 3.6725, "step": 5880 }, { "epoch": 0.39985052316890884, "grad_norm": 0.8285887241363525, "learning_rate": 0.0004750093423019432, "loss": 3.8341, "step": 5885 }, { "epoch": 0.4001902432395706, "grad_norm": 0.7490633130073547, "learning_rate": 0.00047498810979752684, "loss": 3.657, "step": 5890 }, { "epoch": 0.4005299633102324, "grad_norm": 0.9193158745765686, "learning_rate": 0.0004749668772931105, "loss": 3.43, "step": 5895 }, { "epoch": 0.4008696833808941, "grad_norm": 1.714179277420044, "learning_rate": 0.0004749456447886941, "loss": 3.5626, "step": 5900 }, { "epoch": 0.4012094034515559, "grad_norm": 0.7238312363624573, "learning_rate": 0.00047492441228427773, "loss": 3.7095, "step": 5905 }, { "epoch": 0.40154912352221767, "grad_norm": 0.9134380221366882, "learning_rate": 0.0004749031797798614, "loss": 3.6802, "step": 5910 }, { "epoch": 0.40188884359287946, "grad_norm": 0.9745157957077026, "learning_rate": 0.000474881947275445, "loss": 3.7697, "step": 5915 }, { "epoch": 0.40222856366354126, "grad_norm": 0.9201056361198425, "learning_rate": 0.0004748607147710287, "loss": 3.8033, "step": 5920 }, { "epoch": 0.402568283734203, "grad_norm": 0.8083043694496155, "learning_rate": 0.00047483948226661234, "loss": 3.5749, "step": 5925 }, { "epoch": 0.4029080038048648, "grad_norm": 0.8412145376205444, "learning_rate": 0.00047481824976219596, "loss": 3.5601, "step": 5930 }, { "epoch": 0.40324772387552654, "grad_norm": 0.7263871431350708, "learning_rate": 0.00047479701725777957, "loss": 3.8005, "step": 5935 }, { "epoch": 0.40358744394618834, "grad_norm": 0.8157692551612854, "learning_rate": 0.00047477578475336324, "loss": 3.5469, "step": 5940 }, { "epoch": 0.40392716401685014, "grad_norm": 0.9168975353240967, "learning_rate": 0.00047475455224894685, "loss": 3.5608, "step": 5945 }, { "epoch": 0.4042668840875119, "grad_norm": 0.6916372179985046, "learning_rate": 0.00047473331974453046, "loss": 3.8657, "step": 5950 }, { "epoch": 0.4046066041581737, "grad_norm": 0.7986130714416504, "learning_rate": 0.0004747120872401142, "loss": 3.7443, "step": 5955 }, { "epoch": 0.4049463242288354, "grad_norm": 0.8563168048858643, "learning_rate": 0.0004746908547356978, "loss": 3.6174, "step": 5960 }, { "epoch": 0.4052860442994972, "grad_norm": 0.8278127312660217, "learning_rate": 0.00047466962223128147, "loss": 3.7343, "step": 5965 }, { "epoch": 0.405625764370159, "grad_norm": 0.9187147617340088, "learning_rate": 0.0004746483897268651, "loss": 3.9202, "step": 5970 }, { "epoch": 0.40596548444082076, "grad_norm": 0.8356426358222961, "learning_rate": 0.0004746271572224487, "loss": 3.3902, "step": 5975 }, { "epoch": 0.40630520451148255, "grad_norm": 1.0144851207733154, "learning_rate": 0.00047460592471803236, "loss": 3.5831, "step": 5980 }, { "epoch": 0.4066449245821443, "grad_norm": 0.7899532914161682, "learning_rate": 0.000474584692213616, "loss": 3.5183, "step": 5985 }, { "epoch": 0.4069846446528061, "grad_norm": 1.003927230834961, "learning_rate": 0.00047456345970919964, "loss": 3.911, "step": 5990 }, { "epoch": 0.40732436472346784, "grad_norm": 0.7329619526863098, "learning_rate": 0.0004745422272047833, "loss": 3.7198, "step": 5995 }, { "epoch": 0.40766408479412963, "grad_norm": 0.7401747703552246, "learning_rate": 0.0004745209947003669, "loss": 3.5257, "step": 6000 }, { "epoch": 0.40800380486479143, "grad_norm": 0.8684765696525574, "learning_rate": 0.00047449976219595053, "loss": 3.6844, "step": 6005 }, { "epoch": 0.4083435249354532, "grad_norm": 0.9315171241760254, "learning_rate": 0.0004744785296915342, "loss": 3.6216, "step": 6010 }, { "epoch": 0.40868324500611497, "grad_norm": 0.7699169516563416, "learning_rate": 0.0004744572971871178, "loss": 3.6149, "step": 6015 }, { "epoch": 0.4090229650767767, "grad_norm": 0.8939794301986694, "learning_rate": 0.0004744360646827015, "loss": 3.6785, "step": 6020 }, { "epoch": 0.4093626851474385, "grad_norm": 0.8668757677078247, "learning_rate": 0.00047441483217828515, "loss": 3.5086, "step": 6025 }, { "epoch": 0.4097024052181003, "grad_norm": 0.9104294180870056, "learning_rate": 0.00047439359967386876, "loss": 3.4609, "step": 6030 }, { "epoch": 0.41004212528876205, "grad_norm": 0.8542659878730774, "learning_rate": 0.00047437236716945237, "loss": 3.5432, "step": 6035 }, { "epoch": 0.41038184535942385, "grad_norm": 1.2275596857070923, "learning_rate": 0.00047435113466503604, "loss": 3.6086, "step": 6040 }, { "epoch": 0.4107215654300856, "grad_norm": 2.006124496459961, "learning_rate": 0.00047432990216061965, "loss": 3.5948, "step": 6045 }, { "epoch": 0.4110612855007474, "grad_norm": 0.9739130735397339, "learning_rate": 0.00047430866965620326, "loss": 3.6845, "step": 6050 }, { "epoch": 0.4114010055714092, "grad_norm": 0.7811376452445984, "learning_rate": 0.000474287437151787, "loss": 3.4589, "step": 6055 }, { "epoch": 0.4117407256420709, "grad_norm": 0.8545107841491699, "learning_rate": 0.0004742662046473706, "loss": 3.4876, "step": 6060 }, { "epoch": 0.4120804457127327, "grad_norm": 2.294248104095459, "learning_rate": 0.0004742449721429542, "loss": 3.7202, "step": 6065 }, { "epoch": 0.41242016578339447, "grad_norm": 0.9741974472999573, "learning_rate": 0.0004742237396385379, "loss": 3.9099, "step": 6070 }, { "epoch": 0.41275988585405626, "grad_norm": 0.8863165974617004, "learning_rate": 0.0004742025071341215, "loss": 3.1833, "step": 6075 }, { "epoch": 0.413099605924718, "grad_norm": 1.25752854347229, "learning_rate": 0.0004741812746297051, "loss": 3.7652, "step": 6080 }, { "epoch": 0.4134393259953798, "grad_norm": 1.0237845182418823, "learning_rate": 0.00047416004212528877, "loss": 3.551, "step": 6085 }, { "epoch": 0.4137790460660416, "grad_norm": 0.7842826843261719, "learning_rate": 0.00047413880962087244, "loss": 3.7703, "step": 6090 }, { "epoch": 0.41411876613670334, "grad_norm": 1.033345341682434, "learning_rate": 0.00047411757711645605, "loss": 3.5663, "step": 6095 }, { "epoch": 0.41445848620736514, "grad_norm": 0.7340295314788818, "learning_rate": 0.0004740963446120397, "loss": 3.531, "step": 6100 }, { "epoch": 0.4147982062780269, "grad_norm": 0.7569920420646667, "learning_rate": 0.00047407511210762333, "loss": 3.4984, "step": 6105 }, { "epoch": 0.4151379263486887, "grad_norm": 0.7893882989883423, "learning_rate": 0.00047405387960320694, "loss": 3.5742, "step": 6110 }, { "epoch": 0.4154776464193505, "grad_norm": 1.180941104888916, "learning_rate": 0.0004740326470987906, "loss": 3.5049, "step": 6115 }, { "epoch": 0.4158173664900122, "grad_norm": 0.841119110584259, "learning_rate": 0.0004740114145943742, "loss": 3.8498, "step": 6120 }, { "epoch": 0.416157086560674, "grad_norm": 0.808458149433136, "learning_rate": 0.0004739901820899579, "loss": 3.6421, "step": 6125 }, { "epoch": 0.41649680663133576, "grad_norm": 0.7173882126808167, "learning_rate": 0.00047396894958554156, "loss": 3.4565, "step": 6130 }, { "epoch": 0.41683652670199756, "grad_norm": 1.001106858253479, "learning_rate": 0.00047394771708112517, "loss": 3.5292, "step": 6135 }, { "epoch": 0.41717624677265935, "grad_norm": 0.7901259064674377, "learning_rate": 0.0004739264845767088, "loss": 3.6994, "step": 6140 }, { "epoch": 0.4175159668433211, "grad_norm": 0.8366681337356567, "learning_rate": 0.00047390525207229245, "loss": 3.7452, "step": 6145 }, { "epoch": 0.4178556869139829, "grad_norm": 1.2461090087890625, "learning_rate": 0.00047388401956787607, "loss": 3.7427, "step": 6150 }, { "epoch": 0.41819540698464464, "grad_norm": 0.8651979565620422, "learning_rate": 0.0004738627870634597, "loss": 3.4896, "step": 6155 }, { "epoch": 0.41853512705530643, "grad_norm": 0.7702454924583435, "learning_rate": 0.0004738415545590434, "loss": 3.6446, "step": 6160 }, { "epoch": 0.4188748471259682, "grad_norm": 0.8177877068519592, "learning_rate": 0.000473820322054627, "loss": 3.8589, "step": 6165 }, { "epoch": 0.41921456719663, "grad_norm": 0.901222825050354, "learning_rate": 0.0004737990895502106, "loss": 3.6644, "step": 6170 }, { "epoch": 0.41955428726729177, "grad_norm": 1.0574082136154175, "learning_rate": 0.0004737778570457943, "loss": 3.7551, "step": 6175 }, { "epoch": 0.4198940073379535, "grad_norm": 0.7880721688270569, "learning_rate": 0.0004737566245413779, "loss": 3.6154, "step": 6180 }, { "epoch": 0.4202337274086153, "grad_norm": 0.7653689980506897, "learning_rate": 0.0004737353920369615, "loss": 3.6044, "step": 6185 }, { "epoch": 0.42057344747927705, "grad_norm": 0.898750364780426, "learning_rate": 0.0004737141595325452, "loss": 3.4431, "step": 6190 }, { "epoch": 0.42091316754993885, "grad_norm": 0.6285688877105713, "learning_rate": 0.00047369292702812885, "loss": 3.6866, "step": 6195 }, { "epoch": 0.42125288762060065, "grad_norm": 0.7547000050544739, "learning_rate": 0.00047367169452371247, "loss": 3.5941, "step": 6200 }, { "epoch": 0.4215926076912624, "grad_norm": 0.932868242263794, "learning_rate": 0.00047365046201929613, "loss": 3.6271, "step": 6205 }, { "epoch": 0.4219323277619242, "grad_norm": 0.79076087474823, "learning_rate": 0.00047362922951487975, "loss": 3.5727, "step": 6210 }, { "epoch": 0.42227204783258593, "grad_norm": 0.9361346364021301, "learning_rate": 0.00047360799701046336, "loss": 3.4224, "step": 6215 }, { "epoch": 0.4226117679032477, "grad_norm": 0.7980191111564636, "learning_rate": 0.000473586764506047, "loss": 3.5981, "step": 6220 }, { "epoch": 0.4229514879739095, "grad_norm": 0.7762559652328491, "learning_rate": 0.00047356553200163064, "loss": 3.4929, "step": 6225 }, { "epoch": 0.42329120804457127, "grad_norm": 0.8038540482521057, "learning_rate": 0.0004735442994972143, "loss": 3.6, "step": 6230 }, { "epoch": 0.42363092811523306, "grad_norm": 0.7600985169410706, "learning_rate": 0.000473523066992798, "loss": 3.6565, "step": 6235 }, { "epoch": 0.4239706481858948, "grad_norm": 0.8075121641159058, "learning_rate": 0.0004735018344883816, "loss": 3.5085, "step": 6240 }, { "epoch": 0.4243103682565566, "grad_norm": 0.9684168696403503, "learning_rate": 0.0004734806019839652, "loss": 3.6085, "step": 6245 }, { "epoch": 0.42465008832721834, "grad_norm": 0.7998490929603577, "learning_rate": 0.00047345936947954887, "loss": 3.4333, "step": 6250 }, { "epoch": 0.42498980839788014, "grad_norm": 0.807165801525116, "learning_rate": 0.0004734381369751325, "loss": 3.6114, "step": 6255 }, { "epoch": 0.42532952846854194, "grad_norm": 0.952530026435852, "learning_rate": 0.0004734169044707161, "loss": 3.3047, "step": 6260 }, { "epoch": 0.4256692485392037, "grad_norm": 0.6746886372566223, "learning_rate": 0.0004733956719662998, "loss": 3.4235, "step": 6265 }, { "epoch": 0.4260089686098655, "grad_norm": 1.0420918464660645, "learning_rate": 0.0004733744394618834, "loss": 3.7808, "step": 6270 }, { "epoch": 0.4263486886805272, "grad_norm": 0.8014900088310242, "learning_rate": 0.00047335320695746704, "loss": 3.4453, "step": 6275 }, { "epoch": 0.426688408751189, "grad_norm": 1.307139277458191, "learning_rate": 0.0004733319744530507, "loss": 3.5684, "step": 6280 }, { "epoch": 0.4270281288218508, "grad_norm": 0.974854588508606, "learning_rate": 0.0004733107419486343, "loss": 3.734, "step": 6285 }, { "epoch": 0.42736784889251256, "grad_norm": 0.9062513709068298, "learning_rate": 0.00047328950944421793, "loss": 3.5523, "step": 6290 }, { "epoch": 0.42770756896317436, "grad_norm": 0.7424876093864441, "learning_rate": 0.0004732682769398016, "loss": 3.754, "step": 6295 }, { "epoch": 0.4280472890338361, "grad_norm": 0.7035219073295593, "learning_rate": 0.00047324704443538527, "loss": 3.64, "step": 6300 }, { "epoch": 0.4283870091044979, "grad_norm": 0.8318206667900085, "learning_rate": 0.00047322581193096893, "loss": 3.2769, "step": 6305 }, { "epoch": 0.4287267291751597, "grad_norm": 0.749793291091919, "learning_rate": 0.00047320457942655255, "loss": 3.9689, "step": 6310 }, { "epoch": 0.42906644924582144, "grad_norm": 0.7879555821418762, "learning_rate": 0.00047318334692213616, "loss": 3.7049, "step": 6315 }, { "epoch": 0.42940616931648323, "grad_norm": 0.8151434659957886, "learning_rate": 0.0004731621144177198, "loss": 3.5611, "step": 6320 }, { "epoch": 0.429745889387145, "grad_norm": 1.0580159425735474, "learning_rate": 0.00047314088191330344, "loss": 3.4972, "step": 6325 }, { "epoch": 0.4300856094578068, "grad_norm": 1.0412421226501465, "learning_rate": 0.00047311964940888705, "loss": 3.5584, "step": 6330 }, { "epoch": 0.4304253295284685, "grad_norm": 1.1151556968688965, "learning_rate": 0.0004730984169044708, "loss": 3.6602, "step": 6335 }, { "epoch": 0.4307650495991303, "grad_norm": 0.9003672003746033, "learning_rate": 0.0004730771844000544, "loss": 3.6042, "step": 6340 }, { "epoch": 0.4311047696697921, "grad_norm": 0.8656334280967712, "learning_rate": 0.000473055951895638, "loss": 3.4905, "step": 6345 }, { "epoch": 0.43144448974045385, "grad_norm": 0.9447208046913147, "learning_rate": 0.00047303471939122167, "loss": 3.5006, "step": 6350 }, { "epoch": 0.43178420981111565, "grad_norm": 0.8647967576980591, "learning_rate": 0.0004730134868868053, "loss": 3.5096, "step": 6355 }, { "epoch": 0.4321239298817774, "grad_norm": 0.8515398502349854, "learning_rate": 0.0004729922543823889, "loss": 3.3496, "step": 6360 }, { "epoch": 0.4324636499524392, "grad_norm": 0.7714278697967529, "learning_rate": 0.00047297102187797256, "loss": 3.6356, "step": 6365 }, { "epoch": 0.432803370023101, "grad_norm": 1.1053882837295532, "learning_rate": 0.0004729497893735562, "loss": 3.299, "step": 6370 }, { "epoch": 0.43314309009376273, "grad_norm": 0.9563339948654175, "learning_rate": 0.00047292855686913984, "loss": 3.4931, "step": 6375 }, { "epoch": 0.4334828101644245, "grad_norm": 0.948620080947876, "learning_rate": 0.0004729073243647235, "loss": 3.6568, "step": 6380 }, { "epoch": 0.43382253023508627, "grad_norm": 1.1028590202331543, "learning_rate": 0.0004728860918603071, "loss": 3.5788, "step": 6385 }, { "epoch": 0.43416225030574807, "grad_norm": 0.9642010927200317, "learning_rate": 0.00047286485935589073, "loss": 3.5412, "step": 6390 }, { "epoch": 0.43450197037640986, "grad_norm": 0.7737002968788147, "learning_rate": 0.0004728436268514744, "loss": 3.7834, "step": 6395 }, { "epoch": 0.4348416904470716, "grad_norm": 0.8719393014907837, "learning_rate": 0.000472822394347058, "loss": 3.5166, "step": 6400 }, { "epoch": 0.4351814105177334, "grad_norm": 0.7076466679573059, "learning_rate": 0.0004728011618426417, "loss": 3.622, "step": 6405 }, { "epoch": 0.43552113058839514, "grad_norm": 0.854110836982727, "learning_rate": 0.00047277992933822535, "loss": 3.4812, "step": 6410 }, { "epoch": 0.43586085065905694, "grad_norm": 0.9088481664657593, "learning_rate": 0.00047275869683380896, "loss": 3.5252, "step": 6415 }, { "epoch": 0.4362005707297187, "grad_norm": 0.7497720122337341, "learning_rate": 0.0004727374643293926, "loss": 3.5382, "step": 6420 }, { "epoch": 0.4365402908003805, "grad_norm": 0.8987573981285095, "learning_rate": 0.00047271623182497624, "loss": 3.4338, "step": 6425 }, { "epoch": 0.4368800108710423, "grad_norm": 0.8339664936065674, "learning_rate": 0.00047269499932055985, "loss": 3.7842, "step": 6430 }, { "epoch": 0.437219730941704, "grad_norm": 0.8107259273529053, "learning_rate": 0.00047267376681614347, "loss": 3.6638, "step": 6435 }, { "epoch": 0.4375594510123658, "grad_norm": 0.9555877447128296, "learning_rate": 0.0004726525343117272, "loss": 3.4981, "step": 6440 }, { "epoch": 0.43789917108302756, "grad_norm": 0.9899070262908936, "learning_rate": 0.0004726313018073108, "loss": 3.5735, "step": 6445 }, { "epoch": 0.43823889115368936, "grad_norm": 0.9385442137718201, "learning_rate": 0.0004726100693028944, "loss": 3.6586, "step": 6450 }, { "epoch": 0.43857861122435116, "grad_norm": 0.9405965805053711, "learning_rate": 0.0004725888367984781, "loss": 3.8092, "step": 6455 }, { "epoch": 0.4389183312950129, "grad_norm": 0.795551598072052, "learning_rate": 0.0004725676042940617, "loss": 3.429, "step": 6460 }, { "epoch": 0.4392580513656747, "grad_norm": 0.845453679561615, "learning_rate": 0.0004725463717896453, "loss": 3.5537, "step": 6465 }, { "epoch": 0.43959777143633644, "grad_norm": 0.9188764691352844, "learning_rate": 0.000472525139285229, "loss": 3.393, "step": 6470 }, { "epoch": 0.43993749150699824, "grad_norm": 0.8043771982192993, "learning_rate": 0.00047250390678081264, "loss": 3.8282, "step": 6475 }, { "epoch": 0.44027721157766003, "grad_norm": 0.7572502493858337, "learning_rate": 0.00047248267427639625, "loss": 3.8014, "step": 6480 }, { "epoch": 0.4406169316483218, "grad_norm": 0.8111985325813293, "learning_rate": 0.0004724614417719799, "loss": 3.4054, "step": 6485 }, { "epoch": 0.44095665171898357, "grad_norm": 0.8538852334022522, "learning_rate": 0.00047244020926756353, "loss": 3.8321, "step": 6490 }, { "epoch": 0.4412963717896453, "grad_norm": 2.6499388217926025, "learning_rate": 0.00047241897676314715, "loss": 3.5462, "step": 6495 }, { "epoch": 0.4416360918603071, "grad_norm": 1.1338897943496704, "learning_rate": 0.0004723977442587308, "loss": 3.8032, "step": 6500 }, { "epoch": 0.4419758119309689, "grad_norm": 1.0297547578811646, "learning_rate": 0.0004723765117543144, "loss": 3.4428, "step": 6505 }, { "epoch": 0.44231553200163065, "grad_norm": 0.8357832431793213, "learning_rate": 0.0004723552792498981, "loss": 3.7359, "step": 6510 }, { "epoch": 0.44265525207229245, "grad_norm": 1.162199854850769, "learning_rate": 0.00047233404674548176, "loss": 3.5001, "step": 6515 }, { "epoch": 0.4429949721429542, "grad_norm": 0.9315199255943298, "learning_rate": 0.0004723128142410654, "loss": 3.4112, "step": 6520 }, { "epoch": 0.443334692213616, "grad_norm": 1.0232312679290771, "learning_rate": 0.000472291581736649, "loss": 3.6916, "step": 6525 }, { "epoch": 0.44367441228427773, "grad_norm": 0.7338569164276123, "learning_rate": 0.00047227034923223265, "loss": 3.5949, "step": 6530 }, { "epoch": 0.44401413235493953, "grad_norm": 1.158220887184143, "learning_rate": 0.00047224911672781627, "loss": 3.3935, "step": 6535 }, { "epoch": 0.4443538524256013, "grad_norm": 0.8821459412574768, "learning_rate": 0.0004722278842233999, "loss": 3.4993, "step": 6540 }, { "epoch": 0.44469357249626307, "grad_norm": 0.7773975729942322, "learning_rate": 0.0004722066517189836, "loss": 3.6894, "step": 6545 }, { "epoch": 0.44503329256692487, "grad_norm": 0.8488813638687134, "learning_rate": 0.0004721854192145672, "loss": 3.3312, "step": 6550 }, { "epoch": 0.4453730126375866, "grad_norm": 0.8761072754859924, "learning_rate": 0.0004721641867101508, "loss": 3.713, "step": 6555 }, { "epoch": 0.4457127327082484, "grad_norm": 0.7968103289604187, "learning_rate": 0.0004721429542057345, "loss": 3.8592, "step": 6560 }, { "epoch": 0.4460524527789102, "grad_norm": 0.7724491953849792, "learning_rate": 0.0004721217217013181, "loss": 3.2671, "step": 6565 }, { "epoch": 0.44639217284957194, "grad_norm": 0.7908604145050049, "learning_rate": 0.0004721004891969017, "loss": 3.3915, "step": 6570 }, { "epoch": 0.44673189292023374, "grad_norm": 0.8952115178108215, "learning_rate": 0.00047207925669248544, "loss": 3.7513, "step": 6575 }, { "epoch": 0.4470716129908955, "grad_norm": 0.8479584455490112, "learning_rate": 0.00047205802418806905, "loss": 3.4934, "step": 6580 }, { "epoch": 0.4474113330615573, "grad_norm": 0.8883538842201233, "learning_rate": 0.00047203679168365267, "loss": 3.493, "step": 6585 }, { "epoch": 0.4477510531322191, "grad_norm": 0.83565753698349, "learning_rate": 0.00047201555917923633, "loss": 3.53, "step": 6590 }, { "epoch": 0.4480907732028808, "grad_norm": 1.236315131187439, "learning_rate": 0.00047199432667481995, "loss": 3.5426, "step": 6595 }, { "epoch": 0.4484304932735426, "grad_norm": 0.8248653411865234, "learning_rate": 0.00047197309417040356, "loss": 3.8173, "step": 6600 }, { "epoch": 0.44877021334420436, "grad_norm": 0.8703151345252991, "learning_rate": 0.00047195186166598723, "loss": 3.4868, "step": 6605 }, { "epoch": 0.44910993341486616, "grad_norm": 0.8155608177185059, "learning_rate": 0.0004719306291615709, "loss": 3.5555, "step": 6610 }, { "epoch": 0.4494496534855279, "grad_norm": 1.164979100227356, "learning_rate": 0.0004719093966571545, "loss": 3.7896, "step": 6615 }, { "epoch": 0.4497893735561897, "grad_norm": 0.8082951903343201, "learning_rate": 0.0004718881641527382, "loss": 3.6783, "step": 6620 }, { "epoch": 0.4501290936268515, "grad_norm": 0.9319656491279602, "learning_rate": 0.0004718669316483218, "loss": 3.7024, "step": 6625 }, { "epoch": 0.45046881369751324, "grad_norm": 0.9364271759986877, "learning_rate": 0.0004718456991439054, "loss": 3.8065, "step": 6630 }, { "epoch": 0.45080853376817503, "grad_norm": 2.8780393600463867, "learning_rate": 0.00047182446663948907, "loss": 3.6692, "step": 6635 }, { "epoch": 0.4511482538388368, "grad_norm": 1.2747070789337158, "learning_rate": 0.0004718032341350727, "loss": 3.7492, "step": 6640 }, { "epoch": 0.4514879739094986, "grad_norm": 0.804011881351471, "learning_rate": 0.0004717820016306564, "loss": 3.3256, "step": 6645 }, { "epoch": 0.45182769398016037, "grad_norm": 0.9058822393417358, "learning_rate": 0.00047176076912624, "loss": 3.7243, "step": 6650 }, { "epoch": 0.4521674140508221, "grad_norm": 1.5555955171585083, "learning_rate": 0.00047173953662182363, "loss": 3.528, "step": 6655 }, { "epoch": 0.4525071341214839, "grad_norm": 0.9295026659965515, "learning_rate": 0.0004717183041174073, "loss": 3.4064, "step": 6660 }, { "epoch": 0.45284685419214565, "grad_norm": 0.7785129547119141, "learning_rate": 0.0004716970716129909, "loss": 3.3777, "step": 6665 }, { "epoch": 0.45318657426280745, "grad_norm": 0.9681568145751953, "learning_rate": 0.0004716758391085745, "loss": 3.7213, "step": 6670 }, { "epoch": 0.45352629433346925, "grad_norm": 2.302454710006714, "learning_rate": 0.0004716546066041582, "loss": 3.9011, "step": 6675 }, { "epoch": 0.453866014404131, "grad_norm": 0.9806739091873169, "learning_rate": 0.00047163337409974186, "loss": 3.6659, "step": 6680 }, { "epoch": 0.4542057344747928, "grad_norm": 0.8210526704788208, "learning_rate": 0.00047161214159532547, "loss": 3.423, "step": 6685 }, { "epoch": 0.45454545454545453, "grad_norm": 1.052043080329895, "learning_rate": 0.00047159090909090914, "loss": 3.411, "step": 6690 }, { "epoch": 0.45488517461611633, "grad_norm": 1.0579310655593872, "learning_rate": 0.00047156967658649275, "loss": 3.6483, "step": 6695 }, { "epoch": 0.45522489468677807, "grad_norm": 0.9053061008453369, "learning_rate": 0.00047154844408207636, "loss": 3.3171, "step": 6700 }, { "epoch": 0.45556461475743987, "grad_norm": 1.1307685375213623, "learning_rate": 0.00047152721157766003, "loss": 3.4842, "step": 6705 }, { "epoch": 0.45590433482810166, "grad_norm": 0.9240822792053223, "learning_rate": 0.00047150597907324364, "loss": 3.6288, "step": 6710 }, { "epoch": 0.4562440548987634, "grad_norm": 0.7850516438484192, "learning_rate": 0.0004714847465688273, "loss": 3.6434, "step": 6715 }, { "epoch": 0.4565837749694252, "grad_norm": 0.8186911344528198, "learning_rate": 0.000471463514064411, "loss": 3.5305, "step": 6720 }, { "epoch": 0.45692349504008695, "grad_norm": 1.1185601949691772, "learning_rate": 0.0004714422815599946, "loss": 3.4736, "step": 6725 }, { "epoch": 0.45726321511074874, "grad_norm": 0.7493671178817749, "learning_rate": 0.0004714210490555782, "loss": 3.6686, "step": 6730 }, { "epoch": 0.45760293518141054, "grad_norm": 1.2695294618606567, "learning_rate": 0.00047139981655116187, "loss": 3.6333, "step": 6735 }, { "epoch": 0.4579426552520723, "grad_norm": 0.9285781383514404, "learning_rate": 0.0004713785840467455, "loss": 3.6729, "step": 6740 }, { "epoch": 0.4582823753227341, "grad_norm": 0.9622691869735718, "learning_rate": 0.0004713573515423291, "loss": 3.5847, "step": 6745 }, { "epoch": 0.4586220953933958, "grad_norm": 0.794783353805542, "learning_rate": 0.0004713361190379128, "loss": 3.686, "step": 6750 }, { "epoch": 0.4589618154640576, "grad_norm": 0.8385029435157776, "learning_rate": 0.00047131488653349643, "loss": 3.6943, "step": 6755 }, { "epoch": 0.4593015355347194, "grad_norm": 0.8678736090660095, "learning_rate": 0.00047129365402908004, "loss": 3.3275, "step": 6760 }, { "epoch": 0.45964125560538116, "grad_norm": 1.3470338582992554, "learning_rate": 0.0004712724215246637, "loss": 3.6544, "step": 6765 }, { "epoch": 0.45998097567604296, "grad_norm": 1.0065836906433105, "learning_rate": 0.0004712511890202473, "loss": 3.7172, "step": 6770 }, { "epoch": 0.4603206957467047, "grad_norm": 0.8770900964736938, "learning_rate": 0.00047122995651583093, "loss": 3.7288, "step": 6775 }, { "epoch": 0.4606604158173665, "grad_norm": 0.878991961479187, "learning_rate": 0.0004712087240114146, "loss": 3.7415, "step": 6780 }, { "epoch": 0.46100013588802824, "grad_norm": 0.8039005994796753, "learning_rate": 0.00047118749150699827, "loss": 3.6354, "step": 6785 }, { "epoch": 0.46133985595869004, "grad_norm": 0.9737961292266846, "learning_rate": 0.0004711662590025819, "loss": 3.4756, "step": 6790 }, { "epoch": 0.46167957602935183, "grad_norm": 1.0036921501159668, "learning_rate": 0.00047114502649816555, "loss": 3.4627, "step": 6795 }, { "epoch": 0.4620192961000136, "grad_norm": 0.8595765233039856, "learning_rate": 0.00047112379399374916, "loss": 3.6419, "step": 6800 }, { "epoch": 0.4623590161706754, "grad_norm": 1.0798704624176025, "learning_rate": 0.0004711025614893328, "loss": 3.4318, "step": 6805 }, { "epoch": 0.4626987362413371, "grad_norm": 1.0010710954666138, "learning_rate": 0.00047108132898491644, "loss": 3.4448, "step": 6810 }, { "epoch": 0.4630384563119989, "grad_norm": 0.8167574405670166, "learning_rate": 0.00047106009648050005, "loss": 3.3409, "step": 6815 }, { "epoch": 0.4633781763826607, "grad_norm": 1.1897807121276855, "learning_rate": 0.0004710388639760837, "loss": 3.8678, "step": 6820 }, { "epoch": 0.46371789645332245, "grad_norm": 1.001115083694458, "learning_rate": 0.0004710176314716674, "loss": 3.7231, "step": 6825 }, { "epoch": 0.46405761652398425, "grad_norm": 0.8895241022109985, "learning_rate": 0.000470996398967251, "loss": 3.5896, "step": 6830 }, { "epoch": 0.464397336594646, "grad_norm": 0.9458503723144531, "learning_rate": 0.0004709751664628346, "loss": 3.4652, "step": 6835 }, { "epoch": 0.4647370566653078, "grad_norm": 0.8209068179130554, "learning_rate": 0.0004709539339584183, "loss": 3.7555, "step": 6840 }, { "epoch": 0.4650767767359696, "grad_norm": 0.749652624130249, "learning_rate": 0.0004709327014540019, "loss": 3.5494, "step": 6845 }, { "epoch": 0.46541649680663133, "grad_norm": 1.2140450477600098, "learning_rate": 0.0004709114689495855, "loss": 3.7268, "step": 6850 }, { "epoch": 0.4657562168772931, "grad_norm": 0.8988654613494873, "learning_rate": 0.00047089023644516923, "loss": 3.6185, "step": 6855 }, { "epoch": 0.46609593694795487, "grad_norm": 0.9507482647895813, "learning_rate": 0.00047086900394075284, "loss": 3.6418, "step": 6860 }, { "epoch": 0.46643565701861667, "grad_norm": 0.7588338851928711, "learning_rate": 0.00047084777143633646, "loss": 3.4678, "step": 6865 }, { "epoch": 0.4667753770892784, "grad_norm": 0.8689852952957153, "learning_rate": 0.0004708265389319201, "loss": 3.4763, "step": 6870 }, { "epoch": 0.4671150971599402, "grad_norm": 0.9339428544044495, "learning_rate": 0.00047080530642750374, "loss": 3.3111, "step": 6875 }, { "epoch": 0.467454817230602, "grad_norm": 1.2112749814987183, "learning_rate": 0.00047078407392308735, "loss": 3.3171, "step": 6880 }, { "epoch": 0.46779453730126375, "grad_norm": 0.8746896386146545, "learning_rate": 0.000470762841418671, "loss": 3.5725, "step": 6885 }, { "epoch": 0.46813425737192554, "grad_norm": 0.7502477765083313, "learning_rate": 0.0004707416089142547, "loss": 3.6589, "step": 6890 }, { "epoch": 0.4684739774425873, "grad_norm": 1.1025844812393188, "learning_rate": 0.0004707203764098383, "loss": 3.6215, "step": 6895 }, { "epoch": 0.4688136975132491, "grad_norm": 0.9506344795227051, "learning_rate": 0.00047069914390542196, "loss": 3.7827, "step": 6900 }, { "epoch": 0.4691534175839109, "grad_norm": 0.9715796113014221, "learning_rate": 0.0004706779114010056, "loss": 3.7736, "step": 6905 }, { "epoch": 0.4694931376545726, "grad_norm": 1.1109237670898438, "learning_rate": 0.0004706566788965892, "loss": 3.6444, "step": 6910 }, { "epoch": 0.4698328577252344, "grad_norm": 1.156936526298523, "learning_rate": 0.00047063544639217286, "loss": 3.6375, "step": 6915 }, { "epoch": 0.47017257779589616, "grad_norm": 0.8747252225875854, "learning_rate": 0.00047061421388775647, "loss": 3.3968, "step": 6920 }, { "epoch": 0.47051229786655796, "grad_norm": 0.7255381345748901, "learning_rate": 0.00047059298138334014, "loss": 3.5555, "step": 6925 }, { "epoch": 0.47085201793721976, "grad_norm": 1.0958129167556763, "learning_rate": 0.0004705717488789238, "loss": 3.6183, "step": 6930 }, { "epoch": 0.4711917380078815, "grad_norm": 0.8962180018424988, "learning_rate": 0.0004705505163745074, "loss": 3.5442, "step": 6935 }, { "epoch": 0.4715314580785433, "grad_norm": 0.8231995701789856, "learning_rate": 0.00047052928387009103, "loss": 3.5562, "step": 6940 }, { "epoch": 0.47187117814920504, "grad_norm": 0.9573768973350525, "learning_rate": 0.0004705080513656747, "loss": 3.8185, "step": 6945 }, { "epoch": 0.47221089821986684, "grad_norm": 0.966521143913269, "learning_rate": 0.0004704868188612583, "loss": 3.4611, "step": 6950 }, { "epoch": 0.4725506182905286, "grad_norm": 0.9487773776054382, "learning_rate": 0.0004704655863568419, "loss": 3.8248, "step": 6955 }, { "epoch": 0.4728903383611904, "grad_norm": 1.189887285232544, "learning_rate": 0.00047044435385242564, "loss": 3.7753, "step": 6960 }, { "epoch": 0.4732300584318522, "grad_norm": 1.186806082725525, "learning_rate": 0.00047042312134800926, "loss": 3.6917, "step": 6965 }, { "epoch": 0.4735697785025139, "grad_norm": 1.252324104309082, "learning_rate": 0.00047040188884359287, "loss": 3.4038, "step": 6970 }, { "epoch": 0.4739094985731757, "grad_norm": 0.8394821882247925, "learning_rate": 0.00047038065633917654, "loss": 3.6303, "step": 6975 }, { "epoch": 0.47424921864383746, "grad_norm": 0.722270131111145, "learning_rate": 0.00047035942383476015, "loss": 3.8359, "step": 6980 }, { "epoch": 0.47458893871449925, "grad_norm": 0.837042510509491, "learning_rate": 0.0004703381913303438, "loss": 3.5381, "step": 6985 }, { "epoch": 0.47492865878516105, "grad_norm": 0.7896704077720642, "learning_rate": 0.00047031695882592743, "loss": 3.6179, "step": 6990 }, { "epoch": 0.4752683788558228, "grad_norm": 1.0711753368377686, "learning_rate": 0.0004702957263215111, "loss": 3.2638, "step": 6995 }, { "epoch": 0.4756080989264846, "grad_norm": 0.8350386023521423, "learning_rate": 0.00047027449381709476, "loss": 3.1967, "step": 7000 }, { "epoch": 0.47594781899714633, "grad_norm": 1.166980504989624, "learning_rate": 0.0004702532613126784, "loss": 3.9485, "step": 7005 }, { "epoch": 0.47628753906780813, "grad_norm": 1.0224041938781738, "learning_rate": 0.000470232028808262, "loss": 3.7159, "step": 7010 }, { "epoch": 0.4766272591384699, "grad_norm": 1.0159026384353638, "learning_rate": 0.00047021079630384566, "loss": 3.6103, "step": 7015 }, { "epoch": 0.47696697920913167, "grad_norm": 0.8833797574043274, "learning_rate": 0.00047018956379942927, "loss": 3.2097, "step": 7020 }, { "epoch": 0.47730669927979347, "grad_norm": 0.8448350429534912, "learning_rate": 0.0004701683312950129, "loss": 3.661, "step": 7025 }, { "epoch": 0.4776464193504552, "grad_norm": 0.9669728875160217, "learning_rate": 0.0004701470987905966, "loss": 3.6859, "step": 7030 }, { "epoch": 0.477986139421117, "grad_norm": 0.8792215585708618, "learning_rate": 0.0004701258662861802, "loss": 3.4252, "step": 7035 }, { "epoch": 0.47832585949177875, "grad_norm": 1.9950733184814453, "learning_rate": 0.00047010463378176383, "loss": 3.5112, "step": 7040 }, { "epoch": 0.47866557956244055, "grad_norm": 1.1802488565444946, "learning_rate": 0.0004700834012773475, "loss": 3.588, "step": 7045 }, { "epoch": 0.47900529963310234, "grad_norm": 0.8683750033378601, "learning_rate": 0.0004700621687729311, "loss": 3.597, "step": 7050 }, { "epoch": 0.4793450197037641, "grad_norm": 0.9744851589202881, "learning_rate": 0.0004700409362685147, "loss": 3.7498, "step": 7055 }, { "epoch": 0.4796847397744259, "grad_norm": 0.7924578189849854, "learning_rate": 0.0004700197037640984, "loss": 3.6682, "step": 7060 }, { "epoch": 0.4800244598450876, "grad_norm": 0.9199391603469849, "learning_rate": 0.00046999847125968206, "loss": 3.5305, "step": 7065 }, { "epoch": 0.4803641799157494, "grad_norm": 1.0641202926635742, "learning_rate": 0.00046997723875526567, "loss": 3.3808, "step": 7070 }, { "epoch": 0.4807038999864112, "grad_norm": 0.6925670504570007, "learning_rate": 0.00046995600625084934, "loss": 3.7109, "step": 7075 }, { "epoch": 0.48104362005707296, "grad_norm": 1.222713589668274, "learning_rate": 0.00046993477374643295, "loss": 3.772, "step": 7080 }, { "epoch": 0.48138334012773476, "grad_norm": 0.9653027057647705, "learning_rate": 0.00046991354124201656, "loss": 3.7964, "step": 7085 }, { "epoch": 0.4817230601983965, "grad_norm": 0.972819447517395, "learning_rate": 0.00046989230873760023, "loss": 3.5578, "step": 7090 }, { "epoch": 0.4820627802690583, "grad_norm": 0.7779053449630737, "learning_rate": 0.00046987107623318384, "loss": 3.6628, "step": 7095 }, { "epoch": 0.4824025003397201, "grad_norm": 0.6111833453178406, "learning_rate": 0.0004698498437287675, "loss": 3.5593, "step": 7100 }, { "epoch": 0.48274222041038184, "grad_norm": 0.9555612802505493, "learning_rate": 0.0004698286112243512, "loss": 3.6201, "step": 7105 }, { "epoch": 0.48308194048104364, "grad_norm": 0.9342659711837769, "learning_rate": 0.0004698073787199348, "loss": 3.7758, "step": 7110 }, { "epoch": 0.4834216605517054, "grad_norm": 0.8675495386123657, "learning_rate": 0.0004697861462155184, "loss": 3.5178, "step": 7115 }, { "epoch": 0.4837613806223672, "grad_norm": 0.7649567723274231, "learning_rate": 0.00046976491371110207, "loss": 3.6186, "step": 7120 }, { "epoch": 0.4841011006930289, "grad_norm": 0.8716285228729248, "learning_rate": 0.0004697436812066857, "loss": 3.5631, "step": 7125 }, { "epoch": 0.4844408207636907, "grad_norm": 1.040010690689087, "learning_rate": 0.0004697224487022693, "loss": 3.5357, "step": 7130 }, { "epoch": 0.4847805408343525, "grad_norm": 1.3115756511688232, "learning_rate": 0.000469701216197853, "loss": 3.4806, "step": 7135 }, { "epoch": 0.48512026090501426, "grad_norm": 0.8847962617874146, "learning_rate": 0.00046967998369343663, "loss": 3.6823, "step": 7140 }, { "epoch": 0.48545998097567605, "grad_norm": 1.052097201347351, "learning_rate": 0.00046965875118902024, "loss": 3.7097, "step": 7145 }, { "epoch": 0.4857997010463378, "grad_norm": 0.9073324203491211, "learning_rate": 0.0004696375186846039, "loss": 3.6764, "step": 7150 }, { "epoch": 0.4861394211169996, "grad_norm": 0.8677130341529846, "learning_rate": 0.0004696162861801875, "loss": 3.6977, "step": 7155 }, { "epoch": 0.4864791411876614, "grad_norm": 1.3751845359802246, "learning_rate": 0.00046959505367577114, "loss": 3.6746, "step": 7160 }, { "epoch": 0.48681886125832313, "grad_norm": 0.9535473585128784, "learning_rate": 0.00046957382117135486, "loss": 3.5555, "step": 7165 }, { "epoch": 0.48715858132898493, "grad_norm": 0.988778829574585, "learning_rate": 0.00046955258866693847, "loss": 3.2623, "step": 7170 }, { "epoch": 0.48749830139964667, "grad_norm": 1.0222786664962769, "learning_rate": 0.0004695313561625221, "loss": 3.6695, "step": 7175 }, { "epoch": 0.48783802147030847, "grad_norm": 0.9114978909492493, "learning_rate": 0.00046951012365810575, "loss": 3.6856, "step": 7180 }, { "epoch": 0.48817774154097027, "grad_norm": 0.9462752938270569, "learning_rate": 0.00046948889115368936, "loss": 3.5035, "step": 7185 }, { "epoch": 0.488517461611632, "grad_norm": 0.7095661759376526, "learning_rate": 0.000469467658649273, "loss": 3.6848, "step": 7190 }, { "epoch": 0.4888571816822938, "grad_norm": 0.8806315064430237, "learning_rate": 0.00046944642614485664, "loss": 3.7502, "step": 7195 }, { "epoch": 0.48919690175295555, "grad_norm": 1.4548368453979492, "learning_rate": 0.0004694251936404403, "loss": 3.4794, "step": 7200 }, { "epoch": 0.48953662182361735, "grad_norm": 0.9330841302871704, "learning_rate": 0.0004694039611360239, "loss": 3.5377, "step": 7205 }, { "epoch": 0.4898763418942791, "grad_norm": 1.2187128067016602, "learning_rate": 0.0004693827286316076, "loss": 3.8209, "step": 7210 }, { "epoch": 0.4902160619649409, "grad_norm": 0.7895124554634094, "learning_rate": 0.0004693614961271912, "loss": 3.6675, "step": 7215 }, { "epoch": 0.4905557820356027, "grad_norm": 0.8285120725631714, "learning_rate": 0.0004693402636227748, "loss": 3.5982, "step": 7220 }, { "epoch": 0.4908955021062644, "grad_norm": 0.9697214961051941, "learning_rate": 0.0004693190311183585, "loss": 3.7765, "step": 7225 }, { "epoch": 0.4912352221769262, "grad_norm": 0.9949163198471069, "learning_rate": 0.0004692977986139421, "loss": 3.3515, "step": 7230 }, { "epoch": 0.49157494224758796, "grad_norm": 0.8440485596656799, "learning_rate": 0.00046927656610952576, "loss": 3.6327, "step": 7235 }, { "epoch": 0.49191466231824976, "grad_norm": 1.0099966526031494, "learning_rate": 0.00046925533360510943, "loss": 3.6668, "step": 7240 }, { "epoch": 0.49225438238891156, "grad_norm": 0.9064828157424927, "learning_rate": 0.00046923410110069304, "loss": 3.6533, "step": 7245 }, { "epoch": 0.4925941024595733, "grad_norm": 0.9207621812820435, "learning_rate": 0.00046921286859627666, "loss": 3.7456, "step": 7250 }, { "epoch": 0.4929338225302351, "grad_norm": 0.8721938133239746, "learning_rate": 0.0004691916360918603, "loss": 3.2606, "step": 7255 }, { "epoch": 0.49327354260089684, "grad_norm": 0.8761111497879028, "learning_rate": 0.00046917040358744394, "loss": 3.6892, "step": 7260 }, { "epoch": 0.49361326267155864, "grad_norm": 0.7952359914779663, "learning_rate": 0.00046914917108302755, "loss": 3.6082, "step": 7265 }, { "epoch": 0.49395298274222044, "grad_norm": 0.9351627826690674, "learning_rate": 0.00046912793857861127, "loss": 3.5941, "step": 7270 }, { "epoch": 0.4942927028128822, "grad_norm": 1.0229564905166626, "learning_rate": 0.0004691067060741949, "loss": 3.5444, "step": 7275 }, { "epoch": 0.494632422883544, "grad_norm": 0.9218420386314392, "learning_rate": 0.0004690854735697785, "loss": 3.6477, "step": 7280 }, { "epoch": 0.4949721429542057, "grad_norm": 0.812005341053009, "learning_rate": 0.00046906424106536216, "loss": 3.8506, "step": 7285 }, { "epoch": 0.4953118630248675, "grad_norm": 0.8294686675071716, "learning_rate": 0.0004690430085609458, "loss": 3.3374, "step": 7290 }, { "epoch": 0.49565158309552926, "grad_norm": 1.0098185539245605, "learning_rate": 0.0004690217760565294, "loss": 3.5881, "step": 7295 }, { "epoch": 0.49599130316619106, "grad_norm": 0.7368996143341064, "learning_rate": 0.00046900054355211306, "loss": 3.6942, "step": 7300 }, { "epoch": 0.49633102323685285, "grad_norm": 0.8878200650215149, "learning_rate": 0.0004689793110476967, "loss": 3.5158, "step": 7305 }, { "epoch": 0.4966707433075146, "grad_norm": 1.7300747632980347, "learning_rate": 0.00046895807854328034, "loss": 3.6181, "step": 7310 }, { "epoch": 0.4970104633781764, "grad_norm": 0.8660399317741394, "learning_rate": 0.000468936846038864, "loss": 3.4594, "step": 7315 }, { "epoch": 0.49735018344883813, "grad_norm": 1.1630760431289673, "learning_rate": 0.0004689156135344476, "loss": 3.8103, "step": 7320 }, { "epoch": 0.49768990351949993, "grad_norm": 0.9857564568519592, "learning_rate": 0.0004688943810300313, "loss": 3.5899, "step": 7325 }, { "epoch": 0.49802962359016173, "grad_norm": 0.766424298286438, "learning_rate": 0.0004688731485256149, "loss": 3.521, "step": 7330 }, { "epoch": 0.49836934366082347, "grad_norm": 0.8071175813674927, "learning_rate": 0.0004688519160211985, "loss": 3.7532, "step": 7335 }, { "epoch": 0.49870906373148527, "grad_norm": 0.8051114678382874, "learning_rate": 0.00046883068351678223, "loss": 3.8173, "step": 7340 }, { "epoch": 0.499048783802147, "grad_norm": 0.8401608467102051, "learning_rate": 0.00046880945101236584, "loss": 3.6435, "step": 7345 }, { "epoch": 0.4993885038728088, "grad_norm": 0.8773759603500366, "learning_rate": 0.00046878821850794946, "loss": 3.3331, "step": 7350 }, { "epoch": 0.4997282239434706, "grad_norm": 0.8381858468055725, "learning_rate": 0.0004687669860035331, "loss": 3.5438, "step": 7355 }, { "epoch": 0.5000679440141323, "grad_norm": 0.8629751801490784, "learning_rate": 0.00046874575349911674, "loss": 3.7378, "step": 7360 }, { "epoch": 0.5004076640847941, "grad_norm": 4.6536760330200195, "learning_rate": 0.00046872452099470035, "loss": 3.648, "step": 7365 }, { "epoch": 0.5007473841554559, "grad_norm": 1.0652819871902466, "learning_rate": 0.000468703288490284, "loss": 3.8217, "step": 7370 }, { "epoch": 0.5010871042261177, "grad_norm": 1.7624415159225464, "learning_rate": 0.0004686820559858677, "loss": 3.683, "step": 7375 }, { "epoch": 0.5014268242967794, "grad_norm": 0.8332756161689758, "learning_rate": 0.0004686608234814513, "loss": 3.9171, "step": 7380 }, { "epoch": 0.5017665443674413, "grad_norm": 0.8871842622756958, "learning_rate": 0.00046863959097703497, "loss": 3.9252, "step": 7385 }, { "epoch": 0.502106264438103, "grad_norm": 0.745312511920929, "learning_rate": 0.0004686183584726186, "loss": 3.3627, "step": 7390 }, { "epoch": 0.5024459845087648, "grad_norm": 1.0115329027175903, "learning_rate": 0.0004685971259682022, "loss": 3.6207, "step": 7395 }, { "epoch": 0.5027857045794265, "grad_norm": 6.023624420166016, "learning_rate": 0.00046857589346378586, "loss": 3.7391, "step": 7400 }, { "epoch": 0.5031254246500884, "grad_norm": 1.441024899482727, "learning_rate": 0.00046855466095936947, "loss": 3.4777, "step": 7405 }, { "epoch": 0.5034651447207501, "grad_norm": 1.9653798341751099, "learning_rate": 0.00046853342845495314, "loss": 3.5586, "step": 7410 }, { "epoch": 0.5038048647914118, "grad_norm": 0.8200199604034424, "learning_rate": 0.0004685121959505368, "loss": 3.5976, "step": 7415 }, { "epoch": 0.5041445848620737, "grad_norm": 1.064880609512329, "learning_rate": 0.0004684909634461204, "loss": 3.7057, "step": 7420 }, { "epoch": 0.5044843049327354, "grad_norm": 0.9683533310890198, "learning_rate": 0.00046846973094170403, "loss": 3.4224, "step": 7425 }, { "epoch": 0.5048240250033972, "grad_norm": 2.3936219215393066, "learning_rate": 0.0004684484984372877, "loss": 3.613, "step": 7430 }, { "epoch": 0.5051637450740589, "grad_norm": 0.6951756477355957, "learning_rate": 0.0004684272659328713, "loss": 3.6617, "step": 7435 }, { "epoch": 0.5055034651447208, "grad_norm": 1.6612554788589478, "learning_rate": 0.0004684060334284549, "loss": 3.5516, "step": 7440 }, { "epoch": 0.5058431852153825, "grad_norm": 0.6749060750007629, "learning_rate": 0.00046838480092403865, "loss": 3.5307, "step": 7445 }, { "epoch": 0.5061829052860443, "grad_norm": 0.776546061038971, "learning_rate": 0.00046836356841962226, "loss": 3.5663, "step": 7450 }, { "epoch": 0.5065226253567061, "grad_norm": 2.1049482822418213, "learning_rate": 0.00046834233591520587, "loss": 3.5274, "step": 7455 }, { "epoch": 0.5068623454273679, "grad_norm": 0.9972699880599976, "learning_rate": 0.00046832110341078954, "loss": 3.6762, "step": 7460 }, { "epoch": 0.5072020654980296, "grad_norm": 1.032950758934021, "learning_rate": 0.00046829987090637315, "loss": 3.7084, "step": 7465 }, { "epoch": 0.5075417855686915, "grad_norm": 0.862369179725647, "learning_rate": 0.00046827863840195676, "loss": 3.2946, "step": 7470 }, { "epoch": 0.5078815056393532, "grad_norm": 0.7201554775238037, "learning_rate": 0.00046825740589754043, "loss": 3.9243, "step": 7475 }, { "epoch": 0.5082212257100149, "grad_norm": 0.9409382343292236, "learning_rate": 0.0004682361733931241, "loss": 3.3013, "step": 7480 }, { "epoch": 0.5085609457806767, "grad_norm": 2.2684149742126465, "learning_rate": 0.0004682149408887077, "loss": 3.9466, "step": 7485 }, { "epoch": 0.5089006658513385, "grad_norm": 1.0549654960632324, "learning_rate": 0.0004681937083842914, "loss": 3.4025, "step": 7490 }, { "epoch": 0.5092403859220003, "grad_norm": 0.9412028193473816, "learning_rate": 0.000468172475879875, "loss": 3.7297, "step": 7495 }, { "epoch": 0.509580105992662, "grad_norm": 0.9332839846611023, "learning_rate": 0.0004681512433754586, "loss": 3.7178, "step": 7500 }, { "epoch": 0.5099198260633239, "grad_norm": 0.763014554977417, "learning_rate": 0.00046813001087104227, "loss": 3.726, "step": 7505 }, { "epoch": 0.5102595461339856, "grad_norm": 1.0104658603668213, "learning_rate": 0.0004681087783666259, "loss": 3.8111, "step": 7510 }, { "epoch": 0.5105992662046474, "grad_norm": 0.9787155985832214, "learning_rate": 0.00046808754586220955, "loss": 3.7006, "step": 7515 }, { "epoch": 0.5109389862753091, "grad_norm": 3.7396483421325684, "learning_rate": 0.0004680663133577932, "loss": 3.69, "step": 7520 }, { "epoch": 0.511278706345971, "grad_norm": 1.0188279151916504, "learning_rate": 0.00046804508085337683, "loss": 3.6905, "step": 7525 }, { "epoch": 0.5116184264166327, "grad_norm": 1.60517418384552, "learning_rate": 0.00046802384834896044, "loss": 3.7547, "step": 7530 }, { "epoch": 0.5119581464872944, "grad_norm": 5.5376434326171875, "learning_rate": 0.0004680026158445441, "loss": 3.7321, "step": 7535 }, { "epoch": 0.5122978665579563, "grad_norm": 0.7040224075317383, "learning_rate": 0.0004679813833401277, "loss": 3.8448, "step": 7540 }, { "epoch": 0.512637586628618, "grad_norm": 0.8560564517974854, "learning_rate": 0.00046796015083571134, "loss": 3.4249, "step": 7545 }, { "epoch": 0.5129773066992798, "grad_norm": 0.8228334188461304, "learning_rate": 0.00046793891833129506, "loss": 3.5289, "step": 7550 }, { "epoch": 0.5133170267699416, "grad_norm": 1.3839235305786133, "learning_rate": 0.00046791768582687867, "loss": 3.7299, "step": 7555 }, { "epoch": 0.5136567468406034, "grad_norm": 0.8687189817428589, "learning_rate": 0.0004678964533224623, "loss": 3.7672, "step": 7560 }, { "epoch": 0.5139964669112651, "grad_norm": 0.945710301399231, "learning_rate": 0.00046787522081804595, "loss": 3.7285, "step": 7565 }, { "epoch": 0.5143361869819268, "grad_norm": 3.5585317611694336, "learning_rate": 0.00046785398831362957, "loss": 3.7429, "step": 7570 }, { "epoch": 0.5146759070525887, "grad_norm": 1.914366602897644, "learning_rate": 0.0004678327558092132, "loss": 3.5286, "step": 7575 }, { "epoch": 0.5150156271232504, "grad_norm": 4.376029014587402, "learning_rate": 0.00046781152330479685, "loss": 3.7161, "step": 7580 }, { "epoch": 0.5153553471939122, "grad_norm": 0.9216068387031555, "learning_rate": 0.0004677902908003805, "loss": 3.6936, "step": 7585 }, { "epoch": 0.515695067264574, "grad_norm": 0.8327890634536743, "learning_rate": 0.0004677690582959641, "loss": 3.5378, "step": 7590 }, { "epoch": 0.5160347873352358, "grad_norm": 2.8252291679382324, "learning_rate": 0.0004677478257915478, "loss": 3.5315, "step": 7595 }, { "epoch": 0.5163745074058975, "grad_norm": 0.8703281283378601, "learning_rate": 0.0004677265932871314, "loss": 3.6762, "step": 7600 }, { "epoch": 0.5167142274765593, "grad_norm": 0.7768443822860718, "learning_rate": 0.000467705360782715, "loss": 3.4759, "step": 7605 }, { "epoch": 0.5170539475472211, "grad_norm": 1.5822558403015137, "learning_rate": 0.0004676841282782987, "loss": 3.5499, "step": 7610 }, { "epoch": 0.5173936676178829, "grad_norm": 1.1410280466079712, "learning_rate": 0.0004676628957738823, "loss": 3.3176, "step": 7615 }, { "epoch": 0.5177333876885446, "grad_norm": 0.7129529714584351, "learning_rate": 0.00046764166326946597, "loss": 3.743, "step": 7620 }, { "epoch": 0.5180731077592065, "grad_norm": 0.8123133182525635, "learning_rate": 0.00046762043076504963, "loss": 3.7481, "step": 7625 }, { "epoch": 0.5184128278298682, "grad_norm": 0.9083758592605591, "learning_rate": 0.00046759919826063325, "loss": 3.4239, "step": 7630 }, { "epoch": 0.5187525479005299, "grad_norm": 1.1800650358200073, "learning_rate": 0.00046757796575621686, "loss": 3.7092, "step": 7635 }, { "epoch": 0.5190922679711918, "grad_norm": 0.799336314201355, "learning_rate": 0.0004675567332518005, "loss": 3.7909, "step": 7640 }, { "epoch": 0.5194319880418535, "grad_norm": 0.9873319864273071, "learning_rate": 0.00046753550074738414, "loss": 3.539, "step": 7645 }, { "epoch": 0.5197717081125153, "grad_norm": 0.9653818607330322, "learning_rate": 0.00046751426824296775, "loss": 3.668, "step": 7650 }, { "epoch": 0.520111428183177, "grad_norm": 0.8739514350891113, "learning_rate": 0.0004674930357385515, "loss": 3.4652, "step": 7655 }, { "epoch": 0.5204511482538389, "grad_norm": 0.9005382061004639, "learning_rate": 0.0004674718032341351, "loss": 3.5222, "step": 7660 }, { "epoch": 0.5207908683245006, "grad_norm": 1.5136737823486328, "learning_rate": 0.0004674505707297187, "loss": 3.3968, "step": 7665 }, { "epoch": 0.5211305883951624, "grad_norm": 1.2576749324798584, "learning_rate": 0.00046742933822530237, "loss": 3.6217, "step": 7670 }, { "epoch": 0.5214703084658242, "grad_norm": 0.9042896628379822, "learning_rate": 0.000467408105720886, "loss": 3.4749, "step": 7675 }, { "epoch": 0.521810028536486, "grad_norm": 0.8404830694198608, "learning_rate": 0.00046738687321646965, "loss": 3.7578, "step": 7680 }, { "epoch": 0.5221497486071477, "grad_norm": 0.8675946593284607, "learning_rate": 0.00046736564071205326, "loss": 3.6567, "step": 7685 }, { "epoch": 0.5224894686778094, "grad_norm": 0.8354901075363159, "learning_rate": 0.0004673444082076369, "loss": 3.8289, "step": 7690 }, { "epoch": 0.5228291887484713, "grad_norm": 1.032758355140686, "learning_rate": 0.0004673231757032206, "loss": 3.4577, "step": 7695 }, { "epoch": 0.523168908819133, "grad_norm": 1.5468162298202515, "learning_rate": 0.0004673019431988042, "loss": 3.6579, "step": 7700 }, { "epoch": 0.5235086288897948, "grad_norm": 1.161298155784607, "learning_rate": 0.0004672807106943878, "loss": 3.8484, "step": 7705 }, { "epoch": 0.5238483489604566, "grad_norm": 0.9132164716720581, "learning_rate": 0.0004672594781899715, "loss": 3.5808, "step": 7710 }, { "epoch": 0.5241880690311184, "grad_norm": 1.2979865074157715, "learning_rate": 0.0004672382456855551, "loss": 3.7121, "step": 7715 }, { "epoch": 0.5245277891017801, "grad_norm": 1.6748480796813965, "learning_rate": 0.0004672170131811387, "loss": 3.5973, "step": 7720 }, { "epoch": 0.524867509172442, "grad_norm": 0.8702155351638794, "learning_rate": 0.00046719578067672243, "loss": 3.7472, "step": 7725 }, { "epoch": 0.5252072292431037, "grad_norm": 0.8718984723091125, "learning_rate": 0.00046717454817230605, "loss": 3.6216, "step": 7730 }, { "epoch": 0.5255469493137654, "grad_norm": 0.8407930731773376, "learning_rate": 0.00046715331566788966, "loss": 3.5435, "step": 7735 }, { "epoch": 0.5258866693844272, "grad_norm": 5.863829135894775, "learning_rate": 0.0004671320831634733, "loss": 3.6996, "step": 7740 }, { "epoch": 0.526226389455089, "grad_norm": 0.9917728900909424, "learning_rate": 0.00046711085065905694, "loss": 3.5383, "step": 7745 }, { "epoch": 0.5265661095257508, "grad_norm": 0.8187193274497986, "learning_rate": 0.00046708961815464055, "loss": 3.9029, "step": 7750 }, { "epoch": 0.5269058295964125, "grad_norm": 0.9850929975509644, "learning_rate": 0.0004670683856502243, "loss": 3.7666, "step": 7755 }, { "epoch": 0.5272455496670744, "grad_norm": 1.1266725063323975, "learning_rate": 0.0004670471531458079, "loss": 3.0374, "step": 7760 }, { "epoch": 0.5275852697377361, "grad_norm": 0.9732920527458191, "learning_rate": 0.0004670259206413915, "loss": 3.717, "step": 7765 }, { "epoch": 0.5279249898083979, "grad_norm": 1.3048144578933716, "learning_rate": 0.00046700468813697517, "loss": 3.5867, "step": 7770 }, { "epoch": 0.5282647098790596, "grad_norm": 1.0944874286651611, "learning_rate": 0.0004669834556325588, "loss": 3.8633, "step": 7775 }, { "epoch": 0.5286044299497215, "grad_norm": 1.0076464414596558, "learning_rate": 0.0004669622231281424, "loss": 3.2481, "step": 7780 }, { "epoch": 0.5289441500203832, "grad_norm": 1.099632740020752, "learning_rate": 0.00046694099062372606, "loss": 3.8442, "step": 7785 }, { "epoch": 0.5292838700910449, "grad_norm": 0.9326073527336121, "learning_rate": 0.0004669197581193097, "loss": 3.8288, "step": 7790 }, { "epoch": 0.5296235901617068, "grad_norm": 1.029610514640808, "learning_rate": 0.00046689852561489334, "loss": 3.82, "step": 7795 }, { "epoch": 0.5299633102323685, "grad_norm": 0.9611042141914368, "learning_rate": 0.000466877293110477, "loss": 3.4129, "step": 7800 }, { "epoch": 0.5303030303030303, "grad_norm": 0.7985348105430603, "learning_rate": 0.0004668560606060606, "loss": 3.5075, "step": 7805 }, { "epoch": 0.5306427503736921, "grad_norm": 1.0885937213897705, "learning_rate": 0.00046683482810164423, "loss": 3.5701, "step": 7810 }, { "epoch": 0.5309824704443539, "grad_norm": 0.9616777300834656, "learning_rate": 0.0004668135955972279, "loss": 3.5537, "step": 7815 }, { "epoch": 0.5313221905150156, "grad_norm": 0.7692016363143921, "learning_rate": 0.0004667923630928115, "loss": 3.5779, "step": 7820 }, { "epoch": 0.5316619105856774, "grad_norm": 1.1926255226135254, "learning_rate": 0.0004667711305883952, "loss": 3.681, "step": 7825 }, { "epoch": 0.5320016306563392, "grad_norm": 0.9023371338844299, "learning_rate": 0.00046674989808397885, "loss": 3.5171, "step": 7830 }, { "epoch": 0.532341350727001, "grad_norm": 1.1241897344589233, "learning_rate": 0.00046672866557956246, "loss": 3.7116, "step": 7835 }, { "epoch": 0.5326810707976627, "grad_norm": 2.471149444580078, "learning_rate": 0.0004667074330751461, "loss": 3.8281, "step": 7840 }, { "epoch": 0.5330207908683245, "grad_norm": 0.9511035680770874, "learning_rate": 0.00046668620057072974, "loss": 3.5687, "step": 7845 }, { "epoch": 0.5333605109389863, "grad_norm": 0.7971816062927246, "learning_rate": 0.00046666496806631335, "loss": 3.6932, "step": 7850 }, { "epoch": 0.533700231009648, "grad_norm": 0.7273267507553101, "learning_rate": 0.00046664373556189697, "loss": 3.7437, "step": 7855 }, { "epoch": 0.5340399510803098, "grad_norm": 1.1009814739227295, "learning_rate": 0.0004666225030574807, "loss": 3.6201, "step": 7860 }, { "epoch": 0.5343796711509716, "grad_norm": 1.1999927759170532, "learning_rate": 0.0004666012705530643, "loss": 3.5814, "step": 7865 }, { "epoch": 0.5347193912216334, "grad_norm": 0.6871698498725891, "learning_rate": 0.0004665800380486479, "loss": 3.6666, "step": 7870 }, { "epoch": 0.5350591112922951, "grad_norm": 0.7896547317504883, "learning_rate": 0.0004665588055442316, "loss": 3.6033, "step": 7875 }, { "epoch": 0.535398831362957, "grad_norm": 1.0006742477416992, "learning_rate": 0.0004665375730398152, "loss": 3.714, "step": 7880 }, { "epoch": 0.5357385514336187, "grad_norm": 1.3997642993927002, "learning_rate": 0.0004665163405353988, "loss": 3.5548, "step": 7885 }, { "epoch": 0.5360782715042804, "grad_norm": 1.2676371335983276, "learning_rate": 0.0004664951080309825, "loss": 3.8496, "step": 7890 }, { "epoch": 0.5364179915749423, "grad_norm": 0.9120272397994995, "learning_rate": 0.00046647387552656614, "loss": 3.3579, "step": 7895 }, { "epoch": 0.536757711645604, "grad_norm": 0.8325341939926147, "learning_rate": 0.00046645264302214975, "loss": 3.3764, "step": 7900 }, { "epoch": 0.5370974317162658, "grad_norm": 0.7940071225166321, "learning_rate": 0.0004664314105177334, "loss": 3.7175, "step": 7905 }, { "epoch": 0.5374371517869275, "grad_norm": 0.9616426229476929, "learning_rate": 0.00046641017801331703, "loss": 3.7233, "step": 7910 }, { "epoch": 0.5377768718575894, "grad_norm": 0.6978499889373779, "learning_rate": 0.00046638894550890065, "loss": 3.7392, "step": 7915 }, { "epoch": 0.5381165919282511, "grad_norm": 2.1903634071350098, "learning_rate": 0.0004663677130044843, "loss": 3.546, "step": 7920 }, { "epoch": 0.5384563119989129, "grad_norm": 0.9321129322052002, "learning_rate": 0.0004663464805000679, "loss": 3.6936, "step": 7925 }, { "epoch": 0.5387960320695747, "grad_norm": 0.7919477820396423, "learning_rate": 0.0004663252479956516, "loss": 3.7729, "step": 7930 }, { "epoch": 0.5391357521402365, "grad_norm": 0.7324265837669373, "learning_rate": 0.00046630401549123526, "loss": 3.7293, "step": 7935 }, { "epoch": 0.5394754722108982, "grad_norm": 1.611940860748291, "learning_rate": 0.0004662827829868189, "loss": 3.5603, "step": 7940 }, { "epoch": 0.5398151922815599, "grad_norm": 0.9142165184020996, "learning_rate": 0.0004662615504824025, "loss": 3.5971, "step": 7945 }, { "epoch": 0.5401549123522218, "grad_norm": 1.054007887840271, "learning_rate": 0.00046624031797798615, "loss": 3.6861, "step": 7950 }, { "epoch": 0.5404946324228835, "grad_norm": 0.8323506712913513, "learning_rate": 0.00046621908547356977, "loss": 3.4046, "step": 7955 }, { "epoch": 0.5408343524935453, "grad_norm": 0.871013343334198, "learning_rate": 0.0004661978529691534, "loss": 3.5717, "step": 7960 }, { "epoch": 0.5411740725642071, "grad_norm": 0.6710253357887268, "learning_rate": 0.0004661766204647371, "loss": 3.6115, "step": 7965 }, { "epoch": 0.5415137926348689, "grad_norm": 1.0497071743011475, "learning_rate": 0.0004661553879603207, "loss": 3.5502, "step": 7970 }, { "epoch": 0.5418535127055306, "grad_norm": 1.0400034189224243, "learning_rate": 0.0004661341554559043, "loss": 3.6614, "step": 7975 }, { "epoch": 0.5421932327761925, "grad_norm": 0.8827162384986877, "learning_rate": 0.000466112922951488, "loss": 3.6278, "step": 7980 }, { "epoch": 0.5425329528468542, "grad_norm": 0.8478735685348511, "learning_rate": 0.0004660916904470716, "loss": 3.5722, "step": 7985 }, { "epoch": 0.542872672917516, "grad_norm": 1.7183061838150024, "learning_rate": 0.0004660704579426552, "loss": 3.5909, "step": 7990 }, { "epoch": 0.5432123929881777, "grad_norm": 0.9744554758071899, "learning_rate": 0.0004660492254382389, "loss": 3.6363, "step": 7995 }, { "epoch": 0.5435521130588395, "grad_norm": 12.643707275390625, "learning_rate": 0.00046602799293382255, "loss": 3.7798, "step": 8000 }, { "epoch": 0.5438918331295013, "grad_norm": 1.1114407777786255, "learning_rate": 0.00046600676042940617, "loss": 3.4727, "step": 8005 }, { "epoch": 0.544231553200163, "grad_norm": 0.9165973663330078, "learning_rate": 0.00046598552792498983, "loss": 3.7151, "step": 8010 }, { "epoch": 0.5445712732708249, "grad_norm": 0.66413414478302, "learning_rate": 0.00046596429542057345, "loss": 3.5291, "step": 8015 }, { "epoch": 0.5449109933414866, "grad_norm": 0.9342640042304993, "learning_rate": 0.0004659430629161571, "loss": 3.623, "step": 8020 }, { "epoch": 0.5452507134121484, "grad_norm": 0.9769925475120544, "learning_rate": 0.00046592183041174073, "loss": 3.8009, "step": 8025 }, { "epoch": 0.5455904334828101, "grad_norm": 1.01430082321167, "learning_rate": 0.00046590059790732434, "loss": 3.6231, "step": 8030 }, { "epoch": 0.545930153553472, "grad_norm": 0.991909921169281, "learning_rate": 0.00046587936540290806, "loss": 3.6404, "step": 8035 }, { "epoch": 0.5462698736241337, "grad_norm": 0.852908194065094, "learning_rate": 0.0004658581328984917, "loss": 3.3922, "step": 8040 }, { "epoch": 0.5466095936947954, "grad_norm": 1.024110198020935, "learning_rate": 0.0004658369003940753, "loss": 3.6079, "step": 8045 }, { "epoch": 0.5469493137654573, "grad_norm": 0.9279136061668396, "learning_rate": 0.00046581566788965895, "loss": 3.4183, "step": 8050 }, { "epoch": 0.547289033836119, "grad_norm": 0.7655441761016846, "learning_rate": 0.00046579443538524257, "loss": 3.5478, "step": 8055 }, { "epoch": 0.5476287539067808, "grad_norm": 1.004595160484314, "learning_rate": 0.0004657732028808262, "loss": 3.5034, "step": 8060 }, { "epoch": 0.5479684739774426, "grad_norm": 0.8732870817184448, "learning_rate": 0.00046575197037640985, "loss": 3.5241, "step": 8065 }, { "epoch": 0.5483081940481044, "grad_norm": 0.9807809591293335, "learning_rate": 0.0004657307378719935, "loss": 3.635, "step": 8070 }, { "epoch": 0.5486479141187661, "grad_norm": 0.773736834526062, "learning_rate": 0.00046570950536757713, "loss": 3.6489, "step": 8075 }, { "epoch": 0.5489876341894279, "grad_norm": 0.7747448682785034, "learning_rate": 0.0004656882728631608, "loss": 3.6375, "step": 8080 }, { "epoch": 0.5493273542600897, "grad_norm": 0.9060726761817932, "learning_rate": 0.0004656670403587444, "loss": 3.5903, "step": 8085 }, { "epoch": 0.5496670743307515, "grad_norm": 1.0314193964004517, "learning_rate": 0.000465645807854328, "loss": 3.5723, "step": 8090 }, { "epoch": 0.5500067944014132, "grad_norm": 0.9335294365882874, "learning_rate": 0.0004656245753499117, "loss": 3.6127, "step": 8095 }, { "epoch": 0.550346514472075, "grad_norm": 0.9772335290908813, "learning_rate": 0.0004656033428454953, "loss": 3.5464, "step": 8100 }, { "epoch": 0.5506862345427368, "grad_norm": 0.9815175533294678, "learning_rate": 0.00046558211034107897, "loss": 3.4316, "step": 8105 }, { "epoch": 0.5510259546133985, "grad_norm": 0.9871528744697571, "learning_rate": 0.00046556087783666264, "loss": 3.5379, "step": 8110 }, { "epoch": 0.5513656746840603, "grad_norm": 0.7671661376953125, "learning_rate": 0.00046553964533224625, "loss": 3.5237, "step": 8115 }, { "epoch": 0.5517053947547221, "grad_norm": 0.8236464262008667, "learning_rate": 0.00046551841282782986, "loss": 3.569, "step": 8120 }, { "epoch": 0.5520451148253839, "grad_norm": 0.8886585831642151, "learning_rate": 0.00046549718032341353, "loss": 3.7071, "step": 8125 }, { "epoch": 0.5523848348960456, "grad_norm": 1.0700992345809937, "learning_rate": 0.00046547594781899714, "loss": 3.7354, "step": 8130 }, { "epoch": 0.5527245549667075, "grad_norm": 0.9093068838119507, "learning_rate": 0.00046545471531458075, "loss": 3.4705, "step": 8135 }, { "epoch": 0.5530642750373692, "grad_norm": 0.8789447546005249, "learning_rate": 0.0004654334828101645, "loss": 3.6056, "step": 8140 }, { "epoch": 0.553403995108031, "grad_norm": 0.8029575943946838, "learning_rate": 0.0004654122503057481, "loss": 3.7289, "step": 8145 }, { "epoch": 0.5537437151786928, "grad_norm": 0.9833944439888, "learning_rate": 0.0004653910178013317, "loss": 3.4017, "step": 8150 }, { "epoch": 0.5540834352493546, "grad_norm": 0.9514707326889038, "learning_rate": 0.00046536978529691537, "loss": 3.4919, "step": 8155 }, { "epoch": 0.5544231553200163, "grad_norm": 0.8490001559257507, "learning_rate": 0.000465348552792499, "loss": 3.4446, "step": 8160 }, { "epoch": 0.554762875390678, "grad_norm": 0.9676173329353333, "learning_rate": 0.0004653273202880826, "loss": 3.8376, "step": 8165 }, { "epoch": 0.5551025954613399, "grad_norm": 2.326502799987793, "learning_rate": 0.00046530608778366626, "loss": 3.9536, "step": 8170 }, { "epoch": 0.5554423155320016, "grad_norm": 0.7183185815811157, "learning_rate": 0.00046528485527924993, "loss": 3.7008, "step": 8175 }, { "epoch": 0.5557820356026634, "grad_norm": 0.7919934391975403, "learning_rate": 0.00046526362277483354, "loss": 3.567, "step": 8180 }, { "epoch": 0.5561217556733252, "grad_norm": 0.8227603435516357, "learning_rate": 0.0004652423902704172, "loss": 3.4308, "step": 8185 }, { "epoch": 0.556461475743987, "grad_norm": 0.9760486483573914, "learning_rate": 0.0004652211577660008, "loss": 3.626, "step": 8190 }, { "epoch": 0.5568011958146487, "grad_norm": 0.9528893828392029, "learning_rate": 0.00046519992526158443, "loss": 3.6024, "step": 8195 }, { "epoch": 0.5571409158853105, "grad_norm": 0.697662353515625, "learning_rate": 0.0004651786927571681, "loss": 3.6557, "step": 8200 }, { "epoch": 0.5574806359559723, "grad_norm": 0.8949292302131653, "learning_rate": 0.0004651574602527517, "loss": 3.5162, "step": 8205 }, { "epoch": 0.557820356026634, "grad_norm": 0.9648535251617432, "learning_rate": 0.0004651362277483354, "loss": 3.5976, "step": 8210 }, { "epoch": 0.5581600760972958, "grad_norm": 0.8781564831733704, "learning_rate": 0.00046511499524391905, "loss": 3.5692, "step": 8215 }, { "epoch": 0.5584997961679576, "grad_norm": 0.748227059841156, "learning_rate": 0.00046509376273950266, "loss": 3.7706, "step": 8220 }, { "epoch": 0.5588395162386194, "grad_norm": 0.9339863061904907, "learning_rate": 0.0004650725302350863, "loss": 3.4034, "step": 8225 }, { "epoch": 0.5591792363092811, "grad_norm": 0.7359179854393005, "learning_rate": 0.00046505129773066994, "loss": 3.7181, "step": 8230 }, { "epoch": 0.559518956379943, "grad_norm": 0.8395485877990723, "learning_rate": 0.00046503006522625355, "loss": 3.5547, "step": 8235 }, { "epoch": 0.5598586764506047, "grad_norm": 0.754944384098053, "learning_rate": 0.00046500883272183717, "loss": 3.8679, "step": 8240 }, { "epoch": 0.5601983965212665, "grad_norm": 1.323588490486145, "learning_rate": 0.0004649876002174209, "loss": 3.4363, "step": 8245 }, { "epoch": 0.5605381165919282, "grad_norm": 0.9933255910873413, "learning_rate": 0.0004649663677130045, "loss": 3.5484, "step": 8250 }, { "epoch": 0.5608778366625901, "grad_norm": 0.9101434350013733, "learning_rate": 0.0004649451352085881, "loss": 3.3904, "step": 8255 }, { "epoch": 0.5612175567332518, "grad_norm": 0.798695981502533, "learning_rate": 0.0004649239027041718, "loss": 3.5049, "step": 8260 }, { "epoch": 0.5615572768039135, "grad_norm": 0.7379664778709412, "learning_rate": 0.0004649026701997554, "loss": 3.5863, "step": 8265 }, { "epoch": 0.5618969968745754, "grad_norm": 0.8691951036453247, "learning_rate": 0.000464881437695339, "loss": 3.5687, "step": 8270 }, { "epoch": 0.5622367169452371, "grad_norm": 1.14522385597229, "learning_rate": 0.00046486020519092273, "loss": 3.3321, "step": 8275 }, { "epoch": 0.5625764370158989, "grad_norm": 1.108525037765503, "learning_rate": 0.00046483897268650634, "loss": 3.6403, "step": 8280 }, { "epoch": 0.5629161570865606, "grad_norm": 0.8640968799591064, "learning_rate": 0.00046481774018208996, "loss": 3.5872, "step": 8285 }, { "epoch": 0.5632558771572225, "grad_norm": 0.8400288820266724, "learning_rate": 0.0004647965076776736, "loss": 3.744, "step": 8290 }, { "epoch": 0.5635955972278842, "grad_norm": 0.8674853444099426, "learning_rate": 0.00046477527517325724, "loss": 3.5181, "step": 8295 }, { "epoch": 0.563935317298546, "grad_norm": 0.7288232445716858, "learning_rate": 0.00046475404266884085, "loss": 3.7352, "step": 8300 }, { "epoch": 0.5642750373692078, "grad_norm": 0.7889117002487183, "learning_rate": 0.0004647328101644245, "loss": 3.6015, "step": 8305 }, { "epoch": 0.5646147574398696, "grad_norm": 0.8616487979888916, "learning_rate": 0.0004647115776600082, "loss": 3.6119, "step": 8310 }, { "epoch": 0.5649544775105313, "grad_norm": 0.9094602465629578, "learning_rate": 0.0004646903451555918, "loss": 3.3742, "step": 8315 }, { "epoch": 0.5652941975811931, "grad_norm": 0.8733056783676147, "learning_rate": 0.00046466911265117546, "loss": 3.3936, "step": 8320 }, { "epoch": 0.5656339176518549, "grad_norm": 0.9559192657470703, "learning_rate": 0.0004646478801467591, "loss": 3.8644, "step": 8325 }, { "epoch": 0.5659736377225166, "grad_norm": 0.963951051235199, "learning_rate": 0.0004646266476423427, "loss": 3.6595, "step": 8330 }, { "epoch": 0.5663133577931784, "grad_norm": 0.9458184242248535, "learning_rate": 0.00046460541513792636, "loss": 3.773, "step": 8335 }, { "epoch": 0.5666530778638402, "grad_norm": 1.0453698635101318, "learning_rate": 0.00046458418263350997, "loss": 3.6712, "step": 8340 }, { "epoch": 0.566992797934502, "grad_norm": 0.7665160298347473, "learning_rate": 0.00046456295012909364, "loss": 3.5103, "step": 8345 }, { "epoch": 0.5673325180051637, "grad_norm": 1.1309001445770264, "learning_rate": 0.0004645417176246773, "loss": 3.8304, "step": 8350 }, { "epoch": 0.5676722380758256, "grad_norm": 0.8026044964790344, "learning_rate": 0.0004645204851202609, "loss": 3.7321, "step": 8355 }, { "epoch": 0.5680119581464873, "grad_norm": 0.8684634566307068, "learning_rate": 0.0004644992526158446, "loss": 3.5079, "step": 8360 }, { "epoch": 0.568351678217149, "grad_norm": 0.7700868844985962, "learning_rate": 0.0004644780201114282, "loss": 3.4978, "step": 8365 }, { "epoch": 0.5686913982878108, "grad_norm": 0.8467879295349121, "learning_rate": 0.0004644567876070118, "loss": 3.5317, "step": 8370 }, { "epoch": 0.5690311183584726, "grad_norm": 0.7633691430091858, "learning_rate": 0.0004644355551025955, "loss": 3.7937, "step": 8375 }, { "epoch": 0.5693708384291344, "grad_norm": 0.7458816170692444, "learning_rate": 0.00046441432259817914, "loss": 3.7924, "step": 8380 }, { "epoch": 0.5697105584997961, "grad_norm": 0.7930266261100769, "learning_rate": 0.00046439309009376276, "loss": 3.6874, "step": 8385 }, { "epoch": 0.570050278570458, "grad_norm": 1.0936992168426514, "learning_rate": 0.0004643718575893464, "loss": 3.5195, "step": 8390 }, { "epoch": 0.5703899986411197, "grad_norm": 0.8362003564834595, "learning_rate": 0.00046435062508493004, "loss": 3.8652, "step": 8395 }, { "epoch": 0.5707297187117815, "grad_norm": 0.8051434755325317, "learning_rate": 0.00046432939258051365, "loss": 3.4576, "step": 8400 }, { "epoch": 0.5710694387824433, "grad_norm": 0.9332279562950134, "learning_rate": 0.0004643081600760973, "loss": 3.6636, "step": 8405 }, { "epoch": 0.5714091588531051, "grad_norm": 1.405193567276001, "learning_rate": 0.00046428692757168093, "loss": 3.6772, "step": 8410 }, { "epoch": 0.5717488789237668, "grad_norm": 0.9611175060272217, "learning_rate": 0.0004642656950672646, "loss": 3.2436, "step": 8415 }, { "epoch": 0.5720885989944285, "grad_norm": 0.8616746664047241, "learning_rate": 0.00046424446256284826, "loss": 3.6849, "step": 8420 }, { "epoch": 0.5724283190650904, "grad_norm": 0.6972823143005371, "learning_rate": 0.0004642232300584319, "loss": 3.6665, "step": 8425 }, { "epoch": 0.5727680391357521, "grad_norm": 0.7639859914779663, "learning_rate": 0.0004642019975540155, "loss": 3.6254, "step": 8430 }, { "epoch": 0.5731077592064139, "grad_norm": 1.0467149019241333, "learning_rate": 0.00046418076504959916, "loss": 3.2998, "step": 8435 }, { "epoch": 0.5734474792770757, "grad_norm": 0.9891260862350464, "learning_rate": 0.00046415953254518277, "loss": 3.5781, "step": 8440 }, { "epoch": 0.5737871993477375, "grad_norm": 1.1248908042907715, "learning_rate": 0.0004641383000407664, "loss": 3.4915, "step": 8445 }, { "epoch": 0.5741269194183992, "grad_norm": 0.9267275929450989, "learning_rate": 0.0004641170675363501, "loss": 3.7145, "step": 8450 }, { "epoch": 0.574466639489061, "grad_norm": 0.9186921119689941, "learning_rate": 0.0004640958350319337, "loss": 3.2855, "step": 8455 }, { "epoch": 0.5748063595597228, "grad_norm": 1.1536846160888672, "learning_rate": 0.00046407460252751733, "loss": 3.745, "step": 8460 }, { "epoch": 0.5751460796303846, "grad_norm": 0.8261640667915344, "learning_rate": 0.000464053370023101, "loss": 3.6783, "step": 8465 }, { "epoch": 0.5754857997010463, "grad_norm": 0.8631487488746643, "learning_rate": 0.0004640321375186846, "loss": 3.5461, "step": 8470 }, { "epoch": 0.5758255197717081, "grad_norm": 0.9185844659805298, "learning_rate": 0.0004640109050142682, "loss": 3.5031, "step": 8475 }, { "epoch": 0.5761652398423699, "grad_norm": 1.058948040008545, "learning_rate": 0.0004639896725098519, "loss": 3.4611, "step": 8480 }, { "epoch": 0.5765049599130316, "grad_norm": 1.076069712638855, "learning_rate": 0.00046396844000543556, "loss": 3.6835, "step": 8485 }, { "epoch": 0.5768446799836935, "grad_norm": 0.9464048147201538, "learning_rate": 0.00046394720750101917, "loss": 3.3596, "step": 8490 }, { "epoch": 0.5771844000543552, "grad_norm": 1.0705610513687134, "learning_rate": 0.00046392597499660284, "loss": 3.7126, "step": 8495 }, { "epoch": 0.577524120125017, "grad_norm": 0.7414143085479736, "learning_rate": 0.00046390474249218645, "loss": 3.6577, "step": 8500 }, { "epoch": 0.5778638401956787, "grad_norm": 1.956364631652832, "learning_rate": 0.00046388350998777006, "loss": 3.4839, "step": 8505 }, { "epoch": 0.5782035602663406, "grad_norm": 0.8971584439277649, "learning_rate": 0.00046386227748335373, "loss": 3.5495, "step": 8510 }, { "epoch": 0.5785432803370023, "grad_norm": 0.9079026579856873, "learning_rate": 0.00046384104497893734, "loss": 3.5485, "step": 8515 }, { "epoch": 0.578883000407664, "grad_norm": 0.8294443488121033, "learning_rate": 0.000463819812474521, "loss": 3.4171, "step": 8520 }, { "epoch": 0.5792227204783259, "grad_norm": 0.6865608096122742, "learning_rate": 0.0004637985799701047, "loss": 3.5304, "step": 8525 }, { "epoch": 0.5795624405489876, "grad_norm": 0.6489093899726868, "learning_rate": 0.0004637773474656883, "loss": 3.5493, "step": 8530 }, { "epoch": 0.5799021606196494, "grad_norm": 1.009374976158142, "learning_rate": 0.0004637561149612719, "loss": 3.492, "step": 8535 }, { "epoch": 0.5802418806903111, "grad_norm": 1.1420934200286865, "learning_rate": 0.00046373488245685557, "loss": 3.4822, "step": 8540 }, { "epoch": 0.580581600760973, "grad_norm": 1.1331899166107178, "learning_rate": 0.0004637136499524392, "loss": 3.7898, "step": 8545 }, { "epoch": 0.5809213208316347, "grad_norm": 1.2521001100540161, "learning_rate": 0.0004636924174480228, "loss": 3.3508, "step": 8550 }, { "epoch": 0.5812610409022965, "grad_norm": 0.8616347908973694, "learning_rate": 0.0004636711849436065, "loss": 3.742, "step": 8555 }, { "epoch": 0.5816007609729583, "grad_norm": 0.9102116227149963, "learning_rate": 0.00046364995243919013, "loss": 3.6064, "step": 8560 }, { "epoch": 0.5819404810436201, "grad_norm": 1.1602951288223267, "learning_rate": 0.00046362871993477374, "loss": 3.5475, "step": 8565 }, { "epoch": 0.5822802011142818, "grad_norm": 0.9259647727012634, "learning_rate": 0.0004636074874303574, "loss": 3.6004, "step": 8570 }, { "epoch": 0.5826199211849437, "grad_norm": 1.0286883115768433, "learning_rate": 0.000463586254925941, "loss": 3.6075, "step": 8575 }, { "epoch": 0.5829596412556054, "grad_norm": 0.8238376975059509, "learning_rate": 0.00046356502242152464, "loss": 3.4532, "step": 8580 }, { "epoch": 0.5832993613262671, "grad_norm": 0.8405466079711914, "learning_rate": 0.0004635437899171083, "loss": 3.6264, "step": 8585 }, { "epoch": 0.5836390813969289, "grad_norm": 0.9198514223098755, "learning_rate": 0.00046352255741269197, "loss": 3.5776, "step": 8590 }, { "epoch": 0.5839788014675907, "grad_norm": 0.7651392221450806, "learning_rate": 0.0004635013249082756, "loss": 3.8504, "step": 8595 }, { "epoch": 0.5843185215382525, "grad_norm": 0.8711015582084656, "learning_rate": 0.00046348009240385925, "loss": 3.5249, "step": 8600 }, { "epoch": 0.5846582416089142, "grad_norm": 1.0059685707092285, "learning_rate": 0.00046345885989944286, "loss": 3.8916, "step": 8605 }, { "epoch": 0.5849979616795761, "grad_norm": 1.2109445333480835, "learning_rate": 0.0004634376273950265, "loss": 3.5083, "step": 8610 }, { "epoch": 0.5853376817502378, "grad_norm": 0.7908108234405518, "learning_rate": 0.00046341639489061014, "loss": 3.6267, "step": 8615 }, { "epoch": 0.5856774018208996, "grad_norm": 1.1404927968978882, "learning_rate": 0.00046339516238619376, "loss": 3.4812, "step": 8620 }, { "epoch": 0.5860171218915613, "grad_norm": 0.9908376336097717, "learning_rate": 0.0004633739298817774, "loss": 3.6208, "step": 8625 }, { "epoch": 0.5863568419622232, "grad_norm": 1.010998249053955, "learning_rate": 0.0004633526973773611, "loss": 3.8308, "step": 8630 }, { "epoch": 0.5866965620328849, "grad_norm": 0.7181527614593506, "learning_rate": 0.0004633314648729447, "loss": 3.6616, "step": 8635 }, { "epoch": 0.5870362821035466, "grad_norm": 0.8619421124458313, "learning_rate": 0.0004633102323685283, "loss": 3.2036, "step": 8640 }, { "epoch": 0.5873760021742085, "grad_norm": 0.7832335829734802, "learning_rate": 0.000463288999864112, "loss": 3.9101, "step": 8645 }, { "epoch": 0.5877157222448702, "grad_norm": 0.7832859754562378, "learning_rate": 0.0004632677673596956, "loss": 3.906, "step": 8650 }, { "epoch": 0.588055442315532, "grad_norm": 0.8552325367927551, "learning_rate": 0.0004632465348552792, "loss": 3.4681, "step": 8655 }, { "epoch": 0.5883951623861938, "grad_norm": 1.155590295791626, "learning_rate": 0.00046322530235086293, "loss": 3.4337, "step": 8660 }, { "epoch": 0.5887348824568556, "grad_norm": 0.7940585017204285, "learning_rate": 0.00046320406984644654, "loss": 3.5771, "step": 8665 }, { "epoch": 0.5890746025275173, "grad_norm": 0.8319181203842163, "learning_rate": 0.00046318283734203016, "loss": 3.7548, "step": 8670 }, { "epoch": 0.589414322598179, "grad_norm": 0.8334320783615112, "learning_rate": 0.0004631616048376138, "loss": 3.6554, "step": 8675 }, { "epoch": 0.5897540426688409, "grad_norm": 0.7265830636024475, "learning_rate": 0.00046314037233319744, "loss": 3.6185, "step": 8680 }, { "epoch": 0.5900937627395026, "grad_norm": 0.8766757845878601, "learning_rate": 0.00046311913982878105, "loss": 3.4528, "step": 8685 }, { "epoch": 0.5904334828101644, "grad_norm": 0.9673860669136047, "learning_rate": 0.0004630979073243647, "loss": 3.3539, "step": 8690 }, { "epoch": 0.5907732028808262, "grad_norm": 0.753275990486145, "learning_rate": 0.0004630766748199484, "loss": 3.5676, "step": 8695 }, { "epoch": 0.591112922951488, "grad_norm": 0.8883139491081238, "learning_rate": 0.00046305544231553205, "loss": 3.641, "step": 8700 }, { "epoch": 0.5914526430221497, "grad_norm": 1.020063877105713, "learning_rate": 0.00046303420981111566, "loss": 3.7212, "step": 8705 }, { "epoch": 0.5917923630928115, "grad_norm": 1.012702226638794, "learning_rate": 0.0004630129773066993, "loss": 3.5299, "step": 8710 }, { "epoch": 0.5921320831634733, "grad_norm": 0.9492241144180298, "learning_rate": 0.00046299174480228294, "loss": 3.4561, "step": 8715 }, { "epoch": 0.5924718032341351, "grad_norm": 1.052270770072937, "learning_rate": 0.00046297051229786656, "loss": 3.5942, "step": 8720 }, { "epoch": 0.5928115233047968, "grad_norm": 0.9527101516723633, "learning_rate": 0.00046294927979345017, "loss": 3.6972, "step": 8725 }, { "epoch": 0.5931512433754587, "grad_norm": 15.030817985534668, "learning_rate": 0.0004629280472890339, "loss": 3.6915, "step": 8730 }, { "epoch": 0.5934909634461204, "grad_norm": 3.7890779972076416, "learning_rate": 0.0004629068147846175, "loss": 3.659, "step": 8735 }, { "epoch": 0.5938306835167821, "grad_norm": 0.9359475374221802, "learning_rate": 0.0004628855822802011, "loss": 3.5678, "step": 8740 }, { "epoch": 0.594170403587444, "grad_norm": 1.1487375497817993, "learning_rate": 0.0004628643497757848, "loss": 3.5941, "step": 8745 }, { "epoch": 0.5945101236581057, "grad_norm": 0.9198431372642517, "learning_rate": 0.0004628431172713684, "loss": 3.6968, "step": 8750 }, { "epoch": 0.5948498437287675, "grad_norm": 0.8088510036468506, "learning_rate": 0.000462821884766952, "loss": 3.465, "step": 8755 }, { "epoch": 0.5951895637994292, "grad_norm": 1.3213027715682983, "learning_rate": 0.0004628006522625357, "loss": 3.5969, "step": 8760 }, { "epoch": 0.5955292838700911, "grad_norm": 2.7065329551696777, "learning_rate": 0.00046277941975811934, "loss": 3.5626, "step": 8765 }, { "epoch": 0.5958690039407528, "grad_norm": 0.8254181146621704, "learning_rate": 0.00046275818725370296, "loss": 3.6698, "step": 8770 }, { "epoch": 0.5962087240114146, "grad_norm": 1.126598596572876, "learning_rate": 0.0004627369547492866, "loss": 3.6042, "step": 8775 }, { "epoch": 0.5965484440820764, "grad_norm": 0.8453353047370911, "learning_rate": 0.00046271572224487024, "loss": 3.5509, "step": 8780 }, { "epoch": 0.5968881641527382, "grad_norm": 0.9932458996772766, "learning_rate": 0.00046269448974045385, "loss": 3.5072, "step": 8785 }, { "epoch": 0.5972278842233999, "grad_norm": 4.577602386474609, "learning_rate": 0.0004626732572360375, "loss": 3.7078, "step": 8790 }, { "epoch": 0.5975676042940616, "grad_norm": 0.782005250453949, "learning_rate": 0.00046265202473162113, "loss": 3.7176, "step": 8795 }, { "epoch": 0.5979073243647235, "grad_norm": 1.0876250267028809, "learning_rate": 0.0004626307922272048, "loss": 3.5497, "step": 8800 }, { "epoch": 0.5982470444353852, "grad_norm": 0.8668751120567322, "learning_rate": 0.00046260955972278846, "loss": 3.5449, "step": 8805 }, { "epoch": 0.598586764506047, "grad_norm": 0.8464177846908569, "learning_rate": 0.0004625883272183721, "loss": 3.5925, "step": 8810 }, { "epoch": 0.5989264845767088, "grad_norm": 1.1895912885665894, "learning_rate": 0.0004625670947139557, "loss": 3.744, "step": 8815 }, { "epoch": 0.5992662046473706, "grad_norm": 0.9373434782028198, "learning_rate": 0.00046254586220953936, "loss": 3.3869, "step": 8820 }, { "epoch": 0.5996059247180323, "grad_norm": 1.7553216218948364, "learning_rate": 0.00046252462970512297, "loss": 3.4838, "step": 8825 }, { "epoch": 0.5999456447886942, "grad_norm": 0.7771705389022827, "learning_rate": 0.0004625033972007066, "loss": 3.4622, "step": 8830 }, { "epoch": 0.6002853648593559, "grad_norm": 1.5372329950332642, "learning_rate": 0.0004624821646962903, "loss": 3.6124, "step": 8835 }, { "epoch": 0.6006250849300176, "grad_norm": 0.8373830914497375, "learning_rate": 0.0004624609321918739, "loss": 3.8012, "step": 8840 }, { "epoch": 0.6009648050006794, "grad_norm": 0.8196567893028259, "learning_rate": 0.00046243969968745753, "loss": 3.5199, "step": 8845 }, { "epoch": 0.6013045250713412, "grad_norm": 0.8982058167457581, "learning_rate": 0.0004624184671830412, "loss": 3.7692, "step": 8850 }, { "epoch": 0.601644245142003, "grad_norm": 1.0460435152053833, "learning_rate": 0.0004623972346786248, "loss": 3.6256, "step": 8855 }, { "epoch": 0.6019839652126647, "grad_norm": 0.9102880954742432, "learning_rate": 0.0004623760021742084, "loss": 3.5208, "step": 8860 }, { "epoch": 0.6023236852833266, "grad_norm": 0.9894651174545288, "learning_rate": 0.00046235476966979215, "loss": 3.4936, "step": 8865 }, { "epoch": 0.6026634053539883, "grad_norm": 0.8317850828170776, "learning_rate": 0.00046233353716537576, "loss": 3.4836, "step": 8870 }, { "epoch": 0.6030031254246501, "grad_norm": 2.5010905265808105, "learning_rate": 0.00046231230466095937, "loss": 3.5218, "step": 8875 }, { "epoch": 0.6033428454953118, "grad_norm": 1.0112848281860352, "learning_rate": 0.00046229107215654304, "loss": 3.3859, "step": 8880 }, { "epoch": 0.6036825655659737, "grad_norm": 0.8860064744949341, "learning_rate": 0.00046226983965212665, "loss": 3.7785, "step": 8885 }, { "epoch": 0.6040222856366354, "grad_norm": 1.3213621377944946, "learning_rate": 0.00046224860714771026, "loss": 3.5352, "step": 8890 }, { "epoch": 0.6043620057072971, "grad_norm": 0.9340565204620361, "learning_rate": 0.00046222737464329393, "loss": 3.6229, "step": 8895 }, { "epoch": 0.604701725777959, "grad_norm": 0.9262966513633728, "learning_rate": 0.0004622061421388776, "loss": 3.7149, "step": 8900 }, { "epoch": 0.6050414458486207, "grad_norm": 1.0627355575561523, "learning_rate": 0.0004621849096344612, "loss": 3.464, "step": 8905 }, { "epoch": 0.6053811659192825, "grad_norm": 0.795377254486084, "learning_rate": 0.0004621636771300449, "loss": 3.5131, "step": 8910 }, { "epoch": 0.6057208859899443, "grad_norm": 1.2327501773834229, "learning_rate": 0.0004621424446256285, "loss": 3.484, "step": 8915 }, { "epoch": 0.6060606060606061, "grad_norm": 0.7634246349334717, "learning_rate": 0.0004621212121212121, "loss": 3.5731, "step": 8920 }, { "epoch": 0.6064003261312678, "grad_norm": 0.8679332137107849, "learning_rate": 0.00046209997961679577, "loss": 3.5441, "step": 8925 }, { "epoch": 0.6067400462019296, "grad_norm": 1.5829439163208008, "learning_rate": 0.0004620787471123794, "loss": 3.7493, "step": 8930 }, { "epoch": 0.6070797662725914, "grad_norm": 1.3126379251480103, "learning_rate": 0.00046205751460796305, "loss": 3.5692, "step": 8935 }, { "epoch": 0.6074194863432532, "grad_norm": 0.8255198001861572, "learning_rate": 0.0004620362821035467, "loss": 3.5466, "step": 8940 }, { "epoch": 0.6077592064139149, "grad_norm": 1.0258809328079224, "learning_rate": 0.00046201504959913033, "loss": 3.6526, "step": 8945 }, { "epoch": 0.6080989264845768, "grad_norm": 0.9340539574623108, "learning_rate": 0.00046199381709471394, "loss": 3.8197, "step": 8950 }, { "epoch": 0.6084386465552385, "grad_norm": 0.9238640069961548, "learning_rate": 0.0004619725845902976, "loss": 3.497, "step": 8955 }, { "epoch": 0.6087783666259002, "grad_norm": 0.8387094140052795, "learning_rate": 0.0004619513520858812, "loss": 3.6426, "step": 8960 }, { "epoch": 0.609118086696562, "grad_norm": 1.0382146835327148, "learning_rate": 0.00046193011958146484, "loss": 3.3636, "step": 8965 }, { "epoch": 0.6094578067672238, "grad_norm": 0.7303882241249084, "learning_rate": 0.00046190888707704856, "loss": 3.6239, "step": 8970 }, { "epoch": 0.6097975268378856, "grad_norm": 0.9969522953033447, "learning_rate": 0.00046188765457263217, "loss": 3.527, "step": 8975 }, { "epoch": 0.6101372469085473, "grad_norm": 1.005690097808838, "learning_rate": 0.0004618664220682158, "loss": 3.3985, "step": 8980 }, { "epoch": 0.6104769669792092, "grad_norm": 0.8273835778236389, "learning_rate": 0.00046184518956379945, "loss": 3.4046, "step": 8985 }, { "epoch": 0.6108166870498709, "grad_norm": 0.7939265370368958, "learning_rate": 0.00046182395705938307, "loss": 3.5328, "step": 8990 }, { "epoch": 0.6111564071205327, "grad_norm": 1.0987051725387573, "learning_rate": 0.0004618027245549667, "loss": 3.9092, "step": 8995 }, { "epoch": 0.6114961271911945, "grad_norm": 0.7866544127464294, "learning_rate": 0.00046178149205055035, "loss": 3.544, "step": 9000 }, { "epoch": 0.6118358472618562, "grad_norm": 0.9604711532592773, "learning_rate": 0.000461760259546134, "loss": 3.6771, "step": 9005 }, { "epoch": 0.612175567332518, "grad_norm": 0.9198035001754761, "learning_rate": 0.0004617390270417176, "loss": 3.5943, "step": 9010 }, { "epoch": 0.6125152874031797, "grad_norm": 0.8710593581199646, "learning_rate": 0.0004617177945373013, "loss": 3.5944, "step": 9015 }, { "epoch": 0.6128550074738416, "grad_norm": 0.8109589219093323, "learning_rate": 0.0004616965620328849, "loss": 3.6827, "step": 9020 }, { "epoch": 0.6131947275445033, "grad_norm": 1.1801024675369263, "learning_rate": 0.0004616753295284685, "loss": 3.5097, "step": 9025 }, { "epoch": 0.6135344476151651, "grad_norm": 0.9577484726905823, "learning_rate": 0.0004616540970240522, "loss": 3.4434, "step": 9030 }, { "epoch": 0.6138741676858269, "grad_norm": 0.9804816842079163, "learning_rate": 0.0004616328645196358, "loss": 3.4665, "step": 9035 }, { "epoch": 0.6142138877564887, "grad_norm": 0.6899266839027405, "learning_rate": 0.0004616116320152195, "loss": 3.7444, "step": 9040 }, { "epoch": 0.6145536078271504, "grad_norm": 1.0100535154342651, "learning_rate": 0.00046159039951080313, "loss": 3.7215, "step": 9045 }, { "epoch": 0.6148933278978121, "grad_norm": 0.8114267587661743, "learning_rate": 0.00046156916700638675, "loss": 3.606, "step": 9050 }, { "epoch": 0.615233047968474, "grad_norm": 1.00663423538208, "learning_rate": 0.0004615479345019704, "loss": 3.7594, "step": 9055 }, { "epoch": 0.6155727680391357, "grad_norm": 4.760237693786621, "learning_rate": 0.000461526701997554, "loss": 3.688, "step": 9060 }, { "epoch": 0.6159124881097975, "grad_norm": 0.7859367728233337, "learning_rate": 0.00046150546949313764, "loss": 3.745, "step": 9065 }, { "epoch": 0.6162522081804593, "grad_norm": 0.882507860660553, "learning_rate": 0.0004614842369887213, "loss": 3.4634, "step": 9070 }, { "epoch": 0.6165919282511211, "grad_norm": 0.9178627133369446, "learning_rate": 0.00046146300448430497, "loss": 3.3739, "step": 9075 }, { "epoch": 0.6169316483217828, "grad_norm": 0.6998899579048157, "learning_rate": 0.0004614417719798886, "loss": 3.6259, "step": 9080 }, { "epoch": 0.6172713683924447, "grad_norm": 0.8725610971450806, "learning_rate": 0.00046142053947547225, "loss": 3.6805, "step": 9085 }, { "epoch": 0.6176110884631064, "grad_norm": 4.87006139755249, "learning_rate": 0.00046139930697105587, "loss": 3.4203, "step": 9090 }, { "epoch": 0.6179508085337682, "grad_norm": 0.7953838109970093, "learning_rate": 0.0004613780744666395, "loss": 3.5376, "step": 9095 }, { "epoch": 0.6182905286044299, "grad_norm": 0.9432246088981628, "learning_rate": 0.00046135684196222315, "loss": 3.7042, "step": 9100 }, { "epoch": 0.6186302486750918, "grad_norm": 0.8905467391014099, "learning_rate": 0.00046133560945780676, "loss": 3.8461, "step": 9105 }, { "epoch": 0.6189699687457535, "grad_norm": 1.0755122900009155, "learning_rate": 0.0004613143769533904, "loss": 3.1763, "step": 9110 }, { "epoch": 0.6193096888164152, "grad_norm": 0.9225567579269409, "learning_rate": 0.0004612931444489741, "loss": 3.7286, "step": 9115 }, { "epoch": 0.6196494088870771, "grad_norm": 0.7830822467803955, "learning_rate": 0.0004612719119445577, "loss": 3.5736, "step": 9120 }, { "epoch": 0.6199891289577388, "grad_norm": 0.9648981690406799, "learning_rate": 0.0004612506794401413, "loss": 3.7685, "step": 9125 }, { "epoch": 0.6203288490284006, "grad_norm": 0.9156112670898438, "learning_rate": 0.000461229446935725, "loss": 3.6653, "step": 9130 }, { "epoch": 0.6206685690990623, "grad_norm": 0.8544549345970154, "learning_rate": 0.0004612082144313086, "loss": 3.7133, "step": 9135 }, { "epoch": 0.6210082891697242, "grad_norm": 0.7143400311470032, "learning_rate": 0.0004611869819268922, "loss": 3.4342, "step": 9140 }, { "epoch": 0.6213480092403859, "grad_norm": 1.011750340461731, "learning_rate": 0.00046116574942247593, "loss": 3.751, "step": 9145 }, { "epoch": 0.6216877293110477, "grad_norm": 0.5958943963050842, "learning_rate": 0.00046114451691805955, "loss": 3.4487, "step": 9150 }, { "epoch": 0.6220274493817095, "grad_norm": 0.7536500692367554, "learning_rate": 0.00046112328441364316, "loss": 3.5353, "step": 9155 }, { "epoch": 0.6223671694523712, "grad_norm": 0.8113008737564087, "learning_rate": 0.0004611020519092268, "loss": 3.605, "step": 9160 }, { "epoch": 0.622706889523033, "grad_norm": 0.9725861549377441, "learning_rate": 0.00046108081940481044, "loss": 3.5421, "step": 9165 }, { "epoch": 0.6230466095936948, "grad_norm": 0.9290528893470764, "learning_rate": 0.00046105958690039405, "loss": 3.5493, "step": 9170 }, { "epoch": 0.6233863296643566, "grad_norm": 0.9741479158401489, "learning_rate": 0.0004610383543959777, "loss": 3.6161, "step": 9175 }, { "epoch": 0.6237260497350183, "grad_norm": 1.1774533987045288, "learning_rate": 0.0004610171218915614, "loss": 3.2681, "step": 9180 }, { "epoch": 0.6240657698056801, "grad_norm": 0.7538641095161438, "learning_rate": 0.000460995889387145, "loss": 3.8644, "step": 9185 }, { "epoch": 0.6244054898763419, "grad_norm": 0.9997667670249939, "learning_rate": 0.00046097465688272867, "loss": 3.7068, "step": 9190 }, { "epoch": 0.6247452099470037, "grad_norm": 0.9657769203186035, "learning_rate": 0.0004609534243783123, "loss": 3.7854, "step": 9195 }, { "epoch": 0.6250849300176654, "grad_norm": 0.9157798290252686, "learning_rate": 0.0004609321918738959, "loss": 3.3871, "step": 9200 }, { "epoch": 0.6254246500883273, "grad_norm": 0.8904038071632385, "learning_rate": 0.00046091095936947956, "loss": 3.6366, "step": 9205 }, { "epoch": 0.625764370158989, "grad_norm": 1.110000491142273, "learning_rate": 0.00046088972686506317, "loss": 3.8531, "step": 9210 }, { "epoch": 0.6261040902296507, "grad_norm": 1.0073071718215942, "learning_rate": 0.00046086849436064684, "loss": 3.6096, "step": 9215 }, { "epoch": 0.6264438103003126, "grad_norm": 0.8766589760780334, "learning_rate": 0.0004608472618562305, "loss": 3.7387, "step": 9220 }, { "epoch": 0.6267835303709743, "grad_norm": 1.0139994621276855, "learning_rate": 0.0004608260293518141, "loss": 4.0488, "step": 9225 }, { "epoch": 0.6271232504416361, "grad_norm": 1.146382451057434, "learning_rate": 0.00046080479684739773, "loss": 3.5798, "step": 9230 }, { "epoch": 0.6274629705122978, "grad_norm": 1.0991572141647339, "learning_rate": 0.0004607835643429814, "loss": 3.5613, "step": 9235 }, { "epoch": 0.6278026905829597, "grad_norm": 0.9162105917930603, "learning_rate": 0.000460762331838565, "loss": 3.611, "step": 9240 }, { "epoch": 0.6281424106536214, "grad_norm": 1.1265350580215454, "learning_rate": 0.0004607410993341486, "loss": 3.5568, "step": 9245 }, { "epoch": 0.6284821307242832, "grad_norm": 1.0321701765060425, "learning_rate": 0.00046071986682973235, "loss": 3.4191, "step": 9250 }, { "epoch": 0.628821850794945, "grad_norm": 1.0759742259979248, "learning_rate": 0.00046069863432531596, "loss": 3.3326, "step": 9255 }, { "epoch": 0.6291615708656068, "grad_norm": 0.8322848081588745, "learning_rate": 0.0004606774018208996, "loss": 3.5711, "step": 9260 }, { "epoch": 0.6295012909362685, "grad_norm": 0.8317104578018188, "learning_rate": 0.00046065616931648324, "loss": 3.3688, "step": 9265 }, { "epoch": 0.6298410110069302, "grad_norm": 1.2199130058288574, "learning_rate": 0.00046063493681206685, "loss": 3.4358, "step": 9270 }, { "epoch": 0.6301807310775921, "grad_norm": 0.998866856098175, "learning_rate": 0.00046061370430765047, "loss": 3.5782, "step": 9275 }, { "epoch": 0.6305204511482538, "grad_norm": 0.7895593643188477, "learning_rate": 0.00046059247180323413, "loss": 3.7514, "step": 9280 }, { "epoch": 0.6308601712189156, "grad_norm": 0.8745203614234924, "learning_rate": 0.0004605712392988178, "loss": 3.7245, "step": 9285 }, { "epoch": 0.6311998912895774, "grad_norm": 0.9896027445793152, "learning_rate": 0.0004605500067944014, "loss": 3.2388, "step": 9290 }, { "epoch": 0.6315396113602392, "grad_norm": 0.7802938222885132, "learning_rate": 0.0004605287742899851, "loss": 3.5677, "step": 9295 }, { "epoch": 0.6318793314309009, "grad_norm": 0.9826740026473999, "learning_rate": 0.0004605075417855687, "loss": 3.7193, "step": 9300 }, { "epoch": 0.6322190515015628, "grad_norm": 0.8194214105606079, "learning_rate": 0.0004604863092811523, "loss": 3.5755, "step": 9305 }, { "epoch": 0.6325587715722245, "grad_norm": 0.7518576979637146, "learning_rate": 0.000460465076776736, "loss": 3.3924, "step": 9310 }, { "epoch": 0.6328984916428863, "grad_norm": 0.8938272595405579, "learning_rate": 0.0004604438442723196, "loss": 3.5096, "step": 9315 }, { "epoch": 0.633238211713548, "grad_norm": 0.7907491326332092, "learning_rate": 0.00046042261176790325, "loss": 3.4944, "step": 9320 }, { "epoch": 0.6335779317842098, "grad_norm": 1.103773593902588, "learning_rate": 0.0004604013792634869, "loss": 3.6755, "step": 9325 }, { "epoch": 0.6339176518548716, "grad_norm": 1.000203251838684, "learning_rate": 0.00046038014675907053, "loss": 3.725, "step": 9330 }, { "epoch": 0.6342573719255333, "grad_norm": 1.1501493453979492, "learning_rate": 0.00046035891425465415, "loss": 3.7583, "step": 9335 }, { "epoch": 0.6345970919961952, "grad_norm": 0.8807327151298523, "learning_rate": 0.0004603376817502378, "loss": 3.7172, "step": 9340 }, { "epoch": 0.6349368120668569, "grad_norm": 0.9867746233940125, "learning_rate": 0.0004603164492458214, "loss": 3.6328, "step": 9345 }, { "epoch": 0.6352765321375187, "grad_norm": 0.8152786493301392, "learning_rate": 0.00046029521674140504, "loss": 3.6449, "step": 9350 }, { "epoch": 0.6356162522081804, "grad_norm": 0.7000026702880859, "learning_rate": 0.00046027398423698876, "loss": 3.5353, "step": 9355 }, { "epoch": 0.6359559722788423, "grad_norm": 0.9022358059883118, "learning_rate": 0.0004602527517325724, "loss": 3.5007, "step": 9360 }, { "epoch": 0.636295692349504, "grad_norm": 0.918520450592041, "learning_rate": 0.000460231519228156, "loss": 3.7396, "step": 9365 }, { "epoch": 0.6366354124201657, "grad_norm": 0.7370713949203491, "learning_rate": 0.00046021028672373965, "loss": 3.8551, "step": 9370 }, { "epoch": 0.6369751324908276, "grad_norm": 1.0673881769180298, "learning_rate": 0.00046018905421932327, "loss": 3.5456, "step": 9375 }, { "epoch": 0.6373148525614893, "grad_norm": 0.8956276774406433, "learning_rate": 0.00046016782171490693, "loss": 3.4564, "step": 9380 }, { "epoch": 0.6376545726321511, "grad_norm": 0.710504949092865, "learning_rate": 0.00046014658921049055, "loss": 3.6144, "step": 9385 }, { "epoch": 0.6379942927028129, "grad_norm": 0.8338595628738403, "learning_rate": 0.0004601253567060742, "loss": 3.5579, "step": 9390 }, { "epoch": 0.6383340127734747, "grad_norm": 1.0136778354644775, "learning_rate": 0.0004601041242016579, "loss": 3.5695, "step": 9395 }, { "epoch": 0.6386737328441364, "grad_norm": 0.8059502243995667, "learning_rate": 0.0004600828916972415, "loss": 3.5128, "step": 9400 }, { "epoch": 0.6390134529147982, "grad_norm": 1.2197282314300537, "learning_rate": 0.0004600616591928251, "loss": 3.5161, "step": 9405 }, { "epoch": 0.63935317298546, "grad_norm": 0.8844379782676697, "learning_rate": 0.0004600404266884088, "loss": 3.547, "step": 9410 }, { "epoch": 0.6396928930561218, "grad_norm": 0.9357528686523438, "learning_rate": 0.0004600191941839924, "loss": 3.4304, "step": 9415 }, { "epoch": 0.6400326131267835, "grad_norm": 1.0188320875167847, "learning_rate": 0.000459997961679576, "loss": 3.5737, "step": 9420 }, { "epoch": 0.6403723331974454, "grad_norm": 0.910322904586792, "learning_rate": 0.0004599767291751597, "loss": 3.6286, "step": 9425 }, { "epoch": 0.6407120532681071, "grad_norm": 1.027490496635437, "learning_rate": 0.00045995549667074333, "loss": 3.4945, "step": 9430 }, { "epoch": 0.6410517733387688, "grad_norm": 1.080902099609375, "learning_rate": 0.00045993426416632695, "loss": 3.4382, "step": 9435 }, { "epoch": 0.6413914934094306, "grad_norm": 0.8632578253746033, "learning_rate": 0.0004599130316619106, "loss": 3.6217, "step": 9440 }, { "epoch": 0.6417312134800924, "grad_norm": 0.9357524514198303, "learning_rate": 0.00045989179915749423, "loss": 3.5353, "step": 9445 }, { "epoch": 0.6420709335507542, "grad_norm": 0.9288363456726074, "learning_rate": 0.00045987056665307784, "loss": 3.5511, "step": 9450 }, { "epoch": 0.6424106536214159, "grad_norm": 1.0764970779418945, "learning_rate": 0.00045984933414866156, "loss": 3.4911, "step": 9455 }, { "epoch": 0.6427503736920778, "grad_norm": 1.0171911716461182, "learning_rate": 0.0004598281016442452, "loss": 3.6483, "step": 9460 }, { "epoch": 0.6430900937627395, "grad_norm": 1.0251002311706543, "learning_rate": 0.0004598068691398288, "loss": 3.455, "step": 9465 }, { "epoch": 0.6434298138334013, "grad_norm": 1.2715011835098267, "learning_rate": 0.00045978563663541245, "loss": 3.5477, "step": 9470 }, { "epoch": 0.6437695339040631, "grad_norm": 0.8534067869186401, "learning_rate": 0.00045976440413099607, "loss": 3.7298, "step": 9475 }, { "epoch": 0.6441092539747248, "grad_norm": 0.8055020570755005, "learning_rate": 0.0004597431716265797, "loss": 3.4169, "step": 9480 }, { "epoch": 0.6444489740453866, "grad_norm": 0.8094373345375061, "learning_rate": 0.00045972193912216335, "loss": 4.0356, "step": 9485 }, { "epoch": 0.6447886941160483, "grad_norm": 0.9132965803146362, "learning_rate": 0.000459700706617747, "loss": 3.5045, "step": 9490 }, { "epoch": 0.6451284141867102, "grad_norm": 0.8611772060394287, "learning_rate": 0.00045967947411333063, "loss": 3.5241, "step": 9495 }, { "epoch": 0.6454681342573719, "grad_norm": 0.9592629671096802, "learning_rate": 0.0004596582416089143, "loss": 3.5198, "step": 9500 }, { "epoch": 0.6458078543280337, "grad_norm": 0.7513400912284851, "learning_rate": 0.0004596370091044979, "loss": 3.3757, "step": 9505 }, { "epoch": 0.6461475743986955, "grad_norm": 0.9203159213066101, "learning_rate": 0.0004596157766000815, "loss": 3.4903, "step": 9510 }, { "epoch": 0.6464872944693573, "grad_norm": 0.7978993654251099, "learning_rate": 0.0004595945440956652, "loss": 3.6998, "step": 9515 }, { "epoch": 0.646827014540019, "grad_norm": 1.1690152883529663, "learning_rate": 0.0004595733115912488, "loss": 3.6687, "step": 9520 }, { "epoch": 0.6471667346106807, "grad_norm": 0.9563865065574646, "learning_rate": 0.00045955207908683247, "loss": 3.6202, "step": 9525 }, { "epoch": 0.6475064546813426, "grad_norm": 0.7930966019630432, "learning_rate": 0.00045953084658241614, "loss": 3.6775, "step": 9530 }, { "epoch": 0.6478461747520043, "grad_norm": 0.8804518580436707, "learning_rate": 0.00045950961407799975, "loss": 3.5288, "step": 9535 }, { "epoch": 0.6481858948226661, "grad_norm": 1.1478990316390991, "learning_rate": 0.00045948838157358336, "loss": 3.7528, "step": 9540 }, { "epoch": 0.6485256148933279, "grad_norm": 1.0313726663589478, "learning_rate": 0.00045946714906916703, "loss": 3.7126, "step": 9545 }, { "epoch": 0.6488653349639897, "grad_norm": 0.8495091795921326, "learning_rate": 0.00045944591656475064, "loss": 3.7049, "step": 9550 }, { "epoch": 0.6492050550346514, "grad_norm": 0.7156116366386414, "learning_rate": 0.00045942468406033425, "loss": 3.5366, "step": 9555 }, { "epoch": 0.6495447751053133, "grad_norm": 0.8270073533058167, "learning_rate": 0.000459403451555918, "loss": 3.6527, "step": 9560 }, { "epoch": 0.649884495175975, "grad_norm": 2.5198137760162354, "learning_rate": 0.0004593822190515016, "loss": 3.7208, "step": 9565 }, { "epoch": 0.6502242152466368, "grad_norm": 0.9195221066474915, "learning_rate": 0.0004593609865470852, "loss": 3.5409, "step": 9570 }, { "epoch": 0.6505639353172985, "grad_norm": 0.9784737229347229, "learning_rate": 0.00045933975404266887, "loss": 3.3524, "step": 9575 }, { "epoch": 0.6509036553879604, "grad_norm": 0.8501176834106445, "learning_rate": 0.0004593185215382525, "loss": 3.5882, "step": 9580 }, { "epoch": 0.6512433754586221, "grad_norm": 1.1644701957702637, "learning_rate": 0.0004592972890338361, "loss": 3.4562, "step": 9585 }, { "epoch": 0.6515830955292838, "grad_norm": 0.6464702486991882, "learning_rate": 0.00045927605652941976, "loss": 3.5612, "step": 9590 }, { "epoch": 0.6519228155999457, "grad_norm": 0.998833417892456, "learning_rate": 0.00045925482402500343, "loss": 3.5895, "step": 9595 }, { "epoch": 0.6522625356706074, "grad_norm": 0.8270003199577332, "learning_rate": 0.00045923359152058704, "loss": 3.4, "step": 9600 }, { "epoch": 0.6526022557412692, "grad_norm": 0.7798835039138794, "learning_rate": 0.0004592123590161707, "loss": 3.6549, "step": 9605 }, { "epoch": 0.6529419758119309, "grad_norm": 0.9011204838752747, "learning_rate": 0.0004591911265117543, "loss": 3.6591, "step": 9610 }, { "epoch": 0.6532816958825928, "grad_norm": 0.8644018173217773, "learning_rate": 0.00045916989400733793, "loss": 3.5734, "step": 9615 }, { "epoch": 0.6536214159532545, "grad_norm": 0.9296190738677979, "learning_rate": 0.0004591486615029216, "loss": 3.5061, "step": 9620 }, { "epoch": 0.6539611360239163, "grad_norm": 1.0252554416656494, "learning_rate": 0.0004591274289985052, "loss": 3.7346, "step": 9625 }, { "epoch": 0.6543008560945781, "grad_norm": 1.0916153192520142, "learning_rate": 0.0004591061964940889, "loss": 3.5624, "step": 9630 }, { "epoch": 0.6546405761652399, "grad_norm": 0.8425079584121704, "learning_rate": 0.00045908496398967255, "loss": 3.5451, "step": 9635 }, { "epoch": 0.6549802962359016, "grad_norm": 1.0015895366668701, "learning_rate": 0.00045906373148525616, "loss": 3.515, "step": 9640 }, { "epoch": 0.6553200163065634, "grad_norm": 3.9878273010253906, "learning_rate": 0.0004590424989808398, "loss": 3.4685, "step": 9645 }, { "epoch": 0.6556597363772252, "grad_norm": 0.9477266073226929, "learning_rate": 0.00045902126647642344, "loss": 3.1723, "step": 9650 }, { "epoch": 0.6559994564478869, "grad_norm": 2.86820650100708, "learning_rate": 0.00045900003397200705, "loss": 3.6233, "step": 9655 }, { "epoch": 0.6563391765185487, "grad_norm": 0.9585785269737244, "learning_rate": 0.00045897880146759067, "loss": 3.9116, "step": 9660 }, { "epoch": 0.6566788965892105, "grad_norm": 1.4238396883010864, "learning_rate": 0.0004589575689631744, "loss": 3.448, "step": 9665 }, { "epoch": 0.6570186166598723, "grad_norm": 1.0368926525115967, "learning_rate": 0.000458936336458758, "loss": 3.1788, "step": 9670 }, { "epoch": 0.657358336730534, "grad_norm": 0.9537739753723145, "learning_rate": 0.0004589151039543416, "loss": 3.36, "step": 9675 }, { "epoch": 0.6576980568011959, "grad_norm": 0.822840690612793, "learning_rate": 0.0004588938714499253, "loss": 3.8255, "step": 9680 }, { "epoch": 0.6580377768718576, "grad_norm": 1.9501646757125854, "learning_rate": 0.0004588726389455089, "loss": 3.4015, "step": 9685 }, { "epoch": 0.6583774969425193, "grad_norm": 1.1545319557189941, "learning_rate": 0.0004588514064410925, "loss": 3.3183, "step": 9690 }, { "epoch": 0.6587172170131811, "grad_norm": 0.9598755836486816, "learning_rate": 0.0004588301739366762, "loss": 3.6636, "step": 9695 }, { "epoch": 0.6590569370838429, "grad_norm": 0.8198927640914917, "learning_rate": 0.00045880894143225984, "loss": 3.535, "step": 9700 }, { "epoch": 0.6593966571545047, "grad_norm": 0.9714953899383545, "learning_rate": 0.00045878770892784346, "loss": 3.5723, "step": 9705 }, { "epoch": 0.6597363772251664, "grad_norm": 0.6785500049591064, "learning_rate": 0.0004587664764234271, "loss": 3.561, "step": 9710 }, { "epoch": 0.6600760972958283, "grad_norm": 0.8565849661827087, "learning_rate": 0.00045874524391901074, "loss": 3.5828, "step": 9715 }, { "epoch": 0.66041581736649, "grad_norm": 0.7377936840057373, "learning_rate": 0.0004587240114145944, "loss": 4.0625, "step": 9720 }, { "epoch": 0.6607555374371518, "grad_norm": 0.929123044013977, "learning_rate": 0.000458702778910178, "loss": 3.6794, "step": 9725 }, { "epoch": 0.6610952575078136, "grad_norm": 0.9880486726760864, "learning_rate": 0.00045868154640576163, "loss": 3.2818, "step": 9730 }, { "epoch": 0.6614349775784754, "grad_norm": 2.7136390209198, "learning_rate": 0.00045866031390134535, "loss": 3.5045, "step": 9735 }, { "epoch": 0.6617746976491371, "grad_norm": 1.013340711593628, "learning_rate": 0.00045863908139692896, "loss": 3.5091, "step": 9740 }, { "epoch": 0.6621144177197988, "grad_norm": 0.8813528418540955, "learning_rate": 0.0004586178488925126, "loss": 3.5192, "step": 9745 }, { "epoch": 0.6624541377904607, "grad_norm": 1.1761775016784668, "learning_rate": 0.00045859661638809624, "loss": 3.6784, "step": 9750 }, { "epoch": 0.6627938578611224, "grad_norm": 1.0533415079116821, "learning_rate": 0.00045857538388367986, "loss": 3.7614, "step": 9755 }, { "epoch": 0.6631335779317842, "grad_norm": 1.0841963291168213, "learning_rate": 0.00045855415137926347, "loss": 3.5115, "step": 9760 }, { "epoch": 0.663473298002446, "grad_norm": 1.5362977981567383, "learning_rate": 0.00045853291887484714, "loss": 3.3164, "step": 9765 }, { "epoch": 0.6638130180731078, "grad_norm": 0.9944166541099548, "learning_rate": 0.0004585116863704308, "loss": 3.6459, "step": 9770 }, { "epoch": 0.6641527381437695, "grad_norm": 1.0781303644180298, "learning_rate": 0.0004584904538660144, "loss": 3.3531, "step": 9775 }, { "epoch": 0.6644924582144313, "grad_norm": 0.7228087186813354, "learning_rate": 0.0004584692213615981, "loss": 3.8196, "step": 9780 }, { "epoch": 0.6648321782850931, "grad_norm": 0.7790773510932922, "learning_rate": 0.0004584479888571817, "loss": 3.7605, "step": 9785 }, { "epoch": 0.6651718983557549, "grad_norm": 2.090298891067505, "learning_rate": 0.0004584267563527653, "loss": 3.4729, "step": 9790 }, { "epoch": 0.6655116184264166, "grad_norm": 1.0650956630706787, "learning_rate": 0.000458405523848349, "loss": 3.3582, "step": 9795 }, { "epoch": 0.6658513384970784, "grad_norm": 1.0348899364471436, "learning_rate": 0.0004583842913439326, "loss": 3.3882, "step": 9800 }, { "epoch": 0.6661910585677402, "grad_norm": 0.7861026525497437, "learning_rate": 0.00045836305883951626, "loss": 3.4832, "step": 9805 }, { "epoch": 0.6665307786384019, "grad_norm": 0.7690174579620361, "learning_rate": 0.0004583418263350999, "loss": 3.6063, "step": 9810 }, { "epoch": 0.6668704987090638, "grad_norm": 0.9716728329658508, "learning_rate": 0.00045832059383068354, "loss": 3.7318, "step": 9815 }, { "epoch": 0.6672102187797255, "grad_norm": 1.3865169286727905, "learning_rate": 0.00045829936132626715, "loss": 3.804, "step": 9820 }, { "epoch": 0.6675499388503873, "grad_norm": 0.9330248236656189, "learning_rate": 0.0004582781288218508, "loss": 3.6352, "step": 9825 }, { "epoch": 0.667889658921049, "grad_norm": 1.1931864023208618, "learning_rate": 0.00045825689631743443, "loss": 3.6322, "step": 9830 }, { "epoch": 0.6682293789917109, "grad_norm": 0.9977623224258423, "learning_rate": 0.00045823566381301804, "loss": 3.5664, "step": 9835 }, { "epoch": 0.6685690990623726, "grad_norm": 0.8810417652130127, "learning_rate": 0.00045821443130860176, "loss": 3.6205, "step": 9840 }, { "epoch": 0.6689088191330343, "grad_norm": 1.0680047273635864, "learning_rate": 0.0004581931988041854, "loss": 3.6669, "step": 9845 }, { "epoch": 0.6692485392036962, "grad_norm": 0.8277389407157898, "learning_rate": 0.000458171966299769, "loss": 3.4307, "step": 9850 }, { "epoch": 0.6695882592743579, "grad_norm": 1.0202958583831787, "learning_rate": 0.00045815073379535266, "loss": 3.8241, "step": 9855 }, { "epoch": 0.6699279793450197, "grad_norm": 0.7576801180839539, "learning_rate": 0.00045812950129093627, "loss": 3.6985, "step": 9860 }, { "epoch": 0.6702676994156814, "grad_norm": 0.7994440793991089, "learning_rate": 0.0004581082687865199, "loss": 3.7091, "step": 9865 }, { "epoch": 0.6706074194863433, "grad_norm": 0.9095951318740845, "learning_rate": 0.00045808703628210355, "loss": 3.4342, "step": 9870 }, { "epoch": 0.670947139557005, "grad_norm": 0.7999094724655151, "learning_rate": 0.0004580658037776872, "loss": 3.9598, "step": 9875 }, { "epoch": 0.6712868596276668, "grad_norm": 0.8752751350402832, "learning_rate": 0.00045804457127327083, "loss": 3.6725, "step": 9880 }, { "epoch": 0.6716265796983286, "grad_norm": 0.8329854607582092, "learning_rate": 0.0004580233387688545, "loss": 3.6641, "step": 9885 }, { "epoch": 0.6719662997689904, "grad_norm": 0.7222393155097961, "learning_rate": 0.0004580021062644381, "loss": 3.4672, "step": 9890 }, { "epoch": 0.6723060198396521, "grad_norm": 0.8520861268043518, "learning_rate": 0.0004579808737600217, "loss": 3.3425, "step": 9895 }, { "epoch": 0.672645739910314, "grad_norm": 0.8364282846450806, "learning_rate": 0.0004579596412556054, "loss": 3.6737, "step": 9900 }, { "epoch": 0.6729854599809757, "grad_norm": 0.8777763843536377, "learning_rate": 0.000457938408751189, "loss": 3.3635, "step": 9905 }, { "epoch": 0.6733251800516374, "grad_norm": 0.8812415599822998, "learning_rate": 0.00045791717624677267, "loss": 3.6374, "step": 9910 }, { "epoch": 0.6736649001222992, "grad_norm": 0.842777669429779, "learning_rate": 0.00045789594374235634, "loss": 3.7138, "step": 9915 }, { "epoch": 0.674004620192961, "grad_norm": 0.8989967703819275, "learning_rate": 0.00045787471123793995, "loss": 3.6487, "step": 9920 }, { "epoch": 0.6743443402636228, "grad_norm": 0.9392794370651245, "learning_rate": 0.00045785347873352356, "loss": 3.422, "step": 9925 }, { "epoch": 0.6746840603342845, "grad_norm": 0.6621817350387573, "learning_rate": 0.00045783224622910723, "loss": 3.5373, "step": 9930 }, { "epoch": 0.6750237804049464, "grad_norm": 1.0078260898590088, "learning_rate": 0.00045781101372469084, "loss": 3.6057, "step": 9935 }, { "epoch": 0.6753635004756081, "grad_norm": 1.0473227500915527, "learning_rate": 0.00045778978122027446, "loss": 3.6386, "step": 9940 }, { "epoch": 0.6757032205462699, "grad_norm": 3.8609049320220947, "learning_rate": 0.0004577685487158582, "loss": 3.5488, "step": 9945 }, { "epoch": 0.6760429406169316, "grad_norm": 0.8203685283660889, "learning_rate": 0.0004577473162114418, "loss": 3.8077, "step": 9950 }, { "epoch": 0.6763826606875935, "grad_norm": 0.9270029067993164, "learning_rate": 0.0004577260837070254, "loss": 3.5226, "step": 9955 }, { "epoch": 0.6767223807582552, "grad_norm": 0.9140582084655762, "learning_rate": 0.00045770485120260907, "loss": 3.6304, "step": 9960 }, { "epoch": 0.6770621008289169, "grad_norm": 0.9742290377616882, "learning_rate": 0.0004576836186981927, "loss": 3.5929, "step": 9965 }, { "epoch": 0.6774018208995788, "grad_norm": 1.282859444618225, "learning_rate": 0.0004576623861937763, "loss": 3.5009, "step": 9970 }, { "epoch": 0.6777415409702405, "grad_norm": 0.9461668133735657, "learning_rate": 0.00045764115368936, "loss": 3.5402, "step": 9975 }, { "epoch": 0.6780812610409023, "grad_norm": 0.8255449533462524, "learning_rate": 0.00045761992118494363, "loss": 3.5478, "step": 9980 }, { "epoch": 0.6784209811115641, "grad_norm": 1.2480297088623047, "learning_rate": 0.00045759868868052724, "loss": 3.5333, "step": 9985 }, { "epoch": 0.6787607011822259, "grad_norm": 0.7630453705787659, "learning_rate": 0.0004575774561761109, "loss": 3.6221, "step": 9990 }, { "epoch": 0.6791004212528876, "grad_norm": 0.8601205348968506, "learning_rate": 0.0004575562236716945, "loss": 3.303, "step": 9995 }, { "epoch": 0.6794401413235494, "grad_norm": 0.9809013605117798, "learning_rate": 0.00045753499116727814, "loss": 3.47, "step": 10000 }, { "epoch": 0.6797798613942112, "grad_norm": 0.9836441874504089, "learning_rate": 0.0004575137586628618, "loss": 3.8201, "step": 10005 }, { "epoch": 0.680119581464873, "grad_norm": 1.0007729530334473, "learning_rate": 0.00045749252615844547, "loss": 3.6478, "step": 10010 }, { "epoch": 0.6804593015355347, "grad_norm": 0.8302001357078552, "learning_rate": 0.0004574712936540291, "loss": 3.5738, "step": 10015 }, { "epoch": 0.6807990216061965, "grad_norm": 0.9424965977668762, "learning_rate": 0.00045745006114961275, "loss": 3.4707, "step": 10020 }, { "epoch": 0.6811387416768583, "grad_norm": 0.8781291842460632, "learning_rate": 0.00045742882864519636, "loss": 3.7083, "step": 10025 }, { "epoch": 0.68147846174752, "grad_norm": 0.9015694260597229, "learning_rate": 0.00045740759614078, "loss": 3.4671, "step": 10030 }, { "epoch": 0.6818181818181818, "grad_norm": 1.2038174867630005, "learning_rate": 0.00045738636363636364, "loss": 3.8655, "step": 10035 }, { "epoch": 0.6821579018888436, "grad_norm": 0.825340211391449, "learning_rate": 0.00045736513113194726, "loss": 3.5167, "step": 10040 }, { "epoch": 0.6824976219595054, "grad_norm": 1.032222867012024, "learning_rate": 0.0004573438986275309, "loss": 3.4334, "step": 10045 }, { "epoch": 0.6828373420301671, "grad_norm": 0.9650816917419434, "learning_rate": 0.0004573226661231146, "loss": 3.3831, "step": 10050 }, { "epoch": 0.683177062100829, "grad_norm": 0.9013504981994629, "learning_rate": 0.0004573014336186982, "loss": 3.6607, "step": 10055 }, { "epoch": 0.6835167821714907, "grad_norm": 1.056566596031189, "learning_rate": 0.00045728020111428187, "loss": 3.7816, "step": 10060 }, { "epoch": 0.6838565022421524, "grad_norm": 0.9176785349845886, "learning_rate": 0.0004572589686098655, "loss": 3.6593, "step": 10065 }, { "epoch": 0.6841962223128143, "grad_norm": 0.8258503079414368, "learning_rate": 0.0004572377361054491, "loss": 3.7927, "step": 10070 }, { "epoch": 0.684535942383476, "grad_norm": 1.0429688692092896, "learning_rate": 0.00045721650360103276, "loss": 3.2329, "step": 10075 }, { "epoch": 0.6848756624541378, "grad_norm": 1.0311551094055176, "learning_rate": 0.00045719527109661643, "loss": 3.5089, "step": 10080 }, { "epoch": 0.6852153825247995, "grad_norm": 0.8418834805488586, "learning_rate": 0.00045717403859220004, "loss": 3.5586, "step": 10085 }, { "epoch": 0.6855551025954614, "grad_norm": 0.9078406691551208, "learning_rate": 0.0004571528060877837, "loss": 3.4687, "step": 10090 }, { "epoch": 0.6858948226661231, "grad_norm": 0.9821232557296753, "learning_rate": 0.0004571315735833673, "loss": 3.534, "step": 10095 }, { "epoch": 0.6862345427367849, "grad_norm": 1.1640886068344116, "learning_rate": 0.00045711034107895094, "loss": 3.655, "step": 10100 }, { "epoch": 0.6865742628074467, "grad_norm": 0.8542157411575317, "learning_rate": 0.0004570891085745346, "loss": 3.4982, "step": 10105 }, { "epoch": 0.6869139828781085, "grad_norm": 0.8526970744132996, "learning_rate": 0.0004570678760701182, "loss": 3.571, "step": 10110 }, { "epoch": 0.6872537029487702, "grad_norm": 0.944163978099823, "learning_rate": 0.0004570466435657019, "loss": 3.6884, "step": 10115 }, { "epoch": 0.6875934230194319, "grad_norm": 1.2208729982376099, "learning_rate": 0.00045702541106128555, "loss": 3.319, "step": 10120 }, { "epoch": 0.6879331430900938, "grad_norm": 0.9452415704727173, "learning_rate": 0.00045700417855686916, "loss": 3.7208, "step": 10125 }, { "epoch": 0.6882728631607555, "grad_norm": 0.9409542083740234, "learning_rate": 0.0004569829460524528, "loss": 3.4302, "step": 10130 }, { "epoch": 0.6886125832314173, "grad_norm": 0.710719645023346, "learning_rate": 0.00045696171354803644, "loss": 3.6244, "step": 10135 }, { "epoch": 0.6889523033020791, "grad_norm": 1.1300169229507446, "learning_rate": 0.00045694048104362006, "loss": 3.4285, "step": 10140 }, { "epoch": 0.6892920233727409, "grad_norm": 0.8249067664146423, "learning_rate": 0.00045691924853920367, "loss": 3.685, "step": 10145 }, { "epoch": 0.6896317434434026, "grad_norm": 0.8984020352363586, "learning_rate": 0.0004568980160347874, "loss": 3.6933, "step": 10150 }, { "epoch": 0.6899714635140645, "grad_norm": 1.0149891376495361, "learning_rate": 0.000456876783530371, "loss": 3.5754, "step": 10155 }, { "epoch": 0.6903111835847262, "grad_norm": 0.9952086806297302, "learning_rate": 0.0004568555510259546, "loss": 3.4947, "step": 10160 }, { "epoch": 0.690650903655388, "grad_norm": 1.5736440420150757, "learning_rate": 0.0004568343185215383, "loss": 3.5016, "step": 10165 }, { "epoch": 0.6909906237260497, "grad_norm": 0.9171612858772278, "learning_rate": 0.0004568130860171219, "loss": 3.5443, "step": 10170 }, { "epoch": 0.6913303437967115, "grad_norm": 0.8679299354553223, "learning_rate": 0.0004567918535127055, "loss": 3.4249, "step": 10175 }, { "epoch": 0.6916700638673733, "grad_norm": 1.0204185247421265, "learning_rate": 0.0004567706210082892, "loss": 3.7859, "step": 10180 }, { "epoch": 0.692009783938035, "grad_norm": 0.8619295954704285, "learning_rate": 0.00045674938850387284, "loss": 3.1959, "step": 10185 }, { "epoch": 0.6923495040086969, "grad_norm": 0.8699820041656494, "learning_rate": 0.00045672815599945646, "loss": 3.4423, "step": 10190 }, { "epoch": 0.6926892240793586, "grad_norm": 0.8774884343147278, "learning_rate": 0.0004567069234950401, "loss": 3.6781, "step": 10195 }, { "epoch": 0.6930289441500204, "grad_norm": 0.8639410734176636, "learning_rate": 0.00045668569099062374, "loss": 3.5935, "step": 10200 }, { "epoch": 0.6933686642206821, "grad_norm": 1.0618315935134888, "learning_rate": 0.00045666445848620735, "loss": 3.3101, "step": 10205 }, { "epoch": 0.693708384291344, "grad_norm": 0.7706015110015869, "learning_rate": 0.000456643225981791, "loss": 3.5879, "step": 10210 }, { "epoch": 0.6940481043620057, "grad_norm": 0.9472137689590454, "learning_rate": 0.00045662199347737463, "loss": 3.6957, "step": 10215 }, { "epoch": 0.6943878244326674, "grad_norm": 1.115037202835083, "learning_rate": 0.0004566007609729583, "loss": 3.6865, "step": 10220 }, { "epoch": 0.6947275445033293, "grad_norm": 1.3475279808044434, "learning_rate": 0.00045657952846854196, "loss": 3.6007, "step": 10225 }, { "epoch": 0.695067264573991, "grad_norm": 0.9339306950569153, "learning_rate": 0.0004565582959641256, "loss": 3.6501, "step": 10230 }, { "epoch": 0.6954069846446528, "grad_norm": 0.9788309931755066, "learning_rate": 0.0004565370634597092, "loss": 3.6324, "step": 10235 }, { "epoch": 0.6957467047153146, "grad_norm": 0.7991085648536682, "learning_rate": 0.00045651583095529286, "loss": 3.5553, "step": 10240 }, { "epoch": 0.6960864247859764, "grad_norm": 0.8379011750221252, "learning_rate": 0.00045649459845087647, "loss": 3.7941, "step": 10245 }, { "epoch": 0.6964261448566381, "grad_norm": 0.8314170241355896, "learning_rate": 0.0004564733659464601, "loss": 3.5516, "step": 10250 }, { "epoch": 0.6967658649272999, "grad_norm": 1.0944492816925049, "learning_rate": 0.0004564521334420438, "loss": 3.4957, "step": 10255 }, { "epoch": 0.6971055849979617, "grad_norm": 1.1867443323135376, "learning_rate": 0.0004564309009376274, "loss": 3.6463, "step": 10260 }, { "epoch": 0.6974453050686235, "grad_norm": 1.0698970556259155, "learning_rate": 0.00045640966843321103, "loss": 3.5693, "step": 10265 }, { "epoch": 0.6977850251392852, "grad_norm": 0.8452979922294617, "learning_rate": 0.0004563884359287947, "loss": 3.728, "step": 10270 }, { "epoch": 0.698124745209947, "grad_norm": 0.9468900561332703, "learning_rate": 0.0004563672034243783, "loss": 3.4386, "step": 10275 }, { "epoch": 0.6984644652806088, "grad_norm": 1.0098146200180054, "learning_rate": 0.0004563459709199619, "loss": 3.5992, "step": 10280 }, { "epoch": 0.6988041853512705, "grad_norm": 0.7994229793548584, "learning_rate": 0.0004563247384155456, "loss": 3.5728, "step": 10285 }, { "epoch": 0.6991439054219323, "grad_norm": 0.9067478775978088, "learning_rate": 0.00045630350591112926, "loss": 3.4584, "step": 10290 }, { "epoch": 0.6994836254925941, "grad_norm": 0.9154629111289978, "learning_rate": 0.00045628227340671287, "loss": 3.2795, "step": 10295 }, { "epoch": 0.6998233455632559, "grad_norm": 0.7925425171852112, "learning_rate": 0.00045626104090229654, "loss": 3.5845, "step": 10300 }, { "epoch": 0.7001630656339176, "grad_norm": 1.1688870191574097, "learning_rate": 0.00045623980839788015, "loss": 3.5006, "step": 10305 }, { "epoch": 0.7005027857045795, "grad_norm": 1.1043447256088257, "learning_rate": 0.00045621857589346376, "loss": 3.4794, "step": 10310 }, { "epoch": 0.7008425057752412, "grad_norm": 1.0027509927749634, "learning_rate": 0.00045619734338904743, "loss": 3.6069, "step": 10315 }, { "epoch": 0.701182225845903, "grad_norm": 0.8364081978797913, "learning_rate": 0.00045617611088463104, "loss": 3.4533, "step": 10320 }, { "epoch": 0.7015219459165648, "grad_norm": 0.8713105916976929, "learning_rate": 0.0004561548783802147, "loss": 3.5355, "step": 10325 }, { "epoch": 0.7018616659872265, "grad_norm": 0.9970270991325378, "learning_rate": 0.0004561336458757984, "loss": 3.5548, "step": 10330 }, { "epoch": 0.7022013860578883, "grad_norm": 0.7746514081954956, "learning_rate": 0.000456112413371382, "loss": 3.5069, "step": 10335 }, { "epoch": 0.70254110612855, "grad_norm": 0.8967471718788147, "learning_rate": 0.0004560911808669656, "loss": 3.4011, "step": 10340 }, { "epoch": 0.7028808261992119, "grad_norm": 0.9691534042358398, "learning_rate": 0.00045606994836254927, "loss": 3.736, "step": 10345 }, { "epoch": 0.7032205462698736, "grad_norm": 0.9595784544944763, "learning_rate": 0.0004560487158581329, "loss": 3.4949, "step": 10350 }, { "epoch": 0.7035602663405354, "grad_norm": 0.8464652299880981, "learning_rate": 0.0004560274833537165, "loss": 3.594, "step": 10355 }, { "epoch": 0.7038999864111972, "grad_norm": 1.2040914297103882, "learning_rate": 0.0004560062508493002, "loss": 3.8107, "step": 10360 }, { "epoch": 0.704239706481859, "grad_norm": 1.1123908758163452, "learning_rate": 0.00045598501834488383, "loss": 3.3067, "step": 10365 }, { "epoch": 0.7045794265525207, "grad_norm": 1.1994515657424927, "learning_rate": 0.00045596378584046744, "loss": 3.6785, "step": 10370 }, { "epoch": 0.7049191466231824, "grad_norm": 0.8759797811508179, "learning_rate": 0.0004559425533360511, "loss": 3.4998, "step": 10375 }, { "epoch": 0.7052588666938443, "grad_norm": 0.7643389105796814, "learning_rate": 0.0004559213208316347, "loss": 3.6867, "step": 10380 }, { "epoch": 0.705598586764506, "grad_norm": 0.9667297005653381, "learning_rate": 0.00045590008832721834, "loss": 3.6454, "step": 10385 }, { "epoch": 0.7059383068351678, "grad_norm": 0.9207439422607422, "learning_rate": 0.000455878855822802, "loss": 3.5781, "step": 10390 }, { "epoch": 0.7062780269058296, "grad_norm": 0.8225126266479492, "learning_rate": 0.00045585762331838567, "loss": 3.6335, "step": 10395 }, { "epoch": 0.7066177469764914, "grad_norm": 0.8373608589172363, "learning_rate": 0.00045583639081396934, "loss": 3.549, "step": 10400 }, { "epoch": 0.7069574670471531, "grad_norm": 0.8856910467147827, "learning_rate": 0.00045581515830955295, "loss": 3.494, "step": 10405 }, { "epoch": 0.707297187117815, "grad_norm": 0.8334434032440186, "learning_rate": 0.00045579392580513657, "loss": 3.6768, "step": 10410 }, { "epoch": 0.7076369071884767, "grad_norm": 0.8110462427139282, "learning_rate": 0.00045577269330072023, "loss": 3.6806, "step": 10415 }, { "epoch": 0.7079766272591385, "grad_norm": 0.948260486125946, "learning_rate": 0.00045575146079630385, "loss": 3.6702, "step": 10420 }, { "epoch": 0.7083163473298002, "grad_norm": 0.9012405872344971, "learning_rate": 0.00045573022829188746, "loss": 3.7314, "step": 10425 }, { "epoch": 0.708656067400462, "grad_norm": 0.8277856111526489, "learning_rate": 0.0004557089957874712, "loss": 3.8158, "step": 10430 }, { "epoch": 0.7089957874711238, "grad_norm": 1.0936241149902344, "learning_rate": 0.0004556877632830548, "loss": 3.7613, "step": 10435 }, { "epoch": 0.7093355075417855, "grad_norm": 0.9220093488693237, "learning_rate": 0.0004556665307786384, "loss": 3.7407, "step": 10440 }, { "epoch": 0.7096752276124474, "grad_norm": 0.7949259281158447, "learning_rate": 0.00045564529827422207, "loss": 3.6837, "step": 10445 }, { "epoch": 0.7100149476831091, "grad_norm": 0.8399540185928345, "learning_rate": 0.0004556240657698057, "loss": 3.5911, "step": 10450 }, { "epoch": 0.7103546677537709, "grad_norm": 0.8649775981903076, "learning_rate": 0.0004556028332653893, "loss": 3.7716, "step": 10455 }, { "epoch": 0.7106943878244326, "grad_norm": 0.8853198885917664, "learning_rate": 0.00045558160076097297, "loss": 3.5613, "step": 10460 }, { "epoch": 0.7110341078950945, "grad_norm": 0.9997054934501648, "learning_rate": 0.00045556036825655663, "loss": 3.5428, "step": 10465 }, { "epoch": 0.7113738279657562, "grad_norm": 0.705693781375885, "learning_rate": 0.00045553913575214025, "loss": 3.7214, "step": 10470 }, { "epoch": 0.711713548036418, "grad_norm": 0.989611029624939, "learning_rate": 0.0004555179032477239, "loss": 3.4905, "step": 10475 }, { "epoch": 0.7120532681070798, "grad_norm": 2.179896593093872, "learning_rate": 0.0004554966707433075, "loss": 3.5856, "step": 10480 }, { "epoch": 0.7123929881777415, "grad_norm": 1.0092360973358154, "learning_rate": 0.00045547543823889114, "loss": 3.2657, "step": 10485 }, { "epoch": 0.7127327082484033, "grad_norm": 0.8293630480766296, "learning_rate": 0.0004554542057344748, "loss": 3.6127, "step": 10490 }, { "epoch": 0.7130724283190651, "grad_norm": 1.2011226415634155, "learning_rate": 0.0004554329732300584, "loss": 3.6147, "step": 10495 }, { "epoch": 0.7134121483897269, "grad_norm": 0.9297786355018616, "learning_rate": 0.0004554117407256421, "loss": 3.5582, "step": 10500 }, { "epoch": 0.7137518684603886, "grad_norm": 1.0013937950134277, "learning_rate": 0.00045539050822122575, "loss": 3.6122, "step": 10505 }, { "epoch": 0.7140915885310504, "grad_norm": 1.0570484399795532, "learning_rate": 0.00045536927571680937, "loss": 3.8492, "step": 10510 }, { "epoch": 0.7144313086017122, "grad_norm": 0.8673052787780762, "learning_rate": 0.000455348043212393, "loss": 3.4618, "step": 10515 }, { "epoch": 0.714771028672374, "grad_norm": 1.160805583000183, "learning_rate": 0.00045532681070797665, "loss": 3.514, "step": 10520 }, { "epoch": 0.7151107487430357, "grad_norm": 0.9520072340965271, "learning_rate": 0.00045530557820356026, "loss": 3.5139, "step": 10525 }, { "epoch": 0.7154504688136976, "grad_norm": 0.7889097332954407, "learning_rate": 0.00045528434569914387, "loss": 3.3948, "step": 10530 }, { "epoch": 0.7157901888843593, "grad_norm": 0.8136929869651794, "learning_rate": 0.0004552631131947276, "loss": 3.6394, "step": 10535 }, { "epoch": 0.716129908955021, "grad_norm": 0.976205050945282, "learning_rate": 0.0004552418806903112, "loss": 3.6023, "step": 10540 }, { "epoch": 0.7164696290256828, "grad_norm": 0.9447394013404846, "learning_rate": 0.0004552206481858948, "loss": 3.4746, "step": 10545 }, { "epoch": 0.7168093490963446, "grad_norm": 0.9591748118400574, "learning_rate": 0.0004551994156814785, "loss": 3.7164, "step": 10550 }, { "epoch": 0.7171490691670064, "grad_norm": 0.8005481362342834, "learning_rate": 0.0004551781831770621, "loss": 3.65, "step": 10555 }, { "epoch": 0.7174887892376681, "grad_norm": 0.7825222611427307, "learning_rate": 0.0004551569506726457, "loss": 3.6441, "step": 10560 }, { "epoch": 0.71782850930833, "grad_norm": 0.7435761094093323, "learning_rate": 0.00045513571816822943, "loss": 3.7197, "step": 10565 }, { "epoch": 0.7181682293789917, "grad_norm": 0.9320538640022278, "learning_rate": 0.00045511448566381305, "loss": 3.7945, "step": 10570 }, { "epoch": 0.7185079494496535, "grad_norm": 1.3199293613433838, "learning_rate": 0.00045509325315939666, "loss": 3.5319, "step": 10575 }, { "epoch": 0.7188476695203153, "grad_norm": 1.0553083419799805, "learning_rate": 0.0004550720206549803, "loss": 3.5761, "step": 10580 }, { "epoch": 0.719187389590977, "grad_norm": 1.0768038034439087, "learning_rate": 0.00045505078815056394, "loss": 3.381, "step": 10585 }, { "epoch": 0.7195271096616388, "grad_norm": 1.4266234636306763, "learning_rate": 0.00045502955564614755, "loss": 3.7401, "step": 10590 }, { "epoch": 0.7198668297323005, "grad_norm": 0.9246065020561218, "learning_rate": 0.0004550083231417312, "loss": 3.3828, "step": 10595 }, { "epoch": 0.7202065498029624, "grad_norm": 0.9452723264694214, "learning_rate": 0.0004549870906373149, "loss": 3.4547, "step": 10600 }, { "epoch": 0.7205462698736241, "grad_norm": 0.9403240084648132, "learning_rate": 0.0004549658581328985, "loss": 3.6254, "step": 10605 }, { "epoch": 0.7208859899442859, "grad_norm": 0.7218195199966431, "learning_rate": 0.00045494462562848217, "loss": 3.6307, "step": 10610 }, { "epoch": 0.7212257100149477, "grad_norm": 0.7235953211784363, "learning_rate": 0.0004549233931240658, "loss": 3.5648, "step": 10615 }, { "epoch": 0.7215654300856095, "grad_norm": 0.9723560810089111, "learning_rate": 0.0004549021606196494, "loss": 3.5488, "step": 10620 }, { "epoch": 0.7219051501562712, "grad_norm": 0.8729822039604187, "learning_rate": 0.00045488092811523306, "loss": 3.728, "step": 10625 }, { "epoch": 0.722244870226933, "grad_norm": 0.7662131190299988, "learning_rate": 0.00045485969561081667, "loss": 3.6201, "step": 10630 }, { "epoch": 0.7225845902975948, "grad_norm": 1.4485126733779907, "learning_rate": 0.00045483846310640034, "loss": 3.5443, "step": 10635 }, { "epoch": 0.7229243103682566, "grad_norm": 0.8402880430221558, "learning_rate": 0.000454817230601984, "loss": 3.4858, "step": 10640 }, { "epoch": 0.7232640304389183, "grad_norm": 1.1343055963516235, "learning_rate": 0.0004547959980975676, "loss": 3.6001, "step": 10645 }, { "epoch": 0.7236037505095801, "grad_norm": 1.377893328666687, "learning_rate": 0.00045477476559315123, "loss": 3.4448, "step": 10650 }, { "epoch": 0.7239434705802419, "grad_norm": 0.8980259895324707, "learning_rate": 0.0004547535330887349, "loss": 3.5425, "step": 10655 }, { "epoch": 0.7242831906509036, "grad_norm": 0.9098309874534607, "learning_rate": 0.0004547323005843185, "loss": 3.2038, "step": 10660 }, { "epoch": 0.7246229107215655, "grad_norm": 0.9590654969215393, "learning_rate": 0.0004547110680799021, "loss": 3.6157, "step": 10665 }, { "epoch": 0.7249626307922272, "grad_norm": 0.9119604229927063, "learning_rate": 0.00045468983557548585, "loss": 3.5822, "step": 10670 }, { "epoch": 0.725302350862889, "grad_norm": 0.9190789461135864, "learning_rate": 0.00045466860307106946, "loss": 3.6923, "step": 10675 }, { "epoch": 0.7256420709335507, "grad_norm": 0.8660645484924316, "learning_rate": 0.00045464737056665307, "loss": 3.6354, "step": 10680 }, { "epoch": 0.7259817910042126, "grad_norm": 1.1816734075546265, "learning_rate": 0.00045462613806223674, "loss": 3.5288, "step": 10685 }, { "epoch": 0.7263215110748743, "grad_norm": 1.1365407705307007, "learning_rate": 0.00045460490555782035, "loss": 3.6322, "step": 10690 }, { "epoch": 0.726661231145536, "grad_norm": 0.8032805323600769, "learning_rate": 0.00045458367305340397, "loss": 3.6401, "step": 10695 }, { "epoch": 0.7270009512161979, "grad_norm": 1.0391963720321655, "learning_rate": 0.00045456244054898763, "loss": 3.7568, "step": 10700 }, { "epoch": 0.7273406712868596, "grad_norm": 0.9448873996734619, "learning_rate": 0.0004545412080445713, "loss": 3.6928, "step": 10705 }, { "epoch": 0.7276803913575214, "grad_norm": 1.0348738431930542, "learning_rate": 0.0004545199755401549, "loss": 3.7946, "step": 10710 }, { "epoch": 0.7280201114281831, "grad_norm": 0.8416638374328613, "learning_rate": 0.0004544987430357386, "loss": 3.7151, "step": 10715 }, { "epoch": 0.728359831498845, "grad_norm": 1.0647612810134888, "learning_rate": 0.0004544775105313222, "loss": 3.4337, "step": 10720 }, { "epoch": 0.7286995515695067, "grad_norm": 0.7164780497550964, "learning_rate": 0.0004544562780269058, "loss": 3.6502, "step": 10725 }, { "epoch": 0.7290392716401685, "grad_norm": 0.8441203236579895, "learning_rate": 0.0004544350455224895, "loss": 3.4141, "step": 10730 }, { "epoch": 0.7293789917108303, "grad_norm": 1.11995530128479, "learning_rate": 0.0004544138130180731, "loss": 3.6809, "step": 10735 }, { "epoch": 0.7297187117814921, "grad_norm": 0.8134137392044067, "learning_rate": 0.0004543925805136568, "loss": 3.7513, "step": 10740 }, { "epoch": 0.7300584318521538, "grad_norm": 0.9562360644340515, "learning_rate": 0.0004543713480092404, "loss": 3.423, "step": 10745 }, { "epoch": 0.7303981519228157, "grad_norm": 1.4029264450073242, "learning_rate": 0.00045435011550482403, "loss": 3.6142, "step": 10750 }, { "epoch": 0.7307378719934774, "grad_norm": 0.8394050002098083, "learning_rate": 0.0004543288830004077, "loss": 3.5378, "step": 10755 }, { "epoch": 0.7310775920641391, "grad_norm": 0.9333934783935547, "learning_rate": 0.0004543076504959913, "loss": 3.7519, "step": 10760 }, { "epoch": 0.7314173121348009, "grad_norm": 1.0486069917678833, "learning_rate": 0.0004542864179915749, "loss": 3.6135, "step": 10765 }, { "epoch": 0.7317570322054627, "grad_norm": 0.9448487162590027, "learning_rate": 0.0004542651854871586, "loss": 3.4772, "step": 10770 }, { "epoch": 0.7320967522761245, "grad_norm": 0.8600449562072754, "learning_rate": 0.00045424395298274226, "loss": 3.7339, "step": 10775 }, { "epoch": 0.7324364723467862, "grad_norm": 0.8217036724090576, "learning_rate": 0.0004542227204783259, "loss": 3.5561, "step": 10780 }, { "epoch": 0.7327761924174481, "grad_norm": 0.9303250908851624, "learning_rate": 0.00045420148797390954, "loss": 3.7071, "step": 10785 }, { "epoch": 0.7331159124881098, "grad_norm": 0.9331584572792053, "learning_rate": 0.00045418025546949315, "loss": 3.471, "step": 10790 }, { "epoch": 0.7334556325587716, "grad_norm": 1.1170289516448975, "learning_rate": 0.00045415902296507677, "loss": 3.5597, "step": 10795 }, { "epoch": 0.7337953526294333, "grad_norm": 1.1373021602630615, "learning_rate": 0.00045413779046066043, "loss": 3.5056, "step": 10800 }, { "epoch": 0.7341350727000951, "grad_norm": 1.125393271446228, "learning_rate": 0.00045411655795624405, "loss": 3.65, "step": 10805 }, { "epoch": 0.7344747927707569, "grad_norm": 0.8622141480445862, "learning_rate": 0.0004540953254518277, "loss": 3.7062, "step": 10810 }, { "epoch": 0.7348145128414186, "grad_norm": 0.9908720850944519, "learning_rate": 0.0004540740929474114, "loss": 3.6258, "step": 10815 }, { "epoch": 0.7351542329120805, "grad_norm": 0.7416966557502747, "learning_rate": 0.000454052860442995, "loss": 3.5816, "step": 10820 }, { "epoch": 0.7354939529827422, "grad_norm": 0.7208784222602844, "learning_rate": 0.0004540316279385786, "loss": 3.667, "step": 10825 }, { "epoch": 0.735833673053404, "grad_norm": 1.0106205940246582, "learning_rate": 0.0004540103954341623, "loss": 3.4548, "step": 10830 }, { "epoch": 0.7361733931240658, "grad_norm": 0.9159315228462219, "learning_rate": 0.0004539891629297459, "loss": 3.6485, "step": 10835 }, { "epoch": 0.7365131131947276, "grad_norm": 0.8759803771972656, "learning_rate": 0.0004539679304253295, "loss": 3.4822, "step": 10840 }, { "epoch": 0.7368528332653893, "grad_norm": 0.9853613972663879, "learning_rate": 0.0004539466979209132, "loss": 3.4231, "step": 10845 }, { "epoch": 0.737192553336051, "grad_norm": 0.9251391291618347, "learning_rate": 0.00045392546541649683, "loss": 3.6672, "step": 10850 }, { "epoch": 0.7375322734067129, "grad_norm": 0.7719985842704773, "learning_rate": 0.00045390423291208045, "loss": 3.5448, "step": 10855 }, { "epoch": 0.7378719934773746, "grad_norm": 0.8033459186553955, "learning_rate": 0.0004538830004076641, "loss": 3.6355, "step": 10860 }, { "epoch": 0.7382117135480364, "grad_norm": 0.7925296425819397, "learning_rate": 0.00045386176790324773, "loss": 3.6984, "step": 10865 }, { "epoch": 0.7385514336186982, "grad_norm": 0.8499897718429565, "learning_rate": 0.00045384053539883134, "loss": 3.5504, "step": 10870 }, { "epoch": 0.73889115368936, "grad_norm": 1.048766016960144, "learning_rate": 0.000453819302894415, "loss": 3.6068, "step": 10875 }, { "epoch": 0.7392308737600217, "grad_norm": 1.1406528949737549, "learning_rate": 0.0004537980703899987, "loss": 3.6045, "step": 10880 }, { "epoch": 0.7395705938306835, "grad_norm": 0.8239940404891968, "learning_rate": 0.0004537768378855823, "loss": 3.5414, "step": 10885 }, { "epoch": 0.7399103139013453, "grad_norm": 1.0461480617523193, "learning_rate": 0.00045375560538116595, "loss": 3.4963, "step": 10890 }, { "epoch": 0.7402500339720071, "grad_norm": 0.9129328727722168, "learning_rate": 0.00045373437287674957, "loss": 3.5619, "step": 10895 }, { "epoch": 0.7405897540426688, "grad_norm": 0.9162158370018005, "learning_rate": 0.0004537131403723332, "loss": 3.5378, "step": 10900 }, { "epoch": 0.7409294741133307, "grad_norm": 0.8316112160682678, "learning_rate": 0.00045369190786791685, "loss": 3.675, "step": 10905 }, { "epoch": 0.7412691941839924, "grad_norm": 0.9143354892730713, "learning_rate": 0.00045367067536350046, "loss": 3.5652, "step": 10910 }, { "epoch": 0.7416089142546541, "grad_norm": 0.7671268582344055, "learning_rate": 0.00045364944285908413, "loss": 3.8238, "step": 10915 }, { "epoch": 0.741948634325316, "grad_norm": 1.0676747560501099, "learning_rate": 0.0004536282103546678, "loss": 3.4891, "step": 10920 }, { "epoch": 0.7422883543959777, "grad_norm": 1.1405770778656006, "learning_rate": 0.0004536069778502514, "loss": 3.4969, "step": 10925 }, { "epoch": 0.7426280744666395, "grad_norm": 1.1328486204147339, "learning_rate": 0.000453585745345835, "loss": 3.649, "step": 10930 }, { "epoch": 0.7429677945373012, "grad_norm": 1.004883885383606, "learning_rate": 0.0004535645128414187, "loss": 3.4483, "step": 10935 }, { "epoch": 0.7433075146079631, "grad_norm": 1.2241405248641968, "learning_rate": 0.0004535432803370023, "loss": 3.5548, "step": 10940 }, { "epoch": 0.7436472346786248, "grad_norm": 0.8886626362800598, "learning_rate": 0.0004535220478325859, "loss": 3.519, "step": 10945 }, { "epoch": 0.7439869547492866, "grad_norm": 0.8079833388328552, "learning_rate": 0.00045350081532816963, "loss": 3.9113, "step": 10950 }, { "epoch": 0.7443266748199484, "grad_norm": 0.8787316679954529, "learning_rate": 0.00045347958282375325, "loss": 3.4017, "step": 10955 }, { "epoch": 0.7446663948906102, "grad_norm": 0.7876694202423096, "learning_rate": 0.00045345835031933686, "loss": 3.5106, "step": 10960 }, { "epoch": 0.7450061149612719, "grad_norm": 1.3154869079589844, "learning_rate": 0.00045343711781492053, "loss": 3.773, "step": 10965 }, { "epoch": 0.7453458350319336, "grad_norm": 1.1527464389801025, "learning_rate": 0.00045341588531050414, "loss": 3.4004, "step": 10970 }, { "epoch": 0.7456855551025955, "grad_norm": 0.9307451248168945, "learning_rate": 0.00045339465280608775, "loss": 3.6317, "step": 10975 }, { "epoch": 0.7460252751732572, "grad_norm": 1.070565938949585, "learning_rate": 0.0004533734203016714, "loss": 3.4634, "step": 10980 }, { "epoch": 0.746364995243919, "grad_norm": 1.0132933855056763, "learning_rate": 0.0004533521877972551, "loss": 3.614, "step": 10985 }, { "epoch": 0.7467047153145808, "grad_norm": 0.9729132652282715, "learning_rate": 0.0004533309552928387, "loss": 3.4752, "step": 10990 }, { "epoch": 0.7470444353852426, "grad_norm": 4.9938130378723145, "learning_rate": 0.00045330972278842237, "loss": 3.3451, "step": 10995 }, { "epoch": 0.7473841554559043, "grad_norm": 1.0616689920425415, "learning_rate": 0.000453288490284006, "loss": 3.4061, "step": 11000 }, { "epoch": 0.7477238755265662, "grad_norm": 1.1283382177352905, "learning_rate": 0.0004532672577795896, "loss": 3.5017, "step": 11005 }, { "epoch": 0.7480635955972279, "grad_norm": 1.3812631368637085, "learning_rate": 0.00045324602527517326, "loss": 3.5218, "step": 11010 }, { "epoch": 0.7484033156678896, "grad_norm": 1.3514049053192139, "learning_rate": 0.0004532247927707569, "loss": 3.2909, "step": 11015 }, { "epoch": 0.7487430357385514, "grad_norm": 0.964059054851532, "learning_rate": 0.00045320356026634054, "loss": 3.6418, "step": 11020 }, { "epoch": 0.7490827558092132, "grad_norm": 1.34296452999115, "learning_rate": 0.0004531823277619242, "loss": 3.5609, "step": 11025 }, { "epoch": 0.749422475879875, "grad_norm": 1.1066815853118896, "learning_rate": 0.0004531610952575078, "loss": 3.6046, "step": 11030 }, { "epoch": 0.7497621959505367, "grad_norm": 0.8989728689193726, "learning_rate": 0.00045313986275309143, "loss": 3.6269, "step": 11035 }, { "epoch": 0.7501019160211986, "grad_norm": 0.9264765977859497, "learning_rate": 0.0004531186302486751, "loss": 3.7, "step": 11040 }, { "epoch": 0.7504416360918603, "grad_norm": 0.7904105186462402, "learning_rate": 0.0004530973977442587, "loss": 3.6285, "step": 11045 }, { "epoch": 0.7507813561625221, "grad_norm": 0.9528473019599915, "learning_rate": 0.00045307616523984233, "loss": 3.6208, "step": 11050 }, { "epoch": 0.7511210762331838, "grad_norm": 0.9605002999305725, "learning_rate": 0.00045305493273542605, "loss": 3.291, "step": 11055 }, { "epoch": 0.7514607963038457, "grad_norm": 0.9139696359634399, "learning_rate": 0.00045303370023100966, "loss": 3.6377, "step": 11060 }, { "epoch": 0.7518005163745074, "grad_norm": 0.9906828999519348, "learning_rate": 0.0004530124677265933, "loss": 3.7039, "step": 11065 }, { "epoch": 0.7521402364451691, "grad_norm": 1.097412109375, "learning_rate": 0.00045299123522217694, "loss": 3.468, "step": 11070 }, { "epoch": 0.752479956515831, "grad_norm": 0.8708056807518005, "learning_rate": 0.00045297000271776055, "loss": 3.6218, "step": 11075 }, { "epoch": 0.7528196765864927, "grad_norm": 0.7851992845535278, "learning_rate": 0.0004529487702133442, "loss": 3.5418, "step": 11080 }, { "epoch": 0.7531593966571545, "grad_norm": 1.209282398223877, "learning_rate": 0.00045292753770892783, "loss": 3.4122, "step": 11085 }, { "epoch": 0.7534991167278163, "grad_norm": 1.1235805749893188, "learning_rate": 0.0004529063052045115, "loss": 3.548, "step": 11090 }, { "epoch": 0.7538388367984781, "grad_norm": 0.7499932646751404, "learning_rate": 0.00045288507270009517, "loss": 3.7028, "step": 11095 }, { "epoch": 0.7541785568691398, "grad_norm": 0.9098990559577942, "learning_rate": 0.0004528638401956788, "loss": 3.4197, "step": 11100 }, { "epoch": 0.7545182769398016, "grad_norm": 0.7628928422927856, "learning_rate": 0.0004528426076912624, "loss": 3.5887, "step": 11105 }, { "epoch": 0.7548579970104634, "grad_norm": 0.8299646377563477, "learning_rate": 0.00045282137518684606, "loss": 3.4443, "step": 11110 }, { "epoch": 0.7551977170811252, "grad_norm": 1.2688863277435303, "learning_rate": 0.0004528001426824297, "loss": 3.5003, "step": 11115 }, { "epoch": 0.7555374371517869, "grad_norm": 1.0131100416183472, "learning_rate": 0.0004527789101780133, "loss": 3.5775, "step": 11120 }, { "epoch": 0.7558771572224487, "grad_norm": 0.8182028532028198, "learning_rate": 0.000452757677673597, "loss": 3.7111, "step": 11125 }, { "epoch": 0.7562168772931105, "grad_norm": 0.9067591428756714, "learning_rate": 0.0004527364451691806, "loss": 3.6549, "step": 11130 }, { "epoch": 0.7565565973637722, "grad_norm": 0.9048811793327332, "learning_rate": 0.00045271521266476424, "loss": 3.7614, "step": 11135 }, { "epoch": 0.756896317434434, "grad_norm": 0.7947191596031189, "learning_rate": 0.0004526939801603479, "loss": 3.6607, "step": 11140 }, { "epoch": 0.7572360375050958, "grad_norm": 0.9663615226745605, "learning_rate": 0.0004526727476559315, "loss": 3.3517, "step": 11145 }, { "epoch": 0.7575757575757576, "grad_norm": 0.9723917245864868, "learning_rate": 0.00045265151515151513, "loss": 3.8961, "step": 11150 }, { "epoch": 0.7579154776464193, "grad_norm": 1.2377142906188965, "learning_rate": 0.00045263028264709885, "loss": 3.6361, "step": 11155 }, { "epoch": 0.7582551977170812, "grad_norm": 0.883269190788269, "learning_rate": 0.00045260905014268246, "loss": 3.5792, "step": 11160 }, { "epoch": 0.7585949177877429, "grad_norm": 1.2643485069274902, "learning_rate": 0.0004525878176382661, "loss": 3.3188, "step": 11165 }, { "epoch": 0.7589346378584046, "grad_norm": 0.9383618235588074, "learning_rate": 0.00045256658513384974, "loss": 3.4557, "step": 11170 }, { "epoch": 0.7592743579290665, "grad_norm": 0.9304665327072144, "learning_rate": 0.00045254535262943336, "loss": 3.5407, "step": 11175 }, { "epoch": 0.7596140779997282, "grad_norm": 1.2491357326507568, "learning_rate": 0.00045252412012501697, "loss": 3.4117, "step": 11180 }, { "epoch": 0.75995379807039, "grad_norm": 0.843019962310791, "learning_rate": 0.00045250288762060064, "loss": 3.3972, "step": 11185 }, { "epoch": 0.7602935181410517, "grad_norm": 0.88554447889328, "learning_rate": 0.0004524816551161843, "loss": 3.5746, "step": 11190 }, { "epoch": 0.7606332382117136, "grad_norm": 0.786370038986206, "learning_rate": 0.0004524604226117679, "loss": 3.6518, "step": 11195 }, { "epoch": 0.7609729582823753, "grad_norm": 0.998896598815918, "learning_rate": 0.0004524391901073516, "loss": 3.4429, "step": 11200 }, { "epoch": 0.7613126783530371, "grad_norm": 0.9178299307823181, "learning_rate": 0.0004524179576029352, "loss": 3.6079, "step": 11205 }, { "epoch": 0.7616523984236989, "grad_norm": 1.024559736251831, "learning_rate": 0.0004523967250985188, "loss": 3.5984, "step": 11210 }, { "epoch": 0.7619921184943607, "grad_norm": 1.0146671533584595, "learning_rate": 0.0004523754925941025, "loss": 3.4118, "step": 11215 }, { "epoch": 0.7623318385650224, "grad_norm": 1.0860249996185303, "learning_rate": 0.0004523542600896861, "loss": 3.6369, "step": 11220 }, { "epoch": 0.7626715586356841, "grad_norm": 0.8720779418945312, "learning_rate": 0.00045233302758526976, "loss": 3.4479, "step": 11225 }, { "epoch": 0.763011278706346, "grad_norm": 0.7657794952392578, "learning_rate": 0.0004523117950808534, "loss": 3.7552, "step": 11230 }, { "epoch": 0.7633509987770077, "grad_norm": 0.8825445175170898, "learning_rate": 0.00045229056257643704, "loss": 3.6455, "step": 11235 }, { "epoch": 0.7636907188476695, "grad_norm": 1.3960474729537964, "learning_rate": 0.00045226933007202065, "loss": 3.5561, "step": 11240 }, { "epoch": 0.7640304389183313, "grad_norm": 0.9755242466926575, "learning_rate": 0.0004522480975676043, "loss": 3.2532, "step": 11245 }, { "epoch": 0.7643701589889931, "grad_norm": 0.9217225909233093, "learning_rate": 0.00045222686506318793, "loss": 3.4888, "step": 11250 }, { "epoch": 0.7647098790596548, "grad_norm": 0.8481307625770569, "learning_rate": 0.00045220563255877154, "loss": 3.783, "step": 11255 }, { "epoch": 0.7650495991303167, "grad_norm": 0.8524138331413269, "learning_rate": 0.00045218440005435526, "loss": 3.6982, "step": 11260 }, { "epoch": 0.7653893192009784, "grad_norm": 1.0357170104980469, "learning_rate": 0.0004521631675499389, "loss": 3.58, "step": 11265 }, { "epoch": 0.7657290392716402, "grad_norm": 1.6569840908050537, "learning_rate": 0.0004521419350455225, "loss": 3.6294, "step": 11270 }, { "epoch": 0.7660687593423019, "grad_norm": 1.4390497207641602, "learning_rate": 0.00045212070254110616, "loss": 3.6033, "step": 11275 }, { "epoch": 0.7664084794129638, "grad_norm": 0.9150564074516296, "learning_rate": 0.00045209947003668977, "loss": 3.5452, "step": 11280 }, { "epoch": 0.7667481994836255, "grad_norm": 0.9967918992042542, "learning_rate": 0.0004520782375322734, "loss": 3.4411, "step": 11285 }, { "epoch": 0.7670879195542872, "grad_norm": 0.8345723748207092, "learning_rate": 0.00045205700502785705, "loss": 3.603, "step": 11290 }, { "epoch": 0.7674276396249491, "grad_norm": 0.8594462275505066, "learning_rate": 0.0004520357725234407, "loss": 3.4414, "step": 11295 }, { "epoch": 0.7677673596956108, "grad_norm": 0.7158309817314148, "learning_rate": 0.00045201454001902433, "loss": 3.5334, "step": 11300 }, { "epoch": 0.7681070797662726, "grad_norm": 1.1492923498153687, "learning_rate": 0.000451993307514608, "loss": 3.4741, "step": 11305 }, { "epoch": 0.7684467998369343, "grad_norm": 1.0600974559783936, "learning_rate": 0.0004519720750101916, "loss": 3.6021, "step": 11310 }, { "epoch": 0.7687865199075962, "grad_norm": 1.061357855796814, "learning_rate": 0.0004519508425057752, "loss": 3.5312, "step": 11315 }, { "epoch": 0.7691262399782579, "grad_norm": 0.8759276270866394, "learning_rate": 0.0004519296100013589, "loss": 3.7274, "step": 11320 }, { "epoch": 0.7694659600489197, "grad_norm": 5.686620235443115, "learning_rate": 0.0004519083774969425, "loss": 3.4927, "step": 11325 }, { "epoch": 0.7698056801195815, "grad_norm": 1.0727049112319946, "learning_rate": 0.00045188714499252617, "loss": 3.4173, "step": 11330 }, { "epoch": 0.7701454001902432, "grad_norm": 1.2092201709747314, "learning_rate": 0.00045186591248810984, "loss": 3.376, "step": 11335 }, { "epoch": 0.770485120260905, "grad_norm": 0.9436143040657043, "learning_rate": 0.00045184467998369345, "loss": 3.2513, "step": 11340 }, { "epoch": 0.7708248403315668, "grad_norm": 0.9945254325866699, "learning_rate": 0.00045182344747927706, "loss": 3.6233, "step": 11345 }, { "epoch": 0.7711645604022286, "grad_norm": 0.7543816566467285, "learning_rate": 0.00045180221497486073, "loss": 3.5951, "step": 11350 }, { "epoch": 0.7715042804728903, "grad_norm": 1.5490140914916992, "learning_rate": 0.00045178098247044434, "loss": 3.3681, "step": 11355 }, { "epoch": 0.7718440005435521, "grad_norm": 1.158082127571106, "learning_rate": 0.00045175974996602796, "loss": 3.8774, "step": 11360 }, { "epoch": 0.7721837206142139, "grad_norm": 0.7379157543182373, "learning_rate": 0.0004517385174616117, "loss": 3.633, "step": 11365 }, { "epoch": 0.7725234406848757, "grad_norm": 1.048500418663025, "learning_rate": 0.0004517172849571953, "loss": 3.798, "step": 11370 }, { "epoch": 0.7728631607555374, "grad_norm": 1.172136902809143, "learning_rate": 0.0004516960524527789, "loss": 3.5048, "step": 11375 }, { "epoch": 0.7732028808261993, "grad_norm": 0.8952823281288147, "learning_rate": 0.00045167481994836257, "loss": 3.3642, "step": 11380 }, { "epoch": 0.773542600896861, "grad_norm": 0.9569706916809082, "learning_rate": 0.0004516535874439462, "loss": 3.6001, "step": 11385 }, { "epoch": 0.7738823209675227, "grad_norm": 1.102828025817871, "learning_rate": 0.0004516323549395298, "loss": 3.7246, "step": 11390 }, { "epoch": 0.7742220410381845, "grad_norm": 1.0838216543197632, "learning_rate": 0.00045161112243511346, "loss": 3.6192, "step": 11395 }, { "epoch": 0.7745617611088463, "grad_norm": 1.3636205196380615, "learning_rate": 0.00045158988993069713, "loss": 3.5293, "step": 11400 }, { "epoch": 0.7749014811795081, "grad_norm": 0.8680553436279297, "learning_rate": 0.00045156865742628074, "loss": 3.6973, "step": 11405 }, { "epoch": 0.7752412012501698, "grad_norm": 0.8290780782699585, "learning_rate": 0.0004515474249218644, "loss": 3.8296, "step": 11410 }, { "epoch": 0.7755809213208317, "grad_norm": 1.3644964694976807, "learning_rate": 0.000451526192417448, "loss": 3.6822, "step": 11415 }, { "epoch": 0.7759206413914934, "grad_norm": 1.4608840942382812, "learning_rate": 0.0004515049599130317, "loss": 3.7453, "step": 11420 }, { "epoch": 0.7762603614621552, "grad_norm": 1.0583796501159668, "learning_rate": 0.0004514837274086153, "loss": 3.2558, "step": 11425 }, { "epoch": 0.776600081532817, "grad_norm": 0.8346831202507019, "learning_rate": 0.0004514624949041989, "loss": 3.285, "step": 11430 }, { "epoch": 0.7769398016034788, "grad_norm": 0.7967115044593811, "learning_rate": 0.00045144126239978264, "loss": 3.4924, "step": 11435 }, { "epoch": 0.7772795216741405, "grad_norm": 0.9103440046310425, "learning_rate": 0.00045142002989536625, "loss": 3.5928, "step": 11440 }, { "epoch": 0.7776192417448022, "grad_norm": 0.9832020998001099, "learning_rate": 0.00045139879739094986, "loss": 3.4079, "step": 11445 }, { "epoch": 0.7779589618154641, "grad_norm": 1.2264084815979004, "learning_rate": 0.00045137756488653353, "loss": 3.5948, "step": 11450 }, { "epoch": 0.7782986818861258, "grad_norm": 0.9382302761077881, "learning_rate": 0.00045135633238211714, "loss": 3.7521, "step": 11455 }, { "epoch": 0.7786384019567876, "grad_norm": 0.790884256362915, "learning_rate": 0.00045133509987770076, "loss": 3.317, "step": 11460 }, { "epoch": 0.7789781220274494, "grad_norm": 0.8408787846565247, "learning_rate": 0.0004513138673732844, "loss": 3.7254, "step": 11465 }, { "epoch": 0.7793178420981112, "grad_norm": 0.9325862526893616, "learning_rate": 0.0004512926348688681, "loss": 3.5514, "step": 11470 }, { "epoch": 0.7796575621687729, "grad_norm": 0.8697798252105713, "learning_rate": 0.0004512714023644517, "loss": 3.6299, "step": 11475 }, { "epoch": 0.7799972822394347, "grad_norm": 0.9365302324295044, "learning_rate": 0.00045125016986003537, "loss": 3.6017, "step": 11480 }, { "epoch": 0.7803370023100965, "grad_norm": 0.8338518738746643, "learning_rate": 0.000451228937355619, "loss": 3.5307, "step": 11485 }, { "epoch": 0.7806767223807582, "grad_norm": 1.194667100906372, "learning_rate": 0.0004512077048512026, "loss": 3.4987, "step": 11490 }, { "epoch": 0.78101644245142, "grad_norm": 0.8087764382362366, "learning_rate": 0.00045118647234678626, "loss": 3.6059, "step": 11495 }, { "epoch": 0.7813561625220818, "grad_norm": 1.020198941230774, "learning_rate": 0.0004511652398423699, "loss": 3.1606, "step": 11500 }, { "epoch": 0.7816958825927436, "grad_norm": 0.819696843624115, "learning_rate": 0.00045114400733795354, "loss": 3.4747, "step": 11505 }, { "epoch": 0.7820356026634053, "grad_norm": 0.7799421548843384, "learning_rate": 0.0004511227748335372, "loss": 3.8093, "step": 11510 }, { "epoch": 0.7823753227340672, "grad_norm": 0.9194088578224182, "learning_rate": 0.0004511015423291208, "loss": 3.5803, "step": 11515 }, { "epoch": 0.7827150428047289, "grad_norm": 0.8595015406608582, "learning_rate": 0.00045108030982470444, "loss": 3.6766, "step": 11520 }, { "epoch": 0.7830547628753907, "grad_norm": 0.9601884484291077, "learning_rate": 0.0004510590773202881, "loss": 3.7969, "step": 11525 }, { "epoch": 0.7833944829460524, "grad_norm": 0.9373843669891357, "learning_rate": 0.0004510378448158717, "loss": 3.5302, "step": 11530 }, { "epoch": 0.7837342030167143, "grad_norm": 0.8142849802970886, "learning_rate": 0.00045101661231145533, "loss": 3.3922, "step": 11535 }, { "epoch": 0.784073923087376, "grad_norm": 0.7358511686325073, "learning_rate": 0.00045099537980703905, "loss": 3.7506, "step": 11540 }, { "epoch": 0.7844136431580377, "grad_norm": 0.7794761657714844, "learning_rate": 0.00045097414730262266, "loss": 3.5329, "step": 11545 }, { "epoch": 0.7847533632286996, "grad_norm": 0.6954411864280701, "learning_rate": 0.0004509529147982063, "loss": 3.6456, "step": 11550 }, { "epoch": 0.7850930832993613, "grad_norm": 0.8116132020950317, "learning_rate": 0.00045093168229378994, "loss": 3.5627, "step": 11555 }, { "epoch": 0.7854328033700231, "grad_norm": 0.8216436505317688, "learning_rate": 0.00045091044978937356, "loss": 3.714, "step": 11560 }, { "epoch": 0.7857725234406848, "grad_norm": 0.9271339774131775, "learning_rate": 0.00045088921728495717, "loss": 3.5695, "step": 11565 }, { "epoch": 0.7861122435113467, "grad_norm": 1.2083988189697266, "learning_rate": 0.00045086798478054084, "loss": 3.3223, "step": 11570 }, { "epoch": 0.7864519635820084, "grad_norm": 0.9259551763534546, "learning_rate": 0.0004508467522761245, "loss": 3.493, "step": 11575 }, { "epoch": 0.7867916836526702, "grad_norm": 1.0173684358596802, "learning_rate": 0.0004508255197717081, "loss": 3.8879, "step": 11580 }, { "epoch": 0.787131403723332, "grad_norm": 0.8055000901222229, "learning_rate": 0.0004508042872672918, "loss": 3.4878, "step": 11585 }, { "epoch": 0.7874711237939938, "grad_norm": 0.7334341406822205, "learning_rate": 0.0004507830547628754, "loss": 3.5665, "step": 11590 }, { "epoch": 0.7878108438646555, "grad_norm": 0.8810855150222778, "learning_rate": 0.000450761822258459, "loss": 3.4928, "step": 11595 }, { "epoch": 0.7881505639353173, "grad_norm": 0.9969966411590576, "learning_rate": 0.0004507405897540427, "loss": 3.4544, "step": 11600 }, { "epoch": 0.7884902840059791, "grad_norm": 0.9311866760253906, "learning_rate": 0.0004507193572496263, "loss": 3.7474, "step": 11605 }, { "epoch": 0.7888300040766408, "grad_norm": 0.9642701745033264, "learning_rate": 0.00045069812474520996, "loss": 3.778, "step": 11610 }, { "epoch": 0.7891697241473026, "grad_norm": 0.9641186594963074, "learning_rate": 0.0004506768922407936, "loss": 3.3883, "step": 11615 }, { "epoch": 0.7895094442179644, "grad_norm": 1.0308353900909424, "learning_rate": 0.00045065565973637724, "loss": 3.456, "step": 11620 }, { "epoch": 0.7898491642886262, "grad_norm": 0.9153664708137512, "learning_rate": 0.00045063442723196085, "loss": 3.4603, "step": 11625 }, { "epoch": 0.7901888843592879, "grad_norm": 0.9367057681083679, "learning_rate": 0.0004506131947275445, "loss": 3.5906, "step": 11630 }, { "epoch": 0.7905286044299498, "grad_norm": 0.9046686887741089, "learning_rate": 0.00045059196222312813, "loss": 3.5325, "step": 11635 }, { "epoch": 0.7908683245006115, "grad_norm": 0.8405258059501648, "learning_rate": 0.00045057072971871174, "loss": 3.7034, "step": 11640 }, { "epoch": 0.7912080445712733, "grad_norm": 0.9029679298400879, "learning_rate": 0.00045054949721429546, "loss": 3.3381, "step": 11645 }, { "epoch": 0.791547764641935, "grad_norm": 0.9144964814186096, "learning_rate": 0.0004505282647098791, "loss": 3.6311, "step": 11650 }, { "epoch": 0.7918874847125968, "grad_norm": 1.155037522315979, "learning_rate": 0.0004505070322054627, "loss": 3.641, "step": 11655 }, { "epoch": 0.7922272047832586, "grad_norm": 1.2123913764953613, "learning_rate": 0.00045048579970104636, "loss": 3.6197, "step": 11660 }, { "epoch": 0.7925669248539203, "grad_norm": 1.0333898067474365, "learning_rate": 0.00045046456719662997, "loss": 3.5328, "step": 11665 }, { "epoch": 0.7929066449245822, "grad_norm": 0.8405129909515381, "learning_rate": 0.0004504433346922136, "loss": 3.5325, "step": 11670 }, { "epoch": 0.7932463649952439, "grad_norm": 0.9599202275276184, "learning_rate": 0.0004504221021877973, "loss": 3.5238, "step": 11675 }, { "epoch": 0.7935860850659057, "grad_norm": 0.8704776167869568, "learning_rate": 0.0004504008696833809, "loss": 3.6035, "step": 11680 }, { "epoch": 0.7939258051365675, "grad_norm": 0.9665622115135193, "learning_rate": 0.00045037963717896453, "loss": 3.875, "step": 11685 }, { "epoch": 0.7942655252072293, "grad_norm": 0.9161460995674133, "learning_rate": 0.0004503584046745482, "loss": 3.3893, "step": 11690 }, { "epoch": 0.794605245277891, "grad_norm": 0.8528186678886414, "learning_rate": 0.0004503371721701318, "loss": 3.6533, "step": 11695 }, { "epoch": 0.7949449653485527, "grad_norm": 0.9194161295890808, "learning_rate": 0.0004503159396657154, "loss": 3.5554, "step": 11700 }, { "epoch": 0.7952846854192146, "grad_norm": 1.2495887279510498, "learning_rate": 0.0004502947071612991, "loss": 3.389, "step": 11705 }, { "epoch": 0.7956244054898763, "grad_norm": 0.8161107897758484, "learning_rate": 0.00045027347465688276, "loss": 3.3537, "step": 11710 }, { "epoch": 0.7959641255605381, "grad_norm": 0.7931812405586243, "learning_rate": 0.00045025224215246637, "loss": 3.6697, "step": 11715 }, { "epoch": 0.7963038456311999, "grad_norm": 0.8713650703430176, "learning_rate": 0.00045023100964805004, "loss": 3.6762, "step": 11720 }, { "epoch": 0.7966435657018617, "grad_norm": 0.9167910814285278, "learning_rate": 0.00045020977714363365, "loss": 3.5767, "step": 11725 }, { "epoch": 0.7969832857725234, "grad_norm": 1.0353357791900635, "learning_rate": 0.00045018854463921726, "loss": 3.6481, "step": 11730 }, { "epoch": 0.7973230058431852, "grad_norm": 1.0095230340957642, "learning_rate": 0.00045016731213480093, "loss": 3.5494, "step": 11735 }, { "epoch": 0.797662725913847, "grad_norm": 0.9550053477287292, "learning_rate": 0.00045014607963038454, "loss": 3.358, "step": 11740 }, { "epoch": 0.7980024459845088, "grad_norm": 0.9768257737159729, "learning_rate": 0.0004501248471259682, "loss": 3.5478, "step": 11745 }, { "epoch": 0.7983421660551705, "grad_norm": 1.0843186378479004, "learning_rate": 0.0004501036146215519, "loss": 3.7075, "step": 11750 }, { "epoch": 0.7986818861258324, "grad_norm": 0.8945038914680481, "learning_rate": 0.0004500823821171355, "loss": 3.2367, "step": 11755 }, { "epoch": 0.7990216061964941, "grad_norm": 1.454797625541687, "learning_rate": 0.00045006114961271916, "loss": 3.6487, "step": 11760 }, { "epoch": 0.7993613262671558, "grad_norm": 0.9339689612388611, "learning_rate": 0.00045003991710830277, "loss": 3.637, "step": 11765 }, { "epoch": 0.7997010463378177, "grad_norm": 0.6612041592597961, "learning_rate": 0.0004500186846038864, "loss": 3.5939, "step": 11770 }, { "epoch": 0.8000407664084794, "grad_norm": 0.7390630841255188, "learning_rate": 0.00044999745209947005, "loss": 3.4019, "step": 11775 }, { "epoch": 0.8003804864791412, "grad_norm": 0.8326549530029297, "learning_rate": 0.0004499762195950537, "loss": 3.7857, "step": 11780 }, { "epoch": 0.8007202065498029, "grad_norm": 0.9344022274017334, "learning_rate": 0.00044995498709063733, "loss": 3.7617, "step": 11785 }, { "epoch": 0.8010599266204648, "grad_norm": 0.7675136923789978, "learning_rate": 0.000449933754586221, "loss": 3.2585, "step": 11790 }, { "epoch": 0.8013996466911265, "grad_norm": 0.943437933921814, "learning_rate": 0.0004499125220818046, "loss": 3.6116, "step": 11795 }, { "epoch": 0.8017393667617883, "grad_norm": 0.9530118107795715, "learning_rate": 0.0004498912895773882, "loss": 3.6922, "step": 11800 }, { "epoch": 0.8020790868324501, "grad_norm": 1.0674951076507568, "learning_rate": 0.0004498700570729719, "loss": 3.8288, "step": 11805 }, { "epoch": 0.8024188069031118, "grad_norm": 1.1838650703430176, "learning_rate": 0.0004498488245685555, "loss": 3.3678, "step": 11810 }, { "epoch": 0.8027585269737736, "grad_norm": 1.094651460647583, "learning_rate": 0.00044982759206413917, "loss": 3.4516, "step": 11815 }, { "epoch": 0.8030982470444353, "grad_norm": 0.8997747898101807, "learning_rate": 0.00044980635955972284, "loss": 3.5359, "step": 11820 }, { "epoch": 0.8034379671150972, "grad_norm": 0.8751127123832703, "learning_rate": 0.00044978512705530645, "loss": 3.6892, "step": 11825 }, { "epoch": 0.8037776871857589, "grad_norm": 0.80302894115448, "learning_rate": 0.00044976389455089006, "loss": 3.6559, "step": 11830 }, { "epoch": 0.8041174072564207, "grad_norm": 0.7940371036529541, "learning_rate": 0.00044974266204647373, "loss": 3.6594, "step": 11835 }, { "epoch": 0.8044571273270825, "grad_norm": 1.1308082342147827, "learning_rate": 0.00044972142954205734, "loss": 3.7438, "step": 11840 }, { "epoch": 0.8047968473977443, "grad_norm": 1.2393527030944824, "learning_rate": 0.00044970019703764096, "loss": 3.4516, "step": 11845 }, { "epoch": 0.805136567468406, "grad_norm": 1.2231640815734863, "learning_rate": 0.0004496789645332247, "loss": 3.6816, "step": 11850 }, { "epoch": 0.8054762875390679, "grad_norm": 1.4405717849731445, "learning_rate": 0.0004496577320288083, "loss": 3.411, "step": 11855 }, { "epoch": 0.8058160076097296, "grad_norm": 1.0476617813110352, "learning_rate": 0.0004496364995243919, "loss": 3.3902, "step": 11860 }, { "epoch": 0.8061557276803913, "grad_norm": 1.0138416290283203, "learning_rate": 0.00044961526701997557, "loss": 3.4847, "step": 11865 }, { "epoch": 0.8064954477510531, "grad_norm": 0.7801641821861267, "learning_rate": 0.0004495940345155592, "loss": 3.657, "step": 11870 }, { "epoch": 0.8068351678217149, "grad_norm": 0.8453697562217712, "learning_rate": 0.0004495728020111428, "loss": 3.5551, "step": 11875 }, { "epoch": 0.8071748878923767, "grad_norm": 0.8592841029167175, "learning_rate": 0.00044955156950672647, "loss": 3.1185, "step": 11880 }, { "epoch": 0.8075146079630384, "grad_norm": 0.8163525462150574, "learning_rate": 0.00044953033700231013, "loss": 3.6062, "step": 11885 }, { "epoch": 0.8078543280337003, "grad_norm": 0.940747082233429, "learning_rate": 0.00044950910449789375, "loss": 3.2609, "step": 11890 }, { "epoch": 0.808194048104362, "grad_norm": 0.9482797384262085, "learning_rate": 0.0004494878719934774, "loss": 3.5626, "step": 11895 }, { "epoch": 0.8085337681750238, "grad_norm": 0.9382033944129944, "learning_rate": 0.000449466639489061, "loss": 3.4287, "step": 11900 }, { "epoch": 0.8088734882456855, "grad_norm": 0.9052550792694092, "learning_rate": 0.00044944540698464464, "loss": 3.4663, "step": 11905 }, { "epoch": 0.8092132083163474, "grad_norm": 0.8503194451332092, "learning_rate": 0.0004494241744802283, "loss": 3.6951, "step": 11910 }, { "epoch": 0.8095529283870091, "grad_norm": 0.7549372911453247, "learning_rate": 0.0004494029419758119, "loss": 3.6663, "step": 11915 }, { "epoch": 0.8098926484576708, "grad_norm": 0.7341042160987854, "learning_rate": 0.0004493817094713956, "loss": 3.3219, "step": 11920 }, { "epoch": 0.8102323685283327, "grad_norm": 0.7509133219718933, "learning_rate": 0.00044936047696697925, "loss": 3.6122, "step": 11925 }, { "epoch": 0.8105720885989944, "grad_norm": 0.8163796067237854, "learning_rate": 0.00044933924446256287, "loss": 3.665, "step": 11930 }, { "epoch": 0.8109118086696562, "grad_norm": 0.8404551148414612, "learning_rate": 0.0004493180119581465, "loss": 3.6558, "step": 11935 }, { "epoch": 0.811251528740318, "grad_norm": 0.7682790756225586, "learning_rate": 0.00044929677945373015, "loss": 3.4308, "step": 11940 }, { "epoch": 0.8115912488109798, "grad_norm": 0.8642573356628418, "learning_rate": 0.00044927554694931376, "loss": 3.3667, "step": 11945 }, { "epoch": 0.8119309688816415, "grad_norm": 0.8911049962043762, "learning_rate": 0.00044925431444489737, "loss": 3.6266, "step": 11950 }, { "epoch": 0.8122706889523033, "grad_norm": 1.420030117034912, "learning_rate": 0.0004492330819404811, "loss": 3.4731, "step": 11955 }, { "epoch": 0.8126104090229651, "grad_norm": 0.9012633562088013, "learning_rate": 0.0004492118494360647, "loss": 3.5072, "step": 11960 }, { "epoch": 0.8129501290936268, "grad_norm": 0.8820513486862183, "learning_rate": 0.0004491906169316483, "loss": 3.5137, "step": 11965 }, { "epoch": 0.8132898491642886, "grad_norm": 1.1120094060897827, "learning_rate": 0.000449169384427232, "loss": 3.5201, "step": 11970 }, { "epoch": 0.8136295692349504, "grad_norm": 1.200352430343628, "learning_rate": 0.0004491481519228156, "loss": 3.7519, "step": 11975 }, { "epoch": 0.8139692893056122, "grad_norm": 1.0852504968643188, "learning_rate": 0.0004491269194183992, "loss": 3.3821, "step": 11980 }, { "epoch": 0.8143090093762739, "grad_norm": 0.843419075012207, "learning_rate": 0.0004491056869139829, "loss": 3.4187, "step": 11985 }, { "epoch": 0.8146487294469357, "grad_norm": 1.060928463935852, "learning_rate": 0.00044908445440956655, "loss": 3.7343, "step": 11990 }, { "epoch": 0.8149884495175975, "grad_norm": 1.148818850517273, "learning_rate": 0.00044906322190515016, "loss": 3.3302, "step": 11995 }, { "epoch": 0.8153281695882593, "grad_norm": 1.0028592348098755, "learning_rate": 0.0004490419894007338, "loss": 3.4527, "step": 12000 }, { "epoch": 0.815667889658921, "grad_norm": 1.1745450496673584, "learning_rate": 0.00044902075689631744, "loss": 3.622, "step": 12005 }, { "epoch": 0.8160076097295829, "grad_norm": 1.0620641708374023, "learning_rate": 0.00044899952439190105, "loss": 3.6348, "step": 12010 }, { "epoch": 0.8163473298002446, "grad_norm": 0.8614559769630432, "learning_rate": 0.0004489782918874847, "loss": 3.7575, "step": 12015 }, { "epoch": 0.8166870498709063, "grad_norm": 0.9565348029136658, "learning_rate": 0.00044895705938306833, "loss": 3.5779, "step": 12020 }, { "epoch": 0.8170267699415682, "grad_norm": 0.8741623163223267, "learning_rate": 0.000448935826878652, "loss": 3.1402, "step": 12025 }, { "epoch": 0.8173664900122299, "grad_norm": 0.8060041069984436, "learning_rate": 0.00044891459437423567, "loss": 3.5845, "step": 12030 }, { "epoch": 0.8177062100828917, "grad_norm": 1.3091003894805908, "learning_rate": 0.0004488933618698193, "loss": 3.5217, "step": 12035 }, { "epoch": 0.8180459301535534, "grad_norm": 0.9124295711517334, "learning_rate": 0.0004488721293654029, "loss": 3.4506, "step": 12040 }, { "epoch": 0.8183856502242153, "grad_norm": 0.7031652927398682, "learning_rate": 0.00044885089686098656, "loss": 3.8347, "step": 12045 }, { "epoch": 0.818725370294877, "grad_norm": 0.6963446736335754, "learning_rate": 0.00044882966435657017, "loss": 3.5497, "step": 12050 }, { "epoch": 0.8190650903655388, "grad_norm": 1.1820546388626099, "learning_rate": 0.0004488084318521538, "loss": 3.5852, "step": 12055 }, { "epoch": 0.8194048104362006, "grad_norm": 0.9769718647003174, "learning_rate": 0.0004487871993477375, "loss": 3.442, "step": 12060 }, { "epoch": 0.8197445305068624, "grad_norm": 0.9314497709274292, "learning_rate": 0.0004487659668433211, "loss": 3.6775, "step": 12065 }, { "epoch": 0.8200842505775241, "grad_norm": 1.1369247436523438, "learning_rate": 0.00044874473433890473, "loss": 3.5321, "step": 12070 }, { "epoch": 0.8204239706481858, "grad_norm": 0.8237895965576172, "learning_rate": 0.0004487235018344884, "loss": 3.7151, "step": 12075 }, { "epoch": 0.8207636907188477, "grad_norm": 1.0787125825881958, "learning_rate": 0.000448702269330072, "loss": 3.3652, "step": 12080 }, { "epoch": 0.8211034107895094, "grad_norm": 0.9167943596839905, "learning_rate": 0.0004486810368256556, "loss": 3.5585, "step": 12085 }, { "epoch": 0.8214431308601712, "grad_norm": 0.9661017060279846, "learning_rate": 0.0004486598043212393, "loss": 3.7582, "step": 12090 }, { "epoch": 0.821782850930833, "grad_norm": 0.9280943870544434, "learning_rate": 0.00044863857181682296, "loss": 3.7041, "step": 12095 }, { "epoch": 0.8221225710014948, "grad_norm": 0.9253416657447815, "learning_rate": 0.0004486173393124066, "loss": 3.546, "step": 12100 }, { "epoch": 0.8224622910721565, "grad_norm": 0.9672086238861084, "learning_rate": 0.00044859610680799024, "loss": 3.4902, "step": 12105 }, { "epoch": 0.8228020111428184, "grad_norm": 1.0244386196136475, "learning_rate": 0.00044857487430357385, "loss": 3.5983, "step": 12110 }, { "epoch": 0.8231417312134801, "grad_norm": 0.9199314713478088, "learning_rate": 0.0004485536417991575, "loss": 3.598, "step": 12115 }, { "epoch": 0.8234814512841419, "grad_norm": 0.8801562190055847, "learning_rate": 0.00044853240929474113, "loss": 3.5664, "step": 12120 }, { "epoch": 0.8238211713548036, "grad_norm": 1.1390689611434937, "learning_rate": 0.00044851117679032475, "loss": 3.5739, "step": 12125 }, { "epoch": 0.8241608914254654, "grad_norm": 0.8554503321647644, "learning_rate": 0.00044848994428590847, "loss": 3.804, "step": 12130 }, { "epoch": 0.8245006114961272, "grad_norm": 1.011595368385315, "learning_rate": 0.0004484687117814921, "loss": 3.3629, "step": 12135 }, { "epoch": 0.8248403315667889, "grad_norm": 0.863946795463562, "learning_rate": 0.0004484474792770757, "loss": 3.5796, "step": 12140 }, { "epoch": 0.8251800516374508, "grad_norm": 0.8281433582305908, "learning_rate": 0.00044842624677265936, "loss": 3.5903, "step": 12145 }, { "epoch": 0.8255197717081125, "grad_norm": 0.925848662853241, "learning_rate": 0.000448405014268243, "loss": 3.4832, "step": 12150 }, { "epoch": 0.8258594917787743, "grad_norm": 0.8883832693099976, "learning_rate": 0.0004483837817638266, "loss": 3.6653, "step": 12155 }, { "epoch": 0.826199211849436, "grad_norm": 0.9255724549293518, "learning_rate": 0.00044836254925941025, "loss": 3.5352, "step": 12160 }, { "epoch": 0.8265389319200979, "grad_norm": 1.0200334787368774, "learning_rate": 0.0004483413167549939, "loss": 3.7126, "step": 12165 }, { "epoch": 0.8268786519907596, "grad_norm": 5.221991539001465, "learning_rate": 0.00044832008425057753, "loss": 3.5754, "step": 12170 }, { "epoch": 0.8272183720614213, "grad_norm": 1.404994249343872, "learning_rate": 0.0004482988517461612, "loss": 3.4721, "step": 12175 }, { "epoch": 0.8275580921320832, "grad_norm": 0.7565175890922546, "learning_rate": 0.0004482776192417448, "loss": 3.6152, "step": 12180 }, { "epoch": 0.8278978122027449, "grad_norm": 0.8664125800132751, "learning_rate": 0.0004482563867373284, "loss": 3.4985, "step": 12185 }, { "epoch": 0.8282375322734067, "grad_norm": 1.1271530389785767, "learning_rate": 0.0004482351542329121, "loss": 3.7794, "step": 12190 }, { "epoch": 0.8285772523440685, "grad_norm": 0.8101256489753723, "learning_rate": 0.0004482139217284957, "loss": 3.7401, "step": 12195 }, { "epoch": 0.8289169724147303, "grad_norm": 0.887892484664917, "learning_rate": 0.0004481926892240794, "loss": 3.7315, "step": 12200 }, { "epoch": 0.829256692485392, "grad_norm": 1.0046137571334839, "learning_rate": 0.00044817145671966304, "loss": 3.7484, "step": 12205 }, { "epoch": 0.8295964125560538, "grad_norm": 1.0045126676559448, "learning_rate": 0.00044815022421524665, "loss": 3.4854, "step": 12210 }, { "epoch": 0.8299361326267156, "grad_norm": 1.0555540323257446, "learning_rate": 0.00044812899171083027, "loss": 3.5149, "step": 12215 }, { "epoch": 0.8302758526973774, "grad_norm": 0.9744082689285278, "learning_rate": 0.00044810775920641393, "loss": 3.4514, "step": 12220 }, { "epoch": 0.8306155727680391, "grad_norm": 0.9558049440383911, "learning_rate": 0.00044808652670199755, "loss": 3.6999, "step": 12225 }, { "epoch": 0.830955292838701, "grad_norm": 0.9205308556556702, "learning_rate": 0.00044806529419758116, "loss": 3.4319, "step": 12230 }, { "epoch": 0.8312950129093627, "grad_norm": 0.8294905424118042, "learning_rate": 0.0004480440616931649, "loss": 3.0789, "step": 12235 }, { "epoch": 0.8316347329800244, "grad_norm": 0.8559340238571167, "learning_rate": 0.0004480228291887485, "loss": 3.3617, "step": 12240 }, { "epoch": 0.8319744530506862, "grad_norm": 0.94744873046875, "learning_rate": 0.0004480015966843321, "loss": 3.2203, "step": 12245 }, { "epoch": 0.832314173121348, "grad_norm": 0.8233264088630676, "learning_rate": 0.0004479803641799158, "loss": 3.5253, "step": 12250 }, { "epoch": 0.8326538931920098, "grad_norm": 1.0395047664642334, "learning_rate": 0.0004479591316754994, "loss": 3.3464, "step": 12255 }, { "epoch": 0.8329936132626715, "grad_norm": 1.0977054834365845, "learning_rate": 0.000447937899171083, "loss": 3.4364, "step": 12260 }, { "epoch": 0.8333333333333334, "grad_norm": 1.2124358415603638, "learning_rate": 0.0004479166666666667, "loss": 3.7128, "step": 12265 }, { "epoch": 0.8336730534039951, "grad_norm": 1.1547961235046387, "learning_rate": 0.00044789543416225033, "loss": 3.1682, "step": 12270 }, { "epoch": 0.8340127734746569, "grad_norm": 0.8189299702644348, "learning_rate": 0.00044787420165783395, "loss": 3.723, "step": 12275 }, { "epoch": 0.8343524935453187, "grad_norm": 1.1098175048828125, "learning_rate": 0.0004478529691534176, "loss": 3.5669, "step": 12280 }, { "epoch": 0.8346922136159804, "grad_norm": 0.964353084564209, "learning_rate": 0.00044783173664900123, "loss": 3.4691, "step": 12285 }, { "epoch": 0.8350319336866422, "grad_norm": 1.0151450634002686, "learning_rate": 0.00044781050414458484, "loss": 3.5603, "step": 12290 }, { "epoch": 0.8353716537573039, "grad_norm": 0.8595138192176819, "learning_rate": 0.0004477892716401685, "loss": 3.4816, "step": 12295 }, { "epoch": 0.8357113738279658, "grad_norm": 1.004218578338623, "learning_rate": 0.0004477680391357522, "loss": 3.5873, "step": 12300 }, { "epoch": 0.8360510938986275, "grad_norm": 1.2609730958938599, "learning_rate": 0.0004477468066313358, "loss": 3.2712, "step": 12305 }, { "epoch": 0.8363908139692893, "grad_norm": 1.059792399406433, "learning_rate": 0.00044772557412691945, "loss": 3.6885, "step": 12310 }, { "epoch": 0.8367305340399511, "grad_norm": 1.0797412395477295, "learning_rate": 0.00044770434162250307, "loss": 3.4383, "step": 12315 }, { "epoch": 0.8370702541106129, "grad_norm": 11.401029586791992, "learning_rate": 0.0004476831091180867, "loss": 3.6049, "step": 12320 }, { "epoch": 0.8374099741812746, "grad_norm": 0.9239528179168701, "learning_rate": 0.00044766187661367035, "loss": 3.7122, "step": 12325 }, { "epoch": 0.8377496942519363, "grad_norm": 0.9248178601264954, "learning_rate": 0.00044764064410925396, "loss": 3.1782, "step": 12330 }, { "epoch": 0.8380894143225982, "grad_norm": 0.8139250874519348, "learning_rate": 0.00044761941160483763, "loss": 3.7523, "step": 12335 }, { "epoch": 0.83842913439326, "grad_norm": 0.9096217155456543, "learning_rate": 0.0004475981791004213, "loss": 3.5755, "step": 12340 }, { "epoch": 0.8387688544639217, "grad_norm": 0.9166557788848877, "learning_rate": 0.0004475769465960049, "loss": 3.5373, "step": 12345 }, { "epoch": 0.8391085745345835, "grad_norm": 1.0374330282211304, "learning_rate": 0.0004475557140915885, "loss": 3.5362, "step": 12350 }, { "epoch": 0.8394482946052453, "grad_norm": 0.8321155309677124, "learning_rate": 0.0004475344815871722, "loss": 3.5269, "step": 12355 }, { "epoch": 0.839788014675907, "grad_norm": 1.030591368675232, "learning_rate": 0.0004475132490827558, "loss": 3.5977, "step": 12360 }, { "epoch": 0.8401277347465689, "grad_norm": 0.7157252430915833, "learning_rate": 0.0004474920165783394, "loss": 3.8338, "step": 12365 }, { "epoch": 0.8404674548172306, "grad_norm": 1.2520782947540283, "learning_rate": 0.00044747078407392313, "loss": 3.6456, "step": 12370 }, { "epoch": 0.8408071748878924, "grad_norm": 1.155260443687439, "learning_rate": 0.00044744955156950675, "loss": 3.4833, "step": 12375 }, { "epoch": 0.8411468949585541, "grad_norm": 1.0917608737945557, "learning_rate": 0.00044742831906509036, "loss": 3.5372, "step": 12380 }, { "epoch": 0.841486615029216, "grad_norm": 1.0403542518615723, "learning_rate": 0.00044740708656067403, "loss": 3.6782, "step": 12385 }, { "epoch": 0.8418263350998777, "grad_norm": 0.9849228262901306, "learning_rate": 0.00044738585405625764, "loss": 3.3426, "step": 12390 }, { "epoch": 0.8421660551705394, "grad_norm": 1.0195262432098389, "learning_rate": 0.00044736462155184125, "loss": 3.4388, "step": 12395 }, { "epoch": 0.8425057752412013, "grad_norm": 1.0979924201965332, "learning_rate": 0.0004473433890474249, "loss": 3.6348, "step": 12400 }, { "epoch": 0.842845495311863, "grad_norm": 0.8744705319404602, "learning_rate": 0.0004473221565430086, "loss": 3.5857, "step": 12405 }, { "epoch": 0.8431852153825248, "grad_norm": 1.0690370798110962, "learning_rate": 0.0004473009240385922, "loss": 3.7653, "step": 12410 }, { "epoch": 0.8435249354531865, "grad_norm": 0.9009366035461426, "learning_rate": 0.00044727969153417587, "loss": 3.7001, "step": 12415 }, { "epoch": 0.8438646555238484, "grad_norm": 0.7976779937744141, "learning_rate": 0.0004472584590297595, "loss": 3.6197, "step": 12420 }, { "epoch": 0.8442043755945101, "grad_norm": 1.001739501953125, "learning_rate": 0.0004472372265253431, "loss": 3.7709, "step": 12425 }, { "epoch": 0.8445440956651719, "grad_norm": 0.8135280609130859, "learning_rate": 0.00044721599402092676, "loss": 3.5222, "step": 12430 }, { "epoch": 0.8448838157358337, "grad_norm": 0.9763901233673096, "learning_rate": 0.0004471947615165104, "loss": 3.6824, "step": 12435 }, { "epoch": 0.8452235358064955, "grad_norm": 1.2669804096221924, "learning_rate": 0.0004471735290120941, "loss": 3.5604, "step": 12440 }, { "epoch": 0.8455632558771572, "grad_norm": 0.8094937205314636, "learning_rate": 0.0004471522965076777, "loss": 3.4507, "step": 12445 }, { "epoch": 0.845902975947819, "grad_norm": 0.8666526079177856, "learning_rate": 0.0004471310640032613, "loss": 3.5348, "step": 12450 }, { "epoch": 0.8462426960184808, "grad_norm": 1.2023850679397583, "learning_rate": 0.000447109831498845, "loss": 3.8176, "step": 12455 }, { "epoch": 0.8465824160891425, "grad_norm": 1.0765405893325806, "learning_rate": 0.0004470885989944286, "loss": 3.5034, "step": 12460 }, { "epoch": 0.8469221361598043, "grad_norm": 0.7848770618438721, "learning_rate": 0.0004470673664900122, "loss": 3.552, "step": 12465 }, { "epoch": 0.8472618562304661, "grad_norm": 0.9726797938346863, "learning_rate": 0.0004470461339855959, "loss": 3.4244, "step": 12470 }, { "epoch": 0.8476015763011279, "grad_norm": 0.8277277946472168, "learning_rate": 0.00044702490148117955, "loss": 3.164, "step": 12475 }, { "epoch": 0.8479412963717896, "grad_norm": 1.3635979890823364, "learning_rate": 0.00044700366897676316, "loss": 3.4579, "step": 12480 }, { "epoch": 0.8482810164424515, "grad_norm": 0.922616720199585, "learning_rate": 0.00044698243647234683, "loss": 3.5314, "step": 12485 }, { "epoch": 0.8486207365131132, "grad_norm": 0.890015721321106, "learning_rate": 0.00044696120396793044, "loss": 3.5859, "step": 12490 }, { "epoch": 0.848960456583775, "grad_norm": 0.8804162740707397, "learning_rate": 0.00044693997146351405, "loss": 3.6534, "step": 12495 }, { "epoch": 0.8493001766544367, "grad_norm": 1.053587794303894, "learning_rate": 0.0004469187389590977, "loss": 3.5923, "step": 12500 }, { "epoch": 0.8496398967250985, "grad_norm": 0.9308822154998779, "learning_rate": 0.00044689750645468133, "loss": 3.5273, "step": 12505 }, { "epoch": 0.8499796167957603, "grad_norm": 1.0513627529144287, "learning_rate": 0.000446876273950265, "loss": 3.3614, "step": 12510 }, { "epoch": 0.850319336866422, "grad_norm": 0.7864120602607727, "learning_rate": 0.00044685504144584867, "loss": 3.388, "step": 12515 }, { "epoch": 0.8506590569370839, "grad_norm": 0.8718185424804688, "learning_rate": 0.0004468338089414323, "loss": 3.5494, "step": 12520 }, { "epoch": 0.8509987770077456, "grad_norm": 1.0842076539993286, "learning_rate": 0.0004468125764370159, "loss": 3.6258, "step": 12525 }, { "epoch": 0.8513384970784074, "grad_norm": 3.10575795173645, "learning_rate": 0.00044679134393259956, "loss": 3.6663, "step": 12530 }, { "epoch": 0.8516782171490692, "grad_norm": 1.000199317932129, "learning_rate": 0.0004467701114281832, "loss": 3.8245, "step": 12535 }, { "epoch": 0.852017937219731, "grad_norm": 0.9667306542396545, "learning_rate": 0.0004467488789237668, "loss": 3.6202, "step": 12540 }, { "epoch": 0.8523576572903927, "grad_norm": 0.9195256233215332, "learning_rate": 0.0004467276464193505, "loss": 3.6453, "step": 12545 }, { "epoch": 0.8526973773610544, "grad_norm": 4.250603199005127, "learning_rate": 0.0004467064139149341, "loss": 3.6817, "step": 12550 }, { "epoch": 0.8530370974317163, "grad_norm": 1.1726361513137817, "learning_rate": 0.00044668518141051773, "loss": 3.5949, "step": 12555 }, { "epoch": 0.853376817502378, "grad_norm": 1.0950947999954224, "learning_rate": 0.0004466639489061014, "loss": 3.4044, "step": 12560 }, { "epoch": 0.8537165375730398, "grad_norm": 0.7752709984779358, "learning_rate": 0.000446642716401685, "loss": 3.7176, "step": 12565 }, { "epoch": 0.8540562576437016, "grad_norm": 1.1704014539718628, "learning_rate": 0.00044662148389726863, "loss": 3.4002, "step": 12570 }, { "epoch": 0.8543959777143634, "grad_norm": 0.9426574110984802, "learning_rate": 0.0004466002513928523, "loss": 3.6216, "step": 12575 }, { "epoch": 0.8547356977850251, "grad_norm": 0.961151659488678, "learning_rate": 0.00044657901888843596, "loss": 3.4957, "step": 12580 }, { "epoch": 0.8550754178556869, "grad_norm": 1.023395299911499, "learning_rate": 0.0004465577863840196, "loss": 3.3085, "step": 12585 }, { "epoch": 0.8554151379263487, "grad_norm": 0.8785600662231445, "learning_rate": 0.00044653655387960324, "loss": 3.6078, "step": 12590 }, { "epoch": 0.8557548579970105, "grad_norm": 0.9390338659286499, "learning_rate": 0.00044651532137518686, "loss": 3.7466, "step": 12595 }, { "epoch": 0.8560945780676722, "grad_norm": 1.0203425884246826, "learning_rate": 0.00044649408887077047, "loss": 3.6409, "step": 12600 }, { "epoch": 0.856434298138334, "grad_norm": 1.039806842803955, "learning_rate": 0.00044647285636635414, "loss": 3.2614, "step": 12605 }, { "epoch": 0.8567740182089958, "grad_norm": 0.7573713064193726, "learning_rate": 0.00044645162386193775, "loss": 3.5158, "step": 12610 }, { "epoch": 0.8571137382796575, "grad_norm": 0.8710591197013855, "learning_rate": 0.0004464303913575214, "loss": 3.463, "step": 12615 }, { "epoch": 0.8574534583503194, "grad_norm": 1.1224738359451294, "learning_rate": 0.0004464091588531051, "loss": 3.8647, "step": 12620 }, { "epoch": 0.8577931784209811, "grad_norm": 0.9952356815338135, "learning_rate": 0.0004463879263486887, "loss": 3.7217, "step": 12625 }, { "epoch": 0.8581328984916429, "grad_norm": 1.0827929973602295, "learning_rate": 0.0004463666938442723, "loss": 3.7055, "step": 12630 }, { "epoch": 0.8584726185623046, "grad_norm": 0.9734746217727661, "learning_rate": 0.000446345461339856, "loss": 3.3381, "step": 12635 }, { "epoch": 0.8588123386329665, "grad_norm": 0.9349880814552307, "learning_rate": 0.0004463242288354396, "loss": 3.7161, "step": 12640 }, { "epoch": 0.8591520587036282, "grad_norm": 0.7646616101264954, "learning_rate": 0.0004463029963310232, "loss": 3.6329, "step": 12645 }, { "epoch": 0.85949177877429, "grad_norm": 0.9950727820396423, "learning_rate": 0.0004462817638266069, "loss": 3.5601, "step": 12650 }, { "epoch": 0.8598314988449518, "grad_norm": 1.3951565027236938, "learning_rate": 0.00044626053132219054, "loss": 3.481, "step": 12655 }, { "epoch": 0.8601712189156135, "grad_norm": 1.0245457887649536, "learning_rate": 0.00044623929881777415, "loss": 3.4625, "step": 12660 }, { "epoch": 0.8605109389862753, "grad_norm": 0.9511963129043579, "learning_rate": 0.0004462180663133578, "loss": 3.4879, "step": 12665 }, { "epoch": 0.860850659056937, "grad_norm": 1.1182814836502075, "learning_rate": 0.00044619683380894143, "loss": 3.5285, "step": 12670 }, { "epoch": 0.8611903791275989, "grad_norm": 0.7132582664489746, "learning_rate": 0.00044617560130452504, "loss": 3.5913, "step": 12675 }, { "epoch": 0.8615300991982606, "grad_norm": 1.1309469938278198, "learning_rate": 0.0004461543688001087, "loss": 3.5405, "step": 12680 }, { "epoch": 0.8618698192689224, "grad_norm": 0.9120098948478699, "learning_rate": 0.0004461331362956924, "loss": 3.6121, "step": 12685 }, { "epoch": 0.8622095393395842, "grad_norm": 1.0434361696243286, "learning_rate": 0.000446111903791276, "loss": 3.3213, "step": 12690 }, { "epoch": 0.862549259410246, "grad_norm": 0.9681029915809631, "learning_rate": 0.00044609067128685966, "loss": 3.5791, "step": 12695 }, { "epoch": 0.8628889794809077, "grad_norm": 1.1229740381240845, "learning_rate": 0.00044606943878244327, "loss": 3.6259, "step": 12700 }, { "epoch": 0.8632286995515696, "grad_norm": 0.9277733564376831, "learning_rate": 0.0004460482062780269, "loss": 3.2885, "step": 12705 }, { "epoch": 0.8635684196222313, "grad_norm": 0.9835218787193298, "learning_rate": 0.00044602697377361055, "loss": 3.3333, "step": 12710 }, { "epoch": 0.863908139692893, "grad_norm": 1.0767079591751099, "learning_rate": 0.00044600574126919416, "loss": 3.67, "step": 12715 }, { "epoch": 0.8642478597635548, "grad_norm": 0.7789736390113831, "learning_rate": 0.00044598450876477783, "loss": 3.6443, "step": 12720 }, { "epoch": 0.8645875798342166, "grad_norm": 0.8214367628097534, "learning_rate": 0.0004459632762603615, "loss": 3.699, "step": 12725 }, { "epoch": 0.8649272999048784, "grad_norm": 0.9035855531692505, "learning_rate": 0.0004459420437559451, "loss": 3.5255, "step": 12730 }, { "epoch": 0.8652670199755401, "grad_norm": 1.1037840843200684, "learning_rate": 0.0004459208112515287, "loss": 3.8421, "step": 12735 }, { "epoch": 0.865606740046202, "grad_norm": 1.0417931079864502, "learning_rate": 0.0004458995787471124, "loss": 3.6928, "step": 12740 }, { "epoch": 0.8659464601168637, "grad_norm": 1.0071690082550049, "learning_rate": 0.000445878346242696, "loss": 3.6176, "step": 12745 }, { "epoch": 0.8662861801875255, "grad_norm": 0.746520459651947, "learning_rate": 0.0004458571137382796, "loss": 3.658, "step": 12750 }, { "epoch": 0.8666259002581872, "grad_norm": 0.8519235849380493, "learning_rate": 0.00044583588123386334, "loss": 3.506, "step": 12755 }, { "epoch": 0.866965620328849, "grad_norm": 1.044810175895691, "learning_rate": 0.00044581464872944695, "loss": 3.4388, "step": 12760 }, { "epoch": 0.8673053403995108, "grad_norm": 0.9035362005233765, "learning_rate": 0.00044579341622503056, "loss": 3.6117, "step": 12765 }, { "epoch": 0.8676450604701725, "grad_norm": 0.9660487771034241, "learning_rate": 0.00044577218372061423, "loss": 3.6836, "step": 12770 }, { "epoch": 0.8679847805408344, "grad_norm": 0.833626389503479, "learning_rate": 0.00044575095121619784, "loss": 3.6436, "step": 12775 }, { "epoch": 0.8683245006114961, "grad_norm": 0.8806959986686707, "learning_rate": 0.0004457297187117815, "loss": 3.492, "step": 12780 }, { "epoch": 0.8686642206821579, "grad_norm": 0.9339196085929871, "learning_rate": 0.0004457084862073651, "loss": 3.7205, "step": 12785 }, { "epoch": 0.8690039407528197, "grad_norm": 0.9823588728904724, "learning_rate": 0.0004456872537029488, "loss": 3.456, "step": 12790 }, { "epoch": 0.8693436608234815, "grad_norm": 0.7961263656616211, "learning_rate": 0.00044566602119853246, "loss": 3.5753, "step": 12795 }, { "epoch": 0.8696833808941432, "grad_norm": 0.9518716931343079, "learning_rate": 0.00044564478869411607, "loss": 3.7428, "step": 12800 }, { "epoch": 0.870023100964805, "grad_norm": 0.9649718999862671, "learning_rate": 0.0004456235561896997, "loss": 3.7478, "step": 12805 }, { "epoch": 0.8703628210354668, "grad_norm": 0.9161604046821594, "learning_rate": 0.00044560232368528335, "loss": 3.769, "step": 12810 }, { "epoch": 0.8707025411061285, "grad_norm": 0.9804285168647766, "learning_rate": 0.00044558109118086696, "loss": 3.6158, "step": 12815 }, { "epoch": 0.8710422611767903, "grad_norm": 0.8411022424697876, "learning_rate": 0.0004455598586764506, "loss": 3.6355, "step": 12820 }, { "epoch": 0.8713819812474521, "grad_norm": 1.2027101516723633, "learning_rate": 0.0004455386261720343, "loss": 3.729, "step": 12825 }, { "epoch": 0.8717217013181139, "grad_norm": 0.7500593662261963, "learning_rate": 0.0004455173936676179, "loss": 3.5389, "step": 12830 }, { "epoch": 0.8720614213887756, "grad_norm": 0.7673034071922302, "learning_rate": 0.0004454961611632015, "loss": 3.3885, "step": 12835 }, { "epoch": 0.8724011414594374, "grad_norm": 1.0513495206832886, "learning_rate": 0.0004454749286587852, "loss": 3.5473, "step": 12840 }, { "epoch": 0.8727408615300992, "grad_norm": 0.874727725982666, "learning_rate": 0.0004454536961543688, "loss": 3.4121, "step": 12845 }, { "epoch": 0.873080581600761, "grad_norm": 0.9480744004249573, "learning_rate": 0.0004454324636499524, "loss": 3.8741, "step": 12850 }, { "epoch": 0.8734203016714227, "grad_norm": 0.9010410308837891, "learning_rate": 0.00044541123114553614, "loss": 3.6529, "step": 12855 }, { "epoch": 0.8737600217420846, "grad_norm": 0.9184209704399109, "learning_rate": 0.00044538999864111975, "loss": 3.4853, "step": 12860 }, { "epoch": 0.8740997418127463, "grad_norm": 0.8163796067237854, "learning_rate": 0.00044536876613670336, "loss": 3.4907, "step": 12865 }, { "epoch": 0.874439461883408, "grad_norm": 1.4485867023468018, "learning_rate": 0.00044534753363228703, "loss": 3.2256, "step": 12870 }, { "epoch": 0.8747791819540699, "grad_norm": 0.9171136617660522, "learning_rate": 0.00044532630112787064, "loss": 3.6506, "step": 12875 }, { "epoch": 0.8751189020247316, "grad_norm": 0.8955926299095154, "learning_rate": 0.00044530506862345426, "loss": 3.2574, "step": 12880 }, { "epoch": 0.8754586220953934, "grad_norm": 1.004455327987671, "learning_rate": 0.0004452838361190379, "loss": 3.6162, "step": 12885 }, { "epoch": 0.8757983421660551, "grad_norm": 0.7679913640022278, "learning_rate": 0.0004452626036146216, "loss": 3.5029, "step": 12890 }, { "epoch": 0.876138062236717, "grad_norm": 1.0409228801727295, "learning_rate": 0.0004452413711102052, "loss": 3.6236, "step": 12895 }, { "epoch": 0.8764777823073787, "grad_norm": 0.9322468042373657, "learning_rate": 0.00044522013860578887, "loss": 3.7696, "step": 12900 }, { "epoch": 0.8768175023780405, "grad_norm": 1.007021427154541, "learning_rate": 0.0004451989061013725, "loss": 3.5307, "step": 12905 }, { "epoch": 0.8771572224487023, "grad_norm": 1.3184840679168701, "learning_rate": 0.0004451776735969561, "loss": 3.5444, "step": 12910 }, { "epoch": 0.877496942519364, "grad_norm": 1.1224373579025269, "learning_rate": 0.00044515644109253976, "loss": 3.2759, "step": 12915 }, { "epoch": 0.8778366625900258, "grad_norm": 1.022701382637024, "learning_rate": 0.0004451352085881234, "loss": 3.5087, "step": 12920 }, { "epoch": 0.8781763826606876, "grad_norm": 3.677396059036255, "learning_rate": 0.00044511397608370704, "loss": 3.3262, "step": 12925 }, { "epoch": 0.8785161027313494, "grad_norm": 1.1206810474395752, "learning_rate": 0.0004450927435792907, "loss": 3.6215, "step": 12930 }, { "epoch": 0.8788558228020111, "grad_norm": 0.9483447670936584, "learning_rate": 0.0004450715110748743, "loss": 3.54, "step": 12935 }, { "epoch": 0.8791955428726729, "grad_norm": 1.0204813480377197, "learning_rate": 0.00044505027857045794, "loss": 3.5458, "step": 12940 }, { "epoch": 0.8795352629433347, "grad_norm": 0.8052372932434082, "learning_rate": 0.0004450290460660416, "loss": 3.5583, "step": 12945 }, { "epoch": 0.8798749830139965, "grad_norm": 0.8698875904083252, "learning_rate": 0.0004450078135616252, "loss": 3.2555, "step": 12950 }, { "epoch": 0.8802147030846582, "grad_norm": 1.1111174821853638, "learning_rate": 0.00044498658105720883, "loss": 3.6441, "step": 12955 }, { "epoch": 0.8805544231553201, "grad_norm": 0.9337734580039978, "learning_rate": 0.00044496534855279255, "loss": 3.6019, "step": 12960 }, { "epoch": 0.8808941432259818, "grad_norm": 0.8609040975570679, "learning_rate": 0.00044494411604837616, "loss": 3.6209, "step": 12965 }, { "epoch": 0.8812338632966435, "grad_norm": 0.8916138410568237, "learning_rate": 0.0004449228835439598, "loss": 3.6175, "step": 12970 }, { "epoch": 0.8815735833673053, "grad_norm": 0.9046021103858948, "learning_rate": 0.00044490165103954344, "loss": 3.5646, "step": 12975 }, { "epoch": 0.8819133034379671, "grad_norm": 0.8501407504081726, "learning_rate": 0.00044488041853512706, "loss": 3.7051, "step": 12980 }, { "epoch": 0.8822530235086289, "grad_norm": 1.0695995092391968, "learning_rate": 0.00044485918603071067, "loss": 3.4824, "step": 12985 }, { "epoch": 0.8825927435792906, "grad_norm": 1.1519553661346436, "learning_rate": 0.00044483795352629434, "loss": 3.7243, "step": 12990 }, { "epoch": 0.8829324636499525, "grad_norm": 1.3812532424926758, "learning_rate": 0.000444816721021878, "loss": 3.5904, "step": 12995 }, { "epoch": 0.8832721837206142, "grad_norm": 1.3075042963027954, "learning_rate": 0.0004447954885174616, "loss": 3.6242, "step": 13000 }, { "epoch": 0.883611903791276, "grad_norm": 2.354304790496826, "learning_rate": 0.0004447742560130453, "loss": 3.6668, "step": 13005 }, { "epoch": 0.8839516238619378, "grad_norm": 1.0983448028564453, "learning_rate": 0.0004447530235086289, "loss": 3.3884, "step": 13010 }, { "epoch": 0.8842913439325996, "grad_norm": 3.6822054386138916, "learning_rate": 0.0004447317910042125, "loss": 3.4092, "step": 13015 }, { "epoch": 0.8846310640032613, "grad_norm": 1.6118465662002563, "learning_rate": 0.0004447105584997962, "loss": 3.428, "step": 13020 }, { "epoch": 0.884970784073923, "grad_norm": 1.0850859880447388, "learning_rate": 0.0004446893259953798, "loss": 3.4378, "step": 13025 }, { "epoch": 0.8853105041445849, "grad_norm": 1.2613999843597412, "learning_rate": 0.00044466809349096346, "loss": 3.4958, "step": 13030 }, { "epoch": 0.8856502242152466, "grad_norm": 1.0292744636535645, "learning_rate": 0.0004446468609865471, "loss": 3.5553, "step": 13035 }, { "epoch": 0.8859899442859084, "grad_norm": 1.0077913999557495, "learning_rate": 0.00044462562848213074, "loss": 3.4401, "step": 13040 }, { "epoch": 0.8863296643565702, "grad_norm": 0.7548008561134338, "learning_rate": 0.00044460439597771435, "loss": 3.5129, "step": 13045 }, { "epoch": 0.886669384427232, "grad_norm": 0.9413583874702454, "learning_rate": 0.000444583163473298, "loss": 3.7001, "step": 13050 }, { "epoch": 0.8870091044978937, "grad_norm": 0.9448134899139404, "learning_rate": 0.00044456193096888163, "loss": 3.531, "step": 13055 }, { "epoch": 0.8873488245685555, "grad_norm": 1.1951847076416016, "learning_rate": 0.00044454069846446524, "loss": 3.4725, "step": 13060 }, { "epoch": 0.8876885446392173, "grad_norm": 0.8186973929405212, "learning_rate": 0.00044451946596004896, "loss": 3.5701, "step": 13065 }, { "epoch": 0.8880282647098791, "grad_norm": 1.1335898637771606, "learning_rate": 0.0004444982334556326, "loss": 3.4188, "step": 13070 }, { "epoch": 0.8883679847805408, "grad_norm": 1.0515152215957642, "learning_rate": 0.0004444770009512162, "loss": 3.6522, "step": 13075 }, { "epoch": 0.8887077048512027, "grad_norm": 0.8767325282096863, "learning_rate": 0.00044445576844679986, "loss": 3.5706, "step": 13080 }, { "epoch": 0.8890474249218644, "grad_norm": 4.196320533752441, "learning_rate": 0.00044443453594238347, "loss": 3.4813, "step": 13085 }, { "epoch": 0.8893871449925261, "grad_norm": 0.9282090663909912, "learning_rate": 0.0004444133034379671, "loss": 3.586, "step": 13090 }, { "epoch": 0.889726865063188, "grad_norm": 1.1875864267349243, "learning_rate": 0.00044439207093355075, "loss": 3.5865, "step": 13095 }, { "epoch": 0.8900665851338497, "grad_norm": 0.8466371297836304, "learning_rate": 0.0004443708384291344, "loss": 3.5137, "step": 13100 }, { "epoch": 0.8904063052045115, "grad_norm": 0.954591691493988, "learning_rate": 0.00044434960592471803, "loss": 3.5654, "step": 13105 }, { "epoch": 0.8907460252751732, "grad_norm": 2.4860174655914307, "learning_rate": 0.0004443283734203017, "loss": 3.6497, "step": 13110 }, { "epoch": 0.8910857453458351, "grad_norm": 0.9467321634292603, "learning_rate": 0.0004443071409158853, "loss": 3.5093, "step": 13115 }, { "epoch": 0.8914254654164968, "grad_norm": 0.9325839281082153, "learning_rate": 0.000444285908411469, "loss": 3.7139, "step": 13120 }, { "epoch": 0.8917651854871586, "grad_norm": 1.0472116470336914, "learning_rate": 0.0004442646759070526, "loss": 3.3664, "step": 13125 }, { "epoch": 0.8921049055578204, "grad_norm": 1.0695289373397827, "learning_rate": 0.0004442434434026362, "loss": 3.3608, "step": 13130 }, { "epoch": 0.8924446256284821, "grad_norm": 1.0988632440567017, "learning_rate": 0.0004442222108982199, "loss": 3.4591, "step": 13135 }, { "epoch": 0.8927843456991439, "grad_norm": 0.9099370241165161, "learning_rate": 0.00044420097839380354, "loss": 3.531, "step": 13140 }, { "epoch": 0.8931240657698056, "grad_norm": 0.96169513463974, "learning_rate": 0.00044417974588938715, "loss": 3.7058, "step": 13145 }, { "epoch": 0.8934637858404675, "grad_norm": 1.0726947784423828, "learning_rate": 0.0004441585133849708, "loss": 3.5613, "step": 13150 }, { "epoch": 0.8938035059111292, "grad_norm": 0.8229075074195862, "learning_rate": 0.00044413728088055443, "loss": 3.689, "step": 13155 }, { "epoch": 0.894143225981791, "grad_norm": 1.029867172241211, "learning_rate": 0.00044411604837613804, "loss": 3.28, "step": 13160 }, { "epoch": 0.8944829460524528, "grad_norm": 0.8575703501701355, "learning_rate": 0.0004440948158717217, "loss": 3.3233, "step": 13165 }, { "epoch": 0.8948226661231146, "grad_norm": 0.9124071002006531, "learning_rate": 0.0004440735833673054, "loss": 3.3526, "step": 13170 }, { "epoch": 0.8951623861937763, "grad_norm": 0.6732546091079712, "learning_rate": 0.000444052350862889, "loss": 3.6354, "step": 13175 }, { "epoch": 0.8955021062644382, "grad_norm": 1.1759495735168457, "learning_rate": 0.00044403111835847266, "loss": 3.5501, "step": 13180 }, { "epoch": 0.8958418263350999, "grad_norm": 0.9575709700584412, "learning_rate": 0.00044400988585405627, "loss": 3.7879, "step": 13185 }, { "epoch": 0.8961815464057616, "grad_norm": 0.8911659121513367, "learning_rate": 0.0004439886533496399, "loss": 3.5229, "step": 13190 }, { "epoch": 0.8965212664764234, "grad_norm": 1.0208302736282349, "learning_rate": 0.00044396742084522355, "loss": 3.681, "step": 13195 }, { "epoch": 0.8968609865470852, "grad_norm": 0.8563026189804077, "learning_rate": 0.00044394618834080716, "loss": 3.4783, "step": 13200 }, { "epoch": 0.897200706617747, "grad_norm": 0.8496609926223755, "learning_rate": 0.00044392495583639083, "loss": 3.3879, "step": 13205 }, { "epoch": 0.8975404266884087, "grad_norm": 0.9146761894226074, "learning_rate": 0.0004439037233319745, "loss": 3.4643, "step": 13210 }, { "epoch": 0.8978801467590706, "grad_norm": 0.9994539022445679, "learning_rate": 0.0004438824908275581, "loss": 3.4189, "step": 13215 }, { "epoch": 0.8982198668297323, "grad_norm": 1.0348008871078491, "learning_rate": 0.0004438612583231417, "loss": 3.5323, "step": 13220 }, { "epoch": 0.8985595869003941, "grad_norm": 2.82961368560791, "learning_rate": 0.0004438400258187254, "loss": 3.7087, "step": 13225 }, { "epoch": 0.8988993069710558, "grad_norm": 1.0363900661468506, "learning_rate": 0.000443818793314309, "loss": 3.7662, "step": 13230 }, { "epoch": 0.8992390270417177, "grad_norm": 1.0600723028182983, "learning_rate": 0.0004437975608098926, "loss": 3.396, "step": 13235 }, { "epoch": 0.8995787471123794, "grad_norm": 0.8201188445091248, "learning_rate": 0.00044377632830547634, "loss": 3.7255, "step": 13240 }, { "epoch": 0.8999184671830411, "grad_norm": 0.9527798295021057, "learning_rate": 0.00044375509580105995, "loss": 3.2838, "step": 13245 }, { "epoch": 0.900258187253703, "grad_norm": 1.797564148902893, "learning_rate": 0.00044373386329664356, "loss": 3.493, "step": 13250 }, { "epoch": 0.9005979073243647, "grad_norm": 0.9097295999526978, "learning_rate": 0.00044371263079222723, "loss": 3.5627, "step": 13255 }, { "epoch": 0.9009376273950265, "grad_norm": 0.9185404777526855, "learning_rate": 0.00044369139828781084, "loss": 3.5972, "step": 13260 }, { "epoch": 0.9012773474656883, "grad_norm": 0.8827181458473206, "learning_rate": 0.00044367016578339446, "loss": 3.5043, "step": 13265 }, { "epoch": 0.9016170675363501, "grad_norm": 1.19967782497406, "learning_rate": 0.0004436489332789781, "loss": 3.5456, "step": 13270 }, { "epoch": 0.9019567876070118, "grad_norm": 1.0836889743804932, "learning_rate": 0.0004436277007745618, "loss": 3.7327, "step": 13275 }, { "epoch": 0.9022965076776736, "grad_norm": 1.1051934957504272, "learning_rate": 0.0004436064682701454, "loss": 3.7695, "step": 13280 }, { "epoch": 0.9026362277483354, "grad_norm": 0.7576434016227722, "learning_rate": 0.00044358523576572907, "loss": 3.5403, "step": 13285 }, { "epoch": 0.9029759478189971, "grad_norm": 1.060336709022522, "learning_rate": 0.0004435640032613127, "loss": 3.5868, "step": 13290 }, { "epoch": 0.9033156678896589, "grad_norm": 0.9955601692199707, "learning_rate": 0.0004435427707568963, "loss": 3.7579, "step": 13295 }, { "epoch": 0.9036553879603207, "grad_norm": 0.860587477684021, "learning_rate": 0.00044352153825247997, "loss": 3.6558, "step": 13300 }, { "epoch": 0.9039951080309825, "grad_norm": 1.0796478986740112, "learning_rate": 0.0004435003057480636, "loss": 3.332, "step": 13305 }, { "epoch": 0.9043348281016442, "grad_norm": 0.9480171799659729, "learning_rate": 0.00044347907324364725, "loss": 3.7115, "step": 13310 }, { "epoch": 0.904674548172306, "grad_norm": 2.0243005752563477, "learning_rate": 0.0004434578407392309, "loss": 3.8339, "step": 13315 }, { "epoch": 0.9050142682429678, "grad_norm": 1.299910068511963, "learning_rate": 0.0004434366082348145, "loss": 3.4589, "step": 13320 }, { "epoch": 0.9053539883136296, "grad_norm": 0.9730696082115173, "learning_rate": 0.00044341537573039814, "loss": 3.4917, "step": 13325 }, { "epoch": 0.9056937083842913, "grad_norm": 1.010367751121521, "learning_rate": 0.0004433941432259818, "loss": 3.4623, "step": 13330 }, { "epoch": 0.9060334284549532, "grad_norm": 0.9210299849510193, "learning_rate": 0.0004433729107215654, "loss": 3.6729, "step": 13335 }, { "epoch": 0.9063731485256149, "grad_norm": 0.9636004567146301, "learning_rate": 0.00044335167821714903, "loss": 3.4427, "step": 13340 }, { "epoch": 0.9067128685962766, "grad_norm": 0.8471945524215698, "learning_rate": 0.00044333044571273275, "loss": 3.6397, "step": 13345 }, { "epoch": 0.9070525886669385, "grad_norm": 2.223308563232422, "learning_rate": 0.00044330921320831637, "loss": 3.5624, "step": 13350 }, { "epoch": 0.9073923087376002, "grad_norm": 1.0406038761138916, "learning_rate": 0.0004432879807039, "loss": 3.5435, "step": 13355 }, { "epoch": 0.907732028808262, "grad_norm": 0.9700852632522583, "learning_rate": 0.00044326674819948365, "loss": 3.4871, "step": 13360 }, { "epoch": 0.9080717488789237, "grad_norm": 0.9706659913063049, "learning_rate": 0.00044324551569506726, "loss": 3.4875, "step": 13365 }, { "epoch": 0.9084114689495856, "grad_norm": 1.106802225112915, "learning_rate": 0.00044322428319065087, "loss": 3.507, "step": 13370 }, { "epoch": 0.9087511890202473, "grad_norm": 0.9407106041908264, "learning_rate": 0.00044320305068623454, "loss": 3.1798, "step": 13375 }, { "epoch": 0.9090909090909091, "grad_norm": 1.026228904724121, "learning_rate": 0.0004431818181818182, "loss": 3.7805, "step": 13380 }, { "epoch": 0.9094306291615709, "grad_norm": 0.9001518487930298, "learning_rate": 0.0004431605856774018, "loss": 3.7679, "step": 13385 }, { "epoch": 0.9097703492322327, "grad_norm": 1.1139030456542969, "learning_rate": 0.0004431393531729855, "loss": 3.6075, "step": 13390 }, { "epoch": 0.9101100693028944, "grad_norm": 0.8329018354415894, "learning_rate": 0.0004431181206685691, "loss": 3.3853, "step": 13395 }, { "epoch": 0.9104497893735561, "grad_norm": 0.9903094172477722, "learning_rate": 0.0004430968881641527, "loss": 3.3127, "step": 13400 }, { "epoch": 0.910789509444218, "grad_norm": 1.0699189901351929, "learning_rate": 0.0004430756556597364, "loss": 3.9453, "step": 13405 }, { "epoch": 0.9111292295148797, "grad_norm": 4.949556827545166, "learning_rate": 0.00044305442315532, "loss": 3.3005, "step": 13410 }, { "epoch": 0.9114689495855415, "grad_norm": 0.7176287770271301, "learning_rate": 0.00044303319065090366, "loss": 3.5565, "step": 13415 }, { "epoch": 0.9118086696562033, "grad_norm": 0.9006063342094421, "learning_rate": 0.0004430119581464873, "loss": 3.8039, "step": 13420 }, { "epoch": 0.9121483897268651, "grad_norm": 0.9720682501792908, "learning_rate": 0.00044299072564207094, "loss": 3.5334, "step": 13425 }, { "epoch": 0.9124881097975268, "grad_norm": 0.8172016143798828, "learning_rate": 0.00044296949313765455, "loss": 3.3425, "step": 13430 }, { "epoch": 0.9128278298681887, "grad_norm": 1.0075267553329468, "learning_rate": 0.0004429482606332382, "loss": 3.5319, "step": 13435 }, { "epoch": 0.9131675499388504, "grad_norm": 1.0824682712554932, "learning_rate": 0.00044292702812882183, "loss": 3.3809, "step": 13440 }, { "epoch": 0.9135072700095122, "grad_norm": 0.9852837324142456, "learning_rate": 0.00044290579562440545, "loss": 3.4348, "step": 13445 }, { "epoch": 0.9138469900801739, "grad_norm": 0.9620903730392456, "learning_rate": 0.00044288456311998917, "loss": 3.4096, "step": 13450 }, { "epoch": 0.9141867101508357, "grad_norm": 1.356423258781433, "learning_rate": 0.0004428633306155728, "loss": 3.4672, "step": 13455 }, { "epoch": 0.9145264302214975, "grad_norm": 1.0467629432678223, "learning_rate": 0.00044284209811115645, "loss": 3.816, "step": 13460 }, { "epoch": 0.9148661502921592, "grad_norm": 1.304762840270996, "learning_rate": 0.00044282086560674006, "loss": 3.6213, "step": 13465 }, { "epoch": 0.9152058703628211, "grad_norm": 1.0135976076126099, "learning_rate": 0.00044279963310232367, "loss": 3.7329, "step": 13470 }, { "epoch": 0.9155455904334828, "grad_norm": 0.8632964491844177, "learning_rate": 0.00044277840059790734, "loss": 3.5338, "step": 13475 }, { "epoch": 0.9158853105041446, "grad_norm": 0.9970428943634033, "learning_rate": 0.000442757168093491, "loss": 3.4091, "step": 13480 }, { "epoch": 0.9162250305748063, "grad_norm": 0.9053795337677002, "learning_rate": 0.0004427359355890746, "loss": 3.5097, "step": 13485 }, { "epoch": 0.9165647506454682, "grad_norm": 1.2105143070220947, "learning_rate": 0.0004427147030846583, "loss": 3.6512, "step": 13490 }, { "epoch": 0.9169044707161299, "grad_norm": 1.164586067199707, "learning_rate": 0.0004426934705802419, "loss": 3.6125, "step": 13495 }, { "epoch": 0.9172441907867916, "grad_norm": 0.9292495846748352, "learning_rate": 0.0004426722380758255, "loss": 3.4661, "step": 13500 }, { "epoch": 0.9175839108574535, "grad_norm": 0.9434965252876282, "learning_rate": 0.0004426510055714092, "loss": 3.6163, "step": 13505 }, { "epoch": 0.9179236309281152, "grad_norm": 0.8257644176483154, "learning_rate": 0.0004426297730669928, "loss": 3.9247, "step": 13510 }, { "epoch": 0.918263350998777, "grad_norm": 0.9839769601821899, "learning_rate": 0.00044260854056257646, "loss": 3.1172, "step": 13515 }, { "epoch": 0.9186030710694388, "grad_norm": 1.4962306022644043, "learning_rate": 0.0004425873080581601, "loss": 3.6362, "step": 13520 }, { "epoch": 0.9189427911401006, "grad_norm": 0.9489110708236694, "learning_rate": 0.00044256607555374374, "loss": 3.5296, "step": 13525 }, { "epoch": 0.9192825112107623, "grad_norm": 0.9288069009780884, "learning_rate": 0.00044254484304932735, "loss": 3.2476, "step": 13530 }, { "epoch": 0.9196222312814241, "grad_norm": 0.9621761441230774, "learning_rate": 0.000442523610544911, "loss": 3.6687, "step": 13535 }, { "epoch": 0.9199619513520859, "grad_norm": 0.8630064129829407, "learning_rate": 0.00044250237804049463, "loss": 3.7278, "step": 13540 }, { "epoch": 0.9203016714227477, "grad_norm": 1.080405831336975, "learning_rate": 0.00044248114553607825, "loss": 3.4846, "step": 13545 }, { "epoch": 0.9206413914934094, "grad_norm": 0.8927462697029114, "learning_rate": 0.00044245991303166197, "loss": 3.7365, "step": 13550 }, { "epoch": 0.9209811115640713, "grad_norm": 1.0494718551635742, "learning_rate": 0.0004424386805272456, "loss": 3.5788, "step": 13555 }, { "epoch": 0.921320831634733, "grad_norm": 1.0289667844772339, "learning_rate": 0.0004424174480228292, "loss": 3.5478, "step": 13560 }, { "epoch": 0.9216605517053947, "grad_norm": 1.0069925785064697, "learning_rate": 0.00044239621551841286, "loss": 3.5516, "step": 13565 }, { "epoch": 0.9220002717760565, "grad_norm": 0.8834696412086487, "learning_rate": 0.0004423749830139965, "loss": 3.5974, "step": 13570 }, { "epoch": 0.9223399918467183, "grad_norm": 0.9646829962730408, "learning_rate": 0.0004423537505095801, "loss": 3.5783, "step": 13575 }, { "epoch": 0.9226797119173801, "grad_norm": 1.7096019983291626, "learning_rate": 0.00044233251800516375, "loss": 3.4878, "step": 13580 }, { "epoch": 0.9230194319880418, "grad_norm": 1.06480872631073, "learning_rate": 0.0004423112855007474, "loss": 3.2486, "step": 13585 }, { "epoch": 0.9233591520587037, "grad_norm": 0.9053540825843811, "learning_rate": 0.00044229005299633103, "loss": 3.5922, "step": 13590 }, { "epoch": 0.9236988721293654, "grad_norm": 0.9553683400154114, "learning_rate": 0.0004422688204919147, "loss": 3.5793, "step": 13595 }, { "epoch": 0.9240385922000272, "grad_norm": 0.8622778654098511, "learning_rate": 0.0004422475879874983, "loss": 3.4842, "step": 13600 }, { "epoch": 0.924378312270689, "grad_norm": 1.052263855934143, "learning_rate": 0.0004422263554830819, "loss": 3.7285, "step": 13605 }, { "epoch": 0.9247180323413507, "grad_norm": 0.8415585160255432, "learning_rate": 0.0004422051229786656, "loss": 3.6685, "step": 13610 }, { "epoch": 0.9250577524120125, "grad_norm": 0.9726094603538513, "learning_rate": 0.0004421838904742492, "loss": 3.4441, "step": 13615 }, { "epoch": 0.9253974724826742, "grad_norm": 0.905716061592102, "learning_rate": 0.0004421626579698329, "loss": 3.5956, "step": 13620 }, { "epoch": 0.9257371925533361, "grad_norm": 0.7982794642448425, "learning_rate": 0.00044214142546541654, "loss": 3.6464, "step": 13625 }, { "epoch": 0.9260769126239978, "grad_norm": 0.9245908856391907, "learning_rate": 0.00044212019296100015, "loss": 3.5091, "step": 13630 }, { "epoch": 0.9264166326946596, "grad_norm": 0.9351087212562561, "learning_rate": 0.00044209896045658377, "loss": 3.7008, "step": 13635 }, { "epoch": 0.9267563527653214, "grad_norm": 1.1677684783935547, "learning_rate": 0.00044207772795216743, "loss": 3.5479, "step": 13640 }, { "epoch": 0.9270960728359832, "grad_norm": 0.9295387864112854, "learning_rate": 0.00044205649544775105, "loss": 3.3195, "step": 13645 }, { "epoch": 0.9274357929066449, "grad_norm": 1.0576543807983398, "learning_rate": 0.00044203526294333466, "loss": 3.7172, "step": 13650 }, { "epoch": 0.9277755129773066, "grad_norm": 1.404225468635559, "learning_rate": 0.0004420140304389184, "loss": 3.6344, "step": 13655 }, { "epoch": 0.9281152330479685, "grad_norm": 0.7672839760780334, "learning_rate": 0.000441992797934502, "loss": 3.6951, "step": 13660 }, { "epoch": 0.9284549531186302, "grad_norm": 0.9337202310562134, "learning_rate": 0.0004419715654300856, "loss": 3.5002, "step": 13665 }, { "epoch": 0.928794673189292, "grad_norm": 0.8002074360847473, "learning_rate": 0.0004419503329256693, "loss": 3.7219, "step": 13670 }, { "epoch": 0.9291343932599538, "grad_norm": 1.1406962871551514, "learning_rate": 0.0004419291004212529, "loss": 3.6681, "step": 13675 }, { "epoch": 0.9294741133306156, "grad_norm": 1.0014840364456177, "learning_rate": 0.0004419078679168365, "loss": 3.585, "step": 13680 }, { "epoch": 0.9298138334012773, "grad_norm": 1.0222225189208984, "learning_rate": 0.00044188663541242017, "loss": 3.3547, "step": 13685 }, { "epoch": 0.9301535534719392, "grad_norm": 0.9725627303123474, "learning_rate": 0.00044186540290800383, "loss": 3.4415, "step": 13690 }, { "epoch": 0.9304932735426009, "grad_norm": 1.0598926544189453, "learning_rate": 0.00044184417040358745, "loss": 3.4681, "step": 13695 }, { "epoch": 0.9308329936132627, "grad_norm": 0.8718339800834656, "learning_rate": 0.0004418229378991711, "loss": 3.384, "step": 13700 }, { "epoch": 0.9311727136839244, "grad_norm": 1.0832544565200806, "learning_rate": 0.0004418017053947547, "loss": 3.4788, "step": 13705 }, { "epoch": 0.9315124337545863, "grad_norm": 1.1541874408721924, "learning_rate": 0.00044178047289033834, "loss": 3.5528, "step": 13710 }, { "epoch": 0.931852153825248, "grad_norm": 0.9185882210731506, "learning_rate": 0.000441759240385922, "loss": 3.7123, "step": 13715 }, { "epoch": 0.9321918738959097, "grad_norm": 0.9624565243721008, "learning_rate": 0.0004417380078815056, "loss": 3.5044, "step": 13720 }, { "epoch": 0.9325315939665716, "grad_norm": 0.9797097444534302, "learning_rate": 0.0004417167753770893, "loss": 3.2764, "step": 13725 }, { "epoch": 0.9328713140372333, "grad_norm": 0.8659570813179016, "learning_rate": 0.00044169554287267295, "loss": 3.4671, "step": 13730 }, { "epoch": 0.9332110341078951, "grad_norm": 0.9372638463973999, "learning_rate": 0.00044167431036825657, "loss": 3.7455, "step": 13735 }, { "epoch": 0.9335507541785568, "grad_norm": 0.8745118975639343, "learning_rate": 0.0004416530778638402, "loss": 3.5649, "step": 13740 }, { "epoch": 0.9338904742492187, "grad_norm": 0.995104968547821, "learning_rate": 0.00044163184535942385, "loss": 3.6012, "step": 13745 }, { "epoch": 0.9342301943198804, "grad_norm": 1.038103699684143, "learning_rate": 0.00044161061285500746, "loss": 3.09, "step": 13750 }, { "epoch": 0.9345699143905422, "grad_norm": 0.7861025333404541, "learning_rate": 0.0004415893803505911, "loss": 3.7297, "step": 13755 }, { "epoch": 0.934909634461204, "grad_norm": 2.183199167251587, "learning_rate": 0.0004415681478461748, "loss": 3.4676, "step": 13760 }, { "epoch": 0.9352493545318658, "grad_norm": 1.0198116302490234, "learning_rate": 0.0004415469153417584, "loss": 3.5726, "step": 13765 }, { "epoch": 0.9355890746025275, "grad_norm": 0.7777231335639954, "learning_rate": 0.000441525682837342, "loss": 3.6592, "step": 13770 }, { "epoch": 0.9359287946731893, "grad_norm": 0.9396200180053711, "learning_rate": 0.0004415044503329257, "loss": 3.3224, "step": 13775 }, { "epoch": 0.9362685147438511, "grad_norm": 0.8858252763748169, "learning_rate": 0.0004414832178285093, "loss": 3.45, "step": 13780 }, { "epoch": 0.9366082348145128, "grad_norm": 0.786267101764679, "learning_rate": 0.0004414619853240929, "loss": 3.5773, "step": 13785 }, { "epoch": 0.9369479548851746, "grad_norm": 0.8539401888847351, "learning_rate": 0.0004414407528196766, "loss": 3.4705, "step": 13790 }, { "epoch": 0.9372876749558364, "grad_norm": 1.208612322807312, "learning_rate": 0.00044141952031526025, "loss": 3.5953, "step": 13795 }, { "epoch": 0.9376273950264982, "grad_norm": 0.8389764428138733, "learning_rate": 0.0004413982878108439, "loss": 3.3299, "step": 13800 }, { "epoch": 0.9379671150971599, "grad_norm": 0.7778698801994324, "learning_rate": 0.00044137705530642753, "loss": 3.4933, "step": 13805 }, { "epoch": 0.9383068351678218, "grad_norm": 1.015949010848999, "learning_rate": 0.00044135582280201114, "loss": 3.1949, "step": 13810 }, { "epoch": 0.9386465552384835, "grad_norm": 1.0659600496292114, "learning_rate": 0.0004413345902975948, "loss": 3.4927, "step": 13815 }, { "epoch": 0.9389862753091452, "grad_norm": 0.7723532915115356, "learning_rate": 0.0004413133577931784, "loss": 3.6151, "step": 13820 }, { "epoch": 0.939325995379807, "grad_norm": 1.0572954416275024, "learning_rate": 0.00044129212528876203, "loss": 3.1864, "step": 13825 }, { "epoch": 0.9396657154504688, "grad_norm": 1.1734416484832764, "learning_rate": 0.00044127089278434576, "loss": 3.6164, "step": 13830 }, { "epoch": 0.9400054355211306, "grad_norm": 0.8777297735214233, "learning_rate": 0.00044124966027992937, "loss": 3.4519, "step": 13835 }, { "epoch": 0.9403451555917923, "grad_norm": 0.7513958215713501, "learning_rate": 0.000441228427775513, "loss": 3.4644, "step": 13840 }, { "epoch": 0.9406848756624542, "grad_norm": 1.1917390823364258, "learning_rate": 0.00044120719527109665, "loss": 3.3985, "step": 13845 }, { "epoch": 0.9410245957331159, "grad_norm": 0.884412407875061, "learning_rate": 0.00044118596276668026, "loss": 3.6766, "step": 13850 }, { "epoch": 0.9413643158037777, "grad_norm": 0.8421000242233276, "learning_rate": 0.0004411647302622639, "loss": 3.6335, "step": 13855 }, { "epoch": 0.9417040358744395, "grad_norm": 1.1148154735565186, "learning_rate": 0.00044114349775784754, "loss": 3.5881, "step": 13860 }, { "epoch": 0.9420437559451013, "grad_norm": 1.2249623537063599, "learning_rate": 0.0004411222652534312, "loss": 3.2529, "step": 13865 }, { "epoch": 0.942383476015763, "grad_norm": 1.2252427339553833, "learning_rate": 0.0004411010327490148, "loss": 3.5587, "step": 13870 }, { "epoch": 0.9427231960864247, "grad_norm": 1.040501356124878, "learning_rate": 0.0004410798002445985, "loss": 3.5855, "step": 13875 }, { "epoch": 0.9430629161570866, "grad_norm": 1.1010082960128784, "learning_rate": 0.0004410585677401821, "loss": 3.558, "step": 13880 }, { "epoch": 0.9434026362277483, "grad_norm": 1.002020239830017, "learning_rate": 0.0004410373352357657, "loss": 3.2971, "step": 13885 }, { "epoch": 0.9437423562984101, "grad_norm": 0.8831568956375122, "learning_rate": 0.0004410161027313494, "loss": 3.7049, "step": 13890 }, { "epoch": 0.9440820763690719, "grad_norm": 0.8336716890335083, "learning_rate": 0.000440994870226933, "loss": 3.6928, "step": 13895 }, { "epoch": 0.9444217964397337, "grad_norm": 1.4800230264663696, "learning_rate": 0.00044097363772251666, "loss": 3.4263, "step": 13900 }, { "epoch": 0.9447615165103954, "grad_norm": 1.0641231536865234, "learning_rate": 0.00044095240521810033, "loss": 3.5515, "step": 13905 }, { "epoch": 0.9451012365810572, "grad_norm": 0.9438132643699646, "learning_rate": 0.00044093117271368394, "loss": 3.7154, "step": 13910 }, { "epoch": 0.945440956651719, "grad_norm": 1.071640133857727, "learning_rate": 0.00044090994020926755, "loss": 3.4951, "step": 13915 }, { "epoch": 0.9457806767223808, "grad_norm": 0.8943241834640503, "learning_rate": 0.0004408887077048512, "loss": 3.3311, "step": 13920 }, { "epoch": 0.9461203967930425, "grad_norm": 0.9142396450042725, "learning_rate": 0.00044086747520043483, "loss": 3.6552, "step": 13925 }, { "epoch": 0.9464601168637043, "grad_norm": 1.2697099447250366, "learning_rate": 0.00044084624269601845, "loss": 3.4206, "step": 13930 }, { "epoch": 0.9467998369343661, "grad_norm": 1.0873968601226807, "learning_rate": 0.00044082501019160217, "loss": 3.4687, "step": 13935 }, { "epoch": 0.9471395570050278, "grad_norm": 1.1348236799240112, "learning_rate": 0.0004408037776871858, "loss": 3.7003, "step": 13940 }, { "epoch": 0.9474792770756897, "grad_norm": 0.9899526238441467, "learning_rate": 0.0004407825451827694, "loss": 3.612, "step": 13945 }, { "epoch": 0.9478189971463514, "grad_norm": 1.0078707933425903, "learning_rate": 0.00044076131267835306, "loss": 3.6154, "step": 13950 }, { "epoch": 0.9481587172170132, "grad_norm": 0.7832008600234985, "learning_rate": 0.0004407400801739367, "loss": 3.2983, "step": 13955 }, { "epoch": 0.9484984372876749, "grad_norm": 0.8793731927871704, "learning_rate": 0.0004407188476695203, "loss": 3.4359, "step": 13960 }, { "epoch": 0.9488381573583368, "grad_norm": 0.8704424500465393, "learning_rate": 0.000440697615165104, "loss": 3.2534, "step": 13965 }, { "epoch": 0.9491778774289985, "grad_norm": 1.2797993421554565, "learning_rate": 0.0004406763826606876, "loss": 3.7746, "step": 13970 }, { "epoch": 0.9495175974996602, "grad_norm": 0.9371976256370544, "learning_rate": 0.00044065515015627123, "loss": 3.6065, "step": 13975 }, { "epoch": 0.9498573175703221, "grad_norm": 1.7058855295181274, "learning_rate": 0.0004406339176518549, "loss": 3.6722, "step": 13980 }, { "epoch": 0.9501970376409838, "grad_norm": 0.9982700347900391, "learning_rate": 0.0004406126851474385, "loss": 3.4334, "step": 13985 }, { "epoch": 0.9505367577116456, "grad_norm": 1.0867435932159424, "learning_rate": 0.00044059145264302213, "loss": 3.5563, "step": 13990 }, { "epoch": 0.9508764777823073, "grad_norm": 0.9400560855865479, "learning_rate": 0.0004405702201386058, "loss": 3.6169, "step": 13995 }, { "epoch": 0.9512161978529692, "grad_norm": 0.9144636392593384, "learning_rate": 0.00044054898763418946, "loss": 3.4379, "step": 14000 }, { "epoch": 0.9515559179236309, "grad_norm": 0.7027807831764221, "learning_rate": 0.0004405277551297731, "loss": 3.2558, "step": 14005 }, { "epoch": 0.9518956379942927, "grad_norm": 0.854626476764679, "learning_rate": 0.00044050652262535674, "loss": 3.3775, "step": 14010 }, { "epoch": 0.9522353580649545, "grad_norm": 0.976573646068573, "learning_rate": 0.00044048529012094036, "loss": 3.6538, "step": 14015 }, { "epoch": 0.9525750781356163, "grad_norm": 0.9175491333007812, "learning_rate": 0.00044046405761652397, "loss": 3.567, "step": 14020 }, { "epoch": 0.952914798206278, "grad_norm": 0.9751697778701782, "learning_rate": 0.00044044282511210764, "loss": 3.6619, "step": 14025 }, { "epoch": 0.9532545182769399, "grad_norm": 0.8073590397834778, "learning_rate": 0.00044042159260769125, "loss": 3.5251, "step": 14030 }, { "epoch": 0.9535942383476016, "grad_norm": 6.931807994842529, "learning_rate": 0.0004404003601032749, "loss": 3.5318, "step": 14035 }, { "epoch": 0.9539339584182633, "grad_norm": 1.1380449533462524, "learning_rate": 0.0004403791275988586, "loss": 3.6674, "step": 14040 }, { "epoch": 0.9542736784889251, "grad_norm": 1.169405221939087, "learning_rate": 0.0004403578950944422, "loss": 3.5738, "step": 14045 }, { "epoch": 0.9546133985595869, "grad_norm": 0.9875165820121765, "learning_rate": 0.0004403366625900258, "loss": 3.7792, "step": 14050 }, { "epoch": 0.9549531186302487, "grad_norm": 1.1411341428756714, "learning_rate": 0.0004403154300856095, "loss": 3.5115, "step": 14055 }, { "epoch": 0.9552928387009104, "grad_norm": 0.9959802627563477, "learning_rate": 0.0004402941975811931, "loss": 3.4759, "step": 14060 }, { "epoch": 0.9556325587715723, "grad_norm": 0.919893741607666, "learning_rate": 0.0004402729650767767, "loss": 3.5372, "step": 14065 }, { "epoch": 0.955972278842234, "grad_norm": 0.955470085144043, "learning_rate": 0.0004402517325723604, "loss": 3.4728, "step": 14070 }, { "epoch": 0.9563119989128958, "grad_norm": 0.8784619569778442, "learning_rate": 0.00044023050006794404, "loss": 3.6893, "step": 14075 }, { "epoch": 0.9566517189835575, "grad_norm": 1.2055732011795044, "learning_rate": 0.00044020926756352765, "loss": 3.592, "step": 14080 }, { "epoch": 0.9569914390542194, "grad_norm": 0.850624680519104, "learning_rate": 0.0004401880350591113, "loss": 3.4503, "step": 14085 }, { "epoch": 0.9573311591248811, "grad_norm": 1.0446397066116333, "learning_rate": 0.00044016680255469493, "loss": 3.5483, "step": 14090 }, { "epoch": 0.9576708791955428, "grad_norm": 0.8839070796966553, "learning_rate": 0.00044014557005027854, "loss": 3.6184, "step": 14095 }, { "epoch": 0.9580105992662047, "grad_norm": 1.0622018575668335, "learning_rate": 0.0004401243375458622, "loss": 3.411, "step": 14100 }, { "epoch": 0.9583503193368664, "grad_norm": 1.0349913835525513, "learning_rate": 0.0004401031050414459, "loss": 3.3464, "step": 14105 }, { "epoch": 0.9586900394075282, "grad_norm": 0.9302172660827637, "learning_rate": 0.0004400818725370295, "loss": 3.3343, "step": 14110 }, { "epoch": 0.95902975947819, "grad_norm": 1.651986837387085, "learning_rate": 0.00044006064003261316, "loss": 3.5453, "step": 14115 }, { "epoch": 0.9593694795488518, "grad_norm": 0.9349258542060852, "learning_rate": 0.00044003940752819677, "loss": 3.9331, "step": 14120 }, { "epoch": 0.9597091996195135, "grad_norm": 1.3609799146652222, "learning_rate": 0.0004400181750237804, "loss": 3.7338, "step": 14125 }, { "epoch": 0.9600489196901753, "grad_norm": 0.9267358183860779, "learning_rate": 0.00043999694251936405, "loss": 3.5473, "step": 14130 }, { "epoch": 0.9603886397608371, "grad_norm": 1.0306013822555542, "learning_rate": 0.00043997571001494766, "loss": 3.5473, "step": 14135 }, { "epoch": 0.9607283598314988, "grad_norm": 0.9533302187919617, "learning_rate": 0.0004399544775105314, "loss": 3.7068, "step": 14140 }, { "epoch": 0.9610680799021606, "grad_norm": 0.7740217447280884, "learning_rate": 0.000439933245006115, "loss": 3.434, "step": 14145 }, { "epoch": 0.9614077999728224, "grad_norm": 0.9012988209724426, "learning_rate": 0.0004399120125016986, "loss": 3.7167, "step": 14150 }, { "epoch": 0.9617475200434842, "grad_norm": 1.06401789188385, "learning_rate": 0.0004398907799972823, "loss": 3.558, "step": 14155 }, { "epoch": 0.9620872401141459, "grad_norm": 2.65097975730896, "learning_rate": 0.0004398695474928659, "loss": 3.6212, "step": 14160 }, { "epoch": 0.9624269601848077, "grad_norm": 0.9091796875, "learning_rate": 0.0004398483149884495, "loss": 3.6227, "step": 14165 }, { "epoch": 0.9627666802554695, "grad_norm": 0.9270727038383484, "learning_rate": 0.00043982708248403317, "loss": 3.5002, "step": 14170 }, { "epoch": 0.9631064003261313, "grad_norm": 0.946383535861969, "learning_rate": 0.00043980584997961684, "loss": 3.5289, "step": 14175 }, { "epoch": 0.963446120396793, "grad_norm": 0.9958605170249939, "learning_rate": 0.00043978461747520045, "loss": 3.5176, "step": 14180 }, { "epoch": 0.9637858404674549, "grad_norm": 1.0582526922225952, "learning_rate": 0.0004397633849707841, "loss": 3.7044, "step": 14185 }, { "epoch": 0.9641255605381166, "grad_norm": 2.444413900375366, "learning_rate": 0.00043974215246636773, "loss": 3.49, "step": 14190 }, { "epoch": 0.9644652806087783, "grad_norm": 1.0152722597122192, "learning_rate": 0.00043972091996195134, "loss": 3.5553, "step": 14195 }, { "epoch": 0.9648050006794402, "grad_norm": 0.9181049466133118, "learning_rate": 0.000439699687457535, "loss": 3.4769, "step": 14200 }, { "epoch": 0.9651447207501019, "grad_norm": 1.1314282417297363, "learning_rate": 0.0004396784549531186, "loss": 3.4152, "step": 14205 }, { "epoch": 0.9654844408207637, "grad_norm": 0.9631584882736206, "learning_rate": 0.0004396572224487023, "loss": 3.5142, "step": 14210 }, { "epoch": 0.9658241608914254, "grad_norm": 1.1719776391983032, "learning_rate": 0.00043963598994428596, "loss": 3.5147, "step": 14215 }, { "epoch": 0.9661638809620873, "grad_norm": 1.8382433652877808, "learning_rate": 0.00043961475743986957, "loss": 3.7269, "step": 14220 }, { "epoch": 0.966503601032749, "grad_norm": 1.141181468963623, "learning_rate": 0.0004395935249354532, "loss": 3.5494, "step": 14225 }, { "epoch": 0.9668433211034108, "grad_norm": 1.0092344284057617, "learning_rate": 0.00043957229243103685, "loss": 3.5569, "step": 14230 }, { "epoch": 0.9671830411740726, "grad_norm": 0.8426174521446228, "learning_rate": 0.00043955105992662046, "loss": 3.6038, "step": 14235 }, { "epoch": 0.9675227612447344, "grad_norm": 1.1085238456726074, "learning_rate": 0.0004395298274222041, "loss": 3.3949, "step": 14240 }, { "epoch": 0.9678624813153961, "grad_norm": 1.0652363300323486, "learning_rate": 0.0004395085949177878, "loss": 3.3948, "step": 14245 }, { "epoch": 0.9682022013860578, "grad_norm": 1.1395741701126099, "learning_rate": 0.0004394873624133714, "loss": 3.6254, "step": 14250 }, { "epoch": 0.9685419214567197, "grad_norm": 1.175266981124878, "learning_rate": 0.000439466129908955, "loss": 3.699, "step": 14255 }, { "epoch": 0.9688816415273814, "grad_norm": 0.8731865882873535, "learning_rate": 0.0004394448974045387, "loss": 3.5698, "step": 14260 }, { "epoch": 0.9692213615980432, "grad_norm": 0.7814122438430786, "learning_rate": 0.0004394236649001223, "loss": 3.4709, "step": 14265 }, { "epoch": 0.969561081668705, "grad_norm": 0.9858877062797546, "learning_rate": 0.0004394024323957059, "loss": 3.4972, "step": 14270 }, { "epoch": 0.9699008017393668, "grad_norm": 1.133333444595337, "learning_rate": 0.0004393811998912896, "loss": 3.6417, "step": 14275 }, { "epoch": 0.9702405218100285, "grad_norm": 1.310455083847046, "learning_rate": 0.00043935996738687325, "loss": 3.5194, "step": 14280 }, { "epoch": 0.9705802418806904, "grad_norm": 0.9394775629043579, "learning_rate": 0.00043933873488245686, "loss": 3.5727, "step": 14285 }, { "epoch": 0.9709199619513521, "grad_norm": 0.8055437803268433, "learning_rate": 0.00043931750237804053, "loss": 3.5464, "step": 14290 }, { "epoch": 0.9712596820220138, "grad_norm": 0.9413021206855774, "learning_rate": 0.00043929626987362414, "loss": 3.5369, "step": 14295 }, { "epoch": 0.9715994020926756, "grad_norm": 0.9261248707771301, "learning_rate": 0.00043927503736920776, "loss": 3.5618, "step": 14300 }, { "epoch": 0.9719391221633374, "grad_norm": 0.9271903038024902, "learning_rate": 0.0004392538048647914, "loss": 3.6878, "step": 14305 }, { "epoch": 0.9722788422339992, "grad_norm": 1.5374598503112793, "learning_rate": 0.00043923257236037504, "loss": 3.5908, "step": 14310 }, { "epoch": 0.9726185623046609, "grad_norm": 2.3589017391204834, "learning_rate": 0.0004392113398559587, "loss": 3.3492, "step": 14315 }, { "epoch": 0.9729582823753228, "grad_norm": 1.1739871501922607, "learning_rate": 0.00043919010735154237, "loss": 3.4962, "step": 14320 }, { "epoch": 0.9732980024459845, "grad_norm": 1.1032137870788574, "learning_rate": 0.000439168874847126, "loss": 3.08, "step": 14325 }, { "epoch": 0.9736377225166463, "grad_norm": 0.9096193313598633, "learning_rate": 0.0004391476423427096, "loss": 3.6538, "step": 14330 }, { "epoch": 0.973977442587308, "grad_norm": 0.7554748058319092, "learning_rate": 0.00043912640983829326, "loss": 3.6467, "step": 14335 }, { "epoch": 0.9743171626579699, "grad_norm": 0.6941629648208618, "learning_rate": 0.0004391051773338769, "loss": 3.5679, "step": 14340 }, { "epoch": 0.9746568827286316, "grad_norm": 0.8571432828903198, "learning_rate": 0.0004390839448294605, "loss": 3.6628, "step": 14345 }, { "epoch": 0.9749966027992933, "grad_norm": 0.9264265894889832, "learning_rate": 0.0004390627123250442, "loss": 3.5068, "step": 14350 }, { "epoch": 0.9753363228699552, "grad_norm": 0.899558424949646, "learning_rate": 0.0004390414798206278, "loss": 3.2138, "step": 14355 }, { "epoch": 0.9756760429406169, "grad_norm": 0.9839175343513489, "learning_rate": 0.00043902024731621144, "loss": 3.7133, "step": 14360 }, { "epoch": 0.9760157630112787, "grad_norm": 0.7452185750007629, "learning_rate": 0.0004389990148117951, "loss": 3.5526, "step": 14365 }, { "epoch": 0.9763554830819405, "grad_norm": 0.9220340251922607, "learning_rate": 0.0004389777823073787, "loss": 3.6692, "step": 14370 }, { "epoch": 0.9766952031526023, "grad_norm": 0.9767449498176575, "learning_rate": 0.00043895654980296233, "loss": 3.3451, "step": 14375 }, { "epoch": 0.977034923223264, "grad_norm": 0.9306650757789612, "learning_rate": 0.000438935317298546, "loss": 3.5964, "step": 14380 }, { "epoch": 0.9773746432939258, "grad_norm": 0.8170539140701294, "learning_rate": 0.00043891408479412966, "loss": 3.1983, "step": 14385 }, { "epoch": 0.9777143633645876, "grad_norm": 1.1695082187652588, "learning_rate": 0.0004388928522897133, "loss": 3.3445, "step": 14390 }, { "epoch": 0.9780540834352494, "grad_norm": 0.9874559640884399, "learning_rate": 0.00043887161978529694, "loss": 3.6645, "step": 14395 }, { "epoch": 0.9783938035059111, "grad_norm": 1.0070935487747192, "learning_rate": 0.00043885038728088056, "loss": 3.5558, "step": 14400 }, { "epoch": 0.978733523576573, "grad_norm": 0.8603373765945435, "learning_rate": 0.00043882915477646417, "loss": 3.325, "step": 14405 }, { "epoch": 0.9790732436472347, "grad_norm": 1.0488711595535278, "learning_rate": 0.00043880792227204784, "loss": 3.3902, "step": 14410 }, { "epoch": 0.9794129637178964, "grad_norm": 0.8430708050727844, "learning_rate": 0.00043878668976763145, "loss": 3.546, "step": 14415 }, { "epoch": 0.9797526837885582, "grad_norm": 1.125368356704712, "learning_rate": 0.0004387654572632151, "loss": 3.3052, "step": 14420 }, { "epoch": 0.98009240385922, "grad_norm": 0.8030582070350647, "learning_rate": 0.0004387442247587988, "loss": 3.2762, "step": 14425 }, { "epoch": 0.9804321239298818, "grad_norm": 2.8177764415740967, "learning_rate": 0.0004387229922543824, "loss": 3.6721, "step": 14430 }, { "epoch": 0.9807718440005435, "grad_norm": 2.182983636856079, "learning_rate": 0.000438701759749966, "loss": 3.1722, "step": 14435 }, { "epoch": 0.9811115640712054, "grad_norm": 0.9884249567985535, "learning_rate": 0.0004386805272455497, "loss": 3.5765, "step": 14440 }, { "epoch": 0.9814512841418671, "grad_norm": 0.8932371735572815, "learning_rate": 0.0004386592947411333, "loss": 3.6993, "step": 14445 }, { "epoch": 0.9817910042125289, "grad_norm": 1.2873276472091675, "learning_rate": 0.0004386380622367169, "loss": 3.1949, "step": 14450 }, { "epoch": 0.9821307242831907, "grad_norm": 0.6776039600372314, "learning_rate": 0.0004386168297323006, "loss": 3.667, "step": 14455 }, { "epoch": 0.9824704443538524, "grad_norm": 1.4132121801376343, "learning_rate": 0.00043859559722788424, "loss": 3.3689, "step": 14460 }, { "epoch": 0.9828101644245142, "grad_norm": 0.8015496134757996, "learning_rate": 0.00043857436472346785, "loss": 3.5999, "step": 14465 }, { "epoch": 0.9831498844951759, "grad_norm": 1.0027128458023071, "learning_rate": 0.0004385531322190515, "loss": 3.6108, "step": 14470 }, { "epoch": 0.9834896045658378, "grad_norm": 1.0885581970214844, "learning_rate": 0.00043853189971463513, "loss": 3.306, "step": 14475 }, { "epoch": 0.9838293246364995, "grad_norm": 1.0878907442092896, "learning_rate": 0.0004385106672102188, "loss": 3.4068, "step": 14480 }, { "epoch": 0.9841690447071613, "grad_norm": 0.8352845907211304, "learning_rate": 0.0004384894347058024, "loss": 3.6852, "step": 14485 }, { "epoch": 0.9845087647778231, "grad_norm": 0.9107652306556702, "learning_rate": 0.0004384682022013861, "loss": 3.4749, "step": 14490 }, { "epoch": 0.9848484848484849, "grad_norm": 1.2634153366088867, "learning_rate": 0.00043844696969696974, "loss": 3.7031, "step": 14495 }, { "epoch": 0.9851882049191466, "grad_norm": 0.9515687823295593, "learning_rate": 0.00043842573719255336, "loss": 3.5461, "step": 14500 }, { "epoch": 0.9855279249898083, "grad_norm": 0.9786934852600098, "learning_rate": 0.00043840450468813697, "loss": 3.4339, "step": 14505 }, { "epoch": 0.9858676450604702, "grad_norm": 1.3431333303451538, "learning_rate": 0.00043838327218372064, "loss": 3.3851, "step": 14510 }, { "epoch": 0.9862073651311319, "grad_norm": 0.9427924156188965, "learning_rate": 0.00043836203967930425, "loss": 3.6953, "step": 14515 }, { "epoch": 0.9865470852017937, "grad_norm": 0.8899068832397461, "learning_rate": 0.00043834080717488786, "loss": 3.4206, "step": 14520 }, { "epoch": 0.9868868052724555, "grad_norm": 0.743144154548645, "learning_rate": 0.0004383195746704716, "loss": 3.3144, "step": 14525 }, { "epoch": 0.9872265253431173, "grad_norm": 0.927723228931427, "learning_rate": 0.0004382983421660552, "loss": 3.5907, "step": 14530 }, { "epoch": 0.987566245413779, "grad_norm": 0.9800081849098206, "learning_rate": 0.0004382771096616388, "loss": 3.5071, "step": 14535 }, { "epoch": 0.9879059654844409, "grad_norm": 1.2955937385559082, "learning_rate": 0.0004382558771572225, "loss": 3.6711, "step": 14540 }, { "epoch": 0.9882456855551026, "grad_norm": 0.7317197918891907, "learning_rate": 0.0004382346446528061, "loss": 3.4282, "step": 14545 }, { "epoch": 0.9885854056257644, "grad_norm": 1.0374677181243896, "learning_rate": 0.0004382134121483897, "loss": 3.4202, "step": 14550 }, { "epoch": 0.9889251256964261, "grad_norm": 1.0559358596801758, "learning_rate": 0.0004381921796439734, "loss": 3.4537, "step": 14555 }, { "epoch": 0.989264845767088, "grad_norm": 1.0114065408706665, "learning_rate": 0.00043817094713955704, "loss": 3.5002, "step": 14560 }, { "epoch": 0.9896045658377497, "grad_norm": 0.8741345405578613, "learning_rate": 0.00043814971463514065, "loss": 3.5984, "step": 14565 }, { "epoch": 0.9899442859084114, "grad_norm": 0.7316309213638306, "learning_rate": 0.0004381284821307243, "loss": 3.353, "step": 14570 }, { "epoch": 0.9902840059790733, "grad_norm": 1.0699881315231323, "learning_rate": 0.00043810724962630793, "loss": 3.6502, "step": 14575 }, { "epoch": 0.990623726049735, "grad_norm": 1.2946524620056152, "learning_rate": 0.00043808601712189154, "loss": 3.4527, "step": 14580 }, { "epoch": 0.9909634461203968, "grad_norm": 0.9162943959236145, "learning_rate": 0.0004380647846174752, "loss": 3.517, "step": 14585 }, { "epoch": 0.9913031661910585, "grad_norm": 0.8214098215103149, "learning_rate": 0.0004380435521130589, "loss": 3.6479, "step": 14590 }, { "epoch": 0.9916428862617204, "grad_norm": 0.9749112725257874, "learning_rate": 0.0004380223196086425, "loss": 3.4931, "step": 14595 }, { "epoch": 0.9919826063323821, "grad_norm": 0.8387222290039062, "learning_rate": 0.00043800108710422616, "loss": 3.5742, "step": 14600 }, { "epoch": 0.9923223264030439, "grad_norm": 1.1342140436172485, "learning_rate": 0.00043797985459980977, "loss": 3.7442, "step": 14605 }, { "epoch": 0.9926620464737057, "grad_norm": 0.9792100787162781, "learning_rate": 0.0004379586220953934, "loss": 3.4549, "step": 14610 }, { "epoch": 0.9930017665443674, "grad_norm": 0.9276760220527649, "learning_rate": 0.00043793738959097705, "loss": 3.6125, "step": 14615 }, { "epoch": 0.9933414866150292, "grad_norm": 0.8439565300941467, "learning_rate": 0.00043791615708656066, "loss": 3.426, "step": 14620 }, { "epoch": 0.993681206685691, "grad_norm": 0.9246503710746765, "learning_rate": 0.00043789492458214433, "loss": 3.5574, "step": 14625 }, { "epoch": 0.9940209267563528, "grad_norm": 1.1263114213943481, "learning_rate": 0.000437873692077728, "loss": 3.7136, "step": 14630 }, { "epoch": 0.9943606468270145, "grad_norm": 1.132860541343689, "learning_rate": 0.0004378524595733116, "loss": 3.4564, "step": 14635 }, { "epoch": 0.9947003668976763, "grad_norm": 0.8425511121749878, "learning_rate": 0.0004378312270688952, "loss": 3.6844, "step": 14640 }, { "epoch": 0.9950400869683381, "grad_norm": 0.9429223537445068, "learning_rate": 0.0004378099945644789, "loss": 3.4863, "step": 14645 }, { "epoch": 0.9953798070389999, "grad_norm": 0.8464606404304504, "learning_rate": 0.0004377887620600625, "loss": 3.4275, "step": 14650 }, { "epoch": 0.9957195271096616, "grad_norm": 1.0782575607299805, "learning_rate": 0.0004377675295556461, "loss": 3.5384, "step": 14655 }, { "epoch": 0.9960592471803235, "grad_norm": 0.9185747504234314, "learning_rate": 0.00043774629705122984, "loss": 3.5237, "step": 14660 }, { "epoch": 0.9963989672509852, "grad_norm": 0.8369750380516052, "learning_rate": 0.00043772506454681345, "loss": 3.7467, "step": 14665 }, { "epoch": 0.9967386873216469, "grad_norm": 0.8333107829093933, "learning_rate": 0.00043770383204239706, "loss": 3.631, "step": 14670 }, { "epoch": 0.9970784073923087, "grad_norm": 0.908769965171814, "learning_rate": 0.00043768259953798073, "loss": 3.646, "step": 14675 }, { "epoch": 0.9974181274629705, "grad_norm": 0.9687934517860413, "learning_rate": 0.00043766136703356434, "loss": 3.6305, "step": 14680 }, { "epoch": 0.9977578475336323, "grad_norm": 0.9129026532173157, "learning_rate": 0.00043764013452914796, "loss": 3.4805, "step": 14685 }, { "epoch": 0.998097567604294, "grad_norm": 0.6658088564872742, "learning_rate": 0.0004376189020247316, "loss": 3.6416, "step": 14690 }, { "epoch": 0.9984372876749559, "grad_norm": 0.949741005897522, "learning_rate": 0.0004375976695203153, "loss": 3.8658, "step": 14695 }, { "epoch": 0.9987770077456176, "grad_norm": 0.9700025916099548, "learning_rate": 0.0004375764370158989, "loss": 3.733, "step": 14700 }, { "epoch": 0.9991167278162794, "grad_norm": 1.0414241552352905, "learning_rate": 0.00043755520451148257, "loss": 3.4624, "step": 14705 }, { "epoch": 0.9994564478869412, "grad_norm": 1.1105138063430786, "learning_rate": 0.0004375339720070662, "loss": 3.6248, "step": 14710 }, { "epoch": 0.999796167957603, "grad_norm": 1.0124422311782837, "learning_rate": 0.0004375127395026498, "loss": 3.7868, "step": 14715 }, { "epoch": 1.0, "eval_bertscore": { "f1": 0.8380511583185829, "precision": 0.8360792870657402, "recall": 0.8409853013625312 }, "eval_bleu_4": 0.02353788217437277, "eval_exact_match": 0.0002907258455276674, "eval_loss": 3.421555995941162, "eval_meteor": 0.0877485167791103, "eval_rouge": { "rouge1": 0.1188266125889923, "rouge2": 0.01922633047906528, "rougeL": 0.10274273610935375, "rougeLsum": 0.10282861822225708 }, "eval_runtime": 3950.6327, "eval_samples_per_second": 2.612, "eval_steps_per_second": 0.327, "step": 14718 } ], "logging_steps": 5, "max_steps": 117744, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.404478824657715e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }