{ "best_metric": 0.021728611886642827, "best_model_checkpoint": "./results-cc/code-t5/codet5_fmft_official_0.0001/checkpoint-14718", "epoch": 1.0, "eval_steps": 500, "global_step": 14718, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003397200706617747, "grad_norm": 6.21469259262085, "learning_rate": 9.999660279929339e-05, "loss": 7.0272, "step": 5 }, { "epoch": 0.0006794401413235494, "grad_norm": 4.770811557769775, "learning_rate": 9.999235629841012e-05, "loss": 4.8093, "step": 10 }, { "epoch": 0.0010191602119853241, "grad_norm": 3.3407115936279297, "learning_rate": 9.998810979752684e-05, "loss": 4.5152, "step": 15 }, { "epoch": 0.001358880282647099, "grad_norm": 3.316413402557373, "learning_rate": 9.998386329664357e-05, "loss": 3.8516, "step": 20 }, { "epoch": 0.0016986003533088735, "grad_norm": 3.275665760040283, "learning_rate": 9.99796167957603e-05, "loss": 4.1015, "step": 25 }, { "epoch": 0.0020383204239706482, "grad_norm": 3.4907727241516113, "learning_rate": 9.997537029487703e-05, "loss": 3.9727, "step": 30 }, { "epoch": 0.002378040494632423, "grad_norm": 4.418203353881836, "learning_rate": 9.997112379399374e-05, "loss": 3.8008, "step": 35 }, { "epoch": 0.002717760565294198, "grad_norm": 2.678400993347168, "learning_rate": 9.996687729311048e-05, "loss": 4.036, "step": 40 }, { "epoch": 0.0030574806359559724, "grad_norm": 3.6774635314941406, "learning_rate": 9.996263079222721e-05, "loss": 4.2775, "step": 45 }, { "epoch": 0.003397200706617747, "grad_norm": 7.125624179840088, "learning_rate": 9.995838429134394e-05, "loss": 3.8021, "step": 50 }, { "epoch": 0.0037369207772795215, "grad_norm": 3.47074294090271, "learning_rate": 9.995413779046067e-05, "loss": 3.9785, "step": 55 }, { "epoch": 0.0040766408479412965, "grad_norm": 4.336516380310059, "learning_rate": 9.99498912895774e-05, "loss": 3.9175, "step": 60 }, { "epoch": 0.0044163609186030715, "grad_norm": 2.7094192504882812, "learning_rate": 9.994564478869412e-05, "loss": 3.9308, "step": 65 }, { "epoch": 0.004756080989264846, "grad_norm": 2.82883620262146, "learning_rate": 9.994139828781085e-05, "loss": 4.2121, "step": 70 }, { "epoch": 0.005095801059926621, "grad_norm": 2.981565475463867, "learning_rate": 9.993715178692758e-05, "loss": 4.0359, "step": 75 }, { "epoch": 0.005435521130588396, "grad_norm": 10.057659149169922, "learning_rate": 9.993290528604431e-05, "loss": 3.8392, "step": 80 }, { "epoch": 0.00577524120125017, "grad_norm": 4.061558723449707, "learning_rate": 9.992865878516104e-05, "loss": 3.8535, "step": 85 }, { "epoch": 0.006114961271911945, "grad_norm": 12.613106727600098, "learning_rate": 9.992441228427776e-05, "loss": 3.9515, "step": 90 }, { "epoch": 0.006454681342573719, "grad_norm": 2.677182197570801, "learning_rate": 9.992016578339449e-05, "loss": 4.1063, "step": 95 }, { "epoch": 0.006794401413235494, "grad_norm": 3.033088207244873, "learning_rate": 9.991591928251122e-05, "loss": 3.98, "step": 100 }, { "epoch": 0.007134121483897269, "grad_norm": 3.225051164627075, "learning_rate": 9.991167278162795e-05, "loss": 3.5691, "step": 105 }, { "epoch": 0.007473841554559043, "grad_norm": 2.7921855449676514, "learning_rate": 9.990742628074468e-05, "loss": 4.0241, "step": 110 }, { "epoch": 0.007813561625220818, "grad_norm": 2.826827049255371, "learning_rate": 9.99031797798614e-05, "loss": 4.0395, "step": 115 }, { "epoch": 0.008153281695882593, "grad_norm": 9.27625560760498, "learning_rate": 9.989893327897812e-05, "loss": 4.1619, "step": 120 }, { "epoch": 0.008493001766544368, "grad_norm": 2.8031373023986816, "learning_rate": 9.989468677809486e-05, "loss": 3.9835, "step": 125 }, { "epoch": 0.008832721837206143, "grad_norm": 3.7431797981262207, "learning_rate": 9.989044027721159e-05, "loss": 3.8106, "step": 130 }, { "epoch": 0.009172441907867916, "grad_norm": 5.541841983795166, "learning_rate": 9.98861937763283e-05, "loss": 3.7821, "step": 135 }, { "epoch": 0.009512161978529691, "grad_norm": 3.7987961769104004, "learning_rate": 9.988194727544504e-05, "loss": 3.7472, "step": 140 }, { "epoch": 0.009851882049191466, "grad_norm": 6.096441745758057, "learning_rate": 9.987770077456177e-05, "loss": 3.7836, "step": 145 }, { "epoch": 0.010191602119853241, "grad_norm": 3.9141364097595215, "learning_rate": 9.987345427367849e-05, "loss": 4.0505, "step": 150 }, { "epoch": 0.010531322190515016, "grad_norm": 3.785848379135132, "learning_rate": 9.986920777279523e-05, "loss": 3.8544, "step": 155 }, { "epoch": 0.010871042261176791, "grad_norm": 3.2531163692474365, "learning_rate": 9.986496127191196e-05, "loss": 3.8316, "step": 160 }, { "epoch": 0.011210762331838564, "grad_norm": 3.1866166591644287, "learning_rate": 9.986071477102867e-05, "loss": 3.8971, "step": 165 }, { "epoch": 0.01155048240250034, "grad_norm": 2.796600818634033, "learning_rate": 9.985646827014541e-05, "loss": 3.7587, "step": 170 }, { "epoch": 0.011890202473162114, "grad_norm": 2.753453254699707, "learning_rate": 9.985222176926214e-05, "loss": 3.7977, "step": 175 }, { "epoch": 0.01222992254382389, "grad_norm": 3.392162322998047, "learning_rate": 9.984797526837885e-05, "loss": 3.9074, "step": 180 }, { "epoch": 0.012569642614485664, "grad_norm": 3.0789575576782227, "learning_rate": 9.98437287674956e-05, "loss": 3.9379, "step": 185 }, { "epoch": 0.012909362685147438, "grad_norm": 3.4421534538269043, "learning_rate": 9.983948226661231e-05, "loss": 3.8396, "step": 190 }, { "epoch": 0.013249082755809213, "grad_norm": 8.736903190612793, "learning_rate": 9.983523576572904e-05, "loss": 4.0782, "step": 195 }, { "epoch": 0.013588802826470988, "grad_norm": 3.5333974361419678, "learning_rate": 9.983098926484578e-05, "loss": 3.9485, "step": 200 }, { "epoch": 0.013928522897132763, "grad_norm": 3.0175955295562744, "learning_rate": 9.98267427639625e-05, "loss": 4.0261, "step": 205 }, { "epoch": 0.014268242967794538, "grad_norm": 2.8383960723876953, "learning_rate": 9.982249626307922e-05, "loss": 3.6848, "step": 210 }, { "epoch": 0.014607963038456313, "grad_norm": 2.407959222793579, "learning_rate": 9.981824976219596e-05, "loss": 3.806, "step": 215 }, { "epoch": 0.014947683109118086, "grad_norm": 2.9977355003356934, "learning_rate": 9.981400326131268e-05, "loss": 3.9132, "step": 220 }, { "epoch": 0.015287403179779861, "grad_norm": 3.05517578125, "learning_rate": 9.98097567604294e-05, "loss": 3.8259, "step": 225 }, { "epoch": 0.015627123250441636, "grad_norm": 2.4192965030670166, "learning_rate": 9.980551025954615e-05, "loss": 3.7206, "step": 230 }, { "epoch": 0.01596684332110341, "grad_norm": 3.0928332805633545, "learning_rate": 9.980126375866286e-05, "loss": 3.8799, "step": 235 }, { "epoch": 0.016306563391765186, "grad_norm": 3.791109085083008, "learning_rate": 9.979701725777959e-05, "loss": 3.8032, "step": 240 }, { "epoch": 0.01664628346242696, "grad_norm": 2.8189823627471924, "learning_rate": 9.979277075689633e-05, "loss": 3.8046, "step": 245 }, { "epoch": 0.016986003533088736, "grad_norm": 4.813939094543457, "learning_rate": 9.978852425601305e-05, "loss": 3.9856, "step": 250 }, { "epoch": 0.01732572360375051, "grad_norm": 2.9254181385040283, "learning_rate": 9.978427775512977e-05, "loss": 3.7032, "step": 255 }, { "epoch": 0.017665443674412286, "grad_norm": 4.05214786529541, "learning_rate": 9.97800312542465e-05, "loss": 3.8685, "step": 260 }, { "epoch": 0.01800516374507406, "grad_norm": 2.7574172019958496, "learning_rate": 9.977578475336323e-05, "loss": 4.0542, "step": 265 }, { "epoch": 0.018344883815735832, "grad_norm": 12.412968635559082, "learning_rate": 9.977153825247996e-05, "loss": 3.7525, "step": 270 }, { "epoch": 0.01868460388639761, "grad_norm": 2.9556572437286377, "learning_rate": 9.976729175159669e-05, "loss": 3.747, "step": 275 }, { "epoch": 0.019024323957059382, "grad_norm": 2.2956831455230713, "learning_rate": 9.976304525071341e-05, "loss": 3.7616, "step": 280 }, { "epoch": 0.01936404402772116, "grad_norm": 2.4142673015594482, "learning_rate": 9.975879874983014e-05, "loss": 3.8668, "step": 285 }, { "epoch": 0.019703764098382932, "grad_norm": 2.9800455570220947, "learning_rate": 9.975455224894687e-05, "loss": 3.8922, "step": 290 }, { "epoch": 0.020043484169044706, "grad_norm": 2.6110286712646484, "learning_rate": 9.97503057480636e-05, "loss": 3.8381, "step": 295 }, { "epoch": 0.020383204239706482, "grad_norm": 2.673398494720459, "learning_rate": 9.974605924718033e-05, "loss": 3.6466, "step": 300 }, { "epoch": 0.020722924310368256, "grad_norm": 3.2894580364227295, "learning_rate": 9.974181274629705e-05, "loss": 3.8729, "step": 305 }, { "epoch": 0.021062644381030032, "grad_norm": 3.046811103820801, "learning_rate": 9.973756624541378e-05, "loss": 3.7184, "step": 310 }, { "epoch": 0.021402364451691806, "grad_norm": 2.517362594604492, "learning_rate": 9.973331974453051e-05, "loss": 4.1241, "step": 315 }, { "epoch": 0.021742084522353582, "grad_norm": 3.15246319770813, "learning_rate": 9.972907324364724e-05, "loss": 3.8871, "step": 320 }, { "epoch": 0.022081804593015356, "grad_norm": 3.8455631732940674, "learning_rate": 9.972482674276397e-05, "loss": 3.6376, "step": 325 }, { "epoch": 0.02242152466367713, "grad_norm": 2.374818801879883, "learning_rate": 9.972142954205735e-05, "loss": 3.4756, "step": 330 }, { "epoch": 0.022761244734338906, "grad_norm": 3.907341718673706, "learning_rate": 9.971718304117408e-05, "loss": 3.9709, "step": 335 }, { "epoch": 0.02310096480500068, "grad_norm": 3.2161388397216797, "learning_rate": 9.97129365402908e-05, "loss": 3.8017, "step": 340 }, { "epoch": 0.023440684875662456, "grad_norm": 3.5958497524261475, "learning_rate": 9.970869003940753e-05, "loss": 3.8609, "step": 345 }, { "epoch": 0.02378040494632423, "grad_norm": 2.839556932449341, "learning_rate": 9.970444353852426e-05, "loss": 3.5558, "step": 350 }, { "epoch": 0.024120125016986002, "grad_norm": 2.8804948329925537, "learning_rate": 9.970019703764099e-05, "loss": 4.0357, "step": 355 }, { "epoch": 0.02445984508764778, "grad_norm": 4.942351818084717, "learning_rate": 9.969595053675772e-05, "loss": 3.9725, "step": 360 }, { "epoch": 0.024799565158309552, "grad_norm": 3.3335719108581543, "learning_rate": 9.969170403587444e-05, "loss": 3.8432, "step": 365 }, { "epoch": 0.02513928522897133, "grad_norm": 2.7704007625579834, "learning_rate": 9.968745753499117e-05, "loss": 3.9828, "step": 370 }, { "epoch": 0.025479005299633102, "grad_norm": 3.0958409309387207, "learning_rate": 9.96832110341079e-05, "loss": 3.7734, "step": 375 }, { "epoch": 0.025818725370294875, "grad_norm": 2.696258068084717, "learning_rate": 9.967896453322463e-05, "loss": 4.0272, "step": 380 }, { "epoch": 0.026158445440956652, "grad_norm": 2.9321091175079346, "learning_rate": 9.967471803234136e-05, "loss": 3.955, "step": 385 }, { "epoch": 0.026498165511618425, "grad_norm": 2.6848959922790527, "learning_rate": 9.967047153145808e-05, "loss": 3.6993, "step": 390 }, { "epoch": 0.026837885582280202, "grad_norm": 2.8681793212890625, "learning_rate": 9.966622503057481e-05, "loss": 3.4435, "step": 395 }, { "epoch": 0.027177605652941975, "grad_norm": 2.906076431274414, "learning_rate": 9.966197852969154e-05, "loss": 3.6707, "step": 400 }, { "epoch": 0.027517325723603752, "grad_norm": 3.168874979019165, "learning_rate": 9.965773202880827e-05, "loss": 3.5505, "step": 405 }, { "epoch": 0.027857045794265525, "grad_norm": 2.90678334236145, "learning_rate": 9.9653485527925e-05, "loss": 3.9038, "step": 410 }, { "epoch": 0.0281967658649273, "grad_norm": 3.7996811866760254, "learning_rate": 9.964923902704172e-05, "loss": 3.9709, "step": 415 }, { "epoch": 0.028536485935589075, "grad_norm": 3.3655152320861816, "learning_rate": 9.964499252615845e-05, "loss": 3.6655, "step": 420 }, { "epoch": 0.02887620600625085, "grad_norm": 2.801706075668335, "learning_rate": 9.964074602527518e-05, "loss": 4.0456, "step": 425 }, { "epoch": 0.029215926076912625, "grad_norm": 2.752469539642334, "learning_rate": 9.963649952439191e-05, "loss": 3.8482, "step": 430 }, { "epoch": 0.0295556461475744, "grad_norm": 2.6563782691955566, "learning_rate": 9.963225302350864e-05, "loss": 3.8063, "step": 435 }, { "epoch": 0.029895366218236172, "grad_norm": 4.1556220054626465, "learning_rate": 9.962800652262536e-05, "loss": 3.7697, "step": 440 }, { "epoch": 0.03023508628889795, "grad_norm": 3.3705992698669434, "learning_rate": 9.962376002174208e-05, "loss": 3.6937, "step": 445 }, { "epoch": 0.030574806359559722, "grad_norm": 2.6561191082000732, "learning_rate": 9.961951352085882e-05, "loss": 3.55, "step": 450 }, { "epoch": 0.0309145264302215, "grad_norm": 3.07314395904541, "learning_rate": 9.961526701997555e-05, "loss": 3.7798, "step": 455 }, { "epoch": 0.03125424650088327, "grad_norm": 2.819225311279297, "learning_rate": 9.961102051909226e-05, "loss": 4.1133, "step": 460 }, { "epoch": 0.03159396657154505, "grad_norm": 3.2146010398864746, "learning_rate": 9.9606774018209e-05, "loss": 3.7746, "step": 465 }, { "epoch": 0.03193368664220682, "grad_norm": 2.454878568649292, "learning_rate": 9.960252751732573e-05, "loss": 3.8051, "step": 470 }, { "epoch": 0.032273406712868595, "grad_norm": 2.937072515487671, "learning_rate": 9.959828101644245e-05, "loss": 3.9788, "step": 475 }, { "epoch": 0.03261312678353037, "grad_norm": 3.164280414581299, "learning_rate": 9.959403451555919e-05, "loss": 3.5624, "step": 480 }, { "epoch": 0.03295284685419215, "grad_norm": 3.0861897468566895, "learning_rate": 9.958978801467592e-05, "loss": 3.5828, "step": 485 }, { "epoch": 0.03329256692485392, "grad_norm": 3.0691051483154297, "learning_rate": 9.958554151379263e-05, "loss": 3.7145, "step": 490 }, { "epoch": 0.033632286995515695, "grad_norm": 2.841773271560669, "learning_rate": 9.958129501290937e-05, "loss": 3.8588, "step": 495 }, { "epoch": 0.03397200706617747, "grad_norm": 12.969367980957031, "learning_rate": 9.95770485120261e-05, "loss": 3.6362, "step": 500 }, { "epoch": 0.03431172713683924, "grad_norm": 2.858612060546875, "learning_rate": 9.957280201114282e-05, "loss": 3.7403, "step": 505 }, { "epoch": 0.03465144720750102, "grad_norm": 3.6948654651641846, "learning_rate": 9.956855551025956e-05, "loss": 3.7201, "step": 510 }, { "epoch": 0.034991167278162795, "grad_norm": 3.085568904876709, "learning_rate": 9.956430900937627e-05, "loss": 3.6721, "step": 515 }, { "epoch": 0.03533088734882457, "grad_norm": 3.0437660217285156, "learning_rate": 9.9560062508493e-05, "loss": 3.8041, "step": 520 }, { "epoch": 0.03567060741948634, "grad_norm": 2.336348533630371, "learning_rate": 9.955581600760974e-05, "loss": 3.703, "step": 525 }, { "epoch": 0.03601032749014812, "grad_norm": 3.068443536758423, "learning_rate": 9.955156950672646e-05, "loss": 3.5309, "step": 530 }, { "epoch": 0.036350047560809895, "grad_norm": 2.803267240524292, "learning_rate": 9.954732300584318e-05, "loss": 3.74, "step": 535 }, { "epoch": 0.036689767631471665, "grad_norm": 2.932396173477173, "learning_rate": 9.954307650495992e-05, "loss": 3.6244, "step": 540 }, { "epoch": 0.03702948770213344, "grad_norm": 3.2129034996032715, "learning_rate": 9.953883000407664e-05, "loss": 3.6903, "step": 545 }, { "epoch": 0.03736920777279522, "grad_norm": 2.4441819190979004, "learning_rate": 9.953458350319337e-05, "loss": 4.0107, "step": 550 }, { "epoch": 0.03770892784345699, "grad_norm": 2.7356765270233154, "learning_rate": 9.953033700231011e-05, "loss": 3.755, "step": 555 }, { "epoch": 0.038048647914118765, "grad_norm": 2.823246479034424, "learning_rate": 9.952609050142682e-05, "loss": 3.7853, "step": 560 }, { "epoch": 0.03838836798478054, "grad_norm": 3.929452419281006, "learning_rate": 9.952184400054355e-05, "loss": 3.8028, "step": 565 }, { "epoch": 0.03872808805544232, "grad_norm": 3.2468960285186768, "learning_rate": 9.951759749966029e-05, "loss": 3.9392, "step": 570 }, { "epoch": 0.03906780812610409, "grad_norm": 2.5720834732055664, "learning_rate": 9.951335099877701e-05, "loss": 3.8785, "step": 575 }, { "epoch": 0.039407528196765865, "grad_norm": 2.521256446838379, "learning_rate": 9.950910449789374e-05, "loss": 3.6521, "step": 580 }, { "epoch": 0.03974724826742764, "grad_norm": 2.8516931533813477, "learning_rate": 9.950485799701046e-05, "loss": 3.6808, "step": 585 }, { "epoch": 0.04008696833808941, "grad_norm": 3.3582510948181152, "learning_rate": 9.950061149612719e-05, "loss": 3.6077, "step": 590 }, { "epoch": 0.04042668840875119, "grad_norm": 2.874913454055786, "learning_rate": 9.949636499524393e-05, "loss": 3.6599, "step": 595 }, { "epoch": 0.040766408479412965, "grad_norm": 5.1997480392456055, "learning_rate": 9.949211849436065e-05, "loss": 3.7753, "step": 600 }, { "epoch": 0.04110612855007474, "grad_norm": 3.319835662841797, "learning_rate": 9.948787199347738e-05, "loss": 3.5933, "step": 605 }, { "epoch": 0.04144584862073651, "grad_norm": 2.802927017211914, "learning_rate": 9.948362549259412e-05, "loss": 3.841, "step": 610 }, { "epoch": 0.04178556869139829, "grad_norm": 2.867325782775879, "learning_rate": 9.947937899171083e-05, "loss": 3.6816, "step": 615 }, { "epoch": 0.042125288762060065, "grad_norm": 2.727735757827759, "learning_rate": 9.947513249082756e-05, "loss": 3.6244, "step": 620 }, { "epoch": 0.042465008832721834, "grad_norm": 2.464614152908325, "learning_rate": 9.94708859899443e-05, "loss": 3.2751, "step": 625 }, { "epoch": 0.04280472890338361, "grad_norm": 2.5432565212249756, "learning_rate": 9.946663948906102e-05, "loss": 3.5677, "step": 630 }, { "epoch": 0.04314444897404539, "grad_norm": 2.9960644245147705, "learning_rate": 9.946239298817774e-05, "loss": 3.6849, "step": 635 }, { "epoch": 0.043484169044707165, "grad_norm": 2.4648358821868896, "learning_rate": 9.945814648729449e-05, "loss": 3.6208, "step": 640 }, { "epoch": 0.043823889115368934, "grad_norm": 2.461162805557251, "learning_rate": 9.94538999864112e-05, "loss": 3.9922, "step": 645 }, { "epoch": 0.04416360918603071, "grad_norm": 2.333400011062622, "learning_rate": 9.944965348552793e-05, "loss": 3.8104, "step": 650 }, { "epoch": 0.04450332925669249, "grad_norm": 2.3225696086883545, "learning_rate": 9.944540698464467e-05, "loss": 3.6985, "step": 655 }, { "epoch": 0.04484304932735426, "grad_norm": 3.624342441558838, "learning_rate": 9.944116048376138e-05, "loss": 3.6232, "step": 660 }, { "epoch": 0.045182769398016034, "grad_norm": 2.766263723373413, "learning_rate": 9.943691398287811e-05, "loss": 3.6127, "step": 665 }, { "epoch": 0.04552248946867781, "grad_norm": 2.61580491065979, "learning_rate": 9.943266748199484e-05, "loss": 3.8995, "step": 670 }, { "epoch": 0.04586220953933958, "grad_norm": 2.9932680130004883, "learning_rate": 9.942842098111157e-05, "loss": 3.6624, "step": 675 }, { "epoch": 0.04620192961000136, "grad_norm": 2.72580623626709, "learning_rate": 9.94241744802283e-05, "loss": 3.7734, "step": 680 }, { "epoch": 0.046541649680663134, "grad_norm": 2.9109714031219482, "learning_rate": 9.941992797934502e-05, "loss": 3.8153, "step": 685 }, { "epoch": 0.04688136975132491, "grad_norm": 2.517665147781372, "learning_rate": 9.941568147846175e-05, "loss": 3.7694, "step": 690 }, { "epoch": 0.04722108982198668, "grad_norm": 3.241339921951294, "learning_rate": 9.941143497757848e-05, "loss": 3.7621, "step": 695 }, { "epoch": 0.04756080989264846, "grad_norm": 2.0298354625701904, "learning_rate": 9.940718847669521e-05, "loss": 3.9023, "step": 700 }, { "epoch": 0.047900529963310234, "grad_norm": 3.248481035232544, "learning_rate": 9.940294197581194e-05, "loss": 3.7963, "step": 705 }, { "epoch": 0.048240250033972004, "grad_norm": 2.496670722961426, "learning_rate": 9.939869547492866e-05, "loss": 3.7356, "step": 710 }, { "epoch": 0.04857997010463378, "grad_norm": 4.151010990142822, "learning_rate": 9.939444897404539e-05, "loss": 3.6123, "step": 715 }, { "epoch": 0.04891969017529556, "grad_norm": 2.359184503555298, "learning_rate": 9.939020247316212e-05, "loss": 3.7655, "step": 720 }, { "epoch": 0.049259410245957334, "grad_norm": 2.5961592197418213, "learning_rate": 9.938595597227885e-05, "loss": 3.604, "step": 725 }, { "epoch": 0.049599130316619104, "grad_norm": 2.7124698162078857, "learning_rate": 9.938170947139558e-05, "loss": 3.714, "step": 730 }, { "epoch": 0.04993885038728088, "grad_norm": 3.47074556350708, "learning_rate": 9.93774629705123e-05, "loss": 3.5581, "step": 735 }, { "epoch": 0.05027857045794266, "grad_norm": 2.783320426940918, "learning_rate": 9.937321646962903e-05, "loss": 3.9011, "step": 740 }, { "epoch": 0.05061829052860443, "grad_norm": 2.4175233840942383, "learning_rate": 9.936896996874576e-05, "loss": 3.5208, "step": 745 }, { "epoch": 0.050958010599266204, "grad_norm": 2.8976969718933105, "learning_rate": 9.936472346786249e-05, "loss": 3.7739, "step": 750 }, { "epoch": 0.05129773066992798, "grad_norm": 2.563922882080078, "learning_rate": 9.936047696697922e-05, "loss": 3.7629, "step": 755 }, { "epoch": 0.05163745074058975, "grad_norm": 2.919168472290039, "learning_rate": 9.935623046609594e-05, "loss": 3.8214, "step": 760 }, { "epoch": 0.05197717081125153, "grad_norm": 2.8702971935272217, "learning_rate": 9.935198396521267e-05, "loss": 3.6658, "step": 765 }, { "epoch": 0.052316890881913304, "grad_norm": 2.588665008544922, "learning_rate": 9.93477374643294e-05, "loss": 3.6532, "step": 770 }, { "epoch": 0.05265661095257508, "grad_norm": 2.481748580932617, "learning_rate": 9.934349096344613e-05, "loss": 3.8163, "step": 775 }, { "epoch": 0.05299633102323685, "grad_norm": 2.211552381515503, "learning_rate": 9.933924446256286e-05, "loss": 3.8311, "step": 780 }, { "epoch": 0.05333605109389863, "grad_norm": 2.36183762550354, "learning_rate": 9.933499796167957e-05, "loss": 3.6162, "step": 785 }, { "epoch": 0.053675771164560404, "grad_norm": 2.776430368423462, "learning_rate": 9.933075146079631e-05, "loss": 3.7577, "step": 790 }, { "epoch": 0.054015491235222174, "grad_norm": 2.7656915187835693, "learning_rate": 9.932650495991304e-05, "loss": 3.604, "step": 795 }, { "epoch": 0.05435521130588395, "grad_norm": 2.6776299476623535, "learning_rate": 9.932225845902975e-05, "loss": 3.9155, "step": 800 }, { "epoch": 0.05469493137654573, "grad_norm": 2.4317731857299805, "learning_rate": 9.93180119581465e-05, "loss": 3.9431, "step": 805 }, { "epoch": 0.055034651447207504, "grad_norm": 2.411588191986084, "learning_rate": 9.931376545726322e-05, "loss": 3.9271, "step": 810 }, { "epoch": 0.055374371517869274, "grad_norm": 3.2233071327209473, "learning_rate": 9.930951895637994e-05, "loss": 3.7217, "step": 815 }, { "epoch": 0.05571409158853105, "grad_norm": 2.9518697261810303, "learning_rate": 9.930527245549668e-05, "loss": 3.6582, "step": 820 }, { "epoch": 0.05605381165919283, "grad_norm": 2.211646318435669, "learning_rate": 9.930102595461341e-05, "loss": 3.6279, "step": 825 }, { "epoch": 0.0563935317298546, "grad_norm": 3.2264046669006348, "learning_rate": 9.929677945373012e-05, "loss": 3.5536, "step": 830 }, { "epoch": 0.056733251800516374, "grad_norm": 3.2947652339935303, "learning_rate": 9.929253295284686e-05, "loss": 3.6845, "step": 835 }, { "epoch": 0.05707297187117815, "grad_norm": 2.8133134841918945, "learning_rate": 9.928828645196359e-05, "loss": 3.9895, "step": 840 }, { "epoch": 0.05741269194183993, "grad_norm": 2.6523597240448, "learning_rate": 9.92840399510803e-05, "loss": 3.8254, "step": 845 }, { "epoch": 0.0577524120125017, "grad_norm": 2.699572801589966, "learning_rate": 9.927979345019705e-05, "loss": 3.7742, "step": 850 }, { "epoch": 0.058092132083163474, "grad_norm": 2.490971565246582, "learning_rate": 9.927554694931378e-05, "loss": 3.6874, "step": 855 }, { "epoch": 0.05843185215382525, "grad_norm": 2.4525551795959473, "learning_rate": 9.927130044843049e-05, "loss": 3.3945, "step": 860 }, { "epoch": 0.05877157222448702, "grad_norm": 3.1005003452301025, "learning_rate": 9.926705394754723e-05, "loss": 3.72, "step": 865 }, { "epoch": 0.0591112922951488, "grad_norm": 2.757838010787964, "learning_rate": 9.926280744666395e-05, "loss": 3.7281, "step": 870 }, { "epoch": 0.059451012365810574, "grad_norm": 2.688002109527588, "learning_rate": 9.925856094578067e-05, "loss": 3.4413, "step": 875 }, { "epoch": 0.059790732436472344, "grad_norm": 2.5481104850769043, "learning_rate": 9.925431444489742e-05, "loss": 3.5383, "step": 880 }, { "epoch": 0.06013045250713412, "grad_norm": 2.7239933013916016, "learning_rate": 9.925006794401413e-05, "loss": 3.6688, "step": 885 }, { "epoch": 0.0604701725777959, "grad_norm": 3.6569783687591553, "learning_rate": 9.924582144313086e-05, "loss": 3.8351, "step": 890 }, { "epoch": 0.060809892648457674, "grad_norm": 2.423292636871338, "learning_rate": 9.92415749422476e-05, "loss": 3.8014, "step": 895 }, { "epoch": 0.061149612719119444, "grad_norm": 3.125342845916748, "learning_rate": 9.923732844136431e-05, "loss": 3.7874, "step": 900 }, { "epoch": 0.06148933278978122, "grad_norm": 3.0751779079437256, "learning_rate": 9.923308194048104e-05, "loss": 3.6073, "step": 905 }, { "epoch": 0.061829052860443, "grad_norm": 3.3726134300231934, "learning_rate": 9.922883543959778e-05, "loss": 3.6009, "step": 910 }, { "epoch": 0.06216877293110477, "grad_norm": 2.86334490776062, "learning_rate": 9.92245889387145e-05, "loss": 3.8159, "step": 915 }, { "epoch": 0.06250849300176654, "grad_norm": 2.39286732673645, "learning_rate": 9.922034243783123e-05, "loss": 3.5785, "step": 920 }, { "epoch": 0.06284821307242831, "grad_norm": 2.7859303951263428, "learning_rate": 9.921609593694797e-05, "loss": 3.8303, "step": 925 }, { "epoch": 0.0631879331430901, "grad_norm": 2.4177496433258057, "learning_rate": 9.921184943606468e-05, "loss": 3.7221, "step": 930 }, { "epoch": 0.06352765321375187, "grad_norm": 2.441819190979004, "learning_rate": 9.920760293518142e-05, "loss": 3.875, "step": 935 }, { "epoch": 0.06386737328441364, "grad_norm": 2.359551429748535, "learning_rate": 9.920335643429814e-05, "loss": 3.5418, "step": 940 }, { "epoch": 0.06420709335507542, "grad_norm": 2.5077199935913086, "learning_rate": 9.919910993341487e-05, "loss": 3.6693, "step": 945 }, { "epoch": 0.06454681342573719, "grad_norm": 2.7118682861328125, "learning_rate": 9.919486343253161e-05, "loss": 3.7542, "step": 950 }, { "epoch": 0.06488653349639897, "grad_norm": 3.035710573196411, "learning_rate": 9.919061693164832e-05, "loss": 3.6694, "step": 955 }, { "epoch": 0.06522625356706074, "grad_norm": 3.3193423748016357, "learning_rate": 9.918637043076505e-05, "loss": 3.8316, "step": 960 }, { "epoch": 0.06556597363772251, "grad_norm": 2.4807193279266357, "learning_rate": 9.918212392988179e-05, "loss": 3.6416, "step": 965 }, { "epoch": 0.0659056937083843, "grad_norm": 3.7772791385650635, "learning_rate": 9.91778774289985e-05, "loss": 3.932, "step": 970 }, { "epoch": 0.06624541377904607, "grad_norm": 2.8233423233032227, "learning_rate": 9.917363092811523e-05, "loss": 3.8445, "step": 975 }, { "epoch": 0.06658513384970784, "grad_norm": 2.7697527408599854, "learning_rate": 9.916938442723198e-05, "loss": 3.7474, "step": 980 }, { "epoch": 0.06692485392036962, "grad_norm": 3.04189395904541, "learning_rate": 9.916513792634869e-05, "loss": 3.7973, "step": 985 }, { "epoch": 0.06726457399103139, "grad_norm": 2.437936782836914, "learning_rate": 9.916089142546542e-05, "loss": 3.7282, "step": 990 }, { "epoch": 0.06760429406169316, "grad_norm": 2.2944767475128174, "learning_rate": 9.915664492458216e-05, "loss": 3.6289, "step": 995 }, { "epoch": 0.06794401413235494, "grad_norm": 2.751984119415283, "learning_rate": 9.915239842369887e-05, "loss": 3.7081, "step": 1000 }, { "epoch": 0.06828373420301671, "grad_norm": 2.8478729724884033, "learning_rate": 9.91481519228156e-05, "loss": 3.6821, "step": 1005 }, { "epoch": 0.06862345427367848, "grad_norm": 3.9907171726226807, "learning_rate": 9.914390542193233e-05, "loss": 3.5488, "step": 1010 }, { "epoch": 0.06896317434434027, "grad_norm": 2.771101713180542, "learning_rate": 9.913965892104906e-05, "loss": 3.8093, "step": 1015 }, { "epoch": 0.06930289441500204, "grad_norm": 2.20055890083313, "learning_rate": 9.913541242016579e-05, "loss": 3.5228, "step": 1020 }, { "epoch": 0.0696426144856638, "grad_norm": 2.4106743335723877, "learning_rate": 9.913116591928251e-05, "loss": 3.4087, "step": 1025 }, { "epoch": 0.06998233455632559, "grad_norm": 2.4740729331970215, "learning_rate": 9.912691941839924e-05, "loss": 3.8916, "step": 1030 }, { "epoch": 0.07032205462698736, "grad_norm": 3.0354933738708496, "learning_rate": 9.912267291751597e-05, "loss": 3.5902, "step": 1035 }, { "epoch": 0.07066177469764914, "grad_norm": 2.821380138397217, "learning_rate": 9.91184264166327e-05, "loss": 3.6938, "step": 1040 }, { "epoch": 0.07100149476831091, "grad_norm": 3.0581631660461426, "learning_rate": 9.911417991574943e-05, "loss": 3.7114, "step": 1045 }, { "epoch": 0.07134121483897268, "grad_norm": 2.672722816467285, "learning_rate": 9.910993341486615e-05, "loss": 3.9402, "step": 1050 }, { "epoch": 0.07168093490963447, "grad_norm": 3.6198601722717285, "learning_rate": 9.910568691398288e-05, "loss": 3.4995, "step": 1055 }, { "epoch": 0.07202065498029624, "grad_norm": 2.590298891067505, "learning_rate": 9.910144041309961e-05, "loss": 3.4578, "step": 1060 }, { "epoch": 0.072360375050958, "grad_norm": 2.5233349800109863, "learning_rate": 9.909719391221634e-05, "loss": 3.4242, "step": 1065 }, { "epoch": 0.07270009512161979, "grad_norm": 3.3468916416168213, "learning_rate": 9.909294741133307e-05, "loss": 3.9203, "step": 1070 }, { "epoch": 0.07303981519228156, "grad_norm": 2.8617589473724365, "learning_rate": 9.90887009104498e-05, "loss": 3.4865, "step": 1075 }, { "epoch": 0.07337953526294333, "grad_norm": 3.0948257446289062, "learning_rate": 9.908445440956652e-05, "loss": 3.8441, "step": 1080 }, { "epoch": 0.07371925533360511, "grad_norm": 2.5815205574035645, "learning_rate": 9.908020790868325e-05, "loss": 3.6181, "step": 1085 }, { "epoch": 0.07405897540426688, "grad_norm": 2.946096420288086, "learning_rate": 9.907596140779998e-05, "loss": 3.6933, "step": 1090 }, { "epoch": 0.07439869547492865, "grad_norm": 2.984200954437256, "learning_rate": 9.90717149069167e-05, "loss": 3.7388, "step": 1095 }, { "epoch": 0.07473841554559044, "grad_norm": 2.596780776977539, "learning_rate": 9.906746840603343e-05, "loss": 3.2643, "step": 1100 }, { "epoch": 0.0750781356162522, "grad_norm": 2.4430718421936035, "learning_rate": 9.906322190515016e-05, "loss": 3.9153, "step": 1105 }, { "epoch": 0.07541785568691398, "grad_norm": 3.0134854316711426, "learning_rate": 9.905897540426689e-05, "loss": 3.6416, "step": 1110 }, { "epoch": 0.07575757575757576, "grad_norm": 2.158900260925293, "learning_rate": 9.905472890338362e-05, "loss": 3.6849, "step": 1115 }, { "epoch": 0.07609729582823753, "grad_norm": 2.352266311645508, "learning_rate": 9.905048240250035e-05, "loss": 3.5914, "step": 1120 }, { "epoch": 0.07643701589889931, "grad_norm": 2.2301273345947266, "learning_rate": 9.904623590161707e-05, "loss": 3.7726, "step": 1125 }, { "epoch": 0.07677673596956108, "grad_norm": 2.9615561962127686, "learning_rate": 9.90419894007338e-05, "loss": 3.5834, "step": 1130 }, { "epoch": 0.07711645604022285, "grad_norm": 2.662644624710083, "learning_rate": 9.903774289985053e-05, "loss": 3.6984, "step": 1135 }, { "epoch": 0.07745617611088464, "grad_norm": 3.0979316234588623, "learning_rate": 9.903349639896725e-05, "loss": 3.4165, "step": 1140 }, { "epoch": 0.0777958961815464, "grad_norm": 3.0384013652801514, "learning_rate": 9.902924989808399e-05, "loss": 3.6552, "step": 1145 }, { "epoch": 0.07813561625220818, "grad_norm": 2.941535234451294, "learning_rate": 9.902500339720071e-05, "loss": 3.5969, "step": 1150 }, { "epoch": 0.07847533632286996, "grad_norm": 2.47575306892395, "learning_rate": 9.902075689631743e-05, "loss": 3.6427, "step": 1155 }, { "epoch": 0.07881505639353173, "grad_norm": 2.178506374359131, "learning_rate": 9.901651039543417e-05, "loss": 3.5326, "step": 1160 }, { "epoch": 0.0791547764641935, "grad_norm": 3.017336130142212, "learning_rate": 9.90122638945509e-05, "loss": 3.7905, "step": 1165 }, { "epoch": 0.07949449653485528, "grad_norm": 2.3930509090423584, "learning_rate": 9.900801739366761e-05, "loss": 3.6515, "step": 1170 }, { "epoch": 0.07983421660551705, "grad_norm": 2.8017749786376953, "learning_rate": 9.900377089278435e-05, "loss": 3.5386, "step": 1175 }, { "epoch": 0.08017393667617882, "grad_norm": 2.6371870040893555, "learning_rate": 9.899952439190108e-05, "loss": 4.0497, "step": 1180 }, { "epoch": 0.0805136567468406, "grad_norm": 2.327693462371826, "learning_rate": 9.89952778910178e-05, "loss": 3.6986, "step": 1185 }, { "epoch": 0.08085337681750238, "grad_norm": 4.935518264770508, "learning_rate": 9.899103139013454e-05, "loss": 3.7905, "step": 1190 }, { "epoch": 0.08119309688816416, "grad_norm": 2.404540538787842, "learning_rate": 9.898678488925127e-05, "loss": 3.3917, "step": 1195 }, { "epoch": 0.08153281695882593, "grad_norm": 2.52348256111145, "learning_rate": 9.898253838836798e-05, "loss": 3.7255, "step": 1200 }, { "epoch": 0.0818725370294877, "grad_norm": 3.3984553813934326, "learning_rate": 9.897829188748472e-05, "loss": 3.924, "step": 1205 }, { "epoch": 0.08221225710014948, "grad_norm": 2.7035470008850098, "learning_rate": 9.897404538660144e-05, "loss": 3.4641, "step": 1210 }, { "epoch": 0.08255197717081125, "grad_norm": 2.4212405681610107, "learning_rate": 9.896979888571817e-05, "loss": 3.5081, "step": 1215 }, { "epoch": 0.08289169724147302, "grad_norm": 2.361762285232544, "learning_rate": 9.896555238483491e-05, "loss": 3.601, "step": 1220 }, { "epoch": 0.0832314173121348, "grad_norm": 2.352565288543701, "learning_rate": 9.896130588395162e-05, "loss": 3.5627, "step": 1225 }, { "epoch": 0.08357113738279658, "grad_norm": 2.543168067932129, "learning_rate": 9.895705938306835e-05, "loss": 3.8297, "step": 1230 }, { "epoch": 0.08391085745345835, "grad_norm": 2.7356700897216797, "learning_rate": 9.895281288218509e-05, "loss": 3.5042, "step": 1235 }, { "epoch": 0.08425057752412013, "grad_norm": 2.7182724475860596, "learning_rate": 9.89485663813018e-05, "loss": 3.6767, "step": 1240 }, { "epoch": 0.0845902975947819, "grad_norm": 2.528550624847412, "learning_rate": 9.894431988041853e-05, "loss": 3.4373, "step": 1245 }, { "epoch": 0.08493001766544367, "grad_norm": 4.637968063354492, "learning_rate": 9.894007337953527e-05, "loss": 3.682, "step": 1250 }, { "epoch": 0.08526973773610545, "grad_norm": 2.421348810195923, "learning_rate": 9.893582687865199e-05, "loss": 3.8257, "step": 1255 }, { "epoch": 0.08560945780676722, "grad_norm": 3.2755727767944336, "learning_rate": 9.893158037776872e-05, "loss": 3.4979, "step": 1260 }, { "epoch": 0.08594917787742899, "grad_norm": 2.8488996028900146, "learning_rate": 9.892733387688546e-05, "loss": 3.6051, "step": 1265 }, { "epoch": 0.08628889794809078, "grad_norm": 2.797036647796631, "learning_rate": 9.892308737600217e-05, "loss": 3.615, "step": 1270 }, { "epoch": 0.08662861801875255, "grad_norm": 2.750133514404297, "learning_rate": 9.891884087511891e-05, "loss": 3.6867, "step": 1275 }, { "epoch": 0.08696833808941433, "grad_norm": 2.2481203079223633, "learning_rate": 9.891459437423564e-05, "loss": 3.7979, "step": 1280 }, { "epoch": 0.0873080581600761, "grad_norm": 2.9319496154785156, "learning_rate": 9.891034787335236e-05, "loss": 4.1288, "step": 1285 }, { "epoch": 0.08764777823073787, "grad_norm": 2.4174962043762207, "learning_rate": 9.89061013724691e-05, "loss": 3.6116, "step": 1290 }, { "epoch": 0.08798749830139965, "grad_norm": 2.875229597091675, "learning_rate": 9.890185487158581e-05, "loss": 3.663, "step": 1295 }, { "epoch": 0.08832721837206142, "grad_norm": 2.4175915718078613, "learning_rate": 9.889760837070254e-05, "loss": 3.8735, "step": 1300 }, { "epoch": 0.08866693844272319, "grad_norm": 2.8554160594940186, "learning_rate": 9.889336186981928e-05, "loss": 3.6893, "step": 1305 }, { "epoch": 0.08900665851338498, "grad_norm": 2.5718233585357666, "learning_rate": 9.8889115368936e-05, "loss": 3.7129, "step": 1310 }, { "epoch": 0.08934637858404675, "grad_norm": 2.9183428287506104, "learning_rate": 9.888486886805273e-05, "loss": 3.5695, "step": 1315 }, { "epoch": 0.08968609865470852, "grad_norm": 2.3678832054138184, "learning_rate": 9.888062236716947e-05, "loss": 3.3905, "step": 1320 }, { "epoch": 0.0900258187253703, "grad_norm": 3.0546958446502686, "learning_rate": 9.887637586628618e-05, "loss": 3.4117, "step": 1325 }, { "epoch": 0.09036553879603207, "grad_norm": 2.8776729106903076, "learning_rate": 9.887212936540291e-05, "loss": 3.6729, "step": 1330 }, { "epoch": 0.09070525886669384, "grad_norm": 3.1774954795837402, "learning_rate": 9.886788286451965e-05, "loss": 3.7087, "step": 1335 }, { "epoch": 0.09104497893735562, "grad_norm": 2.4546725749969482, "learning_rate": 9.886363636363637e-05, "loss": 3.8551, "step": 1340 }, { "epoch": 0.09138469900801739, "grad_norm": 2.312204599380493, "learning_rate": 9.88593898627531e-05, "loss": 3.4144, "step": 1345 }, { "epoch": 0.09172441907867916, "grad_norm": 2.9866554737091064, "learning_rate": 9.885514336186984e-05, "loss": 3.5115, "step": 1350 }, { "epoch": 0.09206413914934095, "grad_norm": 2.046361207962036, "learning_rate": 9.885089686098655e-05, "loss": 3.7515, "step": 1355 }, { "epoch": 0.09240385922000272, "grad_norm": 3.2899272441864014, "learning_rate": 9.884665036010328e-05, "loss": 3.5764, "step": 1360 }, { "epoch": 0.0927435792906645, "grad_norm": 2.3010950088500977, "learning_rate": 9.884240385922e-05, "loss": 3.6661, "step": 1365 }, { "epoch": 0.09308329936132627, "grad_norm": 2.771374464035034, "learning_rate": 9.883815735833673e-05, "loss": 3.7498, "step": 1370 }, { "epoch": 0.09342301943198804, "grad_norm": 2.3871958255767822, "learning_rate": 9.883391085745346e-05, "loss": 3.7738, "step": 1375 }, { "epoch": 0.09376273950264982, "grad_norm": 2.7825450897216797, "learning_rate": 9.882966435657019e-05, "loss": 3.5485, "step": 1380 }, { "epoch": 0.09410245957331159, "grad_norm": 3.3543453216552734, "learning_rate": 9.882541785568692e-05, "loss": 3.6235, "step": 1385 }, { "epoch": 0.09444217964397336, "grad_norm": 2.853057622909546, "learning_rate": 9.882117135480365e-05, "loss": 3.6737, "step": 1390 }, { "epoch": 0.09478189971463515, "grad_norm": 3.0069384574890137, "learning_rate": 9.881692485392037e-05, "loss": 3.644, "step": 1395 }, { "epoch": 0.09512161978529692, "grad_norm": 2.6202545166015625, "learning_rate": 9.88126783530371e-05, "loss": 3.6517, "step": 1400 }, { "epoch": 0.09546133985595869, "grad_norm": 6.95131254196167, "learning_rate": 9.880843185215383e-05, "loss": 3.7388, "step": 1405 }, { "epoch": 0.09580105992662047, "grad_norm": 3.5970633029937744, "learning_rate": 9.880418535127056e-05, "loss": 3.8082, "step": 1410 }, { "epoch": 0.09614077999728224, "grad_norm": 11.684786796569824, "learning_rate": 9.879993885038729e-05, "loss": 3.7492, "step": 1415 }, { "epoch": 0.09648050006794401, "grad_norm": 3.0684146881103516, "learning_rate": 9.879569234950401e-05, "loss": 3.8429, "step": 1420 }, { "epoch": 0.09682022013860579, "grad_norm": 2.6395862102508545, "learning_rate": 9.87922951487974e-05, "loss": 3.7778, "step": 1425 }, { "epoch": 0.09715994020926756, "grad_norm": 3.00087308883667, "learning_rate": 9.878804864791412e-05, "loss": 3.5962, "step": 1430 }, { "epoch": 0.09749966027992933, "grad_norm": 2.7398698329925537, "learning_rate": 9.878380214703085e-05, "loss": 3.5594, "step": 1435 }, { "epoch": 0.09783938035059112, "grad_norm": 3.682832717895508, "learning_rate": 9.877955564614758e-05, "loss": 3.784, "step": 1440 }, { "epoch": 0.09817910042125289, "grad_norm": 2.15097975730896, "learning_rate": 9.877530914526431e-05, "loss": 3.5341, "step": 1445 }, { "epoch": 0.09851882049191467, "grad_norm": 2.5064339637756348, "learning_rate": 9.877106264438104e-05, "loss": 3.6966, "step": 1450 }, { "epoch": 0.09885854056257644, "grad_norm": 2.165351629257202, "learning_rate": 9.876681614349776e-05, "loss": 3.7644, "step": 1455 }, { "epoch": 0.09919826063323821, "grad_norm": 2.6611499786376953, "learning_rate": 9.876256964261449e-05, "loss": 3.7581, "step": 1460 }, { "epoch": 0.09953798070389999, "grad_norm": 4.449843883514404, "learning_rate": 9.87583231417312e-05, "loss": 3.6901, "step": 1465 }, { "epoch": 0.09987770077456176, "grad_norm": 3.5374207496643066, "learning_rate": 9.875407664084795e-05, "loss": 3.816, "step": 1470 }, { "epoch": 0.10021742084522353, "grad_norm": 2.6360864639282227, "learning_rate": 9.874983013996468e-05, "loss": 3.3863, "step": 1475 }, { "epoch": 0.10055714091588532, "grad_norm": 3.614423990249634, "learning_rate": 9.87455836390814e-05, "loss": 3.6914, "step": 1480 }, { "epoch": 0.10089686098654709, "grad_norm": 2.6101832389831543, "learning_rate": 9.874133713819813e-05, "loss": 3.4576, "step": 1485 }, { "epoch": 0.10123658105720885, "grad_norm": 2.3915576934814453, "learning_rate": 9.873709063731486e-05, "loss": 3.5092, "step": 1490 }, { "epoch": 0.10157630112787064, "grad_norm": 2.73832368850708, "learning_rate": 9.873284413643159e-05, "loss": 3.564, "step": 1495 }, { "epoch": 0.10191602119853241, "grad_norm": 2.4439172744750977, "learning_rate": 9.872859763554832e-05, "loss": 3.7701, "step": 1500 }, { "epoch": 0.10225574126919418, "grad_norm": 2.556349754333496, "learning_rate": 9.872435113466504e-05, "loss": 3.6641, "step": 1505 }, { "epoch": 0.10259546133985596, "grad_norm": 2.692896604537964, "learning_rate": 9.872010463378177e-05, "loss": 3.5379, "step": 1510 }, { "epoch": 0.10293518141051773, "grad_norm": 2.2422587871551514, "learning_rate": 9.87158581328985e-05, "loss": 3.5895, "step": 1515 }, { "epoch": 0.1032749014811795, "grad_norm": 2.542940855026245, "learning_rate": 9.871161163201523e-05, "loss": 3.545, "step": 1520 }, { "epoch": 0.10361462155184128, "grad_norm": 8.475870132446289, "learning_rate": 9.870736513113196e-05, "loss": 3.7558, "step": 1525 }, { "epoch": 0.10395434162250305, "grad_norm": 2.605943441390991, "learning_rate": 9.870311863024868e-05, "loss": 3.698, "step": 1530 }, { "epoch": 0.10429406169316484, "grad_norm": 2.659353733062744, "learning_rate": 9.86988721293654e-05, "loss": 3.879, "step": 1535 }, { "epoch": 0.10463378176382661, "grad_norm": 2.6726794242858887, "learning_rate": 9.869462562848214e-05, "loss": 3.7048, "step": 1540 }, { "epoch": 0.10497350183448838, "grad_norm": 2.2029128074645996, "learning_rate": 9.869037912759887e-05, "loss": 3.6834, "step": 1545 }, { "epoch": 0.10531322190515016, "grad_norm": 2.46644926071167, "learning_rate": 9.868613262671558e-05, "loss": 3.6618, "step": 1550 }, { "epoch": 0.10565294197581193, "grad_norm": 2.5602245330810547, "learning_rate": 9.868188612583232e-05, "loss": 3.5211, "step": 1555 }, { "epoch": 0.1059926620464737, "grad_norm": 2.3584413528442383, "learning_rate": 9.867763962494905e-05, "loss": 3.6223, "step": 1560 }, { "epoch": 0.10633238211713548, "grad_norm": 3.620461940765381, "learning_rate": 9.867339312406577e-05, "loss": 3.7434, "step": 1565 }, { "epoch": 0.10667210218779725, "grad_norm": 2.8504605293273926, "learning_rate": 9.866914662318251e-05, "loss": 3.7301, "step": 1570 }, { "epoch": 0.10701182225845902, "grad_norm": 3.402848720550537, "learning_rate": 9.866490012229924e-05, "loss": 3.9087, "step": 1575 }, { "epoch": 0.10735154232912081, "grad_norm": 2.317671060562134, "learning_rate": 9.866065362141595e-05, "loss": 3.7321, "step": 1580 }, { "epoch": 0.10769126239978258, "grad_norm": 2.246663808822632, "learning_rate": 9.865640712053269e-05, "loss": 3.5694, "step": 1585 }, { "epoch": 0.10803098247044435, "grad_norm": 2.606186866760254, "learning_rate": 9.865216061964942e-05, "loss": 3.5645, "step": 1590 }, { "epoch": 0.10837070254110613, "grad_norm": 2.2364070415496826, "learning_rate": 9.864791411876613e-05, "loss": 3.8505, "step": 1595 }, { "epoch": 0.1087104226117679, "grad_norm": 1.8168416023254395, "learning_rate": 9.864366761788288e-05, "loss": 3.7063, "step": 1600 }, { "epoch": 0.10905014268242967, "grad_norm": 2.6164488792419434, "learning_rate": 9.863942111699959e-05, "loss": 3.6088, "step": 1605 }, { "epoch": 0.10938986275309145, "grad_norm": 3.5399606227874756, "learning_rate": 9.863517461611632e-05, "loss": 3.7959, "step": 1610 }, { "epoch": 0.10972958282375322, "grad_norm": 2.352106809616089, "learning_rate": 9.863092811523306e-05, "loss": 3.6098, "step": 1615 }, { "epoch": 0.11006930289441501, "grad_norm": 2.274197816848755, "learning_rate": 9.862668161434977e-05, "loss": 3.5673, "step": 1620 }, { "epoch": 0.11040902296507678, "grad_norm": 2.0550856590270996, "learning_rate": 9.86224351134665e-05, "loss": 3.5434, "step": 1625 }, { "epoch": 0.11074874303573855, "grad_norm": 2.5609278678894043, "learning_rate": 9.861818861258324e-05, "loss": 3.7825, "step": 1630 }, { "epoch": 0.11108846310640033, "grad_norm": 2.473008871078491, "learning_rate": 9.861394211169996e-05, "loss": 3.6442, "step": 1635 }, { "epoch": 0.1114281831770621, "grad_norm": 2.609839677810669, "learning_rate": 9.860969561081669e-05, "loss": 3.7895, "step": 1640 }, { "epoch": 0.11176790324772387, "grad_norm": 2.270355701446533, "learning_rate": 9.860544910993343e-05, "loss": 3.6965, "step": 1645 }, { "epoch": 0.11210762331838565, "grad_norm": 8.605286598205566, "learning_rate": 9.860120260905014e-05, "loss": 3.5205, "step": 1650 }, { "epoch": 0.11244734338904742, "grad_norm": 2.727208137512207, "learning_rate": 9.859695610816687e-05, "loss": 3.7554, "step": 1655 }, { "epoch": 0.1127870634597092, "grad_norm": 2.6802847385406494, "learning_rate": 9.859270960728361e-05, "loss": 3.6455, "step": 1660 }, { "epoch": 0.11312678353037098, "grad_norm": 3.5274956226348877, "learning_rate": 9.858846310640033e-05, "loss": 3.6387, "step": 1665 }, { "epoch": 0.11346650360103275, "grad_norm": 2.9241151809692383, "learning_rate": 9.858421660551705e-05, "loss": 3.6712, "step": 1670 }, { "epoch": 0.11380622367169452, "grad_norm": 2.2806825637817383, "learning_rate": 9.85799701046338e-05, "loss": 3.7348, "step": 1675 }, { "epoch": 0.1141459437423563, "grad_norm": 2.5223445892333984, "learning_rate": 9.857572360375051e-05, "loss": 3.5781, "step": 1680 }, { "epoch": 0.11448566381301807, "grad_norm": 2.367621660232544, "learning_rate": 9.857147710286724e-05, "loss": 3.5701, "step": 1685 }, { "epoch": 0.11482538388367985, "grad_norm": 2.555980682373047, "learning_rate": 9.856723060198397e-05, "loss": 3.3252, "step": 1690 }, { "epoch": 0.11516510395434162, "grad_norm": 2.4443697929382324, "learning_rate": 9.85629841011007e-05, "loss": 3.5512, "step": 1695 }, { "epoch": 0.1155048240250034, "grad_norm": 2.950860023498535, "learning_rate": 9.855873760021742e-05, "loss": 3.7208, "step": 1700 }, { "epoch": 0.11584454409566518, "grad_norm": 2.127361297607422, "learning_rate": 9.855449109933415e-05, "loss": 3.3163, "step": 1705 }, { "epoch": 0.11618426416632695, "grad_norm": 2.0718114376068115, "learning_rate": 9.855024459845088e-05, "loss": 3.8084, "step": 1710 }, { "epoch": 0.11652398423698872, "grad_norm": 2.4193952083587646, "learning_rate": 9.854599809756761e-05, "loss": 3.2907, "step": 1715 }, { "epoch": 0.1168637043076505, "grad_norm": 2.3924756050109863, "learning_rate": 9.854175159668433e-05, "loss": 3.7214, "step": 1720 }, { "epoch": 0.11720342437831227, "grad_norm": 3.228001117706299, "learning_rate": 9.853750509580106e-05, "loss": 3.4578, "step": 1725 }, { "epoch": 0.11754314444897404, "grad_norm": 2.3939614295959473, "learning_rate": 9.853325859491779e-05, "loss": 3.5184, "step": 1730 }, { "epoch": 0.11788286451963582, "grad_norm": 4.840658187866211, "learning_rate": 9.852901209403452e-05, "loss": 3.5232, "step": 1735 }, { "epoch": 0.1182225845902976, "grad_norm": 2.6658291816711426, "learning_rate": 9.852476559315125e-05, "loss": 3.7119, "step": 1740 }, { "epoch": 0.11856230466095936, "grad_norm": 3.069031238555908, "learning_rate": 9.852051909226797e-05, "loss": 3.8233, "step": 1745 }, { "epoch": 0.11890202473162115, "grad_norm": 2.8473379611968994, "learning_rate": 9.85162725913847e-05, "loss": 3.6721, "step": 1750 }, { "epoch": 0.11924174480228292, "grad_norm": 2.682267427444458, "learning_rate": 9.851202609050143e-05, "loss": 3.7382, "step": 1755 }, { "epoch": 0.11958146487294469, "grad_norm": 2.3937129974365234, "learning_rate": 9.850777958961816e-05, "loss": 3.7596, "step": 1760 }, { "epoch": 0.11992118494360647, "grad_norm": 2.733515739440918, "learning_rate": 9.850353308873489e-05, "loss": 3.6185, "step": 1765 }, { "epoch": 0.12026090501426824, "grad_norm": 2.3822734355926514, "learning_rate": 9.849928658785161e-05, "loss": 3.7106, "step": 1770 }, { "epoch": 0.12060062508493002, "grad_norm": 2.527878999710083, "learning_rate": 9.849504008696834e-05, "loss": 3.6467, "step": 1775 }, { "epoch": 0.1209403451555918, "grad_norm": 2.95981502532959, "learning_rate": 9.849079358608507e-05, "loss": 3.7278, "step": 1780 }, { "epoch": 0.12128006522625356, "grad_norm": 2.6949033737182617, "learning_rate": 9.84865470852018e-05, "loss": 3.8295, "step": 1785 }, { "epoch": 0.12161978529691535, "grad_norm": 2.6312930583953857, "learning_rate": 9.848230058431853e-05, "loss": 3.7378, "step": 1790 }, { "epoch": 0.12195950536757712, "grad_norm": 2.5927481651306152, "learning_rate": 9.847805408343525e-05, "loss": 3.457, "step": 1795 }, { "epoch": 0.12229922543823889, "grad_norm": 2.448457717895508, "learning_rate": 9.847380758255198e-05, "loss": 3.45, "step": 1800 }, { "epoch": 0.12263894550890067, "grad_norm": 4.1560821533203125, "learning_rate": 9.84695610816687e-05, "loss": 3.6898, "step": 1805 }, { "epoch": 0.12297866557956244, "grad_norm": 2.262101888656616, "learning_rate": 9.846531458078544e-05, "loss": 3.6114, "step": 1810 }, { "epoch": 0.12331838565022421, "grad_norm": 2.7449049949645996, "learning_rate": 9.846106807990217e-05, "loss": 3.5611, "step": 1815 }, { "epoch": 0.123658105720886, "grad_norm": 3.1933770179748535, "learning_rate": 9.84568215790189e-05, "loss": 3.59, "step": 1820 }, { "epoch": 0.12399782579154776, "grad_norm": 2.600872755050659, "learning_rate": 9.845257507813562e-05, "loss": 3.532, "step": 1825 }, { "epoch": 0.12433754586220953, "grad_norm": 2.23490571975708, "learning_rate": 9.844832857725235e-05, "loss": 3.667, "step": 1830 }, { "epoch": 0.12467726593287132, "grad_norm": 2.7708678245544434, "learning_rate": 9.844408207636908e-05, "loss": 3.7112, "step": 1835 }, { "epoch": 0.1250169860035331, "grad_norm": 8.08360481262207, "learning_rate": 9.843983557548581e-05, "loss": 3.7553, "step": 1840 }, { "epoch": 0.12535670607419486, "grad_norm": 3.5631556510925293, "learning_rate": 9.843558907460254e-05, "loss": 3.5741, "step": 1845 }, { "epoch": 0.12569642614485663, "grad_norm": 3.6369643211364746, "learning_rate": 9.843134257371926e-05, "loss": 3.7011, "step": 1850 }, { "epoch": 0.12603614621551842, "grad_norm": 2.509939432144165, "learning_rate": 9.842709607283599e-05, "loss": 3.7049, "step": 1855 }, { "epoch": 0.1263758662861802, "grad_norm": 2.6086642742156982, "learning_rate": 9.842284957195272e-05, "loss": 3.6444, "step": 1860 }, { "epoch": 0.12671558635684196, "grad_norm": 2.4885952472686768, "learning_rate": 9.841860307106945e-05, "loss": 3.3782, "step": 1865 }, { "epoch": 0.12705530642750373, "grad_norm": 2.3255302906036377, "learning_rate": 9.841435657018618e-05, "loss": 3.8069, "step": 1870 }, { "epoch": 0.1273950264981655, "grad_norm": 2.29691219329834, "learning_rate": 9.84101100693029e-05, "loss": 3.5613, "step": 1875 }, { "epoch": 0.12773474656882727, "grad_norm": 5.276554584503174, "learning_rate": 9.840586356841963e-05, "loss": 3.6252, "step": 1880 }, { "epoch": 0.12807446663948907, "grad_norm": 2.1562600135803223, "learning_rate": 9.840161706753636e-05, "loss": 3.7605, "step": 1885 }, { "epoch": 0.12841418671015084, "grad_norm": 2.6067538261413574, "learning_rate": 9.839737056665307e-05, "loss": 3.1914, "step": 1890 }, { "epoch": 0.1287539067808126, "grad_norm": 2.827772617340088, "learning_rate": 9.839312406576982e-05, "loss": 3.6718, "step": 1895 }, { "epoch": 0.12909362685147438, "grad_norm": 2.7478749752044678, "learning_rate": 9.838887756488654e-05, "loss": 3.6127, "step": 1900 }, { "epoch": 0.12943334692213615, "grad_norm": 2.3657724857330322, "learning_rate": 9.838463106400326e-05, "loss": 3.4427, "step": 1905 }, { "epoch": 0.12977306699279795, "grad_norm": 2.349724292755127, "learning_rate": 9.838038456312e-05, "loss": 3.6335, "step": 1910 }, { "epoch": 0.13011278706345972, "grad_norm": 3.6742427349090576, "learning_rate": 9.837613806223673e-05, "loss": 3.7661, "step": 1915 }, { "epoch": 0.1304525071341215, "grad_norm": 2.6719472408294678, "learning_rate": 9.837189156135344e-05, "loss": 3.3573, "step": 1920 }, { "epoch": 0.13079222720478326, "grad_norm": 2.593247652053833, "learning_rate": 9.836764506047018e-05, "loss": 3.7031, "step": 1925 }, { "epoch": 0.13113194727544503, "grad_norm": 2.6783385276794434, "learning_rate": 9.836339855958691e-05, "loss": 3.6197, "step": 1930 }, { "epoch": 0.1314716673461068, "grad_norm": 2.8472886085510254, "learning_rate": 9.835915205870363e-05, "loss": 3.6187, "step": 1935 }, { "epoch": 0.1318113874167686, "grad_norm": 2.1278092861175537, "learning_rate": 9.835490555782037e-05, "loss": 3.209, "step": 1940 }, { "epoch": 0.13215110748743036, "grad_norm": 3.7434868812561035, "learning_rate": 9.83506590569371e-05, "loss": 3.505, "step": 1945 }, { "epoch": 0.13249082755809213, "grad_norm": 2.496720314025879, "learning_rate": 9.834641255605381e-05, "loss": 3.8107, "step": 1950 }, { "epoch": 0.1328305476287539, "grad_norm": 2.278834342956543, "learning_rate": 9.834216605517055e-05, "loss": 3.6206, "step": 1955 }, { "epoch": 0.13317026769941567, "grad_norm": 2.7811431884765625, "learning_rate": 9.833791955428727e-05, "loss": 3.6232, "step": 1960 }, { "epoch": 0.13350998777007744, "grad_norm": 2.345259428024292, "learning_rate": 9.8333673053404e-05, "loss": 3.3393, "step": 1965 }, { "epoch": 0.13384970784073924, "grad_norm": 2.5706770420074463, "learning_rate": 9.832942655252074e-05, "loss": 3.5282, "step": 1970 }, { "epoch": 0.134189427911401, "grad_norm": 2.311833143234253, "learning_rate": 9.832518005163745e-05, "loss": 3.83, "step": 1975 }, { "epoch": 0.13452914798206278, "grad_norm": 2.6815695762634277, "learning_rate": 9.832093355075418e-05, "loss": 3.5066, "step": 1980 }, { "epoch": 0.13486886805272455, "grad_norm": 2.2054522037506104, "learning_rate": 9.831668704987092e-05, "loss": 3.704, "step": 1985 }, { "epoch": 0.13520858812338632, "grad_norm": 3.077894687652588, "learning_rate": 9.831244054898763e-05, "loss": 3.5649, "step": 1990 }, { "epoch": 0.13554830819404812, "grad_norm": 4.214877128601074, "learning_rate": 9.830819404810436e-05, "loss": 3.6439, "step": 1995 }, { "epoch": 0.1358880282647099, "grad_norm": 2.8064329624176025, "learning_rate": 9.83039475472211e-05, "loss": 3.6047, "step": 2000 }, { "epoch": 0.13622774833537166, "grad_norm": 2.8148748874664307, "learning_rate": 9.829970104633782e-05, "loss": 3.4838, "step": 2005 }, { "epoch": 0.13656746840603343, "grad_norm": 2.6066055297851562, "learning_rate": 9.829545454545455e-05, "loss": 3.9006, "step": 2010 }, { "epoch": 0.1369071884766952, "grad_norm": 1.8680016994476318, "learning_rate": 9.829120804457129e-05, "loss": 3.6725, "step": 2015 }, { "epoch": 0.13724690854735697, "grad_norm": 2.4525768756866455, "learning_rate": 9.8286961543688e-05, "loss": 3.8145, "step": 2020 }, { "epoch": 0.13758662861801876, "grad_norm": 2.4238014221191406, "learning_rate": 9.828271504280473e-05, "loss": 3.4119, "step": 2025 }, { "epoch": 0.13792634868868053, "grad_norm": 3.079613208770752, "learning_rate": 9.827846854192146e-05, "loss": 3.5258, "step": 2030 }, { "epoch": 0.1382660687593423, "grad_norm": 2.3786492347717285, "learning_rate": 9.827422204103819e-05, "loss": 3.7729, "step": 2035 }, { "epoch": 0.13860578883000407, "grad_norm": 2.0804662704467773, "learning_rate": 9.826997554015491e-05, "loss": 3.7198, "step": 2040 }, { "epoch": 0.13894550890066584, "grad_norm": 2.224865436553955, "learning_rate": 9.826572903927164e-05, "loss": 3.7894, "step": 2045 }, { "epoch": 0.1392852289713276, "grad_norm": 2.318671464920044, "learning_rate": 9.826148253838837e-05, "loss": 3.5705, "step": 2050 }, { "epoch": 0.1396249490419894, "grad_norm": 2.839855194091797, "learning_rate": 9.82572360375051e-05, "loss": 3.6663, "step": 2055 }, { "epoch": 0.13996466911265118, "grad_norm": 2.529090404510498, "learning_rate": 9.825298953662183e-05, "loss": 3.7397, "step": 2060 }, { "epoch": 0.14030438918331295, "grad_norm": 2.648512601852417, "learning_rate": 9.824874303573855e-05, "loss": 3.8091, "step": 2065 }, { "epoch": 0.14064410925397472, "grad_norm": 2.4608304500579834, "learning_rate": 9.824449653485528e-05, "loss": 3.5435, "step": 2070 }, { "epoch": 0.1409838293246365, "grad_norm": 2.5319597721099854, "learning_rate": 9.824025003397201e-05, "loss": 3.4596, "step": 2075 }, { "epoch": 0.1413235493952983, "grad_norm": 1.8423856496810913, "learning_rate": 9.823600353308874e-05, "loss": 3.7239, "step": 2080 }, { "epoch": 0.14166326946596006, "grad_norm": 2.655466079711914, "learning_rate": 9.823175703220547e-05, "loss": 3.7743, "step": 2085 }, { "epoch": 0.14200298953662183, "grad_norm": 2.5335872173309326, "learning_rate": 9.82275105313222e-05, "loss": 3.6003, "step": 2090 }, { "epoch": 0.1423427096072836, "grad_norm": 2.2621281147003174, "learning_rate": 9.822326403043892e-05, "loss": 3.6408, "step": 2095 }, { "epoch": 0.14268242967794537, "grad_norm": 2.2347781658172607, "learning_rate": 9.821901752955565e-05, "loss": 3.5515, "step": 2100 }, { "epoch": 0.14302214974860714, "grad_norm": 2.8384768962860107, "learning_rate": 9.821477102867238e-05, "loss": 3.6728, "step": 2105 }, { "epoch": 0.14336186981926893, "grad_norm": 2.3546082973480225, "learning_rate": 9.82105245277891e-05, "loss": 3.7157, "step": 2110 }, { "epoch": 0.1437015898899307, "grad_norm": 2.647495746612549, "learning_rate": 9.820627802690583e-05, "loss": 3.2504, "step": 2115 }, { "epoch": 0.14404130996059247, "grad_norm": 2.484992504119873, "learning_rate": 9.820203152602256e-05, "loss": 3.7337, "step": 2120 }, { "epoch": 0.14438103003125424, "grad_norm": 2.94370436668396, "learning_rate": 9.819778502513929e-05, "loss": 3.523, "step": 2125 }, { "epoch": 0.144720750101916, "grad_norm": 2.8493423461914062, "learning_rate": 9.819353852425602e-05, "loss": 3.5636, "step": 2130 }, { "epoch": 0.14506047017257778, "grad_norm": 2.9806811809539795, "learning_rate": 9.818929202337275e-05, "loss": 3.5644, "step": 2135 }, { "epoch": 0.14540019024323958, "grad_norm": 2.0938262939453125, "learning_rate": 9.818504552248947e-05, "loss": 3.5947, "step": 2140 }, { "epoch": 0.14573991031390135, "grad_norm": 12.675642967224121, "learning_rate": 9.81807990216062e-05, "loss": 3.6706, "step": 2145 }, { "epoch": 0.14607963038456312, "grad_norm": 2.495978355407715, "learning_rate": 9.817655252072293e-05, "loss": 3.47, "step": 2150 }, { "epoch": 0.1464193504552249, "grad_norm": 2.372793436050415, "learning_rate": 9.817230601983966e-05, "loss": 4.0237, "step": 2155 }, { "epoch": 0.14675907052588666, "grad_norm": 2.2783010005950928, "learning_rate": 9.816805951895639e-05, "loss": 3.6148, "step": 2160 }, { "epoch": 0.14709879059654846, "grad_norm": 2.8050243854522705, "learning_rate": 9.816381301807311e-05, "loss": 3.5389, "step": 2165 }, { "epoch": 0.14743851066721023, "grad_norm": 2.6023781299591064, "learning_rate": 9.815956651718984e-05, "loss": 3.4244, "step": 2170 }, { "epoch": 0.147778230737872, "grad_norm": 2.3167836666107178, "learning_rate": 9.815532001630657e-05, "loss": 3.7994, "step": 2175 }, { "epoch": 0.14811795080853377, "grad_norm": 2.3294200897216797, "learning_rate": 9.81510735154233e-05, "loss": 3.5152, "step": 2180 }, { "epoch": 0.14845767087919554, "grad_norm": 2.136742115020752, "learning_rate": 9.814682701454003e-05, "loss": 3.653, "step": 2185 }, { "epoch": 0.1487973909498573, "grad_norm": 2.252354860305786, "learning_rate": 9.814258051365675e-05, "loss": 3.6066, "step": 2190 }, { "epoch": 0.1491371110205191, "grad_norm": 2.5286755561828613, "learning_rate": 9.813833401277348e-05, "loss": 3.5312, "step": 2195 }, { "epoch": 0.14947683109118087, "grad_norm": 2.4802448749542236, "learning_rate": 9.813408751189021e-05, "loss": 3.6054, "step": 2200 }, { "epoch": 0.14981655116184264, "grad_norm": 2.19534969329834, "learning_rate": 9.812984101100694e-05, "loss": 3.5825, "step": 2205 }, { "epoch": 0.1501562712325044, "grad_norm": 2.286517858505249, "learning_rate": 9.812559451012367e-05, "loss": 3.7755, "step": 2210 }, { "epoch": 0.15049599130316618, "grad_norm": 2.3610992431640625, "learning_rate": 9.81213480092404e-05, "loss": 3.5939, "step": 2215 }, { "epoch": 0.15083571137382795, "grad_norm": 2.321992874145508, "learning_rate": 9.811710150835712e-05, "loss": 3.707, "step": 2220 }, { "epoch": 0.15117543144448975, "grad_norm": 2.371002435684204, "learning_rate": 9.811285500747385e-05, "loss": 3.7992, "step": 2225 }, { "epoch": 0.15151515151515152, "grad_norm": 2.4403250217437744, "learning_rate": 9.810860850659058e-05, "loss": 3.4019, "step": 2230 }, { "epoch": 0.1518548715858133, "grad_norm": 2.438884735107422, "learning_rate": 9.81043620057073e-05, "loss": 3.8167, "step": 2235 }, { "epoch": 0.15219459165647506, "grad_norm": 2.1409878730773926, "learning_rate": 9.810011550482403e-05, "loss": 3.7638, "step": 2240 }, { "epoch": 0.15253431172713683, "grad_norm": 1.9637494087219238, "learning_rate": 9.809586900394075e-05, "loss": 3.7475, "step": 2245 }, { "epoch": 0.15287403179779863, "grad_norm": 2.9680416584014893, "learning_rate": 9.809162250305749e-05, "loss": 3.3943, "step": 2250 }, { "epoch": 0.1532137518684604, "grad_norm": 2.462663412094116, "learning_rate": 9.808737600217422e-05, "loss": 3.5647, "step": 2255 }, { "epoch": 0.15355347193912217, "grad_norm": 2.943225145339966, "learning_rate": 9.808312950129093e-05, "loss": 3.8152, "step": 2260 }, { "epoch": 0.15389319200978394, "grad_norm": 2.1768798828125, "learning_rate": 9.807888300040767e-05, "loss": 3.6182, "step": 2265 }, { "epoch": 0.1542329120804457, "grad_norm": 2.3816909790039062, "learning_rate": 9.80746364995244e-05, "loss": 3.701, "step": 2270 }, { "epoch": 0.15457263215110748, "grad_norm": 3.084084987640381, "learning_rate": 9.807038999864112e-05, "loss": 3.4905, "step": 2275 }, { "epoch": 0.15491235222176927, "grad_norm": 2.858288049697876, "learning_rate": 9.806614349775786e-05, "loss": 3.4518, "step": 2280 }, { "epoch": 0.15525207229243104, "grad_norm": 2.808328866958618, "learning_rate": 9.806189699687459e-05, "loss": 3.5431, "step": 2285 }, { "epoch": 0.1555917923630928, "grad_norm": 2.4387338161468506, "learning_rate": 9.80576504959913e-05, "loss": 3.5698, "step": 2290 }, { "epoch": 0.15593151243375458, "grad_norm": 3.9449453353881836, "learning_rate": 9.805340399510804e-05, "loss": 3.5102, "step": 2295 }, { "epoch": 0.15627123250441635, "grad_norm": 3.0978400707244873, "learning_rate": 9.804915749422477e-05, "loss": 3.6287, "step": 2300 }, { "epoch": 0.15661095257507815, "grad_norm": 2.203216552734375, "learning_rate": 9.804491099334148e-05, "loss": 3.7373, "step": 2305 }, { "epoch": 0.15695067264573992, "grad_norm": 2.424668312072754, "learning_rate": 9.804066449245823e-05, "loss": 3.4089, "step": 2310 }, { "epoch": 0.1572903927164017, "grad_norm": 2.095552921295166, "learning_rate": 9.803641799157494e-05, "loss": 3.6822, "step": 2315 }, { "epoch": 0.15763011278706346, "grad_norm": 2.480710983276367, "learning_rate": 9.803217149069167e-05, "loss": 3.6402, "step": 2320 }, { "epoch": 0.15796983285772523, "grad_norm": 2.6255786418914795, "learning_rate": 9.802792498980841e-05, "loss": 3.8064, "step": 2325 }, { "epoch": 0.158309552928387, "grad_norm": 2.1788408756256104, "learning_rate": 9.802367848892512e-05, "loss": 3.5313, "step": 2330 }, { "epoch": 0.1586492729990488, "grad_norm": 2.2660152912139893, "learning_rate": 9.801943198804185e-05, "loss": 3.6233, "step": 2335 }, { "epoch": 0.15898899306971057, "grad_norm": 1.9752708673477173, "learning_rate": 9.80151854871586e-05, "loss": 3.6753, "step": 2340 }, { "epoch": 0.15932871314037234, "grad_norm": 2.174795150756836, "learning_rate": 9.801093898627531e-05, "loss": 3.6857, "step": 2345 }, { "epoch": 0.1596684332110341, "grad_norm": 2.283968448638916, "learning_rate": 9.800669248539204e-05, "loss": 3.8543, "step": 2350 }, { "epoch": 0.16000815328169588, "grad_norm": 2.2027220726013184, "learning_rate": 9.800244598450878e-05, "loss": 3.6323, "step": 2355 }, { "epoch": 0.16034787335235764, "grad_norm": 2.3527281284332275, "learning_rate": 9.799819948362549e-05, "loss": 3.6871, "step": 2360 }, { "epoch": 0.16068759342301944, "grad_norm": 2.2693097591400146, "learning_rate": 9.799395298274222e-05, "loss": 3.4205, "step": 2365 }, { "epoch": 0.1610273134936812, "grad_norm": 2.471632242202759, "learning_rate": 9.798970648185896e-05, "loss": 3.503, "step": 2370 }, { "epoch": 0.16136703356434298, "grad_norm": 2.77603816986084, "learning_rate": 9.798545998097568e-05, "loss": 3.6174, "step": 2375 }, { "epoch": 0.16170675363500475, "grad_norm": 2.966644048690796, "learning_rate": 9.79812134800924e-05, "loss": 3.408, "step": 2380 }, { "epoch": 0.16204647370566652, "grad_norm": 2.5603301525115967, "learning_rate": 9.797696697920913e-05, "loss": 3.5237, "step": 2385 }, { "epoch": 0.16238619377632832, "grad_norm": 2.822503089904785, "learning_rate": 9.797272047832586e-05, "loss": 3.6871, "step": 2390 }, { "epoch": 0.1627259138469901, "grad_norm": 3.1340339183807373, "learning_rate": 9.796847397744259e-05, "loss": 3.4174, "step": 2395 }, { "epoch": 0.16306563391765186, "grad_norm": 2.0358223915100098, "learning_rate": 9.796422747655932e-05, "loss": 3.6311, "step": 2400 }, { "epoch": 0.16340535398831363, "grad_norm": 2.0518910884857178, "learning_rate": 9.795998097567604e-05, "loss": 3.3313, "step": 2405 }, { "epoch": 0.1637450740589754, "grad_norm": 2.1445281505584717, "learning_rate": 9.795573447479277e-05, "loss": 3.7508, "step": 2410 }, { "epoch": 0.16408479412963717, "grad_norm": 2.2487223148345947, "learning_rate": 9.79514879739095e-05, "loss": 3.8102, "step": 2415 }, { "epoch": 0.16442451420029897, "grad_norm": 2.8430604934692383, "learning_rate": 9.794724147302623e-05, "loss": 3.591, "step": 2420 }, { "epoch": 0.16476423427096074, "grad_norm": 2.6248815059661865, "learning_rate": 9.794299497214296e-05, "loss": 3.6117, "step": 2425 }, { "epoch": 0.1651039543416225, "grad_norm": 2.956120729446411, "learning_rate": 9.793874847125968e-05, "loss": 3.6201, "step": 2430 }, { "epoch": 0.16544367441228428, "grad_norm": 2.6630165576934814, "learning_rate": 9.793450197037641e-05, "loss": 3.7398, "step": 2435 }, { "epoch": 0.16578339448294604, "grad_norm": 2.3322465419769287, "learning_rate": 9.793025546949314e-05, "loss": 3.6164, "step": 2440 }, { "epoch": 0.16612311455360781, "grad_norm": 2.939293146133423, "learning_rate": 9.792600896860987e-05, "loss": 3.5847, "step": 2445 }, { "epoch": 0.1664628346242696, "grad_norm": 2.323683738708496, "learning_rate": 9.79217624677266e-05, "loss": 3.4673, "step": 2450 }, { "epoch": 0.16680255469493138, "grad_norm": 3.1200194358825684, "learning_rate": 9.791751596684332e-05, "loss": 3.5641, "step": 2455 }, { "epoch": 0.16714227476559315, "grad_norm": 2.71930193901062, "learning_rate": 9.791326946596005e-05, "loss": 3.6519, "step": 2460 }, { "epoch": 0.16748199483625492, "grad_norm": 2.446498155593872, "learning_rate": 9.790902296507678e-05, "loss": 3.5345, "step": 2465 }, { "epoch": 0.1678217149069167, "grad_norm": 2.8127427101135254, "learning_rate": 9.790477646419351e-05, "loss": 3.7365, "step": 2470 }, { "epoch": 0.1681614349775785, "grad_norm": 2.792457103729248, "learning_rate": 9.790052996331024e-05, "loss": 3.7071, "step": 2475 }, { "epoch": 0.16850115504824026, "grad_norm": 2.2959587574005127, "learning_rate": 9.789628346242696e-05, "loss": 3.5937, "step": 2480 }, { "epoch": 0.16884087511890203, "grad_norm": 2.0903351306915283, "learning_rate": 9.789203696154369e-05, "loss": 3.7096, "step": 2485 }, { "epoch": 0.1691805951895638, "grad_norm": 1.8774720430374146, "learning_rate": 9.788779046066042e-05, "loss": 3.6802, "step": 2490 }, { "epoch": 0.16952031526022557, "grad_norm": 2.540419816970825, "learning_rate": 9.788354395977715e-05, "loss": 3.6033, "step": 2495 }, { "epoch": 0.16986003533088734, "grad_norm": 2.4474573135375977, "learning_rate": 9.787929745889388e-05, "loss": 3.7732, "step": 2500 }, { "epoch": 0.17019975540154914, "grad_norm": 2.218492269515991, "learning_rate": 9.78750509580106e-05, "loss": 3.8521, "step": 2505 }, { "epoch": 0.1705394754722109, "grad_norm": 2.1093363761901855, "learning_rate": 9.787080445712733e-05, "loss": 3.8351, "step": 2510 }, { "epoch": 0.17087919554287267, "grad_norm": 1.7610983848571777, "learning_rate": 9.786655795624406e-05, "loss": 3.5571, "step": 2515 }, { "epoch": 0.17121891561353444, "grad_norm": 2.0385890007019043, "learning_rate": 9.786231145536079e-05, "loss": 3.8087, "step": 2520 }, { "epoch": 0.17155863568419621, "grad_norm": 1.9753596782684326, "learning_rate": 9.785806495447752e-05, "loss": 3.7648, "step": 2525 }, { "epoch": 0.17189835575485798, "grad_norm": 2.7233643531799316, "learning_rate": 9.785381845359425e-05, "loss": 3.6077, "step": 2530 }, { "epoch": 0.17223807582551978, "grad_norm": 2.5640554428100586, "learning_rate": 9.784957195271097e-05, "loss": 3.8141, "step": 2535 }, { "epoch": 0.17257779589618155, "grad_norm": 2.5163915157318115, "learning_rate": 9.78453254518277e-05, "loss": 3.49, "step": 2540 }, { "epoch": 0.17291751596684332, "grad_norm": 2.22219181060791, "learning_rate": 9.784107895094443e-05, "loss": 3.5199, "step": 2545 }, { "epoch": 0.1732572360375051, "grad_norm": 2.1310431957244873, "learning_rate": 9.783683245006116e-05, "loss": 3.7165, "step": 2550 }, { "epoch": 0.17359695610816686, "grad_norm": 2.130831480026245, "learning_rate": 9.783258594917789e-05, "loss": 3.471, "step": 2555 }, { "epoch": 0.17393667617882866, "grad_norm": 3.715970754623413, "learning_rate": 9.782833944829461e-05, "loss": 3.4676, "step": 2560 }, { "epoch": 0.17427639624949043, "grad_norm": 3.315221071243286, "learning_rate": 9.782409294741134e-05, "loss": 3.5581, "step": 2565 }, { "epoch": 0.1746161163201522, "grad_norm": 3.4642419815063477, "learning_rate": 9.781984644652807e-05, "loss": 3.6419, "step": 2570 }, { "epoch": 0.17495583639081397, "grad_norm": 3.6717991828918457, "learning_rate": 9.78155999456448e-05, "loss": 3.7798, "step": 2575 }, { "epoch": 0.17529555646147574, "grad_norm": 2.8997764587402344, "learning_rate": 9.781135344476153e-05, "loss": 3.7123, "step": 2580 }, { "epoch": 0.1756352765321375, "grad_norm": 2.0655264854431152, "learning_rate": 9.780710694387824e-05, "loss": 3.5152, "step": 2585 }, { "epoch": 0.1759749966027993, "grad_norm": 2.969449520111084, "learning_rate": 9.780286044299498e-05, "loss": 3.4419, "step": 2590 }, { "epoch": 0.17631471667346107, "grad_norm": 2.859215259552002, "learning_rate": 9.779861394211171e-05, "loss": 3.5513, "step": 2595 }, { "epoch": 0.17665443674412284, "grad_norm": 2.473663806915283, "learning_rate": 9.779436744122842e-05, "loss": 3.4312, "step": 2600 }, { "epoch": 0.17699415681478461, "grad_norm": 3.0627822875976562, "learning_rate": 9.779012094034517e-05, "loss": 3.8827, "step": 2605 }, { "epoch": 0.17733387688544638, "grad_norm": 3.38277530670166, "learning_rate": 9.778587443946189e-05, "loss": 3.6351, "step": 2610 }, { "epoch": 0.17767359695610815, "grad_norm": 2.0422134399414062, "learning_rate": 9.778162793857861e-05, "loss": 3.3612, "step": 2615 }, { "epoch": 0.17801331702676995, "grad_norm": 2.3042876720428467, "learning_rate": 9.777738143769535e-05, "loss": 3.5923, "step": 2620 }, { "epoch": 0.17835303709743172, "grad_norm": 2.577887773513794, "learning_rate": 9.777313493681208e-05, "loss": 3.4756, "step": 2625 }, { "epoch": 0.1786927571680935, "grad_norm": 2.7902963161468506, "learning_rate": 9.776888843592879e-05, "loss": 3.4211, "step": 2630 }, { "epoch": 0.17903247723875526, "grad_norm": 2.413365125656128, "learning_rate": 9.776464193504553e-05, "loss": 3.5782, "step": 2635 }, { "epoch": 0.17937219730941703, "grad_norm": 2.4663658142089844, "learning_rate": 9.776039543416226e-05, "loss": 3.7296, "step": 2640 }, { "epoch": 0.17971191738007883, "grad_norm": 2.2067759037017822, "learning_rate": 9.775614893327898e-05, "loss": 3.5974, "step": 2645 }, { "epoch": 0.1800516374507406, "grad_norm": 2.559173822402954, "learning_rate": 9.775190243239572e-05, "loss": 3.778, "step": 2650 }, { "epoch": 0.18039135752140237, "grad_norm": 3.064662456512451, "learning_rate": 9.774765593151245e-05, "loss": 3.6979, "step": 2655 }, { "epoch": 0.18073107759206414, "grad_norm": 1.996138095855713, "learning_rate": 9.774340943062916e-05, "loss": 3.4503, "step": 2660 }, { "epoch": 0.1810707976627259, "grad_norm": 2.4670746326446533, "learning_rate": 9.77391629297459e-05, "loss": 3.5571, "step": 2665 }, { "epoch": 0.18141051773338768, "grad_norm": 3.6713316440582275, "learning_rate": 9.773491642886262e-05, "loss": 3.5245, "step": 2670 }, { "epoch": 0.18175023780404947, "grad_norm": 3.433065414428711, "learning_rate": 9.773066992797934e-05, "loss": 3.4949, "step": 2675 }, { "epoch": 0.18208995787471124, "grad_norm": 2.7018165588378906, "learning_rate": 9.772642342709609e-05, "loss": 3.7476, "step": 2680 }, { "epoch": 0.18242967794537301, "grad_norm": 1.8158012628555298, "learning_rate": 9.77221769262128e-05, "loss": 3.5977, "step": 2685 }, { "epoch": 0.18276939801603478, "grad_norm": 2.106041431427002, "learning_rate": 9.771793042532953e-05, "loss": 3.6258, "step": 2690 }, { "epoch": 0.18310911808669655, "grad_norm": 2.2543394565582275, "learning_rate": 9.771368392444627e-05, "loss": 3.2172, "step": 2695 }, { "epoch": 0.18344883815735832, "grad_norm": 2.4051880836486816, "learning_rate": 9.770943742356298e-05, "loss": 3.7232, "step": 2700 }, { "epoch": 0.18378855822802012, "grad_norm": 2.4393973350524902, "learning_rate": 9.770519092267971e-05, "loss": 3.4493, "step": 2705 }, { "epoch": 0.1841282782986819, "grad_norm": 2.4562902450561523, "learning_rate": 9.770094442179645e-05, "loss": 3.4672, "step": 2710 }, { "epoch": 0.18446799836934366, "grad_norm": 2.406501531600952, "learning_rate": 9.769669792091317e-05, "loss": 3.633, "step": 2715 }, { "epoch": 0.18480771844000543, "grad_norm": 2.541698932647705, "learning_rate": 9.76924514200299e-05, "loss": 3.6351, "step": 2720 }, { "epoch": 0.1851474385106672, "grad_norm": 2.210376739501953, "learning_rate": 9.768820491914664e-05, "loss": 3.7154, "step": 2725 }, { "epoch": 0.185487158581329, "grad_norm": 2.403480052947998, "learning_rate": 9.768395841826335e-05, "loss": 3.8181, "step": 2730 }, { "epoch": 0.18582687865199077, "grad_norm": 2.3813345432281494, "learning_rate": 9.767971191738008e-05, "loss": 3.639, "step": 2735 }, { "epoch": 0.18616659872265254, "grad_norm": 1.8875386714935303, "learning_rate": 9.767546541649681e-05, "loss": 3.44, "step": 2740 }, { "epoch": 0.1865063187933143, "grad_norm": 9.793167114257812, "learning_rate": 9.767121891561354e-05, "loss": 3.4686, "step": 2745 }, { "epoch": 0.18684603886397608, "grad_norm": 2.6854958534240723, "learning_rate": 9.766697241473026e-05, "loss": 3.6919, "step": 2750 }, { "epoch": 0.18718575893463785, "grad_norm": 2.650813579559326, "learning_rate": 9.766272591384699e-05, "loss": 3.8755, "step": 2755 }, { "epoch": 0.18752547900529964, "grad_norm": 2.170989751815796, "learning_rate": 9.765847941296372e-05, "loss": 3.6333, "step": 2760 }, { "epoch": 0.18786519907596141, "grad_norm": 1.9350907802581787, "learning_rate": 9.765423291208045e-05, "loss": 3.7114, "step": 2765 }, { "epoch": 0.18820491914662318, "grad_norm": 1.8634272813796997, "learning_rate": 9.764998641119718e-05, "loss": 3.6276, "step": 2770 }, { "epoch": 0.18854463921728495, "grad_norm": 2.8237640857696533, "learning_rate": 9.76457399103139e-05, "loss": 3.8684, "step": 2775 }, { "epoch": 0.18888435928794672, "grad_norm": 2.0793941020965576, "learning_rate": 9.764149340943063e-05, "loss": 3.6411, "step": 2780 }, { "epoch": 0.1892240793586085, "grad_norm": 2.297959089279175, "learning_rate": 9.763724690854736e-05, "loss": 3.3032, "step": 2785 }, { "epoch": 0.1895637994292703, "grad_norm": 3.117781639099121, "learning_rate": 9.763300040766409e-05, "loss": 3.6391, "step": 2790 }, { "epoch": 0.18990351949993206, "grad_norm": 2.892871141433716, "learning_rate": 9.762875390678082e-05, "loss": 3.4297, "step": 2795 }, { "epoch": 0.19024323957059383, "grad_norm": 2.8412797451019287, "learning_rate": 9.762450740589754e-05, "loss": 3.5017, "step": 2800 }, { "epoch": 0.1905829596412556, "grad_norm": 2.782804489135742, "learning_rate": 9.762026090501427e-05, "loss": 3.5181, "step": 2805 }, { "epoch": 0.19092267971191737, "grad_norm": 2.209777355194092, "learning_rate": 9.7616014404131e-05, "loss": 3.2019, "step": 2810 }, { "epoch": 0.19126239978257917, "grad_norm": 3.334501028060913, "learning_rate": 9.761176790324773e-05, "loss": 3.582, "step": 2815 }, { "epoch": 0.19160211985324094, "grad_norm": 3.1110005378723145, "learning_rate": 9.760752140236446e-05, "loss": 3.7484, "step": 2820 }, { "epoch": 0.1919418399239027, "grad_norm": 1.786243200302124, "learning_rate": 9.760327490148118e-05, "loss": 3.4655, "step": 2825 }, { "epoch": 0.19228155999456448, "grad_norm": 2.2295684814453125, "learning_rate": 9.759902840059791e-05, "loss": 3.6777, "step": 2830 }, { "epoch": 0.19262128006522625, "grad_norm": 2.8025248050689697, "learning_rate": 9.759478189971464e-05, "loss": 3.6121, "step": 2835 }, { "epoch": 0.19296100013588802, "grad_norm": 2.6978213787078857, "learning_rate": 9.759053539883137e-05, "loss": 3.5891, "step": 2840 }, { "epoch": 0.19330072020654981, "grad_norm": 1.7959308624267578, "learning_rate": 9.75862888979481e-05, "loss": 3.6874, "step": 2845 }, { "epoch": 0.19364044027721158, "grad_norm": 2.0286197662353516, "learning_rate": 9.758204239706482e-05, "loss": 3.5829, "step": 2850 }, { "epoch": 0.19398016034787335, "grad_norm": 2.8474998474121094, "learning_rate": 9.757779589618155e-05, "loss": 3.6931, "step": 2855 }, { "epoch": 0.19431988041853512, "grad_norm": 2.3275139331817627, "learning_rate": 9.757354939529828e-05, "loss": 3.5373, "step": 2860 }, { "epoch": 0.1946596004891969, "grad_norm": 2.9940762519836426, "learning_rate": 9.756930289441501e-05, "loss": 3.7331, "step": 2865 }, { "epoch": 0.19499932055985866, "grad_norm": 2.6775050163269043, "learning_rate": 9.756505639353174e-05, "loss": 3.4479, "step": 2870 }, { "epoch": 0.19533904063052046, "grad_norm": 2.838517904281616, "learning_rate": 9.756080989264846e-05, "loss": 3.5264, "step": 2875 }, { "epoch": 0.19567876070118223, "grad_norm": 2.5054478645324707, "learning_rate": 9.755656339176519e-05, "loss": 3.6542, "step": 2880 }, { "epoch": 0.196018480771844, "grad_norm": 2.3794655799865723, "learning_rate": 9.755231689088192e-05, "loss": 3.5861, "step": 2885 }, { "epoch": 0.19635820084250577, "grad_norm": 2.6987698078155518, "learning_rate": 9.754807038999865e-05, "loss": 3.1002, "step": 2890 }, { "epoch": 0.19669792091316754, "grad_norm": 2.576537847518921, "learning_rate": 9.754382388911538e-05, "loss": 3.2865, "step": 2895 }, { "epoch": 0.19703764098382934, "grad_norm": 2.7286622524261475, "learning_rate": 9.75395773882321e-05, "loss": 3.6333, "step": 2900 }, { "epoch": 0.1973773610544911, "grad_norm": 2.4445180892944336, "learning_rate": 9.753533088734883e-05, "loss": 3.3457, "step": 2905 }, { "epoch": 0.19771708112515288, "grad_norm": 2.815880298614502, "learning_rate": 9.753108438646556e-05, "loss": 3.4336, "step": 2910 }, { "epoch": 0.19805680119581465, "grad_norm": 2.600489854812622, "learning_rate": 9.752683788558229e-05, "loss": 3.5123, "step": 2915 }, { "epoch": 0.19839652126647642, "grad_norm": 2.266153573989868, "learning_rate": 9.752259138469902e-05, "loss": 3.5787, "step": 2920 }, { "epoch": 0.1987362413371382, "grad_norm": 2.3167531490325928, "learning_rate": 9.751834488381574e-05, "loss": 3.5104, "step": 2925 }, { "epoch": 0.19907596140779998, "grad_norm": 2.650376319885254, "learning_rate": 9.751409838293247e-05, "loss": 3.3488, "step": 2930 }, { "epoch": 0.19941568147846175, "grad_norm": 2.216982126235962, "learning_rate": 9.75098518820492e-05, "loss": 3.7367, "step": 2935 }, { "epoch": 0.19975540154912352, "grad_norm": 3.7843616008758545, "learning_rate": 9.750560538116591e-05, "loss": 3.3857, "step": 2940 }, { "epoch": 0.2000951216197853, "grad_norm": 2.531493902206421, "learning_rate": 9.750135888028266e-05, "loss": 3.57, "step": 2945 }, { "epoch": 0.20043484169044706, "grad_norm": 2.5871853828430176, "learning_rate": 9.749711237939938e-05, "loss": 3.7574, "step": 2950 }, { "epoch": 0.20077456176110883, "grad_norm": 1.9841268062591553, "learning_rate": 9.74928658785161e-05, "loss": 3.8677, "step": 2955 }, { "epoch": 0.20111428183177063, "grad_norm": 2.5171425342559814, "learning_rate": 9.748861937763284e-05, "loss": 3.7339, "step": 2960 }, { "epoch": 0.2014540019024324, "grad_norm": 1.8714319467544556, "learning_rate": 9.748437287674957e-05, "loss": 3.47, "step": 2965 }, { "epoch": 0.20179372197309417, "grad_norm": 2.2297403812408447, "learning_rate": 9.748012637586628e-05, "loss": 3.5625, "step": 2970 }, { "epoch": 0.20213344204375594, "grad_norm": 2.2692747116088867, "learning_rate": 9.747587987498302e-05, "loss": 3.4923, "step": 2975 }, { "epoch": 0.2024731621144177, "grad_norm": 2.2953994274139404, "learning_rate": 9.747163337409975e-05, "loss": 3.5081, "step": 2980 }, { "epoch": 0.2028128821850795, "grad_norm": 2.5327563285827637, "learning_rate": 9.746738687321647e-05, "loss": 3.475, "step": 2985 }, { "epoch": 0.20315260225574128, "grad_norm": 1.9328463077545166, "learning_rate": 9.746314037233321e-05, "loss": 3.6246, "step": 2990 }, { "epoch": 0.20349232232640305, "grad_norm": 3.280945062637329, "learning_rate": 9.745889387144994e-05, "loss": 3.6621, "step": 2995 }, { "epoch": 0.20383204239706482, "grad_norm": 2.1963016986846924, "learning_rate": 9.745464737056665e-05, "loss": 3.5739, "step": 3000 }, { "epoch": 0.2041717624677266, "grad_norm": 3.0117032527923584, "learning_rate": 9.745040086968339e-05, "loss": 3.543, "step": 3005 }, { "epoch": 0.20451148253838836, "grad_norm": 2.1569318771362305, "learning_rate": 9.74461543688001e-05, "loss": 3.5851, "step": 3010 }, { "epoch": 0.20485120260905015, "grad_norm": 2.2693192958831787, "learning_rate": 9.744190786791683e-05, "loss": 3.7163, "step": 3015 }, { "epoch": 0.20519092267971192, "grad_norm": 2.4745140075683594, "learning_rate": 9.743766136703358e-05, "loss": 3.6156, "step": 3020 }, { "epoch": 0.2055306427503737, "grad_norm": 2.7690165042877197, "learning_rate": 9.743341486615029e-05, "loss": 3.6366, "step": 3025 }, { "epoch": 0.20587036282103546, "grad_norm": 2.6442840099334717, "learning_rate": 9.742916836526702e-05, "loss": 3.5539, "step": 3030 }, { "epoch": 0.20621008289169723, "grad_norm": 2.116943359375, "learning_rate": 9.742492186438376e-05, "loss": 3.5905, "step": 3035 }, { "epoch": 0.206549802962359, "grad_norm": 2.459552526473999, "learning_rate": 9.742067536350047e-05, "loss": 3.6107, "step": 3040 }, { "epoch": 0.2068895230330208, "grad_norm": 2.3407833576202393, "learning_rate": 9.74164288626172e-05, "loss": 3.4376, "step": 3045 }, { "epoch": 0.20722924310368257, "grad_norm": 2.142601490020752, "learning_rate": 9.741218236173394e-05, "loss": 3.5207, "step": 3050 }, { "epoch": 0.20756896317434434, "grad_norm": 2.957223892211914, "learning_rate": 9.740793586085066e-05, "loss": 3.4058, "step": 3055 }, { "epoch": 0.2079086832450061, "grad_norm": 3.0165371894836426, "learning_rate": 9.740368935996739e-05, "loss": 3.6678, "step": 3060 }, { "epoch": 0.20824840331566788, "grad_norm": 1.903260588645935, "learning_rate": 9.739944285908413e-05, "loss": 3.3852, "step": 3065 }, { "epoch": 0.20858812338632968, "grad_norm": 2.46872878074646, "learning_rate": 9.739519635820084e-05, "loss": 3.6929, "step": 3070 }, { "epoch": 0.20892784345699145, "grad_norm": 1.9820877313613892, "learning_rate": 9.739094985731757e-05, "loss": 3.4162, "step": 3075 }, { "epoch": 0.20926756352765322, "grad_norm": 2.5532376766204834, "learning_rate": 9.738670335643431e-05, "loss": 3.3846, "step": 3080 }, { "epoch": 0.209607283598315, "grad_norm": 2.64316463470459, "learning_rate": 9.738245685555103e-05, "loss": 3.8069, "step": 3085 }, { "epoch": 0.20994700366897676, "grad_norm": 2.5200743675231934, "learning_rate": 9.737821035466775e-05, "loss": 3.5191, "step": 3090 }, { "epoch": 0.21028672373963853, "grad_norm": 2.303730010986328, "learning_rate": 9.737396385378448e-05, "loss": 3.6871, "step": 3095 }, { "epoch": 0.21062644381030032, "grad_norm": 2.920701503753662, "learning_rate": 9.736971735290121e-05, "loss": 3.8136, "step": 3100 }, { "epoch": 0.2109661638809621, "grad_norm": 2.594109296798706, "learning_rate": 9.736547085201794e-05, "loss": 3.5586, "step": 3105 }, { "epoch": 0.21130588395162386, "grad_norm": 2.717984437942505, "learning_rate": 9.736122435113467e-05, "loss": 3.766, "step": 3110 }, { "epoch": 0.21164560402228563, "grad_norm": 2.254509210586548, "learning_rate": 9.73569778502514e-05, "loss": 3.6191, "step": 3115 }, { "epoch": 0.2119853240929474, "grad_norm": 3.8217010498046875, "learning_rate": 9.735273134936812e-05, "loss": 3.615, "step": 3120 }, { "epoch": 0.21232504416360917, "grad_norm": 2.576765537261963, "learning_rate": 9.734848484848485e-05, "loss": 3.3398, "step": 3125 }, { "epoch": 0.21266476423427097, "grad_norm": 3.2126941680908203, "learning_rate": 9.734423834760158e-05, "loss": 3.7622, "step": 3130 }, { "epoch": 0.21300448430493274, "grad_norm": 2.3438591957092285, "learning_rate": 9.733999184671831e-05, "loss": 3.4653, "step": 3135 }, { "epoch": 0.2133442043755945, "grad_norm": 1.928268551826477, "learning_rate": 9.733574534583503e-05, "loss": 3.4275, "step": 3140 }, { "epoch": 0.21368392444625628, "grad_norm": 2.6850974559783936, "learning_rate": 9.733149884495176e-05, "loss": 3.6838, "step": 3145 }, { "epoch": 0.21402364451691805, "grad_norm": 2.4214115142822266, "learning_rate": 9.732725234406849e-05, "loss": 3.2707, "step": 3150 }, { "epoch": 0.21436336458757985, "grad_norm": 3.014404535293579, "learning_rate": 9.732300584318522e-05, "loss": 3.5808, "step": 3155 }, { "epoch": 0.21470308465824162, "grad_norm": 2.328543186187744, "learning_rate": 9.731875934230195e-05, "loss": 3.5542, "step": 3160 }, { "epoch": 0.2150428047289034, "grad_norm": 1.7934205532073975, "learning_rate": 9.731451284141867e-05, "loss": 3.6418, "step": 3165 }, { "epoch": 0.21538252479956516, "grad_norm": 3.1732406616210938, "learning_rate": 9.73102663405354e-05, "loss": 3.3123, "step": 3170 }, { "epoch": 0.21572224487022693, "grad_norm": 2.4772515296936035, "learning_rate": 9.730601983965213e-05, "loss": 3.5331, "step": 3175 }, { "epoch": 0.2160619649408887, "grad_norm": 2.125526189804077, "learning_rate": 9.730177333876886e-05, "loss": 3.6172, "step": 3180 }, { "epoch": 0.2164016850115505, "grad_norm": 2.6105737686157227, "learning_rate": 9.729752683788559e-05, "loss": 3.6548, "step": 3185 }, { "epoch": 0.21674140508221226, "grad_norm": 2.7148489952087402, "learning_rate": 9.729328033700231e-05, "loss": 3.8312, "step": 3190 }, { "epoch": 0.21708112515287403, "grad_norm": 2.44311785697937, "learning_rate": 9.728903383611904e-05, "loss": 3.5378, "step": 3195 }, { "epoch": 0.2174208452235358, "grad_norm": 2.3602492809295654, "learning_rate": 9.728478733523577e-05, "loss": 3.7604, "step": 3200 }, { "epoch": 0.21776056529419757, "grad_norm": 2.6540510654449463, "learning_rate": 9.72805408343525e-05, "loss": 3.7915, "step": 3205 }, { "epoch": 0.21810028536485934, "grad_norm": 2.384971857070923, "learning_rate": 9.727629433346923e-05, "loss": 3.7734, "step": 3210 }, { "epoch": 0.21844000543552114, "grad_norm": 2.243856191635132, "learning_rate": 9.727204783258595e-05, "loss": 3.6214, "step": 3215 }, { "epoch": 0.2187797255061829, "grad_norm": 2.1839208602905273, "learning_rate": 9.726780133170268e-05, "loss": 3.4916, "step": 3220 }, { "epoch": 0.21911944557684468, "grad_norm": 2.262728691101074, "learning_rate": 9.726355483081941e-05, "loss": 3.4205, "step": 3225 }, { "epoch": 0.21945916564750645, "grad_norm": 2.3842148780822754, "learning_rate": 9.725930832993614e-05, "loss": 3.6629, "step": 3230 }, { "epoch": 0.21979888571816822, "grad_norm": 2.1511716842651367, "learning_rate": 9.725506182905287e-05, "loss": 3.4792, "step": 3235 }, { "epoch": 0.22013860578883002, "grad_norm": 2.3301377296447754, "learning_rate": 9.72508153281696e-05, "loss": 3.5182, "step": 3240 }, { "epoch": 0.22047832585949179, "grad_norm": 2.0076563358306885, "learning_rate": 9.724656882728632e-05, "loss": 3.4083, "step": 3245 }, { "epoch": 0.22081804593015356, "grad_norm": 2.830758810043335, "learning_rate": 9.724232232640305e-05, "loss": 3.5786, "step": 3250 }, { "epoch": 0.22115776600081533, "grad_norm": 2.079038381576538, "learning_rate": 9.723807582551978e-05, "loss": 3.4956, "step": 3255 }, { "epoch": 0.2214974860714771, "grad_norm": 2.3984270095825195, "learning_rate": 9.723382932463651e-05, "loss": 3.6339, "step": 3260 }, { "epoch": 0.22183720614213887, "grad_norm": 2.0549917221069336, "learning_rate": 9.722958282375324e-05, "loss": 3.619, "step": 3265 }, { "epoch": 0.22217692621280066, "grad_norm": 2.448378801345825, "learning_rate": 9.722533632286996e-05, "loss": 3.7021, "step": 3270 }, { "epoch": 0.22251664628346243, "grad_norm": 2.4262993335723877, "learning_rate": 9.722108982198669e-05, "loss": 3.5763, "step": 3275 }, { "epoch": 0.2228563663541242, "grad_norm": 2.3491122722625732, "learning_rate": 9.721684332110342e-05, "loss": 3.5363, "step": 3280 }, { "epoch": 0.22319608642478597, "grad_norm": 2.4455039501190186, "learning_rate": 9.721259682022015e-05, "loss": 3.8102, "step": 3285 }, { "epoch": 0.22353580649544774, "grad_norm": 3.75618052482605, "learning_rate": 9.720835031933688e-05, "loss": 3.5561, "step": 3290 }, { "epoch": 0.22387552656610954, "grad_norm": 4.276167869567871, "learning_rate": 9.720410381845359e-05, "loss": 3.2564, "step": 3295 }, { "epoch": 0.2242152466367713, "grad_norm": 2.3141064643859863, "learning_rate": 9.719985731757033e-05, "loss": 3.4724, "step": 3300 }, { "epoch": 0.22455496670743308, "grad_norm": 2.0347092151641846, "learning_rate": 9.719561081668706e-05, "loss": 3.7681, "step": 3305 }, { "epoch": 0.22489468677809485, "grad_norm": 3.3582491874694824, "learning_rate": 9.719136431580377e-05, "loss": 3.3226, "step": 3310 }, { "epoch": 0.22523440684875662, "grad_norm": 2.4000747203826904, "learning_rate": 9.718711781492052e-05, "loss": 3.3896, "step": 3315 }, { "epoch": 0.2255741269194184, "grad_norm": 2.094728469848633, "learning_rate": 9.718287131403724e-05, "loss": 3.4655, "step": 3320 }, { "epoch": 0.22591384699008019, "grad_norm": 2.277259349822998, "learning_rate": 9.717862481315396e-05, "loss": 3.5437, "step": 3325 }, { "epoch": 0.22625356706074196, "grad_norm": 2.1364047527313232, "learning_rate": 9.71743783122707e-05, "loss": 3.7251, "step": 3330 }, { "epoch": 0.22659328713140373, "grad_norm": 2.2038934230804443, "learning_rate": 9.717013181138743e-05, "loss": 3.4311, "step": 3335 }, { "epoch": 0.2269330072020655, "grad_norm": 3.897040605545044, "learning_rate": 9.716588531050414e-05, "loss": 3.2396, "step": 3340 }, { "epoch": 0.22727272727272727, "grad_norm": 2.321673631668091, "learning_rate": 9.716163880962088e-05, "loss": 3.5803, "step": 3345 }, { "epoch": 0.22761244734338903, "grad_norm": 2.327251434326172, "learning_rate": 9.715739230873761e-05, "loss": 3.6083, "step": 3350 }, { "epoch": 0.22795216741405083, "grad_norm": 2.3726818561553955, "learning_rate": 9.715314580785433e-05, "loss": 3.5619, "step": 3355 }, { "epoch": 0.2282918874847126, "grad_norm": 1.785314679145813, "learning_rate": 9.714889930697107e-05, "loss": 3.6039, "step": 3360 }, { "epoch": 0.22863160755537437, "grad_norm": 2.9010629653930664, "learning_rate": 9.714465280608778e-05, "loss": 3.7284, "step": 3365 }, { "epoch": 0.22897132762603614, "grad_norm": 2.874483823776245, "learning_rate": 9.714040630520451e-05, "loss": 3.6143, "step": 3370 }, { "epoch": 0.2293110476966979, "grad_norm": 2.348184823989868, "learning_rate": 9.713615980432125e-05, "loss": 3.6277, "step": 3375 }, { "epoch": 0.2296507677673597, "grad_norm": 2.8848817348480225, "learning_rate": 9.713191330343797e-05, "loss": 3.4631, "step": 3380 }, { "epoch": 0.22999048783802148, "grad_norm": 2.196012496948242, "learning_rate": 9.71276668025547e-05, "loss": 3.6293, "step": 3385 }, { "epoch": 0.23033020790868325, "grad_norm": 2.025202989578247, "learning_rate": 9.712342030167144e-05, "loss": 3.4078, "step": 3390 }, { "epoch": 0.23066992797934502, "grad_norm": 2.2144856452941895, "learning_rate": 9.711917380078815e-05, "loss": 3.656, "step": 3395 }, { "epoch": 0.2310096480500068, "grad_norm": 2.4922642707824707, "learning_rate": 9.711492729990488e-05, "loss": 3.4842, "step": 3400 }, { "epoch": 0.23134936812066856, "grad_norm": 2.624922037124634, "learning_rate": 9.711068079902162e-05, "loss": 3.6389, "step": 3405 }, { "epoch": 0.23168908819133036, "grad_norm": 2.332202196121216, "learning_rate": 9.710643429813833e-05, "loss": 3.5247, "step": 3410 }, { "epoch": 0.23202880826199213, "grad_norm": 2.6892404556274414, "learning_rate": 9.710218779725506e-05, "loss": 3.5611, "step": 3415 }, { "epoch": 0.2323685283326539, "grad_norm": 2.2102909088134766, "learning_rate": 9.70979412963718e-05, "loss": 3.759, "step": 3420 }, { "epoch": 0.23270824840331567, "grad_norm": 2.823338031768799, "learning_rate": 9.709369479548852e-05, "loss": 3.6453, "step": 3425 }, { "epoch": 0.23304796847397743, "grad_norm": 2.9724040031433105, "learning_rate": 9.708944829460525e-05, "loss": 3.6166, "step": 3430 }, { "epoch": 0.2333876885446392, "grad_norm": 1.7646759748458862, "learning_rate": 9.708520179372197e-05, "loss": 3.6706, "step": 3435 }, { "epoch": 0.233727408615301, "grad_norm": 3.241206169128418, "learning_rate": 9.70809552928387e-05, "loss": 3.3441, "step": 3440 }, { "epoch": 0.23406712868596277, "grad_norm": 3.5427963733673096, "learning_rate": 9.707670879195543e-05, "loss": 3.377, "step": 3445 }, { "epoch": 0.23440684875662454, "grad_norm": 2.484489917755127, "learning_rate": 9.707246229107216e-05, "loss": 3.555, "step": 3450 }, { "epoch": 0.2347465688272863, "grad_norm": 2.6747255325317383, "learning_rate": 9.706821579018889e-05, "loss": 3.5748, "step": 3455 }, { "epoch": 0.23508628889794808, "grad_norm": 2.3457493782043457, "learning_rate": 9.706396928930561e-05, "loss": 3.4077, "step": 3460 }, { "epoch": 0.23542600896860988, "grad_norm": 1.8754417896270752, "learning_rate": 9.705972278842234e-05, "loss": 3.6926, "step": 3465 }, { "epoch": 0.23576572903927165, "grad_norm": 2.3581864833831787, "learning_rate": 9.705547628753907e-05, "loss": 3.4543, "step": 3470 }, { "epoch": 0.23610544910993342, "grad_norm": 2.6379165649414062, "learning_rate": 9.70512297866558e-05, "loss": 3.4407, "step": 3475 }, { "epoch": 0.2364451691805952, "grad_norm": 3.1290669441223145, "learning_rate": 9.704698328577253e-05, "loss": 3.7414, "step": 3480 }, { "epoch": 0.23678488925125696, "grad_norm": 2.228219747543335, "learning_rate": 9.704273678488925e-05, "loss": 3.6329, "step": 3485 }, { "epoch": 0.23712460932191873, "grad_norm": 2.2058937549591064, "learning_rate": 9.703849028400598e-05, "loss": 3.3255, "step": 3490 }, { "epoch": 0.23746432939258053, "grad_norm": 2.2477197647094727, "learning_rate": 9.703424378312271e-05, "loss": 3.5478, "step": 3495 }, { "epoch": 0.2378040494632423, "grad_norm": 2.473595142364502, "learning_rate": 9.702999728223944e-05, "loss": 3.6665, "step": 3500 }, { "epoch": 0.23814376953390406, "grad_norm": 2.5324323177337646, "learning_rate": 9.702575078135617e-05, "loss": 3.6785, "step": 3505 }, { "epoch": 0.23848348960456583, "grad_norm": 2.982419013977051, "learning_rate": 9.70215042804729e-05, "loss": 3.472, "step": 3510 }, { "epoch": 0.2388232096752276, "grad_norm": 2.538529396057129, "learning_rate": 9.701725777958962e-05, "loss": 3.5603, "step": 3515 }, { "epoch": 0.23916292974588937, "grad_norm": 2.321845054626465, "learning_rate": 9.701301127870635e-05, "loss": 3.4214, "step": 3520 }, { "epoch": 0.23950264981655117, "grad_norm": 1.846472144126892, "learning_rate": 9.700876477782308e-05, "loss": 3.6138, "step": 3525 }, { "epoch": 0.23984236988721294, "grad_norm": 2.2306771278381348, "learning_rate": 9.70045182769398e-05, "loss": 3.5543, "step": 3530 }, { "epoch": 0.2401820899578747, "grad_norm": 2.332549571990967, "learning_rate": 9.700027177605653e-05, "loss": 3.2609, "step": 3535 }, { "epoch": 0.24052181002853648, "grad_norm": 2.479609966278076, "learning_rate": 9.699602527517326e-05, "loss": 3.5471, "step": 3540 }, { "epoch": 0.24086153009919825, "grad_norm": 2.5703837871551514, "learning_rate": 9.699177877428999e-05, "loss": 3.5975, "step": 3545 }, { "epoch": 0.24120125016986005, "grad_norm": 2.5452921390533447, "learning_rate": 9.698753227340672e-05, "loss": 3.7861, "step": 3550 }, { "epoch": 0.24154097024052182, "grad_norm": 2.0180318355560303, "learning_rate": 9.698328577252345e-05, "loss": 3.6417, "step": 3555 }, { "epoch": 0.2418806903111836, "grad_norm": 2.594252347946167, "learning_rate": 9.697903927164017e-05, "loss": 3.3996, "step": 3560 }, { "epoch": 0.24222041038184536, "grad_norm": 2.105578899383545, "learning_rate": 9.69747927707569e-05, "loss": 3.5252, "step": 3565 }, { "epoch": 0.24256013045250713, "grad_norm": 2.715123414993286, "learning_rate": 9.697054626987363e-05, "loss": 3.3542, "step": 3570 }, { "epoch": 0.2428998505231689, "grad_norm": 3.476891040802002, "learning_rate": 9.696629976899036e-05, "loss": 3.6821, "step": 3575 }, { "epoch": 0.2432395705938307, "grad_norm": 2.4146616458892822, "learning_rate": 9.696205326810709e-05, "loss": 3.6638, "step": 3580 }, { "epoch": 0.24357929066449246, "grad_norm": 2.325687885284424, "learning_rate": 9.695780676722381e-05, "loss": 3.6656, "step": 3585 }, { "epoch": 0.24391901073515423, "grad_norm": 2.509408712387085, "learning_rate": 9.695356026634054e-05, "loss": 3.3896, "step": 3590 }, { "epoch": 0.244258730805816, "grad_norm": 2.271212100982666, "learning_rate": 9.694931376545727e-05, "loss": 3.5217, "step": 3595 }, { "epoch": 0.24459845087647777, "grad_norm": 2.4985265731811523, "learning_rate": 9.6945067264574e-05, "loss": 3.483, "step": 3600 }, { "epoch": 0.24493817094713954, "grad_norm": 2.5406460762023926, "learning_rate": 9.694082076369073e-05, "loss": 3.4885, "step": 3605 }, { "epoch": 0.24527789101780134, "grad_norm": 2.8880388736724854, "learning_rate": 9.693657426280745e-05, "loss": 3.5721, "step": 3610 }, { "epoch": 0.2456176110884631, "grad_norm": 2.0021584033966064, "learning_rate": 9.693232776192418e-05, "loss": 3.7434, "step": 3615 }, { "epoch": 0.24595733115912488, "grad_norm": 2.264113664627075, "learning_rate": 9.692808126104091e-05, "loss": 3.3232, "step": 3620 }, { "epoch": 0.24629705122978665, "grad_norm": 2.6743524074554443, "learning_rate": 9.692383476015764e-05, "loss": 3.4665, "step": 3625 }, { "epoch": 0.24663677130044842, "grad_norm": 2.0711376667022705, "learning_rate": 9.691958825927437e-05, "loss": 3.4765, "step": 3630 }, { "epoch": 0.24697649137111022, "grad_norm": 2.702500820159912, "learning_rate": 9.691534175839108e-05, "loss": 3.723, "step": 3635 }, { "epoch": 0.247316211441772, "grad_norm": 2.507871627807617, "learning_rate": 9.691109525750782e-05, "loss": 3.7241, "step": 3640 }, { "epoch": 0.24765593151243376, "grad_norm": 2.5994646549224854, "learning_rate": 9.690684875662455e-05, "loss": 3.7523, "step": 3645 }, { "epoch": 0.24799565158309553, "grad_norm": 1.9709161520004272, "learning_rate": 9.690260225574126e-05, "loss": 3.53, "step": 3650 }, { "epoch": 0.2483353716537573, "grad_norm": 2.298203468322754, "learning_rate": 9.6898355754858e-05, "loss": 3.3325, "step": 3655 }, { "epoch": 0.24867509172441907, "grad_norm": 2.473635196685791, "learning_rate": 9.689410925397473e-05, "loss": 3.4705, "step": 3660 }, { "epoch": 0.24901481179508086, "grad_norm": 2.189612865447998, "learning_rate": 9.688986275309145e-05, "loss": 3.4748, "step": 3665 }, { "epoch": 0.24935453186574263, "grad_norm": 2.4042279720306396, "learning_rate": 9.688561625220819e-05, "loss": 3.5936, "step": 3670 }, { "epoch": 0.2496942519364044, "grad_norm": 3.3932950496673584, "learning_rate": 9.688136975132492e-05, "loss": 3.6917, "step": 3675 }, { "epoch": 0.2500339720070662, "grad_norm": 2.467170000076294, "learning_rate": 9.687712325044163e-05, "loss": 3.6054, "step": 3680 }, { "epoch": 0.25037369207772797, "grad_norm": 2.3353543281555176, "learning_rate": 9.687287674955837e-05, "loss": 3.4045, "step": 3685 }, { "epoch": 0.2507134121483897, "grad_norm": 2.071033477783203, "learning_rate": 9.68686302486751e-05, "loss": 3.46, "step": 3690 }, { "epoch": 0.2510531322190515, "grad_norm": 2.2171244621276855, "learning_rate": 9.686438374779182e-05, "loss": 3.637, "step": 3695 }, { "epoch": 0.25139285228971325, "grad_norm": 2.5467214584350586, "learning_rate": 9.686013724690856e-05, "loss": 3.6963, "step": 3700 }, { "epoch": 0.25173257236037505, "grad_norm": 2.5387585163116455, "learning_rate": 9.685589074602529e-05, "loss": 3.6495, "step": 3705 }, { "epoch": 0.25207229243103685, "grad_norm": 2.4366462230682373, "learning_rate": 9.6851644245142e-05, "loss": 3.5348, "step": 3710 }, { "epoch": 0.2524120125016986, "grad_norm": 3.1417269706726074, "learning_rate": 9.684739774425874e-05, "loss": 3.5507, "step": 3715 }, { "epoch": 0.2527517325723604, "grad_norm": 1.702452540397644, "learning_rate": 9.684315124337546e-05, "loss": 3.4484, "step": 3720 }, { "epoch": 0.25309145264302213, "grad_norm": 1.9584827423095703, "learning_rate": 9.683890474249218e-05, "loss": 3.6776, "step": 3725 }, { "epoch": 0.2534311727136839, "grad_norm": 2.385561943054199, "learning_rate": 9.683465824160893e-05, "loss": 3.701, "step": 3730 }, { "epoch": 0.2537708927843457, "grad_norm": 2.3099288940429688, "learning_rate": 9.683041174072564e-05, "loss": 3.5095, "step": 3735 }, { "epoch": 0.25411061285500747, "grad_norm": 2.428244113922119, "learning_rate": 9.682616523984237e-05, "loss": 3.6398, "step": 3740 }, { "epoch": 0.25445033292566926, "grad_norm": 3.5786471366882324, "learning_rate": 9.682191873895911e-05, "loss": 3.4363, "step": 3745 }, { "epoch": 0.254790052996331, "grad_norm": 1.8464620113372803, "learning_rate": 9.681767223807582e-05, "loss": 3.4019, "step": 3750 }, { "epoch": 0.2551297730669928, "grad_norm": 2.6754884719848633, "learning_rate": 9.681342573719255e-05, "loss": 3.6996, "step": 3755 }, { "epoch": 0.25546949313765455, "grad_norm": 2.255600690841675, "learning_rate": 9.68091792363093e-05, "loss": 3.5124, "step": 3760 }, { "epoch": 0.25580921320831634, "grad_norm": 1.8826730251312256, "learning_rate": 9.680493273542601e-05, "loss": 3.5347, "step": 3765 }, { "epoch": 0.25614893327897814, "grad_norm": 2.5889227390289307, "learning_rate": 9.680068623454274e-05, "loss": 3.5486, "step": 3770 }, { "epoch": 0.2564886533496399, "grad_norm": 2.1120548248291016, "learning_rate": 9.679643973365948e-05, "loss": 3.4781, "step": 3775 }, { "epoch": 0.2568283734203017, "grad_norm": 2.3759171962738037, "learning_rate": 9.679219323277619e-05, "loss": 3.6285, "step": 3780 }, { "epoch": 0.2571680934909634, "grad_norm": 2.5310583114624023, "learning_rate": 9.678794673189292e-05, "loss": 3.6126, "step": 3785 }, { "epoch": 0.2575078135616252, "grad_norm": 2.1929080486297607, "learning_rate": 9.678370023100965e-05, "loss": 3.6937, "step": 3790 }, { "epoch": 0.257847533632287, "grad_norm": 2.534999132156372, "learning_rate": 9.677945373012638e-05, "loss": 3.5688, "step": 3795 }, { "epoch": 0.25818725370294876, "grad_norm": 2.346290349960327, "learning_rate": 9.67752072292431e-05, "loss": 3.4974, "step": 3800 }, { "epoch": 0.25852697377361056, "grad_norm": 2.659761428833008, "learning_rate": 9.677096072835983e-05, "loss": 3.5312, "step": 3805 }, { "epoch": 0.2588666938442723, "grad_norm": 2.5047528743743896, "learning_rate": 9.676671422747656e-05, "loss": 3.6177, "step": 3810 }, { "epoch": 0.2592064139149341, "grad_norm": 2.0250766277313232, "learning_rate": 9.676246772659329e-05, "loss": 3.5748, "step": 3815 }, { "epoch": 0.2595461339855959, "grad_norm": 2.3063530921936035, "learning_rate": 9.675822122571002e-05, "loss": 3.3994, "step": 3820 }, { "epoch": 0.25988585405625764, "grad_norm": 2.4666457176208496, "learning_rate": 9.675397472482674e-05, "loss": 3.6596, "step": 3825 }, { "epoch": 0.26022557412691943, "grad_norm": 3.235499858856201, "learning_rate": 9.674972822394347e-05, "loss": 3.3593, "step": 3830 }, { "epoch": 0.2605652941975812, "grad_norm": 2.335772752761841, "learning_rate": 9.67454817230602e-05, "loss": 3.431, "step": 3835 }, { "epoch": 0.260905014268243, "grad_norm": 1.8728452920913696, "learning_rate": 9.674123522217693e-05, "loss": 3.5788, "step": 3840 }, { "epoch": 0.2612447343389047, "grad_norm": 2.2257189750671387, "learning_rate": 9.673698872129366e-05, "loss": 3.7165, "step": 3845 }, { "epoch": 0.2615844544095665, "grad_norm": 2.8375751972198486, "learning_rate": 9.673274222041038e-05, "loss": 3.6698, "step": 3850 }, { "epoch": 0.2619241744802283, "grad_norm": 2.3299713134765625, "learning_rate": 9.672849571952711e-05, "loss": 3.3994, "step": 3855 }, { "epoch": 0.26226389455089005, "grad_norm": 3.0395591259002686, "learning_rate": 9.672424921864384e-05, "loss": 3.6161, "step": 3860 }, { "epoch": 0.26260361462155185, "grad_norm": 2.087859630584717, "learning_rate": 9.672000271776057e-05, "loss": 3.4369, "step": 3865 }, { "epoch": 0.2629433346922136, "grad_norm": 2.1337099075317383, "learning_rate": 9.67157562168773e-05, "loss": 3.3582, "step": 3870 }, { "epoch": 0.2632830547628754, "grad_norm": 2.8431849479675293, "learning_rate": 9.671150971599402e-05, "loss": 3.6454, "step": 3875 }, { "epoch": 0.2636227748335372, "grad_norm": 1.9242441654205322, "learning_rate": 9.670726321511075e-05, "loss": 3.6938, "step": 3880 }, { "epoch": 0.26396249490419893, "grad_norm": 2.4420218467712402, "learning_rate": 9.670301671422748e-05, "loss": 3.5834, "step": 3885 }, { "epoch": 0.2643022149748607, "grad_norm": 1.7619452476501465, "learning_rate": 9.669877021334421e-05, "loss": 3.663, "step": 3890 }, { "epoch": 0.26464193504552247, "grad_norm": 2.231281042098999, "learning_rate": 9.669452371246094e-05, "loss": 3.6955, "step": 3895 }, { "epoch": 0.26498165511618427, "grad_norm": 2.4212920665740967, "learning_rate": 9.669027721157766e-05, "loss": 3.3691, "step": 3900 }, { "epoch": 0.26532137518684606, "grad_norm": 2.491760730743408, "learning_rate": 9.668603071069439e-05, "loss": 3.8895, "step": 3905 }, { "epoch": 0.2656610952575078, "grad_norm": 2.7254176139831543, "learning_rate": 9.668178420981112e-05, "loss": 3.4169, "step": 3910 }, { "epoch": 0.2660008153281696, "grad_norm": 2.8254876136779785, "learning_rate": 9.667753770892785e-05, "loss": 3.7341, "step": 3915 }, { "epoch": 0.26634053539883135, "grad_norm": 2.8087313175201416, "learning_rate": 9.667329120804458e-05, "loss": 3.767, "step": 3920 }, { "epoch": 0.26668025546949314, "grad_norm": 2.340233564376831, "learning_rate": 9.66690447071613e-05, "loss": 3.5807, "step": 3925 }, { "epoch": 0.2670199755401549, "grad_norm": 2.023094654083252, "learning_rate": 9.666479820627803e-05, "loss": 3.3063, "step": 3930 }, { "epoch": 0.2673596956108167, "grad_norm": 2.119107246398926, "learning_rate": 9.666055170539476e-05, "loss": 3.6316, "step": 3935 }, { "epoch": 0.2676994156814785, "grad_norm": 1.8223646879196167, "learning_rate": 9.665630520451149e-05, "loss": 3.6356, "step": 3940 }, { "epoch": 0.2680391357521402, "grad_norm": 2.998859405517578, "learning_rate": 9.665205870362822e-05, "loss": 3.4163, "step": 3945 }, { "epoch": 0.268378855822802, "grad_norm": 2.333075761795044, "learning_rate": 9.664781220274495e-05, "loss": 3.782, "step": 3950 }, { "epoch": 0.26871857589346376, "grad_norm": 2.799476385116577, "learning_rate": 9.664356570186167e-05, "loss": 3.2197, "step": 3955 }, { "epoch": 0.26905829596412556, "grad_norm": 1.8839623928070068, "learning_rate": 9.66393192009784e-05, "loss": 3.6562, "step": 3960 }, { "epoch": 0.26939801603478736, "grad_norm": 2.6432347297668457, "learning_rate": 9.663507270009513e-05, "loss": 3.7665, "step": 3965 }, { "epoch": 0.2697377361054491, "grad_norm": 2.414016008377075, "learning_rate": 9.663082619921186e-05, "loss": 3.4374, "step": 3970 }, { "epoch": 0.2700774561761109, "grad_norm": 2.284450054168701, "learning_rate": 9.662657969832859e-05, "loss": 3.8625, "step": 3975 }, { "epoch": 0.27041717624677264, "grad_norm": 2.4175004959106445, "learning_rate": 9.662233319744531e-05, "loss": 3.6093, "step": 3980 }, { "epoch": 0.27075689631743444, "grad_norm": 2.2012200355529785, "learning_rate": 9.661808669656204e-05, "loss": 3.7269, "step": 3985 }, { "epoch": 0.27109661638809623, "grad_norm": 1.7872872352600098, "learning_rate": 9.661384019567876e-05, "loss": 3.5437, "step": 3990 }, { "epoch": 0.271436336458758, "grad_norm": 2.2658510208129883, "learning_rate": 9.66095936947955e-05, "loss": 3.5009, "step": 3995 }, { "epoch": 0.2717760565294198, "grad_norm": 2.562431812286377, "learning_rate": 9.660534719391223e-05, "loss": 3.8177, "step": 4000 }, { "epoch": 0.2721157766000815, "grad_norm": 2.3021233081817627, "learning_rate": 9.660110069302894e-05, "loss": 3.5899, "step": 4005 }, { "epoch": 0.2724554966707433, "grad_norm": 1.9640750885009766, "learning_rate": 9.659685419214568e-05, "loss": 3.4248, "step": 4010 }, { "epoch": 0.27279521674140506, "grad_norm": 2.4887568950653076, "learning_rate": 9.659260769126241e-05, "loss": 3.8734, "step": 4015 }, { "epoch": 0.27313493681206685, "grad_norm": 2.33853816986084, "learning_rate": 9.658836119037912e-05, "loss": 3.4048, "step": 4020 }, { "epoch": 0.27347465688272865, "grad_norm": 2.5136539936065674, "learning_rate": 9.658411468949587e-05, "loss": 3.4624, "step": 4025 }, { "epoch": 0.2738143769533904, "grad_norm": 2.3944242000579834, "learning_rate": 9.657986818861259e-05, "loss": 3.3688, "step": 4030 }, { "epoch": 0.2741540970240522, "grad_norm": 1.9791560173034668, "learning_rate": 9.657562168772931e-05, "loss": 3.6241, "step": 4035 }, { "epoch": 0.27449381709471393, "grad_norm": 2.2201993465423584, "learning_rate": 9.657137518684605e-05, "loss": 3.4826, "step": 4040 }, { "epoch": 0.27483353716537573, "grad_norm": 2.487506628036499, "learning_rate": 9.656712868596278e-05, "loss": 3.746, "step": 4045 }, { "epoch": 0.2751732572360375, "grad_norm": 2.5716309547424316, "learning_rate": 9.656288218507949e-05, "loss": 3.7071, "step": 4050 }, { "epoch": 0.27551297730669927, "grad_norm": 2.1051158905029297, "learning_rate": 9.655863568419623e-05, "loss": 3.5788, "step": 4055 }, { "epoch": 0.27585269737736107, "grad_norm": 2.3651576042175293, "learning_rate": 9.655438918331295e-05, "loss": 3.4922, "step": 4060 }, { "epoch": 0.2761924174480228, "grad_norm": 2.6557517051696777, "learning_rate": 9.655014268242968e-05, "loss": 3.4848, "step": 4065 }, { "epoch": 0.2765321375186846, "grad_norm": 2.4949405193328857, "learning_rate": 9.654589618154642e-05, "loss": 3.3538, "step": 4070 }, { "epoch": 0.2768718575893464, "grad_norm": 4.292568206787109, "learning_rate": 9.654164968066313e-05, "loss": 3.6778, "step": 4075 }, { "epoch": 0.27721157766000815, "grad_norm": 2.119539499282837, "learning_rate": 9.653740317977986e-05, "loss": 3.608, "step": 4080 }, { "epoch": 0.27755129773066994, "grad_norm": 2.371422290802002, "learning_rate": 9.65331566788966e-05, "loss": 3.2927, "step": 4085 }, { "epoch": 0.2778910178013317, "grad_norm": 1.9895979166030884, "learning_rate": 9.652891017801332e-05, "loss": 3.4853, "step": 4090 }, { "epoch": 0.2782307378719935, "grad_norm": 2.7541933059692383, "learning_rate": 9.652466367713004e-05, "loss": 3.554, "step": 4095 }, { "epoch": 0.2785704579426552, "grad_norm": 2.2593939304351807, "learning_rate": 9.652041717624679e-05, "loss": 3.6386, "step": 4100 }, { "epoch": 0.278910178013317, "grad_norm": 2.2123637199401855, "learning_rate": 9.65161706753635e-05, "loss": 3.43, "step": 4105 }, { "epoch": 0.2792498980839788, "grad_norm": 2.282215118408203, "learning_rate": 9.651192417448023e-05, "loss": 3.6618, "step": 4110 }, { "epoch": 0.27958961815464056, "grad_norm": 1.8907086849212646, "learning_rate": 9.650767767359697e-05, "loss": 3.5053, "step": 4115 }, { "epoch": 0.27992933822530236, "grad_norm": 2.011336326599121, "learning_rate": 9.650343117271368e-05, "loss": 3.5264, "step": 4120 }, { "epoch": 0.2802690582959641, "grad_norm": 2.488196849822998, "learning_rate": 9.649918467183041e-05, "loss": 3.4374, "step": 4125 }, { "epoch": 0.2806087783666259, "grad_norm": 2.4004621505737305, "learning_rate": 9.649493817094715e-05, "loss": 3.5411, "step": 4130 }, { "epoch": 0.2809484984372877, "grad_norm": 1.8012967109680176, "learning_rate": 9.649069167006387e-05, "loss": 3.4135, "step": 4135 }, { "epoch": 0.28128821850794944, "grad_norm": 2.6300768852233887, "learning_rate": 9.64864451691806e-05, "loss": 3.4507, "step": 4140 }, { "epoch": 0.28162793857861124, "grad_norm": 2.403141498565674, "learning_rate": 9.648219866829732e-05, "loss": 3.5553, "step": 4145 }, { "epoch": 0.281967658649273, "grad_norm": 2.485696792602539, "learning_rate": 9.647795216741405e-05, "loss": 3.7829, "step": 4150 }, { "epoch": 0.2823073787199348, "grad_norm": 2.2314913272857666, "learning_rate": 9.647370566653078e-05, "loss": 3.7152, "step": 4155 }, { "epoch": 0.2826470987905966, "grad_norm": 2.596282958984375, "learning_rate": 9.646945916564751e-05, "loss": 3.6471, "step": 4160 }, { "epoch": 0.2829868188612583, "grad_norm": 2.3855040073394775, "learning_rate": 9.646521266476424e-05, "loss": 3.3684, "step": 4165 }, { "epoch": 0.2833265389319201, "grad_norm": 1.9615055322647095, "learning_rate": 9.646096616388096e-05, "loss": 3.4021, "step": 4170 }, { "epoch": 0.28366625900258186, "grad_norm": 2.558335781097412, "learning_rate": 9.645671966299769e-05, "loss": 3.6088, "step": 4175 }, { "epoch": 0.28400597907324365, "grad_norm": 2.681771993637085, "learning_rate": 9.645247316211442e-05, "loss": 3.2109, "step": 4180 }, { "epoch": 0.2843456991439054, "grad_norm": 2.2063992023468018, "learning_rate": 9.644822666123115e-05, "loss": 3.6736, "step": 4185 }, { "epoch": 0.2846854192145672, "grad_norm": 2.680952310562134, "learning_rate": 9.644398016034788e-05, "loss": 3.301, "step": 4190 }, { "epoch": 0.285025139285229, "grad_norm": 2.5799989700317383, "learning_rate": 9.64397336594646e-05, "loss": 3.6096, "step": 4195 }, { "epoch": 0.28536485935589073, "grad_norm": 2.01418137550354, "learning_rate": 9.643548715858135e-05, "loss": 3.5222, "step": 4200 }, { "epoch": 0.28570457942655253, "grad_norm": 1.9510987997055054, "learning_rate": 9.643124065769806e-05, "loss": 3.6441, "step": 4205 }, { "epoch": 0.28604429949721427, "grad_norm": 2.36946439743042, "learning_rate": 9.642699415681479e-05, "loss": 3.5422, "step": 4210 }, { "epoch": 0.28638401956787607, "grad_norm": 2.337088108062744, "learning_rate": 9.642274765593152e-05, "loss": 3.53, "step": 4215 }, { "epoch": 0.28672373963853787, "grad_norm": 2.494980573654175, "learning_rate": 9.641850115504824e-05, "loss": 3.7402, "step": 4220 }, { "epoch": 0.2870634597091996, "grad_norm": 2.3410887718200684, "learning_rate": 9.641425465416497e-05, "loss": 3.7098, "step": 4225 }, { "epoch": 0.2874031797798614, "grad_norm": 2.20373272895813, "learning_rate": 9.64100081532817e-05, "loss": 3.6799, "step": 4230 }, { "epoch": 0.28774289985052315, "grad_norm": 2.4828975200653076, "learning_rate": 9.640576165239843e-05, "loss": 3.396, "step": 4235 }, { "epoch": 0.28808261992118495, "grad_norm": 2.2657108306884766, "learning_rate": 9.640151515151516e-05, "loss": 3.7589, "step": 4240 }, { "epoch": 0.28842233999184674, "grad_norm": 2.0548276901245117, "learning_rate": 9.639726865063188e-05, "loss": 3.6342, "step": 4245 }, { "epoch": 0.2887620600625085, "grad_norm": 2.6554222106933594, "learning_rate": 9.639302214974861e-05, "loss": 3.4329, "step": 4250 }, { "epoch": 0.2891017801331703, "grad_norm": 1.936296820640564, "learning_rate": 9.638877564886534e-05, "loss": 3.4031, "step": 4255 }, { "epoch": 0.289441500203832, "grad_norm": 2.6779937744140625, "learning_rate": 9.638452914798207e-05, "loss": 3.7708, "step": 4260 }, { "epoch": 0.2897812202744938, "grad_norm": 2.238558292388916, "learning_rate": 9.63802826470988e-05, "loss": 3.4513, "step": 4265 }, { "epoch": 0.29012094034515556, "grad_norm": 2.370130777359009, "learning_rate": 9.637603614621552e-05, "loss": 3.4908, "step": 4270 }, { "epoch": 0.29046066041581736, "grad_norm": 2.2709288597106934, "learning_rate": 9.637178964533225e-05, "loss": 3.6773, "step": 4275 }, { "epoch": 0.29080038048647916, "grad_norm": 3.465778112411499, "learning_rate": 9.636754314444898e-05, "loss": 3.4743, "step": 4280 }, { "epoch": 0.2911401005571409, "grad_norm": 1.926073431968689, "learning_rate": 9.636329664356571e-05, "loss": 3.7204, "step": 4285 }, { "epoch": 0.2914798206278027, "grad_norm": 2.610278606414795, "learning_rate": 9.635905014268244e-05, "loss": 3.4249, "step": 4290 }, { "epoch": 0.29181954069846444, "grad_norm": 2.490473508834839, "learning_rate": 9.635480364179916e-05, "loss": 3.7892, "step": 4295 }, { "epoch": 0.29215926076912624, "grad_norm": 2.551053524017334, "learning_rate": 9.635055714091589e-05, "loss": 3.4143, "step": 4300 }, { "epoch": 0.29249898083978804, "grad_norm": 2.625108242034912, "learning_rate": 9.634631064003262e-05, "loss": 3.2754, "step": 4305 }, { "epoch": 0.2928387009104498, "grad_norm": 2.3275227546691895, "learning_rate": 9.634206413914935e-05, "loss": 3.6869, "step": 4310 }, { "epoch": 0.2931784209811116, "grad_norm": 2.8762521743774414, "learning_rate": 9.633781763826608e-05, "loss": 3.2725, "step": 4315 }, { "epoch": 0.2935181410517733, "grad_norm": 2.3196375370025635, "learning_rate": 9.63335711373828e-05, "loss": 3.4897, "step": 4320 }, { "epoch": 0.2938578611224351, "grad_norm": 2.0370218753814697, "learning_rate": 9.632932463649953e-05, "loss": 3.4089, "step": 4325 }, { "epoch": 0.2941975811930969, "grad_norm": 2.1044533252716064, "learning_rate": 9.632507813561626e-05, "loss": 3.5402, "step": 4330 }, { "epoch": 0.29453730126375866, "grad_norm": 2.2539141178131104, "learning_rate": 9.632083163473299e-05, "loss": 3.7155, "step": 4335 }, { "epoch": 0.29487702133442045, "grad_norm": 2.5959103107452393, "learning_rate": 9.631658513384972e-05, "loss": 3.4873, "step": 4340 }, { "epoch": 0.2952167414050822, "grad_norm": 2.0770015716552734, "learning_rate": 9.631233863296643e-05, "loss": 3.0496, "step": 4345 }, { "epoch": 0.295556461475744, "grad_norm": 2.2833893299102783, "learning_rate": 9.630809213208317e-05, "loss": 3.4672, "step": 4350 }, { "epoch": 0.29589618154640573, "grad_norm": 2.616764545440674, "learning_rate": 9.63038456311999e-05, "loss": 3.5965, "step": 4355 }, { "epoch": 0.29623590161706753, "grad_norm": 2.0542845726013184, "learning_rate": 9.629959913031661e-05, "loss": 3.3098, "step": 4360 }, { "epoch": 0.29657562168772933, "grad_norm": 2.2543091773986816, "learning_rate": 9.629535262943336e-05, "loss": 3.7651, "step": 4365 }, { "epoch": 0.29691534175839107, "grad_norm": 2.0595855712890625, "learning_rate": 9.629110612855008e-05, "loss": 3.6352, "step": 4370 }, { "epoch": 0.29725506182905287, "grad_norm": 2.7353787422180176, "learning_rate": 9.62868596276668e-05, "loss": 3.348, "step": 4375 }, { "epoch": 0.2975947818997146, "grad_norm": 2.508931875228882, "learning_rate": 9.628261312678354e-05, "loss": 3.5119, "step": 4380 }, { "epoch": 0.2979345019703764, "grad_norm": 1.9135644435882568, "learning_rate": 9.627836662590027e-05, "loss": 3.5481, "step": 4385 }, { "epoch": 0.2982742220410382, "grad_norm": 2.5715854167938232, "learning_rate": 9.627412012501698e-05, "loss": 3.7529, "step": 4390 }, { "epoch": 0.29861394211169995, "grad_norm": 2.355053424835205, "learning_rate": 9.626987362413372e-05, "loss": 3.7664, "step": 4395 }, { "epoch": 0.29895366218236175, "grad_norm": 2.377655506134033, "learning_rate": 9.626562712325045e-05, "loss": 3.5368, "step": 4400 }, { "epoch": 0.2992933822530235, "grad_norm": 2.6739094257354736, "learning_rate": 9.626138062236717e-05, "loss": 3.5576, "step": 4405 }, { "epoch": 0.2996331023236853, "grad_norm": 2.7493178844451904, "learning_rate": 9.625713412148391e-05, "loss": 3.6849, "step": 4410 }, { "epoch": 0.2999728223943471, "grad_norm": 2.9010813236236572, "learning_rate": 9.625288762060062e-05, "loss": 3.7381, "step": 4415 }, { "epoch": 0.3003125424650088, "grad_norm": 3.4917478561401367, "learning_rate": 9.624864111971735e-05, "loss": 3.5783, "step": 4420 }, { "epoch": 0.3006522625356706, "grad_norm": 2.0238993167877197, "learning_rate": 9.624439461883409e-05, "loss": 3.7146, "step": 4425 }, { "epoch": 0.30099198260633236, "grad_norm": 2.591813802719116, "learning_rate": 9.62401481179508e-05, "loss": 3.7996, "step": 4430 }, { "epoch": 0.30133170267699416, "grad_norm": 3.253753185272217, "learning_rate": 9.623590161706753e-05, "loss": 3.8351, "step": 4435 }, { "epoch": 0.3016714227476559, "grad_norm": 2.0914976596832275, "learning_rate": 9.623165511618428e-05, "loss": 3.4595, "step": 4440 }, { "epoch": 0.3020111428183177, "grad_norm": 2.1238136291503906, "learning_rate": 9.622740861530099e-05, "loss": 3.5304, "step": 4445 }, { "epoch": 0.3023508628889795, "grad_norm": 2.066178798675537, "learning_rate": 9.622316211441772e-05, "loss": 3.545, "step": 4450 }, { "epoch": 0.30269058295964124, "grad_norm": 1.8286415338516235, "learning_rate": 9.621891561353446e-05, "loss": 3.3081, "step": 4455 }, { "epoch": 0.30303030303030304, "grad_norm": 2.3473880290985107, "learning_rate": 9.621466911265117e-05, "loss": 3.4117, "step": 4460 }, { "epoch": 0.3033700231009648, "grad_norm": 2.1917498111724854, "learning_rate": 9.62104226117679e-05, "loss": 3.5267, "step": 4465 }, { "epoch": 0.3037097431716266, "grad_norm": 2.225980758666992, "learning_rate": 9.620617611088464e-05, "loss": 3.3304, "step": 4470 }, { "epoch": 0.3040494632422884, "grad_norm": 2.079214334487915, "learning_rate": 9.620192961000136e-05, "loss": 3.5595, "step": 4475 }, { "epoch": 0.3043891833129501, "grad_norm": 2.7024800777435303, "learning_rate": 9.619768310911809e-05, "loss": 3.6907, "step": 4480 }, { "epoch": 0.3047289033836119, "grad_norm": 2.1574583053588867, "learning_rate": 9.619343660823483e-05, "loss": 3.3908, "step": 4485 }, { "epoch": 0.30506862345427366, "grad_norm": 2.0015764236450195, "learning_rate": 9.618919010735154e-05, "loss": 3.859, "step": 4490 }, { "epoch": 0.30540834352493546, "grad_norm": 3.2876758575439453, "learning_rate": 9.618494360646827e-05, "loss": 3.5404, "step": 4495 }, { "epoch": 0.30574806359559725, "grad_norm": 2.3011112213134766, "learning_rate": 9.6180697105585e-05, "loss": 3.5167, "step": 4500 }, { "epoch": 0.306087783666259, "grad_norm": 4.733278274536133, "learning_rate": 9.617645060470173e-05, "loss": 3.3622, "step": 4505 }, { "epoch": 0.3064275037369208, "grad_norm": 2.109318256378174, "learning_rate": 9.617220410381845e-05, "loss": 3.9048, "step": 4510 }, { "epoch": 0.30676722380758253, "grad_norm": 1.850805640220642, "learning_rate": 9.616795760293518e-05, "loss": 3.6222, "step": 4515 }, { "epoch": 0.30710694387824433, "grad_norm": 2.6362383365631104, "learning_rate": 9.616371110205191e-05, "loss": 3.6315, "step": 4520 }, { "epoch": 0.3074466639489061, "grad_norm": 2.3985159397125244, "learning_rate": 9.615946460116864e-05, "loss": 3.3994, "step": 4525 }, { "epoch": 0.30778638401956787, "grad_norm": 3.1595442295074463, "learning_rate": 9.615521810028537e-05, "loss": 3.6236, "step": 4530 }, { "epoch": 0.30812610409022967, "grad_norm": 2.6055166721343994, "learning_rate": 9.61509715994021e-05, "loss": 3.399, "step": 4535 }, { "epoch": 0.3084658241608914, "grad_norm": 2.2961208820343018, "learning_rate": 9.614672509851884e-05, "loss": 3.4636, "step": 4540 }, { "epoch": 0.3088055442315532, "grad_norm": 2.6362643241882324, "learning_rate": 9.614247859763555e-05, "loss": 3.3003, "step": 4545 }, { "epoch": 0.30914526430221495, "grad_norm": 2.211743116378784, "learning_rate": 9.613823209675228e-05, "loss": 3.4525, "step": 4550 }, { "epoch": 0.30948498437287675, "grad_norm": 1.7058196067810059, "learning_rate": 9.613398559586902e-05, "loss": 3.6407, "step": 4555 }, { "epoch": 0.30982470444353855, "grad_norm": 1.867773413658142, "learning_rate": 9.612973909498573e-05, "loss": 3.5318, "step": 4560 }, { "epoch": 0.3101644245142003, "grad_norm": 2.837249994277954, "learning_rate": 9.612549259410246e-05, "loss": 3.5919, "step": 4565 }, { "epoch": 0.3105041445848621, "grad_norm": 3.0847020149230957, "learning_rate": 9.612124609321919e-05, "loss": 3.6143, "step": 4570 }, { "epoch": 0.3108438646555238, "grad_norm": 2.0619595050811768, "learning_rate": 9.611699959233592e-05, "loss": 3.2918, "step": 4575 }, { "epoch": 0.3111835847261856, "grad_norm": 2.6687724590301514, "learning_rate": 9.611275309145265e-05, "loss": 3.7714, "step": 4580 }, { "epoch": 0.3115233047968474, "grad_norm": 2.254484176635742, "learning_rate": 9.610850659056937e-05, "loss": 3.6149, "step": 4585 }, { "epoch": 0.31186302486750916, "grad_norm": 2.5946383476257324, "learning_rate": 9.61042600896861e-05, "loss": 3.5097, "step": 4590 }, { "epoch": 0.31220274493817096, "grad_norm": 2.005279064178467, "learning_rate": 9.610001358880283e-05, "loss": 3.5782, "step": 4595 }, { "epoch": 0.3125424650088327, "grad_norm": 2.2426390647888184, "learning_rate": 9.609576708791956e-05, "loss": 3.6735, "step": 4600 }, { "epoch": 0.3128821850794945, "grad_norm": 2.1684017181396484, "learning_rate": 9.609152058703629e-05, "loss": 3.6251, "step": 4605 }, { "epoch": 0.3132219051501563, "grad_norm": 2.69171142578125, "learning_rate": 9.608727408615301e-05, "loss": 3.519, "step": 4610 }, { "epoch": 0.31356162522081804, "grad_norm": 2.0534961223602295, "learning_rate": 9.608302758526974e-05, "loss": 3.4568, "step": 4615 }, { "epoch": 0.31390134529147984, "grad_norm": 2.2909598350524902, "learning_rate": 9.607878108438647e-05, "loss": 3.741, "step": 4620 }, { "epoch": 0.3142410653621416, "grad_norm": 1.7981642484664917, "learning_rate": 9.60745345835032e-05, "loss": 3.4916, "step": 4625 }, { "epoch": 0.3145807854328034, "grad_norm": 1.9211735725402832, "learning_rate": 9.607028808261993e-05, "loss": 3.3815, "step": 4630 }, { "epoch": 0.3149205055034651, "grad_norm": 2.814077854156494, "learning_rate": 9.606604158173665e-05, "loss": 3.4764, "step": 4635 }, { "epoch": 0.3152602255741269, "grad_norm": 2.218183994293213, "learning_rate": 9.606179508085338e-05, "loss": 3.6787, "step": 4640 }, { "epoch": 0.3155999456447887, "grad_norm": 2.0759880542755127, "learning_rate": 9.605754857997011e-05, "loss": 3.5326, "step": 4645 }, { "epoch": 0.31593966571545046, "grad_norm": 2.1081411838531494, "learning_rate": 9.605330207908684e-05, "loss": 3.4549, "step": 4650 }, { "epoch": 0.31627938578611225, "grad_norm": 2.7850899696350098, "learning_rate": 9.604905557820357e-05, "loss": 3.4321, "step": 4655 }, { "epoch": 0.316619105856774, "grad_norm": 2.095616340637207, "learning_rate": 9.60448090773203e-05, "loss": 3.7483, "step": 4660 }, { "epoch": 0.3169588259274358, "grad_norm": 3.430894613265991, "learning_rate": 9.604056257643702e-05, "loss": 3.5369, "step": 4665 }, { "epoch": 0.3172985459980976, "grad_norm": 2.5829780101776123, "learning_rate": 9.603631607555375e-05, "loss": 3.8033, "step": 4670 }, { "epoch": 0.31763826606875933, "grad_norm": 2.3534040451049805, "learning_rate": 9.603206957467048e-05, "loss": 3.63, "step": 4675 }, { "epoch": 0.31797798613942113, "grad_norm": 1.994956612586975, "learning_rate": 9.602782307378721e-05, "loss": 3.4428, "step": 4680 }, { "epoch": 0.3183177062100829, "grad_norm": 1.761325716972351, "learning_rate": 9.602357657290394e-05, "loss": 3.6485, "step": 4685 }, { "epoch": 0.31865742628074467, "grad_norm": 2.204400062561035, "learning_rate": 9.601933007202066e-05, "loss": 3.7453, "step": 4690 }, { "epoch": 0.31899714635140647, "grad_norm": 1.9333000183105469, "learning_rate": 9.601508357113739e-05, "loss": 3.6396, "step": 4695 }, { "epoch": 0.3193368664220682, "grad_norm": 2.2336275577545166, "learning_rate": 9.60108370702541e-05, "loss": 3.5145, "step": 4700 }, { "epoch": 0.31967658649273, "grad_norm": 2.0712366104125977, "learning_rate": 9.600659056937085e-05, "loss": 3.408, "step": 4705 }, { "epoch": 0.32001630656339175, "grad_norm": 2.6495025157928467, "learning_rate": 9.600234406848758e-05, "loss": 3.3555, "step": 4710 }, { "epoch": 0.32035602663405355, "grad_norm": 2.1428282260894775, "learning_rate": 9.599809756760429e-05, "loss": 3.5858, "step": 4715 }, { "epoch": 0.3206957467047153, "grad_norm": 2.094815254211426, "learning_rate": 9.599385106672103e-05, "loss": 3.6793, "step": 4720 }, { "epoch": 0.3210354667753771, "grad_norm": 2.002483606338501, "learning_rate": 9.598960456583776e-05, "loss": 3.6505, "step": 4725 }, { "epoch": 0.3213751868460389, "grad_norm": 2.252927541732788, "learning_rate": 9.598535806495447e-05, "loss": 3.4, "step": 4730 }, { "epoch": 0.3217149069167006, "grad_norm": 2.516890287399292, "learning_rate": 9.598111156407122e-05, "loss": 3.2724, "step": 4735 }, { "epoch": 0.3220546269873624, "grad_norm": 2.3295066356658936, "learning_rate": 9.597686506318794e-05, "loss": 3.3916, "step": 4740 }, { "epoch": 0.32239434705802417, "grad_norm": 2.1854660511016846, "learning_rate": 9.597261856230466e-05, "loss": 3.3465, "step": 4745 }, { "epoch": 0.32273406712868596, "grad_norm": 2.062574863433838, "learning_rate": 9.59683720614214e-05, "loss": 3.4885, "step": 4750 }, { "epoch": 0.32307378719934776, "grad_norm": 2.5735292434692383, "learning_rate": 9.596412556053813e-05, "loss": 3.671, "step": 4755 }, { "epoch": 0.3234135072700095, "grad_norm": 2.2522685527801514, "learning_rate": 9.595987905965484e-05, "loss": 3.3854, "step": 4760 }, { "epoch": 0.3237532273406713, "grad_norm": 2.4699902534484863, "learning_rate": 9.595563255877158e-05, "loss": 3.588, "step": 4765 }, { "epoch": 0.32409294741133304, "grad_norm": 2.5795655250549316, "learning_rate": 9.59513860578883e-05, "loss": 3.5708, "step": 4770 }, { "epoch": 0.32443266748199484, "grad_norm": 2.078995943069458, "learning_rate": 9.594713955700503e-05, "loss": 3.3525, "step": 4775 }, { "epoch": 0.32477238755265664, "grad_norm": 1.9790847301483154, "learning_rate": 9.594289305612177e-05, "loss": 3.626, "step": 4780 }, { "epoch": 0.3251121076233184, "grad_norm": 2.1064956188201904, "learning_rate": 9.593864655523848e-05, "loss": 3.5161, "step": 4785 }, { "epoch": 0.3254518276939802, "grad_norm": 2.2212953567504883, "learning_rate": 9.593440005435521e-05, "loss": 3.3063, "step": 4790 }, { "epoch": 0.3257915477646419, "grad_norm": 1.9809656143188477, "learning_rate": 9.593015355347195e-05, "loss": 3.5678, "step": 4795 }, { "epoch": 0.3261312678353037, "grad_norm": 2.668142080307007, "learning_rate": 9.592590705258867e-05, "loss": 3.3486, "step": 4800 }, { "epoch": 0.32647098790596546, "grad_norm": 2.9718236923217773, "learning_rate": 9.59216605517054e-05, "loss": 3.3339, "step": 4805 }, { "epoch": 0.32681070797662726, "grad_norm": 2.3413827419281006, "learning_rate": 9.591741405082214e-05, "loss": 3.765, "step": 4810 }, { "epoch": 0.32715042804728905, "grad_norm": 2.590475559234619, "learning_rate": 9.591316754993885e-05, "loss": 3.3701, "step": 4815 }, { "epoch": 0.3274901481179508, "grad_norm": 3.03633975982666, "learning_rate": 9.590892104905558e-05, "loss": 3.6036, "step": 4820 }, { "epoch": 0.3278298681886126, "grad_norm": 1.715158224105835, "learning_rate": 9.590467454817232e-05, "loss": 3.6644, "step": 4825 }, { "epoch": 0.32816958825927434, "grad_norm": 2.420332193374634, "learning_rate": 9.590042804728903e-05, "loss": 3.5532, "step": 4830 }, { "epoch": 0.32850930832993613, "grad_norm": 2.3807718753814697, "learning_rate": 9.589618154640576e-05, "loss": 3.5792, "step": 4835 }, { "epoch": 0.32884902840059793, "grad_norm": 2.3275177478790283, "learning_rate": 9.589193504552249e-05, "loss": 3.7666, "step": 4840 }, { "epoch": 0.3291887484712597, "grad_norm": 2.4652233123779297, "learning_rate": 9.588768854463922e-05, "loss": 3.6853, "step": 4845 }, { "epoch": 0.32952846854192147, "grad_norm": 3.2501232624053955, "learning_rate": 9.588344204375595e-05, "loss": 3.5766, "step": 4850 }, { "epoch": 0.3298681886125832, "grad_norm": 3.4538049697875977, "learning_rate": 9.587919554287267e-05, "loss": 3.6567, "step": 4855 }, { "epoch": 0.330207908683245, "grad_norm": 2.496765375137329, "learning_rate": 9.58749490419894e-05, "loss": 3.2606, "step": 4860 }, { "epoch": 0.3305476287539068, "grad_norm": 2.3711674213409424, "learning_rate": 9.587070254110613e-05, "loss": 3.6201, "step": 4865 }, { "epoch": 0.33088734882456855, "grad_norm": 2.1462559700012207, "learning_rate": 9.586645604022286e-05, "loss": 3.769, "step": 4870 }, { "epoch": 0.33122706889523035, "grad_norm": 2.1831979751586914, "learning_rate": 9.586220953933959e-05, "loss": 3.2845, "step": 4875 }, { "epoch": 0.3315667889658921, "grad_norm": 2.3417398929595947, "learning_rate": 9.585796303845633e-05, "loss": 3.2206, "step": 4880 }, { "epoch": 0.3319065090365539, "grad_norm": 1.9240666627883911, "learning_rate": 9.585371653757304e-05, "loss": 3.6608, "step": 4885 }, { "epoch": 0.33224622910721563, "grad_norm": 2.5419371128082275, "learning_rate": 9.584947003668977e-05, "loss": 3.5532, "step": 4890 }, { "epoch": 0.3325859491778774, "grad_norm": 2.2226459980010986, "learning_rate": 9.584522353580651e-05, "loss": 3.7632, "step": 4895 }, { "epoch": 0.3329256692485392, "grad_norm": 2.428110122680664, "learning_rate": 9.584097703492323e-05, "loss": 3.5621, "step": 4900 }, { "epoch": 0.33326538931920097, "grad_norm": 2.192072868347168, "learning_rate": 9.583673053403995e-05, "loss": 3.4916, "step": 4905 }, { "epoch": 0.33360510938986276, "grad_norm": 1.971107006072998, "learning_rate": 9.58324840331567e-05, "loss": 3.7121, "step": 4910 }, { "epoch": 0.3339448294605245, "grad_norm": 2.5853402614593506, "learning_rate": 9.582823753227341e-05, "loss": 3.617, "step": 4915 }, { "epoch": 0.3342845495311863, "grad_norm": 2.0271685123443604, "learning_rate": 9.582399103139014e-05, "loss": 3.3627, "step": 4920 }, { "epoch": 0.3346242696018481, "grad_norm": 2.1375293731689453, "learning_rate": 9.581974453050687e-05, "loss": 3.5635, "step": 4925 }, { "epoch": 0.33496398967250984, "grad_norm": 2.15071702003479, "learning_rate": 9.58154980296236e-05, "loss": 4.0459, "step": 4930 }, { "epoch": 0.33530370974317164, "grad_norm": 2.625150680541992, "learning_rate": 9.581125152874032e-05, "loss": 3.4702, "step": 4935 }, { "epoch": 0.3356434298138334, "grad_norm": 2.021744728088379, "learning_rate": 9.580700502785705e-05, "loss": 3.4294, "step": 4940 }, { "epoch": 0.3359831498844952, "grad_norm": 2.2073516845703125, "learning_rate": 9.580275852697378e-05, "loss": 3.6793, "step": 4945 }, { "epoch": 0.336322869955157, "grad_norm": 2.4387354850769043, "learning_rate": 9.57985120260905e-05, "loss": 3.5551, "step": 4950 }, { "epoch": 0.3366625900258187, "grad_norm": 1.6607073545455933, "learning_rate": 9.579426552520723e-05, "loss": 3.639, "step": 4955 }, { "epoch": 0.3370023100964805, "grad_norm": 2.0000860691070557, "learning_rate": 9.579001902432396e-05, "loss": 3.3364, "step": 4960 }, { "epoch": 0.33734203016714226, "grad_norm": 1.7590445280075073, "learning_rate": 9.578577252344069e-05, "loss": 3.5949, "step": 4965 }, { "epoch": 0.33768175023780406, "grad_norm": 2.9744365215301514, "learning_rate": 9.578152602255742e-05, "loss": 3.3676, "step": 4970 }, { "epoch": 0.3380214703084658, "grad_norm": 3.2376041412353516, "learning_rate": 9.577727952167415e-05, "loss": 3.5237, "step": 4975 }, { "epoch": 0.3383611903791276, "grad_norm": 2.650528907775879, "learning_rate": 9.577303302079087e-05, "loss": 3.5577, "step": 4980 }, { "epoch": 0.3387009104497894, "grad_norm": 2.194971799850464, "learning_rate": 9.57687865199076e-05, "loss": 3.464, "step": 4985 }, { "epoch": 0.33904063052045114, "grad_norm": 3.3826820850372314, "learning_rate": 9.576454001902433e-05, "loss": 3.6076, "step": 4990 }, { "epoch": 0.33938035059111293, "grad_norm": 2.0753471851348877, "learning_rate": 9.576029351814106e-05, "loss": 3.679, "step": 4995 }, { "epoch": 0.3397200706617747, "grad_norm": 2.0594770908355713, "learning_rate": 9.575604701725779e-05, "loss": 3.4702, "step": 5000 }, { "epoch": 0.3400597907324365, "grad_norm": 2.103978157043457, "learning_rate": 9.575180051637451e-05, "loss": 3.7303, "step": 5005 }, { "epoch": 0.34039951080309827, "grad_norm": 2.249831199645996, "learning_rate": 9.574755401549124e-05, "loss": 3.3897, "step": 5010 }, { "epoch": 0.34073923087376, "grad_norm": 2.216661214828491, "learning_rate": 9.574330751460797e-05, "loss": 3.3432, "step": 5015 }, { "epoch": 0.3410789509444218, "grad_norm": 2.2671594619750977, "learning_rate": 9.57390610137247e-05, "loss": 3.2468, "step": 5020 }, { "epoch": 0.34141867101508355, "grad_norm": 2.5372536182403564, "learning_rate": 9.573481451284143e-05, "loss": 3.2482, "step": 5025 }, { "epoch": 0.34175839108574535, "grad_norm": 2.581930637359619, "learning_rate": 9.573056801195815e-05, "loss": 3.3906, "step": 5030 }, { "epoch": 0.34209811115640715, "grad_norm": 2.497941493988037, "learning_rate": 9.572632151107488e-05, "loss": 3.2897, "step": 5035 }, { "epoch": 0.3424378312270689, "grad_norm": 2.802440881729126, "learning_rate": 9.57220750101916e-05, "loss": 3.3894, "step": 5040 }, { "epoch": 0.3427775512977307, "grad_norm": 2.4767963886260986, "learning_rate": 9.571782850930834e-05, "loss": 3.597, "step": 5045 }, { "epoch": 0.34311727136839243, "grad_norm": 2.506659507751465, "learning_rate": 9.571358200842507e-05, "loss": 3.5804, "step": 5050 }, { "epoch": 0.3434569914390542, "grad_norm": 2.0703656673431396, "learning_rate": 9.570933550754178e-05, "loss": 3.7641, "step": 5055 }, { "epoch": 0.34379671150971597, "grad_norm": 3.0426642894744873, "learning_rate": 9.570508900665852e-05, "loss": 3.3222, "step": 5060 }, { "epoch": 0.34413643158037777, "grad_norm": 1.9872699975967407, "learning_rate": 9.570084250577525e-05, "loss": 3.2471, "step": 5065 }, { "epoch": 0.34447615165103956, "grad_norm": 2.0045838356018066, "learning_rate": 9.569659600489196e-05, "loss": 3.4617, "step": 5070 }, { "epoch": 0.3448158717217013, "grad_norm": 2.3296327590942383, "learning_rate": 9.56923495040087e-05, "loss": 3.4086, "step": 5075 }, { "epoch": 0.3451555917923631, "grad_norm": 2.673248291015625, "learning_rate": 9.568810300312543e-05, "loss": 3.5193, "step": 5080 }, { "epoch": 0.34549531186302485, "grad_norm": 2.696136713027954, "learning_rate": 9.568385650224215e-05, "loss": 3.3428, "step": 5085 }, { "epoch": 0.34583503193368664, "grad_norm": 4.353693008422852, "learning_rate": 9.567961000135889e-05, "loss": 3.3181, "step": 5090 }, { "epoch": 0.34617475200434844, "grad_norm": 2.4072203636169434, "learning_rate": 9.567536350047562e-05, "loss": 3.6271, "step": 5095 }, { "epoch": 0.3465144720750102, "grad_norm": 2.293489933013916, "learning_rate": 9.567111699959233e-05, "loss": 3.291, "step": 5100 }, { "epoch": 0.346854192145672, "grad_norm": 1.891231656074524, "learning_rate": 9.566687049870907e-05, "loss": 3.4596, "step": 5105 }, { "epoch": 0.3471939122163337, "grad_norm": 2.0043277740478516, "learning_rate": 9.56626239978258e-05, "loss": 3.6103, "step": 5110 }, { "epoch": 0.3475336322869955, "grad_norm": 2.2673497200012207, "learning_rate": 9.565837749694252e-05, "loss": 3.5595, "step": 5115 }, { "epoch": 0.3478733523576573, "grad_norm": 2.216912031173706, "learning_rate": 9.565413099605926e-05, "loss": 3.461, "step": 5120 }, { "epoch": 0.34821307242831906, "grad_norm": 2.1620354652404785, "learning_rate": 9.564988449517597e-05, "loss": 3.6213, "step": 5125 }, { "epoch": 0.34855279249898086, "grad_norm": 2.035184383392334, "learning_rate": 9.56456379942927e-05, "loss": 3.389, "step": 5130 }, { "epoch": 0.3488925125696426, "grad_norm": 2.0516014099121094, "learning_rate": 9.564139149340944e-05, "loss": 3.3663, "step": 5135 }, { "epoch": 0.3492322326403044, "grad_norm": 1.7857179641723633, "learning_rate": 9.563714499252616e-05, "loss": 3.4649, "step": 5140 }, { "epoch": 0.34957195271096614, "grad_norm": 2.283069133758545, "learning_rate": 9.563289849164288e-05, "loss": 3.2298, "step": 5145 }, { "epoch": 0.34991167278162794, "grad_norm": 2.4059247970581055, "learning_rate": 9.562865199075963e-05, "loss": 3.6538, "step": 5150 }, { "epoch": 0.35025139285228973, "grad_norm": 1.8156726360321045, "learning_rate": 9.562440548987634e-05, "loss": 3.4423, "step": 5155 }, { "epoch": 0.3505911129229515, "grad_norm": 1.9691323041915894, "learning_rate": 9.562015898899307e-05, "loss": 3.6515, "step": 5160 }, { "epoch": 0.3509308329936133, "grad_norm": 1.8808988332748413, "learning_rate": 9.561591248810981e-05, "loss": 3.4709, "step": 5165 }, { "epoch": 0.351270553064275, "grad_norm": 2.020207405090332, "learning_rate": 9.561166598722652e-05, "loss": 3.4481, "step": 5170 }, { "epoch": 0.3516102731349368, "grad_norm": 2.298236846923828, "learning_rate": 9.560741948634325e-05, "loss": 3.2875, "step": 5175 }, { "epoch": 0.3519499932055986, "grad_norm": 3.182771921157837, "learning_rate": 9.560317298546e-05, "loss": 3.6108, "step": 5180 }, { "epoch": 0.35228971327626035, "grad_norm": 2.5850470066070557, "learning_rate": 9.559892648457671e-05, "loss": 3.3886, "step": 5185 }, { "epoch": 0.35262943334692215, "grad_norm": 2.092212438583374, "learning_rate": 9.559467998369344e-05, "loss": 3.7107, "step": 5190 }, { "epoch": 0.3529691534175839, "grad_norm": 2.263517141342163, "learning_rate": 9.559043348281016e-05, "loss": 3.4494, "step": 5195 }, { "epoch": 0.3533088734882457, "grad_norm": 2.3949036598205566, "learning_rate": 9.558618698192689e-05, "loss": 3.5598, "step": 5200 }, { "epoch": 0.3536485935589075, "grad_norm": 2.4407613277435303, "learning_rate": 9.558194048104362e-05, "loss": 3.366, "step": 5205 }, { "epoch": 0.35398831362956923, "grad_norm": 3.042372226715088, "learning_rate": 9.557769398016035e-05, "loss": 3.4785, "step": 5210 }, { "epoch": 0.354328033700231, "grad_norm": 2.424865484237671, "learning_rate": 9.557344747927708e-05, "loss": 3.6114, "step": 5215 }, { "epoch": 0.35466775377089277, "grad_norm": 2.8207926750183105, "learning_rate": 9.556920097839382e-05, "loss": 3.6909, "step": 5220 }, { "epoch": 0.35500747384155457, "grad_norm": 2.065338373184204, "learning_rate": 9.556495447751053e-05, "loss": 3.4274, "step": 5225 }, { "epoch": 0.3553471939122163, "grad_norm": 1.9987508058547974, "learning_rate": 9.556070797662726e-05, "loss": 3.5197, "step": 5230 }, { "epoch": 0.3556869139828781, "grad_norm": 2.4522342681884766, "learning_rate": 9.5556461475744e-05, "loss": 3.2803, "step": 5235 }, { "epoch": 0.3560266340535399, "grad_norm": 1.7566057443618774, "learning_rate": 9.555221497486072e-05, "loss": 3.5834, "step": 5240 }, { "epoch": 0.35636635412420165, "grad_norm": 2.4280521869659424, "learning_rate": 9.554796847397744e-05, "loss": 3.4192, "step": 5245 }, { "epoch": 0.35670607419486344, "grad_norm": 2.3163771629333496, "learning_rate": 9.554372197309419e-05, "loss": 3.7487, "step": 5250 }, { "epoch": 0.3570457942655252, "grad_norm": 1.7652122974395752, "learning_rate": 9.55394754722109e-05, "loss": 3.3039, "step": 5255 }, { "epoch": 0.357385514336187, "grad_norm": 2.1827664375305176, "learning_rate": 9.553522897132763e-05, "loss": 3.3813, "step": 5260 }, { "epoch": 0.3577252344068488, "grad_norm": 2.1868386268615723, "learning_rate": 9.553098247044436e-05, "loss": 3.2241, "step": 5265 }, { "epoch": 0.3580649544775105, "grad_norm": 2.2805964946746826, "learning_rate": 9.552673596956108e-05, "loss": 3.4951, "step": 5270 }, { "epoch": 0.3584046745481723, "grad_norm": 2.293192148208618, "learning_rate": 9.552248946867781e-05, "loss": 3.6963, "step": 5275 }, { "epoch": 0.35874439461883406, "grad_norm": 1.9848803281784058, "learning_rate": 9.551824296779454e-05, "loss": 3.5111, "step": 5280 }, { "epoch": 0.35908411468949586, "grad_norm": 2.6939401626586914, "learning_rate": 9.551399646691127e-05, "loss": 3.6023, "step": 5285 }, { "epoch": 0.35942383476015766, "grad_norm": 2.6907401084899902, "learning_rate": 9.5509749966028e-05, "loss": 3.572, "step": 5290 }, { "epoch": 0.3597635548308194, "grad_norm": 2.4348137378692627, "learning_rate": 9.550550346514472e-05, "loss": 3.7261, "step": 5295 }, { "epoch": 0.3601032749014812, "grad_norm": 2.1113297939300537, "learning_rate": 9.550125696426145e-05, "loss": 3.6574, "step": 5300 }, { "epoch": 0.36044299497214294, "grad_norm": 2.00673508644104, "learning_rate": 9.549701046337818e-05, "loss": 3.6794, "step": 5305 }, { "epoch": 0.36078271504280474, "grad_norm": 1.7474217414855957, "learning_rate": 9.549276396249491e-05, "loss": 3.7086, "step": 5310 }, { "epoch": 0.3611224351134665, "grad_norm": 1.9775686264038086, "learning_rate": 9.548851746161164e-05, "loss": 3.761, "step": 5315 }, { "epoch": 0.3614621551841283, "grad_norm": 2.300579071044922, "learning_rate": 9.548427096072836e-05, "loss": 3.4078, "step": 5320 }, { "epoch": 0.3618018752547901, "grad_norm": 2.7633800506591797, "learning_rate": 9.548002445984509e-05, "loss": 3.7536, "step": 5325 }, { "epoch": 0.3621415953254518, "grad_norm": 2.896941900253296, "learning_rate": 9.547577795896182e-05, "loss": 3.5601, "step": 5330 }, { "epoch": 0.3624813153961136, "grad_norm": 2.7240138053894043, "learning_rate": 9.547153145807855e-05, "loss": 3.8074, "step": 5335 }, { "epoch": 0.36282103546677535, "grad_norm": 2.397312879562378, "learning_rate": 9.546728495719528e-05, "loss": 3.4732, "step": 5340 }, { "epoch": 0.36316075553743715, "grad_norm": 3.0793724060058594, "learning_rate": 9.5463038456312e-05, "loss": 3.3193, "step": 5345 }, { "epoch": 0.36350047560809895, "grad_norm": 2.4621636867523193, "learning_rate": 9.545879195542873e-05, "loss": 3.4898, "step": 5350 }, { "epoch": 0.3638401956787607, "grad_norm": 1.9295380115509033, "learning_rate": 9.545454545454546e-05, "loss": 3.2048, "step": 5355 }, { "epoch": 0.3641799157494225, "grad_norm": 2.3588459491729736, "learning_rate": 9.545029895366219e-05, "loss": 3.4091, "step": 5360 }, { "epoch": 0.36451963582008423, "grad_norm": 2.6944656372070312, "learning_rate": 9.544605245277892e-05, "loss": 3.5929, "step": 5365 }, { "epoch": 0.36485935589074603, "grad_norm": 2.7152786254882812, "learning_rate": 9.544180595189564e-05, "loss": 3.3166, "step": 5370 }, { "epoch": 0.3651990759614078, "grad_norm": 2.3631365299224854, "learning_rate": 9.543755945101237e-05, "loss": 3.4736, "step": 5375 }, { "epoch": 0.36553879603206957, "grad_norm": 2.3416121006011963, "learning_rate": 9.54333129501291e-05, "loss": 3.3867, "step": 5380 }, { "epoch": 0.36587851610273137, "grad_norm": 2.008406639099121, "learning_rate": 9.542906644924583e-05, "loss": 3.5575, "step": 5385 }, { "epoch": 0.3662182361733931, "grad_norm": 2.2432711124420166, "learning_rate": 9.542481994836256e-05, "loss": 3.458, "step": 5390 }, { "epoch": 0.3665579562440549, "grad_norm": 2.037553310394287, "learning_rate": 9.542057344747927e-05, "loss": 3.4037, "step": 5395 }, { "epoch": 0.36689767631471665, "grad_norm": 1.8744285106658936, "learning_rate": 9.541632694659601e-05, "loss": 3.5522, "step": 5400 }, { "epoch": 0.36723739638537845, "grad_norm": 2.1985297203063965, "learning_rate": 9.541208044571274e-05, "loss": 3.4464, "step": 5405 }, { "epoch": 0.36757711645604024, "grad_norm": 2.1987900733947754, "learning_rate": 9.540783394482946e-05, "loss": 3.3822, "step": 5410 }, { "epoch": 0.367916836526702, "grad_norm": 2.3122260570526123, "learning_rate": 9.54035874439462e-05, "loss": 3.5572, "step": 5415 }, { "epoch": 0.3682565565973638, "grad_norm": 1.7977787256240845, "learning_rate": 9.539934094306293e-05, "loss": 3.4209, "step": 5420 }, { "epoch": 0.3685962766680255, "grad_norm": 2.850813627243042, "learning_rate": 9.539509444217964e-05, "loss": 3.3063, "step": 5425 }, { "epoch": 0.3689359967386873, "grad_norm": 2.0067622661590576, "learning_rate": 9.539084794129638e-05, "loss": 3.5631, "step": 5430 }, { "epoch": 0.3692757168093491, "grad_norm": 2.521108627319336, "learning_rate": 9.538660144041311e-05, "loss": 3.4673, "step": 5435 }, { "epoch": 0.36961543688001086, "grad_norm": 2.341568946838379, "learning_rate": 9.538235493952982e-05, "loss": 3.3754, "step": 5440 }, { "epoch": 0.36995515695067266, "grad_norm": 1.9837868213653564, "learning_rate": 9.537895773882322e-05, "loss": 3.5923, "step": 5445 }, { "epoch": 0.3702948770213344, "grad_norm": 2.3855340480804443, "learning_rate": 9.537471123793993e-05, "loss": 3.3687, "step": 5450 }, { "epoch": 0.3706345970919962, "grad_norm": 2.099226713180542, "learning_rate": 9.537046473705668e-05, "loss": 3.7035, "step": 5455 }, { "epoch": 0.370974317162658, "grad_norm": 2.2473480701446533, "learning_rate": 9.53662182361734e-05, "loss": 3.3972, "step": 5460 }, { "epoch": 0.37131403723331974, "grad_norm": 2.3042585849761963, "learning_rate": 9.536197173529012e-05, "loss": 3.3928, "step": 5465 }, { "epoch": 0.37165375730398154, "grad_norm": 2.688464879989624, "learning_rate": 9.535772523440686e-05, "loss": 3.6379, "step": 5470 }, { "epoch": 0.3719934773746433, "grad_norm": 2.2118613719940186, "learning_rate": 9.535347873352359e-05, "loss": 3.6294, "step": 5475 }, { "epoch": 0.3723331974453051, "grad_norm": 1.9588185548782349, "learning_rate": 9.53492322326403e-05, "loss": 3.6541, "step": 5480 }, { "epoch": 0.3726729175159668, "grad_norm": 2.7694761753082275, "learning_rate": 9.534498573175704e-05, "loss": 3.8579, "step": 5485 }, { "epoch": 0.3730126375866286, "grad_norm": 2.9138429164886475, "learning_rate": 9.534073923087377e-05, "loss": 3.4537, "step": 5490 }, { "epoch": 0.3733523576572904, "grad_norm": 2.030308485031128, "learning_rate": 9.533649272999049e-05, "loss": 3.5327, "step": 5495 }, { "epoch": 0.37369207772795215, "grad_norm": 2.130218029022217, "learning_rate": 9.533224622910723e-05, "loss": 3.5519, "step": 5500 }, { "epoch": 0.37403179779861395, "grad_norm": 2.3214871883392334, "learning_rate": 9.532799972822396e-05, "loss": 3.6258, "step": 5505 }, { "epoch": 0.3743715178692757, "grad_norm": 2.158155918121338, "learning_rate": 9.532375322734067e-05, "loss": 3.3948, "step": 5510 }, { "epoch": 0.3747112379399375, "grad_norm": 2.628941059112549, "learning_rate": 9.531950672645741e-05, "loss": 3.5988, "step": 5515 }, { "epoch": 0.3750509580105993, "grad_norm": 1.88633131980896, "learning_rate": 9.531526022557413e-05, "loss": 3.2646, "step": 5520 }, { "epoch": 0.37539067808126103, "grad_norm": 2.2778618335723877, "learning_rate": 9.531101372469085e-05, "loss": 3.4196, "step": 5525 }, { "epoch": 0.37573039815192283, "grad_norm": 2.4018216133117676, "learning_rate": 9.53067672238076e-05, "loss": 3.4266, "step": 5530 }, { "epoch": 0.37607011822258457, "grad_norm": 1.891074538230896, "learning_rate": 9.530252072292431e-05, "loss": 3.8662, "step": 5535 }, { "epoch": 0.37640983829324637, "grad_norm": 2.3034353256225586, "learning_rate": 9.529827422204104e-05, "loss": 3.4704, "step": 5540 }, { "epoch": 0.37674955836390817, "grad_norm": 2.0530219078063965, "learning_rate": 9.529402772115778e-05, "loss": 3.8118, "step": 5545 }, { "epoch": 0.3770892784345699, "grad_norm": 1.7378071546554565, "learning_rate": 9.52897812202745e-05, "loss": 3.7089, "step": 5550 }, { "epoch": 0.3774289985052317, "grad_norm": 4.47321891784668, "learning_rate": 9.528553471939122e-05, "loss": 3.7129, "step": 5555 }, { "epoch": 0.37776871857589345, "grad_norm": 2.6330199241638184, "learning_rate": 9.528128821850796e-05, "loss": 3.6119, "step": 5560 }, { "epoch": 0.37810843864655524, "grad_norm": 2.302823305130005, "learning_rate": 9.527704171762468e-05, "loss": 3.3904, "step": 5565 }, { "epoch": 0.378448158717217, "grad_norm": 2.3510868549346924, "learning_rate": 9.52727952167414e-05, "loss": 3.6345, "step": 5570 }, { "epoch": 0.3787878787878788, "grad_norm": 1.9679723978042603, "learning_rate": 9.526854871585815e-05, "loss": 3.3001, "step": 5575 }, { "epoch": 0.3791275988585406, "grad_norm": 2.5521187782287598, "learning_rate": 9.526430221497486e-05, "loss": 3.3881, "step": 5580 }, { "epoch": 0.3794673189292023, "grad_norm": 2.2444393634796143, "learning_rate": 9.526005571409159e-05, "loss": 3.4638, "step": 5585 }, { "epoch": 0.3798070389998641, "grad_norm": 2.0025293827056885, "learning_rate": 9.525580921320832e-05, "loss": 3.2738, "step": 5590 }, { "epoch": 0.38014675907052586, "grad_norm": 2.1449875831604004, "learning_rate": 9.525156271232505e-05, "loss": 3.508, "step": 5595 }, { "epoch": 0.38048647914118766, "grad_norm": 2.309112071990967, "learning_rate": 9.524731621144177e-05, "loss": 3.7207, "step": 5600 }, { "epoch": 0.38082619921184946, "grad_norm": 2.2592954635620117, "learning_rate": 9.52430697105585e-05, "loss": 3.5663, "step": 5605 }, { "epoch": 0.3811659192825112, "grad_norm": 2.173708915710449, "learning_rate": 9.523882320967523e-05, "loss": 3.3756, "step": 5610 }, { "epoch": 0.381505639353173, "grad_norm": 2.0735418796539307, "learning_rate": 9.523457670879196e-05, "loss": 3.3884, "step": 5615 }, { "epoch": 0.38184535942383474, "grad_norm": 1.907126784324646, "learning_rate": 9.523033020790869e-05, "loss": 3.4768, "step": 5620 }, { "epoch": 0.38218507949449654, "grad_norm": 2.386720895767212, "learning_rate": 9.522608370702541e-05, "loss": 3.4243, "step": 5625 }, { "epoch": 0.38252479956515834, "grad_norm": 2.549302339553833, "learning_rate": 9.522183720614214e-05, "loss": 3.6601, "step": 5630 }, { "epoch": 0.3828645196358201, "grad_norm": 2.2185428142547607, "learning_rate": 9.521759070525887e-05, "loss": 3.2892, "step": 5635 }, { "epoch": 0.3832042397064819, "grad_norm": 2.2634174823760986, "learning_rate": 9.52133442043756e-05, "loss": 3.5779, "step": 5640 }, { "epoch": 0.3835439597771436, "grad_norm": 2.3982715606689453, "learning_rate": 9.520909770349233e-05, "loss": 3.5883, "step": 5645 }, { "epoch": 0.3838836798478054, "grad_norm": 2.4622321128845215, "learning_rate": 9.520485120260905e-05, "loss": 3.3979, "step": 5650 }, { "epoch": 0.38422339991846716, "grad_norm": 2.497669219970703, "learning_rate": 9.520060470172578e-05, "loss": 3.7132, "step": 5655 }, { "epoch": 0.38456311998912895, "grad_norm": 1.95124089717865, "learning_rate": 9.519635820084251e-05, "loss": 3.4086, "step": 5660 }, { "epoch": 0.38490284005979075, "grad_norm": 2.267400026321411, "learning_rate": 9.519211169995924e-05, "loss": 3.4912, "step": 5665 }, { "epoch": 0.3852425601304525, "grad_norm": 2.9256820678710938, "learning_rate": 9.518786519907597e-05, "loss": 3.457, "step": 5670 }, { "epoch": 0.3855822802011143, "grad_norm": 2.201944351196289, "learning_rate": 9.51836186981927e-05, "loss": 3.418, "step": 5675 }, { "epoch": 0.38592200027177603, "grad_norm": 2.2088499069213867, "learning_rate": 9.517937219730942e-05, "loss": 3.4102, "step": 5680 }, { "epoch": 0.38626172034243783, "grad_norm": 2.034724712371826, "learning_rate": 9.517512569642615e-05, "loss": 3.4246, "step": 5685 }, { "epoch": 0.38660144041309963, "grad_norm": 2.6734964847564697, "learning_rate": 9.517087919554288e-05, "loss": 3.6603, "step": 5690 }, { "epoch": 0.38694116048376137, "grad_norm": 2.102245569229126, "learning_rate": 9.51666326946596e-05, "loss": 3.3954, "step": 5695 }, { "epoch": 0.38728088055442317, "grad_norm": 2.590301036834717, "learning_rate": 9.516238619377633e-05, "loss": 3.5354, "step": 5700 }, { "epoch": 0.3876206006250849, "grad_norm": 2.55387544631958, "learning_rate": 9.515813969289306e-05, "loss": 3.4349, "step": 5705 }, { "epoch": 0.3879603206957467, "grad_norm": 2.1026203632354736, "learning_rate": 9.515389319200979e-05, "loss": 3.5727, "step": 5710 }, { "epoch": 0.3883000407664085, "grad_norm": 2.344355583190918, "learning_rate": 9.514964669112652e-05, "loss": 3.5671, "step": 5715 }, { "epoch": 0.38863976083707025, "grad_norm": 2.415301561355591, "learning_rate": 9.514540019024323e-05, "loss": 3.5547, "step": 5720 }, { "epoch": 0.38897948090773204, "grad_norm": 2.3402092456817627, "learning_rate": 9.514115368935997e-05, "loss": 3.6248, "step": 5725 }, { "epoch": 0.3893192009783938, "grad_norm": 5.19041633605957, "learning_rate": 9.51369071884767e-05, "loss": 3.5613, "step": 5730 }, { "epoch": 0.3896589210490556, "grad_norm": 2.601196527481079, "learning_rate": 9.513266068759342e-05, "loss": 3.5092, "step": 5735 }, { "epoch": 0.3899986411197173, "grad_norm": 2.2364370822906494, "learning_rate": 9.512841418671016e-05, "loss": 3.7385, "step": 5740 }, { "epoch": 0.3903383611903791, "grad_norm": 2.4521467685699463, "learning_rate": 9.512416768582689e-05, "loss": 3.5398, "step": 5745 }, { "epoch": 0.3906780812610409, "grad_norm": 2.12900710105896, "learning_rate": 9.51199211849436e-05, "loss": 3.5456, "step": 5750 }, { "epoch": 0.39101780133170266, "grad_norm": 2.26472544670105, "learning_rate": 9.511567468406034e-05, "loss": 3.6027, "step": 5755 }, { "epoch": 0.39135752140236446, "grad_norm": 2.145358085632324, "learning_rate": 9.511142818317707e-05, "loss": 3.6173, "step": 5760 }, { "epoch": 0.3916972414730262, "grad_norm": 2.2755930423736572, "learning_rate": 9.51071816822938e-05, "loss": 3.3401, "step": 5765 }, { "epoch": 0.392036961543688, "grad_norm": 2.128591537475586, "learning_rate": 9.510293518141053e-05, "loss": 3.5998, "step": 5770 }, { "epoch": 0.3923766816143498, "grad_norm": 2.3939907550811768, "learning_rate": 9.509868868052725e-05, "loss": 3.5838, "step": 5775 }, { "epoch": 0.39271640168501154, "grad_norm": 2.3246493339538574, "learning_rate": 9.509444217964398e-05, "loss": 3.239, "step": 5780 }, { "epoch": 0.39305612175567334, "grad_norm": 2.4405014514923096, "learning_rate": 9.509019567876071e-05, "loss": 3.5871, "step": 5785 }, { "epoch": 0.3933958418263351, "grad_norm": 2.5543372631073, "learning_rate": 9.508594917787742e-05, "loss": 3.7152, "step": 5790 }, { "epoch": 0.3937355618969969, "grad_norm": 2.156559705734253, "learning_rate": 9.508170267699417e-05, "loss": 3.4728, "step": 5795 }, { "epoch": 0.3940752819676587, "grad_norm": 2.7907774448394775, "learning_rate": 9.50774561761109e-05, "loss": 3.6774, "step": 5800 }, { "epoch": 0.3944150020383204, "grad_norm": 2.349163770675659, "learning_rate": 9.507320967522761e-05, "loss": 3.6738, "step": 5805 }, { "epoch": 0.3947547221089822, "grad_norm": 2.1579880714416504, "learning_rate": 9.506896317434435e-05, "loss": 3.5, "step": 5810 }, { "epoch": 0.39509444217964396, "grad_norm": 2.3118982315063477, "learning_rate": 9.506471667346108e-05, "loss": 3.1216, "step": 5815 }, { "epoch": 0.39543416225030575, "grad_norm": 2.0580344200134277, "learning_rate": 9.506047017257779e-05, "loss": 3.728, "step": 5820 }, { "epoch": 0.3957738823209675, "grad_norm": 2.1640231609344482, "learning_rate": 9.505622367169453e-05, "loss": 3.607, "step": 5825 }, { "epoch": 0.3961136023916293, "grad_norm": 2.620502233505249, "learning_rate": 9.505197717081126e-05, "loss": 3.2637, "step": 5830 }, { "epoch": 0.3964533224622911, "grad_norm": 2.107973098754883, "learning_rate": 9.504773066992798e-05, "loss": 3.6636, "step": 5835 }, { "epoch": 0.39679304253295283, "grad_norm": 2.1638150215148926, "learning_rate": 9.504348416904472e-05, "loss": 3.4899, "step": 5840 }, { "epoch": 0.39713276260361463, "grad_norm": 2.3276584148406982, "learning_rate": 9.503923766816145e-05, "loss": 3.4387, "step": 5845 }, { "epoch": 0.3974724826742764, "grad_norm": 2.4550554752349854, "learning_rate": 9.503499116727816e-05, "loss": 3.3838, "step": 5850 }, { "epoch": 0.39781220274493817, "grad_norm": 2.314365863800049, "learning_rate": 9.50307446663949e-05, "loss": 3.5617, "step": 5855 }, { "epoch": 0.39815192281559997, "grad_norm": 1.8994426727294922, "learning_rate": 9.502649816551162e-05, "loss": 3.5274, "step": 5860 }, { "epoch": 0.3984916428862617, "grad_norm": 1.8744722604751587, "learning_rate": 9.502225166462835e-05, "loss": 3.7462, "step": 5865 }, { "epoch": 0.3988313629569235, "grad_norm": 2.5736844539642334, "learning_rate": 9.501800516374509e-05, "loss": 3.4497, "step": 5870 }, { "epoch": 0.39917108302758525, "grad_norm": 1.7807859182357788, "learning_rate": 9.50137586628618e-05, "loss": 3.2401, "step": 5875 }, { "epoch": 0.39951080309824705, "grad_norm": 2.3960299491882324, "learning_rate": 9.500951216197853e-05, "loss": 3.5057, "step": 5880 }, { "epoch": 0.39985052316890884, "grad_norm": 2.1863772869110107, "learning_rate": 9.500526566109527e-05, "loss": 3.6953, "step": 5885 }, { "epoch": 0.4001902432395706, "grad_norm": 1.8718239068984985, "learning_rate": 9.500101916021199e-05, "loss": 3.5513, "step": 5890 }, { "epoch": 0.4005299633102324, "grad_norm": 2.161097288131714, "learning_rate": 9.499677265932871e-05, "loss": 3.2804, "step": 5895 }, { "epoch": 0.4008696833808941, "grad_norm": 2.724787712097168, "learning_rate": 9.499252615844545e-05, "loss": 3.4583, "step": 5900 }, { "epoch": 0.4012094034515559, "grad_norm": 2.079878330230713, "learning_rate": 9.498827965756217e-05, "loss": 3.569, "step": 5905 }, { "epoch": 0.40154912352221767, "grad_norm": 3.1078429222106934, "learning_rate": 9.49840331566789e-05, "loss": 3.6336, "step": 5910 }, { "epoch": 0.40188884359287946, "grad_norm": 2.195901393890381, "learning_rate": 9.497978665579564e-05, "loss": 3.6751, "step": 5915 }, { "epoch": 0.40222856366354126, "grad_norm": 3.3410584926605225, "learning_rate": 9.497554015491235e-05, "loss": 3.6466, "step": 5920 }, { "epoch": 0.402568283734203, "grad_norm": 1.914284348487854, "learning_rate": 9.497129365402908e-05, "loss": 3.433, "step": 5925 }, { "epoch": 0.4029080038048648, "grad_norm": 2.947977304458618, "learning_rate": 9.496704715314582e-05, "loss": 3.5166, "step": 5930 }, { "epoch": 0.40324772387552654, "grad_norm": 2.164297103881836, "learning_rate": 9.496280065226254e-05, "loss": 3.7164, "step": 5935 }, { "epoch": 0.40358744394618834, "grad_norm": 2.2850894927978516, "learning_rate": 9.495855415137927e-05, "loss": 3.4645, "step": 5940 }, { "epoch": 0.40392716401685014, "grad_norm": 2.1502060890197754, "learning_rate": 9.495430765049599e-05, "loss": 3.4292, "step": 5945 }, { "epoch": 0.4042668840875119, "grad_norm": 1.9091222286224365, "learning_rate": 9.495006114961272e-05, "loss": 3.7687, "step": 5950 }, { "epoch": 0.4046066041581737, "grad_norm": 1.9365631341934204, "learning_rate": 9.494581464872945e-05, "loss": 3.5556, "step": 5955 }, { "epoch": 0.4049463242288354, "grad_norm": 2.008934736251831, "learning_rate": 9.494156814784618e-05, "loss": 3.4835, "step": 5960 }, { "epoch": 0.4052860442994972, "grad_norm": 2.1476967334747314, "learning_rate": 9.49373216469629e-05, "loss": 3.5901, "step": 5965 }, { "epoch": 0.405625764370159, "grad_norm": 2.3163695335388184, "learning_rate": 9.493307514607963e-05, "loss": 3.7873, "step": 5970 }, { "epoch": 0.40596548444082076, "grad_norm": 2.0577564239501953, "learning_rate": 9.492882864519636e-05, "loss": 3.3, "step": 5975 }, { "epoch": 0.40630520451148255, "grad_norm": 2.55914568901062, "learning_rate": 9.492458214431309e-05, "loss": 3.5086, "step": 5980 }, { "epoch": 0.4066449245821443, "grad_norm": 1.9641376733779907, "learning_rate": 9.492033564342982e-05, "loss": 3.3942, "step": 5985 }, { "epoch": 0.4069846446528061, "grad_norm": 2.674025297164917, "learning_rate": 9.491608914254655e-05, "loss": 3.7973, "step": 5990 }, { "epoch": 0.40732436472346784, "grad_norm": 2.183528184890747, "learning_rate": 9.491184264166327e-05, "loss": 3.5388, "step": 5995 }, { "epoch": 0.40766408479412963, "grad_norm": 2.0168113708496094, "learning_rate": 9.490759614078e-05, "loss": 3.4047, "step": 6000 }, { "epoch": 0.40800380486479143, "grad_norm": 2.4202096462249756, "learning_rate": 9.490334963989673e-05, "loss": 3.5396, "step": 6005 }, { "epoch": 0.4083435249354532, "grad_norm": 2.2292206287384033, "learning_rate": 9.489910313901346e-05, "loss": 3.5008, "step": 6010 }, { "epoch": 0.40868324500611497, "grad_norm": 2.375166416168213, "learning_rate": 9.489485663813019e-05, "loss": 3.5598, "step": 6015 }, { "epoch": 0.4090229650767767, "grad_norm": 2.449183225631714, "learning_rate": 9.489061013724691e-05, "loss": 3.5098, "step": 6020 }, { "epoch": 0.4093626851474385, "grad_norm": 2.4351933002471924, "learning_rate": 9.488636363636364e-05, "loss": 3.3801, "step": 6025 }, { "epoch": 0.4097024052181003, "grad_norm": 2.2387006282806396, "learning_rate": 9.488211713548037e-05, "loss": 3.3374, "step": 6030 }, { "epoch": 0.41004212528876205, "grad_norm": 2.148315906524658, "learning_rate": 9.48778706345971e-05, "loss": 3.3949, "step": 6035 }, { "epoch": 0.41038184535942385, "grad_norm": 3.2371878623962402, "learning_rate": 9.487362413371383e-05, "loss": 3.5393, "step": 6040 }, { "epoch": 0.4107215654300856, "grad_norm": 2.1698648929595947, "learning_rate": 9.486937763283055e-05, "loss": 3.4602, "step": 6045 }, { "epoch": 0.4110612855007474, "grad_norm": 2.2219765186309814, "learning_rate": 9.486513113194728e-05, "loss": 3.5677, "step": 6050 }, { "epoch": 0.4114010055714092, "grad_norm": 2.333155632019043, "learning_rate": 9.486088463106401e-05, "loss": 3.2506, "step": 6055 }, { "epoch": 0.4117407256420709, "grad_norm": 2.4780113697052, "learning_rate": 9.485663813018072e-05, "loss": 3.3625, "step": 6060 }, { "epoch": 0.4120804457127327, "grad_norm": 3.1555166244506836, "learning_rate": 9.485239162929747e-05, "loss": 3.6358, "step": 6065 }, { "epoch": 0.41242016578339447, "grad_norm": 2.722064971923828, "learning_rate": 9.48481451284142e-05, "loss": 3.7669, "step": 6070 }, { "epoch": 0.41275988585405626, "grad_norm": 2.3357059955596924, "learning_rate": 9.484389862753091e-05, "loss": 3.1026, "step": 6075 }, { "epoch": 0.413099605924718, "grad_norm": 2.924870491027832, "learning_rate": 9.483965212664765e-05, "loss": 3.6327, "step": 6080 }, { "epoch": 0.4134393259953798, "grad_norm": 2.819397211074829, "learning_rate": 9.483540562576438e-05, "loss": 3.4774, "step": 6085 }, { "epoch": 0.4137790460660416, "grad_norm": 2.039072275161743, "learning_rate": 9.483115912488109e-05, "loss": 3.6857, "step": 6090 }, { "epoch": 0.41411876613670334, "grad_norm": 2.581530809402466, "learning_rate": 9.482691262399783e-05, "loss": 3.474, "step": 6095 }, { "epoch": 0.41445848620736514, "grad_norm": 2.0482287406921387, "learning_rate": 9.482266612311456e-05, "loss": 3.4328, "step": 6100 }, { "epoch": 0.4147982062780269, "grad_norm": 2.2304725646972656, "learning_rate": 9.481841962223129e-05, "loss": 3.3463, "step": 6105 }, { "epoch": 0.4151379263486887, "grad_norm": 2.487243413925171, "learning_rate": 9.481417312134802e-05, "loss": 3.4985, "step": 6110 }, { "epoch": 0.4154776464193505, "grad_norm": 3.1797499656677246, "learning_rate": 9.480992662046475e-05, "loss": 3.4223, "step": 6115 }, { "epoch": 0.4158173664900122, "grad_norm": 2.1688737869262695, "learning_rate": 9.480568011958147e-05, "loss": 3.7536, "step": 6120 }, { "epoch": 0.416157086560674, "grad_norm": 2.112556219100952, "learning_rate": 9.48014336186982e-05, "loss": 3.5606, "step": 6125 }, { "epoch": 0.41649680663133576, "grad_norm": 1.8179993629455566, "learning_rate": 9.479718711781493e-05, "loss": 3.3376, "step": 6130 }, { "epoch": 0.41683652670199756, "grad_norm": 2.383026599884033, "learning_rate": 9.479294061693166e-05, "loss": 3.4151, "step": 6135 }, { "epoch": 0.41717624677265935, "grad_norm": 2.0007266998291016, "learning_rate": 9.478869411604839e-05, "loss": 3.5658, "step": 6140 }, { "epoch": 0.4175159668433211, "grad_norm": 2.4773383140563965, "learning_rate": 9.47844476151651e-05, "loss": 3.6117, "step": 6145 }, { "epoch": 0.4178556869139829, "grad_norm": 3.078857898712158, "learning_rate": 9.478020111428184e-05, "loss": 3.6391, "step": 6150 }, { "epoch": 0.41819540698464464, "grad_norm": 2.535287618637085, "learning_rate": 9.477595461339857e-05, "loss": 3.3409, "step": 6155 }, { "epoch": 0.41853512705530643, "grad_norm": 2.0010769367218018, "learning_rate": 9.477170811251528e-05, "loss": 3.4942, "step": 6160 }, { "epoch": 0.4188748471259682, "grad_norm": 2.3958261013031006, "learning_rate": 9.476746161163203e-05, "loss": 3.7077, "step": 6165 }, { "epoch": 0.41921456719663, "grad_norm": 2.1570017337799072, "learning_rate": 9.476321511074875e-05, "loss": 3.6067, "step": 6170 }, { "epoch": 0.41955428726729177, "grad_norm": 2.6229751110076904, "learning_rate": 9.475896860986547e-05, "loss": 3.5907, "step": 6175 }, { "epoch": 0.4198940073379535, "grad_norm": 2.0216729640960693, "learning_rate": 9.475472210898221e-05, "loss": 3.4843, "step": 6180 }, { "epoch": 0.4202337274086153, "grad_norm": 2.4097788333892822, "learning_rate": 9.475047560809894e-05, "loss": 3.4321, "step": 6185 }, { "epoch": 0.42057344747927705, "grad_norm": 2.1403191089630127, "learning_rate": 9.474622910721565e-05, "loss": 3.3428, "step": 6190 }, { "epoch": 0.42091316754993885, "grad_norm": 1.7724568843841553, "learning_rate": 9.47419826063324e-05, "loss": 3.5977, "step": 6195 }, { "epoch": 0.42125288762060065, "grad_norm": 1.9858791828155518, "learning_rate": 9.473773610544912e-05, "loss": 3.5343, "step": 6200 }, { "epoch": 0.4215926076912624, "grad_norm": 2.3160152435302734, "learning_rate": 9.473348960456584e-05, "loss": 3.4312, "step": 6205 }, { "epoch": 0.4219323277619242, "grad_norm": 2.1834471225738525, "learning_rate": 9.472924310368258e-05, "loss": 3.4977, "step": 6210 }, { "epoch": 0.42227204783258593, "grad_norm": 2.7510712146759033, "learning_rate": 9.472499660279929e-05, "loss": 3.215, "step": 6215 }, { "epoch": 0.4226117679032477, "grad_norm": 2.227858066558838, "learning_rate": 9.472075010191602e-05, "loss": 3.5091, "step": 6220 }, { "epoch": 0.4229514879739095, "grad_norm": 2.0628483295440674, "learning_rate": 9.471650360103276e-05, "loss": 3.3493, "step": 6225 }, { "epoch": 0.42329120804457127, "grad_norm": 2.113569974899292, "learning_rate": 9.471225710014948e-05, "loss": 3.4865, "step": 6230 }, { "epoch": 0.42363092811523306, "grad_norm": 1.8367078304290771, "learning_rate": 9.47080105992662e-05, "loss": 3.5712, "step": 6235 }, { "epoch": 0.4239706481858948, "grad_norm": 2.3989691734313965, "learning_rate": 9.470376409838295e-05, "loss": 3.4337, "step": 6240 }, { "epoch": 0.4243103682565566, "grad_norm": 2.3639042377471924, "learning_rate": 9.469951759749966e-05, "loss": 3.4743, "step": 6245 }, { "epoch": 0.42465008832721834, "grad_norm": 2.052133560180664, "learning_rate": 9.469527109661639e-05, "loss": 3.3761, "step": 6250 }, { "epoch": 0.42498980839788014, "grad_norm": 2.2083051204681396, "learning_rate": 9.469102459573313e-05, "loss": 3.4967, "step": 6255 }, { "epoch": 0.42532952846854194, "grad_norm": 2.2909905910491943, "learning_rate": 9.468677809484984e-05, "loss": 3.1705, "step": 6260 }, { "epoch": 0.4256692485392037, "grad_norm": 1.757896900177002, "learning_rate": 9.468253159396657e-05, "loss": 3.1889, "step": 6265 }, { "epoch": 0.4260089686098655, "grad_norm": 2.757215738296509, "learning_rate": 9.467828509308331e-05, "loss": 3.631, "step": 6270 }, { "epoch": 0.4263486886805272, "grad_norm": 2.077026844024658, "learning_rate": 9.467403859220003e-05, "loss": 3.3514, "step": 6275 }, { "epoch": 0.426688408751189, "grad_norm": 2.645669460296631, "learning_rate": 9.466979209131676e-05, "loss": 3.4541, "step": 6280 }, { "epoch": 0.4270281288218508, "grad_norm": 2.870403528213501, "learning_rate": 9.466554559043348e-05, "loss": 3.6692, "step": 6285 }, { "epoch": 0.42736784889251256, "grad_norm": 2.545874834060669, "learning_rate": 9.466129908955021e-05, "loss": 3.4684, "step": 6290 }, { "epoch": 0.42770756896317436, "grad_norm": 1.9612118005752563, "learning_rate": 9.465705258866694e-05, "loss": 3.6194, "step": 6295 }, { "epoch": 0.4280472890338361, "grad_norm": 1.9753509759902954, "learning_rate": 9.465280608778367e-05, "loss": 3.4943, "step": 6300 }, { "epoch": 0.4283870091044979, "grad_norm": 2.0523602962493896, "learning_rate": 9.46485595869004e-05, "loss": 3.1491, "step": 6305 }, { "epoch": 0.4287267291751597, "grad_norm": 2.113978624343872, "learning_rate": 9.464431308601712e-05, "loss": 3.8279, "step": 6310 }, { "epoch": 0.42906644924582144, "grad_norm": 6.667606830596924, "learning_rate": 9.464006658513385e-05, "loss": 3.6007, "step": 6315 }, { "epoch": 0.42940616931648323, "grad_norm": 2.3507540225982666, "learning_rate": 9.463582008425058e-05, "loss": 3.5137, "step": 6320 }, { "epoch": 0.429745889387145, "grad_norm": 2.382766008377075, "learning_rate": 9.463157358336731e-05, "loss": 3.3439, "step": 6325 }, { "epoch": 0.4300856094578068, "grad_norm": 2.8620731830596924, "learning_rate": 9.462732708248404e-05, "loss": 3.4246, "step": 6330 }, { "epoch": 0.4304253295284685, "grad_norm": 2.1794207096099854, "learning_rate": 9.462308058160076e-05, "loss": 3.5608, "step": 6335 }, { "epoch": 0.4307650495991303, "grad_norm": 2.4482569694519043, "learning_rate": 9.461883408071749e-05, "loss": 3.409, "step": 6340 }, { "epoch": 0.4311047696697921, "grad_norm": 2.355351686477661, "learning_rate": 9.461458757983422e-05, "loss": 3.3397, "step": 6345 }, { "epoch": 0.43144448974045385, "grad_norm": 2.7132089138031006, "learning_rate": 9.461034107895095e-05, "loss": 3.4117, "step": 6350 }, { "epoch": 0.43178420981111565, "grad_norm": 2.6843183040618896, "learning_rate": 9.460609457806768e-05, "loss": 3.4725, "step": 6355 }, { "epoch": 0.4321239298817774, "grad_norm": 2.08958101272583, "learning_rate": 9.46018480771844e-05, "loss": 3.2091, "step": 6360 }, { "epoch": 0.4324636499524392, "grad_norm": 2.0385537147521973, "learning_rate": 9.459760157630113e-05, "loss": 3.5679, "step": 6365 }, { "epoch": 0.432803370023101, "grad_norm": 2.765249013900757, "learning_rate": 9.459335507541786e-05, "loss": 3.1881, "step": 6370 }, { "epoch": 0.43314309009376273, "grad_norm": 2.6931862831115723, "learning_rate": 9.458910857453459e-05, "loss": 3.3704, "step": 6375 }, { "epoch": 0.4334828101644245, "grad_norm": 2.6158087253570557, "learning_rate": 9.458486207365132e-05, "loss": 3.5237, "step": 6380 }, { "epoch": 0.43382253023508627, "grad_norm": 2.631019115447998, "learning_rate": 9.458061557276804e-05, "loss": 3.4318, "step": 6385 }, { "epoch": 0.43416225030574807, "grad_norm": 2.567272186279297, "learning_rate": 9.457636907188477e-05, "loss": 3.3813, "step": 6390 }, { "epoch": 0.43450197037640986, "grad_norm": 2.3392422199249268, "learning_rate": 9.45721225710015e-05, "loss": 3.6318, "step": 6395 }, { "epoch": 0.4348416904470716, "grad_norm": 2.278768539428711, "learning_rate": 9.456787607011823e-05, "loss": 3.3889, "step": 6400 }, { "epoch": 0.4351814105177334, "grad_norm": 1.941454529762268, "learning_rate": 9.456362956923496e-05, "loss": 3.553, "step": 6405 }, { "epoch": 0.43552113058839514, "grad_norm": 2.306364059448242, "learning_rate": 9.455938306835168e-05, "loss": 3.3899, "step": 6410 }, { "epoch": 0.43586085065905694, "grad_norm": 2.377596855163574, "learning_rate": 9.45551365674684e-05, "loss": 3.3728, "step": 6415 }, { "epoch": 0.4362005707297187, "grad_norm": 1.8657268285751343, "learning_rate": 9.455089006658514e-05, "loss": 3.3541, "step": 6420 }, { "epoch": 0.4365402908003805, "grad_norm": 2.3879880905151367, "learning_rate": 9.454664356570187e-05, "loss": 3.3301, "step": 6425 }, { "epoch": 0.4368800108710423, "grad_norm": 2.1015968322753906, "learning_rate": 9.454239706481858e-05, "loss": 3.7275, "step": 6430 }, { "epoch": 0.437219730941704, "grad_norm": 2.0952141284942627, "learning_rate": 9.453815056393532e-05, "loss": 3.5275, "step": 6435 }, { "epoch": 0.4375594510123658, "grad_norm": 2.6250181198120117, "learning_rate": 9.453390406305205e-05, "loss": 3.407, "step": 6440 }, { "epoch": 0.43789917108302756, "grad_norm": 2.5968611240386963, "learning_rate": 9.452965756216878e-05, "loss": 3.4617, "step": 6445 }, { "epoch": 0.43823889115368936, "grad_norm": 2.3081111907958984, "learning_rate": 9.452541106128551e-05, "loss": 3.5497, "step": 6450 }, { "epoch": 0.43857861122435116, "grad_norm": 2.375849723815918, "learning_rate": 9.452116456040224e-05, "loss": 3.725, "step": 6455 }, { "epoch": 0.4389183312950129, "grad_norm": 2.0327749252319336, "learning_rate": 9.451691805951896e-05, "loss": 3.2806, "step": 6460 }, { "epoch": 0.4392580513656747, "grad_norm": 2.4605295658111572, "learning_rate": 9.451267155863569e-05, "loss": 3.4662, "step": 6465 }, { "epoch": 0.43959777143633644, "grad_norm": 2.1697449684143066, "learning_rate": 9.450842505775242e-05, "loss": 3.2462, "step": 6470 }, { "epoch": 0.43993749150699824, "grad_norm": 2.1426942348480225, "learning_rate": 9.450417855686915e-05, "loss": 3.7187, "step": 6475 }, { "epoch": 0.44027721157766003, "grad_norm": 2.0731704235076904, "learning_rate": 9.449993205598588e-05, "loss": 3.6408, "step": 6480 }, { "epoch": 0.4406169316483218, "grad_norm": 2.120217800140381, "learning_rate": 9.44956855551026e-05, "loss": 3.355, "step": 6485 }, { "epoch": 0.44095665171898357, "grad_norm": 2.223435640335083, "learning_rate": 9.449143905421933e-05, "loss": 3.6731, "step": 6490 }, { "epoch": 0.4412963717896453, "grad_norm": 2.6069273948669434, "learning_rate": 9.448719255333606e-05, "loss": 3.4692, "step": 6495 }, { "epoch": 0.4416360918603071, "grad_norm": 2.720393657684326, "learning_rate": 9.448294605245277e-05, "loss": 3.6989, "step": 6500 }, { "epoch": 0.4419758119309689, "grad_norm": 2.779848575592041, "learning_rate": 9.447869955156952e-05, "loss": 3.2684, "step": 6505 }, { "epoch": 0.44231553200163065, "grad_norm": 2.641580104827881, "learning_rate": 9.447445305068624e-05, "loss": 3.6752, "step": 6510 }, { "epoch": 0.44265525207229245, "grad_norm": 2.4761297702789307, "learning_rate": 9.447020654980296e-05, "loss": 3.4422, "step": 6515 }, { "epoch": 0.4429949721429542, "grad_norm": 2.3604958057403564, "learning_rate": 9.44659600489197e-05, "loss": 3.2987, "step": 6520 }, { "epoch": 0.443334692213616, "grad_norm": 13.032170295715332, "learning_rate": 9.446171354803643e-05, "loss": 3.5987, "step": 6525 }, { "epoch": 0.44367441228427773, "grad_norm": 2.362748146057129, "learning_rate": 9.445746704715314e-05, "loss": 3.4385, "step": 6530 }, { "epoch": 0.44401413235493953, "grad_norm": 2.326183795928955, "learning_rate": 9.445322054626988e-05, "loss": 3.2894, "step": 6535 }, { "epoch": 0.4443538524256013, "grad_norm": 2.1152613162994385, "learning_rate": 9.444897404538661e-05, "loss": 3.404, "step": 6540 }, { "epoch": 0.44469357249626307, "grad_norm": 1.8823668956756592, "learning_rate": 9.444472754450333e-05, "loss": 3.6945, "step": 6545 }, { "epoch": 0.44503329256692487, "grad_norm": 2.385741710662842, "learning_rate": 9.444048104362007e-05, "loss": 3.1683, "step": 6550 }, { "epoch": 0.4453730126375866, "grad_norm": 2.7184717655181885, "learning_rate": 9.44362345427368e-05, "loss": 3.5738, "step": 6555 }, { "epoch": 0.4457127327082484, "grad_norm": 1.9414417743682861, "learning_rate": 9.443198804185351e-05, "loss": 3.7462, "step": 6560 }, { "epoch": 0.4460524527789102, "grad_norm": 1.9968997240066528, "learning_rate": 9.442774154097025e-05, "loss": 3.2228, "step": 6565 }, { "epoch": 0.44639217284957194, "grad_norm": 2.190063953399658, "learning_rate": 9.442349504008697e-05, "loss": 3.269, "step": 6570 }, { "epoch": 0.44673189292023374, "grad_norm": 2.6371009349823, "learning_rate": 9.44192485392037e-05, "loss": 3.6551, "step": 6575 }, { "epoch": 0.4470716129908955, "grad_norm": 2.2833216190338135, "learning_rate": 9.441500203832044e-05, "loss": 3.3992, "step": 6580 }, { "epoch": 0.4474113330615573, "grad_norm": 2.629237174987793, "learning_rate": 9.441075553743715e-05, "loss": 3.4378, "step": 6585 }, { "epoch": 0.4477510531322191, "grad_norm": 1.9535990953445435, "learning_rate": 9.440650903655388e-05, "loss": 3.3535, "step": 6590 }, { "epoch": 0.4480907732028808, "grad_norm": 1.9888969659805298, "learning_rate": 9.440226253567062e-05, "loss": 3.4303, "step": 6595 }, { "epoch": 0.4484304932735426, "grad_norm": 2.4825336933135986, "learning_rate": 9.439801603478734e-05, "loss": 3.6693, "step": 6600 }, { "epoch": 0.44877021334420436, "grad_norm": 1.7589823007583618, "learning_rate": 9.439376953390406e-05, "loss": 3.404, "step": 6605 }, { "epoch": 0.44910993341486616, "grad_norm": 2.042783498764038, "learning_rate": 9.43895230330208e-05, "loss": 3.434, "step": 6610 }, { "epoch": 0.4494496534855279, "grad_norm": 1.9554245471954346, "learning_rate": 9.438527653213752e-05, "loss": 3.6922, "step": 6615 }, { "epoch": 0.4497893735561897, "grad_norm": 1.8562333583831787, "learning_rate": 9.438103003125425e-05, "loss": 3.4767, "step": 6620 }, { "epoch": 0.4501290936268515, "grad_norm": 2.6993446350097656, "learning_rate": 9.437678353037099e-05, "loss": 3.5627, "step": 6625 }, { "epoch": 0.45046881369751324, "grad_norm": 2.9945762157440186, "learning_rate": 9.43725370294877e-05, "loss": 3.6948, "step": 6630 }, { "epoch": 0.45080853376817503, "grad_norm": 2.996058702468872, "learning_rate": 9.436829052860443e-05, "loss": 3.5105, "step": 6635 }, { "epoch": 0.4511482538388368, "grad_norm": 2.4074368476867676, "learning_rate": 9.436404402772116e-05, "loss": 3.5571, "step": 6640 }, { "epoch": 0.4514879739094986, "grad_norm": 2.7382924556732178, "learning_rate": 9.435979752683789e-05, "loss": 3.2168, "step": 6645 }, { "epoch": 0.45182769398016037, "grad_norm": 2.3451027870178223, "learning_rate": 9.435555102595462e-05, "loss": 3.6281, "step": 6650 }, { "epoch": 0.4521674140508221, "grad_norm": 2.5332274436950684, "learning_rate": 9.435130452507134e-05, "loss": 3.3889, "step": 6655 }, { "epoch": 0.4525071341214839, "grad_norm": 2.3876821994781494, "learning_rate": 9.434705802418807e-05, "loss": 3.3261, "step": 6660 }, { "epoch": 0.45284685419214565, "grad_norm": 2.107971429824829, "learning_rate": 9.43428115233048e-05, "loss": 3.2646, "step": 6665 }, { "epoch": 0.45318657426280745, "grad_norm": 3.0384490489959717, "learning_rate": 9.433856502242153e-05, "loss": 3.6383, "step": 6670 }, { "epoch": 0.45352629433346925, "grad_norm": 2.549604654312134, "learning_rate": 9.433431852153826e-05, "loss": 3.7763, "step": 6675 }, { "epoch": 0.453866014404131, "grad_norm": 2.0201973915100098, "learning_rate": 9.433007202065498e-05, "loss": 3.5223, "step": 6680 }, { "epoch": 0.4542057344747928, "grad_norm": 2.8859968185424805, "learning_rate": 9.432582551977171e-05, "loss": 3.3874, "step": 6685 }, { "epoch": 0.45454545454545453, "grad_norm": 2.010023593902588, "learning_rate": 9.432157901888844e-05, "loss": 3.3481, "step": 6690 }, { "epoch": 0.45488517461611633, "grad_norm": 2.603708028793335, "learning_rate": 9.431733251800517e-05, "loss": 3.5631, "step": 6695 }, { "epoch": 0.45522489468677807, "grad_norm": 2.7841217517852783, "learning_rate": 9.43130860171219e-05, "loss": 3.2159, "step": 6700 }, { "epoch": 0.45556461475743987, "grad_norm": 3.0173606872558594, "learning_rate": 9.430883951623862e-05, "loss": 3.3789, "step": 6705 }, { "epoch": 0.45590433482810166, "grad_norm": 2.4205286502838135, "learning_rate": 9.430459301535535e-05, "loss": 3.543, "step": 6710 }, { "epoch": 0.4562440548987634, "grad_norm": 2.243870735168457, "learning_rate": 9.430034651447208e-05, "loss": 3.5436, "step": 6715 }, { "epoch": 0.4565837749694252, "grad_norm": 2.2129411697387695, "learning_rate": 9.429610001358881e-05, "loss": 3.4442, "step": 6720 }, { "epoch": 0.45692349504008695, "grad_norm": 2.4223175048828125, "learning_rate": 9.429185351270554e-05, "loss": 3.3737, "step": 6725 }, { "epoch": 0.45726321511074874, "grad_norm": 1.9786146879196167, "learning_rate": 9.428760701182226e-05, "loss": 3.5645, "step": 6730 }, { "epoch": 0.45760293518141054, "grad_norm": 2.7910783290863037, "learning_rate": 9.428336051093899e-05, "loss": 3.487, "step": 6735 }, { "epoch": 0.4579426552520723, "grad_norm": 2.45232892036438, "learning_rate": 9.427911401005572e-05, "loss": 3.5406, "step": 6740 }, { "epoch": 0.4582823753227341, "grad_norm": 2.456681251525879, "learning_rate": 9.427486750917245e-05, "loss": 3.475, "step": 6745 }, { "epoch": 0.4586220953933958, "grad_norm": 1.9153252840042114, "learning_rate": 9.427062100828918e-05, "loss": 3.5629, "step": 6750 }, { "epoch": 0.4589618154640576, "grad_norm": 2.0055551528930664, "learning_rate": 9.42663745074059e-05, "loss": 3.5447, "step": 6755 }, { "epoch": 0.4593015355347194, "grad_norm": 2.0676677227020264, "learning_rate": 9.426212800652263e-05, "loss": 3.2286, "step": 6760 }, { "epoch": 0.45964125560538116, "grad_norm": 2.464555501937866, "learning_rate": 9.425788150563936e-05, "loss": 3.5286, "step": 6765 }, { "epoch": 0.45998097567604296, "grad_norm": 2.44195294380188, "learning_rate": 9.425363500475607e-05, "loss": 3.5644, "step": 6770 }, { "epoch": 0.4603206957467047, "grad_norm": 2.6626532077789307, "learning_rate": 9.424938850387282e-05, "loss": 3.5582, "step": 6775 }, { "epoch": 0.4606604158173665, "grad_norm": 2.4734203815460205, "learning_rate": 9.424514200298954e-05, "loss": 3.6252, "step": 6780 }, { "epoch": 0.46100013588802824, "grad_norm": 2.028855085372925, "learning_rate": 9.424089550210627e-05, "loss": 3.5049, "step": 6785 }, { "epoch": 0.46133985595869004, "grad_norm": 2.358114004135132, "learning_rate": 9.4236649001223e-05, "loss": 3.4021, "step": 6790 }, { "epoch": 0.46167957602935183, "grad_norm": 2.8885111808776855, "learning_rate": 9.423240250033973e-05, "loss": 3.3187, "step": 6795 }, { "epoch": 0.4620192961000136, "grad_norm": 2.25164794921875, "learning_rate": 9.422815599945646e-05, "loss": 3.4916, "step": 6800 }, { "epoch": 0.4623590161706754, "grad_norm": 3.0029280185699463, "learning_rate": 9.422390949857318e-05, "loss": 3.3904, "step": 6805 }, { "epoch": 0.4626987362413371, "grad_norm": 5.231080055236816, "learning_rate": 9.421966299768991e-05, "loss": 3.3713, "step": 6810 }, { "epoch": 0.4630384563119989, "grad_norm": 2.0717029571533203, "learning_rate": 9.421541649680664e-05, "loss": 3.1369, "step": 6815 }, { "epoch": 0.4633781763826607, "grad_norm": 2.319355010986328, "learning_rate": 9.421116999592337e-05, "loss": 3.6872, "step": 6820 }, { "epoch": 0.46371789645332245, "grad_norm": 2.5064797401428223, "learning_rate": 9.42069234950401e-05, "loss": 3.5946, "step": 6825 }, { "epoch": 0.46405761652398425, "grad_norm": 2.3484818935394287, "learning_rate": 9.420267699415682e-05, "loss": 3.414, "step": 6830 }, { "epoch": 0.464397336594646, "grad_norm": 2.2609245777130127, "learning_rate": 9.419843049327355e-05, "loss": 3.3876, "step": 6835 }, { "epoch": 0.4647370566653078, "grad_norm": 1.8349295854568481, "learning_rate": 9.419418399239027e-05, "loss": 3.6525, "step": 6840 }, { "epoch": 0.4650767767359696, "grad_norm": 1.8431589603424072, "learning_rate": 9.418993749150701e-05, "loss": 3.3925, "step": 6845 }, { "epoch": 0.46541649680663133, "grad_norm": 2.482482671737671, "learning_rate": 9.418569099062374e-05, "loss": 3.6049, "step": 6850 }, { "epoch": 0.4657562168772931, "grad_norm": 2.1367428302764893, "learning_rate": 9.418144448974045e-05, "loss": 3.4989, "step": 6855 }, { "epoch": 0.46609593694795487, "grad_norm": 2.3413004875183105, "learning_rate": 9.417719798885719e-05, "loss": 3.6076, "step": 6860 }, { "epoch": 0.46643565701861667, "grad_norm": 1.8435660600662231, "learning_rate": 9.417295148797392e-05, "loss": 3.3502, "step": 6865 }, { "epoch": 0.4667753770892784, "grad_norm": 2.126728057861328, "learning_rate": 9.416870498709063e-05, "loss": 3.4011, "step": 6870 }, { "epoch": 0.4671150971599402, "grad_norm": 2.902451992034912, "learning_rate": 9.416445848620738e-05, "loss": 3.2404, "step": 6875 }, { "epoch": 0.467454817230602, "grad_norm": 2.37249493598938, "learning_rate": 9.41602119853241e-05, "loss": 3.153, "step": 6880 }, { "epoch": 0.46779453730126375, "grad_norm": 2.160510778427124, "learning_rate": 9.415596548444082e-05, "loss": 3.4056, "step": 6885 }, { "epoch": 0.46813425737192554, "grad_norm": 1.893813133239746, "learning_rate": 9.415171898355756e-05, "loss": 3.4616, "step": 6890 }, { "epoch": 0.4684739774425873, "grad_norm": 2.4358766078948975, "learning_rate": 9.414747248267429e-05, "loss": 3.5515, "step": 6895 }, { "epoch": 0.4688136975132491, "grad_norm": 2.7551798820495605, "learning_rate": 9.4143225981791e-05, "loss": 3.5848, "step": 6900 }, { "epoch": 0.4691534175839109, "grad_norm": 2.432792901992798, "learning_rate": 9.413897948090774e-05, "loss": 3.5817, "step": 6905 }, { "epoch": 0.4694931376545726, "grad_norm": 2.665708541870117, "learning_rate": 9.413473298002447e-05, "loss": 3.5486, "step": 6910 }, { "epoch": 0.4698328577252344, "grad_norm": 2.0607924461364746, "learning_rate": 9.413048647914119e-05, "loss": 3.5216, "step": 6915 }, { "epoch": 0.47017257779589616, "grad_norm": 2.703848123550415, "learning_rate": 9.412623997825793e-05, "loss": 3.2601, "step": 6920 }, { "epoch": 0.47051229786655796, "grad_norm": 1.9270505905151367, "learning_rate": 9.412199347737464e-05, "loss": 3.4523, "step": 6925 }, { "epoch": 0.47085201793721976, "grad_norm": 2.961392641067505, "learning_rate": 9.411774697649137e-05, "loss": 3.5116, "step": 6930 }, { "epoch": 0.4711917380078815, "grad_norm": 2.5065722465515137, "learning_rate": 9.411350047560811e-05, "loss": 3.4238, "step": 6935 }, { "epoch": 0.4715314580785433, "grad_norm": 2.1807680130004883, "learning_rate": 9.410925397472483e-05, "loss": 3.392, "step": 6940 }, { "epoch": 0.47187117814920504, "grad_norm": 2.7778799533843994, "learning_rate": 9.410500747384155e-05, "loss": 3.6427, "step": 6945 }, { "epoch": 0.47221089821986684, "grad_norm": 2.6303834915161133, "learning_rate": 9.41007609729583e-05, "loss": 3.3448, "step": 6950 }, { "epoch": 0.4725506182905286, "grad_norm": 2.7593629360198975, "learning_rate": 9.409651447207501e-05, "loss": 3.7642, "step": 6955 }, { "epoch": 0.4728903383611904, "grad_norm": 3.0659103393554688, "learning_rate": 9.409226797119174e-05, "loss": 3.6468, "step": 6960 }, { "epoch": 0.4732300584318522, "grad_norm": 2.652475595474243, "learning_rate": 9.408802147030848e-05, "loss": 3.5978, "step": 6965 }, { "epoch": 0.4735697785025139, "grad_norm": 2.0086569786071777, "learning_rate": 9.40837749694252e-05, "loss": 3.2978, "step": 6970 }, { "epoch": 0.4739094985731757, "grad_norm": 2.729933261871338, "learning_rate": 9.407952846854192e-05, "loss": 3.5512, "step": 6975 }, { "epoch": 0.47424921864383746, "grad_norm": 1.6890413761138916, "learning_rate": 9.407528196765866e-05, "loss": 3.6644, "step": 6980 }, { "epoch": 0.47458893871449925, "grad_norm": 1.9818360805511475, "learning_rate": 9.407103546677538e-05, "loss": 3.3539, "step": 6985 }, { "epoch": 0.47492865878516105, "grad_norm": 2.3442163467407227, "learning_rate": 9.40667889658921e-05, "loss": 3.4523, "step": 6990 }, { "epoch": 0.4752683788558228, "grad_norm": 2.061002492904663, "learning_rate": 9.406254246500883e-05, "loss": 3.0931, "step": 6995 }, { "epoch": 0.4756080989264846, "grad_norm": 8.470160484313965, "learning_rate": 9.405829596412556e-05, "loss": 3.1827, "step": 7000 }, { "epoch": 0.47594781899714633, "grad_norm": 2.8169310092926025, "learning_rate": 9.405404946324229e-05, "loss": 3.8022, "step": 7005 }, { "epoch": 0.47628753906780813, "grad_norm": 2.561768054962158, "learning_rate": 9.404980296235902e-05, "loss": 3.6038, "step": 7010 }, { "epoch": 0.4766272591384699, "grad_norm": 2.4986679553985596, "learning_rate": 9.404555646147575e-05, "loss": 3.4651, "step": 7015 }, { "epoch": 0.47696697920913167, "grad_norm": 2.27316951751709, "learning_rate": 9.404130996059247e-05, "loss": 3.101, "step": 7020 }, { "epoch": 0.47730669927979347, "grad_norm": 2.2828121185302734, "learning_rate": 9.40370634597092e-05, "loss": 3.5669, "step": 7025 }, { "epoch": 0.4776464193504552, "grad_norm": 2.379077434539795, "learning_rate": 9.403281695882593e-05, "loss": 3.4817, "step": 7030 }, { "epoch": 0.477986139421117, "grad_norm": 2.2172257900238037, "learning_rate": 9.402857045794266e-05, "loss": 3.2743, "step": 7035 }, { "epoch": 0.47832585949177875, "grad_norm": 2.4453556537628174, "learning_rate": 9.402432395705939e-05, "loss": 3.383, "step": 7040 }, { "epoch": 0.47866557956244055, "grad_norm": 2.9633665084838867, "learning_rate": 9.402007745617611e-05, "loss": 3.5467, "step": 7045 }, { "epoch": 0.47900529963310234, "grad_norm": 2.0435760021209717, "learning_rate": 9.401583095529284e-05, "loss": 3.4691, "step": 7050 }, { "epoch": 0.4793450197037641, "grad_norm": 2.785783290863037, "learning_rate": 9.401158445440957e-05, "loss": 3.6137, "step": 7055 }, { "epoch": 0.4796847397744259, "grad_norm": 2.068085193634033, "learning_rate": 9.40073379535263e-05, "loss": 3.5638, "step": 7060 }, { "epoch": 0.4800244598450876, "grad_norm": 2.291339159011841, "learning_rate": 9.400309145264303e-05, "loss": 3.4057, "step": 7065 }, { "epoch": 0.4803641799157494, "grad_norm": 2.7503013610839844, "learning_rate": 9.399884495175975e-05, "loss": 3.3003, "step": 7070 }, { "epoch": 0.4807038999864112, "grad_norm": 1.7548198699951172, "learning_rate": 9.399459845087648e-05, "loss": 3.6192, "step": 7075 }, { "epoch": 0.48104362005707296, "grad_norm": 1.8460267782211304, "learning_rate": 9.399035194999321e-05, "loss": 3.6293, "step": 7080 }, { "epoch": 0.48138334012773476, "grad_norm": 2.46345591545105, "learning_rate": 9.398610544910994e-05, "loss": 3.6347, "step": 7085 }, { "epoch": 0.4817230601983965, "grad_norm": 2.366938829421997, "learning_rate": 9.398185894822667e-05, "loss": 3.4579, "step": 7090 }, { "epoch": 0.4820627802690583, "grad_norm": 2.4367525577545166, "learning_rate": 9.39776124473434e-05, "loss": 3.5454, "step": 7095 }, { "epoch": 0.4824025003397201, "grad_norm": 1.74684476852417, "learning_rate": 9.397336594646012e-05, "loss": 3.416, "step": 7100 }, { "epoch": 0.48274222041038184, "grad_norm": 2.6481809616088867, "learning_rate": 9.396911944557685e-05, "loss": 3.452, "step": 7105 }, { "epoch": 0.48308194048104364, "grad_norm": 2.301795244216919, "learning_rate": 9.396487294469358e-05, "loss": 3.6525, "step": 7110 }, { "epoch": 0.4834216605517054, "grad_norm": 2.501932144165039, "learning_rate": 9.39606264438103e-05, "loss": 3.3493, "step": 7115 }, { "epoch": 0.4837613806223672, "grad_norm": 2.062826633453369, "learning_rate": 9.395637994292703e-05, "loss": 3.4704, "step": 7120 }, { "epoch": 0.4841011006930289, "grad_norm": 2.595562696456909, "learning_rate": 9.395213344204376e-05, "loss": 3.3932, "step": 7125 }, { "epoch": 0.4844408207636907, "grad_norm": 2.828798294067383, "learning_rate": 9.394788694116049e-05, "loss": 3.3996, "step": 7130 }, { "epoch": 0.4847805408343525, "grad_norm": 2.470886468887329, "learning_rate": 9.394364044027722e-05, "loss": 3.3655, "step": 7135 }, { "epoch": 0.48512026090501426, "grad_norm": 2.382993221282959, "learning_rate": 9.393939393939395e-05, "loss": 3.6529, "step": 7140 }, { "epoch": 0.48545998097567605, "grad_norm": 2.754181146621704, "learning_rate": 9.393514743851067e-05, "loss": 3.6594, "step": 7145 }, { "epoch": 0.4857997010463378, "grad_norm": 1.9599021673202515, "learning_rate": 9.39309009376274e-05, "loss": 3.5488, "step": 7150 }, { "epoch": 0.4861394211169996, "grad_norm": 2.0592167377471924, "learning_rate": 9.392665443674413e-05, "loss": 3.5513, "step": 7155 }, { "epoch": 0.4864791411876614, "grad_norm": 2.4830880165100098, "learning_rate": 9.392240793586086e-05, "loss": 3.5027, "step": 7160 }, { "epoch": 0.48681886125832313, "grad_norm": 2.4404654502868652, "learning_rate": 9.391816143497759e-05, "loss": 3.4388, "step": 7165 }, { "epoch": 0.48715858132898493, "grad_norm": 2.4427778720855713, "learning_rate": 9.391391493409431e-05, "loss": 3.0905, "step": 7170 }, { "epoch": 0.48749830139964667, "grad_norm": 2.395155429840088, "learning_rate": 9.390966843321104e-05, "loss": 3.5831, "step": 7175 }, { "epoch": 0.48783802147030847, "grad_norm": 2.1079752445220947, "learning_rate": 9.390542193232777e-05, "loss": 3.5322, "step": 7180 }, { "epoch": 0.48817774154097027, "grad_norm": 2.6965034008026123, "learning_rate": 9.39011754314445e-05, "loss": 3.4407, "step": 7185 }, { "epoch": 0.488517461611632, "grad_norm": 1.781265139579773, "learning_rate": 9.389692893056123e-05, "loss": 3.522, "step": 7190 }, { "epoch": 0.4888571816822938, "grad_norm": 2.130610466003418, "learning_rate": 9.389268242967794e-05, "loss": 3.6397, "step": 7195 }, { "epoch": 0.48919690175295555, "grad_norm": 2.013097047805786, "learning_rate": 9.388843592879468e-05, "loss": 3.2335, "step": 7200 }, { "epoch": 0.48953662182361735, "grad_norm": 2.235908269882202, "learning_rate": 9.388418942791141e-05, "loss": 3.4402, "step": 7205 }, { "epoch": 0.4898763418942791, "grad_norm": 2.2331745624542236, "learning_rate": 9.387994292702812e-05, "loss": 3.659, "step": 7210 }, { "epoch": 0.4902160619649409, "grad_norm": 1.9735581874847412, "learning_rate": 9.387569642614487e-05, "loss": 3.5952, "step": 7215 }, { "epoch": 0.4905557820356027, "grad_norm": 2.35489821434021, "learning_rate": 9.38714499252616e-05, "loss": 3.4633, "step": 7220 }, { "epoch": 0.4908955021062644, "grad_norm": 2.8531453609466553, "learning_rate": 9.386720342437831e-05, "loss": 3.6025, "step": 7225 }, { "epoch": 0.4912352221769262, "grad_norm": 2.5958797931671143, "learning_rate": 9.386295692349505e-05, "loss": 3.223, "step": 7230 }, { "epoch": 0.49157494224758796, "grad_norm": 2.380370855331421, "learning_rate": 9.385871042261178e-05, "loss": 3.4611, "step": 7235 }, { "epoch": 0.49191466231824976, "grad_norm": 3.0313644409179688, "learning_rate": 9.385446392172849e-05, "loss": 3.5145, "step": 7240 }, { "epoch": 0.49225438238891156, "grad_norm": 2.2284817695617676, "learning_rate": 9.385021742084523e-05, "loss": 3.6086, "step": 7245 }, { "epoch": 0.4925941024595733, "grad_norm": 2.185957193374634, "learning_rate": 9.384597091996196e-05, "loss": 3.6982, "step": 7250 }, { "epoch": 0.4929338225302351, "grad_norm": 2.016425132751465, "learning_rate": 9.384172441907868e-05, "loss": 3.0776, "step": 7255 }, { "epoch": 0.49327354260089684, "grad_norm": 2.518101215362549, "learning_rate": 9.383747791819542e-05, "loss": 3.5951, "step": 7260 }, { "epoch": 0.49361326267155864, "grad_norm": 2.331812858581543, "learning_rate": 9.383323141731213e-05, "loss": 3.4408, "step": 7265 }, { "epoch": 0.49395298274222044, "grad_norm": 2.329537868499756, "learning_rate": 9.382898491642886e-05, "loss": 3.4495, "step": 7270 }, { "epoch": 0.4942927028128822, "grad_norm": 2.441767454147339, "learning_rate": 9.38247384155456e-05, "loss": 3.4213, "step": 7275 }, { "epoch": 0.494632422883544, "grad_norm": 2.638486623764038, "learning_rate": 9.382049191466232e-05, "loss": 3.522, "step": 7280 }, { "epoch": 0.4949721429542057, "grad_norm": 2.0256617069244385, "learning_rate": 9.381624541377905e-05, "loss": 3.6722, "step": 7285 }, { "epoch": 0.4953118630248675, "grad_norm": 2.3431129455566406, "learning_rate": 9.381199891289579e-05, "loss": 3.2951, "step": 7290 }, { "epoch": 0.49565158309552926, "grad_norm": 2.6712446212768555, "learning_rate": 9.38077524120125e-05, "loss": 3.3398, "step": 7295 }, { "epoch": 0.49599130316619106, "grad_norm": 1.864131212234497, "learning_rate": 9.380350591112923e-05, "loss": 3.4863, "step": 7300 }, { "epoch": 0.49633102323685285, "grad_norm": 2.352243661880493, "learning_rate": 9.379925941024597e-05, "loss": 3.3791, "step": 7305 }, { "epoch": 0.4966707433075146, "grad_norm": 2.14172625541687, "learning_rate": 9.379501290936269e-05, "loss": 3.4757, "step": 7310 }, { "epoch": 0.4970104633781764, "grad_norm": 2.0858139991760254, "learning_rate": 9.379076640847941e-05, "loss": 3.4032, "step": 7315 }, { "epoch": 0.49735018344883813, "grad_norm": 2.608372688293457, "learning_rate": 9.378651990759615e-05, "loss": 3.6841, "step": 7320 }, { "epoch": 0.49768990351949993, "grad_norm": 2.271538257598877, "learning_rate": 9.378227340671287e-05, "loss": 3.4201, "step": 7325 }, { "epoch": 0.49802962359016173, "grad_norm": 1.973562479019165, "learning_rate": 9.37780269058296e-05, "loss": 3.332, "step": 7330 }, { "epoch": 0.49836934366082347, "grad_norm": 2.0143280029296875, "learning_rate": 9.377378040494634e-05, "loss": 3.5688, "step": 7335 }, { "epoch": 0.49870906373148527, "grad_norm": 2.195525646209717, "learning_rate": 9.376953390406305e-05, "loss": 3.644, "step": 7340 }, { "epoch": 0.499048783802147, "grad_norm": 2.025935649871826, "learning_rate": 9.376528740317978e-05, "loss": 3.491, "step": 7345 }, { "epoch": 0.4993885038728088, "grad_norm": 1.9479522705078125, "learning_rate": 9.376104090229651e-05, "loss": 3.1799, "step": 7350 }, { "epoch": 0.4997282239434706, "grad_norm": 1.8824591636657715, "learning_rate": 9.375679440141324e-05, "loss": 3.4035, "step": 7355 }, { "epoch": 0.5000679440141323, "grad_norm": 2.545675277709961, "learning_rate": 9.375254790052997e-05, "loss": 3.6221, "step": 7360 }, { "epoch": 0.5004076640847941, "grad_norm": 2.1282083988189697, "learning_rate": 9.374830139964669e-05, "loss": 3.5629, "step": 7365 }, { "epoch": 0.5007473841554559, "grad_norm": 2.6184799671173096, "learning_rate": 9.374405489876342e-05, "loss": 3.7157, "step": 7370 }, { "epoch": 0.5010871042261177, "grad_norm": 2.4793124198913574, "learning_rate": 9.373980839788015e-05, "loss": 3.5615, "step": 7375 }, { "epoch": 0.5014268242967794, "grad_norm": 2.208740234375, "learning_rate": 9.373556189699688e-05, "loss": 3.7752, "step": 7380 }, { "epoch": 0.5017665443674413, "grad_norm": 1.948194980621338, "learning_rate": 9.37313153961136e-05, "loss": 3.7429, "step": 7385 }, { "epoch": 0.502106264438103, "grad_norm": 1.986470103263855, "learning_rate": 9.372706889523033e-05, "loss": 3.2519, "step": 7390 }, { "epoch": 0.5024459845087648, "grad_norm": 2.75628924369812, "learning_rate": 9.372282239434706e-05, "loss": 3.4789, "step": 7395 }, { "epoch": 0.5027857045794265, "grad_norm": 2.468574047088623, "learning_rate": 9.371857589346379e-05, "loss": 3.5915, "step": 7400 }, { "epoch": 0.5031254246500884, "grad_norm": 2.94883394241333, "learning_rate": 9.371432939258052e-05, "loss": 3.3152, "step": 7405 }, { "epoch": 0.5034651447207501, "grad_norm": 2.3007545471191406, "learning_rate": 9.371008289169725e-05, "loss": 3.4305, "step": 7410 }, { "epoch": 0.5038048647914118, "grad_norm": 2.000077962875366, "learning_rate": 9.370583639081397e-05, "loss": 3.4782, "step": 7415 }, { "epoch": 0.5041445848620737, "grad_norm": 2.7133595943450928, "learning_rate": 9.37015898899307e-05, "loss": 3.5937, "step": 7420 }, { "epoch": 0.5044843049327354, "grad_norm": 2.718013286590576, "learning_rate": 9.369734338904743e-05, "loss": 3.2927, "step": 7425 }, { "epoch": 0.5048240250033972, "grad_norm": 2.0150578022003174, "learning_rate": 9.369309688816416e-05, "loss": 3.4968, "step": 7430 }, { "epoch": 0.5051637450740589, "grad_norm": 1.6512356996536255, "learning_rate": 9.368885038728089e-05, "loss": 3.5757, "step": 7435 }, { "epoch": 0.5055034651447208, "grad_norm": 2.2183570861816406, "learning_rate": 9.368460388639761e-05, "loss": 3.3624, "step": 7440 }, { "epoch": 0.5058431852153825, "grad_norm": 2.0382187366485596, "learning_rate": 9.368035738551434e-05, "loss": 3.371, "step": 7445 }, { "epoch": 0.5061829052860443, "grad_norm": 1.9518389701843262, "learning_rate": 9.367611088463107e-05, "loss": 3.3832, "step": 7450 }, { "epoch": 0.5065226253567061, "grad_norm": 3.202738046646118, "learning_rate": 9.36718643837478e-05, "loss": 3.3198, "step": 7455 }, { "epoch": 0.5068623454273679, "grad_norm": 2.3529140949249268, "learning_rate": 9.366761788286453e-05, "loss": 3.5379, "step": 7460 }, { "epoch": 0.5072020654980296, "grad_norm": 2.8575384616851807, "learning_rate": 9.366337138198124e-05, "loss": 3.5806, "step": 7465 }, { "epoch": 0.5075417855686915, "grad_norm": 2.269850730895996, "learning_rate": 9.365912488109798e-05, "loss": 3.1829, "step": 7470 }, { "epoch": 0.5078815056393532, "grad_norm": 2.088341474533081, "learning_rate": 9.365487838021471e-05, "loss": 3.688, "step": 7475 }, { "epoch": 0.5082212257100149, "grad_norm": 2.2586138248443604, "learning_rate": 9.365063187933144e-05, "loss": 3.1449, "step": 7480 }, { "epoch": 0.5085609457806767, "grad_norm": 2.2704641819000244, "learning_rate": 9.364638537844817e-05, "loss": 3.8194, "step": 7485 }, { "epoch": 0.5089006658513385, "grad_norm": 2.153308391571045, "learning_rate": 9.36421388775649e-05, "loss": 3.2544, "step": 7490 }, { "epoch": 0.5092403859220003, "grad_norm": 2.500732421875, "learning_rate": 9.363789237668162e-05, "loss": 3.6002, "step": 7495 }, { "epoch": 0.509580105992662, "grad_norm": 2.318199872970581, "learning_rate": 9.363364587579835e-05, "loss": 3.5671, "step": 7500 }, { "epoch": 0.5099198260633239, "grad_norm": 1.8818392753601074, "learning_rate": 9.362939937491508e-05, "loss": 3.5941, "step": 7505 }, { "epoch": 0.5102595461339856, "grad_norm": 2.1657497882843018, "learning_rate": 9.36251528740318e-05, "loss": 3.7112, "step": 7510 }, { "epoch": 0.5105992662046474, "grad_norm": 2.491417169570923, "learning_rate": 9.362090637314853e-05, "loss": 3.5348, "step": 7515 }, { "epoch": 0.5109389862753091, "grad_norm": 2.4612016677856445, "learning_rate": 9.361665987226526e-05, "loss": 3.5846, "step": 7520 }, { "epoch": 0.511278706345971, "grad_norm": 2.693815231323242, "learning_rate": 9.361241337138199e-05, "loss": 3.6232, "step": 7525 }, { "epoch": 0.5116184264166327, "grad_norm": 2.876926898956299, "learning_rate": 9.360816687049872e-05, "loss": 3.5763, "step": 7530 }, { "epoch": 0.5119581464872944, "grad_norm": 3.4614970684051514, "learning_rate": 9.360392036961545e-05, "loss": 3.4691, "step": 7535 }, { "epoch": 0.5122978665579563, "grad_norm": 1.8120405673980713, "learning_rate": 9.359967386873217e-05, "loss": 3.6407, "step": 7540 }, { "epoch": 0.512637586628618, "grad_norm": 2.506169319152832, "learning_rate": 9.35954273678489e-05, "loss": 3.3106, "step": 7545 }, { "epoch": 0.5129773066992798, "grad_norm": 1.9559695720672607, "learning_rate": 9.359118086696562e-05, "loss": 3.3331, "step": 7550 }, { "epoch": 0.5133170267699416, "grad_norm": 2.0006439685821533, "learning_rate": 9.358693436608236e-05, "loss": 3.5985, "step": 7555 }, { "epoch": 0.5136567468406034, "grad_norm": 2.290966510772705, "learning_rate": 9.358268786519909e-05, "loss": 3.6198, "step": 7560 }, { "epoch": 0.5139964669112651, "grad_norm": 2.4100987911224365, "learning_rate": 9.35784413643158e-05, "loss": 3.5429, "step": 7565 }, { "epoch": 0.5143361869819268, "grad_norm": 1.9473223686218262, "learning_rate": 9.357419486343254e-05, "loss": 3.5272, "step": 7570 }, { "epoch": 0.5146759070525887, "grad_norm": 2.3493199348449707, "learning_rate": 9.356994836254927e-05, "loss": 3.3414, "step": 7575 }, { "epoch": 0.5150156271232504, "grad_norm": 2.2954466342926025, "learning_rate": 9.356570186166598e-05, "loss": 3.589, "step": 7580 }, { "epoch": 0.5153553471939122, "grad_norm": 2.340156078338623, "learning_rate": 9.356145536078273e-05, "loss": 3.5733, "step": 7585 }, { "epoch": 0.515695067264574, "grad_norm": 15.759178161621094, "learning_rate": 9.355720885989945e-05, "loss": 3.4173, "step": 7590 }, { "epoch": 0.5160347873352358, "grad_norm": 2.4459228515625, "learning_rate": 9.355296235901617e-05, "loss": 3.4002, "step": 7595 }, { "epoch": 0.5163745074058975, "grad_norm": 1.8496448993682861, "learning_rate": 9.354871585813291e-05, "loss": 3.4946, "step": 7600 }, { "epoch": 0.5167142274765593, "grad_norm": 1.9563500881195068, "learning_rate": 9.354446935724964e-05, "loss": 3.3107, "step": 7605 }, { "epoch": 0.5170539475472211, "grad_norm": 2.895695686340332, "learning_rate": 9.354022285636635e-05, "loss": 3.4064, "step": 7610 }, { "epoch": 0.5173936676178829, "grad_norm": 2.685279369354248, "learning_rate": 9.35359763554831e-05, "loss": 3.0983, "step": 7615 }, { "epoch": 0.5177333876885446, "grad_norm": 1.6636030673980713, "learning_rate": 9.353172985459981e-05, "loss": 3.6338, "step": 7620 }, { "epoch": 0.5180731077592065, "grad_norm": 2.0679140090942383, "learning_rate": 9.352748335371654e-05, "loss": 3.6541, "step": 7625 }, { "epoch": 0.5184128278298682, "grad_norm": 2.401170015335083, "learning_rate": 9.352323685283328e-05, "loss": 3.327, "step": 7630 }, { "epoch": 0.5187525479005299, "grad_norm": 2.8371365070343018, "learning_rate": 9.351899035194999e-05, "loss": 3.6023, "step": 7635 }, { "epoch": 0.5190922679711918, "grad_norm": 1.9553372859954834, "learning_rate": 9.351474385106672e-05, "loss": 3.6175, "step": 7640 }, { "epoch": 0.5194319880418535, "grad_norm": 2.2127702236175537, "learning_rate": 9.351049735018346e-05, "loss": 3.4014, "step": 7645 }, { "epoch": 0.5197717081125153, "grad_norm": 2.1941514015197754, "learning_rate": 9.350625084930018e-05, "loss": 3.5133, "step": 7650 }, { "epoch": 0.520111428183177, "grad_norm": 2.303549289703369, "learning_rate": 9.35020043484169e-05, "loss": 3.2347, "step": 7655 }, { "epoch": 0.5204511482538389, "grad_norm": 2.5060219764709473, "learning_rate": 9.349775784753365e-05, "loss": 3.4632, "step": 7660 }, { "epoch": 0.5207908683245006, "grad_norm": 2.187718391418457, "learning_rate": 9.349351134665036e-05, "loss": 3.2201, "step": 7665 }, { "epoch": 0.5211305883951624, "grad_norm": 2.0518343448638916, "learning_rate": 9.348926484576709e-05, "loss": 3.5035, "step": 7670 }, { "epoch": 0.5214703084658242, "grad_norm": 2.011011838912964, "learning_rate": 9.348501834488383e-05, "loss": 3.2746, "step": 7675 }, { "epoch": 0.521810028536486, "grad_norm": 1.9551658630371094, "learning_rate": 9.348077184400054e-05, "loss": 3.5228, "step": 7680 }, { "epoch": 0.5221497486071477, "grad_norm": 2.1620616912841797, "learning_rate": 9.347652534311727e-05, "loss": 3.4825, "step": 7685 }, { "epoch": 0.5224894686778094, "grad_norm": 1.9534777402877808, "learning_rate": 9.3472278842234e-05, "loss": 3.6559, "step": 7690 }, { "epoch": 0.5228291887484713, "grad_norm": 2.493875503540039, "learning_rate": 9.346803234135073e-05, "loss": 3.1751, "step": 7695 }, { "epoch": 0.523168908819133, "grad_norm": 1.9353262186050415, "learning_rate": 9.346378584046746e-05, "loss": 3.4514, "step": 7700 }, { "epoch": 0.5235086288897948, "grad_norm": 1.9636565446853638, "learning_rate": 9.345953933958418e-05, "loss": 3.7546, "step": 7705 }, { "epoch": 0.5238483489604566, "grad_norm": 1.9866634607315063, "learning_rate": 9.345529283870091e-05, "loss": 3.3293, "step": 7710 }, { "epoch": 0.5241880690311184, "grad_norm": 3.0144455432891846, "learning_rate": 9.345104633781764e-05, "loss": 3.4535, "step": 7715 }, { "epoch": 0.5245277891017801, "grad_norm": 2.969761371612549, "learning_rate": 9.344679983693437e-05, "loss": 3.4912, "step": 7720 }, { "epoch": 0.524867509172442, "grad_norm": 2.070857286453247, "learning_rate": 9.34425533360511e-05, "loss": 3.5424, "step": 7725 }, { "epoch": 0.5252072292431037, "grad_norm": 2.1081411838531494, "learning_rate": 9.343830683516782e-05, "loss": 3.4614, "step": 7730 }, { "epoch": 0.5255469493137654, "grad_norm": 1.9040887355804443, "learning_rate": 9.343406033428455e-05, "loss": 3.3716, "step": 7735 }, { "epoch": 0.5258866693844272, "grad_norm": 1.9735606908798218, "learning_rate": 9.342981383340128e-05, "loss": 3.4665, "step": 7740 }, { "epoch": 0.526226389455089, "grad_norm": 2.214897632598877, "learning_rate": 9.342556733251801e-05, "loss": 3.3892, "step": 7745 }, { "epoch": 0.5265661095257508, "grad_norm": 2.2447516918182373, "learning_rate": 9.342132083163474e-05, "loss": 3.748, "step": 7750 }, { "epoch": 0.5269058295964125, "grad_norm": 2.7474708557128906, "learning_rate": 9.341707433075146e-05, "loss": 3.6698, "step": 7755 }, { "epoch": 0.5272455496670744, "grad_norm": 2.5779128074645996, "learning_rate": 9.341282782986819e-05, "loss": 2.9593, "step": 7760 }, { "epoch": 0.5275852697377361, "grad_norm": 2.1730480194091797, "learning_rate": 9.340858132898492e-05, "loss": 3.6393, "step": 7765 }, { "epoch": 0.5279249898083979, "grad_norm": 3.3322739601135254, "learning_rate": 9.340433482810165e-05, "loss": 3.4001, "step": 7770 }, { "epoch": 0.5282647098790596, "grad_norm": 2.4361867904663086, "learning_rate": 9.340008832721838e-05, "loss": 3.7426, "step": 7775 }, { "epoch": 0.5286044299497215, "grad_norm": 2.4264729022979736, "learning_rate": 9.33958418263351e-05, "loss": 3.0694, "step": 7780 }, { "epoch": 0.5289441500203832, "grad_norm": 2.169437885284424, "learning_rate": 9.339159532545183e-05, "loss": 3.6901, "step": 7785 }, { "epoch": 0.5292838700910449, "grad_norm": 2.2769272327423096, "learning_rate": 9.338734882456856e-05, "loss": 3.6727, "step": 7790 }, { "epoch": 0.5296235901617068, "grad_norm": 3.0056302547454834, "learning_rate": 9.338310232368529e-05, "loss": 3.6638, "step": 7795 }, { "epoch": 0.5299633102323685, "grad_norm": 2.313436269760132, "learning_rate": 9.337885582280202e-05, "loss": 3.2055, "step": 7800 }, { "epoch": 0.5303030303030303, "grad_norm": 1.8805210590362549, "learning_rate": 9.337460932191874e-05, "loss": 3.355, "step": 7805 }, { "epoch": 0.5306427503736921, "grad_norm": 2.6432125568389893, "learning_rate": 9.337036282103547e-05, "loss": 3.4257, "step": 7810 }, { "epoch": 0.5309824704443539, "grad_norm": 2.71626877784729, "learning_rate": 9.33661163201522e-05, "loss": 3.5226, "step": 7815 }, { "epoch": 0.5313221905150156, "grad_norm": 1.9068526029586792, "learning_rate": 9.336186981926893e-05, "loss": 3.4854, "step": 7820 }, { "epoch": 0.5316619105856774, "grad_norm": 2.322669267654419, "learning_rate": 9.335762331838566e-05, "loss": 3.5672, "step": 7825 }, { "epoch": 0.5320016306563392, "grad_norm": 2.248173475265503, "learning_rate": 9.335337681750238e-05, "loss": 3.4054, "step": 7830 }, { "epoch": 0.532341350727001, "grad_norm": 2.2564051151275635, "learning_rate": 9.334913031661911e-05, "loss": 3.6523, "step": 7835 }, { "epoch": 0.5326810707976627, "grad_norm": 2.781511068344116, "learning_rate": 9.334488381573584e-05, "loss": 3.7222, "step": 7840 }, { "epoch": 0.5330207908683245, "grad_norm": 2.621516466140747, "learning_rate": 9.334063731485257e-05, "loss": 3.4049, "step": 7845 }, { "epoch": 0.5333605109389863, "grad_norm": 2.0905559062957764, "learning_rate": 9.33363908139693e-05, "loss": 3.5393, "step": 7850 }, { "epoch": 0.533700231009648, "grad_norm": 1.8236668109893799, "learning_rate": 9.333214431308602e-05, "loss": 3.5404, "step": 7855 }, { "epoch": 0.5340399510803098, "grad_norm": 2.09511661529541, "learning_rate": 9.332789781220275e-05, "loss": 3.5058, "step": 7860 }, { "epoch": 0.5343796711509716, "grad_norm": 3.618272304534912, "learning_rate": 9.332365131131948e-05, "loss": 3.4683, "step": 7865 }, { "epoch": 0.5347193912216334, "grad_norm": 3.5706093311309814, "learning_rate": 9.331940481043621e-05, "loss": 3.4599, "step": 7870 }, { "epoch": 0.5350591112922951, "grad_norm": 1.727651834487915, "learning_rate": 9.331515830955294e-05, "loss": 3.4183, "step": 7875 }, { "epoch": 0.535398831362957, "grad_norm": 2.5069267749786377, "learning_rate": 9.331091180866966e-05, "loss": 3.5288, "step": 7880 }, { "epoch": 0.5357385514336187, "grad_norm": 2.4135208129882812, "learning_rate": 9.330666530778639e-05, "loss": 3.3558, "step": 7885 }, { "epoch": 0.5360782715042804, "grad_norm": 2.3433218002319336, "learning_rate": 9.330241880690311e-05, "loss": 3.6919, "step": 7890 }, { "epoch": 0.5364179915749423, "grad_norm": 2.7258450984954834, "learning_rate": 9.329817230601985e-05, "loss": 3.3195, "step": 7895 }, { "epoch": 0.536757711645604, "grad_norm": 2.2421512603759766, "learning_rate": 9.329392580513658e-05, "loss": 3.2954, "step": 7900 }, { "epoch": 0.5370974317162658, "grad_norm": 3.3051373958587646, "learning_rate": 9.328967930425329e-05, "loss": 3.6181, "step": 7905 }, { "epoch": 0.5374371517869275, "grad_norm": 2.076032876968384, "learning_rate": 9.328543280337003e-05, "loss": 3.5818, "step": 7910 }, { "epoch": 0.5377768718575894, "grad_norm": 1.787664771080017, "learning_rate": 9.328118630248676e-05, "loss": 3.6571, "step": 7915 }, { "epoch": 0.5381165919282511, "grad_norm": 2.2575559616088867, "learning_rate": 9.327693980160347e-05, "loss": 3.3945, "step": 7920 }, { "epoch": 0.5384563119989129, "grad_norm": 2.319761037826538, "learning_rate": 9.327269330072022e-05, "loss": 3.5612, "step": 7925 }, { "epoch": 0.5387960320695747, "grad_norm": 2.307724952697754, "learning_rate": 9.326844679983694e-05, "loss": 3.668, "step": 7930 }, { "epoch": 0.5391357521402365, "grad_norm": 1.859480857849121, "learning_rate": 9.326420029895366e-05, "loss": 3.6731, "step": 7935 }, { "epoch": 0.5394754722108982, "grad_norm": 2.80035138130188, "learning_rate": 9.32599537980704e-05, "loss": 3.3538, "step": 7940 }, { "epoch": 0.5398151922815599, "grad_norm": 2.4471659660339355, "learning_rate": 9.325570729718713e-05, "loss": 3.4815, "step": 7945 }, { "epoch": 0.5401549123522218, "grad_norm": 2.661670446395874, "learning_rate": 9.325146079630384e-05, "loss": 3.5947, "step": 7950 }, { "epoch": 0.5404946324228835, "grad_norm": 2.048238515853882, "learning_rate": 9.324721429542058e-05, "loss": 3.2475, "step": 7955 }, { "epoch": 0.5408343524935453, "grad_norm": 2.1112208366394043, "learning_rate": 9.324296779453731e-05, "loss": 3.4487, "step": 7960 }, { "epoch": 0.5411740725642071, "grad_norm": 1.6436501741409302, "learning_rate": 9.323872129365403e-05, "loss": 3.5088, "step": 7965 }, { "epoch": 0.5415137926348689, "grad_norm": 2.4236257076263428, "learning_rate": 9.323447479277077e-05, "loss": 3.44, "step": 7970 }, { "epoch": 0.5418535127055306, "grad_norm": 2.173462152481079, "learning_rate": 9.323022829188748e-05, "loss": 3.5009, "step": 7975 }, { "epoch": 0.5421932327761925, "grad_norm": 2.001016139984131, "learning_rate": 9.322598179100421e-05, "loss": 3.4251, "step": 7980 }, { "epoch": 0.5425329528468542, "grad_norm": 2.305433988571167, "learning_rate": 9.322173529012095e-05, "loss": 3.4381, "step": 7985 }, { "epoch": 0.542872672917516, "grad_norm": 2.1340959072113037, "learning_rate": 9.321748878923767e-05, "loss": 3.5409, "step": 7990 }, { "epoch": 0.5432123929881777, "grad_norm": 2.51355242729187, "learning_rate": 9.32132422883544e-05, "loss": 3.5031, "step": 7995 }, { "epoch": 0.5435521130588395, "grad_norm": 2.293647527694702, "learning_rate": 9.320899578747114e-05, "loss": 3.6442, "step": 8000 }, { "epoch": 0.5438918331295013, "grad_norm": 2.1015965938568115, "learning_rate": 9.320474928658785e-05, "loss": 3.3585, "step": 8005 }, { "epoch": 0.544231553200163, "grad_norm": 2.1317553520202637, "learning_rate": 9.320050278570458e-05, "loss": 3.5866, "step": 8010 }, { "epoch": 0.5445712732708249, "grad_norm": 1.9175254106521606, "learning_rate": 9.319625628482132e-05, "loss": 3.3682, "step": 8015 }, { "epoch": 0.5449109933414866, "grad_norm": 2.203145742416382, "learning_rate": 9.319200978393804e-05, "loss": 3.5032, "step": 8020 }, { "epoch": 0.5452507134121484, "grad_norm": 2.406024694442749, "learning_rate": 9.318776328305476e-05, "loss": 3.6251, "step": 8025 }, { "epoch": 0.5455904334828101, "grad_norm": 2.588593006134033, "learning_rate": 9.31835167821715e-05, "loss": 3.4116, "step": 8030 }, { "epoch": 0.545930153553472, "grad_norm": 2.538975477218628, "learning_rate": 9.317927028128822e-05, "loss": 3.4752, "step": 8035 }, { "epoch": 0.5462698736241337, "grad_norm": 2.235506772994995, "learning_rate": 9.317502378040495e-05, "loss": 3.2153, "step": 8040 }, { "epoch": 0.5466095936947954, "grad_norm": 2.4122209548950195, "learning_rate": 9.317077727952168e-05, "loss": 3.4472, "step": 8045 }, { "epoch": 0.5469493137654573, "grad_norm": 2.3780431747436523, "learning_rate": 9.31665307786384e-05, "loss": 3.2939, "step": 8050 }, { "epoch": 0.547289033836119, "grad_norm": 2.1943743228912354, "learning_rate": 9.316228427775513e-05, "loss": 3.4332, "step": 8055 }, { "epoch": 0.5476287539067808, "grad_norm": 2.5293846130371094, "learning_rate": 9.315803777687186e-05, "loss": 3.4539, "step": 8060 }, { "epoch": 0.5479684739774426, "grad_norm": 2.6729140281677246, "learning_rate": 9.315379127598859e-05, "loss": 3.443, "step": 8065 }, { "epoch": 0.5483081940481044, "grad_norm": 2.1881024837493896, "learning_rate": 9.314954477510532e-05, "loss": 3.4367, "step": 8070 }, { "epoch": 0.5486479141187661, "grad_norm": 2.0456316471099854, "learning_rate": 9.314529827422204e-05, "loss": 3.4568, "step": 8075 }, { "epoch": 0.5489876341894279, "grad_norm": 1.9006816148757935, "learning_rate": 9.314105177333877e-05, "loss": 3.5054, "step": 8080 }, { "epoch": 0.5493273542600897, "grad_norm": 2.4287989139556885, "learning_rate": 9.31368052724555e-05, "loss": 3.4349, "step": 8085 }, { "epoch": 0.5496670743307515, "grad_norm": 2.4924192428588867, "learning_rate": 9.313255877157223e-05, "loss": 3.4955, "step": 8090 }, { "epoch": 0.5500067944014132, "grad_norm": 3.8655943870544434, "learning_rate": 9.312831227068896e-05, "loss": 3.4524, "step": 8095 }, { "epoch": 0.550346514472075, "grad_norm": 2.8569958209991455, "learning_rate": 9.312406576980568e-05, "loss": 3.4709, "step": 8100 }, { "epoch": 0.5506862345427368, "grad_norm": 2.472886562347412, "learning_rate": 9.311981926892241e-05, "loss": 3.2456, "step": 8105 }, { "epoch": 0.5510259546133985, "grad_norm": 3.545949697494507, "learning_rate": 9.311557276803914e-05, "loss": 3.4041, "step": 8110 }, { "epoch": 0.5513656746840603, "grad_norm": 1.8553361892700195, "learning_rate": 9.311132626715587e-05, "loss": 3.4402, "step": 8115 }, { "epoch": 0.5517053947547221, "grad_norm": 2.2590484619140625, "learning_rate": 9.31070797662726e-05, "loss": 3.3832, "step": 8120 }, { "epoch": 0.5520451148253839, "grad_norm": 2.084836721420288, "learning_rate": 9.310283326538932e-05, "loss": 3.5876, "step": 8125 }, { "epoch": 0.5523848348960456, "grad_norm": 3.508754014968872, "learning_rate": 9.309858676450605e-05, "loss": 3.6488, "step": 8130 }, { "epoch": 0.5527245549667075, "grad_norm": 2.4495081901550293, "learning_rate": 9.309434026362278e-05, "loss": 3.3691, "step": 8135 }, { "epoch": 0.5530642750373692, "grad_norm": 2.252100706100464, "learning_rate": 9.309009376273951e-05, "loss": 3.4972, "step": 8140 }, { "epoch": 0.553403995108031, "grad_norm": 2.076917886734009, "learning_rate": 9.308584726185624e-05, "loss": 3.578, "step": 8145 }, { "epoch": 0.5537437151786928, "grad_norm": 3.1268932819366455, "learning_rate": 9.308160076097296e-05, "loss": 3.231, "step": 8150 }, { "epoch": 0.5540834352493546, "grad_norm": 3.3976073265075684, "learning_rate": 9.307735426008969e-05, "loss": 3.4044, "step": 8155 }, { "epoch": 0.5544231553200163, "grad_norm": 2.028085470199585, "learning_rate": 9.307310775920642e-05, "loss": 3.3118, "step": 8160 }, { "epoch": 0.554762875390678, "grad_norm": 2.361985206604004, "learning_rate": 9.306886125832315e-05, "loss": 3.7181, "step": 8165 }, { "epoch": 0.5551025954613399, "grad_norm": 1.8930116891860962, "learning_rate": 9.306461475743988e-05, "loss": 3.7768, "step": 8170 }, { "epoch": 0.5554423155320016, "grad_norm": 1.7071287631988525, "learning_rate": 9.30603682565566e-05, "loss": 3.5297, "step": 8175 }, { "epoch": 0.5557820356026634, "grad_norm": 2.141624927520752, "learning_rate": 9.305612175567333e-05, "loss": 3.3615, "step": 8180 }, { "epoch": 0.5561217556733252, "grad_norm": 1.87115478515625, "learning_rate": 9.305187525479006e-05, "loss": 3.3518, "step": 8185 }, { "epoch": 0.556461475743987, "grad_norm": 2.391692638397217, "learning_rate": 9.304762875390679e-05, "loss": 3.4963, "step": 8190 }, { "epoch": 0.5568011958146487, "grad_norm": 2.429943799972534, "learning_rate": 9.304338225302352e-05, "loss": 3.412, "step": 8195 }, { "epoch": 0.5571409158853105, "grad_norm": 1.8239959478378296, "learning_rate": 9.303913575214024e-05, "loss": 3.5649, "step": 8200 }, { "epoch": 0.5574806359559723, "grad_norm": 2.659858226776123, "learning_rate": 9.303488925125697e-05, "loss": 3.4042, "step": 8205 }, { "epoch": 0.557820356026634, "grad_norm": 2.560776948928833, "learning_rate": 9.30306427503737e-05, "loss": 3.4747, "step": 8210 }, { "epoch": 0.5581600760972958, "grad_norm": 4.9363322257995605, "learning_rate": 9.302639624949043e-05, "loss": 3.4194, "step": 8215 }, { "epoch": 0.5584997961679576, "grad_norm": 2.1661062240600586, "learning_rate": 9.302214974860716e-05, "loss": 3.6843, "step": 8220 }, { "epoch": 0.5588395162386194, "grad_norm": 2.345493793487549, "learning_rate": 9.301790324772388e-05, "loss": 3.2607, "step": 8225 }, { "epoch": 0.5591792363092811, "grad_norm": 2.23356032371521, "learning_rate": 9.301365674684061e-05, "loss": 3.5943, "step": 8230 }, { "epoch": 0.559518956379943, "grad_norm": 2.2561025619506836, "learning_rate": 9.300941024595734e-05, "loss": 3.4041, "step": 8235 }, { "epoch": 0.5598586764506047, "grad_norm": 1.8876768350601196, "learning_rate": 9.300516374507407e-05, "loss": 3.7598, "step": 8240 }, { "epoch": 0.5601983965212665, "grad_norm": 2.052811861038208, "learning_rate": 9.300091724419078e-05, "loss": 3.2988, "step": 8245 }, { "epoch": 0.5605381165919282, "grad_norm": 2.5591275691986084, "learning_rate": 9.299667074330752e-05, "loss": 3.3676, "step": 8250 }, { "epoch": 0.5608778366625901, "grad_norm": 2.2649238109588623, "learning_rate": 9.299242424242425e-05, "loss": 3.2285, "step": 8255 }, { "epoch": 0.5612175567332518, "grad_norm": 2.2049078941345215, "learning_rate": 9.298817774154097e-05, "loss": 3.4271, "step": 8260 }, { "epoch": 0.5615572768039135, "grad_norm": 1.873483419418335, "learning_rate": 9.298393124065771e-05, "loss": 3.4574, "step": 8265 }, { "epoch": 0.5618969968745754, "grad_norm": 1.9567097425460815, "learning_rate": 9.297968473977444e-05, "loss": 3.4079, "step": 8270 }, { "epoch": 0.5622367169452371, "grad_norm": 2.712512493133545, "learning_rate": 9.297543823889115e-05, "loss": 3.1644, "step": 8275 }, { "epoch": 0.5625764370158989, "grad_norm": 2.7369048595428467, "learning_rate": 9.297119173800789e-05, "loss": 3.5099, "step": 8280 }, { "epoch": 0.5629161570865606, "grad_norm": 2.09191632270813, "learning_rate": 9.296694523712462e-05, "loss": 3.4824, "step": 8285 }, { "epoch": 0.5632558771572225, "grad_norm": 2.352203845977783, "learning_rate": 9.296269873624133e-05, "loss": 3.642, "step": 8290 }, { "epoch": 0.5635955972278842, "grad_norm": 2.490455389022827, "learning_rate": 9.295845223535808e-05, "loss": 3.3203, "step": 8295 }, { "epoch": 0.563935317298546, "grad_norm": 1.7819089889526367, "learning_rate": 9.29542057344748e-05, "loss": 3.6042, "step": 8300 }, { "epoch": 0.5642750373692078, "grad_norm": 2.003439426422119, "learning_rate": 9.294995923359152e-05, "loss": 3.4489, "step": 8305 }, { "epoch": 0.5646147574398696, "grad_norm": 2.1695783138275146, "learning_rate": 9.294571273270826e-05, "loss": 3.445, "step": 8310 }, { "epoch": 0.5649544775105313, "grad_norm": 2.241365432739258, "learning_rate": 9.294146623182497e-05, "loss": 3.2735, "step": 8315 }, { "epoch": 0.5652941975811931, "grad_norm": 2.0233113765716553, "learning_rate": 9.29372197309417e-05, "loss": 3.306, "step": 8320 }, { "epoch": 0.5656339176518549, "grad_norm": 2.314389705657959, "learning_rate": 9.293297323005844e-05, "loss": 3.6903, "step": 8325 }, { "epoch": 0.5659736377225166, "grad_norm": 2.3685622215270996, "learning_rate": 9.292872672917516e-05, "loss": 3.5154, "step": 8330 }, { "epoch": 0.5663133577931784, "grad_norm": 2.357534408569336, "learning_rate": 9.292448022829189e-05, "loss": 3.6782, "step": 8335 }, { "epoch": 0.5666530778638402, "grad_norm": 2.2818925380706787, "learning_rate": 9.292023372740863e-05, "loss": 3.5172, "step": 8340 }, { "epoch": 0.566992797934502, "grad_norm": 2.028261423110962, "learning_rate": 9.291598722652534e-05, "loss": 3.3944, "step": 8345 }, { "epoch": 0.5673325180051637, "grad_norm": 2.5161893367767334, "learning_rate": 9.291174072564207e-05, "loss": 3.7035, "step": 8350 }, { "epoch": 0.5676722380758256, "grad_norm": 1.9733099937438965, "learning_rate": 9.290749422475881e-05, "loss": 3.6006, "step": 8355 }, { "epoch": 0.5680119581464873, "grad_norm": 2.111640691757202, "learning_rate": 9.290324772387553e-05, "loss": 3.3657, "step": 8360 }, { "epoch": 0.568351678217149, "grad_norm": 1.9056446552276611, "learning_rate": 9.289900122299225e-05, "loss": 3.3409, "step": 8365 }, { "epoch": 0.5686913982878108, "grad_norm": 2.0116829872131348, "learning_rate": 9.2894754722109e-05, "loss": 3.4526, "step": 8370 }, { "epoch": 0.5690311183584726, "grad_norm": 2.0043551921844482, "learning_rate": 9.289050822122571e-05, "loss": 3.6603, "step": 8375 }, { "epoch": 0.5693708384291344, "grad_norm": 2.0761964321136475, "learning_rate": 9.288626172034244e-05, "loss": 3.6551, "step": 8380 }, { "epoch": 0.5697105584997961, "grad_norm": 2.1102120876312256, "learning_rate": 9.288201521945918e-05, "loss": 3.52, "step": 8385 }, { "epoch": 0.570050278570458, "grad_norm": 2.037296772003174, "learning_rate": 9.28777687185759e-05, "loss": 3.3974, "step": 8390 }, { "epoch": 0.5703899986411197, "grad_norm": 2.359480381011963, "learning_rate": 9.287352221769262e-05, "loss": 3.7609, "step": 8395 }, { "epoch": 0.5707297187117815, "grad_norm": 1.9942586421966553, "learning_rate": 9.286927571680935e-05, "loss": 3.3588, "step": 8400 }, { "epoch": 0.5710694387824433, "grad_norm": 2.1668598651885986, "learning_rate": 9.286502921592608e-05, "loss": 3.4829, "step": 8405 }, { "epoch": 0.5714091588531051, "grad_norm": 2.974886417388916, "learning_rate": 9.28607827150428e-05, "loss": 3.4702, "step": 8410 }, { "epoch": 0.5717488789237668, "grad_norm": 2.2977232933044434, "learning_rate": 9.285653621415953e-05, "loss": 3.0362, "step": 8415 }, { "epoch": 0.5720885989944285, "grad_norm": 2.6215264797210693, "learning_rate": 9.285228971327626e-05, "loss": 3.6049, "step": 8420 }, { "epoch": 0.5724283190650904, "grad_norm": 1.657969355583191, "learning_rate": 9.284804321239299e-05, "loss": 3.5216, "step": 8425 }, { "epoch": 0.5727680391357521, "grad_norm": 2.9846670627593994, "learning_rate": 9.284379671150972e-05, "loss": 3.5265, "step": 8430 }, { "epoch": 0.5731077592064139, "grad_norm": 3.2366905212402344, "learning_rate": 9.283955021062645e-05, "loss": 3.1744, "step": 8435 }, { "epoch": 0.5734474792770757, "grad_norm": 2.417916774749756, "learning_rate": 9.283530370974317e-05, "loss": 3.429, "step": 8440 }, { "epoch": 0.5737871993477375, "grad_norm": 2.7880892753601074, "learning_rate": 9.28310572088599e-05, "loss": 3.3616, "step": 8445 }, { "epoch": 0.5741269194183992, "grad_norm": 2.4934799671173096, "learning_rate": 9.282681070797663e-05, "loss": 3.5551, "step": 8450 }, { "epoch": 0.574466639489061, "grad_norm": 2.4630706310272217, "learning_rate": 9.282256420709336e-05, "loss": 3.149, "step": 8455 }, { "epoch": 0.5748063595597228, "grad_norm": 2.399496555328369, "learning_rate": 9.281831770621009e-05, "loss": 3.6542, "step": 8460 }, { "epoch": 0.5751460796303846, "grad_norm": 2.125706672668457, "learning_rate": 9.281407120532681e-05, "loss": 3.5369, "step": 8465 }, { "epoch": 0.5754857997010463, "grad_norm": 2.127157211303711, "learning_rate": 9.280982470444354e-05, "loss": 3.3497, "step": 8470 }, { "epoch": 0.5758255197717081, "grad_norm": 2.590937376022339, "learning_rate": 9.280557820356027e-05, "loss": 3.3688, "step": 8475 }, { "epoch": 0.5761652398423699, "grad_norm": 2.6459944248199463, "learning_rate": 9.2801331702677e-05, "loss": 3.3186, "step": 8480 }, { "epoch": 0.5765049599130316, "grad_norm": 2.029041051864624, "learning_rate": 9.279708520179373e-05, "loss": 3.5337, "step": 8485 }, { "epoch": 0.5768446799836935, "grad_norm": 2.311288356781006, "learning_rate": 9.279283870091045e-05, "loss": 3.174, "step": 8490 }, { "epoch": 0.5771844000543552, "grad_norm": 2.555339813232422, "learning_rate": 9.278859220002718e-05, "loss": 3.5468, "step": 8495 }, { "epoch": 0.577524120125017, "grad_norm": 1.7727775573730469, "learning_rate": 9.278434569914391e-05, "loss": 3.5269, "step": 8500 }, { "epoch": 0.5778638401956787, "grad_norm": 2.611510753631592, "learning_rate": 9.278009919826064e-05, "loss": 3.3579, "step": 8505 }, { "epoch": 0.5782035602663406, "grad_norm": 2.1769232749938965, "learning_rate": 9.277585269737737e-05, "loss": 3.4102, "step": 8510 }, { "epoch": 0.5785432803370023, "grad_norm": 2.586691379547119, "learning_rate": 9.27716061964941e-05, "loss": 3.337, "step": 8515 }, { "epoch": 0.578883000407664, "grad_norm": 2.089334011077881, "learning_rate": 9.276735969561082e-05, "loss": 3.2416, "step": 8520 }, { "epoch": 0.5792227204783259, "grad_norm": 1.9141018390655518, "learning_rate": 9.276311319472755e-05, "loss": 3.4277, "step": 8525 }, { "epoch": 0.5795624405489876, "grad_norm": 1.7541699409484863, "learning_rate": 9.275886669384428e-05, "loss": 3.3736, "step": 8530 }, { "epoch": 0.5799021606196494, "grad_norm": 2.5130348205566406, "learning_rate": 9.2754620192961e-05, "loss": 3.3604, "step": 8535 }, { "epoch": 0.5802418806903111, "grad_norm": 3.10162091255188, "learning_rate": 9.275037369207773e-05, "loss": 3.4115, "step": 8540 }, { "epoch": 0.580581600760973, "grad_norm": 2.407411813735962, "learning_rate": 9.274612719119446e-05, "loss": 3.6455, "step": 8545 }, { "epoch": 0.5809213208316347, "grad_norm": 2.0053093433380127, "learning_rate": 9.274188069031119e-05, "loss": 3.2288, "step": 8550 }, { "epoch": 0.5812610409022965, "grad_norm": 2.641085624694824, "learning_rate": 9.273763418942792e-05, "loss": 3.5824, "step": 8555 }, { "epoch": 0.5816007609729583, "grad_norm": 2.637511730194092, "learning_rate": 9.273338768854465e-05, "loss": 3.5481, "step": 8560 }, { "epoch": 0.5819404810436201, "grad_norm": 2.107805013656616, "learning_rate": 9.272914118766137e-05, "loss": 3.4166, "step": 8565 }, { "epoch": 0.5822802011142818, "grad_norm": 2.414813756942749, "learning_rate": 9.27248946867781e-05, "loss": 3.4619, "step": 8570 }, { "epoch": 0.5826199211849437, "grad_norm": 2.577255964279175, "learning_rate": 9.272064818589483e-05, "loss": 3.5583, "step": 8575 }, { "epoch": 0.5829596412556054, "grad_norm": 1.952826976776123, "learning_rate": 9.271640168501156e-05, "loss": 3.326, "step": 8580 }, { "epoch": 0.5832993613262671, "grad_norm": 2.2093346118927, "learning_rate": 9.271215518412829e-05, "loss": 3.4601, "step": 8585 }, { "epoch": 0.5836390813969289, "grad_norm": 2.598940372467041, "learning_rate": 9.270790868324501e-05, "loss": 3.4668, "step": 8590 }, { "epoch": 0.5839788014675907, "grad_norm": 2.000551700592041, "learning_rate": 9.270366218236174e-05, "loss": 3.6846, "step": 8595 }, { "epoch": 0.5843185215382525, "grad_norm": 2.354728937149048, "learning_rate": 9.269941568147846e-05, "loss": 3.361, "step": 8600 }, { "epoch": 0.5846582416089142, "grad_norm": 2.418919563293457, "learning_rate": 9.26951691805952e-05, "loss": 3.6553, "step": 8605 }, { "epoch": 0.5849979616795761, "grad_norm": 1.7047700881958008, "learning_rate": 9.269092267971193e-05, "loss": 3.2775, "step": 8610 }, { "epoch": 0.5853376817502378, "grad_norm": 2.0444867610931396, "learning_rate": 9.268667617882864e-05, "loss": 3.5458, "step": 8615 }, { "epoch": 0.5856774018208996, "grad_norm": 3.0210981369018555, "learning_rate": 9.268242967794538e-05, "loss": 3.3262, "step": 8620 }, { "epoch": 0.5860171218915613, "grad_norm": 2.4124979972839355, "learning_rate": 9.267818317706211e-05, "loss": 3.5382, "step": 8625 }, { "epoch": 0.5863568419622232, "grad_norm": 2.2351627349853516, "learning_rate": 9.267393667617882e-05, "loss": 3.7378, "step": 8630 }, { "epoch": 0.5866965620328849, "grad_norm": 1.7832891941070557, "learning_rate": 9.266969017529557e-05, "loss": 3.5345, "step": 8635 }, { "epoch": 0.5870362821035466, "grad_norm": 4.006323337554932, "learning_rate": 9.26654436744123e-05, "loss": 3.0423, "step": 8640 }, { "epoch": 0.5873760021742085, "grad_norm": 2.2737832069396973, "learning_rate": 9.266119717352901e-05, "loss": 3.7005, "step": 8645 }, { "epoch": 0.5877157222448702, "grad_norm": 1.8196533918380737, "learning_rate": 9.265695067264575e-05, "loss": 3.7564, "step": 8650 }, { "epoch": 0.588055442315532, "grad_norm": 1.9307434558868408, "learning_rate": 9.265270417176248e-05, "loss": 3.3201, "step": 8655 }, { "epoch": 0.5883951623861938, "grad_norm": 2.5794484615325928, "learning_rate": 9.264845767087919e-05, "loss": 3.3153, "step": 8660 }, { "epoch": 0.5887348824568556, "grad_norm": 2.2121431827545166, "learning_rate": 9.264421116999593e-05, "loss": 3.4331, "step": 8665 }, { "epoch": 0.5890746025275173, "grad_norm": 3.738420248031616, "learning_rate": 9.263996466911265e-05, "loss": 3.603, "step": 8670 }, { "epoch": 0.589414322598179, "grad_norm": 2.2575557231903076, "learning_rate": 9.263571816822938e-05, "loss": 3.5148, "step": 8675 }, { "epoch": 0.5897540426688409, "grad_norm": 1.7424854040145874, "learning_rate": 9.263147166734612e-05, "loss": 3.4102, "step": 8680 }, { "epoch": 0.5900937627395026, "grad_norm": 1.972262978553772, "learning_rate": 9.262722516646283e-05, "loss": 3.3401, "step": 8685 }, { "epoch": 0.5904334828101644, "grad_norm": 2.403106689453125, "learning_rate": 9.262297866557956e-05, "loss": 3.1883, "step": 8690 }, { "epoch": 0.5907732028808262, "grad_norm": 2.0763771533966064, "learning_rate": 9.26187321646963e-05, "loss": 3.4608, "step": 8695 }, { "epoch": 0.591112922951488, "grad_norm": 2.262354612350464, "learning_rate": 9.261448566381302e-05, "loss": 3.5027, "step": 8700 }, { "epoch": 0.5914526430221497, "grad_norm": 2.96211838722229, "learning_rate": 9.261023916292975e-05, "loss": 3.5515, "step": 8705 }, { "epoch": 0.5917923630928115, "grad_norm": 2.1572303771972656, "learning_rate": 9.260599266204649e-05, "loss": 3.4375, "step": 8710 }, { "epoch": 0.5921320831634733, "grad_norm": 2.367905855178833, "learning_rate": 9.26017461611632e-05, "loss": 3.2595, "step": 8715 }, { "epoch": 0.5924718032341351, "grad_norm": 2.771169900894165, "learning_rate": 9.259749966027993e-05, "loss": 3.5077, "step": 8720 }, { "epoch": 0.5928115233047968, "grad_norm": 2.8313136100769043, "learning_rate": 9.259410245957331e-05, "loss": 3.5583, "step": 8725 }, { "epoch": 0.5931512433754587, "grad_norm": 2.2947633266448975, "learning_rate": 9.258985595869004e-05, "loss": 3.594, "step": 8730 }, { "epoch": 0.5934909634461204, "grad_norm": 2.876166820526123, "learning_rate": 9.258560945780678e-05, "loss": 3.538, "step": 8735 }, { "epoch": 0.5938306835167821, "grad_norm": 2.4721367359161377, "learning_rate": 9.25813629569235e-05, "loss": 3.4472, "step": 8740 }, { "epoch": 0.594170403587444, "grad_norm": 2.5023317337036133, "learning_rate": 9.257711645604022e-05, "loss": 3.4651, "step": 8745 }, { "epoch": 0.5945101236581057, "grad_norm": 2.4450182914733887, "learning_rate": 9.257286995515697e-05, "loss": 3.5858, "step": 8750 }, { "epoch": 0.5948498437287675, "grad_norm": 2.0935065746307373, "learning_rate": 9.256862345427368e-05, "loss": 3.3318, "step": 8755 }, { "epoch": 0.5951895637994292, "grad_norm": 3.6565756797790527, "learning_rate": 9.256437695339041e-05, "loss": 3.4703, "step": 8760 }, { "epoch": 0.5955292838700911, "grad_norm": 2.7227180004119873, "learning_rate": 9.256013045250715e-05, "loss": 3.4155, "step": 8765 }, { "epoch": 0.5958690039407528, "grad_norm": 2.177351951599121, "learning_rate": 9.255588395162386e-05, "loss": 3.4463, "step": 8770 }, { "epoch": 0.5962087240114146, "grad_norm": 2.880723237991333, "learning_rate": 9.255163745074059e-05, "loss": 3.4563, "step": 8775 }, { "epoch": 0.5965484440820764, "grad_norm": 2.088820457458496, "learning_rate": 9.254739094985733e-05, "loss": 3.4154, "step": 8780 }, { "epoch": 0.5968881641527382, "grad_norm": 3.313237190246582, "learning_rate": 9.254314444897405e-05, "loss": 3.4208, "step": 8785 }, { "epoch": 0.5972278842233999, "grad_norm": 2.361285924911499, "learning_rate": 9.253889794809078e-05, "loss": 3.5362, "step": 8790 }, { "epoch": 0.5975676042940616, "grad_norm": 1.9548826217651367, "learning_rate": 9.25346514472075e-05, "loss": 3.5581, "step": 8795 }, { "epoch": 0.5979073243647235, "grad_norm": 2.136289596557617, "learning_rate": 9.253040494632423e-05, "loss": 3.3625, "step": 8800 }, { "epoch": 0.5982470444353852, "grad_norm": 2.0891830921173096, "learning_rate": 9.252615844544096e-05, "loss": 3.4685, "step": 8805 }, { "epoch": 0.598586764506047, "grad_norm": 2.046870231628418, "learning_rate": 9.252191194455769e-05, "loss": 3.4844, "step": 8810 }, { "epoch": 0.5989264845767088, "grad_norm": 2.767624616622925, "learning_rate": 9.251766544367442e-05, "loss": 3.4608, "step": 8815 }, { "epoch": 0.5992662046473706, "grad_norm": 2.1752066612243652, "learning_rate": 9.251341894279114e-05, "loss": 3.2597, "step": 8820 }, { "epoch": 0.5996059247180323, "grad_norm": 2.496392011642456, "learning_rate": 9.250917244190787e-05, "loss": 3.3379, "step": 8825 }, { "epoch": 0.5999456447886942, "grad_norm": 2.0173327922821045, "learning_rate": 9.25049259410246e-05, "loss": 3.3509, "step": 8830 }, { "epoch": 0.6002853648593559, "grad_norm": 2.064110517501831, "learning_rate": 9.250067944014133e-05, "loss": 3.4919, "step": 8835 }, { "epoch": 0.6006250849300176, "grad_norm": 2.5744454860687256, "learning_rate": 9.249643293925806e-05, "loss": 3.7008, "step": 8840 }, { "epoch": 0.6009648050006794, "grad_norm": 1.8868316411972046, "learning_rate": 9.249218643837478e-05, "loss": 3.4246, "step": 8845 }, { "epoch": 0.6013045250713412, "grad_norm": 2.1251633167266846, "learning_rate": 9.248793993749151e-05, "loss": 3.6764, "step": 8850 }, { "epoch": 0.601644245142003, "grad_norm": 2.310070037841797, "learning_rate": 9.248369343660824e-05, "loss": 3.5046, "step": 8855 }, { "epoch": 0.6019839652126647, "grad_norm": 2.216578483581543, "learning_rate": 9.247944693572497e-05, "loss": 3.4614, "step": 8860 }, { "epoch": 0.6023236852833266, "grad_norm": 1.9413403272628784, "learning_rate": 9.24752004348417e-05, "loss": 3.4046, "step": 8865 }, { "epoch": 0.6026634053539883, "grad_norm": 2.4662301540374756, "learning_rate": 9.247095393395842e-05, "loss": 3.3895, "step": 8870 }, { "epoch": 0.6030031254246501, "grad_norm": 2.511380195617676, "learning_rate": 9.246670743307515e-05, "loss": 3.3947, "step": 8875 }, { "epoch": 0.6033428454953118, "grad_norm": 2.423506736755371, "learning_rate": 9.246246093219188e-05, "loss": 3.2647, "step": 8880 }, { "epoch": 0.6036825655659737, "grad_norm": 2.3918538093566895, "learning_rate": 9.245821443130861e-05, "loss": 3.6556, "step": 8885 }, { "epoch": 0.6040222856366354, "grad_norm": 1.7094987630844116, "learning_rate": 9.245396793042534e-05, "loss": 3.4037, "step": 8890 }, { "epoch": 0.6043620057072971, "grad_norm": 2.2404685020446777, "learning_rate": 9.244972142954206e-05, "loss": 3.4772, "step": 8895 }, { "epoch": 0.604701725777959, "grad_norm": 2.090378522872925, "learning_rate": 9.244547492865879e-05, "loss": 3.5517, "step": 8900 }, { "epoch": 0.6050414458486207, "grad_norm": 2.432039976119995, "learning_rate": 9.244122842777552e-05, "loss": 3.354, "step": 8905 }, { "epoch": 0.6053811659192825, "grad_norm": 2.135570764541626, "learning_rate": 9.243698192689225e-05, "loss": 3.4889, "step": 8910 }, { "epoch": 0.6057208859899443, "grad_norm": 3.326573610305786, "learning_rate": 9.243273542600898e-05, "loss": 3.2613, "step": 8915 }, { "epoch": 0.6060606060606061, "grad_norm": 1.7053751945495605, "learning_rate": 9.24284889251257e-05, "loss": 3.4801, "step": 8920 }, { "epoch": 0.6064003261312678, "grad_norm": 2.004863739013672, "learning_rate": 9.242424242424242e-05, "loss": 3.386, "step": 8925 }, { "epoch": 0.6067400462019296, "grad_norm": 2.6540749073028564, "learning_rate": 9.241999592335916e-05, "loss": 3.6117, "step": 8930 }, { "epoch": 0.6070797662725914, "grad_norm": 2.4861056804656982, "learning_rate": 9.241574942247589e-05, "loss": 3.4069, "step": 8935 }, { "epoch": 0.6074194863432532, "grad_norm": 1.9969956874847412, "learning_rate": 9.24115029215926e-05, "loss": 3.4672, "step": 8940 }, { "epoch": 0.6077592064139149, "grad_norm": 2.5103538036346436, "learning_rate": 9.240725642070934e-05, "loss": 3.5373, "step": 8945 }, { "epoch": 0.6080989264845768, "grad_norm": 2.205951690673828, "learning_rate": 9.240300991982607e-05, "loss": 3.5909, "step": 8950 }, { "epoch": 0.6084386465552385, "grad_norm": 2.1522507667541504, "learning_rate": 9.239876341894279e-05, "loss": 3.3089, "step": 8955 }, { "epoch": 0.6087783666259002, "grad_norm": 2.1917757987976074, "learning_rate": 9.239451691805953e-05, "loss": 3.4894, "step": 8960 }, { "epoch": 0.609118086696562, "grad_norm": 2.4323723316192627, "learning_rate": 9.239027041717626e-05, "loss": 3.3356, "step": 8965 }, { "epoch": 0.6094578067672238, "grad_norm": 2.0584442615509033, "learning_rate": 9.238602391629297e-05, "loss": 3.4695, "step": 8970 }, { "epoch": 0.6097975268378856, "grad_norm": 2.437145948410034, "learning_rate": 9.238177741540971e-05, "loss": 3.3846, "step": 8975 }, { "epoch": 0.6101372469085473, "grad_norm": 2.3354413509368896, "learning_rate": 9.237753091452644e-05, "loss": 3.3006, "step": 8980 }, { "epoch": 0.6104769669792092, "grad_norm": 2.009671211242676, "learning_rate": 9.237328441364315e-05, "loss": 3.2417, "step": 8985 }, { "epoch": 0.6108166870498709, "grad_norm": 1.7423853874206543, "learning_rate": 9.23690379127599e-05, "loss": 3.4108, "step": 8990 }, { "epoch": 0.6111564071205327, "grad_norm": 2.296311616897583, "learning_rate": 9.236479141187661e-05, "loss": 3.7126, "step": 8995 }, { "epoch": 0.6114961271911945, "grad_norm": 2.9706532955169678, "learning_rate": 9.236054491099334e-05, "loss": 3.4258, "step": 9000 }, { "epoch": 0.6118358472618562, "grad_norm": 2.3803908824920654, "learning_rate": 9.235629841011008e-05, "loss": 3.485, "step": 9005 }, { "epoch": 0.612175567332518, "grad_norm": 2.1578822135925293, "learning_rate": 9.23520519092268e-05, "loss": 3.4563, "step": 9010 }, { "epoch": 0.6125152874031797, "grad_norm": 2.049509048461914, "learning_rate": 9.234780540834352e-05, "loss": 3.3914, "step": 9015 }, { "epoch": 0.6128550074738416, "grad_norm": 2.0547492504119873, "learning_rate": 9.234355890746026e-05, "loss": 3.5119, "step": 9020 }, { "epoch": 0.6131947275445033, "grad_norm": 3.450504779815674, "learning_rate": 9.233931240657698e-05, "loss": 3.38, "step": 9025 }, { "epoch": 0.6135344476151651, "grad_norm": 2.7689361572265625, "learning_rate": 9.23350659056937e-05, "loss": 3.335, "step": 9030 }, { "epoch": 0.6138741676858269, "grad_norm": 2.228415012359619, "learning_rate": 9.233081940481045e-05, "loss": 3.3001, "step": 9035 }, { "epoch": 0.6142138877564887, "grad_norm": 1.9736047983169556, "learning_rate": 9.232657290392716e-05, "loss": 3.5795, "step": 9040 }, { "epoch": 0.6145536078271504, "grad_norm": 2.6956357955932617, "learning_rate": 9.23223264030439e-05, "loss": 3.5509, "step": 9045 }, { "epoch": 0.6148933278978121, "grad_norm": 2.0668585300445557, "learning_rate": 9.231807990216063e-05, "loss": 3.4657, "step": 9050 }, { "epoch": 0.615233047968474, "grad_norm": 2.4969048500061035, "learning_rate": 9.231383340127735e-05, "loss": 3.6714, "step": 9055 }, { "epoch": 0.6155727680391357, "grad_norm": 2.2255680561065674, "learning_rate": 9.230958690039409e-05, "loss": 3.5104, "step": 9060 }, { "epoch": 0.6159124881097975, "grad_norm": 2.149826765060425, "learning_rate": 9.23053403995108e-05, "loss": 3.5722, "step": 9065 }, { "epoch": 0.6162522081804593, "grad_norm": 3.0322883129119873, "learning_rate": 9.230109389862753e-05, "loss": 3.3271, "step": 9070 }, { "epoch": 0.6165919282511211, "grad_norm": 2.4431819915771484, "learning_rate": 9.229684739774427e-05, "loss": 3.2296, "step": 9075 }, { "epoch": 0.6169316483217828, "grad_norm": 1.9841899871826172, "learning_rate": 9.229260089686099e-05, "loss": 3.5309, "step": 9080 }, { "epoch": 0.6172713683924447, "grad_norm": 2.1378426551818848, "learning_rate": 9.228835439597771e-05, "loss": 3.5327, "step": 9085 }, { "epoch": 0.6176110884631064, "grad_norm": 2.0772478580474854, "learning_rate": 9.228410789509446e-05, "loss": 3.2381, "step": 9090 }, { "epoch": 0.6179508085337682, "grad_norm": 2.1273200511932373, "learning_rate": 9.227986139421117e-05, "loss": 3.3578, "step": 9095 }, { "epoch": 0.6182905286044299, "grad_norm": 2.3347697257995605, "learning_rate": 9.22756148933279e-05, "loss": 3.5752, "step": 9100 }, { "epoch": 0.6186302486750918, "grad_norm": 2.3471555709838867, "learning_rate": 9.227136839244464e-05, "loss": 3.7021, "step": 9105 }, { "epoch": 0.6189699687457535, "grad_norm": 2.6924638748168945, "learning_rate": 9.226712189156135e-05, "loss": 3.0245, "step": 9110 }, { "epoch": 0.6193096888164152, "grad_norm": 2.1283507347106934, "learning_rate": 9.226287539067808e-05, "loss": 3.5864, "step": 9115 }, { "epoch": 0.6196494088870771, "grad_norm": 2.029057264328003, "learning_rate": 9.225862888979482e-05, "loss": 3.5111, "step": 9120 }, { "epoch": 0.6199891289577388, "grad_norm": 2.3345603942871094, "learning_rate": 9.225438238891154e-05, "loss": 3.6344, "step": 9125 }, { "epoch": 0.6203288490284006, "grad_norm": 2.1531529426574707, "learning_rate": 9.225013588802827e-05, "loss": 3.4414, "step": 9130 }, { "epoch": 0.6206685690990623, "grad_norm": 2.1636009216308594, "learning_rate": 9.224588938714501e-05, "loss": 3.5252, "step": 9135 }, { "epoch": 0.6210082891697242, "grad_norm": 1.7845302820205688, "learning_rate": 9.224164288626172e-05, "loss": 3.3295, "step": 9140 }, { "epoch": 0.6213480092403859, "grad_norm": 2.275392532348633, "learning_rate": 9.223739638537845e-05, "loss": 3.5735, "step": 9145 }, { "epoch": 0.6216877293110477, "grad_norm": 1.6259108781814575, "learning_rate": 9.223314988449518e-05, "loss": 3.3143, "step": 9150 }, { "epoch": 0.6220274493817095, "grad_norm": 2.047922134399414, "learning_rate": 9.22289033836119e-05, "loss": 3.4208, "step": 9155 }, { "epoch": 0.6223671694523712, "grad_norm": 1.9892585277557373, "learning_rate": 9.222465688272863e-05, "loss": 3.5035, "step": 9160 }, { "epoch": 0.622706889523033, "grad_norm": 2.327022075653076, "learning_rate": 9.222041038184536e-05, "loss": 3.4351, "step": 9165 }, { "epoch": 0.6230466095936948, "grad_norm": 2.236118793487549, "learning_rate": 9.221616388096209e-05, "loss": 3.4589, "step": 9170 }, { "epoch": 0.6233863296643566, "grad_norm": 2.212754487991333, "learning_rate": 9.221191738007882e-05, "loss": 3.4827, "step": 9175 }, { "epoch": 0.6237260497350183, "grad_norm": 2.8504140377044678, "learning_rate": 9.220767087919555e-05, "loss": 3.1599, "step": 9180 }, { "epoch": 0.6240657698056801, "grad_norm": 1.8453243970870972, "learning_rate": 9.220342437831227e-05, "loss": 3.7203, "step": 9185 }, { "epoch": 0.6244054898763419, "grad_norm": 2.2583439350128174, "learning_rate": 9.2199177877429e-05, "loss": 3.5995, "step": 9190 }, { "epoch": 0.6247452099470037, "grad_norm": 2.018906593322754, "learning_rate": 9.219493137654573e-05, "loss": 3.629, "step": 9195 }, { "epoch": 0.6250849300176654, "grad_norm": 2.3241515159606934, "learning_rate": 9.219068487566246e-05, "loss": 3.2277, "step": 9200 }, { "epoch": 0.6254246500883273, "grad_norm": 2.038961172103882, "learning_rate": 9.218643837477919e-05, "loss": 3.538, "step": 9205 }, { "epoch": 0.625764370158989, "grad_norm": 2.4797563552856445, "learning_rate": 9.218219187389591e-05, "loss": 3.6085, "step": 9210 }, { "epoch": 0.6261040902296507, "grad_norm": 2.335033655166626, "learning_rate": 9.217794537301264e-05, "loss": 3.436, "step": 9215 }, { "epoch": 0.6264438103003126, "grad_norm": 2.303050994873047, "learning_rate": 9.217369887212937e-05, "loss": 3.5978, "step": 9220 }, { "epoch": 0.6267835303709743, "grad_norm": 2.4888527393341064, "learning_rate": 9.21694523712461e-05, "loss": 3.829, "step": 9225 }, { "epoch": 0.6271232504416361, "grad_norm": 2.4484055042266846, "learning_rate": 9.216520587036283e-05, "loss": 3.4508, "step": 9230 }, { "epoch": 0.6274629705122978, "grad_norm": 2.2140262126922607, "learning_rate": 9.216095936947955e-05, "loss": 3.4847, "step": 9235 }, { "epoch": 0.6278026905829597, "grad_norm": 2.3871517181396484, "learning_rate": 9.215671286859628e-05, "loss": 3.3973, "step": 9240 }, { "epoch": 0.6281424106536214, "grad_norm": 2.2835378646850586, "learning_rate": 9.215246636771301e-05, "loss": 3.3773, "step": 9245 }, { "epoch": 0.6284821307242832, "grad_norm": 2.3547534942626953, "learning_rate": 9.214821986682974e-05, "loss": 3.2761, "step": 9250 }, { "epoch": 0.628821850794945, "grad_norm": 2.603886127471924, "learning_rate": 9.214397336594647e-05, "loss": 3.2656, "step": 9255 }, { "epoch": 0.6291615708656068, "grad_norm": 2.165262460708618, "learning_rate": 9.21397268650632e-05, "loss": 3.4739, "step": 9260 }, { "epoch": 0.6295012909362685, "grad_norm": 2.0599539279937744, "learning_rate": 9.213548036417991e-05, "loss": 3.2759, "step": 9265 }, { "epoch": 0.6298410110069302, "grad_norm": 2.8846681118011475, "learning_rate": 9.213123386329665e-05, "loss": 3.3199, "step": 9270 }, { "epoch": 0.6301807310775921, "grad_norm": 2.104825258255005, "learning_rate": 9.212698736241338e-05, "loss": 3.4047, "step": 9275 }, { "epoch": 0.6305204511482538, "grad_norm": 2.2291181087493896, "learning_rate": 9.212274086153009e-05, "loss": 3.6294, "step": 9280 }, { "epoch": 0.6308601712189156, "grad_norm": 1.9005157947540283, "learning_rate": 9.211849436064683e-05, "loss": 3.5974, "step": 9285 }, { "epoch": 0.6311998912895774, "grad_norm": 2.375688076019287, "learning_rate": 9.211424785976356e-05, "loss": 3.0638, "step": 9290 }, { "epoch": 0.6315396113602392, "grad_norm": 2.006298303604126, "learning_rate": 9.211000135888028e-05, "loss": 3.4262, "step": 9295 }, { "epoch": 0.6318793314309009, "grad_norm": 2.4329819679260254, "learning_rate": 9.210575485799702e-05, "loss": 3.6207, "step": 9300 }, { "epoch": 0.6322190515015628, "grad_norm": 1.9812030792236328, "learning_rate": 9.210150835711375e-05, "loss": 3.4401, "step": 9305 }, { "epoch": 0.6325587715722245, "grad_norm": 2.0097873210906982, "learning_rate": 9.209726185623046e-05, "loss": 3.2702, "step": 9310 }, { "epoch": 0.6328984916428863, "grad_norm": 2.418948173522949, "learning_rate": 9.20930153553472e-05, "loss": 3.365, "step": 9315 }, { "epoch": 0.633238211713548, "grad_norm": 1.9337080717086792, "learning_rate": 9.208876885446393e-05, "loss": 3.3297, "step": 9320 }, { "epoch": 0.6335779317842098, "grad_norm": 2.334913492202759, "learning_rate": 9.208452235358065e-05, "loss": 3.5442, "step": 9325 }, { "epoch": 0.6339176518548716, "grad_norm": 1.979494333267212, "learning_rate": 9.208027585269739e-05, "loss": 3.5878, "step": 9330 }, { "epoch": 0.6342573719255333, "grad_norm": 2.2706778049468994, "learning_rate": 9.207602935181411e-05, "loss": 3.6239, "step": 9335 }, { "epoch": 0.6345970919961952, "grad_norm": 1.9782280921936035, "learning_rate": 9.207178285093083e-05, "loss": 3.5322, "step": 9340 }, { "epoch": 0.6349368120668569, "grad_norm": 2.21012282371521, "learning_rate": 9.206753635004757e-05, "loss": 3.4804, "step": 9345 }, { "epoch": 0.6352765321375187, "grad_norm": 1.9822043180465698, "learning_rate": 9.206328984916429e-05, "loss": 3.5511, "step": 9350 }, { "epoch": 0.6356162522081804, "grad_norm": 1.61638343334198, "learning_rate": 9.205904334828101e-05, "loss": 3.3277, "step": 9355 }, { "epoch": 0.6359559722788423, "grad_norm": 2.369020462036133, "learning_rate": 9.205479684739775e-05, "loss": 3.3067, "step": 9360 }, { "epoch": 0.636295692349504, "grad_norm": 2.076824426651001, "learning_rate": 9.205055034651447e-05, "loss": 3.5835, "step": 9365 }, { "epoch": 0.6366354124201657, "grad_norm": 1.8298687934875488, "learning_rate": 9.20463038456312e-05, "loss": 3.7377, "step": 9370 }, { "epoch": 0.6369751324908276, "grad_norm": 2.4179134368896484, "learning_rate": 9.204205734474794e-05, "loss": 3.3789, "step": 9375 }, { "epoch": 0.6373148525614893, "grad_norm": 2.1561992168426514, "learning_rate": 9.203781084386465e-05, "loss": 3.3464, "step": 9380 }, { "epoch": 0.6376545726321511, "grad_norm": 1.7424547672271729, "learning_rate": 9.20335643429814e-05, "loss": 3.4852, "step": 9385 }, { "epoch": 0.6379942927028129, "grad_norm": 2.0857956409454346, "learning_rate": 9.202931784209812e-05, "loss": 3.4334, "step": 9390 }, { "epoch": 0.6383340127734747, "grad_norm": 2.058171510696411, "learning_rate": 9.202507134121484e-05, "loss": 3.437, "step": 9395 }, { "epoch": 0.6386737328441364, "grad_norm": 2.1025898456573486, "learning_rate": 9.202082484033158e-05, "loss": 3.3733, "step": 9400 }, { "epoch": 0.6390134529147982, "grad_norm": 2.4429051876068115, "learning_rate": 9.201657833944831e-05, "loss": 3.3843, "step": 9405 }, { "epoch": 0.63935317298546, "grad_norm": 1.996283769607544, "learning_rate": 9.201233183856502e-05, "loss": 3.3407, "step": 9410 }, { "epoch": 0.6396928930561218, "grad_norm": 1.9609453678131104, "learning_rate": 9.200808533768176e-05, "loss": 3.2856, "step": 9415 }, { "epoch": 0.6400326131267835, "grad_norm": 2.6451570987701416, "learning_rate": 9.200383883679848e-05, "loss": 3.4036, "step": 9420 }, { "epoch": 0.6403723331974454, "grad_norm": 2.3675267696380615, "learning_rate": 9.19995923359152e-05, "loss": 3.5064, "step": 9425 }, { "epoch": 0.6407120532681071, "grad_norm": 2.80167818069458, "learning_rate": 9.199534583503195e-05, "loss": 3.4674, "step": 9430 }, { "epoch": 0.6410517733387688, "grad_norm": 2.440291404724121, "learning_rate": 9.199109933414866e-05, "loss": 3.28, "step": 9435 }, { "epoch": 0.6413914934094306, "grad_norm": 2.2997565269470215, "learning_rate": 9.198685283326539e-05, "loss": 3.4094, "step": 9440 }, { "epoch": 0.6417312134800924, "grad_norm": 2.3637266159057617, "learning_rate": 9.198260633238213e-05, "loss": 3.373, "step": 9445 }, { "epoch": 0.6420709335507542, "grad_norm": 2.2341365814208984, "learning_rate": 9.197835983149885e-05, "loss": 3.4228, "step": 9450 }, { "epoch": 0.6424106536214159, "grad_norm": 2.6212844848632812, "learning_rate": 9.197411333061557e-05, "loss": 3.3152, "step": 9455 }, { "epoch": 0.6427503736920778, "grad_norm": 2.5143659114837646, "learning_rate": 9.196986682973232e-05, "loss": 3.5497, "step": 9460 }, { "epoch": 0.6430900937627395, "grad_norm": 2.1178486347198486, "learning_rate": 9.196562032884903e-05, "loss": 3.3481, "step": 9465 }, { "epoch": 0.6434298138334013, "grad_norm": 2.7853009700775146, "learning_rate": 9.196137382796576e-05, "loss": 3.4423, "step": 9470 }, { "epoch": 0.6437695339040631, "grad_norm": 2.1077260971069336, "learning_rate": 9.19571273270825e-05, "loss": 3.5083, "step": 9475 }, { "epoch": 0.6441092539747248, "grad_norm": 2.360146999359131, "learning_rate": 9.195288082619921e-05, "loss": 3.3045, "step": 9480 }, { "epoch": 0.6444489740453866, "grad_norm": 2.139396905899048, "learning_rate": 9.194863432531594e-05, "loss": 3.9015, "step": 9485 }, { "epoch": 0.6447886941160483, "grad_norm": 2.755953311920166, "learning_rate": 9.194438782443267e-05, "loss": 3.4252, "step": 9490 }, { "epoch": 0.6451284141867102, "grad_norm": 2.182157278060913, "learning_rate": 9.19401413235494e-05, "loss": 3.3928, "step": 9495 }, { "epoch": 0.6454681342573719, "grad_norm": 2.243736982345581, "learning_rate": 9.193589482266613e-05, "loss": 3.3703, "step": 9500 }, { "epoch": 0.6458078543280337, "grad_norm": 1.755360722541809, "learning_rate": 9.193164832178285e-05, "loss": 3.2493, "step": 9505 }, { "epoch": 0.6461475743986955, "grad_norm": 1.9161643981933594, "learning_rate": 9.192740182089958e-05, "loss": 3.3648, "step": 9510 }, { "epoch": 0.6464872944693573, "grad_norm": 2.177321195602417, "learning_rate": 9.192315532001631e-05, "loss": 3.5422, "step": 9515 }, { "epoch": 0.646827014540019, "grad_norm": 2.789555311203003, "learning_rate": 9.191890881913304e-05, "loss": 3.5221, "step": 9520 }, { "epoch": 0.6471667346106807, "grad_norm": 2.1760945320129395, "learning_rate": 9.191466231824977e-05, "loss": 3.469, "step": 9525 }, { "epoch": 0.6475064546813426, "grad_norm": 1.720252513885498, "learning_rate": 9.19104158173665e-05, "loss": 3.4823, "step": 9530 }, { "epoch": 0.6478461747520043, "grad_norm": 2.2654337882995605, "learning_rate": 9.190616931648322e-05, "loss": 3.428, "step": 9535 }, { "epoch": 0.6481858948226661, "grad_norm": 4.516109466552734, "learning_rate": 9.190192281559995e-05, "loss": 3.592, "step": 9540 }, { "epoch": 0.6485256148933279, "grad_norm": 2.6933679580688477, "learning_rate": 9.189767631471668e-05, "loss": 3.5928, "step": 9545 }, { "epoch": 0.6488653349639897, "grad_norm": 2.0785329341888428, "learning_rate": 9.18934298138334e-05, "loss": 3.5918, "step": 9550 }, { "epoch": 0.6492050550346514, "grad_norm": 1.7163983583450317, "learning_rate": 9.188918331295013e-05, "loss": 3.3801, "step": 9555 }, { "epoch": 0.6495447751053133, "grad_norm": 2.295823812484741, "learning_rate": 9.188493681206686e-05, "loss": 3.5109, "step": 9560 }, { "epoch": 0.649884495175975, "grad_norm": 2.0109357833862305, "learning_rate": 9.188069031118359e-05, "loss": 3.6524, "step": 9565 }, { "epoch": 0.6502242152466368, "grad_norm": 2.102390766143799, "learning_rate": 9.187644381030032e-05, "loss": 3.418, "step": 9570 }, { "epoch": 0.6505639353172985, "grad_norm": 3.062297821044922, "learning_rate": 9.187219730941705e-05, "loss": 3.1961, "step": 9575 }, { "epoch": 0.6509036553879604, "grad_norm": 2.4372718334198, "learning_rate": 9.186795080853377e-05, "loss": 3.5075, "step": 9580 }, { "epoch": 0.6512433754586221, "grad_norm": 2.544154405593872, "learning_rate": 9.18637043076505e-05, "loss": 3.2986, "step": 9585 }, { "epoch": 0.6515830955292838, "grad_norm": 1.6882457733154297, "learning_rate": 9.185945780676723e-05, "loss": 3.4571, "step": 9590 }, { "epoch": 0.6519228155999457, "grad_norm": 2.341235637664795, "learning_rate": 9.185521130588396e-05, "loss": 3.5237, "step": 9595 }, { "epoch": 0.6522625356706074, "grad_norm": 2.0831446647644043, "learning_rate": 9.185096480500069e-05, "loss": 3.2293, "step": 9600 }, { "epoch": 0.6526022557412692, "grad_norm": 2.001333713531494, "learning_rate": 9.184671830411741e-05, "loss": 3.4746, "step": 9605 }, { "epoch": 0.6529419758119309, "grad_norm": 2.2685201168060303, "learning_rate": 9.184247180323414e-05, "loss": 3.4989, "step": 9610 }, { "epoch": 0.6532816958825928, "grad_norm": 2.427672863006592, "learning_rate": 9.183822530235087e-05, "loss": 3.4831, "step": 9615 }, { "epoch": 0.6536214159532545, "grad_norm": 2.1567723751068115, "learning_rate": 9.183397880146758e-05, "loss": 3.3328, "step": 9620 }, { "epoch": 0.6539611360239163, "grad_norm": 2.433497667312622, "learning_rate": 9.182973230058433e-05, "loss": 3.6096, "step": 9625 }, { "epoch": 0.6543008560945781, "grad_norm": 2.287774085998535, "learning_rate": 9.182548579970105e-05, "loss": 3.4264, "step": 9630 }, { "epoch": 0.6546405761652399, "grad_norm": 1.9578315019607544, "learning_rate": 9.182123929881777e-05, "loss": 3.3448, "step": 9635 }, { "epoch": 0.6549802962359016, "grad_norm": 2.3514602184295654, "learning_rate": 9.181699279793451e-05, "loss": 3.3377, "step": 9640 }, { "epoch": 0.6553200163065634, "grad_norm": 1.8604629039764404, "learning_rate": 9.181274629705124e-05, "loss": 3.3196, "step": 9645 }, { "epoch": 0.6556597363772252, "grad_norm": 2.3649957180023193, "learning_rate": 9.180849979616795e-05, "loss": 3.0222, "step": 9650 }, { "epoch": 0.6559994564478869, "grad_norm": 2.3329782485961914, "learning_rate": 9.18042532952847e-05, "loss": 3.4149, "step": 9655 }, { "epoch": 0.6563391765185487, "grad_norm": 2.2776143550872803, "learning_rate": 9.180000679440142e-05, "loss": 3.7491, "step": 9660 }, { "epoch": 0.6566788965892105, "grad_norm": 2.8028154373168945, "learning_rate": 9.179576029351814e-05, "loss": 3.3465, "step": 9665 }, { "epoch": 0.6570186166598723, "grad_norm": 2.7860090732574463, "learning_rate": 9.179151379263488e-05, "loss": 3.0798, "step": 9670 }, { "epoch": 0.657358336730534, "grad_norm": 2.304365634918213, "learning_rate": 9.17872672917516e-05, "loss": 3.2702, "step": 9675 }, { "epoch": 0.6576980568011959, "grad_norm": 2.0030386447906494, "learning_rate": 9.178302079086832e-05, "loss": 3.6393, "step": 9680 }, { "epoch": 0.6580377768718576, "grad_norm": 2.4066038131713867, "learning_rate": 9.177877428998506e-05, "loss": 3.2818, "step": 9685 }, { "epoch": 0.6583774969425193, "grad_norm": 3.0386524200439453, "learning_rate": 9.177452778910178e-05, "loss": 3.2272, "step": 9690 }, { "epoch": 0.6587172170131811, "grad_norm": 2.250406503677368, "learning_rate": 9.17702812882185e-05, "loss": 3.452, "step": 9695 }, { "epoch": 0.6590569370838429, "grad_norm": 2.1529886722564697, "learning_rate": 9.176603478733525e-05, "loss": 3.3902, "step": 9700 }, { "epoch": 0.6593966571545047, "grad_norm": 2.389617919921875, "learning_rate": 9.176178828645196e-05, "loss": 3.4426, "step": 9705 }, { "epoch": 0.6597363772251664, "grad_norm": 1.7933048009872437, "learning_rate": 9.175754178556869e-05, "loss": 3.4909, "step": 9710 }, { "epoch": 0.6600760972958283, "grad_norm": 2.0979044437408447, "learning_rate": 9.175329528468543e-05, "loss": 3.4606, "step": 9715 }, { "epoch": 0.66041581736649, "grad_norm": 1.7562857866287231, "learning_rate": 9.174904878380214e-05, "loss": 3.945, "step": 9720 }, { "epoch": 0.6607555374371518, "grad_norm": 2.451220989227295, "learning_rate": 9.174480228291889e-05, "loss": 3.5378, "step": 9725 }, { "epoch": 0.6610952575078136, "grad_norm": 2.4984099864959717, "learning_rate": 9.174055578203561e-05, "loss": 3.1811, "step": 9730 }, { "epoch": 0.6614349775784754, "grad_norm": 1.889421820640564, "learning_rate": 9.173630928115233e-05, "loss": 3.4117, "step": 9735 }, { "epoch": 0.6617746976491371, "grad_norm": 2.304583787918091, "learning_rate": 9.173206278026907e-05, "loss": 3.3953, "step": 9740 }, { "epoch": 0.6621144177197988, "grad_norm": 2.1655526161193848, "learning_rate": 9.17278162793858e-05, "loss": 3.435, "step": 9745 }, { "epoch": 0.6624541377904607, "grad_norm": 2.957169532775879, "learning_rate": 9.172356977850251e-05, "loss": 3.5148, "step": 9750 }, { "epoch": 0.6627938578611224, "grad_norm": 2.8838658332824707, "learning_rate": 9.171932327761925e-05, "loss": 3.6021, "step": 9755 }, { "epoch": 0.6631335779317842, "grad_norm": 2.703655958175659, "learning_rate": 9.171507677673598e-05, "loss": 3.3602, "step": 9760 }, { "epoch": 0.663473298002446, "grad_norm": 2.6492762565612793, "learning_rate": 9.17108302758527e-05, "loss": 3.1517, "step": 9765 }, { "epoch": 0.6638130180731078, "grad_norm": 2.315916061401367, "learning_rate": 9.170658377496944e-05, "loss": 3.5733, "step": 9770 }, { "epoch": 0.6641527381437695, "grad_norm": 3.022876262664795, "learning_rate": 9.170233727408615e-05, "loss": 3.2548, "step": 9775 }, { "epoch": 0.6644924582144313, "grad_norm": 1.8072370290756226, "learning_rate": 9.169809077320288e-05, "loss": 3.6563, "step": 9780 }, { "epoch": 0.6648321782850931, "grad_norm": 1.8629965782165527, "learning_rate": 9.169384427231962e-05, "loss": 3.6613, "step": 9785 }, { "epoch": 0.6651718983557549, "grad_norm": 1.7219549417495728, "learning_rate": 9.168959777143634e-05, "loss": 3.3072, "step": 9790 }, { "epoch": 0.6655116184264166, "grad_norm": 2.3677175045013428, "learning_rate": 9.168535127055306e-05, "loss": 3.1922, "step": 9795 }, { "epoch": 0.6658513384970784, "grad_norm": 2.2522263526916504, "learning_rate": 9.16811047696698e-05, "loss": 3.2677, "step": 9800 }, { "epoch": 0.6661910585677402, "grad_norm": 2.0835580825805664, "learning_rate": 9.167685826878652e-05, "loss": 3.3593, "step": 9805 }, { "epoch": 0.6665307786384019, "grad_norm": 1.9465687274932861, "learning_rate": 9.167261176790325e-05, "loss": 3.4494, "step": 9810 }, { "epoch": 0.6668704987090638, "grad_norm": 2.414717197418213, "learning_rate": 9.166836526701999e-05, "loss": 3.6492, "step": 9815 }, { "epoch": 0.6672102187797255, "grad_norm": 2.6287484169006348, "learning_rate": 9.16641187661367e-05, "loss": 3.7339, "step": 9820 }, { "epoch": 0.6675499388503873, "grad_norm": 2.2006239891052246, "learning_rate": 9.165987226525343e-05, "loss": 3.5375, "step": 9825 }, { "epoch": 0.667889658921049, "grad_norm": 1.995247721672058, "learning_rate": 9.165562576437017e-05, "loss": 3.4984, "step": 9830 }, { "epoch": 0.6682293789917109, "grad_norm": 2.1331167221069336, "learning_rate": 9.165137926348689e-05, "loss": 3.387, "step": 9835 }, { "epoch": 0.6685690990623726, "grad_norm": 2.292428493499756, "learning_rate": 9.164713276260362e-05, "loss": 3.4255, "step": 9840 }, { "epoch": 0.6689088191330343, "grad_norm": 2.4552714824676514, "learning_rate": 9.164288626172034e-05, "loss": 3.5577, "step": 9845 }, { "epoch": 0.6692485392036962, "grad_norm": 1.9254560470581055, "learning_rate": 9.163863976083707e-05, "loss": 3.2334, "step": 9850 }, { "epoch": 0.6695882592743579, "grad_norm": 2.543802499771118, "learning_rate": 9.16343932599538e-05, "loss": 3.6791, "step": 9855 }, { "epoch": 0.6699279793450197, "grad_norm": 1.9474387168884277, "learning_rate": 9.163014675907053e-05, "loss": 3.5802, "step": 9860 }, { "epoch": 0.6702676994156814, "grad_norm": 2.0926296710968018, "learning_rate": 9.162590025818726e-05, "loss": 3.5644, "step": 9865 }, { "epoch": 0.6706074194863433, "grad_norm": 2.4188356399536133, "learning_rate": 9.162165375730398e-05, "loss": 3.2545, "step": 9870 }, { "epoch": 0.670947139557005, "grad_norm": 1.9968558549880981, "learning_rate": 9.161740725642071e-05, "loss": 3.8078, "step": 9875 }, { "epoch": 0.6712868596276668, "grad_norm": 2.048319101333618, "learning_rate": 9.161316075553744e-05, "loss": 3.4627, "step": 9880 }, { "epoch": 0.6716265796983286, "grad_norm": 2.169461965560913, "learning_rate": 9.160891425465417e-05, "loss": 3.606, "step": 9885 }, { "epoch": 0.6719662997689904, "grad_norm": 1.8501156568527222, "learning_rate": 9.16046677537709e-05, "loss": 3.3269, "step": 9890 }, { "epoch": 0.6723060198396521, "grad_norm": 2.0102906227111816, "learning_rate": 9.160042125288762e-05, "loss": 3.194, "step": 9895 }, { "epoch": 0.672645739910314, "grad_norm": 3.0749316215515137, "learning_rate": 9.159617475200435e-05, "loss": 3.4971, "step": 9900 }, { "epoch": 0.6729854599809757, "grad_norm": 2.1286542415618896, "learning_rate": 9.159192825112108e-05, "loss": 3.2454, "step": 9905 }, { "epoch": 0.6733251800516374, "grad_norm": 1.8950390815734863, "learning_rate": 9.158768175023781e-05, "loss": 3.5541, "step": 9910 }, { "epoch": 0.6736649001222992, "grad_norm": 2.1330528259277344, "learning_rate": 9.158343524935454e-05, "loss": 3.5395, "step": 9915 }, { "epoch": 0.674004620192961, "grad_norm": 2.268120288848877, "learning_rate": 9.157918874847126e-05, "loss": 3.4602, "step": 9920 }, { "epoch": 0.6743443402636228, "grad_norm": 2.3372132778167725, "learning_rate": 9.157494224758799e-05, "loss": 3.3258, "step": 9925 }, { "epoch": 0.6746840603342845, "grad_norm": 2.7525475025177, "learning_rate": 9.157069574670472e-05, "loss": 3.4347, "step": 9930 }, { "epoch": 0.6750237804049464, "grad_norm": 2.215385913848877, "learning_rate": 9.156644924582145e-05, "loss": 3.5077, "step": 9935 }, { "epoch": 0.6753635004756081, "grad_norm": 2.8587841987609863, "learning_rate": 9.156220274493818e-05, "loss": 3.361, "step": 9940 }, { "epoch": 0.6757032205462699, "grad_norm": 2.27107572555542, "learning_rate": 9.15579562440549e-05, "loss": 3.4283, "step": 9945 }, { "epoch": 0.6760429406169316, "grad_norm": 1.9669655561447144, "learning_rate": 9.155370974317163e-05, "loss": 3.6681, "step": 9950 }, { "epoch": 0.6763826606875935, "grad_norm": 2.0418825149536133, "learning_rate": 9.154946324228836e-05, "loss": 3.3541, "step": 9955 }, { "epoch": 0.6767223807582552, "grad_norm": 1.9659185409545898, "learning_rate": 9.154521674140509e-05, "loss": 3.3576, "step": 9960 }, { "epoch": 0.6770621008289169, "grad_norm": 2.1350066661834717, "learning_rate": 9.154097024052182e-05, "loss": 3.406, "step": 9965 }, { "epoch": 0.6774018208995788, "grad_norm": 2.241499900817871, "learning_rate": 9.153672373963854e-05, "loss": 3.3658, "step": 9970 }, { "epoch": 0.6777415409702405, "grad_norm": 4.039486885070801, "learning_rate": 9.153247723875526e-05, "loss": 3.3798, "step": 9975 }, { "epoch": 0.6780812610409023, "grad_norm": 2.198073625564575, "learning_rate": 9.1528230737872e-05, "loss": 3.3894, "step": 9980 }, { "epoch": 0.6784209811115641, "grad_norm": 2.494109630584717, "learning_rate": 9.152398423698873e-05, "loss": 3.3189, "step": 9985 }, { "epoch": 0.6787607011822259, "grad_norm": 1.901872992515564, "learning_rate": 9.151973773610544e-05, "loss": 3.5007, "step": 9990 }, { "epoch": 0.6791004212528876, "grad_norm": 2.490457534790039, "learning_rate": 9.151549123522218e-05, "loss": 3.2335, "step": 9995 }, { "epoch": 0.6794401413235494, "grad_norm": 2.129027843475342, "learning_rate": 9.151124473433891e-05, "loss": 3.3407, "step": 10000 }, { "epoch": 0.6797798613942112, "grad_norm": 2.3029603958129883, "learning_rate": 9.150699823345563e-05, "loss": 3.6818, "step": 10005 }, { "epoch": 0.680119581464873, "grad_norm": 2.6103415489196777, "learning_rate": 9.150275173257237e-05, "loss": 3.4749, "step": 10010 }, { "epoch": 0.6804593015355347, "grad_norm": 2.0774784088134766, "learning_rate": 9.14985052316891e-05, "loss": 3.4361, "step": 10015 }, { "epoch": 0.6807990216061965, "grad_norm": 2.4018054008483887, "learning_rate": 9.149425873080581e-05, "loss": 3.3953, "step": 10020 }, { "epoch": 0.6811387416768583, "grad_norm": 2.4003589153289795, "learning_rate": 9.149001222992255e-05, "loss": 3.6101, "step": 10025 }, { "epoch": 0.68147846174752, "grad_norm": 2.0229439735412598, "learning_rate": 9.148576572903928e-05, "loss": 3.3307, "step": 10030 }, { "epoch": 0.6818181818181818, "grad_norm": 1.962277889251709, "learning_rate": 9.1481519228156e-05, "loss": 3.6952, "step": 10035 }, { "epoch": 0.6821579018888436, "grad_norm": 2.5740997791290283, "learning_rate": 9.147727272727274e-05, "loss": 3.3417, "step": 10040 }, { "epoch": 0.6824976219595054, "grad_norm": 2.2659201622009277, "learning_rate": 9.147302622638945e-05, "loss": 3.3707, "step": 10045 }, { "epoch": 0.6828373420301671, "grad_norm": 2.5698659420013428, "learning_rate": 9.146877972550618e-05, "loss": 3.2316, "step": 10050 }, { "epoch": 0.683177062100829, "grad_norm": 1.8414411544799805, "learning_rate": 9.146453322462292e-05, "loss": 3.4781, "step": 10055 }, { "epoch": 0.6835167821714907, "grad_norm": 2.0387251377105713, "learning_rate": 9.146028672373964e-05, "loss": 3.6213, "step": 10060 }, { "epoch": 0.6838565022421524, "grad_norm": 2.15816593170166, "learning_rate": 9.145604022285638e-05, "loss": 3.5245, "step": 10065 }, { "epoch": 0.6841962223128143, "grad_norm": 1.8952358961105347, "learning_rate": 9.14517937219731e-05, "loss": 3.706, "step": 10070 }, { "epoch": 0.684535942383476, "grad_norm": 2.976165533065796, "learning_rate": 9.144754722108982e-05, "loss": 3.1039, "step": 10075 }, { "epoch": 0.6848756624541378, "grad_norm": 1.9853427410125732, "learning_rate": 9.144330072020656e-05, "loss": 3.39, "step": 10080 }, { "epoch": 0.6852153825247995, "grad_norm": 2.650207757949829, "learning_rate": 9.143905421932329e-05, "loss": 3.3963, "step": 10085 }, { "epoch": 0.6855551025954614, "grad_norm": 2.18571400642395, "learning_rate": 9.143480771844e-05, "loss": 3.3268, "step": 10090 }, { "epoch": 0.6858948226661231, "grad_norm": 2.225778102874756, "learning_rate": 9.143056121755674e-05, "loss": 3.3894, "step": 10095 }, { "epoch": 0.6862345427367849, "grad_norm": 2.971393346786499, "learning_rate": 9.142631471667347e-05, "loss": 3.5052, "step": 10100 }, { "epoch": 0.6865742628074467, "grad_norm": 2.1307294368743896, "learning_rate": 9.142206821579019e-05, "loss": 3.2559, "step": 10105 }, { "epoch": 0.6869139828781085, "grad_norm": 2.2580299377441406, "learning_rate": 9.141782171490693e-05, "loss": 3.4579, "step": 10110 }, { "epoch": 0.6872537029487702, "grad_norm": 2.247525691986084, "learning_rate": 9.141357521402364e-05, "loss": 3.5955, "step": 10115 }, { "epoch": 0.6875934230194319, "grad_norm": 2.9665870666503906, "learning_rate": 9.140932871314037e-05, "loss": 3.1336, "step": 10120 }, { "epoch": 0.6879331430900938, "grad_norm": 2.1075119972229004, "learning_rate": 9.140508221225711e-05, "loss": 3.5494, "step": 10125 }, { "epoch": 0.6882728631607555, "grad_norm": 2.1065688133239746, "learning_rate": 9.140083571137383e-05, "loss": 3.2784, "step": 10130 }, { "epoch": 0.6886125832314173, "grad_norm": 1.5755414962768555, "learning_rate": 9.139658921049056e-05, "loss": 3.455, "step": 10135 }, { "epoch": 0.6889523033020791, "grad_norm": 2.1131350994110107, "learning_rate": 9.13923427096073e-05, "loss": 3.2843, "step": 10140 }, { "epoch": 0.6892920233727409, "grad_norm": 1.853386640548706, "learning_rate": 9.138809620872401e-05, "loss": 3.5551, "step": 10145 }, { "epoch": 0.6896317434434026, "grad_norm": 2.1120431423187256, "learning_rate": 9.138384970784074e-05, "loss": 3.5836, "step": 10150 }, { "epoch": 0.6899714635140645, "grad_norm": 2.3452959060668945, "learning_rate": 9.137960320695748e-05, "loss": 3.4539, "step": 10155 }, { "epoch": 0.6903111835847262, "grad_norm": 2.2407162189483643, "learning_rate": 9.13753567060742e-05, "loss": 3.3432, "step": 10160 }, { "epoch": 0.690650903655388, "grad_norm": 2.4129278659820557, "learning_rate": 9.137111020519092e-05, "loss": 3.3533, "step": 10165 }, { "epoch": 0.6909906237260497, "grad_norm": 2.294137477874756, "learning_rate": 9.136686370430767e-05, "loss": 3.3819, "step": 10170 }, { "epoch": 0.6913303437967115, "grad_norm": 2.1614489555358887, "learning_rate": 9.136261720342438e-05, "loss": 3.2577, "step": 10175 }, { "epoch": 0.6916700638673733, "grad_norm": 2.5635268688201904, "learning_rate": 9.135837070254111e-05, "loss": 3.5839, "step": 10180 }, { "epoch": 0.692009783938035, "grad_norm": 2.0306272506713867, "learning_rate": 9.135412420165785e-05, "loss": 2.997, "step": 10185 }, { "epoch": 0.6923495040086969, "grad_norm": 3.441389799118042, "learning_rate": 9.134987770077456e-05, "loss": 3.2866, "step": 10190 }, { "epoch": 0.6926892240793586, "grad_norm": 1.9897480010986328, "learning_rate": 9.134563119989129e-05, "loss": 3.5245, "step": 10195 }, { "epoch": 0.6930289441500204, "grad_norm": 2.4812498092651367, "learning_rate": 9.134138469900802e-05, "loss": 3.4362, "step": 10200 }, { "epoch": 0.6933686642206821, "grad_norm": 2.473639726638794, "learning_rate": 9.133713819812475e-05, "loss": 3.2035, "step": 10205 }, { "epoch": 0.693708384291344, "grad_norm": 1.88172447681427, "learning_rate": 9.133289169724148e-05, "loss": 3.415, "step": 10210 }, { "epoch": 0.6940481043620057, "grad_norm": 2.242117166519165, "learning_rate": 9.13286451963582e-05, "loss": 3.5095, "step": 10215 }, { "epoch": 0.6943878244326674, "grad_norm": 3.2184598445892334, "learning_rate": 9.132439869547493e-05, "loss": 3.582, "step": 10220 }, { "epoch": 0.6947275445033293, "grad_norm": 2.613415002822876, "learning_rate": 9.132015219459166e-05, "loss": 3.4259, "step": 10225 }, { "epoch": 0.695067264573991, "grad_norm": 2.203864097595215, "learning_rate": 9.131590569370839e-05, "loss": 3.5516, "step": 10230 }, { "epoch": 0.6954069846446528, "grad_norm": 2.1330056190490723, "learning_rate": 9.131165919282512e-05, "loss": 3.5234, "step": 10235 }, { "epoch": 0.6957467047153146, "grad_norm": 2.382392644882202, "learning_rate": 9.130741269194184e-05, "loss": 3.4453, "step": 10240 }, { "epoch": 0.6960864247859764, "grad_norm": 2.107342481613159, "learning_rate": 9.130316619105857e-05, "loss": 3.622, "step": 10245 }, { "epoch": 0.6964261448566381, "grad_norm": 2.135791540145874, "learning_rate": 9.12989196901753e-05, "loss": 3.3522, "step": 10250 }, { "epoch": 0.6967658649272999, "grad_norm": 3.066088914871216, "learning_rate": 9.129467318929203e-05, "loss": 3.3842, "step": 10255 }, { "epoch": 0.6971055849979617, "grad_norm": 3.687159538269043, "learning_rate": 9.129042668840876e-05, "loss": 3.4694, "step": 10260 }, { "epoch": 0.6974453050686235, "grad_norm": 2.5193636417388916, "learning_rate": 9.128618018752548e-05, "loss": 3.4444, "step": 10265 }, { "epoch": 0.6977850251392852, "grad_norm": 2.3423962593078613, "learning_rate": 9.128193368664221e-05, "loss": 3.6123, "step": 10270 }, { "epoch": 0.698124745209947, "grad_norm": 1.9859436750411987, "learning_rate": 9.127768718575894e-05, "loss": 3.2797, "step": 10275 }, { "epoch": 0.6984644652806088, "grad_norm": 2.5177195072174072, "learning_rate": 9.127344068487567e-05, "loss": 3.3931, "step": 10280 }, { "epoch": 0.6988041853512705, "grad_norm": 2.1163747310638428, "learning_rate": 9.12691941839924e-05, "loss": 3.3861, "step": 10285 }, { "epoch": 0.6991439054219323, "grad_norm": 2.503312826156616, "learning_rate": 9.126494768310912e-05, "loss": 3.3352, "step": 10290 }, { "epoch": 0.6994836254925941, "grad_norm": 2.0320215225219727, "learning_rate": 9.126070118222585e-05, "loss": 3.0865, "step": 10295 }, { "epoch": 0.6998233455632559, "grad_norm": 1.8928866386413574, "learning_rate": 9.125645468134258e-05, "loss": 3.4041, "step": 10300 }, { "epoch": 0.7001630656339176, "grad_norm": 2.484449863433838, "learning_rate": 9.125220818045931e-05, "loss": 3.2752, "step": 10305 }, { "epoch": 0.7005027857045795, "grad_norm": 2.4888932704925537, "learning_rate": 9.124796167957604e-05, "loss": 3.3456, "step": 10310 }, { "epoch": 0.7008425057752412, "grad_norm": 2.85724139213562, "learning_rate": 9.124371517869275e-05, "loss": 3.4374, "step": 10315 }, { "epoch": 0.701182225845903, "grad_norm": 1.9421014785766602, "learning_rate": 9.123946867780949e-05, "loss": 3.3827, "step": 10320 }, { "epoch": 0.7015219459165648, "grad_norm": 1.892106294631958, "learning_rate": 9.123522217692622e-05, "loss": 3.412, "step": 10325 }, { "epoch": 0.7018616659872265, "grad_norm": 2.7637858390808105, "learning_rate": 9.123097567604293e-05, "loss": 3.452, "step": 10330 }, { "epoch": 0.7022013860578883, "grad_norm": 2.0583410263061523, "learning_rate": 9.122672917515968e-05, "loss": 3.3105, "step": 10335 }, { "epoch": 0.70254110612855, "grad_norm": 1.8880698680877686, "learning_rate": 9.12224826742764e-05, "loss": 3.2688, "step": 10340 }, { "epoch": 0.7028808261992119, "grad_norm": 2.3727846145629883, "learning_rate": 9.121823617339312e-05, "loss": 3.4935, "step": 10345 }, { "epoch": 0.7032205462698736, "grad_norm": 2.1201822757720947, "learning_rate": 9.121398967250986e-05, "loss": 3.3217, "step": 10350 }, { "epoch": 0.7035602663405354, "grad_norm": 2.1377501487731934, "learning_rate": 9.120974317162659e-05, "loss": 3.4306, "step": 10355 }, { "epoch": 0.7038999864111972, "grad_norm": 2.378638982772827, "learning_rate": 9.12054966707433e-05, "loss": 3.6065, "step": 10360 }, { "epoch": 0.704239706481859, "grad_norm": 2.6018004417419434, "learning_rate": 9.120125016986004e-05, "loss": 3.1431, "step": 10365 }, { "epoch": 0.7045794265525207, "grad_norm": 2.800001382827759, "learning_rate": 9.119700366897677e-05, "loss": 3.4992, "step": 10370 }, { "epoch": 0.7049191466231824, "grad_norm": 1.935689926147461, "learning_rate": 9.119275716809349e-05, "loss": 3.3126, "step": 10375 }, { "epoch": 0.7052588666938443, "grad_norm": 1.9114010334014893, "learning_rate": 9.118851066721023e-05, "loss": 3.5247, "step": 10380 }, { "epoch": 0.705598586764506, "grad_norm": 2.6919593811035156, "learning_rate": 9.118426416632696e-05, "loss": 3.5271, "step": 10385 }, { "epoch": 0.7059383068351678, "grad_norm": 2.280654191970825, "learning_rate": 9.118001766544367e-05, "loss": 3.4718, "step": 10390 }, { "epoch": 0.7062780269058296, "grad_norm": 2.0293657779693604, "learning_rate": 9.117577116456041e-05, "loss": 3.5115, "step": 10395 }, { "epoch": 0.7066177469764914, "grad_norm": 2.0440328121185303, "learning_rate": 9.117152466367713e-05, "loss": 3.3796, "step": 10400 }, { "epoch": 0.7069574670471531, "grad_norm": 2.232790231704712, "learning_rate": 9.116727816279387e-05, "loss": 3.3604, "step": 10405 }, { "epoch": 0.707297187117815, "grad_norm": 1.9614242315292358, "learning_rate": 9.11630316619106e-05, "loss": 3.4927, "step": 10410 }, { "epoch": 0.7076369071884767, "grad_norm": 1.8019899129867554, "learning_rate": 9.115878516102731e-05, "loss": 3.5046, "step": 10415 }, { "epoch": 0.7079766272591385, "grad_norm": 2.18086314201355, "learning_rate": 9.115453866014405e-05, "loss": 3.5707, "step": 10420 }, { "epoch": 0.7083163473298002, "grad_norm": 2.0439870357513428, "learning_rate": 9.115029215926078e-05, "loss": 3.6305, "step": 10425 }, { "epoch": 0.708656067400462, "grad_norm": 1.9378706216812134, "learning_rate": 9.11460456583775e-05, "loss": 3.718, "step": 10430 }, { "epoch": 0.7089957874711238, "grad_norm": 1.7242690324783325, "learning_rate": 9.114179915749424e-05, "loss": 3.6396, "step": 10435 }, { "epoch": 0.7093355075417855, "grad_norm": 1.859023094177246, "learning_rate": 9.113755265661096e-05, "loss": 3.5972, "step": 10440 }, { "epoch": 0.7096752276124474, "grad_norm": 1.8366934061050415, "learning_rate": 9.113330615572768e-05, "loss": 3.5045, "step": 10445 }, { "epoch": 0.7100149476831091, "grad_norm": 2.2177224159240723, "learning_rate": 9.112905965484442e-05, "loss": 3.4228, "step": 10450 }, { "epoch": 0.7103546677537709, "grad_norm": 2.008777618408203, "learning_rate": 9.112481315396115e-05, "loss": 3.6157, "step": 10455 }, { "epoch": 0.7106943878244326, "grad_norm": 2.003328323364258, "learning_rate": 9.112056665307786e-05, "loss": 3.4033, "step": 10460 }, { "epoch": 0.7110341078950945, "grad_norm": 2.2293601036071777, "learning_rate": 9.11163201521946e-05, "loss": 3.4233, "step": 10465 }, { "epoch": 0.7113738279657562, "grad_norm": 1.7605472803115845, "learning_rate": 9.111207365131132e-05, "loss": 3.5793, "step": 10470 }, { "epoch": 0.711713548036418, "grad_norm": 2.229853868484497, "learning_rate": 9.110782715042805e-05, "loss": 3.3773, "step": 10475 }, { "epoch": 0.7120532681070798, "grad_norm": 2.0041186809539795, "learning_rate": 9.110358064954479e-05, "loss": 3.5001, "step": 10480 }, { "epoch": 0.7123929881777415, "grad_norm": 2.100130081176758, "learning_rate": 9.10993341486615e-05, "loss": 3.0908, "step": 10485 }, { "epoch": 0.7127327082484033, "grad_norm": 1.9602878093719482, "learning_rate": 9.109508764777823e-05, "loss": 3.4569, "step": 10490 }, { "epoch": 0.7130724283190651, "grad_norm": 3.0202150344848633, "learning_rate": 9.109084114689497e-05, "loss": 3.3881, "step": 10495 }, { "epoch": 0.7134121483897269, "grad_norm": 2.5624940395355225, "learning_rate": 9.108659464601169e-05, "loss": 3.3818, "step": 10500 }, { "epoch": 0.7137518684603886, "grad_norm": 2.795262575149536, "learning_rate": 9.108234814512841e-05, "loss": 3.4421, "step": 10505 }, { "epoch": 0.7140915885310504, "grad_norm": 1.8148218393325806, "learning_rate": 9.107810164424516e-05, "loss": 3.6402, "step": 10510 }, { "epoch": 0.7144313086017122, "grad_norm": 1.9620920419692993, "learning_rate": 9.107385514336187e-05, "loss": 3.3249, "step": 10515 }, { "epoch": 0.714771028672374, "grad_norm": 2.484452962875366, "learning_rate": 9.10696086424786e-05, "loss": 3.3529, "step": 10520 }, { "epoch": 0.7151107487430357, "grad_norm": 2.289315938949585, "learning_rate": 9.106536214159534e-05, "loss": 3.4068, "step": 10525 }, { "epoch": 0.7154504688136976, "grad_norm": 2.0351531505584717, "learning_rate": 9.106111564071205e-05, "loss": 3.2509, "step": 10530 }, { "epoch": 0.7157901888843593, "grad_norm": 1.8132216930389404, "learning_rate": 9.105686913982878e-05, "loss": 3.522, "step": 10535 }, { "epoch": 0.716129908955021, "grad_norm": 2.4029035568237305, "learning_rate": 9.105262263894551e-05, "loss": 3.5176, "step": 10540 }, { "epoch": 0.7164696290256828, "grad_norm": 2.159874677658081, "learning_rate": 9.104837613806224e-05, "loss": 3.2876, "step": 10545 }, { "epoch": 0.7168093490963446, "grad_norm": 2.4289910793304443, "learning_rate": 9.104412963717897e-05, "loss": 3.5906, "step": 10550 }, { "epoch": 0.7171490691670064, "grad_norm": 1.7269231081008911, "learning_rate": 9.10398831362957e-05, "loss": 3.5046, "step": 10555 }, { "epoch": 0.7174887892376681, "grad_norm": 1.981059193611145, "learning_rate": 9.103563663541242e-05, "loss": 3.4799, "step": 10560 }, { "epoch": 0.71782850930833, "grad_norm": 1.7709567546844482, "learning_rate": 9.103139013452915e-05, "loss": 3.5944, "step": 10565 }, { "epoch": 0.7181682293789917, "grad_norm": 1.9505103826522827, "learning_rate": 9.102714363364588e-05, "loss": 3.5716, "step": 10570 }, { "epoch": 0.7185079494496535, "grad_norm": 2.706106185913086, "learning_rate": 9.10228971327626e-05, "loss": 3.343, "step": 10575 }, { "epoch": 0.7188476695203153, "grad_norm": 2.7203116416931152, "learning_rate": 9.101865063187933e-05, "loss": 3.3993, "step": 10580 }, { "epoch": 0.719187389590977, "grad_norm": 3.2484118938446045, "learning_rate": 9.101440413099606e-05, "loss": 3.264, "step": 10585 }, { "epoch": 0.7195271096616388, "grad_norm": 2.256519079208374, "learning_rate": 9.101015763011279e-05, "loss": 3.5549, "step": 10590 }, { "epoch": 0.7198668297323005, "grad_norm": 2.5238773822784424, "learning_rate": 9.100591112922952e-05, "loss": 3.2853, "step": 10595 }, { "epoch": 0.7202065498029624, "grad_norm": 2.011028528213501, "learning_rate": 9.100166462834625e-05, "loss": 3.3662, "step": 10600 }, { "epoch": 0.7205462698736241, "grad_norm": 2.2392654418945312, "learning_rate": 9.099741812746297e-05, "loss": 3.5608, "step": 10605 }, { "epoch": 0.7208859899442859, "grad_norm": 1.7563396692276, "learning_rate": 9.09931716265797e-05, "loss": 3.4139, "step": 10610 }, { "epoch": 0.7212257100149477, "grad_norm": 1.82901132106781, "learning_rate": 9.098892512569643e-05, "loss": 3.4043, "step": 10615 }, { "epoch": 0.7215654300856095, "grad_norm": 2.18038010597229, "learning_rate": 9.098467862481316e-05, "loss": 3.4499, "step": 10620 }, { "epoch": 0.7219051501562712, "grad_norm": 2.0786614418029785, "learning_rate": 9.098043212392989e-05, "loss": 3.6284, "step": 10625 }, { "epoch": 0.722244870226933, "grad_norm": 1.9242092370986938, "learning_rate": 9.097618562304661e-05, "loss": 3.4669, "step": 10630 }, { "epoch": 0.7225845902975948, "grad_norm": 2.1038947105407715, "learning_rate": 9.097193912216334e-05, "loss": 3.3863, "step": 10635 }, { "epoch": 0.7229243103682566, "grad_norm": 2.0907907485961914, "learning_rate": 9.096769262128007e-05, "loss": 3.3652, "step": 10640 }, { "epoch": 0.7232640304389183, "grad_norm": 2.884026527404785, "learning_rate": 9.09634461203968e-05, "loss": 3.4208, "step": 10645 }, { "epoch": 0.7236037505095801, "grad_norm": 2.813676357269287, "learning_rate": 9.095919961951353e-05, "loss": 3.3257, "step": 10650 }, { "epoch": 0.7239434705802419, "grad_norm": 2.612833261489868, "learning_rate": 9.095495311863025e-05, "loss": 3.3911, "step": 10655 }, { "epoch": 0.7242831906509036, "grad_norm": 2.0771286487579346, "learning_rate": 9.095070661774698e-05, "loss": 3.1228, "step": 10660 }, { "epoch": 0.7246229107215655, "grad_norm": 2.3251900672912598, "learning_rate": 9.094646011686371e-05, "loss": 3.4598, "step": 10665 }, { "epoch": 0.7249626307922272, "grad_norm": 2.2367258071899414, "learning_rate": 9.094221361598043e-05, "loss": 3.4079, "step": 10670 }, { "epoch": 0.725302350862889, "grad_norm": 2.5025484561920166, "learning_rate": 9.093796711509717e-05, "loss": 3.5936, "step": 10675 }, { "epoch": 0.7256420709335507, "grad_norm": 2.1150619983673096, "learning_rate": 9.09337206142139e-05, "loss": 3.446, "step": 10680 }, { "epoch": 0.7259817910042126, "grad_norm": 2.5962636470794678, "learning_rate": 9.092947411333061e-05, "loss": 3.3378, "step": 10685 }, { "epoch": 0.7263215110748743, "grad_norm": 1.8724310398101807, "learning_rate": 9.092522761244735e-05, "loss": 3.4324, "step": 10690 }, { "epoch": 0.726661231145536, "grad_norm": 2.1106648445129395, "learning_rate": 9.092098111156408e-05, "loss": 3.4576, "step": 10695 }, { "epoch": 0.7270009512161979, "grad_norm": 3.0349202156066895, "learning_rate": 9.091673461068079e-05, "loss": 3.6279, "step": 10700 }, { "epoch": 0.7273406712868596, "grad_norm": 2.1640307903289795, "learning_rate": 9.091248810979753e-05, "loss": 3.5625, "step": 10705 }, { "epoch": 0.7276803913575214, "grad_norm": 2.3512585163116455, "learning_rate": 9.090824160891426e-05, "loss": 3.6583, "step": 10710 }, { "epoch": 0.7280201114281831, "grad_norm": 1.8274447917938232, "learning_rate": 9.090399510803098e-05, "loss": 3.5874, "step": 10715 }, { "epoch": 0.728359831498845, "grad_norm": 2.6690142154693604, "learning_rate": 9.089974860714772e-05, "loss": 3.2241, "step": 10720 }, { "epoch": 0.7286995515695067, "grad_norm": 1.6083638668060303, "learning_rate": 9.089550210626445e-05, "loss": 3.4425, "step": 10725 }, { "epoch": 0.7290392716401685, "grad_norm": 1.9268238544464111, "learning_rate": 9.089125560538116e-05, "loss": 3.2873, "step": 10730 }, { "epoch": 0.7293789917108303, "grad_norm": 2.3528811931610107, "learning_rate": 9.08870091044979e-05, "loss": 3.5524, "step": 10735 }, { "epoch": 0.7297187117814921, "grad_norm": 2.069373846054077, "learning_rate": 9.088276260361463e-05, "loss": 3.609, "step": 10740 }, { "epoch": 0.7300584318521538, "grad_norm": 1.9688727855682373, "learning_rate": 9.087851610273136e-05, "loss": 3.3181, "step": 10745 }, { "epoch": 0.7303981519228157, "grad_norm": 2.8601443767547607, "learning_rate": 9.087426960184809e-05, "loss": 3.4131, "step": 10750 }, { "epoch": 0.7307378719934774, "grad_norm": 2.1676247119903564, "learning_rate": 9.08700231009648e-05, "loss": 3.4029, "step": 10755 }, { "epoch": 0.7310775920641391, "grad_norm": 2.119716167449951, "learning_rate": 9.086577660008154e-05, "loss": 3.6311, "step": 10760 }, { "epoch": 0.7314173121348009, "grad_norm": 2.2926578521728516, "learning_rate": 9.086153009919827e-05, "loss": 3.5058, "step": 10765 }, { "epoch": 0.7317570322054627, "grad_norm": 2.3588671684265137, "learning_rate": 9.085728359831499e-05, "loss": 3.3528, "step": 10770 }, { "epoch": 0.7320967522761245, "grad_norm": 2.3392527103424072, "learning_rate": 9.085303709743173e-05, "loss": 3.6571, "step": 10775 }, { "epoch": 0.7324364723467862, "grad_norm": 1.9288994073867798, "learning_rate": 9.084879059654845e-05, "loss": 3.4191, "step": 10780 }, { "epoch": 0.7327761924174481, "grad_norm": 2.903181552886963, "learning_rate": 9.084454409566517e-05, "loss": 3.5195, "step": 10785 }, { "epoch": 0.7331159124881098, "grad_norm": 2.0483531951904297, "learning_rate": 9.084029759478191e-05, "loss": 3.288, "step": 10790 }, { "epoch": 0.7334556325587716, "grad_norm": 2.4179515838623047, "learning_rate": 9.083605109389864e-05, "loss": 3.4188, "step": 10795 }, { "epoch": 0.7337953526294333, "grad_norm": 2.3747267723083496, "learning_rate": 9.083180459301535e-05, "loss": 3.3079, "step": 10800 }, { "epoch": 0.7341350727000951, "grad_norm": 2.533458948135376, "learning_rate": 9.08275580921321e-05, "loss": 3.5124, "step": 10805 }, { "epoch": 0.7344747927707569, "grad_norm": 2.1378977298736572, "learning_rate": 9.082331159124882e-05, "loss": 3.473, "step": 10810 }, { "epoch": 0.7348145128414186, "grad_norm": 2.3398780822753906, "learning_rate": 9.081906509036554e-05, "loss": 3.3911, "step": 10815 }, { "epoch": 0.7351542329120805, "grad_norm": 2.040144681930542, "learning_rate": 9.081481858948228e-05, "loss": 3.4196, "step": 10820 }, { "epoch": 0.7354939529827422, "grad_norm": 1.981629729270935, "learning_rate": 9.0810572088599e-05, "loss": 3.4989, "step": 10825 }, { "epoch": 0.735833673053404, "grad_norm": 2.342792272567749, "learning_rate": 9.080632558771572e-05, "loss": 3.4088, "step": 10830 }, { "epoch": 0.7361733931240658, "grad_norm": 2.331023931503296, "learning_rate": 9.080207908683246e-05, "loss": 3.4984, "step": 10835 }, { "epoch": 0.7365131131947276, "grad_norm": 2.042898654937744, "learning_rate": 9.079783258594918e-05, "loss": 3.4063, "step": 10840 }, { "epoch": 0.7368528332653893, "grad_norm": 4.239623546600342, "learning_rate": 9.07935860850659e-05, "loss": 3.3617, "step": 10845 }, { "epoch": 0.737192553336051, "grad_norm": 2.5794546604156494, "learning_rate": 9.078933958418265e-05, "loss": 3.5826, "step": 10850 }, { "epoch": 0.7375322734067129, "grad_norm": 1.8803948163986206, "learning_rate": 9.078509308329936e-05, "loss": 3.3706, "step": 10855 }, { "epoch": 0.7378719934773746, "grad_norm": 1.9100732803344727, "learning_rate": 9.078084658241609e-05, "loss": 3.5624, "step": 10860 }, { "epoch": 0.7382117135480364, "grad_norm": 1.75567626953125, "learning_rate": 9.077660008153283e-05, "loss": 3.6124, "step": 10865 }, { "epoch": 0.7385514336186982, "grad_norm": 2.0038938522338867, "learning_rate": 9.077235358064955e-05, "loss": 3.4023, "step": 10870 }, { "epoch": 0.73889115368936, "grad_norm": 2.361287832260132, "learning_rate": 9.076810707976627e-05, "loss": 3.4801, "step": 10875 }, { "epoch": 0.7392308737600217, "grad_norm": 2.0191619396209717, "learning_rate": 9.076386057888302e-05, "loss": 3.4571, "step": 10880 }, { "epoch": 0.7395705938306835, "grad_norm": 1.9986028671264648, "learning_rate": 9.075961407799973e-05, "loss": 3.4198, "step": 10885 }, { "epoch": 0.7399103139013453, "grad_norm": 2.421579360961914, "learning_rate": 9.075536757711646e-05, "loss": 3.3339, "step": 10890 }, { "epoch": 0.7402500339720071, "grad_norm": 2.5237321853637695, "learning_rate": 9.075112107623319e-05, "loss": 3.4519, "step": 10895 }, { "epoch": 0.7405897540426688, "grad_norm": 4.795865535736084, "learning_rate": 9.074687457534991e-05, "loss": 3.33, "step": 10900 }, { "epoch": 0.7409294741133307, "grad_norm": 3.025031089782715, "learning_rate": 9.074262807446664e-05, "loss": 3.5102, "step": 10905 }, { "epoch": 0.7412691941839924, "grad_norm": 2.056140422821045, "learning_rate": 9.073838157358337e-05, "loss": 3.4382, "step": 10910 }, { "epoch": 0.7416089142546541, "grad_norm": 1.9269989728927612, "learning_rate": 9.07341350727001e-05, "loss": 3.6869, "step": 10915 }, { "epoch": 0.741948634325316, "grad_norm": 2.2858726978302, "learning_rate": 9.072988857181683e-05, "loss": 3.387, "step": 10920 }, { "epoch": 0.7422883543959777, "grad_norm": 2.283271074295044, "learning_rate": 9.072564207093355e-05, "loss": 3.3383, "step": 10925 }, { "epoch": 0.7426280744666395, "grad_norm": 2.2543795108795166, "learning_rate": 9.072139557005028e-05, "loss": 3.4783, "step": 10930 }, { "epoch": 0.7429677945373012, "grad_norm": 2.2956485748291016, "learning_rate": 9.071714906916701e-05, "loss": 3.3174, "step": 10935 }, { "epoch": 0.7433075146079631, "grad_norm": 2.596615791320801, "learning_rate": 9.071290256828374e-05, "loss": 3.3084, "step": 10940 }, { "epoch": 0.7436472346786248, "grad_norm": 1.9923878908157349, "learning_rate": 9.070865606740047e-05, "loss": 3.4221, "step": 10945 }, { "epoch": 0.7439869547492866, "grad_norm": 1.891037106513977, "learning_rate": 9.07044095665172e-05, "loss": 3.7177, "step": 10950 }, { "epoch": 0.7443266748199484, "grad_norm": 1.9931647777557373, "learning_rate": 9.070016306563392e-05, "loss": 3.2509, "step": 10955 }, { "epoch": 0.7446663948906102, "grad_norm": 1.6091890335083008, "learning_rate": 9.069591656475065e-05, "loss": 3.376, "step": 10960 }, { "epoch": 0.7450061149612719, "grad_norm": 2.1172564029693604, "learning_rate": 9.069167006386738e-05, "loss": 3.6299, "step": 10965 }, { "epoch": 0.7453458350319336, "grad_norm": 2.5945401191711426, "learning_rate": 9.06874235629841e-05, "loss": 3.3684, "step": 10970 }, { "epoch": 0.7456855551025955, "grad_norm": 2.4114725589752197, "learning_rate": 9.068317706210083e-05, "loss": 3.4607, "step": 10975 }, { "epoch": 0.7460252751732572, "grad_norm": 2.6435694694519043, "learning_rate": 9.067893056121756e-05, "loss": 3.3231, "step": 10980 }, { "epoch": 0.746364995243919, "grad_norm": 1.8973042964935303, "learning_rate": 9.067468406033429e-05, "loss": 3.4685, "step": 10985 }, { "epoch": 0.7467047153145808, "grad_norm": 2.3614795207977295, "learning_rate": 9.067043755945102e-05, "loss": 3.2778, "step": 10990 }, { "epoch": 0.7470444353852426, "grad_norm": 2.1101648807525635, "learning_rate": 9.066619105856775e-05, "loss": 3.2096, "step": 10995 }, { "epoch": 0.7473841554559043, "grad_norm": 1.946252703666687, "learning_rate": 9.066194455768447e-05, "loss": 3.2057, "step": 11000 }, { "epoch": 0.7477238755265662, "grad_norm": 2.2493433952331543, "learning_rate": 9.06576980568012e-05, "loss": 3.2899, "step": 11005 }, { "epoch": 0.7480635955972279, "grad_norm": 2.2197651863098145, "learning_rate": 9.065345155591793e-05, "loss": 3.3169, "step": 11010 }, { "epoch": 0.7484033156678896, "grad_norm": 1.8556232452392578, "learning_rate": 9.064920505503466e-05, "loss": 3.0973, "step": 11015 }, { "epoch": 0.7487430357385514, "grad_norm": 2.3237788677215576, "learning_rate": 9.064495855415139e-05, "loss": 3.4265, "step": 11020 }, { "epoch": 0.7490827558092132, "grad_norm": 2.0329315662384033, "learning_rate": 9.06407120532681e-05, "loss": 3.4525, "step": 11025 }, { "epoch": 0.749422475879875, "grad_norm": 2.1035406589508057, "learning_rate": 9.063646555238484e-05, "loss": 3.4395, "step": 11030 }, { "epoch": 0.7497621959505367, "grad_norm": 2.091654062271118, "learning_rate": 9.063221905150157e-05, "loss": 3.4969, "step": 11035 }, { "epoch": 0.7501019160211986, "grad_norm": 2.510631561279297, "learning_rate": 9.062797255061828e-05, "loss": 3.5577, "step": 11040 }, { "epoch": 0.7504416360918603, "grad_norm": 2.022939920425415, "learning_rate": 9.062372604973503e-05, "loss": 3.4426, "step": 11045 }, { "epoch": 0.7507813561625221, "grad_norm": 2.287958860397339, "learning_rate": 9.061947954885175e-05, "loss": 3.4774, "step": 11050 }, { "epoch": 0.7511210762331838, "grad_norm": 2.3949368000030518, "learning_rate": 9.061523304796847e-05, "loss": 3.1168, "step": 11055 }, { "epoch": 0.7514607963038457, "grad_norm": 2.22428297996521, "learning_rate": 9.061098654708521e-05, "loss": 3.5221, "step": 11060 }, { "epoch": 0.7518005163745074, "grad_norm": 2.5449883937835693, "learning_rate": 9.060674004620194e-05, "loss": 3.587, "step": 11065 }, { "epoch": 0.7521402364451691, "grad_norm": 2.7419300079345703, "learning_rate": 9.060249354531865e-05, "loss": 3.3353, "step": 11070 }, { "epoch": 0.752479956515831, "grad_norm": 1.9885202646255493, "learning_rate": 9.05982470444354e-05, "loss": 3.4969, "step": 11075 }, { "epoch": 0.7528196765864927, "grad_norm": 1.9539488554000854, "learning_rate": 9.059400054355212e-05, "loss": 3.4032, "step": 11080 }, { "epoch": 0.7531593966571545, "grad_norm": 2.6884703636169434, "learning_rate": 9.058975404266885e-05, "loss": 3.2384, "step": 11085 }, { "epoch": 0.7534991167278163, "grad_norm": 3.0844130516052246, "learning_rate": 9.058550754178558e-05, "loss": 3.4376, "step": 11090 }, { "epoch": 0.7538388367984781, "grad_norm": 1.7788217067718506, "learning_rate": 9.058126104090229e-05, "loss": 3.5911, "step": 11095 }, { "epoch": 0.7541785568691398, "grad_norm": 1.9761929512023926, "learning_rate": 9.057701454001903e-05, "loss": 3.2833, "step": 11100 }, { "epoch": 0.7545182769398016, "grad_norm": 1.8782196044921875, "learning_rate": 9.057276803913576e-05, "loss": 3.4258, "step": 11105 }, { "epoch": 0.7548579970104634, "grad_norm": 1.9788384437561035, "learning_rate": 9.056852153825248e-05, "loss": 3.367, "step": 11110 }, { "epoch": 0.7551977170811252, "grad_norm": 2.1012284755706787, "learning_rate": 9.056427503736922e-05, "loss": 3.3178, "step": 11115 }, { "epoch": 0.7555374371517869, "grad_norm": 2.2467188835144043, "learning_rate": 9.056002853648595e-05, "loss": 3.3999, "step": 11120 }, { "epoch": 0.7558771572224487, "grad_norm": 1.7547935247421265, "learning_rate": 9.055578203560266e-05, "loss": 3.5821, "step": 11125 }, { "epoch": 0.7562168772931105, "grad_norm": 1.9440996646881104, "learning_rate": 9.05515355347194e-05, "loss": 3.519, "step": 11130 }, { "epoch": 0.7565565973637722, "grad_norm": 2.0020744800567627, "learning_rate": 9.054728903383613e-05, "loss": 3.5756, "step": 11135 }, { "epoch": 0.756896317434434, "grad_norm": 1.671148419380188, "learning_rate": 9.054304253295284e-05, "loss": 3.4798, "step": 11140 }, { "epoch": 0.7572360375050958, "grad_norm": 2.3796920776367188, "learning_rate": 9.053879603206959e-05, "loss": 3.2064, "step": 11145 }, { "epoch": 0.7575757575757576, "grad_norm": 2.4899344444274902, "learning_rate": 9.053454953118631e-05, "loss": 3.6966, "step": 11150 }, { "epoch": 0.7579154776464193, "grad_norm": 2.257075071334839, "learning_rate": 9.053030303030303e-05, "loss": 3.3965, "step": 11155 }, { "epoch": 0.7582551977170812, "grad_norm": 2.018040180206299, "learning_rate": 9.052605652941977e-05, "loss": 3.4178, "step": 11160 }, { "epoch": 0.7585949177877429, "grad_norm": 2.3814074993133545, "learning_rate": 9.05218100285365e-05, "loss": 3.1839, "step": 11165 }, { "epoch": 0.7589346378584046, "grad_norm": 1.9290012121200562, "learning_rate": 9.051756352765321e-05, "loss": 3.2344, "step": 11170 }, { "epoch": 0.7592743579290665, "grad_norm": 2.2735488414764404, "learning_rate": 9.051331702676995e-05, "loss": 3.4017, "step": 11175 }, { "epoch": 0.7596140779997282, "grad_norm": 2.5904603004455566, "learning_rate": 9.050907052588667e-05, "loss": 3.2168, "step": 11180 }, { "epoch": 0.75995379807039, "grad_norm": 1.96474289894104, "learning_rate": 9.05048240250034e-05, "loss": 3.2424, "step": 11185 }, { "epoch": 0.7602935181410517, "grad_norm": 1.849856972694397, "learning_rate": 9.050057752412014e-05, "loss": 3.4237, "step": 11190 }, { "epoch": 0.7606332382117136, "grad_norm": 2.0133066177368164, "learning_rate": 9.049633102323685e-05, "loss": 3.5238, "step": 11195 }, { "epoch": 0.7609729582823753, "grad_norm": 2.219210147857666, "learning_rate": 9.049208452235358e-05, "loss": 3.2749, "step": 11200 }, { "epoch": 0.7613126783530371, "grad_norm": 2.412020206451416, "learning_rate": 9.048783802147032e-05, "loss": 3.4784, "step": 11205 }, { "epoch": 0.7616523984236989, "grad_norm": 4.569281101226807, "learning_rate": 9.048359152058704e-05, "loss": 3.4909, "step": 11210 }, { "epoch": 0.7619921184943607, "grad_norm": 2.299633502960205, "learning_rate": 9.047934501970376e-05, "loss": 3.303, "step": 11215 }, { "epoch": 0.7623318385650224, "grad_norm": 1.8916287422180176, "learning_rate": 9.04750985188205e-05, "loss": 3.5023, "step": 11220 }, { "epoch": 0.7626715586356841, "grad_norm": 1.9439961910247803, "learning_rate": 9.047085201793722e-05, "loss": 3.3326, "step": 11225 }, { "epoch": 0.763011278706346, "grad_norm": 1.908080816268921, "learning_rate": 9.046660551705395e-05, "loss": 3.5699, "step": 11230 }, { "epoch": 0.7633509987770077, "grad_norm": 1.8485513925552368, "learning_rate": 9.046235901617069e-05, "loss": 3.4751, "step": 11235 }, { "epoch": 0.7636907188476695, "grad_norm": 2.3641703128814697, "learning_rate": 9.04581125152874e-05, "loss": 3.3947, "step": 11240 }, { "epoch": 0.7640304389183313, "grad_norm": 2.173023223876953, "learning_rate": 9.045386601440413e-05, "loss": 3.1465, "step": 11245 }, { "epoch": 0.7643701589889931, "grad_norm": 2.2282724380493164, "learning_rate": 9.044961951352086e-05, "loss": 3.3554, "step": 11250 }, { "epoch": 0.7647098790596548, "grad_norm": 1.8429909944534302, "learning_rate": 9.044537301263759e-05, "loss": 3.6429, "step": 11255 }, { "epoch": 0.7650495991303167, "grad_norm": 1.8007175922393799, "learning_rate": 9.044112651175432e-05, "loss": 3.509, "step": 11260 }, { "epoch": 0.7653893192009784, "grad_norm": 2.144818067550659, "learning_rate": 9.043688001087104e-05, "loss": 3.3788, "step": 11265 }, { "epoch": 0.7657290392716402, "grad_norm": 2.8039710521698, "learning_rate": 9.043263350998777e-05, "loss": 3.4663, "step": 11270 }, { "epoch": 0.7660687593423019, "grad_norm": 2.3489575386047363, "learning_rate": 9.04283870091045e-05, "loss": 3.4442, "step": 11275 }, { "epoch": 0.7664084794129638, "grad_norm": 2.214536428451538, "learning_rate": 9.042414050822123e-05, "loss": 3.3436, "step": 11280 }, { "epoch": 0.7667481994836255, "grad_norm": 2.0688140392303467, "learning_rate": 9.041989400733796e-05, "loss": 3.2298, "step": 11285 }, { "epoch": 0.7670879195542872, "grad_norm": 2.198549747467041, "learning_rate": 9.041564750645468e-05, "loss": 3.3888, "step": 11290 }, { "epoch": 0.7674276396249491, "grad_norm": 1.9206246137619019, "learning_rate": 9.041140100557141e-05, "loss": 3.301, "step": 11295 }, { "epoch": 0.7677673596956108, "grad_norm": 1.651972770690918, "learning_rate": 9.040715450468814e-05, "loss": 3.4455, "step": 11300 }, { "epoch": 0.7681070797662726, "grad_norm": 2.2197558879852295, "learning_rate": 9.040290800380487e-05, "loss": 3.2435, "step": 11305 }, { "epoch": 0.7684467998369343, "grad_norm": 2.584726572036743, "learning_rate": 9.03986615029216e-05, "loss": 3.4971, "step": 11310 }, { "epoch": 0.7687865199075962, "grad_norm": 2.4776198863983154, "learning_rate": 9.039441500203832e-05, "loss": 3.3481, "step": 11315 }, { "epoch": 0.7691262399782579, "grad_norm": 1.9876595735549927, "learning_rate": 9.039016850115505e-05, "loss": 3.5562, "step": 11320 }, { "epoch": 0.7694659600489197, "grad_norm": 2.591463088989258, "learning_rate": 9.038592200027178e-05, "loss": 3.335, "step": 11325 }, { "epoch": 0.7698056801195815, "grad_norm": 2.3524725437164307, "learning_rate": 9.038167549938851e-05, "loss": 3.2752, "step": 11330 }, { "epoch": 0.7701454001902432, "grad_norm": 3.10605525970459, "learning_rate": 9.037742899850524e-05, "loss": 3.2207, "step": 11335 }, { "epoch": 0.770485120260905, "grad_norm": 2.462714672088623, "learning_rate": 9.037318249762196e-05, "loss": 3.1319, "step": 11340 }, { "epoch": 0.7708248403315668, "grad_norm": 2.4269654750823975, "learning_rate": 9.036893599673869e-05, "loss": 3.4491, "step": 11345 }, { "epoch": 0.7711645604022286, "grad_norm": 1.8146681785583496, "learning_rate": 9.036468949585542e-05, "loss": 3.4463, "step": 11350 }, { "epoch": 0.7715042804728903, "grad_norm": 2.8521134853363037, "learning_rate": 9.036044299497215e-05, "loss": 3.3275, "step": 11355 }, { "epoch": 0.7718440005435521, "grad_norm": 2.598526954650879, "learning_rate": 9.035619649408888e-05, "loss": 3.7912, "step": 11360 }, { "epoch": 0.7721837206142139, "grad_norm": 1.7714214324951172, "learning_rate": 9.03519499932056e-05, "loss": 3.4521, "step": 11365 }, { "epoch": 0.7725234406848757, "grad_norm": 2.496175527572632, "learning_rate": 9.034770349232233e-05, "loss": 3.6247, "step": 11370 }, { "epoch": 0.7728631607555374, "grad_norm": 2.84682297706604, "learning_rate": 9.034345699143906e-05, "loss": 3.2996, "step": 11375 }, { "epoch": 0.7732028808261993, "grad_norm": 2.184936761856079, "learning_rate": 9.033921049055578e-05, "loss": 3.2265, "step": 11380 }, { "epoch": 0.773542600896861, "grad_norm": 1.8503079414367676, "learning_rate": 9.033496398967252e-05, "loss": 3.4367, "step": 11385 }, { "epoch": 0.7738823209675227, "grad_norm": 2.2539453506469727, "learning_rate": 9.033071748878924e-05, "loss": 3.5811, "step": 11390 }, { "epoch": 0.7742220410381845, "grad_norm": 2.4276680946350098, "learning_rate": 9.032647098790596e-05, "loss": 3.4865, "step": 11395 }, { "epoch": 0.7745617611088463, "grad_norm": 2.0705325603485107, "learning_rate": 9.03222244870227e-05, "loss": 3.3396, "step": 11400 }, { "epoch": 0.7749014811795081, "grad_norm": 2.168041706085205, "learning_rate": 9.031797798613943e-05, "loss": 3.5976, "step": 11405 }, { "epoch": 0.7752412012501698, "grad_norm": 2.1073849201202393, "learning_rate": 9.031373148525614e-05, "loss": 3.7443, "step": 11410 }, { "epoch": 0.7755809213208317, "grad_norm": 2.7408745288848877, "learning_rate": 9.030948498437288e-05, "loss": 3.5112, "step": 11415 }, { "epoch": 0.7759206413914934, "grad_norm": 2.7794125080108643, "learning_rate": 9.030523848348961e-05, "loss": 3.5839, "step": 11420 }, { "epoch": 0.7762603614621552, "grad_norm": 2.4596080780029297, "learning_rate": 9.030099198260634e-05, "loss": 3.1659, "step": 11425 }, { "epoch": 0.776600081532817, "grad_norm": 2.173229217529297, "learning_rate": 9.029674548172307e-05, "loss": 3.1308, "step": 11430 }, { "epoch": 0.7769398016034788, "grad_norm": 1.906375765800476, "learning_rate": 9.02924989808398e-05, "loss": 3.3156, "step": 11435 }, { "epoch": 0.7772795216741405, "grad_norm": 2.454479217529297, "learning_rate": 9.028825247995652e-05, "loss": 3.4941, "step": 11440 }, { "epoch": 0.7776192417448022, "grad_norm": 2.167015552520752, "learning_rate": 9.028400597907325e-05, "loss": 3.3087, "step": 11445 }, { "epoch": 0.7779589618154641, "grad_norm": 2.4903719425201416, "learning_rate": 9.027975947818997e-05, "loss": 3.4101, "step": 11450 }, { "epoch": 0.7782986818861258, "grad_norm": 2.269793748855591, "learning_rate": 9.027551297730671e-05, "loss": 3.5565, "step": 11455 }, { "epoch": 0.7786384019567876, "grad_norm": 1.8844324350357056, "learning_rate": 9.027126647642344e-05, "loss": 3.1795, "step": 11460 }, { "epoch": 0.7789781220274494, "grad_norm": 1.9648065567016602, "learning_rate": 9.026701997554015e-05, "loss": 3.5654, "step": 11465 }, { "epoch": 0.7793178420981112, "grad_norm": 2.3887670040130615, "learning_rate": 9.026277347465689e-05, "loss": 3.4269, "step": 11470 }, { "epoch": 0.7796575621687729, "grad_norm": 2.103919506072998, "learning_rate": 9.025852697377362e-05, "loss": 3.4802, "step": 11475 }, { "epoch": 0.7799972822394347, "grad_norm": 2.0800790786743164, "learning_rate": 9.025428047289034e-05, "loss": 3.4395, "step": 11480 }, { "epoch": 0.7803370023100965, "grad_norm": 1.862479329109192, "learning_rate": 9.025003397200708e-05, "loss": 3.3586, "step": 11485 }, { "epoch": 0.7806767223807582, "grad_norm": 2.47119402885437, "learning_rate": 9.02457874711238e-05, "loss": 3.278, "step": 11490 }, { "epoch": 0.78101644245142, "grad_norm": 1.9219058752059937, "learning_rate": 9.024154097024052e-05, "loss": 3.4069, "step": 11495 }, { "epoch": 0.7813561625220818, "grad_norm": 2.386284351348877, "learning_rate": 9.023729446935726e-05, "loss": 3.0464, "step": 11500 }, { "epoch": 0.7816958825927436, "grad_norm": 1.8137407302856445, "learning_rate": 9.023304796847399e-05, "loss": 3.3531, "step": 11505 }, { "epoch": 0.7820356026634053, "grad_norm": 2.1406009197235107, "learning_rate": 9.02288014675907e-05, "loss": 3.6528, "step": 11510 }, { "epoch": 0.7823753227340672, "grad_norm": 2.3624815940856934, "learning_rate": 9.022455496670744e-05, "loss": 3.486, "step": 11515 }, { "epoch": 0.7827150428047289, "grad_norm": 2.1248888969421387, "learning_rate": 9.022030846582416e-05, "loss": 3.5324, "step": 11520 }, { "epoch": 0.7830547628753907, "grad_norm": 2.0932772159576416, "learning_rate": 9.021606196494089e-05, "loss": 3.6139, "step": 11525 }, { "epoch": 0.7833944829460524, "grad_norm": 2.263762950897217, "learning_rate": 9.021181546405763e-05, "loss": 3.3103, "step": 11530 }, { "epoch": 0.7837342030167143, "grad_norm": 1.8357583284378052, "learning_rate": 9.020756896317434e-05, "loss": 3.2178, "step": 11535 }, { "epoch": 0.784073923087376, "grad_norm": 2.109544038772583, "learning_rate": 9.020332246229107e-05, "loss": 3.6964, "step": 11540 }, { "epoch": 0.7844136431580377, "grad_norm": 2.2789525985717773, "learning_rate": 9.019907596140781e-05, "loss": 3.3875, "step": 11545 }, { "epoch": 0.7847533632286996, "grad_norm": 1.6911152601242065, "learning_rate": 9.019482946052453e-05, "loss": 3.5005, "step": 11550 }, { "epoch": 0.7850930832993613, "grad_norm": 2.174966812133789, "learning_rate": 9.019058295964126e-05, "loss": 3.3693, "step": 11555 }, { "epoch": 0.7854328033700231, "grad_norm": 2.047849178314209, "learning_rate": 9.0186336458758e-05, "loss": 3.6064, "step": 11560 }, { "epoch": 0.7857725234406848, "grad_norm": 2.1291744709014893, "learning_rate": 9.018208995787471e-05, "loss": 3.4093, "step": 11565 }, { "epoch": 0.7861122435113467, "grad_norm": 2.612980842590332, "learning_rate": 9.017784345699144e-05, "loss": 3.1426, "step": 11570 }, { "epoch": 0.7864519635820084, "grad_norm": 2.067013740539551, "learning_rate": 9.017359695610818e-05, "loss": 3.3052, "step": 11575 }, { "epoch": 0.7867916836526702, "grad_norm": 2.4694230556488037, "learning_rate": 9.01693504552249e-05, "loss": 3.7424, "step": 11580 }, { "epoch": 0.787131403723332, "grad_norm": 1.8251795768737793, "learning_rate": 9.016510395434162e-05, "loss": 3.3368, "step": 11585 }, { "epoch": 0.7874711237939938, "grad_norm": 1.5800342559814453, "learning_rate": 9.016085745345836e-05, "loss": 3.4217, "step": 11590 }, { "epoch": 0.7878108438646555, "grad_norm": 2.0962321758270264, "learning_rate": 9.015661095257508e-05, "loss": 3.2964, "step": 11595 }, { "epoch": 0.7881505639353173, "grad_norm": 1.998167634010315, "learning_rate": 9.015236445169181e-05, "loss": 3.2976, "step": 11600 }, { "epoch": 0.7884902840059791, "grad_norm": 2.041581153869629, "learning_rate": 9.014811795080854e-05, "loss": 3.587, "step": 11605 }, { "epoch": 0.7888300040766408, "grad_norm": 2.483847141265869, "learning_rate": 9.014387144992526e-05, "loss": 3.5872, "step": 11610 }, { "epoch": 0.7891697241473026, "grad_norm": 15.90668773651123, "learning_rate": 9.013962494904199e-05, "loss": 3.2744, "step": 11615 }, { "epoch": 0.7895094442179644, "grad_norm": 2.419654369354248, "learning_rate": 9.013537844815872e-05, "loss": 3.3145, "step": 11620 }, { "epoch": 0.7898491642886262, "grad_norm": 2.0530455112457275, "learning_rate": 9.013113194727545e-05, "loss": 3.2514, "step": 11625 }, { "epoch": 0.7901888843592879, "grad_norm": 2.198951482772827, "learning_rate": 9.012688544639218e-05, "loss": 3.4445, "step": 11630 }, { "epoch": 0.7905286044299498, "grad_norm": 2.1728460788726807, "learning_rate": 9.01226389455089e-05, "loss": 3.3801, "step": 11635 }, { "epoch": 0.7908683245006115, "grad_norm": 1.963135004043579, "learning_rate": 9.011839244462563e-05, "loss": 3.5995, "step": 11640 }, { "epoch": 0.7912080445712733, "grad_norm": 2.1429896354675293, "learning_rate": 9.011414594374236e-05, "loss": 3.1804, "step": 11645 }, { "epoch": 0.791547764641935, "grad_norm": 2.343388557434082, "learning_rate": 9.010989944285909e-05, "loss": 3.4809, "step": 11650 }, { "epoch": 0.7918874847125968, "grad_norm": 2.60878849029541, "learning_rate": 9.010565294197582e-05, "loss": 3.475, "step": 11655 }, { "epoch": 0.7922272047832586, "grad_norm": 2.281998872756958, "learning_rate": 9.010140644109254e-05, "loss": 3.4948, "step": 11660 }, { "epoch": 0.7925669248539203, "grad_norm": 2.4443137645721436, "learning_rate": 9.009715994020927e-05, "loss": 3.3156, "step": 11665 }, { "epoch": 0.7929066449245822, "grad_norm": 2.0320675373077393, "learning_rate": 9.0092913439326e-05, "loss": 3.4341, "step": 11670 }, { "epoch": 0.7932463649952439, "grad_norm": 2.0136067867279053, "learning_rate": 9.008866693844273e-05, "loss": 3.3965, "step": 11675 }, { "epoch": 0.7935860850659057, "grad_norm": 2.0364115238189697, "learning_rate": 9.008442043755946e-05, "loss": 3.4643, "step": 11680 }, { "epoch": 0.7939258051365675, "grad_norm": 2.2068874835968018, "learning_rate": 9.008017393667618e-05, "loss": 3.7331, "step": 11685 }, { "epoch": 0.7942655252072293, "grad_norm": 2.0830392837524414, "learning_rate": 9.007592743579291e-05, "loss": 3.3041, "step": 11690 }, { "epoch": 0.794605245277891, "grad_norm": 2.5593206882476807, "learning_rate": 9.007168093490964e-05, "loss": 3.3785, "step": 11695 }, { "epoch": 0.7949449653485527, "grad_norm": 1.8627713918685913, "learning_rate": 9.006743443402637e-05, "loss": 3.2584, "step": 11700 }, { "epoch": 0.7952846854192146, "grad_norm": 4.073328971862793, "learning_rate": 9.00631879331431e-05, "loss": 3.3095, "step": 11705 }, { "epoch": 0.7956244054898763, "grad_norm": 2.2592155933380127, "learning_rate": 9.005894143225982e-05, "loss": 3.1838, "step": 11710 }, { "epoch": 0.7959641255605381, "grad_norm": 1.8784817457199097, "learning_rate": 9.005469493137655e-05, "loss": 3.5494, "step": 11715 }, { "epoch": 0.7963038456311999, "grad_norm": 2.0339643955230713, "learning_rate": 9.005044843049327e-05, "loss": 3.4768, "step": 11720 }, { "epoch": 0.7966435657018617, "grad_norm": 2.1371805667877197, "learning_rate": 9.004620192961001e-05, "loss": 3.3907, "step": 11725 }, { "epoch": 0.7969832857725234, "grad_norm": 2.3924808502197266, "learning_rate": 9.004195542872674e-05, "loss": 3.5149, "step": 11730 }, { "epoch": 0.7973230058431852, "grad_norm": 2.4105582237243652, "learning_rate": 9.003770892784345e-05, "loss": 3.4412, "step": 11735 }, { "epoch": 0.797662725913847, "grad_norm": 2.1838505268096924, "learning_rate": 9.003346242696019e-05, "loss": 3.1949, "step": 11740 }, { "epoch": 0.7980024459845088, "grad_norm": 2.1127848625183105, "learning_rate": 9.002921592607692e-05, "loss": 3.3228, "step": 11745 }, { "epoch": 0.7983421660551705, "grad_norm": 2.802795648574829, "learning_rate": 9.002496942519363e-05, "loss": 3.519, "step": 11750 }, { "epoch": 0.7986818861258324, "grad_norm": 2.131911277770996, "learning_rate": 9.002072292431038e-05, "loss": 3.1634, "step": 11755 }, { "epoch": 0.7990216061964941, "grad_norm": 2.7319459915161133, "learning_rate": 9.00164764234271e-05, "loss": 3.4152, "step": 11760 }, { "epoch": 0.7993613262671558, "grad_norm": 2.0982625484466553, "learning_rate": 9.001222992254383e-05, "loss": 3.4346, "step": 11765 }, { "epoch": 0.7997010463378177, "grad_norm": 1.7234692573547363, "learning_rate": 9.000798342166056e-05, "loss": 3.4908, "step": 11770 }, { "epoch": 0.8000407664084794, "grad_norm": 1.9733598232269287, "learning_rate": 9.000373692077729e-05, "loss": 3.3127, "step": 11775 }, { "epoch": 0.8003804864791412, "grad_norm": 2.0834248065948486, "learning_rate": 8.999949041989402e-05, "loss": 3.6224, "step": 11780 }, { "epoch": 0.8007202065498029, "grad_norm": 2.2089688777923584, "learning_rate": 8.999524391901074e-05, "loss": 3.5807, "step": 11785 }, { "epoch": 0.8010599266204648, "grad_norm": 1.7381982803344727, "learning_rate": 8.999099741812747e-05, "loss": 3.1458, "step": 11790 }, { "epoch": 0.8013996466911265, "grad_norm": 1.9252065420150757, "learning_rate": 8.99867509172442e-05, "loss": 3.4913, "step": 11795 }, { "epoch": 0.8017393667617883, "grad_norm": 2.1850547790527344, "learning_rate": 8.998250441636093e-05, "loss": 3.5705, "step": 11800 }, { "epoch": 0.8020790868324501, "grad_norm": 2.3850276470184326, "learning_rate": 8.997825791547764e-05, "loss": 3.6488, "step": 11805 }, { "epoch": 0.8024188069031118, "grad_norm": 2.362490177154541, "learning_rate": 8.997401141459438e-05, "loss": 3.1729, "step": 11810 }, { "epoch": 0.8027585269737736, "grad_norm": 2.5183753967285156, "learning_rate": 8.996976491371111e-05, "loss": 3.3475, "step": 11815 }, { "epoch": 0.8030982470444353, "grad_norm": 2.144824266433716, "learning_rate": 8.996551841282783e-05, "loss": 3.4031, "step": 11820 }, { "epoch": 0.8034379671150972, "grad_norm": 2.101644277572632, "learning_rate": 8.996127191194457e-05, "loss": 3.6251, "step": 11825 }, { "epoch": 0.8037776871857589, "grad_norm": 1.8469126224517822, "learning_rate": 8.99570254110613e-05, "loss": 3.4567, "step": 11830 }, { "epoch": 0.8041174072564207, "grad_norm": 2.1471898555755615, "learning_rate": 8.995277891017801e-05, "loss": 3.5695, "step": 11835 }, { "epoch": 0.8044571273270825, "grad_norm": 2.626678228378296, "learning_rate": 8.994853240929475e-05, "loss": 3.5137, "step": 11840 }, { "epoch": 0.8047968473977443, "grad_norm": 2.2975993156433105, "learning_rate": 8.994428590841148e-05, "loss": 3.3626, "step": 11845 }, { "epoch": 0.805136567468406, "grad_norm": 2.5563313961029053, "learning_rate": 8.99400394075282e-05, "loss": 3.5137, "step": 11850 }, { "epoch": 0.8054762875390679, "grad_norm": 3.081108808517456, "learning_rate": 8.993579290664494e-05, "loss": 3.2473, "step": 11855 }, { "epoch": 0.8058160076097296, "grad_norm": 2.4409470558166504, "learning_rate": 8.993154640576166e-05, "loss": 3.2571, "step": 11860 }, { "epoch": 0.8061557276803913, "grad_norm": 2.2482988834381104, "learning_rate": 8.992729990487838e-05, "loss": 3.3227, "step": 11865 }, { "epoch": 0.8064954477510531, "grad_norm": 1.7942568063735962, "learning_rate": 8.992305340399512e-05, "loss": 3.5038, "step": 11870 }, { "epoch": 0.8068351678217149, "grad_norm": 1.990861415863037, "learning_rate": 8.991880690311183e-05, "loss": 3.4663, "step": 11875 }, { "epoch": 0.8071748878923767, "grad_norm": 2.294863700866699, "learning_rate": 8.991456040222856e-05, "loss": 2.9932, "step": 11880 }, { "epoch": 0.8075146079630384, "grad_norm": 2.1445112228393555, "learning_rate": 8.99103139013453e-05, "loss": 3.4847, "step": 11885 }, { "epoch": 0.8078543280337003, "grad_norm": 2.0598299503326416, "learning_rate": 8.990606740046202e-05, "loss": 3.1814, "step": 11890 }, { "epoch": 0.808194048104362, "grad_norm": 1.9853487014770508, "learning_rate": 8.990182089957875e-05, "loss": 3.4542, "step": 11895 }, { "epoch": 0.8085337681750238, "grad_norm": 2.471073865890503, "learning_rate": 8.989757439869549e-05, "loss": 3.2959, "step": 11900 }, { "epoch": 0.8088734882456855, "grad_norm": 1.9424799680709839, "learning_rate": 8.98933278978122e-05, "loss": 3.3625, "step": 11905 }, { "epoch": 0.8092132083163474, "grad_norm": 1.9726063013076782, "learning_rate": 8.988908139692893e-05, "loss": 3.5501, "step": 11910 }, { "epoch": 0.8095529283870091, "grad_norm": 1.877665638923645, "learning_rate": 8.988483489604567e-05, "loss": 3.478, "step": 11915 }, { "epoch": 0.8098926484576708, "grad_norm": 1.8277817964553833, "learning_rate": 8.988058839516239e-05, "loss": 3.187, "step": 11920 }, { "epoch": 0.8102323685283327, "grad_norm": 1.7110615968704224, "learning_rate": 8.987634189427911e-05, "loss": 3.4653, "step": 11925 }, { "epoch": 0.8105720885989944, "grad_norm": 1.9001785516738892, "learning_rate": 8.987209539339586e-05, "loss": 3.4926, "step": 11930 }, { "epoch": 0.8109118086696562, "grad_norm": 1.8427282571792603, "learning_rate": 8.986784889251257e-05, "loss": 3.4295, "step": 11935 }, { "epoch": 0.811251528740318, "grad_norm": 1.6634604930877686, "learning_rate": 8.98636023916293e-05, "loss": 3.2924, "step": 11940 }, { "epoch": 0.8115912488109798, "grad_norm": 1.9675753116607666, "learning_rate": 8.985935589074603e-05, "loss": 3.1548, "step": 11945 }, { "epoch": 0.8119309688816415, "grad_norm": 2.0560007095336914, "learning_rate": 8.985510938986275e-05, "loss": 3.5211, "step": 11950 }, { "epoch": 0.8122706889523033, "grad_norm": 2.9898903369903564, "learning_rate": 8.985086288897948e-05, "loss": 3.295, "step": 11955 }, { "epoch": 0.8126104090229651, "grad_norm": 2.198967933654785, "learning_rate": 8.984661638809621e-05, "loss": 3.4186, "step": 11960 }, { "epoch": 0.8129501290936268, "grad_norm": 1.7902920246124268, "learning_rate": 8.984236988721294e-05, "loss": 3.3827, "step": 11965 }, { "epoch": 0.8132898491642886, "grad_norm": 2.6019420623779297, "learning_rate": 8.983812338632967e-05, "loss": 3.385, "step": 11970 }, { "epoch": 0.8136295692349504, "grad_norm": 2.494234800338745, "learning_rate": 8.98338768854464e-05, "loss": 3.5925, "step": 11975 }, { "epoch": 0.8139692893056122, "grad_norm": 2.5273165702819824, "learning_rate": 8.982963038456312e-05, "loss": 3.183, "step": 11980 }, { "epoch": 0.8143090093762739, "grad_norm": 2.065208911895752, "learning_rate": 8.982538388367985e-05, "loss": 3.2692, "step": 11985 }, { "epoch": 0.8146487294469357, "grad_norm": 2.5276691913604736, "learning_rate": 8.982113738279658e-05, "loss": 3.6195, "step": 11990 }, { "epoch": 0.8149884495175975, "grad_norm": 2.551149845123291, "learning_rate": 8.98168908819133e-05, "loss": 3.122, "step": 11995 }, { "epoch": 0.8153281695882593, "grad_norm": 2.36954402923584, "learning_rate": 8.981264438103003e-05, "loss": 3.4339, "step": 12000 }, { "epoch": 0.815667889658921, "grad_norm": 2.6555187702178955, "learning_rate": 8.980839788014676e-05, "loss": 3.3592, "step": 12005 }, { "epoch": 0.8160076097295829, "grad_norm": 2.415350914001465, "learning_rate": 8.980415137926349e-05, "loss": 3.4568, "step": 12010 }, { "epoch": 0.8163473298002446, "grad_norm": 2.061955690383911, "learning_rate": 8.979990487838022e-05, "loss": 3.6163, "step": 12015 }, { "epoch": 0.8166870498709063, "grad_norm": 2.0661327838897705, "learning_rate": 8.979565837749695e-05, "loss": 3.3972, "step": 12020 }, { "epoch": 0.8170267699415682, "grad_norm": 1.9909498691558838, "learning_rate": 8.979141187661367e-05, "loss": 3.0278, "step": 12025 }, { "epoch": 0.8173664900122299, "grad_norm": 1.9194748401641846, "learning_rate": 8.97871653757304e-05, "loss": 3.3678, "step": 12030 }, { "epoch": 0.8177062100828917, "grad_norm": 2.6531505584716797, "learning_rate": 8.978291887484713e-05, "loss": 3.4254, "step": 12035 }, { "epoch": 0.8180459301535534, "grad_norm": 1.943596601486206, "learning_rate": 8.977867237396386e-05, "loss": 3.3382, "step": 12040 }, { "epoch": 0.8183856502242153, "grad_norm": 1.5993430614471436, "learning_rate": 8.977442587308059e-05, "loss": 3.7341, "step": 12045 }, { "epoch": 0.818725370294877, "grad_norm": 1.6929173469543457, "learning_rate": 8.977017937219731e-05, "loss": 3.4217, "step": 12050 }, { "epoch": 0.8190650903655388, "grad_norm": 2.355818748474121, "learning_rate": 8.976593287131404e-05, "loss": 3.4165, "step": 12055 }, { "epoch": 0.8194048104362006, "grad_norm": 2.1425981521606445, "learning_rate": 8.976168637043077e-05, "loss": 3.3265, "step": 12060 }, { "epoch": 0.8197445305068624, "grad_norm": 2.230255126953125, "learning_rate": 8.97574398695475e-05, "loss": 3.4455, "step": 12065 }, { "epoch": 0.8200842505775241, "grad_norm": 3.521151542663574, "learning_rate": 8.975319336866423e-05, "loss": 3.4047, "step": 12070 }, { "epoch": 0.8204239706481858, "grad_norm": 2.098658323287964, "learning_rate": 8.974894686778094e-05, "loss": 3.5767, "step": 12075 }, { "epoch": 0.8207636907188477, "grad_norm": 2.61434006690979, "learning_rate": 8.974470036689768e-05, "loss": 3.2252, "step": 12080 }, { "epoch": 0.8211034107895094, "grad_norm": 2.139094352722168, "learning_rate": 8.974045386601441e-05, "loss": 3.4537, "step": 12085 }, { "epoch": 0.8214431308601712, "grad_norm": 2.295015335083008, "learning_rate": 8.973620736513113e-05, "loss": 3.6022, "step": 12090 }, { "epoch": 0.821782850930833, "grad_norm": 2.326263189315796, "learning_rate": 8.973196086424787e-05, "loss": 3.572, "step": 12095 }, { "epoch": 0.8221225710014948, "grad_norm": 2.0316975116729736, "learning_rate": 8.97277143633646e-05, "loss": 3.4458, "step": 12100 }, { "epoch": 0.8224622910721565, "grad_norm": 2.1909048557281494, "learning_rate": 8.972346786248132e-05, "loss": 3.2895, "step": 12105 }, { "epoch": 0.8228020111428184, "grad_norm": 2.148266077041626, "learning_rate": 8.971922136159805e-05, "loss": 3.4308, "step": 12110 }, { "epoch": 0.8231417312134801, "grad_norm": 2.0468289852142334, "learning_rate": 8.971497486071478e-05, "loss": 3.3819, "step": 12115 }, { "epoch": 0.8234814512841419, "grad_norm": 1.9551857709884644, "learning_rate": 8.97107283598315e-05, "loss": 3.4927, "step": 12120 }, { "epoch": 0.8238211713548036, "grad_norm": 5.074975967407227, "learning_rate": 8.970648185894823e-05, "loss": 3.4348, "step": 12125 }, { "epoch": 0.8241608914254654, "grad_norm": 2.103262186050415, "learning_rate": 8.970223535806496e-05, "loss": 3.657, "step": 12130 }, { "epoch": 0.8245006114961272, "grad_norm": 2.4705734252929688, "learning_rate": 8.969798885718169e-05, "loss": 3.0682, "step": 12135 }, { "epoch": 0.8248403315667889, "grad_norm": 1.8436349630355835, "learning_rate": 8.969374235629842e-05, "loss": 3.399, "step": 12140 }, { "epoch": 0.8251800516374508, "grad_norm": 2.4647295475006104, "learning_rate": 8.968949585541513e-05, "loss": 3.4332, "step": 12145 }, { "epoch": 0.8255197717081125, "grad_norm": 2.043389081954956, "learning_rate": 8.968524935453187e-05, "loss": 3.3697, "step": 12150 }, { "epoch": 0.8258594917787743, "grad_norm": 1.9643347263336182, "learning_rate": 8.96810028536486e-05, "loss": 3.5348, "step": 12155 }, { "epoch": 0.826199211849436, "grad_norm": 2.261094808578491, "learning_rate": 8.967675635276532e-05, "loss": 3.3601, "step": 12160 }, { "epoch": 0.8265389319200979, "grad_norm": 2.189314126968384, "learning_rate": 8.967250985188206e-05, "loss": 3.5678, "step": 12165 }, { "epoch": 0.8268786519907596, "grad_norm": 2.5339884757995605, "learning_rate": 8.966826335099879e-05, "loss": 3.4697, "step": 12170 }, { "epoch": 0.8272183720614213, "grad_norm": 2.93972110748291, "learning_rate": 8.96640168501155e-05, "loss": 3.271, "step": 12175 }, { "epoch": 0.8275580921320832, "grad_norm": 1.789829969406128, "learning_rate": 8.965977034923224e-05, "loss": 3.5214, "step": 12180 }, { "epoch": 0.8278978122027449, "grad_norm": 1.7386118173599243, "learning_rate": 8.965552384834897e-05, "loss": 3.3794, "step": 12185 }, { "epoch": 0.8282375322734067, "grad_norm": 3.1779747009277344, "learning_rate": 8.965127734746569e-05, "loss": 3.5923, "step": 12190 }, { "epoch": 0.8285772523440685, "grad_norm": 1.8903814554214478, "learning_rate": 8.964703084658243e-05, "loss": 3.6321, "step": 12195 }, { "epoch": 0.8289169724147303, "grad_norm": 2.069563627243042, "learning_rate": 8.964278434569915e-05, "loss": 3.5606, "step": 12200 }, { "epoch": 0.829256692485392, "grad_norm": 2.354498863220215, "learning_rate": 8.963853784481587e-05, "loss": 3.6108, "step": 12205 }, { "epoch": 0.8295964125560538, "grad_norm": 2.1593213081359863, "learning_rate": 8.963429134393261e-05, "loss": 3.2927, "step": 12210 }, { "epoch": 0.8299361326267156, "grad_norm": 2.683704376220703, "learning_rate": 8.963004484304934e-05, "loss": 3.3888, "step": 12215 }, { "epoch": 0.8302758526973774, "grad_norm": 2.1463334560394287, "learning_rate": 8.962579834216605e-05, "loss": 3.2958, "step": 12220 }, { "epoch": 0.8306155727680391, "grad_norm": 2.190086841583252, "learning_rate": 8.96215518412828e-05, "loss": 3.5065, "step": 12225 }, { "epoch": 0.830955292838701, "grad_norm": 2.2001006603240967, "learning_rate": 8.961730534039951e-05, "loss": 3.2064, "step": 12230 }, { "epoch": 0.8312950129093627, "grad_norm": 2.6916494369506836, "learning_rate": 8.961305883951624e-05, "loss": 2.9586, "step": 12235 }, { "epoch": 0.8316347329800244, "grad_norm": 1.9479535818099976, "learning_rate": 8.960881233863298e-05, "loss": 3.2363, "step": 12240 }, { "epoch": 0.8319744530506862, "grad_norm": 1.9741711616516113, "learning_rate": 8.96045658377497e-05, "loss": 3.0458, "step": 12245 }, { "epoch": 0.832314173121348, "grad_norm": 2.184316396713257, "learning_rate": 8.960031933686642e-05, "loss": 3.3535, "step": 12250 }, { "epoch": 0.8326538931920098, "grad_norm": 2.348109006881714, "learning_rate": 8.959607283598316e-05, "loss": 3.1871, "step": 12255 }, { "epoch": 0.8329936132626715, "grad_norm": 2.490572690963745, "learning_rate": 8.959182633509988e-05, "loss": 3.3422, "step": 12260 }, { "epoch": 0.8333333333333334, "grad_norm": 2.7501299381256104, "learning_rate": 8.95875798342166e-05, "loss": 3.5116, "step": 12265 }, { "epoch": 0.8336730534039951, "grad_norm": 2.969365358352661, "learning_rate": 8.958333333333335e-05, "loss": 3.0079, "step": 12270 }, { "epoch": 0.8340127734746569, "grad_norm": 2.0569348335266113, "learning_rate": 8.957908683245006e-05, "loss": 3.5527, "step": 12275 }, { "epoch": 0.8343524935453187, "grad_norm": 2.477231025695801, "learning_rate": 8.957484033156679e-05, "loss": 3.4916, "step": 12280 }, { "epoch": 0.8346922136159804, "grad_norm": 2.0322723388671875, "learning_rate": 8.957059383068353e-05, "loss": 3.2195, "step": 12285 }, { "epoch": 0.8350319336866422, "grad_norm": 2.4855687618255615, "learning_rate": 8.956634732980025e-05, "loss": 3.3189, "step": 12290 }, { "epoch": 0.8353716537573039, "grad_norm": 1.9906818866729736, "learning_rate": 8.956210082891697e-05, "loss": 3.3418, "step": 12295 }, { "epoch": 0.8357113738279658, "grad_norm": 2.5079314708709717, "learning_rate": 8.95578543280337e-05, "loss": 3.4239, "step": 12300 }, { "epoch": 0.8360510938986275, "grad_norm": 3.2814083099365234, "learning_rate": 8.955360782715043e-05, "loss": 3.1097, "step": 12305 }, { "epoch": 0.8363908139692893, "grad_norm": 2.568511962890625, "learning_rate": 8.954936132626716e-05, "loss": 3.5821, "step": 12310 }, { "epoch": 0.8367305340399511, "grad_norm": 2.464090585708618, "learning_rate": 8.954511482538389e-05, "loss": 3.2909, "step": 12315 }, { "epoch": 0.8370702541106129, "grad_norm": 2.4630167484283447, "learning_rate": 8.954086832450061e-05, "loss": 3.5004, "step": 12320 }, { "epoch": 0.8374099741812746, "grad_norm": 1.9303792715072632, "learning_rate": 8.953662182361734e-05, "loss": 3.5628, "step": 12325 }, { "epoch": 0.8377496942519363, "grad_norm": 2.097794771194458, "learning_rate": 8.953237532273407e-05, "loss": 3.0507, "step": 12330 }, { "epoch": 0.8380894143225982, "grad_norm": 2.0385208129882812, "learning_rate": 8.95281288218508e-05, "loss": 3.612, "step": 12335 }, { "epoch": 0.83842913439326, "grad_norm": 2.2375195026397705, "learning_rate": 8.952388232096753e-05, "loss": 3.452, "step": 12340 }, { "epoch": 0.8387688544639217, "grad_norm": 2.2851905822753906, "learning_rate": 8.951963582008425e-05, "loss": 3.424, "step": 12345 }, { "epoch": 0.8391085745345835, "grad_norm": 2.183159351348877, "learning_rate": 8.951538931920098e-05, "loss": 3.3318, "step": 12350 }, { "epoch": 0.8394482946052453, "grad_norm": 2.781352996826172, "learning_rate": 8.951114281831771e-05, "loss": 3.3904, "step": 12355 }, { "epoch": 0.839788014675907, "grad_norm": 2.242860794067383, "learning_rate": 8.950689631743444e-05, "loss": 3.4498, "step": 12360 }, { "epoch": 0.8401277347465689, "grad_norm": 1.608384370803833, "learning_rate": 8.950264981655117e-05, "loss": 3.6722, "step": 12365 }, { "epoch": 0.8404674548172306, "grad_norm": 3.1077661514282227, "learning_rate": 8.94984033156679e-05, "loss": 3.4293, "step": 12370 }, { "epoch": 0.8408071748878924, "grad_norm": 2.1576080322265625, "learning_rate": 8.949415681478462e-05, "loss": 3.3204, "step": 12375 }, { "epoch": 0.8411468949585541, "grad_norm": 2.1023166179656982, "learning_rate": 8.948991031390135e-05, "loss": 3.3246, "step": 12380 }, { "epoch": 0.841486615029216, "grad_norm": 2.3234474658966064, "learning_rate": 8.948566381301808e-05, "loss": 3.5551, "step": 12385 }, { "epoch": 0.8418263350998777, "grad_norm": 1.919976830482483, "learning_rate": 8.94814173121348e-05, "loss": 3.1543, "step": 12390 }, { "epoch": 0.8421660551705394, "grad_norm": 2.105980157852173, "learning_rate": 8.947717081125153e-05, "loss": 3.2298, "step": 12395 }, { "epoch": 0.8425057752412013, "grad_norm": 2.2233755588531494, "learning_rate": 8.947292431036826e-05, "loss": 3.4588, "step": 12400 }, { "epoch": 0.842845495311863, "grad_norm": 2.014068841934204, "learning_rate": 8.946867780948499e-05, "loss": 3.3832, "step": 12405 }, { "epoch": 0.8431852153825248, "grad_norm": 2.395500659942627, "learning_rate": 8.946443130860172e-05, "loss": 3.6357, "step": 12410 }, { "epoch": 0.8435249354531865, "grad_norm": 2.483025550842285, "learning_rate": 8.946018480771845e-05, "loss": 3.4749, "step": 12415 }, { "epoch": 0.8438646555238484, "grad_norm": 1.8673661947250366, "learning_rate": 8.945593830683517e-05, "loss": 3.4196, "step": 12420 }, { "epoch": 0.8442043755945101, "grad_norm": 2.281325101852417, "learning_rate": 8.94516918059519e-05, "loss": 3.6027, "step": 12425 }, { "epoch": 0.8445440956651719, "grad_norm": 1.9356565475463867, "learning_rate": 8.944744530506862e-05, "loss": 3.3846, "step": 12430 }, { "epoch": 0.8448838157358337, "grad_norm": 2.9688615798950195, "learning_rate": 8.944319880418536e-05, "loss": 3.5697, "step": 12435 }, { "epoch": 0.8452235358064955, "grad_norm": 3.5359628200531006, "learning_rate": 8.943895230330209e-05, "loss": 3.3995, "step": 12440 }, { "epoch": 0.8455632558771572, "grad_norm": 1.894195556640625, "learning_rate": 8.943470580241881e-05, "loss": 3.3548, "step": 12445 }, { "epoch": 0.845902975947819, "grad_norm": 1.9114396572113037, "learning_rate": 8.943045930153554e-05, "loss": 3.3295, "step": 12450 }, { "epoch": 0.8462426960184808, "grad_norm": 2.432551145553589, "learning_rate": 8.942621280065227e-05, "loss": 3.6598, "step": 12455 }, { "epoch": 0.8465824160891425, "grad_norm": 2.635043144226074, "learning_rate": 8.9421966299769e-05, "loss": 3.3133, "step": 12460 }, { "epoch": 0.8469221361598043, "grad_norm": 1.818357229232788, "learning_rate": 8.941771979888573e-05, "loss": 3.5124, "step": 12465 }, { "epoch": 0.8472618562304661, "grad_norm": 2.465458869934082, "learning_rate": 8.941347329800245e-05, "loss": 3.3309, "step": 12470 }, { "epoch": 0.8476015763011279, "grad_norm": 1.9214965105056763, "learning_rate": 8.940922679711918e-05, "loss": 3.0097, "step": 12475 }, { "epoch": 0.8479412963717896, "grad_norm": 2.410896062850952, "learning_rate": 8.940498029623591e-05, "loss": 3.2621, "step": 12480 }, { "epoch": 0.8482810164424515, "grad_norm": 1.94785737991333, "learning_rate": 8.940073379535264e-05, "loss": 3.3998, "step": 12485 }, { "epoch": 0.8486207365131132, "grad_norm": 1.8780781030654907, "learning_rate": 8.939648729446937e-05, "loss": 3.4095, "step": 12490 }, { "epoch": 0.848960456583775, "grad_norm": 1.8824725151062012, "learning_rate": 8.93922407935861e-05, "loss": 3.5479, "step": 12495 }, { "epoch": 0.8493001766544367, "grad_norm": 2.340257406234741, "learning_rate": 8.938799429270281e-05, "loss": 3.4268, "step": 12500 }, { "epoch": 0.8496398967250985, "grad_norm": 2.2857558727264404, "learning_rate": 8.938374779181955e-05, "loss": 3.3992, "step": 12505 }, { "epoch": 0.8499796167957603, "grad_norm": 2.3237154483795166, "learning_rate": 8.937950129093628e-05, "loss": 3.2269, "step": 12510 }, { "epoch": 0.850319336866422, "grad_norm": 2.065633535385132, "learning_rate": 8.937525479005299e-05, "loss": 3.2727, "step": 12515 }, { "epoch": 0.8506590569370839, "grad_norm": 2.3923022747039795, "learning_rate": 8.937100828916973e-05, "loss": 3.3786, "step": 12520 }, { "epoch": 0.8509987770077456, "grad_norm": 2.556154251098633, "learning_rate": 8.936676178828646e-05, "loss": 3.4553, "step": 12525 }, { "epoch": 0.8513384970784074, "grad_norm": 2.490079164505005, "learning_rate": 8.936251528740318e-05, "loss": 3.4441, "step": 12530 }, { "epoch": 0.8516782171490692, "grad_norm": 2.2074363231658936, "learning_rate": 8.935826878651992e-05, "loss": 3.7676, "step": 12535 }, { "epoch": 0.852017937219731, "grad_norm": 2.49845552444458, "learning_rate": 8.935402228563665e-05, "loss": 3.4963, "step": 12540 }, { "epoch": 0.8523576572903927, "grad_norm": 2.1002702713012695, "learning_rate": 8.934977578475336e-05, "loss": 3.4969, "step": 12545 }, { "epoch": 0.8526973773610544, "grad_norm": 3.216614007949829, "learning_rate": 8.93455292838701e-05, "loss": 3.5337, "step": 12550 }, { "epoch": 0.8530370974317163, "grad_norm": 2.5423402786254883, "learning_rate": 8.934128278298683e-05, "loss": 3.3793, "step": 12555 }, { "epoch": 0.853376817502378, "grad_norm": 2.6236515045166016, "learning_rate": 8.933703628210354e-05, "loss": 3.2957, "step": 12560 }, { "epoch": 0.8537165375730398, "grad_norm": 2.173349142074585, "learning_rate": 8.933278978122029e-05, "loss": 3.5842, "step": 12565 }, { "epoch": 0.8540562576437016, "grad_norm": 2.4947588443756104, "learning_rate": 8.9328543280337e-05, "loss": 3.2113, "step": 12570 }, { "epoch": 0.8543959777143634, "grad_norm": 2.3870837688446045, "learning_rate": 8.932429677945373e-05, "loss": 3.4508, "step": 12575 }, { "epoch": 0.8547356977850251, "grad_norm": 2.351508378982544, "learning_rate": 8.932005027857047e-05, "loss": 3.3167, "step": 12580 }, { "epoch": 0.8550754178556869, "grad_norm": 2.275207757949829, "learning_rate": 8.931580377768718e-05, "loss": 3.0924, "step": 12585 }, { "epoch": 0.8554151379263487, "grad_norm": 1.9779943227767944, "learning_rate": 8.931155727680391e-05, "loss": 3.4408, "step": 12590 }, { "epoch": 0.8557548579970105, "grad_norm": 2.062061071395874, "learning_rate": 8.930731077592065e-05, "loss": 3.5864, "step": 12595 }, { "epoch": 0.8560945780676722, "grad_norm": 2.147888660430908, "learning_rate": 8.930306427503737e-05, "loss": 3.4524, "step": 12600 }, { "epoch": 0.856434298138334, "grad_norm": 3.2547719478607178, "learning_rate": 8.92988177741541e-05, "loss": 3.1025, "step": 12605 }, { "epoch": 0.8567740182089958, "grad_norm": 1.7158524990081787, "learning_rate": 8.929457127327084e-05, "loss": 3.4208, "step": 12610 }, { "epoch": 0.8571137382796575, "grad_norm": 2.0777182579040527, "learning_rate": 8.929032477238755e-05, "loss": 3.283, "step": 12615 }, { "epoch": 0.8574534583503194, "grad_norm": 2.6932685375213623, "learning_rate": 8.928607827150428e-05, "loss": 3.7409, "step": 12620 }, { "epoch": 0.8577931784209811, "grad_norm": 2.5824766159057617, "learning_rate": 8.928183177062102e-05, "loss": 3.4157, "step": 12625 }, { "epoch": 0.8581328984916429, "grad_norm": 2.1701784133911133, "learning_rate": 8.927758526973774e-05, "loss": 3.5725, "step": 12630 }, { "epoch": 0.8584726185623046, "grad_norm": 2.7500288486480713, "learning_rate": 8.927333876885446e-05, "loss": 3.271, "step": 12635 }, { "epoch": 0.8588123386329665, "grad_norm": 2.2366464138031006, "learning_rate": 8.92690922679712e-05, "loss": 3.4973, "step": 12640 }, { "epoch": 0.8591520587036282, "grad_norm": 1.7908543348312378, "learning_rate": 8.926484576708792e-05, "loss": 3.5301, "step": 12645 }, { "epoch": 0.85949177877429, "grad_norm": 2.123262405395508, "learning_rate": 8.926059926620465e-05, "loss": 3.3515, "step": 12650 }, { "epoch": 0.8598314988449518, "grad_norm": 1.9612798690795898, "learning_rate": 8.925635276532138e-05, "loss": 3.3157, "step": 12655 }, { "epoch": 0.8601712189156135, "grad_norm": 2.2451987266540527, "learning_rate": 8.92521062644381e-05, "loss": 3.3019, "step": 12660 }, { "epoch": 0.8605109389862753, "grad_norm": 2.1565518379211426, "learning_rate": 8.924785976355483e-05, "loss": 3.3352, "step": 12665 }, { "epoch": 0.860850659056937, "grad_norm": 2.283409595489502, "learning_rate": 8.924361326267156e-05, "loss": 3.3587, "step": 12670 }, { "epoch": 0.8611903791275989, "grad_norm": 1.6637094020843506, "learning_rate": 8.923936676178829e-05, "loss": 3.4987, "step": 12675 }, { "epoch": 0.8615300991982606, "grad_norm": 2.5579943656921387, "learning_rate": 8.923512026090502e-05, "loss": 3.3384, "step": 12680 }, { "epoch": 0.8618698192689224, "grad_norm": 2.9387600421905518, "learning_rate": 8.923087376002174e-05, "loss": 3.4482, "step": 12685 }, { "epoch": 0.8622095393395842, "grad_norm": 2.019212007522583, "learning_rate": 8.922662725913847e-05, "loss": 3.1966, "step": 12690 }, { "epoch": 0.862549259410246, "grad_norm": 2.327361583709717, "learning_rate": 8.92223807582552e-05, "loss": 3.3307, "step": 12695 }, { "epoch": 0.8628889794809077, "grad_norm": 2.6048264503479004, "learning_rate": 8.921813425737193e-05, "loss": 3.4112, "step": 12700 }, { "epoch": 0.8632286995515696, "grad_norm": 1.9804767370224, "learning_rate": 8.921388775648866e-05, "loss": 3.137, "step": 12705 }, { "epoch": 0.8635684196222313, "grad_norm": 2.4206314086914062, "learning_rate": 8.920964125560538e-05, "loss": 3.1874, "step": 12710 }, { "epoch": 0.863908139692893, "grad_norm": 2.8207027912139893, "learning_rate": 8.920539475472211e-05, "loss": 3.4444, "step": 12715 }, { "epoch": 0.8642478597635548, "grad_norm": 1.9976999759674072, "learning_rate": 8.920114825383884e-05, "loss": 3.4778, "step": 12720 }, { "epoch": 0.8645875798342166, "grad_norm": 2.2826948165893555, "learning_rate": 8.919690175295557e-05, "loss": 3.528, "step": 12725 }, { "epoch": 0.8649272999048784, "grad_norm": 1.950202226638794, "learning_rate": 8.91926552520723e-05, "loss": 3.3601, "step": 12730 }, { "epoch": 0.8652670199755401, "grad_norm": 2.4248416423797607, "learning_rate": 8.918840875118902e-05, "loss": 3.6821, "step": 12735 }, { "epoch": 0.865606740046202, "grad_norm": 2.4804511070251465, "learning_rate": 8.918416225030575e-05, "loss": 3.4957, "step": 12740 }, { "epoch": 0.8659464601168637, "grad_norm": 2.0834972858428955, "learning_rate": 8.917991574942248e-05, "loss": 3.4513, "step": 12745 }, { "epoch": 0.8662861801875255, "grad_norm": 1.7990913391113281, "learning_rate": 8.917566924853921e-05, "loss": 3.4229, "step": 12750 }, { "epoch": 0.8666259002581872, "grad_norm": 2.037576675415039, "learning_rate": 8.917142274765594e-05, "loss": 3.2739, "step": 12755 }, { "epoch": 0.866965620328849, "grad_norm": 2.4587152004241943, "learning_rate": 8.916717624677266e-05, "loss": 3.3435, "step": 12760 }, { "epoch": 0.8673053403995108, "grad_norm": 2.065459728240967, "learning_rate": 8.916292974588939e-05, "loss": 3.46, "step": 12765 }, { "epoch": 0.8676450604701725, "grad_norm": 2.088833808898926, "learning_rate": 8.915868324500611e-05, "loss": 3.4938, "step": 12770 }, { "epoch": 0.8679847805408344, "grad_norm": 2.0702526569366455, "learning_rate": 8.915443674412285e-05, "loss": 3.4722, "step": 12775 }, { "epoch": 0.8683245006114961, "grad_norm": 1.93660306930542, "learning_rate": 8.915019024323958e-05, "loss": 3.2508, "step": 12780 }, { "epoch": 0.8686642206821579, "grad_norm": 2.8605918884277344, "learning_rate": 8.91459437423563e-05, "loss": 3.5661, "step": 12785 }, { "epoch": 0.8690039407528197, "grad_norm": 2.237478494644165, "learning_rate": 8.914169724147303e-05, "loss": 3.3289, "step": 12790 }, { "epoch": 0.8693436608234815, "grad_norm": 1.5462170839309692, "learning_rate": 8.913745074058976e-05, "loss": 3.4486, "step": 12795 }, { "epoch": 0.8696833808941432, "grad_norm": 2.314127206802368, "learning_rate": 8.913320423970649e-05, "loss": 3.5927, "step": 12800 }, { "epoch": 0.870023100964805, "grad_norm": 2.1215624809265137, "learning_rate": 8.912895773882322e-05, "loss": 3.5857, "step": 12805 }, { "epoch": 0.8703628210354668, "grad_norm": 2.038442373275757, "learning_rate": 8.912471123793994e-05, "loss": 3.5813, "step": 12810 }, { "epoch": 0.8707025411061285, "grad_norm": 2.324225425720215, "learning_rate": 8.912046473705667e-05, "loss": 3.488, "step": 12815 }, { "epoch": 0.8710422611767903, "grad_norm": 2.0202338695526123, "learning_rate": 8.91162182361734e-05, "loss": 3.4948, "step": 12820 }, { "epoch": 0.8713819812474521, "grad_norm": 3.1543400287628174, "learning_rate": 8.911197173529013e-05, "loss": 3.5433, "step": 12825 }, { "epoch": 0.8717217013181139, "grad_norm": 1.6956403255462646, "learning_rate": 8.910772523440686e-05, "loss": 3.3524, "step": 12830 }, { "epoch": 0.8720614213887756, "grad_norm": 1.625807523727417, "learning_rate": 8.910347873352358e-05, "loss": 3.266, "step": 12835 }, { "epoch": 0.8724011414594374, "grad_norm": 2.3619906902313232, "learning_rate": 8.909923223264031e-05, "loss": 3.4372, "step": 12840 }, { "epoch": 0.8727408615300992, "grad_norm": 2.4061622619628906, "learning_rate": 8.909498573175704e-05, "loss": 3.3166, "step": 12845 }, { "epoch": 0.873080581600761, "grad_norm": 2.2131905555725098, "learning_rate": 8.909073923087377e-05, "loss": 3.6936, "step": 12850 }, { "epoch": 0.8734203016714227, "grad_norm": 2.0261764526367188, "learning_rate": 8.908649272999048e-05, "loss": 3.5052, "step": 12855 }, { "epoch": 0.8737600217420846, "grad_norm": 2.1919784545898438, "learning_rate": 8.908224622910722e-05, "loss": 3.2626, "step": 12860 }, { "epoch": 0.8740997418127463, "grad_norm": 1.908305287361145, "learning_rate": 8.907799972822395e-05, "loss": 3.3079, "step": 12865 }, { "epoch": 0.874439461883408, "grad_norm": 3.6500329971313477, "learning_rate": 8.907375322734067e-05, "loss": 3.0749, "step": 12870 }, { "epoch": 0.8747791819540699, "grad_norm": 2.219109296798706, "learning_rate": 8.906950672645741e-05, "loss": 3.5303, "step": 12875 }, { "epoch": 0.8751189020247316, "grad_norm": 1.99685800075531, "learning_rate": 8.906526022557414e-05, "loss": 3.1335, "step": 12880 }, { "epoch": 0.8754586220953934, "grad_norm": 2.4649786949157715, "learning_rate": 8.906101372469085e-05, "loss": 3.4878, "step": 12885 }, { "epoch": 0.8757983421660551, "grad_norm": 1.9280425310134888, "learning_rate": 8.905676722380759e-05, "loss": 3.3292, "step": 12890 }, { "epoch": 0.876138062236717, "grad_norm": 2.0135698318481445, "learning_rate": 8.905252072292432e-05, "loss": 3.4459, "step": 12895 }, { "epoch": 0.8764777823073787, "grad_norm": 2.0553817749023438, "learning_rate": 8.904827422204104e-05, "loss": 3.6205, "step": 12900 }, { "epoch": 0.8768175023780405, "grad_norm": 2.1602604389190674, "learning_rate": 8.904402772115778e-05, "loss": 3.4049, "step": 12905 }, { "epoch": 0.8771572224487023, "grad_norm": 2.554487705230713, "learning_rate": 8.90397812202745e-05, "loss": 3.4514, "step": 12910 }, { "epoch": 0.877496942519364, "grad_norm": 2.9594929218292236, "learning_rate": 8.903553471939122e-05, "loss": 3.0998, "step": 12915 }, { "epoch": 0.8778366625900258, "grad_norm": 2.31552791595459, "learning_rate": 8.903128821850796e-05, "loss": 3.3427, "step": 12920 }, { "epoch": 0.8781763826606876, "grad_norm": 2.297060012817383, "learning_rate": 8.902704171762468e-05, "loss": 3.196, "step": 12925 }, { "epoch": 0.8785161027313494, "grad_norm": 2.477471351623535, "learning_rate": 8.90227952167414e-05, "loss": 3.4359, "step": 12930 }, { "epoch": 0.8788558228020111, "grad_norm": 2.201030969619751, "learning_rate": 8.901854871585814e-05, "loss": 3.3868, "step": 12935 }, { "epoch": 0.8791955428726729, "grad_norm": 2.3145620822906494, "learning_rate": 8.901430221497486e-05, "loss": 3.3346, "step": 12940 }, { "epoch": 0.8795352629433347, "grad_norm": 1.816537618637085, "learning_rate": 8.901005571409159e-05, "loss": 3.4542, "step": 12945 }, { "epoch": 0.8798749830139965, "grad_norm": 1.8864638805389404, "learning_rate": 8.900580921320833e-05, "loss": 2.9929, "step": 12950 }, { "epoch": 0.8802147030846582, "grad_norm": 2.606241226196289, "learning_rate": 8.900156271232504e-05, "loss": 3.5346, "step": 12955 }, { "epoch": 0.8805544231553201, "grad_norm": 1.8780237436294556, "learning_rate": 8.899731621144177e-05, "loss": 3.4796, "step": 12960 }, { "epoch": 0.8808941432259818, "grad_norm": 2.0902504920959473, "learning_rate": 8.899306971055851e-05, "loss": 3.5528, "step": 12965 }, { "epoch": 0.8812338632966435, "grad_norm": 2.3323183059692383, "learning_rate": 8.898882320967523e-05, "loss": 3.4537, "step": 12970 }, { "epoch": 0.8815735833673053, "grad_norm": 2.0410828590393066, "learning_rate": 8.898457670879196e-05, "loss": 3.4332, "step": 12975 }, { "epoch": 0.8819133034379671, "grad_norm": 2.0101664066314697, "learning_rate": 8.89803302079087e-05, "loss": 3.5238, "step": 12980 }, { "epoch": 0.8822530235086289, "grad_norm": 2.3145689964294434, "learning_rate": 8.897608370702541e-05, "loss": 3.3466, "step": 12985 }, { "epoch": 0.8825927435792906, "grad_norm": 2.9193103313446045, "learning_rate": 8.897183720614214e-05, "loss": 3.5716, "step": 12990 }, { "epoch": 0.8829324636499525, "grad_norm": 2.369565010070801, "learning_rate": 8.896759070525888e-05, "loss": 3.468, "step": 12995 }, { "epoch": 0.8832721837206142, "grad_norm": 2.023770809173584, "learning_rate": 8.89633442043756e-05, "loss": 3.4391, "step": 13000 }, { "epoch": 0.883611903791276, "grad_norm": 2.9925358295440674, "learning_rate": 8.895909770349232e-05, "loss": 3.4964, "step": 13005 }, { "epoch": 0.8839516238619378, "grad_norm": 5.115194320678711, "learning_rate": 8.895485120260905e-05, "loss": 3.2782, "step": 13010 }, { "epoch": 0.8842913439325996, "grad_norm": 1.8293346166610718, "learning_rate": 8.895060470172578e-05, "loss": 3.2187, "step": 13015 }, { "epoch": 0.8846310640032613, "grad_norm": 1.6921168565750122, "learning_rate": 8.894635820084251e-05, "loss": 3.2743, "step": 13020 }, { "epoch": 0.884970784073923, "grad_norm": 2.1380624771118164, "learning_rate": 8.894211169995924e-05, "loss": 3.2137, "step": 13025 }, { "epoch": 0.8853105041445849, "grad_norm": 2.3241119384765625, "learning_rate": 8.893786519907596e-05, "loss": 3.405, "step": 13030 }, { "epoch": 0.8856502242152466, "grad_norm": 2.54184889793396, "learning_rate": 8.893361869819269e-05, "loss": 3.4104, "step": 13035 }, { "epoch": 0.8859899442859084, "grad_norm": 2.4962642192840576, "learning_rate": 8.892937219730942e-05, "loss": 3.1967, "step": 13040 }, { "epoch": 0.8863296643565702, "grad_norm": 1.788737177848816, "learning_rate": 8.892512569642615e-05, "loss": 3.4218, "step": 13045 }, { "epoch": 0.886669384427232, "grad_norm": 2.252824068069458, "learning_rate": 8.892087919554288e-05, "loss": 3.5111, "step": 13050 }, { "epoch": 0.8870091044978937, "grad_norm": 1.9539830684661865, "learning_rate": 8.89166326946596e-05, "loss": 3.3384, "step": 13055 }, { "epoch": 0.8873488245685555, "grad_norm": 2.8625988960266113, "learning_rate": 8.891238619377633e-05, "loss": 3.1979, "step": 13060 }, { "epoch": 0.8876885446392173, "grad_norm": 1.923761248588562, "learning_rate": 8.890813969289306e-05, "loss": 3.3952, "step": 13065 }, { "epoch": 0.8880282647098791, "grad_norm": 3.0056352615356445, "learning_rate": 8.890389319200979e-05, "loss": 3.3172, "step": 13070 }, { "epoch": 0.8883679847805408, "grad_norm": 2.1578927040100098, "learning_rate": 8.889964669112652e-05, "loss": 3.4347, "step": 13075 }, { "epoch": 0.8887077048512027, "grad_norm": 1.9011409282684326, "learning_rate": 8.889540019024324e-05, "loss": 3.4688, "step": 13080 }, { "epoch": 0.8890474249218644, "grad_norm": 2.308225393295288, "learning_rate": 8.889115368935997e-05, "loss": 3.3298, "step": 13085 }, { "epoch": 0.8893871449925261, "grad_norm": 2.313910484313965, "learning_rate": 8.88869071884767e-05, "loss": 3.4784, "step": 13090 }, { "epoch": 0.889726865063188, "grad_norm": 2.3167572021484375, "learning_rate": 8.888266068759343e-05, "loss": 3.4088, "step": 13095 }, { "epoch": 0.8900665851338497, "grad_norm": 1.786078691482544, "learning_rate": 8.887841418671016e-05, "loss": 3.3644, "step": 13100 }, { "epoch": 0.8904063052045115, "grad_norm": 2.768312692642212, "learning_rate": 8.887416768582688e-05, "loss": 3.4611, "step": 13105 }, { "epoch": 0.8907460252751732, "grad_norm": 2.6162333488464355, "learning_rate": 8.886992118494361e-05, "loss": 3.4614, "step": 13110 }, { "epoch": 0.8910857453458351, "grad_norm": 2.2137365341186523, "learning_rate": 8.886567468406034e-05, "loss": 3.3557, "step": 13115 }, { "epoch": 0.8914254654164968, "grad_norm": 2.2785744667053223, "learning_rate": 8.886142818317707e-05, "loss": 3.5574, "step": 13120 }, { "epoch": 0.8917651854871586, "grad_norm": 2.1758687496185303, "learning_rate": 8.88571816822938e-05, "loss": 3.1697, "step": 13125 }, { "epoch": 0.8921049055578204, "grad_norm": 2.1688098907470703, "learning_rate": 8.885293518141052e-05, "loss": 3.2151, "step": 13130 }, { "epoch": 0.8924446256284821, "grad_norm": 2.6832351684570312, "learning_rate": 8.884868868052725e-05, "loss": 3.3154, "step": 13135 }, { "epoch": 0.8927843456991439, "grad_norm": 1.8473397493362427, "learning_rate": 8.884444217964398e-05, "loss": 3.4229, "step": 13140 }, { "epoch": 0.8931240657698056, "grad_norm": 1.936287522315979, "learning_rate": 8.884019567876071e-05, "loss": 3.5728, "step": 13145 }, { "epoch": 0.8934637858404675, "grad_norm": 2.4662015438079834, "learning_rate": 8.883594917787744e-05, "loss": 3.4527, "step": 13150 }, { "epoch": 0.8938035059111292, "grad_norm": 3.8504714965820312, "learning_rate": 8.883170267699416e-05, "loss": 3.5006, "step": 13155 }, { "epoch": 0.894143225981791, "grad_norm": 1.987755537033081, "learning_rate": 8.882745617611089e-05, "loss": 3.166, "step": 13160 }, { "epoch": 0.8944829460524528, "grad_norm": 1.9100432395935059, "learning_rate": 8.882320967522762e-05, "loss": 3.1983, "step": 13165 }, { "epoch": 0.8948226661231146, "grad_norm": 3.521446704864502, "learning_rate": 8.881896317434435e-05, "loss": 3.2565, "step": 13170 }, { "epoch": 0.8951623861937763, "grad_norm": 1.5010722875595093, "learning_rate": 8.881471667346108e-05, "loss": 3.4737, "step": 13175 }, { "epoch": 0.8955021062644382, "grad_norm": 2.792729139328003, "learning_rate": 8.88104701725778e-05, "loss": 3.3971, "step": 13180 }, { "epoch": 0.8958418263350999, "grad_norm": 2.1972968578338623, "learning_rate": 8.880622367169453e-05, "loss": 3.6645, "step": 13185 }, { "epoch": 0.8961815464057616, "grad_norm": 1.9702963829040527, "learning_rate": 8.880197717081126e-05, "loss": 3.2824, "step": 13190 }, { "epoch": 0.8965212664764234, "grad_norm": 3.380788564682007, "learning_rate": 8.879773066992799e-05, "loss": 3.5972, "step": 13195 }, { "epoch": 0.8968609865470852, "grad_norm": 1.9863908290863037, "learning_rate": 8.879348416904472e-05, "loss": 3.2562, "step": 13200 }, { "epoch": 0.897200706617747, "grad_norm": 2.170250177383423, "learning_rate": 8.878923766816144e-05, "loss": 3.2255, "step": 13205 }, { "epoch": 0.8975404266884087, "grad_norm": 1.925988793373108, "learning_rate": 8.878499116727816e-05, "loss": 3.2652, "step": 13210 }, { "epoch": 0.8978801467590706, "grad_norm": 2.326113224029541, "learning_rate": 8.87807446663949e-05, "loss": 3.2761, "step": 13215 }, { "epoch": 0.8982198668297323, "grad_norm": 2.4070301055908203, "learning_rate": 8.877649816551163e-05, "loss": 3.3647, "step": 13220 }, { "epoch": 0.8985595869003941, "grad_norm": 2.393268346786499, "learning_rate": 8.877225166462834e-05, "loss": 3.575, "step": 13225 }, { "epoch": 0.8988993069710558, "grad_norm": 2.325731039047241, "learning_rate": 8.876800516374508e-05, "loss": 3.6121, "step": 13230 }, { "epoch": 0.8992390270417177, "grad_norm": 2.6531436443328857, "learning_rate": 8.876375866286181e-05, "loss": 3.2707, "step": 13235 }, { "epoch": 0.8995787471123794, "grad_norm": 1.7149523496627808, "learning_rate": 8.875951216197853e-05, "loss": 3.5532, "step": 13240 }, { "epoch": 0.8999184671830411, "grad_norm": 2.0498011112213135, "learning_rate": 8.875526566109527e-05, "loss": 3.1953, "step": 13245 }, { "epoch": 0.900258187253703, "grad_norm": 2.6770033836364746, "learning_rate": 8.8751019160212e-05, "loss": 3.4046, "step": 13250 }, { "epoch": 0.9005979073243647, "grad_norm": 2.205594062805176, "learning_rate": 8.874677265932871e-05, "loss": 3.3253, "step": 13255 }, { "epoch": 0.9009376273950265, "grad_norm": 2.046020984649658, "learning_rate": 8.874252615844545e-05, "loss": 3.3483, "step": 13260 }, { "epoch": 0.9012773474656883, "grad_norm": 2.020634651184082, "learning_rate": 8.873827965756218e-05, "loss": 3.334, "step": 13265 }, { "epoch": 0.9016170675363501, "grad_norm": 3.039229393005371, "learning_rate": 8.87340331566789e-05, "loss": 3.3703, "step": 13270 }, { "epoch": 0.9019567876070118, "grad_norm": 4.250556945800781, "learning_rate": 8.872978665579564e-05, "loss": 3.5719, "step": 13275 }, { "epoch": 0.9022965076776736, "grad_norm": 2.8187313079833984, "learning_rate": 8.872554015491235e-05, "loss": 3.6244, "step": 13280 }, { "epoch": 0.9026362277483354, "grad_norm": 1.824264407157898, "learning_rate": 8.872129365402908e-05, "loss": 3.456, "step": 13285 }, { "epoch": 0.9029759478189971, "grad_norm": 2.8311729431152344, "learning_rate": 8.871704715314582e-05, "loss": 3.4809, "step": 13290 }, { "epoch": 0.9033156678896589, "grad_norm": 2.219768762588501, "learning_rate": 8.871280065226253e-05, "loss": 3.6001, "step": 13295 }, { "epoch": 0.9036553879603207, "grad_norm": 2.002916097640991, "learning_rate": 8.870855415137926e-05, "loss": 3.5155, "step": 13300 }, { "epoch": 0.9039951080309825, "grad_norm": 2.181034803390503, "learning_rate": 8.8704307650496e-05, "loss": 3.2217, "step": 13305 }, { "epoch": 0.9043348281016442, "grad_norm": 1.8515734672546387, "learning_rate": 8.870006114961272e-05, "loss": 3.5193, "step": 13310 }, { "epoch": 0.904674548172306, "grad_norm": 1.9891297817230225, "learning_rate": 8.869581464872945e-05, "loss": 3.6964, "step": 13315 }, { "epoch": 0.9050142682429678, "grad_norm": 1.9584465026855469, "learning_rate": 8.869156814784619e-05, "loss": 3.2573, "step": 13320 }, { "epoch": 0.9053539883136296, "grad_norm": 1.926998257637024, "learning_rate": 8.86873216469629e-05, "loss": 3.2995, "step": 13325 }, { "epoch": 0.9056937083842913, "grad_norm": 2.677941083908081, "learning_rate": 8.868307514607963e-05, "loss": 3.3437, "step": 13330 }, { "epoch": 0.9060334284549532, "grad_norm": 2.195709228515625, "learning_rate": 8.867882864519637e-05, "loss": 3.4825, "step": 13335 }, { "epoch": 0.9063731485256149, "grad_norm": 1.9107495546340942, "learning_rate": 8.867458214431309e-05, "loss": 3.125, "step": 13340 }, { "epoch": 0.9067128685962766, "grad_norm": 1.839128017425537, "learning_rate": 8.867033564342981e-05, "loss": 3.4562, "step": 13345 }, { "epoch": 0.9070525886669385, "grad_norm": 2.709388256072998, "learning_rate": 8.866608914254654e-05, "loss": 3.3654, "step": 13350 }, { "epoch": 0.9073923087376002, "grad_norm": 2.196286678314209, "learning_rate": 8.866184264166327e-05, "loss": 3.347, "step": 13355 }, { "epoch": 0.907732028808262, "grad_norm": 2.0276081562042236, "learning_rate": 8.865759614078e-05, "loss": 3.3006, "step": 13360 }, { "epoch": 0.9080717488789237, "grad_norm": 2.9177396297454834, "learning_rate": 8.865334963989673e-05, "loss": 3.2102, "step": 13365 }, { "epoch": 0.9084114689495856, "grad_norm": 2.3971760272979736, "learning_rate": 8.864910313901345e-05, "loss": 3.3668, "step": 13370 }, { "epoch": 0.9087511890202473, "grad_norm": 1.934773325920105, "learning_rate": 8.864485663813018e-05, "loss": 3.04, "step": 13375 }, { "epoch": 0.9090909090909091, "grad_norm": 1.9874131679534912, "learning_rate": 8.864061013724691e-05, "loss": 3.5997, "step": 13380 }, { "epoch": 0.9094306291615709, "grad_norm": 2.0478274822235107, "learning_rate": 8.863636363636364e-05, "loss": 3.5252, "step": 13385 }, { "epoch": 0.9097703492322327, "grad_norm": 2.533780813217163, "learning_rate": 8.863211713548037e-05, "loss": 3.4548, "step": 13390 }, { "epoch": 0.9101100693028944, "grad_norm": 2.431217670440674, "learning_rate": 8.86278706345971e-05, "loss": 3.1835, "step": 13395 }, { "epoch": 0.9104497893735561, "grad_norm": 2.06567645072937, "learning_rate": 8.862362413371382e-05, "loss": 3.1748, "step": 13400 }, { "epoch": 0.910789509444218, "grad_norm": 2.533534288406372, "learning_rate": 8.861937763283055e-05, "loss": 3.8377, "step": 13405 }, { "epoch": 0.9111292295148797, "grad_norm": 1.6790692806243896, "learning_rate": 8.861513113194728e-05, "loss": 3.1369, "step": 13410 }, { "epoch": 0.9114689495855415, "grad_norm": 1.5203920602798462, "learning_rate": 8.8610884631064e-05, "loss": 3.4221, "step": 13415 }, { "epoch": 0.9118086696562033, "grad_norm": 1.9265767335891724, "learning_rate": 8.860663813018073e-05, "loss": 3.6573, "step": 13420 }, { "epoch": 0.9121483897268651, "grad_norm": 3.2400338649749756, "learning_rate": 8.860239162929746e-05, "loss": 3.3486, "step": 13425 }, { "epoch": 0.9124881097975268, "grad_norm": 1.939512848854065, "learning_rate": 8.859814512841419e-05, "loss": 3.129, "step": 13430 }, { "epoch": 0.9128278298681887, "grad_norm": 2.470653772354126, "learning_rate": 8.859389862753092e-05, "loss": 3.3915, "step": 13435 }, { "epoch": 0.9131675499388504, "grad_norm": 2.515514612197876, "learning_rate": 8.858965212664765e-05, "loss": 3.2656, "step": 13440 }, { "epoch": 0.9135072700095122, "grad_norm": 2.438286781311035, "learning_rate": 8.858540562576437e-05, "loss": 3.3285, "step": 13445 }, { "epoch": 0.9138469900801739, "grad_norm": 2.142879009246826, "learning_rate": 8.85811591248811e-05, "loss": 3.2102, "step": 13450 }, { "epoch": 0.9141867101508357, "grad_norm": 1.9283112287521362, "learning_rate": 8.857691262399783e-05, "loss": 3.3321, "step": 13455 }, { "epoch": 0.9145264302214975, "grad_norm": 2.196021556854248, "learning_rate": 8.857266612311456e-05, "loss": 3.5754, "step": 13460 }, { "epoch": 0.9148661502921592, "grad_norm": 2.6829655170440674, "learning_rate": 8.856841962223129e-05, "loss": 3.4622, "step": 13465 }, { "epoch": 0.9152058703628211, "grad_norm": 2.14595627784729, "learning_rate": 8.856417312134801e-05, "loss": 3.5408, "step": 13470 }, { "epoch": 0.9155455904334828, "grad_norm": 1.946702003479004, "learning_rate": 8.855992662046474e-05, "loss": 3.3058, "step": 13475 }, { "epoch": 0.9158853105041446, "grad_norm": 2.204606771469116, "learning_rate": 8.855568011958147e-05, "loss": 3.2866, "step": 13480 }, { "epoch": 0.9162250305748063, "grad_norm": 1.8814940452575684, "learning_rate": 8.85514336186982e-05, "loss": 3.317, "step": 13485 }, { "epoch": 0.9165647506454682, "grad_norm": 2.5195305347442627, "learning_rate": 8.854718711781493e-05, "loss": 3.3938, "step": 13490 }, { "epoch": 0.9169044707161299, "grad_norm": 2.58943247795105, "learning_rate": 8.854294061693165e-05, "loss": 3.4735, "step": 13495 }, { "epoch": 0.9172441907867916, "grad_norm": 2.0806257724761963, "learning_rate": 8.853869411604838e-05, "loss": 3.3306, "step": 13500 }, { "epoch": 0.9175839108574535, "grad_norm": 2.1838865280151367, "learning_rate": 8.853444761516511e-05, "loss": 3.4814, "step": 13505 }, { "epoch": 0.9179236309281152, "grad_norm": 1.8753262758255005, "learning_rate": 8.853020111428184e-05, "loss": 3.7305, "step": 13510 }, { "epoch": 0.918263350998777, "grad_norm": 1.9816781282424927, "learning_rate": 8.852595461339857e-05, "loss": 2.9606, "step": 13515 }, { "epoch": 0.9186030710694388, "grad_norm": 3.587141275405884, "learning_rate": 8.85217081125153e-05, "loss": 3.4243, "step": 13520 }, { "epoch": 0.9189427911401006, "grad_norm": 2.19759464263916, "learning_rate": 8.851746161163202e-05, "loss": 3.4074, "step": 13525 }, { "epoch": 0.9192825112107623, "grad_norm": 2.2791662216186523, "learning_rate": 8.851321511074875e-05, "loss": 3.1331, "step": 13530 }, { "epoch": 0.9196222312814241, "grad_norm": 2.315096855163574, "learning_rate": 8.850896860986548e-05, "loss": 3.4695, "step": 13535 }, { "epoch": 0.9199619513520859, "grad_norm": 1.9696284532546997, "learning_rate": 8.85047221089822e-05, "loss": 3.5776, "step": 13540 }, { "epoch": 0.9203016714227477, "grad_norm": 2.397587299346924, "learning_rate": 8.850047560809893e-05, "loss": 3.3722, "step": 13545 }, { "epoch": 0.9206413914934094, "grad_norm": 2.582270860671997, "learning_rate": 8.849622910721565e-05, "loss": 3.5678, "step": 13550 }, { "epoch": 0.9209811115640713, "grad_norm": 2.483783006668091, "learning_rate": 8.849198260633239e-05, "loss": 3.4023, "step": 13555 }, { "epoch": 0.921320831634733, "grad_norm": 2.3398361206054688, "learning_rate": 8.848773610544912e-05, "loss": 3.3163, "step": 13560 }, { "epoch": 0.9216605517053947, "grad_norm": 2.571629285812378, "learning_rate": 8.848348960456583e-05, "loss": 3.3716, "step": 13565 }, { "epoch": 0.9220002717760565, "grad_norm": 3.2918901443481445, "learning_rate": 8.847924310368257e-05, "loss": 3.3708, "step": 13570 }, { "epoch": 0.9223399918467183, "grad_norm": 1.8452770709991455, "learning_rate": 8.84749966027993e-05, "loss": 3.398, "step": 13575 }, { "epoch": 0.9226797119173801, "grad_norm": 2.250333547592163, "learning_rate": 8.847075010191602e-05, "loss": 3.2594, "step": 13580 }, { "epoch": 0.9230194319880418, "grad_norm": 2.371022939682007, "learning_rate": 8.846650360103276e-05, "loss": 3.1019, "step": 13585 }, { "epoch": 0.9233591520587037, "grad_norm": 2.073197603225708, "learning_rate": 8.846225710014949e-05, "loss": 3.406, "step": 13590 }, { "epoch": 0.9236988721293654, "grad_norm": 2.479724168777466, "learning_rate": 8.84580105992662e-05, "loss": 3.4001, "step": 13595 }, { "epoch": 0.9240385922000272, "grad_norm": 1.9628310203552246, "learning_rate": 8.845376409838294e-05, "loss": 3.3249, "step": 13600 }, { "epoch": 0.924378312270689, "grad_norm": 2.0974276065826416, "learning_rate": 8.844951759749967e-05, "loss": 3.5215, "step": 13605 }, { "epoch": 0.9247180323413507, "grad_norm": 1.8485081195831299, "learning_rate": 8.844527109661639e-05, "loss": 3.5419, "step": 13610 }, { "epoch": 0.9250577524120125, "grad_norm": 2.2899010181427, "learning_rate": 8.844102459573313e-05, "loss": 3.3265, "step": 13615 }, { "epoch": 0.9253974724826742, "grad_norm": 1.9482983350753784, "learning_rate": 8.843677809484985e-05, "loss": 3.4175, "step": 13620 }, { "epoch": 0.9257371925533361, "grad_norm": 1.6159031391143799, "learning_rate": 8.843253159396657e-05, "loss": 3.4558, "step": 13625 }, { "epoch": 0.9260769126239978, "grad_norm": 1.5389013290405273, "learning_rate": 8.842828509308331e-05, "loss": 3.2855, "step": 13630 }, { "epoch": 0.9264166326946596, "grad_norm": 2.0275795459747314, "learning_rate": 8.842403859220003e-05, "loss": 3.494, "step": 13635 }, { "epoch": 0.9267563527653214, "grad_norm": 2.7954111099243164, "learning_rate": 8.841979209131675e-05, "loss": 3.373, "step": 13640 }, { "epoch": 0.9270960728359832, "grad_norm": 2.6040596961975098, "learning_rate": 8.84155455904335e-05, "loss": 3.147, "step": 13645 }, { "epoch": 0.9274357929066449, "grad_norm": 2.22871732711792, "learning_rate": 8.841129908955021e-05, "loss": 3.5235, "step": 13650 }, { "epoch": 0.9277755129773066, "grad_norm": 2.38703989982605, "learning_rate": 8.840705258866694e-05, "loss": 3.4076, "step": 13655 }, { "epoch": 0.9281152330479685, "grad_norm": 1.7858290672302246, "learning_rate": 8.840280608778368e-05, "loss": 3.4969, "step": 13660 }, { "epoch": 0.9284549531186302, "grad_norm": 2.1832544803619385, "learning_rate": 8.83985595869004e-05, "loss": 3.381, "step": 13665 }, { "epoch": 0.928794673189292, "grad_norm": 2.0679519176483154, "learning_rate": 8.839431308601712e-05, "loss": 3.5828, "step": 13670 }, { "epoch": 0.9291343932599538, "grad_norm": 2.5063905715942383, "learning_rate": 8.839006658513386e-05, "loss": 3.5292, "step": 13675 }, { "epoch": 0.9294741133306156, "grad_norm": 2.0488312244415283, "learning_rate": 8.838582008425058e-05, "loss": 3.3768, "step": 13680 }, { "epoch": 0.9298138334012773, "grad_norm": 2.228555679321289, "learning_rate": 8.83815735833673e-05, "loss": 3.1115, "step": 13685 }, { "epoch": 0.9301535534719392, "grad_norm": 2.152878522872925, "learning_rate": 8.837732708248405e-05, "loss": 3.2995, "step": 13690 }, { "epoch": 0.9304932735426009, "grad_norm": 2.3059911727905273, "learning_rate": 8.837308058160076e-05, "loss": 3.3245, "step": 13695 }, { "epoch": 0.9308329936132627, "grad_norm": 1.7861649990081787, "learning_rate": 8.836883408071749e-05, "loss": 3.1596, "step": 13700 }, { "epoch": 0.9311727136839244, "grad_norm": 2.206907272338867, "learning_rate": 8.836458757983422e-05, "loss": 3.3185, "step": 13705 }, { "epoch": 0.9315124337545863, "grad_norm": 1.6934267282485962, "learning_rate": 8.836034107895095e-05, "loss": 3.3716, "step": 13710 }, { "epoch": 0.931852153825248, "grad_norm": 2.176382303237915, "learning_rate": 8.835609457806767e-05, "loss": 3.6565, "step": 13715 }, { "epoch": 0.9321918738959097, "grad_norm": 2.210233449935913, "learning_rate": 8.83518480771844e-05, "loss": 3.3932, "step": 13720 }, { "epoch": 0.9325315939665716, "grad_norm": 2.223986864089966, "learning_rate": 8.834760157630113e-05, "loss": 3.149, "step": 13725 }, { "epoch": 0.9328713140372333, "grad_norm": 2.146643877029419, "learning_rate": 8.834335507541786e-05, "loss": 3.2761, "step": 13730 }, { "epoch": 0.9332110341078951, "grad_norm": 1.9866292476654053, "learning_rate": 8.833910857453459e-05, "loss": 3.5739, "step": 13735 }, { "epoch": 0.9335507541785568, "grad_norm": 2.1259379386901855, "learning_rate": 8.833486207365131e-05, "loss": 3.3762, "step": 13740 }, { "epoch": 0.9338904742492187, "grad_norm": 2.23091459274292, "learning_rate": 8.833061557276804e-05, "loss": 3.5156, "step": 13745 }, { "epoch": 0.9342301943198804, "grad_norm": 2.2373642921447754, "learning_rate": 8.832636907188477e-05, "loss": 2.9909, "step": 13750 }, { "epoch": 0.9345699143905422, "grad_norm": 1.9161615371704102, "learning_rate": 8.83221225710015e-05, "loss": 3.6332, "step": 13755 }, { "epoch": 0.934909634461204, "grad_norm": 2.2176098823547363, "learning_rate": 8.831787607011823e-05, "loss": 3.3376, "step": 13760 }, { "epoch": 0.9352493545318658, "grad_norm": 2.131208896636963, "learning_rate": 8.831362956923495e-05, "loss": 3.4767, "step": 13765 }, { "epoch": 0.9355890746025275, "grad_norm": 1.7842234373092651, "learning_rate": 8.830938306835168e-05, "loss": 3.4977, "step": 13770 }, { "epoch": 0.9359287946731893, "grad_norm": 2.3341705799102783, "learning_rate": 8.830513656746841e-05, "loss": 3.1641, "step": 13775 }, { "epoch": 0.9362685147438511, "grad_norm": 2.0824098587036133, "learning_rate": 8.830089006658514e-05, "loss": 3.2631, "step": 13780 }, { "epoch": 0.9366082348145128, "grad_norm": 2.0086612701416016, "learning_rate": 8.829664356570187e-05, "loss": 3.4141, "step": 13785 }, { "epoch": 0.9369479548851746, "grad_norm": 1.7281066179275513, "learning_rate": 8.82923970648186e-05, "loss": 3.2272, "step": 13790 }, { "epoch": 0.9372876749558364, "grad_norm": 5.72705602645874, "learning_rate": 8.828815056393532e-05, "loss": 3.4826, "step": 13795 }, { "epoch": 0.9376273950264982, "grad_norm": 2.011859893798828, "learning_rate": 8.828390406305205e-05, "loss": 3.1706, "step": 13800 }, { "epoch": 0.9379671150971599, "grad_norm": 1.9322566986083984, "learning_rate": 8.827965756216878e-05, "loss": 3.3577, "step": 13805 }, { "epoch": 0.9383068351678218, "grad_norm": 2.202861785888672, "learning_rate": 8.82754110612855e-05, "loss": 3.0179, "step": 13810 }, { "epoch": 0.9386465552384835, "grad_norm": 1.9804877042770386, "learning_rate": 8.827116456040223e-05, "loss": 3.3431, "step": 13815 }, { "epoch": 0.9389862753091452, "grad_norm": 1.7738773822784424, "learning_rate": 8.826691805951896e-05, "loss": 3.4447, "step": 13820 }, { "epoch": 0.939325995379807, "grad_norm": 2.635908842086792, "learning_rate": 8.826267155863569e-05, "loss": 2.9742, "step": 13825 }, { "epoch": 0.9396657154504688, "grad_norm": 2.6631758213043213, "learning_rate": 8.825842505775242e-05, "loss": 3.4358, "step": 13830 }, { "epoch": 0.9400054355211306, "grad_norm": 2.182623863220215, "learning_rate": 8.825417855686915e-05, "loss": 3.3175, "step": 13835 }, { "epoch": 0.9403451555917923, "grad_norm": 1.8466054201126099, "learning_rate": 8.824993205598587e-05, "loss": 3.3017, "step": 13840 }, { "epoch": 0.9406848756624542, "grad_norm": 2.6957767009735107, "learning_rate": 8.82456855551026e-05, "loss": 3.1934, "step": 13845 }, { "epoch": 0.9410245957331159, "grad_norm": 1.862452507019043, "learning_rate": 8.824143905421933e-05, "loss": 3.4138, "step": 13850 }, { "epoch": 0.9413643158037777, "grad_norm": 1.6880158185958862, "learning_rate": 8.823719255333606e-05, "loss": 3.4685, "step": 13855 }, { "epoch": 0.9417040358744395, "grad_norm": 2.598220109939575, "learning_rate": 8.823294605245279e-05, "loss": 3.4484, "step": 13860 }, { "epoch": 0.9420437559451013, "grad_norm": 2.7040443420410156, "learning_rate": 8.822869955156951e-05, "loss": 3.0738, "step": 13865 }, { "epoch": 0.942383476015763, "grad_norm": 2.614271879196167, "learning_rate": 8.822445305068624e-05, "loss": 3.3705, "step": 13870 }, { "epoch": 0.9427231960864247, "grad_norm": 2.1435706615448, "learning_rate": 8.822020654980297e-05, "loss": 3.3804, "step": 13875 }, { "epoch": 0.9430629161570866, "grad_norm": 2.510624408721924, "learning_rate": 8.82159600489197e-05, "loss": 3.4607, "step": 13880 }, { "epoch": 0.9434026362277483, "grad_norm": 2.1398496627807617, "learning_rate": 8.821171354803643e-05, "loss": 3.0927, "step": 13885 }, { "epoch": 0.9437423562984101, "grad_norm": 2.012885570526123, "learning_rate": 8.820746704715315e-05, "loss": 3.5992, "step": 13890 }, { "epoch": 0.9440820763690719, "grad_norm": 2.2159385681152344, "learning_rate": 8.820322054626988e-05, "loss": 3.5982, "step": 13895 }, { "epoch": 0.9444217964397337, "grad_norm": 2.6419429779052734, "learning_rate": 8.819897404538661e-05, "loss": 3.3276, "step": 13900 }, { "epoch": 0.9447615165103954, "grad_norm": 2.559035301208496, "learning_rate": 8.819472754450332e-05, "loss": 3.3703, "step": 13905 }, { "epoch": 0.9451012365810572, "grad_norm": 2.028944492340088, "learning_rate": 8.819048104362007e-05, "loss": 3.5419, "step": 13910 }, { "epoch": 0.945440956651719, "grad_norm": 2.531256914138794, "learning_rate": 8.81862345427368e-05, "loss": 3.2886, "step": 13915 }, { "epoch": 0.9457806767223808, "grad_norm": 1.943610668182373, "learning_rate": 8.818198804185351e-05, "loss": 3.0905, "step": 13920 }, { "epoch": 0.9461203967930425, "grad_norm": 1.7919096946716309, "learning_rate": 8.817774154097025e-05, "loss": 3.5025, "step": 13925 }, { "epoch": 0.9464601168637043, "grad_norm": 2.541760206222534, "learning_rate": 8.817349504008698e-05, "loss": 3.2375, "step": 13930 }, { "epoch": 0.9467998369343661, "grad_norm": 2.244941234588623, "learning_rate": 8.816924853920369e-05, "loss": 3.2675, "step": 13935 }, { "epoch": 0.9471395570050278, "grad_norm": 2.9352285861968994, "learning_rate": 8.816500203832043e-05, "loss": 3.5117, "step": 13940 }, { "epoch": 0.9474792770756897, "grad_norm": 2.416170597076416, "learning_rate": 8.816075553743716e-05, "loss": 3.4273, "step": 13945 }, { "epoch": 0.9478189971463514, "grad_norm": 2.170205593109131, "learning_rate": 8.815650903655388e-05, "loss": 3.4009, "step": 13950 }, { "epoch": 0.9481587172170132, "grad_norm": 1.6898329257965088, "learning_rate": 8.815226253567062e-05, "loss": 3.1527, "step": 13955 }, { "epoch": 0.9484984372876749, "grad_norm": 1.983485221862793, "learning_rate": 8.814801603478735e-05, "loss": 3.3173, "step": 13960 }, { "epoch": 0.9488381573583368, "grad_norm": 1.9404557943344116, "learning_rate": 8.814376953390406e-05, "loss": 3.1729, "step": 13965 }, { "epoch": 0.9491778774289985, "grad_norm": 2.5552010536193848, "learning_rate": 8.81395230330208e-05, "loss": 3.5427, "step": 13970 }, { "epoch": 0.9495175974996602, "grad_norm": 2.0226283073425293, "learning_rate": 8.813527653213752e-05, "loss": 3.4911, "step": 13975 }, { "epoch": 0.9498573175703221, "grad_norm": 2.0019125938415527, "learning_rate": 8.813103003125424e-05, "loss": 3.5023, "step": 13980 }, { "epoch": 0.9501970376409838, "grad_norm": 2.6398468017578125, "learning_rate": 8.812678353037099e-05, "loss": 3.2408, "step": 13985 }, { "epoch": 0.9505367577116456, "grad_norm": 1.8547154664993286, "learning_rate": 8.81225370294877e-05, "loss": 3.4152, "step": 13990 }, { "epoch": 0.9508764777823073, "grad_norm": 2.198720932006836, "learning_rate": 8.811829052860443e-05, "loss": 3.4573, "step": 13995 }, { "epoch": 0.9512161978529692, "grad_norm": 2.129786968231201, "learning_rate": 8.811404402772117e-05, "loss": 3.2805, "step": 14000 }, { "epoch": 0.9515559179236309, "grad_norm": 1.6172959804534912, "learning_rate": 8.810979752683788e-05, "loss": 3.0507, "step": 14005 }, { "epoch": 0.9518956379942927, "grad_norm": 1.756527304649353, "learning_rate": 8.810555102595461e-05, "loss": 3.2287, "step": 14010 }, { "epoch": 0.9522353580649545, "grad_norm": 2.1423068046569824, "learning_rate": 8.810130452507135e-05, "loss": 3.4567, "step": 14015 }, { "epoch": 0.9525750781356163, "grad_norm": 2.0487897396087646, "learning_rate": 8.809705802418807e-05, "loss": 3.4508, "step": 14020 }, { "epoch": 0.952914798206278, "grad_norm": 2.11043381690979, "learning_rate": 8.80928115233048e-05, "loss": 3.436, "step": 14025 }, { "epoch": 0.9532545182769399, "grad_norm": 1.6826726198196411, "learning_rate": 8.808856502242154e-05, "loss": 3.3798, "step": 14030 }, { "epoch": 0.9535942383476016, "grad_norm": 2.220339298248291, "learning_rate": 8.808431852153825e-05, "loss": 3.3067, "step": 14035 }, { "epoch": 0.9539339584182633, "grad_norm": 1.793248176574707, "learning_rate": 8.808007202065498e-05, "loss": 3.4956, "step": 14040 }, { "epoch": 0.9542736784889251, "grad_norm": 3.180000066757202, "learning_rate": 8.807582551977172e-05, "loss": 3.3711, "step": 14045 }, { "epoch": 0.9546133985595869, "grad_norm": 2.0635814666748047, "learning_rate": 8.807157901888844e-05, "loss": 3.5886, "step": 14050 }, { "epoch": 0.9549531186302487, "grad_norm": 1.8114535808563232, "learning_rate": 8.806733251800516e-05, "loss": 3.3088, "step": 14055 }, { "epoch": 0.9552928387009104, "grad_norm": 2.163492202758789, "learning_rate": 8.806308601712189e-05, "loss": 3.3083, "step": 14060 }, { "epoch": 0.9556325587715723, "grad_norm": 2.1202809810638428, "learning_rate": 8.805883951623862e-05, "loss": 3.3011, "step": 14065 }, { "epoch": 0.955972278842234, "grad_norm": 5.391999244689941, "learning_rate": 8.805459301535535e-05, "loss": 3.3349, "step": 14070 }, { "epoch": 0.9563119989128958, "grad_norm": 1.9948046207427979, "learning_rate": 8.805034651447208e-05, "loss": 3.6082, "step": 14075 }, { "epoch": 0.9566517189835575, "grad_norm": 2.9853687286376953, "learning_rate": 8.80461000135888e-05, "loss": 3.4616, "step": 14080 }, { "epoch": 0.9569914390542194, "grad_norm": 1.9888941049575806, "learning_rate": 8.804185351270553e-05, "loss": 3.227, "step": 14085 }, { "epoch": 0.9573311591248811, "grad_norm": 2.2460319995880127, "learning_rate": 8.803760701182226e-05, "loss": 3.2841, "step": 14090 }, { "epoch": 0.9576708791955428, "grad_norm": 1.9560335874557495, "learning_rate": 8.803336051093899e-05, "loss": 3.4355, "step": 14095 }, { "epoch": 0.9580105992662047, "grad_norm": 2.192253828048706, "learning_rate": 8.802911401005572e-05, "loss": 3.3058, "step": 14100 }, { "epoch": 0.9583503193368664, "grad_norm": 2.2094857692718506, "learning_rate": 8.802486750917244e-05, "loss": 3.1896, "step": 14105 }, { "epoch": 0.9586900394075282, "grad_norm": 1.9594848155975342, "learning_rate": 8.802062100828917e-05, "loss": 3.1649, "step": 14110 }, { "epoch": 0.95902975947819, "grad_norm": 2.1726417541503906, "learning_rate": 8.80163745074059e-05, "loss": 3.3859, "step": 14115 }, { "epoch": 0.9593694795488518, "grad_norm": 2.2436349391937256, "learning_rate": 8.801212800652263e-05, "loss": 3.742, "step": 14120 }, { "epoch": 0.9597091996195135, "grad_norm": 2.6532297134399414, "learning_rate": 8.800788150563936e-05, "loss": 3.5557, "step": 14125 }, { "epoch": 0.9600489196901753, "grad_norm": 2.1286096572875977, "learning_rate": 8.800363500475608e-05, "loss": 3.4458, "step": 14130 }, { "epoch": 0.9603886397608371, "grad_norm": 2.4496521949768066, "learning_rate": 8.799938850387281e-05, "loss": 3.3531, "step": 14135 }, { "epoch": 0.9607283598314988, "grad_norm": 2.0682320594787598, "learning_rate": 8.799514200298954e-05, "loss": 3.5732, "step": 14140 }, { "epoch": 0.9610680799021606, "grad_norm": 1.8080471754074097, "learning_rate": 8.799089550210627e-05, "loss": 3.3395, "step": 14145 }, { "epoch": 0.9614077999728224, "grad_norm": 1.9637908935546875, "learning_rate": 8.7986649001223e-05, "loss": 3.5554, "step": 14150 }, { "epoch": 0.9617475200434842, "grad_norm": 2.443098306655884, "learning_rate": 8.798240250033972e-05, "loss": 3.417, "step": 14155 }, { "epoch": 0.9620872401141459, "grad_norm": 2.088620185852051, "learning_rate": 8.797815599945645e-05, "loss": 3.4192, "step": 14160 }, { "epoch": 0.9624269601848077, "grad_norm": 2.1158978939056396, "learning_rate": 8.797390949857318e-05, "loss": 3.4457, "step": 14165 }, { "epoch": 0.9627666802554695, "grad_norm": 2.1730027198791504, "learning_rate": 8.796966299768991e-05, "loss": 3.3562, "step": 14170 }, { "epoch": 0.9631064003261313, "grad_norm": 2.249563694000244, "learning_rate": 8.796541649680664e-05, "loss": 3.3702, "step": 14175 }, { "epoch": 0.963446120396793, "grad_norm": 2.219883441925049, "learning_rate": 8.796116999592336e-05, "loss": 3.3654, "step": 14180 }, { "epoch": 0.9637858404674549, "grad_norm": 2.218933343887329, "learning_rate": 8.795692349504009e-05, "loss": 3.5882, "step": 14185 }, { "epoch": 0.9641255605381166, "grad_norm": 2.0734035968780518, "learning_rate": 8.795267699415682e-05, "loss": 3.3429, "step": 14190 }, { "epoch": 0.9644652806087783, "grad_norm": 2.0548102855682373, "learning_rate": 8.794843049327355e-05, "loss": 3.42, "step": 14195 }, { "epoch": 0.9648050006794402, "grad_norm": 1.7602564096450806, "learning_rate": 8.794418399239028e-05, "loss": 3.3179, "step": 14200 }, { "epoch": 0.9651447207501019, "grad_norm": 2.2944834232330322, "learning_rate": 8.7939937491507e-05, "loss": 3.2597, "step": 14205 }, { "epoch": 0.9654844408207637, "grad_norm": 1.991561770439148, "learning_rate": 8.793569099062373e-05, "loss": 3.366, "step": 14210 }, { "epoch": 0.9658241608914254, "grad_norm": 2.391232490539551, "learning_rate": 8.793144448974046e-05, "loss": 3.289, "step": 14215 }, { "epoch": 0.9661638809620873, "grad_norm": 2.7012839317321777, "learning_rate": 8.792719798885719e-05, "loss": 3.6302, "step": 14220 }, { "epoch": 0.966503601032749, "grad_norm": 2.4641456604003906, "learning_rate": 8.792295148797392e-05, "loss": 3.3902, "step": 14225 }, { "epoch": 0.9668433211034108, "grad_norm": 2.2539710998535156, "learning_rate": 8.791870498709064e-05, "loss": 3.4028, "step": 14230 }, { "epoch": 0.9671830411740726, "grad_norm": 1.8685566186904907, "learning_rate": 8.791445848620737e-05, "loss": 3.4354, "step": 14235 }, { "epoch": 0.9675227612447344, "grad_norm": 2.8558037281036377, "learning_rate": 8.79102119853241e-05, "loss": 3.2689, "step": 14240 }, { "epoch": 0.9678624813153961, "grad_norm": 2.2012362480163574, "learning_rate": 8.790596548444083e-05, "loss": 3.2239, "step": 14245 }, { "epoch": 0.9682022013860578, "grad_norm": 1.9266421794891357, "learning_rate": 8.790171898355756e-05, "loss": 3.4227, "step": 14250 }, { "epoch": 0.9685419214567197, "grad_norm": 1.6999174356460571, "learning_rate": 8.789747248267428e-05, "loss": 3.5226, "step": 14255 }, { "epoch": 0.9688816415273814, "grad_norm": 1.8867813348770142, "learning_rate": 8.7893225981791e-05, "loss": 3.4338, "step": 14260 }, { "epoch": 0.9692213615980432, "grad_norm": 1.8755052089691162, "learning_rate": 8.788897948090774e-05, "loss": 3.2911, "step": 14265 }, { "epoch": 0.969561081668705, "grad_norm": 2.179497718811035, "learning_rate": 8.788473298002447e-05, "loss": 3.3562, "step": 14270 }, { "epoch": 0.9699008017393668, "grad_norm": 2.0865354537963867, "learning_rate": 8.788048647914118e-05, "loss": 3.4683, "step": 14275 }, { "epoch": 0.9702405218100285, "grad_norm": 2.681731700897217, "learning_rate": 8.787623997825792e-05, "loss": 3.3607, "step": 14280 }, { "epoch": 0.9705802418806904, "grad_norm": 2.1763722896575928, "learning_rate": 8.787199347737465e-05, "loss": 3.4264, "step": 14285 }, { "epoch": 0.9709199619513521, "grad_norm": 1.6971964836120605, "learning_rate": 8.786774697649137e-05, "loss": 3.4209, "step": 14290 }, { "epoch": 0.9712596820220138, "grad_norm": 1.9688156843185425, "learning_rate": 8.786350047560811e-05, "loss": 3.3535, "step": 14295 }, { "epoch": 0.9715994020926756, "grad_norm": 2.2232604026794434, "learning_rate": 8.785925397472484e-05, "loss": 3.3133, "step": 14300 }, { "epoch": 0.9719391221633374, "grad_norm": 2.290151357650757, "learning_rate": 8.785500747384155e-05, "loss": 3.5035, "step": 14305 }, { "epoch": 0.9722788422339992, "grad_norm": 2.404773712158203, "learning_rate": 8.785076097295829e-05, "loss": 3.3521, "step": 14310 }, { "epoch": 0.9726185623046609, "grad_norm": 2.1335349082946777, "learning_rate": 8.784651447207502e-05, "loss": 3.1834, "step": 14315 }, { "epoch": 0.9729582823753228, "grad_norm": 1.9422911405563354, "learning_rate": 8.784226797119174e-05, "loss": 3.3794, "step": 14320 }, { "epoch": 0.9732980024459845, "grad_norm": 2.3396897315979004, "learning_rate": 8.783802147030848e-05, "loss": 2.9371, "step": 14325 }, { "epoch": 0.9736377225166463, "grad_norm": 2.0865442752838135, "learning_rate": 8.783377496942519e-05, "loss": 3.4637, "step": 14330 }, { "epoch": 0.973977442587308, "grad_norm": 1.9042447805404663, "learning_rate": 8.782952846854192e-05, "loss": 3.4505, "step": 14335 }, { "epoch": 0.9743171626579699, "grad_norm": 1.6497031450271606, "learning_rate": 8.782528196765866e-05, "loss": 3.3054, "step": 14340 }, { "epoch": 0.9746568827286316, "grad_norm": 1.9402196407318115, "learning_rate": 8.782103546677538e-05, "loss": 3.54, "step": 14345 }, { "epoch": 0.9749966027992933, "grad_norm": 2.109180450439453, "learning_rate": 8.78167889658921e-05, "loss": 3.4088, "step": 14350 }, { "epoch": 0.9753363228699552, "grad_norm": 1.9940447807312012, "learning_rate": 8.781254246500884e-05, "loss": 2.9455, "step": 14355 }, { "epoch": 0.9756760429406169, "grad_norm": 1.989651083946228, "learning_rate": 8.780829596412556e-05, "loss": 3.5452, "step": 14360 }, { "epoch": 0.9760157630112787, "grad_norm": 1.6461108922958374, "learning_rate": 8.780404946324229e-05, "loss": 3.3594, "step": 14365 }, { "epoch": 0.9763554830819405, "grad_norm": 2.007110357284546, "learning_rate": 8.779980296235903e-05, "loss": 3.4678, "step": 14370 }, { "epoch": 0.9766952031526023, "grad_norm": 2.036989450454712, "learning_rate": 8.779555646147574e-05, "loss": 3.1833, "step": 14375 }, { "epoch": 0.977034923223264, "grad_norm": 1.9842579364776611, "learning_rate": 8.779130996059247e-05, "loss": 3.4287, "step": 14380 }, { "epoch": 0.9773746432939258, "grad_norm": 2.1302239894866943, "learning_rate": 8.778706345970921e-05, "loss": 2.9818, "step": 14385 }, { "epoch": 0.9777143633645876, "grad_norm": 2.6202449798583984, "learning_rate": 8.778281695882593e-05, "loss": 3.2379, "step": 14390 }, { "epoch": 0.9780540834352494, "grad_norm": 2.211385488510132, "learning_rate": 8.777857045794266e-05, "loss": 3.5694, "step": 14395 }, { "epoch": 0.9783938035059111, "grad_norm": 2.216343879699707, "learning_rate": 8.777432395705938e-05, "loss": 3.3475, "step": 14400 }, { "epoch": 0.978733523576573, "grad_norm": 2.0956733226776123, "learning_rate": 8.777007745617611e-05, "loss": 3.1273, "step": 14405 }, { "epoch": 0.9790732436472347, "grad_norm": 1.9711989164352417, "learning_rate": 8.776583095529284e-05, "loss": 3.1431, "step": 14410 }, { "epoch": 0.9794129637178964, "grad_norm": 1.7499561309814453, "learning_rate": 8.776158445440957e-05, "loss": 3.3817, "step": 14415 }, { "epoch": 0.9797526837885582, "grad_norm": 2.2698473930358887, "learning_rate": 8.77573379535263e-05, "loss": 3.1078, "step": 14420 }, { "epoch": 0.98009240385922, "grad_norm": 1.723395586013794, "learning_rate": 8.775309145264302e-05, "loss": 3.1959, "step": 14425 }, { "epoch": 0.9804321239298818, "grad_norm": 2.517970561981201, "learning_rate": 8.774884495175975e-05, "loss": 3.4642, "step": 14430 }, { "epoch": 0.9807718440005435, "grad_norm": 4.220458507537842, "learning_rate": 8.774459845087648e-05, "loss": 3.0291, "step": 14435 }, { "epoch": 0.9811115640712054, "grad_norm": 2.1747329235076904, "learning_rate": 8.774035194999321e-05, "loss": 3.4183, "step": 14440 }, { "epoch": 0.9814512841418671, "grad_norm": 1.8945306539535522, "learning_rate": 8.773610544910994e-05, "loss": 3.5566, "step": 14445 }, { "epoch": 0.9817910042125289, "grad_norm": 2.3650691509246826, "learning_rate": 8.773185894822666e-05, "loss": 2.9793, "step": 14450 }, { "epoch": 0.9821307242831907, "grad_norm": 1.4567208290100098, "learning_rate": 8.772761244734339e-05, "loss": 3.5264, "step": 14455 }, { "epoch": 0.9824704443538524, "grad_norm": 2.0683977603912354, "learning_rate": 8.772336594646012e-05, "loss": 3.2673, "step": 14460 }, { "epoch": 0.9828101644245142, "grad_norm": 1.6733638048171997, "learning_rate": 8.771911944557685e-05, "loss": 3.405, "step": 14465 }, { "epoch": 0.9831498844951759, "grad_norm": 2.290371894836426, "learning_rate": 8.771487294469358e-05, "loss": 3.3912, "step": 14470 }, { "epoch": 0.9834896045658378, "grad_norm": 2.452742338180542, "learning_rate": 8.77106264438103e-05, "loss": 3.0632, "step": 14475 }, { "epoch": 0.9838293246364995, "grad_norm": 2.672314167022705, "learning_rate": 8.770637994292703e-05, "loss": 3.2689, "step": 14480 }, { "epoch": 0.9841690447071613, "grad_norm": 2.217886209487915, "learning_rate": 8.770213344204376e-05, "loss": 3.4091, "step": 14485 }, { "epoch": 0.9845087647778231, "grad_norm": 1.8328226804733276, "learning_rate": 8.769788694116049e-05, "loss": 3.3401, "step": 14490 }, { "epoch": 0.9848484848484849, "grad_norm": 3.0413577556610107, "learning_rate": 8.769364044027722e-05, "loss": 3.5316, "step": 14495 }, { "epoch": 0.9851882049191466, "grad_norm": 2.2129108905792236, "learning_rate": 8.768939393939394e-05, "loss": 3.3554, "step": 14500 }, { "epoch": 0.9855279249898083, "grad_norm": 2.103874683380127, "learning_rate": 8.768514743851067e-05, "loss": 3.2374, "step": 14505 }, { "epoch": 0.9858676450604702, "grad_norm": 2.4148151874542236, "learning_rate": 8.76809009376274e-05, "loss": 3.2403, "step": 14510 }, { "epoch": 0.9862073651311319, "grad_norm": 2.0161502361297607, "learning_rate": 8.767665443674413e-05, "loss": 3.4749, "step": 14515 }, { "epoch": 0.9865470852017937, "grad_norm": 1.8706799745559692, "learning_rate": 8.767240793586086e-05, "loss": 3.2181, "step": 14520 }, { "epoch": 0.9868868052724555, "grad_norm": 1.7251646518707275, "learning_rate": 8.766816143497758e-05, "loss": 3.1983, "step": 14525 }, { "epoch": 0.9872265253431173, "grad_norm": 2.270615339279175, "learning_rate": 8.766391493409431e-05, "loss": 3.3982, "step": 14530 }, { "epoch": 0.987566245413779, "grad_norm": 2.3085289001464844, "learning_rate": 8.765966843321104e-05, "loss": 3.3536, "step": 14535 }, { "epoch": 0.9879059654844409, "grad_norm": 3.0530669689178467, "learning_rate": 8.765542193232777e-05, "loss": 3.4448, "step": 14540 }, { "epoch": 0.9882456855551026, "grad_norm": 1.8535819053649902, "learning_rate": 8.76511754314445e-05, "loss": 3.2159, "step": 14545 }, { "epoch": 0.9885854056257644, "grad_norm": 2.3296449184417725, "learning_rate": 8.764692893056122e-05, "loss": 3.2978, "step": 14550 }, { "epoch": 0.9889251256964261, "grad_norm": 3.934795379638672, "learning_rate": 8.764268242967795e-05, "loss": 3.3364, "step": 14555 }, { "epoch": 0.989264845767088, "grad_norm": 2.124008893966675, "learning_rate": 8.763843592879468e-05, "loss": 3.3821, "step": 14560 }, { "epoch": 0.9896045658377497, "grad_norm": 1.8345386981964111, "learning_rate": 8.763418942791141e-05, "loss": 3.4093, "step": 14565 }, { "epoch": 0.9899442859084114, "grad_norm": 1.6654386520385742, "learning_rate": 8.762994292702814e-05, "loss": 3.1737, "step": 14570 }, { "epoch": 0.9902840059790733, "grad_norm": 2.241921901702881, "learning_rate": 8.762569642614486e-05, "loss": 3.4927, "step": 14575 }, { "epoch": 0.990623726049735, "grad_norm": 2.1256651878356934, "learning_rate": 8.762144992526159e-05, "loss": 3.3094, "step": 14580 }, { "epoch": 0.9909634461203968, "grad_norm": 2.0170223712921143, "learning_rate": 8.761720342437832e-05, "loss": 3.3643, "step": 14585 }, { "epoch": 0.9913031661910585, "grad_norm": 1.6097463369369507, "learning_rate": 8.761295692349505e-05, "loss": 3.4341, "step": 14590 }, { "epoch": 0.9916428862617204, "grad_norm": 1.8417088985443115, "learning_rate": 8.760871042261178e-05, "loss": 3.2865, "step": 14595 }, { "epoch": 0.9919826063323821, "grad_norm": 1.768991231918335, "learning_rate": 8.760446392172849e-05, "loss": 3.3081, "step": 14600 }, { "epoch": 0.9923223264030439, "grad_norm": 2.3983466625213623, "learning_rate": 8.760021742084523e-05, "loss": 3.5978, "step": 14605 }, { "epoch": 0.9926620464737057, "grad_norm": 2.188138246536255, "learning_rate": 8.759597091996196e-05, "loss": 3.2194, "step": 14610 }, { "epoch": 0.9930017665443674, "grad_norm": 1.9903393983840942, "learning_rate": 8.759172441907867e-05, "loss": 3.4792, "step": 14615 }, { "epoch": 0.9933414866150292, "grad_norm": 2.0076169967651367, "learning_rate": 8.758747791819542e-05, "loss": 3.1612, "step": 14620 }, { "epoch": 0.993681206685691, "grad_norm": 2.4887337684631348, "learning_rate": 8.758323141731214e-05, "loss": 3.3489, "step": 14625 }, { "epoch": 0.9940209267563528, "grad_norm": 2.093916177749634, "learning_rate": 8.757898491642886e-05, "loss": 3.5879, "step": 14630 }, { "epoch": 0.9943606468270145, "grad_norm": 2.3841006755828857, "learning_rate": 8.75747384155456e-05, "loss": 3.2517, "step": 14635 }, { "epoch": 0.9947003668976763, "grad_norm": 2.1051478385925293, "learning_rate": 8.757049191466233e-05, "loss": 3.4353, "step": 14640 }, { "epoch": 0.9950400869683381, "grad_norm": 2.1174380779266357, "learning_rate": 8.756624541377904e-05, "loss": 3.3293, "step": 14645 }, { "epoch": 0.9953798070389999, "grad_norm": 2.258221387863159, "learning_rate": 8.756199891289578e-05, "loss": 3.2861, "step": 14650 }, { "epoch": 0.9957195271096616, "grad_norm": 2.3565449714660645, "learning_rate": 8.755775241201251e-05, "loss": 3.3414, "step": 14655 }, { "epoch": 0.9960592471803235, "grad_norm": 1.9497843980789185, "learning_rate": 8.755350591112923e-05, "loss": 3.38, "step": 14660 }, { "epoch": 0.9963989672509852, "grad_norm": 2.1668646335601807, "learning_rate": 8.754925941024597e-05, "loss": 3.5752, "step": 14665 }, { "epoch": 0.9967386873216469, "grad_norm": 1.968654990196228, "learning_rate": 8.75450129093627e-05, "loss": 3.4732, "step": 14670 }, { "epoch": 0.9970784073923087, "grad_norm": 2.0822348594665527, "learning_rate": 8.754076640847941e-05, "loss": 3.4756, "step": 14675 }, { "epoch": 0.9974181274629705, "grad_norm": 2.294171094894409, "learning_rate": 8.753651990759615e-05, "loss": 3.3943, "step": 14680 }, { "epoch": 0.9977578475336323, "grad_norm": 2.1640141010284424, "learning_rate": 8.753227340671287e-05, "loss": 3.3356, "step": 14685 }, { "epoch": 0.998097567604294, "grad_norm": 1.4752922058105469, "learning_rate": 8.75280269058296e-05, "loss": 3.4327, "step": 14690 }, { "epoch": 0.9984372876749559, "grad_norm": 1.9537899494171143, "learning_rate": 8.752378040494634e-05, "loss": 3.6484, "step": 14695 }, { "epoch": 0.9987770077456176, "grad_norm": 2.025458335876465, "learning_rate": 8.751953390406305e-05, "loss": 3.5732, "step": 14700 }, { "epoch": 0.9991167278162794, "grad_norm": 2.4621469974517822, "learning_rate": 8.751528740317978e-05, "loss": 3.3452, "step": 14705 }, { "epoch": 0.9994564478869412, "grad_norm": 2.4667470455169678, "learning_rate": 8.751104090229652e-05, "loss": 3.4136, "step": 14710 }, { "epoch": 0.999796167957603, "grad_norm": 2.1196353435516357, "learning_rate": 8.750679440141323e-05, "loss": 3.5448, "step": 14715 }, { "epoch": 1.0, "eval_bertscore": { "f1": 0.8405732463549616, "precision": 0.8403871552243415, "recall": 0.8418096007793862 }, "eval_bleu_4": 0.021728611886642827, "eval_exact_match": 0.0005814516910553348, "eval_loss": 3.374361753463745, "eval_meteor": 0.08745923686208683, "eval_rouge": { "rouge1": 0.12130096569603133, "rouge2": 0.0190627440734581, "rougeL": 0.10542714029787417, "rougeLsum": 0.10548014916626183 }, "eval_runtime": 1964.3551, "eval_samples_per_second": 5.253, "eval_steps_per_second": 0.657, "step": 14718 } ], "logging_steps": 5, "max_steps": 117744, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.169809307664384e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }