{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4063565778971047, "eval_steps": 500, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005805093969958639, "grad_norm": 0.7240252494812012, "learning_rate": 5.746459252379847e-07, "loss": 0.1079, "step": 100 }, { "epoch": 0.0011610187939917278, "grad_norm": 5.649069786071777, "learning_rate": 1.1550963547713026e-06, "loss": 0.1034, "step": 200 }, { "epoch": 0.0017415281909875916, "grad_norm": 2.677654504776001, "learning_rate": 1.7355467843046206e-06, "loss": 0.0933, "step": 300 }, { "epoch": 0.0023220375879834556, "grad_norm": 1.4429970979690552, "learning_rate": 2.3159972138379382e-06, "loss": 0.0998, "step": 400 }, { "epoch": 0.0029025469849793192, "grad_norm": 1.217147707939148, "learning_rate": 2.8964476433712563e-06, "loss": 0.0968, "step": 500 }, { "epoch": 0.0034830563819751833, "grad_norm": 0.10677797347307205, "learning_rate": 3.476898072904574e-06, "loss": 0.0903, "step": 600 }, { "epoch": 0.004063565778971047, "grad_norm": 18.24258804321289, "learning_rate": 4.0573485024378915e-06, "loss": 0.0898, "step": 700 }, { "epoch": 0.004644075175966911, "grad_norm": 0.49379172921180725, "learning_rate": 4.63779893197121e-06, "loss": 0.0741, "step": 800 }, { "epoch": 0.0052245845729627744, "grad_norm": 0.35244712233543396, "learning_rate": 5.218249361504528e-06, "loss": 0.0691, "step": 900 }, { "epoch": 0.0058050939699586385, "grad_norm": 1.6648738384246826, "learning_rate": 5.798699791037845e-06, "loss": 0.0652, "step": 1000 }, { "epoch": 0.0063856033669545025, "grad_norm": 1.3582569360733032, "learning_rate": 6.379150220571163e-06, "loss": 0.0632, "step": 1100 }, { "epoch": 0.0069661127639503665, "grad_norm": 8.560323715209961, "learning_rate": 6.9596006501044805e-06, "loss": 0.0573, "step": 1200 }, { "epoch": 0.0075466221609462305, "grad_norm": 1.7866237163543701, "learning_rate": 7.5400510796378e-06, "loss": 0.0517, "step": 1300 }, { "epoch": 0.008127131557942095, "grad_norm": 2.1602494716644287, "learning_rate": 8.120501509171117e-06, "loss": 0.0531, "step": 1400 }, { "epoch": 0.008707640954937958, "grad_norm": 3.2753777503967285, "learning_rate": 8.700951938704436e-06, "loss": 0.0446, "step": 1500 }, { "epoch": 0.009288150351933823, "grad_norm": 7.379251003265381, "learning_rate": 9.281402368237753e-06, "loss": 0.0415, "step": 1600 }, { "epoch": 0.009868659748929686, "grad_norm": 2.0278890132904053, "learning_rate": 9.861852797771071e-06, "loss": 0.0471, "step": 1700 }, { "epoch": 0.010449169145925549, "grad_norm": 1.9143238067626953, "learning_rate": 1.0442303227304388e-05, "loss": 0.0431, "step": 1800 }, { "epoch": 0.011029678542921414, "grad_norm": 3.3644015789031982, "learning_rate": 1.1022753656837706e-05, "loss": 0.0388, "step": 1900 }, { "epoch": 0.011610187939917277, "grad_norm": 5.762856960296631, "learning_rate": 1.1603204086371025e-05, "loss": 0.0387, "step": 2000 }, { "epoch": 0.012190697336913142, "grad_norm": 3.5617690086364746, "learning_rate": 1.2183654515904343e-05, "loss": 0.0437, "step": 2100 }, { "epoch": 0.012771206733909005, "grad_norm": 4.118985176086426, "learning_rate": 1.276410494543766e-05, "loss": 0.0368, "step": 2200 }, { "epoch": 0.01335171613090487, "grad_norm": 0.7795373201370239, "learning_rate": 1.3344555374970977e-05, "loss": 0.0441, "step": 2300 }, { "epoch": 0.013932225527900733, "grad_norm": 6.839756011962891, "learning_rate": 1.3925005804504295e-05, "loss": 0.0362, "step": 2400 }, { "epoch": 0.014512734924896596, "grad_norm": 0.0, "learning_rate": 1.4505456234037616e-05, "loss": 0.0355, "step": 2500 }, { "epoch": 0.015093244321892461, "grad_norm": 0.7248427271842957, "learning_rate": 1.508590666357093e-05, "loss": 0.0352, "step": 2600 }, { "epoch": 0.015673753718888326, "grad_norm": 1.2530128955841064, "learning_rate": 1.566635709310425e-05, "loss": 0.0337, "step": 2700 }, { "epoch": 0.01625426311588419, "grad_norm": 1.8551280498504639, "learning_rate": 1.6246807522637568e-05, "loss": 0.0392, "step": 2800 }, { "epoch": 0.016834772512880052, "grad_norm": 1.7844129800796509, "learning_rate": 1.6827257952170884e-05, "loss": 0.0327, "step": 2900 }, { "epoch": 0.017415281909875915, "grad_norm": 1.0492717027664185, "learning_rate": 1.74077083817042e-05, "loss": 0.0326, "step": 3000 }, { "epoch": 0.01799579130687178, "grad_norm": 1.5272563695907593, "learning_rate": 1.798815881123752e-05, "loss": 0.0363, "step": 3100 }, { "epoch": 0.018576300703867645, "grad_norm": 1.028568983078003, "learning_rate": 1.856860924077084e-05, "loss": 0.0326, "step": 3200 }, { "epoch": 0.01915681010086351, "grad_norm": 3.0390264987945557, "learning_rate": 1.9149059670304155e-05, "loss": 0.0327, "step": 3300 }, { "epoch": 0.01973731949785937, "grad_norm": 0.0, "learning_rate": 1.9729510099837475e-05, "loss": 0.034, "step": 3400 }, { "epoch": 0.020317828894855235, "grad_norm": 3.5412075519561768, "learning_rate": 2.0309960529370792e-05, "loss": 0.0301, "step": 3500 }, { "epoch": 0.020898338291851098, "grad_norm": 63.408573150634766, "learning_rate": 2.0890410958904112e-05, "loss": 0.0391, "step": 3600 }, { "epoch": 0.021478847688846964, "grad_norm": 2.8030202388763428, "learning_rate": 2.147086138843743e-05, "loss": 0.0269, "step": 3700 }, { "epoch": 0.022059357085842828, "grad_norm": 9.339249610900879, "learning_rate": 2.2051311817970746e-05, "loss": 0.0322, "step": 3800 }, { "epoch": 0.02263986648283869, "grad_norm": 1.6153712272644043, "learning_rate": 2.2631762247504066e-05, "loss": 0.0347, "step": 3900 }, { "epoch": 0.023220375879834554, "grad_norm": 2.073003053665161, "learning_rate": 2.3212212677037383e-05, "loss": 0.0252, "step": 4000 }, { "epoch": 0.023800885276830417, "grad_norm": 1.535114049911499, "learning_rate": 2.37926631065707e-05, "loss": 0.0319, "step": 4100 }, { "epoch": 0.024381394673826284, "grad_norm": 2.764871120452881, "learning_rate": 2.4373113536104016e-05, "loss": 0.0316, "step": 4200 }, { "epoch": 0.024961904070822147, "grad_norm": 1.169055700302124, "learning_rate": 2.4953563965637336e-05, "loss": 0.0289, "step": 4300 }, { "epoch": 0.02554241346781801, "grad_norm": 0.0, "learning_rate": 2.5534014395170653e-05, "loss": 0.0301, "step": 4400 }, { "epoch": 0.026122922864813873, "grad_norm": 5.054708957672119, "learning_rate": 2.611446482470397e-05, "loss": 0.0276, "step": 4500 }, { "epoch": 0.02670343226180974, "grad_norm": 3.1561946868896484, "learning_rate": 2.669491525423729e-05, "loss": 0.0284, "step": 4600 }, { "epoch": 0.027283941658805603, "grad_norm": 1.3254655599594116, "learning_rate": 2.727536568377061e-05, "loss": 0.029, "step": 4700 }, { "epoch": 0.027864451055801466, "grad_norm": 15.622490882873535, "learning_rate": 2.7855816113303924e-05, "loss": 0.0319, "step": 4800 }, { "epoch": 0.02844496045279733, "grad_norm": 11.673898696899414, "learning_rate": 2.8436266542837244e-05, "loss": 0.0311, "step": 4900 }, { "epoch": 0.029025469849793192, "grad_norm": 3.8026087284088135, "learning_rate": 2.901671697237056e-05, "loss": 0.0273, "step": 5000 }, { "epoch": 0.02960597924678906, "grad_norm": 0.24046263098716736, "learning_rate": 2.959716740190388e-05, "loss": 0.0283, "step": 5100 }, { "epoch": 0.030186488643784922, "grad_norm": 3.2492616176605225, "learning_rate": 3.01776178314372e-05, "loss": 0.0288, "step": 5200 }, { "epoch": 0.030766998040780785, "grad_norm": 5.840951919555664, "learning_rate": 3.075806826097051e-05, "loss": 0.0267, "step": 5300 }, { "epoch": 0.03134750743777665, "grad_norm": 4.141648769378662, "learning_rate": 3.1338518690503834e-05, "loss": 0.031, "step": 5400 }, { "epoch": 0.03192801683477251, "grad_norm": 0.6267948746681213, "learning_rate": 3.191896912003715e-05, "loss": 0.027, "step": 5500 }, { "epoch": 0.03250852623176838, "grad_norm": 4.212204933166504, "learning_rate": 3.249941954957047e-05, "loss": 0.0312, "step": 5600 }, { "epoch": 0.03308903562876424, "grad_norm": 9.002190589904785, "learning_rate": 3.3079869979103785e-05, "loss": 0.0328, "step": 5700 }, { "epoch": 0.033669545025760104, "grad_norm": 5.377740383148193, "learning_rate": 3.36603204086371e-05, "loss": 0.0289, "step": 5800 }, { "epoch": 0.03425005442275597, "grad_norm": 4.514215469360352, "learning_rate": 3.4240770838170425e-05, "loss": 0.0288, "step": 5900 }, { "epoch": 0.03483056381975183, "grad_norm": 11.520332336425781, "learning_rate": 3.482122126770374e-05, "loss": 0.0315, "step": 6000 }, { "epoch": 0.0354110732167477, "grad_norm": 2.2946815490722656, "learning_rate": 3.540167169723706e-05, "loss": 0.0289, "step": 6100 }, { "epoch": 0.03599158261374356, "grad_norm": 2.24802565574646, "learning_rate": 3.5982122126770375e-05, "loss": 0.0233, "step": 6200 }, { "epoch": 0.036572092010739424, "grad_norm": 0.0, "learning_rate": 3.656257255630369e-05, "loss": 0.0258, "step": 6300 }, { "epoch": 0.03715260140773529, "grad_norm": 0.0, "learning_rate": 3.714302298583701e-05, "loss": 0.0279, "step": 6400 }, { "epoch": 0.03773311080473115, "grad_norm": 0.2940079867839813, "learning_rate": 3.772347341537033e-05, "loss": 0.0239, "step": 6500 }, { "epoch": 0.03831362020172702, "grad_norm": 1.4886199235916138, "learning_rate": 3.830392384490365e-05, "loss": 0.0284, "step": 6600 }, { "epoch": 0.038894129598722876, "grad_norm": 2.378464698791504, "learning_rate": 3.8884374274436966e-05, "loss": 0.0292, "step": 6700 }, { "epoch": 0.03947463899571874, "grad_norm": 1.4737799167633057, "learning_rate": 3.946482470397028e-05, "loss": 0.0252, "step": 6800 }, { "epoch": 0.04005514839271461, "grad_norm": 1.7435976266860962, "learning_rate": 4.00452751335036e-05, "loss": 0.0252, "step": 6900 }, { "epoch": 0.04063565778971047, "grad_norm": 3.012014627456665, "learning_rate": 4.062572556303692e-05, "loss": 0.0341, "step": 7000 }, { "epoch": 0.041216167186706336, "grad_norm": 1.463402509689331, "learning_rate": 4.120617599257023e-05, "loss": 0.0242, "step": 7100 }, { "epoch": 0.041796676583702196, "grad_norm": 2.936508893966675, "learning_rate": 4.178662642210355e-05, "loss": 0.0303, "step": 7200 }, { "epoch": 0.04237718598069806, "grad_norm": 2.3473055362701416, "learning_rate": 4.2367076851636874e-05, "loss": 0.0268, "step": 7300 }, { "epoch": 0.04295769537769393, "grad_norm": 3.3722922801971436, "learning_rate": 4.294752728117019e-05, "loss": 0.0212, "step": 7400 }, { "epoch": 0.04353820477468979, "grad_norm": 3.0405075550079346, "learning_rate": 4.352797771070351e-05, "loss": 0.0277, "step": 7500 }, { "epoch": 0.044118714171685655, "grad_norm": 1.044071912765503, "learning_rate": 4.4108428140236824e-05, "loss": 0.0225, "step": 7600 }, { "epoch": 0.044699223568681515, "grad_norm": 1.0134261846542358, "learning_rate": 4.468887856977014e-05, "loss": 0.0253, "step": 7700 }, { "epoch": 0.04527973296567738, "grad_norm": 9.729911804199219, "learning_rate": 4.5269328999303464e-05, "loss": 0.0323, "step": 7800 }, { "epoch": 0.04586024236267325, "grad_norm": 2.1204724311828613, "learning_rate": 4.584977942883678e-05, "loss": 0.0279, "step": 7900 }, { "epoch": 0.04644075175966911, "grad_norm": 9.481526374816895, "learning_rate": 4.64302298583701e-05, "loss": 0.031, "step": 8000 }, { "epoch": 0.047021261156664974, "grad_norm": 0.6424680948257446, "learning_rate": 4.7010680287903415e-05, "loss": 0.0278, "step": 8100 }, { "epoch": 0.047601770553660834, "grad_norm": 2.0485119819641113, "learning_rate": 4.759113071743673e-05, "loss": 0.0261, "step": 8200 }, { "epoch": 0.0481822799506567, "grad_norm": 2.485046148300171, "learning_rate": 4.8171581146970055e-05, "loss": 0.0257, "step": 8300 }, { "epoch": 0.04876278934765257, "grad_norm": 1.421764612197876, "learning_rate": 4.875203157650337e-05, "loss": 0.0312, "step": 8400 }, { "epoch": 0.04934329874464843, "grad_norm": 1.3517789840698242, "learning_rate": 4.933248200603668e-05, "loss": 0.0325, "step": 8500 }, { "epoch": 0.049923808141644294, "grad_norm": 0.42899224162101746, "learning_rate": 4.9912932435570005e-05, "loss": 0.0225, "step": 8600 }, { "epoch": 0.05050431753864015, "grad_norm": 1.7937917709350586, "learning_rate": 4.9999966717127464e-05, "loss": 0.0289, "step": 8700 }, { "epoch": 0.05108482693563602, "grad_norm": 0.9099732041358948, "learning_rate": 4.9999842338357364e-05, "loss": 0.0277, "step": 8800 }, { "epoch": 0.051665336332631887, "grad_norm": 1.6560391187667847, "learning_rate": 4.999962582765702e-05, "loss": 0.0218, "step": 8900 }, { "epoch": 0.052245845729627746, "grad_norm": 1.163989543914795, "learning_rate": 4.999931718582432e-05, "loss": 0.0225, "step": 9000 }, { "epoch": 0.05282635512662361, "grad_norm": 7.1312971115112305, "learning_rate": 4.9998916413996715e-05, "loss": 0.0301, "step": 9100 }, { "epoch": 0.05340686452361948, "grad_norm": 10.158438682556152, "learning_rate": 4.999842351365117e-05, "loss": 0.0241, "step": 9200 }, { "epoch": 0.05398737392061534, "grad_norm": 0.19601650536060333, "learning_rate": 4.999783848660417e-05, "loss": 0.026, "step": 9300 }, { "epoch": 0.054567883317611206, "grad_norm": 0.716726541519165, "learning_rate": 4.999716133501171e-05, "loss": 0.0247, "step": 9400 }, { "epoch": 0.055148392714607065, "grad_norm": 2.3324790000915527, "learning_rate": 4.99963920613693e-05, "loss": 0.0237, "step": 9500 }, { "epoch": 0.05572890211160293, "grad_norm": 5.88576078414917, "learning_rate": 4.9995530668511946e-05, "loss": 0.0213, "step": 9600 }, { "epoch": 0.0563094115085988, "grad_norm": 0.17054684460163116, "learning_rate": 4.9994577159614144e-05, "loss": 0.0225, "step": 9700 }, { "epoch": 0.05688992090559466, "grad_norm": 1.9155703783035278, "learning_rate": 4.9993531538189854e-05, "loss": 0.0292, "step": 9800 }, { "epoch": 0.057470430302590525, "grad_norm": 0.0, "learning_rate": 4.99923938080925e-05, "loss": 0.025, "step": 9900 }, { "epoch": 0.058050939699586385, "grad_norm": 3.8306055068969727, "learning_rate": 4.999116397351497e-05, "loss": 0.0257, "step": 10000 }, { "epoch": 0.05863144909658225, "grad_norm": 2.8682026863098145, "learning_rate": 4.998984203898957e-05, "loss": 0.0276, "step": 10100 }, { "epoch": 0.05921195849357812, "grad_norm": 1.6766282320022583, "learning_rate": 4.9988428009388026e-05, "loss": 0.0261, "step": 10200 }, { "epoch": 0.05979246789057398, "grad_norm": 1.1323601007461548, "learning_rate": 4.998692188992147e-05, "loss": 0.0219, "step": 10300 }, { "epoch": 0.060372977287569844, "grad_norm": 1.3007240295410156, "learning_rate": 4.998532368614038e-05, "loss": 0.0321, "step": 10400 }, { "epoch": 0.060953486684565704, "grad_norm": 0.4534103572368622, "learning_rate": 4.998363340393465e-05, "loss": 0.0271, "step": 10500 }, { "epoch": 0.06153399608156157, "grad_norm": 1.0366442203521729, "learning_rate": 4.9981851049533446e-05, "loss": 0.0212, "step": 10600 }, { "epoch": 0.06211450547855744, "grad_norm": 0.7673536539077759, "learning_rate": 4.9979976629505305e-05, "loss": 0.0214, "step": 10700 }, { "epoch": 0.0626950148755533, "grad_norm": 6.375418186187744, "learning_rate": 4.9978010150758016e-05, "loss": 0.0241, "step": 10800 }, { "epoch": 0.06327552427254916, "grad_norm": 2.6672778129577637, "learning_rate": 4.9975951620538644e-05, "loss": 0.023, "step": 10900 }, { "epoch": 0.06385603366954502, "grad_norm": 0.0, "learning_rate": 4.9973801046433494e-05, "loss": 0.0238, "step": 11000 }, { "epoch": 0.06443654306654088, "grad_norm": 0.47239989042282104, "learning_rate": 4.997155843636808e-05, "loss": 0.023, "step": 11100 }, { "epoch": 0.06501705246353676, "grad_norm": 2.6296310424804688, "learning_rate": 4.996922379860708e-05, "loss": 0.0247, "step": 11200 }, { "epoch": 0.06559756186053262, "grad_norm": 1.4962682723999023, "learning_rate": 4.996679714175436e-05, "loss": 0.0217, "step": 11300 }, { "epoch": 0.06617807125752848, "grad_norm": 1.51057767868042, "learning_rate": 4.996427847475286e-05, "loss": 0.0249, "step": 11400 }, { "epoch": 0.06675858065452435, "grad_norm": 0.9044837951660156, "learning_rate": 4.9961667806884625e-05, "loss": 0.0274, "step": 11500 }, { "epoch": 0.06733909005152021, "grad_norm": 1.1930240392684937, "learning_rate": 4.9958965147770764e-05, "loss": 0.0249, "step": 11600 }, { "epoch": 0.06791959944851607, "grad_norm": 0.13868218660354614, "learning_rate": 4.995617050737138e-05, "loss": 0.0285, "step": 11700 }, { "epoch": 0.06850010884551194, "grad_norm": 1.4061717987060547, "learning_rate": 4.995328389598556e-05, "loss": 0.0235, "step": 11800 }, { "epoch": 0.0690806182425078, "grad_norm": 3.641000509262085, "learning_rate": 4.995030532425134e-05, "loss": 0.0179, "step": 11900 }, { "epoch": 0.06966112763950366, "grad_norm": 0.7060804963111877, "learning_rate": 4.994723480314565e-05, "loss": 0.0205, "step": 12000 }, { "epoch": 0.07024163703649954, "grad_norm": 0.5786570310592651, "learning_rate": 4.994407234398427e-05, "loss": 0.0236, "step": 12100 }, { "epoch": 0.0708221464334954, "grad_norm": 1.2352372407913208, "learning_rate": 4.994081795842183e-05, "loss": 0.0217, "step": 12200 }, { "epoch": 0.07140265583049125, "grad_norm": 4.269515037536621, "learning_rate": 4.9937471658451715e-05, "loss": 0.0207, "step": 12300 }, { "epoch": 0.07198316522748711, "grad_norm": 0.17409491539001465, "learning_rate": 4.9934033456406035e-05, "loss": 0.0186, "step": 12400 }, { "epoch": 0.07256367462448299, "grad_norm": 1.2208822965621948, "learning_rate": 4.993050336495562e-05, "loss": 0.0261, "step": 12500 }, { "epoch": 0.07314418402147885, "grad_norm": 0.0, "learning_rate": 4.9926881397109896e-05, "loss": 0.0285, "step": 12600 }, { "epoch": 0.07372469341847471, "grad_norm": 7.06033992767334, "learning_rate": 4.99231675662169e-05, "loss": 0.0202, "step": 12700 }, { "epoch": 0.07430520281547058, "grad_norm": 0.0, "learning_rate": 4.9919361885963234e-05, "loss": 0.0276, "step": 12800 }, { "epoch": 0.07488571221246644, "grad_norm": 0.406585693359375, "learning_rate": 4.991546437037396e-05, "loss": 0.02, "step": 12900 }, { "epoch": 0.0754662216094623, "grad_norm": 0.26512107253074646, "learning_rate": 4.9911475033812596e-05, "loss": 0.0234, "step": 13000 }, { "epoch": 0.07604673100645817, "grad_norm": 1.349731206893921, "learning_rate": 4.990739389098105e-05, "loss": 0.027, "step": 13100 }, { "epoch": 0.07662724040345403, "grad_norm": 4.345367908477783, "learning_rate": 4.990322095691956e-05, "loss": 0.02, "step": 13200 }, { "epoch": 0.07720774980044989, "grad_norm": 0.672780454158783, "learning_rate": 4.9898956247006636e-05, "loss": 0.0199, "step": 13300 }, { "epoch": 0.07778825919744575, "grad_norm": 0.8305972218513489, "learning_rate": 4.9894599776959015e-05, "loss": 0.0217, "step": 13400 }, { "epoch": 0.07836876859444163, "grad_norm": 0.0, "learning_rate": 4.9890151562831606e-05, "loss": 0.0245, "step": 13500 }, { "epoch": 0.07894927799143749, "grad_norm": 0.9174224734306335, "learning_rate": 4.9885611621017403e-05, "loss": 0.0184, "step": 13600 }, { "epoch": 0.07952978738843335, "grad_norm": 5.607458114624023, "learning_rate": 4.988097996824746e-05, "loss": 0.0196, "step": 13700 }, { "epoch": 0.08011029678542922, "grad_norm": 0.9925137162208557, "learning_rate": 4.987625662159083e-05, "loss": 0.021, "step": 13800 }, { "epoch": 0.08069080618242508, "grad_norm": 5.234767436981201, "learning_rate": 4.987144159845443e-05, "loss": 0.0226, "step": 13900 }, { "epoch": 0.08127131557942094, "grad_norm": 0.462223082780838, "learning_rate": 4.986653491658309e-05, "loss": 0.0201, "step": 14000 }, { "epoch": 0.08185182497641681, "grad_norm": 0.5672245025634766, "learning_rate": 4.986153659405939e-05, "loss": 0.0173, "step": 14100 }, { "epoch": 0.08243233437341267, "grad_norm": 7.8867011070251465, "learning_rate": 4.985644664930367e-05, "loss": 0.0173, "step": 14200 }, { "epoch": 0.08301284377040853, "grad_norm": 0.7449125647544861, "learning_rate": 4.9851265101073886e-05, "loss": 0.024, "step": 14300 }, { "epoch": 0.08359335316740439, "grad_norm": 0.16529901325702667, "learning_rate": 4.984599196846562e-05, "loss": 0.0227, "step": 14400 }, { "epoch": 0.08417386256440026, "grad_norm": 11.49720287322998, "learning_rate": 4.9840627270911934e-05, "loss": 0.0232, "step": 14500 }, { "epoch": 0.08475437196139612, "grad_norm": 1.724212884902954, "learning_rate": 4.9835171028183355e-05, "loss": 0.0222, "step": 14600 }, { "epoch": 0.08533488135839198, "grad_norm": 0.8492684364318848, "learning_rate": 4.982962326038778e-05, "loss": 0.0202, "step": 14700 }, { "epoch": 0.08591539075538786, "grad_norm": 1.6152923107147217, "learning_rate": 4.9823983987970396e-05, "loss": 0.0195, "step": 14800 }, { "epoch": 0.08649590015238372, "grad_norm": 0.52412348985672, "learning_rate": 4.981825323171362e-05, "loss": 0.0206, "step": 14900 }, { "epoch": 0.08707640954937958, "grad_norm": 0.41687363386154175, "learning_rate": 4.9812431012737006e-05, "loss": 0.023, "step": 15000 }, { "epoch": 0.08765691894637545, "grad_norm": 1.0384039878845215, "learning_rate": 4.9806517352497184e-05, "loss": 0.0244, "step": 15100 }, { "epoch": 0.08823742834337131, "grad_norm": 5.219573974609375, "learning_rate": 4.980051227278777e-05, "loss": 0.0209, "step": 15200 }, { "epoch": 0.08881793774036717, "grad_norm": 2.8465518951416016, "learning_rate": 4.979441579573928e-05, "loss": 0.0253, "step": 15300 }, { "epoch": 0.08939844713736303, "grad_norm": 7.259213924407959, "learning_rate": 4.978822794381908e-05, "loss": 0.0246, "step": 15400 }, { "epoch": 0.0899789565343589, "grad_norm": 1.2073447704315186, "learning_rate": 4.978194873983124e-05, "loss": 0.0168, "step": 15500 }, { "epoch": 0.09055946593135476, "grad_norm": 3.7044851779937744, "learning_rate": 4.977557820691653e-05, "loss": 0.0188, "step": 15600 }, { "epoch": 0.09113997532835062, "grad_norm": 0.4414062798023224, "learning_rate": 4.976911636855227e-05, "loss": 0.0224, "step": 15700 }, { "epoch": 0.0917204847253465, "grad_norm": 2.013897657394409, "learning_rate": 4.976256324855227e-05, "loss": 0.0198, "step": 15800 }, { "epoch": 0.09230099412234236, "grad_norm": 0.45843204855918884, "learning_rate": 4.975591887106677e-05, "loss": 0.0176, "step": 15900 }, { "epoch": 0.09288150351933822, "grad_norm": 1.0656216144561768, "learning_rate": 4.9749183260582274e-05, "loss": 0.0249, "step": 16000 }, { "epoch": 0.09346201291633409, "grad_norm": 0.3733506500720978, "learning_rate": 4.9742356441921544e-05, "loss": 0.0203, "step": 16100 }, { "epoch": 0.09404252231332995, "grad_norm": 0.9329887628555298, "learning_rate": 4.973543844024345e-05, "loss": 0.0218, "step": 16200 }, { "epoch": 0.09462303171032581, "grad_norm": 4.0852251052856445, "learning_rate": 4.972842928104291e-05, "loss": 0.027, "step": 16300 }, { "epoch": 0.09520354110732167, "grad_norm": 0.3162221610546112, "learning_rate": 4.9721328990150776e-05, "loss": 0.0225, "step": 16400 }, { "epoch": 0.09578405050431754, "grad_norm": 0.2578160762786865, "learning_rate": 4.971413759373376e-05, "loss": 0.0176, "step": 16500 }, { "epoch": 0.0963645599013134, "grad_norm": 0.3880905210971832, "learning_rate": 4.970685511829432e-05, "loss": 0.0183, "step": 16600 }, { "epoch": 0.09694506929830926, "grad_norm": 1.3224152326583862, "learning_rate": 4.969948159067056e-05, "loss": 0.0202, "step": 16700 }, { "epoch": 0.09752557869530513, "grad_norm": 1.7293118238449097, "learning_rate": 4.969201703803614e-05, "loss": 0.0234, "step": 16800 }, { "epoch": 0.098106088092301, "grad_norm": 1.8660351037979126, "learning_rate": 4.9684461487900195e-05, "loss": 0.0207, "step": 16900 }, { "epoch": 0.09868659748929685, "grad_norm": 2.0726287364959717, "learning_rate": 4.967681496810719e-05, "loss": 0.0218, "step": 17000 }, { "epoch": 0.09926710688629273, "grad_norm": 6.986308574676514, "learning_rate": 4.966907750683684e-05, "loss": 0.0194, "step": 17100 }, { "epoch": 0.09984761628328859, "grad_norm": 0.9702723622322083, "learning_rate": 4.966124913260402e-05, "loss": 0.022, "step": 17200 }, { "epoch": 0.10042812568028445, "grad_norm": 0.1596653163433075, "learning_rate": 4.9653329874258647e-05, "loss": 0.0195, "step": 17300 }, { "epoch": 0.1010086350772803, "grad_norm": 4.516726970672607, "learning_rate": 4.964531976098556e-05, "loss": 0.0216, "step": 17400 }, { "epoch": 0.10158914447427618, "grad_norm": 3.8601644039154053, "learning_rate": 4.9637218822304446e-05, "loss": 0.0211, "step": 17500 }, { "epoch": 0.10216965387127204, "grad_norm": 0.3691064715385437, "learning_rate": 4.962902708806968e-05, "loss": 0.0237, "step": 17600 }, { "epoch": 0.1027501632682679, "grad_norm": 1.635680913925171, "learning_rate": 4.9620744588470256e-05, "loss": 0.0229, "step": 17700 }, { "epoch": 0.10333067266526377, "grad_norm": 0.3783847391605377, "learning_rate": 4.9612371354029706e-05, "loss": 0.0167, "step": 17800 }, { "epoch": 0.10391118206225963, "grad_norm": 1.172295093536377, "learning_rate": 4.96039074156059e-05, "loss": 0.0217, "step": 17900 }, { "epoch": 0.10449169145925549, "grad_norm": 8.094454765319824, "learning_rate": 4.959535280439098e-05, "loss": 0.019, "step": 18000 }, { "epoch": 0.10507220085625137, "grad_norm": 0.4028318524360657, "learning_rate": 4.958670755191127e-05, "loss": 0.0234, "step": 18100 }, { "epoch": 0.10565271025324723, "grad_norm": 0.5673860907554626, "learning_rate": 4.9577971690027136e-05, "loss": 0.0214, "step": 18200 }, { "epoch": 0.10623321965024309, "grad_norm": 0.0952591523528099, "learning_rate": 4.956914525093283e-05, "loss": 0.0195, "step": 18300 }, { "epoch": 0.10681372904723896, "grad_norm": 0.11975416541099548, "learning_rate": 4.9560228267156445e-05, "loss": 0.0214, "step": 18400 }, { "epoch": 0.10739423844423482, "grad_norm": 0.31427842378616333, "learning_rate": 4.955122077155974e-05, "loss": 0.0204, "step": 18500 }, { "epoch": 0.10797474784123068, "grad_norm": 0.2761117219924927, "learning_rate": 4.9542122797338054e-05, "loss": 0.018, "step": 18600 }, { "epoch": 0.10855525723822654, "grad_norm": 0.0, "learning_rate": 4.953293437802014e-05, "loss": 0.0203, "step": 18700 }, { "epoch": 0.10913576663522241, "grad_norm": 1.229196548461914, "learning_rate": 4.9523655547468095e-05, "loss": 0.0209, "step": 18800 }, { "epoch": 0.10971627603221827, "grad_norm": 1.3233908414840698, "learning_rate": 4.951428633987719e-05, "loss": 0.0192, "step": 18900 }, { "epoch": 0.11029678542921413, "grad_norm": 4.7784857749938965, "learning_rate": 4.950482678977577e-05, "loss": 0.021, "step": 19000 }, { "epoch": 0.11087729482621, "grad_norm": 0.059221718460321426, "learning_rate": 4.949527693202513e-05, "loss": 0.0232, "step": 19100 }, { "epoch": 0.11145780422320586, "grad_norm": 1.453174352645874, "learning_rate": 4.9485636801819356e-05, "loss": 0.0222, "step": 19200 }, { "epoch": 0.11203831362020172, "grad_norm": 0.19113394618034363, "learning_rate": 4.947590643468523e-05, "loss": 0.0213, "step": 19300 }, { "epoch": 0.1126188230171976, "grad_norm": 0.06702837347984314, "learning_rate": 4.946608586648206e-05, "loss": 0.0262, "step": 19400 }, { "epoch": 0.11319933241419346, "grad_norm": 0.7900282144546509, "learning_rate": 4.945617513340162e-05, "loss": 0.0179, "step": 19500 }, { "epoch": 0.11377984181118932, "grad_norm": 0.9422081112861633, "learning_rate": 4.944617427196792e-05, "loss": 0.0179, "step": 19600 }, { "epoch": 0.11436035120818518, "grad_norm": 6.721597194671631, "learning_rate": 4.9436083319037134e-05, "loss": 0.0228, "step": 19700 }, { "epoch": 0.11494086060518105, "grad_norm": 2.548957109451294, "learning_rate": 4.942590231179747e-05, "loss": 0.0208, "step": 19800 }, { "epoch": 0.11552137000217691, "grad_norm": 2.1897802352905273, "learning_rate": 4.9415631287768995e-05, "loss": 0.0293, "step": 19900 }, { "epoch": 0.11610187939917277, "grad_norm": 1.0778768062591553, "learning_rate": 4.9405270284803516e-05, "loss": 0.0205, "step": 20000 }, { "epoch": 0.11668238879616864, "grad_norm": 0.8228683471679688, "learning_rate": 4.939481934108444e-05, "loss": 0.0182, "step": 20100 }, { "epoch": 0.1172628981931645, "grad_norm": 0.5803897976875305, "learning_rate": 4.938427849512664e-05, "loss": 0.0253, "step": 20200 }, { "epoch": 0.11784340759016036, "grad_norm": 1.7605079412460327, "learning_rate": 4.93736477857763e-05, "loss": 0.022, "step": 20300 }, { "epoch": 0.11842391698715624, "grad_norm": 6.337480068206787, "learning_rate": 4.9362927252210764e-05, "loss": 0.0167, "step": 20400 }, { "epoch": 0.1190044263841521, "grad_norm": 1.6917483806610107, "learning_rate": 4.935211693393844e-05, "loss": 0.0197, "step": 20500 }, { "epoch": 0.11958493578114796, "grad_norm": 3.9351799488067627, "learning_rate": 4.934121687079859e-05, "loss": 0.024, "step": 20600 }, { "epoch": 0.12016544517814381, "grad_norm": 1.328538417816162, "learning_rate": 4.933022710296121e-05, "loss": 0.0215, "step": 20700 }, { "epoch": 0.12074595457513969, "grad_norm": 0.6960548758506775, "learning_rate": 4.931914767092692e-05, "loss": 0.0214, "step": 20800 }, { "epoch": 0.12132646397213555, "grad_norm": 3.212674140930176, "learning_rate": 4.930797861552674e-05, "loss": 0.0201, "step": 20900 }, { "epoch": 0.12190697336913141, "grad_norm": 0.33600953221321106, "learning_rate": 4.929671997792199e-05, "loss": 0.0188, "step": 21000 }, { "epoch": 0.12248748276612728, "grad_norm": 0.37020212411880493, "learning_rate": 4.928537179960415e-05, "loss": 0.0172, "step": 21100 }, { "epoch": 0.12306799216312314, "grad_norm": 1.471659541130066, "learning_rate": 4.927393412239465e-05, "loss": 0.022, "step": 21200 }, { "epoch": 0.123648501560119, "grad_norm": 0.34243813157081604, "learning_rate": 4.9262406988444773e-05, "loss": 0.0186, "step": 21300 }, { "epoch": 0.12422901095711487, "grad_norm": 1.0350617170333862, "learning_rate": 4.9250790440235487e-05, "loss": 0.0192, "step": 21400 }, { "epoch": 0.12480952035411073, "grad_norm": 0.2393186092376709, "learning_rate": 4.923908452057723e-05, "loss": 0.0202, "step": 21500 }, { "epoch": 0.1253900297511066, "grad_norm": 0.8566457629203796, "learning_rate": 4.9227289272609855e-05, "loss": 0.0225, "step": 21600 }, { "epoch": 0.12597053914810247, "grad_norm": 3.1956393718719482, "learning_rate": 4.92154047398024e-05, "loss": 0.0238, "step": 21700 }, { "epoch": 0.12655104854509833, "grad_norm": 2.6811676025390625, "learning_rate": 4.920343096595291e-05, "loss": 0.0225, "step": 21800 }, { "epoch": 0.1271315579420942, "grad_norm": 0.4638591706752777, "learning_rate": 4.9191367995188376e-05, "loss": 0.018, "step": 21900 }, { "epoch": 0.12771206733909005, "grad_norm": 1.0029135942459106, "learning_rate": 4.917921587196444e-05, "loss": 0.0282, "step": 22000 }, { "epoch": 0.1282925767360859, "grad_norm": 1.0247883796691895, "learning_rate": 4.916697464106535e-05, "loss": 0.0196, "step": 22100 }, { "epoch": 0.12887308613308177, "grad_norm": 1.2868847846984863, "learning_rate": 4.915464434760369e-05, "loss": 0.0239, "step": 22200 }, { "epoch": 0.12945359553007765, "grad_norm": 1.3176194429397583, "learning_rate": 4.914222503702033e-05, "loss": 0.0174, "step": 22300 }, { "epoch": 0.1300341049270735, "grad_norm": 0.4307650029659271, "learning_rate": 4.912971675508414e-05, "loss": 0.0205, "step": 22400 }, { "epoch": 0.13061461432406937, "grad_norm": 0.6782764196395874, "learning_rate": 4.911711954789191e-05, "loss": 0.0155, "step": 22500 }, { "epoch": 0.13119512372106523, "grad_norm": 4.9244866371154785, "learning_rate": 4.910443346186812e-05, "loss": 0.0216, "step": 22600 }, { "epoch": 0.1317756331180611, "grad_norm": 0.7279213666915894, "learning_rate": 4.9091658543764816e-05, "loss": 0.0192, "step": 22700 }, { "epoch": 0.13235614251505695, "grad_norm": 0.7150142192840576, "learning_rate": 4.9078794840661415e-05, "loss": 0.023, "step": 22800 }, { "epoch": 0.13293665191205284, "grad_norm": 0.3685489594936371, "learning_rate": 4.906584239996451e-05, "loss": 0.022, "step": 22900 }, { "epoch": 0.1335171613090487, "grad_norm": 2.9621119499206543, "learning_rate": 4.905280126940775e-05, "loss": 0.0172, "step": 23000 }, { "epoch": 0.13409767070604456, "grad_norm": 3.5731096267700195, "learning_rate": 4.9039671497051623e-05, "loss": 0.0197, "step": 23100 }, { "epoch": 0.13467818010304042, "grad_norm": 0.0, "learning_rate": 4.902645313128327e-05, "loss": 0.0168, "step": 23200 }, { "epoch": 0.13525868950003628, "grad_norm": 0.6063820123672485, "learning_rate": 4.901314622081635e-05, "loss": 0.0178, "step": 23300 }, { "epoch": 0.13583919889703214, "grad_norm": 1.5601248741149902, "learning_rate": 4.8999750814690825e-05, "loss": 0.0153, "step": 23400 }, { "epoch": 0.136419708294028, "grad_norm": 1.3011990785598755, "learning_rate": 4.89862669622728e-05, "loss": 0.0182, "step": 23500 }, { "epoch": 0.13700021769102388, "grad_norm": 2.8852713108062744, "learning_rate": 4.897269471325431e-05, "loss": 0.0187, "step": 23600 }, { "epoch": 0.13758072708801974, "grad_norm": 0.9321713447570801, "learning_rate": 4.895903411765317e-05, "loss": 0.0163, "step": 23700 }, { "epoch": 0.1381612364850156, "grad_norm": 1.3897167444229126, "learning_rate": 4.894528522581279e-05, "loss": 0.0255, "step": 23800 }, { "epoch": 0.13874174588201146, "grad_norm": 0.32952451705932617, "learning_rate": 4.893144808840196e-05, "loss": 0.0206, "step": 23900 }, { "epoch": 0.13932225527900732, "grad_norm": 0.41645577549934387, "learning_rate": 4.891752275641468e-05, "loss": 0.0187, "step": 24000 }, { "epoch": 0.13990276467600318, "grad_norm": 1.9071933031082153, "learning_rate": 4.890350928117e-05, "loss": 0.0189, "step": 24100 }, { "epoch": 0.14048327407299907, "grad_norm": 11.354212760925293, "learning_rate": 4.888940771431178e-05, "loss": 0.0193, "step": 24200 }, { "epoch": 0.14106378346999493, "grad_norm": 0.22728995978832245, "learning_rate": 4.887521810780853e-05, "loss": 0.0197, "step": 24300 }, { "epoch": 0.1416442928669908, "grad_norm": 0.6692954897880554, "learning_rate": 4.88609405139532e-05, "loss": 0.0211, "step": 24400 }, { "epoch": 0.14222480226398665, "grad_norm": 0.3103114664554596, "learning_rate": 4.884657498536304e-05, "loss": 0.0171, "step": 24500 }, { "epoch": 0.1428053116609825, "grad_norm": 0.7131453156471252, "learning_rate": 4.8832121574979314e-05, "loss": 0.0171, "step": 24600 }, { "epoch": 0.14338582105797837, "grad_norm": 0.8742627501487732, "learning_rate": 4.88175803360672e-05, "loss": 0.0171, "step": 24700 }, { "epoch": 0.14396633045497423, "grad_norm": 1.464080810546875, "learning_rate": 4.880295132221552e-05, "loss": 0.0217, "step": 24800 }, { "epoch": 0.14454683985197012, "grad_norm": 0.1914157271385193, "learning_rate": 4.87882345873366e-05, "loss": 0.0226, "step": 24900 }, { "epoch": 0.14512734924896598, "grad_norm": 0.7907546162605286, "learning_rate": 4.877343018566601e-05, "loss": 0.014, "step": 25000 }, { "epoch": 0.14570785864596184, "grad_norm": 0.7815316319465637, "learning_rate": 4.875853817176243e-05, "loss": 0.0208, "step": 25100 }, { "epoch": 0.1462883680429577, "grad_norm": 0.8793790340423584, "learning_rate": 4.87435586005074e-05, "loss": 0.0188, "step": 25200 }, { "epoch": 0.14686887743995355, "grad_norm": 0.35067594051361084, "learning_rate": 4.872849152710515e-05, "loss": 0.0247, "step": 25300 }, { "epoch": 0.14744938683694941, "grad_norm": 0.6180335283279419, "learning_rate": 4.871333700708236e-05, "loss": 0.0202, "step": 25400 }, { "epoch": 0.14802989623394527, "grad_norm": 1.0985257625579834, "learning_rate": 4.8698095096288e-05, "loss": 0.0197, "step": 25500 }, { "epoch": 0.14861040563094116, "grad_norm": 0.47718220949172974, "learning_rate": 4.8682765850893085e-05, "loss": 0.019, "step": 25600 }, { "epoch": 0.14919091502793702, "grad_norm": 3.357231616973877, "learning_rate": 4.866734932739049e-05, "loss": 0.021, "step": 25700 }, { "epoch": 0.14977142442493288, "grad_norm": 0.9318442940711975, "learning_rate": 4.865184558259474e-05, "loss": 0.0185, "step": 25800 }, { "epoch": 0.15035193382192874, "grad_norm": 0.1340111941099167, "learning_rate": 4.863625467364179e-05, "loss": 0.0164, "step": 25900 }, { "epoch": 0.1509324432189246, "grad_norm": 1.3237409591674805, "learning_rate": 4.862057665798883e-05, "loss": 0.0195, "step": 26000 }, { "epoch": 0.15151295261592046, "grad_norm": 0.23375193774700165, "learning_rate": 4.860481159341405e-05, "loss": 0.0169, "step": 26100 }, { "epoch": 0.15209346201291635, "grad_norm": 3.856459617614746, "learning_rate": 4.858895953801644e-05, "loss": 0.0181, "step": 26200 }, { "epoch": 0.1526739714099122, "grad_norm": 0.30206841230392456, "learning_rate": 4.8573020550215606e-05, "loss": 0.0203, "step": 26300 }, { "epoch": 0.15325448080690807, "grad_norm": 0.32139310240745544, "learning_rate": 4.855699468875151e-05, "loss": 0.0153, "step": 26400 }, { "epoch": 0.15383499020390393, "grad_norm": 0.3401035964488983, "learning_rate": 4.854088201268425e-05, "loss": 0.02, "step": 26500 }, { "epoch": 0.15441549960089979, "grad_norm": 1.1033488512039185, "learning_rate": 4.852468258139388e-05, "loss": 0.019, "step": 26600 }, { "epoch": 0.15499600899789565, "grad_norm": 0.26150408387184143, "learning_rate": 4.8508396454580174e-05, "loss": 0.0217, "step": 26700 }, { "epoch": 0.1555765183948915, "grad_norm": 1.0009557008743286, "learning_rate": 4.849202369226241e-05, "loss": 0.0178, "step": 26800 }, { "epoch": 0.1561570277918874, "grad_norm": 0.949277400970459, "learning_rate": 4.8475564354779135e-05, "loss": 0.021, "step": 26900 }, { "epoch": 0.15673753718888325, "grad_norm": 0.5669822692871094, "learning_rate": 4.845901850278794e-05, "loss": 0.0203, "step": 27000 }, { "epoch": 0.1573180465858791, "grad_norm": 0.8850467205047607, "learning_rate": 4.844238619726528e-05, "loss": 0.0161, "step": 27100 }, { "epoch": 0.15789855598287497, "grad_norm": 1.5471025705337524, "learning_rate": 4.842566749950618e-05, "loss": 0.0169, "step": 27200 }, { "epoch": 0.15847906537987083, "grad_norm": 0.35896119475364685, "learning_rate": 4.8408862471124075e-05, "loss": 0.0232, "step": 27300 }, { "epoch": 0.1590595747768667, "grad_norm": 1.772006630897522, "learning_rate": 4.839197117405053e-05, "loss": 0.0195, "step": 27400 }, { "epoch": 0.15964008417386255, "grad_norm": 0.5070587396621704, "learning_rate": 4.837499367053508e-05, "loss": 0.0159, "step": 27500 }, { "epoch": 0.16022059357085844, "grad_norm": 0.4093703329563141, "learning_rate": 4.835793002314489e-05, "loss": 0.018, "step": 27600 }, { "epoch": 0.1608011029678543, "grad_norm": 0.20872582495212555, "learning_rate": 4.8340780294764655e-05, "loss": 0.0206, "step": 27700 }, { "epoch": 0.16138161236485016, "grad_norm": 1.1325550079345703, "learning_rate": 4.8323544548596256e-05, "loss": 0.0179, "step": 27800 }, { "epoch": 0.16196212176184602, "grad_norm": 0.0, "learning_rate": 4.8306222848158615e-05, "loss": 0.0198, "step": 27900 }, { "epoch": 0.16254263115884188, "grad_norm": 0.10438452661037445, "learning_rate": 4.828881525728739e-05, "loss": 0.0246, "step": 28000 }, { "epoch": 0.16312314055583774, "grad_norm": 0.5154972076416016, "learning_rate": 4.827132184013479e-05, "loss": 0.0203, "step": 28100 }, { "epoch": 0.16370364995283362, "grad_norm": 0.43772637844085693, "learning_rate": 4.825374266116931e-05, "loss": 0.0152, "step": 28200 }, { "epoch": 0.16428415934982948, "grad_norm": 0.9545837640762329, "learning_rate": 4.82360777851755e-05, "loss": 0.0193, "step": 28300 }, { "epoch": 0.16486466874682534, "grad_norm": 0.4668686091899872, "learning_rate": 4.821832727725375e-05, "loss": 0.0163, "step": 28400 }, { "epoch": 0.1654451781438212, "grad_norm": 1.1754403114318848, "learning_rate": 4.8200491202819995e-05, "loss": 0.018, "step": 28500 }, { "epoch": 0.16602568754081706, "grad_norm": 0.6412404179573059, "learning_rate": 4.8182569627605556e-05, "loss": 0.015, "step": 28600 }, { "epoch": 0.16660619693781292, "grad_norm": 3.5581717491149902, "learning_rate": 4.81645626176568e-05, "loss": 0.0128, "step": 28700 }, { "epoch": 0.16718670633480878, "grad_norm": 0.8452811241149902, "learning_rate": 4.814647023933497e-05, "loss": 0.021, "step": 28800 }, { "epoch": 0.16776721573180467, "grad_norm": 0.7324305772781372, "learning_rate": 4.812829255931592e-05, "loss": 0.0228, "step": 28900 }, { "epoch": 0.16834772512880053, "grad_norm": 2.6767971515655518, "learning_rate": 4.811002964458987e-05, "loss": 0.0194, "step": 29000 }, { "epoch": 0.1689282345257964, "grad_norm": 1.0238885879516602, "learning_rate": 4.809168156246113e-05, "loss": 0.0145, "step": 29100 }, { "epoch": 0.16950874392279225, "grad_norm": 0.48919227719306946, "learning_rate": 4.807324838054792e-05, "loss": 0.0199, "step": 29200 }, { "epoch": 0.1700892533197881, "grad_norm": 1.0482101440429688, "learning_rate": 4.8054730166782035e-05, "loss": 0.0204, "step": 29300 }, { "epoch": 0.17066976271678397, "grad_norm": 0.22599902749061584, "learning_rate": 4.8036126989408666e-05, "loss": 0.0197, "step": 29400 }, { "epoch": 0.17125027211377986, "grad_norm": 0.2554647922515869, "learning_rate": 4.80174389169861e-05, "loss": 0.0178, "step": 29500 }, { "epoch": 0.17183078151077572, "grad_norm": 1.0132629871368408, "learning_rate": 4.7998666018385506e-05, "loss": 0.0172, "step": 29600 }, { "epoch": 0.17241129090777157, "grad_norm": 3.19964599609375, "learning_rate": 4.7979808362790655e-05, "loss": 0.0183, "step": 29700 }, { "epoch": 0.17299180030476743, "grad_norm": 0.5451412200927734, "learning_rate": 4.796086601969768e-05, "loss": 0.0189, "step": 29800 }, { "epoch": 0.1735723097017633, "grad_norm": 0.18623612821102142, "learning_rate": 4.7941839058914796e-05, "loss": 0.0165, "step": 29900 }, { "epoch": 0.17415281909875915, "grad_norm": 1.2114591598510742, "learning_rate": 4.792272755056207e-05, "loss": 0.0185, "step": 30000 }, { "epoch": 0.174733328495755, "grad_norm": 0.8469420075416565, "learning_rate": 4.790353156507117e-05, "loss": 0.0191, "step": 30100 }, { "epoch": 0.1753138378927509, "grad_norm": 1.4895416498184204, "learning_rate": 4.7884251173185045e-05, "loss": 0.0202, "step": 30200 }, { "epoch": 0.17589434728974676, "grad_norm": 0.26737430691719055, "learning_rate": 4.786488644595775e-05, "loss": 0.0154, "step": 30300 }, { "epoch": 0.17647485668674262, "grad_norm": 1.4356130361557007, "learning_rate": 4.7845437454754116e-05, "loss": 0.0164, "step": 30400 }, { "epoch": 0.17705536608373848, "grad_norm": 0.28000500798225403, "learning_rate": 4.782590427124952e-05, "loss": 0.0158, "step": 30500 }, { "epoch": 0.17763587548073434, "grad_norm": 2.8763926029205322, "learning_rate": 4.7806286967429606e-05, "loss": 0.0182, "step": 30600 }, { "epoch": 0.1782163848777302, "grad_norm": 1.897760272026062, "learning_rate": 4.778658561559004e-05, "loss": 0.0255, "step": 30700 }, { "epoch": 0.17879689427472606, "grad_norm": 1.896153450012207, "learning_rate": 4.776680028833623e-05, "loss": 0.0187, "step": 30800 }, { "epoch": 0.17937740367172195, "grad_norm": 0.31827715039253235, "learning_rate": 4.7746931058583035e-05, "loss": 0.0172, "step": 30900 }, { "epoch": 0.1799579130687178, "grad_norm": 2.0092689990997314, "learning_rate": 4.772697799955455e-05, "loss": 0.0156, "step": 31000 }, { "epoch": 0.18053842246571367, "grad_norm": 3.24516224861145, "learning_rate": 4.7706941184783776e-05, "loss": 0.0157, "step": 31100 }, { "epoch": 0.18111893186270953, "grad_norm": 4.248687744140625, "learning_rate": 4.768682068811241e-05, "loss": 0.0223, "step": 31200 }, { "epoch": 0.18169944125970539, "grad_norm": 1.3310073614120483, "learning_rate": 4.7666616583690525e-05, "loss": 0.0181, "step": 31300 }, { "epoch": 0.18227995065670125, "grad_norm": 0.7074719071388245, "learning_rate": 4.764632894597632e-05, "loss": 0.0165, "step": 31400 }, { "epoch": 0.18286046005369713, "grad_norm": 1.0923957824707031, "learning_rate": 4.7625957849735826e-05, "loss": 0.0209, "step": 31500 }, { "epoch": 0.183440969450693, "grad_norm": 0.4126807749271393, "learning_rate": 4.760550337004266e-05, "loss": 0.021, "step": 31600 }, { "epoch": 0.18402147884768885, "grad_norm": 0.7564171552658081, "learning_rate": 4.758496558227771e-05, "loss": 0.02, "step": 31700 }, { "epoch": 0.1846019882446847, "grad_norm": 5.621452808380127, "learning_rate": 4.756434456212892e-05, "loss": 0.0218, "step": 31800 }, { "epoch": 0.18518249764168057, "grad_norm": 0.5979048013687134, "learning_rate": 4.7543640385590925e-05, "loss": 0.018, "step": 31900 }, { "epoch": 0.18576300703867643, "grad_norm": 0.4124324917793274, "learning_rate": 4.752285312896485e-05, "loss": 0.0192, "step": 32000 }, { "epoch": 0.1863435164356723, "grad_norm": 19.721843719482422, "learning_rate": 4.750198286885797e-05, "loss": 0.0191, "step": 32100 }, { "epoch": 0.18692402583266818, "grad_norm": 1.7610396146774292, "learning_rate": 4.748102968218347e-05, "loss": 0.0205, "step": 32200 }, { "epoch": 0.18750453522966404, "grad_norm": 0.6155940890312195, "learning_rate": 4.745999364616014e-05, "loss": 0.0233, "step": 32300 }, { "epoch": 0.1880850446266599, "grad_norm": 1.3487342596054077, "learning_rate": 4.743887483831208e-05, "loss": 0.0182, "step": 32400 }, { "epoch": 0.18866555402365576, "grad_norm": 1.6848351955413818, "learning_rate": 4.741767333646846e-05, "loss": 0.0196, "step": 32500 }, { "epoch": 0.18924606342065162, "grad_norm": 0.3916856348514557, "learning_rate": 4.739638921876317e-05, "loss": 0.0157, "step": 32600 }, { "epoch": 0.18982657281764748, "grad_norm": 0.43171945214271545, "learning_rate": 4.737502256363459e-05, "loss": 0.015, "step": 32700 }, { "epoch": 0.19040708221464334, "grad_norm": 2.9373903274536133, "learning_rate": 4.735357344982525e-05, "loss": 0.0182, "step": 32800 }, { "epoch": 0.19098759161163922, "grad_norm": 1.7581216096878052, "learning_rate": 4.733204195638159e-05, "loss": 0.021, "step": 32900 }, { "epoch": 0.19156810100863508, "grad_norm": 0.5465153455734253, "learning_rate": 4.731042816265364e-05, "loss": 0.0165, "step": 33000 }, { "epoch": 0.19214861040563094, "grad_norm": 1.2427330017089844, "learning_rate": 4.72887321482947e-05, "loss": 0.0172, "step": 33100 }, { "epoch": 0.1927291198026268, "grad_norm": 1.4515526294708252, "learning_rate": 4.726695399326113e-05, "loss": 0.0166, "step": 33200 }, { "epoch": 0.19330962919962266, "grad_norm": 0.47813165187835693, "learning_rate": 4.7245093777811945e-05, "loss": 0.0165, "step": 33300 }, { "epoch": 0.19389013859661852, "grad_norm": 0.4670131206512451, "learning_rate": 4.722315158250863e-05, "loss": 0.0171, "step": 33400 }, { "epoch": 0.1944706479936144, "grad_norm": 1.1390752792358398, "learning_rate": 4.720112748821475e-05, "loss": 0.0219, "step": 33500 }, { "epoch": 0.19505115739061027, "grad_norm": 0.6364420652389526, "learning_rate": 4.7179021576095724e-05, "loss": 0.0186, "step": 33600 }, { "epoch": 0.19563166678760613, "grad_norm": 0.0812646821141243, "learning_rate": 4.7156833927618475e-05, "loss": 0.0184, "step": 33700 }, { "epoch": 0.196212176184602, "grad_norm": 0.5782191753387451, "learning_rate": 4.713456462455116e-05, "loss": 0.0212, "step": 33800 }, { "epoch": 0.19679268558159785, "grad_norm": 0.3221706449985504, "learning_rate": 4.711221374896283e-05, "loss": 0.0183, "step": 33900 }, { "epoch": 0.1973731949785937, "grad_norm": 0.4310432970523834, "learning_rate": 4.7089781383223203e-05, "loss": 0.0194, "step": 34000 }, { "epoch": 0.19795370437558957, "grad_norm": 0.4918677508831024, "learning_rate": 4.706726761000227e-05, "loss": 0.0192, "step": 34100 }, { "epoch": 0.19853421377258545, "grad_norm": 0.8517147302627563, "learning_rate": 4.704467251227006e-05, "loss": 0.0179, "step": 34200 }, { "epoch": 0.19911472316958131, "grad_norm": 0.8899397850036621, "learning_rate": 4.702199617329629e-05, "loss": 0.0216, "step": 34300 }, { "epoch": 0.19969523256657717, "grad_norm": 1.310464859008789, "learning_rate": 4.6999238676650074e-05, "loss": 0.0196, "step": 34400 }, { "epoch": 0.20027574196357303, "grad_norm": 2.9320359230041504, "learning_rate": 4.697640010619965e-05, "loss": 0.0167, "step": 34500 }, { "epoch": 0.2008562513605689, "grad_norm": 0.2071259319782257, "learning_rate": 4.6953480546111986e-05, "loss": 0.019, "step": 34600 }, { "epoch": 0.20143676075756475, "grad_norm": 5.163755893707275, "learning_rate": 4.6930480080852553e-05, "loss": 0.0147, "step": 34700 }, { "epoch": 0.2020172701545606, "grad_norm": 0.6986225843429565, "learning_rate": 4.6907398795184995e-05, "loss": 0.0248, "step": 34800 }, { "epoch": 0.2025977795515565, "grad_norm": 0.5469736456871033, "learning_rate": 4.6884236774170766e-05, "loss": 0.0147, "step": 34900 }, { "epoch": 0.20317828894855236, "grad_norm": 1.1931012868881226, "learning_rate": 4.686099410316888e-05, "loss": 0.0183, "step": 35000 }, { "epoch": 0.20375879834554822, "grad_norm": 3.972321033477783, "learning_rate": 4.6837670867835546e-05, "loss": 0.0199, "step": 35100 }, { "epoch": 0.20433930774254408, "grad_norm": 0.9743729829788208, "learning_rate": 4.681426715412392e-05, "loss": 0.0161, "step": 35200 }, { "epoch": 0.20491981713953994, "grad_norm": 1.3011478185653687, "learning_rate": 4.67907830482837e-05, "loss": 0.0177, "step": 35300 }, { "epoch": 0.2055003265365358, "grad_norm": 0.2887895405292511, "learning_rate": 4.676721863686088e-05, "loss": 0.0156, "step": 35400 }, { "epoch": 0.2060808359335317, "grad_norm": 0.5841540098190308, "learning_rate": 4.67435740066974e-05, "loss": 0.0201, "step": 35500 }, { "epoch": 0.20666134533052755, "grad_norm": 0.38853272795677185, "learning_rate": 4.671984924493081e-05, "loss": 0.0185, "step": 35600 }, { "epoch": 0.2072418547275234, "grad_norm": 2.471120834350586, "learning_rate": 4.6696044438994004e-05, "loss": 0.0201, "step": 35700 }, { "epoch": 0.20782236412451927, "grad_norm": 1.8866631984710693, "learning_rate": 4.667215967661483e-05, "loss": 0.0199, "step": 35800 }, { "epoch": 0.20840287352151513, "grad_norm": 0.6856237053871155, "learning_rate": 4.664819504581582e-05, "loss": 0.0161, "step": 35900 }, { "epoch": 0.20898338291851098, "grad_norm": 0.3085954487323761, "learning_rate": 4.662415063491384e-05, "loss": 0.0173, "step": 36000 }, { "epoch": 0.20956389231550684, "grad_norm": 0.6165205240249634, "learning_rate": 4.660002653251977e-05, "loss": 0.0184, "step": 36100 }, { "epoch": 0.21014440171250273, "grad_norm": 3.60754132270813, "learning_rate": 4.657582282753816e-05, "loss": 0.0212, "step": 36200 }, { "epoch": 0.2107249111094986, "grad_norm": 0.378738135099411, "learning_rate": 4.655153960916695e-05, "loss": 0.0247, "step": 36300 }, { "epoch": 0.21130542050649445, "grad_norm": 1.4534658193588257, "learning_rate": 4.652717696689709e-05, "loss": 0.02, "step": 36400 }, { "epoch": 0.2118859299034903, "grad_norm": 0.41120943427085876, "learning_rate": 4.6502734990512255e-05, "loss": 0.0136, "step": 36500 }, { "epoch": 0.21246643930048617, "grad_norm": 11.224574089050293, "learning_rate": 4.647821377008844e-05, "loss": 0.0208, "step": 36600 }, { "epoch": 0.21304694869748203, "grad_norm": 2.768289089202881, "learning_rate": 4.645361339599373e-05, "loss": 0.0174, "step": 36700 }, { "epoch": 0.21362745809447792, "grad_norm": 1.1909620761871338, "learning_rate": 4.6428933958887885e-05, "loss": 0.0194, "step": 36800 }, { "epoch": 0.21420796749147378, "grad_norm": 2.583638906478882, "learning_rate": 4.6404175549722055e-05, "loss": 0.0151, "step": 36900 }, { "epoch": 0.21478847688846964, "grad_norm": 1.1585899591445923, "learning_rate": 4.6379338259738414e-05, "loss": 0.019, "step": 37000 }, { "epoch": 0.2153689862854655, "grad_norm": 0.24345244467258453, "learning_rate": 4.6354422180469834e-05, "loss": 0.0158, "step": 37100 }, { "epoch": 0.21594949568246136, "grad_norm": 0.9591251611709595, "learning_rate": 4.632942740373955e-05, "loss": 0.0162, "step": 37200 }, { "epoch": 0.21653000507945722, "grad_norm": 0.12427254766225815, "learning_rate": 4.630435402166083e-05, "loss": 0.0291, "step": 37300 }, { "epoch": 0.21711051447645308, "grad_norm": 12.628028869628906, "learning_rate": 4.6279202126636624e-05, "loss": 0.0147, "step": 37400 }, { "epoch": 0.21769102387344896, "grad_norm": 0.6221101880073547, "learning_rate": 4.625397181135922e-05, "loss": 0.0188, "step": 37500 }, { "epoch": 0.21827153327044482, "grad_norm": 0.11033707112073898, "learning_rate": 4.6228663168809904e-05, "loss": 0.0141, "step": 37600 }, { "epoch": 0.21885204266744068, "grad_norm": 0.5746279954910278, "learning_rate": 4.620327629225863e-05, "loss": 0.0169, "step": 37700 }, { "epoch": 0.21943255206443654, "grad_norm": 1.458079218864441, "learning_rate": 4.6177811275263665e-05, "loss": 0.0195, "step": 37800 }, { "epoch": 0.2200130614614324, "grad_norm": 8.152167320251465, "learning_rate": 4.615226821167126e-05, "loss": 0.0155, "step": 37900 }, { "epoch": 0.22059357085842826, "grad_norm": 2.139084577560425, "learning_rate": 4.612664719561526e-05, "loss": 0.0179, "step": 38000 }, { "epoch": 0.22117408025542412, "grad_norm": 2.3556296825408936, "learning_rate": 4.610094832151681e-05, "loss": 0.0187, "step": 38100 }, { "epoch": 0.22175458965242, "grad_norm": 0.26898398995399475, "learning_rate": 4.6075171684084e-05, "loss": 0.0208, "step": 38200 }, { "epoch": 0.22233509904941587, "grad_norm": 0.16663870215415955, "learning_rate": 4.604931737831146e-05, "loss": 0.0189, "step": 38300 }, { "epoch": 0.22291560844641173, "grad_norm": 4.2696943283081055, "learning_rate": 4.60233854994801e-05, "loss": 0.0168, "step": 38400 }, { "epoch": 0.2234961178434076, "grad_norm": 1.2985143661499023, "learning_rate": 4.5997376143156654e-05, "loss": 0.0161, "step": 38500 }, { "epoch": 0.22407662724040345, "grad_norm": 0.7481074333190918, "learning_rate": 4.597128940519344e-05, "loss": 0.0132, "step": 38600 }, { "epoch": 0.2246571366373993, "grad_norm": 1.7323493957519531, "learning_rate": 4.5945125381727924e-05, "loss": 0.0147, "step": 38700 }, { "epoch": 0.2252376460343952, "grad_norm": 0.9689391851425171, "learning_rate": 4.591888416918238e-05, "loss": 0.0175, "step": 38800 }, { "epoch": 0.22581815543139105, "grad_norm": 0.9196175336837769, "learning_rate": 4.589256586426356e-05, "loss": 0.0167, "step": 38900 }, { "epoch": 0.22639866482838691, "grad_norm": 1.5890405178070068, "learning_rate": 4.586617056396234e-05, "loss": 0.0198, "step": 39000 }, { "epoch": 0.22697917422538277, "grad_norm": 4.1506829261779785, "learning_rate": 4.583969836555333e-05, "loss": 0.015, "step": 39100 }, { "epoch": 0.22755968362237863, "grad_norm": 1.0635732412338257, "learning_rate": 4.581314936659451e-05, "loss": 0.0186, "step": 39200 }, { "epoch": 0.2281401930193745, "grad_norm": 1.0515848398208618, "learning_rate": 4.578652366492695e-05, "loss": 0.0248, "step": 39300 }, { "epoch": 0.22872070241637035, "grad_norm": 0.3499925434589386, "learning_rate": 4.5759821358674346e-05, "loss": 0.0176, "step": 39400 }, { "epoch": 0.22930121181336624, "grad_norm": 0.0, "learning_rate": 4.573304254624271e-05, "loss": 0.0164, "step": 39500 }, { "epoch": 0.2298817212103621, "grad_norm": 0.6939957141876221, "learning_rate": 4.570618732632003e-05, "loss": 0.0191, "step": 39600 }, { "epoch": 0.23046223060735796, "grad_norm": 0.3061050772666931, "learning_rate": 4.5679255797875856e-05, "loss": 0.0188, "step": 39700 }, { "epoch": 0.23104274000435382, "grad_norm": 0.3814498484134674, "learning_rate": 4.565224806016095e-05, "loss": 0.0164, "step": 39800 }, { "epoch": 0.23162324940134968, "grad_norm": 0.0, "learning_rate": 4.562516421270695e-05, "loss": 0.017, "step": 39900 }, { "epoch": 0.23220375879834554, "grad_norm": 1.1685712337493896, "learning_rate": 4.559800435532596e-05, "loss": 0.018, "step": 40000 }, { "epoch": 0.2327842681953414, "grad_norm": 0.8218551874160767, "learning_rate": 4.5570768588110235e-05, "loss": 0.0162, "step": 40100 }, { "epoch": 0.23336477759233729, "grad_norm": 2.128337860107422, "learning_rate": 4.5543457011431744e-05, "loss": 0.0178, "step": 40200 }, { "epoch": 0.23394528698933315, "grad_norm": 36.10731887817383, "learning_rate": 4.5516069725941854e-05, "loss": 0.0185, "step": 40300 }, { "epoch": 0.234525796386329, "grad_norm": 1.233340859413147, "learning_rate": 4.548860683257096e-05, "loss": 0.0175, "step": 40400 }, { "epoch": 0.23510630578332486, "grad_norm": 0.9268346428871155, "learning_rate": 4.546106843252804e-05, "loss": 0.0245, "step": 40500 }, { "epoch": 0.23568681518032072, "grad_norm": 0.29465100169181824, "learning_rate": 4.54334546273004e-05, "loss": 0.0211, "step": 40600 }, { "epoch": 0.23626732457731658, "grad_norm": 0.3362191319465637, "learning_rate": 4.5405765518653204e-05, "loss": 0.0151, "step": 40700 }, { "epoch": 0.23684783397431247, "grad_norm": 0.8512314558029175, "learning_rate": 4.537800120862913e-05, "loss": 0.0162, "step": 40800 }, { "epoch": 0.23742834337130833, "grad_norm": 0.07062964141368866, "learning_rate": 4.5350161799548e-05, "loss": 0.0162, "step": 40900 }, { "epoch": 0.2380088527683042, "grad_norm": 0.0, "learning_rate": 4.5322247394006415e-05, "loss": 0.0164, "step": 41000 }, { "epoch": 0.23858936216530005, "grad_norm": 6.001936435699463, "learning_rate": 4.529425809487733e-05, "loss": 0.018, "step": 41100 }, { "epoch": 0.2391698715622959, "grad_norm": 1.6303260326385498, "learning_rate": 4.526619400530973e-05, "loss": 0.0154, "step": 41200 }, { "epoch": 0.23975038095929177, "grad_norm": 0.2997598648071289, "learning_rate": 4.523805522872822e-05, "loss": 0.0133, "step": 41300 }, { "epoch": 0.24033089035628763, "grad_norm": 0.1703944355249405, "learning_rate": 4.5209841868832635e-05, "loss": 0.0161, "step": 41400 }, { "epoch": 0.24091139975328352, "grad_norm": 2.1550252437591553, "learning_rate": 4.51815540295977e-05, "loss": 0.0148, "step": 41500 }, { "epoch": 0.24149190915027938, "grad_norm": 3.253269910812378, "learning_rate": 4.515319181527259e-05, "loss": 0.0197, "step": 41600 }, { "epoch": 0.24207241854727524, "grad_norm": 0.663278341293335, "learning_rate": 4.512475533038059e-05, "loss": 0.0152, "step": 41700 }, { "epoch": 0.2426529279442711, "grad_norm": 0.05046732723712921, "learning_rate": 4.5096244679718676e-05, "loss": 0.0207, "step": 41800 }, { "epoch": 0.24323343734126696, "grad_norm": 2.685068368911743, "learning_rate": 4.506765996835718e-05, "loss": 0.0154, "step": 41900 }, { "epoch": 0.24381394673826282, "grad_norm": 0.698753833770752, "learning_rate": 4.503900130163935e-05, "loss": 0.0161, "step": 42000 }, { "epoch": 0.2443944561352587, "grad_norm": 0.11762864887714386, "learning_rate": 4.501026878518097e-05, "loss": 0.0187, "step": 42100 }, { "epoch": 0.24497496553225456, "grad_norm": 1.077953577041626, "learning_rate": 4.498146252487002e-05, "loss": 0.0185, "step": 42200 }, { "epoch": 0.24555547492925042, "grad_norm": 0.18901602923870087, "learning_rate": 4.49525826268662e-05, "loss": 0.0153, "step": 42300 }, { "epoch": 0.24613598432624628, "grad_norm": 1.0348716974258423, "learning_rate": 4.492362919760063e-05, "loss": 0.0178, "step": 42400 }, { "epoch": 0.24671649372324214, "grad_norm": 0.6340203285217285, "learning_rate": 4.489460234377538e-05, "loss": 0.0158, "step": 42500 }, { "epoch": 0.247297003120238, "grad_norm": 1.0056567192077637, "learning_rate": 4.4865502172363126e-05, "loss": 0.0189, "step": 42600 }, { "epoch": 0.24787751251723386, "grad_norm": 2.1102306842803955, "learning_rate": 4.483632879060676e-05, "loss": 0.0158, "step": 42700 }, { "epoch": 0.24845802191422975, "grad_norm": 0.1478302925825119, "learning_rate": 4.480708230601895e-05, "loss": 0.0166, "step": 42800 }, { "epoch": 0.2490385313112256, "grad_norm": 0.5817315578460693, "learning_rate": 4.4777762826381775e-05, "loss": 0.0202, "step": 42900 }, { "epoch": 0.24961904070822147, "grad_norm": 2.631985664367676, "learning_rate": 4.4748370459746334e-05, "loss": 0.0151, "step": 43000 }, { "epoch": 0.2501995501052173, "grad_norm": 0.3138137459754944, "learning_rate": 4.471890531443232e-05, "loss": 0.0188, "step": 43100 }, { "epoch": 0.2507800595022132, "grad_norm": 0.6783995628356934, "learning_rate": 4.4689367499027654e-05, "loss": 0.0195, "step": 43200 }, { "epoch": 0.25136056889920905, "grad_norm": 2.4091744422912598, "learning_rate": 4.4659757122388066e-05, "loss": 0.0158, "step": 43300 }, { "epoch": 0.25194107829620493, "grad_norm": 1.260305643081665, "learning_rate": 4.463007429363668e-05, "loss": 0.0186, "step": 43400 }, { "epoch": 0.25252158769320077, "grad_norm": 1.3665226697921753, "learning_rate": 4.460031912216363e-05, "loss": 0.0163, "step": 43500 }, { "epoch": 0.25310209709019665, "grad_norm": 0.0, "learning_rate": 4.457049171762568e-05, "loss": 0.0163, "step": 43600 }, { "epoch": 0.2536826064871925, "grad_norm": 0.5342715382575989, "learning_rate": 4.454059218994577e-05, "loss": 0.0164, "step": 43700 }, { "epoch": 0.2542631158841884, "grad_norm": 0.4309409558773041, "learning_rate": 4.4510620649312643e-05, "loss": 0.0182, "step": 43800 }, { "epoch": 0.25484362528118426, "grad_norm": 0.212936669588089, "learning_rate": 4.4480577206180436e-05, "loss": 0.0145, "step": 43900 }, { "epoch": 0.2554241346781801, "grad_norm": 2.9309751987457275, "learning_rate": 4.4450461971268256e-05, "loss": 0.0152, "step": 44000 }, { "epoch": 0.256004644075176, "grad_norm": 6.400189399719238, "learning_rate": 4.4420275055559795e-05, "loss": 0.0172, "step": 44100 }, { "epoch": 0.2565851534721718, "grad_norm": 0.43217283487319946, "learning_rate": 4.43900165703029e-05, "loss": 0.0223, "step": 44200 }, { "epoch": 0.2571656628691677, "grad_norm": 0.4472719132900238, "learning_rate": 4.4359686627009204e-05, "loss": 0.0166, "step": 44300 }, { "epoch": 0.25774617226616353, "grad_norm": 0.8468680381774902, "learning_rate": 4.432928533745364e-05, "loss": 0.0267, "step": 44400 }, { "epoch": 0.2583266816631594, "grad_norm": 0.5906082987785339, "learning_rate": 4.4298812813674096e-05, "loss": 0.019, "step": 44500 }, { "epoch": 0.2589071910601553, "grad_norm": 0.6340333819389343, "learning_rate": 4.4268269167970977e-05, "loss": 0.015, "step": 44600 }, { "epoch": 0.25948770045715114, "grad_norm": 3.455953598022461, "learning_rate": 4.42376545129068e-05, "loss": 0.0179, "step": 44700 }, { "epoch": 0.260068209854147, "grad_norm": 1.7209389209747314, "learning_rate": 4.420696896130576e-05, "loss": 0.0182, "step": 44800 }, { "epoch": 0.26064871925114286, "grad_norm": 0.23180946707725525, "learning_rate": 4.417621262625334e-05, "loss": 0.0234, "step": 44900 }, { "epoch": 0.26122922864813874, "grad_norm": 1.2846555709838867, "learning_rate": 4.414538562109588e-05, "loss": 0.0185, "step": 45000 }, { "epoch": 0.26180973804513463, "grad_norm": 1.6205638647079468, "learning_rate": 4.411448805944015e-05, "loss": 0.0164, "step": 45100 }, { "epoch": 0.26239024744213046, "grad_norm": 1.5716558694839478, "learning_rate": 4.408352005515295e-05, "loss": 0.0152, "step": 45200 }, { "epoch": 0.26297075683912635, "grad_norm": 2.2886829376220703, "learning_rate": 4.4052481722360675e-05, "loss": 0.0124, "step": 45300 }, { "epoch": 0.2635512662361222, "grad_norm": 0.44392430782318115, "learning_rate": 4.402137317544891e-05, "loss": 0.0182, "step": 45400 }, { "epoch": 0.26413177563311807, "grad_norm": 1.4996153116226196, "learning_rate": 4.399019452906199e-05, "loss": 0.0181, "step": 45500 }, { "epoch": 0.2647122850301139, "grad_norm": 0.37333735823631287, "learning_rate": 4.395894589810261e-05, "loss": 0.0187, "step": 45600 }, { "epoch": 0.2652927944271098, "grad_norm": 1.011172890663147, "learning_rate": 4.392762739773135e-05, "loss": 0.0132, "step": 45700 }, { "epoch": 0.2658733038241057, "grad_norm": 0.7283264398574829, "learning_rate": 4.389623914336631e-05, "loss": 0.0212, "step": 45800 }, { "epoch": 0.2664538132211015, "grad_norm": 0.0, "learning_rate": 4.386478125068262e-05, "loss": 0.0191, "step": 45900 }, { "epoch": 0.2670343226180974, "grad_norm": 0.8834022879600525, "learning_rate": 4.3833253835612074e-05, "loss": 0.018, "step": 46000 }, { "epoch": 0.26761483201509323, "grad_norm": 0.6831231713294983, "learning_rate": 4.380165701434267e-05, "loss": 0.0145, "step": 46100 }, { "epoch": 0.2681953414120891, "grad_norm": 0.0, "learning_rate": 4.376999090331818e-05, "loss": 0.0154, "step": 46200 }, { "epoch": 0.26877585080908495, "grad_norm": 0.10557834059000015, "learning_rate": 4.3738255619237745e-05, "loss": 0.0124, "step": 46300 }, { "epoch": 0.26935636020608084, "grad_norm": 1.9961583614349365, "learning_rate": 4.370645127905542e-05, "loss": 0.0208, "step": 46400 }, { "epoch": 0.2699368696030767, "grad_norm": 0.8881611824035645, "learning_rate": 4.367457799997976e-05, "loss": 0.0132, "step": 46500 }, { "epoch": 0.27051737900007256, "grad_norm": 0.7779310345649719, "learning_rate": 4.3642635899473364e-05, "loss": 0.0207, "step": 46600 }, { "epoch": 0.27109788839706844, "grad_norm": 1.000813364982605, "learning_rate": 4.3610625095252474e-05, "loss": 0.0217, "step": 46700 }, { "epoch": 0.2716783977940643, "grad_norm": 0.2656024396419525, "learning_rate": 4.357854570528652e-05, "loss": 0.0197, "step": 46800 }, { "epoch": 0.27225890719106016, "grad_norm": 0.3503284156322479, "learning_rate": 4.3546397847797695e-05, "loss": 0.0155, "step": 46900 }, { "epoch": 0.272839416588056, "grad_norm": 2.816612482070923, "learning_rate": 4.3514181641260515e-05, "loss": 0.0196, "step": 47000 }, { "epoch": 0.2734199259850519, "grad_norm": 0.44417452812194824, "learning_rate": 4.3481897204401376e-05, "loss": 0.0164, "step": 47100 }, { "epoch": 0.27400043538204777, "grad_norm": 1.1510508060455322, "learning_rate": 4.3449544656198123e-05, "loss": 0.0155, "step": 47200 }, { "epoch": 0.2745809447790436, "grad_norm": 0.0, "learning_rate": 4.3417124115879623e-05, "loss": 0.0151, "step": 47300 }, { "epoch": 0.2751614541760395, "grad_norm": 0.687237024307251, "learning_rate": 4.3384635702925315e-05, "loss": 0.0161, "step": 47400 }, { "epoch": 0.2757419635730353, "grad_norm": 15.054317474365234, "learning_rate": 4.335207953706475e-05, "loss": 0.0159, "step": 47500 }, { "epoch": 0.2763224729700312, "grad_norm": 0.6675468683242798, "learning_rate": 4.3319455738277184e-05, "loss": 0.0226, "step": 47600 }, { "epoch": 0.27690298236702704, "grad_norm": 0.44973939657211304, "learning_rate": 4.328676442679112e-05, "loss": 0.0161, "step": 47700 }, { "epoch": 0.2774834917640229, "grad_norm": 0.5629274249076843, "learning_rate": 4.3254005723083855e-05, "loss": 0.0145, "step": 47800 }, { "epoch": 0.2780640011610188, "grad_norm": 1.0498775243759155, "learning_rate": 4.322117974788107e-05, "loss": 0.0166, "step": 47900 }, { "epoch": 0.27864451055801465, "grad_norm": 0.404738187789917, "learning_rate": 4.318828662215633e-05, "loss": 0.0148, "step": 48000 }, { "epoch": 0.27922501995501053, "grad_norm": 0.3776521384716034, "learning_rate": 4.3155326467130696e-05, "loss": 0.0208, "step": 48100 }, { "epoch": 0.27980552935200637, "grad_norm": 0.3664938807487488, "learning_rate": 4.312229940427224e-05, "loss": 0.014, "step": 48200 }, { "epoch": 0.28038603874900225, "grad_norm": 0.41216275095939636, "learning_rate": 4.308920555529561e-05, "loss": 0.0159, "step": 48300 }, { "epoch": 0.28096654814599814, "grad_norm": 0.7025476694107056, "learning_rate": 4.305604504216157e-05, "loss": 0.0144, "step": 48400 }, { "epoch": 0.281547057542994, "grad_norm": 0.5805770754814148, "learning_rate": 4.3022817987076615e-05, "loss": 0.0206, "step": 48500 }, { "epoch": 0.28212756693998986, "grad_norm": 0.10411791503429413, "learning_rate": 4.298952451249238e-05, "loss": 0.0139, "step": 48600 }, { "epoch": 0.2827080763369857, "grad_norm": 1.3499836921691895, "learning_rate": 4.295616474110534e-05, "loss": 0.0168, "step": 48700 }, { "epoch": 0.2832885857339816, "grad_norm": 0.8422473073005676, "learning_rate": 4.292273879585628e-05, "loss": 0.0169, "step": 48800 }, { "epoch": 0.2838690951309774, "grad_norm": 1.0992027521133423, "learning_rate": 4.288924679992985e-05, "loss": 0.0179, "step": 48900 }, { "epoch": 0.2844496045279733, "grad_norm": 0.9558140635490417, "learning_rate": 4.2855688876754104e-05, "loss": 0.0162, "step": 49000 }, { "epoch": 0.2850301139249692, "grad_norm": 2.5504753589630127, "learning_rate": 4.2822065150000105e-05, "loss": 0.0125, "step": 49100 }, { "epoch": 0.285610623321965, "grad_norm": 6.963260650634766, "learning_rate": 4.278837574358134e-05, "loss": 0.0145, "step": 49200 }, { "epoch": 0.2861911327189609, "grad_norm": 0.19555258750915527, "learning_rate": 4.275462078165343e-05, "loss": 0.0144, "step": 49300 }, { "epoch": 0.28677164211595674, "grad_norm": 0.13861818611621857, "learning_rate": 4.2720800388613545e-05, "loss": 0.015, "step": 49400 }, { "epoch": 0.2873521515129526, "grad_norm": 0.10115107148885727, "learning_rate": 4.2686914689099986e-05, "loss": 0.0208, "step": 49500 }, { "epoch": 0.28793266090994846, "grad_norm": 0.12271959334611893, "learning_rate": 4.265296380799174e-05, "loss": 0.0177, "step": 49600 }, { "epoch": 0.28851317030694434, "grad_norm": 0.6255984902381897, "learning_rate": 4.261894787040801e-05, "loss": 0.0142, "step": 49700 }, { "epoch": 0.28909367970394023, "grad_norm": 0.24981549382209778, "learning_rate": 4.258486700170774e-05, "loss": 0.0129, "step": 49800 }, { "epoch": 0.28967418910093606, "grad_norm": 0.42702168226242065, "learning_rate": 4.2550721327489165e-05, "loss": 0.0197, "step": 49900 }, { "epoch": 0.29025469849793195, "grad_norm": 0.2005091905593872, "learning_rate": 4.2516510973589366e-05, "loss": 0.0165, "step": 50000 }, { "epoch": 0.2908352078949278, "grad_norm": 0.18545077741146088, "learning_rate": 4.248223606608378e-05, "loss": 0.0197, "step": 50100 }, { "epoch": 0.29141571729192367, "grad_norm": 2.60361385345459, "learning_rate": 4.244789673128572e-05, "loss": 0.0208, "step": 50200 }, { "epoch": 0.2919962266889195, "grad_norm": 1.1765265464782715, "learning_rate": 4.241349309574596e-05, "loss": 0.0161, "step": 50300 }, { "epoch": 0.2925767360859154, "grad_norm": 0.3382522463798523, "learning_rate": 4.237902528625224e-05, "loss": 0.0164, "step": 50400 }, { "epoch": 0.2931572454829113, "grad_norm": 0.8997277021408081, "learning_rate": 4.234449342982879e-05, "loss": 0.0173, "step": 50500 }, { "epoch": 0.2937377548799071, "grad_norm": 0.3323515057563782, "learning_rate": 4.230989765373587e-05, "loss": 0.0156, "step": 50600 }, { "epoch": 0.294318264276903, "grad_norm": 0.0954294502735138, "learning_rate": 4.2275238085469326e-05, "loss": 0.0171, "step": 50700 }, { "epoch": 0.29489877367389883, "grad_norm": 0.3944256007671356, "learning_rate": 4.224051485276006e-05, "loss": 0.0147, "step": 50800 }, { "epoch": 0.2954792830708947, "grad_norm": 0.0, "learning_rate": 4.220572808357363e-05, "loss": 0.0178, "step": 50900 }, { "epoch": 0.29605979246789055, "grad_norm": 1.3922127485275269, "learning_rate": 4.217087790610973e-05, "loss": 0.016, "step": 51000 }, { "epoch": 0.29664030186488644, "grad_norm": 0.48834431171417236, "learning_rate": 4.213596444880173e-05, "loss": 0.013, "step": 51100 }, { "epoch": 0.2972208112618823, "grad_norm": 1.1236047744750977, "learning_rate": 4.210098784031621e-05, "loss": 0.0177, "step": 51200 }, { "epoch": 0.29780132065887815, "grad_norm": 0.22413845360279083, "learning_rate": 4.206594820955249e-05, "loss": 0.0195, "step": 51300 }, { "epoch": 0.29838183005587404, "grad_norm": 1.6964247226715088, "learning_rate": 4.2030845685642136e-05, "loss": 0.0171, "step": 51400 }, { "epoch": 0.2989623394528699, "grad_norm": 0.4666268825531006, "learning_rate": 4.199568039794848e-05, "loss": 0.0181, "step": 51500 }, { "epoch": 0.29954284884986576, "grad_norm": 0.7793068289756775, "learning_rate": 4.196045247606619e-05, "loss": 0.0149, "step": 51600 }, { "epoch": 0.3001233582468616, "grad_norm": 0.6577598452568054, "learning_rate": 4.192516204982073e-05, "loss": 0.0215, "step": 51700 }, { "epoch": 0.3007038676438575, "grad_norm": 0.31358566880226135, "learning_rate": 4.188980924926794e-05, "loss": 0.0208, "step": 51800 }, { "epoch": 0.30128437704085337, "grad_norm": 0.1279175728559494, "learning_rate": 4.1854394204693495e-05, "loss": 0.0132, "step": 51900 }, { "epoch": 0.3018648864378492, "grad_norm": 1.288215160369873, "learning_rate": 4.1818917046612474e-05, "loss": 0.0196, "step": 52000 }, { "epoch": 0.3024453958348451, "grad_norm": 0.7573376893997192, "learning_rate": 4.178337790576888e-05, "loss": 0.018, "step": 52100 }, { "epoch": 0.3030259052318409, "grad_norm": 0.07305464148521423, "learning_rate": 4.1747776913135115e-05, "loss": 0.0144, "step": 52200 }, { "epoch": 0.3036064146288368, "grad_norm": 0.41413378715515137, "learning_rate": 4.1712114199911534e-05, "loss": 0.0195, "step": 52300 }, { "epoch": 0.3041869240258327, "grad_norm": 0.6894093155860901, "learning_rate": 4.1676389897525946e-05, "loss": 0.0147, "step": 52400 }, { "epoch": 0.3047674334228285, "grad_norm": 0.0, "learning_rate": 4.1640604137633144e-05, "loss": 0.0191, "step": 52500 }, { "epoch": 0.3053479428198244, "grad_norm": 0.42625299096107483, "learning_rate": 4.16047570521144e-05, "loss": 0.0143, "step": 52600 }, { "epoch": 0.30592845221682025, "grad_norm": 0.7680391073226929, "learning_rate": 4.156884877307701e-05, "loss": 0.0141, "step": 52700 }, { "epoch": 0.30650896161381613, "grad_norm": 0.7309791445732117, "learning_rate": 4.1532879432853744e-05, "loss": 0.0142, "step": 52800 }, { "epoch": 0.30708947101081197, "grad_norm": 0.5670241117477417, "learning_rate": 4.149684916400246e-05, "loss": 0.016, "step": 52900 }, { "epoch": 0.30766998040780785, "grad_norm": 1.2159571647644043, "learning_rate": 4.146075809930549e-05, "loss": 0.0192, "step": 53000 }, { "epoch": 0.30825048980480374, "grad_norm": 1.0170856714248657, "learning_rate": 4.142460637176928e-05, "loss": 0.0139, "step": 53100 }, { "epoch": 0.30883099920179957, "grad_norm": 0.7249845266342163, "learning_rate": 4.138839411462379e-05, "loss": 0.0162, "step": 53200 }, { "epoch": 0.30941150859879546, "grad_norm": 0.25050070881843567, "learning_rate": 4.1352121461322065e-05, "loss": 0.0164, "step": 53300 }, { "epoch": 0.3099920179957913, "grad_norm": 0.68352210521698, "learning_rate": 4.131578854553976e-05, "loss": 0.0175, "step": 53400 }, { "epoch": 0.3105725273927872, "grad_norm": 0.3992615342140198, "learning_rate": 4.1279395501174544e-05, "loss": 0.0178, "step": 53500 }, { "epoch": 0.311153036789783, "grad_norm": 0.0, "learning_rate": 4.1242942462345744e-05, "loss": 0.0192, "step": 53600 }, { "epoch": 0.3117335461867789, "grad_norm": 1.5646060705184937, "learning_rate": 4.1206429563393765e-05, "loss": 0.0185, "step": 53700 }, { "epoch": 0.3123140555837748, "grad_norm": 0.9312039613723755, "learning_rate": 4.11698569388796e-05, "loss": 0.0136, "step": 53800 }, { "epoch": 0.3128945649807706, "grad_norm": 1.540569543838501, "learning_rate": 4.113322472358436e-05, "loss": 0.0188, "step": 53900 }, { "epoch": 0.3134750743777665, "grad_norm": 0.13842260837554932, "learning_rate": 4.109653305250877e-05, "loss": 0.0142, "step": 54000 }, { "epoch": 0.31405558377476234, "grad_norm": 1.753185510635376, "learning_rate": 4.105978206087265e-05, "loss": 0.0188, "step": 54100 }, { "epoch": 0.3146360931717582, "grad_norm": 0.4443669319152832, "learning_rate": 4.102297188411446e-05, "loss": 0.0165, "step": 54200 }, { "epoch": 0.31521660256875406, "grad_norm": 0.6722429990768433, "learning_rate": 4.0986102657890744e-05, "loss": 0.0192, "step": 54300 }, { "epoch": 0.31579711196574994, "grad_norm": 0.7747224569320679, "learning_rate": 4.09491745180757e-05, "loss": 0.0169, "step": 54400 }, { "epoch": 0.31637762136274583, "grad_norm": 0.2598312795162201, "learning_rate": 4.09121876007606e-05, "loss": 0.0172, "step": 54500 }, { "epoch": 0.31695813075974166, "grad_norm": 0.11571415513753891, "learning_rate": 4.087514204225336e-05, "loss": 0.013, "step": 54600 }, { "epoch": 0.31753864015673755, "grad_norm": 0.6480383276939392, "learning_rate": 4.0838037979077976e-05, "loss": 0.0182, "step": 54700 }, { "epoch": 0.3181191495537334, "grad_norm": 0.6114629507064819, "learning_rate": 4.080087554797408e-05, "loss": 0.0206, "step": 54800 }, { "epoch": 0.31869965895072927, "grad_norm": 0.8482924699783325, "learning_rate": 4.076365488589641e-05, "loss": 0.0229, "step": 54900 }, { "epoch": 0.3192801683477251, "grad_norm": 0.37672215700149536, "learning_rate": 4.072637613001426e-05, "loss": 0.0188, "step": 55000 }, { "epoch": 0.319860677744721, "grad_norm": 0.7157580256462097, "learning_rate": 4.0689039417711075e-05, "loss": 0.0176, "step": 55100 }, { "epoch": 0.3204411871417169, "grad_norm": 1.3182079792022705, "learning_rate": 4.065164488658383e-05, "loss": 0.0183, "step": 55200 }, { "epoch": 0.3210216965387127, "grad_norm": 17.849079132080078, "learning_rate": 4.061419267444263e-05, "loss": 0.0147, "step": 55300 }, { "epoch": 0.3216022059357086, "grad_norm": 2.8836376667022705, "learning_rate": 4.057668291931012e-05, "loss": 0.0164, "step": 55400 }, { "epoch": 0.32218271533270443, "grad_norm": 2.2379231452941895, "learning_rate": 4.0539115759421016e-05, "loss": 0.0155, "step": 55500 }, { "epoch": 0.3227632247297003, "grad_norm": 0.6565619707107544, "learning_rate": 4.050149133322158e-05, "loss": 0.0158, "step": 55600 }, { "epoch": 0.3233437341266962, "grad_norm": 9.40256404876709, "learning_rate": 4.046380977936915e-05, "loss": 0.0161, "step": 55700 }, { "epoch": 0.32392424352369203, "grad_norm": 1.0256812572479248, "learning_rate": 4.042607123673156e-05, "loss": 0.0185, "step": 55800 }, { "epoch": 0.3245047529206879, "grad_norm": 0.3539896011352539, "learning_rate": 4.038827584438668e-05, "loss": 0.013, "step": 55900 }, { "epoch": 0.32508526231768375, "grad_norm": 0.3358542323112488, "learning_rate": 4.035042374162189e-05, "loss": 0.0179, "step": 56000 }, { "epoch": 0.32566577171467964, "grad_norm": 0.6049757599830627, "learning_rate": 4.0312515067933545e-05, "loss": 0.0158, "step": 56100 }, { "epoch": 0.3262462811116755, "grad_norm": 0.1417369246482849, "learning_rate": 4.027454996302652e-05, "loss": 0.019, "step": 56200 }, { "epoch": 0.32682679050867136, "grad_norm": 1.0133875608444214, "learning_rate": 4.023652856681363e-05, "loss": 0.0145, "step": 56300 }, { "epoch": 0.32740729990566725, "grad_norm": 5.445352554321289, "learning_rate": 4.019845101941512e-05, "loss": 0.0202, "step": 56400 }, { "epoch": 0.3279878093026631, "grad_norm": 2.076885223388672, "learning_rate": 4.0160317461158213e-05, "loss": 0.0138, "step": 56500 }, { "epoch": 0.32856831869965897, "grad_norm": 0.7435348033905029, "learning_rate": 4.0122128032576524e-05, "loss": 0.0163, "step": 56600 }, { "epoch": 0.3291488280966548, "grad_norm": 0.662987470626831, "learning_rate": 4.0083882874409576e-05, "loss": 0.0179, "step": 56700 }, { "epoch": 0.3297293374936507, "grad_norm": 0.7310676574707031, "learning_rate": 4.004558212760227e-05, "loss": 0.0136, "step": 56800 }, { "epoch": 0.3303098468906465, "grad_norm": 1.144674301147461, "learning_rate": 4.0007225933304344e-05, "loss": 0.0183, "step": 56900 }, { "epoch": 0.3308903562876424, "grad_norm": 0.7550173997879028, "learning_rate": 3.9968814432869914e-05, "loss": 0.0125, "step": 57000 }, { "epoch": 0.3314708656846383, "grad_norm": 0.5192617774009705, "learning_rate": 3.993034776785691e-05, "loss": 0.014, "step": 57100 }, { "epoch": 0.3320513750816341, "grad_norm": 0.10176233947277069, "learning_rate": 3.9891826080026535e-05, "loss": 0.0148, "step": 57200 }, { "epoch": 0.33263188447863, "grad_norm": 0.0, "learning_rate": 3.9853249511342786e-05, "loss": 0.0153, "step": 57300 }, { "epoch": 0.33321239387562585, "grad_norm": 0.5603938698768616, "learning_rate": 3.981461820397191e-05, "loss": 0.0153, "step": 57400 }, { "epoch": 0.33379290327262173, "grad_norm": 0.9487095475196838, "learning_rate": 3.977593230028188e-05, "loss": 0.0158, "step": 57500 }, { "epoch": 0.33437341266961756, "grad_norm": 4.972527027130127, "learning_rate": 3.973719194284188e-05, "loss": 0.016, "step": 57600 }, { "epoch": 0.33495392206661345, "grad_norm": 0.1742544323205948, "learning_rate": 3.969839727442175e-05, "loss": 0.017, "step": 57700 }, { "epoch": 0.33553443146360934, "grad_norm": 0.43199607729911804, "learning_rate": 3.965954843799152e-05, "loss": 0.0156, "step": 57800 }, { "epoch": 0.33611494086060517, "grad_norm": 2.0231590270996094, "learning_rate": 3.9620645576720815e-05, "loss": 0.0173, "step": 57900 }, { "epoch": 0.33669545025760106, "grad_norm": 1.236526608467102, "learning_rate": 3.9581688833978375e-05, "loss": 0.0171, "step": 58000 }, { "epoch": 0.3372759596545969, "grad_norm": 1.1368087530136108, "learning_rate": 3.954267835333148e-05, "loss": 0.0118, "step": 58100 }, { "epoch": 0.3378564690515928, "grad_norm": 0.8430467844009399, "learning_rate": 3.9503614278545494e-05, "loss": 0.0141, "step": 58200 }, { "epoch": 0.3384369784485886, "grad_norm": 0.19449672102928162, "learning_rate": 3.946449675358327e-05, "loss": 0.0158, "step": 58300 }, { "epoch": 0.3390174878455845, "grad_norm": 0.10014590620994568, "learning_rate": 3.9425325922604615e-05, "loss": 0.0152, "step": 58400 }, { "epoch": 0.3395979972425804, "grad_norm": 0.4984476864337921, "learning_rate": 3.938610192996584e-05, "loss": 0.0164, "step": 58500 }, { "epoch": 0.3401785066395762, "grad_norm": 2.3255436420440674, "learning_rate": 3.934682492021913e-05, "loss": 0.0181, "step": 58600 }, { "epoch": 0.3407590160365721, "grad_norm": 1.8835875988006592, "learning_rate": 3.930749503811206e-05, "loss": 0.012, "step": 58700 }, { "epoch": 0.34133952543356794, "grad_norm": 1.3894046545028687, "learning_rate": 3.9268112428587074e-05, "loss": 0.015, "step": 58800 }, { "epoch": 0.3419200348305638, "grad_norm": 0.15835818648338318, "learning_rate": 3.922867723678091e-05, "loss": 0.0166, "step": 58900 }, { "epoch": 0.3425005442275597, "grad_norm": 0.3661365807056427, "learning_rate": 3.918918960802411e-05, "loss": 0.0162, "step": 59000 }, { "epoch": 0.34308105362455554, "grad_norm": 0.11089111864566803, "learning_rate": 3.914964968784044e-05, "loss": 0.0232, "step": 59100 }, { "epoch": 0.34366156302155143, "grad_norm": 2.527754306793213, "learning_rate": 3.911005762194639e-05, "loss": 0.0147, "step": 59200 }, { "epoch": 0.34424207241854726, "grad_norm": 0.15072380006313324, "learning_rate": 3.9070413556250616e-05, "loss": 0.0189, "step": 59300 }, { "epoch": 0.34482258181554315, "grad_norm": 0.5700109004974365, "learning_rate": 3.903071763685342e-05, "loss": 0.0151, "step": 59400 }, { "epoch": 0.345403091212539, "grad_norm": 0.8213745951652527, "learning_rate": 3.899097001004618e-05, "loss": 0.0167, "step": 59500 }, { "epoch": 0.34598360060953487, "grad_norm": 1.022154450416565, "learning_rate": 3.895117082231085e-05, "loss": 0.0146, "step": 59600 }, { "epoch": 0.34656411000653076, "grad_norm": 0.2379520982503891, "learning_rate": 3.891132022031939e-05, "loss": 0.0179, "step": 59700 }, { "epoch": 0.3471446194035266, "grad_norm": 0.835014283657074, "learning_rate": 3.8871418350933256e-05, "loss": 0.0145, "step": 59800 }, { "epoch": 0.3477251288005225, "grad_norm": 5.786501884460449, "learning_rate": 3.8831465361202794e-05, "loss": 0.0145, "step": 59900 }, { "epoch": 0.3483056381975183, "grad_norm": 0.40479740500450134, "learning_rate": 3.87914613983668e-05, "loss": 0.0175, "step": 60000 }, { "epoch": 0.3488861475945142, "grad_norm": 0.2653241753578186, "learning_rate": 3.875140660985189e-05, "loss": 0.0156, "step": 60100 }, { "epoch": 0.34946665699151, "grad_norm": 0.2719464600086212, "learning_rate": 3.8711301143272004e-05, "loss": 0.0122, "step": 60200 }, { "epoch": 0.3500471663885059, "grad_norm": 0.23439522087574005, "learning_rate": 3.8671145146427825e-05, "loss": 0.0169, "step": 60300 }, { "epoch": 0.3506276757855018, "grad_norm": 0.06842320412397385, "learning_rate": 3.8630938767306256e-05, "loss": 0.0141, "step": 60400 }, { "epoch": 0.35120818518249763, "grad_norm": 0.0, "learning_rate": 3.85906821540799e-05, "loss": 0.0144, "step": 60500 }, { "epoch": 0.3517886945794935, "grad_norm": 0.0, "learning_rate": 3.855037545510648e-05, "loss": 0.017, "step": 60600 }, { "epoch": 0.35236920397648935, "grad_norm": 0.27966034412384033, "learning_rate": 3.851001881892827e-05, "loss": 0.0197, "step": 60700 }, { "epoch": 0.35294971337348524, "grad_norm": 2.5935139656066895, "learning_rate": 3.846961239427161e-05, "loss": 0.0164, "step": 60800 }, { "epoch": 0.3535302227704811, "grad_norm": 0.5523900985717773, "learning_rate": 3.842915633004632e-05, "loss": 0.0186, "step": 60900 }, { "epoch": 0.35411073216747696, "grad_norm": 7.492378234863281, "learning_rate": 3.8388650775345144e-05, "loss": 0.0182, "step": 61000 }, { "epoch": 0.35469124156447285, "grad_norm": 0.31653234362602234, "learning_rate": 3.8348095879443226e-05, "loss": 0.0145, "step": 61100 }, { "epoch": 0.3552717509614687, "grad_norm": 1.4802820682525635, "learning_rate": 3.830749179179752e-05, "loss": 0.015, "step": 61200 }, { "epoch": 0.35585226035846457, "grad_norm": 0.4932232201099396, "learning_rate": 3.8266838662046334e-05, "loss": 0.0133, "step": 61300 }, { "epoch": 0.3564327697554604, "grad_norm": 3.2751312255859375, "learning_rate": 3.822613664000862e-05, "loss": 0.0155, "step": 61400 }, { "epoch": 0.3570132791524563, "grad_norm": 0.8037987351417542, "learning_rate": 3.818538587568359e-05, "loss": 0.0196, "step": 61500 }, { "epoch": 0.3575937885494521, "grad_norm": 0.5324920415878296, "learning_rate": 3.8144586519250044e-05, "loss": 0.0161, "step": 61600 }, { "epoch": 0.358174297946448, "grad_norm": 0.251559317111969, "learning_rate": 3.8103738721065856e-05, "loss": 0.014, "step": 61700 }, { "epoch": 0.3587548073434439, "grad_norm": 2.9223034381866455, "learning_rate": 3.806284263166745e-05, "loss": 0.0119, "step": 61800 }, { "epoch": 0.3593353167404397, "grad_norm": 0.4311857521533966, "learning_rate": 3.8021898401769205e-05, "loss": 0.0149, "step": 61900 }, { "epoch": 0.3599158261374356, "grad_norm": 0.5429189801216125, "learning_rate": 3.7980906182262893e-05, "loss": 0.0211, "step": 62000 }, { "epoch": 0.36049633553443144, "grad_norm": 1.3071308135986328, "learning_rate": 3.793986612421717e-05, "loss": 0.0132, "step": 62100 }, { "epoch": 0.36107684493142733, "grad_norm": 0.4390534460544586, "learning_rate": 3.789877837887698e-05, "loss": 0.0165, "step": 62200 }, { "epoch": 0.36165735432842316, "grad_norm": 42.51701354980469, "learning_rate": 3.7857643097663006e-05, "loss": 0.0151, "step": 62300 }, { "epoch": 0.36223786372541905, "grad_norm": 1.4220099449157715, "learning_rate": 3.7816460432171135e-05, "loss": 0.014, "step": 62400 }, { "epoch": 0.36281837312241494, "grad_norm": 0.5632671117782593, "learning_rate": 3.777523053417184e-05, "loss": 0.0168, "step": 62500 }, { "epoch": 0.36339888251941077, "grad_norm": 0.7514089345932007, "learning_rate": 3.7733953555609696e-05, "loss": 0.0171, "step": 62600 }, { "epoch": 0.36397939191640666, "grad_norm": 0.2681547701358795, "learning_rate": 3.769262964860276e-05, "loss": 0.0134, "step": 62700 }, { "epoch": 0.3645599013134025, "grad_norm": 1.961303472518921, "learning_rate": 3.765125896544206e-05, "loss": 0.0201, "step": 62800 }, { "epoch": 0.3651404107103984, "grad_norm": 0.6701322197914124, "learning_rate": 3.7609841658590985e-05, "loss": 0.0159, "step": 62900 }, { "epoch": 0.36572092010739427, "grad_norm": 0.3040947914123535, "learning_rate": 3.756837788068475e-05, "loss": 0.0157, "step": 63000 }, { "epoch": 0.3663014295043901, "grad_norm": 0.28461697697639465, "learning_rate": 3.7526867784529835e-05, "loss": 0.0172, "step": 63100 }, { "epoch": 0.366881938901386, "grad_norm": 1.9548020362854004, "learning_rate": 3.7485311523103427e-05, "loss": 0.0199, "step": 63200 }, { "epoch": 0.3674624482983818, "grad_norm": 0.6341608762741089, "learning_rate": 3.744370924955282e-05, "loss": 0.0206, "step": 63300 }, { "epoch": 0.3680429576953777, "grad_norm": 0.3764314651489258, "learning_rate": 3.7402061117194915e-05, "loss": 0.0151, "step": 63400 }, { "epoch": 0.36862346709237354, "grad_norm": 0.3538680076599121, "learning_rate": 3.7360367279515565e-05, "loss": 0.0157, "step": 63500 }, { "epoch": 0.3692039764893694, "grad_norm": 0.1386338174343109, "learning_rate": 3.731862789016911e-05, "loss": 0.0147, "step": 63600 }, { "epoch": 0.3697844858863653, "grad_norm": 0.5350472331047058, "learning_rate": 3.7276843102977725e-05, "loss": 0.0128, "step": 63700 }, { "epoch": 0.37036499528336114, "grad_norm": 1.673578143119812, "learning_rate": 3.723501307193091e-05, "loss": 0.0138, "step": 63800 }, { "epoch": 0.37094550468035703, "grad_norm": 0.18305979669094086, "learning_rate": 3.719313795118491e-05, "loss": 0.0198, "step": 63900 }, { "epoch": 0.37152601407735286, "grad_norm": 0.8609408736228943, "learning_rate": 3.7151217895062105e-05, "loss": 0.0188, "step": 64000 }, { "epoch": 0.37210652347434875, "grad_norm": 0.28579720854759216, "learning_rate": 3.710925305805051e-05, "loss": 0.0163, "step": 64100 }, { "epoch": 0.3726870328713446, "grad_norm": 0.5904589295387268, "learning_rate": 3.706724359480316e-05, "loss": 0.0156, "step": 64200 }, { "epoch": 0.37326754226834047, "grad_norm": 1.1671172380447388, "learning_rate": 3.7025189660137535e-05, "loss": 0.0157, "step": 64300 }, { "epoch": 0.37384805166533636, "grad_norm": 0.12482750415802002, "learning_rate": 3.698309140903504e-05, "loss": 0.0143, "step": 64400 }, { "epoch": 0.3744285610623322, "grad_norm": 2.280238151550293, "learning_rate": 3.694094899664037e-05, "loss": 0.0142, "step": 64500 }, { "epoch": 0.3750090704593281, "grad_norm": 0.1061575785279274, "learning_rate": 3.689876257826096e-05, "loss": 0.0228, "step": 64600 }, { "epoch": 0.3755895798563239, "grad_norm": 0.531701385974884, "learning_rate": 3.685653230936646e-05, "loss": 0.0193, "step": 64700 }, { "epoch": 0.3761700892533198, "grad_norm": 0.49610480666160583, "learning_rate": 3.681425834558808e-05, "loss": 0.0182, "step": 64800 }, { "epoch": 0.3767505986503156, "grad_norm": 2.2673187255859375, "learning_rate": 3.67719408427181e-05, "loss": 0.0153, "step": 64900 }, { "epoch": 0.3773311080473115, "grad_norm": 0.8174604177474976, "learning_rate": 3.672957995670921e-05, "loss": 0.0152, "step": 65000 }, { "epoch": 0.3779116174443074, "grad_norm": 2.4207687377929688, "learning_rate": 3.668717584367401e-05, "loss": 0.0135, "step": 65100 }, { "epoch": 0.37849212684130323, "grad_norm": 0.7421271204948425, "learning_rate": 3.664472865988441e-05, "loss": 0.0171, "step": 65200 }, { "epoch": 0.3790726362382991, "grad_norm": 0.5504394173622131, "learning_rate": 3.660223856177102e-05, "loss": 0.0171, "step": 65300 }, { "epoch": 0.37965314563529495, "grad_norm": 0.41542255878448486, "learning_rate": 3.655970570592262e-05, "loss": 0.0118, "step": 65400 }, { "epoch": 0.38023365503229084, "grad_norm": 0.0, "learning_rate": 3.651713024908556e-05, "loss": 0.012, "step": 65500 }, { "epoch": 0.3808141644292867, "grad_norm": 0.054853569716215134, "learning_rate": 3.64745123481632e-05, "loss": 0.0201, "step": 65600 }, { "epoch": 0.38139467382628256, "grad_norm": 1.7980788946151733, "learning_rate": 3.643185216021531e-05, "loss": 0.0114, "step": 65700 }, { "epoch": 0.38197518322327845, "grad_norm": 0.3637460470199585, "learning_rate": 3.6389149842457486e-05, "loss": 0.0158, "step": 65800 }, { "epoch": 0.3825556926202743, "grad_norm": 1.9792327880859375, "learning_rate": 3.634640555226062e-05, "loss": 0.0156, "step": 65900 }, { "epoch": 0.38313620201727017, "grad_norm": 0.38713720440864563, "learning_rate": 3.630361944715024e-05, "loss": 0.0162, "step": 66000 }, { "epoch": 0.383716711414266, "grad_norm": 0.6612209677696228, "learning_rate": 3.626079168480601e-05, "loss": 0.0147, "step": 66100 }, { "epoch": 0.3842972208112619, "grad_norm": 1.3527421951293945, "learning_rate": 3.621792242306111e-05, "loss": 0.0168, "step": 66200 }, { "epoch": 0.3848777302082578, "grad_norm": 3.078646421432495, "learning_rate": 3.617501181990164e-05, "loss": 0.015, "step": 66300 }, { "epoch": 0.3854582396052536, "grad_norm": 2.9273505210876465, "learning_rate": 3.613206003346606e-05, "loss": 0.0182, "step": 66400 }, { "epoch": 0.3860387490022495, "grad_norm": 0.700567364692688, "learning_rate": 3.608906722204463e-05, "loss": 0.0138, "step": 66500 }, { "epoch": 0.3866192583992453, "grad_norm": 0.4075513482093811, "learning_rate": 3.6046033544078736e-05, "loss": 0.0151, "step": 66600 }, { "epoch": 0.3871997677962412, "grad_norm": 1.424938678741455, "learning_rate": 3.6002959158160454e-05, "loss": 0.0141, "step": 66700 }, { "epoch": 0.38778027719323704, "grad_norm": 1.095062255859375, "learning_rate": 3.595984422303182e-05, "loss": 0.0177, "step": 66800 }, { "epoch": 0.38836078659023293, "grad_norm": 1.1501364707946777, "learning_rate": 3.591668889758432e-05, "loss": 0.0128, "step": 66900 }, { "epoch": 0.3889412959872288, "grad_norm": 0.35930997133255005, "learning_rate": 3.587349334085831e-05, "loss": 0.0163, "step": 67000 }, { "epoch": 0.38952180538422465, "grad_norm": 0.22883236408233643, "learning_rate": 3.5830257712042374e-05, "loss": 0.0144, "step": 67100 }, { "epoch": 0.39010231478122054, "grad_norm": 0.0, "learning_rate": 3.578698217047281e-05, "loss": 0.0146, "step": 67200 }, { "epoch": 0.39068282417821637, "grad_norm": 0.4836377501487732, "learning_rate": 3.574366687563298e-05, "loss": 0.0155, "step": 67300 }, { "epoch": 0.39126333357521226, "grad_norm": 1.9902615547180176, "learning_rate": 3.570031198715277e-05, "loss": 0.0189, "step": 67400 }, { "epoch": 0.3918438429722081, "grad_norm": 0.6981222629547119, "learning_rate": 3.565691766480795e-05, "loss": 0.0167, "step": 67500 }, { "epoch": 0.392424352369204, "grad_norm": 1.2047152519226074, "learning_rate": 3.561348406851966e-05, "loss": 0.0158, "step": 67600 }, { "epoch": 0.39300486176619986, "grad_norm": 1.062116026878357, "learning_rate": 3.557001135835375e-05, "loss": 0.0156, "step": 67700 }, { "epoch": 0.3935853711631957, "grad_norm": 3.757115602493286, "learning_rate": 3.55264996945202e-05, "loss": 0.0157, "step": 67800 }, { "epoch": 0.3941658805601916, "grad_norm": 2.9534924030303955, "learning_rate": 3.548294923737258e-05, "loss": 0.0157, "step": 67900 }, { "epoch": 0.3947463899571874, "grad_norm": 0.46122029423713684, "learning_rate": 3.5439360147407404e-05, "loss": 0.016, "step": 68000 }, { "epoch": 0.3953268993541833, "grad_norm": 2.8722681999206543, "learning_rate": 3.5395732585263566e-05, "loss": 0.0144, "step": 68100 }, { "epoch": 0.39590740875117914, "grad_norm": 0.988606870174408, "learning_rate": 3.535206671172175e-05, "loss": 0.014, "step": 68200 }, { "epoch": 0.396487918148175, "grad_norm": 0.39610621333122253, "learning_rate": 3.530836268770379e-05, "loss": 0.0141, "step": 68300 }, { "epoch": 0.3970684275451709, "grad_norm": 3.2667903900146484, "learning_rate": 3.526462067427218e-05, "loss": 0.0212, "step": 68400 }, { "epoch": 0.39764893694216674, "grad_norm": 0.9565812945365906, "learning_rate": 3.522084083262935e-05, "loss": 0.0145, "step": 68500 }, { "epoch": 0.39822944633916263, "grad_norm": 1.0002511739730835, "learning_rate": 3.5177023324117206e-05, "loss": 0.0158, "step": 68600 }, { "epoch": 0.39880995573615846, "grad_norm": 0.8633850812911987, "learning_rate": 3.51331683102164e-05, "loss": 0.0192, "step": 68700 }, { "epoch": 0.39939046513315435, "grad_norm": 0.359651654958725, "learning_rate": 3.508927595254585e-05, "loss": 0.0198, "step": 68800 }, { "epoch": 0.3999709745301502, "grad_norm": 1.0570274591445923, "learning_rate": 3.504534641286209e-05, "loss": 0.0163, "step": 68900 }, { "epoch": 0.40055148392714607, "grad_norm": 0.0, "learning_rate": 3.500137985305865e-05, "loss": 0.0141, "step": 69000 }, { "epoch": 0.40113199332414196, "grad_norm": 1.52181077003479, "learning_rate": 3.495737643516552e-05, "loss": 0.0145, "step": 69100 }, { "epoch": 0.4017125027211378, "grad_norm": 0.6070308685302734, "learning_rate": 3.491333632134852e-05, "loss": 0.0179, "step": 69200 }, { "epoch": 0.4022930121181337, "grad_norm": 0.19646623730659485, "learning_rate": 3.486925967390871e-05, "loss": 0.0139, "step": 69300 }, { "epoch": 0.4028735215151295, "grad_norm": 0.0677868127822876, "learning_rate": 3.482514665528176e-05, "loss": 0.0186, "step": 69400 }, { "epoch": 0.4034540309121254, "grad_norm": 9.331048965454102, "learning_rate": 3.4780997428037424e-05, "loss": 0.0139, "step": 69500 }, { "epoch": 0.4040345403091212, "grad_norm": 1.8702892065048218, "learning_rate": 3.473681215487884e-05, "loss": 0.0162, "step": 69600 }, { "epoch": 0.4046150497061171, "grad_norm": 0.36429017782211304, "learning_rate": 3.4692590998642026e-05, "loss": 0.0164, "step": 69700 }, { "epoch": 0.405195559103113, "grad_norm": 2.0621962547302246, "learning_rate": 3.464833412229523e-05, "loss": 0.0125, "step": 69800 }, { "epoch": 0.40577606850010883, "grad_norm": 0.6523299217224121, "learning_rate": 3.460404168893834e-05, "loss": 0.0171, "step": 69900 }, { "epoch": 0.4063565778971047, "grad_norm": 0.20909562706947327, "learning_rate": 3.455971386180229e-05, "loss": 0.0179, "step": 70000 } ], "logging_steps": 100, "max_steps": 172263, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1534812258533112e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }