diff --git "a/final/trainer_state.json" "b/final/trainer_state.json" new file mode 100644--- /dev/null +++ "b/final/trainer_state.json" @@ -0,0 +1,67431 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9627, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010387451958034694, + "grad_norm": 1.3366456031799316, + "learning_rate": 9.9999997337695e-05, + "loss": 2.4557, + "step": 1 + }, + { + "epoch": 0.00020774903916069389, + "grad_norm": 0.7125677466392517, + "learning_rate": 9.999998935078033e-05, + "loss": 2.1366, + "step": 2 + }, + { + "epoch": 0.0003116235587410408, + "grad_norm": 0.5620805621147156, + "learning_rate": 9.999997603925677e-05, + "loss": 1.9955, + "step": 3 + }, + { + "epoch": 0.00041549807832138777, + "grad_norm": 0.44992074370384216, + "learning_rate": 9.99999574031258e-05, + "loss": 2.1548, + "step": 4 + }, + { + "epoch": 0.0005193725979017347, + "grad_norm": 0.4193553626537323, + "learning_rate": 9.999993344238936e-05, + "loss": 2.0443, + "step": 5 + }, + { + "epoch": 0.0006232471174820816, + "grad_norm": 0.4412941336631775, + "learning_rate": 9.999990415705005e-05, + "loss": 2.0479, + "step": 6 + }, + { + "epoch": 0.0007271216370624286, + "grad_norm": 0.40010932087898254, + "learning_rate": 9.999986954711094e-05, + "loss": 2.0064, + "step": 7 + }, + { + "epoch": 0.0008309961566427755, + "grad_norm": 0.4279223680496216, + "learning_rate": 9.999982961257574e-05, + "loss": 1.8892, + "step": 8 + }, + { + "epoch": 0.0009348706762231225, + "grad_norm": 0.4031948149204254, + "learning_rate": 9.999978435344872e-05, + "loss": 1.9309, + "step": 9 + }, + { + "epoch": 0.0010387451958034693, + "grad_norm": 0.34114745259284973, + "learning_rate": 9.999973376973466e-05, + "loss": 1.6737, + "step": 10 + }, + { + "epoch": 0.0011426197153838163, + "grad_norm": 0.4245925545692444, + "learning_rate": 9.999967786143895e-05, + "loss": 1.8743, + "step": 11 + }, + { + "epoch": 0.0012464942349641633, + "grad_norm": 0.4021143317222595, + "learning_rate": 9.99996166285676e-05, + "loss": 1.8595, + "step": 12 + }, + { + "epoch": 0.0013503687545445102, + "grad_norm": 0.38615134358406067, + "learning_rate": 9.999955007112706e-05, + "loss": 1.8856, + "step": 13 + }, + { + "epoch": 0.0014542432741248572, + "grad_norm": 0.3954271078109741, + "learning_rate": 9.999947818912448e-05, + "loss": 2.0519, + "step": 14 + }, + { + "epoch": 0.0015581177937052041, + "grad_norm": 0.3857433795928955, + "learning_rate": 9.999940098256747e-05, + "loss": 1.8855, + "step": 15 + }, + { + "epoch": 0.001661992313285551, + "grad_norm": 0.38353338837623596, + "learning_rate": 9.999931845146424e-05, + "loss": 2.0608, + "step": 16 + }, + { + "epoch": 0.001765866832865898, + "grad_norm": 0.46258458495140076, + "learning_rate": 9.999923059582363e-05, + "loss": 2.2282, + "step": 17 + }, + { + "epoch": 0.001869741352446245, + "grad_norm": 0.40921229124069214, + "learning_rate": 9.999913741565496e-05, + "loss": 2.0698, + "step": 18 + }, + { + "epoch": 0.0019736158720265917, + "grad_norm": 0.39826393127441406, + "learning_rate": 9.999903891096817e-05, + "loss": 2.0232, + "step": 19 + }, + { + "epoch": 0.0020774903916069387, + "grad_norm": 0.4504700005054474, + "learning_rate": 9.999893508177373e-05, + "loss": 1.8005, + "step": 20 + }, + { + "epoch": 0.0021813649111872857, + "grad_norm": 0.35965996980667114, + "learning_rate": 9.999882592808274e-05, + "loss": 1.8358, + "step": 21 + }, + { + "epoch": 0.0022852394307676326, + "grad_norm": 0.36941030621528625, + "learning_rate": 9.999871144990677e-05, + "loss": 1.8202, + "step": 22 + }, + { + "epoch": 0.0023891139503479796, + "grad_norm": 0.3631148338317871, + "learning_rate": 9.999859164725804e-05, + "loss": 1.9011, + "step": 23 + }, + { + "epoch": 0.0024929884699283265, + "grad_norm": 0.3719225525856018, + "learning_rate": 9.999846652014931e-05, + "loss": 1.8046, + "step": 24 + }, + { + "epoch": 0.0025968629895086735, + "grad_norm": 0.3886321783065796, + "learning_rate": 9.99983360685939e-05, + "loss": 1.7041, + "step": 25 + }, + { + "epoch": 0.0027007375090890204, + "grad_norm": 0.35539302229881287, + "learning_rate": 9.999820029260569e-05, + "loss": 1.8635, + "step": 26 + }, + { + "epoch": 0.0028046120286693674, + "grad_norm": 0.40370672941207886, + "learning_rate": 9.999805919219916e-05, + "loss": 2.025, + "step": 27 + }, + { + "epoch": 0.0029084865482497143, + "grad_norm": 0.36346563696861267, + "learning_rate": 9.999791276738933e-05, + "loss": 1.9266, + "step": 28 + }, + { + "epoch": 0.0030123610678300613, + "grad_norm": 0.4094865322113037, + "learning_rate": 9.999776101819177e-05, + "loss": 1.923, + "step": 29 + }, + { + "epoch": 0.0031162355874104083, + "grad_norm": 0.33642393350601196, + "learning_rate": 9.999760394462268e-05, + "loss": 1.876, + "step": 30 + }, + { + "epoch": 0.0032201101069907552, + "grad_norm": 0.386692613363266, + "learning_rate": 9.999744154669874e-05, + "loss": 2.0935, + "step": 31 + }, + { + "epoch": 0.003323984626571102, + "grad_norm": 0.32473698258399963, + "learning_rate": 9.999727382443728e-05, + "loss": 1.7944, + "step": 32 + }, + { + "epoch": 0.003427859146151449, + "grad_norm": 0.3738991916179657, + "learning_rate": 9.999710077785616e-05, + "loss": 2.0774, + "step": 33 + }, + { + "epoch": 0.003531733665731796, + "grad_norm": 0.3248012959957123, + "learning_rate": 9.99969224069738e-05, + "loss": 1.7769, + "step": 34 + }, + { + "epoch": 0.003635608185312143, + "grad_norm": 0.3610652685165405, + "learning_rate": 9.99967387118092e-05, + "loss": 1.9222, + "step": 35 + }, + { + "epoch": 0.00373948270489249, + "grad_norm": 0.3891034722328186, + "learning_rate": 9.99965496923819e-05, + "loss": 2.0162, + "step": 36 + }, + { + "epoch": 0.003843357224472837, + "grad_norm": 0.37236684560775757, + "learning_rate": 9.999635534871205e-05, + "loss": 2.0374, + "step": 37 + }, + { + "epoch": 0.0039472317440531835, + "grad_norm": 0.3498350977897644, + "learning_rate": 9.999615568082036e-05, + "loss": 1.7788, + "step": 38 + }, + { + "epoch": 0.0040511062636335304, + "grad_norm": 0.41840648651123047, + "learning_rate": 9.999595068872805e-05, + "loss": 1.6986, + "step": 39 + }, + { + "epoch": 0.004154980783213877, + "grad_norm": 0.3720596134662628, + "learning_rate": 9.9995740372457e-05, + "loss": 1.9754, + "step": 40 + }, + { + "epoch": 0.004258855302794224, + "grad_norm": 0.32730332016944885, + "learning_rate": 9.999552473202955e-05, + "loss": 1.699, + "step": 41 + }, + { + "epoch": 0.004362729822374571, + "grad_norm": 0.36480000615119934, + "learning_rate": 9.999530376746873e-05, + "loss": 2.1077, + "step": 42 + }, + { + "epoch": 0.004466604341954918, + "grad_norm": 0.3458911180496216, + "learning_rate": 9.999507747879802e-05, + "loss": 1.9693, + "step": 43 + }, + { + "epoch": 0.004570478861535265, + "grad_norm": 0.34066182374954224, + "learning_rate": 9.999484586604154e-05, + "loss": 1.9005, + "step": 44 + }, + { + "epoch": 0.004674353381115612, + "grad_norm": 0.34163591265678406, + "learning_rate": 9.999460892922394e-05, + "loss": 1.8829, + "step": 45 + }, + { + "epoch": 0.004778227900695959, + "grad_norm": 0.37044212222099304, + "learning_rate": 9.999436666837048e-05, + "loss": 1.9172, + "step": 46 + }, + { + "epoch": 0.004882102420276306, + "grad_norm": 0.3332674205303192, + "learning_rate": 9.999411908350692e-05, + "loss": 1.8282, + "step": 47 + }, + { + "epoch": 0.004985976939856653, + "grad_norm": 0.3461020886898041, + "learning_rate": 9.999386617465966e-05, + "loss": 1.8652, + "step": 48 + }, + { + "epoch": 0.005089851459437, + "grad_norm": 0.31899985671043396, + "learning_rate": 9.999360794185562e-05, + "loss": 1.9479, + "step": 49 + }, + { + "epoch": 0.005193725979017347, + "grad_norm": 0.3465663194656372, + "learning_rate": 9.999334438512228e-05, + "loss": 1.8856, + "step": 50 + }, + { + "epoch": 0.005297600498597694, + "grad_norm": 0.4567152261734009, + "learning_rate": 9.999307550448776e-05, + "loss": 2.26, + "step": 51 + }, + { + "epoch": 0.005401475018178041, + "grad_norm": 0.36369699239730835, + "learning_rate": 9.999280129998063e-05, + "loss": 1.9131, + "step": 52 + }, + { + "epoch": 0.005505349537758388, + "grad_norm": 0.3090951442718506, + "learning_rate": 9.999252177163013e-05, + "loss": 1.7429, + "step": 53 + }, + { + "epoch": 0.005609224057338735, + "grad_norm": 0.3246748745441437, + "learning_rate": 9.999223691946602e-05, + "loss": 1.8882, + "step": 54 + }, + { + "epoch": 0.005713098576919082, + "grad_norm": 0.3664419949054718, + "learning_rate": 9.999194674351864e-05, + "loss": 2.2768, + "step": 55 + }, + { + "epoch": 0.005816973096499429, + "grad_norm": 0.3436775505542755, + "learning_rate": 9.999165124381887e-05, + "loss": 1.7684, + "step": 56 + }, + { + "epoch": 0.005920847616079776, + "grad_norm": 0.3558364510536194, + "learning_rate": 9.99913504203982e-05, + "loss": 1.8547, + "step": 57 + }, + { + "epoch": 0.006024722135660123, + "grad_norm": 0.3412851393222809, + "learning_rate": 9.999104427328865e-05, + "loss": 1.9773, + "step": 58 + }, + { + "epoch": 0.0061285966552404696, + "grad_norm": 0.3379881978034973, + "learning_rate": 9.999073280252283e-05, + "loss": 1.9732, + "step": 59 + }, + { + "epoch": 0.0062324711748208165, + "grad_norm": 0.38533058762550354, + "learning_rate": 9.999041600813393e-05, + "loss": 1.9285, + "step": 60 + }, + { + "epoch": 0.0063363456944011635, + "grad_norm": 0.3420720398426056, + "learning_rate": 9.999009389015564e-05, + "loss": 1.9619, + "step": 61 + }, + { + "epoch": 0.0064402202139815104, + "grad_norm": 0.3268018662929535, + "learning_rate": 9.99897664486223e-05, + "loss": 2.0373, + "step": 62 + }, + { + "epoch": 0.006544094733561857, + "grad_norm": 0.32013919949531555, + "learning_rate": 9.998943368356877e-05, + "loss": 1.8389, + "step": 63 + }, + { + "epoch": 0.006647969253142204, + "grad_norm": 0.38457608222961426, + "learning_rate": 9.998909559503048e-05, + "loss": 2.0334, + "step": 64 + }, + { + "epoch": 0.006751843772722551, + "grad_norm": 0.35168957710266113, + "learning_rate": 9.998875218304345e-05, + "loss": 2.0735, + "step": 65 + }, + { + "epoch": 0.006855718292302898, + "grad_norm": 0.3252507746219635, + "learning_rate": 9.998840344764422e-05, + "loss": 1.7135, + "step": 66 + }, + { + "epoch": 0.006959592811883245, + "grad_norm": 0.353354275226593, + "learning_rate": 9.998804938886997e-05, + "loss": 1.9234, + "step": 67 + }, + { + "epoch": 0.007063467331463592, + "grad_norm": 0.3150123357772827, + "learning_rate": 9.998769000675836e-05, + "loss": 1.6909, + "step": 68 + }, + { + "epoch": 0.007167341851043939, + "grad_norm": 0.33036699891090393, + "learning_rate": 9.99873253013477e-05, + "loss": 1.7841, + "step": 69 + }, + { + "epoch": 0.007271216370624286, + "grad_norm": 0.3279257118701935, + "learning_rate": 9.998695527267681e-05, + "loss": 1.8755, + "step": 70 + }, + { + "epoch": 0.007375090890204633, + "grad_norm": 0.3096674978733063, + "learning_rate": 9.998657992078509e-05, + "loss": 1.8373, + "step": 71 + }, + { + "epoch": 0.00747896540978498, + "grad_norm": 0.3656545877456665, + "learning_rate": 9.998619924571251e-05, + "loss": 2.0858, + "step": 72 + }, + { + "epoch": 0.007582839929365327, + "grad_norm": 0.31613457202911377, + "learning_rate": 9.998581324749964e-05, + "loss": 1.7906, + "step": 73 + }, + { + "epoch": 0.007686714448945674, + "grad_norm": 0.3255898356437683, + "learning_rate": 9.998542192618755e-05, + "loss": 1.8037, + "step": 74 + }, + { + "epoch": 0.007790588968526021, + "grad_norm": 0.33904996514320374, + "learning_rate": 9.998502528181793e-05, + "loss": 1.6548, + "step": 75 + }, + { + "epoch": 0.007894463488106367, + "grad_norm": 0.3505510985851288, + "learning_rate": 9.998462331443301e-05, + "loss": 2.0898, + "step": 76 + }, + { + "epoch": 0.007998338007686715, + "grad_norm": 0.3301371932029724, + "learning_rate": 9.99842160240756e-05, + "loss": 1.9709, + "step": 77 + }, + { + "epoch": 0.008102212527267061, + "grad_norm": 0.3287624418735504, + "learning_rate": 9.998380341078909e-05, + "loss": 1.6921, + "step": 78 + }, + { + "epoch": 0.008206087046847409, + "grad_norm": 0.3255762755870819, + "learning_rate": 9.99833854746174e-05, + "loss": 1.8009, + "step": 79 + }, + { + "epoch": 0.008309961566427755, + "grad_norm": 0.3210470378398895, + "learning_rate": 9.998296221560504e-05, + "loss": 1.9141, + "step": 80 + }, + { + "epoch": 0.008413836086008103, + "grad_norm": 0.3226737082004547, + "learning_rate": 9.998253363379708e-05, + "loss": 1.8365, + "step": 81 + }, + { + "epoch": 0.008517710605588449, + "grad_norm": 0.3540826439857483, + "learning_rate": 9.998209972923918e-05, + "loss": 2.0948, + "step": 82 + }, + { + "epoch": 0.008621585125168797, + "grad_norm": 0.3137127161026001, + "learning_rate": 9.998166050197751e-05, + "loss": 1.8404, + "step": 83 + }, + { + "epoch": 0.008725459644749143, + "grad_norm": 0.3495839536190033, + "learning_rate": 9.998121595205889e-05, + "loss": 1.8201, + "step": 84 + }, + { + "epoch": 0.00882933416432949, + "grad_norm": 0.3384789824485779, + "learning_rate": 9.998076607953064e-05, + "loss": 1.9134, + "step": 85 + }, + { + "epoch": 0.008933208683909837, + "grad_norm": 0.3618725836277008, + "learning_rate": 9.998031088444067e-05, + "loss": 1.9558, + "step": 86 + }, + { + "epoch": 0.009037083203490184, + "grad_norm": 0.370078444480896, + "learning_rate": 9.997985036683744e-05, + "loss": 2.045, + "step": 87 + }, + { + "epoch": 0.00914095772307053, + "grad_norm": 0.32593002915382385, + "learning_rate": 9.997938452677e-05, + "loss": 1.7929, + "step": 88 + }, + { + "epoch": 0.009244832242650878, + "grad_norm": 0.3358675241470337, + "learning_rate": 9.997891336428797e-05, + "loss": 1.8742, + "step": 89 + }, + { + "epoch": 0.009348706762231224, + "grad_norm": 0.35875624418258667, + "learning_rate": 9.997843687944152e-05, + "loss": 1.9634, + "step": 90 + }, + { + "epoch": 0.009452581281811572, + "grad_norm": 0.3237409293651581, + "learning_rate": 9.997795507228139e-05, + "loss": 1.8137, + "step": 91 + }, + { + "epoch": 0.009556455801391918, + "grad_norm": 0.3336571455001831, + "learning_rate": 9.997746794285887e-05, + "loss": 1.9342, + "step": 92 + }, + { + "epoch": 0.009660330320972266, + "grad_norm": 0.3358410894870758, + "learning_rate": 9.997697549122586e-05, + "loss": 1.8772, + "step": 93 + }, + { + "epoch": 0.009764204840552612, + "grad_norm": 0.3093462884426117, + "learning_rate": 9.997647771743482e-05, + "loss": 1.8048, + "step": 94 + }, + { + "epoch": 0.00986807936013296, + "grad_norm": 0.34004223346710205, + "learning_rate": 9.997597462153871e-05, + "loss": 1.8228, + "step": 95 + }, + { + "epoch": 0.009971953879713306, + "grad_norm": 0.33898910880088806, + "learning_rate": 9.997546620359114e-05, + "loss": 1.8203, + "step": 96 + }, + { + "epoch": 0.010075828399293654, + "grad_norm": 0.3222779929637909, + "learning_rate": 9.997495246364624e-05, + "loss": 1.7667, + "step": 97 + }, + { + "epoch": 0.010179702918874, + "grad_norm": 0.3132547438144684, + "learning_rate": 9.997443340175872e-05, + "loss": 1.7176, + "step": 98 + }, + { + "epoch": 0.010283577438454348, + "grad_norm": 0.30829909443855286, + "learning_rate": 9.997390901798386e-05, + "loss": 1.7196, + "step": 99 + }, + { + "epoch": 0.010387451958034694, + "grad_norm": 0.3239877223968506, + "learning_rate": 9.997337931237751e-05, + "loss": 1.6993, + "step": 100 + }, + { + "epoch": 0.010491326477615042, + "grad_norm": 0.34558477997779846, + "learning_rate": 9.997284428499605e-05, + "loss": 1.9082, + "step": 101 + }, + { + "epoch": 0.010595200997195388, + "grad_norm": 0.34653279185295105, + "learning_rate": 9.99723039358965e-05, + "loss": 1.8484, + "step": 102 + }, + { + "epoch": 0.010699075516775736, + "grad_norm": 0.4017353057861328, + "learning_rate": 9.997175826513637e-05, + "loss": 2.0352, + "step": 103 + }, + { + "epoch": 0.010802950036356082, + "grad_norm": 0.3651774227619171, + "learning_rate": 9.997120727277378e-05, + "loss": 2.1032, + "step": 104 + }, + { + "epoch": 0.01090682455593643, + "grad_norm": 0.32084140181541443, + "learning_rate": 9.997065095886741e-05, + "loss": 1.7161, + "step": 105 + }, + { + "epoch": 0.011010699075516776, + "grad_norm": 0.3476540446281433, + "learning_rate": 9.997008932347649e-05, + "loss": 1.8586, + "step": 106 + }, + { + "epoch": 0.011114573595097123, + "grad_norm": 0.3458161950111389, + "learning_rate": 9.996952236666084e-05, + "loss": 1.8134, + "step": 107 + }, + { + "epoch": 0.01121844811467747, + "grad_norm": 0.3564072549343109, + "learning_rate": 9.996895008848085e-05, + "loss": 1.9237, + "step": 108 + }, + { + "epoch": 0.011322322634257817, + "grad_norm": 0.33512404561042786, + "learning_rate": 9.996837248899741e-05, + "loss": 1.8897, + "step": 109 + }, + { + "epoch": 0.011426197153838163, + "grad_norm": 0.36306363344192505, + "learning_rate": 9.99677895682721e-05, + "loss": 1.9856, + "step": 110 + }, + { + "epoch": 0.011530071673418511, + "grad_norm": 0.32571837306022644, + "learning_rate": 9.996720132636697e-05, + "loss": 1.6574, + "step": 111 + }, + { + "epoch": 0.011633946192998857, + "grad_norm": 0.3236237168312073, + "learning_rate": 9.996660776334464e-05, + "loss": 1.7604, + "step": 112 + }, + { + "epoch": 0.011737820712579203, + "grad_norm": 0.34493985772132874, + "learning_rate": 9.996600887926834e-05, + "loss": 1.766, + "step": 113 + }, + { + "epoch": 0.011841695232159551, + "grad_norm": 0.34191280603408813, + "learning_rate": 9.996540467420186e-05, + "loss": 1.7565, + "step": 114 + }, + { + "epoch": 0.011945569751739897, + "grad_norm": 0.33236685395240784, + "learning_rate": 9.996479514820952e-05, + "loss": 1.8874, + "step": 115 + }, + { + "epoch": 0.012049444271320245, + "grad_norm": 0.37998783588409424, + "learning_rate": 9.996418030135622e-05, + "loss": 2.0761, + "step": 116 + }, + { + "epoch": 0.012153318790900591, + "grad_norm": 0.34933921694755554, + "learning_rate": 9.996356013370747e-05, + "loss": 2.0269, + "step": 117 + }, + { + "epoch": 0.012257193310480939, + "grad_norm": 0.3566916286945343, + "learning_rate": 9.99629346453293e-05, + "loss": 1.7982, + "step": 118 + }, + { + "epoch": 0.012361067830061285, + "grad_norm": 0.36033695936203003, + "learning_rate": 9.996230383628831e-05, + "loss": 1.9449, + "step": 119 + }, + { + "epoch": 0.012464942349641633, + "grad_norm": 0.34493738412857056, + "learning_rate": 9.996166770665167e-05, + "loss": 1.9453, + "step": 120 + }, + { + "epoch": 0.012568816869221979, + "grad_norm": 0.3142033815383911, + "learning_rate": 9.996102625648715e-05, + "loss": 1.7562, + "step": 121 + }, + { + "epoch": 0.012672691388802327, + "grad_norm": 0.3889004588127136, + "learning_rate": 9.996037948586305e-05, + "loss": 1.9439, + "step": 122 + }, + { + "epoch": 0.012776565908382673, + "grad_norm": 0.32579320669174194, + "learning_rate": 9.995972739484822e-05, + "loss": 1.7729, + "step": 123 + }, + { + "epoch": 0.012880440427963021, + "grad_norm": 0.3976999521255493, + "learning_rate": 9.995906998351215e-05, + "loss": 2.246, + "step": 124 + }, + { + "epoch": 0.012984314947543367, + "grad_norm": 0.38258862495422363, + "learning_rate": 9.99584072519248e-05, + "loss": 1.9098, + "step": 125 + }, + { + "epoch": 0.013088189467123715, + "grad_norm": 0.3394373655319214, + "learning_rate": 9.995773920015679e-05, + "loss": 1.7569, + "step": 126 + }, + { + "epoch": 0.013192063986704061, + "grad_norm": 0.3687582015991211, + "learning_rate": 9.995706582827922e-05, + "loss": 2.0098, + "step": 127 + }, + { + "epoch": 0.013295938506284409, + "grad_norm": 0.36636659502983093, + "learning_rate": 9.995638713636382e-05, + "loss": 1.9661, + "step": 128 + }, + { + "epoch": 0.013399813025864755, + "grad_norm": 0.33423036336898804, + "learning_rate": 9.995570312448289e-05, + "loss": 1.768, + "step": 129 + }, + { + "epoch": 0.013503687545445103, + "grad_norm": 0.34405717253685, + "learning_rate": 9.995501379270922e-05, + "loss": 1.649, + "step": 130 + }, + { + "epoch": 0.013607562065025449, + "grad_norm": 0.352104514837265, + "learning_rate": 9.995431914111624e-05, + "loss": 1.904, + "step": 131 + }, + { + "epoch": 0.013711436584605797, + "grad_norm": 0.34278222918510437, + "learning_rate": 9.995361916977795e-05, + "loss": 1.9214, + "step": 132 + }, + { + "epoch": 0.013815311104186143, + "grad_norm": 0.3315337300300598, + "learning_rate": 9.995291387876886e-05, + "loss": 1.9647, + "step": 133 + }, + { + "epoch": 0.01391918562376649, + "grad_norm": 0.349161297082901, + "learning_rate": 9.99522032681641e-05, + "loss": 1.8797, + "step": 134 + }, + { + "epoch": 0.014023060143346837, + "grad_norm": 0.32067403197288513, + "learning_rate": 9.995148733803933e-05, + "loss": 1.9065, + "step": 135 + }, + { + "epoch": 0.014126934662927184, + "grad_norm": 0.336671382188797, + "learning_rate": 9.995076608847078e-05, + "loss": 1.7649, + "step": 136 + }, + { + "epoch": 0.01423080918250753, + "grad_norm": 0.3314901292324066, + "learning_rate": 9.99500395195353e-05, + "loss": 1.7482, + "step": 137 + }, + { + "epoch": 0.014334683702087878, + "grad_norm": 0.3493605852127075, + "learning_rate": 9.994930763131022e-05, + "loss": 2.0021, + "step": 138 + }, + { + "epoch": 0.014438558221668224, + "grad_norm": 0.37969377636909485, + "learning_rate": 9.99485704238735e-05, + "loss": 1.9293, + "step": 139 + }, + { + "epoch": 0.014542432741248572, + "grad_norm": 0.31969866156578064, + "learning_rate": 9.994782789730363e-05, + "loss": 1.789, + "step": 140 + }, + { + "epoch": 0.014646307260828918, + "grad_norm": 0.3660503625869751, + "learning_rate": 9.994708005167972e-05, + "loss": 2.0092, + "step": 141 + }, + { + "epoch": 0.014750181780409266, + "grad_norm": 0.346720427274704, + "learning_rate": 9.994632688708138e-05, + "loss": 1.9393, + "step": 142 + }, + { + "epoch": 0.014854056299989612, + "grad_norm": 0.3383733630180359, + "learning_rate": 9.994556840358882e-05, + "loss": 1.7967, + "step": 143 + }, + { + "epoch": 0.01495793081956996, + "grad_norm": 0.3345518708229065, + "learning_rate": 9.994480460128282e-05, + "loss": 1.8901, + "step": 144 + }, + { + "epoch": 0.015061805339150306, + "grad_norm": 0.34996676445007324, + "learning_rate": 9.99440354802447e-05, + "loss": 1.8006, + "step": 145 + }, + { + "epoch": 0.015165679858730654, + "grad_norm": 0.34575289487838745, + "learning_rate": 9.994326104055639e-05, + "loss": 1.9967, + "step": 146 + }, + { + "epoch": 0.015269554378311, + "grad_norm": 0.39413705468177795, + "learning_rate": 9.994248128230035e-05, + "loss": 1.9811, + "step": 147 + }, + { + "epoch": 0.015373428897891348, + "grad_norm": 0.3441784083843231, + "learning_rate": 9.994169620555962e-05, + "loss": 1.7302, + "step": 148 + }, + { + "epoch": 0.015477303417471694, + "grad_norm": 0.32033270597457886, + "learning_rate": 9.994090581041781e-05, + "loss": 1.8822, + "step": 149 + }, + { + "epoch": 0.015581177937052042, + "grad_norm": 0.37517550587654114, + "learning_rate": 9.994011009695908e-05, + "loss": 2.0916, + "step": 150 + }, + { + "epoch": 0.01568505245663239, + "grad_norm": 0.3081076145172119, + "learning_rate": 9.993930906526818e-05, + "loss": 1.6987, + "step": 151 + }, + { + "epoch": 0.015788926976212734, + "grad_norm": 0.31799814105033875, + "learning_rate": 9.993850271543039e-05, + "loss": 1.6714, + "step": 152 + }, + { + "epoch": 0.015892801495793082, + "grad_norm": 0.3602600693702698, + "learning_rate": 9.99376910475316e-05, + "loss": 2.0209, + "step": 153 + }, + { + "epoch": 0.01599667601537343, + "grad_norm": 0.31287044286727905, + "learning_rate": 9.993687406165826e-05, + "loss": 1.6677, + "step": 154 + }, + { + "epoch": 0.016100550534953777, + "grad_norm": 0.34423595666885376, + "learning_rate": 9.993605175789733e-05, + "loss": 1.863, + "step": 155 + }, + { + "epoch": 0.016204425054534122, + "grad_norm": 0.3550304174423218, + "learning_rate": 9.993522413633643e-05, + "loss": 1.9312, + "step": 156 + }, + { + "epoch": 0.01630829957411447, + "grad_norm": 0.3672383725643158, + "learning_rate": 9.993439119706364e-05, + "loss": 2.0446, + "step": 157 + }, + { + "epoch": 0.016412174093694817, + "grad_norm": 0.32233211398124695, + "learning_rate": 9.993355294016771e-05, + "loss": 1.7413, + "step": 158 + }, + { + "epoch": 0.016516048613275165, + "grad_norm": 0.33033961057662964, + "learning_rate": 9.993270936573788e-05, + "loss": 1.8413, + "step": 159 + }, + { + "epoch": 0.01661992313285551, + "grad_norm": 0.3530619442462921, + "learning_rate": 9.9931860473864e-05, + "loss": 1.9083, + "step": 160 + }, + { + "epoch": 0.016723797652435857, + "grad_norm": 0.33920934796333313, + "learning_rate": 9.993100626463646e-05, + "loss": 1.8213, + "step": 161 + }, + { + "epoch": 0.016827672172016205, + "grad_norm": 0.3833538889884949, + "learning_rate": 9.993014673814624e-05, + "loss": 2.1405, + "step": 162 + }, + { + "epoch": 0.016931546691596553, + "grad_norm": 0.35124388337135315, + "learning_rate": 9.992928189448484e-05, + "loss": 1.8398, + "step": 163 + }, + { + "epoch": 0.017035421211176897, + "grad_norm": 0.34892386198043823, + "learning_rate": 9.992841173374441e-05, + "loss": 1.8151, + "step": 164 + }, + { + "epoch": 0.017139295730757245, + "grad_norm": 0.33861371874809265, + "learning_rate": 9.992753625601756e-05, + "loss": 1.7062, + "step": 165 + }, + { + "epoch": 0.017243170250337593, + "grad_norm": 0.34012502431869507, + "learning_rate": 9.992665546139757e-05, + "loss": 1.6682, + "step": 166 + }, + { + "epoch": 0.017347044769917937, + "grad_norm": 0.38989123702049255, + "learning_rate": 9.992576934997819e-05, + "loss": 1.9738, + "step": 167 + }, + { + "epoch": 0.017450919289498285, + "grad_norm": 0.37011152505874634, + "learning_rate": 9.992487792185383e-05, + "loss": 1.9593, + "step": 168 + }, + { + "epoch": 0.017554793809078633, + "grad_norm": 0.341496080160141, + "learning_rate": 9.992398117711941e-05, + "loss": 1.8838, + "step": 169 + }, + { + "epoch": 0.01765866832865898, + "grad_norm": 0.3466763496398926, + "learning_rate": 9.99230791158704e-05, + "loss": 1.7701, + "step": 170 + }, + { + "epoch": 0.017762542848239325, + "grad_norm": 0.36252373456954956, + "learning_rate": 9.992217173820288e-05, + "loss": 1.8656, + "step": 171 + }, + { + "epoch": 0.017866417367819673, + "grad_norm": 0.3116392195224762, + "learning_rate": 9.992125904421348e-05, + "loss": 1.7344, + "step": 172 + }, + { + "epoch": 0.01797029188740002, + "grad_norm": 0.3929993808269501, + "learning_rate": 9.992034103399939e-05, + "loss": 1.6855, + "step": 173 + }, + { + "epoch": 0.01807416640698037, + "grad_norm": 0.3422747552394867, + "learning_rate": 9.991941770765838e-05, + "loss": 1.8554, + "step": 174 + }, + { + "epoch": 0.018178040926560713, + "grad_norm": 0.3650684058666229, + "learning_rate": 9.991848906528876e-05, + "loss": 1.9407, + "step": 175 + }, + { + "epoch": 0.01828191544614106, + "grad_norm": 0.35484546422958374, + "learning_rate": 9.991755510698944e-05, + "loss": 1.8553, + "step": 176 + }, + { + "epoch": 0.01838578996572141, + "grad_norm": 0.3734646737575531, + "learning_rate": 9.991661583285987e-05, + "loss": 1.77, + "step": 177 + }, + { + "epoch": 0.018489664485301757, + "grad_norm": 0.3550145924091339, + "learning_rate": 9.991567124300009e-05, + "loss": 1.8825, + "step": 178 + }, + { + "epoch": 0.0185935390048821, + "grad_norm": 0.3555574119091034, + "learning_rate": 9.991472133751067e-05, + "loss": 1.9889, + "step": 179 + }, + { + "epoch": 0.01869741352446245, + "grad_norm": 0.32128405570983887, + "learning_rate": 9.991376611649279e-05, + "loss": 1.683, + "step": 180 + }, + { + "epoch": 0.018801288044042797, + "grad_norm": 0.3892490863800049, + "learning_rate": 9.991280558004815e-05, + "loss": 1.9933, + "step": 181 + }, + { + "epoch": 0.018905162563623144, + "grad_norm": 0.32436904311180115, + "learning_rate": 9.991183972827904e-05, + "loss": 1.8897, + "step": 182 + }, + { + "epoch": 0.01900903708320349, + "grad_norm": 0.3443160355091095, + "learning_rate": 9.991086856128833e-05, + "loss": 1.6934, + "step": 183 + }, + { + "epoch": 0.019112911602783837, + "grad_norm": 0.3968258202075958, + "learning_rate": 9.990989207917944e-05, + "loss": 1.9867, + "step": 184 + }, + { + "epoch": 0.019216786122364184, + "grad_norm": 0.3794998526573181, + "learning_rate": 9.990891028205636e-05, + "loss": 2.0397, + "step": 185 + }, + { + "epoch": 0.019320660641944532, + "grad_norm": 0.3027797043323517, + "learning_rate": 9.990792317002364e-05, + "loss": 1.6816, + "step": 186 + }, + { + "epoch": 0.019424535161524877, + "grad_norm": 0.3459644913673401, + "learning_rate": 9.99069307431864e-05, + "loss": 1.8124, + "step": 187 + }, + { + "epoch": 0.019528409681105224, + "grad_norm": 0.3326679766178131, + "learning_rate": 9.990593300165033e-05, + "loss": 1.6452, + "step": 188 + }, + { + "epoch": 0.019632284200685572, + "grad_norm": 0.3605174422264099, + "learning_rate": 9.990492994552167e-05, + "loss": 2.088, + "step": 189 + }, + { + "epoch": 0.01973615872026592, + "grad_norm": 0.3342396914958954, + "learning_rate": 9.990392157490724e-05, + "loss": 1.9396, + "step": 190 + }, + { + "epoch": 0.019840033239846264, + "grad_norm": 0.37349754571914673, + "learning_rate": 9.990290788991443e-05, + "loss": 1.9863, + "step": 191 + }, + { + "epoch": 0.019943907759426612, + "grad_norm": 0.3778427541255951, + "learning_rate": 9.99018888906512e-05, + "loss": 1.902, + "step": 192 + }, + { + "epoch": 0.02004778227900696, + "grad_norm": 0.3246352970600128, + "learning_rate": 9.990086457722604e-05, + "loss": 1.8484, + "step": 193 + }, + { + "epoch": 0.020151656798587308, + "grad_norm": 0.37238767743110657, + "learning_rate": 9.989983494974805e-05, + "loss": 1.9543, + "step": 194 + }, + { + "epoch": 0.020255531318167652, + "grad_norm": 0.37035781145095825, + "learning_rate": 9.989880000832686e-05, + "loss": 1.8695, + "step": 195 + }, + { + "epoch": 0.020359405837748, + "grad_norm": 0.3354018032550812, + "learning_rate": 9.989775975307272e-05, + "loss": 1.7589, + "step": 196 + }, + { + "epoch": 0.020463280357328348, + "grad_norm": 0.4081863462924957, + "learning_rate": 9.989671418409636e-05, + "loss": 2.1527, + "step": 197 + }, + { + "epoch": 0.020567154876908696, + "grad_norm": 0.33487629890441895, + "learning_rate": 9.989566330150914e-05, + "loss": 1.7132, + "step": 198 + }, + { + "epoch": 0.02067102939648904, + "grad_norm": 0.3445211350917816, + "learning_rate": 9.989460710542301e-05, + "loss": 1.8913, + "step": 199 + }, + { + "epoch": 0.020774903916069388, + "grad_norm": 0.40155649185180664, + "learning_rate": 9.98935455959504e-05, + "loss": 2.191, + "step": 200 + }, + { + "epoch": 0.020878778435649736, + "grad_norm": 0.35195931792259216, + "learning_rate": 9.989247877320436e-05, + "loss": 1.9199, + "step": 201 + }, + { + "epoch": 0.020982652955230083, + "grad_norm": 0.34090521931648254, + "learning_rate": 9.989140663729852e-05, + "loss": 1.831, + "step": 202 + }, + { + "epoch": 0.021086527474810428, + "grad_norm": 0.3321789801120758, + "learning_rate": 9.989032918834704e-05, + "loss": 1.8922, + "step": 203 + }, + { + "epoch": 0.021190401994390776, + "grad_norm": 0.321913480758667, + "learning_rate": 9.988924642646466e-05, + "loss": 1.8271, + "step": 204 + }, + { + "epoch": 0.021294276513971123, + "grad_norm": 0.39516186714172363, + "learning_rate": 9.988815835176668e-05, + "loss": 1.9391, + "step": 205 + }, + { + "epoch": 0.02139815103355147, + "grad_norm": 0.35352519154548645, + "learning_rate": 9.9887064964369e-05, + "loss": 1.6495, + "step": 206 + }, + { + "epoch": 0.021502025553131816, + "grad_norm": 0.3435503840446472, + "learning_rate": 9.988596626438801e-05, + "loss": 1.8272, + "step": 207 + }, + { + "epoch": 0.021605900072712163, + "grad_norm": 0.35792356729507446, + "learning_rate": 9.988486225194075e-05, + "loss": 1.8506, + "step": 208 + }, + { + "epoch": 0.02170977459229251, + "grad_norm": 0.32657817006111145, + "learning_rate": 9.988375292714478e-05, + "loss": 1.8219, + "step": 209 + }, + { + "epoch": 0.02181364911187286, + "grad_norm": 0.35596343874931335, + "learning_rate": 9.988263829011822e-05, + "loss": 1.8984, + "step": 210 + }, + { + "epoch": 0.021917523631453203, + "grad_norm": 0.35671266913414, + "learning_rate": 9.988151834097979e-05, + "loss": 1.8276, + "step": 211 + }, + { + "epoch": 0.02202139815103355, + "grad_norm": 0.3640732765197754, + "learning_rate": 9.988039307984874e-05, + "loss": 1.8979, + "step": 212 + }, + { + "epoch": 0.0221252726706139, + "grad_norm": 0.32779642939567566, + "learning_rate": 9.987926250684491e-05, + "loss": 1.8392, + "step": 213 + }, + { + "epoch": 0.022229147190194247, + "grad_norm": 0.3471597731113434, + "learning_rate": 9.98781266220887e-05, + "loss": 1.8287, + "step": 214 + }, + { + "epoch": 0.02233302170977459, + "grad_norm": 0.3553629219532013, + "learning_rate": 9.987698542570107e-05, + "loss": 1.8612, + "step": 215 + }, + { + "epoch": 0.02243689622935494, + "grad_norm": 0.3513118028640747, + "learning_rate": 9.987583891780355e-05, + "loss": 1.9711, + "step": 216 + }, + { + "epoch": 0.022540770748935287, + "grad_norm": 0.3648374378681183, + "learning_rate": 9.987468709851823e-05, + "loss": 1.7615, + "step": 217 + }, + { + "epoch": 0.022644645268515635, + "grad_norm": 0.3236382007598877, + "learning_rate": 9.987352996796777e-05, + "loss": 1.694, + "step": 218 + }, + { + "epoch": 0.02274851978809598, + "grad_norm": 0.3441942632198334, + "learning_rate": 9.98723675262754e-05, + "loss": 1.7834, + "step": 219 + }, + { + "epoch": 0.022852394307676327, + "grad_norm": 0.3617098331451416, + "learning_rate": 9.987119977356491e-05, + "loss": 1.8788, + "step": 220 + }, + { + "epoch": 0.022956268827256675, + "grad_norm": 0.3559406101703644, + "learning_rate": 9.987002670996064e-05, + "loss": 1.9663, + "step": 221 + }, + { + "epoch": 0.023060143346837023, + "grad_norm": 0.3278948664665222, + "learning_rate": 9.986884833558754e-05, + "loss": 1.822, + "step": 222 + }, + { + "epoch": 0.023164017866417367, + "grad_norm": 0.3330172896385193, + "learning_rate": 9.986766465057108e-05, + "loss": 1.8147, + "step": 223 + }, + { + "epoch": 0.023267892385997715, + "grad_norm": 0.3398604094982147, + "learning_rate": 9.986647565503731e-05, + "loss": 1.7854, + "step": 224 + }, + { + "epoch": 0.023371766905578063, + "grad_norm": 0.2998906970024109, + "learning_rate": 9.986528134911288e-05, + "loss": 1.6783, + "step": 225 + }, + { + "epoch": 0.023475641425158407, + "grad_norm": 0.3525906801223755, + "learning_rate": 9.986408173292492e-05, + "loss": 1.9996, + "step": 226 + }, + { + "epoch": 0.023579515944738755, + "grad_norm": 0.34057801961898804, + "learning_rate": 9.986287680660123e-05, + "loss": 1.8241, + "step": 227 + }, + { + "epoch": 0.023683390464319103, + "grad_norm": 0.3602697551250458, + "learning_rate": 9.98616665702701e-05, + "loss": 1.9336, + "step": 228 + }, + { + "epoch": 0.02378726498389945, + "grad_norm": 0.31595826148986816, + "learning_rate": 9.986045102406042e-05, + "loss": 1.7719, + "step": 229 + }, + { + "epoch": 0.023891139503479795, + "grad_norm": 0.34162575006484985, + "learning_rate": 9.985923016810163e-05, + "loss": 1.8026, + "step": 230 + }, + { + "epoch": 0.023995014023060143, + "grad_norm": 0.3397705852985382, + "learning_rate": 9.985800400252374e-05, + "loss": 1.9196, + "step": 231 + }, + { + "epoch": 0.02409888854264049, + "grad_norm": 0.3344639539718628, + "learning_rate": 9.985677252745733e-05, + "loss": 1.8512, + "step": 232 + }, + { + "epoch": 0.024202763062220838, + "grad_norm": 0.44500732421875, + "learning_rate": 9.985553574303354e-05, + "loss": 1.8336, + "step": 233 + }, + { + "epoch": 0.024306637581801183, + "grad_norm": 0.3289027512073517, + "learning_rate": 9.98542936493841e-05, + "loss": 1.6927, + "step": 234 + }, + { + "epoch": 0.02441051210138153, + "grad_norm": 0.3510255217552185, + "learning_rate": 9.985304624664125e-05, + "loss": 1.8582, + "step": 235 + }, + { + "epoch": 0.024514386620961878, + "grad_norm": 0.3442867696285248, + "learning_rate": 9.985179353493785e-05, + "loss": 1.9107, + "step": 236 + }, + { + "epoch": 0.024618261140542226, + "grad_norm": 0.3105999529361725, + "learning_rate": 9.98505355144073e-05, + "loss": 1.6351, + "step": 237 + }, + { + "epoch": 0.02472213566012257, + "grad_norm": 0.3260045647621155, + "learning_rate": 9.984927218518356e-05, + "loss": 1.7564, + "step": 238 + }, + { + "epoch": 0.024826010179702918, + "grad_norm": 0.32900920510292053, + "learning_rate": 9.984800354740117e-05, + "loss": 1.7372, + "step": 239 + }, + { + "epoch": 0.024929884699283266, + "grad_norm": 0.3402916193008423, + "learning_rate": 9.984672960119523e-05, + "loss": 1.7131, + "step": 240 + }, + { + "epoch": 0.025033759218863614, + "grad_norm": 0.35345762968063354, + "learning_rate": 9.984545034670142e-05, + "loss": 1.7636, + "step": 241 + }, + { + "epoch": 0.025137633738443958, + "grad_norm": 0.3537079393863678, + "learning_rate": 9.984416578405596e-05, + "loss": 1.9847, + "step": 242 + }, + { + "epoch": 0.025241508258024306, + "grad_norm": 0.3712044358253479, + "learning_rate": 9.984287591339562e-05, + "loss": 2.0124, + "step": 243 + }, + { + "epoch": 0.025345382777604654, + "grad_norm": 0.3348478376865387, + "learning_rate": 9.98415807348578e-05, + "loss": 1.8527, + "step": 244 + }, + { + "epoch": 0.025449257297185002, + "grad_norm": 0.3452380299568176, + "learning_rate": 9.984028024858041e-05, + "loss": 1.8608, + "step": 245 + }, + { + "epoch": 0.025553131816765346, + "grad_norm": 0.38733747601509094, + "learning_rate": 9.983897445470194e-05, + "loss": 2.029, + "step": 246 + }, + { + "epoch": 0.025657006336345694, + "grad_norm": 0.3828756809234619, + "learning_rate": 9.983766335336144e-05, + "loss": 1.9682, + "step": 247 + }, + { + "epoch": 0.025760880855926042, + "grad_norm": 0.38744086027145386, + "learning_rate": 9.983634694469855e-05, + "loss": 1.6414, + "step": 248 + }, + { + "epoch": 0.02586475537550639, + "grad_norm": 0.3316837251186371, + "learning_rate": 9.983502522885347e-05, + "loss": 1.7982, + "step": 249 + }, + { + "epoch": 0.025968629895086734, + "grad_norm": 0.3396342694759369, + "learning_rate": 9.983369820596691e-05, + "loss": 1.8246, + "step": 250 + }, + { + "epoch": 0.026072504414667082, + "grad_norm": 0.3051803708076477, + "learning_rate": 9.98323658761802e-05, + "loss": 1.733, + "step": 251 + }, + { + "epoch": 0.02617637893424743, + "grad_norm": 0.31522175669670105, + "learning_rate": 9.983102823963524e-05, + "loss": 1.6995, + "step": 252 + }, + { + "epoch": 0.026280253453827777, + "grad_norm": 0.33415964245796204, + "learning_rate": 9.982968529647447e-05, + "loss": 1.5215, + "step": 253 + }, + { + "epoch": 0.026384127973408122, + "grad_norm": 0.35238194465637207, + "learning_rate": 9.982833704684091e-05, + "loss": 1.8707, + "step": 254 + }, + { + "epoch": 0.02648800249298847, + "grad_norm": 0.3153392970561981, + "learning_rate": 9.982698349087812e-05, + "loss": 1.7123, + "step": 255 + }, + { + "epoch": 0.026591877012568817, + "grad_norm": 0.3397294282913208, + "learning_rate": 9.982562462873026e-05, + "loss": 1.7335, + "step": 256 + }, + { + "epoch": 0.026695751532149165, + "grad_norm": 0.3764454126358032, + "learning_rate": 9.982426046054204e-05, + "loss": 1.9431, + "step": 257 + }, + { + "epoch": 0.02679962605172951, + "grad_norm": 0.3482568860054016, + "learning_rate": 9.982289098645872e-05, + "loss": 1.8024, + "step": 258 + }, + { + "epoch": 0.026903500571309857, + "grad_norm": 0.3208499550819397, + "learning_rate": 9.982151620662612e-05, + "loss": 1.8129, + "step": 259 + }, + { + "epoch": 0.027007375090890205, + "grad_norm": 0.3308181166648865, + "learning_rate": 9.98201361211907e-05, + "loss": 1.8653, + "step": 260 + }, + { + "epoch": 0.027111249610470553, + "grad_norm": 0.3466727137565613, + "learning_rate": 9.981875073029938e-05, + "loss": 1.7716, + "step": 261 + }, + { + "epoch": 0.027215124130050897, + "grad_norm": 0.33778107166290283, + "learning_rate": 9.981736003409971e-05, + "loss": 1.8481, + "step": 262 + }, + { + "epoch": 0.027318998649631245, + "grad_norm": 0.367851197719574, + "learning_rate": 9.981596403273978e-05, + "loss": 2.0175, + "step": 263 + }, + { + "epoch": 0.027422873169211593, + "grad_norm": 0.3659020662307739, + "learning_rate": 9.981456272636826e-05, + "loss": 1.6928, + "step": 264 + }, + { + "epoch": 0.02752674768879194, + "grad_norm": 0.35648471117019653, + "learning_rate": 9.981315611513438e-05, + "loss": 1.9024, + "step": 265 + }, + { + "epoch": 0.027630622208372285, + "grad_norm": 0.3503887355327606, + "learning_rate": 9.981174419918795e-05, + "loss": 1.8439, + "step": 266 + }, + { + "epoch": 0.027734496727952633, + "grad_norm": 0.37520813941955566, + "learning_rate": 9.981032697867929e-05, + "loss": 2.0282, + "step": 267 + }, + { + "epoch": 0.02783837124753298, + "grad_norm": 0.3300826847553253, + "learning_rate": 9.980890445375934e-05, + "loss": 1.7793, + "step": 268 + }, + { + "epoch": 0.02794224576711333, + "grad_norm": 0.32417240738868713, + "learning_rate": 9.980747662457961e-05, + "loss": 1.8332, + "step": 269 + }, + { + "epoch": 0.028046120286693673, + "grad_norm": 0.32837119698524475, + "learning_rate": 9.980604349129211e-05, + "loss": 1.6546, + "step": 270 + }, + { + "epoch": 0.02814999480627402, + "grad_norm": 0.36148953437805176, + "learning_rate": 9.980460505404949e-05, + "loss": 1.9148, + "step": 271 + }, + { + "epoch": 0.02825386932585437, + "grad_norm": 0.3625042736530304, + "learning_rate": 9.980316131300493e-05, + "loss": 1.8963, + "step": 272 + }, + { + "epoch": 0.028357743845434717, + "grad_norm": 0.322068452835083, + "learning_rate": 9.980171226831216e-05, + "loss": 1.7182, + "step": 273 + }, + { + "epoch": 0.02846161836501506, + "grad_norm": 0.3443426787853241, + "learning_rate": 9.980025792012551e-05, + "loss": 1.7612, + "step": 274 + }, + { + "epoch": 0.02856549288459541, + "grad_norm": 0.3243624269962311, + "learning_rate": 9.979879826859983e-05, + "loss": 1.8107, + "step": 275 + }, + { + "epoch": 0.028669367404175757, + "grad_norm": 0.34362664818763733, + "learning_rate": 9.97973333138906e-05, + "loss": 1.8116, + "step": 276 + }, + { + "epoch": 0.028773241923756104, + "grad_norm": 0.3423418402671814, + "learning_rate": 9.97958630561538e-05, + "loss": 1.968, + "step": 277 + }, + { + "epoch": 0.02887711644333645, + "grad_norm": 0.3265102207660675, + "learning_rate": 9.9794387495546e-05, + "loss": 1.7281, + "step": 278 + }, + { + "epoch": 0.028980990962916797, + "grad_norm": 0.34078219532966614, + "learning_rate": 9.979290663222434e-05, + "loss": 1.8249, + "step": 279 + }, + { + "epoch": 0.029084865482497144, + "grad_norm": 0.34047967195510864, + "learning_rate": 9.979142046634653e-05, + "loss": 1.8084, + "step": 280 + }, + { + "epoch": 0.02918874000207749, + "grad_norm": 0.36841025948524475, + "learning_rate": 9.978992899807084e-05, + "loss": 1.9606, + "step": 281 + }, + { + "epoch": 0.029292614521657837, + "grad_norm": 0.32618024945259094, + "learning_rate": 9.978843222755607e-05, + "loss": 1.8753, + "step": 282 + }, + { + "epoch": 0.029396489041238184, + "grad_norm": 0.33189335465431213, + "learning_rate": 9.978693015496165e-05, + "loss": 1.948, + "step": 283 + }, + { + "epoch": 0.029500363560818532, + "grad_norm": 0.32388558983802795, + "learning_rate": 9.978542278044751e-05, + "loss": 1.8796, + "step": 284 + }, + { + "epoch": 0.029604238080398877, + "grad_norm": 0.34063830971717834, + "learning_rate": 9.978391010417418e-05, + "loss": 1.8221, + "step": 285 + }, + { + "epoch": 0.029708112599979224, + "grad_norm": 0.32267820835113525, + "learning_rate": 9.978239212630277e-05, + "loss": 1.7903, + "step": 286 + }, + { + "epoch": 0.029811987119559572, + "grad_norm": 0.35702529549598694, + "learning_rate": 9.978086884699492e-05, + "loss": 1.9183, + "step": 287 + }, + { + "epoch": 0.02991586163913992, + "grad_norm": 0.36471986770629883, + "learning_rate": 9.977934026641282e-05, + "loss": 1.8877, + "step": 288 + }, + { + "epoch": 0.030019736158720264, + "grad_norm": 0.3516945540904999, + "learning_rate": 9.977780638471928e-05, + "loss": 1.7437, + "step": 289 + }, + { + "epoch": 0.030123610678300612, + "grad_norm": 0.37206095457077026, + "learning_rate": 9.977626720207764e-05, + "loss": 1.794, + "step": 290 + }, + { + "epoch": 0.03022748519788096, + "grad_norm": 0.3756014108657837, + "learning_rate": 9.977472271865182e-05, + "loss": 1.8116, + "step": 291 + }, + { + "epoch": 0.030331359717461308, + "grad_norm": 0.3230532705783844, + "learning_rate": 9.977317293460631e-05, + "loss": 1.7871, + "step": 292 + }, + { + "epoch": 0.030435234237041652, + "grad_norm": 0.3606550693511963, + "learning_rate": 9.97716178501061e-05, + "loss": 1.8669, + "step": 293 + }, + { + "epoch": 0.030539108756622, + "grad_norm": 0.34765157103538513, + "learning_rate": 9.977005746531682e-05, + "loss": 1.8061, + "step": 294 + }, + { + "epoch": 0.030642983276202348, + "grad_norm": 0.3419150114059448, + "learning_rate": 9.976849178040466e-05, + "loss": 1.8189, + "step": 295 + }, + { + "epoch": 0.030746857795782696, + "grad_norm": 0.33086076378822327, + "learning_rate": 9.976692079553633e-05, + "loss": 1.7308, + "step": 296 + }, + { + "epoch": 0.03085073231536304, + "grad_norm": 0.3839011490345001, + "learning_rate": 9.976534451087913e-05, + "loss": 1.881, + "step": 297 + }, + { + "epoch": 0.030954606834943388, + "grad_norm": 0.3362378478050232, + "learning_rate": 9.976376292660091e-05, + "loss": 2.0111, + "step": 298 + }, + { + "epoch": 0.031058481354523736, + "grad_norm": 0.3804178535938263, + "learning_rate": 9.976217604287013e-05, + "loss": 2.1103, + "step": 299 + }, + { + "epoch": 0.031162355874104083, + "grad_norm": 0.38724952936172485, + "learning_rate": 9.976058385985575e-05, + "loss": 1.8858, + "step": 300 + }, + { + "epoch": 0.03126623039368443, + "grad_norm": 0.35871621966362, + "learning_rate": 9.975898637772734e-05, + "loss": 1.7076, + "step": 301 + }, + { + "epoch": 0.03137010491326478, + "grad_norm": 0.32210198044776917, + "learning_rate": 9.975738359665501e-05, + "loss": 1.8058, + "step": 302 + }, + { + "epoch": 0.03147397943284512, + "grad_norm": 0.34519657492637634, + "learning_rate": 9.975577551680946e-05, + "loss": 1.8827, + "step": 303 + }, + { + "epoch": 0.03157785395242547, + "grad_norm": 0.3406042456626892, + "learning_rate": 9.975416213836193e-05, + "loss": 1.801, + "step": 304 + }, + { + "epoch": 0.031681728472005816, + "grad_norm": 0.33171379566192627, + "learning_rate": 9.975254346148422e-05, + "loss": 1.7826, + "step": 305 + }, + { + "epoch": 0.031785602991586163, + "grad_norm": 0.31799814105033875, + "learning_rate": 9.975091948634871e-05, + "loss": 1.7969, + "step": 306 + }, + { + "epoch": 0.03188947751116651, + "grad_norm": 0.33427590131759644, + "learning_rate": 9.974929021312836e-05, + "loss": 1.9499, + "step": 307 + }, + { + "epoch": 0.03199335203074686, + "grad_norm": 0.336112916469574, + "learning_rate": 9.974765564199665e-05, + "loss": 1.9003, + "step": 308 + }, + { + "epoch": 0.03209722655032721, + "grad_norm": 0.33370164036750793, + "learning_rate": 9.974601577312768e-05, + "loss": 1.8738, + "step": 309 + }, + { + "epoch": 0.032201101069907555, + "grad_norm": 0.3412269949913025, + "learning_rate": 9.974437060669603e-05, + "loss": 1.8817, + "step": 310 + }, + { + "epoch": 0.032304975589487896, + "grad_norm": 0.34361353516578674, + "learning_rate": 9.974272014287697e-05, + "loss": 1.8391, + "step": 311 + }, + { + "epoch": 0.032408850109068243, + "grad_norm": 0.3552245795726776, + "learning_rate": 9.97410643818462e-05, + "loss": 1.8881, + "step": 312 + }, + { + "epoch": 0.03251272462864859, + "grad_norm": 0.3323882222175598, + "learning_rate": 9.973940332378007e-05, + "loss": 1.8975, + "step": 313 + }, + { + "epoch": 0.03261659914822894, + "grad_norm": 0.34055736660957336, + "learning_rate": 9.973773696885547e-05, + "loss": 1.7162, + "step": 314 + }, + { + "epoch": 0.03272047366780929, + "grad_norm": 0.3609575927257538, + "learning_rate": 9.973606531724985e-05, + "loss": 1.9902, + "step": 315 + }, + { + "epoch": 0.032824348187389635, + "grad_norm": 0.3580215573310852, + "learning_rate": 9.973438836914124e-05, + "loss": 1.9183, + "step": 316 + }, + { + "epoch": 0.03292822270696998, + "grad_norm": 0.34166282415390015, + "learning_rate": 9.973270612470822e-05, + "loss": 1.8884, + "step": 317 + }, + { + "epoch": 0.03303209722655033, + "grad_norm": 0.3344467878341675, + "learning_rate": 9.97310185841299e-05, + "loss": 1.9219, + "step": 318 + }, + { + "epoch": 0.03313597174613067, + "grad_norm": 0.3929234743118286, + "learning_rate": 9.972932574758604e-05, + "loss": 1.9071, + "step": 319 + }, + { + "epoch": 0.03323984626571102, + "grad_norm": 0.3376573324203491, + "learning_rate": 9.972762761525689e-05, + "loss": 1.7418, + "step": 320 + }, + { + "epoch": 0.03334372078529137, + "grad_norm": 0.33105456829071045, + "learning_rate": 9.972592418732327e-05, + "loss": 1.7941, + "step": 321 + }, + { + "epoch": 0.033447595304871715, + "grad_norm": 0.3593199551105499, + "learning_rate": 9.972421546396662e-05, + "loss": 1.8983, + "step": 322 + }, + { + "epoch": 0.03355146982445206, + "grad_norm": 0.37045425176620483, + "learning_rate": 9.972250144536888e-05, + "loss": 1.947, + "step": 323 + }, + { + "epoch": 0.03365534434403241, + "grad_norm": 0.3378489911556244, + "learning_rate": 9.972078213171259e-05, + "loss": 1.7318, + "step": 324 + }, + { + "epoch": 0.03375921886361276, + "grad_norm": 0.354125440120697, + "learning_rate": 9.971905752318084e-05, + "loss": 1.6396, + "step": 325 + }, + { + "epoch": 0.033863093383193106, + "grad_norm": 0.36457526683807373, + "learning_rate": 9.971732761995728e-05, + "loss": 1.8063, + "step": 326 + }, + { + "epoch": 0.03396696790277345, + "grad_norm": 0.33944860100746155, + "learning_rate": 9.971559242222615e-05, + "loss": 1.7624, + "step": 327 + }, + { + "epoch": 0.034070842422353795, + "grad_norm": 0.327573299407959, + "learning_rate": 9.971385193017221e-05, + "loss": 1.7963, + "step": 328 + }, + { + "epoch": 0.03417471694193414, + "grad_norm": 0.3168799877166748, + "learning_rate": 9.971210614398084e-05, + "loss": 1.7648, + "step": 329 + }, + { + "epoch": 0.03427859146151449, + "grad_norm": 0.3881419003009796, + "learning_rate": 9.971035506383792e-05, + "loss": 1.8365, + "step": 330 + }, + { + "epoch": 0.03438246598109484, + "grad_norm": 0.3255634903907776, + "learning_rate": 9.970859868992995e-05, + "loss": 1.9249, + "step": 331 + }, + { + "epoch": 0.034486340500675186, + "grad_norm": 0.3285115361213684, + "learning_rate": 9.970683702244395e-05, + "loss": 1.881, + "step": 332 + }, + { + "epoch": 0.034590215020255534, + "grad_norm": 0.34438276290893555, + "learning_rate": 9.970507006156755e-05, + "loss": 1.8085, + "step": 333 + }, + { + "epoch": 0.034694089539835875, + "grad_norm": 0.347380667924881, + "learning_rate": 9.970329780748888e-05, + "loss": 1.8762, + "step": 334 + }, + { + "epoch": 0.03479796405941622, + "grad_norm": 0.32174697518348694, + "learning_rate": 9.970152026039672e-05, + "loss": 1.7262, + "step": 335 + }, + { + "epoch": 0.03490183857899657, + "grad_norm": 0.3503490686416626, + "learning_rate": 9.969973742048032e-05, + "loss": 1.7248, + "step": 336 + }, + { + "epoch": 0.03500571309857692, + "grad_norm": 0.327619343996048, + "learning_rate": 9.969794928792957e-05, + "loss": 1.7031, + "step": 337 + }, + { + "epoch": 0.035109587618157266, + "grad_norm": 0.4252755343914032, + "learning_rate": 9.969615586293488e-05, + "loss": 2.0297, + "step": 338 + }, + { + "epoch": 0.035213462137737614, + "grad_norm": 0.36586689949035645, + "learning_rate": 9.969435714568722e-05, + "loss": 1.9272, + "step": 339 + }, + { + "epoch": 0.03531733665731796, + "grad_norm": 0.3425196409225464, + "learning_rate": 9.969255313637818e-05, + "loss": 1.8647, + "step": 340 + }, + { + "epoch": 0.03542121117689831, + "grad_norm": 0.3617844581604004, + "learning_rate": 9.969074383519983e-05, + "loss": 1.8622, + "step": 341 + }, + { + "epoch": 0.03552508569647865, + "grad_norm": 0.33944642543792725, + "learning_rate": 9.968892924234487e-05, + "loss": 1.7988, + "step": 342 + }, + { + "epoch": 0.035628960216059, + "grad_norm": 0.3269334137439728, + "learning_rate": 9.968710935800652e-05, + "loss": 1.8039, + "step": 343 + }, + { + "epoch": 0.035732834735639346, + "grad_norm": 0.3931077718734741, + "learning_rate": 9.968528418237862e-05, + "loss": 2.0847, + "step": 344 + }, + { + "epoch": 0.035836709255219694, + "grad_norm": 0.3752608299255371, + "learning_rate": 9.96834537156555e-05, + "loss": 2.0207, + "step": 345 + }, + { + "epoch": 0.03594058377480004, + "grad_norm": 0.33372077345848083, + "learning_rate": 9.96816179580321e-05, + "loss": 1.9464, + "step": 346 + }, + { + "epoch": 0.03604445829438039, + "grad_norm": 0.36559945344924927, + "learning_rate": 9.967977690970393e-05, + "loss": 1.9318, + "step": 347 + }, + { + "epoch": 0.03614833281396074, + "grad_norm": 0.3152011036872864, + "learning_rate": 9.967793057086706e-05, + "loss": 1.8081, + "step": 348 + }, + { + "epoch": 0.036252207333541085, + "grad_norm": 0.32508155703544617, + "learning_rate": 9.967607894171804e-05, + "loss": 1.7424, + "step": 349 + }, + { + "epoch": 0.036356081853121426, + "grad_norm": 0.3422446846961975, + "learning_rate": 9.967422202245413e-05, + "loss": 1.9266, + "step": 350 + }, + { + "epoch": 0.036459956372701774, + "grad_norm": 0.3278721570968628, + "learning_rate": 9.967235981327304e-05, + "loss": 1.8344, + "step": 351 + }, + { + "epoch": 0.03656383089228212, + "grad_norm": 0.32580530643463135, + "learning_rate": 9.967049231437309e-05, + "loss": 1.7158, + "step": 352 + }, + { + "epoch": 0.03666770541186247, + "grad_norm": 0.38664954900741577, + "learning_rate": 9.966861952595316e-05, + "loss": 1.9844, + "step": 353 + }, + { + "epoch": 0.03677157993144282, + "grad_norm": 0.40230998396873474, + "learning_rate": 9.966674144821266e-05, + "loss": 2.0037, + "step": 354 + }, + { + "epoch": 0.036875454451023165, + "grad_norm": 0.33027613162994385, + "learning_rate": 9.966485808135165e-05, + "loss": 1.7922, + "step": 355 + }, + { + "epoch": 0.03697932897060351, + "grad_norm": 0.342254102230072, + "learning_rate": 9.966296942557062e-05, + "loss": 1.8094, + "step": 356 + }, + { + "epoch": 0.03708320349018386, + "grad_norm": 0.3351558446884155, + "learning_rate": 9.966107548107074e-05, + "loss": 1.803, + "step": 357 + }, + { + "epoch": 0.0371870780097642, + "grad_norm": 0.34574928879737854, + "learning_rate": 9.96591762480537e-05, + "loss": 1.9305, + "step": 358 + }, + { + "epoch": 0.03729095252934455, + "grad_norm": 0.34656211733818054, + "learning_rate": 9.965727172672174e-05, + "loss": 1.781, + "step": 359 + }, + { + "epoch": 0.0373948270489249, + "grad_norm": 0.35084661841392517, + "learning_rate": 9.965536191727769e-05, + "loss": 1.886, + "step": 360 + }, + { + "epoch": 0.037498701568505245, + "grad_norm": 0.34373828768730164, + "learning_rate": 9.965344681992491e-05, + "loss": 1.8182, + "step": 361 + }, + { + "epoch": 0.03760257608808559, + "grad_norm": 0.33524051308631897, + "learning_rate": 9.965152643486738e-05, + "loss": 1.8175, + "step": 362 + }, + { + "epoch": 0.03770645060766594, + "grad_norm": 0.3301374912261963, + "learning_rate": 9.964960076230955e-05, + "loss": 1.7128, + "step": 363 + }, + { + "epoch": 0.03781032512724629, + "grad_norm": 0.35539349913597107, + "learning_rate": 9.964766980245653e-05, + "loss": 1.8029, + "step": 364 + }, + { + "epoch": 0.037914199646826637, + "grad_norm": 0.3248385190963745, + "learning_rate": 9.964573355551394e-05, + "loss": 1.6429, + "step": 365 + }, + { + "epoch": 0.03801807416640698, + "grad_norm": 0.34955573081970215, + "learning_rate": 9.964379202168799e-05, + "loss": 1.9152, + "step": 366 + }, + { + "epoch": 0.038121948685987325, + "grad_norm": 0.32366281747817993, + "learning_rate": 9.964184520118542e-05, + "loss": 1.7536, + "step": 367 + }, + { + "epoch": 0.03822582320556767, + "grad_norm": 0.3177925646305084, + "learning_rate": 9.963989309421356e-05, + "loss": 1.7626, + "step": 368 + }, + { + "epoch": 0.03832969772514802, + "grad_norm": 0.35649746656417847, + "learning_rate": 9.96379357009803e-05, + "loss": 1.822, + "step": 369 + }, + { + "epoch": 0.03843357224472837, + "grad_norm": 0.3334449827671051, + "learning_rate": 9.963597302169406e-05, + "loss": 1.7122, + "step": 370 + }, + { + "epoch": 0.038537446764308717, + "grad_norm": 0.35319963097572327, + "learning_rate": 9.963400505656388e-05, + "loss": 1.8249, + "step": 371 + }, + { + "epoch": 0.038641321283889064, + "grad_norm": 0.34933820366859436, + "learning_rate": 9.963203180579932e-05, + "loss": 2.1094, + "step": 372 + }, + { + "epoch": 0.03874519580346941, + "grad_norm": 0.35011550784111023, + "learning_rate": 9.963005326961052e-05, + "loss": 1.8247, + "step": 373 + }, + { + "epoch": 0.03884907032304975, + "grad_norm": 0.4046299159526825, + "learning_rate": 9.962806944820817e-05, + "loss": 2.1071, + "step": 374 + }, + { + "epoch": 0.0389529448426301, + "grad_norm": 0.3381158113479614, + "learning_rate": 9.962608034180353e-05, + "loss": 1.8242, + "step": 375 + }, + { + "epoch": 0.03905681936221045, + "grad_norm": 0.35168975591659546, + "learning_rate": 9.962408595060845e-05, + "loss": 2.0001, + "step": 376 + }, + { + "epoch": 0.039160693881790797, + "grad_norm": 0.35159188508987427, + "learning_rate": 9.962208627483529e-05, + "loss": 1.8534, + "step": 377 + }, + { + "epoch": 0.039264568401371144, + "grad_norm": 0.31816786527633667, + "learning_rate": 9.962008131469703e-05, + "loss": 1.6624, + "step": 378 + }, + { + "epoch": 0.03936844292095149, + "grad_norm": 0.33059608936309814, + "learning_rate": 9.961807107040712e-05, + "loss": 1.7612, + "step": 379 + }, + { + "epoch": 0.03947231744053184, + "grad_norm": 0.33961811661720276, + "learning_rate": 9.96160555421797e-05, + "loss": 1.8432, + "step": 380 + }, + { + "epoch": 0.03957619196011219, + "grad_norm": 0.3444601595401764, + "learning_rate": 9.961403473022939e-05, + "loss": 1.8842, + "step": 381 + }, + { + "epoch": 0.03968006647969253, + "grad_norm": 0.3596336841583252, + "learning_rate": 9.961200863477139e-05, + "loss": 2.0091, + "step": 382 + }, + { + "epoch": 0.039783940999272877, + "grad_norm": 0.35697799921035767, + "learning_rate": 9.960997725602144e-05, + "loss": 1.8124, + "step": 383 + }, + { + "epoch": 0.039887815518853224, + "grad_norm": 0.33248475193977356, + "learning_rate": 9.96079405941959e-05, + "loss": 1.9146, + "step": 384 + }, + { + "epoch": 0.03999169003843357, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.960589864951162e-05, + "loss": 1.8714, + "step": 385 + }, + { + "epoch": 0.04009556455801392, + "grad_norm": 0.36085087060928345, + "learning_rate": 9.960385142218609e-05, + "loss": 1.8268, + "step": 386 + }, + { + "epoch": 0.04019943907759427, + "grad_norm": 0.32262122631073, + "learning_rate": 9.960179891243731e-05, + "loss": 1.6027, + "step": 387 + }, + { + "epoch": 0.040303313597174616, + "grad_norm": 0.361555278301239, + "learning_rate": 9.959974112048386e-05, + "loss": 1.9221, + "step": 388 + }, + { + "epoch": 0.040407188116754963, + "grad_norm": 0.3558778166770935, + "learning_rate": 9.959767804654487e-05, + "loss": 1.8234, + "step": 389 + }, + { + "epoch": 0.040511062636335304, + "grad_norm": 0.3629186749458313, + "learning_rate": 9.959560969084003e-05, + "loss": 1.9483, + "step": 390 + }, + { + "epoch": 0.04061493715591565, + "grad_norm": 0.37084469199180603, + "learning_rate": 9.959353605358964e-05, + "loss": 1.7024, + "step": 391 + }, + { + "epoch": 0.040718811675496, + "grad_norm": 0.3552490770816803, + "learning_rate": 9.95914571350145e-05, + "loss": 1.884, + "step": 392 + }, + { + "epoch": 0.04082268619507635, + "grad_norm": 0.34589436650276184, + "learning_rate": 9.958937293533599e-05, + "loss": 1.8548, + "step": 393 + }, + { + "epoch": 0.040926560714656696, + "grad_norm": 0.3737829327583313, + "learning_rate": 9.958728345477608e-05, + "loss": 2.0568, + "step": 394 + }, + { + "epoch": 0.041030435234237043, + "grad_norm": 0.35843873023986816, + "learning_rate": 9.958518869355728e-05, + "loss": 1.7908, + "step": 395 + }, + { + "epoch": 0.04113430975381739, + "grad_norm": 0.6880629062652588, + "learning_rate": 9.958308865190267e-05, + "loss": 1.8225, + "step": 396 + }, + { + "epoch": 0.04123818427339773, + "grad_norm": 0.3438432812690735, + "learning_rate": 9.958098333003588e-05, + "loss": 1.7719, + "step": 397 + }, + { + "epoch": 0.04134205879297808, + "grad_norm": 0.3586975634098053, + "learning_rate": 9.957887272818112e-05, + "loss": 1.9513, + "step": 398 + }, + { + "epoch": 0.04144593331255843, + "grad_norm": 0.3421470522880554, + "learning_rate": 9.957675684656312e-05, + "loss": 1.8509, + "step": 399 + }, + { + "epoch": 0.041549807832138776, + "grad_norm": 0.3313000202178955, + "learning_rate": 9.957463568540725e-05, + "loss": 1.8173, + "step": 400 + }, + { + "epoch": 0.041653682351719123, + "grad_norm": 0.3544430732727051, + "learning_rate": 9.957250924493938e-05, + "loss": 1.8059, + "step": 401 + }, + { + "epoch": 0.04175755687129947, + "grad_norm": 0.36002522706985474, + "learning_rate": 9.957037752538592e-05, + "loss": 1.9456, + "step": 402 + }, + { + "epoch": 0.04186143139087982, + "grad_norm": 0.3440776765346527, + "learning_rate": 9.956824052697395e-05, + "loss": 1.7831, + "step": 403 + }, + { + "epoch": 0.04196530591046017, + "grad_norm": 0.36796486377716064, + "learning_rate": 9.956609824993099e-05, + "loss": 1.791, + "step": 404 + }, + { + "epoch": 0.04206918043004051, + "grad_norm": 0.3968099355697632, + "learning_rate": 9.956395069448523e-05, + "loss": 1.8728, + "step": 405 + }, + { + "epoch": 0.042173054949620856, + "grad_norm": 0.3467409610748291, + "learning_rate": 9.95617978608653e-05, + "loss": 1.8004, + "step": 406 + }, + { + "epoch": 0.042276929469201203, + "grad_norm": 0.3945915102958679, + "learning_rate": 9.955963974930052e-05, + "loss": 1.7891, + "step": 407 + }, + { + "epoch": 0.04238080398878155, + "grad_norm": 0.3563166856765747, + "learning_rate": 9.955747636002068e-05, + "loss": 1.9755, + "step": 408 + }, + { + "epoch": 0.0424846785083619, + "grad_norm": 0.4015011191368103, + "learning_rate": 9.955530769325616e-05, + "loss": 1.6478, + "step": 409 + }, + { + "epoch": 0.04258855302794225, + "grad_norm": 0.36491334438323975, + "learning_rate": 9.955313374923791e-05, + "loss": 1.9403, + "step": 410 + }, + { + "epoch": 0.042692427547522595, + "grad_norm": 0.3891184628009796, + "learning_rate": 9.955095452819747e-05, + "loss": 2.0413, + "step": 411 + }, + { + "epoch": 0.04279630206710294, + "grad_norm": 0.3426514267921448, + "learning_rate": 9.954877003036687e-05, + "loss": 1.6253, + "step": 412 + }, + { + "epoch": 0.042900176586683283, + "grad_norm": 0.48841214179992676, + "learning_rate": 9.954658025597876e-05, + "loss": 2.1789, + "step": 413 + }, + { + "epoch": 0.04300405110626363, + "grad_norm": 0.3180578351020813, + "learning_rate": 9.954438520526635e-05, + "loss": 1.4562, + "step": 414 + }, + { + "epoch": 0.04310792562584398, + "grad_norm": 0.3487285077571869, + "learning_rate": 9.954218487846334e-05, + "loss": 1.7904, + "step": 415 + }, + { + "epoch": 0.04321180014542433, + "grad_norm": 0.3434458374977112, + "learning_rate": 9.953997927580411e-05, + "loss": 1.8548, + "step": 416 + }, + { + "epoch": 0.043315674665004675, + "grad_norm": 0.3450503647327423, + "learning_rate": 9.953776839752351e-05, + "loss": 1.7331, + "step": 417 + }, + { + "epoch": 0.04341954918458502, + "grad_norm": 0.33946654200553894, + "learning_rate": 9.953555224385698e-05, + "loss": 1.883, + "step": 418 + }, + { + "epoch": 0.04352342370416537, + "grad_norm": 0.37217575311660767, + "learning_rate": 9.953333081504052e-05, + "loss": 1.6892, + "step": 419 + }, + { + "epoch": 0.04362729822374572, + "grad_norm": 0.3341975510120392, + "learning_rate": 9.953110411131072e-05, + "loss": 1.734, + "step": 420 + }, + { + "epoch": 0.04373117274332606, + "grad_norm": 0.3347923159599304, + "learning_rate": 9.952887213290469e-05, + "loss": 1.8173, + "step": 421 + }, + { + "epoch": 0.04383504726290641, + "grad_norm": 0.35578978061676025, + "learning_rate": 9.952663488006011e-05, + "loss": 1.9877, + "step": 422 + }, + { + "epoch": 0.043938921782486755, + "grad_norm": 0.3420504331588745, + "learning_rate": 9.952439235301522e-05, + "loss": 1.8042, + "step": 423 + }, + { + "epoch": 0.0440427963020671, + "grad_norm": 0.3436269760131836, + "learning_rate": 9.952214455200887e-05, + "loss": 1.7833, + "step": 424 + }, + { + "epoch": 0.04414667082164745, + "grad_norm": 0.3772909641265869, + "learning_rate": 9.951989147728043e-05, + "loss": 1.9778, + "step": 425 + }, + { + "epoch": 0.0442505453412278, + "grad_norm": 0.3602845370769501, + "learning_rate": 9.951763312906979e-05, + "loss": 1.8447, + "step": 426 + }, + { + "epoch": 0.044354419860808146, + "grad_norm": 0.3500402271747589, + "learning_rate": 9.951536950761748e-05, + "loss": 1.6772, + "step": 427 + }, + { + "epoch": 0.044458294380388494, + "grad_norm": 0.3483547866344452, + "learning_rate": 9.951310061316455e-05, + "loss": 1.8328, + "step": 428 + }, + { + "epoch": 0.044562168899968835, + "grad_norm": 0.3545311391353607, + "learning_rate": 9.951082644595264e-05, + "loss": 1.8814, + "step": 429 + }, + { + "epoch": 0.04466604341954918, + "grad_norm": 0.33376967906951904, + "learning_rate": 9.95085470062239e-05, + "loss": 1.8108, + "step": 430 + }, + { + "epoch": 0.04476991793912953, + "grad_norm": 0.330287903547287, + "learning_rate": 9.950626229422107e-05, + "loss": 1.7423, + "step": 431 + }, + { + "epoch": 0.04487379245870988, + "grad_norm": 0.36695408821105957, + "learning_rate": 9.950397231018748e-05, + "loss": 1.7873, + "step": 432 + }, + { + "epoch": 0.044977666978290226, + "grad_norm": 0.3538840711116791, + "learning_rate": 9.950167705436699e-05, + "loss": 1.9949, + "step": 433 + }, + { + "epoch": 0.045081541497870574, + "grad_norm": 0.31848835945129395, + "learning_rate": 9.949937652700402e-05, + "loss": 1.5736, + "step": 434 + }, + { + "epoch": 0.04518541601745092, + "grad_norm": 0.3445279598236084, + "learning_rate": 9.949707072834356e-05, + "loss": 1.7662, + "step": 435 + }, + { + "epoch": 0.04528929053703127, + "grad_norm": 0.37137672305107117, + "learning_rate": 9.949475965863116e-05, + "loss": 1.8209, + "step": 436 + }, + { + "epoch": 0.04539316505661161, + "grad_norm": 0.43532824516296387, + "learning_rate": 9.949244331811293e-05, + "loss": 2.1643, + "step": 437 + }, + { + "epoch": 0.04549703957619196, + "grad_norm": 0.35221028327941895, + "learning_rate": 9.949012170703556e-05, + "loss": 1.8557, + "step": 438 + }, + { + "epoch": 0.045600914095772306, + "grad_norm": 0.34177568554878235, + "learning_rate": 9.948779482564624e-05, + "loss": 1.7119, + "step": 439 + }, + { + "epoch": 0.045704788615352654, + "grad_norm": 0.3455469012260437, + "learning_rate": 9.948546267419278e-05, + "loss": 1.8411, + "step": 440 + }, + { + "epoch": 0.045808663134933, + "grad_norm": 0.34957823157310486, + "learning_rate": 9.948312525292358e-05, + "loss": 1.7674, + "step": 441 + }, + { + "epoch": 0.04591253765451335, + "grad_norm": 0.3494671583175659, + "learning_rate": 9.948078256208751e-05, + "loss": 1.9521, + "step": 442 + }, + { + "epoch": 0.0460164121740937, + "grad_norm": 0.3724217116832733, + "learning_rate": 9.947843460193407e-05, + "loss": 1.9358, + "step": 443 + }, + { + "epoch": 0.046120286693674045, + "grad_norm": 0.34524059295654297, + "learning_rate": 9.947608137271328e-05, + "loss": 1.8426, + "step": 444 + }, + { + "epoch": 0.046224161213254386, + "grad_norm": 0.3344314396381378, + "learning_rate": 9.947372287467576e-05, + "loss": 1.7263, + "step": 445 + }, + { + "epoch": 0.046328035732834734, + "grad_norm": 0.3640967309474945, + "learning_rate": 9.947135910807265e-05, + "loss": 1.9716, + "step": 446 + }, + { + "epoch": 0.04643191025241508, + "grad_norm": 0.38016462326049805, + "learning_rate": 9.946899007315569e-05, + "loss": 1.7952, + "step": 447 + }, + { + "epoch": 0.04653578477199543, + "grad_norm": 0.35454219579696655, + "learning_rate": 9.946661577017717e-05, + "loss": 1.8804, + "step": 448 + }, + { + "epoch": 0.04663965929157578, + "grad_norm": 0.3755532503128052, + "learning_rate": 9.946423619938992e-05, + "loss": 2.0071, + "step": 449 + }, + { + "epoch": 0.046743533811156125, + "grad_norm": 0.3422885537147522, + "learning_rate": 9.946185136104735e-05, + "loss": 1.6811, + "step": 450 + }, + { + "epoch": 0.04684740833073647, + "grad_norm": 0.33953240513801575, + "learning_rate": 9.945946125540343e-05, + "loss": 1.7127, + "step": 451 + }, + { + "epoch": 0.046951282850316814, + "grad_norm": 0.313754677772522, + "learning_rate": 9.945706588271269e-05, + "loss": 1.7116, + "step": 452 + }, + { + "epoch": 0.04705515736989716, + "grad_norm": 0.3217809200286865, + "learning_rate": 9.94546652432302e-05, + "loss": 1.7475, + "step": 453 + }, + { + "epoch": 0.04715903188947751, + "grad_norm": 0.35068783164024353, + "learning_rate": 9.945225933721163e-05, + "loss": 1.7588, + "step": 454 + }, + { + "epoch": 0.04726290640905786, + "grad_norm": 0.32471030950546265, + "learning_rate": 9.944984816491318e-05, + "loss": 1.6391, + "step": 455 + }, + { + "epoch": 0.047366780928638205, + "grad_norm": 0.3416842222213745, + "learning_rate": 9.944743172659164e-05, + "loss": 1.7689, + "step": 456 + }, + { + "epoch": 0.04747065544821855, + "grad_norm": 0.3535037934780121, + "learning_rate": 9.94450100225043e-05, + "loss": 1.6911, + "step": 457 + }, + { + "epoch": 0.0475745299677989, + "grad_norm": 0.3632698953151703, + "learning_rate": 9.944258305290908e-05, + "loss": 1.949, + "step": 458 + }, + { + "epoch": 0.04767840448737925, + "grad_norm": 0.38424357771873474, + "learning_rate": 9.944015081806445e-05, + "loss": 1.9012, + "step": 459 + }, + { + "epoch": 0.04778227900695959, + "grad_norm": 0.3942817747592926, + "learning_rate": 9.94377133182294e-05, + "loss": 2.0238, + "step": 460 + }, + { + "epoch": 0.04788615352653994, + "grad_norm": 0.3337697684764862, + "learning_rate": 9.943527055366351e-05, + "loss": 1.7753, + "step": 461 + }, + { + "epoch": 0.047990028046120285, + "grad_norm": 0.3492131233215332, + "learning_rate": 9.94328225246269e-05, + "loss": 1.7137, + "step": 462 + }, + { + "epoch": 0.04809390256570063, + "grad_norm": 0.3351963758468628, + "learning_rate": 9.943036923138032e-05, + "loss": 1.7599, + "step": 463 + }, + { + "epoch": 0.04819777708528098, + "grad_norm": 0.33879536390304565, + "learning_rate": 9.942791067418496e-05, + "loss": 1.7868, + "step": 464 + }, + { + "epoch": 0.04830165160486133, + "grad_norm": 0.3504945933818817, + "learning_rate": 9.942544685330267e-05, + "loss": 1.93, + "step": 465 + }, + { + "epoch": 0.048405526124441677, + "grad_norm": 0.3818790912628174, + "learning_rate": 9.942297776899583e-05, + "loss": 1.9025, + "step": 466 + }, + { + "epoch": 0.048509400644022024, + "grad_norm": 0.346236914396286, + "learning_rate": 9.942050342152736e-05, + "loss": 1.7295, + "step": 467 + }, + { + "epoch": 0.048613275163602365, + "grad_norm": 0.39538463950157166, + "learning_rate": 9.941802381116078e-05, + "loss": 1.5522, + "step": 468 + }, + { + "epoch": 0.04871714968318271, + "grad_norm": 0.35648438334465027, + "learning_rate": 9.941553893816014e-05, + "loss": 1.7374, + "step": 469 + }, + { + "epoch": 0.04882102420276306, + "grad_norm": 0.3466714024543762, + "learning_rate": 9.941304880279005e-05, + "loss": 1.7781, + "step": 470 + }, + { + "epoch": 0.04892489872234341, + "grad_norm": 0.38975635170936584, + "learning_rate": 9.941055340531571e-05, + "loss": 1.9754, + "step": 471 + }, + { + "epoch": 0.049028773241923757, + "grad_norm": 0.3584454655647278, + "learning_rate": 9.940805274600285e-05, + "loss": 1.8484, + "step": 472 + }, + { + "epoch": 0.049132647761504104, + "grad_norm": 0.33715423941612244, + "learning_rate": 9.940554682511775e-05, + "loss": 1.9061, + "step": 473 + }, + { + "epoch": 0.04923652228108445, + "grad_norm": 0.35068365931510925, + "learning_rate": 9.940303564292731e-05, + "loss": 1.759, + "step": 474 + }, + { + "epoch": 0.0493403968006648, + "grad_norm": 0.37917405366897583, + "learning_rate": 9.940051919969893e-05, + "loss": 1.8638, + "step": 475 + }, + { + "epoch": 0.04944427132024514, + "grad_norm": 0.38259661197662354, + "learning_rate": 9.939799749570059e-05, + "loss": 1.9465, + "step": 476 + }, + { + "epoch": 0.04954814583982549, + "grad_norm": 0.334435373544693, + "learning_rate": 9.939547053120084e-05, + "loss": 1.8029, + "step": 477 + }, + { + "epoch": 0.049652020359405837, + "grad_norm": 0.3447658121585846, + "learning_rate": 9.939293830646878e-05, + "loss": 1.6021, + "step": 478 + }, + { + "epoch": 0.049755894878986184, + "grad_norm": 0.3410322964191437, + "learning_rate": 9.939040082177407e-05, + "loss": 1.9107, + "step": 479 + }, + { + "epoch": 0.04985976939856653, + "grad_norm": 0.3771478831768036, + "learning_rate": 9.938785807738693e-05, + "loss": 2.0531, + "step": 480 + }, + { + "epoch": 0.04996364391814688, + "grad_norm": 0.421427845954895, + "learning_rate": 9.938531007357813e-05, + "loss": 1.9953, + "step": 481 + }, + { + "epoch": 0.05006751843772723, + "grad_norm": 0.347931444644928, + "learning_rate": 9.938275681061903e-05, + "loss": 1.75, + "step": 482 + }, + { + "epoch": 0.050171392957307576, + "grad_norm": 0.38480523228645325, + "learning_rate": 9.938019828878155e-05, + "loss": 1.9639, + "step": 483 + }, + { + "epoch": 0.050275267476887917, + "grad_norm": 0.3436914384365082, + "learning_rate": 9.937763450833813e-05, + "loss": 1.8305, + "step": 484 + }, + { + "epoch": 0.050379141996468264, + "grad_norm": 0.3643799126148224, + "learning_rate": 9.93750654695618e-05, + "loss": 1.8667, + "step": 485 + }, + { + "epoch": 0.05048301651604861, + "grad_norm": 0.3727877140045166, + "learning_rate": 9.937249117272612e-05, + "loss": 1.8097, + "step": 486 + }, + { + "epoch": 0.05058689103562896, + "grad_norm": 0.34742361307144165, + "learning_rate": 9.936991161810525e-05, + "loss": 1.872, + "step": 487 + }, + { + "epoch": 0.05069076555520931, + "grad_norm": 0.34192922711372375, + "learning_rate": 9.93673268059739e-05, + "loss": 1.777, + "step": 488 + }, + { + "epoch": 0.050794640074789656, + "grad_norm": 0.34614014625549316, + "learning_rate": 9.936473673660733e-05, + "loss": 1.8364, + "step": 489 + }, + { + "epoch": 0.050898514594370003, + "grad_norm": 0.3394782841205597, + "learning_rate": 9.936214141028135e-05, + "loss": 1.7858, + "step": 490 + }, + { + "epoch": 0.05100238911395035, + "grad_norm": 0.3589742183685303, + "learning_rate": 9.935954082727235e-05, + "loss": 2.1136, + "step": 491 + }, + { + "epoch": 0.05110626363353069, + "grad_norm": 0.3754216134548187, + "learning_rate": 9.935693498785728e-05, + "loss": 1.8873, + "step": 492 + }, + { + "epoch": 0.05121013815311104, + "grad_norm": 0.3502977192401886, + "learning_rate": 9.93543238923136e-05, + "loss": 1.6858, + "step": 493 + }, + { + "epoch": 0.05131401267269139, + "grad_norm": 0.3583030104637146, + "learning_rate": 9.935170754091945e-05, + "loss": 1.8528, + "step": 494 + }, + { + "epoch": 0.051417887192271736, + "grad_norm": 0.3400440216064453, + "learning_rate": 9.934908593395338e-05, + "loss": 1.7657, + "step": 495 + }, + { + "epoch": 0.051521761711852083, + "grad_norm": 0.31973132491111755, + "learning_rate": 9.934645907169459e-05, + "loss": 1.7152, + "step": 496 + }, + { + "epoch": 0.05162563623143243, + "grad_norm": 0.33886149525642395, + "learning_rate": 9.934382695442284e-05, + "loss": 1.6483, + "step": 497 + }, + { + "epoch": 0.05172951075101278, + "grad_norm": 0.3560135066509247, + "learning_rate": 9.934118958241842e-05, + "loss": 1.8188, + "step": 498 + }, + { + "epoch": 0.05183338527059313, + "grad_norm": 0.32066768407821655, + "learning_rate": 9.933854695596216e-05, + "loss": 1.6657, + "step": 499 + }, + { + "epoch": 0.05193725979017347, + "grad_norm": 0.3544224798679352, + "learning_rate": 9.933589907533554e-05, + "loss": 1.7714, + "step": 500 + }, + { + "epoch": 0.052041134309753816, + "grad_norm": 0.3491422235965729, + "learning_rate": 9.933324594082048e-05, + "loss": 1.9087, + "step": 501 + }, + { + "epoch": 0.052145008829334163, + "grad_norm": 0.32746124267578125, + "learning_rate": 9.933058755269955e-05, + "loss": 1.8386, + "step": 502 + }, + { + "epoch": 0.05224888334891451, + "grad_norm": 0.32933396100997925, + "learning_rate": 9.932792391125583e-05, + "loss": 1.7782, + "step": 503 + }, + { + "epoch": 0.05235275786849486, + "grad_norm": 0.35885295271873474, + "learning_rate": 9.932525501677299e-05, + "loss": 1.8822, + "step": 504 + }, + { + "epoch": 0.05245663238807521, + "grad_norm": 0.3308489918708801, + "learning_rate": 9.932258086953525e-05, + "loss": 1.8777, + "step": 505 + }, + { + "epoch": 0.052560506907655555, + "grad_norm": 0.32857421040534973, + "learning_rate": 9.931990146982736e-05, + "loss": 1.6693, + "step": 506 + }, + { + "epoch": 0.052664381427235896, + "grad_norm": 0.37959519028663635, + "learning_rate": 9.931721681793468e-05, + "loss": 1.6766, + "step": 507 + }, + { + "epoch": 0.052768255946816243, + "grad_norm": 0.39978745579719543, + "learning_rate": 9.931452691414311e-05, + "loss": 1.9316, + "step": 508 + }, + { + "epoch": 0.05287213046639659, + "grad_norm": 0.36127495765686035, + "learning_rate": 9.93118317587391e-05, + "loss": 1.9157, + "step": 509 + }, + { + "epoch": 0.05297600498597694, + "grad_norm": 0.3422827422618866, + "learning_rate": 9.930913135200963e-05, + "loss": 1.718, + "step": 510 + }, + { + "epoch": 0.05307987950555729, + "grad_norm": 0.3710615932941437, + "learning_rate": 9.930642569424231e-05, + "loss": 1.7702, + "step": 511 + }, + { + "epoch": 0.053183754025137635, + "grad_norm": 0.3597875237464905, + "learning_rate": 9.930371478572526e-05, + "loss": 1.6665, + "step": 512 + }, + { + "epoch": 0.05328762854471798, + "grad_norm": 0.3505166172981262, + "learning_rate": 9.930099862674716e-05, + "loss": 1.7484, + "step": 513 + }, + { + "epoch": 0.05339150306429833, + "grad_norm": 0.343287855386734, + "learning_rate": 9.929827721759728e-05, + "loss": 1.9197, + "step": 514 + }, + { + "epoch": 0.05349537758387867, + "grad_norm": 0.35222482681274414, + "learning_rate": 9.92955505585654e-05, + "loss": 1.8467, + "step": 515 + }, + { + "epoch": 0.05359925210345902, + "grad_norm": 0.39000675082206726, + "learning_rate": 9.929281864994193e-05, + "loss": 1.8709, + "step": 516 + }, + { + "epoch": 0.05370312662303937, + "grad_norm": 0.3652053773403168, + "learning_rate": 9.929008149201774e-05, + "loss": 1.9486, + "step": 517 + }, + { + "epoch": 0.053807001142619715, + "grad_norm": 0.3311000168323517, + "learning_rate": 9.928733908508438e-05, + "loss": 1.7749, + "step": 518 + }, + { + "epoch": 0.05391087566220006, + "grad_norm": 0.3284410536289215, + "learning_rate": 9.928459142943385e-05, + "loss": 1.8424, + "step": 519 + }, + { + "epoch": 0.05401475018178041, + "grad_norm": 0.3507472574710846, + "learning_rate": 9.928183852535877e-05, + "loss": 1.8719, + "step": 520 + }, + { + "epoch": 0.05411862470136076, + "grad_norm": 0.35009628534317017, + "learning_rate": 9.92790803731523e-05, + "loss": 1.8434, + "step": 521 + }, + { + "epoch": 0.054222499220941106, + "grad_norm": 0.39859047532081604, + "learning_rate": 9.927631697310816e-05, + "loss": 1.9473, + "step": 522 + }, + { + "epoch": 0.05432637374052145, + "grad_norm": 0.369045615196228, + "learning_rate": 9.927354832552063e-05, + "loss": 1.9063, + "step": 523 + }, + { + "epoch": 0.054430248260101795, + "grad_norm": 0.3442009389400482, + "learning_rate": 9.927077443068455e-05, + "loss": 1.7318, + "step": 524 + }, + { + "epoch": 0.05453412277968214, + "grad_norm": 0.3606685996055603, + "learning_rate": 9.926799528889534e-05, + "loss": 1.8991, + "step": 525 + }, + { + "epoch": 0.05463799729926249, + "grad_norm": 0.36547592282295227, + "learning_rate": 9.926521090044891e-05, + "loss": 2.1391, + "step": 526 + }, + { + "epoch": 0.05474187181884284, + "grad_norm": 0.334673136472702, + "learning_rate": 9.926242126564182e-05, + "loss": 1.8072, + "step": 527 + }, + { + "epoch": 0.054845746338423186, + "grad_norm": 0.3335951864719391, + "learning_rate": 9.925962638477113e-05, + "loss": 1.7599, + "step": 528 + }, + { + "epoch": 0.054949620858003534, + "grad_norm": 0.33183959126472473, + "learning_rate": 9.925682625813446e-05, + "loss": 1.7878, + "step": 529 + }, + { + "epoch": 0.05505349537758388, + "grad_norm": 0.33561477065086365, + "learning_rate": 9.925402088603e-05, + "loss": 1.7764, + "step": 530 + }, + { + "epoch": 0.05515736989716422, + "grad_norm": 0.33029311895370483, + "learning_rate": 9.925121026875654e-05, + "loss": 1.7976, + "step": 531 + }, + { + "epoch": 0.05526124441674457, + "grad_norm": 0.3405066430568695, + "learning_rate": 9.924839440661334e-05, + "loss": 1.8439, + "step": 532 + }, + { + "epoch": 0.05536511893632492, + "grad_norm": 0.3600308299064636, + "learning_rate": 9.92455732999003e-05, + "loss": 1.9871, + "step": 533 + }, + { + "epoch": 0.055468993455905266, + "grad_norm": 0.4009020924568176, + "learning_rate": 9.924274694891782e-05, + "loss": 1.7941, + "step": 534 + }, + { + "epoch": 0.055572867975485614, + "grad_norm": 0.3483946919441223, + "learning_rate": 9.92399153539669e-05, + "loss": 1.7015, + "step": 535 + }, + { + "epoch": 0.05567674249506596, + "grad_norm": 0.37152403593063354, + "learning_rate": 9.923707851534909e-05, + "loss": 1.791, + "step": 536 + }, + { + "epoch": 0.05578061701464631, + "grad_norm": 0.3480570912361145, + "learning_rate": 9.923423643336648e-05, + "loss": 1.7576, + "step": 537 + }, + { + "epoch": 0.05588449153422666, + "grad_norm": 0.38511136174201965, + "learning_rate": 9.923138910832172e-05, + "loss": 1.963, + "step": 538 + }, + { + "epoch": 0.055988366053807, + "grad_norm": 0.31676754355430603, + "learning_rate": 9.922853654051806e-05, + "loss": 1.6796, + "step": 539 + }, + { + "epoch": 0.056092240573387346, + "grad_norm": 0.325538694858551, + "learning_rate": 9.922567873025924e-05, + "loss": 1.7261, + "step": 540 + }, + { + "epoch": 0.056196115092967694, + "grad_norm": 0.3761240839958191, + "learning_rate": 9.922281567784961e-05, + "loss": 1.825, + "step": 541 + }, + { + "epoch": 0.05629998961254804, + "grad_norm": 0.33543258905410767, + "learning_rate": 9.921994738359409e-05, + "loss": 1.791, + "step": 542 + }, + { + "epoch": 0.05640386413212839, + "grad_norm": 0.34951722621917725, + "learning_rate": 9.921707384779807e-05, + "loss": 1.8141, + "step": 543 + }, + { + "epoch": 0.05650773865170874, + "grad_norm": 0.34788161516189575, + "learning_rate": 9.92141950707676e-05, + "loss": 1.7599, + "step": 544 + }, + { + "epoch": 0.056611613171289085, + "grad_norm": 0.32082879543304443, + "learning_rate": 9.921131105280925e-05, + "loss": 1.5647, + "step": 545 + }, + { + "epoch": 0.05671548769086943, + "grad_norm": 0.37312445044517517, + "learning_rate": 9.920842179423013e-05, + "loss": 1.9926, + "step": 546 + }, + { + "epoch": 0.056819362210449774, + "grad_norm": 0.32151979207992554, + "learning_rate": 9.920552729533792e-05, + "loss": 1.6564, + "step": 547 + }, + { + "epoch": 0.05692323673003012, + "grad_norm": 0.3291560709476471, + "learning_rate": 9.92026275564409e-05, + "loss": 1.8871, + "step": 548 + }, + { + "epoch": 0.05702711124961047, + "grad_norm": 0.370978444814682, + "learning_rate": 9.919972257784783e-05, + "loss": 2.0246, + "step": 549 + }, + { + "epoch": 0.05713098576919082, + "grad_norm": 0.3654458820819855, + "learning_rate": 9.919681235986806e-05, + "loss": 1.8523, + "step": 550 + }, + { + "epoch": 0.057234860288771165, + "grad_norm": 0.3720274567604065, + "learning_rate": 9.919389690281154e-05, + "loss": 1.7427, + "step": 551 + }, + { + "epoch": 0.05733873480835151, + "grad_norm": 0.3372805714607239, + "learning_rate": 9.919097620698872e-05, + "loss": 1.8491, + "step": 552 + }, + { + "epoch": 0.05744260932793186, + "grad_norm": 0.3331336975097656, + "learning_rate": 9.918805027271064e-05, + "loss": 1.7309, + "step": 553 + }, + { + "epoch": 0.05754648384751221, + "grad_norm": 0.3529910743236542, + "learning_rate": 9.918511910028888e-05, + "loss": 1.8213, + "step": 554 + }, + { + "epoch": 0.05765035836709255, + "grad_norm": 0.3342648446559906, + "learning_rate": 9.918218269003561e-05, + "loss": 1.6017, + "step": 555 + }, + { + "epoch": 0.0577542328866729, + "grad_norm": 0.41469427943229675, + "learning_rate": 9.917924104226351e-05, + "loss": 1.7434, + "step": 556 + }, + { + "epoch": 0.057858107406253245, + "grad_norm": 0.3674336373806, + "learning_rate": 9.917629415728584e-05, + "loss": 1.8997, + "step": 557 + }, + { + "epoch": 0.05796198192583359, + "grad_norm": 0.3625181317329407, + "learning_rate": 9.917334203541645e-05, + "loss": 1.8736, + "step": 558 + }, + { + "epoch": 0.05806585644541394, + "grad_norm": 0.35162436962127686, + "learning_rate": 9.917038467696968e-05, + "loss": 1.8951, + "step": 559 + }, + { + "epoch": 0.05816973096499429, + "grad_norm": 0.35170653462409973, + "learning_rate": 9.916742208226051e-05, + "loss": 1.9565, + "step": 560 + }, + { + "epoch": 0.058273605484574637, + "grad_norm": 0.3626965284347534, + "learning_rate": 9.91644542516044e-05, + "loss": 1.9673, + "step": 561 + }, + { + "epoch": 0.05837748000415498, + "grad_norm": 0.36217907071113586, + "learning_rate": 9.91614811853174e-05, + "loss": 2.0416, + "step": 562 + }, + { + "epoch": 0.058481354523735325, + "grad_norm": 0.3251532018184662, + "learning_rate": 9.915850288371616e-05, + "loss": 1.8444, + "step": 563 + }, + { + "epoch": 0.05858522904331567, + "grad_norm": 0.3693266212940216, + "learning_rate": 9.915551934711777e-05, + "loss": 1.7398, + "step": 564 + }, + { + "epoch": 0.05868910356289602, + "grad_norm": 0.34907597303390503, + "learning_rate": 9.915253057584003e-05, + "loss": 1.5016, + "step": 565 + }, + { + "epoch": 0.05879297808247637, + "grad_norm": 0.3413955867290497, + "learning_rate": 9.914953657020118e-05, + "loss": 1.7225, + "step": 566 + }, + { + "epoch": 0.058896852602056717, + "grad_norm": 0.34406933188438416, + "learning_rate": 9.914653733052006e-05, + "loss": 1.7444, + "step": 567 + }, + { + "epoch": 0.059000727121637064, + "grad_norm": 0.36242082715034485, + "learning_rate": 9.914353285711607e-05, + "loss": 1.8678, + "step": 568 + }, + { + "epoch": 0.05910460164121741, + "grad_norm": 0.359131395816803, + "learning_rate": 9.914052315030917e-05, + "loss": 1.9416, + "step": 569 + }, + { + "epoch": 0.05920847616079775, + "grad_norm": 0.35352620482444763, + "learning_rate": 9.913750821041987e-05, + "loss": 1.8087, + "step": 570 + }, + { + "epoch": 0.0593123506803781, + "grad_norm": 0.3557063043117523, + "learning_rate": 9.913448803776922e-05, + "loss": 1.9329, + "step": 571 + }, + { + "epoch": 0.05941622519995845, + "grad_norm": 0.3462260067462921, + "learning_rate": 9.913146263267887e-05, + "loss": 1.882, + "step": 572 + }, + { + "epoch": 0.059520099719538797, + "grad_norm": 0.32889804244041443, + "learning_rate": 9.912843199547099e-05, + "loss": 1.6253, + "step": 573 + }, + { + "epoch": 0.059623974239119144, + "grad_norm": 0.3543769419193268, + "learning_rate": 9.912539612646832e-05, + "loss": 1.7809, + "step": 574 + }, + { + "epoch": 0.05972784875869949, + "grad_norm": 0.38579022884368896, + "learning_rate": 9.912235502599414e-05, + "loss": 2.0883, + "step": 575 + }, + { + "epoch": 0.05983172327827984, + "grad_norm": 0.3288674056529999, + "learning_rate": 9.911930869437233e-05, + "loss": 1.7313, + "step": 576 + }, + { + "epoch": 0.05993559779786019, + "grad_norm": 0.35373395681381226, + "learning_rate": 9.911625713192729e-05, + "loss": 1.906, + "step": 577 + }, + { + "epoch": 0.06003947231744053, + "grad_norm": 0.33762502670288086, + "learning_rate": 9.911320033898398e-05, + "loss": 1.8296, + "step": 578 + }, + { + "epoch": 0.060143346837020877, + "grad_norm": 0.3767317831516266, + "learning_rate": 9.911013831586795e-05, + "loss": 1.8199, + "step": 579 + }, + { + "epoch": 0.060247221356601224, + "grad_norm": 0.38876909017562866, + "learning_rate": 9.910707106290525e-05, + "loss": 1.6965, + "step": 580 + }, + { + "epoch": 0.06035109587618157, + "grad_norm": 0.34621986746788025, + "learning_rate": 9.910399858042252e-05, + "loss": 1.7974, + "step": 581 + }, + { + "epoch": 0.06045497039576192, + "grad_norm": 0.3526759743690491, + "learning_rate": 9.910092086874699e-05, + "loss": 1.7587, + "step": 582 + }, + { + "epoch": 0.06055884491534227, + "grad_norm": 0.3554968237876892, + "learning_rate": 9.909783792820637e-05, + "loss": 1.7185, + "step": 583 + }, + { + "epoch": 0.060662719434922616, + "grad_norm": 0.34225958585739136, + "learning_rate": 9.909474975912899e-05, + "loss": 1.9277, + "step": 584 + }, + { + "epoch": 0.060766593954502963, + "grad_norm": 0.35381531715393066, + "learning_rate": 9.909165636184372e-05, + "loss": 1.8624, + "step": 585 + }, + { + "epoch": 0.060870468474083304, + "grad_norm": 0.3494366407394409, + "learning_rate": 9.908855773667997e-05, + "loss": 1.8565, + "step": 586 + }, + { + "epoch": 0.06097434299366365, + "grad_norm": 0.358967661857605, + "learning_rate": 9.908545388396775e-05, + "loss": 1.8135, + "step": 587 + }, + { + "epoch": 0.061078217513244, + "grad_norm": 0.32239392399787903, + "learning_rate": 9.908234480403755e-05, + "loss": 1.7439, + "step": 588 + }, + { + "epoch": 0.06118209203282435, + "grad_norm": 0.33216288685798645, + "learning_rate": 9.90792304972205e-05, + "loss": 1.8637, + "step": 589 + }, + { + "epoch": 0.061285966552404696, + "grad_norm": 0.3340390622615814, + "learning_rate": 9.907611096384823e-05, + "loss": 1.6959, + "step": 590 + }, + { + "epoch": 0.061389841071985043, + "grad_norm": 0.3495895564556122, + "learning_rate": 9.907298620425294e-05, + "loss": 1.7969, + "step": 591 + }, + { + "epoch": 0.06149371559156539, + "grad_norm": 0.365520715713501, + "learning_rate": 9.90698562187674e-05, + "loss": 1.8457, + "step": 592 + }, + { + "epoch": 0.06159759011114574, + "grad_norm": 0.33886629343032837, + "learning_rate": 9.906672100772496e-05, + "loss": 1.6876, + "step": 593 + }, + { + "epoch": 0.06170146463072608, + "grad_norm": 0.3432716131210327, + "learning_rate": 9.906358057145945e-05, + "loss": 1.7644, + "step": 594 + }, + { + "epoch": 0.06180533915030643, + "grad_norm": 0.35947611927986145, + "learning_rate": 9.906043491030532e-05, + "loss": 1.9074, + "step": 595 + }, + { + "epoch": 0.061909213669886776, + "grad_norm": 0.3474954664707184, + "learning_rate": 9.905728402459755e-05, + "loss": 1.834, + "step": 596 + }, + { + "epoch": 0.062013088189467123, + "grad_norm": 0.32925835251808167, + "learning_rate": 9.905412791467171e-05, + "loss": 1.7292, + "step": 597 + }, + { + "epoch": 0.06211696270904747, + "grad_norm": 0.37957310676574707, + "learning_rate": 9.905096658086387e-05, + "loss": 2.0142, + "step": 598 + }, + { + "epoch": 0.06222083722862782, + "grad_norm": 0.3465014398097992, + "learning_rate": 9.904780002351071e-05, + "loss": 1.852, + "step": 599 + }, + { + "epoch": 0.06232471174820817, + "grad_norm": 0.364199697971344, + "learning_rate": 9.904462824294944e-05, + "loss": 1.7035, + "step": 600 + }, + { + "epoch": 0.062428586267788515, + "grad_norm": 0.33271247148513794, + "learning_rate": 9.904145123951781e-05, + "loss": 1.7046, + "step": 601 + }, + { + "epoch": 0.06253246078736886, + "grad_norm": 0.38924211263656616, + "learning_rate": 9.903826901355417e-05, + "loss": 1.8107, + "step": 602 + }, + { + "epoch": 0.0626363353069492, + "grad_norm": 0.39860212802886963, + "learning_rate": 9.90350815653974e-05, + "loss": 1.8613, + "step": 603 + }, + { + "epoch": 0.06274020982652956, + "grad_norm": 0.335452139377594, + "learning_rate": 9.903188889538692e-05, + "loss": 1.6605, + "step": 604 + }, + { + "epoch": 0.0628440843461099, + "grad_norm": 0.3518449664115906, + "learning_rate": 9.902869100386273e-05, + "loss": 1.8248, + "step": 605 + }, + { + "epoch": 0.06294795886569024, + "grad_norm": 0.36134716868400574, + "learning_rate": 9.90254878911654e-05, + "loss": 1.8456, + "step": 606 + }, + { + "epoch": 0.0630518333852706, + "grad_norm": 0.3638603687286377, + "learning_rate": 9.902227955763602e-05, + "loss": 1.6454, + "step": 607 + }, + { + "epoch": 0.06315570790485094, + "grad_norm": 0.3790292739868164, + "learning_rate": 9.901906600361625e-05, + "loss": 1.8036, + "step": 608 + }, + { + "epoch": 0.06325958242443129, + "grad_norm": 0.3539649546146393, + "learning_rate": 9.901584722944833e-05, + "loss": 1.8153, + "step": 609 + }, + { + "epoch": 0.06336345694401163, + "grad_norm": 0.33538633584976196, + "learning_rate": 9.9012623235475e-05, + "loss": 1.7256, + "step": 610 + }, + { + "epoch": 0.06346733146359199, + "grad_norm": 0.3604874610900879, + "learning_rate": 9.900939402203962e-05, + "loss": 2.0305, + "step": 611 + }, + { + "epoch": 0.06357120598317233, + "grad_norm": 0.34128594398498535, + "learning_rate": 9.900615958948606e-05, + "loss": 1.7801, + "step": 612 + }, + { + "epoch": 0.06367508050275267, + "grad_norm": 0.3321555554866791, + "learning_rate": 9.900291993815877e-05, + "loss": 1.6963, + "step": 613 + }, + { + "epoch": 0.06377895502233302, + "grad_norm": 0.32713523507118225, + "learning_rate": 9.899967506840273e-05, + "loss": 1.8211, + "step": 614 + }, + { + "epoch": 0.06388282954191336, + "grad_norm": 0.34873101115226746, + "learning_rate": 9.899642498056353e-05, + "loss": 1.8328, + "step": 615 + }, + { + "epoch": 0.06398670406149372, + "grad_norm": 0.36176908016204834, + "learning_rate": 9.899316967498724e-05, + "loss": 1.9433, + "step": 616 + }, + { + "epoch": 0.06409057858107406, + "grad_norm": 0.35845959186553955, + "learning_rate": 9.898990915202055e-05, + "loss": 1.8383, + "step": 617 + }, + { + "epoch": 0.06419445310065441, + "grad_norm": 0.3352903425693512, + "learning_rate": 9.898664341201067e-05, + "loss": 1.8279, + "step": 618 + }, + { + "epoch": 0.06429832762023475, + "grad_norm": 0.3244091272354126, + "learning_rate": 9.898337245530536e-05, + "loss": 1.6484, + "step": 619 + }, + { + "epoch": 0.06440220213981511, + "grad_norm": 0.39862748980522156, + "learning_rate": 9.8980096282253e-05, + "loss": 1.9679, + "step": 620 + }, + { + "epoch": 0.06450607665939545, + "grad_norm": 0.34560245275497437, + "learning_rate": 9.897681489320242e-05, + "loss": 1.8297, + "step": 621 + }, + { + "epoch": 0.06460995117897579, + "grad_norm": 0.37943777441978455, + "learning_rate": 9.897352828850309e-05, + "loss": 1.9845, + "step": 622 + }, + { + "epoch": 0.06471382569855615, + "grad_norm": 0.3497175872325897, + "learning_rate": 9.897023646850502e-05, + "loss": 1.7576, + "step": 623 + }, + { + "epoch": 0.06481770021813649, + "grad_norm": 0.3344300091266632, + "learning_rate": 9.896693943355874e-05, + "loss": 1.7502, + "step": 624 + }, + { + "epoch": 0.06492157473771684, + "grad_norm": 0.3299304246902466, + "learning_rate": 9.896363718401536e-05, + "loss": 1.5323, + "step": 625 + }, + { + "epoch": 0.06502544925729718, + "grad_norm": 0.3126195967197418, + "learning_rate": 9.896032972022656e-05, + "loss": 1.5328, + "step": 626 + }, + { + "epoch": 0.06512932377687754, + "grad_norm": 0.33860689401626587, + "learning_rate": 9.895701704254455e-05, + "loss": 1.7393, + "step": 627 + }, + { + "epoch": 0.06523319829645788, + "grad_norm": 0.332501083612442, + "learning_rate": 9.89536991513221e-05, + "loss": 1.6363, + "step": 628 + }, + { + "epoch": 0.06533707281603822, + "grad_norm": 0.32221221923828125, + "learning_rate": 9.895037604691256e-05, + "loss": 1.6374, + "step": 629 + }, + { + "epoch": 0.06544094733561857, + "grad_norm": 0.381583034992218, + "learning_rate": 9.894704772966977e-05, + "loss": 1.773, + "step": 630 + }, + { + "epoch": 0.06554482185519891, + "grad_norm": 0.36580127477645874, + "learning_rate": 9.894371419994821e-05, + "loss": 1.7417, + "step": 631 + }, + { + "epoch": 0.06564869637477927, + "grad_norm": 0.3486615717411041, + "learning_rate": 9.894037545810288e-05, + "loss": 1.8542, + "step": 632 + }, + { + "epoch": 0.06575257089435961, + "grad_norm": 0.3709062337875366, + "learning_rate": 9.89370315044893e-05, + "loss": 1.917, + "step": 633 + }, + { + "epoch": 0.06585644541393997, + "grad_norm": 0.37561196088790894, + "learning_rate": 9.893368233946359e-05, + "loss": 1.9253, + "step": 634 + }, + { + "epoch": 0.0659603199335203, + "grad_norm": 0.32539495825767517, + "learning_rate": 9.89303279633824e-05, + "loss": 1.6948, + "step": 635 + }, + { + "epoch": 0.06606419445310066, + "grad_norm": 0.32918450236320496, + "learning_rate": 9.892696837660295e-05, + "loss": 1.8504, + "step": 636 + }, + { + "epoch": 0.066168068972681, + "grad_norm": 0.32979822158813477, + "learning_rate": 9.892360357948303e-05, + "loss": 1.7728, + "step": 637 + }, + { + "epoch": 0.06627194349226134, + "grad_norm": 0.39485815167427063, + "learning_rate": 9.892023357238094e-05, + "loss": 1.8216, + "step": 638 + }, + { + "epoch": 0.0663758180118417, + "grad_norm": 0.3529796302318573, + "learning_rate": 9.891685835565557e-05, + "loss": 1.725, + "step": 639 + }, + { + "epoch": 0.06647969253142204, + "grad_norm": 0.34411367774009705, + "learning_rate": 9.891347792966635e-05, + "loss": 1.7718, + "step": 640 + }, + { + "epoch": 0.06658356705100239, + "grad_norm": 0.38490092754364014, + "learning_rate": 9.891009229477326e-05, + "loss": 1.9705, + "step": 641 + }, + { + "epoch": 0.06668744157058273, + "grad_norm": 0.3569811284542084, + "learning_rate": 9.890670145133687e-05, + "loss": 2.0534, + "step": 642 + }, + { + "epoch": 0.06679131609016309, + "grad_norm": 0.3439379334449768, + "learning_rate": 9.890330539971825e-05, + "loss": 1.9179, + "step": 643 + }, + { + "epoch": 0.06689519060974343, + "grad_norm": 0.3299177587032318, + "learning_rate": 9.889990414027907e-05, + "loss": 1.8077, + "step": 644 + }, + { + "epoch": 0.06699906512932377, + "grad_norm": 0.3420833647251129, + "learning_rate": 9.889649767338153e-05, + "loss": 1.6712, + "step": 645 + }, + { + "epoch": 0.06710293964890413, + "grad_norm": 0.3195805549621582, + "learning_rate": 9.88930859993884e-05, + "loss": 1.629, + "step": 646 + }, + { + "epoch": 0.06720681416848447, + "grad_norm": 0.3861030638217926, + "learning_rate": 9.888966911866299e-05, + "loss": 2.02, + "step": 647 + }, + { + "epoch": 0.06731068868806482, + "grad_norm": 0.3432799279689789, + "learning_rate": 9.888624703156918e-05, + "loss": 1.6907, + "step": 648 + }, + { + "epoch": 0.06741456320764516, + "grad_norm": 0.33698904514312744, + "learning_rate": 9.888281973847138e-05, + "loss": 1.6967, + "step": 649 + }, + { + "epoch": 0.06751843772722552, + "grad_norm": 0.3321373462677002, + "learning_rate": 9.887938723973458e-05, + "loss": 1.7385, + "step": 650 + }, + { + "epoch": 0.06762231224680586, + "grad_norm": 0.3467056155204773, + "learning_rate": 9.887594953572432e-05, + "loss": 1.6206, + "step": 651 + }, + { + "epoch": 0.06772618676638621, + "grad_norm": 0.3018084764480591, + "learning_rate": 9.887250662680667e-05, + "loss": 1.6185, + "step": 652 + }, + { + "epoch": 0.06783006128596655, + "grad_norm": 0.37331244349479675, + "learning_rate": 9.886905851334829e-05, + "loss": 1.8004, + "step": 653 + }, + { + "epoch": 0.0679339358055469, + "grad_norm": 0.36824825406074524, + "learning_rate": 9.886560519571637e-05, + "loss": 1.7545, + "step": 654 + }, + { + "epoch": 0.06803781032512725, + "grad_norm": 0.33622682094573975, + "learning_rate": 9.886214667427867e-05, + "loss": 1.7991, + "step": 655 + }, + { + "epoch": 0.06814168484470759, + "grad_norm": 0.3627051115036011, + "learning_rate": 9.885868294940349e-05, + "loss": 1.7288, + "step": 656 + }, + { + "epoch": 0.06824555936428794, + "grad_norm": 0.3434629440307617, + "learning_rate": 9.885521402145967e-05, + "loss": 1.7928, + "step": 657 + }, + { + "epoch": 0.06834943388386829, + "grad_norm": 0.3145413100719452, + "learning_rate": 9.885173989081664e-05, + "loss": 1.6569, + "step": 658 + }, + { + "epoch": 0.06845330840344864, + "grad_norm": 0.3603506088256836, + "learning_rate": 9.88482605578444e-05, + "loss": 1.9213, + "step": 659 + }, + { + "epoch": 0.06855718292302898, + "grad_norm": 0.33825626969337463, + "learning_rate": 9.884477602291343e-05, + "loss": 1.6129, + "step": 660 + }, + { + "epoch": 0.06866105744260932, + "grad_norm": 0.35486510396003723, + "learning_rate": 9.88412862863948e-05, + "loss": 1.8986, + "step": 661 + }, + { + "epoch": 0.06876493196218968, + "grad_norm": 0.3892877399921417, + "learning_rate": 9.883779134866016e-05, + "loss": 2.0618, + "step": 662 + }, + { + "epoch": 0.06886880648177002, + "grad_norm": 0.3327191472053528, + "learning_rate": 9.883429121008169e-05, + "loss": 1.6847, + "step": 663 + }, + { + "epoch": 0.06897268100135037, + "grad_norm": 0.32258662581443787, + "learning_rate": 9.883078587103215e-05, + "loss": 1.7325, + "step": 664 + }, + { + "epoch": 0.06907655552093071, + "grad_norm": 0.3689654767513275, + "learning_rate": 9.882727533188481e-05, + "loss": 2.0511, + "step": 665 + }, + { + "epoch": 0.06918043004051107, + "grad_norm": 0.3166467547416687, + "learning_rate": 9.882375959301349e-05, + "loss": 1.7457, + "step": 666 + }, + { + "epoch": 0.06928430456009141, + "grad_norm": 0.3648569583892822, + "learning_rate": 9.882023865479263e-05, + "loss": 1.9451, + "step": 667 + }, + { + "epoch": 0.06938817907967175, + "grad_norm": 0.35240545868873596, + "learning_rate": 9.881671251759717e-05, + "loss": 1.9018, + "step": 668 + }, + { + "epoch": 0.0694920535992521, + "grad_norm": 0.3445391356945038, + "learning_rate": 9.881318118180258e-05, + "loss": 1.9383, + "step": 669 + }, + { + "epoch": 0.06959592811883245, + "grad_norm": 0.33014625310897827, + "learning_rate": 9.880964464778499e-05, + "loss": 1.6511, + "step": 670 + }, + { + "epoch": 0.0696998026384128, + "grad_norm": 0.3588254451751709, + "learning_rate": 9.880610291592096e-05, + "loss": 1.7165, + "step": 671 + }, + { + "epoch": 0.06980367715799314, + "grad_norm": 0.3672218322753906, + "learning_rate": 9.880255598658767e-05, + "loss": 1.6612, + "step": 672 + }, + { + "epoch": 0.0699075516775735, + "grad_norm": 0.3593609035015106, + "learning_rate": 9.879900386016284e-05, + "loss": 1.8946, + "step": 673 + }, + { + "epoch": 0.07001142619715384, + "grad_norm": 0.45219045877456665, + "learning_rate": 9.879544653702476e-05, + "loss": 1.8398, + "step": 674 + }, + { + "epoch": 0.07011530071673419, + "grad_norm": 0.3960374891757965, + "learning_rate": 9.879188401755222e-05, + "loss": 1.928, + "step": 675 + }, + { + "epoch": 0.07021917523631453, + "grad_norm": 0.38661623001098633, + "learning_rate": 9.878831630212465e-05, + "loss": 1.9605, + "step": 676 + }, + { + "epoch": 0.07032304975589487, + "grad_norm": 0.34862831234931946, + "learning_rate": 9.878474339112193e-05, + "loss": 1.7933, + "step": 677 + }, + { + "epoch": 0.07042692427547523, + "grad_norm": 0.3596080243587494, + "learning_rate": 9.878116528492458e-05, + "loss": 1.7487, + "step": 678 + }, + { + "epoch": 0.07053079879505557, + "grad_norm": 0.3401545584201813, + "learning_rate": 9.877758198391365e-05, + "loss": 1.8613, + "step": 679 + }, + { + "epoch": 0.07063467331463592, + "grad_norm": 0.3352885842323303, + "learning_rate": 9.87739934884707e-05, + "loss": 1.6381, + "step": 680 + }, + { + "epoch": 0.07073854783421626, + "grad_norm": 0.36678406596183777, + "learning_rate": 9.87703997989779e-05, + "loss": 1.944, + "step": 681 + }, + { + "epoch": 0.07084242235379662, + "grad_norm": 0.3366321623325348, + "learning_rate": 9.876680091581796e-05, + "loss": 1.7308, + "step": 682 + }, + { + "epoch": 0.07094629687337696, + "grad_norm": 0.37314730882644653, + "learning_rate": 9.876319683937412e-05, + "loss": 1.8055, + "step": 683 + }, + { + "epoch": 0.0710501713929573, + "grad_norm": 0.3612748384475708, + "learning_rate": 9.875958757003017e-05, + "loss": 1.7043, + "step": 684 + }, + { + "epoch": 0.07115404591253766, + "grad_norm": 0.37414270639419556, + "learning_rate": 9.875597310817049e-05, + "loss": 1.8259, + "step": 685 + }, + { + "epoch": 0.071257920432118, + "grad_norm": 0.3539772033691406, + "learning_rate": 9.875235345417997e-05, + "loss": 1.7541, + "step": 686 + }, + { + "epoch": 0.07136179495169835, + "grad_norm": 0.35429245233535767, + "learning_rate": 9.874872860844411e-05, + "loss": 1.5966, + "step": 687 + }, + { + "epoch": 0.07146566947127869, + "grad_norm": 0.32633036375045776, + "learning_rate": 9.87450985713489e-05, + "loss": 1.7182, + "step": 688 + }, + { + "epoch": 0.07156954399085905, + "grad_norm": 0.316303014755249, + "learning_rate": 9.874146334328092e-05, + "loss": 1.553, + "step": 689 + }, + { + "epoch": 0.07167341851043939, + "grad_norm": 0.39488279819488525, + "learning_rate": 9.873782292462727e-05, + "loss": 1.9359, + "step": 690 + }, + { + "epoch": 0.07177729303001974, + "grad_norm": 0.3820980489253998, + "learning_rate": 9.873417731577568e-05, + "loss": 1.9939, + "step": 691 + }, + { + "epoch": 0.07188116754960008, + "grad_norm": 0.4182794392108917, + "learning_rate": 9.873052651711434e-05, + "loss": 1.8192, + "step": 692 + }, + { + "epoch": 0.07198504206918042, + "grad_norm": 0.3251241147518158, + "learning_rate": 9.872687052903203e-05, + "loss": 1.6569, + "step": 693 + }, + { + "epoch": 0.07208891658876078, + "grad_norm": 0.33113572001457214, + "learning_rate": 9.87232093519181e-05, + "loss": 1.743, + "step": 694 + }, + { + "epoch": 0.07219279110834112, + "grad_norm": 0.3508453369140625, + "learning_rate": 9.871954298616242e-05, + "loss": 1.7822, + "step": 695 + }, + { + "epoch": 0.07229666562792147, + "grad_norm": 0.36456069350242615, + "learning_rate": 9.871587143215545e-05, + "loss": 1.7646, + "step": 696 + }, + { + "epoch": 0.07240054014750182, + "grad_norm": 0.3450940251350403, + "learning_rate": 9.871219469028816e-05, + "loss": 1.798, + "step": 697 + }, + { + "epoch": 0.07250441466708217, + "grad_norm": 0.3370673358440399, + "learning_rate": 9.870851276095212e-05, + "loss": 1.757, + "step": 698 + }, + { + "epoch": 0.07260828918666251, + "grad_norm": 0.34746289253234863, + "learning_rate": 9.87048256445394e-05, + "loss": 1.6328, + "step": 699 + }, + { + "epoch": 0.07271216370624285, + "grad_norm": 0.3589167296886444, + "learning_rate": 9.870113334144267e-05, + "loss": 1.7512, + "step": 700 + }, + { + "epoch": 0.07281603822582321, + "grad_norm": 0.4009071886539459, + "learning_rate": 9.869743585205511e-05, + "loss": 1.8032, + "step": 701 + }, + { + "epoch": 0.07291991274540355, + "grad_norm": 0.38780277967453003, + "learning_rate": 9.869373317677049e-05, + "loss": 1.8526, + "step": 702 + }, + { + "epoch": 0.0730237872649839, + "grad_norm": 0.37040579319000244, + "learning_rate": 9.869002531598312e-05, + "loss": 1.8994, + "step": 703 + }, + { + "epoch": 0.07312766178456424, + "grad_norm": 0.3585062325000763, + "learning_rate": 9.868631227008785e-05, + "loss": 1.853, + "step": 704 + }, + { + "epoch": 0.0732315363041446, + "grad_norm": 0.3617033362388611, + "learning_rate": 9.868259403948008e-05, + "loss": 1.9527, + "step": 705 + }, + { + "epoch": 0.07333541082372494, + "grad_norm": 0.35170087218284607, + "learning_rate": 9.867887062455579e-05, + "loss": 1.7766, + "step": 706 + }, + { + "epoch": 0.0734392853433053, + "grad_norm": 0.34852275252342224, + "learning_rate": 9.867514202571149e-05, + "loss": 1.5639, + "step": 707 + }, + { + "epoch": 0.07354315986288563, + "grad_norm": 0.3750884532928467, + "learning_rate": 9.867140824334424e-05, + "loss": 1.9035, + "step": 708 + }, + { + "epoch": 0.07364703438246598, + "grad_norm": 0.37743544578552246, + "learning_rate": 9.866766927785167e-05, + "loss": 1.8442, + "step": 709 + }, + { + "epoch": 0.07375090890204633, + "grad_norm": 0.3599265217781067, + "learning_rate": 9.866392512963195e-05, + "loss": 1.6519, + "step": 710 + }, + { + "epoch": 0.07385478342162667, + "grad_norm": 0.35439547896385193, + "learning_rate": 9.86601757990838e-05, + "loss": 1.8225, + "step": 711 + }, + { + "epoch": 0.07395865794120703, + "grad_norm": 0.35483318567276, + "learning_rate": 9.865642128660647e-05, + "loss": 1.778, + "step": 712 + }, + { + "epoch": 0.07406253246078737, + "grad_norm": 0.39381521940231323, + "learning_rate": 9.865266159259981e-05, + "loss": 1.9915, + "step": 713 + }, + { + "epoch": 0.07416640698036772, + "grad_norm": 0.4099736213684082, + "learning_rate": 9.864889671746421e-05, + "loss": 2.0117, + "step": 714 + }, + { + "epoch": 0.07427028149994806, + "grad_norm": 0.3895813822746277, + "learning_rate": 9.864512666160058e-05, + "loss": 2.2284, + "step": 715 + }, + { + "epoch": 0.0743741560195284, + "grad_norm": 0.3480944037437439, + "learning_rate": 9.86413514254104e-05, + "loss": 1.7284, + "step": 716 + }, + { + "epoch": 0.07447803053910876, + "grad_norm": 0.3395494818687439, + "learning_rate": 9.863757100929573e-05, + "loss": 1.7128, + "step": 717 + }, + { + "epoch": 0.0745819050586891, + "grad_norm": 0.4045158624649048, + "learning_rate": 9.863378541365912e-05, + "loss": 1.8116, + "step": 718 + }, + { + "epoch": 0.07468577957826945, + "grad_norm": 0.34248921275138855, + "learning_rate": 9.862999463890372e-05, + "loss": 1.9717, + "step": 719 + }, + { + "epoch": 0.0747896540978498, + "grad_norm": 0.33910906314849854, + "learning_rate": 9.862619868543322e-05, + "loss": 1.886, + "step": 720 + }, + { + "epoch": 0.07489352861743015, + "grad_norm": 0.33199310302734375, + "learning_rate": 9.862239755365186e-05, + "loss": 1.7202, + "step": 721 + }, + { + "epoch": 0.07499740313701049, + "grad_norm": 0.39172807335853577, + "learning_rate": 9.861859124396444e-05, + "loss": 1.9074, + "step": 722 + }, + { + "epoch": 0.07510127765659085, + "grad_norm": 0.38816559314727783, + "learning_rate": 9.86147797567763e-05, + "loss": 1.963, + "step": 723 + }, + { + "epoch": 0.07520515217617119, + "grad_norm": 0.39771151542663574, + "learning_rate": 9.86109630924933e-05, + "loss": 2.119, + "step": 724 + }, + { + "epoch": 0.07530902669575153, + "grad_norm": 0.34102803468704224, + "learning_rate": 9.860714125152191e-05, + "loss": 1.8536, + "step": 725 + }, + { + "epoch": 0.07541290121533188, + "grad_norm": 0.35469919443130493, + "learning_rate": 9.860331423426914e-05, + "loss": 1.7498, + "step": 726 + }, + { + "epoch": 0.07551677573491222, + "grad_norm": 0.3550052046775818, + "learning_rate": 9.859948204114253e-05, + "loss": 1.9569, + "step": 727 + }, + { + "epoch": 0.07562065025449258, + "grad_norm": 0.3915771245956421, + "learning_rate": 9.859564467255015e-05, + "loss": 2.0232, + "step": 728 + }, + { + "epoch": 0.07572452477407292, + "grad_norm": 0.326684832572937, + "learning_rate": 9.859180212890069e-05, + "loss": 1.6597, + "step": 729 + }, + { + "epoch": 0.07582839929365327, + "grad_norm": 0.33311307430267334, + "learning_rate": 9.858795441060333e-05, + "loss": 1.8534, + "step": 730 + }, + { + "epoch": 0.07593227381323361, + "grad_norm": 0.3527379631996155, + "learning_rate": 9.858410151806783e-05, + "loss": 1.8047, + "step": 731 + }, + { + "epoch": 0.07603614833281395, + "grad_norm": 0.3848365545272827, + "learning_rate": 9.85802434517045e-05, + "loss": 1.7222, + "step": 732 + }, + { + "epoch": 0.07614002285239431, + "grad_norm": 0.3726034462451935, + "learning_rate": 9.857638021192417e-05, + "loss": 1.9158, + "step": 733 + }, + { + "epoch": 0.07624389737197465, + "grad_norm": 0.4738229513168335, + "learning_rate": 9.857251179913824e-05, + "loss": 2.1136, + "step": 734 + }, + { + "epoch": 0.076347771891555, + "grad_norm": 0.3687533438205719, + "learning_rate": 9.856863821375872e-05, + "loss": 1.802, + "step": 735 + }, + { + "epoch": 0.07645164641113535, + "grad_norm": 0.35550445318222046, + "learning_rate": 9.856475945619806e-05, + "loss": 1.8454, + "step": 736 + }, + { + "epoch": 0.0765555209307157, + "grad_norm": 0.3472285866737366, + "learning_rate": 9.856087552686932e-05, + "loss": 1.7785, + "step": 737 + }, + { + "epoch": 0.07665939545029604, + "grad_norm": 0.3866637051105499, + "learning_rate": 9.855698642618616e-05, + "loss": 1.9353, + "step": 738 + }, + { + "epoch": 0.07676326996987638, + "grad_norm": 0.3777254819869995, + "learning_rate": 9.855309215456268e-05, + "loss": 1.9781, + "step": 739 + }, + { + "epoch": 0.07686714448945674, + "grad_norm": 0.413335919380188, + "learning_rate": 9.854919271241363e-05, + "loss": 1.917, + "step": 740 + }, + { + "epoch": 0.07697101900903708, + "grad_norm": 0.3310924470424652, + "learning_rate": 9.854528810015425e-05, + "loss": 1.6168, + "step": 741 + }, + { + "epoch": 0.07707489352861743, + "grad_norm": 0.34101495146751404, + "learning_rate": 9.854137831820035e-05, + "loss": 1.7935, + "step": 742 + }, + { + "epoch": 0.07717876804819777, + "grad_norm": 0.34622523188591003, + "learning_rate": 9.85374633669683e-05, + "loss": 1.7695, + "step": 743 + }, + { + "epoch": 0.07728264256777813, + "grad_norm": 0.39640340209007263, + "learning_rate": 9.8533543246875e-05, + "loss": 1.8163, + "step": 744 + }, + { + "epoch": 0.07738651708735847, + "grad_norm": 0.35431715846061707, + "learning_rate": 9.852961795833793e-05, + "loss": 1.7706, + "step": 745 + }, + { + "epoch": 0.07749039160693882, + "grad_norm": 0.37833094596862793, + "learning_rate": 9.852568750177508e-05, + "loss": 1.6099, + "step": 746 + }, + { + "epoch": 0.07759426612651917, + "grad_norm": 0.3214152753353119, + "learning_rate": 9.852175187760504e-05, + "loss": 1.5333, + "step": 747 + }, + { + "epoch": 0.0776981406460995, + "grad_norm": 0.3399621546268463, + "learning_rate": 9.851781108624692e-05, + "loss": 1.7676, + "step": 748 + }, + { + "epoch": 0.07780201516567986, + "grad_norm": 0.3282534182071686, + "learning_rate": 9.851386512812036e-05, + "loss": 1.8172, + "step": 749 + }, + { + "epoch": 0.0779058896852602, + "grad_norm": 0.37723109126091003, + "learning_rate": 9.850991400364557e-05, + "loss": 1.8644, + "step": 750 + }, + { + "epoch": 0.07800976420484056, + "grad_norm": 0.36292698979377747, + "learning_rate": 9.850595771324335e-05, + "loss": 1.8879, + "step": 751 + }, + { + "epoch": 0.0781136387244209, + "grad_norm": 0.3753519654273987, + "learning_rate": 9.850199625733501e-05, + "loss": 1.9628, + "step": 752 + }, + { + "epoch": 0.07821751324400125, + "grad_norm": 0.33497151732444763, + "learning_rate": 9.849802963634238e-05, + "loss": 1.8383, + "step": 753 + }, + { + "epoch": 0.07832138776358159, + "grad_norm": 0.3661734163761139, + "learning_rate": 9.849405785068789e-05, + "loss": 1.903, + "step": 754 + }, + { + "epoch": 0.07842526228316193, + "grad_norm": 0.33177104592323303, + "learning_rate": 9.84900809007945e-05, + "loss": 1.7703, + "step": 755 + }, + { + "epoch": 0.07852913680274229, + "grad_norm": 0.41701647639274597, + "learning_rate": 9.848609878708575e-05, + "loss": 2.017, + "step": 756 + }, + { + "epoch": 0.07863301132232263, + "grad_norm": 0.3774707317352295, + "learning_rate": 9.848211150998568e-05, + "loss": 1.8528, + "step": 757 + }, + { + "epoch": 0.07873688584190298, + "grad_norm": 0.3493809700012207, + "learning_rate": 9.84781190699189e-05, + "loss": 1.8787, + "step": 758 + }, + { + "epoch": 0.07884076036148333, + "grad_norm": 0.4020559787750244, + "learning_rate": 9.847412146731058e-05, + "loss": 2.0583, + "step": 759 + }, + { + "epoch": 0.07894463488106368, + "grad_norm": 0.37172991037368774, + "learning_rate": 9.847011870258644e-05, + "loss": 1.9294, + "step": 760 + }, + { + "epoch": 0.07904850940064402, + "grad_norm": 0.3182731568813324, + "learning_rate": 9.846611077617274e-05, + "loss": 1.7817, + "step": 761 + }, + { + "epoch": 0.07915238392022438, + "grad_norm": 0.4014347791671753, + "learning_rate": 9.84620976884963e-05, + "loss": 1.8936, + "step": 762 + }, + { + "epoch": 0.07925625843980472, + "grad_norm": 0.317242294549942, + "learning_rate": 9.845807943998448e-05, + "loss": 1.6904, + "step": 763 + }, + { + "epoch": 0.07936013295938506, + "grad_norm": 0.3695957064628601, + "learning_rate": 9.845405603106518e-05, + "loss": 1.9056, + "step": 764 + }, + { + "epoch": 0.07946400747896541, + "grad_norm": 0.3249827027320862, + "learning_rate": 9.845002746216687e-05, + "loss": 1.8097, + "step": 765 + }, + { + "epoch": 0.07956788199854575, + "grad_norm": 0.3424146771430969, + "learning_rate": 9.844599373371855e-05, + "loss": 1.6326, + "step": 766 + }, + { + "epoch": 0.07967175651812611, + "grad_norm": 0.35552307963371277, + "learning_rate": 9.84419548461498e-05, + "loss": 1.9049, + "step": 767 + }, + { + "epoch": 0.07977563103770645, + "grad_norm": 0.3346835970878601, + "learning_rate": 9.843791079989071e-05, + "loss": 1.7215, + "step": 768 + }, + { + "epoch": 0.0798795055572868, + "grad_norm": 0.339160680770874, + "learning_rate": 9.843386159537196e-05, + "loss": 1.6931, + "step": 769 + }, + { + "epoch": 0.07998338007686714, + "grad_norm": 0.3619834780693054, + "learning_rate": 9.842980723302476e-05, + "loss": 1.7751, + "step": 770 + }, + { + "epoch": 0.08008725459644749, + "grad_norm": 0.37088942527770996, + "learning_rate": 9.842574771328084e-05, + "loss": 1.7259, + "step": 771 + }, + { + "epoch": 0.08019112911602784, + "grad_norm": 0.3355197012424469, + "learning_rate": 9.842168303657253e-05, + "loss": 1.8242, + "step": 772 + }, + { + "epoch": 0.08029500363560818, + "grad_norm": 0.37746527791023254, + "learning_rate": 9.841761320333266e-05, + "loss": 1.7493, + "step": 773 + }, + { + "epoch": 0.08039887815518854, + "grad_norm": 0.3684033751487732, + "learning_rate": 9.841353821399469e-05, + "loss": 1.7955, + "step": 774 + }, + { + "epoch": 0.08050275267476888, + "grad_norm": 0.3551725447177887, + "learning_rate": 9.840945806899253e-05, + "loss": 1.9765, + "step": 775 + }, + { + "epoch": 0.08060662719434923, + "grad_norm": 0.32390061020851135, + "learning_rate": 9.840537276876069e-05, + "loss": 1.6581, + "step": 776 + }, + { + "epoch": 0.08071050171392957, + "grad_norm": 0.3525629937648773, + "learning_rate": 9.840128231373422e-05, + "loss": 1.8436, + "step": 777 + }, + { + "epoch": 0.08081437623350993, + "grad_norm": 0.3583666682243347, + "learning_rate": 9.839718670434873e-05, + "loss": 1.8948, + "step": 778 + }, + { + "epoch": 0.08091825075309027, + "grad_norm": 0.3378392457962036, + "learning_rate": 9.839308594104038e-05, + "loss": 1.7955, + "step": 779 + }, + { + "epoch": 0.08102212527267061, + "grad_norm": 0.3844630718231201, + "learning_rate": 9.838898002424585e-05, + "loss": 1.987, + "step": 780 + }, + { + "epoch": 0.08112599979225096, + "grad_norm": 0.41022759675979614, + "learning_rate": 9.83848689544024e-05, + "loss": 1.959, + "step": 781 + }, + { + "epoch": 0.0812298743118313, + "grad_norm": 0.3520353436470032, + "learning_rate": 9.83807527319478e-05, + "loss": 1.8185, + "step": 782 + }, + { + "epoch": 0.08133374883141166, + "grad_norm": 0.36484450101852417, + "learning_rate": 9.837663135732045e-05, + "loss": 1.8079, + "step": 783 + }, + { + "epoch": 0.081437623350992, + "grad_norm": 0.34820452332496643, + "learning_rate": 9.83725048309592e-05, + "loss": 1.7167, + "step": 784 + }, + { + "epoch": 0.08154149787057235, + "grad_norm": 0.35253769159317017, + "learning_rate": 9.83683731533035e-05, + "loss": 1.8118, + "step": 785 + }, + { + "epoch": 0.0816453723901527, + "grad_norm": 0.3355175256729126, + "learning_rate": 9.836423632479335e-05, + "loss": 1.7767, + "step": 786 + }, + { + "epoch": 0.08174924690973304, + "grad_norm": 0.3633790612220764, + "learning_rate": 9.836009434586929e-05, + "loss": 1.9734, + "step": 787 + }, + { + "epoch": 0.08185312142931339, + "grad_norm": 0.36171820759773254, + "learning_rate": 9.835594721697241e-05, + "loss": 1.8049, + "step": 788 + }, + { + "epoch": 0.08195699594889373, + "grad_norm": 0.329885333776474, + "learning_rate": 9.835179493854433e-05, + "loss": 1.8358, + "step": 789 + }, + { + "epoch": 0.08206087046847409, + "grad_norm": 0.33959150314331055, + "learning_rate": 9.834763751102726e-05, + "loss": 1.6816, + "step": 790 + }, + { + "epoch": 0.08216474498805443, + "grad_norm": 0.3971327245235443, + "learning_rate": 9.834347493486391e-05, + "loss": 1.8448, + "step": 791 + }, + { + "epoch": 0.08226861950763478, + "grad_norm": 0.3603891134262085, + "learning_rate": 9.833930721049758e-05, + "loss": 1.9219, + "step": 792 + }, + { + "epoch": 0.08237249402721512, + "grad_norm": 0.3858663737773895, + "learning_rate": 9.833513433837209e-05, + "loss": 2.0203, + "step": 793 + }, + { + "epoch": 0.08247636854679546, + "grad_norm": 0.36965909600257874, + "learning_rate": 9.833095631893183e-05, + "loss": 1.5018, + "step": 794 + }, + { + "epoch": 0.08258024306637582, + "grad_norm": 0.3826972544193268, + "learning_rate": 9.832677315262171e-05, + "loss": 1.9229, + "step": 795 + }, + { + "epoch": 0.08268411758595616, + "grad_norm": 0.3521627187728882, + "learning_rate": 9.832258483988721e-05, + "loss": 1.8676, + "step": 796 + }, + { + "epoch": 0.08278799210553651, + "grad_norm": 0.38269051909446716, + "learning_rate": 9.831839138117435e-05, + "loss": 1.8478, + "step": 797 + }, + { + "epoch": 0.08289186662511686, + "grad_norm": 0.33963099122047424, + "learning_rate": 9.831419277692972e-05, + "loss": 1.7087, + "step": 798 + }, + { + "epoch": 0.08299574114469721, + "grad_norm": 0.3614671528339386, + "learning_rate": 9.830998902760043e-05, + "loss": 1.8661, + "step": 799 + }, + { + "epoch": 0.08309961566427755, + "grad_norm": 0.3493926227092743, + "learning_rate": 9.830578013363411e-05, + "loss": 1.8653, + "step": 800 + }, + { + "epoch": 0.0832034901838579, + "grad_norm": 0.3648037314414978, + "learning_rate": 9.830156609547905e-05, + "loss": 1.8252, + "step": 801 + }, + { + "epoch": 0.08330736470343825, + "grad_norm": 0.35134628415107727, + "learning_rate": 9.829734691358393e-05, + "loss": 1.8055, + "step": 802 + }, + { + "epoch": 0.08341123922301859, + "grad_norm": 0.3299328088760376, + "learning_rate": 9.829312258839811e-05, + "loss": 1.7022, + "step": 803 + }, + { + "epoch": 0.08351511374259894, + "grad_norm": 0.39070454239845276, + "learning_rate": 9.828889312037143e-05, + "loss": 2.0613, + "step": 804 + }, + { + "epoch": 0.08361898826217928, + "grad_norm": 0.3565296232700348, + "learning_rate": 9.828465850995433e-05, + "loss": 1.973, + "step": 805 + }, + { + "epoch": 0.08372286278175964, + "grad_norm": 0.35673460364341736, + "learning_rate": 9.82804187575977e-05, + "loss": 1.8557, + "step": 806 + }, + { + "epoch": 0.08382673730133998, + "grad_norm": 0.34928351640701294, + "learning_rate": 9.827617386375307e-05, + "loss": 1.9153, + "step": 807 + }, + { + "epoch": 0.08393061182092033, + "grad_norm": 0.35845404863357544, + "learning_rate": 9.827192382887251e-05, + "loss": 1.8209, + "step": 808 + }, + { + "epoch": 0.08403448634050067, + "grad_norm": 0.3234979510307312, + "learning_rate": 9.82676686534086e-05, + "loss": 1.6632, + "step": 809 + }, + { + "epoch": 0.08413836086008102, + "grad_norm": 0.3533652722835541, + "learning_rate": 9.826340833781447e-05, + "loss": 1.7217, + "step": 810 + }, + { + "epoch": 0.08424223537966137, + "grad_norm": 0.3984024226665497, + "learning_rate": 9.825914288254383e-05, + "loss": 1.9889, + "step": 811 + }, + { + "epoch": 0.08434610989924171, + "grad_norm": 0.3283498287200928, + "learning_rate": 9.82548722880509e-05, + "loss": 1.7551, + "step": 812 + }, + { + "epoch": 0.08444998441882207, + "grad_norm": 0.3530416190624237, + "learning_rate": 9.825059655479047e-05, + "loss": 1.5998, + "step": 813 + }, + { + "epoch": 0.08455385893840241, + "grad_norm": 0.377096951007843, + "learning_rate": 9.82463156832179e-05, + "loss": 1.93, + "step": 814 + }, + { + "epoch": 0.08465773345798276, + "grad_norm": 0.3375512361526489, + "learning_rate": 9.824202967378903e-05, + "loss": 1.7851, + "step": 815 + }, + { + "epoch": 0.0847616079775631, + "grad_norm": 0.3516164720058441, + "learning_rate": 9.82377385269603e-05, + "loss": 1.7856, + "step": 816 + }, + { + "epoch": 0.08486548249714346, + "grad_norm": 0.34516972303390503, + "learning_rate": 9.823344224318869e-05, + "loss": 1.7032, + "step": 817 + }, + { + "epoch": 0.0849693570167238, + "grad_norm": 0.33883360028266907, + "learning_rate": 9.822914082293171e-05, + "loss": 1.8525, + "step": 818 + }, + { + "epoch": 0.08507323153630414, + "grad_norm": 0.3980461657047272, + "learning_rate": 9.822483426664744e-05, + "loss": 1.8626, + "step": 819 + }, + { + "epoch": 0.0851771060558845, + "grad_norm": 0.36254748702049255, + "learning_rate": 9.822052257479448e-05, + "loss": 1.7556, + "step": 820 + }, + { + "epoch": 0.08528098057546483, + "grad_norm": 0.33397209644317627, + "learning_rate": 9.821620574783202e-05, + "loss": 1.713, + "step": 821 + }, + { + "epoch": 0.08538485509504519, + "grad_norm": 0.3705761432647705, + "learning_rate": 9.821188378621973e-05, + "loss": 2.0036, + "step": 822 + }, + { + "epoch": 0.08548872961462553, + "grad_norm": 0.40211713314056396, + "learning_rate": 9.820755669041787e-05, + "loss": 1.8456, + "step": 823 + }, + { + "epoch": 0.08559260413420589, + "grad_norm": 0.3149651288986206, + "learning_rate": 9.820322446088729e-05, + "loss": 1.6277, + "step": 824 + }, + { + "epoch": 0.08569647865378623, + "grad_norm": 0.36002975702285767, + "learning_rate": 9.819888709808929e-05, + "loss": 1.8758, + "step": 825 + }, + { + "epoch": 0.08580035317336657, + "grad_norm": 0.40512141585350037, + "learning_rate": 9.819454460248578e-05, + "loss": 1.8693, + "step": 826 + }, + { + "epoch": 0.08590422769294692, + "grad_norm": 0.33460405468940735, + "learning_rate": 9.81901969745392e-05, + "loss": 1.8883, + "step": 827 + }, + { + "epoch": 0.08600810221252726, + "grad_norm": 0.3666040301322937, + "learning_rate": 9.818584421471254e-05, + "loss": 1.7379, + "step": 828 + }, + { + "epoch": 0.08611197673210762, + "grad_norm": 0.3242300748825073, + "learning_rate": 9.818148632346934e-05, + "loss": 1.67, + "step": 829 + }, + { + "epoch": 0.08621585125168796, + "grad_norm": 0.37964361906051636, + "learning_rate": 9.817712330127367e-05, + "loss": 1.9572, + "step": 830 + }, + { + "epoch": 0.08631972577126831, + "grad_norm": 0.3320227265357971, + "learning_rate": 9.817275514859017e-05, + "loss": 1.7383, + "step": 831 + }, + { + "epoch": 0.08642360029084865, + "grad_norm": 0.3565314710140228, + "learning_rate": 9.8168381865884e-05, + "loss": 1.9483, + "step": 832 + }, + { + "epoch": 0.08652747481042901, + "grad_norm": 0.35897499322891235, + "learning_rate": 9.81640034536209e-05, + "loss": 1.7132, + "step": 833 + }, + { + "epoch": 0.08663134933000935, + "grad_norm": 0.37124866247177124, + "learning_rate": 9.815961991226711e-05, + "loss": 2.0216, + "step": 834 + }, + { + "epoch": 0.08673522384958969, + "grad_norm": 0.36309394240379333, + "learning_rate": 9.815523124228949e-05, + "loss": 1.7324, + "step": 835 + }, + { + "epoch": 0.08683909836917005, + "grad_norm": 0.35258007049560547, + "learning_rate": 9.815083744415534e-05, + "loss": 1.7634, + "step": 836 + }, + { + "epoch": 0.08694297288875039, + "grad_norm": 0.335984468460083, + "learning_rate": 9.81464385183326e-05, + "loss": 1.4646, + "step": 837 + }, + { + "epoch": 0.08704684740833074, + "grad_norm": 0.31050655245780945, + "learning_rate": 9.814203446528971e-05, + "loss": 1.3127, + "step": 838 + }, + { + "epoch": 0.08715072192791108, + "grad_norm": 0.3857136368751526, + "learning_rate": 9.813762528549568e-05, + "loss": 1.9647, + "step": 839 + }, + { + "epoch": 0.08725459644749144, + "grad_norm": 0.35393643379211426, + "learning_rate": 9.813321097942005e-05, + "loss": 1.7981, + "step": 840 + }, + { + "epoch": 0.08735847096707178, + "grad_norm": 0.3497065305709839, + "learning_rate": 9.812879154753289e-05, + "loss": 1.8632, + "step": 841 + }, + { + "epoch": 0.08746234548665212, + "grad_norm": 0.34144020080566406, + "learning_rate": 9.812436699030486e-05, + "loss": 1.653, + "step": 842 + }, + { + "epoch": 0.08756622000623247, + "grad_norm": 0.375434935092926, + "learning_rate": 9.811993730820712e-05, + "loss": 1.798, + "step": 843 + }, + { + "epoch": 0.08767009452581281, + "grad_norm": 0.33210819959640503, + "learning_rate": 9.811550250171142e-05, + "loss": 1.7409, + "step": 844 + }, + { + "epoch": 0.08777396904539317, + "grad_norm": 0.3796077072620392, + "learning_rate": 9.811106257129001e-05, + "loss": 1.8968, + "step": 845 + }, + { + "epoch": 0.08787784356497351, + "grad_norm": 0.3507767915725708, + "learning_rate": 9.810661751741573e-05, + "loss": 1.7981, + "step": 846 + }, + { + "epoch": 0.08798171808455386, + "grad_norm": 0.3411443829536438, + "learning_rate": 9.810216734056192e-05, + "loss": 1.5857, + "step": 847 + }, + { + "epoch": 0.0880855926041342, + "grad_norm": 0.3522975444793701, + "learning_rate": 9.809771204120248e-05, + "loss": 1.7766, + "step": 848 + }, + { + "epoch": 0.08818946712371455, + "grad_norm": 0.35262516140937805, + "learning_rate": 9.809325161981191e-05, + "loss": 1.6296, + "step": 849 + }, + { + "epoch": 0.0882933416432949, + "grad_norm": 0.36089739203453064, + "learning_rate": 9.808878607686518e-05, + "loss": 1.8433, + "step": 850 + }, + { + "epoch": 0.08839721616287524, + "grad_norm": 0.3271440863609314, + "learning_rate": 9.808431541283783e-05, + "loss": 1.8521, + "step": 851 + }, + { + "epoch": 0.0885010906824556, + "grad_norm": 0.39304596185684204, + "learning_rate": 9.807983962820597e-05, + "loss": 2.0822, + "step": 852 + }, + { + "epoch": 0.08860496520203594, + "grad_norm": 0.3730714023113251, + "learning_rate": 9.807535872344622e-05, + "loss": 1.8393, + "step": 853 + }, + { + "epoch": 0.08870883972161629, + "grad_norm": 0.37659719586372375, + "learning_rate": 9.807087269903577e-05, + "loss": 1.921, + "step": 854 + }, + { + "epoch": 0.08881271424119663, + "grad_norm": 0.3870750367641449, + "learning_rate": 9.806638155545236e-05, + "loss": 2.0497, + "step": 855 + }, + { + "epoch": 0.08891658876077699, + "grad_norm": 0.3502962291240692, + "learning_rate": 9.806188529317423e-05, + "loss": 1.849, + "step": 856 + }, + { + "epoch": 0.08902046328035733, + "grad_norm": 0.3573035001754761, + "learning_rate": 9.805738391268021e-05, + "loss": 1.6662, + "step": 857 + }, + { + "epoch": 0.08912433779993767, + "grad_norm": 0.46759751439094543, + "learning_rate": 9.805287741444968e-05, + "loss": 1.5148, + "step": 858 + }, + { + "epoch": 0.08922821231951802, + "grad_norm": 0.3513263463973999, + "learning_rate": 9.804836579896253e-05, + "loss": 1.9624, + "step": 859 + }, + { + "epoch": 0.08933208683909837, + "grad_norm": 0.3362573981285095, + "learning_rate": 9.80438490666992e-05, + "loss": 1.7757, + "step": 860 + }, + { + "epoch": 0.08943596135867872, + "grad_norm": 0.3206601142883301, + "learning_rate": 9.803932721814071e-05, + "loss": 1.692, + "step": 861 + }, + { + "epoch": 0.08953983587825906, + "grad_norm": 0.3470083773136139, + "learning_rate": 9.803480025376858e-05, + "loss": 1.8445, + "step": 862 + }, + { + "epoch": 0.08964371039783942, + "grad_norm": 0.3480392098426819, + "learning_rate": 9.803026817406493e-05, + "loss": 1.8338, + "step": 863 + }, + { + "epoch": 0.08974758491741976, + "grad_norm": 0.3229011297225952, + "learning_rate": 9.802573097951235e-05, + "loss": 1.7572, + "step": 864 + }, + { + "epoch": 0.0898514594370001, + "grad_norm": 0.3733616769313812, + "learning_rate": 9.802118867059407e-05, + "loss": 1.8358, + "step": 865 + }, + { + "epoch": 0.08995533395658045, + "grad_norm": 0.35253623127937317, + "learning_rate": 9.801664124779373e-05, + "loss": 1.8347, + "step": 866 + }, + { + "epoch": 0.09005920847616079, + "grad_norm": 0.3676695227622986, + "learning_rate": 9.801208871159567e-05, + "loss": 1.8158, + "step": 867 + }, + { + "epoch": 0.09016308299574115, + "grad_norm": 0.3626701235771179, + "learning_rate": 9.800753106248468e-05, + "loss": 1.9947, + "step": 868 + }, + { + "epoch": 0.09026695751532149, + "grad_norm": 0.3680635392665863, + "learning_rate": 9.800296830094609e-05, + "loss": 1.7856, + "step": 869 + }, + { + "epoch": 0.09037083203490184, + "grad_norm": 0.34749433398246765, + "learning_rate": 9.79984004274658e-05, + "loss": 1.7665, + "step": 870 + }, + { + "epoch": 0.09047470655448218, + "grad_norm": 0.3646162152290344, + "learning_rate": 9.799382744253029e-05, + "loss": 1.715, + "step": 871 + }, + { + "epoch": 0.09057858107406254, + "grad_norm": 0.37328121066093445, + "learning_rate": 9.79892493466265e-05, + "loss": 1.7945, + "step": 872 + }, + { + "epoch": 0.09068245559364288, + "grad_norm": 0.35278576612472534, + "learning_rate": 9.7984666140242e-05, + "loss": 1.7249, + "step": 873 + }, + { + "epoch": 0.09078633011322322, + "grad_norm": 0.33178964257240295, + "learning_rate": 9.798007782386485e-05, + "loss": 1.6934, + "step": 874 + }, + { + "epoch": 0.09089020463280358, + "grad_norm": 0.3599662780761719, + "learning_rate": 9.797548439798368e-05, + "loss": 1.9192, + "step": 875 + }, + { + "epoch": 0.09099407915238392, + "grad_norm": 0.37787213921546936, + "learning_rate": 9.797088586308762e-05, + "loss": 1.9863, + "step": 876 + }, + { + "epoch": 0.09109795367196427, + "grad_norm": 0.31273946166038513, + "learning_rate": 9.796628221966642e-05, + "loss": 1.6311, + "step": 877 + }, + { + "epoch": 0.09120182819154461, + "grad_norm": 0.33196964859962463, + "learning_rate": 9.796167346821032e-05, + "loss": 1.6965, + "step": 878 + }, + { + "epoch": 0.09130570271112497, + "grad_norm": 0.3330535590648651, + "learning_rate": 9.79570596092101e-05, + "loss": 1.745, + "step": 879 + }, + { + "epoch": 0.09140957723070531, + "grad_norm": 0.3526756465435028, + "learning_rate": 9.795244064315712e-05, + "loss": 1.886, + "step": 880 + }, + { + "epoch": 0.09151345175028565, + "grad_norm": 0.3859866261482239, + "learning_rate": 9.794781657054324e-05, + "loss": 1.8935, + "step": 881 + }, + { + "epoch": 0.091617326269866, + "grad_norm": 0.3710609972476959, + "learning_rate": 9.794318739186092e-05, + "loss": 1.9279, + "step": 882 + }, + { + "epoch": 0.09172120078944634, + "grad_norm": 0.37800589203834534, + "learning_rate": 9.79385531076031e-05, + "loss": 1.9028, + "step": 883 + }, + { + "epoch": 0.0918250753090267, + "grad_norm": 0.36296647787094116, + "learning_rate": 9.793391371826332e-05, + "loss": 1.9369, + "step": 884 + }, + { + "epoch": 0.09192894982860704, + "grad_norm": 0.3620465397834778, + "learning_rate": 9.792926922433563e-05, + "loss": 1.9072, + "step": 885 + }, + { + "epoch": 0.0920328243481874, + "grad_norm": 0.35136178135871887, + "learning_rate": 9.792461962631462e-05, + "loss": 1.6753, + "step": 886 + }, + { + "epoch": 0.09213669886776774, + "grad_norm": 0.33308306336402893, + "learning_rate": 9.791996492469544e-05, + "loss": 1.8044, + "step": 887 + }, + { + "epoch": 0.09224057338734809, + "grad_norm": 0.36434537172317505, + "learning_rate": 9.79153051199738e-05, + "loss": 1.6905, + "step": 888 + }, + { + "epoch": 0.09234444790692843, + "grad_norm": 0.3682979345321655, + "learning_rate": 9.791064021264591e-05, + "loss": 1.9002, + "step": 889 + }, + { + "epoch": 0.09244832242650877, + "grad_norm": 0.39494451880455017, + "learning_rate": 9.790597020320856e-05, + "loss": 1.8316, + "step": 890 + }, + { + "epoch": 0.09255219694608913, + "grad_norm": 0.3634776473045349, + "learning_rate": 9.790129509215907e-05, + "loss": 1.7874, + "step": 891 + }, + { + "epoch": 0.09265607146566947, + "grad_norm": 0.49316370487213135, + "learning_rate": 9.789661487999528e-05, + "loss": 1.8666, + "step": 892 + }, + { + "epoch": 0.09275994598524982, + "grad_norm": 0.4129713475704193, + "learning_rate": 9.789192956721563e-05, + "loss": 2.0314, + "step": 893 + }, + { + "epoch": 0.09286382050483016, + "grad_norm": 0.33426743745803833, + "learning_rate": 9.788723915431905e-05, + "loss": 1.6938, + "step": 894 + }, + { + "epoch": 0.09296769502441052, + "grad_norm": 0.3889116048812866, + "learning_rate": 9.788254364180503e-05, + "loss": 1.9863, + "step": 895 + }, + { + "epoch": 0.09307156954399086, + "grad_norm": 0.37265413999557495, + "learning_rate": 9.78778430301736e-05, + "loss": 1.7899, + "step": 896 + }, + { + "epoch": 0.0931754440635712, + "grad_norm": 0.4090532958507538, + "learning_rate": 9.787313731992537e-05, + "loss": 1.8823, + "step": 897 + }, + { + "epoch": 0.09327931858315155, + "grad_norm": 0.3560778796672821, + "learning_rate": 9.786842651156143e-05, + "loss": 1.8849, + "step": 898 + }, + { + "epoch": 0.0933831931027319, + "grad_norm": 0.43143633008003235, + "learning_rate": 9.786371060558346e-05, + "loss": 1.7419, + "step": 899 + }, + { + "epoch": 0.09348706762231225, + "grad_norm": 0.4012468755245209, + "learning_rate": 9.785898960249364e-05, + "loss": 1.9191, + "step": 900 + }, + { + "epoch": 0.09359094214189259, + "grad_norm": 0.38962993025779724, + "learning_rate": 9.785426350279477e-05, + "loss": 1.8023, + "step": 901 + }, + { + "epoch": 0.09369481666147295, + "grad_norm": 0.38585248589515686, + "learning_rate": 9.78495323069901e-05, + "loss": 1.9383, + "step": 902 + }, + { + "epoch": 0.09379869118105329, + "grad_norm": 0.347837895154953, + "learning_rate": 9.784479601558348e-05, + "loss": 1.8187, + "step": 903 + }, + { + "epoch": 0.09390256570063363, + "grad_norm": 0.32775819301605225, + "learning_rate": 9.78400546290793e-05, + "loss": 1.5762, + "step": 904 + }, + { + "epoch": 0.09400644022021398, + "grad_norm": 0.3671472668647766, + "learning_rate": 9.783530814798245e-05, + "loss": 1.8772, + "step": 905 + }, + { + "epoch": 0.09411031473979432, + "grad_norm": 0.32809990644454956, + "learning_rate": 9.783055657279843e-05, + "loss": 1.7432, + "step": 906 + }, + { + "epoch": 0.09421418925937468, + "grad_norm": 0.33797743916511536, + "learning_rate": 9.782579990403321e-05, + "loss": 1.7051, + "step": 907 + }, + { + "epoch": 0.09431806377895502, + "grad_norm": 0.3659290373325348, + "learning_rate": 9.782103814219337e-05, + "loss": 1.7051, + "step": 908 + }, + { + "epoch": 0.09442193829853537, + "grad_norm": 0.4114789068698883, + "learning_rate": 9.781627128778598e-05, + "loss": 2.0238, + "step": 909 + }, + { + "epoch": 0.09452581281811571, + "grad_norm": 0.3247332274913788, + "learning_rate": 9.781149934131868e-05, + "loss": 1.6112, + "step": 910 + }, + { + "epoch": 0.09462968733769607, + "grad_norm": 0.3638227880001068, + "learning_rate": 9.780672230329964e-05, + "loss": 1.7238, + "step": 911 + }, + { + "epoch": 0.09473356185727641, + "grad_norm": 0.35563889145851135, + "learning_rate": 9.780194017423758e-05, + "loss": 1.8491, + "step": 912 + }, + { + "epoch": 0.09483743637685675, + "grad_norm": 0.38154855370521545, + "learning_rate": 9.779715295464178e-05, + "loss": 1.666, + "step": 913 + }, + { + "epoch": 0.0949413108964371, + "grad_norm": 0.3501157760620117, + "learning_rate": 9.779236064502201e-05, + "loss": 1.8345, + "step": 914 + }, + { + "epoch": 0.09504518541601745, + "grad_norm": 0.3432106375694275, + "learning_rate": 9.778756324588863e-05, + "loss": 1.7373, + "step": 915 + }, + { + "epoch": 0.0951490599355978, + "grad_norm": 0.39171236753463745, + "learning_rate": 9.778276075775251e-05, + "loss": 1.9764, + "step": 916 + }, + { + "epoch": 0.09525293445517814, + "grad_norm": 0.35228273272514343, + "learning_rate": 9.77779531811251e-05, + "loss": 1.8864, + "step": 917 + }, + { + "epoch": 0.0953568089747585, + "grad_norm": 0.3456995487213135, + "learning_rate": 9.777314051651836e-05, + "loss": 1.8929, + "step": 918 + }, + { + "epoch": 0.09546068349433884, + "grad_norm": 0.39113515615463257, + "learning_rate": 9.776832276444479e-05, + "loss": 1.8455, + "step": 919 + }, + { + "epoch": 0.09556455801391918, + "grad_norm": 0.35368141531944275, + "learning_rate": 9.776349992541746e-05, + "loss": 1.7881, + "step": 920 + }, + { + "epoch": 0.09566843253349953, + "grad_norm": 0.3645842671394348, + "learning_rate": 9.775867199994994e-05, + "loss": 1.9618, + "step": 921 + }, + { + "epoch": 0.09577230705307987, + "grad_norm": 0.4213750958442688, + "learning_rate": 9.775383898855641e-05, + "loss": 1.9835, + "step": 922 + }, + { + "epoch": 0.09587618157266023, + "grad_norm": 0.3259928226470947, + "learning_rate": 9.77490008917515e-05, + "loss": 1.6711, + "step": 923 + }, + { + "epoch": 0.09598005609224057, + "grad_norm": 0.33279410004615784, + "learning_rate": 9.774415771005048e-05, + "loss": 1.7854, + "step": 924 + }, + { + "epoch": 0.09608393061182093, + "grad_norm": 0.3503129482269287, + "learning_rate": 9.773930944396905e-05, + "loss": 1.8105, + "step": 925 + }, + { + "epoch": 0.09618780513140127, + "grad_norm": 0.3861035704612732, + "learning_rate": 9.773445609402356e-05, + "loss": 1.8127, + "step": 926 + }, + { + "epoch": 0.09629167965098162, + "grad_norm": 0.4022195339202881, + "learning_rate": 9.772959766073084e-05, + "loss": 1.8921, + "step": 927 + }, + { + "epoch": 0.09639555417056196, + "grad_norm": 0.34826424717903137, + "learning_rate": 9.772473414460828e-05, + "loss": 1.8411, + "step": 928 + }, + { + "epoch": 0.0964994286901423, + "grad_norm": 0.3265725374221802, + "learning_rate": 9.77198655461738e-05, + "loss": 1.6894, + "step": 929 + }, + { + "epoch": 0.09660330320972266, + "grad_norm": 0.3737447261810303, + "learning_rate": 9.771499186594586e-05, + "loss": 1.8592, + "step": 930 + }, + { + "epoch": 0.096707177729303, + "grad_norm": 0.3337661623954773, + "learning_rate": 9.771011310444348e-05, + "loss": 1.6941, + "step": 931 + }, + { + "epoch": 0.09681105224888335, + "grad_norm": 0.3704545199871063, + "learning_rate": 9.770522926218622e-05, + "loss": 1.7519, + "step": 932 + }, + { + "epoch": 0.0969149267684637, + "grad_norm": 0.35930871963500977, + "learning_rate": 9.770034033969413e-05, + "loss": 1.9341, + "step": 933 + }, + { + "epoch": 0.09701880128804405, + "grad_norm": 0.38769471645355225, + "learning_rate": 9.769544633748789e-05, + "loss": 1.8773, + "step": 934 + }, + { + "epoch": 0.09712267580762439, + "grad_norm": 0.37353989481925964, + "learning_rate": 9.769054725608867e-05, + "loss": 1.9544, + "step": 935 + }, + { + "epoch": 0.09722655032720473, + "grad_norm": 0.39026400446891785, + "learning_rate": 9.768564309601813e-05, + "loss": 2.0178, + "step": 936 + }, + { + "epoch": 0.09733042484678509, + "grad_norm": 0.37478938698768616, + "learning_rate": 9.768073385779859e-05, + "loss": 1.6323, + "step": 937 + }, + { + "epoch": 0.09743429936636543, + "grad_norm": 0.3543946146965027, + "learning_rate": 9.76758195419528e-05, + "loss": 1.8568, + "step": 938 + }, + { + "epoch": 0.09753817388594578, + "grad_norm": 0.3243783712387085, + "learning_rate": 9.767090014900413e-05, + "loss": 1.6446, + "step": 939 + }, + { + "epoch": 0.09764204840552612, + "grad_norm": 0.390176922082901, + "learning_rate": 9.766597567947644e-05, + "loss": 1.7432, + "step": 940 + }, + { + "epoch": 0.09774592292510648, + "grad_norm": 0.40544116497039795, + "learning_rate": 9.766104613389413e-05, + "loss": 1.8066, + "step": 941 + }, + { + "epoch": 0.09784979744468682, + "grad_norm": 0.4119703471660614, + "learning_rate": 9.76561115127822e-05, + "loss": 1.8682, + "step": 942 + }, + { + "epoch": 0.09795367196426717, + "grad_norm": 0.41198158264160156, + "learning_rate": 9.765117181666612e-05, + "loss": 1.8556, + "step": 943 + }, + { + "epoch": 0.09805754648384751, + "grad_norm": 0.3242831528186798, + "learning_rate": 9.764622704607193e-05, + "loss": 1.8249, + "step": 944 + }, + { + "epoch": 0.09816142100342785, + "grad_norm": 0.336739718914032, + "learning_rate": 9.764127720152623e-05, + "loss": 1.7983, + "step": 945 + }, + { + "epoch": 0.09826529552300821, + "grad_norm": 0.43454012274742126, + "learning_rate": 9.763632228355611e-05, + "loss": 1.8482, + "step": 946 + }, + { + "epoch": 0.09836917004258855, + "grad_norm": 0.39079445600509644, + "learning_rate": 9.763136229268924e-05, + "loss": 1.9735, + "step": 947 + }, + { + "epoch": 0.0984730445621689, + "grad_norm": 0.3446981906890869, + "learning_rate": 9.762639722945384e-05, + "loss": 1.8289, + "step": 948 + }, + { + "epoch": 0.09857691908174925, + "grad_norm": 0.3308192193508148, + "learning_rate": 9.762142709437863e-05, + "loss": 1.6953, + "step": 949 + }, + { + "epoch": 0.0986807936013296, + "grad_norm": 0.40982964634895325, + "learning_rate": 9.76164518879929e-05, + "loss": 1.9994, + "step": 950 + }, + { + "epoch": 0.09878466812090994, + "grad_norm": 0.3556596040725708, + "learning_rate": 9.761147161082644e-05, + "loss": 1.7591, + "step": 951 + }, + { + "epoch": 0.09888854264049028, + "grad_norm": 0.361565500497818, + "learning_rate": 9.760648626340965e-05, + "loss": 1.8883, + "step": 952 + }, + { + "epoch": 0.09899241716007064, + "grad_norm": 0.36020350456237793, + "learning_rate": 9.760149584627343e-05, + "loss": 1.7859, + "step": 953 + }, + { + "epoch": 0.09909629167965098, + "grad_norm": 0.3831625282764435, + "learning_rate": 9.759650035994921e-05, + "loss": 1.9948, + "step": 954 + }, + { + "epoch": 0.09920016619923133, + "grad_norm": 0.34753647446632385, + "learning_rate": 9.759149980496896e-05, + "loss": 1.8239, + "step": 955 + }, + { + "epoch": 0.09930404071881167, + "grad_norm": 0.3743743896484375, + "learning_rate": 9.758649418186521e-05, + "loss": 1.8389, + "step": 956 + }, + { + "epoch": 0.09940791523839203, + "grad_norm": 0.38041260838508606, + "learning_rate": 9.758148349117101e-05, + "loss": 1.9304, + "step": 957 + }, + { + "epoch": 0.09951178975797237, + "grad_norm": 0.4073218107223511, + "learning_rate": 9.757646773341998e-05, + "loss": 1.9184, + "step": 958 + }, + { + "epoch": 0.09961566427755271, + "grad_norm": 0.3602345585823059, + "learning_rate": 9.757144690914624e-05, + "loss": 1.8935, + "step": 959 + }, + { + "epoch": 0.09971953879713306, + "grad_norm": 0.3824581801891327, + "learning_rate": 9.756642101888448e-05, + "loss": 1.8969, + "step": 960 + }, + { + "epoch": 0.0998234133167134, + "grad_norm": 0.3472011387348175, + "learning_rate": 9.756139006316991e-05, + "loss": 1.7702, + "step": 961 + }, + { + "epoch": 0.09992728783629376, + "grad_norm": 0.3461735248565674, + "learning_rate": 9.755635404253829e-05, + "loss": 1.6524, + "step": 962 + }, + { + "epoch": 0.1000311623558741, + "grad_norm": 0.34832313656806946, + "learning_rate": 9.755131295752593e-05, + "loss": 1.701, + "step": 963 + }, + { + "epoch": 0.10013503687545446, + "grad_norm": 0.3486674726009369, + "learning_rate": 9.754626680866964e-05, + "loss": 1.8465, + "step": 964 + }, + { + "epoch": 0.1002389113950348, + "grad_norm": 0.3982466161251068, + "learning_rate": 9.754121559650681e-05, + "loss": 2.0038, + "step": 965 + }, + { + "epoch": 0.10034278591461515, + "grad_norm": 0.3699825406074524, + "learning_rate": 9.753615932157536e-05, + "loss": 2.1116, + "step": 966 + }, + { + "epoch": 0.10044666043419549, + "grad_norm": 0.37160253524780273, + "learning_rate": 9.753109798441375e-05, + "loss": 1.9158, + "step": 967 + }, + { + "epoch": 0.10055053495377583, + "grad_norm": 0.34480687975883484, + "learning_rate": 9.752603158556095e-05, + "loss": 1.7587, + "step": 968 + }, + { + "epoch": 0.10065440947335619, + "grad_norm": 0.34491807222366333, + "learning_rate": 9.752096012555652e-05, + "loss": 1.8125, + "step": 969 + }, + { + "epoch": 0.10075828399293653, + "grad_norm": 0.36737021803855896, + "learning_rate": 9.751588360494048e-05, + "loss": 1.9693, + "step": 970 + }, + { + "epoch": 0.10086215851251688, + "grad_norm": 0.372712641954422, + "learning_rate": 9.75108020242535e-05, + "loss": 1.7953, + "step": 971 + }, + { + "epoch": 0.10096603303209722, + "grad_norm": 0.3740496039390564, + "learning_rate": 9.75057153840367e-05, + "loss": 1.7753, + "step": 972 + }, + { + "epoch": 0.10106990755167758, + "grad_norm": 0.46863672137260437, + "learning_rate": 9.750062368483176e-05, + "loss": 2.1136, + "step": 973 + }, + { + "epoch": 0.10117378207125792, + "grad_norm": 0.3760620355606079, + "learning_rate": 9.749552692718093e-05, + "loss": 1.7655, + "step": 974 + }, + { + "epoch": 0.10127765659083826, + "grad_norm": 0.35611656308174133, + "learning_rate": 9.749042511162696e-05, + "loss": 1.8326, + "step": 975 + }, + { + "epoch": 0.10138153111041862, + "grad_norm": 0.33176377415657043, + "learning_rate": 9.748531823871314e-05, + "loss": 1.7562, + "step": 976 + }, + { + "epoch": 0.10148540562999896, + "grad_norm": 0.33755457401275635, + "learning_rate": 9.748020630898333e-05, + "loss": 1.7919, + "step": 977 + }, + { + "epoch": 0.10158928014957931, + "grad_norm": 0.40104183554649353, + "learning_rate": 9.747508932298192e-05, + "loss": 1.7349, + "step": 978 + }, + { + "epoch": 0.10169315466915965, + "grad_norm": 0.33952096104621887, + "learning_rate": 9.746996728125382e-05, + "loss": 1.6836, + "step": 979 + }, + { + "epoch": 0.10179702918874001, + "grad_norm": 0.3854829668998718, + "learning_rate": 9.746484018434447e-05, + "loss": 1.6913, + "step": 980 + }, + { + "epoch": 0.10190090370832035, + "grad_norm": 0.44178688526153564, + "learning_rate": 9.745970803279989e-05, + "loss": 2.2579, + "step": 981 + }, + { + "epoch": 0.1020047782279007, + "grad_norm": 0.35316595435142517, + "learning_rate": 9.74545708271666e-05, + "loss": 1.7616, + "step": 982 + }, + { + "epoch": 0.10210865274748104, + "grad_norm": 0.36709627509117126, + "learning_rate": 9.744942856799169e-05, + "loss": 1.6341, + "step": 983 + }, + { + "epoch": 0.10221252726706138, + "grad_norm": 0.4684566557407379, + "learning_rate": 9.744428125582275e-05, + "loss": 2.0114, + "step": 984 + }, + { + "epoch": 0.10231640178664174, + "grad_norm": 0.34113651514053345, + "learning_rate": 9.743912889120791e-05, + "loss": 1.7651, + "step": 985 + }, + { + "epoch": 0.10242027630622208, + "grad_norm": 0.4054688513278961, + "learning_rate": 9.743397147469592e-05, + "loss": 2.0589, + "step": 986 + }, + { + "epoch": 0.10252415082580243, + "grad_norm": 0.36325010657310486, + "learning_rate": 9.742880900683596e-05, + "loss": 1.8003, + "step": 987 + }, + { + "epoch": 0.10262802534538278, + "grad_norm": 0.3484811782836914, + "learning_rate": 9.742364148817777e-05, + "loss": 1.8546, + "step": 988 + }, + { + "epoch": 0.10273189986496313, + "grad_norm": 0.3415622115135193, + "learning_rate": 9.74184689192717e-05, + "loss": 1.6446, + "step": 989 + }, + { + "epoch": 0.10283577438454347, + "grad_norm": 0.377069354057312, + "learning_rate": 9.741329130066859e-05, + "loss": 2.0174, + "step": 990 + }, + { + "epoch": 0.10293964890412381, + "grad_norm": 0.37598928809165955, + "learning_rate": 9.740810863291976e-05, + "loss": 1.9646, + "step": 991 + }, + { + "epoch": 0.10304352342370417, + "grad_norm": 0.34742510318756104, + "learning_rate": 9.740292091657718e-05, + "loss": 1.7112, + "step": 992 + }, + { + "epoch": 0.10314739794328451, + "grad_norm": 0.35279062390327454, + "learning_rate": 9.739772815219327e-05, + "loss": 1.736, + "step": 993 + }, + { + "epoch": 0.10325127246286486, + "grad_norm": 0.33563071489334106, + "learning_rate": 9.739253034032103e-05, + "loss": 1.7482, + "step": 994 + }, + { + "epoch": 0.1033551469824452, + "grad_norm": 0.4013693332672119, + "learning_rate": 9.738732748151397e-05, + "loss": 1.9931, + "step": 995 + }, + { + "epoch": 0.10345902150202556, + "grad_norm": 0.3738352954387665, + "learning_rate": 9.738211957632619e-05, + "loss": 1.7324, + "step": 996 + }, + { + "epoch": 0.1035628960216059, + "grad_norm": 0.34439367055892944, + "learning_rate": 9.737690662531227e-05, + "loss": 1.7277, + "step": 997 + }, + { + "epoch": 0.10366677054118625, + "grad_norm": 0.34663471579551697, + "learning_rate": 9.737168862902735e-05, + "loss": 1.8904, + "step": 998 + }, + { + "epoch": 0.1037706450607666, + "grad_norm": 0.3536369800567627, + "learning_rate": 9.736646558802709e-05, + "loss": 1.898, + "step": 999 + }, + { + "epoch": 0.10387451958034694, + "grad_norm": 0.37141430377960205, + "learning_rate": 9.736123750286772e-05, + "loss": 1.9826, + "step": 1000 + }, + { + "epoch": 0.10397839409992729, + "grad_norm": 0.33750221133232117, + "learning_rate": 9.7356004374106e-05, + "loss": 1.6245, + "step": 1001 + }, + { + "epoch": 0.10408226861950763, + "grad_norm": 0.39704445004463196, + "learning_rate": 9.735076620229919e-05, + "loss": 1.9813, + "step": 1002 + }, + { + "epoch": 0.10418614313908799, + "grad_norm": 0.3396252691745758, + "learning_rate": 9.734552298800513e-05, + "loss": 1.7119, + "step": 1003 + }, + { + "epoch": 0.10429001765866833, + "grad_norm": 0.3506071865558624, + "learning_rate": 9.734027473178219e-05, + "loss": 1.8563, + "step": 1004 + }, + { + "epoch": 0.10439389217824868, + "grad_norm": 0.3697708249092102, + "learning_rate": 9.733502143418925e-05, + "loss": 1.5552, + "step": 1005 + }, + { + "epoch": 0.10449776669782902, + "grad_norm": 0.34835517406463623, + "learning_rate": 9.732976309578577e-05, + "loss": 1.6061, + "step": 1006 + }, + { + "epoch": 0.10460164121740936, + "grad_norm": 0.4172792434692383, + "learning_rate": 9.73244997171317e-05, + "loss": 2.0223, + "step": 1007 + }, + { + "epoch": 0.10470551573698972, + "grad_norm": 0.37893515825271606, + "learning_rate": 9.731923129878754e-05, + "loss": 1.8512, + "step": 1008 + }, + { + "epoch": 0.10480939025657006, + "grad_norm": 0.3517685830593109, + "learning_rate": 9.731395784131437e-05, + "loss": 1.7601, + "step": 1009 + }, + { + "epoch": 0.10491326477615041, + "grad_norm": 0.3533296585083008, + "learning_rate": 9.730867934527374e-05, + "loss": 1.7264, + "step": 1010 + }, + { + "epoch": 0.10501713929573075, + "grad_norm": 0.3656322956085205, + "learning_rate": 9.73033958112278e-05, + "loss": 1.8542, + "step": 1011 + }, + { + "epoch": 0.10512101381531111, + "grad_norm": 0.36052483320236206, + "learning_rate": 9.729810723973917e-05, + "loss": 1.7967, + "step": 1012 + }, + { + "epoch": 0.10522488833489145, + "grad_norm": 0.3687455356121063, + "learning_rate": 9.729281363137106e-05, + "loss": 1.9559, + "step": 1013 + }, + { + "epoch": 0.10532876285447179, + "grad_norm": 0.3520287275314331, + "learning_rate": 9.72875149866872e-05, + "loss": 1.8265, + "step": 1014 + }, + { + "epoch": 0.10543263737405215, + "grad_norm": 0.36388278007507324, + "learning_rate": 9.728221130625186e-05, + "loss": 1.7892, + "step": 1015 + }, + { + "epoch": 0.10553651189363249, + "grad_norm": 0.3430229127407074, + "learning_rate": 9.727690259062982e-05, + "loss": 1.7436, + "step": 1016 + }, + { + "epoch": 0.10564038641321284, + "grad_norm": 0.40209057927131653, + "learning_rate": 9.727158884038643e-05, + "loss": 2.0679, + "step": 1017 + }, + { + "epoch": 0.10574426093279318, + "grad_norm": 0.35472702980041504, + "learning_rate": 9.726627005608756e-05, + "loss": 1.8322, + "step": 1018 + }, + { + "epoch": 0.10584813545237354, + "grad_norm": 0.3761141896247864, + "learning_rate": 9.726094623829962e-05, + "loss": 1.9986, + "step": 1019 + }, + { + "epoch": 0.10595200997195388, + "grad_norm": 0.37328705191612244, + "learning_rate": 9.725561738758956e-05, + "loss": 1.8012, + "step": 1020 + }, + { + "epoch": 0.10605588449153423, + "grad_norm": 0.3950223922729492, + "learning_rate": 9.725028350452485e-05, + "loss": 1.859, + "step": 1021 + }, + { + "epoch": 0.10615975901111457, + "grad_norm": 0.37431395053863525, + "learning_rate": 9.724494458967352e-05, + "loss": 1.6539, + "step": 1022 + }, + { + "epoch": 0.10626363353069491, + "grad_norm": 0.3411618173122406, + "learning_rate": 9.723960064360412e-05, + "loss": 1.6884, + "step": 1023 + }, + { + "epoch": 0.10636750805027527, + "grad_norm": 0.3407668173313141, + "learning_rate": 9.72342516668857e-05, + "loss": 1.7554, + "step": 1024 + }, + { + "epoch": 0.10647138256985561, + "grad_norm": 0.3708615005016327, + "learning_rate": 9.722889766008796e-05, + "loss": 1.7656, + "step": 1025 + }, + { + "epoch": 0.10657525708943597, + "grad_norm": 0.3609970510005951, + "learning_rate": 9.722353862378101e-05, + "loss": 1.8226, + "step": 1026 + }, + { + "epoch": 0.1066791316090163, + "grad_norm": 0.341370552778244, + "learning_rate": 9.721817455853554e-05, + "loss": 1.6735, + "step": 1027 + }, + { + "epoch": 0.10678300612859666, + "grad_norm": 0.3461858630180359, + "learning_rate": 9.72128054649228e-05, + "loss": 1.6466, + "step": 1028 + }, + { + "epoch": 0.106886880648177, + "grad_norm": 0.3534831404685974, + "learning_rate": 9.720743134351457e-05, + "loss": 1.8349, + "step": 1029 + }, + { + "epoch": 0.10699075516775734, + "grad_norm": 0.33681240677833557, + "learning_rate": 9.720205219488312e-05, + "loss": 1.7535, + "step": 1030 + }, + { + "epoch": 0.1070946296873377, + "grad_norm": 0.3691225051879883, + "learning_rate": 9.71966680196013e-05, + "loss": 1.7835, + "step": 1031 + }, + { + "epoch": 0.10719850420691804, + "grad_norm": 0.3454303443431854, + "learning_rate": 9.719127881824249e-05, + "loss": 1.7698, + "step": 1032 + }, + { + "epoch": 0.10730237872649839, + "grad_norm": 0.3353855311870575, + "learning_rate": 9.718588459138059e-05, + "loss": 1.8369, + "step": 1033 + }, + { + "epoch": 0.10740625324607873, + "grad_norm": 0.3854605257511139, + "learning_rate": 9.718048533959004e-05, + "loss": 1.8615, + "step": 1034 + }, + { + "epoch": 0.10751012776565909, + "grad_norm": 0.34844595193862915, + "learning_rate": 9.717508106344582e-05, + "loss": 1.9681, + "step": 1035 + }, + { + "epoch": 0.10761400228523943, + "grad_norm": 0.3415117561817169, + "learning_rate": 9.716967176352345e-05, + "loss": 1.8015, + "step": 1036 + }, + { + "epoch": 0.10771787680481978, + "grad_norm": 0.3829508125782013, + "learning_rate": 9.716425744039898e-05, + "loss": 1.9817, + "step": 1037 + }, + { + "epoch": 0.10782175132440013, + "grad_norm": 0.3588506877422333, + "learning_rate": 9.715883809464899e-05, + "loss": 1.7871, + "step": 1038 + }, + { + "epoch": 0.10792562584398047, + "grad_norm": 0.36895090341567993, + "learning_rate": 9.715341372685059e-05, + "loss": 1.9029, + "step": 1039 + }, + { + "epoch": 0.10802950036356082, + "grad_norm": 0.4375429153442383, + "learning_rate": 9.714798433758144e-05, + "loss": 1.6931, + "step": 1040 + }, + { + "epoch": 0.10813337488314116, + "grad_norm": 0.34080272912979126, + "learning_rate": 9.714254992741971e-05, + "loss": 1.7422, + "step": 1041 + }, + { + "epoch": 0.10823724940272152, + "grad_norm": 0.35768958926200867, + "learning_rate": 9.713711049694416e-05, + "loss": 1.7005, + "step": 1042 + }, + { + "epoch": 0.10834112392230186, + "grad_norm": 0.35327664017677307, + "learning_rate": 9.713166604673403e-05, + "loss": 1.7248, + "step": 1043 + }, + { + "epoch": 0.10844499844188221, + "grad_norm": 0.33711856603622437, + "learning_rate": 9.71262165773691e-05, + "loss": 1.719, + "step": 1044 + }, + { + "epoch": 0.10854887296146255, + "grad_norm": 0.35835373401641846, + "learning_rate": 9.712076208942968e-05, + "loss": 1.7452, + "step": 1045 + }, + { + "epoch": 0.1086527474810429, + "grad_norm": 0.3395202159881592, + "learning_rate": 9.711530258349668e-05, + "loss": 1.7313, + "step": 1046 + }, + { + "epoch": 0.10875662200062325, + "grad_norm": 0.3273974061012268, + "learning_rate": 9.710983806015148e-05, + "loss": 1.7674, + "step": 1047 + }, + { + "epoch": 0.10886049652020359, + "grad_norm": 0.35871952772140503, + "learning_rate": 9.710436851997598e-05, + "loss": 1.964, + "step": 1048 + }, + { + "epoch": 0.10896437103978394, + "grad_norm": 0.3521716296672821, + "learning_rate": 9.709889396355265e-05, + "loss": 1.8366, + "step": 1049 + }, + { + "epoch": 0.10906824555936429, + "grad_norm": 0.37341251969337463, + "learning_rate": 9.709341439146452e-05, + "loss": 1.8713, + "step": 1050 + }, + { + "epoch": 0.10917212007894464, + "grad_norm": 0.37906524538993835, + "learning_rate": 9.708792980429508e-05, + "loss": 1.988, + "step": 1051 + }, + { + "epoch": 0.10927599459852498, + "grad_norm": 0.3761735260486603, + "learning_rate": 9.708244020262842e-05, + "loss": 1.9337, + "step": 1052 + }, + { + "epoch": 0.10937986911810534, + "grad_norm": 0.33354657888412476, + "learning_rate": 9.707694558704914e-05, + "loss": 1.8026, + "step": 1053 + }, + { + "epoch": 0.10948374363768568, + "grad_norm": 0.3411722779273987, + "learning_rate": 9.707144595814237e-05, + "loss": 1.6728, + "step": 1054 + }, + { + "epoch": 0.10958761815726602, + "grad_norm": 0.3556554913520813, + "learning_rate": 9.706594131649378e-05, + "loss": 1.8825, + "step": 1055 + }, + { + "epoch": 0.10969149267684637, + "grad_norm": 0.3727862536907196, + "learning_rate": 9.706043166268957e-05, + "loss": 1.8012, + "step": 1056 + }, + { + "epoch": 0.10979536719642671, + "grad_norm": 0.3387303650379181, + "learning_rate": 9.705491699731646e-05, + "loss": 1.6768, + "step": 1057 + }, + { + "epoch": 0.10989924171600707, + "grad_norm": 0.36352434754371643, + "learning_rate": 9.704939732096175e-05, + "loss": 1.8888, + "step": 1058 + }, + { + "epoch": 0.11000311623558741, + "grad_norm": 0.3399582803249359, + "learning_rate": 9.704387263421322e-05, + "loss": 1.6474, + "step": 1059 + }, + { + "epoch": 0.11010699075516776, + "grad_norm": 0.41029804944992065, + "learning_rate": 9.70383429376592e-05, + "loss": 1.9323, + "step": 1060 + }, + { + "epoch": 0.1102108652747481, + "grad_norm": 0.40055909752845764, + "learning_rate": 9.703280823188858e-05, + "loss": 1.8642, + "step": 1061 + }, + { + "epoch": 0.11031473979432845, + "grad_norm": 0.365138441324234, + "learning_rate": 9.702726851749074e-05, + "loss": 1.7669, + "step": 1062 + }, + { + "epoch": 0.1104186143139088, + "grad_norm": 0.3488384187221527, + "learning_rate": 9.702172379505565e-05, + "loss": 1.6532, + "step": 1063 + }, + { + "epoch": 0.11052248883348914, + "grad_norm": 0.34194380044937134, + "learning_rate": 9.701617406517374e-05, + "loss": 1.6235, + "step": 1064 + }, + { + "epoch": 0.1106263633530695, + "grad_norm": 0.3830929696559906, + "learning_rate": 9.701061932843605e-05, + "loss": 1.9023, + "step": 1065 + }, + { + "epoch": 0.11073023787264984, + "grad_norm": 0.3511880934238434, + "learning_rate": 9.700505958543409e-05, + "loss": 1.6955, + "step": 1066 + }, + { + "epoch": 0.11083411239223019, + "grad_norm": 0.35780179500579834, + "learning_rate": 9.699949483675994e-05, + "loss": 1.7417, + "step": 1067 + }, + { + "epoch": 0.11093798691181053, + "grad_norm": 0.3555978536605835, + "learning_rate": 9.69939250830062e-05, + "loss": 1.7226, + "step": 1068 + }, + { + "epoch": 0.11104186143139087, + "grad_norm": 0.34007731080055237, + "learning_rate": 9.698835032476598e-05, + "loss": 1.7571, + "step": 1069 + }, + { + "epoch": 0.11114573595097123, + "grad_norm": 0.34321436285972595, + "learning_rate": 9.6982770562633e-05, + "loss": 1.8685, + "step": 1070 + }, + { + "epoch": 0.11124961047055157, + "grad_norm": 0.32458704710006714, + "learning_rate": 9.697718579720144e-05, + "loss": 1.6949, + "step": 1071 + }, + { + "epoch": 0.11135348499013192, + "grad_norm": 0.37475547194480896, + "learning_rate": 9.697159602906603e-05, + "loss": 1.9906, + "step": 1072 + }, + { + "epoch": 0.11145735950971226, + "grad_norm": 0.38911253213882446, + "learning_rate": 9.696600125882202e-05, + "loss": 1.9341, + "step": 1073 + }, + { + "epoch": 0.11156123402929262, + "grad_norm": 0.3872954249382019, + "learning_rate": 9.696040148706523e-05, + "loss": 1.929, + "step": 1074 + }, + { + "epoch": 0.11166510854887296, + "grad_norm": 0.3806956708431244, + "learning_rate": 9.695479671439198e-05, + "loss": 1.8981, + "step": 1075 + }, + { + "epoch": 0.11176898306845331, + "grad_norm": 0.3513946235179901, + "learning_rate": 9.694918694139915e-05, + "loss": 1.689, + "step": 1076 + }, + { + "epoch": 0.11187285758803366, + "grad_norm": 0.3258873522281647, + "learning_rate": 9.694357216868413e-05, + "loss": 1.6753, + "step": 1077 + }, + { + "epoch": 0.111976732107614, + "grad_norm": 0.36498063802719116, + "learning_rate": 9.693795239684484e-05, + "loss": 1.6901, + "step": 1078 + }, + { + "epoch": 0.11208060662719435, + "grad_norm": 0.34291768074035645, + "learning_rate": 9.693232762647976e-05, + "loss": 1.6372, + "step": 1079 + }, + { + "epoch": 0.11218448114677469, + "grad_norm": 0.38301602005958557, + "learning_rate": 9.692669785818786e-05, + "loss": 1.8587, + "step": 1080 + }, + { + "epoch": 0.11228835566635505, + "grad_norm": 0.3735436201095581, + "learning_rate": 9.69210630925687e-05, + "loss": 1.9017, + "step": 1081 + }, + { + "epoch": 0.11239223018593539, + "grad_norm": 0.34948012232780457, + "learning_rate": 9.69154233302223e-05, + "loss": 1.7744, + "step": 1082 + }, + { + "epoch": 0.11249610470551574, + "grad_norm": 0.30937257409095764, + "learning_rate": 9.690977857174927e-05, + "loss": 1.6596, + "step": 1083 + }, + { + "epoch": 0.11259997922509608, + "grad_norm": 0.3430674970149994, + "learning_rate": 9.690412881775076e-05, + "loss": 1.8651, + "step": 1084 + }, + { + "epoch": 0.11270385374467642, + "grad_norm": 0.4052720069885254, + "learning_rate": 9.689847406882838e-05, + "loss": 1.9878, + "step": 1085 + }, + { + "epoch": 0.11280772826425678, + "grad_norm": 0.3367058038711548, + "learning_rate": 9.689281432558432e-05, + "loss": 1.9459, + "step": 1086 + }, + { + "epoch": 0.11291160278383712, + "grad_norm": 0.32296720147132874, + "learning_rate": 9.688714958862133e-05, + "loss": 1.7604, + "step": 1087 + }, + { + "epoch": 0.11301547730341747, + "grad_norm": 0.36404433846473694, + "learning_rate": 9.688147985854264e-05, + "loss": 1.8102, + "step": 1088 + }, + { + "epoch": 0.11311935182299782, + "grad_norm": 0.4039054811000824, + "learning_rate": 9.687580513595202e-05, + "loss": 1.8611, + "step": 1089 + }, + { + "epoch": 0.11322322634257817, + "grad_norm": 0.4218135476112366, + "learning_rate": 9.687012542145382e-05, + "loss": 1.962, + "step": 1090 + }, + { + "epoch": 0.11332710086215851, + "grad_norm": 0.36868998408317566, + "learning_rate": 9.686444071565285e-05, + "loss": 1.7224, + "step": 1091 + }, + { + "epoch": 0.11343097538173887, + "grad_norm": 0.3493179380893707, + "learning_rate": 9.68587510191545e-05, + "loss": 1.715, + "step": 1092 + }, + { + "epoch": 0.11353484990131921, + "grad_norm": 0.3461728096008301, + "learning_rate": 9.685305633256469e-05, + "loss": 1.6688, + "step": 1093 + }, + { + "epoch": 0.11363872442089955, + "grad_norm": 0.35678285360336304, + "learning_rate": 9.684735665648983e-05, + "loss": 1.851, + "step": 1094 + }, + { + "epoch": 0.1137425989404799, + "grad_norm": 0.3611486852169037, + "learning_rate": 9.684165199153692e-05, + "loss": 1.8039, + "step": 1095 + }, + { + "epoch": 0.11384647346006024, + "grad_norm": 0.3748013377189636, + "learning_rate": 9.683594233831345e-05, + "loss": 1.8317, + "step": 1096 + }, + { + "epoch": 0.1139503479796406, + "grad_norm": 0.39046037197113037, + "learning_rate": 9.683022769742746e-05, + "loss": 1.9756, + "step": 1097 + }, + { + "epoch": 0.11405422249922094, + "grad_norm": 0.32861414551734924, + "learning_rate": 9.68245080694875e-05, + "loss": 1.6867, + "step": 1098 + }, + { + "epoch": 0.1141580970188013, + "grad_norm": 0.4004691541194916, + "learning_rate": 9.68187834551027e-05, + "loss": 1.9397, + "step": 1099 + }, + { + "epoch": 0.11426197153838163, + "grad_norm": 0.33561286330223083, + "learning_rate": 9.681305385488264e-05, + "loss": 1.7001, + "step": 1100 + }, + { + "epoch": 0.11436584605796198, + "grad_norm": 0.3517715036869049, + "learning_rate": 9.680731926943752e-05, + "loss": 1.774, + "step": 1101 + }, + { + "epoch": 0.11446972057754233, + "grad_norm": 0.37641438841819763, + "learning_rate": 9.680157969937799e-05, + "loss": 1.7585, + "step": 1102 + }, + { + "epoch": 0.11457359509712267, + "grad_norm": 0.36919966340065, + "learning_rate": 9.67958351453153e-05, + "loss": 1.8175, + "step": 1103 + }, + { + "epoch": 0.11467746961670303, + "grad_norm": 0.3542039096355438, + "learning_rate": 9.679008560786117e-05, + "loss": 1.7941, + "step": 1104 + }, + { + "epoch": 0.11478134413628337, + "grad_norm": 0.3409385681152344, + "learning_rate": 9.678433108762791e-05, + "loss": 1.7144, + "step": 1105 + }, + { + "epoch": 0.11488521865586372, + "grad_norm": 0.3563219904899597, + "learning_rate": 9.677857158522833e-05, + "loss": 1.9284, + "step": 1106 + }, + { + "epoch": 0.11498909317544406, + "grad_norm": 0.3560921251773834, + "learning_rate": 9.677280710127575e-05, + "loss": 1.5077, + "step": 1107 + }, + { + "epoch": 0.11509296769502442, + "grad_norm": 0.43264439702033997, + "learning_rate": 9.676703763638406e-05, + "loss": 2.0279, + "step": 1108 + }, + { + "epoch": 0.11519684221460476, + "grad_norm": 0.36117497086524963, + "learning_rate": 9.676126319116766e-05, + "loss": 1.9437, + "step": 1109 + }, + { + "epoch": 0.1153007167341851, + "grad_norm": 0.3572199046611786, + "learning_rate": 9.675548376624148e-05, + "loss": 1.8091, + "step": 1110 + }, + { + "epoch": 0.11540459125376545, + "grad_norm": 0.35779669880867004, + "learning_rate": 9.674969936222099e-05, + "loss": 1.7403, + "step": 1111 + }, + { + "epoch": 0.1155084657733458, + "grad_norm": 0.36604002118110657, + "learning_rate": 9.674390997972218e-05, + "loss": 1.9467, + "step": 1112 + }, + { + "epoch": 0.11561234029292615, + "grad_norm": 0.38619542121887207, + "learning_rate": 9.673811561936157e-05, + "loss": 1.8208, + "step": 1113 + }, + { + "epoch": 0.11571621481250649, + "grad_norm": 0.33228105306625366, + "learning_rate": 9.673231628175622e-05, + "loss": 1.7336, + "step": 1114 + }, + { + "epoch": 0.11582008933208685, + "grad_norm": 0.3648672103881836, + "learning_rate": 9.672651196752371e-05, + "loss": 1.9191, + "step": 1115 + }, + { + "epoch": 0.11592396385166719, + "grad_norm": 0.33783960342407227, + "learning_rate": 9.672070267728215e-05, + "loss": 1.7849, + "step": 1116 + }, + { + "epoch": 0.11602783837124753, + "grad_norm": 0.4384811818599701, + "learning_rate": 9.67148884116502e-05, + "loss": 2.076, + "step": 1117 + }, + { + "epoch": 0.11613171289082788, + "grad_norm": 0.3607281446456909, + "learning_rate": 9.670906917124703e-05, + "loss": 1.7839, + "step": 1118 + }, + { + "epoch": 0.11623558741040822, + "grad_norm": 0.3579871356487274, + "learning_rate": 9.670324495669231e-05, + "loss": 1.7778, + "step": 1119 + }, + { + "epoch": 0.11633946192998858, + "grad_norm": 0.3565009832382202, + "learning_rate": 9.669741576860634e-05, + "loss": 1.8653, + "step": 1120 + }, + { + "epoch": 0.11644333644956892, + "grad_norm": 0.36752045154571533, + "learning_rate": 9.669158160760982e-05, + "loss": 1.9041, + "step": 1121 + }, + { + "epoch": 0.11654721096914927, + "grad_norm": 0.32525351643562317, + "learning_rate": 9.668574247432407e-05, + "loss": 1.5888, + "step": 1122 + }, + { + "epoch": 0.11665108548872961, + "grad_norm": 0.3645164370536804, + "learning_rate": 9.667989836937092e-05, + "loss": 1.8802, + "step": 1123 + }, + { + "epoch": 0.11675496000830995, + "grad_norm": 0.34458303451538086, + "learning_rate": 9.66740492933727e-05, + "loss": 1.8201, + "step": 1124 + }, + { + "epoch": 0.11685883452789031, + "grad_norm": 0.33625590801239014, + "learning_rate": 9.66681952469523e-05, + "loss": 1.758, + "step": 1125 + }, + { + "epoch": 0.11696270904747065, + "grad_norm": 0.3612918257713318, + "learning_rate": 9.666233623073315e-05, + "loss": 1.8567, + "step": 1126 + }, + { + "epoch": 0.117066583567051, + "grad_norm": 0.3521936535835266, + "learning_rate": 9.665647224533915e-05, + "loss": 1.8514, + "step": 1127 + }, + { + "epoch": 0.11717045808663135, + "grad_norm": 0.42195868492126465, + "learning_rate": 9.665060329139479e-05, + "loss": 1.9686, + "step": 1128 + }, + { + "epoch": 0.1172743326062117, + "grad_norm": 0.35592785477638245, + "learning_rate": 9.66447293695251e-05, + "loss": 1.6642, + "step": 1129 + }, + { + "epoch": 0.11737820712579204, + "grad_norm": 0.3389096260070801, + "learning_rate": 9.663885048035555e-05, + "loss": 1.6711, + "step": 1130 + }, + { + "epoch": 0.1174820816453724, + "grad_norm": 0.3547491729259491, + "learning_rate": 9.663296662451221e-05, + "loss": 1.7615, + "step": 1131 + }, + { + "epoch": 0.11758595616495274, + "grad_norm": 0.33912357687950134, + "learning_rate": 9.66270778026217e-05, + "loss": 1.6615, + "step": 1132 + }, + { + "epoch": 0.11768983068453308, + "grad_norm": 0.37360331416130066, + "learning_rate": 9.662118401531108e-05, + "loss": 1.8592, + "step": 1133 + }, + { + "epoch": 0.11779370520411343, + "grad_norm": 0.3422006666660309, + "learning_rate": 9.661528526320804e-05, + "loss": 1.8253, + "step": 1134 + }, + { + "epoch": 0.11789757972369377, + "grad_norm": 0.36613208055496216, + "learning_rate": 9.660938154694075e-05, + "loss": 1.592, + "step": 1135 + }, + { + "epoch": 0.11800145424327413, + "grad_norm": 0.38055816292762756, + "learning_rate": 9.660347286713787e-05, + "loss": 1.6502, + "step": 1136 + }, + { + "epoch": 0.11810532876285447, + "grad_norm": 0.3511364459991455, + "learning_rate": 9.659755922442866e-05, + "loss": 1.6945, + "step": 1137 + }, + { + "epoch": 0.11820920328243482, + "grad_norm": 0.3394559621810913, + "learning_rate": 9.659164061944285e-05, + "loss": 1.7614, + "step": 1138 + }, + { + "epoch": 0.11831307780201517, + "grad_norm": 0.38711434602737427, + "learning_rate": 9.658571705281076e-05, + "loss": 1.8105, + "step": 1139 + }, + { + "epoch": 0.1184169523215955, + "grad_norm": 0.3627341389656067, + "learning_rate": 9.657978852516319e-05, + "loss": 1.6527, + "step": 1140 + }, + { + "epoch": 0.11852082684117586, + "grad_norm": 0.3539731800556183, + "learning_rate": 9.657385503713146e-05, + "loss": 1.8393, + "step": 1141 + }, + { + "epoch": 0.1186247013607562, + "grad_norm": 0.3663812279701233, + "learning_rate": 9.656791658934747e-05, + "loss": 1.7833, + "step": 1142 + }, + { + "epoch": 0.11872857588033656, + "grad_norm": 0.3631526231765747, + "learning_rate": 9.656197318244359e-05, + "loss": 1.9837, + "step": 1143 + }, + { + "epoch": 0.1188324503999169, + "grad_norm": 0.382820725440979, + "learning_rate": 9.655602481705279e-05, + "loss": 1.8836, + "step": 1144 + }, + { + "epoch": 0.11893632491949725, + "grad_norm": 0.33744722604751587, + "learning_rate": 9.655007149380849e-05, + "loss": 1.7552, + "step": 1145 + }, + { + "epoch": 0.11904019943907759, + "grad_norm": 0.34012195467948914, + "learning_rate": 9.654411321334468e-05, + "loss": 1.6319, + "step": 1146 + }, + { + "epoch": 0.11914407395865795, + "grad_norm": 0.34437379240989685, + "learning_rate": 9.653814997629587e-05, + "loss": 1.6788, + "step": 1147 + }, + { + "epoch": 0.11924794847823829, + "grad_norm": 0.3765188753604889, + "learning_rate": 9.653218178329709e-05, + "loss": 1.8694, + "step": 1148 + }, + { + "epoch": 0.11935182299781863, + "grad_norm": 0.341296911239624, + "learning_rate": 9.652620863498393e-05, + "loss": 1.7641, + "step": 1149 + }, + { + "epoch": 0.11945569751739898, + "grad_norm": 0.37878480553627014, + "learning_rate": 9.652023053199247e-05, + "loss": 1.8416, + "step": 1150 + }, + { + "epoch": 0.11955957203697933, + "grad_norm": 0.39128556847572327, + "learning_rate": 9.651424747495931e-05, + "loss": 1.9727, + "step": 1151 + }, + { + "epoch": 0.11966344655655968, + "grad_norm": 0.32593607902526855, + "learning_rate": 9.650825946452164e-05, + "loss": 1.6429, + "step": 1152 + }, + { + "epoch": 0.11976732107614002, + "grad_norm": 0.365972101688385, + "learning_rate": 9.65022665013171e-05, + "loss": 1.7789, + "step": 1153 + }, + { + "epoch": 0.11987119559572038, + "grad_norm": 0.3774997889995575, + "learning_rate": 9.64962685859839e-05, + "loss": 1.8482, + "step": 1154 + }, + { + "epoch": 0.11997507011530072, + "grad_norm": 0.36030101776123047, + "learning_rate": 9.64902657191608e-05, + "loss": 1.621, + "step": 1155 + }, + { + "epoch": 0.12007894463488106, + "grad_norm": 0.40642428398132324, + "learning_rate": 9.648425790148704e-05, + "loss": 1.869, + "step": 1156 + }, + { + "epoch": 0.12018281915446141, + "grad_norm": 0.374668151140213, + "learning_rate": 9.64782451336024e-05, + "loss": 1.8155, + "step": 1157 + }, + { + "epoch": 0.12028669367404175, + "grad_norm": 0.33325710892677307, + "learning_rate": 9.64722274161472e-05, + "loss": 1.6419, + "step": 1158 + }, + { + "epoch": 0.12039056819362211, + "grad_norm": 0.36537379026412964, + "learning_rate": 9.646620474976226e-05, + "loss": 1.8969, + "step": 1159 + }, + { + "epoch": 0.12049444271320245, + "grad_norm": 0.3427730202674866, + "learning_rate": 9.6460177135089e-05, + "loss": 1.7814, + "step": 1160 + }, + { + "epoch": 0.1205983172327828, + "grad_norm": 0.39888283610343933, + "learning_rate": 9.645414457276925e-05, + "loss": 1.8611, + "step": 1161 + }, + { + "epoch": 0.12070219175236314, + "grad_norm": 0.35970908403396606, + "learning_rate": 9.644810706344547e-05, + "loss": 1.838, + "step": 1162 + }, + { + "epoch": 0.1208060662719435, + "grad_norm": 0.35236963629722595, + "learning_rate": 9.644206460776058e-05, + "loss": 1.7295, + "step": 1163 + }, + { + "epoch": 0.12090994079152384, + "grad_norm": 0.3539031445980072, + "learning_rate": 9.643601720635809e-05, + "loss": 1.8078, + "step": 1164 + }, + { + "epoch": 0.12101381531110418, + "grad_norm": 0.3607688546180725, + "learning_rate": 9.642996485988198e-05, + "loss": 1.8958, + "step": 1165 + }, + { + "epoch": 0.12111768983068454, + "grad_norm": 0.363936185836792, + "learning_rate": 9.642390756897677e-05, + "loss": 1.7418, + "step": 1166 + }, + { + "epoch": 0.12122156435026488, + "grad_norm": 0.3548314571380615, + "learning_rate": 9.641784533428754e-05, + "loss": 1.6745, + "step": 1167 + }, + { + "epoch": 0.12132543886984523, + "grad_norm": 0.43917304277420044, + "learning_rate": 9.641177815645984e-05, + "loss": 2.0477, + "step": 1168 + }, + { + "epoch": 0.12142931338942557, + "grad_norm": 0.36484494805336, + "learning_rate": 9.64057060361398e-05, + "loss": 1.8097, + "step": 1169 + }, + { + "epoch": 0.12153318790900593, + "grad_norm": 0.41941696405410767, + "learning_rate": 9.639962897397404e-05, + "loss": 1.921, + "step": 1170 + }, + { + "epoch": 0.12163706242858627, + "grad_norm": 0.3769179582595825, + "learning_rate": 9.639354697060974e-05, + "loss": 1.9793, + "step": 1171 + }, + { + "epoch": 0.12174093694816661, + "grad_norm": 0.3361814618110657, + "learning_rate": 9.638746002669455e-05, + "loss": 1.685, + "step": 1172 + }, + { + "epoch": 0.12184481146774696, + "grad_norm": 0.3244837522506714, + "learning_rate": 9.638136814287672e-05, + "loss": 1.6935, + "step": 1173 + }, + { + "epoch": 0.1219486859873273, + "grad_norm": 0.3598596751689911, + "learning_rate": 9.637527131980496e-05, + "loss": 1.6975, + "step": 1174 + }, + { + "epoch": 0.12205256050690766, + "grad_norm": 0.396697461605072, + "learning_rate": 9.636916955812856e-05, + "loss": 1.982, + "step": 1175 + }, + { + "epoch": 0.122156435026488, + "grad_norm": 0.37619683146476746, + "learning_rate": 9.636306285849728e-05, + "loss": 1.8531, + "step": 1176 + }, + { + "epoch": 0.12226030954606835, + "grad_norm": 0.3777252733707428, + "learning_rate": 9.635695122156145e-05, + "loss": 1.9038, + "step": 1177 + }, + { + "epoch": 0.1223641840656487, + "grad_norm": 0.39798396825790405, + "learning_rate": 9.635083464797192e-05, + "loss": 2.0153, + "step": 1178 + }, + { + "epoch": 0.12246805858522904, + "grad_norm": 0.3506300449371338, + "learning_rate": 9.634471313838005e-05, + "loss": 1.743, + "step": 1179 + }, + { + "epoch": 0.12257193310480939, + "grad_norm": 0.3415655493736267, + "learning_rate": 9.633858669343774e-05, + "loss": 1.7878, + "step": 1180 + }, + { + "epoch": 0.12267580762438973, + "grad_norm": 0.39789021015167236, + "learning_rate": 9.633245531379739e-05, + "loss": 1.9123, + "step": 1181 + }, + { + "epoch": 0.12277968214397009, + "grad_norm": 0.37617623805999756, + "learning_rate": 9.632631900011197e-05, + "loss": 1.8383, + "step": 1182 + }, + { + "epoch": 0.12288355666355043, + "grad_norm": 0.37131214141845703, + "learning_rate": 9.632017775303492e-05, + "loss": 1.817, + "step": 1183 + }, + { + "epoch": 0.12298743118313078, + "grad_norm": 0.4080236256122589, + "learning_rate": 9.631403157322026e-05, + "loss": 1.9845, + "step": 1184 + }, + { + "epoch": 0.12309130570271112, + "grad_norm": 0.37740015983581543, + "learning_rate": 9.630788046132251e-05, + "loss": 1.904, + "step": 1185 + }, + { + "epoch": 0.12319518022229148, + "grad_norm": 0.33329272270202637, + "learning_rate": 9.63017244179967e-05, + "loss": 1.7073, + "step": 1186 + }, + { + "epoch": 0.12329905474187182, + "grad_norm": 0.3489798307418823, + "learning_rate": 9.629556344389839e-05, + "loss": 1.7045, + "step": 1187 + }, + { + "epoch": 0.12340292926145216, + "grad_norm": 0.3395266532897949, + "learning_rate": 9.628939753968371e-05, + "loss": 1.7442, + "step": 1188 + }, + { + "epoch": 0.12350680378103251, + "grad_norm": 0.3406863808631897, + "learning_rate": 9.628322670600926e-05, + "loss": 1.7656, + "step": 1189 + }, + { + "epoch": 0.12361067830061286, + "grad_norm": 0.35596734285354614, + "learning_rate": 9.627705094353219e-05, + "loss": 2.0144, + "step": 1190 + }, + { + "epoch": 0.12371455282019321, + "grad_norm": 0.3764468729496002, + "learning_rate": 9.627087025291015e-05, + "loss": 1.8333, + "step": 1191 + }, + { + "epoch": 0.12381842733977355, + "grad_norm": 0.3464232087135315, + "learning_rate": 9.626468463480138e-05, + "loss": 1.5966, + "step": 1192 + }, + { + "epoch": 0.1239223018593539, + "grad_norm": 0.35695046186447144, + "learning_rate": 9.625849408986456e-05, + "loss": 1.7578, + "step": 1193 + }, + { + "epoch": 0.12402617637893425, + "grad_norm": 0.3387443721294403, + "learning_rate": 9.625229861875897e-05, + "loss": 1.6883, + "step": 1194 + }, + { + "epoch": 0.12413005089851459, + "grad_norm": 0.3431154191493988, + "learning_rate": 9.624609822214434e-05, + "loss": 1.534, + "step": 1195 + }, + { + "epoch": 0.12423392541809494, + "grad_norm": 0.3574540317058563, + "learning_rate": 9.623989290068099e-05, + "loss": 1.804, + "step": 1196 + }, + { + "epoch": 0.12433779993767528, + "grad_norm": 0.35088303685188293, + "learning_rate": 9.623368265502973e-05, + "loss": 1.8282, + "step": 1197 + }, + { + "epoch": 0.12444167445725564, + "grad_norm": 0.35268113017082214, + "learning_rate": 9.62274674858519e-05, + "loss": 1.748, + "step": 1198 + }, + { + "epoch": 0.12454554897683598, + "grad_norm": 0.4166131019592285, + "learning_rate": 9.622124739380936e-05, + "loss": 1.9086, + "step": 1199 + }, + { + "epoch": 0.12464942349641633, + "grad_norm": 0.38566648960113525, + "learning_rate": 9.621502237956452e-05, + "loss": 1.7812, + "step": 1200 + }, + { + "epoch": 0.12475329801599667, + "grad_norm": 0.35631853342056274, + "learning_rate": 9.62087924437803e-05, + "loss": 1.6562, + "step": 1201 + }, + { + "epoch": 0.12485717253557703, + "grad_norm": 0.34664246439933777, + "learning_rate": 9.620255758712012e-05, + "loss": 1.743, + "step": 1202 + }, + { + "epoch": 0.12496104705515737, + "grad_norm": 0.4030589759349823, + "learning_rate": 9.619631781024797e-05, + "loss": 1.7807, + "step": 1203 + }, + { + "epoch": 0.12506492157473773, + "grad_norm": 0.3608097732067108, + "learning_rate": 9.619007311382828e-05, + "loss": 1.8727, + "step": 1204 + }, + { + "epoch": 0.12516879609431805, + "grad_norm": 0.3390495777130127, + "learning_rate": 9.618382349852612e-05, + "loss": 1.6474, + "step": 1205 + }, + { + "epoch": 0.1252726706138984, + "grad_norm": 0.34509339928627014, + "learning_rate": 9.617756896500701e-05, + "loss": 1.7651, + "step": 1206 + }, + { + "epoch": 0.12537654513347876, + "grad_norm": 0.37745997309684753, + "learning_rate": 9.6171309513937e-05, + "loss": 1.6898, + "step": 1207 + }, + { + "epoch": 0.12548041965305912, + "grad_norm": 0.33759596943855286, + "learning_rate": 9.616504514598267e-05, + "loss": 1.5825, + "step": 1208 + }, + { + "epoch": 0.12558429417263944, + "grad_norm": 0.3818475008010864, + "learning_rate": 9.615877586181114e-05, + "loss": 1.7711, + "step": 1209 + }, + { + "epoch": 0.1256881686922198, + "grad_norm": 0.3622841536998749, + "learning_rate": 9.615250166209003e-05, + "loss": 1.8772, + "step": 1210 + }, + { + "epoch": 0.12579204321180015, + "grad_norm": 0.38078466057777405, + "learning_rate": 9.614622254748749e-05, + "loss": 1.6718, + "step": 1211 + }, + { + "epoch": 0.12589591773138048, + "grad_norm": 0.3571094572544098, + "learning_rate": 9.613993851867221e-05, + "loss": 1.7133, + "step": 1212 + }, + { + "epoch": 0.12599979225096083, + "grad_norm": 0.3744667172431946, + "learning_rate": 9.613364957631339e-05, + "loss": 1.8524, + "step": 1213 + }, + { + "epoch": 0.1261036667705412, + "grad_norm": 0.3752327561378479, + "learning_rate": 9.612735572108073e-05, + "loss": 1.7747, + "step": 1214 + }, + { + "epoch": 0.12620754129012154, + "grad_norm": 0.3768329620361328, + "learning_rate": 9.61210569536445e-05, + "loss": 1.7503, + "step": 1215 + }, + { + "epoch": 0.12631141580970187, + "grad_norm": 0.35730409622192383, + "learning_rate": 9.611475327467547e-05, + "loss": 1.8134, + "step": 1216 + }, + { + "epoch": 0.12641529032928223, + "grad_norm": 0.38782814145088196, + "learning_rate": 9.610844468484492e-05, + "loss": 1.9043, + "step": 1217 + }, + { + "epoch": 0.12651916484886258, + "grad_norm": 0.35042253136634827, + "learning_rate": 9.610213118482468e-05, + "loss": 1.6922, + "step": 1218 + }, + { + "epoch": 0.1266230393684429, + "grad_norm": 0.3477722108364105, + "learning_rate": 9.609581277528708e-05, + "loss": 1.8249, + "step": 1219 + }, + { + "epoch": 0.12672691388802326, + "grad_norm": 0.358598530292511, + "learning_rate": 9.608948945690496e-05, + "loss": 1.6571, + "step": 1220 + }, + { + "epoch": 0.12683078840760362, + "grad_norm": 0.34587663412094116, + "learning_rate": 9.608316123035172e-05, + "loss": 1.8308, + "step": 1221 + }, + { + "epoch": 0.12693466292718397, + "grad_norm": 0.3662189543247223, + "learning_rate": 9.60768280963013e-05, + "loss": 1.8263, + "step": 1222 + }, + { + "epoch": 0.1270385374467643, + "grad_norm": 0.4229857325553894, + "learning_rate": 9.60704900554281e-05, + "loss": 1.8465, + "step": 1223 + }, + { + "epoch": 0.12714241196634465, + "grad_norm": 0.3771613538265228, + "learning_rate": 9.606414710840706e-05, + "loss": 1.9523, + "step": 1224 + }, + { + "epoch": 0.127246286485925, + "grad_norm": 0.37686896324157715, + "learning_rate": 9.605779925591366e-05, + "loss": 1.8436, + "step": 1225 + }, + { + "epoch": 0.12735016100550534, + "grad_norm": 0.3408556580543518, + "learning_rate": 9.605144649862391e-05, + "loss": 1.8095, + "step": 1226 + }, + { + "epoch": 0.1274540355250857, + "grad_norm": 0.3675098717212677, + "learning_rate": 9.604508883721432e-05, + "loss": 1.9452, + "step": 1227 + }, + { + "epoch": 0.12755791004466605, + "grad_norm": 0.3680543899536133, + "learning_rate": 9.603872627236194e-05, + "loss": 1.8987, + "step": 1228 + }, + { + "epoch": 0.1276617845642464, + "grad_norm": 0.3140665888786316, + "learning_rate": 9.603235880474433e-05, + "loss": 1.6107, + "step": 1229 + }, + { + "epoch": 0.12776565908382673, + "grad_norm": 0.35767602920532227, + "learning_rate": 9.602598643503957e-05, + "loss": 1.5594, + "step": 1230 + }, + { + "epoch": 0.12786953360340708, + "grad_norm": 0.40325549244880676, + "learning_rate": 9.601960916392627e-05, + "loss": 2.082, + "step": 1231 + }, + { + "epoch": 0.12797340812298744, + "grad_norm": 0.34380728006362915, + "learning_rate": 9.601322699208357e-05, + "loss": 1.7435, + "step": 1232 + }, + { + "epoch": 0.1280772826425678, + "grad_norm": 0.3566792607307434, + "learning_rate": 9.60068399201911e-05, + "loss": 1.7594, + "step": 1233 + }, + { + "epoch": 0.12818115716214812, + "grad_norm": 0.34519535303115845, + "learning_rate": 9.600044794892905e-05, + "loss": 1.8354, + "step": 1234 + }, + { + "epoch": 0.12828503168172847, + "grad_norm": 0.36046940088272095, + "learning_rate": 9.599405107897811e-05, + "loss": 1.7715, + "step": 1235 + }, + { + "epoch": 0.12838890620130883, + "grad_norm": 0.33479416370391846, + "learning_rate": 9.598764931101949e-05, + "loss": 1.6034, + "step": 1236 + }, + { + "epoch": 0.12849278072088915, + "grad_norm": 0.3153260052204132, + "learning_rate": 9.598124264573495e-05, + "loss": 1.6561, + "step": 1237 + }, + { + "epoch": 0.1285966552404695, + "grad_norm": 0.3522169888019562, + "learning_rate": 9.597483108380674e-05, + "loss": 1.8149, + "step": 1238 + }, + { + "epoch": 0.12870052976004986, + "grad_norm": 0.39202046394348145, + "learning_rate": 9.596841462591761e-05, + "loss": 1.8963, + "step": 1239 + }, + { + "epoch": 0.12880440427963022, + "grad_norm": 0.3598194718360901, + "learning_rate": 9.596199327275091e-05, + "loss": 1.7161, + "step": 1240 + }, + { + "epoch": 0.12890827879921055, + "grad_norm": 0.36012735962867737, + "learning_rate": 9.595556702499045e-05, + "loss": 1.86, + "step": 1241 + }, + { + "epoch": 0.1290121533187909, + "grad_norm": 0.32633063197135925, + "learning_rate": 9.594913588332055e-05, + "loss": 1.5897, + "step": 1242 + }, + { + "epoch": 0.12911602783837126, + "grad_norm": 0.35175660252571106, + "learning_rate": 9.594269984842612e-05, + "loss": 1.8549, + "step": 1243 + }, + { + "epoch": 0.12921990235795158, + "grad_norm": 0.3716254234313965, + "learning_rate": 9.59362589209925e-05, + "loss": 1.8379, + "step": 1244 + }, + { + "epoch": 0.12932377687753194, + "grad_norm": 0.3607979118824005, + "learning_rate": 9.592981310170563e-05, + "loss": 1.8592, + "step": 1245 + }, + { + "epoch": 0.1294276513971123, + "grad_norm": 0.3418334424495697, + "learning_rate": 9.592336239125193e-05, + "loss": 1.7502, + "step": 1246 + }, + { + "epoch": 0.12953152591669265, + "grad_norm": 0.3638349771499634, + "learning_rate": 9.591690679031835e-05, + "loss": 1.8981, + "step": 1247 + }, + { + "epoch": 0.12963540043627297, + "grad_norm": 0.36420029401779175, + "learning_rate": 9.591044629959236e-05, + "loss": 1.913, + "step": 1248 + }, + { + "epoch": 0.12973927495585333, + "grad_norm": 0.3521292805671692, + "learning_rate": 9.590398091976195e-05, + "loss": 1.6886, + "step": 1249 + }, + { + "epoch": 0.12984314947543368, + "grad_norm": 0.3583468198776245, + "learning_rate": 9.589751065151567e-05, + "loss": 1.7328, + "step": 1250 + }, + { + "epoch": 0.129947023995014, + "grad_norm": 0.3620645999908447, + "learning_rate": 9.589103549554248e-05, + "loss": 1.9107, + "step": 1251 + }, + { + "epoch": 0.13005089851459437, + "grad_norm": 0.361101895570755, + "learning_rate": 9.5884555452532e-05, + "loss": 1.7693, + "step": 1252 + }, + { + "epoch": 0.13015477303417472, + "grad_norm": 0.3591129183769226, + "learning_rate": 9.587807052317426e-05, + "loss": 1.696, + "step": 1253 + }, + { + "epoch": 0.13025864755375507, + "grad_norm": 0.3596014678478241, + "learning_rate": 9.587158070815988e-05, + "loss": 1.9454, + "step": 1254 + }, + { + "epoch": 0.1303625220733354, + "grad_norm": 0.35805267095565796, + "learning_rate": 9.586508600817997e-05, + "loss": 1.8159, + "step": 1255 + }, + { + "epoch": 0.13046639659291576, + "grad_norm": 0.3794294595718384, + "learning_rate": 9.585858642392616e-05, + "loss": 1.7915, + "step": 1256 + }, + { + "epoch": 0.1305702711124961, + "grad_norm": 0.350418359041214, + "learning_rate": 9.585208195609059e-05, + "loss": 1.754, + "step": 1257 + }, + { + "epoch": 0.13067414563207644, + "grad_norm": 0.37485983967781067, + "learning_rate": 9.584557260536597e-05, + "loss": 1.8592, + "step": 1258 + }, + { + "epoch": 0.1307780201516568, + "grad_norm": 0.38537418842315674, + "learning_rate": 9.583905837244547e-05, + "loss": 1.814, + "step": 1259 + }, + { + "epoch": 0.13088189467123715, + "grad_norm": 0.3657982349395752, + "learning_rate": 9.583253925802283e-05, + "loss": 1.7968, + "step": 1260 + }, + { + "epoch": 0.1309857691908175, + "grad_norm": 0.4190506935119629, + "learning_rate": 9.582601526279225e-05, + "loss": 2.0191, + "step": 1261 + }, + { + "epoch": 0.13108964371039783, + "grad_norm": 0.3682914674282074, + "learning_rate": 9.581948638744849e-05, + "loss": 1.7923, + "step": 1262 + }, + { + "epoch": 0.13119351822997818, + "grad_norm": 0.33630824089050293, + "learning_rate": 9.581295263268684e-05, + "loss": 1.6561, + "step": 1263 + }, + { + "epoch": 0.13129739274955854, + "grad_norm": 0.37690606713294983, + "learning_rate": 9.580641399920311e-05, + "loss": 1.8836, + "step": 1264 + }, + { + "epoch": 0.1314012672691389, + "grad_norm": 0.3465617895126343, + "learning_rate": 9.579987048769356e-05, + "loss": 1.7498, + "step": 1265 + }, + { + "epoch": 0.13150514178871922, + "grad_norm": 0.3308243155479431, + "learning_rate": 9.579332209885508e-05, + "loss": 1.6476, + "step": 1266 + }, + { + "epoch": 0.13160901630829958, + "grad_norm": 0.38376688957214355, + "learning_rate": 9.5786768833385e-05, + "loss": 1.7795, + "step": 1267 + }, + { + "epoch": 0.13171289082787993, + "grad_norm": 0.35049548745155334, + "learning_rate": 9.578021069198118e-05, + "loss": 1.718, + "step": 1268 + }, + { + "epoch": 0.13181676534746026, + "grad_norm": 0.34886178374290466, + "learning_rate": 9.577364767534202e-05, + "loss": 1.7214, + "step": 1269 + }, + { + "epoch": 0.1319206398670406, + "grad_norm": 0.36722105741500854, + "learning_rate": 9.576707978416644e-05, + "loss": 1.6402, + "step": 1270 + }, + { + "epoch": 0.13202451438662097, + "grad_norm": 0.4114188849925995, + "learning_rate": 9.576050701915385e-05, + "loss": 1.8282, + "step": 1271 + }, + { + "epoch": 0.13212838890620132, + "grad_norm": 0.40800419449806213, + "learning_rate": 9.575392938100422e-05, + "loss": 2.0049, + "step": 1272 + }, + { + "epoch": 0.13223226342578165, + "grad_norm": 0.3313359320163727, + "learning_rate": 9.5747346870418e-05, + "loss": 1.678, + "step": 1273 + }, + { + "epoch": 0.132336137945362, + "grad_norm": 0.33527714014053345, + "learning_rate": 9.574075948809618e-05, + "loss": 1.7132, + "step": 1274 + }, + { + "epoch": 0.13244001246494236, + "grad_norm": 0.37509602308273315, + "learning_rate": 9.573416723474026e-05, + "loss": 1.7306, + "step": 1275 + }, + { + "epoch": 0.13254388698452269, + "grad_norm": 0.41604387760162354, + "learning_rate": 9.572757011105227e-05, + "loss": 1.8486, + "step": 1276 + }, + { + "epoch": 0.13264776150410304, + "grad_norm": 0.3609203100204468, + "learning_rate": 9.572096811773477e-05, + "loss": 1.7497, + "step": 1277 + }, + { + "epoch": 0.1327516360236834, + "grad_norm": 0.346780389547348, + "learning_rate": 9.571436125549078e-05, + "loss": 1.6892, + "step": 1278 + }, + { + "epoch": 0.13285551054326375, + "grad_norm": 0.37450289726257324, + "learning_rate": 9.57077495250239e-05, + "loss": 1.77, + "step": 1279 + }, + { + "epoch": 0.13295938506284408, + "grad_norm": 0.36386212706565857, + "learning_rate": 9.570113292703826e-05, + "loss": 1.8936, + "step": 1280 + }, + { + "epoch": 0.13306325958242443, + "grad_norm": 0.381849080324173, + "learning_rate": 9.569451146223843e-05, + "loss": 1.7727, + "step": 1281 + }, + { + "epoch": 0.13316713410200479, + "grad_norm": 0.3608989715576172, + "learning_rate": 9.568788513132955e-05, + "loss": 1.8763, + "step": 1282 + }, + { + "epoch": 0.1332710086215851, + "grad_norm": 0.37927472591400146, + "learning_rate": 9.568125393501728e-05, + "loss": 1.864, + "step": 1283 + }, + { + "epoch": 0.13337488314116547, + "grad_norm": 0.37271663546562195, + "learning_rate": 9.567461787400781e-05, + "loss": 1.7884, + "step": 1284 + }, + { + "epoch": 0.13347875766074582, + "grad_norm": 0.34601160883903503, + "learning_rate": 9.566797694900779e-05, + "loss": 1.7951, + "step": 1285 + }, + { + "epoch": 0.13358263218032618, + "grad_norm": 0.32820773124694824, + "learning_rate": 9.566133116072447e-05, + "loss": 1.602, + "step": 1286 + }, + { + "epoch": 0.1336865066999065, + "grad_norm": 0.36603811383247375, + "learning_rate": 9.565468050986556e-05, + "loss": 1.7799, + "step": 1287 + }, + { + "epoch": 0.13379038121948686, + "grad_norm": 0.33986133337020874, + "learning_rate": 9.564802499713928e-05, + "loss": 1.7055, + "step": 1288 + }, + { + "epoch": 0.13389425573906721, + "grad_norm": 0.37927207350730896, + "learning_rate": 9.564136462325442e-05, + "loss": 1.8803, + "step": 1289 + }, + { + "epoch": 0.13399813025864754, + "grad_norm": 0.3654598593711853, + "learning_rate": 9.563469938892022e-05, + "loss": 1.8929, + "step": 1290 + }, + { + "epoch": 0.1341020047782279, + "grad_norm": 0.38735446333885193, + "learning_rate": 9.562802929484651e-05, + "loss": 1.6864, + "step": 1291 + }, + { + "epoch": 0.13420587929780825, + "grad_norm": 0.3329310417175293, + "learning_rate": 9.56213543417436e-05, + "loss": 1.7724, + "step": 1292 + }, + { + "epoch": 0.1343097538173886, + "grad_norm": 0.37568461894989014, + "learning_rate": 9.561467453032231e-05, + "loss": 1.794, + "step": 1293 + }, + { + "epoch": 0.13441362833696893, + "grad_norm": 0.3253065049648285, + "learning_rate": 9.560798986129399e-05, + "loss": 1.7469, + "step": 1294 + }, + { + "epoch": 0.1345175028565493, + "grad_norm": 0.32807695865631104, + "learning_rate": 9.56013003353705e-05, + "loss": 1.7181, + "step": 1295 + }, + { + "epoch": 0.13462137737612964, + "grad_norm": 0.34811633825302124, + "learning_rate": 9.559460595326424e-05, + "loss": 1.6823, + "step": 1296 + }, + { + "epoch": 0.13472525189570997, + "grad_norm": 0.3585864007472992, + "learning_rate": 9.558790671568809e-05, + "loss": 1.8669, + "step": 1297 + }, + { + "epoch": 0.13482912641529032, + "grad_norm": 0.3390277624130249, + "learning_rate": 9.558120262335546e-05, + "loss": 1.6597, + "step": 1298 + }, + { + "epoch": 0.13493300093487068, + "grad_norm": 0.3469536006450653, + "learning_rate": 9.557449367698032e-05, + "loss": 1.8582, + "step": 1299 + }, + { + "epoch": 0.13503687545445103, + "grad_norm": 0.35477346181869507, + "learning_rate": 9.55677798772771e-05, + "loss": 1.8483, + "step": 1300 + }, + { + "epoch": 0.13514074997403136, + "grad_norm": 0.3245786726474762, + "learning_rate": 9.556106122496076e-05, + "loss": 1.6194, + "step": 1301 + }, + { + "epoch": 0.13524462449361171, + "grad_norm": 0.37269967794418335, + "learning_rate": 9.555433772074678e-05, + "loss": 1.8113, + "step": 1302 + }, + { + "epoch": 0.13534849901319207, + "grad_norm": 0.41088709235191345, + "learning_rate": 9.554760936535116e-05, + "loss": 1.9416, + "step": 1303 + }, + { + "epoch": 0.13545237353277242, + "grad_norm": 0.35153427720069885, + "learning_rate": 9.554087615949046e-05, + "loss": 1.6847, + "step": 1304 + }, + { + "epoch": 0.13555624805235275, + "grad_norm": 0.38244035840034485, + "learning_rate": 9.553413810388165e-05, + "loss": 1.8235, + "step": 1305 + }, + { + "epoch": 0.1356601225719331, + "grad_norm": 0.33529362082481384, + "learning_rate": 9.552739519924235e-05, + "loss": 1.6378, + "step": 1306 + }, + { + "epoch": 0.13576399709151346, + "grad_norm": 0.335685133934021, + "learning_rate": 9.552064744629054e-05, + "loss": 1.8144, + "step": 1307 + }, + { + "epoch": 0.1358678716110938, + "grad_norm": 0.33851975202560425, + "learning_rate": 9.551389484574489e-05, + "loss": 1.6885, + "step": 1308 + }, + { + "epoch": 0.13597174613067414, + "grad_norm": 0.4007343649864197, + "learning_rate": 9.550713739832444e-05, + "loss": 2.0096, + "step": 1309 + }, + { + "epoch": 0.1360756206502545, + "grad_norm": 0.3581026494503021, + "learning_rate": 9.550037510474883e-05, + "loss": 1.7002, + "step": 1310 + }, + { + "epoch": 0.13617949516983485, + "grad_norm": 0.3608642518520355, + "learning_rate": 9.549360796573821e-05, + "loss": 1.7883, + "step": 1311 + }, + { + "epoch": 0.13628336968941518, + "grad_norm": 0.39638829231262207, + "learning_rate": 9.548683598201317e-05, + "loss": 1.7193, + "step": 1312 + }, + { + "epoch": 0.13638724420899553, + "grad_norm": 0.3727129399776459, + "learning_rate": 9.548005915429495e-05, + "loss": 1.8664, + "step": 1313 + }, + { + "epoch": 0.1364911187285759, + "grad_norm": 0.34268417954444885, + "learning_rate": 9.547327748330514e-05, + "loss": 1.7827, + "step": 1314 + }, + { + "epoch": 0.13659499324815622, + "grad_norm": 0.3418048918247223, + "learning_rate": 9.5466490969766e-05, + "loss": 1.7287, + "step": 1315 + }, + { + "epoch": 0.13669886776773657, + "grad_norm": 0.3453529477119446, + "learning_rate": 9.545969961440025e-05, + "loss": 1.7565, + "step": 1316 + }, + { + "epoch": 0.13680274228731693, + "grad_norm": 0.35973402857780457, + "learning_rate": 9.545290341793107e-05, + "loss": 1.8727, + "step": 1317 + }, + { + "epoch": 0.13690661680689728, + "grad_norm": 0.3567400872707367, + "learning_rate": 9.544610238108223e-05, + "loss": 1.8201, + "step": 1318 + }, + { + "epoch": 0.1370104913264776, + "grad_norm": 0.37093833088874817, + "learning_rate": 9.543929650457796e-05, + "loss": 1.8134, + "step": 1319 + }, + { + "epoch": 0.13711436584605796, + "grad_norm": 0.3393130600452423, + "learning_rate": 9.543248578914307e-05, + "loss": 1.6867, + "step": 1320 + }, + { + "epoch": 0.13721824036563832, + "grad_norm": 0.3978770673274994, + "learning_rate": 9.542567023550283e-05, + "loss": 1.9418, + "step": 1321 + }, + { + "epoch": 0.13732211488521864, + "grad_norm": 0.33980122208595276, + "learning_rate": 9.541884984438304e-05, + "loss": 1.6986, + "step": 1322 + }, + { + "epoch": 0.137425989404799, + "grad_norm": 0.3643237054347992, + "learning_rate": 9.541202461651002e-05, + "loss": 1.6953, + "step": 1323 + }, + { + "epoch": 0.13752986392437935, + "grad_norm": 0.3416014313697815, + "learning_rate": 9.54051945526106e-05, + "loss": 1.6347, + "step": 1324 + }, + { + "epoch": 0.1376337384439597, + "grad_norm": 0.42336559295654297, + "learning_rate": 9.539835965341215e-05, + "loss": 2.0321, + "step": 1325 + }, + { + "epoch": 0.13773761296354003, + "grad_norm": 0.3578283488750458, + "learning_rate": 9.539151991964253e-05, + "loss": 1.7978, + "step": 1326 + }, + { + "epoch": 0.1378414874831204, + "grad_norm": 0.39390215277671814, + "learning_rate": 9.538467535203007e-05, + "loss": 2.0113, + "step": 1327 + }, + { + "epoch": 0.13794536200270074, + "grad_norm": 0.34902918338775635, + "learning_rate": 9.537782595130371e-05, + "loss": 1.7936, + "step": 1328 + }, + { + "epoch": 0.13804923652228107, + "grad_norm": 0.3545171916484833, + "learning_rate": 9.537097171819287e-05, + "loss": 1.8352, + "step": 1329 + }, + { + "epoch": 0.13815311104186143, + "grad_norm": 0.41688135266304016, + "learning_rate": 9.536411265342742e-05, + "loss": 1.9648, + "step": 1330 + }, + { + "epoch": 0.13825698556144178, + "grad_norm": 0.34818556904792786, + "learning_rate": 9.535724875773784e-05, + "loss": 1.6804, + "step": 1331 + }, + { + "epoch": 0.13836086008102214, + "grad_norm": 0.3562292754650116, + "learning_rate": 9.535038003185507e-05, + "loss": 1.697, + "step": 1332 + }, + { + "epoch": 0.13846473460060246, + "grad_norm": 0.3660946786403656, + "learning_rate": 9.534350647651056e-05, + "loss": 1.8382, + "step": 1333 + }, + { + "epoch": 0.13856860912018282, + "grad_norm": 0.38286975026130676, + "learning_rate": 9.533662809243632e-05, + "loss": 1.8386, + "step": 1334 + }, + { + "epoch": 0.13867248363976317, + "grad_norm": 0.3726559281349182, + "learning_rate": 9.532974488036481e-05, + "loss": 1.701, + "step": 1335 + }, + { + "epoch": 0.1387763581593435, + "grad_norm": 0.37573710083961487, + "learning_rate": 9.532285684102907e-05, + "loss": 1.9953, + "step": 1336 + }, + { + "epoch": 0.13888023267892385, + "grad_norm": 0.36941513419151306, + "learning_rate": 9.53159639751626e-05, + "loss": 1.7335, + "step": 1337 + }, + { + "epoch": 0.1389841071985042, + "grad_norm": 0.3815058767795563, + "learning_rate": 9.530906628349945e-05, + "loss": 1.929, + "step": 1338 + }, + { + "epoch": 0.13908798171808456, + "grad_norm": 0.3447597026824951, + "learning_rate": 9.530216376677417e-05, + "loss": 1.7699, + "step": 1339 + }, + { + "epoch": 0.1391918562376649, + "grad_norm": 0.3957836627960205, + "learning_rate": 9.529525642572182e-05, + "loss": 1.8501, + "step": 1340 + }, + { + "epoch": 0.13929573075724525, + "grad_norm": 0.44103363156318665, + "learning_rate": 9.528834426107795e-05, + "loss": 1.9139, + "step": 1341 + }, + { + "epoch": 0.1393996052768256, + "grad_norm": 0.3422475755214691, + "learning_rate": 9.52814272735787e-05, + "loss": 1.7613, + "step": 1342 + }, + { + "epoch": 0.13950347979640595, + "grad_norm": 0.3645455837249756, + "learning_rate": 9.527450546396067e-05, + "loss": 1.7897, + "step": 1343 + }, + { + "epoch": 0.13960735431598628, + "grad_norm": 0.3628111779689789, + "learning_rate": 9.526757883296094e-05, + "loss": 1.782, + "step": 1344 + }, + { + "epoch": 0.13971122883556664, + "grad_norm": 0.3505362868309021, + "learning_rate": 9.526064738131717e-05, + "loss": 1.7047, + "step": 1345 + }, + { + "epoch": 0.139815103355147, + "grad_norm": 0.36256182193756104, + "learning_rate": 9.52537111097675e-05, + "loss": 1.8108, + "step": 1346 + }, + { + "epoch": 0.13991897787472732, + "grad_norm": 0.34468889236450195, + "learning_rate": 9.524677001905059e-05, + "loss": 1.7674, + "step": 1347 + }, + { + "epoch": 0.14002285239430767, + "grad_norm": 0.36335834860801697, + "learning_rate": 9.52398241099056e-05, + "loss": 1.7181, + "step": 1348 + }, + { + "epoch": 0.14012672691388803, + "grad_norm": 0.3589484393596649, + "learning_rate": 9.523287338307224e-05, + "loss": 1.8431, + "step": 1349 + }, + { + "epoch": 0.14023060143346838, + "grad_norm": 0.3601551949977875, + "learning_rate": 9.522591783929069e-05, + "loss": 1.6761, + "step": 1350 + }, + { + "epoch": 0.1403344759530487, + "grad_norm": 0.34505337476730347, + "learning_rate": 9.521895747930168e-05, + "loss": 1.7923, + "step": 1351 + }, + { + "epoch": 0.14043835047262906, + "grad_norm": 0.3898836374282837, + "learning_rate": 9.521199230384641e-05, + "loss": 1.9424, + "step": 1352 + }, + { + "epoch": 0.14054222499220942, + "grad_norm": 0.4122181534767151, + "learning_rate": 9.520502231366661e-05, + "loss": 1.7709, + "step": 1353 + }, + { + "epoch": 0.14064609951178975, + "grad_norm": 0.36563369631767273, + "learning_rate": 9.519804750950457e-05, + "loss": 1.8535, + "step": 1354 + }, + { + "epoch": 0.1407499740313701, + "grad_norm": 0.3425121009349823, + "learning_rate": 9.519106789210301e-05, + "loss": 1.6418, + "step": 1355 + }, + { + "epoch": 0.14085384855095046, + "grad_norm": 0.36973652243614197, + "learning_rate": 9.518408346220525e-05, + "loss": 1.7506, + "step": 1356 + }, + { + "epoch": 0.1409577230705308, + "grad_norm": 0.36737698316574097, + "learning_rate": 9.517709422055502e-05, + "loss": 1.756, + "step": 1357 + }, + { + "epoch": 0.14106159759011114, + "grad_norm": 0.36824169754981995, + "learning_rate": 9.517010016789667e-05, + "loss": 1.7849, + "step": 1358 + }, + { + "epoch": 0.1411654721096915, + "grad_norm": 0.44827938079833984, + "learning_rate": 9.516310130497498e-05, + "loss": 2.191, + "step": 1359 + }, + { + "epoch": 0.14126934662927185, + "grad_norm": 0.33879658579826355, + "learning_rate": 9.51560976325353e-05, + "loss": 1.6468, + "step": 1360 + }, + { + "epoch": 0.14137322114885217, + "grad_norm": 0.3727847933769226, + "learning_rate": 9.514908915132346e-05, + "loss": 1.5898, + "step": 1361 + }, + { + "epoch": 0.14147709566843253, + "grad_norm": 0.34985223412513733, + "learning_rate": 9.514207586208578e-05, + "loss": 1.5243, + "step": 1362 + }, + { + "epoch": 0.14158097018801288, + "grad_norm": 0.39218172430992126, + "learning_rate": 9.513505776556915e-05, + "loss": 1.8003, + "step": 1363 + }, + { + "epoch": 0.14168484470759324, + "grad_norm": 0.3964877426624298, + "learning_rate": 9.512803486252093e-05, + "loss": 1.7633, + "step": 1364 + }, + { + "epoch": 0.14178871922717357, + "grad_norm": 0.3854914903640747, + "learning_rate": 9.512100715368902e-05, + "loss": 1.8179, + "step": 1365 + }, + { + "epoch": 0.14189259374675392, + "grad_norm": 0.35278111696243286, + "learning_rate": 9.511397463982181e-05, + "loss": 1.8105, + "step": 1366 + }, + { + "epoch": 0.14199646826633427, + "grad_norm": 0.45402637124061584, + "learning_rate": 9.510693732166819e-05, + "loss": 1.9222, + "step": 1367 + }, + { + "epoch": 0.1421003427859146, + "grad_norm": 0.3488807678222656, + "learning_rate": 9.50998951999776e-05, + "loss": 1.6662, + "step": 1368 + }, + { + "epoch": 0.14220421730549496, + "grad_norm": 0.3388180136680603, + "learning_rate": 9.509284827549996e-05, + "loss": 1.6714, + "step": 1369 + }, + { + "epoch": 0.1423080918250753, + "grad_norm": 0.36008358001708984, + "learning_rate": 9.508579654898571e-05, + "loss": 1.8606, + "step": 1370 + }, + { + "epoch": 0.14241196634465567, + "grad_norm": 0.37350037693977356, + "learning_rate": 9.507874002118582e-05, + "loss": 1.8999, + "step": 1371 + }, + { + "epoch": 0.142515840864236, + "grad_norm": 0.3648621737957001, + "learning_rate": 9.507167869285174e-05, + "loss": 1.9342, + "step": 1372 + }, + { + "epoch": 0.14261971538381635, + "grad_norm": 0.3702144920825958, + "learning_rate": 9.506461256473545e-05, + "loss": 1.8062, + "step": 1373 + }, + { + "epoch": 0.1427235899033967, + "grad_norm": 0.37488213181495667, + "learning_rate": 9.505754163758946e-05, + "loss": 1.8751, + "step": 1374 + }, + { + "epoch": 0.14282746442297706, + "grad_norm": 0.37895387411117554, + "learning_rate": 9.505046591216674e-05, + "loss": 1.6707, + "step": 1375 + }, + { + "epoch": 0.14293133894255738, + "grad_norm": 0.3528745174407959, + "learning_rate": 9.504338538922078e-05, + "loss": 1.8434, + "step": 1376 + }, + { + "epoch": 0.14303521346213774, + "grad_norm": 0.3752725422382355, + "learning_rate": 9.503630006950566e-05, + "loss": 1.9281, + "step": 1377 + }, + { + "epoch": 0.1431390879817181, + "grad_norm": 0.38882383704185486, + "learning_rate": 9.502920995377587e-05, + "loss": 1.6748, + "step": 1378 + }, + { + "epoch": 0.14324296250129842, + "grad_norm": 0.3682365417480469, + "learning_rate": 9.502211504278644e-05, + "loss": 1.767, + "step": 1379 + }, + { + "epoch": 0.14334683702087878, + "grad_norm": 0.339139848947525, + "learning_rate": 9.501501533729297e-05, + "loss": 1.5963, + "step": 1380 + }, + { + "epoch": 0.14345071154045913, + "grad_norm": 0.3614344298839569, + "learning_rate": 9.50079108380515e-05, + "loss": 1.7736, + "step": 1381 + }, + { + "epoch": 0.14355458606003949, + "grad_norm": 0.33216235041618347, + "learning_rate": 9.500080154581859e-05, + "loss": 1.8277, + "step": 1382 + }, + { + "epoch": 0.1436584605796198, + "grad_norm": 0.3475283086299896, + "learning_rate": 9.499368746135133e-05, + "loss": 1.7244, + "step": 1383 + }, + { + "epoch": 0.14376233509920017, + "grad_norm": 0.35839635133743286, + "learning_rate": 9.498656858540731e-05, + "loss": 1.8232, + "step": 1384 + }, + { + "epoch": 0.14386620961878052, + "grad_norm": 0.37137824296951294, + "learning_rate": 9.497944491874467e-05, + "loss": 1.7515, + "step": 1385 + }, + { + "epoch": 0.14397008413836085, + "grad_norm": 0.35471752285957336, + "learning_rate": 9.4972316462122e-05, + "loss": 1.8021, + "step": 1386 + }, + { + "epoch": 0.1440739586579412, + "grad_norm": 0.3838224411010742, + "learning_rate": 9.49651832162984e-05, + "loss": 1.8813, + "step": 1387 + }, + { + "epoch": 0.14417783317752156, + "grad_norm": 0.32974106073379517, + "learning_rate": 9.495804518203355e-05, + "loss": 1.7155, + "step": 1388 + }, + { + "epoch": 0.1442817076971019, + "grad_norm": 0.35001546144485474, + "learning_rate": 9.495090236008756e-05, + "loss": 1.8771, + "step": 1389 + }, + { + "epoch": 0.14438558221668224, + "grad_norm": 0.37329480051994324, + "learning_rate": 9.49437547512211e-05, + "loss": 1.894, + "step": 1390 + }, + { + "epoch": 0.1444894567362626, + "grad_norm": 0.35633528232574463, + "learning_rate": 9.493660235619535e-05, + "loss": 1.7673, + "step": 1391 + }, + { + "epoch": 0.14459333125584295, + "grad_norm": 0.37964895367622375, + "learning_rate": 9.492944517577196e-05, + "loss": 2.0935, + "step": 1392 + }, + { + "epoch": 0.14469720577542328, + "grad_norm": 0.33758020401000977, + "learning_rate": 9.492228321071311e-05, + "loss": 1.6482, + "step": 1393 + }, + { + "epoch": 0.14480108029500363, + "grad_norm": 0.39080965518951416, + "learning_rate": 9.491511646178151e-05, + "loss": 1.6859, + "step": 1394 + }, + { + "epoch": 0.14490495481458399, + "grad_norm": 0.36754679679870605, + "learning_rate": 9.490794492974038e-05, + "loss": 1.6268, + "step": 1395 + }, + { + "epoch": 0.14500882933416434, + "grad_norm": 0.33904823660850525, + "learning_rate": 9.490076861535339e-05, + "loss": 1.8217, + "step": 1396 + }, + { + "epoch": 0.14511270385374467, + "grad_norm": 0.36078742146492004, + "learning_rate": 9.489358751938477e-05, + "loss": 1.7979, + "step": 1397 + }, + { + "epoch": 0.14521657837332502, + "grad_norm": 0.37665989995002747, + "learning_rate": 9.488640164259929e-05, + "loss": 1.7041, + "step": 1398 + }, + { + "epoch": 0.14532045289290538, + "grad_norm": 0.3702797591686249, + "learning_rate": 9.487921098576216e-05, + "loss": 1.8746, + "step": 1399 + }, + { + "epoch": 0.1454243274124857, + "grad_norm": 0.3909272253513336, + "learning_rate": 9.487201554963911e-05, + "loss": 1.9115, + "step": 1400 + }, + { + "epoch": 0.14552820193206606, + "grad_norm": 0.33725446462631226, + "learning_rate": 9.486481533499642e-05, + "loss": 1.6462, + "step": 1401 + }, + { + "epoch": 0.14563207645164641, + "grad_norm": 0.3404362201690674, + "learning_rate": 9.485761034260087e-05, + "loss": 1.5963, + "step": 1402 + }, + { + "epoch": 0.14573595097122677, + "grad_norm": 0.3509582281112671, + "learning_rate": 9.485040057321972e-05, + "loss": 1.7931, + "step": 1403 + }, + { + "epoch": 0.1458398254908071, + "grad_norm": 0.3783257007598877, + "learning_rate": 9.484318602762077e-05, + "loss": 2.0324, + "step": 1404 + }, + { + "epoch": 0.14594370001038745, + "grad_norm": 0.36144283413887024, + "learning_rate": 9.483596670657228e-05, + "loss": 1.5949, + "step": 1405 + }, + { + "epoch": 0.1460475745299678, + "grad_norm": 0.33135420083999634, + "learning_rate": 9.482874261084307e-05, + "loss": 1.6486, + "step": 1406 + }, + { + "epoch": 0.14615144904954813, + "grad_norm": 0.35105010867118835, + "learning_rate": 9.482151374120244e-05, + "loss": 1.7674, + "step": 1407 + }, + { + "epoch": 0.1462553235691285, + "grad_norm": 0.3350829780101776, + "learning_rate": 9.481428009842024e-05, + "loss": 1.6668, + "step": 1408 + }, + { + "epoch": 0.14635919808870884, + "grad_norm": 0.3917206823825836, + "learning_rate": 9.480704168326676e-05, + "loss": 1.7319, + "step": 1409 + }, + { + "epoch": 0.1464630726082892, + "grad_norm": 0.35552677512168884, + "learning_rate": 9.479979849651286e-05, + "loss": 1.7501, + "step": 1410 + }, + { + "epoch": 0.14656694712786952, + "grad_norm": 0.3432137072086334, + "learning_rate": 9.479255053892986e-05, + "loss": 1.7739, + "step": 1411 + }, + { + "epoch": 0.14667082164744988, + "grad_norm": 0.37724825739860535, + "learning_rate": 9.478529781128962e-05, + "loss": 1.8712, + "step": 1412 + }, + { + "epoch": 0.14677469616703023, + "grad_norm": 0.37483900785446167, + "learning_rate": 9.477804031436451e-05, + "loss": 1.7262, + "step": 1413 + }, + { + "epoch": 0.1468785706866106, + "grad_norm": 0.35302111506462097, + "learning_rate": 9.47707780489274e-05, + "loss": 1.68, + "step": 1414 + }, + { + "epoch": 0.14698244520619091, + "grad_norm": 0.3883351683616638, + "learning_rate": 9.476351101575163e-05, + "loss": 1.7192, + "step": 1415 + }, + { + "epoch": 0.14708631972577127, + "grad_norm": 0.3823285400867462, + "learning_rate": 9.475623921561112e-05, + "loss": 1.8355, + "step": 1416 + }, + { + "epoch": 0.14719019424535162, + "grad_norm": 0.3662102520465851, + "learning_rate": 9.474896264928024e-05, + "loss": 1.5081, + "step": 1417 + }, + { + "epoch": 0.14729406876493195, + "grad_norm": 0.35046282410621643, + "learning_rate": 9.47416813175339e-05, + "loss": 1.7609, + "step": 1418 + }, + { + "epoch": 0.1473979432845123, + "grad_norm": 0.37491992115974426, + "learning_rate": 9.47343952211475e-05, + "loss": 1.9254, + "step": 1419 + }, + { + "epoch": 0.14750181780409266, + "grad_norm": 0.38457345962524414, + "learning_rate": 9.472710436089693e-05, + "loss": 1.8976, + "step": 1420 + }, + { + "epoch": 0.14760569232367302, + "grad_norm": 0.39856263995170593, + "learning_rate": 9.471980873755865e-05, + "loss": 1.989, + "step": 1421 + }, + { + "epoch": 0.14770956684325334, + "grad_norm": 0.3745686113834381, + "learning_rate": 9.471250835190957e-05, + "loss": 1.7134, + "step": 1422 + }, + { + "epoch": 0.1478134413628337, + "grad_norm": 0.360574871301651, + "learning_rate": 9.470520320472712e-05, + "loss": 1.722, + "step": 1423 + }, + { + "epoch": 0.14791731588241405, + "grad_norm": 0.35429859161376953, + "learning_rate": 9.469789329678922e-05, + "loss": 1.5907, + "step": 1424 + }, + { + "epoch": 0.14802119040199438, + "grad_norm": 0.3642517030239105, + "learning_rate": 9.469057862887436e-05, + "loss": 1.7298, + "step": 1425 + }, + { + "epoch": 0.14812506492157473, + "grad_norm": 0.4171449840068817, + "learning_rate": 9.468325920176148e-05, + "loss": 1.9391, + "step": 1426 + }, + { + "epoch": 0.1482289394411551, + "grad_norm": 0.3378536105155945, + "learning_rate": 9.467593501623003e-05, + "loss": 1.7072, + "step": 1427 + }, + { + "epoch": 0.14833281396073544, + "grad_norm": 0.37104731798171997, + "learning_rate": 9.466860607305998e-05, + "loss": 1.7929, + "step": 1428 + }, + { + "epoch": 0.14843668848031577, + "grad_norm": 0.38163742423057556, + "learning_rate": 9.466127237303182e-05, + "loss": 1.8288, + "step": 1429 + }, + { + "epoch": 0.14854056299989613, + "grad_norm": 0.3787825107574463, + "learning_rate": 9.465393391692653e-05, + "loss": 1.7313, + "step": 1430 + }, + { + "epoch": 0.14864443751947648, + "grad_norm": 0.3504464626312256, + "learning_rate": 9.464659070552558e-05, + "loss": 1.7905, + "step": 1431 + }, + { + "epoch": 0.1487483120390568, + "grad_norm": 0.3874281346797943, + "learning_rate": 9.463924273961099e-05, + "loss": 1.8856, + "step": 1432 + }, + { + "epoch": 0.14885218655863716, + "grad_norm": 0.35603591799736023, + "learning_rate": 9.463189001996525e-05, + "loss": 1.7671, + "step": 1433 + }, + { + "epoch": 0.14895606107821752, + "grad_norm": 0.37857985496520996, + "learning_rate": 9.462453254737133e-05, + "loss": 1.7791, + "step": 1434 + }, + { + "epoch": 0.14905993559779787, + "grad_norm": 0.3580288290977478, + "learning_rate": 9.461717032261282e-05, + "loss": 1.6404, + "step": 1435 + }, + { + "epoch": 0.1491638101173782, + "grad_norm": 0.38064590096473694, + "learning_rate": 9.460980334647367e-05, + "loss": 1.8078, + "step": 1436 + }, + { + "epoch": 0.14926768463695855, + "grad_norm": 0.3878273069858551, + "learning_rate": 9.460243161973846e-05, + "loss": 1.6382, + "step": 1437 + }, + { + "epoch": 0.1493715591565389, + "grad_norm": 0.3887486457824707, + "learning_rate": 9.459505514319216e-05, + "loss": 1.8884, + "step": 1438 + }, + { + "epoch": 0.14947543367611923, + "grad_norm": 0.42063435912132263, + "learning_rate": 9.458767391762038e-05, + "loss": 1.9866, + "step": 1439 + }, + { + "epoch": 0.1495793081956996, + "grad_norm": 0.39019012451171875, + "learning_rate": 9.45802879438091e-05, + "loss": 1.8562, + "step": 1440 + }, + { + "epoch": 0.14968318271527994, + "grad_norm": 0.36793825030326843, + "learning_rate": 9.457289722254489e-05, + "loss": 1.8974, + "step": 1441 + }, + { + "epoch": 0.1497870572348603, + "grad_norm": 0.37522581219673157, + "learning_rate": 9.456550175461483e-05, + "loss": 1.878, + "step": 1442 + }, + { + "epoch": 0.14989093175444063, + "grad_norm": 0.3458879590034485, + "learning_rate": 9.455810154080644e-05, + "loss": 1.7597, + "step": 1443 + }, + { + "epoch": 0.14999480627402098, + "grad_norm": 0.3603193759918213, + "learning_rate": 9.45506965819078e-05, + "loss": 1.7119, + "step": 1444 + }, + { + "epoch": 0.15009868079360134, + "grad_norm": 0.35782983899116516, + "learning_rate": 9.45432868787075e-05, + "loss": 1.7925, + "step": 1445 + }, + { + "epoch": 0.1502025553131817, + "grad_norm": 0.3702535331249237, + "learning_rate": 9.453587243199458e-05, + "loss": 1.8274, + "step": 1446 + }, + { + "epoch": 0.15030642983276202, + "grad_norm": 0.37376201152801514, + "learning_rate": 9.452845324255865e-05, + "loss": 1.6782, + "step": 1447 + }, + { + "epoch": 0.15041030435234237, + "grad_norm": 0.3699168264865875, + "learning_rate": 9.452102931118979e-05, + "loss": 1.8381, + "step": 1448 + }, + { + "epoch": 0.15051417887192273, + "grad_norm": 0.36657705903053284, + "learning_rate": 9.451360063867857e-05, + "loss": 1.8949, + "step": 1449 + }, + { + "epoch": 0.15061805339150305, + "grad_norm": 0.35056763887405396, + "learning_rate": 9.450616722581611e-05, + "loss": 1.7087, + "step": 1450 + }, + { + "epoch": 0.1507219279110834, + "grad_norm": 0.33786895871162415, + "learning_rate": 9.449872907339401e-05, + "loss": 1.7222, + "step": 1451 + }, + { + "epoch": 0.15082580243066376, + "grad_norm": 0.3312149941921234, + "learning_rate": 9.449128618220436e-05, + "loss": 1.7871, + "step": 1452 + }, + { + "epoch": 0.15092967695024412, + "grad_norm": 0.3575022518634796, + "learning_rate": 9.448383855303977e-05, + "loss": 1.7017, + "step": 1453 + }, + { + "epoch": 0.15103355146982445, + "grad_norm": 0.35011738538742065, + "learning_rate": 9.447638618669336e-05, + "loss": 1.9229, + "step": 1454 + }, + { + "epoch": 0.1511374259894048, + "grad_norm": 0.37051519751548767, + "learning_rate": 9.446892908395877e-05, + "loss": 1.8912, + "step": 1455 + }, + { + "epoch": 0.15124130050898515, + "grad_norm": 0.3656577169895172, + "learning_rate": 9.446146724563009e-05, + "loss": 1.8665, + "step": 1456 + }, + { + "epoch": 0.15134517502856548, + "grad_norm": 0.34047558903694153, + "learning_rate": 9.445400067250196e-05, + "loss": 1.6188, + "step": 1457 + }, + { + "epoch": 0.15144904954814584, + "grad_norm": 0.3526557385921478, + "learning_rate": 9.44465293653695e-05, + "loss": 1.7484, + "step": 1458 + }, + { + "epoch": 0.1515529240677262, + "grad_norm": 0.3618306815624237, + "learning_rate": 9.443905332502837e-05, + "loss": 1.6957, + "step": 1459 + }, + { + "epoch": 0.15165679858730655, + "grad_norm": 0.39453551173210144, + "learning_rate": 9.443157255227472e-05, + "loss": 1.866, + "step": 1460 + }, + { + "epoch": 0.15176067310688687, + "grad_norm": 0.3841235637664795, + "learning_rate": 9.442408704790516e-05, + "loss": 1.8885, + "step": 1461 + }, + { + "epoch": 0.15186454762646723, + "grad_norm": 0.3559393286705017, + "learning_rate": 9.441659681271684e-05, + "loss": 1.5994, + "step": 1462 + }, + { + "epoch": 0.15196842214604758, + "grad_norm": 0.3648277521133423, + "learning_rate": 9.440910184750743e-05, + "loss": 1.741, + "step": 1463 + }, + { + "epoch": 0.1520722966656279, + "grad_norm": 0.35485556721687317, + "learning_rate": 9.440160215307508e-05, + "loss": 1.7756, + "step": 1464 + }, + { + "epoch": 0.15217617118520826, + "grad_norm": 0.35386818647384644, + "learning_rate": 9.439409773021844e-05, + "loss": 1.877, + "step": 1465 + }, + { + "epoch": 0.15228004570478862, + "grad_norm": 0.3678518235683441, + "learning_rate": 9.438658857973668e-05, + "loss": 1.7585, + "step": 1466 + }, + { + "epoch": 0.15238392022436897, + "grad_norm": 0.36395224928855896, + "learning_rate": 9.437907470242946e-05, + "loss": 1.7805, + "step": 1467 + }, + { + "epoch": 0.1524877947439493, + "grad_norm": 0.3579014539718628, + "learning_rate": 9.437155609909696e-05, + "loss": 1.7271, + "step": 1468 + }, + { + "epoch": 0.15259166926352966, + "grad_norm": 0.3654552102088928, + "learning_rate": 9.436403277053984e-05, + "loss": 1.858, + "step": 1469 + }, + { + "epoch": 0.15269554378311, + "grad_norm": 0.3869788646697998, + "learning_rate": 9.435650471755931e-05, + "loss": 1.8507, + "step": 1470 + }, + { + "epoch": 0.15279941830269034, + "grad_norm": 0.3339064121246338, + "learning_rate": 9.4348971940957e-05, + "loss": 1.7086, + "step": 1471 + }, + { + "epoch": 0.1529032928222707, + "grad_norm": 0.3778363764286041, + "learning_rate": 9.434143444153511e-05, + "loss": 1.936, + "step": 1472 + }, + { + "epoch": 0.15300716734185105, + "grad_norm": 0.4051001965999603, + "learning_rate": 9.433389222009633e-05, + "loss": 1.7736, + "step": 1473 + }, + { + "epoch": 0.1531110418614314, + "grad_norm": 0.35207393765449524, + "learning_rate": 9.432634527744386e-05, + "loss": 1.7176, + "step": 1474 + }, + { + "epoch": 0.15321491638101173, + "grad_norm": 0.3851469159126282, + "learning_rate": 9.431879361438137e-05, + "loss": 1.9448, + "step": 1475 + }, + { + "epoch": 0.15331879090059208, + "grad_norm": 0.35818904638290405, + "learning_rate": 9.431123723171305e-05, + "loss": 1.7773, + "step": 1476 + }, + { + "epoch": 0.15342266542017244, + "grad_norm": 0.37601613998413086, + "learning_rate": 9.430367613024361e-05, + "loss": 1.9316, + "step": 1477 + }, + { + "epoch": 0.15352653993975277, + "grad_norm": 0.3351070284843445, + "learning_rate": 9.429611031077825e-05, + "loss": 1.5443, + "step": 1478 + }, + { + "epoch": 0.15363041445933312, + "grad_norm": 0.4057861566543579, + "learning_rate": 9.428853977412266e-05, + "loss": 1.8619, + "step": 1479 + }, + { + "epoch": 0.15373428897891347, + "grad_norm": 0.3662700057029724, + "learning_rate": 9.428096452108305e-05, + "loss": 1.7752, + "step": 1480 + }, + { + "epoch": 0.15383816349849383, + "grad_norm": 0.38205060362815857, + "learning_rate": 9.427338455246612e-05, + "loss": 1.7397, + "step": 1481 + }, + { + "epoch": 0.15394203801807416, + "grad_norm": 0.3742862641811371, + "learning_rate": 9.426579986907909e-05, + "loss": 1.7385, + "step": 1482 + }, + { + "epoch": 0.1540459125376545, + "grad_norm": 0.37825533747673035, + "learning_rate": 9.425821047172965e-05, + "loss": 1.8288, + "step": 1483 + }, + { + "epoch": 0.15414978705723487, + "grad_norm": 0.42222508788108826, + "learning_rate": 9.425061636122603e-05, + "loss": 1.9567, + "step": 1484 + }, + { + "epoch": 0.15425366157681522, + "grad_norm": 0.3459799885749817, + "learning_rate": 9.424301753837692e-05, + "loss": 1.7383, + "step": 1485 + }, + { + "epoch": 0.15435753609639555, + "grad_norm": 0.38764312863349915, + "learning_rate": 9.423541400399157e-05, + "loss": 1.8508, + "step": 1486 + }, + { + "epoch": 0.1544614106159759, + "grad_norm": 0.3998440206050873, + "learning_rate": 9.422780575887967e-05, + "loss": 1.9329, + "step": 1487 + }, + { + "epoch": 0.15456528513555626, + "grad_norm": 0.365713894367218, + "learning_rate": 9.422019280385145e-05, + "loss": 1.9418, + "step": 1488 + }, + { + "epoch": 0.15466915965513658, + "grad_norm": 0.3910145163536072, + "learning_rate": 9.421257513971762e-05, + "loss": 1.8998, + "step": 1489 + }, + { + "epoch": 0.15477303417471694, + "grad_norm": 0.3563111424446106, + "learning_rate": 9.420495276728943e-05, + "loss": 1.8042, + "step": 1490 + }, + { + "epoch": 0.1548769086942973, + "grad_norm": 0.3810329735279083, + "learning_rate": 9.419732568737856e-05, + "loss": 1.9044, + "step": 1491 + }, + { + "epoch": 0.15498078321387765, + "grad_norm": 0.4031761884689331, + "learning_rate": 9.418969390079726e-05, + "loss": 2.098, + "step": 1492 + }, + { + "epoch": 0.15508465773345798, + "grad_norm": 0.3679083287715912, + "learning_rate": 9.418205740835825e-05, + "loss": 1.7396, + "step": 1493 + }, + { + "epoch": 0.15518853225303833, + "grad_norm": 0.35544419288635254, + "learning_rate": 9.417441621087476e-05, + "loss": 1.7114, + "step": 1494 + }, + { + "epoch": 0.15529240677261869, + "grad_norm": 0.36036983132362366, + "learning_rate": 9.416677030916053e-05, + "loss": 1.828, + "step": 1495 + }, + { + "epoch": 0.155396281292199, + "grad_norm": 0.34864529967308044, + "learning_rate": 9.415911970402976e-05, + "loss": 1.7821, + "step": 1496 + }, + { + "epoch": 0.15550015581177937, + "grad_norm": 0.4014929533004761, + "learning_rate": 9.41514643962972e-05, + "loss": 1.8746, + "step": 1497 + }, + { + "epoch": 0.15560403033135972, + "grad_norm": 0.39249807596206665, + "learning_rate": 9.414380438677808e-05, + "loss": 1.9818, + "step": 1498 + }, + { + "epoch": 0.15570790485094008, + "grad_norm": 0.33457159996032715, + "learning_rate": 9.413613967628814e-05, + "loss": 1.7178, + "step": 1499 + }, + { + "epoch": 0.1558117793705204, + "grad_norm": 0.326725572347641, + "learning_rate": 9.412847026564357e-05, + "loss": 1.5971, + "step": 1500 + }, + { + "epoch": 0.15591565389010076, + "grad_norm": 0.36268436908721924, + "learning_rate": 9.412079615566116e-05, + "loss": 1.8022, + "step": 1501 + }, + { + "epoch": 0.1560195284096811, + "grad_norm": 0.3470126986503601, + "learning_rate": 9.411311734715811e-05, + "loss": 1.8473, + "step": 1502 + }, + { + "epoch": 0.15612340292926144, + "grad_norm": 0.3726639747619629, + "learning_rate": 9.410543384095214e-05, + "loss": 1.7718, + "step": 1503 + }, + { + "epoch": 0.1562272774488418, + "grad_norm": 0.37733760476112366, + "learning_rate": 9.409774563786152e-05, + "loss": 1.8614, + "step": 1504 + }, + { + "epoch": 0.15633115196842215, + "grad_norm": 0.4124131202697754, + "learning_rate": 9.409005273870495e-05, + "loss": 2.1187, + "step": 1505 + }, + { + "epoch": 0.1564350264880025, + "grad_norm": 0.3965660631656647, + "learning_rate": 9.40823551443017e-05, + "loss": 1.7244, + "step": 1506 + }, + { + "epoch": 0.15653890100758283, + "grad_norm": 0.390770822763443, + "learning_rate": 9.407465285547147e-05, + "loss": 1.8046, + "step": 1507 + }, + { + "epoch": 0.15664277552716319, + "grad_norm": 0.369103342294693, + "learning_rate": 9.406694587303451e-05, + "loss": 1.7574, + "step": 1508 + }, + { + "epoch": 0.15674665004674354, + "grad_norm": 0.38057541847229004, + "learning_rate": 9.405923419781154e-05, + "loss": 1.8895, + "step": 1509 + }, + { + "epoch": 0.15685052456632387, + "grad_norm": 0.3767298460006714, + "learning_rate": 9.405151783062381e-05, + "loss": 1.6415, + "step": 1510 + }, + { + "epoch": 0.15695439908590422, + "grad_norm": 0.35092034935951233, + "learning_rate": 9.404379677229305e-05, + "loss": 1.7707, + "step": 1511 + }, + { + "epoch": 0.15705827360548458, + "grad_norm": 0.3567756712436676, + "learning_rate": 9.403607102364148e-05, + "loss": 1.7814, + "step": 1512 + }, + { + "epoch": 0.15716214812506493, + "grad_norm": 0.38232964277267456, + "learning_rate": 9.402834058549184e-05, + "loss": 2.0017, + "step": 1513 + }, + { + "epoch": 0.15726602264464526, + "grad_norm": 0.35852310061454773, + "learning_rate": 9.402060545866737e-05, + "loss": 1.8552, + "step": 1514 + }, + { + "epoch": 0.15736989716422561, + "grad_norm": 0.3555811047554016, + "learning_rate": 9.401286564399178e-05, + "loss": 1.8447, + "step": 1515 + }, + { + "epoch": 0.15747377168380597, + "grad_norm": 0.33671462535858154, + "learning_rate": 9.400512114228935e-05, + "loss": 1.6783, + "step": 1516 + }, + { + "epoch": 0.1575776462033863, + "grad_norm": 0.3423287570476532, + "learning_rate": 9.399737195438473e-05, + "loss": 1.7901, + "step": 1517 + }, + { + "epoch": 0.15768152072296665, + "grad_norm": 0.3938925564289093, + "learning_rate": 9.398961808110321e-05, + "loss": 1.6647, + "step": 1518 + }, + { + "epoch": 0.157785395242547, + "grad_norm": 0.366240918636322, + "learning_rate": 9.398185952327048e-05, + "loss": 1.7797, + "step": 1519 + }, + { + "epoch": 0.15788926976212736, + "grad_norm": 0.3558516204357147, + "learning_rate": 9.397409628171279e-05, + "loss": 1.6302, + "step": 1520 + }, + { + "epoch": 0.1579931442817077, + "grad_norm": 0.3891289234161377, + "learning_rate": 9.396632835725687e-05, + "loss": 1.9296, + "step": 1521 + }, + { + "epoch": 0.15809701880128804, + "grad_norm": 0.4402526319026947, + "learning_rate": 9.395855575072991e-05, + "loss": 1.9748, + "step": 1522 + }, + { + "epoch": 0.1582008933208684, + "grad_norm": 0.3928702473640442, + "learning_rate": 9.395077846295967e-05, + "loss": 1.9662, + "step": 1523 + }, + { + "epoch": 0.15830476784044875, + "grad_norm": 0.36685439944267273, + "learning_rate": 9.394299649477434e-05, + "loss": 1.7051, + "step": 1524 + }, + { + "epoch": 0.15840864236002908, + "grad_norm": 0.36427903175354004, + "learning_rate": 9.393520984700266e-05, + "loss": 1.8365, + "step": 1525 + }, + { + "epoch": 0.15851251687960943, + "grad_norm": 0.3728634715080261, + "learning_rate": 9.392741852047385e-05, + "loss": 1.8812, + "step": 1526 + }, + { + "epoch": 0.1586163913991898, + "grad_norm": 0.3916126489639282, + "learning_rate": 9.391962251601758e-05, + "loss": 1.8, + "step": 1527 + }, + { + "epoch": 0.15872026591877011, + "grad_norm": 0.3563532531261444, + "learning_rate": 9.391182183446413e-05, + "loss": 1.5527, + "step": 1528 + }, + { + "epoch": 0.15882414043835047, + "grad_norm": 0.3756893277168274, + "learning_rate": 9.390401647664417e-05, + "loss": 1.9147, + "step": 1529 + }, + { + "epoch": 0.15892801495793082, + "grad_norm": 0.35163572430610657, + "learning_rate": 9.389620644338892e-05, + "loss": 1.7478, + "step": 1530 + }, + { + "epoch": 0.15903188947751118, + "grad_norm": 0.3400569260120392, + "learning_rate": 9.388839173553007e-05, + "loss": 1.5696, + "step": 1531 + }, + { + "epoch": 0.1591357639970915, + "grad_norm": 0.3381504416465759, + "learning_rate": 9.388057235389986e-05, + "loss": 1.6998, + "step": 1532 + }, + { + "epoch": 0.15923963851667186, + "grad_norm": 0.35516080260276794, + "learning_rate": 9.3872748299331e-05, + "loss": 1.8103, + "step": 1533 + }, + { + "epoch": 0.15934351303625222, + "grad_norm": 0.3808714747428894, + "learning_rate": 9.386491957265662e-05, + "loss": 1.7707, + "step": 1534 + }, + { + "epoch": 0.15944738755583254, + "grad_norm": 0.3629775941371918, + "learning_rate": 9.38570861747105e-05, + "loss": 1.8373, + "step": 1535 + }, + { + "epoch": 0.1595512620754129, + "grad_norm": 0.340930700302124, + "learning_rate": 9.384924810632679e-05, + "loss": 1.7374, + "step": 1536 + }, + { + "epoch": 0.15965513659499325, + "grad_norm": 0.40548157691955566, + "learning_rate": 9.384140536834018e-05, + "loss": 2.0285, + "step": 1537 + }, + { + "epoch": 0.1597590111145736, + "grad_norm": 0.39659062027931213, + "learning_rate": 9.383355796158588e-05, + "loss": 1.8455, + "step": 1538 + }, + { + "epoch": 0.15986288563415393, + "grad_norm": 0.35021933913230896, + "learning_rate": 9.382570588689958e-05, + "loss": 1.8248, + "step": 1539 + }, + { + "epoch": 0.1599667601537343, + "grad_norm": 0.39401671290397644, + "learning_rate": 9.381784914511746e-05, + "loss": 2.1518, + "step": 1540 + }, + { + "epoch": 0.16007063467331464, + "grad_norm": 0.3451925814151764, + "learning_rate": 9.38099877370762e-05, + "loss": 1.7234, + "step": 1541 + }, + { + "epoch": 0.16017450919289497, + "grad_norm": 0.3629453778266907, + "learning_rate": 9.380212166361298e-05, + "loss": 1.7744, + "step": 1542 + }, + { + "epoch": 0.16027838371247533, + "grad_norm": 0.4119042456150055, + "learning_rate": 9.379425092556545e-05, + "loss": 2.0752, + "step": 1543 + }, + { + "epoch": 0.16038225823205568, + "grad_norm": 0.347231388092041, + "learning_rate": 9.378637552377181e-05, + "loss": 1.7362, + "step": 1544 + }, + { + "epoch": 0.16048613275163603, + "grad_norm": 0.4104901850223541, + "learning_rate": 9.377849545907074e-05, + "loss": 1.9631, + "step": 1545 + }, + { + "epoch": 0.16059000727121636, + "grad_norm": 0.396040141582489, + "learning_rate": 9.377061073230138e-05, + "loss": 1.955, + "step": 1546 + }, + { + "epoch": 0.16069388179079672, + "grad_norm": 0.3335644602775574, + "learning_rate": 9.37627213443034e-05, + "loss": 1.5817, + "step": 1547 + }, + { + "epoch": 0.16079775631037707, + "grad_norm": 0.3619517385959625, + "learning_rate": 9.375482729591696e-05, + "loss": 1.7563, + "step": 1548 + }, + { + "epoch": 0.1609016308299574, + "grad_norm": 0.35229092836380005, + "learning_rate": 9.37469285879827e-05, + "loss": 1.7638, + "step": 1549 + }, + { + "epoch": 0.16100550534953775, + "grad_norm": 0.36668309569358826, + "learning_rate": 9.373902522134181e-05, + "loss": 1.6821, + "step": 1550 + }, + { + "epoch": 0.1611093798691181, + "grad_norm": 0.38943883776664734, + "learning_rate": 9.37311171968359e-05, + "loss": 1.6423, + "step": 1551 + }, + { + "epoch": 0.16121325438869846, + "grad_norm": 0.35452452301979065, + "learning_rate": 9.372320451530711e-05, + "loss": 1.7089, + "step": 1552 + }, + { + "epoch": 0.1613171289082788, + "grad_norm": 0.39926689863204956, + "learning_rate": 9.371528717759811e-05, + "loss": 1.801, + "step": 1553 + }, + { + "epoch": 0.16142100342785914, + "grad_norm": 0.37590524554252625, + "learning_rate": 9.370736518455203e-05, + "loss": 1.7258, + "step": 1554 + }, + { + "epoch": 0.1615248779474395, + "grad_norm": 0.35222113132476807, + "learning_rate": 9.369943853701246e-05, + "loss": 1.8522, + "step": 1555 + }, + { + "epoch": 0.16162875246701985, + "grad_norm": 0.35563012957572937, + "learning_rate": 9.369150723582358e-05, + "loss": 1.6075, + "step": 1556 + }, + { + "epoch": 0.16173262698660018, + "grad_norm": 0.39180493354797363, + "learning_rate": 9.368357128182998e-05, + "loss": 1.7797, + "step": 1557 + }, + { + "epoch": 0.16183650150618054, + "grad_norm": 0.3635503053665161, + "learning_rate": 9.367563067587677e-05, + "loss": 1.8579, + "step": 1558 + }, + { + "epoch": 0.1619403760257609, + "grad_norm": 0.3636340796947479, + "learning_rate": 9.366768541880958e-05, + "loss": 1.708, + "step": 1559 + }, + { + "epoch": 0.16204425054534122, + "grad_norm": 0.39017453789711, + "learning_rate": 9.365973551147453e-05, + "loss": 1.8155, + "step": 1560 + }, + { + "epoch": 0.16214812506492157, + "grad_norm": 0.3406469225883484, + "learning_rate": 9.365178095471819e-05, + "loss": 1.7573, + "step": 1561 + }, + { + "epoch": 0.16225199958450193, + "grad_norm": 0.36579304933547974, + "learning_rate": 9.364382174938768e-05, + "loss": 1.898, + "step": 1562 + }, + { + "epoch": 0.16235587410408228, + "grad_norm": 0.3369811773300171, + "learning_rate": 9.363585789633057e-05, + "loss": 1.6292, + "step": 1563 + }, + { + "epoch": 0.1624597486236626, + "grad_norm": 0.3355672061443329, + "learning_rate": 9.3627889396395e-05, + "loss": 1.6484, + "step": 1564 + }, + { + "epoch": 0.16256362314324296, + "grad_norm": 0.37393924593925476, + "learning_rate": 9.36199162504295e-05, + "loss": 1.7915, + "step": 1565 + }, + { + "epoch": 0.16266749766282332, + "grad_norm": 0.37588125467300415, + "learning_rate": 9.361193845928318e-05, + "loss": 1.6923, + "step": 1566 + }, + { + "epoch": 0.16277137218240365, + "grad_norm": 0.4559624493122101, + "learning_rate": 9.360395602380559e-05, + "loss": 1.8842, + "step": 1567 + }, + { + "epoch": 0.162875246701984, + "grad_norm": 0.39234891533851624, + "learning_rate": 9.359596894484682e-05, + "loss": 1.9783, + "step": 1568 + }, + { + "epoch": 0.16297912122156435, + "grad_norm": 0.36912286281585693, + "learning_rate": 9.35879772232574e-05, + "loss": 1.8153, + "step": 1569 + }, + { + "epoch": 0.1630829957411447, + "grad_norm": 0.3693285584449768, + "learning_rate": 9.357998085988842e-05, + "loss": 1.5263, + "step": 1570 + }, + { + "epoch": 0.16318687026072504, + "grad_norm": 0.38860028982162476, + "learning_rate": 9.35719798555914e-05, + "loss": 1.6806, + "step": 1571 + }, + { + "epoch": 0.1632907447803054, + "grad_norm": 0.401967853307724, + "learning_rate": 9.356397421121842e-05, + "loss": 1.855, + "step": 1572 + }, + { + "epoch": 0.16339461929988575, + "grad_norm": 0.35193613171577454, + "learning_rate": 9.355596392762197e-05, + "loss": 1.8909, + "step": 1573 + }, + { + "epoch": 0.16349849381946607, + "grad_norm": 0.38218873739242554, + "learning_rate": 9.354794900565515e-05, + "loss": 1.874, + "step": 1574 + }, + { + "epoch": 0.16360236833904643, + "grad_norm": 0.3593958020210266, + "learning_rate": 9.353992944617142e-05, + "loss": 1.6768, + "step": 1575 + }, + { + "epoch": 0.16370624285862678, + "grad_norm": 0.3737293481826782, + "learning_rate": 9.353190525002484e-05, + "loss": 1.6279, + "step": 1576 + }, + { + "epoch": 0.16381011737820714, + "grad_norm": 0.38832589983940125, + "learning_rate": 9.35238764180699e-05, + "loss": 1.8541, + "step": 1577 + }, + { + "epoch": 0.16391399189778746, + "grad_norm": 0.37959024310112, + "learning_rate": 9.351584295116165e-05, + "loss": 1.5282, + "step": 1578 + }, + { + "epoch": 0.16401786641736782, + "grad_norm": 0.37193652987480164, + "learning_rate": 9.350780485015553e-05, + "loss": 1.6896, + "step": 1579 + }, + { + "epoch": 0.16412174093694817, + "grad_norm": 0.35193783044815063, + "learning_rate": 9.349976211590758e-05, + "loss": 1.57, + "step": 1580 + }, + { + "epoch": 0.1642256154565285, + "grad_norm": 0.3922156095504761, + "learning_rate": 9.349171474927427e-05, + "loss": 1.5819, + "step": 1581 + }, + { + "epoch": 0.16432948997610886, + "grad_norm": 0.36705270409584045, + "learning_rate": 9.34836627511126e-05, + "loss": 1.6357, + "step": 1582 + }, + { + "epoch": 0.1644333644956892, + "grad_norm": 0.36877331137657166, + "learning_rate": 9.347560612228003e-05, + "loss": 1.7312, + "step": 1583 + }, + { + "epoch": 0.16453723901526957, + "grad_norm": 0.3872382342815399, + "learning_rate": 9.346754486363452e-05, + "loss": 1.8537, + "step": 1584 + }, + { + "epoch": 0.1646411135348499, + "grad_norm": 0.3447991609573364, + "learning_rate": 9.345947897603455e-05, + "loss": 1.6428, + "step": 1585 + }, + { + "epoch": 0.16474498805443025, + "grad_norm": 0.3650977909564972, + "learning_rate": 9.345140846033906e-05, + "loss": 1.8029, + "step": 1586 + }, + { + "epoch": 0.1648488625740106, + "grad_norm": 0.35949772596359253, + "learning_rate": 9.344333331740751e-05, + "loss": 1.6589, + "step": 1587 + }, + { + "epoch": 0.16495273709359093, + "grad_norm": 0.38417744636535645, + "learning_rate": 9.343525354809984e-05, + "loss": 1.7891, + "step": 1588 + }, + { + "epoch": 0.16505661161317128, + "grad_norm": 0.36684268712997437, + "learning_rate": 9.342716915327647e-05, + "loss": 1.7708, + "step": 1589 + }, + { + "epoch": 0.16516048613275164, + "grad_norm": 0.3678865134716034, + "learning_rate": 9.341908013379831e-05, + "loss": 1.7155, + "step": 1590 + }, + { + "epoch": 0.165264360652332, + "grad_norm": 0.38696667551994324, + "learning_rate": 9.341098649052681e-05, + "loss": 1.8784, + "step": 1591 + }, + { + "epoch": 0.16536823517191232, + "grad_norm": 0.3689199388027191, + "learning_rate": 9.340288822432388e-05, + "loss": 1.874, + "step": 1592 + }, + { + "epoch": 0.16547210969149267, + "grad_norm": 0.34134095907211304, + "learning_rate": 9.339478533605191e-05, + "loss": 1.5365, + "step": 1593 + }, + { + "epoch": 0.16557598421107303, + "grad_norm": 0.3707195818424225, + "learning_rate": 9.338667782657378e-05, + "loss": 1.7984, + "step": 1594 + }, + { + "epoch": 0.16567985873065338, + "grad_norm": 0.3389596939086914, + "learning_rate": 9.337856569675288e-05, + "loss": 1.6244, + "step": 1595 + }, + { + "epoch": 0.1657837332502337, + "grad_norm": 0.3761754035949707, + "learning_rate": 9.337044894745313e-05, + "loss": 1.8774, + "step": 1596 + }, + { + "epoch": 0.16588760776981407, + "grad_norm": 0.37581124901771545, + "learning_rate": 9.336232757953884e-05, + "loss": 1.7503, + "step": 1597 + }, + { + "epoch": 0.16599148228939442, + "grad_norm": 0.37551531195640564, + "learning_rate": 9.335420159387491e-05, + "loss": 1.8433, + "step": 1598 + }, + { + "epoch": 0.16609535680897475, + "grad_norm": 0.3547391891479492, + "learning_rate": 9.33460709913267e-05, + "loss": 1.8019, + "step": 1599 + }, + { + "epoch": 0.1661992313285551, + "grad_norm": 0.40177521109580994, + "learning_rate": 9.333793577276004e-05, + "loss": 1.9248, + "step": 1600 + }, + { + "epoch": 0.16630310584813546, + "grad_norm": 0.345114141702652, + "learning_rate": 9.332979593904125e-05, + "loss": 1.674, + "step": 1601 + }, + { + "epoch": 0.1664069803677158, + "grad_norm": 0.36117351055145264, + "learning_rate": 9.33216514910372e-05, + "loss": 1.9506, + "step": 1602 + }, + { + "epoch": 0.16651085488729614, + "grad_norm": 0.3475273847579956, + "learning_rate": 9.331350242961518e-05, + "loss": 1.716, + "step": 1603 + }, + { + "epoch": 0.1666147294068765, + "grad_norm": 0.3611377477645874, + "learning_rate": 9.3305348755643e-05, + "loss": 1.7274, + "step": 1604 + }, + { + "epoch": 0.16671860392645685, + "grad_norm": 0.40187951922416687, + "learning_rate": 9.329719046998898e-05, + "loss": 1.7231, + "step": 1605 + }, + { + "epoch": 0.16682247844603718, + "grad_norm": 0.3646019399166107, + "learning_rate": 9.328902757352191e-05, + "loss": 1.6772, + "step": 1606 + }, + { + "epoch": 0.16692635296561753, + "grad_norm": 0.34303680062294006, + "learning_rate": 9.328086006711108e-05, + "loss": 1.6853, + "step": 1607 + }, + { + "epoch": 0.16703022748519789, + "grad_norm": 0.36761873960494995, + "learning_rate": 9.327268795162622e-05, + "loss": 1.7584, + "step": 1608 + }, + { + "epoch": 0.16713410200477824, + "grad_norm": 0.36471322178840637, + "learning_rate": 9.326451122793766e-05, + "loss": 1.7157, + "step": 1609 + }, + { + "epoch": 0.16723797652435857, + "grad_norm": 0.385139524936676, + "learning_rate": 9.325632989691613e-05, + "loss": 1.7911, + "step": 1610 + }, + { + "epoch": 0.16734185104393892, + "grad_norm": 0.3613028824329376, + "learning_rate": 9.324814395943289e-05, + "loss": 1.7849, + "step": 1611 + }, + { + "epoch": 0.16744572556351928, + "grad_norm": 0.3828355669975281, + "learning_rate": 9.323995341635963e-05, + "loss": 1.9662, + "step": 1612 + }, + { + "epoch": 0.1675496000830996, + "grad_norm": 0.3685465157032013, + "learning_rate": 9.323175826856865e-05, + "loss": 1.8091, + "step": 1613 + }, + { + "epoch": 0.16765347460267996, + "grad_norm": 0.36837613582611084, + "learning_rate": 9.322355851693263e-05, + "loss": 1.716, + "step": 1614 + }, + { + "epoch": 0.1677573491222603, + "grad_norm": 0.4158804416656494, + "learning_rate": 9.321535416232476e-05, + "loss": 1.8553, + "step": 1615 + }, + { + "epoch": 0.16786122364184067, + "grad_norm": 0.35498541593551636, + "learning_rate": 9.32071452056188e-05, + "loss": 1.6873, + "step": 1616 + }, + { + "epoch": 0.167965098161421, + "grad_norm": 0.37819525599479675, + "learning_rate": 9.319893164768888e-05, + "loss": 1.7064, + "step": 1617 + }, + { + "epoch": 0.16806897268100135, + "grad_norm": 0.3379732668399811, + "learning_rate": 9.31907134894097e-05, + "loss": 1.6289, + "step": 1618 + }, + { + "epoch": 0.1681728472005817, + "grad_norm": 0.38348469138145447, + "learning_rate": 9.318249073165646e-05, + "loss": 1.7028, + "step": 1619 + }, + { + "epoch": 0.16827672172016203, + "grad_norm": 0.3996196985244751, + "learning_rate": 9.317426337530477e-05, + "loss": 1.7427, + "step": 1620 + }, + { + "epoch": 0.16838059623974239, + "grad_norm": 0.3799591064453125, + "learning_rate": 9.316603142123082e-05, + "loss": 1.6961, + "step": 1621 + }, + { + "epoch": 0.16848447075932274, + "grad_norm": 0.36466121673583984, + "learning_rate": 9.315779487031122e-05, + "loss": 1.8285, + "step": 1622 + }, + { + "epoch": 0.1685883452789031, + "grad_norm": 0.35838162899017334, + "learning_rate": 9.314955372342311e-05, + "loss": 1.643, + "step": 1623 + }, + { + "epoch": 0.16869221979848342, + "grad_norm": 0.37461066246032715, + "learning_rate": 9.314130798144412e-05, + "loss": 1.8822, + "step": 1624 + }, + { + "epoch": 0.16879609431806378, + "grad_norm": 0.4094388484954834, + "learning_rate": 9.313305764525232e-05, + "loss": 1.9001, + "step": 1625 + }, + { + "epoch": 0.16889996883764413, + "grad_norm": 0.382744163274765, + "learning_rate": 9.312480271572634e-05, + "loss": 1.8858, + "step": 1626 + }, + { + "epoch": 0.16900384335722446, + "grad_norm": 0.3661576807498932, + "learning_rate": 9.311654319374528e-05, + "loss": 1.8909, + "step": 1627 + }, + { + "epoch": 0.16910771787680481, + "grad_norm": 0.38933804631233215, + "learning_rate": 9.310827908018866e-05, + "loss": 1.8039, + "step": 1628 + }, + { + "epoch": 0.16921159239638517, + "grad_norm": 0.3421494662761688, + "learning_rate": 9.31000103759366e-05, + "loss": 1.6478, + "step": 1629 + }, + { + "epoch": 0.16931546691596552, + "grad_norm": 0.3714037239551544, + "learning_rate": 9.309173708186963e-05, + "loss": 1.7259, + "step": 1630 + }, + { + "epoch": 0.16941934143554585, + "grad_norm": 0.4035167694091797, + "learning_rate": 9.308345919886877e-05, + "loss": 1.8386, + "step": 1631 + }, + { + "epoch": 0.1695232159551262, + "grad_norm": 0.37035608291625977, + "learning_rate": 9.307517672781557e-05, + "loss": 1.8391, + "step": 1632 + }, + { + "epoch": 0.16962709047470656, + "grad_norm": 0.37822678685188293, + "learning_rate": 9.306688966959205e-05, + "loss": 1.8277, + "step": 1633 + }, + { + "epoch": 0.16973096499428691, + "grad_norm": 0.3702380955219269, + "learning_rate": 9.305859802508073e-05, + "loss": 1.7996, + "step": 1634 + }, + { + "epoch": 0.16983483951386724, + "grad_norm": 0.37005430459976196, + "learning_rate": 9.305030179516458e-05, + "loss": 1.7441, + "step": 1635 + }, + { + "epoch": 0.1699387140334476, + "grad_norm": 0.38137298822402954, + "learning_rate": 9.304200098072709e-05, + "loss": 1.9145, + "step": 1636 + }, + { + "epoch": 0.17004258855302795, + "grad_norm": 0.39384132623672485, + "learning_rate": 9.303369558265224e-05, + "loss": 1.7381, + "step": 1637 + }, + { + "epoch": 0.17014646307260828, + "grad_norm": 0.34569051861763, + "learning_rate": 9.30253856018245e-05, + "loss": 1.6391, + "step": 1638 + }, + { + "epoch": 0.17025033759218863, + "grad_norm": 0.38778260350227356, + "learning_rate": 9.301707103912878e-05, + "loss": 1.8337, + "step": 1639 + }, + { + "epoch": 0.170354212111769, + "grad_norm": 0.390543133020401, + "learning_rate": 9.300875189545055e-05, + "loss": 1.9798, + "step": 1640 + }, + { + "epoch": 0.17045808663134934, + "grad_norm": 0.3621176779270172, + "learning_rate": 9.300042817167574e-05, + "loss": 1.6744, + "step": 1641 + }, + { + "epoch": 0.17056196115092967, + "grad_norm": 0.43153461813926697, + "learning_rate": 9.299209986869073e-05, + "loss": 2.0351, + "step": 1642 + }, + { + "epoch": 0.17066583567051002, + "grad_norm": 0.39063888788223267, + "learning_rate": 9.298376698738242e-05, + "loss": 1.8298, + "step": 1643 + }, + { + "epoch": 0.17076971019009038, + "grad_norm": 0.36155179142951965, + "learning_rate": 9.297542952863824e-05, + "loss": 1.7124, + "step": 1644 + }, + { + "epoch": 0.1708735847096707, + "grad_norm": 0.4032329022884369, + "learning_rate": 9.296708749334603e-05, + "loss": 2.0271, + "step": 1645 + }, + { + "epoch": 0.17097745922925106, + "grad_norm": 0.39317673444747925, + "learning_rate": 9.295874088239416e-05, + "loss": 1.8808, + "step": 1646 + }, + { + "epoch": 0.17108133374883142, + "grad_norm": 0.36430609226226807, + "learning_rate": 9.295038969667145e-05, + "loss": 1.8879, + "step": 1647 + }, + { + "epoch": 0.17118520826841177, + "grad_norm": 0.3385809361934662, + "learning_rate": 9.29420339370673e-05, + "loss": 1.644, + "step": 1648 + }, + { + "epoch": 0.1712890827879921, + "grad_norm": 0.3685392737388611, + "learning_rate": 9.293367360447148e-05, + "loss": 1.907, + "step": 1649 + }, + { + "epoch": 0.17139295730757245, + "grad_norm": 0.36233484745025635, + "learning_rate": 9.292530869977432e-05, + "loss": 1.6838, + "step": 1650 + }, + { + "epoch": 0.1714968318271528, + "grad_norm": 0.4106137752532959, + "learning_rate": 9.29169392238666e-05, + "loss": 1.8516, + "step": 1651 + }, + { + "epoch": 0.17160070634673313, + "grad_norm": 0.4084801971912384, + "learning_rate": 9.290856517763965e-05, + "loss": 1.8696, + "step": 1652 + }, + { + "epoch": 0.1717045808663135, + "grad_norm": 0.3670850694179535, + "learning_rate": 9.290018656198517e-05, + "loss": 1.7186, + "step": 1653 + }, + { + "epoch": 0.17180845538589384, + "grad_norm": 0.362164705991745, + "learning_rate": 9.289180337779547e-05, + "loss": 1.6732, + "step": 1654 + }, + { + "epoch": 0.1719123299054742, + "grad_norm": 0.3619086742401123, + "learning_rate": 9.288341562596328e-05, + "loss": 1.6082, + "step": 1655 + }, + { + "epoch": 0.17201620442505453, + "grad_norm": 0.3776550889015198, + "learning_rate": 9.287502330738183e-05, + "loss": 1.7092, + "step": 1656 + }, + { + "epoch": 0.17212007894463488, + "grad_norm": 0.3793870210647583, + "learning_rate": 9.286662642294484e-05, + "loss": 1.6828, + "step": 1657 + }, + { + "epoch": 0.17222395346421523, + "grad_norm": 0.3709464967250824, + "learning_rate": 9.28582249735465e-05, + "loss": 1.7037, + "step": 1658 + }, + { + "epoch": 0.17232782798379556, + "grad_norm": 0.35695111751556396, + "learning_rate": 9.284981896008152e-05, + "loss": 1.7627, + "step": 1659 + }, + { + "epoch": 0.17243170250337592, + "grad_norm": 0.3538592457771301, + "learning_rate": 9.284140838344505e-05, + "loss": 1.8424, + "step": 1660 + }, + { + "epoch": 0.17253557702295627, + "grad_norm": 0.3740057647228241, + "learning_rate": 9.283299324453276e-05, + "loss": 1.9412, + "step": 1661 + }, + { + "epoch": 0.17263945154253663, + "grad_norm": 0.3617721199989319, + "learning_rate": 9.28245735442408e-05, + "loss": 1.6443, + "step": 1662 + }, + { + "epoch": 0.17274332606211695, + "grad_norm": 0.40431293845176697, + "learning_rate": 9.281614928346582e-05, + "loss": 1.9303, + "step": 1663 + }, + { + "epoch": 0.1728472005816973, + "grad_norm": 0.3580422103404999, + "learning_rate": 9.280772046310491e-05, + "loss": 1.5823, + "step": 1664 + }, + { + "epoch": 0.17295107510127766, + "grad_norm": 0.38240423798561096, + "learning_rate": 9.279928708405568e-05, + "loss": 1.6381, + "step": 1665 + }, + { + "epoch": 0.17305494962085802, + "grad_norm": 0.3335290253162384, + "learning_rate": 9.279084914721623e-05, + "loss": 1.686, + "step": 1666 + }, + { + "epoch": 0.17315882414043834, + "grad_norm": 0.3627927899360657, + "learning_rate": 9.278240665348512e-05, + "loss": 1.8426, + "step": 1667 + }, + { + "epoch": 0.1732626986600187, + "grad_norm": 0.4823165833950043, + "learning_rate": 9.277395960376143e-05, + "loss": 2.0619, + "step": 1668 + }, + { + "epoch": 0.17336657317959905, + "grad_norm": 0.36185434460639954, + "learning_rate": 9.276550799894468e-05, + "loss": 1.7277, + "step": 1669 + }, + { + "epoch": 0.17347044769917938, + "grad_norm": 0.36980822682380676, + "learning_rate": 9.275705183993491e-05, + "loss": 1.7649, + "step": 1670 + }, + { + "epoch": 0.17357432221875974, + "grad_norm": 0.3434619605541229, + "learning_rate": 9.274859112763264e-05, + "loss": 1.702, + "step": 1671 + }, + { + "epoch": 0.1736781967383401, + "grad_norm": 0.3595278859138489, + "learning_rate": 9.274012586293887e-05, + "loss": 1.7485, + "step": 1672 + }, + { + "epoch": 0.17378207125792045, + "grad_norm": 0.4083694815635681, + "learning_rate": 9.273165604675509e-05, + "loss": 1.7587, + "step": 1673 + }, + { + "epoch": 0.17388594577750077, + "grad_norm": 0.3653002679347992, + "learning_rate": 9.272318167998325e-05, + "loss": 1.7234, + "step": 1674 + }, + { + "epoch": 0.17398982029708113, + "grad_norm": 0.40430814027786255, + "learning_rate": 9.27147027635258e-05, + "loss": 1.8682, + "step": 1675 + }, + { + "epoch": 0.17409369481666148, + "grad_norm": 0.4132486879825592, + "learning_rate": 9.270621929828571e-05, + "loss": 1.9371, + "step": 1676 + }, + { + "epoch": 0.1741975693362418, + "grad_norm": 0.3685455322265625, + "learning_rate": 9.269773128516638e-05, + "loss": 1.8035, + "step": 1677 + }, + { + "epoch": 0.17430144385582216, + "grad_norm": 0.35275524854660034, + "learning_rate": 9.268923872507174e-05, + "loss": 1.7536, + "step": 1678 + }, + { + "epoch": 0.17440531837540252, + "grad_norm": 0.364773154258728, + "learning_rate": 9.268074161890614e-05, + "loss": 1.8075, + "step": 1679 + }, + { + "epoch": 0.17450919289498287, + "grad_norm": 0.40323731303215027, + "learning_rate": 9.267223996757451e-05, + "loss": 1.8028, + "step": 1680 + }, + { + "epoch": 0.1746130674145632, + "grad_norm": 0.34777867794036865, + "learning_rate": 9.266373377198216e-05, + "loss": 1.7143, + "step": 1681 + }, + { + "epoch": 0.17471694193414355, + "grad_norm": 0.39449429512023926, + "learning_rate": 9.265522303303494e-05, + "loss": 1.8556, + "step": 1682 + }, + { + "epoch": 0.1748208164537239, + "grad_norm": 0.3480819761753082, + "learning_rate": 9.264670775163922e-05, + "loss": 1.6931, + "step": 1683 + }, + { + "epoch": 0.17492469097330424, + "grad_norm": 0.3808116316795349, + "learning_rate": 9.263818792870176e-05, + "loss": 1.6403, + "step": 1684 + }, + { + "epoch": 0.1750285654928846, + "grad_norm": 0.3749628961086273, + "learning_rate": 9.262966356512987e-05, + "loss": 1.6206, + "step": 1685 + }, + { + "epoch": 0.17513244001246495, + "grad_norm": 0.3475877642631531, + "learning_rate": 9.262113466183136e-05, + "loss": 1.6314, + "step": 1686 + }, + { + "epoch": 0.1752363145320453, + "grad_norm": 0.3954698443412781, + "learning_rate": 9.261260121971445e-05, + "loss": 1.7696, + "step": 1687 + }, + { + "epoch": 0.17534018905162563, + "grad_norm": 0.36054256558418274, + "learning_rate": 9.260406323968791e-05, + "loss": 1.734, + "step": 1688 + }, + { + "epoch": 0.17544406357120598, + "grad_norm": 0.3659268617630005, + "learning_rate": 9.259552072266096e-05, + "loss": 1.6301, + "step": 1689 + }, + { + "epoch": 0.17554793809078634, + "grad_norm": 0.3359396457672119, + "learning_rate": 9.258697366954329e-05, + "loss": 1.6708, + "step": 1690 + }, + { + "epoch": 0.17565181261036666, + "grad_norm": 0.3496209979057312, + "learning_rate": 9.257842208124514e-05, + "loss": 1.7327, + "step": 1691 + }, + { + "epoch": 0.17575568712994702, + "grad_norm": 0.4164491295814514, + "learning_rate": 9.256986595867715e-05, + "loss": 1.7489, + "step": 1692 + }, + { + "epoch": 0.17585956164952737, + "grad_norm": 0.4587213695049286, + "learning_rate": 9.256130530275048e-05, + "loss": 2.1299, + "step": 1693 + }, + { + "epoch": 0.17596343616910773, + "grad_norm": 0.40228530764579773, + "learning_rate": 9.255274011437683e-05, + "loss": 1.8252, + "step": 1694 + }, + { + "epoch": 0.17606731068868806, + "grad_norm": 0.3676128685474396, + "learning_rate": 9.254417039446824e-05, + "loss": 1.6666, + "step": 1695 + }, + { + "epoch": 0.1761711852082684, + "grad_norm": 0.3423883318901062, + "learning_rate": 9.253559614393739e-05, + "loss": 1.7213, + "step": 1696 + }, + { + "epoch": 0.17627505972784877, + "grad_norm": 0.3809538781642914, + "learning_rate": 9.252701736369732e-05, + "loss": 1.8218, + "step": 1697 + }, + { + "epoch": 0.1763789342474291, + "grad_norm": 0.351162314414978, + "learning_rate": 9.251843405466162e-05, + "loss": 1.7478, + "step": 1698 + }, + { + "epoch": 0.17648280876700945, + "grad_norm": 0.3520200550556183, + "learning_rate": 9.250984621774437e-05, + "loss": 1.7541, + "step": 1699 + }, + { + "epoch": 0.1765866832865898, + "grad_norm": 0.3424931764602661, + "learning_rate": 9.250125385386007e-05, + "loss": 1.6592, + "step": 1700 + }, + { + "epoch": 0.17669055780617016, + "grad_norm": 0.3925075829029083, + "learning_rate": 9.249265696392377e-05, + "loss": 1.784, + "step": 1701 + }, + { + "epoch": 0.17679443232575048, + "grad_norm": 0.3799852132797241, + "learning_rate": 9.248405554885095e-05, + "loss": 1.7904, + "step": 1702 + }, + { + "epoch": 0.17689830684533084, + "grad_norm": 0.37215399742126465, + "learning_rate": 9.24754496095576e-05, + "loss": 1.8846, + "step": 1703 + }, + { + "epoch": 0.1770021813649112, + "grad_norm": 0.37840306758880615, + "learning_rate": 9.246683914696019e-05, + "loss": 1.7425, + "step": 1704 + }, + { + "epoch": 0.17710605588449155, + "grad_norm": 0.362456351518631, + "learning_rate": 9.245822416197567e-05, + "loss": 1.8047, + "step": 1705 + }, + { + "epoch": 0.17720993040407187, + "grad_norm": 0.3728417456150055, + "learning_rate": 9.244960465552148e-05, + "loss": 1.8033, + "step": 1706 + }, + { + "epoch": 0.17731380492365223, + "grad_norm": 0.3797183334827423, + "learning_rate": 9.244098062851549e-05, + "loss": 1.9075, + "step": 1707 + }, + { + "epoch": 0.17741767944323258, + "grad_norm": 0.38207125663757324, + "learning_rate": 9.243235208187612e-05, + "loss": 1.8054, + "step": 1708 + }, + { + "epoch": 0.1775215539628129, + "grad_norm": 0.3418576419353485, + "learning_rate": 9.242371901652223e-05, + "loss": 1.8205, + "step": 1709 + }, + { + "epoch": 0.17762542848239327, + "grad_norm": 0.4341420829296112, + "learning_rate": 9.241508143337319e-05, + "loss": 2.1321, + "step": 1710 + }, + { + "epoch": 0.17772930300197362, + "grad_norm": 0.38019636273384094, + "learning_rate": 9.240643933334884e-05, + "loss": 1.6551, + "step": 1711 + }, + { + "epoch": 0.17783317752155398, + "grad_norm": 0.3897992968559265, + "learning_rate": 9.239779271736947e-05, + "loss": 1.8644, + "step": 1712 + }, + { + "epoch": 0.1779370520411343, + "grad_norm": 0.3626013398170471, + "learning_rate": 9.238914158635589e-05, + "loss": 1.6417, + "step": 1713 + }, + { + "epoch": 0.17804092656071466, + "grad_norm": 0.38448286056518555, + "learning_rate": 9.238048594122939e-05, + "loss": 1.7762, + "step": 1714 + }, + { + "epoch": 0.178144801080295, + "grad_norm": 0.3795235753059387, + "learning_rate": 9.237182578291171e-05, + "loss": 1.7897, + "step": 1715 + }, + { + "epoch": 0.17824867559987534, + "grad_norm": 0.3505716323852539, + "learning_rate": 9.23631611123251e-05, + "loss": 1.8132, + "step": 1716 + }, + { + "epoch": 0.1783525501194557, + "grad_norm": 0.36175671219825745, + "learning_rate": 9.235449193039227e-05, + "loss": 1.6795, + "step": 1717 + }, + { + "epoch": 0.17845642463903605, + "grad_norm": 0.3952975571155548, + "learning_rate": 9.234581823803645e-05, + "loss": 1.7109, + "step": 1718 + }, + { + "epoch": 0.1785602991586164, + "grad_norm": 0.3580237925052643, + "learning_rate": 9.233714003618127e-05, + "loss": 1.8293, + "step": 1719 + }, + { + "epoch": 0.17866417367819673, + "grad_norm": 0.33214688301086426, + "learning_rate": 9.232845732575093e-05, + "loss": 1.6718, + "step": 1720 + }, + { + "epoch": 0.17876804819777709, + "grad_norm": 0.359581857919693, + "learning_rate": 9.231977010767005e-05, + "loss": 1.8014, + "step": 1721 + }, + { + "epoch": 0.17887192271735744, + "grad_norm": 0.3323659300804138, + "learning_rate": 9.231107838286378e-05, + "loss": 1.6053, + "step": 1722 + }, + { + "epoch": 0.17897579723693777, + "grad_norm": 0.3609966039657593, + "learning_rate": 9.230238215225769e-05, + "loss": 1.9612, + "step": 1723 + }, + { + "epoch": 0.17907967175651812, + "grad_norm": 0.33431074023246765, + "learning_rate": 9.229368141677788e-05, + "loss": 1.6002, + "step": 1724 + }, + { + "epoch": 0.17918354627609848, + "grad_norm": 0.37472623586654663, + "learning_rate": 9.228497617735088e-05, + "loss": 1.7805, + "step": 1725 + }, + { + "epoch": 0.17928742079567883, + "grad_norm": 0.3726039528846741, + "learning_rate": 9.227626643490377e-05, + "loss": 1.8624, + "step": 1726 + }, + { + "epoch": 0.17939129531525916, + "grad_norm": 0.4065108895301819, + "learning_rate": 9.226755219036405e-05, + "loss": 1.7586, + "step": 1727 + }, + { + "epoch": 0.1794951698348395, + "grad_norm": 0.35552752017974854, + "learning_rate": 9.225883344465972e-05, + "loss": 1.6, + "step": 1728 + }, + { + "epoch": 0.17959904435441987, + "grad_norm": 0.3508833944797516, + "learning_rate": 9.225011019871927e-05, + "loss": 1.7917, + "step": 1729 + }, + { + "epoch": 0.1797029188740002, + "grad_norm": 0.374521404504776, + "learning_rate": 9.224138245347163e-05, + "loss": 1.7524, + "step": 1730 + }, + { + "epoch": 0.17980679339358055, + "grad_norm": 0.36706939339637756, + "learning_rate": 9.223265020984626e-05, + "loss": 1.7125, + "step": 1731 + }, + { + "epoch": 0.1799106679131609, + "grad_norm": 0.3556000590324402, + "learning_rate": 9.222391346877308e-05, + "loss": 1.6935, + "step": 1732 + }, + { + "epoch": 0.18001454243274126, + "grad_norm": 0.3797661066055298, + "learning_rate": 9.221517223118245e-05, + "loss": 1.7725, + "step": 1733 + }, + { + "epoch": 0.18011841695232159, + "grad_norm": 0.3610544502735138, + "learning_rate": 9.22064264980053e-05, + "loss": 1.8192, + "step": 1734 + }, + { + "epoch": 0.18022229147190194, + "grad_norm": 0.46052882075309753, + "learning_rate": 9.219767627017293e-05, + "loss": 2.0761, + "step": 1735 + }, + { + "epoch": 0.1803261659914823, + "grad_norm": 0.4026206135749817, + "learning_rate": 9.218892154861721e-05, + "loss": 1.978, + "step": 1736 + }, + { + "epoch": 0.18043004051106262, + "grad_norm": 0.37845221161842346, + "learning_rate": 9.218016233427042e-05, + "loss": 1.7811, + "step": 1737 + }, + { + "epoch": 0.18053391503064298, + "grad_norm": 0.4019536077976227, + "learning_rate": 9.217139862806534e-05, + "loss": 1.9864, + "step": 1738 + }, + { + "epoch": 0.18063778955022333, + "grad_norm": 0.36031562089920044, + "learning_rate": 9.21626304309353e-05, + "loss": 1.6712, + "step": 1739 + }, + { + "epoch": 0.1807416640698037, + "grad_norm": 0.37847834825515747, + "learning_rate": 9.215385774381395e-05, + "loss": 1.7967, + "step": 1740 + }, + { + "epoch": 0.18084553858938401, + "grad_norm": 0.38624006509780884, + "learning_rate": 9.21450805676356e-05, + "loss": 1.8833, + "step": 1741 + }, + { + "epoch": 0.18094941310896437, + "grad_norm": 0.34793728590011597, + "learning_rate": 9.213629890333489e-05, + "loss": 1.8027, + "step": 1742 + }, + { + "epoch": 0.18105328762854472, + "grad_norm": 0.3407808542251587, + "learning_rate": 9.212751275184704e-05, + "loss": 1.658, + "step": 1743 + }, + { + "epoch": 0.18115716214812508, + "grad_norm": 0.3690899908542633, + "learning_rate": 9.211872211410769e-05, + "loss": 1.6251, + "step": 1744 + }, + { + "epoch": 0.1812610366677054, + "grad_norm": 0.3415517210960388, + "learning_rate": 9.210992699105296e-05, + "loss": 1.5998, + "step": 1745 + }, + { + "epoch": 0.18136491118728576, + "grad_norm": 0.36001309752464294, + "learning_rate": 9.210112738361949e-05, + "loss": 1.8308, + "step": 1746 + }, + { + "epoch": 0.18146878570686611, + "grad_norm": 0.3720461130142212, + "learning_rate": 9.209232329274433e-05, + "loss": 1.7902, + "step": 1747 + }, + { + "epoch": 0.18157266022644644, + "grad_norm": 0.42275193333625793, + "learning_rate": 9.208351471936509e-05, + "loss": 1.9225, + "step": 1748 + }, + { + "epoch": 0.1816765347460268, + "grad_norm": 0.36535343527793884, + "learning_rate": 9.20747016644198e-05, + "loss": 1.7431, + "step": 1749 + }, + { + "epoch": 0.18178040926560715, + "grad_norm": 0.3501549959182739, + "learning_rate": 9.206588412884698e-05, + "loss": 1.6417, + "step": 1750 + }, + { + "epoch": 0.1818842837851875, + "grad_norm": 0.34967172145843506, + "learning_rate": 9.205706211358563e-05, + "loss": 1.7612, + "step": 1751 + }, + { + "epoch": 0.18198815830476783, + "grad_norm": 0.3451713025569916, + "learning_rate": 9.204823561957521e-05, + "loss": 1.6711, + "step": 1752 + }, + { + "epoch": 0.1820920328243482, + "grad_norm": 0.37724578380584717, + "learning_rate": 9.203940464775568e-05, + "loss": 1.9513, + "step": 1753 + }, + { + "epoch": 0.18219590734392854, + "grad_norm": 0.34096843004226685, + "learning_rate": 9.203056919906748e-05, + "loss": 1.6691, + "step": 1754 + }, + { + "epoch": 0.18229978186350887, + "grad_norm": 0.3424280881881714, + "learning_rate": 9.202172927445152e-05, + "loss": 1.6556, + "step": 1755 + }, + { + "epoch": 0.18240365638308922, + "grad_norm": 0.3700230121612549, + "learning_rate": 9.201288487484916e-05, + "loss": 1.7761, + "step": 1756 + }, + { + "epoch": 0.18250753090266958, + "grad_norm": 0.4259064197540283, + "learning_rate": 9.200403600120229e-05, + "loss": 2.1197, + "step": 1757 + }, + { + "epoch": 0.18261140542224993, + "grad_norm": 0.40020573139190674, + "learning_rate": 9.199518265445321e-05, + "loss": 1.9191, + "step": 1758 + }, + { + "epoch": 0.18271527994183026, + "grad_norm": 0.36124834418296814, + "learning_rate": 9.198632483554478e-05, + "loss": 1.8001, + "step": 1759 + }, + { + "epoch": 0.18281915446141062, + "grad_norm": 0.4086042642593384, + "learning_rate": 9.197746254542025e-05, + "loss": 1.8518, + "step": 1760 + }, + { + "epoch": 0.18292302898099097, + "grad_norm": 0.37972450256347656, + "learning_rate": 9.196859578502338e-05, + "loss": 1.952, + "step": 1761 + }, + { + "epoch": 0.1830269035005713, + "grad_norm": 0.4148435890674591, + "learning_rate": 9.195972455529844e-05, + "loss": 1.8547, + "step": 1762 + }, + { + "epoch": 0.18313077802015165, + "grad_norm": 0.3922263979911804, + "learning_rate": 9.195084885719014e-05, + "loss": 1.9248, + "step": 1763 + }, + { + "epoch": 0.183234652539732, + "grad_norm": 0.3774312138557434, + "learning_rate": 9.194196869164366e-05, + "loss": 1.8411, + "step": 1764 + }, + { + "epoch": 0.18333852705931236, + "grad_norm": 0.35800519585609436, + "learning_rate": 9.193308405960466e-05, + "loss": 1.6903, + "step": 1765 + }, + { + "epoch": 0.1834424015788927, + "grad_norm": 0.40143853425979614, + "learning_rate": 9.192419496201932e-05, + "loss": 1.7171, + "step": 1766 + }, + { + "epoch": 0.18354627609847304, + "grad_norm": 0.3324778974056244, + "learning_rate": 9.191530139983423e-05, + "loss": 1.6295, + "step": 1767 + }, + { + "epoch": 0.1836501506180534, + "grad_norm": 0.3377951979637146, + "learning_rate": 9.190640337399647e-05, + "loss": 1.5843, + "step": 1768 + }, + { + "epoch": 0.18375402513763373, + "grad_norm": 0.39436647295951843, + "learning_rate": 9.189750088545367e-05, + "loss": 1.957, + "step": 1769 + }, + { + "epoch": 0.18385789965721408, + "grad_norm": 0.42431554198265076, + "learning_rate": 9.188859393515382e-05, + "loss": 1.9529, + "step": 1770 + }, + { + "epoch": 0.18396177417679443, + "grad_norm": 0.3626527786254883, + "learning_rate": 9.187968252404547e-05, + "loss": 1.7235, + "step": 1771 + }, + { + "epoch": 0.1840656486963748, + "grad_norm": 0.42632752656936646, + "learning_rate": 9.187076665307758e-05, + "loss": 2.1363, + "step": 1772 + }, + { + "epoch": 0.18416952321595512, + "grad_norm": 0.41993650794029236, + "learning_rate": 9.186184632319968e-05, + "loss": 1.7975, + "step": 1773 + }, + { + "epoch": 0.18427339773553547, + "grad_norm": 0.37287190556526184, + "learning_rate": 9.185292153536165e-05, + "loss": 1.8601, + "step": 1774 + }, + { + "epoch": 0.18437727225511583, + "grad_norm": 0.38827162981033325, + "learning_rate": 9.184399229051397e-05, + "loss": 1.8401, + "step": 1775 + }, + { + "epoch": 0.18448114677469618, + "grad_norm": 0.3717474937438965, + "learning_rate": 9.183505858960748e-05, + "loss": 1.7523, + "step": 1776 + }, + { + "epoch": 0.1845850212942765, + "grad_norm": 0.3732149600982666, + "learning_rate": 9.18261204335936e-05, + "loss": 1.7327, + "step": 1777 + }, + { + "epoch": 0.18468889581385686, + "grad_norm": 0.3960385024547577, + "learning_rate": 9.181717782342414e-05, + "loss": 1.9453, + "step": 1778 + }, + { + "epoch": 0.18479277033343722, + "grad_norm": 0.38847383856773376, + "learning_rate": 9.180823076005143e-05, + "loss": 1.5841, + "step": 1779 + }, + { + "epoch": 0.18489664485301754, + "grad_norm": 0.37122729420661926, + "learning_rate": 9.179927924442826e-05, + "loss": 1.8535, + "step": 1780 + }, + { + "epoch": 0.1850005193725979, + "grad_norm": 0.4589596688747406, + "learning_rate": 9.17903232775079e-05, + "loss": 2.1207, + "step": 1781 + }, + { + "epoch": 0.18510439389217825, + "grad_norm": 0.4033050537109375, + "learning_rate": 9.178136286024408e-05, + "loss": 1.9405, + "step": 1782 + }, + { + "epoch": 0.1852082684117586, + "grad_norm": 0.37945422530174255, + "learning_rate": 9.177239799359102e-05, + "loss": 1.8607, + "step": 1783 + }, + { + "epoch": 0.18531214293133894, + "grad_norm": 0.44369664788246155, + "learning_rate": 9.176342867850343e-05, + "loss": 1.9337, + "step": 1784 + }, + { + "epoch": 0.1854160174509193, + "grad_norm": 0.3960365951061249, + "learning_rate": 9.175445491593644e-05, + "loss": 1.8079, + "step": 1785 + }, + { + "epoch": 0.18551989197049965, + "grad_norm": 0.34727954864501953, + "learning_rate": 9.174547670684569e-05, + "loss": 1.5856, + "step": 1786 + }, + { + "epoch": 0.18562376649007997, + "grad_norm": 0.3571012616157532, + "learning_rate": 9.173649405218732e-05, + "loss": 1.6828, + "step": 1787 + }, + { + "epoch": 0.18572764100966033, + "grad_norm": 0.35931140184402466, + "learning_rate": 9.172750695291787e-05, + "loss": 1.5845, + "step": 1788 + }, + { + "epoch": 0.18583151552924068, + "grad_norm": 0.36305472254753113, + "learning_rate": 9.171851540999442e-05, + "loss": 1.7407, + "step": 1789 + }, + { + "epoch": 0.18593539004882104, + "grad_norm": 0.377492219209671, + "learning_rate": 9.17095194243745e-05, + "loss": 1.8655, + "step": 1790 + }, + { + "epoch": 0.18603926456840136, + "grad_norm": 0.3644843101501465, + "learning_rate": 9.17005189970161e-05, + "loss": 1.7179, + "step": 1791 + }, + { + "epoch": 0.18614313908798172, + "grad_norm": 0.3360661268234253, + "learning_rate": 9.169151412887774e-05, + "loss": 1.6426, + "step": 1792 + }, + { + "epoch": 0.18624701360756207, + "grad_norm": 0.38643765449523926, + "learning_rate": 9.16825048209183e-05, + "loss": 1.7468, + "step": 1793 + }, + { + "epoch": 0.1863508881271424, + "grad_norm": 0.34640881419181824, + "learning_rate": 9.167349107409721e-05, + "loss": 1.5946, + "step": 1794 + }, + { + "epoch": 0.18645476264672275, + "grad_norm": 0.33807694911956787, + "learning_rate": 9.166447288937444e-05, + "loss": 1.6169, + "step": 1795 + }, + { + "epoch": 0.1865586371663031, + "grad_norm": 0.4004231095314026, + "learning_rate": 9.165545026771026e-05, + "loss": 1.7109, + "step": 1796 + }, + { + "epoch": 0.18666251168588346, + "grad_norm": 0.3662926256656647, + "learning_rate": 9.164642321006555e-05, + "loss": 1.7274, + "step": 1797 + }, + { + "epoch": 0.1867663862054638, + "grad_norm": 0.37025517225265503, + "learning_rate": 9.163739171740167e-05, + "loss": 1.8982, + "step": 1798 + }, + { + "epoch": 0.18687026072504415, + "grad_norm": 0.4032931327819824, + "learning_rate": 9.162835579068032e-05, + "loss": 1.8586, + "step": 1799 + }, + { + "epoch": 0.1869741352446245, + "grad_norm": 0.36516958475112915, + "learning_rate": 9.161931543086379e-05, + "loss": 1.8012, + "step": 1800 + }, + { + "epoch": 0.18707800976420483, + "grad_norm": 0.35469850897789, + "learning_rate": 9.161027063891481e-05, + "loss": 1.6804, + "step": 1801 + }, + { + "epoch": 0.18718188428378518, + "grad_norm": 0.4029258191585541, + "learning_rate": 9.160122141579659e-05, + "loss": 2.0161, + "step": 1802 + }, + { + "epoch": 0.18728575880336554, + "grad_norm": 0.3713259696960449, + "learning_rate": 9.159216776247279e-05, + "loss": 1.7743, + "step": 1803 + }, + { + "epoch": 0.1873896333229459, + "grad_norm": 0.3715076744556427, + "learning_rate": 9.158310967990755e-05, + "loss": 1.8117, + "step": 1804 + }, + { + "epoch": 0.18749350784252622, + "grad_norm": 0.43088141083717346, + "learning_rate": 9.15740471690655e-05, + "loss": 1.8559, + "step": 1805 + }, + { + "epoch": 0.18759738236210657, + "grad_norm": 0.38683950901031494, + "learning_rate": 9.156498023091172e-05, + "loss": 1.8888, + "step": 1806 + }, + { + "epoch": 0.18770125688168693, + "grad_norm": 0.3979988694190979, + "learning_rate": 9.155590886641174e-05, + "loss": 1.8934, + "step": 1807 + }, + { + "epoch": 0.18780513140126726, + "grad_norm": 0.39728519320487976, + "learning_rate": 9.154683307653162e-05, + "loss": 1.8035, + "step": 1808 + }, + { + "epoch": 0.1879090059208476, + "grad_norm": 0.42086923122406006, + "learning_rate": 9.153775286223787e-05, + "loss": 1.9612, + "step": 1809 + }, + { + "epoch": 0.18801288044042797, + "grad_norm": 0.40852025151252747, + "learning_rate": 9.152866822449743e-05, + "loss": 1.6764, + "step": 1810 + }, + { + "epoch": 0.18811675496000832, + "grad_norm": 0.3808266222476959, + "learning_rate": 9.151957916427778e-05, + "loss": 1.7292, + "step": 1811 + }, + { + "epoch": 0.18822062947958865, + "grad_norm": 0.4031354784965515, + "learning_rate": 9.151048568254682e-05, + "loss": 1.9939, + "step": 1812 + }, + { + "epoch": 0.188324503999169, + "grad_norm": 0.3865586817264557, + "learning_rate": 9.15013877802729e-05, + "loss": 1.7508, + "step": 1813 + }, + { + "epoch": 0.18842837851874936, + "grad_norm": 0.35779932141304016, + "learning_rate": 9.149228545842492e-05, + "loss": 1.766, + "step": 1814 + }, + { + "epoch": 0.1885322530383297, + "grad_norm": 0.35968711972236633, + "learning_rate": 9.148317871797219e-05, + "loss": 1.8, + "step": 1815 + }, + { + "epoch": 0.18863612755791004, + "grad_norm": 0.44821977615356445, + "learning_rate": 9.14740675598845e-05, + "loss": 1.8142, + "step": 1816 + }, + { + "epoch": 0.1887400020774904, + "grad_norm": 0.433288037776947, + "learning_rate": 9.146495198513214e-05, + "loss": 1.9191, + "step": 1817 + }, + { + "epoch": 0.18884387659707075, + "grad_norm": 0.39990267157554626, + "learning_rate": 9.145583199468582e-05, + "loss": 1.8575, + "step": 1818 + }, + { + "epoch": 0.18894775111665107, + "grad_norm": 0.37523946166038513, + "learning_rate": 9.144670758951677e-05, + "loss": 1.799, + "step": 1819 + }, + { + "epoch": 0.18905162563623143, + "grad_norm": 0.3855798542499542, + "learning_rate": 9.143757877059665e-05, + "loss": 1.8317, + "step": 1820 + }, + { + "epoch": 0.18915550015581178, + "grad_norm": 0.35304591059684753, + "learning_rate": 9.142844553889762e-05, + "loss": 1.5719, + "step": 1821 + }, + { + "epoch": 0.18925937467539214, + "grad_norm": 0.3850051760673523, + "learning_rate": 9.141930789539229e-05, + "loss": 1.7996, + "step": 1822 + }, + { + "epoch": 0.18936324919497247, + "grad_norm": 0.3510708212852478, + "learning_rate": 9.141016584105376e-05, + "loss": 1.5699, + "step": 1823 + }, + { + "epoch": 0.18946712371455282, + "grad_norm": 0.3610669672489166, + "learning_rate": 9.140101937685558e-05, + "loss": 1.7303, + "step": 1824 + }, + { + "epoch": 0.18957099823413318, + "grad_norm": 0.34818708896636963, + "learning_rate": 9.139186850377177e-05, + "loss": 1.7005, + "step": 1825 + }, + { + "epoch": 0.1896748727537135, + "grad_norm": 0.35581493377685547, + "learning_rate": 9.138271322277685e-05, + "loss": 1.6708, + "step": 1826 + }, + { + "epoch": 0.18977874727329386, + "grad_norm": 0.35102319717407227, + "learning_rate": 9.137355353484573e-05, + "loss": 1.8142, + "step": 1827 + }, + { + "epoch": 0.1898826217928742, + "grad_norm": 0.3373032212257385, + "learning_rate": 9.136438944095392e-05, + "loss": 1.741, + "step": 1828 + }, + { + "epoch": 0.18998649631245457, + "grad_norm": 0.37855619192123413, + "learning_rate": 9.135522094207729e-05, + "loss": 1.7995, + "step": 1829 + }, + { + "epoch": 0.1900903708320349, + "grad_norm": 0.35194653272628784, + "learning_rate": 9.13460480391922e-05, + "loss": 1.5989, + "step": 1830 + }, + { + "epoch": 0.19019424535161525, + "grad_norm": 0.3600010871887207, + "learning_rate": 9.13368707332755e-05, + "loss": 1.8601, + "step": 1831 + }, + { + "epoch": 0.1902981198711956, + "grad_norm": 0.37306833267211914, + "learning_rate": 9.13276890253045e-05, + "loss": 1.86, + "step": 1832 + }, + { + "epoch": 0.19040199439077593, + "grad_norm": 0.355836421251297, + "learning_rate": 9.131850291625702e-05, + "loss": 1.6658, + "step": 1833 + }, + { + "epoch": 0.19050586891035629, + "grad_norm": 0.4272918999195099, + "learning_rate": 9.130931240711126e-05, + "loss": 1.9726, + "step": 1834 + }, + { + "epoch": 0.19060974342993664, + "grad_norm": 0.4324601888656616, + "learning_rate": 9.130011749884595e-05, + "loss": 1.851, + "step": 1835 + }, + { + "epoch": 0.190713617949517, + "grad_norm": 0.36520737409591675, + "learning_rate": 9.129091819244028e-05, + "loss": 1.8169, + "step": 1836 + }, + { + "epoch": 0.19081749246909732, + "grad_norm": 0.4533885419368744, + "learning_rate": 9.12817144888739e-05, + "loss": 2.0705, + "step": 1837 + }, + { + "epoch": 0.19092136698867768, + "grad_norm": 0.3979325592517853, + "learning_rate": 9.127250638912695e-05, + "loss": 1.7838, + "step": 1838 + }, + { + "epoch": 0.19102524150825803, + "grad_norm": 0.3549365997314453, + "learning_rate": 9.126329389418e-05, + "loss": 1.8239, + "step": 1839 + }, + { + "epoch": 0.19112911602783836, + "grad_norm": 0.3781158924102783, + "learning_rate": 9.125407700501412e-05, + "loss": 1.6151, + "step": 1840 + }, + { + "epoch": 0.1912329905474187, + "grad_norm": 0.388790100812912, + "learning_rate": 9.124485572261082e-05, + "loss": 1.9243, + "step": 1841 + }, + { + "epoch": 0.19133686506699907, + "grad_norm": 0.4158736765384674, + "learning_rate": 9.123563004795213e-05, + "loss": 1.9735, + "step": 1842 + }, + { + "epoch": 0.19144073958657942, + "grad_norm": 0.39373651146888733, + "learning_rate": 9.122639998202047e-05, + "loss": 1.8774, + "step": 1843 + }, + { + "epoch": 0.19154461410615975, + "grad_norm": 0.3896943926811218, + "learning_rate": 9.12171655257988e-05, + "loss": 1.8337, + "step": 1844 + }, + { + "epoch": 0.1916484886257401, + "grad_norm": 0.3980332016944885, + "learning_rate": 9.120792668027049e-05, + "loss": 1.8081, + "step": 1845 + }, + { + "epoch": 0.19175236314532046, + "grad_norm": 0.4258505403995514, + "learning_rate": 9.119868344641944e-05, + "loss": 1.8096, + "step": 1846 + }, + { + "epoch": 0.19185623766490079, + "grad_norm": 0.3683486580848694, + "learning_rate": 9.118943582522996e-05, + "loss": 1.826, + "step": 1847 + }, + { + "epoch": 0.19196011218448114, + "grad_norm": 0.3608863949775696, + "learning_rate": 9.118018381768687e-05, + "loss": 1.6942, + "step": 1848 + }, + { + "epoch": 0.1920639867040615, + "grad_norm": 0.35737764835357666, + "learning_rate": 9.11709274247754e-05, + "loss": 1.8741, + "step": 1849 + }, + { + "epoch": 0.19216786122364185, + "grad_norm": 0.36617088317871094, + "learning_rate": 9.116166664748131e-05, + "loss": 1.7608, + "step": 1850 + }, + { + "epoch": 0.19227173574322218, + "grad_norm": 0.40373408794403076, + "learning_rate": 9.11524014867908e-05, + "loss": 1.715, + "step": 1851 + }, + { + "epoch": 0.19237561026280253, + "grad_norm": 0.36378052830696106, + "learning_rate": 9.114313194369053e-05, + "loss": 1.5889, + "step": 1852 + }, + { + "epoch": 0.1924794847823829, + "grad_norm": 0.37960201501846313, + "learning_rate": 9.113385801916765e-05, + "loss": 1.6605, + "step": 1853 + }, + { + "epoch": 0.19258335930196324, + "grad_norm": 0.3682219982147217, + "learning_rate": 9.112457971420973e-05, + "loss": 1.8351, + "step": 1854 + }, + { + "epoch": 0.19268723382154357, + "grad_norm": 0.38188329339027405, + "learning_rate": 9.111529702980486e-05, + "loss": 1.8845, + "step": 1855 + }, + { + "epoch": 0.19279110834112392, + "grad_norm": 0.39885640144348145, + "learning_rate": 9.110600996694157e-05, + "loss": 1.8629, + "step": 1856 + }, + { + "epoch": 0.19289498286070428, + "grad_norm": 0.36927422881126404, + "learning_rate": 9.109671852660885e-05, + "loss": 1.718, + "step": 1857 + }, + { + "epoch": 0.1929988573802846, + "grad_norm": 0.35329553484916687, + "learning_rate": 9.108742270979618e-05, + "loss": 1.6209, + "step": 1858 + }, + { + "epoch": 0.19310273189986496, + "grad_norm": 0.40359756350517273, + "learning_rate": 9.107812251749349e-05, + "loss": 1.936, + "step": 1859 + }, + { + "epoch": 0.19320660641944531, + "grad_norm": 0.3696520924568176, + "learning_rate": 9.106881795069116e-05, + "loss": 1.8329, + "step": 1860 + }, + { + "epoch": 0.19331048093902567, + "grad_norm": 0.38084790110588074, + "learning_rate": 9.105950901038008e-05, + "loss": 1.8072, + "step": 1861 + }, + { + "epoch": 0.193414355458606, + "grad_norm": 0.4722822606563568, + "learning_rate": 9.105019569755157e-05, + "loss": 1.8516, + "step": 1862 + }, + { + "epoch": 0.19351822997818635, + "grad_norm": 0.3589233160018921, + "learning_rate": 9.104087801319741e-05, + "loss": 1.7609, + "step": 1863 + }, + { + "epoch": 0.1936221044977667, + "grad_norm": 0.4372257590293884, + "learning_rate": 9.103155595830988e-05, + "loss": 1.9742, + "step": 1864 + }, + { + "epoch": 0.19372597901734703, + "grad_norm": 0.3735795021057129, + "learning_rate": 9.10222295338817e-05, + "loss": 1.71, + "step": 1865 + }, + { + "epoch": 0.1938298535369274, + "grad_norm": 0.35734692215919495, + "learning_rate": 9.101289874090605e-05, + "loss": 1.6526, + "step": 1866 + }, + { + "epoch": 0.19393372805650774, + "grad_norm": 0.36955031752586365, + "learning_rate": 9.10035635803766e-05, + "loss": 1.787, + "step": 1867 + }, + { + "epoch": 0.1940376025760881, + "grad_norm": 0.3762964904308319, + "learning_rate": 9.099422405328748e-05, + "loss": 1.6645, + "step": 1868 + }, + { + "epoch": 0.19414147709566842, + "grad_norm": 0.40508878231048584, + "learning_rate": 9.098488016063325e-05, + "loss": 1.8264, + "step": 1869 + }, + { + "epoch": 0.19424535161524878, + "grad_norm": 0.43582627177238464, + "learning_rate": 9.097553190340899e-05, + "loss": 2.0092, + "step": 1870 + }, + { + "epoch": 0.19434922613482913, + "grad_norm": 0.379056453704834, + "learning_rate": 9.09661792826102e-05, + "loss": 1.7522, + "step": 1871 + }, + { + "epoch": 0.19445310065440946, + "grad_norm": 0.35009434819221497, + "learning_rate": 9.095682229923287e-05, + "loss": 1.7053, + "step": 1872 + }, + { + "epoch": 0.19455697517398982, + "grad_norm": 0.3844042420387268, + "learning_rate": 9.094746095427342e-05, + "loss": 1.7458, + "step": 1873 + }, + { + "epoch": 0.19466084969357017, + "grad_norm": 0.3690871000289917, + "learning_rate": 9.09380952487288e-05, + "loss": 1.7563, + "step": 1874 + }, + { + "epoch": 0.19476472421315053, + "grad_norm": 0.3871258497238159, + "learning_rate": 9.092872518359637e-05, + "loss": 1.9391, + "step": 1875 + }, + { + "epoch": 0.19486859873273085, + "grad_norm": 0.3885674774646759, + "learning_rate": 9.091935075987395e-05, + "loss": 1.7193, + "step": 1876 + }, + { + "epoch": 0.1949724732523112, + "grad_norm": 0.37015703320503235, + "learning_rate": 9.090997197855986e-05, + "loss": 1.7996, + "step": 1877 + }, + { + "epoch": 0.19507634777189156, + "grad_norm": 0.3506444990634918, + "learning_rate": 9.090058884065287e-05, + "loss": 1.5394, + "step": 1878 + }, + { + "epoch": 0.1951802222914719, + "grad_norm": 0.350041002035141, + "learning_rate": 9.089120134715222e-05, + "loss": 1.8134, + "step": 1879 + }, + { + "epoch": 0.19528409681105224, + "grad_norm": 0.35820406675338745, + "learning_rate": 9.088180949905756e-05, + "loss": 1.7417, + "step": 1880 + }, + { + "epoch": 0.1953879713306326, + "grad_norm": 0.37615320086479187, + "learning_rate": 9.087241329736909e-05, + "loss": 1.7162, + "step": 1881 + }, + { + "epoch": 0.19549184585021295, + "grad_norm": 0.35000884532928467, + "learning_rate": 9.086301274308742e-05, + "loss": 1.6961, + "step": 1882 + }, + { + "epoch": 0.19559572036979328, + "grad_norm": 0.3935372233390808, + "learning_rate": 9.085360783721363e-05, + "loss": 1.6883, + "step": 1883 + }, + { + "epoch": 0.19569959488937363, + "grad_norm": 0.37884512543678284, + "learning_rate": 9.084419858074928e-05, + "loss": 1.756, + "step": 1884 + }, + { + "epoch": 0.195803469408954, + "grad_norm": 0.33219221234321594, + "learning_rate": 9.08347849746964e-05, + "loss": 1.5695, + "step": 1885 + }, + { + "epoch": 0.19590734392853434, + "grad_norm": 0.3523041903972626, + "learning_rate": 9.082536702005741e-05, + "loss": 1.7402, + "step": 1886 + }, + { + "epoch": 0.19601121844811467, + "grad_norm": 0.38876470923423767, + "learning_rate": 9.08159447178353e-05, + "loss": 1.6901, + "step": 1887 + }, + { + "epoch": 0.19611509296769503, + "grad_norm": 0.35667043924331665, + "learning_rate": 9.080651806903346e-05, + "loss": 1.6679, + "step": 1888 + }, + { + "epoch": 0.19621896748727538, + "grad_norm": 0.3476751446723938, + "learning_rate": 9.079708707465575e-05, + "loss": 1.6656, + "step": 1889 + }, + { + "epoch": 0.1963228420068557, + "grad_norm": 0.3749213218688965, + "learning_rate": 9.078765173570648e-05, + "loss": 1.6932, + "step": 1890 + }, + { + "epoch": 0.19642671652643606, + "grad_norm": 0.38287267088890076, + "learning_rate": 9.077821205319045e-05, + "loss": 1.8927, + "step": 1891 + }, + { + "epoch": 0.19653059104601642, + "grad_norm": 0.3563637137413025, + "learning_rate": 9.076876802811293e-05, + "loss": 1.7309, + "step": 1892 + }, + { + "epoch": 0.19663446556559677, + "grad_norm": 0.4049491882324219, + "learning_rate": 9.075931966147964e-05, + "loss": 1.8704, + "step": 1893 + }, + { + "epoch": 0.1967383400851771, + "grad_norm": 0.38649019598960876, + "learning_rate": 9.074986695429673e-05, + "loss": 1.7995, + "step": 1894 + }, + { + "epoch": 0.19684221460475745, + "grad_norm": 0.38346853852272034, + "learning_rate": 9.074040990757085e-05, + "loss": 1.7893, + "step": 1895 + }, + { + "epoch": 0.1969460891243378, + "grad_norm": 0.39437806606292725, + "learning_rate": 9.07309485223091e-05, + "loss": 1.8473, + "step": 1896 + }, + { + "epoch": 0.19704996364391814, + "grad_norm": 0.3689126670360565, + "learning_rate": 9.072148279951905e-05, + "loss": 1.6973, + "step": 1897 + }, + { + "epoch": 0.1971538381634985, + "grad_norm": 0.37634965777397156, + "learning_rate": 9.07120127402087e-05, + "loss": 1.5333, + "step": 1898 + }, + { + "epoch": 0.19725771268307885, + "grad_norm": 0.365343302488327, + "learning_rate": 9.070253834538658e-05, + "loss": 1.7896, + "step": 1899 + }, + { + "epoch": 0.1973615872026592, + "grad_norm": 0.3354664444923401, + "learning_rate": 9.069305961606162e-05, + "loss": 1.5792, + "step": 1900 + }, + { + "epoch": 0.19746546172223953, + "grad_norm": 0.4631555676460266, + "learning_rate": 9.068357655324322e-05, + "loss": 1.9998, + "step": 1901 + }, + { + "epoch": 0.19756933624181988, + "grad_norm": 0.3880319893360138, + "learning_rate": 9.067408915794126e-05, + "loss": 1.6883, + "step": 1902 + }, + { + "epoch": 0.19767321076140024, + "grad_norm": 0.3678785562515259, + "learning_rate": 9.066459743116609e-05, + "loss": 1.7639, + "step": 1903 + }, + { + "epoch": 0.19777708528098056, + "grad_norm": 0.39961332082748413, + "learning_rate": 9.065510137392848e-05, + "loss": 1.8451, + "step": 1904 + }, + { + "epoch": 0.19788095980056092, + "grad_norm": 0.4052257239818573, + "learning_rate": 9.06456009872397e-05, + "loss": 1.7716, + "step": 1905 + }, + { + "epoch": 0.19798483432014127, + "grad_norm": 0.3587489128112793, + "learning_rate": 9.063609627211145e-05, + "loss": 1.7665, + "step": 1906 + }, + { + "epoch": 0.19808870883972163, + "grad_norm": 0.36075523495674133, + "learning_rate": 9.062658722955593e-05, + "loss": 1.7998, + "step": 1907 + }, + { + "epoch": 0.19819258335930195, + "grad_norm": 0.3680761754512787, + "learning_rate": 9.061707386058577e-05, + "loss": 1.7033, + "step": 1908 + }, + { + "epoch": 0.1982964578788823, + "grad_norm": 0.37647727131843567, + "learning_rate": 9.060755616621406e-05, + "loss": 1.8304, + "step": 1909 + }, + { + "epoch": 0.19840033239846266, + "grad_norm": 0.3684910237789154, + "learning_rate": 9.059803414745438e-05, + "loss": 1.6814, + "step": 1910 + }, + { + "epoch": 0.198504206918043, + "grad_norm": 0.3947627544403076, + "learning_rate": 9.058850780532074e-05, + "loss": 1.8224, + "step": 1911 + }, + { + "epoch": 0.19860808143762335, + "grad_norm": 0.38467562198638916, + "learning_rate": 9.057897714082761e-05, + "loss": 1.6889, + "step": 1912 + }, + { + "epoch": 0.1987119559572037, + "grad_norm": 0.38776230812072754, + "learning_rate": 9.056944215498994e-05, + "loss": 1.5652, + "step": 1913 + }, + { + "epoch": 0.19881583047678406, + "grad_norm": 0.36061033606529236, + "learning_rate": 9.055990284882317e-05, + "loss": 1.7888, + "step": 1914 + }, + { + "epoch": 0.19891970499636438, + "grad_norm": 0.37597185373306274, + "learning_rate": 9.05503592233431e-05, + "loss": 1.8526, + "step": 1915 + }, + { + "epoch": 0.19902357951594474, + "grad_norm": 0.39023831486701965, + "learning_rate": 9.054081127956607e-05, + "loss": 1.7178, + "step": 1916 + }, + { + "epoch": 0.1991274540355251, + "grad_norm": 0.37832608819007874, + "learning_rate": 9.053125901850888e-05, + "loss": 1.7645, + "step": 1917 + }, + { + "epoch": 0.19923132855510542, + "grad_norm": 0.3865016996860504, + "learning_rate": 9.052170244118876e-05, + "loss": 1.9984, + "step": 1918 + }, + { + "epoch": 0.19933520307468577, + "grad_norm": 0.42755410075187683, + "learning_rate": 9.05121415486234e-05, + "loss": 1.7952, + "step": 1919 + }, + { + "epoch": 0.19943907759426613, + "grad_norm": 0.3337387442588806, + "learning_rate": 9.050257634183099e-05, + "loss": 1.5986, + "step": 1920 + }, + { + "epoch": 0.19954295211384648, + "grad_norm": 0.35701167583465576, + "learning_rate": 9.049300682183012e-05, + "loss": 1.5109, + "step": 1921 + }, + { + "epoch": 0.1996468266334268, + "grad_norm": 0.3579074442386627, + "learning_rate": 9.048343298963988e-05, + "loss": 1.8023, + "step": 1922 + }, + { + "epoch": 0.19975070115300717, + "grad_norm": 0.3522278666496277, + "learning_rate": 9.047385484627982e-05, + "loss": 1.8454, + "step": 1923 + }, + { + "epoch": 0.19985457567258752, + "grad_norm": 0.35523468255996704, + "learning_rate": 9.046427239276991e-05, + "loss": 1.7483, + "step": 1924 + }, + { + "epoch": 0.19995845019216787, + "grad_norm": 0.32691970467567444, + "learning_rate": 9.045468563013064e-05, + "loss": 1.6541, + "step": 1925 + }, + { + "epoch": 0.2000623247117482, + "grad_norm": 0.3599613606929779, + "learning_rate": 9.04450945593829e-05, + "loss": 1.7776, + "step": 1926 + }, + { + "epoch": 0.20016619923132856, + "grad_norm": 0.3572538495063782, + "learning_rate": 9.043549918154808e-05, + "loss": 1.7373, + "step": 1927 + }, + { + "epoch": 0.2002700737509089, + "grad_norm": 0.3507987856864929, + "learning_rate": 9.0425899497648e-05, + "loss": 1.7154, + "step": 1928 + }, + { + "epoch": 0.20037394827048924, + "grad_norm": 0.3993472456932068, + "learning_rate": 9.041629550870496e-05, + "loss": 1.8059, + "step": 1929 + }, + { + "epoch": 0.2004778227900696, + "grad_norm": 0.34402957558631897, + "learning_rate": 9.040668721574171e-05, + "loss": 1.7075, + "step": 1930 + }, + { + "epoch": 0.20058169730964995, + "grad_norm": 0.3825515806674957, + "learning_rate": 9.039707461978146e-05, + "loss": 1.5313, + "step": 1931 + }, + { + "epoch": 0.2006855718292303, + "grad_norm": 0.3753582537174225, + "learning_rate": 9.038745772184786e-05, + "loss": 1.792, + "step": 1932 + }, + { + "epoch": 0.20078944634881063, + "grad_norm": 0.39727622270584106, + "learning_rate": 9.037783652296505e-05, + "loss": 1.7264, + "step": 1933 + }, + { + "epoch": 0.20089332086839098, + "grad_norm": 0.3653308153152466, + "learning_rate": 9.036821102415762e-05, + "loss": 1.6462, + "step": 1934 + }, + { + "epoch": 0.20099719538797134, + "grad_norm": 0.3842056095600128, + "learning_rate": 9.03585812264506e-05, + "loss": 1.8057, + "step": 1935 + }, + { + "epoch": 0.20110106990755167, + "grad_norm": 0.3687531054019928, + "learning_rate": 9.034894713086947e-05, + "loss": 1.6832, + "step": 1936 + }, + { + "epoch": 0.20120494442713202, + "grad_norm": 0.4252033829689026, + "learning_rate": 9.033930873844023e-05, + "loss": 1.8803, + "step": 1937 + }, + { + "epoch": 0.20130881894671238, + "grad_norm": 0.3911295235157013, + "learning_rate": 9.032966605018924e-05, + "loss": 1.8326, + "step": 1938 + }, + { + "epoch": 0.20141269346629273, + "grad_norm": 0.3815619647502899, + "learning_rate": 9.032001906714343e-05, + "loss": 1.8654, + "step": 1939 + }, + { + "epoch": 0.20151656798587306, + "grad_norm": 0.42655375599861145, + "learning_rate": 9.031036779033006e-05, + "loss": 1.8325, + "step": 1940 + }, + { + "epoch": 0.2016204425054534, + "grad_norm": 0.3972933292388916, + "learning_rate": 9.030071222077698e-05, + "loss": 1.5501, + "step": 1941 + }, + { + "epoch": 0.20172431702503377, + "grad_norm": 0.3478373885154724, + "learning_rate": 9.02910523595124e-05, + "loss": 1.5961, + "step": 1942 + }, + { + "epoch": 0.2018281915446141, + "grad_norm": 0.36317178606987, + "learning_rate": 9.028138820756504e-05, + "loss": 1.7352, + "step": 1943 + }, + { + "epoch": 0.20193206606419445, + "grad_norm": 0.36241620779037476, + "learning_rate": 9.027171976596402e-05, + "loss": 1.7462, + "step": 1944 + }, + { + "epoch": 0.2020359405837748, + "grad_norm": 0.3782672584056854, + "learning_rate": 9.026204703573899e-05, + "loss": 1.7317, + "step": 1945 + }, + { + "epoch": 0.20213981510335516, + "grad_norm": 0.3928276002407074, + "learning_rate": 9.025237001791999e-05, + "loss": 1.7494, + "step": 1946 + }, + { + "epoch": 0.20224368962293549, + "grad_norm": 0.38330700993537903, + "learning_rate": 9.024268871353758e-05, + "loss": 1.6797, + "step": 1947 + }, + { + "epoch": 0.20234756414251584, + "grad_norm": 0.38679707050323486, + "learning_rate": 9.023300312362273e-05, + "loss": 1.7813, + "step": 1948 + }, + { + "epoch": 0.2024514386620962, + "grad_norm": 0.41488325595855713, + "learning_rate": 9.022331324920687e-05, + "loss": 1.8702, + "step": 1949 + }, + { + "epoch": 0.20255531318167652, + "grad_norm": 0.39658311009407043, + "learning_rate": 9.02136190913219e-05, + "loss": 1.7838, + "step": 1950 + }, + { + "epoch": 0.20265918770125688, + "grad_norm": 0.39057308435440063, + "learning_rate": 9.020392065100018e-05, + "loss": 1.814, + "step": 1951 + }, + { + "epoch": 0.20276306222083723, + "grad_norm": 0.374887079000473, + "learning_rate": 9.019421792927452e-05, + "loss": 1.7414, + "step": 1952 + }, + { + "epoch": 0.20286693674041759, + "grad_norm": 0.3583182990550995, + "learning_rate": 9.018451092717816e-05, + "loss": 1.6376, + "step": 1953 + }, + { + "epoch": 0.2029708112599979, + "grad_norm": 0.35195374488830566, + "learning_rate": 9.017479964574485e-05, + "loss": 1.6618, + "step": 1954 + }, + { + "epoch": 0.20307468577957827, + "grad_norm": 0.3840351104736328, + "learning_rate": 9.016508408600875e-05, + "loss": 1.6952, + "step": 1955 + }, + { + "epoch": 0.20317856029915862, + "grad_norm": 0.37217509746551514, + "learning_rate": 9.015536424900449e-05, + "loss": 1.6792, + "step": 1956 + }, + { + "epoch": 0.20328243481873895, + "grad_norm": 0.39523565769195557, + "learning_rate": 9.014564013576716e-05, + "loss": 1.7578, + "step": 1957 + }, + { + "epoch": 0.2033863093383193, + "grad_norm": 0.38107234239578247, + "learning_rate": 9.013591174733231e-05, + "loss": 1.8505, + "step": 1958 + }, + { + "epoch": 0.20349018385789966, + "grad_norm": 0.3982243835926056, + "learning_rate": 9.012617908473593e-05, + "loss": 1.9007, + "step": 1959 + }, + { + "epoch": 0.20359405837748001, + "grad_norm": 0.3710818290710449, + "learning_rate": 9.011644214901447e-05, + "loss": 1.7955, + "step": 1960 + }, + { + "epoch": 0.20369793289706034, + "grad_norm": 0.37171775102615356, + "learning_rate": 9.010670094120485e-05, + "loss": 1.8637, + "step": 1961 + }, + { + "epoch": 0.2038018074166407, + "grad_norm": 0.3907235562801361, + "learning_rate": 9.009695546234443e-05, + "loss": 1.7001, + "step": 1962 + }, + { + "epoch": 0.20390568193622105, + "grad_norm": 0.34530210494995117, + "learning_rate": 9.0087205713471e-05, + "loss": 1.6479, + "step": 1963 + }, + { + "epoch": 0.2040095564558014, + "grad_norm": 0.37163665890693665, + "learning_rate": 9.007745169562285e-05, + "loss": 1.7622, + "step": 1964 + }, + { + "epoch": 0.20411343097538173, + "grad_norm": 0.36681413650512695, + "learning_rate": 9.006769340983873e-05, + "loss": 1.9061, + "step": 1965 + }, + { + "epoch": 0.2042173054949621, + "grad_norm": 0.370410680770874, + "learning_rate": 9.00579308571578e-05, + "loss": 1.823, + "step": 1966 + }, + { + "epoch": 0.20432118001454244, + "grad_norm": 0.43219515681266785, + "learning_rate": 9.004816403861969e-05, + "loss": 1.9272, + "step": 1967 + }, + { + "epoch": 0.20442505453412277, + "grad_norm": 0.3857935667037964, + "learning_rate": 9.00383929552645e-05, + "loss": 1.7746, + "step": 1968 + }, + { + "epoch": 0.20452892905370312, + "grad_norm": 0.37894824147224426, + "learning_rate": 9.002861760813278e-05, + "loss": 1.7197, + "step": 1969 + }, + { + "epoch": 0.20463280357328348, + "grad_norm": 0.35982224345207214, + "learning_rate": 9.001883799826551e-05, + "loss": 1.7175, + "step": 1970 + }, + { + "epoch": 0.20473667809286383, + "grad_norm": 0.3776380717754364, + "learning_rate": 9.000905412670415e-05, + "loss": 1.7624, + "step": 1971 + }, + { + "epoch": 0.20484055261244416, + "grad_norm": 0.38274866342544556, + "learning_rate": 8.999926599449062e-05, + "loss": 1.7201, + "step": 1972 + }, + { + "epoch": 0.20494442713202451, + "grad_norm": 0.3876325488090515, + "learning_rate": 8.998947360266726e-05, + "loss": 1.8462, + "step": 1973 + }, + { + "epoch": 0.20504830165160487, + "grad_norm": 0.3528856039047241, + "learning_rate": 8.99796769522769e-05, + "loss": 1.7983, + "step": 1974 + }, + { + "epoch": 0.2051521761711852, + "grad_norm": 0.3806709945201874, + "learning_rate": 8.996987604436279e-05, + "loss": 2.0234, + "step": 1975 + }, + { + "epoch": 0.20525605069076555, + "grad_norm": 0.3677120506763458, + "learning_rate": 8.996007087996866e-05, + "loss": 1.7771, + "step": 1976 + }, + { + "epoch": 0.2053599252103459, + "grad_norm": 0.3892456591129303, + "learning_rate": 8.995026146013867e-05, + "loss": 1.6017, + "step": 1977 + }, + { + "epoch": 0.20546379972992626, + "grad_norm": 0.4247187077999115, + "learning_rate": 8.994044778591749e-05, + "loss": 1.7722, + "step": 1978 + }, + { + "epoch": 0.2055676742495066, + "grad_norm": 0.3515567183494568, + "learning_rate": 8.993062985835013e-05, + "loss": 1.4704, + "step": 1979 + }, + { + "epoch": 0.20567154876908694, + "grad_norm": 0.3741825520992279, + "learning_rate": 8.99208076784822e-05, + "loss": 1.6708, + "step": 1980 + }, + { + "epoch": 0.2057754232886673, + "grad_norm": 0.37131038308143616, + "learning_rate": 8.991098124735961e-05, + "loss": 1.7259, + "step": 1981 + }, + { + "epoch": 0.20587929780824762, + "grad_norm": 0.3691762387752533, + "learning_rate": 8.990115056602885e-05, + "loss": 1.8153, + "step": 1982 + }, + { + "epoch": 0.20598317232782798, + "grad_norm": 0.3380625545978546, + "learning_rate": 8.989131563553679e-05, + "loss": 1.7063, + "step": 1983 + }, + { + "epoch": 0.20608704684740833, + "grad_norm": 0.4201214909553528, + "learning_rate": 8.98814764569308e-05, + "loss": 1.8642, + "step": 1984 + }, + { + "epoch": 0.2061909213669887, + "grad_norm": 0.3742324709892273, + "learning_rate": 8.987163303125863e-05, + "loss": 1.6623, + "step": 1985 + }, + { + "epoch": 0.20629479588656902, + "grad_norm": 0.3564354181289673, + "learning_rate": 8.986178535956856e-05, + "loss": 1.7477, + "step": 1986 + }, + { + "epoch": 0.20639867040614937, + "grad_norm": 0.41312122344970703, + "learning_rate": 8.985193344290929e-05, + "loss": 1.901, + "step": 1987 + }, + { + "epoch": 0.20650254492572973, + "grad_norm": 0.4109034538269043, + "learning_rate": 8.984207728232995e-05, + "loss": 1.7853, + "step": 1988 + }, + { + "epoch": 0.20660641944531005, + "grad_norm": 0.383986234664917, + "learning_rate": 8.983221687888017e-05, + "loss": 1.8616, + "step": 1989 + }, + { + "epoch": 0.2067102939648904, + "grad_norm": 0.38250911235809326, + "learning_rate": 8.982235223360999e-05, + "loss": 1.7691, + "step": 1990 + }, + { + "epoch": 0.20681416848447076, + "grad_norm": 0.35506096482276917, + "learning_rate": 8.981248334756994e-05, + "loss": 1.8013, + "step": 1991 + }, + { + "epoch": 0.20691804300405112, + "grad_norm": 0.33986783027648926, + "learning_rate": 8.980261022181095e-05, + "loss": 1.624, + "step": 1992 + }, + { + "epoch": 0.20702191752363144, + "grad_norm": 0.35081538558006287, + "learning_rate": 8.979273285738445e-05, + "loss": 1.6776, + "step": 1993 + }, + { + "epoch": 0.2071257920432118, + "grad_norm": 0.3631971478462219, + "learning_rate": 8.978285125534229e-05, + "loss": 1.5715, + "step": 1994 + }, + { + "epoch": 0.20722966656279215, + "grad_norm": 0.38734301924705505, + "learning_rate": 8.977296541673678e-05, + "loss": 1.781, + "step": 1995 + }, + { + "epoch": 0.2073335410823725, + "grad_norm": 0.40530499815940857, + "learning_rate": 8.976307534262072e-05, + "loss": 1.8291, + "step": 1996 + }, + { + "epoch": 0.20743741560195283, + "grad_norm": 0.4037350118160248, + "learning_rate": 8.975318103404728e-05, + "loss": 1.9573, + "step": 1997 + }, + { + "epoch": 0.2075412901215332, + "grad_norm": 0.3434680700302124, + "learning_rate": 8.974328249207015e-05, + "loss": 1.7654, + "step": 1998 + }, + { + "epoch": 0.20764516464111354, + "grad_norm": 0.38100457191467285, + "learning_rate": 8.973337971774344e-05, + "loss": 1.7762, + "step": 1999 + }, + { + "epoch": 0.20774903916069387, + "grad_norm": 0.3673883080482483, + "learning_rate": 8.972347271212173e-05, + "loss": 1.7563, + "step": 2000 + }, + { + "epoch": 0.20785291368027423, + "grad_norm": 0.3593236207962036, + "learning_rate": 8.971356147626004e-05, + "loss": 1.6374, + "step": 2001 + }, + { + "epoch": 0.20795678819985458, + "grad_norm": 0.4013366401195526, + "learning_rate": 8.970364601121382e-05, + "loss": 1.7564, + "step": 2002 + }, + { + "epoch": 0.20806066271943494, + "grad_norm": 0.36467140913009644, + "learning_rate": 8.9693726318039e-05, + "loss": 1.7618, + "step": 2003 + }, + { + "epoch": 0.20816453723901526, + "grad_norm": 0.36317500472068787, + "learning_rate": 8.968380239779194e-05, + "loss": 1.6911, + "step": 2004 + }, + { + "epoch": 0.20826841175859562, + "grad_norm": 0.3494802713394165, + "learning_rate": 8.96738742515295e-05, + "loss": 1.7087, + "step": 2005 + }, + { + "epoch": 0.20837228627817597, + "grad_norm": 0.3602224290370941, + "learning_rate": 8.96639418803089e-05, + "loss": 1.698, + "step": 2006 + }, + { + "epoch": 0.2084761607977563, + "grad_norm": 0.3831143379211426, + "learning_rate": 8.965400528518787e-05, + "loss": 1.7733, + "step": 2007 + }, + { + "epoch": 0.20858003531733665, + "grad_norm": 0.3766723871231079, + "learning_rate": 8.964406446722459e-05, + "loss": 1.7092, + "step": 2008 + }, + { + "epoch": 0.208683909836917, + "grad_norm": 0.38016197085380554, + "learning_rate": 8.96341194274777e-05, + "loss": 1.6285, + "step": 2009 + }, + { + "epoch": 0.20878778435649736, + "grad_norm": 0.3981560170650482, + "learning_rate": 8.962417016700624e-05, + "loss": 1.7449, + "step": 2010 + }, + { + "epoch": 0.2088916588760777, + "grad_norm": 0.3760923445224762, + "learning_rate": 8.961421668686974e-05, + "loss": 1.7459, + "step": 2011 + }, + { + "epoch": 0.20899553339565805, + "grad_norm": 0.3543214797973633, + "learning_rate": 8.960425898812816e-05, + "loss": 1.7329, + "step": 2012 + }, + { + "epoch": 0.2090994079152384, + "grad_norm": 0.3547232747077942, + "learning_rate": 8.959429707184192e-05, + "loss": 1.7992, + "step": 2013 + }, + { + "epoch": 0.20920328243481873, + "grad_norm": 0.43568694591522217, + "learning_rate": 8.95843309390719e-05, + "loss": 2.0488, + "step": 2014 + }, + { + "epoch": 0.20930715695439908, + "grad_norm": 0.4254542291164398, + "learning_rate": 8.95743605908794e-05, + "loss": 1.796, + "step": 2015 + }, + { + "epoch": 0.20941103147397944, + "grad_norm": 0.3894093632698059, + "learning_rate": 8.956438602832619e-05, + "loss": 1.8641, + "step": 2016 + }, + { + "epoch": 0.2095149059935598, + "grad_norm": 0.383073091506958, + "learning_rate": 8.955440725247448e-05, + "loss": 1.7898, + "step": 2017 + }, + { + "epoch": 0.20961878051314012, + "grad_norm": 0.37630245089530945, + "learning_rate": 8.954442426438694e-05, + "loss": 1.738, + "step": 2018 + }, + { + "epoch": 0.20972265503272047, + "grad_norm": 0.3684820234775543, + "learning_rate": 8.953443706512667e-05, + "loss": 1.8819, + "step": 2019 + }, + { + "epoch": 0.20982652955230083, + "grad_norm": 0.36796849966049194, + "learning_rate": 8.952444565575723e-05, + "loss": 1.7886, + "step": 2020 + }, + { + "epoch": 0.20993040407188115, + "grad_norm": 0.35004639625549316, + "learning_rate": 8.951445003734263e-05, + "loss": 1.7872, + "step": 2021 + }, + { + "epoch": 0.2100342785914615, + "grad_norm": 0.3540632724761963, + "learning_rate": 8.950445021094733e-05, + "loss": 1.6917, + "step": 2022 + }, + { + "epoch": 0.21013815311104186, + "grad_norm": 0.3964502215385437, + "learning_rate": 8.949444617763623e-05, + "loss": 1.7487, + "step": 2023 + }, + { + "epoch": 0.21024202763062222, + "grad_norm": 0.3707832098007202, + "learning_rate": 8.948443793847467e-05, + "loss": 1.8752, + "step": 2024 + }, + { + "epoch": 0.21034590215020255, + "grad_norm": 0.3899782598018646, + "learning_rate": 8.947442549452846e-05, + "loss": 1.7619, + "step": 2025 + }, + { + "epoch": 0.2104497766697829, + "grad_norm": 0.349178671836853, + "learning_rate": 8.946440884686387e-05, + "loss": 1.5852, + "step": 2026 + }, + { + "epoch": 0.21055365118936326, + "grad_norm": 0.3928735554218292, + "learning_rate": 8.945438799654756e-05, + "loss": 1.8503, + "step": 2027 + }, + { + "epoch": 0.21065752570894358, + "grad_norm": 0.3914186954498291, + "learning_rate": 8.944436294464669e-05, + "loss": 1.6764, + "step": 2028 + }, + { + "epoch": 0.21076140022852394, + "grad_norm": 0.370089054107666, + "learning_rate": 8.943433369222882e-05, + "loss": 1.6127, + "step": 2029 + }, + { + "epoch": 0.2108652747481043, + "grad_norm": 0.35939261317253113, + "learning_rate": 8.942430024036203e-05, + "loss": 1.7161, + "step": 2030 + }, + { + "epoch": 0.21096914926768465, + "grad_norm": 0.36686834692955017, + "learning_rate": 8.941426259011478e-05, + "loss": 1.8648, + "step": 2031 + }, + { + "epoch": 0.21107302378726497, + "grad_norm": 0.3749295473098755, + "learning_rate": 8.940422074255602e-05, + "loss": 1.7433, + "step": 2032 + }, + { + "epoch": 0.21117689830684533, + "grad_norm": 0.36779385805130005, + "learning_rate": 8.93941746987551e-05, + "loss": 1.778, + "step": 2033 + }, + { + "epoch": 0.21128077282642568, + "grad_norm": 0.3840864896774292, + "learning_rate": 8.938412445978186e-05, + "loss": 1.7633, + "step": 2034 + }, + { + "epoch": 0.21138464734600604, + "grad_norm": 0.3512420356273651, + "learning_rate": 8.937407002670659e-05, + "loss": 1.6726, + "step": 2035 + }, + { + "epoch": 0.21148852186558637, + "grad_norm": 0.3997120261192322, + "learning_rate": 8.936401140059998e-05, + "loss": 1.7913, + "step": 2036 + }, + { + "epoch": 0.21159239638516672, + "grad_norm": 0.4093436300754547, + "learning_rate": 8.935394858253321e-05, + "loss": 1.7161, + "step": 2037 + }, + { + "epoch": 0.21169627090474707, + "grad_norm": 0.3791636824607849, + "learning_rate": 8.934388157357788e-05, + "loss": 1.9097, + "step": 2038 + }, + { + "epoch": 0.2118001454243274, + "grad_norm": 0.39315351843833923, + "learning_rate": 8.933381037480606e-05, + "loss": 1.9521, + "step": 2039 + }, + { + "epoch": 0.21190401994390776, + "grad_norm": 0.3954058289527893, + "learning_rate": 8.932373498729025e-05, + "loss": 1.7008, + "step": 2040 + }, + { + "epoch": 0.2120078944634881, + "grad_norm": 0.44486141204833984, + "learning_rate": 8.931365541210342e-05, + "loss": 1.8255, + "step": 2041 + }, + { + "epoch": 0.21211176898306847, + "grad_norm": 0.4147118330001831, + "learning_rate": 8.930357165031893e-05, + "loss": 1.9377, + "step": 2042 + }, + { + "epoch": 0.2122156435026488, + "grad_norm": 0.4043649733066559, + "learning_rate": 8.929348370301063e-05, + "loss": 1.8809, + "step": 2043 + }, + { + "epoch": 0.21231951802222915, + "grad_norm": 0.37084200978279114, + "learning_rate": 8.928339157125283e-05, + "loss": 1.7093, + "step": 2044 + }, + { + "epoch": 0.2124233925418095, + "grad_norm": 0.33814066648483276, + "learning_rate": 8.927329525612025e-05, + "loss": 1.7248, + "step": 2045 + }, + { + "epoch": 0.21252726706138983, + "grad_norm": 0.3607522249221802, + "learning_rate": 8.926319475868807e-05, + "loss": 1.6424, + "step": 2046 + }, + { + "epoch": 0.21263114158097018, + "grad_norm": 0.35810667276382446, + "learning_rate": 8.925309008003192e-05, + "loss": 1.6919, + "step": 2047 + }, + { + "epoch": 0.21273501610055054, + "grad_norm": 0.3557438850402832, + "learning_rate": 8.924298122122783e-05, + "loss": 1.8568, + "step": 2048 + }, + { + "epoch": 0.2128388906201309, + "grad_norm": 0.37123459577560425, + "learning_rate": 8.923286818335237e-05, + "loss": 1.8214, + "step": 2049 + }, + { + "epoch": 0.21294276513971122, + "grad_norm": 0.42854633927345276, + "learning_rate": 8.922275096748247e-05, + "loss": 2.1206, + "step": 2050 + }, + { + "epoch": 0.21304663965929158, + "grad_norm": 0.4002090394496918, + "learning_rate": 8.921262957469554e-05, + "loss": 1.8579, + "step": 2051 + }, + { + "epoch": 0.21315051417887193, + "grad_norm": 0.34181690216064453, + "learning_rate": 8.920250400606944e-05, + "loss": 1.7962, + "step": 2052 + }, + { + "epoch": 0.21325438869845226, + "grad_norm": 0.35831955075263977, + "learning_rate": 8.919237426268246e-05, + "loss": 1.6304, + "step": 2053 + }, + { + "epoch": 0.2133582632180326, + "grad_norm": 0.4297682046890259, + "learning_rate": 8.918224034561333e-05, + "loss": 1.9758, + "step": 2054 + }, + { + "epoch": 0.21346213773761297, + "grad_norm": 0.3815910518169403, + "learning_rate": 8.917210225594122e-05, + "loss": 1.7476, + "step": 2055 + }, + { + "epoch": 0.21356601225719332, + "grad_norm": 0.3674294650554657, + "learning_rate": 8.916195999474579e-05, + "loss": 1.7741, + "step": 2056 + }, + { + "epoch": 0.21366988677677365, + "grad_norm": 0.3721073567867279, + "learning_rate": 8.915181356310709e-05, + "loss": 1.7451, + "step": 2057 + }, + { + "epoch": 0.213773761296354, + "grad_norm": 0.379859983921051, + "learning_rate": 8.914166296210564e-05, + "loss": 1.9665, + "step": 2058 + }, + { + "epoch": 0.21387763581593436, + "grad_norm": 0.3658583462238312, + "learning_rate": 8.913150819282242e-05, + "loss": 1.6796, + "step": 2059 + }, + { + "epoch": 0.21398151033551469, + "grad_norm": 0.3702607750892639, + "learning_rate": 8.912134925633878e-05, + "loss": 1.7107, + "step": 2060 + }, + { + "epoch": 0.21408538485509504, + "grad_norm": 0.37530508637428284, + "learning_rate": 8.911118615373661e-05, + "loss": 1.6896, + "step": 2061 + }, + { + "epoch": 0.2141892593746754, + "grad_norm": 0.3804415464401245, + "learning_rate": 8.910101888609821e-05, + "loss": 1.8049, + "step": 2062 + }, + { + "epoch": 0.21429313389425575, + "grad_norm": 0.36921432614326477, + "learning_rate": 8.909084745450628e-05, + "loss": 1.7652, + "step": 2063 + }, + { + "epoch": 0.21439700841383608, + "grad_norm": 0.38994041085243225, + "learning_rate": 8.908067186004405e-05, + "loss": 1.701, + "step": 2064 + }, + { + "epoch": 0.21450088293341643, + "grad_norm": 0.3705255389213562, + "learning_rate": 8.907049210379508e-05, + "loss": 1.781, + "step": 2065 + }, + { + "epoch": 0.21460475745299679, + "grad_norm": 0.3754466772079468, + "learning_rate": 8.906030818684348e-05, + "loss": 1.6878, + "step": 2066 + }, + { + "epoch": 0.2147086319725771, + "grad_norm": 0.3889169991016388, + "learning_rate": 8.905012011027372e-05, + "loss": 1.6964, + "step": 2067 + }, + { + "epoch": 0.21481250649215747, + "grad_norm": 0.3674185872077942, + "learning_rate": 8.903992787517078e-05, + "loss": 1.2466, + "step": 2068 + }, + { + "epoch": 0.21491638101173782, + "grad_norm": 0.37006455659866333, + "learning_rate": 8.902973148262004e-05, + "loss": 1.7607, + "step": 2069 + }, + { + "epoch": 0.21502025553131818, + "grad_norm": 0.3947128355503082, + "learning_rate": 8.901953093370734e-05, + "loss": 1.8514, + "step": 2070 + }, + { + "epoch": 0.2151241300508985, + "grad_norm": 0.36148831248283386, + "learning_rate": 8.900932622951897e-05, + "loss": 1.5947, + "step": 2071 + }, + { + "epoch": 0.21522800457047886, + "grad_norm": 0.34942540526390076, + "learning_rate": 8.899911737114163e-05, + "loss": 1.7449, + "step": 2072 + }, + { + "epoch": 0.21533187909005921, + "grad_norm": 0.3522939383983612, + "learning_rate": 8.898890435966251e-05, + "loss": 1.7266, + "step": 2073 + }, + { + "epoch": 0.21543575360963957, + "grad_norm": 0.4223746955394745, + "learning_rate": 8.897868719616919e-05, + "loss": 1.8562, + "step": 2074 + }, + { + "epoch": 0.2155396281292199, + "grad_norm": 0.3850705921649933, + "learning_rate": 8.896846588174973e-05, + "loss": 1.8528, + "step": 2075 + }, + { + "epoch": 0.21564350264880025, + "grad_norm": 0.391237735748291, + "learning_rate": 8.89582404174926e-05, + "loss": 1.6268, + "step": 2076 + }, + { + "epoch": 0.2157473771683806, + "grad_norm": 0.40240150690078735, + "learning_rate": 8.89480108044868e-05, + "loss": 1.7256, + "step": 2077 + }, + { + "epoch": 0.21585125168796093, + "grad_norm": 0.400774210691452, + "learning_rate": 8.893777704382163e-05, + "loss": 1.7654, + "step": 2078 + }, + { + "epoch": 0.2159551262075413, + "grad_norm": 0.4116402268409729, + "learning_rate": 8.892753913658691e-05, + "loss": 1.5765, + "step": 2079 + }, + { + "epoch": 0.21605900072712164, + "grad_norm": 0.3588711619377136, + "learning_rate": 8.891729708387294e-05, + "loss": 1.7399, + "step": 2080 + }, + { + "epoch": 0.216162875246702, + "grad_norm": 0.40262454748153687, + "learning_rate": 8.890705088677039e-05, + "loss": 1.8868, + "step": 2081 + }, + { + "epoch": 0.21626674976628232, + "grad_norm": 0.38147544860839844, + "learning_rate": 8.889680054637042e-05, + "loss": 1.907, + "step": 2082 + }, + { + "epoch": 0.21637062428586268, + "grad_norm": 0.40591171383857727, + "learning_rate": 8.888654606376459e-05, + "loss": 1.9046, + "step": 2083 + }, + { + "epoch": 0.21647449880544303, + "grad_norm": 0.3788270652294159, + "learning_rate": 8.887628744004493e-05, + "loss": 1.7253, + "step": 2084 + }, + { + "epoch": 0.21657837332502336, + "grad_norm": 0.38902533054351807, + "learning_rate": 8.88660246763039e-05, + "loss": 1.7522, + "step": 2085 + }, + { + "epoch": 0.21668224784460371, + "grad_norm": 0.3673681318759918, + "learning_rate": 8.885575777363442e-05, + "loss": 1.7231, + "step": 2086 + }, + { + "epoch": 0.21678612236418407, + "grad_norm": 0.33182385563850403, + "learning_rate": 8.884548673312981e-05, + "loss": 1.6142, + "step": 2087 + }, + { + "epoch": 0.21688999688376442, + "grad_norm": 0.38326361775398254, + "learning_rate": 8.883521155588388e-05, + "loss": 1.8635, + "step": 2088 + }, + { + "epoch": 0.21699387140334475, + "grad_norm": 0.38930484652519226, + "learning_rate": 8.882493224299084e-05, + "loss": 1.6453, + "step": 2089 + }, + { + "epoch": 0.2170977459229251, + "grad_norm": 0.3819931447505951, + "learning_rate": 8.881464879554536e-05, + "loss": 1.821, + "step": 2090 + }, + { + "epoch": 0.21720162044250546, + "grad_norm": 0.3670012354850769, + "learning_rate": 8.880436121464255e-05, + "loss": 1.8536, + "step": 2091 + }, + { + "epoch": 0.2173054949620858, + "grad_norm": 0.3817991614341736, + "learning_rate": 8.879406950137796e-05, + "loss": 1.7896, + "step": 2092 + }, + { + "epoch": 0.21740936948166614, + "grad_norm": 0.44066059589385986, + "learning_rate": 8.878377365684758e-05, + "loss": 1.7763, + "step": 2093 + }, + { + "epoch": 0.2175132440012465, + "grad_norm": 0.362020343542099, + "learning_rate": 8.877347368214783e-05, + "loss": 1.7594, + "step": 2094 + }, + { + "epoch": 0.21761711852082685, + "grad_norm": 0.385455459356308, + "learning_rate": 8.876316957837556e-05, + "loss": 1.9007, + "step": 2095 + }, + { + "epoch": 0.21772099304040718, + "grad_norm": 0.3932596743106842, + "learning_rate": 8.87528613466281e-05, + "loss": 1.6164, + "step": 2096 + }, + { + "epoch": 0.21782486755998753, + "grad_norm": 0.3642088770866394, + "learning_rate": 8.874254898800321e-05, + "loss": 1.626, + "step": 2097 + }, + { + "epoch": 0.2179287420795679, + "grad_norm": 0.4024796783924103, + "learning_rate": 8.873223250359903e-05, + "loss": 1.8004, + "step": 2098 + }, + { + "epoch": 0.21803261659914822, + "grad_norm": 0.3582911491394043, + "learning_rate": 8.872191189451422e-05, + "loss": 1.7169, + "step": 2099 + }, + { + "epoch": 0.21813649111872857, + "grad_norm": 0.40865403413772583, + "learning_rate": 8.871158716184784e-05, + "loss": 1.6472, + "step": 2100 + }, + { + "epoch": 0.21824036563830893, + "grad_norm": 0.39730289578437805, + "learning_rate": 8.87012583066994e-05, + "loss": 1.6177, + "step": 2101 + }, + { + "epoch": 0.21834424015788928, + "grad_norm": 0.3871819078922272, + "learning_rate": 8.869092533016882e-05, + "loss": 1.6924, + "step": 2102 + }, + { + "epoch": 0.2184481146774696, + "grad_norm": 0.377941757440567, + "learning_rate": 8.86805882333565e-05, + "loss": 1.8299, + "step": 2103 + }, + { + "epoch": 0.21855198919704996, + "grad_norm": 0.397905558347702, + "learning_rate": 8.867024701736325e-05, + "loss": 1.8411, + "step": 2104 + }, + { + "epoch": 0.21865586371663032, + "grad_norm": 0.3686088025569916, + "learning_rate": 8.865990168329031e-05, + "loss": 1.6618, + "step": 2105 + }, + { + "epoch": 0.21875973823621067, + "grad_norm": 0.394415020942688, + "learning_rate": 8.864955223223943e-05, + "loss": 1.8495, + "step": 2106 + }, + { + "epoch": 0.218863612755791, + "grad_norm": 0.3614397943019867, + "learning_rate": 8.863919866531269e-05, + "loss": 1.7565, + "step": 2107 + }, + { + "epoch": 0.21896748727537135, + "grad_norm": 0.3643103837966919, + "learning_rate": 8.862884098361271e-05, + "loss": 1.7943, + "step": 2108 + }, + { + "epoch": 0.2190713617949517, + "grad_norm": 0.37451231479644775, + "learning_rate": 8.861847918824248e-05, + "loss": 1.5722, + "step": 2109 + }, + { + "epoch": 0.21917523631453203, + "grad_norm": 0.40897324681282043, + "learning_rate": 8.860811328030545e-05, + "loss": 1.8011, + "step": 2110 + }, + { + "epoch": 0.2192791108341124, + "grad_norm": 0.38252517580986023, + "learning_rate": 8.85977432609055e-05, + "loss": 1.7692, + "step": 2111 + }, + { + "epoch": 0.21938298535369274, + "grad_norm": 0.3585284352302551, + "learning_rate": 8.858736913114698e-05, + "loss": 1.6858, + "step": 2112 + }, + { + "epoch": 0.2194868598732731, + "grad_norm": 0.40061917901039124, + "learning_rate": 8.857699089213462e-05, + "loss": 1.5681, + "step": 2113 + }, + { + "epoch": 0.21959073439285343, + "grad_norm": 0.38758501410484314, + "learning_rate": 8.856660854497367e-05, + "loss": 1.7354, + "step": 2114 + }, + { + "epoch": 0.21969460891243378, + "grad_norm": 0.37637653946876526, + "learning_rate": 8.855622209076972e-05, + "loss": 1.6739, + "step": 2115 + }, + { + "epoch": 0.21979848343201414, + "grad_norm": 0.4087028205394745, + "learning_rate": 8.854583153062887e-05, + "loss": 2.0526, + "step": 2116 + }, + { + "epoch": 0.21990235795159446, + "grad_norm": 0.3950178623199463, + "learning_rate": 8.853543686565765e-05, + "loss": 1.8204, + "step": 2117 + }, + { + "epoch": 0.22000623247117482, + "grad_norm": 0.3853364586830139, + "learning_rate": 8.852503809696297e-05, + "loss": 1.7236, + "step": 2118 + }, + { + "epoch": 0.22011010699075517, + "grad_norm": 0.36730098724365234, + "learning_rate": 8.851463522565226e-05, + "loss": 1.7896, + "step": 2119 + }, + { + "epoch": 0.22021398151033553, + "grad_norm": 0.3799102008342743, + "learning_rate": 8.85042282528333e-05, + "loss": 1.8652, + "step": 2120 + }, + { + "epoch": 0.22031785602991585, + "grad_norm": 0.37538158893585205, + "learning_rate": 8.84938171796144e-05, + "loss": 1.8323, + "step": 2121 + }, + { + "epoch": 0.2204217305494962, + "grad_norm": 0.3702995777130127, + "learning_rate": 8.848340200710421e-05, + "loss": 1.6752, + "step": 2122 + }, + { + "epoch": 0.22052560506907656, + "grad_norm": 0.3841915726661682, + "learning_rate": 8.847298273641192e-05, + "loss": 1.8893, + "step": 2123 + }, + { + "epoch": 0.2206294795886569, + "grad_norm": 0.3636316955089569, + "learning_rate": 8.846255936864703e-05, + "loss": 1.6729, + "step": 2124 + }, + { + "epoch": 0.22073335410823725, + "grad_norm": 0.3771231770515442, + "learning_rate": 8.845213190491962e-05, + "loss": 1.6048, + "step": 2125 + }, + { + "epoch": 0.2208372286278176, + "grad_norm": 0.4207725524902344, + "learning_rate": 8.844170034634007e-05, + "loss": 1.9401, + "step": 2126 + }, + { + "epoch": 0.22094110314739795, + "grad_norm": 0.42666593194007874, + "learning_rate": 8.843126469401931e-05, + "loss": 1.9608, + "step": 2127 + }, + { + "epoch": 0.22104497766697828, + "grad_norm": 0.3714624345302582, + "learning_rate": 8.842082494906863e-05, + "loss": 1.7066, + "step": 2128 + }, + { + "epoch": 0.22114885218655864, + "grad_norm": 0.41800081729888916, + "learning_rate": 8.841038111259979e-05, + "loss": 1.8553, + "step": 2129 + }, + { + "epoch": 0.221252726706139, + "grad_norm": 0.34756773710250854, + "learning_rate": 8.839993318572497e-05, + "loss": 1.6992, + "step": 2130 + }, + { + "epoch": 0.22135660122571932, + "grad_norm": 0.3820962905883789, + "learning_rate": 8.83894811695568e-05, + "loss": 1.9582, + "step": 2131 + }, + { + "epoch": 0.22146047574529967, + "grad_norm": 0.3578221797943115, + "learning_rate": 8.837902506520835e-05, + "loss": 1.7399, + "step": 2132 + }, + { + "epoch": 0.22156435026488003, + "grad_norm": 0.3778248727321625, + "learning_rate": 8.836856487379306e-05, + "loss": 1.6818, + "step": 2133 + }, + { + "epoch": 0.22166822478446038, + "grad_norm": 0.36357828974723816, + "learning_rate": 8.835810059642493e-05, + "loss": 1.6495, + "step": 2134 + }, + { + "epoch": 0.2217720993040407, + "grad_norm": 0.384027898311615, + "learning_rate": 8.834763223421828e-05, + "loss": 1.7112, + "step": 2135 + }, + { + "epoch": 0.22187597382362106, + "grad_norm": 0.3461948037147522, + "learning_rate": 8.833715978828793e-05, + "loss": 1.713, + "step": 2136 + }, + { + "epoch": 0.22197984834320142, + "grad_norm": 0.38593146204948425, + "learning_rate": 8.83266832597491e-05, + "loss": 1.6994, + "step": 2137 + }, + { + "epoch": 0.22208372286278175, + "grad_norm": 0.346113383769989, + "learning_rate": 8.831620264971745e-05, + "loss": 1.6051, + "step": 2138 + }, + { + "epoch": 0.2221875973823621, + "grad_norm": 0.3933405876159668, + "learning_rate": 8.830571795930911e-05, + "loss": 1.678, + "step": 2139 + }, + { + "epoch": 0.22229147190194246, + "grad_norm": 0.34719663858413696, + "learning_rate": 8.829522918964058e-05, + "loss": 1.6881, + "step": 2140 + }, + { + "epoch": 0.2223953464215228, + "grad_norm": 0.3760213553905487, + "learning_rate": 8.828473634182888e-05, + "loss": 1.8456, + "step": 2141 + }, + { + "epoch": 0.22249922094110314, + "grad_norm": 0.3988967537879944, + "learning_rate": 8.827423941699137e-05, + "loss": 1.8562, + "step": 2142 + }, + { + "epoch": 0.2226030954606835, + "grad_norm": 0.40549615025520325, + "learning_rate": 8.826373841624593e-05, + "loss": 1.7444, + "step": 2143 + }, + { + "epoch": 0.22270696998026385, + "grad_norm": 0.37362509965896606, + "learning_rate": 8.825323334071083e-05, + "loss": 1.7901, + "step": 2144 + }, + { + "epoch": 0.2228108444998442, + "grad_norm": 0.34235090017318726, + "learning_rate": 8.824272419150475e-05, + "loss": 1.7461, + "step": 2145 + }, + { + "epoch": 0.22291471901942453, + "grad_norm": 0.3918958306312561, + "learning_rate": 8.823221096974684e-05, + "loss": 1.782, + "step": 2146 + }, + { + "epoch": 0.22301859353900488, + "grad_norm": 0.37165430188179016, + "learning_rate": 8.822169367655669e-05, + "loss": 1.803, + "step": 2147 + }, + { + "epoch": 0.22312246805858524, + "grad_norm": 0.3669700026512146, + "learning_rate": 8.821117231305431e-05, + "loss": 1.8741, + "step": 2148 + }, + { + "epoch": 0.22322634257816557, + "grad_norm": 0.3532780408859253, + "learning_rate": 8.820064688036014e-05, + "loss": 1.7279, + "step": 2149 + }, + { + "epoch": 0.22333021709774592, + "grad_norm": 0.3830491006374359, + "learning_rate": 8.819011737959504e-05, + "loss": 1.8127, + "step": 2150 + }, + { + "epoch": 0.22343409161732627, + "grad_norm": 0.36550477147102356, + "learning_rate": 8.817958381188033e-05, + "loss": 1.7985, + "step": 2151 + }, + { + "epoch": 0.22353796613690663, + "grad_norm": 0.3734526038169861, + "learning_rate": 8.816904617833778e-05, + "loss": 1.7096, + "step": 2152 + }, + { + "epoch": 0.22364184065648696, + "grad_norm": 0.36251428723335266, + "learning_rate": 8.815850448008953e-05, + "loss": 1.7936, + "step": 2153 + }, + { + "epoch": 0.2237457151760673, + "grad_norm": 0.3801165223121643, + "learning_rate": 8.81479587182582e-05, + "loss": 1.7445, + "step": 2154 + }, + { + "epoch": 0.22384958969564767, + "grad_norm": 0.38031628727912903, + "learning_rate": 8.813740889396682e-05, + "loss": 1.8105, + "step": 2155 + }, + { + "epoch": 0.223953464215228, + "grad_norm": 0.42004433274269104, + "learning_rate": 8.81268550083389e-05, + "loss": 1.7913, + "step": 2156 + }, + { + "epoch": 0.22405733873480835, + "grad_norm": 0.38428744673728943, + "learning_rate": 8.81162970624983e-05, + "loss": 1.805, + "step": 2157 + }, + { + "epoch": 0.2241612132543887, + "grad_norm": 0.4019632935523987, + "learning_rate": 8.81057350575694e-05, + "loss": 1.6997, + "step": 2158 + }, + { + "epoch": 0.22426508777396906, + "grad_norm": 0.3467335104942322, + "learning_rate": 8.809516899467694e-05, + "loss": 1.5612, + "step": 2159 + }, + { + "epoch": 0.22436896229354938, + "grad_norm": 0.36768898367881775, + "learning_rate": 8.808459887494615e-05, + "loss": 1.6928, + "step": 2160 + }, + { + "epoch": 0.22447283681312974, + "grad_norm": 0.41053932905197144, + "learning_rate": 8.807402469950263e-05, + "loss": 1.7605, + "step": 2161 + }, + { + "epoch": 0.2245767113327101, + "grad_norm": 0.36271244287490845, + "learning_rate": 8.806344646947249e-05, + "loss": 1.7773, + "step": 2162 + }, + { + "epoch": 0.22468058585229042, + "grad_norm": 0.38008955121040344, + "learning_rate": 8.80528641859822e-05, + "loss": 1.8473, + "step": 2163 + }, + { + "epoch": 0.22478446037187078, + "grad_norm": 0.382253497838974, + "learning_rate": 8.804227785015869e-05, + "loss": 1.686, + "step": 2164 + }, + { + "epoch": 0.22488833489145113, + "grad_norm": 0.3781398832798004, + "learning_rate": 8.803168746312934e-05, + "loss": 1.7894, + "step": 2165 + }, + { + "epoch": 0.22499220941103149, + "grad_norm": 0.39482182264328003, + "learning_rate": 8.802109302602193e-05, + "loss": 1.8895, + "step": 2166 + }, + { + "epoch": 0.2250960839306118, + "grad_norm": 0.40680569410324097, + "learning_rate": 8.801049453996468e-05, + "loss": 2.0375, + "step": 2167 + }, + { + "epoch": 0.22519995845019217, + "grad_norm": 0.3483993113040924, + "learning_rate": 8.799989200608627e-05, + "loss": 1.6489, + "step": 2168 + }, + { + "epoch": 0.22530383296977252, + "grad_norm": 0.3627791702747345, + "learning_rate": 8.798928542551576e-05, + "loss": 1.8188, + "step": 2169 + }, + { + "epoch": 0.22540770748935285, + "grad_norm": 0.3764524459838867, + "learning_rate": 8.797867479938269e-05, + "loss": 1.9716, + "step": 2170 + }, + { + "epoch": 0.2255115820089332, + "grad_norm": 0.3761730492115021, + "learning_rate": 8.796806012881699e-05, + "loss": 1.857, + "step": 2171 + }, + { + "epoch": 0.22561545652851356, + "grad_norm": 0.3673245906829834, + "learning_rate": 8.795744141494905e-05, + "loss": 1.6663, + "step": 2172 + }, + { + "epoch": 0.2257193310480939, + "grad_norm": 0.3899873197078705, + "learning_rate": 8.794681865890968e-05, + "loss": 1.7631, + "step": 2173 + }, + { + "epoch": 0.22582320556767424, + "grad_norm": 0.36863207817077637, + "learning_rate": 8.793619186183011e-05, + "loss": 1.8269, + "step": 2174 + }, + { + "epoch": 0.2259270800872546, + "grad_norm": 0.36547142267227173, + "learning_rate": 8.792556102484204e-05, + "loss": 1.6963, + "step": 2175 + }, + { + "epoch": 0.22603095460683495, + "grad_norm": 0.34882575273513794, + "learning_rate": 8.791492614907754e-05, + "loss": 1.6336, + "step": 2176 + }, + { + "epoch": 0.22613482912641528, + "grad_norm": 0.38748350739479065, + "learning_rate": 8.790428723566915e-05, + "loss": 1.8569, + "step": 2177 + }, + { + "epoch": 0.22623870364599563, + "grad_norm": 0.362943559885025, + "learning_rate": 8.789364428574984e-05, + "loss": 1.7161, + "step": 2178 + }, + { + "epoch": 0.22634257816557599, + "grad_norm": 0.37946629524230957, + "learning_rate": 8.788299730045299e-05, + "loss": 1.8211, + "step": 2179 + }, + { + "epoch": 0.22644645268515634, + "grad_norm": 0.3608231842517853, + "learning_rate": 8.787234628091243e-05, + "loss": 1.7704, + "step": 2180 + }, + { + "epoch": 0.22655032720473667, + "grad_norm": 0.38280248641967773, + "learning_rate": 8.786169122826242e-05, + "loss": 1.7376, + "step": 2181 + }, + { + "epoch": 0.22665420172431702, + "grad_norm": 0.36834195256233215, + "learning_rate": 8.785103214363761e-05, + "loss": 1.6503, + "step": 2182 + }, + { + "epoch": 0.22675807624389738, + "grad_norm": 0.3779491186141968, + "learning_rate": 8.784036902817312e-05, + "loss": 1.7421, + "step": 2183 + }, + { + "epoch": 0.22686195076347773, + "grad_norm": 0.40714192390441895, + "learning_rate": 8.78297018830045e-05, + "loss": 1.7828, + "step": 2184 + }, + { + "epoch": 0.22696582528305806, + "grad_norm": 0.36981451511383057, + "learning_rate": 8.781903070926774e-05, + "loss": 1.7687, + "step": 2185 + }, + { + "epoch": 0.22706969980263841, + "grad_norm": 0.37374863028526306, + "learning_rate": 8.780835550809918e-05, + "loss": 1.7469, + "step": 2186 + }, + { + "epoch": 0.22717357432221877, + "grad_norm": 0.3481188416481018, + "learning_rate": 8.779767628063567e-05, + "loss": 1.5268, + "step": 2187 + }, + { + "epoch": 0.2272774488417991, + "grad_norm": 0.352603942155838, + "learning_rate": 8.77869930280145e-05, + "loss": 1.5966, + "step": 2188 + }, + { + "epoch": 0.22738132336137945, + "grad_norm": 0.38702160120010376, + "learning_rate": 8.77763057513733e-05, + "loss": 1.7469, + "step": 2189 + }, + { + "epoch": 0.2274851978809598, + "grad_norm": 0.37263423204421997, + "learning_rate": 8.77656144518502e-05, + "loss": 1.6986, + "step": 2190 + }, + { + "epoch": 0.22758907240054016, + "grad_norm": 0.3658749759197235, + "learning_rate": 8.775491913058376e-05, + "loss": 1.7425, + "step": 2191 + }, + { + "epoch": 0.2276929469201205, + "grad_norm": 0.38823366165161133, + "learning_rate": 8.774421978871292e-05, + "loss": 1.7665, + "step": 2192 + }, + { + "epoch": 0.22779682143970084, + "grad_norm": 0.35970309376716614, + "learning_rate": 8.773351642737708e-05, + "loss": 1.6914, + "step": 2193 + }, + { + "epoch": 0.2279006959592812, + "grad_norm": 0.3693389892578125, + "learning_rate": 8.772280904771608e-05, + "loss": 1.8031, + "step": 2194 + }, + { + "epoch": 0.22800457047886152, + "grad_norm": 0.34769538044929504, + "learning_rate": 8.771209765087018e-05, + "loss": 1.7248, + "step": 2195 + }, + { + "epoch": 0.22810844499844188, + "grad_norm": 0.37290412187576294, + "learning_rate": 8.770138223798003e-05, + "loss": 1.7994, + "step": 2196 + }, + { + "epoch": 0.22821231951802223, + "grad_norm": 0.37960392236709595, + "learning_rate": 8.769066281018676e-05, + "loss": 1.8668, + "step": 2197 + }, + { + "epoch": 0.2283161940376026, + "grad_norm": 0.3551698625087738, + "learning_rate": 8.76799393686319e-05, + "loss": 1.7928, + "step": 2198 + }, + { + "epoch": 0.22842006855718291, + "grad_norm": 0.3406142294406891, + "learning_rate": 8.766921191445742e-05, + "loss": 1.6981, + "step": 2199 + }, + { + "epoch": 0.22852394307676327, + "grad_norm": 0.410206139087677, + "learning_rate": 8.765848044880569e-05, + "loss": 1.634, + "step": 2200 + }, + { + "epoch": 0.22862781759634362, + "grad_norm": 0.42800506949424744, + "learning_rate": 8.764774497281954e-05, + "loss": 1.7564, + "step": 2201 + }, + { + "epoch": 0.22873169211592395, + "grad_norm": 0.39173564314842224, + "learning_rate": 8.763700548764223e-05, + "loss": 1.8504, + "step": 2202 + }, + { + "epoch": 0.2288355666355043, + "grad_norm": 0.35888996720314026, + "learning_rate": 8.76262619944174e-05, + "loss": 1.5756, + "step": 2203 + }, + { + "epoch": 0.22893944115508466, + "grad_norm": 0.4852285087108612, + "learning_rate": 8.761551449428918e-05, + "loss": 1.9804, + "step": 2204 + }, + { + "epoch": 0.22904331567466502, + "grad_norm": 0.4047568440437317, + "learning_rate": 8.760476298840206e-05, + "loss": 1.81, + "step": 2205 + }, + { + "epoch": 0.22914719019424534, + "grad_norm": 0.3965972065925598, + "learning_rate": 8.7594007477901e-05, + "loss": 1.8159, + "step": 2206 + }, + { + "epoch": 0.2292510647138257, + "grad_norm": 0.4031708538532257, + "learning_rate": 8.758324796393142e-05, + "loss": 1.8237, + "step": 2207 + }, + { + "epoch": 0.22935493923340605, + "grad_norm": 0.3491591215133667, + "learning_rate": 8.757248444763908e-05, + "loss": 1.6837, + "step": 2208 + }, + { + "epoch": 0.22945881375298638, + "grad_norm": 0.36505648493766785, + "learning_rate": 8.756171693017022e-05, + "loss": 1.6408, + "step": 2209 + }, + { + "epoch": 0.22956268827256673, + "grad_norm": 0.394775927066803, + "learning_rate": 8.755094541267149e-05, + "loss": 1.8816, + "step": 2210 + }, + { + "epoch": 0.2296665627921471, + "grad_norm": 0.4164559543132782, + "learning_rate": 8.754016989629e-05, + "loss": 1.9372, + "step": 2211 + }, + { + "epoch": 0.22977043731172744, + "grad_norm": 0.3899494707584381, + "learning_rate": 8.752939038217322e-05, + "loss": 1.7356, + "step": 2212 + }, + { + "epoch": 0.22987431183130777, + "grad_norm": 0.36787334084510803, + "learning_rate": 8.751860687146912e-05, + "loss": 1.7804, + "step": 2213 + }, + { + "epoch": 0.22997818635088813, + "grad_norm": 0.3581325113773346, + "learning_rate": 8.750781936532604e-05, + "loss": 1.825, + "step": 2214 + }, + { + "epoch": 0.23008206087046848, + "grad_norm": 0.3675483465194702, + "learning_rate": 8.749702786489277e-05, + "loss": 1.7208, + "step": 2215 + }, + { + "epoch": 0.23018593539004883, + "grad_norm": 0.38968801498413086, + "learning_rate": 8.748623237131853e-05, + "loss": 1.8447, + "step": 2216 + }, + { + "epoch": 0.23028980990962916, + "grad_norm": 0.3647545576095581, + "learning_rate": 8.747543288575293e-05, + "loss": 1.6169, + "step": 2217 + }, + { + "epoch": 0.23039368442920952, + "grad_norm": 0.3703446388244629, + "learning_rate": 8.746462940934605e-05, + "loss": 1.6316, + "step": 2218 + }, + { + "epoch": 0.23049755894878987, + "grad_norm": 0.3884913921356201, + "learning_rate": 8.745382194324839e-05, + "loss": 1.7359, + "step": 2219 + }, + { + "epoch": 0.2306014334683702, + "grad_norm": 0.39596831798553467, + "learning_rate": 8.744301048861083e-05, + "loss": 1.7425, + "step": 2220 + }, + { + "epoch": 0.23070530798795055, + "grad_norm": 0.41258224844932556, + "learning_rate": 8.743219504658472e-05, + "loss": 1.8832, + "step": 2221 + }, + { + "epoch": 0.2308091825075309, + "grad_norm": 0.3803333342075348, + "learning_rate": 8.742137561832182e-05, + "loss": 1.8591, + "step": 2222 + }, + { + "epoch": 0.23091305702711126, + "grad_norm": 0.3818683326244354, + "learning_rate": 8.741055220497431e-05, + "loss": 1.9393, + "step": 2223 + }, + { + "epoch": 0.2310169315466916, + "grad_norm": 0.3759542405605316, + "learning_rate": 8.739972480769481e-05, + "loss": 1.8439, + "step": 2224 + }, + { + "epoch": 0.23112080606627194, + "grad_norm": 0.3411445617675781, + "learning_rate": 8.738889342763635e-05, + "loss": 1.7198, + "step": 2225 + }, + { + "epoch": 0.2312246805858523, + "grad_norm": 0.3545193076133728, + "learning_rate": 8.737805806595239e-05, + "loss": 1.6465, + "step": 2226 + }, + { + "epoch": 0.23132855510543263, + "grad_norm": 0.3674876093864441, + "learning_rate": 8.73672187237968e-05, + "loss": 1.7859, + "step": 2227 + }, + { + "epoch": 0.23143242962501298, + "grad_norm": 0.3644886612892151, + "learning_rate": 8.735637540232389e-05, + "loss": 1.7008, + "step": 2228 + }, + { + "epoch": 0.23153630414459334, + "grad_norm": 0.3750406503677368, + "learning_rate": 8.734552810268838e-05, + "loss": 1.6847, + "step": 2229 + }, + { + "epoch": 0.2316401786641737, + "grad_norm": 0.3923105299472809, + "learning_rate": 8.733467682604545e-05, + "loss": 1.7455, + "step": 2230 + }, + { + "epoch": 0.23174405318375402, + "grad_norm": 0.3961966931819916, + "learning_rate": 8.732382157355066e-05, + "loss": 1.6726, + "step": 2231 + }, + { + "epoch": 0.23184792770333437, + "grad_norm": 0.4062844514846802, + "learning_rate": 8.731296234636e-05, + "loss": 1.9202, + "step": 2232 + }, + { + "epoch": 0.23195180222291473, + "grad_norm": 0.4073842167854309, + "learning_rate": 8.73020991456299e-05, + "loss": 1.9388, + "step": 2233 + }, + { + "epoch": 0.23205567674249505, + "grad_norm": 0.38866791129112244, + "learning_rate": 8.72912319725172e-05, + "loss": 1.6893, + "step": 2234 + }, + { + "epoch": 0.2321595512620754, + "grad_norm": 0.3830127716064453, + "learning_rate": 8.72803608281792e-05, + "loss": 1.812, + "step": 2235 + }, + { + "epoch": 0.23226342578165576, + "grad_norm": 0.4291350245475769, + "learning_rate": 8.726948571377356e-05, + "loss": 1.8426, + "step": 2236 + }, + { + "epoch": 0.23236730030123612, + "grad_norm": 0.36664703488349915, + "learning_rate": 8.72586066304584e-05, + "loss": 1.6714, + "step": 2237 + }, + { + "epoch": 0.23247117482081645, + "grad_norm": 0.34283211827278137, + "learning_rate": 8.724772357939229e-05, + "loss": 1.7595, + "step": 2238 + }, + { + "epoch": 0.2325750493403968, + "grad_norm": 0.3679027557373047, + "learning_rate": 8.723683656173413e-05, + "loss": 1.6493, + "step": 2239 + }, + { + "epoch": 0.23267892385997715, + "grad_norm": 0.3953079879283905, + "learning_rate": 8.722594557864335e-05, + "loss": 1.7885, + "step": 2240 + }, + { + "epoch": 0.23278279837955748, + "grad_norm": 0.37095022201538086, + "learning_rate": 8.721505063127972e-05, + "loss": 1.6947, + "step": 2241 + }, + { + "epoch": 0.23288667289913784, + "grad_norm": 0.38997137546539307, + "learning_rate": 8.72041517208035e-05, + "loss": 1.8158, + "step": 2242 + }, + { + "epoch": 0.2329905474187182, + "grad_norm": 0.36092621088027954, + "learning_rate": 8.719324884837531e-05, + "loss": 1.7928, + "step": 2243 + }, + { + "epoch": 0.23309442193829855, + "grad_norm": 0.3828462064266205, + "learning_rate": 8.718234201515627e-05, + "loss": 1.7782, + "step": 2244 + }, + { + "epoch": 0.23319829645787887, + "grad_norm": 0.35145506262779236, + "learning_rate": 8.717143122230782e-05, + "loss": 1.6527, + "step": 2245 + }, + { + "epoch": 0.23330217097745923, + "grad_norm": 0.3912096321582794, + "learning_rate": 8.71605164709919e-05, + "loss": 1.8666, + "step": 2246 + }, + { + "epoch": 0.23340604549703958, + "grad_norm": 0.3545803725719452, + "learning_rate": 8.714959776237083e-05, + "loss": 1.7043, + "step": 2247 + }, + { + "epoch": 0.2335099200166199, + "grad_norm": 0.3926384449005127, + "learning_rate": 8.713867509760738e-05, + "loss": 2.007, + "step": 2248 + }, + { + "epoch": 0.23361379453620026, + "grad_norm": 0.3713902533054352, + "learning_rate": 8.712774847786471e-05, + "loss": 1.7151, + "step": 2249 + }, + { + "epoch": 0.23371766905578062, + "grad_norm": 0.39191189408302307, + "learning_rate": 8.711681790430645e-05, + "loss": 1.8144, + "step": 2250 + }, + { + "epoch": 0.23382154357536097, + "grad_norm": 0.40152493119239807, + "learning_rate": 8.710588337809662e-05, + "loss": 1.9487, + "step": 2251 + }, + { + "epoch": 0.2339254180949413, + "grad_norm": 0.3732980489730835, + "learning_rate": 8.709494490039963e-05, + "loss": 1.6245, + "step": 2252 + }, + { + "epoch": 0.23402929261452166, + "grad_norm": 0.3951932191848755, + "learning_rate": 8.708400247238035e-05, + "loss": 1.8078, + "step": 2253 + }, + { + "epoch": 0.234133167134102, + "grad_norm": 0.3723011910915375, + "learning_rate": 8.707305609520408e-05, + "loss": 1.7107, + "step": 2254 + }, + { + "epoch": 0.23423704165368237, + "grad_norm": 0.3788382411003113, + "learning_rate": 8.706210577003653e-05, + "loss": 1.6238, + "step": 2255 + }, + { + "epoch": 0.2343409161732627, + "grad_norm": 0.3792799115180969, + "learning_rate": 8.705115149804381e-05, + "loss": 1.7069, + "step": 2256 + }, + { + "epoch": 0.23444479069284305, + "grad_norm": 0.3782130181789398, + "learning_rate": 8.704019328039244e-05, + "loss": 1.6198, + "step": 2257 + }, + { + "epoch": 0.2345486652124234, + "grad_norm": 0.3798910975456238, + "learning_rate": 8.702923111824943e-05, + "loss": 1.8176, + "step": 2258 + }, + { + "epoch": 0.23465253973200373, + "grad_norm": 0.38398462533950806, + "learning_rate": 8.701826501278216e-05, + "loss": 1.7607, + "step": 2259 + }, + { + "epoch": 0.23475641425158408, + "grad_norm": 0.3783421814441681, + "learning_rate": 8.70072949651584e-05, + "loss": 1.765, + "step": 2260 + }, + { + "epoch": 0.23486028877116444, + "grad_norm": 0.34123924374580383, + "learning_rate": 8.69963209765464e-05, + "loss": 1.6595, + "step": 2261 + }, + { + "epoch": 0.2349641632907448, + "grad_norm": 0.33665284514427185, + "learning_rate": 8.698534304811478e-05, + "loss": 1.4557, + "step": 2262 + }, + { + "epoch": 0.23506803781032512, + "grad_norm": 0.37269553542137146, + "learning_rate": 8.697436118103264e-05, + "loss": 1.8497, + "step": 2263 + }, + { + "epoch": 0.23517191232990547, + "grad_norm": 0.38616353273391724, + "learning_rate": 8.696337537646944e-05, + "loss": 1.7943, + "step": 2264 + }, + { + "epoch": 0.23527578684948583, + "grad_norm": 0.4167322814464569, + "learning_rate": 8.695238563559509e-05, + "loss": 1.9579, + "step": 2265 + }, + { + "epoch": 0.23537966136906616, + "grad_norm": 0.38582777976989746, + "learning_rate": 8.694139195957991e-05, + "loss": 1.7782, + "step": 2266 + }, + { + "epoch": 0.2354835358886465, + "grad_norm": 0.3635927438735962, + "learning_rate": 8.693039434959464e-05, + "loss": 1.7276, + "step": 2267 + }, + { + "epoch": 0.23558741040822687, + "grad_norm": 0.42506465315818787, + "learning_rate": 8.691939280681045e-05, + "loss": 1.818, + "step": 2268 + }, + { + "epoch": 0.23569128492780722, + "grad_norm": 0.38665080070495605, + "learning_rate": 8.69083873323989e-05, + "loss": 1.7603, + "step": 2269 + }, + { + "epoch": 0.23579515944738755, + "grad_norm": 0.3853725790977478, + "learning_rate": 8.689737792753198e-05, + "loss": 1.6039, + "step": 2270 + }, + { + "epoch": 0.2358990339669679, + "grad_norm": 0.37736237049102783, + "learning_rate": 8.688636459338215e-05, + "loss": 1.8416, + "step": 2271 + }, + { + "epoch": 0.23600290848654826, + "grad_norm": 0.4537546634674072, + "learning_rate": 8.68753473311222e-05, + "loss": 1.6987, + "step": 2272 + }, + { + "epoch": 0.23610678300612858, + "grad_norm": 0.3903726041316986, + "learning_rate": 8.686432614192538e-05, + "loss": 1.9472, + "step": 2273 + }, + { + "epoch": 0.23621065752570894, + "grad_norm": 0.41961470246315, + "learning_rate": 8.68533010269654e-05, + "loss": 1.8414, + "step": 2274 + }, + { + "epoch": 0.2363145320452893, + "grad_norm": 0.36322250962257385, + "learning_rate": 8.684227198741633e-05, + "loss": 1.5978, + "step": 2275 + }, + { + "epoch": 0.23641840656486965, + "grad_norm": 0.4004223644733429, + "learning_rate": 8.683123902445267e-05, + "loss": 1.7981, + "step": 2276 + }, + { + "epoch": 0.23652228108444998, + "grad_norm": 0.38842740654945374, + "learning_rate": 8.682020213924935e-05, + "loss": 1.7098, + "step": 2277 + }, + { + "epoch": 0.23662615560403033, + "grad_norm": 0.3592800199985504, + "learning_rate": 8.680916133298171e-05, + "loss": 1.661, + "step": 2278 + }, + { + "epoch": 0.23673003012361069, + "grad_norm": 0.357117623090744, + "learning_rate": 8.67981166068255e-05, + "loss": 1.5888, + "step": 2279 + }, + { + "epoch": 0.236833904643191, + "grad_norm": 0.3849972188472748, + "learning_rate": 8.678706796195692e-05, + "loss": 1.7733, + "step": 2280 + }, + { + "epoch": 0.23693777916277137, + "grad_norm": 0.36867091059684753, + "learning_rate": 8.677601539955256e-05, + "loss": 1.8096, + "step": 2281 + }, + { + "epoch": 0.23704165368235172, + "grad_norm": 0.3629715144634247, + "learning_rate": 8.676495892078941e-05, + "loss": 1.6919, + "step": 2282 + }, + { + "epoch": 0.23714552820193208, + "grad_norm": 0.4076031446456909, + "learning_rate": 8.675389852684492e-05, + "loss": 1.9165, + "step": 2283 + }, + { + "epoch": 0.2372494027215124, + "grad_norm": 0.37175339460372925, + "learning_rate": 8.674283421889691e-05, + "loss": 1.6767, + "step": 2284 + }, + { + "epoch": 0.23735327724109276, + "grad_norm": 0.3934227526187897, + "learning_rate": 8.673176599812368e-05, + "loss": 1.7891, + "step": 2285 + }, + { + "epoch": 0.2374571517606731, + "grad_norm": 0.4014788269996643, + "learning_rate": 8.672069386570389e-05, + "loss": 1.9999, + "step": 2286 + }, + { + "epoch": 0.23756102628025344, + "grad_norm": 0.35122254490852356, + "learning_rate": 8.670961782281664e-05, + "loss": 1.8249, + "step": 2287 + }, + { + "epoch": 0.2376649007998338, + "grad_norm": 0.36434510350227356, + "learning_rate": 8.669853787064142e-05, + "loss": 1.8532, + "step": 2288 + }, + { + "epoch": 0.23776877531941415, + "grad_norm": 0.3808654546737671, + "learning_rate": 8.668745401035818e-05, + "loss": 1.7474, + "step": 2289 + }, + { + "epoch": 0.2378726498389945, + "grad_norm": 0.3531155288219452, + "learning_rate": 8.667636624314725e-05, + "loss": 1.8941, + "step": 2290 + }, + { + "epoch": 0.23797652435857483, + "grad_norm": 0.472049742937088, + "learning_rate": 8.666527457018943e-05, + "loss": 1.985, + "step": 2291 + }, + { + "epoch": 0.23808039887815519, + "grad_norm": 0.39368936419487, + "learning_rate": 8.665417899266586e-05, + "loss": 1.6889, + "step": 2292 + }, + { + "epoch": 0.23818427339773554, + "grad_norm": 0.3707980811595917, + "learning_rate": 8.664307951175814e-05, + "loss": 1.5088, + "step": 2293 + }, + { + "epoch": 0.2382881479173159, + "grad_norm": 0.3548358380794525, + "learning_rate": 8.663197612864827e-05, + "loss": 1.7288, + "step": 2294 + }, + { + "epoch": 0.23839202243689622, + "grad_norm": 0.47809383273124695, + "learning_rate": 8.662086884451869e-05, + "loss": 1.8472, + "step": 2295 + }, + { + "epoch": 0.23849589695647658, + "grad_norm": 0.4353974163532257, + "learning_rate": 8.660975766055224e-05, + "loss": 1.8897, + "step": 2296 + }, + { + "epoch": 0.23859977147605693, + "grad_norm": 0.4466110169887543, + "learning_rate": 8.659864257793215e-05, + "loss": 1.9391, + "step": 2297 + }, + { + "epoch": 0.23870364599563726, + "grad_norm": 0.39143791794776917, + "learning_rate": 8.65875235978421e-05, + "loss": 1.7464, + "step": 2298 + }, + { + "epoch": 0.23880752051521761, + "grad_norm": 0.38728442788124084, + "learning_rate": 8.65764007214662e-05, + "loss": 1.7393, + "step": 2299 + }, + { + "epoch": 0.23891139503479797, + "grad_norm": 0.3674345910549164, + "learning_rate": 8.656527394998892e-05, + "loss": 1.6336, + "step": 2300 + }, + { + "epoch": 0.23901526955437832, + "grad_norm": 0.38445186614990234, + "learning_rate": 8.655414328459519e-05, + "loss": 1.8982, + "step": 2301 + }, + { + "epoch": 0.23911914407395865, + "grad_norm": 0.38056597113609314, + "learning_rate": 8.654300872647033e-05, + "loss": 1.6851, + "step": 2302 + }, + { + "epoch": 0.239223018593539, + "grad_norm": 0.4110845625400543, + "learning_rate": 8.65318702768001e-05, + "loss": 1.8767, + "step": 2303 + }, + { + "epoch": 0.23932689311311936, + "grad_norm": 0.38014286756515503, + "learning_rate": 8.652072793677061e-05, + "loss": 1.725, + "step": 2304 + }, + { + "epoch": 0.2394307676326997, + "grad_norm": 0.39622190594673157, + "learning_rate": 8.650958170756852e-05, + "loss": 1.9115, + "step": 2305 + }, + { + "epoch": 0.23953464215228004, + "grad_norm": 0.36890217661857605, + "learning_rate": 8.649843159038071e-05, + "loss": 1.698, + "step": 2306 + }, + { + "epoch": 0.2396385166718604, + "grad_norm": 0.38787323236465454, + "learning_rate": 8.648727758639467e-05, + "loss": 1.8998, + "step": 2307 + }, + { + "epoch": 0.23974239119144075, + "grad_norm": 0.38866347074508667, + "learning_rate": 8.647611969679816e-05, + "loss": 1.7095, + "step": 2308 + }, + { + "epoch": 0.23984626571102108, + "grad_norm": 0.39878830313682556, + "learning_rate": 8.646495792277943e-05, + "loss": 1.9387, + "step": 2309 + }, + { + "epoch": 0.23995014023060143, + "grad_norm": 0.3923484981060028, + "learning_rate": 8.645379226552712e-05, + "loss": 1.8302, + "step": 2310 + }, + { + "epoch": 0.2400540147501818, + "grad_norm": 0.3727096617221832, + "learning_rate": 8.644262272623029e-05, + "loss": 1.6717, + "step": 2311 + }, + { + "epoch": 0.24015788926976211, + "grad_norm": 0.3535787761211395, + "learning_rate": 8.64314493060784e-05, + "loss": 1.6446, + "step": 2312 + }, + { + "epoch": 0.24026176378934247, + "grad_norm": 0.3555033504962921, + "learning_rate": 8.642027200626135e-05, + "loss": 1.7145, + "step": 2313 + }, + { + "epoch": 0.24036563830892282, + "grad_norm": 0.3748113512992859, + "learning_rate": 8.640909082796939e-05, + "loss": 1.7144, + "step": 2314 + }, + { + "epoch": 0.24046951282850318, + "grad_norm": 0.37921851873397827, + "learning_rate": 8.639790577239328e-05, + "loss": 1.8516, + "step": 2315 + }, + { + "epoch": 0.2405733873480835, + "grad_norm": 0.36275529861450195, + "learning_rate": 8.638671684072412e-05, + "loss": 1.6144, + "step": 2316 + }, + { + "epoch": 0.24067726186766386, + "grad_norm": 0.35582250356674194, + "learning_rate": 8.637552403415343e-05, + "loss": 1.7601, + "step": 2317 + }, + { + "epoch": 0.24078113638724422, + "grad_norm": 0.3954647183418274, + "learning_rate": 8.636432735387319e-05, + "loss": 1.798, + "step": 2318 + }, + { + "epoch": 0.24088501090682454, + "grad_norm": 0.36659133434295654, + "learning_rate": 8.635312680107572e-05, + "loss": 1.551, + "step": 2319 + }, + { + "epoch": 0.2409888854264049, + "grad_norm": 0.35685068368911743, + "learning_rate": 8.634192237695382e-05, + "loss": 1.6926, + "step": 2320 + }, + { + "epoch": 0.24109275994598525, + "grad_norm": 0.35937169194221497, + "learning_rate": 8.633071408270065e-05, + "loss": 1.604, + "step": 2321 + }, + { + "epoch": 0.2411966344655656, + "grad_norm": 0.44768527150154114, + "learning_rate": 8.631950191950983e-05, + "loss": 2.0036, + "step": 2322 + }, + { + "epoch": 0.24130050898514593, + "grad_norm": 0.37177640199661255, + "learning_rate": 8.630828588857537e-05, + "loss": 1.7145, + "step": 2323 + }, + { + "epoch": 0.2414043835047263, + "grad_norm": 0.38756901025772095, + "learning_rate": 8.629706599109169e-05, + "loss": 1.7844, + "step": 2324 + }, + { + "epoch": 0.24150825802430664, + "grad_norm": 0.37525761127471924, + "learning_rate": 8.628584222825357e-05, + "loss": 1.586, + "step": 2325 + }, + { + "epoch": 0.241612132543887, + "grad_norm": 0.4448417127132416, + "learning_rate": 8.627461460125632e-05, + "loss": 1.6915, + "step": 2326 + }, + { + "epoch": 0.24171600706346733, + "grad_norm": 0.39078620076179504, + "learning_rate": 8.626338311129557e-05, + "loss": 1.7567, + "step": 2327 + }, + { + "epoch": 0.24181988158304768, + "grad_norm": 0.38780951499938965, + "learning_rate": 8.625214775956737e-05, + "loss": 1.6969, + "step": 2328 + }, + { + "epoch": 0.24192375610262803, + "grad_norm": 0.3700442314147949, + "learning_rate": 8.624090854726822e-05, + "loss": 1.6198, + "step": 2329 + }, + { + "epoch": 0.24202763062220836, + "grad_norm": 0.37854552268981934, + "learning_rate": 8.622966547559499e-05, + "loss": 1.7932, + "step": 2330 + }, + { + "epoch": 0.24213150514178872, + "grad_norm": 0.49389779567718506, + "learning_rate": 8.621841854574501e-05, + "loss": 1.8561, + "step": 2331 + }, + { + "epoch": 0.24223537966136907, + "grad_norm": 0.35576608777046204, + "learning_rate": 8.620716775891595e-05, + "loss": 1.5224, + "step": 2332 + }, + { + "epoch": 0.24233925418094943, + "grad_norm": 0.37670719623565674, + "learning_rate": 8.619591311630595e-05, + "loss": 1.8392, + "step": 2333 + }, + { + "epoch": 0.24244312870052975, + "grad_norm": 0.3652971386909485, + "learning_rate": 8.618465461911355e-05, + "loss": 1.7624, + "step": 2334 + }, + { + "epoch": 0.2425470032201101, + "grad_norm": 0.3692745268344879, + "learning_rate": 8.617339226853768e-05, + "loss": 1.6901, + "step": 2335 + }, + { + "epoch": 0.24265087773969046, + "grad_norm": 0.36297428607940674, + "learning_rate": 8.61621260657777e-05, + "loss": 1.8461, + "step": 2336 + }, + { + "epoch": 0.2427547522592708, + "grad_norm": 0.37688395380973816, + "learning_rate": 8.615085601203337e-05, + "loss": 1.6562, + "step": 2337 + }, + { + "epoch": 0.24285862677885114, + "grad_norm": 0.3952527642250061, + "learning_rate": 8.613958210850485e-05, + "loss": 1.7862, + "step": 2338 + }, + { + "epoch": 0.2429625012984315, + "grad_norm": 0.35479936003685, + "learning_rate": 8.612830435639275e-05, + "loss": 1.9233, + "step": 2339 + }, + { + "epoch": 0.24306637581801185, + "grad_norm": 0.40649887919425964, + "learning_rate": 8.611702275689805e-05, + "loss": 1.8637, + "step": 2340 + }, + { + "epoch": 0.24317025033759218, + "grad_norm": 0.36132219433784485, + "learning_rate": 8.610573731122214e-05, + "loss": 1.7277, + "step": 2341 + }, + { + "epoch": 0.24327412485717254, + "grad_norm": 0.35456719994544983, + "learning_rate": 8.609444802056686e-05, + "loss": 1.7141, + "step": 2342 + }, + { + "epoch": 0.2433779993767529, + "grad_norm": 0.38762810826301575, + "learning_rate": 8.608315488613439e-05, + "loss": 1.7315, + "step": 2343 + }, + { + "epoch": 0.24348187389633322, + "grad_norm": 0.3507632911205292, + "learning_rate": 8.607185790912739e-05, + "loss": 1.712, + "step": 2344 + }, + { + "epoch": 0.24358574841591357, + "grad_norm": 0.39255252480506897, + "learning_rate": 8.60605570907489e-05, + "loss": 1.7046, + "step": 2345 + }, + { + "epoch": 0.24368962293549393, + "grad_norm": 0.35092976689338684, + "learning_rate": 8.604925243220235e-05, + "loss": 1.8009, + "step": 2346 + }, + { + "epoch": 0.24379349745507428, + "grad_norm": 0.37961629033088684, + "learning_rate": 8.603794393469162e-05, + "loss": 1.8602, + "step": 2347 + }, + { + "epoch": 0.2438973719746546, + "grad_norm": 0.38758939504623413, + "learning_rate": 8.602663159942098e-05, + "loss": 1.8524, + "step": 2348 + }, + { + "epoch": 0.24400124649423496, + "grad_norm": 0.3866504430770874, + "learning_rate": 8.601531542759506e-05, + "loss": 1.722, + "step": 2349 + }, + { + "epoch": 0.24410512101381532, + "grad_norm": 0.3876989483833313, + "learning_rate": 8.600399542041901e-05, + "loss": 1.7846, + "step": 2350 + }, + { + "epoch": 0.24420899553339565, + "grad_norm": 0.3649909198284149, + "learning_rate": 8.599267157909827e-05, + "loss": 1.7255, + "step": 2351 + }, + { + "epoch": 0.244312870052976, + "grad_norm": 0.3867836892604828, + "learning_rate": 8.598134390483879e-05, + "loss": 1.8129, + "step": 2352 + }, + { + "epoch": 0.24441674457255635, + "grad_norm": 0.40322503447532654, + "learning_rate": 8.59700123988468e-05, + "loss": 2.0753, + "step": 2353 + }, + { + "epoch": 0.2445206190921367, + "grad_norm": 0.37695807218551636, + "learning_rate": 8.595867706232911e-05, + "loss": 1.8298, + "step": 2354 + }, + { + "epoch": 0.24462449361171704, + "grad_norm": 0.3737477958202362, + "learning_rate": 8.594733789649279e-05, + "loss": 1.7315, + "step": 2355 + }, + { + "epoch": 0.2447283681312974, + "grad_norm": 0.3679026663303375, + "learning_rate": 8.593599490254538e-05, + "loss": 1.6989, + "step": 2356 + }, + { + "epoch": 0.24483224265087775, + "grad_norm": 0.3665264844894409, + "learning_rate": 8.592464808169482e-05, + "loss": 1.7326, + "step": 2357 + }, + { + "epoch": 0.24493611717045807, + "grad_norm": 0.4071952700614929, + "learning_rate": 8.591329743514947e-05, + "loss": 1.774, + "step": 2358 + }, + { + "epoch": 0.24503999169003843, + "grad_norm": 0.4158788025379181, + "learning_rate": 8.590194296411806e-05, + "loss": 2.0193, + "step": 2359 + }, + { + "epoch": 0.24514386620961878, + "grad_norm": 0.3840784430503845, + "learning_rate": 8.589058466980979e-05, + "loss": 1.6699, + "step": 2360 + }, + { + "epoch": 0.24524774072919914, + "grad_norm": 0.36834099888801575, + "learning_rate": 8.587922255343421e-05, + "loss": 1.6759, + "step": 2361 + }, + { + "epoch": 0.24535161524877946, + "grad_norm": 0.35662734508514404, + "learning_rate": 8.58678566162013e-05, + "loss": 1.4792, + "step": 2362 + }, + { + "epoch": 0.24545548976835982, + "grad_norm": 0.369302362203598, + "learning_rate": 8.585648685932141e-05, + "loss": 1.7362, + "step": 2363 + }, + { + "epoch": 0.24555936428794017, + "grad_norm": 0.38440340757369995, + "learning_rate": 8.58451132840054e-05, + "loss": 1.8942, + "step": 2364 + }, + { + "epoch": 0.24566323880752053, + "grad_norm": 0.3850240707397461, + "learning_rate": 8.58337358914644e-05, + "loss": 1.8449, + "step": 2365 + }, + { + "epoch": 0.24576711332710086, + "grad_norm": 0.393264502286911, + "learning_rate": 8.582235468291007e-05, + "loss": 1.88, + "step": 2366 + }, + { + "epoch": 0.2458709878466812, + "grad_norm": 0.40909549593925476, + "learning_rate": 8.581096965955436e-05, + "loss": 1.8228, + "step": 2367 + }, + { + "epoch": 0.24597486236626157, + "grad_norm": 0.36390751600265503, + "learning_rate": 8.579958082260973e-05, + "loss": 1.7457, + "step": 2368 + }, + { + "epoch": 0.2460787368858419, + "grad_norm": 0.3853476047515869, + "learning_rate": 8.5788188173289e-05, + "loss": 1.8158, + "step": 2369 + }, + { + "epoch": 0.24618261140542225, + "grad_norm": 0.37678855657577515, + "learning_rate": 8.577679171280537e-05, + "loss": 1.8368, + "step": 2370 + }, + { + "epoch": 0.2462864859250026, + "grad_norm": 0.3758280277252197, + "learning_rate": 8.57653914423725e-05, + "loss": 1.6665, + "step": 2371 + }, + { + "epoch": 0.24639036044458296, + "grad_norm": 0.38738977909088135, + "learning_rate": 8.575398736320442e-05, + "loss": 1.8557, + "step": 2372 + }, + { + "epoch": 0.24649423496416328, + "grad_norm": 0.35604584217071533, + "learning_rate": 8.574257947651558e-05, + "loss": 1.6039, + "step": 2373 + }, + { + "epoch": 0.24659810948374364, + "grad_norm": 0.3709351420402527, + "learning_rate": 8.573116778352084e-05, + "loss": 1.85, + "step": 2374 + }, + { + "epoch": 0.246701984003324, + "grad_norm": 0.37650248408317566, + "learning_rate": 8.571975228543543e-05, + "loss": 1.8249, + "step": 2375 + }, + { + "epoch": 0.24680585852290432, + "grad_norm": 0.3389437794685364, + "learning_rate": 8.570833298347502e-05, + "loss": 1.7355, + "step": 2376 + }, + { + "epoch": 0.24690973304248467, + "grad_norm": 0.3376055061817169, + "learning_rate": 8.56969098788557e-05, + "loss": 1.5235, + "step": 2377 + }, + { + "epoch": 0.24701360756206503, + "grad_norm": 0.3974694013595581, + "learning_rate": 8.568548297279392e-05, + "loss": 1.8553, + "step": 2378 + }, + { + "epoch": 0.24711748208164538, + "grad_norm": 0.35596492886543274, + "learning_rate": 8.567405226650656e-05, + "loss": 1.4544, + "step": 2379 + }, + { + "epoch": 0.2472213566012257, + "grad_norm": 0.37058472633361816, + "learning_rate": 8.56626177612109e-05, + "loss": 1.7564, + "step": 2380 + }, + { + "epoch": 0.24732523112080607, + "grad_norm": 0.3887064456939697, + "learning_rate": 8.565117945812463e-05, + "loss": 1.8172, + "step": 2381 + }, + { + "epoch": 0.24742910564038642, + "grad_norm": 0.3546907901763916, + "learning_rate": 8.563973735846583e-05, + "loss": 1.6763, + "step": 2382 + }, + { + "epoch": 0.24753298015996675, + "grad_norm": 0.38635915517807007, + "learning_rate": 8.562829146345301e-05, + "loss": 1.7017, + "step": 2383 + }, + { + "epoch": 0.2476368546795471, + "grad_norm": 0.3599991500377655, + "learning_rate": 8.561684177430507e-05, + "loss": 1.6087, + "step": 2384 + }, + { + "epoch": 0.24774072919912746, + "grad_norm": 0.3766249120235443, + "learning_rate": 8.560538829224129e-05, + "loss": 1.6795, + "step": 2385 + }, + { + "epoch": 0.2478446037187078, + "grad_norm": 0.38669300079345703, + "learning_rate": 8.559393101848139e-05, + "loss": 1.7984, + "step": 2386 + }, + { + "epoch": 0.24794847823828814, + "grad_norm": 0.35774049162864685, + "learning_rate": 8.558246995424548e-05, + "loss": 1.5587, + "step": 2387 + }, + { + "epoch": 0.2480523527578685, + "grad_norm": 0.3456031382083893, + "learning_rate": 8.557100510075406e-05, + "loss": 1.6269, + "step": 2388 + }, + { + "epoch": 0.24815622727744885, + "grad_norm": 0.42325082421302795, + "learning_rate": 8.555953645922809e-05, + "loss": 2.0851, + "step": 2389 + }, + { + "epoch": 0.24826010179702918, + "grad_norm": 0.32227158546447754, + "learning_rate": 8.554806403088884e-05, + "loss": 1.5321, + "step": 2390 + }, + { + "epoch": 0.24836397631660953, + "grad_norm": 0.35737884044647217, + "learning_rate": 8.553658781695807e-05, + "loss": 1.7002, + "step": 2391 + }, + { + "epoch": 0.24846785083618989, + "grad_norm": 0.3779980540275574, + "learning_rate": 8.552510781865788e-05, + "loss": 1.74, + "step": 2392 + }, + { + "epoch": 0.24857172535577024, + "grad_norm": 0.3488198518753052, + "learning_rate": 8.551362403721084e-05, + "loss": 1.7332, + "step": 2393 + }, + { + "epoch": 0.24867559987535057, + "grad_norm": 0.3871583342552185, + "learning_rate": 8.550213647383982e-05, + "loss": 1.8173, + "step": 2394 + }, + { + "epoch": 0.24877947439493092, + "grad_norm": 0.3559076488018036, + "learning_rate": 8.549064512976822e-05, + "loss": 1.824, + "step": 2395 + }, + { + "epoch": 0.24888334891451128, + "grad_norm": 0.36714842915534973, + "learning_rate": 8.547915000621974e-05, + "loss": 1.7691, + "step": 2396 + }, + { + "epoch": 0.2489872234340916, + "grad_norm": 0.35242655873298645, + "learning_rate": 8.546765110441855e-05, + "loss": 1.7228, + "step": 2397 + }, + { + "epoch": 0.24909109795367196, + "grad_norm": 0.3446803689002991, + "learning_rate": 8.545614842558915e-05, + "loss": 1.613, + "step": 2398 + }, + { + "epoch": 0.2491949724732523, + "grad_norm": 0.36509135365486145, + "learning_rate": 8.544464197095651e-05, + "loss": 1.6443, + "step": 2399 + }, + { + "epoch": 0.24929884699283267, + "grad_norm": 0.3790837228298187, + "learning_rate": 8.5433131741746e-05, + "loss": 1.7364, + "step": 2400 + }, + { + "epoch": 0.249402721512413, + "grad_norm": 0.4487643837928772, + "learning_rate": 8.542161773918334e-05, + "loss": 1.8304, + "step": 2401 + }, + { + "epoch": 0.24950659603199335, + "grad_norm": 0.3896706700325012, + "learning_rate": 8.54100999644947e-05, + "loss": 1.7448, + "step": 2402 + }, + { + "epoch": 0.2496104705515737, + "grad_norm": 0.382664293050766, + "learning_rate": 8.53985784189066e-05, + "loss": 1.7384, + "step": 2403 + }, + { + "epoch": 0.24971434507115406, + "grad_norm": 0.3803345561027527, + "learning_rate": 8.538705310364603e-05, + "loss": 1.8425, + "step": 2404 + }, + { + "epoch": 0.24981821959073439, + "grad_norm": 0.37627753615379333, + "learning_rate": 8.537552401994034e-05, + "loss": 1.7596, + "step": 2405 + }, + { + "epoch": 0.24992209411031474, + "grad_norm": 0.3746028542518616, + "learning_rate": 8.536399116901728e-05, + "loss": 1.8014, + "step": 2406 + }, + { + "epoch": 0.25002596862989507, + "grad_norm": 0.3502449095249176, + "learning_rate": 8.535245455210501e-05, + "loss": 1.5849, + "step": 2407 + }, + { + "epoch": 0.25012984314947545, + "grad_norm": 0.40077903866767883, + "learning_rate": 8.534091417043208e-05, + "loss": 1.9111, + "step": 2408 + }, + { + "epoch": 0.2502337176690558, + "grad_norm": 0.3578510284423828, + "learning_rate": 8.532937002522747e-05, + "loss": 1.6662, + "step": 2409 + }, + { + "epoch": 0.2503375921886361, + "grad_norm": 0.38965925574302673, + "learning_rate": 8.531782211772052e-05, + "loss": 1.5775, + "step": 2410 + }, + { + "epoch": 0.2504414667082165, + "grad_norm": 0.35679998993873596, + "learning_rate": 8.530627044914101e-05, + "loss": 1.796, + "step": 2411 + }, + { + "epoch": 0.2505453412277968, + "grad_norm": 0.3703193962574005, + "learning_rate": 8.52947150207191e-05, + "loss": 1.8143, + "step": 2412 + }, + { + "epoch": 0.25064921574737714, + "grad_norm": 0.3648841977119446, + "learning_rate": 8.528315583368536e-05, + "loss": 1.7078, + "step": 2413 + }, + { + "epoch": 0.2507530902669575, + "grad_norm": 0.38748499751091003, + "learning_rate": 8.527159288927074e-05, + "loss": 1.876, + "step": 2414 + }, + { + "epoch": 0.25085696478653785, + "grad_norm": 0.37417733669281006, + "learning_rate": 8.526002618870659e-05, + "loss": 1.646, + "step": 2415 + }, + { + "epoch": 0.25096083930611823, + "grad_norm": 0.3735947012901306, + "learning_rate": 8.52484557332247e-05, + "loss": 1.8018, + "step": 2416 + }, + { + "epoch": 0.25106471382569856, + "grad_norm": 0.36857619881629944, + "learning_rate": 8.523688152405722e-05, + "loss": 1.6909, + "step": 2417 + }, + { + "epoch": 0.2511685883452789, + "grad_norm": 0.41932663321495056, + "learning_rate": 8.522530356243671e-05, + "loss": 1.6112, + "step": 2418 + }, + { + "epoch": 0.25127246286485927, + "grad_norm": 0.4023738503456116, + "learning_rate": 8.521372184959615e-05, + "loss": 1.9221, + "step": 2419 + }, + { + "epoch": 0.2513763373844396, + "grad_norm": 0.35625362396240234, + "learning_rate": 8.520213638676889e-05, + "loss": 1.6999, + "step": 2420 + }, + { + "epoch": 0.2514802119040199, + "grad_norm": 0.36791056394577026, + "learning_rate": 8.519054717518868e-05, + "loss": 1.8009, + "step": 2421 + }, + { + "epoch": 0.2515840864236003, + "grad_norm": 0.3679400086402893, + "learning_rate": 8.517895421608972e-05, + "loss": 1.6935, + "step": 2422 + }, + { + "epoch": 0.25168796094318063, + "grad_norm": 0.36350300908088684, + "learning_rate": 8.516735751070652e-05, + "loss": 1.7611, + "step": 2423 + }, + { + "epoch": 0.25179183546276096, + "grad_norm": 0.36680689454078674, + "learning_rate": 8.515575706027406e-05, + "loss": 1.7451, + "step": 2424 + }, + { + "epoch": 0.25189570998234134, + "grad_norm": 0.3586255609989166, + "learning_rate": 8.514415286602771e-05, + "loss": 1.6652, + "step": 2425 + }, + { + "epoch": 0.25199958450192167, + "grad_norm": 0.380088210105896, + "learning_rate": 8.51325449292032e-05, + "loss": 1.7082, + "step": 2426 + }, + { + "epoch": 0.25210345902150205, + "grad_norm": 0.3451653718948364, + "learning_rate": 8.512093325103671e-05, + "loss": 1.6654, + "step": 2427 + }, + { + "epoch": 0.2522073335410824, + "grad_norm": 0.42833617329597473, + "learning_rate": 8.510931783276477e-05, + "loss": 1.9137, + "step": 2428 + }, + { + "epoch": 0.2523112080606627, + "grad_norm": 0.42482277750968933, + "learning_rate": 8.509769867562436e-05, + "loss": 2.0342, + "step": 2429 + }, + { + "epoch": 0.2524150825802431, + "grad_norm": 0.3890083134174347, + "learning_rate": 8.508607578085282e-05, + "loss": 1.7393, + "step": 2430 + }, + { + "epoch": 0.2525189570998234, + "grad_norm": 0.37014102935791016, + "learning_rate": 8.507444914968788e-05, + "loss": 1.8187, + "step": 2431 + }, + { + "epoch": 0.25262283161940374, + "grad_norm": 0.370259553194046, + "learning_rate": 8.506281878336768e-05, + "loss": 1.733, + "step": 2432 + }, + { + "epoch": 0.2527267061389841, + "grad_norm": 0.37391120195388794, + "learning_rate": 8.50511846831308e-05, + "loss": 1.691, + "step": 2433 + }, + { + "epoch": 0.25283058065856445, + "grad_norm": 0.3874087631702423, + "learning_rate": 8.503954685021616e-05, + "loss": 1.7962, + "step": 2434 + }, + { + "epoch": 0.2529344551781448, + "grad_norm": 0.3736341893672943, + "learning_rate": 8.50279052858631e-05, + "loss": 1.8226, + "step": 2435 + }, + { + "epoch": 0.25303832969772516, + "grad_norm": 0.37461403012275696, + "learning_rate": 8.501625999131134e-05, + "loss": 1.8345, + "step": 2436 + }, + { + "epoch": 0.2531422042173055, + "grad_norm": 0.39281967282295227, + "learning_rate": 8.500461096780105e-05, + "loss": 1.6779, + "step": 2437 + }, + { + "epoch": 0.2532460787368858, + "grad_norm": 0.4174926280975342, + "learning_rate": 8.499295821657273e-05, + "loss": 1.8124, + "step": 2438 + }, + { + "epoch": 0.2533499532564662, + "grad_norm": 0.4330326318740845, + "learning_rate": 8.498130173886731e-05, + "loss": 1.9252, + "step": 2439 + }, + { + "epoch": 0.2534538277760465, + "grad_norm": 0.357661634683609, + "learning_rate": 8.496964153592613e-05, + "loss": 1.7423, + "step": 2440 + }, + { + "epoch": 0.2535577022956269, + "grad_norm": 0.3977978527545929, + "learning_rate": 8.495797760899088e-05, + "loss": 1.4438, + "step": 2441 + }, + { + "epoch": 0.25366157681520723, + "grad_norm": 0.4523005783557892, + "learning_rate": 8.494630995930372e-05, + "loss": 1.5389, + "step": 2442 + }, + { + "epoch": 0.25376545133478756, + "grad_norm": 0.3693365156650543, + "learning_rate": 8.493463858810713e-05, + "loss": 1.8215, + "step": 2443 + }, + { + "epoch": 0.25386932585436794, + "grad_norm": 0.40159082412719727, + "learning_rate": 8.492296349664401e-05, + "loss": 1.8385, + "step": 2444 + }, + { + "epoch": 0.25397320037394827, + "grad_norm": 0.3591073751449585, + "learning_rate": 8.491128468615772e-05, + "loss": 1.642, + "step": 2445 + }, + { + "epoch": 0.2540770748935286, + "grad_norm": 0.3740187883377075, + "learning_rate": 8.489960215789192e-05, + "loss": 1.8311, + "step": 2446 + }, + { + "epoch": 0.254180949413109, + "grad_norm": 0.38675010204315186, + "learning_rate": 8.488791591309072e-05, + "loss": 1.7373, + "step": 2447 + }, + { + "epoch": 0.2542848239326893, + "grad_norm": 0.34369611740112305, + "learning_rate": 8.48762259529986e-05, + "loss": 1.7054, + "step": 2448 + }, + { + "epoch": 0.25438869845226963, + "grad_norm": 0.3643989861011505, + "learning_rate": 8.486453227886049e-05, + "loss": 1.6371, + "step": 2449 + }, + { + "epoch": 0.25449257297185, + "grad_norm": 0.38395020365715027, + "learning_rate": 8.485283489192162e-05, + "loss": 1.5869, + "step": 2450 + }, + { + "epoch": 0.25459644749143034, + "grad_norm": 0.365596741437912, + "learning_rate": 8.48411337934277e-05, + "loss": 1.5886, + "step": 2451 + }, + { + "epoch": 0.25470032201101067, + "grad_norm": 0.3677592873573303, + "learning_rate": 8.48294289846248e-05, + "loss": 1.8044, + "step": 2452 + }, + { + "epoch": 0.25480419653059105, + "grad_norm": 0.39587604999542236, + "learning_rate": 8.48177204667594e-05, + "loss": 1.8413, + "step": 2453 + }, + { + "epoch": 0.2549080710501714, + "grad_norm": 0.3834674656391144, + "learning_rate": 8.480600824107837e-05, + "loss": 1.6651, + "step": 2454 + }, + { + "epoch": 0.25501194556975176, + "grad_norm": 0.38077157735824585, + "learning_rate": 8.479429230882893e-05, + "loss": 1.558, + "step": 2455 + }, + { + "epoch": 0.2551158200893321, + "grad_norm": 0.4156430959701538, + "learning_rate": 8.478257267125878e-05, + "loss": 1.7901, + "step": 2456 + }, + { + "epoch": 0.2552196946089124, + "grad_norm": 0.3814376890659332, + "learning_rate": 8.477084932961596e-05, + "loss": 1.6559, + "step": 2457 + }, + { + "epoch": 0.2553235691284928, + "grad_norm": 0.3800637722015381, + "learning_rate": 8.475912228514889e-05, + "loss": 1.7842, + "step": 2458 + }, + { + "epoch": 0.2554274436480731, + "grad_norm": 0.36574795842170715, + "learning_rate": 8.474739153910646e-05, + "loss": 1.6281, + "step": 2459 + }, + { + "epoch": 0.25553131816765345, + "grad_norm": 0.3729517161846161, + "learning_rate": 8.473565709273785e-05, + "loss": 1.8031, + "step": 2460 + }, + { + "epoch": 0.25563519268723384, + "grad_norm": 0.4632837176322937, + "learning_rate": 8.47239189472927e-05, + "loss": 1.9434, + "step": 2461 + }, + { + "epoch": 0.25573906720681416, + "grad_norm": 0.3544566333293915, + "learning_rate": 8.471217710402106e-05, + "loss": 1.8198, + "step": 2462 + }, + { + "epoch": 0.2558429417263945, + "grad_norm": 0.3790525794029236, + "learning_rate": 8.470043156417333e-05, + "loss": 1.8017, + "step": 2463 + }, + { + "epoch": 0.2559468162459749, + "grad_norm": 0.3937493562698364, + "learning_rate": 8.46886823290003e-05, + "loss": 1.7512, + "step": 2464 + }, + { + "epoch": 0.2560506907655552, + "grad_norm": 0.4141665995121002, + "learning_rate": 8.467692939975316e-05, + "loss": 1.9163, + "step": 2465 + }, + { + "epoch": 0.2561545652851356, + "grad_norm": 0.4007442891597748, + "learning_rate": 8.466517277768355e-05, + "loss": 1.9303, + "step": 2466 + }, + { + "epoch": 0.2562584398047159, + "grad_norm": 0.36880823969841003, + "learning_rate": 8.465341246404345e-05, + "loss": 1.6991, + "step": 2467 + }, + { + "epoch": 0.25636231432429624, + "grad_norm": 0.3562661409378052, + "learning_rate": 8.464164846008522e-05, + "loss": 1.7796, + "step": 2468 + }, + { + "epoch": 0.2564661888438766, + "grad_norm": 0.3477732241153717, + "learning_rate": 8.462988076706164e-05, + "loss": 1.7226, + "step": 2469 + }, + { + "epoch": 0.25657006336345695, + "grad_norm": 0.35839709639549255, + "learning_rate": 8.461810938622589e-05, + "loss": 1.7105, + "step": 2470 + }, + { + "epoch": 0.2566739378830373, + "grad_norm": 0.39441442489624023, + "learning_rate": 8.460633431883151e-05, + "loss": 1.6964, + "step": 2471 + }, + { + "epoch": 0.25677781240261766, + "grad_norm": 0.41418394446372986, + "learning_rate": 8.459455556613247e-05, + "loss": 1.7889, + "step": 2472 + }, + { + "epoch": 0.256881686922198, + "grad_norm": 0.3982704281806946, + "learning_rate": 8.458277312938312e-05, + "loss": 2.066, + "step": 2473 + }, + { + "epoch": 0.2569855614417783, + "grad_norm": 0.38881179690361023, + "learning_rate": 8.457098700983818e-05, + "loss": 1.8605, + "step": 2474 + }, + { + "epoch": 0.2570894359613587, + "grad_norm": 0.39533254504203796, + "learning_rate": 8.455919720875279e-05, + "loss": 1.8728, + "step": 2475 + }, + { + "epoch": 0.257193310480939, + "grad_norm": 0.4153515100479126, + "learning_rate": 8.454740372738246e-05, + "loss": 1.6178, + "step": 2476 + }, + { + "epoch": 0.25729718500051935, + "grad_norm": 0.35797804594039917, + "learning_rate": 8.453560656698311e-05, + "loss": 1.5962, + "step": 2477 + }, + { + "epoch": 0.25740105952009973, + "grad_norm": 0.3884628415107727, + "learning_rate": 8.452380572881107e-05, + "loss": 1.6725, + "step": 2478 + }, + { + "epoch": 0.25750493403968006, + "grad_norm": 0.4121970534324646, + "learning_rate": 8.451200121412299e-05, + "loss": 1.731, + "step": 2479 + }, + { + "epoch": 0.25760880855926044, + "grad_norm": 0.3405035734176636, + "learning_rate": 8.4500193024176e-05, + "loss": 1.5419, + "step": 2480 + }, + { + "epoch": 0.25771268307884077, + "grad_norm": 0.38876980543136597, + "learning_rate": 8.448838116022758e-05, + "loss": 1.6488, + "step": 2481 + }, + { + "epoch": 0.2578165575984211, + "grad_norm": 0.38169679045677185, + "learning_rate": 8.447656562353557e-05, + "loss": 1.8325, + "step": 2482 + }, + { + "epoch": 0.2579204321180015, + "grad_norm": 0.36781740188598633, + "learning_rate": 8.446474641535824e-05, + "loss": 1.5927, + "step": 2483 + }, + { + "epoch": 0.2580243066375818, + "grad_norm": 0.39887794852256775, + "learning_rate": 8.445292353695427e-05, + "loss": 1.7471, + "step": 2484 + }, + { + "epoch": 0.25812818115716213, + "grad_norm": 0.381283164024353, + "learning_rate": 8.444109698958267e-05, + "loss": 1.7668, + "step": 2485 + }, + { + "epoch": 0.2582320556767425, + "grad_norm": 0.38031065464019775, + "learning_rate": 8.44292667745029e-05, + "loss": 1.7104, + "step": 2486 + }, + { + "epoch": 0.25833593019632284, + "grad_norm": 0.36943575739860535, + "learning_rate": 8.441743289297476e-05, + "loss": 1.8359, + "step": 2487 + }, + { + "epoch": 0.25843980471590317, + "grad_norm": 0.3790445625782013, + "learning_rate": 8.440559534625851e-05, + "loss": 1.6393, + "step": 2488 + }, + { + "epoch": 0.25854367923548355, + "grad_norm": 0.3714222013950348, + "learning_rate": 8.439375413561472e-05, + "loss": 1.6612, + "step": 2489 + }, + { + "epoch": 0.2586475537550639, + "grad_norm": 0.38887256383895874, + "learning_rate": 8.43819092623044e-05, + "loss": 1.7044, + "step": 2490 + }, + { + "epoch": 0.25875142827464426, + "grad_norm": 0.3530022203922272, + "learning_rate": 8.437006072758891e-05, + "loss": 1.608, + "step": 2491 + }, + { + "epoch": 0.2588553027942246, + "grad_norm": 0.38229474425315857, + "learning_rate": 8.435820853273007e-05, + "loss": 1.7361, + "step": 2492 + }, + { + "epoch": 0.2589591773138049, + "grad_norm": 0.35574570298194885, + "learning_rate": 8.434635267899002e-05, + "loss": 1.799, + "step": 2493 + }, + { + "epoch": 0.2590630518333853, + "grad_norm": 0.40232568979263306, + "learning_rate": 8.433449316763133e-05, + "loss": 1.8073, + "step": 2494 + }, + { + "epoch": 0.2591669263529656, + "grad_norm": 0.42104196548461914, + "learning_rate": 8.432262999991694e-05, + "loss": 2.0099, + "step": 2495 + }, + { + "epoch": 0.25927080087254595, + "grad_norm": 0.37814000248908997, + "learning_rate": 8.431076317711017e-05, + "loss": 1.6813, + "step": 2496 + }, + { + "epoch": 0.25937467539212633, + "grad_norm": 0.38011592626571655, + "learning_rate": 8.429889270047475e-05, + "loss": 1.6921, + "step": 2497 + }, + { + "epoch": 0.25947854991170666, + "grad_norm": 0.3853015601634979, + "learning_rate": 8.428701857127481e-05, + "loss": 1.8239, + "step": 2498 + }, + { + "epoch": 0.259582424431287, + "grad_norm": 0.351633757352829, + "learning_rate": 8.427514079077485e-05, + "loss": 1.6058, + "step": 2499 + }, + { + "epoch": 0.25968629895086737, + "grad_norm": 0.4057175815105438, + "learning_rate": 8.426325936023974e-05, + "loss": 1.8754, + "step": 2500 + }, + { + "epoch": 0.2597901734704477, + "grad_norm": 0.3578610420227051, + "learning_rate": 8.425137428093477e-05, + "loss": 1.7336, + "step": 2501 + }, + { + "epoch": 0.259894047990028, + "grad_norm": 0.3505629301071167, + "learning_rate": 8.423948555412562e-05, + "loss": 1.7356, + "step": 2502 + }, + { + "epoch": 0.2599979225096084, + "grad_norm": 0.3745979368686676, + "learning_rate": 8.422759318107832e-05, + "loss": 1.8169, + "step": 2503 + }, + { + "epoch": 0.26010179702918873, + "grad_norm": 0.37145760655403137, + "learning_rate": 8.421569716305934e-05, + "loss": 1.904, + "step": 2504 + }, + { + "epoch": 0.2602056715487691, + "grad_norm": 0.3965916931629181, + "learning_rate": 8.42037975013355e-05, + "loss": 1.809, + "step": 2505 + }, + { + "epoch": 0.26030954606834944, + "grad_norm": 0.39221835136413574, + "learning_rate": 8.419189419717404e-05, + "loss": 1.7811, + "step": 2506 + }, + { + "epoch": 0.26041342058792977, + "grad_norm": 0.3546196222305298, + "learning_rate": 8.417998725184254e-05, + "loss": 1.7371, + "step": 2507 + }, + { + "epoch": 0.26051729510751015, + "grad_norm": 0.3630322217941284, + "learning_rate": 8.416807666660901e-05, + "loss": 1.719, + "step": 2508 + }, + { + "epoch": 0.2606211696270905, + "grad_norm": 0.34440553188323975, + "learning_rate": 8.415616244274185e-05, + "loss": 1.6887, + "step": 2509 + }, + { + "epoch": 0.2607250441466708, + "grad_norm": 0.39099493622779846, + "learning_rate": 8.414424458150983e-05, + "loss": 1.7395, + "step": 2510 + }, + { + "epoch": 0.2608289186662512, + "grad_norm": 0.35405251383781433, + "learning_rate": 8.413232308418206e-05, + "loss": 1.6363, + "step": 2511 + }, + { + "epoch": 0.2609327931858315, + "grad_norm": 0.41946062445640564, + "learning_rate": 8.412039795202816e-05, + "loss": 1.91, + "step": 2512 + }, + { + "epoch": 0.26103666770541184, + "grad_norm": 0.4249524474143982, + "learning_rate": 8.4108469186318e-05, + "loss": 1.7341, + "step": 2513 + }, + { + "epoch": 0.2611405422249922, + "grad_norm": 0.3654036223888397, + "learning_rate": 8.409653678832194e-05, + "loss": 1.6744, + "step": 2514 + }, + { + "epoch": 0.26124441674457255, + "grad_norm": 0.35637834668159485, + "learning_rate": 8.408460075931068e-05, + "loss": 1.6597, + "step": 2515 + }, + { + "epoch": 0.2613482912641529, + "grad_norm": 0.39124104380607605, + "learning_rate": 8.407266110055531e-05, + "loss": 1.6995, + "step": 2516 + }, + { + "epoch": 0.26145216578373326, + "grad_norm": 0.3535708487033844, + "learning_rate": 8.40607178133273e-05, + "loss": 1.657, + "step": 2517 + }, + { + "epoch": 0.2615560403033136, + "grad_norm": 0.4048800766468048, + "learning_rate": 8.404877089889853e-05, + "loss": 1.7071, + "step": 2518 + }, + { + "epoch": 0.26165991482289397, + "grad_norm": 0.3928733170032501, + "learning_rate": 8.403682035854125e-05, + "loss": 1.5713, + "step": 2519 + }, + { + "epoch": 0.2617637893424743, + "grad_norm": 0.3696751892566681, + "learning_rate": 8.40248661935281e-05, + "loss": 1.6968, + "step": 2520 + }, + { + "epoch": 0.2618676638620546, + "grad_norm": 0.35033291578292847, + "learning_rate": 8.40129084051321e-05, + "loss": 1.5176, + "step": 2521 + }, + { + "epoch": 0.261971538381635, + "grad_norm": 0.38887977600097656, + "learning_rate": 8.400094699462667e-05, + "loss": 1.84, + "step": 2522 + }, + { + "epoch": 0.26207541290121533, + "grad_norm": 0.382174015045166, + "learning_rate": 8.398898196328561e-05, + "loss": 1.818, + "step": 2523 + }, + { + "epoch": 0.26217928742079566, + "grad_norm": 0.3753756880760193, + "learning_rate": 8.39770133123831e-05, + "loss": 1.764, + "step": 2524 + }, + { + "epoch": 0.26228316194037604, + "grad_norm": 0.37997132539749146, + "learning_rate": 8.396504104319366e-05, + "loss": 1.8256, + "step": 2525 + }, + { + "epoch": 0.26238703645995637, + "grad_norm": 0.36985552310943604, + "learning_rate": 8.395306515699234e-05, + "loss": 1.5408, + "step": 2526 + }, + { + "epoch": 0.2624909109795367, + "grad_norm": 0.36994847655296326, + "learning_rate": 8.394108565505441e-05, + "loss": 1.6385, + "step": 2527 + }, + { + "epoch": 0.2625947854991171, + "grad_norm": 0.3789272904396057, + "learning_rate": 8.392910253865557e-05, + "loss": 1.6302, + "step": 2528 + }, + { + "epoch": 0.2626986600186974, + "grad_norm": 0.39001405239105225, + "learning_rate": 8.391711580907202e-05, + "loss": 1.8292, + "step": 2529 + }, + { + "epoch": 0.2628025345382778, + "grad_norm": 0.3820188343524933, + "learning_rate": 8.390512546758016e-05, + "loss": 1.8284, + "step": 2530 + }, + { + "epoch": 0.2629064090578581, + "grad_norm": 0.43149465322494507, + "learning_rate": 8.389313151545694e-05, + "loss": 1.8026, + "step": 2531 + }, + { + "epoch": 0.26301028357743844, + "grad_norm": 0.4724147319793701, + "learning_rate": 8.388113395397957e-05, + "loss": 1.7194, + "step": 2532 + }, + { + "epoch": 0.2631141580970188, + "grad_norm": 0.3973971903324127, + "learning_rate": 8.386913278442571e-05, + "loss": 1.703, + "step": 2533 + }, + { + "epoch": 0.26321803261659915, + "grad_norm": 0.38330528140068054, + "learning_rate": 8.385712800807343e-05, + "loss": 1.8659, + "step": 2534 + }, + { + "epoch": 0.2633219071361795, + "grad_norm": 0.37077972292900085, + "learning_rate": 8.38451196262011e-05, + "loss": 1.8752, + "step": 2535 + }, + { + "epoch": 0.26342578165575986, + "grad_norm": 0.36941617727279663, + "learning_rate": 8.383310764008751e-05, + "loss": 1.765, + "step": 2536 + }, + { + "epoch": 0.2635296561753402, + "grad_norm": 0.3706973195075989, + "learning_rate": 8.382109205101188e-05, + "loss": 1.6668, + "step": 2537 + }, + { + "epoch": 0.2636335306949205, + "grad_norm": 0.37633809447288513, + "learning_rate": 8.380907286025377e-05, + "loss": 1.6682, + "step": 2538 + }, + { + "epoch": 0.2637374052145009, + "grad_norm": 0.39212766289711, + "learning_rate": 8.37970500690931e-05, + "loss": 1.6501, + "step": 2539 + }, + { + "epoch": 0.2638412797340812, + "grad_norm": 0.3803769052028656, + "learning_rate": 8.378502367881025e-05, + "loss": 1.6854, + "step": 2540 + }, + { + "epoch": 0.26394515425366155, + "grad_norm": 0.40050208568573, + "learning_rate": 8.37729936906859e-05, + "loss": 1.8676, + "step": 2541 + }, + { + "epoch": 0.26404902877324193, + "grad_norm": 0.39779967069625854, + "learning_rate": 8.376096010600116e-05, + "loss": 1.7777, + "step": 2542 + }, + { + "epoch": 0.26415290329282226, + "grad_norm": 0.4135241210460663, + "learning_rate": 8.374892292603751e-05, + "loss": 1.7512, + "step": 2543 + }, + { + "epoch": 0.26425677781240264, + "grad_norm": 0.36392831802368164, + "learning_rate": 8.373688215207682e-05, + "loss": 1.7389, + "step": 2544 + }, + { + "epoch": 0.26436065233198297, + "grad_norm": 0.40198683738708496, + "learning_rate": 8.372483778540134e-05, + "loss": 1.9784, + "step": 2545 + }, + { + "epoch": 0.2644645268515633, + "grad_norm": 0.35998931527137756, + "learning_rate": 8.371278982729371e-05, + "loss": 1.8175, + "step": 2546 + }, + { + "epoch": 0.2645684013711437, + "grad_norm": 0.4466514587402344, + "learning_rate": 8.370073827903693e-05, + "loss": 1.8014, + "step": 2547 + }, + { + "epoch": 0.264672275890724, + "grad_norm": 0.39324405789375305, + "learning_rate": 8.368868314191439e-05, + "loss": 1.8389, + "step": 2548 + }, + { + "epoch": 0.26477615041030433, + "grad_norm": 0.3979739248752594, + "learning_rate": 8.367662441720989e-05, + "loss": 1.7975, + "step": 2549 + }, + { + "epoch": 0.2648800249298847, + "grad_norm": 0.3831605315208435, + "learning_rate": 8.366456210620757e-05, + "loss": 1.7531, + "step": 2550 + }, + { + "epoch": 0.26498389944946504, + "grad_norm": 0.3608654737472534, + "learning_rate": 8.365249621019197e-05, + "loss": 1.6416, + "step": 2551 + }, + { + "epoch": 0.26508777396904537, + "grad_norm": 0.37154313921928406, + "learning_rate": 8.364042673044803e-05, + "loss": 1.775, + "step": 2552 + }, + { + "epoch": 0.26519164848862575, + "grad_norm": 0.36865541338920593, + "learning_rate": 8.362835366826105e-05, + "loss": 1.6302, + "step": 2553 + }, + { + "epoch": 0.2652955230082061, + "grad_norm": 0.40405401587486267, + "learning_rate": 8.361627702491673e-05, + "loss": 1.8325, + "step": 2554 + }, + { + "epoch": 0.2653993975277864, + "grad_norm": 0.3631226122379303, + "learning_rate": 8.360419680170111e-05, + "loss": 1.6433, + "step": 2555 + }, + { + "epoch": 0.2655032720473668, + "grad_norm": 0.3895721435546875, + "learning_rate": 8.359211299990064e-05, + "loss": 1.5261, + "step": 2556 + }, + { + "epoch": 0.2656071465669471, + "grad_norm": 0.34275680780410767, + "learning_rate": 8.358002562080219e-05, + "loss": 1.7036, + "step": 2557 + }, + { + "epoch": 0.2657110210865275, + "grad_norm": 0.4039117991924286, + "learning_rate": 8.356793466569293e-05, + "loss": 1.801, + "step": 2558 + }, + { + "epoch": 0.2658148956061078, + "grad_norm": 0.444367915391922, + "learning_rate": 8.355584013586047e-05, + "loss": 2.0989, + "step": 2559 + }, + { + "epoch": 0.26591877012568815, + "grad_norm": 0.3847164809703827, + "learning_rate": 8.354374203259278e-05, + "loss": 1.8125, + "step": 2560 + }, + { + "epoch": 0.26602264464526854, + "grad_norm": 0.36114174127578735, + "learning_rate": 8.353164035717822e-05, + "loss": 1.6731, + "step": 2561 + }, + { + "epoch": 0.26612651916484886, + "grad_norm": 0.37462207674980164, + "learning_rate": 8.351953511090551e-05, + "loss": 1.7634, + "step": 2562 + }, + { + "epoch": 0.2662303936844292, + "grad_norm": 0.38300392031669617, + "learning_rate": 8.350742629506378e-05, + "loss": 1.8874, + "step": 2563 + }, + { + "epoch": 0.26633426820400957, + "grad_norm": 0.37371304631233215, + "learning_rate": 8.349531391094251e-05, + "loss": 1.7228, + "step": 2564 + }, + { + "epoch": 0.2664381427235899, + "grad_norm": 0.33728665113449097, + "learning_rate": 8.34831979598316e-05, + "loss": 1.6236, + "step": 2565 + }, + { + "epoch": 0.2665420172431702, + "grad_norm": 0.38045376539230347, + "learning_rate": 8.347107844302129e-05, + "loss": 1.8109, + "step": 2566 + }, + { + "epoch": 0.2666458917627506, + "grad_norm": 0.36660969257354736, + "learning_rate": 8.345895536180219e-05, + "loss": 1.7463, + "step": 2567 + }, + { + "epoch": 0.26674976628233094, + "grad_norm": 0.3986735939979553, + "learning_rate": 8.344682871746534e-05, + "loss": 1.8362, + "step": 2568 + }, + { + "epoch": 0.2668536408019113, + "grad_norm": 0.41932711005210876, + "learning_rate": 8.343469851130212e-05, + "loss": 1.845, + "step": 2569 + }, + { + "epoch": 0.26695751532149165, + "grad_norm": 0.40920954942703247, + "learning_rate": 8.342256474460431e-05, + "loss": 1.6763, + "step": 2570 + }, + { + "epoch": 0.26706138984107197, + "grad_norm": 0.3709944784641266, + "learning_rate": 8.341042741866408e-05, + "loss": 1.8428, + "step": 2571 + }, + { + "epoch": 0.26716526436065235, + "grad_norm": 0.3826795220375061, + "learning_rate": 8.339828653477391e-05, + "loss": 1.5346, + "step": 2572 + }, + { + "epoch": 0.2672691388802327, + "grad_norm": 0.37877601385116577, + "learning_rate": 8.338614209422677e-05, + "loss": 1.7763, + "step": 2573 + }, + { + "epoch": 0.267373013399813, + "grad_norm": 0.4136562645435333, + "learning_rate": 8.33739940983159e-05, + "loss": 1.9153, + "step": 2574 + }, + { + "epoch": 0.2674768879193934, + "grad_norm": 0.3574436604976654, + "learning_rate": 8.336184254833499e-05, + "loss": 1.4252, + "step": 2575 + }, + { + "epoch": 0.2675807624389737, + "grad_norm": 0.37612104415893555, + "learning_rate": 8.33496874455781e-05, + "loss": 1.8131, + "step": 2576 + }, + { + "epoch": 0.26768463695855405, + "grad_norm": 0.4038298726081848, + "learning_rate": 8.33375287913396e-05, + "loss": 1.769, + "step": 2577 + }, + { + "epoch": 0.26778851147813443, + "grad_norm": 0.3684050440788269, + "learning_rate": 8.332536658691435e-05, + "loss": 1.806, + "step": 2578 + }, + { + "epoch": 0.26789238599771475, + "grad_norm": 0.39676815271377563, + "learning_rate": 8.331320083359751e-05, + "loss": 1.8308, + "step": 2579 + }, + { + "epoch": 0.2679962605172951, + "grad_norm": 0.37601250410079956, + "learning_rate": 8.330103153268462e-05, + "loss": 1.6708, + "step": 2580 + }, + { + "epoch": 0.26810013503687546, + "grad_norm": 0.35616087913513184, + "learning_rate": 8.328885868547164e-05, + "loss": 1.6567, + "step": 2581 + }, + { + "epoch": 0.2682040095564558, + "grad_norm": 0.4200495183467865, + "learning_rate": 8.327668229325487e-05, + "loss": 1.9518, + "step": 2582 + }, + { + "epoch": 0.2683078840760362, + "grad_norm": 0.4047677516937256, + "learning_rate": 8.326450235733098e-05, + "loss": 1.8429, + "step": 2583 + }, + { + "epoch": 0.2684117585956165, + "grad_norm": 0.35730311274528503, + "learning_rate": 8.32523188789971e-05, + "loss": 1.5773, + "step": 2584 + }, + { + "epoch": 0.26851563311519683, + "grad_norm": 0.38395121693611145, + "learning_rate": 8.324013185955062e-05, + "loss": 1.685, + "step": 2585 + }, + { + "epoch": 0.2686195076347772, + "grad_norm": 0.3953067362308502, + "learning_rate": 8.322794130028938e-05, + "loss": 1.8974, + "step": 2586 + }, + { + "epoch": 0.26872338215435754, + "grad_norm": 0.3572498559951782, + "learning_rate": 8.321574720251158e-05, + "loss": 1.6507, + "step": 2587 + }, + { + "epoch": 0.26882725667393786, + "grad_norm": 0.38150545954704285, + "learning_rate": 8.32035495675158e-05, + "loss": 1.7104, + "step": 2588 + }, + { + "epoch": 0.26893113119351825, + "grad_norm": 0.36428940296173096, + "learning_rate": 8.3191348396601e-05, + "loss": 1.6052, + "step": 2589 + }, + { + "epoch": 0.2690350057130986, + "grad_norm": 0.4125150144100189, + "learning_rate": 8.317914369106648e-05, + "loss": 1.9357, + "step": 2590 + }, + { + "epoch": 0.2691388802326789, + "grad_norm": 0.3636937439441681, + "learning_rate": 8.316693545221196e-05, + "loss": 1.7948, + "step": 2591 + }, + { + "epoch": 0.2692427547522593, + "grad_norm": 0.40792933106422424, + "learning_rate": 8.315472368133754e-05, + "loss": 1.6696, + "step": 2592 + }, + { + "epoch": 0.2693466292718396, + "grad_norm": 0.40981337428092957, + "learning_rate": 8.314250837974364e-05, + "loss": 1.8025, + "step": 2593 + }, + { + "epoch": 0.26945050379141994, + "grad_norm": 0.3620801270008087, + "learning_rate": 8.313028954873114e-05, + "loss": 1.725, + "step": 2594 + }, + { + "epoch": 0.2695543783110003, + "grad_norm": 0.35369589924812317, + "learning_rate": 8.311806718960124e-05, + "loss": 1.5303, + "step": 2595 + }, + { + "epoch": 0.26965825283058065, + "grad_norm": 0.38566333055496216, + "learning_rate": 8.310584130365551e-05, + "loss": 1.9351, + "step": 2596 + }, + { + "epoch": 0.26976212735016103, + "grad_norm": 0.3703290820121765, + "learning_rate": 8.309361189219589e-05, + "loss": 1.6705, + "step": 2597 + }, + { + "epoch": 0.26986600186974136, + "grad_norm": 0.3818873167037964, + "learning_rate": 8.308137895652477e-05, + "loss": 1.7879, + "step": 2598 + }, + { + "epoch": 0.2699698763893217, + "grad_norm": 0.3748812675476074, + "learning_rate": 8.306914249794483e-05, + "loss": 1.7799, + "step": 2599 + }, + { + "epoch": 0.27007375090890207, + "grad_norm": 0.3769199252128601, + "learning_rate": 8.305690251775915e-05, + "loss": 1.6268, + "step": 2600 + }, + { + "epoch": 0.2701776254284824, + "grad_norm": 0.3615618944168091, + "learning_rate": 8.304465901727123e-05, + "loss": 1.7004, + "step": 2601 + }, + { + "epoch": 0.2702814999480627, + "grad_norm": 0.36809107661247253, + "learning_rate": 8.303241199778486e-05, + "loss": 1.7739, + "step": 2602 + }, + { + "epoch": 0.2703853744676431, + "grad_norm": 0.37990957498550415, + "learning_rate": 8.30201614606043e-05, + "loss": 1.5496, + "step": 2603 + }, + { + "epoch": 0.27048924898722343, + "grad_norm": 0.3890332579612732, + "learning_rate": 8.300790740703409e-05, + "loss": 1.7503, + "step": 2604 + }, + { + "epoch": 0.27059312350680376, + "grad_norm": 0.3829896152019501, + "learning_rate": 8.299564983837922e-05, + "loss": 1.7856, + "step": 2605 + }, + { + "epoch": 0.27069699802638414, + "grad_norm": 0.36139553785324097, + "learning_rate": 8.298338875594501e-05, + "loss": 1.7488, + "step": 2606 + }, + { + "epoch": 0.27080087254596447, + "grad_norm": 0.3622409403324127, + "learning_rate": 8.29711241610372e-05, + "loss": 1.6934, + "step": 2607 + }, + { + "epoch": 0.27090474706554485, + "grad_norm": 0.37896421551704407, + "learning_rate": 8.295885605496184e-05, + "loss": 1.785, + "step": 2608 + }, + { + "epoch": 0.2710086215851252, + "grad_norm": 0.36020195484161377, + "learning_rate": 8.294658443902539e-05, + "loss": 1.7482, + "step": 2609 + }, + { + "epoch": 0.2711124961047055, + "grad_norm": 0.3811848759651184, + "learning_rate": 8.29343093145347e-05, + "loss": 1.7077, + "step": 2610 + }, + { + "epoch": 0.2712163706242859, + "grad_norm": 0.3960614502429962, + "learning_rate": 8.292203068279695e-05, + "loss": 1.8574, + "step": 2611 + }, + { + "epoch": 0.2713202451438662, + "grad_norm": 0.4237765371799469, + "learning_rate": 8.290974854511974e-05, + "loss": 1.7988, + "step": 2612 + }, + { + "epoch": 0.27142411966344654, + "grad_norm": 0.3800017237663269, + "learning_rate": 8.289746290281104e-05, + "loss": 1.6488, + "step": 2613 + }, + { + "epoch": 0.2715279941830269, + "grad_norm": 0.38529491424560547, + "learning_rate": 8.288517375717913e-05, + "loss": 1.8813, + "step": 2614 + }, + { + "epoch": 0.27163186870260725, + "grad_norm": 0.3625027537345886, + "learning_rate": 8.287288110953274e-05, + "loss": 1.4855, + "step": 2615 + }, + { + "epoch": 0.2717357432221876, + "grad_norm": 0.4243226647377014, + "learning_rate": 8.286058496118091e-05, + "loss": 1.921, + "step": 2616 + }, + { + "epoch": 0.27183961774176796, + "grad_norm": 0.36872926354408264, + "learning_rate": 8.284828531343312e-05, + "loss": 1.6206, + "step": 2617 + }, + { + "epoch": 0.2719434922613483, + "grad_norm": 0.40654125809669495, + "learning_rate": 8.283598216759915e-05, + "loss": 1.8668, + "step": 2618 + }, + { + "epoch": 0.2720473667809286, + "grad_norm": 0.3534424602985382, + "learning_rate": 8.282367552498925e-05, + "loss": 1.6809, + "step": 2619 + }, + { + "epoch": 0.272151241300509, + "grad_norm": 0.41357290744781494, + "learning_rate": 8.28113653869139e-05, + "loss": 1.922, + "step": 2620 + }, + { + "epoch": 0.2722551158200893, + "grad_norm": 0.41436415910720825, + "learning_rate": 8.27990517546841e-05, + "loss": 1.8944, + "step": 2621 + }, + { + "epoch": 0.2723589903396697, + "grad_norm": 0.3990755081176758, + "learning_rate": 8.278673462961112e-05, + "loss": 1.8936, + "step": 2622 + }, + { + "epoch": 0.27246286485925003, + "grad_norm": 0.3946811258792877, + "learning_rate": 8.277441401300665e-05, + "loss": 1.9192, + "step": 2623 + }, + { + "epoch": 0.27256673937883036, + "grad_norm": 0.37876197695732117, + "learning_rate": 8.276208990618274e-05, + "loss": 1.6761, + "step": 2624 + }, + { + "epoch": 0.27267061389841074, + "grad_norm": 0.4105803072452545, + "learning_rate": 8.27497623104518e-05, + "loss": 1.7843, + "step": 2625 + }, + { + "epoch": 0.27277448841799107, + "grad_norm": 0.37790194153785706, + "learning_rate": 8.273743122712664e-05, + "loss": 1.5891, + "step": 2626 + }, + { + "epoch": 0.2728783629375714, + "grad_norm": 0.43674150109291077, + "learning_rate": 8.272509665752041e-05, + "loss": 1.793, + "step": 2627 + }, + { + "epoch": 0.2729822374571518, + "grad_norm": 0.45887747406959534, + "learning_rate": 8.271275860294667e-05, + "loss": 1.9712, + "step": 2628 + }, + { + "epoch": 0.2730861119767321, + "grad_norm": 0.3662078082561493, + "learning_rate": 8.27004170647193e-05, + "loss": 1.6459, + "step": 2629 + }, + { + "epoch": 0.27318998649631243, + "grad_norm": 0.364083468914032, + "learning_rate": 8.268807204415258e-05, + "loss": 1.6445, + "step": 2630 + }, + { + "epoch": 0.2732938610158928, + "grad_norm": 0.3766544759273529, + "learning_rate": 8.267572354256117e-05, + "loss": 1.6357, + "step": 2631 + }, + { + "epoch": 0.27339773553547314, + "grad_norm": 0.38634899258613586, + "learning_rate": 8.266337156126008e-05, + "loss": 1.7254, + "step": 2632 + }, + { + "epoch": 0.27350161005505347, + "grad_norm": 0.391665518283844, + "learning_rate": 8.26510161015647e-05, + "loss": 1.7073, + "step": 2633 + }, + { + "epoch": 0.27360548457463385, + "grad_norm": 0.5298207998275757, + "learning_rate": 8.263865716479078e-05, + "loss": 1.8689, + "step": 2634 + }, + { + "epoch": 0.2737093590942142, + "grad_norm": 0.4072588086128235, + "learning_rate": 8.262629475225448e-05, + "loss": 1.8721, + "step": 2635 + }, + { + "epoch": 0.27381323361379456, + "grad_norm": 0.3996543288230896, + "learning_rate": 8.261392886527229e-05, + "loss": 1.7335, + "step": 2636 + }, + { + "epoch": 0.2739171081333749, + "grad_norm": 0.414880633354187, + "learning_rate": 8.260155950516106e-05, + "loss": 1.8539, + "step": 2637 + }, + { + "epoch": 0.2740209826529552, + "grad_norm": 0.36667972803115845, + "learning_rate": 8.258918667323806e-05, + "loss": 1.6214, + "step": 2638 + }, + { + "epoch": 0.2741248571725356, + "grad_norm": 0.39150235056877136, + "learning_rate": 8.257681037082085e-05, + "loss": 1.7361, + "step": 2639 + }, + { + "epoch": 0.2742287316921159, + "grad_norm": 0.38801634311676025, + "learning_rate": 8.256443059922749e-05, + "loss": 1.8081, + "step": 2640 + }, + { + "epoch": 0.27433260621169625, + "grad_norm": 0.42313215136528015, + "learning_rate": 8.255204735977626e-05, + "loss": 1.7, + "step": 2641 + }, + { + "epoch": 0.27443648073127663, + "grad_norm": 0.36863112449645996, + "learning_rate": 8.25396606537859e-05, + "loss": 1.5137, + "step": 2642 + }, + { + "epoch": 0.27454035525085696, + "grad_norm": 0.39456894993782043, + "learning_rate": 8.252727048257551e-05, + "loss": 1.7127, + "step": 2643 + }, + { + "epoch": 0.2746442297704373, + "grad_norm": 0.3853185474872589, + "learning_rate": 8.251487684746453e-05, + "loss": 1.8356, + "step": 2644 + }, + { + "epoch": 0.27474810429001767, + "grad_norm": 0.4087388813495636, + "learning_rate": 8.250247974977279e-05, + "loss": 1.6803, + "step": 2645 + }, + { + "epoch": 0.274851978809598, + "grad_norm": 0.3977982997894287, + "learning_rate": 8.249007919082048e-05, + "loss": 1.7868, + "step": 2646 + }, + { + "epoch": 0.2749558533291784, + "grad_norm": 0.3721441626548767, + "learning_rate": 8.247767517192818e-05, + "loss": 1.8521, + "step": 2647 + }, + { + "epoch": 0.2750597278487587, + "grad_norm": 0.3743959963321686, + "learning_rate": 8.246526769441681e-05, + "loss": 1.7785, + "step": 2648 + }, + { + "epoch": 0.27516360236833903, + "grad_norm": 0.39412543177604675, + "learning_rate": 8.245285675960767e-05, + "loss": 1.8763, + "step": 2649 + }, + { + "epoch": 0.2752674768879194, + "grad_norm": 0.3404049575328827, + "learning_rate": 8.244044236882242e-05, + "loss": 1.4576, + "step": 2650 + }, + { + "epoch": 0.27537135140749974, + "grad_norm": 0.3886807858943939, + "learning_rate": 8.24280245233831e-05, + "loss": 1.7468, + "step": 2651 + }, + { + "epoch": 0.27547522592708007, + "grad_norm": 0.3996857702732086, + "learning_rate": 8.241560322461212e-05, + "loss": 1.92, + "step": 2652 + }, + { + "epoch": 0.27557910044666045, + "grad_norm": 0.3869626224040985, + "learning_rate": 8.240317847383226e-05, + "loss": 1.6042, + "step": 2653 + }, + { + "epoch": 0.2756829749662408, + "grad_norm": 0.4047796130180359, + "learning_rate": 8.239075027236663e-05, + "loss": 1.8099, + "step": 2654 + }, + { + "epoch": 0.2757868494858211, + "grad_norm": 0.44502919912338257, + "learning_rate": 8.237831862153877e-05, + "loss": 1.7363, + "step": 2655 + }, + { + "epoch": 0.2758907240054015, + "grad_norm": 0.388954222202301, + "learning_rate": 8.236588352267252e-05, + "loss": 1.8691, + "step": 2656 + }, + { + "epoch": 0.2759945985249818, + "grad_norm": 0.39591512084007263, + "learning_rate": 8.235344497709214e-05, + "loss": 1.8538, + "step": 2657 + }, + { + "epoch": 0.27609847304456214, + "grad_norm": 0.38203978538513184, + "learning_rate": 8.234100298612226e-05, + "loss": 1.28, + "step": 2658 + }, + { + "epoch": 0.2762023475641425, + "grad_norm": 0.43226009607315063, + "learning_rate": 8.23285575510878e-05, + "loss": 1.9326, + "step": 2659 + }, + { + "epoch": 0.27630622208372285, + "grad_norm": 0.37048643827438354, + "learning_rate": 8.231610867331415e-05, + "loss": 1.7221, + "step": 2660 + }, + { + "epoch": 0.27641009660330323, + "grad_norm": 0.41580793261528015, + "learning_rate": 8.2303656354127e-05, + "loss": 1.9031, + "step": 2661 + }, + { + "epoch": 0.27651397112288356, + "grad_norm": 0.3827223777770996, + "learning_rate": 8.229120059485243e-05, + "loss": 1.6825, + "step": 2662 + }, + { + "epoch": 0.2766178456424639, + "grad_norm": 0.37424013018608093, + "learning_rate": 8.227874139681688e-05, + "loss": 1.7172, + "step": 2663 + }, + { + "epoch": 0.27672172016204427, + "grad_norm": 0.43709954619407654, + "learning_rate": 8.226627876134714e-05, + "loss": 1.8136, + "step": 2664 + }, + { + "epoch": 0.2768255946816246, + "grad_norm": 0.35949236154556274, + "learning_rate": 8.22538126897704e-05, + "loss": 1.7346, + "step": 2665 + }, + { + "epoch": 0.2769294692012049, + "grad_norm": 0.39692017436027527, + "learning_rate": 8.224134318341421e-05, + "loss": 1.925, + "step": 2666 + }, + { + "epoch": 0.2770333437207853, + "grad_norm": 0.3917725384235382, + "learning_rate": 8.222887024360646e-05, + "loss": 1.7681, + "step": 2667 + }, + { + "epoch": 0.27713721824036563, + "grad_norm": 0.4082455635070801, + "learning_rate": 8.221639387167543e-05, + "loss": 1.8118, + "step": 2668 + }, + { + "epoch": 0.27724109275994596, + "grad_norm": 0.36559662222862244, + "learning_rate": 8.220391406894973e-05, + "loss": 1.8121, + "step": 2669 + }, + { + "epoch": 0.27734496727952634, + "grad_norm": 0.372790664434433, + "learning_rate": 8.21914308367584e-05, + "loss": 1.7102, + "step": 2670 + }, + { + "epoch": 0.27744884179910667, + "grad_norm": 0.37069472670555115, + "learning_rate": 8.217894417643078e-05, + "loss": 1.6396, + "step": 2671 + }, + { + "epoch": 0.277552716318687, + "grad_norm": 0.38368502259254456, + "learning_rate": 8.216645408929661e-05, + "loss": 1.8278, + "step": 2672 + }, + { + "epoch": 0.2776565908382674, + "grad_norm": 0.4187244474887848, + "learning_rate": 8.215396057668598e-05, + "loss": 1.7567, + "step": 2673 + }, + { + "epoch": 0.2777604653578477, + "grad_norm": 0.37204065918922424, + "learning_rate": 8.214146363992939e-05, + "loss": 1.7282, + "step": 2674 + }, + { + "epoch": 0.2778643398774281, + "grad_norm": 0.37890782952308655, + "learning_rate": 8.21289632803576e-05, + "loss": 1.6675, + "step": 2675 + }, + { + "epoch": 0.2779682143970084, + "grad_norm": 0.37642747163772583, + "learning_rate": 8.211645949930187e-05, + "loss": 1.6311, + "step": 2676 + }, + { + "epoch": 0.27807208891658874, + "grad_norm": 0.3810623288154602, + "learning_rate": 8.210395229809369e-05, + "loss": 1.7166, + "step": 2677 + }, + { + "epoch": 0.2781759634361691, + "grad_norm": 0.3708658516407013, + "learning_rate": 8.209144167806502e-05, + "loss": 1.7207, + "step": 2678 + }, + { + "epoch": 0.27827983795574945, + "grad_norm": 0.3930984139442444, + "learning_rate": 8.207892764054814e-05, + "loss": 1.7495, + "step": 2679 + }, + { + "epoch": 0.2783837124753298, + "grad_norm": 0.3427664339542389, + "learning_rate": 8.20664101868757e-05, + "loss": 1.6362, + "step": 2680 + }, + { + "epoch": 0.27848758699491016, + "grad_norm": 0.3461865484714508, + "learning_rate": 8.205388931838068e-05, + "loss": 1.594, + "step": 2681 + }, + { + "epoch": 0.2785914615144905, + "grad_norm": 0.43719297647476196, + "learning_rate": 8.20413650363965e-05, + "loss": 1.9431, + "step": 2682 + }, + { + "epoch": 0.2786953360340708, + "grad_norm": 0.3867358863353729, + "learning_rate": 8.202883734225686e-05, + "loss": 1.7749, + "step": 2683 + }, + { + "epoch": 0.2787992105536512, + "grad_norm": 0.4017643928527832, + "learning_rate": 8.201630623729588e-05, + "loss": 1.8466, + "step": 2684 + }, + { + "epoch": 0.2789030850732315, + "grad_norm": 0.4061945974826813, + "learning_rate": 8.200377172284803e-05, + "loss": 2.0036, + "step": 2685 + }, + { + "epoch": 0.2790069595928119, + "grad_norm": 0.36883074045181274, + "learning_rate": 8.199123380024812e-05, + "loss": 1.6884, + "step": 2686 + }, + { + "epoch": 0.27911083411239224, + "grad_norm": 0.37533366680145264, + "learning_rate": 8.197869247083136e-05, + "loss": 1.6512, + "step": 2687 + }, + { + "epoch": 0.27921470863197256, + "grad_norm": 0.3572383522987366, + "learning_rate": 8.196614773593329e-05, + "loss": 1.6059, + "step": 2688 + }, + { + "epoch": 0.27931858315155295, + "grad_norm": 0.37231144309043884, + "learning_rate": 8.195359959688985e-05, + "loss": 1.6252, + "step": 2689 + }, + { + "epoch": 0.2794224576711333, + "grad_norm": 0.3484655022621155, + "learning_rate": 8.194104805503728e-05, + "loss": 1.7011, + "step": 2690 + }, + { + "epoch": 0.2795263321907136, + "grad_norm": 0.3684258460998535, + "learning_rate": 8.192849311171225e-05, + "loss": 1.6864, + "step": 2691 + }, + { + "epoch": 0.279630206710294, + "grad_norm": 0.34496328234672546, + "learning_rate": 8.191593476825174e-05, + "loss": 1.5545, + "step": 2692 + }, + { + "epoch": 0.2797340812298743, + "grad_norm": 0.3760117292404175, + "learning_rate": 8.190337302599314e-05, + "loss": 1.7473, + "step": 2693 + }, + { + "epoch": 0.27983795574945464, + "grad_norm": 0.3527992367744446, + "learning_rate": 8.189080788627419e-05, + "loss": 1.6099, + "step": 2694 + }, + { + "epoch": 0.279941830269035, + "grad_norm": 0.37164172530174255, + "learning_rate": 8.187823935043292e-05, + "loss": 1.6274, + "step": 2695 + }, + { + "epoch": 0.28004570478861535, + "grad_norm": 0.37998443841934204, + "learning_rate": 8.186566741980786e-05, + "loss": 1.8226, + "step": 2696 + }, + { + "epoch": 0.2801495793081957, + "grad_norm": 0.3952192962169647, + "learning_rate": 8.185309209573775e-05, + "loss": 1.6987, + "step": 2697 + }, + { + "epoch": 0.28025345382777606, + "grad_norm": 0.3859109878540039, + "learning_rate": 8.18405133795618e-05, + "loss": 1.7248, + "step": 2698 + }, + { + "epoch": 0.2803573283473564, + "grad_norm": 0.3739028573036194, + "learning_rate": 8.182793127261955e-05, + "loss": 1.8529, + "step": 2699 + }, + { + "epoch": 0.28046120286693677, + "grad_norm": 0.44212570786476135, + "learning_rate": 8.181534577625088e-05, + "loss": 1.8915, + "step": 2700 + }, + { + "epoch": 0.2805650773865171, + "grad_norm": 0.3574367165565491, + "learning_rate": 8.180275689179607e-05, + "loss": 1.7146, + "step": 2701 + }, + { + "epoch": 0.2806689519060974, + "grad_norm": 0.36751818656921387, + "learning_rate": 8.17901646205957e-05, + "loss": 1.5943, + "step": 2702 + }, + { + "epoch": 0.2807728264256778, + "grad_norm": 0.3904435634613037, + "learning_rate": 8.177756896399078e-05, + "loss": 1.7849, + "step": 2703 + }, + { + "epoch": 0.28087670094525813, + "grad_norm": 0.38835206627845764, + "learning_rate": 8.176496992332265e-05, + "loss": 1.7837, + "step": 2704 + }, + { + "epoch": 0.28098057546483846, + "grad_norm": 0.46619629859924316, + "learning_rate": 8.175236749993298e-05, + "loss": 1.9918, + "step": 2705 + }, + { + "epoch": 0.28108444998441884, + "grad_norm": 0.412544846534729, + "learning_rate": 8.173976169516386e-05, + "loss": 1.8577, + "step": 2706 + }, + { + "epoch": 0.28118832450399917, + "grad_norm": 0.36168932914733887, + "learning_rate": 8.17271525103577e-05, + "loss": 1.6675, + "step": 2707 + }, + { + "epoch": 0.2812921990235795, + "grad_norm": 0.4263473451137543, + "learning_rate": 8.171453994685728e-05, + "loss": 1.8988, + "step": 2708 + }, + { + "epoch": 0.2813960735431599, + "grad_norm": 0.369133323431015, + "learning_rate": 8.170192400600574e-05, + "loss": 1.6811, + "step": 2709 + }, + { + "epoch": 0.2814999480627402, + "grad_norm": 0.39230549335479736, + "learning_rate": 8.168930468914658e-05, + "loss": 1.5888, + "step": 2710 + }, + { + "epoch": 0.2816038225823206, + "grad_norm": 0.3522338569164276, + "learning_rate": 8.167668199762364e-05, + "loss": 1.5904, + "step": 2711 + }, + { + "epoch": 0.2817076971019009, + "grad_norm": 0.3948661983013153, + "learning_rate": 8.166405593278116e-05, + "loss": 1.7703, + "step": 2712 + }, + { + "epoch": 0.28181157162148124, + "grad_norm": 0.35214731097221375, + "learning_rate": 8.165142649596372e-05, + "loss": 1.6956, + "step": 2713 + }, + { + "epoch": 0.2819154461410616, + "grad_norm": 0.4061570167541504, + "learning_rate": 8.163879368851623e-05, + "loss": 1.9759, + "step": 2714 + }, + { + "epoch": 0.28201932066064195, + "grad_norm": 0.3767627477645874, + "learning_rate": 8.162615751178405e-05, + "loss": 1.6924, + "step": 2715 + }, + { + "epoch": 0.2821231951802223, + "grad_norm": 0.37810075283050537, + "learning_rate": 8.161351796711274e-05, + "loss": 1.8406, + "step": 2716 + }, + { + "epoch": 0.28222706969980266, + "grad_norm": 0.37295323610305786, + "learning_rate": 8.16008750558484e-05, + "loss": 1.7651, + "step": 2717 + }, + { + "epoch": 0.282330944219383, + "grad_norm": 0.3852209448814392, + "learning_rate": 8.158822877933733e-05, + "loss": 1.7538, + "step": 2718 + }, + { + "epoch": 0.2824348187389633, + "grad_norm": 0.36513835191726685, + "learning_rate": 8.15755791389263e-05, + "loss": 1.7825, + "step": 2719 + }, + { + "epoch": 0.2825386932585437, + "grad_norm": 0.3820558190345764, + "learning_rate": 8.15629261359624e-05, + "loss": 1.737, + "step": 2720 + }, + { + "epoch": 0.282642567778124, + "grad_norm": 0.3873055875301361, + "learning_rate": 8.155026977179307e-05, + "loss": 1.774, + "step": 2721 + }, + { + "epoch": 0.28274644229770435, + "grad_norm": 0.3791974186897278, + "learning_rate": 8.15376100477661e-05, + "loss": 1.7034, + "step": 2722 + }, + { + "epoch": 0.28285031681728473, + "grad_norm": 0.35163450241088867, + "learning_rate": 8.152494696522965e-05, + "loss": 1.5747, + "step": 2723 + }, + { + "epoch": 0.28295419133686506, + "grad_norm": 0.390493780374527, + "learning_rate": 8.151228052553228e-05, + "loss": 1.7802, + "step": 2724 + }, + { + "epoch": 0.28305806585644544, + "grad_norm": 0.3626655340194702, + "learning_rate": 8.149961073002283e-05, + "loss": 1.7353, + "step": 2725 + }, + { + "epoch": 0.28316194037602577, + "grad_norm": 0.40224525332450867, + "learning_rate": 8.148693758005054e-05, + "loss": 1.8108, + "step": 2726 + }, + { + "epoch": 0.2832658148956061, + "grad_norm": 0.3655959367752075, + "learning_rate": 8.1474261076965e-05, + "loss": 1.7521, + "step": 2727 + }, + { + "epoch": 0.2833696894151865, + "grad_norm": 0.3598358929157257, + "learning_rate": 8.146158122211618e-05, + "loss": 1.6139, + "step": 2728 + }, + { + "epoch": 0.2834735639347668, + "grad_norm": 0.4097810983657837, + "learning_rate": 8.144889801685436e-05, + "loss": 1.9751, + "step": 2729 + }, + { + "epoch": 0.28357743845434713, + "grad_norm": 0.3773553967475891, + "learning_rate": 8.143621146253022e-05, + "loss": 1.7618, + "step": 2730 + }, + { + "epoch": 0.2836813129739275, + "grad_norm": 0.3876679837703705, + "learning_rate": 8.142352156049475e-05, + "loss": 1.7756, + "step": 2731 + }, + { + "epoch": 0.28378518749350784, + "grad_norm": 0.3985688090324402, + "learning_rate": 8.141082831209938e-05, + "loss": 1.8868, + "step": 2732 + }, + { + "epoch": 0.28388906201308817, + "grad_norm": 0.40376704931259155, + "learning_rate": 8.139813171869579e-05, + "loss": 1.9038, + "step": 2733 + }, + { + "epoch": 0.28399293653266855, + "grad_norm": 0.333291620016098, + "learning_rate": 8.138543178163607e-05, + "loss": 1.5568, + "step": 2734 + }, + { + "epoch": 0.2840968110522489, + "grad_norm": 0.39361095428466797, + "learning_rate": 8.137272850227271e-05, + "loss": 1.6809, + "step": 2735 + }, + { + "epoch": 0.2842006855718292, + "grad_norm": 0.39014434814453125, + "learning_rate": 8.136002188195846e-05, + "loss": 1.7479, + "step": 2736 + }, + { + "epoch": 0.2843045600914096, + "grad_norm": 0.4338228404521942, + "learning_rate": 8.13473119220465e-05, + "loss": 1.8751, + "step": 2737 + }, + { + "epoch": 0.2844084346109899, + "grad_norm": 0.44359880685806274, + "learning_rate": 8.133459862389036e-05, + "loss": 1.8787, + "step": 2738 + }, + { + "epoch": 0.2845123091305703, + "grad_norm": 0.42322838306427, + "learning_rate": 8.132188198884386e-05, + "loss": 1.827, + "step": 2739 + }, + { + "epoch": 0.2846161836501506, + "grad_norm": 0.363816499710083, + "learning_rate": 8.130916201826126e-05, + "loss": 1.5635, + "step": 2740 + }, + { + "epoch": 0.28472005816973095, + "grad_norm": 0.39882132411003113, + "learning_rate": 8.129643871349712e-05, + "loss": 2.0049, + "step": 2741 + }, + { + "epoch": 0.28482393268931133, + "grad_norm": 0.3964424729347229, + "learning_rate": 8.128371207590639e-05, + "loss": 1.6739, + "step": 2742 + }, + { + "epoch": 0.28492780720889166, + "grad_norm": 0.42000818252563477, + "learning_rate": 8.127098210684434e-05, + "loss": 1.9091, + "step": 2743 + }, + { + "epoch": 0.285031681728472, + "grad_norm": 0.3911057412624359, + "learning_rate": 8.125824880766661e-05, + "loss": 1.7084, + "step": 2744 + }, + { + "epoch": 0.28513555624805237, + "grad_norm": 0.37378305196762085, + "learning_rate": 8.124551217972922e-05, + "loss": 1.7252, + "step": 2745 + }, + { + "epoch": 0.2852394307676327, + "grad_norm": 0.40639790892601013, + "learning_rate": 8.12327722243885e-05, + "loss": 1.8441, + "step": 2746 + }, + { + "epoch": 0.285343305287213, + "grad_norm": 0.3873542249202728, + "learning_rate": 8.122002894300117e-05, + "loss": 1.506, + "step": 2747 + }, + { + "epoch": 0.2854471798067934, + "grad_norm": 0.34335261583328247, + "learning_rate": 8.120728233692428e-05, + "loss": 1.6598, + "step": 2748 + }, + { + "epoch": 0.28555105432637373, + "grad_norm": 0.4100406765937805, + "learning_rate": 8.119453240751525e-05, + "loss": 1.6315, + "step": 2749 + }, + { + "epoch": 0.2856549288459541, + "grad_norm": 0.34580525755882263, + "learning_rate": 8.118177915613182e-05, + "loss": 1.562, + "step": 2750 + }, + { + "epoch": 0.28575880336553444, + "grad_norm": 0.39648303389549255, + "learning_rate": 8.116902258413216e-05, + "loss": 1.9186, + "step": 2751 + }, + { + "epoch": 0.28586267788511477, + "grad_norm": 0.3890984356403351, + "learning_rate": 8.115626269287473e-05, + "loss": 1.862, + "step": 2752 + }, + { + "epoch": 0.28596655240469515, + "grad_norm": 0.3671417236328125, + "learning_rate": 8.114349948371833e-05, + "loss": 1.7655, + "step": 2753 + }, + { + "epoch": 0.2860704269242755, + "grad_norm": 0.3553464412689209, + "learning_rate": 8.113073295802219e-05, + "loss": 1.5148, + "step": 2754 + }, + { + "epoch": 0.2861743014438558, + "grad_norm": 0.3629017472267151, + "learning_rate": 8.111796311714579e-05, + "loss": 1.5992, + "step": 2755 + }, + { + "epoch": 0.2862781759634362, + "grad_norm": 0.4407372772693634, + "learning_rate": 8.110518996244907e-05, + "loss": 1.7642, + "step": 2756 + }, + { + "epoch": 0.2863820504830165, + "grad_norm": 0.42658326029777527, + "learning_rate": 8.109241349529223e-05, + "loss": 1.9091, + "step": 2757 + }, + { + "epoch": 0.28648592500259684, + "grad_norm": 0.38717007637023926, + "learning_rate": 8.107963371703589e-05, + "loss": 1.7881, + "step": 2758 + }, + { + "epoch": 0.2865897995221772, + "grad_norm": 0.39365988969802856, + "learning_rate": 8.1066850629041e-05, + "loss": 1.6742, + "step": 2759 + }, + { + "epoch": 0.28669367404175755, + "grad_norm": 0.3701498806476593, + "learning_rate": 8.105406423266884e-05, + "loss": 1.6718, + "step": 2760 + }, + { + "epoch": 0.2867975485613379, + "grad_norm": 0.3934311866760254, + "learning_rate": 8.104127452928107e-05, + "loss": 1.7174, + "step": 2761 + }, + { + "epoch": 0.28690142308091826, + "grad_norm": 0.3584936261177063, + "learning_rate": 8.10284815202397e-05, + "loss": 1.7269, + "step": 2762 + }, + { + "epoch": 0.2870052976004986, + "grad_norm": 0.4173574447631836, + "learning_rate": 8.101568520690708e-05, + "loss": 1.7251, + "step": 2763 + }, + { + "epoch": 0.28710917212007897, + "grad_norm": 0.3903685212135315, + "learning_rate": 8.10028855906459e-05, + "loss": 1.7643, + "step": 2764 + }, + { + "epoch": 0.2872130466396593, + "grad_norm": 0.37448278069496155, + "learning_rate": 8.099008267281924e-05, + "loss": 1.7648, + "step": 2765 + }, + { + "epoch": 0.2873169211592396, + "grad_norm": 0.3669279217720032, + "learning_rate": 8.097727645479053e-05, + "loss": 1.7988, + "step": 2766 + }, + { + "epoch": 0.28742079567882, + "grad_norm": 0.4136315584182739, + "learning_rate": 8.096446693792347e-05, + "loss": 1.857, + "step": 2767 + }, + { + "epoch": 0.28752467019840033, + "grad_norm": 0.33900344371795654, + "learning_rate": 8.095165412358225e-05, + "loss": 1.5582, + "step": 2768 + }, + { + "epoch": 0.28762854471798066, + "grad_norm": 0.36668214201927185, + "learning_rate": 8.093883801313129e-05, + "loss": 1.6839, + "step": 2769 + }, + { + "epoch": 0.28773241923756104, + "grad_norm": 0.4014098048210144, + "learning_rate": 8.09260186079354e-05, + "loss": 1.7254, + "step": 2770 + }, + { + "epoch": 0.28783629375714137, + "grad_norm": 0.44136208295822144, + "learning_rate": 8.091319590935978e-05, + "loss": 1.6938, + "step": 2771 + }, + { + "epoch": 0.2879401682767217, + "grad_norm": 0.42486312985420227, + "learning_rate": 8.090036991876992e-05, + "loss": 1.7066, + "step": 2772 + }, + { + "epoch": 0.2880440427963021, + "grad_norm": 0.4575170576572418, + "learning_rate": 8.08875406375317e-05, + "loss": 2.0641, + "step": 2773 + }, + { + "epoch": 0.2881479173158824, + "grad_norm": 0.4238605499267578, + "learning_rate": 8.087470806701131e-05, + "loss": 1.9339, + "step": 2774 + }, + { + "epoch": 0.28825179183546273, + "grad_norm": 0.37625259160995483, + "learning_rate": 8.086187220857537e-05, + "loss": 1.6855, + "step": 2775 + }, + { + "epoch": 0.2883556663550431, + "grad_norm": 0.3760613799095154, + "learning_rate": 8.084903306359078e-05, + "loss": 1.786, + "step": 2776 + }, + { + "epoch": 0.28845954087462344, + "grad_norm": 0.4117826521396637, + "learning_rate": 8.083619063342477e-05, + "loss": 1.796, + "step": 2777 + }, + { + "epoch": 0.2885634153942038, + "grad_norm": 0.3975922465324402, + "learning_rate": 8.0823344919445e-05, + "loss": 1.8309, + "step": 2778 + }, + { + "epoch": 0.28866728991378415, + "grad_norm": 0.39822137355804443, + "learning_rate": 8.081049592301944e-05, + "loss": 1.8686, + "step": 2779 + }, + { + "epoch": 0.2887711644333645, + "grad_norm": 0.3700173795223236, + "learning_rate": 8.079764364551638e-05, + "loss": 1.7419, + "step": 2780 + }, + { + "epoch": 0.28887503895294486, + "grad_norm": 0.3475184440612793, + "learning_rate": 8.07847880883045e-05, + "loss": 1.478, + "step": 2781 + }, + { + "epoch": 0.2889789134725252, + "grad_norm": 0.37445908784866333, + "learning_rate": 8.077192925275283e-05, + "loss": 1.8183, + "step": 2782 + }, + { + "epoch": 0.2890827879921055, + "grad_norm": 0.3727985918521881, + "learning_rate": 8.075906714023073e-05, + "loss": 1.7757, + "step": 2783 + }, + { + "epoch": 0.2891866625116859, + "grad_norm": 0.36759766936302185, + "learning_rate": 8.07462017521079e-05, + "loss": 1.6917, + "step": 2784 + }, + { + "epoch": 0.2892905370312662, + "grad_norm": 0.39550402760505676, + "learning_rate": 8.073333308975442e-05, + "loss": 1.7787, + "step": 2785 + }, + { + "epoch": 0.28939441155084655, + "grad_norm": 0.3845430612564087, + "learning_rate": 8.072046115454069e-05, + "loss": 1.9255, + "step": 2786 + }, + { + "epoch": 0.28949828607042694, + "grad_norm": 0.39458414912223816, + "learning_rate": 8.070758594783748e-05, + "loss": 1.6905, + "step": 2787 + }, + { + "epoch": 0.28960216059000726, + "grad_norm": 0.38234516978263855, + "learning_rate": 8.069470747101588e-05, + "loss": 1.784, + "step": 2788 + }, + { + "epoch": 0.28970603510958765, + "grad_norm": 0.4639817774295807, + "learning_rate": 8.068182572544736e-05, + "loss": 1.7567, + "step": 2789 + }, + { + "epoch": 0.28980990962916797, + "grad_norm": 0.43574076890945435, + "learning_rate": 8.066894071250375e-05, + "loss": 1.8726, + "step": 2790 + }, + { + "epoch": 0.2899137841487483, + "grad_norm": 0.37440717220306396, + "learning_rate": 8.065605243355717e-05, + "loss": 1.7629, + "step": 2791 + }, + { + "epoch": 0.2900176586683287, + "grad_norm": 0.4101695716381073, + "learning_rate": 8.064316088998012e-05, + "loss": 1.7499, + "step": 2792 + }, + { + "epoch": 0.290121533187909, + "grad_norm": 0.3932081162929535, + "learning_rate": 8.063026608314547e-05, + "loss": 1.7148, + "step": 2793 + }, + { + "epoch": 0.29022540770748934, + "grad_norm": 0.37535983324050903, + "learning_rate": 8.061736801442643e-05, + "loss": 1.8031, + "step": 2794 + }, + { + "epoch": 0.2903292822270697, + "grad_norm": 0.3890056610107422, + "learning_rate": 8.06044666851965e-05, + "loss": 1.718, + "step": 2795 + }, + { + "epoch": 0.29043315674665005, + "grad_norm": 0.44528940320014954, + "learning_rate": 8.059156209682959e-05, + "loss": 1.7655, + "step": 2796 + }, + { + "epoch": 0.29053703126623037, + "grad_norm": 0.3969075679779053, + "learning_rate": 8.057865425069995e-05, + "loss": 1.7541, + "step": 2797 + }, + { + "epoch": 0.29064090578581075, + "grad_norm": 0.37456732988357544, + "learning_rate": 8.056574314818215e-05, + "loss": 1.747, + "step": 2798 + }, + { + "epoch": 0.2907447803053911, + "grad_norm": 0.3638935387134552, + "learning_rate": 8.055282879065114e-05, + "loss": 1.6764, + "step": 2799 + }, + { + "epoch": 0.2908486548249714, + "grad_norm": 0.3996514081954956, + "learning_rate": 8.053991117948216e-05, + "loss": 1.7869, + "step": 2800 + }, + { + "epoch": 0.2909525293445518, + "grad_norm": 0.3886312246322632, + "learning_rate": 8.052699031605087e-05, + "loss": 1.8845, + "step": 2801 + }, + { + "epoch": 0.2910564038641321, + "grad_norm": 0.37657225131988525, + "learning_rate": 8.051406620173323e-05, + "loss": 1.6482, + "step": 2802 + }, + { + "epoch": 0.2911602783837125, + "grad_norm": 0.3608708083629608, + "learning_rate": 8.050113883790556e-05, + "loss": 1.6466, + "step": 2803 + }, + { + "epoch": 0.29126415290329283, + "grad_norm": 0.41230741143226624, + "learning_rate": 8.048820822594452e-05, + "loss": 1.8663, + "step": 2804 + }, + { + "epoch": 0.29136802742287315, + "grad_norm": 0.3793978989124298, + "learning_rate": 8.047527436722713e-05, + "loss": 1.7933, + "step": 2805 + }, + { + "epoch": 0.29147190194245354, + "grad_norm": 0.35737892985343933, + "learning_rate": 8.046233726313073e-05, + "loss": 1.6788, + "step": 2806 + }, + { + "epoch": 0.29157577646203386, + "grad_norm": 0.4132479429244995, + "learning_rate": 8.044939691503304e-05, + "loss": 1.7392, + "step": 2807 + }, + { + "epoch": 0.2916796509816142, + "grad_norm": 0.4083154499530792, + "learning_rate": 8.043645332431206e-05, + "loss": 1.6476, + "step": 2808 + }, + { + "epoch": 0.2917835255011946, + "grad_norm": 0.38581231236457825, + "learning_rate": 8.042350649234625e-05, + "loss": 1.7906, + "step": 2809 + }, + { + "epoch": 0.2918874000207749, + "grad_norm": 0.4001144766807556, + "learning_rate": 8.041055642051429e-05, + "loss": 1.5869, + "step": 2810 + }, + { + "epoch": 0.29199127454035523, + "grad_norm": 0.3624385893344879, + "learning_rate": 8.039760311019529e-05, + "loss": 1.7447, + "step": 2811 + }, + { + "epoch": 0.2920951490599356, + "grad_norm": 0.3716486394405365, + "learning_rate": 8.038464656276867e-05, + "loss": 1.8424, + "step": 2812 + }, + { + "epoch": 0.29219902357951594, + "grad_norm": 0.39141982793807983, + "learning_rate": 8.03716867796142e-05, + "loss": 1.6962, + "step": 2813 + }, + { + "epoch": 0.29230289809909626, + "grad_norm": 0.3740139901638031, + "learning_rate": 8.035872376211199e-05, + "loss": 1.6874, + "step": 2814 + }, + { + "epoch": 0.29240677261867665, + "grad_norm": 0.464403361082077, + "learning_rate": 8.03457575116425e-05, + "loss": 1.8638, + "step": 2815 + }, + { + "epoch": 0.292510647138257, + "grad_norm": 0.3929448425769806, + "learning_rate": 8.033278802958656e-05, + "loss": 1.808, + "step": 2816 + }, + { + "epoch": 0.29261452165783736, + "grad_norm": 0.4005575180053711, + "learning_rate": 8.031981531732529e-05, + "loss": 1.6983, + "step": 2817 + }, + { + "epoch": 0.2927183961774177, + "grad_norm": 0.3648793697357178, + "learning_rate": 8.03068393762402e-05, + "loss": 1.7543, + "step": 2818 + }, + { + "epoch": 0.292822270696998, + "grad_norm": 0.3590324819087982, + "learning_rate": 8.02938602077131e-05, + "loss": 1.8414, + "step": 2819 + }, + { + "epoch": 0.2929261452165784, + "grad_norm": 0.3629642426967621, + "learning_rate": 8.02808778131262e-05, + "loss": 1.8325, + "step": 2820 + }, + { + "epoch": 0.2930300197361587, + "grad_norm": 0.42311742901802063, + "learning_rate": 8.026789219386201e-05, + "loss": 1.7774, + "step": 2821 + }, + { + "epoch": 0.29313389425573905, + "grad_norm": 0.37744253873825073, + "learning_rate": 8.025490335130341e-05, + "loss": 1.7133, + "step": 2822 + }, + { + "epoch": 0.29323776877531943, + "grad_norm": 0.3763290047645569, + "learning_rate": 8.02419112868336e-05, + "loss": 1.7046, + "step": 2823 + }, + { + "epoch": 0.29334164329489976, + "grad_norm": 0.3736268877983093, + "learning_rate": 8.022891600183613e-05, + "loss": 1.6936, + "step": 2824 + }, + { + "epoch": 0.2934455178144801, + "grad_norm": 0.3851562440395355, + "learning_rate": 8.021591749769489e-05, + "loss": 1.7645, + "step": 2825 + }, + { + "epoch": 0.29354939233406047, + "grad_norm": 0.4103233814239502, + "learning_rate": 8.020291577579413e-05, + "loss": 1.8886, + "step": 2826 + }, + { + "epoch": 0.2936532668536408, + "grad_norm": 0.38178038597106934, + "learning_rate": 8.018991083751846e-05, + "loss": 1.7208, + "step": 2827 + }, + { + "epoch": 0.2937571413732212, + "grad_norm": 0.36348870396614075, + "learning_rate": 8.017690268425276e-05, + "loss": 1.6585, + "step": 2828 + }, + { + "epoch": 0.2938610158928015, + "grad_norm": 0.3771371841430664, + "learning_rate": 8.016389131738229e-05, + "loss": 1.7227, + "step": 2829 + }, + { + "epoch": 0.29396489041238183, + "grad_norm": 0.38507938385009766, + "learning_rate": 8.015087673829271e-05, + "loss": 1.8273, + "step": 2830 + }, + { + "epoch": 0.2940687649319622, + "grad_norm": 0.3727617561817169, + "learning_rate": 8.013785894836993e-05, + "loss": 1.6725, + "step": 2831 + }, + { + "epoch": 0.29417263945154254, + "grad_norm": 0.37300458550453186, + "learning_rate": 8.012483794900026e-05, + "loss": 1.729, + "step": 2832 + }, + { + "epoch": 0.29427651397112287, + "grad_norm": 0.4091015160083771, + "learning_rate": 8.011181374157034e-05, + "loss": 1.6771, + "step": 2833 + }, + { + "epoch": 0.29438038849070325, + "grad_norm": 0.4204038083553314, + "learning_rate": 8.009878632746712e-05, + "loss": 1.8316, + "step": 2834 + }, + { + "epoch": 0.2944842630102836, + "grad_norm": 0.4067396819591522, + "learning_rate": 8.008575570807795e-05, + "loss": 1.7071, + "step": 2835 + }, + { + "epoch": 0.2945881375298639, + "grad_norm": 0.4235256314277649, + "learning_rate": 8.007272188479049e-05, + "loss": 1.8557, + "step": 2836 + }, + { + "epoch": 0.2946920120494443, + "grad_norm": 0.3819893002510071, + "learning_rate": 8.005968485899269e-05, + "loss": 1.7695, + "step": 2837 + }, + { + "epoch": 0.2947958865690246, + "grad_norm": 0.3644767999649048, + "learning_rate": 8.004664463207294e-05, + "loss": 1.6902, + "step": 2838 + }, + { + "epoch": 0.29489976108860494, + "grad_norm": 0.36920779943466187, + "learning_rate": 8.003360120541992e-05, + "loss": 1.7109, + "step": 2839 + }, + { + "epoch": 0.2950036356081853, + "grad_norm": 0.3806985318660736, + "learning_rate": 8.002055458042264e-05, + "loss": 1.8152, + "step": 2840 + }, + { + "epoch": 0.29510751012776565, + "grad_norm": 0.4298724830150604, + "learning_rate": 8.000750475847045e-05, + "loss": 1.8906, + "step": 2841 + }, + { + "epoch": 0.29521138464734603, + "grad_norm": 0.3639010787010193, + "learning_rate": 7.999445174095308e-05, + "loss": 1.6325, + "step": 2842 + }, + { + "epoch": 0.29531525916692636, + "grad_norm": 0.36708855628967285, + "learning_rate": 7.998139552926057e-05, + "loss": 1.6156, + "step": 2843 + }, + { + "epoch": 0.2954191336865067, + "grad_norm": 0.3640385866165161, + "learning_rate": 7.996833612478331e-05, + "loss": 1.71, + "step": 2844 + }, + { + "epoch": 0.29552300820608707, + "grad_norm": 0.44802719354629517, + "learning_rate": 7.9955273528912e-05, + "loss": 1.7186, + "step": 2845 + }, + { + "epoch": 0.2956268827256674, + "grad_norm": 0.37511417269706726, + "learning_rate": 7.994220774303771e-05, + "loss": 1.5034, + "step": 2846 + }, + { + "epoch": 0.2957307572452477, + "grad_norm": 0.36259564757347107, + "learning_rate": 7.992913876855188e-05, + "loss": 1.6271, + "step": 2847 + }, + { + "epoch": 0.2958346317648281, + "grad_norm": 0.3824392855167389, + "learning_rate": 7.99160666068462e-05, + "loss": 1.7693, + "step": 2848 + }, + { + "epoch": 0.29593850628440843, + "grad_norm": 0.41231241822242737, + "learning_rate": 7.99029912593128e-05, + "loss": 1.5047, + "step": 2849 + }, + { + "epoch": 0.29604238080398876, + "grad_norm": 0.3836499750614166, + "learning_rate": 7.988991272734407e-05, + "loss": 1.6688, + "step": 2850 + }, + { + "epoch": 0.29614625532356914, + "grad_norm": 0.4020180106163025, + "learning_rate": 7.987683101233279e-05, + "loss": 1.7687, + "step": 2851 + }, + { + "epoch": 0.29625012984314947, + "grad_norm": 0.3831713795661926, + "learning_rate": 7.986374611567207e-05, + "loss": 1.7141, + "step": 2852 + }, + { + "epoch": 0.2963540043627298, + "grad_norm": 0.44648897647857666, + "learning_rate": 7.985065803875532e-05, + "loss": 1.9484, + "step": 2853 + }, + { + "epoch": 0.2964578788823102, + "grad_norm": 0.36257603764533997, + "learning_rate": 7.983756678297633e-05, + "loss": 1.6849, + "step": 2854 + }, + { + "epoch": 0.2965617534018905, + "grad_norm": 0.40492746233940125, + "learning_rate": 7.982447234972922e-05, + "loss": 1.7369, + "step": 2855 + }, + { + "epoch": 0.2966656279214709, + "grad_norm": 0.37346115708351135, + "learning_rate": 7.981137474040845e-05, + "loss": 1.7769, + "step": 2856 + }, + { + "epoch": 0.2967695024410512, + "grad_norm": 0.3808712959289551, + "learning_rate": 7.979827395640883e-05, + "loss": 1.6387, + "step": 2857 + }, + { + "epoch": 0.29687337696063154, + "grad_norm": 0.3708186149597168, + "learning_rate": 7.978516999912544e-05, + "loss": 1.7448, + "step": 2858 + }, + { + "epoch": 0.2969772514802119, + "grad_norm": 0.3663538992404938, + "learning_rate": 7.977206286995379e-05, + "loss": 1.7705, + "step": 2859 + }, + { + "epoch": 0.29708112599979225, + "grad_norm": 0.41986000537872314, + "learning_rate": 7.975895257028965e-05, + "loss": 1.9017, + "step": 2860 + }, + { + "epoch": 0.2971850005193726, + "grad_norm": 0.36581292748451233, + "learning_rate": 7.974583910152922e-05, + "loss": 1.5924, + "step": 2861 + }, + { + "epoch": 0.29728887503895296, + "grad_norm": 0.38697493076324463, + "learning_rate": 7.973272246506893e-05, + "loss": 1.7093, + "step": 2862 + }, + { + "epoch": 0.2973927495585333, + "grad_norm": 0.38015836477279663, + "learning_rate": 7.971960266230565e-05, + "loss": 1.7662, + "step": 2863 + }, + { + "epoch": 0.2974966240781136, + "grad_norm": 0.37382447719573975, + "learning_rate": 7.97064796946365e-05, + "loss": 1.6753, + "step": 2864 + }, + { + "epoch": 0.297600498597694, + "grad_norm": 0.42534875869750977, + "learning_rate": 7.969335356345894e-05, + "loss": 2.0455, + "step": 2865 + }, + { + "epoch": 0.2977043731172743, + "grad_norm": 0.39144277572631836, + "learning_rate": 7.968022427017088e-05, + "loss": 1.4737, + "step": 2866 + }, + { + "epoch": 0.2978082476368547, + "grad_norm": 0.3727049231529236, + "learning_rate": 7.966709181617045e-05, + "loss": 1.8034, + "step": 2867 + }, + { + "epoch": 0.29791212215643503, + "grad_norm": 0.3644360601902008, + "learning_rate": 7.965395620285616e-05, + "loss": 1.7262, + "step": 2868 + }, + { + "epoch": 0.29801599667601536, + "grad_norm": 0.40929368138313293, + "learning_rate": 7.964081743162684e-05, + "loss": 1.8834, + "step": 2869 + }, + { + "epoch": 0.29811987119559574, + "grad_norm": 0.40274330973625183, + "learning_rate": 7.962767550388166e-05, + "loss": 1.6383, + "step": 2870 + }, + { + "epoch": 0.29822374571517607, + "grad_norm": 0.38310062885284424, + "learning_rate": 7.961453042102014e-05, + "loss": 1.6704, + "step": 2871 + }, + { + "epoch": 0.2983276202347564, + "grad_norm": 0.3942394554615021, + "learning_rate": 7.960138218444215e-05, + "loss": 1.8623, + "step": 2872 + }, + { + "epoch": 0.2984314947543368, + "grad_norm": 0.3523310124874115, + "learning_rate": 7.958823079554785e-05, + "loss": 1.7932, + "step": 2873 + }, + { + "epoch": 0.2985353692739171, + "grad_norm": 0.36860281229019165, + "learning_rate": 7.957507625573776e-05, + "loss": 1.7049, + "step": 2874 + }, + { + "epoch": 0.29863924379349743, + "grad_norm": 0.37940865755081177, + "learning_rate": 7.956191856641276e-05, + "loss": 1.682, + "step": 2875 + }, + { + "epoch": 0.2987431183130778, + "grad_norm": 0.4322430193424225, + "learning_rate": 7.9548757728974e-05, + "loss": 1.9822, + "step": 2876 + }, + { + "epoch": 0.29884699283265814, + "grad_norm": 0.39275041222572327, + "learning_rate": 7.953559374482305e-05, + "loss": 1.8279, + "step": 2877 + }, + { + "epoch": 0.29895086735223847, + "grad_norm": 0.3673188388347626, + "learning_rate": 7.952242661536175e-05, + "loss": 1.6634, + "step": 2878 + }, + { + "epoch": 0.29905474187181885, + "grad_norm": 0.39935851097106934, + "learning_rate": 7.950925634199228e-05, + "loss": 1.7705, + "step": 2879 + }, + { + "epoch": 0.2991586163913992, + "grad_norm": 0.4122634530067444, + "learning_rate": 7.949608292611721e-05, + "loss": 1.7795, + "step": 2880 + }, + { + "epoch": 0.29926249091097956, + "grad_norm": 0.40487560629844666, + "learning_rate": 7.948290636913939e-05, + "loss": 1.9119, + "step": 2881 + }, + { + "epoch": 0.2993663654305599, + "grad_norm": 0.3445805013179779, + "learning_rate": 7.946972667246201e-05, + "loss": 1.5662, + "step": 2882 + }, + { + "epoch": 0.2994702399501402, + "grad_norm": 0.3658605217933655, + "learning_rate": 7.945654383748861e-05, + "loss": 1.5546, + "step": 2883 + }, + { + "epoch": 0.2995741144697206, + "grad_norm": 0.3820507824420929, + "learning_rate": 7.944335786562307e-05, + "loss": 1.5994, + "step": 2884 + }, + { + "epoch": 0.2996779889893009, + "grad_norm": 0.4138948321342468, + "learning_rate": 7.943016875826957e-05, + "loss": 1.829, + "step": 2885 + }, + { + "epoch": 0.29978186350888125, + "grad_norm": 0.35458049178123474, + "learning_rate": 7.941697651683267e-05, + "loss": 1.6141, + "step": 2886 + }, + { + "epoch": 0.29988573802846163, + "grad_norm": 0.4207065999507904, + "learning_rate": 7.940378114271723e-05, + "loss": 1.837, + "step": 2887 + }, + { + "epoch": 0.29998961254804196, + "grad_norm": 0.35922253131866455, + "learning_rate": 7.939058263732846e-05, + "loss": 1.5883, + "step": 2888 + }, + { + "epoch": 0.3000934870676223, + "grad_norm": 0.38362789154052734, + "learning_rate": 7.93773810020719e-05, + "loss": 1.7448, + "step": 2889 + }, + { + "epoch": 0.30019736158720267, + "grad_norm": 0.39529237151145935, + "learning_rate": 7.93641762383534e-05, + "loss": 1.7747, + "step": 2890 + }, + { + "epoch": 0.300301236106783, + "grad_norm": 0.37079519033432007, + "learning_rate": 7.93509683475792e-05, + "loss": 1.6808, + "step": 2891 + }, + { + "epoch": 0.3004051106263634, + "grad_norm": 0.37962087988853455, + "learning_rate": 7.93377573311558e-05, + "loss": 1.6966, + "step": 2892 + }, + { + "epoch": 0.3005089851459437, + "grad_norm": 0.36086609959602356, + "learning_rate": 7.932454319049008e-05, + "loss": 1.6517, + "step": 2893 + }, + { + "epoch": 0.30061285966552403, + "grad_norm": 0.3676837980747223, + "learning_rate": 7.931132592698927e-05, + "loss": 1.5688, + "step": 2894 + }, + { + "epoch": 0.3007167341851044, + "grad_norm": 0.38151493668556213, + "learning_rate": 7.929810554206088e-05, + "loss": 1.7309, + "step": 2895 + }, + { + "epoch": 0.30082060870468474, + "grad_norm": 0.38594383001327515, + "learning_rate": 7.928488203711279e-05, + "loss": 1.7323, + "step": 2896 + }, + { + "epoch": 0.30092448322426507, + "grad_norm": 0.3639463186264038, + "learning_rate": 7.927165541355319e-05, + "loss": 1.6411, + "step": 2897 + }, + { + "epoch": 0.30102835774384545, + "grad_norm": 0.36965441703796387, + "learning_rate": 7.92584256727906e-05, + "loss": 1.6233, + "step": 2898 + }, + { + "epoch": 0.3011322322634258, + "grad_norm": 0.40190815925598145, + "learning_rate": 7.924519281623393e-05, + "loss": 1.8847, + "step": 2899 + }, + { + "epoch": 0.3012361067830061, + "grad_norm": 0.3761938214302063, + "learning_rate": 7.923195684529232e-05, + "loss": 1.6344, + "step": 2900 + }, + { + "epoch": 0.3013399813025865, + "grad_norm": 0.39748868346214294, + "learning_rate": 7.921871776137533e-05, + "loss": 1.9062, + "step": 2901 + }, + { + "epoch": 0.3014438558221668, + "grad_norm": 0.3873717188835144, + "learning_rate": 7.920547556589282e-05, + "loss": 1.8971, + "step": 2902 + }, + { + "epoch": 0.30154773034174714, + "grad_norm": 0.39114367961883545, + "learning_rate": 7.919223026025498e-05, + "loss": 1.7381, + "step": 2903 + }, + { + "epoch": 0.3016516048613275, + "grad_norm": 0.38005557656288147, + "learning_rate": 7.917898184587231e-05, + "loss": 1.8914, + "step": 2904 + }, + { + "epoch": 0.30175547938090785, + "grad_norm": 0.3953699469566345, + "learning_rate": 7.916573032415569e-05, + "loss": 1.8567, + "step": 2905 + }, + { + "epoch": 0.30185935390048824, + "grad_norm": 0.3641657829284668, + "learning_rate": 7.915247569651627e-05, + "loss": 1.5998, + "step": 2906 + }, + { + "epoch": 0.30196322842006856, + "grad_norm": 0.35653895139694214, + "learning_rate": 7.913921796436561e-05, + "loss": 1.5639, + "step": 2907 + }, + { + "epoch": 0.3020671029396489, + "grad_norm": 0.3951781690120697, + "learning_rate": 7.912595712911552e-05, + "loss": 1.7428, + "step": 2908 + }, + { + "epoch": 0.3021709774592293, + "grad_norm": 0.47132858633995056, + "learning_rate": 7.91126931921782e-05, + "loss": 1.7877, + "step": 2909 + }, + { + "epoch": 0.3022748519788096, + "grad_norm": 0.4093819260597229, + "learning_rate": 7.909942615496613e-05, + "loss": 1.8667, + "step": 2910 + }, + { + "epoch": 0.3023787264983899, + "grad_norm": 0.43610236048698425, + "learning_rate": 7.908615601889218e-05, + "loss": 1.6627, + "step": 2911 + }, + { + "epoch": 0.3024826010179703, + "grad_norm": 0.42912378907203674, + "learning_rate": 7.907288278536947e-05, + "loss": 1.6026, + "step": 2912 + }, + { + "epoch": 0.30258647553755064, + "grad_norm": 0.4064681828022003, + "learning_rate": 7.905960645581152e-05, + "loss": 1.7998, + "step": 2913 + }, + { + "epoch": 0.30269035005713096, + "grad_norm": 0.373544842004776, + "learning_rate": 7.904632703163218e-05, + "loss": 1.6253, + "step": 2914 + }, + { + "epoch": 0.30279422457671135, + "grad_norm": 0.376240074634552, + "learning_rate": 7.903304451424556e-05, + "loss": 1.7304, + "step": 2915 + }, + { + "epoch": 0.3028980990962917, + "grad_norm": 0.3998112678527832, + "learning_rate": 7.901975890506616e-05, + "loss": 1.7403, + "step": 2916 + }, + { + "epoch": 0.303001973615872, + "grad_norm": 0.38144102692604065, + "learning_rate": 7.900647020550882e-05, + "loss": 1.7706, + "step": 2917 + }, + { + "epoch": 0.3031058481354524, + "grad_norm": 0.370951771736145, + "learning_rate": 7.899317841698864e-05, + "loss": 1.7351, + "step": 2918 + }, + { + "epoch": 0.3032097226550327, + "grad_norm": 0.41188183426856995, + "learning_rate": 7.897988354092113e-05, + "loss": 1.8509, + "step": 2919 + }, + { + "epoch": 0.3033135971746131, + "grad_norm": 0.4149079918861389, + "learning_rate": 7.896658557872207e-05, + "loss": 1.6496, + "step": 2920 + }, + { + "epoch": 0.3034174716941934, + "grad_norm": 0.38840773701667786, + "learning_rate": 7.89532845318076e-05, + "loss": 1.4942, + "step": 2921 + }, + { + "epoch": 0.30352134621377375, + "grad_norm": 0.4095969498157501, + "learning_rate": 7.893998040159418e-05, + "loss": 1.5979, + "step": 2922 + }, + { + "epoch": 0.30362522073335413, + "grad_norm": 0.3632447123527527, + "learning_rate": 7.892667318949855e-05, + "loss": 1.5617, + "step": 2923 + }, + { + "epoch": 0.30372909525293446, + "grad_norm": 0.34822526574134827, + "learning_rate": 7.891336289693789e-05, + "loss": 1.6198, + "step": 2924 + }, + { + "epoch": 0.3038329697725148, + "grad_norm": 0.3828188180923462, + "learning_rate": 7.890004952532962e-05, + "loss": 1.7494, + "step": 2925 + }, + { + "epoch": 0.30393684429209517, + "grad_norm": 0.37072688341140747, + "learning_rate": 7.888673307609149e-05, + "loss": 1.5492, + "step": 2926 + }, + { + "epoch": 0.3040407188116755, + "grad_norm": 0.3795013427734375, + "learning_rate": 7.887341355064162e-05, + "loss": 1.6222, + "step": 2927 + }, + { + "epoch": 0.3041445933312558, + "grad_norm": 0.40049639344215393, + "learning_rate": 7.886009095039843e-05, + "loss": 1.8504, + "step": 2928 + }, + { + "epoch": 0.3042484678508362, + "grad_norm": 0.4613747000694275, + "learning_rate": 7.884676527678068e-05, + "loss": 2.1015, + "step": 2929 + }, + { + "epoch": 0.30435234237041653, + "grad_norm": 0.3635263442993164, + "learning_rate": 7.883343653120743e-05, + "loss": 1.6364, + "step": 2930 + }, + { + "epoch": 0.3044562168899969, + "grad_norm": 0.4116704761981964, + "learning_rate": 7.882010471509809e-05, + "loss": 1.5827, + "step": 2931 + }, + { + "epoch": 0.30456009140957724, + "grad_norm": 0.41490453481674194, + "learning_rate": 7.88067698298724e-05, + "loss": 1.9797, + "step": 2932 + }, + { + "epoch": 0.30466396592915757, + "grad_norm": 0.3712293207645416, + "learning_rate": 7.879343187695045e-05, + "loss": 1.7037, + "step": 2933 + }, + { + "epoch": 0.30476784044873795, + "grad_norm": 0.37144172191619873, + "learning_rate": 7.878009085775257e-05, + "loss": 1.7411, + "step": 2934 + }, + { + "epoch": 0.3048717149683183, + "grad_norm": 0.38958555459976196, + "learning_rate": 7.876674677369954e-05, + "loss": 1.7444, + "step": 2935 + }, + { + "epoch": 0.3049755894878986, + "grad_norm": 0.3905205726623535, + "learning_rate": 7.875339962621235e-05, + "loss": 1.8388, + "step": 2936 + }, + { + "epoch": 0.305079464007479, + "grad_norm": 0.4097367525100708, + "learning_rate": 7.874004941671239e-05, + "loss": 1.7877, + "step": 2937 + }, + { + "epoch": 0.3051833385270593, + "grad_norm": 0.39029520750045776, + "learning_rate": 7.872669614662135e-05, + "loss": 1.7491, + "step": 2938 + }, + { + "epoch": 0.30528721304663964, + "grad_norm": 0.3893304467201233, + "learning_rate": 7.871333981736124e-05, + "loss": 1.7892, + "step": 2939 + }, + { + "epoch": 0.30539108756622, + "grad_norm": 0.39329490065574646, + "learning_rate": 7.869998043035442e-05, + "loss": 1.7247, + "step": 2940 + }, + { + "epoch": 0.30549496208580035, + "grad_norm": 0.4152490496635437, + "learning_rate": 7.868661798702355e-05, + "loss": 1.767, + "step": 2941 + }, + { + "epoch": 0.3055988366053807, + "grad_norm": 0.40060338377952576, + "learning_rate": 7.867325248879163e-05, + "loss": 1.6133, + "step": 2942 + }, + { + "epoch": 0.30570271112496106, + "grad_norm": 0.422410786151886, + "learning_rate": 7.865988393708197e-05, + "loss": 1.6876, + "step": 2943 + }, + { + "epoch": 0.3058065856445414, + "grad_norm": 0.4146443009376526, + "learning_rate": 7.864651233331823e-05, + "loss": 1.7866, + "step": 2944 + }, + { + "epoch": 0.30591046016412177, + "grad_norm": 0.363182008266449, + "learning_rate": 7.863313767892438e-05, + "loss": 1.5605, + "step": 2945 + }, + { + "epoch": 0.3060143346837021, + "grad_norm": 0.4126920998096466, + "learning_rate": 7.861975997532471e-05, + "loss": 1.8965, + "step": 2946 + }, + { + "epoch": 0.3061182092032824, + "grad_norm": 0.4038070738315582, + "learning_rate": 7.860637922394387e-05, + "loss": 1.8668, + "step": 2947 + }, + { + "epoch": 0.3062220837228628, + "grad_norm": 0.3751528859138489, + "learning_rate": 7.859299542620675e-05, + "loss": 1.8258, + "step": 2948 + }, + { + "epoch": 0.30632595824244313, + "grad_norm": 0.422244668006897, + "learning_rate": 7.857960858353866e-05, + "loss": 1.8309, + "step": 2949 + }, + { + "epoch": 0.30642983276202346, + "grad_norm": 0.37908682227134705, + "learning_rate": 7.85662186973652e-05, + "loss": 1.7936, + "step": 2950 + }, + { + "epoch": 0.30653370728160384, + "grad_norm": 0.36694470047950745, + "learning_rate": 7.855282576911224e-05, + "loss": 1.7259, + "step": 2951 + }, + { + "epoch": 0.30663758180118417, + "grad_norm": 0.3961583077907562, + "learning_rate": 7.85394298002061e-05, + "loss": 1.7949, + "step": 2952 + }, + { + "epoch": 0.3067414563207645, + "grad_norm": 0.3951317071914673, + "learning_rate": 7.852603079207328e-05, + "loss": 1.6679, + "step": 2953 + }, + { + "epoch": 0.3068453308403449, + "grad_norm": 0.3763517737388611, + "learning_rate": 7.851262874614069e-05, + "loss": 1.6965, + "step": 2954 + }, + { + "epoch": 0.3069492053599252, + "grad_norm": 0.3869110941886902, + "learning_rate": 7.849922366383555e-05, + "loss": 1.6044, + "step": 2955 + }, + { + "epoch": 0.30705307987950553, + "grad_norm": 0.4839021563529968, + "learning_rate": 7.848581554658539e-05, + "loss": 2.1448, + "step": 2956 + }, + { + "epoch": 0.3071569543990859, + "grad_norm": 0.4335784912109375, + "learning_rate": 7.847240439581806e-05, + "loss": 1.7972, + "step": 2957 + }, + { + "epoch": 0.30726082891866624, + "grad_norm": 0.5460455417633057, + "learning_rate": 7.845899021296178e-05, + "loss": 1.732, + "step": 2958 + }, + { + "epoch": 0.3073647034382466, + "grad_norm": 0.39001351594924927, + "learning_rate": 7.844557299944501e-05, + "loss": 1.7266, + "step": 2959 + }, + { + "epoch": 0.30746857795782695, + "grad_norm": 0.39050352573394775, + "learning_rate": 7.843215275669663e-05, + "loss": 1.7042, + "step": 2960 + }, + { + "epoch": 0.3075724524774073, + "grad_norm": 0.3831944465637207, + "learning_rate": 7.841872948614573e-05, + "loss": 1.7086, + "step": 2961 + }, + { + "epoch": 0.30767632699698766, + "grad_norm": 0.4240471422672272, + "learning_rate": 7.840530318922181e-05, + "loss": 1.7215, + "step": 2962 + }, + { + "epoch": 0.307780201516568, + "grad_norm": 0.3831408619880676, + "learning_rate": 7.839187386735469e-05, + "loss": 1.7995, + "step": 2963 + }, + { + "epoch": 0.3078840760361483, + "grad_norm": 0.43340522050857544, + "learning_rate": 7.837844152197447e-05, + "loss": 1.7028, + "step": 2964 + }, + { + "epoch": 0.3079879505557287, + "grad_norm": 0.3613108992576599, + "learning_rate": 7.836500615451156e-05, + "loss": 1.4274, + "step": 2965 + }, + { + "epoch": 0.308091825075309, + "grad_norm": 0.4255940914154053, + "learning_rate": 7.835156776639679e-05, + "loss": 1.5747, + "step": 2966 + }, + { + "epoch": 0.30819569959488935, + "grad_norm": 0.3656274974346161, + "learning_rate": 7.833812635906118e-05, + "loss": 1.6061, + "step": 2967 + }, + { + "epoch": 0.30829957411446973, + "grad_norm": 0.4550713002681732, + "learning_rate": 7.832468193393616e-05, + "loss": 1.8704, + "step": 2968 + }, + { + "epoch": 0.30840344863405006, + "grad_norm": 0.40488240122795105, + "learning_rate": 7.831123449245345e-05, + "loss": 1.7132, + "step": 2969 + }, + { + "epoch": 0.30850732315363044, + "grad_norm": 0.3986703157424927, + "learning_rate": 7.829778403604513e-05, + "loss": 1.8511, + "step": 2970 + }, + { + "epoch": 0.30861119767321077, + "grad_norm": 0.41377493739128113, + "learning_rate": 7.828433056614351e-05, + "loss": 1.8369, + "step": 2971 + }, + { + "epoch": 0.3087150721927911, + "grad_norm": 0.3639225959777832, + "learning_rate": 7.827087408418132e-05, + "loss": 1.7907, + "step": 2972 + }, + { + "epoch": 0.3088189467123715, + "grad_norm": 0.3887461721897125, + "learning_rate": 7.825741459159157e-05, + "loss": 1.7639, + "step": 2973 + }, + { + "epoch": 0.3089228212319518, + "grad_norm": 0.38962462544441223, + "learning_rate": 7.824395208980758e-05, + "loss": 1.7871, + "step": 2974 + }, + { + "epoch": 0.30902669575153213, + "grad_norm": 0.43991488218307495, + "learning_rate": 7.8230486580263e-05, + "loss": 1.988, + "step": 2975 + }, + { + "epoch": 0.3091305702711125, + "grad_norm": 0.4023156464099884, + "learning_rate": 7.821701806439179e-05, + "loss": 1.812, + "step": 2976 + }, + { + "epoch": 0.30923444479069284, + "grad_norm": 0.37170884013175964, + "learning_rate": 7.820354654362828e-05, + "loss": 1.7231, + "step": 2977 + }, + { + "epoch": 0.30933831931027317, + "grad_norm": 0.38935086131095886, + "learning_rate": 7.819007201940706e-05, + "loss": 1.7493, + "step": 2978 + }, + { + "epoch": 0.30944219382985355, + "grad_norm": 0.3613695204257965, + "learning_rate": 7.817659449316305e-05, + "loss": 1.621, + "step": 2979 + }, + { + "epoch": 0.3095460683494339, + "grad_norm": 0.3804933726787567, + "learning_rate": 7.816311396633152e-05, + "loss": 1.7622, + "step": 2980 + }, + { + "epoch": 0.3096499428690142, + "grad_norm": 0.3554391860961914, + "learning_rate": 7.814963044034802e-05, + "loss": 1.615, + "step": 2981 + }, + { + "epoch": 0.3097538173885946, + "grad_norm": 0.42907533049583435, + "learning_rate": 7.813614391664847e-05, + "loss": 1.9264, + "step": 2982 + }, + { + "epoch": 0.3098576919081749, + "grad_norm": 0.46726834774017334, + "learning_rate": 7.812265439666906e-05, + "loss": 1.6321, + "step": 2983 + }, + { + "epoch": 0.3099615664277553, + "grad_norm": 0.43149927258491516, + "learning_rate": 7.81091618818463e-05, + "loss": 1.902, + "step": 2984 + }, + { + "epoch": 0.3100654409473356, + "grad_norm": 0.42148879170417786, + "learning_rate": 7.80956663736171e-05, + "loss": 1.8288, + "step": 2985 + }, + { + "epoch": 0.31016931546691595, + "grad_norm": 0.39359238743782043, + "learning_rate": 7.808216787341855e-05, + "loss": 1.7853, + "step": 2986 + }, + { + "epoch": 0.31027318998649633, + "grad_norm": 0.37277668714523315, + "learning_rate": 7.806866638268818e-05, + "loss": 1.702, + "step": 2987 + }, + { + "epoch": 0.31037706450607666, + "grad_norm": 0.3854857087135315, + "learning_rate": 7.80551619028638e-05, + "loss": 1.7776, + "step": 2988 + }, + { + "epoch": 0.310480939025657, + "grad_norm": 0.3613017201423645, + "learning_rate": 7.80416544353835e-05, + "loss": 1.6451, + "step": 2989 + }, + { + "epoch": 0.31058481354523737, + "grad_norm": 0.3885599970817566, + "learning_rate": 7.802814398168572e-05, + "loss": 1.8862, + "step": 2990 + }, + { + "epoch": 0.3106886880648177, + "grad_norm": 0.3823319375514984, + "learning_rate": 7.801463054320927e-05, + "loss": 1.7783, + "step": 2991 + }, + { + "epoch": 0.310792562584398, + "grad_norm": 0.43862950801849365, + "learning_rate": 7.800111412139318e-05, + "loss": 1.654, + "step": 2992 + }, + { + "epoch": 0.3108964371039784, + "grad_norm": 0.4039106070995331, + "learning_rate": 7.798759471767684e-05, + "loss": 1.7425, + "step": 2993 + }, + { + "epoch": 0.31100031162355873, + "grad_norm": 0.3942197263240814, + "learning_rate": 7.797407233349998e-05, + "loss": 1.7184, + "step": 2994 + }, + { + "epoch": 0.31110418614313906, + "grad_norm": 0.35280346870422363, + "learning_rate": 7.796054697030262e-05, + "loss": 1.5123, + "step": 2995 + }, + { + "epoch": 0.31120806066271944, + "grad_norm": 0.38291507959365845, + "learning_rate": 7.794701862952512e-05, + "loss": 1.7482, + "step": 2996 + }, + { + "epoch": 0.31131193518229977, + "grad_norm": 0.4096364378929138, + "learning_rate": 7.793348731260813e-05, + "loss": 1.8343, + "step": 2997 + }, + { + "epoch": 0.31141580970188015, + "grad_norm": 0.40099769830703735, + "learning_rate": 7.79199530209926e-05, + "loss": 1.655, + "step": 2998 + }, + { + "epoch": 0.3115196842214605, + "grad_norm": 0.3833656907081604, + "learning_rate": 7.79064157561199e-05, + "loss": 1.7954, + "step": 2999 + }, + { + "epoch": 0.3116235587410408, + "grad_norm": 0.3824611008167267, + "learning_rate": 7.789287551943158e-05, + "loss": 1.5854, + "step": 3000 + }, + { + "epoch": 0.3117274332606212, + "grad_norm": 0.3726818561553955, + "learning_rate": 7.787933231236958e-05, + "loss": 1.6898, + "step": 3001 + }, + { + "epoch": 0.3118313077802015, + "grad_norm": 0.37149757146835327, + "learning_rate": 7.786578613637617e-05, + "loss": 1.6463, + "step": 3002 + }, + { + "epoch": 0.31193518229978184, + "grad_norm": 0.42311716079711914, + "learning_rate": 7.78522369928939e-05, + "loss": 1.951, + "step": 3003 + }, + { + "epoch": 0.3120390568193622, + "grad_norm": 0.47437840700149536, + "learning_rate": 7.783868488336563e-05, + "loss": 1.6597, + "step": 3004 + }, + { + "epoch": 0.31214293133894255, + "grad_norm": 0.3761157989501953, + "learning_rate": 7.782512980923459e-05, + "loss": 1.8036, + "step": 3005 + }, + { + "epoch": 0.3122468058585229, + "grad_norm": 0.3912384510040283, + "learning_rate": 7.781157177194426e-05, + "loss": 1.6382, + "step": 3006 + }, + { + "epoch": 0.31235068037810326, + "grad_norm": 0.3848210871219635, + "learning_rate": 7.779801077293847e-05, + "loss": 1.7521, + "step": 3007 + }, + { + "epoch": 0.3124545548976836, + "grad_norm": 0.3805733323097229, + "learning_rate": 7.778444681366137e-05, + "loss": 1.7019, + "step": 3008 + }, + { + "epoch": 0.31255842941726397, + "grad_norm": 0.38343513011932373, + "learning_rate": 7.777087989555741e-05, + "loss": 1.7686, + "step": 3009 + }, + { + "epoch": 0.3126623039368443, + "grad_norm": 0.37739554047584534, + "learning_rate": 7.775731002007138e-05, + "loss": 1.5242, + "step": 3010 + }, + { + "epoch": 0.3127661784564246, + "grad_norm": 0.37050339579582214, + "learning_rate": 7.774373718864833e-05, + "loss": 1.771, + "step": 3011 + }, + { + "epoch": 0.312870052976005, + "grad_norm": 0.39899301528930664, + "learning_rate": 7.773016140273368e-05, + "loss": 1.8089, + "step": 3012 + }, + { + "epoch": 0.31297392749558534, + "grad_norm": 0.3865603506565094, + "learning_rate": 7.771658266377315e-05, + "loss": 1.6336, + "step": 3013 + }, + { + "epoch": 0.31307780201516566, + "grad_norm": 0.3612881302833557, + "learning_rate": 7.770300097321276e-05, + "loss": 1.4883, + "step": 3014 + }, + { + "epoch": 0.31318167653474605, + "grad_norm": 0.422632098197937, + "learning_rate": 7.768941633249884e-05, + "loss": 1.8948, + "step": 3015 + }, + { + "epoch": 0.31328555105432637, + "grad_norm": 0.3684242069721222, + "learning_rate": 7.76758287430781e-05, + "loss": 1.702, + "step": 3016 + }, + { + "epoch": 0.3133894255739067, + "grad_norm": 0.36709776520729065, + "learning_rate": 7.766223820639747e-05, + "loss": 1.687, + "step": 3017 + }, + { + "epoch": 0.3134933000934871, + "grad_norm": 0.39332762360572815, + "learning_rate": 7.764864472390423e-05, + "loss": 1.7333, + "step": 3018 + }, + { + "epoch": 0.3135971746130674, + "grad_norm": 0.3547021448612213, + "learning_rate": 7.7635048297046e-05, + "loss": 1.6883, + "step": 3019 + }, + { + "epoch": 0.31370104913264774, + "grad_norm": 0.393284410238266, + "learning_rate": 7.762144892727069e-05, + "loss": 1.9121, + "step": 3020 + }, + { + "epoch": 0.3138049236522281, + "grad_norm": 0.4200425148010254, + "learning_rate": 7.760784661602653e-05, + "loss": 1.8525, + "step": 3021 + }, + { + "epoch": 0.31390879817180845, + "grad_norm": 0.3398437798023224, + "learning_rate": 7.759424136476206e-05, + "loss": 1.529, + "step": 3022 + }, + { + "epoch": 0.31401267269138883, + "grad_norm": 0.3940331041812897, + "learning_rate": 7.75806331749261e-05, + "loss": 1.8153, + "step": 3023 + }, + { + "epoch": 0.31411654721096915, + "grad_norm": 0.37523049116134644, + "learning_rate": 7.756702204796786e-05, + "loss": 1.6807, + "step": 3024 + }, + { + "epoch": 0.3142204217305495, + "grad_norm": 0.3940492868423462, + "learning_rate": 7.75534079853368e-05, + "loss": 1.8526, + "step": 3025 + }, + { + "epoch": 0.31432429625012986, + "grad_norm": 0.40810003876686096, + "learning_rate": 7.753979098848272e-05, + "loss": 1.6186, + "step": 3026 + }, + { + "epoch": 0.3144281707697102, + "grad_norm": 0.3908647894859314, + "learning_rate": 7.752617105885574e-05, + "loss": 1.6896, + "step": 3027 + }, + { + "epoch": 0.3145320452892905, + "grad_norm": 0.35582828521728516, + "learning_rate": 7.751254819790623e-05, + "loss": 1.719, + "step": 3028 + }, + { + "epoch": 0.3146359198088709, + "grad_norm": 0.3710813820362091, + "learning_rate": 7.749892240708494e-05, + "loss": 1.7155, + "step": 3029 + }, + { + "epoch": 0.31473979432845123, + "grad_norm": 0.4214909076690674, + "learning_rate": 7.748529368784292e-05, + "loss": 1.8792, + "step": 3030 + }, + { + "epoch": 0.31484366884803155, + "grad_norm": 0.4322544038295746, + "learning_rate": 7.747166204163152e-05, + "loss": 1.8348, + "step": 3031 + }, + { + "epoch": 0.31494754336761194, + "grad_norm": 0.38530245423316956, + "learning_rate": 7.745802746990239e-05, + "loss": 1.7037, + "step": 3032 + }, + { + "epoch": 0.31505141788719226, + "grad_norm": 0.3923969268798828, + "learning_rate": 7.744438997410752e-05, + "loss": 1.6523, + "step": 3033 + }, + { + "epoch": 0.3151552924067726, + "grad_norm": 0.3855164349079132, + "learning_rate": 7.74307495556992e-05, + "loss": 1.705, + "step": 3034 + }, + { + "epoch": 0.315259166926353, + "grad_norm": 0.3986038267612457, + "learning_rate": 7.741710621613002e-05, + "loss": 1.9339, + "step": 3035 + }, + { + "epoch": 0.3153630414459333, + "grad_norm": 0.42609813809394836, + "learning_rate": 7.740345995685287e-05, + "loss": 1.8712, + "step": 3036 + }, + { + "epoch": 0.3154669159655137, + "grad_norm": 0.4008728563785553, + "learning_rate": 7.738981077932099e-05, + "loss": 1.8456, + "step": 3037 + }, + { + "epoch": 0.315570790485094, + "grad_norm": 0.4060449004173279, + "learning_rate": 7.737615868498793e-05, + "loss": 1.7126, + "step": 3038 + }, + { + "epoch": 0.31567466500467434, + "grad_norm": 0.36810052394866943, + "learning_rate": 7.736250367530751e-05, + "loss": 1.6292, + "step": 3039 + }, + { + "epoch": 0.3157785395242547, + "grad_norm": 0.39102548360824585, + "learning_rate": 7.734884575173387e-05, + "loss": 1.7703, + "step": 3040 + }, + { + "epoch": 0.31588241404383505, + "grad_norm": 0.4772125780582428, + "learning_rate": 7.73351849157215e-05, + "loss": 1.7719, + "step": 3041 + }, + { + "epoch": 0.3159862885634154, + "grad_norm": 0.40599432587623596, + "learning_rate": 7.732152116872515e-05, + "loss": 1.6538, + "step": 3042 + }, + { + "epoch": 0.31609016308299576, + "grad_norm": 0.4301255941390991, + "learning_rate": 7.730785451219991e-05, + "loss": 1.755, + "step": 3043 + }, + { + "epoch": 0.3161940376025761, + "grad_norm": 0.386177122592926, + "learning_rate": 7.729418494760119e-05, + "loss": 1.5637, + "step": 3044 + }, + { + "epoch": 0.3162979121221564, + "grad_norm": 0.3636477589607239, + "learning_rate": 7.728051247638468e-05, + "loss": 1.5991, + "step": 3045 + }, + { + "epoch": 0.3164017866417368, + "grad_norm": 0.38973626494407654, + "learning_rate": 7.726683710000637e-05, + "loss": 1.7331, + "step": 3046 + }, + { + "epoch": 0.3165056611613171, + "grad_norm": 0.42155221104621887, + "learning_rate": 7.72531588199226e-05, + "loss": 1.8179, + "step": 3047 + }, + { + "epoch": 0.3166095356808975, + "grad_norm": 0.4188525080680847, + "learning_rate": 7.723947763759e-05, + "loss": 1.7464, + "step": 3048 + }, + { + "epoch": 0.31671341020047783, + "grad_norm": 0.38648781180381775, + "learning_rate": 7.722579355446551e-05, + "loss": 1.5823, + "step": 3049 + }, + { + "epoch": 0.31681728472005816, + "grad_norm": 0.4101918339729309, + "learning_rate": 7.721210657200637e-05, + "loss": 1.7697, + "step": 3050 + }, + { + "epoch": 0.31692115923963854, + "grad_norm": 0.366359680891037, + "learning_rate": 7.719841669167014e-05, + "loss": 1.6861, + "step": 3051 + }, + { + "epoch": 0.31702503375921887, + "grad_norm": 0.4024447202682495, + "learning_rate": 7.71847239149147e-05, + "loss": 1.9079, + "step": 3052 + }, + { + "epoch": 0.3171289082787992, + "grad_norm": 0.3973731994628906, + "learning_rate": 7.717102824319822e-05, + "loss": 1.8736, + "step": 3053 + }, + { + "epoch": 0.3172327827983796, + "grad_norm": 0.42864885926246643, + "learning_rate": 7.715732967797915e-05, + "loss": 1.7298, + "step": 3054 + }, + { + "epoch": 0.3173366573179599, + "grad_norm": 0.37840789556503296, + "learning_rate": 7.714362822071632e-05, + "loss": 1.8096, + "step": 3055 + }, + { + "epoch": 0.31744053183754023, + "grad_norm": 0.3534778654575348, + "learning_rate": 7.712992387286882e-05, + "loss": 1.5215, + "step": 3056 + }, + { + "epoch": 0.3175444063571206, + "grad_norm": 0.37285417318344116, + "learning_rate": 7.711621663589602e-05, + "loss": 1.6497, + "step": 3057 + }, + { + "epoch": 0.31764828087670094, + "grad_norm": 0.35734882950782776, + "learning_rate": 7.71025065112577e-05, + "loss": 1.6053, + "step": 3058 + }, + { + "epoch": 0.31775215539628127, + "grad_norm": 0.38647857308387756, + "learning_rate": 7.708879350041383e-05, + "loss": 1.6968, + "step": 3059 + }, + { + "epoch": 0.31785602991586165, + "grad_norm": 0.4510471522808075, + "learning_rate": 7.707507760482474e-05, + "loss": 1.6815, + "step": 3060 + }, + { + "epoch": 0.317959904435442, + "grad_norm": 0.4056614935398102, + "learning_rate": 7.706135882595108e-05, + "loss": 1.6113, + "step": 3061 + }, + { + "epoch": 0.31806377895502236, + "grad_norm": 0.4500080645084381, + "learning_rate": 7.70476371652538e-05, + "loss": 1.9964, + "step": 3062 + }, + { + "epoch": 0.3181676534746027, + "grad_norm": 0.3800092935562134, + "learning_rate": 7.703391262419413e-05, + "loss": 1.67, + "step": 3063 + }, + { + "epoch": 0.318271527994183, + "grad_norm": 0.4021591544151306, + "learning_rate": 7.702018520423367e-05, + "loss": 1.7399, + "step": 3064 + }, + { + "epoch": 0.3183754025137634, + "grad_norm": 0.42205873131752014, + "learning_rate": 7.700645490683421e-05, + "loss": 1.8155, + "step": 3065 + }, + { + "epoch": 0.3184792770333437, + "grad_norm": 0.3718779385089874, + "learning_rate": 7.699272173345799e-05, + "loss": 1.7583, + "step": 3066 + }, + { + "epoch": 0.31858315155292405, + "grad_norm": 0.3647688031196594, + "learning_rate": 7.697898568556745e-05, + "loss": 1.5891, + "step": 3067 + }, + { + "epoch": 0.31868702607250443, + "grad_norm": 0.3905600309371948, + "learning_rate": 7.696524676462538e-05, + "loss": 1.7358, + "step": 3068 + }, + { + "epoch": 0.31879090059208476, + "grad_norm": 0.36496713757514954, + "learning_rate": 7.695150497209485e-05, + "loss": 1.7218, + "step": 3069 + }, + { + "epoch": 0.3188947751116651, + "grad_norm": 0.4626975357532501, + "learning_rate": 7.69377603094393e-05, + "loss": 1.8209, + "step": 3070 + }, + { + "epoch": 0.31899864963124547, + "grad_norm": 0.4146973490715027, + "learning_rate": 7.692401277812236e-05, + "loss": 1.6095, + "step": 3071 + }, + { + "epoch": 0.3191025241508258, + "grad_norm": 0.3558516800403595, + "learning_rate": 7.69102623796081e-05, + "loss": 1.5576, + "step": 3072 + }, + { + "epoch": 0.3192063986704061, + "grad_norm": 0.4161911606788635, + "learning_rate": 7.689650911536081e-05, + "loss": 1.656, + "step": 3073 + }, + { + "epoch": 0.3193102731899865, + "grad_norm": 0.40754130482673645, + "learning_rate": 7.688275298684509e-05, + "loss": 1.7435, + "step": 3074 + }, + { + "epoch": 0.31941414770956683, + "grad_norm": 0.36731454730033875, + "learning_rate": 7.686899399552587e-05, + "loss": 1.6421, + "step": 3075 + }, + { + "epoch": 0.3195180222291472, + "grad_norm": 0.4020611047744751, + "learning_rate": 7.685523214286839e-05, + "loss": 1.7208, + "step": 3076 + }, + { + "epoch": 0.31962189674872754, + "grad_norm": 0.48006507754325867, + "learning_rate": 7.684146743033815e-05, + "loss": 1.7648, + "step": 3077 + }, + { + "epoch": 0.31972577126830787, + "grad_norm": 0.39849093556404114, + "learning_rate": 7.6827699859401e-05, + "loss": 1.751, + "step": 3078 + }, + { + "epoch": 0.31982964578788825, + "grad_norm": 0.3924444615840912, + "learning_rate": 7.681392943152308e-05, + "loss": 1.7878, + "step": 3079 + }, + { + "epoch": 0.3199335203074686, + "grad_norm": 0.4005417823791504, + "learning_rate": 7.680015614817083e-05, + "loss": 1.7752, + "step": 3080 + }, + { + "epoch": 0.3200373948270489, + "grad_norm": 0.39548689126968384, + "learning_rate": 7.678638001081102e-05, + "loss": 1.7377, + "step": 3081 + }, + { + "epoch": 0.3201412693466293, + "grad_norm": 0.4022079408168793, + "learning_rate": 7.677260102091066e-05, + "loss": 1.7732, + "step": 3082 + }, + { + "epoch": 0.3202451438662096, + "grad_norm": 0.4020305573940277, + "learning_rate": 7.675881917993713e-05, + "loss": 1.6768, + "step": 3083 + }, + { + "epoch": 0.32034901838578994, + "grad_norm": 0.374347448348999, + "learning_rate": 7.674503448935808e-05, + "loss": 1.59, + "step": 3084 + }, + { + "epoch": 0.3204528929053703, + "grad_norm": 0.3810427784919739, + "learning_rate": 7.673124695064148e-05, + "loss": 1.3873, + "step": 3085 + }, + { + "epoch": 0.32055676742495065, + "grad_norm": 0.43509384989738464, + "learning_rate": 7.671745656525559e-05, + "loss": 1.5037, + "step": 3086 + }, + { + "epoch": 0.32066064194453103, + "grad_norm": 0.38209038972854614, + "learning_rate": 7.670366333466899e-05, + "loss": 1.6842, + "step": 3087 + }, + { + "epoch": 0.32076451646411136, + "grad_norm": 0.4374238848686218, + "learning_rate": 7.668986726035054e-05, + "loss": 1.858, + "step": 3088 + }, + { + "epoch": 0.3208683909836917, + "grad_norm": 0.4977831244468689, + "learning_rate": 7.66760683437694e-05, + "loss": 1.9369, + "step": 3089 + }, + { + "epoch": 0.32097226550327207, + "grad_norm": 0.37174275517463684, + "learning_rate": 7.666226658639507e-05, + "loss": 1.6107, + "step": 3090 + }, + { + "epoch": 0.3210761400228524, + "grad_norm": 0.3945876657962799, + "learning_rate": 7.664846198969733e-05, + "loss": 1.8063, + "step": 3091 + }, + { + "epoch": 0.3211800145424327, + "grad_norm": 0.4560282230377197, + "learning_rate": 7.663465455514625e-05, + "loss": 2.2025, + "step": 3092 + }, + { + "epoch": 0.3212838890620131, + "grad_norm": 0.3639695346355438, + "learning_rate": 7.662084428421221e-05, + "loss": 1.6021, + "step": 3093 + }, + { + "epoch": 0.32138776358159343, + "grad_norm": 0.36369383335113525, + "learning_rate": 7.660703117836591e-05, + "loss": 1.6781, + "step": 3094 + }, + { + "epoch": 0.32149163810117376, + "grad_norm": 0.3726049065589905, + "learning_rate": 7.659321523907834e-05, + "loss": 1.8033, + "step": 3095 + }, + { + "epoch": 0.32159551262075414, + "grad_norm": 0.3972417116165161, + "learning_rate": 7.657939646782077e-05, + "loss": 1.6971, + "step": 3096 + }, + { + "epoch": 0.32169938714033447, + "grad_norm": 0.4059099555015564, + "learning_rate": 7.656557486606482e-05, + "loss": 1.7625, + "step": 3097 + }, + { + "epoch": 0.3218032616599148, + "grad_norm": 0.3809730112552643, + "learning_rate": 7.655175043528235e-05, + "loss": 1.79, + "step": 3098 + }, + { + "epoch": 0.3219071361794952, + "grad_norm": 0.37229931354522705, + "learning_rate": 7.653792317694556e-05, + "loss": 1.7281, + "step": 3099 + }, + { + "epoch": 0.3220110106990755, + "grad_norm": 0.40562358498573303, + "learning_rate": 7.652409309252699e-05, + "loss": 1.7644, + "step": 3100 + }, + { + "epoch": 0.3221148852186559, + "grad_norm": 0.3879106044769287, + "learning_rate": 7.651026018349938e-05, + "loss": 1.815, + "step": 3101 + }, + { + "epoch": 0.3222187597382362, + "grad_norm": 0.39501863718032837, + "learning_rate": 7.649642445133585e-05, + "loss": 1.8004, + "step": 3102 + }, + { + "epoch": 0.32232263425781654, + "grad_norm": 0.42894792556762695, + "learning_rate": 7.64825858975098e-05, + "loss": 1.8058, + "step": 3103 + }, + { + "epoch": 0.3224265087773969, + "grad_norm": 0.4245396852493286, + "learning_rate": 7.646874452349491e-05, + "loss": 1.7229, + "step": 3104 + }, + { + "epoch": 0.32253038329697725, + "grad_norm": 0.3750874996185303, + "learning_rate": 7.64549003307652e-05, + "loss": 1.6437, + "step": 3105 + }, + { + "epoch": 0.3226342578165576, + "grad_norm": 0.44294023513793945, + "learning_rate": 7.644105332079497e-05, + "loss": 1.8526, + "step": 3106 + }, + { + "epoch": 0.32273813233613796, + "grad_norm": 0.4349030554294586, + "learning_rate": 7.642720349505881e-05, + "loss": 1.8628, + "step": 3107 + }, + { + "epoch": 0.3228420068557183, + "grad_norm": 0.4126608967781067, + "learning_rate": 7.641335085503161e-05, + "loss": 1.7873, + "step": 3108 + }, + { + "epoch": 0.3229458813752986, + "grad_norm": 0.3600351810455322, + "learning_rate": 7.639949540218858e-05, + "loss": 1.7672, + "step": 3109 + }, + { + "epoch": 0.323049755894879, + "grad_norm": 0.38123077154159546, + "learning_rate": 7.63856371380052e-05, + "loss": 1.7417, + "step": 3110 + }, + { + "epoch": 0.3231536304144593, + "grad_norm": 0.41736799478530884, + "learning_rate": 7.63717760639573e-05, + "loss": 1.855, + "step": 3111 + }, + { + "epoch": 0.3232575049340397, + "grad_norm": 0.42737138271331787, + "learning_rate": 7.635791218152094e-05, + "loss": 1.766, + "step": 3112 + }, + { + "epoch": 0.32336137945362003, + "grad_norm": 0.37469685077667236, + "learning_rate": 7.634404549217254e-05, + "loss": 1.6696, + "step": 3113 + }, + { + "epoch": 0.32346525397320036, + "grad_norm": 0.4223494231700897, + "learning_rate": 7.633017599738879e-05, + "loss": 1.7205, + "step": 3114 + }, + { + "epoch": 0.32356912849278074, + "grad_norm": 0.39290282130241394, + "learning_rate": 7.631630369864668e-05, + "loss": 1.6764, + "step": 3115 + }, + { + "epoch": 0.32367300301236107, + "grad_norm": 0.38165774941444397, + "learning_rate": 7.63024285974235e-05, + "loss": 1.633, + "step": 3116 + }, + { + "epoch": 0.3237768775319414, + "grad_norm": 0.46332311630249023, + "learning_rate": 7.628855069519685e-05, + "loss": 1.9059, + "step": 3117 + }, + { + "epoch": 0.3238807520515218, + "grad_norm": 0.38814160227775574, + "learning_rate": 7.627466999344459e-05, + "loss": 1.6475, + "step": 3118 + }, + { + "epoch": 0.3239846265711021, + "grad_norm": 0.44505298137664795, + "learning_rate": 7.626078649364494e-05, + "loss": 1.8901, + "step": 3119 + }, + { + "epoch": 0.32408850109068243, + "grad_norm": 0.4025139808654785, + "learning_rate": 7.624690019727636e-05, + "loss": 1.62, + "step": 3120 + }, + { + "epoch": 0.3241923756102628, + "grad_norm": 0.37177082896232605, + "learning_rate": 7.623301110581764e-05, + "loss": 1.7613, + "step": 3121 + }, + { + "epoch": 0.32429625012984314, + "grad_norm": 0.38354530930519104, + "learning_rate": 7.621911922074787e-05, + "loss": 1.6685, + "step": 3122 + }, + { + "epoch": 0.32440012464942347, + "grad_norm": 0.404633492231369, + "learning_rate": 7.620522454354644e-05, + "loss": 1.8067, + "step": 3123 + }, + { + "epoch": 0.32450399916900385, + "grad_norm": 0.3900499641895294, + "learning_rate": 7.619132707569299e-05, + "loss": 1.8046, + "step": 3124 + }, + { + "epoch": 0.3246078736885842, + "grad_norm": 0.4414837062358856, + "learning_rate": 7.617742681866749e-05, + "loss": 1.9807, + "step": 3125 + }, + { + "epoch": 0.32471174820816456, + "grad_norm": 0.3875233232975006, + "learning_rate": 7.616352377395025e-05, + "loss": 1.7041, + "step": 3126 + }, + { + "epoch": 0.3248156227277449, + "grad_norm": 0.4394824206829071, + "learning_rate": 7.614961794302178e-05, + "loss": 1.7103, + "step": 3127 + }, + { + "epoch": 0.3249194972473252, + "grad_norm": 0.38259413838386536, + "learning_rate": 7.6135709327363e-05, + "loss": 1.6377, + "step": 3128 + }, + { + "epoch": 0.3250233717669056, + "grad_norm": 0.4169695973396301, + "learning_rate": 7.612179792845504e-05, + "loss": 1.7518, + "step": 3129 + }, + { + "epoch": 0.3251272462864859, + "grad_norm": 0.4079383909702301, + "learning_rate": 7.610788374777935e-05, + "loss": 1.6768, + "step": 3130 + }, + { + "epoch": 0.32523112080606625, + "grad_norm": 0.4074368476867676, + "learning_rate": 7.609396678681771e-05, + "loss": 1.8424, + "step": 3131 + }, + { + "epoch": 0.32533499532564664, + "grad_norm": 0.41162872314453125, + "learning_rate": 7.608004704705212e-05, + "loss": 1.8505, + "step": 3132 + }, + { + "epoch": 0.32543886984522696, + "grad_norm": 0.41227778792381287, + "learning_rate": 7.606612452996495e-05, + "loss": 1.6893, + "step": 3133 + }, + { + "epoch": 0.3255427443648073, + "grad_norm": 0.3721327781677246, + "learning_rate": 7.605219923703886e-05, + "loss": 1.7895, + "step": 3134 + }, + { + "epoch": 0.3256466188843877, + "grad_norm": 0.3553870916366577, + "learning_rate": 7.603827116975677e-05, + "loss": 1.5966, + "step": 3135 + }, + { + "epoch": 0.325750493403968, + "grad_norm": 0.35715457797050476, + "learning_rate": 7.602434032960189e-05, + "loss": 1.7413, + "step": 3136 + }, + { + "epoch": 0.3258543679235483, + "grad_norm": 0.39415115118026733, + "learning_rate": 7.601040671805779e-05, + "loss": 1.3362, + "step": 3137 + }, + { + "epoch": 0.3259582424431287, + "grad_norm": 0.3815273344516754, + "learning_rate": 7.599647033660824e-05, + "loss": 1.6745, + "step": 3138 + }, + { + "epoch": 0.32606211696270904, + "grad_norm": 0.3810808062553406, + "learning_rate": 7.598253118673738e-05, + "loss": 1.6381, + "step": 3139 + }, + { + "epoch": 0.3261659914822894, + "grad_norm": 0.43945252895355225, + "learning_rate": 7.596858926992962e-05, + "loss": 1.8387, + "step": 3140 + }, + { + "epoch": 0.32626986600186975, + "grad_norm": 0.38324853777885437, + "learning_rate": 7.595464458766968e-05, + "loss": 1.6446, + "step": 3141 + }, + { + "epoch": 0.3263737405214501, + "grad_norm": 0.4006134569644928, + "learning_rate": 7.594069714144252e-05, + "loss": 1.4691, + "step": 3142 + }, + { + "epoch": 0.32647761504103046, + "grad_norm": 0.4041348993778229, + "learning_rate": 7.592674693273348e-05, + "loss": 1.8108, + "step": 3143 + }, + { + "epoch": 0.3265814895606108, + "grad_norm": 0.380634605884552, + "learning_rate": 7.591279396302812e-05, + "loss": 1.7062, + "step": 3144 + }, + { + "epoch": 0.3266853640801911, + "grad_norm": 0.3991457223892212, + "learning_rate": 7.589883823381234e-05, + "loss": 1.6658, + "step": 3145 + }, + { + "epoch": 0.3267892385997715, + "grad_norm": 0.4109683334827423, + "learning_rate": 7.58848797465723e-05, + "loss": 1.7899, + "step": 3146 + }, + { + "epoch": 0.3268931131193518, + "grad_norm": 0.38975322246551514, + "learning_rate": 7.587091850279447e-05, + "loss": 1.7506, + "step": 3147 + }, + { + "epoch": 0.32699698763893215, + "grad_norm": 0.3871016800403595, + "learning_rate": 7.585695450396564e-05, + "loss": 1.7103, + "step": 3148 + }, + { + "epoch": 0.32710086215851253, + "grad_norm": 0.36475732922554016, + "learning_rate": 7.584298775157282e-05, + "loss": 1.5979, + "step": 3149 + }, + { + "epoch": 0.32720473667809286, + "grad_norm": 0.3877626359462738, + "learning_rate": 7.58290182471034e-05, + "loss": 1.8509, + "step": 3150 + }, + { + "epoch": 0.32730861119767324, + "grad_norm": 0.36147022247314453, + "learning_rate": 7.581504599204503e-05, + "loss": 1.686, + "step": 3151 + }, + { + "epoch": 0.32741248571725357, + "grad_norm": 0.38483577966690063, + "learning_rate": 7.58010709878856e-05, + "loss": 1.8448, + "step": 3152 + }, + { + "epoch": 0.3275163602368339, + "grad_norm": 0.415414035320282, + "learning_rate": 7.578709323611338e-05, + "loss": 1.7782, + "step": 3153 + }, + { + "epoch": 0.3276202347564143, + "grad_norm": 0.3928332030773163, + "learning_rate": 7.577311273821687e-05, + "loss": 1.7276, + "step": 3154 + }, + { + "epoch": 0.3277241092759946, + "grad_norm": 0.36280107498168945, + "learning_rate": 7.575912949568489e-05, + "loss": 1.7094, + "step": 3155 + }, + { + "epoch": 0.32782798379557493, + "grad_norm": 0.36135435104370117, + "learning_rate": 7.574514351000657e-05, + "loss": 1.6252, + "step": 3156 + }, + { + "epoch": 0.3279318583151553, + "grad_norm": 0.38758692145347595, + "learning_rate": 7.573115478267126e-05, + "loss": 1.7715, + "step": 3157 + }, + { + "epoch": 0.32803573283473564, + "grad_norm": 0.39279675483703613, + "learning_rate": 7.571716331516869e-05, + "loss": 1.71, + "step": 3158 + }, + { + "epoch": 0.32813960735431597, + "grad_norm": 0.36881011724472046, + "learning_rate": 7.570316910898882e-05, + "loss": 1.6717, + "step": 3159 + }, + { + "epoch": 0.32824348187389635, + "grad_norm": 0.3746923506259918, + "learning_rate": 7.568917216562193e-05, + "loss": 1.6144, + "step": 3160 + }, + { + "epoch": 0.3283473563934767, + "grad_norm": 0.4102190434932709, + "learning_rate": 7.56751724865586e-05, + "loss": 1.7229, + "step": 3161 + }, + { + "epoch": 0.328451230913057, + "grad_norm": 0.39784857630729675, + "learning_rate": 7.566117007328967e-05, + "loss": 1.6128, + "step": 3162 + }, + { + "epoch": 0.3285551054326374, + "grad_norm": 0.37368080019950867, + "learning_rate": 7.564716492730628e-05, + "loss": 1.8158, + "step": 3163 + }, + { + "epoch": 0.3286589799522177, + "grad_norm": 0.4041743576526642, + "learning_rate": 7.56331570500999e-05, + "loss": 1.5258, + "step": 3164 + }, + { + "epoch": 0.3287628544717981, + "grad_norm": 0.4261610507965088, + "learning_rate": 7.561914644316221e-05, + "loss": 1.8395, + "step": 3165 + }, + { + "epoch": 0.3288667289913784, + "grad_norm": 0.3845587372779846, + "learning_rate": 7.560513310798529e-05, + "loss": 1.7382, + "step": 3166 + }, + { + "epoch": 0.32897060351095875, + "grad_norm": 0.37753623723983765, + "learning_rate": 7.559111704606142e-05, + "loss": 1.7647, + "step": 3167 + }, + { + "epoch": 0.32907447803053913, + "grad_norm": 0.3923317492008209, + "learning_rate": 7.557709825888319e-05, + "loss": 1.6671, + "step": 3168 + }, + { + "epoch": 0.32917835255011946, + "grad_norm": 0.4080863893032074, + "learning_rate": 7.556307674794351e-05, + "loss": 1.8564, + "step": 3169 + }, + { + "epoch": 0.3292822270696998, + "grad_norm": 0.38211774826049805, + "learning_rate": 7.554905251473557e-05, + "loss": 1.8405, + "step": 3170 + }, + { + "epoch": 0.32938610158928017, + "grad_norm": 0.3729119896888733, + "learning_rate": 7.55350255607528e-05, + "loss": 1.6871, + "step": 3171 + }, + { + "epoch": 0.3294899761088605, + "grad_norm": 0.36250361800193787, + "learning_rate": 7.5520995887489e-05, + "loss": 1.6192, + "step": 3172 + }, + { + "epoch": 0.3295938506284408, + "grad_norm": 0.4741998314857483, + "learning_rate": 7.550696349643822e-05, + "loss": 1.9924, + "step": 3173 + }, + { + "epoch": 0.3296977251480212, + "grad_norm": 0.39271846413612366, + "learning_rate": 7.549292838909478e-05, + "loss": 1.7255, + "step": 3174 + }, + { + "epoch": 0.32980159966760153, + "grad_norm": 0.4134219288825989, + "learning_rate": 7.547889056695332e-05, + "loss": 1.7508, + "step": 3175 + }, + { + "epoch": 0.32990547418718186, + "grad_norm": 0.377145379781723, + "learning_rate": 7.546485003150876e-05, + "loss": 1.7239, + "step": 3176 + }, + { + "epoch": 0.33000934870676224, + "grad_norm": 0.37137743830680847, + "learning_rate": 7.54508067842563e-05, + "loss": 1.6987, + "step": 3177 + }, + { + "epoch": 0.33011322322634257, + "grad_norm": 0.36599913239479065, + "learning_rate": 7.543676082669146e-05, + "loss": 1.675, + "step": 3178 + }, + { + "epoch": 0.33021709774592295, + "grad_norm": 0.3733290731906891, + "learning_rate": 7.542271216030998e-05, + "loss": 1.6841, + "step": 3179 + }, + { + "epoch": 0.3303209722655033, + "grad_norm": 0.3710775375366211, + "learning_rate": 7.540866078660797e-05, + "loss": 1.7604, + "step": 3180 + }, + { + "epoch": 0.3304248467850836, + "grad_norm": 0.4162429869174957, + "learning_rate": 7.539460670708178e-05, + "loss": 1.7021, + "step": 3181 + }, + { + "epoch": 0.330528721304664, + "grad_norm": 0.4121178090572357, + "learning_rate": 7.538054992322806e-05, + "loss": 1.885, + "step": 3182 + }, + { + "epoch": 0.3306325958242443, + "grad_norm": 0.3530847728252411, + "learning_rate": 7.536649043654376e-05, + "loss": 1.3871, + "step": 3183 + }, + { + "epoch": 0.33073647034382464, + "grad_norm": 0.42063409090042114, + "learning_rate": 7.535242824852608e-05, + "loss": 1.7323, + "step": 3184 + }, + { + "epoch": 0.330840344863405, + "grad_norm": 0.4144652485847473, + "learning_rate": 7.533836336067257e-05, + "loss": 1.7173, + "step": 3185 + }, + { + "epoch": 0.33094421938298535, + "grad_norm": 0.3715633749961853, + "learning_rate": 7.5324295774481e-05, + "loss": 1.6793, + "step": 3186 + }, + { + "epoch": 0.3310480939025657, + "grad_norm": 0.40452414751052856, + "learning_rate": 7.531022549144946e-05, + "loss": 1.7443, + "step": 3187 + }, + { + "epoch": 0.33115196842214606, + "grad_norm": 0.3777833878993988, + "learning_rate": 7.529615251307631e-05, + "loss": 1.675, + "step": 3188 + }, + { + "epoch": 0.3312558429417264, + "grad_norm": 0.3736540973186493, + "learning_rate": 7.528207684086027e-05, + "loss": 1.7026, + "step": 3189 + }, + { + "epoch": 0.33135971746130677, + "grad_norm": 0.3794771134853363, + "learning_rate": 7.526799847630024e-05, + "loss": 1.6583, + "step": 3190 + }, + { + "epoch": 0.3314635919808871, + "grad_norm": 0.38491761684417725, + "learning_rate": 7.525391742089547e-05, + "loss": 1.6339, + "step": 3191 + }, + { + "epoch": 0.3315674665004674, + "grad_norm": 0.3660680055618286, + "learning_rate": 7.52398336761455e-05, + "loss": 1.7295, + "step": 3192 + }, + { + "epoch": 0.3316713410200478, + "grad_norm": 0.4049983024597168, + "learning_rate": 7.522574724355009e-05, + "loss": 1.8415, + "step": 3193 + }, + { + "epoch": 0.33177521553962813, + "grad_norm": 0.4031364917755127, + "learning_rate": 7.52116581246094e-05, + "loss": 1.8444, + "step": 3194 + }, + { + "epoch": 0.33187909005920846, + "grad_norm": 0.3920493721961975, + "learning_rate": 7.519756632082376e-05, + "loss": 1.9211, + "step": 3195 + }, + { + "epoch": 0.33198296457878884, + "grad_norm": 0.40728676319122314, + "learning_rate": 7.518347183369385e-05, + "loss": 1.5216, + "step": 3196 + }, + { + "epoch": 0.33208683909836917, + "grad_norm": 0.3954370617866516, + "learning_rate": 7.516937466472065e-05, + "loss": 1.6549, + "step": 3197 + }, + { + "epoch": 0.3321907136179495, + "grad_norm": 0.3614353537559509, + "learning_rate": 7.515527481540536e-05, + "loss": 1.6972, + "step": 3198 + }, + { + "epoch": 0.3322945881375299, + "grad_norm": 0.35628658533096313, + "learning_rate": 7.514117228724953e-05, + "loss": 1.6622, + "step": 3199 + }, + { + "epoch": 0.3323984626571102, + "grad_norm": 0.3558284342288971, + "learning_rate": 7.512706708175496e-05, + "loss": 1.5866, + "step": 3200 + }, + { + "epoch": 0.33250233717669053, + "grad_norm": 0.3872045576572418, + "learning_rate": 7.511295920042374e-05, + "loss": 1.7721, + "step": 3201 + }, + { + "epoch": 0.3326062116962709, + "grad_norm": 0.3917664587497711, + "learning_rate": 7.509884864475825e-05, + "loss": 1.7278, + "step": 3202 + }, + { + "epoch": 0.33271008621585124, + "grad_norm": 0.3571791648864746, + "learning_rate": 7.508473541626115e-05, + "loss": 1.7481, + "step": 3203 + }, + { + "epoch": 0.3328139607354316, + "grad_norm": 0.3802284002304077, + "learning_rate": 7.507061951643541e-05, + "loss": 1.7085, + "step": 3204 + }, + { + "epoch": 0.33291783525501195, + "grad_norm": 0.41221603751182556, + "learning_rate": 7.505650094678425e-05, + "loss": 1.8142, + "step": 3205 + }, + { + "epoch": 0.3330217097745923, + "grad_norm": 0.41970524191856384, + "learning_rate": 7.504237970881118e-05, + "loss": 1.7213, + "step": 3206 + }, + { + "epoch": 0.33312558429417266, + "grad_norm": 0.38147467374801636, + "learning_rate": 7.502825580402002e-05, + "loss": 1.637, + "step": 3207 + }, + { + "epoch": 0.333229458813753, + "grad_norm": 0.430268257856369, + "learning_rate": 7.501412923391483e-05, + "loss": 1.7966, + "step": 3208 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.422599732875824, + "learning_rate": 7.500000000000001e-05, + "loss": 1.7097, + "step": 3209 + }, + { + "epoch": 0.3334372078529137, + "grad_norm": 0.3947455585002899, + "learning_rate": 7.498586810378019e-05, + "loss": 1.6064, + "step": 3210 + }, + { + "epoch": 0.333541082372494, + "grad_norm": 0.43581533432006836, + "learning_rate": 7.49717335467603e-05, + "loss": 1.8244, + "step": 3211 + }, + { + "epoch": 0.33364495689207435, + "grad_norm": 0.42619240283966064, + "learning_rate": 7.49575963304456e-05, + "loss": 1.8339, + "step": 3212 + }, + { + "epoch": 0.33374883141165473, + "grad_norm": 0.4488205015659332, + "learning_rate": 7.494345645634156e-05, + "loss": 1.6953, + "step": 3213 + }, + { + "epoch": 0.33385270593123506, + "grad_norm": 0.35742267966270447, + "learning_rate": 7.492931392595397e-05, + "loss": 1.7246, + "step": 3214 + }, + { + "epoch": 0.3339565804508154, + "grad_norm": 0.37364768981933594, + "learning_rate": 7.49151687407889e-05, + "loss": 1.6528, + "step": 3215 + }, + { + "epoch": 0.33406045497039577, + "grad_norm": 0.3801439702510834, + "learning_rate": 7.49010209023527e-05, + "loss": 1.5913, + "step": 3216 + }, + { + "epoch": 0.3341643294899761, + "grad_norm": 0.37618300318717957, + "learning_rate": 7.488687041215202e-05, + "loss": 1.609, + "step": 3217 + }, + { + "epoch": 0.3342682040095565, + "grad_norm": 0.37247294187545776, + "learning_rate": 7.487271727169377e-05, + "loss": 1.6473, + "step": 3218 + }, + { + "epoch": 0.3343720785291368, + "grad_norm": 0.3996807336807251, + "learning_rate": 7.485856148248513e-05, + "loss": 1.7744, + "step": 3219 + }, + { + "epoch": 0.33447595304871713, + "grad_norm": 0.3898894488811493, + "learning_rate": 7.484440304603362e-05, + "loss": 1.8776, + "step": 3220 + }, + { + "epoch": 0.3345798275682975, + "grad_norm": 0.4372991621494293, + "learning_rate": 7.483024196384696e-05, + "loss": 1.8293, + "step": 3221 + }, + { + "epoch": 0.33468370208787784, + "grad_norm": 0.4678659439086914, + "learning_rate": 7.481607823743321e-05, + "loss": 1.8113, + "step": 3222 + }, + { + "epoch": 0.33478757660745817, + "grad_norm": 0.38630014657974243, + "learning_rate": 7.480191186830072e-05, + "loss": 1.6579, + "step": 3223 + }, + { + "epoch": 0.33489145112703855, + "grad_norm": 0.4188450276851654, + "learning_rate": 7.478774285795805e-05, + "loss": 1.9302, + "step": 3224 + }, + { + "epoch": 0.3349953256466189, + "grad_norm": 0.3890570402145386, + "learning_rate": 7.477357120791415e-05, + "loss": 1.8298, + "step": 3225 + }, + { + "epoch": 0.3350992001661992, + "grad_norm": 0.4579260051250458, + "learning_rate": 7.475939691967814e-05, + "loss": 1.673, + "step": 3226 + }, + { + "epoch": 0.3352030746857796, + "grad_norm": 0.39062821865081787, + "learning_rate": 7.47452199947595e-05, + "loss": 1.7051, + "step": 3227 + }, + { + "epoch": 0.3353069492053599, + "grad_norm": 0.37739062309265137, + "learning_rate": 7.473104043466794e-05, + "loss": 1.6151, + "step": 3228 + }, + { + "epoch": 0.3354108237249403, + "grad_norm": 0.3767510950565338, + "learning_rate": 7.471685824091348e-05, + "loss": 1.7629, + "step": 3229 + }, + { + "epoch": 0.3355146982445206, + "grad_norm": 0.41610831022262573, + "learning_rate": 7.470267341500642e-05, + "loss": 1.7987, + "step": 3230 + }, + { + "epoch": 0.33561857276410095, + "grad_norm": 0.3895459771156311, + "learning_rate": 7.468848595845733e-05, + "loss": 1.5922, + "step": 3231 + }, + { + "epoch": 0.33572244728368134, + "grad_norm": 0.3886757493019104, + "learning_rate": 7.467429587277705e-05, + "loss": 1.6534, + "step": 3232 + }, + { + "epoch": 0.33582632180326166, + "grad_norm": 0.37435784935951233, + "learning_rate": 7.466010315947676e-05, + "loss": 1.7319, + "step": 3233 + }, + { + "epoch": 0.335930196322842, + "grad_norm": 0.4168533682823181, + "learning_rate": 7.464590782006782e-05, + "loss": 1.8126, + "step": 3234 + }, + { + "epoch": 0.33603407084242237, + "grad_norm": 0.3834715187549591, + "learning_rate": 7.463170985606194e-05, + "loss": 1.7701, + "step": 3235 + }, + { + "epoch": 0.3361379453620027, + "grad_norm": 0.44510364532470703, + "learning_rate": 7.46175092689711e-05, + "loss": 1.8166, + "step": 3236 + }, + { + "epoch": 0.336241819881583, + "grad_norm": 0.41382500529289246, + "learning_rate": 7.460330606030754e-05, + "loss": 1.9408, + "step": 3237 + }, + { + "epoch": 0.3363456944011634, + "grad_norm": 0.3592391014099121, + "learning_rate": 7.45891002315838e-05, + "loss": 1.6185, + "step": 3238 + }, + { + "epoch": 0.33644956892074374, + "grad_norm": 0.36257967352867126, + "learning_rate": 7.457489178431269e-05, + "loss": 1.5973, + "step": 3239 + }, + { + "epoch": 0.33655344344032406, + "grad_norm": 0.368685245513916, + "learning_rate": 7.45606807200073e-05, + "loss": 1.5255, + "step": 3240 + }, + { + "epoch": 0.33665731795990445, + "grad_norm": 0.3916224241256714, + "learning_rate": 7.4546467040181e-05, + "loss": 1.6482, + "step": 3241 + }, + { + "epoch": 0.33676119247948477, + "grad_norm": 0.36512747406959534, + "learning_rate": 7.453225074634742e-05, + "loss": 1.5935, + "step": 3242 + }, + { + "epoch": 0.33686506699906515, + "grad_norm": 0.40340378880500793, + "learning_rate": 7.451803184002048e-05, + "loss": 1.8217, + "step": 3243 + }, + { + "epoch": 0.3369689415186455, + "grad_norm": 0.4013363718986511, + "learning_rate": 7.450381032271442e-05, + "loss": 1.698, + "step": 3244 + }, + { + "epoch": 0.3370728160382258, + "grad_norm": 0.4022335708141327, + "learning_rate": 7.44895861959437e-05, + "loss": 1.7154, + "step": 3245 + }, + { + "epoch": 0.3371766905578062, + "grad_norm": 0.4084652364253998, + "learning_rate": 7.447535946122304e-05, + "loss": 1.6902, + "step": 3246 + }, + { + "epoch": 0.3372805650773865, + "grad_norm": 0.41394492983818054, + "learning_rate": 7.446113012006756e-05, + "loss": 1.6136, + "step": 3247 + }, + { + "epoch": 0.33738443959696685, + "grad_norm": 0.3942053020000458, + "learning_rate": 7.444689817399251e-05, + "loss": 1.7797, + "step": 3248 + }, + { + "epoch": 0.33748831411654723, + "grad_norm": 0.41565534472465515, + "learning_rate": 7.443266362451349e-05, + "loss": 1.7013, + "step": 3249 + }, + { + "epoch": 0.33759218863612755, + "grad_norm": 0.3893018960952759, + "learning_rate": 7.441842647314638e-05, + "loss": 1.7065, + "step": 3250 + }, + { + "epoch": 0.3376960631557079, + "grad_norm": 0.4059962034225464, + "learning_rate": 7.440418672140733e-05, + "loss": 1.7509, + "step": 3251 + }, + { + "epoch": 0.33779993767528826, + "grad_norm": 0.4266146719455719, + "learning_rate": 7.438994437081272e-05, + "loss": 1.7629, + "step": 3252 + }, + { + "epoch": 0.3379038121948686, + "grad_norm": 0.37736377120018005, + "learning_rate": 7.437569942287932e-05, + "loss": 1.8509, + "step": 3253 + }, + { + "epoch": 0.3380076867144489, + "grad_norm": 0.3954285681247711, + "learning_rate": 7.436145187912406e-05, + "loss": 1.6512, + "step": 3254 + }, + { + "epoch": 0.3381115612340293, + "grad_norm": 0.3633100092411041, + "learning_rate": 7.43472017410642e-05, + "loss": 1.5277, + "step": 3255 + }, + { + "epoch": 0.33821543575360963, + "grad_norm": 0.3525071144104004, + "learning_rate": 7.433294901021727e-05, + "loss": 1.5944, + "step": 3256 + }, + { + "epoch": 0.33831931027319, + "grad_norm": 0.4135834574699402, + "learning_rate": 7.431869368810106e-05, + "loss": 1.6653, + "step": 3257 + }, + { + "epoch": 0.33842318479277034, + "grad_norm": 0.40316396951675415, + "learning_rate": 7.430443577623369e-05, + "loss": 1.7842, + "step": 3258 + }, + { + "epoch": 0.33852705931235066, + "grad_norm": 0.3851187825202942, + "learning_rate": 7.429017527613347e-05, + "loss": 1.721, + "step": 3259 + }, + { + "epoch": 0.33863093383193105, + "grad_norm": 0.4158236086368561, + "learning_rate": 7.427591218931906e-05, + "loss": 1.9518, + "step": 3260 + }, + { + "epoch": 0.3387348083515114, + "grad_norm": 0.3790571689605713, + "learning_rate": 7.426164651730937e-05, + "loss": 1.5893, + "step": 3261 + }, + { + "epoch": 0.3388386828710917, + "grad_norm": 0.37710022926330566, + "learning_rate": 7.424737826162358e-05, + "loss": 1.6549, + "step": 3262 + }, + { + "epoch": 0.3389425573906721, + "grad_norm": 0.3882453441619873, + "learning_rate": 7.423310742378113e-05, + "loss": 1.6472, + "step": 3263 + }, + { + "epoch": 0.3390464319102524, + "grad_norm": 0.39669013023376465, + "learning_rate": 7.421883400530176e-05, + "loss": 1.7416, + "step": 3264 + }, + { + "epoch": 0.33915030642983274, + "grad_norm": 0.41558176279067993, + "learning_rate": 7.42045580077055e-05, + "loss": 1.6492, + "step": 3265 + }, + { + "epoch": 0.3392541809494131, + "grad_norm": 0.4130480885505676, + "learning_rate": 7.41902794325126e-05, + "loss": 1.7518, + "step": 3266 + }, + { + "epoch": 0.33935805546899345, + "grad_norm": 0.398885577917099, + "learning_rate": 7.417599828124363e-05, + "loss": 1.6747, + "step": 3267 + }, + { + "epoch": 0.33946192998857383, + "grad_norm": 0.39864227175712585, + "learning_rate": 7.416171455541943e-05, + "loss": 1.7727, + "step": 3268 + }, + { + "epoch": 0.33956580450815416, + "grad_norm": 0.4171668589115143, + "learning_rate": 7.414742825656111e-05, + "loss": 1.6616, + "step": 3269 + }, + { + "epoch": 0.3396696790277345, + "grad_norm": 0.379443883895874, + "learning_rate": 7.413313938619003e-05, + "loss": 1.8302, + "step": 3270 + }, + { + "epoch": 0.33977355354731487, + "grad_norm": 0.3963342607021332, + "learning_rate": 7.411884794582784e-05, + "loss": 1.7629, + "step": 3271 + }, + { + "epoch": 0.3398774280668952, + "grad_norm": 0.3690603971481323, + "learning_rate": 7.410455393699647e-05, + "loss": 1.6549, + "step": 3272 + }, + { + "epoch": 0.3399813025864755, + "grad_norm": 0.36229610443115234, + "learning_rate": 7.409025736121816e-05, + "loss": 1.6949, + "step": 3273 + }, + { + "epoch": 0.3400851771060559, + "grad_norm": 0.3683745265007019, + "learning_rate": 7.407595822001532e-05, + "loss": 1.8114, + "step": 3274 + }, + { + "epoch": 0.34018905162563623, + "grad_norm": 0.3948541283607483, + "learning_rate": 7.406165651491075e-05, + "loss": 1.6883, + "step": 3275 + }, + { + "epoch": 0.34029292614521656, + "grad_norm": 0.4150826036930084, + "learning_rate": 7.404735224742745e-05, + "loss": 1.7254, + "step": 3276 + }, + { + "epoch": 0.34039680066479694, + "grad_norm": 0.4049845337867737, + "learning_rate": 7.40330454190887e-05, + "loss": 1.6418, + "step": 3277 + }, + { + "epoch": 0.34050067518437727, + "grad_norm": 0.37728235125541687, + "learning_rate": 7.401873603141809e-05, + "loss": 1.8138, + "step": 3278 + }, + { + "epoch": 0.3406045497039576, + "grad_norm": 0.4334777891635895, + "learning_rate": 7.400442408593944e-05, + "loss": 1.9727, + "step": 3279 + }, + { + "epoch": 0.340708424223538, + "grad_norm": 0.395805299282074, + "learning_rate": 7.399010958417684e-05, + "loss": 1.7511, + "step": 3280 + }, + { + "epoch": 0.3408122987431183, + "grad_norm": 0.3635464310646057, + "learning_rate": 7.397579252765475e-05, + "loss": 1.7119, + "step": 3281 + }, + { + "epoch": 0.3409161732626987, + "grad_norm": 0.3630931079387665, + "learning_rate": 7.396147291789775e-05, + "loss": 1.7253, + "step": 3282 + }, + { + "epoch": 0.341020047782279, + "grad_norm": 0.37721481919288635, + "learning_rate": 7.394715075643078e-05, + "loss": 1.7831, + "step": 3283 + }, + { + "epoch": 0.34112392230185934, + "grad_norm": 0.41346338391304016, + "learning_rate": 7.393282604477907e-05, + "loss": 1.8239, + "step": 3284 + }, + { + "epoch": 0.3412277968214397, + "grad_norm": 0.3926272392272949, + "learning_rate": 7.391849878446805e-05, + "loss": 1.8158, + "step": 3285 + }, + { + "epoch": 0.34133167134102005, + "grad_norm": 0.37988725304603577, + "learning_rate": 7.39041689770235e-05, + "loss": 1.6549, + "step": 3286 + }, + { + "epoch": 0.3414355458606004, + "grad_norm": 0.46685877442359924, + "learning_rate": 7.388983662397139e-05, + "loss": 1.8314, + "step": 3287 + }, + { + "epoch": 0.34153942038018076, + "grad_norm": 0.38952314853668213, + "learning_rate": 7.387550172683803e-05, + "loss": 1.9213, + "step": 3288 + }, + { + "epoch": 0.3416432948997611, + "grad_norm": 0.3613092005252838, + "learning_rate": 7.386116428715e-05, + "loss": 1.7343, + "step": 3289 + }, + { + "epoch": 0.3417471694193414, + "grad_norm": 0.3758079707622528, + "learning_rate": 7.384682430643409e-05, + "loss": 1.6233, + "step": 3290 + }, + { + "epoch": 0.3418510439389218, + "grad_norm": 0.36782777309417725, + "learning_rate": 7.383248178621739e-05, + "loss": 1.7206, + "step": 3291 + }, + { + "epoch": 0.3419549184585021, + "grad_norm": 0.36162370443344116, + "learning_rate": 7.381813672802729e-05, + "loss": 1.6387, + "step": 3292 + }, + { + "epoch": 0.34205879297808245, + "grad_norm": 0.3870641887187958, + "learning_rate": 7.380378913339142e-05, + "loss": 1.7021, + "step": 3293 + }, + { + "epoch": 0.34216266749766283, + "grad_norm": 0.366452157497406, + "learning_rate": 7.378943900383766e-05, + "loss": 1.6213, + "step": 3294 + }, + { + "epoch": 0.34226654201724316, + "grad_norm": 0.40911996364593506, + "learning_rate": 7.377508634089422e-05, + "loss": 1.8177, + "step": 3295 + }, + { + "epoch": 0.34237041653682354, + "grad_norm": 0.396062433719635, + "learning_rate": 7.376073114608955e-05, + "loss": 1.7282, + "step": 3296 + }, + { + "epoch": 0.34247429105640387, + "grad_norm": 0.4013812839984894, + "learning_rate": 7.374637342095236e-05, + "loss": 1.7302, + "step": 3297 + }, + { + "epoch": 0.3425781655759842, + "grad_norm": 0.37226590514183044, + "learning_rate": 7.373201316701162e-05, + "loss": 1.6312, + "step": 3298 + }, + { + "epoch": 0.3426820400955646, + "grad_norm": 0.39679157733917236, + "learning_rate": 7.371765038579658e-05, + "loss": 1.6591, + "step": 3299 + }, + { + "epoch": 0.3427859146151449, + "grad_norm": 0.4248996078968048, + "learning_rate": 7.37032850788368e-05, + "loss": 1.8358, + "step": 3300 + }, + { + "epoch": 0.34288978913472523, + "grad_norm": 0.37451761960983276, + "learning_rate": 7.368891724766204e-05, + "loss": 1.7225, + "step": 3301 + }, + { + "epoch": 0.3429936636543056, + "grad_norm": 0.38723859190940857, + "learning_rate": 7.367454689380238e-05, + "loss": 1.5813, + "step": 3302 + }, + { + "epoch": 0.34309753817388594, + "grad_norm": 0.4239073395729065, + "learning_rate": 7.366017401878813e-05, + "loss": 1.8608, + "step": 3303 + }, + { + "epoch": 0.34320141269346627, + "grad_norm": 0.39057475328445435, + "learning_rate": 7.364579862414993e-05, + "loss": 1.7135, + "step": 3304 + }, + { + "epoch": 0.34330528721304665, + "grad_norm": 0.4171145558357239, + "learning_rate": 7.36314207114186e-05, + "loss": 1.8721, + "step": 3305 + }, + { + "epoch": 0.343409161732627, + "grad_norm": 0.4564177095890045, + "learning_rate": 7.36170402821253e-05, + "loss": 1.6559, + "step": 3306 + }, + { + "epoch": 0.34351303625220736, + "grad_norm": 0.4414724111557007, + "learning_rate": 7.360265733780142e-05, + "loss": 1.7733, + "step": 3307 + }, + { + "epoch": 0.3436169107717877, + "grad_norm": 0.35561293363571167, + "learning_rate": 7.358827187997867e-05, + "loss": 1.5218, + "step": 3308 + }, + { + "epoch": 0.343720785291368, + "grad_norm": 0.41624075174331665, + "learning_rate": 7.357388391018893e-05, + "loss": 1.6683, + "step": 3309 + }, + { + "epoch": 0.3438246598109484, + "grad_norm": 0.38379234075546265, + "learning_rate": 7.355949342996447e-05, + "loss": 1.7642, + "step": 3310 + }, + { + "epoch": 0.3439285343305287, + "grad_norm": 0.40274274349212646, + "learning_rate": 7.35451004408377e-05, + "loss": 1.7529, + "step": 3311 + }, + { + "epoch": 0.34403240885010905, + "grad_norm": 0.4023614525794983, + "learning_rate": 7.353070494434141e-05, + "loss": 1.77, + "step": 3312 + }, + { + "epoch": 0.34413628336968943, + "grad_norm": 0.41878741979599, + "learning_rate": 7.351630694200857e-05, + "loss": 1.7528, + "step": 3313 + }, + { + "epoch": 0.34424015788926976, + "grad_norm": 0.3700146973133087, + "learning_rate": 7.35019064353725e-05, + "loss": 1.677, + "step": 3314 + }, + { + "epoch": 0.3443440324088501, + "grad_norm": 0.44311147928237915, + "learning_rate": 7.348750342596671e-05, + "loss": 1.9235, + "step": 3315 + }, + { + "epoch": 0.34444790692843047, + "grad_norm": 0.4150424599647522, + "learning_rate": 7.3473097915325e-05, + "loss": 1.6732, + "step": 3316 + }, + { + "epoch": 0.3445517814480108, + "grad_norm": 0.3866238594055176, + "learning_rate": 7.345868990498146e-05, + "loss": 1.6598, + "step": 3317 + }, + { + "epoch": 0.3446556559675911, + "grad_norm": 0.42701780796051025, + "learning_rate": 7.344427939647045e-05, + "loss": 1.7695, + "step": 3318 + }, + { + "epoch": 0.3447595304871715, + "grad_norm": 0.3794342577457428, + "learning_rate": 7.342986639132654e-05, + "loss": 1.6552, + "step": 3319 + }, + { + "epoch": 0.34486340500675183, + "grad_norm": 0.4094541668891907, + "learning_rate": 7.341545089108464e-05, + "loss": 1.9607, + "step": 3320 + }, + { + "epoch": 0.3449672795263322, + "grad_norm": 0.4172968566417694, + "learning_rate": 7.340103289727983e-05, + "loss": 1.7933, + "step": 3321 + }, + { + "epoch": 0.34507115404591254, + "grad_norm": 0.402832955121994, + "learning_rate": 7.338661241144758e-05, + "loss": 1.8795, + "step": 3322 + }, + { + "epoch": 0.34517502856549287, + "grad_norm": 0.38485434651374817, + "learning_rate": 7.337218943512353e-05, + "loss": 1.764, + "step": 3323 + }, + { + "epoch": 0.34527890308507325, + "grad_norm": 0.4096323847770691, + "learning_rate": 7.335776396984361e-05, + "loss": 1.814, + "step": 3324 + }, + { + "epoch": 0.3453827776046536, + "grad_norm": 0.347913920879364, + "learning_rate": 7.334333601714403e-05, + "loss": 1.5926, + "step": 3325 + }, + { + "epoch": 0.3454866521242339, + "grad_norm": 0.404429167509079, + "learning_rate": 7.332890557856125e-05, + "loss": 1.8246, + "step": 3326 + }, + { + "epoch": 0.3455905266438143, + "grad_norm": 0.3735859990119934, + "learning_rate": 7.331447265563197e-05, + "loss": 1.5425, + "step": 3327 + }, + { + "epoch": 0.3456944011633946, + "grad_norm": 0.35603833198547363, + "learning_rate": 7.330003724989324e-05, + "loss": 1.5314, + "step": 3328 + }, + { + "epoch": 0.34579827568297494, + "grad_norm": 0.3887482285499573, + "learning_rate": 7.328559936288228e-05, + "loss": 1.688, + "step": 3329 + }, + { + "epoch": 0.3459021502025553, + "grad_norm": 0.4250495731830597, + "learning_rate": 7.327115899613662e-05, + "loss": 1.8245, + "step": 3330 + }, + { + "epoch": 0.34600602472213565, + "grad_norm": 0.36763063073158264, + "learning_rate": 7.325671615119407e-05, + "loss": 1.4173, + "step": 3331 + }, + { + "epoch": 0.34610989924171603, + "grad_norm": 0.3886317312717438, + "learning_rate": 7.324227082959264e-05, + "loss": 1.7182, + "step": 3332 + }, + { + "epoch": 0.34621377376129636, + "grad_norm": 0.4716704487800598, + "learning_rate": 7.322782303287067e-05, + "loss": 1.6838, + "step": 3333 + }, + { + "epoch": 0.3463176482808767, + "grad_norm": 0.3932298719882965, + "learning_rate": 7.321337276256674e-05, + "loss": 1.8532, + "step": 3334 + }, + { + "epoch": 0.34642152280045707, + "grad_norm": 0.4240279793739319, + "learning_rate": 7.319892002021966e-05, + "loss": 1.7919, + "step": 3335 + }, + { + "epoch": 0.3465253973200374, + "grad_norm": 0.41877415776252747, + "learning_rate": 7.318446480736857e-05, + "loss": 1.6985, + "step": 3336 + }, + { + "epoch": 0.3466292718396177, + "grad_norm": 0.45144563913345337, + "learning_rate": 7.317000712555283e-05, + "loss": 1.9947, + "step": 3337 + }, + { + "epoch": 0.3467331463591981, + "grad_norm": 0.3652666509151459, + "learning_rate": 7.315554697631205e-05, + "loss": 1.7206, + "step": 3338 + }, + { + "epoch": 0.34683702087877843, + "grad_norm": 0.3598730266094208, + "learning_rate": 7.314108436118614e-05, + "loss": 1.6871, + "step": 3339 + }, + { + "epoch": 0.34694089539835876, + "grad_norm": 0.3652264177799225, + "learning_rate": 7.312661928171525e-05, + "loss": 1.5859, + "step": 3340 + }, + { + "epoch": 0.34704476991793914, + "grad_norm": 0.36526259779930115, + "learning_rate": 7.311215173943982e-05, + "loss": 1.5721, + "step": 3341 + }, + { + "epoch": 0.34714864443751947, + "grad_norm": 0.3666329085826874, + "learning_rate": 7.309768173590049e-05, + "loss": 1.6762, + "step": 3342 + }, + { + "epoch": 0.3472525189570998, + "grad_norm": 0.39144429564476013, + "learning_rate": 7.308320927263823e-05, + "loss": 1.6709, + "step": 3343 + }, + { + "epoch": 0.3473563934766802, + "grad_norm": 0.41937124729156494, + "learning_rate": 7.306873435119423e-05, + "loss": 1.7338, + "step": 3344 + }, + { + "epoch": 0.3474602679962605, + "grad_norm": 0.38105306029319763, + "learning_rate": 7.305425697311e-05, + "loss": 1.5917, + "step": 3345 + }, + { + "epoch": 0.3475641425158409, + "grad_norm": 0.3741595149040222, + "learning_rate": 7.30397771399272e-05, + "loss": 1.6511, + "step": 3346 + }, + { + "epoch": 0.3476680170354212, + "grad_norm": 0.399387389421463, + "learning_rate": 7.302529485318786e-05, + "loss": 1.6948, + "step": 3347 + }, + { + "epoch": 0.34777189155500154, + "grad_norm": 0.3723578155040741, + "learning_rate": 7.301081011443423e-05, + "loss": 1.6319, + "step": 3348 + }, + { + "epoch": 0.3478757660745819, + "grad_norm": 0.37226778268814087, + "learning_rate": 7.29963229252088e-05, + "loss": 1.7856, + "step": 3349 + }, + { + "epoch": 0.34797964059416225, + "grad_norm": 0.38649681210517883, + "learning_rate": 7.298183328705436e-05, + "loss": 1.65, + "step": 3350 + }, + { + "epoch": 0.3480835151137426, + "grad_norm": 0.3898165822029114, + "learning_rate": 7.296734120151394e-05, + "loss": 1.8236, + "step": 3351 + }, + { + "epoch": 0.34818738963332296, + "grad_norm": 0.37378284335136414, + "learning_rate": 7.295284667013083e-05, + "loss": 1.5903, + "step": 3352 + }, + { + "epoch": 0.3482912641529033, + "grad_norm": 0.39997485280036926, + "learning_rate": 7.293834969444861e-05, + "loss": 1.8107, + "step": 3353 + }, + { + "epoch": 0.3483951386724836, + "grad_norm": 0.37795597314834595, + "learning_rate": 7.292385027601105e-05, + "loss": 1.7677, + "step": 3354 + }, + { + "epoch": 0.348499013192064, + "grad_norm": 0.3679714500904083, + "learning_rate": 7.290934841636225e-05, + "loss": 1.6464, + "step": 3355 + }, + { + "epoch": 0.3486028877116443, + "grad_norm": 0.3939383029937744, + "learning_rate": 7.289484411704656e-05, + "loss": 1.7565, + "step": 3356 + }, + { + "epoch": 0.34870676223122465, + "grad_norm": 0.38755881786346436, + "learning_rate": 7.288033737960855e-05, + "loss": 1.5508, + "step": 3357 + }, + { + "epoch": 0.34881063675080504, + "grad_norm": 0.3855699598789215, + "learning_rate": 7.286582820559308e-05, + "loss": 1.6979, + "step": 3358 + }, + { + "epoch": 0.34891451127038536, + "grad_norm": 0.37389200925827026, + "learning_rate": 7.285131659654527e-05, + "loss": 1.703, + "step": 3359 + }, + { + "epoch": 0.34901838578996575, + "grad_norm": 0.3579116761684418, + "learning_rate": 7.283680255401049e-05, + "loss": 1.6348, + "step": 3360 + }, + { + "epoch": 0.3491222603095461, + "grad_norm": 0.39986559748649597, + "learning_rate": 7.282228607953436e-05, + "loss": 1.7792, + "step": 3361 + }, + { + "epoch": 0.3492261348291264, + "grad_norm": 0.3745286166667938, + "learning_rate": 7.28077671746628e-05, + "loss": 1.7119, + "step": 3362 + }, + { + "epoch": 0.3493300093487068, + "grad_norm": 0.45481473207473755, + "learning_rate": 7.279324584094194e-05, + "loss": 1.8279, + "step": 3363 + }, + { + "epoch": 0.3494338838682871, + "grad_norm": 0.42257899045944214, + "learning_rate": 7.277872207991818e-05, + "loss": 1.6366, + "step": 3364 + }, + { + "epoch": 0.34953775838786744, + "grad_norm": 0.3691715896129608, + "learning_rate": 7.276419589313821e-05, + "loss": 1.6741, + "step": 3365 + }, + { + "epoch": 0.3496416329074478, + "grad_norm": 0.36931112408638, + "learning_rate": 7.274966728214895e-05, + "loss": 1.7447, + "step": 3366 + }, + { + "epoch": 0.34974550742702815, + "grad_norm": 0.4134025573730469, + "learning_rate": 7.273513624849757e-05, + "loss": 1.9171, + "step": 3367 + }, + { + "epoch": 0.3498493819466085, + "grad_norm": 0.4000683128833771, + "learning_rate": 7.272060279373152e-05, + "loss": 1.7925, + "step": 3368 + }, + { + "epoch": 0.34995325646618886, + "grad_norm": 0.3902685344219208, + "learning_rate": 7.27060669193985e-05, + "loss": 1.6959, + "step": 3369 + }, + { + "epoch": 0.3500571309857692, + "grad_norm": 0.40209463238716125, + "learning_rate": 7.269152862704647e-05, + "loss": 1.7667, + "step": 3370 + }, + { + "epoch": 0.35016100550534957, + "grad_norm": 0.46758002042770386, + "learning_rate": 7.267698791822363e-05, + "loss": 1.8386, + "step": 3371 + }, + { + "epoch": 0.3502648800249299, + "grad_norm": 0.4505844712257385, + "learning_rate": 7.266244479447847e-05, + "loss": 1.9704, + "step": 3372 + }, + { + "epoch": 0.3503687545445102, + "grad_norm": 0.4215162694454193, + "learning_rate": 7.264789925735974e-05, + "loss": 1.9084, + "step": 3373 + }, + { + "epoch": 0.3504726290640906, + "grad_norm": 0.4192196726799011, + "learning_rate": 7.263335130841638e-05, + "loss": 1.7802, + "step": 3374 + }, + { + "epoch": 0.35057650358367093, + "grad_norm": 0.4384503662586212, + "learning_rate": 7.261880094919765e-05, + "loss": 1.6945, + "step": 3375 + }, + { + "epoch": 0.35068037810325126, + "grad_norm": 0.4039691984653473, + "learning_rate": 7.260424818125305e-05, + "loss": 1.7436, + "step": 3376 + }, + { + "epoch": 0.35078425262283164, + "grad_norm": 0.3905966579914093, + "learning_rate": 7.258969300613233e-05, + "loss": 1.7358, + "step": 3377 + }, + { + "epoch": 0.35088812714241197, + "grad_norm": 0.4261592626571655, + "learning_rate": 7.257513542538553e-05, + "loss": 1.8172, + "step": 3378 + }, + { + "epoch": 0.3509920016619923, + "grad_norm": 0.39150550961494446, + "learning_rate": 7.25605754405629e-05, + "loss": 1.6052, + "step": 3379 + }, + { + "epoch": 0.3510958761815727, + "grad_norm": 0.3744257092475891, + "learning_rate": 7.254601305321496e-05, + "loss": 1.7486, + "step": 3380 + }, + { + "epoch": 0.351199750701153, + "grad_norm": 0.36264219880104065, + "learning_rate": 7.253144826489249e-05, + "loss": 1.6068, + "step": 3381 + }, + { + "epoch": 0.35130362522073333, + "grad_norm": 0.35563141107559204, + "learning_rate": 7.251688107714654e-05, + "loss": 1.6542, + "step": 3382 + }, + { + "epoch": 0.3514074997403137, + "grad_norm": 0.39678165316581726, + "learning_rate": 7.250231149152838e-05, + "loss": 1.8309, + "step": 3383 + }, + { + "epoch": 0.35151137425989404, + "grad_norm": 0.3613341748714447, + "learning_rate": 7.248773950958958e-05, + "loss": 1.735, + "step": 3384 + }, + { + "epoch": 0.3516152487794744, + "grad_norm": 0.3558713495731354, + "learning_rate": 7.247316513288192e-05, + "loss": 1.5545, + "step": 3385 + }, + { + "epoch": 0.35171912329905475, + "grad_norm": 0.3634355366230011, + "learning_rate": 7.245858836295749e-05, + "loss": 1.748, + "step": 3386 + }, + { + "epoch": 0.3518229978186351, + "grad_norm": 0.3754010796546936, + "learning_rate": 7.244400920136858e-05, + "loss": 1.703, + "step": 3387 + }, + { + "epoch": 0.35192687233821546, + "grad_norm": 0.380953848361969, + "learning_rate": 7.242942764966776e-05, + "loss": 1.6779, + "step": 3388 + }, + { + "epoch": 0.3520307468577958, + "grad_norm": 0.4804019033908844, + "learning_rate": 7.241484370940784e-05, + "loss": 1.7014, + "step": 3389 + }, + { + "epoch": 0.3521346213773761, + "grad_norm": 0.41277310252189636, + "learning_rate": 7.240025738214192e-05, + "loss": 1.6532, + "step": 3390 + }, + { + "epoch": 0.3522384958969565, + "grad_norm": 0.4273497760295868, + "learning_rate": 7.238566866942332e-05, + "loss": 1.9818, + "step": 3391 + }, + { + "epoch": 0.3523423704165368, + "grad_norm": 0.38524994254112244, + "learning_rate": 7.23710775728056e-05, + "loss": 1.7702, + "step": 3392 + }, + { + "epoch": 0.35244624493611715, + "grad_norm": 0.37887367606163025, + "learning_rate": 7.235648409384263e-05, + "loss": 1.6614, + "step": 3393 + }, + { + "epoch": 0.35255011945569753, + "grad_norm": 0.3723335564136505, + "learning_rate": 7.234188823408851e-05, + "loss": 1.6692, + "step": 3394 + }, + { + "epoch": 0.35265399397527786, + "grad_norm": 0.38009339570999146, + "learning_rate": 7.232728999509754e-05, + "loss": 1.7263, + "step": 3395 + }, + { + "epoch": 0.3527578684948582, + "grad_norm": 0.4187556505203247, + "learning_rate": 7.231268937842438e-05, + "loss": 1.7052, + "step": 3396 + }, + { + "epoch": 0.35286174301443857, + "grad_norm": 0.3810417056083679, + "learning_rate": 7.229808638562381e-05, + "loss": 1.659, + "step": 3397 + }, + { + "epoch": 0.3529656175340189, + "grad_norm": 0.41524961590766907, + "learning_rate": 7.228348101825099e-05, + "loss": 1.5006, + "step": 3398 + }, + { + "epoch": 0.3530694920535993, + "grad_norm": 0.38853874802589417, + "learning_rate": 7.226887327786125e-05, + "loss": 1.8323, + "step": 3399 + }, + { + "epoch": 0.3531733665731796, + "grad_norm": 0.3678146004676819, + "learning_rate": 7.22542631660102e-05, + "loss": 1.6357, + "step": 3400 + }, + { + "epoch": 0.35327724109275993, + "grad_norm": 0.3867931365966797, + "learning_rate": 7.223965068425376e-05, + "loss": 1.7819, + "step": 3401 + }, + { + "epoch": 0.3533811156123403, + "grad_norm": 0.37931546568870544, + "learning_rate": 7.222503583414795e-05, + "loss": 1.6478, + "step": 3402 + }, + { + "epoch": 0.35348499013192064, + "grad_norm": 0.43002453446388245, + "learning_rate": 7.22104186172492e-05, + "loss": 1.8901, + "step": 3403 + }, + { + "epoch": 0.35358886465150097, + "grad_norm": 0.36890125274658203, + "learning_rate": 7.219579903511412e-05, + "loss": 1.7524, + "step": 3404 + }, + { + "epoch": 0.35369273917108135, + "grad_norm": 0.4097067415714264, + "learning_rate": 7.218117708929957e-05, + "loss": 1.6646, + "step": 3405 + }, + { + "epoch": 0.3537966136906617, + "grad_norm": 0.47631388902664185, + "learning_rate": 7.216655278136269e-05, + "loss": 1.8199, + "step": 3406 + }, + { + "epoch": 0.353900488210242, + "grad_norm": 0.4068619906902313, + "learning_rate": 7.215192611286083e-05, + "loss": 1.5446, + "step": 3407 + }, + { + "epoch": 0.3540043627298224, + "grad_norm": 0.36159148812294006, + "learning_rate": 7.213729708535164e-05, + "loss": 1.4702, + "step": 3408 + }, + { + "epoch": 0.3541082372494027, + "grad_norm": 0.39345964789390564, + "learning_rate": 7.212266570039299e-05, + "loss": 1.7324, + "step": 3409 + }, + { + "epoch": 0.3542121117689831, + "grad_norm": 0.4041385352611542, + "learning_rate": 7.2108031959543e-05, + "loss": 1.7712, + "step": 3410 + }, + { + "epoch": 0.3543159862885634, + "grad_norm": 0.40899065136909485, + "learning_rate": 7.209339586436005e-05, + "loss": 1.5812, + "step": 3411 + }, + { + "epoch": 0.35441986080814375, + "grad_norm": 0.37464478611946106, + "learning_rate": 7.20787574164028e-05, + "loss": 1.4846, + "step": 3412 + }, + { + "epoch": 0.35452373532772413, + "grad_norm": 0.41557177901268005, + "learning_rate": 7.206411661723007e-05, + "loss": 1.7985, + "step": 3413 + }, + { + "epoch": 0.35462760984730446, + "grad_norm": 0.4185909032821655, + "learning_rate": 7.204947346840106e-05, + "loss": 1.7178, + "step": 3414 + }, + { + "epoch": 0.3547314843668848, + "grad_norm": 0.40856650471687317, + "learning_rate": 7.203482797147512e-05, + "loss": 1.5386, + "step": 3415 + }, + { + "epoch": 0.35483535888646517, + "grad_norm": 0.40840035676956177, + "learning_rate": 7.202018012801185e-05, + "loss": 1.8462, + "step": 3416 + }, + { + "epoch": 0.3549392334060455, + "grad_norm": 0.4034786820411682, + "learning_rate": 7.200552993957117e-05, + "loss": 1.6743, + "step": 3417 + }, + { + "epoch": 0.3550431079256258, + "grad_norm": 0.39444810152053833, + "learning_rate": 7.199087740771321e-05, + "loss": 1.6801, + "step": 3418 + }, + { + "epoch": 0.3551469824452062, + "grad_norm": 0.4487433433532715, + "learning_rate": 7.197622253399833e-05, + "loss": 1.9052, + "step": 3419 + }, + { + "epoch": 0.35525085696478653, + "grad_norm": 0.37087568640708923, + "learning_rate": 7.196156531998718e-05, + "loss": 1.6895, + "step": 3420 + }, + { + "epoch": 0.35535473148436686, + "grad_norm": 0.3976334035396576, + "learning_rate": 7.194690576724064e-05, + "loss": 1.6898, + "step": 3421 + }, + { + "epoch": 0.35545860600394724, + "grad_norm": 0.3944483697414398, + "learning_rate": 7.19322438773198e-05, + "loss": 1.7735, + "step": 3422 + }, + { + "epoch": 0.35556248052352757, + "grad_norm": 0.4049598276615143, + "learning_rate": 7.191757965178609e-05, + "loss": 1.7066, + "step": 3423 + }, + { + "epoch": 0.35566635504310795, + "grad_norm": 0.3984638452529907, + "learning_rate": 7.19029130922011e-05, + "loss": 1.622, + "step": 3424 + }, + { + "epoch": 0.3557702295626883, + "grad_norm": 0.37969374656677246, + "learning_rate": 7.188824420012671e-05, + "loss": 1.5406, + "step": 3425 + }, + { + "epoch": 0.3558741040822686, + "grad_norm": 0.39994993805885315, + "learning_rate": 7.187357297712506e-05, + "loss": 1.6681, + "step": 3426 + }, + { + "epoch": 0.355977978601849, + "grad_norm": 0.38335686922073364, + "learning_rate": 7.185889942475851e-05, + "loss": 1.7488, + "step": 3427 + }, + { + "epoch": 0.3560818531214293, + "grad_norm": 0.37399354577064514, + "learning_rate": 7.184422354458966e-05, + "loss": 1.6041, + "step": 3428 + }, + { + "epoch": 0.35618572764100964, + "grad_norm": 0.3858386278152466, + "learning_rate": 7.182954533818142e-05, + "loss": 1.7968, + "step": 3429 + }, + { + "epoch": 0.35628960216059, + "grad_norm": 0.39342349767684937, + "learning_rate": 7.181486480709687e-05, + "loss": 1.7446, + "step": 3430 + }, + { + "epoch": 0.35639347668017035, + "grad_norm": 0.3939875364303589, + "learning_rate": 7.180018195289937e-05, + "loss": 1.7684, + "step": 3431 + }, + { + "epoch": 0.3564973511997507, + "grad_norm": 0.3789867162704468, + "learning_rate": 7.178549677715254e-05, + "loss": 1.7591, + "step": 3432 + }, + { + "epoch": 0.35660122571933106, + "grad_norm": 0.41745778918266296, + "learning_rate": 7.177080928142024e-05, + "loss": 1.8228, + "step": 3433 + }, + { + "epoch": 0.3567051002389114, + "grad_norm": 0.3856961727142334, + "learning_rate": 7.175611946726657e-05, + "loss": 1.7026, + "step": 3434 + }, + { + "epoch": 0.3568089747584917, + "grad_norm": 0.36199331283569336, + "learning_rate": 7.174142733625589e-05, + "loss": 1.4263, + "step": 3435 + }, + { + "epoch": 0.3569128492780721, + "grad_norm": 0.38251036405563354, + "learning_rate": 7.172673288995277e-05, + "loss": 1.6076, + "step": 3436 + }, + { + "epoch": 0.3570167237976524, + "grad_norm": 0.3627626895904541, + "learning_rate": 7.171203612992209e-05, + "loss": 1.7049, + "step": 3437 + }, + { + "epoch": 0.3571205983172328, + "grad_norm": 0.37557271122932434, + "learning_rate": 7.169733705772891e-05, + "loss": 1.5924, + "step": 3438 + }, + { + "epoch": 0.35722447283681313, + "grad_norm": 0.4099326729774475, + "learning_rate": 7.168263567493859e-05, + "loss": 1.8338, + "step": 3439 + }, + { + "epoch": 0.35732834735639346, + "grad_norm": 0.3865790367126465, + "learning_rate": 7.16679319831167e-05, + "loss": 1.8652, + "step": 3440 + }, + { + "epoch": 0.35743222187597384, + "grad_norm": 0.3986462950706482, + "learning_rate": 7.165322598382905e-05, + "loss": 1.8941, + "step": 3441 + }, + { + "epoch": 0.35753609639555417, + "grad_norm": 0.3964523673057556, + "learning_rate": 7.163851767864176e-05, + "loss": 1.8183, + "step": 3442 + }, + { + "epoch": 0.3576399709151345, + "grad_norm": 0.4160378575325012, + "learning_rate": 7.162380706912112e-05, + "loss": 1.7119, + "step": 3443 + }, + { + "epoch": 0.3577438454347149, + "grad_norm": 0.3585418462753296, + "learning_rate": 7.16090941568337e-05, + "loss": 1.5949, + "step": 3444 + }, + { + "epoch": 0.3578477199542952, + "grad_norm": 0.3526414930820465, + "learning_rate": 7.159437894334629e-05, + "loss": 1.5768, + "step": 3445 + }, + { + "epoch": 0.35795159447387553, + "grad_norm": 0.4531826674938202, + "learning_rate": 7.157966143022599e-05, + "loss": 1.7067, + "step": 3446 + }, + { + "epoch": 0.3580554689934559, + "grad_norm": 0.3793008625507355, + "learning_rate": 7.156494161904005e-05, + "loss": 1.6634, + "step": 3447 + }, + { + "epoch": 0.35815934351303624, + "grad_norm": 0.3665551245212555, + "learning_rate": 7.155021951135605e-05, + "loss": 1.6025, + "step": 3448 + }, + { + "epoch": 0.3582632180326166, + "grad_norm": 0.4132823646068573, + "learning_rate": 7.153549510874178e-05, + "loss": 1.709, + "step": 3449 + }, + { + "epoch": 0.35836709255219695, + "grad_norm": 0.38809409737586975, + "learning_rate": 7.152076841276527e-05, + "loss": 1.6638, + "step": 3450 + }, + { + "epoch": 0.3584709670717773, + "grad_norm": 0.388092964887619, + "learning_rate": 7.150603942499478e-05, + "loss": 1.7261, + "step": 3451 + }, + { + "epoch": 0.35857484159135766, + "grad_norm": 0.4031296372413635, + "learning_rate": 7.149130814699884e-05, + "loss": 1.7373, + "step": 3452 + }, + { + "epoch": 0.358678716110938, + "grad_norm": 0.36466771364212036, + "learning_rate": 7.147657458034622e-05, + "loss": 1.6811, + "step": 3453 + }, + { + "epoch": 0.3587825906305183, + "grad_norm": 0.3705892562866211, + "learning_rate": 7.146183872660595e-05, + "loss": 1.6032, + "step": 3454 + }, + { + "epoch": 0.3588864651500987, + "grad_norm": 0.3856612741947174, + "learning_rate": 7.144710058734724e-05, + "loss": 1.5488, + "step": 3455 + }, + { + "epoch": 0.358990339669679, + "grad_norm": 0.3789518177509308, + "learning_rate": 7.143236016413963e-05, + "loss": 1.6905, + "step": 3456 + }, + { + "epoch": 0.35909421418925935, + "grad_norm": 0.3953239321708679, + "learning_rate": 7.141761745855284e-05, + "loss": 1.7514, + "step": 3457 + }, + { + "epoch": 0.35919808870883974, + "grad_norm": 0.3837435841560364, + "learning_rate": 7.140287247215685e-05, + "loss": 1.3697, + "step": 3458 + }, + { + "epoch": 0.35930196322842006, + "grad_norm": 0.3627301752567291, + "learning_rate": 7.13881252065219e-05, + "loss": 1.6055, + "step": 3459 + }, + { + "epoch": 0.3594058377480004, + "grad_norm": 0.41467034816741943, + "learning_rate": 7.137337566321844e-05, + "loss": 1.7294, + "step": 3460 + }, + { + "epoch": 0.35950971226758077, + "grad_norm": 0.4232648015022278, + "learning_rate": 7.135862384381717e-05, + "loss": 1.8147, + "step": 3461 + }, + { + "epoch": 0.3596135867871611, + "grad_norm": 0.43227145075798035, + "learning_rate": 7.134386974988909e-05, + "loss": 1.5631, + "step": 3462 + }, + { + "epoch": 0.3597174613067415, + "grad_norm": 0.4435618817806244, + "learning_rate": 7.132911338300537e-05, + "loss": 1.8052, + "step": 3463 + }, + { + "epoch": 0.3598213358263218, + "grad_norm": 0.41515088081359863, + "learning_rate": 7.131435474473744e-05, + "loss": 1.8405, + "step": 3464 + }, + { + "epoch": 0.35992521034590214, + "grad_norm": 0.3956370949745178, + "learning_rate": 7.1299593836657e-05, + "loss": 1.6957, + "step": 3465 + }, + { + "epoch": 0.3600290848654825, + "grad_norm": 0.4101237654685974, + "learning_rate": 7.128483066033594e-05, + "loss": 1.6149, + "step": 3466 + }, + { + "epoch": 0.36013295938506285, + "grad_norm": 0.39739376306533813, + "learning_rate": 7.127006521734646e-05, + "loss": 1.7836, + "step": 3467 + }, + { + "epoch": 0.36023683390464317, + "grad_norm": 0.4086010158061981, + "learning_rate": 7.125529750926094e-05, + "loss": 1.7712, + "step": 3468 + }, + { + "epoch": 0.36034070842422355, + "grad_norm": 0.3838006556034088, + "learning_rate": 7.124052753765202e-05, + "loss": 1.7435, + "step": 3469 + }, + { + "epoch": 0.3604445829438039, + "grad_norm": 0.40963372588157654, + "learning_rate": 7.122575530409262e-05, + "loss": 1.4758, + "step": 3470 + }, + { + "epoch": 0.3605484574633842, + "grad_norm": 0.4056002199649811, + "learning_rate": 7.121098081015586e-05, + "loss": 1.6, + "step": 3471 + }, + { + "epoch": 0.3606523319829646, + "grad_norm": 0.46054428815841675, + "learning_rate": 7.119620405741506e-05, + "loss": 1.845, + "step": 3472 + }, + { + "epoch": 0.3607562065025449, + "grad_norm": 0.41523477435112, + "learning_rate": 7.118142504744389e-05, + "loss": 1.7532, + "step": 3473 + }, + { + "epoch": 0.36086008102212525, + "grad_norm": 0.40955179929733276, + "learning_rate": 7.116664378181616e-05, + "loss": 1.7787, + "step": 3474 + }, + { + "epoch": 0.36096395554170563, + "grad_norm": 0.39328327775001526, + "learning_rate": 7.115186026210597e-05, + "loss": 1.7785, + "step": 3475 + }, + { + "epoch": 0.36106783006128595, + "grad_norm": 0.380825012922287, + "learning_rate": 7.113707448988767e-05, + "loss": 1.6208, + "step": 3476 + }, + { + "epoch": 0.36117170458086634, + "grad_norm": 0.3730035722255707, + "learning_rate": 7.11222864667358e-05, + "loss": 1.66, + "step": 3477 + }, + { + "epoch": 0.36127557910044666, + "grad_norm": 0.39808517694473267, + "learning_rate": 7.110749619422519e-05, + "loss": 1.6975, + "step": 3478 + }, + { + "epoch": 0.361379453620027, + "grad_norm": 0.4094315767288208, + "learning_rate": 7.109270367393088e-05, + "loss": 1.7015, + "step": 3479 + }, + { + "epoch": 0.3614833281396074, + "grad_norm": 0.41740068793296814, + "learning_rate": 7.107790890742815e-05, + "loss": 1.7778, + "step": 3480 + }, + { + "epoch": 0.3615872026591877, + "grad_norm": 0.41227155923843384, + "learning_rate": 7.106311189629253e-05, + "loss": 1.8917, + "step": 3481 + }, + { + "epoch": 0.36169107717876803, + "grad_norm": 0.3613469898700714, + "learning_rate": 7.104831264209982e-05, + "loss": 1.6972, + "step": 3482 + }, + { + "epoch": 0.3617949516983484, + "grad_norm": 0.402305543422699, + "learning_rate": 7.103351114642596e-05, + "loss": 1.5879, + "step": 3483 + }, + { + "epoch": 0.36189882621792874, + "grad_norm": 0.39771807193756104, + "learning_rate": 7.101870741084726e-05, + "loss": 1.7124, + "step": 3484 + }, + { + "epoch": 0.36200270073750906, + "grad_norm": 0.36613133549690247, + "learning_rate": 7.100390143694016e-05, + "loss": 1.681, + "step": 3485 + }, + { + "epoch": 0.36210657525708945, + "grad_norm": 0.3933365046977997, + "learning_rate": 7.098909322628138e-05, + "loss": 1.703, + "step": 3486 + }, + { + "epoch": 0.3622104497766698, + "grad_norm": 0.39445760846138, + "learning_rate": 7.097428278044792e-05, + "loss": 1.7333, + "step": 3487 + }, + { + "epoch": 0.36231432429625016, + "grad_norm": 0.3644881844520569, + "learning_rate": 7.095947010101693e-05, + "loss": 1.713, + "step": 3488 + }, + { + "epoch": 0.3624181988158305, + "grad_norm": 0.3871423304080963, + "learning_rate": 7.094465518956588e-05, + "loss": 1.7749, + "step": 3489 + }, + { + "epoch": 0.3625220733354108, + "grad_norm": 0.38430991768836975, + "learning_rate": 7.092983804767242e-05, + "loss": 1.6216, + "step": 3490 + }, + { + "epoch": 0.3626259478549912, + "grad_norm": 0.3718239367008209, + "learning_rate": 7.091501867691446e-05, + "loss": 1.6914, + "step": 3491 + }, + { + "epoch": 0.3627298223745715, + "grad_norm": 0.39211010932922363, + "learning_rate": 7.090019707887017e-05, + "loss": 1.4819, + "step": 3492 + }, + { + "epoch": 0.36283369689415185, + "grad_norm": 0.3777989149093628, + "learning_rate": 7.08853732551179e-05, + "loss": 1.6425, + "step": 3493 + }, + { + "epoch": 0.36293757141373223, + "grad_norm": 0.3853297829627991, + "learning_rate": 7.087054720723629e-05, + "loss": 1.6724, + "step": 3494 + }, + { + "epoch": 0.36304144593331256, + "grad_norm": 0.3982897400856018, + "learning_rate": 7.085571893680421e-05, + "loss": 1.7549, + "step": 3495 + }, + { + "epoch": 0.3631453204528929, + "grad_norm": 0.38394638895988464, + "learning_rate": 7.084088844540074e-05, + "loss": 1.7959, + "step": 3496 + }, + { + "epoch": 0.36324919497247327, + "grad_norm": 0.4506365656852722, + "learning_rate": 7.08260557346052e-05, + "loss": 1.907, + "step": 3497 + }, + { + "epoch": 0.3633530694920536, + "grad_norm": 0.43902164697647095, + "learning_rate": 7.081122080599721e-05, + "loss": 1.7573, + "step": 3498 + }, + { + "epoch": 0.3634569440116339, + "grad_norm": 0.4061184525489807, + "learning_rate": 7.079638366115653e-05, + "loss": 1.8399, + "step": 3499 + }, + { + "epoch": 0.3635608185312143, + "grad_norm": 0.3644791841506958, + "learning_rate": 7.078154430166319e-05, + "loss": 1.5973, + "step": 3500 + }, + { + "epoch": 0.36366469305079463, + "grad_norm": 0.44396528601646423, + "learning_rate": 7.07667027290975e-05, + "loss": 1.8829, + "step": 3501 + }, + { + "epoch": 0.363768567570375, + "grad_norm": 0.44041764736175537, + "learning_rate": 7.075185894503993e-05, + "loss": 1.9544, + "step": 3502 + }, + { + "epoch": 0.36387244208995534, + "grad_norm": 0.3821714520454407, + "learning_rate": 7.073701295107128e-05, + "loss": 1.7017, + "step": 3503 + }, + { + "epoch": 0.36397631660953567, + "grad_norm": 0.3823404014110565, + "learning_rate": 7.072216474877249e-05, + "loss": 1.5732, + "step": 3504 + }, + { + "epoch": 0.36408019112911605, + "grad_norm": 0.38946443796157837, + "learning_rate": 7.070731433972481e-05, + "loss": 1.5651, + "step": 3505 + }, + { + "epoch": 0.3641840656486964, + "grad_norm": 0.3907792270183563, + "learning_rate": 7.069246172550967e-05, + "loss": 1.5276, + "step": 3506 + }, + { + "epoch": 0.3642879401682767, + "grad_norm": 0.374189168214798, + "learning_rate": 7.067760690770876e-05, + "loss": 1.4874, + "step": 3507 + }, + { + "epoch": 0.3643918146878571, + "grad_norm": 0.37600815296173096, + "learning_rate": 7.066274988790399e-05, + "loss": 1.6168, + "step": 3508 + }, + { + "epoch": 0.3644956892074374, + "grad_norm": 0.39330026507377625, + "learning_rate": 7.064789066767755e-05, + "loss": 1.8023, + "step": 3509 + }, + { + "epoch": 0.36459956372701774, + "grad_norm": 0.4047221541404724, + "learning_rate": 7.063302924861182e-05, + "loss": 1.718, + "step": 3510 + }, + { + "epoch": 0.3647034382465981, + "grad_norm": 0.3758139908313751, + "learning_rate": 7.06181656322894e-05, + "loss": 1.6952, + "step": 3511 + }, + { + "epoch": 0.36480731276617845, + "grad_norm": 0.3800835907459259, + "learning_rate": 7.060329982029317e-05, + "loss": 1.5578, + "step": 3512 + }, + { + "epoch": 0.3649111872857588, + "grad_norm": 0.38068559765815735, + "learning_rate": 7.058843181420624e-05, + "loss": 1.7363, + "step": 3513 + }, + { + "epoch": 0.36501506180533916, + "grad_norm": 0.40776312351226807, + "learning_rate": 7.05735616156119e-05, + "loss": 1.5816, + "step": 3514 + }, + { + "epoch": 0.3651189363249195, + "grad_norm": 0.40851452946662903, + "learning_rate": 7.055868922609374e-05, + "loss": 1.8035, + "step": 3515 + }, + { + "epoch": 0.36522281084449987, + "grad_norm": 0.4785706698894501, + "learning_rate": 7.054381464723554e-05, + "loss": 2.132, + "step": 3516 + }, + { + "epoch": 0.3653266853640802, + "grad_norm": 0.3917141556739807, + "learning_rate": 7.052893788062131e-05, + "loss": 1.7564, + "step": 3517 + }, + { + "epoch": 0.3654305598836605, + "grad_norm": 0.35814887285232544, + "learning_rate": 7.051405892783536e-05, + "loss": 1.6212, + "step": 3518 + }, + { + "epoch": 0.3655344344032409, + "grad_norm": 0.37529706954956055, + "learning_rate": 7.049917779046213e-05, + "loss": 1.6613, + "step": 3519 + }, + { + "epoch": 0.36563830892282123, + "grad_norm": 0.4299023449420929, + "learning_rate": 7.048429447008636e-05, + "loss": 1.7108, + "step": 3520 + }, + { + "epoch": 0.36574218344240156, + "grad_norm": 0.39884042739868164, + "learning_rate": 7.046940896829302e-05, + "loss": 1.722, + "step": 3521 + }, + { + "epoch": 0.36584605796198194, + "grad_norm": 0.3787887394428253, + "learning_rate": 7.04545212866673e-05, + "loss": 1.7693, + "step": 3522 + }, + { + "epoch": 0.36594993248156227, + "grad_norm": 0.37919625639915466, + "learning_rate": 7.043963142679459e-05, + "loss": 1.8372, + "step": 3523 + }, + { + "epoch": 0.3660538070011426, + "grad_norm": 0.4805675446987152, + "learning_rate": 7.04247393902606e-05, + "loss": 1.9524, + "step": 3524 + }, + { + "epoch": 0.366157681520723, + "grad_norm": 0.3908388316631317, + "learning_rate": 7.040984517865116e-05, + "loss": 1.7887, + "step": 3525 + }, + { + "epoch": 0.3662615560403033, + "grad_norm": 0.39634791016578674, + "learning_rate": 7.039494879355242e-05, + "loss": 1.7503, + "step": 3526 + }, + { + "epoch": 0.3663654305598837, + "grad_norm": 0.3950914740562439, + "learning_rate": 7.038005023655072e-05, + "loss": 1.6939, + "step": 3527 + }, + { + "epoch": 0.366469305079464, + "grad_norm": 0.3956228792667389, + "learning_rate": 7.036514950923264e-05, + "loss": 1.7095, + "step": 3528 + }, + { + "epoch": 0.36657317959904434, + "grad_norm": 0.38164466619491577, + "learning_rate": 7.0350246613185e-05, + "loss": 1.6409, + "step": 3529 + }, + { + "epoch": 0.3666770541186247, + "grad_norm": 0.39597025513648987, + "learning_rate": 7.033534154999481e-05, + "loss": 1.781, + "step": 3530 + }, + { + "epoch": 0.36678092863820505, + "grad_norm": 0.3930729329586029, + "learning_rate": 7.032043432124939e-05, + "loss": 1.7551, + "step": 3531 + }, + { + "epoch": 0.3668848031577854, + "grad_norm": 0.38284942507743835, + "learning_rate": 7.030552492853621e-05, + "loss": 1.4163, + "step": 3532 + }, + { + "epoch": 0.36698867767736576, + "grad_norm": 0.3863504230976105, + "learning_rate": 7.029061337344303e-05, + "loss": 1.8067, + "step": 3533 + }, + { + "epoch": 0.3670925521969461, + "grad_norm": 0.42066165804862976, + "learning_rate": 7.027569965755777e-05, + "loss": 1.819, + "step": 3534 + }, + { + "epoch": 0.3671964267165264, + "grad_norm": 0.3733902871608734, + "learning_rate": 7.026078378246868e-05, + "loss": 1.6656, + "step": 3535 + }, + { + "epoch": 0.3673003012361068, + "grad_norm": 0.39036303758621216, + "learning_rate": 7.024586574976414e-05, + "loss": 1.7219, + "step": 3536 + }, + { + "epoch": 0.3674041757556871, + "grad_norm": 0.3913772702217102, + "learning_rate": 7.023094556103283e-05, + "loss": 1.6209, + "step": 3537 + }, + { + "epoch": 0.36750805027526745, + "grad_norm": 0.3987481892108917, + "learning_rate": 7.021602321786362e-05, + "loss": 1.7411, + "step": 3538 + }, + { + "epoch": 0.36761192479484783, + "grad_norm": 0.38136929273605347, + "learning_rate": 7.020109872184563e-05, + "loss": 1.6651, + "step": 3539 + }, + { + "epoch": 0.36771579931442816, + "grad_norm": 0.3978997766971588, + "learning_rate": 7.018617207456821e-05, + "loss": 1.7601, + "step": 3540 + }, + { + "epoch": 0.36781967383400854, + "grad_norm": 0.39841562509536743, + "learning_rate": 7.01712432776209e-05, + "loss": 1.6426, + "step": 3541 + }, + { + "epoch": 0.36792354835358887, + "grad_norm": 0.40489330887794495, + "learning_rate": 7.015631233259354e-05, + "loss": 1.4646, + "step": 3542 + }, + { + "epoch": 0.3680274228731692, + "grad_norm": 0.3978332579135895, + "learning_rate": 7.014137924107614e-05, + "loss": 1.7999, + "step": 3543 + }, + { + "epoch": 0.3681312973927496, + "grad_norm": 0.4058266878128052, + "learning_rate": 7.012644400465895e-05, + "loss": 1.8085, + "step": 3544 + }, + { + "epoch": 0.3682351719123299, + "grad_norm": 0.4187474846839905, + "learning_rate": 7.011150662493248e-05, + "loss": 1.851, + "step": 3545 + }, + { + "epoch": 0.36833904643191023, + "grad_norm": 0.43871933221817017, + "learning_rate": 7.009656710348742e-05, + "loss": 2.0409, + "step": 3546 + }, + { + "epoch": 0.3684429209514906, + "grad_norm": 0.4018607437610626, + "learning_rate": 7.008162544191474e-05, + "loss": 1.7269, + "step": 3547 + }, + { + "epoch": 0.36854679547107094, + "grad_norm": 0.4518108665943146, + "learning_rate": 7.006668164180557e-05, + "loss": 1.8257, + "step": 3548 + }, + { + "epoch": 0.36865066999065127, + "grad_norm": 0.4045441150665283, + "learning_rate": 7.005173570475135e-05, + "loss": 1.8499, + "step": 3549 + }, + { + "epoch": 0.36875454451023165, + "grad_norm": 0.41423842310905457, + "learning_rate": 7.003678763234367e-05, + "loss": 1.8105, + "step": 3550 + }, + { + "epoch": 0.368858419029812, + "grad_norm": 0.37954044342041016, + "learning_rate": 7.002183742617441e-05, + "loss": 1.7043, + "step": 3551 + }, + { + "epoch": 0.36896229354939236, + "grad_norm": 0.39509761333465576, + "learning_rate": 7.000688508783564e-05, + "loss": 1.7842, + "step": 3552 + }, + { + "epoch": 0.3690661680689727, + "grad_norm": 0.4210776686668396, + "learning_rate": 6.999193061891967e-05, + "loss": 1.5356, + "step": 3553 + }, + { + "epoch": 0.369170042588553, + "grad_norm": 0.4176965355873108, + "learning_rate": 6.997697402101904e-05, + "loss": 1.8425, + "step": 3554 + }, + { + "epoch": 0.3692739171081334, + "grad_norm": 0.4520731270313263, + "learning_rate": 6.996201529572648e-05, + "loss": 1.7272, + "step": 3555 + }, + { + "epoch": 0.3693777916277137, + "grad_norm": 0.3699086606502533, + "learning_rate": 6.994705444463501e-05, + "loss": 1.661, + "step": 3556 + }, + { + "epoch": 0.36948166614729405, + "grad_norm": 0.38104352355003357, + "learning_rate": 6.993209146933784e-05, + "loss": 1.5626, + "step": 3557 + }, + { + "epoch": 0.36958554066687443, + "grad_norm": 0.4503292739391327, + "learning_rate": 6.991712637142839e-05, + "loss": 1.8327, + "step": 3558 + }, + { + "epoch": 0.36968941518645476, + "grad_norm": 0.43513697385787964, + "learning_rate": 6.990215915250035e-05, + "loss": 1.6256, + "step": 3559 + }, + { + "epoch": 0.3697932897060351, + "grad_norm": 0.4127349555492401, + "learning_rate": 6.988718981414761e-05, + "loss": 1.8755, + "step": 3560 + }, + { + "epoch": 0.36989716422561547, + "grad_norm": 0.4628101885318756, + "learning_rate": 6.987221835796429e-05, + "loss": 1.8136, + "step": 3561 + }, + { + "epoch": 0.3700010387451958, + "grad_norm": 0.4025733768939972, + "learning_rate": 6.98572447855447e-05, + "loss": 1.66, + "step": 3562 + }, + { + "epoch": 0.3701049132647761, + "grad_norm": 0.3781670331954956, + "learning_rate": 6.984226909848346e-05, + "loss": 1.6159, + "step": 3563 + }, + { + "epoch": 0.3702087877843565, + "grad_norm": 0.40306591987609863, + "learning_rate": 6.982729129837531e-05, + "loss": 1.888, + "step": 3564 + }, + { + "epoch": 0.37031266230393683, + "grad_norm": 0.4283866584300995, + "learning_rate": 6.981231138681531e-05, + "loss": 1.7992, + "step": 3565 + }, + { + "epoch": 0.3704165368235172, + "grad_norm": 0.4207936227321625, + "learning_rate": 6.979732936539868e-05, + "loss": 1.7709, + "step": 3566 + }, + { + "epoch": 0.37052041134309754, + "grad_norm": 0.4613027274608612, + "learning_rate": 6.97823452357209e-05, + "loss": 1.765, + "step": 3567 + }, + { + "epoch": 0.37062428586267787, + "grad_norm": 0.40821146965026855, + "learning_rate": 6.976735899937767e-05, + "loss": 1.7137, + "step": 3568 + }, + { + "epoch": 0.37072816038225825, + "grad_norm": 0.3744267225265503, + "learning_rate": 6.97523706579649e-05, + "loss": 1.7768, + "step": 3569 + }, + { + "epoch": 0.3708320349018386, + "grad_norm": 0.410510778427124, + "learning_rate": 6.973738021307871e-05, + "loss": 1.4456, + "step": 3570 + }, + { + "epoch": 0.3709359094214189, + "grad_norm": 0.4135860800743103, + "learning_rate": 6.97223876663155e-05, + "loss": 1.6854, + "step": 3571 + }, + { + "epoch": 0.3710397839409993, + "grad_norm": 0.43364089727401733, + "learning_rate": 6.970739301927183e-05, + "loss": 1.7468, + "step": 3572 + }, + { + "epoch": 0.3711436584605796, + "grad_norm": 0.37639757990837097, + "learning_rate": 6.969239627354453e-05, + "loss": 1.6043, + "step": 3573 + }, + { + "epoch": 0.37124753298015994, + "grad_norm": 0.415420800447464, + "learning_rate": 6.967739743073065e-05, + "loss": 1.7144, + "step": 3574 + }, + { + "epoch": 0.3713514074997403, + "grad_norm": 0.39380958676338196, + "learning_rate": 6.966239649242743e-05, + "loss": 1.8287, + "step": 3575 + }, + { + "epoch": 0.37145528201932065, + "grad_norm": 0.41520291566848755, + "learning_rate": 6.964739346023235e-05, + "loss": 1.8377, + "step": 3576 + }, + { + "epoch": 0.371559156538901, + "grad_norm": 0.4204116761684418, + "learning_rate": 6.963238833574312e-05, + "loss": 1.7471, + "step": 3577 + }, + { + "epoch": 0.37166303105848136, + "grad_norm": 0.4574458599090576, + "learning_rate": 6.961738112055767e-05, + "loss": 1.4237, + "step": 3578 + }, + { + "epoch": 0.3717669055780617, + "grad_norm": 0.3843066990375519, + "learning_rate": 6.960237181627416e-05, + "loss": 1.7035, + "step": 3579 + }, + { + "epoch": 0.3718707800976421, + "grad_norm": 0.3904613256454468, + "learning_rate": 6.958736042449095e-05, + "loss": 1.8277, + "step": 3580 + }, + { + "epoch": 0.3719746546172224, + "grad_norm": 0.3790266811847687, + "learning_rate": 6.957234694680663e-05, + "loss": 1.7919, + "step": 3581 + }, + { + "epoch": 0.3720785291368027, + "grad_norm": 0.3974626362323761, + "learning_rate": 6.955733138482006e-05, + "loss": 1.795, + "step": 3582 + }, + { + "epoch": 0.3721824036563831, + "grad_norm": 0.3986896574497223, + "learning_rate": 6.954231374013022e-05, + "loss": 1.7831, + "step": 3583 + }, + { + "epoch": 0.37228627817596344, + "grad_norm": 0.4144607484340668, + "learning_rate": 6.952729401433642e-05, + "loss": 1.4946, + "step": 3584 + }, + { + "epoch": 0.37239015269554376, + "grad_norm": 0.4263852536678314, + "learning_rate": 6.951227220903813e-05, + "loss": 1.7404, + "step": 3585 + }, + { + "epoch": 0.37249402721512415, + "grad_norm": 0.4002395272254944, + "learning_rate": 6.949724832583504e-05, + "loss": 1.6906, + "step": 3586 + }, + { + "epoch": 0.3725979017347045, + "grad_norm": 0.41054767370224, + "learning_rate": 6.948222236632709e-05, + "loss": 1.7632, + "step": 3587 + }, + { + "epoch": 0.3727017762542848, + "grad_norm": 0.3650989234447479, + "learning_rate": 6.946719433211442e-05, + "loss": 1.6058, + "step": 3588 + }, + { + "epoch": 0.3728056507738652, + "grad_norm": 0.4016493856906891, + "learning_rate": 6.945216422479741e-05, + "loss": 1.7819, + "step": 3589 + }, + { + "epoch": 0.3729095252934455, + "grad_norm": 0.3928471505641937, + "learning_rate": 6.943713204597664e-05, + "loss": 1.8151, + "step": 3590 + }, + { + "epoch": 0.3730133998130259, + "grad_norm": 0.39063897728919983, + "learning_rate": 6.942209779725292e-05, + "loss": 1.775, + "step": 3591 + }, + { + "epoch": 0.3731172743326062, + "grad_norm": 0.38129961490631104, + "learning_rate": 6.94070614802273e-05, + "loss": 1.6656, + "step": 3592 + }, + { + "epoch": 0.37322114885218655, + "grad_norm": 0.438495934009552, + "learning_rate": 6.9392023096501e-05, + "loss": 1.7774, + "step": 3593 + }, + { + "epoch": 0.37332502337176693, + "grad_norm": 0.43866604566574097, + "learning_rate": 6.937698264767549e-05, + "loss": 1.8816, + "step": 3594 + }, + { + "epoch": 0.37342889789134726, + "grad_norm": 0.41461604833602905, + "learning_rate": 6.93619401353525e-05, + "loss": 1.7163, + "step": 3595 + }, + { + "epoch": 0.3735327724109276, + "grad_norm": 0.36867401003837585, + "learning_rate": 6.934689556113391e-05, + "loss": 1.7318, + "step": 3596 + }, + { + "epoch": 0.37363664693050797, + "grad_norm": 0.41185227036476135, + "learning_rate": 6.933184892662185e-05, + "loss": 1.8164, + "step": 3597 + }, + { + "epoch": 0.3737405214500883, + "grad_norm": 0.3863617777824402, + "learning_rate": 6.931680023341867e-05, + "loss": 1.7503, + "step": 3598 + }, + { + "epoch": 0.3738443959696686, + "grad_norm": 0.3842338025569916, + "learning_rate": 6.930174948312696e-05, + "loss": 1.6838, + "step": 3599 + }, + { + "epoch": 0.373948270489249, + "grad_norm": 0.3778021037578583, + "learning_rate": 6.928669667734945e-05, + "loss": 1.743, + "step": 3600 + }, + { + "epoch": 0.37405214500882933, + "grad_norm": 0.40780338644981384, + "learning_rate": 6.927164181768921e-05, + "loss": 1.868, + "step": 3601 + }, + { + "epoch": 0.37415601952840966, + "grad_norm": 0.4100748300552368, + "learning_rate": 6.925658490574945e-05, + "loss": 1.8172, + "step": 3602 + }, + { + "epoch": 0.37425989404799004, + "grad_norm": 0.3898886740207672, + "learning_rate": 6.92415259431336e-05, + "loss": 1.671, + "step": 3603 + }, + { + "epoch": 0.37436376856757037, + "grad_norm": 0.4094226062297821, + "learning_rate": 6.922646493144532e-05, + "loss": 1.6493, + "step": 3604 + }, + { + "epoch": 0.37446764308715075, + "grad_norm": 0.36988019943237305, + "learning_rate": 6.92114018722885e-05, + "loss": 1.6539, + "step": 3605 + }, + { + "epoch": 0.3745715176067311, + "grad_norm": 0.3859472870826721, + "learning_rate": 6.919633676726722e-05, + "loss": 1.5816, + "step": 3606 + }, + { + "epoch": 0.3746753921263114, + "grad_norm": 0.4098130464553833, + "learning_rate": 6.918126961798583e-05, + "loss": 1.7291, + "step": 3607 + }, + { + "epoch": 0.3747792666458918, + "grad_norm": 0.40433475375175476, + "learning_rate": 6.916620042604883e-05, + "loss": 1.7515, + "step": 3608 + }, + { + "epoch": 0.3748831411654721, + "grad_norm": 0.38681918382644653, + "learning_rate": 6.915112919306101e-05, + "loss": 1.6408, + "step": 3609 + }, + { + "epoch": 0.37498701568505244, + "grad_norm": 0.43381497263908386, + "learning_rate": 6.913605592062731e-05, + "loss": 1.7996, + "step": 3610 + }, + { + "epoch": 0.3750908902046328, + "grad_norm": 0.37756863236427307, + "learning_rate": 6.91209806103529e-05, + "loss": 1.6188, + "step": 3611 + }, + { + "epoch": 0.37519476472421315, + "grad_norm": 0.44089293479919434, + "learning_rate": 6.910590326384323e-05, + "loss": 1.8074, + "step": 3612 + }, + { + "epoch": 0.3752986392437935, + "grad_norm": 0.3893734812736511, + "learning_rate": 6.909082388270389e-05, + "loss": 1.6607, + "step": 3613 + }, + { + "epoch": 0.37540251376337386, + "grad_norm": 0.40464848279953003, + "learning_rate": 6.90757424685407e-05, + "loss": 1.7711, + "step": 3614 + }, + { + "epoch": 0.3755063882829542, + "grad_norm": 0.3943893313407898, + "learning_rate": 6.906065902295975e-05, + "loss": 1.8252, + "step": 3615 + }, + { + "epoch": 0.3756102628025345, + "grad_norm": 0.38202980160713196, + "learning_rate": 6.904557354756728e-05, + "loss": 1.7832, + "step": 3616 + }, + { + "epoch": 0.3757141373221149, + "grad_norm": 0.38370367884635925, + "learning_rate": 6.903048604396981e-05, + "loss": 1.6776, + "step": 3617 + }, + { + "epoch": 0.3758180118416952, + "grad_norm": 0.39055919647216797, + "learning_rate": 6.9015396513774e-05, + "loss": 1.7664, + "step": 3618 + }, + { + "epoch": 0.3759218863612756, + "grad_norm": 0.3747326731681824, + "learning_rate": 6.900030495858681e-05, + "loss": 1.5807, + "step": 3619 + }, + { + "epoch": 0.37602576088085593, + "grad_norm": 0.4175710380077362, + "learning_rate": 6.898521138001533e-05, + "loss": 1.8023, + "step": 3620 + }, + { + "epoch": 0.37612963540043626, + "grad_norm": 0.4446984529495239, + "learning_rate": 6.897011577966693e-05, + "loss": 1.7831, + "step": 3621 + }, + { + "epoch": 0.37623350992001664, + "grad_norm": 0.3818349242210388, + "learning_rate": 6.895501815914917e-05, + "loss": 1.7742, + "step": 3622 + }, + { + "epoch": 0.37633738443959697, + "grad_norm": 0.4104137122631073, + "learning_rate": 6.893991852006983e-05, + "loss": 1.6418, + "step": 3623 + }, + { + "epoch": 0.3764412589591773, + "grad_norm": 0.3920380771160126, + "learning_rate": 6.892481686403694e-05, + "loss": 1.6307, + "step": 3624 + }, + { + "epoch": 0.3765451334787577, + "grad_norm": 0.36929798126220703, + "learning_rate": 6.890971319265863e-05, + "loss": 1.6734, + "step": 3625 + }, + { + "epoch": 0.376649007998338, + "grad_norm": 0.39406248927116394, + "learning_rate": 6.889460750754337e-05, + "loss": 1.6584, + "step": 3626 + }, + { + "epoch": 0.37675288251791833, + "grad_norm": 0.4494403898715973, + "learning_rate": 6.887949981029981e-05, + "loss": 1.9331, + "step": 3627 + }, + { + "epoch": 0.3768567570374987, + "grad_norm": 0.4188593924045563, + "learning_rate": 6.886439010253678e-05, + "loss": 1.8305, + "step": 3628 + }, + { + "epoch": 0.37696063155707904, + "grad_norm": 0.3847233057022095, + "learning_rate": 6.884927838586336e-05, + "loss": 1.675, + "step": 3629 + }, + { + "epoch": 0.3770645060766594, + "grad_norm": 0.3903995156288147, + "learning_rate": 6.88341646618888e-05, + "loss": 1.6902, + "step": 3630 + }, + { + "epoch": 0.37716838059623975, + "grad_norm": 0.38415199518203735, + "learning_rate": 6.881904893222265e-05, + "loss": 1.593, + "step": 3631 + }, + { + "epoch": 0.3772722551158201, + "grad_norm": 0.3710866868495941, + "learning_rate": 6.880393119847456e-05, + "loss": 1.5337, + "step": 3632 + }, + { + "epoch": 0.37737612963540046, + "grad_norm": 0.3745911419391632, + "learning_rate": 6.878881146225447e-05, + "loss": 1.7957, + "step": 3633 + }, + { + "epoch": 0.3774800041549808, + "grad_norm": 0.42726603150367737, + "learning_rate": 6.877368972517252e-05, + "loss": 1.783, + "step": 3634 + }, + { + "epoch": 0.3775838786745611, + "grad_norm": 0.36636191606521606, + "learning_rate": 6.875856598883907e-05, + "loss": 1.756, + "step": 3635 + }, + { + "epoch": 0.3776877531941415, + "grad_norm": 0.40645745396614075, + "learning_rate": 6.874344025486464e-05, + "loss": 1.7357, + "step": 3636 + }, + { + "epoch": 0.3777916277137218, + "grad_norm": 0.42991214990615845, + "learning_rate": 6.872831252486005e-05, + "loss": 1.9177, + "step": 3637 + }, + { + "epoch": 0.37789550223330215, + "grad_norm": 0.39431822299957275, + "learning_rate": 6.871318280043626e-05, + "loss": 1.6861, + "step": 3638 + }, + { + "epoch": 0.37799937675288253, + "grad_norm": 0.4022711515426636, + "learning_rate": 6.869805108320446e-05, + "loss": 1.7634, + "step": 3639 + }, + { + "epoch": 0.37810325127246286, + "grad_norm": 0.3846032917499542, + "learning_rate": 6.868291737477608e-05, + "loss": 1.616, + "step": 3640 + }, + { + "epoch": 0.3782071257920432, + "grad_norm": 0.3910578787326813, + "learning_rate": 6.866778167676272e-05, + "loss": 1.6158, + "step": 3641 + }, + { + "epoch": 0.37831100031162357, + "grad_norm": 0.39738887548446655, + "learning_rate": 6.865264399077623e-05, + "loss": 1.657, + "step": 3642 + }, + { + "epoch": 0.3784148748312039, + "grad_norm": 0.3639686703681946, + "learning_rate": 6.863750431842865e-05, + "loss": 1.536, + "step": 3643 + }, + { + "epoch": 0.3785187493507843, + "grad_norm": 0.4142943322658539, + "learning_rate": 6.862236266133225e-05, + "loss": 1.8389, + "step": 3644 + }, + { + "epoch": 0.3786226238703646, + "grad_norm": 0.42220360040664673, + "learning_rate": 6.860721902109948e-05, + "loss": 1.6854, + "step": 3645 + }, + { + "epoch": 0.37872649838994493, + "grad_norm": 0.43140527606010437, + "learning_rate": 6.859207339934302e-05, + "loss": 1.8258, + "step": 3646 + }, + { + "epoch": 0.3788303729095253, + "grad_norm": 0.40784284472465515, + "learning_rate": 6.857692579767576e-05, + "loss": 1.7949, + "step": 3647 + }, + { + "epoch": 0.37893424742910564, + "grad_norm": 0.39824676513671875, + "learning_rate": 6.856177621771083e-05, + "loss": 1.6723, + "step": 3648 + }, + { + "epoch": 0.37903812194868597, + "grad_norm": 0.3684784471988678, + "learning_rate": 6.854662466106151e-05, + "loss": 1.7018, + "step": 3649 + }, + { + "epoch": 0.37914199646826635, + "grad_norm": 0.4189784824848175, + "learning_rate": 6.853147112934132e-05, + "loss": 1.7914, + "step": 3650 + }, + { + "epoch": 0.3792458709878467, + "grad_norm": 0.41028210520744324, + "learning_rate": 6.851631562416402e-05, + "loss": 1.7757, + "step": 3651 + }, + { + "epoch": 0.379349745507427, + "grad_norm": 0.37003424763679504, + "learning_rate": 6.850115814714355e-05, + "loss": 1.5974, + "step": 3652 + }, + { + "epoch": 0.3794536200270074, + "grad_norm": 0.4214622676372528, + "learning_rate": 6.848599869989403e-05, + "loss": 1.8257, + "step": 3653 + }, + { + "epoch": 0.3795574945465877, + "grad_norm": 0.4845244586467743, + "learning_rate": 6.847083728402986e-05, + "loss": 1.5854, + "step": 3654 + }, + { + "epoch": 0.37966136906616804, + "grad_norm": 0.4533901512622833, + "learning_rate": 6.845567390116558e-05, + "loss": 1.8799, + "step": 3655 + }, + { + "epoch": 0.3797652435857484, + "grad_norm": 0.43582406640052795, + "learning_rate": 6.8440508552916e-05, + "loss": 1.9001, + "step": 3656 + }, + { + "epoch": 0.37986911810532875, + "grad_norm": 0.3865016996860504, + "learning_rate": 6.84253412408961e-05, + "loss": 1.5654, + "step": 3657 + }, + { + "epoch": 0.37997299262490913, + "grad_norm": 0.3525942862033844, + "learning_rate": 6.841017196672109e-05, + "loss": 1.5015, + "step": 3658 + }, + { + "epoch": 0.38007686714448946, + "grad_norm": 0.3993504047393799, + "learning_rate": 6.839500073200636e-05, + "loss": 1.7024, + "step": 3659 + }, + { + "epoch": 0.3801807416640698, + "grad_norm": 0.4344191253185272, + "learning_rate": 6.837982753836755e-05, + "loss": 1.7241, + "step": 3660 + }, + { + "epoch": 0.38028461618365017, + "grad_norm": 0.3942391574382782, + "learning_rate": 6.836465238742045e-05, + "loss": 1.6898, + "step": 3661 + }, + { + "epoch": 0.3803884907032305, + "grad_norm": 0.4017980396747589, + "learning_rate": 6.834947528078114e-05, + "loss": 1.8323, + "step": 3662 + }, + { + "epoch": 0.3804923652228108, + "grad_norm": 0.4126379191875458, + "learning_rate": 6.833429622006584e-05, + "loss": 1.7061, + "step": 3663 + }, + { + "epoch": 0.3805962397423912, + "grad_norm": 0.4057486951351166, + "learning_rate": 6.8319115206891e-05, + "loss": 1.8332, + "step": 3664 + }, + { + "epoch": 0.38070011426197153, + "grad_norm": 0.4050893783569336, + "learning_rate": 6.83039322428733e-05, + "loss": 1.6706, + "step": 3665 + }, + { + "epoch": 0.38080398878155186, + "grad_norm": 0.37479984760284424, + "learning_rate": 6.828874732962958e-05, + "loss": 1.4825, + "step": 3666 + }, + { + "epoch": 0.38090786330113224, + "grad_norm": 0.39387476444244385, + "learning_rate": 6.827356046877693e-05, + "loss": 1.6615, + "step": 3667 + }, + { + "epoch": 0.38101173782071257, + "grad_norm": 0.35786718130111694, + "learning_rate": 6.825837166193263e-05, + "loss": 1.5442, + "step": 3668 + }, + { + "epoch": 0.38111561234029295, + "grad_norm": 0.3664378225803375, + "learning_rate": 6.824318091071416e-05, + "loss": 1.6057, + "step": 3669 + }, + { + "epoch": 0.3812194868598733, + "grad_norm": 0.4093641936779022, + "learning_rate": 6.822798821673923e-05, + "loss": 1.7737, + "step": 3670 + }, + { + "epoch": 0.3813233613794536, + "grad_norm": 0.3681464195251465, + "learning_rate": 6.821279358162575e-05, + "loss": 1.506, + "step": 3671 + }, + { + "epoch": 0.381427235899034, + "grad_norm": 0.4376184344291687, + "learning_rate": 6.819759700699181e-05, + "loss": 1.7086, + "step": 3672 + }, + { + "epoch": 0.3815311104186143, + "grad_norm": 0.4148000180721283, + "learning_rate": 6.818239849445574e-05, + "loss": 1.8619, + "step": 3673 + }, + { + "epoch": 0.38163498493819464, + "grad_norm": 0.39664286375045776, + "learning_rate": 6.816719804563606e-05, + "loss": 1.7338, + "step": 3674 + }, + { + "epoch": 0.381738859457775, + "grad_norm": 0.37723055481910706, + "learning_rate": 6.815199566215149e-05, + "loss": 1.5691, + "step": 3675 + }, + { + "epoch": 0.38184273397735535, + "grad_norm": 0.42841121554374695, + "learning_rate": 6.813679134562098e-05, + "loss": 1.8749, + "step": 3676 + }, + { + "epoch": 0.3819466084969357, + "grad_norm": 0.3727743625640869, + "learning_rate": 6.812158509766366e-05, + "loss": 1.6885, + "step": 3677 + }, + { + "epoch": 0.38205048301651606, + "grad_norm": 0.401883989572525, + "learning_rate": 6.810637691989887e-05, + "loss": 1.7689, + "step": 3678 + }, + { + "epoch": 0.3821543575360964, + "grad_norm": 0.3472015857696533, + "learning_rate": 6.809116681394618e-05, + "loss": 1.6354, + "step": 3679 + }, + { + "epoch": 0.3822582320556767, + "grad_norm": 0.3772028684616089, + "learning_rate": 6.807595478142536e-05, + "loss": 1.6749, + "step": 3680 + }, + { + "epoch": 0.3823621065752571, + "grad_norm": 0.37408995628356934, + "learning_rate": 6.806074082395633e-05, + "loss": 1.7558, + "step": 3681 + }, + { + "epoch": 0.3824659810948374, + "grad_norm": 0.4021287262439728, + "learning_rate": 6.80455249431593e-05, + "loss": 1.6639, + "step": 3682 + }, + { + "epoch": 0.3825698556144178, + "grad_norm": 0.3701534867286682, + "learning_rate": 6.803030714065461e-05, + "loss": 1.6497, + "step": 3683 + }, + { + "epoch": 0.38267373013399814, + "grad_norm": 0.4467286765575409, + "learning_rate": 6.801508741806285e-05, + "loss": 1.86, + "step": 3684 + }, + { + "epoch": 0.38277760465357846, + "grad_norm": 0.40490084886550903, + "learning_rate": 6.799986577700481e-05, + "loss": 1.7865, + "step": 3685 + }, + { + "epoch": 0.38288147917315885, + "grad_norm": 0.39086583256721497, + "learning_rate": 6.798464221910147e-05, + "loss": 1.7091, + "step": 3686 + }, + { + "epoch": 0.38298535369273917, + "grad_norm": 0.4162215292453766, + "learning_rate": 6.796941674597402e-05, + "loss": 1.7886, + "step": 3687 + }, + { + "epoch": 0.3830892282123195, + "grad_norm": 0.3935025930404663, + "learning_rate": 6.795418935924384e-05, + "loss": 1.8681, + "step": 3688 + }, + { + "epoch": 0.3831931027318999, + "grad_norm": 0.40464386343955994, + "learning_rate": 6.793896006053255e-05, + "loss": 1.8301, + "step": 3689 + }, + { + "epoch": 0.3832969772514802, + "grad_norm": 0.3767825961112976, + "learning_rate": 6.792372885146195e-05, + "loss": 1.6537, + "step": 3690 + }, + { + "epoch": 0.38340085177106054, + "grad_norm": 0.36534345149993896, + "learning_rate": 6.790849573365404e-05, + "loss": 1.5565, + "step": 3691 + }, + { + "epoch": 0.3835047262906409, + "grad_norm": 0.38759198784828186, + "learning_rate": 6.7893260708731e-05, + "loss": 1.8307, + "step": 3692 + }, + { + "epoch": 0.38360860081022125, + "grad_norm": 0.38123390078544617, + "learning_rate": 6.78780237783153e-05, + "loss": 1.6495, + "step": 3693 + }, + { + "epoch": 0.38371247532980157, + "grad_norm": 0.4280903935432434, + "learning_rate": 6.786278494402952e-05, + "loss": 1.8522, + "step": 3694 + }, + { + "epoch": 0.38381634984938195, + "grad_norm": 0.42416825890541077, + "learning_rate": 6.784754420749645e-05, + "loss": 1.7361, + "step": 3695 + }, + { + "epoch": 0.3839202243689623, + "grad_norm": 0.37206903100013733, + "learning_rate": 6.783230157033915e-05, + "loss": 1.7802, + "step": 3696 + }, + { + "epoch": 0.38402409888854266, + "grad_norm": 0.4096907079219818, + "learning_rate": 6.781705703418084e-05, + "loss": 1.7411, + "step": 3697 + }, + { + "epoch": 0.384127973408123, + "grad_norm": 0.38418519496917725, + "learning_rate": 6.780181060064493e-05, + "loss": 1.4685, + "step": 3698 + }, + { + "epoch": 0.3842318479277033, + "grad_norm": 0.3541015088558197, + "learning_rate": 6.778656227135505e-05, + "loss": 1.4894, + "step": 3699 + }, + { + "epoch": 0.3843357224472837, + "grad_norm": 0.3923327326774597, + "learning_rate": 6.777131204793502e-05, + "loss": 1.5972, + "step": 3700 + }, + { + "epoch": 0.38443959696686403, + "grad_norm": 0.38584020733833313, + "learning_rate": 6.775605993200889e-05, + "loss": 1.474, + "step": 3701 + }, + { + "epoch": 0.38454347148644435, + "grad_norm": 0.42586928606033325, + "learning_rate": 6.774080592520088e-05, + "loss": 1.8876, + "step": 3702 + }, + { + "epoch": 0.38464734600602474, + "grad_norm": 0.4268626570701599, + "learning_rate": 6.77255500291354e-05, + "loss": 1.8743, + "step": 3703 + }, + { + "epoch": 0.38475122052560506, + "grad_norm": 0.43447503447532654, + "learning_rate": 6.771029224543713e-05, + "loss": 1.9229, + "step": 3704 + }, + { + "epoch": 0.3848550950451854, + "grad_norm": 0.36980974674224854, + "learning_rate": 6.769503257573089e-05, + "loss": 1.5463, + "step": 3705 + }, + { + "epoch": 0.3849589695647658, + "grad_norm": 0.3850055932998657, + "learning_rate": 6.767977102164167e-05, + "loss": 1.724, + "step": 3706 + }, + { + "epoch": 0.3850628440843461, + "grad_norm": 0.4217112064361572, + "learning_rate": 6.766450758479479e-05, + "loss": 1.965, + "step": 3707 + }, + { + "epoch": 0.3851667186039265, + "grad_norm": 0.3799114525318146, + "learning_rate": 6.76492422668156e-05, + "loss": 1.6066, + "step": 3708 + }, + { + "epoch": 0.3852705931235068, + "grad_norm": 0.39665165543556213, + "learning_rate": 6.76339750693298e-05, + "loss": 1.8164, + "step": 3709 + }, + { + "epoch": 0.38537446764308714, + "grad_norm": 0.38629379868507385, + "learning_rate": 6.76187059939632e-05, + "loss": 1.6843, + "step": 3710 + }, + { + "epoch": 0.3854783421626675, + "grad_norm": 0.4092472493648529, + "learning_rate": 6.760343504234184e-05, + "loss": 1.5059, + "step": 3711 + }, + { + "epoch": 0.38558221668224785, + "grad_norm": 0.40238040685653687, + "learning_rate": 6.758816221609196e-05, + "loss": 1.8893, + "step": 3712 + }, + { + "epoch": 0.3856860912018282, + "grad_norm": 0.39806345105171204, + "learning_rate": 6.757288751684e-05, + "loss": 1.7954, + "step": 3713 + }, + { + "epoch": 0.38578996572140856, + "grad_norm": 0.41928762197494507, + "learning_rate": 6.75576109462126e-05, + "loss": 1.8645, + "step": 3714 + }, + { + "epoch": 0.3858938402409889, + "grad_norm": 0.4039596915245056, + "learning_rate": 6.75423325058366e-05, + "loss": 1.8103, + "step": 3715 + }, + { + "epoch": 0.3859977147605692, + "grad_norm": 0.400309681892395, + "learning_rate": 6.7527052197339e-05, + "loss": 1.6896, + "step": 3716 + }, + { + "epoch": 0.3861015892801496, + "grad_norm": 0.43688875436782837, + "learning_rate": 6.751177002234706e-05, + "loss": 1.9697, + "step": 3717 + }, + { + "epoch": 0.3862054637997299, + "grad_norm": 0.5165666937828064, + "learning_rate": 6.749648598248821e-05, + "loss": 1.9304, + "step": 3718 + }, + { + "epoch": 0.38630933831931025, + "grad_norm": 0.4006554186344147, + "learning_rate": 6.74812000793901e-05, + "loss": 1.6306, + "step": 3719 + }, + { + "epoch": 0.38641321283889063, + "grad_norm": 0.3924461007118225, + "learning_rate": 6.74659123146805e-05, + "loss": 1.6424, + "step": 3720 + }, + { + "epoch": 0.38651708735847096, + "grad_norm": 0.35347139835357666, + "learning_rate": 6.745062268998752e-05, + "loss": 1.4823, + "step": 3721 + }, + { + "epoch": 0.38662096187805134, + "grad_norm": 0.41199618577957153, + "learning_rate": 6.743533120693932e-05, + "loss": 1.6857, + "step": 3722 + }, + { + "epoch": 0.38672483639763167, + "grad_norm": 0.40314194560050964, + "learning_rate": 6.742003786716435e-05, + "loss": 1.7333, + "step": 3723 + }, + { + "epoch": 0.386828710917212, + "grad_norm": 0.402383416891098, + "learning_rate": 6.740474267229123e-05, + "loss": 1.7304, + "step": 3724 + }, + { + "epoch": 0.3869325854367924, + "grad_norm": 0.4002920687198639, + "learning_rate": 6.738944562394878e-05, + "loss": 1.6984, + "step": 3725 + }, + { + "epoch": 0.3870364599563727, + "grad_norm": 0.3783153295516968, + "learning_rate": 6.737414672376601e-05, + "loss": 1.6048, + "step": 3726 + }, + { + "epoch": 0.38714033447595303, + "grad_norm": 0.4129243791103363, + "learning_rate": 6.735884597337214e-05, + "loss": 1.8856, + "step": 3727 + }, + { + "epoch": 0.3872442089955334, + "grad_norm": 0.36337247490882874, + "learning_rate": 6.73435433743966e-05, + "loss": 1.6613, + "step": 3728 + }, + { + "epoch": 0.38734808351511374, + "grad_norm": 0.38264912366867065, + "learning_rate": 6.732823892846895e-05, + "loss": 1.6385, + "step": 3729 + }, + { + "epoch": 0.38745195803469407, + "grad_norm": 0.394731342792511, + "learning_rate": 6.731293263721902e-05, + "loss": 1.6883, + "step": 3730 + }, + { + "epoch": 0.38755583255427445, + "grad_norm": 0.3739268183708191, + "learning_rate": 6.72976245022768e-05, + "loss": 1.677, + "step": 3731 + }, + { + "epoch": 0.3876597070738548, + "grad_norm": 0.37496718764305115, + "learning_rate": 6.728231452527251e-05, + "loss": 1.8645, + "step": 3732 + }, + { + "epoch": 0.3877635815934351, + "grad_norm": 0.39028218388557434, + "learning_rate": 6.726700270783655e-05, + "loss": 1.7683, + "step": 3733 + }, + { + "epoch": 0.3878674561130155, + "grad_norm": 0.4083595871925354, + "learning_rate": 6.725168905159945e-05, + "loss": 1.821, + "step": 3734 + }, + { + "epoch": 0.3879713306325958, + "grad_norm": 0.3999233543872833, + "learning_rate": 6.723637355819206e-05, + "loss": 1.7861, + "step": 3735 + }, + { + "epoch": 0.3880752051521762, + "grad_norm": 0.3761618435382843, + "learning_rate": 6.722105622924533e-05, + "loss": 1.7543, + "step": 3736 + }, + { + "epoch": 0.3881790796717565, + "grad_norm": 0.4310716390609741, + "learning_rate": 6.720573706639044e-05, + "loss": 1.7804, + "step": 3737 + }, + { + "epoch": 0.38828295419133685, + "grad_norm": 0.3662225902080536, + "learning_rate": 6.719041607125877e-05, + "loss": 1.5168, + "step": 3738 + }, + { + "epoch": 0.38838682871091723, + "grad_norm": 0.3875160813331604, + "learning_rate": 6.717509324548187e-05, + "loss": 1.801, + "step": 3739 + }, + { + "epoch": 0.38849070323049756, + "grad_norm": 0.379233181476593, + "learning_rate": 6.715976859069151e-05, + "loss": 1.6002, + "step": 3740 + }, + { + "epoch": 0.3885945777500779, + "grad_norm": 0.3968057632446289, + "learning_rate": 6.714444210851964e-05, + "loss": 1.5895, + "step": 3741 + }, + { + "epoch": 0.38869845226965827, + "grad_norm": 0.45486825704574585, + "learning_rate": 6.712911380059845e-05, + "loss": 1.8872, + "step": 3742 + }, + { + "epoch": 0.3888023267892386, + "grad_norm": 0.38296419382095337, + "learning_rate": 6.711378366856023e-05, + "loss": 1.7437, + "step": 3743 + }, + { + "epoch": 0.3889062013088189, + "grad_norm": 0.394843190908432, + "learning_rate": 6.709845171403754e-05, + "loss": 1.7002, + "step": 3744 + }, + { + "epoch": 0.3890100758283993, + "grad_norm": 0.4002247154712677, + "learning_rate": 6.70831179386631e-05, + "loss": 1.7328, + "step": 3745 + }, + { + "epoch": 0.38911395034797963, + "grad_norm": 0.45309674739837646, + "learning_rate": 6.706778234406987e-05, + "loss": 1.7358, + "step": 3746 + }, + { + "epoch": 0.38921782486756, + "grad_norm": 0.40047159790992737, + "learning_rate": 6.705244493189094e-05, + "loss": 1.7627, + "step": 3747 + }, + { + "epoch": 0.38932169938714034, + "grad_norm": 0.3902556300163269, + "learning_rate": 6.703710570375966e-05, + "loss": 1.6005, + "step": 3748 + }, + { + "epoch": 0.38942557390672067, + "grad_norm": 0.4025695025920868, + "learning_rate": 6.702176466130953e-05, + "loss": 1.6763, + "step": 3749 + }, + { + "epoch": 0.38952944842630105, + "grad_norm": 0.40774810314178467, + "learning_rate": 6.70064218061742e-05, + "loss": 1.8209, + "step": 3750 + }, + { + "epoch": 0.3896333229458814, + "grad_norm": 0.36347082257270813, + "learning_rate": 6.699107713998762e-05, + "loss": 1.4882, + "step": 3751 + }, + { + "epoch": 0.3897371974654617, + "grad_norm": 0.39139434695243835, + "learning_rate": 6.697573066438387e-05, + "loss": 1.713, + "step": 3752 + }, + { + "epoch": 0.3898410719850421, + "grad_norm": 0.4328778386116028, + "learning_rate": 6.69603823809972e-05, + "loss": 1.9039, + "step": 3753 + }, + { + "epoch": 0.3899449465046224, + "grad_norm": 0.38652610778808594, + "learning_rate": 6.694503229146212e-05, + "loss": 1.7179, + "step": 3754 + }, + { + "epoch": 0.39004882102420274, + "grad_norm": 0.36628860235214233, + "learning_rate": 6.692968039741328e-05, + "loss": 1.5286, + "step": 3755 + }, + { + "epoch": 0.3901526955437831, + "grad_norm": 0.3640875220298767, + "learning_rate": 6.691432670048552e-05, + "loss": 1.6235, + "step": 3756 + }, + { + "epoch": 0.39025657006336345, + "grad_norm": 0.3808233439922333, + "learning_rate": 6.689897120231389e-05, + "loss": 1.6554, + "step": 3757 + }, + { + "epoch": 0.3903604445829438, + "grad_norm": 0.4116235375404358, + "learning_rate": 6.688361390453368e-05, + "loss": 1.7059, + "step": 3758 + }, + { + "epoch": 0.39046431910252416, + "grad_norm": 0.39567288756370544, + "learning_rate": 6.686825480878026e-05, + "loss": 1.5193, + "step": 3759 + }, + { + "epoch": 0.3905681936221045, + "grad_norm": 0.39557141065597534, + "learning_rate": 6.685289391668929e-05, + "loss": 1.7383, + "step": 3760 + }, + { + "epoch": 0.39067206814168487, + "grad_norm": 0.40352147817611694, + "learning_rate": 6.683753122989656e-05, + "loss": 1.6945, + "step": 3761 + }, + { + "epoch": 0.3907759426612652, + "grad_norm": 0.38249486684799194, + "learning_rate": 6.682216675003811e-05, + "loss": 1.5765, + "step": 3762 + }, + { + "epoch": 0.3908798171808455, + "grad_norm": 0.3883412480354309, + "learning_rate": 6.680680047875014e-05, + "loss": 1.7523, + "step": 3763 + }, + { + "epoch": 0.3909836917004259, + "grad_norm": 0.38825148344039917, + "learning_rate": 6.679143241766898e-05, + "loss": 1.7129, + "step": 3764 + }, + { + "epoch": 0.39108756622000623, + "grad_norm": 0.4497055113315582, + "learning_rate": 6.677606256843126e-05, + "loss": 1.7823, + "step": 3765 + }, + { + "epoch": 0.39119144073958656, + "grad_norm": 0.39453697204589844, + "learning_rate": 6.676069093267374e-05, + "loss": 1.7184, + "step": 3766 + }, + { + "epoch": 0.39129531525916694, + "grad_norm": 0.4167402982711792, + "learning_rate": 6.674531751203335e-05, + "loss": 1.921, + "step": 3767 + }, + { + "epoch": 0.39139918977874727, + "grad_norm": 0.39788275957107544, + "learning_rate": 6.672994230814729e-05, + "loss": 1.6948, + "step": 3768 + }, + { + "epoch": 0.3915030642983276, + "grad_norm": 0.3702066242694855, + "learning_rate": 6.671456532265285e-05, + "loss": 1.6143, + "step": 3769 + }, + { + "epoch": 0.391606938817908, + "grad_norm": 0.4112524390220642, + "learning_rate": 6.669918655718759e-05, + "loss": 1.6526, + "step": 3770 + }, + { + "epoch": 0.3917108133374883, + "grad_norm": 0.4594886898994446, + "learning_rate": 6.668380601338924e-05, + "loss": 1.8916, + "step": 3771 + }, + { + "epoch": 0.3918146878570687, + "grad_norm": 0.3847801089286804, + "learning_rate": 6.666842369289566e-05, + "loss": 1.803, + "step": 3772 + }, + { + "epoch": 0.391918562376649, + "grad_norm": 0.413424551486969, + "learning_rate": 6.665303959734499e-05, + "loss": 1.7646, + "step": 3773 + }, + { + "epoch": 0.39202243689622934, + "grad_norm": 0.42914092540740967, + "learning_rate": 6.66376537283755e-05, + "loss": 1.8358, + "step": 3774 + }, + { + "epoch": 0.3921263114158097, + "grad_norm": 0.42516449093818665, + "learning_rate": 6.662226608762566e-05, + "loss": 1.7087, + "step": 3775 + }, + { + "epoch": 0.39223018593539005, + "grad_norm": 0.40384045243263245, + "learning_rate": 6.660687667673416e-05, + "loss": 1.81, + "step": 3776 + }, + { + "epoch": 0.3923340604549704, + "grad_norm": 0.3911544978618622, + "learning_rate": 6.659148549733983e-05, + "loss": 1.6589, + "step": 3777 + }, + { + "epoch": 0.39243793497455076, + "grad_norm": 0.3848823606967926, + "learning_rate": 6.657609255108169e-05, + "loss": 1.653, + "step": 3778 + }, + { + "epoch": 0.3925418094941311, + "grad_norm": 0.38892778754234314, + "learning_rate": 6.656069783959901e-05, + "loss": 1.7404, + "step": 3779 + }, + { + "epoch": 0.3926456840137114, + "grad_norm": 0.4406839609146118, + "learning_rate": 6.654530136453118e-05, + "loss": 2.1317, + "step": 3780 + }, + { + "epoch": 0.3927495585332918, + "grad_norm": 0.39131468534469604, + "learning_rate": 6.652990312751781e-05, + "loss": 1.864, + "step": 3781 + }, + { + "epoch": 0.3928534330528721, + "grad_norm": 0.38605618476867676, + "learning_rate": 6.65145031301987e-05, + "loss": 1.7456, + "step": 3782 + }, + { + "epoch": 0.39295730757245245, + "grad_norm": 0.3929480314254761, + "learning_rate": 6.649910137421383e-05, + "loss": 1.7215, + "step": 3783 + }, + { + "epoch": 0.39306118209203283, + "grad_norm": 0.4117045998573303, + "learning_rate": 6.648369786120336e-05, + "loss": 1.8767, + "step": 3784 + }, + { + "epoch": 0.39316505661161316, + "grad_norm": 0.4199231266975403, + "learning_rate": 6.646829259280765e-05, + "loss": 1.7678, + "step": 3785 + }, + { + "epoch": 0.39326893113119354, + "grad_norm": 0.3810809254646301, + "learning_rate": 6.645288557066722e-05, + "loss": 1.6417, + "step": 3786 + }, + { + "epoch": 0.39337280565077387, + "grad_norm": 0.40853750705718994, + "learning_rate": 6.643747679642282e-05, + "loss": 1.7405, + "step": 3787 + }, + { + "epoch": 0.3934766801703542, + "grad_norm": 0.3977396488189697, + "learning_rate": 6.642206627171536e-05, + "loss": 1.7114, + "step": 3788 + }, + { + "epoch": 0.3935805546899346, + "grad_norm": 0.40110906958580017, + "learning_rate": 6.640665399818594e-05, + "loss": 1.433, + "step": 3789 + }, + { + "epoch": 0.3936844292095149, + "grad_norm": 0.37747594714164734, + "learning_rate": 6.639123997747585e-05, + "loss": 1.6693, + "step": 3790 + }, + { + "epoch": 0.39378830372909523, + "grad_norm": 0.39845001697540283, + "learning_rate": 6.637582421122657e-05, + "loss": 1.7434, + "step": 3791 + }, + { + "epoch": 0.3938921782486756, + "grad_norm": 0.3821636438369751, + "learning_rate": 6.636040670107972e-05, + "loss": 1.6637, + "step": 3792 + }, + { + "epoch": 0.39399605276825594, + "grad_norm": 0.4459479749202728, + "learning_rate": 6.63449874486772e-05, + "loss": 1.9512, + "step": 3793 + }, + { + "epoch": 0.39409992728783627, + "grad_norm": 0.35890012979507446, + "learning_rate": 6.632956645566099e-05, + "loss": 1.5369, + "step": 3794 + }, + { + "epoch": 0.39420380180741665, + "grad_norm": 0.40800046920776367, + "learning_rate": 6.631414372367333e-05, + "loss": 1.7759, + "step": 3795 + }, + { + "epoch": 0.394307676326997, + "grad_norm": 0.42685750126838684, + "learning_rate": 6.629871925435663e-05, + "loss": 1.8498, + "step": 3796 + }, + { + "epoch": 0.3944115508465773, + "grad_norm": 0.3911482095718384, + "learning_rate": 6.628329304935346e-05, + "loss": 1.7042, + "step": 3797 + }, + { + "epoch": 0.3945154253661577, + "grad_norm": 0.3812938332557678, + "learning_rate": 6.62678651103066e-05, + "loss": 1.6208, + "step": 3798 + }, + { + "epoch": 0.394619299885738, + "grad_norm": 0.4290582239627838, + "learning_rate": 6.625243543885899e-05, + "loss": 1.8095, + "step": 3799 + }, + { + "epoch": 0.3947231744053184, + "grad_norm": 0.38044747710227966, + "learning_rate": 6.623700403665378e-05, + "loss": 1.821, + "step": 3800 + }, + { + "epoch": 0.3948270489248987, + "grad_norm": 0.42807307839393616, + "learning_rate": 6.622157090533431e-05, + "loss": 1.9329, + "step": 3801 + }, + { + "epoch": 0.39493092344447905, + "grad_norm": 0.3865020275115967, + "learning_rate": 6.620613604654405e-05, + "loss": 1.641, + "step": 3802 + }, + { + "epoch": 0.39503479796405944, + "grad_norm": 0.4715527892112732, + "learning_rate": 6.619069946192672e-05, + "loss": 1.8927, + "step": 3803 + }, + { + "epoch": 0.39513867248363976, + "grad_norm": 0.3802943229675293, + "learning_rate": 6.61752611531262e-05, + "loss": 1.7063, + "step": 3804 + }, + { + "epoch": 0.3952425470032201, + "grad_norm": 0.38618597388267517, + "learning_rate": 6.615982112178653e-05, + "loss": 1.6576, + "step": 3805 + }, + { + "epoch": 0.3953464215228005, + "grad_norm": 0.4029250741004944, + "learning_rate": 6.614437936955196e-05, + "loss": 1.7814, + "step": 3806 + }, + { + "epoch": 0.3954502960423808, + "grad_norm": 0.3960702121257782, + "learning_rate": 6.612893589806693e-05, + "loss": 1.7102, + "step": 3807 + }, + { + "epoch": 0.3955541705619611, + "grad_norm": 0.38714170455932617, + "learning_rate": 6.611349070897604e-05, + "loss": 1.6924, + "step": 3808 + }, + { + "epoch": 0.3956580450815415, + "grad_norm": 0.4107656180858612, + "learning_rate": 6.609804380392406e-05, + "loss": 1.5255, + "step": 3809 + }, + { + "epoch": 0.39576191960112184, + "grad_norm": 0.40633097290992737, + "learning_rate": 6.6082595184556e-05, + "loss": 1.734, + "step": 3810 + }, + { + "epoch": 0.3958657941207022, + "grad_norm": 0.4659360945224762, + "learning_rate": 6.606714485251699e-05, + "loss": 1.7854, + "step": 3811 + }, + { + "epoch": 0.39596966864028255, + "grad_norm": 0.4087457060813904, + "learning_rate": 6.605169280945238e-05, + "loss": 1.7713, + "step": 3812 + }, + { + "epoch": 0.3960735431598629, + "grad_norm": 0.38341912627220154, + "learning_rate": 6.603623905700771e-05, + "loss": 1.5804, + "step": 3813 + }, + { + "epoch": 0.39617741767944326, + "grad_norm": 0.41718804836273193, + "learning_rate": 6.602078359682865e-05, + "loss": 1.7562, + "step": 3814 + }, + { + "epoch": 0.3962812921990236, + "grad_norm": 0.4072135388851166, + "learning_rate": 6.600532643056112e-05, + "loss": 1.6313, + "step": 3815 + }, + { + "epoch": 0.3963851667186039, + "grad_norm": 0.4200040400028229, + "learning_rate": 6.598986755985116e-05, + "loss": 1.7867, + "step": 3816 + }, + { + "epoch": 0.3964890412381843, + "grad_norm": 0.38370397686958313, + "learning_rate": 6.597440698634503e-05, + "loss": 1.5696, + "step": 3817 + }, + { + "epoch": 0.3965929157577646, + "grad_norm": 0.4030844271183014, + "learning_rate": 6.595894471168917e-05, + "loss": 1.688, + "step": 3818 + }, + { + "epoch": 0.39669679027734495, + "grad_norm": 0.4080040454864502, + "learning_rate": 6.594348073753019e-05, + "loss": 1.7786, + "step": 3819 + }, + { + "epoch": 0.39680066479692533, + "grad_norm": 0.36491355299949646, + "learning_rate": 6.592801506551486e-05, + "loss": 1.5053, + "step": 3820 + }, + { + "epoch": 0.39690453931650566, + "grad_norm": 0.4053652882575989, + "learning_rate": 6.591254769729018e-05, + "loss": 1.7091, + "step": 3821 + }, + { + "epoch": 0.397008413836086, + "grad_norm": 0.40529873967170715, + "learning_rate": 6.589707863450327e-05, + "loss": 1.6866, + "step": 3822 + }, + { + "epoch": 0.39711228835566637, + "grad_norm": 0.3853190243244171, + "learning_rate": 6.588160787880151e-05, + "loss": 1.703, + "step": 3823 + }, + { + "epoch": 0.3972161628752467, + "grad_norm": 0.39480873942375183, + "learning_rate": 6.586613543183237e-05, + "loss": 1.6685, + "step": 3824 + }, + { + "epoch": 0.3973200373948271, + "grad_norm": 0.38072872161865234, + "learning_rate": 6.58506612952436e-05, + "loss": 1.6512, + "step": 3825 + }, + { + "epoch": 0.3974239119144074, + "grad_norm": 0.42218974232673645, + "learning_rate": 6.583518547068302e-05, + "loss": 1.7624, + "step": 3826 + }, + { + "epoch": 0.39752778643398773, + "grad_norm": 0.34545183181762695, + "learning_rate": 6.581970795979871e-05, + "loss": 1.4234, + "step": 3827 + }, + { + "epoch": 0.3976316609535681, + "grad_norm": 0.39277178049087524, + "learning_rate": 6.580422876423891e-05, + "loss": 1.7441, + "step": 3828 + }, + { + "epoch": 0.39773553547314844, + "grad_norm": 0.39970070123672485, + "learning_rate": 6.578874788565201e-05, + "loss": 1.6835, + "step": 3829 + }, + { + "epoch": 0.39783940999272877, + "grad_norm": 0.38639700412750244, + "learning_rate": 6.577326532568663e-05, + "loss": 1.5985, + "step": 3830 + }, + { + "epoch": 0.39794328451230915, + "grad_norm": 0.3735407292842865, + "learning_rate": 6.575778108599151e-05, + "loss": 1.734, + "step": 3831 + }, + { + "epoch": 0.3980471590318895, + "grad_norm": 0.38914212584495544, + "learning_rate": 6.574229516821564e-05, + "loss": 1.6279, + "step": 3832 + }, + { + "epoch": 0.3981510335514698, + "grad_norm": 0.38888975977897644, + "learning_rate": 6.572680757400813e-05, + "loss": 1.7271, + "step": 3833 + }, + { + "epoch": 0.3982549080710502, + "grad_norm": 0.4465731382369995, + "learning_rate": 6.571131830501827e-05, + "loss": 1.8058, + "step": 3834 + }, + { + "epoch": 0.3983587825906305, + "grad_norm": 0.3945710062980652, + "learning_rate": 6.569582736289556e-05, + "loss": 1.6509, + "step": 3835 + }, + { + "epoch": 0.39846265711021084, + "grad_norm": 0.39435771107673645, + "learning_rate": 6.56803347492897e-05, + "loss": 1.5513, + "step": 3836 + }, + { + "epoch": 0.3985665316297912, + "grad_norm": 0.3789151906967163, + "learning_rate": 6.566484046585047e-05, + "loss": 1.5777, + "step": 3837 + }, + { + "epoch": 0.39867040614937155, + "grad_norm": 0.39071643352508545, + "learning_rate": 6.564934451422793e-05, + "loss": 1.8612, + "step": 3838 + }, + { + "epoch": 0.39877428066895193, + "grad_norm": 0.3789994716644287, + "learning_rate": 6.563384689607227e-05, + "loss": 1.7228, + "step": 3839 + }, + { + "epoch": 0.39887815518853226, + "grad_norm": 0.3888130486011505, + "learning_rate": 6.561834761303386e-05, + "loss": 1.7733, + "step": 3840 + }, + { + "epoch": 0.3989820297081126, + "grad_norm": 0.41186729073524475, + "learning_rate": 6.560284666676327e-05, + "loss": 1.8526, + "step": 3841 + }, + { + "epoch": 0.39908590422769297, + "grad_norm": 0.3710212707519531, + "learning_rate": 6.558734405891121e-05, + "loss": 1.4567, + "step": 3842 + }, + { + "epoch": 0.3991897787472733, + "grad_norm": 0.4276915490627289, + "learning_rate": 6.557183979112859e-05, + "loss": 1.8229, + "step": 3843 + }, + { + "epoch": 0.3992936532668536, + "grad_norm": 0.4046017527580261, + "learning_rate": 6.55563338650665e-05, + "loss": 1.7953, + "step": 3844 + }, + { + "epoch": 0.399397527786434, + "grad_norm": 0.36397168040275574, + "learning_rate": 6.554082628237619e-05, + "loss": 1.4815, + "step": 3845 + }, + { + "epoch": 0.39950140230601433, + "grad_norm": 0.49856171011924744, + "learning_rate": 6.552531704470911e-05, + "loss": 1.9411, + "step": 3846 + }, + { + "epoch": 0.39960527682559466, + "grad_norm": 0.36472898721694946, + "learning_rate": 6.550980615371687e-05, + "loss": 1.6889, + "step": 3847 + }, + { + "epoch": 0.39970915134517504, + "grad_norm": 0.37358102202415466, + "learning_rate": 6.549429361105126e-05, + "loss": 1.6639, + "step": 3848 + }, + { + "epoch": 0.39981302586475537, + "grad_norm": 0.3778083920478821, + "learning_rate": 6.547877941836424e-05, + "loss": 1.7107, + "step": 3849 + }, + { + "epoch": 0.39991690038433575, + "grad_norm": 0.424234002828598, + "learning_rate": 6.546326357730794e-05, + "loss": 1.7377, + "step": 3850 + }, + { + "epoch": 0.4000207749039161, + "grad_norm": 0.4264697730541229, + "learning_rate": 6.544774608953468e-05, + "loss": 1.8146, + "step": 3851 + }, + { + "epoch": 0.4001246494234964, + "grad_norm": 0.4023953676223755, + "learning_rate": 6.543222695669697e-05, + "loss": 1.6474, + "step": 3852 + }, + { + "epoch": 0.4002285239430768, + "grad_norm": 0.41954219341278076, + "learning_rate": 6.541670618044748e-05, + "loss": 1.8214, + "step": 3853 + }, + { + "epoch": 0.4003323984626571, + "grad_norm": 0.41983938217163086, + "learning_rate": 6.540118376243903e-05, + "loss": 1.7134, + "step": 3854 + }, + { + "epoch": 0.40043627298223744, + "grad_norm": 0.4455628991127014, + "learning_rate": 6.538565970432463e-05, + "loss": 1.974, + "step": 3855 + }, + { + "epoch": 0.4005401475018178, + "grad_norm": 0.41283226013183594, + "learning_rate": 6.537013400775748e-05, + "loss": 1.5895, + "step": 3856 + }, + { + "epoch": 0.40064402202139815, + "grad_norm": 0.4059913158416748, + "learning_rate": 6.535460667439097e-05, + "loss": 1.6404, + "step": 3857 + }, + { + "epoch": 0.4007478965409785, + "grad_norm": 0.4247495234012604, + "learning_rate": 6.53390777058786e-05, + "loss": 1.8849, + "step": 3858 + }, + { + "epoch": 0.40085177106055886, + "grad_norm": 0.4334213137626648, + "learning_rate": 6.532354710387411e-05, + "loss": 1.7472, + "step": 3859 + }, + { + "epoch": 0.4009556455801392, + "grad_norm": 0.3892272114753723, + "learning_rate": 6.530801487003139e-05, + "loss": 1.601, + "step": 3860 + }, + { + "epoch": 0.4010595200997195, + "grad_norm": 0.3871516287326813, + "learning_rate": 6.529248100600449e-05, + "loss": 1.7275, + "step": 3861 + }, + { + "epoch": 0.4011633946192999, + "grad_norm": 0.3974870443344116, + "learning_rate": 6.527694551344765e-05, + "loss": 1.6634, + "step": 3862 + }, + { + "epoch": 0.4012672691388802, + "grad_norm": 0.3988457918167114, + "learning_rate": 6.526140839401527e-05, + "loss": 1.6647, + "step": 3863 + }, + { + "epoch": 0.4013711436584606, + "grad_norm": 0.42209574580192566, + "learning_rate": 6.524586964936194e-05, + "loss": 1.8112, + "step": 3864 + }, + { + "epoch": 0.40147501817804093, + "grad_norm": 0.4116060733795166, + "learning_rate": 6.523032928114242e-05, + "loss": 1.7179, + "step": 3865 + }, + { + "epoch": 0.40157889269762126, + "grad_norm": 0.341496080160141, + "learning_rate": 6.521478729101164e-05, + "loss": 1.4234, + "step": 3866 + }, + { + "epoch": 0.40168276721720164, + "grad_norm": 0.3907933831214905, + "learning_rate": 6.519924368062468e-05, + "loss": 1.7079, + "step": 3867 + }, + { + "epoch": 0.40178664173678197, + "grad_norm": 0.4084981679916382, + "learning_rate": 6.518369845163683e-05, + "loss": 1.6538, + "step": 3868 + }, + { + "epoch": 0.4018905162563623, + "grad_norm": 0.37673652172088623, + "learning_rate": 6.516815160570355e-05, + "loss": 1.7034, + "step": 3869 + }, + { + "epoch": 0.4019943907759427, + "grad_norm": 0.36309683322906494, + "learning_rate": 6.515260314448041e-05, + "loss": 1.5712, + "step": 3870 + }, + { + "epoch": 0.402098265295523, + "grad_norm": 0.40556076169013977, + "learning_rate": 6.513705306962325e-05, + "loss": 1.6909, + "step": 3871 + }, + { + "epoch": 0.40220213981510333, + "grad_norm": 0.39010584354400635, + "learning_rate": 6.5121501382788e-05, + "loss": 1.5812, + "step": 3872 + }, + { + "epoch": 0.4023060143346837, + "grad_norm": 0.4054809510707855, + "learning_rate": 6.51059480856308e-05, + "loss": 1.6611, + "step": 3873 + }, + { + "epoch": 0.40240988885426404, + "grad_norm": 0.3722795844078064, + "learning_rate": 6.509039317980798e-05, + "loss": 1.5964, + "step": 3874 + }, + { + "epoch": 0.40251376337384437, + "grad_norm": 0.41028568148612976, + "learning_rate": 6.5074836666976e-05, + "loss": 1.6734, + "step": 3875 + }, + { + "epoch": 0.40261763789342475, + "grad_norm": 0.39521047472953796, + "learning_rate": 6.505927854879148e-05, + "loss": 1.5993, + "step": 3876 + }, + { + "epoch": 0.4027215124130051, + "grad_norm": 0.3818994462490082, + "learning_rate": 6.504371882691126e-05, + "loss": 1.6791, + "step": 3877 + }, + { + "epoch": 0.40282538693258546, + "grad_norm": 0.4162769615650177, + "learning_rate": 6.502815750299235e-05, + "loss": 1.7729, + "step": 3878 + }, + { + "epoch": 0.4029292614521658, + "grad_norm": 0.38468343019485474, + "learning_rate": 6.501259457869187e-05, + "loss": 1.7063, + "step": 3879 + }, + { + "epoch": 0.4030331359717461, + "grad_norm": 0.3765242397785187, + "learning_rate": 6.49970300556672e-05, + "loss": 1.546, + "step": 3880 + }, + { + "epoch": 0.4031370104913265, + "grad_norm": 0.3874824345111847, + "learning_rate": 6.498146393557578e-05, + "loss": 1.7616, + "step": 3881 + }, + { + "epoch": 0.4032408850109068, + "grad_norm": 0.39522698521614075, + "learning_rate": 6.496589622007532e-05, + "loss": 1.7472, + "step": 3882 + }, + { + "epoch": 0.40334475953048715, + "grad_norm": 0.3761104941368103, + "learning_rate": 6.495032691082367e-05, + "loss": 1.5378, + "step": 3883 + }, + { + "epoch": 0.40344863405006753, + "grad_norm": 0.4124884605407715, + "learning_rate": 6.493475600947879e-05, + "loss": 1.6228, + "step": 3884 + }, + { + "epoch": 0.40355250856964786, + "grad_norm": 0.3794013559818268, + "learning_rate": 6.491918351769891e-05, + "loss": 1.8077, + "step": 3885 + }, + { + "epoch": 0.4036563830892282, + "grad_norm": 0.43968185782432556, + "learning_rate": 6.490360943714235e-05, + "loss": 1.8393, + "step": 3886 + }, + { + "epoch": 0.40376025760880857, + "grad_norm": 0.4507423937320709, + "learning_rate": 6.488803376946764e-05, + "loss": 1.9139, + "step": 3887 + }, + { + "epoch": 0.4038641321283889, + "grad_norm": 0.39006999135017395, + "learning_rate": 6.48724565163335e-05, + "loss": 1.7191, + "step": 3888 + }, + { + "epoch": 0.4039680066479693, + "grad_norm": 0.3846725523471832, + "learning_rate": 6.48568776793987e-05, + "loss": 1.6855, + "step": 3889 + }, + { + "epoch": 0.4040718811675496, + "grad_norm": 0.4519991874694824, + "learning_rate": 6.484129726032233e-05, + "loss": 1.7529, + "step": 3890 + }, + { + "epoch": 0.40417575568712993, + "grad_norm": 0.36061856150627136, + "learning_rate": 6.482571526076357e-05, + "loss": 1.5992, + "step": 3891 + }, + { + "epoch": 0.4042796302067103, + "grad_norm": 0.4295485317707062, + "learning_rate": 6.481013168238178e-05, + "loss": 1.7882, + "step": 3892 + }, + { + "epoch": 0.40438350472629064, + "grad_norm": 0.4181903600692749, + "learning_rate": 6.479454652683649e-05, + "loss": 1.9548, + "step": 3893 + }, + { + "epoch": 0.40448737924587097, + "grad_norm": 0.37351468205451965, + "learning_rate": 6.47789597957874e-05, + "loss": 1.5784, + "step": 3894 + }, + { + "epoch": 0.40459125376545135, + "grad_norm": 0.505387544631958, + "learning_rate": 6.476337149089438e-05, + "loss": 1.9224, + "step": 3895 + }, + { + "epoch": 0.4046951282850317, + "grad_norm": 0.3897416293621063, + "learning_rate": 6.474778161381743e-05, + "loss": 1.4597, + "step": 3896 + }, + { + "epoch": 0.404799002804612, + "grad_norm": 0.44174665212631226, + "learning_rate": 6.47321901662168e-05, + "loss": 1.9036, + "step": 3897 + }, + { + "epoch": 0.4049028773241924, + "grad_norm": 0.39586788415908813, + "learning_rate": 6.471659714975282e-05, + "loss": 1.802, + "step": 3898 + }, + { + "epoch": 0.4050067518437727, + "grad_norm": 0.40716326236724854, + "learning_rate": 6.470100256608603e-05, + "loss": 1.7556, + "step": 3899 + }, + { + "epoch": 0.40511062636335304, + "grad_norm": 0.3839757740497589, + "learning_rate": 6.468540641687716e-05, + "loss": 1.6419, + "step": 3900 + }, + { + "epoch": 0.4052145008829334, + "grad_norm": 0.3808140754699707, + "learning_rate": 6.466980870378704e-05, + "loss": 1.7802, + "step": 3901 + }, + { + "epoch": 0.40531837540251375, + "grad_norm": 0.4827220141887665, + "learning_rate": 6.465420942847673e-05, + "loss": 1.9368, + "step": 3902 + }, + { + "epoch": 0.40542224992209414, + "grad_norm": 0.42109206318855286, + "learning_rate": 6.463860859260742e-05, + "loss": 1.8801, + "step": 3903 + }, + { + "epoch": 0.40552612444167446, + "grad_norm": 0.41569140553474426, + "learning_rate": 6.462300619784048e-05, + "loss": 1.8356, + "step": 3904 + }, + { + "epoch": 0.4056299989612548, + "grad_norm": 0.3784182071685791, + "learning_rate": 6.460740224583746e-05, + "loss": 1.5941, + "step": 3905 + }, + { + "epoch": 0.40573387348083517, + "grad_norm": 0.38103383779525757, + "learning_rate": 6.459179673826001e-05, + "loss": 1.8269, + "step": 3906 + }, + { + "epoch": 0.4058377480004155, + "grad_norm": 0.38035640120506287, + "learning_rate": 6.457618967677004e-05, + "loss": 1.7104, + "step": 3907 + }, + { + "epoch": 0.4059416225199958, + "grad_norm": 0.4427489638328552, + "learning_rate": 6.456058106302958e-05, + "loss": 2.0598, + "step": 3908 + }, + { + "epoch": 0.4060454970395762, + "grad_norm": 0.38776329159736633, + "learning_rate": 6.454497089870082e-05, + "loss": 1.8005, + "step": 3909 + }, + { + "epoch": 0.40614937155915654, + "grad_norm": 0.39295095205307007, + "learning_rate": 6.45293591854461e-05, + "loss": 1.5732, + "step": 3910 + }, + { + "epoch": 0.40625324607873686, + "grad_norm": 0.4256858229637146, + "learning_rate": 6.451374592492796e-05, + "loss": 1.8357, + "step": 3911 + }, + { + "epoch": 0.40635712059831725, + "grad_norm": 0.45658764243125916, + "learning_rate": 6.449813111880909e-05, + "loss": 1.851, + "step": 3912 + }, + { + "epoch": 0.40646099511789757, + "grad_norm": 0.40819069743156433, + "learning_rate": 6.448251476875235e-05, + "loss": 1.4937, + "step": 3913 + }, + { + "epoch": 0.4065648696374779, + "grad_norm": 0.35700273513793945, + "learning_rate": 6.446689687642076e-05, + "loss": 1.6035, + "step": 3914 + }, + { + "epoch": 0.4066687441570583, + "grad_norm": 0.3629886209964752, + "learning_rate": 6.445127744347748e-05, + "loss": 1.5591, + "step": 3915 + }, + { + "epoch": 0.4067726186766386, + "grad_norm": 0.38591131567955017, + "learning_rate": 6.443565647158589e-05, + "loss": 1.5891, + "step": 3916 + }, + { + "epoch": 0.406876493196219, + "grad_norm": 0.41065141558647156, + "learning_rate": 6.442003396240949e-05, + "loss": 1.8029, + "step": 3917 + }, + { + "epoch": 0.4069803677157993, + "grad_norm": 0.4433719515800476, + "learning_rate": 6.440440991761195e-05, + "loss": 1.8338, + "step": 3918 + }, + { + "epoch": 0.40708424223537965, + "grad_norm": 0.40136539936065674, + "learning_rate": 6.438878433885711e-05, + "loss": 1.5756, + "step": 3919 + }, + { + "epoch": 0.40718811675496003, + "grad_norm": 0.39997127652168274, + "learning_rate": 6.437315722780898e-05, + "loss": 1.6578, + "step": 3920 + }, + { + "epoch": 0.40729199127454035, + "grad_norm": 0.41028451919555664, + "learning_rate": 6.435752858613171e-05, + "loss": 1.7394, + "step": 3921 + }, + { + "epoch": 0.4073958657941207, + "grad_norm": 0.3836668133735657, + "learning_rate": 6.434189841548965e-05, + "loss": 1.7368, + "step": 3922 + }, + { + "epoch": 0.40749974031370106, + "grad_norm": 0.38762718439102173, + "learning_rate": 6.432626671754729e-05, + "loss": 1.6122, + "step": 3923 + }, + { + "epoch": 0.4076036148332814, + "grad_norm": 0.37233954668045044, + "learning_rate": 6.431063349396926e-05, + "loss": 1.5676, + "step": 3924 + }, + { + "epoch": 0.4077074893528617, + "grad_norm": 0.3751562237739563, + "learning_rate": 6.429499874642038e-05, + "loss": 1.5929, + "step": 3925 + }, + { + "epoch": 0.4078113638724421, + "grad_norm": 0.41311609745025635, + "learning_rate": 6.427936247656565e-05, + "loss": 1.4869, + "step": 3926 + }, + { + "epoch": 0.40791523839202243, + "grad_norm": 0.37297797203063965, + "learning_rate": 6.42637246860702e-05, + "loss": 1.6425, + "step": 3927 + }, + { + "epoch": 0.4080191129116028, + "grad_norm": 0.4266083240509033, + "learning_rate": 6.424808537659932e-05, + "loss": 1.7858, + "step": 3928 + }, + { + "epoch": 0.40812298743118314, + "grad_norm": 0.46376118063926697, + "learning_rate": 6.423244454981851e-05, + "loss": 1.7072, + "step": 3929 + }, + { + "epoch": 0.40822686195076346, + "grad_norm": 0.4833356440067291, + "learning_rate": 6.421680220739336e-05, + "loss": 1.792, + "step": 3930 + }, + { + "epoch": 0.40833073647034385, + "grad_norm": 0.42818766832351685, + "learning_rate": 6.420115835098967e-05, + "loss": 1.9283, + "step": 3931 + }, + { + "epoch": 0.4084346109899242, + "grad_norm": 0.41609877347946167, + "learning_rate": 6.418551298227338e-05, + "loss": 1.6607, + "step": 3932 + }, + { + "epoch": 0.4085384855095045, + "grad_norm": 0.3854401111602783, + "learning_rate": 6.416986610291064e-05, + "loss": 1.7383, + "step": 3933 + }, + { + "epoch": 0.4086423600290849, + "grad_norm": 0.4421563744544983, + "learning_rate": 6.415421771456766e-05, + "loss": 1.5812, + "step": 3934 + }, + { + "epoch": 0.4087462345486652, + "grad_norm": 0.40960294008255005, + "learning_rate": 6.41385678189109e-05, + "loss": 1.7514, + "step": 3935 + }, + { + "epoch": 0.40885010906824554, + "grad_norm": 0.37743479013442993, + "learning_rate": 6.412291641760697e-05, + "loss": 1.6141, + "step": 3936 + }, + { + "epoch": 0.4089539835878259, + "grad_norm": 0.3768305480480194, + "learning_rate": 6.410726351232259e-05, + "loss": 1.6868, + "step": 3937 + }, + { + "epoch": 0.40905785810740625, + "grad_norm": 0.40039536356925964, + "learning_rate": 6.409160910472469e-05, + "loss": 1.7051, + "step": 3938 + }, + { + "epoch": 0.4091617326269866, + "grad_norm": 0.4146733283996582, + "learning_rate": 6.407595319648034e-05, + "loss": 1.7432, + "step": 3939 + }, + { + "epoch": 0.40926560714656696, + "grad_norm": 0.3897091746330261, + "learning_rate": 6.406029578925677e-05, + "loss": 1.6319, + "step": 3940 + }, + { + "epoch": 0.4093694816661473, + "grad_norm": 0.3977634608745575, + "learning_rate": 6.404463688472138e-05, + "loss": 1.6741, + "step": 3941 + }, + { + "epoch": 0.40947335618572767, + "grad_norm": 0.3753795027732849, + "learning_rate": 6.402897648454171e-05, + "loss": 1.6135, + "step": 3942 + }, + { + "epoch": 0.409577230705308, + "grad_norm": 0.40746620297431946, + "learning_rate": 6.401331459038546e-05, + "loss": 1.6742, + "step": 3943 + }, + { + "epoch": 0.4096811052248883, + "grad_norm": 0.4224250614643097, + "learning_rate": 6.399765120392054e-05, + "loss": 1.7462, + "step": 3944 + }, + { + "epoch": 0.4097849797444687, + "grad_norm": 0.4034820795059204, + "learning_rate": 6.398198632681494e-05, + "loss": 1.6549, + "step": 3945 + }, + { + "epoch": 0.40988885426404903, + "grad_norm": 0.40801072120666504, + "learning_rate": 6.396631996073686e-05, + "loss": 1.5801, + "step": 3946 + }, + { + "epoch": 0.40999272878362936, + "grad_norm": 0.4579329192638397, + "learning_rate": 6.395065210735464e-05, + "loss": 1.6841, + "step": 3947 + }, + { + "epoch": 0.41009660330320974, + "grad_norm": 0.4300961494445801, + "learning_rate": 6.39349827683368e-05, + "loss": 1.8659, + "step": 3948 + }, + { + "epoch": 0.41020047782279007, + "grad_norm": 0.4603518843650818, + "learning_rate": 6.391931194535199e-05, + "loss": 1.82, + "step": 3949 + }, + { + "epoch": 0.4103043523423704, + "grad_norm": 0.36023688316345215, + "learning_rate": 6.390363964006903e-05, + "loss": 1.5473, + "step": 3950 + }, + { + "epoch": 0.4104082268619508, + "grad_norm": 0.36991775035858154, + "learning_rate": 6.38879658541569e-05, + "loss": 1.554, + "step": 3951 + }, + { + "epoch": 0.4105121013815311, + "grad_norm": 0.3790716528892517, + "learning_rate": 6.387229058928475e-05, + "loss": 1.6563, + "step": 3952 + }, + { + "epoch": 0.41061597590111143, + "grad_norm": 0.38919469714164734, + "learning_rate": 6.385661384712185e-05, + "loss": 1.6243, + "step": 3953 + }, + { + "epoch": 0.4107198504206918, + "grad_norm": 0.3934738337993622, + "learning_rate": 6.384093562933765e-05, + "loss": 1.6521, + "step": 3954 + }, + { + "epoch": 0.41082372494027214, + "grad_norm": 0.39593058824539185, + "learning_rate": 6.382525593760178e-05, + "loss": 1.7887, + "step": 3955 + }, + { + "epoch": 0.4109275994598525, + "grad_norm": 0.47822320461273193, + "learning_rate": 6.380957477358399e-05, + "loss": 1.8647, + "step": 3956 + }, + { + "epoch": 0.41103147397943285, + "grad_norm": 0.44262126088142395, + "learning_rate": 6.379389213895421e-05, + "loss": 1.8746, + "step": 3957 + }, + { + "epoch": 0.4111353484990132, + "grad_norm": 0.37435340881347656, + "learning_rate": 6.377820803538253e-05, + "loss": 1.6782, + "step": 3958 + }, + { + "epoch": 0.41123922301859356, + "grad_norm": 0.41944101452827454, + "learning_rate": 6.376252246453914e-05, + "loss": 1.7188, + "step": 3959 + }, + { + "epoch": 0.4113430975381739, + "grad_norm": 0.43215543031692505, + "learning_rate": 6.374683542809447e-05, + "loss": 1.8221, + "step": 3960 + }, + { + "epoch": 0.4114469720577542, + "grad_norm": 0.40915408730506897, + "learning_rate": 6.373114692771906e-05, + "loss": 1.7294, + "step": 3961 + }, + { + "epoch": 0.4115508465773346, + "grad_norm": 0.41122907400131226, + "learning_rate": 6.371545696508358e-05, + "loss": 1.708, + "step": 3962 + }, + { + "epoch": 0.4116547210969149, + "grad_norm": 0.3763563930988312, + "learning_rate": 6.369976554185896e-05, + "loss": 1.8057, + "step": 3963 + }, + { + "epoch": 0.41175859561649525, + "grad_norm": 0.4643213748931885, + "learning_rate": 6.368407265971616e-05, + "loss": 1.6913, + "step": 3964 + }, + { + "epoch": 0.41186247013607563, + "grad_norm": 0.3934086263179779, + "learning_rate": 6.366837832032635e-05, + "loss": 1.8158, + "step": 3965 + }, + { + "epoch": 0.41196634465565596, + "grad_norm": 0.35763460397720337, + "learning_rate": 6.365268252536087e-05, + "loss": 1.5065, + "step": 3966 + }, + { + "epoch": 0.41207021917523634, + "grad_norm": 0.381234347820282, + "learning_rate": 6.36369852764912e-05, + "loss": 1.672, + "step": 3967 + }, + { + "epoch": 0.41217409369481667, + "grad_norm": 0.4128655195236206, + "learning_rate": 6.362128657538897e-05, + "loss": 1.8548, + "step": 3968 + }, + { + "epoch": 0.412277968214397, + "grad_norm": 0.42314839363098145, + "learning_rate": 6.360558642372597e-05, + "loss": 1.8469, + "step": 3969 + }, + { + "epoch": 0.4123818427339774, + "grad_norm": 0.3665439188480377, + "learning_rate": 6.358988482317414e-05, + "loss": 1.6893, + "step": 3970 + }, + { + "epoch": 0.4124857172535577, + "grad_norm": 0.3973866105079651, + "learning_rate": 6.357418177540558e-05, + "loss": 1.5002, + "step": 3971 + }, + { + "epoch": 0.41258959177313803, + "grad_norm": 0.40862172842025757, + "learning_rate": 6.355847728209257e-05, + "loss": 1.817, + "step": 3972 + }, + { + "epoch": 0.4126934662927184, + "grad_norm": 0.439984530210495, + "learning_rate": 6.354277134490746e-05, + "loss": 1.7698, + "step": 3973 + }, + { + "epoch": 0.41279734081229874, + "grad_norm": 0.3773249685764313, + "learning_rate": 6.352706396552285e-05, + "loss": 1.4967, + "step": 3974 + }, + { + "epoch": 0.41290121533187907, + "grad_norm": 0.4068536162376404, + "learning_rate": 6.351135514561144e-05, + "loss": 1.6282, + "step": 3975 + }, + { + "epoch": 0.41300508985145945, + "grad_norm": 0.3690778613090515, + "learning_rate": 6.34956448868461e-05, + "loss": 1.637, + "step": 3976 + }, + { + "epoch": 0.4131089643710398, + "grad_norm": 0.4202233552932739, + "learning_rate": 6.347993319089985e-05, + "loss": 1.9794, + "step": 3977 + }, + { + "epoch": 0.4132128388906201, + "grad_norm": 0.44085201621055603, + "learning_rate": 6.346422005944586e-05, + "loss": 1.8328, + "step": 3978 + }, + { + "epoch": 0.4133167134102005, + "grad_norm": 0.39549145102500916, + "learning_rate": 6.344850549415746e-05, + "loss": 1.7727, + "step": 3979 + }, + { + "epoch": 0.4134205879297808, + "grad_norm": 0.39016398787498474, + "learning_rate": 6.343278949670812e-05, + "loss": 1.6293, + "step": 3980 + }, + { + "epoch": 0.4135244624493612, + "grad_norm": 0.40789923071861267, + "learning_rate": 6.341707206877149e-05, + "loss": 1.8089, + "step": 3981 + }, + { + "epoch": 0.4136283369689415, + "grad_norm": 0.41951802372932434, + "learning_rate": 6.340135321202134e-05, + "loss": 1.793, + "step": 3982 + }, + { + "epoch": 0.41373221148852185, + "grad_norm": 0.41457945108413696, + "learning_rate": 6.33856329281316e-05, + "loss": 1.8366, + "step": 3983 + }, + { + "epoch": 0.41383608600810223, + "grad_norm": 0.4352897107601166, + "learning_rate": 6.336991121877637e-05, + "loss": 1.7176, + "step": 3984 + }, + { + "epoch": 0.41393996052768256, + "grad_norm": 0.414348840713501, + "learning_rate": 6.335418808562988e-05, + "loss": 1.8262, + "step": 3985 + }, + { + "epoch": 0.4140438350472629, + "grad_norm": 0.3896345794200897, + "learning_rate": 6.333846353036654e-05, + "loss": 1.8285, + "step": 3986 + }, + { + "epoch": 0.41414770956684327, + "grad_norm": 0.4062201678752899, + "learning_rate": 6.332273755466087e-05, + "loss": 1.709, + "step": 3987 + }, + { + "epoch": 0.4142515840864236, + "grad_norm": 0.4064597189426422, + "learning_rate": 6.330701016018757e-05, + "loss": 1.7283, + "step": 3988 + }, + { + "epoch": 0.4143554586060039, + "grad_norm": 0.42126670479774475, + "learning_rate": 6.329128134862148e-05, + "loss": 1.7227, + "step": 3989 + }, + { + "epoch": 0.4144593331255843, + "grad_norm": 0.38570138812065125, + "learning_rate": 6.32755511216376e-05, + "loss": 1.7244, + "step": 3990 + }, + { + "epoch": 0.41456320764516463, + "grad_norm": 0.4073387682437897, + "learning_rate": 6.32598194809111e-05, + "loss": 1.6718, + "step": 3991 + }, + { + "epoch": 0.414667082164745, + "grad_norm": 0.39126601815223694, + "learning_rate": 6.324408642811726e-05, + "loss": 1.584, + "step": 3992 + }, + { + "epoch": 0.41477095668432534, + "grad_norm": 0.38054293394088745, + "learning_rate": 6.32283519649315e-05, + "loss": 1.5411, + "step": 3993 + }, + { + "epoch": 0.41487483120390567, + "grad_norm": 0.42494648694992065, + "learning_rate": 6.321261609302945e-05, + "loss": 1.8188, + "step": 3994 + }, + { + "epoch": 0.41497870572348605, + "grad_norm": 0.3690442442893982, + "learning_rate": 6.319687881408683e-05, + "loss": 1.6147, + "step": 3995 + }, + { + "epoch": 0.4150825802430664, + "grad_norm": 0.41196659207344055, + "learning_rate": 6.318114012977958e-05, + "loss": 1.917, + "step": 3996 + }, + { + "epoch": 0.4151864547626467, + "grad_norm": 0.38468247652053833, + "learning_rate": 6.316540004178371e-05, + "loss": 1.7175, + "step": 3997 + }, + { + "epoch": 0.4152903292822271, + "grad_norm": 0.36904019117355347, + "learning_rate": 6.314965855177544e-05, + "loss": 1.6544, + "step": 3998 + }, + { + "epoch": 0.4153942038018074, + "grad_norm": 0.3496026396751404, + "learning_rate": 6.31339156614311e-05, + "loss": 1.4105, + "step": 3999 + }, + { + "epoch": 0.41549807832138774, + "grad_norm": 0.3861815333366394, + "learning_rate": 6.311817137242721e-05, + "loss": 1.7974, + "step": 4000 + }, + { + "epoch": 0.4156019528409681, + "grad_norm": 0.37794098258018494, + "learning_rate": 6.310242568644035e-05, + "loss": 1.5556, + "step": 4001 + }, + { + "epoch": 0.41570582736054845, + "grad_norm": 0.42547518014907837, + "learning_rate": 6.30866786051474e-05, + "loss": 1.6694, + "step": 4002 + }, + { + "epoch": 0.4158097018801288, + "grad_norm": 0.3894122540950775, + "learning_rate": 6.307093013022525e-05, + "loss": 1.5479, + "step": 4003 + }, + { + "epoch": 0.41591357639970916, + "grad_norm": 0.36968937516212463, + "learning_rate": 6.305518026335097e-05, + "loss": 1.6548, + "step": 4004 + }, + { + "epoch": 0.4160174509192895, + "grad_norm": 0.40135595202445984, + "learning_rate": 6.303942900620185e-05, + "loss": 1.5989, + "step": 4005 + }, + { + "epoch": 0.41612132543886987, + "grad_norm": 0.39152655005455017, + "learning_rate": 6.302367636045526e-05, + "loss": 1.6618, + "step": 4006 + }, + { + "epoch": 0.4162251999584502, + "grad_norm": 0.382112592458725, + "learning_rate": 6.300792232778872e-05, + "loss": 1.5055, + "step": 4007 + }, + { + "epoch": 0.4163290744780305, + "grad_norm": 0.4156250059604645, + "learning_rate": 6.29921669098799e-05, + "loss": 1.9369, + "step": 4008 + }, + { + "epoch": 0.4164329489976109, + "grad_norm": 0.3940093219280243, + "learning_rate": 6.297641010840665e-05, + "loss": 1.6275, + "step": 4009 + }, + { + "epoch": 0.41653682351719123, + "grad_norm": 0.4581923186779022, + "learning_rate": 6.296065192504697e-05, + "loss": 1.8641, + "step": 4010 + }, + { + "epoch": 0.41664069803677156, + "grad_norm": 0.3822265863418579, + "learning_rate": 6.294489236147894e-05, + "loss": 1.7272, + "step": 4011 + }, + { + "epoch": 0.41674457255635194, + "grad_norm": 0.3892137110233307, + "learning_rate": 6.292913141938084e-05, + "loss": 1.683, + "step": 4012 + }, + { + "epoch": 0.41684844707593227, + "grad_norm": 0.3944839835166931, + "learning_rate": 6.29133691004311e-05, + "loss": 1.6662, + "step": 4013 + }, + { + "epoch": 0.4169523215955126, + "grad_norm": 0.39415454864501953, + "learning_rate": 6.289760540630829e-05, + "loss": 1.6862, + "step": 4014 + }, + { + "epoch": 0.417056196115093, + "grad_norm": 0.39324238896369934, + "learning_rate": 6.288184033869109e-05, + "loss": 1.7041, + "step": 4015 + }, + { + "epoch": 0.4171600706346733, + "grad_norm": 0.4160117506980896, + "learning_rate": 6.28660738992584e-05, + "loss": 1.7689, + "step": 4016 + }, + { + "epoch": 0.41726394515425363, + "grad_norm": 0.4204423725605011, + "learning_rate": 6.285030608968918e-05, + "loss": 1.7707, + "step": 4017 + }, + { + "epoch": 0.417367819673834, + "grad_norm": 0.3935635983943939, + "learning_rate": 6.28345369116626e-05, + "loss": 1.5884, + "step": 4018 + }, + { + "epoch": 0.41747169419341434, + "grad_norm": 0.41393959522247314, + "learning_rate": 6.281876636685795e-05, + "loss": 1.6574, + "step": 4019 + }, + { + "epoch": 0.4175755687129947, + "grad_norm": 0.4115196764469147, + "learning_rate": 6.280299445695469e-05, + "loss": 1.8304, + "step": 4020 + }, + { + "epoch": 0.41767944323257505, + "grad_norm": 0.3949635922908783, + "learning_rate": 6.278722118363237e-05, + "loss": 1.5268, + "step": 4021 + }, + { + "epoch": 0.4177833177521554, + "grad_norm": 0.41173237562179565, + "learning_rate": 6.277144654857074e-05, + "loss": 1.8324, + "step": 4022 + }, + { + "epoch": 0.41788719227173576, + "grad_norm": 0.3953053951263428, + "learning_rate": 6.275567055344967e-05, + "loss": 1.6818, + "step": 4023 + }, + { + "epoch": 0.4179910667913161, + "grad_norm": 0.46665164828300476, + "learning_rate": 6.27398931999492e-05, + "loss": 1.8911, + "step": 4024 + }, + { + "epoch": 0.4180949413108964, + "grad_norm": 0.4181447923183441, + "learning_rate": 6.272411448974947e-05, + "loss": 1.7229, + "step": 4025 + }, + { + "epoch": 0.4181988158304768, + "grad_norm": 0.3725024461746216, + "learning_rate": 6.270833442453079e-05, + "loss": 1.664, + "step": 4026 + }, + { + "epoch": 0.4183026903500571, + "grad_norm": 0.430210679769516, + "learning_rate": 6.269255300597364e-05, + "loss": 1.6161, + "step": 4027 + }, + { + "epoch": 0.41840656486963745, + "grad_norm": 0.3929589092731476, + "learning_rate": 6.267677023575859e-05, + "loss": 1.7089, + "step": 4028 + }, + { + "epoch": 0.41851043938921784, + "grad_norm": 0.3702670931816101, + "learning_rate": 6.26609861155664e-05, + "loss": 1.6041, + "step": 4029 + }, + { + "epoch": 0.41861431390879816, + "grad_norm": 0.3990215063095093, + "learning_rate": 6.264520064707795e-05, + "loss": 1.7304, + "step": 4030 + }, + { + "epoch": 0.41871818842837855, + "grad_norm": 0.4187242388725281, + "learning_rate": 6.262941383197425e-05, + "loss": 1.813, + "step": 4031 + }, + { + "epoch": 0.4188220629479589, + "grad_norm": 0.46325740218162537, + "learning_rate": 6.261362567193651e-05, + "loss": 1.8182, + "step": 4032 + }, + { + "epoch": 0.4189259374675392, + "grad_norm": 0.3769034445285797, + "learning_rate": 6.259783616864601e-05, + "loss": 1.733, + "step": 4033 + }, + { + "epoch": 0.4190298119871196, + "grad_norm": 0.42521029710769653, + "learning_rate": 6.258204532378423e-05, + "loss": 1.8581, + "step": 4034 + }, + { + "epoch": 0.4191336865066999, + "grad_norm": 0.3773384988307953, + "learning_rate": 6.256625313903278e-05, + "loss": 1.6578, + "step": 4035 + }, + { + "epoch": 0.41923756102628024, + "grad_norm": 0.4068077504634857, + "learning_rate": 6.255045961607338e-05, + "loss": 1.7315, + "step": 4036 + }, + { + "epoch": 0.4193414355458606, + "grad_norm": 0.38190507888793945, + "learning_rate": 6.253466475658792e-05, + "loss": 1.5881, + "step": 4037 + }, + { + "epoch": 0.41944531006544095, + "grad_norm": 0.373384028673172, + "learning_rate": 6.251886856225844e-05, + "loss": 1.719, + "step": 4038 + }, + { + "epoch": 0.4195491845850213, + "grad_norm": 0.3872770369052887, + "learning_rate": 6.250307103476712e-05, + "loss": 1.6688, + "step": 4039 + }, + { + "epoch": 0.41965305910460166, + "grad_norm": 0.4040100574493408, + "learning_rate": 6.248727217579625e-05, + "loss": 1.7883, + "step": 4040 + }, + { + "epoch": 0.419756933624182, + "grad_norm": 0.5108729004859924, + "learning_rate": 6.247147198702832e-05, + "loss": 1.8919, + "step": 4041 + }, + { + "epoch": 0.4198608081437623, + "grad_norm": 0.4295211732387543, + "learning_rate": 6.245567047014589e-05, + "loss": 1.7379, + "step": 4042 + }, + { + "epoch": 0.4199646826633427, + "grad_norm": 0.4218069911003113, + "learning_rate": 6.243986762683171e-05, + "loss": 1.7343, + "step": 4043 + }, + { + "epoch": 0.420068557182923, + "grad_norm": 0.40920522809028625, + "learning_rate": 6.242406345876866e-05, + "loss": 1.8147, + "step": 4044 + }, + { + "epoch": 0.4201724317025034, + "grad_norm": 0.3913504481315613, + "learning_rate": 6.240825796763977e-05, + "loss": 1.6202, + "step": 4045 + }, + { + "epoch": 0.42027630622208373, + "grad_norm": 0.387791246175766, + "learning_rate": 6.239245115512818e-05, + "loss": 1.6385, + "step": 4046 + }, + { + "epoch": 0.42038018074166406, + "grad_norm": 0.3972698450088501, + "learning_rate": 6.237664302291722e-05, + "loss": 1.6763, + "step": 4047 + }, + { + "epoch": 0.42048405526124444, + "grad_norm": 0.35716134309768677, + "learning_rate": 6.236083357269033e-05, + "loss": 1.6527, + "step": 4048 + }, + { + "epoch": 0.42058792978082477, + "grad_norm": 0.40162917971611023, + "learning_rate": 6.234502280613108e-05, + "loss": 1.7658, + "step": 4049 + }, + { + "epoch": 0.4206918043004051, + "grad_norm": 0.3818826377391815, + "learning_rate": 6.232921072492318e-05, + "loss": 1.7382, + "step": 4050 + }, + { + "epoch": 0.4207956788199855, + "grad_norm": 0.4017144739627838, + "learning_rate": 6.23133973307505e-05, + "loss": 1.7945, + "step": 4051 + }, + { + "epoch": 0.4208995533395658, + "grad_norm": 0.3896738290786743, + "learning_rate": 6.229758262529708e-05, + "loss": 1.8404, + "step": 4052 + }, + { + "epoch": 0.42100342785914613, + "grad_norm": 0.38667070865631104, + "learning_rate": 6.228176661024703e-05, + "loss": 1.619, + "step": 4053 + }, + { + "epoch": 0.4211073023787265, + "grad_norm": 0.38369256258010864, + "learning_rate": 6.226594928728462e-05, + "loss": 1.6359, + "step": 4054 + }, + { + "epoch": 0.42121117689830684, + "grad_norm": 0.44246816635131836, + "learning_rate": 6.225013065809431e-05, + "loss": 1.8973, + "step": 4055 + }, + { + "epoch": 0.42131505141788717, + "grad_norm": 0.42337489128112793, + "learning_rate": 6.223431072436063e-05, + "loss": 1.6915, + "step": 4056 + }, + { + "epoch": 0.42141892593746755, + "grad_norm": 0.419403612613678, + "learning_rate": 6.221848948776828e-05, + "loss": 1.6772, + "step": 4057 + }, + { + "epoch": 0.4215228004570479, + "grad_norm": 0.3898250460624695, + "learning_rate": 6.220266695000211e-05, + "loss": 1.7213, + "step": 4058 + }, + { + "epoch": 0.42162667497662826, + "grad_norm": 0.4045391380786896, + "learning_rate": 6.21868431127471e-05, + "loss": 1.9761, + "step": 4059 + }, + { + "epoch": 0.4217305494962086, + "grad_norm": 0.3763904273509979, + "learning_rate": 6.217101797768837e-05, + "loss": 1.7037, + "step": 4060 + }, + { + "epoch": 0.4218344240157889, + "grad_norm": 0.397461861371994, + "learning_rate": 6.215519154651116e-05, + "loss": 1.7218, + "step": 4061 + }, + { + "epoch": 0.4219382985353693, + "grad_norm": 0.4115433990955353, + "learning_rate": 6.213936382090085e-05, + "loss": 1.7957, + "step": 4062 + }, + { + "epoch": 0.4220421730549496, + "grad_norm": 0.393285870552063, + "learning_rate": 6.2123534802543e-05, + "loss": 1.7508, + "step": 4063 + }, + { + "epoch": 0.42214604757452995, + "grad_norm": 0.3760216236114502, + "learning_rate": 6.210770449312326e-05, + "loss": 1.6243, + "step": 4064 + }, + { + "epoch": 0.42224992209411033, + "grad_norm": 0.4444178342819214, + "learning_rate": 6.209187289432742e-05, + "loss": 1.8161, + "step": 4065 + }, + { + "epoch": 0.42235379661369066, + "grad_norm": 0.37198397517204285, + "learning_rate": 6.207604000784143e-05, + "loss": 1.6786, + "step": 4066 + }, + { + "epoch": 0.422457671133271, + "grad_norm": 0.39783281087875366, + "learning_rate": 6.206020583535141e-05, + "loss": 1.7703, + "step": 4067 + }, + { + "epoch": 0.42256154565285137, + "grad_norm": 0.45979025959968567, + "learning_rate": 6.204437037854351e-05, + "loss": 1.8734, + "step": 4068 + }, + { + "epoch": 0.4226654201724317, + "grad_norm": 0.42977213859558105, + "learning_rate": 6.202853363910413e-05, + "loss": 1.7048, + "step": 4069 + }, + { + "epoch": 0.4227692946920121, + "grad_norm": 0.41636922955513, + "learning_rate": 6.201269561871975e-05, + "loss": 1.6683, + "step": 4070 + }, + { + "epoch": 0.4228731692115924, + "grad_norm": 0.4053545594215393, + "learning_rate": 6.199685631907697e-05, + "loss": 1.6983, + "step": 4071 + }, + { + "epoch": 0.42297704373117273, + "grad_norm": 0.378132164478302, + "learning_rate": 6.19810157418626e-05, + "loss": 1.6345, + "step": 4072 + }, + { + "epoch": 0.4230809182507531, + "grad_norm": 0.4450819492340088, + "learning_rate": 6.196517388876348e-05, + "loss": 1.6669, + "step": 4073 + }, + { + "epoch": 0.42318479277033344, + "grad_norm": 0.3707204759120941, + "learning_rate": 6.194933076146668e-05, + "loss": 1.5462, + "step": 4074 + }, + { + "epoch": 0.42328866728991377, + "grad_norm": 0.41867223381996155, + "learning_rate": 6.193348636165937e-05, + "loss": 1.8317, + "step": 4075 + }, + { + "epoch": 0.42339254180949415, + "grad_norm": 0.4516044557094574, + "learning_rate": 6.191764069102883e-05, + "loss": 1.8396, + "step": 4076 + }, + { + "epoch": 0.4234964163290745, + "grad_norm": 0.421350359916687, + "learning_rate": 6.190179375126254e-05, + "loss": 1.7317, + "step": 4077 + }, + { + "epoch": 0.4236002908486548, + "grad_norm": 0.3848811984062195, + "learning_rate": 6.188594554404804e-05, + "loss": 1.6922, + "step": 4078 + }, + { + "epoch": 0.4237041653682352, + "grad_norm": 0.41102609038352966, + "learning_rate": 6.187009607107304e-05, + "loss": 1.7322, + "step": 4079 + }, + { + "epoch": 0.4238080398878155, + "grad_norm": 0.4180265963077545, + "learning_rate": 6.185424533402543e-05, + "loss": 1.8053, + "step": 4080 + }, + { + "epoch": 0.42391191440739584, + "grad_norm": 0.4037413001060486, + "learning_rate": 6.183839333459315e-05, + "loss": 1.6599, + "step": 4081 + }, + { + "epoch": 0.4240157889269762, + "grad_norm": 0.40133136510849, + "learning_rate": 6.18225400744643e-05, + "loss": 1.7394, + "step": 4082 + }, + { + "epoch": 0.42411966344655655, + "grad_norm": 0.37302759289741516, + "learning_rate": 6.180668555532719e-05, + "loss": 1.5259, + "step": 4083 + }, + { + "epoch": 0.42422353796613693, + "grad_norm": 0.39756691455841064, + "learning_rate": 6.179082977887013e-05, + "loss": 1.6746, + "step": 4084 + }, + { + "epoch": 0.42432741248571726, + "grad_norm": 0.37990039587020874, + "learning_rate": 6.177497274678168e-05, + "loss": 1.6812, + "step": 4085 + }, + { + "epoch": 0.4244312870052976, + "grad_norm": 0.3902164697647095, + "learning_rate": 6.175911446075049e-05, + "loss": 1.6457, + "step": 4086 + }, + { + "epoch": 0.42453516152487797, + "grad_norm": 0.4064858555793762, + "learning_rate": 6.174325492246531e-05, + "loss": 1.7168, + "step": 4087 + }, + { + "epoch": 0.4246390360444583, + "grad_norm": 0.40128055214881897, + "learning_rate": 6.17273941336151e-05, + "loss": 1.7847, + "step": 4088 + }, + { + "epoch": 0.4247429105640386, + "grad_norm": 0.4626508951187134, + "learning_rate": 6.171153209588891e-05, + "loss": 1.951, + "step": 4089 + }, + { + "epoch": 0.424846785083619, + "grad_norm": 0.40969690680503845, + "learning_rate": 6.169566881097588e-05, + "loss": 1.4734, + "step": 4090 + }, + { + "epoch": 0.42495065960319933, + "grad_norm": 0.3807416558265686, + "learning_rate": 6.167980428056537e-05, + "loss": 1.5504, + "step": 4091 + }, + { + "epoch": 0.42505453412277966, + "grad_norm": 0.3962438702583313, + "learning_rate": 6.16639385063468e-05, + "loss": 1.8144, + "step": 4092 + }, + { + "epoch": 0.42515840864236004, + "grad_norm": 0.3545083701610565, + "learning_rate": 6.164807149000974e-05, + "loss": 1.5118, + "step": 4093 + }, + { + "epoch": 0.42526228316194037, + "grad_norm": 0.43927130103111267, + "learning_rate": 6.163220323324395e-05, + "loss": 1.8641, + "step": 4094 + }, + { + "epoch": 0.4253661576815207, + "grad_norm": 0.41815078258514404, + "learning_rate": 6.161633373773925e-05, + "loss": 1.7244, + "step": 4095 + }, + { + "epoch": 0.4254700322011011, + "grad_norm": 0.4570396840572357, + "learning_rate": 6.16004630051856e-05, + "loss": 1.8696, + "step": 4096 + }, + { + "epoch": 0.4255739067206814, + "grad_norm": 0.3970779478549957, + "learning_rate": 6.158459103727316e-05, + "loss": 1.7826, + "step": 4097 + }, + { + "epoch": 0.4256777812402618, + "grad_norm": 0.437796026468277, + "learning_rate": 6.15687178356921e-05, + "loss": 1.6452, + "step": 4098 + }, + { + "epoch": 0.4257816557598421, + "grad_norm": 0.4131401777267456, + "learning_rate": 6.155284340213285e-05, + "loss": 1.6827, + "step": 4099 + }, + { + "epoch": 0.42588553027942244, + "grad_norm": 0.4367378056049347, + "learning_rate": 6.15369677382859e-05, + "loss": 1.9042, + "step": 4100 + }, + { + "epoch": 0.4259894047990028, + "grad_norm": 0.39361101388931274, + "learning_rate": 6.152109084584186e-05, + "loss": 1.643, + "step": 4101 + }, + { + "epoch": 0.42609327931858315, + "grad_norm": 0.37626150250434875, + "learning_rate": 6.150521272649152e-05, + "loss": 1.5454, + "step": 4102 + }, + { + "epoch": 0.4261971538381635, + "grad_norm": 0.3899744749069214, + "learning_rate": 6.148933338192578e-05, + "loss": 1.6159, + "step": 4103 + }, + { + "epoch": 0.42630102835774386, + "grad_norm": 0.4217323362827301, + "learning_rate": 6.147345281383565e-05, + "loss": 1.8407, + "step": 4104 + }, + { + "epoch": 0.4264049028773242, + "grad_norm": 0.42654165625572205, + "learning_rate": 6.145757102391227e-05, + "loss": 1.7704, + "step": 4105 + }, + { + "epoch": 0.4265087773969045, + "grad_norm": 0.3943133056163788, + "learning_rate": 6.144168801384698e-05, + "loss": 1.7799, + "step": 4106 + }, + { + "epoch": 0.4266126519164849, + "grad_norm": 0.37911903858184814, + "learning_rate": 6.142580378533114e-05, + "loss": 1.6106, + "step": 4107 + }, + { + "epoch": 0.4267165264360652, + "grad_norm": 0.42597952485084534, + "learning_rate": 6.140991834005632e-05, + "loss": 1.6198, + "step": 4108 + }, + { + "epoch": 0.4268204009556456, + "grad_norm": 0.43222105503082275, + "learning_rate": 6.139403167971422e-05, + "loss": 1.7884, + "step": 4109 + }, + { + "epoch": 0.42692427547522593, + "grad_norm": 0.4335835576057434, + "learning_rate": 6.13781438059966e-05, + "loss": 1.7371, + "step": 4110 + }, + { + "epoch": 0.42702814999480626, + "grad_norm": 0.41362127661705017, + "learning_rate": 6.136225472059545e-05, + "loss": 1.6047, + "step": 4111 + }, + { + "epoch": 0.42713202451438664, + "grad_norm": 0.38855838775634766, + "learning_rate": 6.134636442520277e-05, + "loss": 1.6329, + "step": 4112 + }, + { + "epoch": 0.42723589903396697, + "grad_norm": 0.39782068133354187, + "learning_rate": 6.13304729215108e-05, + "loss": 1.711, + "step": 4113 + }, + { + "epoch": 0.4273397735535473, + "grad_norm": 0.4341132342815399, + "learning_rate": 6.131458021121184e-05, + "loss": 1.6623, + "step": 4114 + }, + { + "epoch": 0.4274436480731277, + "grad_norm": 0.4523240625858307, + "learning_rate": 6.129868629599832e-05, + "loss": 1.8319, + "step": 4115 + }, + { + "epoch": 0.427547522592708, + "grad_norm": 0.3760691285133362, + "learning_rate": 6.128279117756288e-05, + "loss": 1.8514, + "step": 4116 + }, + { + "epoch": 0.42765139711228833, + "grad_norm": 0.45057788491249084, + "learning_rate": 6.126689485759818e-05, + "loss": 1.7907, + "step": 4117 + }, + { + "epoch": 0.4277552716318687, + "grad_norm": 0.4334897994995117, + "learning_rate": 6.125099733779706e-05, + "loss": 1.5299, + "step": 4118 + }, + { + "epoch": 0.42785914615144904, + "grad_norm": 0.4173983633518219, + "learning_rate": 6.12350986198525e-05, + "loss": 1.6515, + "step": 4119 + }, + { + "epoch": 0.42796302067102937, + "grad_norm": 0.4509902000427246, + "learning_rate": 6.121919870545755e-05, + "loss": 1.9308, + "step": 4120 + }, + { + "epoch": 0.42806689519060975, + "grad_norm": 0.433244526386261, + "learning_rate": 6.120329759630546e-05, + "loss": 1.9438, + "step": 4121 + }, + { + "epoch": 0.4281707697101901, + "grad_norm": 0.4360440671443939, + "learning_rate": 6.118739529408956e-05, + "loss": 1.8285, + "step": 4122 + }, + { + "epoch": 0.42827464422977046, + "grad_norm": 0.44721856713294983, + "learning_rate": 6.117149180050332e-05, + "loss": 1.7948, + "step": 4123 + }, + { + "epoch": 0.4283785187493508, + "grad_norm": 0.4097500741481781, + "learning_rate": 6.115558711724036e-05, + "loss": 1.6341, + "step": 4124 + }, + { + "epoch": 0.4284823932689311, + "grad_norm": 0.3732147514820099, + "learning_rate": 6.113968124599439e-05, + "loss": 1.6512, + "step": 4125 + }, + { + "epoch": 0.4285862677885115, + "grad_norm": 0.407043993473053, + "learning_rate": 6.112377418845925e-05, + "loss": 1.7734, + "step": 4126 + }, + { + "epoch": 0.4286901423080918, + "grad_norm": 0.4379003047943115, + "learning_rate": 6.110786594632892e-05, + "loss": 1.8773, + "step": 4127 + }, + { + "epoch": 0.42879401682767215, + "grad_norm": 0.49205613136291504, + "learning_rate": 6.109195652129753e-05, + "loss": 1.6428, + "step": 4128 + }, + { + "epoch": 0.42889789134725254, + "grad_norm": 0.40176907181739807, + "learning_rate": 6.107604591505928e-05, + "loss": 1.5291, + "step": 4129 + }, + { + "epoch": 0.42900176586683286, + "grad_norm": 0.4463737905025482, + "learning_rate": 6.106013412930853e-05, + "loss": 1.8709, + "step": 4130 + }, + { + "epoch": 0.4291056403864132, + "grad_norm": 0.49634161591529846, + "learning_rate": 6.10442211657398e-05, + "loss": 2.1084, + "step": 4131 + }, + { + "epoch": 0.42920951490599357, + "grad_norm": 0.3978555500507355, + "learning_rate": 6.1028307026047624e-05, + "loss": 1.7182, + "step": 4132 + }, + { + "epoch": 0.4293133894255739, + "grad_norm": 0.39746221899986267, + "learning_rate": 6.1012391711926795e-05, + "loss": 1.7323, + "step": 4133 + }, + { + "epoch": 0.4294172639451542, + "grad_norm": 0.39065563678741455, + "learning_rate": 6.099647522507216e-05, + "loss": 1.7496, + "step": 4134 + }, + { + "epoch": 0.4295211384647346, + "grad_norm": 0.41617366671562195, + "learning_rate": 6.098055756717866e-05, + "loss": 1.781, + "step": 4135 + }, + { + "epoch": 0.42962501298431494, + "grad_norm": 0.41195693612098694, + "learning_rate": 6.0964638739941447e-05, + "loss": 1.7705, + "step": 4136 + }, + { + "epoch": 0.4297288875038953, + "grad_norm": 0.397640585899353, + "learning_rate": 6.094871874505572e-05, + "loss": 1.7518, + "step": 4137 + }, + { + "epoch": 0.42983276202347565, + "grad_norm": 0.37663549184799194, + "learning_rate": 6.093279758421687e-05, + "loss": 1.6859, + "step": 4138 + }, + { + "epoch": 0.42993663654305597, + "grad_norm": 0.3834432065486908, + "learning_rate": 6.0916875259120366e-05, + "loss": 1.5105, + "step": 4139 + }, + { + "epoch": 0.43004051106263635, + "grad_norm": 0.4278203248977661, + "learning_rate": 6.090095177146178e-05, + "loss": 1.7621, + "step": 4140 + }, + { + "epoch": 0.4301443855822167, + "grad_norm": 0.4155001938343048, + "learning_rate": 6.088502712293687e-05, + "loss": 1.8687, + "step": 4141 + }, + { + "epoch": 0.430248260101797, + "grad_norm": 0.4091452658176422, + "learning_rate": 6.086910131524147e-05, + "loss": 1.6906, + "step": 4142 + }, + { + "epoch": 0.4303521346213774, + "grad_norm": 0.4341062605381012, + "learning_rate": 6.085317435007156e-05, + "loss": 1.7431, + "step": 4143 + }, + { + "epoch": 0.4304560091409577, + "grad_norm": 0.4048025906085968, + "learning_rate": 6.0837246229123246e-05, + "loss": 1.7628, + "step": 4144 + }, + { + "epoch": 0.43055988366053805, + "grad_norm": 0.3721714913845062, + "learning_rate": 6.0821316954092745e-05, + "loss": 1.4422, + "step": 4145 + }, + { + "epoch": 0.43066375818011843, + "grad_norm": 0.39783287048339844, + "learning_rate": 6.08053865266764e-05, + "loss": 1.6521, + "step": 4146 + }, + { + "epoch": 0.43076763269969875, + "grad_norm": 0.44611310958862305, + "learning_rate": 6.078945494857068e-05, + "loss": 1.9144, + "step": 4147 + }, + { + "epoch": 0.43087150721927914, + "grad_norm": 0.3994828462600708, + "learning_rate": 6.077352222147214e-05, + "loss": 1.6819, + "step": 4148 + }, + { + "epoch": 0.43097538173885946, + "grad_norm": 0.4165269434452057, + "learning_rate": 6.075758834707754e-05, + "loss": 1.7586, + "step": 4149 + }, + { + "epoch": 0.4310792562584398, + "grad_norm": 0.4540640413761139, + "learning_rate": 6.0741653327083703e-05, + "loss": 1.8504, + "step": 4150 + }, + { + "epoch": 0.4311831307780202, + "grad_norm": 0.4224712550640106, + "learning_rate": 6.072571716318756e-05, + "loss": 1.7859, + "step": 4151 + }, + { + "epoch": 0.4312870052976005, + "grad_norm": 0.4140393137931824, + "learning_rate": 6.0709779857086214e-05, + "loss": 1.6534, + "step": 4152 + }, + { + "epoch": 0.43139087981718083, + "grad_norm": 0.37748152017593384, + "learning_rate": 6.069384141047686e-05, + "loss": 1.5697, + "step": 4153 + }, + { + "epoch": 0.4314947543367612, + "grad_norm": 0.4263037145137787, + "learning_rate": 6.067790182505678e-05, + "loss": 1.6565, + "step": 4154 + }, + { + "epoch": 0.43159862885634154, + "grad_norm": 0.38402366638183594, + "learning_rate": 6.066196110252347e-05, + "loss": 1.6159, + "step": 4155 + }, + { + "epoch": 0.43170250337592186, + "grad_norm": 0.3785112798213959, + "learning_rate": 6.064601924457447e-05, + "loss": 1.4971, + "step": 4156 + }, + { + "epoch": 0.43180637789550225, + "grad_norm": 0.3928913474082947, + "learning_rate": 6.0630076252907445e-05, + "loss": 1.6329, + "step": 4157 + }, + { + "epoch": 0.4319102524150826, + "grad_norm": 0.42711085081100464, + "learning_rate": 6.061413212922022e-05, + "loss": 1.7333, + "step": 4158 + }, + { + "epoch": 0.4320141269346629, + "grad_norm": 0.40235477685928345, + "learning_rate": 6.059818687521074e-05, + "loss": 1.6936, + "step": 4159 + }, + { + "epoch": 0.4321180014542433, + "grad_norm": 0.4238249957561493, + "learning_rate": 6.0582240492577015e-05, + "loss": 1.7692, + "step": 4160 + }, + { + "epoch": 0.4322218759738236, + "grad_norm": 0.4094812870025635, + "learning_rate": 6.056629298301722e-05, + "loss": 1.6549, + "step": 4161 + }, + { + "epoch": 0.432325750493404, + "grad_norm": 0.4390838146209717, + "learning_rate": 6.0550344348229626e-05, + "loss": 1.8607, + "step": 4162 + }, + { + "epoch": 0.4324296250129843, + "grad_norm": 0.3718215823173523, + "learning_rate": 6.053439458991268e-05, + "loss": 1.6305, + "step": 4163 + }, + { + "epoch": 0.43253349953256465, + "grad_norm": 0.42819488048553467, + "learning_rate": 6.051844370976487e-05, + "loss": 1.5788, + "step": 4164 + }, + { + "epoch": 0.43263737405214503, + "grad_norm": 0.37748944759368896, + "learning_rate": 6.050249170948484e-05, + "loss": 1.6356, + "step": 4165 + }, + { + "epoch": 0.43274124857172536, + "grad_norm": 0.4419516623020172, + "learning_rate": 6.048653859077138e-05, + "loss": 1.7449, + "step": 4166 + }, + { + "epoch": 0.4328451230913057, + "grad_norm": 0.40298861265182495, + "learning_rate": 6.0470584355323375e-05, + "loss": 1.648, + "step": 4167 + }, + { + "epoch": 0.43294899761088607, + "grad_norm": 0.41812488436698914, + "learning_rate": 6.045462900483978e-05, + "loss": 1.8709, + "step": 4168 + }, + { + "epoch": 0.4330528721304664, + "grad_norm": 0.4024185538291931, + "learning_rate": 6.0438672541019755e-05, + "loss": 1.7013, + "step": 4169 + }, + { + "epoch": 0.4331567466500467, + "grad_norm": 0.3921603262424469, + "learning_rate": 6.042271496556254e-05, + "loss": 1.6795, + "step": 4170 + }, + { + "epoch": 0.4332606211696271, + "grad_norm": 0.41133809089660645, + "learning_rate": 6.040675628016746e-05, + "loss": 1.7468, + "step": 4171 + }, + { + "epoch": 0.43336449568920743, + "grad_norm": 0.4003753960132599, + "learning_rate": 6.039079648653403e-05, + "loss": 1.7661, + "step": 4172 + }, + { + "epoch": 0.4334683702087878, + "grad_norm": 0.3852844536304474, + "learning_rate": 6.037483558636183e-05, + "loss": 1.6341, + "step": 4173 + }, + { + "epoch": 0.43357224472836814, + "grad_norm": 0.40245676040649414, + "learning_rate": 6.035887358135056e-05, + "loss": 1.7788, + "step": 4174 + }, + { + "epoch": 0.43367611924794847, + "grad_norm": 0.4260011613368988, + "learning_rate": 6.0342910473200054e-05, + "loss": 1.8262, + "step": 4175 + }, + { + "epoch": 0.43377999376752885, + "grad_norm": 0.4148736298084259, + "learning_rate": 6.032694626361025e-05, + "loss": 1.7661, + "step": 4176 + }, + { + "epoch": 0.4338838682871092, + "grad_norm": 0.38179653882980347, + "learning_rate": 6.0310980954281234e-05, + "loss": 1.5742, + "step": 4177 + }, + { + "epoch": 0.4339877428066895, + "grad_norm": 0.4099988639354706, + "learning_rate": 6.029501454691318e-05, + "loss": 1.66, + "step": 4178 + }, + { + "epoch": 0.4340916173262699, + "grad_norm": 0.3874553442001343, + "learning_rate": 6.027904704320636e-05, + "loss": 1.6484, + "step": 4179 + }, + { + "epoch": 0.4341954918458502, + "grad_norm": 0.390216588973999, + "learning_rate": 6.026307844486123e-05, + "loss": 1.7898, + "step": 4180 + }, + { + "epoch": 0.43429936636543054, + "grad_norm": 0.38214191794395447, + "learning_rate": 6.024710875357831e-05, + "loss": 1.6147, + "step": 4181 + }, + { + "epoch": 0.4344032408850109, + "grad_norm": 0.37591472268104553, + "learning_rate": 6.02311379710582e-05, + "loss": 1.685, + "step": 4182 + }, + { + "epoch": 0.43450711540459125, + "grad_norm": 0.42642828822135925, + "learning_rate": 6.021516609900173e-05, + "loss": 1.8715, + "step": 4183 + }, + { + "epoch": 0.4346109899241716, + "grad_norm": 0.4037899672985077, + "learning_rate": 6.019919313910976e-05, + "loss": 1.6407, + "step": 4184 + }, + { + "epoch": 0.43471486444375196, + "grad_norm": 0.40876585245132446, + "learning_rate": 6.0183219093083244e-05, + "loss": 1.6688, + "step": 4185 + }, + { + "epoch": 0.4348187389633323, + "grad_norm": 0.4056430757045746, + "learning_rate": 6.016724396262335e-05, + "loss": 1.8047, + "step": 4186 + }, + { + "epoch": 0.43492261348291267, + "grad_norm": 0.4015723168849945, + "learning_rate": 6.015126774943128e-05, + "loss": 1.7502, + "step": 4187 + }, + { + "epoch": 0.435026488002493, + "grad_norm": 0.4126836359500885, + "learning_rate": 6.013529045520838e-05, + "loss": 1.8191, + "step": 4188 + }, + { + "epoch": 0.4351303625220733, + "grad_norm": 0.39362239837646484, + "learning_rate": 6.011931208165611e-05, + "loss": 1.6365, + "step": 4189 + }, + { + "epoch": 0.4352342370416537, + "grad_norm": 0.37851396203041077, + "learning_rate": 6.0103332630476017e-05, + "loss": 1.7451, + "step": 4190 + }, + { + "epoch": 0.43533811156123403, + "grad_norm": 0.3897380530834198, + "learning_rate": 6.008735210336982e-05, + "loss": 1.7397, + "step": 4191 + }, + { + "epoch": 0.43544198608081436, + "grad_norm": 0.38429588079452515, + "learning_rate": 6.00713705020393e-05, + "loss": 1.5615, + "step": 4192 + }, + { + "epoch": 0.43554586060039474, + "grad_norm": 0.4072874188423157, + "learning_rate": 6.005538782818638e-05, + "loss": 1.8237, + "step": 4193 + }, + { + "epoch": 0.43564973511997507, + "grad_norm": 0.4054878354072571, + "learning_rate": 6.003940408351311e-05, + "loss": 1.7932, + "step": 4194 + }, + { + "epoch": 0.4357536096395554, + "grad_norm": 0.38141578435897827, + "learning_rate": 6.0023419269721613e-05, + "loss": 1.6768, + "step": 4195 + }, + { + "epoch": 0.4358574841591358, + "grad_norm": 0.4173438251018524, + "learning_rate": 6.000743338851413e-05, + "loss": 1.513, + "step": 4196 + }, + { + "epoch": 0.4359613586787161, + "grad_norm": 0.4424532651901245, + "learning_rate": 5.999144644159307e-05, + "loss": 1.7438, + "step": 4197 + }, + { + "epoch": 0.43606523319829643, + "grad_norm": 0.43783673644065857, + "learning_rate": 5.997545843066089e-05, + "loss": 1.7892, + "step": 4198 + }, + { + "epoch": 0.4361691077178768, + "grad_norm": 0.3705306649208069, + "learning_rate": 5.995946935742019e-05, + "loss": 1.6238, + "step": 4199 + }, + { + "epoch": 0.43627298223745714, + "grad_norm": 0.38308608531951904, + "learning_rate": 5.994347922357372e-05, + "loss": 1.5195, + "step": 4200 + }, + { + "epoch": 0.4363768567570375, + "grad_norm": 0.43744367361068726, + "learning_rate": 5.992748803082425e-05, + "loss": 1.9181, + "step": 4201 + }, + { + "epoch": 0.43648073127661785, + "grad_norm": 0.42308345437049866, + "learning_rate": 5.991149578087476e-05, + "loss": 1.767, + "step": 4202 + }, + { + "epoch": 0.4365846057961982, + "grad_norm": 0.4409851133823395, + "learning_rate": 5.9895502475428265e-05, + "loss": 1.8537, + "step": 4203 + }, + { + "epoch": 0.43668848031577856, + "grad_norm": 0.377109169960022, + "learning_rate": 5.9879508116187947e-05, + "loss": 1.6446, + "step": 4204 + }, + { + "epoch": 0.4367923548353589, + "grad_norm": 0.4228600263595581, + "learning_rate": 5.9863512704857085e-05, + "loss": 1.7255, + "step": 4205 + }, + { + "epoch": 0.4368962293549392, + "grad_norm": 0.410178542137146, + "learning_rate": 5.9847516243139055e-05, + "loss": 1.6686, + "step": 4206 + }, + { + "epoch": 0.4370001038745196, + "grad_norm": 0.37821877002716064, + "learning_rate": 5.9831518732737344e-05, + "loss": 1.563, + "step": 4207 + }, + { + "epoch": 0.4371039783940999, + "grad_norm": 0.3783068358898163, + "learning_rate": 5.981552017535561e-05, + "loss": 1.6068, + "step": 4208 + }, + { + "epoch": 0.43720785291368025, + "grad_norm": 0.4072893261909485, + "learning_rate": 5.979952057269752e-05, + "loss": 1.7952, + "step": 4209 + }, + { + "epoch": 0.43731172743326063, + "grad_norm": 0.3989700376987457, + "learning_rate": 5.9783519926466924e-05, + "loss": 1.66, + "step": 4210 + }, + { + "epoch": 0.43741560195284096, + "grad_norm": 0.4692555367946625, + "learning_rate": 5.976751823836778e-05, + "loss": 1.6255, + "step": 4211 + }, + { + "epoch": 0.43751947647242134, + "grad_norm": 0.3792388439178467, + "learning_rate": 5.975151551010412e-05, + "loss": 1.5188, + "step": 4212 + }, + { + "epoch": 0.43762335099200167, + "grad_norm": 0.38823702931404114, + "learning_rate": 5.973551174338014e-05, + "loss": 1.6951, + "step": 4213 + }, + { + "epoch": 0.437727225511582, + "grad_norm": 0.37780869007110596, + "learning_rate": 5.9719506939900094e-05, + "loss": 1.5638, + "step": 4214 + }, + { + "epoch": 0.4378311000311624, + "grad_norm": 0.45048758387565613, + "learning_rate": 5.9703501101368374e-05, + "loss": 1.8339, + "step": 4215 + }, + { + "epoch": 0.4379349745507427, + "grad_norm": 0.3920362889766693, + "learning_rate": 5.968749422948947e-05, + "loss": 1.723, + "step": 4216 + }, + { + "epoch": 0.43803884907032303, + "grad_norm": 0.3886549770832062, + "learning_rate": 5.967148632596801e-05, + "loss": 1.6502, + "step": 4217 + }, + { + "epoch": 0.4381427235899034, + "grad_norm": 0.4173612594604492, + "learning_rate": 5.965547739250867e-05, + "loss": 1.8315, + "step": 4218 + }, + { + "epoch": 0.43824659810948374, + "grad_norm": 0.45074644684791565, + "learning_rate": 5.963946743081633e-05, + "loss": 1.7273, + "step": 4219 + }, + { + "epoch": 0.43835047262906407, + "grad_norm": 0.42388424277305603, + "learning_rate": 5.962345644259589e-05, + "loss": 1.7344, + "step": 4220 + }, + { + "epoch": 0.43845434714864445, + "grad_norm": 0.44487425684928894, + "learning_rate": 5.9607444429552405e-05, + "loss": 1.771, + "step": 4221 + }, + { + "epoch": 0.4385582216682248, + "grad_norm": 0.4247962236404419, + "learning_rate": 5.959143139339104e-05, + "loss": 1.7933, + "step": 4222 + }, + { + "epoch": 0.4386620961878051, + "grad_norm": 0.4087913930416107, + "learning_rate": 5.957541733581704e-05, + "loss": 1.667, + "step": 4223 + }, + { + "epoch": 0.4387659707073855, + "grad_norm": 0.3898204267024994, + "learning_rate": 5.955940225853577e-05, + "loss": 1.6835, + "step": 4224 + }, + { + "epoch": 0.4388698452269658, + "grad_norm": 0.3906581997871399, + "learning_rate": 5.9543386163252744e-05, + "loss": 1.7532, + "step": 4225 + }, + { + "epoch": 0.4389737197465462, + "grad_norm": 0.37566760182380676, + "learning_rate": 5.952736905167352e-05, + "loss": 1.5813, + "step": 4226 + }, + { + "epoch": 0.4390775942661265, + "grad_norm": 0.42560824751853943, + "learning_rate": 5.951135092550382e-05, + "loss": 1.8791, + "step": 4227 + }, + { + "epoch": 0.43918146878570685, + "grad_norm": 0.4161139130592346, + "learning_rate": 5.949533178644943e-05, + "loss": 1.6862, + "step": 4228 + }, + { + "epoch": 0.43928534330528723, + "grad_norm": 0.398384690284729, + "learning_rate": 5.9479311636216274e-05, + "loss": 1.7341, + "step": 4229 + }, + { + "epoch": 0.43938921782486756, + "grad_norm": 0.36606839299201965, + "learning_rate": 5.946329047651037e-05, + "loss": 1.6987, + "step": 4230 + }, + { + "epoch": 0.4394930923444479, + "grad_norm": 0.41310790181159973, + "learning_rate": 5.944726830903785e-05, + "loss": 1.7809, + "step": 4231 + }, + { + "epoch": 0.43959696686402827, + "grad_norm": 0.41169822216033936, + "learning_rate": 5.9431245135504934e-05, + "loss": 1.7338, + "step": 4232 + }, + { + "epoch": 0.4397008413836086, + "grad_norm": 0.3810160756111145, + "learning_rate": 5.941522095761799e-05, + "loss": 1.5948, + "step": 4233 + }, + { + "epoch": 0.4398047159031889, + "grad_norm": 0.3669174909591675, + "learning_rate": 5.939919577708346e-05, + "loss": 1.2893, + "step": 4234 + }, + { + "epoch": 0.4399085904227693, + "grad_norm": 0.4046458601951599, + "learning_rate": 5.938316959560788e-05, + "loss": 1.6906, + "step": 4235 + }, + { + "epoch": 0.44001246494234963, + "grad_norm": 0.40834760665893555, + "learning_rate": 5.9367142414897945e-05, + "loss": 1.7074, + "step": 4236 + }, + { + "epoch": 0.44011633946192996, + "grad_norm": 0.425662636756897, + "learning_rate": 5.935111423666041e-05, + "loss": 1.7486, + "step": 4237 + }, + { + "epoch": 0.44022021398151034, + "grad_norm": 0.37841665744781494, + "learning_rate": 5.933508506260214e-05, + "loss": 1.7472, + "step": 4238 + }, + { + "epoch": 0.44032408850109067, + "grad_norm": 0.4464881420135498, + "learning_rate": 5.9319054894430126e-05, + "loss": 1.8697, + "step": 4239 + }, + { + "epoch": 0.44042796302067105, + "grad_norm": 0.4150855541229248, + "learning_rate": 5.930302373385145e-05, + "loss": 1.5873, + "step": 4240 + }, + { + "epoch": 0.4405318375402514, + "grad_norm": 0.38565129041671753, + "learning_rate": 5.928699158257333e-05, + "loss": 1.71, + "step": 4241 + }, + { + "epoch": 0.4406357120598317, + "grad_norm": 0.44895511865615845, + "learning_rate": 5.9270958442303035e-05, + "loss": 1.8128, + "step": 4242 + }, + { + "epoch": 0.4407395865794121, + "grad_norm": 0.3845813572406769, + "learning_rate": 5.9254924314748e-05, + "loss": 1.7892, + "step": 4243 + }, + { + "epoch": 0.4408434610989924, + "grad_norm": 0.4151462912559509, + "learning_rate": 5.923888920161571e-05, + "loss": 1.7573, + "step": 4244 + }, + { + "epoch": 0.44094733561857274, + "grad_norm": 0.38922443985939026, + "learning_rate": 5.922285310461378e-05, + "loss": 1.7848, + "step": 4245 + }, + { + "epoch": 0.4410512101381531, + "grad_norm": 0.38882389664649963, + "learning_rate": 5.920681602544993e-05, + "loss": 1.7333, + "step": 4246 + }, + { + "epoch": 0.44115508465773345, + "grad_norm": 0.38732343912124634, + "learning_rate": 5.9190777965832e-05, + "loss": 1.6265, + "step": 4247 + }, + { + "epoch": 0.4412589591773138, + "grad_norm": 0.3835653066635132, + "learning_rate": 5.917473892746791e-05, + "loss": 1.5993, + "step": 4248 + }, + { + "epoch": 0.44136283369689416, + "grad_norm": 0.41864073276519775, + "learning_rate": 5.915869891206567e-05, + "loss": 1.7911, + "step": 4249 + }, + { + "epoch": 0.4414667082164745, + "grad_norm": 0.43092262744903564, + "learning_rate": 5.9142657921333466e-05, + "loss": 1.487, + "step": 4250 + }, + { + "epoch": 0.4415705827360549, + "grad_norm": 0.3756438195705414, + "learning_rate": 5.912661595697948e-05, + "loss": 1.5246, + "step": 4251 + }, + { + "epoch": 0.4416744572556352, + "grad_norm": 0.3791937828063965, + "learning_rate": 5.91105730207121e-05, + "loss": 1.7199, + "step": 4252 + }, + { + "epoch": 0.4417783317752155, + "grad_norm": 0.381509393453598, + "learning_rate": 5.909452911423976e-05, + "loss": 1.6264, + "step": 4253 + }, + { + "epoch": 0.4418822062947959, + "grad_norm": 0.3744488060474396, + "learning_rate": 5.9078484239271e-05, + "loss": 1.5363, + "step": 4254 + }, + { + "epoch": 0.44198608081437624, + "grad_norm": 0.3968122601509094, + "learning_rate": 5.9062438397514496e-05, + "loss": 1.7037, + "step": 4255 + }, + { + "epoch": 0.44208995533395656, + "grad_norm": 0.4114841818809509, + "learning_rate": 5.904639159067898e-05, + "loss": 1.7508, + "step": 4256 + }, + { + "epoch": 0.44219382985353695, + "grad_norm": 0.37198910117149353, + "learning_rate": 5.9030343820473335e-05, + "loss": 1.7378, + "step": 4257 + }, + { + "epoch": 0.4422977043731173, + "grad_norm": 0.47801533341407776, + "learning_rate": 5.9014295088606505e-05, + "loss": 1.838, + "step": 4258 + }, + { + "epoch": 0.4424015788926976, + "grad_norm": 0.43482860922813416, + "learning_rate": 5.899824539678758e-05, + "loss": 1.8126, + "step": 4259 + }, + { + "epoch": 0.442505453412278, + "grad_norm": 0.3933789134025574, + "learning_rate": 5.898219474672568e-05, + "loss": 1.5948, + "step": 4260 + }, + { + "epoch": 0.4426093279318583, + "grad_norm": 0.369711697101593, + "learning_rate": 5.896614314013012e-05, + "loss": 1.5256, + "step": 4261 + }, + { + "epoch": 0.44271320245143864, + "grad_norm": 0.43509915471076965, + "learning_rate": 5.8950090578710246e-05, + "loss": 1.762, + "step": 4262 + }, + { + "epoch": 0.442817076971019, + "grad_norm": 0.4086291193962097, + "learning_rate": 5.893403706417554e-05, + "loss": 1.8018, + "step": 4263 + }, + { + "epoch": 0.44292095149059935, + "grad_norm": 0.38804736733436584, + "learning_rate": 5.89179825982356e-05, + "loss": 1.717, + "step": 4264 + }, + { + "epoch": 0.44302482601017973, + "grad_norm": 0.4058080017566681, + "learning_rate": 5.8901927182600035e-05, + "loss": 1.8017, + "step": 4265 + }, + { + "epoch": 0.44312870052976006, + "grad_norm": 0.381388783454895, + "learning_rate": 5.88858708189787e-05, + "loss": 1.8363, + "step": 4266 + }, + { + "epoch": 0.4432325750493404, + "grad_norm": 0.3509836196899414, + "learning_rate": 5.886981350908142e-05, + "loss": 1.5213, + "step": 4267 + }, + { + "epoch": 0.44333644956892077, + "grad_norm": 0.39383694529533386, + "learning_rate": 5.885375525461817e-05, + "loss": 1.6166, + "step": 4268 + }, + { + "epoch": 0.4434403240885011, + "grad_norm": 0.4015539586544037, + "learning_rate": 5.883769605729907e-05, + "loss": 1.7148, + "step": 4269 + }, + { + "epoch": 0.4435441986080814, + "grad_norm": 0.3899979293346405, + "learning_rate": 5.882163591883427e-05, + "loss": 1.5109, + "step": 4270 + }, + { + "epoch": 0.4436480731276618, + "grad_norm": 0.41530346870422363, + "learning_rate": 5.8805574840934067e-05, + "loss": 1.6773, + "step": 4271 + }, + { + "epoch": 0.44375194764724213, + "grad_norm": 0.4475281536579132, + "learning_rate": 5.878951282530881e-05, + "loss": 1.6551, + "step": 4272 + }, + { + "epoch": 0.44385582216682246, + "grad_norm": 0.442122220993042, + "learning_rate": 5.877344987366902e-05, + "loss": 1.7879, + "step": 4273 + }, + { + "epoch": 0.44395969668640284, + "grad_norm": 0.39350566267967224, + "learning_rate": 5.875738598772522e-05, + "loss": 1.705, + "step": 4274 + }, + { + "epoch": 0.44406357120598317, + "grad_norm": 0.43882155418395996, + "learning_rate": 5.874132116918816e-05, + "loss": 1.7977, + "step": 4275 + }, + { + "epoch": 0.4441674457255635, + "grad_norm": 0.42583081126213074, + "learning_rate": 5.8725255419768565e-05, + "loss": 1.6302, + "step": 4276 + }, + { + "epoch": 0.4442713202451439, + "grad_norm": 0.40676143765449524, + "learning_rate": 5.870918874117731e-05, + "loss": 1.8449, + "step": 4277 + }, + { + "epoch": 0.4443751947647242, + "grad_norm": 0.42294740676879883, + "learning_rate": 5.869312113512542e-05, + "loss": 1.7231, + "step": 4278 + }, + { + "epoch": 0.4444790692843046, + "grad_norm": 0.41314196586608887, + "learning_rate": 5.867705260332391e-05, + "loss": 1.6797, + "step": 4279 + }, + { + "epoch": 0.4445829438038849, + "grad_norm": 0.395781934261322, + "learning_rate": 5.866098314748401e-05, + "loss": 1.7341, + "step": 4280 + }, + { + "epoch": 0.44468681832346524, + "grad_norm": 0.412775456905365, + "learning_rate": 5.864491276931694e-05, + "loss": 1.685, + "step": 4281 + }, + { + "epoch": 0.4447906928430456, + "grad_norm": 0.38872653245925903, + "learning_rate": 5.862884147053409e-05, + "loss": 1.7268, + "step": 4282 + }, + { + "epoch": 0.44489456736262595, + "grad_norm": 0.36840468645095825, + "learning_rate": 5.861276925284694e-05, + "loss": 1.5863, + "step": 4283 + }, + { + "epoch": 0.4449984418822063, + "grad_norm": 0.37796318531036377, + "learning_rate": 5.8596696117967054e-05, + "loss": 1.6242, + "step": 4284 + }, + { + "epoch": 0.44510231640178666, + "grad_norm": 0.3686732351779938, + "learning_rate": 5.8580622067606083e-05, + "loss": 1.5881, + "step": 4285 + }, + { + "epoch": 0.445206190921367, + "grad_norm": 0.4148024320602417, + "learning_rate": 5.8564547103475795e-05, + "loss": 1.815, + "step": 4286 + }, + { + "epoch": 0.4453100654409473, + "grad_norm": 0.39075860381126404, + "learning_rate": 5.854847122728805e-05, + "loss": 1.6837, + "step": 4287 + }, + { + "epoch": 0.4454139399605277, + "grad_norm": 0.40239056944847107, + "learning_rate": 5.853239444075479e-05, + "loss": 1.7351, + "step": 4288 + }, + { + "epoch": 0.445517814480108, + "grad_norm": 0.4210759401321411, + "learning_rate": 5.8516316745588085e-05, + "loss": 1.6621, + "step": 4289 + }, + { + "epoch": 0.4456216889996884, + "grad_norm": 0.39281001687049866, + "learning_rate": 5.850023814350006e-05, + "loss": 1.7465, + "step": 4290 + }, + { + "epoch": 0.44572556351926873, + "grad_norm": 0.4055309295654297, + "learning_rate": 5.848415863620299e-05, + "loss": 1.6812, + "step": 4291 + }, + { + "epoch": 0.44582943803884906, + "grad_norm": 0.3883981704711914, + "learning_rate": 5.846807822540922e-05, + "loss": 1.6243, + "step": 4292 + }, + { + "epoch": 0.44593331255842944, + "grad_norm": 0.394539475440979, + "learning_rate": 5.8451996912831156e-05, + "loss": 1.7954, + "step": 4293 + }, + { + "epoch": 0.44603718707800977, + "grad_norm": 0.40347257256507874, + "learning_rate": 5.8435914700181347e-05, + "loss": 1.6049, + "step": 4294 + }, + { + "epoch": 0.4461410615975901, + "grad_norm": 0.47316792607307434, + "learning_rate": 5.8419831589172426e-05, + "loss": 1.9193, + "step": 4295 + }, + { + "epoch": 0.4462449361171705, + "grad_norm": 0.427937775850296, + "learning_rate": 5.840374758151711e-05, + "loss": 1.6742, + "step": 4296 + }, + { + "epoch": 0.4463488106367508, + "grad_norm": 0.44503235816955566, + "learning_rate": 5.838766267892825e-05, + "loss": 1.9509, + "step": 4297 + }, + { + "epoch": 0.44645268515633113, + "grad_norm": 0.4180712103843689, + "learning_rate": 5.837157688311873e-05, + "loss": 1.8231, + "step": 4298 + }, + { + "epoch": 0.4465565596759115, + "grad_norm": 0.4312383532524109, + "learning_rate": 5.8355490195801566e-05, + "loss": 1.7526, + "step": 4299 + }, + { + "epoch": 0.44666043419549184, + "grad_norm": 0.38129255175590515, + "learning_rate": 5.8339402618689885e-05, + "loss": 1.749, + "step": 4300 + }, + { + "epoch": 0.44676430871507217, + "grad_norm": 0.47581759095191956, + "learning_rate": 5.832331415349687e-05, + "loss": 2.0137, + "step": 4301 + }, + { + "epoch": 0.44686818323465255, + "grad_norm": 0.3956213593482971, + "learning_rate": 5.83072248019358e-05, + "loss": 1.7437, + "step": 4302 + }, + { + "epoch": 0.4469720577542329, + "grad_norm": 0.42220327258110046, + "learning_rate": 5.829113456572012e-05, + "loss": 1.9, + "step": 4303 + }, + { + "epoch": 0.44707593227381326, + "grad_norm": 0.39744284749031067, + "learning_rate": 5.827504344656326e-05, + "loss": 1.5549, + "step": 4304 + }, + { + "epoch": 0.4471798067933936, + "grad_norm": 0.3954363763332367, + "learning_rate": 5.825895144617883e-05, + "loss": 1.7203, + "step": 4305 + }, + { + "epoch": 0.4472836813129739, + "grad_norm": 0.39236709475517273, + "learning_rate": 5.82428585662805e-05, + "loss": 1.7229, + "step": 4306 + }, + { + "epoch": 0.4473875558325543, + "grad_norm": 0.3697808086872101, + "learning_rate": 5.822676480858201e-05, + "loss": 1.6764, + "step": 4307 + }, + { + "epoch": 0.4474914303521346, + "grad_norm": 0.45007967948913574, + "learning_rate": 5.821067017479727e-05, + "loss": 1.8381, + "step": 4308 + }, + { + "epoch": 0.44759530487171495, + "grad_norm": 0.49184536933898926, + "learning_rate": 5.819457466664018e-05, + "loss": 2.1298, + "step": 4309 + }, + { + "epoch": 0.44769917939129533, + "grad_norm": 0.3896716833114624, + "learning_rate": 5.81784782858248e-05, + "loss": 1.7621, + "step": 4310 + }, + { + "epoch": 0.44780305391087566, + "grad_norm": 0.39417698979377747, + "learning_rate": 5.816238103406529e-05, + "loss": 1.7275, + "step": 4311 + }, + { + "epoch": 0.447906928430456, + "grad_norm": 0.45076748728752136, + "learning_rate": 5.814628291307587e-05, + "loss": 1.7021, + "step": 4312 + }, + { + "epoch": 0.44801080295003637, + "grad_norm": 0.43015792965888977, + "learning_rate": 5.813018392457086e-05, + "loss": 1.8654, + "step": 4313 + }, + { + "epoch": 0.4481146774696167, + "grad_norm": 0.3831145167350769, + "learning_rate": 5.8114084070264686e-05, + "loss": 1.7, + "step": 4314 + }, + { + "epoch": 0.448218551989197, + "grad_norm": 0.3929171860218048, + "learning_rate": 5.8097983351871844e-05, + "loss": 1.611, + "step": 4315 + }, + { + "epoch": 0.4483224265087774, + "grad_norm": 0.41042855381965637, + "learning_rate": 5.808188177110694e-05, + "loss": 2.0493, + "step": 4316 + }, + { + "epoch": 0.44842630102835773, + "grad_norm": 0.4465937316417694, + "learning_rate": 5.806577932968467e-05, + "loss": 1.8463, + "step": 4317 + }, + { + "epoch": 0.4485301755479381, + "grad_norm": 0.4185175597667694, + "learning_rate": 5.804967602931981e-05, + "loss": 1.649, + "step": 4318 + }, + { + "epoch": 0.44863405006751844, + "grad_norm": 0.413787841796875, + "learning_rate": 5.803357187172726e-05, + "loss": 1.8159, + "step": 4319 + }, + { + "epoch": 0.44873792458709877, + "grad_norm": 0.4084557890892029, + "learning_rate": 5.801746685862197e-05, + "loss": 1.6689, + "step": 4320 + }, + { + "epoch": 0.44884179910667915, + "grad_norm": 0.4004693329334259, + "learning_rate": 5.8001360991718976e-05, + "loss": 1.5459, + "step": 4321 + }, + { + "epoch": 0.4489456736262595, + "grad_norm": 0.40109360218048096, + "learning_rate": 5.798525427273347e-05, + "loss": 1.6249, + "step": 4322 + }, + { + "epoch": 0.4490495481458398, + "grad_norm": 0.4452827274799347, + "learning_rate": 5.796914670338067e-05, + "loss": 1.7092, + "step": 4323 + }, + { + "epoch": 0.4491534226654202, + "grad_norm": 0.4333063066005707, + "learning_rate": 5.795303828537589e-05, + "loss": 1.8151, + "step": 4324 + }, + { + "epoch": 0.4492572971850005, + "grad_norm": 0.4180525243282318, + "learning_rate": 5.7936929020434585e-05, + "loss": 1.7309, + "step": 4325 + }, + { + "epoch": 0.44936117170458084, + "grad_norm": 0.40769898891448975, + "learning_rate": 5.792081891027224e-05, + "loss": 1.7786, + "step": 4326 + }, + { + "epoch": 0.4494650462241612, + "grad_norm": 0.46367326378822327, + "learning_rate": 5.790470795660447e-05, + "loss": 1.9491, + "step": 4327 + }, + { + "epoch": 0.44956892074374155, + "grad_norm": 0.40147140622138977, + "learning_rate": 5.788859616114697e-05, + "loss": 1.7364, + "step": 4328 + }, + { + "epoch": 0.44967279526332193, + "grad_norm": 0.4153495132923126, + "learning_rate": 5.787248352561549e-05, + "loss": 1.7383, + "step": 4329 + }, + { + "epoch": 0.44977666978290226, + "grad_norm": 0.40633097290992737, + "learning_rate": 5.785637005172595e-05, + "loss": 1.4243, + "step": 4330 + }, + { + "epoch": 0.4498805443024826, + "grad_norm": 0.3852026164531708, + "learning_rate": 5.784025574119427e-05, + "loss": 1.6868, + "step": 4331 + }, + { + "epoch": 0.44998441882206297, + "grad_norm": 0.3848346769809723, + "learning_rate": 5.78241405957365e-05, + "loss": 1.5136, + "step": 4332 + }, + { + "epoch": 0.4500882933416433, + "grad_norm": 0.3910854160785675, + "learning_rate": 5.78080246170688e-05, + "loss": 1.6864, + "step": 4333 + }, + { + "epoch": 0.4501921678612236, + "grad_norm": 0.3685462176799774, + "learning_rate": 5.7791907806907395e-05, + "loss": 1.6707, + "step": 4334 + }, + { + "epoch": 0.450296042380804, + "grad_norm": 0.4090143144130707, + "learning_rate": 5.777579016696856e-05, + "loss": 1.5942, + "step": 4335 + }, + { + "epoch": 0.45039991690038433, + "grad_norm": 0.377214252948761, + "learning_rate": 5.7759671698968745e-05, + "loss": 1.5599, + "step": 4336 + }, + { + "epoch": 0.45050379141996466, + "grad_norm": 0.41875123977661133, + "learning_rate": 5.774355240462441e-05, + "loss": 1.7982, + "step": 4337 + }, + { + "epoch": 0.45060766593954504, + "grad_norm": 0.3935871422290802, + "learning_rate": 5.772743228565215e-05, + "loss": 1.7028, + "step": 4338 + }, + { + "epoch": 0.45071154045912537, + "grad_norm": 0.406841903924942, + "learning_rate": 5.771131134376863e-05, + "loss": 1.7354, + "step": 4339 + }, + { + "epoch": 0.4508154149787057, + "grad_norm": 0.37955769896507263, + "learning_rate": 5.76951895806906e-05, + "loss": 1.647, + "step": 4340 + }, + { + "epoch": 0.4509192894982861, + "grad_norm": 0.42771968245506287, + "learning_rate": 5.76790669981349e-05, + "loss": 1.825, + "step": 4341 + }, + { + "epoch": 0.4510231640178664, + "grad_norm": 0.3990226686000824, + "learning_rate": 5.766294359781848e-05, + "loss": 1.611, + "step": 4342 + }, + { + "epoch": 0.4511270385374468, + "grad_norm": 0.4627864360809326, + "learning_rate": 5.764681938145832e-05, + "loss": 1.7403, + "step": 4343 + }, + { + "epoch": 0.4512309130570271, + "grad_norm": 0.3694743812084198, + "learning_rate": 5.763069435077155e-05, + "loss": 1.5556, + "step": 4344 + }, + { + "epoch": 0.45133478757660744, + "grad_norm": 0.37064674496650696, + "learning_rate": 5.761456850747536e-05, + "loss": 1.6155, + "step": 4345 + }, + { + "epoch": 0.4514386620961878, + "grad_norm": 0.37530389428138733, + "learning_rate": 5.759844185328701e-05, + "loss": 1.462, + "step": 4346 + }, + { + "epoch": 0.45154253661576815, + "grad_norm": 0.4151467978954315, + "learning_rate": 5.7582314389923876e-05, + "loss": 1.6177, + "step": 4347 + }, + { + "epoch": 0.4516464111353485, + "grad_norm": 0.4127391278743744, + "learning_rate": 5.756618611910343e-05, + "loss": 1.782, + "step": 4348 + }, + { + "epoch": 0.45175028565492886, + "grad_norm": 0.41236239671707153, + "learning_rate": 5.7550057042543137e-05, + "loss": 1.7076, + "step": 4349 + }, + { + "epoch": 0.4518541601745092, + "grad_norm": 0.3952932357788086, + "learning_rate": 5.753392716196069e-05, + "loss": 1.6074, + "step": 4350 + }, + { + "epoch": 0.4519580346940895, + "grad_norm": 0.5258936285972595, + "learning_rate": 5.751779647907376e-05, + "loss": 1.8871, + "step": 4351 + }, + { + "epoch": 0.4520619092136699, + "grad_norm": 0.3987799882888794, + "learning_rate": 5.7501664995600134e-05, + "loss": 1.7228, + "step": 4352 + }, + { + "epoch": 0.4521657837332502, + "grad_norm": 0.3863909840583801, + "learning_rate": 5.748553271325772e-05, + "loss": 1.5237, + "step": 4353 + }, + { + "epoch": 0.45226965825283055, + "grad_norm": 0.40286675095558167, + "learning_rate": 5.746939963376445e-05, + "loss": 1.6773, + "step": 4354 + }, + { + "epoch": 0.45237353277241094, + "grad_norm": 0.4835149645805359, + "learning_rate": 5.745326575883839e-05, + "loss": 1.7949, + "step": 4355 + }, + { + "epoch": 0.45247740729199126, + "grad_norm": 0.3882110118865967, + "learning_rate": 5.743713109019766e-05, + "loss": 1.6454, + "step": 4356 + }, + { + "epoch": 0.45258128181157165, + "grad_norm": 0.4016772508621216, + "learning_rate": 5.742099562956048e-05, + "loss": 1.6265, + "step": 4357 + }, + { + "epoch": 0.45268515633115197, + "grad_norm": 0.4201604425907135, + "learning_rate": 5.740485937864515e-05, + "loss": 1.6872, + "step": 4358 + }, + { + "epoch": 0.4527890308507323, + "grad_norm": 0.45017364621162415, + "learning_rate": 5.7388722339170066e-05, + "loss": 1.8138, + "step": 4359 + }, + { + "epoch": 0.4528929053703127, + "grad_norm": 0.39483213424682617, + "learning_rate": 5.7372584512853665e-05, + "loss": 1.5788, + "step": 4360 + }, + { + "epoch": 0.452996779889893, + "grad_norm": 0.4778917133808136, + "learning_rate": 5.7356445901414545e-05, + "loss": 1.8203, + "step": 4361 + }, + { + "epoch": 0.45310065440947334, + "grad_norm": 0.3956323564052582, + "learning_rate": 5.734030650657132e-05, + "loss": 1.6614, + "step": 4362 + }, + { + "epoch": 0.4532045289290537, + "grad_norm": 0.42811745405197144, + "learning_rate": 5.73241663300427e-05, + "loss": 1.851, + "step": 4363 + }, + { + "epoch": 0.45330840344863405, + "grad_norm": 0.39979517459869385, + "learning_rate": 5.730802537354749e-05, + "loss": 1.6255, + "step": 4364 + }, + { + "epoch": 0.45341227796821437, + "grad_norm": 0.4086802899837494, + "learning_rate": 5.729188363880459e-05, + "loss": 1.4243, + "step": 4365 + }, + { + "epoch": 0.45351615248779475, + "grad_norm": 0.43692517280578613, + "learning_rate": 5.727574112753296e-05, + "loss": 1.8354, + "step": 4366 + }, + { + "epoch": 0.4536200270073751, + "grad_norm": 0.37770721316337585, + "learning_rate": 5.725959784145165e-05, + "loss": 1.5944, + "step": 4367 + }, + { + "epoch": 0.45372390152695546, + "grad_norm": 0.4468797445297241, + "learning_rate": 5.72434537822798e-05, + "loss": 1.7066, + "step": 4368 + }, + { + "epoch": 0.4538277760465358, + "grad_norm": 0.4045311510562897, + "learning_rate": 5.722730895173662e-05, + "loss": 1.7554, + "step": 4369 + }, + { + "epoch": 0.4539316505661161, + "grad_norm": 0.40388810634613037, + "learning_rate": 5.721116335154142e-05, + "loss": 1.4915, + "step": 4370 + }, + { + "epoch": 0.4540355250856965, + "grad_norm": 0.4059275686740875, + "learning_rate": 5.719501698341356e-05, + "loss": 1.6267, + "step": 4371 + }, + { + "epoch": 0.45413939960527683, + "grad_norm": 0.3910108804702759, + "learning_rate": 5.7178869849072526e-05, + "loss": 1.6264, + "step": 4372 + }, + { + "epoch": 0.45424327412485715, + "grad_norm": 0.445516437292099, + "learning_rate": 5.716272195023785e-05, + "loss": 1.7784, + "step": 4373 + }, + { + "epoch": 0.45434714864443754, + "grad_norm": 0.4089578688144684, + "learning_rate": 5.7146573288629145e-05, + "loss": 1.7885, + "step": 4374 + }, + { + "epoch": 0.45445102316401786, + "grad_norm": 0.4145938754081726, + "learning_rate": 5.713042386596614e-05, + "loss": 1.6938, + "step": 4375 + }, + { + "epoch": 0.4545548976835982, + "grad_norm": 0.38517993688583374, + "learning_rate": 5.7114273683968625e-05, + "loss": 1.5404, + "step": 4376 + }, + { + "epoch": 0.4546587722031786, + "grad_norm": 0.40502604842185974, + "learning_rate": 5.709812274435643e-05, + "loss": 1.6948, + "step": 4377 + }, + { + "epoch": 0.4547626467227589, + "grad_norm": 0.3674617409706116, + "learning_rate": 5.708197104884955e-05, + "loss": 1.6687, + "step": 4378 + }, + { + "epoch": 0.45486652124233923, + "grad_norm": 0.38975459337234497, + "learning_rate": 5.706581859916799e-05, + "loss": 1.5973, + "step": 4379 + }, + { + "epoch": 0.4549703957619196, + "grad_norm": 0.3994758725166321, + "learning_rate": 5.704966539703185e-05, + "loss": 1.6266, + "step": 4380 + }, + { + "epoch": 0.45507427028149994, + "grad_norm": 0.39510855078697205, + "learning_rate": 5.7033511444161355e-05, + "loss": 1.7119, + "step": 4381 + }, + { + "epoch": 0.4551781448010803, + "grad_norm": 0.4508286416530609, + "learning_rate": 5.701735674227675e-05, + "loss": 1.9857, + "step": 4382 + }, + { + "epoch": 0.45528201932066065, + "grad_norm": 0.4357827603816986, + "learning_rate": 5.7001201293098386e-05, + "loss": 1.9188, + "step": 4383 + }, + { + "epoch": 0.455385893840241, + "grad_norm": 0.4613460898399353, + "learning_rate": 5.69850450983467e-05, + "loss": 1.7152, + "step": 4384 + }, + { + "epoch": 0.45548976835982136, + "grad_norm": 0.43691280484199524, + "learning_rate": 5.6968888159742184e-05, + "loss": 1.7644, + "step": 4385 + }, + { + "epoch": 0.4555936428794017, + "grad_norm": 0.40065327286720276, + "learning_rate": 5.695273047900544e-05, + "loss": 1.6811, + "step": 4386 + }, + { + "epoch": 0.455697517398982, + "grad_norm": 0.4466818869113922, + "learning_rate": 5.693657205785715e-05, + "loss": 1.8516, + "step": 4387 + }, + { + "epoch": 0.4558013919185624, + "grad_norm": 0.4043349623680115, + "learning_rate": 5.6920412898018026e-05, + "loss": 1.8134, + "step": 4388 + }, + { + "epoch": 0.4559052664381427, + "grad_norm": 0.42350390553474426, + "learning_rate": 5.690425300120893e-05, + "loss": 1.8296, + "step": 4389 + }, + { + "epoch": 0.45600914095772305, + "grad_norm": 0.38826295733451843, + "learning_rate": 5.6888092369150734e-05, + "loss": 1.7926, + "step": 4390 + }, + { + "epoch": 0.45611301547730343, + "grad_norm": 0.39671480655670166, + "learning_rate": 5.6871931003564414e-05, + "loss": 1.8088, + "step": 4391 + }, + { + "epoch": 0.45621688999688376, + "grad_norm": 0.38095131516456604, + "learning_rate": 5.685576890617107e-05, + "loss": 1.5183, + "step": 4392 + }, + { + "epoch": 0.45632076451646414, + "grad_norm": 0.37252774834632874, + "learning_rate": 5.683960607869179e-05, + "loss": 1.6363, + "step": 4393 + }, + { + "epoch": 0.45642463903604447, + "grad_norm": 0.43346065282821655, + "learning_rate": 5.6823442522847835e-05, + "loss": 1.5937, + "step": 4394 + }, + { + "epoch": 0.4565285135556248, + "grad_norm": 0.3876819610595703, + "learning_rate": 5.680727824036046e-05, + "loss": 1.723, + "step": 4395 + }, + { + "epoch": 0.4566323880752052, + "grad_norm": 0.3917519152164459, + "learning_rate": 5.6791113232951063e-05, + "loss": 1.5769, + "step": 4396 + }, + { + "epoch": 0.4567362625947855, + "grad_norm": 0.4223952889442444, + "learning_rate": 5.677494750234108e-05, + "loss": 1.7898, + "step": 4397 + }, + { + "epoch": 0.45684013711436583, + "grad_norm": 0.39051714539527893, + "learning_rate": 5.675878105025203e-05, + "loss": 1.7079, + "step": 4398 + }, + { + "epoch": 0.4569440116339462, + "grad_norm": 0.3843806982040405, + "learning_rate": 5.674261387840551e-05, + "loss": 1.5216, + "step": 4399 + }, + { + "epoch": 0.45704788615352654, + "grad_norm": 0.4141395092010498, + "learning_rate": 5.6726445988523224e-05, + "loss": 1.6415, + "step": 4400 + }, + { + "epoch": 0.45715176067310687, + "grad_norm": 0.4137116074562073, + "learning_rate": 5.67102773823269e-05, + "loss": 1.7626, + "step": 4401 + }, + { + "epoch": 0.45725563519268725, + "grad_norm": 0.3695942461490631, + "learning_rate": 5.669410806153838e-05, + "loss": 1.6487, + "step": 4402 + }, + { + "epoch": 0.4573595097122676, + "grad_norm": 0.3959349989891052, + "learning_rate": 5.667793802787957e-05, + "loss": 1.7431, + "step": 4403 + }, + { + "epoch": 0.4574633842318479, + "grad_norm": 0.39805370569229126, + "learning_rate": 5.6661767283072444e-05, + "loss": 1.6595, + "step": 4404 + }, + { + "epoch": 0.4575672587514283, + "grad_norm": 0.38343602418899536, + "learning_rate": 5.664559582883906e-05, + "loss": 1.8235, + "step": 4405 + }, + { + "epoch": 0.4576711332710086, + "grad_norm": 0.4133247435092926, + "learning_rate": 5.662942366690157e-05, + "loss": 1.7427, + "step": 4406 + }, + { + "epoch": 0.457775007790589, + "grad_norm": 0.44640305638313293, + "learning_rate": 5.6613250798982156e-05, + "loss": 1.7745, + "step": 4407 + }, + { + "epoch": 0.4578788823101693, + "grad_norm": 0.4178631603717804, + "learning_rate": 5.659707722680313e-05, + "loss": 1.7777, + "step": 4408 + }, + { + "epoch": 0.45798275682974965, + "grad_norm": 0.44724735617637634, + "learning_rate": 5.6580902952086836e-05, + "loss": 1.5845, + "step": 4409 + }, + { + "epoch": 0.45808663134933003, + "grad_norm": 0.39849719405174255, + "learning_rate": 5.656472797655571e-05, + "loss": 1.5669, + "step": 4410 + }, + { + "epoch": 0.45819050586891036, + "grad_norm": 0.3746553957462311, + "learning_rate": 5.6548552301932265e-05, + "loss": 1.5851, + "step": 4411 + }, + { + "epoch": 0.4582943803884907, + "grad_norm": 0.4282001852989197, + "learning_rate": 5.6532375929939075e-05, + "loss": 1.8875, + "step": 4412 + }, + { + "epoch": 0.45839825490807107, + "grad_norm": 0.3972803056240082, + "learning_rate": 5.65161988622988e-05, + "loss": 1.6262, + "step": 4413 + }, + { + "epoch": 0.4585021294276514, + "grad_norm": 0.38488489389419556, + "learning_rate": 5.650002110073418e-05, + "loss": 1.5766, + "step": 4414 + }, + { + "epoch": 0.4586060039472317, + "grad_norm": 0.45945605635643005, + "learning_rate": 5.648384264696802e-05, + "loss": 1.5854, + "step": 4415 + }, + { + "epoch": 0.4587098784668121, + "grad_norm": 0.3760433793067932, + "learning_rate": 5.6467663502723175e-05, + "loss": 1.6934, + "step": 4416 + }, + { + "epoch": 0.45881375298639243, + "grad_norm": 0.38754063844680786, + "learning_rate": 5.645148366972264e-05, + "loss": 1.7554, + "step": 4417 + }, + { + "epoch": 0.45891762750597276, + "grad_norm": 0.388735830783844, + "learning_rate": 5.643530314968941e-05, + "loss": 1.4804, + "step": 4418 + }, + { + "epoch": 0.45902150202555314, + "grad_norm": 0.39035165309906006, + "learning_rate": 5.6419121944346585e-05, + "loss": 1.6639, + "step": 4419 + }, + { + "epoch": 0.45912537654513347, + "grad_norm": 0.3769480586051941, + "learning_rate": 5.640294005541735e-05, + "loss": 1.7626, + "step": 4420 + }, + { + "epoch": 0.45922925106471385, + "grad_norm": 0.41671907901763916, + "learning_rate": 5.638675748462493e-05, + "loss": 1.7363, + "step": 4421 + }, + { + "epoch": 0.4593331255842942, + "grad_norm": 0.4125288426876068, + "learning_rate": 5.637057423369268e-05, + "loss": 1.725, + "step": 4422 + }, + { + "epoch": 0.4594370001038745, + "grad_norm": 0.44852516055107117, + "learning_rate": 5.635439030434395e-05, + "loss": 1.8253, + "step": 4423 + }, + { + "epoch": 0.4595408746234549, + "grad_norm": 0.4015621542930603, + "learning_rate": 5.6338205698302224e-05, + "loss": 1.6797, + "step": 4424 + }, + { + "epoch": 0.4596447491430352, + "grad_norm": 0.40786075592041016, + "learning_rate": 5.6322020417291034e-05, + "loss": 1.7569, + "step": 4425 + }, + { + "epoch": 0.45974862366261554, + "grad_norm": 0.4462954103946686, + "learning_rate": 5.630583446303399e-05, + "loss": 1.8999, + "step": 4426 + }, + { + "epoch": 0.4598524981821959, + "grad_norm": 0.40880078077316284, + "learning_rate": 5.6289647837254744e-05, + "loss": 1.5374, + "step": 4427 + }, + { + "epoch": 0.45995637270177625, + "grad_norm": 0.4236305356025696, + "learning_rate": 5.627346054167707e-05, + "loss": 1.7143, + "step": 4428 + }, + { + "epoch": 0.4600602472213566, + "grad_norm": 0.3998420834541321, + "learning_rate": 5.625727257802479e-05, + "loss": 1.8177, + "step": 4429 + }, + { + "epoch": 0.46016412174093696, + "grad_norm": 0.4473719000816345, + "learning_rate": 5.624108394802178e-05, + "loss": 1.8531, + "step": 4430 + }, + { + "epoch": 0.4602679962605173, + "grad_norm": 0.39816030859947205, + "learning_rate": 5.6224894653392035e-05, + "loss": 1.6969, + "step": 4431 + }, + { + "epoch": 0.46037187078009767, + "grad_norm": 0.36577850580215454, + "learning_rate": 5.6208704695859535e-05, + "loss": 1.5994, + "step": 4432 + }, + { + "epoch": 0.460475745299678, + "grad_norm": 0.3840119540691376, + "learning_rate": 5.619251407714843e-05, + "loss": 1.6662, + "step": 4433 + }, + { + "epoch": 0.4605796198192583, + "grad_norm": 0.4154397249221802, + "learning_rate": 5.617632279898288e-05, + "loss": 1.7509, + "step": 4434 + }, + { + "epoch": 0.4606834943388387, + "grad_norm": 0.3793880045413971, + "learning_rate": 5.6160130863087115e-05, + "loss": 1.6896, + "step": 4435 + }, + { + "epoch": 0.46078736885841903, + "grad_norm": 0.3721482455730438, + "learning_rate": 5.614393827118548e-05, + "loss": 1.6231, + "step": 4436 + }, + { + "epoch": 0.46089124337799936, + "grad_norm": 0.37379753589630127, + "learning_rate": 5.6127745025002344e-05, + "loss": 1.6131, + "step": 4437 + }, + { + "epoch": 0.46099511789757974, + "grad_norm": 0.3776273727416992, + "learning_rate": 5.611155112626215e-05, + "loss": 1.7206, + "step": 4438 + }, + { + "epoch": 0.46109899241716007, + "grad_norm": 0.37556153535842896, + "learning_rate": 5.609535657668945e-05, + "loss": 1.6558, + "step": 4439 + }, + { + "epoch": 0.4612028669367404, + "grad_norm": 0.41758623719215393, + "learning_rate": 5.6079161378008805e-05, + "loss": 1.7243, + "step": 4440 + }, + { + "epoch": 0.4613067414563208, + "grad_norm": 0.39208123087882996, + "learning_rate": 5.606296553194489e-05, + "loss": 1.7576, + "step": 4441 + }, + { + "epoch": 0.4614106159759011, + "grad_norm": 0.3936588764190674, + "learning_rate": 5.6046769040222446e-05, + "loss": 1.5866, + "step": 4442 + }, + { + "epoch": 0.46151449049548143, + "grad_norm": 0.3708806335926056, + "learning_rate": 5.603057190456628e-05, + "loss": 1.6697, + "step": 4443 + }, + { + "epoch": 0.4616183650150618, + "grad_norm": 0.4200730621814728, + "learning_rate": 5.601437412670121e-05, + "loss": 1.4743, + "step": 4444 + }, + { + "epoch": 0.46172223953464214, + "grad_norm": 0.40892651677131653, + "learning_rate": 5.5998175708352255e-05, + "loss": 1.7079, + "step": 4445 + }, + { + "epoch": 0.4618261140542225, + "grad_norm": 0.41610532999038696, + "learning_rate": 5.598197665124434e-05, + "loss": 1.6898, + "step": 4446 + }, + { + "epoch": 0.46192998857380285, + "grad_norm": 0.39250999689102173, + "learning_rate": 5.596577695710258e-05, + "loss": 1.686, + "step": 4447 + }, + { + "epoch": 0.4620338630933832, + "grad_norm": 0.3861941695213318, + "learning_rate": 5.594957662765211e-05, + "loss": 1.6314, + "step": 4448 + }, + { + "epoch": 0.46213773761296356, + "grad_norm": 0.4138748049736023, + "learning_rate": 5.5933375664618134e-05, + "loss": 1.6829, + "step": 4449 + }, + { + "epoch": 0.4622416121325439, + "grad_norm": 0.45050138235092163, + "learning_rate": 5.591717406972594e-05, + "loss": 1.9202, + "step": 4450 + }, + { + "epoch": 0.4623454866521242, + "grad_norm": 0.40642380714416504, + "learning_rate": 5.590097184470087e-05, + "loss": 1.7383, + "step": 4451 + }, + { + "epoch": 0.4624493611717046, + "grad_norm": 0.4135674238204956, + "learning_rate": 5.588476899126832e-05, + "loss": 1.7811, + "step": 4452 + }, + { + "epoch": 0.4625532356912849, + "grad_norm": 0.4031025767326355, + "learning_rate": 5.586856551115378e-05, + "loss": 1.6378, + "step": 4453 + }, + { + "epoch": 0.46265711021086525, + "grad_norm": 0.41000255942344666, + "learning_rate": 5.585236140608279e-05, + "loss": 1.7076, + "step": 4454 + }, + { + "epoch": 0.46276098473044563, + "grad_norm": 0.41616103053092957, + "learning_rate": 5.583615667778094e-05, + "loss": 1.7697, + "step": 4455 + }, + { + "epoch": 0.46286485925002596, + "grad_norm": 0.4099687337875366, + "learning_rate": 5.581995132797394e-05, + "loss": 1.5478, + "step": 4456 + }, + { + "epoch": 0.4629687337696063, + "grad_norm": 0.41153401136398315, + "learning_rate": 5.5803745358387526e-05, + "loss": 1.7247, + "step": 4457 + }, + { + "epoch": 0.46307260828918667, + "grad_norm": 0.41319242119789124, + "learning_rate": 5.578753877074749e-05, + "loss": 1.768, + "step": 4458 + }, + { + "epoch": 0.463176482808767, + "grad_norm": 0.3985736072063446, + "learning_rate": 5.577133156677975e-05, + "loss": 1.7182, + "step": 4459 + }, + { + "epoch": 0.4632803573283474, + "grad_norm": 0.4140629470348358, + "learning_rate": 5.575512374821018e-05, + "loss": 1.8411, + "step": 4460 + }, + { + "epoch": 0.4633842318479277, + "grad_norm": 0.40386584401130676, + "learning_rate": 5.573891531676484e-05, + "loss": 1.6013, + "step": 4461 + }, + { + "epoch": 0.46348810636750803, + "grad_norm": 0.40092045068740845, + "learning_rate": 5.572270627416979e-05, + "loss": 1.8274, + "step": 4462 + }, + { + "epoch": 0.4635919808870884, + "grad_norm": 0.4397892355918884, + "learning_rate": 5.5706496622151136e-05, + "loss": 1.8865, + "step": 4463 + }, + { + "epoch": 0.46369585540666874, + "grad_norm": 0.41136595606803894, + "learning_rate": 5.569028636243513e-05, + "loss": 1.6921, + "step": 4464 + }, + { + "epoch": 0.46379972992624907, + "grad_norm": 0.3688594698905945, + "learning_rate": 5.5674075496748e-05, + "loss": 1.6099, + "step": 4465 + }, + { + "epoch": 0.46390360444582945, + "grad_norm": 0.41220369935035706, + "learning_rate": 5.56578640268161e-05, + "loss": 1.7032, + "step": 4466 + }, + { + "epoch": 0.4640074789654098, + "grad_norm": 0.38784199953079224, + "learning_rate": 5.5641651954365803e-05, + "loss": 1.7474, + "step": 4467 + }, + { + "epoch": 0.4641113534849901, + "grad_norm": 0.3877417743206024, + "learning_rate": 5.562543928112358e-05, + "loss": 1.7868, + "step": 4468 + }, + { + "epoch": 0.4642152280045705, + "grad_norm": 0.41094866394996643, + "learning_rate": 5.560922600881595e-05, + "loss": 1.684, + "step": 4469 + }, + { + "epoch": 0.4643191025241508, + "grad_norm": 0.3839239180088043, + "learning_rate": 5.559301213916952e-05, + "loss": 1.601, + "step": 4470 + }, + { + "epoch": 0.4644229770437312, + "grad_norm": 0.3839179277420044, + "learning_rate": 5.557679767391091e-05, + "loss": 1.773, + "step": 4471 + }, + { + "epoch": 0.4645268515633115, + "grad_norm": 0.3948630392551422, + "learning_rate": 5.5560582614766845e-05, + "loss": 1.7059, + "step": 4472 + }, + { + "epoch": 0.46463072608289185, + "grad_norm": 0.4155755639076233, + "learning_rate": 5.5544366963464134e-05, + "loss": 1.7463, + "step": 4473 + }, + { + "epoch": 0.46473460060247224, + "grad_norm": 0.3817596733570099, + "learning_rate": 5.552815072172955e-05, + "loss": 1.6543, + "step": 4474 + }, + { + "epoch": 0.46483847512205256, + "grad_norm": 0.40604111552238464, + "learning_rate": 5.5511933891290056e-05, + "loss": 1.6462, + "step": 4475 + }, + { + "epoch": 0.4649423496416329, + "grad_norm": 0.39084017276763916, + "learning_rate": 5.5495716473872604e-05, + "loss": 1.656, + "step": 4476 + }, + { + "epoch": 0.4650462241612133, + "grad_norm": 0.4190458059310913, + "learning_rate": 5.5479498471204196e-05, + "loss": 1.7544, + "step": 4477 + }, + { + "epoch": 0.4651500986807936, + "grad_norm": 0.37424778938293457, + "learning_rate": 5.546327988501196e-05, + "loss": 1.5076, + "step": 4478 + }, + { + "epoch": 0.4652539732003739, + "grad_norm": 0.4260268807411194, + "learning_rate": 5.544706071702302e-05, + "loss": 1.638, + "step": 4479 + }, + { + "epoch": 0.4653578477199543, + "grad_norm": 0.3845141530036926, + "learning_rate": 5.5430840968964615e-05, + "loss": 1.5985, + "step": 4480 + }, + { + "epoch": 0.46546172223953464, + "grad_norm": 0.385220468044281, + "learning_rate": 5.541462064256401e-05, + "loss": 1.7247, + "step": 4481 + }, + { + "epoch": 0.46556559675911496, + "grad_norm": 0.41517341136932373, + "learning_rate": 5.5398399739548524e-05, + "loss": 1.8184, + "step": 4482 + }, + { + "epoch": 0.46566947127869535, + "grad_norm": 0.4744667112827301, + "learning_rate": 5.538217826164558e-05, + "loss": 1.8839, + "step": 4483 + }, + { + "epoch": 0.4657733457982757, + "grad_norm": 0.40122199058532715, + "learning_rate": 5.5365956210582634e-05, + "loss": 1.5911, + "step": 4484 + }, + { + "epoch": 0.46587722031785606, + "grad_norm": 0.4111073911190033, + "learning_rate": 5.5349733588087216e-05, + "loss": 1.7894, + "step": 4485 + }, + { + "epoch": 0.4659810948374364, + "grad_norm": 0.38711994886398315, + "learning_rate": 5.533351039588689e-05, + "loss": 1.7112, + "step": 4486 + }, + { + "epoch": 0.4660849693570167, + "grad_norm": 0.4004124104976654, + "learning_rate": 5.531728663570933e-05, + "loss": 1.7559, + "step": 4487 + }, + { + "epoch": 0.4661888438765971, + "grad_norm": 0.3998907804489136, + "learning_rate": 5.53010623092822e-05, + "loss": 1.6631, + "step": 4488 + }, + { + "epoch": 0.4662927183961774, + "grad_norm": 0.3848699927330017, + "learning_rate": 5.5284837418333294e-05, + "loss": 1.736, + "step": 4489 + }, + { + "epoch": 0.46639659291575775, + "grad_norm": 0.41114541888237, + "learning_rate": 5.5268611964590425e-05, + "loss": 1.8165, + "step": 4490 + }, + { + "epoch": 0.46650046743533813, + "grad_norm": 0.475700706243515, + "learning_rate": 5.525238594978146e-05, + "loss": 1.8559, + "step": 4491 + }, + { + "epoch": 0.46660434195491846, + "grad_norm": 0.37533336877822876, + "learning_rate": 5.523615937563438e-05, + "loss": 1.5622, + "step": 4492 + }, + { + "epoch": 0.4667082164744988, + "grad_norm": 0.38787853717803955, + "learning_rate": 5.521993224387717e-05, + "loss": 1.6647, + "step": 4493 + }, + { + "epoch": 0.46681209099407917, + "grad_norm": 0.4438975155353546, + "learning_rate": 5.520370455623789e-05, + "loss": 1.7663, + "step": 4494 + }, + { + "epoch": 0.4669159655136595, + "grad_norm": 0.4348212480545044, + "learning_rate": 5.518747631444467e-05, + "loss": 1.8218, + "step": 4495 + }, + { + "epoch": 0.4670198400332398, + "grad_norm": 0.4120256304740906, + "learning_rate": 5.517124752022569e-05, + "loss": 1.6427, + "step": 4496 + }, + { + "epoch": 0.4671237145528202, + "grad_norm": 0.3732791244983673, + "learning_rate": 5.515501817530917e-05, + "loss": 1.5317, + "step": 4497 + }, + { + "epoch": 0.46722758907240053, + "grad_norm": 0.40980592370033264, + "learning_rate": 5.513878828142344e-05, + "loss": 1.736, + "step": 4498 + }, + { + "epoch": 0.4673314635919809, + "grad_norm": 0.42405858635902405, + "learning_rate": 5.5122557840296815e-05, + "loss": 1.7212, + "step": 4499 + }, + { + "epoch": 0.46743533811156124, + "grad_norm": 0.380643367767334, + "learning_rate": 5.5106326853657764e-05, + "loss": 1.6732, + "step": 4500 + }, + { + "epoch": 0.46753921263114157, + "grad_norm": 0.4484168589115143, + "learning_rate": 5.5090095323234735e-05, + "loss": 1.8326, + "step": 4501 + }, + { + "epoch": 0.46764308715072195, + "grad_norm": 0.44353044033050537, + "learning_rate": 5.507386325075625e-05, + "loss": 1.8159, + "step": 4502 + }, + { + "epoch": 0.4677469616703023, + "grad_norm": 0.40523630380630493, + "learning_rate": 5.505763063795091e-05, + "loss": 1.8045, + "step": 4503 + }, + { + "epoch": 0.4678508361898826, + "grad_norm": 0.41718611121177673, + "learning_rate": 5.504139748654735e-05, + "loss": 1.7684, + "step": 4504 + }, + { + "epoch": 0.467954710709463, + "grad_norm": 0.3799745440483093, + "learning_rate": 5.5025163798274283e-05, + "loss": 1.7299, + "step": 4505 + }, + { + "epoch": 0.4680585852290433, + "grad_norm": 0.5379766821861267, + "learning_rate": 5.500892957486048e-05, + "loss": 1.8735, + "step": 4506 + }, + { + "epoch": 0.46816245974862364, + "grad_norm": 0.42083024978637695, + "learning_rate": 5.499269481803474e-05, + "loss": 1.6187, + "step": 4507 + }, + { + "epoch": 0.468266334268204, + "grad_norm": 0.44988521933555603, + "learning_rate": 5.497645952952596e-05, + "loss": 1.8866, + "step": 4508 + }, + { + "epoch": 0.46837020878778435, + "grad_norm": 0.37519097328186035, + "learning_rate": 5.496022371106304e-05, + "loss": 1.6358, + "step": 4509 + }, + { + "epoch": 0.46847408330736473, + "grad_norm": 0.40404897928237915, + "learning_rate": 5.4943987364374985e-05, + "loss": 1.6527, + "step": 4510 + }, + { + "epoch": 0.46857795782694506, + "grad_norm": 0.3824155032634735, + "learning_rate": 5.492775049119085e-05, + "loss": 1.571, + "step": 4511 + }, + { + "epoch": 0.4686818323465254, + "grad_norm": 0.38465747237205505, + "learning_rate": 5.491151309323973e-05, + "loss": 1.8012, + "step": 4512 + }, + { + "epoch": 0.46878570686610577, + "grad_norm": 0.41244107484817505, + "learning_rate": 5.489527517225076e-05, + "loss": 1.7915, + "step": 4513 + }, + { + "epoch": 0.4688895813856861, + "grad_norm": 0.39716869592666626, + "learning_rate": 5.487903672995318e-05, + "loss": 1.6541, + "step": 4514 + }, + { + "epoch": 0.4689934559052664, + "grad_norm": 0.3766006529331207, + "learning_rate": 5.486279776807626e-05, + "loss": 1.7457, + "step": 4515 + }, + { + "epoch": 0.4690973304248468, + "grad_norm": 0.3792710602283478, + "learning_rate": 5.484655828834929e-05, + "loss": 1.5523, + "step": 4516 + }, + { + "epoch": 0.46920120494442713, + "grad_norm": 0.4360962212085724, + "learning_rate": 5.483031829250167e-05, + "loss": 1.7419, + "step": 4517 + }, + { + "epoch": 0.46930507946400746, + "grad_norm": 0.4073307514190674, + "learning_rate": 5.481407778226284e-05, + "loss": 1.7789, + "step": 4518 + }, + { + "epoch": 0.46940895398358784, + "grad_norm": 0.3982420563697815, + "learning_rate": 5.4797836759362256e-05, + "loss": 1.6478, + "step": 4519 + }, + { + "epoch": 0.46951282850316817, + "grad_norm": 0.3903031051158905, + "learning_rate": 5.478159522552949e-05, + "loss": 1.5748, + "step": 4520 + }, + { + "epoch": 0.4696167030227485, + "grad_norm": 0.4410886764526367, + "learning_rate": 5.476535318249414e-05, + "loss": 1.7942, + "step": 4521 + }, + { + "epoch": 0.4697205775423289, + "grad_norm": 0.3934234380722046, + "learning_rate": 5.4749110631985846e-05, + "loss": 1.6272, + "step": 4522 + }, + { + "epoch": 0.4698244520619092, + "grad_norm": 0.4124247431755066, + "learning_rate": 5.4732867575734314e-05, + "loss": 1.7511, + "step": 4523 + }, + { + "epoch": 0.4699283265814896, + "grad_norm": 0.4178972840309143, + "learning_rate": 5.4716624015469284e-05, + "loss": 1.7224, + "step": 4524 + }, + { + "epoch": 0.4700322011010699, + "grad_norm": 0.44093725085258484, + "learning_rate": 5.470037995292061e-05, + "loss": 1.6125, + "step": 4525 + }, + { + "epoch": 0.47013607562065024, + "grad_norm": 0.3923119008541107, + "learning_rate": 5.468413538981814e-05, + "loss": 1.6888, + "step": 4526 + }, + { + "epoch": 0.4702399501402306, + "grad_norm": 0.4040331244468689, + "learning_rate": 5.466789032789176e-05, + "loss": 1.7642, + "step": 4527 + }, + { + "epoch": 0.47034382465981095, + "grad_norm": 0.4173428416252136, + "learning_rate": 5.465164476887149e-05, + "loss": 1.6372, + "step": 4528 + }, + { + "epoch": 0.4704476991793913, + "grad_norm": 0.36699429154396057, + "learning_rate": 5.463539871448735e-05, + "loss": 1.7026, + "step": 4529 + }, + { + "epoch": 0.47055157369897166, + "grad_norm": 0.37579965591430664, + "learning_rate": 5.461915216646938e-05, + "loss": 1.513, + "step": 4530 + }, + { + "epoch": 0.470655448218552, + "grad_norm": 0.39434462785720825, + "learning_rate": 5.4602905126547744e-05, + "loss": 1.6832, + "step": 4531 + }, + { + "epoch": 0.4707593227381323, + "grad_norm": 0.40880143642425537, + "learning_rate": 5.4586657596452615e-05, + "loss": 1.5767, + "step": 4532 + }, + { + "epoch": 0.4708631972577127, + "grad_norm": 0.4027133584022522, + "learning_rate": 5.4570409577914215e-05, + "loss": 1.6025, + "step": 4533 + }, + { + "epoch": 0.470967071777293, + "grad_norm": 0.37982282042503357, + "learning_rate": 5.455416107266287e-05, + "loss": 1.6899, + "step": 4534 + }, + { + "epoch": 0.47107094629687335, + "grad_norm": 0.4183795154094696, + "learning_rate": 5.453791208242889e-05, + "loss": 1.7355, + "step": 4535 + }, + { + "epoch": 0.47117482081645373, + "grad_norm": 0.4518531560897827, + "learning_rate": 5.452166260894267e-05, + "loss": 1.9148, + "step": 4536 + }, + { + "epoch": 0.47127869533603406, + "grad_norm": 0.41706570982933044, + "learning_rate": 5.450541265393464e-05, + "loss": 1.7175, + "step": 4537 + }, + { + "epoch": 0.47138256985561444, + "grad_norm": 0.38298308849334717, + "learning_rate": 5.4489162219135314e-05, + "loss": 1.8146, + "step": 4538 + }, + { + "epoch": 0.47148644437519477, + "grad_norm": 0.40449774265289307, + "learning_rate": 5.447291130627522e-05, + "loss": 1.8353, + "step": 4539 + }, + { + "epoch": 0.4715903188947751, + "grad_norm": 0.40344521403312683, + "learning_rate": 5.4456659917084975e-05, + "loss": 1.672, + "step": 4540 + }, + { + "epoch": 0.4716941934143555, + "grad_norm": 0.3761052191257477, + "learning_rate": 5.44404080532952e-05, + "loss": 1.7119, + "step": 4541 + }, + { + "epoch": 0.4717980679339358, + "grad_norm": 0.45389243960380554, + "learning_rate": 5.442415571663661e-05, + "loss": 1.9569, + "step": 4542 + }, + { + "epoch": 0.47190194245351613, + "grad_norm": 0.3938455581665039, + "learning_rate": 5.440790290883997e-05, + "loss": 1.5407, + "step": 4543 + }, + { + "epoch": 0.4720058169730965, + "grad_norm": 0.39464959502220154, + "learning_rate": 5.439164963163601e-05, + "loss": 1.7799, + "step": 4544 + }, + { + "epoch": 0.47210969149267684, + "grad_norm": 0.3845635652542114, + "learning_rate": 5.437539588675564e-05, + "loss": 1.6902, + "step": 4545 + }, + { + "epoch": 0.47221356601225717, + "grad_norm": 0.41250425577163696, + "learning_rate": 5.435914167592975e-05, + "loss": 1.7341, + "step": 4546 + }, + { + "epoch": 0.47231744053183755, + "grad_norm": 0.42640382051467896, + "learning_rate": 5.434288700088925e-05, + "loss": 1.7448, + "step": 4547 + }, + { + "epoch": 0.4724213150514179, + "grad_norm": 0.3739955425262451, + "learning_rate": 5.432663186336517e-05, + "loss": 1.6007, + "step": 4548 + }, + { + "epoch": 0.47252518957099826, + "grad_norm": 0.3853756785392761, + "learning_rate": 5.431037626508856e-05, + "loss": 1.6348, + "step": 4549 + }, + { + "epoch": 0.4726290640905786, + "grad_norm": 0.4050140082836151, + "learning_rate": 5.429412020779049e-05, + "loss": 1.6318, + "step": 4550 + }, + { + "epoch": 0.4727329386101589, + "grad_norm": 0.39661628007888794, + "learning_rate": 5.427786369320211e-05, + "loss": 1.7079, + "step": 4551 + }, + { + "epoch": 0.4728368131297393, + "grad_norm": 0.3745506703853607, + "learning_rate": 5.4261606723054604e-05, + "loss": 1.5058, + "step": 4552 + }, + { + "epoch": 0.4729406876493196, + "grad_norm": 0.45182400941848755, + "learning_rate": 5.424534929907924e-05, + "loss": 1.7662, + "step": 4553 + }, + { + "epoch": 0.47304456216889995, + "grad_norm": 0.39401260018348694, + "learning_rate": 5.422909142300729e-05, + "loss": 1.6387, + "step": 4554 + }, + { + "epoch": 0.47314843668848033, + "grad_norm": 0.43190351128578186, + "learning_rate": 5.4212833096570085e-05, + "loss": 1.9062, + "step": 4555 + }, + { + "epoch": 0.47325231120806066, + "grad_norm": 0.3953791558742523, + "learning_rate": 5.419657432149903e-05, + "loss": 1.779, + "step": 4556 + }, + { + "epoch": 0.473356185727641, + "grad_norm": 0.4370119273662567, + "learning_rate": 5.418031509952554e-05, + "loss": 1.7205, + "step": 4557 + }, + { + "epoch": 0.47346006024722137, + "grad_norm": 0.46050959825515747, + "learning_rate": 5.4164055432381076e-05, + "loss": 1.8187, + "step": 4558 + }, + { + "epoch": 0.4735639347668017, + "grad_norm": 0.3929772675037384, + "learning_rate": 5.414779532179721e-05, + "loss": 1.7266, + "step": 4559 + }, + { + "epoch": 0.473667809286382, + "grad_norm": 0.3913351595401764, + "learning_rate": 5.41315347695055e-05, + "loss": 1.6847, + "step": 4560 + }, + { + "epoch": 0.4737716838059624, + "grad_norm": 0.37702706456184387, + "learning_rate": 5.4115273777237554e-05, + "loss": 1.5548, + "step": 4561 + }, + { + "epoch": 0.47387555832554273, + "grad_norm": 0.4072859585285187, + "learning_rate": 5.4099012346725054e-05, + "loss": 1.6328, + "step": 4562 + }, + { + "epoch": 0.4739794328451231, + "grad_norm": 0.4485922157764435, + "learning_rate": 5.4082750479699715e-05, + "loss": 1.7856, + "step": 4563 + }, + { + "epoch": 0.47408330736470344, + "grad_norm": 0.4083961248397827, + "learning_rate": 5.406648817789329e-05, + "loss": 1.6751, + "step": 4564 + }, + { + "epoch": 0.47418718188428377, + "grad_norm": 0.4277319312095642, + "learning_rate": 5.40502254430376e-05, + "loss": 1.793, + "step": 4565 + }, + { + "epoch": 0.47429105640386415, + "grad_norm": 0.4535433053970337, + "learning_rate": 5.403396227686449e-05, + "loss": 1.5738, + "step": 4566 + }, + { + "epoch": 0.4743949309234445, + "grad_norm": 0.41116324067115784, + "learning_rate": 5.4017698681105865e-05, + "loss": 1.7443, + "step": 4567 + }, + { + "epoch": 0.4744988054430248, + "grad_norm": 0.38053932785987854, + "learning_rate": 5.400143465749368e-05, + "loss": 1.6522, + "step": 4568 + }, + { + "epoch": 0.4746026799626052, + "grad_norm": 0.40427184104919434, + "learning_rate": 5.398517020775989e-05, + "loss": 1.7336, + "step": 4569 + }, + { + "epoch": 0.4747065544821855, + "grad_norm": 0.4339055120944977, + "learning_rate": 5.39689053336366e-05, + "loss": 1.7395, + "step": 4570 + }, + { + "epoch": 0.47481042900176584, + "grad_norm": 0.415379136800766, + "learning_rate": 5.3952640036855825e-05, + "loss": 1.5726, + "step": 4571 + }, + { + "epoch": 0.4749143035213462, + "grad_norm": 0.4650099575519562, + "learning_rate": 5.3936374319149705e-05, + "loss": 2.0233, + "step": 4572 + }, + { + "epoch": 0.47501817804092655, + "grad_norm": 0.429559588432312, + "learning_rate": 5.392010818225044e-05, + "loss": 1.539, + "step": 4573 + }, + { + "epoch": 0.4751220525605069, + "grad_norm": 0.4143827259540558, + "learning_rate": 5.390384162789024e-05, + "loss": 1.767, + "step": 4574 + }, + { + "epoch": 0.47522592708008726, + "grad_norm": 0.4033125042915344, + "learning_rate": 5.3887574657801346e-05, + "loss": 1.7017, + "step": 4575 + }, + { + "epoch": 0.4753298015996676, + "grad_norm": 0.41067859530448914, + "learning_rate": 5.3871307273716074e-05, + "loss": 1.6445, + "step": 4576 + }, + { + "epoch": 0.47543367611924797, + "grad_norm": 0.41794803738594055, + "learning_rate": 5.3855039477366775e-05, + "loss": 1.7336, + "step": 4577 + }, + { + "epoch": 0.4755375506388283, + "grad_norm": 0.38265594840049744, + "learning_rate": 5.3838771270485854e-05, + "loss": 1.6598, + "step": 4578 + }, + { + "epoch": 0.4756414251584086, + "grad_norm": 0.3966656029224396, + "learning_rate": 5.3822502654805725e-05, + "loss": 1.5692, + "step": 4579 + }, + { + "epoch": 0.475745299677989, + "grad_norm": 0.4140309989452362, + "learning_rate": 5.3806233632058876e-05, + "loss": 1.687, + "step": 4580 + }, + { + "epoch": 0.47584917419756934, + "grad_norm": 0.43315640091896057, + "learning_rate": 5.378996420397784e-05, + "loss": 1.6507, + "step": 4581 + }, + { + "epoch": 0.47595304871714966, + "grad_norm": 0.4187996983528137, + "learning_rate": 5.377369437229518e-05, + "loss": 1.782, + "step": 4582 + }, + { + "epoch": 0.47605692323673005, + "grad_norm": 0.41366416215896606, + "learning_rate": 5.375742413874351e-05, + "loss": 1.7035, + "step": 4583 + }, + { + "epoch": 0.47616079775631037, + "grad_norm": 0.4386520981788635, + "learning_rate": 5.3741153505055475e-05, + "loss": 1.6678, + "step": 4584 + }, + { + "epoch": 0.4762646722758907, + "grad_norm": 0.4309309124946594, + "learning_rate": 5.372488247296378e-05, + "loss": 1.7662, + "step": 4585 + }, + { + "epoch": 0.4763685467954711, + "grad_norm": 0.4261937737464905, + "learning_rate": 5.370861104420114e-05, + "loss": 1.7769, + "step": 4586 + }, + { + "epoch": 0.4764724213150514, + "grad_norm": 0.3945866823196411, + "learning_rate": 5.369233922050036e-05, + "loss": 1.4977, + "step": 4587 + }, + { + "epoch": 0.4765762958346318, + "grad_norm": 0.38651934266090393, + "learning_rate": 5.3676067003594255e-05, + "loss": 1.7031, + "step": 4588 + }, + { + "epoch": 0.4766801703542121, + "grad_norm": 0.43587467074394226, + "learning_rate": 5.365979439521569e-05, + "loss": 1.6961, + "step": 4589 + }, + { + "epoch": 0.47678404487379245, + "grad_norm": 0.43202391266822815, + "learning_rate": 5.364352139709758e-05, + "loss": 1.8117, + "step": 4590 + }, + { + "epoch": 0.47688791939337283, + "grad_norm": 0.4378650486469269, + "learning_rate": 5.362724801097284e-05, + "loss": 1.6322, + "step": 4591 + }, + { + "epoch": 0.47699179391295315, + "grad_norm": 0.4283282160758972, + "learning_rate": 5.3610974238574495e-05, + "loss": 1.7273, + "step": 4592 + }, + { + "epoch": 0.4770956684325335, + "grad_norm": 0.42612189054489136, + "learning_rate": 5.359470008163556e-05, + "loss": 1.824, + "step": 4593 + }, + { + "epoch": 0.47719954295211386, + "grad_norm": 0.38064736127853394, + "learning_rate": 5.35784255418891e-05, + "loss": 1.5924, + "step": 4594 + }, + { + "epoch": 0.4773034174716942, + "grad_norm": 0.40858712792396545, + "learning_rate": 5.356215062106823e-05, + "loss": 1.7402, + "step": 4595 + }, + { + "epoch": 0.4774072919912745, + "grad_norm": 0.4329719543457031, + "learning_rate": 5.354587532090611e-05, + "loss": 1.861, + "step": 4596 + }, + { + "epoch": 0.4775111665108549, + "grad_norm": 0.4516226649284363, + "learning_rate": 5.352959964313592e-05, + "loss": 1.8382, + "step": 4597 + }, + { + "epoch": 0.47761504103043523, + "grad_norm": 0.4081841707229614, + "learning_rate": 5.3513323589490924e-05, + "loss": 1.6581, + "step": 4598 + }, + { + "epoch": 0.47771891555001555, + "grad_norm": 0.3968779444694519, + "learning_rate": 5.349704716170435e-05, + "loss": 1.5309, + "step": 4599 + }, + { + "epoch": 0.47782279006959594, + "grad_norm": 0.4048573076725006, + "learning_rate": 5.348077036150954e-05, + "loss": 1.7319, + "step": 4600 + }, + { + "epoch": 0.47792666458917626, + "grad_norm": 0.4110606014728546, + "learning_rate": 5.346449319063983e-05, + "loss": 1.7603, + "step": 4601 + }, + { + "epoch": 0.47803053910875665, + "grad_norm": 0.4106254577636719, + "learning_rate": 5.34482156508286e-05, + "loss": 1.5944, + "step": 4602 + }, + { + "epoch": 0.478134413628337, + "grad_norm": 0.3852187693119049, + "learning_rate": 5.343193774380933e-05, + "loss": 1.5459, + "step": 4603 + }, + { + "epoch": 0.4782382881479173, + "grad_norm": 0.40513893961906433, + "learning_rate": 5.341565947131546e-05, + "loss": 1.6581, + "step": 4604 + }, + { + "epoch": 0.4783421626674977, + "grad_norm": 0.38794079422950745, + "learning_rate": 5.3399380835080493e-05, + "loss": 1.6475, + "step": 4605 + }, + { + "epoch": 0.478446037187078, + "grad_norm": 0.3935708999633789, + "learning_rate": 5.338310183683799e-05, + "loss": 1.7491, + "step": 4606 + }, + { + "epoch": 0.47854991170665834, + "grad_norm": 0.5326302647590637, + "learning_rate": 5.3366822478321524e-05, + "loss": 1.7748, + "step": 4607 + }, + { + "epoch": 0.4786537862262387, + "grad_norm": 0.4071970283985138, + "learning_rate": 5.335054276126471e-05, + "loss": 1.6662, + "step": 4608 + }, + { + "epoch": 0.47875766074581905, + "grad_norm": 0.43349170684814453, + "learning_rate": 5.333426268740125e-05, + "loss": 1.8815, + "step": 4609 + }, + { + "epoch": 0.4788615352653994, + "grad_norm": 0.372646689414978, + "learning_rate": 5.331798225846482e-05, + "loss": 1.5864, + "step": 4610 + }, + { + "epoch": 0.47896540978497976, + "grad_norm": 0.4014649987220764, + "learning_rate": 5.330170147618914e-05, + "loss": 1.6708, + "step": 4611 + }, + { + "epoch": 0.4790692843045601, + "grad_norm": 0.4292800724506378, + "learning_rate": 5.3285420342308035e-05, + "loss": 1.8315, + "step": 4612 + }, + { + "epoch": 0.47917315882414047, + "grad_norm": 0.41147077083587646, + "learning_rate": 5.326913885855527e-05, + "loss": 1.6449, + "step": 4613 + }, + { + "epoch": 0.4792770333437208, + "grad_norm": 0.35840991139411926, + "learning_rate": 5.325285702666473e-05, + "loss": 1.5407, + "step": 4614 + }, + { + "epoch": 0.4793809078633011, + "grad_norm": 0.4264986515045166, + "learning_rate": 5.323657484837028e-05, + "loss": 1.6642, + "step": 4615 + }, + { + "epoch": 0.4794847823828815, + "grad_norm": 0.4104459881782532, + "learning_rate": 5.3220292325405854e-05, + "loss": 1.6084, + "step": 4616 + }, + { + "epoch": 0.47958865690246183, + "grad_norm": 0.4270613193511963, + "learning_rate": 5.320400945950542e-05, + "loss": 1.9207, + "step": 4617 + }, + { + "epoch": 0.47969253142204216, + "grad_norm": 0.4374467730522156, + "learning_rate": 5.318772625240298e-05, + "loss": 1.6418, + "step": 4618 + }, + { + "epoch": 0.47979640594162254, + "grad_norm": 0.3923068940639496, + "learning_rate": 5.3171442705832554e-05, + "loss": 1.6963, + "step": 4619 + }, + { + "epoch": 0.47990028046120287, + "grad_norm": 0.3642514646053314, + "learning_rate": 5.315515882152822e-05, + "loss": 1.5201, + "step": 4620 + }, + { + "epoch": 0.4800041549807832, + "grad_norm": 0.41580015420913696, + "learning_rate": 5.313887460122408e-05, + "loss": 1.5179, + "step": 4621 + }, + { + "epoch": 0.4801080295003636, + "grad_norm": 0.42775875329971313, + "learning_rate": 5.312259004665426e-05, + "loss": 1.7967, + "step": 4622 + }, + { + "epoch": 0.4802119040199439, + "grad_norm": 0.3977086544036865, + "learning_rate": 5.310630515955297e-05, + "loss": 1.4864, + "step": 4623 + }, + { + "epoch": 0.48031577853952423, + "grad_norm": 0.41823333501815796, + "learning_rate": 5.309001994165442e-05, + "loss": 1.7526, + "step": 4624 + }, + { + "epoch": 0.4804196530591046, + "grad_norm": 0.41474875807762146, + "learning_rate": 5.307373439469283e-05, + "loss": 1.5733, + "step": 4625 + }, + { + "epoch": 0.48052352757868494, + "grad_norm": 0.370061457157135, + "learning_rate": 5.305744852040253e-05, + "loss": 1.541, + "step": 4626 + }, + { + "epoch": 0.4806274020982653, + "grad_norm": 0.36893734335899353, + "learning_rate": 5.304116232051779e-05, + "loss": 1.6238, + "step": 4627 + }, + { + "epoch": 0.48073127661784565, + "grad_norm": 0.4357246458530426, + "learning_rate": 5.3024875796773e-05, + "loss": 1.9604, + "step": 4628 + }, + { + "epoch": 0.480835151137426, + "grad_norm": 0.3936040997505188, + "learning_rate": 5.3008588950902524e-05, + "loss": 1.4643, + "step": 4629 + }, + { + "epoch": 0.48093902565700636, + "grad_norm": 0.38085660338401794, + "learning_rate": 5.2992301784640785e-05, + "loss": 1.6453, + "step": 4630 + }, + { + "epoch": 0.4810429001765867, + "grad_norm": 0.3864935338497162, + "learning_rate": 5.2976014299722263e-05, + "loss": 1.6807, + "step": 4631 + }, + { + "epoch": 0.481146774696167, + "grad_norm": 0.405137836933136, + "learning_rate": 5.2959726497881425e-05, + "loss": 1.7978, + "step": 4632 + }, + { + "epoch": 0.4812506492157474, + "grad_norm": 0.3816376328468323, + "learning_rate": 5.29434383808528e-05, + "loss": 1.7011, + "step": 4633 + }, + { + "epoch": 0.4813545237353277, + "grad_norm": 0.3807872533798218, + "learning_rate": 5.292714995037096e-05, + "loss": 1.6109, + "step": 4634 + }, + { + "epoch": 0.48145839825490805, + "grad_norm": 0.3880075514316559, + "learning_rate": 5.291086120817047e-05, + "loss": 1.6175, + "step": 4635 + }, + { + "epoch": 0.48156227277448843, + "grad_norm": 0.40148478746414185, + "learning_rate": 5.289457215598595e-05, + "loss": 1.5225, + "step": 4636 + }, + { + "epoch": 0.48166614729406876, + "grad_norm": 0.4223523437976837, + "learning_rate": 5.28782827955521e-05, + "loss": 1.7701, + "step": 4637 + }, + { + "epoch": 0.4817700218136491, + "grad_norm": 0.41836312413215637, + "learning_rate": 5.286199312860358e-05, + "loss": 1.6871, + "step": 4638 + }, + { + "epoch": 0.48187389633322947, + "grad_norm": 0.41826310753822327, + "learning_rate": 5.2845703156875106e-05, + "loss": 1.7912, + "step": 4639 + }, + { + "epoch": 0.4819777708528098, + "grad_norm": 0.3970772325992584, + "learning_rate": 5.282941288210146e-05, + "loss": 1.6757, + "step": 4640 + }, + { + "epoch": 0.4820816453723902, + "grad_norm": 0.4118634760379791, + "learning_rate": 5.2813122306017396e-05, + "loss": 1.8312, + "step": 4641 + }, + { + "epoch": 0.4821855198919705, + "grad_norm": 0.38596856594085693, + "learning_rate": 5.279683143035775e-05, + "loss": 1.6572, + "step": 4642 + }, + { + "epoch": 0.48228939441155083, + "grad_norm": 0.3717338442802429, + "learning_rate": 5.278054025685739e-05, + "loss": 1.6622, + "step": 4643 + }, + { + "epoch": 0.4823932689311312, + "grad_norm": 0.38928812742233276, + "learning_rate": 5.276424878725116e-05, + "loss": 1.578, + "step": 4644 + }, + { + "epoch": 0.48249714345071154, + "grad_norm": 0.4005277156829834, + "learning_rate": 5.2747957023274005e-05, + "loss": 1.6593, + "step": 4645 + }, + { + "epoch": 0.48260101797029187, + "grad_norm": 0.37568166851997375, + "learning_rate": 5.2731664966660864e-05, + "loss": 1.6828, + "step": 4646 + }, + { + "epoch": 0.48270489248987225, + "grad_norm": 0.4337156116962433, + "learning_rate": 5.271537261914672e-05, + "loss": 1.8237, + "step": 4647 + }, + { + "epoch": 0.4828087670094526, + "grad_norm": 0.4006423056125641, + "learning_rate": 5.269907998246656e-05, + "loss": 1.6222, + "step": 4648 + }, + { + "epoch": 0.4829126415290329, + "grad_norm": 0.3779585361480713, + "learning_rate": 5.268278705835544e-05, + "loss": 1.6672, + "step": 4649 + }, + { + "epoch": 0.4830165160486133, + "grad_norm": 0.4423217475414276, + "learning_rate": 5.266649384854841e-05, + "loss": 1.7731, + "step": 4650 + }, + { + "epoch": 0.4831203905681936, + "grad_norm": 0.4306303560733795, + "learning_rate": 5.265020035478061e-05, + "loss": 1.7711, + "step": 4651 + }, + { + "epoch": 0.483224265087774, + "grad_norm": 0.38442516326904297, + "learning_rate": 5.263390657878712e-05, + "loss": 1.4701, + "step": 4652 + }, + { + "epoch": 0.4833281396073543, + "grad_norm": 0.38074007630348206, + "learning_rate": 5.2617612522303136e-05, + "loss": 1.6486, + "step": 4653 + }, + { + "epoch": 0.48343201412693465, + "grad_norm": 0.3747223913669586, + "learning_rate": 5.2601318187063855e-05, + "loss": 1.6641, + "step": 4654 + }, + { + "epoch": 0.48353588864651503, + "grad_norm": 0.48334217071533203, + "learning_rate": 5.258502357480445e-05, + "loss": 1.5608, + "step": 4655 + }, + { + "epoch": 0.48363976316609536, + "grad_norm": 0.4297388792037964, + "learning_rate": 5.256872868726021e-05, + "loss": 1.6617, + "step": 4656 + }, + { + "epoch": 0.4837436376856757, + "grad_norm": 0.433788001537323, + "learning_rate": 5.255243352616641e-05, + "loss": 1.8598, + "step": 4657 + }, + { + "epoch": 0.48384751220525607, + "grad_norm": 0.4389197528362274, + "learning_rate": 5.253613809325834e-05, + "loss": 1.8134, + "step": 4658 + }, + { + "epoch": 0.4839513867248364, + "grad_norm": 0.39111611247062683, + "learning_rate": 5.251984239027133e-05, + "loss": 1.5018, + "step": 4659 + }, + { + "epoch": 0.4840552612444167, + "grad_norm": 0.39344149827957153, + "learning_rate": 5.2503546418940784e-05, + "loss": 1.7079, + "step": 4660 + }, + { + "epoch": 0.4841591357639971, + "grad_norm": 0.42502087354660034, + "learning_rate": 5.248725018100207e-05, + "loss": 1.7178, + "step": 4661 + }, + { + "epoch": 0.48426301028357743, + "grad_norm": 0.4289288818836212, + "learning_rate": 5.2470953678190615e-05, + "loss": 1.455, + "step": 4662 + }, + { + "epoch": 0.48436688480315776, + "grad_norm": 0.41218411922454834, + "learning_rate": 5.2454656912241864e-05, + "loss": 1.8562, + "step": 4663 + }, + { + "epoch": 0.48447075932273814, + "grad_norm": 0.38502606749534607, + "learning_rate": 5.2438359884891285e-05, + "loss": 1.5738, + "step": 4664 + }, + { + "epoch": 0.48457463384231847, + "grad_norm": 0.4311594069004059, + "learning_rate": 5.242206259787441e-05, + "loss": 1.69, + "step": 4665 + }, + { + "epoch": 0.48467850836189885, + "grad_norm": 0.4154076874256134, + "learning_rate": 5.2405765052926755e-05, + "loss": 1.7648, + "step": 4666 + }, + { + "epoch": 0.4847823828814792, + "grad_norm": 0.4217148721218109, + "learning_rate": 5.23894672517839e-05, + "loss": 1.6886, + "step": 4667 + }, + { + "epoch": 0.4848862574010595, + "grad_norm": 0.39737677574157715, + "learning_rate": 5.237316919618143e-05, + "loss": 1.6023, + "step": 4668 + }, + { + "epoch": 0.4849901319206399, + "grad_norm": 0.3839234709739685, + "learning_rate": 5.2356870887854926e-05, + "loss": 1.7436, + "step": 4669 + }, + { + "epoch": 0.4850940064402202, + "grad_norm": 0.39807283878326416, + "learning_rate": 5.234057232854006e-05, + "loss": 1.7422, + "step": 4670 + }, + { + "epoch": 0.48519788095980054, + "grad_norm": 0.40253087878227234, + "learning_rate": 5.232427351997251e-05, + "loss": 1.6793, + "step": 4671 + }, + { + "epoch": 0.4853017554793809, + "grad_norm": 0.41623252630233765, + "learning_rate": 5.2307974463887944e-05, + "loss": 1.6998, + "step": 4672 + }, + { + "epoch": 0.48540562999896125, + "grad_norm": 0.39853668212890625, + "learning_rate": 5.229167516202211e-05, + "loss": 1.7607, + "step": 4673 + }, + { + "epoch": 0.4855095045185416, + "grad_norm": 0.433289110660553, + "learning_rate": 5.2275375616110755e-05, + "loss": 1.707, + "step": 4674 + }, + { + "epoch": 0.48561337903812196, + "grad_norm": 0.37123411893844604, + "learning_rate": 5.2259075827889644e-05, + "loss": 1.765, + "step": 4675 + }, + { + "epoch": 0.4857172535577023, + "grad_norm": 0.3944990634918213, + "learning_rate": 5.224277579909458e-05, + "loss": 1.6677, + "step": 4676 + }, + { + "epoch": 0.4858211280772826, + "grad_norm": 0.38303902745246887, + "learning_rate": 5.222647553146138e-05, + "loss": 1.5869, + "step": 4677 + }, + { + "epoch": 0.485925002596863, + "grad_norm": 0.40630918741226196, + "learning_rate": 5.221017502672589e-05, + "loss": 1.6596, + "step": 4678 + }, + { + "epoch": 0.4860288771164433, + "grad_norm": 0.38045310974121094, + "learning_rate": 5.219387428662402e-05, + "loss": 1.75, + "step": 4679 + }, + { + "epoch": 0.4861327516360237, + "grad_norm": 0.36957138776779175, + "learning_rate": 5.2177573312891646e-05, + "loss": 1.5169, + "step": 4680 + }, + { + "epoch": 0.48623662615560403, + "grad_norm": 0.40403348207473755, + "learning_rate": 5.21612721072647e-05, + "loss": 1.8167, + "step": 4681 + }, + { + "epoch": 0.48634050067518436, + "grad_norm": 0.4342941641807556, + "learning_rate": 5.214497067147914e-05, + "loss": 1.6114, + "step": 4682 + }, + { + "epoch": 0.48644437519476474, + "grad_norm": 0.40665194392204285, + "learning_rate": 5.212866900727093e-05, + "loss": 1.6924, + "step": 4683 + }, + { + "epoch": 0.48654824971434507, + "grad_norm": 0.397088885307312, + "learning_rate": 5.211236711637607e-05, + "loss": 1.6505, + "step": 4684 + }, + { + "epoch": 0.4866521242339254, + "grad_norm": 0.4288082718849182, + "learning_rate": 5.209606500053059e-05, + "loss": 1.9258, + "step": 4685 + }, + { + "epoch": 0.4867559987535058, + "grad_norm": 0.43021494150161743, + "learning_rate": 5.207976266147053e-05, + "loss": 1.7348, + "step": 4686 + }, + { + "epoch": 0.4868598732730861, + "grad_norm": 0.4119536280632019, + "learning_rate": 5.206346010093198e-05, + "loss": 1.6395, + "step": 4687 + }, + { + "epoch": 0.48696374779266643, + "grad_norm": 0.4928002655506134, + "learning_rate": 5.204715732065103e-05, + "loss": 1.846, + "step": 4688 + }, + { + "epoch": 0.4870676223122468, + "grad_norm": 0.3708318769931793, + "learning_rate": 5.203085432236379e-05, + "loss": 1.4251, + "step": 4689 + }, + { + "epoch": 0.48717149683182714, + "grad_norm": 0.3991853892803192, + "learning_rate": 5.201455110780641e-05, + "loss": 1.6869, + "step": 4690 + }, + { + "epoch": 0.4872753713514075, + "grad_norm": 0.4395716190338135, + "learning_rate": 5.199824767871503e-05, + "loss": 1.8586, + "step": 4691 + }, + { + "epoch": 0.48737924587098785, + "grad_norm": 0.4379446804523468, + "learning_rate": 5.198194403682588e-05, + "loss": 1.6928, + "step": 4692 + }, + { + "epoch": 0.4874831203905682, + "grad_norm": 0.4161565601825714, + "learning_rate": 5.1965640183875144e-05, + "loss": 1.8182, + "step": 4693 + }, + { + "epoch": 0.48758699491014856, + "grad_norm": 0.41332149505615234, + "learning_rate": 5.194933612159906e-05, + "loss": 1.8115, + "step": 4694 + }, + { + "epoch": 0.4876908694297289, + "grad_norm": 0.38883647322654724, + "learning_rate": 5.193303185173388e-05, + "loss": 1.7725, + "step": 4695 + }, + { + "epoch": 0.4877947439493092, + "grad_norm": 0.4268670976161957, + "learning_rate": 5.191672737601591e-05, + "loss": 1.823, + "step": 4696 + }, + { + "epoch": 0.4878986184688896, + "grad_norm": 0.4016038775444031, + "learning_rate": 5.1900422696181396e-05, + "loss": 1.5653, + "step": 4697 + }, + { + "epoch": 0.4880024929884699, + "grad_norm": 0.43578583002090454, + "learning_rate": 5.1884117813966705e-05, + "loss": 1.7653, + "step": 4698 + }, + { + "epoch": 0.48810636750805025, + "grad_norm": 0.45143744349479675, + "learning_rate": 5.186781273110816e-05, + "loss": 1.7976, + "step": 4699 + }, + { + "epoch": 0.48821024202763064, + "grad_norm": 0.415283739566803, + "learning_rate": 5.1851507449342104e-05, + "loss": 1.6952, + "step": 4700 + }, + { + "epoch": 0.48831411654721096, + "grad_norm": 0.38803666830062866, + "learning_rate": 5.183520197040498e-05, + "loss": 1.6622, + "step": 4701 + }, + { + "epoch": 0.4884179910667913, + "grad_norm": 0.400741845369339, + "learning_rate": 5.1818896296033145e-05, + "loss": 1.7436, + "step": 4702 + }, + { + "epoch": 0.4885218655863717, + "grad_norm": 0.39219218492507935, + "learning_rate": 5.180259042796306e-05, + "loss": 1.6381, + "step": 4703 + }, + { + "epoch": 0.488625740105952, + "grad_norm": 0.44362738728523254, + "learning_rate": 5.178628436793115e-05, + "loss": 1.8482, + "step": 4704 + }, + { + "epoch": 0.4887296146255324, + "grad_norm": 0.4290272295475006, + "learning_rate": 5.176997811767387e-05, + "loss": 1.7622, + "step": 4705 + }, + { + "epoch": 0.4888334891451127, + "grad_norm": 0.38011881709098816, + "learning_rate": 5.175367167892774e-05, + "loss": 1.7822, + "step": 4706 + }, + { + "epoch": 0.48893736366469304, + "grad_norm": 0.4040461778640747, + "learning_rate": 5.1737365053429254e-05, + "loss": 1.4567, + "step": 4707 + }, + { + "epoch": 0.4890412381842734, + "grad_norm": 0.40654054284095764, + "learning_rate": 5.1721058242914935e-05, + "loss": 1.7415, + "step": 4708 + }, + { + "epoch": 0.48914511270385375, + "grad_norm": 0.3765581548213959, + "learning_rate": 5.170475124912134e-05, + "loss": 1.6306, + "step": 4709 + }, + { + "epoch": 0.4892489872234341, + "grad_norm": 0.4085083305835724, + "learning_rate": 5.168844407378506e-05, + "loss": 1.6247, + "step": 4710 + }, + { + "epoch": 0.48935286174301446, + "grad_norm": 0.4177039861679077, + "learning_rate": 5.167213671864263e-05, + "loss": 1.6903, + "step": 4711 + }, + { + "epoch": 0.4894567362625948, + "grad_norm": 0.449349582195282, + "learning_rate": 5.1655829185430685e-05, + "loss": 1.6055, + "step": 4712 + }, + { + "epoch": 0.4895606107821751, + "grad_norm": 0.4039447605609894, + "learning_rate": 5.1639521475885854e-05, + "loss": 1.6719, + "step": 4713 + }, + { + "epoch": 0.4896644853017555, + "grad_norm": 0.39503759145736694, + "learning_rate": 5.162321359174476e-05, + "loss": 1.7133, + "step": 4714 + }, + { + "epoch": 0.4897683598213358, + "grad_norm": 0.4664006531238556, + "learning_rate": 5.16069055347441e-05, + "loss": 1.805, + "step": 4715 + }, + { + "epoch": 0.48987223434091615, + "grad_norm": 0.39584633708000183, + "learning_rate": 5.1590597306620525e-05, + "loss": 1.5628, + "step": 4716 + }, + { + "epoch": 0.48997610886049653, + "grad_norm": 0.3739977478981018, + "learning_rate": 5.1574288909110744e-05, + "loss": 1.5699, + "step": 4717 + }, + { + "epoch": 0.49007998338007686, + "grad_norm": 0.3966865539550781, + "learning_rate": 5.155798034395147e-05, + "loss": 1.7159, + "step": 4718 + }, + { + "epoch": 0.49018385789965724, + "grad_norm": 0.48280176520347595, + "learning_rate": 5.154167161287945e-05, + "loss": 1.8649, + "step": 4719 + }, + { + "epoch": 0.49028773241923757, + "grad_norm": 0.43992406129837036, + "learning_rate": 5.152536271763143e-05, + "loss": 1.7603, + "step": 4720 + }, + { + "epoch": 0.4903916069388179, + "grad_norm": 0.40175652503967285, + "learning_rate": 5.1509053659944175e-05, + "loss": 1.664, + "step": 4721 + }, + { + "epoch": 0.4904954814583983, + "grad_norm": 0.3840475082397461, + "learning_rate": 5.149274444155446e-05, + "loss": 1.5613, + "step": 4722 + }, + { + "epoch": 0.4905993559779786, + "grad_norm": 0.42482373118400574, + "learning_rate": 5.147643506419914e-05, + "loss": 1.7978, + "step": 4723 + }, + { + "epoch": 0.49070323049755893, + "grad_norm": 0.40053901076316833, + "learning_rate": 5.146012552961499e-05, + "loss": 1.736, + "step": 4724 + }, + { + "epoch": 0.4908071050171393, + "grad_norm": 0.3966452479362488, + "learning_rate": 5.1443815839538845e-05, + "loss": 1.7367, + "step": 4725 + }, + { + "epoch": 0.49091097953671964, + "grad_norm": 0.39208799600601196, + "learning_rate": 5.1427505995707594e-05, + "loss": 1.4424, + "step": 4726 + }, + { + "epoch": 0.49101485405629997, + "grad_norm": 0.42873436212539673, + "learning_rate": 5.1411195999858074e-05, + "loss": 1.8207, + "step": 4727 + }, + { + "epoch": 0.49111872857588035, + "grad_norm": 0.44238024950027466, + "learning_rate": 5.1394885853727195e-05, + "loss": 1.756, + "step": 4728 + }, + { + "epoch": 0.4912226030954607, + "grad_norm": 0.38255882263183594, + "learning_rate": 5.137857555905184e-05, + "loss": 1.5362, + "step": 4729 + }, + { + "epoch": 0.49132647761504106, + "grad_norm": 0.4030906558036804, + "learning_rate": 5.1362265117568965e-05, + "loss": 1.6924, + "step": 4730 + }, + { + "epoch": 0.4914303521346214, + "grad_norm": 0.4658985733985901, + "learning_rate": 5.134595453101547e-05, + "loss": 1.7965, + "step": 4731 + }, + { + "epoch": 0.4915342266542017, + "grad_norm": 0.3776882290840149, + "learning_rate": 5.1329643801128324e-05, + "loss": 1.5735, + "step": 4732 + }, + { + "epoch": 0.4916381011737821, + "grad_norm": 0.3989698588848114, + "learning_rate": 5.131333292964447e-05, + "loss": 1.7203, + "step": 4733 + }, + { + "epoch": 0.4917419756933624, + "grad_norm": 0.4176570177078247, + "learning_rate": 5.12970219183009e-05, + "loss": 1.7932, + "step": 4734 + }, + { + "epoch": 0.49184585021294275, + "grad_norm": 0.42140617966651917, + "learning_rate": 5.128071076883463e-05, + "loss": 1.8202, + "step": 4735 + }, + { + "epoch": 0.49194972473252313, + "grad_norm": 0.41681942343711853, + "learning_rate": 5.126439948298264e-05, + "loss": 1.6721, + "step": 4736 + }, + { + "epoch": 0.49205359925210346, + "grad_norm": 0.465843141078949, + "learning_rate": 5.124808806248199e-05, + "loss": 1.6901, + "step": 4737 + }, + { + "epoch": 0.4921574737716838, + "grad_norm": 0.41670602560043335, + "learning_rate": 5.123177650906968e-05, + "loss": 1.745, + "step": 4738 + }, + { + "epoch": 0.49226134829126417, + "grad_norm": 0.415339857339859, + "learning_rate": 5.1215464824482775e-05, + "loss": 1.7013, + "step": 4739 + }, + { + "epoch": 0.4923652228108445, + "grad_norm": 0.3894515931606293, + "learning_rate": 5.119915301045836e-05, + "loss": 1.5359, + "step": 4740 + }, + { + "epoch": 0.4924690973304248, + "grad_norm": 0.37234368920326233, + "learning_rate": 5.118284106873349e-05, + "loss": 1.6351, + "step": 4741 + }, + { + "epoch": 0.4925729718500052, + "grad_norm": 0.47200119495391846, + "learning_rate": 5.116652900104527e-05, + "loss": 1.9368, + "step": 4742 + }, + { + "epoch": 0.49267684636958553, + "grad_norm": 0.430601567029953, + "learning_rate": 5.1150216809130826e-05, + "loss": 1.6977, + "step": 4743 + }, + { + "epoch": 0.4927807208891659, + "grad_norm": 0.4601489305496216, + "learning_rate": 5.113390449472726e-05, + "loss": 1.8549, + "step": 4744 + }, + { + "epoch": 0.49288459540874624, + "grad_norm": 0.42277416586875916, + "learning_rate": 5.11175920595717e-05, + "loss": 1.7991, + "step": 4745 + }, + { + "epoch": 0.49298846992832657, + "grad_norm": 0.3993331789970398, + "learning_rate": 5.1101279505401325e-05, + "loss": 1.7089, + "step": 4746 + }, + { + "epoch": 0.49309234444790695, + "grad_norm": 0.4037160277366638, + "learning_rate": 5.108496683395324e-05, + "loss": 1.6423, + "step": 4747 + }, + { + "epoch": 0.4931962189674873, + "grad_norm": 0.3775770664215088, + "learning_rate": 5.106865404696468e-05, + "loss": 1.6201, + "step": 4748 + }, + { + "epoch": 0.4933000934870676, + "grad_norm": 0.4403548836708069, + "learning_rate": 5.1052341146172785e-05, + "loss": 1.7299, + "step": 4749 + }, + { + "epoch": 0.493403968006648, + "grad_norm": 0.4192941188812256, + "learning_rate": 5.103602813331476e-05, + "loss": 1.7541, + "step": 4750 + }, + { + "epoch": 0.4935078425262283, + "grad_norm": 0.3859345018863678, + "learning_rate": 5.101971501012785e-05, + "loss": 1.3569, + "step": 4751 + }, + { + "epoch": 0.49361171704580864, + "grad_norm": 0.4286668598651886, + "learning_rate": 5.1003401778349224e-05, + "loss": 1.8115, + "step": 4752 + }, + { + "epoch": 0.493715591565389, + "grad_norm": 0.4297914206981659, + "learning_rate": 5.0987088439716134e-05, + "loss": 1.6486, + "step": 4753 + }, + { + "epoch": 0.49381946608496935, + "grad_norm": 0.4609547555446625, + "learning_rate": 5.097077499596583e-05, + "loss": 1.8906, + "step": 4754 + }, + { + "epoch": 0.4939233406045497, + "grad_norm": 0.3923378884792328, + "learning_rate": 5.095446144883556e-05, + "loss": 1.5989, + "step": 4755 + }, + { + "epoch": 0.49402721512413006, + "grad_norm": 0.42669469118118286, + "learning_rate": 5.0938147800062586e-05, + "loss": 1.5584, + "step": 4756 + }, + { + "epoch": 0.4941310896437104, + "grad_norm": 0.4073004126548767, + "learning_rate": 5.092183405138419e-05, + "loss": 1.6068, + "step": 4757 + }, + { + "epoch": 0.49423496416329077, + "grad_norm": 0.4162704050540924, + "learning_rate": 5.090552020453766e-05, + "loss": 1.8445, + "step": 4758 + }, + { + "epoch": 0.4943388386828711, + "grad_norm": 0.45715436339378357, + "learning_rate": 5.08892062612603e-05, + "loss": 1.8761, + "step": 4759 + }, + { + "epoch": 0.4944427132024514, + "grad_norm": 0.38431254029273987, + "learning_rate": 5.08728922232894e-05, + "loss": 1.5639, + "step": 4760 + }, + { + "epoch": 0.4945465877220318, + "grad_norm": 0.4815351963043213, + "learning_rate": 5.0856578092362285e-05, + "loss": 1.7764, + "step": 4761 + }, + { + "epoch": 0.49465046224161213, + "grad_norm": 0.41645729541778564, + "learning_rate": 5.084026387021629e-05, + "loss": 1.6924, + "step": 4762 + }, + { + "epoch": 0.49475433676119246, + "grad_norm": 0.4079589247703552, + "learning_rate": 5.0823949558588746e-05, + "loss": 1.7549, + "step": 4763 + }, + { + "epoch": 0.49485821128077284, + "grad_norm": 0.40971922874450684, + "learning_rate": 5.0807635159216993e-05, + "loss": 1.5389, + "step": 4764 + }, + { + "epoch": 0.49496208580035317, + "grad_norm": 0.41493481397628784, + "learning_rate": 5.079132067383842e-05, + "loss": 1.7683, + "step": 4765 + }, + { + "epoch": 0.4950659603199335, + "grad_norm": 0.3941478729248047, + "learning_rate": 5.077500610419036e-05, + "loss": 1.7123, + "step": 4766 + }, + { + "epoch": 0.4951698348395139, + "grad_norm": 0.4111056327819824, + "learning_rate": 5.0758691452010175e-05, + "loss": 1.7361, + "step": 4767 + }, + { + "epoch": 0.4952737093590942, + "grad_norm": 0.4429363012313843, + "learning_rate": 5.0742376719035276e-05, + "loss": 1.8205, + "step": 4768 + }, + { + "epoch": 0.4953775838786746, + "grad_norm": 0.3898932635784149, + "learning_rate": 5.0726061907003045e-05, + "loss": 1.6695, + "step": 4769 + }, + { + "epoch": 0.4954814583982549, + "grad_norm": 0.43110302090644836, + "learning_rate": 5.070974701765089e-05, + "loss": 1.6963, + "step": 4770 + }, + { + "epoch": 0.49558533291783524, + "grad_norm": 0.4207279682159424, + "learning_rate": 5.069343205271622e-05, + "loss": 1.5213, + "step": 4771 + }, + { + "epoch": 0.4956892074374156, + "grad_norm": 0.423676073551178, + "learning_rate": 5.067711701393644e-05, + "loss": 1.5632, + "step": 4772 + }, + { + "epoch": 0.49579308195699595, + "grad_norm": 0.39001235365867615, + "learning_rate": 5.066080190304898e-05, + "loss": 1.717, + "step": 4773 + }, + { + "epoch": 0.4958969564765763, + "grad_norm": 0.3835489749908447, + "learning_rate": 5.0644486721791273e-05, + "loss": 1.7107, + "step": 4774 + }, + { + "epoch": 0.49600083099615666, + "grad_norm": 0.4401542842388153, + "learning_rate": 5.0628171471900756e-05, + "loss": 1.8646, + "step": 4775 + }, + { + "epoch": 0.496104705515737, + "grad_norm": 0.38474228978157043, + "learning_rate": 5.0611856155114876e-05, + "loss": 1.6334, + "step": 4776 + }, + { + "epoch": 0.4962085800353173, + "grad_norm": 0.4321017563343048, + "learning_rate": 5.059554077317109e-05, + "loss": 1.7811, + "step": 4777 + }, + { + "epoch": 0.4963124545548977, + "grad_norm": 0.38683265447616577, + "learning_rate": 5.0579225327806847e-05, + "loss": 1.6125, + "step": 4778 + }, + { + "epoch": 0.496416329074478, + "grad_norm": 0.3820945918560028, + "learning_rate": 5.0562909820759654e-05, + "loss": 1.554, + "step": 4779 + }, + { + "epoch": 0.49652020359405835, + "grad_norm": 0.38485878705978394, + "learning_rate": 5.054659425376693e-05, + "loss": 1.6951, + "step": 4780 + }, + { + "epoch": 0.49662407811363873, + "grad_norm": 0.40460067987442017, + "learning_rate": 5.0530278628566175e-05, + "loss": 1.7489, + "step": 4781 + }, + { + "epoch": 0.49672795263321906, + "grad_norm": 0.4411293566226959, + "learning_rate": 5.05139629468949e-05, + "loss": 1.7431, + "step": 4782 + }, + { + "epoch": 0.49683182715279944, + "grad_norm": 0.41115111112594604, + "learning_rate": 5.0497647210490565e-05, + "loss": 1.5739, + "step": 4783 + }, + { + "epoch": 0.49693570167237977, + "grad_norm": 0.4054515063762665, + "learning_rate": 5.048133142109069e-05, + "loss": 1.6096, + "step": 4784 + }, + { + "epoch": 0.4970395761919601, + "grad_norm": 0.45801469683647156, + "learning_rate": 5.046501558043278e-05, + "loss": 1.8216, + "step": 4785 + }, + { + "epoch": 0.4971434507115405, + "grad_norm": 0.3939175009727478, + "learning_rate": 5.044869969025434e-05, + "loss": 1.6871, + "step": 4786 + }, + { + "epoch": 0.4972473252311208, + "grad_norm": 0.38540032505989075, + "learning_rate": 5.043238375229289e-05, + "loss": 1.6358, + "step": 4787 + }, + { + "epoch": 0.49735119975070113, + "grad_norm": 0.4039050340652466, + "learning_rate": 5.0416067768285924e-05, + "loss": 1.6841, + "step": 4788 + }, + { + "epoch": 0.4974550742702815, + "grad_norm": 0.3593752384185791, + "learning_rate": 5.039975173997099e-05, + "loss": 1.4662, + "step": 4789 + }, + { + "epoch": 0.49755894878986184, + "grad_norm": 0.4602617919445038, + "learning_rate": 5.038343566908562e-05, + "loss": 1.8741, + "step": 4790 + }, + { + "epoch": 0.49766282330944217, + "grad_norm": 0.39395564794540405, + "learning_rate": 5.036711955736734e-05, + "loss": 1.6235, + "step": 4791 + }, + { + "epoch": 0.49776669782902255, + "grad_norm": 0.3926306664943695, + "learning_rate": 5.0350803406553684e-05, + "loss": 1.4756, + "step": 4792 + }, + { + "epoch": 0.4978705723486029, + "grad_norm": 0.4372478723526001, + "learning_rate": 5.033448721838222e-05, + "loss": 1.6785, + "step": 4793 + }, + { + "epoch": 0.4979744468681832, + "grad_norm": 0.41775140166282654, + "learning_rate": 5.0318170994590464e-05, + "loss": 1.6019, + "step": 4794 + }, + { + "epoch": 0.4980783213877636, + "grad_norm": 0.4097503423690796, + "learning_rate": 5.030185473691597e-05, + "loss": 1.8253, + "step": 4795 + }, + { + "epoch": 0.4981821959073439, + "grad_norm": 0.38628101348876953, + "learning_rate": 5.0285538447096315e-05, + "loss": 1.6124, + "step": 4796 + }, + { + "epoch": 0.4982860704269243, + "grad_norm": 0.4230360984802246, + "learning_rate": 5.026922212686902e-05, + "loss": 1.67, + "step": 4797 + }, + { + "epoch": 0.4983899449465046, + "grad_norm": 0.40452295541763306, + "learning_rate": 5.025290577797169e-05, + "loss": 1.6923, + "step": 4798 + }, + { + "epoch": 0.49849381946608495, + "grad_norm": 0.38569211959838867, + "learning_rate": 5.0236589402141864e-05, + "loss": 1.585, + "step": 4799 + }, + { + "epoch": 0.49859769398566534, + "grad_norm": 0.4305466413497925, + "learning_rate": 5.022027300111711e-05, + "loss": 1.4631, + "step": 4800 + }, + { + "epoch": 0.49870156850524566, + "grad_norm": 0.42307230830192566, + "learning_rate": 5.0203956576635015e-05, + "loss": 1.6969, + "step": 4801 + }, + { + "epoch": 0.498805443024826, + "grad_norm": 0.44654223322868347, + "learning_rate": 5.018764013043312e-05, + "loss": 1.8514, + "step": 4802 + }, + { + "epoch": 0.49890931754440637, + "grad_norm": 0.4396657943725586, + "learning_rate": 5.0171323664249005e-05, + "loss": 1.7264, + "step": 4803 + }, + { + "epoch": 0.4990131920639867, + "grad_norm": 0.39395204186439514, + "learning_rate": 5.0155007179820266e-05, + "loss": 1.6963, + "step": 4804 + }, + { + "epoch": 0.499117066583567, + "grad_norm": 0.4187118411064148, + "learning_rate": 5.013869067888448e-05, + "loss": 1.7839, + "step": 4805 + }, + { + "epoch": 0.4992209411031474, + "grad_norm": 0.40489673614501953, + "learning_rate": 5.01223741631792e-05, + "loss": 1.6467, + "step": 4806 + }, + { + "epoch": 0.49932481562272774, + "grad_norm": 0.41493701934814453, + "learning_rate": 5.010605763444205e-05, + "loss": 1.8405, + "step": 4807 + }, + { + "epoch": 0.4994286901423081, + "grad_norm": 0.4441494643688202, + "learning_rate": 5.008974109441057e-05, + "loss": 1.6736, + "step": 4808 + }, + { + "epoch": 0.49953256466188845, + "grad_norm": 0.38821157813072205, + "learning_rate": 5.007342454482238e-05, + "loss": 1.6041, + "step": 4809 + }, + { + "epoch": 0.49963643918146877, + "grad_norm": 0.4361104965209961, + "learning_rate": 5.005710798741503e-05, + "loss": 1.7678, + "step": 4810 + }, + { + "epoch": 0.49974031370104915, + "grad_norm": 0.39289024472236633, + "learning_rate": 5.0040791423926125e-05, + "loss": 1.5475, + "step": 4811 + }, + { + "epoch": 0.4998441882206295, + "grad_norm": 0.35669323801994324, + "learning_rate": 5.002447485609327e-05, + "loss": 1.4978, + "step": 4812 + }, + { + "epoch": 0.4999480627402098, + "grad_norm": 0.38865354657173157, + "learning_rate": 5.0008158285654027e-05, + "loss": 1.7025, + "step": 4813 + }, + { + "epoch": 0.5000519372597901, + "grad_norm": 0.42419496178627014, + "learning_rate": 4.9991841714345985e-05, + "loss": 1.7601, + "step": 4814 + }, + { + "epoch": 0.5001558117793705, + "grad_norm": 0.39531058073043823, + "learning_rate": 4.997552514390674e-05, + "loss": 1.5614, + "step": 4815 + }, + { + "epoch": 0.5002596862989509, + "grad_norm": 0.415740430355072, + "learning_rate": 4.9959208576073866e-05, + "loss": 1.7504, + "step": 4816 + }, + { + "epoch": 0.5003635608185312, + "grad_norm": 0.4087817966938019, + "learning_rate": 4.994289201258498e-05, + "loss": 1.6628, + "step": 4817 + }, + { + "epoch": 0.5004674353381116, + "grad_norm": 0.39313584566116333, + "learning_rate": 4.992657545517764e-05, + "loss": 1.7745, + "step": 4818 + }, + { + "epoch": 0.5005713098576919, + "grad_norm": 0.3950080871582031, + "learning_rate": 4.9910258905589427e-05, + "loss": 1.6339, + "step": 4819 + }, + { + "epoch": 0.5006751843772722, + "grad_norm": 0.37852367758750916, + "learning_rate": 4.989394236555797e-05, + "loss": 1.6569, + "step": 4820 + }, + { + "epoch": 0.5007790588968526, + "grad_norm": 0.43450725078582764, + "learning_rate": 4.9877625836820806e-05, + "loss": 1.602, + "step": 4821 + }, + { + "epoch": 0.500882933416433, + "grad_norm": 0.42665791511535645, + "learning_rate": 4.986130932111553e-05, + "loss": 1.6783, + "step": 4822 + }, + { + "epoch": 0.5009868079360132, + "grad_norm": 0.396518737077713, + "learning_rate": 4.984499282017975e-05, + "loss": 1.6118, + "step": 4823 + }, + { + "epoch": 0.5010906824555936, + "grad_norm": 0.4622015655040741, + "learning_rate": 4.9828676335751006e-05, + "loss": 1.8231, + "step": 4824 + }, + { + "epoch": 0.501194556975174, + "grad_norm": 0.3886905908584595, + "learning_rate": 4.9812359869566896e-05, + "loss": 1.667, + "step": 4825 + }, + { + "epoch": 0.5012984314947543, + "grad_norm": 0.4054078161716461, + "learning_rate": 4.979604342336501e-05, + "loss": 1.6415, + "step": 4826 + }, + { + "epoch": 0.5014023060143347, + "grad_norm": 0.38207703828811646, + "learning_rate": 4.977972699888289e-05, + "loss": 1.7066, + "step": 4827 + }, + { + "epoch": 0.501506180533915, + "grad_norm": 0.43799397349357605, + "learning_rate": 4.976341059785815e-05, + "loss": 1.6469, + "step": 4828 + }, + { + "epoch": 0.5016100550534954, + "grad_norm": 0.5063971281051636, + "learning_rate": 4.974709422202831e-05, + "loss": 1.9436, + "step": 4829 + }, + { + "epoch": 0.5017139295730757, + "grad_norm": 0.39013898372650146, + "learning_rate": 4.973077787313099e-05, + "loss": 1.5765, + "step": 4830 + }, + { + "epoch": 0.5018178040926561, + "grad_norm": 0.4318859875202179, + "learning_rate": 4.9714461552903704e-05, + "loss": 1.8306, + "step": 4831 + }, + { + "epoch": 0.5019216786122365, + "grad_norm": 0.4619205892086029, + "learning_rate": 4.9698145263084036e-05, + "loss": 1.7797, + "step": 4832 + }, + { + "epoch": 0.5020255531318167, + "grad_norm": 0.3963909447193146, + "learning_rate": 4.968182900540956e-05, + "loss": 1.646, + "step": 4833 + }, + { + "epoch": 0.5021294276513971, + "grad_norm": 0.3879808187484741, + "learning_rate": 4.966551278161779e-05, + "loss": 1.6855, + "step": 4834 + }, + { + "epoch": 0.5022333021709775, + "grad_norm": 0.393045037984848, + "learning_rate": 4.964919659344632e-05, + "loss": 1.678, + "step": 4835 + }, + { + "epoch": 0.5023371766905578, + "grad_norm": 0.38027775287628174, + "learning_rate": 4.963288044263268e-05, + "loss": 1.6203, + "step": 4836 + }, + { + "epoch": 0.5024410512101382, + "grad_norm": 0.4136490821838379, + "learning_rate": 4.96165643309144e-05, + "loss": 1.772, + "step": 4837 + }, + { + "epoch": 0.5025449257297185, + "grad_norm": 0.3826773762702942, + "learning_rate": 4.9600248260029016e-05, + "loss": 1.8326, + "step": 4838 + }, + { + "epoch": 0.5026488002492988, + "grad_norm": 0.4194203317165375, + "learning_rate": 4.9583932231714095e-05, + "loss": 1.5652, + "step": 4839 + }, + { + "epoch": 0.5027526747688792, + "grad_norm": 0.4387541115283966, + "learning_rate": 4.956761624770713e-05, + "loss": 1.8814, + "step": 4840 + }, + { + "epoch": 0.5028565492884596, + "grad_norm": 0.3808096945285797, + "learning_rate": 4.9551300309745666e-05, + "loss": 1.748, + "step": 4841 + }, + { + "epoch": 0.5029604238080398, + "grad_norm": 0.4226476550102234, + "learning_rate": 4.953498441956723e-05, + "loss": 1.6055, + "step": 4842 + }, + { + "epoch": 0.5030642983276202, + "grad_norm": 0.4065408706665039, + "learning_rate": 4.951866857890931e-05, + "loss": 1.7516, + "step": 4843 + }, + { + "epoch": 0.5031681728472006, + "grad_norm": 0.3814634084701538, + "learning_rate": 4.950235278950943e-05, + "loss": 1.521, + "step": 4844 + }, + { + "epoch": 0.5032720473667809, + "grad_norm": 0.4002365469932556, + "learning_rate": 4.9486037053105125e-05, + "loss": 1.7387, + "step": 4845 + }, + { + "epoch": 0.5033759218863613, + "grad_norm": 0.39226168394088745, + "learning_rate": 4.946972137143383e-05, + "loss": 1.6565, + "step": 4846 + }, + { + "epoch": 0.5034797964059416, + "grad_norm": 0.425890177488327, + "learning_rate": 4.9453405746233076e-05, + "loss": 1.7404, + "step": 4847 + }, + { + "epoch": 0.5035836709255219, + "grad_norm": 0.41715189814567566, + "learning_rate": 4.943709017924038e-05, + "loss": 1.7237, + "step": 4848 + }, + { + "epoch": 0.5036875454451023, + "grad_norm": 0.4025704860687256, + "learning_rate": 4.9420774672193165e-05, + "loss": 1.4753, + "step": 4849 + }, + { + "epoch": 0.5037914199646827, + "grad_norm": 0.4412018060684204, + "learning_rate": 4.940445922682892e-05, + "loss": 1.9151, + "step": 4850 + }, + { + "epoch": 0.503895294484263, + "grad_norm": 0.3976673483848572, + "learning_rate": 4.938814384488515e-05, + "loss": 1.707, + "step": 4851 + }, + { + "epoch": 0.5039991690038433, + "grad_norm": 0.40271249413490295, + "learning_rate": 4.937182852809926e-05, + "loss": 1.658, + "step": 4852 + }, + { + "epoch": 0.5041030435234237, + "grad_norm": 0.4666357934474945, + "learning_rate": 4.935551327820873e-05, + "loss": 1.8472, + "step": 4853 + }, + { + "epoch": 0.5042069180430041, + "grad_norm": 0.42836159467697144, + "learning_rate": 4.933919809695103e-05, + "loss": 1.7557, + "step": 4854 + }, + { + "epoch": 0.5043107925625844, + "grad_norm": 0.44381582736968994, + "learning_rate": 4.9322882986063566e-05, + "loss": 1.7145, + "step": 4855 + }, + { + "epoch": 0.5044146670821648, + "grad_norm": 0.4695770740509033, + "learning_rate": 4.9306567947283785e-05, + "loss": 1.5709, + "step": 4856 + }, + { + "epoch": 0.5045185416017451, + "grad_norm": 0.40107208490371704, + "learning_rate": 4.92902529823491e-05, + "loss": 1.6956, + "step": 4857 + }, + { + "epoch": 0.5046224161213254, + "grad_norm": 0.39944395422935486, + "learning_rate": 4.927393809299696e-05, + "loss": 1.6442, + "step": 4858 + }, + { + "epoch": 0.5047262906409058, + "grad_norm": 0.42077764868736267, + "learning_rate": 4.925762328096473e-05, + "loss": 1.7561, + "step": 4859 + }, + { + "epoch": 0.5048301651604862, + "grad_norm": 0.4306979477405548, + "learning_rate": 4.924130854798983e-05, + "loss": 1.6888, + "step": 4860 + }, + { + "epoch": 0.5049340396800664, + "grad_norm": 0.41281387209892273, + "learning_rate": 4.922499389580967e-05, + "loss": 1.8061, + "step": 4861 + }, + { + "epoch": 0.5050379141996468, + "grad_norm": 0.4295770227909088, + "learning_rate": 4.920867932616159e-05, + "loss": 1.7401, + "step": 4862 + }, + { + "epoch": 0.5051417887192272, + "grad_norm": 0.37192273139953613, + "learning_rate": 4.9192364840783e-05, + "loss": 1.5993, + "step": 4863 + }, + { + "epoch": 0.5052456632388075, + "grad_norm": 0.4081692695617676, + "learning_rate": 4.917605044141127e-05, + "loss": 1.7359, + "step": 4864 + }, + { + "epoch": 0.5053495377583879, + "grad_norm": 0.40556600689888, + "learning_rate": 4.915973612978372e-05, + "loss": 1.6507, + "step": 4865 + }, + { + "epoch": 0.5054534122779683, + "grad_norm": 0.41496628522872925, + "learning_rate": 4.914342190763771e-05, + "loss": 1.6881, + "step": 4866 + }, + { + "epoch": 0.5055572867975485, + "grad_norm": 0.3876533806324005, + "learning_rate": 4.912710777671062e-05, + "loss": 1.5866, + "step": 4867 + }, + { + "epoch": 0.5056611613171289, + "grad_norm": 0.3975103795528412, + "learning_rate": 4.9110793738739715e-05, + "loss": 1.7478, + "step": 4868 + }, + { + "epoch": 0.5057650358367093, + "grad_norm": 0.37800195813179016, + "learning_rate": 4.909447979546235e-05, + "loss": 1.6304, + "step": 4869 + }, + { + "epoch": 0.5058689103562896, + "grad_norm": 0.3874056041240692, + "learning_rate": 4.907816594861582e-05, + "loss": 1.6151, + "step": 4870 + }, + { + "epoch": 0.5059727848758699, + "grad_norm": 0.42271357774734497, + "learning_rate": 4.906185219993743e-05, + "loss": 1.7042, + "step": 4871 + }, + { + "epoch": 0.5060766593954503, + "grad_norm": 0.44538614153862, + "learning_rate": 4.904553855116445e-05, + "loss": 1.7608, + "step": 4872 + }, + { + "epoch": 0.5061805339150306, + "grad_norm": 0.41812676191329956, + "learning_rate": 4.90292250040342e-05, + "loss": 1.7, + "step": 4873 + }, + { + "epoch": 0.506284408434611, + "grad_norm": 0.3845331370830536, + "learning_rate": 4.901291156028388e-05, + "loss": 1.5532, + "step": 4874 + }, + { + "epoch": 0.5063882829541914, + "grad_norm": 0.4065440595149994, + "learning_rate": 4.899659822165079e-05, + "loss": 1.7019, + "step": 4875 + }, + { + "epoch": 0.5064921574737716, + "grad_norm": 0.42397770285606384, + "learning_rate": 4.898028498987217e-05, + "loss": 1.7414, + "step": 4876 + }, + { + "epoch": 0.506596031993352, + "grad_norm": 0.38539576530456543, + "learning_rate": 4.8963971866685245e-05, + "loss": 1.7377, + "step": 4877 + }, + { + "epoch": 0.5066999065129324, + "grad_norm": 0.373605877161026, + "learning_rate": 4.894765885382722e-05, + "loss": 1.6698, + "step": 4878 + }, + { + "epoch": 0.5068037810325128, + "grad_norm": 0.36256158351898193, + "learning_rate": 4.8931345953035326e-05, + "loss": 1.5029, + "step": 4879 + }, + { + "epoch": 0.506907655552093, + "grad_norm": 0.39593711495399475, + "learning_rate": 4.891503316604677e-05, + "loss": 1.7055, + "step": 4880 + }, + { + "epoch": 0.5070115300716734, + "grad_norm": 0.3991319239139557, + "learning_rate": 4.889872049459869e-05, + "loss": 1.6002, + "step": 4881 + }, + { + "epoch": 0.5071154045912538, + "grad_norm": 0.3885091543197632, + "learning_rate": 4.88824079404283e-05, + "loss": 1.6086, + "step": 4882 + }, + { + "epoch": 0.5072192791108341, + "grad_norm": 0.41479891538619995, + "learning_rate": 4.8866095505272754e-05, + "loss": 1.4958, + "step": 4883 + }, + { + "epoch": 0.5073231536304145, + "grad_norm": 0.4320788085460663, + "learning_rate": 4.8849783190869186e-05, + "loss": 1.6549, + "step": 4884 + }, + { + "epoch": 0.5074270281499949, + "grad_norm": 0.49210458993911743, + "learning_rate": 4.883347099895473e-05, + "loss": 1.9594, + "step": 4885 + }, + { + "epoch": 0.5075309026695751, + "grad_norm": 0.38703057169914246, + "learning_rate": 4.8817158931266534e-05, + "loss": 1.5258, + "step": 4886 + }, + { + "epoch": 0.5076347771891555, + "grad_norm": 0.387260764837265, + "learning_rate": 4.880084698954166e-05, + "loss": 1.5779, + "step": 4887 + }, + { + "epoch": 0.5077386517087359, + "grad_norm": 0.42436152696609497, + "learning_rate": 4.878453517551722e-05, + "loss": 1.874, + "step": 4888 + }, + { + "epoch": 0.5078425262283162, + "grad_norm": 0.3863707184791565, + "learning_rate": 4.876822349093034e-05, + "loss": 1.4991, + "step": 4889 + }, + { + "epoch": 0.5079464007478965, + "grad_norm": 0.40059852600097656, + "learning_rate": 4.875191193751802e-05, + "loss": 1.5891, + "step": 4890 + }, + { + "epoch": 0.5080502752674769, + "grad_norm": 0.45580723881721497, + "learning_rate": 4.873560051701736e-05, + "loss": 1.8611, + "step": 4891 + }, + { + "epoch": 0.5081541497870572, + "grad_norm": 0.3890886902809143, + "learning_rate": 4.871928923116539e-05, + "loss": 1.5606, + "step": 4892 + }, + { + "epoch": 0.5082580243066376, + "grad_norm": 0.42675158381462097, + "learning_rate": 4.8702978081699104e-05, + "loss": 1.7001, + "step": 4893 + }, + { + "epoch": 0.508361898826218, + "grad_norm": 0.44091951847076416, + "learning_rate": 4.868666707035554e-05, + "loss": 1.8272, + "step": 4894 + }, + { + "epoch": 0.5084657733457982, + "grad_norm": 0.3957894444465637, + "learning_rate": 4.86703561988717e-05, + "loss": 1.8023, + "step": 4895 + }, + { + "epoch": 0.5085696478653786, + "grad_norm": 0.37731900811195374, + "learning_rate": 4.865404546898454e-05, + "loss": 1.6024, + "step": 4896 + }, + { + "epoch": 0.508673522384959, + "grad_norm": 0.41856059432029724, + "learning_rate": 4.863773488243105e-05, + "loss": 1.6815, + "step": 4897 + }, + { + "epoch": 0.5087773969045393, + "grad_norm": 0.45765843987464905, + "learning_rate": 4.862142444094816e-05, + "loss": 1.8893, + "step": 4898 + }, + { + "epoch": 0.5088812714241197, + "grad_norm": 0.4164111018180847, + "learning_rate": 4.8605114146272824e-05, + "loss": 1.6191, + "step": 4899 + }, + { + "epoch": 0.5089851459437, + "grad_norm": 0.4314156770706177, + "learning_rate": 4.858880400014194e-05, + "loss": 1.9152, + "step": 4900 + }, + { + "epoch": 0.5090890204632803, + "grad_norm": 0.42690524458885193, + "learning_rate": 4.857249400429244e-05, + "loss": 1.787, + "step": 4901 + }, + { + "epoch": 0.5091928949828607, + "grad_norm": 0.3987329602241516, + "learning_rate": 4.8556184160461167e-05, + "loss": 1.6278, + "step": 4902 + }, + { + "epoch": 0.5092967695024411, + "grad_norm": 0.427369624376297, + "learning_rate": 4.8539874470385024e-05, + "loss": 1.8381, + "step": 4903 + }, + { + "epoch": 0.5094006440220213, + "grad_norm": 0.39449864625930786, + "learning_rate": 4.852356493580088e-05, + "loss": 1.7702, + "step": 4904 + }, + { + "epoch": 0.5095045185416017, + "grad_norm": 0.4443662464618683, + "learning_rate": 4.850725555844555e-05, + "loss": 1.8187, + "step": 4905 + }, + { + "epoch": 0.5096083930611821, + "grad_norm": 0.42031386494636536, + "learning_rate": 4.8490946340055837e-05, + "loss": 1.7612, + "step": 4906 + }, + { + "epoch": 0.5097122675807625, + "grad_norm": 0.44448190927505493, + "learning_rate": 4.8474637282368577e-05, + "loss": 1.6873, + "step": 4907 + }, + { + "epoch": 0.5098161421003428, + "grad_norm": 0.4955507516860962, + "learning_rate": 4.8458328387120565e-05, + "loss": 2.1091, + "step": 4908 + }, + { + "epoch": 0.5099200166199231, + "grad_norm": 0.3980197012424469, + "learning_rate": 4.844201965604853e-05, + "loss": 1.6583, + "step": 4909 + }, + { + "epoch": 0.5100238911395035, + "grad_norm": 0.41697457432746887, + "learning_rate": 4.842571109088927e-05, + "loss": 1.5176, + "step": 4910 + }, + { + "epoch": 0.5101277656590838, + "grad_norm": 0.412379652261734, + "learning_rate": 4.840940269337949e-05, + "loss": 1.5699, + "step": 4911 + }, + { + "epoch": 0.5102316401786642, + "grad_norm": 0.420980840921402, + "learning_rate": 4.839309446525592e-05, + "loss": 1.6266, + "step": 4912 + }, + { + "epoch": 0.5103355146982446, + "grad_norm": 0.4162856340408325, + "learning_rate": 4.837678640825524e-05, + "loss": 1.6759, + "step": 4913 + }, + { + "epoch": 0.5104393892178248, + "grad_norm": 0.4442167580127716, + "learning_rate": 4.836047852411417e-05, + "loss": 1.7818, + "step": 4914 + }, + { + "epoch": 0.5105432637374052, + "grad_norm": 0.41404595971107483, + "learning_rate": 4.834417081456933e-05, + "loss": 1.7876, + "step": 4915 + }, + { + "epoch": 0.5106471382569856, + "grad_norm": 0.4493081271648407, + "learning_rate": 4.8327863281357376e-05, + "loss": 1.9106, + "step": 4916 + }, + { + "epoch": 0.5107510127765659, + "grad_norm": 0.379351407289505, + "learning_rate": 4.8311555926214966e-05, + "loss": 1.6729, + "step": 4917 + }, + { + "epoch": 0.5108548872961463, + "grad_norm": 0.40297406911849976, + "learning_rate": 4.829524875087867e-05, + "loss": 1.7575, + "step": 4918 + }, + { + "epoch": 0.5109587618157266, + "grad_norm": 0.4127398133277893, + "learning_rate": 4.8278941757085063e-05, + "loss": 1.5132, + "step": 4919 + }, + { + "epoch": 0.5110626363353069, + "grad_norm": 0.38783156871795654, + "learning_rate": 4.8262634946570764e-05, + "loss": 1.6706, + "step": 4920 + }, + { + "epoch": 0.5111665108548873, + "grad_norm": 0.39855116605758667, + "learning_rate": 4.824632832107227e-05, + "loss": 1.7722, + "step": 4921 + }, + { + "epoch": 0.5112703853744677, + "grad_norm": 0.38658004999160767, + "learning_rate": 4.8230021882326135e-05, + "loss": 1.7226, + "step": 4922 + }, + { + "epoch": 0.5113742598940479, + "grad_norm": 0.4114481210708618, + "learning_rate": 4.821371563206888e-05, + "loss": 1.7858, + "step": 4923 + }, + { + "epoch": 0.5114781344136283, + "grad_norm": 0.3694150745868683, + "learning_rate": 4.819740957203696e-05, + "loss": 1.4821, + "step": 4924 + }, + { + "epoch": 0.5115820089332087, + "grad_norm": 0.40538889169692993, + "learning_rate": 4.818110370396686e-05, + "loss": 1.6467, + "step": 4925 + }, + { + "epoch": 0.511685883452789, + "grad_norm": 0.4382588863372803, + "learning_rate": 4.816479802959503e-05, + "loss": 1.6146, + "step": 4926 + }, + { + "epoch": 0.5117897579723694, + "grad_norm": 0.41980645060539246, + "learning_rate": 4.81484925506579e-05, + "loss": 1.6783, + "step": 4927 + }, + { + "epoch": 0.5118936324919497, + "grad_norm": 0.413632333278656, + "learning_rate": 4.813218726889186e-05, + "loss": 1.749, + "step": 4928 + }, + { + "epoch": 0.51199750701153, + "grad_norm": 0.40364977717399597, + "learning_rate": 4.8115882186033326e-05, + "loss": 1.5492, + "step": 4929 + }, + { + "epoch": 0.5121013815311104, + "grad_norm": 0.3991188406944275, + "learning_rate": 4.8099577303818616e-05, + "loss": 1.7461, + "step": 4930 + }, + { + "epoch": 0.5122052560506908, + "grad_norm": 0.4893375337123871, + "learning_rate": 4.8083272623984104e-05, + "loss": 1.9981, + "step": 4931 + }, + { + "epoch": 0.5123091305702712, + "grad_norm": 0.40781837701797485, + "learning_rate": 4.8066968148266116e-05, + "loss": 1.7219, + "step": 4932 + }, + { + "epoch": 0.5124130050898514, + "grad_norm": 0.39397352933883667, + "learning_rate": 4.805066387840096e-05, + "loss": 1.6126, + "step": 4933 + }, + { + "epoch": 0.5125168796094318, + "grad_norm": 0.41246309876441956, + "learning_rate": 4.803435981612486e-05, + "loss": 1.7436, + "step": 4934 + }, + { + "epoch": 0.5126207541290122, + "grad_norm": 0.3805597424507141, + "learning_rate": 4.801805596317412e-05, + "loss": 1.6401, + "step": 4935 + }, + { + "epoch": 0.5127246286485925, + "grad_norm": 0.404367059469223, + "learning_rate": 4.800175232128498e-05, + "loss": 1.6504, + "step": 4936 + }, + { + "epoch": 0.5128285031681729, + "grad_norm": 0.3783699870109558, + "learning_rate": 4.79854488921936e-05, + "loss": 1.6495, + "step": 4937 + }, + { + "epoch": 0.5129323776877532, + "grad_norm": 0.48037418723106384, + "learning_rate": 4.7969145677636215e-05, + "loss": 1.6088, + "step": 4938 + }, + { + "epoch": 0.5130362522073335, + "grad_norm": 0.4683350622653961, + "learning_rate": 4.795284267934898e-05, + "loss": 1.8945, + "step": 4939 + }, + { + "epoch": 0.5131401267269139, + "grad_norm": 0.4192398488521576, + "learning_rate": 4.793653989906802e-05, + "loss": 1.8459, + "step": 4940 + }, + { + "epoch": 0.5132440012464943, + "grad_norm": 0.3951408565044403, + "learning_rate": 4.792023733852947e-05, + "loss": 1.6314, + "step": 4941 + }, + { + "epoch": 0.5133478757660745, + "grad_norm": 0.38078486919403076, + "learning_rate": 4.790393499946943e-05, + "loss": 1.6349, + "step": 4942 + }, + { + "epoch": 0.5134517502856549, + "grad_norm": 0.4152866005897522, + "learning_rate": 4.788763288362394e-05, + "loss": 1.7195, + "step": 4943 + }, + { + "epoch": 0.5135556248052353, + "grad_norm": 0.37676650285720825, + "learning_rate": 4.787133099272908e-05, + "loss": 1.5917, + "step": 4944 + }, + { + "epoch": 0.5136594993248156, + "grad_norm": 0.3816780149936676, + "learning_rate": 4.7855029328520876e-05, + "loss": 1.6664, + "step": 4945 + }, + { + "epoch": 0.513763373844396, + "grad_norm": 0.3960273265838623, + "learning_rate": 4.7838727892735315e-05, + "loss": 1.6255, + "step": 4946 + }, + { + "epoch": 0.5138672483639763, + "grad_norm": 0.41314899921417236, + "learning_rate": 4.782242668710836e-05, + "loss": 1.7316, + "step": 4947 + }, + { + "epoch": 0.5139711228835566, + "grad_norm": 0.3930761516094208, + "learning_rate": 4.7806125713376e-05, + "loss": 1.6513, + "step": 4948 + }, + { + "epoch": 0.514074997403137, + "grad_norm": 0.4532455503940582, + "learning_rate": 4.7789824973274114e-05, + "loss": 1.6415, + "step": 4949 + }, + { + "epoch": 0.5141788719227174, + "grad_norm": 0.4238525927066803, + "learning_rate": 4.7773524468538624e-05, + "loss": 1.8161, + "step": 4950 + }, + { + "epoch": 0.5142827464422977, + "grad_norm": 0.40888962149620056, + "learning_rate": 4.7757224200905444e-05, + "loss": 1.7179, + "step": 4951 + }, + { + "epoch": 0.514386620961878, + "grad_norm": 0.38617759943008423, + "learning_rate": 4.774092417211037e-05, + "loss": 1.6069, + "step": 4952 + }, + { + "epoch": 0.5144904954814584, + "grad_norm": 0.41479814052581787, + "learning_rate": 4.7724624383889256e-05, + "loss": 1.7158, + "step": 4953 + }, + { + "epoch": 0.5145943700010387, + "grad_norm": 0.38144171237945557, + "learning_rate": 4.770832483797789e-05, + "loss": 1.6917, + "step": 4954 + }, + { + "epoch": 0.5146982445206191, + "grad_norm": 0.41436558961868286, + "learning_rate": 4.769202553611206e-05, + "loss": 1.817, + "step": 4955 + }, + { + "epoch": 0.5148021190401995, + "grad_norm": 0.43558332324028015, + "learning_rate": 4.76757264800275e-05, + "loss": 1.7038, + "step": 4956 + }, + { + "epoch": 0.5149059935597798, + "grad_norm": 0.3971516788005829, + "learning_rate": 4.765942767145994e-05, + "loss": 1.6663, + "step": 4957 + }, + { + "epoch": 0.5150098680793601, + "grad_norm": 0.39739498496055603, + "learning_rate": 4.764312911214509e-05, + "loss": 1.5594, + "step": 4958 + }, + { + "epoch": 0.5151137425989405, + "grad_norm": 0.408383846282959, + "learning_rate": 4.762683080381859e-05, + "loss": 1.7743, + "step": 4959 + }, + { + "epoch": 0.5152176171185209, + "grad_norm": 0.39290642738342285, + "learning_rate": 4.761053274821611e-05, + "loss": 1.6531, + "step": 4960 + }, + { + "epoch": 0.5153214916381011, + "grad_norm": 0.46999290585517883, + "learning_rate": 4.7594234947073256e-05, + "loss": 1.8683, + "step": 4961 + }, + { + "epoch": 0.5154253661576815, + "grad_norm": 0.40802252292633057, + "learning_rate": 4.75779374021256e-05, + "loss": 1.5823, + "step": 4962 + }, + { + "epoch": 0.5155292406772619, + "grad_norm": 0.42017194628715515, + "learning_rate": 4.756164011510871e-05, + "loss": 1.802, + "step": 4963 + }, + { + "epoch": 0.5156331151968422, + "grad_norm": 0.4172336161136627, + "learning_rate": 4.754534308775816e-05, + "loss": 1.7046, + "step": 4964 + }, + { + "epoch": 0.5157369897164226, + "grad_norm": 0.38955143094062805, + "learning_rate": 4.75290463218094e-05, + "loss": 1.5606, + "step": 4965 + }, + { + "epoch": 0.515840864236003, + "grad_norm": 0.47302696108818054, + "learning_rate": 4.7512749818997936e-05, + "loss": 1.6814, + "step": 4966 + }, + { + "epoch": 0.5159447387555832, + "grad_norm": 0.40039142966270447, + "learning_rate": 4.749645358105923e-05, + "loss": 1.6354, + "step": 4967 + }, + { + "epoch": 0.5160486132751636, + "grad_norm": 0.3926571011543274, + "learning_rate": 4.748015760972867e-05, + "loss": 1.6332, + "step": 4968 + }, + { + "epoch": 0.516152487794744, + "grad_norm": 0.4233894348144531, + "learning_rate": 4.7463861906741666e-05, + "loss": 1.3089, + "step": 4969 + }, + { + "epoch": 0.5162563623143243, + "grad_norm": 0.3955030143260956, + "learning_rate": 4.7447566473833615e-05, + "loss": 1.5917, + "step": 4970 + }, + { + "epoch": 0.5163602368339046, + "grad_norm": 0.3940108120441437, + "learning_rate": 4.743127131273979e-05, + "loss": 1.7515, + "step": 4971 + }, + { + "epoch": 0.516464111353485, + "grad_norm": 0.44134819507598877, + "learning_rate": 4.7414976425195544e-05, + "loss": 1.5085, + "step": 4972 + }, + { + "epoch": 0.5165679858730653, + "grad_norm": 0.44649553298950195, + "learning_rate": 4.739868181293616e-05, + "loss": 1.8201, + "step": 4973 + }, + { + "epoch": 0.5166718603926457, + "grad_norm": 0.4249851703643799, + "learning_rate": 4.738238747769687e-05, + "loss": 1.7515, + "step": 4974 + }, + { + "epoch": 0.5167757349122261, + "grad_norm": 0.41049546003341675, + "learning_rate": 4.7366093421212884e-05, + "loss": 1.6677, + "step": 4975 + }, + { + "epoch": 0.5168796094318063, + "grad_norm": 0.4584263563156128, + "learning_rate": 4.7349799645219415e-05, + "loss": 1.7249, + "step": 4976 + }, + { + "epoch": 0.5169834839513867, + "grad_norm": 0.4268789291381836, + "learning_rate": 4.73335061514516e-05, + "loss": 1.6639, + "step": 4977 + }, + { + "epoch": 0.5170873584709671, + "grad_norm": 0.4349389672279358, + "learning_rate": 4.731721294164457e-05, + "loss": 1.6422, + "step": 4978 + }, + { + "epoch": 0.5171912329905474, + "grad_norm": 0.43998467922210693, + "learning_rate": 4.730092001753346e-05, + "loss": 1.8851, + "step": 4979 + }, + { + "epoch": 0.5172951075101277, + "grad_norm": 0.4167867600917816, + "learning_rate": 4.72846273808533e-05, + "loss": 1.6397, + "step": 4980 + }, + { + "epoch": 0.5173989820297081, + "grad_norm": 0.40250730514526367, + "learning_rate": 4.726833503333915e-05, + "loss": 1.4696, + "step": 4981 + }, + { + "epoch": 0.5175028565492885, + "grad_norm": 0.44303545355796814, + "learning_rate": 4.7252042976726e-05, + "loss": 1.814, + "step": 4982 + }, + { + "epoch": 0.5176067310688688, + "grad_norm": 0.3853762745857239, + "learning_rate": 4.723575121274885e-05, + "loss": 1.6521, + "step": 4983 + }, + { + "epoch": 0.5177106055884492, + "grad_norm": 0.39856797456741333, + "learning_rate": 4.721945974314263e-05, + "loss": 1.6085, + "step": 4984 + }, + { + "epoch": 0.5178144801080296, + "grad_norm": 0.380560040473938, + "learning_rate": 4.720316856964224e-05, + "loss": 1.6189, + "step": 4985 + }, + { + "epoch": 0.5179183546276098, + "grad_norm": 0.4493720531463623, + "learning_rate": 4.7186877693982616e-05, + "loss": 1.7474, + "step": 4986 + }, + { + "epoch": 0.5180222291471902, + "grad_norm": 0.38229069113731384, + "learning_rate": 4.717058711789855e-05, + "loss": 1.5625, + "step": 4987 + }, + { + "epoch": 0.5181261036667706, + "grad_norm": 0.3856019675731659, + "learning_rate": 4.715429684312489e-05, + "loss": 1.6004, + "step": 4988 + }, + { + "epoch": 0.5182299781863509, + "grad_norm": 0.3876418471336365, + "learning_rate": 4.713800687139644e-05, + "loss": 1.6596, + "step": 4989 + }, + { + "epoch": 0.5183338527059312, + "grad_norm": 0.41648218035697937, + "learning_rate": 4.712171720444791e-05, + "loss": 1.8412, + "step": 4990 + }, + { + "epoch": 0.5184377272255116, + "grad_norm": 0.43493372201919556, + "learning_rate": 4.7105427844014046e-05, + "loss": 1.8457, + "step": 4991 + }, + { + "epoch": 0.5185416017450919, + "grad_norm": 0.37969422340393066, + "learning_rate": 4.7089138791829554e-05, + "loss": 1.6451, + "step": 4992 + }, + { + "epoch": 0.5186454762646723, + "grad_norm": 0.4082360863685608, + "learning_rate": 4.707285004962906e-05, + "loss": 1.8016, + "step": 4993 + }, + { + "epoch": 0.5187493507842527, + "grad_norm": 0.42352405190467834, + "learning_rate": 4.70565616191472e-05, + "loss": 1.5526, + "step": 4994 + }, + { + "epoch": 0.5188532253038329, + "grad_norm": 0.3847419321537018, + "learning_rate": 4.704027350211859e-05, + "loss": 1.6518, + "step": 4995 + }, + { + "epoch": 0.5189570998234133, + "grad_norm": 0.40276169776916504, + "learning_rate": 4.702398570027775e-05, + "loss": 1.5955, + "step": 4996 + }, + { + "epoch": 0.5190609743429937, + "grad_norm": 0.44198933243751526, + "learning_rate": 4.700769821535921e-05, + "loss": 1.7202, + "step": 4997 + }, + { + "epoch": 0.519164848862574, + "grad_norm": 0.39979052543640137, + "learning_rate": 4.6991411049097494e-05, + "loss": 1.6186, + "step": 4998 + }, + { + "epoch": 0.5192687233821544, + "grad_norm": 0.39757975935935974, + "learning_rate": 4.697512420322701e-05, + "loss": 1.7077, + "step": 4999 + }, + { + "epoch": 0.5193725979017347, + "grad_norm": 0.3819902837276459, + "learning_rate": 4.69588376794822e-05, + "loss": 1.5771, + "step": 5000 + }, + { + "epoch": 0.519476472421315, + "grad_norm": 0.3874730169773102, + "learning_rate": 4.694255147959749e-05, + "loss": 1.6497, + "step": 5001 + }, + { + "epoch": 0.5195803469408954, + "grad_norm": 0.38986441493034363, + "learning_rate": 4.6926265605307175e-05, + "loss": 1.6801, + "step": 5002 + }, + { + "epoch": 0.5196842214604758, + "grad_norm": 0.4043687880039215, + "learning_rate": 4.690998005834559e-05, + "loss": 1.5736, + "step": 5003 + }, + { + "epoch": 0.519788095980056, + "grad_norm": 0.4174960255622864, + "learning_rate": 4.689369484044705e-05, + "loss": 1.7306, + "step": 5004 + }, + { + "epoch": 0.5198919704996364, + "grad_norm": 0.43060052394866943, + "learning_rate": 4.687740995334575e-05, + "loss": 1.7073, + "step": 5005 + }, + { + "epoch": 0.5199958450192168, + "grad_norm": 0.405344158411026, + "learning_rate": 4.686112539877594e-05, + "loss": 1.7753, + "step": 5006 + }, + { + "epoch": 0.5200997195387971, + "grad_norm": 0.39156869053840637, + "learning_rate": 4.684484117847181e-05, + "loss": 1.5766, + "step": 5007 + }, + { + "epoch": 0.5202035940583775, + "grad_norm": 0.40544578433036804, + "learning_rate": 4.6828557294167465e-05, + "loss": 1.6717, + "step": 5008 + }, + { + "epoch": 0.5203074685779578, + "grad_norm": 0.41265055537223816, + "learning_rate": 4.681227374759704e-05, + "loss": 1.7227, + "step": 5009 + }, + { + "epoch": 0.5204113430975382, + "grad_norm": 0.3951095640659332, + "learning_rate": 4.679599054049458e-05, + "loss": 1.6448, + "step": 5010 + }, + { + "epoch": 0.5205152176171185, + "grad_norm": 0.4113246500492096, + "learning_rate": 4.677970767459416e-05, + "loss": 1.7294, + "step": 5011 + }, + { + "epoch": 0.5206190921366989, + "grad_norm": 0.40176793932914734, + "learning_rate": 4.676342515162973e-05, + "loss": 1.6219, + "step": 5012 + }, + { + "epoch": 0.5207229666562793, + "grad_norm": 0.6003035306930542, + "learning_rate": 4.674714297333527e-05, + "loss": 1.99, + "step": 5013 + }, + { + "epoch": 0.5208268411758595, + "grad_norm": 0.36669179797172546, + "learning_rate": 4.673086114144474e-05, + "loss": 1.5943, + "step": 5014 + }, + { + "epoch": 0.5209307156954399, + "grad_norm": 0.3950868546962738, + "learning_rate": 4.671457965769198e-05, + "loss": 1.6115, + "step": 5015 + }, + { + "epoch": 0.5210345902150203, + "grad_norm": 0.4209156036376953, + "learning_rate": 4.6698298523810854e-05, + "loss": 1.6077, + "step": 5016 + }, + { + "epoch": 0.5211384647346006, + "grad_norm": 0.38038328289985657, + "learning_rate": 4.668201774153521e-05, + "loss": 1.5769, + "step": 5017 + }, + { + "epoch": 0.521242339254181, + "grad_norm": 0.4059074819087982, + "learning_rate": 4.666573731259877e-05, + "loss": 1.6872, + "step": 5018 + }, + { + "epoch": 0.5213462137737613, + "grad_norm": 0.3864630162715912, + "learning_rate": 4.664945723873529e-05, + "loss": 1.6696, + "step": 5019 + }, + { + "epoch": 0.5214500882933416, + "grad_norm": 0.3989625871181488, + "learning_rate": 4.66331775216785e-05, + "loss": 1.716, + "step": 5020 + }, + { + "epoch": 0.521553962812922, + "grad_norm": 0.376228392124176, + "learning_rate": 4.6616898163162024e-05, + "loss": 1.6107, + "step": 5021 + }, + { + "epoch": 0.5216578373325024, + "grad_norm": 0.40415439009666443, + "learning_rate": 4.660061916491952e-05, + "loss": 1.6419, + "step": 5022 + }, + { + "epoch": 0.5217617118520826, + "grad_norm": 0.4512626528739929, + "learning_rate": 4.658434052868455e-05, + "loss": 1.8734, + "step": 5023 + }, + { + "epoch": 0.521865586371663, + "grad_norm": 0.5184199810028076, + "learning_rate": 4.656806225619067e-05, + "loss": 1.8781, + "step": 5024 + }, + { + "epoch": 0.5219694608912434, + "grad_norm": 0.4308519661426544, + "learning_rate": 4.655178434917139e-05, + "loss": 1.7415, + "step": 5025 + }, + { + "epoch": 0.5220733354108237, + "grad_norm": 0.41828837990760803, + "learning_rate": 4.65355068093602e-05, + "loss": 1.6995, + "step": 5026 + }, + { + "epoch": 0.5221772099304041, + "grad_norm": 0.3959868252277374, + "learning_rate": 4.651922963849048e-05, + "loss": 1.7536, + "step": 5027 + }, + { + "epoch": 0.5222810844499844, + "grad_norm": 0.427030473947525, + "learning_rate": 4.6502952838295654e-05, + "loss": 1.6309, + "step": 5028 + }, + { + "epoch": 0.5223849589695647, + "grad_norm": 0.3931408226490021, + "learning_rate": 4.64866764105091e-05, + "loss": 1.6537, + "step": 5029 + }, + { + "epoch": 0.5224888334891451, + "grad_norm": 0.343965619802475, + "learning_rate": 4.647040035686409e-05, + "loss": 1.3474, + "step": 5030 + }, + { + "epoch": 0.5225927080087255, + "grad_norm": 0.40805351734161377, + "learning_rate": 4.64541246790939e-05, + "loss": 1.6824, + "step": 5031 + }, + { + "epoch": 0.5226965825283058, + "grad_norm": 0.39621418714523315, + "learning_rate": 4.643784937893179e-05, + "loss": 1.6969, + "step": 5032 + }, + { + "epoch": 0.5228004570478861, + "grad_norm": 0.38394787907600403, + "learning_rate": 4.642157445811092e-05, + "loss": 1.614, + "step": 5033 + }, + { + "epoch": 0.5229043315674665, + "grad_norm": 0.3906930685043335, + "learning_rate": 4.6405299918364456e-05, + "loss": 1.7664, + "step": 5034 + }, + { + "epoch": 0.5230082060870469, + "grad_norm": 0.37266916036605835, + "learning_rate": 4.638902576142552e-05, + "loss": 1.4807, + "step": 5035 + }, + { + "epoch": 0.5231120806066272, + "grad_norm": 0.4071597456932068, + "learning_rate": 4.637275198902717e-05, + "loss": 1.4604, + "step": 5036 + }, + { + "epoch": 0.5232159551262076, + "grad_norm": 0.4267309606075287, + "learning_rate": 4.635647860290244e-05, + "loss": 1.7642, + "step": 5037 + }, + { + "epoch": 0.5233198296457879, + "grad_norm": 0.4255138635635376, + "learning_rate": 4.634020560478431e-05, + "loss": 1.7285, + "step": 5038 + }, + { + "epoch": 0.5234237041653682, + "grad_norm": 0.3941444456577301, + "learning_rate": 4.6323932996405764e-05, + "loss": 1.665, + "step": 5039 + }, + { + "epoch": 0.5235275786849486, + "grad_norm": 0.4022713005542755, + "learning_rate": 4.630766077949965e-05, + "loss": 1.731, + "step": 5040 + }, + { + "epoch": 0.523631453204529, + "grad_norm": 0.3884924352169037, + "learning_rate": 4.629138895579886e-05, + "loss": 1.541, + "step": 5041 + }, + { + "epoch": 0.5237353277241092, + "grad_norm": 0.41130009293556213, + "learning_rate": 4.6275117527036246e-05, + "loss": 1.6621, + "step": 5042 + }, + { + "epoch": 0.5238392022436896, + "grad_norm": 0.3775813579559326, + "learning_rate": 4.625884649494453e-05, + "loss": 1.5298, + "step": 5043 + }, + { + "epoch": 0.52394307676327, + "grad_norm": 0.4054543375968933, + "learning_rate": 4.6242575861256496e-05, + "loss": 1.8429, + "step": 5044 + }, + { + "epoch": 0.5240469512828503, + "grad_norm": 0.3855196535587311, + "learning_rate": 4.622630562770483e-05, + "loss": 1.589, + "step": 5045 + }, + { + "epoch": 0.5241508258024307, + "grad_norm": 0.3769000470638275, + "learning_rate": 4.6210035796022166e-05, + "loss": 1.6403, + "step": 5046 + }, + { + "epoch": 0.524254700322011, + "grad_norm": 0.4284866750240326, + "learning_rate": 4.619376636794113e-05, + "loss": 1.6255, + "step": 5047 + }, + { + "epoch": 0.5243585748415913, + "grad_norm": 0.40864747762680054, + "learning_rate": 4.61774973451943e-05, + "loss": 1.627, + "step": 5048 + }, + { + "epoch": 0.5244624493611717, + "grad_norm": 0.4019078016281128, + "learning_rate": 4.6161228729514164e-05, + "loss": 1.5215, + "step": 5049 + }, + { + "epoch": 0.5245663238807521, + "grad_norm": 0.40582525730133057, + "learning_rate": 4.614496052263324e-05, + "loss": 1.615, + "step": 5050 + }, + { + "epoch": 0.5246701984003324, + "grad_norm": 0.4704711437225342, + "learning_rate": 4.612869272628394e-05, + "loss": 1.8979, + "step": 5051 + }, + { + "epoch": 0.5247740729199127, + "grad_norm": 0.4247235059738159, + "learning_rate": 4.611242534219867e-05, + "loss": 1.7807, + "step": 5052 + }, + { + "epoch": 0.5248779474394931, + "grad_norm": 0.4381190240383148, + "learning_rate": 4.609615837210977e-05, + "loss": 1.6944, + "step": 5053 + }, + { + "epoch": 0.5249818219590734, + "grad_norm": 0.37743279337882996, + "learning_rate": 4.6079891817749576e-05, + "loss": 1.5408, + "step": 5054 + }, + { + "epoch": 0.5250856964786538, + "grad_norm": 0.41715410351753235, + "learning_rate": 4.6063625680850306e-05, + "loss": 1.8648, + "step": 5055 + }, + { + "epoch": 0.5251895709982342, + "grad_norm": 0.3836834132671356, + "learning_rate": 4.6047359963144187e-05, + "loss": 1.5844, + "step": 5056 + }, + { + "epoch": 0.5252934455178144, + "grad_norm": 0.4763341248035431, + "learning_rate": 4.6031094666363426e-05, + "loss": 1.9145, + "step": 5057 + }, + { + "epoch": 0.5253973200373948, + "grad_norm": 0.5105146765708923, + "learning_rate": 4.601482979224012e-05, + "loss": 1.9522, + "step": 5058 + }, + { + "epoch": 0.5255011945569752, + "grad_norm": 0.4624611735343933, + "learning_rate": 4.599856534250633e-05, + "loss": 1.7487, + "step": 5059 + }, + { + "epoch": 0.5256050690765556, + "grad_norm": 0.46096524596214294, + "learning_rate": 4.5982301318894126e-05, + "loss": 1.7626, + "step": 5060 + }, + { + "epoch": 0.5257089435961358, + "grad_norm": 0.42811891436576843, + "learning_rate": 4.596603772313552e-05, + "loss": 1.9035, + "step": 5061 + }, + { + "epoch": 0.5258128181157162, + "grad_norm": 0.38237836956977844, + "learning_rate": 4.59497745569624e-05, + "loss": 1.654, + "step": 5062 + }, + { + "epoch": 0.5259166926352966, + "grad_norm": 0.38058823347091675, + "learning_rate": 4.593351182210671e-05, + "loss": 1.6587, + "step": 5063 + }, + { + "epoch": 0.5260205671548769, + "grad_norm": 0.4481144845485687, + "learning_rate": 4.59172495203003e-05, + "loss": 1.7732, + "step": 5064 + }, + { + "epoch": 0.5261244416744573, + "grad_norm": 0.3913651406764984, + "learning_rate": 4.590098765327496e-05, + "loss": 1.7154, + "step": 5065 + }, + { + "epoch": 0.5262283161940376, + "grad_norm": 0.3957405388355255, + "learning_rate": 4.588472622276245e-05, + "loss": 1.7209, + "step": 5066 + }, + { + "epoch": 0.5263321907136179, + "grad_norm": 0.4275793135166168, + "learning_rate": 4.5868465230494525e-05, + "loss": 1.7113, + "step": 5067 + }, + { + "epoch": 0.5264360652331983, + "grad_norm": 0.39351120591163635, + "learning_rate": 4.58522046782028e-05, + "loss": 1.7318, + "step": 5068 + }, + { + "epoch": 0.5265399397527787, + "grad_norm": 0.42391449213027954, + "learning_rate": 4.5835944567618916e-05, + "loss": 1.721, + "step": 5069 + }, + { + "epoch": 0.526643814272359, + "grad_norm": 0.42262449860572815, + "learning_rate": 4.5819684900474486e-05, + "loss": 1.7771, + "step": 5070 + }, + { + "epoch": 0.5267476887919393, + "grad_norm": 0.4281655251979828, + "learning_rate": 4.580342567850098e-05, + "loss": 1.6757, + "step": 5071 + }, + { + "epoch": 0.5268515633115197, + "grad_norm": 0.4202190935611725, + "learning_rate": 4.578716690342992e-05, + "loss": 1.597, + "step": 5072 + }, + { + "epoch": 0.5269554378311, + "grad_norm": 0.39920350909233093, + "learning_rate": 4.577090857699273e-05, + "loss": 1.4776, + "step": 5073 + }, + { + "epoch": 0.5270593123506804, + "grad_norm": 0.4190593361854553, + "learning_rate": 4.575465070092077e-05, + "loss": 1.7005, + "step": 5074 + }, + { + "epoch": 0.5271631868702608, + "grad_norm": 0.4298344850540161, + "learning_rate": 4.573839327694539e-05, + "loss": 1.7374, + "step": 5075 + }, + { + "epoch": 0.527267061389841, + "grad_norm": 0.3944896161556244, + "learning_rate": 4.5722136306797914e-05, + "loss": 1.7146, + "step": 5076 + }, + { + "epoch": 0.5273709359094214, + "grad_norm": 0.44215551018714905, + "learning_rate": 4.570587979220953e-05, + "loss": 1.7662, + "step": 5077 + }, + { + "epoch": 0.5274748104290018, + "grad_norm": 0.3969592750072479, + "learning_rate": 4.568962373491146e-05, + "loss": 1.5801, + "step": 5078 + }, + { + "epoch": 0.5275786849485821, + "grad_norm": 0.4148666262626648, + "learning_rate": 4.567336813663484e-05, + "loss": 1.6804, + "step": 5079 + }, + { + "epoch": 0.5276825594681624, + "grad_norm": 0.3804900050163269, + "learning_rate": 4.565711299911076e-05, + "loss": 1.6895, + "step": 5080 + }, + { + "epoch": 0.5277864339877428, + "grad_norm": 0.4792127013206482, + "learning_rate": 4.564085832407026e-05, + "loss": 1.9221, + "step": 5081 + }, + { + "epoch": 0.5278903085073231, + "grad_norm": 0.36079883575439453, + "learning_rate": 4.5624604113244376e-05, + "loss": 1.6225, + "step": 5082 + }, + { + "epoch": 0.5279941830269035, + "grad_norm": 0.3568095266819, + "learning_rate": 4.5608350368363995e-05, + "loss": 1.4058, + "step": 5083 + }, + { + "epoch": 0.5280980575464839, + "grad_norm": 0.43193015456199646, + "learning_rate": 4.559209709116005e-05, + "loss": 1.4616, + "step": 5084 + }, + { + "epoch": 0.5282019320660641, + "grad_norm": 0.43650293350219727, + "learning_rate": 4.557584428336339e-05, + "loss": 1.7984, + "step": 5085 + }, + { + "epoch": 0.5283058065856445, + "grad_norm": 0.3841239809989929, + "learning_rate": 4.555959194670481e-05, + "loss": 1.6124, + "step": 5086 + }, + { + "epoch": 0.5284096811052249, + "grad_norm": 0.3851770758628845, + "learning_rate": 4.554334008291503e-05, + "loss": 1.629, + "step": 5087 + }, + { + "epoch": 0.5285135556248053, + "grad_norm": 0.4100612998008728, + "learning_rate": 4.5527088693724775e-05, + "loss": 1.6488, + "step": 5088 + }, + { + "epoch": 0.5286174301443856, + "grad_norm": 0.42137911915779114, + "learning_rate": 4.5510837780864705e-05, + "loss": 1.8, + "step": 5089 + }, + { + "epoch": 0.5287213046639659, + "grad_norm": 0.5062525868415833, + "learning_rate": 4.549458734606537e-05, + "loss": 1.6027, + "step": 5090 + }, + { + "epoch": 0.5288251791835463, + "grad_norm": 0.3984263241291046, + "learning_rate": 4.547833739105735e-05, + "loss": 1.4691, + "step": 5091 + }, + { + "epoch": 0.5289290537031266, + "grad_norm": 0.4082936644554138, + "learning_rate": 4.5462087917571124e-05, + "loss": 1.7615, + "step": 5092 + }, + { + "epoch": 0.529032928222707, + "grad_norm": 0.43450236320495605, + "learning_rate": 4.544583892733714e-05, + "loss": 1.6093, + "step": 5093 + }, + { + "epoch": 0.5291368027422874, + "grad_norm": 0.4008505344390869, + "learning_rate": 4.5429590422085776e-05, + "loss": 1.7545, + "step": 5094 + }, + { + "epoch": 0.5292406772618676, + "grad_norm": 0.41801589727401733, + "learning_rate": 4.5413342403547403e-05, + "loss": 1.6709, + "step": 5095 + }, + { + "epoch": 0.529344551781448, + "grad_norm": 0.39262089133262634, + "learning_rate": 4.539709487345227e-05, + "loss": 1.4997, + "step": 5096 + }, + { + "epoch": 0.5294484263010284, + "grad_norm": 0.41034695506095886, + "learning_rate": 4.5380847833530625e-05, + "loss": 1.803, + "step": 5097 + }, + { + "epoch": 0.5295523008206087, + "grad_norm": 0.42729073762893677, + "learning_rate": 4.5364601285512674e-05, + "loss": 1.589, + "step": 5098 + }, + { + "epoch": 0.529656175340189, + "grad_norm": 0.3822977542877197, + "learning_rate": 4.534835523112852e-05, + "loss": 1.6351, + "step": 5099 + }, + { + "epoch": 0.5297600498597694, + "grad_norm": 0.4156366288661957, + "learning_rate": 4.533210967210824e-05, + "loss": 1.7417, + "step": 5100 + }, + { + "epoch": 0.5298639243793497, + "grad_norm": 0.4239443242549896, + "learning_rate": 4.5315864610181894e-05, + "loss": 1.9393, + "step": 5101 + }, + { + "epoch": 0.5299677988989301, + "grad_norm": 0.40841466188430786, + "learning_rate": 4.52996200470794e-05, + "loss": 1.5193, + "step": 5102 + }, + { + "epoch": 0.5300716734185105, + "grad_norm": 0.4005943238735199, + "learning_rate": 4.528337598453071e-05, + "loss": 1.6073, + "step": 5103 + }, + { + "epoch": 0.5301755479380907, + "grad_norm": 0.3967158794403076, + "learning_rate": 4.5267132424265705e-05, + "loss": 1.6722, + "step": 5104 + }, + { + "epoch": 0.5302794224576711, + "grad_norm": 0.3896438181400299, + "learning_rate": 4.5250889368014166e-05, + "loss": 1.684, + "step": 5105 + }, + { + "epoch": 0.5303832969772515, + "grad_norm": 0.4113629460334778, + "learning_rate": 4.523464681750587e-05, + "loss": 1.5668, + "step": 5106 + }, + { + "epoch": 0.5304871714968318, + "grad_norm": 0.39420047402381897, + "learning_rate": 4.521840477447051e-05, + "loss": 1.8714, + "step": 5107 + }, + { + "epoch": 0.5305910460164122, + "grad_norm": 0.3833891451358795, + "learning_rate": 4.5202163240637755e-05, + "loss": 1.6501, + "step": 5108 + }, + { + "epoch": 0.5306949205359925, + "grad_norm": 0.40579354763031006, + "learning_rate": 4.5185922217737175e-05, + "loss": 1.5287, + "step": 5109 + }, + { + "epoch": 0.5307987950555728, + "grad_norm": 0.39992040395736694, + "learning_rate": 4.5169681707498354e-05, + "loss": 1.7683, + "step": 5110 + }, + { + "epoch": 0.5309026695751532, + "grad_norm": 0.3796159327030182, + "learning_rate": 4.515344171165072e-05, + "loss": 1.4853, + "step": 5111 + }, + { + "epoch": 0.5310065440947336, + "grad_norm": 0.3873627185821533, + "learning_rate": 4.5137202231923744e-05, + "loss": 1.7049, + "step": 5112 + }, + { + "epoch": 0.531110418614314, + "grad_norm": 0.42043253779411316, + "learning_rate": 4.512096327004682e-05, + "loss": 1.822, + "step": 5113 + }, + { + "epoch": 0.5312142931338942, + "grad_norm": 0.4138578176498413, + "learning_rate": 4.5104724827749254e-05, + "loss": 1.8201, + "step": 5114 + }, + { + "epoch": 0.5313181676534746, + "grad_norm": 0.4510040879249573, + "learning_rate": 4.508848690676028e-05, + "loss": 1.8475, + "step": 5115 + }, + { + "epoch": 0.531422042173055, + "grad_norm": 0.45895758271217346, + "learning_rate": 4.507224950880915e-05, + "loss": 1.8352, + "step": 5116 + }, + { + "epoch": 0.5315259166926353, + "grad_norm": 0.3979472815990448, + "learning_rate": 4.505601263562502e-05, + "loss": 1.7358, + "step": 5117 + }, + { + "epoch": 0.5316297912122157, + "grad_norm": 0.3930369019508362, + "learning_rate": 4.503977628893697e-05, + "loss": 1.7055, + "step": 5118 + }, + { + "epoch": 0.531733665731796, + "grad_norm": 0.3967008590698242, + "learning_rate": 4.502354047047406e-05, + "loss": 1.6146, + "step": 5119 + }, + { + "epoch": 0.5318375402513763, + "grad_norm": 0.4116523563861847, + "learning_rate": 4.500730518196527e-05, + "loss": 1.6813, + "step": 5120 + }, + { + "epoch": 0.5319414147709567, + "grad_norm": 0.4032840430736542, + "learning_rate": 4.499107042513953e-05, + "loss": 1.566, + "step": 5121 + }, + { + "epoch": 0.5320452892905371, + "grad_norm": 0.3959857225418091, + "learning_rate": 4.4974836201725715e-05, + "loss": 1.5526, + "step": 5122 + }, + { + "epoch": 0.5321491638101173, + "grad_norm": 0.399959534406662, + "learning_rate": 4.495860251345266e-05, + "loss": 1.6931, + "step": 5123 + }, + { + "epoch": 0.5322530383296977, + "grad_norm": 0.4058084487915039, + "learning_rate": 4.49423693620491e-05, + "loss": 1.5507, + "step": 5124 + }, + { + "epoch": 0.5323569128492781, + "grad_norm": 0.4492541551589966, + "learning_rate": 4.4926136749243754e-05, + "loss": 1.7912, + "step": 5125 + }, + { + "epoch": 0.5324607873688584, + "grad_norm": 0.4223318099975586, + "learning_rate": 4.490990467676528e-05, + "loss": 1.7354, + "step": 5126 + }, + { + "epoch": 0.5325646618884388, + "grad_norm": 0.38780099153518677, + "learning_rate": 4.489367314634225e-05, + "loss": 1.5337, + "step": 5127 + }, + { + "epoch": 0.5326685364080191, + "grad_norm": 0.40995070338249207, + "learning_rate": 4.487744215970318e-05, + "loss": 1.7209, + "step": 5128 + }, + { + "epoch": 0.5327724109275994, + "grad_norm": 0.45095205307006836, + "learning_rate": 4.486121171857659e-05, + "loss": 1.7835, + "step": 5129 + }, + { + "epoch": 0.5328762854471798, + "grad_norm": 0.42928487062454224, + "learning_rate": 4.4844981824690844e-05, + "loss": 1.7334, + "step": 5130 + }, + { + "epoch": 0.5329801599667602, + "grad_norm": 0.38315919041633606, + "learning_rate": 4.4828752479774324e-05, + "loss": 1.5904, + "step": 5131 + }, + { + "epoch": 0.5330840344863405, + "grad_norm": 0.3942658603191376, + "learning_rate": 4.481252368555535e-05, + "loss": 1.6793, + "step": 5132 + }, + { + "epoch": 0.5331879090059208, + "grad_norm": 0.419535756111145, + "learning_rate": 4.4796295443762116e-05, + "loss": 1.7484, + "step": 5133 + }, + { + "epoch": 0.5332917835255012, + "grad_norm": 0.378946989774704, + "learning_rate": 4.478006775612284e-05, + "loss": 1.6216, + "step": 5134 + }, + { + "epoch": 0.5333956580450815, + "grad_norm": 0.43386486172676086, + "learning_rate": 4.476384062436563e-05, + "loss": 1.8178, + "step": 5135 + }, + { + "epoch": 0.5334995325646619, + "grad_norm": 0.4382666051387787, + "learning_rate": 4.474761405021855e-05, + "loss": 1.8414, + "step": 5136 + }, + { + "epoch": 0.5336034070842423, + "grad_norm": 0.4107944667339325, + "learning_rate": 4.4731388035409586e-05, + "loss": 1.7848, + "step": 5137 + }, + { + "epoch": 0.5337072816038226, + "grad_norm": 0.4328972399234772, + "learning_rate": 4.471516258166671e-05, + "loss": 1.8443, + "step": 5138 + }, + { + "epoch": 0.5338111561234029, + "grad_norm": 0.3964642584323883, + "learning_rate": 4.4698937690717816e-05, + "loss": 1.6394, + "step": 5139 + }, + { + "epoch": 0.5339150306429833, + "grad_norm": 0.37310969829559326, + "learning_rate": 4.468271336429068e-05, + "loss": 1.5261, + "step": 5140 + }, + { + "epoch": 0.5340189051625637, + "grad_norm": 0.3795417547225952, + "learning_rate": 4.466648960411311e-05, + "loss": 1.512, + "step": 5141 + }, + { + "epoch": 0.5341227796821439, + "grad_norm": 0.40631794929504395, + "learning_rate": 4.46502664119128e-05, + "loss": 1.8027, + "step": 5142 + }, + { + "epoch": 0.5342266542017243, + "grad_norm": 0.47585329413414, + "learning_rate": 4.463404378941737e-05, + "loss": 1.8224, + "step": 5143 + }, + { + "epoch": 0.5343305287213047, + "grad_norm": 0.4149931073188782, + "learning_rate": 4.461782173835442e-05, + "loss": 1.6209, + "step": 5144 + }, + { + "epoch": 0.534434403240885, + "grad_norm": 0.41741567850112915, + "learning_rate": 4.4601600260451494e-05, + "loss": 1.8086, + "step": 5145 + }, + { + "epoch": 0.5345382777604654, + "grad_norm": 0.43015965819358826, + "learning_rate": 4.458537935743601e-05, + "loss": 1.7298, + "step": 5146 + }, + { + "epoch": 0.5346421522800457, + "grad_norm": 0.3866709768772125, + "learning_rate": 4.45691590310354e-05, + "loss": 1.6128, + "step": 5147 + }, + { + "epoch": 0.534746026799626, + "grad_norm": 0.4770145118236542, + "learning_rate": 4.455293928297698e-05, + "loss": 1.8353, + "step": 5148 + }, + { + "epoch": 0.5348499013192064, + "grad_norm": 0.39021778106689453, + "learning_rate": 4.453672011498805e-05, + "loss": 1.5159, + "step": 5149 + }, + { + "epoch": 0.5349537758387868, + "grad_norm": 0.4521084129810333, + "learning_rate": 4.45205015287958e-05, + "loss": 1.7056, + "step": 5150 + }, + { + "epoch": 0.535057650358367, + "grad_norm": 0.3977662920951843, + "learning_rate": 4.450428352612742e-05, + "loss": 1.5728, + "step": 5151 + }, + { + "epoch": 0.5351615248779474, + "grad_norm": 0.4670353829860687, + "learning_rate": 4.448806610870995e-05, + "loss": 1.8235, + "step": 5152 + }, + { + "epoch": 0.5352653993975278, + "grad_norm": 0.4064006507396698, + "learning_rate": 4.4471849278270446e-05, + "loss": 1.6757, + "step": 5153 + }, + { + "epoch": 0.5353692739171081, + "grad_norm": 0.41022297739982605, + "learning_rate": 4.445563303653589e-05, + "loss": 1.7759, + "step": 5154 + }, + { + "epoch": 0.5354731484366885, + "grad_norm": 0.37725940346717834, + "learning_rate": 4.443941738523316e-05, + "loss": 1.6766, + "step": 5155 + }, + { + "epoch": 0.5355770229562689, + "grad_norm": 0.4157141447067261, + "learning_rate": 4.4423202326089095e-05, + "loss": 1.5477, + "step": 5156 + }, + { + "epoch": 0.5356808974758491, + "grad_norm": 0.43074604868888855, + "learning_rate": 4.4406987860830505e-05, + "loss": 1.6997, + "step": 5157 + }, + { + "epoch": 0.5357847719954295, + "grad_norm": 0.4153660833835602, + "learning_rate": 4.439077399118405e-05, + "loss": 1.7147, + "step": 5158 + }, + { + "epoch": 0.5358886465150099, + "grad_norm": 0.40564560890197754, + "learning_rate": 4.4374560718876426e-05, + "loss": 1.6825, + "step": 5159 + }, + { + "epoch": 0.5359925210345902, + "grad_norm": 0.41882357001304626, + "learning_rate": 4.4358348045634215e-05, + "loss": 1.6888, + "step": 5160 + }, + { + "epoch": 0.5360963955541705, + "grad_norm": 0.4029204249382019, + "learning_rate": 4.4342135973183915e-05, + "loss": 1.7675, + "step": 5161 + }, + { + "epoch": 0.5362002700737509, + "grad_norm": 0.42406782507896423, + "learning_rate": 4.432592450325201e-05, + "loss": 1.7902, + "step": 5162 + }, + { + "epoch": 0.5363041445933313, + "grad_norm": 0.4365594983100891, + "learning_rate": 4.430971363756487e-05, + "loss": 1.8162, + "step": 5163 + }, + { + "epoch": 0.5364080191129116, + "grad_norm": 0.3890252709388733, + "learning_rate": 4.429350337784887e-05, + "loss": 1.7314, + "step": 5164 + }, + { + "epoch": 0.536511893632492, + "grad_norm": 0.3934305012226105, + "learning_rate": 4.427729372583023e-05, + "loss": 1.7803, + "step": 5165 + }, + { + "epoch": 0.5366157681520723, + "grad_norm": 0.40188831090927124, + "learning_rate": 4.426108468323516e-05, + "loss": 1.6121, + "step": 5166 + }, + { + "epoch": 0.5367196426716526, + "grad_norm": 0.37544122338294983, + "learning_rate": 4.424487625178983e-05, + "loss": 1.6125, + "step": 5167 + }, + { + "epoch": 0.536823517191233, + "grad_norm": 0.43579837679862976, + "learning_rate": 4.4228668433220264e-05, + "loss": 1.806, + "step": 5168 + }, + { + "epoch": 0.5369273917108134, + "grad_norm": 0.37916016578674316, + "learning_rate": 4.421246122925251e-05, + "loss": 1.4683, + "step": 5169 + }, + { + "epoch": 0.5370312662303937, + "grad_norm": 0.3701176345348358, + "learning_rate": 4.419625464161249e-05, + "loss": 1.7148, + "step": 5170 + }, + { + "epoch": 0.537135140749974, + "grad_norm": 0.39556387066841125, + "learning_rate": 4.4180048672026066e-05, + "loss": 1.5598, + "step": 5171 + }, + { + "epoch": 0.5372390152695544, + "grad_norm": 0.413783460855484, + "learning_rate": 4.416384332221906e-05, + "loss": 1.6101, + "step": 5172 + }, + { + "epoch": 0.5373428897891347, + "grad_norm": 0.41170206665992737, + "learning_rate": 4.414763859391724e-05, + "loss": 1.7957, + "step": 5173 + }, + { + "epoch": 0.5374467643087151, + "grad_norm": 0.3944395184516907, + "learning_rate": 4.4131434488846236e-05, + "loss": 1.5504, + "step": 5174 + }, + { + "epoch": 0.5375506388282955, + "grad_norm": 0.47782790660858154, + "learning_rate": 4.4115231008731696e-05, + "loss": 1.7433, + "step": 5175 + }, + { + "epoch": 0.5376545133478757, + "grad_norm": 0.42878782749176025, + "learning_rate": 4.409902815529914e-05, + "loss": 1.7387, + "step": 5176 + }, + { + "epoch": 0.5377583878674561, + "grad_norm": 0.42631304264068604, + "learning_rate": 4.4082825930274066e-05, + "loss": 1.8365, + "step": 5177 + }, + { + "epoch": 0.5378622623870365, + "grad_norm": 0.4046842157840729, + "learning_rate": 4.4066624335381864e-05, + "loss": 1.6611, + "step": 5178 + }, + { + "epoch": 0.5379661369066168, + "grad_norm": 0.41138967871665955, + "learning_rate": 4.40504233723479e-05, + "loss": 1.7386, + "step": 5179 + }, + { + "epoch": 0.5380700114261971, + "grad_norm": 0.4140818417072296, + "learning_rate": 4.4034223042897423e-05, + "loss": 1.7121, + "step": 5180 + }, + { + "epoch": 0.5381738859457775, + "grad_norm": 0.4164121150970459, + "learning_rate": 4.4018023348755666e-05, + "loss": 1.746, + "step": 5181 + }, + { + "epoch": 0.5382777604653578, + "grad_norm": 0.4518551528453827, + "learning_rate": 4.400182429164778e-05, + "loss": 1.5526, + "step": 5182 + }, + { + "epoch": 0.5383816349849382, + "grad_norm": 0.39879488945007324, + "learning_rate": 4.39856258732988e-05, + "loss": 1.7516, + "step": 5183 + }, + { + "epoch": 0.5384855095045186, + "grad_norm": 0.43033257126808167, + "learning_rate": 4.396942809543374e-05, + "loss": 1.7992, + "step": 5184 + }, + { + "epoch": 0.5385893840240988, + "grad_norm": 0.4049583673477173, + "learning_rate": 4.395323095977757e-05, + "loss": 1.6651, + "step": 5185 + }, + { + "epoch": 0.5386932585436792, + "grad_norm": 0.43411314487457275, + "learning_rate": 4.393703446805512e-05, + "loss": 1.7299, + "step": 5186 + }, + { + "epoch": 0.5387971330632596, + "grad_norm": 0.47748497128486633, + "learning_rate": 4.39208386219912e-05, + "loss": 1.8707, + "step": 5187 + }, + { + "epoch": 0.5389010075828399, + "grad_norm": 0.3915466070175171, + "learning_rate": 4.3904643423310574e-05, + "loss": 1.6647, + "step": 5188 + }, + { + "epoch": 0.5390048821024203, + "grad_norm": 0.46273308992385864, + "learning_rate": 4.388844887373785e-05, + "loss": 1.7202, + "step": 5189 + }, + { + "epoch": 0.5391087566220006, + "grad_norm": 0.37621983885765076, + "learning_rate": 4.387225497499767e-05, + "loss": 1.7193, + "step": 5190 + }, + { + "epoch": 0.539212631141581, + "grad_norm": 0.4030202627182007, + "learning_rate": 4.385606172881452e-05, + "loss": 1.6711, + "step": 5191 + }, + { + "epoch": 0.5393165056611613, + "grad_norm": 0.4407753646373749, + "learning_rate": 4.383986913691289e-05, + "loss": 1.7142, + "step": 5192 + }, + { + "epoch": 0.5394203801807417, + "grad_norm": 0.40765175223350525, + "learning_rate": 4.382367720101713e-05, + "loss": 1.7295, + "step": 5193 + }, + { + "epoch": 0.5395242547003221, + "grad_norm": 0.40376102924346924, + "learning_rate": 4.3807485922851574e-05, + "loss": 1.574, + "step": 5194 + }, + { + "epoch": 0.5396281292199023, + "grad_norm": 0.41411998867988586, + "learning_rate": 4.3791295304140476e-05, + "loss": 1.6128, + "step": 5195 + }, + { + "epoch": 0.5397320037394827, + "grad_norm": 0.4627225399017334, + "learning_rate": 4.3775105346607983e-05, + "loss": 1.8546, + "step": 5196 + }, + { + "epoch": 0.5398358782590631, + "grad_norm": 0.4189532399177551, + "learning_rate": 4.375891605197822e-05, + "loss": 1.7621, + "step": 5197 + }, + { + "epoch": 0.5399397527786434, + "grad_norm": 0.40579670667648315, + "learning_rate": 4.3742727421975224e-05, + "loss": 1.6286, + "step": 5198 + }, + { + "epoch": 0.5400436272982237, + "grad_norm": 0.4409949481487274, + "learning_rate": 4.3726539458322943e-05, + "loss": 1.8302, + "step": 5199 + }, + { + "epoch": 0.5401475018178041, + "grad_norm": 0.4035548269748688, + "learning_rate": 4.371035216274526e-05, + "loss": 1.7163, + "step": 5200 + }, + { + "epoch": 0.5402513763373844, + "grad_norm": 0.41622182726860046, + "learning_rate": 4.369416553696604e-05, + "loss": 1.5436, + "step": 5201 + }, + { + "epoch": 0.5403552508569648, + "grad_norm": 0.41848766803741455, + "learning_rate": 4.367797958270897e-05, + "loss": 1.6028, + "step": 5202 + }, + { + "epoch": 0.5404591253765452, + "grad_norm": 0.4080602824687958, + "learning_rate": 4.366179430169779e-05, + "loss": 1.6886, + "step": 5203 + }, + { + "epoch": 0.5405629998961254, + "grad_norm": 0.40305790305137634, + "learning_rate": 4.3645609695656056e-05, + "loss": 1.604, + "step": 5204 + }, + { + "epoch": 0.5406668744157058, + "grad_norm": 0.42248713970184326, + "learning_rate": 4.3629425766307334e-05, + "loss": 1.6635, + "step": 5205 + }, + { + "epoch": 0.5407707489352862, + "grad_norm": 0.41928306221961975, + "learning_rate": 4.361324251537506e-05, + "loss": 1.5922, + "step": 5206 + }, + { + "epoch": 0.5408746234548665, + "grad_norm": 0.45712658762931824, + "learning_rate": 4.359705994458267e-05, + "loss": 1.7722, + "step": 5207 + }, + { + "epoch": 0.5409784979744469, + "grad_norm": 0.3815540373325348, + "learning_rate": 4.358087805565343e-05, + "loss": 1.6145, + "step": 5208 + }, + { + "epoch": 0.5410823724940272, + "grad_norm": 0.41365593671798706, + "learning_rate": 4.35646968503106e-05, + "loss": 1.5844, + "step": 5209 + }, + { + "epoch": 0.5411862470136075, + "grad_norm": 0.4178517162799835, + "learning_rate": 4.354851633027738e-05, + "loss": 1.7365, + "step": 5210 + }, + { + "epoch": 0.5412901215331879, + "grad_norm": 0.4436066448688507, + "learning_rate": 4.3532336497276836e-05, + "loss": 1.7656, + "step": 5211 + }, + { + "epoch": 0.5413939960527683, + "grad_norm": 0.4040657877922058, + "learning_rate": 4.351615735303199e-05, + "loss": 1.7024, + "step": 5212 + }, + { + "epoch": 0.5414978705723485, + "grad_norm": 0.4756530821323395, + "learning_rate": 4.3499978899265844e-05, + "loss": 1.9407, + "step": 5213 + }, + { + "epoch": 0.5416017450919289, + "grad_norm": 0.43450602889060974, + "learning_rate": 4.348380113770121e-05, + "loss": 1.7781, + "step": 5214 + }, + { + "epoch": 0.5417056196115093, + "grad_norm": 0.4165419638156891, + "learning_rate": 4.346762407006093e-05, + "loss": 1.6077, + "step": 5215 + }, + { + "epoch": 0.5418094941310897, + "grad_norm": 0.4009828269481659, + "learning_rate": 4.345144769806775e-05, + "loss": 1.5677, + "step": 5216 + }, + { + "epoch": 0.54191336865067, + "grad_norm": 0.37052732706069946, + "learning_rate": 4.3435272023444304e-05, + "loss": 1.5191, + "step": 5217 + }, + { + "epoch": 0.5420172431702504, + "grad_norm": 0.4109463095664978, + "learning_rate": 4.3419097047913175e-05, + "loss": 1.5929, + "step": 5218 + }, + { + "epoch": 0.5421211176898307, + "grad_norm": 0.4167434275150299, + "learning_rate": 4.340292277319687e-05, + "loss": 1.612, + "step": 5219 + }, + { + "epoch": 0.542224992209411, + "grad_norm": 0.39904865622520447, + "learning_rate": 4.3386749201017856e-05, + "loss": 1.6542, + "step": 5220 + }, + { + "epoch": 0.5423288667289914, + "grad_norm": 0.4138846695423126, + "learning_rate": 4.3370576333098446e-05, + "loss": 1.6344, + "step": 5221 + }, + { + "epoch": 0.5424327412485718, + "grad_norm": 0.38887113332748413, + "learning_rate": 4.335440417116094e-05, + "loss": 1.6036, + "step": 5222 + }, + { + "epoch": 0.542536615768152, + "grad_norm": 0.4017149806022644, + "learning_rate": 4.3338232716927574e-05, + "loss": 1.6746, + "step": 5223 + }, + { + "epoch": 0.5426404902877324, + "grad_norm": 0.39271867275238037, + "learning_rate": 4.3322061972120445e-05, + "loss": 1.6285, + "step": 5224 + }, + { + "epoch": 0.5427443648073128, + "grad_norm": 0.4323449432849884, + "learning_rate": 4.3305891938461626e-05, + "loss": 1.5169, + "step": 5225 + }, + { + "epoch": 0.5428482393268931, + "grad_norm": 0.37628036737442017, + "learning_rate": 4.3289722617673115e-05, + "loss": 1.6436, + "step": 5226 + }, + { + "epoch": 0.5429521138464735, + "grad_norm": 0.43014097213745117, + "learning_rate": 4.327355401147679e-05, + "loss": 1.7777, + "step": 5227 + }, + { + "epoch": 0.5430559883660538, + "grad_norm": 0.3823567032814026, + "learning_rate": 4.3257386121594485e-05, + "loss": 1.5837, + "step": 5228 + }, + { + "epoch": 0.5431598628856341, + "grad_norm": 0.4464382827281952, + "learning_rate": 4.324121894974799e-05, + "loss": 1.8451, + "step": 5229 + }, + { + "epoch": 0.5432637374052145, + "grad_norm": 0.37794092297554016, + "learning_rate": 4.322505249765893e-05, + "loss": 1.7015, + "step": 5230 + }, + { + "epoch": 0.5433676119247949, + "grad_norm": 0.40177834033966064, + "learning_rate": 4.320888676704895e-05, + "loss": 1.7247, + "step": 5231 + }, + { + "epoch": 0.5434714864443752, + "grad_norm": 0.3933226764202118, + "learning_rate": 4.3192721759639545e-05, + "loss": 1.6449, + "step": 5232 + }, + { + "epoch": 0.5435753609639555, + "grad_norm": 0.4006556272506714, + "learning_rate": 4.317655747715218e-05, + "loss": 1.5498, + "step": 5233 + }, + { + "epoch": 0.5436792354835359, + "grad_norm": 0.4249705970287323, + "learning_rate": 4.3160393921308204e-05, + "loss": 1.7594, + "step": 5234 + }, + { + "epoch": 0.5437831100031162, + "grad_norm": 0.44793421030044556, + "learning_rate": 4.3144231093828954e-05, + "loss": 1.8968, + "step": 5235 + }, + { + "epoch": 0.5438869845226966, + "grad_norm": 0.3769989311695099, + "learning_rate": 4.312806899643559e-05, + "loss": 1.5737, + "step": 5236 + }, + { + "epoch": 0.543990859042277, + "grad_norm": 0.41074395179748535, + "learning_rate": 4.311190763084928e-05, + "loss": 1.6493, + "step": 5237 + }, + { + "epoch": 0.5440947335618572, + "grad_norm": 0.4340376555919647, + "learning_rate": 4.309574699879109e-05, + "loss": 1.7359, + "step": 5238 + }, + { + "epoch": 0.5441986080814376, + "grad_norm": 0.4758669435977936, + "learning_rate": 4.3079587101981985e-05, + "loss": 1.8211, + "step": 5239 + }, + { + "epoch": 0.544302482601018, + "grad_norm": 0.3941514790058136, + "learning_rate": 4.306342794214286e-05, + "loss": 1.4555, + "step": 5240 + }, + { + "epoch": 0.5444063571205984, + "grad_norm": 0.40955716371536255, + "learning_rate": 4.3047269520994555e-05, + "loss": 1.6498, + "step": 5241 + }, + { + "epoch": 0.5445102316401786, + "grad_norm": 0.45448553562164307, + "learning_rate": 4.3031111840257834e-05, + "loss": 1.7542, + "step": 5242 + }, + { + "epoch": 0.544614106159759, + "grad_norm": 0.4177002012729645, + "learning_rate": 4.301495490165331e-05, + "loss": 1.6508, + "step": 5243 + }, + { + "epoch": 0.5447179806793394, + "grad_norm": 0.431246280670166, + "learning_rate": 4.299879870690162e-05, + "loss": 1.8128, + "step": 5244 + }, + { + "epoch": 0.5448218551989197, + "grad_norm": 0.4549204707145691, + "learning_rate": 4.298264325772327e-05, + "loss": 1.6395, + "step": 5245 + }, + { + "epoch": 0.5449257297185001, + "grad_norm": 0.4020931124687195, + "learning_rate": 4.2966488555838656e-05, + "loss": 1.6735, + "step": 5246 + }, + { + "epoch": 0.5450296042380804, + "grad_norm": 0.4968905448913574, + "learning_rate": 4.295033460296814e-05, + "loss": 1.7222, + "step": 5247 + }, + { + "epoch": 0.5451334787576607, + "grad_norm": 0.3912610709667206, + "learning_rate": 4.293418140083203e-05, + "loss": 1.6682, + "step": 5248 + }, + { + "epoch": 0.5452373532772411, + "grad_norm": 0.39999935030937195, + "learning_rate": 4.291802895115046e-05, + "loss": 1.746, + "step": 5249 + }, + { + "epoch": 0.5453412277968215, + "grad_norm": 0.4106389582157135, + "learning_rate": 4.290187725564356e-05, + "loss": 1.5329, + "step": 5250 + }, + { + "epoch": 0.5454451023164018, + "grad_norm": 0.4551447927951813, + "learning_rate": 4.28857263160314e-05, + "loss": 1.7031, + "step": 5251 + }, + { + "epoch": 0.5455489768359821, + "grad_norm": 0.39867910742759705, + "learning_rate": 4.286957613403387e-05, + "loss": 1.5662, + "step": 5252 + }, + { + "epoch": 0.5456528513555625, + "grad_norm": 0.38527026772499084, + "learning_rate": 4.285342671137086e-05, + "loss": 1.5221, + "step": 5253 + }, + { + "epoch": 0.5457567258751428, + "grad_norm": 0.40201330184936523, + "learning_rate": 4.283727804976217e-05, + "loss": 1.5841, + "step": 5254 + }, + { + "epoch": 0.5458606003947232, + "grad_norm": 0.40640905499458313, + "learning_rate": 4.2821130150927486e-05, + "loss": 1.6659, + "step": 5255 + }, + { + "epoch": 0.5459644749143036, + "grad_norm": 0.39006564021110535, + "learning_rate": 4.280498301658644e-05, + "loss": 1.463, + "step": 5256 + }, + { + "epoch": 0.5460683494338838, + "grad_norm": 0.44832733273506165, + "learning_rate": 4.27888366484586e-05, + "loss": 1.941, + "step": 5257 + }, + { + "epoch": 0.5461722239534642, + "grad_norm": 0.41479620337486267, + "learning_rate": 4.277269104826339e-05, + "loss": 1.7727, + "step": 5258 + }, + { + "epoch": 0.5462760984730446, + "grad_norm": 0.40158823132514954, + "learning_rate": 4.275654621772021e-05, + "loss": 1.6807, + "step": 5259 + }, + { + "epoch": 0.5463799729926249, + "grad_norm": 0.39162763953208923, + "learning_rate": 4.2740402158548366e-05, + "loss": 1.6696, + "step": 5260 + }, + { + "epoch": 0.5464838475122052, + "grad_norm": 0.41060155630111694, + "learning_rate": 4.272425887246706e-05, + "loss": 1.7343, + "step": 5261 + }, + { + "epoch": 0.5465877220317856, + "grad_norm": 0.3723122179508209, + "learning_rate": 4.270811636119542e-05, + "loss": 1.6486, + "step": 5262 + }, + { + "epoch": 0.5466915965513659, + "grad_norm": 0.4098038375377655, + "learning_rate": 4.269197462645253e-05, + "loss": 1.691, + "step": 5263 + }, + { + "epoch": 0.5467954710709463, + "grad_norm": 0.38569438457489014, + "learning_rate": 4.267583366995732e-05, + "loss": 1.6651, + "step": 5264 + }, + { + "epoch": 0.5468993455905267, + "grad_norm": 0.42734023928642273, + "learning_rate": 4.2659693493428684e-05, + "loss": 1.8253, + "step": 5265 + }, + { + "epoch": 0.5470032201101069, + "grad_norm": 0.4274364709854126, + "learning_rate": 4.264355409858547e-05, + "loss": 1.6397, + "step": 5266 + }, + { + "epoch": 0.5471070946296873, + "grad_norm": 0.38870930671691895, + "learning_rate": 4.2627415487146346e-05, + "loss": 1.5403, + "step": 5267 + }, + { + "epoch": 0.5472109691492677, + "grad_norm": 0.3967702388763428, + "learning_rate": 4.261127766082995e-05, + "loss": 1.8323, + "step": 5268 + }, + { + "epoch": 0.5473148436688481, + "grad_norm": 0.42525243759155273, + "learning_rate": 4.2595140621354846e-05, + "loss": 1.7435, + "step": 5269 + }, + { + "epoch": 0.5474187181884284, + "grad_norm": 0.41377174854278564, + "learning_rate": 4.257900437043954e-05, + "loss": 1.5484, + "step": 5270 + }, + { + "epoch": 0.5475225927080087, + "grad_norm": 0.36868613958358765, + "learning_rate": 4.256286890980234e-05, + "loss": 1.6659, + "step": 5271 + }, + { + "epoch": 0.5476264672275891, + "grad_norm": 0.43371888995170593, + "learning_rate": 4.254673424116162e-05, + "loss": 1.6907, + "step": 5272 + }, + { + "epoch": 0.5477303417471694, + "grad_norm": 0.4247437119483948, + "learning_rate": 4.2530600366235554e-05, + "loss": 1.7444, + "step": 5273 + }, + { + "epoch": 0.5478342162667498, + "grad_norm": 0.43270236253738403, + "learning_rate": 4.251446728674229e-05, + "loss": 1.6692, + "step": 5274 + }, + { + "epoch": 0.5479380907863302, + "grad_norm": 0.40581294894218445, + "learning_rate": 4.2498335004399864e-05, + "loss": 1.7245, + "step": 5275 + }, + { + "epoch": 0.5480419653059104, + "grad_norm": 0.40520188212394714, + "learning_rate": 4.248220352092626e-05, + "loss": 1.6344, + "step": 5276 + }, + { + "epoch": 0.5481458398254908, + "grad_norm": 0.41677728295326233, + "learning_rate": 4.246607283803932e-05, + "loss": 1.7355, + "step": 5277 + }, + { + "epoch": 0.5482497143450712, + "grad_norm": 0.4243401885032654, + "learning_rate": 4.2449942957456855e-05, + "loss": 1.7528, + "step": 5278 + }, + { + "epoch": 0.5483535888646515, + "grad_norm": 0.44001367688179016, + "learning_rate": 4.24338138808966e-05, + "loss": 1.7903, + "step": 5279 + }, + { + "epoch": 0.5484574633842318, + "grad_norm": 0.40032702684402466, + "learning_rate": 4.241768561007613e-05, + "loss": 1.6569, + "step": 5280 + }, + { + "epoch": 0.5485613379038122, + "grad_norm": 0.4183237850666046, + "learning_rate": 4.2401558146712994e-05, + "loss": 1.6377, + "step": 5281 + }, + { + "epoch": 0.5486652124233925, + "grad_norm": 0.4048320949077606, + "learning_rate": 4.238543149252466e-05, + "loss": 1.5852, + "step": 5282 + }, + { + "epoch": 0.5487690869429729, + "grad_norm": 0.4144987165927887, + "learning_rate": 4.2369305649228455e-05, + "loss": 1.7107, + "step": 5283 + }, + { + "epoch": 0.5488729614625533, + "grad_norm": 0.40201154351234436, + "learning_rate": 4.235318061854168e-05, + "loss": 1.7281, + "step": 5284 + }, + { + "epoch": 0.5489768359821335, + "grad_norm": 0.41914767026901245, + "learning_rate": 4.233705640218154e-05, + "loss": 1.7443, + "step": 5285 + }, + { + "epoch": 0.5490807105017139, + "grad_norm": 0.39670661091804504, + "learning_rate": 4.2320933001865106e-05, + "loss": 1.5151, + "step": 5286 + }, + { + "epoch": 0.5491845850212943, + "grad_norm": 0.49306583404541016, + "learning_rate": 4.230481041930941e-05, + "loss": 2.0025, + "step": 5287 + }, + { + "epoch": 0.5492884595408746, + "grad_norm": 0.4066086709499359, + "learning_rate": 4.2288688656231384e-05, + "loss": 1.6324, + "step": 5288 + }, + { + "epoch": 0.549392334060455, + "grad_norm": 0.43332695960998535, + "learning_rate": 4.227256771434787e-05, + "loss": 1.7869, + "step": 5289 + }, + { + "epoch": 0.5494962085800353, + "grad_norm": 0.37857580184936523, + "learning_rate": 4.2256447595375596e-05, + "loss": 1.6478, + "step": 5290 + }, + { + "epoch": 0.5496000830996156, + "grad_norm": 0.45964285731315613, + "learning_rate": 4.224032830103128e-05, + "loss": 1.7415, + "step": 5291 + }, + { + "epoch": 0.549703957619196, + "grad_norm": 0.3959317207336426, + "learning_rate": 4.2224209833031445e-05, + "loss": 1.6643, + "step": 5292 + }, + { + "epoch": 0.5498078321387764, + "grad_norm": 0.48961135745048523, + "learning_rate": 4.220809219309262e-05, + "loss": 1.76, + "step": 5293 + }, + { + "epoch": 0.5499117066583568, + "grad_norm": 0.38995257019996643, + "learning_rate": 4.21919753829312e-05, + "loss": 1.6642, + "step": 5294 + }, + { + "epoch": 0.550015581177937, + "grad_norm": 0.4197613000869751, + "learning_rate": 4.217585940426351e-05, + "loss": 1.7614, + "step": 5295 + }, + { + "epoch": 0.5501194556975174, + "grad_norm": 0.40901002287864685, + "learning_rate": 4.2159744258805736e-05, + "loss": 1.8191, + "step": 5296 + }, + { + "epoch": 0.5502233302170978, + "grad_norm": 0.4012939929962158, + "learning_rate": 4.214362994827405e-05, + "loss": 1.4622, + "step": 5297 + }, + { + "epoch": 0.5503272047366781, + "grad_norm": 0.4541564881801605, + "learning_rate": 4.212751647438451e-05, + "loss": 1.9298, + "step": 5298 + }, + { + "epoch": 0.5504310792562584, + "grad_norm": 0.3447628915309906, + "learning_rate": 4.2111403838853034e-05, + "loss": 1.3871, + "step": 5299 + }, + { + "epoch": 0.5505349537758388, + "grad_norm": 0.4184315502643585, + "learning_rate": 4.209529204339553e-05, + "loss": 1.5378, + "step": 5300 + }, + { + "epoch": 0.5506388282954191, + "grad_norm": 0.49825429916381836, + "learning_rate": 4.207918108972776e-05, + "loss": 1.8026, + "step": 5301 + }, + { + "epoch": 0.5507427028149995, + "grad_norm": 0.385919451713562, + "learning_rate": 4.2063070979565426e-05, + "loss": 1.6863, + "step": 5302 + }, + { + "epoch": 0.5508465773345799, + "grad_norm": 0.40161100029945374, + "learning_rate": 4.204696171462411e-05, + "loss": 1.5964, + "step": 5303 + }, + { + "epoch": 0.5509504518541601, + "grad_norm": 0.4033989906311035, + "learning_rate": 4.203085329661935e-05, + "loss": 1.7334, + "step": 5304 + }, + { + "epoch": 0.5510543263737405, + "grad_norm": 0.416106641292572, + "learning_rate": 4.201474572726654e-05, + "loss": 1.7121, + "step": 5305 + }, + { + "epoch": 0.5511582008933209, + "grad_norm": 0.43745073676109314, + "learning_rate": 4.199863900828102e-05, + "loss": 1.7883, + "step": 5306 + }, + { + "epoch": 0.5512620754129012, + "grad_norm": 0.4253213703632355, + "learning_rate": 4.1982533141378056e-05, + "loss": 1.7145, + "step": 5307 + }, + { + "epoch": 0.5513659499324816, + "grad_norm": 0.4426640570163727, + "learning_rate": 4.1966428128272754e-05, + "loss": 1.7868, + "step": 5308 + }, + { + "epoch": 0.5514698244520619, + "grad_norm": 0.3622012436389923, + "learning_rate": 4.195032397068019e-05, + "loss": 1.5348, + "step": 5309 + }, + { + "epoch": 0.5515736989716422, + "grad_norm": 0.45180484652519226, + "learning_rate": 4.193422067031535e-05, + "loss": 1.8076, + "step": 5310 + }, + { + "epoch": 0.5516775734912226, + "grad_norm": 0.41505420207977295, + "learning_rate": 4.191811822889307e-05, + "loss": 1.7471, + "step": 5311 + }, + { + "epoch": 0.551781448010803, + "grad_norm": 0.42771202325820923, + "learning_rate": 4.190201664812816e-05, + "loss": 1.6095, + "step": 5312 + }, + { + "epoch": 0.5518853225303832, + "grad_norm": 0.41721728444099426, + "learning_rate": 4.188591592973534e-05, + "loss": 1.7719, + "step": 5313 + }, + { + "epoch": 0.5519891970499636, + "grad_norm": 0.4197467267513275, + "learning_rate": 4.1869816075429145e-05, + "loss": 1.7216, + "step": 5314 + }, + { + "epoch": 0.552093071569544, + "grad_norm": 0.43040528893470764, + "learning_rate": 4.1853717086924136e-05, + "loss": 1.7245, + "step": 5315 + }, + { + "epoch": 0.5521969460891243, + "grad_norm": 0.46574804186820984, + "learning_rate": 4.183761896593472e-05, + "loss": 1.8018, + "step": 5316 + }, + { + "epoch": 0.5523008206087047, + "grad_norm": 0.41133713722229004, + "learning_rate": 4.182152171417521e-05, + "loss": 1.628, + "step": 5317 + }, + { + "epoch": 0.552404695128285, + "grad_norm": 0.3880963623523712, + "learning_rate": 4.180542533335984e-05, + "loss": 1.6717, + "step": 5318 + }, + { + "epoch": 0.5525085696478654, + "grad_norm": 0.3985794484615326, + "learning_rate": 4.1789329825202736e-05, + "loss": 1.7005, + "step": 5319 + }, + { + "epoch": 0.5526124441674457, + "grad_norm": 0.43377330899238586, + "learning_rate": 4.177323519141799e-05, + "loss": 1.7466, + "step": 5320 + }, + { + "epoch": 0.5527163186870261, + "grad_norm": 0.44802311062812805, + "learning_rate": 4.1757141433719506e-05, + "loss": 1.6648, + "step": 5321 + }, + { + "epoch": 0.5528201932066065, + "grad_norm": 0.4492226839065552, + "learning_rate": 4.174104855382117e-05, + "loss": 1.5708, + "step": 5322 + }, + { + "epoch": 0.5529240677261867, + "grad_norm": 0.4240928590297699, + "learning_rate": 4.1724956553436756e-05, + "loss": 1.6883, + "step": 5323 + }, + { + "epoch": 0.5530279422457671, + "grad_norm": 0.4252764582633972, + "learning_rate": 4.17088654342799e-05, + "loss": 1.7223, + "step": 5324 + }, + { + "epoch": 0.5531318167653475, + "grad_norm": 0.44476190209388733, + "learning_rate": 4.1692775198064194e-05, + "loss": 1.8179, + "step": 5325 + }, + { + "epoch": 0.5532356912849278, + "grad_norm": 0.38664236664772034, + "learning_rate": 4.167668584650316e-05, + "loss": 1.5664, + "step": 5326 + }, + { + "epoch": 0.5533395658045082, + "grad_norm": 0.3723793923854828, + "learning_rate": 4.166059738131013e-05, + "loss": 1.6096, + "step": 5327 + }, + { + "epoch": 0.5534434403240885, + "grad_norm": 0.4321557283401489, + "learning_rate": 4.164450980419844e-05, + "loss": 1.8148, + "step": 5328 + }, + { + "epoch": 0.5535473148436688, + "grad_norm": 0.40069714188575745, + "learning_rate": 4.162842311688129e-05, + "loss": 1.6699, + "step": 5329 + }, + { + "epoch": 0.5536511893632492, + "grad_norm": 0.4085390269756317, + "learning_rate": 4.161233732107177e-05, + "loss": 1.7729, + "step": 5330 + }, + { + "epoch": 0.5537550638828296, + "grad_norm": 0.4812339246273041, + "learning_rate": 4.1596252418482884e-05, + "loss": 1.963, + "step": 5331 + }, + { + "epoch": 0.5538589384024099, + "grad_norm": 0.458366721868515, + "learning_rate": 4.158016841082759e-05, + "loss": 1.8112, + "step": 5332 + }, + { + "epoch": 0.5539628129219902, + "grad_norm": 0.4018287658691406, + "learning_rate": 4.1564085299818665e-05, + "loss": 1.7265, + "step": 5333 + }, + { + "epoch": 0.5540666874415706, + "grad_norm": 0.41217106580734253, + "learning_rate": 4.154800308716885e-05, + "loss": 1.7181, + "step": 5334 + }, + { + "epoch": 0.5541705619611509, + "grad_norm": 0.5003406405448914, + "learning_rate": 4.15319217745908e-05, + "loss": 1.8835, + "step": 5335 + }, + { + "epoch": 0.5542744364807313, + "grad_norm": 0.4172942340373993, + "learning_rate": 4.151584136379702e-05, + "loss": 1.6447, + "step": 5336 + }, + { + "epoch": 0.5543783110003117, + "grad_norm": 0.4157778322696686, + "learning_rate": 4.149976185649994e-05, + "loss": 1.7282, + "step": 5337 + }, + { + "epoch": 0.5544821855198919, + "grad_norm": 0.40136539936065674, + "learning_rate": 4.148368325441194e-05, + "loss": 1.7406, + "step": 5338 + }, + { + "epoch": 0.5545860600394723, + "grad_norm": 0.4425347149372101, + "learning_rate": 4.1467605559245226e-05, + "loss": 1.7426, + "step": 5339 + }, + { + "epoch": 0.5546899345590527, + "grad_norm": 0.44530102610588074, + "learning_rate": 4.145152877271196e-05, + "loss": 1.8979, + "step": 5340 + }, + { + "epoch": 0.554793809078633, + "grad_norm": 0.41977494955062866, + "learning_rate": 4.143545289652422e-05, + "loss": 1.7588, + "step": 5341 + }, + { + "epoch": 0.5548976835982133, + "grad_norm": 0.43756383657455444, + "learning_rate": 4.141937793239392e-05, + "loss": 1.6979, + "step": 5342 + }, + { + "epoch": 0.5550015581177937, + "grad_norm": 0.42378005385398865, + "learning_rate": 4.140330388203295e-05, + "loss": 1.6303, + "step": 5343 + }, + { + "epoch": 0.555105432637374, + "grad_norm": 0.39726606011390686, + "learning_rate": 4.138723074715306e-05, + "loss": 1.6592, + "step": 5344 + }, + { + "epoch": 0.5552093071569544, + "grad_norm": 0.4202333092689514, + "learning_rate": 4.1371158529465916e-05, + "loss": 1.7364, + "step": 5345 + }, + { + "epoch": 0.5553131816765348, + "grad_norm": 0.40389859676361084, + "learning_rate": 4.1355087230683065e-05, + "loss": 1.5759, + "step": 5346 + }, + { + "epoch": 0.5554170561961151, + "grad_norm": 0.37404200434684753, + "learning_rate": 4.1339016852516e-05, + "loss": 1.544, + "step": 5347 + }, + { + "epoch": 0.5555209307156954, + "grad_norm": 0.45083087682724, + "learning_rate": 4.132294739667609e-05, + "loss": 1.8926, + "step": 5348 + }, + { + "epoch": 0.5556248052352758, + "grad_norm": 0.4377882480621338, + "learning_rate": 4.130687886487459e-05, + "loss": 1.741, + "step": 5349 + }, + { + "epoch": 0.5557286797548562, + "grad_norm": 0.39583009481430054, + "learning_rate": 4.1290811258822685e-05, + "loss": 1.6806, + "step": 5350 + }, + { + "epoch": 0.5558325542744365, + "grad_norm": 0.48825472593307495, + "learning_rate": 4.127474458023146e-05, + "loss": 1.9815, + "step": 5351 + }, + { + "epoch": 0.5559364287940168, + "grad_norm": 0.39083537459373474, + "learning_rate": 4.125867883081186e-05, + "loss": 1.7249, + "step": 5352 + }, + { + "epoch": 0.5560403033135972, + "grad_norm": 0.385078489780426, + "learning_rate": 4.124261401227477e-05, + "loss": 1.546, + "step": 5353 + }, + { + "epoch": 0.5561441778331775, + "grad_norm": 0.41559967398643494, + "learning_rate": 4.122655012633101e-05, + "loss": 1.6632, + "step": 5354 + }, + { + "epoch": 0.5562480523527579, + "grad_norm": 0.4069918990135193, + "learning_rate": 4.121048717469119e-05, + "loss": 1.6128, + "step": 5355 + }, + { + "epoch": 0.5563519268723383, + "grad_norm": 0.43804723024368286, + "learning_rate": 4.119442515906595e-05, + "loss": 1.618, + "step": 5356 + }, + { + "epoch": 0.5564558013919185, + "grad_norm": 0.46744030714035034, + "learning_rate": 4.117836408116574e-05, + "loss": 1.787, + "step": 5357 + }, + { + "epoch": 0.5565596759114989, + "grad_norm": 0.4054070711135864, + "learning_rate": 4.116230394270094e-05, + "loss": 1.6505, + "step": 5358 + }, + { + "epoch": 0.5566635504310793, + "grad_norm": 0.4278983771800995, + "learning_rate": 4.114624474538183e-05, + "loss": 1.7579, + "step": 5359 + }, + { + "epoch": 0.5567674249506596, + "grad_norm": 0.4365118443965912, + "learning_rate": 4.113018649091861e-05, + "loss": 1.7672, + "step": 5360 + }, + { + "epoch": 0.5568712994702399, + "grad_norm": 0.4227050244808197, + "learning_rate": 4.111412918102132e-05, + "loss": 1.8521, + "step": 5361 + }, + { + "epoch": 0.5569751739898203, + "grad_norm": 0.41989371180534363, + "learning_rate": 4.1098072817399956e-05, + "loss": 1.6816, + "step": 5362 + }, + { + "epoch": 0.5570790485094006, + "grad_norm": 0.43567296862602234, + "learning_rate": 4.108201740176443e-05, + "loss": 1.7746, + "step": 5363 + }, + { + "epoch": 0.557182923028981, + "grad_norm": 0.48267829418182373, + "learning_rate": 4.106596293582446e-05, + "loss": 1.7837, + "step": 5364 + }, + { + "epoch": 0.5572867975485614, + "grad_norm": 0.44921717047691345, + "learning_rate": 4.104990942128976e-05, + "loss": 1.8066, + "step": 5365 + }, + { + "epoch": 0.5573906720681416, + "grad_norm": 0.4056643843650818, + "learning_rate": 4.10338568598699e-05, + "loss": 1.472, + "step": 5366 + }, + { + "epoch": 0.557494546587722, + "grad_norm": 0.4002752900123596, + "learning_rate": 4.1017805253274336e-05, + "loss": 1.6328, + "step": 5367 + }, + { + "epoch": 0.5575984211073024, + "grad_norm": 0.42509204149246216, + "learning_rate": 4.100175460321244e-05, + "loss": 1.5223, + "step": 5368 + }, + { + "epoch": 0.5577022956268827, + "grad_norm": 0.391414612531662, + "learning_rate": 4.0985704911393506e-05, + "loss": 1.685, + "step": 5369 + }, + { + "epoch": 0.557806170146463, + "grad_norm": 0.4597155749797821, + "learning_rate": 4.096965617952667e-05, + "loss": 1.7821, + "step": 5370 + }, + { + "epoch": 0.5579100446660434, + "grad_norm": 0.4139951467514038, + "learning_rate": 4.0953608409321024e-05, + "loss": 1.7831, + "step": 5371 + }, + { + "epoch": 0.5580139191856238, + "grad_norm": 0.4334876239299774, + "learning_rate": 4.093756160248551e-05, + "loss": 1.6776, + "step": 5372 + }, + { + "epoch": 0.5581177937052041, + "grad_norm": 0.47372686862945557, + "learning_rate": 4.092151576072901e-05, + "loss": 1.8743, + "step": 5373 + }, + { + "epoch": 0.5582216682247845, + "grad_norm": 0.39616334438323975, + "learning_rate": 4.0905470885760244e-05, + "loss": 1.6113, + "step": 5374 + }, + { + "epoch": 0.5583255427443649, + "grad_norm": 0.44439664483070374, + "learning_rate": 4.088942697928789e-05, + "loss": 1.842, + "step": 5375 + }, + { + "epoch": 0.5584294172639451, + "grad_norm": 0.3774793744087219, + "learning_rate": 4.087338404302052e-05, + "loss": 1.7603, + "step": 5376 + }, + { + "epoch": 0.5585332917835255, + "grad_norm": 0.4046783745288849, + "learning_rate": 4.0857342078666546e-05, + "loss": 1.597, + "step": 5377 + }, + { + "epoch": 0.5586371663031059, + "grad_norm": 0.41748616099357605, + "learning_rate": 4.0841301087934326e-05, + "loss": 1.7326, + "step": 5378 + }, + { + "epoch": 0.5587410408226862, + "grad_norm": 0.3903839886188507, + "learning_rate": 4.082526107253212e-05, + "loss": 1.5858, + "step": 5379 + }, + { + "epoch": 0.5588449153422665, + "grad_norm": 0.4938502311706543, + "learning_rate": 4.080922203416801e-05, + "loss": 1.5894, + "step": 5380 + }, + { + "epoch": 0.5589487898618469, + "grad_norm": 0.4077463150024414, + "learning_rate": 4.079318397455007e-05, + "loss": 1.6721, + "step": 5381 + }, + { + "epoch": 0.5590526643814272, + "grad_norm": 0.42645683884620667, + "learning_rate": 4.077714689538624e-05, + "loss": 1.8013, + "step": 5382 + }, + { + "epoch": 0.5591565389010076, + "grad_norm": 0.3947215676307678, + "learning_rate": 4.076111079838431e-05, + "loss": 1.6274, + "step": 5383 + }, + { + "epoch": 0.559260413420588, + "grad_norm": 0.43122047185897827, + "learning_rate": 4.074507568525201e-05, + "loss": 1.7273, + "step": 5384 + }, + { + "epoch": 0.5593642879401682, + "grad_norm": 0.41860494017601013, + "learning_rate": 4.072904155769697e-05, + "loss": 1.6412, + "step": 5385 + }, + { + "epoch": 0.5594681624597486, + "grad_norm": 0.4793721139431, + "learning_rate": 4.0713008417426676e-05, + "loss": 1.8271, + "step": 5386 + }, + { + "epoch": 0.559572036979329, + "grad_norm": 0.4130531847476959, + "learning_rate": 4.069697626614855e-05, + "loss": 1.7992, + "step": 5387 + }, + { + "epoch": 0.5596759114989093, + "grad_norm": 0.40088602900505066, + "learning_rate": 4.068094510556989e-05, + "loss": 1.4903, + "step": 5388 + }, + { + "epoch": 0.5597797860184897, + "grad_norm": 0.40667760372161865, + "learning_rate": 4.066491493739788e-05, + "loss": 1.5853, + "step": 5389 + }, + { + "epoch": 0.55988366053807, + "grad_norm": 0.44721919298171997, + "learning_rate": 4.064888576333961e-05, + "loss": 1.7506, + "step": 5390 + }, + { + "epoch": 0.5599875350576503, + "grad_norm": 0.4468611478805542, + "learning_rate": 4.063285758510207e-05, + "loss": 1.6431, + "step": 5391 + }, + { + "epoch": 0.5600914095772307, + "grad_norm": 0.39779260754585266, + "learning_rate": 4.061683040439214e-05, + "loss": 1.7715, + "step": 5392 + }, + { + "epoch": 0.5601952840968111, + "grad_norm": 0.41888242959976196, + "learning_rate": 4.060080422291656e-05, + "loss": 1.6373, + "step": 5393 + }, + { + "epoch": 0.5602991586163913, + "grad_norm": 0.4231390953063965, + "learning_rate": 4.058477904238203e-05, + "loss": 1.7707, + "step": 5394 + }, + { + "epoch": 0.5604030331359717, + "grad_norm": 0.4443194568157196, + "learning_rate": 4.056875486449507e-05, + "loss": 1.8821, + "step": 5395 + }, + { + "epoch": 0.5605069076555521, + "grad_norm": 0.4171590507030487, + "learning_rate": 4.055273169096216e-05, + "loss": 1.7404, + "step": 5396 + }, + { + "epoch": 0.5606107821751325, + "grad_norm": 0.41153210401535034, + "learning_rate": 4.053670952348964e-05, + "loss": 1.679, + "step": 5397 + }, + { + "epoch": 0.5607146566947128, + "grad_norm": 0.40622779726982117, + "learning_rate": 4.052068836378374e-05, + "loss": 1.69, + "step": 5398 + }, + { + "epoch": 0.5608185312142931, + "grad_norm": 0.40083175897598267, + "learning_rate": 4.050466821355058e-05, + "loss": 1.6595, + "step": 5399 + }, + { + "epoch": 0.5609224057338735, + "grad_norm": 0.4463566243648529, + "learning_rate": 4.0488649074496186e-05, + "loss": 1.7722, + "step": 5400 + }, + { + "epoch": 0.5610262802534538, + "grad_norm": 0.39217326045036316, + "learning_rate": 4.04726309483265e-05, + "loss": 1.5485, + "step": 5401 + }, + { + "epoch": 0.5611301547730342, + "grad_norm": 0.39511242508888245, + "learning_rate": 4.045661383674727e-05, + "loss": 1.6444, + "step": 5402 + }, + { + "epoch": 0.5612340292926146, + "grad_norm": 0.411687970161438, + "learning_rate": 4.044059774146423e-05, + "loss": 1.8602, + "step": 5403 + }, + { + "epoch": 0.5613379038121948, + "grad_norm": 0.38304659724235535, + "learning_rate": 4.0424582664182986e-05, + "loss": 1.6232, + "step": 5404 + }, + { + "epoch": 0.5614417783317752, + "grad_norm": 0.4158017635345459, + "learning_rate": 4.040856860660897e-05, + "loss": 1.6325, + "step": 5405 + }, + { + "epoch": 0.5615456528513556, + "grad_norm": 0.43518683314323425, + "learning_rate": 4.03925555704476e-05, + "loss": 1.6961, + "step": 5406 + }, + { + "epoch": 0.5616495273709359, + "grad_norm": 0.3899015486240387, + "learning_rate": 4.0376543557404125e-05, + "loss": 1.6689, + "step": 5407 + }, + { + "epoch": 0.5617534018905163, + "grad_norm": 0.3894193470478058, + "learning_rate": 4.036053256918368e-05, + "loss": 1.5847, + "step": 5408 + }, + { + "epoch": 0.5618572764100966, + "grad_norm": 0.41329947113990784, + "learning_rate": 4.034452260749132e-05, + "loss": 1.7453, + "step": 5409 + }, + { + "epoch": 0.5619611509296769, + "grad_norm": 0.43754690885543823, + "learning_rate": 4.032851367403201e-05, + "loss": 1.7059, + "step": 5410 + }, + { + "epoch": 0.5620650254492573, + "grad_norm": 0.423709899187088, + "learning_rate": 4.031250577051053e-05, + "loss": 1.6802, + "step": 5411 + }, + { + "epoch": 0.5621688999688377, + "grad_norm": 0.40563976764678955, + "learning_rate": 4.0296498898631644e-05, + "loss": 1.7492, + "step": 5412 + }, + { + "epoch": 0.562272774488418, + "grad_norm": 0.40074846148490906, + "learning_rate": 4.028049306009992e-05, + "loss": 1.7393, + "step": 5413 + }, + { + "epoch": 0.5623766490079983, + "grad_norm": 0.4513157308101654, + "learning_rate": 4.026448825661987e-05, + "loss": 1.7905, + "step": 5414 + }, + { + "epoch": 0.5624805235275787, + "grad_norm": 0.4043268859386444, + "learning_rate": 4.024848448989588e-05, + "loss": 1.8213, + "step": 5415 + }, + { + "epoch": 0.562584398047159, + "grad_norm": 0.36234188079833984, + "learning_rate": 4.023248176163224e-05, + "loss": 1.4981, + "step": 5416 + }, + { + "epoch": 0.5626882725667394, + "grad_norm": 0.42101240158081055, + "learning_rate": 4.021648007353309e-05, + "loss": 1.6849, + "step": 5417 + }, + { + "epoch": 0.5627921470863197, + "grad_norm": 0.3881211280822754, + "learning_rate": 4.0200479427302486e-05, + "loss": 1.5648, + "step": 5418 + }, + { + "epoch": 0.5628960216059, + "grad_norm": 0.4609657824039459, + "learning_rate": 4.0184479824644413e-05, + "loss": 1.8176, + "step": 5419 + }, + { + "epoch": 0.5629998961254804, + "grad_norm": 0.4178425669670105, + "learning_rate": 4.016848126726266e-05, + "loss": 1.6165, + "step": 5420 + }, + { + "epoch": 0.5631037706450608, + "grad_norm": 0.390349805355072, + "learning_rate": 4.0152483756860956e-05, + "loss": 1.5583, + "step": 5421 + }, + { + "epoch": 0.5632076451646412, + "grad_norm": 0.40884721279144287, + "learning_rate": 4.0136487295142913e-05, + "loss": 1.7998, + "step": 5422 + }, + { + "epoch": 0.5633115196842214, + "grad_norm": 0.4485437273979187, + "learning_rate": 4.0120491883812065e-05, + "loss": 2.054, + "step": 5423 + }, + { + "epoch": 0.5634153942038018, + "grad_norm": 0.43038633465766907, + "learning_rate": 4.010449752457175e-05, + "loss": 1.7295, + "step": 5424 + }, + { + "epoch": 0.5635192687233822, + "grad_norm": 0.36314454674720764, + "learning_rate": 4.0088504219125253e-05, + "loss": 1.21, + "step": 5425 + }, + { + "epoch": 0.5636231432429625, + "grad_norm": 0.3857851028442383, + "learning_rate": 4.007251196917576e-05, + "loss": 1.6524, + "step": 5426 + }, + { + "epoch": 0.5637270177625429, + "grad_norm": 0.3993614912033081, + "learning_rate": 4.00565207764263e-05, + "loss": 1.6198, + "step": 5427 + }, + { + "epoch": 0.5638308922821232, + "grad_norm": 0.41268518567085266, + "learning_rate": 4.00405306425798e-05, + "loss": 1.6297, + "step": 5428 + }, + { + "epoch": 0.5639347668017035, + "grad_norm": 0.410853773355484, + "learning_rate": 4.0024541569339126e-05, + "loss": 1.6958, + "step": 5429 + }, + { + "epoch": 0.5640386413212839, + "grad_norm": 0.39320501685142517, + "learning_rate": 4.0008553558406945e-05, + "loss": 1.5767, + "step": 5430 + }, + { + "epoch": 0.5641425158408643, + "grad_norm": 0.444997102022171, + "learning_rate": 3.9992566611485866e-05, + "loss": 1.6399, + "step": 5431 + }, + { + "epoch": 0.5642463903604445, + "grad_norm": 0.4061078131198883, + "learning_rate": 3.9976580730278405e-05, + "loss": 1.485, + "step": 5432 + }, + { + "epoch": 0.5643502648800249, + "grad_norm": 0.38394030928611755, + "learning_rate": 3.99605959164869e-05, + "loss": 1.6078, + "step": 5433 + }, + { + "epoch": 0.5644541393996053, + "grad_norm": 0.46738219261169434, + "learning_rate": 3.9944612171813616e-05, + "loss": 1.7178, + "step": 5434 + }, + { + "epoch": 0.5645580139191856, + "grad_norm": 0.47596266865730286, + "learning_rate": 3.992862949796072e-05, + "loss": 1.7114, + "step": 5435 + }, + { + "epoch": 0.564661888438766, + "grad_norm": 0.4068540930747986, + "learning_rate": 3.991264789663019e-05, + "loss": 1.5976, + "step": 5436 + }, + { + "epoch": 0.5647657629583464, + "grad_norm": 0.4116705060005188, + "learning_rate": 3.989666736952399e-05, + "loss": 1.7029, + "step": 5437 + }, + { + "epoch": 0.5648696374779266, + "grad_norm": 0.37593719363212585, + "learning_rate": 3.9880687918343916e-05, + "loss": 1.5416, + "step": 5438 + }, + { + "epoch": 0.564973511997507, + "grad_norm": 0.43204763531684875, + "learning_rate": 3.986470954479163e-05, + "loss": 1.7622, + "step": 5439 + }, + { + "epoch": 0.5650773865170874, + "grad_norm": 0.4367099106311798, + "learning_rate": 3.984873225056873e-05, + "loss": 1.6935, + "step": 5440 + }, + { + "epoch": 0.5651812610366677, + "grad_norm": 0.3884550631046295, + "learning_rate": 3.983275603737666e-05, + "loss": 1.6477, + "step": 5441 + }, + { + "epoch": 0.565285135556248, + "grad_norm": 0.40880388021469116, + "learning_rate": 3.981678090691676e-05, + "loss": 1.7526, + "step": 5442 + }, + { + "epoch": 0.5653890100758284, + "grad_norm": 0.40109533071517944, + "learning_rate": 3.980080686089026e-05, + "loss": 1.7972, + "step": 5443 + }, + { + "epoch": 0.5654928845954087, + "grad_norm": 0.4425828456878662, + "learning_rate": 3.9784833900998286e-05, + "loss": 1.738, + "step": 5444 + }, + { + "epoch": 0.5655967591149891, + "grad_norm": 0.4051485061645508, + "learning_rate": 3.97688620289418e-05, + "loss": 1.6437, + "step": 5445 + }, + { + "epoch": 0.5657006336345695, + "grad_norm": 0.3986642360687256, + "learning_rate": 3.975289124642171e-05, + "loss": 1.6672, + "step": 5446 + }, + { + "epoch": 0.5658045081541497, + "grad_norm": 0.3762451112270355, + "learning_rate": 3.9736921555138787e-05, + "loss": 1.6666, + "step": 5447 + }, + { + "epoch": 0.5659083826737301, + "grad_norm": 0.3796166479587555, + "learning_rate": 3.972095295679365e-05, + "loss": 1.6013, + "step": 5448 + }, + { + "epoch": 0.5660122571933105, + "grad_norm": 0.4417518377304077, + "learning_rate": 3.9704985453086833e-05, + "loss": 1.6795, + "step": 5449 + }, + { + "epoch": 0.5661161317128909, + "grad_norm": 0.3697144389152527, + "learning_rate": 3.968901904571877e-05, + "loss": 1.4858, + "step": 5450 + }, + { + "epoch": 0.5662200062324712, + "grad_norm": 0.3945258557796478, + "learning_rate": 3.967305373638976e-05, + "loss": 1.5509, + "step": 5451 + }, + { + "epoch": 0.5663238807520515, + "grad_norm": 0.39280179142951965, + "learning_rate": 3.965708952679996e-05, + "loss": 1.6271, + "step": 5452 + }, + { + "epoch": 0.5664277552716319, + "grad_norm": 0.4188973605632782, + "learning_rate": 3.9641126418649456e-05, + "loss": 1.656, + "step": 5453 + }, + { + "epoch": 0.5665316297912122, + "grad_norm": 0.39895448088645935, + "learning_rate": 3.962516441363818e-05, + "loss": 1.6666, + "step": 5454 + }, + { + "epoch": 0.5666355043107926, + "grad_norm": 0.38310202956199646, + "learning_rate": 3.9609203513465974e-05, + "loss": 1.5975, + "step": 5455 + }, + { + "epoch": 0.566739378830373, + "grad_norm": 0.43473097681999207, + "learning_rate": 3.9593243719832536e-05, + "loss": 1.7687, + "step": 5456 + }, + { + "epoch": 0.5668432533499532, + "grad_norm": 0.4295366406440735, + "learning_rate": 3.957728503443748e-05, + "loss": 1.6263, + "step": 5457 + }, + { + "epoch": 0.5669471278695336, + "grad_norm": 0.4174087643623352, + "learning_rate": 3.956132745898025e-05, + "loss": 1.6705, + "step": 5458 + }, + { + "epoch": 0.567051002389114, + "grad_norm": 0.41141003370285034, + "learning_rate": 3.954537099516021e-05, + "loss": 1.6423, + "step": 5459 + }, + { + "epoch": 0.5671548769086943, + "grad_norm": 0.44283923506736755, + "learning_rate": 3.952941564467665e-05, + "loss": 1.7335, + "step": 5460 + }, + { + "epoch": 0.5672587514282746, + "grad_norm": 0.4794785678386688, + "learning_rate": 3.951346140922863e-05, + "loss": 1.7162, + "step": 5461 + }, + { + "epoch": 0.567362625947855, + "grad_norm": 0.39859795570373535, + "learning_rate": 3.949750829051516e-05, + "loss": 1.7051, + "step": 5462 + }, + { + "epoch": 0.5674665004674353, + "grad_norm": 0.39325210452079773, + "learning_rate": 3.948155629023515e-05, + "loss": 1.5073, + "step": 5463 + }, + { + "epoch": 0.5675703749870157, + "grad_norm": 0.37789732217788696, + "learning_rate": 3.946560541008734e-05, + "loss": 1.7606, + "step": 5464 + }, + { + "epoch": 0.5676742495065961, + "grad_norm": 0.4179360270500183, + "learning_rate": 3.9449655651770365e-05, + "loss": 1.576, + "step": 5465 + }, + { + "epoch": 0.5677781240261763, + "grad_norm": 0.38552039861679077, + "learning_rate": 3.943370701698281e-05, + "loss": 1.6677, + "step": 5466 + }, + { + "epoch": 0.5678819985457567, + "grad_norm": 0.4086502194404602, + "learning_rate": 3.9417759507423004e-05, + "loss": 1.7007, + "step": 5467 + }, + { + "epoch": 0.5679858730653371, + "grad_norm": 0.43782469630241394, + "learning_rate": 3.940181312478928e-05, + "loss": 1.788, + "step": 5468 + }, + { + "epoch": 0.5680897475849174, + "grad_norm": 0.40869951248168945, + "learning_rate": 3.938586787077978e-05, + "loss": 1.7881, + "step": 5469 + }, + { + "epoch": 0.5681936221044978, + "grad_norm": 0.4343576729297638, + "learning_rate": 3.936992374709256e-05, + "loss": 1.8009, + "step": 5470 + }, + { + "epoch": 0.5682974966240781, + "grad_norm": 0.4356035590171814, + "learning_rate": 3.935398075542554e-05, + "loss": 1.7802, + "step": 5471 + }, + { + "epoch": 0.5684013711436584, + "grad_norm": 0.4507029950618744, + "learning_rate": 3.933803889747655e-05, + "loss": 1.5211, + "step": 5472 + }, + { + "epoch": 0.5685052456632388, + "grad_norm": 0.3965437114238739, + "learning_rate": 3.9322098174943226e-05, + "loss": 1.6319, + "step": 5473 + }, + { + "epoch": 0.5686091201828192, + "grad_norm": 0.4066412150859833, + "learning_rate": 3.930615858952316e-05, + "loss": 1.4707, + "step": 5474 + }, + { + "epoch": 0.5687129947023996, + "grad_norm": 0.4407905638217926, + "learning_rate": 3.929022014291379e-05, + "loss": 1.7746, + "step": 5475 + }, + { + "epoch": 0.5688168692219798, + "grad_norm": 0.43420854210853577, + "learning_rate": 3.927428283681245e-05, + "loss": 1.7012, + "step": 5476 + }, + { + "epoch": 0.5689207437415602, + "grad_norm": 0.39704859256744385, + "learning_rate": 3.925834667291631e-05, + "loss": 1.623, + "step": 5477 + }, + { + "epoch": 0.5690246182611406, + "grad_norm": 0.39629414677619934, + "learning_rate": 3.924241165292245e-05, + "loss": 1.6304, + "step": 5478 + }, + { + "epoch": 0.5691284927807209, + "grad_norm": 0.4737340211868286, + "learning_rate": 3.922647777852786e-05, + "loss": 1.7869, + "step": 5479 + }, + { + "epoch": 0.5692323673003012, + "grad_norm": 0.41576650738716125, + "learning_rate": 3.9210545051429337e-05, + "loss": 1.7863, + "step": 5480 + }, + { + "epoch": 0.5693362418198816, + "grad_norm": 0.4058329463005066, + "learning_rate": 3.919461347332361e-05, + "loss": 1.7295, + "step": 5481 + }, + { + "epoch": 0.5694401163394619, + "grad_norm": 0.38741639256477356, + "learning_rate": 3.917868304590726e-05, + "loss": 1.5988, + "step": 5482 + }, + { + "epoch": 0.5695439908590423, + "grad_norm": 0.40862467885017395, + "learning_rate": 3.916275377087676e-05, + "loss": 1.6312, + "step": 5483 + }, + { + "epoch": 0.5696478653786227, + "grad_norm": 0.38968440890312195, + "learning_rate": 3.9146825649928446e-05, + "loss": 1.7144, + "step": 5484 + }, + { + "epoch": 0.5697517398982029, + "grad_norm": 0.41511067748069763, + "learning_rate": 3.913089868475855e-05, + "loss": 1.8591, + "step": 5485 + }, + { + "epoch": 0.5698556144177833, + "grad_norm": 0.4132608473300934, + "learning_rate": 3.911497287706315e-05, + "loss": 1.7331, + "step": 5486 + }, + { + "epoch": 0.5699594889373637, + "grad_norm": 0.41503503918647766, + "learning_rate": 3.909904822853823e-05, + "loss": 1.867, + "step": 5487 + }, + { + "epoch": 0.570063363456944, + "grad_norm": 0.39798980951309204, + "learning_rate": 3.908312474087966e-05, + "loss": 1.7591, + "step": 5488 + }, + { + "epoch": 0.5701672379765244, + "grad_norm": 0.4197697639465332, + "learning_rate": 3.906720241578314e-05, + "loss": 1.7974, + "step": 5489 + }, + { + "epoch": 0.5702711124961047, + "grad_norm": 0.47081831097602844, + "learning_rate": 3.905128125494427e-05, + "loss": 1.7685, + "step": 5490 + }, + { + "epoch": 0.570374987015685, + "grad_norm": 0.42261427640914917, + "learning_rate": 3.903536126005858e-05, + "loss": 1.7431, + "step": 5491 + }, + { + "epoch": 0.5704788615352654, + "grad_norm": 0.39490261673927307, + "learning_rate": 3.901944243282135e-05, + "loss": 1.8147, + "step": 5492 + }, + { + "epoch": 0.5705827360548458, + "grad_norm": 0.41960594058036804, + "learning_rate": 3.900352477492786e-05, + "loss": 1.7233, + "step": 5493 + }, + { + "epoch": 0.570686610574426, + "grad_norm": 0.3711908161640167, + "learning_rate": 3.8987608288073216e-05, + "loss": 1.5517, + "step": 5494 + }, + { + "epoch": 0.5707904850940064, + "grad_norm": 0.433907151222229, + "learning_rate": 3.897169297395238e-05, + "loss": 1.6967, + "step": 5495 + }, + { + "epoch": 0.5708943596135868, + "grad_norm": 0.40768545866012573, + "learning_rate": 3.895577883426022e-05, + "loss": 1.4462, + "step": 5496 + }, + { + "epoch": 0.5709982341331671, + "grad_norm": 0.4261188209056854, + "learning_rate": 3.893986587069147e-05, + "loss": 1.8624, + "step": 5497 + }, + { + "epoch": 0.5711021086527475, + "grad_norm": 0.3955959379673004, + "learning_rate": 3.892395408494073e-05, + "loss": 1.6179, + "step": 5498 + }, + { + "epoch": 0.5712059831723278, + "grad_norm": 0.4365078806877136, + "learning_rate": 3.890804347870248e-05, + "loss": 1.7387, + "step": 5499 + }, + { + "epoch": 0.5713098576919082, + "grad_norm": 0.4353174865245819, + "learning_rate": 3.889213405367107e-05, + "loss": 1.4776, + "step": 5500 + }, + { + "epoch": 0.5714137322114885, + "grad_norm": 0.39252567291259766, + "learning_rate": 3.887622581154077e-05, + "loss": 1.6338, + "step": 5501 + }, + { + "epoch": 0.5715176067310689, + "grad_norm": 0.49222537875175476, + "learning_rate": 3.886031875400562e-05, + "loss": 1.9512, + "step": 5502 + }, + { + "epoch": 0.5716214812506493, + "grad_norm": 0.41813385486602783, + "learning_rate": 3.884441288275965e-05, + "loss": 1.7856, + "step": 5503 + }, + { + "epoch": 0.5717253557702295, + "grad_norm": 0.4292755424976349, + "learning_rate": 3.8828508199496694e-05, + "loss": 1.6817, + "step": 5504 + }, + { + "epoch": 0.5718292302898099, + "grad_norm": 0.4089149832725525, + "learning_rate": 3.8812604705910454e-05, + "loss": 1.803, + "step": 5505 + }, + { + "epoch": 0.5719331048093903, + "grad_norm": 0.44375675916671753, + "learning_rate": 3.8796702403694554e-05, + "loss": 1.5654, + "step": 5506 + }, + { + "epoch": 0.5720369793289706, + "grad_norm": 0.3902745246887207, + "learning_rate": 3.8780801294542476e-05, + "loss": 1.6287, + "step": 5507 + }, + { + "epoch": 0.572140853848551, + "grad_norm": 0.38409364223480225, + "learning_rate": 3.876490138014752e-05, + "loss": 1.5963, + "step": 5508 + }, + { + "epoch": 0.5722447283681313, + "grad_norm": 0.41062119603157043, + "learning_rate": 3.874900266220295e-05, + "loss": 1.6819, + "step": 5509 + }, + { + "epoch": 0.5723486028877116, + "grad_norm": 0.38653069734573364, + "learning_rate": 3.873310514240183e-05, + "loss": 1.5166, + "step": 5510 + }, + { + "epoch": 0.572452477407292, + "grad_norm": 0.38248497247695923, + "learning_rate": 3.871720882243712e-05, + "loss": 1.505, + "step": 5511 + }, + { + "epoch": 0.5725563519268724, + "grad_norm": 0.41657835245132446, + "learning_rate": 3.870131370400166e-05, + "loss": 1.6057, + "step": 5512 + }, + { + "epoch": 0.5726602264464526, + "grad_norm": 0.4237949252128601, + "learning_rate": 3.868541978878818e-05, + "loss": 1.6186, + "step": 5513 + }, + { + "epoch": 0.572764100966033, + "grad_norm": 0.4268006682395935, + "learning_rate": 3.866952707848922e-05, + "loss": 1.5518, + "step": 5514 + }, + { + "epoch": 0.5728679754856134, + "grad_norm": 0.46735864877700806, + "learning_rate": 3.865363557479723e-05, + "loss": 1.7631, + "step": 5515 + }, + { + "epoch": 0.5729718500051937, + "grad_norm": 0.45263731479644775, + "learning_rate": 3.863774527940457e-05, + "loss": 1.7801, + "step": 5516 + }, + { + "epoch": 0.5730757245247741, + "grad_norm": 0.38436025381088257, + "learning_rate": 3.86218561940034e-05, + "loss": 1.4836, + "step": 5517 + }, + { + "epoch": 0.5731795990443544, + "grad_norm": 0.3812609910964966, + "learning_rate": 3.860596832028579e-05, + "loss": 1.5733, + "step": 5518 + }, + { + "epoch": 0.5732834735639347, + "grad_norm": 0.4150224030017853, + "learning_rate": 3.8590081659943694e-05, + "loss": 1.7095, + "step": 5519 + }, + { + "epoch": 0.5733873480835151, + "grad_norm": 0.4313552975654602, + "learning_rate": 3.8574196214668876e-05, + "loss": 1.7818, + "step": 5520 + }, + { + "epoch": 0.5734912226030955, + "grad_norm": 0.3813556730747223, + "learning_rate": 3.855831198615304e-05, + "loss": 1.5572, + "step": 5521 + }, + { + "epoch": 0.5735950971226758, + "grad_norm": 0.3927074670791626, + "learning_rate": 3.854242897608775e-05, + "loss": 1.4641, + "step": 5522 + }, + { + "epoch": 0.5736989716422561, + "grad_norm": 0.38922983407974243, + "learning_rate": 3.852654718616438e-05, + "loss": 1.3964, + "step": 5523 + }, + { + "epoch": 0.5738028461618365, + "grad_norm": 0.39198037981987, + "learning_rate": 3.851066661807424e-05, + "loss": 1.66, + "step": 5524 + }, + { + "epoch": 0.5739067206814168, + "grad_norm": 0.41385430097579956, + "learning_rate": 3.849478727350849e-05, + "loss": 1.7209, + "step": 5525 + }, + { + "epoch": 0.5740105952009972, + "grad_norm": 0.4008021652698517, + "learning_rate": 3.847890915415815e-05, + "loss": 1.6168, + "step": 5526 + }, + { + "epoch": 0.5741144697205776, + "grad_norm": 0.42012637853622437, + "learning_rate": 3.8463032261714114e-05, + "loss": 1.7073, + "step": 5527 + }, + { + "epoch": 0.5742183442401579, + "grad_norm": 0.428591251373291, + "learning_rate": 3.8447156597867153e-05, + "loss": 1.7536, + "step": 5528 + }, + { + "epoch": 0.5743222187597382, + "grad_norm": 0.45808881521224976, + "learning_rate": 3.843128216430791e-05, + "loss": 1.6032, + "step": 5529 + }, + { + "epoch": 0.5744260932793186, + "grad_norm": 0.41718247532844543, + "learning_rate": 3.841540896272685e-05, + "loss": 1.6722, + "step": 5530 + }, + { + "epoch": 0.574529967798899, + "grad_norm": 0.4867742955684662, + "learning_rate": 3.83995369948144e-05, + "loss": 1.7742, + "step": 5531 + }, + { + "epoch": 0.5746338423184792, + "grad_norm": 0.4175654947757721, + "learning_rate": 3.838366626226078e-05, + "loss": 1.5992, + "step": 5532 + }, + { + "epoch": 0.5747377168380596, + "grad_norm": 0.3990960121154785, + "learning_rate": 3.836779676675606e-05, + "loss": 1.7132, + "step": 5533 + }, + { + "epoch": 0.57484159135764, + "grad_norm": 0.43652915954589844, + "learning_rate": 3.8351928509990255e-05, + "loss": 1.8061, + "step": 5534 + }, + { + "epoch": 0.5749454658772203, + "grad_norm": 0.4044228196144104, + "learning_rate": 3.833606149365323e-05, + "loss": 1.7402, + "step": 5535 + }, + { + "epoch": 0.5750493403968007, + "grad_norm": 0.41677069664001465, + "learning_rate": 3.832019571943465e-05, + "loss": 1.6522, + "step": 5536 + }, + { + "epoch": 0.575153214916381, + "grad_norm": 0.40722280740737915, + "learning_rate": 3.8304331189024127e-05, + "loss": 1.8343, + "step": 5537 + }, + { + "epoch": 0.5752570894359613, + "grad_norm": 0.40876513719558716, + "learning_rate": 3.828846790411111e-05, + "loss": 1.6463, + "step": 5538 + }, + { + "epoch": 0.5753609639555417, + "grad_norm": 0.4222007095813751, + "learning_rate": 3.82726058663849e-05, + "loss": 1.672, + "step": 5539 + }, + { + "epoch": 0.5754648384751221, + "grad_norm": 0.4677537679672241, + "learning_rate": 3.8256745077534684e-05, + "loss": 1.8548, + "step": 5540 + }, + { + "epoch": 0.5755687129947024, + "grad_norm": 0.4388614594936371, + "learning_rate": 3.824088553924954e-05, + "loss": 1.8835, + "step": 5541 + }, + { + "epoch": 0.5756725875142827, + "grad_norm": 0.43357276916503906, + "learning_rate": 3.822502725321833e-05, + "loss": 1.831, + "step": 5542 + }, + { + "epoch": 0.5757764620338631, + "grad_norm": 0.40327322483062744, + "learning_rate": 3.8209170221129875e-05, + "loss": 1.7199, + "step": 5543 + }, + { + "epoch": 0.5758803365534434, + "grad_norm": 0.4302258789539337, + "learning_rate": 3.819331444467284e-05, + "loss": 1.8026, + "step": 5544 + }, + { + "epoch": 0.5759842110730238, + "grad_norm": 0.4435931444168091, + "learning_rate": 3.817745992553571e-05, + "loss": 1.6854, + "step": 5545 + }, + { + "epoch": 0.5760880855926042, + "grad_norm": 0.40510040521621704, + "learning_rate": 3.816160666540687e-05, + "loss": 1.6003, + "step": 5546 + }, + { + "epoch": 0.5761919601121844, + "grad_norm": 0.43098416924476624, + "learning_rate": 3.814575466597459e-05, + "loss": 1.8288, + "step": 5547 + }, + { + "epoch": 0.5762958346317648, + "grad_norm": 0.4217774569988251, + "learning_rate": 3.8129903928926956e-05, + "loss": 1.8433, + "step": 5548 + }, + { + "epoch": 0.5763997091513452, + "grad_norm": 0.4399735629558563, + "learning_rate": 3.811405445595197e-05, + "loss": 1.6735, + "step": 5549 + }, + { + "epoch": 0.5765035836709255, + "grad_norm": 0.39600399136543274, + "learning_rate": 3.809820624873748e-05, + "loss": 1.5637, + "step": 5550 + }, + { + "epoch": 0.5766074581905059, + "grad_norm": 0.4116421937942505, + "learning_rate": 3.808235930897118e-05, + "loss": 1.6252, + "step": 5551 + }, + { + "epoch": 0.5767113327100862, + "grad_norm": 0.4399339258670807, + "learning_rate": 3.806651363834065e-05, + "loss": 1.7843, + "step": 5552 + }, + { + "epoch": 0.5768152072296666, + "grad_norm": 0.3872789442539215, + "learning_rate": 3.805066923853333e-05, + "loss": 1.547, + "step": 5553 + }, + { + "epoch": 0.5769190817492469, + "grad_norm": 0.38812029361724854, + "learning_rate": 3.803482611123654e-05, + "loss": 1.5512, + "step": 5554 + }, + { + "epoch": 0.5770229562688273, + "grad_norm": 0.4194970428943634, + "learning_rate": 3.8018984258137423e-05, + "loss": 1.5616, + "step": 5555 + }, + { + "epoch": 0.5771268307884077, + "grad_norm": 0.44213446974754333, + "learning_rate": 3.800314368092302e-05, + "loss": 1.8645, + "step": 5556 + }, + { + "epoch": 0.5772307053079879, + "grad_norm": 0.39991211891174316, + "learning_rate": 3.798730438128027e-05, + "loss": 1.7879, + "step": 5557 + }, + { + "epoch": 0.5773345798275683, + "grad_norm": 0.4132145047187805, + "learning_rate": 3.797146636089587e-05, + "loss": 1.8618, + "step": 5558 + }, + { + "epoch": 0.5774384543471487, + "grad_norm": 0.4226401150226593, + "learning_rate": 3.795562962145649e-05, + "loss": 1.5837, + "step": 5559 + }, + { + "epoch": 0.577542328866729, + "grad_norm": 0.3981073498725891, + "learning_rate": 3.793979416464862e-05, + "loss": 1.6511, + "step": 5560 + }, + { + "epoch": 0.5776462033863093, + "grad_norm": 0.399758517742157, + "learning_rate": 3.792395999215857e-05, + "loss": 1.5685, + "step": 5561 + }, + { + "epoch": 0.5777500779058897, + "grad_norm": 0.4009932279586792, + "learning_rate": 3.790812710567259e-05, + "loss": 1.6031, + "step": 5562 + }, + { + "epoch": 0.57785395242547, + "grad_norm": 0.4045742154121399, + "learning_rate": 3.789229550687676e-05, + "loss": 1.6291, + "step": 5563 + }, + { + "epoch": 0.5779578269450504, + "grad_norm": 0.4436100423336029, + "learning_rate": 3.7876465197457014e-05, + "loss": 1.7213, + "step": 5564 + }, + { + "epoch": 0.5780617014646308, + "grad_norm": 0.41849127411842346, + "learning_rate": 3.7860636179099153e-05, + "loss": 1.7851, + "step": 5565 + }, + { + "epoch": 0.578165575984211, + "grad_norm": 0.44967445731163025, + "learning_rate": 3.784480845348886e-05, + "loss": 1.7522, + "step": 5566 + }, + { + "epoch": 0.5782694505037914, + "grad_norm": 0.4041469693183899, + "learning_rate": 3.7828982022311645e-05, + "loss": 1.6278, + "step": 5567 + }, + { + "epoch": 0.5783733250233718, + "grad_norm": 0.3950141668319702, + "learning_rate": 3.7813156887252896e-05, + "loss": 1.566, + "step": 5568 + }, + { + "epoch": 0.5784771995429521, + "grad_norm": 0.4024663269519806, + "learning_rate": 3.77973330499979e-05, + "loss": 1.6743, + "step": 5569 + }, + { + "epoch": 0.5785810740625325, + "grad_norm": 0.4183747172355652, + "learning_rate": 3.778151051223173e-05, + "loss": 1.6308, + "step": 5570 + }, + { + "epoch": 0.5786849485821128, + "grad_norm": 0.3978317081928253, + "learning_rate": 3.7765689275639374e-05, + "loss": 1.6298, + "step": 5571 + }, + { + "epoch": 0.5787888231016931, + "grad_norm": 0.4176594018936157, + "learning_rate": 3.774986934190571e-05, + "loss": 1.7289, + "step": 5572 + }, + { + "epoch": 0.5788926976212735, + "grad_norm": 0.4060553014278412, + "learning_rate": 3.7734050712715394e-05, + "loss": 1.7287, + "step": 5573 + }, + { + "epoch": 0.5789965721408539, + "grad_norm": 0.40676042437553406, + "learning_rate": 3.7718233389752986e-05, + "loss": 1.4959, + "step": 5574 + }, + { + "epoch": 0.5791004466604341, + "grad_norm": 0.4172111451625824, + "learning_rate": 3.770241737470295e-05, + "loss": 1.669, + "step": 5575 + }, + { + "epoch": 0.5792043211800145, + "grad_norm": 0.4500811994075775, + "learning_rate": 3.7686602669249505e-05, + "loss": 1.8687, + "step": 5576 + }, + { + "epoch": 0.5793081956995949, + "grad_norm": 0.4102492928504944, + "learning_rate": 3.767078927507683e-05, + "loss": 1.6422, + "step": 5577 + }, + { + "epoch": 0.5794120702191753, + "grad_norm": 0.42576107382774353, + "learning_rate": 3.765497719386894e-05, + "loss": 1.7986, + "step": 5578 + }, + { + "epoch": 0.5795159447387556, + "grad_norm": 0.43037348985671997, + "learning_rate": 3.763916642730969e-05, + "loss": 1.52, + "step": 5579 + }, + { + "epoch": 0.5796198192583359, + "grad_norm": 0.4052847623825073, + "learning_rate": 3.762335697708279e-05, + "loss": 1.8281, + "step": 5580 + }, + { + "epoch": 0.5797236937779163, + "grad_norm": 0.42516037821769714, + "learning_rate": 3.760754884487182e-05, + "loss": 1.7839, + "step": 5581 + }, + { + "epoch": 0.5798275682974966, + "grad_norm": 0.4137275815010071, + "learning_rate": 3.759174203236025e-05, + "loss": 1.643, + "step": 5582 + }, + { + "epoch": 0.579931442817077, + "grad_norm": 0.45627397298812866, + "learning_rate": 3.757593654123135e-05, + "loss": 1.8582, + "step": 5583 + }, + { + "epoch": 0.5800353173366574, + "grad_norm": 0.420109361410141, + "learning_rate": 3.75601323731683e-05, + "loss": 1.7523, + "step": 5584 + }, + { + "epoch": 0.5801391918562376, + "grad_norm": 0.4100499749183655, + "learning_rate": 3.754432952985413e-05, + "loss": 1.6175, + "step": 5585 + }, + { + "epoch": 0.580243066375818, + "grad_norm": 0.4295746088027954, + "learning_rate": 3.7528528012971694e-05, + "loss": 1.7074, + "step": 5586 + }, + { + "epoch": 0.5803469408953984, + "grad_norm": 0.40844690799713135, + "learning_rate": 3.7512727824203754e-05, + "loss": 1.6747, + "step": 5587 + }, + { + "epoch": 0.5804508154149787, + "grad_norm": 0.45334941148757935, + "learning_rate": 3.7496928965232905e-05, + "loss": 1.6918, + "step": 5588 + }, + { + "epoch": 0.580554689934559, + "grad_norm": 0.35644862055778503, + "learning_rate": 3.7481131437741566e-05, + "loss": 1.5046, + "step": 5589 + }, + { + "epoch": 0.5806585644541394, + "grad_norm": 0.41652175784111023, + "learning_rate": 3.746533524341208e-05, + "loss": 1.4811, + "step": 5590 + }, + { + "epoch": 0.5807624389737197, + "grad_norm": 0.40654313564300537, + "learning_rate": 3.7449540383926646e-05, + "loss": 1.6301, + "step": 5591 + }, + { + "epoch": 0.5808663134933001, + "grad_norm": 0.40000343322753906, + "learning_rate": 3.7433746860967235e-05, + "loss": 1.6732, + "step": 5592 + }, + { + "epoch": 0.5809701880128805, + "grad_norm": 0.4352266788482666, + "learning_rate": 3.741795467621578e-05, + "loss": 1.4708, + "step": 5593 + }, + { + "epoch": 0.5810740625324607, + "grad_norm": 0.40756914019584656, + "learning_rate": 3.7402163831354e-05, + "loss": 1.647, + "step": 5594 + }, + { + "epoch": 0.5811779370520411, + "grad_norm": 0.4151524603366852, + "learning_rate": 3.738637432806351e-05, + "loss": 1.5459, + "step": 5595 + }, + { + "epoch": 0.5812818115716215, + "grad_norm": 0.39485087990760803, + "learning_rate": 3.737058616802575e-05, + "loss": 1.6415, + "step": 5596 + }, + { + "epoch": 0.5813856860912018, + "grad_norm": 0.4184524118900299, + "learning_rate": 3.735479935292208e-05, + "loss": 1.4745, + "step": 5597 + }, + { + "epoch": 0.5814895606107822, + "grad_norm": 0.42042723298072815, + "learning_rate": 3.733901388443361e-05, + "loss": 1.6336, + "step": 5598 + }, + { + "epoch": 0.5815934351303625, + "grad_norm": 0.39631494879722595, + "learning_rate": 3.732322976424141e-05, + "loss": 1.4979, + "step": 5599 + }, + { + "epoch": 0.5816973096499428, + "grad_norm": 0.45113274455070496, + "learning_rate": 3.7307446994026375e-05, + "loss": 1.826, + "step": 5600 + }, + { + "epoch": 0.5818011841695232, + "grad_norm": 0.43801307678222656, + "learning_rate": 3.7291665575469216e-05, + "loss": 2.0605, + "step": 5601 + }, + { + "epoch": 0.5819050586891036, + "grad_norm": 0.4118470549583435, + "learning_rate": 3.727588551025053e-05, + "loss": 1.6777, + "step": 5602 + }, + { + "epoch": 0.582008933208684, + "grad_norm": 0.40802454948425293, + "learning_rate": 3.72601068000508e-05, + "loss": 1.6875, + "step": 5603 + }, + { + "epoch": 0.5821128077282642, + "grad_norm": 0.3772849142551422, + "learning_rate": 3.724432944655033e-05, + "loss": 1.7246, + "step": 5604 + }, + { + "epoch": 0.5822166822478446, + "grad_norm": 0.42528003454208374, + "learning_rate": 3.722855345142926e-05, + "loss": 1.7337, + "step": 5605 + }, + { + "epoch": 0.582320556767425, + "grad_norm": 0.4423576593399048, + "learning_rate": 3.721277881636764e-05, + "loss": 1.8097, + "step": 5606 + }, + { + "epoch": 0.5824244312870053, + "grad_norm": 0.4138087034225464, + "learning_rate": 3.719700554304533e-05, + "loss": 1.5108, + "step": 5607 + }, + { + "epoch": 0.5825283058065857, + "grad_norm": 0.4109199047088623, + "learning_rate": 3.718123363314206e-05, + "loss": 1.541, + "step": 5608 + }, + { + "epoch": 0.582632180326166, + "grad_norm": 0.38805916905403137, + "learning_rate": 3.7165463088337406e-05, + "loss": 1.7062, + "step": 5609 + }, + { + "epoch": 0.5827360548457463, + "grad_norm": 0.39964035153388977, + "learning_rate": 3.714969391031084e-05, + "loss": 1.6754, + "step": 5610 + }, + { + "epoch": 0.5828399293653267, + "grad_norm": 0.41137054562568665, + "learning_rate": 3.713392610074162e-05, + "loss": 1.6984, + "step": 5611 + }, + { + "epoch": 0.5829438038849071, + "grad_norm": 0.3872339129447937, + "learning_rate": 3.7118159661308906e-05, + "loss": 1.4976, + "step": 5612 + }, + { + "epoch": 0.5830476784044873, + "grad_norm": 0.4146081507205963, + "learning_rate": 3.7102394593691724e-05, + "loss": 1.5536, + "step": 5613 + }, + { + "epoch": 0.5831515529240677, + "grad_norm": 0.4401085674762726, + "learning_rate": 3.7086630899568906e-05, + "loss": 1.8147, + "step": 5614 + }, + { + "epoch": 0.5832554274436481, + "grad_norm": 0.46493473649024963, + "learning_rate": 3.707086858061917e-05, + "loss": 1.934, + "step": 5615 + }, + { + "epoch": 0.5833593019632284, + "grad_norm": 0.4143672585487366, + "learning_rate": 3.7055107638521086e-05, + "loss": 1.6462, + "step": 5616 + }, + { + "epoch": 0.5834631764828088, + "grad_norm": 0.41151854395866394, + "learning_rate": 3.703934807495305e-05, + "loss": 1.6508, + "step": 5617 + }, + { + "epoch": 0.5835670510023891, + "grad_norm": 0.47356948256492615, + "learning_rate": 3.7023589891593344e-05, + "loss": 1.9211, + "step": 5618 + }, + { + "epoch": 0.5836709255219694, + "grad_norm": 0.4014083445072174, + "learning_rate": 3.7007833090120116e-05, + "loss": 1.7667, + "step": 5619 + }, + { + "epoch": 0.5837748000415498, + "grad_norm": 0.3790227174758911, + "learning_rate": 3.69920776722113e-05, + "loss": 1.4863, + "step": 5620 + }, + { + "epoch": 0.5838786745611302, + "grad_norm": 0.43195411562919617, + "learning_rate": 3.697632363954475e-05, + "loss": 1.7431, + "step": 5621 + }, + { + "epoch": 0.5839825490807105, + "grad_norm": 0.4108622372150421, + "learning_rate": 3.696057099379816e-05, + "loss": 1.6647, + "step": 5622 + }, + { + "epoch": 0.5840864236002908, + "grad_norm": 0.3950975835323334, + "learning_rate": 3.694481973664904e-05, + "loss": 1.4736, + "step": 5623 + }, + { + "epoch": 0.5841902981198712, + "grad_norm": 0.41740620136260986, + "learning_rate": 3.6929069869774766e-05, + "loss": 1.6726, + "step": 5624 + }, + { + "epoch": 0.5842941726394515, + "grad_norm": 0.45372968912124634, + "learning_rate": 3.6913321394852616e-05, + "loss": 1.7569, + "step": 5625 + }, + { + "epoch": 0.5843980471590319, + "grad_norm": 0.3958111107349396, + "learning_rate": 3.689757431355964e-05, + "loss": 1.7056, + "step": 5626 + }, + { + "epoch": 0.5845019216786123, + "grad_norm": 0.4113248288631439, + "learning_rate": 3.68818286275728e-05, + "loss": 1.7895, + "step": 5627 + }, + { + "epoch": 0.5846057961981925, + "grad_norm": 0.4527917802333832, + "learning_rate": 3.686608433856891e-05, + "loss": 1.7048, + "step": 5628 + }, + { + "epoch": 0.5847096707177729, + "grad_norm": 0.43237176537513733, + "learning_rate": 3.6850341448224574e-05, + "loss": 1.6414, + "step": 5629 + }, + { + "epoch": 0.5848135452373533, + "grad_norm": 0.4343053102493286, + "learning_rate": 3.6834599958216286e-05, + "loss": 1.7855, + "step": 5630 + }, + { + "epoch": 0.5849174197569337, + "grad_norm": 0.44802016019821167, + "learning_rate": 3.6818859870220415e-05, + "loss": 1.8118, + "step": 5631 + }, + { + "epoch": 0.585021294276514, + "grad_norm": 0.4242957532405853, + "learning_rate": 3.680312118591317e-05, + "loss": 1.6769, + "step": 5632 + }, + { + "epoch": 0.5851251687960943, + "grad_norm": 0.4112863540649414, + "learning_rate": 3.6787383906970564e-05, + "loss": 1.6765, + "step": 5633 + }, + { + "epoch": 0.5852290433156747, + "grad_norm": 0.435183048248291, + "learning_rate": 3.677164803506851e-05, + "loss": 1.6958, + "step": 5634 + }, + { + "epoch": 0.585332917835255, + "grad_norm": 0.4290895462036133, + "learning_rate": 3.675591357188276e-05, + "loss": 1.6542, + "step": 5635 + }, + { + "epoch": 0.5854367923548354, + "grad_norm": 0.44175586104393005, + "learning_rate": 3.6740180519088905e-05, + "loss": 1.802, + "step": 5636 + }, + { + "epoch": 0.5855406668744157, + "grad_norm": 0.3649253249168396, + "learning_rate": 3.672444887836239e-05, + "loss": 1.4738, + "step": 5637 + }, + { + "epoch": 0.585644541393996, + "grad_norm": 0.4127234220504761, + "learning_rate": 3.6708718651378534e-05, + "loss": 1.522, + "step": 5638 + }, + { + "epoch": 0.5857484159135764, + "grad_norm": 0.41260048747062683, + "learning_rate": 3.6692989839812444e-05, + "loss": 1.6182, + "step": 5639 + }, + { + "epoch": 0.5858522904331568, + "grad_norm": 0.410636305809021, + "learning_rate": 3.667726244533913e-05, + "loss": 1.6908, + "step": 5640 + }, + { + "epoch": 0.5859561649527371, + "grad_norm": 0.393493115901947, + "learning_rate": 3.6661536469633474e-05, + "loss": 1.5382, + "step": 5641 + }, + { + "epoch": 0.5860600394723174, + "grad_norm": 0.4044821560382843, + "learning_rate": 3.6645811914370126e-05, + "loss": 1.683, + "step": 5642 + }, + { + "epoch": 0.5861639139918978, + "grad_norm": 0.39565759897232056, + "learning_rate": 3.663008878122364e-05, + "loss": 1.6154, + "step": 5643 + }, + { + "epoch": 0.5862677885114781, + "grad_norm": 0.4338008165359497, + "learning_rate": 3.661436707186842e-05, + "loss": 1.8497, + "step": 5644 + }, + { + "epoch": 0.5863716630310585, + "grad_norm": 0.4520065188407898, + "learning_rate": 3.6598646787978676e-05, + "loss": 1.6919, + "step": 5645 + }, + { + "epoch": 0.5864755375506389, + "grad_norm": 0.3839634656906128, + "learning_rate": 3.6582927931228516e-05, + "loss": 1.6516, + "step": 5646 + }, + { + "epoch": 0.5865794120702191, + "grad_norm": 0.4146488606929779, + "learning_rate": 3.656721050329189e-05, + "loss": 1.7285, + "step": 5647 + }, + { + "epoch": 0.5866832865897995, + "grad_norm": 0.43894267082214355, + "learning_rate": 3.6551494505842554e-05, + "loss": 1.6571, + "step": 5648 + }, + { + "epoch": 0.5867871611093799, + "grad_norm": 0.42978256940841675, + "learning_rate": 3.653577994055416e-05, + "loss": 1.6641, + "step": 5649 + }, + { + "epoch": 0.5868910356289602, + "grad_norm": 0.40996453166007996, + "learning_rate": 3.652006680910017e-05, + "loss": 1.5068, + "step": 5650 + }, + { + "epoch": 0.5869949101485405, + "grad_norm": 0.4839547574520111, + "learning_rate": 3.650435511315392e-05, + "loss": 1.7859, + "step": 5651 + }, + { + "epoch": 0.5870987846681209, + "grad_norm": 0.40157848596572876, + "learning_rate": 3.6488644854388577e-05, + "loss": 1.5551, + "step": 5652 + }, + { + "epoch": 0.5872026591877012, + "grad_norm": 0.46978557109832764, + "learning_rate": 3.647293603447718e-05, + "loss": 1.8654, + "step": 5653 + }, + { + "epoch": 0.5873065337072816, + "grad_norm": 0.39479735493659973, + "learning_rate": 3.6457228655092554e-05, + "loss": 1.6824, + "step": 5654 + }, + { + "epoch": 0.587410408226862, + "grad_norm": 0.4402686357498169, + "learning_rate": 3.6441522717907443e-05, + "loss": 1.7669, + "step": 5655 + }, + { + "epoch": 0.5875142827464424, + "grad_norm": 0.41756048798561096, + "learning_rate": 3.642581822459441e-05, + "loss": 1.7688, + "step": 5656 + }, + { + "epoch": 0.5876181572660226, + "grad_norm": 0.42584848403930664, + "learning_rate": 3.641011517682588e-05, + "loss": 1.6345, + "step": 5657 + }, + { + "epoch": 0.587722031785603, + "grad_norm": 0.40674230456352234, + "learning_rate": 3.639441357627404e-05, + "loss": 1.6559, + "step": 5658 + }, + { + "epoch": 0.5878259063051834, + "grad_norm": 0.40528520941734314, + "learning_rate": 3.637871342461103e-05, + "loss": 1.6813, + "step": 5659 + }, + { + "epoch": 0.5879297808247637, + "grad_norm": 0.4171191453933716, + "learning_rate": 3.6363014723508813e-05, + "loss": 1.7832, + "step": 5660 + }, + { + "epoch": 0.588033655344344, + "grad_norm": 0.3836202919483185, + "learning_rate": 3.634731747463913e-05, + "loss": 1.6288, + "step": 5661 + }, + { + "epoch": 0.5881375298639244, + "grad_norm": 0.45273473858833313, + "learning_rate": 3.633162167967366e-05, + "loss": 1.6388, + "step": 5662 + }, + { + "epoch": 0.5882414043835047, + "grad_norm": 0.42023298144340515, + "learning_rate": 3.6315927340283855e-05, + "loss": 1.6475, + "step": 5663 + }, + { + "epoch": 0.5883452789030851, + "grad_norm": 0.4072648286819458, + "learning_rate": 3.630023445814106e-05, + "loss": 1.7276, + "step": 5664 + }, + { + "epoch": 0.5884491534226655, + "grad_norm": 0.37127307057380676, + "learning_rate": 3.6284543034916405e-05, + "loss": 1.449, + "step": 5665 + }, + { + "epoch": 0.5885530279422457, + "grad_norm": 0.40578770637512207, + "learning_rate": 3.6268853072280966e-05, + "loss": 1.692, + "step": 5666 + }, + { + "epoch": 0.5886569024618261, + "grad_norm": 0.4521169364452362, + "learning_rate": 3.625316457190555e-05, + "loss": 1.7896, + "step": 5667 + }, + { + "epoch": 0.5887607769814065, + "grad_norm": 0.4111865758895874, + "learning_rate": 3.623747753546086e-05, + "loss": 1.8176, + "step": 5668 + }, + { + "epoch": 0.5888646515009868, + "grad_norm": 0.4034864008426666, + "learning_rate": 3.62217919646175e-05, + "loss": 1.7778, + "step": 5669 + }, + { + "epoch": 0.5889685260205672, + "grad_norm": 0.405782550573349, + "learning_rate": 3.6206107861045804e-05, + "loss": 1.8, + "step": 5670 + }, + { + "epoch": 0.5890724005401475, + "grad_norm": 0.3821369707584381, + "learning_rate": 3.619042522641601e-05, + "loss": 1.4904, + "step": 5671 + }, + { + "epoch": 0.5891762750597278, + "grad_norm": 0.42174914479255676, + "learning_rate": 3.617474406239824e-05, + "loss": 1.6863, + "step": 5672 + }, + { + "epoch": 0.5892801495793082, + "grad_norm": 0.4899470806121826, + "learning_rate": 3.615906437066236e-05, + "loss": 1.5951, + "step": 5673 + }, + { + "epoch": 0.5893840240988886, + "grad_norm": 0.3886786699295044, + "learning_rate": 3.614338615287816e-05, + "loss": 1.5416, + "step": 5674 + }, + { + "epoch": 0.5894878986184688, + "grad_norm": 0.3970104455947876, + "learning_rate": 3.612770941071527e-05, + "loss": 1.6345, + "step": 5675 + }, + { + "epoch": 0.5895917731380492, + "grad_norm": 0.4441760182380676, + "learning_rate": 3.61120341458431e-05, + "loss": 1.7968, + "step": 5676 + }, + { + "epoch": 0.5896956476576296, + "grad_norm": 0.4187043607234955, + "learning_rate": 3.6096360359930984e-05, + "loss": 1.458, + "step": 5677 + }, + { + "epoch": 0.5897995221772099, + "grad_norm": 0.39509010314941406, + "learning_rate": 3.608068805464802e-05, + "loss": 1.8428, + "step": 5678 + }, + { + "epoch": 0.5899033966967903, + "grad_norm": 0.41668543219566345, + "learning_rate": 3.6065017231663214e-05, + "loss": 1.4289, + "step": 5679 + }, + { + "epoch": 0.5900072712163706, + "grad_norm": 0.4090832471847534, + "learning_rate": 3.6049347892645356e-05, + "loss": 1.7567, + "step": 5680 + }, + { + "epoch": 0.590111145735951, + "grad_norm": 0.4536181390285492, + "learning_rate": 3.603368003926314e-05, + "loss": 1.6062, + "step": 5681 + }, + { + "epoch": 0.5902150202555313, + "grad_norm": 0.4117638170719147, + "learning_rate": 3.601801367318507e-05, + "loss": 1.6996, + "step": 5682 + }, + { + "epoch": 0.5903188947751117, + "grad_norm": 0.4338860809803009, + "learning_rate": 3.600234879607947e-05, + "loss": 1.6467, + "step": 5683 + }, + { + "epoch": 0.5904227692946921, + "grad_norm": 0.4608552157878876, + "learning_rate": 3.5986685409614526e-05, + "loss": 1.8217, + "step": 5684 + }, + { + "epoch": 0.5905266438142723, + "grad_norm": 0.4324304163455963, + "learning_rate": 3.597102351545831e-05, + "loss": 1.7504, + "step": 5685 + }, + { + "epoch": 0.5906305183338527, + "grad_norm": 0.3990081250667572, + "learning_rate": 3.5955363115278626e-05, + "loss": 1.6519, + "step": 5686 + }, + { + "epoch": 0.5907343928534331, + "grad_norm": 0.4149520695209503, + "learning_rate": 3.593970421074323e-05, + "loss": 1.6564, + "step": 5687 + }, + { + "epoch": 0.5908382673730134, + "grad_norm": 0.4176534116268158, + "learning_rate": 3.5924046803519675e-05, + "loss": 1.5858, + "step": 5688 + }, + { + "epoch": 0.5909421418925938, + "grad_norm": 0.40572646260261536, + "learning_rate": 3.590839089527531e-05, + "loss": 1.6495, + "step": 5689 + }, + { + "epoch": 0.5910460164121741, + "grad_norm": 0.38206931948661804, + "learning_rate": 3.589273648767742e-05, + "loss": 1.6129, + "step": 5690 + }, + { + "epoch": 0.5911498909317544, + "grad_norm": 0.4322497248649597, + "learning_rate": 3.5877083582393044e-05, + "loss": 1.7354, + "step": 5691 + }, + { + "epoch": 0.5912537654513348, + "grad_norm": 0.41732117533683777, + "learning_rate": 3.586143218108911e-05, + "loss": 1.891, + "step": 5692 + }, + { + "epoch": 0.5913576399709152, + "grad_norm": 0.3857957422733307, + "learning_rate": 3.584578228543235e-05, + "loss": 1.6225, + "step": 5693 + }, + { + "epoch": 0.5914615144904954, + "grad_norm": 0.408961683511734, + "learning_rate": 3.5830133897089394e-05, + "loss": 1.671, + "step": 5694 + }, + { + "epoch": 0.5915653890100758, + "grad_norm": 0.4022825062274933, + "learning_rate": 3.5814487017726616e-05, + "loss": 1.648, + "step": 5695 + }, + { + "epoch": 0.5916692635296562, + "grad_norm": 0.468014657497406, + "learning_rate": 3.579884164901033e-05, + "loss": 1.8003, + "step": 5696 + }, + { + "epoch": 0.5917731380492365, + "grad_norm": 0.3964448869228363, + "learning_rate": 3.5783197792606654e-05, + "loss": 1.7439, + "step": 5697 + }, + { + "epoch": 0.5918770125688169, + "grad_norm": 0.39556026458740234, + "learning_rate": 3.576755545018151e-05, + "loss": 1.6194, + "step": 5698 + }, + { + "epoch": 0.5919808870883972, + "grad_norm": 0.3882671594619751, + "learning_rate": 3.5751914623400675e-05, + "loss": 1.599, + "step": 5699 + }, + { + "epoch": 0.5920847616079775, + "grad_norm": 0.4119251072406769, + "learning_rate": 3.573627531392982e-05, + "loss": 1.8195, + "step": 5700 + }, + { + "epoch": 0.5921886361275579, + "grad_norm": 0.4111797511577606, + "learning_rate": 3.572063752343436e-05, + "loss": 1.6844, + "step": 5701 + }, + { + "epoch": 0.5922925106471383, + "grad_norm": 0.3847576379776001, + "learning_rate": 3.5705001253579615e-05, + "loss": 1.652, + "step": 5702 + }, + { + "epoch": 0.5923963851667186, + "grad_norm": 0.44039520621299744, + "learning_rate": 3.568936650603076e-05, + "loss": 1.6053, + "step": 5703 + }, + { + "epoch": 0.5925002596862989, + "grad_norm": 0.4001440703868866, + "learning_rate": 3.5673733282452725e-05, + "loss": 1.3555, + "step": 5704 + }, + { + "epoch": 0.5926041342058793, + "grad_norm": 0.4470379054546356, + "learning_rate": 3.565810158451036e-05, + "loss": 1.7809, + "step": 5705 + }, + { + "epoch": 0.5927080087254596, + "grad_norm": 0.46440574526786804, + "learning_rate": 3.5642471413868305e-05, + "loss": 1.903, + "step": 5706 + }, + { + "epoch": 0.59281188324504, + "grad_norm": 0.4026218056678772, + "learning_rate": 3.562684277219104e-05, + "loss": 1.5994, + "step": 5707 + }, + { + "epoch": 0.5929157577646204, + "grad_norm": 0.47202712297439575, + "learning_rate": 3.5611215661142897e-05, + "loss": 1.8916, + "step": 5708 + }, + { + "epoch": 0.5930196322842007, + "grad_norm": 0.4305804967880249, + "learning_rate": 3.559559008238805e-05, + "loss": 1.4593, + "step": 5709 + }, + { + "epoch": 0.593123506803781, + "grad_norm": 0.4332740306854248, + "learning_rate": 3.5579966037590524e-05, + "loss": 1.7235, + "step": 5710 + }, + { + "epoch": 0.5932273813233614, + "grad_norm": 0.40714386105537415, + "learning_rate": 3.556434352841411e-05, + "loss": 1.631, + "step": 5711 + }, + { + "epoch": 0.5933312558429418, + "grad_norm": 0.4484121799468994, + "learning_rate": 3.554872255652252e-05, + "loss": 1.8386, + "step": 5712 + }, + { + "epoch": 0.593435130362522, + "grad_norm": 0.446119099855423, + "learning_rate": 3.553310312357927e-05, + "loss": 1.838, + "step": 5713 + }, + { + "epoch": 0.5935390048821024, + "grad_norm": 0.45277395844459534, + "learning_rate": 3.551748523124766e-05, + "loss": 1.7349, + "step": 5714 + }, + { + "epoch": 0.5936428794016828, + "grad_norm": 0.3999378979206085, + "learning_rate": 3.550186888119091e-05, + "loss": 1.6032, + "step": 5715 + }, + { + "epoch": 0.5937467539212631, + "grad_norm": 0.4385007619857788, + "learning_rate": 3.5486254075072056e-05, + "loss": 1.8558, + "step": 5716 + }, + { + "epoch": 0.5938506284408435, + "grad_norm": 0.42009690403938293, + "learning_rate": 3.547064081455391e-05, + "loss": 1.7783, + "step": 5717 + }, + { + "epoch": 0.5939545029604238, + "grad_norm": 0.3846227526664734, + "learning_rate": 3.5455029101299196e-05, + "loss": 1.4527, + "step": 5718 + }, + { + "epoch": 0.5940583774800041, + "grad_norm": 0.4681823253631592, + "learning_rate": 3.5439418936970425e-05, + "loss": 1.8712, + "step": 5719 + }, + { + "epoch": 0.5941622519995845, + "grad_norm": 0.4000855088233948, + "learning_rate": 3.542381032322997e-05, + "loss": 1.548, + "step": 5720 + }, + { + "epoch": 0.5942661265191649, + "grad_norm": 0.3869953453540802, + "learning_rate": 3.540820326173999e-05, + "loss": 1.5979, + "step": 5721 + }, + { + "epoch": 0.5943700010387452, + "grad_norm": 0.43959927558898926, + "learning_rate": 3.539259775416257e-05, + "loss": 1.7001, + "step": 5722 + }, + { + "epoch": 0.5944738755583255, + "grad_norm": 0.40868815779685974, + "learning_rate": 3.537699380215953e-05, + "loss": 1.7146, + "step": 5723 + }, + { + "epoch": 0.5945777500779059, + "grad_norm": 0.39394938945770264, + "learning_rate": 3.5361391407392583e-05, + "loss": 1.711, + "step": 5724 + }, + { + "epoch": 0.5946816245974862, + "grad_norm": 0.4582231044769287, + "learning_rate": 3.5345790571523286e-05, + "loss": 1.9054, + "step": 5725 + }, + { + "epoch": 0.5947854991170666, + "grad_norm": 0.40016230940818787, + "learning_rate": 3.5330191296212976e-05, + "loss": 1.7314, + "step": 5726 + }, + { + "epoch": 0.594889373636647, + "grad_norm": 0.45281219482421875, + "learning_rate": 3.5314593583122854e-05, + "loss": 1.8133, + "step": 5727 + }, + { + "epoch": 0.5949932481562272, + "grad_norm": 0.41264450550079346, + "learning_rate": 3.529899743391398e-05, + "loss": 1.7283, + "step": 5728 + }, + { + "epoch": 0.5950971226758076, + "grad_norm": 0.41202911734580994, + "learning_rate": 3.5283402850247194e-05, + "loss": 1.3802, + "step": 5729 + }, + { + "epoch": 0.595200997195388, + "grad_norm": 0.4194154143333435, + "learning_rate": 3.526780983378321e-05, + "loss": 1.6485, + "step": 5730 + }, + { + "epoch": 0.5953048717149683, + "grad_norm": 0.4135974943637848, + "learning_rate": 3.5252218386182585e-05, + "loss": 1.793, + "step": 5731 + }, + { + "epoch": 0.5954087462345486, + "grad_norm": 0.41171103715896606, + "learning_rate": 3.523662850910564e-05, + "loss": 1.7508, + "step": 5732 + }, + { + "epoch": 0.595512620754129, + "grad_norm": 0.4255639314651489, + "learning_rate": 3.5221040204212606e-05, + "loss": 1.6849, + "step": 5733 + }, + { + "epoch": 0.5956164952737094, + "grad_norm": 0.4412103295326233, + "learning_rate": 3.5205453473163506e-05, + "loss": 1.6842, + "step": 5734 + }, + { + "epoch": 0.5957203697932897, + "grad_norm": 0.397868275642395, + "learning_rate": 3.518986831761824e-05, + "loss": 1.6491, + "step": 5735 + }, + { + "epoch": 0.5958242443128701, + "grad_norm": 0.4192911684513092, + "learning_rate": 3.517428473923644e-05, + "loss": 1.8276, + "step": 5736 + }, + { + "epoch": 0.5959281188324504, + "grad_norm": 0.39529258012771606, + "learning_rate": 3.515870273967767e-05, + "loss": 1.6447, + "step": 5737 + }, + { + "epoch": 0.5960319933520307, + "grad_norm": 0.39592933654785156, + "learning_rate": 3.514312232060132e-05, + "loss": 1.6278, + "step": 5738 + }, + { + "epoch": 0.5961358678716111, + "grad_norm": 0.4018373489379883, + "learning_rate": 3.512754348366652e-05, + "loss": 1.7605, + "step": 5739 + }, + { + "epoch": 0.5962397423911915, + "grad_norm": 0.47401878237724304, + "learning_rate": 3.5111966230532355e-05, + "loss": 2.0161, + "step": 5740 + }, + { + "epoch": 0.5963436169107718, + "grad_norm": 0.4021179676055908, + "learning_rate": 3.509639056285766e-05, + "loss": 1.6192, + "step": 5741 + }, + { + "epoch": 0.5964474914303521, + "grad_norm": 0.427116334438324, + "learning_rate": 3.50808164823011e-05, + "loss": 1.6114, + "step": 5742 + }, + { + "epoch": 0.5965513659499325, + "grad_norm": 0.41006430983543396, + "learning_rate": 3.5065243990521205e-05, + "loss": 1.7771, + "step": 5743 + }, + { + "epoch": 0.5966552404695128, + "grad_norm": 0.39242881536483765, + "learning_rate": 3.504967308917635e-05, + "loss": 1.6355, + "step": 5744 + }, + { + "epoch": 0.5967591149890932, + "grad_norm": 0.39691224694252014, + "learning_rate": 3.5034103779924674e-05, + "loss": 1.5866, + "step": 5745 + }, + { + "epoch": 0.5968629895086736, + "grad_norm": 0.4471852481365204, + "learning_rate": 3.5018536064424224e-05, + "loss": 1.663, + "step": 5746 + }, + { + "epoch": 0.5969668640282538, + "grad_norm": 0.39760124683380127, + "learning_rate": 3.500296994433282e-05, + "loss": 1.7701, + "step": 5747 + }, + { + "epoch": 0.5970707385478342, + "grad_norm": 0.3909447193145752, + "learning_rate": 3.498740542130813e-05, + "loss": 1.5167, + "step": 5748 + }, + { + "epoch": 0.5971746130674146, + "grad_norm": 0.4225844442844391, + "learning_rate": 3.497184249700766e-05, + "loss": 1.6903, + "step": 5749 + }, + { + "epoch": 0.5972784875869949, + "grad_norm": 0.4088863432407379, + "learning_rate": 3.495628117308875e-05, + "loss": 1.3562, + "step": 5750 + }, + { + "epoch": 0.5973823621065752, + "grad_norm": 0.39334914088249207, + "learning_rate": 3.4940721451208535e-05, + "loss": 1.6846, + "step": 5751 + }, + { + "epoch": 0.5974862366261556, + "grad_norm": 0.4623134434223175, + "learning_rate": 3.4925163333024015e-05, + "loss": 1.6667, + "step": 5752 + }, + { + "epoch": 0.5975901111457359, + "grad_norm": 0.4101615846157074, + "learning_rate": 3.490960682019203e-05, + "loss": 1.594, + "step": 5753 + }, + { + "epoch": 0.5976939856653163, + "grad_norm": 0.4440319240093231, + "learning_rate": 3.489405191436921e-05, + "loss": 1.8576, + "step": 5754 + }, + { + "epoch": 0.5977978601848967, + "grad_norm": 0.45121869444847107, + "learning_rate": 3.487849861721201e-05, + "loss": 1.7881, + "step": 5755 + }, + { + "epoch": 0.5979017347044769, + "grad_norm": 0.4189302623271942, + "learning_rate": 3.4862946930376775e-05, + "loss": 1.5814, + "step": 5756 + }, + { + "epoch": 0.5980056092240573, + "grad_norm": 0.45438921451568604, + "learning_rate": 3.4847396855519604e-05, + "loss": 1.688, + "step": 5757 + }, + { + "epoch": 0.5981094837436377, + "grad_norm": 0.40121012926101685, + "learning_rate": 3.483184839429647e-05, + "loss": 1.7056, + "step": 5758 + }, + { + "epoch": 0.5982133582632181, + "grad_norm": 0.4294279217720032, + "learning_rate": 3.481630154836317e-05, + "loss": 1.6278, + "step": 5759 + }, + { + "epoch": 0.5983172327827984, + "grad_norm": 0.3801029324531555, + "learning_rate": 3.4800756319375325e-05, + "loss": 1.5913, + "step": 5760 + }, + { + "epoch": 0.5984211073023787, + "grad_norm": 0.43568864464759827, + "learning_rate": 3.478521270898838e-05, + "loss": 1.7894, + "step": 5761 + }, + { + "epoch": 0.5985249818219591, + "grad_norm": 0.4160782992839813, + "learning_rate": 3.476967071885758e-05, + "loss": 1.6332, + "step": 5762 + }, + { + "epoch": 0.5986288563415394, + "grad_norm": 0.4249245524406433, + "learning_rate": 3.475413035063807e-05, + "loss": 1.6174, + "step": 5763 + }, + { + "epoch": 0.5987327308611198, + "grad_norm": 0.38329437375068665, + "learning_rate": 3.473859160598474e-05, + "loss": 1.7204, + "step": 5764 + }, + { + "epoch": 0.5988366053807002, + "grad_norm": 0.4142932593822479, + "learning_rate": 3.4723054486552354e-05, + "loss": 1.7932, + "step": 5765 + }, + { + "epoch": 0.5989404799002804, + "grad_norm": 0.44899502396583557, + "learning_rate": 3.4707518993995524e-05, + "loss": 1.7966, + "step": 5766 + }, + { + "epoch": 0.5990443544198608, + "grad_norm": 0.40485885739326477, + "learning_rate": 3.469198512996863e-05, + "loss": 1.6783, + "step": 5767 + }, + { + "epoch": 0.5991482289394412, + "grad_norm": 0.39931273460388184, + "learning_rate": 3.467645289612589e-05, + "loss": 1.5305, + "step": 5768 + }, + { + "epoch": 0.5992521034590215, + "grad_norm": 0.381760835647583, + "learning_rate": 3.4660922294121416e-05, + "loss": 1.4935, + "step": 5769 + }, + { + "epoch": 0.5993559779786019, + "grad_norm": 0.4411904215812683, + "learning_rate": 3.4645393325609046e-05, + "loss": 1.5182, + "step": 5770 + }, + { + "epoch": 0.5994598524981822, + "grad_norm": 0.39932867884635925, + "learning_rate": 3.462986599224252e-05, + "loss": 1.7623, + "step": 5771 + }, + { + "epoch": 0.5995637270177625, + "grad_norm": 0.40389779210090637, + "learning_rate": 3.46143402956754e-05, + "loss": 1.5783, + "step": 5772 + }, + { + "epoch": 0.5996676015373429, + "grad_norm": 0.41848430037498474, + "learning_rate": 3.4598816237561e-05, + "loss": 1.7221, + "step": 5773 + }, + { + "epoch": 0.5997714760569233, + "grad_norm": 0.4139845669269562, + "learning_rate": 3.4583293819552534e-05, + "loss": 1.5198, + "step": 5774 + }, + { + "epoch": 0.5998753505765035, + "grad_norm": 0.4001912474632263, + "learning_rate": 3.456777304330303e-05, + "loss": 1.6871, + "step": 5775 + }, + { + "epoch": 0.5999792250960839, + "grad_norm": 0.44707363843917847, + "learning_rate": 3.455225391046533e-05, + "loss": 1.6126, + "step": 5776 + }, + { + "epoch": 0.6000830996156643, + "grad_norm": 0.3965167701244354, + "learning_rate": 3.4536736422692076e-05, + "loss": 1.4743, + "step": 5777 + }, + { + "epoch": 0.6001869741352446, + "grad_norm": 0.42330119013786316, + "learning_rate": 3.4521220581635785e-05, + "loss": 1.4636, + "step": 5778 + }, + { + "epoch": 0.600290848654825, + "grad_norm": 0.3950074315071106, + "learning_rate": 3.4505706388948755e-05, + "loss": 1.7064, + "step": 5779 + }, + { + "epoch": 0.6003947231744053, + "grad_norm": 0.42809754610061646, + "learning_rate": 3.4490193846283125e-05, + "loss": 1.7697, + "step": 5780 + }, + { + "epoch": 0.6004985976939856, + "grad_norm": 0.4212953448295593, + "learning_rate": 3.44746829552909e-05, + "loss": 1.7538, + "step": 5781 + }, + { + "epoch": 0.600602472213566, + "grad_norm": 0.4184706211090088, + "learning_rate": 3.445917371762382e-05, + "loss": 1.8034, + "step": 5782 + }, + { + "epoch": 0.6007063467331464, + "grad_norm": 0.42354482412338257, + "learning_rate": 3.444366613493351e-05, + "loss": 1.6961, + "step": 5783 + }, + { + "epoch": 0.6008102212527268, + "grad_norm": 0.388049840927124, + "learning_rate": 3.4428160208871414e-05, + "loss": 1.6964, + "step": 5784 + }, + { + "epoch": 0.600914095772307, + "grad_norm": 0.4392579197883606, + "learning_rate": 3.441265594108881e-05, + "loss": 1.5853, + "step": 5785 + }, + { + "epoch": 0.6010179702918874, + "grad_norm": 0.3931465148925781, + "learning_rate": 3.4397153333236735e-05, + "loss": 1.5617, + "step": 5786 + }, + { + "epoch": 0.6011218448114678, + "grad_norm": 0.3917011320590973, + "learning_rate": 3.438165238696614e-05, + "loss": 1.642, + "step": 5787 + }, + { + "epoch": 0.6012257193310481, + "grad_norm": 0.41590529680252075, + "learning_rate": 3.436615310392773e-05, + "loss": 1.5589, + "step": 5788 + }, + { + "epoch": 0.6013295938506285, + "grad_norm": 0.46075764298439026, + "learning_rate": 3.4350655485772074e-05, + "loss": 1.8877, + "step": 5789 + }, + { + "epoch": 0.6014334683702088, + "grad_norm": 0.4078013300895691, + "learning_rate": 3.433515953414953e-05, + "loss": 1.5837, + "step": 5790 + }, + { + "epoch": 0.6015373428897891, + "grad_norm": 0.4370696246623993, + "learning_rate": 3.431966525071033e-05, + "loss": 1.7953, + "step": 5791 + }, + { + "epoch": 0.6016412174093695, + "grad_norm": 0.3889653980731964, + "learning_rate": 3.4304172637104436e-05, + "loss": 1.6294, + "step": 5792 + }, + { + "epoch": 0.6017450919289499, + "grad_norm": 0.3979250192642212, + "learning_rate": 3.4288681694981735e-05, + "loss": 1.686, + "step": 5793 + }, + { + "epoch": 0.6018489664485301, + "grad_norm": 0.428313672542572, + "learning_rate": 3.42731924259919e-05, + "loss": 1.6336, + "step": 5794 + }, + { + "epoch": 0.6019528409681105, + "grad_norm": 0.4451068043708801, + "learning_rate": 3.4257704831784375e-05, + "loss": 1.6593, + "step": 5795 + }, + { + "epoch": 0.6020567154876909, + "grad_norm": 0.47442716360092163, + "learning_rate": 3.42422189140085e-05, + "loss": 1.8737, + "step": 5796 + }, + { + "epoch": 0.6021605900072712, + "grad_norm": 0.4260861277580261, + "learning_rate": 3.42267346743134e-05, + "loss": 1.7137, + "step": 5797 + }, + { + "epoch": 0.6022644645268516, + "grad_norm": 0.43714046478271484, + "learning_rate": 3.4211252114348e-05, + "loss": 1.7122, + "step": 5798 + }, + { + "epoch": 0.6023683390464319, + "grad_norm": 0.4364699721336365, + "learning_rate": 3.41957712357611e-05, + "loss": 1.6723, + "step": 5799 + }, + { + "epoch": 0.6024722135660122, + "grad_norm": 0.39638301730155945, + "learning_rate": 3.41802920402013e-05, + "loss": 1.6197, + "step": 5800 + }, + { + "epoch": 0.6025760880855926, + "grad_norm": 0.4492684602737427, + "learning_rate": 3.416481452931699e-05, + "loss": 1.5572, + "step": 5801 + }, + { + "epoch": 0.602679962605173, + "grad_norm": 0.3939664661884308, + "learning_rate": 3.4149338704756416e-05, + "loss": 1.6363, + "step": 5802 + }, + { + "epoch": 0.6027838371247533, + "grad_norm": 0.5074815154075623, + "learning_rate": 3.413386456816763e-05, + "loss": 1.8986, + "step": 5803 + }, + { + "epoch": 0.6028877116443336, + "grad_norm": 0.4267491102218628, + "learning_rate": 3.411839212119851e-05, + "loss": 1.64, + "step": 5804 + }, + { + "epoch": 0.602991586163914, + "grad_norm": 0.39998719096183777, + "learning_rate": 3.410292136549673e-05, + "loss": 1.577, + "step": 5805 + }, + { + "epoch": 0.6030954606834943, + "grad_norm": 0.3894689381122589, + "learning_rate": 3.408745230270985e-05, + "loss": 1.6308, + "step": 5806 + }, + { + "epoch": 0.6031993352030747, + "grad_norm": 0.4221261441707611, + "learning_rate": 3.407198493448516e-05, + "loss": 1.6313, + "step": 5807 + }, + { + "epoch": 0.603303209722655, + "grad_norm": 0.4228569269180298, + "learning_rate": 3.405651926246982e-05, + "loss": 1.7215, + "step": 5808 + }, + { + "epoch": 0.6034070842422353, + "grad_norm": 0.3922746479511261, + "learning_rate": 3.4041055288310844e-05, + "loss": 1.665, + "step": 5809 + }, + { + "epoch": 0.6035109587618157, + "grad_norm": 0.39183309674263, + "learning_rate": 3.402559301365498e-05, + "loss": 1.6458, + "step": 5810 + }, + { + "epoch": 0.6036148332813961, + "grad_norm": 0.4340969920158386, + "learning_rate": 3.401013244014884e-05, + "loss": 1.6737, + "step": 5811 + }, + { + "epoch": 0.6037187078009765, + "grad_norm": 0.4317123591899872, + "learning_rate": 3.3994673569438886e-05, + "loss": 1.702, + "step": 5812 + }, + { + "epoch": 0.6038225823205567, + "grad_norm": 0.4379931092262268, + "learning_rate": 3.397921640317136e-05, + "loss": 1.6435, + "step": 5813 + }, + { + "epoch": 0.6039264568401371, + "grad_norm": 0.4368740916252136, + "learning_rate": 3.39637609429923e-05, + "loss": 1.7202, + "step": 5814 + }, + { + "epoch": 0.6040303313597175, + "grad_norm": 0.4536083936691284, + "learning_rate": 3.3948307190547624e-05, + "loss": 1.664, + "step": 5815 + }, + { + "epoch": 0.6041342058792978, + "grad_norm": 0.40802082419395447, + "learning_rate": 3.3932855147483024e-05, + "loss": 1.6268, + "step": 5816 + }, + { + "epoch": 0.6042380803988782, + "grad_norm": 0.4093196392059326, + "learning_rate": 3.391740481544402e-05, + "loss": 1.7967, + "step": 5817 + }, + { + "epoch": 0.6043419549184585, + "grad_norm": 0.41186216473579407, + "learning_rate": 3.3901956196075945e-05, + "loss": 1.5787, + "step": 5818 + }, + { + "epoch": 0.6044458294380388, + "grad_norm": 0.431708961725235, + "learning_rate": 3.3886509291023986e-05, + "loss": 1.6593, + "step": 5819 + }, + { + "epoch": 0.6045497039576192, + "grad_norm": 0.42467281222343445, + "learning_rate": 3.387106410193308e-05, + "loss": 1.6942, + "step": 5820 + }, + { + "epoch": 0.6046535784771996, + "grad_norm": 0.3974582254886627, + "learning_rate": 3.385562063044803e-05, + "loss": 1.5867, + "step": 5821 + }, + { + "epoch": 0.6047574529967799, + "grad_norm": 0.4653453528881073, + "learning_rate": 3.384017887821348e-05, + "loss": 1.8252, + "step": 5822 + }, + { + "epoch": 0.6048613275163602, + "grad_norm": 0.4493721127510071, + "learning_rate": 3.382473884687382e-05, + "loss": 1.8101, + "step": 5823 + }, + { + "epoch": 0.6049652020359406, + "grad_norm": 0.3836005926132202, + "learning_rate": 3.380930053807328e-05, + "loss": 1.6124, + "step": 5824 + }, + { + "epoch": 0.6050690765555209, + "grad_norm": 0.38905587792396545, + "learning_rate": 3.379386395345596e-05, + "loss": 1.575, + "step": 5825 + }, + { + "epoch": 0.6051729510751013, + "grad_norm": 0.4363052248954773, + "learning_rate": 3.377842909466571e-05, + "loss": 1.6505, + "step": 5826 + }, + { + "epoch": 0.6052768255946817, + "grad_norm": 0.40476155281066895, + "learning_rate": 3.376299596334622e-05, + "loss": 1.7677, + "step": 5827 + }, + { + "epoch": 0.6053807001142619, + "grad_norm": 0.42141294479370117, + "learning_rate": 3.374756456114102e-05, + "loss": 1.5649, + "step": 5828 + }, + { + "epoch": 0.6054845746338423, + "grad_norm": 0.45798051357269287, + "learning_rate": 3.373213488969341e-05, + "loss": 1.8967, + "step": 5829 + }, + { + "epoch": 0.6055884491534227, + "grad_norm": 0.4245013892650604, + "learning_rate": 3.371670695064655e-05, + "loss": 1.6626, + "step": 5830 + }, + { + "epoch": 0.605692323673003, + "grad_norm": 0.4293268918991089, + "learning_rate": 3.370128074564338e-05, + "loss": 1.9178, + "step": 5831 + }, + { + "epoch": 0.6057961981925833, + "grad_norm": 0.42448824644088745, + "learning_rate": 3.368585627632668e-05, + "loss": 1.7219, + "step": 5832 + }, + { + "epoch": 0.6059000727121637, + "grad_norm": 0.3986135423183441, + "learning_rate": 3.3670433544339014e-05, + "loss": 1.6541, + "step": 5833 + }, + { + "epoch": 0.606003947231744, + "grad_norm": 0.44702157378196716, + "learning_rate": 3.365501255132283e-05, + "loss": 1.5446, + "step": 5834 + }, + { + "epoch": 0.6061078217513244, + "grad_norm": 0.39528048038482666, + "learning_rate": 3.363959329892029e-05, + "loss": 1.5639, + "step": 5835 + }, + { + "epoch": 0.6062116962709048, + "grad_norm": 0.40425539016723633, + "learning_rate": 3.362417578877344e-05, + "loss": 1.5162, + "step": 5836 + }, + { + "epoch": 0.6063155707904851, + "grad_norm": 0.3955405056476593, + "learning_rate": 3.360876002252415e-05, + "loss": 1.6176, + "step": 5837 + }, + { + "epoch": 0.6064194453100654, + "grad_norm": 0.4061652421951294, + "learning_rate": 3.359334600181407e-05, + "loss": 1.7371, + "step": 5838 + }, + { + "epoch": 0.6065233198296458, + "grad_norm": 0.41885098814964294, + "learning_rate": 3.357793372828464e-05, + "loss": 1.4975, + "step": 5839 + }, + { + "epoch": 0.6066271943492262, + "grad_norm": 0.3937050998210907, + "learning_rate": 3.356252320357718e-05, + "loss": 1.623, + "step": 5840 + }, + { + "epoch": 0.6067310688688065, + "grad_norm": 0.4089513123035431, + "learning_rate": 3.354711442933279e-05, + "loss": 1.5882, + "step": 5841 + }, + { + "epoch": 0.6068349433883868, + "grad_norm": 0.43716487288475037, + "learning_rate": 3.353170740719236e-05, + "loss": 1.814, + "step": 5842 + }, + { + "epoch": 0.6069388179079672, + "grad_norm": 0.4473409056663513, + "learning_rate": 3.351630213879665e-05, + "loss": 1.5899, + "step": 5843 + }, + { + "epoch": 0.6070426924275475, + "grad_norm": 0.40252211689949036, + "learning_rate": 3.3500898625786184e-05, + "loss": 1.6746, + "step": 5844 + }, + { + "epoch": 0.6071465669471279, + "grad_norm": 0.406857967376709, + "learning_rate": 3.34854968698013e-05, + "loss": 1.5783, + "step": 5845 + }, + { + "epoch": 0.6072504414667083, + "grad_norm": 0.41453415155410767, + "learning_rate": 3.347009687248219e-05, + "loss": 1.7318, + "step": 5846 + }, + { + "epoch": 0.6073543159862885, + "grad_norm": 0.3779454231262207, + "learning_rate": 3.345469863546884e-05, + "loss": 1.5555, + "step": 5847 + }, + { + "epoch": 0.6074581905058689, + "grad_norm": 0.40654996037483215, + "learning_rate": 3.3439302160401e-05, + "loss": 1.5914, + "step": 5848 + }, + { + "epoch": 0.6075620650254493, + "grad_norm": 0.4251600205898285, + "learning_rate": 3.342390744891831e-05, + "loss": 1.7131, + "step": 5849 + }, + { + "epoch": 0.6076659395450296, + "grad_norm": 0.45632246136665344, + "learning_rate": 3.340851450266019e-05, + "loss": 1.7986, + "step": 5850 + }, + { + "epoch": 0.60776981406461, + "grad_norm": 0.43041008710861206, + "learning_rate": 3.3393123323265855e-05, + "loss": 1.7041, + "step": 5851 + }, + { + "epoch": 0.6078736885841903, + "grad_norm": 0.3932364284992218, + "learning_rate": 3.337773391237433e-05, + "loss": 1.5044, + "step": 5852 + }, + { + "epoch": 0.6079775631037706, + "grad_norm": 0.3887140154838562, + "learning_rate": 3.336234627162451e-05, + "loss": 1.6597, + "step": 5853 + }, + { + "epoch": 0.608081437623351, + "grad_norm": 0.41823363304138184, + "learning_rate": 3.3346960402655016e-05, + "loss": 1.6794, + "step": 5854 + }, + { + "epoch": 0.6081853121429314, + "grad_norm": 0.4427807033061981, + "learning_rate": 3.333157630710434e-05, + "loss": 1.8548, + "step": 5855 + }, + { + "epoch": 0.6082891866625116, + "grad_norm": 0.4366241693496704, + "learning_rate": 3.331619398661078e-05, + "loss": 1.597, + "step": 5856 + }, + { + "epoch": 0.608393061182092, + "grad_norm": 0.4059387147426605, + "learning_rate": 3.3300813442812406e-05, + "loss": 1.5453, + "step": 5857 + }, + { + "epoch": 0.6084969357016724, + "grad_norm": 0.40862488746643066, + "learning_rate": 3.328543467734715e-05, + "loss": 1.6159, + "step": 5858 + }, + { + "epoch": 0.6086008102212527, + "grad_norm": 0.4672967195510864, + "learning_rate": 3.327005769185273e-05, + "loss": 1.9154, + "step": 5859 + }, + { + "epoch": 0.6087046847408331, + "grad_norm": 0.4559909403324127, + "learning_rate": 3.325468248796666e-05, + "loss": 1.6842, + "step": 5860 + }, + { + "epoch": 0.6088085592604134, + "grad_norm": 0.3882789611816406, + "learning_rate": 3.323930906732628e-05, + "loss": 1.5068, + "step": 5861 + }, + { + "epoch": 0.6089124337799938, + "grad_norm": 0.4709301292896271, + "learning_rate": 3.322393743156874e-05, + "loss": 1.8432, + "step": 5862 + }, + { + "epoch": 0.6090163082995741, + "grad_norm": 0.43151330947875977, + "learning_rate": 3.3208567582331034e-05, + "loss": 1.7516, + "step": 5863 + }, + { + "epoch": 0.6091201828191545, + "grad_norm": 0.3997372090816498, + "learning_rate": 3.3193199521249876e-05, + "loss": 1.7277, + "step": 5864 + }, + { + "epoch": 0.6092240573387349, + "grad_norm": 0.44914302229881287, + "learning_rate": 3.3177833249961886e-05, + "loss": 1.5975, + "step": 5865 + }, + { + "epoch": 0.6093279318583151, + "grad_norm": 0.3897424638271332, + "learning_rate": 3.316246877010344e-05, + "loss": 1.6415, + "step": 5866 + }, + { + "epoch": 0.6094318063778955, + "grad_norm": 0.4456162452697754, + "learning_rate": 3.314710608331072e-05, + "loss": 1.7596, + "step": 5867 + }, + { + "epoch": 0.6095356808974759, + "grad_norm": 0.4402760863304138, + "learning_rate": 3.3131745191219744e-05, + "loss": 1.7909, + "step": 5868 + }, + { + "epoch": 0.6096395554170562, + "grad_norm": 0.41520991921424866, + "learning_rate": 3.311638609546635e-05, + "loss": 1.6518, + "step": 5869 + }, + { + "epoch": 0.6097434299366365, + "grad_norm": 0.40946778655052185, + "learning_rate": 3.3101028797686106e-05, + "loss": 1.6841, + "step": 5870 + }, + { + "epoch": 0.6098473044562169, + "grad_norm": 0.4258642792701721, + "learning_rate": 3.3085673299514496e-05, + "loss": 1.7182, + "step": 5871 + }, + { + "epoch": 0.6099511789757972, + "grad_norm": 0.38915473222732544, + "learning_rate": 3.307031960258674e-05, + "loss": 1.5292, + "step": 5872 + }, + { + "epoch": 0.6100550534953776, + "grad_norm": 0.39618629217147827, + "learning_rate": 3.30549677085379e-05, + "loss": 1.6406, + "step": 5873 + }, + { + "epoch": 0.610158928014958, + "grad_norm": 0.3926134407520294, + "learning_rate": 3.30396176190028e-05, + "loss": 1.5848, + "step": 5874 + }, + { + "epoch": 0.6102628025345382, + "grad_norm": 0.41277050971984863, + "learning_rate": 3.3024269335616155e-05, + "loss": 1.6565, + "step": 5875 + }, + { + "epoch": 0.6103666770541186, + "grad_norm": 0.40988320112228394, + "learning_rate": 3.300892286001238e-05, + "loss": 1.72, + "step": 5876 + }, + { + "epoch": 0.610470551573699, + "grad_norm": 0.4124213457107544, + "learning_rate": 3.29935781938258e-05, + "loss": 1.5959, + "step": 5877 + }, + { + "epoch": 0.6105744260932793, + "grad_norm": 0.3976089060306549, + "learning_rate": 3.29782353386905e-05, + "loss": 1.7009, + "step": 5878 + }, + { + "epoch": 0.6106783006128597, + "grad_norm": 0.40462133288383484, + "learning_rate": 3.296289429624035e-05, + "loss": 1.5514, + "step": 5879 + }, + { + "epoch": 0.61078217513244, + "grad_norm": 0.42904216051101685, + "learning_rate": 3.294755506810905e-05, + "loss": 1.6511, + "step": 5880 + }, + { + "epoch": 0.6108860496520203, + "grad_norm": 0.44326838850975037, + "learning_rate": 3.293221765593015e-05, + "loss": 1.4797, + "step": 5881 + }, + { + "epoch": 0.6109899241716007, + "grad_norm": 0.3995533883571625, + "learning_rate": 3.291688206133691e-05, + "loss": 1.77, + "step": 5882 + }, + { + "epoch": 0.6110937986911811, + "grad_norm": 0.3896981179714203, + "learning_rate": 3.290154828596248e-05, + "loss": 1.6022, + "step": 5883 + }, + { + "epoch": 0.6111976732107613, + "grad_norm": 0.38104161620140076, + "learning_rate": 3.28862163314398e-05, + "loss": 1.5761, + "step": 5884 + }, + { + "epoch": 0.6113015477303417, + "grad_norm": 0.4309033155441284, + "learning_rate": 3.287088619940158e-05, + "loss": 1.7047, + "step": 5885 + }, + { + "epoch": 0.6114054222499221, + "grad_norm": 0.38424256443977356, + "learning_rate": 3.285555789148036e-05, + "loss": 1.5804, + "step": 5886 + }, + { + "epoch": 0.6115092967695024, + "grad_norm": 0.39635035395622253, + "learning_rate": 3.284023140930851e-05, + "loss": 1.7646, + "step": 5887 + }, + { + "epoch": 0.6116131712890828, + "grad_norm": 0.40174224972724915, + "learning_rate": 3.2824906754518146e-05, + "loss": 1.6731, + "step": 5888 + }, + { + "epoch": 0.6117170458086632, + "grad_norm": 0.42028430104255676, + "learning_rate": 3.280958392874124e-05, + "loss": 1.7342, + "step": 5889 + }, + { + "epoch": 0.6118209203282435, + "grad_norm": 0.40835410356521606, + "learning_rate": 3.2794262933609555e-05, + "loss": 1.6333, + "step": 5890 + }, + { + "epoch": 0.6119247948478238, + "grad_norm": 0.43254995346069336, + "learning_rate": 3.277894377075468e-05, + "loss": 1.8507, + "step": 5891 + }, + { + "epoch": 0.6120286693674042, + "grad_norm": 0.41028037667274475, + "learning_rate": 3.276362644180794e-05, + "loss": 1.712, + "step": 5892 + }, + { + "epoch": 0.6121325438869846, + "grad_norm": 0.428543359041214, + "learning_rate": 3.2748310948400545e-05, + "loss": 1.6575, + "step": 5893 + }, + { + "epoch": 0.6122364184065648, + "grad_norm": 0.46985751390457153, + "learning_rate": 3.273299729216348e-05, + "loss": 1.9697, + "step": 5894 + }, + { + "epoch": 0.6123402929261452, + "grad_norm": 0.4072946012020111, + "learning_rate": 3.2717685474727495e-05, + "loss": 1.7102, + "step": 5895 + }, + { + "epoch": 0.6124441674457256, + "grad_norm": 0.41395875811576843, + "learning_rate": 3.270237549772319e-05, + "loss": 1.6602, + "step": 5896 + }, + { + "epoch": 0.6125480419653059, + "grad_norm": 0.40566155314445496, + "learning_rate": 3.2687067362780997e-05, + "loss": 1.6355, + "step": 5897 + }, + { + "epoch": 0.6126519164848863, + "grad_norm": 0.40978339314460754, + "learning_rate": 3.267176107153107e-05, + "loss": 1.5367, + "step": 5898 + }, + { + "epoch": 0.6127557910044666, + "grad_norm": 0.39154794812202454, + "learning_rate": 3.2656456625603425e-05, + "loss": 1.7275, + "step": 5899 + }, + { + "epoch": 0.6128596655240469, + "grad_norm": 0.4286908209323883, + "learning_rate": 3.2641154026627866e-05, + "loss": 1.7106, + "step": 5900 + }, + { + "epoch": 0.6129635400436273, + "grad_norm": 0.39364275336265564, + "learning_rate": 3.2625853276234e-05, + "loss": 1.6203, + "step": 5901 + }, + { + "epoch": 0.6130674145632077, + "grad_norm": 0.43890032172203064, + "learning_rate": 3.2610554376051224e-05, + "loss": 1.6936, + "step": 5902 + }, + { + "epoch": 0.613171289082788, + "grad_norm": 0.4209052622318268, + "learning_rate": 3.2595257327708785e-05, + "loss": 1.9501, + "step": 5903 + }, + { + "epoch": 0.6132751636023683, + "grad_norm": 0.4081374704837799, + "learning_rate": 3.257996213283566e-05, + "loss": 1.5555, + "step": 5904 + }, + { + "epoch": 0.6133790381219487, + "grad_norm": 0.3956921696662903, + "learning_rate": 3.256466879306068e-05, + "loss": 1.5927, + "step": 5905 + }, + { + "epoch": 0.613482912641529, + "grad_norm": 0.4056268334388733, + "learning_rate": 3.25493773100125e-05, + "loss": 1.6676, + "step": 5906 + }, + { + "epoch": 0.6135867871611094, + "grad_norm": 0.4204394221305847, + "learning_rate": 3.2534087685319506e-05, + "loss": 1.5935, + "step": 5907 + }, + { + "epoch": 0.6136906616806898, + "grad_norm": 0.4562225043773651, + "learning_rate": 3.2518799920609924e-05, + "loss": 1.7057, + "step": 5908 + }, + { + "epoch": 0.61379453620027, + "grad_norm": 0.4130711257457733, + "learning_rate": 3.2503514017511806e-05, + "loss": 1.6022, + "step": 5909 + }, + { + "epoch": 0.6138984107198504, + "grad_norm": 0.4055189788341522, + "learning_rate": 3.2488229977652954e-05, + "loss": 1.7709, + "step": 5910 + }, + { + "epoch": 0.6140022852394308, + "grad_norm": 0.406704306602478, + "learning_rate": 3.2472947802661005e-05, + "loss": 1.702, + "step": 5911 + }, + { + "epoch": 0.6141061597590111, + "grad_norm": 0.41046005487442017, + "learning_rate": 3.245766749416342e-05, + "loss": 1.6426, + "step": 5912 + }, + { + "epoch": 0.6142100342785914, + "grad_norm": 0.40672361850738525, + "learning_rate": 3.24423890537874e-05, + "loss": 1.6559, + "step": 5913 + }, + { + "epoch": 0.6143139087981718, + "grad_norm": 0.3978021442890167, + "learning_rate": 3.242711248316e-05, + "loss": 1.7074, + "step": 5914 + }, + { + "epoch": 0.6144177833177522, + "grad_norm": 0.43113160133361816, + "learning_rate": 3.241183778390803e-05, + "loss": 1.8111, + "step": 5915 + }, + { + "epoch": 0.6145216578373325, + "grad_norm": 0.3847864270210266, + "learning_rate": 3.239656495765817e-05, + "loss": 1.5673, + "step": 5916 + }, + { + "epoch": 0.6146255323569129, + "grad_norm": 0.406105101108551, + "learning_rate": 3.23812940060368e-05, + "loss": 1.6545, + "step": 5917 + }, + { + "epoch": 0.6147294068764932, + "grad_norm": 0.4283885657787323, + "learning_rate": 3.2366024930670194e-05, + "loss": 1.6233, + "step": 5918 + }, + { + "epoch": 0.6148332813960735, + "grad_norm": 0.39741960167884827, + "learning_rate": 3.23507577331844e-05, + "loss": 1.7184, + "step": 5919 + }, + { + "epoch": 0.6149371559156539, + "grad_norm": 0.4278057813644409, + "learning_rate": 3.2335492415205224e-05, + "loss": 1.6877, + "step": 5920 + }, + { + "epoch": 0.6150410304352343, + "grad_norm": 0.3887667655944824, + "learning_rate": 3.2320228978358324e-05, + "loss": 1.6024, + "step": 5921 + }, + { + "epoch": 0.6151449049548146, + "grad_norm": 0.4222593903541565, + "learning_rate": 3.230496742426914e-05, + "loss": 1.5523, + "step": 5922 + }, + { + "epoch": 0.6152487794743949, + "grad_norm": 0.4194217026233673, + "learning_rate": 3.228970775456288e-05, + "loss": 1.6793, + "step": 5923 + }, + { + "epoch": 0.6153526539939753, + "grad_norm": 0.41720110177993774, + "learning_rate": 3.2274449970864595e-05, + "loss": 1.6962, + "step": 5924 + }, + { + "epoch": 0.6154565285135556, + "grad_norm": 0.43170827627182007, + "learning_rate": 3.225919407479914e-05, + "loss": 1.7432, + "step": 5925 + }, + { + "epoch": 0.615560403033136, + "grad_norm": 0.41001245379447937, + "learning_rate": 3.224394006799112e-05, + "loss": 1.6198, + "step": 5926 + }, + { + "epoch": 0.6156642775527164, + "grad_norm": 0.39082086086273193, + "learning_rate": 3.222868795206498e-05, + "loss": 1.5267, + "step": 5927 + }, + { + "epoch": 0.6157681520722966, + "grad_norm": 0.4277300536632538, + "learning_rate": 3.221343772864497e-05, + "loss": 1.8421, + "step": 5928 + }, + { + "epoch": 0.615872026591877, + "grad_norm": 0.4021863639354706, + "learning_rate": 3.2198189399355085e-05, + "loss": 1.701, + "step": 5929 + }, + { + "epoch": 0.6159759011114574, + "grad_norm": 0.39132073521614075, + "learning_rate": 3.218294296581916e-05, + "loss": 1.5238, + "step": 5930 + }, + { + "epoch": 0.6160797756310377, + "grad_norm": 0.44258078932762146, + "learning_rate": 3.2167698429660866e-05, + "loss": 1.7779, + "step": 5931 + }, + { + "epoch": 0.616183650150618, + "grad_norm": 0.4172841012477875, + "learning_rate": 3.215245579250355e-05, + "loss": 1.6755, + "step": 5932 + }, + { + "epoch": 0.6162875246701984, + "grad_norm": 0.46141964197158813, + "learning_rate": 3.2137215055970496e-05, + "loss": 1.8098, + "step": 5933 + }, + { + "epoch": 0.6163913991897787, + "grad_norm": 0.39755067229270935, + "learning_rate": 3.2121976221684714e-05, + "loss": 1.5704, + "step": 5934 + }, + { + "epoch": 0.6164952737093591, + "grad_norm": 0.44859442114830017, + "learning_rate": 3.2106739291269005e-05, + "loss": 1.7943, + "step": 5935 + }, + { + "epoch": 0.6165991482289395, + "grad_norm": 0.4175950288772583, + "learning_rate": 3.209150426634597e-05, + "loss": 1.7347, + "step": 5936 + }, + { + "epoch": 0.6167030227485197, + "grad_norm": 0.42605000734329224, + "learning_rate": 3.2076271148538066e-05, + "loss": 1.6469, + "step": 5937 + }, + { + "epoch": 0.6168068972681001, + "grad_norm": 0.42592230439186096, + "learning_rate": 3.2061039939467454e-05, + "loss": 1.7029, + "step": 5938 + }, + { + "epoch": 0.6169107717876805, + "grad_norm": 0.4934481978416443, + "learning_rate": 3.204581064075616e-05, + "loss": 2.0553, + "step": 5939 + }, + { + "epoch": 0.6170146463072609, + "grad_norm": 0.4315035939216614, + "learning_rate": 3.203058325402599e-05, + "loss": 1.7076, + "step": 5940 + }, + { + "epoch": 0.6171185208268412, + "grad_norm": 0.4047625958919525, + "learning_rate": 3.201535778089853e-05, + "loss": 1.5993, + "step": 5941 + }, + { + "epoch": 0.6172223953464215, + "grad_norm": 0.4070419371128082, + "learning_rate": 3.20001342229952e-05, + "loss": 1.3709, + "step": 5942 + }, + { + "epoch": 0.6173262698660019, + "grad_norm": 0.4009752869606018, + "learning_rate": 3.1984912581937146e-05, + "loss": 1.6183, + "step": 5943 + }, + { + "epoch": 0.6174301443855822, + "grad_norm": 0.40829452872276306, + "learning_rate": 3.196969285934541e-05, + "loss": 1.5723, + "step": 5944 + }, + { + "epoch": 0.6175340189051626, + "grad_norm": 0.43150803446769714, + "learning_rate": 3.1954475056840714e-05, + "loss": 1.593, + "step": 5945 + }, + { + "epoch": 0.617637893424743, + "grad_norm": 0.4301930367946625, + "learning_rate": 3.193925917604367e-05, + "loss": 1.7796, + "step": 5946 + }, + { + "epoch": 0.6177417679443232, + "grad_norm": 0.4352579414844513, + "learning_rate": 3.1924045218574654e-05, + "loss": 1.8134, + "step": 5947 + }, + { + "epoch": 0.6178456424639036, + "grad_norm": 0.3833911716938019, + "learning_rate": 3.190883318605382e-05, + "loss": 1.5344, + "step": 5948 + }, + { + "epoch": 0.617949516983484, + "grad_norm": 0.4389069676399231, + "learning_rate": 3.189362308010113e-05, + "loss": 1.6706, + "step": 5949 + }, + { + "epoch": 0.6180533915030643, + "grad_norm": 0.3941787779331207, + "learning_rate": 3.187841490233636e-05, + "loss": 1.5388, + "step": 5950 + }, + { + "epoch": 0.6181572660226446, + "grad_norm": 0.44791027903556824, + "learning_rate": 3.1863208654379036e-05, + "loss": 1.7468, + "step": 5951 + }, + { + "epoch": 0.618261140542225, + "grad_norm": 0.41120895743370056, + "learning_rate": 3.184800433784851e-05, + "loss": 1.5909, + "step": 5952 + }, + { + "epoch": 0.6183650150618053, + "grad_norm": 0.41934871673583984, + "learning_rate": 3.1832801954363955e-05, + "loss": 1.8014, + "step": 5953 + }, + { + "epoch": 0.6184688895813857, + "grad_norm": 0.38895368576049805, + "learning_rate": 3.1817601505544264e-05, + "loss": 1.6452, + "step": 5954 + }, + { + "epoch": 0.6185727641009661, + "grad_norm": 0.38875988125801086, + "learning_rate": 3.18024029930082e-05, + "loss": 1.6638, + "step": 5955 + }, + { + "epoch": 0.6186766386205463, + "grad_norm": 0.4126511216163635, + "learning_rate": 3.178720641837426e-05, + "loss": 1.6237, + "step": 5956 + }, + { + "epoch": 0.6187805131401267, + "grad_norm": 0.4127541184425354, + "learning_rate": 3.177201178326077e-05, + "loss": 1.6954, + "step": 5957 + }, + { + "epoch": 0.6188843876597071, + "grad_norm": 0.4332854449748993, + "learning_rate": 3.175681908928584e-05, + "loss": 1.7631, + "step": 5958 + }, + { + "epoch": 0.6189882621792874, + "grad_norm": 0.401528537273407, + "learning_rate": 3.174162833806739e-05, + "loss": 1.6, + "step": 5959 + }, + { + "epoch": 0.6190921366988678, + "grad_norm": 0.4269302785396576, + "learning_rate": 3.172643953122308e-05, + "loss": 1.7953, + "step": 5960 + }, + { + "epoch": 0.6191960112184481, + "grad_norm": 0.4833371341228485, + "learning_rate": 3.171125267037042e-05, + "loss": 1.779, + "step": 5961 + }, + { + "epoch": 0.6192998857380284, + "grad_norm": 0.4078036844730377, + "learning_rate": 3.169606775712672e-05, + "loss": 1.7738, + "step": 5962 + }, + { + "epoch": 0.6194037602576088, + "grad_norm": 0.3845091164112091, + "learning_rate": 3.1680884793109014e-05, + "loss": 1.488, + "step": 5963 + }, + { + "epoch": 0.6195076347771892, + "grad_norm": 0.4441048800945282, + "learning_rate": 3.166570377993417e-05, + "loss": 1.6613, + "step": 5964 + }, + { + "epoch": 0.6196115092967694, + "grad_norm": 0.4266035556793213, + "learning_rate": 3.165052471921888e-05, + "loss": 1.6224, + "step": 5965 + }, + { + "epoch": 0.6197153838163498, + "grad_norm": 0.4394325315952301, + "learning_rate": 3.163534761257956e-05, + "loss": 1.6947, + "step": 5966 + }, + { + "epoch": 0.6198192583359302, + "grad_norm": 0.4144458472728729, + "learning_rate": 3.162017246163247e-05, + "loss": 1.7144, + "step": 5967 + }, + { + "epoch": 0.6199231328555106, + "grad_norm": 0.39400461316108704, + "learning_rate": 3.160499926799365e-05, + "loss": 1.6563, + "step": 5968 + }, + { + "epoch": 0.6200270073750909, + "grad_norm": 0.40855416655540466, + "learning_rate": 3.1589828033278925e-05, + "loss": 1.7237, + "step": 5969 + }, + { + "epoch": 0.6201308818946712, + "grad_norm": 0.39257362484931946, + "learning_rate": 3.157465875910391e-05, + "loss": 1.6751, + "step": 5970 + }, + { + "epoch": 0.6202347564142516, + "grad_norm": 0.3722376525402069, + "learning_rate": 3.1559491447084e-05, + "loss": 1.547, + "step": 5971 + }, + { + "epoch": 0.6203386309338319, + "grad_norm": 0.38416385650634766, + "learning_rate": 3.1544326098834433e-05, + "loss": 1.6595, + "step": 5972 + }, + { + "epoch": 0.6204425054534123, + "grad_norm": 0.42782196402549744, + "learning_rate": 3.152916271597015e-05, + "loss": 1.755, + "step": 5973 + }, + { + "epoch": 0.6205463799729927, + "grad_norm": 0.3950657248497009, + "learning_rate": 3.151400130010597e-05, + "loss": 1.6521, + "step": 5974 + }, + { + "epoch": 0.6206502544925729, + "grad_norm": 0.39494791626930237, + "learning_rate": 3.149884185285647e-05, + "loss": 1.6084, + "step": 5975 + }, + { + "epoch": 0.6207541290121533, + "grad_norm": 0.4145883321762085, + "learning_rate": 3.148368437583599e-05, + "loss": 1.5866, + "step": 5976 + }, + { + "epoch": 0.6208580035317337, + "grad_norm": 0.5039044618606567, + "learning_rate": 3.146852887065868e-05, + "loss": 1.8135, + "step": 5977 + }, + { + "epoch": 0.620961878051314, + "grad_norm": 0.42017415165901184, + "learning_rate": 3.145337533893851e-05, + "loss": 1.6958, + "step": 5978 + }, + { + "epoch": 0.6210657525708944, + "grad_norm": 0.41060763597488403, + "learning_rate": 3.143822378228918e-05, + "loss": 1.5397, + "step": 5979 + }, + { + "epoch": 0.6211696270904747, + "grad_norm": 0.42383599281311035, + "learning_rate": 3.142307420232423e-05, + "loss": 1.7797, + "step": 5980 + }, + { + "epoch": 0.621273501610055, + "grad_norm": 0.446959912776947, + "learning_rate": 3.1407926600657e-05, + "loss": 1.8221, + "step": 5981 + }, + { + "epoch": 0.6213773761296354, + "grad_norm": 0.403238445520401, + "learning_rate": 3.1392780978900535e-05, + "loss": 1.6687, + "step": 5982 + }, + { + "epoch": 0.6214812506492158, + "grad_norm": 0.4446134567260742, + "learning_rate": 3.1377637338667764e-05, + "loss": 1.719, + "step": 5983 + }, + { + "epoch": 0.621585125168796, + "grad_norm": 0.4344415068626404, + "learning_rate": 3.136249568157136e-05, + "loss": 1.6722, + "step": 5984 + }, + { + "epoch": 0.6216889996883764, + "grad_norm": 0.38726866245269775, + "learning_rate": 3.1347356009223786e-05, + "loss": 1.6432, + "step": 5985 + }, + { + "epoch": 0.6217928742079568, + "grad_norm": 0.43266814947128296, + "learning_rate": 3.133221832323728e-05, + "loss": 1.747, + "step": 5986 + }, + { + "epoch": 0.6218967487275371, + "grad_norm": 0.39049196243286133, + "learning_rate": 3.131708262522395e-05, + "loss": 1.6543, + "step": 5987 + }, + { + "epoch": 0.6220006232471175, + "grad_norm": 0.40162894129753113, + "learning_rate": 3.1301948916795554e-05, + "loss": 1.5967, + "step": 5988 + }, + { + "epoch": 0.6221044977666979, + "grad_norm": 0.4087611734867096, + "learning_rate": 3.128681719956375e-05, + "loss": 1.647, + "step": 5989 + }, + { + "epoch": 0.6222083722862781, + "grad_norm": 0.4542302191257477, + "learning_rate": 3.1271687475139966e-05, + "loss": 1.7831, + "step": 5990 + }, + { + "epoch": 0.6223122468058585, + "grad_norm": 0.40611404180526733, + "learning_rate": 3.125655974513537e-05, + "loss": 1.6214, + "step": 5991 + }, + { + "epoch": 0.6224161213254389, + "grad_norm": 0.4047275483608246, + "learning_rate": 3.124143401116094e-05, + "loss": 1.7274, + "step": 5992 + }, + { + "epoch": 0.6225199958450193, + "grad_norm": 0.4009588062763214, + "learning_rate": 3.1226310274827473e-05, + "loss": 1.7371, + "step": 5993 + }, + { + "epoch": 0.6226238703645995, + "grad_norm": 0.398447185754776, + "learning_rate": 3.121118853774554e-05, + "loss": 1.5706, + "step": 5994 + }, + { + "epoch": 0.6227277448841799, + "grad_norm": 0.46983155608177185, + "learning_rate": 3.119606880152545e-05, + "loss": 1.7618, + "step": 5995 + }, + { + "epoch": 0.6228316194037603, + "grad_norm": 0.42344680428504944, + "learning_rate": 3.1180951067777365e-05, + "loss": 1.6264, + "step": 5996 + }, + { + "epoch": 0.6229354939233406, + "grad_norm": 0.4075701832771301, + "learning_rate": 3.11658353381112e-05, + "loss": 1.7759, + "step": 5997 + }, + { + "epoch": 0.623039368442921, + "grad_norm": 0.40149906277656555, + "learning_rate": 3.1150721614136655e-05, + "loss": 1.7063, + "step": 5998 + }, + { + "epoch": 0.6231432429625013, + "grad_norm": 0.3998561203479767, + "learning_rate": 3.113560989746322e-05, + "loss": 1.4695, + "step": 5999 + }, + { + "epoch": 0.6232471174820816, + "grad_norm": 0.4251110553741455, + "learning_rate": 3.11205001897002e-05, + "loss": 1.7207, + "step": 6000 + }, + { + "epoch": 0.623350992001662, + "grad_norm": 0.41832244396209717, + "learning_rate": 3.110539249245663e-05, + "loss": 1.4959, + "step": 6001 + }, + { + "epoch": 0.6234548665212424, + "grad_norm": 0.3928918242454529, + "learning_rate": 3.1090286807341374e-05, + "loss": 1.7322, + "step": 6002 + }, + { + "epoch": 0.6235587410408227, + "grad_norm": 0.4026903212070465, + "learning_rate": 3.107518313596309e-05, + "loss": 1.8031, + "step": 6003 + }, + { + "epoch": 0.623662615560403, + "grad_norm": 0.4026026129722595, + "learning_rate": 3.106008147993017e-05, + "loss": 1.5662, + "step": 6004 + }, + { + "epoch": 0.6237664900799834, + "grad_norm": 0.4900519549846649, + "learning_rate": 3.1044981840850826e-05, + "loss": 1.7387, + "step": 6005 + }, + { + "epoch": 0.6238703645995637, + "grad_norm": 0.4159107804298401, + "learning_rate": 3.102988422033309e-05, + "loss": 1.6261, + "step": 6006 + }, + { + "epoch": 0.6239742391191441, + "grad_norm": 0.46456992626190186, + "learning_rate": 3.101478861998469e-05, + "loss": 1.8022, + "step": 6007 + }, + { + "epoch": 0.6240781136387245, + "grad_norm": 0.4258232116699219, + "learning_rate": 3.09996950414132e-05, + "loss": 1.7152, + "step": 6008 + }, + { + "epoch": 0.6241819881583047, + "grad_norm": 0.4385705888271332, + "learning_rate": 3.098460348622601e-05, + "loss": 1.7273, + "step": 6009 + }, + { + "epoch": 0.6242858626778851, + "grad_norm": 0.3911799192428589, + "learning_rate": 3.09695139560302e-05, + "loss": 1.525, + "step": 6010 + }, + { + "epoch": 0.6243897371974655, + "grad_norm": 0.45774349570274353, + "learning_rate": 3.095442645243272e-05, + "loss": 1.8, + "step": 6011 + }, + { + "epoch": 0.6244936117170458, + "grad_norm": 0.3881503939628601, + "learning_rate": 3.0939340977040267e-05, + "loss": 1.7008, + "step": 6012 + }, + { + "epoch": 0.6245974862366261, + "grad_norm": 0.41384172439575195, + "learning_rate": 3.092425753145931e-05, + "loss": 1.5997, + "step": 6013 + }, + { + "epoch": 0.6247013607562065, + "grad_norm": 0.4150152802467346, + "learning_rate": 3.0909176117296126e-05, + "loss": 1.6308, + "step": 6014 + }, + { + "epoch": 0.6248052352757868, + "grad_norm": 0.4042709767818451, + "learning_rate": 3.08940967361568e-05, + "loss": 1.6706, + "step": 6015 + }, + { + "epoch": 0.6249091097953672, + "grad_norm": 0.4141731560230255, + "learning_rate": 3.08790193896471e-05, + "loss": 1.6422, + "step": 6016 + }, + { + "epoch": 0.6250129843149476, + "grad_norm": 0.40485629439353943, + "learning_rate": 3.0863944079372694e-05, + "loss": 1.6133, + "step": 6017 + }, + { + "epoch": 0.6251168588345279, + "grad_norm": 0.4418627917766571, + "learning_rate": 3.0848870806938995e-05, + "loss": 1.741, + "step": 6018 + }, + { + "epoch": 0.6252207333541082, + "grad_norm": 0.40159109234809875, + "learning_rate": 3.083379957395117e-05, + "loss": 1.618, + "step": 6019 + }, + { + "epoch": 0.6253246078736886, + "grad_norm": 0.4042385220527649, + "learning_rate": 3.081873038201418e-05, + "loss": 1.7609, + "step": 6020 + }, + { + "epoch": 0.625428482393269, + "grad_norm": 0.4162798523902893, + "learning_rate": 3.0803663232732775e-05, + "loss": 1.7222, + "step": 6021 + }, + { + "epoch": 0.6255323569128493, + "grad_norm": 0.3897026777267456, + "learning_rate": 3.078859812771152e-05, + "loss": 1.5955, + "step": 6022 + }, + { + "epoch": 0.6256362314324296, + "grad_norm": 0.4212419092655182, + "learning_rate": 3.0773535068554695e-05, + "loss": 1.6507, + "step": 6023 + }, + { + "epoch": 0.62574010595201, + "grad_norm": 0.4059465229511261, + "learning_rate": 3.075847405686641e-05, + "loss": 1.6731, + "step": 6024 + }, + { + "epoch": 0.6258439804715903, + "grad_norm": 0.4359121024608612, + "learning_rate": 3.074341509425056e-05, + "loss": 1.5718, + "step": 6025 + }, + { + "epoch": 0.6259478549911707, + "grad_norm": 0.4287969470024109, + "learning_rate": 3.07283581823108e-05, + "loss": 1.6525, + "step": 6026 + }, + { + "epoch": 0.626051729510751, + "grad_norm": 0.4397551715373993, + "learning_rate": 3.071330332265054e-05, + "loss": 1.6681, + "step": 6027 + }, + { + "epoch": 0.6261556040303313, + "grad_norm": 0.4228843152523041, + "learning_rate": 3.069825051687307e-05, + "loss": 1.68, + "step": 6028 + }, + { + "epoch": 0.6262594785499117, + "grad_norm": 0.4212433993816376, + "learning_rate": 3.068319976658134e-05, + "loss": 1.7587, + "step": 6029 + }, + { + "epoch": 0.6263633530694921, + "grad_norm": 0.41605517268180847, + "learning_rate": 3.066815107337815e-05, + "loss": 1.6564, + "step": 6030 + }, + { + "epoch": 0.6264672275890724, + "grad_norm": 0.4286707639694214, + "learning_rate": 3.0653104438866104e-05, + "loss": 1.7046, + "step": 6031 + }, + { + "epoch": 0.6265711021086527, + "grad_norm": 0.4247523248195648, + "learning_rate": 3.063805986464751e-05, + "loss": 1.7082, + "step": 6032 + }, + { + "epoch": 0.6266749766282331, + "grad_norm": 0.4271533489227295, + "learning_rate": 3.06230173523245e-05, + "loss": 1.7456, + "step": 6033 + }, + { + "epoch": 0.6267788511478134, + "grad_norm": 0.43358635902404785, + "learning_rate": 3.060797690349902e-05, + "loss": 1.5457, + "step": 6034 + }, + { + "epoch": 0.6268827256673938, + "grad_norm": 0.3621548116207123, + "learning_rate": 3.0592938519772715e-05, + "loss": 1.3377, + "step": 6035 + }, + { + "epoch": 0.6269866001869742, + "grad_norm": 0.4215681850910187, + "learning_rate": 3.057790220274707e-05, + "loss": 1.5863, + "step": 6036 + }, + { + "epoch": 0.6270904747065544, + "grad_norm": 0.425849586725235, + "learning_rate": 3.056286795402337e-05, + "loss": 1.6182, + "step": 6037 + }, + { + "epoch": 0.6271943492261348, + "grad_norm": 0.4131641089916229, + "learning_rate": 3.0547835775202597e-05, + "loss": 1.6674, + "step": 6038 + }, + { + "epoch": 0.6272982237457152, + "grad_norm": 0.41090524196624756, + "learning_rate": 3.053280566788559e-05, + "loss": 1.5321, + "step": 6039 + }, + { + "epoch": 0.6274020982652955, + "grad_norm": 0.443823903799057, + "learning_rate": 3.051777763367293e-05, + "loss": 1.5838, + "step": 6040 + }, + { + "epoch": 0.6275059727848759, + "grad_norm": 0.41161811351776123, + "learning_rate": 3.0502751674164976e-05, + "loss": 1.6502, + "step": 6041 + }, + { + "epoch": 0.6276098473044562, + "grad_norm": 0.4168041944503784, + "learning_rate": 3.0487727790961884e-05, + "loss": 1.6534, + "step": 6042 + }, + { + "epoch": 0.6277137218240366, + "grad_norm": 0.43634623289108276, + "learning_rate": 3.0472705985663575e-05, + "loss": 1.6755, + "step": 6043 + }, + { + "epoch": 0.6278175963436169, + "grad_norm": 0.45974037051200867, + "learning_rate": 3.045768625986979e-05, + "loss": 1.8234, + "step": 6044 + }, + { + "epoch": 0.6279214708631973, + "grad_norm": 0.4236984848976135, + "learning_rate": 3.0442668615179953e-05, + "loss": 1.794, + "step": 6045 + }, + { + "epoch": 0.6280253453827777, + "grad_norm": 0.3920854330062866, + "learning_rate": 3.0427653053193366e-05, + "loss": 1.6738, + "step": 6046 + }, + { + "epoch": 0.6281292199023579, + "grad_norm": 0.44960126280784607, + "learning_rate": 3.0412639575509073e-05, + "loss": 1.6268, + "step": 6047 + }, + { + "epoch": 0.6282330944219383, + "grad_norm": 0.4008250832557678, + "learning_rate": 3.0397628183725856e-05, + "loss": 1.4388, + "step": 6048 + }, + { + "epoch": 0.6283369689415187, + "grad_norm": 0.39700058102607727, + "learning_rate": 3.038261887944233e-05, + "loss": 1.613, + "step": 6049 + }, + { + "epoch": 0.628440843461099, + "grad_norm": 0.46325239539146423, + "learning_rate": 3.0367611664256895e-05, + "loss": 1.8034, + "step": 6050 + }, + { + "epoch": 0.6285447179806793, + "grad_norm": 0.434199720621109, + "learning_rate": 3.0352606539767657e-05, + "loss": 1.669, + "step": 6051 + }, + { + "epoch": 0.6286485925002597, + "grad_norm": 0.3849222958087921, + "learning_rate": 3.033760350757258e-05, + "loss": 1.5992, + "step": 6052 + }, + { + "epoch": 0.62875246701984, + "grad_norm": 0.4169051945209503, + "learning_rate": 3.0322602569269355e-05, + "loss": 1.5102, + "step": 6053 + }, + { + "epoch": 0.6288563415394204, + "grad_norm": 0.42412588000297546, + "learning_rate": 3.030760372645547e-05, + "loss": 1.5752, + "step": 6054 + }, + { + "epoch": 0.6289602160590008, + "grad_norm": 0.4000975787639618, + "learning_rate": 3.0292606980728165e-05, + "loss": 1.715, + "step": 6055 + }, + { + "epoch": 0.629064090578581, + "grad_norm": 0.3950633704662323, + "learning_rate": 3.0277612333684526e-05, + "loss": 1.6543, + "step": 6056 + }, + { + "epoch": 0.6291679650981614, + "grad_norm": 0.4239039421081543, + "learning_rate": 3.02626197869213e-05, + "loss": 1.7277, + "step": 6057 + }, + { + "epoch": 0.6292718396177418, + "grad_norm": 0.44476318359375, + "learning_rate": 3.024762934203511e-05, + "loss": 1.8181, + "step": 6058 + }, + { + "epoch": 0.6293757141373221, + "grad_norm": 0.3775947093963623, + "learning_rate": 3.023264100062234e-05, + "loss": 1.5467, + "step": 6059 + }, + { + "epoch": 0.6294795886569025, + "grad_norm": 0.4439935088157654, + "learning_rate": 3.0217654764279114e-05, + "loss": 1.9913, + "step": 6060 + }, + { + "epoch": 0.6295834631764828, + "grad_norm": 0.4108879566192627, + "learning_rate": 3.0202670634601325e-05, + "loss": 1.6593, + "step": 6061 + }, + { + "epoch": 0.6296873376960631, + "grad_norm": 0.4360349476337433, + "learning_rate": 3.0187688613184717e-05, + "loss": 1.4867, + "step": 6062 + }, + { + "epoch": 0.6297912122156435, + "grad_norm": 0.4250946044921875, + "learning_rate": 3.0172708701624704e-05, + "loss": 1.6254, + "step": 6063 + }, + { + "epoch": 0.6298950867352239, + "grad_norm": 0.44005441665649414, + "learning_rate": 3.0157730901516556e-05, + "loss": 1.7055, + "step": 6064 + }, + { + "epoch": 0.6299989612548041, + "grad_norm": 0.40063902735710144, + "learning_rate": 3.014275521445531e-05, + "loss": 1.4714, + "step": 6065 + }, + { + "epoch": 0.6301028357743845, + "grad_norm": 0.3904891610145569, + "learning_rate": 3.0127781642035725e-05, + "loss": 1.5278, + "step": 6066 + }, + { + "epoch": 0.6302067102939649, + "grad_norm": 0.36918964982032776, + "learning_rate": 3.0112810185852393e-05, + "loss": 1.4645, + "step": 6067 + }, + { + "epoch": 0.6303105848135452, + "grad_norm": 0.43034636974334717, + "learning_rate": 3.009784084749966e-05, + "loss": 1.6989, + "step": 6068 + }, + { + "epoch": 0.6304144593331256, + "grad_norm": 0.43391627073287964, + "learning_rate": 3.0082873628571616e-05, + "loss": 1.7468, + "step": 6069 + }, + { + "epoch": 0.630518333852706, + "grad_norm": 0.47783637046813965, + "learning_rate": 3.006790853066217e-05, + "loss": 1.7116, + "step": 6070 + }, + { + "epoch": 0.6306222083722863, + "grad_norm": 0.46756497025489807, + "learning_rate": 3.0052945555364987e-05, + "loss": 1.8164, + "step": 6071 + }, + { + "epoch": 0.6307260828918666, + "grad_norm": 0.4332909882068634, + "learning_rate": 3.0037984704273536e-05, + "loss": 1.7011, + "step": 6072 + }, + { + "epoch": 0.630829957411447, + "grad_norm": 0.46313124895095825, + "learning_rate": 3.0023025978980977e-05, + "loss": 1.8097, + "step": 6073 + }, + { + "epoch": 0.6309338319310274, + "grad_norm": 0.4317631125450134, + "learning_rate": 3.000806938108034e-05, + "loss": 1.7535, + "step": 6074 + }, + { + "epoch": 0.6310377064506076, + "grad_norm": 0.39492112398147583, + "learning_rate": 2.9993114912164378e-05, + "loss": 1.6294, + "step": 6075 + }, + { + "epoch": 0.631141580970188, + "grad_norm": 0.3940942883491516, + "learning_rate": 2.9978162573825597e-05, + "loss": 1.7187, + "step": 6076 + }, + { + "epoch": 0.6312454554897684, + "grad_norm": 0.4159606695175171, + "learning_rate": 2.996321236765633e-05, + "loss": 1.7397, + "step": 6077 + }, + { + "epoch": 0.6313493300093487, + "grad_norm": 0.38286933302879333, + "learning_rate": 2.994826429524867e-05, + "loss": 1.5252, + "step": 6078 + }, + { + "epoch": 0.6314532045289291, + "grad_norm": 0.424221396446228, + "learning_rate": 2.9933318358194427e-05, + "loss": 1.4488, + "step": 6079 + }, + { + "epoch": 0.6315570790485094, + "grad_norm": 0.4659724831581116, + "learning_rate": 2.9918374558085272e-05, + "loss": 1.705, + "step": 6080 + }, + { + "epoch": 0.6316609535680897, + "grad_norm": 0.42719754576683044, + "learning_rate": 2.9903432896512583e-05, + "loss": 1.5949, + "step": 6081 + }, + { + "epoch": 0.6317648280876701, + "grad_norm": 0.4064619243144989, + "learning_rate": 2.988849337506753e-05, + "loss": 1.5759, + "step": 6082 + }, + { + "epoch": 0.6318687026072505, + "grad_norm": 0.4120051860809326, + "learning_rate": 2.9873555995341045e-05, + "loss": 1.6129, + "step": 6083 + }, + { + "epoch": 0.6319725771268307, + "grad_norm": 0.41194257140159607, + "learning_rate": 2.9858620758923872e-05, + "loss": 1.6637, + "step": 6084 + }, + { + "epoch": 0.6320764516464111, + "grad_norm": 0.4339029788970947, + "learning_rate": 2.9843687667406466e-05, + "loss": 1.7608, + "step": 6085 + }, + { + "epoch": 0.6321803261659915, + "grad_norm": 0.4774200916290283, + "learning_rate": 2.982875672237909e-05, + "loss": 1.5836, + "step": 6086 + }, + { + "epoch": 0.6322842006855718, + "grad_norm": 0.40862688422203064, + "learning_rate": 2.9813827925431813e-05, + "loss": 1.6473, + "step": 6087 + }, + { + "epoch": 0.6323880752051522, + "grad_norm": 0.42833518981933594, + "learning_rate": 2.9798901278154384e-05, + "loss": 1.6633, + "step": 6088 + }, + { + "epoch": 0.6324919497247325, + "grad_norm": 0.42553073167800903, + "learning_rate": 2.9783976782136385e-05, + "loss": 1.6624, + "step": 6089 + }, + { + "epoch": 0.6325958242443128, + "grad_norm": 0.3767523169517517, + "learning_rate": 2.976905443896719e-05, + "loss": 1.516, + "step": 6090 + }, + { + "epoch": 0.6326996987638932, + "grad_norm": 0.44192999601364136, + "learning_rate": 2.975413425023587e-05, + "loss": 1.8359, + "step": 6091 + }, + { + "epoch": 0.6328035732834736, + "grad_norm": 0.4179973304271698, + "learning_rate": 2.973921621753133e-05, + "loss": 1.5251, + "step": 6092 + }, + { + "epoch": 0.6329074478030539, + "grad_norm": 0.40900513529777527, + "learning_rate": 2.972430034244224e-05, + "loss": 1.7986, + "step": 6093 + }, + { + "epoch": 0.6330113223226342, + "grad_norm": 0.42563167214393616, + "learning_rate": 2.970938662655699e-05, + "loss": 1.6558, + "step": 6094 + }, + { + "epoch": 0.6331151968422146, + "grad_norm": 0.4261634349822998, + "learning_rate": 2.9694475071463797e-05, + "loss": 1.6296, + "step": 6095 + }, + { + "epoch": 0.633219071361795, + "grad_norm": 0.41434627771377563, + "learning_rate": 2.9679565678750608e-05, + "loss": 1.6559, + "step": 6096 + }, + { + "epoch": 0.6333229458813753, + "grad_norm": 0.4356957972049713, + "learning_rate": 2.9664658450005195e-05, + "loss": 1.6724, + "step": 6097 + }, + { + "epoch": 0.6334268204009557, + "grad_norm": 0.4227178692817688, + "learning_rate": 2.9649753386815017e-05, + "loss": 1.7473, + "step": 6098 + }, + { + "epoch": 0.633530694920536, + "grad_norm": 0.45652738213539124, + "learning_rate": 2.963485049076736e-05, + "loss": 1.7247, + "step": 6099 + }, + { + "epoch": 0.6336345694401163, + "grad_norm": 0.4394940435886383, + "learning_rate": 2.961994976344929e-05, + "loss": 1.7161, + "step": 6100 + }, + { + "epoch": 0.6337384439596967, + "grad_norm": 0.4043090343475342, + "learning_rate": 2.9605051206447582e-05, + "loss": 1.6737, + "step": 6101 + }, + { + "epoch": 0.6338423184792771, + "grad_norm": 0.4564562141895294, + "learning_rate": 2.959015482134885e-05, + "loss": 1.8074, + "step": 6102 + }, + { + "epoch": 0.6339461929988573, + "grad_norm": 0.4548121392726898, + "learning_rate": 2.9575260609739423e-05, + "loss": 1.7882, + "step": 6103 + }, + { + "epoch": 0.6340500675184377, + "grad_norm": 0.41030994057655334, + "learning_rate": 2.956036857320541e-05, + "loss": 1.6736, + "step": 6104 + }, + { + "epoch": 0.6341539420380181, + "grad_norm": 0.4318486154079437, + "learning_rate": 2.9545478713332707e-05, + "loss": 1.7257, + "step": 6105 + }, + { + "epoch": 0.6342578165575984, + "grad_norm": 0.4724993109703064, + "learning_rate": 2.9530591031706988e-05, + "loss": 1.6384, + "step": 6106 + }, + { + "epoch": 0.6343616910771788, + "grad_norm": 0.40737417340278625, + "learning_rate": 2.9515705529913644e-05, + "loss": 1.7593, + "step": 6107 + }, + { + "epoch": 0.6344655655967592, + "grad_norm": 0.4164734482765198, + "learning_rate": 2.9500822209537883e-05, + "loss": 1.7383, + "step": 6108 + }, + { + "epoch": 0.6345694401163394, + "grad_norm": 0.40095841884613037, + "learning_rate": 2.9485941072164656e-05, + "loss": 1.6363, + "step": 6109 + }, + { + "epoch": 0.6346733146359198, + "grad_norm": 0.4260212481021881, + "learning_rate": 2.9471062119378688e-05, + "loss": 1.7249, + "step": 6110 + }, + { + "epoch": 0.6347771891555002, + "grad_norm": 0.4054844081401825, + "learning_rate": 2.9456185352764465e-05, + "loss": 1.4506, + "step": 6111 + }, + { + "epoch": 0.6348810636750805, + "grad_norm": 0.39739441871643066, + "learning_rate": 2.944131077390627e-05, + "loss": 1.6051, + "step": 6112 + }, + { + "epoch": 0.6349849381946608, + "grad_norm": 0.4330110549926758, + "learning_rate": 2.94264383843881e-05, + "loss": 1.7506, + "step": 6113 + }, + { + "epoch": 0.6350888127142412, + "grad_norm": 0.4423455595970154, + "learning_rate": 2.9411568185793757e-05, + "loss": 1.7382, + "step": 6114 + }, + { + "epoch": 0.6351926872338215, + "grad_norm": 0.41518527269363403, + "learning_rate": 2.939670017970683e-05, + "loss": 1.4555, + "step": 6115 + }, + { + "epoch": 0.6352965617534019, + "grad_norm": 0.40972813963890076, + "learning_rate": 2.9381834367710616e-05, + "loss": 1.7941, + "step": 6116 + }, + { + "epoch": 0.6354004362729823, + "grad_norm": 0.43250906467437744, + "learning_rate": 2.9366970751388196e-05, + "loss": 1.7317, + "step": 6117 + }, + { + "epoch": 0.6355043107925625, + "grad_norm": 0.4202265441417694, + "learning_rate": 2.9352109332322468e-05, + "loss": 1.652, + "step": 6118 + }, + { + "epoch": 0.6356081853121429, + "grad_norm": 0.40753960609436035, + "learning_rate": 2.9337250112096016e-05, + "loss": 1.6144, + "step": 6119 + }, + { + "epoch": 0.6357120598317233, + "grad_norm": 0.39751505851745605, + "learning_rate": 2.9322393092291256e-05, + "loss": 1.7013, + "step": 6120 + }, + { + "epoch": 0.6358159343513037, + "grad_norm": 0.4368493854999542, + "learning_rate": 2.9307538274490348e-05, + "loss": 1.7454, + "step": 6121 + }, + { + "epoch": 0.635919808870884, + "grad_norm": 0.40719810128211975, + "learning_rate": 2.9292685660275204e-05, + "loss": 1.7023, + "step": 6122 + }, + { + "epoch": 0.6360236833904643, + "grad_norm": 0.4638088047504425, + "learning_rate": 2.9277835251227514e-05, + "loss": 1.7887, + "step": 6123 + }, + { + "epoch": 0.6361275579100447, + "grad_norm": 0.454287588596344, + "learning_rate": 2.9262987048928725e-05, + "loss": 1.8606, + "step": 6124 + }, + { + "epoch": 0.636231432429625, + "grad_norm": 0.3964051902294159, + "learning_rate": 2.9248141054960076e-05, + "loss": 1.5617, + "step": 6125 + }, + { + "epoch": 0.6363353069492054, + "grad_norm": 0.39127108454704285, + "learning_rate": 2.9233297270902514e-05, + "loss": 1.5637, + "step": 6126 + }, + { + "epoch": 0.6364391814687858, + "grad_norm": 0.4058896005153656, + "learning_rate": 2.921845569833681e-05, + "loss": 1.6716, + "step": 6127 + }, + { + "epoch": 0.636543055988366, + "grad_norm": 0.41195571422576904, + "learning_rate": 2.9203616338843486e-05, + "loss": 1.5734, + "step": 6128 + }, + { + "epoch": 0.6366469305079464, + "grad_norm": 0.45315390825271606, + "learning_rate": 2.9188779194002803e-05, + "loss": 1.6421, + "step": 6129 + }, + { + "epoch": 0.6367508050275268, + "grad_norm": 0.4393003284931183, + "learning_rate": 2.917394426539477e-05, + "loss": 1.777, + "step": 6130 + }, + { + "epoch": 0.6368546795471071, + "grad_norm": 0.4733230769634247, + "learning_rate": 2.915911155459927e-05, + "loss": 1.7351, + "step": 6131 + }, + { + "epoch": 0.6369585540666874, + "grad_norm": 0.44397008419036865, + "learning_rate": 2.914428106319579e-05, + "loss": 1.7189, + "step": 6132 + }, + { + "epoch": 0.6370624285862678, + "grad_norm": 0.4575912654399872, + "learning_rate": 2.912945279276371e-05, + "loss": 2.0055, + "step": 6133 + }, + { + "epoch": 0.6371663031058481, + "grad_norm": 0.43883270025253296, + "learning_rate": 2.9114626744882128e-05, + "loss": 1.8263, + "step": 6134 + }, + { + "epoch": 0.6372701776254285, + "grad_norm": 0.4244493842124939, + "learning_rate": 2.909980292112985e-05, + "loss": 1.7515, + "step": 6135 + }, + { + "epoch": 0.6373740521450089, + "grad_norm": 0.45086610317230225, + "learning_rate": 2.9084981323085547e-05, + "loss": 1.7123, + "step": 6136 + }, + { + "epoch": 0.6374779266645891, + "grad_norm": 0.43869614601135254, + "learning_rate": 2.90701619523276e-05, + "loss": 1.6792, + "step": 6137 + }, + { + "epoch": 0.6375818011841695, + "grad_norm": 0.43083521723747253, + "learning_rate": 2.9055344810434137e-05, + "loss": 1.6872, + "step": 6138 + }, + { + "epoch": 0.6376856757037499, + "grad_norm": 0.5012997984886169, + "learning_rate": 2.904052989898308e-05, + "loss": 1.908, + "step": 6139 + }, + { + "epoch": 0.6377895502233302, + "grad_norm": 0.4534330368041992, + "learning_rate": 2.90257172195521e-05, + "loss": 1.6729, + "step": 6140 + }, + { + "epoch": 0.6378934247429106, + "grad_norm": 0.43945181369781494, + "learning_rate": 2.9010906773718627e-05, + "loss": 1.6487, + "step": 6141 + }, + { + "epoch": 0.6379972992624909, + "grad_norm": 0.40094152092933655, + "learning_rate": 2.899609856305985e-05, + "loss": 1.508, + "step": 6142 + }, + { + "epoch": 0.6381011737820712, + "grad_norm": 0.4558815658092499, + "learning_rate": 2.8981292589152768e-05, + "loss": 1.8111, + "step": 6143 + }, + { + "epoch": 0.6382050483016516, + "grad_norm": 0.43613293766975403, + "learning_rate": 2.8966488853574036e-05, + "loss": 1.6826, + "step": 6144 + }, + { + "epoch": 0.638308922821232, + "grad_norm": 0.43343204259872437, + "learning_rate": 2.8951687357900203e-05, + "loss": 1.7693, + "step": 6145 + }, + { + "epoch": 0.6384127973408122, + "grad_norm": 0.45726385712623596, + "learning_rate": 2.8936888103707473e-05, + "loss": 1.883, + "step": 6146 + }, + { + "epoch": 0.6385166718603926, + "grad_norm": 0.3765776455402374, + "learning_rate": 2.8922091092571862e-05, + "loss": 1.5675, + "step": 6147 + }, + { + "epoch": 0.638620546379973, + "grad_norm": 0.4046104848384857, + "learning_rate": 2.8907296326069137e-05, + "loss": 1.7282, + "step": 6148 + }, + { + "epoch": 0.6387244208995534, + "grad_norm": 0.40840303897857666, + "learning_rate": 2.8892503805774806e-05, + "loss": 1.6623, + "step": 6149 + }, + { + "epoch": 0.6388282954191337, + "grad_norm": 0.4642704427242279, + "learning_rate": 2.8877713533264217e-05, + "loss": 1.6275, + "step": 6150 + }, + { + "epoch": 0.638932169938714, + "grad_norm": 0.41992318630218506, + "learning_rate": 2.8862925510112327e-05, + "loss": 1.6549, + "step": 6151 + }, + { + "epoch": 0.6390360444582944, + "grad_norm": 0.4473896324634552, + "learning_rate": 2.884813973789402e-05, + "loss": 1.7372, + "step": 6152 + }, + { + "epoch": 0.6391399189778747, + "grad_norm": 0.39671966433525085, + "learning_rate": 2.8833356218183843e-05, + "loss": 1.5238, + "step": 6153 + }, + { + "epoch": 0.6392437934974551, + "grad_norm": 0.4355907440185547, + "learning_rate": 2.881857495255612e-05, + "loss": 1.8191, + "step": 6154 + }, + { + "epoch": 0.6393476680170355, + "grad_norm": 0.4201485812664032, + "learning_rate": 2.8803795942584945e-05, + "loss": 1.4469, + "step": 6155 + }, + { + "epoch": 0.6394515425366157, + "grad_norm": 0.40149399638175964, + "learning_rate": 2.8789019189844163e-05, + "loss": 1.5771, + "step": 6156 + }, + { + "epoch": 0.6395554170561961, + "grad_norm": 0.40967410802841187, + "learning_rate": 2.877424469590739e-05, + "loss": 1.6283, + "step": 6157 + }, + { + "epoch": 0.6396592915757765, + "grad_norm": 0.40948042273521423, + "learning_rate": 2.8759472462347965e-05, + "loss": 1.6039, + "step": 6158 + }, + { + "epoch": 0.6397631660953568, + "grad_norm": 0.4414249658584595, + "learning_rate": 2.8744702490739077e-05, + "loss": 1.6585, + "step": 6159 + }, + { + "epoch": 0.6398670406149372, + "grad_norm": 0.39458298683166504, + "learning_rate": 2.8729934782653545e-05, + "loss": 1.6049, + "step": 6160 + }, + { + "epoch": 0.6399709151345175, + "grad_norm": 0.4163827896118164, + "learning_rate": 2.8715169339664062e-05, + "loss": 1.4499, + "step": 6161 + }, + { + "epoch": 0.6400747896540978, + "grad_norm": 0.4390014708042145, + "learning_rate": 2.870040616334303e-05, + "loss": 1.7916, + "step": 6162 + }, + { + "epoch": 0.6401786641736782, + "grad_norm": 0.38799330592155457, + "learning_rate": 2.8685645255262562e-05, + "loss": 1.6107, + "step": 6163 + }, + { + "epoch": 0.6402825386932586, + "grad_norm": 0.4088546633720398, + "learning_rate": 2.867088661699464e-05, + "loss": 1.7303, + "step": 6164 + }, + { + "epoch": 0.6403864132128388, + "grad_norm": 0.39084142446517944, + "learning_rate": 2.8656130250110913e-05, + "loss": 1.5763, + "step": 6165 + }, + { + "epoch": 0.6404902877324192, + "grad_norm": 0.4360373318195343, + "learning_rate": 2.864137615618283e-05, + "loss": 1.6337, + "step": 6166 + }, + { + "epoch": 0.6405941622519996, + "grad_norm": 0.39488184452056885, + "learning_rate": 2.862662433678158e-05, + "loss": 1.5719, + "step": 6167 + }, + { + "epoch": 0.6406980367715799, + "grad_norm": 0.402828186750412, + "learning_rate": 2.861187479347813e-05, + "loss": 1.4238, + "step": 6168 + }, + { + "epoch": 0.6408019112911603, + "grad_norm": 0.48173683881759644, + "learning_rate": 2.8597127527843172e-05, + "loss": 1.7344, + "step": 6169 + }, + { + "epoch": 0.6409057858107406, + "grad_norm": 0.4502374529838562, + "learning_rate": 2.858238254144716e-05, + "loss": 1.8152, + "step": 6170 + }, + { + "epoch": 0.6410096603303209, + "grad_norm": 0.41789987683296204, + "learning_rate": 2.8567639835860394e-05, + "loss": 1.612, + "step": 6171 + }, + { + "epoch": 0.6411135348499013, + "grad_norm": 0.4217549264431, + "learning_rate": 2.8552899412652757e-05, + "loss": 1.7688, + "step": 6172 + }, + { + "epoch": 0.6412174093694817, + "grad_norm": 0.41339951753616333, + "learning_rate": 2.853816127339406e-05, + "loss": 1.779, + "step": 6173 + }, + { + "epoch": 0.6413212838890621, + "grad_norm": 0.4295401871204376, + "learning_rate": 2.8523425419653784e-05, + "loss": 1.723, + "step": 6174 + }, + { + "epoch": 0.6414251584086423, + "grad_norm": 0.44703397154808044, + "learning_rate": 2.8508691853001174e-05, + "loss": 1.74, + "step": 6175 + }, + { + "epoch": 0.6415290329282227, + "grad_norm": 0.4364173710346222, + "learning_rate": 2.8493960575005242e-05, + "loss": 1.8088, + "step": 6176 + }, + { + "epoch": 0.6416329074478031, + "grad_norm": 0.41096925735473633, + "learning_rate": 2.8479231587234734e-05, + "loss": 1.6215, + "step": 6177 + }, + { + "epoch": 0.6417367819673834, + "grad_norm": 0.42458489537239075, + "learning_rate": 2.8464504891258237e-05, + "loss": 1.6694, + "step": 6178 + }, + { + "epoch": 0.6418406564869638, + "grad_norm": 0.4292949438095093, + "learning_rate": 2.8449780488643944e-05, + "loss": 1.8489, + "step": 6179 + }, + { + "epoch": 0.6419445310065441, + "grad_norm": 0.4265950918197632, + "learning_rate": 2.8435058380959956e-05, + "loss": 1.691, + "step": 6180 + }, + { + "epoch": 0.6420484055261244, + "grad_norm": 0.44009000062942505, + "learning_rate": 2.842033856977403e-05, + "loss": 1.7029, + "step": 6181 + }, + { + "epoch": 0.6421522800457048, + "grad_norm": 0.46439558267593384, + "learning_rate": 2.840562105665372e-05, + "loss": 1.7289, + "step": 6182 + }, + { + "epoch": 0.6422561545652852, + "grad_norm": 0.4735531210899353, + "learning_rate": 2.8390905843166326e-05, + "loss": 1.8544, + "step": 6183 + }, + { + "epoch": 0.6423600290848654, + "grad_norm": 0.4040323793888092, + "learning_rate": 2.8376192930878898e-05, + "loss": 1.6042, + "step": 6184 + }, + { + "epoch": 0.6424639036044458, + "grad_norm": 0.399845689535141, + "learning_rate": 2.8361482321358256e-05, + "loss": 1.7571, + "step": 6185 + }, + { + "epoch": 0.6425677781240262, + "grad_norm": 0.4226529002189636, + "learning_rate": 2.834677401617094e-05, + "loss": 1.7529, + "step": 6186 + }, + { + "epoch": 0.6426716526436065, + "grad_norm": 0.45261451601982117, + "learning_rate": 2.833206801688333e-05, + "loss": 1.7456, + "step": 6187 + }, + { + "epoch": 0.6427755271631869, + "grad_norm": 0.4298214912414551, + "learning_rate": 2.8317364325061412e-05, + "loss": 1.5975, + "step": 6188 + }, + { + "epoch": 0.6428794016827672, + "grad_norm": 0.3926003575325012, + "learning_rate": 2.8302662942271098e-05, + "loss": 1.6589, + "step": 6189 + }, + { + "epoch": 0.6429832762023475, + "grad_norm": 0.43051791191101074, + "learning_rate": 2.828796387007794e-05, + "loss": 1.7522, + "step": 6190 + }, + { + "epoch": 0.6430871507219279, + "grad_norm": 0.3769807815551758, + "learning_rate": 2.8273267110047234e-05, + "loss": 1.4194, + "step": 6191 + }, + { + "epoch": 0.6431910252415083, + "grad_norm": 0.4164545238018036, + "learning_rate": 2.8258572663744132e-05, + "loss": 1.6241, + "step": 6192 + }, + { + "epoch": 0.6432948997610886, + "grad_norm": 0.41669386625289917, + "learning_rate": 2.824388053273344e-05, + "loss": 1.6012, + "step": 6193 + }, + { + "epoch": 0.6433987742806689, + "grad_norm": 0.3818405568599701, + "learning_rate": 2.8229190718579777e-05, + "loss": 1.5466, + "step": 6194 + }, + { + "epoch": 0.6435026488002493, + "grad_norm": 0.3918877840042114, + "learning_rate": 2.8214503222847477e-05, + "loss": 1.6249, + "step": 6195 + }, + { + "epoch": 0.6436065233198296, + "grad_norm": 0.45735591650009155, + "learning_rate": 2.8199818047100656e-05, + "loss": 1.8056, + "step": 6196 + }, + { + "epoch": 0.64371039783941, + "grad_norm": 0.45919516682624817, + "learning_rate": 2.8185135192903163e-05, + "loss": 1.9505, + "step": 6197 + }, + { + "epoch": 0.6438142723589904, + "grad_norm": 0.38972949981689453, + "learning_rate": 2.8170454661818583e-05, + "loss": 1.4729, + "step": 6198 + }, + { + "epoch": 0.6439181468785707, + "grad_norm": 0.479105681180954, + "learning_rate": 2.815577645541033e-05, + "loss": 1.8712, + "step": 6199 + }, + { + "epoch": 0.644022021398151, + "grad_norm": 0.38174498081207275, + "learning_rate": 2.8141100575241498e-05, + "loss": 1.5689, + "step": 6200 + }, + { + "epoch": 0.6441258959177314, + "grad_norm": 0.40346476435661316, + "learning_rate": 2.8126427022874948e-05, + "loss": 1.7155, + "step": 6201 + }, + { + "epoch": 0.6442297704373118, + "grad_norm": 0.5447202324867249, + "learning_rate": 2.811175579987329e-05, + "loss": 1.9476, + "step": 6202 + }, + { + "epoch": 0.644333644956892, + "grad_norm": 0.4525381326675415, + "learning_rate": 2.809708690779891e-05, + "loss": 1.6353, + "step": 6203 + }, + { + "epoch": 0.6444375194764724, + "grad_norm": 0.4123556911945343, + "learning_rate": 2.8082420348213922e-05, + "loss": 1.5309, + "step": 6204 + }, + { + "epoch": 0.6445413939960528, + "grad_norm": 0.40919628739356995, + "learning_rate": 2.806775612268019e-05, + "loss": 1.5944, + "step": 6205 + }, + { + "epoch": 0.6446452685156331, + "grad_norm": 0.4005577266216278, + "learning_rate": 2.8053094232759393e-05, + "loss": 1.5079, + "step": 6206 + }, + { + "epoch": 0.6447491430352135, + "grad_norm": 0.41373926401138306, + "learning_rate": 2.8038434680012814e-05, + "loss": 1.7176, + "step": 6207 + }, + { + "epoch": 0.6448530175547939, + "grad_norm": 0.4339597821235657, + "learning_rate": 2.8023777466001666e-05, + "loss": 1.7649, + "step": 6208 + }, + { + "epoch": 0.6449568920743741, + "grad_norm": 0.40516406297683716, + "learning_rate": 2.8009122592286795e-05, + "loss": 1.6787, + "step": 6209 + }, + { + "epoch": 0.6450607665939545, + "grad_norm": 0.435835599899292, + "learning_rate": 2.7994470060428834e-05, + "loss": 1.626, + "step": 6210 + }, + { + "epoch": 0.6451646411135349, + "grad_norm": 0.44071394205093384, + "learning_rate": 2.7979819871988166e-05, + "loss": 1.6221, + "step": 6211 + }, + { + "epoch": 0.6452685156331152, + "grad_norm": 0.40906620025634766, + "learning_rate": 2.796517202852491e-05, + "loss": 1.6874, + "step": 6212 + }, + { + "epoch": 0.6453723901526955, + "grad_norm": 0.4259955585002899, + "learning_rate": 2.795052653159896e-05, + "loss": 1.6113, + "step": 6213 + }, + { + "epoch": 0.6454762646722759, + "grad_norm": 0.41644105315208435, + "learning_rate": 2.7935883382769918e-05, + "loss": 1.5794, + "step": 6214 + }, + { + "epoch": 0.6455801391918562, + "grad_norm": 0.42759883403778076, + "learning_rate": 2.7921242583597228e-05, + "loss": 1.716, + "step": 6215 + }, + { + "epoch": 0.6456840137114366, + "grad_norm": 0.4132600426673889, + "learning_rate": 2.7906604135639945e-05, + "loss": 1.6549, + "step": 6216 + }, + { + "epoch": 0.645787888231017, + "grad_norm": 0.42022904753685, + "learning_rate": 2.7891968040457007e-05, + "loss": 1.7255, + "step": 6217 + }, + { + "epoch": 0.6458917627505972, + "grad_norm": 0.5371679663658142, + "learning_rate": 2.7877334299607037e-05, + "loss": 1.993, + "step": 6218 + }, + { + "epoch": 0.6459956372701776, + "grad_norm": 0.4094901978969574, + "learning_rate": 2.7862702914648364e-05, + "loss": 1.6315, + "step": 6219 + }, + { + "epoch": 0.646099511789758, + "grad_norm": 0.3900490999221802, + "learning_rate": 2.784807388713918e-05, + "loss": 1.521, + "step": 6220 + }, + { + "epoch": 0.6462033863093383, + "grad_norm": 0.4084405303001404, + "learning_rate": 2.7833447218637332e-05, + "loss": 1.6055, + "step": 6221 + }, + { + "epoch": 0.6463072608289187, + "grad_norm": 0.38741302490234375, + "learning_rate": 2.7818822910700442e-05, + "loss": 1.6364, + "step": 6222 + }, + { + "epoch": 0.646411135348499, + "grad_norm": 0.443694531917572, + "learning_rate": 2.7804200964885897e-05, + "loss": 1.6882, + "step": 6223 + }, + { + "epoch": 0.6465150098680794, + "grad_norm": 0.47792142629623413, + "learning_rate": 2.7789581382750794e-05, + "loss": 1.6535, + "step": 6224 + }, + { + "epoch": 0.6466188843876597, + "grad_norm": 0.40224647521972656, + "learning_rate": 2.777496416585207e-05, + "loss": 1.5526, + "step": 6225 + }, + { + "epoch": 0.6467227589072401, + "grad_norm": 0.4233601987361908, + "learning_rate": 2.776034931574626e-05, + "loss": 1.6387, + "step": 6226 + }, + { + "epoch": 0.6468266334268205, + "grad_norm": 0.4132138788700104, + "learning_rate": 2.7745736833989788e-05, + "loss": 1.7418, + "step": 6227 + }, + { + "epoch": 0.6469305079464007, + "grad_norm": 0.4106374979019165, + "learning_rate": 2.7731126722138756e-05, + "loss": 1.7758, + "step": 6228 + }, + { + "epoch": 0.6470343824659811, + "grad_norm": 0.40899068117141724, + "learning_rate": 2.7716518981749022e-05, + "loss": 1.7876, + "step": 6229 + }, + { + "epoch": 0.6471382569855615, + "grad_norm": 0.4388611614704132, + "learning_rate": 2.77019136143762e-05, + "loss": 1.5814, + "step": 6230 + }, + { + "epoch": 0.6472421315051418, + "grad_norm": 0.4218350946903229, + "learning_rate": 2.7687310621575646e-05, + "loss": 1.6095, + "step": 6231 + }, + { + "epoch": 0.6473460060247221, + "grad_norm": 0.43606144189834595, + "learning_rate": 2.767271000490247e-05, + "loss": 1.5995, + "step": 6232 + }, + { + "epoch": 0.6474498805443025, + "grad_norm": 0.4526714086532593, + "learning_rate": 2.7658111765911486e-05, + "loss": 1.8424, + "step": 6233 + }, + { + "epoch": 0.6475537550638828, + "grad_norm": 0.43624788522720337, + "learning_rate": 2.764351590615738e-05, + "loss": 1.544, + "step": 6234 + }, + { + "epoch": 0.6476576295834632, + "grad_norm": 0.409190833568573, + "learning_rate": 2.762892242719439e-05, + "loss": 1.6898, + "step": 6235 + }, + { + "epoch": 0.6477615041030436, + "grad_norm": 0.40407395362854004, + "learning_rate": 2.7614331330576694e-05, + "loss": 1.4979, + "step": 6236 + }, + { + "epoch": 0.6478653786226238, + "grad_norm": 0.4525948464870453, + "learning_rate": 2.7599742617858088e-05, + "loss": 1.8237, + "step": 6237 + }, + { + "epoch": 0.6479692531422042, + "grad_norm": 0.44162291288375854, + "learning_rate": 2.7585156290592164e-05, + "loss": 1.482, + "step": 6238 + }, + { + "epoch": 0.6480731276617846, + "grad_norm": 0.44789817929267883, + "learning_rate": 2.7570572350332258e-05, + "loss": 1.7348, + "step": 6239 + }, + { + "epoch": 0.6481770021813649, + "grad_norm": 0.42044568061828613, + "learning_rate": 2.7555990798631436e-05, + "loss": 1.6357, + "step": 6240 + }, + { + "epoch": 0.6482808767009453, + "grad_norm": 0.4480580985546112, + "learning_rate": 2.7541411637042526e-05, + "loss": 1.8034, + "step": 6241 + }, + { + "epoch": 0.6483847512205256, + "grad_norm": 0.4233933389186859, + "learning_rate": 2.752683486711807e-05, + "loss": 1.6212, + "step": 6242 + }, + { + "epoch": 0.6484886257401059, + "grad_norm": 0.38887619972229004, + "learning_rate": 2.7512260490410448e-05, + "loss": 1.5956, + "step": 6243 + }, + { + "epoch": 0.6485925002596863, + "grad_norm": 0.3876674771308899, + "learning_rate": 2.7497688508471625e-05, + "loss": 1.5114, + "step": 6244 + }, + { + "epoch": 0.6486963747792667, + "grad_norm": 0.4232475459575653, + "learning_rate": 2.748311892285348e-05, + "loss": 1.4739, + "step": 6245 + }, + { + "epoch": 0.6488002492988469, + "grad_norm": 0.4148775041103363, + "learning_rate": 2.746855173510754e-05, + "loss": 1.5095, + "step": 6246 + }, + { + "epoch": 0.6489041238184273, + "grad_norm": 0.4203701913356781, + "learning_rate": 2.745398694678506e-05, + "loss": 1.8714, + "step": 6247 + }, + { + "epoch": 0.6490079983380077, + "grad_norm": 0.424444317817688, + "learning_rate": 2.7439424559437122e-05, + "loss": 1.7226, + "step": 6248 + }, + { + "epoch": 0.649111872857588, + "grad_norm": 0.3754339814186096, + "learning_rate": 2.7424864574614484e-05, + "loss": 1.5442, + "step": 6249 + }, + { + "epoch": 0.6492157473771684, + "grad_norm": 0.506229043006897, + "learning_rate": 2.7410306993867675e-05, + "loss": 1.713, + "step": 6250 + }, + { + "epoch": 0.6493196218967487, + "grad_norm": 0.43399345874786377, + "learning_rate": 2.7395751818746972e-05, + "loss": 1.6123, + "step": 6251 + }, + { + "epoch": 0.6494234964163291, + "grad_norm": 0.40855225920677185, + "learning_rate": 2.738119905080235e-05, + "loss": 1.642, + "step": 6252 + }, + { + "epoch": 0.6495273709359094, + "grad_norm": 0.3835819959640503, + "learning_rate": 2.7366648691583646e-05, + "loss": 1.4608, + "step": 6253 + }, + { + "epoch": 0.6496312454554898, + "grad_norm": 0.41114479303359985, + "learning_rate": 2.735210074264027e-05, + "loss": 1.5948, + "step": 6254 + }, + { + "epoch": 0.6497351199750702, + "grad_norm": 0.45714348554611206, + "learning_rate": 2.7337555205521514e-05, + "loss": 1.5191, + "step": 6255 + }, + { + "epoch": 0.6498389944946504, + "grad_norm": 0.4060697853565216, + "learning_rate": 2.7323012081776368e-05, + "loss": 1.5952, + "step": 6256 + }, + { + "epoch": 0.6499428690142308, + "grad_norm": 0.4719262421131134, + "learning_rate": 2.7308471372953538e-05, + "loss": 1.644, + "step": 6257 + }, + { + "epoch": 0.6500467435338112, + "grad_norm": 0.40513306856155396, + "learning_rate": 2.7293933080601507e-05, + "loss": 1.7363, + "step": 6258 + }, + { + "epoch": 0.6501506180533915, + "grad_norm": 0.4476475417613983, + "learning_rate": 2.727939720626849e-05, + "loss": 1.7536, + "step": 6259 + }, + { + "epoch": 0.6502544925729719, + "grad_norm": 0.44684842228889465, + "learning_rate": 2.7264863751502446e-05, + "loss": 1.8819, + "step": 6260 + }, + { + "epoch": 0.6503583670925522, + "grad_norm": 0.40098902583122253, + "learning_rate": 2.725033271785105e-05, + "loss": 1.6615, + "step": 6261 + }, + { + "epoch": 0.6504622416121325, + "grad_norm": 0.4347749948501587, + "learning_rate": 2.7235804106861807e-05, + "loss": 1.5312, + "step": 6262 + }, + { + "epoch": 0.6505661161317129, + "grad_norm": 0.4464839994907379, + "learning_rate": 2.7221277920081818e-05, + "loss": 1.8898, + "step": 6263 + }, + { + "epoch": 0.6506699906512933, + "grad_norm": 0.4316708743572235, + "learning_rate": 2.720675415905807e-05, + "loss": 1.779, + "step": 6264 + }, + { + "epoch": 0.6507738651708735, + "grad_norm": 0.42350926995277405, + "learning_rate": 2.7192232825337206e-05, + "loss": 1.6765, + "step": 6265 + }, + { + "epoch": 0.6508777396904539, + "grad_norm": 0.39864397048950195, + "learning_rate": 2.7177713920465643e-05, + "loss": 1.631, + "step": 6266 + }, + { + "epoch": 0.6509816142100343, + "grad_norm": 0.40547481179237366, + "learning_rate": 2.7163197445989518e-05, + "loss": 1.5956, + "step": 6267 + }, + { + "epoch": 0.6510854887296146, + "grad_norm": 0.4294564127922058, + "learning_rate": 2.7148683403454746e-05, + "loss": 1.5979, + "step": 6268 + }, + { + "epoch": 0.651189363249195, + "grad_norm": 0.41150328516960144, + "learning_rate": 2.7134171794406932e-05, + "loss": 1.546, + "step": 6269 + }, + { + "epoch": 0.6512932377687753, + "grad_norm": 0.4225398004055023, + "learning_rate": 2.711966262039145e-05, + "loss": 1.7254, + "step": 6270 + }, + { + "epoch": 0.6513971122883556, + "grad_norm": 0.42999348044395447, + "learning_rate": 2.710515588295346e-05, + "loss": 1.7585, + "step": 6271 + }, + { + "epoch": 0.651500986807936, + "grad_norm": 0.39667728543281555, + "learning_rate": 2.709065158363774e-05, + "loss": 1.5776, + "step": 6272 + }, + { + "epoch": 0.6516048613275164, + "grad_norm": 0.41493138670921326, + "learning_rate": 2.7076149723988954e-05, + "loss": 1.7011, + "step": 6273 + }, + { + "epoch": 0.6517087358470967, + "grad_norm": 0.45391151309013367, + "learning_rate": 2.7061650305551424e-05, + "loss": 1.7816, + "step": 6274 + }, + { + "epoch": 0.651812610366677, + "grad_norm": 0.4255356788635254, + "learning_rate": 2.7047153329869167e-05, + "loss": 1.8013, + "step": 6275 + }, + { + "epoch": 0.6519164848862574, + "grad_norm": 0.3899310827255249, + "learning_rate": 2.7032658798486072e-05, + "loss": 1.4257, + "step": 6276 + }, + { + "epoch": 0.6520203594058378, + "grad_norm": 0.4353606402873993, + "learning_rate": 2.7018166712945635e-05, + "loss": 1.732, + "step": 6277 + }, + { + "epoch": 0.6521242339254181, + "grad_norm": 0.4526216685771942, + "learning_rate": 2.700367707479122e-05, + "loss": 1.7406, + "step": 6278 + }, + { + "epoch": 0.6522281084449985, + "grad_norm": 0.44765564799308777, + "learning_rate": 2.6989189885565803e-05, + "loss": 1.7469, + "step": 6279 + }, + { + "epoch": 0.6523319829645788, + "grad_norm": 0.4167086184024811, + "learning_rate": 2.697470514681214e-05, + "loss": 1.6946, + "step": 6280 + }, + { + "epoch": 0.6524358574841591, + "grad_norm": 0.40468063950538635, + "learning_rate": 2.6960222860072826e-05, + "loss": 1.5709, + "step": 6281 + }, + { + "epoch": 0.6525397320037395, + "grad_norm": 0.411888062953949, + "learning_rate": 2.6945743026890012e-05, + "loss": 1.4393, + "step": 6282 + }, + { + "epoch": 0.6526436065233199, + "grad_norm": 0.41102951765060425, + "learning_rate": 2.693126564880576e-05, + "loss": 1.68, + "step": 6283 + }, + { + "epoch": 0.6527474810429001, + "grad_norm": 0.43322500586509705, + "learning_rate": 2.6916790727361773e-05, + "loss": 1.6189, + "step": 6284 + }, + { + "epoch": 0.6528513555624805, + "grad_norm": 0.47928208112716675, + "learning_rate": 2.6902318264099517e-05, + "loss": 1.8985, + "step": 6285 + }, + { + "epoch": 0.6529552300820609, + "grad_norm": 0.4368571639060974, + "learning_rate": 2.68878482605602e-05, + "loss": 1.7447, + "step": 6286 + }, + { + "epoch": 0.6530591046016412, + "grad_norm": 0.39063751697540283, + "learning_rate": 2.6873380718284757e-05, + "loss": 1.5522, + "step": 6287 + }, + { + "epoch": 0.6531629791212216, + "grad_norm": 0.4288500249385834, + "learning_rate": 2.6858915638813874e-05, + "loss": 1.8171, + "step": 6288 + }, + { + "epoch": 0.653266853640802, + "grad_norm": 0.42578479647636414, + "learning_rate": 2.6844453023687956e-05, + "loss": 1.5586, + "step": 6289 + }, + { + "epoch": 0.6533707281603822, + "grad_norm": 0.43467986583709717, + "learning_rate": 2.6829992874447196e-05, + "loss": 1.8163, + "step": 6290 + }, + { + "epoch": 0.6534746026799626, + "grad_norm": 0.39412596821784973, + "learning_rate": 2.6815535192631437e-05, + "loss": 1.4798, + "step": 6291 + }, + { + "epoch": 0.653578477199543, + "grad_norm": 0.40805160999298096, + "learning_rate": 2.6801079979780343e-05, + "loss": 1.7423, + "step": 6292 + }, + { + "epoch": 0.6536823517191233, + "grad_norm": 0.4715184271335602, + "learning_rate": 2.6786627237433275e-05, + "loss": 1.7352, + "step": 6293 + }, + { + "epoch": 0.6537862262387036, + "grad_norm": 0.469553142786026, + "learning_rate": 2.6772176967129336e-05, + "loss": 1.892, + "step": 6294 + }, + { + "epoch": 0.653890100758284, + "grad_norm": 0.45803409814834595, + "learning_rate": 2.6757729170407368e-05, + "loss": 1.7635, + "step": 6295 + }, + { + "epoch": 0.6539939752778643, + "grad_norm": 0.4040502607822418, + "learning_rate": 2.6743283848805945e-05, + "loss": 1.7023, + "step": 6296 + }, + { + "epoch": 0.6540978497974447, + "grad_norm": 0.4045460522174835, + "learning_rate": 2.672884100386338e-05, + "loss": 1.6528, + "step": 6297 + }, + { + "epoch": 0.6542017243170251, + "grad_norm": 0.44102489948272705, + "learning_rate": 2.6714400637117708e-05, + "loss": 1.7864, + "step": 6298 + }, + { + "epoch": 0.6543055988366053, + "grad_norm": 0.43990832567214966, + "learning_rate": 2.669996275010678e-05, + "loss": 1.7531, + "step": 6299 + }, + { + "epoch": 0.6544094733561857, + "grad_norm": 0.46511977910995483, + "learning_rate": 2.6685527344368022e-05, + "loss": 1.7993, + "step": 6300 + }, + { + "epoch": 0.6545133478757661, + "grad_norm": 0.40895703434944153, + "learning_rate": 2.667109442143877e-05, + "loss": 1.7012, + "step": 6301 + }, + { + "epoch": 0.6546172223953465, + "grad_norm": 0.41711392998695374, + "learning_rate": 2.665666398285599e-05, + "loss": 1.6973, + "step": 6302 + }, + { + "epoch": 0.6547210969149267, + "grad_norm": 0.44164448976516724, + "learning_rate": 2.6642236030156405e-05, + "loss": 1.6828, + "step": 6303 + }, + { + "epoch": 0.6548249714345071, + "grad_norm": 0.45560404658317566, + "learning_rate": 2.6627810564876488e-05, + "loss": 1.7191, + "step": 6304 + }, + { + "epoch": 0.6549288459540875, + "grad_norm": 0.40289151668548584, + "learning_rate": 2.6613387588552406e-05, + "loss": 1.6014, + "step": 6305 + }, + { + "epoch": 0.6550327204736678, + "grad_norm": 0.44911909103393555, + "learning_rate": 2.6598967102720174e-05, + "loss": 1.794, + "step": 6306 + }, + { + "epoch": 0.6551365949932482, + "grad_norm": 0.4513666331768036, + "learning_rate": 2.658454910891539e-05, + "loss": 1.697, + "step": 6307 + }, + { + "epoch": 0.6552404695128285, + "grad_norm": 0.4077112674713135, + "learning_rate": 2.657013360867346e-05, + "loss": 1.5579, + "step": 6308 + }, + { + "epoch": 0.6553443440324088, + "grad_norm": 0.4188466966152191, + "learning_rate": 2.6555720603529578e-05, + "loss": 1.61, + "step": 6309 + }, + { + "epoch": 0.6554482185519892, + "grad_norm": 0.38735076785087585, + "learning_rate": 2.6541310095018533e-05, + "loss": 1.4753, + "step": 6310 + }, + { + "epoch": 0.6555520930715696, + "grad_norm": 0.4412975609302521, + "learning_rate": 2.6526902084675008e-05, + "loss": 1.7289, + "step": 6311 + }, + { + "epoch": 0.6556559675911499, + "grad_norm": 0.3914285898208618, + "learning_rate": 2.6512496574033307e-05, + "loss": 1.5964, + "step": 6312 + }, + { + "epoch": 0.6557598421107302, + "grad_norm": 0.4345444142818451, + "learning_rate": 2.6498093564627515e-05, + "loss": 1.5983, + "step": 6313 + }, + { + "epoch": 0.6558637166303106, + "grad_norm": 0.4521616995334625, + "learning_rate": 2.6483693057991433e-05, + "loss": 1.719, + "step": 6314 + }, + { + "epoch": 0.6559675911498909, + "grad_norm": 0.465675413608551, + "learning_rate": 2.646929505565861e-05, + "loss": 1.7625, + "step": 6315 + }, + { + "epoch": 0.6560714656694713, + "grad_norm": 0.4356716573238373, + "learning_rate": 2.6454899559162316e-05, + "loss": 1.6035, + "step": 6316 + }, + { + "epoch": 0.6561753401890517, + "grad_norm": 0.3972066640853882, + "learning_rate": 2.644050657003554e-05, + "loss": 1.8204, + "step": 6317 + }, + { + "epoch": 0.6562792147086319, + "grad_norm": 0.40507668256759644, + "learning_rate": 2.642611608981108e-05, + "loss": 1.5586, + "step": 6318 + }, + { + "epoch": 0.6563830892282123, + "grad_norm": 0.4741031229496002, + "learning_rate": 2.641172812002134e-05, + "loss": 1.9185, + "step": 6319 + }, + { + "epoch": 0.6564869637477927, + "grad_norm": 0.4094397723674774, + "learning_rate": 2.6397342662198578e-05, + "loss": 1.7196, + "step": 6320 + }, + { + "epoch": 0.656590838267373, + "grad_norm": 0.43364858627319336, + "learning_rate": 2.638295971787471e-05, + "loss": 1.7506, + "step": 6321 + }, + { + "epoch": 0.6566947127869533, + "grad_norm": 0.43040645122528076, + "learning_rate": 2.6368579288581414e-05, + "loss": 1.5984, + "step": 6322 + }, + { + "epoch": 0.6567985873065337, + "grad_norm": 0.41142383217811584, + "learning_rate": 2.635420137585009e-05, + "loss": 1.5823, + "step": 6323 + }, + { + "epoch": 0.656902461826114, + "grad_norm": 0.4649263918399811, + "learning_rate": 2.6339825981211873e-05, + "loss": 1.8616, + "step": 6324 + }, + { + "epoch": 0.6570063363456944, + "grad_norm": 0.42940422892570496, + "learning_rate": 2.632545310619764e-05, + "loss": 1.4848, + "step": 6325 + }, + { + "epoch": 0.6571102108652748, + "grad_norm": 0.4017314016819, + "learning_rate": 2.6311082752337957e-05, + "loss": 1.4897, + "step": 6326 + }, + { + "epoch": 0.657214085384855, + "grad_norm": 0.4207054674625397, + "learning_rate": 2.6296714921163224e-05, + "loss": 1.5465, + "step": 6327 + }, + { + "epoch": 0.6573179599044354, + "grad_norm": 0.3997343182563782, + "learning_rate": 2.6282349614203415e-05, + "loss": 1.4289, + "step": 6328 + }, + { + "epoch": 0.6574218344240158, + "grad_norm": 0.43719449639320374, + "learning_rate": 2.626798683298839e-05, + "loss": 1.64, + "step": 6329 + }, + { + "epoch": 0.6575257089435962, + "grad_norm": 0.47477173805236816, + "learning_rate": 2.6253626579047654e-05, + "loss": 1.4736, + "step": 6330 + }, + { + "epoch": 0.6576295834631765, + "grad_norm": 0.39647629857063293, + "learning_rate": 2.6239268853910452e-05, + "loss": 1.5444, + "step": 6331 + }, + { + "epoch": 0.6577334579827568, + "grad_norm": 0.3856428563594818, + "learning_rate": 2.6224913659105775e-05, + "loss": 1.5782, + "step": 6332 + }, + { + "epoch": 0.6578373325023372, + "grad_norm": 0.41423386335372925, + "learning_rate": 2.621056099616233e-05, + "loss": 1.5784, + "step": 6333 + }, + { + "epoch": 0.6579412070219175, + "grad_norm": 0.38329392671585083, + "learning_rate": 2.6196210866608606e-05, + "loss": 1.6442, + "step": 6334 + }, + { + "epoch": 0.6580450815414979, + "grad_norm": 0.39722350239753723, + "learning_rate": 2.6181863271972735e-05, + "loss": 1.6947, + "step": 6335 + }, + { + "epoch": 0.6581489560610783, + "grad_norm": 0.4223669469356537, + "learning_rate": 2.616751821378261e-05, + "loss": 1.6055, + "step": 6336 + }, + { + "epoch": 0.6582528305806585, + "grad_norm": 0.39463213086128235, + "learning_rate": 2.6153175693565933e-05, + "loss": 1.5574, + "step": 6337 + }, + { + "epoch": 0.6583567051002389, + "grad_norm": 0.40961316227912903, + "learning_rate": 2.6138835712850003e-05, + "loss": 1.6478, + "step": 6338 + }, + { + "epoch": 0.6584605796198193, + "grad_norm": 0.39676934480667114, + "learning_rate": 2.6124498273161957e-05, + "loss": 1.6545, + "step": 6339 + }, + { + "epoch": 0.6585644541393996, + "grad_norm": 0.397554874420166, + "learning_rate": 2.6110163376028613e-05, + "loss": 1.5861, + "step": 6340 + }, + { + "epoch": 0.65866832865898, + "grad_norm": 0.40185415744781494, + "learning_rate": 2.6095831022976514e-05, + "loss": 1.6366, + "step": 6341 + }, + { + "epoch": 0.6587722031785603, + "grad_norm": 0.3959345519542694, + "learning_rate": 2.6081501215531955e-05, + "loss": 1.6307, + "step": 6342 + }, + { + "epoch": 0.6588760776981406, + "grad_norm": 0.41983965039253235, + "learning_rate": 2.606717395522095e-05, + "loss": 1.6515, + "step": 6343 + }, + { + "epoch": 0.658979952217721, + "grad_norm": 0.41705140471458435, + "learning_rate": 2.6052849243569233e-05, + "loss": 1.6791, + "step": 6344 + }, + { + "epoch": 0.6590838267373014, + "grad_norm": 0.5571948885917664, + "learning_rate": 2.603852708210226e-05, + "loss": 1.7317, + "step": 6345 + }, + { + "epoch": 0.6591877012568816, + "grad_norm": 0.39944010972976685, + "learning_rate": 2.6024207472345273e-05, + "loss": 1.6033, + "step": 6346 + }, + { + "epoch": 0.659291575776462, + "grad_norm": 0.39192959666252136, + "learning_rate": 2.6009890415823147e-05, + "loss": 1.6101, + "step": 6347 + }, + { + "epoch": 0.6593954502960424, + "grad_norm": 0.41190558671951294, + "learning_rate": 2.5995575914060576e-05, + "loss": 1.5903, + "step": 6348 + }, + { + "epoch": 0.6594993248156227, + "grad_norm": 0.39744171500205994, + "learning_rate": 2.5981263968581925e-05, + "loss": 1.663, + "step": 6349 + }, + { + "epoch": 0.6596031993352031, + "grad_norm": 0.4570941925048828, + "learning_rate": 2.5966954580911314e-05, + "loss": 1.5257, + "step": 6350 + }, + { + "epoch": 0.6597070738547834, + "grad_norm": 0.43954119086265564, + "learning_rate": 2.5952647752572566e-05, + "loss": 1.8657, + "step": 6351 + }, + { + "epoch": 0.6598109483743637, + "grad_norm": 0.4737524390220642, + "learning_rate": 2.5938343485089267e-05, + "loss": 1.9287, + "step": 6352 + }, + { + "epoch": 0.6599148228939441, + "grad_norm": 0.4555112421512604, + "learning_rate": 2.5924041779984692e-05, + "loss": 1.7726, + "step": 6353 + }, + { + "epoch": 0.6600186974135245, + "grad_norm": 0.4156988561153412, + "learning_rate": 2.5909742638781852e-05, + "loss": 1.6406, + "step": 6354 + }, + { + "epoch": 0.6601225719331049, + "grad_norm": 0.45365050435066223, + "learning_rate": 2.5895446063003526e-05, + "loss": 1.6795, + "step": 6355 + }, + { + "epoch": 0.6602264464526851, + "grad_norm": 0.5357591509819031, + "learning_rate": 2.5881152054172193e-05, + "loss": 1.6828, + "step": 6356 + }, + { + "epoch": 0.6603303209722655, + "grad_norm": 0.422297865152359, + "learning_rate": 2.586686061380999e-05, + "loss": 1.6548, + "step": 6357 + }, + { + "epoch": 0.6604341954918459, + "grad_norm": 0.40216442942619324, + "learning_rate": 2.5852571743438908e-05, + "loss": 1.5247, + "step": 6358 + }, + { + "epoch": 0.6605380700114262, + "grad_norm": 0.4673736095428467, + "learning_rate": 2.583828544458058e-05, + "loss": 1.6892, + "step": 6359 + }, + { + "epoch": 0.6606419445310066, + "grad_norm": 0.42048928141593933, + "learning_rate": 2.582400171875638e-05, + "loss": 1.6188, + "step": 6360 + }, + { + "epoch": 0.6607458190505869, + "grad_norm": 0.4226718246936798, + "learning_rate": 2.5809720567487395e-05, + "loss": 1.4864, + "step": 6361 + }, + { + "epoch": 0.6608496935701672, + "grad_norm": 0.46937909722328186, + "learning_rate": 2.5795441992294523e-05, + "loss": 1.7432, + "step": 6362 + }, + { + "epoch": 0.6609535680897476, + "grad_norm": 0.43607574701309204, + "learning_rate": 2.578116599469825e-05, + "loss": 1.8233, + "step": 6363 + }, + { + "epoch": 0.661057442609328, + "grad_norm": 0.4324806034564972, + "learning_rate": 2.576689257621887e-05, + "loss": 1.7842, + "step": 6364 + }, + { + "epoch": 0.6611613171289082, + "grad_norm": 0.3860158920288086, + "learning_rate": 2.5752621738376447e-05, + "loss": 1.4591, + "step": 6365 + }, + { + "epoch": 0.6612651916484886, + "grad_norm": 0.3831734359264374, + "learning_rate": 2.5738353482690624e-05, + "loss": 1.527, + "step": 6366 + }, + { + "epoch": 0.661369066168069, + "grad_norm": 0.41983523964881897, + "learning_rate": 2.5724087810680937e-05, + "loss": 1.7582, + "step": 6367 + }, + { + "epoch": 0.6614729406876493, + "grad_norm": 0.4340680241584778, + "learning_rate": 2.570982472386653e-05, + "loss": 1.7625, + "step": 6368 + }, + { + "epoch": 0.6615768152072297, + "grad_norm": 0.4355176091194153, + "learning_rate": 2.569556422376632e-05, + "loss": 1.6903, + "step": 6369 + }, + { + "epoch": 0.66168068972681, + "grad_norm": 0.39546066522598267, + "learning_rate": 2.5681306311898945e-05, + "loss": 1.6276, + "step": 6370 + }, + { + "epoch": 0.6617845642463903, + "grad_norm": 0.40482446551322937, + "learning_rate": 2.566705098978275e-05, + "loss": 1.7003, + "step": 6371 + }, + { + "epoch": 0.6618884387659707, + "grad_norm": 0.4112469255924225, + "learning_rate": 2.5652798258935825e-05, + "loss": 1.6817, + "step": 6372 + }, + { + "epoch": 0.6619923132855511, + "grad_norm": 0.4560087025165558, + "learning_rate": 2.5638548120875945e-05, + "loss": 1.7853, + "step": 6373 + }, + { + "epoch": 0.6620961878051314, + "grad_norm": 0.4203476905822754, + "learning_rate": 2.5624300577120703e-05, + "loss": 1.7259, + "step": 6374 + }, + { + "epoch": 0.6622000623247117, + "grad_norm": 0.4099465608596802, + "learning_rate": 2.5610055629187275e-05, + "loss": 1.6924, + "step": 6375 + }, + { + "epoch": 0.6623039368442921, + "grad_norm": 0.4466152489185333, + "learning_rate": 2.559581327859269e-05, + "loss": 1.8033, + "step": 6376 + }, + { + "epoch": 0.6624078113638724, + "grad_norm": 0.487020343542099, + "learning_rate": 2.558157352685363e-05, + "loss": 1.9298, + "step": 6377 + }, + { + "epoch": 0.6625116858834528, + "grad_norm": 0.5030232071876526, + "learning_rate": 2.5567336375486528e-05, + "loss": 1.6374, + "step": 6378 + }, + { + "epoch": 0.6626155604030332, + "grad_norm": 0.4117771089076996, + "learning_rate": 2.555310182600751e-05, + "loss": 1.6823, + "step": 6379 + }, + { + "epoch": 0.6627194349226135, + "grad_norm": 0.5117984414100647, + "learning_rate": 2.5538869879932438e-05, + "loss": 1.7883, + "step": 6380 + }, + { + "epoch": 0.6628233094421938, + "grad_norm": 0.4503713548183441, + "learning_rate": 2.5524640538776955e-05, + "loss": 1.8058, + "step": 6381 + }, + { + "epoch": 0.6629271839617742, + "grad_norm": 0.4063549041748047, + "learning_rate": 2.551041380405631e-05, + "loss": 1.6395, + "step": 6382 + }, + { + "epoch": 0.6630310584813546, + "grad_norm": 0.4199891984462738, + "learning_rate": 2.5496189677285574e-05, + "loss": 1.6788, + "step": 6383 + }, + { + "epoch": 0.6631349330009348, + "grad_norm": 0.45948025584220886, + "learning_rate": 2.548196815997953e-05, + "loss": 1.8964, + "step": 6384 + }, + { + "epoch": 0.6632388075205152, + "grad_norm": 0.4091807007789612, + "learning_rate": 2.546774925365259e-05, + "loss": 1.7551, + "step": 6385 + }, + { + "epoch": 0.6633426820400956, + "grad_norm": 0.3907213807106018, + "learning_rate": 2.5453532959819017e-05, + "loss": 1.6075, + "step": 6386 + }, + { + "epoch": 0.6634465565596759, + "grad_norm": 0.4916064143180847, + "learning_rate": 2.5439319279992708e-05, + "loss": 1.8376, + "step": 6387 + }, + { + "epoch": 0.6635504310792563, + "grad_norm": 0.3837120532989502, + "learning_rate": 2.5425108215687314e-05, + "loss": 1.6351, + "step": 6388 + }, + { + "epoch": 0.6636543055988366, + "grad_norm": 0.4401141405105591, + "learning_rate": 2.5410899768416186e-05, + "loss": 1.7869, + "step": 6389 + }, + { + "epoch": 0.6637581801184169, + "grad_norm": 0.4023844599723816, + "learning_rate": 2.5396693939692474e-05, + "loss": 1.5495, + "step": 6390 + }, + { + "epoch": 0.6638620546379973, + "grad_norm": 0.41856905817985535, + "learning_rate": 2.5382490731028918e-05, + "loss": 1.641, + "step": 6391 + }, + { + "epoch": 0.6639659291575777, + "grad_norm": 0.40514466166496277, + "learning_rate": 2.5368290143938062e-05, + "loss": 1.7506, + "step": 6392 + }, + { + "epoch": 0.664069803677158, + "grad_norm": 0.4336822032928467, + "learning_rate": 2.5354092179932204e-05, + "loss": 1.7111, + "step": 6393 + }, + { + "epoch": 0.6641736781967383, + "grad_norm": 0.4441390037536621, + "learning_rate": 2.533989684052325e-05, + "loss": 1.6557, + "step": 6394 + }, + { + "epoch": 0.6642775527163187, + "grad_norm": 0.4061375558376312, + "learning_rate": 2.5325704127222937e-05, + "loss": 1.5773, + "step": 6395 + }, + { + "epoch": 0.664381427235899, + "grad_norm": 0.4289427101612091, + "learning_rate": 2.5311514041542678e-05, + "loss": 1.8766, + "step": 6396 + }, + { + "epoch": 0.6644853017554794, + "grad_norm": 0.40495064854621887, + "learning_rate": 2.529732658499359e-05, + "loss": 1.582, + "step": 6397 + }, + { + "epoch": 0.6645891762750598, + "grad_norm": 0.4054896831512451, + "learning_rate": 2.5283141759086537e-05, + "loss": 1.6379, + "step": 6398 + }, + { + "epoch": 0.66469305079464, + "grad_norm": 0.3749221861362457, + "learning_rate": 2.5268959565332086e-05, + "loss": 1.4214, + "step": 6399 + }, + { + "epoch": 0.6647969253142204, + "grad_norm": 0.3932819068431854, + "learning_rate": 2.525478000524053e-05, + "loss": 1.6518, + "step": 6400 + }, + { + "epoch": 0.6649007998338008, + "grad_norm": 0.4017762541770935, + "learning_rate": 2.5240603080321863e-05, + "loss": 1.578, + "step": 6401 + }, + { + "epoch": 0.6650046743533811, + "grad_norm": 0.47562339901924133, + "learning_rate": 2.5226428792085875e-05, + "loss": 2.0138, + "step": 6402 + }, + { + "epoch": 0.6651085488729614, + "grad_norm": 0.407149076461792, + "learning_rate": 2.5212257142041938e-05, + "loss": 1.5924, + "step": 6403 + }, + { + "epoch": 0.6652124233925418, + "grad_norm": 0.44227856397628784, + "learning_rate": 2.5198088131699294e-05, + "loss": 1.7202, + "step": 6404 + }, + { + "epoch": 0.6653162979121221, + "grad_norm": 0.44379758834838867, + "learning_rate": 2.5183921762566797e-05, + "loss": 1.8716, + "step": 6405 + }, + { + "epoch": 0.6654201724317025, + "grad_norm": 0.42266109585762024, + "learning_rate": 2.5169758036153056e-05, + "loss": 1.6601, + "step": 6406 + }, + { + "epoch": 0.6655240469512829, + "grad_norm": 0.5245758295059204, + "learning_rate": 2.5155596953966397e-05, + "loss": 1.7759, + "step": 6407 + }, + { + "epoch": 0.6656279214708632, + "grad_norm": 0.3796195089817047, + "learning_rate": 2.514143851751486e-05, + "loss": 1.4598, + "step": 6408 + }, + { + "epoch": 0.6657317959904435, + "grad_norm": 0.38661640882492065, + "learning_rate": 2.512728272830625e-05, + "loss": 1.5706, + "step": 6409 + }, + { + "epoch": 0.6658356705100239, + "grad_norm": 0.41082271933555603, + "learning_rate": 2.511312958784797e-05, + "loss": 1.7148, + "step": 6410 + }, + { + "epoch": 0.6659395450296043, + "grad_norm": 0.42262521386146545, + "learning_rate": 2.5098979097647297e-05, + "loss": 1.561, + "step": 6411 + }, + { + "epoch": 0.6660434195491846, + "grad_norm": 0.3816385865211487, + "learning_rate": 2.508483125921113e-05, + "loss": 1.602, + "step": 6412 + }, + { + "epoch": 0.6661472940687649, + "grad_norm": 0.47888967394828796, + "learning_rate": 2.507068607404604e-05, + "loss": 1.9113, + "step": 6413 + }, + { + "epoch": 0.6662511685883453, + "grad_norm": 0.47457215189933777, + "learning_rate": 2.5056543543658452e-05, + "loss": 1.5093, + "step": 6414 + }, + { + "epoch": 0.6663550431079256, + "grad_norm": 0.3893379867076874, + "learning_rate": 2.5042403669554416e-05, + "loss": 1.5901, + "step": 6415 + }, + { + "epoch": 0.666458917627506, + "grad_norm": 0.4125143587589264, + "learning_rate": 2.5028266453239703e-05, + "loss": 1.6606, + "step": 6416 + }, + { + "epoch": 0.6665627921470864, + "grad_norm": 0.3965260088443756, + "learning_rate": 2.501413189621983e-05, + "loss": 1.6371, + "step": 6417 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.4429340660572052, + "learning_rate": 2.500000000000001e-05, + "loss": 1.7512, + "step": 6418 + }, + { + "epoch": 0.666770541186247, + "grad_norm": 0.471049427986145, + "learning_rate": 2.4985870766085185e-05, + "loss": 1.6877, + "step": 6419 + }, + { + "epoch": 0.6668744157058274, + "grad_norm": 0.4106440544128418, + "learning_rate": 2.4971744195979985e-05, + "loss": 1.6417, + "step": 6420 + }, + { + "epoch": 0.6669782902254077, + "grad_norm": 0.4270106554031372, + "learning_rate": 2.4957620291188844e-05, + "loss": 1.7746, + "step": 6421 + }, + { + "epoch": 0.667082164744988, + "grad_norm": 0.39720797538757324, + "learning_rate": 2.4943499053215756e-05, + "loss": 1.5985, + "step": 6422 + }, + { + "epoch": 0.6671860392645684, + "grad_norm": 0.43498122692108154, + "learning_rate": 2.4929380483564602e-05, + "loss": 1.7645, + "step": 6423 + }, + { + "epoch": 0.6672899137841487, + "grad_norm": 0.3922276198863983, + "learning_rate": 2.4915264583738858e-05, + "loss": 1.5833, + "step": 6424 + }, + { + "epoch": 0.6673937883037291, + "grad_norm": 0.4027794599533081, + "learning_rate": 2.490115135524177e-05, + "loss": 1.6876, + "step": 6425 + }, + { + "epoch": 0.6674976628233095, + "grad_norm": 0.4081961214542389, + "learning_rate": 2.488704079957629e-05, + "loss": 1.7356, + "step": 6426 + }, + { + "epoch": 0.6676015373428897, + "grad_norm": 0.427378386259079, + "learning_rate": 2.487293291824507e-05, + "loss": 1.6506, + "step": 6427 + }, + { + "epoch": 0.6677054118624701, + "grad_norm": 0.4446810185909271, + "learning_rate": 2.4858827712750494e-05, + "loss": 1.8314, + "step": 6428 + }, + { + "epoch": 0.6678092863820505, + "grad_norm": 0.41210460662841797, + "learning_rate": 2.484472518459464e-05, + "loss": 1.6845, + "step": 6429 + }, + { + "epoch": 0.6679131609016308, + "grad_norm": 0.39390888810157776, + "learning_rate": 2.483062533527938e-05, + "loss": 1.4303, + "step": 6430 + }, + { + "epoch": 0.6680170354212112, + "grad_norm": 0.4102948307991028, + "learning_rate": 2.4816528166306146e-05, + "loss": 1.7178, + "step": 6431 + }, + { + "epoch": 0.6681209099407915, + "grad_norm": 0.45998239517211914, + "learning_rate": 2.4802433679176247e-05, + "loss": 1.69, + "step": 6432 + }, + { + "epoch": 0.6682247844603719, + "grad_norm": 0.41955703496932983, + "learning_rate": 2.4788341875390614e-05, + "loss": 1.6254, + "step": 6433 + }, + { + "epoch": 0.6683286589799522, + "grad_norm": 0.41680797934532166, + "learning_rate": 2.4774252756449912e-05, + "loss": 1.6673, + "step": 6434 + }, + { + "epoch": 0.6684325334995326, + "grad_norm": 0.43315428495407104, + "learning_rate": 2.4760166323854524e-05, + "loss": 1.8226, + "step": 6435 + }, + { + "epoch": 0.668536408019113, + "grad_norm": 0.4136770963668823, + "learning_rate": 2.4746082579104518e-05, + "loss": 1.6372, + "step": 6436 + }, + { + "epoch": 0.6686402825386932, + "grad_norm": 0.40513721108436584, + "learning_rate": 2.4732001523699772e-05, + "loss": 1.5601, + "step": 6437 + }, + { + "epoch": 0.6687441570582736, + "grad_norm": 0.4485640227794647, + "learning_rate": 2.471792315913973e-05, + "loss": 1.5854, + "step": 6438 + }, + { + "epoch": 0.668848031577854, + "grad_norm": 0.4075925946235657, + "learning_rate": 2.470384748692368e-05, + "loss": 1.5502, + "step": 6439 + }, + { + "epoch": 0.6689519060974343, + "grad_norm": 0.45135799050331116, + "learning_rate": 2.468977450855058e-05, + "loss": 1.836, + "step": 6440 + }, + { + "epoch": 0.6690557806170147, + "grad_norm": 0.41414105892181396, + "learning_rate": 2.4675704225519018e-05, + "loss": 1.6872, + "step": 6441 + }, + { + "epoch": 0.669159655136595, + "grad_norm": 0.39774543046951294, + "learning_rate": 2.4661636639327444e-05, + "loss": 1.5706, + "step": 6442 + }, + { + "epoch": 0.6692635296561753, + "grad_norm": 0.43322694301605225, + "learning_rate": 2.4647571751473924e-05, + "loss": 1.7595, + "step": 6443 + }, + { + "epoch": 0.6693674041757557, + "grad_norm": 0.4438508450984955, + "learning_rate": 2.4633509563456252e-05, + "loss": 1.5921, + "step": 6444 + }, + { + "epoch": 0.6694712786953361, + "grad_norm": 0.460419237613678, + "learning_rate": 2.461945007677195e-05, + "loss": 1.9157, + "step": 6445 + }, + { + "epoch": 0.6695751532149163, + "grad_norm": 0.4211447536945343, + "learning_rate": 2.4605393292918234e-05, + "loss": 1.7016, + "step": 6446 + }, + { + "epoch": 0.6696790277344967, + "grad_norm": 0.40026143193244934, + "learning_rate": 2.459133921339205e-05, + "loss": 1.5405, + "step": 6447 + }, + { + "epoch": 0.6697829022540771, + "grad_norm": 0.4232792258262634, + "learning_rate": 2.4577287839690022e-05, + "loss": 1.7017, + "step": 6448 + }, + { + "epoch": 0.6698867767736574, + "grad_norm": 0.42340609431266785, + "learning_rate": 2.456323917330857e-05, + "loss": 1.8029, + "step": 6449 + }, + { + "epoch": 0.6699906512932378, + "grad_norm": 0.3742263615131378, + "learning_rate": 2.4549193215743706e-05, + "loss": 1.6008, + "step": 6450 + }, + { + "epoch": 0.6700945258128181, + "grad_norm": 0.41276246309280396, + "learning_rate": 2.453514996849125e-05, + "loss": 1.667, + "step": 6451 + }, + { + "epoch": 0.6701984003323984, + "grad_norm": 0.4266214668750763, + "learning_rate": 2.452110943304669e-05, + "loss": 1.5254, + "step": 6452 + }, + { + "epoch": 0.6703022748519788, + "grad_norm": 0.39875510334968567, + "learning_rate": 2.450707161090523e-05, + "loss": 1.5862, + "step": 6453 + }, + { + "epoch": 0.6704061493715592, + "grad_norm": 0.47201064229011536, + "learning_rate": 2.4493036503561795e-05, + "loss": 1.7249, + "step": 6454 + }, + { + "epoch": 0.6705100238911395, + "grad_norm": 0.43423768877983093, + "learning_rate": 2.4479004112511012e-05, + "loss": 1.5667, + "step": 6455 + }, + { + "epoch": 0.6706138984107198, + "grad_norm": 0.45527634024620056, + "learning_rate": 2.4464974439247213e-05, + "loss": 1.7565, + "step": 6456 + }, + { + "epoch": 0.6707177729303002, + "grad_norm": 0.4044038951396942, + "learning_rate": 2.4450947485264442e-05, + "loss": 1.6019, + "step": 6457 + }, + { + "epoch": 0.6708216474498806, + "grad_norm": 0.4240105450153351, + "learning_rate": 2.4436923252056486e-05, + "loss": 1.6025, + "step": 6458 + }, + { + "epoch": 0.6709255219694609, + "grad_norm": 0.45421627163887024, + "learning_rate": 2.442290174111681e-05, + "loss": 1.8032, + "step": 6459 + }, + { + "epoch": 0.6710293964890413, + "grad_norm": 0.4415995478630066, + "learning_rate": 2.440888295393859e-05, + "loss": 1.844, + "step": 6460 + }, + { + "epoch": 0.6711332710086216, + "grad_norm": 0.410188227891922, + "learning_rate": 2.4394866892014716e-05, + "loss": 1.5926, + "step": 6461 + }, + { + "epoch": 0.6712371455282019, + "grad_norm": 0.46723711490631104, + "learning_rate": 2.438085355683779e-05, + "loss": 1.5847, + "step": 6462 + }, + { + "epoch": 0.6713410200477823, + "grad_norm": 0.3974010944366455, + "learning_rate": 2.436684294990012e-05, + "loss": 1.5292, + "step": 6463 + }, + { + "epoch": 0.6714448945673627, + "grad_norm": 0.39809471368789673, + "learning_rate": 2.4352835072693713e-05, + "loss": 1.6891, + "step": 6464 + }, + { + "epoch": 0.6715487690869429, + "grad_norm": 0.45135897397994995, + "learning_rate": 2.4338829926710356e-05, + "loss": 1.8254, + "step": 6465 + }, + { + "epoch": 0.6716526436065233, + "grad_norm": 0.3903135061264038, + "learning_rate": 2.4324827513441406e-05, + "loss": 1.5315, + "step": 6466 + }, + { + "epoch": 0.6717565181261037, + "grad_norm": 0.47816312313079834, + "learning_rate": 2.431082783437807e-05, + "loss": 1.6046, + "step": 6467 + }, + { + "epoch": 0.671860392645684, + "grad_norm": 0.47781482338905334, + "learning_rate": 2.4296830891011206e-05, + "loss": 1.7201, + "step": 6468 + }, + { + "epoch": 0.6719642671652644, + "grad_norm": 0.4250834584236145, + "learning_rate": 2.4282836684831324e-05, + "loss": 1.5979, + "step": 6469 + }, + { + "epoch": 0.6720681416848447, + "grad_norm": 0.4348602592945099, + "learning_rate": 2.426884521732875e-05, + "loss": 1.6024, + "step": 6470 + }, + { + "epoch": 0.672172016204425, + "grad_norm": 0.3877941370010376, + "learning_rate": 2.425485648999345e-05, + "loss": 1.4421, + "step": 6471 + }, + { + "epoch": 0.6722758907240054, + "grad_norm": 0.4232386350631714, + "learning_rate": 2.4240870504315117e-05, + "loss": 1.7063, + "step": 6472 + }, + { + "epoch": 0.6723797652435858, + "grad_norm": 0.4069366455078125, + "learning_rate": 2.4226887261783143e-05, + "loss": 1.6469, + "step": 6473 + }, + { + "epoch": 0.672483639763166, + "grad_norm": 0.43410545587539673, + "learning_rate": 2.4212906763886638e-05, + "loss": 1.6914, + "step": 6474 + }, + { + "epoch": 0.6725875142827464, + "grad_norm": 0.40854349732398987, + "learning_rate": 2.4198929012114413e-05, + "loss": 1.5814, + "step": 6475 + }, + { + "epoch": 0.6726913888023268, + "grad_norm": 0.4242352247238159, + "learning_rate": 2.4184954007954978e-05, + "loss": 1.6704, + "step": 6476 + }, + { + "epoch": 0.6727952633219071, + "grad_norm": 0.44728976488113403, + "learning_rate": 2.417098175289661e-05, + "loss": 1.7766, + "step": 6477 + }, + { + "epoch": 0.6728991378414875, + "grad_norm": 0.4239303171634674, + "learning_rate": 2.4157012248427173e-05, + "loss": 1.5764, + "step": 6478 + }, + { + "epoch": 0.6730030123610679, + "grad_norm": 0.406534880399704, + "learning_rate": 2.414304549603437e-05, + "loss": 1.7778, + "step": 6479 + }, + { + "epoch": 0.6731068868806481, + "grad_norm": 0.43634507060050964, + "learning_rate": 2.4129081497205535e-05, + "loss": 1.7988, + "step": 6480 + }, + { + "epoch": 0.6732107614002285, + "grad_norm": 0.4149050712585449, + "learning_rate": 2.4115120253427715e-05, + "loss": 1.636, + "step": 6481 + }, + { + "epoch": 0.6733146359198089, + "grad_norm": 0.46954065561294556, + "learning_rate": 2.4101161766187673e-05, + "loss": 1.6532, + "step": 6482 + }, + { + "epoch": 0.6734185104393893, + "grad_norm": 0.4280845820903778, + "learning_rate": 2.4087206036971872e-05, + "loss": 1.5258, + "step": 6483 + }, + { + "epoch": 0.6735223849589695, + "grad_norm": 0.40671125054359436, + "learning_rate": 2.4073253067266532e-05, + "loss": 1.6615, + "step": 6484 + }, + { + "epoch": 0.6736262594785499, + "grad_norm": 0.45035362243652344, + "learning_rate": 2.4059302858557475e-05, + "loss": 1.7412, + "step": 6485 + }, + { + "epoch": 0.6737301339981303, + "grad_norm": 0.4099656045436859, + "learning_rate": 2.4045355412330328e-05, + "loss": 1.6766, + "step": 6486 + }, + { + "epoch": 0.6738340085177106, + "grad_norm": 0.39857202768325806, + "learning_rate": 2.403141073007038e-05, + "loss": 1.6006, + "step": 6487 + }, + { + "epoch": 0.673937883037291, + "grad_norm": 0.4072171151638031, + "learning_rate": 2.401746881326263e-05, + "loss": 1.6916, + "step": 6488 + }, + { + "epoch": 0.6740417575568713, + "grad_norm": 0.4422532916069031, + "learning_rate": 2.400352966339177e-05, + "loss": 1.7378, + "step": 6489 + }, + { + "epoch": 0.6741456320764516, + "grad_norm": 0.41221001744270325, + "learning_rate": 2.398959328194223e-05, + "loss": 1.6959, + "step": 6490 + }, + { + "epoch": 0.674249506596032, + "grad_norm": 0.4334010183811188, + "learning_rate": 2.3975659670398117e-05, + "loss": 1.7088, + "step": 6491 + }, + { + "epoch": 0.6743533811156124, + "grad_norm": 0.4333952069282532, + "learning_rate": 2.3961728830243224e-05, + "loss": 1.4458, + "step": 6492 + }, + { + "epoch": 0.6744572556351927, + "grad_norm": 0.40676429867744446, + "learning_rate": 2.3947800762961147e-05, + "loss": 1.68, + "step": 6493 + }, + { + "epoch": 0.674561130154773, + "grad_norm": 0.39929771423339844, + "learning_rate": 2.3933875470035038e-05, + "loss": 1.4526, + "step": 6494 + }, + { + "epoch": 0.6746650046743534, + "grad_norm": 0.4088972508907318, + "learning_rate": 2.391995295294788e-05, + "loss": 1.6881, + "step": 6495 + }, + { + "epoch": 0.6747688791939337, + "grad_norm": 0.47918689250946045, + "learning_rate": 2.3906033213182328e-05, + "loss": 1.8923, + "step": 6496 + }, + { + "epoch": 0.6748727537135141, + "grad_norm": 0.41081228852272034, + "learning_rate": 2.3892116252220653e-05, + "loss": 1.6732, + "step": 6497 + }, + { + "epoch": 0.6749766282330945, + "grad_norm": 0.3840208649635315, + "learning_rate": 2.3878202071544977e-05, + "loss": 1.5638, + "step": 6498 + }, + { + "epoch": 0.6750805027526747, + "grad_norm": 0.43148073554039, + "learning_rate": 2.3864290672637012e-05, + "loss": 1.7186, + "step": 6499 + }, + { + "epoch": 0.6751843772722551, + "grad_norm": 0.42979419231414795, + "learning_rate": 2.3850382056978228e-05, + "loss": 1.621, + "step": 6500 + }, + { + "epoch": 0.6752882517918355, + "grad_norm": 0.3895018696784973, + "learning_rate": 2.3836476226049775e-05, + "loss": 1.5422, + "step": 6501 + }, + { + "epoch": 0.6753921263114158, + "grad_norm": 0.511653482913971, + "learning_rate": 2.3822573181332526e-05, + "loss": 1.7677, + "step": 6502 + }, + { + "epoch": 0.6754960008309961, + "grad_norm": 0.43084853887557983, + "learning_rate": 2.380867292430704e-05, + "loss": 1.7901, + "step": 6503 + }, + { + "epoch": 0.6755998753505765, + "grad_norm": 0.404482364654541, + "learning_rate": 2.3794775456453573e-05, + "loss": 1.708, + "step": 6504 + }, + { + "epoch": 0.6757037498701568, + "grad_norm": 0.4159182906150818, + "learning_rate": 2.378088077925214e-05, + "loss": 1.5471, + "step": 6505 + }, + { + "epoch": 0.6758076243897372, + "grad_norm": 0.4247626066207886, + "learning_rate": 2.3766988894182357e-05, + "loss": 1.8363, + "step": 6506 + }, + { + "epoch": 0.6759114989093176, + "grad_norm": 0.47585469484329224, + "learning_rate": 2.3753099802723643e-05, + "loss": 2.0124, + "step": 6507 + }, + { + "epoch": 0.6760153734288978, + "grad_norm": 0.44073712825775146, + "learning_rate": 2.3739213506355073e-05, + "loss": 1.698, + "step": 6508 + }, + { + "epoch": 0.6761192479484782, + "grad_norm": 0.4209062159061432, + "learning_rate": 2.372533000655542e-05, + "loss": 1.8448, + "step": 6509 + }, + { + "epoch": 0.6762231224680586, + "grad_norm": 0.40654614567756653, + "learning_rate": 2.3711449304803175e-05, + "loss": 1.6932, + "step": 6510 + }, + { + "epoch": 0.676326996987639, + "grad_norm": 0.42411383986473083, + "learning_rate": 2.3697571402576497e-05, + "loss": 1.6501, + "step": 6511 + }, + { + "epoch": 0.6764308715072193, + "grad_norm": 0.4248904883861542, + "learning_rate": 2.3683696301353337e-05, + "loss": 1.4518, + "step": 6512 + }, + { + "epoch": 0.6765347460267996, + "grad_norm": 0.39044690132141113, + "learning_rate": 2.3669824002611207e-05, + "loss": 1.6496, + "step": 6513 + }, + { + "epoch": 0.67663862054638, + "grad_norm": 0.4223690629005432, + "learning_rate": 2.3655954507827456e-05, + "loss": 1.6076, + "step": 6514 + }, + { + "epoch": 0.6767424950659603, + "grad_norm": 0.44539690017700195, + "learning_rate": 2.364208781847906e-05, + "loss": 1.7109, + "step": 6515 + }, + { + "epoch": 0.6768463695855407, + "grad_norm": 0.4467482566833496, + "learning_rate": 2.3628223936042708e-05, + "loss": 1.6478, + "step": 6516 + }, + { + "epoch": 0.6769502441051211, + "grad_norm": 0.3779411315917969, + "learning_rate": 2.3614362861994803e-05, + "loss": 1.5394, + "step": 6517 + }, + { + "epoch": 0.6770541186247013, + "grad_norm": 0.44637593626976013, + "learning_rate": 2.3600504597811436e-05, + "loss": 1.8076, + "step": 6518 + }, + { + "epoch": 0.6771579931442817, + "grad_norm": 0.43689000606536865, + "learning_rate": 2.3586649144968404e-05, + "loss": 1.718, + "step": 6519 + }, + { + "epoch": 0.6772618676638621, + "grad_norm": 0.4006705582141876, + "learning_rate": 2.357279650494119e-05, + "loss": 1.4616, + "step": 6520 + }, + { + "epoch": 0.6773657421834424, + "grad_norm": 0.42330577969551086, + "learning_rate": 2.3558946679205046e-05, + "loss": 1.6517, + "step": 6521 + }, + { + "epoch": 0.6774696167030227, + "grad_norm": 0.41748082637786865, + "learning_rate": 2.3545099669234792e-05, + "loss": 1.5593, + "step": 6522 + }, + { + "epoch": 0.6775734912226031, + "grad_norm": 0.46182626485824585, + "learning_rate": 2.3531255476505086e-05, + "loss": 1.7158, + "step": 6523 + }, + { + "epoch": 0.6776773657421834, + "grad_norm": 0.444038450717926, + "learning_rate": 2.3517414102490232e-05, + "loss": 1.8144, + "step": 6524 + }, + { + "epoch": 0.6777812402617638, + "grad_norm": 0.46683838963508606, + "learning_rate": 2.350357554866416e-05, + "loss": 1.8248, + "step": 6525 + }, + { + "epoch": 0.6778851147813442, + "grad_norm": 0.4560922384262085, + "learning_rate": 2.3489739816500635e-05, + "loss": 1.7912, + "step": 6526 + }, + { + "epoch": 0.6779889893009244, + "grad_norm": 0.4331681430339813, + "learning_rate": 2.3475906907473027e-05, + "loss": 1.7446, + "step": 6527 + }, + { + "epoch": 0.6780928638205048, + "grad_norm": 0.42912784218788147, + "learning_rate": 2.3462076823054442e-05, + "loss": 1.6939, + "step": 6528 + }, + { + "epoch": 0.6781967383400852, + "grad_norm": 0.451789915561676, + "learning_rate": 2.3448249564717666e-05, + "loss": 1.7973, + "step": 6529 + }, + { + "epoch": 0.6783006128596655, + "grad_norm": 0.4173526465892792, + "learning_rate": 2.3434425133935207e-05, + "loss": 1.6438, + "step": 6530 + }, + { + "epoch": 0.6784044873792459, + "grad_norm": 0.4266951084136963, + "learning_rate": 2.3420603532179246e-05, + "loss": 1.7301, + "step": 6531 + }, + { + "epoch": 0.6785083618988262, + "grad_norm": 0.48929300904273987, + "learning_rate": 2.3406784760921667e-05, + "loss": 1.7364, + "step": 6532 + }, + { + "epoch": 0.6786122364184065, + "grad_norm": 0.3748891055583954, + "learning_rate": 2.339296882163411e-05, + "loss": 1.608, + "step": 6533 + }, + { + "epoch": 0.6787161109379869, + "grad_norm": 0.42468810081481934, + "learning_rate": 2.3379155715787792e-05, + "loss": 1.8397, + "step": 6534 + }, + { + "epoch": 0.6788199854575673, + "grad_norm": 0.46891170740127563, + "learning_rate": 2.3365345444853763e-05, + "loss": 1.9061, + "step": 6535 + }, + { + "epoch": 0.6789238599771477, + "grad_norm": 0.42651164531707764, + "learning_rate": 2.3351538010302688e-05, + "loss": 1.6904, + "step": 6536 + }, + { + "epoch": 0.6790277344967279, + "grad_norm": 0.41323697566986084, + "learning_rate": 2.333773341360494e-05, + "loss": 1.6176, + "step": 6537 + }, + { + "epoch": 0.6791316090163083, + "grad_norm": 0.4378218948841095, + "learning_rate": 2.3323931656230614e-05, + "loss": 1.7018, + "step": 6538 + }, + { + "epoch": 0.6792354835358887, + "grad_norm": 0.42341065406799316, + "learning_rate": 2.331013273964947e-05, + "loss": 1.6211, + "step": 6539 + }, + { + "epoch": 0.679339358055469, + "grad_norm": 0.37841764092445374, + "learning_rate": 2.3296336665331032e-05, + "loss": 1.6159, + "step": 6540 + }, + { + "epoch": 0.6794432325750493, + "grad_norm": 0.38354504108428955, + "learning_rate": 2.3282543434744407e-05, + "loss": 1.6425, + "step": 6541 + }, + { + "epoch": 0.6795471070946297, + "grad_norm": 0.4210819900035858, + "learning_rate": 2.3268753049358522e-05, + "loss": 1.6235, + "step": 6542 + }, + { + "epoch": 0.67965098161421, + "grad_norm": 0.4259316623210907, + "learning_rate": 2.3254965510641928e-05, + "loss": 1.8103, + "step": 6543 + }, + { + "epoch": 0.6797548561337904, + "grad_norm": 0.3991251289844513, + "learning_rate": 2.3241180820062885e-05, + "loss": 1.6595, + "step": 6544 + }, + { + "epoch": 0.6798587306533708, + "grad_norm": 0.3932139575481415, + "learning_rate": 2.3227398979089353e-05, + "loss": 1.586, + "step": 6545 + }, + { + "epoch": 0.679962605172951, + "grad_norm": 0.40890079736709595, + "learning_rate": 2.3213619989189006e-05, + "loss": 1.4932, + "step": 6546 + }, + { + "epoch": 0.6800664796925314, + "grad_norm": 0.42647114396095276, + "learning_rate": 2.319984385182918e-05, + "loss": 1.6931, + "step": 6547 + }, + { + "epoch": 0.6801703542121118, + "grad_norm": 0.39702990651130676, + "learning_rate": 2.318607056847692e-05, + "loss": 1.5665, + "step": 6548 + }, + { + "epoch": 0.6802742287316921, + "grad_norm": 0.3964666724205017, + "learning_rate": 2.3172300140599024e-05, + "loss": 1.7028, + "step": 6549 + }, + { + "epoch": 0.6803781032512725, + "grad_norm": 0.41276541352272034, + "learning_rate": 2.315853256966186e-05, + "loss": 1.5754, + "step": 6550 + }, + { + "epoch": 0.6804819777708528, + "grad_norm": 0.40658214688301086, + "learning_rate": 2.314476785713163e-05, + "loss": 1.8211, + "step": 6551 + }, + { + "epoch": 0.6805858522904331, + "grad_norm": 0.43847519159317017, + "learning_rate": 2.3131006004474152e-05, + "loss": 1.5858, + "step": 6552 + }, + { + "epoch": 0.6806897268100135, + "grad_norm": 0.5137412548065186, + "learning_rate": 2.3117247013154918e-05, + "loss": 1.9818, + "step": 6553 + }, + { + "epoch": 0.6807936013295939, + "grad_norm": 0.41750749945640564, + "learning_rate": 2.31034908846392e-05, + "loss": 1.519, + "step": 6554 + }, + { + "epoch": 0.6808974758491741, + "grad_norm": 0.3900861144065857, + "learning_rate": 2.3089737620391906e-05, + "loss": 1.6836, + "step": 6555 + }, + { + "epoch": 0.6810013503687545, + "grad_norm": 0.4610309302806854, + "learning_rate": 2.307598722187765e-05, + "loss": 1.6183, + "step": 6556 + }, + { + "epoch": 0.6811052248883349, + "grad_norm": 0.4481363892555237, + "learning_rate": 2.306223969056073e-05, + "loss": 1.7593, + "step": 6557 + }, + { + "epoch": 0.6812090994079152, + "grad_norm": 0.37722769379615784, + "learning_rate": 2.3048495027905166e-05, + "loss": 1.5302, + "step": 6558 + }, + { + "epoch": 0.6813129739274956, + "grad_norm": 0.42121073603630066, + "learning_rate": 2.303475323537465e-05, + "loss": 1.5051, + "step": 6559 + }, + { + "epoch": 0.681416848447076, + "grad_norm": 0.49709874391555786, + "learning_rate": 2.302101431443256e-05, + "loss": 1.9087, + "step": 6560 + }, + { + "epoch": 0.6815207229666563, + "grad_norm": 0.4022544026374817, + "learning_rate": 2.300727826654202e-05, + "loss": 1.6909, + "step": 6561 + }, + { + "epoch": 0.6816245974862366, + "grad_norm": 0.41728565096855164, + "learning_rate": 2.299354509316579e-05, + "loss": 1.6303, + "step": 6562 + }, + { + "epoch": 0.681728472005817, + "grad_norm": 0.439998060464859, + "learning_rate": 2.297981479576635e-05, + "loss": 1.8708, + "step": 6563 + }, + { + "epoch": 0.6818323465253974, + "grad_norm": 0.39185917377471924, + "learning_rate": 2.2966087375805874e-05, + "loss": 1.6224, + "step": 6564 + }, + { + "epoch": 0.6819362210449776, + "grad_norm": 0.38439640402793884, + "learning_rate": 2.2952362834746216e-05, + "loss": 1.5744, + "step": 6565 + }, + { + "epoch": 0.682040095564558, + "grad_norm": 0.4013480544090271, + "learning_rate": 2.2938641174048936e-05, + "loss": 1.5341, + "step": 6566 + }, + { + "epoch": 0.6821439700841384, + "grad_norm": 0.44190388917922974, + "learning_rate": 2.2924922395175268e-05, + "loss": 1.8871, + "step": 6567 + }, + { + "epoch": 0.6822478446037187, + "grad_norm": 0.4578564167022705, + "learning_rate": 2.2911206499586207e-05, + "loss": 1.8006, + "step": 6568 + }, + { + "epoch": 0.6823517191232991, + "grad_norm": 0.44078001379966736, + "learning_rate": 2.2897493488742317e-05, + "loss": 1.6603, + "step": 6569 + }, + { + "epoch": 0.6824555936428794, + "grad_norm": 0.4083844721317291, + "learning_rate": 2.2883783364103976e-05, + "loss": 1.5109, + "step": 6570 + }, + { + "epoch": 0.6825594681624597, + "grad_norm": 0.4550290107727051, + "learning_rate": 2.2870076127131196e-05, + "loss": 1.7607, + "step": 6571 + }, + { + "epoch": 0.6826633426820401, + "grad_norm": 0.4314436912536621, + "learning_rate": 2.2856371779283692e-05, + "loss": 1.7517, + "step": 6572 + }, + { + "epoch": 0.6827672172016205, + "grad_norm": 0.4039313793182373, + "learning_rate": 2.2842670322020858e-05, + "loss": 1.7537, + "step": 6573 + }, + { + "epoch": 0.6828710917212008, + "grad_norm": 0.40721437335014343, + "learning_rate": 2.2828971756801802e-05, + "loss": 1.5869, + "step": 6574 + }, + { + "epoch": 0.6829749662407811, + "grad_norm": 0.38915568590164185, + "learning_rate": 2.2815276085085313e-05, + "loss": 1.5199, + "step": 6575 + }, + { + "epoch": 0.6830788407603615, + "grad_norm": 0.42462047934532166, + "learning_rate": 2.2801583308329848e-05, + "loss": 1.6698, + "step": 6576 + }, + { + "epoch": 0.6831827152799418, + "grad_norm": 0.42079266905784607, + "learning_rate": 2.2787893427993646e-05, + "loss": 1.582, + "step": 6577 + }, + { + "epoch": 0.6832865897995222, + "grad_norm": 0.45003542304039, + "learning_rate": 2.2774206445534495e-05, + "loss": 1.4429, + "step": 6578 + }, + { + "epoch": 0.6833904643191026, + "grad_norm": 0.38841044902801514, + "learning_rate": 2.2760522362410003e-05, + "loss": 1.5655, + "step": 6579 + }, + { + "epoch": 0.6834943388386828, + "grad_norm": 0.39281266927719116, + "learning_rate": 2.274684118007743e-05, + "loss": 1.6493, + "step": 6580 + }, + { + "epoch": 0.6835982133582632, + "grad_norm": 0.37398335337638855, + "learning_rate": 2.2733162899993642e-05, + "loss": 1.4814, + "step": 6581 + }, + { + "epoch": 0.6837020878778436, + "grad_norm": 0.43852871656417847, + "learning_rate": 2.271948752361534e-05, + "loss": 1.5412, + "step": 6582 + }, + { + "epoch": 0.6838059623974239, + "grad_norm": 0.4092741012573242, + "learning_rate": 2.2705815052398816e-05, + "loss": 1.6233, + "step": 6583 + }, + { + "epoch": 0.6839098369170042, + "grad_norm": 0.3938494026660919, + "learning_rate": 2.2692145487800093e-05, + "loss": 1.6233, + "step": 6584 + }, + { + "epoch": 0.6840137114365846, + "grad_norm": 0.5273641347885132, + "learning_rate": 2.2678478831274858e-05, + "loss": 1.8792, + "step": 6585 + }, + { + "epoch": 0.6841175859561649, + "grad_norm": 0.41497117280960083, + "learning_rate": 2.266481508427852e-05, + "loss": 1.6535, + "step": 6586 + }, + { + "epoch": 0.6842214604757453, + "grad_norm": 0.39837929606437683, + "learning_rate": 2.2651154248266145e-05, + "loss": 1.5587, + "step": 6587 + }, + { + "epoch": 0.6843253349953257, + "grad_norm": 0.504227340221405, + "learning_rate": 2.2637496324692496e-05, + "loss": 1.6661, + "step": 6588 + }, + { + "epoch": 0.684429209514906, + "grad_norm": 0.425836443901062, + "learning_rate": 2.2623841315012072e-05, + "loss": 1.7157, + "step": 6589 + }, + { + "epoch": 0.6845330840344863, + "grad_norm": 0.429581880569458, + "learning_rate": 2.2610189220679007e-05, + "loss": 1.7036, + "step": 6590 + }, + { + "epoch": 0.6846369585540667, + "grad_norm": 0.4043353497982025, + "learning_rate": 2.2596540043147142e-05, + "loss": 1.6208, + "step": 6591 + }, + { + "epoch": 0.6847408330736471, + "grad_norm": 0.423662006855011, + "learning_rate": 2.2582893783870006e-05, + "loss": 1.8271, + "step": 6592 + }, + { + "epoch": 0.6848447075932274, + "grad_norm": 0.3973560631275177, + "learning_rate": 2.256925044430082e-05, + "loss": 1.638, + "step": 6593 + }, + { + "epoch": 0.6849485821128077, + "grad_norm": 0.41801849007606506, + "learning_rate": 2.2555610025892498e-05, + "loss": 1.5826, + "step": 6594 + }, + { + "epoch": 0.6850524566323881, + "grad_norm": 0.45296528935432434, + "learning_rate": 2.254197253009761e-05, + "loss": 1.5875, + "step": 6595 + }, + { + "epoch": 0.6851563311519684, + "grad_norm": 0.4170701205730438, + "learning_rate": 2.2528337958368507e-05, + "loss": 1.6933, + "step": 6596 + }, + { + "epoch": 0.6852602056715488, + "grad_norm": 0.3868260979652405, + "learning_rate": 2.2514706312157085e-05, + "loss": 1.5773, + "step": 6597 + }, + { + "epoch": 0.6853640801911292, + "grad_norm": 0.3958593010902405, + "learning_rate": 2.2501077592915065e-05, + "loss": 1.6426, + "step": 6598 + }, + { + "epoch": 0.6854679547107094, + "grad_norm": 0.3819239139556885, + "learning_rate": 2.2487451802093784e-05, + "loss": 1.5363, + "step": 6599 + }, + { + "epoch": 0.6855718292302898, + "grad_norm": 0.4158235192298889, + "learning_rate": 2.2473828941144276e-05, + "loss": 1.5895, + "step": 6600 + }, + { + "epoch": 0.6856757037498702, + "grad_norm": 0.4286392331123352, + "learning_rate": 2.246020901151728e-05, + "loss": 1.6797, + "step": 6601 + }, + { + "epoch": 0.6857795782694505, + "grad_norm": 0.39033573865890503, + "learning_rate": 2.24465920146632e-05, + "loss": 1.6592, + "step": 6602 + }, + { + "epoch": 0.6858834527890308, + "grad_norm": 0.47178182005882263, + "learning_rate": 2.2432977952032148e-05, + "loss": 1.7151, + "step": 6603 + }, + { + "epoch": 0.6859873273086112, + "grad_norm": 0.4451124370098114, + "learning_rate": 2.2419366825073894e-05, + "loss": 1.8579, + "step": 6604 + }, + { + "epoch": 0.6860912018281915, + "grad_norm": 0.4096846878528595, + "learning_rate": 2.2405758635237973e-05, + "loss": 1.57, + "step": 6605 + }, + { + "epoch": 0.6861950763477719, + "grad_norm": 0.437682569026947, + "learning_rate": 2.2392153383973473e-05, + "loss": 1.7868, + "step": 6606 + }, + { + "epoch": 0.6862989508673523, + "grad_norm": 0.4488213062286377, + "learning_rate": 2.2378551072729314e-05, + "loss": 1.7668, + "step": 6607 + }, + { + "epoch": 0.6864028253869325, + "grad_norm": 0.4229215383529663, + "learning_rate": 2.2364951702954025e-05, + "loss": 1.7325, + "step": 6608 + }, + { + "epoch": 0.6865066999065129, + "grad_norm": 0.45291194319725037, + "learning_rate": 2.235135527609578e-05, + "loss": 1.8742, + "step": 6609 + }, + { + "epoch": 0.6866105744260933, + "grad_norm": 0.41478431224823, + "learning_rate": 2.233776179360255e-05, + "loss": 1.5874, + "step": 6610 + }, + { + "epoch": 0.6867144489456736, + "grad_norm": 0.4467930495738983, + "learning_rate": 2.2324171256921918e-05, + "loss": 1.8026, + "step": 6611 + }, + { + "epoch": 0.686818323465254, + "grad_norm": 0.4063619375228882, + "learning_rate": 2.2310583667501163e-05, + "loss": 1.68, + "step": 6612 + }, + { + "epoch": 0.6869221979848343, + "grad_norm": 0.4245857000350952, + "learning_rate": 2.2296999026787262e-05, + "loss": 1.5025, + "step": 6613 + }, + { + "epoch": 0.6870260725044147, + "grad_norm": 0.44443392753601074, + "learning_rate": 2.2283417336226853e-05, + "loss": 1.6217, + "step": 6614 + }, + { + "epoch": 0.687129947023995, + "grad_norm": 0.4405082166194916, + "learning_rate": 2.2269838597266342e-05, + "loss": 1.5827, + "step": 6615 + }, + { + "epoch": 0.6872338215435754, + "grad_norm": 0.4452870190143585, + "learning_rate": 2.2256262811351676e-05, + "loss": 1.7894, + "step": 6616 + }, + { + "epoch": 0.6873376960631558, + "grad_norm": 0.37524208426475525, + "learning_rate": 2.2242689979928632e-05, + "loss": 1.5107, + "step": 6617 + }, + { + "epoch": 0.687441570582736, + "grad_norm": 0.4177568554878235, + "learning_rate": 2.222912010444259e-05, + "loss": 1.6793, + "step": 6618 + }, + { + "epoch": 0.6875454451023164, + "grad_norm": 0.39872753620147705, + "learning_rate": 2.2215553186338635e-05, + "loss": 1.6067, + "step": 6619 + }, + { + "epoch": 0.6876493196218968, + "grad_norm": 0.40704911947250366, + "learning_rate": 2.2201989227061535e-05, + "loss": 1.6807, + "step": 6620 + }, + { + "epoch": 0.6877531941414771, + "grad_norm": 0.43390125036239624, + "learning_rate": 2.2188428228055756e-05, + "loss": 1.5434, + "step": 6621 + }, + { + "epoch": 0.6878570686610574, + "grad_norm": 0.4030878245830536, + "learning_rate": 2.2174870190765428e-05, + "loss": 1.6909, + "step": 6622 + }, + { + "epoch": 0.6879609431806378, + "grad_norm": 0.45585277676582336, + "learning_rate": 2.2161315116634362e-05, + "loss": 1.7252, + "step": 6623 + }, + { + "epoch": 0.6880648177002181, + "grad_norm": 0.4133506715297699, + "learning_rate": 2.2147763007106126e-05, + "loss": 1.7813, + "step": 6624 + }, + { + "epoch": 0.6881686922197985, + "grad_norm": 0.45067742466926575, + "learning_rate": 2.213421386362383e-05, + "loss": 1.7338, + "step": 6625 + }, + { + "epoch": 0.6882725667393789, + "grad_norm": 0.4103121757507324, + "learning_rate": 2.212066768763042e-05, + "loss": 1.7215, + "step": 6626 + }, + { + "epoch": 0.6883764412589591, + "grad_norm": 0.43877947330474854, + "learning_rate": 2.2107124480568435e-05, + "loss": 1.5899, + "step": 6627 + }, + { + "epoch": 0.6884803157785395, + "grad_norm": 0.3948143720626831, + "learning_rate": 2.2093584243880116e-05, + "loss": 1.6213, + "step": 6628 + }, + { + "epoch": 0.6885841902981199, + "grad_norm": 0.39642539620399475, + "learning_rate": 2.20800469790074e-05, + "loss": 1.5516, + "step": 6629 + }, + { + "epoch": 0.6886880648177002, + "grad_norm": 0.4189082980155945, + "learning_rate": 2.2066512687391898e-05, + "loss": 1.6291, + "step": 6630 + }, + { + "epoch": 0.6887919393372806, + "grad_norm": 0.48019567131996155, + "learning_rate": 2.20529813704749e-05, + "loss": 1.819, + "step": 6631 + }, + { + "epoch": 0.6888958138568609, + "grad_norm": 0.43770167231559753, + "learning_rate": 2.2039453029697378e-05, + "loss": 1.7297, + "step": 6632 + }, + { + "epoch": 0.6889996883764412, + "grad_norm": 0.3850918114185333, + "learning_rate": 2.202592766650004e-05, + "loss": 1.6097, + "step": 6633 + }, + { + "epoch": 0.6891035628960216, + "grad_norm": 0.4326091408729553, + "learning_rate": 2.2012405282323166e-05, + "loss": 1.4958, + "step": 6634 + }, + { + "epoch": 0.689207437415602, + "grad_norm": 0.3961119055747986, + "learning_rate": 2.1998885878606835e-05, + "loss": 1.5057, + "step": 6635 + }, + { + "epoch": 0.6893113119351822, + "grad_norm": 0.4198208153247833, + "learning_rate": 2.1985369456790757e-05, + "loss": 1.614, + "step": 6636 + }, + { + "epoch": 0.6894151864547626, + "grad_norm": 0.4884597659111023, + "learning_rate": 2.1971856018314274e-05, + "loss": 1.8599, + "step": 6637 + }, + { + "epoch": 0.689519060974343, + "grad_norm": 0.42158132791519165, + "learning_rate": 2.1958345564616515e-05, + "loss": 1.6569, + "step": 6638 + }, + { + "epoch": 0.6896229354939234, + "grad_norm": 0.41984039545059204, + "learning_rate": 2.1944838097136205e-05, + "loss": 1.6422, + "step": 6639 + }, + { + "epoch": 0.6897268100135037, + "grad_norm": 0.42424213886260986, + "learning_rate": 2.1931333617311833e-05, + "loss": 1.8098, + "step": 6640 + }, + { + "epoch": 0.689830684533084, + "grad_norm": 0.3981630206108093, + "learning_rate": 2.1917832126581467e-05, + "loss": 1.6439, + "step": 6641 + }, + { + "epoch": 0.6899345590526644, + "grad_norm": 0.42164191603660583, + "learning_rate": 2.1904333626382918e-05, + "loss": 1.6079, + "step": 6642 + }, + { + "epoch": 0.6900384335722447, + "grad_norm": 0.43774572014808655, + "learning_rate": 2.189083811815371e-05, + "loss": 1.6243, + "step": 6643 + }, + { + "epoch": 0.6901423080918251, + "grad_norm": 0.4788624942302704, + "learning_rate": 2.187734560333095e-05, + "loss": 1.8424, + "step": 6644 + }, + { + "epoch": 0.6902461826114055, + "grad_norm": 0.3973691761493683, + "learning_rate": 2.186385608335154e-05, + "loss": 1.5472, + "step": 6645 + }, + { + "epoch": 0.6903500571309857, + "grad_norm": 0.4038008451461792, + "learning_rate": 2.1850369559651984e-05, + "loss": 1.5287, + "step": 6646 + }, + { + "epoch": 0.6904539316505661, + "grad_norm": 0.38313379883766174, + "learning_rate": 2.1836886033668495e-05, + "loss": 1.5671, + "step": 6647 + }, + { + "epoch": 0.6905578061701465, + "grad_norm": 0.43707215785980225, + "learning_rate": 2.1823405506836962e-05, + "loss": 1.6821, + "step": 6648 + }, + { + "epoch": 0.6906616806897268, + "grad_norm": 0.40973857045173645, + "learning_rate": 2.180992798059296e-05, + "loss": 1.6621, + "step": 6649 + }, + { + "epoch": 0.6907655552093072, + "grad_norm": 0.3938702344894409, + "learning_rate": 2.1796453456371735e-05, + "loss": 1.5388, + "step": 6650 + }, + { + "epoch": 0.6908694297288875, + "grad_norm": 0.3766863942146301, + "learning_rate": 2.17829819356082e-05, + "loss": 1.5155, + "step": 6651 + }, + { + "epoch": 0.6909733042484678, + "grad_norm": 0.4006540775299072, + "learning_rate": 2.1769513419737024e-05, + "loss": 1.768, + "step": 6652 + }, + { + "epoch": 0.6910771787680482, + "grad_norm": 0.4285779297351837, + "learning_rate": 2.1756047910192428e-05, + "loss": 1.6215, + "step": 6653 + }, + { + "epoch": 0.6911810532876286, + "grad_norm": 0.46323853731155396, + "learning_rate": 2.1742585408408438e-05, + "loss": 1.7715, + "step": 6654 + }, + { + "epoch": 0.6912849278072088, + "grad_norm": 0.46970510482788086, + "learning_rate": 2.1729125915818684e-05, + "loss": 1.7967, + "step": 6655 + }, + { + "epoch": 0.6913888023267892, + "grad_norm": 0.39832404255867004, + "learning_rate": 2.17156694338565e-05, + "loss": 1.6482, + "step": 6656 + }, + { + "epoch": 0.6914926768463696, + "grad_norm": 0.42489683628082275, + "learning_rate": 2.1702215963954893e-05, + "loss": 1.6526, + "step": 6657 + }, + { + "epoch": 0.6915965513659499, + "grad_norm": 0.39924055337905884, + "learning_rate": 2.1688765507546556e-05, + "loss": 1.3809, + "step": 6658 + }, + { + "epoch": 0.6917004258855303, + "grad_norm": 0.4202609062194824, + "learning_rate": 2.1675318066063855e-05, + "loss": 1.6426, + "step": 6659 + }, + { + "epoch": 0.6918043004051107, + "grad_norm": 0.42206838726997375, + "learning_rate": 2.166187364093882e-05, + "loss": 1.6819, + "step": 6660 + }, + { + "epoch": 0.6919081749246909, + "grad_norm": 0.4156840443611145, + "learning_rate": 2.1648432233603232e-05, + "loss": 1.8221, + "step": 6661 + }, + { + "epoch": 0.6920120494442713, + "grad_norm": 0.41110360622406006, + "learning_rate": 2.1634993845488426e-05, + "loss": 1.707, + "step": 6662 + }, + { + "epoch": 0.6921159239638517, + "grad_norm": 0.4039273262023926, + "learning_rate": 2.162155847802554e-05, + "loss": 1.6048, + "step": 6663 + }, + { + "epoch": 0.6922197984834321, + "grad_norm": 0.4477052688598633, + "learning_rate": 2.1608126132645317e-05, + "loss": 1.8689, + "step": 6664 + }, + { + "epoch": 0.6923236730030123, + "grad_norm": 0.40778693556785583, + "learning_rate": 2.1594696810778193e-05, + "loss": 1.7781, + "step": 6665 + }, + { + "epoch": 0.6924275475225927, + "grad_norm": 0.42754799127578735, + "learning_rate": 2.1581270513854283e-05, + "loss": 1.71, + "step": 6666 + }, + { + "epoch": 0.6925314220421731, + "grad_norm": 0.4134778380393982, + "learning_rate": 2.156784724330338e-05, + "loss": 1.7161, + "step": 6667 + }, + { + "epoch": 0.6926352965617534, + "grad_norm": 0.40109783411026, + "learning_rate": 2.1554427000555e-05, + "loss": 1.5071, + "step": 6668 + }, + { + "epoch": 0.6927391710813338, + "grad_norm": 0.49069786071777344, + "learning_rate": 2.154100978703824e-05, + "loss": 1.9258, + "step": 6669 + }, + { + "epoch": 0.6928430456009141, + "grad_norm": 0.4370437562465668, + "learning_rate": 2.1527595604181933e-05, + "loss": 1.4963, + "step": 6670 + }, + { + "epoch": 0.6929469201204944, + "grad_norm": 0.45224136114120483, + "learning_rate": 2.151418445341463e-05, + "loss": 1.745, + "step": 6671 + }, + { + "epoch": 0.6930507946400748, + "grad_norm": 0.4150208532810211, + "learning_rate": 2.1500776336164456e-05, + "loss": 1.6489, + "step": 6672 + }, + { + "epoch": 0.6931546691596552, + "grad_norm": 0.4757159352302551, + "learning_rate": 2.1487371253859316e-05, + "loss": 1.8266, + "step": 6673 + }, + { + "epoch": 0.6932585436792355, + "grad_norm": 0.3998592793941498, + "learning_rate": 2.1473969207926737e-05, + "loss": 1.6169, + "step": 6674 + }, + { + "epoch": 0.6933624181988158, + "grad_norm": 0.42257049679756165, + "learning_rate": 2.1460570199793917e-05, + "loss": 1.7495, + "step": 6675 + }, + { + "epoch": 0.6934662927183962, + "grad_norm": 0.3936207592487335, + "learning_rate": 2.1447174230887757e-05, + "loss": 1.7681, + "step": 6676 + }, + { + "epoch": 0.6935701672379765, + "grad_norm": 0.4289858639240265, + "learning_rate": 2.143378130263482e-05, + "loss": 1.665, + "step": 6677 + }, + { + "epoch": 0.6936740417575569, + "grad_norm": 0.4393899738788605, + "learning_rate": 2.142039141646135e-05, + "loss": 1.6799, + "step": 6678 + }, + { + "epoch": 0.6937779162771373, + "grad_norm": 0.4026007056236267, + "learning_rate": 2.140700457379325e-05, + "loss": 1.6812, + "step": 6679 + }, + { + "epoch": 0.6938817907967175, + "grad_norm": 0.4645029306411743, + "learning_rate": 2.1393620776056163e-05, + "loss": 1.7876, + "step": 6680 + }, + { + "epoch": 0.6939856653162979, + "grad_norm": 0.3936692774295807, + "learning_rate": 2.1380240024675285e-05, + "loss": 1.3614, + "step": 6681 + }, + { + "epoch": 0.6940895398358783, + "grad_norm": 0.4686059057712555, + "learning_rate": 2.136686232107562e-05, + "loss": 1.7599, + "step": 6682 + }, + { + "epoch": 0.6941934143554586, + "grad_norm": 0.4059823155403137, + "learning_rate": 2.1353487666681777e-05, + "loss": 1.663, + "step": 6683 + }, + { + "epoch": 0.6942972888750389, + "grad_norm": 0.4833250939846039, + "learning_rate": 2.134011606291804e-05, + "loss": 1.8039, + "step": 6684 + }, + { + "epoch": 0.6944011633946193, + "grad_norm": 0.41929182410240173, + "learning_rate": 2.132674751120839e-05, + "loss": 1.7361, + "step": 6685 + }, + { + "epoch": 0.6945050379141996, + "grad_norm": 0.4122251272201538, + "learning_rate": 2.131338201297647e-05, + "loss": 1.5977, + "step": 6686 + }, + { + "epoch": 0.69460891243378, + "grad_norm": 0.476242333650589, + "learning_rate": 2.1300019569645596e-05, + "loss": 1.9488, + "step": 6687 + }, + { + "epoch": 0.6947127869533604, + "grad_norm": 0.40492451190948486, + "learning_rate": 2.1286660182638756e-05, + "loss": 1.7338, + "step": 6688 + }, + { + "epoch": 0.6948166614729406, + "grad_norm": 0.4443548917770386, + "learning_rate": 2.1273303853378674e-05, + "loss": 1.5402, + "step": 6689 + }, + { + "epoch": 0.694920535992521, + "grad_norm": 0.40552589297294617, + "learning_rate": 2.125995058328763e-05, + "loss": 1.6692, + "step": 6690 + }, + { + "epoch": 0.6950244105121014, + "grad_norm": 0.5920501351356506, + "learning_rate": 2.124660037378765e-05, + "loss": 1.7801, + "step": 6691 + }, + { + "epoch": 0.6951282850316818, + "grad_norm": 0.4190227687358856, + "learning_rate": 2.1233253226300465e-05, + "loss": 1.5115, + "step": 6692 + }, + { + "epoch": 0.695232159551262, + "grad_norm": 0.47602778673171997, + "learning_rate": 2.1219909142247424e-05, + "loss": 1.8221, + "step": 6693 + }, + { + "epoch": 0.6953360340708424, + "grad_norm": 0.43550148606300354, + "learning_rate": 2.1206568123049564e-05, + "loss": 1.6134, + "step": 6694 + }, + { + "epoch": 0.6954399085904228, + "grad_norm": 0.4044070541858673, + "learning_rate": 2.119323017012759e-05, + "loss": 1.5809, + "step": 6695 + }, + { + "epoch": 0.6955437831100031, + "grad_norm": 0.4208220839500427, + "learning_rate": 2.1179895284901924e-05, + "loss": 1.5416, + "step": 6696 + }, + { + "epoch": 0.6956476576295835, + "grad_norm": 0.40507733821868896, + "learning_rate": 2.1166563468792594e-05, + "loss": 1.5112, + "step": 6697 + }, + { + "epoch": 0.6957515321491639, + "grad_norm": 0.4560292959213257, + "learning_rate": 2.1153234723219328e-05, + "loss": 1.6208, + "step": 6698 + }, + { + "epoch": 0.6958554066687441, + "grad_norm": 0.46594947576522827, + "learning_rate": 2.1139909049601587e-05, + "loss": 1.7745, + "step": 6699 + }, + { + "epoch": 0.6959592811883245, + "grad_norm": 0.4122544527053833, + "learning_rate": 2.1126586449358375e-05, + "loss": 1.5898, + "step": 6700 + }, + { + "epoch": 0.6960631557079049, + "grad_norm": 0.45607662200927734, + "learning_rate": 2.111326692390851e-05, + "loss": 1.7258, + "step": 6701 + }, + { + "epoch": 0.6961670302274852, + "grad_norm": 0.4141891300678253, + "learning_rate": 2.1099950474670387e-05, + "loss": 1.6533, + "step": 6702 + }, + { + "epoch": 0.6962709047470655, + "grad_norm": 0.3780854046344757, + "learning_rate": 2.1086637103062107e-05, + "loss": 1.5623, + "step": 6703 + }, + { + "epoch": 0.6963747792666459, + "grad_norm": 0.4268392026424408, + "learning_rate": 2.1073326810501448e-05, + "loss": 1.6596, + "step": 6704 + }, + { + "epoch": 0.6964786537862262, + "grad_norm": 0.40509405732154846, + "learning_rate": 2.1060019598405843e-05, + "loss": 1.5612, + "step": 6705 + }, + { + "epoch": 0.6965825283058066, + "grad_norm": 0.38771677017211914, + "learning_rate": 2.1046715468192414e-05, + "loss": 1.5457, + "step": 6706 + }, + { + "epoch": 0.696686402825387, + "grad_norm": 0.4286300539970398, + "learning_rate": 2.1033414421277924e-05, + "loss": 1.7415, + "step": 6707 + }, + { + "epoch": 0.6967902773449672, + "grad_norm": 0.46014636754989624, + "learning_rate": 2.1020116459078887e-05, + "loss": 1.5627, + "step": 6708 + }, + { + "epoch": 0.6968941518645476, + "grad_norm": 0.42904940247535706, + "learning_rate": 2.1006821583011356e-05, + "loss": 1.6067, + "step": 6709 + }, + { + "epoch": 0.696998026384128, + "grad_norm": 0.39022296667099, + "learning_rate": 2.099352979449119e-05, + "loss": 1.5778, + "step": 6710 + }, + { + "epoch": 0.6971019009037083, + "grad_norm": 0.4107384979724884, + "learning_rate": 2.0980241094933845e-05, + "loss": 1.685, + "step": 6711 + }, + { + "epoch": 0.6972057754232887, + "grad_norm": 0.42475056648254395, + "learning_rate": 2.0966955485754457e-05, + "loss": 1.2405, + "step": 6712 + }, + { + "epoch": 0.697309649942869, + "grad_norm": 0.4216676950454712, + "learning_rate": 2.0953672968367844e-05, + "loss": 1.4806, + "step": 6713 + }, + { + "epoch": 0.6974135244624493, + "grad_norm": 0.37157952785491943, + "learning_rate": 2.094039354418849e-05, + "loss": 1.4771, + "step": 6714 + }, + { + "epoch": 0.6975173989820297, + "grad_norm": 0.44007498025894165, + "learning_rate": 2.0927117214630547e-05, + "loss": 1.8148, + "step": 6715 + }, + { + "epoch": 0.6976212735016101, + "grad_norm": 0.4631294012069702, + "learning_rate": 2.0913843981107835e-05, + "loss": 1.7156, + "step": 6716 + }, + { + "epoch": 0.6977251480211905, + "grad_norm": 0.42013266682624817, + "learning_rate": 2.0900573845033865e-05, + "loss": 1.6558, + "step": 6717 + }, + { + "epoch": 0.6978290225407707, + "grad_norm": 0.39242061972618103, + "learning_rate": 2.0887306807821822e-05, + "loss": 1.5849, + "step": 6718 + }, + { + "epoch": 0.6979328970603511, + "grad_norm": 0.41832900047302246, + "learning_rate": 2.0874042870884475e-05, + "loss": 1.5836, + "step": 6719 + }, + { + "epoch": 0.6980367715799315, + "grad_norm": 0.44263386726379395, + "learning_rate": 2.086078203563439e-05, + "loss": 1.7748, + "step": 6720 + }, + { + "epoch": 0.6981406460995118, + "grad_norm": 0.4205859303474426, + "learning_rate": 2.0847524303483728e-05, + "loss": 1.7206, + "step": 6721 + }, + { + "epoch": 0.6982445206190921, + "grad_norm": 0.422676146030426, + "learning_rate": 2.0834269675844326e-05, + "loss": 1.7428, + "step": 6722 + }, + { + "epoch": 0.6983483951386725, + "grad_norm": 0.42799460887908936, + "learning_rate": 2.0821018154127686e-05, + "loss": 1.7212, + "step": 6723 + }, + { + "epoch": 0.6984522696582528, + "grad_norm": 0.37061259150505066, + "learning_rate": 2.0807769739745043e-05, + "loss": 1.326, + "step": 6724 + }, + { + "epoch": 0.6985561441778332, + "grad_norm": 0.48382559418678284, + "learning_rate": 2.0794524434107192e-05, + "loss": 1.6675, + "step": 6725 + }, + { + "epoch": 0.6986600186974136, + "grad_norm": 0.4368096888065338, + "learning_rate": 2.0781282238624666e-05, + "loss": 1.6125, + "step": 6726 + }, + { + "epoch": 0.6987638932169938, + "grad_norm": 0.424841970205307, + "learning_rate": 2.07680431547077e-05, + "loss": 1.567, + "step": 6727 + }, + { + "epoch": 0.6988677677365742, + "grad_norm": 0.40765881538391113, + "learning_rate": 2.0754807183766084e-05, + "loss": 1.6608, + "step": 6728 + }, + { + "epoch": 0.6989716422561546, + "grad_norm": 0.45474496483802795, + "learning_rate": 2.07415743272094e-05, + "loss": 1.672, + "step": 6729 + }, + { + "epoch": 0.6990755167757349, + "grad_norm": 0.3880652189254761, + "learning_rate": 2.0728344586446825e-05, + "loss": 1.6127, + "step": 6730 + }, + { + "epoch": 0.6991793912953153, + "grad_norm": 0.40850159525871277, + "learning_rate": 2.0715117962887224e-05, + "loss": 1.604, + "step": 6731 + }, + { + "epoch": 0.6992832658148956, + "grad_norm": 0.44652217626571655, + "learning_rate": 2.070189445793913e-05, + "loss": 1.6413, + "step": 6732 + }, + { + "epoch": 0.6993871403344759, + "grad_norm": 0.42982134222984314, + "learning_rate": 2.068867407301074e-05, + "loss": 1.6768, + "step": 6733 + }, + { + "epoch": 0.6994910148540563, + "grad_norm": 0.40679651498794556, + "learning_rate": 2.0675456809509925e-05, + "loss": 1.6151, + "step": 6734 + }, + { + "epoch": 0.6995948893736367, + "grad_norm": 0.427057683467865, + "learning_rate": 2.06622426688442e-05, + "loss": 1.6619, + "step": 6735 + }, + { + "epoch": 0.699698763893217, + "grad_norm": 0.4573904275894165, + "learning_rate": 2.064903165242083e-05, + "loss": 1.7536, + "step": 6736 + }, + { + "epoch": 0.6998026384127973, + "grad_norm": 0.4206405282020569, + "learning_rate": 2.06358237616466e-05, + "loss": 1.8067, + "step": 6737 + }, + { + "epoch": 0.6999065129323777, + "grad_norm": 0.4055739641189575, + "learning_rate": 2.062261899792811e-05, + "loss": 1.6172, + "step": 6738 + }, + { + "epoch": 0.700010387451958, + "grad_norm": 0.45139777660369873, + "learning_rate": 2.0609417362671545e-05, + "loss": 1.7188, + "step": 6739 + }, + { + "epoch": 0.7001142619715384, + "grad_norm": 0.4384258985519409, + "learning_rate": 2.0596218857282778e-05, + "loss": 1.5628, + "step": 6740 + }, + { + "epoch": 0.7002181364911187, + "grad_norm": 0.39930063486099243, + "learning_rate": 2.058302348316734e-05, + "loss": 1.6419, + "step": 6741 + }, + { + "epoch": 0.7003220110106991, + "grad_norm": 0.40519270300865173, + "learning_rate": 2.0569831241730425e-05, + "loss": 1.5731, + "step": 6742 + }, + { + "epoch": 0.7004258855302794, + "grad_norm": 0.46715521812438965, + "learning_rate": 2.0556642134376953e-05, + "loss": 1.6441, + "step": 6743 + }, + { + "epoch": 0.7005297600498598, + "grad_norm": 0.4366108179092407, + "learning_rate": 2.0543456162511386e-05, + "loss": 1.7277, + "step": 6744 + }, + { + "epoch": 0.7006336345694402, + "grad_norm": 0.3887299597263336, + "learning_rate": 2.053027332753799e-05, + "loss": 1.4717, + "step": 6745 + }, + { + "epoch": 0.7007375090890204, + "grad_norm": 0.4008408486843109, + "learning_rate": 2.0517093630860633e-05, + "loss": 1.619, + "step": 6746 + }, + { + "epoch": 0.7008413836086008, + "grad_norm": 0.43216922879219055, + "learning_rate": 2.050391707388279e-05, + "loss": 1.5183, + "step": 6747 + }, + { + "epoch": 0.7009452581281812, + "grad_norm": 0.4278073310852051, + "learning_rate": 2.049074365800772e-05, + "loss": 1.7367, + "step": 6748 + }, + { + "epoch": 0.7010491326477615, + "grad_norm": 0.4540071487426758, + "learning_rate": 2.0477573384638266e-05, + "loss": 1.8808, + "step": 6749 + }, + { + "epoch": 0.7011530071673419, + "grad_norm": 0.4321799874305725, + "learning_rate": 2.0464406255176966e-05, + "loss": 1.6206, + "step": 6750 + }, + { + "epoch": 0.7012568816869222, + "grad_norm": 0.39823484420776367, + "learning_rate": 2.045124227102601e-05, + "loss": 1.6493, + "step": 6751 + }, + { + "epoch": 0.7013607562065025, + "grad_norm": 0.3932490050792694, + "learning_rate": 2.043808143358727e-05, + "loss": 1.6487, + "step": 6752 + }, + { + "epoch": 0.7014646307260829, + "grad_norm": 0.45285892486572266, + "learning_rate": 2.0424923744262258e-05, + "loss": 1.7022, + "step": 6753 + }, + { + "epoch": 0.7015685052456633, + "grad_norm": 0.41068267822265625, + "learning_rate": 2.041176920445216e-05, + "loss": 1.7152, + "step": 6754 + }, + { + "epoch": 0.7016723797652435, + "grad_norm": 0.4176632761955261, + "learning_rate": 2.0398617815557875e-05, + "loss": 1.5213, + "step": 6755 + }, + { + "epoch": 0.7017762542848239, + "grad_norm": 0.3959343433380127, + "learning_rate": 2.038546957897986e-05, + "loss": 1.4975, + "step": 6756 + }, + { + "epoch": 0.7018801288044043, + "grad_norm": 0.4066161513328552, + "learning_rate": 2.037232449611835e-05, + "loss": 1.5669, + "step": 6757 + }, + { + "epoch": 0.7019840033239846, + "grad_norm": 0.4088003635406494, + "learning_rate": 2.0359182568373176e-05, + "loss": 1.7064, + "step": 6758 + }, + { + "epoch": 0.702087877843565, + "grad_norm": 0.4266449213027954, + "learning_rate": 2.0346043797143854e-05, + "loss": 1.5934, + "step": 6759 + }, + { + "epoch": 0.7021917523631453, + "grad_norm": 0.418796181678772, + "learning_rate": 2.0332908183829556e-05, + "loss": 1.6623, + "step": 6760 + }, + { + "epoch": 0.7022956268827256, + "grad_norm": 0.44862931966781616, + "learning_rate": 2.031977572982912e-05, + "loss": 1.8849, + "step": 6761 + }, + { + "epoch": 0.702399501402306, + "grad_norm": 0.44697731733322144, + "learning_rate": 2.030664643654106e-05, + "loss": 1.7985, + "step": 6762 + }, + { + "epoch": 0.7025033759218864, + "grad_norm": 0.38466355204582214, + "learning_rate": 2.0293520305363522e-05, + "loss": 1.5767, + "step": 6763 + }, + { + "epoch": 0.7026072504414667, + "grad_norm": 0.40668144822120667, + "learning_rate": 2.028039733769438e-05, + "loss": 1.7073, + "step": 6764 + }, + { + "epoch": 0.702711124961047, + "grad_norm": 0.42051973938941956, + "learning_rate": 2.0267277534931066e-05, + "loss": 1.3946, + "step": 6765 + }, + { + "epoch": 0.7028149994806274, + "grad_norm": 0.42236143350601196, + "learning_rate": 2.0254160898470792e-05, + "loss": 1.7273, + "step": 6766 + }, + { + "epoch": 0.7029188740002077, + "grad_norm": 0.4152265191078186, + "learning_rate": 2.024104742971035e-05, + "loss": 1.4529, + "step": 6767 + }, + { + "epoch": 0.7030227485197881, + "grad_norm": 0.37839797139167786, + "learning_rate": 2.0227937130046237e-05, + "loss": 1.4317, + "step": 6768 + }, + { + "epoch": 0.7031266230393685, + "grad_norm": 0.4131396412849426, + "learning_rate": 2.0214830000874586e-05, + "loss": 1.659, + "step": 6769 + }, + { + "epoch": 0.7032304975589488, + "grad_norm": 0.4527067244052887, + "learning_rate": 2.0201726043591186e-05, + "loss": 1.7416, + "step": 6770 + }, + { + "epoch": 0.7033343720785291, + "grad_norm": 0.40972375869750977, + "learning_rate": 2.018862525959156e-05, + "loss": 1.6191, + "step": 6771 + }, + { + "epoch": 0.7034382465981095, + "grad_norm": 0.38629379868507385, + "learning_rate": 2.017552765027077e-05, + "loss": 1.6096, + "step": 6772 + }, + { + "epoch": 0.7035421211176899, + "grad_norm": 0.39593833684921265, + "learning_rate": 2.0162433217023667e-05, + "loss": 1.7234, + "step": 6773 + }, + { + "epoch": 0.7036459956372701, + "grad_norm": 0.3909621834754944, + "learning_rate": 2.0149341961244704e-05, + "loss": 1.5358, + "step": 6774 + }, + { + "epoch": 0.7037498701568505, + "grad_norm": 0.4053054749965668, + "learning_rate": 2.0136253884327937e-05, + "loss": 1.592, + "step": 6775 + }, + { + "epoch": 0.7038537446764309, + "grad_norm": 0.4064863622188568, + "learning_rate": 2.0123168987667206e-05, + "loss": 1.6287, + "step": 6776 + }, + { + "epoch": 0.7039576191960112, + "grad_norm": 0.422603040933609, + "learning_rate": 2.011008727265593e-05, + "loss": 1.6935, + "step": 6777 + }, + { + "epoch": 0.7040614937155916, + "grad_norm": 0.418403297662735, + "learning_rate": 2.009700874068721e-05, + "loss": 1.8375, + "step": 6778 + }, + { + "epoch": 0.704165368235172, + "grad_norm": 0.4110656678676605, + "learning_rate": 2.0083933393153808e-05, + "loss": 1.6755, + "step": 6779 + }, + { + "epoch": 0.7042692427547522, + "grad_norm": 0.47258806228637695, + "learning_rate": 2.007086123144814e-05, + "loss": 1.6924, + "step": 6780 + }, + { + "epoch": 0.7043731172743326, + "grad_norm": 0.4690280854701996, + "learning_rate": 2.00577922569623e-05, + "loss": 1.8002, + "step": 6781 + }, + { + "epoch": 0.704476991793913, + "grad_norm": 0.40814927220344543, + "learning_rate": 2.0044726471088005e-05, + "loss": 1.6296, + "step": 6782 + }, + { + "epoch": 0.7045808663134933, + "grad_norm": 0.4353756606578827, + "learning_rate": 2.0031663875216717e-05, + "loss": 1.7662, + "step": 6783 + }, + { + "epoch": 0.7046847408330736, + "grad_norm": 0.41065043210983276, + "learning_rate": 2.0018604470739434e-05, + "loss": 1.5681, + "step": 6784 + }, + { + "epoch": 0.704788615352654, + "grad_norm": 0.4216291308403015, + "learning_rate": 2.000554825904692e-05, + "loss": 1.6114, + "step": 6785 + }, + { + "epoch": 0.7048924898722343, + "grad_norm": 0.43526455760002136, + "learning_rate": 1.9992495241529556e-05, + "loss": 1.6702, + "step": 6786 + }, + { + "epoch": 0.7049963643918147, + "grad_norm": 0.43194445967674255, + "learning_rate": 1.997944541957738e-05, + "loss": 1.7412, + "step": 6787 + }, + { + "epoch": 0.7051002389113951, + "grad_norm": 0.4365564286708832, + "learning_rate": 1.99663987945801e-05, + "loss": 1.7816, + "step": 6788 + }, + { + "epoch": 0.7052041134309753, + "grad_norm": 0.4253787696361542, + "learning_rate": 1.9953355367927075e-05, + "loss": 1.6678, + "step": 6789 + }, + { + "epoch": 0.7053079879505557, + "grad_norm": 0.43567463755607605, + "learning_rate": 1.9940315141007327e-05, + "loss": 1.6948, + "step": 6790 + }, + { + "epoch": 0.7054118624701361, + "grad_norm": 0.41654548048973083, + "learning_rate": 1.992727811520953e-05, + "loss": 1.6762, + "step": 6791 + }, + { + "epoch": 0.7055157369897164, + "grad_norm": 0.3824082612991333, + "learning_rate": 1.991424429192207e-05, + "loss": 1.495, + "step": 6792 + }, + { + "epoch": 0.7056196115092968, + "grad_norm": 0.4068382978439331, + "learning_rate": 1.9901213672532875e-05, + "loss": 1.6277, + "step": 6793 + }, + { + "epoch": 0.7057234860288771, + "grad_norm": 0.3962841033935547, + "learning_rate": 1.988818625842967e-05, + "loss": 1.6252, + "step": 6794 + }, + { + "epoch": 0.7058273605484575, + "grad_norm": 0.4224420487880707, + "learning_rate": 1.9875162050999747e-05, + "loss": 1.5978, + "step": 6795 + }, + { + "epoch": 0.7059312350680378, + "grad_norm": 0.38469555974006653, + "learning_rate": 1.986214105163008e-05, + "loss": 1.521, + "step": 6796 + }, + { + "epoch": 0.7060351095876182, + "grad_norm": 0.43312588334083557, + "learning_rate": 1.9849123261707305e-05, + "loss": 1.7809, + "step": 6797 + }, + { + "epoch": 0.7061389841071986, + "grad_norm": 0.42561838030815125, + "learning_rate": 1.98361086826177e-05, + "loss": 1.6883, + "step": 6798 + }, + { + "epoch": 0.7062428586267788, + "grad_norm": 0.4514966309070587, + "learning_rate": 1.9823097315747267e-05, + "loss": 1.6939, + "step": 6799 + }, + { + "epoch": 0.7063467331463592, + "grad_norm": 0.4347332715988159, + "learning_rate": 1.9810089162481548e-05, + "loss": 1.6935, + "step": 6800 + }, + { + "epoch": 0.7064506076659396, + "grad_norm": 0.45488476753234863, + "learning_rate": 1.9797084224205857e-05, + "loss": 1.8802, + "step": 6801 + }, + { + "epoch": 0.7065544821855199, + "grad_norm": 0.44390448927879333, + "learning_rate": 1.978408250230513e-05, + "loss": 1.7329, + "step": 6802 + }, + { + "epoch": 0.7066583567051002, + "grad_norm": 0.39649200439453125, + "learning_rate": 1.977108399816388e-05, + "loss": 1.5892, + "step": 6803 + }, + { + "epoch": 0.7067622312246806, + "grad_norm": 0.3859086036682129, + "learning_rate": 1.975808871316641e-05, + "loss": 1.5868, + "step": 6804 + }, + { + "epoch": 0.7068661057442609, + "grad_norm": 0.48838183283805847, + "learning_rate": 1.97450966486966e-05, + "loss": 1.7898, + "step": 6805 + }, + { + "epoch": 0.7069699802638413, + "grad_norm": 0.4066251218318939, + "learning_rate": 1.9732107806137996e-05, + "loss": 1.662, + "step": 6806 + }, + { + "epoch": 0.7070738547834217, + "grad_norm": 0.4213266968727112, + "learning_rate": 1.971912218687381e-05, + "loss": 1.5625, + "step": 6807 + }, + { + "epoch": 0.7071777293030019, + "grad_norm": 0.38811829686164856, + "learning_rate": 1.9706139792286915e-05, + "loss": 1.4567, + "step": 6808 + }, + { + "epoch": 0.7072816038225823, + "grad_norm": 0.42099571228027344, + "learning_rate": 1.9693160623759826e-05, + "loss": 1.6887, + "step": 6809 + }, + { + "epoch": 0.7073854783421627, + "grad_norm": 0.45879611372947693, + "learning_rate": 1.968018468267472e-05, + "loss": 1.7088, + "step": 6810 + }, + { + "epoch": 0.707489352861743, + "grad_norm": 0.3958274722099304, + "learning_rate": 1.9667211970413468e-05, + "loss": 1.5636, + "step": 6811 + }, + { + "epoch": 0.7075932273813234, + "grad_norm": 0.4116361141204834, + "learning_rate": 1.96542424883575e-05, + "loss": 1.5771, + "step": 6812 + }, + { + "epoch": 0.7076971019009037, + "grad_norm": 0.42354995012283325, + "learning_rate": 1.9641276237888023e-05, + "loss": 1.8045, + "step": 6813 + }, + { + "epoch": 0.707800976420484, + "grad_norm": 0.4337577223777771, + "learning_rate": 1.9628313220385818e-05, + "loss": 1.7703, + "step": 6814 + }, + { + "epoch": 0.7079048509400644, + "grad_norm": 0.4336363673210144, + "learning_rate": 1.961535343723135e-05, + "loss": 1.4605, + "step": 6815 + }, + { + "epoch": 0.7080087254596448, + "grad_norm": 0.41159120202064514, + "learning_rate": 1.960239688980473e-05, + "loss": 1.6786, + "step": 6816 + }, + { + "epoch": 0.708112599979225, + "grad_norm": 0.4380592405796051, + "learning_rate": 1.958944357948573e-05, + "loss": 1.5045, + "step": 6817 + }, + { + "epoch": 0.7082164744988054, + "grad_norm": 0.3844902515411377, + "learning_rate": 1.9576493507653776e-05, + "loss": 1.653, + "step": 6818 + }, + { + "epoch": 0.7083203490183858, + "grad_norm": 0.41153958439826965, + "learning_rate": 1.9563546675687932e-05, + "loss": 1.7776, + "step": 6819 + }, + { + "epoch": 0.7084242235379662, + "grad_norm": 0.39045795798301697, + "learning_rate": 1.9550603084966973e-05, + "loss": 1.5983, + "step": 6820 + }, + { + "epoch": 0.7085280980575465, + "grad_norm": 0.4701341688632965, + "learning_rate": 1.9537662736869272e-05, + "loss": 1.7198, + "step": 6821 + }, + { + "epoch": 0.7086319725771268, + "grad_norm": 0.4244726896286011, + "learning_rate": 1.9524725632772878e-05, + "loss": 1.6711, + "step": 6822 + }, + { + "epoch": 0.7087358470967072, + "grad_norm": 0.44290024042129517, + "learning_rate": 1.9511791774055482e-05, + "loss": 1.8374, + "step": 6823 + }, + { + "epoch": 0.7088397216162875, + "grad_norm": 0.4390253722667694, + "learning_rate": 1.9498861162094446e-05, + "loss": 1.6595, + "step": 6824 + }, + { + "epoch": 0.7089435961358679, + "grad_norm": 0.3866863548755646, + "learning_rate": 1.948593379826678e-05, + "loss": 1.5316, + "step": 6825 + }, + { + "epoch": 0.7090474706554483, + "grad_norm": 0.4000120162963867, + "learning_rate": 1.947300968394913e-05, + "loss": 1.5832, + "step": 6826 + }, + { + "epoch": 0.7091513451750285, + "grad_norm": 0.4002024531364441, + "learning_rate": 1.946008882051786e-05, + "loss": 1.6259, + "step": 6827 + }, + { + "epoch": 0.7092552196946089, + "grad_norm": 0.43123364448547363, + "learning_rate": 1.9447171209348875e-05, + "loss": 1.6728, + "step": 6828 + }, + { + "epoch": 0.7093590942141893, + "grad_norm": 0.39023008942604065, + "learning_rate": 1.943425685181785e-05, + "loss": 1.5225, + "step": 6829 + }, + { + "epoch": 0.7094629687337696, + "grad_norm": 0.40491533279418945, + "learning_rate": 1.9421345749300073e-05, + "loss": 1.6512, + "step": 6830 + }, + { + "epoch": 0.70956684325335, + "grad_norm": 0.4594792127609253, + "learning_rate": 1.940843790317041e-05, + "loss": 1.6372, + "step": 6831 + }, + { + "epoch": 0.7096707177729303, + "grad_norm": 0.4281672537326813, + "learning_rate": 1.939553331480351e-05, + "loss": 1.5949, + "step": 6832 + }, + { + "epoch": 0.7097745922925106, + "grad_norm": 0.4785386025905609, + "learning_rate": 1.9382631985573586e-05, + "loss": 1.682, + "step": 6833 + }, + { + "epoch": 0.709878466812091, + "grad_norm": 0.42151594161987305, + "learning_rate": 1.9369733916854526e-05, + "loss": 1.5583, + "step": 6834 + }, + { + "epoch": 0.7099823413316714, + "grad_norm": 0.4763428866863251, + "learning_rate": 1.935683911001988e-05, + "loss": 1.6265, + "step": 6835 + }, + { + "epoch": 0.7100862158512516, + "grad_norm": 0.4347456693649292, + "learning_rate": 1.934394756644285e-05, + "loss": 1.7359, + "step": 6836 + }, + { + "epoch": 0.710190090370832, + "grad_norm": 0.40570497512817383, + "learning_rate": 1.9331059287496274e-05, + "loss": 1.6238, + "step": 6837 + }, + { + "epoch": 0.7102939648904124, + "grad_norm": 0.41450199484825134, + "learning_rate": 1.9318174274552638e-05, + "loss": 1.7365, + "step": 6838 + }, + { + "epoch": 0.7103978394099927, + "grad_norm": 0.44917452335357666, + "learning_rate": 1.930529252898415e-05, + "loss": 1.7917, + "step": 6839 + }, + { + "epoch": 0.7105017139295731, + "grad_norm": 0.4153517186641693, + "learning_rate": 1.929241405216254e-05, + "loss": 1.7213, + "step": 6840 + }, + { + "epoch": 0.7106055884491534, + "grad_norm": 0.4131045639514923, + "learning_rate": 1.9279538845459327e-05, + "loss": 1.5519, + "step": 6841 + }, + { + "epoch": 0.7107094629687337, + "grad_norm": 0.38155892491340637, + "learning_rate": 1.92666669102456e-05, + "loss": 1.505, + "step": 6842 + }, + { + "epoch": 0.7108133374883141, + "grad_norm": 0.45029881596565247, + "learning_rate": 1.9253798247892113e-05, + "loss": 1.678, + "step": 6843 + }, + { + "epoch": 0.7109172120078945, + "grad_norm": 0.4427982568740845, + "learning_rate": 1.924093285976929e-05, + "loss": 1.8031, + "step": 6844 + }, + { + "epoch": 0.7110210865274748, + "grad_norm": 0.42872363328933716, + "learning_rate": 1.9228070747247185e-05, + "loss": 1.7211, + "step": 6845 + }, + { + "epoch": 0.7111249610470551, + "grad_norm": 0.42956963181495667, + "learning_rate": 1.9215211911695512e-05, + "loss": 1.7405, + "step": 6846 + }, + { + "epoch": 0.7112288355666355, + "grad_norm": 0.39396312832832336, + "learning_rate": 1.9202356354483624e-05, + "loss": 1.6972, + "step": 6847 + }, + { + "epoch": 0.7113327100862159, + "grad_norm": 0.41904011368751526, + "learning_rate": 1.9189504076980567e-05, + "loss": 1.7432, + "step": 6848 + }, + { + "epoch": 0.7114365846057962, + "grad_norm": 0.37891194224357605, + "learning_rate": 1.9176655080555e-05, + "loss": 1.6582, + "step": 6849 + }, + { + "epoch": 0.7115404591253766, + "grad_norm": 0.4023973345756531, + "learning_rate": 1.9163809366575235e-05, + "loss": 1.6672, + "step": 6850 + }, + { + "epoch": 0.7116443336449569, + "grad_norm": 0.44481852650642395, + "learning_rate": 1.9150966936409242e-05, + "loss": 1.7137, + "step": 6851 + }, + { + "epoch": 0.7117482081645372, + "grad_norm": 0.39180970191955566, + "learning_rate": 1.9138127791424637e-05, + "loss": 1.5847, + "step": 6852 + }, + { + "epoch": 0.7118520826841176, + "grad_norm": 0.4336892068386078, + "learning_rate": 1.912529193298869e-05, + "loss": 1.7037, + "step": 6853 + }, + { + "epoch": 0.711955957203698, + "grad_norm": 0.3839171230792999, + "learning_rate": 1.91124593624683e-05, + "loss": 1.5538, + "step": 6854 + }, + { + "epoch": 0.7120598317232782, + "grad_norm": 0.40723204612731934, + "learning_rate": 1.9099630081230095e-05, + "loss": 1.4787, + "step": 6855 + }, + { + "epoch": 0.7121637062428586, + "grad_norm": 0.449480801820755, + "learning_rate": 1.908680409064022e-05, + "loss": 1.831, + "step": 6856 + }, + { + "epoch": 0.712267580762439, + "grad_norm": 0.5175827145576477, + "learning_rate": 1.9073981392064587e-05, + "loss": 1.9043, + "step": 6857 + }, + { + "epoch": 0.7123714552820193, + "grad_norm": 0.41189754009246826, + "learning_rate": 1.9061161986868732e-05, + "loss": 1.7122, + "step": 6858 + }, + { + "epoch": 0.7124753298015997, + "grad_norm": 0.43195226788520813, + "learning_rate": 1.9048345876417752e-05, + "loss": 1.6308, + "step": 6859 + }, + { + "epoch": 0.71257920432118, + "grad_norm": 0.41276681423187256, + "learning_rate": 1.9035533062076522e-05, + "loss": 1.658, + "step": 6860 + }, + { + "epoch": 0.7126830788407603, + "grad_norm": 0.4124269485473633, + "learning_rate": 1.9022723545209487e-05, + "loss": 1.5886, + "step": 6861 + }, + { + "epoch": 0.7127869533603407, + "grad_norm": 0.4499133825302124, + "learning_rate": 1.9009917327180766e-05, + "loss": 1.5387, + "step": 6862 + }, + { + "epoch": 0.7128908278799211, + "grad_norm": 0.41163170337677, + "learning_rate": 1.8997114409354116e-05, + "loss": 1.7033, + "step": 6863 + }, + { + "epoch": 0.7129947023995014, + "grad_norm": 0.4106546640396118, + "learning_rate": 1.898431479309295e-05, + "loss": 1.6542, + "step": 6864 + }, + { + "epoch": 0.7130985769190817, + "grad_norm": 0.46902644634246826, + "learning_rate": 1.8971518479760324e-05, + "loss": 1.6021, + "step": 6865 + }, + { + "epoch": 0.7132024514386621, + "grad_norm": 0.4623051881790161, + "learning_rate": 1.8958725470718937e-05, + "loss": 1.9381, + "step": 6866 + }, + { + "epoch": 0.7133063259582424, + "grad_norm": 0.43215009570121765, + "learning_rate": 1.8945935767331186e-05, + "loss": 1.6968, + "step": 6867 + }, + { + "epoch": 0.7134102004778228, + "grad_norm": 0.4356382191181183, + "learning_rate": 1.893314937095901e-05, + "loss": 1.6144, + "step": 6868 + }, + { + "epoch": 0.7135140749974032, + "grad_norm": 0.456093966960907, + "learning_rate": 1.8920366282964115e-05, + "loss": 1.723, + "step": 6869 + }, + { + "epoch": 0.7136179495169834, + "grad_norm": 0.5878288149833679, + "learning_rate": 1.8907586504707773e-05, + "loss": 1.5267, + "step": 6870 + }, + { + "epoch": 0.7137218240365638, + "grad_norm": 0.3891201913356781, + "learning_rate": 1.8894810037550947e-05, + "loss": 1.53, + "step": 6871 + }, + { + "epoch": 0.7138256985561442, + "grad_norm": 0.3847010135650635, + "learning_rate": 1.888203688285422e-05, + "loss": 1.6174, + "step": 6872 + }, + { + "epoch": 0.7139295730757246, + "grad_norm": 0.4661828577518463, + "learning_rate": 1.8869267041977817e-05, + "loss": 1.7112, + "step": 6873 + }, + { + "epoch": 0.7140334475953048, + "grad_norm": 0.4062906503677368, + "learning_rate": 1.8856500516281684e-05, + "loss": 1.6291, + "step": 6874 + }, + { + "epoch": 0.7141373221148852, + "grad_norm": 0.5327140688896179, + "learning_rate": 1.8843737307125275e-05, + "loss": 1.4871, + "step": 6875 + }, + { + "epoch": 0.7142411966344656, + "grad_norm": 0.4605855643749237, + "learning_rate": 1.883097741586784e-05, + "loss": 1.4438, + "step": 6876 + }, + { + "epoch": 0.7143450711540459, + "grad_norm": 0.4348846673965454, + "learning_rate": 1.8818220843868178e-05, + "loss": 1.6393, + "step": 6877 + }, + { + "epoch": 0.7144489456736263, + "grad_norm": 0.41968026757240295, + "learning_rate": 1.8805467592484772e-05, + "loss": 1.7655, + "step": 6878 + }, + { + "epoch": 0.7145528201932067, + "grad_norm": 0.47305288910865784, + "learning_rate": 1.8792717663075738e-05, + "loss": 1.9356, + "step": 6879 + }, + { + "epoch": 0.7146566947127869, + "grad_norm": 0.4094095826148987, + "learning_rate": 1.8779971056998846e-05, + "loss": 1.5515, + "step": 6880 + }, + { + "epoch": 0.7147605692323673, + "grad_norm": 0.41150006651878357, + "learning_rate": 1.876722777561151e-05, + "loss": 1.7381, + "step": 6881 + }, + { + "epoch": 0.7148644437519477, + "grad_norm": 0.405069500207901, + "learning_rate": 1.8754487820270778e-05, + "loss": 1.5871, + "step": 6882 + }, + { + "epoch": 0.714968318271528, + "grad_norm": 0.4334279000759125, + "learning_rate": 1.87417511923334e-05, + "loss": 1.7346, + "step": 6883 + }, + { + "epoch": 0.7150721927911083, + "grad_norm": 0.4156447947025299, + "learning_rate": 1.872901789315566e-05, + "loss": 1.7387, + "step": 6884 + }, + { + "epoch": 0.7151760673106887, + "grad_norm": 0.4099999666213989, + "learning_rate": 1.8716287924093613e-05, + "loss": 1.6588, + "step": 6885 + }, + { + "epoch": 0.715279941830269, + "grad_norm": 0.39068254828453064, + "learning_rate": 1.8703561286502896e-05, + "loss": 1.5263, + "step": 6886 + }, + { + "epoch": 0.7153838163498494, + "grad_norm": 0.4342229664325714, + "learning_rate": 1.8690837981738747e-05, + "loss": 1.6721, + "step": 6887 + }, + { + "epoch": 0.7154876908694298, + "grad_norm": 0.40513840317726135, + "learning_rate": 1.867811801115614e-05, + "loss": 1.6021, + "step": 6888 + }, + { + "epoch": 0.71559156538901, + "grad_norm": 0.3993147313594818, + "learning_rate": 1.8665401376109654e-05, + "loss": 1.7418, + "step": 6889 + }, + { + "epoch": 0.7156954399085904, + "grad_norm": 0.4815475046634674, + "learning_rate": 1.86526880779535e-05, + "loss": 1.9432, + "step": 6890 + }, + { + "epoch": 0.7157993144281708, + "grad_norm": 0.4066917896270752, + "learning_rate": 1.863997811804155e-05, + "loss": 1.4421, + "step": 6891 + }, + { + "epoch": 0.7159031889477511, + "grad_norm": 0.4122363030910492, + "learning_rate": 1.862727149772731e-05, + "loss": 1.6838, + "step": 6892 + }, + { + "epoch": 0.7160070634673315, + "grad_norm": 0.41208595037460327, + "learning_rate": 1.8614568218363937e-05, + "loss": 1.6661, + "step": 6893 + }, + { + "epoch": 0.7161109379869118, + "grad_norm": 0.4167573153972626, + "learning_rate": 1.860186828130422e-05, + "loss": 1.7035, + "step": 6894 + }, + { + "epoch": 0.7162148125064921, + "grad_norm": 0.40936407446861267, + "learning_rate": 1.8589171687900646e-05, + "loss": 1.5622, + "step": 6895 + }, + { + "epoch": 0.7163186870260725, + "grad_norm": 0.400158554315567, + "learning_rate": 1.8576478439505248e-05, + "loss": 1.6196, + "step": 6896 + }, + { + "epoch": 0.7164225615456529, + "grad_norm": 0.42571789026260376, + "learning_rate": 1.856378853746979e-05, + "loss": 1.7369, + "step": 6897 + }, + { + "epoch": 0.7165264360652333, + "grad_norm": 0.395346462726593, + "learning_rate": 1.855110198314565e-05, + "loss": 1.7267, + "step": 6898 + }, + { + "epoch": 0.7166303105848135, + "grad_norm": 0.43957024812698364, + "learning_rate": 1.8538418777883825e-05, + "loss": 1.7538, + "step": 6899 + }, + { + "epoch": 0.7167341851043939, + "grad_norm": 0.41253888607025146, + "learning_rate": 1.8525738923035003e-05, + "loss": 1.5747, + "step": 6900 + }, + { + "epoch": 0.7168380596239743, + "grad_norm": 0.4174502193927765, + "learning_rate": 1.8513062419949452e-05, + "loss": 1.8062, + "step": 6901 + }, + { + "epoch": 0.7169419341435546, + "grad_norm": 0.42370733618736267, + "learning_rate": 1.8500389269977186e-05, + "loss": 1.668, + "step": 6902 + }, + { + "epoch": 0.7170458086631349, + "grad_norm": 0.41417741775512695, + "learning_rate": 1.848771947446772e-05, + "loss": 1.6166, + "step": 6903 + }, + { + "epoch": 0.7171496831827153, + "grad_norm": 0.4121987819671631, + "learning_rate": 1.847505303477034e-05, + "loss": 1.5487, + "step": 6904 + }, + { + "epoch": 0.7172535577022956, + "grad_norm": 0.39274632930755615, + "learning_rate": 1.8462389952233905e-05, + "loss": 1.5665, + "step": 6905 + }, + { + "epoch": 0.717357432221876, + "grad_norm": 0.41530582308769226, + "learning_rate": 1.8449730228206945e-05, + "loss": 1.4904, + "step": 6906 + }, + { + "epoch": 0.7174613067414564, + "grad_norm": 0.4424355626106262, + "learning_rate": 1.8437073864037608e-05, + "loss": 1.6915, + "step": 6907 + }, + { + "epoch": 0.7175651812610366, + "grad_norm": 0.41284075379371643, + "learning_rate": 1.842442086107371e-05, + "loss": 1.6279, + "step": 6908 + }, + { + "epoch": 0.717669055780617, + "grad_norm": 0.4062071442604065, + "learning_rate": 1.8411771220662684e-05, + "loss": 1.6648, + "step": 6909 + }, + { + "epoch": 0.7177729303001974, + "grad_norm": 0.4447495639324188, + "learning_rate": 1.8399124944151614e-05, + "loss": 1.755, + "step": 6910 + }, + { + "epoch": 0.7178768048197777, + "grad_norm": 0.4060676395893097, + "learning_rate": 1.838648203288727e-05, + "loss": 1.7247, + "step": 6911 + }, + { + "epoch": 0.717980679339358, + "grad_norm": 0.4383039176464081, + "learning_rate": 1.8373842488215966e-05, + "loss": 1.5317, + "step": 6912 + }, + { + "epoch": 0.7180845538589384, + "grad_norm": 0.42534106969833374, + "learning_rate": 1.836120631148376e-05, + "loss": 1.7154, + "step": 6913 + }, + { + "epoch": 0.7181884283785187, + "grad_norm": 0.4181063175201416, + "learning_rate": 1.8348573504036305e-05, + "loss": 1.7214, + "step": 6914 + }, + { + "epoch": 0.7182923028980991, + "grad_norm": 0.40229707956314087, + "learning_rate": 1.8335944067218846e-05, + "loss": 1.652, + "step": 6915 + }, + { + "epoch": 0.7183961774176795, + "grad_norm": 0.4175788462162018, + "learning_rate": 1.8323318002376372e-05, + "loss": 1.6764, + "step": 6916 + }, + { + "epoch": 0.7185000519372597, + "grad_norm": 0.4145587682723999, + "learning_rate": 1.831069531085344e-05, + "loss": 1.7295, + "step": 6917 + }, + { + "epoch": 0.7186039264568401, + "grad_norm": 0.4142177700996399, + "learning_rate": 1.829807599399428e-05, + "loss": 1.5781, + "step": 6918 + }, + { + "epoch": 0.7187078009764205, + "grad_norm": 0.4172203540802002, + "learning_rate": 1.828546005314274e-05, + "loss": 1.6024, + "step": 6919 + }, + { + "epoch": 0.7188116754960008, + "grad_norm": 0.42822572588920593, + "learning_rate": 1.8272847489642314e-05, + "loss": 1.7269, + "step": 6920 + }, + { + "epoch": 0.7189155500155812, + "grad_norm": 0.4436422288417816, + "learning_rate": 1.826023830483616e-05, + "loss": 1.6872, + "step": 6921 + }, + { + "epoch": 0.7190194245351615, + "grad_norm": 0.39383989572525024, + "learning_rate": 1.8247632500067017e-05, + "loss": 1.3766, + "step": 6922 + }, + { + "epoch": 0.7191232990547419, + "grad_norm": 0.4555588364601135, + "learning_rate": 1.8235030076677362e-05, + "loss": 1.7151, + "step": 6923 + }, + { + "epoch": 0.7192271735743222, + "grad_norm": 0.42651811242103577, + "learning_rate": 1.8222431036009226e-05, + "loss": 1.6841, + "step": 6924 + }, + { + "epoch": 0.7193310480939026, + "grad_norm": 0.4434896409511566, + "learning_rate": 1.8209835379404304e-05, + "loss": 1.5951, + "step": 6925 + }, + { + "epoch": 0.719434922613483, + "grad_norm": 0.38876456022262573, + "learning_rate": 1.819724310820395e-05, + "loss": 1.6513, + "step": 6926 + }, + { + "epoch": 0.7195387971330632, + "grad_norm": 0.4082390069961548, + "learning_rate": 1.8184654223749126e-05, + "loss": 1.63, + "step": 6927 + }, + { + "epoch": 0.7196426716526436, + "grad_norm": 0.5180742740631104, + "learning_rate": 1.8172068727380464e-05, + "loss": 1.7954, + "step": 6928 + }, + { + "epoch": 0.719746546172224, + "grad_norm": 0.43790149688720703, + "learning_rate": 1.8159486620438193e-05, + "loss": 1.7982, + "step": 6929 + }, + { + "epoch": 0.7198504206918043, + "grad_norm": 0.4642382562160492, + "learning_rate": 1.8146907904262267e-05, + "loss": 1.8414, + "step": 6930 + }, + { + "epoch": 0.7199542952113847, + "grad_norm": 0.4199405014514923, + "learning_rate": 1.8134332580192152e-05, + "loss": 1.8044, + "step": 6931 + }, + { + "epoch": 0.720058169730965, + "grad_norm": 0.4645751714706421, + "learning_rate": 1.8121760649567077e-05, + "loss": 1.6786, + "step": 6932 + }, + { + "epoch": 0.7201620442505453, + "grad_norm": 0.40321746468544006, + "learning_rate": 1.8109192113725826e-05, + "loss": 1.51, + "step": 6933 + }, + { + "epoch": 0.7202659187701257, + "grad_norm": 0.44322633743286133, + "learning_rate": 1.809662697400686e-05, + "loss": 1.5216, + "step": 6934 + }, + { + "epoch": 0.7203697932897061, + "grad_norm": 0.48086169362068176, + "learning_rate": 1.8084065231748265e-05, + "loss": 1.7657, + "step": 6935 + }, + { + "epoch": 0.7204736678092863, + "grad_norm": 0.46815988421440125, + "learning_rate": 1.807150688828777e-05, + "loss": 1.7116, + "step": 6936 + }, + { + "epoch": 0.7205775423288667, + "grad_norm": 0.42190316319465637, + "learning_rate": 1.8058951944962736e-05, + "loss": 1.503, + "step": 6937 + }, + { + "epoch": 0.7206814168484471, + "grad_norm": 0.4263322949409485, + "learning_rate": 1.804640040311016e-05, + "loss": 1.6463, + "step": 6938 + }, + { + "epoch": 0.7207852913680274, + "grad_norm": 0.4055632948875427, + "learning_rate": 1.8033852264066718e-05, + "loss": 1.4503, + "step": 6939 + }, + { + "epoch": 0.7208891658876078, + "grad_norm": 0.40299129486083984, + "learning_rate": 1.802130752916864e-05, + "loss": 1.6289, + "step": 6940 + }, + { + "epoch": 0.7209930404071881, + "grad_norm": 0.4563215374946594, + "learning_rate": 1.800876619975188e-05, + "loss": 1.6664, + "step": 6941 + }, + { + "epoch": 0.7210969149267684, + "grad_norm": 0.4018884599208832, + "learning_rate": 1.7996228277151995e-05, + "loss": 1.5518, + "step": 6942 + }, + { + "epoch": 0.7212007894463488, + "grad_norm": 0.4353574514389038, + "learning_rate": 1.7983693762704122e-05, + "loss": 1.5822, + "step": 6943 + }, + { + "epoch": 0.7213046639659292, + "grad_norm": 0.4123103618621826, + "learning_rate": 1.797116265774315e-05, + "loss": 1.6165, + "step": 6944 + }, + { + "epoch": 0.7214085384855095, + "grad_norm": 0.4055830240249634, + "learning_rate": 1.7958634963603516e-05, + "loss": 1.5928, + "step": 6945 + }, + { + "epoch": 0.7215124130050898, + "grad_norm": 0.439450204372406, + "learning_rate": 1.7946110681619327e-05, + "loss": 1.8438, + "step": 6946 + }, + { + "epoch": 0.7216162875246702, + "grad_norm": 0.39102089405059814, + "learning_rate": 1.7933589813124325e-05, + "loss": 1.6103, + "step": 6947 + }, + { + "epoch": 0.7217201620442505, + "grad_norm": 0.39481091499328613, + "learning_rate": 1.7921072359451872e-05, + "loss": 1.6833, + "step": 6948 + }, + { + "epoch": 0.7218240365638309, + "grad_norm": 0.4227730333805084, + "learning_rate": 1.7908558321934992e-05, + "loss": 1.5623, + "step": 6949 + }, + { + "epoch": 0.7219279110834113, + "grad_norm": 0.41327497363090515, + "learning_rate": 1.789604770190631e-05, + "loss": 1.5559, + "step": 6950 + }, + { + "epoch": 0.7220317856029916, + "grad_norm": 0.44786590337753296, + "learning_rate": 1.7883540500698148e-05, + "loss": 1.6942, + "step": 6951 + }, + { + "epoch": 0.7221356601225719, + "grad_norm": 0.40318799018859863, + "learning_rate": 1.7871036719642398e-05, + "loss": 1.4997, + "step": 6952 + }, + { + "epoch": 0.7222395346421523, + "grad_norm": 0.43783068656921387, + "learning_rate": 1.7858536360070626e-05, + "loss": 1.8249, + "step": 6953 + }, + { + "epoch": 0.7223434091617327, + "grad_norm": 0.4172214865684509, + "learning_rate": 1.784603942331402e-05, + "loss": 1.6494, + "step": 6954 + }, + { + "epoch": 0.722447283681313, + "grad_norm": 0.3986718952655792, + "learning_rate": 1.7833545910703402e-05, + "loss": 1.5129, + "step": 6955 + }, + { + "epoch": 0.7225511582008933, + "grad_norm": 0.4317796528339386, + "learning_rate": 1.7821055823569238e-05, + "loss": 1.4516, + "step": 6956 + }, + { + "epoch": 0.7226550327204737, + "grad_norm": 0.42882880568504333, + "learning_rate": 1.7808569163241605e-05, + "loss": 1.7454, + "step": 6957 + }, + { + "epoch": 0.722758907240054, + "grad_norm": 0.4157145917415619, + "learning_rate": 1.7796085931050287e-05, + "loss": 1.6144, + "step": 6958 + }, + { + "epoch": 0.7228627817596344, + "grad_norm": 0.4417478144168854, + "learning_rate": 1.778360612832458e-05, + "loss": 1.6667, + "step": 6959 + }, + { + "epoch": 0.7229666562792147, + "grad_norm": 0.47706910967826843, + "learning_rate": 1.7771129756393547e-05, + "loss": 1.7617, + "step": 6960 + }, + { + "epoch": 0.723070530798795, + "grad_norm": 0.38705945014953613, + "learning_rate": 1.77586568165858e-05, + "loss": 1.6515, + "step": 6961 + }, + { + "epoch": 0.7231744053183754, + "grad_norm": 0.42276665568351746, + "learning_rate": 1.7746187310229606e-05, + "loss": 1.7102, + "step": 6962 + }, + { + "epoch": 0.7232782798379558, + "grad_norm": 0.44928818941116333, + "learning_rate": 1.773372123865287e-05, + "loss": 1.8452, + "step": 6963 + }, + { + "epoch": 0.7233821543575361, + "grad_norm": 0.5049888491630554, + "learning_rate": 1.7721258603183144e-05, + "loss": 1.6948, + "step": 6964 + }, + { + "epoch": 0.7234860288771164, + "grad_norm": 0.4933055639266968, + "learning_rate": 1.770879940514759e-05, + "loss": 1.9625, + "step": 6965 + }, + { + "epoch": 0.7235899033966968, + "grad_norm": 0.4487113058567047, + "learning_rate": 1.7696343645873e-05, + "loss": 1.6516, + "step": 6966 + }, + { + "epoch": 0.7236937779162771, + "grad_norm": 0.4504063129425049, + "learning_rate": 1.768389132668587e-05, + "loss": 1.7148, + "step": 6967 + }, + { + "epoch": 0.7237976524358575, + "grad_norm": 0.4117870628833771, + "learning_rate": 1.7671442448912202e-05, + "loss": 1.457, + "step": 6968 + }, + { + "epoch": 0.7239015269554379, + "grad_norm": 0.4312496781349182, + "learning_rate": 1.7658997013877754e-05, + "loss": 1.5079, + "step": 6969 + }, + { + "epoch": 0.7240054014750181, + "grad_norm": 0.45666730403900146, + "learning_rate": 1.764655502290788e-05, + "loss": 1.7927, + "step": 6970 + }, + { + "epoch": 0.7241092759945985, + "grad_norm": 0.41350269317626953, + "learning_rate": 1.763411647732749e-05, + "loss": 1.6194, + "step": 6971 + }, + { + "epoch": 0.7242131505141789, + "grad_norm": 0.44214317202568054, + "learning_rate": 1.7621681378461245e-05, + "loss": 1.5284, + "step": 6972 + }, + { + "epoch": 0.7243170250337592, + "grad_norm": 0.5224699974060059, + "learning_rate": 1.7609249727633382e-05, + "loss": 1.3562, + "step": 6973 + }, + { + "epoch": 0.7244208995533395, + "grad_norm": 0.4053601622581482, + "learning_rate": 1.759682152616776e-05, + "loss": 1.6075, + "step": 6974 + }, + { + "epoch": 0.7245247740729199, + "grad_norm": 0.39831477403640747, + "learning_rate": 1.758439677538789e-05, + "loss": 1.5422, + "step": 6975 + }, + { + "epoch": 0.7246286485925003, + "grad_norm": 0.4336235225200653, + "learning_rate": 1.7571975476616897e-05, + "loss": 1.6262, + "step": 6976 + }, + { + "epoch": 0.7247325231120806, + "grad_norm": 0.4514787495136261, + "learning_rate": 1.7559557631177605e-05, + "loss": 1.6682, + "step": 6977 + }, + { + "epoch": 0.724836397631661, + "grad_norm": 0.4214664101600647, + "learning_rate": 1.754714324039234e-05, + "loss": 1.836, + "step": 6978 + }, + { + "epoch": 0.7249402721512413, + "grad_norm": 0.4808892607688904, + "learning_rate": 1.7534732305583196e-05, + "loss": 1.7584, + "step": 6979 + }, + { + "epoch": 0.7250441466708216, + "grad_norm": 0.44463905692100525, + "learning_rate": 1.7522324828071822e-05, + "loss": 1.6672, + "step": 6980 + }, + { + "epoch": 0.725148021190402, + "grad_norm": 0.4203853905200958, + "learning_rate": 1.750992080917952e-05, + "loss": 1.5338, + "step": 6981 + }, + { + "epoch": 0.7252518957099824, + "grad_norm": 0.41992905735969543, + "learning_rate": 1.7497520250227222e-05, + "loss": 1.6543, + "step": 6982 + }, + { + "epoch": 0.7253557702295627, + "grad_norm": 0.42039117217063904, + "learning_rate": 1.7485123152535488e-05, + "loss": 1.6052, + "step": 6983 + }, + { + "epoch": 0.725459644749143, + "grad_norm": 0.4649543762207031, + "learning_rate": 1.7472729517424508e-05, + "loss": 1.7914, + "step": 6984 + }, + { + "epoch": 0.7255635192687234, + "grad_norm": 0.39799681305885315, + "learning_rate": 1.7460339346214095e-05, + "loss": 1.6915, + "step": 6985 + }, + { + "epoch": 0.7256673937883037, + "grad_norm": 0.4391082227230072, + "learning_rate": 1.744795264022376e-05, + "loss": 1.7437, + "step": 6986 + }, + { + "epoch": 0.7257712683078841, + "grad_norm": 0.4607831835746765, + "learning_rate": 1.7435569400772516e-05, + "loss": 1.8705, + "step": 6987 + }, + { + "epoch": 0.7258751428274645, + "grad_norm": 0.4046463966369629, + "learning_rate": 1.742318962917914e-05, + "loss": 1.7026, + "step": 6988 + }, + { + "epoch": 0.7259790173470447, + "grad_norm": 0.4511662721633911, + "learning_rate": 1.7410813326761955e-05, + "loss": 1.7673, + "step": 6989 + }, + { + "epoch": 0.7260828918666251, + "grad_norm": 0.3973452150821686, + "learning_rate": 1.7398440494838947e-05, + "loss": 1.5597, + "step": 6990 + }, + { + "epoch": 0.7261867663862055, + "grad_norm": 0.4389094412326813, + "learning_rate": 1.7386071134727726e-05, + "loss": 1.8227, + "step": 6991 + }, + { + "epoch": 0.7262906409057858, + "grad_norm": 0.3820962905883789, + "learning_rate": 1.7373705247745532e-05, + "loss": 1.5315, + "step": 6992 + }, + { + "epoch": 0.7263945154253661, + "grad_norm": 0.4259055554866791, + "learning_rate": 1.7361342835209225e-05, + "loss": 1.598, + "step": 6993 + }, + { + "epoch": 0.7264983899449465, + "grad_norm": 0.4085586965084076, + "learning_rate": 1.7348983898435307e-05, + "loss": 1.7899, + "step": 6994 + }, + { + "epoch": 0.7266022644645268, + "grad_norm": 0.4286569654941559, + "learning_rate": 1.733662843873995e-05, + "loss": 1.7511, + "step": 6995 + }, + { + "epoch": 0.7267061389841072, + "grad_norm": 0.4160563051700592, + "learning_rate": 1.7324276457438843e-05, + "loss": 1.741, + "step": 6996 + }, + { + "epoch": 0.7268100135036876, + "grad_norm": 0.47089239954948425, + "learning_rate": 1.7311927955847428e-05, + "loss": 1.8191, + "step": 6997 + }, + { + "epoch": 0.7269138880232678, + "grad_norm": 0.41623279452323914, + "learning_rate": 1.729958293528073e-05, + "loss": 1.5197, + "step": 6998 + }, + { + "epoch": 0.7270177625428482, + "grad_norm": 0.41076546907424927, + "learning_rate": 1.728724139705334e-05, + "loss": 1.6748, + "step": 6999 + }, + { + "epoch": 0.7271216370624286, + "grad_norm": 0.4390968382358551, + "learning_rate": 1.7274903342479586e-05, + "loss": 1.6716, + "step": 7000 + }, + { + "epoch": 0.727225511582009, + "grad_norm": 0.4050026535987854, + "learning_rate": 1.726256877287335e-05, + "loss": 1.6378, + "step": 7001 + }, + { + "epoch": 0.7273293861015893, + "grad_norm": 0.41926950216293335, + "learning_rate": 1.725023768954821e-05, + "loss": 1.5495, + "step": 7002 + }, + { + "epoch": 0.7274332606211696, + "grad_norm": 0.4020102024078369, + "learning_rate": 1.7237910093817278e-05, + "loss": 1.5418, + "step": 7003 + }, + { + "epoch": 0.72753713514075, + "grad_norm": 0.42248985171318054, + "learning_rate": 1.7225585986993353e-05, + "loss": 1.5632, + "step": 7004 + }, + { + "epoch": 0.7276410096603303, + "grad_norm": 0.4645000696182251, + "learning_rate": 1.7213265370388898e-05, + "loss": 1.7221, + "step": 7005 + }, + { + "epoch": 0.7277448841799107, + "grad_norm": 0.45067256689071655, + "learning_rate": 1.7200948245315908e-05, + "loss": 1.7604, + "step": 7006 + }, + { + "epoch": 0.7278487586994911, + "grad_norm": 0.40143445134162903, + "learning_rate": 1.7188634613086103e-05, + "loss": 1.6143, + "step": 7007 + }, + { + "epoch": 0.7279526332190713, + "grad_norm": 0.37583884596824646, + "learning_rate": 1.717632447501077e-05, + "loss": 1.5589, + "step": 7008 + }, + { + "epoch": 0.7280565077386517, + "grad_norm": 0.4202621877193451, + "learning_rate": 1.7164017832400846e-05, + "loss": 1.5601, + "step": 7009 + }, + { + "epoch": 0.7281603822582321, + "grad_norm": 0.39898592233657837, + "learning_rate": 1.71517146865669e-05, + "loss": 1.6215, + "step": 7010 + }, + { + "epoch": 0.7282642567778124, + "grad_norm": 0.4259057641029358, + "learning_rate": 1.7139415038819108e-05, + "loss": 1.8012, + "step": 7011 + }, + { + "epoch": 0.7283681312973928, + "grad_norm": 0.3842464089393616, + "learning_rate": 1.7127118890467292e-05, + "loss": 1.6024, + "step": 7012 + }, + { + "epoch": 0.7284720058169731, + "grad_norm": 0.3995085060596466, + "learning_rate": 1.7114826242820875e-05, + "loss": 1.6076, + "step": 7013 + }, + { + "epoch": 0.7285758803365534, + "grad_norm": 0.37243375182151794, + "learning_rate": 1.7102537097188986e-05, + "loss": 1.5847, + "step": 7014 + }, + { + "epoch": 0.7286797548561338, + "grad_norm": 0.46955230832099915, + "learning_rate": 1.7090251454880258e-05, + "loss": 1.8128, + "step": 7015 + }, + { + "epoch": 0.7287836293757142, + "grad_norm": 0.40310418605804443, + "learning_rate": 1.707796931720305e-05, + "loss": 1.5522, + "step": 7016 + }, + { + "epoch": 0.7288875038952944, + "grad_norm": 0.4547189176082611, + "learning_rate": 1.706569068546532e-05, + "loss": 1.8937, + "step": 7017 + }, + { + "epoch": 0.7289913784148748, + "grad_norm": 0.41194766759872437, + "learning_rate": 1.7053415560974623e-05, + "loss": 1.6175, + "step": 7018 + }, + { + "epoch": 0.7290952529344552, + "grad_norm": 0.4527081847190857, + "learning_rate": 1.7041143945038186e-05, + "loss": 1.7127, + "step": 7019 + }, + { + "epoch": 0.7291991274540355, + "grad_norm": 0.45364293456077576, + "learning_rate": 1.702887583896282e-05, + "loss": 1.822, + "step": 7020 + }, + { + "epoch": 0.7293030019736159, + "grad_norm": 0.4415326714515686, + "learning_rate": 1.7016611244055e-05, + "loss": 1.7217, + "step": 7021 + }, + { + "epoch": 0.7294068764931962, + "grad_norm": 0.4117661714553833, + "learning_rate": 1.700435016162078e-05, + "loss": 1.7666, + "step": 7022 + }, + { + "epoch": 0.7295107510127765, + "grad_norm": 0.41793352365493774, + "learning_rate": 1.699209259296593e-05, + "loss": 1.6506, + "step": 7023 + }, + { + "epoch": 0.7296146255323569, + "grad_norm": 0.41876649856567383, + "learning_rate": 1.697983853939571e-05, + "loss": 1.6463, + "step": 7024 + }, + { + "epoch": 0.7297185000519373, + "grad_norm": 0.46522414684295654, + "learning_rate": 1.6967588002215134e-05, + "loss": 1.6875, + "step": 7025 + }, + { + "epoch": 0.7298223745715176, + "grad_norm": 0.40117374062538147, + "learning_rate": 1.6955340982728797e-05, + "loss": 1.6181, + "step": 7026 + }, + { + "epoch": 0.7299262490910979, + "grad_norm": 0.4233303368091583, + "learning_rate": 1.694309748224085e-05, + "loss": 1.7707, + "step": 7027 + }, + { + "epoch": 0.7300301236106783, + "grad_norm": 0.42827704548835754, + "learning_rate": 1.693085750205518e-05, + "loss": 1.6969, + "step": 7028 + }, + { + "epoch": 0.7301339981302587, + "grad_norm": 0.42237424850463867, + "learning_rate": 1.6918621043475225e-05, + "loss": 1.5153, + "step": 7029 + }, + { + "epoch": 0.730237872649839, + "grad_norm": 0.4134870171546936, + "learning_rate": 1.6906388107804118e-05, + "loss": 1.5392, + "step": 7030 + }, + { + "epoch": 0.7303417471694194, + "grad_norm": 0.42920297384262085, + "learning_rate": 1.6894158696344515e-05, + "loss": 1.7474, + "step": 7031 + }, + { + "epoch": 0.7304456216889997, + "grad_norm": 0.401040643453598, + "learning_rate": 1.6881932810398765e-05, + "loss": 1.5063, + "step": 7032 + }, + { + "epoch": 0.73054949620858, + "grad_norm": 0.4182683229446411, + "learning_rate": 1.686971045126887e-05, + "loss": 1.5769, + "step": 7033 + }, + { + "epoch": 0.7306533707281604, + "grad_norm": 0.4622383713722229, + "learning_rate": 1.6857491620256353e-05, + "loss": 1.628, + "step": 7034 + }, + { + "epoch": 0.7307572452477408, + "grad_norm": 0.5033509135246277, + "learning_rate": 1.6845276318662472e-05, + "loss": 1.8296, + "step": 7035 + }, + { + "epoch": 0.730861119767321, + "grad_norm": 0.3891574442386627, + "learning_rate": 1.6833064547788053e-05, + "loss": 1.5515, + "step": 7036 + }, + { + "epoch": 0.7309649942869014, + "grad_norm": 0.44622090458869934, + "learning_rate": 1.6820856308933543e-05, + "loss": 1.7212, + "step": 7037 + }, + { + "epoch": 0.7310688688064818, + "grad_norm": 0.4310351014137268, + "learning_rate": 1.6808651603399024e-05, + "loss": 1.5695, + "step": 7038 + }, + { + "epoch": 0.7311727433260621, + "grad_norm": 0.4409294128417969, + "learning_rate": 1.6796450432484212e-05, + "loss": 1.6526, + "step": 7039 + }, + { + "epoch": 0.7312766178456425, + "grad_norm": 0.39524850249290466, + "learning_rate": 1.678425279748843e-05, + "loss": 1.5189, + "step": 7040 + }, + { + "epoch": 0.7313804923652228, + "grad_norm": 0.42522087693214417, + "learning_rate": 1.6772058699710618e-05, + "loss": 1.6336, + "step": 7041 + }, + { + "epoch": 0.7314843668848031, + "grad_norm": 0.4241730868816376, + "learning_rate": 1.6759868140449397e-05, + "loss": 1.5303, + "step": 7042 + }, + { + "epoch": 0.7315882414043835, + "grad_norm": 0.41088756918907166, + "learning_rate": 1.6747681121002902e-05, + "loss": 1.6328, + "step": 7043 + }, + { + "epoch": 0.7316921159239639, + "grad_norm": 0.43897873163223267, + "learning_rate": 1.6735497642669008e-05, + "loss": 1.7547, + "step": 7044 + }, + { + "epoch": 0.7317959904435442, + "grad_norm": 0.42431479692459106, + "learning_rate": 1.6723317706745144e-05, + "loss": 1.6887, + "step": 7045 + }, + { + "epoch": 0.7318998649631245, + "grad_norm": 0.3968948423862457, + "learning_rate": 1.6711141314528374e-05, + "loss": 1.4962, + "step": 7046 + }, + { + "epoch": 0.7320037394827049, + "grad_norm": 0.4105996787548065, + "learning_rate": 1.669896846731539e-05, + "loss": 1.6409, + "step": 7047 + }, + { + "epoch": 0.7321076140022852, + "grad_norm": 0.40278342366218567, + "learning_rate": 1.6686799166402507e-05, + "loss": 1.5295, + "step": 7048 + }, + { + "epoch": 0.7322114885218656, + "grad_norm": 0.3999069929122925, + "learning_rate": 1.667463341308566e-05, + "loss": 1.5359, + "step": 7049 + }, + { + "epoch": 0.732315363041446, + "grad_norm": 0.3840470016002655, + "learning_rate": 1.666247120866039e-05, + "loss": 1.5948, + "step": 7050 + }, + { + "epoch": 0.7324192375610262, + "grad_norm": 0.40649813413619995, + "learning_rate": 1.6650312554421928e-05, + "loss": 1.4639, + "step": 7051 + }, + { + "epoch": 0.7325231120806066, + "grad_norm": 0.46434590220451355, + "learning_rate": 1.663815745166502e-05, + "loss": 1.8214, + "step": 7052 + }, + { + "epoch": 0.732626986600187, + "grad_norm": 0.434417724609375, + "learning_rate": 1.66260059016841e-05, + "loss": 1.5394, + "step": 7053 + }, + { + "epoch": 0.7327308611197674, + "grad_norm": 0.39939597249031067, + "learning_rate": 1.6613857905773238e-05, + "loss": 1.5732, + "step": 7054 + }, + { + "epoch": 0.7328347356393476, + "grad_norm": 0.39410555362701416, + "learning_rate": 1.6601713465226086e-05, + "loss": 1.6296, + "step": 7055 + }, + { + "epoch": 0.732938610158928, + "grad_norm": 0.4101269245147705, + "learning_rate": 1.6589572581335938e-05, + "loss": 1.6349, + "step": 7056 + }, + { + "epoch": 0.7330424846785084, + "grad_norm": 0.4318089485168457, + "learning_rate": 1.6577435255395677e-05, + "loss": 1.6986, + "step": 7057 + }, + { + "epoch": 0.7331463591980887, + "grad_norm": 0.4095480442047119, + "learning_rate": 1.6565301488697893e-05, + "loss": 1.7157, + "step": 7058 + }, + { + "epoch": 0.7332502337176691, + "grad_norm": 0.4215502440929413, + "learning_rate": 1.655317128253468e-05, + "loss": 1.587, + "step": 7059 + }, + { + "epoch": 0.7333541082372494, + "grad_norm": 0.472887247800827, + "learning_rate": 1.6541044638197818e-05, + "loss": 1.7887, + "step": 7060 + }, + { + "epoch": 0.7334579827568297, + "grad_norm": 0.4307557940483093, + "learning_rate": 1.6528921556978737e-05, + "loss": 1.7428, + "step": 7061 + }, + { + "epoch": 0.7335618572764101, + "grad_norm": 0.4611065685749054, + "learning_rate": 1.65168020401684e-05, + "loss": 1.79, + "step": 7062 + }, + { + "epoch": 0.7336657317959905, + "grad_norm": 0.45637285709381104, + "learning_rate": 1.650468608905748e-05, + "loss": 1.8929, + "step": 7063 + }, + { + "epoch": 0.7337696063155708, + "grad_norm": 0.41570550203323364, + "learning_rate": 1.649257370493622e-05, + "loss": 1.7476, + "step": 7064 + }, + { + "epoch": 0.7338734808351511, + "grad_norm": 0.40172329545021057, + "learning_rate": 1.6480464889094493e-05, + "loss": 1.762, + "step": 7065 + }, + { + "epoch": 0.7339773553547315, + "grad_norm": 0.4335883557796478, + "learning_rate": 1.646835964282179e-05, + "loss": 1.5714, + "step": 7066 + }, + { + "epoch": 0.7340812298743118, + "grad_norm": 0.4139271676540375, + "learning_rate": 1.6456257967407228e-05, + "loss": 1.6522, + "step": 7067 + }, + { + "epoch": 0.7341851043938922, + "grad_norm": 0.4054241478443146, + "learning_rate": 1.6444159864139542e-05, + "loss": 1.7425, + "step": 7068 + }, + { + "epoch": 0.7342889789134726, + "grad_norm": 0.4233381450176239, + "learning_rate": 1.6432065334307074e-05, + "loss": 1.6434, + "step": 7069 + }, + { + "epoch": 0.7343928534330528, + "grad_norm": 0.4486279785633087, + "learning_rate": 1.6419974379197834e-05, + "loss": 1.7726, + "step": 7070 + }, + { + "epoch": 0.7344967279526332, + "grad_norm": 0.4719793200492859, + "learning_rate": 1.6407887000099354e-05, + "loss": 1.5734, + "step": 7071 + }, + { + "epoch": 0.7346006024722136, + "grad_norm": 0.4241810142993927, + "learning_rate": 1.6395803198298905e-05, + "loss": 1.7336, + "step": 7072 + }, + { + "epoch": 0.7347044769917939, + "grad_norm": 0.40580904483795166, + "learning_rate": 1.6383722975083282e-05, + "loss": 1.6019, + "step": 7073 + }, + { + "epoch": 0.7348083515113742, + "grad_norm": 0.46320223808288574, + "learning_rate": 1.6371646331738953e-05, + "loss": 1.7766, + "step": 7074 + }, + { + "epoch": 0.7349122260309546, + "grad_norm": 0.41690146923065186, + "learning_rate": 1.6359573269551977e-05, + "loss": 1.6961, + "step": 7075 + }, + { + "epoch": 0.7350161005505349, + "grad_norm": 0.4090801775455475, + "learning_rate": 1.634750378980804e-05, + "loss": 1.6388, + "step": 7076 + }, + { + "epoch": 0.7351199750701153, + "grad_norm": 0.3977707624435425, + "learning_rate": 1.633543789379245e-05, + "loss": 1.6002, + "step": 7077 + }, + { + "epoch": 0.7352238495896957, + "grad_norm": 0.4486248195171356, + "learning_rate": 1.632337558279011e-05, + "loss": 1.7919, + "step": 7078 + }, + { + "epoch": 0.735327724109276, + "grad_norm": 0.41269683837890625, + "learning_rate": 1.6311316858085607e-05, + "loss": 1.5934, + "step": 7079 + }, + { + "epoch": 0.7354315986288563, + "grad_norm": 0.4171505570411682, + "learning_rate": 1.6299261720963093e-05, + "loss": 1.6255, + "step": 7080 + }, + { + "epoch": 0.7355354731484367, + "grad_norm": 0.406341552734375, + "learning_rate": 1.6287210172706297e-05, + "loss": 1.7223, + "step": 7081 + }, + { + "epoch": 0.7356393476680171, + "grad_norm": 0.43116459250450134, + "learning_rate": 1.627516221459866e-05, + "loss": 1.6397, + "step": 7082 + }, + { + "epoch": 0.7357432221875974, + "grad_norm": 0.42022863030433655, + "learning_rate": 1.6263117847923182e-05, + "loss": 1.7121, + "step": 7083 + }, + { + "epoch": 0.7358470967071777, + "grad_norm": 0.4364830553531647, + "learning_rate": 1.62510770739625e-05, + "loss": 1.5572, + "step": 7084 + }, + { + "epoch": 0.7359509712267581, + "grad_norm": 0.4395456910133362, + "learning_rate": 1.6239039893998842e-05, + "loss": 1.6877, + "step": 7085 + }, + { + "epoch": 0.7360548457463384, + "grad_norm": 0.499352365732193, + "learning_rate": 1.622700630931412e-05, + "loss": 1.4045, + "step": 7086 + }, + { + "epoch": 0.7361587202659188, + "grad_norm": 0.41505447030067444, + "learning_rate": 1.6214976321189768e-05, + "loss": 1.6824, + "step": 7087 + }, + { + "epoch": 0.7362625947854992, + "grad_norm": 0.40224704146385193, + "learning_rate": 1.6202949930906893e-05, + "loss": 1.6501, + "step": 7088 + }, + { + "epoch": 0.7363664693050794, + "grad_norm": 0.4343310296535492, + "learning_rate": 1.6190927139746255e-05, + "loss": 1.7141, + "step": 7089 + }, + { + "epoch": 0.7364703438246598, + "grad_norm": 0.4633185565471649, + "learning_rate": 1.617890794898812e-05, + "loss": 1.8566, + "step": 7090 + }, + { + "epoch": 0.7365742183442402, + "grad_norm": 0.4461994469165802, + "learning_rate": 1.616689235991249e-05, + "loss": 1.6646, + "step": 7091 + }, + { + "epoch": 0.7366780928638205, + "grad_norm": 0.4115038812160492, + "learning_rate": 1.6154880373798918e-05, + "loss": 1.6597, + "step": 7092 + }, + { + "epoch": 0.7367819673834008, + "grad_norm": 0.47499844431877136, + "learning_rate": 1.6142871991926584e-05, + "loss": 1.8035, + "step": 7093 + }, + { + "epoch": 0.7368858419029812, + "grad_norm": 0.4350105822086334, + "learning_rate": 1.6130867215574284e-05, + "loss": 1.446, + "step": 7094 + }, + { + "epoch": 0.7369897164225615, + "grad_norm": 0.39589881896972656, + "learning_rate": 1.6118866046020438e-05, + "loss": 1.5551, + "step": 7095 + }, + { + "epoch": 0.7370935909421419, + "grad_norm": 0.458922415971756, + "learning_rate": 1.6106868484543076e-05, + "loss": 1.8941, + "step": 7096 + }, + { + "epoch": 0.7371974654617223, + "grad_norm": 0.48349758982658386, + "learning_rate": 1.6094874532419828e-05, + "loss": 1.6175, + "step": 7097 + }, + { + "epoch": 0.7373013399813025, + "grad_norm": 0.43464893102645874, + "learning_rate": 1.6082884190928004e-05, + "loss": 1.7482, + "step": 7098 + }, + { + "epoch": 0.7374052145008829, + "grad_norm": 0.43286725878715515, + "learning_rate": 1.6070897461344413e-05, + "loss": 1.7566, + "step": 7099 + }, + { + "epoch": 0.7375090890204633, + "grad_norm": 0.3972510099411011, + "learning_rate": 1.6058914344945607e-05, + "loss": 1.576, + "step": 7100 + }, + { + "epoch": 0.7376129635400436, + "grad_norm": 0.44797611236572266, + "learning_rate": 1.604693484300767e-05, + "loss": 1.6982, + "step": 7101 + }, + { + "epoch": 0.737716838059624, + "grad_norm": 0.4418253004550934, + "learning_rate": 1.6034958956806333e-05, + "loss": 1.8107, + "step": 7102 + }, + { + "epoch": 0.7378207125792043, + "grad_norm": 0.40204495191574097, + "learning_rate": 1.6022986687616926e-05, + "loss": 1.5234, + "step": 7103 + }, + { + "epoch": 0.7379245870987847, + "grad_norm": 0.407816082239151, + "learning_rate": 1.6011018036714388e-05, + "loss": 1.7615, + "step": 7104 + }, + { + "epoch": 0.738028461618365, + "grad_norm": 0.4105852544307709, + "learning_rate": 1.599905300537334e-05, + "loss": 1.7623, + "step": 7105 + }, + { + "epoch": 0.7381323361379454, + "grad_norm": 0.4015292525291443, + "learning_rate": 1.598709159486789e-05, + "loss": 1.7175, + "step": 7106 + }, + { + "epoch": 0.7382362106575258, + "grad_norm": 0.4852696359157562, + "learning_rate": 1.5975133806471903e-05, + "loss": 1.8187, + "step": 7107 + }, + { + "epoch": 0.738340085177106, + "grad_norm": 0.4263093173503876, + "learning_rate": 1.596317964145877e-05, + "loss": 1.6612, + "step": 7108 + }, + { + "epoch": 0.7384439596966864, + "grad_norm": 0.43212834000587463, + "learning_rate": 1.5951229101101473e-05, + "loss": 1.6154, + "step": 7109 + }, + { + "epoch": 0.7385478342162668, + "grad_norm": 0.4184805452823639, + "learning_rate": 1.5939282186672705e-05, + "loss": 1.706, + "step": 7110 + }, + { + "epoch": 0.7386517087358471, + "grad_norm": 0.42199963331222534, + "learning_rate": 1.5927338899444706e-05, + "loss": 1.6324, + "step": 7111 + }, + { + "epoch": 0.7387555832554275, + "grad_norm": 0.44483646750450134, + "learning_rate": 1.5915399240689325e-05, + "loss": 1.6332, + "step": 7112 + }, + { + "epoch": 0.7388594577750078, + "grad_norm": 0.43318289518356323, + "learning_rate": 1.5903463211678067e-05, + "loss": 1.6949, + "step": 7113 + }, + { + "epoch": 0.7389633322945881, + "grad_norm": 0.5184254050254822, + "learning_rate": 1.5891530813682008e-05, + "loss": 1.6298, + "step": 7114 + }, + { + "epoch": 0.7390672068141685, + "grad_norm": 0.4334104657173157, + "learning_rate": 1.5879602047971865e-05, + "loss": 1.4948, + "step": 7115 + }, + { + "epoch": 0.7391710813337489, + "grad_norm": 0.39444056153297424, + "learning_rate": 1.586767691581793e-05, + "loss": 1.607, + "step": 7116 + }, + { + "epoch": 0.7392749558533291, + "grad_norm": 0.3886300027370453, + "learning_rate": 1.58557554184902e-05, + "loss": 1.5812, + "step": 7117 + }, + { + "epoch": 0.7393788303729095, + "grad_norm": 0.43382641673088074, + "learning_rate": 1.584383755725815e-05, + "loss": 1.6636, + "step": 7118 + }, + { + "epoch": 0.7394827048924899, + "grad_norm": 0.40782594680786133, + "learning_rate": 1.5831923333390985e-05, + "loss": 1.4104, + "step": 7119 + }, + { + "epoch": 0.7395865794120702, + "grad_norm": 0.3772667348384857, + "learning_rate": 1.5820012748157464e-05, + "loss": 1.4713, + "step": 7120 + }, + { + "epoch": 0.7396904539316506, + "grad_norm": 0.42232704162597656, + "learning_rate": 1.5808105802825975e-05, + "loss": 1.3777, + "step": 7121 + }, + { + "epoch": 0.7397943284512309, + "grad_norm": 0.430355042219162, + "learning_rate": 1.579620249866451e-05, + "loss": 1.6707, + "step": 7122 + }, + { + "epoch": 0.7398982029708112, + "grad_norm": 0.40983816981315613, + "learning_rate": 1.5784302836940678e-05, + "loss": 1.4956, + "step": 7123 + }, + { + "epoch": 0.7400020774903916, + "grad_norm": 0.4761542081832886, + "learning_rate": 1.5772406818921697e-05, + "loss": 1.7082, + "step": 7124 + }, + { + "epoch": 0.740105952009972, + "grad_norm": 0.4703845977783203, + "learning_rate": 1.576051444587439e-05, + "loss": 1.7969, + "step": 7125 + }, + { + "epoch": 0.7402098265295523, + "grad_norm": 0.40209317207336426, + "learning_rate": 1.574862571906525e-05, + "loss": 1.7028, + "step": 7126 + }, + { + "epoch": 0.7403137010491326, + "grad_norm": 0.39198899269104004, + "learning_rate": 1.5736740639760272e-05, + "loss": 1.6769, + "step": 7127 + }, + { + "epoch": 0.740417575568713, + "grad_norm": 0.3915342390537262, + "learning_rate": 1.5724859209225168e-05, + "loss": 1.5418, + "step": 7128 + }, + { + "epoch": 0.7405214500882933, + "grad_norm": 0.4316711127758026, + "learning_rate": 1.5712981428725192e-05, + "loss": 1.5925, + "step": 7129 + }, + { + "epoch": 0.7406253246078737, + "grad_norm": 0.41193971037864685, + "learning_rate": 1.5701107299525253e-05, + "loss": 1.5975, + "step": 7130 + }, + { + "epoch": 0.740729199127454, + "grad_norm": 0.4072656035423279, + "learning_rate": 1.5689236822889846e-05, + "loss": 1.8195, + "step": 7131 + }, + { + "epoch": 0.7408330736470344, + "grad_norm": 0.40560346841812134, + "learning_rate": 1.5677370000083065e-05, + "loss": 1.4801, + "step": 7132 + }, + { + "epoch": 0.7409369481666147, + "grad_norm": 0.4122333824634552, + "learning_rate": 1.566550683236868e-05, + "loss": 1.4249, + "step": 7133 + }, + { + "epoch": 0.7410408226861951, + "grad_norm": 0.4269520938396454, + "learning_rate": 1.5653647321009973e-05, + "loss": 1.8544, + "step": 7134 + }, + { + "epoch": 0.7411446972057755, + "grad_norm": 0.4235425293445587, + "learning_rate": 1.5641791467269922e-05, + "loss": 1.5443, + "step": 7135 + }, + { + "epoch": 0.7412485717253557, + "grad_norm": 0.49636656045913696, + "learning_rate": 1.56299392724111e-05, + "loss": 1.6799, + "step": 7136 + }, + { + "epoch": 0.7413524462449361, + "grad_norm": 0.40569061040878296, + "learning_rate": 1.561809073769561e-05, + "loss": 1.5797, + "step": 7137 + }, + { + "epoch": 0.7414563207645165, + "grad_norm": 0.42479363083839417, + "learning_rate": 1.5606245864385287e-05, + "loss": 1.6916, + "step": 7138 + }, + { + "epoch": 0.7415601952840968, + "grad_norm": 0.4160860776901245, + "learning_rate": 1.5594404653741494e-05, + "loss": 1.4902, + "step": 7139 + }, + { + "epoch": 0.7416640698036772, + "grad_norm": 0.4242292642593384, + "learning_rate": 1.5582567107025237e-05, + "loss": 1.6821, + "step": 7140 + }, + { + "epoch": 0.7417679443232575, + "grad_norm": 0.4416872262954712, + "learning_rate": 1.5570733225497115e-05, + "loss": 1.6489, + "step": 7141 + }, + { + "epoch": 0.7418718188428378, + "grad_norm": 0.4447227418422699, + "learning_rate": 1.5558903010417346e-05, + "loss": 1.766, + "step": 7142 + }, + { + "epoch": 0.7419756933624182, + "grad_norm": 0.4173499047756195, + "learning_rate": 1.5547076463045756e-05, + "loss": 1.6973, + "step": 7143 + }, + { + "epoch": 0.7420795678819986, + "grad_norm": 0.4345002770423889, + "learning_rate": 1.5535253584641762e-05, + "loss": 1.7257, + "step": 7144 + }, + { + "epoch": 0.7421834424015789, + "grad_norm": 0.45821139216423035, + "learning_rate": 1.552343437646446e-05, + "loss": 1.7526, + "step": 7145 + }, + { + "epoch": 0.7422873169211592, + "grad_norm": 0.391783744096756, + "learning_rate": 1.5511618839772437e-05, + "loss": 1.6038, + "step": 7146 + }, + { + "epoch": 0.7423911914407396, + "grad_norm": 0.43559110164642334, + "learning_rate": 1.5499806975824e-05, + "loss": 1.6295, + "step": 7147 + }, + { + "epoch": 0.7424950659603199, + "grad_norm": 0.43846607208251953, + "learning_rate": 1.5487998785877005e-05, + "loss": 1.6083, + "step": 7148 + }, + { + "epoch": 0.7425989404799003, + "grad_norm": 0.43840745091438293, + "learning_rate": 1.5476194271188944e-05, + "loss": 1.6872, + "step": 7149 + }, + { + "epoch": 0.7427028149994807, + "grad_norm": 0.44359713792800903, + "learning_rate": 1.5464393433016895e-05, + "loss": 1.6835, + "step": 7150 + }, + { + "epoch": 0.7428066895190609, + "grad_norm": 0.46567508578300476, + "learning_rate": 1.5452596272617555e-05, + "loss": 1.6991, + "step": 7151 + }, + { + "epoch": 0.7429105640386413, + "grad_norm": 0.4170341491699219, + "learning_rate": 1.5440802791247234e-05, + "loss": 1.5401, + "step": 7152 + }, + { + "epoch": 0.7430144385582217, + "grad_norm": 0.40833204984664917, + "learning_rate": 1.5429012990161827e-05, + "loss": 1.604, + "step": 7153 + }, + { + "epoch": 0.743118313077802, + "grad_norm": 0.4111480712890625, + "learning_rate": 1.5417226870616902e-05, + "loss": 1.6432, + "step": 7154 + }, + { + "epoch": 0.7432221875973823, + "grad_norm": 0.41619327664375305, + "learning_rate": 1.540544443386753e-05, + "loss": 1.5474, + "step": 7155 + }, + { + "epoch": 0.7433260621169627, + "grad_norm": 0.41143396496772766, + "learning_rate": 1.5393665681168496e-05, + "loss": 1.8406, + "step": 7156 + }, + { + "epoch": 0.7434299366365431, + "grad_norm": 0.40822920203208923, + "learning_rate": 1.5381890613774126e-05, + "loss": 1.589, + "step": 7157 + }, + { + "epoch": 0.7435338111561234, + "grad_norm": 0.39305150508880615, + "learning_rate": 1.537011923293837e-05, + "loss": 1.4567, + "step": 7158 + }, + { + "epoch": 0.7436376856757038, + "grad_norm": 0.40565574169158936, + "learning_rate": 1.5358351539914794e-05, + "loss": 1.5574, + "step": 7159 + }, + { + "epoch": 0.7437415601952841, + "grad_norm": 0.47624900937080383, + "learning_rate": 1.5346587535956547e-05, + "loss": 1.7159, + "step": 7160 + }, + { + "epoch": 0.7438454347148644, + "grad_norm": 0.41582754254341125, + "learning_rate": 1.5334827222316457e-05, + "loss": 1.6007, + "step": 7161 + }, + { + "epoch": 0.7439493092344448, + "grad_norm": 0.42031049728393555, + "learning_rate": 1.5323070600246832e-05, + "loss": 1.7471, + "step": 7162 + }, + { + "epoch": 0.7440531837540252, + "grad_norm": 0.43765780329704285, + "learning_rate": 1.5311317670999714e-05, + "loss": 1.6716, + "step": 7163 + }, + { + "epoch": 0.7441570582736055, + "grad_norm": 0.46914222836494446, + "learning_rate": 1.52995684358267e-05, + "loss": 1.7037, + "step": 7164 + }, + { + "epoch": 0.7442609327931858, + "grad_norm": 0.43838274478912354, + "learning_rate": 1.5287822895978942e-05, + "loss": 1.6881, + "step": 7165 + }, + { + "epoch": 0.7443648073127662, + "grad_norm": 0.43214210867881775, + "learning_rate": 1.52760810527073e-05, + "loss": 1.7643, + "step": 7166 + }, + { + "epoch": 0.7444686818323465, + "grad_norm": 0.43604573607444763, + "learning_rate": 1.5264342907262164e-05, + "loss": 1.6576, + "step": 7167 + }, + { + "epoch": 0.7445725563519269, + "grad_norm": 0.4425446391105652, + "learning_rate": 1.5252608460893558e-05, + "loss": 1.7985, + "step": 7168 + }, + { + "epoch": 0.7446764308715073, + "grad_norm": 0.42904865741729736, + "learning_rate": 1.5240877714851115e-05, + "loss": 1.6071, + "step": 7169 + }, + { + "epoch": 0.7447803053910875, + "grad_norm": 0.4095899164676666, + "learning_rate": 1.5229150670384056e-05, + "loss": 1.6072, + "step": 7170 + }, + { + "epoch": 0.7448841799106679, + "grad_norm": 0.4281744360923767, + "learning_rate": 1.5217427328741235e-05, + "loss": 1.642, + "step": 7171 + }, + { + "epoch": 0.7449880544302483, + "grad_norm": 0.4427133798599243, + "learning_rate": 1.5205707691171074e-05, + "loss": 1.7441, + "step": 7172 + }, + { + "epoch": 0.7450919289498286, + "grad_norm": 0.4026220142841339, + "learning_rate": 1.5193991758921666e-05, + "loss": 1.7042, + "step": 7173 + }, + { + "epoch": 0.745195803469409, + "grad_norm": 0.39205819368362427, + "learning_rate": 1.5182279533240606e-05, + "loss": 1.4862, + "step": 7174 + }, + { + "epoch": 0.7452996779889893, + "grad_norm": 0.45115184783935547, + "learning_rate": 1.5170571015375207e-05, + "loss": 1.8971, + "step": 7175 + }, + { + "epoch": 0.7454035525085696, + "grad_norm": 0.4259231686592102, + "learning_rate": 1.5158866206572313e-05, + "loss": 1.5441, + "step": 7176 + }, + { + "epoch": 0.74550742702815, + "grad_norm": 0.39843812584877014, + "learning_rate": 1.5147165108078398e-05, + "loss": 1.7365, + "step": 7177 + }, + { + "epoch": 0.7456113015477304, + "grad_norm": 0.3978004455566406, + "learning_rate": 1.5135467721139535e-05, + "loss": 1.5903, + "step": 7178 + }, + { + "epoch": 0.7457151760673106, + "grad_norm": 0.43081727623939514, + "learning_rate": 1.5123774047001404e-05, + "loss": 1.6461, + "step": 7179 + }, + { + "epoch": 0.745819050586891, + "grad_norm": 0.41507482528686523, + "learning_rate": 1.5112084086909296e-05, + "loss": 1.5398, + "step": 7180 + }, + { + "epoch": 0.7459229251064714, + "grad_norm": 0.4333358108997345, + "learning_rate": 1.5100397842108078e-05, + "loss": 1.612, + "step": 7181 + }, + { + "epoch": 0.7460267996260518, + "grad_norm": 0.4127698540687561, + "learning_rate": 1.5088715313842278e-05, + "loss": 1.5695, + "step": 7182 + }, + { + "epoch": 0.7461306741456321, + "grad_norm": 0.4184444546699524, + "learning_rate": 1.507703650335598e-05, + "loss": 1.6183, + "step": 7183 + }, + { + "epoch": 0.7462345486652124, + "grad_norm": 0.4407637417316437, + "learning_rate": 1.506536141189288e-05, + "loss": 1.6235, + "step": 7184 + }, + { + "epoch": 0.7463384231847928, + "grad_norm": 0.40334874391555786, + "learning_rate": 1.5053690040696295e-05, + "loss": 1.6717, + "step": 7185 + }, + { + "epoch": 0.7464422977043731, + "grad_norm": 0.40933892130851746, + "learning_rate": 1.504202239100913e-05, + "loss": 1.5891, + "step": 7186 + }, + { + "epoch": 0.7465461722239535, + "grad_norm": 0.41984155774116516, + "learning_rate": 1.5030358464073891e-05, + "loss": 1.6755, + "step": 7187 + }, + { + "epoch": 0.7466500467435339, + "grad_norm": 0.44677481055259705, + "learning_rate": 1.501869826113269e-05, + "loss": 1.6644, + "step": 7188 + }, + { + "epoch": 0.7467539212631141, + "grad_norm": 0.43033236265182495, + "learning_rate": 1.5007041783427294e-05, + "loss": 1.6723, + "step": 7189 + }, + { + "epoch": 0.7468577957826945, + "grad_norm": 0.4469008445739746, + "learning_rate": 1.4995389032198958e-05, + "loss": 1.7799, + "step": 7190 + }, + { + "epoch": 0.7469616703022749, + "grad_norm": 0.44168853759765625, + "learning_rate": 1.4983740008688657e-05, + "loss": 1.6653, + "step": 7191 + }, + { + "epoch": 0.7470655448218552, + "grad_norm": 0.41871321201324463, + "learning_rate": 1.4972094714136926e-05, + "loss": 1.5746, + "step": 7192 + }, + { + "epoch": 0.7471694193414355, + "grad_norm": 0.411448210477829, + "learning_rate": 1.4960453149783848e-05, + "loss": 1.5334, + "step": 7193 + }, + { + "epoch": 0.7472732938610159, + "grad_norm": 0.4211091101169586, + "learning_rate": 1.4948815316869207e-05, + "loss": 1.7637, + "step": 7194 + }, + { + "epoch": 0.7473771683805962, + "grad_norm": 0.46923527121543884, + "learning_rate": 1.4937181216632323e-05, + "loss": 1.7789, + "step": 7195 + }, + { + "epoch": 0.7474810429001766, + "grad_norm": 0.4714355766773224, + "learning_rate": 1.4925550850312142e-05, + "loss": 1.7327, + "step": 7196 + }, + { + "epoch": 0.747584917419757, + "grad_norm": 0.39055758714675903, + "learning_rate": 1.4913924219147202e-05, + "loss": 1.6193, + "step": 7197 + }, + { + "epoch": 0.7476887919393372, + "grad_norm": 0.4228423833847046, + "learning_rate": 1.490230132437565e-05, + "loss": 1.75, + "step": 7198 + }, + { + "epoch": 0.7477926664589176, + "grad_norm": 0.42969369888305664, + "learning_rate": 1.4890682167235238e-05, + "loss": 1.5587, + "step": 7199 + }, + { + "epoch": 0.747896540978498, + "grad_norm": 0.42206692695617676, + "learning_rate": 1.4879066748963295e-05, + "loss": 1.7437, + "step": 7200 + }, + { + "epoch": 0.7480004154980783, + "grad_norm": 0.4134342670440674, + "learning_rate": 1.4867455070796816e-05, + "loss": 1.6074, + "step": 7201 + }, + { + "epoch": 0.7481042900176587, + "grad_norm": 0.4012831747531891, + "learning_rate": 1.48558471339723e-05, + "loss": 1.5822, + "step": 7202 + }, + { + "epoch": 0.748208164537239, + "grad_norm": 0.42914196848869324, + "learning_rate": 1.4844242939725944e-05, + "loss": 1.6097, + "step": 7203 + }, + { + "epoch": 0.7483120390568193, + "grad_norm": 0.43084225058555603, + "learning_rate": 1.483264248929349e-05, + "loss": 1.7464, + "step": 7204 + }, + { + "epoch": 0.7484159135763997, + "grad_norm": 0.4075019657611847, + "learning_rate": 1.4821045783910298e-05, + "loss": 1.6601, + "step": 7205 + }, + { + "epoch": 0.7485197880959801, + "grad_norm": 0.46284353733062744, + "learning_rate": 1.480945282481132e-05, + "loss": 1.9072, + "step": 7206 + }, + { + "epoch": 0.7486236626155603, + "grad_norm": 0.4372515380382538, + "learning_rate": 1.4797863613231122e-05, + "loss": 1.7274, + "step": 7207 + }, + { + "epoch": 0.7487275371351407, + "grad_norm": 0.3990016281604767, + "learning_rate": 1.478627815040386e-05, + "loss": 1.5798, + "step": 7208 + }, + { + "epoch": 0.7488314116547211, + "grad_norm": 0.3860011696815491, + "learning_rate": 1.4774696437563278e-05, + "loss": 1.5999, + "step": 7209 + }, + { + "epoch": 0.7489352861743015, + "grad_norm": 0.4409307837486267, + "learning_rate": 1.4763118475942784e-05, + "loss": 1.7148, + "step": 7210 + }, + { + "epoch": 0.7490391606938818, + "grad_norm": 0.41701313853263855, + "learning_rate": 1.4751544266775302e-05, + "loss": 1.7499, + "step": 7211 + }, + { + "epoch": 0.7491430352134621, + "grad_norm": 0.40917038917541504, + "learning_rate": 1.473997381129341e-05, + "loss": 1.6405, + "step": 7212 + }, + { + "epoch": 0.7492469097330425, + "grad_norm": 0.46700426936149597, + "learning_rate": 1.4728407110729275e-05, + "loss": 1.7749, + "step": 7213 + }, + { + "epoch": 0.7493507842526228, + "grad_norm": 0.4193143844604492, + "learning_rate": 1.4716844166314648e-05, + "loss": 1.6553, + "step": 7214 + }, + { + "epoch": 0.7494546587722032, + "grad_norm": 0.4625590741634369, + "learning_rate": 1.4705284979280898e-05, + "loss": 1.8334, + "step": 7215 + }, + { + "epoch": 0.7495585332917836, + "grad_norm": 0.4359906017780304, + "learning_rate": 1.4693729550858976e-05, + "loss": 1.6999, + "step": 7216 + }, + { + "epoch": 0.7496624078113638, + "grad_norm": 0.4275515079498291, + "learning_rate": 1.4682177882279486e-05, + "loss": 1.7192, + "step": 7217 + }, + { + "epoch": 0.7497662823309442, + "grad_norm": 0.457688570022583, + "learning_rate": 1.4670629974772532e-05, + "loss": 1.725, + "step": 7218 + }, + { + "epoch": 0.7498701568505246, + "grad_norm": 0.3888382613658905, + "learning_rate": 1.465908582956792e-05, + "loss": 1.5319, + "step": 7219 + }, + { + "epoch": 0.7499740313701049, + "grad_norm": 0.396145761013031, + "learning_rate": 1.4647545447895017e-05, + "loss": 1.6354, + "step": 7220 + }, + { + "epoch": 0.7500779058896853, + "grad_norm": 0.425191730260849, + "learning_rate": 1.4636008830982729e-05, + "loss": 1.8322, + "step": 7221 + }, + { + "epoch": 0.7501817804092656, + "grad_norm": 0.4128625988960266, + "learning_rate": 1.4624475980059665e-05, + "loss": 1.5982, + "step": 7222 + }, + { + "epoch": 0.7502856549288459, + "grad_norm": 0.41751012206077576, + "learning_rate": 1.4612946896353974e-05, + "loss": 1.5379, + "step": 7223 + }, + { + "epoch": 0.7503895294484263, + "grad_norm": 0.39982759952545166, + "learning_rate": 1.4601421581093406e-05, + "loss": 1.6689, + "step": 7224 + }, + { + "epoch": 0.7504934039680067, + "grad_norm": 0.4144216775894165, + "learning_rate": 1.4589900035505321e-05, + "loss": 1.6664, + "step": 7225 + }, + { + "epoch": 0.750597278487587, + "grad_norm": 0.4442715644836426, + "learning_rate": 1.4578382260816669e-05, + "loss": 1.81, + "step": 7226 + }, + { + "epoch": 0.7507011530071673, + "grad_norm": 0.4191363751888275, + "learning_rate": 1.4566868258254013e-05, + "loss": 1.5913, + "step": 7227 + }, + { + "epoch": 0.7508050275267477, + "grad_norm": 0.41727662086486816, + "learning_rate": 1.455535802904348e-05, + "loss": 1.6758, + "step": 7228 + }, + { + "epoch": 0.750908902046328, + "grad_norm": 0.4364617168903351, + "learning_rate": 1.4543851574410872e-05, + "loss": 1.6397, + "step": 7229 + }, + { + "epoch": 0.7510127765659084, + "grad_norm": 0.45298081636428833, + "learning_rate": 1.4532348895581465e-05, + "loss": 1.6516, + "step": 7230 + }, + { + "epoch": 0.7511166510854888, + "grad_norm": 0.4370194375514984, + "learning_rate": 1.4520849993780266e-05, + "loss": 1.6518, + "step": 7231 + }, + { + "epoch": 0.751220525605069, + "grad_norm": 0.43286606669425964, + "learning_rate": 1.450935487023179e-05, + "loss": 1.5553, + "step": 7232 + }, + { + "epoch": 0.7513244001246494, + "grad_norm": 0.40669238567352295, + "learning_rate": 1.4497863526160183e-05, + "loss": 1.6917, + "step": 7233 + }, + { + "epoch": 0.7514282746442298, + "grad_norm": 0.42103543877601624, + "learning_rate": 1.4486375962789184e-05, + "loss": 1.4978, + "step": 7234 + }, + { + "epoch": 0.7515321491638102, + "grad_norm": 0.4966675043106079, + "learning_rate": 1.447489218134211e-05, + "loss": 1.7395, + "step": 7235 + }, + { + "epoch": 0.7516360236833904, + "grad_norm": 0.43593791127204895, + "learning_rate": 1.4463412183041947e-05, + "loss": 1.771, + "step": 7236 + }, + { + "epoch": 0.7517398982029708, + "grad_norm": 0.4331296980381012, + "learning_rate": 1.4451935969111158e-05, + "loss": 1.621, + "step": 7237 + }, + { + "epoch": 0.7518437727225512, + "grad_norm": 0.46651628613471985, + "learning_rate": 1.4440463540771914e-05, + "loss": 1.8537, + "step": 7238 + }, + { + "epoch": 0.7519476472421315, + "grad_norm": 0.40865036845207214, + "learning_rate": 1.4428994899245934e-05, + "loss": 1.6614, + "step": 7239 + }, + { + "epoch": 0.7520515217617119, + "grad_norm": 0.4490216374397278, + "learning_rate": 1.4417530045754529e-05, + "loss": 1.6751, + "step": 7240 + }, + { + "epoch": 0.7521553962812922, + "grad_norm": 0.46918779611587524, + "learning_rate": 1.4406068981518623e-05, + "loss": 1.8038, + "step": 7241 + }, + { + "epoch": 0.7522592708008725, + "grad_norm": 0.40138277411460876, + "learning_rate": 1.4394611707758726e-05, + "loss": 1.6524, + "step": 7242 + }, + { + "epoch": 0.7523631453204529, + "grad_norm": 0.43885231018066406, + "learning_rate": 1.4383158225694948e-05, + "loss": 1.7706, + "step": 7243 + }, + { + "epoch": 0.7524670198400333, + "grad_norm": 0.42752382159233093, + "learning_rate": 1.4371708536546985e-05, + "loss": 1.5644, + "step": 7244 + }, + { + "epoch": 0.7525708943596136, + "grad_norm": 0.4024030864238739, + "learning_rate": 1.4360262641534179e-05, + "loss": 1.6406, + "step": 7245 + }, + { + "epoch": 0.7526747688791939, + "grad_norm": 0.40453994274139404, + "learning_rate": 1.4348820541875374e-05, + "loss": 1.6899, + "step": 7246 + }, + { + "epoch": 0.7527786433987743, + "grad_norm": 0.4157525300979614, + "learning_rate": 1.4337382238789105e-05, + "loss": 1.4451, + "step": 7247 + }, + { + "epoch": 0.7528825179183546, + "grad_norm": 0.42336198687553406, + "learning_rate": 1.432594773349346e-05, + "loss": 1.7071, + "step": 7248 + }, + { + "epoch": 0.752986392437935, + "grad_norm": 0.4072533845901489, + "learning_rate": 1.4314517027206086e-05, + "loss": 1.6278, + "step": 7249 + }, + { + "epoch": 0.7530902669575154, + "grad_norm": 0.47410857677459717, + "learning_rate": 1.4303090121144302e-05, + "loss": 1.6032, + "step": 7250 + }, + { + "epoch": 0.7531941414770956, + "grad_norm": 0.39942529797554016, + "learning_rate": 1.429166701652498e-05, + "loss": 1.5404, + "step": 7251 + }, + { + "epoch": 0.753298015996676, + "grad_norm": 0.44464361667633057, + "learning_rate": 1.4280247714564582e-05, + "loss": 1.73, + "step": 7252 + }, + { + "epoch": 0.7534018905162564, + "grad_norm": 0.45004576444625854, + "learning_rate": 1.4268832216479177e-05, + "loss": 1.7523, + "step": 7253 + }, + { + "epoch": 0.7535057650358367, + "grad_norm": 0.4188900291919708, + "learning_rate": 1.4257420523484427e-05, + "loss": 1.605, + "step": 7254 + }, + { + "epoch": 0.753609639555417, + "grad_norm": 0.39535877108573914, + "learning_rate": 1.4246012636795592e-05, + "loss": 1.5916, + "step": 7255 + }, + { + "epoch": 0.7537135140749974, + "grad_norm": 0.44383829832077026, + "learning_rate": 1.42346085576275e-05, + "loss": 1.7938, + "step": 7256 + }, + { + "epoch": 0.7538173885945777, + "grad_norm": 0.41384685039520264, + "learning_rate": 1.422320828719465e-05, + "loss": 1.5843, + "step": 7257 + }, + { + "epoch": 0.7539212631141581, + "grad_norm": 0.41272881627082825, + "learning_rate": 1.4211811826711014e-05, + "loss": 1.5989, + "step": 7258 + }, + { + "epoch": 0.7540251376337385, + "grad_norm": 0.4295910596847534, + "learning_rate": 1.4200419177390279e-05, + "loss": 1.6587, + "step": 7259 + }, + { + "epoch": 0.7541290121533188, + "grad_norm": 0.38193991780281067, + "learning_rate": 1.4189030340445646e-05, + "loss": 1.5246, + "step": 7260 + }, + { + "epoch": 0.7542328866728991, + "grad_norm": 0.4172359108924866, + "learning_rate": 1.4177645317089954e-05, + "loss": 1.6807, + "step": 7261 + }, + { + "epoch": 0.7543367611924795, + "grad_norm": 0.4366094172000885, + "learning_rate": 1.416626410853561e-05, + "loss": 1.6379, + "step": 7262 + }, + { + "epoch": 0.7544406357120599, + "grad_norm": 0.4305514991283417, + "learning_rate": 1.4154886715994603e-05, + "loss": 1.6603, + "step": 7263 + }, + { + "epoch": 0.7545445102316402, + "grad_norm": 0.4556597173213959, + "learning_rate": 1.4143513140678595e-05, + "loss": 1.8564, + "step": 7264 + }, + { + "epoch": 0.7546483847512205, + "grad_norm": 0.49126148223876953, + "learning_rate": 1.4132143383798713e-05, + "loss": 1.7624, + "step": 7265 + }, + { + "epoch": 0.7547522592708009, + "grad_norm": 0.46818116307258606, + "learning_rate": 1.4120777446565791e-05, + "loss": 1.6411, + "step": 7266 + }, + { + "epoch": 0.7548561337903812, + "grad_norm": 0.4020385444164276, + "learning_rate": 1.4109415330190213e-05, + "loss": 1.6907, + "step": 7267 + }, + { + "epoch": 0.7549600083099616, + "grad_norm": 0.4432200491428375, + "learning_rate": 1.4098057035881934e-05, + "loss": 1.6414, + "step": 7268 + }, + { + "epoch": 0.755063882829542, + "grad_norm": 0.43888726830482483, + "learning_rate": 1.408670256485054e-05, + "loss": 1.5862, + "step": 7269 + }, + { + "epoch": 0.7551677573491222, + "grad_norm": 0.45090728998184204, + "learning_rate": 1.4075351918305191e-05, + "loss": 1.7012, + "step": 7270 + }, + { + "epoch": 0.7552716318687026, + "grad_norm": 0.42840370535850525, + "learning_rate": 1.4064005097454635e-05, + "loss": 1.7653, + "step": 7271 + }, + { + "epoch": 0.755375506388283, + "grad_norm": 0.4270748496055603, + "learning_rate": 1.4052662103507214e-05, + "loss": 1.6584, + "step": 7272 + }, + { + "epoch": 0.7554793809078633, + "grad_norm": 0.4157198369503021, + "learning_rate": 1.4041322937670903e-05, + "loss": 1.7567, + "step": 7273 + }, + { + "epoch": 0.7555832554274436, + "grad_norm": 0.4310022294521332, + "learning_rate": 1.4029987601153188e-05, + "loss": 1.6985, + "step": 7274 + }, + { + "epoch": 0.755687129947024, + "grad_norm": 0.41847115755081177, + "learning_rate": 1.4018656095161226e-05, + "loss": 1.6191, + "step": 7275 + }, + { + "epoch": 0.7557910044666043, + "grad_norm": 0.4358253479003906, + "learning_rate": 1.400732842090174e-05, + "loss": 1.6991, + "step": 7276 + }, + { + "epoch": 0.7558948789861847, + "grad_norm": 0.4234682321548462, + "learning_rate": 1.3996004579580995e-05, + "loss": 1.8434, + "step": 7277 + }, + { + "epoch": 0.7559987535057651, + "grad_norm": 0.5005912780761719, + "learning_rate": 1.3984684572404938e-05, + "loss": 1.7706, + "step": 7278 + }, + { + "epoch": 0.7561026280253453, + "grad_norm": 0.45514440536499023, + "learning_rate": 1.397336840057904e-05, + "loss": 1.6742, + "step": 7279 + }, + { + "epoch": 0.7562065025449257, + "grad_norm": 0.4162159562110901, + "learning_rate": 1.3962056065308388e-05, + "loss": 1.6319, + "step": 7280 + }, + { + "epoch": 0.7563103770645061, + "grad_norm": 0.41975322365760803, + "learning_rate": 1.3950747567797662e-05, + "loss": 1.6619, + "step": 7281 + }, + { + "epoch": 0.7564142515840864, + "grad_norm": 0.3796076476573944, + "learning_rate": 1.3939442909251121e-05, + "loss": 1.4829, + "step": 7282 + }, + { + "epoch": 0.7565181261036668, + "grad_norm": 0.4343501925468445, + "learning_rate": 1.3928142090872632e-05, + "loss": 1.6641, + "step": 7283 + }, + { + "epoch": 0.7566220006232471, + "grad_norm": 0.48821020126342773, + "learning_rate": 1.3916845113865618e-05, + "loss": 2.0066, + "step": 7284 + }, + { + "epoch": 0.7567258751428275, + "grad_norm": 0.4088677763938904, + "learning_rate": 1.3905551979433157e-05, + "loss": 1.6202, + "step": 7285 + }, + { + "epoch": 0.7568297496624078, + "grad_norm": 0.4253714680671692, + "learning_rate": 1.3894262688777865e-05, + "loss": 1.595, + "step": 7286 + }, + { + "epoch": 0.7569336241819882, + "grad_norm": 0.4234124422073364, + "learning_rate": 1.3882977243101957e-05, + "loss": 1.6401, + "step": 7287 + }, + { + "epoch": 0.7570374987015686, + "grad_norm": 0.4122594892978668, + "learning_rate": 1.3871695643607252e-05, + "loss": 1.415, + "step": 7288 + }, + { + "epoch": 0.7571413732211488, + "grad_norm": 0.4564708173274994, + "learning_rate": 1.3860417891495154e-05, + "loss": 1.7223, + "step": 7289 + }, + { + "epoch": 0.7572452477407292, + "grad_norm": 0.4112136662006378, + "learning_rate": 1.3849143987966645e-05, + "loss": 1.558, + "step": 7290 + }, + { + "epoch": 0.7573491222603096, + "grad_norm": 0.4101704955101013, + "learning_rate": 1.38378739342223e-05, + "loss": 1.5861, + "step": 7291 + }, + { + "epoch": 0.7574529967798899, + "grad_norm": 0.4257015287876129, + "learning_rate": 1.3826607731462332e-05, + "loss": 1.5489, + "step": 7292 + }, + { + "epoch": 0.7575568712994702, + "grad_norm": 0.41998380422592163, + "learning_rate": 1.3815345380886452e-05, + "loss": 1.5094, + "step": 7293 + }, + { + "epoch": 0.7576607458190506, + "grad_norm": 0.3991340398788452, + "learning_rate": 1.380408688369405e-05, + "loss": 1.5909, + "step": 7294 + }, + { + "epoch": 0.7577646203386309, + "grad_norm": 0.42524048686027527, + "learning_rate": 1.3792832241084058e-05, + "loss": 1.6435, + "step": 7295 + }, + { + "epoch": 0.7578684948582113, + "grad_norm": 0.4474044740200043, + "learning_rate": 1.3781581454255005e-05, + "loss": 1.6698, + "step": 7296 + }, + { + "epoch": 0.7579723693777917, + "grad_norm": 0.41872158646583557, + "learning_rate": 1.3770334524405015e-05, + "loss": 1.6468, + "step": 7297 + }, + { + "epoch": 0.7580762438973719, + "grad_norm": 0.44773685932159424, + "learning_rate": 1.3759091452731792e-05, + "loss": 1.7338, + "step": 7298 + }, + { + "epoch": 0.7581801184169523, + "grad_norm": 0.498161643743515, + "learning_rate": 1.3747852240432641e-05, + "loss": 1.6291, + "step": 7299 + }, + { + "epoch": 0.7582839929365327, + "grad_norm": 0.4790482223033905, + "learning_rate": 1.3736616888704434e-05, + "loss": 1.8468, + "step": 7300 + }, + { + "epoch": 0.758387867456113, + "grad_norm": 0.4306541383266449, + "learning_rate": 1.372538539874369e-05, + "loss": 1.6639, + "step": 7301 + }, + { + "epoch": 0.7584917419756934, + "grad_norm": 0.4123995304107666, + "learning_rate": 1.3714157771746422e-05, + "loss": 1.5665, + "step": 7302 + }, + { + "epoch": 0.7585956164952737, + "grad_norm": 0.45254337787628174, + "learning_rate": 1.3702934008908325e-05, + "loss": 1.8113, + "step": 7303 + }, + { + "epoch": 0.758699491014854, + "grad_norm": 0.47395646572113037, + "learning_rate": 1.3691714111424641e-05, + "loss": 1.7394, + "step": 7304 + }, + { + "epoch": 0.7588033655344344, + "grad_norm": 0.43641114234924316, + "learning_rate": 1.3680498080490161e-05, + "loss": 1.6745, + "step": 7305 + }, + { + "epoch": 0.7589072400540148, + "grad_norm": 0.4418218433856964, + "learning_rate": 1.3669285917299347e-05, + "loss": 1.6433, + "step": 7306 + }, + { + "epoch": 0.759011114573595, + "grad_norm": 0.3992539048194885, + "learning_rate": 1.3658077623046195e-05, + "loss": 1.5712, + "step": 7307 + }, + { + "epoch": 0.7591149890931754, + "grad_norm": 0.43207311630249023, + "learning_rate": 1.3646873198924293e-05, + "loss": 1.6891, + "step": 7308 + }, + { + "epoch": 0.7592188636127558, + "grad_norm": 0.43646010756492615, + "learning_rate": 1.3635672646126835e-05, + "loss": 1.7747, + "step": 7309 + }, + { + "epoch": 0.7593227381323361, + "grad_norm": 0.4463355839252472, + "learning_rate": 1.3624475965846584e-05, + "loss": 1.5922, + "step": 7310 + }, + { + "epoch": 0.7594266126519165, + "grad_norm": 0.41628172993659973, + "learning_rate": 1.3613283159275902e-05, + "loss": 1.8192, + "step": 7311 + }, + { + "epoch": 0.7595304871714968, + "grad_norm": 0.3946208953857422, + "learning_rate": 1.3602094227606721e-05, + "loss": 1.5317, + "step": 7312 + }, + { + "epoch": 0.7596343616910772, + "grad_norm": 0.41928601264953613, + "learning_rate": 1.3590909172030608e-05, + "loss": 1.7496, + "step": 7313 + }, + { + "epoch": 0.7597382362106575, + "grad_norm": 0.4171970784664154, + "learning_rate": 1.3579727993738666e-05, + "loss": 1.7095, + "step": 7314 + }, + { + "epoch": 0.7598421107302379, + "grad_norm": 0.4714401960372925, + "learning_rate": 1.3568550693921605e-05, + "loss": 1.8207, + "step": 7315 + }, + { + "epoch": 0.7599459852498183, + "grad_norm": 0.4133859872817993, + "learning_rate": 1.3557377273769718e-05, + "loss": 1.7672, + "step": 7316 + }, + { + "epoch": 0.7600498597693985, + "grad_norm": 0.40099528431892395, + "learning_rate": 1.3546207734472882e-05, + "loss": 1.565, + "step": 7317 + }, + { + "epoch": 0.7601537342889789, + "grad_norm": 0.4770098030567169, + "learning_rate": 1.3535042077220578e-05, + "loss": 1.7241, + "step": 7318 + }, + { + "epoch": 0.7602576088085593, + "grad_norm": 0.43758663535118103, + "learning_rate": 1.3523880303201842e-05, + "loss": 1.6718, + "step": 7319 + }, + { + "epoch": 0.7603614833281396, + "grad_norm": 0.4401274621486664, + "learning_rate": 1.3512722413605355e-05, + "loss": 1.7467, + "step": 7320 + }, + { + "epoch": 0.76046535784772, + "grad_norm": 0.4016100764274597, + "learning_rate": 1.3501568409619286e-05, + "loss": 1.6112, + "step": 7321 + }, + { + "epoch": 0.7605692323673003, + "grad_norm": 0.44667908549308777, + "learning_rate": 1.3490418292431506e-05, + "loss": 1.7561, + "step": 7322 + }, + { + "epoch": 0.7606731068868806, + "grad_norm": 0.3975602388381958, + "learning_rate": 1.3479272063229387e-05, + "loss": 1.7218, + "step": 7323 + }, + { + "epoch": 0.760776981406461, + "grad_norm": 0.43723657727241516, + "learning_rate": 1.3468129723199919e-05, + "loss": 1.777, + "step": 7324 + }, + { + "epoch": 0.7608808559260414, + "grad_norm": 0.46576887369155884, + "learning_rate": 1.3456991273529679e-05, + "loss": 1.7544, + "step": 7325 + }, + { + "epoch": 0.7609847304456216, + "grad_norm": 0.4114767014980316, + "learning_rate": 1.344585671540482e-05, + "loss": 1.6227, + "step": 7326 + }, + { + "epoch": 0.761088604965202, + "grad_norm": 0.4945017695426941, + "learning_rate": 1.3434726050011093e-05, + "loss": 1.6841, + "step": 7327 + }, + { + "epoch": 0.7611924794847824, + "grad_norm": 0.4236311614513397, + "learning_rate": 1.34235992785338e-05, + "loss": 1.6404, + "step": 7328 + }, + { + "epoch": 0.7612963540043627, + "grad_norm": 0.4564515948295593, + "learning_rate": 1.3412476402157908e-05, + "loss": 1.7414, + "step": 7329 + }, + { + "epoch": 0.7614002285239431, + "grad_norm": 0.41757771372795105, + "learning_rate": 1.3401357422067857e-05, + "loss": 1.7274, + "step": 7330 + }, + { + "epoch": 0.7615041030435235, + "grad_norm": 0.4927060008049011, + "learning_rate": 1.3390242339447779e-05, + "loss": 1.7611, + "step": 7331 + }, + { + "epoch": 0.7616079775631037, + "grad_norm": 0.4198152720928192, + "learning_rate": 1.3379131155481334e-05, + "loss": 1.7494, + "step": 7332 + }, + { + "epoch": 0.7617118520826841, + "grad_norm": 0.38799822330474854, + "learning_rate": 1.3368023871351737e-05, + "loss": 1.6216, + "step": 7333 + }, + { + "epoch": 0.7618157266022645, + "grad_norm": 0.4070364236831665, + "learning_rate": 1.3356920488241875e-05, + "loss": 1.5723, + "step": 7334 + }, + { + "epoch": 0.7619196011218448, + "grad_norm": 0.41575080156326294, + "learning_rate": 1.3345821007334153e-05, + "loss": 1.6863, + "step": 7335 + }, + { + "epoch": 0.7620234756414251, + "grad_norm": 0.43388399481773376, + "learning_rate": 1.333472542981058e-05, + "loss": 1.7284, + "step": 7336 + }, + { + "epoch": 0.7621273501610055, + "grad_norm": 0.39538997411727905, + "learning_rate": 1.3323633756852749e-05, + "loss": 1.5037, + "step": 7337 + }, + { + "epoch": 0.7622312246805859, + "grad_norm": 0.39611417055130005, + "learning_rate": 1.3312545989641823e-05, + "loss": 1.5679, + "step": 7338 + }, + { + "epoch": 0.7623350992001662, + "grad_norm": 0.42863768339157104, + "learning_rate": 1.3301462129358605e-05, + "loss": 1.6756, + "step": 7339 + }, + { + "epoch": 0.7624389737197466, + "grad_norm": 0.44742724299430847, + "learning_rate": 1.329038217718338e-05, + "loss": 1.8475, + "step": 7340 + }, + { + "epoch": 0.7625428482393269, + "grad_norm": 0.4362469017505646, + "learning_rate": 1.3279306134296122e-05, + "loss": 1.6682, + "step": 7341 + }, + { + "epoch": 0.7626467227589072, + "grad_norm": 0.41959047317504883, + "learning_rate": 1.3268234001876328e-05, + "loss": 1.6528, + "step": 7342 + }, + { + "epoch": 0.7627505972784876, + "grad_norm": 0.3941357135772705, + "learning_rate": 1.3257165781103093e-05, + "loss": 1.5253, + "step": 7343 + }, + { + "epoch": 0.762854471798068, + "grad_norm": 0.4178176522254944, + "learning_rate": 1.3246101473155098e-05, + "loss": 1.4581, + "step": 7344 + }, + { + "epoch": 0.7629583463176483, + "grad_norm": 0.42281249165534973, + "learning_rate": 1.3235041079210603e-05, + "loss": 1.5694, + "step": 7345 + }, + { + "epoch": 0.7630622208372286, + "grad_norm": 0.4281790554523468, + "learning_rate": 1.3223984600447459e-05, + "loss": 1.6775, + "step": 7346 + }, + { + "epoch": 0.763166095356809, + "grad_norm": 0.4060118496417999, + "learning_rate": 1.3212932038043075e-05, + "loss": 1.4226, + "step": 7347 + }, + { + "epoch": 0.7632699698763893, + "grad_norm": 0.41389042139053345, + "learning_rate": 1.3201883393174508e-05, + "loss": 1.6444, + "step": 7348 + }, + { + "epoch": 0.7633738443959697, + "grad_norm": 0.424570232629776, + "learning_rate": 1.3190838667018297e-05, + "loss": 1.4875, + "step": 7349 + }, + { + "epoch": 0.76347771891555, + "grad_norm": 0.39579057693481445, + "learning_rate": 1.3179797860750653e-05, + "loss": 1.7218, + "step": 7350 + }, + { + "epoch": 0.7635815934351303, + "grad_norm": 0.4264102876186371, + "learning_rate": 1.3168760975547335e-05, + "loss": 1.7828, + "step": 7351 + }, + { + "epoch": 0.7636854679547107, + "grad_norm": 0.4306561350822449, + "learning_rate": 1.3157728012583675e-05, + "loss": 1.7086, + "step": 7352 + }, + { + "epoch": 0.7637893424742911, + "grad_norm": 0.43455207347869873, + "learning_rate": 1.3146698973034599e-05, + "loss": 1.72, + "step": 7353 + }, + { + "epoch": 0.7638932169938714, + "grad_norm": 0.4588955342769623, + "learning_rate": 1.3135673858074621e-05, + "loss": 1.7184, + "step": 7354 + }, + { + "epoch": 0.7639970915134517, + "grad_norm": 0.41876885294914246, + "learning_rate": 1.3124652668877819e-05, + "loss": 1.6402, + "step": 7355 + }, + { + "epoch": 0.7641009660330321, + "grad_norm": 0.4438285529613495, + "learning_rate": 1.3113635406617858e-05, + "loss": 1.7906, + "step": 7356 + }, + { + "epoch": 0.7642048405526124, + "grad_norm": 0.4253295063972473, + "learning_rate": 1.310262207246803e-05, + "loss": 1.6315, + "step": 7357 + }, + { + "epoch": 0.7643087150721928, + "grad_norm": 0.45891672372817993, + "learning_rate": 1.3091612667601111e-05, + "loss": 1.7554, + "step": 7358 + }, + { + "epoch": 0.7644125895917732, + "grad_norm": 0.4097495973110199, + "learning_rate": 1.3080607193189559e-05, + "loss": 1.5465, + "step": 7359 + }, + { + "epoch": 0.7645164641113534, + "grad_norm": 0.4238637685775757, + "learning_rate": 1.3069605650405375e-05, + "loss": 1.6579, + "step": 7360 + }, + { + "epoch": 0.7646203386309338, + "grad_norm": 0.42440927028656006, + "learning_rate": 1.3058608040420095e-05, + "loss": 1.5157, + "step": 7361 + }, + { + "epoch": 0.7647242131505142, + "grad_norm": 0.42934998869895935, + "learning_rate": 1.3047614364404914e-05, + "loss": 1.6494, + "step": 7362 + }, + { + "epoch": 0.7648280876700946, + "grad_norm": 0.42825332283973694, + "learning_rate": 1.3036624623530552e-05, + "loss": 1.6405, + "step": 7363 + }, + { + "epoch": 0.7649319621896749, + "grad_norm": 0.41669198870658875, + "learning_rate": 1.3025638818967373e-05, + "loss": 1.5746, + "step": 7364 + }, + { + "epoch": 0.7650358367092552, + "grad_norm": 0.4732094705104828, + "learning_rate": 1.3014656951885229e-05, + "loss": 1.7997, + "step": 7365 + }, + { + "epoch": 0.7651397112288356, + "grad_norm": 0.38443320989608765, + "learning_rate": 1.3003679023453614e-05, + "loss": 1.52, + "step": 7366 + }, + { + "epoch": 0.7652435857484159, + "grad_norm": 0.3946983516216278, + "learning_rate": 1.2992705034841623e-05, + "loss": 1.707, + "step": 7367 + }, + { + "epoch": 0.7653474602679963, + "grad_norm": 0.4353131055831909, + "learning_rate": 1.2981734987217853e-05, + "loss": 1.788, + "step": 7368 + }, + { + "epoch": 0.7654513347875767, + "grad_norm": 0.41617780923843384, + "learning_rate": 1.2970768881750567e-05, + "loss": 1.6524, + "step": 7369 + }, + { + "epoch": 0.7655552093071569, + "grad_norm": 0.42190882563591003, + "learning_rate": 1.295980671960756e-05, + "loss": 1.6253, + "step": 7370 + }, + { + "epoch": 0.7656590838267373, + "grad_norm": 0.4275079369544983, + "learning_rate": 1.2948848501956207e-05, + "loss": 1.6816, + "step": 7371 + }, + { + "epoch": 0.7657629583463177, + "grad_norm": 0.4169149696826935, + "learning_rate": 1.2937894229963483e-05, + "loss": 1.5858, + "step": 7372 + }, + { + "epoch": 0.765866832865898, + "grad_norm": 0.4570522606372833, + "learning_rate": 1.2926943904795925e-05, + "loss": 1.7703, + "step": 7373 + }, + { + "epoch": 0.7659707073854783, + "grad_norm": 0.424277663230896, + "learning_rate": 1.291599752761966e-05, + "loss": 1.6223, + "step": 7374 + }, + { + "epoch": 0.7660745819050587, + "grad_norm": 0.41258883476257324, + "learning_rate": 1.2905055099600378e-05, + "loss": 1.6564, + "step": 7375 + }, + { + "epoch": 0.766178456424639, + "grad_norm": 0.434390127658844, + "learning_rate": 1.2894116621903402e-05, + "loss": 1.5993, + "step": 7376 + }, + { + "epoch": 0.7662823309442194, + "grad_norm": 0.44891485571861267, + "learning_rate": 1.2883182095693547e-05, + "loss": 1.5434, + "step": 7377 + }, + { + "epoch": 0.7663862054637998, + "grad_norm": 0.43780601024627686, + "learning_rate": 1.2872251522135286e-05, + "loss": 1.5887, + "step": 7378 + }, + { + "epoch": 0.76649007998338, + "grad_norm": 0.4061758518218994, + "learning_rate": 1.286132490239263e-05, + "loss": 1.7188, + "step": 7379 + }, + { + "epoch": 0.7665939545029604, + "grad_norm": 0.47357314825057983, + "learning_rate": 1.2850402237629183e-05, + "loss": 1.869, + "step": 7380 + }, + { + "epoch": 0.7666978290225408, + "grad_norm": 0.41141971945762634, + "learning_rate": 1.2839483529008117e-05, + "loss": 1.7155, + "step": 7381 + }, + { + "epoch": 0.7668017035421211, + "grad_norm": 0.4543936252593994, + "learning_rate": 1.2828568777692191e-05, + "loss": 1.6919, + "step": 7382 + }, + { + "epoch": 0.7669055780617015, + "grad_norm": 0.4838084876537323, + "learning_rate": 1.2817657984843746e-05, + "loss": 1.7123, + "step": 7383 + }, + { + "epoch": 0.7670094525812818, + "grad_norm": 0.422134667634964, + "learning_rate": 1.280675115162467e-05, + "loss": 1.7498, + "step": 7384 + }, + { + "epoch": 0.7671133271008621, + "grad_norm": 0.44436314702033997, + "learning_rate": 1.2795848279196515e-05, + "loss": 1.6622, + "step": 7385 + }, + { + "epoch": 0.7672172016204425, + "grad_norm": 0.42208799719810486, + "learning_rate": 1.2784949368720295e-05, + "loss": 1.7493, + "step": 7386 + }, + { + "epoch": 0.7673210761400229, + "grad_norm": 0.47984281182289124, + "learning_rate": 1.2774054421356668e-05, + "loss": 1.767, + "step": 7387 + }, + { + "epoch": 0.7674249506596031, + "grad_norm": 0.4420330226421356, + "learning_rate": 1.2763163438265896e-05, + "loss": 1.7475, + "step": 7388 + }, + { + "epoch": 0.7675288251791835, + "grad_norm": 0.41784390807151794, + "learning_rate": 1.2752276420607735e-05, + "loss": 1.756, + "step": 7389 + }, + { + "epoch": 0.7676326996987639, + "grad_norm": 0.6316620707511902, + "learning_rate": 1.2741393369541604e-05, + "loss": 1.8209, + "step": 7390 + }, + { + "epoch": 0.7677365742183443, + "grad_norm": 0.39647650718688965, + "learning_rate": 1.2730514286226435e-05, + "loss": 1.5466, + "step": 7391 + }, + { + "epoch": 0.7678404487379246, + "grad_norm": 0.41624629497528076, + "learning_rate": 1.2719639171820812e-05, + "loss": 1.6497, + "step": 7392 + }, + { + "epoch": 0.767944323257505, + "grad_norm": 0.4231652617454529, + "learning_rate": 1.2708768027482804e-05, + "loss": 1.3777, + "step": 7393 + }, + { + "epoch": 0.7680481977770853, + "grad_norm": 0.42754560708999634, + "learning_rate": 1.2697900854370103e-05, + "loss": 1.6038, + "step": 7394 + }, + { + "epoch": 0.7681520722966656, + "grad_norm": 0.4023604691028595, + "learning_rate": 1.2687037653640026e-05, + "loss": 1.6013, + "step": 7395 + }, + { + "epoch": 0.768255946816246, + "grad_norm": 0.4051744043827057, + "learning_rate": 1.2676178426449353e-05, + "loss": 1.6607, + "step": 7396 + }, + { + "epoch": 0.7683598213358264, + "grad_norm": 0.39897581934928894, + "learning_rate": 1.2665323173954557e-05, + "loss": 1.659, + "step": 7397 + }, + { + "epoch": 0.7684636958554066, + "grad_norm": 0.41026565432548523, + "learning_rate": 1.2654471897311627e-05, + "loss": 1.6306, + "step": 7398 + }, + { + "epoch": 0.768567570374987, + "grad_norm": 0.42027774453163147, + "learning_rate": 1.2643624597676123e-05, + "loss": 1.5395, + "step": 7399 + }, + { + "epoch": 0.7686714448945674, + "grad_norm": 0.3966914117336273, + "learning_rate": 1.2632781276203216e-05, + "loss": 1.6195, + "step": 7400 + }, + { + "epoch": 0.7687753194141477, + "grad_norm": 0.40674200654029846, + "learning_rate": 1.2621941934047626e-05, + "loss": 1.5897, + "step": 7401 + }, + { + "epoch": 0.7688791939337281, + "grad_norm": 0.3980896770954132, + "learning_rate": 1.2611106572363663e-05, + "loss": 1.4635, + "step": 7402 + }, + { + "epoch": 0.7689830684533084, + "grad_norm": 0.43104100227355957, + "learning_rate": 1.2600275192305184e-05, + "loss": 1.6104, + "step": 7403 + }, + { + "epoch": 0.7690869429728887, + "grad_norm": 0.4664388597011566, + "learning_rate": 1.2589447795025699e-05, + "loss": 1.7406, + "step": 7404 + }, + { + "epoch": 0.7691908174924691, + "grad_norm": 0.4235634505748749, + "learning_rate": 1.2578624381678183e-05, + "loss": 1.7893, + "step": 7405 + }, + { + "epoch": 0.7692946920120495, + "grad_norm": 0.3953429162502289, + "learning_rate": 1.2567804953415286e-05, + "loss": 1.5755, + "step": 7406 + }, + { + "epoch": 0.7693985665316297, + "grad_norm": 0.45959627628326416, + "learning_rate": 1.2556989511389183e-05, + "loss": 1.6138, + "step": 7407 + }, + { + "epoch": 0.7695024410512101, + "grad_norm": 0.4262646436691284, + "learning_rate": 1.2546178056751623e-05, + "loss": 1.5875, + "step": 7408 + }, + { + "epoch": 0.7696063155707905, + "grad_norm": 0.4125349819660187, + "learning_rate": 1.2535370590653955e-05, + "loss": 1.5919, + "step": 7409 + }, + { + "epoch": 0.7697101900903708, + "grad_norm": 0.40705251693725586, + "learning_rate": 1.2524567114247082e-05, + "loss": 1.6622, + "step": 7410 + }, + { + "epoch": 0.7698140646099512, + "grad_norm": 0.4188450872898102, + "learning_rate": 1.2513767628681489e-05, + "loss": 1.5001, + "step": 7411 + }, + { + "epoch": 0.7699179391295315, + "grad_norm": 0.430321604013443, + "learning_rate": 1.2502972135107227e-05, + "loss": 1.6887, + "step": 7412 + }, + { + "epoch": 0.7700218136491118, + "grad_norm": 0.4586038291454315, + "learning_rate": 1.2492180634673978e-05, + "loss": 1.6159, + "step": 7413 + }, + { + "epoch": 0.7701256881686922, + "grad_norm": 0.4472750425338745, + "learning_rate": 1.24813931285309e-05, + "loss": 1.6015, + "step": 7414 + }, + { + "epoch": 0.7702295626882726, + "grad_norm": 0.404680997133255, + "learning_rate": 1.2470609617826784e-05, + "loss": 1.3466, + "step": 7415 + }, + { + "epoch": 0.770333437207853, + "grad_norm": 0.44199302792549133, + "learning_rate": 1.2459830103710019e-05, + "loss": 1.7882, + "step": 7416 + }, + { + "epoch": 0.7704373117274332, + "grad_norm": 0.4036124050617218, + "learning_rate": 1.2449054587328519e-05, + "loss": 1.6455, + "step": 7417 + }, + { + "epoch": 0.7705411862470136, + "grad_norm": 0.43482506275177, + "learning_rate": 1.2438283069829799e-05, + "loss": 1.7585, + "step": 7418 + }, + { + "epoch": 0.770645060766594, + "grad_norm": 0.3882008194923401, + "learning_rate": 1.2427515552360924e-05, + "loss": 1.502, + "step": 7419 + }, + { + "epoch": 0.7707489352861743, + "grad_norm": 0.44999149441719055, + "learning_rate": 1.2416752036068596e-05, + "loss": 1.7382, + "step": 7420 + }, + { + "epoch": 0.7708528098057547, + "grad_norm": 0.4068087935447693, + "learning_rate": 1.2405992522099002e-05, + "loss": 1.5567, + "step": 7421 + }, + { + "epoch": 0.770956684325335, + "grad_norm": 0.4564858675003052, + "learning_rate": 1.2395237011597949e-05, + "loss": 1.8138, + "step": 7422 + }, + { + "epoch": 0.7710605588449153, + "grad_norm": 0.4281824231147766, + "learning_rate": 1.2384485505710847e-05, + "loss": 1.6393, + "step": 7423 + }, + { + "epoch": 0.7711644333644957, + "grad_norm": 0.3930261731147766, + "learning_rate": 1.2373738005582603e-05, + "loss": 1.5276, + "step": 7424 + }, + { + "epoch": 0.7712683078840761, + "grad_norm": 0.4548739790916443, + "learning_rate": 1.2362994512357779e-05, + "loss": 1.7476, + "step": 7425 + }, + { + "epoch": 0.7713721824036563, + "grad_norm": 0.4831191897392273, + "learning_rate": 1.235225502718046e-05, + "loss": 1.8217, + "step": 7426 + }, + { + "epoch": 0.7714760569232367, + "grad_norm": 0.41000956296920776, + "learning_rate": 1.2341519551194314e-05, + "loss": 1.5592, + "step": 7427 + }, + { + "epoch": 0.7715799314428171, + "grad_norm": 0.46288132667541504, + "learning_rate": 1.2330788085542589e-05, + "loss": 1.7773, + "step": 7428 + }, + { + "epoch": 0.7716838059623974, + "grad_norm": 0.4005960524082184, + "learning_rate": 1.2320060631368103e-05, + "loss": 1.6483, + "step": 7429 + }, + { + "epoch": 0.7717876804819778, + "grad_norm": 0.4161857068538666, + "learning_rate": 1.2309337189813242e-05, + "loss": 1.5655, + "step": 7430 + }, + { + "epoch": 0.7718915550015581, + "grad_norm": 0.4241454601287842, + "learning_rate": 1.229861776201996e-05, + "loss": 1.5819, + "step": 7431 + }, + { + "epoch": 0.7719954295211384, + "grad_norm": 0.44512662291526794, + "learning_rate": 1.2287902349129832e-05, + "loss": 1.5958, + "step": 7432 + }, + { + "epoch": 0.7720993040407188, + "grad_norm": 0.42898502945899963, + "learning_rate": 1.2277190952283912e-05, + "loss": 1.6924, + "step": 7433 + }, + { + "epoch": 0.7722031785602992, + "grad_norm": 0.4218330681324005, + "learning_rate": 1.2266483572622916e-05, + "loss": 1.6906, + "step": 7434 + }, + { + "epoch": 0.7723070530798795, + "grad_norm": 0.4153681695461273, + "learning_rate": 1.2255780211287094e-05, + "loss": 1.718, + "step": 7435 + }, + { + "epoch": 0.7724109275994598, + "grad_norm": 0.443933367729187, + "learning_rate": 1.2245080869416258e-05, + "loss": 1.636, + "step": 7436 + }, + { + "epoch": 0.7725148021190402, + "grad_norm": 0.42244407534599304, + "learning_rate": 1.2234385548149813e-05, + "loss": 1.7592, + "step": 7437 + }, + { + "epoch": 0.7726186766386205, + "grad_norm": 0.4867939054965973, + "learning_rate": 1.2223694248626721e-05, + "loss": 1.7048, + "step": 7438 + }, + { + "epoch": 0.7727225511582009, + "grad_norm": 0.45962727069854736, + "learning_rate": 1.2213006971985524e-05, + "loss": 1.808, + "step": 7439 + }, + { + "epoch": 0.7728264256777813, + "grad_norm": 0.44696059823036194, + "learning_rate": 1.2202323719364323e-05, + "loss": 1.6744, + "step": 7440 + }, + { + "epoch": 0.7729303001973616, + "grad_norm": 0.3926789462566376, + "learning_rate": 1.2191644491900822e-05, + "loss": 1.5021, + "step": 7441 + }, + { + "epoch": 0.7730341747169419, + "grad_norm": 0.4703446626663208, + "learning_rate": 1.2180969290732286e-05, + "loss": 1.7047, + "step": 7442 + }, + { + "epoch": 0.7731380492365223, + "grad_norm": 0.4337899386882782, + "learning_rate": 1.2170298116995493e-05, + "loss": 1.5239, + "step": 7443 + }, + { + "epoch": 0.7732419237561027, + "grad_norm": 0.39774975180625916, + "learning_rate": 1.2159630971826879e-05, + "loss": 1.4872, + "step": 7444 + }, + { + "epoch": 0.773345798275683, + "grad_norm": 0.47527754306793213, + "learning_rate": 1.2148967856362398e-05, + "loss": 1.813, + "step": 7445 + }, + { + "epoch": 0.7734496727952633, + "grad_norm": 0.3975711166858673, + "learning_rate": 1.2138308771737594e-05, + "loss": 1.5849, + "step": 7446 + }, + { + "epoch": 0.7735535473148437, + "grad_norm": 0.44653868675231934, + "learning_rate": 1.2127653719087572e-05, + "loss": 1.7053, + "step": 7447 + }, + { + "epoch": 0.773657421834424, + "grad_norm": 0.4855223298072815, + "learning_rate": 1.2117002699547015e-05, + "loss": 1.7733, + "step": 7448 + }, + { + "epoch": 0.7737612963540044, + "grad_norm": 0.464631050825119, + "learning_rate": 1.2106355714250172e-05, + "loss": 1.7352, + "step": 7449 + }, + { + "epoch": 0.7738651708735848, + "grad_norm": 0.44896233081817627, + "learning_rate": 1.2095712764330847e-05, + "loss": 1.9348, + "step": 7450 + }, + { + "epoch": 0.773969045393165, + "grad_norm": 0.4480723738670349, + "learning_rate": 1.2085073850922479e-05, + "loss": 1.659, + "step": 7451 + }, + { + "epoch": 0.7740729199127454, + "grad_norm": 0.469439297914505, + "learning_rate": 1.2074438975157964e-05, + "loss": 1.8332, + "step": 7452 + }, + { + "epoch": 0.7741767944323258, + "grad_norm": 0.4098644256591797, + "learning_rate": 1.2063808138169891e-05, + "loss": 1.3752, + "step": 7453 + }, + { + "epoch": 0.7742806689519061, + "grad_norm": 0.3945968449115753, + "learning_rate": 1.205318134109033e-05, + "loss": 1.632, + "step": 7454 + }, + { + "epoch": 0.7743845434714864, + "grad_norm": 0.38634219765663147, + "learning_rate": 1.2042558585050961e-05, + "loss": 1.5544, + "step": 7455 + }, + { + "epoch": 0.7744884179910668, + "grad_norm": 0.45720741152763367, + "learning_rate": 1.2031939871183023e-05, + "loss": 1.8892, + "step": 7456 + }, + { + "epoch": 0.7745922925106471, + "grad_norm": 0.44557642936706543, + "learning_rate": 1.2021325200617329e-05, + "loss": 1.6822, + "step": 7457 + }, + { + "epoch": 0.7746961670302275, + "grad_norm": 0.4418318569660187, + "learning_rate": 1.2010714574484255e-05, + "loss": 1.6621, + "step": 7458 + }, + { + "epoch": 0.7748000415498079, + "grad_norm": 0.4305969774723053, + "learning_rate": 1.2000107993913739e-05, + "loss": 1.5825, + "step": 7459 + }, + { + "epoch": 0.7749039160693881, + "grad_norm": 0.385640025138855, + "learning_rate": 1.1989505460035333e-05, + "loss": 1.573, + "step": 7460 + }, + { + "epoch": 0.7750077905889685, + "grad_norm": 0.45117321610450745, + "learning_rate": 1.1978906973978083e-05, + "loss": 1.68, + "step": 7461 + }, + { + "epoch": 0.7751116651085489, + "grad_norm": 0.4692929983139038, + "learning_rate": 1.1968312536870668e-05, + "loss": 1.5407, + "step": 7462 + }, + { + "epoch": 0.7752155396281292, + "grad_norm": 0.38421300053596497, + "learning_rate": 1.1957722149841317e-05, + "loss": 1.4685, + "step": 7463 + }, + { + "epoch": 0.7753194141477096, + "grad_norm": 0.41444119811058044, + "learning_rate": 1.1947135814017812e-05, + "loss": 1.4937, + "step": 7464 + }, + { + "epoch": 0.7754232886672899, + "grad_norm": 0.4278319478034973, + "learning_rate": 1.1936553530527523e-05, + "loss": 1.5106, + "step": 7465 + }, + { + "epoch": 0.7755271631868702, + "grad_norm": 0.39915332198143005, + "learning_rate": 1.1925975300497371e-05, + "loss": 1.4574, + "step": 7466 + }, + { + "epoch": 0.7756310377064506, + "grad_norm": 0.44178900122642517, + "learning_rate": 1.1915401125053866e-05, + "loss": 1.749, + "step": 7467 + }, + { + "epoch": 0.775734912226031, + "grad_norm": 0.39515554904937744, + "learning_rate": 1.1904831005323053e-05, + "loss": 1.5465, + "step": 7468 + }, + { + "epoch": 0.7758387867456114, + "grad_norm": 0.3899015486240387, + "learning_rate": 1.18942649424306e-05, + "loss": 1.487, + "step": 7469 + }, + { + "epoch": 0.7759426612651916, + "grad_norm": 0.44565442204475403, + "learning_rate": 1.1883702937501705e-05, + "loss": 1.6149, + "step": 7470 + }, + { + "epoch": 0.776046535784772, + "grad_norm": 0.405678927898407, + "learning_rate": 1.1873144991661106e-05, + "loss": 1.567, + "step": 7471 + }, + { + "epoch": 0.7761504103043524, + "grad_norm": 0.46768710017204285, + "learning_rate": 1.1862591106033178e-05, + "loss": 1.7902, + "step": 7472 + }, + { + "epoch": 0.7762542848239327, + "grad_norm": 0.4093432128429413, + "learning_rate": 1.185204128174181e-05, + "loss": 1.6518, + "step": 7473 + }, + { + "epoch": 0.776358159343513, + "grad_norm": 0.4388825297355652, + "learning_rate": 1.1841495519910485e-05, + "loss": 1.6971, + "step": 7474 + }, + { + "epoch": 0.7764620338630934, + "grad_norm": 0.4254426956176758, + "learning_rate": 1.1830953821662232e-05, + "loss": 1.6166, + "step": 7475 + }, + { + "epoch": 0.7765659083826737, + "grad_norm": 0.42285576462745667, + "learning_rate": 1.182041618811967e-05, + "loss": 1.611, + "step": 7476 + }, + { + "epoch": 0.7766697829022541, + "grad_norm": 0.4236442446708679, + "learning_rate": 1.1809882620404972e-05, + "loss": 1.586, + "step": 7477 + }, + { + "epoch": 0.7767736574218345, + "grad_norm": 0.4397282600402832, + "learning_rate": 1.1799353119639867e-05, + "loss": 1.7642, + "step": 7478 + }, + { + "epoch": 0.7768775319414147, + "grad_norm": 0.41384944319725037, + "learning_rate": 1.1788827686945708e-05, + "loss": 1.6689, + "step": 7479 + }, + { + "epoch": 0.7769814064609951, + "grad_norm": 0.44808679819107056, + "learning_rate": 1.177830632344331e-05, + "loss": 1.8645, + "step": 7480 + }, + { + "epoch": 0.7770852809805755, + "grad_norm": 0.3973611295223236, + "learning_rate": 1.1767789030253168e-05, + "loss": 1.6404, + "step": 7481 + }, + { + "epoch": 0.7771891555001558, + "grad_norm": 0.44642767310142517, + "learning_rate": 1.1757275808495266e-05, + "loss": 1.7545, + "step": 7482 + }, + { + "epoch": 0.7772930300197362, + "grad_norm": 0.4089753329753876, + "learning_rate": 1.1746766659289188e-05, + "loss": 1.6952, + "step": 7483 + }, + { + "epoch": 0.7773969045393165, + "grad_norm": 0.41882163286209106, + "learning_rate": 1.1736261583754072e-05, + "loss": 1.5784, + "step": 7484 + }, + { + "epoch": 0.7775007790588968, + "grad_norm": 0.489686518907547, + "learning_rate": 1.1725760583008633e-05, + "loss": 1.7849, + "step": 7485 + }, + { + "epoch": 0.7776046535784772, + "grad_norm": 0.41849491000175476, + "learning_rate": 1.171526365817114e-05, + "loss": 1.6088, + "step": 7486 + }, + { + "epoch": 0.7777085280980576, + "grad_norm": 0.40229809284210205, + "learning_rate": 1.1704770810359417e-05, + "loss": 1.5425, + "step": 7487 + }, + { + "epoch": 0.7778124026176378, + "grad_norm": 0.3939957916736603, + "learning_rate": 1.1694282040690919e-05, + "loss": 1.5142, + "step": 7488 + }, + { + "epoch": 0.7779162771372182, + "grad_norm": 0.46565282344818115, + "learning_rate": 1.168379735028256e-05, + "loss": 1.4844, + "step": 7489 + }, + { + "epoch": 0.7780201516567986, + "grad_norm": 0.4124033451080322, + "learning_rate": 1.1673316740250916e-05, + "loss": 1.647, + "step": 7490 + }, + { + "epoch": 0.7781240261763789, + "grad_norm": 0.463223397731781, + "learning_rate": 1.1662840211712084e-05, + "loss": 1.6546, + "step": 7491 + }, + { + "epoch": 0.7782279006959593, + "grad_norm": 0.40807053446769714, + "learning_rate": 1.1652367765781724e-05, + "loss": 1.7317, + "step": 7492 + }, + { + "epoch": 0.7783317752155396, + "grad_norm": 0.4360339045524597, + "learning_rate": 1.164189940357508e-05, + "loss": 1.6996, + "step": 7493 + }, + { + "epoch": 0.77843564973512, + "grad_norm": 0.4180784821510315, + "learning_rate": 1.1631435126206924e-05, + "loss": 1.5694, + "step": 7494 + }, + { + "epoch": 0.7785395242547003, + "grad_norm": 0.4407843053340912, + "learning_rate": 1.1620974934791673e-05, + "loss": 1.5474, + "step": 7495 + }, + { + "epoch": 0.7786433987742807, + "grad_norm": 0.36754143238067627, + "learning_rate": 1.1610518830443195e-05, + "loss": 1.3948, + "step": 7496 + }, + { + "epoch": 0.7787472732938611, + "grad_norm": 0.4531268775463104, + "learning_rate": 1.1600066814275028e-05, + "loss": 1.7093, + "step": 7497 + }, + { + "epoch": 0.7788511478134413, + "grad_norm": 0.4329068958759308, + "learning_rate": 1.1589618887400223e-05, + "loss": 1.6267, + "step": 7498 + }, + { + "epoch": 0.7789550223330217, + "grad_norm": 0.385638564825058, + "learning_rate": 1.1579175050931369e-05, + "loss": 1.4813, + "step": 7499 + }, + { + "epoch": 0.7790588968526021, + "grad_norm": 0.4533402621746063, + "learning_rate": 1.1568735305980693e-05, + "loss": 1.6934, + "step": 7500 + }, + { + "epoch": 0.7791627713721824, + "grad_norm": 0.41636162996292114, + "learning_rate": 1.1558299653659931e-05, + "loss": 1.7808, + "step": 7501 + }, + { + "epoch": 0.7792666458917628, + "grad_norm": 0.44357746839523315, + "learning_rate": 1.15478680950804e-05, + "loss": 1.5262, + "step": 7502 + }, + { + "epoch": 0.7793705204113431, + "grad_norm": 0.43575209379196167, + "learning_rate": 1.1537440631352975e-05, + "loss": 1.6568, + "step": 7503 + }, + { + "epoch": 0.7794743949309234, + "grad_norm": 0.4117165803909302, + "learning_rate": 1.1527017263588108e-05, + "loss": 1.5999, + "step": 7504 + }, + { + "epoch": 0.7795782694505038, + "grad_norm": 0.40848150849342346, + "learning_rate": 1.15165979928958e-05, + "loss": 1.4227, + "step": 7505 + }, + { + "epoch": 0.7796821439700842, + "grad_norm": 0.42398545145988464, + "learning_rate": 1.1506182820385608e-05, + "loss": 1.6902, + "step": 7506 + }, + { + "epoch": 0.7797860184896644, + "grad_norm": 0.4009295105934143, + "learning_rate": 1.149577174716671e-05, + "loss": 1.543, + "step": 7507 + }, + { + "epoch": 0.7798898930092448, + "grad_norm": 0.48386311531066895, + "learning_rate": 1.148536477434775e-05, + "loss": 1.6154, + "step": 7508 + }, + { + "epoch": 0.7799937675288252, + "grad_norm": 0.43705689907073975, + "learning_rate": 1.1474961903037029e-05, + "loss": 1.6337, + "step": 7509 + }, + { + "epoch": 0.7800976420484055, + "grad_norm": 0.3995164632797241, + "learning_rate": 1.146456313434236e-05, + "loss": 1.6047, + "step": 7510 + }, + { + "epoch": 0.7802015165679859, + "grad_norm": 0.40124985575675964, + "learning_rate": 1.145416846937113e-05, + "loss": 1.5502, + "step": 7511 + }, + { + "epoch": 0.7803053910875662, + "grad_norm": 0.4591343104839325, + "learning_rate": 1.1443777909230286e-05, + "loss": 1.8176, + "step": 7512 + }, + { + "epoch": 0.7804092656071465, + "grad_norm": 0.4121926724910736, + "learning_rate": 1.1433391455026343e-05, + "loss": 1.6699, + "step": 7513 + }, + { + "epoch": 0.7805131401267269, + "grad_norm": 0.4800514876842499, + "learning_rate": 1.1423009107865384e-05, + "loss": 1.9029, + "step": 7514 + }, + { + "epoch": 0.7806170146463073, + "grad_norm": 0.44299235939979553, + "learning_rate": 1.1412630868853024e-05, + "loss": 1.6599, + "step": 7515 + }, + { + "epoch": 0.7807208891658876, + "grad_norm": 0.5640523433685303, + "learning_rate": 1.1402256739094513e-05, + "loss": 1.9605, + "step": 7516 + }, + { + "epoch": 0.7808247636854679, + "grad_norm": 0.4083343744277954, + "learning_rate": 1.1391886719694561e-05, + "loss": 1.6042, + "step": 7517 + }, + { + "epoch": 0.7809286382050483, + "grad_norm": 0.4362529516220093, + "learning_rate": 1.1381520811757529e-05, + "loss": 1.6724, + "step": 7518 + }, + { + "epoch": 0.7810325127246287, + "grad_norm": 0.3871046006679535, + "learning_rate": 1.1371159016387296e-05, + "loss": 1.55, + "step": 7519 + }, + { + "epoch": 0.781136387244209, + "grad_norm": 0.41727960109710693, + "learning_rate": 1.136080133468731e-05, + "loss": 1.6557, + "step": 7520 + }, + { + "epoch": 0.7812402617637894, + "grad_norm": 0.41626328229904175, + "learning_rate": 1.1350447767760586e-05, + "loss": 1.642, + "step": 7521 + }, + { + "epoch": 0.7813441362833697, + "grad_norm": 0.41060563921928406, + "learning_rate": 1.134009831670968e-05, + "loss": 1.6731, + "step": 7522 + }, + { + "epoch": 0.78144801080295, + "grad_norm": 0.407531201839447, + "learning_rate": 1.1329752982636776e-05, + "loss": 1.6211, + "step": 7523 + }, + { + "epoch": 0.7815518853225304, + "grad_norm": 0.38882845640182495, + "learning_rate": 1.1319411766643512e-05, + "loss": 1.5965, + "step": 7524 + }, + { + "epoch": 0.7816557598421108, + "grad_norm": 0.4430273771286011, + "learning_rate": 1.1309074669831189e-05, + "loss": 1.7404, + "step": 7525 + }, + { + "epoch": 0.781759634361691, + "grad_norm": 0.4414060115814209, + "learning_rate": 1.1298741693300624e-05, + "loss": 1.6904, + "step": 7526 + }, + { + "epoch": 0.7818635088812714, + "grad_norm": 0.4308008849620819, + "learning_rate": 1.1288412838152157e-05, + "loss": 1.8278, + "step": 7527 + }, + { + "epoch": 0.7819673834008518, + "grad_norm": 0.4290758967399597, + "learning_rate": 1.1278088105485778e-05, + "loss": 1.714, + "step": 7528 + }, + { + "epoch": 0.7820712579204321, + "grad_norm": 0.4313497841358185, + "learning_rate": 1.1267767496400972e-05, + "loss": 1.6401, + "step": 7529 + }, + { + "epoch": 0.7821751324400125, + "grad_norm": 0.4212192893028259, + "learning_rate": 1.1257451011996805e-05, + "loss": 1.6266, + "step": 7530 + }, + { + "epoch": 0.7822790069595928, + "grad_norm": 0.44428661465644836, + "learning_rate": 1.1247138653371903e-05, + "loss": 1.6743, + "step": 7531 + }, + { + "epoch": 0.7823828814791731, + "grad_norm": 0.44234758615493774, + "learning_rate": 1.123683042162445e-05, + "loss": 1.6527, + "step": 7532 + }, + { + "epoch": 0.7824867559987535, + "grad_norm": 0.4521244168281555, + "learning_rate": 1.1226526317852193e-05, + "loss": 1.8607, + "step": 7533 + }, + { + "epoch": 0.7825906305183339, + "grad_norm": 0.4284451901912689, + "learning_rate": 1.1216226343152424e-05, + "loss": 1.5447, + "step": 7534 + }, + { + "epoch": 0.7826945050379142, + "grad_norm": 0.39966773986816406, + "learning_rate": 1.1205930498622053e-05, + "loss": 1.6129, + "step": 7535 + }, + { + "epoch": 0.7827983795574945, + "grad_norm": 0.4256837069988251, + "learning_rate": 1.1195638785357448e-05, + "loss": 1.5621, + "step": 7536 + }, + { + "epoch": 0.7829022540770749, + "grad_norm": 0.4508436620235443, + "learning_rate": 1.1185351204454647e-05, + "loss": 1.7632, + "step": 7537 + }, + { + "epoch": 0.7830061285966552, + "grad_norm": 0.40548187494277954, + "learning_rate": 1.1175067757009171e-05, + "loss": 1.5052, + "step": 7538 + }, + { + "epoch": 0.7831100031162356, + "grad_norm": 0.43622684478759766, + "learning_rate": 1.1164788444116131e-05, + "loss": 1.7971, + "step": 7539 + }, + { + "epoch": 0.783213877635816, + "grad_norm": 0.4189201891422272, + "learning_rate": 1.1154513266870204e-05, + "loss": 1.5101, + "step": 7540 + }, + { + "epoch": 0.7833177521553962, + "grad_norm": 0.4424949288368225, + "learning_rate": 1.1144242226365599e-05, + "loss": 1.8468, + "step": 7541 + }, + { + "epoch": 0.7834216266749766, + "grad_norm": 0.4143393635749817, + "learning_rate": 1.1133975323696111e-05, + "loss": 1.6425, + "step": 7542 + }, + { + "epoch": 0.783525501194557, + "grad_norm": 0.43571484088897705, + "learning_rate": 1.1123712559955075e-05, + "loss": 1.5772, + "step": 7543 + }, + { + "epoch": 0.7836293757141374, + "grad_norm": 0.42268067598342896, + "learning_rate": 1.1113453936235418e-05, + "loss": 1.632, + "step": 7544 + }, + { + "epoch": 0.7837332502337176, + "grad_norm": 0.4198543131351471, + "learning_rate": 1.1103199453629587e-05, + "loss": 1.6937, + "step": 7545 + }, + { + "epoch": 0.783837124753298, + "grad_norm": 0.4531916677951813, + "learning_rate": 1.1092949113229612e-05, + "loss": 1.7399, + "step": 7546 + }, + { + "epoch": 0.7839409992728784, + "grad_norm": 0.43174856901168823, + "learning_rate": 1.1082702916127063e-05, + "loss": 1.6816, + "step": 7547 + }, + { + "epoch": 0.7840448737924587, + "grad_norm": 0.3892672061920166, + "learning_rate": 1.1072460863413092e-05, + "loss": 1.3392, + "step": 7548 + }, + { + "epoch": 0.7841487483120391, + "grad_norm": 0.4050576388835907, + "learning_rate": 1.1062222956178391e-05, + "loss": 1.6167, + "step": 7549 + }, + { + "epoch": 0.7842526228316195, + "grad_norm": 0.43549004197120667, + "learning_rate": 1.1051989195513207e-05, + "loss": 1.7752, + "step": 7550 + }, + { + "epoch": 0.7843564973511997, + "grad_norm": 0.4682556390762329, + "learning_rate": 1.1041759582507393e-05, + "loss": 1.6396, + "step": 7551 + }, + { + "epoch": 0.7844603718707801, + "grad_norm": 0.3986964821815491, + "learning_rate": 1.1031534118250269e-05, + "loss": 1.697, + "step": 7552 + }, + { + "epoch": 0.7845642463903605, + "grad_norm": 0.4416189193725586, + "learning_rate": 1.1021312803830813e-05, + "loss": 1.5987, + "step": 7553 + }, + { + "epoch": 0.7846681209099408, + "grad_norm": 0.39252594113349915, + "learning_rate": 1.101109564033751e-05, + "loss": 1.5662, + "step": 7554 + }, + { + "epoch": 0.7847719954295211, + "grad_norm": 0.4105452299118042, + "learning_rate": 1.1000882628858367e-05, + "loss": 1.4838, + "step": 7555 + }, + { + "epoch": 0.7848758699491015, + "grad_norm": 0.409593790769577, + "learning_rate": 1.0990673770481036e-05, + "loss": 1.6211, + "step": 7556 + }, + { + "epoch": 0.7849797444686818, + "grad_norm": 0.40311574935913086, + "learning_rate": 1.0980469066292659e-05, + "loss": 1.5268, + "step": 7557 + }, + { + "epoch": 0.7850836189882622, + "grad_norm": 0.4237530827522278, + "learning_rate": 1.0970268517379967e-05, + "loss": 1.7556, + "step": 7558 + }, + { + "epoch": 0.7851874935078426, + "grad_norm": 0.4404970407485962, + "learning_rate": 1.0960072124829235e-05, + "loss": 1.5014, + "step": 7559 + }, + { + "epoch": 0.7852913680274228, + "grad_norm": 0.42267706990242004, + "learning_rate": 1.0949879889726294e-05, + "loss": 1.6162, + "step": 7560 + }, + { + "epoch": 0.7853952425470032, + "grad_norm": 0.44833898544311523, + "learning_rate": 1.0939691813156544e-05, + "loss": 1.7035, + "step": 7561 + }, + { + "epoch": 0.7854991170665836, + "grad_norm": 0.4051210284233093, + "learning_rate": 1.0929507896204921e-05, + "loss": 1.6337, + "step": 7562 + }, + { + "epoch": 0.7856029915861639, + "grad_norm": 0.4495202898979187, + "learning_rate": 1.0919328139955976e-05, + "loss": 1.6693, + "step": 7563 + }, + { + "epoch": 0.7857068661057443, + "grad_norm": 0.4015834331512451, + "learning_rate": 1.0909152545493712e-05, + "loss": 1.6461, + "step": 7564 + }, + { + "epoch": 0.7858107406253246, + "grad_norm": 0.4055159091949463, + "learning_rate": 1.089898111390179e-05, + "loss": 1.6278, + "step": 7565 + }, + { + "epoch": 0.7859146151449049, + "grad_norm": 0.3767836093902588, + "learning_rate": 1.0888813846263391e-05, + "loss": 1.5302, + "step": 7566 + }, + { + "epoch": 0.7860184896644853, + "grad_norm": 0.4378937780857086, + "learning_rate": 1.0878650743661229e-05, + "loss": 1.6846, + "step": 7567 + }, + { + "epoch": 0.7861223641840657, + "grad_norm": 0.43852996826171875, + "learning_rate": 1.086849180717761e-05, + "loss": 1.6535, + "step": 7568 + }, + { + "epoch": 0.7862262387036459, + "grad_norm": 0.42073825001716614, + "learning_rate": 1.0858337037894372e-05, + "loss": 1.6498, + "step": 7569 + }, + { + "epoch": 0.7863301132232263, + "grad_norm": 0.44468173384666443, + "learning_rate": 1.0848186436892927e-05, + "loss": 1.5972, + "step": 7570 + }, + { + "epoch": 0.7864339877428067, + "grad_norm": 0.4390113055706024, + "learning_rate": 1.0838040005254213e-05, + "loss": 1.618, + "step": 7571 + }, + { + "epoch": 0.7865378622623871, + "grad_norm": 0.3942447602748871, + "learning_rate": 1.082789774405878e-05, + "loss": 1.5818, + "step": 7572 + }, + { + "epoch": 0.7866417367819674, + "grad_norm": 0.4129110276699066, + "learning_rate": 1.0817759654386684e-05, + "loss": 1.512, + "step": 7573 + }, + { + "epoch": 0.7867456113015477, + "grad_norm": 0.4303886592388153, + "learning_rate": 1.0807625737317545e-05, + "loss": 1.6786, + "step": 7574 + }, + { + "epoch": 0.7868494858211281, + "grad_norm": 0.47462910413742065, + "learning_rate": 1.0797495993930562e-05, + "loss": 1.7152, + "step": 7575 + }, + { + "epoch": 0.7869533603407084, + "grad_norm": 0.41988933086395264, + "learning_rate": 1.0787370425304454e-05, + "loss": 1.7505, + "step": 7576 + }, + { + "epoch": 0.7870572348602888, + "grad_norm": 0.41259077191352844, + "learning_rate": 1.0777249032517534e-05, + "loss": 1.6145, + "step": 7577 + }, + { + "epoch": 0.7871611093798692, + "grad_norm": 0.3950294256210327, + "learning_rate": 1.0767131816647624e-05, + "loss": 1.4186, + "step": 7578 + }, + { + "epoch": 0.7872649838994494, + "grad_norm": 0.4047811031341553, + "learning_rate": 1.0757018778772181e-05, + "loss": 1.7788, + "step": 7579 + }, + { + "epoch": 0.7873688584190298, + "grad_norm": 0.46056464314460754, + "learning_rate": 1.0746909919968102e-05, + "loss": 1.7623, + "step": 7580 + }, + { + "epoch": 0.7874727329386102, + "grad_norm": 0.4839528203010559, + "learning_rate": 1.0736805241311937e-05, + "loss": 1.7721, + "step": 7581 + }, + { + "epoch": 0.7875766074581905, + "grad_norm": 0.44515979290008545, + "learning_rate": 1.0726704743879767e-05, + "loss": 1.6909, + "step": 7582 + }, + { + "epoch": 0.7876804819777709, + "grad_norm": 0.4091649353504181, + "learning_rate": 1.0716608428747177e-05, + "loss": 1.6288, + "step": 7583 + }, + { + "epoch": 0.7877843564973512, + "grad_norm": 0.4144968092441559, + "learning_rate": 1.0706516296989372e-05, + "loss": 1.7996, + "step": 7584 + }, + { + "epoch": 0.7878882310169315, + "grad_norm": 0.3900415301322937, + "learning_rate": 1.0696428349681087e-05, + "loss": 1.5053, + "step": 7585 + }, + { + "epoch": 0.7879921055365119, + "grad_norm": 0.4206119775772095, + "learning_rate": 1.0686344587896597e-05, + "loss": 1.6293, + "step": 7586 + }, + { + "epoch": 0.7880959800560923, + "grad_norm": 0.421841025352478, + "learning_rate": 1.0676265012709752e-05, + "loss": 1.628, + "step": 7587 + }, + { + "epoch": 0.7881998545756725, + "grad_norm": 0.45985618233680725, + "learning_rate": 1.066618962519395e-05, + "loss": 1.8276, + "step": 7588 + }, + { + "epoch": 0.7883037290952529, + "grad_norm": 0.414380818605423, + "learning_rate": 1.0656118426422134e-05, + "loss": 1.6578, + "step": 7589 + }, + { + "epoch": 0.7884076036148333, + "grad_norm": 0.4441922903060913, + "learning_rate": 1.06460514174668e-05, + "loss": 1.74, + "step": 7590 + }, + { + "epoch": 0.7885114781344136, + "grad_norm": 0.45518070459365845, + "learning_rate": 1.0635988599400043e-05, + "loss": 1.7842, + "step": 7591 + }, + { + "epoch": 0.788615352653994, + "grad_norm": 0.3902077078819275, + "learning_rate": 1.0625929973293419e-05, + "loss": 1.5922, + "step": 7592 + }, + { + "epoch": 0.7887192271735743, + "grad_norm": 0.4476879835128784, + "learning_rate": 1.061587554021814e-05, + "loss": 1.7011, + "step": 7593 + }, + { + "epoch": 0.7888231016931546, + "grad_norm": 0.41619980335235596, + "learning_rate": 1.0605825301244909e-05, + "loss": 1.683, + "step": 7594 + }, + { + "epoch": 0.788926976212735, + "grad_norm": 0.40933772921562195, + "learning_rate": 1.0595779257444e-05, + "loss": 1.4897, + "step": 7595 + }, + { + "epoch": 0.7890308507323154, + "grad_norm": 0.4322732388973236, + "learning_rate": 1.0585737409885227e-05, + "loss": 1.6298, + "step": 7596 + }, + { + "epoch": 0.7891347252518958, + "grad_norm": 0.4134397804737091, + "learning_rate": 1.057569975963797e-05, + "loss": 1.6549, + "step": 7597 + }, + { + "epoch": 0.789238599771476, + "grad_norm": 0.44103166460990906, + "learning_rate": 1.0565666307771193e-05, + "loss": 1.7137, + "step": 7598 + }, + { + "epoch": 0.7893424742910564, + "grad_norm": 0.4072246253490448, + "learning_rate": 1.0555637055353324e-05, + "loss": 1.5784, + "step": 7599 + }, + { + "epoch": 0.7894463488106368, + "grad_norm": 0.44003406167030334, + "learning_rate": 1.054561200345245e-05, + "loss": 1.6315, + "step": 7600 + }, + { + "epoch": 0.7895502233302171, + "grad_norm": 0.4993717074394226, + "learning_rate": 1.0535591153136137e-05, + "loss": 1.7496, + "step": 7601 + }, + { + "epoch": 0.7896540978497975, + "grad_norm": 0.41622164845466614, + "learning_rate": 1.0525574505471531e-05, + "loss": 1.6808, + "step": 7602 + }, + { + "epoch": 0.7897579723693778, + "grad_norm": 0.3730601370334625, + "learning_rate": 1.051556206152533e-05, + "loss": 1.5215, + "step": 7603 + }, + { + "epoch": 0.7898618468889581, + "grad_norm": 0.41570550203323364, + "learning_rate": 1.0505553822363783e-05, + "loss": 1.6401, + "step": 7604 + }, + { + "epoch": 0.7899657214085385, + "grad_norm": 0.41199877858161926, + "learning_rate": 1.0495549789052678e-05, + "loss": 1.613, + "step": 7605 + }, + { + "epoch": 0.7900695959281189, + "grad_norm": 0.4056338667869568, + "learning_rate": 1.0485549962657365e-05, + "loss": 1.6405, + "step": 7606 + }, + { + "epoch": 0.7901734704476991, + "grad_norm": 0.39273974299430847, + "learning_rate": 1.0475554344242783e-05, + "loss": 1.5479, + "step": 7607 + }, + { + "epoch": 0.7902773449672795, + "grad_norm": 0.454589307308197, + "learning_rate": 1.0465562934873335e-05, + "loss": 1.8367, + "step": 7608 + }, + { + "epoch": 0.7903812194868599, + "grad_norm": 0.4377838373184204, + "learning_rate": 1.0455575735613066e-05, + "loss": 1.6501, + "step": 7609 + }, + { + "epoch": 0.7904850940064402, + "grad_norm": 0.41390401124954224, + "learning_rate": 1.0445592747525534e-05, + "loss": 1.5864, + "step": 7610 + }, + { + "epoch": 0.7905889685260206, + "grad_norm": 0.40852096676826477, + "learning_rate": 1.0435613971673813e-05, + "loss": 1.5591, + "step": 7611 + }, + { + "epoch": 0.790692843045601, + "grad_norm": 0.43240541219711304, + "learning_rate": 1.0425639409120602e-05, + "loss": 1.6134, + "step": 7612 + }, + { + "epoch": 0.7907967175651812, + "grad_norm": 0.4393400549888611, + "learning_rate": 1.0415669060928108e-05, + "loss": 1.7608, + "step": 7613 + }, + { + "epoch": 0.7909005920847616, + "grad_norm": 0.4604494571685791, + "learning_rate": 1.0405702928158085e-05, + "loss": 1.7024, + "step": 7614 + }, + { + "epoch": 0.791004466604342, + "grad_norm": 0.40346240997314453, + "learning_rate": 1.0395741011871851e-05, + "loss": 1.6004, + "step": 7615 + }, + { + "epoch": 0.7911083411239223, + "grad_norm": 0.39214804768562317, + "learning_rate": 1.0385783313130276e-05, + "loss": 1.6282, + "step": 7616 + }, + { + "epoch": 0.7912122156435026, + "grad_norm": 0.44647249579429626, + "learning_rate": 1.0375829832993772e-05, + "loss": 1.7222, + "step": 7617 + }, + { + "epoch": 0.791316090163083, + "grad_norm": 0.4374501705169678, + "learning_rate": 1.0365880572522302e-05, + "loss": 1.7329, + "step": 7618 + }, + { + "epoch": 0.7914199646826633, + "grad_norm": 0.4381657838821411, + "learning_rate": 1.0355935532775419e-05, + "loss": 1.5133, + "step": 7619 + }, + { + "epoch": 0.7915238392022437, + "grad_norm": 0.4005827009677887, + "learning_rate": 1.0345994714812135e-05, + "loss": 1.4862, + "step": 7620 + }, + { + "epoch": 0.7916277137218241, + "grad_norm": 0.40875616669654846, + "learning_rate": 1.0336058119691122e-05, + "loss": 1.5222, + "step": 7621 + }, + { + "epoch": 0.7917315882414044, + "grad_norm": 0.4034888744354248, + "learning_rate": 1.0326125748470522e-05, + "loss": 1.6171, + "step": 7622 + }, + { + "epoch": 0.7918354627609847, + "grad_norm": 0.42959120869636536, + "learning_rate": 1.0316197602208067e-05, + "loss": 1.7382, + "step": 7623 + }, + { + "epoch": 0.7919393372805651, + "grad_norm": 0.5072987079620361, + "learning_rate": 1.0306273681961014e-05, + "loss": 1.8618, + "step": 7624 + }, + { + "epoch": 0.7920432118001455, + "grad_norm": 0.4025663435459137, + "learning_rate": 1.0296353988786184e-05, + "loss": 1.7562, + "step": 7625 + }, + { + "epoch": 0.7921470863197257, + "grad_norm": 0.4331921637058258, + "learning_rate": 1.0286438523739978e-05, + "loss": 1.5518, + "step": 7626 + }, + { + "epoch": 0.7922509608393061, + "grad_norm": 0.46530386805534363, + "learning_rate": 1.0276527287878268e-05, + "loss": 1.831, + "step": 7627 + }, + { + "epoch": 0.7923548353588865, + "grad_norm": 0.43432602286338806, + "learning_rate": 1.0266620282256556e-05, + "loss": 1.69, + "step": 7628 + }, + { + "epoch": 0.7924587098784668, + "grad_norm": 0.39210620522499084, + "learning_rate": 1.0256717507929852e-05, + "loss": 1.4931, + "step": 7629 + }, + { + "epoch": 0.7925625843980472, + "grad_norm": 0.4316072463989258, + "learning_rate": 1.0246818965952727e-05, + "loss": 1.5392, + "step": 7630 + }, + { + "epoch": 0.7926664589176275, + "grad_norm": 0.422419011592865, + "learning_rate": 1.0236924657379293e-05, + "loss": 1.5485, + "step": 7631 + }, + { + "epoch": 0.7927703334372078, + "grad_norm": 0.43081673979759216, + "learning_rate": 1.0227034583263218e-05, + "loss": 1.7733, + "step": 7632 + }, + { + "epoch": 0.7928742079567882, + "grad_norm": 0.4367268979549408, + "learning_rate": 1.0217148744657723e-05, + "loss": 1.7832, + "step": 7633 + }, + { + "epoch": 0.7929780824763686, + "grad_norm": 0.42134618759155273, + "learning_rate": 1.0207267142615552e-05, + "loss": 1.6996, + "step": 7634 + }, + { + "epoch": 0.7930819569959489, + "grad_norm": 0.40299269556999207, + "learning_rate": 1.0197389778189065e-05, + "loss": 1.6218, + "step": 7635 + }, + { + "epoch": 0.7931858315155292, + "grad_norm": 0.414371132850647, + "learning_rate": 1.0187516652430062e-05, + "loss": 1.6509, + "step": 7636 + }, + { + "epoch": 0.7932897060351096, + "grad_norm": 0.4730742573738098, + "learning_rate": 1.0177647766390002e-05, + "loss": 1.7145, + "step": 7637 + }, + { + "epoch": 0.7933935805546899, + "grad_norm": 0.43268048763275146, + "learning_rate": 1.0167783121119845e-05, + "loss": 1.3435, + "step": 7638 + }, + { + "epoch": 0.7934974550742703, + "grad_norm": 0.43376022577285767, + "learning_rate": 1.0157922717670054e-05, + "loss": 1.5719, + "step": 7639 + }, + { + "epoch": 0.7936013295938507, + "grad_norm": 0.40382814407348633, + "learning_rate": 1.0148066557090724e-05, + "loss": 1.6977, + "step": 7640 + }, + { + "epoch": 0.7937052041134309, + "grad_norm": 0.45663517713546753, + "learning_rate": 1.0138214640431454e-05, + "loss": 1.7793, + "step": 7641 + }, + { + "epoch": 0.7938090786330113, + "grad_norm": 0.4395454525947571, + "learning_rate": 1.0128366968741387e-05, + "loss": 1.7664, + "step": 7642 + }, + { + "epoch": 0.7939129531525917, + "grad_norm": 0.4374726116657257, + "learning_rate": 1.0118523543069226e-05, + "loss": 1.6902, + "step": 7643 + }, + { + "epoch": 0.794016827672172, + "grad_norm": 0.459427148103714, + "learning_rate": 1.0108684364463222e-05, + "loss": 1.6312, + "step": 7644 + }, + { + "epoch": 0.7941207021917523, + "grad_norm": 0.42887747287750244, + "learning_rate": 1.0098849433971169e-05, + "loss": 1.6194, + "step": 7645 + }, + { + "epoch": 0.7942245767113327, + "grad_norm": 0.4146133363246918, + "learning_rate": 1.0089018752640394e-05, + "loss": 1.6965, + "step": 7646 + }, + { + "epoch": 0.794328451230913, + "grad_norm": 0.46396344900131226, + "learning_rate": 1.0079192321517838e-05, + "loss": 1.7115, + "step": 7647 + }, + { + "epoch": 0.7944323257504934, + "grad_norm": 0.4100917875766754, + "learning_rate": 1.0069370141649876e-05, + "loss": 1.4832, + "step": 7648 + }, + { + "epoch": 0.7945362002700738, + "grad_norm": 0.4542340040206909, + "learning_rate": 1.0059552214082536e-05, + "loss": 1.6654, + "step": 7649 + }, + { + "epoch": 0.7946400747896541, + "grad_norm": 0.4383617043495178, + "learning_rate": 1.0049738539861332e-05, + "loss": 1.6049, + "step": 7650 + }, + { + "epoch": 0.7947439493092344, + "grad_norm": 0.4077033996582031, + "learning_rate": 1.0039929120031355e-05, + "loss": 1.7055, + "step": 7651 + }, + { + "epoch": 0.7948478238288148, + "grad_norm": 0.4490281641483307, + "learning_rate": 1.0030123955637227e-05, + "loss": 1.7024, + "step": 7652 + }, + { + "epoch": 0.7949516983483952, + "grad_norm": 0.4433899223804474, + "learning_rate": 1.0020323047723108e-05, + "loss": 1.7098, + "step": 7653 + }, + { + "epoch": 0.7950555728679755, + "grad_norm": 0.3918624818325043, + "learning_rate": 1.0010526397332754e-05, + "loss": 1.5875, + "step": 7654 + }, + { + "epoch": 0.7951594473875558, + "grad_norm": 0.4222068190574646, + "learning_rate": 1.0000734005509382e-05, + "loss": 1.6725, + "step": 7655 + }, + { + "epoch": 0.7952633219071362, + "grad_norm": 0.4067675471305847, + "learning_rate": 9.990945873295848e-06, + "loss": 1.6157, + "step": 7656 + }, + { + "epoch": 0.7953671964267165, + "grad_norm": 0.4570964574813843, + "learning_rate": 9.981162001734496e-06, + "loss": 1.7816, + "step": 7657 + }, + { + "epoch": 0.7954710709462969, + "grad_norm": 0.4113404452800751, + "learning_rate": 9.97138239186723e-06, + "loss": 1.6609, + "step": 7658 + }, + { + "epoch": 0.7955749454658773, + "grad_norm": 0.4180930256843567, + "learning_rate": 9.961607044735505e-06, + "loss": 1.5903, + "step": 7659 + }, + { + "epoch": 0.7956788199854575, + "grad_norm": 0.4389685094356537, + "learning_rate": 9.951835961380317e-06, + "loss": 1.655, + "step": 7660 + }, + { + "epoch": 0.7957826945050379, + "grad_norm": 0.38533127307891846, + "learning_rate": 9.942069142842214e-06, + "loss": 1.5481, + "step": 7661 + }, + { + "epoch": 0.7958865690246183, + "grad_norm": 0.4118013083934784, + "learning_rate": 9.932306590161266e-06, + "loss": 1.6456, + "step": 7662 + }, + { + "epoch": 0.7959904435441986, + "grad_norm": 0.433296263217926, + "learning_rate": 9.922548304377154e-06, + "loss": 1.6836, + "step": 7663 + }, + { + "epoch": 0.796094318063779, + "grad_norm": 0.41111084818840027, + "learning_rate": 9.91279428652901e-06, + "loss": 1.5107, + "step": 7664 + }, + { + "epoch": 0.7961981925833593, + "grad_norm": 0.4467814862728119, + "learning_rate": 9.903044537655587e-06, + "loss": 1.6828, + "step": 7665 + }, + { + "epoch": 0.7963020671029396, + "grad_norm": 0.3999803364276886, + "learning_rate": 9.893299058795175e-06, + "loss": 1.5791, + "step": 7666 + }, + { + "epoch": 0.79640594162252, + "grad_norm": 0.4356433153152466, + "learning_rate": 9.883557850985537e-06, + "loss": 1.6213, + "step": 7667 + }, + { + "epoch": 0.7965098161421004, + "grad_norm": 0.44254523515701294, + "learning_rate": 9.873820915264082e-06, + "loss": 1.7171, + "step": 7668 + }, + { + "epoch": 0.7966136906616806, + "grad_norm": 0.4416894018650055, + "learning_rate": 9.864088252667702e-06, + "loss": 1.5873, + "step": 7669 + }, + { + "epoch": 0.796717565181261, + "grad_norm": 0.46283599734306335, + "learning_rate": 9.854359864232854e-06, + "loss": 1.8697, + "step": 7670 + }, + { + "epoch": 0.7968214397008414, + "grad_norm": 0.41159945726394653, + "learning_rate": 9.84463575099553e-06, + "loss": 1.6293, + "step": 7671 + }, + { + "epoch": 0.7969253142204217, + "grad_norm": 0.4374866187572479, + "learning_rate": 9.834915913991272e-06, + "loss": 1.7457, + "step": 7672 + }, + { + "epoch": 0.7970291887400021, + "grad_norm": 0.37280136346817017, + "learning_rate": 9.825200354255171e-06, + "loss": 1.5509, + "step": 7673 + }, + { + "epoch": 0.7971330632595824, + "grad_norm": 0.41301167011260986, + "learning_rate": 9.815489072821848e-06, + "loss": 1.6039, + "step": 7674 + }, + { + "epoch": 0.7972369377791628, + "grad_norm": 0.3978497087955475, + "learning_rate": 9.805782070725494e-06, + "loss": 1.5862, + "step": 7675 + }, + { + "epoch": 0.7973408122987431, + "grad_norm": 0.42098501324653625, + "learning_rate": 9.796079348999831e-06, + "loss": 1.6685, + "step": 7676 + }, + { + "epoch": 0.7974446868183235, + "grad_norm": 0.4220602214336395, + "learning_rate": 9.786380908678112e-06, + "loss": 1.6917, + "step": 7677 + }, + { + "epoch": 0.7975485613379039, + "grad_norm": 0.44230684638023376, + "learning_rate": 9.77668675079314e-06, + "loss": 1.6786, + "step": 7678 + }, + { + "epoch": 0.7976524358574841, + "grad_norm": 0.49053019285202026, + "learning_rate": 9.766996876377283e-06, + "loss": 1.5424, + "step": 7679 + }, + { + "epoch": 0.7977563103770645, + "grad_norm": 0.41432085633277893, + "learning_rate": 9.757311286462429e-06, + "loss": 1.6682, + "step": 7680 + }, + { + "epoch": 0.7978601848966449, + "grad_norm": 0.41580092906951904, + "learning_rate": 9.747629982080003e-06, + "loss": 1.7237, + "step": 7681 + }, + { + "epoch": 0.7979640594162252, + "grad_norm": 0.4454309642314911, + "learning_rate": 9.737952964261027e-06, + "loss": 1.7689, + "step": 7682 + }, + { + "epoch": 0.7980679339358056, + "grad_norm": 0.40397220849990845, + "learning_rate": 9.728280234035981e-06, + "loss": 1.6688, + "step": 7683 + }, + { + "epoch": 0.7981718084553859, + "grad_norm": 0.43213963508605957, + "learning_rate": 9.718611792434973e-06, + "loss": 1.7833, + "step": 7684 + }, + { + "epoch": 0.7982756829749662, + "grad_norm": 0.44224637746810913, + "learning_rate": 9.708947640487598e-06, + "loss": 1.6129, + "step": 7685 + }, + { + "epoch": 0.7983795574945466, + "grad_norm": 0.42797449231147766, + "learning_rate": 9.699287779223022e-06, + "loss": 1.7152, + "step": 7686 + }, + { + "epoch": 0.798483432014127, + "grad_norm": 0.39138954877853394, + "learning_rate": 9.689632209669935e-06, + "loss": 1.5439, + "step": 7687 + }, + { + "epoch": 0.7985873065337072, + "grad_norm": 0.42455416917800903, + "learning_rate": 9.679980932856591e-06, + "loss": 1.6193, + "step": 7688 + }, + { + "epoch": 0.7986911810532876, + "grad_norm": 0.392198383808136, + "learning_rate": 9.670333949810761e-06, + "loss": 1.5908, + "step": 7689 + }, + { + "epoch": 0.798795055572868, + "grad_norm": 0.4087858200073242, + "learning_rate": 9.660691261559774e-06, + "loss": 1.6236, + "step": 7690 + }, + { + "epoch": 0.7988989300924483, + "grad_norm": 0.38556161522865295, + "learning_rate": 9.65105286913054e-06, + "loss": 1.5387, + "step": 7691 + }, + { + "epoch": 0.7990028046120287, + "grad_norm": 0.40947332978248596, + "learning_rate": 9.641418773549405e-06, + "loss": 1.583, + "step": 7692 + }, + { + "epoch": 0.799106679131609, + "grad_norm": 0.4197182059288025, + "learning_rate": 9.631788975842381e-06, + "loss": 1.593, + "step": 7693 + }, + { + "epoch": 0.7992105536511893, + "grad_norm": 0.43122637271881104, + "learning_rate": 9.622163477034957e-06, + "loss": 1.6916, + "step": 7694 + }, + { + "epoch": 0.7993144281707697, + "grad_norm": 0.4150683283805847, + "learning_rate": 9.612542278152142e-06, + "loss": 1.6192, + "step": 7695 + }, + { + "epoch": 0.7994183026903501, + "grad_norm": 0.41880491375923157, + "learning_rate": 9.602925380218547e-06, + "loss": 1.6866, + "step": 7696 + }, + { + "epoch": 0.7995221772099304, + "grad_norm": 0.40711480379104614, + "learning_rate": 9.593312784258297e-06, + "loss": 1.7376, + "step": 7697 + }, + { + "epoch": 0.7996260517295107, + "grad_norm": 0.4136184751987457, + "learning_rate": 9.583704491295042e-06, + "loss": 1.6386, + "step": 7698 + }, + { + "epoch": 0.7997299262490911, + "grad_norm": 0.41841137409210205, + "learning_rate": 9.574100502352007e-06, + "loss": 1.7478, + "step": 7699 + }, + { + "epoch": 0.7998338007686715, + "grad_norm": 0.38687604665756226, + "learning_rate": 9.564500818451916e-06, + "loss": 1.4819, + "step": 7700 + }, + { + "epoch": 0.7999376752882518, + "grad_norm": 0.40951046347618103, + "learning_rate": 9.554905440617108e-06, + "loss": 1.5952, + "step": 7701 + }, + { + "epoch": 0.8000415498078322, + "grad_norm": 0.4178246855735779, + "learning_rate": 9.545314369869362e-06, + "loss": 1.7333, + "step": 7702 + }, + { + "epoch": 0.8001454243274125, + "grad_norm": 0.4111107289791107, + "learning_rate": 9.535727607230088e-06, + "loss": 1.7021, + "step": 7703 + }, + { + "epoch": 0.8002492988469928, + "grad_norm": 0.42014679312705994, + "learning_rate": 9.52614515372019e-06, + "loss": 1.5992, + "step": 7704 + }, + { + "epoch": 0.8003531733665732, + "grad_norm": 0.45557916164398193, + "learning_rate": 9.516567010360127e-06, + "loss": 1.2576, + "step": 7705 + }, + { + "epoch": 0.8004570478861536, + "grad_norm": 0.4034499228000641, + "learning_rate": 9.506993178169892e-06, + "loss": 1.5702, + "step": 7706 + }, + { + "epoch": 0.8005609224057338, + "grad_norm": 0.4066244661808014, + "learning_rate": 9.497423658169025e-06, + "loss": 1.5422, + "step": 7707 + }, + { + "epoch": 0.8006647969253142, + "grad_norm": 0.4864404499530792, + "learning_rate": 9.487858451376608e-06, + "loss": 1.7401, + "step": 7708 + }, + { + "epoch": 0.8007686714448946, + "grad_norm": 0.4497174024581909, + "learning_rate": 9.47829755881125e-06, + "loss": 1.7727, + "step": 7709 + }, + { + "epoch": 0.8008725459644749, + "grad_norm": 0.4629211723804474, + "learning_rate": 9.468740981491143e-06, + "loss": 1.6619, + "step": 7710 + }, + { + "epoch": 0.8009764204840553, + "grad_norm": 0.392501562833786, + "learning_rate": 9.459188720433937e-06, + "loss": 1.5847, + "step": 7711 + }, + { + "epoch": 0.8010802950036356, + "grad_norm": 0.47376659512519836, + "learning_rate": 9.449640776656915e-06, + "loss": 1.6943, + "step": 7712 + }, + { + "epoch": 0.8011841695232159, + "grad_norm": 0.40953293442726135, + "learning_rate": 9.440097151176846e-06, + "loss": 1.5879, + "step": 7713 + }, + { + "epoch": 0.8012880440427963, + "grad_norm": 0.40460994839668274, + "learning_rate": 9.430557845010058e-06, + "loss": 1.7309, + "step": 7714 + }, + { + "epoch": 0.8013919185623767, + "grad_norm": 0.4047517776489258, + "learning_rate": 9.421022859172397e-06, + "loss": 1.7228, + "step": 7715 + }, + { + "epoch": 0.801495793081957, + "grad_norm": 0.397476464509964, + "learning_rate": 9.411492194679277e-06, + "loss": 1.6439, + "step": 7716 + }, + { + "epoch": 0.8015996676015373, + "grad_norm": 0.39920222759246826, + "learning_rate": 9.401965852545636e-06, + "loss": 1.5571, + "step": 7717 + }, + { + "epoch": 0.8017035421211177, + "grad_norm": 0.4768927991390228, + "learning_rate": 9.39244383378594e-06, + "loss": 1.7649, + "step": 7718 + }, + { + "epoch": 0.801807416640698, + "grad_norm": 0.4142306447029114, + "learning_rate": 9.38292613941425e-06, + "loss": 1.3555, + "step": 7719 + }, + { + "epoch": 0.8019112911602784, + "grad_norm": 0.4120739996433258, + "learning_rate": 9.37341277044408e-06, + "loss": 1.6209, + "step": 7720 + }, + { + "epoch": 0.8020151656798588, + "grad_norm": 0.434986412525177, + "learning_rate": 9.363903727888556e-06, + "loss": 1.3753, + "step": 7721 + }, + { + "epoch": 0.802119040199439, + "grad_norm": 0.3884414732456207, + "learning_rate": 9.354399012760328e-06, + "loss": 1.4721, + "step": 7722 + }, + { + "epoch": 0.8022229147190194, + "grad_norm": 0.42061513662338257, + "learning_rate": 9.34489862607153e-06, + "loss": 1.5033, + "step": 7723 + }, + { + "epoch": 0.8023267892385998, + "grad_norm": 0.4704020023345947, + "learning_rate": 9.335402568833923e-06, + "loss": 1.8149, + "step": 7724 + }, + { + "epoch": 0.8024306637581802, + "grad_norm": 0.44218188524246216, + "learning_rate": 9.325910842058733e-06, + "loss": 1.7166, + "step": 7725 + }, + { + "epoch": 0.8025345382777604, + "grad_norm": 0.4181729555130005, + "learning_rate": 9.316423446756788e-06, + "loss": 1.4822, + "step": 7726 + }, + { + "epoch": 0.8026384127973408, + "grad_norm": 0.4159371256828308, + "learning_rate": 9.306940383938395e-06, + "loss": 1.7386, + "step": 7727 + }, + { + "epoch": 0.8027422873169212, + "grad_norm": 0.45085909962654114, + "learning_rate": 9.297461654613415e-06, + "loss": 1.6013, + "step": 7728 + }, + { + "epoch": 0.8028461618365015, + "grad_norm": 0.4149478077888489, + "learning_rate": 9.28798725979131e-06, + "loss": 1.6137, + "step": 7729 + }, + { + "epoch": 0.8029500363560819, + "grad_norm": 0.5053886771202087, + "learning_rate": 9.278517200480963e-06, + "loss": 1.8171, + "step": 7730 + }, + { + "epoch": 0.8030539108756622, + "grad_norm": 0.4231288731098175, + "learning_rate": 9.26905147769091e-06, + "loss": 1.6836, + "step": 7731 + }, + { + "epoch": 0.8031577853952425, + "grad_norm": 0.5030054450035095, + "learning_rate": 9.25959009242916e-06, + "loss": 1.9295, + "step": 7732 + }, + { + "epoch": 0.8032616599148229, + "grad_norm": 0.5075528025627136, + "learning_rate": 9.25013304570328e-06, + "loss": 1.8385, + "step": 7733 + }, + { + "epoch": 0.8033655344344033, + "grad_norm": 0.43307042121887207, + "learning_rate": 9.24068033852037e-06, + "loss": 1.5867, + "step": 7734 + }, + { + "epoch": 0.8034694089539836, + "grad_norm": 0.4354676306247711, + "learning_rate": 9.231231971887066e-06, + "loss": 1.6309, + "step": 7735 + }, + { + "epoch": 0.8035732834735639, + "grad_norm": 0.4056316912174225, + "learning_rate": 9.221787946809552e-06, + "loss": 1.5253, + "step": 7736 + }, + { + "epoch": 0.8036771579931443, + "grad_norm": 0.408130019903183, + "learning_rate": 9.212348264293519e-06, + "loss": 1.5669, + "step": 7737 + }, + { + "epoch": 0.8037810325127246, + "grad_norm": 0.4351346492767334, + "learning_rate": 9.202912925344276e-06, + "loss": 1.6559, + "step": 7738 + }, + { + "epoch": 0.803884907032305, + "grad_norm": 0.4343356788158417, + "learning_rate": 9.19348193096654e-06, + "loss": 1.7244, + "step": 7739 + }, + { + "epoch": 0.8039887815518854, + "grad_norm": 0.43080124258995056, + "learning_rate": 9.184055282164699e-06, + "loss": 1.5402, + "step": 7740 + }, + { + "epoch": 0.8040926560714656, + "grad_norm": 0.4280051290988922, + "learning_rate": 9.174632979942587e-06, + "loss": 1.6772, + "step": 7741 + }, + { + "epoch": 0.804196530591046, + "grad_norm": 0.45156073570251465, + "learning_rate": 9.165215025303614e-06, + "loss": 1.7684, + "step": 7742 + }, + { + "epoch": 0.8043004051106264, + "grad_norm": 0.5198978185653687, + "learning_rate": 9.15580141925072e-06, + "loss": 1.8452, + "step": 7743 + }, + { + "epoch": 0.8044042796302067, + "grad_norm": 0.4284294843673706, + "learning_rate": 9.146392162786377e-06, + "loss": 1.7204, + "step": 7744 + }, + { + "epoch": 0.804508154149787, + "grad_norm": 0.41050204634666443, + "learning_rate": 9.136987256912594e-06, + "loss": 1.4458, + "step": 7745 + }, + { + "epoch": 0.8046120286693674, + "grad_norm": 0.5094167590141296, + "learning_rate": 9.127586702630913e-06, + "loss": 1.7218, + "step": 7746 + }, + { + "epoch": 0.8047159031889477, + "grad_norm": 0.44446441531181335, + "learning_rate": 9.118190500942459e-06, + "loss": 1.7473, + "step": 7747 + }, + { + "epoch": 0.8048197777085281, + "grad_norm": 0.4276479482650757, + "learning_rate": 9.108798652847811e-06, + "loss": 1.6519, + "step": 7748 + }, + { + "epoch": 0.8049236522281085, + "grad_norm": 0.42676594853401184, + "learning_rate": 9.099411159347133e-06, + "loss": 1.5866, + "step": 7749 + }, + { + "epoch": 0.8050275267476887, + "grad_norm": 0.43534624576568604, + "learning_rate": 9.090028021440155e-06, + "loss": 1.8477, + "step": 7750 + }, + { + "epoch": 0.8051314012672691, + "grad_norm": 0.4531230330467224, + "learning_rate": 9.080649240126054e-06, + "loss": 1.6118, + "step": 7751 + }, + { + "epoch": 0.8052352757868495, + "grad_norm": 0.4191751182079315, + "learning_rate": 9.07127481640364e-06, + "loss": 1.5275, + "step": 7752 + }, + { + "epoch": 0.8053391503064299, + "grad_norm": 0.4555509090423584, + "learning_rate": 9.061904751271193e-06, + "loss": 1.6641, + "step": 7753 + }, + { + "epoch": 0.8054430248260102, + "grad_norm": 0.4250716269016266, + "learning_rate": 9.052539045726583e-06, + "loss": 1.7141, + "step": 7754 + }, + { + "epoch": 0.8055468993455905, + "grad_norm": 0.42984840273857117, + "learning_rate": 9.043177700767148e-06, + "loss": 1.6593, + "step": 7755 + }, + { + "epoch": 0.8056507738651709, + "grad_norm": 0.4200412929058075, + "learning_rate": 9.033820717389802e-06, + "loss": 1.6844, + "step": 7756 + }, + { + "epoch": 0.8057546483847512, + "grad_norm": 0.3976275622844696, + "learning_rate": 9.024468096591022e-06, + "loss": 1.6062, + "step": 7757 + }, + { + "epoch": 0.8058585229043316, + "grad_norm": 0.40078333020210266, + "learning_rate": 9.015119839366748e-06, + "loss": 1.6187, + "step": 7758 + }, + { + "epoch": 0.805962397423912, + "grad_norm": 0.4719092845916748, + "learning_rate": 9.005775946712524e-06, + "loss": 1.6813, + "step": 7759 + }, + { + "epoch": 0.8060662719434922, + "grad_norm": 0.48558080196380615, + "learning_rate": 8.9964364196234e-06, + "loss": 1.8797, + "step": 7760 + }, + { + "epoch": 0.8061701464630726, + "grad_norm": 0.4854564368724823, + "learning_rate": 8.987101259093955e-06, + "loss": 1.6896, + "step": 7761 + }, + { + "epoch": 0.806274020982653, + "grad_norm": 0.4403029680252075, + "learning_rate": 8.977770466118313e-06, + "loss": 1.574, + "step": 7762 + }, + { + "epoch": 0.8063778955022333, + "grad_norm": 0.40083712339401245, + "learning_rate": 8.968444041690132e-06, + "loss": 1.5353, + "step": 7763 + }, + { + "epoch": 0.8064817700218136, + "grad_norm": 0.41334569454193115, + "learning_rate": 8.959121986802599e-06, + "loss": 1.494, + "step": 7764 + }, + { + "epoch": 0.806585644541394, + "grad_norm": 0.4323657751083374, + "learning_rate": 8.949804302448433e-06, + "loss": 1.6712, + "step": 7765 + }, + { + "epoch": 0.8066895190609743, + "grad_norm": 0.4492452144622803, + "learning_rate": 8.940490989619932e-06, + "loss": 1.6931, + "step": 7766 + }, + { + "epoch": 0.8067933935805547, + "grad_norm": 0.39172980189323425, + "learning_rate": 8.931182049308833e-06, + "loss": 1.616, + "step": 7767 + }, + { + "epoch": 0.8068972681001351, + "grad_norm": 0.40300890803337097, + "learning_rate": 8.921877482506518e-06, + "loss": 1.5328, + "step": 7768 + }, + { + "epoch": 0.8070011426197153, + "grad_norm": 0.38939663767814636, + "learning_rate": 8.912577290203822e-06, + "loss": 1.5393, + "step": 7769 + }, + { + "epoch": 0.8071050171392957, + "grad_norm": 0.39137062430381775, + "learning_rate": 8.90328147339115e-06, + "loss": 1.5114, + "step": 7770 + }, + { + "epoch": 0.8072088916588761, + "grad_norm": 0.39022791385650635, + "learning_rate": 8.893990033058442e-06, + "loss": 1.6231, + "step": 7771 + }, + { + "epoch": 0.8073127661784564, + "grad_norm": 0.4040229320526123, + "learning_rate": 8.884702970195152e-06, + "loss": 1.6152, + "step": 7772 + }, + { + "epoch": 0.8074166406980368, + "grad_norm": 0.45910122990608215, + "learning_rate": 8.875420285790281e-06, + "loss": 1.7177, + "step": 7773 + }, + { + "epoch": 0.8075205152176171, + "grad_norm": 0.4154045879840851, + "learning_rate": 8.866141980832354e-06, + "loss": 1.6195, + "step": 7774 + }, + { + "epoch": 0.8076243897371974, + "grad_norm": 0.397510826587677, + "learning_rate": 8.856868056309475e-06, + "loss": 1.4594, + "step": 7775 + }, + { + "epoch": 0.8077282642567778, + "grad_norm": 0.48494023084640503, + "learning_rate": 8.84759851320921e-06, + "loss": 1.7521, + "step": 7776 + }, + { + "epoch": 0.8078321387763582, + "grad_norm": 0.4340880811214447, + "learning_rate": 8.838333352518685e-06, + "loss": 1.5742, + "step": 7777 + }, + { + "epoch": 0.8079360132959386, + "grad_norm": 0.46240857243537903, + "learning_rate": 8.829072575224601e-06, + "loss": 1.6479, + "step": 7778 + }, + { + "epoch": 0.8080398878155188, + "grad_norm": 0.4113903343677521, + "learning_rate": 8.819816182313139e-06, + "loss": 1.4088, + "step": 7779 + }, + { + "epoch": 0.8081437623350992, + "grad_norm": 0.4529806077480316, + "learning_rate": 8.810564174770043e-06, + "loss": 1.5949, + "step": 7780 + }, + { + "epoch": 0.8082476368546796, + "grad_norm": 0.39212316274642944, + "learning_rate": 8.801316553580551e-06, + "loss": 1.6899, + "step": 7781 + }, + { + "epoch": 0.8083515113742599, + "grad_norm": 0.3910139501094818, + "learning_rate": 8.792073319729516e-06, + "loss": 1.5462, + "step": 7782 + }, + { + "epoch": 0.8084553858938403, + "grad_norm": 0.40568485856056213, + "learning_rate": 8.782834474201217e-06, + "loss": 1.8192, + "step": 7783 + }, + { + "epoch": 0.8085592604134206, + "grad_norm": 0.4968739151954651, + "learning_rate": 8.773600017979538e-06, + "loss": 1.5268, + "step": 7784 + }, + { + "epoch": 0.8086631349330009, + "grad_norm": 0.46858468651771545, + "learning_rate": 8.764369952047896e-06, + "loss": 1.7217, + "step": 7785 + }, + { + "epoch": 0.8087670094525813, + "grad_norm": 0.4213848412036896, + "learning_rate": 8.755144277389183e-06, + "loss": 1.7892, + "step": 7786 + }, + { + "epoch": 0.8088708839721617, + "grad_norm": 0.38827750086784363, + "learning_rate": 8.745922994985895e-06, + "loss": 1.5297, + "step": 7787 + }, + { + "epoch": 0.8089747584917419, + "grad_norm": 0.45694899559020996, + "learning_rate": 8.736706105820008e-06, + "loss": 1.7274, + "step": 7788 + }, + { + "epoch": 0.8090786330113223, + "grad_norm": 0.40677914023399353, + "learning_rate": 8.727493610873062e-06, + "loss": 1.597, + "step": 7789 + }, + { + "epoch": 0.8091825075309027, + "grad_norm": 0.4284796714782715, + "learning_rate": 8.718285511126106e-06, + "loss": 1.7429, + "step": 7790 + }, + { + "epoch": 0.809286382050483, + "grad_norm": 0.4217987656593323, + "learning_rate": 8.709081807559732e-06, + "loss": 1.6746, + "step": 7791 + }, + { + "epoch": 0.8093902565700634, + "grad_norm": 0.4426036775112152, + "learning_rate": 8.699882501154067e-06, + "loss": 1.7955, + "step": 7792 + }, + { + "epoch": 0.8094941310896437, + "grad_norm": 0.43109604716300964, + "learning_rate": 8.690687592888747e-06, + "loss": 1.741, + "step": 7793 + }, + { + "epoch": 0.809598005609224, + "grad_norm": 0.4453687369823456, + "learning_rate": 8.681497083742995e-06, + "loss": 1.5883, + "step": 7794 + }, + { + "epoch": 0.8097018801288044, + "grad_norm": 0.47059452533721924, + "learning_rate": 8.672310974695485e-06, + "loss": 1.7313, + "step": 7795 + }, + { + "epoch": 0.8098057546483848, + "grad_norm": 0.39277252554893494, + "learning_rate": 8.663129266724501e-06, + "loss": 1.6054, + "step": 7796 + }, + { + "epoch": 0.809909629167965, + "grad_norm": 0.47614243626594543, + "learning_rate": 8.65395196080781e-06, + "loss": 1.9036, + "step": 7797 + }, + { + "epoch": 0.8100135036875454, + "grad_norm": 0.43281370401382446, + "learning_rate": 8.644779057922725e-06, + "loss": 1.7245, + "step": 7798 + }, + { + "epoch": 0.8101173782071258, + "grad_norm": 0.41983285546302795, + "learning_rate": 8.635610559046087e-06, + "loss": 1.6524, + "step": 7799 + }, + { + "epoch": 0.8102212527267061, + "grad_norm": 0.41546353697776794, + "learning_rate": 8.62644646515427e-06, + "loss": 1.6953, + "step": 7800 + }, + { + "epoch": 0.8103251272462865, + "grad_norm": 0.3985000252723694, + "learning_rate": 8.617286777223177e-06, + "loss": 1.5696, + "step": 7801 + }, + { + "epoch": 0.8104290017658669, + "grad_norm": 0.4149910509586334, + "learning_rate": 8.608131496228228e-06, + "loss": 1.5954, + "step": 7802 + }, + { + "epoch": 0.8105328762854472, + "grad_norm": 0.5904526710510254, + "learning_rate": 8.598980623144427e-06, + "loss": 2.0503, + "step": 7803 + }, + { + "epoch": 0.8106367508050275, + "grad_norm": 0.44825097918510437, + "learning_rate": 8.589834158946253e-06, + "loss": 1.8498, + "step": 7804 + }, + { + "epoch": 0.8107406253246079, + "grad_norm": 0.4529954791069031, + "learning_rate": 8.580692104607712e-06, + "loss": 1.6659, + "step": 7805 + }, + { + "epoch": 0.8108444998441883, + "grad_norm": 0.41834184527397156, + "learning_rate": 8.571554461102389e-06, + "loss": 1.625, + "step": 7806 + }, + { + "epoch": 0.8109483743637685, + "grad_norm": 0.4493601620197296, + "learning_rate": 8.562421229403356e-06, + "loss": 1.7467, + "step": 7807 + }, + { + "epoch": 0.8110522488833489, + "grad_norm": 0.4033777117729187, + "learning_rate": 8.553292410483243e-06, + "loss": 1.6585, + "step": 7808 + }, + { + "epoch": 0.8111561234029293, + "grad_norm": 0.4082568883895874, + "learning_rate": 8.544168005314191e-06, + "loss": 1.6903, + "step": 7809 + }, + { + "epoch": 0.8112599979225096, + "grad_norm": 0.44599470496177673, + "learning_rate": 8.535048014867875e-06, + "loss": 1.6525, + "step": 7810 + }, + { + "epoch": 0.81136387244209, + "grad_norm": 0.38098323345184326, + "learning_rate": 8.525932440115509e-06, + "loss": 1.4948, + "step": 7811 + }, + { + "epoch": 0.8114677469616703, + "grad_norm": 0.4076872766017914, + "learning_rate": 8.516821282027814e-06, + "loss": 1.5873, + "step": 7812 + }, + { + "epoch": 0.8115716214812506, + "grad_norm": 0.40935343503952026, + "learning_rate": 8.507714541575096e-06, + "loss": 1.5856, + "step": 7813 + }, + { + "epoch": 0.811675496000831, + "grad_norm": 0.3904459476470947, + "learning_rate": 8.498612219727104e-06, + "loss": 1.6061, + "step": 7814 + }, + { + "epoch": 0.8117793705204114, + "grad_norm": 0.41350409388542175, + "learning_rate": 8.4895143174532e-06, + "loss": 1.6078, + "step": 7815 + }, + { + "epoch": 0.8118832450399917, + "grad_norm": 0.4305771589279175, + "learning_rate": 8.480420835722224e-06, + "loss": 1.6971, + "step": 7816 + }, + { + "epoch": 0.811987119559572, + "grad_norm": 0.43974366784095764, + "learning_rate": 8.471331775502566e-06, + "loss": 1.7409, + "step": 7817 + }, + { + "epoch": 0.8120909940791524, + "grad_norm": 0.40193986892700195, + "learning_rate": 8.462247137762142e-06, + "loss": 1.6476, + "step": 7818 + }, + { + "epoch": 0.8121948685987327, + "grad_norm": 0.42793333530426025, + "learning_rate": 8.453166923468387e-06, + "loss": 1.6937, + "step": 7819 + }, + { + "epoch": 0.8122987431183131, + "grad_norm": 0.46195828914642334, + "learning_rate": 8.444091133588273e-06, + "loss": 1.7928, + "step": 7820 + }, + { + "epoch": 0.8124026176378935, + "grad_norm": 0.45351237058639526, + "learning_rate": 8.435019769088298e-06, + "loss": 1.6076, + "step": 7821 + }, + { + "epoch": 0.8125064921574737, + "grad_norm": 0.41227513551712036, + "learning_rate": 8.42595283093452e-06, + "loss": 1.8033, + "step": 7822 + }, + { + "epoch": 0.8126103666770541, + "grad_norm": 0.4189743101596832, + "learning_rate": 8.416890320092451e-06, + "loss": 1.613, + "step": 7823 + }, + { + "epoch": 0.8127142411966345, + "grad_norm": 0.44498223066329956, + "learning_rate": 8.407832237527214e-06, + "loss": 1.8238, + "step": 7824 + }, + { + "epoch": 0.8128181157162148, + "grad_norm": 0.4206719994544983, + "learning_rate": 8.398778584203414e-06, + "loss": 1.6164, + "step": 7825 + }, + { + "epoch": 0.8129219902357951, + "grad_norm": 0.4327350854873657, + "learning_rate": 8.389729361085192e-06, + "loss": 1.6903, + "step": 7826 + }, + { + "epoch": 0.8130258647553755, + "grad_norm": 0.4112006425857544, + "learning_rate": 8.380684569136222e-06, + "loss": 1.5846, + "step": 7827 + }, + { + "epoch": 0.8131297392749558, + "grad_norm": 0.4160413444042206, + "learning_rate": 8.371644209319702e-06, + "loss": 1.4752, + "step": 7828 + }, + { + "epoch": 0.8132336137945362, + "grad_norm": 0.4746900796890259, + "learning_rate": 8.362608282598355e-06, + "loss": 1.6837, + "step": 7829 + }, + { + "epoch": 0.8133374883141166, + "grad_norm": 0.4345802664756775, + "learning_rate": 8.353576789934436e-06, + "loss": 1.6555, + "step": 7830 + }, + { + "epoch": 0.813441362833697, + "grad_norm": 0.43879222869873047, + "learning_rate": 8.344549732289741e-06, + "loss": 1.7406, + "step": 7831 + }, + { + "epoch": 0.8135452373532772, + "grad_norm": 0.41482990980148315, + "learning_rate": 8.33552711062559e-06, + "loss": 1.7404, + "step": 7832 + }, + { + "epoch": 0.8136491118728576, + "grad_norm": 0.4466431736946106, + "learning_rate": 8.326508925902781e-06, + "loss": 1.4813, + "step": 7833 + }, + { + "epoch": 0.813752986392438, + "grad_norm": 0.38617411255836487, + "learning_rate": 8.31749517908172e-06, + "loss": 1.3518, + "step": 7834 + }, + { + "epoch": 0.8138568609120183, + "grad_norm": 0.449827641248703, + "learning_rate": 8.308485871122284e-06, + "loss": 1.723, + "step": 7835 + }, + { + "epoch": 0.8139607354315986, + "grad_norm": 0.4846034348011017, + "learning_rate": 8.2994810029839e-06, + "loss": 1.8044, + "step": 7836 + }, + { + "epoch": 0.814064609951179, + "grad_norm": 0.4224514663219452, + "learning_rate": 8.290480575625508e-06, + "loss": 1.6315, + "step": 7837 + }, + { + "epoch": 0.8141684844707593, + "grad_norm": 0.4300024211406708, + "learning_rate": 8.28148459000559e-06, + "loss": 1.6415, + "step": 7838 + }, + { + "epoch": 0.8142723589903397, + "grad_norm": 0.4380504786968231, + "learning_rate": 8.272493047082147e-06, + "loss": 1.7884, + "step": 7839 + }, + { + "epoch": 0.8143762335099201, + "grad_norm": 0.4354472756385803, + "learning_rate": 8.263505947812694e-06, + "loss": 1.6999, + "step": 7840 + }, + { + "epoch": 0.8144801080295003, + "grad_norm": 0.42169252038002014, + "learning_rate": 8.254523293154325e-06, + "loss": 1.7044, + "step": 7841 + }, + { + "epoch": 0.8145839825490807, + "grad_norm": 0.4186304211616516, + "learning_rate": 8.245545084063572e-06, + "loss": 1.6382, + "step": 7842 + }, + { + "epoch": 0.8146878570686611, + "grad_norm": 0.41696295142173767, + "learning_rate": 8.236571321496584e-06, + "loss": 1.657, + "step": 7843 + }, + { + "epoch": 0.8147917315882414, + "grad_norm": 0.40641024708747864, + "learning_rate": 8.227602006408986e-06, + "loss": 1.6226, + "step": 7844 + }, + { + "epoch": 0.8148956061078217, + "grad_norm": 0.42552778124809265, + "learning_rate": 8.218637139755931e-06, + "loss": 1.6346, + "step": 7845 + }, + { + "epoch": 0.8149994806274021, + "grad_norm": 0.4113791584968567, + "learning_rate": 8.209676722492116e-06, + "loss": 1.6784, + "step": 7846 + }, + { + "epoch": 0.8151033551469824, + "grad_norm": 0.4235158860683441, + "learning_rate": 8.200720755571755e-06, + "loss": 1.6905, + "step": 7847 + }, + { + "epoch": 0.8152072296665628, + "grad_norm": 0.47491201758384705, + "learning_rate": 8.191769239948588e-06, + "loss": 1.8831, + "step": 7848 + }, + { + "epoch": 0.8153111041861432, + "grad_norm": 0.4474998712539673, + "learning_rate": 8.182822176575865e-06, + "loss": 1.6919, + "step": 7849 + }, + { + "epoch": 0.8154149787057234, + "grad_norm": 0.43246933817863464, + "learning_rate": 8.173879566406418e-06, + "loss": 1.6247, + "step": 7850 + }, + { + "epoch": 0.8155188532253038, + "grad_norm": 0.4082406461238861, + "learning_rate": 8.164941410392519e-06, + "loss": 1.662, + "step": 7851 + }, + { + "epoch": 0.8156227277448842, + "grad_norm": 0.3958854079246521, + "learning_rate": 8.15600770948604e-06, + "loss": 1.5751, + "step": 7852 + }, + { + "epoch": 0.8157266022644645, + "grad_norm": 0.4203862249851227, + "learning_rate": 8.147078464638346e-06, + "loss": 1.5519, + "step": 7853 + }, + { + "epoch": 0.8158304767840449, + "grad_norm": 0.5035269260406494, + "learning_rate": 8.138153676800336e-06, + "loss": 1.7116, + "step": 7854 + }, + { + "epoch": 0.8159343513036252, + "grad_norm": 0.4069529175758362, + "learning_rate": 8.129233346922422e-06, + "loss": 1.5981, + "step": 7855 + }, + { + "epoch": 0.8160382258232056, + "grad_norm": 0.44691234827041626, + "learning_rate": 8.120317475954536e-06, + "loss": 1.7425, + "step": 7856 + }, + { + "epoch": 0.8161421003427859, + "grad_norm": 0.4124663770198822, + "learning_rate": 8.111406064846194e-06, + "loss": 1.5632, + "step": 7857 + }, + { + "epoch": 0.8162459748623663, + "grad_norm": 0.4278232157230377, + "learning_rate": 8.102499114546335e-06, + "loss": 1.5863, + "step": 7858 + }, + { + "epoch": 0.8163498493819467, + "grad_norm": 0.4516265392303467, + "learning_rate": 8.093596626003519e-06, + "loss": 1.7937, + "step": 7859 + }, + { + "epoch": 0.8164537239015269, + "grad_norm": 0.42853647470474243, + "learning_rate": 8.084698600165796e-06, + "loss": 1.745, + "step": 7860 + }, + { + "epoch": 0.8165575984211073, + "grad_norm": 0.3971802294254303, + "learning_rate": 8.075805037980688e-06, + "loss": 1.528, + "step": 7861 + }, + { + "epoch": 0.8166614729406877, + "grad_norm": 0.4206622242927551, + "learning_rate": 8.066915940395341e-06, + "loss": 1.6715, + "step": 7862 + }, + { + "epoch": 0.816765347460268, + "grad_norm": 0.4111405313014984, + "learning_rate": 8.058031308356357e-06, + "loss": 1.5871, + "step": 7863 + }, + { + "epoch": 0.8168692219798483, + "grad_norm": 0.5833050608634949, + "learning_rate": 8.049151142809874e-06, + "loss": 1.8095, + "step": 7864 + }, + { + "epoch": 0.8169730964994287, + "grad_norm": 0.42305901646614075, + "learning_rate": 8.040275444701567e-06, + "loss": 1.712, + "step": 7865 + }, + { + "epoch": 0.817076971019009, + "grad_norm": 0.46678563952445984, + "learning_rate": 8.031404214976628e-06, + "loss": 1.8676, + "step": 7866 + }, + { + "epoch": 0.8171808455385894, + "grad_norm": 0.45080798864364624, + "learning_rate": 8.02253745457977e-06, + "loss": 1.5683, + "step": 7867 + }, + { + "epoch": 0.8172847200581698, + "grad_norm": 0.41853803396224976, + "learning_rate": 8.013675164455225e-06, + "loss": 1.6236, + "step": 7868 + }, + { + "epoch": 0.81738859457775, + "grad_norm": 0.4322271943092346, + "learning_rate": 8.004817345546794e-06, + "loss": 1.677, + "step": 7869 + }, + { + "epoch": 0.8174924690973304, + "grad_norm": 0.4160740077495575, + "learning_rate": 7.995963998797717e-06, + "loss": 1.6949, + "step": 7870 + }, + { + "epoch": 0.8175963436169108, + "grad_norm": 0.4284444749355316, + "learning_rate": 7.98711512515084e-06, + "loss": 1.6774, + "step": 7871 + }, + { + "epoch": 0.8177002181364911, + "grad_norm": 0.3733249306678772, + "learning_rate": 7.978270725548493e-06, + "loss": 1.4368, + "step": 7872 + }, + { + "epoch": 0.8178040926560715, + "grad_norm": 0.4796290695667267, + "learning_rate": 7.969430800932526e-06, + "loss": 1.6731, + "step": 7873 + }, + { + "epoch": 0.8179079671756518, + "grad_norm": 0.4326523542404175, + "learning_rate": 7.960595352244332e-06, + "loss": 1.7251, + "step": 7874 + }, + { + "epoch": 0.8180118416952321, + "grad_norm": 0.4698193669319153, + "learning_rate": 7.951764380424808e-06, + "loss": 1.8268, + "step": 7875 + }, + { + "epoch": 0.8181157162148125, + "grad_norm": 0.42041996121406555, + "learning_rate": 7.942937886414392e-06, + "loss": 1.4004, + "step": 7876 + }, + { + "epoch": 0.8182195907343929, + "grad_norm": 0.4212832450866699, + "learning_rate": 7.934115871153025e-06, + "loss": 1.6694, + "step": 7877 + }, + { + "epoch": 0.8183234652539731, + "grad_norm": 0.39500170946121216, + "learning_rate": 7.925298335580211e-06, + "loss": 1.5698, + "step": 7878 + }, + { + "epoch": 0.8184273397735535, + "grad_norm": 0.4193446934223175, + "learning_rate": 7.916485280634906e-06, + "loss": 1.6886, + "step": 7879 + }, + { + "epoch": 0.8185312142931339, + "grad_norm": 0.44972357153892517, + "learning_rate": 7.907676707255668e-06, + "loss": 1.8019, + "step": 7880 + }, + { + "epoch": 0.8186350888127143, + "grad_norm": 0.42009323835372925, + "learning_rate": 7.898872616380526e-06, + "loss": 1.5613, + "step": 7881 + }, + { + "epoch": 0.8187389633322946, + "grad_norm": 0.4435664713382721, + "learning_rate": 7.890073008947047e-06, + "loss": 1.6753, + "step": 7882 + }, + { + "epoch": 0.818842837851875, + "grad_norm": 0.40186628699302673, + "learning_rate": 7.881277885892324e-06, + "loss": 1.4869, + "step": 7883 + }, + { + "epoch": 0.8189467123714553, + "grad_norm": 0.40945637226104736, + "learning_rate": 7.87248724815296e-06, + "loss": 1.641, + "step": 7884 + }, + { + "epoch": 0.8190505868910356, + "grad_norm": 0.42431339621543884, + "learning_rate": 7.863701096665116e-06, + "loss": 1.708, + "step": 7885 + }, + { + "epoch": 0.819154461410616, + "grad_norm": 0.3984614610671997, + "learning_rate": 7.854919432364406e-06, + "loss": 1.545, + "step": 7886 + }, + { + "epoch": 0.8192583359301964, + "grad_norm": 0.42371925711631775, + "learning_rate": 7.846142256186046e-06, + "loss": 1.6084, + "step": 7887 + }, + { + "epoch": 0.8193622104497766, + "grad_norm": 0.40940725803375244, + "learning_rate": 7.837369569064734e-06, + "loss": 1.4237, + "step": 7888 + }, + { + "epoch": 0.819466084969357, + "grad_norm": 0.4526590406894684, + "learning_rate": 7.828601371934657e-06, + "loss": 1.7202, + "step": 7889 + }, + { + "epoch": 0.8195699594889374, + "grad_norm": 0.4891595244407654, + "learning_rate": 7.819837665729596e-06, + "loss": 1.8862, + "step": 7890 + }, + { + "epoch": 0.8196738340085177, + "grad_norm": 0.44362369179725647, + "learning_rate": 7.811078451382808e-06, + "loss": 1.776, + "step": 7891 + }, + { + "epoch": 0.8197777085280981, + "grad_norm": 0.46131572127342224, + "learning_rate": 7.802323729827077e-06, + "loss": 1.7542, + "step": 7892 + }, + { + "epoch": 0.8198815830476784, + "grad_norm": 0.45027032494544983, + "learning_rate": 7.79357350199471e-06, + "loss": 1.3346, + "step": 7893 + }, + { + "epoch": 0.8199854575672587, + "grad_norm": 0.41758599877357483, + "learning_rate": 7.784827768817548e-06, + "loss": 1.6569, + "step": 7894 + }, + { + "epoch": 0.8200893320868391, + "grad_norm": 0.3943220376968384, + "learning_rate": 7.77608653122694e-06, + "loss": 1.5361, + "step": 7895 + }, + { + "epoch": 0.8201932066064195, + "grad_norm": 0.40477508306503296, + "learning_rate": 7.767349790153738e-06, + "loss": 1.5803, + "step": 7896 + }, + { + "epoch": 0.8202970811259997, + "grad_norm": 0.4200827479362488, + "learning_rate": 7.758617546528386e-06, + "loss": 1.6642, + "step": 7897 + }, + { + "epoch": 0.8204009556455801, + "grad_norm": 0.4615240693092346, + "learning_rate": 7.74988980128074e-06, + "loss": 1.8093, + "step": 7898 + }, + { + "epoch": 0.8205048301651605, + "grad_norm": 0.42218905687332153, + "learning_rate": 7.741166555340284e-06, + "loss": 1.7515, + "step": 7899 + }, + { + "epoch": 0.8206087046847408, + "grad_norm": 0.4505891799926758, + "learning_rate": 7.732447809635956e-06, + "loss": 1.6826, + "step": 7900 + }, + { + "epoch": 0.8207125792043212, + "grad_norm": 0.4403258264064789, + "learning_rate": 7.723733565096236e-06, + "loss": 1.6249, + "step": 7901 + }, + { + "epoch": 0.8208164537239016, + "grad_norm": 0.4211053252220154, + "learning_rate": 7.715023822649126e-06, + "loss": 1.563, + "step": 7902 + }, + { + "epoch": 0.8209203282434818, + "grad_norm": 0.4513636529445648, + "learning_rate": 7.706318583222144e-06, + "loss": 1.787, + "step": 7903 + }, + { + "epoch": 0.8210242027630622, + "grad_norm": 0.3969414532184601, + "learning_rate": 7.697617847742328e-06, + "loss": 1.6397, + "step": 7904 + }, + { + "epoch": 0.8211280772826426, + "grad_norm": 0.43942591547966003, + "learning_rate": 7.688921617136224e-06, + "loss": 1.6617, + "step": 7905 + }, + { + "epoch": 0.8212319518022229, + "grad_norm": 0.3998488187789917, + "learning_rate": 7.68022989232996e-06, + "loss": 1.5171, + "step": 7906 + }, + { + "epoch": 0.8213358263218032, + "grad_norm": 0.44349002838134766, + "learning_rate": 7.671542674249077e-06, + "loss": 1.7494, + "step": 7907 + }, + { + "epoch": 0.8214397008413836, + "grad_norm": 0.4117303788661957, + "learning_rate": 7.662859963818737e-06, + "loss": 1.6006, + "step": 7908 + }, + { + "epoch": 0.821543575360964, + "grad_norm": 0.4332650303840637, + "learning_rate": 7.65418176196357e-06, + "loss": 1.4621, + "step": 7909 + }, + { + "epoch": 0.8216474498805443, + "grad_norm": 0.3984694182872772, + "learning_rate": 7.645508069607731e-06, + "loss": 1.5073, + "step": 7910 + }, + { + "epoch": 0.8217513244001247, + "grad_norm": 0.477058082818985, + "learning_rate": 7.636838887674908e-06, + "loss": 1.9018, + "step": 7911 + }, + { + "epoch": 0.821855198919705, + "grad_norm": 0.4433257281780243, + "learning_rate": 7.6281742170882854e-06, + "loss": 1.6646, + "step": 7912 + }, + { + "epoch": 0.8219590734392853, + "grad_norm": 0.4137663245201111, + "learning_rate": 7.619514058770622e-06, + "loss": 1.482, + "step": 7913 + }, + { + "epoch": 0.8220629479588657, + "grad_norm": 0.4989883005619049, + "learning_rate": 7.610858413644106e-06, + "loss": 1.7999, + "step": 7914 + }, + { + "epoch": 0.8221668224784461, + "grad_norm": 0.4211665391921997, + "learning_rate": 7.60220728263053e-06, + "loss": 1.6357, + "step": 7915 + }, + { + "epoch": 0.8222706969980264, + "grad_norm": 0.4524006247520447, + "learning_rate": 7.5935606666511785e-06, + "loss": 1.7431, + "step": 7916 + }, + { + "epoch": 0.8223745715176067, + "grad_norm": 0.42954227328300476, + "learning_rate": 7.584918566626808e-06, + "loss": 1.8379, + "step": 7917 + }, + { + "epoch": 0.8224784460371871, + "grad_norm": 0.4114231467247009, + "learning_rate": 7.576280983477768e-06, + "loss": 1.7044, + "step": 7918 + }, + { + "epoch": 0.8225823205567674, + "grad_norm": 0.43601930141448975, + "learning_rate": 7.5676479181238935e-06, + "loss": 1.6908, + "step": 7919 + }, + { + "epoch": 0.8226861950763478, + "grad_norm": 0.42451170086860657, + "learning_rate": 7.5590193714845215e-06, + "loss": 1.7343, + "step": 7920 + }, + { + "epoch": 0.8227900695959282, + "grad_norm": 0.4314807653427124, + "learning_rate": 7.5503953444785395e-06, + "loss": 1.6858, + "step": 7921 + }, + { + "epoch": 0.8228939441155084, + "grad_norm": 0.41289034485816956, + "learning_rate": 7.541775838024335e-06, + "loss": 1.4288, + "step": 7922 + }, + { + "epoch": 0.8229978186350888, + "grad_norm": 0.4084930419921875, + "learning_rate": 7.533160853039811e-06, + "loss": 1.6902, + "step": 7923 + }, + { + "epoch": 0.8231016931546692, + "grad_norm": 0.4590340852737427, + "learning_rate": 7.5245503904423974e-06, + "loss": 1.6764, + "step": 7924 + }, + { + "epoch": 0.8232055676742495, + "grad_norm": 0.46251264214515686, + "learning_rate": 7.515944451149065e-06, + "loss": 1.5257, + "step": 7925 + }, + { + "epoch": 0.8233094421938298, + "grad_norm": 0.4301720857620239, + "learning_rate": 7.507343036076236e-06, + "loss": 1.7414, + "step": 7926 + }, + { + "epoch": 0.8234133167134102, + "grad_norm": 0.42009609937667847, + "learning_rate": 7.498746146139934e-06, + "loss": 1.6366, + "step": 7927 + }, + { + "epoch": 0.8235171912329905, + "grad_norm": 0.40405720472335815, + "learning_rate": 7.490153782255643e-06, + "loss": 1.5959, + "step": 7928 + }, + { + "epoch": 0.8236210657525709, + "grad_norm": 0.4580763280391693, + "learning_rate": 7.481565945338381e-06, + "loss": 1.8361, + "step": 7929 + }, + { + "epoch": 0.8237249402721513, + "grad_norm": 0.43223121762275696, + "learning_rate": 7.472982636302694e-06, + "loss": 1.592, + "step": 7930 + }, + { + "epoch": 0.8238288147917315, + "grad_norm": 0.48288586735725403, + "learning_rate": 7.464403856062629e-06, + "loss": 1.7912, + "step": 7931 + }, + { + "epoch": 0.8239326893113119, + "grad_norm": 0.4333949685096741, + "learning_rate": 7.455829605531767e-06, + "loss": 1.7699, + "step": 7932 + }, + { + "epoch": 0.8240365638308923, + "grad_norm": 0.40936028957366943, + "learning_rate": 7.4472598856231815e-06, + "loss": 1.5568, + "step": 7933 + }, + { + "epoch": 0.8241404383504727, + "grad_norm": 0.4204612672328949, + "learning_rate": 7.438694697249504e-06, + "loss": 1.5514, + "step": 7934 + }, + { + "epoch": 0.824244312870053, + "grad_norm": 0.4096861183643341, + "learning_rate": 7.430134041322856e-06, + "loss": 1.6633, + "step": 7935 + }, + { + "epoch": 0.8243481873896333, + "grad_norm": 0.4757747948169708, + "learning_rate": 7.421577918754868e-06, + "loss": 1.8486, + "step": 7936 + }, + { + "epoch": 0.8244520619092137, + "grad_norm": 0.44562995433807373, + "learning_rate": 7.413026330456713e-06, + "loss": 1.6866, + "step": 7937 + }, + { + "epoch": 0.824555936428794, + "grad_norm": 0.4388675391674042, + "learning_rate": 7.404479277339055e-06, + "loss": 1.6298, + "step": 7938 + }, + { + "epoch": 0.8246598109483744, + "grad_norm": 0.4150373041629791, + "learning_rate": 7.3959367603121055e-06, + "loss": 1.6743, + "step": 7939 + }, + { + "epoch": 0.8247636854679548, + "grad_norm": 0.40958264470100403, + "learning_rate": 7.387398780285548e-06, + "loss": 1.5683, + "step": 7940 + }, + { + "epoch": 0.824867559987535, + "grad_norm": 0.40461811423301697, + "learning_rate": 7.378865338168655e-06, + "loss": 1.5644, + "step": 7941 + }, + { + "epoch": 0.8249714345071154, + "grad_norm": 0.4974653124809265, + "learning_rate": 7.370336434870123e-06, + "loss": 1.8986, + "step": 7942 + }, + { + "epoch": 0.8250753090266958, + "grad_norm": 0.40290409326553345, + "learning_rate": 7.361812071298246e-06, + "loss": 1.6532, + "step": 7943 + }, + { + "epoch": 0.8251791835462761, + "grad_norm": 0.4083233177661896, + "learning_rate": 7.353292248360805e-06, + "loss": 1.6532, + "step": 7944 + }, + { + "epoch": 0.8252830580658564, + "grad_norm": 0.47831931710243225, + "learning_rate": 7.344776966965061e-06, + "loss": 1.7635, + "step": 7945 + }, + { + "epoch": 0.8253869325854368, + "grad_norm": 0.4501749277114868, + "learning_rate": 7.336266228017857e-06, + "loss": 1.5736, + "step": 7946 + }, + { + "epoch": 0.8254908071050171, + "grad_norm": 0.43580162525177, + "learning_rate": 7.327760032425507e-06, + "loss": 1.7677, + "step": 7947 + }, + { + "epoch": 0.8255946816245975, + "grad_norm": 0.4431309401988983, + "learning_rate": 7.319258381093863e-06, + "loss": 1.541, + "step": 7948 + }, + { + "epoch": 0.8256985561441779, + "grad_norm": 0.46647512912750244, + "learning_rate": 7.310761274928274e-06, + "loss": 1.6897, + "step": 7949 + }, + { + "epoch": 0.8258024306637581, + "grad_norm": 0.4248170554637909, + "learning_rate": 7.302268714833621e-06, + "loss": 1.6534, + "step": 7950 + }, + { + "epoch": 0.8259063051833385, + "grad_norm": 0.461207777261734, + "learning_rate": 7.2937807017142975e-06, + "loss": 1.4829, + "step": 7951 + }, + { + "epoch": 0.8260101797029189, + "grad_norm": 0.42655470967292786, + "learning_rate": 7.285297236474198e-06, + "loss": 1.7697, + "step": 7952 + }, + { + "epoch": 0.8261140542224992, + "grad_norm": 0.40300819277763367, + "learning_rate": 7.27681832001677e-06, + "loss": 1.4717, + "step": 7953 + }, + { + "epoch": 0.8262179287420796, + "grad_norm": 0.4207182228565216, + "learning_rate": 7.268343953244921e-06, + "loss": 1.5658, + "step": 7954 + }, + { + "epoch": 0.8263218032616599, + "grad_norm": 0.516309916973114, + "learning_rate": 7.259874137061135e-06, + "loss": 1.7525, + "step": 7955 + }, + { + "epoch": 0.8264256777812402, + "grad_norm": 0.4138142466545105, + "learning_rate": 7.2514088723673605e-06, + "loss": 1.6619, + "step": 7956 + }, + { + "epoch": 0.8265295523008206, + "grad_norm": 0.4462479054927826, + "learning_rate": 7.2429481600650936e-06, + "loss": 1.7732, + "step": 7957 + }, + { + "epoch": 0.826633426820401, + "grad_norm": 0.46467795968055725, + "learning_rate": 7.234492001055332e-06, + "loss": 1.762, + "step": 7958 + }, + { + "epoch": 0.8267373013399814, + "grad_norm": 0.4307834506034851, + "learning_rate": 7.2260403962385706e-06, + "loss": 1.5749, + "step": 7959 + }, + { + "epoch": 0.8268411758595616, + "grad_norm": 0.43813708424568176, + "learning_rate": 7.217593346514884e-06, + "loss": 1.7171, + "step": 7960 + }, + { + "epoch": 0.826945050379142, + "grad_norm": 0.42140957713127136, + "learning_rate": 7.209150852783769e-06, + "loss": 1.6369, + "step": 7961 + }, + { + "epoch": 0.8270489248987224, + "grad_norm": 0.42076048254966736, + "learning_rate": 7.200712915944313e-06, + "loss": 1.6839, + "step": 7962 + }, + { + "epoch": 0.8271527994183027, + "grad_norm": 0.4501379728317261, + "learning_rate": 7.192279536895091e-06, + "loss": 1.7524, + "step": 7963 + }, + { + "epoch": 0.827256673937883, + "grad_norm": 0.39931780099868774, + "learning_rate": 7.183850716534179e-06, + "loss": 1.6049, + "step": 7964 + }, + { + "epoch": 0.8273605484574634, + "grad_norm": 0.39734897017478943, + "learning_rate": 7.175426455759193e-06, + "loss": 1.5222, + "step": 7965 + }, + { + "epoch": 0.8274644229770437, + "grad_norm": 0.434510201215744, + "learning_rate": 7.167006755467237e-06, + "loss": 1.7086, + "step": 7966 + }, + { + "epoch": 0.8275682974966241, + "grad_norm": 0.42094168066978455, + "learning_rate": 7.15859161655496e-06, + "loss": 1.7143, + "step": 7967 + }, + { + "epoch": 0.8276721720162045, + "grad_norm": 0.4152793884277344, + "learning_rate": 7.150181039918485e-06, + "loss": 1.3494, + "step": 7968 + }, + { + "epoch": 0.8277760465357847, + "grad_norm": 0.4016823470592499, + "learning_rate": 7.141775026453506e-06, + "loss": 1.5688, + "step": 7969 + }, + { + "epoch": 0.8278799210553651, + "grad_norm": 0.45909714698791504, + "learning_rate": 7.133373577055163e-06, + "loss": 1.7456, + "step": 7970 + }, + { + "epoch": 0.8279837955749455, + "grad_norm": 0.4085950255393982, + "learning_rate": 7.124976692618168e-06, + "loss": 1.5706, + "step": 7971 + }, + { + "epoch": 0.8280876700945258, + "grad_norm": 0.419292151927948, + "learning_rate": 7.116584374036733e-06, + "loss": 1.6361, + "step": 7972 + }, + { + "epoch": 0.8281915446141062, + "grad_norm": 0.4057331383228302, + "learning_rate": 7.108196622204538e-06, + "loss": 1.5935, + "step": 7973 + }, + { + "epoch": 0.8282954191336865, + "grad_norm": 0.4193360507488251, + "learning_rate": 7.099813438014841e-06, + "loss": 1.4914, + "step": 7974 + }, + { + "epoch": 0.8283992936532668, + "grad_norm": 0.42038246989250183, + "learning_rate": 7.091434822360377e-06, + "loss": 1.6914, + "step": 7975 + }, + { + "epoch": 0.8285031681728472, + "grad_norm": 0.41948550939559937, + "learning_rate": 7.083060776133404e-06, + "loss": 1.7179, + "step": 7976 + }, + { + "epoch": 0.8286070426924276, + "grad_norm": 0.46123793721199036, + "learning_rate": 7.0746913002257e-06, + "loss": 1.7011, + "step": 7977 + }, + { + "epoch": 0.8287109172120078, + "grad_norm": 0.4170924127101898, + "learning_rate": 7.066326395528539e-06, + "loss": 1.6866, + "step": 7978 + }, + { + "epoch": 0.8288147917315882, + "grad_norm": 0.4177165627479553, + "learning_rate": 7.05796606293272e-06, + "loss": 1.6695, + "step": 7979 + }, + { + "epoch": 0.8289186662511686, + "grad_norm": 0.46339476108551025, + "learning_rate": 7.049610303328541e-06, + "loss": 1.7386, + "step": 7980 + }, + { + "epoch": 0.8290225407707489, + "grad_norm": 0.45544731616973877, + "learning_rate": 7.04125911760587e-06, + "loss": 1.6829, + "step": 7981 + }, + { + "epoch": 0.8291264152903293, + "grad_norm": 0.4197863042354584, + "learning_rate": 7.032912506653983e-06, + "loss": 1.5823, + "step": 7982 + }, + { + "epoch": 0.8292302898099096, + "grad_norm": 0.42813071608543396, + "learning_rate": 7.024570471361769e-06, + "loss": 1.5883, + "step": 7983 + }, + { + "epoch": 0.82933416432949, + "grad_norm": 0.43591973185539246, + "learning_rate": 7.016233012617579e-06, + "loss": 1.4265, + "step": 7984 + }, + { + "epoch": 0.8294380388490703, + "grad_norm": 0.41332176327705383, + "learning_rate": 7.0079001313092865e-06, + "loss": 1.7404, + "step": 7985 + }, + { + "epoch": 0.8295419133686507, + "grad_norm": 0.40137556195259094, + "learning_rate": 6.99957182832428e-06, + "loss": 1.6176, + "step": 7986 + }, + { + "epoch": 0.8296457878882311, + "grad_norm": 0.45006173849105835, + "learning_rate": 6.9912481045494475e-06, + "loss": 1.6707, + "step": 7987 + }, + { + "epoch": 0.8297496624078113, + "grad_norm": 0.4188838005065918, + "learning_rate": 6.9829289608712264e-06, + "loss": 1.6712, + "step": 7988 + }, + { + "epoch": 0.8298535369273917, + "grad_norm": 0.4287783205509186, + "learning_rate": 6.974614398175511e-06, + "loss": 1.6211, + "step": 7989 + }, + { + "epoch": 0.8299574114469721, + "grad_norm": 0.47514259815216064, + "learning_rate": 6.966304417347758e-06, + "loss": 1.6593, + "step": 7990 + }, + { + "epoch": 0.8300612859665524, + "grad_norm": 0.40735095739364624, + "learning_rate": 6.95799901927291e-06, + "loss": 1.6225, + "step": 7991 + }, + { + "epoch": 0.8301651604861328, + "grad_norm": 0.39932069182395935, + "learning_rate": 6.949698204835426e-06, + "loss": 1.5756, + "step": 7992 + }, + { + "epoch": 0.8302690350057131, + "grad_norm": 0.43335703015327454, + "learning_rate": 6.941401974919276e-06, + "loss": 1.6939, + "step": 7993 + }, + { + "epoch": 0.8303729095252934, + "grad_norm": 0.43645966053009033, + "learning_rate": 6.9331103304079494e-06, + "loss": 1.7476, + "step": 7994 + }, + { + "epoch": 0.8304767840448738, + "grad_norm": 0.38558638095855713, + "learning_rate": 6.924823272184439e-06, + "loss": 1.449, + "step": 7995 + }, + { + "epoch": 0.8305806585644542, + "grad_norm": 0.3955431878566742, + "learning_rate": 6.916540801131232e-06, + "loss": 1.6482, + "step": 7996 + }, + { + "epoch": 0.8306845330840344, + "grad_norm": 0.3958597779273987, + "learning_rate": 6.908262918130392e-06, + "loss": 1.5417, + "step": 7997 + }, + { + "epoch": 0.8307884076036148, + "grad_norm": 0.4344055950641632, + "learning_rate": 6.899989624063402e-06, + "loss": 1.5201, + "step": 7998 + }, + { + "epoch": 0.8308922821231952, + "grad_norm": 0.4599206745624542, + "learning_rate": 6.8917209198113345e-06, + "loss": 1.7178, + "step": 7999 + }, + { + "epoch": 0.8309961566427755, + "grad_norm": 0.3974829614162445, + "learning_rate": 6.883456806254746e-06, + "loss": 1.5518, + "step": 8000 + }, + { + "epoch": 0.8311000311623559, + "grad_norm": 0.39066168665885925, + "learning_rate": 6.87519728427366e-06, + "loss": 1.6246, + "step": 8001 + }, + { + "epoch": 0.8312039056819363, + "grad_norm": 0.4144991338253021, + "learning_rate": 6.866942354747685e-06, + "loss": 1.7103, + "step": 8002 + }, + { + "epoch": 0.8313077802015165, + "grad_norm": 0.4020610749721527, + "learning_rate": 6.8586920185559015e-06, + "loss": 1.5555, + "step": 8003 + }, + { + "epoch": 0.8314116547210969, + "grad_norm": 0.3927983045578003, + "learning_rate": 6.850446276576905e-06, + "loss": 1.5676, + "step": 8004 + }, + { + "epoch": 0.8315155292406773, + "grad_norm": 0.4108089804649353, + "learning_rate": 6.8422051296888e-06, + "loss": 1.5645, + "step": 8005 + }, + { + "epoch": 0.8316194037602576, + "grad_norm": 0.38096117973327637, + "learning_rate": 6.833968578769201e-06, + "loss": 1.4893, + "step": 8006 + }, + { + "epoch": 0.8317232782798379, + "grad_norm": 0.43577104806900024, + "learning_rate": 6.825736624695245e-06, + "loss": 1.5007, + "step": 8007 + }, + { + "epoch": 0.8318271527994183, + "grad_norm": 0.40206262469291687, + "learning_rate": 6.817509268343553e-06, + "loss": 1.3456, + "step": 8008 + }, + { + "epoch": 0.8319310273189986, + "grad_norm": 0.4439072906970978, + "learning_rate": 6.809286510590307e-06, + "loss": 1.5847, + "step": 8009 + }, + { + "epoch": 0.832034901838579, + "grad_norm": 0.42791223526000977, + "learning_rate": 6.80106835231113e-06, + "loss": 1.7015, + "step": 8010 + }, + { + "epoch": 0.8321387763581594, + "grad_norm": 0.3975352644920349, + "learning_rate": 6.792854794381215e-06, + "loss": 1.6057, + "step": 8011 + }, + { + "epoch": 0.8322426508777397, + "grad_norm": 0.42390820384025574, + "learning_rate": 6.784645837675241e-06, + "loss": 1.6555, + "step": 8012 + }, + { + "epoch": 0.83234652539732, + "grad_norm": 0.4322831928730011, + "learning_rate": 6.776441483067386e-06, + "loss": 1.6785, + "step": 8013 + }, + { + "epoch": 0.8324503999169004, + "grad_norm": 0.443946897983551, + "learning_rate": 6.768241731431363e-06, + "loss": 1.6525, + "step": 8014 + }, + { + "epoch": 0.8325542744364808, + "grad_norm": 0.4141833186149597, + "learning_rate": 6.760046583640362e-06, + "loss": 1.6921, + "step": 8015 + }, + { + "epoch": 0.832658148956061, + "grad_norm": 0.4575175344944, + "learning_rate": 6.751856040567134e-06, + "loss": 1.7602, + "step": 8016 + }, + { + "epoch": 0.8327620234756414, + "grad_norm": 0.5201651453971863, + "learning_rate": 6.7436701030838715e-06, + "loss": 1.9422, + "step": 8017 + }, + { + "epoch": 0.8328658979952218, + "grad_norm": 0.4121319353580475, + "learning_rate": 6.735488772062337e-06, + "loss": 1.7875, + "step": 8018 + }, + { + "epoch": 0.8329697725148021, + "grad_norm": 0.40832269191741943, + "learning_rate": 6.727312048373774e-06, + "loss": 1.5723, + "step": 8019 + }, + { + "epoch": 0.8330736470343825, + "grad_norm": 0.44910913705825806, + "learning_rate": 6.71913993288894e-06, + "loss": 1.536, + "step": 8020 + }, + { + "epoch": 0.8331775215539629, + "grad_norm": 0.4083802103996277, + "learning_rate": 6.710972426478096e-06, + "loss": 1.5312, + "step": 8021 + }, + { + "epoch": 0.8332813960735431, + "grad_norm": 0.41337552666664124, + "learning_rate": 6.7028095300110225e-06, + "loss": 1.6728, + "step": 8022 + }, + { + "epoch": 0.8333852705931235, + "grad_norm": 0.39118510484695435, + "learning_rate": 6.6946512443570085e-06, + "loss": 1.4944, + "step": 8023 + }, + { + "epoch": 0.8334891451127039, + "grad_norm": 0.4005904793739319, + "learning_rate": 6.686497570384825e-06, + "loss": 1.5726, + "step": 8024 + }, + { + "epoch": 0.8335930196322842, + "grad_norm": 0.4205056428909302, + "learning_rate": 6.678348508962812e-06, + "loss": 1.6453, + "step": 8025 + }, + { + "epoch": 0.8336968941518645, + "grad_norm": 0.4279542863368988, + "learning_rate": 6.670204060958746e-06, + "loss": 1.6386, + "step": 8026 + }, + { + "epoch": 0.8338007686714449, + "grad_norm": 0.44344186782836914, + "learning_rate": 6.662064227239967e-06, + "loss": 1.6229, + "step": 8027 + }, + { + "epoch": 0.8339046431910252, + "grad_norm": 0.4007083773612976, + "learning_rate": 6.653929008673315e-06, + "loss": 1.7046, + "step": 8028 + }, + { + "epoch": 0.8340085177106056, + "grad_norm": 0.4301755130290985, + "learning_rate": 6.645798406125087e-06, + "loss": 1.6751, + "step": 8029 + }, + { + "epoch": 0.834112392230186, + "grad_norm": 0.39321601390838623, + "learning_rate": 6.637672420461161e-06, + "loss": 1.4603, + "step": 8030 + }, + { + "epoch": 0.8342162667497662, + "grad_norm": 0.4112393856048584, + "learning_rate": 6.629551052546884e-06, + "loss": 1.4504, + "step": 8031 + }, + { + "epoch": 0.8343201412693466, + "grad_norm": 0.4138662815093994, + "learning_rate": 6.6214343032471204e-06, + "loss": 1.6946, + "step": 8032 + }, + { + "epoch": 0.834424015788927, + "grad_norm": 0.43226510286331177, + "learning_rate": 6.613322173426239e-06, + "loss": 1.7209, + "step": 8033 + }, + { + "epoch": 0.8345278903085073, + "grad_norm": 0.42375436425209045, + "learning_rate": 6.605214663948111e-06, + "loss": 1.8021, + "step": 8034 + }, + { + "epoch": 0.8346317648280877, + "grad_norm": 0.4388313889503479, + "learning_rate": 6.597111775676135e-06, + "loss": 1.6669, + "step": 8035 + }, + { + "epoch": 0.834735639347668, + "grad_norm": 0.4430168569087982, + "learning_rate": 6.589013509473185e-06, + "loss": 1.5882, + "step": 8036 + }, + { + "epoch": 0.8348395138672484, + "grad_norm": 0.38932767510414124, + "learning_rate": 6.58091986620169e-06, + "loss": 1.4626, + "step": 8037 + }, + { + "epoch": 0.8349433883868287, + "grad_norm": 0.42232656478881836, + "learning_rate": 6.5728308467235435e-06, + "loss": 1.676, + "step": 8038 + }, + { + "epoch": 0.8350472629064091, + "grad_norm": 0.39752089977264404, + "learning_rate": 6.5647464519001725e-06, + "loss": 1.3398, + "step": 8039 + }, + { + "epoch": 0.8351511374259895, + "grad_norm": 0.4039764106273651, + "learning_rate": 6.556666682592494e-06, + "loss": 1.5381, + "step": 8040 + }, + { + "epoch": 0.8352550119455697, + "grad_norm": 0.41539785265922546, + "learning_rate": 6.548591539660942e-06, + "loss": 1.6253, + "step": 8041 + }, + { + "epoch": 0.8353588864651501, + "grad_norm": 0.45972946286201477, + "learning_rate": 6.540521023965457e-06, + "loss": 1.7125, + "step": 8042 + }, + { + "epoch": 0.8354627609847305, + "grad_norm": 0.40128108859062195, + "learning_rate": 6.532455136365478e-06, + "loss": 1.645, + "step": 8043 + }, + { + "epoch": 0.8355666355043108, + "grad_norm": 0.43023881316185, + "learning_rate": 6.524393877719987e-06, + "loss": 1.6461, + "step": 8044 + }, + { + "epoch": 0.8356705100238911, + "grad_norm": 0.4410925507545471, + "learning_rate": 6.516337248887399e-06, + "loss": 1.8415, + "step": 8045 + }, + { + "epoch": 0.8357743845434715, + "grad_norm": 0.4021972417831421, + "learning_rate": 6.5082852507257265e-06, + "loss": 1.4099, + "step": 8046 + }, + { + "epoch": 0.8358782590630518, + "grad_norm": 0.4312531352043152, + "learning_rate": 6.500237884092425e-06, + "loss": 1.6968, + "step": 8047 + }, + { + "epoch": 0.8359821335826322, + "grad_norm": 0.40367355942726135, + "learning_rate": 6.492195149844471e-06, + "loss": 1.66, + "step": 8048 + }, + { + "epoch": 0.8360860081022126, + "grad_norm": 0.4290336072444916, + "learning_rate": 6.484157048838368e-06, + "loss": 1.657, + "step": 8049 + }, + { + "epoch": 0.8361898826217928, + "grad_norm": 0.4326249063014984, + "learning_rate": 6.476123581930099e-06, + "loss": 1.7264, + "step": 8050 + }, + { + "epoch": 0.8362937571413732, + "grad_norm": 0.42618846893310547, + "learning_rate": 6.468094749975167e-06, + "loss": 1.798, + "step": 8051 + }, + { + "epoch": 0.8363976316609536, + "grad_norm": 0.4222290515899658, + "learning_rate": 6.460070553828573e-06, + "loss": 1.5829, + "step": 8052 + }, + { + "epoch": 0.8365015061805339, + "grad_norm": 0.3985992670059204, + "learning_rate": 6.4520509943448656e-06, + "loss": 1.5542, + "step": 8053 + }, + { + "epoch": 0.8366053807001143, + "grad_norm": 0.3930535614490509, + "learning_rate": 6.444036072378018e-06, + "loss": 1.5923, + "step": 8054 + }, + { + "epoch": 0.8367092552196946, + "grad_norm": 0.4032153785228729, + "learning_rate": 6.436025788781586e-06, + "loss": 1.6131, + "step": 8055 + }, + { + "epoch": 0.8368131297392749, + "grad_norm": 0.38635045289993286, + "learning_rate": 6.428020144408608e-06, + "loss": 1.5907, + "step": 8056 + }, + { + "epoch": 0.8369170042588553, + "grad_norm": 0.43229156732559204, + "learning_rate": 6.420019140111588e-06, + "loss": 1.6621, + "step": 8057 + }, + { + "epoch": 0.8370208787784357, + "grad_norm": 0.43635618686676025, + "learning_rate": 6.412022776742604e-06, + "loss": 1.5021, + "step": 8058 + }, + { + "epoch": 0.8371247532980159, + "grad_norm": 0.395163893699646, + "learning_rate": 6.404031055153198e-06, + "loss": 1.6401, + "step": 8059 + }, + { + "epoch": 0.8372286278175963, + "grad_norm": 0.3980249762535095, + "learning_rate": 6.396043976194416e-06, + "loss": 1.4499, + "step": 8060 + }, + { + "epoch": 0.8373325023371767, + "grad_norm": 0.4174276888370514, + "learning_rate": 6.3880615407168335e-06, + "loss": 1.6367, + "step": 8061 + }, + { + "epoch": 0.8374363768567571, + "grad_norm": 0.4133899509906769, + "learning_rate": 6.380083749570498e-06, + "loss": 1.5884, + "step": 8062 + }, + { + "epoch": 0.8375402513763374, + "grad_norm": 0.41283613443374634, + "learning_rate": 6.372110603605014e-06, + "loss": 1.4089, + "step": 8063 + }, + { + "epoch": 0.8376441258959177, + "grad_norm": 0.463218629360199, + "learning_rate": 6.364142103669418e-06, + "loss": 1.6824, + "step": 8064 + }, + { + "epoch": 0.8377480004154981, + "grad_norm": 0.39653337001800537, + "learning_rate": 6.356178250612327e-06, + "loss": 1.7319, + "step": 8065 + }, + { + "epoch": 0.8378518749350784, + "grad_norm": 0.42945095896720886, + "learning_rate": 6.348219045281822e-06, + "loss": 1.6687, + "step": 8066 + }, + { + "epoch": 0.8379557494546588, + "grad_norm": 0.4201321005821228, + "learning_rate": 6.340264488525488e-06, + "loss": 1.6577, + "step": 8067 + }, + { + "epoch": 0.8380596239742392, + "grad_norm": 0.44182446599006653, + "learning_rate": 6.332314581190424e-06, + "loss": 1.7371, + "step": 8068 + }, + { + "epoch": 0.8381634984938194, + "grad_norm": 0.4279153048992157, + "learning_rate": 6.324369324123242e-06, + "loss": 1.8348, + "step": 8069 + }, + { + "epoch": 0.8382673730133998, + "grad_norm": 0.42748069763183594, + "learning_rate": 6.316428718170036e-06, + "loss": 1.7954, + "step": 8070 + }, + { + "epoch": 0.8383712475329802, + "grad_norm": 0.4482438862323761, + "learning_rate": 6.308492764176421e-06, + "loss": 1.4829, + "step": 8071 + }, + { + "epoch": 0.8384751220525605, + "grad_norm": 0.41090673208236694, + "learning_rate": 6.300561462987542e-06, + "loss": 1.469, + "step": 8072 + }, + { + "epoch": 0.8385789965721409, + "grad_norm": 0.4080744981765747, + "learning_rate": 6.292634815447978e-06, + "loss": 1.5294, + "step": 8073 + }, + { + "epoch": 0.8386828710917212, + "grad_norm": 0.4303456246852875, + "learning_rate": 6.2847128224018835e-06, + "loss": 1.5216, + "step": 8074 + }, + { + "epoch": 0.8387867456113015, + "grad_norm": 0.44199666380882263, + "learning_rate": 6.276795484692882e-06, + "loss": 1.7961, + "step": 8075 + }, + { + "epoch": 0.8388906201308819, + "grad_norm": 0.407693088054657, + "learning_rate": 6.268882803164106e-06, + "loss": 1.6753, + "step": 8076 + }, + { + "epoch": 0.8389944946504623, + "grad_norm": 0.4050787389278412, + "learning_rate": 6.260974778658202e-06, + "loss": 1.4427, + "step": 8077 + }, + { + "epoch": 0.8390983691700425, + "grad_norm": 0.4364643692970276, + "learning_rate": 6.253071412017298e-06, + "loss": 1.7915, + "step": 8078 + }, + { + "epoch": 0.8392022436896229, + "grad_norm": 0.4825323522090912, + "learning_rate": 6.2451727040830525e-06, + "loss": 1.8103, + "step": 8079 + }, + { + "epoch": 0.8393061182092033, + "grad_norm": 0.42931079864501953, + "learning_rate": 6.237278655696605e-06, + "loss": 1.4995, + "step": 8080 + }, + { + "epoch": 0.8394099927287836, + "grad_norm": 0.43772071599960327, + "learning_rate": 6.229389267698638e-06, + "loss": 1.737, + "step": 8081 + }, + { + "epoch": 0.839513867248364, + "grad_norm": 0.4058978259563446, + "learning_rate": 6.2215045409292794e-06, + "loss": 1.5779, + "step": 8082 + }, + { + "epoch": 0.8396177417679443, + "grad_norm": 0.4402966797351837, + "learning_rate": 6.213624476228191e-06, + "loss": 1.6885, + "step": 8083 + }, + { + "epoch": 0.8397216162875246, + "grad_norm": 0.4273587167263031, + "learning_rate": 6.205749074434569e-06, + "loss": 1.5655, + "step": 8084 + }, + { + "epoch": 0.839825490807105, + "grad_norm": 0.40787985920906067, + "learning_rate": 6.197878336387042e-06, + "loss": 1.5628, + "step": 8085 + }, + { + "epoch": 0.8399293653266854, + "grad_norm": 0.42607665061950684, + "learning_rate": 6.190012262923811e-06, + "loss": 1.5581, + "step": 8086 + }, + { + "epoch": 0.8400332398462657, + "grad_norm": 0.4342617690563202, + "learning_rate": 6.182150854882546e-06, + "loss": 1.8488, + "step": 8087 + }, + { + "epoch": 0.840137114365846, + "grad_norm": 0.3898141086101532, + "learning_rate": 6.1742941131004205e-06, + "loss": 1.5469, + "step": 8088 + }, + { + "epoch": 0.8402409888854264, + "grad_norm": 0.43741726875305176, + "learning_rate": 6.166442038414122e-06, + "loss": 1.756, + "step": 8089 + }, + { + "epoch": 0.8403448634050068, + "grad_norm": 0.44850122928619385, + "learning_rate": 6.1585946316598195e-06, + "loss": 1.5767, + "step": 8090 + }, + { + "epoch": 0.8404487379245871, + "grad_norm": 0.4059605896472931, + "learning_rate": 6.150751893673229e-06, + "loss": 1.3907, + "step": 8091 + }, + { + "epoch": 0.8405526124441675, + "grad_norm": 0.46018004417419434, + "learning_rate": 6.142913825289509e-06, + "loss": 1.8505, + "step": 8092 + }, + { + "epoch": 0.8406564869637478, + "grad_norm": 0.4265289306640625, + "learning_rate": 6.135080427343376e-06, + "loss": 1.5951, + "step": 8093 + }, + { + "epoch": 0.8407603614833281, + "grad_norm": 0.4429461658000946, + "learning_rate": 6.1272517006690174e-06, + "loss": 1.7048, + "step": 8094 + }, + { + "epoch": 0.8408642360029085, + "grad_norm": 0.4366631805896759, + "learning_rate": 6.1194276461001345e-06, + "loss": 1.6959, + "step": 8095 + }, + { + "epoch": 0.8409681105224889, + "grad_norm": 0.48970597982406616, + "learning_rate": 6.111608264469926e-06, + "loss": 1.6371, + "step": 8096 + }, + { + "epoch": 0.8410719850420691, + "grad_norm": 0.3888145983219147, + "learning_rate": 6.103793556611093e-06, + "loss": 1.6457, + "step": 8097 + }, + { + "epoch": 0.8411758595616495, + "grad_norm": 0.39202946424484253, + "learning_rate": 6.095983523355841e-06, + "loss": 1.5726, + "step": 8098 + }, + { + "epoch": 0.8412797340812299, + "grad_norm": 0.41915374994277954, + "learning_rate": 6.088178165535874e-06, + "loss": 1.6642, + "step": 8099 + }, + { + "epoch": 0.8413836086008102, + "grad_norm": 0.3962395489215851, + "learning_rate": 6.080377483982424e-06, + "loss": 1.6482, + "step": 8100 + }, + { + "epoch": 0.8414874831203906, + "grad_norm": 0.3973849415779114, + "learning_rate": 6.072581479526168e-06, + "loss": 1.5789, + "step": 8101 + }, + { + "epoch": 0.841591357639971, + "grad_norm": 0.46176597476005554, + "learning_rate": 6.064790152997346e-06, + "loss": 1.7686, + "step": 8102 + }, + { + "epoch": 0.8416952321595512, + "grad_norm": 0.39481309056282043, + "learning_rate": 6.0570035052256615e-06, + "loss": 1.4962, + "step": 8103 + }, + { + "epoch": 0.8417991066791316, + "grad_norm": 0.42713215947151184, + "learning_rate": 6.049221537040339e-06, + "loss": 1.5899, + "step": 8104 + }, + { + "epoch": 0.841902981198712, + "grad_norm": 0.4113538861274719, + "learning_rate": 6.041444249270089e-06, + "loss": 1.7044, + "step": 8105 + }, + { + "epoch": 0.8420068557182923, + "grad_norm": 0.414631724357605, + "learning_rate": 6.033671642743144e-06, + "loss": 1.6839, + "step": 8106 + }, + { + "epoch": 0.8421107302378726, + "grad_norm": 0.48907044529914856, + "learning_rate": 6.025903718287212e-06, + "loss": 1.6526, + "step": 8107 + }, + { + "epoch": 0.842214604757453, + "grad_norm": 0.45432183146476746, + "learning_rate": 6.018140476729517e-06, + "loss": 1.6792, + "step": 8108 + }, + { + "epoch": 0.8423184792770333, + "grad_norm": 0.4079488217830658, + "learning_rate": 6.010381918896807e-06, + "loss": 1.7154, + "step": 8109 + }, + { + "epoch": 0.8424223537966137, + "grad_norm": 0.42770206928253174, + "learning_rate": 6.002628045615283e-06, + "loss": 1.6739, + "step": 8110 + }, + { + "epoch": 0.8425262283161941, + "grad_norm": 0.46013393998146057, + "learning_rate": 5.994878857710667e-06, + "loss": 1.7719, + "step": 8111 + }, + { + "epoch": 0.8426301028357743, + "grad_norm": 0.39818307757377625, + "learning_rate": 5.9871343560082225e-06, + "loss": 1.6761, + "step": 8112 + }, + { + "epoch": 0.8427339773553547, + "grad_norm": 0.3909452557563782, + "learning_rate": 5.9793945413326335e-06, + "loss": 1.4098, + "step": 8113 + }, + { + "epoch": 0.8428378518749351, + "grad_norm": 0.4382929801940918, + "learning_rate": 5.9716594145081625e-06, + "loss": 1.6621, + "step": 8114 + }, + { + "epoch": 0.8429417263945155, + "grad_norm": 0.4438725709915161, + "learning_rate": 5.963928976358518e-06, + "loss": 1.7382, + "step": 8115 + }, + { + "epoch": 0.8430456009140957, + "grad_norm": 0.4258117973804474, + "learning_rate": 5.956203227706963e-06, + "loss": 1.7194, + "step": 8116 + }, + { + "epoch": 0.8431494754336761, + "grad_norm": 0.486128032207489, + "learning_rate": 5.948482169376202e-06, + "loss": 1.7364, + "step": 8117 + }, + { + "epoch": 0.8432533499532565, + "grad_norm": 0.4413326382637024, + "learning_rate": 5.940765802188458e-06, + "loss": 1.4118, + "step": 8118 + }, + { + "epoch": 0.8433572244728368, + "grad_norm": 0.3983282148838043, + "learning_rate": 5.933054126965509e-06, + "loss": 1.3972, + "step": 8119 + }, + { + "epoch": 0.8434610989924172, + "grad_norm": 0.4958374500274658, + "learning_rate": 5.925347144528537e-06, + "loss": 1.8372, + "step": 8120 + }, + { + "epoch": 0.8435649735119976, + "grad_norm": 0.44721078872680664, + "learning_rate": 5.917644855698306e-06, + "loss": 1.7691, + "step": 8121 + }, + { + "epoch": 0.8436688480315778, + "grad_norm": 0.40920016169548035, + "learning_rate": 5.9099472612950455e-06, + "loss": 1.6897, + "step": 8122 + }, + { + "epoch": 0.8437727225511582, + "grad_norm": 0.44964519143104553, + "learning_rate": 5.902254362138487e-06, + "loss": 1.7894, + "step": 8123 + }, + { + "epoch": 0.8438765970707386, + "grad_norm": 0.4365478456020355, + "learning_rate": 5.894566159047865e-06, + "loss": 1.5154, + "step": 8124 + }, + { + "epoch": 0.8439804715903189, + "grad_norm": 0.42581430077552795, + "learning_rate": 5.886882652841907e-06, + "loss": 1.613, + "step": 8125 + }, + { + "epoch": 0.8440843461098992, + "grad_norm": 0.4058164656162262, + "learning_rate": 5.879203844338848e-06, + "loss": 1.6085, + "step": 8126 + }, + { + "epoch": 0.8441882206294796, + "grad_norm": 0.40692541003227234, + "learning_rate": 5.871529734356423e-06, + "loss": 1.6069, + "step": 8127 + }, + { + "epoch": 0.8442920951490599, + "grad_norm": 0.4061817228794098, + "learning_rate": 5.863860323711879e-06, + "loss": 1.539, + "step": 8128 + }, + { + "epoch": 0.8443959696686403, + "grad_norm": 0.489007830619812, + "learning_rate": 5.856195613221921e-06, + "loss": 1.3045, + "step": 8129 + }, + { + "epoch": 0.8444998441882207, + "grad_norm": 0.43798279762268066, + "learning_rate": 5.848535603702798e-06, + "loss": 1.5672, + "step": 8130 + }, + { + "epoch": 0.8446037187078009, + "grad_norm": 0.43467089533805847, + "learning_rate": 5.840880295970247e-06, + "loss": 1.7033, + "step": 8131 + }, + { + "epoch": 0.8447075932273813, + "grad_norm": 0.4537743031978607, + "learning_rate": 5.833229690839481e-06, + "loss": 1.7776, + "step": 8132 + }, + { + "epoch": 0.8448114677469617, + "grad_norm": 0.43950802087783813, + "learning_rate": 5.825583789125244e-06, + "loss": 1.7372, + "step": 8133 + }, + { + "epoch": 0.844915342266542, + "grad_norm": 0.4260973334312439, + "learning_rate": 5.817942591641762e-06, + "loss": 1.7629, + "step": 8134 + }, + { + "epoch": 0.8450192167861224, + "grad_norm": 0.40139877796173096, + "learning_rate": 5.810306099202755e-06, + "loss": 1.6475, + "step": 8135 + }, + { + "epoch": 0.8451230913057027, + "grad_norm": 0.45263901352882385, + "learning_rate": 5.802674312621448e-06, + "loss": 1.477, + "step": 8136 + }, + { + "epoch": 0.845226965825283, + "grad_norm": 0.4598293900489807, + "learning_rate": 5.795047232710599e-06, + "loss": 1.586, + "step": 8137 + }, + { + "epoch": 0.8453308403448634, + "grad_norm": 0.4252384901046753, + "learning_rate": 5.787424860282392e-06, + "loss": 1.5229, + "step": 8138 + }, + { + "epoch": 0.8454347148644438, + "grad_norm": 0.42608946561813354, + "learning_rate": 5.779807196148556e-06, + "loss": 1.7491, + "step": 8139 + }, + { + "epoch": 0.8455385893840242, + "grad_norm": 0.4330217242240906, + "learning_rate": 5.772194241120338e-06, + "loss": 1.5724, + "step": 8140 + }, + { + "epoch": 0.8456424639036044, + "grad_norm": 0.4884481132030487, + "learning_rate": 5.764585996008437e-06, + "loss": 1.7322, + "step": 8141 + }, + { + "epoch": 0.8457463384231848, + "grad_norm": 0.45733243227005005, + "learning_rate": 5.756982461623084e-06, + "loss": 1.763, + "step": 8142 + }, + { + "epoch": 0.8458502129427652, + "grad_norm": 0.5284457802772522, + "learning_rate": 5.749383638773986e-06, + "loss": 1.9331, + "step": 8143 + }, + { + "epoch": 0.8459540874623455, + "grad_norm": 0.4064652621746063, + "learning_rate": 5.7417895282703635e-06, + "loss": 1.6423, + "step": 8144 + }, + { + "epoch": 0.8460579619819258, + "grad_norm": 0.4062190055847168, + "learning_rate": 5.734200130920925e-06, + "loss": 1.5123, + "step": 8145 + }, + { + "epoch": 0.8461618365015062, + "grad_norm": 0.4112701416015625, + "learning_rate": 5.726615447533878e-06, + "loss": 1.4982, + "step": 8146 + }, + { + "epoch": 0.8462657110210865, + "grad_norm": 0.4885147213935852, + "learning_rate": 5.719035478916967e-06, + "loss": 1.8708, + "step": 8147 + }, + { + "epoch": 0.8463695855406669, + "grad_norm": 0.44870370626449585, + "learning_rate": 5.711460225877346e-06, + "loss": 1.5868, + "step": 8148 + }, + { + "epoch": 0.8464734600602473, + "grad_norm": 0.3870431184768677, + "learning_rate": 5.703889689221758e-06, + "loss": 1.4511, + "step": 8149 + }, + { + "epoch": 0.8465773345798275, + "grad_norm": 0.41669902205467224, + "learning_rate": 5.696323869756398e-06, + "loss": 1.6854, + "step": 8150 + }, + { + "epoch": 0.8466812090994079, + "grad_norm": 0.40349993109703064, + "learning_rate": 5.68876276828696e-06, + "loss": 1.6919, + "step": 8151 + }, + { + "epoch": 0.8467850836189883, + "grad_norm": 0.41194114089012146, + "learning_rate": 5.681206385618648e-06, + "loss": 1.5359, + "step": 8152 + }, + { + "epoch": 0.8468889581385686, + "grad_norm": 0.46073397994041443, + "learning_rate": 5.6736547225561545e-06, + "loss": 1.8919, + "step": 8153 + }, + { + "epoch": 0.846992832658149, + "grad_norm": 0.4091498851776123, + "learning_rate": 5.6661077799036785e-06, + "loss": 1.5971, + "step": 8154 + }, + { + "epoch": 0.8470967071777293, + "grad_norm": 0.4949287176132202, + "learning_rate": 5.65856555846489e-06, + "loss": 1.6858, + "step": 8155 + }, + { + "epoch": 0.8472005816973096, + "grad_norm": 0.43929752707481384, + "learning_rate": 5.6510280590430174e-06, + "loss": 1.673, + "step": 8156 + }, + { + "epoch": 0.84730445621689, + "grad_norm": 0.4475664794445038, + "learning_rate": 5.643495282440703e-06, + "loss": 1.5692, + "step": 8157 + }, + { + "epoch": 0.8474083307364704, + "grad_norm": 0.43574050068855286, + "learning_rate": 5.635967229460154e-06, + "loss": 1.5003, + "step": 8158 + }, + { + "epoch": 0.8475122052560506, + "grad_norm": 0.4088006317615509, + "learning_rate": 5.628443900903041e-06, + "loss": 1.6919, + "step": 8159 + }, + { + "epoch": 0.847616079775631, + "grad_norm": 0.41070079803466797, + "learning_rate": 5.62092529757054e-06, + "loss": 1.602, + "step": 8160 + }, + { + "epoch": 0.8477199542952114, + "grad_norm": 0.38564005494117737, + "learning_rate": 5.613411420263331e-06, + "loss": 1.6129, + "step": 8161 + }, + { + "epoch": 0.8478238288147917, + "grad_norm": 0.3986556828022003, + "learning_rate": 5.605902269781571e-06, + "loss": 1.572, + "step": 8162 + }, + { + "epoch": 0.8479277033343721, + "grad_norm": 0.41261792182922363, + "learning_rate": 5.598397846924935e-06, + "loss": 1.6821, + "step": 8163 + }, + { + "epoch": 0.8480315778539524, + "grad_norm": 0.41062214970588684, + "learning_rate": 5.590898152492574e-06, + "loss": 1.6375, + "step": 8164 + }, + { + "epoch": 0.8481354523735328, + "grad_norm": 0.4452644884586334, + "learning_rate": 5.583403187283159e-06, + "loss": 1.7255, + "step": 8165 + }, + { + "epoch": 0.8482393268931131, + "grad_norm": 0.41799211502075195, + "learning_rate": 5.575912952094858e-06, + "loss": 1.7604, + "step": 8166 + }, + { + "epoch": 0.8483432014126935, + "grad_norm": 0.41636091470718384, + "learning_rate": 5.568427447725283e-06, + "loss": 1.7037, + "step": 8167 + }, + { + "epoch": 0.8484470759322739, + "grad_norm": 0.45778560638427734, + "learning_rate": 5.560946674971618e-06, + "loss": 1.8658, + "step": 8168 + }, + { + "epoch": 0.8485509504518541, + "grad_norm": 0.4254413843154907, + "learning_rate": 5.553470634630492e-06, + "loss": 1.5678, + "step": 8169 + }, + { + "epoch": 0.8486548249714345, + "grad_norm": 0.4398033320903778, + "learning_rate": 5.5459993274980525e-06, + "loss": 1.6442, + "step": 8170 + }, + { + "epoch": 0.8487586994910149, + "grad_norm": 0.3922208547592163, + "learning_rate": 5.538532754369924e-06, + "loss": 1.6004, + "step": 8171 + }, + { + "epoch": 0.8488625740105952, + "grad_norm": 0.40766361355781555, + "learning_rate": 5.531070916041247e-06, + "loss": 1.5942, + "step": 8172 + }, + { + "epoch": 0.8489664485301756, + "grad_norm": 0.39256706833839417, + "learning_rate": 5.523613813306644e-06, + "loss": 1.6922, + "step": 8173 + }, + { + "epoch": 0.8490703230497559, + "grad_norm": 0.4119519591331482, + "learning_rate": 5.516161446960233e-06, + "loss": 1.6135, + "step": 8174 + }, + { + "epoch": 0.8491741975693362, + "grad_norm": 0.43249499797821045, + "learning_rate": 5.508713817795658e-06, + "loss": 1.5972, + "step": 8175 + }, + { + "epoch": 0.8492780720889166, + "grad_norm": 0.4471118152141571, + "learning_rate": 5.501270926606e-06, + "loss": 1.5106, + "step": 8176 + }, + { + "epoch": 0.849381946608497, + "grad_norm": 0.49427923560142517, + "learning_rate": 5.493832774183894e-06, + "loss": 1.9156, + "step": 8177 + }, + { + "epoch": 0.8494858211280772, + "grad_norm": 0.4142952263355255, + "learning_rate": 5.4863993613214314e-06, + "loss": 1.592, + "step": 8178 + }, + { + "epoch": 0.8495896956476576, + "grad_norm": 0.4335556626319885, + "learning_rate": 5.478970688810225e-06, + "loss": 1.6454, + "step": 8179 + }, + { + "epoch": 0.849693570167238, + "grad_norm": 0.42512187361717224, + "learning_rate": 5.471546757441359e-06, + "loss": 1.7903, + "step": 8180 + }, + { + "epoch": 0.8497974446868183, + "grad_norm": 0.4263908863067627, + "learning_rate": 5.464127568005423e-06, + "loss": 1.5594, + "step": 8181 + }, + { + "epoch": 0.8499013192063987, + "grad_norm": 0.4174048602581024, + "learning_rate": 5.456713121292517e-06, + "loss": 1.6509, + "step": 8182 + }, + { + "epoch": 0.850005193725979, + "grad_norm": 0.3824132978916168, + "learning_rate": 5.4493034180921944e-06, + "loss": 1.4437, + "step": 8183 + }, + { + "epoch": 0.8501090682455593, + "grad_norm": 0.42041993141174316, + "learning_rate": 5.441898459193573e-06, + "loss": 1.689, + "step": 8184 + }, + { + "epoch": 0.8502129427651397, + "grad_norm": 0.3966014087200165, + "learning_rate": 5.434498245385184e-06, + "loss": 1.6464, + "step": 8185 + }, + { + "epoch": 0.8503168172847201, + "grad_norm": 0.4592750370502472, + "learning_rate": 5.427102777455112e-06, + "loss": 1.4614, + "step": 8186 + }, + { + "epoch": 0.8504206918043004, + "grad_norm": 0.40993037819862366, + "learning_rate": 5.419712056190912e-06, + "loss": 1.5345, + "step": 8187 + }, + { + "epoch": 0.8505245663238807, + "grad_norm": 0.4124085009098053, + "learning_rate": 5.412326082379637e-06, + "loss": 1.5901, + "step": 8188 + }, + { + "epoch": 0.8506284408434611, + "grad_norm": 0.3968968391418457, + "learning_rate": 5.404944856807842e-06, + "loss": 1.5966, + "step": 8189 + }, + { + "epoch": 0.8507323153630414, + "grad_norm": 0.4331669509410858, + "learning_rate": 5.3975683802615586e-06, + "loss": 1.6621, + "step": 8190 + }, + { + "epoch": 0.8508361898826218, + "grad_norm": 0.42043596506118774, + "learning_rate": 5.390196653526336e-06, + "loss": 1.6508, + "step": 8191 + }, + { + "epoch": 0.8509400644022022, + "grad_norm": 0.41201046109199524, + "learning_rate": 5.382829677387186e-06, + "loss": 1.6648, + "step": 8192 + }, + { + "epoch": 0.8510439389217825, + "grad_norm": 0.41224047541618347, + "learning_rate": 5.375467452628663e-06, + "loss": 1.7252, + "step": 8193 + }, + { + "epoch": 0.8511478134413628, + "grad_norm": 0.42982521653175354, + "learning_rate": 5.368109980034775e-06, + "loss": 1.6036, + "step": 8194 + }, + { + "epoch": 0.8512516879609432, + "grad_norm": 0.41480857133865356, + "learning_rate": 5.360757260389015e-06, + "loss": 1.5241, + "step": 8195 + }, + { + "epoch": 0.8513555624805236, + "grad_norm": 0.41016316413879395, + "learning_rate": 5.353409294474421e-06, + "loss": 1.568, + "step": 8196 + }, + { + "epoch": 0.8514594370001038, + "grad_norm": 0.41887757182121277, + "learning_rate": 5.346066083073475e-06, + "loss": 1.5569, + "step": 8197 + }, + { + "epoch": 0.8515633115196842, + "grad_norm": 0.43342113494873047, + "learning_rate": 5.338727626968182e-06, + "loss": 1.6933, + "step": 8198 + }, + { + "epoch": 0.8516671860392646, + "grad_norm": 0.4318787753582001, + "learning_rate": 5.331393926940021e-06, + "loss": 1.7458, + "step": 8199 + }, + { + "epoch": 0.8517710605588449, + "grad_norm": 0.4099392294883728, + "learning_rate": 5.324064983769978e-06, + "loss": 1.6325, + "step": 8200 + }, + { + "epoch": 0.8518749350784253, + "grad_norm": 0.43265894055366516, + "learning_rate": 5.316740798238534e-06, + "loss": 1.7025, + "step": 8201 + }, + { + "epoch": 0.8519788095980056, + "grad_norm": 0.4109574854373932, + "learning_rate": 5.309421371125639e-06, + "loss": 1.7289, + "step": 8202 + }, + { + "epoch": 0.8520826841175859, + "grad_norm": 0.4493882954120636, + "learning_rate": 5.302106703210785e-06, + "loss": 1.6378, + "step": 8203 + }, + { + "epoch": 0.8521865586371663, + "grad_norm": 0.49722832441329956, + "learning_rate": 5.294796795272894e-06, + "loss": 1.8429, + "step": 8204 + }, + { + "epoch": 0.8522904331567467, + "grad_norm": 0.3835256099700928, + "learning_rate": 5.287491648090437e-06, + "loss": 1.515, + "step": 8205 + }, + { + "epoch": 0.852394307676327, + "grad_norm": 0.4009277820587158, + "learning_rate": 5.280191262441353e-06, + "loss": 1.4383, + "step": 8206 + }, + { + "epoch": 0.8524981821959073, + "grad_norm": 0.4271654486656189, + "learning_rate": 5.27289563910307e-06, + "loss": 1.6824, + "step": 8207 + }, + { + "epoch": 0.8526020567154877, + "grad_norm": 0.41742265224456787, + "learning_rate": 5.265604778852512e-06, + "loss": 1.525, + "step": 8208 + }, + { + "epoch": 0.852705931235068, + "grad_norm": 0.4177842140197754, + "learning_rate": 5.258318682466107e-06, + "loss": 1.6278, + "step": 8209 + }, + { + "epoch": 0.8528098057546484, + "grad_norm": 0.4213246703147888, + "learning_rate": 5.2510373507197675e-06, + "loss": 1.563, + "step": 8210 + }, + { + "epoch": 0.8529136802742288, + "grad_norm": 0.40991777181625366, + "learning_rate": 5.243760784388879e-06, + "loss": 1.5489, + "step": 8211 + }, + { + "epoch": 0.853017554793809, + "grad_norm": 0.4318062961101532, + "learning_rate": 5.236488984248378e-06, + "loss": 1.7528, + "step": 8212 + }, + { + "epoch": 0.8531214293133894, + "grad_norm": 0.40052494406700134, + "learning_rate": 5.229221951072611e-06, + "loss": 1.5852, + "step": 8213 + }, + { + "epoch": 0.8532253038329698, + "grad_norm": 0.4887178838253021, + "learning_rate": 5.221959685635491e-06, + "loss": 1.7859, + "step": 8214 + }, + { + "epoch": 0.8533291783525501, + "grad_norm": 0.4212459325790405, + "learning_rate": 5.2147021887103796e-06, + "loss": 1.6043, + "step": 8215 + }, + { + "epoch": 0.8534330528721304, + "grad_norm": 0.43245670199394226, + "learning_rate": 5.207449461070146e-06, + "loss": 1.7065, + "step": 8216 + }, + { + "epoch": 0.8535369273917108, + "grad_norm": 0.48420557379722595, + "learning_rate": 5.2002015034871564e-06, + "loss": 1.8633, + "step": 8217 + }, + { + "epoch": 0.8536408019112912, + "grad_norm": 0.46730557084083557, + "learning_rate": 5.1929583167332375e-06, + "loss": 1.7325, + "step": 8218 + }, + { + "epoch": 0.8537446764308715, + "grad_norm": 0.45189306139945984, + "learning_rate": 5.185719901579772e-06, + "loss": 1.8017, + "step": 8219 + }, + { + "epoch": 0.8538485509504519, + "grad_norm": 0.46504566073417664, + "learning_rate": 5.178486258797555e-06, + "loss": 1.8846, + "step": 8220 + }, + { + "epoch": 0.8539524254700323, + "grad_norm": 0.4610961973667145, + "learning_rate": 5.171257389156936e-06, + "loss": 1.7558, + "step": 8221 + }, + { + "epoch": 0.8540562999896125, + "grad_norm": 0.4036599397659302, + "learning_rate": 5.164033293427739e-06, + "loss": 1.6539, + "step": 8222 + }, + { + "epoch": 0.8541601745091929, + "grad_norm": 0.4568711817264557, + "learning_rate": 5.156813972379243e-06, + "loss": 1.769, + "step": 8223 + }, + { + "epoch": 0.8542640490287733, + "grad_norm": 0.4027152955532074, + "learning_rate": 5.1495994267802775e-06, + "loss": 1.5509, + "step": 8224 + }, + { + "epoch": 0.8543679235483536, + "grad_norm": 0.44536474347114563, + "learning_rate": 5.142389657399127e-06, + "loss": 1.7014, + "step": 8225 + }, + { + "epoch": 0.8544717980679339, + "grad_norm": 0.43924811482429504, + "learning_rate": 5.135184665003573e-06, + "loss": 1.602, + "step": 8226 + }, + { + "epoch": 0.8545756725875143, + "grad_norm": 0.4705229699611664, + "learning_rate": 5.127984450360895e-06, + "loss": 1.7716, + "step": 8227 + }, + { + "epoch": 0.8546795471070946, + "grad_norm": 0.42262768745422363, + "learning_rate": 5.120789014237859e-06, + "loss": 1.6233, + "step": 8228 + }, + { + "epoch": 0.854783421626675, + "grad_norm": 0.41569891571998596, + "learning_rate": 5.113598357400723e-06, + "loss": 1.5956, + "step": 8229 + }, + { + "epoch": 0.8548872961462554, + "grad_norm": 0.3866024911403656, + "learning_rate": 5.106412480615219e-06, + "loss": 1.5542, + "step": 8230 + }, + { + "epoch": 0.8549911706658356, + "grad_norm": 0.4203120768070221, + "learning_rate": 5.09923138464663e-06, + "loss": 1.6801, + "step": 8231 + }, + { + "epoch": 0.855095045185416, + "grad_norm": 0.433152973651886, + "learning_rate": 5.092055070259632e-06, + "loss": 1.6589, + "step": 8232 + }, + { + "epoch": 0.8551989197049964, + "grad_norm": 0.42945393919944763, + "learning_rate": 5.0848835382184875e-06, + "loss": 1.6403, + "step": 8233 + }, + { + "epoch": 0.8553027942245767, + "grad_norm": 0.46806976199150085, + "learning_rate": 5.077716789286896e-06, + "loss": 1.7067, + "step": 8234 + }, + { + "epoch": 0.855406668744157, + "grad_norm": 0.3977743983268738, + "learning_rate": 5.070554824228052e-06, + "loss": 1.667, + "step": 8235 + }, + { + "epoch": 0.8555105432637374, + "grad_norm": 0.40513405203819275, + "learning_rate": 5.063397643804662e-06, + "loss": 1.4978, + "step": 8236 + }, + { + "epoch": 0.8556144177833177, + "grad_norm": 0.41432732343673706, + "learning_rate": 5.056245248778902e-06, + "loss": 1.5833, + "step": 8237 + }, + { + "epoch": 0.8557182923028981, + "grad_norm": 0.4109817147254944, + "learning_rate": 5.049097639912454e-06, + "loss": 1.4775, + "step": 8238 + }, + { + "epoch": 0.8558221668224785, + "grad_norm": 0.4143582582473755, + "learning_rate": 5.04195481796646e-06, + "loss": 1.612, + "step": 8239 + }, + { + "epoch": 0.8559260413420587, + "grad_norm": 0.45763957500457764, + "learning_rate": 5.034816783701613e-06, + "loss": 1.5282, + "step": 8240 + }, + { + "epoch": 0.8560299158616391, + "grad_norm": 0.4432554543018341, + "learning_rate": 5.027683537878014e-06, + "loss": 1.5708, + "step": 8241 + }, + { + "epoch": 0.8561337903812195, + "grad_norm": 0.42282775044441223, + "learning_rate": 5.0205550812553345e-06, + "loss": 1.6524, + "step": 8242 + }, + { + "epoch": 0.8562376649007999, + "grad_norm": 0.4614414870738983, + "learning_rate": 5.01343141459269e-06, + "loss": 1.8134, + "step": 8243 + }, + { + "epoch": 0.8563415394203802, + "grad_norm": 0.5265931487083435, + "learning_rate": 5.0063125386486834e-06, + "loss": 1.9862, + "step": 8244 + }, + { + "epoch": 0.8564454139399605, + "grad_norm": 0.46069347858428955, + "learning_rate": 4.999198454181431e-06, + "loss": 1.8108, + "step": 8245 + }, + { + "epoch": 0.8565492884595409, + "grad_norm": 0.397867351770401, + "learning_rate": 4.99208916194851e-06, + "loss": 1.6589, + "step": 8246 + }, + { + "epoch": 0.8566531629791212, + "grad_norm": 0.4252391457557678, + "learning_rate": 4.984984662707043e-06, + "loss": 1.7302, + "step": 8247 + }, + { + "epoch": 0.8567570374987016, + "grad_norm": 0.41175001859664917, + "learning_rate": 4.977884957213558e-06, + "loss": 1.564, + "step": 8248 + }, + { + "epoch": 0.856860912018282, + "grad_norm": 0.40351465344429016, + "learning_rate": 4.970790046224144e-06, + "loss": 1.7149, + "step": 8249 + }, + { + "epoch": 0.8569647865378622, + "grad_norm": 0.4298102557659149, + "learning_rate": 4.963699930494364e-06, + "loss": 1.5095, + "step": 8250 + }, + { + "epoch": 0.8570686610574426, + "grad_norm": 0.45123982429504395, + "learning_rate": 4.95661461077922e-06, + "loss": 1.5314, + "step": 8251 + }, + { + "epoch": 0.857172535577023, + "grad_norm": 0.41809526085853577, + "learning_rate": 4.94953408783328e-06, + "loss": 1.5747, + "step": 8252 + }, + { + "epoch": 0.8572764100966033, + "grad_norm": 0.43481943011283875, + "learning_rate": 4.942458362410557e-06, + "loss": 1.628, + "step": 8253 + }, + { + "epoch": 0.8573802846161837, + "grad_norm": 0.4104442000389099, + "learning_rate": 4.935387435264549e-06, + "loss": 1.5326, + "step": 8254 + }, + { + "epoch": 0.857484159135764, + "grad_norm": 0.47249045968055725, + "learning_rate": 4.928321307148265e-06, + "loss": 1.7157, + "step": 8255 + }, + { + "epoch": 0.8575880336553443, + "grad_norm": 0.4751419425010681, + "learning_rate": 4.921259978814191e-06, + "loss": 1.8348, + "step": 8256 + }, + { + "epoch": 0.8576919081749247, + "grad_norm": 0.44779595732688904, + "learning_rate": 4.914203451014299e-06, + "loss": 1.521, + "step": 8257 + }, + { + "epoch": 0.8577957826945051, + "grad_norm": 0.4319514036178589, + "learning_rate": 4.907151724500048e-06, + "loss": 1.6885, + "step": 8258 + }, + { + "epoch": 0.8578996572140853, + "grad_norm": 0.44196733832359314, + "learning_rate": 4.9001048000224205e-06, + "loss": 1.746, + "step": 8259 + }, + { + "epoch": 0.8580035317336657, + "grad_norm": 0.4159088134765625, + "learning_rate": 4.893062678331817e-06, + "loss": 1.794, + "step": 8260 + }, + { + "epoch": 0.8581074062532461, + "grad_norm": 0.4175340533256531, + "learning_rate": 4.886025360178204e-06, + "loss": 1.6533, + "step": 8261 + }, + { + "epoch": 0.8582112807728264, + "grad_norm": 0.41944602131843567, + "learning_rate": 4.878992846310987e-06, + "loss": 1.6403, + "step": 8262 + }, + { + "epoch": 0.8583151552924068, + "grad_norm": 0.43087661266326904, + "learning_rate": 4.8719651374790735e-06, + "loss": 1.7938, + "step": 8263 + }, + { + "epoch": 0.8584190298119871, + "grad_norm": 0.41063591837882996, + "learning_rate": 4.864942234430858e-06, + "loss": 1.5834, + "step": 8264 + }, + { + "epoch": 0.8585229043315674, + "grad_norm": 0.42360448837280273, + "learning_rate": 4.857924137914233e-06, + "loss": 1.6952, + "step": 8265 + }, + { + "epoch": 0.8586267788511478, + "grad_norm": 0.4151276648044586, + "learning_rate": 4.85091084867656e-06, + "loss": 1.658, + "step": 8266 + }, + { + "epoch": 0.8587306533707282, + "grad_norm": 0.41758909821510315, + "learning_rate": 4.843902367464698e-06, + "loss": 1.6903, + "step": 8267 + }, + { + "epoch": 0.8588345278903085, + "grad_norm": 0.4258948266506195, + "learning_rate": 4.8368986950250275e-06, + "loss": 1.7291, + "step": 8268 + }, + { + "epoch": 0.8589384024098888, + "grad_norm": 0.3859458565711975, + "learning_rate": 4.829899832103335e-06, + "loss": 1.5672, + "step": 8269 + }, + { + "epoch": 0.8590422769294692, + "grad_norm": 0.3968896269798279, + "learning_rate": 4.82290577944498e-06, + "loss": 1.5287, + "step": 8270 + }, + { + "epoch": 0.8591461514490496, + "grad_norm": 0.4189419150352478, + "learning_rate": 4.815916537794763e-06, + "loss": 1.7356, + "step": 8271 + }, + { + "epoch": 0.8592500259686299, + "grad_norm": 0.39579710364341736, + "learning_rate": 4.8089321078969904e-06, + "loss": 1.5447, + "step": 8272 + }, + { + "epoch": 0.8593539004882103, + "grad_norm": 0.42127490043640137, + "learning_rate": 4.801952490495437e-06, + "loss": 1.6652, + "step": 8273 + }, + { + "epoch": 0.8594577750077906, + "grad_norm": 0.46544432640075684, + "learning_rate": 4.79497768633338e-06, + "loss": 1.6881, + "step": 8274 + }, + { + "epoch": 0.8595616495273709, + "grad_norm": 0.39384204149246216, + "learning_rate": 4.788007696153607e-06, + "loss": 1.6703, + "step": 8275 + }, + { + "epoch": 0.8596655240469513, + "grad_norm": 0.4230908155441284, + "learning_rate": 4.781042520698326e-06, + "loss": 1.7158, + "step": 8276 + }, + { + "epoch": 0.8597693985665317, + "grad_norm": 0.4715307652950287, + "learning_rate": 4.774082160709309e-06, + "loss": 1.518, + "step": 8277 + }, + { + "epoch": 0.8598732730861119, + "grad_norm": 0.42412781715393066, + "learning_rate": 4.767126616927769e-06, + "loss": 1.5671, + "step": 8278 + }, + { + "epoch": 0.8599771476056923, + "grad_norm": 0.4136991500854492, + "learning_rate": 4.760175890094399e-06, + "loss": 1.6172, + "step": 8279 + }, + { + "epoch": 0.8600810221252727, + "grad_norm": 0.4031031131744385, + "learning_rate": 4.753229980949419e-06, + "loss": 1.5905, + "step": 8280 + }, + { + "epoch": 0.860184896644853, + "grad_norm": 0.4419058859348297, + "learning_rate": 4.746288890232514e-06, + "loss": 1.6522, + "step": 8281 + }, + { + "epoch": 0.8602887711644334, + "grad_norm": 0.4122452139854431, + "learning_rate": 4.739352618682846e-06, + "loss": 1.6796, + "step": 8282 + }, + { + "epoch": 0.8603926456840137, + "grad_norm": 0.44128134846687317, + "learning_rate": 4.732421167039075e-06, + "loss": 1.6135, + "step": 8283 + }, + { + "epoch": 0.860496520203594, + "grad_norm": 0.4065702259540558, + "learning_rate": 4.725494536039354e-06, + "loss": 1.4627, + "step": 8284 + }, + { + "epoch": 0.8606003947231744, + "grad_norm": 0.40310558676719666, + "learning_rate": 4.718572726421305e-06, + "loss": 1.782, + "step": 8285 + }, + { + "epoch": 0.8607042692427548, + "grad_norm": 0.4221704602241516, + "learning_rate": 4.711655738922044e-06, + "loss": 1.6468, + "step": 8286 + }, + { + "epoch": 0.860808143762335, + "grad_norm": 0.41662177443504333, + "learning_rate": 4.704743574278209e-06, + "loss": 1.6226, + "step": 8287 + }, + { + "epoch": 0.8609120182819154, + "grad_norm": 0.42002618312835693, + "learning_rate": 4.697836233225838e-06, + "loss": 1.6135, + "step": 8288 + }, + { + "epoch": 0.8610158928014958, + "grad_norm": 0.4459064304828644, + "learning_rate": 4.690933716500556e-06, + "loss": 1.845, + "step": 8289 + }, + { + "epoch": 0.8611197673210761, + "grad_norm": 0.43704405426979065, + "learning_rate": 4.684036024837402e-06, + "loss": 1.7574, + "step": 8290 + }, + { + "epoch": 0.8612236418406565, + "grad_norm": 0.3958377540111542, + "learning_rate": 4.677143158970937e-06, + "loss": 1.5884, + "step": 8291 + }, + { + "epoch": 0.8613275163602369, + "grad_norm": 0.4173584580421448, + "learning_rate": 4.670255119635192e-06, + "loss": 1.6082, + "step": 8292 + }, + { + "epoch": 0.8614313908798171, + "grad_norm": 0.4080117642879486, + "learning_rate": 4.663371907563696e-06, + "loss": 1.5688, + "step": 8293 + }, + { + "epoch": 0.8615352653993975, + "grad_norm": 0.4516976773738861, + "learning_rate": 4.656493523489447e-06, + "loss": 1.7237, + "step": 8294 + }, + { + "epoch": 0.8616391399189779, + "grad_norm": 0.4905436635017395, + "learning_rate": 4.649619968144936e-06, + "loss": 1.6905, + "step": 8295 + }, + { + "epoch": 0.8617430144385583, + "grad_norm": 0.4043090045452118, + "learning_rate": 4.642751242262161e-06, + "loss": 1.5359, + "step": 8296 + }, + { + "epoch": 0.8618468889581385, + "grad_norm": 0.4110516905784607, + "learning_rate": 4.635887346572582e-06, + "loss": 1.6373, + "step": 8297 + }, + { + "epoch": 0.8619507634777189, + "grad_norm": 0.4277547299861908, + "learning_rate": 4.629028281807146e-06, + "loss": 1.7373, + "step": 8298 + }, + { + "epoch": 0.8620546379972993, + "grad_norm": 0.3978832960128784, + "learning_rate": 4.622174048696288e-06, + "loss": 1.456, + "step": 8299 + }, + { + "epoch": 0.8621585125168796, + "grad_norm": 0.4189075231552124, + "learning_rate": 4.615324647969932e-06, + "loss": 1.7349, + "step": 8300 + }, + { + "epoch": 0.86226238703646, + "grad_norm": 0.4180492162704468, + "learning_rate": 4.608480080357491e-06, + "loss": 1.4529, + "step": 8301 + }, + { + "epoch": 0.8623662615560403, + "grad_norm": 0.4220123291015625, + "learning_rate": 4.601640346587843e-06, + "loss": 1.7035, + "step": 8302 + }, + { + "epoch": 0.8624701360756206, + "grad_norm": 0.4284611642360687, + "learning_rate": 4.5948054473894e-06, + "loss": 1.6631, + "step": 8303 + }, + { + "epoch": 0.862574010595201, + "grad_norm": 0.4400910437107086, + "learning_rate": 4.587975383489978e-06, + "loss": 1.6642, + "step": 8304 + }, + { + "epoch": 0.8626778851147814, + "grad_norm": 0.4507836699485779, + "learning_rate": 4.581150155616959e-06, + "loss": 1.79, + "step": 8305 + }, + { + "epoch": 0.8627817596343617, + "grad_norm": 0.4717456102371216, + "learning_rate": 4.574329764497182e-06, + "loss": 1.7097, + "step": 8306 + }, + { + "epoch": 0.862885634153942, + "grad_norm": 0.42623621225357056, + "learning_rate": 4.567514210856933e-06, + "loss": 1.6195, + "step": 8307 + }, + { + "epoch": 0.8629895086735224, + "grad_norm": 0.3888145387172699, + "learning_rate": 4.560703495422037e-06, + "loss": 1.4697, + "step": 8308 + }, + { + "epoch": 0.8630933831931027, + "grad_norm": 0.40128466486930847, + "learning_rate": 4.553897618917785e-06, + "loss": 1.6035, + "step": 8309 + }, + { + "epoch": 0.8631972577126831, + "grad_norm": 0.43197745084762573, + "learning_rate": 4.547096582068938e-06, + "loss": 1.6968, + "step": 8310 + }, + { + "epoch": 0.8633011322322635, + "grad_norm": 0.4532462954521179, + "learning_rate": 4.540300385599761e-06, + "loss": 1.8258, + "step": 8311 + }, + { + "epoch": 0.8634050067518437, + "grad_norm": 0.3836333453655243, + "learning_rate": 4.533509030233995e-06, + "loss": 1.6876, + "step": 8312 + }, + { + "epoch": 0.8635088812714241, + "grad_norm": 0.44952192902565, + "learning_rate": 4.5267225166948645e-06, + "loss": 1.7243, + "step": 8313 + }, + { + "epoch": 0.8636127557910045, + "grad_norm": 0.4556354284286499, + "learning_rate": 4.519940845705067e-06, + "loss": 1.8883, + "step": 8314 + }, + { + "epoch": 0.8637166303105848, + "grad_norm": 0.40353667736053467, + "learning_rate": 4.513164017986837e-06, + "loss": 1.4688, + "step": 8315 + }, + { + "epoch": 0.8638205048301651, + "grad_norm": 0.4035186171531677, + "learning_rate": 4.506392034261803e-06, + "loss": 1.7138, + "step": 8316 + }, + { + "epoch": 0.8639243793497455, + "grad_norm": 0.4350834786891937, + "learning_rate": 4.49962489525117e-06, + "loss": 1.6899, + "step": 8317 + }, + { + "epoch": 0.8640282538693258, + "grad_norm": 0.4328579902648926, + "learning_rate": 4.492862601675563e-06, + "loss": 1.5958, + "step": 8318 + }, + { + "epoch": 0.8641321283889062, + "grad_norm": 0.45060819387435913, + "learning_rate": 4.486105154255121e-06, + "loss": 1.8196, + "step": 8319 + }, + { + "epoch": 0.8642360029084866, + "grad_norm": 0.4311492443084717, + "learning_rate": 4.4793525537094595e-06, + "loss": 1.7509, + "step": 8320 + }, + { + "epoch": 0.864339877428067, + "grad_norm": 0.45266759395599365, + "learning_rate": 4.472604800757668e-06, + "loss": 1.813, + "step": 8321 + }, + { + "epoch": 0.8644437519476472, + "grad_norm": 0.42085716128349304, + "learning_rate": 4.4658618961183506e-06, + "loss": 1.6671, + "step": 8322 + }, + { + "epoch": 0.8645476264672276, + "grad_norm": 0.4207400381565094, + "learning_rate": 4.459123840509549e-06, + "loss": 1.5313, + "step": 8323 + }, + { + "epoch": 0.864651500986808, + "grad_norm": 0.4059183895587921, + "learning_rate": 4.4523906346488345e-06, + "loss": 1.5305, + "step": 8324 + }, + { + "epoch": 0.8647553755063883, + "grad_norm": 0.40492719411849976, + "learning_rate": 4.445662279253226e-06, + "loss": 1.5326, + "step": 8325 + }, + { + "epoch": 0.8648592500259686, + "grad_norm": 0.43756330013275146, + "learning_rate": 4.438938775039253e-06, + "loss": 1.7945, + "step": 8326 + }, + { + "epoch": 0.864963124545549, + "grad_norm": 0.44969308376312256, + "learning_rate": 4.4322201227229134e-06, + "loss": 1.7237, + "step": 8327 + }, + { + "epoch": 0.8650669990651293, + "grad_norm": 0.4445195198059082, + "learning_rate": 4.425506323019685e-06, + "loss": 1.7394, + "step": 8328 + }, + { + "epoch": 0.8651708735847097, + "grad_norm": 0.5065091252326965, + "learning_rate": 4.418797376644534e-06, + "loss": 1.6003, + "step": 8329 + }, + { + "epoch": 0.8652747481042901, + "grad_norm": 0.4191296696662903, + "learning_rate": 4.412093284311913e-06, + "loss": 1.5485, + "step": 8330 + }, + { + "epoch": 0.8653786226238703, + "grad_norm": 0.4144963026046753, + "learning_rate": 4.405394046735773e-06, + "loss": 1.5195, + "step": 8331 + }, + { + "epoch": 0.8654824971434507, + "grad_norm": 0.42345312237739563, + "learning_rate": 4.398699664629497e-06, + "loss": 1.7706, + "step": 8332 + }, + { + "epoch": 0.8655863716630311, + "grad_norm": 0.4790879189968109, + "learning_rate": 4.392010138706015e-06, + "loss": 1.6909, + "step": 8333 + }, + { + "epoch": 0.8656902461826114, + "grad_norm": 0.39938417077064514, + "learning_rate": 4.3853254696777036e-06, + "loss": 1.6691, + "step": 8334 + }, + { + "epoch": 0.8657941207021917, + "grad_norm": 0.382337361574173, + "learning_rate": 4.378645658256403e-06, + "loss": 1.5524, + "step": 8335 + }, + { + "epoch": 0.8658979952217721, + "grad_norm": 0.41304081678390503, + "learning_rate": 4.371970705153489e-06, + "loss": 1.6553, + "step": 8336 + }, + { + "epoch": 0.8660018697413524, + "grad_norm": 0.44353777170181274, + "learning_rate": 4.365300611079781e-06, + "loss": 1.7254, + "step": 8337 + }, + { + "epoch": 0.8661057442609328, + "grad_norm": 0.4305352568626404, + "learning_rate": 4.358635376745601e-06, + "loss": 1.6613, + "step": 8338 + }, + { + "epoch": 0.8662096187805132, + "grad_norm": 0.4181787371635437, + "learning_rate": 4.351975002860731e-06, + "loss": 1.6243, + "step": 8339 + }, + { + "epoch": 0.8663134933000934, + "grad_norm": 0.49580061435699463, + "learning_rate": 4.345319490134453e-06, + "loss": 1.6895, + "step": 8340 + }, + { + "epoch": 0.8664173678196738, + "grad_norm": 0.422701358795166, + "learning_rate": 4.338668839275534e-06, + "loss": 1.6746, + "step": 8341 + }, + { + "epoch": 0.8665212423392542, + "grad_norm": 0.4329741299152374, + "learning_rate": 4.3320230509922014e-06, + "loss": 1.6391, + "step": 8342 + }, + { + "epoch": 0.8666251168588345, + "grad_norm": 0.40780285000801086, + "learning_rate": 4.3253821259922055e-06, + "loss": 1.5569, + "step": 8343 + }, + { + "epoch": 0.8667289913784149, + "grad_norm": 0.42592713236808777, + "learning_rate": 4.318746064982721e-06, + "loss": 1.6763, + "step": 8344 + }, + { + "epoch": 0.8668328658979952, + "grad_norm": 0.391530305147171, + "learning_rate": 4.312114868670458e-06, + "loss": 1.4977, + "step": 8345 + }, + { + "epoch": 0.8669367404175756, + "grad_norm": 0.41570156812667847, + "learning_rate": 4.305488537761588e-06, + "loss": 1.586, + "step": 8346 + }, + { + "epoch": 0.8670406149371559, + "grad_norm": 0.42140865325927734, + "learning_rate": 4.298867072961754e-06, + "loss": 1.6925, + "step": 8347 + }, + { + "epoch": 0.8671444894567363, + "grad_norm": 0.4236295819282532, + "learning_rate": 4.2922504749761005e-06, + "loss": 1.5939, + "step": 8348 + }, + { + "epoch": 0.8672483639763167, + "grad_norm": 0.4136272966861725, + "learning_rate": 4.2856387445092235e-06, + "loss": 1.6401, + "step": 8349 + }, + { + "epoch": 0.8673522384958969, + "grad_norm": 0.4001111090183258, + "learning_rate": 4.2790318822652495e-06, + "loss": 1.4537, + "step": 8350 + }, + { + "epoch": 0.8674561130154773, + "grad_norm": 0.40965673327445984, + "learning_rate": 4.272429888947732e-06, + "loss": 1.472, + "step": 8351 + }, + { + "epoch": 0.8675599875350577, + "grad_norm": 0.4452982544898987, + "learning_rate": 4.265832765259748e-06, + "loss": 1.7643, + "step": 8352 + }, + { + "epoch": 0.867663862054638, + "grad_norm": 0.41096264123916626, + "learning_rate": 4.259240511903834e-06, + "loss": 1.6081, + "step": 8353 + }, + { + "epoch": 0.8677677365742184, + "grad_norm": 0.4857504367828369, + "learning_rate": 4.252653129582013e-06, + "loss": 1.8201, + "step": 8354 + }, + { + "epoch": 0.8678716110937987, + "grad_norm": 0.3969123959541321, + "learning_rate": 4.2460706189957895e-06, + "loss": 1.5116, + "step": 8355 + }, + { + "epoch": 0.867975485613379, + "grad_norm": 0.40044331550598145, + "learning_rate": 4.239492980846155e-06, + "loss": 1.772, + "step": 8356 + }, + { + "epoch": 0.8680793601329594, + "grad_norm": 0.47415369749069214, + "learning_rate": 4.232920215833569e-06, + "loss": 1.9206, + "step": 8357 + }, + { + "epoch": 0.8681832346525398, + "grad_norm": 0.41735053062438965, + "learning_rate": 4.226352324657973e-06, + "loss": 1.7219, + "step": 8358 + }, + { + "epoch": 0.86828710917212, + "grad_norm": 0.43952199816703796, + "learning_rate": 4.219789308018829e-06, + "loss": 1.8179, + "step": 8359 + }, + { + "epoch": 0.8683909836917004, + "grad_norm": 0.42207950353622437, + "learning_rate": 4.2132311666150005e-06, + "loss": 1.6541, + "step": 8360 + }, + { + "epoch": 0.8684948582112808, + "grad_norm": 0.3974553942680359, + "learning_rate": 4.206677901144917e-06, + "loss": 1.5655, + "step": 8361 + }, + { + "epoch": 0.8685987327308611, + "grad_norm": 0.4383615255355835, + "learning_rate": 4.200129512306439e-06, + "loss": 1.707, + "step": 8362 + }, + { + "epoch": 0.8687026072504415, + "grad_norm": 0.43259862065315247, + "learning_rate": 4.1935860007969005e-06, + "loss": 1.6708, + "step": 8363 + }, + { + "epoch": 0.8688064817700218, + "grad_norm": 0.4056100845336914, + "learning_rate": 4.187047367313157e-06, + "loss": 1.5422, + "step": 8364 + }, + { + "epoch": 0.8689103562896021, + "grad_norm": 0.3907129466533661, + "learning_rate": 4.180513612551517e-06, + "loss": 1.5345, + "step": 8365 + }, + { + "epoch": 0.8690142308091825, + "grad_norm": 0.3913259208202362, + "learning_rate": 4.17398473720777e-06, + "loss": 1.5311, + "step": 8366 + }, + { + "epoch": 0.8691181053287629, + "grad_norm": 0.4376855790615082, + "learning_rate": 4.1674607419771915e-06, + "loss": 1.4414, + "step": 8367 + }, + { + "epoch": 0.8692219798483432, + "grad_norm": 0.44017404317855835, + "learning_rate": 4.160941627554538e-06, + "loss": 1.4987, + "step": 8368 + }, + { + "epoch": 0.8693258543679235, + "grad_norm": 0.4197766184806824, + "learning_rate": 4.154427394634036e-06, + "loss": 1.6341, + "step": 8369 + }, + { + "epoch": 0.8694297288875039, + "grad_norm": 0.41272974014282227, + "learning_rate": 4.147918043909404e-06, + "loss": 1.7277, + "step": 8370 + }, + { + "epoch": 0.8695336034070842, + "grad_norm": 0.4294355511665344, + "learning_rate": 4.14141357607386e-06, + "loss": 1.6332, + "step": 8371 + }, + { + "epoch": 0.8696374779266646, + "grad_norm": 0.43766576051712036, + "learning_rate": 4.1349139918200385e-06, + "loss": 1.6576, + "step": 8372 + }, + { + "epoch": 0.869741352446245, + "grad_norm": 0.39350345730781555, + "learning_rate": 4.1284192918401245e-06, + "loss": 1.517, + "step": 8373 + }, + { + "epoch": 0.8698452269658253, + "grad_norm": 0.41571328043937683, + "learning_rate": 4.121929476825742e-06, + "loss": 1.6047, + "step": 8374 + }, + { + "epoch": 0.8699491014854056, + "grad_norm": 0.4371244013309479, + "learning_rate": 4.1154445474680115e-06, + "loss": 1.6017, + "step": 8375 + }, + { + "epoch": 0.870052976004986, + "grad_norm": 0.4310753643512726, + "learning_rate": 4.108964504457524e-06, + "loss": 1.7602, + "step": 8376 + }, + { + "epoch": 0.8701568505245664, + "grad_norm": 0.3955596387386322, + "learning_rate": 4.102489348484339e-06, + "loss": 1.542, + "step": 8377 + }, + { + "epoch": 0.8702607250441466, + "grad_norm": 0.3818468749523163, + "learning_rate": 4.096019080238045e-06, + "loss": 1.4748, + "step": 8378 + }, + { + "epoch": 0.870364599563727, + "grad_norm": 0.4244476854801178, + "learning_rate": 4.089553700407634e-06, + "loss": 1.6876, + "step": 8379 + }, + { + "epoch": 0.8704684740833074, + "grad_norm": 0.44458895921707153, + "learning_rate": 4.083093209681649e-06, + "loss": 1.6637, + "step": 8380 + }, + { + "epoch": 0.8705723486028877, + "grad_norm": 0.46665099263191223, + "learning_rate": 4.076637608748074e-06, + "loss": 1.4212, + "step": 8381 + }, + { + "epoch": 0.8706762231224681, + "grad_norm": 0.43539130687713623, + "learning_rate": 4.070186898294375e-06, + "loss": 1.7361, + "step": 8382 + }, + { + "epoch": 0.8707800976420484, + "grad_norm": 0.4533788561820984, + "learning_rate": 4.063741079007505e-06, + "loss": 1.9468, + "step": 8383 + }, + { + "epoch": 0.8708839721616287, + "grad_norm": 0.40480610728263855, + "learning_rate": 4.057300151573895e-06, + "loss": 1.6335, + "step": 8384 + }, + { + "epoch": 0.8709878466812091, + "grad_norm": 0.5002861618995667, + "learning_rate": 4.050864116679454e-06, + "loss": 1.8369, + "step": 8385 + }, + { + "epoch": 0.8710917212007895, + "grad_norm": 0.41242918372154236, + "learning_rate": 4.044432975009554e-06, + "loss": 1.5588, + "step": 8386 + }, + { + "epoch": 0.8711955957203698, + "grad_norm": 0.43866342306137085, + "learning_rate": 4.038006727249094e-06, + "loss": 1.6729, + "step": 8387 + }, + { + "epoch": 0.8712994702399501, + "grad_norm": 0.4091012477874756, + "learning_rate": 4.031585374082381e-06, + "loss": 1.6353, + "step": 8388 + }, + { + "epoch": 0.8714033447595305, + "grad_norm": 0.44972357153892517, + "learning_rate": 4.025168916193273e-06, + "loss": 1.7602, + "step": 8389 + }, + { + "epoch": 0.8715072192791108, + "grad_norm": 0.3911556005477905, + "learning_rate": 4.0187573542650625e-06, + "loss": 1.5152, + "step": 8390 + }, + { + "epoch": 0.8716110937986912, + "grad_norm": 0.4245710074901581, + "learning_rate": 4.01235068898051e-06, + "loss": 1.775, + "step": 8391 + }, + { + "epoch": 0.8717149683182716, + "grad_norm": 0.41112756729125977, + "learning_rate": 4.005948921021896e-06, + "loss": 1.5729, + "step": 8392 + }, + { + "epoch": 0.8718188428378518, + "grad_norm": 0.42917004227638245, + "learning_rate": 3.99955205107096e-06, + "loss": 1.7396, + "step": 8393 + }, + { + "epoch": 0.8719227173574322, + "grad_norm": 0.44293302297592163, + "learning_rate": 3.99316007980891e-06, + "loss": 1.7512, + "step": 8394 + }, + { + "epoch": 0.8720265918770126, + "grad_norm": 0.41871437430381775, + "learning_rate": 3.9867730079164445e-06, + "loss": 1.5736, + "step": 8395 + }, + { + "epoch": 0.8721304663965929, + "grad_norm": 0.4064503312110901, + "learning_rate": 3.980390836073739e-06, + "loss": 1.4956, + "step": 8396 + }, + { + "epoch": 0.8722343409161732, + "grad_norm": 0.39449822902679443, + "learning_rate": 3.974013564960444e-06, + "loss": 1.5223, + "step": 8397 + }, + { + "epoch": 0.8723382154357536, + "grad_norm": 0.3918604850769043, + "learning_rate": 3.967641195255672e-06, + "loss": 1.5784, + "step": 8398 + }, + { + "epoch": 0.872442089955334, + "grad_norm": 0.42740651965141296, + "learning_rate": 3.9612737276380595e-06, + "loss": 1.652, + "step": 8399 + }, + { + "epoch": 0.8725459644749143, + "grad_norm": 0.45657414197921753, + "learning_rate": 3.9549111627856795e-06, + "loss": 1.863, + "step": 8400 + }, + { + "epoch": 0.8726498389944947, + "grad_norm": 0.4107915759086609, + "learning_rate": 3.948553501376096e-06, + "loss": 1.6226, + "step": 8401 + }, + { + "epoch": 0.872753713514075, + "grad_norm": 0.43051251769065857, + "learning_rate": 3.942200744086349e-06, + "loss": 1.687, + "step": 8402 + }, + { + "epoch": 0.8728575880336553, + "grad_norm": 0.3889218270778656, + "learning_rate": 3.935852891592956e-06, + "loss": 1.6296, + "step": 8403 + }, + { + "epoch": 0.8729614625532357, + "grad_norm": 0.4343833327293396, + "learning_rate": 3.929509944571918e-06, + "loss": 1.7, + "step": 8404 + }, + { + "epoch": 0.8730653370728161, + "grad_norm": 0.43371620774269104, + "learning_rate": 3.9231719036987025e-06, + "loss": 1.627, + "step": 8405 + }, + { + "epoch": 0.8731692115923964, + "grad_norm": 0.4425232410430908, + "learning_rate": 3.916838769648279e-06, + "loss": 1.6984, + "step": 8406 + }, + { + "epoch": 0.8732730861119767, + "grad_norm": 0.3910176455974579, + "learning_rate": 3.910510543095047e-06, + "loss": 1.5953, + "step": 8407 + }, + { + "epoch": 0.8733769606315571, + "grad_norm": 0.4370116591453552, + "learning_rate": 3.904187224712935e-06, + "loss": 1.8256, + "step": 8408 + }, + { + "epoch": 0.8734808351511374, + "grad_norm": 0.4318406581878662, + "learning_rate": 3.897868815175326e-06, + "loss": 1.6616, + "step": 8409 + }, + { + "epoch": 0.8735847096707178, + "grad_norm": 0.38654428720474243, + "learning_rate": 3.891555315155082e-06, + "loss": 1.4936, + "step": 8410 + }, + { + "epoch": 0.8736885841902982, + "grad_norm": 0.41681328415870667, + "learning_rate": 3.885246725324532e-06, + "loss": 1.5365, + "step": 8411 + }, + { + "epoch": 0.8737924587098784, + "grad_norm": 0.39945271611213684, + "learning_rate": 3.878943046355499e-06, + "loss": 1.5884, + "step": 8412 + }, + { + "epoch": 0.8738963332294588, + "grad_norm": 0.40073657035827637, + "learning_rate": 3.872644278919274e-06, + "loss": 1.6765, + "step": 8413 + }, + { + "epoch": 0.8740002077490392, + "grad_norm": 0.4032048285007477, + "learning_rate": 3.866350423686615e-06, + "loss": 1.6804, + "step": 8414 + }, + { + "epoch": 0.8741040822686195, + "grad_norm": 0.4637449085712433, + "learning_rate": 3.860061481327798e-06, + "loss": 1.7906, + "step": 8415 + }, + { + "epoch": 0.8742079567881998, + "grad_norm": 0.4216407537460327, + "learning_rate": 3.8537774525125195e-06, + "loss": 1.7432, + "step": 8416 + }, + { + "epoch": 0.8743118313077802, + "grad_norm": 0.4351256787776947, + "learning_rate": 3.847498337909977e-06, + "loss": 1.6909, + "step": 8417 + }, + { + "epoch": 0.8744157058273605, + "grad_norm": 0.4363458454608917, + "learning_rate": 3.8412241381888755e-06, + "loss": 1.5617, + "step": 8418 + }, + { + "epoch": 0.8745195803469409, + "grad_norm": 0.481900155544281, + "learning_rate": 3.834954854017337e-06, + "loss": 1.854, + "step": 8419 + }, + { + "epoch": 0.8746234548665213, + "grad_norm": 0.49199458956718445, + "learning_rate": 3.828690486063013e-06, + "loss": 1.7667, + "step": 8420 + }, + { + "epoch": 0.8747273293861015, + "grad_norm": 0.41270673274993896, + "learning_rate": 3.822431034993002e-06, + "loss": 1.623, + "step": 8421 + }, + { + "epoch": 0.8748312039056819, + "grad_norm": 0.444240540266037, + "learning_rate": 3.816176501473889e-06, + "loss": 1.6026, + "step": 8422 + }, + { + "epoch": 0.8749350784252623, + "grad_norm": 0.4405142366886139, + "learning_rate": 3.8099268861717275e-06, + "loss": 1.7327, + "step": 8423 + }, + { + "epoch": 0.8750389529448427, + "grad_norm": 0.4171452820301056, + "learning_rate": 3.8036821897520502e-06, + "loss": 1.622, + "step": 8424 + }, + { + "epoch": 0.875142827464423, + "grad_norm": 0.4365280866622925, + "learning_rate": 3.7974424128798868e-06, + "loss": 1.5021, + "step": 8425 + }, + { + "epoch": 0.8752467019840033, + "grad_norm": 0.419349730014801, + "learning_rate": 3.7912075562197e-06, + "loss": 1.5933, + "step": 8426 + }, + { + "epoch": 0.8753505765035837, + "grad_norm": 0.41351646184921265, + "learning_rate": 3.784977620435476e-06, + "loss": 1.63, + "step": 8427 + }, + { + "epoch": 0.875454451023164, + "grad_norm": 0.4430893361568451, + "learning_rate": 3.7787526061906387e-06, + "loss": 1.655, + "step": 8428 + }, + { + "epoch": 0.8755583255427444, + "grad_norm": 0.534438967704773, + "learning_rate": 3.7725325141481137e-06, + "loss": 1.8994, + "step": 8429 + }, + { + "epoch": 0.8756622000623248, + "grad_norm": 0.44221749901771545, + "learning_rate": 3.766317344970288e-06, + "loss": 1.6722, + "step": 8430 + }, + { + "epoch": 0.875766074581905, + "grad_norm": 0.42250174283981323, + "learning_rate": 3.7601070993190267e-06, + "loss": 1.6381, + "step": 8431 + }, + { + "epoch": 0.8758699491014854, + "grad_norm": 0.45283105969429016, + "learning_rate": 3.753901777855673e-06, + "loss": 1.6305, + "step": 8432 + }, + { + "epoch": 0.8759738236210658, + "grad_norm": 0.4005245268344879, + "learning_rate": 3.7477013812410422e-06, + "loss": 1.6229, + "step": 8433 + }, + { + "epoch": 0.8760776981406461, + "grad_norm": 0.3979766368865967, + "learning_rate": 3.7415059101354454e-06, + "loss": 1.5569, + "step": 8434 + }, + { + "epoch": 0.8761815726602264, + "grad_norm": 0.4287016987800598, + "learning_rate": 3.735315365198622e-06, + "loss": 1.5411, + "step": 8435 + }, + { + "epoch": 0.8762854471798068, + "grad_norm": 0.46041205525398254, + "learning_rate": 3.7291297470898435e-06, + "loss": 1.5864, + "step": 8436 + }, + { + "epoch": 0.8763893216993871, + "grad_norm": 0.4164537787437439, + "learning_rate": 3.7229490564678227e-06, + "loss": 1.6638, + "step": 8437 + }, + { + "epoch": 0.8764931962189675, + "grad_norm": 0.3900989294052124, + "learning_rate": 3.7167732939907496e-06, + "loss": 1.497, + "step": 8438 + }, + { + "epoch": 0.8765970707385479, + "grad_norm": 0.41212278604507446, + "learning_rate": 3.710602460316298e-06, + "loss": 1.5113, + "step": 8439 + }, + { + "epoch": 0.8767009452581281, + "grad_norm": 0.39359238743782043, + "learning_rate": 3.704436556101615e-06, + "loss": 1.5233, + "step": 8440 + }, + { + "epoch": 0.8768048197777085, + "grad_norm": 0.46984294056892395, + "learning_rate": 3.6982755820033186e-06, + "loss": 1.6151, + "step": 8441 + }, + { + "epoch": 0.8769086942972889, + "grad_norm": 0.4061124324798584, + "learning_rate": 3.6921195386774965e-06, + "loss": 1.5301, + "step": 8442 + }, + { + "epoch": 0.8770125688168692, + "grad_norm": 0.4347725808620453, + "learning_rate": 3.6859684267797455e-06, + "loss": 1.6463, + "step": 8443 + }, + { + "epoch": 0.8771164433364496, + "grad_norm": 0.46318212151527405, + "learning_rate": 3.679822246965087e-06, + "loss": 1.4855, + "step": 8444 + }, + { + "epoch": 0.8772203178560299, + "grad_norm": 0.4061053693294525, + "learning_rate": 3.673680999888035e-06, + "loss": 1.5532, + "step": 8445 + }, + { + "epoch": 0.8773241923756102, + "grad_norm": 0.3995457589626312, + "learning_rate": 3.6675446862026176e-06, + "loss": 1.5249, + "step": 8446 + }, + { + "epoch": 0.8774280668951906, + "grad_norm": 0.4302307665348053, + "learning_rate": 3.661413306562267e-06, + "loss": 1.6669, + "step": 8447 + }, + { + "epoch": 0.877531941414771, + "grad_norm": 0.45854660868644714, + "learning_rate": 3.65528686161995e-06, + "loss": 1.7169, + "step": 8448 + }, + { + "epoch": 0.8776358159343512, + "grad_norm": 0.4778292179107666, + "learning_rate": 3.649165352028083e-06, + "loss": 1.6258, + "step": 8449 + }, + { + "epoch": 0.8777396904539316, + "grad_norm": 0.3999958038330078, + "learning_rate": 3.6430487784385515e-06, + "loss": 1.6233, + "step": 8450 + }, + { + "epoch": 0.877843564973512, + "grad_norm": 0.40005001425743103, + "learning_rate": 3.636937141502733e-06, + "loss": 1.6136, + "step": 8451 + }, + { + "epoch": 0.8779474394930924, + "grad_norm": 0.4642762839794159, + "learning_rate": 3.6308304418714523e-06, + "loss": 1.8124, + "step": 8452 + }, + { + "epoch": 0.8780513140126727, + "grad_norm": 0.41780030727386475, + "learning_rate": 3.6247286801950497e-06, + "loss": 1.708, + "step": 8453 + }, + { + "epoch": 0.878155188532253, + "grad_norm": 0.3930474817752838, + "learning_rate": 3.6186318571232893e-06, + "loss": 1.5891, + "step": 8454 + }, + { + "epoch": 0.8782590630518334, + "grad_norm": 0.38920000195503235, + "learning_rate": 3.612539973305451e-06, + "loss": 1.6752, + "step": 8455 + }, + { + "epoch": 0.8783629375714137, + "grad_norm": 0.41854655742645264, + "learning_rate": 3.606453029390272e-06, + "loss": 1.6183, + "step": 8456 + }, + { + "epoch": 0.8784668120909941, + "grad_norm": 0.4399711489677429, + "learning_rate": 3.6003710260259616e-06, + "loss": 1.9086, + "step": 8457 + }, + { + "epoch": 0.8785706866105745, + "grad_norm": 0.4335370659828186, + "learning_rate": 3.5942939638602067e-06, + "loss": 1.5987, + "step": 8458 + }, + { + "epoch": 0.8786745611301547, + "grad_norm": 0.4267464280128479, + "learning_rate": 3.588221843540168e-06, + "loss": 1.7094, + "step": 8459 + }, + { + "epoch": 0.8787784356497351, + "grad_norm": 0.4027010500431061, + "learning_rate": 3.582154665712473e-06, + "loss": 1.6371, + "step": 8460 + }, + { + "epoch": 0.8788823101693155, + "grad_norm": 0.4093351662158966, + "learning_rate": 3.5760924310232258e-06, + "loss": 1.72, + "step": 8461 + }, + { + "epoch": 0.8789861846888958, + "grad_norm": 0.4228046238422394, + "learning_rate": 3.5700351401180333e-06, + "loss": 1.769, + "step": 8462 + }, + { + "epoch": 0.8790900592084762, + "grad_norm": 0.4321579337120056, + "learning_rate": 3.5639827936419123e-06, + "loss": 1.7098, + "step": 8463 + }, + { + "epoch": 0.8791939337280565, + "grad_norm": 0.45898208022117615, + "learning_rate": 3.5579353922394197e-06, + "loss": 1.7701, + "step": 8464 + }, + { + "epoch": 0.8792978082476368, + "grad_norm": 0.44818854331970215, + "learning_rate": 3.5518929365545406e-06, + "loss": 1.7982, + "step": 8465 + }, + { + "epoch": 0.8794016827672172, + "grad_norm": 0.3915591835975647, + "learning_rate": 3.5458554272307597e-06, + "loss": 1.6386, + "step": 8466 + }, + { + "epoch": 0.8795055572867976, + "grad_norm": 0.4124680459499359, + "learning_rate": 3.5398228649110186e-06, + "loss": 1.6979, + "step": 8467 + }, + { + "epoch": 0.8796094318063779, + "grad_norm": 0.4365387558937073, + "learning_rate": 3.533795250237737e-06, + "loss": 1.4209, + "step": 8468 + }, + { + "epoch": 0.8797133063259582, + "grad_norm": 0.4055129885673523, + "learning_rate": 3.527772583852812e-06, + "loss": 1.4331, + "step": 8469 + }, + { + "epoch": 0.8798171808455386, + "grad_norm": 0.4493137001991272, + "learning_rate": 3.521754866397603e-06, + "loss": 1.8114, + "step": 8470 + }, + { + "epoch": 0.8799210553651189, + "grad_norm": 0.40242356061935425, + "learning_rate": 3.5157420985129696e-06, + "loss": 1.5685, + "step": 8471 + }, + { + "epoch": 0.8800249298846993, + "grad_norm": 0.396705687046051, + "learning_rate": 3.5097342808392054e-06, + "loss": 1.4964, + "step": 8472 + }, + { + "epoch": 0.8801288044042797, + "grad_norm": 0.4387626647949219, + "learning_rate": 3.5037314140160926e-06, + "loss": 1.4608, + "step": 8473 + }, + { + "epoch": 0.8802326789238599, + "grad_norm": 0.47153568267822266, + "learning_rate": 3.4977334986829145e-06, + "loss": 1.6019, + "step": 8474 + }, + { + "epoch": 0.8803365534434403, + "grad_norm": 0.46217477321624756, + "learning_rate": 3.4917405354783772e-06, + "loss": 1.7727, + "step": 8475 + }, + { + "epoch": 0.8804404279630207, + "grad_norm": 0.48487722873687744, + "learning_rate": 3.4857525250406976e-06, + "loss": 1.7575, + "step": 8476 + }, + { + "epoch": 0.8805443024826011, + "grad_norm": 0.48644381761550903, + "learning_rate": 3.479769468007543e-06, + "loss": 1.6869, + "step": 8477 + }, + { + "epoch": 0.8806481770021813, + "grad_norm": 0.41009244322776794, + "learning_rate": 3.4737913650160826e-06, + "loss": 1.446, + "step": 8478 + }, + { + "epoch": 0.8807520515217617, + "grad_norm": 0.4005885720252991, + "learning_rate": 3.467818216702917e-06, + "loss": 1.5894, + "step": 8479 + }, + { + "epoch": 0.8808559260413421, + "grad_norm": 0.4276701509952545, + "learning_rate": 3.461850023704133e-06, + "loss": 1.5221, + "step": 8480 + }, + { + "epoch": 0.8809598005609224, + "grad_norm": 0.42804211378097534, + "learning_rate": 3.4558867866553324e-06, + "loss": 1.7771, + "step": 8481 + }, + { + "epoch": 0.8810636750805028, + "grad_norm": 0.42332831025123596, + "learning_rate": 3.4499285061915134e-06, + "loss": 1.6525, + "step": 8482 + }, + { + "epoch": 0.8811675496000831, + "grad_norm": 0.4782041311264038, + "learning_rate": 3.4439751829472123e-06, + "loss": 1.808, + "step": 8483 + }, + { + "epoch": 0.8812714241196634, + "grad_norm": 0.37131935358047485, + "learning_rate": 3.4380268175564e-06, + "loss": 1.4988, + "step": 8484 + }, + { + "epoch": 0.8813752986392438, + "grad_norm": 0.46065279841423035, + "learning_rate": 3.4320834106525367e-06, + "loss": 1.7904, + "step": 8485 + }, + { + "epoch": 0.8814791731588242, + "grad_norm": 0.41007938981056213, + "learning_rate": 3.4261449628685492e-06, + "loss": 1.4656, + "step": 8486 + }, + { + "epoch": 0.8815830476784045, + "grad_norm": 0.40511950850486755, + "learning_rate": 3.4202114748368264e-06, + "loss": 1.6166, + "step": 8487 + }, + { + "epoch": 0.8816869221979848, + "grad_norm": 0.42590075731277466, + "learning_rate": 3.414282947189251e-06, + "loss": 1.681, + "step": 8488 + }, + { + "epoch": 0.8817907967175652, + "grad_norm": 0.43543121218681335, + "learning_rate": 3.4083593805571523e-06, + "loss": 1.6805, + "step": 8489 + }, + { + "epoch": 0.8818946712371455, + "grad_norm": 0.4141135513782501, + "learning_rate": 3.402440775571364e-06, + "loss": 1.7227, + "step": 8490 + }, + { + "epoch": 0.8819985457567259, + "grad_norm": 0.4232790470123291, + "learning_rate": 3.3965271328621374e-06, + "loss": 1.792, + "step": 8491 + }, + { + "epoch": 0.8821024202763063, + "grad_norm": 0.40409740805625916, + "learning_rate": 3.390618453059263e-06, + "loss": 1.589, + "step": 8492 + }, + { + "epoch": 0.8822062947958865, + "grad_norm": 0.418250173330307, + "learning_rate": 3.384714736791955e-06, + "loss": 1.6795, + "step": 8493 + }, + { + "epoch": 0.8823101693154669, + "grad_norm": 0.42361485958099365, + "learning_rate": 3.3788159846889157e-06, + "loss": 1.6938, + "step": 8494 + }, + { + "epoch": 0.8824140438350473, + "grad_norm": 0.4593660831451416, + "learning_rate": 3.3729221973783144e-06, + "loss": 1.6643, + "step": 8495 + }, + { + "epoch": 0.8825179183546276, + "grad_norm": 0.4981370270252228, + "learning_rate": 3.3670333754877937e-06, + "loss": 1.8358, + "step": 8496 + }, + { + "epoch": 0.8826217928742079, + "grad_norm": 0.39285340905189514, + "learning_rate": 3.3611495196444685e-06, + "loss": 1.5936, + "step": 8497 + }, + { + "epoch": 0.8827256673937883, + "grad_norm": 0.41241446137428284, + "learning_rate": 3.355270630474916e-06, + "loss": 1.6163, + "step": 8498 + }, + { + "epoch": 0.8828295419133686, + "grad_norm": 0.4433367848396301, + "learning_rate": 3.3493967086052126e-06, + "loss": 1.5595, + "step": 8499 + }, + { + "epoch": 0.882933416432949, + "grad_norm": 0.4431458115577698, + "learning_rate": 3.343527754660858e-06, + "loss": 1.5006, + "step": 8500 + }, + { + "epoch": 0.8830372909525294, + "grad_norm": 0.41516363620758057, + "learning_rate": 3.3376637692668632e-06, + "loss": 1.6199, + "step": 8501 + }, + { + "epoch": 0.8831411654721097, + "grad_norm": 0.4274922311306, + "learning_rate": 3.3318047530477013e-06, + "loss": 1.7884, + "step": 8502 + }, + { + "epoch": 0.88324503999169, + "grad_norm": 0.3764563798904419, + "learning_rate": 3.3259507066273064e-06, + "loss": 1.4395, + "step": 8503 + }, + { + "epoch": 0.8833489145112704, + "grad_norm": 0.4132494032382965, + "learning_rate": 3.320101630629091e-06, + "loss": 1.7359, + "step": 8504 + }, + { + "epoch": 0.8834527890308508, + "grad_norm": 0.42872190475463867, + "learning_rate": 3.3142575256759346e-06, + "loss": 1.6038, + "step": 8505 + }, + { + "epoch": 0.883556663550431, + "grad_norm": 0.41533058881759644, + "learning_rate": 3.3084183923901834e-06, + "loss": 1.6193, + "step": 8506 + }, + { + "epoch": 0.8836605380700114, + "grad_norm": 0.39843878149986267, + "learning_rate": 3.302584231393674e-06, + "loss": 1.636, + "step": 8507 + }, + { + "epoch": 0.8837644125895918, + "grad_norm": 0.4171253740787506, + "learning_rate": 3.2967550433076754e-06, + "loss": 1.6229, + "step": 8508 + }, + { + "epoch": 0.8838682871091721, + "grad_norm": 0.4044714570045471, + "learning_rate": 3.290930828752986e-06, + "loss": 1.639, + "step": 8509 + }, + { + "epoch": 0.8839721616287525, + "grad_norm": 0.401554673910141, + "learning_rate": 3.2851115883497986e-06, + "loss": 1.6301, + "step": 8510 + }, + { + "epoch": 0.8840760361483329, + "grad_norm": 0.40703094005584717, + "learning_rate": 3.27929732271785e-06, + "loss": 1.5388, + "step": 8511 + }, + { + "epoch": 0.8841799106679131, + "grad_norm": 0.4279320538043976, + "learning_rate": 3.2734880324762962e-06, + "loss": 1.7176, + "step": 8512 + }, + { + "epoch": 0.8842837851874935, + "grad_norm": 0.5383460521697998, + "learning_rate": 3.267683718243786e-06, + "loss": 1.8459, + "step": 8513 + }, + { + "epoch": 0.8843876597070739, + "grad_norm": 0.4355732500553131, + "learning_rate": 3.261884380638436e-06, + "loss": 1.7073, + "step": 8514 + }, + { + "epoch": 0.8844915342266542, + "grad_norm": 0.4631870687007904, + "learning_rate": 3.25609002027783e-06, + "loss": 1.7029, + "step": 8515 + }, + { + "epoch": 0.8845954087462345, + "grad_norm": 0.3964858651161194, + "learning_rate": 3.250300637779019e-06, + "loss": 1.5025, + "step": 8516 + }, + { + "epoch": 0.8846992832658149, + "grad_norm": 0.4206657409667969, + "learning_rate": 3.2445162337585154e-06, + "loss": 1.8947, + "step": 8517 + }, + { + "epoch": 0.8848031577853952, + "grad_norm": 0.5454657077789307, + "learning_rate": 3.238736808832349e-06, + "loss": 1.9812, + "step": 8518 + }, + { + "epoch": 0.8849070323049756, + "grad_norm": 0.4348321557044983, + "learning_rate": 3.2329623636159435e-06, + "loss": 1.7276, + "step": 8519 + }, + { + "epoch": 0.885010906824556, + "grad_norm": 0.46917733550071716, + "learning_rate": 3.2271928987242518e-06, + "loss": 1.6525, + "step": 8520 + }, + { + "epoch": 0.8851147813441362, + "grad_norm": 0.45450276136398315, + "learning_rate": 3.2214284147716823e-06, + "loss": 1.7034, + "step": 8521 + }, + { + "epoch": 0.8852186558637166, + "grad_norm": 0.4001985490322113, + "learning_rate": 3.215668912372094e-06, + "loss": 1.5724, + "step": 8522 + }, + { + "epoch": 0.885322530383297, + "grad_norm": 0.41575512290000916, + "learning_rate": 3.209914392138835e-06, + "loss": 1.5867, + "step": 8523 + }, + { + "epoch": 0.8854264049028773, + "grad_norm": 0.42568308115005493, + "learning_rate": 3.2041648546847215e-06, + "loss": 1.6334, + "step": 8524 + }, + { + "epoch": 0.8855302794224577, + "grad_norm": 0.4289058744907379, + "learning_rate": 3.198420300622024e-06, + "loss": 1.6318, + "step": 8525 + }, + { + "epoch": 0.885634153942038, + "grad_norm": 0.39888161420822144, + "learning_rate": 3.192680730562492e-06, + "loss": 1.5733, + "step": 8526 + }, + { + "epoch": 0.8857380284616183, + "grad_norm": 0.42361804842948914, + "learning_rate": 3.18694614511737e-06, + "loss": 1.7382, + "step": 8527 + }, + { + "epoch": 0.8858419029811987, + "grad_norm": 0.4657709300518036, + "learning_rate": 3.181216544897314e-06, + "loss": 1.7749, + "step": 8528 + }, + { + "epoch": 0.8859457775007791, + "grad_norm": 0.4325384199619293, + "learning_rate": 3.175491930512492e-06, + "loss": 1.8558, + "step": 8529 + }, + { + "epoch": 0.8860496520203595, + "grad_norm": 0.4005754292011261, + "learning_rate": 3.1697723025725378e-06, + "loss": 1.5591, + "step": 8530 + }, + { + "epoch": 0.8861535265399397, + "grad_norm": 0.4035787880420685, + "learning_rate": 3.164057661686548e-06, + "loss": 1.6387, + "step": 8531 + }, + { + "epoch": 0.8862574010595201, + "grad_norm": 0.41289085149765015, + "learning_rate": 3.15834800846308e-06, + "loss": 1.6326, + "step": 8532 + }, + { + "epoch": 0.8863612755791005, + "grad_norm": 0.4399457573890686, + "learning_rate": 3.15264334351017e-06, + "loss": 1.6648, + "step": 8533 + }, + { + "epoch": 0.8864651500986808, + "grad_norm": 0.45032960176467896, + "learning_rate": 3.146943667435326e-06, + "loss": 1.8897, + "step": 8534 + }, + { + "epoch": 0.8865690246182611, + "grad_norm": 0.4220801293849945, + "learning_rate": 3.141248980845507e-06, + "loss": 1.6006, + "step": 8535 + }, + { + "epoch": 0.8866728991378415, + "grad_norm": 0.4171903133392334, + "learning_rate": 3.135559284347156e-06, + "loss": 1.4317, + "step": 8536 + }, + { + "epoch": 0.8867767736574218, + "grad_norm": 0.4253293573856354, + "learning_rate": 3.1298745785461937e-06, + "loss": 1.5809, + "step": 8537 + }, + { + "epoch": 0.8868806481770022, + "grad_norm": 0.4167105257511139, + "learning_rate": 3.1241948640479803e-06, + "loss": 1.6632, + "step": 8538 + }, + { + "epoch": 0.8869845226965826, + "grad_norm": 0.4174095690250397, + "learning_rate": 3.1185201414573707e-06, + "loss": 1.5574, + "step": 8539 + }, + { + "epoch": 0.8870883972161628, + "grad_norm": 0.43485039472579956, + "learning_rate": 3.112850411378676e-06, + "loss": 1.6832, + "step": 8540 + }, + { + "epoch": 0.8871922717357432, + "grad_norm": 0.4099987745285034, + "learning_rate": 3.10718567441568e-06, + "loss": 1.6025, + "step": 8541 + }, + { + "epoch": 0.8872961462553236, + "grad_norm": 0.4386424720287323, + "learning_rate": 3.101525931171634e-06, + "loss": 1.6887, + "step": 8542 + }, + { + "epoch": 0.8874000207749039, + "grad_norm": 0.3977483808994293, + "learning_rate": 3.095871182249255e-06, + "loss": 1.4197, + "step": 8543 + }, + { + "epoch": 0.8875038952944843, + "grad_norm": 0.42674514651298523, + "learning_rate": 3.0902214282507225e-06, + "loss": 1.655, + "step": 8544 + }, + { + "epoch": 0.8876077698140646, + "grad_norm": 0.4276520609855652, + "learning_rate": 3.0845766697776945e-06, + "loss": 1.6276, + "step": 8545 + }, + { + "epoch": 0.8877116443336449, + "grad_norm": 0.410883367061615, + "learning_rate": 3.078936907431312e-06, + "loss": 1.7068, + "step": 8546 + }, + { + "epoch": 0.8878155188532253, + "grad_norm": 0.4068666696548462, + "learning_rate": 3.0733021418121342e-06, + "loss": 1.5961, + "step": 8547 + }, + { + "epoch": 0.8879193933728057, + "grad_norm": 0.4384441375732422, + "learning_rate": 3.0676723735202416e-06, + "loss": 1.735, + "step": 8548 + }, + { + "epoch": 0.888023267892386, + "grad_norm": 0.41591498255729675, + "learning_rate": 3.0620476031551604e-06, + "loss": 1.7273, + "step": 8549 + }, + { + "epoch": 0.8881271424119663, + "grad_norm": 0.4390021860599518, + "learning_rate": 3.056427831315878e-06, + "loss": 1.6152, + "step": 8550 + }, + { + "epoch": 0.8882310169315467, + "grad_norm": 0.43032291531562805, + "learning_rate": 3.0508130586008545e-06, + "loss": 1.6926, + "step": 8551 + }, + { + "epoch": 0.888334891451127, + "grad_norm": 0.43168923258781433, + "learning_rate": 3.0452032856080227e-06, + "loss": 1.6707, + "step": 8552 + }, + { + "epoch": 0.8884387659707074, + "grad_norm": 0.42715707421302795, + "learning_rate": 3.039598512934783e-06, + "loss": 1.6203, + "step": 8553 + }, + { + "epoch": 0.8885426404902877, + "grad_norm": 0.45927006006240845, + "learning_rate": 3.0339987411779846e-06, + "loss": 1.7286, + "step": 8554 + }, + { + "epoch": 0.8886465150098681, + "grad_norm": 0.3992420434951782, + "learning_rate": 3.0284039709339795e-06, + "loss": 1.5416, + "step": 8555 + }, + { + "epoch": 0.8887503895294484, + "grad_norm": 0.4219547212123871, + "learning_rate": 3.0228142027985684e-06, + "loss": 1.6695, + "step": 8556 + }, + { + "epoch": 0.8888542640490288, + "grad_norm": 0.41342371702194214, + "learning_rate": 3.017229437366992e-06, + "loss": 1.5937, + "step": 8557 + }, + { + "epoch": 0.8889581385686092, + "grad_norm": 0.3998035788536072, + "learning_rate": 3.0116496752340127e-06, + "loss": 1.6053, + "step": 8558 + }, + { + "epoch": 0.8890620130881894, + "grad_norm": 0.38309964537620544, + "learning_rate": 3.0060749169938174e-06, + "loss": 1.491, + "step": 8559 + }, + { + "epoch": 0.8891658876077698, + "grad_norm": 0.43797820806503296, + "learning_rate": 3.000505163240075e-06, + "loss": 1.775, + "step": 8560 + }, + { + "epoch": 0.8892697621273502, + "grad_norm": 0.39977407455444336, + "learning_rate": 2.994940414565922e-06, + "loss": 1.5729, + "step": 8561 + }, + { + "epoch": 0.8893736366469305, + "grad_norm": 0.410889208316803, + "learning_rate": 2.9893806715639626e-06, + "loss": 1.5631, + "step": 8562 + }, + { + "epoch": 0.8894775111665109, + "grad_norm": 0.41645950078964233, + "learning_rate": 2.9838259348262675e-06, + "loss": 1.599, + "step": 8563 + }, + { + "epoch": 0.8895813856860912, + "grad_norm": 0.43443527817726135, + "learning_rate": 2.978276204944358e-06, + "loss": 1.6522, + "step": 8564 + }, + { + "epoch": 0.8896852602056715, + "grad_norm": 0.4481103718280792, + "learning_rate": 2.9727314825092667e-06, + "loss": 1.7648, + "step": 8565 + }, + { + "epoch": 0.8897891347252519, + "grad_norm": 0.427636057138443, + "learning_rate": 2.967191768111427e-06, + "loss": 1.6324, + "step": 8566 + }, + { + "epoch": 0.8898930092448323, + "grad_norm": 0.44498735666275024, + "learning_rate": 2.9616570623408057e-06, + "loss": 1.8669, + "step": 8567 + }, + { + "epoch": 0.8899968837644125, + "grad_norm": 0.4394892156124115, + "learning_rate": 2.956127365786793e-06, + "loss": 1.6186, + "step": 8568 + }, + { + "epoch": 0.8901007582839929, + "grad_norm": 0.4512022137641907, + "learning_rate": 2.950602679038261e-06, + "loss": 1.8238, + "step": 8569 + }, + { + "epoch": 0.8902046328035733, + "grad_norm": 0.5062389969825745, + "learning_rate": 2.9450830026835407e-06, + "loss": 1.7467, + "step": 8570 + }, + { + "epoch": 0.8903085073231536, + "grad_norm": 0.46624040603637695, + "learning_rate": 2.939568337310439e-06, + "loss": 1.931, + "step": 8571 + }, + { + "epoch": 0.890412381842734, + "grad_norm": 0.41087907552719116, + "learning_rate": 2.9340586835062302e-06, + "loss": 1.6322, + "step": 8572 + }, + { + "epoch": 0.8905162563623144, + "grad_norm": 0.39506015181541443, + "learning_rate": 2.9285540418576283e-06, + "loss": 1.5128, + "step": 8573 + }, + { + "epoch": 0.8906201308818946, + "grad_norm": 0.3968523144721985, + "learning_rate": 2.9230544129508655e-06, + "loss": 1.5671, + "step": 8574 + }, + { + "epoch": 0.890724005401475, + "grad_norm": 0.4123672544956207, + "learning_rate": 2.9175597973715828e-06, + "loss": 1.4673, + "step": 8575 + }, + { + "epoch": 0.8908278799210554, + "grad_norm": 0.4277864098548889, + "learning_rate": 2.9120701957049246e-06, + "loss": 1.6937, + "step": 8576 + }, + { + "epoch": 0.8909317544406357, + "grad_norm": 0.4264744222164154, + "learning_rate": 2.906585608535495e-06, + "loss": 1.5243, + "step": 8577 + }, + { + "epoch": 0.891035628960216, + "grad_norm": 0.45255225896835327, + "learning_rate": 2.9011060364473595e-06, + "loss": 1.5281, + "step": 8578 + }, + { + "epoch": 0.8911395034797964, + "grad_norm": 0.42393550276756287, + "learning_rate": 2.8956314800240415e-06, + "loss": 1.7556, + "step": 8579 + }, + { + "epoch": 0.8912433779993768, + "grad_norm": 0.4410860240459442, + "learning_rate": 2.8901619398485347e-06, + "loss": 1.7168, + "step": 8580 + }, + { + "epoch": 0.8913472525189571, + "grad_norm": 0.43080854415893555, + "learning_rate": 2.8846974165033235e-06, + "loss": 1.5863, + "step": 8581 + }, + { + "epoch": 0.8914511270385375, + "grad_norm": 0.3852452337741852, + "learning_rate": 2.8792379105703094e-06, + "loss": 1.5794, + "step": 8582 + }, + { + "epoch": 0.8915550015581178, + "grad_norm": 0.43257638812065125, + "learning_rate": 2.8737834226309156e-06, + "loss": 1.7181, + "step": 8583 + }, + { + "epoch": 0.8916588760776981, + "grad_norm": 0.42675426602363586, + "learning_rate": 2.868333953265989e-06, + "loss": 1.6535, + "step": 8584 + }, + { + "epoch": 0.8917627505972785, + "grad_norm": 0.40293970704078674, + "learning_rate": 2.8628895030558435e-06, + "loss": 1.6133, + "step": 8585 + }, + { + "epoch": 0.8918666251168589, + "grad_norm": 0.4669196903705597, + "learning_rate": 2.8574500725802865e-06, + "loss": 1.7044, + "step": 8586 + }, + { + "epoch": 0.8919704996364392, + "grad_norm": 0.43928948044776917, + "learning_rate": 2.8520156624185777e-06, + "loss": 1.6096, + "step": 8587 + }, + { + "epoch": 0.8920743741560195, + "grad_norm": 0.4221644401550293, + "learning_rate": 2.8465862731494264e-06, + "loss": 1.7143, + "step": 8588 + }, + { + "epoch": 0.8921782486755999, + "grad_norm": 0.4293653070926666, + "learning_rate": 2.841161905351031e-06, + "loss": 1.5406, + "step": 8589 + }, + { + "epoch": 0.8922821231951802, + "grad_norm": 0.45512861013412476, + "learning_rate": 2.8357425596010344e-06, + "loss": 1.7877, + "step": 8590 + }, + { + "epoch": 0.8923859977147606, + "grad_norm": 0.4328274726867676, + "learning_rate": 2.830328236476559e-06, + "loss": 2.0108, + "step": 8591 + }, + { + "epoch": 0.892489872234341, + "grad_norm": 0.4451085925102234, + "learning_rate": 2.8249189365541886e-06, + "loss": 1.731, + "step": 8592 + }, + { + "epoch": 0.8925937467539212, + "grad_norm": 0.4539354741573334, + "learning_rate": 2.819514660409983e-06, + "loss": 1.7513, + "step": 8593 + }, + { + "epoch": 0.8926976212735016, + "grad_norm": 0.4132786691188812, + "learning_rate": 2.814115408619428e-06, + "loss": 1.5909, + "step": 8594 + }, + { + "epoch": 0.892801495793082, + "grad_norm": 0.3973241448402405, + "learning_rate": 2.808721181757523e-06, + "loss": 1.7004, + "step": 8595 + }, + { + "epoch": 0.8929053703126623, + "grad_norm": 0.4304153025150299, + "learning_rate": 2.80333198039871e-06, + "loss": 1.6818, + "step": 8596 + }, + { + "epoch": 0.8930092448322426, + "grad_norm": 0.42142942547798157, + "learning_rate": 2.7979478051168906e-06, + "loss": 1.7153, + "step": 8597 + }, + { + "epoch": 0.893113119351823, + "grad_norm": 0.4120078384876251, + "learning_rate": 2.7925686564854447e-06, + "loss": 1.6648, + "step": 8598 + }, + { + "epoch": 0.8932169938714033, + "grad_norm": 0.40456852316856384, + "learning_rate": 2.7871945350771977e-06, + "loss": 1.6177, + "step": 8599 + }, + { + "epoch": 0.8933208683909837, + "grad_norm": 0.417198121547699, + "learning_rate": 2.781825441464464e-06, + "loss": 1.6729, + "step": 8600 + }, + { + "epoch": 0.8934247429105641, + "grad_norm": 0.4824064075946808, + "learning_rate": 2.776461376218997e-06, + "loss": 1.7711, + "step": 8601 + }, + { + "epoch": 0.8935286174301443, + "grad_norm": 0.44903162121772766, + "learning_rate": 2.7711023399120463e-06, + "loss": 1.532, + "step": 8602 + }, + { + "epoch": 0.8936324919497247, + "grad_norm": 0.4706164002418518, + "learning_rate": 2.7657483331142875e-06, + "loss": 1.7887, + "step": 8603 + }, + { + "epoch": 0.8937363664693051, + "grad_norm": 0.4258130192756653, + "learning_rate": 2.760399356395893e-06, + "loss": 1.7112, + "step": 8604 + }, + { + "epoch": 0.8938402409888855, + "grad_norm": 0.4342074692249298, + "learning_rate": 2.7550554103264845e-06, + "loss": 1.6769, + "step": 8605 + }, + { + "epoch": 0.8939441155084658, + "grad_norm": 0.43605124950408936, + "learning_rate": 2.749716495475152e-06, + "loss": 1.595, + "step": 8606 + }, + { + "epoch": 0.8940479900280461, + "grad_norm": 0.4049820899963379, + "learning_rate": 2.7443826124104454e-06, + "loss": 1.5847, + "step": 8607 + }, + { + "epoch": 0.8941518645476265, + "grad_norm": 0.41089296340942383, + "learning_rate": 2.7390537617003777e-06, + "loss": 1.4398, + "step": 8608 + }, + { + "epoch": 0.8942557390672068, + "grad_norm": 0.4379929006099701, + "learning_rate": 2.73372994391245e-06, + "loss": 1.7582, + "step": 8609 + }, + { + "epoch": 0.8943596135867872, + "grad_norm": 0.44860774278640747, + "learning_rate": 2.72841115961357e-06, + "loss": 1.7001, + "step": 8610 + }, + { + "epoch": 0.8944634881063676, + "grad_norm": 0.39442136883735657, + "learning_rate": 2.7230974093701844e-06, + "loss": 1.5964, + "step": 8611 + }, + { + "epoch": 0.8945673626259478, + "grad_norm": 0.4311879873275757, + "learning_rate": 2.7177886937481524e-06, + "loss": 1.7961, + "step": 8612 + }, + { + "epoch": 0.8946712371455282, + "grad_norm": 0.40751856565475464, + "learning_rate": 2.7124850133127987e-06, + "loss": 1.4809, + "step": 8613 + }, + { + "epoch": 0.8947751116651086, + "grad_norm": 0.3862675130367279, + "learning_rate": 2.707186368628939e-06, + "loss": 1.556, + "step": 8614 + }, + { + "epoch": 0.8948789861846889, + "grad_norm": 0.43987709283828735, + "learning_rate": 2.7018927602608323e-06, + "loss": 1.4624, + "step": 8615 + }, + { + "epoch": 0.8949828607042692, + "grad_norm": 0.5344493985176086, + "learning_rate": 2.6966041887722117e-06, + "loss": 1.8081, + "step": 8616 + }, + { + "epoch": 0.8950867352238496, + "grad_norm": 0.43068018555641174, + "learning_rate": 2.6913206547262593e-06, + "loss": 1.7128, + "step": 8617 + }, + { + "epoch": 0.8951906097434299, + "grad_norm": 0.45353829860687256, + "learning_rate": 2.6860421586856368e-06, + "loss": 1.7452, + "step": 8618 + }, + { + "epoch": 0.8952944842630103, + "grad_norm": 0.4327743947505951, + "learning_rate": 2.6807687012124605e-06, + "loss": 1.773, + "step": 8619 + }, + { + "epoch": 0.8953983587825907, + "grad_norm": 0.4295056462287903, + "learning_rate": 2.6755002828683096e-06, + "loss": 1.6954, + "step": 8620 + }, + { + "epoch": 0.8955022333021709, + "grad_norm": 0.4431517720222473, + "learning_rate": 2.6702369042142463e-06, + "loss": 1.7723, + "step": 8621 + }, + { + "epoch": 0.8956061078217513, + "grad_norm": 0.4079246520996094, + "learning_rate": 2.6649785658107494e-06, + "loss": 1.6812, + "step": 8622 + }, + { + "epoch": 0.8957099823413317, + "grad_norm": 0.427082896232605, + "learning_rate": 2.6597252682178165e-06, + "loss": 1.772, + "step": 8623 + }, + { + "epoch": 0.895813856860912, + "grad_norm": 0.4852583110332489, + "learning_rate": 2.6544770119948714e-06, + "loss": 1.6485, + "step": 8624 + }, + { + "epoch": 0.8959177313804924, + "grad_norm": 0.4270947575569153, + "learning_rate": 2.6492337977008173e-06, + "loss": 1.7824, + "step": 8625 + }, + { + "epoch": 0.8960216059000727, + "grad_norm": 0.4104156792163849, + "learning_rate": 2.6439956258940193e-06, + "loss": 1.5354, + "step": 8626 + }, + { + "epoch": 0.896125480419653, + "grad_norm": 0.4451044797897339, + "learning_rate": 2.638762497132291e-06, + "loss": 1.6791, + "step": 8627 + }, + { + "epoch": 0.8962293549392334, + "grad_norm": 0.44924893975257874, + "learning_rate": 2.633534411972921e-06, + "loss": 1.7615, + "step": 8628 + }, + { + "epoch": 0.8963332294588138, + "grad_norm": 0.40664803981781006, + "learning_rate": 2.628311370972664e-06, + "loss": 1.535, + "step": 8629 + }, + { + "epoch": 0.896437103978394, + "grad_norm": 0.43392449617385864, + "learning_rate": 2.6230933746877406e-06, + "loss": 1.6903, + "step": 8630 + }, + { + "epoch": 0.8965409784979744, + "grad_norm": 0.38720911741256714, + "learning_rate": 2.6178804236738073e-06, + "loss": 1.5899, + "step": 8631 + }, + { + "epoch": 0.8966448530175548, + "grad_norm": 0.49395009875297546, + "learning_rate": 2.61267251848602e-06, + "loss": 1.8366, + "step": 8632 + }, + { + "epoch": 0.8967487275371352, + "grad_norm": 0.41654813289642334, + "learning_rate": 2.6074696596789793e-06, + "loss": 1.6235, + "step": 8633 + }, + { + "epoch": 0.8968526020567155, + "grad_norm": 0.4147179424762726, + "learning_rate": 2.6022718478067363e-06, + "loss": 1.5863, + "step": 8634 + }, + { + "epoch": 0.8969564765762958, + "grad_norm": 0.46183696389198303, + "learning_rate": 2.597079083422832e-06, + "loss": 1.7443, + "step": 8635 + }, + { + "epoch": 0.8970603510958762, + "grad_norm": 0.4648350179195404, + "learning_rate": 2.5918913670802346e-06, + "loss": 1.7862, + "step": 8636 + }, + { + "epoch": 0.8971642256154565, + "grad_norm": 0.44168201088905334, + "learning_rate": 2.5867086993314305e-06, + "loss": 1.7847, + "step": 8637 + }, + { + "epoch": 0.8972681001350369, + "grad_norm": 0.40692418813705444, + "learning_rate": 2.5815310807282944e-06, + "loss": 1.6616, + "step": 8638 + }, + { + "epoch": 0.8973719746546173, + "grad_norm": 0.4546252489089966, + "learning_rate": 2.5763585118222246e-06, + "loss": 1.8507, + "step": 8639 + }, + { + "epoch": 0.8974758491741975, + "grad_norm": 0.5330055356025696, + "learning_rate": 2.5711909931640634e-06, + "loss": 1.9273, + "step": 8640 + }, + { + "epoch": 0.8975797236937779, + "grad_norm": 0.4015289545059204, + "learning_rate": 2.566028525304087e-06, + "loss": 1.6388, + "step": 8641 + }, + { + "epoch": 0.8976835982133583, + "grad_norm": 0.4576239585876465, + "learning_rate": 2.5608711087920844e-06, + "loss": 1.7742, + "step": 8642 + }, + { + "epoch": 0.8977874727329386, + "grad_norm": 0.42561182379722595, + "learning_rate": 2.5557187441772655e-06, + "loss": 1.5785, + "step": 8643 + }, + { + "epoch": 0.897891347252519, + "grad_norm": 0.43728065490722656, + "learning_rate": 2.550571432008325e-06, + "loss": 1.6983, + "step": 8644 + }, + { + "epoch": 0.8979952217720993, + "grad_norm": 0.4214796721935272, + "learning_rate": 2.5454291728334022e-06, + "loss": 1.7185, + "step": 8645 + }, + { + "epoch": 0.8980990962916796, + "grad_norm": 0.38102787733078003, + "learning_rate": 2.5402919672001147e-06, + "loss": 1.4215, + "step": 8646 + }, + { + "epoch": 0.89820297081126, + "grad_norm": 0.4356735050678253, + "learning_rate": 2.5351598156555356e-06, + "loss": 1.5594, + "step": 8647 + }, + { + "epoch": 0.8983068453308404, + "grad_norm": 0.4779966473579407, + "learning_rate": 2.5300327187461837e-06, + "loss": 1.7134, + "step": 8648 + }, + { + "epoch": 0.8984107198504206, + "grad_norm": 0.4045184850692749, + "learning_rate": 2.524910677018083e-06, + "loss": 1.5643, + "step": 8649 + }, + { + "epoch": 0.898514594370001, + "grad_norm": 0.4227657914161682, + "learning_rate": 2.519793691016664e-06, + "loss": 1.6597, + "step": 8650 + }, + { + "epoch": 0.8986184688895814, + "grad_norm": 0.40350401401519775, + "learning_rate": 2.5146817612868634e-06, + "loss": 1.5878, + "step": 8651 + }, + { + "epoch": 0.8987223434091617, + "grad_norm": 0.42735230922698975, + "learning_rate": 2.5095748883730562e-06, + "loss": 1.6392, + "step": 8652 + }, + { + "epoch": 0.8988262179287421, + "grad_norm": 0.4346289038658142, + "learning_rate": 2.50447307281908e-06, + "loss": 1.6582, + "step": 8653 + }, + { + "epoch": 0.8989300924483224, + "grad_norm": 0.4567740559577942, + "learning_rate": 2.4993763151682448e-06, + "loss": 1.6875, + "step": 8654 + }, + { + "epoch": 0.8990339669679027, + "grad_norm": 0.4155108332633972, + "learning_rate": 2.4942846159633104e-06, + "loss": 1.6108, + "step": 8655 + }, + { + "epoch": 0.8991378414874831, + "grad_norm": 0.3986811935901642, + "learning_rate": 2.4891979757465102e-06, + "loss": 1.6022, + "step": 8656 + }, + { + "epoch": 0.8992417160070635, + "grad_norm": 0.39959198236465454, + "learning_rate": 2.484116395059516e-06, + "loss": 1.6271, + "step": 8657 + }, + { + "epoch": 0.8993455905266439, + "grad_norm": 0.3958359956741333, + "learning_rate": 2.479039874443495e-06, + "loss": 1.4597, + "step": 8658 + }, + { + "epoch": 0.8994494650462241, + "grad_norm": 0.40250423550605774, + "learning_rate": 2.473968414439054e-06, + "loss": 1.5915, + "step": 8659 + }, + { + "epoch": 0.8995533395658045, + "grad_norm": 0.4547244608402252, + "learning_rate": 2.468902015586255e-06, + "loss": 1.7938, + "step": 8660 + }, + { + "epoch": 0.8996572140853849, + "grad_norm": 0.40203481912612915, + "learning_rate": 2.4638406784246337e-06, + "loss": 1.5711, + "step": 8661 + }, + { + "epoch": 0.8997610886049652, + "grad_norm": 0.4178847074508667, + "learning_rate": 2.458784403493186e-06, + "loss": 1.5101, + "step": 8662 + }, + { + "epoch": 0.8998649631245456, + "grad_norm": 0.42161715030670166, + "learning_rate": 2.453733191330365e-06, + "loss": 1.706, + "step": 8663 + }, + { + "epoch": 0.8999688376441259, + "grad_norm": 0.4573306739330292, + "learning_rate": 2.448687042474074e-06, + "loss": 1.8456, + "step": 8664 + }, + { + "epoch": 0.9000727121637062, + "grad_norm": 0.3947295844554901, + "learning_rate": 2.4436459574617155e-06, + "loss": 1.6289, + "step": 8665 + }, + { + "epoch": 0.9001765866832866, + "grad_norm": 0.3863442540168762, + "learning_rate": 2.4386099368300943e-06, + "loss": 1.6296, + "step": 8666 + }, + { + "epoch": 0.900280461202867, + "grad_norm": 0.4450392723083496, + "learning_rate": 2.4335789811155253e-06, + "loss": 1.7748, + "step": 8667 + }, + { + "epoch": 0.9003843357224472, + "grad_norm": 0.4072805643081665, + "learning_rate": 2.4285530908537747e-06, + "loss": 1.6025, + "step": 8668 + }, + { + "epoch": 0.9004882102420276, + "grad_norm": 0.44600561261177063, + "learning_rate": 2.423532266580031e-06, + "loss": 1.6433, + "step": 8669 + }, + { + "epoch": 0.900592084761608, + "grad_norm": 0.4108854830265045, + "learning_rate": 2.4185165088289996e-06, + "loss": 1.55, + "step": 8670 + }, + { + "epoch": 0.9006959592811883, + "grad_norm": 0.4555318355560303, + "learning_rate": 2.4135058181348036e-06, + "loss": 1.8198, + "step": 8671 + }, + { + "epoch": 0.9007998338007687, + "grad_norm": 0.4197269082069397, + "learning_rate": 2.4085001950310546e-06, + "loss": 1.7265, + "step": 8672 + }, + { + "epoch": 0.900903708320349, + "grad_norm": 0.39955320954322815, + "learning_rate": 2.403499640050805e-06, + "loss": 1.5325, + "step": 8673 + }, + { + "epoch": 0.9010075828399293, + "grad_norm": 0.4017268717288971, + "learning_rate": 2.3985041537265774e-06, + "loss": 1.5629, + "step": 8674 + }, + { + "epoch": 0.9011114573595097, + "grad_norm": 0.4230203926563263, + "learning_rate": 2.3935137365903526e-06, + "loss": 1.6934, + "step": 8675 + }, + { + "epoch": 0.9012153318790901, + "grad_norm": 0.43759918212890625, + "learning_rate": 2.3885283891735664e-06, + "loss": 1.5736, + "step": 8676 + }, + { + "epoch": 0.9013192063986704, + "grad_norm": 0.49041640758514404, + "learning_rate": 2.383548112007128e-06, + "loss": 1.6968, + "step": 8677 + }, + { + "epoch": 0.9014230809182507, + "grad_norm": 0.4059118926525116, + "learning_rate": 2.3785729056213845e-06, + "loss": 1.6976, + "step": 8678 + }, + { + "epoch": 0.9015269554378311, + "grad_norm": 0.4104028642177582, + "learning_rate": 2.3736027705461737e-06, + "loss": 1.581, + "step": 8679 + }, + { + "epoch": 0.9016308299574114, + "grad_norm": 0.4261881709098816, + "learning_rate": 2.368637707310767e-06, + "loss": 1.6556, + "step": 8680 + }, + { + "epoch": 0.9017347044769918, + "grad_norm": 0.43607598543167114, + "learning_rate": 2.3636777164439016e-06, + "loss": 1.802, + "step": 8681 + }, + { + "epoch": 0.9018385789965722, + "grad_norm": 0.5014055371284485, + "learning_rate": 2.3587227984737835e-06, + "loss": 1.8442, + "step": 8682 + }, + { + "epoch": 0.9019424535161525, + "grad_norm": 0.3931456208229065, + "learning_rate": 2.3537729539280684e-06, + "loss": 1.5732, + "step": 8683 + }, + { + "epoch": 0.9020463280357328, + "grad_norm": 0.3907303810119629, + "learning_rate": 2.3488281833338844e-06, + "loss": 1.4718, + "step": 8684 + }, + { + "epoch": 0.9021502025553132, + "grad_norm": 0.4133702516555786, + "learning_rate": 2.3438884872177992e-06, + "loss": 1.5745, + "step": 8685 + }, + { + "epoch": 0.9022540770748936, + "grad_norm": 0.45526713132858276, + "learning_rate": 2.338953866105864e-06, + "loss": 1.8402, + "step": 8686 + }, + { + "epoch": 0.9023579515944739, + "grad_norm": 0.4495735764503479, + "learning_rate": 2.3340243205235703e-06, + "loss": 1.7328, + "step": 8687 + }, + { + "epoch": 0.9024618261140542, + "grad_norm": 0.43973442912101746, + "learning_rate": 2.3290998509958752e-06, + "loss": 1.7364, + "step": 8688 + }, + { + "epoch": 0.9025657006336346, + "grad_norm": 0.4277052879333496, + "learning_rate": 2.324180458047198e-06, + "loss": 1.6693, + "step": 8689 + }, + { + "epoch": 0.9026695751532149, + "grad_norm": 0.4482114315032959, + "learning_rate": 2.3192661422014204e-06, + "loss": 1.7195, + "step": 8690 + }, + { + "epoch": 0.9027734496727953, + "grad_norm": 0.44713294506073, + "learning_rate": 2.3143569039818737e-06, + "loss": 1.5864, + "step": 8691 + }, + { + "epoch": 0.9028773241923757, + "grad_norm": 0.4385944604873657, + "learning_rate": 2.3094527439113446e-06, + "loss": 1.7092, + "step": 8692 + }, + { + "epoch": 0.9029811987119559, + "grad_norm": 0.39296817779541016, + "learning_rate": 2.3045536625121155e-06, + "loss": 1.5689, + "step": 8693 + }, + { + "epoch": 0.9030850732315363, + "grad_norm": 0.43056434392929077, + "learning_rate": 2.2996596603058695e-06, + "loss": 1.7061, + "step": 8694 + }, + { + "epoch": 0.9031889477511167, + "grad_norm": 0.49173492193222046, + "learning_rate": 2.294770737813795e-06, + "loss": 1.6226, + "step": 8695 + }, + { + "epoch": 0.903292822270697, + "grad_norm": 0.40122538805007935, + "learning_rate": 2.289886895556531e-06, + "loss": 1.5687, + "step": 8696 + }, + { + "epoch": 0.9033966967902773, + "grad_norm": 0.4927726089954376, + "learning_rate": 2.285008134054145e-06, + "loss": 1.9438, + "step": 8697 + }, + { + "epoch": 0.9035005713098577, + "grad_norm": 0.4643442928791046, + "learning_rate": 2.2801344538262103e-06, + "loss": 1.6155, + "step": 8698 + }, + { + "epoch": 0.903604445829438, + "grad_norm": 0.4130551517009735, + "learning_rate": 2.2752658553917226e-06, + "loss": 1.669, + "step": 8699 + }, + { + "epoch": 0.9037083203490184, + "grad_norm": 0.46958547830581665, + "learning_rate": 2.270402339269162e-06, + "loss": 1.7943, + "step": 8700 + }, + { + "epoch": 0.9038121948685988, + "grad_norm": 0.46900489926338196, + "learning_rate": 2.2655439059764415e-06, + "loss": 1.5809, + "step": 8701 + }, + { + "epoch": 0.903916069388179, + "grad_norm": 0.42625823616981506, + "learning_rate": 2.2606905560309534e-06, + "loss": 1.5512, + "step": 8702 + }, + { + "epoch": 0.9040199439077594, + "grad_norm": 0.38436755537986755, + "learning_rate": 2.2558422899495392e-06, + "loss": 1.5806, + "step": 8703 + }, + { + "epoch": 0.9041238184273398, + "grad_norm": 0.4123709201812744, + "learning_rate": 2.2509991082484914e-06, + "loss": 1.71, + "step": 8704 + }, + { + "epoch": 0.9042276929469201, + "grad_norm": 0.4161551594734192, + "learning_rate": 2.2461610114436038e-06, + "loss": 1.5765, + "step": 8705 + }, + { + "epoch": 0.9043315674665005, + "grad_norm": 0.409616619348526, + "learning_rate": 2.241328000050058e-06, + "loss": 1.4221, + "step": 8706 + }, + { + "epoch": 0.9044354419860808, + "grad_norm": 0.4236219525337219, + "learning_rate": 2.236500074582554e-06, + "loss": 1.6554, + "step": 8707 + }, + { + "epoch": 0.9045393165056611, + "grad_norm": 0.4036356508731842, + "learning_rate": 2.2316772355552185e-06, + "loss": 1.4211, + "step": 8708 + }, + { + "epoch": 0.9046431910252415, + "grad_norm": 0.40732133388519287, + "learning_rate": 2.226859483481658e-06, + "loss": 1.6462, + "step": 8709 + }, + { + "epoch": 0.9047470655448219, + "grad_norm": 0.4440615773200989, + "learning_rate": 2.222046818874912e-06, + "loss": 1.6724, + "step": 8710 + }, + { + "epoch": 0.9048509400644023, + "grad_norm": 0.39947155117988586, + "learning_rate": 2.217239242247493e-06, + "loss": 1.5371, + "step": 8711 + }, + { + "epoch": 0.9049548145839825, + "grad_norm": 0.41746270656585693, + "learning_rate": 2.2124367541113856e-06, + "loss": 1.7528, + "step": 8712 + }, + { + "epoch": 0.9050586891035629, + "grad_norm": 0.40026289224624634, + "learning_rate": 2.207639354977997e-06, + "loss": 1.5377, + "step": 8713 + }, + { + "epoch": 0.9051625636231433, + "grad_norm": 0.4574800133705139, + "learning_rate": 2.2028470453582253e-06, + "loss": 1.798, + "step": 8714 + }, + { + "epoch": 0.9052664381427236, + "grad_norm": 0.4214540123939514, + "learning_rate": 2.1980598257624165e-06, + "loss": 1.6247, + "step": 8715 + }, + { + "epoch": 0.9053703126623039, + "grad_norm": 0.4481583535671234, + "learning_rate": 2.193277696700363e-06, + "loss": 1.8091, + "step": 8716 + }, + { + "epoch": 0.9054741871818843, + "grad_norm": 0.45233625173568726, + "learning_rate": 2.188500658681325e-06, + "loss": 1.8274, + "step": 8717 + }, + { + "epoch": 0.9055780617014646, + "grad_norm": 0.41306886076927185, + "learning_rate": 2.183728712214028e-06, + "loss": 1.5299, + "step": 8718 + }, + { + "epoch": 0.905681936221045, + "grad_norm": 0.4653450846672058, + "learning_rate": 2.1789618578066386e-06, + "loss": 1.7601, + "step": 8719 + }, + { + "epoch": 0.9057858107406254, + "grad_norm": 0.4584251642227173, + "learning_rate": 2.1742000959667885e-06, + "loss": 1.624, + "step": 8720 + }, + { + "epoch": 0.9058896852602056, + "grad_norm": 0.4341670870780945, + "learning_rate": 2.169443427201584e-06, + "loss": 1.5616, + "step": 8721 + }, + { + "epoch": 0.905993559779786, + "grad_norm": 0.408926397562027, + "learning_rate": 2.164691852017553e-06, + "loss": 1.6354, + "step": 8722 + }, + { + "epoch": 0.9060974342993664, + "grad_norm": 0.41678133606910706, + "learning_rate": 2.159945370920713e-06, + "loss": 1.7588, + "step": 8723 + }, + { + "epoch": 0.9062013088189467, + "grad_norm": 0.4230055809020996, + "learning_rate": 2.1552039844165316e-06, + "loss": 1.6595, + "step": 8724 + }, + { + "epoch": 0.906305183338527, + "grad_norm": 0.4654471278190613, + "learning_rate": 2.1504676930099056e-06, + "loss": 1.6695, + "step": 8725 + }, + { + "epoch": 0.9064090578581074, + "grad_norm": 0.4271183907985687, + "learning_rate": 2.145736497205242e-06, + "loss": 1.6587, + "step": 8726 + }, + { + "epoch": 0.9065129323776877, + "grad_norm": 0.4363931119441986, + "learning_rate": 2.14101039750636e-06, + "loss": 1.6881, + "step": 8727 + }, + { + "epoch": 0.9066168068972681, + "grad_norm": 0.4049679636955261, + "learning_rate": 2.1362893944165573e-06, + "loss": 1.6157, + "step": 8728 + }, + { + "epoch": 0.9067206814168485, + "grad_norm": 0.4094752371311188, + "learning_rate": 2.131573488438582e-06, + "loss": 1.6042, + "step": 8729 + }, + { + "epoch": 0.9068245559364287, + "grad_norm": 0.41162753105163574, + "learning_rate": 2.1268626800746427e-06, + "loss": 1.738, + "step": 8730 + }, + { + "epoch": 0.9069284304560091, + "grad_norm": 0.4202596843242645, + "learning_rate": 2.1221569698264054e-06, + "loss": 1.662, + "step": 8731 + }, + { + "epoch": 0.9070323049755895, + "grad_norm": 0.40823620557785034, + "learning_rate": 2.11745635819498e-06, + "loss": 1.6081, + "step": 8732 + }, + { + "epoch": 0.9071361794951698, + "grad_norm": 0.4370567798614502, + "learning_rate": 2.1127608456809667e-06, + "loss": 1.6897, + "step": 8733 + }, + { + "epoch": 0.9072400540147502, + "grad_norm": 0.4072186052799225, + "learning_rate": 2.108070432784376e-06, + "loss": 1.6987, + "step": 8734 + }, + { + "epoch": 0.9073439285343305, + "grad_norm": 0.4287233352661133, + "learning_rate": 2.1033851200047195e-06, + "loss": 1.5307, + "step": 8735 + }, + { + "epoch": 0.9074478030539109, + "grad_norm": 0.40400230884552, + "learning_rate": 2.098704907840943e-06, + "loss": 1.5036, + "step": 8736 + }, + { + "epoch": 0.9075516775734912, + "grad_norm": 0.428813636302948, + "learning_rate": 2.094029796791441e-06, + "loss": 1.691, + "step": 8737 + }, + { + "epoch": 0.9076555520930716, + "grad_norm": 0.4154002368450165, + "learning_rate": 2.0893597873540947e-06, + "loss": 1.6238, + "step": 8738 + }, + { + "epoch": 0.907759426612652, + "grad_norm": 0.42926695942878723, + "learning_rate": 2.0846948800262e-06, + "loss": 1.765, + "step": 8739 + }, + { + "epoch": 0.9078633011322322, + "grad_norm": 0.41468799114227295, + "learning_rate": 2.080035075304565e-06, + "loss": 1.785, + "step": 8740 + }, + { + "epoch": 0.9079671756518126, + "grad_norm": 0.4041023254394531, + "learning_rate": 2.0753803736853872e-06, + "loss": 1.7357, + "step": 8741 + }, + { + "epoch": 0.908071050171393, + "grad_norm": 0.39929163455963135, + "learning_rate": 2.070730775664381e-06, + "loss": 1.5915, + "step": 8742 + }, + { + "epoch": 0.9081749246909733, + "grad_norm": 0.3934778571128845, + "learning_rate": 2.0660862817366846e-06, + "loss": 1.5252, + "step": 8743 + }, + { + "epoch": 0.9082787992105537, + "grad_norm": 0.4070751368999481, + "learning_rate": 2.061446892396901e-06, + "loss": 1.6859, + "step": 8744 + }, + { + "epoch": 0.908382673730134, + "grad_norm": 0.4303978979587555, + "learning_rate": 2.0568126081390916e-06, + "loss": 1.635, + "step": 8745 + }, + { + "epoch": 0.9084865482497143, + "grad_norm": 0.43727797269821167, + "learning_rate": 2.052183429456761e-06, + "loss": 1.7292, + "step": 8746 + }, + { + "epoch": 0.9085904227692947, + "grad_norm": 0.40043336153030396, + "learning_rate": 2.047559356842893e-06, + "loss": 1.6206, + "step": 8747 + }, + { + "epoch": 0.9086942972888751, + "grad_norm": 0.4222732186317444, + "learning_rate": 2.042940390789899e-06, + "loss": 1.7271, + "step": 8748 + }, + { + "epoch": 0.9087981718084553, + "grad_norm": 0.4159920811653137, + "learning_rate": 2.0383265317896916e-06, + "loss": 1.6506, + "step": 8749 + }, + { + "epoch": 0.9089020463280357, + "grad_norm": 0.4522947669029236, + "learning_rate": 2.033717780333577e-06, + "loss": 1.6477, + "step": 8750 + }, + { + "epoch": 0.9090059208476161, + "grad_norm": 0.4436376392841339, + "learning_rate": 2.0291141369123746e-06, + "loss": 1.6716, + "step": 8751 + }, + { + "epoch": 0.9091097953671964, + "grad_norm": 0.42394956946372986, + "learning_rate": 2.0245156020163413e-06, + "loss": 1.5676, + "step": 8752 + }, + { + "epoch": 0.9092136698867768, + "grad_norm": 0.4075998365879059, + "learning_rate": 2.019922176135153e-06, + "loss": 1.7053, + "step": 8753 + }, + { + "epoch": 0.9093175444063571, + "grad_norm": 0.4617473781108856, + "learning_rate": 2.0153338597580006e-06, + "loss": 1.7169, + "step": 8754 + }, + { + "epoch": 0.9094214189259374, + "grad_norm": 0.43988341093063354, + "learning_rate": 2.0107506533735e-06, + "loss": 1.7221, + "step": 8755 + }, + { + "epoch": 0.9095252934455178, + "grad_norm": 0.3944602906703949, + "learning_rate": 2.0061725574697266e-06, + "loss": 1.4159, + "step": 8756 + }, + { + "epoch": 0.9096291679650982, + "grad_norm": 0.4070970416069031, + "learning_rate": 2.0015995725342073e-06, + "loss": 1.5355, + "step": 8757 + }, + { + "epoch": 0.9097330424846785, + "grad_norm": 0.4194161891937256, + "learning_rate": 1.9970316990539306e-06, + "loss": 1.6739, + "step": 8758 + }, + { + "epoch": 0.9098369170042588, + "grad_norm": 0.4412401020526886, + "learning_rate": 1.9924689375153406e-06, + "loss": 1.6932, + "step": 8759 + }, + { + "epoch": 0.9099407915238392, + "grad_norm": 0.483318030834198, + "learning_rate": 1.9879112884043315e-06, + "loss": 1.7716, + "step": 8760 + }, + { + "epoch": 0.9100446660434196, + "grad_norm": 0.3971022367477417, + "learning_rate": 1.983358752206266e-06, + "loss": 1.7038, + "step": 8761 + }, + { + "epoch": 0.9101485405629999, + "grad_norm": 0.3960052728652954, + "learning_rate": 1.9788113294059498e-06, + "loss": 1.5655, + "step": 8762 + }, + { + "epoch": 0.9102524150825803, + "grad_norm": 0.4107067286968231, + "learning_rate": 1.974269020487646e-06, + "loss": 1.5848, + "step": 8763 + }, + { + "epoch": 0.9103562896021606, + "grad_norm": 0.42935070395469666, + "learning_rate": 1.969731825935073e-06, + "loss": 1.5146, + "step": 8764 + }, + { + "epoch": 0.9104601641217409, + "grad_norm": 0.4849339723587036, + "learning_rate": 1.9651997462314166e-06, + "loss": 1.7126, + "step": 8765 + }, + { + "epoch": 0.9105640386413213, + "grad_norm": 0.4564700424671173, + "learning_rate": 1.9606727818592963e-06, + "loss": 1.8559, + "step": 8766 + }, + { + "epoch": 0.9106679131609017, + "grad_norm": 0.43314129114151, + "learning_rate": 1.9561509333008045e-06, + "loss": 1.6292, + "step": 8767 + }, + { + "epoch": 0.910771787680482, + "grad_norm": 0.460445374250412, + "learning_rate": 1.9516342010374888e-06, + "loss": 1.7847, + "step": 8768 + }, + { + "epoch": 0.9108756622000623, + "grad_norm": 0.4221862554550171, + "learning_rate": 1.9471225855503262e-06, + "loss": 1.6476, + "step": 8769 + }, + { + "epoch": 0.9109795367196427, + "grad_norm": 0.3845740258693695, + "learning_rate": 1.9426160873197874e-06, + "loss": 1.5905, + "step": 8770 + }, + { + "epoch": 0.911083411239223, + "grad_norm": 0.42310813069343567, + "learning_rate": 1.938114706825778e-06, + "loss": 1.7829, + "step": 8771 + }, + { + "epoch": 0.9111872857588034, + "grad_norm": 0.42070627212524414, + "learning_rate": 1.9336184445476525e-06, + "loss": 1.5937, + "step": 8772 + }, + { + "epoch": 0.9112911602783837, + "grad_norm": 0.5164587497711182, + "learning_rate": 1.9291273009642287e-06, + "loss": 1.7094, + "step": 8773 + }, + { + "epoch": 0.911395034797964, + "grad_norm": 0.39574146270751953, + "learning_rate": 1.924641276553779e-06, + "loss": 1.5736, + "step": 8774 + }, + { + "epoch": 0.9114989093175444, + "grad_norm": 0.3940182030200958, + "learning_rate": 1.9201603717940385e-06, + "loss": 1.3872, + "step": 8775 + }, + { + "epoch": 0.9116027838371248, + "grad_norm": 0.42792201042175293, + "learning_rate": 1.9156845871621698e-06, + "loss": 1.7079, + "step": 8776 + }, + { + "epoch": 0.9117066583567051, + "grad_norm": 0.40028658509254456, + "learning_rate": 1.91121392313483e-06, + "loss": 1.564, + "step": 8777 + }, + { + "epoch": 0.9118105328762854, + "grad_norm": 0.462466835975647, + "learning_rate": 1.9067483801881002e-06, + "loss": 1.7931, + "step": 8778 + }, + { + "epoch": 0.9119144073958658, + "grad_norm": 0.42079779505729675, + "learning_rate": 1.9022879587975162e-06, + "loss": 1.6593, + "step": 8779 + }, + { + "epoch": 0.9120182819154461, + "grad_norm": 0.4041844606399536, + "learning_rate": 1.8978326594381035e-06, + "loss": 1.4404, + "step": 8780 + }, + { + "epoch": 0.9121221564350265, + "grad_norm": 0.4102264940738678, + "learning_rate": 1.8933824825842828e-06, + "loss": 1.5394, + "step": 8781 + }, + { + "epoch": 0.9122260309546069, + "grad_norm": 0.451555073261261, + "learning_rate": 1.8889374287099916e-06, + "loss": 1.7135, + "step": 8782 + }, + { + "epoch": 0.9123299054741871, + "grad_norm": 0.399200975894928, + "learning_rate": 1.8844974982885843e-06, + "loss": 1.4235, + "step": 8783 + }, + { + "epoch": 0.9124337799937675, + "grad_norm": 0.42155495285987854, + "learning_rate": 1.8800626917928776e-06, + "loss": 1.5576, + "step": 8784 + }, + { + "epoch": 0.9125376545133479, + "grad_norm": 0.4104582965373993, + "learning_rate": 1.8756330096951435e-06, + "loss": 1.6632, + "step": 8785 + }, + { + "epoch": 0.9126415290329283, + "grad_norm": 0.4192086160182953, + "learning_rate": 1.8712084524671047e-06, + "loss": 1.6348, + "step": 8786 + }, + { + "epoch": 0.9127454035525085, + "grad_norm": 0.40471452474594116, + "learning_rate": 1.866789020579962e-06, + "loss": 1.5868, + "step": 8787 + }, + { + "epoch": 0.9128492780720889, + "grad_norm": 0.4262443482875824, + "learning_rate": 1.862374714504317e-06, + "loss": 1.6801, + "step": 8788 + }, + { + "epoch": 0.9129531525916693, + "grad_norm": 0.4280400574207306, + "learning_rate": 1.8579655347102875e-06, + "loss": 1.5403, + "step": 8789 + }, + { + "epoch": 0.9130570271112496, + "grad_norm": 0.4044937193393707, + "learning_rate": 1.853561481667404e-06, + "loss": 1.5541, + "step": 8790 + }, + { + "epoch": 0.91316090163083, + "grad_norm": 0.41494637727737427, + "learning_rate": 1.8491625558446634e-06, + "loss": 1.7474, + "step": 8791 + }, + { + "epoch": 0.9132647761504104, + "grad_norm": 0.391250878572464, + "learning_rate": 1.8447687577105244e-06, + "loss": 1.6448, + "step": 8792 + }, + { + "epoch": 0.9133686506699906, + "grad_norm": 0.4285977780818939, + "learning_rate": 1.8403800877328847e-06, + "loss": 1.6618, + "step": 8793 + }, + { + "epoch": 0.913472525189571, + "grad_norm": 0.4157012104988098, + "learning_rate": 1.835996546379104e-06, + "loss": 1.6608, + "step": 8794 + }, + { + "epoch": 0.9135763997091514, + "grad_norm": 0.41410088539123535, + "learning_rate": 1.8316181341159977e-06, + "loss": 1.7398, + "step": 8795 + }, + { + "epoch": 0.9136802742287317, + "grad_norm": 0.43308156728744507, + "learning_rate": 1.8272448514098373e-06, + "loss": 1.5286, + "step": 8796 + }, + { + "epoch": 0.913784148748312, + "grad_norm": 0.39761316776275635, + "learning_rate": 1.8228766987263334e-06, + "loss": 1.613, + "step": 8797 + }, + { + "epoch": 0.9138880232678924, + "grad_norm": 0.46875429153442383, + "learning_rate": 1.8185136765306643e-06, + "loss": 1.6326, + "step": 8798 + }, + { + "epoch": 0.9139918977874727, + "grad_norm": 0.4058031737804413, + "learning_rate": 1.8141557852874634e-06, + "loss": 1.4985, + "step": 8799 + }, + { + "epoch": 0.9140957723070531, + "grad_norm": 0.4153459668159485, + "learning_rate": 1.8098030254608044e-06, + "loss": 1.6341, + "step": 8800 + }, + { + "epoch": 0.9141996468266335, + "grad_norm": 0.4445685148239136, + "learning_rate": 1.8054553975142273e-06, + "loss": 1.8224, + "step": 8801 + }, + { + "epoch": 0.9143035213462137, + "grad_norm": 0.4452269375324249, + "learning_rate": 1.8011129019107176e-06, + "loss": 1.7055, + "step": 8802 + }, + { + "epoch": 0.9144073958657941, + "grad_norm": 0.4562229812145233, + "learning_rate": 1.7967755391127161e-06, + "loss": 1.828, + "step": 8803 + }, + { + "epoch": 0.9145112703853745, + "grad_norm": 0.3990088701248169, + "learning_rate": 1.7924433095821203e-06, + "loss": 1.6182, + "step": 8804 + }, + { + "epoch": 0.9146151449049548, + "grad_norm": 0.4023396074771881, + "learning_rate": 1.788116213780283e-06, + "loss": 1.7746, + "step": 8805 + }, + { + "epoch": 0.9147190194245352, + "grad_norm": 0.42868486046791077, + "learning_rate": 1.783794252168003e-06, + "loss": 1.7353, + "step": 8806 + }, + { + "epoch": 0.9148228939441155, + "grad_norm": 0.43141698837280273, + "learning_rate": 1.7794774252055224e-06, + "loss": 1.6535, + "step": 8807 + }, + { + "epoch": 0.9149267684636958, + "grad_norm": 0.4295443594455719, + "learning_rate": 1.775165733352574e-06, + "loss": 1.6215, + "step": 8808 + }, + { + "epoch": 0.9150306429832762, + "grad_norm": 0.4191311299800873, + "learning_rate": 1.770859177068296e-06, + "loss": 1.6455, + "step": 8809 + }, + { + "epoch": 0.9151345175028566, + "grad_norm": 0.38304373621940613, + "learning_rate": 1.7665577568113212e-06, + "loss": 1.6085, + "step": 8810 + }, + { + "epoch": 0.9152383920224368, + "grad_norm": 0.4099152684211731, + "learning_rate": 1.7622614730397114e-06, + "loss": 1.7182, + "step": 8811 + }, + { + "epoch": 0.9153422665420172, + "grad_norm": 0.397775262594223, + "learning_rate": 1.7579703262109837e-06, + "loss": 1.5084, + "step": 8812 + }, + { + "epoch": 0.9154461410615976, + "grad_norm": 0.4640827476978302, + "learning_rate": 1.7536843167821116e-06, + "loss": 1.8689, + "step": 8813 + }, + { + "epoch": 0.915550015581178, + "grad_norm": 0.39592549204826355, + "learning_rate": 1.7494034452095242e-06, + "loss": 1.5368, + "step": 8814 + }, + { + "epoch": 0.9156538901007583, + "grad_norm": 0.4693751633167267, + "learning_rate": 1.7451277119491073e-06, + "loss": 1.806, + "step": 8815 + }, + { + "epoch": 0.9157577646203386, + "grad_norm": 0.45381423830986023, + "learning_rate": 1.740857117456174e-06, + "loss": 1.6147, + "step": 8816 + }, + { + "epoch": 0.915861639139919, + "grad_norm": 0.40232381224632263, + "learning_rate": 1.7365916621855327e-06, + "loss": 1.5101, + "step": 8817 + }, + { + "epoch": 0.9159655136594993, + "grad_norm": 0.4090515971183777, + "learning_rate": 1.7323313465914093e-06, + "loss": 1.6499, + "step": 8818 + }, + { + "epoch": 0.9160693881790797, + "grad_norm": 0.3945220410823822, + "learning_rate": 1.7280761711274906e-06, + "loss": 1.5388, + "step": 8819 + }, + { + "epoch": 0.9161732626986601, + "grad_norm": 0.39310646057128906, + "learning_rate": 1.7238261362469254e-06, + "loss": 1.5099, + "step": 8820 + }, + { + "epoch": 0.9162771372182403, + "grad_norm": 0.4310518205165863, + "learning_rate": 1.7195812424023129e-06, + "loss": 1.4882, + "step": 8821 + }, + { + "epoch": 0.9163810117378207, + "grad_norm": 0.4244692921638489, + "learning_rate": 1.715341490045691e-06, + "loss": 1.6063, + "step": 8822 + }, + { + "epoch": 0.9164848862574011, + "grad_norm": 0.4561496675014496, + "learning_rate": 1.7111068796285657e-06, + "loss": 1.7671, + "step": 8823 + }, + { + "epoch": 0.9165887607769814, + "grad_norm": 0.4400840103626251, + "learning_rate": 1.7068774116018983e-06, + "loss": 1.6152, + "step": 8824 + }, + { + "epoch": 0.9166926352965618, + "grad_norm": 0.46823105216026306, + "learning_rate": 1.7026530864160728e-06, + "loss": 1.5364, + "step": 8825 + }, + { + "epoch": 0.9167965098161421, + "grad_norm": 0.43917423486709595, + "learning_rate": 1.698433904520963e-06, + "loss": 1.5236, + "step": 8826 + }, + { + "epoch": 0.9169003843357224, + "grad_norm": 0.4674359858036041, + "learning_rate": 1.6942198663658815e-06, + "loss": 1.767, + "step": 8827 + }, + { + "epoch": 0.9170042588553028, + "grad_norm": 0.39382535219192505, + "learning_rate": 1.6900109723995805e-06, + "loss": 1.6415, + "step": 8828 + }, + { + "epoch": 0.9171081333748832, + "grad_norm": 0.4580112397670746, + "learning_rate": 1.6858072230702792e-06, + "loss": 1.6657, + "step": 8829 + }, + { + "epoch": 0.9172120078944634, + "grad_norm": 0.4108462929725647, + "learning_rate": 1.6816086188256475e-06, + "loss": 1.6511, + "step": 8830 + }, + { + "epoch": 0.9173158824140438, + "grad_norm": 0.4109908938407898, + "learning_rate": 1.677415160112794e-06, + "loss": 1.5784, + "step": 8831 + }, + { + "epoch": 0.9174197569336242, + "grad_norm": 0.4298510253429413, + "learning_rate": 1.6732268473782953e-06, + "loss": 1.6897, + "step": 8832 + }, + { + "epoch": 0.9175236314532045, + "grad_norm": 0.42794668674468994, + "learning_rate": 1.6690436810681832e-06, + "loss": 1.6477, + "step": 8833 + }, + { + "epoch": 0.9176275059727849, + "grad_norm": 0.42130839824676514, + "learning_rate": 1.664865661627918e-06, + "loss": 1.6891, + "step": 8834 + }, + { + "epoch": 0.9177313804923652, + "grad_norm": 0.4458337128162384, + "learning_rate": 1.6606927895024217e-06, + "loss": 1.5716, + "step": 8835 + }, + { + "epoch": 0.9178352550119455, + "grad_norm": 0.4314666986465454, + "learning_rate": 1.6565250651360997e-06, + "loss": 1.6978, + "step": 8836 + }, + { + "epoch": 0.9179391295315259, + "grad_norm": 0.47084978222846985, + "learning_rate": 1.6523624889727474e-06, + "loss": 1.7084, + "step": 8837 + }, + { + "epoch": 0.9180430040511063, + "grad_norm": 0.42452502250671387, + "learning_rate": 1.6482050614556709e-06, + "loss": 1.4483, + "step": 8838 + }, + { + "epoch": 0.9181468785706867, + "grad_norm": 0.4466843008995056, + "learning_rate": 1.6440527830275997e-06, + "loss": 1.7525, + "step": 8839 + }, + { + "epoch": 0.9182507530902669, + "grad_norm": 0.3827952444553375, + "learning_rate": 1.6399056541307134e-06, + "loss": 1.5032, + "step": 8840 + }, + { + "epoch": 0.9183546276098473, + "grad_norm": 0.46445831656455994, + "learning_rate": 1.6357636752066529e-06, + "loss": 1.6178, + "step": 8841 + }, + { + "epoch": 0.9184585021294277, + "grad_norm": 0.4010038375854492, + "learning_rate": 1.6316268466965046e-06, + "loss": 1.6032, + "step": 8842 + }, + { + "epoch": 0.918562376649008, + "grad_norm": 0.42369773983955383, + "learning_rate": 1.6274951690408158e-06, + "loss": 1.7141, + "step": 8843 + }, + { + "epoch": 0.9186662511685884, + "grad_norm": 0.4194977879524231, + "learning_rate": 1.623368642679557e-06, + "loss": 1.6226, + "step": 8844 + }, + { + "epoch": 0.9187701256881687, + "grad_norm": 0.5032253861427307, + "learning_rate": 1.6192472680521987e-06, + "loss": 1.9285, + "step": 8845 + }, + { + "epoch": 0.918874000207749, + "grad_norm": 0.4604244530200958, + "learning_rate": 1.6151310455976176e-06, + "loss": 1.7397, + "step": 8846 + }, + { + "epoch": 0.9189778747273294, + "grad_norm": 0.4401777386665344, + "learning_rate": 1.6110199757541632e-06, + "loss": 1.5453, + "step": 8847 + }, + { + "epoch": 0.9190817492469098, + "grad_norm": 0.48724883794784546, + "learning_rate": 1.6069140589596299e-06, + "loss": 1.6, + "step": 8848 + }, + { + "epoch": 0.91918562376649, + "grad_norm": 0.4106622338294983, + "learning_rate": 1.6028132956512732e-06, + "loss": 1.6392, + "step": 8849 + }, + { + "epoch": 0.9192894982860704, + "grad_norm": 0.4616900086402893, + "learning_rate": 1.5987176862657882e-06, + "loss": 1.6164, + "step": 8850 + }, + { + "epoch": 0.9193933728056508, + "grad_norm": 0.4380607604980469, + "learning_rate": 1.5946272312393206e-06, + "loss": 1.5529, + "step": 8851 + }, + { + "epoch": 0.9194972473252311, + "grad_norm": 0.42449167370796204, + "learning_rate": 1.5905419310074831e-06, + "loss": 1.7714, + "step": 8852 + }, + { + "epoch": 0.9196011218448115, + "grad_norm": 0.4334729015827179, + "learning_rate": 1.586461786005311e-06, + "loss": 1.703, + "step": 8853 + }, + { + "epoch": 0.9197049963643918, + "grad_norm": 0.4081462621688843, + "learning_rate": 1.5823867966673288e-06, + "loss": 1.6907, + "step": 8854 + }, + { + "epoch": 0.9198088708839721, + "grad_norm": 0.4235094487667084, + "learning_rate": 1.5783169634274787e-06, + "loss": 1.7441, + "step": 8855 + }, + { + "epoch": 0.9199127454035525, + "grad_norm": 0.4290684759616852, + "learning_rate": 1.574252286719169e-06, + "loss": 1.5403, + "step": 8856 + }, + { + "epoch": 0.9200166199231329, + "grad_norm": 0.40172362327575684, + "learning_rate": 1.5701927669752537e-06, + "loss": 1.4616, + "step": 8857 + }, + { + "epoch": 0.9201204944427132, + "grad_norm": 0.4437967538833618, + "learning_rate": 1.566138404628037e-06, + "loss": 1.6938, + "step": 8858 + }, + { + "epoch": 0.9202243689622935, + "grad_norm": 0.42959895730018616, + "learning_rate": 1.5620892001092902e-06, + "loss": 1.5245, + "step": 8859 + }, + { + "epoch": 0.9203282434818739, + "grad_norm": 0.4243561923503876, + "learning_rate": 1.5580451538502016e-06, + "loss": 1.6801, + "step": 8860 + }, + { + "epoch": 0.9204321180014542, + "grad_norm": 0.5258318781852722, + "learning_rate": 1.5540062662814603e-06, + "loss": 1.8436, + "step": 8861 + }, + { + "epoch": 0.9205359925210346, + "grad_norm": 0.4423414468765259, + "learning_rate": 1.5499725378331441e-06, + "loss": 1.6583, + "step": 8862 + }, + { + "epoch": 0.920639867040615, + "grad_norm": 0.4549945592880249, + "learning_rate": 1.5459439689348266e-06, + "loss": 1.6852, + "step": 8863 + }, + { + "epoch": 0.9207437415601953, + "grad_norm": 0.43814900517463684, + "learning_rate": 1.5419205600155306e-06, + "loss": 1.6821, + "step": 8864 + }, + { + "epoch": 0.9208476160797756, + "grad_norm": 0.47776687145233154, + "learning_rate": 1.5379023115037027e-06, + "loss": 1.8725, + "step": 8865 + }, + { + "epoch": 0.920951490599356, + "grad_norm": 0.422610342502594, + "learning_rate": 1.5338892238272562e-06, + "loss": 1.6768, + "step": 8866 + }, + { + "epoch": 0.9210553651189364, + "grad_norm": 0.46566253900527954, + "learning_rate": 1.5298812974135602e-06, + "loss": 1.7364, + "step": 8867 + }, + { + "epoch": 0.9211592396385166, + "grad_norm": 0.41145744919776917, + "learning_rate": 1.5258785326894286e-06, + "loss": 1.7075, + "step": 8868 + }, + { + "epoch": 0.921263114158097, + "grad_norm": 0.4216400980949402, + "learning_rate": 1.5218809300811155e-06, + "loss": 1.7723, + "step": 8869 + }, + { + "epoch": 0.9213669886776774, + "grad_norm": 0.4355829656124115, + "learning_rate": 1.5178884900143353e-06, + "loss": 1.6459, + "step": 8870 + }, + { + "epoch": 0.9214708631972577, + "grad_norm": 0.4165591597557068, + "learning_rate": 1.5139012129142704e-06, + "loss": 1.5313, + "step": 8871 + }, + { + "epoch": 0.9215747377168381, + "grad_norm": 0.44293418526649475, + "learning_rate": 1.5099190992055034e-06, + "loss": 1.7129, + "step": 8872 + }, + { + "epoch": 0.9216786122364184, + "grad_norm": 0.45215722918510437, + "learning_rate": 1.505942149312123e-06, + "loss": 1.4032, + "step": 8873 + }, + { + "epoch": 0.9217824867559987, + "grad_norm": 0.4243696630001068, + "learning_rate": 1.5019703636576344e-06, + "loss": 1.6315, + "step": 8874 + }, + { + "epoch": 0.9218863612755791, + "grad_norm": 0.4237101078033447, + "learning_rate": 1.498003742665005e-06, + "loss": 1.6485, + "step": 8875 + }, + { + "epoch": 0.9219902357951595, + "grad_norm": 0.4091566801071167, + "learning_rate": 1.4940422867566472e-06, + "loss": 1.671, + "step": 8876 + }, + { + "epoch": 0.9220941103147398, + "grad_norm": 0.44032323360443115, + "learning_rate": 1.4900859963544233e-06, + "loss": 1.581, + "step": 8877 + }, + { + "epoch": 0.9221979848343201, + "grad_norm": 0.41749629378318787, + "learning_rate": 1.486134871879652e-06, + "loss": 1.7884, + "step": 8878 + }, + { + "epoch": 0.9223018593539005, + "grad_norm": 0.447294682264328, + "learning_rate": 1.4821889137530852e-06, + "loss": 1.7156, + "step": 8879 + }, + { + "epoch": 0.9224057338734808, + "grad_norm": 0.45944663882255554, + "learning_rate": 1.4782481223949596e-06, + "loss": 1.8423, + "step": 8880 + }, + { + "epoch": 0.9225096083930612, + "grad_norm": 0.39821431040763855, + "learning_rate": 1.4743124982249112e-06, + "loss": 1.5655, + "step": 8881 + }, + { + "epoch": 0.9226134829126416, + "grad_norm": 0.48018068075180054, + "learning_rate": 1.4703820416620718e-06, + "loss": 1.6241, + "step": 8882 + }, + { + "epoch": 0.9227173574322218, + "grad_norm": 0.3963991701602936, + "learning_rate": 1.4664567531250007e-06, + "loss": 1.5582, + "step": 8883 + }, + { + "epoch": 0.9228212319518022, + "grad_norm": 0.405376672744751, + "learning_rate": 1.462536633031708e-06, + "loss": 1.6288, + "step": 8884 + }, + { + "epoch": 0.9229251064713826, + "grad_norm": 0.4346674382686615, + "learning_rate": 1.4586216817996602e-06, + "loss": 1.6538, + "step": 8885 + }, + { + "epoch": 0.9230289809909629, + "grad_norm": 0.4289572536945343, + "learning_rate": 1.454711899845762e-06, + "loss": 1.8293, + "step": 8886 + }, + { + "epoch": 0.9231328555105432, + "grad_norm": 0.5129453539848328, + "learning_rate": 1.450807287586381e-06, + "loss": 1.8293, + "step": 8887 + }, + { + "epoch": 0.9232367300301236, + "grad_norm": 0.41872215270996094, + "learning_rate": 1.4469078454373174e-06, + "loss": 1.6358, + "step": 8888 + }, + { + "epoch": 0.9233406045497039, + "grad_norm": 0.4631272256374359, + "learning_rate": 1.4430135738138561e-06, + "loss": 1.824, + "step": 8889 + }, + { + "epoch": 0.9234444790692843, + "grad_norm": 0.41717085242271423, + "learning_rate": 1.4391244731306763e-06, + "loss": 1.6387, + "step": 8890 + }, + { + "epoch": 0.9235483535888647, + "grad_norm": 0.4447776675224304, + "learning_rate": 1.4352405438019522e-06, + "loss": 1.7798, + "step": 8891 + }, + { + "epoch": 0.923652228108445, + "grad_norm": 0.48640677332878113, + "learning_rate": 1.431361786241292e-06, + "loss": 1.8333, + "step": 8892 + }, + { + "epoch": 0.9237561026280253, + "grad_norm": 0.4646976590156555, + "learning_rate": 1.427488200861754e-06, + "loss": 1.6033, + "step": 8893 + }, + { + "epoch": 0.9238599771476057, + "grad_norm": 0.39103513956069946, + "learning_rate": 1.4236197880758416e-06, + "loss": 1.5394, + "step": 8894 + }, + { + "epoch": 0.9239638516671861, + "grad_norm": 0.49602001905441284, + "learning_rate": 1.4197565482955144e-06, + "loss": 1.8753, + "step": 8895 + }, + { + "epoch": 0.9240677261867664, + "grad_norm": 0.3964075744152069, + "learning_rate": 1.4158984819321707e-06, + "loss": 1.5401, + "step": 8896 + }, + { + "epoch": 0.9241716007063467, + "grad_norm": 0.4444807171821594, + "learning_rate": 1.4120455893966711e-06, + "loss": 1.755, + "step": 8897 + }, + { + "epoch": 0.9242754752259271, + "grad_norm": 0.48171067237854004, + "learning_rate": 1.4081978710993092e-06, + "loss": 1.7385, + "step": 8898 + }, + { + "epoch": 0.9243793497455074, + "grad_norm": 0.41400474309921265, + "learning_rate": 1.404355327449852e-06, + "loss": 1.6438, + "step": 8899 + }, + { + "epoch": 0.9244832242650878, + "grad_norm": 0.4013029932975769, + "learning_rate": 1.4005179588574835e-06, + "loss": 1.3095, + "step": 8900 + }, + { + "epoch": 0.9245870987846682, + "grad_norm": 0.41404280066490173, + "learning_rate": 1.3966857657308652e-06, + "loss": 1.6767, + "step": 8901 + }, + { + "epoch": 0.9246909733042484, + "grad_norm": 0.4295409619808197, + "learning_rate": 1.392858748478093e-06, + "loss": 1.5915, + "step": 8902 + }, + { + "epoch": 0.9247948478238288, + "grad_norm": 0.4575590491294861, + "learning_rate": 1.3890369075067134e-06, + "loss": 1.6812, + "step": 8903 + }, + { + "epoch": 0.9248987223434092, + "grad_norm": 0.42268791794776917, + "learning_rate": 1.3852202432237227e-06, + "loss": 1.6996, + "step": 8904 + }, + { + "epoch": 0.9250025968629895, + "grad_norm": 0.4151066839694977, + "learning_rate": 1.3814087560355682e-06, + "loss": 1.5893, + "step": 8905 + }, + { + "epoch": 0.9251064713825699, + "grad_norm": 0.40030190348625183, + "learning_rate": 1.377602446348142e-06, + "loss": 1.7249, + "step": 8906 + }, + { + "epoch": 0.9252103459021502, + "grad_norm": 0.38971251249313354, + "learning_rate": 1.3738013145667804e-06, + "loss": 1.5488, + "step": 8907 + }, + { + "epoch": 0.9253142204217305, + "grad_norm": 0.39237236976623535, + "learning_rate": 1.3700053610962882e-06, + "loss": 1.5541, + "step": 8908 + }, + { + "epoch": 0.9254180949413109, + "grad_norm": 0.4266391098499298, + "learning_rate": 1.366214586340886e-06, + "loss": 1.7434, + "step": 8909 + }, + { + "epoch": 0.9255219694608913, + "grad_norm": 0.42102593183517456, + "learning_rate": 1.3624289907042787e-06, + "loss": 1.7068, + "step": 8910 + }, + { + "epoch": 0.9256258439804715, + "grad_norm": 0.42698073387145996, + "learning_rate": 1.3586485745895993e-06, + "loss": 1.7564, + "step": 8911 + }, + { + "epoch": 0.9257297185000519, + "grad_norm": 0.43909063935279846, + "learning_rate": 1.354873338399426e-06, + "loss": 1.6685, + "step": 8912 + }, + { + "epoch": 0.9258335930196323, + "grad_norm": 0.4355563819408417, + "learning_rate": 1.3511032825357982e-06, + "loss": 1.4694, + "step": 8913 + }, + { + "epoch": 0.9259374675392126, + "grad_norm": 0.4620991349220276, + "learning_rate": 1.3473384074001893e-06, + "loss": 1.5793, + "step": 8914 + }, + { + "epoch": 0.926041342058793, + "grad_norm": 0.4601285457611084, + "learning_rate": 1.3435787133935397e-06, + "loss": 1.8184, + "step": 8915 + }, + { + "epoch": 0.9261452165783733, + "grad_norm": 0.43226033449172974, + "learning_rate": 1.3398242009162177e-06, + "loss": 1.795, + "step": 8916 + }, + { + "epoch": 0.9262490910979537, + "grad_norm": 0.4279690682888031, + "learning_rate": 1.3360748703680537e-06, + "loss": 1.6734, + "step": 8917 + }, + { + "epoch": 0.926352965617534, + "grad_norm": 0.42362403869628906, + "learning_rate": 1.3323307221483339e-06, + "loss": 1.5626, + "step": 8918 + }, + { + "epoch": 0.9264568401371144, + "grad_norm": 0.4446941614151001, + "learning_rate": 1.328591756655756e-06, + "loss": 1.6638, + "step": 8919 + }, + { + "epoch": 0.9265607146566948, + "grad_norm": 0.4361754059791565, + "learning_rate": 1.3248579742885126e-06, + "loss": 1.552, + "step": 8920 + }, + { + "epoch": 0.926664589176275, + "grad_norm": 0.4090036451816559, + "learning_rate": 1.3211293754442134e-06, + "loss": 1.6251, + "step": 8921 + }, + { + "epoch": 0.9267684636958554, + "grad_norm": 0.4072844088077545, + "learning_rate": 1.317405960519924e-06, + "loss": 1.6939, + "step": 8922 + }, + { + "epoch": 0.9268723382154358, + "grad_norm": 0.4060845673084259, + "learning_rate": 1.3136877299121608e-06, + "loss": 1.6519, + "step": 8923 + }, + { + "epoch": 0.9269762127350161, + "grad_norm": 0.4271293878555298, + "learning_rate": 1.3099746840168903e-06, + "loss": 1.5427, + "step": 8924 + }, + { + "epoch": 0.9270800872545965, + "grad_norm": 0.3999338746070862, + "learning_rate": 1.3062668232295128e-06, + "loss": 1.5207, + "step": 8925 + }, + { + "epoch": 0.9271839617741768, + "grad_norm": 0.47124266624450684, + "learning_rate": 1.3025641479448958e-06, + "loss": 1.8273, + "step": 8926 + }, + { + "epoch": 0.9272878362937571, + "grad_norm": 0.4004976451396942, + "learning_rate": 1.2988666585573461e-06, + "loss": 1.4928, + "step": 8927 + }, + { + "epoch": 0.9273917108133375, + "grad_norm": 0.4213370382785797, + "learning_rate": 1.2951743554606044e-06, + "loss": 1.6839, + "step": 8928 + }, + { + "epoch": 0.9274955853329179, + "grad_norm": 0.43283870816230774, + "learning_rate": 1.2914872390478894e-06, + "loss": 1.7069, + "step": 8929 + }, + { + "epoch": 0.9275994598524981, + "grad_norm": 0.3999961316585541, + "learning_rate": 1.2878053097118425e-06, + "loss": 1.6181, + "step": 8930 + }, + { + "epoch": 0.9277033343720785, + "grad_norm": 0.44577065110206604, + "learning_rate": 1.2841285678445614e-06, + "loss": 1.599, + "step": 8931 + }, + { + "epoch": 0.9278072088916589, + "grad_norm": 0.40648534893989563, + "learning_rate": 1.2804570138375827e-06, + "loss": 1.6496, + "step": 8932 + }, + { + "epoch": 0.9279110834112392, + "grad_norm": 0.43543723225593567, + "learning_rate": 1.2767906480819104e-06, + "loss": 1.7878, + "step": 8933 + }, + { + "epoch": 0.9280149579308196, + "grad_norm": 0.39712443947792053, + "learning_rate": 1.273129470967982e-06, + "loss": 1.6478, + "step": 8934 + }, + { + "epoch": 0.9281188324503999, + "grad_norm": 0.43775272369384766, + "learning_rate": 1.2694734828856691e-06, + "loss": 1.6999, + "step": 8935 + }, + { + "epoch": 0.9282227069699802, + "grad_norm": 0.46979156136512756, + "learning_rate": 1.265822684224327e-06, + "loss": 1.5892, + "step": 8936 + }, + { + "epoch": 0.9283265814895606, + "grad_norm": 0.44931840896606445, + "learning_rate": 1.2621770753727224e-06, + "loss": 1.661, + "step": 8937 + }, + { + "epoch": 0.928430456009141, + "grad_norm": 0.441062867641449, + "learning_rate": 1.2585366567190949e-06, + "loss": 1.7309, + "step": 8938 + }, + { + "epoch": 0.9285343305287213, + "grad_norm": 0.45350944995880127, + "learning_rate": 1.2549014286511118e-06, + "loss": 1.5562, + "step": 8939 + }, + { + "epoch": 0.9286382050483016, + "grad_norm": 0.4538798928260803, + "learning_rate": 1.2512713915559027e-06, + "loss": 1.5712, + "step": 8940 + }, + { + "epoch": 0.928742079567882, + "grad_norm": 0.43491330742836, + "learning_rate": 1.247646545820036e-06, + "loss": 1.6381, + "step": 8941 + }, + { + "epoch": 0.9288459540874624, + "grad_norm": 0.417327880859375, + "learning_rate": 1.2440268918295194e-06, + "loss": 1.71, + "step": 8942 + }, + { + "epoch": 0.9289498286070427, + "grad_norm": 0.43641364574432373, + "learning_rate": 1.240412429969845e-06, + "loss": 1.7413, + "step": 8943 + }, + { + "epoch": 0.929053703126623, + "grad_norm": 0.4823327362537384, + "learning_rate": 1.2368031606258934e-06, + "loss": 1.8444, + "step": 8944 + }, + { + "epoch": 0.9291575776462034, + "grad_norm": 0.4450342655181885, + "learning_rate": 1.2331990841820407e-06, + "loss": 1.8187, + "step": 8945 + }, + { + "epoch": 0.9292614521657837, + "grad_norm": 0.4293372631072998, + "learning_rate": 1.2296002010220964e-06, + "loss": 1.6616, + "step": 8946 + }, + { + "epoch": 0.9293653266853641, + "grad_norm": 0.4217541217803955, + "learning_rate": 1.2260065115292984e-06, + "loss": 1.6949, + "step": 8947 + }, + { + "epoch": 0.9294692012049445, + "grad_norm": 0.4160800278186798, + "learning_rate": 1.2224180160863574e-06, + "loss": 1.535, + "step": 8948 + }, + { + "epoch": 0.9295730757245247, + "grad_norm": 0.42056944966316223, + "learning_rate": 1.2188347150754231e-06, + "loss": 1.7167, + "step": 8949 + }, + { + "epoch": 0.9296769502441051, + "grad_norm": 0.4336167871952057, + "learning_rate": 1.2152566088780792e-06, + "loss": 1.5799, + "step": 8950 + }, + { + "epoch": 0.9297808247636855, + "grad_norm": 0.43166765570640564, + "learning_rate": 1.2116836978753709e-06, + "loss": 1.603, + "step": 8951 + }, + { + "epoch": 0.9298846992832658, + "grad_norm": 0.4209384620189667, + "learning_rate": 1.2081159824477883e-06, + "loss": 1.5984, + "step": 8952 + }, + { + "epoch": 0.9299885738028462, + "grad_norm": 0.42866531014442444, + "learning_rate": 1.2045534629752609e-06, + "loss": 1.6496, + "step": 8953 + }, + { + "epoch": 0.9300924483224265, + "grad_norm": 0.44947803020477295, + "learning_rate": 1.2009961398371628e-06, + "loss": 1.4944, + "step": 8954 + }, + { + "epoch": 0.9301963228420068, + "grad_norm": 0.4815939962863922, + "learning_rate": 1.197444013412341e-06, + "loss": 1.5212, + "step": 8955 + }, + { + "epoch": 0.9303001973615872, + "grad_norm": 0.5035466551780701, + "learning_rate": 1.1938970840790486e-06, + "loss": 1.6037, + "step": 8956 + }, + { + "epoch": 0.9304040718811676, + "grad_norm": 0.43349477648735046, + "learning_rate": 1.1903553522150169e-06, + "loss": 1.7366, + "step": 8957 + }, + { + "epoch": 0.9305079464007479, + "grad_norm": 0.4126850366592407, + "learning_rate": 1.186818818197416e-06, + "loss": 1.5468, + "step": 8958 + }, + { + "epoch": 0.9306118209203282, + "grad_norm": 0.4104941189289093, + "learning_rate": 1.1832874824028451e-06, + "loss": 1.5958, + "step": 8959 + }, + { + "epoch": 0.9307156954399086, + "grad_norm": 0.422209769487381, + "learning_rate": 1.1797613452073808e-06, + "loss": 1.5762, + "step": 8960 + }, + { + "epoch": 0.9308195699594889, + "grad_norm": 0.4042091965675354, + "learning_rate": 1.1762404069865174e-06, + "loss": 1.6074, + "step": 8961 + }, + { + "epoch": 0.9309234444790693, + "grad_norm": 0.4372323751449585, + "learning_rate": 1.1727246681152049e-06, + "loss": 1.7324, + "step": 8962 + }, + { + "epoch": 0.9310273189986497, + "grad_norm": 0.46790817379951477, + "learning_rate": 1.1692141289678492e-06, + "loss": 1.6437, + "step": 8963 + }, + { + "epoch": 0.9311311935182299, + "grad_norm": 0.4523914158344269, + "learning_rate": 1.1657087899183016e-06, + "loss": 1.5817, + "step": 8964 + }, + { + "epoch": 0.9312350680378103, + "grad_norm": 0.4646068215370178, + "learning_rate": 1.162208651339841e-06, + "loss": 1.7894, + "step": 8965 + }, + { + "epoch": 0.9313389425573907, + "grad_norm": 0.4205915331840515, + "learning_rate": 1.1587137136052083e-06, + "loss": 1.5805, + "step": 8966 + }, + { + "epoch": 0.931442817076971, + "grad_norm": 0.40744760632514954, + "learning_rate": 1.1552239770865892e-06, + "loss": 1.6191, + "step": 8967 + }, + { + "epoch": 0.9315466915965513, + "grad_norm": 0.43079641461372375, + "learning_rate": 1.1517394421556138e-06, + "loss": 1.7234, + "step": 8968 + }, + { + "epoch": 0.9316505661161317, + "grad_norm": 0.4436144232749939, + "learning_rate": 1.1482601091833578e-06, + "loss": 1.6766, + "step": 8969 + }, + { + "epoch": 0.9317544406357121, + "grad_norm": 0.43747812509536743, + "learning_rate": 1.1447859785403359e-06, + "loss": 1.7669, + "step": 8970 + }, + { + "epoch": 0.9318583151552924, + "grad_norm": 0.4351133108139038, + "learning_rate": 1.1413170505965354e-06, + "loss": 1.7189, + "step": 8971 + }, + { + "epoch": 0.9319621896748728, + "grad_norm": 0.4412538409233093, + "learning_rate": 1.1378533257213387e-06, + "loss": 1.8092, + "step": 8972 + }, + { + "epoch": 0.9320660641944531, + "grad_norm": 0.4323018789291382, + "learning_rate": 1.134394804283634e-06, + "loss": 1.8075, + "step": 8973 + }, + { + "epoch": 0.9321699387140334, + "grad_norm": 0.47585269808769226, + "learning_rate": 1.1309414866517209e-06, + "loss": 1.7678, + "step": 8974 + }, + { + "epoch": 0.9322738132336138, + "grad_norm": 0.5131593942642212, + "learning_rate": 1.127493373193339e-06, + "loss": 1.6321, + "step": 8975 + }, + { + "epoch": 0.9323776877531942, + "grad_norm": 0.4566580057144165, + "learning_rate": 1.1240504642756945e-06, + "loss": 1.7411, + "step": 8976 + }, + { + "epoch": 0.9324815622727745, + "grad_norm": 0.4639977514743805, + "learning_rate": 1.1206127602654271e-06, + "loss": 1.9319, + "step": 8977 + }, + { + "epoch": 0.9325854367923548, + "grad_norm": 0.4284569323062897, + "learning_rate": 1.1171802615286275e-06, + "loss": 1.6936, + "step": 8978 + }, + { + "epoch": 0.9326893113119352, + "grad_norm": 0.46785593032836914, + "learning_rate": 1.1137529684308311e-06, + "loss": 1.7217, + "step": 8979 + }, + { + "epoch": 0.9327931858315155, + "grad_norm": 0.4252128303050995, + "learning_rate": 1.110330881337013e-06, + "loss": 1.6731, + "step": 8980 + }, + { + "epoch": 0.9328970603510959, + "grad_norm": 0.4072088599205017, + "learning_rate": 1.106914000611603e-06, + "loss": 1.5772, + "step": 8981 + }, + { + "epoch": 0.9330009348706763, + "grad_norm": 0.4445166289806366, + "learning_rate": 1.1035023266184663e-06, + "loss": 1.6541, + "step": 8982 + }, + { + "epoch": 0.9331048093902565, + "grad_norm": 0.42075401544570923, + "learning_rate": 1.1000958597209343e-06, + "loss": 1.6717, + "step": 8983 + }, + { + "epoch": 0.9332086839098369, + "grad_norm": 0.4416648745536804, + "learning_rate": 1.0966946002817557e-06, + "loss": 1.8383, + "step": 8984 + }, + { + "epoch": 0.9333125584294173, + "grad_norm": 0.4478912055492401, + "learning_rate": 1.0932985486631408e-06, + "loss": 1.6386, + "step": 8985 + }, + { + "epoch": 0.9334164329489976, + "grad_norm": 0.4134404957294464, + "learning_rate": 1.0899077052267393e-06, + "loss": 1.6147, + "step": 8986 + }, + { + "epoch": 0.933520307468578, + "grad_norm": 0.38977158069610596, + "learning_rate": 1.086522070333662e-06, + "loss": 1.6074, + "step": 8987 + }, + { + "epoch": 0.9336241819881583, + "grad_norm": 0.4343447685241699, + "learning_rate": 1.0831416443444376e-06, + "loss": 1.7838, + "step": 8988 + }, + { + "epoch": 0.9337280565077386, + "grad_norm": 0.42949017882347107, + "learning_rate": 1.0797664276190666e-06, + "loss": 1.5811, + "step": 8989 + }, + { + "epoch": 0.933831931027319, + "grad_norm": 0.4194577634334564, + "learning_rate": 1.0763964205169786e-06, + "loss": 1.7476, + "step": 8990 + }, + { + "epoch": 0.9339358055468994, + "grad_norm": 0.42771607637405396, + "learning_rate": 1.0730316233970473e-06, + "loss": 1.7751, + "step": 8991 + }, + { + "epoch": 0.9340396800664796, + "grad_norm": 0.4049451947212219, + "learning_rate": 1.0696720366176138e-06, + "loss": 1.5934, + "step": 8992 + }, + { + "epoch": 0.93414355458606, + "grad_norm": 0.4572373628616333, + "learning_rate": 1.0663176605364255e-06, + "loss": 1.7941, + "step": 8993 + }, + { + "epoch": 0.9342474291056404, + "grad_norm": 0.3885292708873749, + "learning_rate": 1.0629684955107132e-06, + "loss": 1.4659, + "step": 8994 + }, + { + "epoch": 0.9343513036252208, + "grad_norm": 0.38825228810310364, + "learning_rate": 1.0596245418971307e-06, + "loss": 1.5177, + "step": 8995 + }, + { + "epoch": 0.9344551781448011, + "grad_norm": 0.47034990787506104, + "learning_rate": 1.0562858000517928e-06, + "loss": 1.774, + "step": 8996 + }, + { + "epoch": 0.9345590526643814, + "grad_norm": 0.5254196524620056, + "learning_rate": 1.0529522703302319e-06, + "loss": 1.578, + "step": 8997 + }, + { + "epoch": 0.9346629271839618, + "grad_norm": 0.4228893518447876, + "learning_rate": 1.049623953087453e-06, + "loss": 1.6469, + "step": 8998 + }, + { + "epoch": 0.9347668017035421, + "grad_norm": 0.4083818197250366, + "learning_rate": 1.0463008486779058e-06, + "loss": 1.5999, + "step": 8999 + }, + { + "epoch": 0.9348706762231225, + "grad_norm": 0.417376846075058, + "learning_rate": 1.0429829574554574e-06, + "loss": 1.5968, + "step": 9000 + }, + { + "epoch": 0.9349745507427029, + "grad_norm": 0.4227767884731293, + "learning_rate": 1.0396702797734415e-06, + "loss": 1.5762, + "step": 9001 + }, + { + "epoch": 0.9350784252622831, + "grad_norm": 0.41051211953163147, + "learning_rate": 1.0363628159846483e-06, + "loss": 1.4743, + "step": 9002 + }, + { + "epoch": 0.9351822997818635, + "grad_norm": 0.41701540350914, + "learning_rate": 1.0330605664412684e-06, + "loss": 1.6244, + "step": 9003 + }, + { + "epoch": 0.9352861743014439, + "grad_norm": 0.44816792011260986, + "learning_rate": 1.0297635314949872e-06, + "loss": 1.8127, + "step": 9004 + }, + { + "epoch": 0.9353900488210242, + "grad_norm": 0.4178299307823181, + "learning_rate": 1.0264717114969125e-06, + "loss": 1.5446, + "step": 9005 + }, + { + "epoch": 0.9354939233406045, + "grad_norm": 0.4642837941646576, + "learning_rate": 1.023185106797586e-06, + "loss": 1.7789, + "step": 9006 + }, + { + "epoch": 0.9355977978601849, + "grad_norm": 0.4191594421863556, + "learning_rate": 1.0199037177470172e-06, + "loss": 1.7449, + "step": 9007 + }, + { + "epoch": 0.9357016723797652, + "grad_norm": 0.42831364274024963, + "learning_rate": 1.0166275446946372e-06, + "loss": 1.7044, + "step": 9008 + }, + { + "epoch": 0.9358055468993456, + "grad_norm": 0.44095903635025024, + "learning_rate": 1.0133565879893447e-06, + "loss": 1.7159, + "step": 9009 + }, + { + "epoch": 0.935909421418926, + "grad_norm": 0.4127524495124817, + "learning_rate": 1.0100908479794558e-06, + "loss": 1.5209, + "step": 9010 + }, + { + "epoch": 0.9360132959385062, + "grad_norm": 0.45353084802627563, + "learning_rate": 1.0068303250127697e-06, + "loss": 1.7395, + "step": 9011 + }, + { + "epoch": 0.9361171704580866, + "grad_norm": 0.4273471236228943, + "learning_rate": 1.0035750194364758e-06, + "loss": 1.6103, + "step": 9012 + }, + { + "epoch": 0.936221044977667, + "grad_norm": 0.5556546449661255, + "learning_rate": 1.0003249315972684e-06, + "loss": 1.7848, + "step": 9013 + }, + { + "epoch": 0.9363249194972473, + "grad_norm": 0.418753057718277, + "learning_rate": 9.97080061841238e-07, + "loss": 1.7764, + "step": 9014 + }, + { + "epoch": 0.9364287940168277, + "grad_norm": 0.40560612082481384, + "learning_rate": 9.938404105139465e-07, + "loss": 1.6056, + "step": 9015 + }, + { + "epoch": 0.936532668536408, + "grad_norm": 0.402055025100708, + "learning_rate": 9.906059779603849e-07, + "loss": 1.5732, + "step": 9016 + }, + { + "epoch": 0.9366365430559883, + "grad_norm": 0.4551316499710083, + "learning_rate": 9.873767645250055e-07, + "loss": 1.8166, + "step": 9017 + }, + { + "epoch": 0.9367404175755687, + "grad_norm": 0.4371841549873352, + "learning_rate": 9.84152770551683e-07, + "loss": 1.6245, + "step": 9018 + }, + { + "epoch": 0.9368442920951491, + "grad_norm": 0.43440502882003784, + "learning_rate": 9.809339963837426e-07, + "loss": 1.6387, + "step": 9019 + }, + { + "epoch": 0.9369481666147295, + "grad_norm": 0.41472846269607544, + "learning_rate": 9.77720442363983e-07, + "loss": 1.3704, + "step": 9020 + }, + { + "epoch": 0.9370520411343097, + "grad_norm": 0.437029629945755, + "learning_rate": 9.745121088346021e-07, + "loss": 1.7435, + "step": 9021 + }, + { + "epoch": 0.9371559156538901, + "grad_norm": 0.4492699205875397, + "learning_rate": 9.713089961372657e-07, + "loss": 1.8335, + "step": 9022 + }, + { + "epoch": 0.9372597901734705, + "grad_norm": 0.41893166303634644, + "learning_rate": 9.681111046130843e-07, + "loss": 1.6002, + "step": 9023 + }, + { + "epoch": 0.9373636646930508, + "grad_norm": 0.4353455603122711, + "learning_rate": 9.64918434602613e-07, + "loss": 1.7331, + "step": 9024 + }, + { + "epoch": 0.9374675392126312, + "grad_norm": 0.44769999384880066, + "learning_rate": 9.617309864458358e-07, + "loss": 1.7173, + "step": 9025 + }, + { + "epoch": 0.9375714137322115, + "grad_norm": 0.3922676146030426, + "learning_rate": 9.585487604821863e-07, + "loss": 1.5642, + "step": 9026 + }, + { + "epoch": 0.9376752882517918, + "grad_norm": 0.4413311779499054, + "learning_rate": 9.553717570505716e-07, + "loss": 1.6186, + "step": 9027 + }, + { + "epoch": 0.9377791627713722, + "grad_norm": 0.3945287764072418, + "learning_rate": 9.521999764892875e-07, + "loss": 1.5881, + "step": 9028 + }, + { + "epoch": 0.9378830372909526, + "grad_norm": 0.41674908995628357, + "learning_rate": 9.49033419136125e-07, + "loss": 1.5853, + "step": 9029 + }, + { + "epoch": 0.9379869118105328, + "grad_norm": 0.4065210521221161, + "learning_rate": 9.458720853282977e-07, + "loss": 1.5876, + "step": 9030 + }, + { + "epoch": 0.9380907863301132, + "grad_norm": 0.43645021319389343, + "learning_rate": 9.427159754024473e-07, + "loss": 1.694, + "step": 9031 + }, + { + "epoch": 0.9381946608496936, + "grad_norm": 0.4190250337123871, + "learning_rate": 9.395650896946884e-07, + "loss": 1.686, + "step": 9032 + }, + { + "epoch": 0.9382985353692739, + "grad_norm": 0.4211812913417816, + "learning_rate": 9.364194285405581e-07, + "loss": 1.6361, + "step": 9033 + }, + { + "epoch": 0.9384024098888543, + "grad_norm": 0.438837468624115, + "learning_rate": 9.332789922750495e-07, + "loss": 1.5855, + "step": 9034 + }, + { + "epoch": 0.9385062844084346, + "grad_norm": 0.40594640374183655, + "learning_rate": 9.301437812325952e-07, + "loss": 1.6175, + "step": 9035 + }, + { + "epoch": 0.9386101589280149, + "grad_norm": 0.4930059313774109, + "learning_rate": 9.27013795747067e-07, + "loss": 1.6779, + "step": 9036 + }, + { + "epoch": 0.9387140334475953, + "grad_norm": 0.4809900224208832, + "learning_rate": 9.238890361517815e-07, + "loss": 1.7196, + "step": 9037 + }, + { + "epoch": 0.9388179079671757, + "grad_norm": 0.4233071208000183, + "learning_rate": 9.20769502779506e-07, + "loss": 1.5481, + "step": 9038 + }, + { + "epoch": 0.938921782486756, + "grad_norm": 0.43219685554504395, + "learning_rate": 9.176551959624524e-07, + "loss": 1.5561, + "step": 9039 + }, + { + "epoch": 0.9390256570063363, + "grad_norm": 0.41195639967918396, + "learning_rate": 9.145461160322555e-07, + "loss": 1.6116, + "step": 9040 + }, + { + "epoch": 0.9391295315259167, + "grad_norm": 0.3952676057815552, + "learning_rate": 9.114422633200226e-07, + "loss": 1.5454, + "step": 9041 + }, + { + "epoch": 0.939233406045497, + "grad_norm": 0.41172322630882263, + "learning_rate": 9.083436381562838e-07, + "loss": 1.6116, + "step": 9042 + }, + { + "epoch": 0.9393372805650774, + "grad_norm": 0.4490038752555847, + "learning_rate": 9.05250240871014e-07, + "loss": 1.7028, + "step": 9043 + }, + { + "epoch": 0.9394411550846578, + "grad_norm": 0.46638983488082886, + "learning_rate": 9.021620717936385e-07, + "loss": 1.7121, + "step": 9044 + }, + { + "epoch": 0.9395450296042381, + "grad_norm": 0.42944493889808655, + "learning_rate": 8.990791312530222e-07, + "loss": 1.6753, + "step": 9045 + }, + { + "epoch": 0.9396489041238184, + "grad_norm": 0.4450928270816803, + "learning_rate": 8.960014195774857e-07, + "loss": 1.7523, + "step": 9046 + }, + { + "epoch": 0.9397527786433988, + "grad_norm": 0.3949832022190094, + "learning_rate": 8.929289370947669e-07, + "loss": 1.5365, + "step": 9047 + }, + { + "epoch": 0.9398566531629792, + "grad_norm": 0.41552335023880005, + "learning_rate": 8.89861684132065e-07, + "loss": 1.7463, + "step": 9048 + }, + { + "epoch": 0.9399605276825594, + "grad_norm": 0.40294694900512695, + "learning_rate": 8.867996610160246e-07, + "loss": 1.6094, + "step": 9049 + }, + { + "epoch": 0.9400644022021398, + "grad_norm": 0.43670380115509033, + "learning_rate": 8.837428680727178e-07, + "loss": 1.7627, + "step": 9050 + }, + { + "epoch": 0.9401682767217202, + "grad_norm": 0.4421059787273407, + "learning_rate": 8.806913056276788e-07, + "loss": 1.7939, + "step": 9051 + }, + { + "epoch": 0.9402721512413005, + "grad_norm": 0.48101523518562317, + "learning_rate": 8.776449740058645e-07, + "loss": 1.7245, + "step": 9052 + }, + { + "epoch": 0.9403760257608809, + "grad_norm": 0.41619133949279785, + "learning_rate": 8.746038735316931e-07, + "loss": 1.5208, + "step": 9053 + }, + { + "epoch": 0.9404799002804612, + "grad_norm": 0.46501943469047546, + "learning_rate": 8.715680045290165e-07, + "loss": 1.8578, + "step": 9054 + }, + { + "epoch": 0.9405837748000415, + "grad_norm": 0.443288654088974, + "learning_rate": 8.685373673211372e-07, + "loss": 1.6814, + "step": 9055 + }, + { + "epoch": 0.9406876493196219, + "grad_norm": 0.39399245381355286, + "learning_rate": 8.655119622307806e-07, + "loss": 1.327, + "step": 9056 + }, + { + "epoch": 0.9407915238392023, + "grad_norm": 0.40905261039733887, + "learning_rate": 8.624917895801388e-07, + "loss": 1.6839, + "step": 9057 + }, + { + "epoch": 0.9408953983587826, + "grad_norm": 0.4420108199119568, + "learning_rate": 8.594768496908378e-07, + "loss": 1.5596, + "step": 9058 + }, + { + "epoch": 0.9409992728783629, + "grad_norm": 0.3948979675769806, + "learning_rate": 8.564671428839321e-07, + "loss": 1.5437, + "step": 9059 + }, + { + "epoch": 0.9411031473979433, + "grad_norm": 0.415421724319458, + "learning_rate": 8.534626694799485e-07, + "loss": 1.5591, + "step": 9060 + }, + { + "epoch": 0.9412070219175236, + "grad_norm": 0.41926440596580505, + "learning_rate": 8.504634297988312e-07, + "loss": 1.7288, + "step": 9061 + }, + { + "epoch": 0.941310896437104, + "grad_norm": 0.3910306990146637, + "learning_rate": 8.4746942415998e-07, + "loss": 1.5736, + "step": 9062 + }, + { + "epoch": 0.9414147709566844, + "grad_norm": 0.4041973054409027, + "learning_rate": 8.444806528822291e-07, + "loss": 1.5708, + "step": 9063 + }, + { + "epoch": 0.9415186454762646, + "grad_norm": 0.423279732465744, + "learning_rate": 8.41497116283857e-07, + "loss": 1.6112, + "step": 9064 + }, + { + "epoch": 0.941622519995845, + "grad_norm": 0.4005332589149475, + "learning_rate": 8.385188146825984e-07, + "loss": 1.6238, + "step": 9065 + }, + { + "epoch": 0.9417263945154254, + "grad_norm": 0.4571692943572998, + "learning_rate": 8.355457483955998e-07, + "loss": 1.8539, + "step": 9066 + }, + { + "epoch": 0.9418302690350057, + "grad_norm": 0.4386880695819855, + "learning_rate": 8.325779177394965e-07, + "loss": 1.709, + "step": 9067 + }, + { + "epoch": 0.941934143554586, + "grad_norm": 0.4796215891838074, + "learning_rate": 8.296153230303139e-07, + "loss": 1.8527, + "step": 9068 + }, + { + "epoch": 0.9420380180741664, + "grad_norm": 0.40631356835365295, + "learning_rate": 8.266579645835548e-07, + "loss": 1.4964, + "step": 9069 + }, + { + "epoch": 0.9421418925937467, + "grad_norm": 0.40857720375061035, + "learning_rate": 8.237058427141619e-07, + "loss": 1.5452, + "step": 9070 + }, + { + "epoch": 0.9422457671133271, + "grad_norm": 0.4160623550415039, + "learning_rate": 8.20758957736506e-07, + "loss": 1.6578, + "step": 9071 + }, + { + "epoch": 0.9423496416329075, + "grad_norm": 0.4103771448135376, + "learning_rate": 8.178173099644082e-07, + "loss": 1.4836, + "step": 9072 + }, + { + "epoch": 0.9424535161524878, + "grad_norm": 0.39245137572288513, + "learning_rate": 8.148808997111234e-07, + "loss": 1.4885, + "step": 9073 + }, + { + "epoch": 0.9425573906720681, + "grad_norm": 0.4215735197067261, + "learning_rate": 8.119497272893739e-07, + "loss": 1.4961, + "step": 9074 + }, + { + "epoch": 0.9426612651916485, + "grad_norm": 0.41241350769996643, + "learning_rate": 8.090237930112932e-07, + "loss": 1.7644, + "step": 9075 + }, + { + "epoch": 0.9427651397112289, + "grad_norm": 0.41640931367874146, + "learning_rate": 8.061030971884709e-07, + "loss": 1.7283, + "step": 9076 + }, + { + "epoch": 0.9428690142308092, + "grad_norm": 0.42004087567329407, + "learning_rate": 8.031876401319471e-07, + "loss": 1.7438, + "step": 9077 + }, + { + "epoch": 0.9429728887503895, + "grad_norm": 0.4231506884098053, + "learning_rate": 8.002774221521903e-07, + "loss": 1.5288, + "step": 9078 + }, + { + "epoch": 0.9430767632699699, + "grad_norm": 0.4708472192287445, + "learning_rate": 7.973724435591135e-07, + "loss": 1.8106, + "step": 9079 + }, + { + "epoch": 0.9431806377895502, + "grad_norm": 0.4298073649406433, + "learning_rate": 7.944727046620748e-07, + "loss": 1.7294, + "step": 9080 + }, + { + "epoch": 0.9432845123091306, + "grad_norm": 0.44677823781967163, + "learning_rate": 7.915782057698772e-07, + "loss": 1.6364, + "step": 9081 + }, + { + "epoch": 0.943388386828711, + "grad_norm": 0.4536736011505127, + "learning_rate": 7.886889471907521e-07, + "loss": 1.7965, + "step": 9082 + }, + { + "epoch": 0.9434922613482912, + "grad_norm": 0.4077182114124298, + "learning_rate": 7.858049292324032e-07, + "loss": 1.6954, + "step": 9083 + }, + { + "epoch": 0.9435961358678716, + "grad_norm": 0.4194972813129425, + "learning_rate": 7.829261522019349e-07, + "loss": 1.5585, + "step": 9084 + }, + { + "epoch": 0.943700010387452, + "grad_norm": 0.4863661825656891, + "learning_rate": 7.800526164059296e-07, + "loss": 1.6892, + "step": 9085 + }, + { + "epoch": 0.9438038849070323, + "grad_norm": 0.49641919136047363, + "learning_rate": 7.771843221503927e-07, + "loss": 1.816, + "step": 9086 + }, + { + "epoch": 0.9439077594266126, + "grad_norm": 0.3937380015850067, + "learning_rate": 7.743212697407631e-07, + "loss": 1.5559, + "step": 9087 + }, + { + "epoch": 0.944011633946193, + "grad_norm": 0.45555490255355835, + "learning_rate": 7.71463459481947e-07, + "loss": 1.7509, + "step": 9088 + }, + { + "epoch": 0.9441155084657733, + "grad_norm": 0.3873693645000458, + "learning_rate": 7.686108916782786e-07, + "loss": 1.4442, + "step": 9089 + }, + { + "epoch": 0.9442193829853537, + "grad_norm": 0.4046841859817505, + "learning_rate": 7.657635666335316e-07, + "loss": 1.5161, + "step": 9090 + }, + { + "epoch": 0.9443232575049341, + "grad_norm": 0.41723448038101196, + "learning_rate": 7.629214846509191e-07, + "loss": 1.7279, + "step": 9091 + }, + { + "epoch": 0.9444271320245143, + "grad_norm": 0.39067700505256653, + "learning_rate": 7.600846460331046e-07, + "loss": 1.6004, + "step": 9092 + }, + { + "epoch": 0.9445310065440947, + "grad_norm": 0.434877872467041, + "learning_rate": 7.572530510821907e-07, + "loss": 1.6926, + "step": 9093 + }, + { + "epoch": 0.9446348810636751, + "grad_norm": 0.40955984592437744, + "learning_rate": 7.544267000997141e-07, + "loss": 1.6929, + "step": 9094 + }, + { + "epoch": 0.9447387555832554, + "grad_norm": 0.4047926962375641, + "learning_rate": 7.516055933866728e-07, + "loss": 1.6725, + "step": 9095 + }, + { + "epoch": 0.9448426301028358, + "grad_norm": 0.42006105184555054, + "learning_rate": 7.487897312434711e-07, + "loss": 1.76, + "step": 9096 + }, + { + "epoch": 0.9449465046224161, + "grad_norm": 0.4147946536540985, + "learning_rate": 7.459791139699968e-07, + "loss": 1.6635, + "step": 9097 + }, + { + "epoch": 0.9450503791419965, + "grad_norm": 0.440835565328598, + "learning_rate": 7.431737418655494e-07, + "loss": 1.7047, + "step": 9098 + }, + { + "epoch": 0.9451542536615768, + "grad_norm": 0.38934168219566345, + "learning_rate": 7.403736152288842e-07, + "loss": 1.4739, + "step": 9099 + }, + { + "epoch": 0.9452581281811572, + "grad_norm": 0.45924264192581177, + "learning_rate": 7.375787343581852e-07, + "loss": 1.8124, + "step": 9100 + }, + { + "epoch": 0.9453620027007376, + "grad_norm": 0.45659106969833374, + "learning_rate": 7.347890995510864e-07, + "loss": 1.7298, + "step": 9101 + }, + { + "epoch": 0.9454658772203178, + "grad_norm": 0.4247036278247833, + "learning_rate": 7.320047111046724e-07, + "loss": 1.6317, + "step": 9102 + }, + { + "epoch": 0.9455697517398982, + "grad_norm": 0.41282743215560913, + "learning_rate": 7.292255693154449e-07, + "loss": 1.6771, + "step": 9103 + }, + { + "epoch": 0.9456736262594786, + "grad_norm": 0.41268548369407654, + "learning_rate": 7.264516744793671e-07, + "loss": 1.6163, + "step": 9104 + }, + { + "epoch": 0.9457775007790589, + "grad_norm": 0.3944796621799469, + "learning_rate": 7.236830268918415e-07, + "loss": 1.6949, + "step": 9105 + }, + { + "epoch": 0.9458813752986392, + "grad_norm": 0.4076094925403595, + "learning_rate": 7.209196268477047e-07, + "loss": 1.5172, + "step": 9106 + }, + { + "epoch": 0.9459852498182196, + "grad_norm": 0.41099345684051514, + "learning_rate": 7.181614746412324e-07, + "loss": 1.5826, + "step": 9107 + }, + { + "epoch": 0.9460891243377999, + "grad_norm": 0.5668470859527588, + "learning_rate": 7.154085705661506e-07, + "loss": 1.8974, + "step": 9108 + }, + { + "epoch": 0.9461929988573803, + "grad_norm": 0.4689236879348755, + "learning_rate": 7.126609149156249e-07, + "loss": 1.7432, + "step": 9109 + }, + { + "epoch": 0.9462968733769607, + "grad_norm": 0.42818373441696167, + "learning_rate": 7.09918507982249e-07, + "loss": 1.614, + "step": 9110 + }, + { + "epoch": 0.9464007478965409, + "grad_norm": 0.40631645917892456, + "learning_rate": 7.071813500580837e-07, + "loss": 1.6488, + "step": 9111 + }, + { + "epoch": 0.9465046224161213, + "grad_norm": 0.38255685567855835, + "learning_rate": 7.044494414346015e-07, + "loss": 1.5024, + "step": 9112 + }, + { + "epoch": 0.9466084969357017, + "grad_norm": 0.42258989810943604, + "learning_rate": 7.017227824027305e-07, + "loss": 1.6628, + "step": 9113 + }, + { + "epoch": 0.946712371455282, + "grad_norm": 0.40682870149612427, + "learning_rate": 6.990013732528444e-07, + "loss": 1.6693, + "step": 9114 + }, + { + "epoch": 0.9468162459748624, + "grad_norm": 0.4137510657310486, + "learning_rate": 6.9628521427475e-07, + "loss": 1.5755, + "step": 9115 + }, + { + "epoch": 0.9469201204944427, + "grad_norm": 0.446568101644516, + "learning_rate": 6.935743057576938e-07, + "loss": 1.6624, + "step": 9116 + }, + { + "epoch": 0.947023995014023, + "grad_norm": 0.41516193747520447, + "learning_rate": 6.908686479903725e-07, + "loss": 1.6738, + "step": 9117 + }, + { + "epoch": 0.9471278695336034, + "grad_norm": 0.4611627459526062, + "learning_rate": 6.881682412609114e-07, + "loss": 1.7311, + "step": 9118 + }, + { + "epoch": 0.9472317440531838, + "grad_norm": 0.5270797610282898, + "learning_rate": 6.854730858568914e-07, + "loss": 1.7126, + "step": 9119 + }, + { + "epoch": 0.947335618572764, + "grad_norm": 0.41994228959083557, + "learning_rate": 6.827831820653163e-07, + "loss": 1.7939, + "step": 9120 + }, + { + "epoch": 0.9474394930923444, + "grad_norm": 0.44093456864356995, + "learning_rate": 6.800985301726404e-07, + "loss": 1.6198, + "step": 9121 + }, + { + "epoch": 0.9475433676119248, + "grad_norm": 0.4290820360183716, + "learning_rate": 6.77419130464757e-07, + "loss": 1.73, + "step": 9122 + }, + { + "epoch": 0.9476472421315052, + "grad_norm": 0.4182687997817993, + "learning_rate": 6.747449832270158e-07, + "loss": 1.6677, + "step": 9123 + }, + { + "epoch": 0.9477511166510855, + "grad_norm": 0.4348759949207306, + "learning_rate": 6.720760887441779e-07, + "loss": 1.8017, + "step": 9124 + }, + { + "epoch": 0.9478549911706659, + "grad_norm": 0.40973037481307983, + "learning_rate": 6.694124473004604e-07, + "loss": 1.6173, + "step": 9125 + }, + { + "epoch": 0.9479588656902462, + "grad_norm": 0.4033758342266083, + "learning_rate": 6.667540591795307e-07, + "loss": 1.5386, + "step": 9126 + }, + { + "epoch": 0.9480627402098265, + "grad_norm": 0.4231725037097931, + "learning_rate": 6.641009246644736e-07, + "loss": 1.652, + "step": 9127 + }, + { + "epoch": 0.9481666147294069, + "grad_norm": 0.40688326954841614, + "learning_rate": 6.614530440378352e-07, + "loss": 1.6695, + "step": 9128 + }, + { + "epoch": 0.9482704892489873, + "grad_norm": 0.38211679458618164, + "learning_rate": 6.588104175815901e-07, + "loss": 1.4878, + "step": 9129 + }, + { + "epoch": 0.9483743637685675, + "grad_norm": 0.4489685595035553, + "learning_rate": 6.561730455771686e-07, + "loss": 1.6489, + "step": 9130 + }, + { + "epoch": 0.9484782382881479, + "grad_norm": 0.4730764627456665, + "learning_rate": 6.535409283054073e-07, + "loss": 1.6737, + "step": 9131 + }, + { + "epoch": 0.9485821128077283, + "grad_norm": 0.38945716619491577, + "learning_rate": 6.509140660466318e-07, + "loss": 1.6735, + "step": 9132 + }, + { + "epoch": 0.9486859873273086, + "grad_norm": 0.4208957254886627, + "learning_rate": 6.482924590805627e-07, + "loss": 1.6911, + "step": 9133 + }, + { + "epoch": 0.948789861846889, + "grad_norm": 0.5277408361434937, + "learning_rate": 6.456761076863937e-07, + "loss": 1.5338, + "step": 9134 + }, + { + "epoch": 0.9488937363664693, + "grad_norm": 0.39843320846557617, + "learning_rate": 6.430650121427406e-07, + "loss": 1.5234, + "step": 9135 + }, + { + "epoch": 0.9489976108860496, + "grad_norm": 0.42443880438804626, + "learning_rate": 6.404591727276588e-07, + "loss": 1.7075, + "step": 9136 + }, + { + "epoch": 0.94910148540563, + "grad_norm": 0.4635830223560333, + "learning_rate": 6.378585897186651e-07, + "loss": 1.6455, + "step": 9137 + }, + { + "epoch": 0.9492053599252104, + "grad_norm": 0.39471131563186646, + "learning_rate": 6.352632633926825e-07, + "loss": 1.5482, + "step": 9138 + }, + { + "epoch": 0.9493092344447907, + "grad_norm": 0.45582547783851624, + "learning_rate": 6.326731940261122e-07, + "loss": 1.7572, + "step": 9139 + }, + { + "epoch": 0.949413108964371, + "grad_norm": 0.4490654170513153, + "learning_rate": 6.300883818947611e-07, + "loss": 1.6723, + "step": 9140 + }, + { + "epoch": 0.9495169834839514, + "grad_norm": 0.38818129897117615, + "learning_rate": 6.275088272738927e-07, + "loss": 1.7014, + "step": 9141 + }, + { + "epoch": 0.9496208580035317, + "grad_norm": 0.4169076383113861, + "learning_rate": 6.249345304382259e-07, + "loss": 1.6569, + "step": 9142 + }, + { + "epoch": 0.9497247325231121, + "grad_norm": 0.39485812187194824, + "learning_rate": 6.223654916618804e-07, + "loss": 1.6363, + "step": 9143 + }, + { + "epoch": 0.9498286070426925, + "grad_norm": 0.41402456164360046, + "learning_rate": 6.19801711218454e-07, + "loss": 1.5838, + "step": 9144 + }, + { + "epoch": 0.9499324815622727, + "grad_norm": 0.4002974331378937, + "learning_rate": 6.172431893809616e-07, + "loss": 1.4252, + "step": 9145 + }, + { + "epoch": 0.9500363560818531, + "grad_norm": 0.42501163482666016, + "learning_rate": 6.146899264218742e-07, + "loss": 1.5948, + "step": 9146 + }, + { + "epoch": 0.9501402306014335, + "grad_norm": 0.4167088270187378, + "learning_rate": 6.121419226130909e-07, + "loss": 1.5962, + "step": 9147 + }, + { + "epoch": 0.9502441051210138, + "grad_norm": 0.42873653769493103, + "learning_rate": 6.095991782259447e-07, + "loss": 1.6758, + "step": 9148 + }, + { + "epoch": 0.9503479796405941, + "grad_norm": 0.37248915433883667, + "learning_rate": 6.070616935312357e-07, + "loss": 1.4357, + "step": 9149 + }, + { + "epoch": 0.9504518541601745, + "grad_norm": 0.4385361671447754, + "learning_rate": 6.045294687991643e-07, + "loss": 1.8251, + "step": 9150 + }, + { + "epoch": 0.9505557286797549, + "grad_norm": 0.44529131054878235, + "learning_rate": 6.020025042994148e-07, + "loss": 1.8014, + "step": 9151 + }, + { + "epoch": 0.9506596031993352, + "grad_norm": 0.4091431498527527, + "learning_rate": 5.994808003010777e-07, + "loss": 1.5917, + "step": 9152 + }, + { + "epoch": 0.9507634777189156, + "grad_norm": 0.4284379780292511, + "learning_rate": 5.969643570726935e-07, + "loss": 1.5155, + "step": 9153 + }, + { + "epoch": 0.9508673522384959, + "grad_norm": 0.4677438735961914, + "learning_rate": 5.944531748822535e-07, + "loss": 1.489, + "step": 9154 + }, + { + "epoch": 0.9509712267580762, + "grad_norm": 0.4163029193878174, + "learning_rate": 5.91947253997166e-07, + "loss": 1.4501, + "step": 9155 + }, + { + "epoch": 0.9510751012776566, + "grad_norm": 0.41146647930145264, + "learning_rate": 5.894465946843008e-07, + "loss": 1.5144, + "step": 9156 + }, + { + "epoch": 0.951178975797237, + "grad_norm": 0.4144413471221924, + "learning_rate": 5.869511972099506e-07, + "loss": 1.6731, + "step": 9157 + }, + { + "epoch": 0.9512828503168173, + "grad_norm": 0.4550022482872009, + "learning_rate": 5.844610618398693e-07, + "loss": 1.7321, + "step": 9158 + }, + { + "epoch": 0.9513867248363976, + "grad_norm": 0.4146331548690796, + "learning_rate": 5.819761888392228e-07, + "loss": 1.7712, + "step": 9159 + }, + { + "epoch": 0.951490599355978, + "grad_norm": 0.4827490746974945, + "learning_rate": 5.794965784726436e-07, + "loss": 1.7595, + "step": 9160 + }, + { + "epoch": 0.9515944738755583, + "grad_norm": 0.4096835255622864, + "learning_rate": 5.770222310041817e-07, + "loss": 1.7085, + "step": 9161 + }, + { + "epoch": 0.9516983483951387, + "grad_norm": 0.4233892261981964, + "learning_rate": 5.745531466973375e-07, + "loss": 1.7196, + "step": 9162 + }, + { + "epoch": 0.951802222914719, + "grad_norm": 0.457246333360672, + "learning_rate": 5.720893258150451e-07, + "loss": 1.7608, + "step": 9163 + }, + { + "epoch": 0.9519060974342993, + "grad_norm": 0.40001896023750305, + "learning_rate": 5.696307686196944e-07, + "loss": 1.6069, + "step": 9164 + }, + { + "epoch": 0.9520099719538797, + "grad_norm": 0.40769293904304504, + "learning_rate": 5.67177475373093e-07, + "loss": 1.5777, + "step": 9165 + }, + { + "epoch": 0.9521138464734601, + "grad_norm": 0.4234029948711395, + "learning_rate": 5.647294463364927e-07, + "loss": 1.8119, + "step": 9166 + }, + { + "epoch": 0.9522177209930404, + "grad_norm": 0.4327137768268585, + "learning_rate": 5.622866817706074e-07, + "loss": 1.5702, + "step": 9167 + }, + { + "epoch": 0.9523215955126207, + "grad_norm": 0.4188234806060791, + "learning_rate": 5.598491819355567e-07, + "loss": 1.679, + "step": 9168 + }, + { + "epoch": 0.9524254700322011, + "grad_norm": 0.44610387086868286, + "learning_rate": 5.574169470909163e-07, + "loss": 1.6981, + "step": 9169 + }, + { + "epoch": 0.9525293445517814, + "grad_norm": 0.4079190194606781, + "learning_rate": 5.549899774957123e-07, + "loss": 1.5524, + "step": 9170 + }, + { + "epoch": 0.9526332190713618, + "grad_norm": 0.4039902985095978, + "learning_rate": 5.52568273408377e-07, + "loss": 1.7129, + "step": 9171 + }, + { + "epoch": 0.9527370935909422, + "grad_norm": 0.4110052287578583, + "learning_rate": 5.501518350868262e-07, + "loss": 1.5757, + "step": 9172 + }, + { + "epoch": 0.9528409681105224, + "grad_norm": 0.3944413959980011, + "learning_rate": 5.477406627883763e-07, + "loss": 1.6487, + "step": 9173 + }, + { + "epoch": 0.9529448426301028, + "grad_norm": 0.4401375949382782, + "learning_rate": 5.453347567698108e-07, + "loss": 1.7217, + "step": 9174 + }, + { + "epoch": 0.9530487171496832, + "grad_norm": 0.40892985463142395, + "learning_rate": 5.429341172873248e-07, + "loss": 1.644, + "step": 9175 + }, + { + "epoch": 0.9531525916692636, + "grad_norm": 0.40419328212738037, + "learning_rate": 5.405387445965748e-07, + "loss": 1.5666, + "step": 9176 + }, + { + "epoch": 0.9532564661888439, + "grad_norm": 0.4269610643386841, + "learning_rate": 5.381486389526569e-07, + "loss": 1.5313, + "step": 9177 + }, + { + "epoch": 0.9533603407084242, + "grad_norm": 0.4391513764858246, + "learning_rate": 5.35763800610084e-07, + "loss": 1.6146, + "step": 9178 + }, + { + "epoch": 0.9534642152280046, + "grad_norm": 0.4620340168476105, + "learning_rate": 5.333842298228364e-07, + "loss": 1.7637, + "step": 9179 + }, + { + "epoch": 0.9535680897475849, + "grad_norm": 0.44196537137031555, + "learning_rate": 5.310099268443114e-07, + "loss": 1.7589, + "step": 9180 + }, + { + "epoch": 0.9536719642671653, + "grad_norm": 0.42510437965393066, + "learning_rate": 5.286408919273567e-07, + "loss": 1.7186, + "step": 9181 + }, + { + "epoch": 0.9537758387867457, + "grad_norm": 0.4449099600315094, + "learning_rate": 5.262771253242593e-07, + "loss": 1.698, + "step": 9182 + }, + { + "epoch": 0.9538797133063259, + "grad_norm": 0.4278699457645416, + "learning_rate": 5.239186272867347e-07, + "loss": 1.4785, + "step": 9183 + }, + { + "epoch": 0.9539835878259063, + "grad_norm": 0.40362176299095154, + "learning_rate": 5.215653980659485e-07, + "loss": 1.7033, + "step": 9184 + }, + { + "epoch": 0.9540874623454867, + "grad_norm": 0.4301554262638092, + "learning_rate": 5.192174379124948e-07, + "loss": 1.6292, + "step": 9185 + }, + { + "epoch": 0.954191336865067, + "grad_norm": 0.4313088655471802, + "learning_rate": 5.16874747076429e-07, + "loss": 1.5956, + "step": 9186 + }, + { + "epoch": 0.9542952113846473, + "grad_norm": 0.43892988562583923, + "learning_rate": 5.145373258072129e-07, + "loss": 1.7799, + "step": 9187 + }, + { + "epoch": 0.9543990859042277, + "grad_norm": 0.4435426890850067, + "learning_rate": 5.122051743537692e-07, + "loss": 1.7143, + "step": 9188 + }, + { + "epoch": 0.954502960423808, + "grad_norm": 0.4262649416923523, + "learning_rate": 5.098782929644607e-07, + "loss": 1.5772, + "step": 9189 + }, + { + "epoch": 0.9546068349433884, + "grad_norm": 0.4397432804107666, + "learning_rate": 5.075566818870725e-07, + "loss": 1.7397, + "step": 9190 + }, + { + "epoch": 0.9547107094629688, + "grad_norm": 0.42559805512428284, + "learning_rate": 5.0524034136884e-07, + "loss": 1.6717, + "step": 9191 + }, + { + "epoch": 0.954814583982549, + "grad_norm": 0.40015941858291626, + "learning_rate": 5.029292716564382e-07, + "loss": 1.6091, + "step": 9192 + }, + { + "epoch": 0.9549184585021294, + "grad_norm": 0.43587568402290344, + "learning_rate": 5.006234729959813e-07, + "loss": 1.6914, + "step": 9193 + }, + { + "epoch": 0.9550223330217098, + "grad_norm": 0.40942129492759705, + "learning_rate": 4.983229456330063e-07, + "loss": 1.5895, + "step": 9194 + }, + { + "epoch": 0.9551262075412901, + "grad_norm": 0.45830440521240234, + "learning_rate": 4.960276898125227e-07, + "loss": 1.7004, + "step": 9195 + }, + { + "epoch": 0.9552300820608705, + "grad_norm": 0.45352327823638916, + "learning_rate": 4.93737705778935e-07, + "loss": 1.684, + "step": 9196 + }, + { + "epoch": 0.9553339565804508, + "grad_norm": 0.4297914206981659, + "learning_rate": 4.914529937761147e-07, + "loss": 1.8311, + "step": 9197 + }, + { + "epoch": 0.9554378311000311, + "grad_norm": 0.40148767828941345, + "learning_rate": 4.891735540473785e-07, + "loss": 1.5433, + "step": 9198 + }, + { + "epoch": 0.9555417056196115, + "grad_norm": 0.433064341545105, + "learning_rate": 4.868993868354544e-07, + "loss": 1.6219, + "step": 9199 + }, + { + "epoch": 0.9556455801391919, + "grad_norm": 0.4220326542854309, + "learning_rate": 4.846304923825263e-07, + "loss": 1.6918, + "step": 9200 + }, + { + "epoch": 0.9557494546587723, + "grad_norm": 0.43805915117263794, + "learning_rate": 4.823668709302176e-07, + "loss": 1.5888, + "step": 9201 + }, + { + "epoch": 0.9558533291783525, + "grad_norm": 0.4448419511318207, + "learning_rate": 4.801085227195856e-07, + "loss": 1.6003, + "step": 9202 + }, + { + "epoch": 0.9559572036979329, + "grad_norm": 0.42299073934555054, + "learning_rate": 4.778554479911268e-07, + "loss": 1.6384, + "step": 9203 + }, + { + "epoch": 0.9560610782175133, + "grad_norm": 0.405784547328949, + "learning_rate": 4.7560764698477146e-07, + "loss": 1.6106, + "step": 9204 + }, + { + "epoch": 0.9561649527370936, + "grad_norm": 0.41542530059814453, + "learning_rate": 4.7336511993990586e-07, + "loss": 1.6064, + "step": 9205 + }, + { + "epoch": 0.956268827256674, + "grad_norm": 0.45177707076072693, + "learning_rate": 4.711278670953223e-07, + "loss": 1.8089, + "step": 9206 + }, + { + "epoch": 0.9563727017762543, + "grad_norm": 0.45689210295677185, + "learning_rate": 4.688958886892858e-07, + "loss": 1.5888, + "step": 9207 + }, + { + "epoch": 0.9564765762958346, + "grad_norm": 0.4608011245727539, + "learning_rate": 4.666691849594784e-07, + "loss": 1.8167, + "step": 9208 + }, + { + "epoch": 0.956580450815415, + "grad_norm": 0.40499821305274963, + "learning_rate": 4.6444775614302714e-07, + "loss": 1.6155, + "step": 9209 + }, + { + "epoch": 0.9566843253349954, + "grad_norm": 0.3993094265460968, + "learning_rate": 4.6223160247650386e-07, + "loss": 1.5941, + "step": 9210 + }, + { + "epoch": 0.9567881998545756, + "grad_norm": 0.4130497872829437, + "learning_rate": 4.600207241958976e-07, + "loss": 1.5724, + "step": 9211 + }, + { + "epoch": 0.956892074374156, + "grad_norm": 0.4392036497592926, + "learning_rate": 4.5781512153665884e-07, + "loss": 1.6075, + "step": 9212 + }, + { + "epoch": 0.9569959488937364, + "grad_norm": 0.4254901111125946, + "learning_rate": 4.5561479473366644e-07, + "loss": 1.7026, + "step": 9213 + }, + { + "epoch": 0.9570998234133167, + "grad_norm": 0.4199539124965668, + "learning_rate": 4.53419744021244e-07, + "loss": 1.5035, + "step": 9214 + }, + { + "epoch": 0.9572036979328971, + "grad_norm": 0.4554733633995056, + "learning_rate": 4.5122996963313234e-07, + "loss": 1.6648, + "step": 9215 + }, + { + "epoch": 0.9573075724524774, + "grad_norm": 0.4227248430252075, + "learning_rate": 4.4904547180253386e-07, + "loss": 1.5704, + "step": 9216 + }, + { + "epoch": 0.9574114469720577, + "grad_norm": 0.37971073389053345, + "learning_rate": 4.468662507620847e-07, + "loss": 1.5082, + "step": 9217 + }, + { + "epoch": 0.9575153214916381, + "grad_norm": 0.40242883563041687, + "learning_rate": 4.446923067438491e-07, + "loss": 1.5791, + "step": 9218 + }, + { + "epoch": 0.9576191960112185, + "grad_norm": 0.43031686544418335, + "learning_rate": 4.425236399793364e-07, + "loss": 1.6155, + "step": 9219 + }, + { + "epoch": 0.9577230705307987, + "grad_norm": 0.4682821035385132, + "learning_rate": 4.403602506994897e-07, + "loss": 1.7276, + "step": 9220 + }, + { + "epoch": 0.9578269450503791, + "grad_norm": 0.4437069892883301, + "learning_rate": 4.382021391346969e-07, + "loss": 1.6508, + "step": 9221 + }, + { + "epoch": 0.9579308195699595, + "grad_norm": 0.38210955262184143, + "learning_rate": 4.360493055147796e-07, + "loss": 1.5887, + "step": 9222 + }, + { + "epoch": 0.9580346940895398, + "grad_norm": 0.4004855751991272, + "learning_rate": 4.339017500690046e-07, + "loss": 1.4538, + "step": 9223 + }, + { + "epoch": 0.9581385686091202, + "grad_norm": 0.41363096237182617, + "learning_rate": 4.3175947302605547e-07, + "loss": 1.6011, + "step": 9224 + }, + { + "epoch": 0.9582424431287005, + "grad_norm": 0.42075902223587036, + "learning_rate": 4.296224746140776e-07, + "loss": 1.6147, + "step": 9225 + }, + { + "epoch": 0.9583463176482809, + "grad_norm": 0.45383062958717346, + "learning_rate": 4.2749075506063883e-07, + "loss": 1.6448, + "step": 9226 + }, + { + "epoch": 0.9584501921678612, + "grad_norm": 0.40463367104530334, + "learning_rate": 4.253643145927577e-07, + "loss": 1.663, + "step": 9227 + }, + { + "epoch": 0.9585540666874416, + "grad_norm": 0.4023562967777252, + "learning_rate": 4.232431534368808e-07, + "loss": 1.4755, + "step": 9228 + }, + { + "epoch": 0.958657941207022, + "grad_norm": 0.4149220883846283, + "learning_rate": 4.211272718188941e-07, + "loss": 1.6507, + "step": 9229 + }, + { + "epoch": 0.9587618157266022, + "grad_norm": 0.42307958006858826, + "learning_rate": 4.190166699641229e-07, + "loss": 1.7877, + "step": 9230 + }, + { + "epoch": 0.9588656902461826, + "grad_norm": 0.4115198254585266, + "learning_rate": 4.169113480973319e-07, + "loss": 1.5952, + "step": 9231 + }, + { + "epoch": 0.958969564765763, + "grad_norm": 0.4087419807910919, + "learning_rate": 4.148113064427139e-07, + "loss": 1.6713, + "step": 9232 + }, + { + "epoch": 0.9590734392853433, + "grad_norm": 0.43182262778282166, + "learning_rate": 4.127165452239179e-07, + "loss": 1.728, + "step": 9233 + }, + { + "epoch": 0.9591773138049237, + "grad_norm": 0.41632604598999023, + "learning_rate": 4.106270646640098e-07, + "loss": 1.7331, + "step": 9234 + }, + { + "epoch": 0.959281188324504, + "grad_norm": 0.4216884970664978, + "learning_rate": 4.085428649855061e-07, + "loss": 1.7273, + "step": 9235 + }, + { + "epoch": 0.9593850628440843, + "grad_norm": 0.39997923374176025, + "learning_rate": 4.0646394641036255e-07, + "loss": 1.6779, + "step": 9236 + }, + { + "epoch": 0.9594889373636647, + "grad_norm": 0.4439750015735626, + "learning_rate": 4.043903091599632e-07, + "loss": 1.8105, + "step": 9237 + }, + { + "epoch": 0.9595928118832451, + "grad_norm": 0.4040851593017578, + "learning_rate": 4.02321953455137e-07, + "loss": 1.619, + "step": 9238 + }, + { + "epoch": 0.9596966864028253, + "grad_norm": 0.40160131454467773, + "learning_rate": 4.002588795161466e-07, + "loss": 1.5693, + "step": 9239 + }, + { + "epoch": 0.9598005609224057, + "grad_norm": 0.49092456698417664, + "learning_rate": 3.9820108756268846e-07, + "loss": 1.8386, + "step": 9240 + }, + { + "epoch": 0.9599044354419861, + "grad_norm": 0.4417165517807007, + "learning_rate": 3.9614857781390955e-07, + "loss": 1.5334, + "step": 9241 + }, + { + "epoch": 0.9600083099615664, + "grad_norm": 0.4167068302631378, + "learning_rate": 3.941013504883795e-07, + "loss": 1.7032, + "step": 9242 + }, + { + "epoch": 0.9601121844811468, + "grad_norm": 0.42593681812286377, + "learning_rate": 3.920594058041183e-07, + "loss": 1.741, + "step": 9243 + }, + { + "epoch": 0.9602160590007272, + "grad_norm": 0.4693407118320465, + "learning_rate": 3.9002274397856863e-07, + "loss": 1.8403, + "step": 9244 + }, + { + "epoch": 0.9603199335203074, + "grad_norm": 0.41874200105667114, + "learning_rate": 3.8799136522862933e-07, + "loss": 1.6718, + "step": 9245 + }, + { + "epoch": 0.9604238080398878, + "grad_norm": 0.42587071657180786, + "learning_rate": 3.859652697706162e-07, + "loss": 1.5569, + "step": 9246 + }, + { + "epoch": 0.9605276825594682, + "grad_norm": 0.4416424334049225, + "learning_rate": 3.8394445782030107e-07, + "loss": 1.5302, + "step": 9247 + }, + { + "epoch": 0.9606315570790485, + "grad_norm": 0.4183494746685028, + "learning_rate": 3.819289295928785e-07, + "loss": 1.6514, + "step": 9248 + }, + { + "epoch": 0.9607354315986288, + "grad_norm": 0.46521294116973877, + "learning_rate": 3.7991868530299345e-07, + "loss": 1.5298, + "step": 9249 + }, + { + "epoch": 0.9608393061182092, + "grad_norm": 0.424907386302948, + "learning_rate": 3.7791372516470803e-07, + "loss": 1.7081, + "step": 9250 + }, + { + "epoch": 0.9609431806377895, + "grad_norm": 0.44091981649398804, + "learning_rate": 3.759140493915514e-07, + "loss": 1.7594, + "step": 9251 + }, + { + "epoch": 0.9610470551573699, + "grad_norm": 0.4381832778453827, + "learning_rate": 3.7391965819646435e-07, + "loss": 1.5333, + "step": 9252 + }, + { + "epoch": 0.9611509296769503, + "grad_norm": 0.41242584586143494, + "learning_rate": 3.7193055179183256e-07, + "loss": 1.5122, + "step": 9253 + }, + { + "epoch": 0.9612548041965306, + "grad_norm": 0.4707975685596466, + "learning_rate": 3.6994673038948655e-07, + "loss": 1.7078, + "step": 9254 + }, + { + "epoch": 0.9613586787161109, + "grad_norm": 0.49493807554244995, + "learning_rate": 3.679681942006852e-07, + "loss": 1.5883, + "step": 9255 + }, + { + "epoch": 0.9614625532356913, + "grad_norm": 0.4006246030330658, + "learning_rate": 3.6599494343612095e-07, + "loss": 1.6098, + "step": 9256 + }, + { + "epoch": 0.9615664277552717, + "grad_norm": 0.4326263964176178, + "learning_rate": 3.6402697830594243e-07, + "loss": 1.6603, + "step": 9257 + }, + { + "epoch": 0.961670302274852, + "grad_norm": 0.39721041917800903, + "learning_rate": 3.6206429901970985e-07, + "loss": 1.493, + "step": 9258 + }, + { + "epoch": 0.9617741767944323, + "grad_norm": 0.41533419489860535, + "learning_rate": 3.6010690578643924e-07, + "loss": 1.5743, + "step": 9259 + }, + { + "epoch": 0.9618780513140127, + "grad_norm": 0.3986194431781769, + "learning_rate": 3.58154798814575e-07, + "loss": 1.4523, + "step": 9260 + }, + { + "epoch": 0.961981925833593, + "grad_norm": 0.46424034237861633, + "learning_rate": 3.56207978312012e-07, + "loss": 1.6645, + "step": 9261 + }, + { + "epoch": 0.9620858003531734, + "grad_norm": 0.44541025161743164, + "learning_rate": 3.542664444860566e-07, + "loss": 1.5695, + "step": 9262 + }, + { + "epoch": 0.9621896748727538, + "grad_norm": 0.39570924639701843, + "learning_rate": 3.5233019754347117e-07, + "loss": 1.5907, + "step": 9263 + }, + { + "epoch": 0.962293549392334, + "grad_norm": 0.4216559827327728, + "learning_rate": 3.503992376904575e-07, + "loss": 1.7391, + "step": 9264 + }, + { + "epoch": 0.9623974239119144, + "grad_norm": 0.3937249481678009, + "learning_rate": 3.4847356513263986e-07, + "loss": 1.6637, + "step": 9265 + }, + { + "epoch": 0.9625012984314948, + "grad_norm": 0.43053656816482544, + "learning_rate": 3.4655318007509316e-07, + "loss": 1.5119, + "step": 9266 + }, + { + "epoch": 0.9626051729510751, + "grad_norm": 0.423225998878479, + "learning_rate": 3.4463808272232055e-07, + "loss": 1.5577, + "step": 9267 + }, + { + "epoch": 0.9627090474706554, + "grad_norm": 0.3969104588031769, + "learning_rate": 3.427282732782644e-07, + "loss": 1.6157, + "step": 9268 + }, + { + "epoch": 0.9628129219902358, + "grad_norm": 0.42695853114128113, + "learning_rate": 3.4082375194630643e-07, + "loss": 1.5988, + "step": 9269 + }, + { + "epoch": 0.9629167965098161, + "grad_norm": 0.4294392764568329, + "learning_rate": 3.3892451892926227e-07, + "loss": 1.5232, + "step": 9270 + }, + { + "epoch": 0.9630206710293965, + "grad_norm": 0.45847055315971375, + "learning_rate": 3.370305744293867e-07, + "loss": 1.7633, + "step": 9271 + }, + { + "epoch": 0.9631245455489769, + "grad_norm": 0.4628918170928955, + "learning_rate": 3.351419186483684e-07, + "loss": 1.7437, + "step": 9272 + }, + { + "epoch": 0.9632284200685571, + "grad_norm": 0.40736517310142517, + "learning_rate": 3.3325855178733543e-07, + "loss": 1.6749, + "step": 9273 + }, + { + "epoch": 0.9633322945881375, + "grad_norm": 0.43378880620002747, + "learning_rate": 3.3138047404684957e-07, + "loss": 1.7485, + "step": 9274 + }, + { + "epoch": 0.9634361691077179, + "grad_norm": 0.40924814343452454, + "learning_rate": 3.295076856269119e-07, + "loss": 1.6725, + "step": 9275 + }, + { + "epoch": 0.9635400436272982, + "grad_norm": 0.40728503465652466, + "learning_rate": 3.2764018672696293e-07, + "loss": 1.6062, + "step": 9276 + }, + { + "epoch": 0.9636439181468786, + "grad_norm": 0.45276060700416565, + "learning_rate": 3.257779775458769e-07, + "loss": 1.7222, + "step": 9277 + }, + { + "epoch": 0.9637477926664589, + "grad_norm": 0.4360550045967102, + "learning_rate": 3.239210582819563e-07, + "loss": 1.7827, + "step": 9278 + }, + { + "epoch": 0.9638516671860393, + "grad_norm": 0.46878498792648315, + "learning_rate": 3.220694291329596e-07, + "loss": 1.7343, + "step": 9279 + }, + { + "epoch": 0.9639555417056196, + "grad_norm": 0.4349597096443176, + "learning_rate": 3.202230902960679e-07, + "loss": 1.3551, + "step": 9280 + }, + { + "epoch": 0.9640594162252, + "grad_norm": 0.4269922971725464, + "learning_rate": 3.1838204196789625e-07, + "loss": 1.5912, + "step": 9281 + }, + { + "epoch": 0.9641632907447804, + "grad_norm": 0.42639386653900146, + "learning_rate": 3.1654628434450437e-07, + "loss": 1.6366, + "step": 9282 + }, + { + "epoch": 0.9642671652643606, + "grad_norm": 0.41460925340652466, + "learning_rate": 3.147158176213916e-07, + "loss": 1.5546, + "step": 9283 + }, + { + "epoch": 0.964371039783941, + "grad_norm": 0.4272399842739105, + "learning_rate": 3.1289064199347964e-07, + "loss": 1.6533, + "step": 9284 + }, + { + "epoch": 0.9644749143035214, + "grad_norm": 0.40159735083580017, + "learning_rate": 3.110707576551408e-07, + "loss": 1.7311, + "step": 9285 + }, + { + "epoch": 0.9645787888231017, + "grad_norm": 0.40594831109046936, + "learning_rate": 3.092561648001813e-07, + "loss": 1.534, + "step": 9286 + }, + { + "epoch": 0.964682663342682, + "grad_norm": 0.4097544550895691, + "learning_rate": 3.074468636218353e-07, + "loss": 1.6231, + "step": 9287 + }, + { + "epoch": 0.9647865378622624, + "grad_norm": 0.40390005707740784, + "learning_rate": 3.056428543127765e-07, + "loss": 1.6062, + "step": 9288 + }, + { + "epoch": 0.9648904123818427, + "grad_norm": 0.4110979735851288, + "learning_rate": 3.038441370651346e-07, + "loss": 1.6621, + "step": 9289 + }, + { + "epoch": 0.9649942869014231, + "grad_norm": 0.4974674582481384, + "learning_rate": 3.020507120704341e-07, + "loss": 1.8125, + "step": 9290 + }, + { + "epoch": 0.9650981614210035, + "grad_norm": 0.45189183950424194, + "learning_rate": 3.002625795196834e-07, + "loss": 1.6262, + "step": 9291 + }, + { + "epoch": 0.9652020359405837, + "grad_norm": 0.4480794668197632, + "learning_rate": 2.9847973960329124e-07, + "loss": 1.6016, + "step": 9292 + }, + { + "epoch": 0.9653059104601641, + "grad_norm": 0.4347269535064697, + "learning_rate": 2.967021925111224e-07, + "loss": 1.9229, + "step": 9293 + }, + { + "epoch": 0.9654097849797445, + "grad_norm": 0.4739961326122284, + "learning_rate": 2.949299384324644e-07, + "loss": 1.5661, + "step": 9294 + }, + { + "epoch": 0.9655136594993248, + "grad_norm": 0.39016592502593994, + "learning_rate": 2.931629775560607e-07, + "loss": 1.542, + "step": 9295 + }, + { + "epoch": 0.9656175340189052, + "grad_norm": 0.42430630326271057, + "learning_rate": 2.914013100700663e-07, + "loss": 1.4694, + "step": 9296 + }, + { + "epoch": 0.9657214085384855, + "grad_norm": 0.4428417682647705, + "learning_rate": 2.896449361620923e-07, + "loss": 1.7179, + "step": 9297 + }, + { + "epoch": 0.9658252830580658, + "grad_norm": 0.4339398145675659, + "learning_rate": 2.8789385601917797e-07, + "loss": 1.6341, + "step": 9298 + }, + { + "epoch": 0.9659291575776462, + "grad_norm": 0.4497127830982208, + "learning_rate": 2.861480698277963e-07, + "loss": 1.8256, + "step": 9299 + }, + { + "epoch": 0.9660330320972266, + "grad_norm": 0.47623205184936523, + "learning_rate": 2.844075777738597e-07, + "loss": 1.8018, + "step": 9300 + }, + { + "epoch": 0.9661369066168068, + "grad_norm": 0.4075833261013031, + "learning_rate": 2.8267238004272554e-07, + "loss": 1.6079, + "step": 9301 + }, + { + "epoch": 0.9662407811363872, + "grad_norm": 0.44640010595321655, + "learning_rate": 2.8094247681916817e-07, + "loss": 1.7133, + "step": 9302 + }, + { + "epoch": 0.9663446556559676, + "grad_norm": 0.4296375811100006, + "learning_rate": 2.79217868287418e-07, + "loss": 1.6657, + "step": 9303 + }, + { + "epoch": 0.966448530175548, + "grad_norm": 0.40996262431144714, + "learning_rate": 2.7749855463112265e-07, + "loss": 1.6412, + "step": 9304 + }, + { + "epoch": 0.9665524046951283, + "grad_norm": 0.45529499650001526, + "learning_rate": 2.757845360333855e-07, + "loss": 1.6247, + "step": 9305 + }, + { + "epoch": 0.9666562792147086, + "grad_norm": 0.4100009500980377, + "learning_rate": 2.740758126767273e-07, + "loss": 1.6921, + "step": 9306 + }, + { + "epoch": 0.966760153734289, + "grad_norm": 0.49910518527030945, + "learning_rate": 2.723723847431192e-07, + "loss": 1.9038, + "step": 9307 + }, + { + "epoch": 0.9668640282538693, + "grad_norm": 0.42856353521347046, + "learning_rate": 2.7067425241396606e-07, + "loss": 1.6905, + "step": 9308 + }, + { + "epoch": 0.9669679027734497, + "grad_norm": 0.4085339605808258, + "learning_rate": 2.689814158700954e-07, + "loss": 1.5855, + "step": 9309 + }, + { + "epoch": 0.9670717772930301, + "grad_norm": 0.3960736095905304, + "learning_rate": 2.672938752917908e-07, + "loss": 1.5288, + "step": 9310 + }, + { + "epoch": 0.9671756518126103, + "grad_norm": 0.4568417966365814, + "learning_rate": 2.6561163085875863e-07, + "loss": 1.6313, + "step": 9311 + }, + { + "epoch": 0.9672795263321907, + "grad_norm": 0.39653193950653076, + "learning_rate": 2.6393468275014434e-07, + "loss": 1.6358, + "step": 9312 + }, + { + "epoch": 0.9673834008517711, + "grad_norm": 0.436946302652359, + "learning_rate": 2.6226303114452733e-07, + "loss": 1.8352, + "step": 9313 + }, + { + "epoch": 0.9674872753713514, + "grad_norm": 0.4774184226989746, + "learning_rate": 2.605966762199319e-07, + "loss": 1.7763, + "step": 9314 + }, + { + "epoch": 0.9675911498909318, + "grad_norm": 0.40284937620162964, + "learning_rate": 2.58935618153805e-07, + "loss": 1.6718, + "step": 9315 + }, + { + "epoch": 0.9676950244105121, + "grad_norm": 0.4363865852355957, + "learning_rate": 2.5727985712303836e-07, + "loss": 1.7835, + "step": 9316 + }, + { + "epoch": 0.9677988989300924, + "grad_norm": 0.42425060272216797, + "learning_rate": 2.5562939330396327e-07, + "loss": 1.7533, + "step": 9317 + }, + { + "epoch": 0.9679027734496728, + "grad_norm": 0.4236680567264557, + "learning_rate": 2.539842268723336e-07, + "loss": 1.5438, + "step": 9318 + }, + { + "epoch": 0.9680066479692532, + "grad_norm": 0.41327956318855286, + "learning_rate": 2.5234435800335354e-07, + "loss": 1.6705, + "step": 9319 + }, + { + "epoch": 0.9681105224888334, + "grad_norm": 0.4201132655143738, + "learning_rate": 2.507097868716501e-07, + "loss": 1.7153, + "step": 9320 + }, + { + "epoch": 0.9682143970084138, + "grad_norm": 0.45233723521232605, + "learning_rate": 2.4908051365129524e-07, + "loss": 1.716, + "step": 9321 + }, + { + "epoch": 0.9683182715279942, + "grad_norm": 0.4188010096549988, + "learning_rate": 2.4745653851579453e-07, + "loss": 1.6264, + "step": 9322 + }, + { + "epoch": 0.9684221460475745, + "grad_norm": 0.4535348415374756, + "learning_rate": 2.458378616380874e-07, + "loss": 1.8135, + "step": 9323 + }, + { + "epoch": 0.9685260205671549, + "grad_norm": 0.5156634449958801, + "learning_rate": 2.442244831905471e-07, + "loss": 1.8665, + "step": 9324 + }, + { + "epoch": 0.9686298950867352, + "grad_norm": 0.41018423438072205, + "learning_rate": 2.426164033449918e-07, + "loss": 1.6818, + "step": 9325 + }, + { + "epoch": 0.9687337696063155, + "grad_norm": 0.4227202534675598, + "learning_rate": 2.4101362227266776e-07, + "loss": 1.5506, + "step": 9326 + }, + { + "epoch": 0.9688376441258959, + "grad_norm": 0.4453946352005005, + "learning_rate": 2.394161401442552e-07, + "loss": 1.8326, + "step": 9327 + }, + { + "epoch": 0.9689415186454763, + "grad_norm": 0.43642252683639526, + "learning_rate": 2.3782395712987903e-07, + "loss": 1.8348, + "step": 9328 + }, + { + "epoch": 0.9690453931650566, + "grad_norm": 0.4805013835430145, + "learning_rate": 2.3623707339909262e-07, + "loss": 1.7733, + "step": 9329 + }, + { + "epoch": 0.9691492676846369, + "grad_norm": 0.41811105608940125, + "learning_rate": 2.3465548912088296e-07, + "loss": 1.5234, + "step": 9330 + }, + { + "epoch": 0.9692531422042173, + "grad_norm": 0.39481353759765625, + "learning_rate": 2.3307920446368203e-07, + "loss": 1.3639, + "step": 9331 + }, + { + "epoch": 0.9693570167237977, + "grad_norm": 0.43810319900512695, + "learning_rate": 2.3150821959534442e-07, + "loss": 1.6738, + "step": 9332 + }, + { + "epoch": 0.969460891243378, + "grad_norm": 0.3864077627658844, + "learning_rate": 2.299425346831807e-07, + "loss": 1.5251, + "step": 9333 + }, + { + "epoch": 0.9695647657629584, + "grad_norm": 0.42563048005104065, + "learning_rate": 2.2838214989390762e-07, + "loss": 1.6638, + "step": 9334 + }, + { + "epoch": 0.9696686402825387, + "grad_norm": 0.4011988341808319, + "learning_rate": 2.2682706539370324e-07, + "loss": 1.4951, + "step": 9335 + }, + { + "epoch": 0.969772514802119, + "grad_norm": 0.4599732458591461, + "learning_rate": 2.2527728134817406e-07, + "loss": 1.7103, + "step": 9336 + }, + { + "epoch": 0.9698763893216994, + "grad_norm": 0.3994980752468109, + "learning_rate": 2.2373279792235468e-07, + "loss": 1.6718, + "step": 9337 + }, + { + "epoch": 0.9699802638412798, + "grad_norm": 0.42097207903862, + "learning_rate": 2.2219361528071912e-07, + "loss": 1.6202, + "step": 9338 + }, + { + "epoch": 0.97008413836086, + "grad_norm": 0.4318835139274597, + "learning_rate": 2.2065973358718627e-07, + "loss": 1.5318, + "step": 9339 + }, + { + "epoch": 0.9701880128804404, + "grad_norm": 0.4295785129070282, + "learning_rate": 2.191311530050977e-07, + "loss": 1.6927, + "step": 9340 + }, + { + "epoch": 0.9702918874000208, + "grad_norm": 0.4130423367023468, + "learning_rate": 2.1760787369723423e-07, + "loss": 1.6946, + "step": 9341 + }, + { + "epoch": 0.9703957619196011, + "grad_norm": 0.43795621395111084, + "learning_rate": 2.160898958258162e-07, + "loss": 1.4909, + "step": 9342 + }, + { + "epoch": 0.9704996364391815, + "grad_norm": 0.42696115374565125, + "learning_rate": 2.14577219552492e-07, + "loss": 1.5696, + "step": 9343 + }, + { + "epoch": 0.9706035109587619, + "grad_norm": 0.4208974242210388, + "learning_rate": 2.1306984503835504e-07, + "loss": 1.5663, + "step": 9344 + }, + { + "epoch": 0.9707073854783421, + "grad_norm": 0.4519440233707428, + "learning_rate": 2.115677724439269e-07, + "loss": 1.7372, + "step": 9345 + }, + { + "epoch": 0.9708112599979225, + "grad_norm": 0.4242643415927887, + "learning_rate": 2.1007100192916296e-07, + "loss": 1.6569, + "step": 9346 + }, + { + "epoch": 0.9709151345175029, + "grad_norm": 0.41533952951431274, + "learning_rate": 2.085795336534635e-07, + "loss": 1.5835, + "step": 9347 + }, + { + "epoch": 0.9710190090370832, + "grad_norm": 0.4785119593143463, + "learning_rate": 2.0709336777565702e-07, + "loss": 1.7689, + "step": 9348 + }, + { + "epoch": 0.9711228835566635, + "grad_norm": 0.41797420382499695, + "learning_rate": 2.0561250445400583e-07, + "loss": 1.6882, + "step": 9349 + }, + { + "epoch": 0.9712267580762439, + "grad_norm": 0.4142938256263733, + "learning_rate": 2.041369438462115e-07, + "loss": 1.7222, + "step": 9350 + }, + { + "epoch": 0.9713306325958242, + "grad_norm": 0.3736826181411743, + "learning_rate": 2.0266668610940952e-07, + "loss": 1.4399, + "step": 9351 + }, + { + "epoch": 0.9714345071154046, + "grad_norm": 0.4269476532936096, + "learning_rate": 2.012017314001746e-07, + "loss": 1.4723, + "step": 9352 + }, + { + "epoch": 0.971538381634985, + "grad_norm": 0.43131187558174133, + "learning_rate": 1.9974207987450422e-07, + "loss": 1.739, + "step": 9353 + }, + { + "epoch": 0.9716422561545652, + "grad_norm": 0.4340756833553314, + "learning_rate": 1.9828773168785175e-07, + "loss": 1.7562, + "step": 9354 + }, + { + "epoch": 0.9717461306741456, + "grad_norm": 0.46568766236305237, + "learning_rate": 1.9683868699508224e-07, + "loss": 1.6564, + "step": 9355 + }, + { + "epoch": 0.971850005193726, + "grad_norm": 0.438309907913208, + "learning_rate": 1.9539494595051112e-07, + "loss": 1.6676, + "step": 9356 + }, + { + "epoch": 0.9719538797133064, + "grad_norm": 0.40006324648857117, + "learning_rate": 1.9395650870789316e-07, + "loss": 1.6582, + "step": 9357 + }, + { + "epoch": 0.9720577542328867, + "grad_norm": 0.4404867887496948, + "learning_rate": 1.9252337542040034e-07, + "loss": 1.7257, + "step": 9358 + }, + { + "epoch": 0.972161628752467, + "grad_norm": 0.4128919541835785, + "learning_rate": 1.910955462406605e-07, + "loss": 1.6339, + "step": 9359 + }, + { + "epoch": 0.9722655032720474, + "grad_norm": 0.47197067737579346, + "learning_rate": 1.8967302132071317e-07, + "loss": 1.5516, + "step": 9360 + }, + { + "epoch": 0.9723693777916277, + "grad_norm": 0.46007785201072693, + "learning_rate": 1.882558008120594e-07, + "loss": 1.6057, + "step": 9361 + }, + { + "epoch": 0.9724732523112081, + "grad_norm": 0.4223778247833252, + "learning_rate": 1.8684388486561733e-07, + "loss": 1.6723, + "step": 9362 + }, + { + "epoch": 0.9725771268307885, + "grad_norm": 0.40712058544158936, + "learning_rate": 1.854372736317389e-07, + "loss": 1.5695, + "step": 9363 + }, + { + "epoch": 0.9726810013503687, + "grad_norm": 0.4485786557197571, + "learning_rate": 1.8403596726022653e-07, + "loss": 1.4115, + "step": 9364 + }, + { + "epoch": 0.9727848758699491, + "grad_norm": 0.44639256596565247, + "learning_rate": 1.8263996590029976e-07, + "loss": 1.6466, + "step": 9365 + }, + { + "epoch": 0.9728887503895295, + "grad_norm": 0.4222765862941742, + "learning_rate": 1.812492697006285e-07, + "loss": 1.5191, + "step": 9366 + }, + { + "epoch": 0.9729926249091098, + "grad_norm": 0.4526502192020416, + "learning_rate": 1.79863878809311e-07, + "loss": 1.8303, + "step": 9367 + }, + { + "epoch": 0.9730964994286901, + "grad_norm": 0.3991301655769348, + "learning_rate": 1.7848379337387922e-07, + "loss": 1.6194, + "step": 9368 + }, + { + "epoch": 0.9732003739482705, + "grad_norm": 0.43972858786582947, + "learning_rate": 1.7710901354129894e-07, + "loss": 1.6873, + "step": 9369 + }, + { + "epoch": 0.9733042484678508, + "grad_norm": 0.42031508684158325, + "learning_rate": 1.7573953945796973e-07, + "loss": 1.7127, + "step": 9370 + }, + { + "epoch": 0.9734081229874312, + "grad_norm": 0.40002334117889404, + "learning_rate": 1.7437537126974158e-07, + "loss": 1.4936, + "step": 9371 + }, + { + "epoch": 0.9735119975070116, + "grad_norm": 0.4212174713611603, + "learning_rate": 1.7301650912188162e-07, + "loss": 1.7144, + "step": 9372 + }, + { + "epoch": 0.9736158720265918, + "grad_norm": 0.43451911211013794, + "learning_rate": 1.716629531590963e-07, + "loss": 1.7131, + "step": 9373 + }, + { + "epoch": 0.9737197465461722, + "grad_norm": 0.40921545028686523, + "learning_rate": 1.7031470352553146e-07, + "loss": 1.4595, + "step": 9374 + }, + { + "epoch": 0.9738236210657526, + "grad_norm": 0.41354814171791077, + "learning_rate": 1.6897176036476115e-07, + "loss": 1.6428, + "step": 9375 + }, + { + "epoch": 0.9739274955853329, + "grad_norm": 0.4084813892841339, + "learning_rate": 1.6763412381980425e-07, + "loss": 1.6586, + "step": 9376 + }, + { + "epoch": 0.9740313701049133, + "grad_norm": 0.5016772747039795, + "learning_rate": 1.6630179403310797e-07, + "loss": 1.7705, + "step": 9377 + }, + { + "epoch": 0.9741352446244936, + "grad_norm": 0.4082990288734436, + "learning_rate": 1.6497477114654768e-07, + "loss": 1.604, + "step": 9378 + }, + { + "epoch": 0.9742391191440739, + "grad_norm": 0.4223262369632721, + "learning_rate": 1.6365305530144926e-07, + "loss": 1.5545, + "step": 9379 + }, + { + "epoch": 0.9743429936636543, + "grad_norm": 0.45764049887657166, + "learning_rate": 1.623366466385612e-07, + "loss": 1.6171, + "step": 9380 + }, + { + "epoch": 0.9744468681832347, + "grad_norm": 0.4205342233181, + "learning_rate": 1.6102554529807134e-07, + "loss": 1.7076, + "step": 9381 + }, + { + "epoch": 0.974550742702815, + "grad_norm": 0.4099922478199005, + "learning_rate": 1.5971975141960138e-07, + "loss": 1.6506, + "step": 9382 + }, + { + "epoch": 0.9746546172223953, + "grad_norm": 0.43783533573150635, + "learning_rate": 1.5841926514220674e-07, + "loss": 1.526, + "step": 9383 + }, + { + "epoch": 0.9747584917419757, + "grad_norm": 0.4745118021965027, + "learning_rate": 1.571240866043877e-07, + "loss": 1.7779, + "step": 9384 + }, + { + "epoch": 0.9748623662615561, + "grad_norm": 0.4208376109600067, + "learning_rate": 1.5583421594405623e-07, + "loss": 1.7611, + "step": 9385 + }, + { + "epoch": 0.9749662407811364, + "grad_norm": 0.41122379899024963, + "learning_rate": 1.5454965329858573e-07, + "loss": 1.5896, + "step": 9386 + }, + { + "epoch": 0.9750701153007167, + "grad_norm": 0.3928976058959961, + "learning_rate": 1.5327039880477234e-07, + "loss": 1.59, + "step": 9387 + }, + { + "epoch": 0.9751739898202971, + "grad_norm": 0.4342755675315857, + "learning_rate": 1.5199645259883488e-07, + "loss": 1.6747, + "step": 9388 + }, + { + "epoch": 0.9752778643398774, + "grad_norm": 0.39169561862945557, + "learning_rate": 1.5072781481644816e-07, + "loss": 1.5841, + "step": 9389 + }, + { + "epoch": 0.9753817388594578, + "grad_norm": 0.43431976437568665, + "learning_rate": 1.4946448559270964e-07, + "loss": 1.661, + "step": 9390 + }, + { + "epoch": 0.9754856133790382, + "grad_norm": 0.4187610149383545, + "learning_rate": 1.4820646506215064e-07, + "loss": 1.5102, + "step": 9391 + }, + { + "epoch": 0.9755894878986184, + "grad_norm": 0.415600448846817, + "learning_rate": 1.4695375335875282e-07, + "loss": 1.4286, + "step": 9392 + }, + { + "epoch": 0.9756933624181988, + "grad_norm": 0.4058886468410492, + "learning_rate": 1.45706350615904e-07, + "loss": 1.6924, + "step": 9393 + }, + { + "epoch": 0.9757972369377792, + "grad_norm": 0.41150805354118347, + "learning_rate": 1.444642569664534e-07, + "loss": 1.5943, + "step": 9394 + }, + { + "epoch": 0.9759011114573595, + "grad_norm": 0.39166486263275146, + "learning_rate": 1.43227472542673e-07, + "loss": 1.628, + "step": 9395 + }, + { + "epoch": 0.9760049859769399, + "grad_norm": 0.41981199383735657, + "learning_rate": 1.4199599747626303e-07, + "loss": 1.4475, + "step": 9396 + }, + { + "epoch": 0.9761088604965202, + "grad_norm": 0.4433886408805847, + "learning_rate": 1.4076983189837967e-07, + "loss": 1.6782, + "step": 9397 + }, + { + "epoch": 0.9762127350161005, + "grad_norm": 0.4331313967704773, + "learning_rate": 1.3954897593958516e-07, + "loss": 1.5824, + "step": 9398 + }, + { + "epoch": 0.9763166095356809, + "grad_norm": 0.4032611846923828, + "learning_rate": 1.3833342972990327e-07, + "loss": 1.5385, + "step": 9399 + }, + { + "epoch": 0.9764204840552613, + "grad_norm": 0.4217138886451721, + "learning_rate": 1.371231933987749e-07, + "loss": 1.7366, + "step": 9400 + }, + { + "epoch": 0.9765243585748415, + "grad_norm": 0.3987552225589752, + "learning_rate": 1.3591826707508026e-07, + "loss": 1.5265, + "step": 9401 + }, + { + "epoch": 0.9766282330944219, + "grad_norm": 0.4729466140270233, + "learning_rate": 1.3471865088713342e-07, + "loss": 1.7914, + "step": 9402 + }, + { + "epoch": 0.9767321076140023, + "grad_norm": 0.4094342589378357, + "learning_rate": 1.3352434496268773e-07, + "loss": 1.6576, + "step": 9403 + }, + { + "epoch": 0.9768359821335826, + "grad_norm": 0.46844589710235596, + "learning_rate": 1.3233534942892477e-07, + "loss": 1.7209, + "step": 9404 + }, + { + "epoch": 0.976939856653163, + "grad_norm": 0.4402284026145935, + "learning_rate": 1.311516644124655e-07, + "loss": 1.6706, + "step": 9405 + }, + { + "epoch": 0.9770437311727433, + "grad_norm": 0.4201194941997528, + "learning_rate": 1.299732900393591e-07, + "loss": 1.598, + "step": 9406 + }, + { + "epoch": 0.9771476056923236, + "grad_norm": 0.41740289330482483, + "learning_rate": 1.2880022643509958e-07, + "loss": 1.6247, + "step": 9407 + }, + { + "epoch": 0.977251480211904, + "grad_norm": 0.4268152117729187, + "learning_rate": 1.2763247372460374e-07, + "loss": 1.6006, + "step": 9408 + }, + { + "epoch": 0.9773553547314844, + "grad_norm": 0.43314602971076965, + "learning_rate": 1.2647003203223318e-07, + "loss": 1.6009, + "step": 9409 + }, + { + "epoch": 0.9774592292510648, + "grad_norm": 0.42732593417167664, + "learning_rate": 1.2531290148177221e-07, + "loss": 1.6934, + "step": 9410 + }, + { + "epoch": 0.977563103770645, + "grad_norm": 0.4502175748348236, + "learning_rate": 1.2416108219645005e-07, + "loss": 1.7892, + "step": 9411 + }, + { + "epoch": 0.9776669782902254, + "grad_norm": 0.45358970761299133, + "learning_rate": 1.2301457429892972e-07, + "loss": 1.5935, + "step": 9412 + }, + { + "epoch": 0.9777708528098058, + "grad_norm": 0.43146711587905884, + "learning_rate": 1.2187337791129682e-07, + "loss": 1.457, + "step": 9413 + }, + { + "epoch": 0.9778747273293861, + "grad_norm": 0.44954973459243774, + "learning_rate": 1.2073749315508754e-07, + "loss": 1.6821, + "step": 9414 + }, + { + "epoch": 0.9779786018489665, + "grad_norm": 0.4219938814640045, + "learning_rate": 1.1960692015126062e-07, + "loss": 1.7259, + "step": 9415 + }, + { + "epoch": 0.9780824763685468, + "grad_norm": 0.41503259539604187, + "learning_rate": 1.1848165902021424e-07, + "loss": 1.4451, + "step": 9416 + }, + { + "epoch": 0.9781863508881271, + "grad_norm": 0.4230490028858185, + "learning_rate": 1.1736170988178585e-07, + "loss": 1.6452, + "step": 9417 + }, + { + "epoch": 0.9782902254077075, + "grad_norm": 0.44973745942115784, + "learning_rate": 1.1624707285523006e-07, + "loss": 1.811, + "step": 9418 + }, + { + "epoch": 0.9783940999272879, + "grad_norm": 0.4050397574901581, + "learning_rate": 1.1513774805925748e-07, + "loss": 1.4962, + "step": 9419 + }, + { + "epoch": 0.9784979744468681, + "grad_norm": 0.3766053020954132, + "learning_rate": 1.1403373561199582e-07, + "loss": 1.4053, + "step": 9420 + }, + { + "epoch": 0.9786018489664485, + "grad_norm": 0.41133883595466614, + "learning_rate": 1.1293503563101216e-07, + "loss": 1.6545, + "step": 9421 + }, + { + "epoch": 0.9787057234860289, + "grad_norm": 0.44046151638031006, + "learning_rate": 1.1184164823331844e-07, + "loss": 1.6459, + "step": 9422 + }, + { + "epoch": 0.9788095980056092, + "grad_norm": 0.44645535945892334, + "learning_rate": 1.1075357353534377e-07, + "loss": 1.5921, + "step": 9423 + }, + { + "epoch": 0.9789134725251896, + "grad_norm": 0.44139453768730164, + "learning_rate": 1.0967081165296211e-07, + "loss": 1.8106, + "step": 9424 + }, + { + "epoch": 0.97901734704477, + "grad_norm": 0.39633703231811523, + "learning_rate": 1.0859336270148124e-07, + "loss": 1.6446, + "step": 9425 + }, + { + "epoch": 0.9791212215643502, + "grad_norm": 0.4287833273410797, + "learning_rate": 1.0752122679563714e-07, + "loss": 1.6376, + "step": 9426 + }, + { + "epoch": 0.9792250960839306, + "grad_norm": 0.42582541704177856, + "learning_rate": 1.0645440404960517e-07, + "loss": 1.6317, + "step": 9427 + }, + { + "epoch": 0.979328970603511, + "grad_norm": 0.47708913683891296, + "learning_rate": 1.0539289457699442e-07, + "loss": 1.918, + "step": 9428 + }, + { + "epoch": 0.9794328451230913, + "grad_norm": 0.38468772172927856, + "learning_rate": 1.0433669849085336e-07, + "loss": 1.5551, + "step": 9429 + }, + { + "epoch": 0.9795367196426716, + "grad_norm": 0.4491479694843292, + "learning_rate": 1.032858159036476e-07, + "loss": 1.7375, + "step": 9430 + }, + { + "epoch": 0.979640594162252, + "grad_norm": 0.43696144223213196, + "learning_rate": 1.0224024692729317e-07, + "loss": 1.5334, + "step": 9431 + }, + { + "epoch": 0.9797444686818323, + "grad_norm": 0.38262155652046204, + "learning_rate": 1.0119999167313988e-07, + "loss": 1.4757, + "step": 9432 + }, + { + "epoch": 0.9798483432014127, + "grad_norm": 0.43123918771743774, + "learning_rate": 1.0016505025195467e-07, + "loss": 1.5889, + "step": 9433 + }, + { + "epoch": 0.9799522177209931, + "grad_norm": 0.44049620628356934, + "learning_rate": 9.913542277396604e-08, + "loss": 1.654, + "step": 9434 + }, + { + "epoch": 0.9800560922405734, + "grad_norm": 0.3811212182044983, + "learning_rate": 9.811110934880296e-08, + "loss": 1.4344, + "step": 9435 + }, + { + "epoch": 0.9801599667601537, + "grad_norm": 0.4550376534461975, + "learning_rate": 9.709211008556707e-08, + "loss": 1.8921, + "step": 9436 + }, + { + "epoch": 0.9802638412797341, + "grad_norm": 0.41628962755203247, + "learning_rate": 9.607842509275489e-08, + "loss": 1.6439, + "step": 9437 + }, + { + "epoch": 0.9803677157993145, + "grad_norm": 0.4357268810272217, + "learning_rate": 9.507005447833007e-08, + "loss": 1.8815, + "step": 9438 + }, + { + "epoch": 0.9804715903188947, + "grad_norm": 0.38655275106430054, + "learning_rate": 9.406699834967337e-08, + "loss": 1.4922, + "step": 9439 + }, + { + "epoch": 0.9805754648384751, + "grad_norm": 0.3913620710372925, + "learning_rate": 9.306925681359935e-08, + "loss": 1.6177, + "step": 9440 + }, + { + "epoch": 0.9806793393580555, + "grad_norm": 0.44508156180381775, + "learning_rate": 9.207682997635636e-08, + "loss": 1.6836, + "step": 9441 + }, + { + "epoch": 0.9807832138776358, + "grad_norm": 0.41767412424087524, + "learning_rate": 9.108971794363763e-08, + "loss": 1.6061, + "step": 9442 + }, + { + "epoch": 0.9808870883972162, + "grad_norm": 0.40216001868247986, + "learning_rate": 9.010792082055908e-08, + "loss": 1.5034, + "step": 9443 + }, + { + "epoch": 0.9809909629167965, + "grad_norm": 0.39269882440567017, + "learning_rate": 8.91314387116704e-08, + "loss": 1.5912, + "step": 9444 + }, + { + "epoch": 0.9810948374363768, + "grad_norm": 0.41241204738616943, + "learning_rate": 8.81602717209662e-08, + "loss": 1.6321, + "step": 9445 + }, + { + "epoch": 0.9811987119559572, + "grad_norm": 0.4276941120624542, + "learning_rate": 8.719441995186928e-08, + "loss": 1.6572, + "step": 9446 + }, + { + "epoch": 0.9813025864755376, + "grad_norm": 0.5014160871505737, + "learning_rate": 8.623388350722517e-08, + "loss": 1.6643, + "step": 9447 + }, + { + "epoch": 0.9814064609951179, + "grad_norm": 0.44852280616760254, + "learning_rate": 8.527866248933536e-08, + "loss": 1.4247, + "step": 9448 + }, + { + "epoch": 0.9815103355146982, + "grad_norm": 0.4082983136177063, + "learning_rate": 8.432875699991849e-08, + "loss": 1.4757, + "step": 9449 + }, + { + "epoch": 0.9816142100342786, + "grad_norm": 0.458987295627594, + "learning_rate": 8.338416714013253e-08, + "loss": 1.6738, + "step": 9450 + }, + { + "epoch": 0.9817180845538589, + "grad_norm": 0.42182016372680664, + "learning_rate": 8.244489301056368e-08, + "loss": 1.5149, + "step": 9451 + }, + { + "epoch": 0.9818219590734393, + "grad_norm": 0.4483654201030731, + "learning_rate": 8.151093471124305e-08, + "loss": 1.6678, + "step": 9452 + }, + { + "epoch": 0.9819258335930197, + "grad_norm": 0.4117814302444458, + "learning_rate": 8.058229234162995e-08, + "loss": 1.5014, + "step": 9453 + }, + { + "epoch": 0.9820297081125999, + "grad_norm": 0.43942439556121826, + "learning_rate": 7.965896600061751e-08, + "loss": 1.6016, + "step": 9454 + }, + { + "epoch": 0.9821335826321803, + "grad_norm": 0.4229027032852173, + "learning_rate": 7.874095578653263e-08, + "loss": 1.6684, + "step": 9455 + }, + { + "epoch": 0.9822374571517607, + "grad_norm": 0.38582631945610046, + "learning_rate": 7.782826179713043e-08, + "loss": 1.5004, + "step": 9456 + }, + { + "epoch": 0.982341331671341, + "grad_norm": 0.5087673664093018, + "learning_rate": 7.692088412961096e-08, + "loss": 1.8254, + "step": 9457 + }, + { + "epoch": 0.9824452061909213, + "grad_norm": 0.4083236753940582, + "learning_rate": 7.601882288060247e-08, + "loss": 1.577, + "step": 9458 + }, + { + "epoch": 0.9825490807105017, + "grad_norm": 0.42479801177978516, + "learning_rate": 7.512207814617256e-08, + "loss": 1.6106, + "step": 9459 + }, + { + "epoch": 0.9826529552300821, + "grad_norm": 0.4226894676685333, + "learning_rate": 7.423065002180596e-08, + "loss": 1.6185, + "step": 9460 + }, + { + "epoch": 0.9827568297496624, + "grad_norm": 0.42085468769073486, + "learning_rate": 7.334453860244339e-08, + "loss": 1.5966, + "step": 9461 + }, + { + "epoch": 0.9828607042692428, + "grad_norm": 0.4113049805164337, + "learning_rate": 7.24637439824427e-08, + "loss": 1.5905, + "step": 9462 + }, + { + "epoch": 0.9829645787888232, + "grad_norm": 0.406465083360672, + "learning_rate": 7.158826625560112e-08, + "loss": 1.6842, + "step": 9463 + }, + { + "epoch": 0.9830684533084034, + "grad_norm": 0.4487413167953491, + "learning_rate": 7.071810551516067e-08, + "loss": 1.6072, + "step": 9464 + }, + { + "epoch": 0.9831723278279838, + "grad_norm": 0.4292948544025421, + "learning_rate": 6.98532618537695e-08, + "loss": 1.6148, + "step": 9465 + }, + { + "epoch": 0.9832762023475642, + "grad_norm": 0.4271443784236908, + "learning_rate": 6.899373536354282e-08, + "loss": 1.693, + "step": 9466 + }, + { + "epoch": 0.9833800768671445, + "grad_norm": 0.4509519636631012, + "learning_rate": 6.813952613600183e-08, + "loss": 1.896, + "step": 9467 + }, + { + "epoch": 0.9834839513867248, + "grad_norm": 0.4309237599372864, + "learning_rate": 6.729063426211824e-08, + "loss": 1.6331, + "step": 9468 + }, + { + "epoch": 0.9835878259063052, + "grad_norm": 0.42812246084213257, + "learning_rate": 6.644705983229194e-08, + "loss": 1.6337, + "step": 9469 + }, + { + "epoch": 0.9836917004258855, + "grad_norm": 0.43282395601272583, + "learning_rate": 6.560880293636218e-08, + "loss": 1.7527, + "step": 9470 + }, + { + "epoch": 0.9837955749454659, + "grad_norm": 0.40227630734443665, + "learning_rate": 6.477586366358535e-08, + "loss": 1.6945, + "step": 9471 + }, + { + "epoch": 0.9838994494650463, + "grad_norm": 0.4157799780368805, + "learning_rate": 6.394824210266826e-08, + "loss": 1.4567, + "step": 9472 + }, + { + "epoch": 0.9840033239846265, + "grad_norm": 0.39977502822875977, + "learning_rate": 6.312593834175152e-08, + "loss": 1.5229, + "step": 9473 + }, + { + "epoch": 0.9841071985042069, + "grad_norm": 0.42183637619018555, + "learning_rate": 6.230895246840396e-08, + "loss": 1.5506, + "step": 9474 + }, + { + "epoch": 0.9842110730237873, + "grad_norm": 0.4323163628578186, + "learning_rate": 6.149728456961712e-08, + "loss": 1.6544, + "step": 9475 + }, + { + "epoch": 0.9843149475433676, + "grad_norm": 0.5097078680992126, + "learning_rate": 6.069093473183296e-08, + "loss": 1.7364, + "step": 9476 + }, + { + "epoch": 0.984418822062948, + "grad_norm": 0.4150666296482086, + "learning_rate": 5.988990304092724e-08, + "loss": 1.5175, + "step": 9477 + }, + { + "epoch": 0.9845226965825283, + "grad_norm": 0.4314131736755371, + "learning_rate": 5.9094189582198366e-08, + "loss": 1.6242, + "step": 9478 + }, + { + "epoch": 0.9846265711021086, + "grad_norm": 0.4086059033870697, + "learning_rate": 5.830379444038414e-08, + "loss": 1.6943, + "step": 9479 + }, + { + "epoch": 0.984730445621689, + "grad_norm": 0.4106563329696655, + "learning_rate": 5.751871769965056e-08, + "loss": 1.5377, + "step": 9480 + }, + { + "epoch": 0.9848343201412694, + "grad_norm": 0.4505181610584259, + "learning_rate": 5.673895944361407e-08, + "loss": 1.7265, + "step": 9481 + }, + { + "epoch": 0.9849381946608496, + "grad_norm": 0.41251927614212036, + "learning_rate": 5.59645197553027e-08, + "loss": 1.6865, + "step": 9482 + }, + { + "epoch": 0.98504206918043, + "grad_norm": 0.42645978927612305, + "learning_rate": 5.5195398717194926e-08, + "loss": 1.8202, + "step": 9483 + }, + { + "epoch": 0.9851459437000104, + "grad_norm": 0.4157099723815918, + "learning_rate": 5.443159641118634e-08, + "loss": 1.6619, + "step": 9484 + }, + { + "epoch": 0.9852498182195908, + "grad_norm": 0.42381036281585693, + "learning_rate": 5.367311291862853e-08, + "loss": 1.7641, + "step": 9485 + }, + { + "epoch": 0.9853536927391711, + "grad_norm": 0.4490167796611786, + "learning_rate": 5.291994832028469e-08, + "loss": 1.5296, + "step": 9486 + }, + { + "epoch": 0.9854575672587514, + "grad_norm": 0.4082736074924469, + "learning_rate": 5.217210269636286e-08, + "loss": 1.4804, + "step": 9487 + }, + { + "epoch": 0.9855614417783318, + "grad_norm": 0.41350072622299194, + "learning_rate": 5.1429576126504895e-08, + "loss": 1.5122, + "step": 9488 + }, + { + "epoch": 0.9856653162979121, + "grad_norm": 0.4074830412864685, + "learning_rate": 5.069236868978644e-08, + "loss": 1.5971, + "step": 9489 + }, + { + "epoch": 0.9857691908174925, + "grad_norm": 0.4387468099594116, + "learning_rate": 4.996048046471136e-08, + "loss": 1.6514, + "step": 9490 + }, + { + "epoch": 0.9858730653370729, + "grad_norm": 0.4530046582221985, + "learning_rate": 4.923391152921175e-08, + "loss": 1.6722, + "step": 9491 + }, + { + "epoch": 0.9859769398566531, + "grad_norm": 0.4661511778831482, + "learning_rate": 4.851266196068127e-08, + "loss": 1.6484, + "step": 9492 + }, + { + "epoch": 0.9860808143762335, + "grad_norm": 0.42836901545524597, + "learning_rate": 4.779673183590849e-08, + "loss": 1.8121, + "step": 9493 + }, + { + "epoch": 0.9861846888958139, + "grad_norm": 0.538507878780365, + "learning_rate": 4.708612123114353e-08, + "loss": 1.7639, + "step": 9494 + }, + { + "epoch": 0.9862885634153942, + "grad_norm": 0.4897894561290741, + "learning_rate": 4.638083022205364e-08, + "loss": 1.7797, + "step": 9495 + }, + { + "epoch": 0.9863924379349746, + "grad_norm": 0.4547121524810791, + "learning_rate": 4.56808588837565e-08, + "loss": 1.6323, + "step": 9496 + }, + { + "epoch": 0.9864963124545549, + "grad_norm": 0.4277886748313904, + "learning_rate": 4.4986207290792505e-08, + "loss": 1.7339, + "step": 9497 + }, + { + "epoch": 0.9866001869741352, + "grad_norm": 0.4299750328063965, + "learning_rate": 4.4296875517130243e-08, + "loss": 1.6974, + "step": 9498 + }, + { + "epoch": 0.9867040614937156, + "grad_norm": 0.4436101019382477, + "learning_rate": 4.361286363618322e-08, + "loss": 1.5564, + "step": 9499 + }, + { + "epoch": 0.986807936013296, + "grad_norm": 0.42672184109687805, + "learning_rate": 4.293417172078762e-08, + "loss": 1.5828, + "step": 9500 + }, + { + "epoch": 0.9869118105328762, + "grad_norm": 0.4650036096572876, + "learning_rate": 4.226079984322451e-08, + "loss": 1.5301, + "step": 9501 + }, + { + "epoch": 0.9870156850524566, + "grad_norm": 0.4062730371952057, + "learning_rate": 4.159274807520319e-08, + "loss": 1.6214, + "step": 9502 + }, + { + "epoch": 0.987119559572037, + "grad_norm": 0.4749537706375122, + "learning_rate": 4.0930016487861214e-08, + "loss": 1.7306, + "step": 9503 + }, + { + "epoch": 0.9872234340916173, + "grad_norm": 0.4257502555847168, + "learning_rate": 4.027260515177544e-08, + "loss": 1.8203, + "step": 9504 + }, + { + "epoch": 0.9873273086111977, + "grad_norm": 0.3824896514415741, + "learning_rate": 3.962051413695656e-08, + "loss": 1.5731, + "step": 9505 + }, + { + "epoch": 0.987431183130778, + "grad_norm": 0.41676878929138184, + "learning_rate": 3.897374351284899e-08, + "loss": 1.7339, + "step": 9506 + }, + { + "epoch": 0.9875350576503583, + "grad_norm": 0.4813458323478699, + "learning_rate": 3.8332293348325444e-08, + "loss": 1.993, + "step": 9507 + }, + { + "epoch": 0.9876389321699387, + "grad_norm": 0.47463130950927734, + "learning_rate": 3.769616371169793e-08, + "loss": 1.708, + "step": 9508 + }, + { + "epoch": 0.9877428066895191, + "grad_norm": 0.4519473612308502, + "learning_rate": 3.706535467070671e-08, + "loss": 1.9389, + "step": 9509 + }, + { + "epoch": 0.9878466812090994, + "grad_norm": 0.4201395809650421, + "learning_rate": 3.6439866292531376e-08, + "loss": 1.6609, + "step": 9510 + }, + { + "epoch": 0.9879505557286797, + "grad_norm": 0.4545467495918274, + "learning_rate": 3.5819698643779764e-08, + "loss": 1.7568, + "step": 9511 + }, + { + "epoch": 0.9880544302482601, + "grad_norm": 0.39346179366111755, + "learning_rate": 3.520485179048794e-08, + "loss": 1.6101, + "step": 9512 + }, + { + "epoch": 0.9881583047678405, + "grad_norm": 0.4246424436569214, + "learning_rate": 3.459532579814795e-08, + "loss": 1.7143, + "step": 9513 + }, + { + "epoch": 0.9882621792874208, + "grad_norm": 0.4520607590675354, + "learning_rate": 3.399112073165789e-08, + "loss": 1.5954, + "step": 9514 + }, + { + "epoch": 0.9883660538070012, + "grad_norm": 0.4503570795059204, + "learning_rate": 3.339223665536073e-08, + "loss": 1.8197, + "step": 9515 + }, + { + "epoch": 0.9884699283265815, + "grad_norm": 0.42323458194732666, + "learning_rate": 3.279867363303879e-08, + "loss": 1.6981, + "step": 9516 + }, + { + "epoch": 0.9885738028461618, + "grad_norm": 0.44448158144950867, + "learning_rate": 3.221043172789706e-08, + "loss": 1.5851, + "step": 9517 + }, + { + "epoch": 0.9886776773657422, + "grad_norm": 0.41508886218070984, + "learning_rate": 3.162751100257988e-08, + "loss": 1.6064, + "step": 9518 + }, + { + "epoch": 0.9887815518853226, + "grad_norm": 0.4167642891407013, + "learning_rate": 3.1049911519165365e-08, + "loss": 1.6993, + "step": 9519 + }, + { + "epoch": 0.9888854264049028, + "grad_norm": 0.4645927846431732, + "learning_rate": 3.047763333916542e-08, + "loss": 1.6729, + "step": 9520 + }, + { + "epoch": 0.9889893009244832, + "grad_norm": 0.42605888843536377, + "learning_rate": 2.991067652351465e-08, + "loss": 1.6401, + "step": 9521 + }, + { + "epoch": 0.9890931754440636, + "grad_norm": 0.3953196108341217, + "learning_rate": 2.9349041132598065e-08, + "loss": 1.6012, + "step": 9522 + }, + { + "epoch": 0.9891970499636439, + "grad_norm": 0.41111239790916443, + "learning_rate": 2.8792727226223393e-08, + "loss": 1.5285, + "step": 9523 + }, + { + "epoch": 0.9893009244832243, + "grad_norm": 0.4029116928577423, + "learning_rate": 2.824173486363213e-08, + "loss": 1.7036, + "step": 9524 + }, + { + "epoch": 0.9894047990028046, + "grad_norm": 0.44695624709129333, + "learning_rate": 2.7696064103505116e-08, + "loss": 1.7136, + "step": 9525 + }, + { + "epoch": 0.9895086735223849, + "grad_norm": 0.4348202347755432, + "learning_rate": 2.715571500394587e-08, + "loss": 1.6265, + "step": 9526 + }, + { + "epoch": 0.9896125480419653, + "grad_norm": 0.43105223774909973, + "learning_rate": 2.6620687622497255e-08, + "loss": 1.6818, + "step": 9527 + }, + { + "epoch": 0.9897164225615457, + "grad_norm": 0.4228067398071289, + "learning_rate": 2.609098201613591e-08, + "loss": 1.6076, + "step": 9528 + }, + { + "epoch": 0.989820297081126, + "grad_norm": 0.4084494113922119, + "learning_rate": 2.556659824127783e-08, + "loss": 1.6429, + "step": 9529 + }, + { + "epoch": 0.9899241716007063, + "grad_norm": 0.4497421979904175, + "learning_rate": 2.504753635376167e-08, + "loss": 1.5681, + "step": 9530 + }, + { + "epoch": 0.9900280461202867, + "grad_norm": 0.4513741731643677, + "learning_rate": 2.453379640886544e-08, + "loss": 1.6556, + "step": 9531 + }, + { + "epoch": 0.990131920639867, + "grad_norm": 0.38901105523109436, + "learning_rate": 2.4025378461289828e-08, + "loss": 1.502, + "step": 9532 + }, + { + "epoch": 0.9902357951594474, + "grad_norm": 0.4121716022491455, + "learning_rate": 2.352228256519151e-08, + "loss": 1.6831, + "step": 9533 + }, + { + "epoch": 0.9903396696790278, + "grad_norm": 0.4378368854522705, + "learning_rate": 2.3024508774133204e-08, + "loss": 1.6327, + "step": 9534 + }, + { + "epoch": 0.990443544198608, + "grad_norm": 0.39573803544044495, + "learning_rate": 2.2532057141128048e-08, + "loss": 1.5492, + "step": 9535 + }, + { + "epoch": 0.9905474187181884, + "grad_norm": 0.4019763469696045, + "learning_rate": 2.2044927718622987e-08, + "loss": 1.5991, + "step": 9536 + }, + { + "epoch": 0.9906512932377688, + "grad_norm": 0.40174970030784607, + "learning_rate": 2.1563120558487638e-08, + "loss": 1.5447, + "step": 9537 + }, + { + "epoch": 0.9907551677573492, + "grad_norm": 0.45444202423095703, + "learning_rate": 2.1086635712036507e-08, + "loss": 1.6873, + "step": 9538 + }, + { + "epoch": 0.9908590422769294, + "grad_norm": 0.42663663625717163, + "learning_rate": 2.061547323000679e-08, + "loss": 1.7317, + "step": 9539 + }, + { + "epoch": 0.9909629167965098, + "grad_norm": 0.40359655022621155, + "learning_rate": 2.014963316257501e-08, + "loss": 1.5926, + "step": 9540 + }, + { + "epoch": 0.9910667913160902, + "grad_norm": 0.40575510263442993, + "learning_rate": 1.9689115559345938e-08, + "loss": 1.5653, + "step": 9541 + }, + { + "epoch": 0.9911706658356705, + "grad_norm": 0.4161994457244873, + "learning_rate": 1.9233920469369225e-08, + "loss": 1.5923, + "step": 9542 + }, + { + "epoch": 0.9912745403552509, + "grad_norm": 0.4296528697013855, + "learning_rate": 1.878404794111166e-08, + "loss": 1.7091, + "step": 9543 + }, + { + "epoch": 0.9913784148748312, + "grad_norm": 0.4677174985408783, + "learning_rate": 1.8339498022490465e-08, + "loss": 1.7469, + "step": 9544 + }, + { + "epoch": 0.9914822893944115, + "grad_norm": 0.43237027525901794, + "learning_rate": 1.790027076083445e-08, + "loss": 1.6314, + "step": 9545 + }, + { + "epoch": 0.9915861639139919, + "grad_norm": 0.4431440830230713, + "learning_rate": 1.7466366202928408e-08, + "loss": 1.6766, + "step": 9546 + }, + { + "epoch": 0.9916900384335723, + "grad_norm": 0.43120649456977844, + "learning_rate": 1.7037784394968724e-08, + "loss": 1.6412, + "step": 9547 + }, + { + "epoch": 0.9917939129531526, + "grad_norm": 0.3956824541091919, + "learning_rate": 1.6614525382613323e-08, + "loss": 1.5854, + "step": 9548 + }, + { + "epoch": 0.9918977874727329, + "grad_norm": 0.41620564460754395, + "learning_rate": 1.6196589210915046e-08, + "loss": 1.5208, + "step": 9549 + }, + { + "epoch": 0.9920016619923133, + "grad_norm": 0.46985071897506714, + "learning_rate": 1.5783975924399398e-08, + "loss": 1.8193, + "step": 9550 + }, + { + "epoch": 0.9921055365118936, + "grad_norm": 0.44369015097618103, + "learning_rate": 1.5376685566992345e-08, + "loss": 1.5834, + "step": 9551 + }, + { + "epoch": 0.992209411031474, + "grad_norm": 0.40749645233154297, + "learning_rate": 1.4974718182075853e-08, + "loss": 1.5352, + "step": 9552 + }, + { + "epoch": 0.9923132855510544, + "grad_norm": 0.40627145767211914, + "learning_rate": 1.4578073812454574e-08, + "loss": 1.6593, + "step": 9553 + }, + { + "epoch": 0.9924171600706346, + "grad_norm": 0.4516492784023285, + "learning_rate": 1.418675250036694e-08, + "loss": 1.7582, + "step": 9554 + }, + { + "epoch": 0.992521034590215, + "grad_norm": 0.44202539324760437, + "learning_rate": 1.3800754287485174e-08, + "loss": 1.6358, + "step": 9555 + }, + { + "epoch": 0.9926249091097954, + "grad_norm": 0.4185415208339691, + "learning_rate": 1.3420079214915282e-08, + "loss": 1.5889, + "step": 9556 + }, + { + "epoch": 0.9927287836293757, + "grad_norm": 0.41641661524772644, + "learning_rate": 1.3044727323202611e-08, + "loss": 1.6185, + "step": 9557 + }, + { + "epoch": 0.992832658148956, + "grad_norm": 0.4016210436820984, + "learning_rate": 1.2674698652304085e-08, + "loss": 1.6872, + "step": 9558 + }, + { + "epoch": 0.9929365326685364, + "grad_norm": 0.4109307825565338, + "learning_rate": 1.2309993241638173e-08, + "loss": 1.7338, + "step": 9559 + }, + { + "epoch": 0.9930404071881167, + "grad_norm": 0.4159395098686218, + "learning_rate": 1.1950611130040479e-08, + "loss": 1.8372, + "step": 9560 + }, + { + "epoch": 0.9931442817076971, + "grad_norm": 0.37879422307014465, + "learning_rate": 1.1596552355780388e-08, + "loss": 1.5309, + "step": 9561 + }, + { + "epoch": 0.9932481562272775, + "grad_norm": 0.46690380573272705, + "learning_rate": 1.1247816956561075e-08, + "loss": 1.6077, + "step": 9562 + }, + { + "epoch": 0.9933520307468579, + "grad_norm": 0.444859117269516, + "learning_rate": 1.0904404969525051e-08, + "loss": 1.6831, + "step": 9563 + }, + { + "epoch": 0.9934559052664381, + "grad_norm": 0.46033644676208496, + "learning_rate": 1.0566316431237511e-08, + "loss": 1.7443, + "step": 9564 + }, + { + "epoch": 0.9935597797860185, + "grad_norm": 0.44090694189071655, + "learning_rate": 1.0233551377702988e-08, + "loss": 1.7238, + "step": 9565 + }, + { + "epoch": 0.9936636543055989, + "grad_norm": 0.4091147780418396, + "learning_rate": 9.906109844359802e-09, + "loss": 1.6286, + "step": 9566 + }, + { + "epoch": 0.9937675288251792, + "grad_norm": 0.41084280610084534, + "learning_rate": 9.583991866080055e-09, + "loss": 1.5039, + "step": 9567 + }, + { + "epoch": 0.9938714033447595, + "grad_norm": 0.4234639108181, + "learning_rate": 9.267197477169643e-09, + "loss": 1.6008, + "step": 9568 + }, + { + "epoch": 0.9939752778643399, + "grad_norm": 0.40848711133003235, + "learning_rate": 8.955726711351586e-09, + "loss": 1.6676, + "step": 9569 + }, + { + "epoch": 0.9940791523839202, + "grad_norm": 0.3974449932575226, + "learning_rate": 8.649579601810453e-09, + "loss": 1.4568, + "step": 9570 + }, + { + "epoch": 0.9941830269035006, + "grad_norm": 0.42480480670928955, + "learning_rate": 8.34875618113684e-09, + "loss": 1.7452, + "step": 9571 + }, + { + "epoch": 0.994286901423081, + "grad_norm": 0.4431557059288025, + "learning_rate": 8.053256481371785e-09, + "loss": 1.7141, + "step": 9572 + }, + { + "epoch": 0.9943907759426612, + "grad_norm": 0.46281933784484863, + "learning_rate": 7.763080533984557e-09, + "loss": 1.7414, + "step": 9573 + }, + { + "epoch": 0.9944946504622416, + "grad_norm": 0.44635605812072754, + "learning_rate": 7.478228369872664e-09, + "loss": 1.7477, + "step": 9574 + }, + { + "epoch": 0.994598524981822, + "grad_norm": 0.40345069766044617, + "learning_rate": 7.198700019378502e-09, + "loss": 1.7388, + "step": 9575 + }, + { + "epoch": 0.9947023995014023, + "grad_norm": 0.4090345799922943, + "learning_rate": 6.924495512256046e-09, + "loss": 1.6223, + "step": 9576 + }, + { + "epoch": 0.9948062740209827, + "grad_norm": 0.4181837737560272, + "learning_rate": 6.655614877720817e-09, + "loss": 1.6013, + "step": 9577 + }, + { + "epoch": 0.994910148540563, + "grad_norm": 0.4603481590747833, + "learning_rate": 6.392058144394364e-09, + "loss": 1.6791, + "step": 9578 + }, + { + "epoch": 0.9950140230601433, + "grad_norm": 0.43850967288017273, + "learning_rate": 6.133825340348675e-09, + "loss": 1.6949, + "step": 9579 + }, + { + "epoch": 0.9951178975797237, + "grad_norm": 0.42286258935928345, + "learning_rate": 5.8809164930839765e-09, + "loss": 1.6022, + "step": 9580 + }, + { + "epoch": 0.9952217720993041, + "grad_norm": 0.4476601481437683, + "learning_rate": 5.6333316295342775e-09, + "loss": 1.8264, + "step": 9581 + }, + { + "epoch": 0.9953256466188843, + "grad_norm": 0.4325719475746155, + "learning_rate": 5.391070776061824e-09, + "loss": 1.5983, + "step": 9582 + }, + { + "epoch": 0.9954295211384647, + "grad_norm": 0.41536781191825867, + "learning_rate": 5.154133958468199e-09, + "loss": 1.6543, + "step": 9583 + }, + { + "epoch": 0.9955333956580451, + "grad_norm": 0.4049381613731384, + "learning_rate": 4.922521201988772e-09, + "loss": 1.6388, + "step": 9584 + }, + { + "epoch": 0.9956372701776254, + "grad_norm": 0.3786325752735138, + "learning_rate": 4.6962325312760455e-09, + "loss": 1.5859, + "step": 9585 + }, + { + "epoch": 0.9957411446972058, + "grad_norm": 0.45597589015960693, + "learning_rate": 4.475267970444064e-09, + "loss": 1.8203, + "step": 9586 + }, + { + "epoch": 0.9958450192167861, + "grad_norm": 0.42808467149734497, + "learning_rate": 4.259627543012901e-09, + "loss": 1.6721, + "step": 9587 + }, + { + "epoch": 0.9959488937363664, + "grad_norm": 0.42117738723754883, + "learning_rate": 4.04931127194752e-09, + "loss": 1.5659, + "step": 9588 + }, + { + "epoch": 0.9960527682559468, + "grad_norm": 0.4049902856349945, + "learning_rate": 3.844319179646671e-09, + "loss": 1.6087, + "step": 9589 + }, + { + "epoch": 0.9961566427755272, + "grad_norm": 0.40100494027137756, + "learning_rate": 3.6446512879428907e-09, + "loss": 1.654, + "step": 9590 + }, + { + "epoch": 0.9962605172951076, + "grad_norm": 0.3948621153831482, + "learning_rate": 3.4503076180969486e-09, + "loss": 1.661, + "step": 9591 + }, + { + "epoch": 0.9963643918146878, + "grad_norm": 0.44757014513015747, + "learning_rate": 3.2612881908089533e-09, + "loss": 1.6552, + "step": 9592 + }, + { + "epoch": 0.9964682663342682, + "grad_norm": 0.413869172334671, + "learning_rate": 3.0775930261961462e-09, + "loss": 1.6067, + "step": 9593 + }, + { + "epoch": 0.9965721408538486, + "grad_norm": 0.46646273136138916, + "learning_rate": 2.89922214383731e-09, + "loss": 1.6194, + "step": 9594 + }, + { + "epoch": 0.9966760153734289, + "grad_norm": 0.39553186297416687, + "learning_rate": 2.726175562711708e-09, + "loss": 1.5895, + "step": 9595 + }, + { + "epoch": 0.9967798898930093, + "grad_norm": 0.42931756377220154, + "learning_rate": 2.558453301260144e-09, + "loss": 1.5636, + "step": 9596 + }, + { + "epoch": 0.9968837644125896, + "grad_norm": 0.4277789890766144, + "learning_rate": 2.3960553773350046e-09, + "loss": 1.6763, + "step": 9597 + }, + { + "epoch": 0.9969876389321699, + "grad_norm": 0.4121456444263458, + "learning_rate": 2.2389818082335645e-09, + "loss": 1.6111, + "step": 9598 + }, + { + "epoch": 0.9970915134517503, + "grad_norm": 0.4208837151527405, + "learning_rate": 2.0872326106868845e-09, + "loss": 1.7814, + "step": 9599 + }, + { + "epoch": 0.9971953879713307, + "grad_norm": 0.4421430826187134, + "learning_rate": 1.9408078008431586e-09, + "loss": 1.6691, + "step": 9600 + }, + { + "epoch": 0.9972992624909109, + "grad_norm": 0.4708458483219147, + "learning_rate": 1.7997073943121222e-09, + "loss": 1.6991, + "step": 9601 + }, + { + "epoch": 0.9974031370104913, + "grad_norm": 0.416151762008667, + "learning_rate": 1.6639314061095422e-09, + "loss": 1.6752, + "step": 9602 + }, + { + "epoch": 0.9975070115300717, + "grad_norm": 0.408317506313324, + "learning_rate": 1.5334798506905224e-09, + "loss": 1.4994, + "step": 9603 + }, + { + "epoch": 0.997610886049652, + "grad_norm": 0.4764207601547241, + "learning_rate": 1.4083527419606058e-09, + "loss": 1.8033, + "step": 9604 + }, + { + "epoch": 0.9977147605692324, + "grad_norm": 0.4480811655521393, + "learning_rate": 1.2885500932313666e-09, + "loss": 1.8291, + "step": 9605 + }, + { + "epoch": 0.9978186350888127, + "grad_norm": 0.4367046058177948, + "learning_rate": 1.1740719172703697e-09, + "loss": 1.6905, + "step": 9606 + }, + { + "epoch": 0.997922509608393, + "grad_norm": 0.43447238206863403, + "learning_rate": 1.0649182262623125e-09, + "loss": 1.6493, + "step": 9607 + }, + { + "epoch": 0.9980263841279734, + "grad_norm": 0.4337233006954193, + "learning_rate": 9.610890318312305e-10, + "loss": 1.6764, + "step": 9608 + }, + { + "epoch": 0.9981302586475538, + "grad_norm": 0.42332491278648376, + "learning_rate": 8.625843450404958e-10, + "loss": 1.6347, + "step": 9609 + }, + { + "epoch": 0.998234133167134, + "grad_norm": 0.4210823178291321, + "learning_rate": 7.69404176376165e-10, + "loss": 1.7052, + "step": 9610 + }, + { + "epoch": 0.9983380076867144, + "grad_norm": 0.3997040092945099, + "learning_rate": 6.81548535763632e-10, + "loss": 1.5105, + "step": 9611 + }, + { + "epoch": 0.9984418822062948, + "grad_norm": 0.43500977754592896, + "learning_rate": 5.990174325509745e-10, + "loss": 1.6361, + "step": 9612 + }, + { + "epoch": 0.9985457567258751, + "grad_norm": 0.459688276052475, + "learning_rate": 5.218108755367102e-10, + "loss": 1.6404, + "step": 9613 + }, + { + "epoch": 0.9986496312454555, + "grad_norm": 0.43085619807243347, + "learning_rate": 4.499288729364892e-10, + "loss": 1.642, + "step": 9614 + }, + { + "epoch": 0.9987535057650359, + "grad_norm": 0.4042317569255829, + "learning_rate": 3.833714324108506e-10, + "loss": 1.7038, + "step": 9615 + }, + { + "epoch": 0.9988573802846162, + "grad_norm": 0.43363288044929504, + "learning_rate": 3.221385610430172e-10, + "loss": 1.8699, + "step": 9616 + }, + { + "epoch": 0.9989612548041965, + "grad_norm": 0.40789350867271423, + "learning_rate": 2.6623026534999817e-10, + "loss": 1.7067, + "step": 9617 + }, + { + "epoch": 0.9990651293237769, + "grad_norm": 0.45757153630256653, + "learning_rate": 2.1564655129369115e-10, + "loss": 1.9236, + "step": 9618 + }, + { + "epoch": 0.9991690038433573, + "grad_norm": 0.4801182746887207, + "learning_rate": 1.7038742425867781e-10, + "loss": 1.8674, + "step": 9619 + }, + { + "epoch": 0.9992728783629375, + "grad_norm": 0.41760823130607605, + "learning_rate": 1.3045288906332608e-10, + "loss": 1.591, + "step": 9620 + }, + { + "epoch": 0.9993767528825179, + "grad_norm": 0.46719786524772644, + "learning_rate": 9.584294995979015e-11, + "loss": 1.3563, + "step": 9621 + }, + { + "epoch": 0.9994806274020983, + "grad_norm": 0.4825066030025482, + "learning_rate": 6.655761063956157e-11, + "loss": 1.7152, + "step": 9622 + }, + { + "epoch": 0.9995845019216786, + "grad_norm": 0.4088488817214966, + "learning_rate": 4.259687421126479e-11, + "loss": 1.6796, + "step": 9623 + }, + { + "epoch": 0.999688376441259, + "grad_norm": 0.4588576853275299, + "learning_rate": 2.3960743233963913e-11, + "loss": 1.8781, + "step": 9624 + }, + { + "epoch": 0.9997922509608393, + "grad_norm": 0.46052485704421997, + "learning_rate": 1.0649219683855905e-11, + "loss": 1.5272, + "step": 9625 + }, + { + "epoch": 0.9998961254804196, + "grad_norm": 0.4075811207294464, + "learning_rate": 2.6623049931284727e-12, + "loss": 1.7282, + "step": 9626 + }, + { + "epoch": 1.0, + "grad_norm": 0.5596733093261719, + "learning_rate": 0.0, + "loss": 1.8582, + "step": 9627 + }, + { + "epoch": 1.0, + "step": 9627, + "total_flos": 1.452208309419167e+19, + "train_loss": 1.7118117516813738, + "train_runtime": 275545.5724, + "train_samples_per_second": 0.14, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1.0, + "max_steps": 9627, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.452208309419167e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}