diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,93030 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999435442895049, + "eval_steps": 500, + "global_step": 13284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.527428066015545e-05, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.6745, + "step": 1 + }, + { + "epoch": 0.0001505485613203109, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.2646, + "step": 2 + }, + { + "epoch": 0.0002258228419804663, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.3153, + "step": 3 + }, + { + "epoch": 0.0003010971226406218, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.1904, + "step": 4 + }, + { + "epoch": 0.00037637140330077723, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.163, + "step": 5 + }, + { + "epoch": 0.0004516456839609326, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 14.1855, + "step": 6 + }, + { + "epoch": 0.0005269199646210881, + "grad_norm": 220.9912872314453, + "learning_rate": 2.506265664160401e-07, + "loss": 14.3773, + "step": 7 + }, + { + "epoch": 0.0006021942452812436, + "grad_norm": 204.32948303222656, + "learning_rate": 5.012531328320802e-07, + "loss": 14.1366, + "step": 8 + }, + { + "epoch": 0.000677468525941399, + "grad_norm": 196.6553955078125, + "learning_rate": 7.518796992481203e-07, + "loss": 14.6465, + "step": 9 + }, + { + "epoch": 0.0007527428066015545, + "grad_norm": 198.5098419189453, + "learning_rate": 1.0025062656641603e-06, + "loss": 14.161, + "step": 10 + }, + { + "epoch": 0.0008280170872617098, + "grad_norm": 221.6854705810547, + "learning_rate": 1.2531328320802005e-06, + "loss": 14.25, + "step": 11 + }, + { + "epoch": 0.0009032913679218653, + "grad_norm": 196.06338500976562, + "learning_rate": 1.5037593984962406e-06, + "loss": 14.219, + "step": 12 + }, + { + "epoch": 0.0009785656485820208, + "grad_norm": 240.4559783935547, + "learning_rate": 1.7543859649122807e-06, + "loss": 14.3176, + "step": 13 + }, + { + "epoch": 0.0010538399292421763, + "grad_norm": 188.92919921875, + "learning_rate": 2.0050125313283207e-06, + "loss": 14.0427, + "step": 14 + }, + { + "epoch": 0.0011291142099023317, + "grad_norm": 235.26187133789062, + "learning_rate": 2.255639097744361e-06, + "loss": 14.3093, + "step": 15 + }, + { + "epoch": 0.0012043884905624871, + "grad_norm": 197.09005737304688, + "learning_rate": 2.506265664160401e-06, + "loss": 14.083, + "step": 16 + }, + { + "epoch": 0.0012796627712226426, + "grad_norm": 261.8478698730469, + "learning_rate": 2.7568922305764413e-06, + "loss": 13.6877, + "step": 17 + }, + { + "epoch": 0.001354937051882798, + "grad_norm": 251.19281005859375, + "learning_rate": 3.007518796992481e-06, + "loss": 13.8317, + "step": 18 + }, + { + "epoch": 0.0014302113325429535, + "grad_norm": 251.19281005859375, + "learning_rate": 3.007518796992481e-06, + "loss": 13.1433, + "step": 19 + }, + { + "epoch": 0.001505485613203109, + "grad_norm": 274.72698974609375, + "learning_rate": 3.258145363408521e-06, + "loss": 13.5558, + "step": 20 + }, + { + "epoch": 0.0015807598938632642, + "grad_norm": 431.6600646972656, + "learning_rate": 3.5087719298245615e-06, + "loss": 13.0075, + "step": 21 + }, + { + "epoch": 0.0016560341745234196, + "grad_norm": 452.4359130859375, + "learning_rate": 3.7593984962406014e-06, + "loss": 12.822, + "step": 22 + }, + { + "epoch": 0.001731308455183575, + "grad_norm": 452.4359130859375, + "learning_rate": 3.7593984962406014e-06, + "loss": 12.3606, + "step": 23 + }, + { + "epoch": 0.0018065827358437305, + "grad_norm": 520.3015747070312, + "learning_rate": 4.010025062656641e-06, + "loss": 12.6771, + "step": 24 + }, + { + "epoch": 0.001881857016503886, + "grad_norm": 560.7103271484375, + "learning_rate": 4.260651629072682e-06, + "loss": 12.1979, + "step": 25 + }, + { + "epoch": 0.0019571312971640416, + "grad_norm": 721.1973876953125, + "learning_rate": 4.511278195488722e-06, + "loss": 11.8597, + "step": 26 + }, + { + "epoch": 0.002032405577824197, + "grad_norm": 543.4021606445312, + "learning_rate": 4.7619047619047615e-06, + "loss": 11.4942, + "step": 27 + }, + { + "epoch": 0.0021076798584843525, + "grad_norm": 736.8602905273438, + "learning_rate": 5.012531328320802e-06, + "loss": 11.1191, + "step": 28 + }, + { + "epoch": 0.002182954139144508, + "grad_norm": 563.4603271484375, + "learning_rate": 5.263157894736842e-06, + "loss": 10.5856, + "step": 29 + }, + { + "epoch": 0.0022582284198046634, + "grad_norm": 229.94313049316406, + "learning_rate": 5.5137844611528826e-06, + "loss": 10.0824, + "step": 30 + }, + { + "epoch": 0.002333502700464819, + "grad_norm": 226.94113159179688, + "learning_rate": 5.764411027568922e-06, + "loss": 9.9085, + "step": 31 + }, + { + "epoch": 0.0024087769811249743, + "grad_norm": 234.84669494628906, + "learning_rate": 6.015037593984962e-06, + "loss": 9.28, + "step": 32 + }, + { + "epoch": 0.0024840512617851297, + "grad_norm": 156.67616271972656, + "learning_rate": 6.265664160401003e-06, + "loss": 9.8853, + "step": 33 + }, + { + "epoch": 0.002559325542445285, + "grad_norm": 110.85346984863281, + "learning_rate": 6.516290726817042e-06, + "loss": 8.9517, + "step": 34 + }, + { + "epoch": 0.0026345998231054406, + "grad_norm": 94.64895629882812, + "learning_rate": 6.766917293233083e-06, + "loss": 8.9751, + "step": 35 + }, + { + "epoch": 0.002709874103765596, + "grad_norm": 86.29966735839844, + "learning_rate": 7.017543859649123e-06, + "loss": 8.3217, + "step": 36 + }, + { + "epoch": 0.0027851483844257515, + "grad_norm": 81.39305114746094, + "learning_rate": 7.2681704260651625e-06, + "loss": 8.1378, + "step": 37 + }, + { + "epoch": 0.002860422665085907, + "grad_norm": 78.70759582519531, + "learning_rate": 7.518796992481203e-06, + "loss": 8.1504, + "step": 38 + }, + { + "epoch": 0.0029356969457460624, + "grad_norm": 76.05618286132812, + "learning_rate": 7.769423558897243e-06, + "loss": 7.6079, + "step": 39 + }, + { + "epoch": 0.003010971226406218, + "grad_norm": 72.49906158447266, + "learning_rate": 8.020050125313283e-06, + "loss": 7.7394, + "step": 40 + }, + { + "epoch": 0.0030862455070663733, + "grad_norm": 72.77003479003906, + "learning_rate": 8.270676691729324e-06, + "loss": 7.7267, + "step": 41 + }, + { + "epoch": 0.0031615197877265283, + "grad_norm": 78.8106689453125, + "learning_rate": 8.521303258145363e-06, + "loss": 7.5611, + "step": 42 + }, + { + "epoch": 0.0032367940683866838, + "grad_norm": 71.1892318725586, + "learning_rate": 8.771929824561403e-06, + "loss": 7.2309, + "step": 43 + }, + { + "epoch": 0.0033120683490468392, + "grad_norm": 69.72626495361328, + "learning_rate": 9.022556390977444e-06, + "loss": 7.1093, + "step": 44 + }, + { + "epoch": 0.0033873426297069947, + "grad_norm": 67.48038482666016, + "learning_rate": 9.273182957393484e-06, + "loss": 7.0384, + "step": 45 + }, + { + "epoch": 0.00346261691036715, + "grad_norm": 61.05470657348633, + "learning_rate": 9.523809523809523e-06, + "loss": 7.3631, + "step": 46 + }, + { + "epoch": 0.0035378911910273056, + "grad_norm": 63.25644302368164, + "learning_rate": 9.774436090225564e-06, + "loss": 6.4683, + "step": 47 + }, + { + "epoch": 0.003613165471687461, + "grad_norm": 61.4146842956543, + "learning_rate": 1.0025062656641604e-05, + "loss": 6.5191, + "step": 48 + }, + { + "epoch": 0.0036884397523476165, + "grad_norm": 59.69192886352539, + "learning_rate": 1.0275689223057643e-05, + "loss": 6.3002, + "step": 49 + }, + { + "epoch": 0.003763714033007772, + "grad_norm": 57.620296478271484, + "learning_rate": 1.0526315789473684e-05, + "loss": 6.2166, + "step": 50 + }, + { + "epoch": 0.0038389883136679273, + "grad_norm": 59.934818267822266, + "learning_rate": 1.0776942355889724e-05, + "loss": 6.1602, + "step": 51 + }, + { + "epoch": 0.003914262594328083, + "grad_norm": 53.115291595458984, + "learning_rate": 1.1027568922305765e-05, + "loss": 6.0378, + "step": 52 + }, + { + "epoch": 0.003989536874988238, + "grad_norm": 55.64888000488281, + "learning_rate": 1.1278195488721805e-05, + "loss": 6.1491, + "step": 53 + }, + { + "epoch": 0.004064811155648394, + "grad_norm": 55.7314453125, + "learning_rate": 1.1528822055137844e-05, + "loss": 5.7367, + "step": 54 + }, + { + "epoch": 0.004140085436308549, + "grad_norm": 52.479618072509766, + "learning_rate": 1.1779448621553885e-05, + "loss": 5.6828, + "step": 55 + }, + { + "epoch": 0.004215359716968705, + "grad_norm": 50.01997756958008, + "learning_rate": 1.2030075187969925e-05, + "loss": 5.511, + "step": 56 + }, + { + "epoch": 0.00429063399762886, + "grad_norm": 50.458805084228516, + "learning_rate": 1.2280701754385964e-05, + "loss": 5.5437, + "step": 57 + }, + { + "epoch": 0.004365908278289016, + "grad_norm": 45.79252243041992, + "learning_rate": 1.2531328320802006e-05, + "loss": 5.2966, + "step": 58 + }, + { + "epoch": 0.004441182558949171, + "grad_norm": 42.6255989074707, + "learning_rate": 1.2781954887218045e-05, + "loss": 4.9297, + "step": 59 + }, + { + "epoch": 0.004516456839609327, + "grad_norm": 41.838863372802734, + "learning_rate": 1.3032581453634085e-05, + "loss": 5.1275, + "step": 60 + }, + { + "epoch": 0.004591731120269482, + "grad_norm": 36.9053955078125, + "learning_rate": 1.3283208020050126e-05, + "loss": 5.4475, + "step": 61 + }, + { + "epoch": 0.004667005400929638, + "grad_norm": 36.00736618041992, + "learning_rate": 1.3533834586466165e-05, + "loss": 5.0272, + "step": 62 + }, + { + "epoch": 0.004742279681589793, + "grad_norm": 33.12723922729492, + "learning_rate": 1.3784461152882205e-05, + "loss": 4.8026, + "step": 63 + }, + { + "epoch": 0.004817553962249949, + "grad_norm": 35.4222297668457, + "learning_rate": 1.4035087719298246e-05, + "loss": 4.9519, + "step": 64 + }, + { + "epoch": 0.004892828242910104, + "grad_norm": 34.06288528442383, + "learning_rate": 1.4285714285714285e-05, + "loss": 4.9215, + "step": 65 + }, + { + "epoch": 0.0049681025235702595, + "grad_norm": 35.008663177490234, + "learning_rate": 1.4536340852130325e-05, + "loss": 4.6629, + "step": 66 + }, + { + "epoch": 0.0050433768042304145, + "grad_norm": 33.12161636352539, + "learning_rate": 1.4786967418546366e-05, + "loss": 4.8497, + "step": 67 + }, + { + "epoch": 0.00511865108489057, + "grad_norm": 33.18605041503906, + "learning_rate": 1.5037593984962406e-05, + "loss": 5.071, + "step": 68 + }, + { + "epoch": 0.005193925365550725, + "grad_norm": 32.013362884521484, + "learning_rate": 1.5288220551378447e-05, + "loss": 4.4975, + "step": 69 + }, + { + "epoch": 0.005269199646210881, + "grad_norm": 33.189300537109375, + "learning_rate": 1.5538847117794486e-05, + "loss": 4.6355, + "step": 70 + }, + { + "epoch": 0.005344473926871036, + "grad_norm": 32.4898681640625, + "learning_rate": 1.5789473684210526e-05, + "loss": 4.4873, + "step": 71 + }, + { + "epoch": 0.005419748207531192, + "grad_norm": 32.60152053833008, + "learning_rate": 1.6040100250626565e-05, + "loss": 4.687, + "step": 72 + }, + { + "epoch": 0.005495022488191347, + "grad_norm": 31.895105361938477, + "learning_rate": 1.6290726817042605e-05, + "loss": 4.3908, + "step": 73 + }, + { + "epoch": 0.005570296768851503, + "grad_norm": 31.565521240234375, + "learning_rate": 1.6541353383458648e-05, + "loss": 4.3316, + "step": 74 + }, + { + "epoch": 0.005645571049511658, + "grad_norm": 32.420047760009766, + "learning_rate": 1.6791979949874687e-05, + "loss": 4.0119, + "step": 75 + }, + { + "epoch": 0.005720845330171814, + "grad_norm": 31.427255630493164, + "learning_rate": 1.7042606516290727e-05, + "loss": 4.2302, + "step": 76 + }, + { + "epoch": 0.005796119610831969, + "grad_norm": 31.781021118164062, + "learning_rate": 1.7293233082706766e-05, + "loss": 4.2859, + "step": 77 + }, + { + "epoch": 0.005871393891492125, + "grad_norm": 31.7258243560791, + "learning_rate": 1.7543859649122806e-05, + "loss": 3.903, + "step": 78 + }, + { + "epoch": 0.00594666817215228, + "grad_norm": 32.051170349121094, + "learning_rate": 1.779448621553885e-05, + "loss": 4.4862, + "step": 79 + }, + { + "epoch": 0.006021942452812436, + "grad_norm": 31.511178970336914, + "learning_rate": 1.8045112781954888e-05, + "loss": 4.1855, + "step": 80 + }, + { + "epoch": 0.006097216733472591, + "grad_norm": 30.967679977416992, + "learning_rate": 1.8295739348370928e-05, + "loss": 3.8244, + "step": 81 + }, + { + "epoch": 0.006172491014132747, + "grad_norm": 31.49563217163086, + "learning_rate": 1.8546365914786967e-05, + "loss": 3.9348, + "step": 82 + }, + { + "epoch": 0.006247765294792902, + "grad_norm": 30.85584831237793, + "learning_rate": 1.8796992481203007e-05, + "loss": 4.1612, + "step": 83 + }, + { + "epoch": 0.006323039575453057, + "grad_norm": 33.995872497558594, + "learning_rate": 1.9047619047619046e-05, + "loss": 3.985, + "step": 84 + }, + { + "epoch": 0.0063983138561132125, + "grad_norm": 30.442934036254883, + "learning_rate": 1.929824561403509e-05, + "loss": 3.8997, + "step": 85 + }, + { + "epoch": 0.0064735881367733675, + "grad_norm": 31.20969581604004, + "learning_rate": 1.954887218045113e-05, + "loss": 3.8592, + "step": 86 + }, + { + "epoch": 0.006548862417433523, + "grad_norm": 31.663227081298828, + "learning_rate": 1.9799498746867168e-05, + "loss": 3.6659, + "step": 87 + }, + { + "epoch": 0.0066241366980936784, + "grad_norm": 30.682249069213867, + "learning_rate": 2.0050125313283208e-05, + "loss": 3.5966, + "step": 88 + }, + { + "epoch": 0.006699410978753834, + "grad_norm": 31.77643394470215, + "learning_rate": 2.0300751879699247e-05, + "loss": 3.7419, + "step": 89 + }, + { + "epoch": 0.006774685259413989, + "grad_norm": 32.57149124145508, + "learning_rate": 2.0551378446115287e-05, + "loss": 3.8109, + "step": 90 + }, + { + "epoch": 0.006849959540074145, + "grad_norm": 31.767921447753906, + "learning_rate": 2.080200501253133e-05, + "loss": 3.3923, + "step": 91 + }, + { + "epoch": 0.0069252338207343, + "grad_norm": 31.329954147338867, + "learning_rate": 2.105263157894737e-05, + "loss": 3.2927, + "step": 92 + }, + { + "epoch": 0.007000508101394456, + "grad_norm": 31.79203224182129, + "learning_rate": 2.130325814536341e-05, + "loss": 3.6355, + "step": 93 + }, + { + "epoch": 0.007075782382054611, + "grad_norm": 30.774267196655273, + "learning_rate": 2.1553884711779448e-05, + "loss": 3.1644, + "step": 94 + }, + { + "epoch": 0.007151056662714767, + "grad_norm": 31.852190017700195, + "learning_rate": 2.1804511278195487e-05, + "loss": 3.2082, + "step": 95 + }, + { + "epoch": 0.007226330943374922, + "grad_norm": 31.196786880493164, + "learning_rate": 2.205513784461153e-05, + "loss": 3.4008, + "step": 96 + }, + { + "epoch": 0.007301605224035078, + "grad_norm": 28.888362884521484, + "learning_rate": 2.230576441102757e-05, + "loss": 3.0727, + "step": 97 + }, + { + "epoch": 0.007376879504695233, + "grad_norm": 28.987314224243164, + "learning_rate": 2.255639097744361e-05, + "loss": 3.1494, + "step": 98 + }, + { + "epoch": 0.007452153785355389, + "grad_norm": 27.64488410949707, + "learning_rate": 2.280701754385965e-05, + "loss": 2.9712, + "step": 99 + }, + { + "epoch": 0.007527428066015544, + "grad_norm": 24.212688446044922, + "learning_rate": 2.3057644110275688e-05, + "loss": 2.7013, + "step": 100 + }, + { + "epoch": 0.0076027023466757, + "grad_norm": 18.677289962768555, + "learning_rate": 2.3308270676691728e-05, + "loss": 2.591, + "step": 101 + }, + { + "epoch": 0.007677976627335855, + "grad_norm": 14.867158889770508, + "learning_rate": 2.355889724310777e-05, + "loss": 2.9865, + "step": 102 + }, + { + "epoch": 0.007753250907996011, + "grad_norm": 14.048884391784668, + "learning_rate": 2.380952380952381e-05, + "loss": 2.8416, + "step": 103 + }, + { + "epoch": 0.007828525188656166, + "grad_norm": 13.813777923583984, + "learning_rate": 2.406015037593985e-05, + "loss": 2.9241, + "step": 104 + }, + { + "epoch": 0.007903799469316321, + "grad_norm": 12.740989685058594, + "learning_rate": 2.431077694235589e-05, + "loss": 2.8904, + "step": 105 + }, + { + "epoch": 0.007979073749976476, + "grad_norm": 10.278301239013672, + "learning_rate": 2.456140350877193e-05, + "loss": 2.6597, + "step": 106 + }, + { + "epoch": 0.008054348030636631, + "grad_norm": 11.241612434387207, + "learning_rate": 2.4812030075187968e-05, + "loss": 2.9149, + "step": 107 + }, + { + "epoch": 0.008129622311296788, + "grad_norm": 11.8925142288208, + "learning_rate": 2.506265664160401e-05, + "loss": 2.6866, + "step": 108 + }, + { + "epoch": 0.008204896591956943, + "grad_norm": 13.825277328491211, + "learning_rate": 2.531328320802005e-05, + "loss": 2.6681, + "step": 109 + }, + { + "epoch": 0.008280170872617098, + "grad_norm": 12.951742172241211, + "learning_rate": 2.556390977443609e-05, + "loss": 2.7599, + "step": 110 + }, + { + "epoch": 0.008355445153277253, + "grad_norm": 9.65886116027832, + "learning_rate": 2.581453634085213e-05, + "loss": 2.749, + "step": 111 + }, + { + "epoch": 0.00843071943393741, + "grad_norm": 10.899394989013672, + "learning_rate": 2.606516290726817e-05, + "loss": 2.6439, + "step": 112 + }, + { + "epoch": 0.008505993714597565, + "grad_norm": 8.888102531433105, + "learning_rate": 2.6315789473684212e-05, + "loss": 2.898, + "step": 113 + }, + { + "epoch": 0.00858126799525772, + "grad_norm": 12.082961082458496, + "learning_rate": 2.656641604010025e-05, + "loss": 2.7291, + "step": 114 + }, + { + "epoch": 0.008656542275917875, + "grad_norm": 9.601634979248047, + "learning_rate": 2.681704260651629e-05, + "loss": 2.7372, + "step": 115 + }, + { + "epoch": 0.008731816556578032, + "grad_norm": 7.715977668762207, + "learning_rate": 2.706766917293233e-05, + "loss": 2.5463, + "step": 116 + }, + { + "epoch": 0.008807090837238187, + "grad_norm": 15.842620849609375, + "learning_rate": 2.731829573934837e-05, + "loss": 2.7444, + "step": 117 + }, + { + "epoch": 0.008882365117898342, + "grad_norm": 10.311222076416016, + "learning_rate": 2.756892230576441e-05, + "loss": 2.6851, + "step": 118 + }, + { + "epoch": 0.008957639398558497, + "grad_norm": 9.632251739501953, + "learning_rate": 2.7819548872180452e-05, + "loss": 2.6622, + "step": 119 + }, + { + "epoch": 0.009032913679218654, + "grad_norm": 11.367291450500488, + "learning_rate": 2.8070175438596492e-05, + "loss": 2.9979, + "step": 120 + }, + { + "epoch": 0.009108187959878809, + "grad_norm": 8.723760604858398, + "learning_rate": 2.832080200501253e-05, + "loss": 3.0326, + "step": 121 + }, + { + "epoch": 0.009183462240538964, + "grad_norm": 11.936774253845215, + "learning_rate": 2.857142857142857e-05, + "loss": 2.8038, + "step": 122 + }, + { + "epoch": 0.009258736521199119, + "grad_norm": 14.815502166748047, + "learning_rate": 2.882205513784461e-05, + "loss": 2.7392, + "step": 123 + }, + { + "epoch": 0.009334010801859275, + "grad_norm": 12.248781204223633, + "learning_rate": 2.907268170426065e-05, + "loss": 2.9079, + "step": 124 + }, + { + "epoch": 0.00940928508251943, + "grad_norm": 6.928167343139648, + "learning_rate": 2.9323308270676693e-05, + "loss": 2.7245, + "step": 125 + }, + { + "epoch": 0.009484559363179585, + "grad_norm": 11.194938659667969, + "learning_rate": 2.9573934837092732e-05, + "loss": 2.3287, + "step": 126 + }, + { + "epoch": 0.00955983364383974, + "grad_norm": 10.295517921447754, + "learning_rate": 2.9824561403508772e-05, + "loss": 2.9372, + "step": 127 + }, + { + "epoch": 0.009635107924499897, + "grad_norm": 8.496052742004395, + "learning_rate": 3.007518796992481e-05, + "loss": 2.9622, + "step": 128 + }, + { + "epoch": 0.009710382205160052, + "grad_norm": 10.407452583312988, + "learning_rate": 3.032581453634085e-05, + "loss": 2.7235, + "step": 129 + }, + { + "epoch": 0.009785656485820207, + "grad_norm": 9.219964981079102, + "learning_rate": 3.0576441102756894e-05, + "loss": 2.5662, + "step": 130 + }, + { + "epoch": 0.009860930766480362, + "grad_norm": 13.809978485107422, + "learning_rate": 3.082706766917293e-05, + "loss": 2.6972, + "step": 131 + }, + { + "epoch": 0.009936205047140519, + "grad_norm": 8.544536590576172, + "learning_rate": 3.107769423558897e-05, + "loss": 2.5842, + "step": 132 + }, + { + "epoch": 0.010011479327800674, + "grad_norm": 16.085350036621094, + "learning_rate": 3.132832080200501e-05, + "loss": 2.2536, + "step": 133 + }, + { + "epoch": 0.010086753608460829, + "grad_norm": 8.732707977294922, + "learning_rate": 3.157894736842105e-05, + "loss": 2.5977, + "step": 134 + }, + { + "epoch": 0.010162027889120984, + "grad_norm": 11.61832046508789, + "learning_rate": 3.182957393483709e-05, + "loss": 2.9786, + "step": 135 + }, + { + "epoch": 0.01023730216978114, + "grad_norm": 9.449678421020508, + "learning_rate": 3.208020050125313e-05, + "loss": 2.8203, + "step": 136 + }, + { + "epoch": 0.010312576450441296, + "grad_norm": 12.886004447937012, + "learning_rate": 3.233082706766917e-05, + "loss": 2.8235, + "step": 137 + }, + { + "epoch": 0.01038785073110145, + "grad_norm": 13.254119873046875, + "learning_rate": 3.258145363408521e-05, + "loss": 2.6703, + "step": 138 + }, + { + "epoch": 0.010463125011761606, + "grad_norm": 12.236438751220703, + "learning_rate": 3.2832080200501256e-05, + "loss": 2.5613, + "step": 139 + }, + { + "epoch": 0.010538399292421763, + "grad_norm": 12.124917984008789, + "learning_rate": 3.3082706766917295e-05, + "loss": 2.5182, + "step": 140 + }, + { + "epoch": 0.010613673573081918, + "grad_norm": 9.914979934692383, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.5702, + "step": 141 + }, + { + "epoch": 0.010688947853742073, + "grad_norm": 17.25132942199707, + "learning_rate": 3.3583959899749374e-05, + "loss": 2.6629, + "step": 142 + }, + { + "epoch": 0.010764222134402228, + "grad_norm": 9.482816696166992, + "learning_rate": 3.3834586466165414e-05, + "loss": 2.4679, + "step": 143 + }, + { + "epoch": 0.010839496415062384, + "grad_norm": 11.500816345214844, + "learning_rate": 3.4085213032581453e-05, + "loss": 2.7818, + "step": 144 + }, + { + "epoch": 0.01091477069572254, + "grad_norm": 13.040982246398926, + "learning_rate": 3.433583959899749e-05, + "loss": 2.815, + "step": 145 + }, + { + "epoch": 0.010990044976382694, + "grad_norm": 10.010588645935059, + "learning_rate": 3.458646616541353e-05, + "loss": 2.6249, + "step": 146 + }, + { + "epoch": 0.01106531925704285, + "grad_norm": 11.457307815551758, + "learning_rate": 3.483709273182957e-05, + "loss": 2.8414, + "step": 147 + }, + { + "epoch": 0.011140593537703006, + "grad_norm": 13.810152053833008, + "learning_rate": 3.508771929824561e-05, + "loss": 2.5936, + "step": 148 + }, + { + "epoch": 0.011215867818363161, + "grad_norm": 9.639079093933105, + "learning_rate": 3.533834586466165e-05, + "loss": 2.357, + "step": 149 + }, + { + "epoch": 0.011291142099023316, + "grad_norm": 13.508743286132812, + "learning_rate": 3.55889724310777e-05, + "loss": 2.6349, + "step": 150 + }, + { + "epoch": 0.011366416379683471, + "grad_norm": 8.566679000854492, + "learning_rate": 3.583959899749374e-05, + "loss": 2.6622, + "step": 151 + }, + { + "epoch": 0.011441690660343628, + "grad_norm": 7.942113876342773, + "learning_rate": 3.6090225563909776e-05, + "loss": 2.5748, + "step": 152 + }, + { + "epoch": 0.011516964941003783, + "grad_norm": 9.473734855651855, + "learning_rate": 3.6340852130325816e-05, + "loss": 2.4881, + "step": 153 + }, + { + "epoch": 0.011592239221663938, + "grad_norm": 14.020955085754395, + "learning_rate": 3.6591478696741855e-05, + "loss": 2.5271, + "step": 154 + }, + { + "epoch": 0.011667513502324093, + "grad_norm": 8.211512565612793, + "learning_rate": 3.6842105263157895e-05, + "loss": 2.8688, + "step": 155 + }, + { + "epoch": 0.01174278778298425, + "grad_norm": 9.567743301391602, + "learning_rate": 3.7092731829573934e-05, + "loss": 2.7863, + "step": 156 + }, + { + "epoch": 0.011818062063644405, + "grad_norm": 13.286225318908691, + "learning_rate": 3.7343358395989974e-05, + "loss": 2.5917, + "step": 157 + }, + { + "epoch": 0.01189333634430456, + "grad_norm": 18.761098861694336, + "learning_rate": 3.759398496240601e-05, + "loss": 2.5338, + "step": 158 + }, + { + "epoch": 0.011968610624964715, + "grad_norm": 10.251504898071289, + "learning_rate": 3.784461152882205e-05, + "loss": 2.6264, + "step": 159 + }, + { + "epoch": 0.012043884905624871, + "grad_norm": 11.498222351074219, + "learning_rate": 3.809523809523809e-05, + "loss": 3.0378, + "step": 160 + }, + { + "epoch": 0.012119159186285026, + "grad_norm": 16.968473434448242, + "learning_rate": 3.834586466165413e-05, + "loss": 2.6153, + "step": 161 + }, + { + "epoch": 0.012194433466945181, + "grad_norm": 8.45138931274414, + "learning_rate": 3.859649122807018e-05, + "loss": 2.5801, + "step": 162 + }, + { + "epoch": 0.012269707747605337, + "grad_norm": 12.312670707702637, + "learning_rate": 3.884711779448622e-05, + "loss": 2.3872, + "step": 163 + }, + { + "epoch": 0.012344982028265493, + "grad_norm": 8.898662567138672, + "learning_rate": 3.909774436090226e-05, + "loss": 2.6002, + "step": 164 + }, + { + "epoch": 0.012420256308925648, + "grad_norm": 9.772583961486816, + "learning_rate": 3.9348370927318297e-05, + "loss": 2.7766, + "step": 165 + }, + { + "epoch": 0.012495530589585803, + "grad_norm": 8.063519477844238, + "learning_rate": 3.9598997493734336e-05, + "loss": 2.3458, + "step": 166 + }, + { + "epoch": 0.012570804870245958, + "grad_norm": 16.954740524291992, + "learning_rate": 3.9849624060150376e-05, + "loss": 2.5509, + "step": 167 + }, + { + "epoch": 0.012646079150906113, + "grad_norm": 12.7385892868042, + "learning_rate": 4.0100250626566415e-05, + "loss": 2.5813, + "step": 168 + }, + { + "epoch": 0.01272135343156627, + "grad_norm": 7.384824275970459, + "learning_rate": 4.0350877192982455e-05, + "loss": 2.6479, + "step": 169 + }, + { + "epoch": 0.012796627712226425, + "grad_norm": 10.624900817871094, + "learning_rate": 4.0601503759398494e-05, + "loss": 2.583, + "step": 170 + }, + { + "epoch": 0.01287190199288658, + "grad_norm": 9.559377670288086, + "learning_rate": 4.0852130325814534e-05, + "loss": 2.7829, + "step": 171 + }, + { + "epoch": 0.012947176273546735, + "grad_norm": 16.11256980895996, + "learning_rate": 4.110275689223057e-05, + "loss": 2.4822, + "step": 172 + }, + { + "epoch": 0.013022450554206892, + "grad_norm": 14.441218376159668, + "learning_rate": 4.135338345864662e-05, + "loss": 2.4924, + "step": 173 + }, + { + "epoch": 0.013097724834867047, + "grad_norm": 9.130403518676758, + "learning_rate": 4.160401002506266e-05, + "loss": 2.5223, + "step": 174 + }, + { + "epoch": 0.013172999115527202, + "grad_norm": 9.761638641357422, + "learning_rate": 4.18546365914787e-05, + "loss": 2.5702, + "step": 175 + }, + { + "epoch": 0.013248273396187357, + "grad_norm": 8.047468185424805, + "learning_rate": 4.210526315789474e-05, + "loss": 2.5369, + "step": 176 + }, + { + "epoch": 0.013323547676847514, + "grad_norm": 8.312514305114746, + "learning_rate": 4.235588972431078e-05, + "loss": 3.1303, + "step": 177 + }, + { + "epoch": 0.013398821957507669, + "grad_norm": 6.967094898223877, + "learning_rate": 4.260651629072682e-05, + "loss": 2.6107, + "step": 178 + }, + { + "epoch": 0.013474096238167824, + "grad_norm": 14.499978065490723, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.5863, + "step": 179 + }, + { + "epoch": 0.013549370518827979, + "grad_norm": 8.428247451782227, + "learning_rate": 4.3107769423558896e-05, + "loss": 2.5353, + "step": 180 + }, + { + "epoch": 0.013624644799488135, + "grad_norm": 12.29853343963623, + "learning_rate": 4.3358395989974935e-05, + "loss": 2.6036, + "step": 181 + }, + { + "epoch": 0.01369991908014829, + "grad_norm": 7.652095317840576, + "learning_rate": 4.3609022556390975e-05, + "loss": 2.6479, + "step": 182 + }, + { + "epoch": 0.013775193360808445, + "grad_norm": 8.032188415527344, + "learning_rate": 4.3859649122807014e-05, + "loss": 2.8314, + "step": 183 + }, + { + "epoch": 0.0138504676414686, + "grad_norm": 6.767805576324463, + "learning_rate": 4.411027568922306e-05, + "loss": 2.7598, + "step": 184 + }, + { + "epoch": 0.013925741922128757, + "grad_norm": 11.794331550598145, + "learning_rate": 4.43609022556391e-05, + "loss": 2.4747, + "step": 185 + }, + { + "epoch": 0.014001016202788912, + "grad_norm": 9.042306900024414, + "learning_rate": 4.461152882205514e-05, + "loss": 2.6526, + "step": 186 + }, + { + "epoch": 0.014076290483449067, + "grad_norm": 10.678914070129395, + "learning_rate": 4.486215538847118e-05, + "loss": 2.3853, + "step": 187 + }, + { + "epoch": 0.014151564764109222, + "grad_norm": 9.113207817077637, + "learning_rate": 4.511278195488722e-05, + "loss": 2.5785, + "step": 188 + }, + { + "epoch": 0.014226839044769379, + "grad_norm": 11.013586044311523, + "learning_rate": 4.536340852130326e-05, + "loss": 2.7175, + "step": 189 + }, + { + "epoch": 0.014302113325429534, + "grad_norm": 8.485818862915039, + "learning_rate": 4.56140350877193e-05, + "loss": 2.4907, + "step": 190 + }, + { + "epoch": 0.014377387606089689, + "grad_norm": 12.44178295135498, + "learning_rate": 4.586466165413534e-05, + "loss": 2.6874, + "step": 191 + }, + { + "epoch": 0.014452661886749844, + "grad_norm": 14.327281951904297, + "learning_rate": 4.6115288220551377e-05, + "loss": 2.6317, + "step": 192 + }, + { + "epoch": 0.01452793616741, + "grad_norm": 18.093360900878906, + "learning_rate": 4.6365914786967416e-05, + "loss": 2.9873, + "step": 193 + }, + { + "epoch": 0.014603210448070156, + "grad_norm": 17.46791648864746, + "learning_rate": 4.6616541353383456e-05, + "loss": 2.5846, + "step": 194 + }, + { + "epoch": 0.01467848472873031, + "grad_norm": 10.141608238220215, + "learning_rate": 4.6867167919799495e-05, + "loss": 2.4733, + "step": 195 + }, + { + "epoch": 0.014753759009390466, + "grad_norm": 17.354005813598633, + "learning_rate": 4.711779448621554e-05, + "loss": 2.5055, + "step": 196 + }, + { + "epoch": 0.014829033290050623, + "grad_norm": 10.984573364257812, + "learning_rate": 4.736842105263158e-05, + "loss": 2.6181, + "step": 197 + }, + { + "epoch": 0.014904307570710778, + "grad_norm": 13.152050018310547, + "learning_rate": 4.761904761904762e-05, + "loss": 2.8441, + "step": 198 + }, + { + "epoch": 0.014979581851370933, + "grad_norm": 9.323503494262695, + "learning_rate": 4.786967418546366e-05, + "loss": 2.6344, + "step": 199 + }, + { + "epoch": 0.015054856132031088, + "grad_norm": 8.266263961791992, + "learning_rate": 4.81203007518797e-05, + "loss": 2.6267, + "step": 200 + }, + { + "epoch": 0.015130130412691244, + "grad_norm": 8.868229866027832, + "learning_rate": 4.837092731829574e-05, + "loss": 2.4875, + "step": 201 + }, + { + "epoch": 0.0152054046933514, + "grad_norm": 8.351360321044922, + "learning_rate": 4.862155388471178e-05, + "loss": 2.4873, + "step": 202 + }, + { + "epoch": 0.015280678974011554, + "grad_norm": 8.453309059143066, + "learning_rate": 4.887218045112782e-05, + "loss": 2.5607, + "step": 203 + }, + { + "epoch": 0.01535595325467171, + "grad_norm": 11.86689281463623, + "learning_rate": 4.912280701754386e-05, + "loss": 3.1392, + "step": 204 + }, + { + "epoch": 0.015431227535331866, + "grad_norm": 11.27383041381836, + "learning_rate": 4.93734335839599e-05, + "loss": 2.5259, + "step": 205 + }, + { + "epoch": 0.015506501815992021, + "grad_norm": 9.079084396362305, + "learning_rate": 4.9624060150375936e-05, + "loss": 2.4805, + "step": 206 + }, + { + "epoch": 0.015581776096652176, + "grad_norm": 8.14899730682373, + "learning_rate": 4.987468671679198e-05, + "loss": 2.4906, + "step": 207 + }, + { + "epoch": 0.015657050377312333, + "grad_norm": 9.298063278198242, + "learning_rate": 5.012531328320802e-05, + "loss": 2.5694, + "step": 208 + }, + { + "epoch": 0.015732324657972488, + "grad_norm": 13.468293190002441, + "learning_rate": 5.037593984962407e-05, + "loss": 2.5351, + "step": 209 + }, + { + "epoch": 0.015807598938632643, + "grad_norm": 10.182571411132812, + "learning_rate": 5.06265664160401e-05, + "loss": 2.5524, + "step": 210 + }, + { + "epoch": 0.015882873219292798, + "grad_norm": 10.53078556060791, + "learning_rate": 5.087719298245615e-05, + "loss": 2.2887, + "step": 211 + }, + { + "epoch": 0.015958147499952953, + "grad_norm": 10.477185249328613, + "learning_rate": 5.112781954887218e-05, + "loss": 2.6668, + "step": 212 + }, + { + "epoch": 0.016033421780613108, + "grad_norm": 13.87563705444336, + "learning_rate": 5.1378446115288226e-05, + "loss": 2.613, + "step": 213 + }, + { + "epoch": 0.016108696061273263, + "grad_norm": 12.670780181884766, + "learning_rate": 5.162907268170426e-05, + "loss": 2.5581, + "step": 214 + }, + { + "epoch": 0.01618397034193342, + "grad_norm": 9.723155975341797, + "learning_rate": 5.1879699248120305e-05, + "loss": 2.5365, + "step": 215 + }, + { + "epoch": 0.016259244622593576, + "grad_norm": 6.849913120269775, + "learning_rate": 5.213032581453634e-05, + "loss": 2.8922, + "step": 216 + }, + { + "epoch": 0.01633451890325373, + "grad_norm": 9.322701454162598, + "learning_rate": 5.2380952380952384e-05, + "loss": 2.824, + "step": 217 + }, + { + "epoch": 0.016409793183913886, + "grad_norm": 9.324692726135254, + "learning_rate": 5.2631578947368424e-05, + "loss": 2.8181, + "step": 218 + }, + { + "epoch": 0.01648506746457404, + "grad_norm": 10.296642303466797, + "learning_rate": 5.2882205513784463e-05, + "loss": 2.6506, + "step": 219 + }, + { + "epoch": 0.016560341745234197, + "grad_norm": 13.382678985595703, + "learning_rate": 5.31328320802005e-05, + "loss": 2.5141, + "step": 220 + }, + { + "epoch": 0.01663561602589435, + "grad_norm": 10.737505912780762, + "learning_rate": 5.338345864661655e-05, + "loss": 2.8113, + "step": 221 + }, + { + "epoch": 0.016710890306554507, + "grad_norm": 6.406379699707031, + "learning_rate": 5.363408521303258e-05, + "loss": 2.3996, + "step": 222 + }, + { + "epoch": 0.016786164587214665, + "grad_norm": 8.201915740966797, + "learning_rate": 5.388471177944863e-05, + "loss": 2.8995, + "step": 223 + }, + { + "epoch": 0.01686143886787482, + "grad_norm": 11.752023696899414, + "learning_rate": 5.413533834586466e-05, + "loss": 2.4166, + "step": 224 + }, + { + "epoch": 0.016936713148534975, + "grad_norm": 14.849615097045898, + "learning_rate": 5.438596491228071e-05, + "loss": 2.6851, + "step": 225 + }, + { + "epoch": 0.01701198742919513, + "grad_norm": 8.184210777282715, + "learning_rate": 5.463659147869674e-05, + "loss": 2.5694, + "step": 226 + }, + { + "epoch": 0.017087261709855285, + "grad_norm": 9.136078834533691, + "learning_rate": 5.4887218045112786e-05, + "loss": 2.6582, + "step": 227 + }, + { + "epoch": 0.01716253599051544, + "grad_norm": 9.646272659301758, + "learning_rate": 5.513784461152882e-05, + "loss": 2.66, + "step": 228 + }, + { + "epoch": 0.017237810271175595, + "grad_norm": 11.645383834838867, + "learning_rate": 5.5388471177944865e-05, + "loss": 2.6034, + "step": 229 + }, + { + "epoch": 0.01731308455183575, + "grad_norm": 17.333559036254883, + "learning_rate": 5.5639097744360905e-05, + "loss": 2.5915, + "step": 230 + }, + { + "epoch": 0.01738835883249591, + "grad_norm": 7.36332893371582, + "learning_rate": 5.5889724310776944e-05, + "loss": 2.5042, + "step": 231 + }, + { + "epoch": 0.017463633113156064, + "grad_norm": 9.636429786682129, + "learning_rate": 5.6140350877192984e-05, + "loss": 2.6502, + "step": 232 + }, + { + "epoch": 0.01753890739381622, + "grad_norm": 10.913578033447266, + "learning_rate": 5.639097744360903e-05, + "loss": 2.5222, + "step": 233 + }, + { + "epoch": 0.017614181674476374, + "grad_norm": 8.093767166137695, + "learning_rate": 5.664160401002506e-05, + "loss": 2.322, + "step": 234 + }, + { + "epoch": 0.01768945595513653, + "grad_norm": 9.814970016479492, + "learning_rate": 5.689223057644111e-05, + "loss": 2.701, + "step": 235 + }, + { + "epoch": 0.017764730235796684, + "grad_norm": 11.757080078125, + "learning_rate": 5.714285714285714e-05, + "loss": 2.5138, + "step": 236 + }, + { + "epoch": 0.01784000451645684, + "grad_norm": 10.89779281616211, + "learning_rate": 5.739348370927319e-05, + "loss": 2.9935, + "step": 237 + }, + { + "epoch": 0.017915278797116994, + "grad_norm": 9.9816255569458, + "learning_rate": 5.764411027568922e-05, + "loss": 2.5659, + "step": 238 + }, + { + "epoch": 0.017990553077777152, + "grad_norm": 8.051190376281738, + "learning_rate": 5.789473684210527e-05, + "loss": 2.5852, + "step": 239 + }, + { + "epoch": 0.018065827358437307, + "grad_norm": 11.81480598449707, + "learning_rate": 5.81453634085213e-05, + "loss": 2.4958, + "step": 240 + }, + { + "epoch": 0.018141101639097462, + "grad_norm": 7.56677770614624, + "learning_rate": 5.8395989974937346e-05, + "loss": 2.721, + "step": 241 + }, + { + "epoch": 0.018216375919757617, + "grad_norm": 8.934097290039062, + "learning_rate": 5.8646616541353386e-05, + "loss": 2.6896, + "step": 242 + }, + { + "epoch": 0.018291650200417772, + "grad_norm": 9.240399360656738, + "learning_rate": 5.889724310776943e-05, + "loss": 3.115, + "step": 243 + }, + { + "epoch": 0.018366924481077927, + "grad_norm": 10.501958847045898, + "learning_rate": 5.9147869674185465e-05, + "loss": 2.6751, + "step": 244 + }, + { + "epoch": 0.018442198761738082, + "grad_norm": 9.896129608154297, + "learning_rate": 5.939849624060151e-05, + "loss": 2.5736, + "step": 245 + }, + { + "epoch": 0.018517473042398237, + "grad_norm": 14.61193561553955, + "learning_rate": 5.9649122807017544e-05, + "loss": 2.5374, + "step": 246 + }, + { + "epoch": 0.018592747323058396, + "grad_norm": 8.298013687133789, + "learning_rate": 5.989974937343359e-05, + "loss": 2.6024, + "step": 247 + }, + { + "epoch": 0.01866802160371855, + "grad_norm": 14.158872604370117, + "learning_rate": 6.015037593984962e-05, + "loss": 2.6529, + "step": 248 + }, + { + "epoch": 0.018743295884378706, + "grad_norm": 11.085705757141113, + "learning_rate": 6.040100250626567e-05, + "loss": 2.632, + "step": 249 + }, + { + "epoch": 0.01881857016503886, + "grad_norm": 15.082877159118652, + "learning_rate": 6.06516290726817e-05, + "loss": 2.6374, + "step": 250 + }, + { + "epoch": 0.018893844445699016, + "grad_norm": 18.298795700073242, + "learning_rate": 6.090225563909775e-05, + "loss": 2.6456, + "step": 251 + }, + { + "epoch": 0.01896911872635917, + "grad_norm": 7.108242988586426, + "learning_rate": 6.115288220551379e-05, + "loss": 2.4605, + "step": 252 + }, + { + "epoch": 0.019044393007019326, + "grad_norm": 6.240487575531006, + "learning_rate": 6.140350877192983e-05, + "loss": 2.7358, + "step": 253 + }, + { + "epoch": 0.01911966728767948, + "grad_norm": 9.70210075378418, + "learning_rate": 6.165413533834587e-05, + "loss": 2.4488, + "step": 254 + }, + { + "epoch": 0.019194941568339636, + "grad_norm": 15.132344245910645, + "learning_rate": 6.19047619047619e-05, + "loss": 2.8009, + "step": 255 + }, + { + "epoch": 0.019270215848999794, + "grad_norm": 10.964588165283203, + "learning_rate": 6.215538847117795e-05, + "loss": 2.6409, + "step": 256 + }, + { + "epoch": 0.01934549012965995, + "grad_norm": 9.08316707611084, + "learning_rate": 6.240601503759398e-05, + "loss": 2.3933, + "step": 257 + }, + { + "epoch": 0.019420764410320104, + "grad_norm": 8.53038215637207, + "learning_rate": 6.265664160401002e-05, + "loss": 2.5874, + "step": 258 + }, + { + "epoch": 0.01949603869098026, + "grad_norm": 11.063270568847656, + "learning_rate": 6.290726817042606e-05, + "loss": 2.8177, + "step": 259 + }, + { + "epoch": 0.019571312971640414, + "grad_norm": 5.0860795974731445, + "learning_rate": 6.31578947368421e-05, + "loss": 2.5797, + "step": 260 + }, + { + "epoch": 0.01964658725230057, + "grad_norm": 9.935979843139648, + "learning_rate": 6.340852130325816e-05, + "loss": 3.129, + "step": 261 + }, + { + "epoch": 0.019721861532960724, + "grad_norm": 9.718613624572754, + "learning_rate": 6.365914786967418e-05, + "loss": 2.5463, + "step": 262 + }, + { + "epoch": 0.01979713581362088, + "grad_norm": 8.694568634033203, + "learning_rate": 6.390977443609024e-05, + "loss": 2.3601, + "step": 263 + }, + { + "epoch": 0.019872410094281038, + "grad_norm": 9.739588737487793, + "learning_rate": 6.416040100250626e-05, + "loss": 2.7431, + "step": 264 + }, + { + "epoch": 0.019947684374941193, + "grad_norm": 10.140974998474121, + "learning_rate": 6.441102756892231e-05, + "loss": 2.4661, + "step": 265 + }, + { + "epoch": 0.020022958655601348, + "grad_norm": 13.466826438903809, + "learning_rate": 6.466165413533834e-05, + "loss": 2.6261, + "step": 266 + }, + { + "epoch": 0.020098232936261503, + "grad_norm": 8.768433570861816, + "learning_rate": 6.49122807017544e-05, + "loss": 2.8182, + "step": 267 + }, + { + "epoch": 0.020173507216921658, + "grad_norm": 11.15190601348877, + "learning_rate": 6.516290726817042e-05, + "loss": 2.4149, + "step": 268 + }, + { + "epoch": 0.020248781497581813, + "grad_norm": 10.704723358154297, + "learning_rate": 6.541353383458647e-05, + "loss": 2.6102, + "step": 269 + }, + { + "epoch": 0.020324055778241968, + "grad_norm": 10.706787109375, + "learning_rate": 6.566416040100251e-05, + "loss": 2.4845, + "step": 270 + }, + { + "epoch": 0.020399330058902123, + "grad_norm": 9.769025802612305, + "learning_rate": 6.591478696741855e-05, + "loss": 2.665, + "step": 271 + }, + { + "epoch": 0.02047460433956228, + "grad_norm": 9.657296180725098, + "learning_rate": 6.616541353383459e-05, + "loss": 2.4942, + "step": 272 + }, + { + "epoch": 0.020549878620222436, + "grad_norm": 16.107789993286133, + "learning_rate": 6.641604010025063e-05, + "loss": 2.7219, + "step": 273 + }, + { + "epoch": 0.02062515290088259, + "grad_norm": 14.194955825805664, + "learning_rate": 6.666666666666667e-05, + "loss": 2.6061, + "step": 274 + }, + { + "epoch": 0.020700427181542747, + "grad_norm": 16.177091598510742, + "learning_rate": 6.691729323308271e-05, + "loss": 3.3424, + "step": 275 + }, + { + "epoch": 0.0207757014622029, + "grad_norm": 10.62698745727539, + "learning_rate": 6.716791979949875e-05, + "loss": 2.3526, + "step": 276 + }, + { + "epoch": 0.020850975742863057, + "grad_norm": 10.149099349975586, + "learning_rate": 6.741854636591479e-05, + "loss": 2.6473, + "step": 277 + }, + { + "epoch": 0.02092625002352321, + "grad_norm": 7.481041431427002, + "learning_rate": 6.766917293233083e-05, + "loss": 2.732, + "step": 278 + }, + { + "epoch": 0.021001524304183367, + "grad_norm": 8.676711082458496, + "learning_rate": 6.791979949874687e-05, + "loss": 2.7971, + "step": 279 + }, + { + "epoch": 0.021076798584843525, + "grad_norm": 10.013066291809082, + "learning_rate": 6.817042606516291e-05, + "loss": 2.4611, + "step": 280 + }, + { + "epoch": 0.02115207286550368, + "grad_norm": 10.484708786010742, + "learning_rate": 6.842105263157895e-05, + "loss": 2.2898, + "step": 281 + }, + { + "epoch": 0.021227347146163835, + "grad_norm": 5.649084568023682, + "learning_rate": 6.867167919799499e-05, + "loss": 2.7044, + "step": 282 + }, + { + "epoch": 0.02130262142682399, + "grad_norm": 11.02212905883789, + "learning_rate": 6.892230576441104e-05, + "loss": 2.8007, + "step": 283 + }, + { + "epoch": 0.021377895707484145, + "grad_norm": 10.195820808410645, + "learning_rate": 6.917293233082706e-05, + "loss": 2.4142, + "step": 284 + }, + { + "epoch": 0.0214531699881443, + "grad_norm": 10.254498481750488, + "learning_rate": 6.942355889724312e-05, + "loss": 2.5002, + "step": 285 + }, + { + "epoch": 0.021528444268804455, + "grad_norm": 6.710049152374268, + "learning_rate": 6.967418546365914e-05, + "loss": 2.6661, + "step": 286 + }, + { + "epoch": 0.02160371854946461, + "grad_norm": 15.258325576782227, + "learning_rate": 6.99248120300752e-05, + "loss": 2.5935, + "step": 287 + }, + { + "epoch": 0.02167899283012477, + "grad_norm": 7.982344627380371, + "learning_rate": 7.017543859649122e-05, + "loss": 2.3332, + "step": 288 + }, + { + "epoch": 0.021754267110784924, + "grad_norm": 10.401143074035645, + "learning_rate": 7.042606516290728e-05, + "loss": 2.4913, + "step": 289 + }, + { + "epoch": 0.02182954139144508, + "grad_norm": 15.017297744750977, + "learning_rate": 7.06766917293233e-05, + "loss": 2.9737, + "step": 290 + }, + { + "epoch": 0.021904815672105234, + "grad_norm": 11.712210655212402, + "learning_rate": 7.092731829573935e-05, + "loss": 2.7172, + "step": 291 + }, + { + "epoch": 0.02198008995276539, + "grad_norm": 10.404736518859863, + "learning_rate": 7.11779448621554e-05, + "loss": 2.6443, + "step": 292 + }, + { + "epoch": 0.022055364233425544, + "grad_norm": 12.326501846313477, + "learning_rate": 7.142857142857143e-05, + "loss": 2.4544, + "step": 293 + }, + { + "epoch": 0.0221306385140857, + "grad_norm": 9.318554878234863, + "learning_rate": 7.167919799498747e-05, + "loss": 2.3409, + "step": 294 + }, + { + "epoch": 0.022205912794745854, + "grad_norm": 8.803868293762207, + "learning_rate": 7.192982456140351e-05, + "loss": 2.5344, + "step": 295 + }, + { + "epoch": 0.022281187075406012, + "grad_norm": 4.301174163818359, + "learning_rate": 7.218045112781955e-05, + "loss": 2.4159, + "step": 296 + }, + { + "epoch": 0.022356461356066167, + "grad_norm": 8.091975212097168, + "learning_rate": 7.243107769423559e-05, + "loss": 2.9823, + "step": 297 + }, + { + "epoch": 0.022431735636726322, + "grad_norm": 8.679513931274414, + "learning_rate": 7.268170426065163e-05, + "loss": 2.5977, + "step": 298 + }, + { + "epoch": 0.022507009917386477, + "grad_norm": 9.462285041809082, + "learning_rate": 7.293233082706767e-05, + "loss": 2.1608, + "step": 299 + }, + { + "epoch": 0.022582284198046632, + "grad_norm": 7.032812118530273, + "learning_rate": 7.318295739348371e-05, + "loss": 2.4851, + "step": 300 + }, + { + "epoch": 0.022657558478706787, + "grad_norm": 8.19509506225586, + "learning_rate": 7.343358395989975e-05, + "loss": 2.5213, + "step": 301 + }, + { + "epoch": 0.022732832759366942, + "grad_norm": 9.354598999023438, + "learning_rate": 7.368421052631579e-05, + "loss": 2.6711, + "step": 302 + }, + { + "epoch": 0.022808107040027097, + "grad_norm": 8.307153701782227, + "learning_rate": 7.393483709273183e-05, + "loss": 2.7678, + "step": 303 + }, + { + "epoch": 0.022883381320687256, + "grad_norm": 10.352598190307617, + "learning_rate": 7.418546365914787e-05, + "loss": 2.5978, + "step": 304 + }, + { + "epoch": 0.02295865560134741, + "grad_norm": 9.236013412475586, + "learning_rate": 7.443609022556392e-05, + "loss": 2.6806, + "step": 305 + }, + { + "epoch": 0.023033929882007566, + "grad_norm": 10.241057395935059, + "learning_rate": 7.468671679197995e-05, + "loss": 2.5956, + "step": 306 + }, + { + "epoch": 0.02310920416266772, + "grad_norm": 8.748754501342773, + "learning_rate": 7.4937343358396e-05, + "loss": 2.4249, + "step": 307 + }, + { + "epoch": 0.023184478443327876, + "grad_norm": 11.137886047363281, + "learning_rate": 7.518796992481203e-05, + "loss": 2.5126, + "step": 308 + }, + { + "epoch": 0.02325975272398803, + "grad_norm": 12.867950439453125, + "learning_rate": 7.543859649122808e-05, + "loss": 2.394, + "step": 309 + }, + { + "epoch": 0.023335027004648186, + "grad_norm": 19.755313873291016, + "learning_rate": 7.56892230576441e-05, + "loss": 2.51, + "step": 310 + }, + { + "epoch": 0.02341030128530834, + "grad_norm": 10.884064674377441, + "learning_rate": 7.593984962406016e-05, + "loss": 2.8408, + "step": 311 + }, + { + "epoch": 0.0234855755659685, + "grad_norm": 17.968544006347656, + "learning_rate": 7.619047619047618e-05, + "loss": 2.7531, + "step": 312 + }, + { + "epoch": 0.023560849846628654, + "grad_norm": 16.317859649658203, + "learning_rate": 7.644110275689224e-05, + "loss": 2.6785, + "step": 313 + }, + { + "epoch": 0.02363612412728881, + "grad_norm": 12.524735450744629, + "learning_rate": 7.669172932330826e-05, + "loss": 2.5273, + "step": 314 + }, + { + "epoch": 0.023711398407948964, + "grad_norm": 16.577682495117188, + "learning_rate": 7.694235588972432e-05, + "loss": 2.8514, + "step": 315 + }, + { + "epoch": 0.02378667268860912, + "grad_norm": 8.797318458557129, + "learning_rate": 7.719298245614036e-05, + "loss": 2.7321, + "step": 316 + }, + { + "epoch": 0.023861946969269274, + "grad_norm": 6.772225379943848, + "learning_rate": 7.74436090225564e-05, + "loss": 2.4224, + "step": 317 + }, + { + "epoch": 0.02393722124992943, + "grad_norm": 6.159720420837402, + "learning_rate": 7.769423558897244e-05, + "loss": 2.7055, + "step": 318 + }, + { + "epoch": 0.024012495530589584, + "grad_norm": 6.7710442543029785, + "learning_rate": 7.794486215538847e-05, + "loss": 2.2415, + "step": 319 + }, + { + "epoch": 0.024087769811249743, + "grad_norm": 8.891196250915527, + "learning_rate": 7.819548872180451e-05, + "loss": 2.5413, + "step": 320 + }, + { + "epoch": 0.024163044091909898, + "grad_norm": 6.367236137390137, + "learning_rate": 7.844611528822055e-05, + "loss": 2.4077, + "step": 321 + }, + { + "epoch": 0.024238318372570053, + "grad_norm": 11.4622163772583, + "learning_rate": 7.869674185463659e-05, + "loss": 3.0287, + "step": 322 + }, + { + "epoch": 0.024313592653230208, + "grad_norm": 7.998172283172607, + "learning_rate": 7.894736842105263e-05, + "loss": 2.9693, + "step": 323 + }, + { + "epoch": 0.024388866933890363, + "grad_norm": 6.1461286544799805, + "learning_rate": 7.919799498746867e-05, + "loss": 2.6604, + "step": 324 + }, + { + "epoch": 0.024464141214550518, + "grad_norm": 10.322437286376953, + "learning_rate": 7.944862155388471e-05, + "loss": 2.8238, + "step": 325 + }, + { + "epoch": 0.024539415495210673, + "grad_norm": 10.830486297607422, + "learning_rate": 7.969924812030075e-05, + "loss": 2.513, + "step": 326 + }, + { + "epoch": 0.024614689775870828, + "grad_norm": 9.585814476013184, + "learning_rate": 7.994987468671679e-05, + "loss": 2.4586, + "step": 327 + }, + { + "epoch": 0.024689964056530986, + "grad_norm": 9.150453567504883, + "learning_rate": 8.020050125313283e-05, + "loss": 2.6261, + "step": 328 + }, + { + "epoch": 0.02476523833719114, + "grad_norm": 13.019880294799805, + "learning_rate": 8.045112781954888e-05, + "loss": 2.6852, + "step": 329 + }, + { + "epoch": 0.024840512617851297, + "grad_norm": 10.471446990966797, + "learning_rate": 8.070175438596491e-05, + "loss": 2.4011, + "step": 330 + }, + { + "epoch": 0.02491578689851145, + "grad_norm": 8.616655349731445, + "learning_rate": 8.095238095238096e-05, + "loss": 2.4935, + "step": 331 + }, + { + "epoch": 0.024991061179171607, + "grad_norm": 11.127389907836914, + "learning_rate": 8.120300751879699e-05, + "loss": 2.5475, + "step": 332 + }, + { + "epoch": 0.02506633545983176, + "grad_norm": 11.683151245117188, + "learning_rate": 8.145363408521304e-05, + "loss": 2.5843, + "step": 333 + }, + { + "epoch": 0.025141609740491917, + "grad_norm": 15.664186477661133, + "learning_rate": 8.170426065162907e-05, + "loss": 2.5822, + "step": 334 + }, + { + "epoch": 0.02521688402115207, + "grad_norm": 16.454160690307617, + "learning_rate": 8.195488721804512e-05, + "loss": 2.6598, + "step": 335 + }, + { + "epoch": 0.025292158301812227, + "grad_norm": 11.98859977722168, + "learning_rate": 8.220551378446115e-05, + "loss": 2.5653, + "step": 336 + }, + { + "epoch": 0.025367432582472385, + "grad_norm": 9.438582420349121, + "learning_rate": 8.24561403508772e-05, + "loss": 2.2661, + "step": 337 + }, + { + "epoch": 0.02544270686313254, + "grad_norm": 8.41351318359375, + "learning_rate": 8.270676691729324e-05, + "loss": 2.7389, + "step": 338 + }, + { + "epoch": 0.025517981143792695, + "grad_norm": 9.389608383178711, + "learning_rate": 8.295739348370928e-05, + "loss": 2.3733, + "step": 339 + }, + { + "epoch": 0.02559325542445285, + "grad_norm": 6.236713886260986, + "learning_rate": 8.320802005012532e-05, + "loss": 2.4303, + "step": 340 + }, + { + "epoch": 0.025668529705113005, + "grad_norm": 9.761801719665527, + "learning_rate": 8.345864661654136e-05, + "loss": 2.7781, + "step": 341 + }, + { + "epoch": 0.02574380398577316, + "grad_norm": 7.167248725891113, + "learning_rate": 8.37092731829574e-05, + "loss": 2.3314, + "step": 342 + }, + { + "epoch": 0.025819078266433315, + "grad_norm": 9.095253944396973, + "learning_rate": 8.395989974937344e-05, + "loss": 2.759, + "step": 343 + }, + { + "epoch": 0.02589435254709347, + "grad_norm": 8.229307174682617, + "learning_rate": 8.421052631578948e-05, + "loss": 3.0881, + "step": 344 + }, + { + "epoch": 0.02596962682775363, + "grad_norm": 5.681723117828369, + "learning_rate": 8.446115288220552e-05, + "loss": 2.3568, + "step": 345 + }, + { + "epoch": 0.026044901108413784, + "grad_norm": 6.981089115142822, + "learning_rate": 8.471177944862155e-05, + "loss": 2.7207, + "step": 346 + }, + { + "epoch": 0.02612017538907394, + "grad_norm": 8.043041229248047, + "learning_rate": 8.49624060150376e-05, + "loss": 3.4089, + "step": 347 + }, + { + "epoch": 0.026195449669734094, + "grad_norm": 11.413946151733398, + "learning_rate": 8.521303258145363e-05, + "loss": 2.6168, + "step": 348 + }, + { + "epoch": 0.02627072395039425, + "grad_norm": 8.622365951538086, + "learning_rate": 8.546365914786967e-05, + "loss": 2.2121, + "step": 349 + }, + { + "epoch": 0.026345998231054404, + "grad_norm": 7.51757287979126, + "learning_rate": 8.571428571428571e-05, + "loss": 2.6091, + "step": 350 + }, + { + "epoch": 0.02642127251171456, + "grad_norm": 8.134312629699707, + "learning_rate": 8.596491228070177e-05, + "loss": 3.0337, + "step": 351 + }, + { + "epoch": 0.026496546792374714, + "grad_norm": 10.239460945129395, + "learning_rate": 8.621553884711779e-05, + "loss": 2.252, + "step": 352 + }, + { + "epoch": 0.026571821073034872, + "grad_norm": 8.78095817565918, + "learning_rate": 8.646616541353384e-05, + "loss": 2.3559, + "step": 353 + }, + { + "epoch": 0.026647095353695027, + "grad_norm": 7.117091178894043, + "learning_rate": 8.671679197994987e-05, + "loss": 2.4834, + "step": 354 + }, + { + "epoch": 0.026722369634355182, + "grad_norm": 24.86765480041504, + "learning_rate": 8.696741854636592e-05, + "loss": 2.7105, + "step": 355 + }, + { + "epoch": 0.026797643915015337, + "grad_norm": 11.62508487701416, + "learning_rate": 8.721804511278195e-05, + "loss": 2.3604, + "step": 356 + }, + { + "epoch": 0.026872918195675492, + "grad_norm": 15.0181303024292, + "learning_rate": 8.7468671679198e-05, + "loss": 2.7977, + "step": 357 + }, + { + "epoch": 0.026948192476335647, + "grad_norm": 10.290148735046387, + "learning_rate": 8.771929824561403e-05, + "loss": 2.2396, + "step": 358 + }, + { + "epoch": 0.027023466756995802, + "grad_norm": 10.377724647521973, + "learning_rate": 8.796992481203008e-05, + "loss": 2.727, + "step": 359 + }, + { + "epoch": 0.027098741037655957, + "grad_norm": 8.549407005310059, + "learning_rate": 8.822055137844612e-05, + "loss": 2.5502, + "step": 360 + }, + { + "epoch": 0.027174015318316116, + "grad_norm": 8.503774642944336, + "learning_rate": 8.847117794486216e-05, + "loss": 2.5421, + "step": 361 + }, + { + "epoch": 0.02724928959897627, + "grad_norm": 10.217077255249023, + "learning_rate": 8.87218045112782e-05, + "loss": 2.2306, + "step": 362 + }, + { + "epoch": 0.027324563879636426, + "grad_norm": 7.723809242248535, + "learning_rate": 8.897243107769424e-05, + "loss": 2.3909, + "step": 363 + }, + { + "epoch": 0.02739983816029658, + "grad_norm": 10.626895904541016, + "learning_rate": 8.922305764411028e-05, + "loss": 2.4588, + "step": 364 + }, + { + "epoch": 0.027475112440956736, + "grad_norm": 8.87508773803711, + "learning_rate": 8.947368421052632e-05, + "loss": 2.4269, + "step": 365 + }, + { + "epoch": 0.02755038672161689, + "grad_norm": 7.231565952301025, + "learning_rate": 8.972431077694236e-05, + "loss": 2.426, + "step": 366 + }, + { + "epoch": 0.027625661002277046, + "grad_norm": 6.363231658935547, + "learning_rate": 8.99749373433584e-05, + "loss": 2.2761, + "step": 367 + }, + { + "epoch": 0.0277009352829372, + "grad_norm": 10.780290603637695, + "learning_rate": 9.022556390977444e-05, + "loss": 2.9784, + "step": 368 + }, + { + "epoch": 0.02777620956359736, + "grad_norm": 10.052960395812988, + "learning_rate": 9.047619047619048e-05, + "loss": 2.257, + "step": 369 + }, + { + "epoch": 0.027851483844257514, + "grad_norm": 5.662045478820801, + "learning_rate": 9.072681704260652e-05, + "loss": 2.425, + "step": 370 + }, + { + "epoch": 0.02792675812491767, + "grad_norm": 7.004008769989014, + "learning_rate": 9.097744360902256e-05, + "loss": 2.2103, + "step": 371 + }, + { + "epoch": 0.028002032405577824, + "grad_norm": 12.820517539978027, + "learning_rate": 9.12280701754386e-05, + "loss": 2.7579, + "step": 372 + }, + { + "epoch": 0.02807730668623798, + "grad_norm": 7.78767728805542, + "learning_rate": 9.147869674185465e-05, + "loss": 2.5058, + "step": 373 + }, + { + "epoch": 0.028152580966898134, + "grad_norm": 8.198756217956543, + "learning_rate": 9.172932330827067e-05, + "loss": 2.4175, + "step": 374 + }, + { + "epoch": 0.02822785524755829, + "grad_norm": 10.630598068237305, + "learning_rate": 9.197994987468673e-05, + "loss": 2.557, + "step": 375 + }, + { + "epoch": 0.028303129528218444, + "grad_norm": 7.511836528778076, + "learning_rate": 9.223057644110275e-05, + "loss": 2.4169, + "step": 376 + }, + { + "epoch": 0.028378403808878603, + "grad_norm": 8.69015121459961, + "learning_rate": 9.24812030075188e-05, + "loss": 2.5395, + "step": 377 + }, + { + "epoch": 0.028453678089538758, + "grad_norm": 8.51764965057373, + "learning_rate": 9.273182957393483e-05, + "loss": 2.3876, + "step": 378 + }, + { + "epoch": 0.028528952370198913, + "grad_norm": 8.147504806518555, + "learning_rate": 9.298245614035089e-05, + "loss": 2.6241, + "step": 379 + }, + { + "epoch": 0.028604226650859068, + "grad_norm": 6.697113513946533, + "learning_rate": 9.323308270676691e-05, + "loss": 2.4303, + "step": 380 + }, + { + "epoch": 0.028679500931519223, + "grad_norm": 7.759677886962891, + "learning_rate": 9.348370927318296e-05, + "loss": 2.6293, + "step": 381 + }, + { + "epoch": 0.028754775212179378, + "grad_norm": 9.727107048034668, + "learning_rate": 9.373433583959899e-05, + "loss": 2.5487, + "step": 382 + }, + { + "epoch": 0.028830049492839533, + "grad_norm": 13.338135719299316, + "learning_rate": 9.398496240601504e-05, + "loss": 2.5155, + "step": 383 + }, + { + "epoch": 0.028905323773499688, + "grad_norm": 7.734348773956299, + "learning_rate": 9.423558897243108e-05, + "loss": 2.8259, + "step": 384 + }, + { + "epoch": 0.028980598054159847, + "grad_norm": 7.431156158447266, + "learning_rate": 9.448621553884712e-05, + "loss": 2.6064, + "step": 385 + }, + { + "epoch": 0.02905587233482, + "grad_norm": 8.593841552734375, + "learning_rate": 9.473684210526316e-05, + "loss": 2.8209, + "step": 386 + }, + { + "epoch": 0.029131146615480157, + "grad_norm": 8.319141387939453, + "learning_rate": 9.49874686716792e-05, + "loss": 2.4627, + "step": 387 + }, + { + "epoch": 0.02920642089614031, + "grad_norm": 7.528913974761963, + "learning_rate": 9.523809523809524e-05, + "loss": 2.4552, + "step": 388 + }, + { + "epoch": 0.029281695176800467, + "grad_norm": 9.121739387512207, + "learning_rate": 9.548872180451128e-05, + "loss": 2.8873, + "step": 389 + }, + { + "epoch": 0.02935696945746062, + "grad_norm": 11.068231582641602, + "learning_rate": 9.573934837092732e-05, + "loss": 2.595, + "step": 390 + }, + { + "epoch": 0.029432243738120777, + "grad_norm": 6.988208293914795, + "learning_rate": 9.598997493734336e-05, + "loss": 2.345, + "step": 391 + }, + { + "epoch": 0.02950751801878093, + "grad_norm": 7.696998596191406, + "learning_rate": 9.62406015037594e-05, + "loss": 3.0609, + "step": 392 + }, + { + "epoch": 0.02958279229944109, + "grad_norm": 9.24257755279541, + "learning_rate": 9.649122807017544e-05, + "loss": 2.2814, + "step": 393 + }, + { + "epoch": 0.029658066580101245, + "grad_norm": 4.109235763549805, + "learning_rate": 9.674185463659148e-05, + "loss": 2.4879, + "step": 394 + }, + { + "epoch": 0.0297333408607614, + "grad_norm": 6.239951133728027, + "learning_rate": 9.699248120300752e-05, + "loss": 2.3889, + "step": 395 + }, + { + "epoch": 0.029808615141421555, + "grad_norm": 7.952789783477783, + "learning_rate": 9.724310776942356e-05, + "loss": 2.3344, + "step": 396 + }, + { + "epoch": 0.02988388942208171, + "grad_norm": 9.38506031036377, + "learning_rate": 9.749373433583961e-05, + "loss": 2.5513, + "step": 397 + }, + { + "epoch": 0.029959163702741865, + "grad_norm": 10.43117904663086, + "learning_rate": 9.774436090225564e-05, + "loss": 2.3323, + "step": 398 + }, + { + "epoch": 0.03003443798340202, + "grad_norm": 10.958954811096191, + "learning_rate": 9.799498746867169e-05, + "loss": 2.3101, + "step": 399 + }, + { + "epoch": 0.030109712264062175, + "grad_norm": 11.921953201293945, + "learning_rate": 9.824561403508771e-05, + "loss": 2.6906, + "step": 400 + }, + { + "epoch": 0.030184986544722334, + "grad_norm": 9.990327835083008, + "learning_rate": 9.849624060150377e-05, + "loss": 2.3335, + "step": 401 + }, + { + "epoch": 0.03026026082538249, + "grad_norm": 7.558042526245117, + "learning_rate": 9.87468671679198e-05, + "loss": 2.1983, + "step": 402 + }, + { + "epoch": 0.030335535106042644, + "grad_norm": 6.795205593109131, + "learning_rate": 9.899749373433585e-05, + "loss": 2.3898, + "step": 403 + }, + { + "epoch": 0.0304108093867028, + "grad_norm": 7.895014762878418, + "learning_rate": 9.924812030075187e-05, + "loss": 2.6437, + "step": 404 + }, + { + "epoch": 0.030486083667362954, + "grad_norm": 7.704390525817871, + "learning_rate": 9.949874686716793e-05, + "loss": 2.5093, + "step": 405 + }, + { + "epoch": 0.03056135794802311, + "grad_norm": 5.852281093597412, + "learning_rate": 9.974937343358397e-05, + "loss": 2.4512, + "step": 406 + }, + { + "epoch": 0.030636632228683264, + "grad_norm": 7.343565464019775, + "learning_rate": 0.0001, + "loss": 2.7585, + "step": 407 + }, + { + "epoch": 0.03071190650934342, + "grad_norm": 7.538259506225586, + "learning_rate": 9.999999851382173e-05, + "loss": 3.057, + "step": 408 + }, + { + "epoch": 0.030787180790003577, + "grad_norm": 8.134474754333496, + "learning_rate": 9.999999405528702e-05, + "loss": 2.5448, + "step": 409 + }, + { + "epoch": 0.030862455070663732, + "grad_norm": 11.538702011108398, + "learning_rate": 9.999998662439613e-05, + "loss": 2.5369, + "step": 410 + }, + { + "epoch": 0.030937729351323887, + "grad_norm": 6.254512786865234, + "learning_rate": 9.99999762211495e-05, + "loss": 2.5793, + "step": 411 + }, + { + "epoch": 0.031013003631984042, + "grad_norm": 9.33852481842041, + "learning_rate": 9.999996284554776e-05, + "loss": 2.7659, + "step": 412 + }, + { + "epoch": 0.031088277912644197, + "grad_norm": 9.421518325805664, + "learning_rate": 9.999994649759168e-05, + "loss": 3.3198, + "step": 413 + }, + { + "epoch": 0.031163552193304352, + "grad_norm": 5.423511505126953, + "learning_rate": 9.999992717728223e-05, + "loss": 2.4872, + "step": 414 + }, + { + "epoch": 0.031238826473964507, + "grad_norm": 8.835334777832031, + "learning_rate": 9.99999048846206e-05, + "loss": 2.3842, + "step": 415 + }, + { + "epoch": 0.031314100754624666, + "grad_norm": 9.993122100830078, + "learning_rate": 9.999987961960808e-05, + "loss": 2.493, + "step": 416 + }, + { + "epoch": 0.03138937503528482, + "grad_norm": 10.540339469909668, + "learning_rate": 9.999985138224619e-05, + "loss": 2.8218, + "step": 417 + }, + { + "epoch": 0.031464649315944976, + "grad_norm": 15.810820579528809, + "learning_rate": 9.99998201725366e-05, + "loss": 2.6097, + "step": 418 + }, + { + "epoch": 0.03153992359660513, + "grad_norm": 17.602453231811523, + "learning_rate": 9.999978599048117e-05, + "loss": 2.363, + "step": 419 + }, + { + "epoch": 0.031615197877265286, + "grad_norm": 13.12601089477539, + "learning_rate": 9.999974883608192e-05, + "loss": 2.595, + "step": 420 + }, + { + "epoch": 0.03169047215792544, + "grad_norm": 12.39922046661377, + "learning_rate": 9.999970870934108e-05, + "loss": 2.4596, + "step": 421 + }, + { + "epoch": 0.031765746438585596, + "grad_norm": 6.809163570404053, + "learning_rate": 9.9999665610261e-05, + "loss": 2.5117, + "step": 422 + }, + { + "epoch": 0.03184102071924575, + "grad_norm": 7.833404064178467, + "learning_rate": 9.999961953884427e-05, + "loss": 2.6987, + "step": 423 + }, + { + "epoch": 0.031916294999905906, + "grad_norm": 9.833885192871094, + "learning_rate": 9.999957049509365e-05, + "loss": 2.9772, + "step": 424 + }, + { + "epoch": 0.03199156928056606, + "grad_norm": 9.477235794067383, + "learning_rate": 9.9999518479012e-05, + "loss": 2.6532, + "step": 425 + }, + { + "epoch": 0.032066843561226216, + "grad_norm": 8.576098442077637, + "learning_rate": 9.999946349060245e-05, + "loss": 2.2844, + "step": 426 + }, + { + "epoch": 0.03214211784188637, + "grad_norm": 5.725990295410156, + "learning_rate": 9.999940552986826e-05, + "loss": 2.3742, + "step": 427 + }, + { + "epoch": 0.032217392122546526, + "grad_norm": 10.287811279296875, + "learning_rate": 9.999934459681288e-05, + "loss": 2.6301, + "step": 428 + }, + { + "epoch": 0.03229266640320668, + "grad_norm": 8.580178260803223, + "learning_rate": 9.999928069143993e-05, + "loss": 2.8983, + "step": 429 + }, + { + "epoch": 0.03236794068386684, + "grad_norm": 6.716117858886719, + "learning_rate": 9.999921381375319e-05, + "loss": 2.4656, + "step": 430 + }, + { + "epoch": 0.032443214964527, + "grad_norm": 6.798586368560791, + "learning_rate": 9.999914396375668e-05, + "loss": 2.8239, + "step": 431 + }, + { + "epoch": 0.03251848924518715, + "grad_norm": 4.684966564178467, + "learning_rate": 9.99990711414545e-05, + "loss": 2.4202, + "step": 432 + }, + { + "epoch": 0.03259376352584731, + "grad_norm": 8.419984817504883, + "learning_rate": 9.999899534685102e-05, + "loss": 2.4697, + "step": 433 + }, + { + "epoch": 0.03266903780650746, + "grad_norm": 12.05478286743164, + "learning_rate": 9.99989165799507e-05, + "loss": 2.6111, + "step": 434 + }, + { + "epoch": 0.03274431208716762, + "grad_norm": 7.781139850616455, + "learning_rate": 9.999883484075829e-05, + "loss": 2.7987, + "step": 435 + }, + { + "epoch": 0.03281958636782777, + "grad_norm": 5.817256927490234, + "learning_rate": 9.99987501292786e-05, + "loss": 2.1392, + "step": 436 + }, + { + "epoch": 0.03289486064848793, + "grad_norm": 7.937943458557129, + "learning_rate": 9.999866244551667e-05, + "loss": 2.5918, + "step": 437 + }, + { + "epoch": 0.03297013492914808, + "grad_norm": 5.787116050720215, + "learning_rate": 9.999857178947772e-05, + "loss": 2.3482, + "step": 438 + }, + { + "epoch": 0.03304540920980824, + "grad_norm": 8.704986572265625, + "learning_rate": 9.999847816116714e-05, + "loss": 2.5018, + "step": 439 + }, + { + "epoch": 0.03312068349046839, + "grad_norm": 8.096321105957031, + "learning_rate": 9.999838156059051e-05, + "loss": 2.3744, + "step": 440 + }, + { + "epoch": 0.03319595777112855, + "grad_norm": 8.831674575805664, + "learning_rate": 9.999828198775354e-05, + "loss": 2.6243, + "step": 441 + }, + { + "epoch": 0.0332712320517887, + "grad_norm": 9.94340991973877, + "learning_rate": 9.999817944266216e-05, + "loss": 2.5779, + "step": 442 + }, + { + "epoch": 0.03334650633244886, + "grad_norm": 9.487676620483398, + "learning_rate": 9.99980739253225e-05, + "loss": 2.5221, + "step": 443 + }, + { + "epoch": 0.03342178061310901, + "grad_norm": 5.622339725494385, + "learning_rate": 9.999796543574078e-05, + "loss": 2.4754, + "step": 444 + }, + { + "epoch": 0.03349705489376917, + "grad_norm": 7.879566669464111, + "learning_rate": 9.999785397392349e-05, + "loss": 2.7646, + "step": 445 + }, + { + "epoch": 0.03357232917442933, + "grad_norm": 7.020406246185303, + "learning_rate": 9.999773953987724e-05, + "loss": 2.6179, + "step": 446 + }, + { + "epoch": 0.033647603455089485, + "grad_norm": 8.960833549499512, + "learning_rate": 9.999762213360884e-05, + "loss": 2.7114, + "step": 447 + }, + { + "epoch": 0.03372287773574964, + "grad_norm": 7.174793243408203, + "learning_rate": 9.999750175512524e-05, + "loss": 2.2737, + "step": 448 + }, + { + "epoch": 0.033798152016409795, + "grad_norm": 9.050108909606934, + "learning_rate": 9.999737840443364e-05, + "loss": 2.5019, + "step": 449 + }, + { + "epoch": 0.03387342629706995, + "grad_norm": 4.7563886642456055, + "learning_rate": 9.999725208154135e-05, + "loss": 2.3605, + "step": 450 + }, + { + "epoch": 0.033948700577730105, + "grad_norm": 5.7538065910339355, + "learning_rate": 9.999712278645587e-05, + "loss": 2.4869, + "step": 451 + }, + { + "epoch": 0.03402397485839026, + "grad_norm": 4.971479415893555, + "learning_rate": 9.999699051918491e-05, + "loss": 2.7504, + "step": 452 + }, + { + "epoch": 0.034099249139050415, + "grad_norm": 8.01513671875, + "learning_rate": 9.999685527973633e-05, + "loss": 2.4672, + "step": 453 + }, + { + "epoch": 0.03417452341971057, + "grad_norm": 7.020472049713135, + "learning_rate": 9.999671706811814e-05, + "loss": 2.4146, + "step": 454 + }, + { + "epoch": 0.034249797700370725, + "grad_norm": 7.484527111053467, + "learning_rate": 9.999657588433858e-05, + "loss": 2.1549, + "step": 455 + }, + { + "epoch": 0.03432507198103088, + "grad_norm": 6.460445880889893, + "learning_rate": 9.999643172840605e-05, + "loss": 2.5219, + "step": 456 + }, + { + "epoch": 0.034400346261691035, + "grad_norm": 8.006592750549316, + "learning_rate": 9.99962846003291e-05, + "loss": 2.3652, + "step": 457 + }, + { + "epoch": 0.03447562054235119, + "grad_norm": 5.3611907958984375, + "learning_rate": 9.999613450011648e-05, + "loss": 2.6902, + "step": 458 + }, + { + "epoch": 0.034550894823011345, + "grad_norm": 8.275751113891602, + "learning_rate": 9.999598142777715e-05, + "loss": 2.6116, + "step": 459 + }, + { + "epoch": 0.0346261691036715, + "grad_norm": 6.2943830490112305, + "learning_rate": 9.999582538332016e-05, + "loss": 2.216, + "step": 460 + }, + { + "epoch": 0.034701443384331655, + "grad_norm": 5.55256462097168, + "learning_rate": 9.999566636675479e-05, + "loss": 2.6334, + "step": 461 + }, + { + "epoch": 0.03477671766499182, + "grad_norm": 8.560436248779297, + "learning_rate": 9.999550437809053e-05, + "loss": 2.2581, + "step": 462 + }, + { + "epoch": 0.03485199194565197, + "grad_norm": 6.974269866943359, + "learning_rate": 9.999533941733699e-05, + "loss": 2.3932, + "step": 463 + }, + { + "epoch": 0.03492726622631213, + "grad_norm": 6.966434478759766, + "learning_rate": 9.999517148450395e-05, + "loss": 2.5843, + "step": 464 + }, + { + "epoch": 0.03500254050697228, + "grad_norm": 8.39385986328125, + "learning_rate": 9.999500057960142e-05, + "loss": 2.3989, + "step": 465 + }, + { + "epoch": 0.03507781478763244, + "grad_norm": 8.003994941711426, + "learning_rate": 9.999482670263958e-05, + "loss": 2.7878, + "step": 466 + }, + { + "epoch": 0.03515308906829259, + "grad_norm": 6.788640022277832, + "learning_rate": 9.999464985362875e-05, + "loss": 2.7111, + "step": 467 + }, + { + "epoch": 0.03522836334895275, + "grad_norm": 8.72549819946289, + "learning_rate": 9.99944700325794e-05, + "loss": 2.5827, + "step": 468 + }, + { + "epoch": 0.0353036376296129, + "grad_norm": 8.082274436950684, + "learning_rate": 9.999428723950228e-05, + "loss": 2.4333, + "step": 469 + }, + { + "epoch": 0.03537891191027306, + "grad_norm": 5.939600944519043, + "learning_rate": 9.999410147440823e-05, + "loss": 2.4862, + "step": 470 + }, + { + "epoch": 0.03545418619093321, + "grad_norm": 4.3719024658203125, + "learning_rate": 9.999391273730829e-05, + "loss": 2.2626, + "step": 471 + }, + { + "epoch": 0.03552946047159337, + "grad_norm": 6.722690105438232, + "learning_rate": 9.99937210282137e-05, + "loss": 2.3747, + "step": 472 + }, + { + "epoch": 0.03560473475225352, + "grad_norm": 6.562391757965088, + "learning_rate": 9.999352634713584e-05, + "loss": 2.6484, + "step": 473 + }, + { + "epoch": 0.03568000903291368, + "grad_norm": 11.552745819091797, + "learning_rate": 9.999332869408629e-05, + "loss": 2.5665, + "step": 474 + }, + { + "epoch": 0.03575528331357383, + "grad_norm": 9.434261322021484, + "learning_rate": 9.999312806907676e-05, + "loss": 2.575, + "step": 475 + }, + { + "epoch": 0.03583055759423399, + "grad_norm": 5.114918231964111, + "learning_rate": 9.999292447211926e-05, + "loss": 2.4397, + "step": 476 + }, + { + "epoch": 0.03590583187489414, + "grad_norm": 7.138734340667725, + "learning_rate": 9.999271790322581e-05, + "loss": 2.5623, + "step": 477 + }, + { + "epoch": 0.035981106155554304, + "grad_norm": 6.426302909851074, + "learning_rate": 9.999250836240876e-05, + "loss": 2.444, + "step": 478 + }, + { + "epoch": 0.03605638043621446, + "grad_norm": 6.861083030700684, + "learning_rate": 9.99922958496805e-05, + "loss": 2.7744, + "step": 479 + }, + { + "epoch": 0.036131654716874614, + "grad_norm": 6.147741794586182, + "learning_rate": 9.999208036505372e-05, + "loss": 2.3591, + "step": 480 + }, + { + "epoch": 0.03620692899753477, + "grad_norm": 7.643355846405029, + "learning_rate": 9.99918619085412e-05, + "loss": 2.6998, + "step": 481 + }, + { + "epoch": 0.036282203278194924, + "grad_norm": 6.473873138427734, + "learning_rate": 9.999164048015593e-05, + "loss": 2.425, + "step": 482 + }, + { + "epoch": 0.03635747755885508, + "grad_norm": 6.032350063323975, + "learning_rate": 9.999141607991107e-05, + "loss": 2.2523, + "step": 483 + }, + { + "epoch": 0.036432751839515234, + "grad_norm": 6.577512741088867, + "learning_rate": 9.999118870781996e-05, + "loss": 2.3701, + "step": 484 + }, + { + "epoch": 0.03650802612017539, + "grad_norm": 9.384925842285156, + "learning_rate": 9.999095836389613e-05, + "loss": 2.4511, + "step": 485 + }, + { + "epoch": 0.036583300400835544, + "grad_norm": 5.95685338973999, + "learning_rate": 9.999072504815328e-05, + "loss": 2.3332, + "step": 486 + }, + { + "epoch": 0.0366585746814957, + "grad_norm": 4.993174076080322, + "learning_rate": 9.999048876060525e-05, + "loss": 2.5059, + "step": 487 + }, + { + "epoch": 0.036733848962155854, + "grad_norm": 8.872503280639648, + "learning_rate": 9.999024950126609e-05, + "loss": 2.3738, + "step": 488 + }, + { + "epoch": 0.03680912324281601, + "grad_norm": 8.740178108215332, + "learning_rate": 9.999000727015007e-05, + "loss": 2.4354, + "step": 489 + }, + { + "epoch": 0.036884397523476165, + "grad_norm": 7.707362174987793, + "learning_rate": 9.998976206727153e-05, + "loss": 2.7935, + "step": 490 + }, + { + "epoch": 0.03695967180413632, + "grad_norm": 5.581366062164307, + "learning_rate": 9.998951389264508e-05, + "loss": 2.4497, + "step": 491 + }, + { + "epoch": 0.037034946084796475, + "grad_norm": 6.279916286468506, + "learning_rate": 9.998926274628546e-05, + "loss": 2.5647, + "step": 492 + }, + { + "epoch": 0.03711022036545663, + "grad_norm": 4.6116156578063965, + "learning_rate": 9.99890086282076e-05, + "loss": 2.3364, + "step": 493 + }, + { + "epoch": 0.03718549464611679, + "grad_norm": 5.729796886444092, + "learning_rate": 9.998875153842662e-05, + "loss": 2.473, + "step": 494 + }, + { + "epoch": 0.037260768926776947, + "grad_norm": 5.820767879486084, + "learning_rate": 9.99884914769578e-05, + "loss": 2.3944, + "step": 495 + }, + { + "epoch": 0.0373360432074371, + "grad_norm": 7.453742980957031, + "learning_rate": 9.998822844381659e-05, + "loss": 2.5428, + "step": 496 + }, + { + "epoch": 0.03741131748809726, + "grad_norm": 7.591287136077881, + "learning_rate": 9.998796243901862e-05, + "loss": 2.5782, + "step": 497 + }, + { + "epoch": 0.03748659176875741, + "grad_norm": 7.705503940582275, + "learning_rate": 9.998769346257972e-05, + "loss": 2.8893, + "step": 498 + }, + { + "epoch": 0.03756186604941757, + "grad_norm": 7.578208923339844, + "learning_rate": 9.998742151451588e-05, + "loss": 2.3754, + "step": 499 + }, + { + "epoch": 0.03763714033007772, + "grad_norm": 9.235424041748047, + "learning_rate": 9.998714659484326e-05, + "loss": 2.1838, + "step": 500 + }, + { + "epoch": 0.03771241461073788, + "grad_norm": 5.14205265045166, + "learning_rate": 9.99868687035782e-05, + "loss": 2.2518, + "step": 501 + }, + { + "epoch": 0.03778768889139803, + "grad_norm": 8.834981918334961, + "learning_rate": 9.998658784073723e-05, + "loss": 2.5651, + "step": 502 + }, + { + "epoch": 0.03786296317205819, + "grad_norm": 6.567141532897949, + "learning_rate": 9.998630400633705e-05, + "loss": 2.3154, + "step": 503 + }, + { + "epoch": 0.03793823745271834, + "grad_norm": 7.388275146484375, + "learning_rate": 9.99860172003945e-05, + "loss": 2.0584, + "step": 504 + }, + { + "epoch": 0.0380135117333785, + "grad_norm": 5.03045654296875, + "learning_rate": 9.998572742292666e-05, + "loss": 2.4622, + "step": 505 + }, + { + "epoch": 0.03808878601403865, + "grad_norm": 9.524864196777344, + "learning_rate": 9.998543467395076e-05, + "loss": 3.0264, + "step": 506 + }, + { + "epoch": 0.03816406029469881, + "grad_norm": 6.441607475280762, + "learning_rate": 9.998513895348418e-05, + "loss": 2.2738, + "step": 507 + }, + { + "epoch": 0.03823933457535896, + "grad_norm": 8.705236434936523, + "learning_rate": 9.998484026154453e-05, + "loss": 2.3245, + "step": 508 + }, + { + "epoch": 0.03831460885601912, + "grad_norm": 6.370028972625732, + "learning_rate": 9.998453859814953e-05, + "loss": 2.5218, + "step": 509 + }, + { + "epoch": 0.03838988313667927, + "grad_norm": 4.817743301391602, + "learning_rate": 9.998423396331715e-05, + "loss": 2.7828, + "step": 510 + }, + { + "epoch": 0.038465157417339434, + "grad_norm": 7.0961527824401855, + "learning_rate": 9.998392635706547e-05, + "loss": 2.475, + "step": 511 + }, + { + "epoch": 0.03854043169799959, + "grad_norm": 6.744603157043457, + "learning_rate": 9.998361577941279e-05, + "loss": 2.4948, + "step": 512 + }, + { + "epoch": 0.038615705978659744, + "grad_norm": 9.804177284240723, + "learning_rate": 9.998330223037757e-05, + "loss": 2.6092, + "step": 513 + }, + { + "epoch": 0.0386909802593199, + "grad_norm": 6.470457553863525, + "learning_rate": 9.998298570997846e-05, + "loss": 2.3828, + "step": 514 + }, + { + "epoch": 0.038766254539980054, + "grad_norm": 9.716097831726074, + "learning_rate": 9.998266621823426e-05, + "loss": 2.7292, + "step": 515 + }, + { + "epoch": 0.03884152882064021, + "grad_norm": 5.654953479766846, + "learning_rate": 9.998234375516396e-05, + "loss": 2.6963, + "step": 516 + }, + { + "epoch": 0.038916803101300364, + "grad_norm": 8.3915376663208, + "learning_rate": 9.998201832078675e-05, + "loss": 2.4861, + "step": 517 + }, + { + "epoch": 0.03899207738196052, + "grad_norm": 5.258091449737549, + "learning_rate": 9.998168991512197e-05, + "loss": 2.6424, + "step": 518 + }, + { + "epoch": 0.039067351662620674, + "grad_norm": 8.09157943725586, + "learning_rate": 9.998135853818913e-05, + "loss": 2.2812, + "step": 519 + }, + { + "epoch": 0.03914262594328083, + "grad_norm": 6.906360626220703, + "learning_rate": 9.998102419000794e-05, + "loss": 2.4174, + "step": 520 + }, + { + "epoch": 0.039217900223940984, + "grad_norm": 6.009600639343262, + "learning_rate": 9.998068687059827e-05, + "loss": 2.4074, + "step": 521 + }, + { + "epoch": 0.03929317450460114, + "grad_norm": 6.6244072914123535, + "learning_rate": 9.998034657998018e-05, + "loss": 2.4504, + "step": 522 + }, + { + "epoch": 0.039368448785261294, + "grad_norm": 8.764154434204102, + "learning_rate": 9.99800033181739e-05, + "loss": 2.2812, + "step": 523 + }, + { + "epoch": 0.03944372306592145, + "grad_norm": 6.884581565856934, + "learning_rate": 9.997965708519984e-05, + "loss": 2.1201, + "step": 524 + }, + { + "epoch": 0.039518997346581604, + "grad_norm": 4.965678691864014, + "learning_rate": 9.997930788107854e-05, + "loss": 2.4909, + "step": 525 + }, + { + "epoch": 0.03959427162724176, + "grad_norm": 7.456455230712891, + "learning_rate": 9.997895570583082e-05, + "loss": 2.6967, + "step": 526 + }, + { + "epoch": 0.03966954590790192, + "grad_norm": 9.428606986999512, + "learning_rate": 9.997860055947758e-05, + "loss": 2.7953, + "step": 527 + }, + { + "epoch": 0.039744820188562076, + "grad_norm": 7.258148193359375, + "learning_rate": 9.997824244203995e-05, + "loss": 2.3556, + "step": 528 + }, + { + "epoch": 0.03982009446922223, + "grad_norm": 5.05000638961792, + "learning_rate": 9.997788135353919e-05, + "loss": 2.209, + "step": 529 + }, + { + "epoch": 0.039895368749882386, + "grad_norm": 6.351719379425049, + "learning_rate": 9.99775172939968e-05, + "loss": 2.4862, + "step": 530 + }, + { + "epoch": 0.03997064303054254, + "grad_norm": 5.573503017425537, + "learning_rate": 9.997715026343442e-05, + "loss": 2.7496, + "step": 531 + }, + { + "epoch": 0.040045917311202696, + "grad_norm": 8.435409545898438, + "learning_rate": 9.997678026187385e-05, + "loss": 2.8963, + "step": 532 + }, + { + "epoch": 0.04012119159186285, + "grad_norm": 10.179231643676758, + "learning_rate": 9.997640728933708e-05, + "loss": 2.7065, + "step": 533 + }, + { + "epoch": 0.040196465872523006, + "grad_norm": 7.351372241973877, + "learning_rate": 9.99760313458463e-05, + "loss": 2.8242, + "step": 534 + }, + { + "epoch": 0.04027174015318316, + "grad_norm": 6.410159111022949, + "learning_rate": 9.997565243142386e-05, + "loss": 2.5468, + "step": 535 + }, + { + "epoch": 0.040347014433843316, + "grad_norm": 4.548403739929199, + "learning_rate": 9.997527054609229e-05, + "loss": 2.6557, + "step": 536 + }, + { + "epoch": 0.04042228871450347, + "grad_norm": 8.152257919311523, + "learning_rate": 9.997488568987425e-05, + "loss": 2.7431, + "step": 537 + }, + { + "epoch": 0.040497562995163626, + "grad_norm": 5.5484795570373535, + "learning_rate": 9.997449786279267e-05, + "loss": 2.3715, + "step": 538 + }, + { + "epoch": 0.04057283727582378, + "grad_norm": 8.208224296569824, + "learning_rate": 9.997410706487058e-05, + "loss": 2.4131, + "step": 539 + }, + { + "epoch": 0.040648111556483936, + "grad_norm": 11.589445114135742, + "learning_rate": 9.997371329613123e-05, + "loss": 2.347, + "step": 540 + }, + { + "epoch": 0.04072338583714409, + "grad_norm": 6.1390790939331055, + "learning_rate": 9.9973316556598e-05, + "loss": 2.3823, + "step": 541 + }, + { + "epoch": 0.040798660117804246, + "grad_norm": 6.050121307373047, + "learning_rate": 9.99729168462945e-05, + "loss": 2.3657, + "step": 542 + }, + { + "epoch": 0.04087393439846441, + "grad_norm": 9.08910846710205, + "learning_rate": 9.997251416524446e-05, + "loss": 3.0636, + "step": 543 + }, + { + "epoch": 0.04094920867912456, + "grad_norm": 5.6785383224487305, + "learning_rate": 9.997210851347187e-05, + "loss": 2.5232, + "step": 544 + }, + { + "epoch": 0.04102448295978472, + "grad_norm": 7.894153594970703, + "learning_rate": 9.997169989100081e-05, + "loss": 2.3731, + "step": 545 + }, + { + "epoch": 0.04109975724044487, + "grad_norm": 7.020706653594971, + "learning_rate": 9.997128829785555e-05, + "loss": 2.6216, + "step": 546 + }, + { + "epoch": 0.04117503152110503, + "grad_norm": 10.187527656555176, + "learning_rate": 9.997087373406063e-05, + "loss": 2.2493, + "step": 547 + }, + { + "epoch": 0.04125030580176518, + "grad_norm": 7.443100452423096, + "learning_rate": 9.997045619964061e-05, + "loss": 2.4561, + "step": 548 + }, + { + "epoch": 0.04132558008242534, + "grad_norm": 6.0415191650390625, + "learning_rate": 9.997003569462036e-05, + "loss": 2.4258, + "step": 549 + }, + { + "epoch": 0.04140085436308549, + "grad_norm": 8.120081901550293, + "learning_rate": 9.996961221902487e-05, + "loss": 2.609, + "step": 550 + }, + { + "epoch": 0.04147612864374565, + "grad_norm": 9.864767074584961, + "learning_rate": 9.99691857728793e-05, + "loss": 2.5037, + "step": 551 + }, + { + "epoch": 0.0415514029244058, + "grad_norm": 5.714476585388184, + "learning_rate": 9.996875635620903e-05, + "loss": 2.3865, + "step": 552 + }, + { + "epoch": 0.04162667720506596, + "grad_norm": 5.0489182472229, + "learning_rate": 9.996832396903957e-05, + "loss": 2.0701, + "step": 553 + }, + { + "epoch": 0.04170195148572611, + "grad_norm": 4.438973426818848, + "learning_rate": 9.996788861139664e-05, + "loss": 2.5801, + "step": 554 + }, + { + "epoch": 0.04177722576638627, + "grad_norm": 6.644481182098389, + "learning_rate": 9.996745028330607e-05, + "loss": 2.9687, + "step": 555 + }, + { + "epoch": 0.04185250004704642, + "grad_norm": 5.789854049682617, + "learning_rate": 9.996700898479397e-05, + "loss": 2.5746, + "step": 556 + }, + { + "epoch": 0.04192777432770658, + "grad_norm": 7.773401737213135, + "learning_rate": 9.996656471588657e-05, + "loss": 2.8735, + "step": 557 + }, + { + "epoch": 0.04200304860836673, + "grad_norm": 6.320061206817627, + "learning_rate": 9.996611747661024e-05, + "loss": 2.6319, + "step": 558 + }, + { + "epoch": 0.042078322889026895, + "grad_norm": 6.118007659912109, + "learning_rate": 9.996566726699163e-05, + "loss": 2.4507, + "step": 559 + }, + { + "epoch": 0.04215359716968705, + "grad_norm": 6.324763774871826, + "learning_rate": 9.996521408705744e-05, + "loss": 2.5678, + "step": 560 + }, + { + "epoch": 0.042228871450347205, + "grad_norm": 8.241517066955566, + "learning_rate": 9.996475793683465e-05, + "loss": 2.8031, + "step": 561 + }, + { + "epoch": 0.04230414573100736, + "grad_norm": 5.500481605529785, + "learning_rate": 9.996429881635036e-05, + "loss": 2.6753, + "step": 562 + }, + { + "epoch": 0.042379420011667515, + "grad_norm": 7.616339683532715, + "learning_rate": 9.996383672563187e-05, + "loss": 2.5969, + "step": 563 + }, + { + "epoch": 0.04245469429232767, + "grad_norm": 6.290395736694336, + "learning_rate": 9.996337166470667e-05, + "loss": 3.064, + "step": 564 + }, + { + "epoch": 0.042529968572987825, + "grad_norm": 6.949843406677246, + "learning_rate": 9.996290363360237e-05, + "loss": 2.4298, + "step": 565 + }, + { + "epoch": 0.04260524285364798, + "grad_norm": 5.546399116516113, + "learning_rate": 9.996243263234679e-05, + "loss": 2.625, + "step": 566 + }, + { + "epoch": 0.042680517134308135, + "grad_norm": 7.878125190734863, + "learning_rate": 9.996195866096798e-05, + "loss": 2.5205, + "step": 567 + }, + { + "epoch": 0.04275579141496829, + "grad_norm": 7.050788402557373, + "learning_rate": 9.996148171949406e-05, + "loss": 2.3841, + "step": 568 + }, + { + "epoch": 0.042831065695628445, + "grad_norm": 5.448616027832031, + "learning_rate": 9.996100180795342e-05, + "loss": 2.6443, + "step": 569 + }, + { + "epoch": 0.0429063399762886, + "grad_norm": 8.354663848876953, + "learning_rate": 9.996051892637457e-05, + "loss": 2.4081, + "step": 570 + }, + { + "epoch": 0.042981614256948755, + "grad_norm": 8.005802154541016, + "learning_rate": 9.996003307478624e-05, + "loss": 2.4417, + "step": 571 + }, + { + "epoch": 0.04305688853760891, + "grad_norm": 6.683032512664795, + "learning_rate": 9.995954425321728e-05, + "loss": 2.4664, + "step": 572 + }, + { + "epoch": 0.043132162818269065, + "grad_norm": 6.203315734863281, + "learning_rate": 9.995905246169675e-05, + "loss": 2.3466, + "step": 573 + }, + { + "epoch": 0.04320743709892922, + "grad_norm": 4.661404609680176, + "learning_rate": 9.995855770025393e-05, + "loss": 2.7309, + "step": 574 + }, + { + "epoch": 0.04328271137958938, + "grad_norm": 8.920833587646484, + "learning_rate": 9.995805996891819e-05, + "loss": 2.2883, + "step": 575 + }, + { + "epoch": 0.04335798566024954, + "grad_norm": 7.012353897094727, + "learning_rate": 9.995755926771912e-05, + "loss": 2.5002, + "step": 576 + }, + { + "epoch": 0.04343325994090969, + "grad_norm": 7.787505149841309, + "learning_rate": 9.99570555966865e-05, + "loss": 2.4203, + "step": 577 + }, + { + "epoch": 0.04350853422156985, + "grad_norm": 4.785691261291504, + "learning_rate": 9.995654895585028e-05, + "loss": 2.5132, + "step": 578 + }, + { + "epoch": 0.04358380850223, + "grad_norm": 7.340097904205322, + "learning_rate": 9.995603934524054e-05, + "loss": 2.0884, + "step": 579 + }, + { + "epoch": 0.04365908278289016, + "grad_norm": 7.178926944732666, + "learning_rate": 9.995552676488762e-05, + "loss": 2.449, + "step": 580 + }, + { + "epoch": 0.04373435706355031, + "grad_norm": 6.881811141967773, + "learning_rate": 9.995501121482199e-05, + "loss": 2.4076, + "step": 581 + }, + { + "epoch": 0.04380963134421047, + "grad_norm": 6.278635025024414, + "learning_rate": 9.995449269507425e-05, + "loss": 2.5474, + "step": 582 + }, + { + "epoch": 0.04388490562487062, + "grad_norm": 5.479218006134033, + "learning_rate": 9.995397120567527e-05, + "loss": 2.3536, + "step": 583 + }, + { + "epoch": 0.04396017990553078, + "grad_norm": 4.818369388580322, + "learning_rate": 9.995344674665604e-05, + "loss": 2.8498, + "step": 584 + }, + { + "epoch": 0.04403545418619093, + "grad_norm": 7.86108922958374, + "learning_rate": 9.995291931804771e-05, + "loss": 2.3705, + "step": 585 + }, + { + "epoch": 0.04411072846685109, + "grad_norm": 8.095151901245117, + "learning_rate": 9.995238891988167e-05, + "loss": 2.592, + "step": 586 + }, + { + "epoch": 0.04418600274751124, + "grad_norm": 6.724441051483154, + "learning_rate": 9.995185555218944e-05, + "loss": 2.5161, + "step": 587 + }, + { + "epoch": 0.0442612770281714, + "grad_norm": 5.763269424438477, + "learning_rate": 9.995131921500273e-05, + "loss": 2.5293, + "step": 588 + }, + { + "epoch": 0.04433655130883155, + "grad_norm": 4.651147842407227, + "learning_rate": 9.995077990835342e-05, + "loss": 2.6798, + "step": 589 + }, + { + "epoch": 0.04441182558949171, + "grad_norm": 6.430241584777832, + "learning_rate": 9.995023763227356e-05, + "loss": 2.2504, + "step": 590 + }, + { + "epoch": 0.04448709987015186, + "grad_norm": 6.222283840179443, + "learning_rate": 9.99496923867954e-05, + "loss": 2.4511, + "step": 591 + }, + { + "epoch": 0.044562374150812024, + "grad_norm": 6.536647319793701, + "learning_rate": 9.994914417195134e-05, + "loss": 2.5356, + "step": 592 + }, + { + "epoch": 0.04463764843147218, + "grad_norm": 6.688315391540527, + "learning_rate": 9.9948592987774e-05, + "loss": 2.4715, + "step": 593 + }, + { + "epoch": 0.044712922712132334, + "grad_norm": 6.8323163986206055, + "learning_rate": 9.994803883429612e-05, + "loss": 2.2406, + "step": 594 + }, + { + "epoch": 0.04478819699279249, + "grad_norm": 6.268343925476074, + "learning_rate": 9.994748171155064e-05, + "loss": 2.2256, + "step": 595 + }, + { + "epoch": 0.044863471273452644, + "grad_norm": 6.467589855194092, + "learning_rate": 9.99469216195707e-05, + "loss": 2.4958, + "step": 596 + }, + { + "epoch": 0.0449387455541128, + "grad_norm": 7.256012439727783, + "learning_rate": 9.994635855838956e-05, + "loss": 3.1424, + "step": 597 + }, + { + "epoch": 0.045014019834772954, + "grad_norm": 7.852339744567871, + "learning_rate": 9.994579252804073e-05, + "loss": 2.381, + "step": 598 + }, + { + "epoch": 0.04508929411543311, + "grad_norm": 5.958998680114746, + "learning_rate": 9.994522352855784e-05, + "loss": 2.4826, + "step": 599 + }, + { + "epoch": 0.045164568396093265, + "grad_norm": 8.991501808166504, + "learning_rate": 9.994465155997472e-05, + "loss": 2.4228, + "step": 600 + }, + { + "epoch": 0.04523984267675342, + "grad_norm": 4.996432781219482, + "learning_rate": 9.994407662232539e-05, + "loss": 2.4886, + "step": 601 + }, + { + "epoch": 0.045315116957413575, + "grad_norm": 7.754335403442383, + "learning_rate": 9.994349871564399e-05, + "loss": 2.0025, + "step": 602 + }, + { + "epoch": 0.04539039123807373, + "grad_norm": 6.884054660797119, + "learning_rate": 9.99429178399649e-05, + "loss": 2.1456, + "step": 603 + }, + { + "epoch": 0.045465665518733885, + "grad_norm": 7.160465240478516, + "learning_rate": 9.994233399532265e-05, + "loss": 2.4864, + "step": 604 + }, + { + "epoch": 0.04554093979939404, + "grad_norm": 10.531169891357422, + "learning_rate": 9.994174718175196e-05, + "loss": 3.0849, + "step": 605 + }, + { + "epoch": 0.045616214080054195, + "grad_norm": 7.141112327575684, + "learning_rate": 9.994115739928767e-05, + "loss": 2.5968, + "step": 606 + }, + { + "epoch": 0.04569148836071435, + "grad_norm": 6.742875576019287, + "learning_rate": 9.994056464796488e-05, + "loss": 2.2408, + "step": 607 + }, + { + "epoch": 0.04576676264137451, + "grad_norm": 6.189098358154297, + "learning_rate": 9.993996892781883e-05, + "loss": 2.2561, + "step": 608 + }, + { + "epoch": 0.04584203692203467, + "grad_norm": 5.1011643409729, + "learning_rate": 9.993937023888491e-05, + "loss": 2.8808, + "step": 609 + }, + { + "epoch": 0.04591731120269482, + "grad_norm": 5.701725006103516, + "learning_rate": 9.993876858119872e-05, + "loss": 2.3248, + "step": 610 + }, + { + "epoch": 0.04599258548335498, + "grad_norm": 5.765629768371582, + "learning_rate": 9.993816395479603e-05, + "loss": 2.2247, + "step": 611 + }, + { + "epoch": 0.04606785976401513, + "grad_norm": 4.022218704223633, + "learning_rate": 9.993755635971278e-05, + "loss": 2.7172, + "step": 612 + }, + { + "epoch": 0.04614313404467529, + "grad_norm": 5.479935646057129, + "learning_rate": 9.99369457959851e-05, + "loss": 2.5582, + "step": 613 + }, + { + "epoch": 0.04621840832533544, + "grad_norm": 8.133018493652344, + "learning_rate": 9.993633226364927e-05, + "loss": 2.3932, + "step": 614 + }, + { + "epoch": 0.0462936826059956, + "grad_norm": 5.17117166519165, + "learning_rate": 9.993571576274177e-05, + "loss": 2.537, + "step": 615 + }, + { + "epoch": 0.04636895688665575, + "grad_norm": 5.570759296417236, + "learning_rate": 9.993509629329925e-05, + "loss": 2.524, + "step": 616 + }, + { + "epoch": 0.04644423116731591, + "grad_norm": 4.409459590911865, + "learning_rate": 9.993447385535853e-05, + "loss": 2.5767, + "step": 617 + }, + { + "epoch": 0.04651950544797606, + "grad_norm": 6.566601276397705, + "learning_rate": 9.993384844895662e-05, + "loss": 2.1811, + "step": 618 + }, + { + "epoch": 0.04659477972863622, + "grad_norm": 5.1130475997924805, + "learning_rate": 9.99332200741307e-05, + "loss": 2.6017, + "step": 619 + }, + { + "epoch": 0.04667005400929637, + "grad_norm": 5.931060791015625, + "learning_rate": 9.993258873091813e-05, + "loss": 2.6096, + "step": 620 + }, + { + "epoch": 0.04674532828995653, + "grad_norm": 7.065201759338379, + "learning_rate": 9.993195441935642e-05, + "loss": 3.1509, + "step": 621 + }, + { + "epoch": 0.04682060257061668, + "grad_norm": 6.598495960235596, + "learning_rate": 9.99313171394833e-05, + "loss": 2.3957, + "step": 622 + }, + { + "epoch": 0.04689587685127684, + "grad_norm": 6.731385707855225, + "learning_rate": 9.993067689133663e-05, + "loss": 2.4509, + "step": 623 + }, + { + "epoch": 0.046971151131937, + "grad_norm": 3.9178264141082764, + "learning_rate": 9.99300336749545e-05, + "loss": 2.2971, + "step": 624 + }, + { + "epoch": 0.047046425412597154, + "grad_norm": 7.1814656257629395, + "learning_rate": 9.992938749037513e-05, + "loss": 2.2139, + "step": 625 + }, + { + "epoch": 0.04712169969325731, + "grad_norm": 5.879206657409668, + "learning_rate": 9.992873833763693e-05, + "loss": 2.6437, + "step": 626 + }, + { + "epoch": 0.047196973973917464, + "grad_norm": 7.691930770874023, + "learning_rate": 9.992808621677849e-05, + "loss": 2.3726, + "step": 627 + }, + { + "epoch": 0.04727224825457762, + "grad_norm": 6.117953777313232, + "learning_rate": 9.99274311278386e-05, + "loss": 2.398, + "step": 628 + }, + { + "epoch": 0.047347522535237774, + "grad_norm": 5.077375888824463, + "learning_rate": 9.992677307085618e-05, + "loss": 2.3538, + "step": 629 + }, + { + "epoch": 0.04742279681589793, + "grad_norm": 5.957877159118652, + "learning_rate": 9.992611204587036e-05, + "loss": 2.3064, + "step": 630 + }, + { + "epoch": 0.047498071096558084, + "grad_norm": 8.05428409576416, + "learning_rate": 9.992544805292044e-05, + "loss": 2.4879, + "step": 631 + }, + { + "epoch": 0.04757334537721824, + "grad_norm": 7.52317476272583, + "learning_rate": 9.992478109204589e-05, + "loss": 2.1448, + "step": 632 + }, + { + "epoch": 0.047648619657878394, + "grad_norm": 7.252506256103516, + "learning_rate": 9.992411116328634e-05, + "loss": 2.6262, + "step": 633 + }, + { + "epoch": 0.04772389393853855, + "grad_norm": 9.12913703918457, + "learning_rate": 9.992343826668164e-05, + "loss": 2.5351, + "step": 634 + }, + { + "epoch": 0.047799168219198704, + "grad_norm": 8.532181739807129, + "learning_rate": 9.992276240227176e-05, + "loss": 2.3441, + "step": 635 + }, + { + "epoch": 0.04787444249985886, + "grad_norm": 7.963910102844238, + "learning_rate": 9.992208357009692e-05, + "loss": 2.3961, + "step": 636 + }, + { + "epoch": 0.047949716780519014, + "grad_norm": 8.184015274047852, + "learning_rate": 9.992140177019745e-05, + "loss": 2.5177, + "step": 637 + }, + { + "epoch": 0.04802499106117917, + "grad_norm": 12.021905899047852, + "learning_rate": 9.992071700261389e-05, + "loss": 2.2839, + "step": 638 + }, + { + "epoch": 0.048100265341839324, + "grad_norm": 6.142306804656982, + "learning_rate": 9.992002926738693e-05, + "loss": 2.4884, + "step": 639 + }, + { + "epoch": 0.048175539622499486, + "grad_norm": 6.569483280181885, + "learning_rate": 9.991933856455747e-05, + "loss": 2.2254, + "step": 640 + }, + { + "epoch": 0.04825081390315964, + "grad_norm": 6.3050923347473145, + "learning_rate": 9.991864489416658e-05, + "loss": 2.4087, + "step": 641 + }, + { + "epoch": 0.048326088183819796, + "grad_norm": 10.657489776611328, + "learning_rate": 9.991794825625547e-05, + "loss": 2.4658, + "step": 642 + }, + { + "epoch": 0.04840136246447995, + "grad_norm": 6.772811412811279, + "learning_rate": 9.991724865086558e-05, + "loss": 2.42, + "step": 643 + }, + { + "epoch": 0.048476636745140106, + "grad_norm": 6.863295078277588, + "learning_rate": 9.991654607803847e-05, + "loss": 2.2548, + "step": 644 + }, + { + "epoch": 0.04855191102580026, + "grad_norm": 8.862432479858398, + "learning_rate": 9.991584053781593e-05, + "loss": 2.4943, + "step": 645 + }, + { + "epoch": 0.048627185306460416, + "grad_norm": 6.136997222900391, + "learning_rate": 9.99151320302399e-05, + "loss": 2.2595, + "step": 646 + }, + { + "epoch": 0.04870245958712057, + "grad_norm": 9.740489959716797, + "learning_rate": 9.991442055535248e-05, + "loss": 2.3477, + "step": 647 + }, + { + "epoch": 0.048777733867780726, + "grad_norm": 5.00742244720459, + "learning_rate": 9.9913706113196e-05, + "loss": 2.345, + "step": 648 + }, + { + "epoch": 0.04885300814844088, + "grad_norm": 3.7993032932281494, + "learning_rate": 9.991298870381289e-05, + "loss": 2.2151, + "step": 649 + }, + { + "epoch": 0.048928282429101036, + "grad_norm": 4.597858428955078, + "learning_rate": 9.991226832724583e-05, + "loss": 2.5071, + "step": 650 + }, + { + "epoch": 0.04900355670976119, + "grad_norm": 5.781305313110352, + "learning_rate": 9.991154498353761e-05, + "loss": 2.4976, + "step": 651 + }, + { + "epoch": 0.049078830990421346, + "grad_norm": 8.01620864868164, + "learning_rate": 9.991081867273127e-05, + "loss": 2.2157, + "step": 652 + }, + { + "epoch": 0.0491541052710815, + "grad_norm": 6.494935512542725, + "learning_rate": 9.991008939486998e-05, + "loss": 2.4016, + "step": 653 + }, + { + "epoch": 0.049229379551741656, + "grad_norm": 4.316427230834961, + "learning_rate": 9.990935714999707e-05, + "loss": 2.2825, + "step": 654 + }, + { + "epoch": 0.04930465383240181, + "grad_norm": 6.215554714202881, + "learning_rate": 9.990862193815609e-05, + "loss": 2.3553, + "step": 655 + }, + { + "epoch": 0.04937992811306197, + "grad_norm": 4.606568813323975, + "learning_rate": 9.990788375939073e-05, + "loss": 2.5599, + "step": 656 + }, + { + "epoch": 0.04945520239372213, + "grad_norm": 5.179342746734619, + "learning_rate": 9.99071426137449e-05, + "loss": 2.2877, + "step": 657 + }, + { + "epoch": 0.04953047667438228, + "grad_norm": 8.026809692382812, + "learning_rate": 9.990639850126262e-05, + "loss": 2.504, + "step": 658 + }, + { + "epoch": 0.04960575095504244, + "grad_norm": 5.966596603393555, + "learning_rate": 9.990565142198816e-05, + "loss": 2.4761, + "step": 659 + }, + { + "epoch": 0.04968102523570259, + "grad_norm": 6.408261299133301, + "learning_rate": 9.99049013759659e-05, + "loss": 2.2383, + "step": 660 + }, + { + "epoch": 0.04975629951636275, + "grad_norm": 7.961498260498047, + "learning_rate": 9.990414836324047e-05, + "loss": 2.397, + "step": 661 + }, + { + "epoch": 0.0498315737970229, + "grad_norm": 5.791255950927734, + "learning_rate": 9.99033923838566e-05, + "loss": 2.8276, + "step": 662 + }, + { + "epoch": 0.04990684807768306, + "grad_norm": 8.344980239868164, + "learning_rate": 9.990263343785925e-05, + "loss": 2.3831, + "step": 663 + }, + { + "epoch": 0.04998212235834321, + "grad_norm": 7.122950077056885, + "learning_rate": 9.990187152529353e-05, + "loss": 2.192, + "step": 664 + }, + { + "epoch": 0.05005739663900337, + "grad_norm": 8.393940925598145, + "learning_rate": 9.990110664620472e-05, + "loss": 2.2298, + "step": 665 + }, + { + "epoch": 0.05013267091966352, + "grad_norm": 9.402276039123535, + "learning_rate": 9.990033880063832e-05, + "loss": 3.6066, + "step": 666 + }, + { + "epoch": 0.05020794520032368, + "grad_norm": 7.331552505493164, + "learning_rate": 9.989956798863993e-05, + "loss": 2.9487, + "step": 667 + }, + { + "epoch": 0.05028321948098383, + "grad_norm": 9.451202392578125, + "learning_rate": 9.989879421025542e-05, + "loss": 2.8254, + "step": 668 + }, + { + "epoch": 0.05035849376164399, + "grad_norm": 8.325094223022461, + "learning_rate": 9.989801746553077e-05, + "loss": 2.5979, + "step": 669 + }, + { + "epoch": 0.05043376804230414, + "grad_norm": 6.0098419189453125, + "learning_rate": 9.989723775451215e-05, + "loss": 2.6105, + "step": 670 + }, + { + "epoch": 0.0505090423229643, + "grad_norm": 6.120908260345459, + "learning_rate": 9.989645507724593e-05, + "loss": 2.5079, + "step": 671 + }, + { + "epoch": 0.05058431660362445, + "grad_norm": 6.508972644805908, + "learning_rate": 9.98956694337786e-05, + "loss": 2.2816, + "step": 672 + }, + { + "epoch": 0.050659590884284615, + "grad_norm": 8.383572578430176, + "learning_rate": 9.98948808241569e-05, + "loss": 2.4371, + "step": 673 + }, + { + "epoch": 0.05073486516494477, + "grad_norm": 6.390430927276611, + "learning_rate": 9.98940892484277e-05, + "loss": 2.6644, + "step": 674 + }, + { + "epoch": 0.050810139445604925, + "grad_norm": 9.718884468078613, + "learning_rate": 9.989329470663807e-05, + "loss": 2.619, + "step": 675 + }, + { + "epoch": 0.05088541372626508, + "grad_norm": 8.05876636505127, + "learning_rate": 9.989249719883522e-05, + "loss": 2.5299, + "step": 676 + }, + { + "epoch": 0.050960688006925235, + "grad_norm": 6.069999694824219, + "learning_rate": 9.989169672506655e-05, + "loss": 2.4293, + "step": 677 + }, + { + "epoch": 0.05103596228758539, + "grad_norm": 6.174783706665039, + "learning_rate": 9.989089328537969e-05, + "loss": 2.3932, + "step": 678 + }, + { + "epoch": 0.051111236568245545, + "grad_norm": 6.460120677947998, + "learning_rate": 9.989008687982238e-05, + "loss": 2.1818, + "step": 679 + }, + { + "epoch": 0.0511865108489057, + "grad_norm": 6.6206512451171875, + "learning_rate": 9.988927750844253e-05, + "loss": 2.9276, + "step": 680 + }, + { + "epoch": 0.051261785129565855, + "grad_norm": 5.874287128448486, + "learning_rate": 9.988846517128829e-05, + "loss": 2.5916, + "step": 681 + }, + { + "epoch": 0.05133705941022601, + "grad_norm": 6.49630880355835, + "learning_rate": 9.988764986840795e-05, + "loss": 2.515, + "step": 682 + }, + { + "epoch": 0.051412333690886165, + "grad_norm": 7.505910396575928, + "learning_rate": 9.988683159984998e-05, + "loss": 2.1617, + "step": 683 + }, + { + "epoch": 0.05148760797154632, + "grad_norm": 4.591773509979248, + "learning_rate": 9.988601036566298e-05, + "loss": 2.3853, + "step": 684 + }, + { + "epoch": 0.051562882252206475, + "grad_norm": 5.950367450714111, + "learning_rate": 9.988518616589583e-05, + "loss": 2.2454, + "step": 685 + }, + { + "epoch": 0.05163815653286663, + "grad_norm": 4.622622013092041, + "learning_rate": 9.98843590005975e-05, + "loss": 2.6425, + "step": 686 + }, + { + "epoch": 0.051713430813526785, + "grad_norm": 7.5886311531066895, + "learning_rate": 9.988352886981714e-05, + "loss": 2.433, + "step": 687 + }, + { + "epoch": 0.05178870509418694, + "grad_norm": 3.7447688579559326, + "learning_rate": 9.988269577360414e-05, + "loss": 2.4239, + "step": 688 + }, + { + "epoch": 0.0518639793748471, + "grad_norm": 4.272064208984375, + "learning_rate": 9.988185971200799e-05, + "loss": 2.2416, + "step": 689 + }, + { + "epoch": 0.05193925365550726, + "grad_norm": 6.251614570617676, + "learning_rate": 9.988102068507842e-05, + "loss": 2.1771, + "step": 690 + }, + { + "epoch": 0.05201452793616741, + "grad_norm": 8.024019241333008, + "learning_rate": 9.988017869286529e-05, + "loss": 2.3752, + "step": 691 + }, + { + "epoch": 0.05208980221682757, + "grad_norm": 5.293638229370117, + "learning_rate": 9.987933373541866e-05, + "loss": 2.162, + "step": 692 + }, + { + "epoch": 0.05216507649748772, + "grad_norm": 6.998028755187988, + "learning_rate": 9.987848581278876e-05, + "loss": 2.576, + "step": 693 + }, + { + "epoch": 0.05224035077814788, + "grad_norm": 5.391724109649658, + "learning_rate": 9.9877634925026e-05, + "loss": 2.2978, + "step": 694 + }, + { + "epoch": 0.05231562505880803, + "grad_norm": 7.418310165405273, + "learning_rate": 9.987678107218095e-05, + "loss": 2.3028, + "step": 695 + }, + { + "epoch": 0.05239089933946819, + "grad_norm": 10.285323143005371, + "learning_rate": 9.987592425430439e-05, + "loss": 2.4946, + "step": 696 + }, + { + "epoch": 0.05246617362012834, + "grad_norm": 12.489806175231934, + "learning_rate": 9.987506447144723e-05, + "loss": 2.2556, + "step": 697 + }, + { + "epoch": 0.0525414479007885, + "grad_norm": 6.9232563972473145, + "learning_rate": 9.98742017236606e-05, + "loss": 2.2603, + "step": 698 + }, + { + "epoch": 0.05261672218144865, + "grad_norm": 6.247459888458252, + "learning_rate": 9.987333601099577e-05, + "loss": 2.4406, + "step": 699 + }, + { + "epoch": 0.05269199646210881, + "grad_norm": 13.742070198059082, + "learning_rate": 9.987246733350423e-05, + "loss": 2.4445, + "step": 700 + }, + { + "epoch": 0.05276727074276896, + "grad_norm": 7.605139255523682, + "learning_rate": 9.987159569123761e-05, + "loss": 2.2571, + "step": 701 + }, + { + "epoch": 0.05284254502342912, + "grad_norm": 5.785974025726318, + "learning_rate": 9.987072108424772e-05, + "loss": 2.4421, + "step": 702 + }, + { + "epoch": 0.05291781930408927, + "grad_norm": 10.140782356262207, + "learning_rate": 9.986984351258656e-05, + "loss": 2.2641, + "step": 703 + }, + { + "epoch": 0.05299309358474943, + "grad_norm": 8.75837516784668, + "learning_rate": 9.98689629763063e-05, + "loss": 2.257, + "step": 704 + }, + { + "epoch": 0.05306836786540959, + "grad_norm": 10.197219848632812, + "learning_rate": 9.986807947545927e-05, + "loss": 2.3644, + "step": 705 + }, + { + "epoch": 0.053143642146069744, + "grad_norm": 10.551118850708008, + "learning_rate": 9.986719301009799e-05, + "loss": 2.2788, + "step": 706 + }, + { + "epoch": 0.0532189164267299, + "grad_norm": 7.512317180633545, + "learning_rate": 9.98663035802752e-05, + "loss": 2.386, + "step": 707 + }, + { + "epoch": 0.053294190707390054, + "grad_norm": 4.915609359741211, + "learning_rate": 9.986541118604373e-05, + "loss": 2.438, + "step": 708 + }, + { + "epoch": 0.05336946498805021, + "grad_norm": 4.814695358276367, + "learning_rate": 9.986451582745664e-05, + "loss": 2.1662, + "step": 709 + }, + { + "epoch": 0.053444739268710365, + "grad_norm": 6.823842525482178, + "learning_rate": 9.986361750456716e-05, + "loss": 2.7644, + "step": 710 + }, + { + "epoch": 0.05352001354937052, + "grad_norm": 6.695843696594238, + "learning_rate": 9.986271621742871e-05, + "loss": 2.3933, + "step": 711 + }, + { + "epoch": 0.053595287830030675, + "grad_norm": 6.394896984100342, + "learning_rate": 9.986181196609485e-05, + "loss": 2.2938, + "step": 712 + }, + { + "epoch": 0.05367056211069083, + "grad_norm": 7.3776726722717285, + "learning_rate": 9.986090475061934e-05, + "loss": 2.4984, + "step": 713 + }, + { + "epoch": 0.053745836391350985, + "grad_norm": 6.144237041473389, + "learning_rate": 9.98599945710561e-05, + "loss": 2.2483, + "step": 714 + }, + { + "epoch": 0.05382111067201114, + "grad_norm": 8.079205513000488, + "learning_rate": 9.985908142745924e-05, + "loss": 2.4494, + "step": 715 + }, + { + "epoch": 0.053896384952671295, + "grad_norm": 8.626195907592773, + "learning_rate": 9.985816531988306e-05, + "loss": 2.4397, + "step": 716 + }, + { + "epoch": 0.05397165923333145, + "grad_norm": 9.848217964172363, + "learning_rate": 9.985724624838201e-05, + "loss": 2.6696, + "step": 717 + }, + { + "epoch": 0.054046933513991605, + "grad_norm": 7.598921775817871, + "learning_rate": 9.985632421301073e-05, + "loss": 2.1476, + "step": 718 + }, + { + "epoch": 0.05412220779465176, + "grad_norm": 10.673614501953125, + "learning_rate": 9.985539921382402e-05, + "loss": 2.4005, + "step": 719 + }, + { + "epoch": 0.054197482075311915, + "grad_norm": 7.3999342918396, + "learning_rate": 9.98544712508769e-05, + "loss": 2.3004, + "step": 720 + }, + { + "epoch": 0.05427275635597208, + "grad_norm": 9.133363723754883, + "learning_rate": 9.98535403242245e-05, + "loss": 2.3326, + "step": 721 + }, + { + "epoch": 0.05434803063663223, + "grad_norm": 6.130374431610107, + "learning_rate": 9.985260643392215e-05, + "loss": 2.3124, + "step": 722 + }, + { + "epoch": 0.05442330491729239, + "grad_norm": 5.920669078826904, + "learning_rate": 9.985166958002543e-05, + "loss": 2.2433, + "step": 723 + }, + { + "epoch": 0.05449857919795254, + "grad_norm": 7.733959197998047, + "learning_rate": 9.985072976258999e-05, + "loss": 2.3199, + "step": 724 + }, + { + "epoch": 0.0545738534786127, + "grad_norm": 4.545495986938477, + "learning_rate": 9.984978698167167e-05, + "loss": 2.148, + "step": 725 + }, + { + "epoch": 0.05464912775927285, + "grad_norm": 9.407523155212402, + "learning_rate": 9.984884123732657e-05, + "loss": 2.4665, + "step": 726 + }, + { + "epoch": 0.05472440203993301, + "grad_norm": 5.692495822906494, + "learning_rate": 9.984789252961088e-05, + "loss": 2.42, + "step": 727 + }, + { + "epoch": 0.05479967632059316, + "grad_norm": 7.305675029754639, + "learning_rate": 9.984694085858105e-05, + "loss": 2.7213, + "step": 728 + }, + { + "epoch": 0.05487495060125332, + "grad_norm": 8.763826370239258, + "learning_rate": 9.984598622429356e-05, + "loss": 2.1834, + "step": 729 + }, + { + "epoch": 0.05495022488191347, + "grad_norm": 7.262514114379883, + "learning_rate": 9.984502862680524e-05, + "loss": 2.5227, + "step": 730 + }, + { + "epoch": 0.05502549916257363, + "grad_norm": 9.065153121948242, + "learning_rate": 9.984406806617298e-05, + "loss": 3.197, + "step": 731 + }, + { + "epoch": 0.05510077344323378, + "grad_norm": 6.451891899108887, + "learning_rate": 9.984310454245389e-05, + "loss": 2.9639, + "step": 732 + }, + { + "epoch": 0.05517604772389394, + "grad_norm": 11.691222190856934, + "learning_rate": 9.984213805570527e-05, + "loss": 2.6334, + "step": 733 + }, + { + "epoch": 0.05525132200455409, + "grad_norm": 8.031112670898438, + "learning_rate": 9.984116860598454e-05, + "loss": 2.3134, + "step": 734 + }, + { + "epoch": 0.05532659628521425, + "grad_norm": 6.133155822753906, + "learning_rate": 9.984019619334935e-05, + "loss": 2.132, + "step": 735 + }, + { + "epoch": 0.0554018705658744, + "grad_norm": 5.9999589920043945, + "learning_rate": 9.98392208178575e-05, + "loss": 2.1536, + "step": 736 + }, + { + "epoch": 0.055477144846534564, + "grad_norm": 7.8307108879089355, + "learning_rate": 9.983824247956699e-05, + "loss": 2.4977, + "step": 737 + }, + { + "epoch": 0.05555241912719472, + "grad_norm": 8.132503509521484, + "learning_rate": 9.983726117853595e-05, + "loss": 2.7205, + "step": 738 + }, + { + "epoch": 0.055627693407854874, + "grad_norm": 10.369138717651367, + "learning_rate": 9.983627691482273e-05, + "loss": 2.2676, + "step": 739 + }, + { + "epoch": 0.05570296768851503, + "grad_norm": 6.448859214782715, + "learning_rate": 9.983528968848585e-05, + "loss": 2.3715, + "step": 740 + }, + { + "epoch": 0.055778241969175184, + "grad_norm": 7.880186557769775, + "learning_rate": 9.983429949958401e-05, + "loss": 2.5879, + "step": 741 + }, + { + "epoch": 0.05585351624983534, + "grad_norm": 6.713860511779785, + "learning_rate": 9.983330634817604e-05, + "loss": 2.1742, + "step": 742 + }, + { + "epoch": 0.055928790530495494, + "grad_norm": 5.175292015075684, + "learning_rate": 9.9832310234321e-05, + "loss": 2.4678, + "step": 743 + }, + { + "epoch": 0.05600406481115565, + "grad_norm": 7.452995300292969, + "learning_rate": 9.983131115807809e-05, + "loss": 2.1645, + "step": 744 + }, + { + "epoch": 0.056079339091815804, + "grad_norm": 5.161840915679932, + "learning_rate": 9.983030911950671e-05, + "loss": 2.1761, + "step": 745 + }, + { + "epoch": 0.05615461337247596, + "grad_norm": 16.227642059326172, + "learning_rate": 9.982930411866644e-05, + "loss": 2.4417, + "step": 746 + }, + { + "epoch": 0.056229887653136114, + "grad_norm": 14.55573558807373, + "learning_rate": 9.982829615561703e-05, + "loss": 2.0874, + "step": 747 + }, + { + "epoch": 0.05630516193379627, + "grad_norm": 119.76773834228516, + "learning_rate": 9.982728523041838e-05, + "loss": 3.1929, + "step": 748 + }, + { + "epoch": 0.056380436214456424, + "grad_norm": 15.183578491210938, + "learning_rate": 9.982627134313058e-05, + "loss": 2.1994, + "step": 749 + }, + { + "epoch": 0.05645571049511658, + "grad_norm": 6.892390727996826, + "learning_rate": 9.982525449381392e-05, + "loss": 2.5903, + "step": 750 + }, + { + "epoch": 0.056530984775776734, + "grad_norm": 7.721436977386475, + "learning_rate": 9.982423468252886e-05, + "loss": 2.9455, + "step": 751 + }, + { + "epoch": 0.05660625905643689, + "grad_norm": 5.830654621124268, + "learning_rate": 9.9823211909336e-05, + "loss": 2.5005, + "step": 752 + }, + { + "epoch": 0.056681533337097044, + "grad_norm": 25.14459228515625, + "learning_rate": 9.982218617429615e-05, + "loss": 2.1816, + "step": 753 + }, + { + "epoch": 0.056756807617757206, + "grad_norm": 9.08641529083252, + "learning_rate": 9.982115747747029e-05, + "loss": 2.3189, + "step": 754 + }, + { + "epoch": 0.05683208189841736, + "grad_norm": 11.718338966369629, + "learning_rate": 9.982012581891958e-05, + "loss": 2.4572, + "step": 755 + }, + { + "epoch": 0.056907356179077516, + "grad_norm": 6.1245927810668945, + "learning_rate": 9.981909119870531e-05, + "loss": 2.4461, + "step": 756 + }, + { + "epoch": 0.05698263045973767, + "grad_norm": 5.890631198883057, + "learning_rate": 9.981805361688906e-05, + "loss": 2.4724, + "step": 757 + }, + { + "epoch": 0.057057904740397826, + "grad_norm": 5.78333044052124, + "learning_rate": 9.981701307353243e-05, + "loss": 2.2786, + "step": 758 + }, + { + "epoch": 0.05713317902105798, + "grad_norm": 9.026042938232422, + "learning_rate": 9.981596956869732e-05, + "loss": 2.3996, + "step": 759 + }, + { + "epoch": 0.057208453301718136, + "grad_norm": 6.684680461883545, + "learning_rate": 9.981492310244576e-05, + "loss": 2.5933, + "step": 760 + }, + { + "epoch": 0.05728372758237829, + "grad_norm": 8.666067123413086, + "learning_rate": 9.981387367483997e-05, + "loss": 2.3466, + "step": 761 + }, + { + "epoch": 0.057359001863038446, + "grad_norm": 38.26237106323242, + "learning_rate": 9.981282128594232e-05, + "loss": 2.2956, + "step": 762 + }, + { + "epoch": 0.0574342761436986, + "grad_norm": 7.783937454223633, + "learning_rate": 9.981176593581537e-05, + "loss": 2.0193, + "step": 763 + }, + { + "epoch": 0.057509550424358756, + "grad_norm": 7.6774139404296875, + "learning_rate": 9.981070762452184e-05, + "loss": 2.3278, + "step": 764 + }, + { + "epoch": 0.05758482470501891, + "grad_norm": 5.816064357757568, + "learning_rate": 9.980964635212469e-05, + "loss": 2.2031, + "step": 765 + }, + { + "epoch": 0.057660098985679066, + "grad_norm": 7.778492450714111, + "learning_rate": 9.980858211868697e-05, + "loss": 2.2466, + "step": 766 + }, + { + "epoch": 0.05773537326633922, + "grad_norm": 5.073156833648682, + "learning_rate": 9.980751492427197e-05, + "loss": 2.3197, + "step": 767 + }, + { + "epoch": 0.057810647546999376, + "grad_norm": 7.28161096572876, + "learning_rate": 9.980644476894311e-05, + "loss": 2.3205, + "step": 768 + }, + { + "epoch": 0.05788592182765953, + "grad_norm": 6.602877616882324, + "learning_rate": 9.980537165276402e-05, + "loss": 2.5677, + "step": 769 + }, + { + "epoch": 0.05796119610831969, + "grad_norm": 9.768770217895508, + "learning_rate": 9.98042955757985e-05, + "loss": 2.4587, + "step": 770 + }, + { + "epoch": 0.05803647038897985, + "grad_norm": 7.807348728179932, + "learning_rate": 9.980321653811051e-05, + "loss": 2.2789, + "step": 771 + }, + { + "epoch": 0.05811174466964, + "grad_norm": 7.8562750816345215, + "learning_rate": 9.980213453976421e-05, + "loss": 2.3424, + "step": 772 + }, + { + "epoch": 0.05818701895030016, + "grad_norm": 8.973971366882324, + "learning_rate": 9.980104958082389e-05, + "loss": 2.4038, + "step": 773 + }, + { + "epoch": 0.05826229323096031, + "grad_norm": 12.234197616577148, + "learning_rate": 9.979996166135408e-05, + "loss": 2.1111, + "step": 774 + }, + { + "epoch": 0.05833756751162047, + "grad_norm": 7.801458835601807, + "learning_rate": 9.979887078141945e-05, + "loss": 2.1947, + "step": 775 + }, + { + "epoch": 0.05841284179228062, + "grad_norm": 6.189359188079834, + "learning_rate": 9.979777694108483e-05, + "loss": 2.3981, + "step": 776 + }, + { + "epoch": 0.05848811607294078, + "grad_norm": 8.935062408447266, + "learning_rate": 9.979668014041524e-05, + "loss": 2.3187, + "step": 777 + }, + { + "epoch": 0.05856339035360093, + "grad_norm": 7.371228218078613, + "learning_rate": 9.979558037947591e-05, + "loss": 2.3166, + "step": 778 + }, + { + "epoch": 0.05863866463426109, + "grad_norm": 8.445380210876465, + "learning_rate": 9.979447765833221e-05, + "loss": 2.3155, + "step": 779 + }, + { + "epoch": 0.05871393891492124, + "grad_norm": 42.251461029052734, + "learning_rate": 9.979337197704969e-05, + "loss": 2.0374, + "step": 780 + }, + { + "epoch": 0.0587892131955814, + "grad_norm": 6.267967700958252, + "learning_rate": 9.97922633356941e-05, + "loss": 2.4888, + "step": 781 + }, + { + "epoch": 0.05886448747624155, + "grad_norm": 7.42905330657959, + "learning_rate": 9.979115173433128e-05, + "loss": 2.3898, + "step": 782 + }, + { + "epoch": 0.05893976175690171, + "grad_norm": 5.354802131652832, + "learning_rate": 9.97900371730274e-05, + "loss": 2.374, + "step": 783 + }, + { + "epoch": 0.05901503603756186, + "grad_norm": 10.564166069030762, + "learning_rate": 9.978891965184864e-05, + "loss": 2.2555, + "step": 784 + }, + { + "epoch": 0.05909031031822202, + "grad_norm": 6.234349727630615, + "learning_rate": 9.97877991708615e-05, + "loss": 2.3356, + "step": 785 + }, + { + "epoch": 0.05916558459888218, + "grad_norm": 6.825733661651611, + "learning_rate": 9.978667573013253e-05, + "loss": 2.348, + "step": 786 + }, + { + "epoch": 0.059240858879542335, + "grad_norm": 11.729052543640137, + "learning_rate": 9.978554932972855e-05, + "loss": 2.2918, + "step": 787 + }, + { + "epoch": 0.05931613316020249, + "grad_norm": 8.23495101928711, + "learning_rate": 9.97844199697165e-05, + "loss": 2.3762, + "step": 788 + }, + { + "epoch": 0.059391407440862645, + "grad_norm": 7.504601955413818, + "learning_rate": 9.978328765016355e-05, + "loss": 2.337, + "step": 789 + }, + { + "epoch": 0.0594666817215228, + "grad_norm": 6.503173828125, + "learning_rate": 9.978215237113699e-05, + "loss": 2.6591, + "step": 790 + }, + { + "epoch": 0.059541956002182955, + "grad_norm": 6.690675258636475, + "learning_rate": 9.978101413270431e-05, + "loss": 2.0039, + "step": 791 + }, + { + "epoch": 0.05961723028284311, + "grad_norm": 7.925537109375, + "learning_rate": 9.977987293493317e-05, + "loss": 2.1142, + "step": 792 + }, + { + "epoch": 0.059692504563503265, + "grad_norm": 78.65184783935547, + "learning_rate": 9.977872877789143e-05, + "loss": 2.5028, + "step": 793 + }, + { + "epoch": 0.05976777884416342, + "grad_norm": 8.48562240600586, + "learning_rate": 9.977758166164707e-05, + "loss": 2.3922, + "step": 794 + }, + { + "epoch": 0.059843053124823575, + "grad_norm": 7.253809452056885, + "learning_rate": 9.977643158626835e-05, + "loss": 2.1637, + "step": 795 + }, + { + "epoch": 0.05991832740548373, + "grad_norm": 8.590265274047852, + "learning_rate": 9.977527855182357e-05, + "loss": 3.1643, + "step": 796 + }, + { + "epoch": 0.059993601686143885, + "grad_norm": 8.352928161621094, + "learning_rate": 9.97741225583813e-05, + "loss": 2.2716, + "step": 797 + }, + { + "epoch": 0.06006887596680404, + "grad_norm": 6.841030120849609, + "learning_rate": 9.977296360601027e-05, + "loss": 2.6282, + "step": 798 + }, + { + "epoch": 0.060144150247464195, + "grad_norm": 6.771637916564941, + "learning_rate": 9.977180169477937e-05, + "loss": 2.294, + "step": 799 + }, + { + "epoch": 0.06021942452812435, + "grad_norm": 11.951594352722168, + "learning_rate": 9.977063682475767e-05, + "loss": 2.2196, + "step": 800 + }, + { + "epoch": 0.060294698808784505, + "grad_norm": 26.851673126220703, + "learning_rate": 9.976946899601441e-05, + "loss": 2.514, + "step": 801 + }, + { + "epoch": 0.06036997308944467, + "grad_norm": 8.609345436096191, + "learning_rate": 9.976829820861904e-05, + "loss": 2.1823, + "step": 802 + }, + { + "epoch": 0.06044524737010482, + "grad_norm": 7.100503444671631, + "learning_rate": 9.976712446264113e-05, + "loss": 2.4874, + "step": 803 + }, + { + "epoch": 0.06052052165076498, + "grad_norm": 6.742459297180176, + "learning_rate": 9.976594775815047e-05, + "loss": 2.1105, + "step": 804 + }, + { + "epoch": 0.06059579593142513, + "grad_norm": 6.591311454772949, + "learning_rate": 9.9764768095217e-05, + "loss": 2.5111, + "step": 805 + }, + { + "epoch": 0.06067107021208529, + "grad_norm": 6.057375431060791, + "learning_rate": 9.976358547391088e-05, + "loss": 2.1692, + "step": 806 + }, + { + "epoch": 0.06074634449274544, + "grad_norm": 8.88291072845459, + "learning_rate": 9.976239989430239e-05, + "loss": 2.8143, + "step": 807 + }, + { + "epoch": 0.0608216187734056, + "grad_norm": 8.118959426879883, + "learning_rate": 9.976121135646199e-05, + "loss": 3.1782, + "step": 808 + }, + { + "epoch": 0.06089689305406575, + "grad_norm": 7.874549865722656, + "learning_rate": 9.976001986046036e-05, + "loss": 2.5674, + "step": 809 + }, + { + "epoch": 0.06097216733472591, + "grad_norm": 7.681488990783691, + "learning_rate": 9.975882540636834e-05, + "loss": 2.3343, + "step": 810 + }, + { + "epoch": 0.06104744161538606, + "grad_norm": 8.765251159667969, + "learning_rate": 9.975762799425693e-05, + "loss": 2.3736, + "step": 811 + }, + { + "epoch": 0.06112271589604622, + "grad_norm": 9.046459197998047, + "learning_rate": 9.975642762419728e-05, + "loss": 2.3353, + "step": 812 + }, + { + "epoch": 0.06119799017670637, + "grad_norm": 9.579665184020996, + "learning_rate": 9.975522429626079e-05, + "loss": 2.2858, + "step": 813 + }, + { + "epoch": 0.06127326445736653, + "grad_norm": 8.402320861816406, + "learning_rate": 9.975401801051897e-05, + "loss": 2.3528, + "step": 814 + }, + { + "epoch": 0.06134853873802668, + "grad_norm": 7.485040664672852, + "learning_rate": 9.975280876704353e-05, + "loss": 2.7028, + "step": 815 + }, + { + "epoch": 0.06142381301868684, + "grad_norm": 6.590846061706543, + "learning_rate": 9.97515965659064e-05, + "loss": 2.2324, + "step": 816 + }, + { + "epoch": 0.06149908729934699, + "grad_norm": 7.63626766204834, + "learning_rate": 9.975038140717959e-05, + "loss": 2.3813, + "step": 817 + }, + { + "epoch": 0.061574361580007154, + "grad_norm": 8.878137588500977, + "learning_rate": 9.974916329093535e-05, + "loss": 1.9345, + "step": 818 + }, + { + "epoch": 0.06164963586066731, + "grad_norm": 7.905106544494629, + "learning_rate": 9.97479422172461e-05, + "loss": 2.2696, + "step": 819 + }, + { + "epoch": 0.061724910141327465, + "grad_norm": 7.946784496307373, + "learning_rate": 9.974671818618444e-05, + "loss": 2.4828, + "step": 820 + }, + { + "epoch": 0.06180018442198762, + "grad_norm": 6.852696895599365, + "learning_rate": 9.974549119782311e-05, + "loss": 2.1384, + "step": 821 + }, + { + "epoch": 0.061875458702647775, + "grad_norm": 20.009647369384766, + "learning_rate": 9.974426125223507e-05, + "loss": 2.4649, + "step": 822 + }, + { + "epoch": 0.06195073298330793, + "grad_norm": 5.390745639801025, + "learning_rate": 9.974302834949343e-05, + "loss": 2.1648, + "step": 823 + }, + { + "epoch": 0.062026007263968085, + "grad_norm": 6.90866756439209, + "learning_rate": 9.974179248967148e-05, + "loss": 2.1939, + "step": 824 + }, + { + "epoch": 0.06210128154462824, + "grad_norm": 6.320048809051514, + "learning_rate": 9.97405536728427e-05, + "loss": 2.5586, + "step": 825 + }, + { + "epoch": 0.062176555825288395, + "grad_norm": 5.045504093170166, + "learning_rate": 9.973931189908073e-05, + "loss": 2.3171, + "step": 826 + }, + { + "epoch": 0.06225183010594855, + "grad_norm": 7.020371437072754, + "learning_rate": 9.973806716845937e-05, + "loss": 2.2138, + "step": 827 + }, + { + "epoch": 0.062327104386608705, + "grad_norm": 5.824570655822754, + "learning_rate": 9.973681948105263e-05, + "loss": 2.3435, + "step": 828 + }, + { + "epoch": 0.06240237866726886, + "grad_norm": 6.399510383605957, + "learning_rate": 9.973556883693469e-05, + "loss": 2.1958, + "step": 829 + }, + { + "epoch": 0.062477652947929015, + "grad_norm": 5.8489484786987305, + "learning_rate": 9.97343152361799e-05, + "loss": 2.1253, + "step": 830 + }, + { + "epoch": 0.06255292722858917, + "grad_norm": 8.46529483795166, + "learning_rate": 9.973305867886276e-05, + "loss": 2.1793, + "step": 831 + }, + { + "epoch": 0.06262820150924933, + "grad_norm": 5.905313968658447, + "learning_rate": 9.9731799165058e-05, + "loss": 2.2548, + "step": 832 + }, + { + "epoch": 0.06270347578990948, + "grad_norm": 7.841697692871094, + "learning_rate": 9.973053669484045e-05, + "loss": 2.3315, + "step": 833 + }, + { + "epoch": 0.06277875007056964, + "grad_norm": 7.30699348449707, + "learning_rate": 9.97292712682852e-05, + "loss": 2.5827, + "step": 834 + }, + { + "epoch": 0.06285402435122979, + "grad_norm": 7.9440789222717285, + "learning_rate": 9.972800288546746e-05, + "loss": 2.3767, + "step": 835 + }, + { + "epoch": 0.06292929863188995, + "grad_norm": 9.207219123840332, + "learning_rate": 9.972673154646264e-05, + "loss": 2.3896, + "step": 836 + }, + { + "epoch": 0.0630045729125501, + "grad_norm": 7.319620609283447, + "learning_rate": 9.97254572513463e-05, + "loss": 2.4146, + "step": 837 + }, + { + "epoch": 0.06307984719321026, + "grad_norm": 7.219863414764404, + "learning_rate": 9.972418000019422e-05, + "loss": 2.2896, + "step": 838 + }, + { + "epoch": 0.06315512147387041, + "grad_norm": 7.409632205963135, + "learning_rate": 9.972289979308229e-05, + "loss": 2.3813, + "step": 839 + }, + { + "epoch": 0.06323039575453057, + "grad_norm": 6.324718475341797, + "learning_rate": 9.972161663008666e-05, + "loss": 2.2238, + "step": 840 + }, + { + "epoch": 0.06330567003519072, + "grad_norm": 6.108450412750244, + "learning_rate": 9.972033051128357e-05, + "loss": 2.4319, + "step": 841 + }, + { + "epoch": 0.06338094431585088, + "grad_norm": 8.239100456237793, + "learning_rate": 9.971904143674951e-05, + "loss": 2.4695, + "step": 842 + }, + { + "epoch": 0.06345621859651104, + "grad_norm": 6.279371738433838, + "learning_rate": 9.971774940656109e-05, + "loss": 2.5914, + "step": 843 + }, + { + "epoch": 0.06353149287717119, + "grad_norm": 5.380947589874268, + "learning_rate": 9.971645442079513e-05, + "loss": 2.6335, + "step": 844 + }, + { + "epoch": 0.06360676715783135, + "grad_norm": 4.4391679763793945, + "learning_rate": 9.971515647952861e-05, + "loss": 2.4388, + "step": 845 + }, + { + "epoch": 0.0636820414384915, + "grad_norm": 5.129392147064209, + "learning_rate": 9.971385558283868e-05, + "loss": 2.4109, + "step": 846 + }, + { + "epoch": 0.06375731571915166, + "grad_norm": 9.522102355957031, + "learning_rate": 9.971255173080267e-05, + "loss": 2.8749, + "step": 847 + }, + { + "epoch": 0.06383258999981181, + "grad_norm": 7.362565994262695, + "learning_rate": 9.971124492349812e-05, + "loss": 2.3519, + "step": 848 + }, + { + "epoch": 0.06390786428047197, + "grad_norm": 7.245185852050781, + "learning_rate": 9.970993516100269e-05, + "loss": 2.268, + "step": 849 + }, + { + "epoch": 0.06398313856113212, + "grad_norm": 6.527919769287109, + "learning_rate": 9.970862244339425e-05, + "loss": 2.1908, + "step": 850 + }, + { + "epoch": 0.06405841284179228, + "grad_norm": 6.99696683883667, + "learning_rate": 9.970730677075082e-05, + "loss": 2.2415, + "step": 851 + }, + { + "epoch": 0.06413368712245243, + "grad_norm": 5.327459812164307, + "learning_rate": 9.970598814315065e-05, + "loss": 2.5305, + "step": 852 + }, + { + "epoch": 0.0642089614031126, + "grad_norm": 5.620889186859131, + "learning_rate": 9.97046665606721e-05, + "loss": 2.1266, + "step": 853 + }, + { + "epoch": 0.06428423568377274, + "grad_norm": 6.113351345062256, + "learning_rate": 9.970334202339373e-05, + "loss": 2.1229, + "step": 854 + }, + { + "epoch": 0.0643595099644329, + "grad_norm": 6.923047065734863, + "learning_rate": 9.97020145313943e-05, + "loss": 2.5085, + "step": 855 + }, + { + "epoch": 0.06443478424509305, + "grad_norm": 4.128186225891113, + "learning_rate": 9.970068408475272e-05, + "loss": 2.0886, + "step": 856 + }, + { + "epoch": 0.06451005852575321, + "grad_norm": 8.818601608276367, + "learning_rate": 9.969935068354807e-05, + "loss": 2.802, + "step": 857 + }, + { + "epoch": 0.06458533280641336, + "grad_norm": 5.369051456451416, + "learning_rate": 9.969801432785965e-05, + "loss": 2.08, + "step": 858 + }, + { + "epoch": 0.06466060708707352, + "grad_norm": 7.448428153991699, + "learning_rate": 9.969667501776685e-05, + "loss": 2.4506, + "step": 859 + }, + { + "epoch": 0.06473588136773369, + "grad_norm": 7.504240989685059, + "learning_rate": 9.969533275334933e-05, + "loss": 2.4024, + "step": 860 + }, + { + "epoch": 0.06481115564839383, + "grad_norm": 6.5451555252075195, + "learning_rate": 9.969398753468686e-05, + "loss": 2.2971, + "step": 861 + }, + { + "epoch": 0.064886429929054, + "grad_norm": 5.987298488616943, + "learning_rate": 9.969263936185943e-05, + "loss": 2.7272, + "step": 862 + }, + { + "epoch": 0.06496170420971414, + "grad_norm": 8.401213645935059, + "learning_rate": 9.969128823494717e-05, + "loss": 2.6268, + "step": 863 + }, + { + "epoch": 0.0650369784903743, + "grad_norm": 6.37894344329834, + "learning_rate": 9.96899341540304e-05, + "loss": 2.4036, + "step": 864 + }, + { + "epoch": 0.06511225277103445, + "grad_norm": 10.493627548217773, + "learning_rate": 9.968857711918963e-05, + "loss": 2.2915, + "step": 865 + }, + { + "epoch": 0.06518752705169462, + "grad_norm": 6.327542304992676, + "learning_rate": 9.968721713050552e-05, + "loss": 2.0967, + "step": 866 + }, + { + "epoch": 0.06526280133235476, + "grad_norm": 5.9111762046813965, + "learning_rate": 9.968585418805891e-05, + "loss": 2.6636, + "step": 867 + }, + { + "epoch": 0.06533807561301493, + "grad_norm": 5.491913318634033, + "learning_rate": 9.968448829193085e-05, + "loss": 2.3923, + "step": 868 + }, + { + "epoch": 0.06541334989367507, + "grad_norm": 6.95320463180542, + "learning_rate": 9.968311944220251e-05, + "loss": 2.1361, + "step": 869 + }, + { + "epoch": 0.06548862417433524, + "grad_norm": 4.197360992431641, + "learning_rate": 9.968174763895529e-05, + "loss": 2.1778, + "step": 870 + }, + { + "epoch": 0.06556389845499538, + "grad_norm": 4.385210990905762, + "learning_rate": 9.96803728822707e-05, + "loss": 2.1352, + "step": 871 + }, + { + "epoch": 0.06563917273565555, + "grad_norm": 7.701970100402832, + "learning_rate": 9.967899517223051e-05, + "loss": 2.3367, + "step": 872 + }, + { + "epoch": 0.0657144470163157, + "grad_norm": 5.5492682456970215, + "learning_rate": 9.96776145089166e-05, + "loss": 2.3282, + "step": 873 + }, + { + "epoch": 0.06578972129697586, + "grad_norm": 6.109398365020752, + "learning_rate": 9.967623089241105e-05, + "loss": 2.2545, + "step": 874 + }, + { + "epoch": 0.06586499557763602, + "grad_norm": 5.757387161254883, + "learning_rate": 9.967484432279611e-05, + "loss": 2.3477, + "step": 875 + }, + { + "epoch": 0.06594026985829617, + "grad_norm": 5.365096092224121, + "learning_rate": 9.96734548001542e-05, + "loss": 2.3295, + "step": 876 + }, + { + "epoch": 0.06601554413895633, + "grad_norm": 6.437127113342285, + "learning_rate": 9.967206232456794e-05, + "loss": 2.5533, + "step": 877 + }, + { + "epoch": 0.06609081841961648, + "grad_norm": 6.558330059051514, + "learning_rate": 9.967066689612009e-05, + "loss": 2.1625, + "step": 878 + }, + { + "epoch": 0.06616609270027664, + "grad_norm": 7.039401531219482, + "learning_rate": 9.966926851489363e-05, + "loss": 2.0632, + "step": 879 + }, + { + "epoch": 0.06624136698093679, + "grad_norm": 7.518327713012695, + "learning_rate": 9.966786718097165e-05, + "loss": 2.38, + "step": 880 + }, + { + "epoch": 0.06631664126159695, + "grad_norm": 4.277347564697266, + "learning_rate": 9.96664628944375e-05, + "loss": 2.6705, + "step": 881 + }, + { + "epoch": 0.0663919155422571, + "grad_norm": 7.330222129821777, + "learning_rate": 9.966505565537463e-05, + "loss": 2.738, + "step": 882 + }, + { + "epoch": 0.06646718982291726, + "grad_norm": 6.637203693389893, + "learning_rate": 9.96636454638667e-05, + "loss": 2.0495, + "step": 883 + }, + { + "epoch": 0.0665424641035774, + "grad_norm": 6.106614112854004, + "learning_rate": 9.966223231999757e-05, + "loss": 1.961, + "step": 884 + }, + { + "epoch": 0.06661773838423757, + "grad_norm": 7.381946563720703, + "learning_rate": 9.966081622385122e-05, + "loss": 2.4937, + "step": 885 + }, + { + "epoch": 0.06669301266489772, + "grad_norm": 8.07280445098877, + "learning_rate": 9.965939717551183e-05, + "loss": 2.5545, + "step": 886 + }, + { + "epoch": 0.06676828694555788, + "grad_norm": 8.019253730773926, + "learning_rate": 9.965797517506377e-05, + "loss": 2.1663, + "step": 887 + }, + { + "epoch": 0.06684356122621803, + "grad_norm": 5.034527778625488, + "learning_rate": 9.965655022259157e-05, + "loss": 2.2445, + "step": 888 + }, + { + "epoch": 0.06691883550687819, + "grad_norm": 8.299301147460938, + "learning_rate": 9.965512231817992e-05, + "loss": 1.956, + "step": 889 + }, + { + "epoch": 0.06699410978753834, + "grad_norm": 6.635512828826904, + "learning_rate": 9.965369146191376e-05, + "loss": 2.253, + "step": 890 + }, + { + "epoch": 0.0670693840681985, + "grad_norm": 6.2723283767700195, + "learning_rate": 9.965225765387809e-05, + "loss": 2.3474, + "step": 891 + }, + { + "epoch": 0.06714465834885866, + "grad_norm": 5.686178684234619, + "learning_rate": 9.965082089415819e-05, + "loss": 2.1819, + "step": 892 + }, + { + "epoch": 0.06721993262951881, + "grad_norm": 8.98385238647461, + "learning_rate": 9.964938118283944e-05, + "loss": 1.8814, + "step": 893 + }, + { + "epoch": 0.06729520691017897, + "grad_norm": 4.871972560882568, + "learning_rate": 9.964793852000744e-05, + "loss": 2.2073, + "step": 894 + }, + { + "epoch": 0.06737048119083912, + "grad_norm": 29.07400894165039, + "learning_rate": 9.964649290574794e-05, + "loss": 2.3609, + "step": 895 + }, + { + "epoch": 0.06744575547149928, + "grad_norm": 7.433924674987793, + "learning_rate": 9.964504434014691e-05, + "loss": 2.2446, + "step": 896 + }, + { + "epoch": 0.06752102975215943, + "grad_norm": 8.690600395202637, + "learning_rate": 9.964359282329044e-05, + "loss": 2.5085, + "step": 897 + }, + { + "epoch": 0.06759630403281959, + "grad_norm": 7.348930358886719, + "learning_rate": 9.96421383552648e-05, + "loss": 2.21, + "step": 898 + }, + { + "epoch": 0.06767157831347974, + "grad_norm": 8.213135719299316, + "learning_rate": 9.964068093615649e-05, + "loss": 2.2163, + "step": 899 + }, + { + "epoch": 0.0677468525941399, + "grad_norm": 8.281112670898438, + "learning_rate": 9.963922056605213e-05, + "loss": 2.6087, + "step": 900 + }, + { + "epoch": 0.06782212687480005, + "grad_norm": 8.835277557373047, + "learning_rate": 9.963775724503853e-05, + "loss": 2.435, + "step": 901 + }, + { + "epoch": 0.06789740115546021, + "grad_norm": 6.52493143081665, + "learning_rate": 9.96362909732027e-05, + "loss": 2.7242, + "step": 902 + }, + { + "epoch": 0.06797267543612036, + "grad_norm": 7.01603889465332, + "learning_rate": 9.963482175063179e-05, + "loss": 2.2189, + "step": 903 + }, + { + "epoch": 0.06804794971678052, + "grad_norm": 6.048619747161865, + "learning_rate": 9.963334957741315e-05, + "loss": 2.2337, + "step": 904 + }, + { + "epoch": 0.06812322399744067, + "grad_norm": 4.485490798950195, + "learning_rate": 9.963187445363428e-05, + "loss": 2.3935, + "step": 905 + }, + { + "epoch": 0.06819849827810083, + "grad_norm": 5.6536970138549805, + "learning_rate": 9.96303963793829e-05, + "loss": 2.3024, + "step": 906 + }, + { + "epoch": 0.06827377255876099, + "grad_norm": 6.321853160858154, + "learning_rate": 9.962891535474685e-05, + "loss": 2.1157, + "step": 907 + }, + { + "epoch": 0.06834904683942114, + "grad_norm": 7.348297595977783, + "learning_rate": 9.962743137981417e-05, + "loss": 2.543, + "step": 908 + }, + { + "epoch": 0.0684243211200813, + "grad_norm": 8.388378143310547, + "learning_rate": 9.96259444546731e-05, + "loss": 2.3731, + "step": 909 + }, + { + "epoch": 0.06849959540074145, + "grad_norm": 5.09698486328125, + "learning_rate": 9.962445457941204e-05, + "loss": 2.3798, + "step": 910 + }, + { + "epoch": 0.06857486968140161, + "grad_norm": 8.152088165283203, + "learning_rate": 9.962296175411954e-05, + "loss": 2.2166, + "step": 911 + }, + { + "epoch": 0.06865014396206176, + "grad_norm": 9.817691802978516, + "learning_rate": 9.962146597888434e-05, + "loss": 2.2701, + "step": 912 + }, + { + "epoch": 0.06872541824272192, + "grad_norm": 6.1295247077941895, + "learning_rate": 9.961996725379537e-05, + "loss": 2.1892, + "step": 913 + }, + { + "epoch": 0.06880069252338207, + "grad_norm": 13.386317253112793, + "learning_rate": 9.961846557894173e-05, + "loss": 2.2879, + "step": 914 + }, + { + "epoch": 0.06887596680404223, + "grad_norm": 9.592955589294434, + "learning_rate": 9.961696095441266e-05, + "loss": 2.2491, + "step": 915 + }, + { + "epoch": 0.06895124108470238, + "grad_norm": 6.653538227081299, + "learning_rate": 9.961545338029765e-05, + "loss": 2.3148, + "step": 916 + }, + { + "epoch": 0.06902651536536254, + "grad_norm": 6.470543384552002, + "learning_rate": 9.961394285668629e-05, + "loss": 2.2091, + "step": 917 + }, + { + "epoch": 0.06910178964602269, + "grad_norm": 6.315271854400635, + "learning_rate": 9.96124293836684e-05, + "loss": 2.5219, + "step": 918 + }, + { + "epoch": 0.06917706392668285, + "grad_norm": 5.607402324676514, + "learning_rate": 9.961091296133391e-05, + "loss": 2.2627, + "step": 919 + }, + { + "epoch": 0.069252338207343, + "grad_norm": 6.643596172332764, + "learning_rate": 9.960939358977301e-05, + "loss": 2.4575, + "step": 920 + }, + { + "epoch": 0.06932761248800316, + "grad_norm": 41.451416015625, + "learning_rate": 9.960787126907602e-05, + "loss": 2.1879, + "step": 921 + }, + { + "epoch": 0.06940288676866331, + "grad_norm": 9.232449531555176, + "learning_rate": 9.96063459993334e-05, + "loss": 2.1, + "step": 922 + }, + { + "epoch": 0.06947816104932347, + "grad_norm": 4.796618461608887, + "learning_rate": 9.960481778063586e-05, + "loss": 2.5022, + "step": 923 + }, + { + "epoch": 0.06955343532998363, + "grad_norm": 6.436709880828857, + "learning_rate": 9.960328661307423e-05, + "loss": 2.2992, + "step": 924 + }, + { + "epoch": 0.06962870961064378, + "grad_norm": 7.875251770019531, + "learning_rate": 9.960175249673956e-05, + "loss": 2.0712, + "step": 925 + }, + { + "epoch": 0.06970398389130394, + "grad_norm": 6.331722736358643, + "learning_rate": 9.960021543172299e-05, + "loss": 2.3233, + "step": 926 + }, + { + "epoch": 0.06977925817196409, + "grad_norm": 5.4087748527526855, + "learning_rate": 9.959867541811596e-05, + "loss": 1.801, + "step": 927 + }, + { + "epoch": 0.06985453245262425, + "grad_norm": 10.131918907165527, + "learning_rate": 9.959713245600996e-05, + "loss": 2.2608, + "step": 928 + }, + { + "epoch": 0.0699298067332844, + "grad_norm": 10.273932456970215, + "learning_rate": 9.95955865454968e-05, + "loss": 2.2815, + "step": 929 + }, + { + "epoch": 0.07000508101394456, + "grad_norm": 9.296262741088867, + "learning_rate": 9.959403768666828e-05, + "loss": 2.1538, + "step": 930 + }, + { + "epoch": 0.07008035529460471, + "grad_norm": 6.723324775695801, + "learning_rate": 9.959248587961652e-05, + "loss": 2.2478, + "step": 931 + }, + { + "epoch": 0.07015562957526487, + "grad_norm": 8.327369689941406, + "learning_rate": 9.959093112443378e-05, + "loss": 2.3508, + "step": 932 + }, + { + "epoch": 0.07023090385592502, + "grad_norm": 8.568559646606445, + "learning_rate": 9.958937342121249e-05, + "loss": 2.3164, + "step": 933 + }, + { + "epoch": 0.07030617813658518, + "grad_norm": 9.101744651794434, + "learning_rate": 9.958781277004523e-05, + "loss": 2.1864, + "step": 934 + }, + { + "epoch": 0.07038145241724533, + "grad_norm": 7.617058753967285, + "learning_rate": 9.958624917102479e-05, + "loss": 2.0005, + "step": 935 + }, + { + "epoch": 0.0704567266979055, + "grad_norm": 7.2143754959106445, + "learning_rate": 9.958468262424409e-05, + "loss": 2.7119, + "step": 936 + }, + { + "epoch": 0.07053200097856564, + "grad_norm": 7.102320194244385, + "learning_rate": 9.958311312979631e-05, + "loss": 2.7345, + "step": 937 + }, + { + "epoch": 0.0706072752592258, + "grad_norm": 7.540574073791504, + "learning_rate": 9.958154068777473e-05, + "loss": 2.0282, + "step": 938 + }, + { + "epoch": 0.07068254953988595, + "grad_norm": 7.081127643585205, + "learning_rate": 9.95799652982728e-05, + "loss": 2.0488, + "step": 939 + }, + { + "epoch": 0.07075782382054611, + "grad_norm": 8.797993659973145, + "learning_rate": 9.95783869613842e-05, + "loss": 2.1166, + "step": 940 + }, + { + "epoch": 0.07083309810120628, + "grad_norm": 9.974615097045898, + "learning_rate": 9.957680567720275e-05, + "loss": 3.0936, + "step": 941 + }, + { + "epoch": 0.07090837238186642, + "grad_norm": 10.330351829528809, + "learning_rate": 9.957522144582245e-05, + "loss": 2.3086, + "step": 942 + }, + { + "epoch": 0.07098364666252659, + "grad_norm": 28.45594596862793, + "learning_rate": 9.957363426733751e-05, + "loss": 2.5327, + "step": 943 + }, + { + "epoch": 0.07105892094318673, + "grad_norm": 9.89444351196289, + "learning_rate": 9.957204414184223e-05, + "loss": 2.4356, + "step": 944 + }, + { + "epoch": 0.0711341952238469, + "grad_norm": 5.3622636795043945, + "learning_rate": 9.957045106943115e-05, + "loss": 2.3719, + "step": 945 + }, + { + "epoch": 0.07120946950450704, + "grad_norm": 5.727299213409424, + "learning_rate": 9.956885505019901e-05, + "loss": 2.1954, + "step": 946 + }, + { + "epoch": 0.0712847437851672, + "grad_norm": 9.032651901245117, + "learning_rate": 9.956725608424066e-05, + "loss": 2.2576, + "step": 947 + }, + { + "epoch": 0.07136001806582735, + "grad_norm": 7.409090518951416, + "learning_rate": 9.956565417165117e-05, + "loss": 2.4279, + "step": 948 + }, + { + "epoch": 0.07143529234648752, + "grad_norm": 5.698240756988525, + "learning_rate": 9.956404931252574e-05, + "loss": 2.3198, + "step": 949 + }, + { + "epoch": 0.07151056662714766, + "grad_norm": 7.358651161193848, + "learning_rate": 9.956244150695981e-05, + "loss": 2.6111, + "step": 950 + }, + { + "epoch": 0.07158584090780783, + "grad_norm": 6.535305023193359, + "learning_rate": 9.956083075504894e-05, + "loss": 2.1332, + "step": 951 + }, + { + "epoch": 0.07166111518846797, + "grad_norm": 5.578695297241211, + "learning_rate": 9.955921705688889e-05, + "loss": 2.3039, + "step": 952 + }, + { + "epoch": 0.07173638946912814, + "grad_norm": 5.592685222625732, + "learning_rate": 9.955760041257558e-05, + "loss": 2.5537, + "step": 953 + }, + { + "epoch": 0.07181166374978828, + "grad_norm": 5.581692695617676, + "learning_rate": 9.955598082220513e-05, + "loss": 2.0212, + "step": 954 + }, + { + "epoch": 0.07188693803044845, + "grad_norm": 4.848193168640137, + "learning_rate": 9.95543582858738e-05, + "loss": 2.0714, + "step": 955 + }, + { + "epoch": 0.07196221231110861, + "grad_norm": 4.392746448516846, + "learning_rate": 9.955273280367807e-05, + "loss": 2.0494, + "step": 956 + }, + { + "epoch": 0.07203748659176876, + "grad_norm": 4.629284858703613, + "learning_rate": 9.955110437571456e-05, + "loss": 2.2462, + "step": 957 + }, + { + "epoch": 0.07211276087242892, + "grad_norm": 8.943917274475098, + "learning_rate": 9.954947300208007e-05, + "loss": 2.1986, + "step": 958 + }, + { + "epoch": 0.07218803515308907, + "grad_norm": 5.576946258544922, + "learning_rate": 9.954783868287157e-05, + "loss": 2.2512, + "step": 959 + }, + { + "epoch": 0.07226330943374923, + "grad_norm": 4.323258876800537, + "learning_rate": 9.954620141818623e-05, + "loss": 2.5645, + "step": 960 + }, + { + "epoch": 0.07233858371440938, + "grad_norm": 7.824594020843506, + "learning_rate": 9.954456120812139e-05, + "loss": 2.3138, + "step": 961 + }, + { + "epoch": 0.07241385799506954, + "grad_norm": 5.433979511260986, + "learning_rate": 9.954291805277455e-05, + "loss": 1.8625, + "step": 962 + }, + { + "epoch": 0.07248913227572969, + "grad_norm": 7.989663600921631, + "learning_rate": 9.954127195224337e-05, + "loss": 2.4802, + "step": 963 + }, + { + "epoch": 0.07256440655638985, + "grad_norm": 5.893810272216797, + "learning_rate": 9.953962290662573e-05, + "loss": 2.364, + "step": 964 + }, + { + "epoch": 0.07263968083705, + "grad_norm": 5.590322971343994, + "learning_rate": 9.953797091601967e-05, + "loss": 2.1408, + "step": 965 + }, + { + "epoch": 0.07271495511771016, + "grad_norm": 8.895672798156738, + "learning_rate": 9.953631598052335e-05, + "loss": 2.0433, + "step": 966 + }, + { + "epoch": 0.0727902293983703, + "grad_norm": 7.369490623474121, + "learning_rate": 9.95346581002352e-05, + "loss": 2.2219, + "step": 967 + }, + { + "epoch": 0.07286550367903047, + "grad_norm": 4.879428863525391, + "learning_rate": 9.953299727525376e-05, + "loss": 2.3617, + "step": 968 + }, + { + "epoch": 0.07294077795969062, + "grad_norm": 6.520797252655029, + "learning_rate": 9.953133350567774e-05, + "loss": 2.1921, + "step": 969 + }, + { + "epoch": 0.07301605224035078, + "grad_norm": 8.669915199279785, + "learning_rate": 9.952966679160606e-05, + "loss": 2.244, + "step": 970 + }, + { + "epoch": 0.07309132652101093, + "grad_norm": 6.6778998374938965, + "learning_rate": 9.952799713313783e-05, + "loss": 2.3136, + "step": 971 + }, + { + "epoch": 0.07316660080167109, + "grad_norm": 6.274852752685547, + "learning_rate": 9.952632453037227e-05, + "loss": 2.3448, + "step": 972 + }, + { + "epoch": 0.07324187508233125, + "grad_norm": 5.950081825256348, + "learning_rate": 9.952464898340883e-05, + "loss": 2.3861, + "step": 973 + }, + { + "epoch": 0.0733171493629914, + "grad_norm": 5.693521022796631, + "learning_rate": 9.952297049234712e-05, + "loss": 2.0353, + "step": 974 + }, + { + "epoch": 0.07339242364365156, + "grad_norm": 6.284474849700928, + "learning_rate": 9.952128905728688e-05, + "loss": 2.2746, + "step": 975 + }, + { + "epoch": 0.07346769792431171, + "grad_norm": 6.664897918701172, + "learning_rate": 9.951960467832813e-05, + "loss": 2.1025, + "step": 976 + }, + { + "epoch": 0.07354297220497187, + "grad_norm": 6.148214340209961, + "learning_rate": 9.951791735557094e-05, + "loss": 2.4464, + "step": 977 + }, + { + "epoch": 0.07361824648563202, + "grad_norm": 12.645331382751465, + "learning_rate": 9.951622708911567e-05, + "loss": 2.2817, + "step": 978 + }, + { + "epoch": 0.07369352076629218, + "grad_norm": 10.046823501586914, + "learning_rate": 9.951453387906277e-05, + "loss": 2.227, + "step": 979 + }, + { + "epoch": 0.07376879504695233, + "grad_norm": 5.051860332489014, + "learning_rate": 9.951283772551289e-05, + "loss": 2.2613, + "step": 980 + }, + { + "epoch": 0.07384406932761249, + "grad_norm": 5.831755638122559, + "learning_rate": 9.95111386285669e-05, + "loss": 2.1071, + "step": 981 + }, + { + "epoch": 0.07391934360827264, + "grad_norm": 23.8297119140625, + "learning_rate": 9.950943658832577e-05, + "loss": 2.4516, + "step": 982 + }, + { + "epoch": 0.0739946178889328, + "grad_norm": 7.549853324890137, + "learning_rate": 9.950773160489069e-05, + "loss": 2.6201, + "step": 983 + }, + { + "epoch": 0.07406989216959295, + "grad_norm": 6.422366142272949, + "learning_rate": 9.950602367836303e-05, + "loss": 2.5431, + "step": 984 + }, + { + "epoch": 0.07414516645025311, + "grad_norm": 6.1562724113464355, + "learning_rate": 9.950431280884429e-05, + "loss": 2.154, + "step": 985 + }, + { + "epoch": 0.07422044073091326, + "grad_norm": 5.697635650634766, + "learning_rate": 9.950259899643622e-05, + "loss": 2.4768, + "step": 986 + }, + { + "epoch": 0.07429571501157342, + "grad_norm": 6.786236763000488, + "learning_rate": 9.950088224124068e-05, + "loss": 2.1359, + "step": 987 + }, + { + "epoch": 0.07437098929223358, + "grad_norm": 13.68165111541748, + "learning_rate": 9.94991625433597e-05, + "loss": 2.4844, + "step": 988 + }, + { + "epoch": 0.07444626357289373, + "grad_norm": 8.806624412536621, + "learning_rate": 9.949743990289555e-05, + "loss": 2.4181, + "step": 989 + }, + { + "epoch": 0.07452153785355389, + "grad_norm": 8.04106330871582, + "learning_rate": 9.949571431995062e-05, + "loss": 2.26, + "step": 990 + }, + { + "epoch": 0.07459681213421404, + "grad_norm": 6.4797539710998535, + "learning_rate": 9.949398579462749e-05, + "loss": 2.19, + "step": 991 + }, + { + "epoch": 0.0746720864148742, + "grad_norm": 13.77220630645752, + "learning_rate": 9.949225432702893e-05, + "loss": 2.4597, + "step": 992 + }, + { + "epoch": 0.07474736069553435, + "grad_norm": 9.641508102416992, + "learning_rate": 9.949051991725786e-05, + "loss": 2.2968, + "step": 993 + }, + { + "epoch": 0.07482263497619451, + "grad_norm": 9.998353958129883, + "learning_rate": 9.948878256541738e-05, + "loss": 2.2508, + "step": 994 + }, + { + "epoch": 0.07489790925685466, + "grad_norm": 7.318953990936279, + "learning_rate": 9.948704227161078e-05, + "loss": 2.3632, + "step": 995 + }, + { + "epoch": 0.07497318353751482, + "grad_norm": 8.15821361541748, + "learning_rate": 9.94852990359415e-05, + "loss": 2.1609, + "step": 996 + }, + { + "epoch": 0.07504845781817497, + "grad_norm": 5.258177757263184, + "learning_rate": 9.948355285851318e-05, + "loss": 2.1151, + "step": 997 + }, + { + "epoch": 0.07512373209883513, + "grad_norm": 9.631346702575684, + "learning_rate": 9.948180373942963e-05, + "loss": 2.4226, + "step": 998 + }, + { + "epoch": 0.07519900637949528, + "grad_norm": 8.572126388549805, + "learning_rate": 9.948005167879485e-05, + "loss": 1.986, + "step": 999 + }, + { + "epoch": 0.07527428066015544, + "grad_norm": 9.46074104309082, + "learning_rate": 9.947829667671294e-05, + "loss": 2.2485, + "step": 1000 + }, + { + "epoch": 0.07534955494081559, + "grad_norm": 8.006175994873047, + "learning_rate": 9.947653873328828e-05, + "loss": 2.3965, + "step": 1001 + }, + { + "epoch": 0.07542482922147575, + "grad_norm": 7.2722368240356445, + "learning_rate": 9.947477784862536e-05, + "loss": 2.3063, + "step": 1002 + }, + { + "epoch": 0.0755001035021359, + "grad_norm": 5.273183345794678, + "learning_rate": 9.947301402282884e-05, + "loss": 2.2712, + "step": 1003 + }, + { + "epoch": 0.07557537778279606, + "grad_norm": 9.717782020568848, + "learning_rate": 9.947124725600359e-05, + "loss": 2.4998, + "step": 1004 + }, + { + "epoch": 0.07565065206345623, + "grad_norm": 18.217540740966797, + "learning_rate": 9.946947754825464e-05, + "loss": 2.3622, + "step": 1005 + }, + { + "epoch": 0.07572592634411637, + "grad_norm": 18.87044334411621, + "learning_rate": 9.94677048996872e-05, + "loss": 2.2011, + "step": 1006 + }, + { + "epoch": 0.07580120062477654, + "grad_norm": 12.92352294921875, + "learning_rate": 9.946592931040666e-05, + "loss": 2.5136, + "step": 1007 + }, + { + "epoch": 0.07587647490543668, + "grad_norm": 9.815780639648438, + "learning_rate": 9.946415078051853e-05, + "loss": 2.2793, + "step": 1008 + }, + { + "epoch": 0.07595174918609685, + "grad_norm": 6.124758243560791, + "learning_rate": 9.946236931012858e-05, + "loss": 2.2679, + "step": 1009 + }, + { + "epoch": 0.076027023466757, + "grad_norm": 13.920767784118652, + "learning_rate": 9.946058489934269e-05, + "loss": 2.3944, + "step": 1010 + }, + { + "epoch": 0.07610229774741716, + "grad_norm": 9.413524627685547, + "learning_rate": 9.945879754826695e-05, + "loss": 2.5051, + "step": 1011 + }, + { + "epoch": 0.0761775720280773, + "grad_norm": 28.315019607543945, + "learning_rate": 9.94570072570076e-05, + "loss": 2.4098, + "step": 1012 + }, + { + "epoch": 0.07625284630873747, + "grad_norm": 51.158729553222656, + "learning_rate": 9.945521402567109e-05, + "loss": 2.1772, + "step": 1013 + }, + { + "epoch": 0.07632812058939761, + "grad_norm": 11.702067375183105, + "learning_rate": 9.9453417854364e-05, + "loss": 2.1498, + "step": 1014 + }, + { + "epoch": 0.07640339487005778, + "grad_norm": 9.4339017868042, + "learning_rate": 9.945161874319311e-05, + "loss": 1.9991, + "step": 1015 + }, + { + "epoch": 0.07647866915071792, + "grad_norm": 10.04547119140625, + "learning_rate": 9.944981669226539e-05, + "loss": 2.1931, + "step": 1016 + }, + { + "epoch": 0.07655394343137809, + "grad_norm": 5.388617038726807, + "learning_rate": 9.944801170168796e-05, + "loss": 2.549, + "step": 1017 + }, + { + "epoch": 0.07662921771203823, + "grad_norm": 6.930098533630371, + "learning_rate": 9.94462037715681e-05, + "loss": 2.1891, + "step": 1018 + }, + { + "epoch": 0.0767044919926984, + "grad_norm": 5.418160438537598, + "learning_rate": 9.944439290201331e-05, + "loss": 2.2654, + "step": 1019 + }, + { + "epoch": 0.07677976627335854, + "grad_norm": 7.499770641326904, + "learning_rate": 9.944257909313124e-05, + "loss": 2.6116, + "step": 1020 + }, + { + "epoch": 0.0768550405540187, + "grad_norm": 6.448430061340332, + "learning_rate": 9.94407623450297e-05, + "loss": 2.4772, + "step": 1021 + }, + { + "epoch": 0.07693031483467887, + "grad_norm": 7.160433292388916, + "learning_rate": 9.94389426578167e-05, + "loss": 2.2233, + "step": 1022 + }, + { + "epoch": 0.07700558911533902, + "grad_norm": 5.030216693878174, + "learning_rate": 9.943712003160043e-05, + "loss": 2.1228, + "step": 1023 + }, + { + "epoch": 0.07708086339599918, + "grad_norm": 5.987746715545654, + "learning_rate": 9.943529446648921e-05, + "loss": 2.2869, + "step": 1024 + }, + { + "epoch": 0.07715613767665933, + "grad_norm": 6.163780212402344, + "learning_rate": 9.943346596259159e-05, + "loss": 2.3491, + "step": 1025 + }, + { + "epoch": 0.07723141195731949, + "grad_norm": 9.042941093444824, + "learning_rate": 9.943163452001625e-05, + "loss": 2.5431, + "step": 1026 + }, + { + "epoch": 0.07730668623797964, + "grad_norm": 6.006967544555664, + "learning_rate": 9.942980013887207e-05, + "loss": 2.1433, + "step": 1027 + }, + { + "epoch": 0.0773819605186398, + "grad_norm": 4.533386707305908, + "learning_rate": 9.942796281926812e-05, + "loss": 2.7674, + "step": 1028 + }, + { + "epoch": 0.07745723479929995, + "grad_norm": 7.154090404510498, + "learning_rate": 9.942612256131359e-05, + "loss": 2.2536, + "step": 1029 + }, + { + "epoch": 0.07753250907996011, + "grad_norm": 5.8469133377075195, + "learning_rate": 9.94242793651179e-05, + "loss": 1.8459, + "step": 1030 + }, + { + "epoch": 0.07760778336062026, + "grad_norm": 6.218902111053467, + "learning_rate": 9.942243323079061e-05, + "loss": 2.2182, + "step": 1031 + }, + { + "epoch": 0.07768305764128042, + "grad_norm": 5.926104545593262, + "learning_rate": 9.942058415844147e-05, + "loss": 2.4657, + "step": 1032 + }, + { + "epoch": 0.07775833192194057, + "grad_norm": 9.584598541259766, + "learning_rate": 9.941873214818042e-05, + "loss": 2.2548, + "step": 1033 + }, + { + "epoch": 0.07783360620260073, + "grad_norm": 7.574162483215332, + "learning_rate": 9.941687720011753e-05, + "loss": 2.0924, + "step": 1034 + }, + { + "epoch": 0.07790888048326088, + "grad_norm": 8.9089994430542, + "learning_rate": 9.941501931436309e-05, + "loss": 2.0136, + "step": 1035 + }, + { + "epoch": 0.07798415476392104, + "grad_norm": 6.595572471618652, + "learning_rate": 9.941315849102753e-05, + "loss": 2.6039, + "step": 1036 + }, + { + "epoch": 0.0780594290445812, + "grad_norm": 4.419295310974121, + "learning_rate": 9.94112947302215e-05, + "loss": 2.1338, + "step": 1037 + }, + { + "epoch": 0.07813470332524135, + "grad_norm": 7.29696798324585, + "learning_rate": 9.940942803205575e-05, + "loss": 2.1771, + "step": 1038 + }, + { + "epoch": 0.07820997760590151, + "grad_norm": 4.897830963134766, + "learning_rate": 9.940755839664128e-05, + "loss": 2.2792, + "step": 1039 + }, + { + "epoch": 0.07828525188656166, + "grad_norm": 6.640675067901611, + "learning_rate": 9.940568582408924e-05, + "loss": 2.3644, + "step": 1040 + }, + { + "epoch": 0.07836052616722182, + "grad_norm": 4.935637474060059, + "learning_rate": 9.940381031451094e-05, + "loss": 2.296, + "step": 1041 + }, + { + "epoch": 0.07843580044788197, + "grad_norm": 5.48577880859375, + "learning_rate": 9.940193186801787e-05, + "loss": 2.064, + "step": 1042 + }, + { + "epoch": 0.07851107472854213, + "grad_norm": 6.665215492248535, + "learning_rate": 9.94000504847217e-05, + "loss": 2.0016, + "step": 1043 + }, + { + "epoch": 0.07858634900920228, + "grad_norm": 6.476923942565918, + "learning_rate": 9.939816616473427e-05, + "loss": 2.3703, + "step": 1044 + }, + { + "epoch": 0.07866162328986244, + "grad_norm": 6.548463344573975, + "learning_rate": 9.93962789081676e-05, + "loss": 2.3357, + "step": 1045 + }, + { + "epoch": 0.07873689757052259, + "grad_norm": 6.8166608810424805, + "learning_rate": 9.939438871513388e-05, + "loss": 1.9012, + "step": 1046 + }, + { + "epoch": 0.07881217185118275, + "grad_norm": 4.497401237487793, + "learning_rate": 9.939249558574548e-05, + "loss": 2.2389, + "step": 1047 + }, + { + "epoch": 0.0788874461318429, + "grad_norm": 5.580781936645508, + "learning_rate": 9.939059952011494e-05, + "loss": 2.3124, + "step": 1048 + }, + { + "epoch": 0.07896272041250306, + "grad_norm": 5.667317867279053, + "learning_rate": 9.938870051835498e-05, + "loss": 2.2054, + "step": 1049 + }, + { + "epoch": 0.07903799469316321, + "grad_norm": 4.224584102630615, + "learning_rate": 9.938679858057848e-05, + "loss": 2.2884, + "step": 1050 + }, + { + "epoch": 0.07911326897382337, + "grad_norm": 5.601130485534668, + "learning_rate": 9.938489370689853e-05, + "loss": 2.2813, + "step": 1051 + }, + { + "epoch": 0.07918854325448352, + "grad_norm": 4.993713855743408, + "learning_rate": 9.938298589742834e-05, + "loss": 2.521, + "step": 1052 + }, + { + "epoch": 0.07926381753514368, + "grad_norm": 5.588556289672852, + "learning_rate": 9.938107515228133e-05, + "loss": 2.0526, + "step": 1053 + }, + { + "epoch": 0.07933909181580384, + "grad_norm": 6.0439677238464355, + "learning_rate": 9.93791614715711e-05, + "loss": 2.2687, + "step": 1054 + }, + { + "epoch": 0.07941436609646399, + "grad_norm": 5.854000091552734, + "learning_rate": 9.93772448554114e-05, + "loss": 2.1055, + "step": 1055 + }, + { + "epoch": 0.07948964037712415, + "grad_norm": 6.042765140533447, + "learning_rate": 9.937532530391617e-05, + "loss": 2.2494, + "step": 1056 + }, + { + "epoch": 0.0795649146577843, + "grad_norm": 7.519721984863281, + "learning_rate": 9.937340281719952e-05, + "loss": 2.272, + "step": 1057 + }, + { + "epoch": 0.07964018893844446, + "grad_norm": 5.414956569671631, + "learning_rate": 9.937147739537576e-05, + "loss": 2.2651, + "step": 1058 + }, + { + "epoch": 0.07971546321910461, + "grad_norm": 5.339776039123535, + "learning_rate": 9.93695490385593e-05, + "loss": 1.9274, + "step": 1059 + }, + { + "epoch": 0.07979073749976477, + "grad_norm": 6.980020999908447, + "learning_rate": 9.936761774686482e-05, + "loss": 2.531, + "step": 1060 + }, + { + "epoch": 0.07986601178042492, + "grad_norm": 5.8498215675354, + "learning_rate": 9.936568352040714e-05, + "loss": 2.4471, + "step": 1061 + }, + { + "epoch": 0.07994128606108508, + "grad_norm": 5.2305192947387695, + "learning_rate": 9.936374635930119e-05, + "loss": 2.1691, + "step": 1062 + }, + { + "epoch": 0.08001656034174523, + "grad_norm": 6.717306137084961, + "learning_rate": 9.936180626366218e-05, + "loss": 2.1764, + "step": 1063 + }, + { + "epoch": 0.08009183462240539, + "grad_norm": 6.710395336151123, + "learning_rate": 9.935986323360541e-05, + "loss": 2.3192, + "step": 1064 + }, + { + "epoch": 0.08016710890306554, + "grad_norm": 6.129795074462891, + "learning_rate": 9.935791726924642e-05, + "loss": 2.1073, + "step": 1065 + }, + { + "epoch": 0.0802423831837257, + "grad_norm": 4.619602203369141, + "learning_rate": 9.935596837070085e-05, + "loss": 2.1327, + "step": 1066 + }, + { + "epoch": 0.08031765746438585, + "grad_norm": 5.133860111236572, + "learning_rate": 9.935401653808458e-05, + "loss": 2.2376, + "step": 1067 + }, + { + "epoch": 0.08039293174504601, + "grad_norm": 4.671417713165283, + "learning_rate": 9.935206177151364e-05, + "loss": 2.1846, + "step": 1068 + }, + { + "epoch": 0.08046820602570617, + "grad_norm": 4.523170471191406, + "learning_rate": 9.935010407110424e-05, + "loss": 2.1667, + "step": 1069 + }, + { + "epoch": 0.08054348030636632, + "grad_norm": 7.418670177459717, + "learning_rate": 9.934814343697275e-05, + "loss": 1.9934, + "step": 1070 + }, + { + "epoch": 0.08061875458702648, + "grad_norm": 4.622326850891113, + "learning_rate": 9.934617986923574e-05, + "loss": 2.4415, + "step": 1071 + }, + { + "epoch": 0.08069402886768663, + "grad_norm": 4.658565998077393, + "learning_rate": 9.934421336800992e-05, + "loss": 2.1716, + "step": 1072 + }, + { + "epoch": 0.0807693031483468, + "grad_norm": 7.18447208404541, + "learning_rate": 9.934224393341219e-05, + "loss": 2.1342, + "step": 1073 + }, + { + "epoch": 0.08084457742900694, + "grad_norm": 5.394633769989014, + "learning_rate": 9.934027156555965e-05, + "loss": 2.2161, + "step": 1074 + }, + { + "epoch": 0.0809198517096671, + "grad_norm": 5.659594535827637, + "learning_rate": 9.933829626456953e-05, + "loss": 2.2142, + "step": 1075 + }, + { + "epoch": 0.08099512599032725, + "grad_norm": 4.856943130493164, + "learning_rate": 9.933631803055927e-05, + "loss": 2.0258, + "step": 1076 + }, + { + "epoch": 0.08107040027098741, + "grad_norm": 4.335635185241699, + "learning_rate": 9.933433686364646e-05, + "loss": 2.4085, + "step": 1077 + }, + { + "epoch": 0.08114567455164756, + "grad_norm": 8.24028491973877, + "learning_rate": 9.933235276394887e-05, + "loss": 2.1489, + "step": 1078 + }, + { + "epoch": 0.08122094883230772, + "grad_norm": 6.36244010925293, + "learning_rate": 9.933036573158449e-05, + "loss": 2.3271, + "step": 1079 + }, + { + "epoch": 0.08129622311296787, + "grad_norm": 7.184402942657471, + "learning_rate": 9.93283757666714e-05, + "loss": 1.9127, + "step": 1080 + }, + { + "epoch": 0.08137149739362803, + "grad_norm": 6.435911178588867, + "learning_rate": 9.93263828693279e-05, + "loss": 2.3839, + "step": 1081 + }, + { + "epoch": 0.08144677167428818, + "grad_norm": 7.429813385009766, + "learning_rate": 9.932438703967246e-05, + "loss": 2.2122, + "step": 1082 + }, + { + "epoch": 0.08152204595494834, + "grad_norm": 9.907596588134766, + "learning_rate": 9.932238827782376e-05, + "loss": 2.1753, + "step": 1083 + }, + { + "epoch": 0.08159732023560849, + "grad_norm": 6.798231601715088, + "learning_rate": 9.93203865839006e-05, + "loss": 2.2066, + "step": 1084 + }, + { + "epoch": 0.08167259451626865, + "grad_norm": 6.365963935852051, + "learning_rate": 9.931838195802196e-05, + "loss": 2.2131, + "step": 1085 + }, + { + "epoch": 0.08174786879692882, + "grad_norm": 6.472831726074219, + "learning_rate": 9.931637440030702e-05, + "loss": 2.0847, + "step": 1086 + }, + { + "epoch": 0.08182314307758896, + "grad_norm": 5.441748142242432, + "learning_rate": 9.931436391087514e-05, + "loss": 2.0567, + "step": 1087 + }, + { + "epoch": 0.08189841735824913, + "grad_norm": 5.4392409324646, + "learning_rate": 9.931235048984582e-05, + "loss": 2.1287, + "step": 1088 + }, + { + "epoch": 0.08197369163890927, + "grad_norm": 3.969637155532837, + "learning_rate": 9.931033413733874e-05, + "loss": 2.5528, + "step": 1089 + }, + { + "epoch": 0.08204896591956944, + "grad_norm": 5.267080783843994, + "learning_rate": 9.930831485347381e-05, + "loss": 2.0441, + "step": 1090 + }, + { + "epoch": 0.08212424020022958, + "grad_norm": 5.525221824645996, + "learning_rate": 9.930629263837103e-05, + "loss": 2.1183, + "step": 1091 + }, + { + "epoch": 0.08219951448088975, + "grad_norm": 4.716779708862305, + "learning_rate": 9.930426749215063e-05, + "loss": 2.2076, + "step": 1092 + }, + { + "epoch": 0.0822747887615499, + "grad_norm": 7.564617156982422, + "learning_rate": 9.9302239414933e-05, + "loss": 2.2812, + "step": 1093 + }, + { + "epoch": 0.08235006304221006, + "grad_norm": 4.890120506286621, + "learning_rate": 9.930020840683869e-05, + "loss": 2.2501, + "step": 1094 + }, + { + "epoch": 0.0824253373228702, + "grad_norm": 7.044039249420166, + "learning_rate": 9.929817446798846e-05, + "loss": 2.2245, + "step": 1095 + }, + { + "epoch": 0.08250061160353037, + "grad_norm": 5.987939357757568, + "learning_rate": 9.929613759850321e-05, + "loss": 2.604, + "step": 1096 + }, + { + "epoch": 0.08257588588419051, + "grad_norm": 5.388245105743408, + "learning_rate": 9.929409779850401e-05, + "loss": 2.2337, + "step": 1097 + }, + { + "epoch": 0.08265116016485068, + "grad_norm": 5.306590557098389, + "learning_rate": 9.929205506811215e-05, + "loss": 1.9403, + "step": 1098 + }, + { + "epoch": 0.08272643444551082, + "grad_norm": 5.837907791137695, + "learning_rate": 9.929000940744906e-05, + "loss": 2.0672, + "step": 1099 + }, + { + "epoch": 0.08280170872617099, + "grad_norm": 5.475199222564697, + "learning_rate": 9.928796081663632e-05, + "loss": 2.165, + "step": 1100 + }, + { + "epoch": 0.08287698300683113, + "grad_norm": 6.814287185668945, + "learning_rate": 9.928590929579575e-05, + "loss": 2.4847, + "step": 1101 + }, + { + "epoch": 0.0829522572874913, + "grad_norm": 9.756586074829102, + "learning_rate": 9.928385484504927e-05, + "loss": 2.3618, + "step": 1102 + }, + { + "epoch": 0.08302753156815146, + "grad_norm": 7.229280948638916, + "learning_rate": 9.928179746451905e-05, + "loss": 2.7062, + "step": 1103 + }, + { + "epoch": 0.0831028058488116, + "grad_norm": 6.340739727020264, + "learning_rate": 9.927973715432736e-05, + "loss": 2.3042, + "step": 1104 + }, + { + "epoch": 0.08317808012947177, + "grad_norm": 5.95250129699707, + "learning_rate": 9.92776739145967e-05, + "loss": 2.0521, + "step": 1105 + }, + { + "epoch": 0.08325335441013192, + "grad_norm": 5.412210464477539, + "learning_rate": 9.927560774544972e-05, + "loss": 2.3434, + "step": 1106 + }, + { + "epoch": 0.08332862869079208, + "grad_norm": 4.403136730194092, + "learning_rate": 9.927353864700926e-05, + "loss": 2.032, + "step": 1107 + }, + { + "epoch": 0.08340390297145223, + "grad_norm": 4.979170799255371, + "learning_rate": 9.92714666193983e-05, + "loss": 2.2241, + "step": 1108 + }, + { + "epoch": 0.08347917725211239, + "grad_norm": 5.445007801055908, + "learning_rate": 9.926939166274003e-05, + "loss": 1.9934, + "step": 1109 + }, + { + "epoch": 0.08355445153277254, + "grad_norm": 5.468322277069092, + "learning_rate": 9.926731377715779e-05, + "loss": 2.1383, + "step": 1110 + }, + { + "epoch": 0.0836297258134327, + "grad_norm": 4.422060966491699, + "learning_rate": 9.926523296277511e-05, + "loss": 2.0195, + "step": 1111 + }, + { + "epoch": 0.08370500009409285, + "grad_norm": 5.470639705657959, + "learning_rate": 9.926314921971567e-05, + "loss": 2.4246, + "step": 1112 + }, + { + "epoch": 0.08378027437475301, + "grad_norm": 4.583911418914795, + "learning_rate": 9.926106254810339e-05, + "loss": 2.2444, + "step": 1113 + }, + { + "epoch": 0.08385554865541316, + "grad_norm": 6.017763137817383, + "learning_rate": 9.925897294806228e-05, + "loss": 2.0725, + "step": 1114 + }, + { + "epoch": 0.08393082293607332, + "grad_norm": 4.822187900543213, + "learning_rate": 9.925688041971656e-05, + "loss": 1.9456, + "step": 1115 + }, + { + "epoch": 0.08400609721673347, + "grad_norm": 5.219765663146973, + "learning_rate": 9.925478496319063e-05, + "loss": 2.2577, + "step": 1116 + }, + { + "epoch": 0.08408137149739363, + "grad_norm": 6.1115264892578125, + "learning_rate": 9.925268657860906e-05, + "loss": 2.335, + "step": 1117 + }, + { + "epoch": 0.08415664577805379, + "grad_norm": 5.653519153594971, + "learning_rate": 9.925058526609661e-05, + "loss": 2.188, + "step": 1118 + }, + { + "epoch": 0.08423192005871394, + "grad_norm": 5.540939807891846, + "learning_rate": 9.924848102577818e-05, + "loss": 2.0318, + "step": 1119 + }, + { + "epoch": 0.0843071943393741, + "grad_norm": 5.317679405212402, + "learning_rate": 9.924637385777886e-05, + "loss": 2.3206, + "step": 1120 + }, + { + "epoch": 0.08438246862003425, + "grad_norm": 6.680553913116455, + "learning_rate": 9.924426376222391e-05, + "loss": 2.468, + "step": 1121 + }, + { + "epoch": 0.08445774290069441, + "grad_norm": 6.053913593292236, + "learning_rate": 9.92421507392388e-05, + "loss": 2.1009, + "step": 1122 + }, + { + "epoch": 0.08453301718135456, + "grad_norm": 4.4654436111450195, + "learning_rate": 9.924003478894908e-05, + "loss": 2.064, + "step": 1123 + }, + { + "epoch": 0.08460829146201472, + "grad_norm": 7.732831001281738, + "learning_rate": 9.923791591148061e-05, + "loss": 2.131, + "step": 1124 + }, + { + "epoch": 0.08468356574267487, + "grad_norm": 7.678844928741455, + "learning_rate": 9.923579410695932e-05, + "loss": 2.2194, + "step": 1125 + }, + { + "epoch": 0.08475884002333503, + "grad_norm": 8.180438041687012, + "learning_rate": 9.923366937551132e-05, + "loss": 2.4301, + "step": 1126 + }, + { + "epoch": 0.08483411430399518, + "grad_norm": 6.855792999267578, + "learning_rate": 9.923154171726295e-05, + "loss": 1.9302, + "step": 1127 + }, + { + "epoch": 0.08490938858465534, + "grad_norm": 5.4791951179504395, + "learning_rate": 9.92294111323407e-05, + "loss": 2.4229, + "step": 1128 + }, + { + "epoch": 0.08498466286531549, + "grad_norm": 6.284817218780518, + "learning_rate": 9.92272776208712e-05, + "loss": 2.0521, + "step": 1129 + }, + { + "epoch": 0.08505993714597565, + "grad_norm": 6.739572048187256, + "learning_rate": 9.92251411829813e-05, + "loss": 2.2922, + "step": 1130 + }, + { + "epoch": 0.0851352114266358, + "grad_norm": 6.526830673217773, + "learning_rate": 9.9223001818798e-05, + "loss": 2.0448, + "step": 1131 + }, + { + "epoch": 0.08521048570729596, + "grad_norm": 4.843202114105225, + "learning_rate": 9.922085952844846e-05, + "loss": 2.3992, + "step": 1132 + }, + { + "epoch": 0.08528575998795611, + "grad_norm": 5.724582195281982, + "learning_rate": 9.921871431206007e-05, + "loss": 2.0071, + "step": 1133 + }, + { + "epoch": 0.08536103426861627, + "grad_norm": 5.391201496124268, + "learning_rate": 9.921656616976033e-05, + "loss": 2.6196, + "step": 1134 + }, + { + "epoch": 0.08543630854927643, + "grad_norm": 6.695403099060059, + "learning_rate": 9.921441510167697e-05, + "loss": 2.9009, + "step": 1135 + }, + { + "epoch": 0.08551158282993658, + "grad_norm": 7.025712490081787, + "learning_rate": 9.921226110793783e-05, + "loss": 2.214, + "step": 1136 + }, + { + "epoch": 0.08558685711059674, + "grad_norm": 5.759085655212402, + "learning_rate": 9.921010418867098e-05, + "loss": 2.085, + "step": 1137 + }, + { + "epoch": 0.08566213139125689, + "grad_norm": 6.185511589050293, + "learning_rate": 9.920794434400462e-05, + "loss": 2.2703, + "step": 1138 + }, + { + "epoch": 0.08573740567191705, + "grad_norm": 6.3566460609436035, + "learning_rate": 9.920578157406717e-05, + "loss": 2.3132, + "step": 1139 + }, + { + "epoch": 0.0858126799525772, + "grad_norm": 7.459353923797607, + "learning_rate": 9.920361587898719e-05, + "loss": 1.9444, + "step": 1140 + }, + { + "epoch": 0.08588795423323736, + "grad_norm": 7.395384311676025, + "learning_rate": 9.920144725889343e-05, + "loss": 2.0285, + "step": 1141 + }, + { + "epoch": 0.08596322851389751, + "grad_norm": 7.60577392578125, + "learning_rate": 9.91992757139148e-05, + "loss": 2.3224, + "step": 1142 + }, + { + "epoch": 0.08603850279455767, + "grad_norm": 5.471445083618164, + "learning_rate": 9.919710124418042e-05, + "loss": 2.0211, + "step": 1143 + }, + { + "epoch": 0.08611377707521782, + "grad_norm": 5.900109767913818, + "learning_rate": 9.919492384981951e-05, + "loss": 1.8563, + "step": 1144 + }, + { + "epoch": 0.08618905135587798, + "grad_norm": 6.275654315948486, + "learning_rate": 9.919274353096154e-05, + "loss": 2.2104, + "step": 1145 + }, + { + "epoch": 0.08626432563653813, + "grad_norm": 6.282829761505127, + "learning_rate": 9.91905602877361e-05, + "loss": 2.2824, + "step": 1146 + }, + { + "epoch": 0.08633959991719829, + "grad_norm": 6.2379560470581055, + "learning_rate": 9.918837412027302e-05, + "loss": 2.2563, + "step": 1147 + }, + { + "epoch": 0.08641487419785844, + "grad_norm": 8.100504875183105, + "learning_rate": 9.918618502870222e-05, + "loss": 1.9086, + "step": 1148 + }, + { + "epoch": 0.0864901484785186, + "grad_norm": 7.615232944488525, + "learning_rate": 9.918399301315386e-05, + "loss": 2.0459, + "step": 1149 + }, + { + "epoch": 0.08656542275917876, + "grad_norm": 4.653862476348877, + "learning_rate": 9.918179807375824e-05, + "loss": 1.7553, + "step": 1150 + }, + { + "epoch": 0.08664069703983891, + "grad_norm": 5.737155437469482, + "learning_rate": 9.917960021064585e-05, + "loss": 2.3357, + "step": 1151 + }, + { + "epoch": 0.08671597132049907, + "grad_norm": 6.175400257110596, + "learning_rate": 9.917739942394733e-05, + "loss": 2.2961, + "step": 1152 + }, + { + "epoch": 0.08679124560115922, + "grad_norm": 5.4419965744018555, + "learning_rate": 9.917519571379352e-05, + "loss": 1.8495, + "step": 1153 + }, + { + "epoch": 0.08686651988181938, + "grad_norm": 5.63547945022583, + "learning_rate": 9.917298908031543e-05, + "loss": 2.1394, + "step": 1154 + }, + { + "epoch": 0.08694179416247953, + "grad_norm": 6.771299362182617, + "learning_rate": 9.917077952364422e-05, + "loss": 2.2039, + "step": 1155 + }, + { + "epoch": 0.0870170684431397, + "grad_norm": 5.671048641204834, + "learning_rate": 9.916856704391127e-05, + "loss": 2.1265, + "step": 1156 + }, + { + "epoch": 0.08709234272379984, + "grad_norm": 6.2881269454956055, + "learning_rate": 9.916635164124807e-05, + "loss": 2.152, + "step": 1157 + }, + { + "epoch": 0.08716761700446, + "grad_norm": 4.633078575134277, + "learning_rate": 9.916413331578635e-05, + "loss": 2.2791, + "step": 1158 + }, + { + "epoch": 0.08724289128512015, + "grad_norm": 8.621638298034668, + "learning_rate": 9.916191206765797e-05, + "loss": 2.0797, + "step": 1159 + }, + { + "epoch": 0.08731816556578031, + "grad_norm": 6.749201774597168, + "learning_rate": 9.915968789699499e-05, + "loss": 2.2578, + "step": 1160 + }, + { + "epoch": 0.08739343984644046, + "grad_norm": 6.986334323883057, + "learning_rate": 9.91574608039296e-05, + "loss": 2.4061, + "step": 1161 + }, + { + "epoch": 0.08746871412710062, + "grad_norm": 7.910472869873047, + "learning_rate": 9.915523078859423e-05, + "loss": 2.1654, + "step": 1162 + }, + { + "epoch": 0.08754398840776077, + "grad_norm": 6.145853519439697, + "learning_rate": 9.915299785112142e-05, + "loss": 2.3252, + "step": 1163 + }, + { + "epoch": 0.08761926268842093, + "grad_norm": 6.2363715171813965, + "learning_rate": 9.915076199164392e-05, + "loss": 2.5624, + "step": 1164 + }, + { + "epoch": 0.08769453696908108, + "grad_norm": 6.332347393035889, + "learning_rate": 9.914852321029466e-05, + "loss": 2.1788, + "step": 1165 + }, + { + "epoch": 0.08776981124974124, + "grad_norm": 6.0127787590026855, + "learning_rate": 9.914628150720674e-05, + "loss": 2.3107, + "step": 1166 + }, + { + "epoch": 0.0878450855304014, + "grad_norm": 6.66409969329834, + "learning_rate": 9.914403688251337e-05, + "loss": 2.1479, + "step": 1167 + }, + { + "epoch": 0.08792035981106155, + "grad_norm": 4.9314703941345215, + "learning_rate": 9.914178933634801e-05, + "loss": 2.2033, + "step": 1168 + }, + { + "epoch": 0.08799563409172172, + "grad_norm": 6.911401271820068, + "learning_rate": 9.913953886884431e-05, + "loss": 2.3213, + "step": 1169 + }, + { + "epoch": 0.08807090837238186, + "grad_norm": 4.680646896362305, + "learning_rate": 9.913728548013599e-05, + "loss": 2.1207, + "step": 1170 + }, + { + "epoch": 0.08814618265304203, + "grad_norm": 7.904140472412109, + "learning_rate": 9.913502917035705e-05, + "loss": 2.4157, + "step": 1171 + }, + { + "epoch": 0.08822145693370217, + "grad_norm": 4.945505142211914, + "learning_rate": 9.913276993964162e-05, + "loss": 2.1609, + "step": 1172 + }, + { + "epoch": 0.08829673121436234, + "grad_norm": 5.282904624938965, + "learning_rate": 9.913050778812399e-05, + "loss": 2.1884, + "step": 1173 + }, + { + "epoch": 0.08837200549502248, + "grad_norm": 5.114220142364502, + "learning_rate": 9.912824271593863e-05, + "loss": 2.2543, + "step": 1174 + }, + { + "epoch": 0.08844727977568265, + "grad_norm": 7.029424667358398, + "learning_rate": 9.912597472322021e-05, + "loss": 2.267, + "step": 1175 + }, + { + "epoch": 0.0885225540563428, + "grad_norm": 6.848612308502197, + "learning_rate": 9.912370381010356e-05, + "loss": 2.3062, + "step": 1176 + }, + { + "epoch": 0.08859782833700296, + "grad_norm": 8.22595500946045, + "learning_rate": 9.912142997672366e-05, + "loss": 2.1037, + "step": 1177 + }, + { + "epoch": 0.0886731026176631, + "grad_norm": 8.24185562133789, + "learning_rate": 9.911915322321572e-05, + "loss": 2.3139, + "step": 1178 + }, + { + "epoch": 0.08874837689832327, + "grad_norm": 5.798275947570801, + "learning_rate": 9.911687354971504e-05, + "loss": 2.0674, + "step": 1179 + }, + { + "epoch": 0.08882365117898341, + "grad_norm": 6.978446960449219, + "learning_rate": 9.911459095635716e-05, + "loss": 2.1359, + "step": 1180 + }, + { + "epoch": 0.08889892545964358, + "grad_norm": 5.870532035827637, + "learning_rate": 9.911230544327776e-05, + "loss": 1.8003, + "step": 1181 + }, + { + "epoch": 0.08897419974030372, + "grad_norm": 5.043681621551514, + "learning_rate": 9.911001701061273e-05, + "loss": 2.2888, + "step": 1182 + }, + { + "epoch": 0.08904947402096389, + "grad_norm": 6.601596832275391, + "learning_rate": 9.91077256584981e-05, + "loss": 2.1751, + "step": 1183 + }, + { + "epoch": 0.08912474830162405, + "grad_norm": 7.080781936645508, + "learning_rate": 9.910543138707009e-05, + "loss": 2.2919, + "step": 1184 + }, + { + "epoch": 0.0892000225822842, + "grad_norm": 4.893709659576416, + "learning_rate": 9.910313419646509e-05, + "loss": 2.1967, + "step": 1185 + }, + { + "epoch": 0.08927529686294436, + "grad_norm": 4.72605562210083, + "learning_rate": 9.910083408681966e-05, + "loss": 2.2036, + "step": 1186 + }, + { + "epoch": 0.0893505711436045, + "grad_norm": 5.478279113769531, + "learning_rate": 9.909853105827051e-05, + "loss": 1.9166, + "step": 1187 + }, + { + "epoch": 0.08942584542426467, + "grad_norm": 5.6111369132995605, + "learning_rate": 9.909622511095458e-05, + "loss": 1.9616, + "step": 1188 + }, + { + "epoch": 0.08950111970492482, + "grad_norm": 5.122681140899658, + "learning_rate": 9.909391624500892e-05, + "loss": 1.888, + "step": 1189 + }, + { + "epoch": 0.08957639398558498, + "grad_norm": 6.399326801300049, + "learning_rate": 9.909160446057082e-05, + "loss": 2.0555, + "step": 1190 + }, + { + "epoch": 0.08965166826624513, + "grad_norm": 5.359175682067871, + "learning_rate": 9.90892897577777e-05, + "loss": 2.29, + "step": 1191 + }, + { + "epoch": 0.08972694254690529, + "grad_norm": 4.187844753265381, + "learning_rate": 9.908697213676716e-05, + "loss": 2.0499, + "step": 1192 + }, + { + "epoch": 0.08980221682756544, + "grad_norm": 8.327857971191406, + "learning_rate": 9.908465159767695e-05, + "loss": 2.2516, + "step": 1193 + }, + { + "epoch": 0.0898774911082256, + "grad_norm": 6.017848491668701, + "learning_rate": 9.908232814064507e-05, + "loss": 2.048, + "step": 1194 + }, + { + "epoch": 0.08995276538888575, + "grad_norm": 6.500738143920898, + "learning_rate": 9.908000176580959e-05, + "loss": 2.045, + "step": 1195 + }, + { + "epoch": 0.09002803966954591, + "grad_norm": 6.682017803192139, + "learning_rate": 9.907767247330885e-05, + "loss": 2.2444, + "step": 1196 + }, + { + "epoch": 0.09010331395020606, + "grad_norm": 8.236628532409668, + "learning_rate": 9.907534026328129e-05, + "loss": 2.3977, + "step": 1197 + }, + { + "epoch": 0.09017858823086622, + "grad_norm": 6.814053058624268, + "learning_rate": 9.907300513586557e-05, + "loss": 2.2901, + "step": 1198 + }, + { + "epoch": 0.09025386251152638, + "grad_norm": 7.034857273101807, + "learning_rate": 9.90706670912005e-05, + "loss": 2.3608, + "step": 1199 + }, + { + "epoch": 0.09032913679218653, + "grad_norm": 7.74786376953125, + "learning_rate": 9.906832612942507e-05, + "loss": 3.0251, + "step": 1200 + }, + { + "epoch": 0.09040441107284669, + "grad_norm": 6.368960857391357, + "learning_rate": 9.906598225067845e-05, + "loss": 2.0188, + "step": 1201 + }, + { + "epoch": 0.09047968535350684, + "grad_norm": 8.431161880493164, + "learning_rate": 9.906363545509995e-05, + "loss": 2.3448, + "step": 1202 + }, + { + "epoch": 0.090554959634167, + "grad_norm": 5.280194282531738, + "learning_rate": 9.906128574282913e-05, + "loss": 2.2185, + "step": 1203 + }, + { + "epoch": 0.09063023391482715, + "grad_norm": 7.383175373077393, + "learning_rate": 9.905893311400563e-05, + "loss": 2.3644, + "step": 1204 + }, + { + "epoch": 0.09070550819548731, + "grad_norm": 5.881222248077393, + "learning_rate": 9.905657756876932e-05, + "loss": 2.473, + "step": 1205 + }, + { + "epoch": 0.09078078247614746, + "grad_norm": 6.858972549438477, + "learning_rate": 9.905421910726025e-05, + "loss": 2.3122, + "step": 1206 + }, + { + "epoch": 0.09085605675680762, + "grad_norm": 5.860795974731445, + "learning_rate": 9.905185772961858e-05, + "loss": 2.0507, + "step": 1207 + }, + { + "epoch": 0.09093133103746777, + "grad_norm": 5.742857456207275, + "learning_rate": 9.904949343598473e-05, + "loss": 2.1935, + "step": 1208 + }, + { + "epoch": 0.09100660531812793, + "grad_norm": 5.536925792694092, + "learning_rate": 9.904712622649923e-05, + "loss": 2.0867, + "step": 1209 + }, + { + "epoch": 0.09108187959878808, + "grad_norm": 6.06820011138916, + "learning_rate": 9.904475610130282e-05, + "loss": 2.4031, + "step": 1210 + }, + { + "epoch": 0.09115715387944824, + "grad_norm": 6.657587051391602, + "learning_rate": 9.904238306053637e-05, + "loss": 2.1798, + "step": 1211 + }, + { + "epoch": 0.09123242816010839, + "grad_norm": 6.574684143066406, + "learning_rate": 9.904000710434097e-05, + "loss": 2.201, + "step": 1212 + }, + { + "epoch": 0.09130770244076855, + "grad_norm": 6.606392860412598, + "learning_rate": 9.903762823285786e-05, + "loss": 2.2666, + "step": 1213 + }, + { + "epoch": 0.0913829767214287, + "grad_norm": 7.316008567810059, + "learning_rate": 9.903524644622846e-05, + "loss": 2.4021, + "step": 1214 + }, + { + "epoch": 0.09145825100208886, + "grad_norm": 5.007155418395996, + "learning_rate": 9.903286174459434e-05, + "loss": 2.0124, + "step": 1215 + }, + { + "epoch": 0.09153352528274902, + "grad_norm": 5.401012897491455, + "learning_rate": 9.903047412809729e-05, + "loss": 2.0664, + "step": 1216 + }, + { + "epoch": 0.09160879956340917, + "grad_norm": 4.978917121887207, + "learning_rate": 9.902808359687926e-05, + "loss": 2.2368, + "step": 1217 + }, + { + "epoch": 0.09168407384406933, + "grad_norm": 5.641336441040039, + "learning_rate": 9.90256901510823e-05, + "loss": 2.4315, + "step": 1218 + }, + { + "epoch": 0.09175934812472948, + "grad_norm": 4.977763652801514, + "learning_rate": 9.902329379084876e-05, + "loss": 2.2907, + "step": 1219 + }, + { + "epoch": 0.09183462240538964, + "grad_norm": 5.5334014892578125, + "learning_rate": 9.902089451632105e-05, + "loss": 1.9585, + "step": 1220 + }, + { + "epoch": 0.09190989668604979, + "grad_norm": 8.463762283325195, + "learning_rate": 9.901849232764182e-05, + "loss": 2.2037, + "step": 1221 + }, + { + "epoch": 0.09198517096670995, + "grad_norm": 5.011501312255859, + "learning_rate": 9.901608722495388e-05, + "loss": 2.53, + "step": 1222 + }, + { + "epoch": 0.0920604452473701, + "grad_norm": 4.4905805587768555, + "learning_rate": 9.901367920840018e-05, + "loss": 2.0989, + "step": 1223 + }, + { + "epoch": 0.09213571952803026, + "grad_norm": 5.245330810546875, + "learning_rate": 9.901126827812391e-05, + "loss": 2.4412, + "step": 1224 + }, + { + "epoch": 0.09221099380869041, + "grad_norm": 6.360542297363281, + "learning_rate": 9.900885443426834e-05, + "loss": 2.1205, + "step": 1225 + }, + { + "epoch": 0.09228626808935057, + "grad_norm": 5.542527675628662, + "learning_rate": 9.900643767697704e-05, + "loss": 2.5843, + "step": 1226 + }, + { + "epoch": 0.09236154237001072, + "grad_norm": 7.317891597747803, + "learning_rate": 9.90040180063936e-05, + "loss": 2.3264, + "step": 1227 + }, + { + "epoch": 0.09243681665067088, + "grad_norm": 6.627783298492432, + "learning_rate": 9.900159542266192e-05, + "loss": 2.5113, + "step": 1228 + }, + { + "epoch": 0.09251209093133103, + "grad_norm": 4.347350597381592, + "learning_rate": 9.899916992592597e-05, + "loss": 1.8256, + "step": 1229 + }, + { + "epoch": 0.0925873652119912, + "grad_norm": 5.101823806762695, + "learning_rate": 9.899674151632999e-05, + "loss": 2.2285, + "step": 1230 + }, + { + "epoch": 0.09266263949265136, + "grad_norm": 6.226546764373779, + "learning_rate": 9.899431019401828e-05, + "loss": 2.0282, + "step": 1231 + }, + { + "epoch": 0.0927379137733115, + "grad_norm": 5.503075122833252, + "learning_rate": 9.899187595913544e-05, + "loss": 1.9118, + "step": 1232 + }, + { + "epoch": 0.09281318805397167, + "grad_norm": 4.337289333343506, + "learning_rate": 9.898943881182613e-05, + "loss": 2.2529, + "step": 1233 + }, + { + "epoch": 0.09288846233463181, + "grad_norm": 4.941408634185791, + "learning_rate": 9.898699875223524e-05, + "loss": 2.226, + "step": 1234 + }, + { + "epoch": 0.09296373661529198, + "grad_norm": 5.512163162231445, + "learning_rate": 9.898455578050787e-05, + "loss": 2.3425, + "step": 1235 + }, + { + "epoch": 0.09303901089595212, + "grad_norm": 4.888185977935791, + "learning_rate": 9.898210989678917e-05, + "loss": 1.8805, + "step": 1236 + }, + { + "epoch": 0.09311428517661229, + "grad_norm": 4.424567222595215, + "learning_rate": 9.89796611012246e-05, + "loss": 2.0375, + "step": 1237 + }, + { + "epoch": 0.09318955945727243, + "grad_norm": 5.642770767211914, + "learning_rate": 9.897720939395973e-05, + "loss": 2.1229, + "step": 1238 + }, + { + "epoch": 0.0932648337379326, + "grad_norm": 3.94836163520813, + "learning_rate": 9.897475477514027e-05, + "loss": 2.2574, + "step": 1239 + }, + { + "epoch": 0.09334010801859274, + "grad_norm": 6.482448101043701, + "learning_rate": 9.897229724491218e-05, + "loss": 2.0672, + "step": 1240 + }, + { + "epoch": 0.0934153822992529, + "grad_norm": 7.216182231903076, + "learning_rate": 9.896983680342154e-05, + "loss": 2.2329, + "step": 1241 + }, + { + "epoch": 0.09349065657991305, + "grad_norm": 5.277454376220703, + "learning_rate": 9.896737345081459e-05, + "loss": 2.0005, + "step": 1242 + }, + { + "epoch": 0.09356593086057322, + "grad_norm": 7.378226280212402, + "learning_rate": 9.896490718723782e-05, + "loss": 2.1358, + "step": 1243 + }, + { + "epoch": 0.09364120514123336, + "grad_norm": 5.717811584472656, + "learning_rate": 9.896243801283777e-05, + "loss": 3.0477, + "step": 1244 + }, + { + "epoch": 0.09371647942189353, + "grad_norm": 5.389885425567627, + "learning_rate": 9.89599659277613e-05, + "loss": 1.8269, + "step": 1245 + }, + { + "epoch": 0.09379175370255367, + "grad_norm": 5.471428394317627, + "learning_rate": 9.895749093215534e-05, + "loss": 2.453, + "step": 1246 + }, + { + "epoch": 0.09386702798321384, + "grad_norm": 6.5199360847473145, + "learning_rate": 9.895501302616701e-05, + "loss": 1.9186, + "step": 1247 + }, + { + "epoch": 0.093942302263874, + "grad_norm": 5.196421146392822, + "learning_rate": 9.895253220994362e-05, + "loss": 2.2301, + "step": 1248 + }, + { + "epoch": 0.09401757654453415, + "grad_norm": 5.135849952697754, + "learning_rate": 9.895004848363267e-05, + "loss": 2.2727, + "step": 1249 + }, + { + "epoch": 0.09409285082519431, + "grad_norm": 5.268392562866211, + "learning_rate": 9.894756184738177e-05, + "loss": 2.0951, + "step": 1250 + }, + { + "epoch": 0.09416812510585446, + "grad_norm": 6.069251537322998, + "learning_rate": 9.894507230133878e-05, + "loss": 2.2446, + "step": 1251 + }, + { + "epoch": 0.09424339938651462, + "grad_norm": 6.845205307006836, + "learning_rate": 9.894257984565167e-05, + "loss": 2.3176, + "step": 1252 + }, + { + "epoch": 0.09431867366717477, + "grad_norm": 6.823889255523682, + "learning_rate": 9.894008448046863e-05, + "loss": 2.3361, + "step": 1253 + }, + { + "epoch": 0.09439394794783493, + "grad_norm": 6.231601238250732, + "learning_rate": 9.893758620593798e-05, + "loss": 2.2056, + "step": 1254 + }, + { + "epoch": 0.09446922222849508, + "grad_norm": 4.7902302742004395, + "learning_rate": 9.893508502220825e-05, + "loss": 1.9851, + "step": 1255 + }, + { + "epoch": 0.09454449650915524, + "grad_norm": 6.715167045593262, + "learning_rate": 9.893258092942813e-05, + "loss": 2.1023, + "step": 1256 + }, + { + "epoch": 0.09461977078981539, + "grad_norm": 6.482387542724609, + "learning_rate": 9.893007392774647e-05, + "loss": 2.1453, + "step": 1257 + }, + { + "epoch": 0.09469504507047555, + "grad_norm": 4.422342300415039, + "learning_rate": 9.892756401731233e-05, + "loss": 2.1462, + "step": 1258 + }, + { + "epoch": 0.0947703193511357, + "grad_norm": 5.62957763671875, + "learning_rate": 9.892505119827487e-05, + "loss": 2.3287, + "step": 1259 + }, + { + "epoch": 0.09484559363179586, + "grad_norm": 4.577198505401611, + "learning_rate": 9.892253547078352e-05, + "loss": 2.2007, + "step": 1260 + }, + { + "epoch": 0.094920867912456, + "grad_norm": 7.072729110717773, + "learning_rate": 9.892001683498781e-05, + "loss": 2.3184, + "step": 1261 + }, + { + "epoch": 0.09499614219311617, + "grad_norm": 6.113210201263428, + "learning_rate": 9.891749529103746e-05, + "loss": 2.2995, + "step": 1262 + }, + { + "epoch": 0.09507141647377632, + "grad_norm": 7.24001932144165, + "learning_rate": 9.891497083908237e-05, + "loss": 2.2791, + "step": 1263 + }, + { + "epoch": 0.09514669075443648, + "grad_norm": 4.808167457580566, + "learning_rate": 9.891244347927262e-05, + "loss": 1.9341, + "step": 1264 + }, + { + "epoch": 0.09522196503509664, + "grad_norm": 6.610758304595947, + "learning_rate": 9.890991321175845e-05, + "loss": 2.0058, + "step": 1265 + }, + { + "epoch": 0.09529723931575679, + "grad_norm": 6.405998706817627, + "learning_rate": 9.890738003669029e-05, + "loss": 1.9971, + "step": 1266 + }, + { + "epoch": 0.09537251359641695, + "grad_norm": 6.294723033905029, + "learning_rate": 9.890484395421871e-05, + "loss": 3.0032, + "step": 1267 + }, + { + "epoch": 0.0954477878770771, + "grad_norm": 5.753906726837158, + "learning_rate": 9.890230496449448e-05, + "loss": 1.9599, + "step": 1268 + }, + { + "epoch": 0.09552306215773726, + "grad_norm": 4.891688823699951, + "learning_rate": 9.889976306766853e-05, + "loss": 2.1362, + "step": 1269 + }, + { + "epoch": 0.09559833643839741, + "grad_norm": 4.98150634765625, + "learning_rate": 9.889721826389197e-05, + "loss": 2.1515, + "step": 1270 + }, + { + "epoch": 0.09567361071905757, + "grad_norm": 6.657354354858398, + "learning_rate": 9.88946705533161e-05, + "loss": 2.0002, + "step": 1271 + }, + { + "epoch": 0.09574888499971772, + "grad_norm": 5.382585048675537, + "learning_rate": 9.889211993609235e-05, + "loss": 2.2606, + "step": 1272 + }, + { + "epoch": 0.09582415928037788, + "grad_norm": 7.620237350463867, + "learning_rate": 9.888956641237238e-05, + "loss": 2.0096, + "step": 1273 + }, + { + "epoch": 0.09589943356103803, + "grad_norm": 4.2678070068359375, + "learning_rate": 9.888700998230794e-05, + "loss": 2.2068, + "step": 1274 + }, + { + "epoch": 0.09597470784169819, + "grad_norm": 6.159614562988281, + "learning_rate": 9.888445064605106e-05, + "loss": 2.0576, + "step": 1275 + }, + { + "epoch": 0.09604998212235834, + "grad_norm": 7.03329610824585, + "learning_rate": 9.888188840375382e-05, + "loss": 2.4266, + "step": 1276 + }, + { + "epoch": 0.0961252564030185, + "grad_norm": 5.893270969390869, + "learning_rate": 9.88793232555686e-05, + "loss": 2.351, + "step": 1277 + }, + { + "epoch": 0.09620053068367865, + "grad_norm": 7.390767574310303, + "learning_rate": 9.887675520164783e-05, + "loss": 2.1226, + "step": 1278 + }, + { + "epoch": 0.09627580496433881, + "grad_norm": 3.7732632160186768, + "learning_rate": 9.887418424214423e-05, + "loss": 2.0019, + "step": 1279 + }, + { + "epoch": 0.09635107924499897, + "grad_norm": 4.830846786499023, + "learning_rate": 9.88716103772106e-05, + "loss": 2.3232, + "step": 1280 + }, + { + "epoch": 0.09642635352565912, + "grad_norm": 7.471973896026611, + "learning_rate": 9.886903360699997e-05, + "loss": 2.3815, + "step": 1281 + }, + { + "epoch": 0.09650162780631928, + "grad_norm": 5.435558319091797, + "learning_rate": 9.886645393166552e-05, + "loss": 2.1206, + "step": 1282 + }, + { + "epoch": 0.09657690208697943, + "grad_norm": 4.594182968139648, + "learning_rate": 9.886387135136058e-05, + "loss": 2.3454, + "step": 1283 + }, + { + "epoch": 0.09665217636763959, + "grad_norm": 5.27546501159668, + "learning_rate": 9.886128586623871e-05, + "loss": 2.5194, + "step": 1284 + }, + { + "epoch": 0.09672745064829974, + "grad_norm": 5.06937313079834, + "learning_rate": 9.88586974764536e-05, + "loss": 2.4436, + "step": 1285 + }, + { + "epoch": 0.0968027249289599, + "grad_norm": 4.586850643157959, + "learning_rate": 9.88561061821591e-05, + "loss": 2.2941, + "step": 1286 + }, + { + "epoch": 0.09687799920962005, + "grad_norm": 5.5186028480529785, + "learning_rate": 9.885351198350927e-05, + "loss": 2.5374, + "step": 1287 + }, + { + "epoch": 0.09695327349028021, + "grad_norm": 6.163332939147949, + "learning_rate": 9.885091488065832e-05, + "loss": 2.1824, + "step": 1288 + }, + { + "epoch": 0.09702854777094036, + "grad_norm": 6.884142875671387, + "learning_rate": 9.884831487376068e-05, + "loss": 2.2249, + "step": 1289 + }, + { + "epoch": 0.09710382205160052, + "grad_norm": 4.81575870513916, + "learning_rate": 9.884571196297086e-05, + "loss": 2.4353, + "step": 1290 + }, + { + "epoch": 0.09717909633226067, + "grad_norm": 4.224776744842529, + "learning_rate": 9.884310614844363e-05, + "loss": 2.3587, + "step": 1291 + }, + { + "epoch": 0.09725437061292083, + "grad_norm": 4.984908580780029, + "learning_rate": 9.884049743033388e-05, + "loss": 1.8399, + "step": 1292 + }, + { + "epoch": 0.09732964489358098, + "grad_norm": 6.497458457946777, + "learning_rate": 9.883788580879669e-05, + "loss": 2.2188, + "step": 1293 + }, + { + "epoch": 0.09740491917424114, + "grad_norm": 4.196412086486816, + "learning_rate": 9.883527128398732e-05, + "loss": 2.1835, + "step": 1294 + }, + { + "epoch": 0.09748019345490129, + "grad_norm": 8.15439224243164, + "learning_rate": 9.883265385606122e-05, + "loss": 2.1319, + "step": 1295 + }, + { + "epoch": 0.09755546773556145, + "grad_norm": 5.560108184814453, + "learning_rate": 9.883003352517394e-05, + "loss": 2.3176, + "step": 1296 + }, + { + "epoch": 0.09763074201622161, + "grad_norm": 5.698905944824219, + "learning_rate": 9.882741029148129e-05, + "loss": 2.1423, + "step": 1297 + }, + { + "epoch": 0.09770601629688176, + "grad_norm": 6.508706569671631, + "learning_rate": 9.882478415513918e-05, + "loss": 2.1866, + "step": 1298 + }, + { + "epoch": 0.09778129057754192, + "grad_norm": 4.713882923126221, + "learning_rate": 9.882215511630378e-05, + "loss": 2.0841, + "step": 1299 + }, + { + "epoch": 0.09785656485820207, + "grad_norm": 5.9731574058532715, + "learning_rate": 9.881952317513131e-05, + "loss": 2.0982, + "step": 1300 + }, + { + "epoch": 0.09793183913886223, + "grad_norm": 6.012665748596191, + "learning_rate": 9.881688833177829e-05, + "loss": 2.1209, + "step": 1301 + }, + { + "epoch": 0.09800711341952238, + "grad_norm": 4.954236030578613, + "learning_rate": 9.881425058640131e-05, + "loss": 2.0874, + "step": 1302 + }, + { + "epoch": 0.09808238770018254, + "grad_norm": 4.41301155090332, + "learning_rate": 9.881160993915722e-05, + "loss": 2.2578, + "step": 1303 + }, + { + "epoch": 0.09815766198084269, + "grad_norm": 9.022636413574219, + "learning_rate": 9.880896639020296e-05, + "loss": 2.263, + "step": 1304 + }, + { + "epoch": 0.09823293626150285, + "grad_norm": 6.371960639953613, + "learning_rate": 9.88063199396957e-05, + "loss": 2.2574, + "step": 1305 + }, + { + "epoch": 0.098308210542163, + "grad_norm": 6.644979000091553, + "learning_rate": 9.880367058779276e-05, + "loss": 2.5405, + "step": 1306 + }, + { + "epoch": 0.09838348482282316, + "grad_norm": 7.651803016662598, + "learning_rate": 9.880101833465163e-05, + "loss": 2.1326, + "step": 1307 + }, + { + "epoch": 0.09845875910348331, + "grad_norm": 6.9178571701049805, + "learning_rate": 9.879836318042998e-05, + "loss": 2.2014, + "step": 1308 + }, + { + "epoch": 0.09853403338414347, + "grad_norm": 4.821188926696777, + "learning_rate": 9.879570512528568e-05, + "loss": 2.1655, + "step": 1309 + }, + { + "epoch": 0.09860930766480362, + "grad_norm": 4.972322940826416, + "learning_rate": 9.879304416937671e-05, + "loss": 2.043, + "step": 1310 + }, + { + "epoch": 0.09868458194546378, + "grad_norm": 6.366336345672607, + "learning_rate": 9.879038031286128e-05, + "loss": 1.9648, + "step": 1311 + }, + { + "epoch": 0.09875985622612395, + "grad_norm": 5.608874320983887, + "learning_rate": 9.87877135558977e-05, + "loss": 2.1912, + "step": 1312 + }, + { + "epoch": 0.0988351305067841, + "grad_norm": 4.385733604431152, + "learning_rate": 9.878504389864458e-05, + "loss": 2.0821, + "step": 1313 + }, + { + "epoch": 0.09891040478744426, + "grad_norm": 7.843584060668945, + "learning_rate": 9.878237134126056e-05, + "loss": 2.3312, + "step": 1314 + }, + { + "epoch": 0.0989856790681044, + "grad_norm": 9.023554801940918, + "learning_rate": 9.877969588390453e-05, + "loss": 1.8833, + "step": 1315 + }, + { + "epoch": 0.09906095334876457, + "grad_norm": 7.386704921722412, + "learning_rate": 9.877701752673556e-05, + "loss": 2.2776, + "step": 1316 + }, + { + "epoch": 0.09913622762942471, + "grad_norm": 6.5616278648376465, + "learning_rate": 9.877433626991283e-05, + "loss": 2.1024, + "step": 1317 + }, + { + "epoch": 0.09921150191008488, + "grad_norm": 4.357512474060059, + "learning_rate": 9.877165211359578e-05, + "loss": 1.9143, + "step": 1318 + }, + { + "epoch": 0.09928677619074502, + "grad_norm": 6.467500686645508, + "learning_rate": 9.876896505794395e-05, + "loss": 2.1209, + "step": 1319 + }, + { + "epoch": 0.09936205047140519, + "grad_norm": 6.209388732910156, + "learning_rate": 9.87662751031171e-05, + "loss": 2.1246, + "step": 1320 + }, + { + "epoch": 0.09943732475206533, + "grad_norm": 6.647604465484619, + "learning_rate": 9.876358224927508e-05, + "loss": 1.9738, + "step": 1321 + }, + { + "epoch": 0.0995125990327255, + "grad_norm": 5.951557636260986, + "learning_rate": 9.876088649657805e-05, + "loss": 2.1976, + "step": 1322 + }, + { + "epoch": 0.09958787331338564, + "grad_norm": 7.1822285652160645, + "learning_rate": 9.87581878451862e-05, + "loss": 2.0182, + "step": 1323 + }, + { + "epoch": 0.0996631475940458, + "grad_norm": 5.4784159660339355, + "learning_rate": 9.875548629525999e-05, + "loss": 1.7095, + "step": 1324 + }, + { + "epoch": 0.09973842187470595, + "grad_norm": 5.596874237060547, + "learning_rate": 9.875278184696002e-05, + "loss": 2.1852, + "step": 1325 + }, + { + "epoch": 0.09981369615536612, + "grad_norm": 6.271819114685059, + "learning_rate": 9.875007450044706e-05, + "loss": 2.5116, + "step": 1326 + }, + { + "epoch": 0.09988897043602626, + "grad_norm": 7.904577732086182, + "learning_rate": 9.874736425588206e-05, + "loss": 2.189, + "step": 1327 + }, + { + "epoch": 0.09996424471668643, + "grad_norm": 4.808426380157471, + "learning_rate": 9.874465111342612e-05, + "loss": 2.2559, + "step": 1328 + }, + { + "epoch": 0.10003951899734659, + "grad_norm": 5.895272731781006, + "learning_rate": 9.874193507324053e-05, + "loss": 2.1681, + "step": 1329 + }, + { + "epoch": 0.10011479327800674, + "grad_norm": 5.547752380371094, + "learning_rate": 9.873921613548675e-05, + "loss": 2.1522, + "step": 1330 + }, + { + "epoch": 0.1001900675586669, + "grad_norm": 4.852756977081299, + "learning_rate": 9.873649430032643e-05, + "loss": 2.504, + "step": 1331 + }, + { + "epoch": 0.10026534183932705, + "grad_norm": 6.820119857788086, + "learning_rate": 9.873376956792137e-05, + "loss": 2.3665, + "step": 1332 + }, + { + "epoch": 0.10034061611998721, + "grad_norm": 4.905352592468262, + "learning_rate": 9.873104193843352e-05, + "loss": 2.4236, + "step": 1333 + }, + { + "epoch": 0.10041589040064736, + "grad_norm": 4.657886505126953, + "learning_rate": 9.872831141202508e-05, + "loss": 2.1675, + "step": 1334 + }, + { + "epoch": 0.10049116468130752, + "grad_norm": 8.569634437561035, + "learning_rate": 9.872557798885833e-05, + "loss": 2.0787, + "step": 1335 + }, + { + "epoch": 0.10056643896196767, + "grad_norm": 4.9745564460754395, + "learning_rate": 9.872284166909578e-05, + "loss": 2.3586, + "step": 1336 + }, + { + "epoch": 0.10064171324262783, + "grad_norm": 8.317204475402832, + "learning_rate": 9.872010245290009e-05, + "loss": 2.1153, + "step": 1337 + }, + { + "epoch": 0.10071698752328798, + "grad_norm": 5.470431327819824, + "learning_rate": 9.87173603404341e-05, + "loss": 2.1097, + "step": 1338 + }, + { + "epoch": 0.10079226180394814, + "grad_norm": 8.047344207763672, + "learning_rate": 9.871461533186083e-05, + "loss": 2.2767, + "step": 1339 + }, + { + "epoch": 0.10086753608460829, + "grad_norm": 8.675902366638184, + "learning_rate": 9.871186742734345e-05, + "loss": 2.2367, + "step": 1340 + }, + { + "epoch": 0.10094281036526845, + "grad_norm": 7.912907123565674, + "learning_rate": 9.870911662704532e-05, + "loss": 2.4865, + "step": 1341 + }, + { + "epoch": 0.1010180846459286, + "grad_norm": 7.082531452178955, + "learning_rate": 9.870636293112999e-05, + "loss": 2.1539, + "step": 1342 + }, + { + "epoch": 0.10109335892658876, + "grad_norm": 6.481196880340576, + "learning_rate": 9.870360633976111e-05, + "loss": 2.134, + "step": 1343 + }, + { + "epoch": 0.1011686332072489, + "grad_norm": 5.624795913696289, + "learning_rate": 9.870084685310259e-05, + "loss": 1.7735, + "step": 1344 + }, + { + "epoch": 0.10124390748790907, + "grad_norm": 5.709216594696045, + "learning_rate": 9.869808447131847e-05, + "loss": 2.1948, + "step": 1345 + }, + { + "epoch": 0.10131918176856923, + "grad_norm": 6.11540412902832, + "learning_rate": 9.869531919457296e-05, + "loss": 2.5375, + "step": 1346 + }, + { + "epoch": 0.10139445604922938, + "grad_norm": 5.489913463592529, + "learning_rate": 9.869255102303044e-05, + "loss": 1.7925, + "step": 1347 + }, + { + "epoch": 0.10146973032988954, + "grad_norm": 6.5903520584106445, + "learning_rate": 9.868977995685547e-05, + "loss": 1.9985, + "step": 1348 + }, + { + "epoch": 0.10154500461054969, + "grad_norm": 5.510942459106445, + "learning_rate": 9.868700599621279e-05, + "loss": 2.0588, + "step": 1349 + }, + { + "epoch": 0.10162027889120985, + "grad_norm": 5.788224220275879, + "learning_rate": 9.868422914126731e-05, + "loss": 2.3738, + "step": 1350 + }, + { + "epoch": 0.10169555317187, + "grad_norm": 4.937041282653809, + "learning_rate": 9.86814493921841e-05, + "loss": 2.1258, + "step": 1351 + }, + { + "epoch": 0.10177082745253016, + "grad_norm": 6.012052059173584, + "learning_rate": 9.86786667491284e-05, + "loss": 2.043, + "step": 1352 + }, + { + "epoch": 0.10184610173319031, + "grad_norm": 7.547396659851074, + "learning_rate": 9.867588121226563e-05, + "loss": 2.018, + "step": 1353 + }, + { + "epoch": 0.10192137601385047, + "grad_norm": 4.966169357299805, + "learning_rate": 9.86730927817614e-05, + "loss": 2.1605, + "step": 1354 + }, + { + "epoch": 0.10199665029451062, + "grad_norm": 6.8154616355896, + "learning_rate": 9.867030145778147e-05, + "loss": 2.0144, + "step": 1355 + }, + { + "epoch": 0.10207192457517078, + "grad_norm": 5.595889091491699, + "learning_rate": 9.866750724049175e-05, + "loss": 2.2067, + "step": 1356 + }, + { + "epoch": 0.10214719885583093, + "grad_norm": 7.523070335388184, + "learning_rate": 9.866471013005838e-05, + "loss": 2.2502, + "step": 1357 + }, + { + "epoch": 0.10222247313649109, + "grad_norm": 5.227776050567627, + "learning_rate": 9.866191012664763e-05, + "loss": 2.0716, + "step": 1358 + }, + { + "epoch": 0.10229774741715124, + "grad_norm": 5.172247409820557, + "learning_rate": 9.865910723042593e-05, + "loss": 2.3653, + "step": 1359 + }, + { + "epoch": 0.1023730216978114, + "grad_norm": 5.480951309204102, + "learning_rate": 9.865630144155994e-05, + "loss": 1.9977, + "step": 1360 + }, + { + "epoch": 0.10244829597847156, + "grad_norm": 4.700575828552246, + "learning_rate": 9.865349276021644e-05, + "loss": 2.0201, + "step": 1361 + }, + { + "epoch": 0.10252357025913171, + "grad_norm": 5.099559307098389, + "learning_rate": 9.86506811865624e-05, + "loss": 1.878, + "step": 1362 + }, + { + "epoch": 0.10259884453979187, + "grad_norm": 6.307775020599365, + "learning_rate": 9.864786672076495e-05, + "loss": 2.1161, + "step": 1363 + }, + { + "epoch": 0.10267411882045202, + "grad_norm": 4.547120094299316, + "learning_rate": 9.864504936299141e-05, + "loss": 2.1173, + "step": 1364 + }, + { + "epoch": 0.10274939310111218, + "grad_norm": 4.769510269165039, + "learning_rate": 9.864222911340926e-05, + "loss": 2.1823, + "step": 1365 + }, + { + "epoch": 0.10282466738177233, + "grad_norm": 5.686911582946777, + "learning_rate": 9.863940597218617e-05, + "loss": 2.1035, + "step": 1366 + }, + { + "epoch": 0.10289994166243249, + "grad_norm": 5.347987651824951, + "learning_rate": 9.863657993948995e-05, + "loss": 2.0937, + "step": 1367 + }, + { + "epoch": 0.10297521594309264, + "grad_norm": 4.7051615715026855, + "learning_rate": 9.863375101548862e-05, + "loss": 1.9138, + "step": 1368 + }, + { + "epoch": 0.1030504902237528, + "grad_norm": 6.939269065856934, + "learning_rate": 9.863091920035033e-05, + "loss": 2.0717, + "step": 1369 + }, + { + "epoch": 0.10312576450441295, + "grad_norm": 4.80898904800415, + "learning_rate": 9.862808449424341e-05, + "loss": 2.02, + "step": 1370 + }, + { + "epoch": 0.10320103878507311, + "grad_norm": 7.007601261138916, + "learning_rate": 9.862524689733644e-05, + "loss": 1.9921, + "step": 1371 + }, + { + "epoch": 0.10327631306573326, + "grad_norm": 5.391836643218994, + "learning_rate": 9.862240640979803e-05, + "loss": 1.8903, + "step": 1372 + }, + { + "epoch": 0.10335158734639342, + "grad_norm": 7.801146507263184, + "learning_rate": 9.861956303179707e-05, + "loss": 2.2166, + "step": 1373 + }, + { + "epoch": 0.10342686162705357, + "grad_norm": 6.386852741241455, + "learning_rate": 9.86167167635026e-05, + "loss": 2.4189, + "step": 1374 + }, + { + "epoch": 0.10350213590771373, + "grad_norm": 6.482513904571533, + "learning_rate": 9.861386760508382e-05, + "loss": 2.2925, + "step": 1375 + }, + { + "epoch": 0.10357741018837388, + "grad_norm": 6.825007915496826, + "learning_rate": 9.86110155567101e-05, + "loss": 2.047, + "step": 1376 + }, + { + "epoch": 0.10365268446903404, + "grad_norm": 5.490835189819336, + "learning_rate": 9.860816061855099e-05, + "loss": 2.3028, + "step": 1377 + }, + { + "epoch": 0.1037279587496942, + "grad_norm": 5.823111057281494, + "learning_rate": 9.86053027907762e-05, + "loss": 2.2276, + "step": 1378 + }, + { + "epoch": 0.10380323303035435, + "grad_norm": 4.6493306159973145, + "learning_rate": 9.860244207355562e-05, + "loss": 1.965, + "step": 1379 + }, + { + "epoch": 0.10387850731101451, + "grad_norm": 7.720662593841553, + "learning_rate": 9.859957846705931e-05, + "loss": 2.6023, + "step": 1380 + }, + { + "epoch": 0.10395378159167466, + "grad_norm": 5.117039680480957, + "learning_rate": 9.859671197145753e-05, + "loss": 2.2035, + "step": 1381 + }, + { + "epoch": 0.10402905587233482, + "grad_norm": 5.911375999450684, + "learning_rate": 9.859384258692064e-05, + "loss": 2.0839, + "step": 1382 + }, + { + "epoch": 0.10410433015299497, + "grad_norm": 4.0877156257629395, + "learning_rate": 9.859097031361924e-05, + "loss": 2.1938, + "step": 1383 + }, + { + "epoch": 0.10417960443365513, + "grad_norm": 5.16452693939209, + "learning_rate": 9.858809515172409e-05, + "loss": 1.9364, + "step": 1384 + }, + { + "epoch": 0.10425487871431528, + "grad_norm": 4.957688331604004, + "learning_rate": 9.858521710140609e-05, + "loss": 1.9771, + "step": 1385 + }, + { + "epoch": 0.10433015299497544, + "grad_norm": 5.3444366455078125, + "learning_rate": 9.858233616283634e-05, + "loss": 2.0174, + "step": 1386 + }, + { + "epoch": 0.10440542727563559, + "grad_norm": 5.152166843414307, + "learning_rate": 9.85794523361861e-05, + "loss": 2.2999, + "step": 1387 + }, + { + "epoch": 0.10448070155629575, + "grad_norm": 7.95336389541626, + "learning_rate": 9.857656562162683e-05, + "loss": 2.2427, + "step": 1388 + }, + { + "epoch": 0.1045559758369559, + "grad_norm": 5.47181510925293, + "learning_rate": 9.85736760193301e-05, + "loss": 2.4419, + "step": 1389 + }, + { + "epoch": 0.10463125011761606, + "grad_norm": 8.796538352966309, + "learning_rate": 9.85707835294677e-05, + "loss": 2.3655, + "step": 1390 + }, + { + "epoch": 0.10470652439827621, + "grad_norm": 6.300104141235352, + "learning_rate": 9.856788815221159e-05, + "loss": 2.0502, + "step": 1391 + }, + { + "epoch": 0.10478179867893637, + "grad_norm": 11.138436317443848, + "learning_rate": 9.856498988773389e-05, + "loss": 2.2526, + "step": 1392 + }, + { + "epoch": 0.10485707295959654, + "grad_norm": 4.621460437774658, + "learning_rate": 9.856208873620688e-05, + "loss": 2.1382, + "step": 1393 + }, + { + "epoch": 0.10493234724025668, + "grad_norm": 5.264841079711914, + "learning_rate": 9.855918469780303e-05, + "loss": 2.1522, + "step": 1394 + }, + { + "epoch": 0.10500762152091685, + "grad_norm": 6.980940341949463, + "learning_rate": 9.8556277772695e-05, + "loss": 2.3028, + "step": 1395 + }, + { + "epoch": 0.105082895801577, + "grad_norm": 5.55518102645874, + "learning_rate": 9.855336796105556e-05, + "loss": 2.1319, + "step": 1396 + }, + { + "epoch": 0.10515817008223716, + "grad_norm": 5.241600036621094, + "learning_rate": 9.855045526305773e-05, + "loss": 2.0691, + "step": 1397 + }, + { + "epoch": 0.1052334443628973, + "grad_norm": 5.43227481842041, + "learning_rate": 9.854753967887465e-05, + "loss": 1.947, + "step": 1398 + }, + { + "epoch": 0.10530871864355747, + "grad_norm": 4.748951435089111, + "learning_rate": 9.85446212086796e-05, + "loss": 2.0776, + "step": 1399 + }, + { + "epoch": 0.10538399292421761, + "grad_norm": 15.107120513916016, + "learning_rate": 9.854169985264614e-05, + "loss": 2.3805, + "step": 1400 + }, + { + "epoch": 0.10545926720487778, + "grad_norm": 6.483290672302246, + "learning_rate": 9.853877561094789e-05, + "loss": 2.537, + "step": 1401 + }, + { + "epoch": 0.10553454148553792, + "grad_norm": 5.504429340362549, + "learning_rate": 9.85358484837587e-05, + "loss": 2.8248, + "step": 1402 + }, + { + "epoch": 0.10560981576619809, + "grad_norm": 5.915532112121582, + "learning_rate": 9.853291847125258e-05, + "loss": 2.371, + "step": 1403 + }, + { + "epoch": 0.10568509004685823, + "grad_norm": 5.121899604797363, + "learning_rate": 9.852998557360373e-05, + "loss": 2.193, + "step": 1404 + }, + { + "epoch": 0.1057603643275184, + "grad_norm": 7.786625385284424, + "learning_rate": 9.852704979098648e-05, + "loss": 2.0593, + "step": 1405 + }, + { + "epoch": 0.10583563860817854, + "grad_norm": 7.193294525146484, + "learning_rate": 9.852411112357536e-05, + "loss": 2.2202, + "step": 1406 + }, + { + "epoch": 0.1059109128888387, + "grad_norm": 6.045143127441406, + "learning_rate": 9.852116957154505e-05, + "loss": 2.0392, + "step": 1407 + }, + { + "epoch": 0.10598618716949885, + "grad_norm": 4.620702266693115, + "learning_rate": 9.851822513507045e-05, + "loss": 2.2266, + "step": 1408 + }, + { + "epoch": 0.10606146145015902, + "grad_norm": 5.685683727264404, + "learning_rate": 9.851527781432657e-05, + "loss": 1.9394, + "step": 1409 + }, + { + "epoch": 0.10613673573081918, + "grad_norm": 6.4046630859375, + "learning_rate": 9.851232760948864e-05, + "loss": 2.4099, + "step": 1410 + }, + { + "epoch": 0.10621201001147933, + "grad_norm": 7.066629886627197, + "learning_rate": 9.850937452073203e-05, + "loss": 2.0889, + "step": 1411 + }, + { + "epoch": 0.10628728429213949, + "grad_norm": 7.340999603271484, + "learning_rate": 9.850641854823228e-05, + "loss": 2.1364, + "step": 1412 + }, + { + "epoch": 0.10636255857279964, + "grad_norm": 4.815880298614502, + "learning_rate": 9.850345969216514e-05, + "loss": 2.0507, + "step": 1413 + }, + { + "epoch": 0.1064378328534598, + "grad_norm": 4.8279643058776855, + "learning_rate": 9.850049795270649e-05, + "loss": 2.2084, + "step": 1414 + }, + { + "epoch": 0.10651310713411995, + "grad_norm": 6.439970970153809, + "learning_rate": 9.84975333300324e-05, + "loss": 2.4464, + "step": 1415 + }, + { + "epoch": 0.10658838141478011, + "grad_norm": 8.260564804077148, + "learning_rate": 9.84945658243191e-05, + "loss": 1.9994, + "step": 1416 + }, + { + "epoch": 0.10666365569544026, + "grad_norm": 4.738978862762451, + "learning_rate": 9.849159543574302e-05, + "loss": 1.9987, + "step": 1417 + }, + { + "epoch": 0.10673892997610042, + "grad_norm": 7.775970935821533, + "learning_rate": 9.848862216448074e-05, + "loss": 2.2815, + "step": 1418 + }, + { + "epoch": 0.10681420425676057, + "grad_norm": 15.273274421691895, + "learning_rate": 9.848564601070897e-05, + "loss": 2.0537, + "step": 1419 + }, + { + "epoch": 0.10688947853742073, + "grad_norm": 6.594801902770996, + "learning_rate": 9.848266697460469e-05, + "loss": 2.2578, + "step": 1420 + }, + { + "epoch": 0.10696475281808088, + "grad_norm": 6.227622985839844, + "learning_rate": 9.847968505634497e-05, + "loss": 1.8358, + "step": 1421 + }, + { + "epoch": 0.10704002709874104, + "grad_norm": 5.783336162567139, + "learning_rate": 9.847670025610707e-05, + "loss": 2.4116, + "step": 1422 + }, + { + "epoch": 0.10711530137940119, + "grad_norm": 5.180295944213867, + "learning_rate": 9.847371257406843e-05, + "loss": 2.1306, + "step": 1423 + }, + { + "epoch": 0.10719057566006135, + "grad_norm": 5.590732574462891, + "learning_rate": 9.847072201040667e-05, + "loss": 2.3146, + "step": 1424 + }, + { + "epoch": 0.1072658499407215, + "grad_norm": 7.10616397857666, + "learning_rate": 9.846772856529958e-05, + "loss": 2.0505, + "step": 1425 + }, + { + "epoch": 0.10734112422138166, + "grad_norm": 5.770072937011719, + "learning_rate": 9.846473223892508e-05, + "loss": 2.2506, + "step": 1426 + }, + { + "epoch": 0.10741639850204182, + "grad_norm": 5.590660095214844, + "learning_rate": 9.846173303146131e-05, + "loss": 2.0013, + "step": 1427 + }, + { + "epoch": 0.10749167278270197, + "grad_norm": 4.99043083190918, + "learning_rate": 9.845873094308656e-05, + "loss": 2.241, + "step": 1428 + }, + { + "epoch": 0.10756694706336213, + "grad_norm": 6.306168556213379, + "learning_rate": 9.84557259739793e-05, + "loss": 2.3264, + "step": 1429 + }, + { + "epoch": 0.10764222134402228, + "grad_norm": 5.132730007171631, + "learning_rate": 9.845271812431817e-05, + "loss": 1.6788, + "step": 1430 + }, + { + "epoch": 0.10771749562468244, + "grad_norm": 9.185691833496094, + "learning_rate": 9.844970739428199e-05, + "loss": 2.0591, + "step": 1431 + }, + { + "epoch": 0.10779276990534259, + "grad_norm": 5.803485870361328, + "learning_rate": 9.844669378404969e-05, + "loss": 2.1156, + "step": 1432 + }, + { + "epoch": 0.10786804418600275, + "grad_norm": 5.465604782104492, + "learning_rate": 9.844367729380049e-05, + "loss": 2.1561, + "step": 1433 + }, + { + "epoch": 0.1079433184666629, + "grad_norm": 6.436990261077881, + "learning_rate": 9.844065792371364e-05, + "loss": 2.0197, + "step": 1434 + }, + { + "epoch": 0.10801859274732306, + "grad_norm": 6.306140899658203, + "learning_rate": 9.843763567396868e-05, + "loss": 2.0628, + "step": 1435 + }, + { + "epoch": 0.10809386702798321, + "grad_norm": 5.11694860458374, + "learning_rate": 9.843461054474528e-05, + "loss": 2.1277, + "step": 1436 + }, + { + "epoch": 0.10816914130864337, + "grad_norm": 5.194339752197266, + "learning_rate": 9.843158253622325e-05, + "loss": 1.9044, + "step": 1437 + }, + { + "epoch": 0.10824441558930352, + "grad_norm": 5.988679885864258, + "learning_rate": 9.84285516485826e-05, + "loss": 2.3833, + "step": 1438 + }, + { + "epoch": 0.10831968986996368, + "grad_norm": 4.994627952575684, + "learning_rate": 9.84255178820035e-05, + "loss": 2.3701, + "step": 1439 + }, + { + "epoch": 0.10839496415062383, + "grad_norm": 5.64990758895874, + "learning_rate": 9.842248123666632e-05, + "loss": 2.0847, + "step": 1440 + }, + { + "epoch": 0.10847023843128399, + "grad_norm": 5.708194732666016, + "learning_rate": 9.841944171275157e-05, + "loss": 2.0912, + "step": 1441 + }, + { + "epoch": 0.10854551271194415, + "grad_norm": 7.916718006134033, + "learning_rate": 9.841639931043994e-05, + "loss": 1.788, + "step": 1442 + }, + { + "epoch": 0.1086207869926043, + "grad_norm": 5.08225154876709, + "learning_rate": 9.84133540299123e-05, + "loss": 2.1897, + "step": 1443 + }, + { + "epoch": 0.10869606127326446, + "grad_norm": 5.5593061447143555, + "learning_rate": 9.841030587134967e-05, + "loss": 2.0027, + "step": 1444 + }, + { + "epoch": 0.10877133555392461, + "grad_norm": 6.073078632354736, + "learning_rate": 9.840725483493325e-05, + "loss": 2.023, + "step": 1445 + }, + { + "epoch": 0.10884660983458477, + "grad_norm": 4.955343246459961, + "learning_rate": 9.840420092084443e-05, + "loss": 2.0405, + "step": 1446 + }, + { + "epoch": 0.10892188411524492, + "grad_norm": 5.286839962005615, + "learning_rate": 9.840114412926478e-05, + "loss": 2.2815, + "step": 1447 + }, + { + "epoch": 0.10899715839590508, + "grad_norm": 5.574923515319824, + "learning_rate": 9.839808446037598e-05, + "loss": 2.1526, + "step": 1448 + }, + { + "epoch": 0.10907243267656523, + "grad_norm": 6.172300815582275, + "learning_rate": 9.83950219143599e-05, + "loss": 1.9979, + "step": 1449 + }, + { + "epoch": 0.1091477069572254, + "grad_norm": 6.606215476989746, + "learning_rate": 9.839195649139864e-05, + "loss": 2.0816, + "step": 1450 + }, + { + "epoch": 0.10922298123788554, + "grad_norm": 5.084033966064453, + "learning_rate": 9.838888819167443e-05, + "loss": 2.0302, + "step": 1451 + }, + { + "epoch": 0.1092982555185457, + "grad_norm": 5.643555164337158, + "learning_rate": 9.838581701536967e-05, + "loss": 2.0305, + "step": 1452 + }, + { + "epoch": 0.10937352979920585, + "grad_norm": 5.315614223480225, + "learning_rate": 9.83827429626669e-05, + "loss": 2.2025, + "step": 1453 + }, + { + "epoch": 0.10944880407986601, + "grad_norm": 8.499626159667969, + "learning_rate": 9.837966603374889e-05, + "loss": 2.4057, + "step": 1454 + }, + { + "epoch": 0.10952407836052616, + "grad_norm": 7.1480183601379395, + "learning_rate": 9.837658622879856e-05, + "loss": 2.1011, + "step": 1455 + }, + { + "epoch": 0.10959935264118632, + "grad_norm": 4.459353923797607, + "learning_rate": 9.8373503547999e-05, + "loss": 2.2915, + "step": 1456 + }, + { + "epoch": 0.10967462692184647, + "grad_norm": 5.968489646911621, + "learning_rate": 9.837041799153342e-05, + "loss": 2.2431, + "step": 1457 + }, + { + "epoch": 0.10974990120250663, + "grad_norm": 5.713187217712402, + "learning_rate": 9.83673295595853e-05, + "loss": 2.2369, + "step": 1458 + }, + { + "epoch": 0.1098251754831668, + "grad_norm": 6.231503963470459, + "learning_rate": 9.836423825233821e-05, + "loss": 2.2847, + "step": 1459 + }, + { + "epoch": 0.10990044976382694, + "grad_norm": 5.514884948730469, + "learning_rate": 9.836114406997594e-05, + "loss": 2.0935, + "step": 1460 + }, + { + "epoch": 0.1099757240444871, + "grad_norm": 5.09206485748291, + "learning_rate": 9.835804701268241e-05, + "loss": 2.1089, + "step": 1461 + }, + { + "epoch": 0.11005099832514725, + "grad_norm": 8.476020812988281, + "learning_rate": 9.835494708064175e-05, + "loss": 1.955, + "step": 1462 + }, + { + "epoch": 0.11012627260580742, + "grad_norm": 9.018309593200684, + "learning_rate": 9.835184427403822e-05, + "loss": 2.515, + "step": 1463 + }, + { + "epoch": 0.11020154688646756, + "grad_norm": 7.061486721038818, + "learning_rate": 9.834873859305629e-05, + "loss": 2.1517, + "step": 1464 + }, + { + "epoch": 0.11027682116712773, + "grad_norm": 4.436018943786621, + "learning_rate": 9.834563003788057e-05, + "loss": 2.2998, + "step": 1465 + }, + { + "epoch": 0.11035209544778787, + "grad_norm": 8.385354995727539, + "learning_rate": 9.834251860869588e-05, + "loss": 2.1364, + "step": 1466 + }, + { + "epoch": 0.11042736972844804, + "grad_norm": 6.648226737976074, + "learning_rate": 9.833940430568715e-05, + "loss": 2.427, + "step": 1467 + }, + { + "epoch": 0.11050264400910818, + "grad_norm": 5.4996018409729, + "learning_rate": 9.833628712903957e-05, + "loss": 2.1902, + "step": 1468 + }, + { + "epoch": 0.11057791828976835, + "grad_norm": 7.0419087409973145, + "learning_rate": 9.833316707893838e-05, + "loss": 2.1917, + "step": 1469 + }, + { + "epoch": 0.1106531925704285, + "grad_norm": 5.958217144012451, + "learning_rate": 9.83300441555691e-05, + "loss": 2.1878, + "step": 1470 + }, + { + "epoch": 0.11072846685108866, + "grad_norm": 7.1612701416015625, + "learning_rate": 9.832691835911736e-05, + "loss": 2.194, + "step": 1471 + }, + { + "epoch": 0.1108037411317488, + "grad_norm": 7.138689994812012, + "learning_rate": 9.832378968976901e-05, + "loss": 1.9221, + "step": 1472 + }, + { + "epoch": 0.11087901541240897, + "grad_norm": 5.788731575012207, + "learning_rate": 9.832065814771e-05, + "loss": 2.081, + "step": 1473 + }, + { + "epoch": 0.11095428969306913, + "grad_norm": 4.319376468658447, + "learning_rate": 9.831752373312653e-05, + "loss": 2.151, + "step": 1474 + }, + { + "epoch": 0.11102956397372928, + "grad_norm": 5.694935321807861, + "learning_rate": 9.831438644620489e-05, + "loss": 1.7281, + "step": 1475 + }, + { + "epoch": 0.11110483825438944, + "grad_norm": 6.300022602081299, + "learning_rate": 9.831124628713161e-05, + "loss": 2.336, + "step": 1476 + }, + { + "epoch": 0.11118011253504959, + "grad_norm": 4.9054670333862305, + "learning_rate": 9.830810325609337e-05, + "loss": 2.0742, + "step": 1477 + }, + { + "epoch": 0.11125538681570975, + "grad_norm": 5.111865043640137, + "learning_rate": 9.8304957353277e-05, + "loss": 1.9785, + "step": 1478 + }, + { + "epoch": 0.1113306610963699, + "grad_norm": 5.367143630981445, + "learning_rate": 9.830180857886952e-05, + "loss": 2.2026, + "step": 1479 + }, + { + "epoch": 0.11140593537703006, + "grad_norm": 11.445992469787598, + "learning_rate": 9.829865693305811e-05, + "loss": 2.3817, + "step": 1480 + }, + { + "epoch": 0.1114812096576902, + "grad_norm": 6.0611772537231445, + "learning_rate": 9.829550241603012e-05, + "loss": 2.2339, + "step": 1481 + }, + { + "epoch": 0.11155648393835037, + "grad_norm": 4.703336715698242, + "learning_rate": 9.82923450279731e-05, + "loss": 1.9804, + "step": 1482 + }, + { + "epoch": 0.11163175821901052, + "grad_norm": 8.810880661010742, + "learning_rate": 9.828918476907472e-05, + "loss": 2.0574, + "step": 1483 + }, + { + "epoch": 0.11170703249967068, + "grad_norm": 19.528118133544922, + "learning_rate": 9.828602163952287e-05, + "loss": 2.0386, + "step": 1484 + }, + { + "epoch": 0.11178230678033083, + "grad_norm": 4.602316379547119, + "learning_rate": 9.828285563950558e-05, + "loss": 2.9653, + "step": 1485 + }, + { + "epoch": 0.11185758106099099, + "grad_norm": 6.631296157836914, + "learning_rate": 9.827968676921108e-05, + "loss": 2.1543, + "step": 1486 + }, + { + "epoch": 0.11193285534165114, + "grad_norm": 6.649188041687012, + "learning_rate": 9.827651502882771e-05, + "loss": 2.1433, + "step": 1487 + }, + { + "epoch": 0.1120081296223113, + "grad_norm": 4.542372226715088, + "learning_rate": 9.827334041854406e-05, + "loss": 1.965, + "step": 1488 + }, + { + "epoch": 0.11208340390297145, + "grad_norm": 6.464970588684082, + "learning_rate": 9.827016293854882e-05, + "loss": 2.0982, + "step": 1489 + }, + { + "epoch": 0.11215867818363161, + "grad_norm": 6.385712623596191, + "learning_rate": 9.82669825890309e-05, + "loss": 3.0171, + "step": 1490 + }, + { + "epoch": 0.11223395246429177, + "grad_norm": 6.234274864196777, + "learning_rate": 9.826379937017936e-05, + "loss": 2.2717, + "step": 1491 + }, + { + "epoch": 0.11230922674495192, + "grad_norm": 6.592389106750488, + "learning_rate": 9.826061328218342e-05, + "loss": 2.1757, + "step": 1492 + }, + { + "epoch": 0.11238450102561208, + "grad_norm": 6.315964698791504, + "learning_rate": 9.825742432523251e-05, + "loss": 1.8679, + "step": 1493 + }, + { + "epoch": 0.11245977530627223, + "grad_norm": 5.095702171325684, + "learning_rate": 9.825423249951618e-05, + "loss": 2.0326, + "step": 1494 + }, + { + "epoch": 0.11253504958693239, + "grad_norm": 5.123872756958008, + "learning_rate": 9.82510378052242e-05, + "loss": 2.11, + "step": 1495 + }, + { + "epoch": 0.11261032386759254, + "grad_norm": 6.776282787322998, + "learning_rate": 9.824784024254645e-05, + "loss": 2.3171, + "step": 1496 + }, + { + "epoch": 0.1126855981482527, + "grad_norm": 4.983455181121826, + "learning_rate": 9.824463981167305e-05, + "loss": 1.9101, + "step": 1497 + }, + { + "epoch": 0.11276087242891285, + "grad_norm": 6.514476299285889, + "learning_rate": 9.824143651279424e-05, + "loss": 1.9261, + "step": 1498 + }, + { + "epoch": 0.11283614670957301, + "grad_norm": 5.887244701385498, + "learning_rate": 9.823823034610045e-05, + "loss": 2.1857, + "step": 1499 + }, + { + "epoch": 0.11291142099023316, + "grad_norm": 5.908926963806152, + "learning_rate": 9.823502131178226e-05, + "loss": 2.1979, + "step": 1500 + }, + { + "epoch": 0.11298669527089332, + "grad_norm": 4.755209445953369, + "learning_rate": 9.823180941003048e-05, + "loss": 2.2282, + "step": 1501 + }, + { + "epoch": 0.11306196955155347, + "grad_norm": 4.9476118087768555, + "learning_rate": 9.822859464103602e-05, + "loss": 2.0001, + "step": 1502 + }, + { + "epoch": 0.11313724383221363, + "grad_norm": 5.295914173126221, + "learning_rate": 9.822537700498999e-05, + "loss": 1.6996, + "step": 1503 + }, + { + "epoch": 0.11321251811287378, + "grad_norm": 4.1576151847839355, + "learning_rate": 9.822215650208367e-05, + "loss": 2.2984, + "step": 1504 + }, + { + "epoch": 0.11328779239353394, + "grad_norm": 5.289822101593018, + "learning_rate": 9.821893313250849e-05, + "loss": 2.3052, + "step": 1505 + }, + { + "epoch": 0.11336306667419409, + "grad_norm": 6.449665546417236, + "learning_rate": 9.821570689645612e-05, + "loss": 2.7833, + "step": 1506 + }, + { + "epoch": 0.11343834095485425, + "grad_norm": 4.573452472686768, + "learning_rate": 9.82124777941183e-05, + "loss": 2.2117, + "step": 1507 + }, + { + "epoch": 0.11351361523551441, + "grad_norm": 6.983219146728516, + "learning_rate": 9.820924582568703e-05, + "loss": 2.2128, + "step": 1508 + }, + { + "epoch": 0.11358888951617456, + "grad_norm": 4.5462236404418945, + "learning_rate": 9.820601099135441e-05, + "loss": 2.4391, + "step": 1509 + }, + { + "epoch": 0.11366416379683472, + "grad_norm": 5.472576141357422, + "learning_rate": 9.820277329131276e-05, + "loss": 1.8588, + "step": 1510 + }, + { + "epoch": 0.11373943807749487, + "grad_norm": 4.933330059051514, + "learning_rate": 9.819953272575455e-05, + "loss": 2.264, + "step": 1511 + }, + { + "epoch": 0.11381471235815503, + "grad_norm": 7.668219089508057, + "learning_rate": 9.81962892948724e-05, + "loss": 2.5099, + "step": 1512 + }, + { + "epoch": 0.11388998663881518, + "grad_norm": 8.078141212463379, + "learning_rate": 9.819304299885918e-05, + "loss": 2.0212, + "step": 1513 + }, + { + "epoch": 0.11396526091947534, + "grad_norm": 5.445894241333008, + "learning_rate": 9.818979383790781e-05, + "loss": 1.9541, + "step": 1514 + }, + { + "epoch": 0.11404053520013549, + "grad_norm": 7.48053503036499, + "learning_rate": 9.818654181221147e-05, + "loss": 2.3156, + "step": 1515 + }, + { + "epoch": 0.11411580948079565, + "grad_norm": 5.724066734313965, + "learning_rate": 9.818328692196347e-05, + "loss": 2.4191, + "step": 1516 + }, + { + "epoch": 0.1141910837614558, + "grad_norm": 5.754711627960205, + "learning_rate": 9.818002916735734e-05, + "loss": 2.1985, + "step": 1517 + }, + { + "epoch": 0.11426635804211596, + "grad_norm": 7.187510967254639, + "learning_rate": 9.81767685485867e-05, + "loss": 2.3023, + "step": 1518 + }, + { + "epoch": 0.11434163232277611, + "grad_norm": 7.5015459060668945, + "learning_rate": 9.817350506584543e-05, + "loss": 2.1641, + "step": 1519 + }, + { + "epoch": 0.11441690660343627, + "grad_norm": 6.0347466468811035, + "learning_rate": 9.817023871932749e-05, + "loss": 2.2161, + "step": 1520 + }, + { + "epoch": 0.11449218088409642, + "grad_norm": 4.758808135986328, + "learning_rate": 9.816696950922709e-05, + "loss": 2.1604, + "step": 1521 + }, + { + "epoch": 0.11456745516475658, + "grad_norm": 4.702451229095459, + "learning_rate": 9.816369743573853e-05, + "loss": 1.9517, + "step": 1522 + }, + { + "epoch": 0.11464272944541674, + "grad_norm": 5.616663455963135, + "learning_rate": 9.816042249905637e-05, + "loss": 2.0729, + "step": 1523 + }, + { + "epoch": 0.11471800372607689, + "grad_norm": 7.786160945892334, + "learning_rate": 9.815714469937529e-05, + "loss": 1.6986, + "step": 1524 + }, + { + "epoch": 0.11479327800673705, + "grad_norm": 4.867729663848877, + "learning_rate": 9.815386403689013e-05, + "loss": 2.0512, + "step": 1525 + }, + { + "epoch": 0.1148685522873972, + "grad_norm": 5.106400012969971, + "learning_rate": 9.815058051179591e-05, + "loss": 1.9166, + "step": 1526 + }, + { + "epoch": 0.11494382656805736, + "grad_norm": 5.183450222015381, + "learning_rate": 9.814729412428785e-05, + "loss": 2.4293, + "step": 1527 + }, + { + "epoch": 0.11501910084871751, + "grad_norm": 6.640865325927734, + "learning_rate": 9.81440048745613e-05, + "loss": 2.3642, + "step": 1528 + }, + { + "epoch": 0.11509437512937767, + "grad_norm": 6.376265048980713, + "learning_rate": 9.81407127628118e-05, + "loss": 1.9368, + "step": 1529 + }, + { + "epoch": 0.11516964941003782, + "grad_norm": 5.263660907745361, + "learning_rate": 9.813741778923506e-05, + "loss": 2.5126, + "step": 1530 + }, + { + "epoch": 0.11524492369069798, + "grad_norm": 5.883419513702393, + "learning_rate": 9.813411995402697e-05, + "loss": 2.0917, + "step": 1531 + }, + { + "epoch": 0.11532019797135813, + "grad_norm": 7.351604461669922, + "learning_rate": 9.813081925738354e-05, + "loss": 2.0258, + "step": 1532 + }, + { + "epoch": 0.1153954722520183, + "grad_norm": 5.937775135040283, + "learning_rate": 9.812751569950101e-05, + "loss": 2.0962, + "step": 1533 + }, + { + "epoch": 0.11547074653267844, + "grad_norm": 4.7670464515686035, + "learning_rate": 9.812420928057578e-05, + "loss": 3.0468, + "step": 1534 + }, + { + "epoch": 0.1155460208133386, + "grad_norm": 4.790338516235352, + "learning_rate": 9.812090000080437e-05, + "loss": 2.153, + "step": 1535 + }, + { + "epoch": 0.11562129509399875, + "grad_norm": 5.9281086921691895, + "learning_rate": 9.811758786038354e-05, + "loss": 2.3436, + "step": 1536 + }, + { + "epoch": 0.11569656937465891, + "grad_norm": 4.886473655700684, + "learning_rate": 9.811427285951018e-05, + "loss": 1.9634, + "step": 1537 + }, + { + "epoch": 0.11577184365531906, + "grad_norm": 5.937329292297363, + "learning_rate": 9.811095499838134e-05, + "loss": 2.2245, + "step": 1538 + }, + { + "epoch": 0.11584711793597922, + "grad_norm": 5.479110240936279, + "learning_rate": 9.81076342771943e-05, + "loss": 2.2096, + "step": 1539 + }, + { + "epoch": 0.11592239221663939, + "grad_norm": 5.118183135986328, + "learning_rate": 9.81043106961464e-05, + "loss": 2.5553, + "step": 1540 + }, + { + "epoch": 0.11599766649729953, + "grad_norm": 7.191288471221924, + "learning_rate": 9.810098425543528e-05, + "loss": 2.0511, + "step": 1541 + }, + { + "epoch": 0.1160729407779597, + "grad_norm": 6.910278797149658, + "learning_rate": 9.809765495525867e-05, + "loss": 2.1989, + "step": 1542 + }, + { + "epoch": 0.11614821505861984, + "grad_norm": 6.195734024047852, + "learning_rate": 9.809432279581446e-05, + "loss": 2.2245, + "step": 1543 + }, + { + "epoch": 0.11622348933928, + "grad_norm": 6.120545387268066, + "learning_rate": 9.809098777730077e-05, + "loss": 2.1519, + "step": 1544 + }, + { + "epoch": 0.11629876361994015, + "grad_norm": 7.294845104217529, + "learning_rate": 9.808764989991585e-05, + "loss": 2.2604, + "step": 1545 + }, + { + "epoch": 0.11637403790060032, + "grad_norm": 4.55820894241333, + "learning_rate": 9.808430916385811e-05, + "loss": 2.2675, + "step": 1546 + }, + { + "epoch": 0.11644931218126046, + "grad_norm": 6.292935848236084, + "learning_rate": 9.808096556932616e-05, + "loss": 2.1317, + "step": 1547 + }, + { + "epoch": 0.11652458646192063, + "grad_norm": 6.036036014556885, + "learning_rate": 9.807761911651878e-05, + "loss": 2.2352, + "step": 1548 + }, + { + "epoch": 0.11659986074258077, + "grad_norm": 5.23516845703125, + "learning_rate": 9.807426980563488e-05, + "loss": 2.0015, + "step": 1549 + }, + { + "epoch": 0.11667513502324094, + "grad_norm": 5.066883563995361, + "learning_rate": 9.807091763687359e-05, + "loss": 2.0093, + "step": 1550 + }, + { + "epoch": 0.11675040930390108, + "grad_norm": 6.832826137542725, + "learning_rate": 9.806756261043417e-05, + "loss": 2.2704, + "step": 1551 + }, + { + "epoch": 0.11682568358456125, + "grad_norm": 7.867970943450928, + "learning_rate": 9.806420472651607e-05, + "loss": 2.1464, + "step": 1552 + }, + { + "epoch": 0.1169009578652214, + "grad_norm": 6.299739837646484, + "learning_rate": 9.806084398531892e-05, + "loss": 2.1337, + "step": 1553 + }, + { + "epoch": 0.11697623214588156, + "grad_norm": 6.507174015045166, + "learning_rate": 9.80574803870425e-05, + "loss": 2.0298, + "step": 1554 + }, + { + "epoch": 0.11705150642654172, + "grad_norm": 6.098346710205078, + "learning_rate": 9.805411393188676e-05, + "loss": 2.2475, + "step": 1555 + }, + { + "epoch": 0.11712678070720187, + "grad_norm": 6.21531867980957, + "learning_rate": 9.805074462005183e-05, + "loss": 2.2574, + "step": 1556 + }, + { + "epoch": 0.11720205498786203, + "grad_norm": 6.67768669128418, + "learning_rate": 9.804737245173801e-05, + "loss": 2.1979, + "step": 1557 + }, + { + "epoch": 0.11727732926852218, + "grad_norm": 6.259979724884033, + "learning_rate": 9.804399742714574e-05, + "loss": 2.1533, + "step": 1558 + }, + { + "epoch": 0.11735260354918234, + "grad_norm": 6.360954761505127, + "learning_rate": 9.80406195464757e-05, + "loss": 2.4152, + "step": 1559 + }, + { + "epoch": 0.11742787782984249, + "grad_norm": 5.9031219482421875, + "learning_rate": 9.803723880992866e-05, + "loss": 1.9498, + "step": 1560 + }, + { + "epoch": 0.11750315211050265, + "grad_norm": 5.639250755310059, + "learning_rate": 9.80338552177056e-05, + "loss": 2.3557, + "step": 1561 + }, + { + "epoch": 0.1175784263911628, + "grad_norm": 7.0413384437561035, + "learning_rate": 9.803046877000768e-05, + "loss": 2.0074, + "step": 1562 + }, + { + "epoch": 0.11765370067182296, + "grad_norm": 8.83906078338623, + "learning_rate": 9.802707946703622e-05, + "loss": 2.0737, + "step": 1563 + }, + { + "epoch": 0.1177289749524831, + "grad_norm": 5.171319007873535, + "learning_rate": 9.802368730899267e-05, + "loss": 1.9466, + "step": 1564 + }, + { + "epoch": 0.11780424923314327, + "grad_norm": 6.001622676849365, + "learning_rate": 9.802029229607873e-05, + "loss": 2.7779, + "step": 1565 + }, + { + "epoch": 0.11787952351380342, + "grad_norm": 5.641585826873779, + "learning_rate": 9.801689442849617e-05, + "loss": 2.1132, + "step": 1566 + }, + { + "epoch": 0.11795479779446358, + "grad_norm": 7.2758097648620605, + "learning_rate": 9.801349370644702e-05, + "loss": 2.3559, + "step": 1567 + }, + { + "epoch": 0.11803007207512373, + "grad_norm": 14.900161743164062, + "learning_rate": 9.801009013013345e-05, + "loss": 2.3044, + "step": 1568 + }, + { + "epoch": 0.11810534635578389, + "grad_norm": 5.561187267303467, + "learning_rate": 9.800668369975776e-05, + "loss": 2.9818, + "step": 1569 + }, + { + "epoch": 0.11818062063644404, + "grad_norm": 5.411054611206055, + "learning_rate": 9.800327441552248e-05, + "loss": 1.918, + "step": 1570 + }, + { + "epoch": 0.1182558949171042, + "grad_norm": 5.714082717895508, + "learning_rate": 9.799986227763027e-05, + "loss": 2.2684, + "step": 1571 + }, + { + "epoch": 0.11833116919776436, + "grad_norm": 8.185328483581543, + "learning_rate": 9.799644728628398e-05, + "loss": 2.1285, + "step": 1572 + }, + { + "epoch": 0.11840644347842451, + "grad_norm": 5.142179489135742, + "learning_rate": 9.799302944168662e-05, + "loss": 2.0175, + "step": 1573 + }, + { + "epoch": 0.11848171775908467, + "grad_norm": 6.00110387802124, + "learning_rate": 9.798960874404135e-05, + "loss": 2.5517, + "step": 1574 + }, + { + "epoch": 0.11855699203974482, + "grad_norm": 10.993745803833008, + "learning_rate": 9.798618519355155e-05, + "loss": 2.2265, + "step": 1575 + }, + { + "epoch": 0.11863226632040498, + "grad_norm": 4.340198040008545, + "learning_rate": 9.798275879042073e-05, + "loss": 1.8933, + "step": 1576 + }, + { + "epoch": 0.11870754060106513, + "grad_norm": 6.0071258544921875, + "learning_rate": 9.797932953485258e-05, + "loss": 2.4131, + "step": 1577 + }, + { + "epoch": 0.11878281488172529, + "grad_norm": 5.955046653747559, + "learning_rate": 9.797589742705094e-05, + "loss": 2.0842, + "step": 1578 + }, + { + "epoch": 0.11885808916238544, + "grad_norm": 5.829794406890869, + "learning_rate": 9.797246246721988e-05, + "loss": 1.8168, + "step": 1579 + }, + { + "epoch": 0.1189333634430456, + "grad_norm": 6.047423362731934, + "learning_rate": 9.796902465556354e-05, + "loss": 1.9509, + "step": 1580 + }, + { + "epoch": 0.11900863772370575, + "grad_norm": 5.070875644683838, + "learning_rate": 9.796558399228635e-05, + "loss": 2.1511, + "step": 1581 + }, + { + "epoch": 0.11908391200436591, + "grad_norm": 6.586251735687256, + "learning_rate": 9.796214047759282e-05, + "loss": 2.2421, + "step": 1582 + }, + { + "epoch": 0.11915918628502606, + "grad_norm": 9.841683387756348, + "learning_rate": 9.795869411168766e-05, + "loss": 2.3947, + "step": 1583 + }, + { + "epoch": 0.11923446056568622, + "grad_norm": 4.856389045715332, + "learning_rate": 9.795524489477572e-05, + "loss": 2.975, + "step": 1584 + }, + { + "epoch": 0.11930973484634637, + "grad_norm": 4.7928338050842285, + "learning_rate": 9.79517928270621e-05, + "loss": 2.1721, + "step": 1585 + }, + { + "epoch": 0.11938500912700653, + "grad_norm": 5.896542072296143, + "learning_rate": 9.794833790875197e-05, + "loss": 2.1084, + "step": 1586 + }, + { + "epoch": 0.11946028340766668, + "grad_norm": 7.052713394165039, + "learning_rate": 9.794488014005072e-05, + "loss": 2.0306, + "step": 1587 + }, + { + "epoch": 0.11953555768832684, + "grad_norm": 5.767529010772705, + "learning_rate": 9.794141952116393e-05, + "loss": 2.2704, + "step": 1588 + }, + { + "epoch": 0.119610831968987, + "grad_norm": 4.579977989196777, + "learning_rate": 9.793795605229731e-05, + "loss": 2.723, + "step": 1589 + }, + { + "epoch": 0.11968610624964715, + "grad_norm": 5.088316440582275, + "learning_rate": 9.793448973365676e-05, + "loss": 2.02, + "step": 1590 + }, + { + "epoch": 0.11976138053030731, + "grad_norm": 4.700905799865723, + "learning_rate": 9.793102056544832e-05, + "loss": 1.9295, + "step": 1591 + }, + { + "epoch": 0.11983665481096746, + "grad_norm": 4.777873516082764, + "learning_rate": 9.792754854787824e-05, + "loss": 1.9644, + "step": 1592 + }, + { + "epoch": 0.11991192909162762, + "grad_norm": 3.716501235961914, + "learning_rate": 9.79240736811529e-05, + "loss": 1.9246, + "step": 1593 + }, + { + "epoch": 0.11998720337228777, + "grad_norm": 6.38320779800415, + "learning_rate": 9.792059596547891e-05, + "loss": 2.6117, + "step": 1594 + }, + { + "epoch": 0.12006247765294793, + "grad_norm": 5.550270080566406, + "learning_rate": 9.791711540106299e-05, + "loss": 2.0404, + "step": 1595 + }, + { + "epoch": 0.12013775193360808, + "grad_norm": 5.174494743347168, + "learning_rate": 9.791363198811205e-05, + "loss": 2.4631, + "step": 1596 + }, + { + "epoch": 0.12021302621426824, + "grad_norm": 5.695245742797852, + "learning_rate": 9.791014572683316e-05, + "loss": 1.9024, + "step": 1597 + }, + { + "epoch": 0.12028830049492839, + "grad_norm": 5.44959020614624, + "learning_rate": 9.79066566174336e-05, + "loss": 2.1448, + "step": 1598 + }, + { + "epoch": 0.12036357477558855, + "grad_norm": 6.555175304412842, + "learning_rate": 9.790316466012073e-05, + "loss": 2.0605, + "step": 1599 + }, + { + "epoch": 0.1204388490562487, + "grad_norm": 6.986435890197754, + "learning_rate": 9.789966985510218e-05, + "loss": 2.0455, + "step": 1600 + }, + { + "epoch": 0.12051412333690886, + "grad_norm": 6.022130966186523, + "learning_rate": 9.789617220258571e-05, + "loss": 2.2684, + "step": 1601 + }, + { + "epoch": 0.12058939761756901, + "grad_norm": 8.36589527130127, + "learning_rate": 9.789267170277922e-05, + "loss": 2.0562, + "step": 1602 + }, + { + "epoch": 0.12066467189822917, + "grad_norm": 5.29276180267334, + "learning_rate": 9.788916835589082e-05, + "loss": 2.2865, + "step": 1603 + }, + { + "epoch": 0.12073994617888933, + "grad_norm": 8.113785743713379, + "learning_rate": 9.788566216212876e-05, + "loss": 2.1052, + "step": 1604 + }, + { + "epoch": 0.12081522045954948, + "grad_norm": 7.416926860809326, + "learning_rate": 9.78821531217015e-05, + "loss": 2.2905, + "step": 1605 + }, + { + "epoch": 0.12089049474020964, + "grad_norm": 7.012533664703369, + "learning_rate": 9.787864123481763e-05, + "loss": 2.2443, + "step": 1606 + }, + { + "epoch": 0.12096576902086979, + "grad_norm": 6.828108310699463, + "learning_rate": 9.78751265016859e-05, + "loss": 2.3844, + "step": 1607 + }, + { + "epoch": 0.12104104330152995, + "grad_norm": 5.260256767272949, + "learning_rate": 9.787160892251527e-05, + "loss": 2.3387, + "step": 1608 + }, + { + "epoch": 0.1211163175821901, + "grad_norm": 7.737606525421143, + "learning_rate": 9.786808849751486e-05, + "loss": 2.0778, + "step": 1609 + }, + { + "epoch": 0.12119159186285026, + "grad_norm": 4.928511619567871, + "learning_rate": 9.786456522689393e-05, + "loss": 1.8118, + "step": 1610 + }, + { + "epoch": 0.12126686614351041, + "grad_norm": 7.609243392944336, + "learning_rate": 9.786103911086195e-05, + "loss": 2.3105, + "step": 1611 + }, + { + "epoch": 0.12134214042417057, + "grad_norm": 6.966038227081299, + "learning_rate": 9.785751014962852e-05, + "loss": 2.2469, + "step": 1612 + }, + { + "epoch": 0.12141741470483072, + "grad_norm": 4.55077600479126, + "learning_rate": 9.785397834340341e-05, + "loss": 2.2754, + "step": 1613 + }, + { + "epoch": 0.12149268898549088, + "grad_norm": 5.990513324737549, + "learning_rate": 9.785044369239662e-05, + "loss": 2.2955, + "step": 1614 + }, + { + "epoch": 0.12156796326615103, + "grad_norm": 4.159506797790527, + "learning_rate": 9.784690619681824e-05, + "loss": 2.0037, + "step": 1615 + }, + { + "epoch": 0.1216432375468112, + "grad_norm": 5.416188716888428, + "learning_rate": 9.784336585687857e-05, + "loss": 1.7581, + "step": 1616 + }, + { + "epoch": 0.12171851182747134, + "grad_norm": 5.800360679626465, + "learning_rate": 9.783982267278808e-05, + "loss": 1.8906, + "step": 1617 + }, + { + "epoch": 0.1217937861081315, + "grad_norm": 3.4604830741882324, + "learning_rate": 9.783627664475741e-05, + "loss": 2.2593, + "step": 1618 + }, + { + "epoch": 0.12186906038879165, + "grad_norm": 7.346010684967041, + "learning_rate": 9.783272777299734e-05, + "loss": 2.005, + "step": 1619 + }, + { + "epoch": 0.12194433466945181, + "grad_norm": 5.070446014404297, + "learning_rate": 9.782917605771885e-05, + "loss": 2.0805, + "step": 1620 + }, + { + "epoch": 0.12201960895011198, + "grad_norm": 6.693550109863281, + "learning_rate": 9.782562149913307e-05, + "loss": 2.2061, + "step": 1621 + }, + { + "epoch": 0.12209488323077212, + "grad_norm": 5.4530181884765625, + "learning_rate": 9.782206409745135e-05, + "loss": 2.2714, + "step": 1622 + }, + { + "epoch": 0.12217015751143229, + "grad_norm": 5.354299545288086, + "learning_rate": 9.781850385288512e-05, + "loss": 1.9279, + "step": 1623 + }, + { + "epoch": 0.12224543179209243, + "grad_norm": 6.551476955413818, + "learning_rate": 9.781494076564604e-05, + "loss": 2.1084, + "step": 1624 + }, + { + "epoch": 0.1223207060727526, + "grad_norm": 4.194823741912842, + "learning_rate": 9.781137483594594e-05, + "loss": 1.9927, + "step": 1625 + }, + { + "epoch": 0.12239598035341274, + "grad_norm": 6.1428985595703125, + "learning_rate": 9.780780606399678e-05, + "loss": 2.3137, + "step": 1626 + }, + { + "epoch": 0.1224712546340729, + "grad_norm": 5.489068984985352, + "learning_rate": 9.780423445001073e-05, + "loss": 2.0632, + "step": 1627 + }, + { + "epoch": 0.12254652891473305, + "grad_norm": 5.6465044021606445, + "learning_rate": 9.780065999420011e-05, + "loss": 2.0563, + "step": 1628 + }, + { + "epoch": 0.12262180319539322, + "grad_norm": 6.312600135803223, + "learning_rate": 9.77970826967774e-05, + "loss": 1.9736, + "step": 1629 + }, + { + "epoch": 0.12269707747605337, + "grad_norm": 6.4857659339904785, + "learning_rate": 9.779350255795527e-05, + "loss": 2.1688, + "step": 1630 + }, + { + "epoch": 0.12277235175671353, + "grad_norm": 5.026872158050537, + "learning_rate": 9.778991957794657e-05, + "loss": 2.0311, + "step": 1631 + }, + { + "epoch": 0.12284762603737368, + "grad_norm": 7.403682708740234, + "learning_rate": 9.778633375696425e-05, + "loss": 2.2208, + "step": 1632 + }, + { + "epoch": 0.12292290031803384, + "grad_norm": 5.323304176330566, + "learning_rate": 9.778274509522152e-05, + "loss": 1.9562, + "step": 1633 + }, + { + "epoch": 0.12299817459869399, + "grad_norm": 6.573366165161133, + "learning_rate": 9.777915359293168e-05, + "loss": 2.0517, + "step": 1634 + }, + { + "epoch": 0.12307344887935415, + "grad_norm": 5.52882719039917, + "learning_rate": 9.777555925030826e-05, + "loss": 2.2944, + "step": 1635 + }, + { + "epoch": 0.12314872316001431, + "grad_norm": 5.32335090637207, + "learning_rate": 9.777196206756493e-05, + "loss": 2.1038, + "step": 1636 + }, + { + "epoch": 0.12322399744067446, + "grad_norm": 6.778807640075684, + "learning_rate": 9.776836204491553e-05, + "loss": 1.6062, + "step": 1637 + }, + { + "epoch": 0.12329927172133462, + "grad_norm": 7.642205238342285, + "learning_rate": 9.776475918257406e-05, + "loss": 2.2481, + "step": 1638 + }, + { + "epoch": 0.12337454600199477, + "grad_norm": 4.713119029998779, + "learning_rate": 9.776115348075472e-05, + "loss": 2.0603, + "step": 1639 + }, + { + "epoch": 0.12344982028265493, + "grad_norm": 6.024548530578613, + "learning_rate": 9.775754493967184e-05, + "loss": 2.2245, + "step": 1640 + }, + { + "epoch": 0.12352509456331508, + "grad_norm": 5.550868988037109, + "learning_rate": 9.775393355953995e-05, + "loss": 1.8459, + "step": 1641 + }, + { + "epoch": 0.12360036884397524, + "grad_norm": 5.1790032386779785, + "learning_rate": 9.775031934057372e-05, + "loss": 2.2188, + "step": 1642 + }, + { + "epoch": 0.12367564312463539, + "grad_norm": 6.542095184326172, + "learning_rate": 9.774670228298803e-05, + "loss": 1.8817, + "step": 1643 + }, + { + "epoch": 0.12375091740529555, + "grad_norm": 4.841869831085205, + "learning_rate": 9.774308238699787e-05, + "loss": 1.9914, + "step": 1644 + }, + { + "epoch": 0.1238261916859557, + "grad_norm": 5.752560138702393, + "learning_rate": 9.773945965281847e-05, + "loss": 2.0029, + "step": 1645 + }, + { + "epoch": 0.12390146596661586, + "grad_norm": 9.701780319213867, + "learning_rate": 9.773583408066518e-05, + "loss": 2.0469, + "step": 1646 + }, + { + "epoch": 0.12397674024727601, + "grad_norm": 4.296906471252441, + "learning_rate": 9.773220567075352e-05, + "loss": 2.1764, + "step": 1647 + }, + { + "epoch": 0.12405201452793617, + "grad_norm": 5.728732585906982, + "learning_rate": 9.772857442329917e-05, + "loss": 1.8114, + "step": 1648 + }, + { + "epoch": 0.12412728880859632, + "grad_norm": 6.810356616973877, + "learning_rate": 9.772494033851805e-05, + "loss": 2.2832, + "step": 1649 + }, + { + "epoch": 0.12420256308925648, + "grad_norm": 5.913647651672363, + "learning_rate": 9.772130341662614e-05, + "loss": 2.4259, + "step": 1650 + }, + { + "epoch": 0.12427783736991663, + "grad_norm": 5.676987648010254, + "learning_rate": 9.771766365783967e-05, + "loss": 2.0431, + "step": 1651 + }, + { + "epoch": 0.12435311165057679, + "grad_norm": 5.837631702423096, + "learning_rate": 9.771402106237504e-05, + "loss": 2.0348, + "step": 1652 + }, + { + "epoch": 0.12442838593123695, + "grad_norm": 6.00502872467041, + "learning_rate": 9.771037563044874e-05, + "loss": 2.0499, + "step": 1653 + }, + { + "epoch": 0.1245036602118971, + "grad_norm": 6.560897350311279, + "learning_rate": 9.770672736227751e-05, + "loss": 2.1814, + "step": 1654 + }, + { + "epoch": 0.12457893449255726, + "grad_norm": 5.640345096588135, + "learning_rate": 9.770307625807823e-05, + "loss": 1.9654, + "step": 1655 + }, + { + "epoch": 0.12465420877321741, + "grad_norm": 4.899707794189453, + "learning_rate": 9.769942231806794e-05, + "loss": 2.0147, + "step": 1656 + }, + { + "epoch": 0.12472948305387757, + "grad_norm": 4.982193946838379, + "learning_rate": 9.769576554246384e-05, + "loss": 2.2603, + "step": 1657 + }, + { + "epoch": 0.12480475733453772, + "grad_norm": 5.3386430740356445, + "learning_rate": 9.769210593148337e-05, + "loss": 2.0776, + "step": 1658 + }, + { + "epoch": 0.12488003161519788, + "grad_norm": 4.412789821624756, + "learning_rate": 9.768844348534403e-05, + "loss": 2.2757, + "step": 1659 + }, + { + "epoch": 0.12495530589585803, + "grad_norm": 5.69590950012207, + "learning_rate": 9.768477820426354e-05, + "loss": 1.9627, + "step": 1660 + }, + { + "epoch": 0.1250305801765182, + "grad_norm": 5.341485977172852, + "learning_rate": 9.768111008845982e-05, + "loss": 1.8706, + "step": 1661 + }, + { + "epoch": 0.12510585445717834, + "grad_norm": 4.827891826629639, + "learning_rate": 9.767743913815093e-05, + "loss": 2.0038, + "step": 1662 + }, + { + "epoch": 0.1251811287378385, + "grad_norm": 5.607531547546387, + "learning_rate": 9.767376535355508e-05, + "loss": 1.7966, + "step": 1663 + }, + { + "epoch": 0.12525640301849866, + "grad_norm": 6.948309898376465, + "learning_rate": 9.767008873489067e-05, + "loss": 2.233, + "step": 1664 + }, + { + "epoch": 0.1253316772991588, + "grad_norm": 4.283142566680908, + "learning_rate": 9.766640928237624e-05, + "loss": 1.8648, + "step": 1665 + }, + { + "epoch": 0.12540695157981896, + "grad_norm": 5.88974666595459, + "learning_rate": 9.766272699623058e-05, + "loss": 2.0447, + "step": 1666 + }, + { + "epoch": 0.1254822258604791, + "grad_norm": 7.175827503204346, + "learning_rate": 9.765904187667255e-05, + "loss": 2.6502, + "step": 1667 + }, + { + "epoch": 0.12555750014113928, + "grad_norm": 5.131073951721191, + "learning_rate": 9.765535392392123e-05, + "loss": 1.9488, + "step": 1668 + }, + { + "epoch": 0.12563277442179943, + "grad_norm": 4.771531105041504, + "learning_rate": 9.765166313819588e-05, + "loss": 2.0275, + "step": 1669 + }, + { + "epoch": 0.12570804870245958, + "grad_norm": 5.385657787322998, + "learning_rate": 9.764796951971585e-05, + "loss": 2.4261, + "step": 1670 + }, + { + "epoch": 0.12578332298311976, + "grad_norm": 6.468139171600342, + "learning_rate": 9.764427306870076e-05, + "loss": 2.3135, + "step": 1671 + }, + { + "epoch": 0.1258585972637799, + "grad_norm": 4.455704212188721, + "learning_rate": 9.764057378537035e-05, + "loss": 2.2562, + "step": 1672 + }, + { + "epoch": 0.12593387154444005, + "grad_norm": 5.4903082847595215, + "learning_rate": 9.76368716699445e-05, + "loss": 2.0452, + "step": 1673 + }, + { + "epoch": 0.1260091458251002, + "grad_norm": 5.3804216384887695, + "learning_rate": 9.763316672264336e-05, + "loss": 2.6459, + "step": 1674 + }, + { + "epoch": 0.12608442010576038, + "grad_norm": 4.479472637176514, + "learning_rate": 9.762945894368709e-05, + "loss": 2.1336, + "step": 1675 + }, + { + "epoch": 0.12615969438642052, + "grad_norm": 4.533637523651123, + "learning_rate": 9.762574833329617e-05, + "loss": 2.0619, + "step": 1676 + }, + { + "epoch": 0.12623496866708067, + "grad_norm": 6.115664958953857, + "learning_rate": 9.762203489169116e-05, + "loss": 2.0904, + "step": 1677 + }, + { + "epoch": 0.12631024294774082, + "grad_norm": 6.123455047607422, + "learning_rate": 9.761831861909283e-05, + "loss": 2.1754, + "step": 1678 + }, + { + "epoch": 0.126385517228401, + "grad_norm": 7.62650203704834, + "learning_rate": 9.761459951572208e-05, + "loss": 2.1732, + "step": 1679 + }, + { + "epoch": 0.12646079150906114, + "grad_norm": 4.549635410308838, + "learning_rate": 9.76108775818e-05, + "loss": 1.9141, + "step": 1680 + }, + { + "epoch": 0.1265360657897213, + "grad_norm": 6.09517765045166, + "learning_rate": 9.760715281754787e-05, + "loss": 2.3307, + "step": 1681 + }, + { + "epoch": 0.12661134007038144, + "grad_norm": 6.8112473487854, + "learning_rate": 9.76034252231871e-05, + "loss": 2.4828, + "step": 1682 + }, + { + "epoch": 0.12668661435104162, + "grad_norm": 8.26993465423584, + "learning_rate": 9.759969479893931e-05, + "loss": 2.7461, + "step": 1683 + }, + { + "epoch": 0.12676188863170176, + "grad_norm": 5.24868106842041, + "learning_rate": 9.759596154502622e-05, + "loss": 2.1971, + "step": 1684 + }, + { + "epoch": 0.1268371629123619, + "grad_norm": 5.017219543457031, + "learning_rate": 9.75922254616698e-05, + "loss": 2.0155, + "step": 1685 + }, + { + "epoch": 0.1269124371930221, + "grad_norm": 5.557555198669434, + "learning_rate": 9.758848654909213e-05, + "loss": 2.2513, + "step": 1686 + }, + { + "epoch": 0.12698771147368224, + "grad_norm": 4.580033302307129, + "learning_rate": 9.758474480751547e-05, + "loss": 2.3934, + "step": 1687 + }, + { + "epoch": 0.12706298575434238, + "grad_norm": 4.7362799644470215, + "learning_rate": 9.758100023716227e-05, + "loss": 2.3616, + "step": 1688 + }, + { + "epoch": 0.12713826003500253, + "grad_norm": 6.348353385925293, + "learning_rate": 9.757725283825514e-05, + "loss": 2.055, + "step": 1689 + }, + { + "epoch": 0.1272135343156627, + "grad_norm": 5.720735549926758, + "learning_rate": 9.757350261101685e-05, + "loss": 2.3077, + "step": 1690 + }, + { + "epoch": 0.12728880859632286, + "grad_norm": 5.7451605796813965, + "learning_rate": 9.756974955567033e-05, + "loss": 1.992, + "step": 1691 + }, + { + "epoch": 0.127364082876983, + "grad_norm": 4.385744571685791, + "learning_rate": 9.756599367243869e-05, + "loss": 2.0673, + "step": 1692 + }, + { + "epoch": 0.12743935715764315, + "grad_norm": 6.647571086883545, + "learning_rate": 9.75622349615452e-05, + "loss": 2.1362, + "step": 1693 + }, + { + "epoch": 0.12751463143830333, + "grad_norm": 4.4980010986328125, + "learning_rate": 9.75584734232133e-05, + "loss": 2.1913, + "step": 1694 + }, + { + "epoch": 0.12758990571896348, + "grad_norm": 5.175577640533447, + "learning_rate": 9.755470905766663e-05, + "loss": 2.0651, + "step": 1695 + }, + { + "epoch": 0.12766517999962362, + "grad_norm": 4.502558708190918, + "learning_rate": 9.755094186512897e-05, + "loss": 2.391, + "step": 1696 + }, + { + "epoch": 0.12774045428028377, + "grad_norm": 5.094159126281738, + "learning_rate": 9.754717184582424e-05, + "loss": 1.9887, + "step": 1697 + }, + { + "epoch": 0.12781572856094395, + "grad_norm": 4.980255603790283, + "learning_rate": 9.754339899997657e-05, + "loss": 2.2364, + "step": 1698 + }, + { + "epoch": 0.1278910028416041, + "grad_norm": 5.07272481918335, + "learning_rate": 9.753962332781025e-05, + "loss": 2.0959, + "step": 1699 + }, + { + "epoch": 0.12796627712226424, + "grad_norm": 8.20759105682373, + "learning_rate": 9.753584482954974e-05, + "loss": 2.6682, + "step": 1700 + }, + { + "epoch": 0.12804155140292442, + "grad_norm": 5.039438247680664, + "learning_rate": 9.753206350541963e-05, + "loss": 1.989, + "step": 1701 + }, + { + "epoch": 0.12811682568358457, + "grad_norm": 4.776042461395264, + "learning_rate": 9.752827935564474e-05, + "loss": 2.0493, + "step": 1702 + }, + { + "epoch": 0.12819209996424472, + "grad_norm": 4.75051736831665, + "learning_rate": 9.752449238045002e-05, + "loss": 1.879, + "step": 1703 + }, + { + "epoch": 0.12826737424490486, + "grad_norm": 4.19124174118042, + "learning_rate": 9.752070258006059e-05, + "loss": 1.9645, + "step": 1704 + }, + { + "epoch": 0.12834264852556504, + "grad_norm": 6.257390022277832, + "learning_rate": 9.751690995470175e-05, + "loss": 2.1308, + "step": 1705 + }, + { + "epoch": 0.1284179228062252, + "grad_norm": 9.848649024963379, + "learning_rate": 9.751311450459894e-05, + "loss": 2.194, + "step": 1706 + }, + { + "epoch": 0.12849319708688534, + "grad_norm": 8.7743558883667, + "learning_rate": 9.750931622997781e-05, + "loss": 2.1751, + "step": 1707 + }, + { + "epoch": 0.12856847136754548, + "grad_norm": 5.861545562744141, + "learning_rate": 9.750551513106416e-05, + "loss": 2.6289, + "step": 1708 + }, + { + "epoch": 0.12864374564820566, + "grad_norm": 6.404199123382568, + "learning_rate": 9.750171120808394e-05, + "loss": 2.3325, + "step": 1709 + }, + { + "epoch": 0.1287190199288658, + "grad_norm": 5.45130729675293, + "learning_rate": 9.749790446126327e-05, + "loss": 2.2062, + "step": 1710 + }, + { + "epoch": 0.12879429420952596, + "grad_norm": 8.279654502868652, + "learning_rate": 9.749409489082848e-05, + "loss": 2.7103, + "step": 1711 + }, + { + "epoch": 0.1288695684901861, + "grad_norm": 5.787228107452393, + "learning_rate": 9.749028249700602e-05, + "loss": 2.3553, + "step": 1712 + }, + { + "epoch": 0.12894484277084628, + "grad_norm": 5.115063190460205, + "learning_rate": 9.748646728002253e-05, + "loss": 2.1477, + "step": 1713 + }, + { + "epoch": 0.12902011705150643, + "grad_norm": 6.454010009765625, + "learning_rate": 9.748264924010482e-05, + "loss": 2.4386, + "step": 1714 + }, + { + "epoch": 0.12909539133216658, + "grad_norm": 4.400447845458984, + "learning_rate": 9.747882837747984e-05, + "loss": 2.7521, + "step": 1715 + }, + { + "epoch": 0.12917066561282672, + "grad_norm": 5.156986713409424, + "learning_rate": 9.747500469237476e-05, + "loss": 2.1434, + "step": 1716 + }, + { + "epoch": 0.1292459398934869, + "grad_norm": 4.321158409118652, + "learning_rate": 9.747117818501687e-05, + "loss": 2.249, + "step": 1717 + }, + { + "epoch": 0.12932121417414705, + "grad_norm": 5.449892520904541, + "learning_rate": 9.746734885563365e-05, + "loss": 2.3077, + "step": 1718 + }, + { + "epoch": 0.1293964884548072, + "grad_norm": 4.964759349822998, + "learning_rate": 9.746351670445273e-05, + "loss": 2.2884, + "step": 1719 + }, + { + "epoch": 0.12947176273546737, + "grad_norm": 5.850660800933838, + "learning_rate": 9.745968173170194e-05, + "loss": 2.2658, + "step": 1720 + }, + { + "epoch": 0.12954703701612752, + "grad_norm": 6.286604404449463, + "learning_rate": 9.745584393760923e-05, + "loss": 2.1384, + "step": 1721 + }, + { + "epoch": 0.12962231129678767, + "grad_norm": 6.299923419952393, + "learning_rate": 9.745200332240278e-05, + "loss": 2.0336, + "step": 1722 + }, + { + "epoch": 0.12969758557744782, + "grad_norm": 4.458899021148682, + "learning_rate": 9.744815988631089e-05, + "loss": 1.8374, + "step": 1723 + }, + { + "epoch": 0.129772859858108, + "grad_norm": 6.44940710067749, + "learning_rate": 9.744431362956203e-05, + "loss": 1.887, + "step": 1724 + }, + { + "epoch": 0.12984813413876814, + "grad_norm": 5.269408702850342, + "learning_rate": 9.744046455238487e-05, + "loss": 2.5734, + "step": 1725 + }, + { + "epoch": 0.1299234084194283, + "grad_norm": 8.123909950256348, + "learning_rate": 9.743661265500821e-05, + "loss": 2.4771, + "step": 1726 + }, + { + "epoch": 0.12999868270008844, + "grad_norm": 8.8287992477417, + "learning_rate": 9.743275793766102e-05, + "loss": 2.9099, + "step": 1727 + }, + { + "epoch": 0.1300739569807486, + "grad_norm": 4.445501327514648, + "learning_rate": 9.742890040057249e-05, + "loss": 2.2965, + "step": 1728 + }, + { + "epoch": 0.13014923126140876, + "grad_norm": 5.6940741539001465, + "learning_rate": 9.742504004397192e-05, + "loss": 2.1935, + "step": 1729 + }, + { + "epoch": 0.1302245055420689, + "grad_norm": 7.062665939331055, + "learning_rate": 9.742117686808878e-05, + "loss": 2.3127, + "step": 1730 + }, + { + "epoch": 0.13029977982272906, + "grad_norm": 5.12510347366333, + "learning_rate": 9.741731087315276e-05, + "loss": 2.0663, + "step": 1731 + }, + { + "epoch": 0.13037505410338923, + "grad_norm": 5.719430923461914, + "learning_rate": 9.741344205939366e-05, + "loss": 2.1603, + "step": 1732 + }, + { + "epoch": 0.13045032838404938, + "grad_norm": 7.074334144592285, + "learning_rate": 9.740957042704146e-05, + "loss": 2.1242, + "step": 1733 + }, + { + "epoch": 0.13052560266470953, + "grad_norm": 5.6323018074035645, + "learning_rate": 9.740569597632635e-05, + "loss": 2.2294, + "step": 1734 + }, + { + "epoch": 0.1306008769453697, + "grad_norm": 6.384583473205566, + "learning_rate": 9.740181870747863e-05, + "loss": 2.2038, + "step": 1735 + }, + { + "epoch": 0.13067615122602985, + "grad_norm": 7.3076958656311035, + "learning_rate": 9.73979386207288e-05, + "loss": 2.2905, + "step": 1736 + }, + { + "epoch": 0.13075142550669, + "grad_norm": 7.399223804473877, + "learning_rate": 9.739405571630751e-05, + "loss": 2.2938, + "step": 1737 + }, + { + "epoch": 0.13082669978735015, + "grad_norm": 5.686830520629883, + "learning_rate": 9.739016999444562e-05, + "loss": 2.1807, + "step": 1738 + }, + { + "epoch": 0.13090197406801032, + "grad_norm": 5.420448303222656, + "learning_rate": 9.738628145537407e-05, + "loss": 2.4733, + "step": 1739 + }, + { + "epoch": 0.13097724834867047, + "grad_norm": 7.386908054351807, + "learning_rate": 9.738239009932408e-05, + "loss": 2.6546, + "step": 1740 + }, + { + "epoch": 0.13105252262933062, + "grad_norm": 9.177496910095215, + "learning_rate": 9.737849592652695e-05, + "loss": 1.9644, + "step": 1741 + }, + { + "epoch": 0.13112779690999077, + "grad_norm": 6.809854030609131, + "learning_rate": 9.737459893721416e-05, + "loss": 2.2534, + "step": 1742 + }, + { + "epoch": 0.13120307119065094, + "grad_norm": 8.203299522399902, + "learning_rate": 9.737069913161741e-05, + "loss": 2.4105, + "step": 1743 + }, + { + "epoch": 0.1312783454713111, + "grad_norm": 8.162273406982422, + "learning_rate": 9.736679650996851e-05, + "loss": 2.1021, + "step": 1744 + }, + { + "epoch": 0.13135361975197124, + "grad_norm": 4.151286602020264, + "learning_rate": 9.73628910724995e-05, + "loss": 2.0502, + "step": 1745 + }, + { + "epoch": 0.1314288940326314, + "grad_norm": 4.2946953773498535, + "learning_rate": 9.735898281944249e-05, + "loss": 2.1161, + "step": 1746 + }, + { + "epoch": 0.13150416831329156, + "grad_norm": 4.501586437225342, + "learning_rate": 9.735507175102983e-05, + "loss": 2.577, + "step": 1747 + }, + { + "epoch": 0.1315794425939517, + "grad_norm": 5.702350616455078, + "learning_rate": 9.735115786749404e-05, + "loss": 1.849, + "step": 1748 + }, + { + "epoch": 0.13165471687461186, + "grad_norm": 6.722370624542236, + "learning_rate": 9.734724116906779e-05, + "loss": 2.2706, + "step": 1749 + }, + { + "epoch": 0.13172999115527204, + "grad_norm": 4.941370964050293, + "learning_rate": 9.73433216559839e-05, + "loss": 2.22, + "step": 1750 + }, + { + "epoch": 0.13180526543593218, + "grad_norm": 5.985321044921875, + "learning_rate": 9.733939932847538e-05, + "loss": 2.5196, + "step": 1751 + }, + { + "epoch": 0.13188053971659233, + "grad_norm": 5.818458557128906, + "learning_rate": 9.73354741867754e-05, + "loss": 2.0744, + "step": 1752 + }, + { + "epoch": 0.13195581399725248, + "grad_norm": 5.499964237213135, + "learning_rate": 9.733154623111731e-05, + "loss": 2.0645, + "step": 1753 + }, + { + "epoch": 0.13203108827791266, + "grad_norm": 4.1757097244262695, + "learning_rate": 9.73276154617346e-05, + "loss": 2.1942, + "step": 1754 + }, + { + "epoch": 0.1321063625585728, + "grad_norm": 4.557126998901367, + "learning_rate": 9.732368187886095e-05, + "loss": 2.2068, + "step": 1755 + }, + { + "epoch": 0.13218163683923295, + "grad_norm": 5.549522876739502, + "learning_rate": 9.73197454827302e-05, + "loss": 2.3314, + "step": 1756 + }, + { + "epoch": 0.1322569111198931, + "grad_norm": 6.318264007568359, + "learning_rate": 9.731580627357635e-05, + "loss": 1.9053, + "step": 1757 + }, + { + "epoch": 0.13233218540055328, + "grad_norm": 8.935311317443848, + "learning_rate": 9.731186425163359e-05, + "loss": 2.2194, + "step": 1758 + }, + { + "epoch": 0.13240745968121342, + "grad_norm": 4.741687774658203, + "learning_rate": 9.730791941713624e-05, + "loss": 2.2344, + "step": 1759 + }, + { + "epoch": 0.13248273396187357, + "grad_norm": 8.57996940612793, + "learning_rate": 9.730397177031884e-05, + "loss": 1.8939, + "step": 1760 + }, + { + "epoch": 0.13255800824253372, + "grad_norm": 8.860397338867188, + "learning_rate": 9.730002131141603e-05, + "loss": 2.4785, + "step": 1761 + }, + { + "epoch": 0.1326332825231939, + "grad_norm": 4.961053848266602, + "learning_rate": 9.729606804066269e-05, + "loss": 2.2266, + "step": 1762 + }, + { + "epoch": 0.13270855680385404, + "grad_norm": 6.253744602203369, + "learning_rate": 9.72921119582938e-05, + "loss": 2.1028, + "step": 1763 + }, + { + "epoch": 0.1327838310845142, + "grad_norm": 5.468624591827393, + "learning_rate": 9.728815306454455e-05, + "loss": 2.1288, + "step": 1764 + }, + { + "epoch": 0.13285910536517434, + "grad_norm": 5.2688679695129395, + "learning_rate": 9.72841913596503e-05, + "loss": 2.2242, + "step": 1765 + }, + { + "epoch": 0.13293437964583452, + "grad_norm": 4.66700553894043, + "learning_rate": 9.728022684384654e-05, + "loss": 2.1235, + "step": 1766 + }, + { + "epoch": 0.13300965392649466, + "grad_norm": 4.874022960662842, + "learning_rate": 9.727625951736893e-05, + "loss": 2.249, + "step": 1767 + }, + { + "epoch": 0.1330849282071548, + "grad_norm": 5.006189346313477, + "learning_rate": 9.727228938045338e-05, + "loss": 2.0739, + "step": 1768 + }, + { + "epoch": 0.133160202487815, + "grad_norm": 6.018039226531982, + "learning_rate": 9.726831643333585e-05, + "loss": 2.7297, + "step": 1769 + }, + { + "epoch": 0.13323547676847514, + "grad_norm": 4.151617527008057, + "learning_rate": 9.726434067625254e-05, + "loss": 2.0316, + "step": 1770 + }, + { + "epoch": 0.13331075104913528, + "grad_norm": 5.639735221862793, + "learning_rate": 9.726036210943978e-05, + "loss": 2.5051, + "step": 1771 + }, + { + "epoch": 0.13338602532979543, + "grad_norm": 6.182634353637695, + "learning_rate": 9.725638073313413e-05, + "loss": 2.2262, + "step": 1772 + }, + { + "epoch": 0.1334612996104556, + "grad_norm": 6.048215866088867, + "learning_rate": 9.725239654757221e-05, + "loss": 2.0787, + "step": 1773 + }, + { + "epoch": 0.13353657389111576, + "grad_norm": 8.24042797088623, + "learning_rate": 9.72484095529909e-05, + "loss": 2.5092, + "step": 1774 + }, + { + "epoch": 0.1336118481717759, + "grad_norm": 4.759000778198242, + "learning_rate": 9.724441974962722e-05, + "loss": 2.1504, + "step": 1775 + }, + { + "epoch": 0.13368712245243605, + "grad_norm": 6.386248588562012, + "learning_rate": 9.724042713771835e-05, + "loss": 1.941, + "step": 1776 + }, + { + "epoch": 0.13376239673309623, + "grad_norm": 5.959760665893555, + "learning_rate": 9.723643171750163e-05, + "loss": 2.1444, + "step": 1777 + }, + { + "epoch": 0.13383767101375638, + "grad_norm": 5.737596035003662, + "learning_rate": 9.723243348921458e-05, + "loss": 2.1536, + "step": 1778 + }, + { + "epoch": 0.13391294529441652, + "grad_norm": 4.480311393737793, + "learning_rate": 9.722843245309487e-05, + "loss": 2.2018, + "step": 1779 + }, + { + "epoch": 0.13398821957507667, + "grad_norm": 4.975409030914307, + "learning_rate": 9.722442860938039e-05, + "loss": 2.3466, + "step": 1780 + }, + { + "epoch": 0.13406349385573685, + "grad_norm": 5.327111721038818, + "learning_rate": 9.722042195830912e-05, + "loss": 1.9718, + "step": 1781 + }, + { + "epoch": 0.134138768136397, + "grad_norm": 7.569465637207031, + "learning_rate": 9.721641250011925e-05, + "loss": 2.3213, + "step": 1782 + }, + { + "epoch": 0.13421404241705714, + "grad_norm": 3.7208921909332275, + "learning_rate": 9.721240023504914e-05, + "loss": 1.9377, + "step": 1783 + }, + { + "epoch": 0.13428931669771732, + "grad_norm": 5.377165794372559, + "learning_rate": 9.72083851633373e-05, + "loss": 2.2264, + "step": 1784 + }, + { + "epoch": 0.13436459097837747, + "grad_norm": 4.720230579376221, + "learning_rate": 9.720436728522242e-05, + "loss": 2.1111, + "step": 1785 + }, + { + "epoch": 0.13443986525903762, + "grad_norm": 4.689426898956299, + "learning_rate": 9.720034660094336e-05, + "loss": 1.9133, + "step": 1786 + }, + { + "epoch": 0.13451513953969776, + "grad_norm": 6.165879726409912, + "learning_rate": 9.719632311073911e-05, + "loss": 2.3325, + "step": 1787 + }, + { + "epoch": 0.13459041382035794, + "grad_norm": 5.546353816986084, + "learning_rate": 9.719229681484889e-05, + "loss": 2.1248, + "step": 1788 + }, + { + "epoch": 0.1346656881010181, + "grad_norm": 6.2506303787231445, + "learning_rate": 9.718826771351202e-05, + "loss": 2.0899, + "step": 1789 + }, + { + "epoch": 0.13474096238167824, + "grad_norm": 5.79584264755249, + "learning_rate": 9.718423580696805e-05, + "loss": 2.1998, + "step": 1790 + }, + { + "epoch": 0.13481623666233838, + "grad_norm": 5.265510559082031, + "learning_rate": 9.718020109545663e-05, + "loss": 2.4406, + "step": 1791 + }, + { + "epoch": 0.13489151094299856, + "grad_norm": 4.779012203216553, + "learning_rate": 9.717616357921764e-05, + "loss": 2.0552, + "step": 1792 + }, + { + "epoch": 0.1349667852236587, + "grad_norm": 5.139724254608154, + "learning_rate": 9.717212325849107e-05, + "loss": 1.9984, + "step": 1793 + }, + { + "epoch": 0.13504205950431886, + "grad_norm": 4.352697849273682, + "learning_rate": 9.716808013351715e-05, + "loss": 2.2594, + "step": 1794 + }, + { + "epoch": 0.135117333784979, + "grad_norm": 4.581589698791504, + "learning_rate": 9.71640342045362e-05, + "loss": 2.1334, + "step": 1795 + }, + { + "epoch": 0.13519260806563918, + "grad_norm": 5.797093868255615, + "learning_rate": 9.715998547178874e-05, + "loss": 2.2136, + "step": 1796 + }, + { + "epoch": 0.13526788234629933, + "grad_norm": 11.062444686889648, + "learning_rate": 9.715593393551546e-05, + "loss": 2.006, + "step": 1797 + }, + { + "epoch": 0.13534315662695948, + "grad_norm": 5.804313659667969, + "learning_rate": 9.715187959595722e-05, + "loss": 1.9304, + "step": 1798 + }, + { + "epoch": 0.13541843090761965, + "grad_norm": 5.663839340209961, + "learning_rate": 9.714782245335502e-05, + "loss": 2.3911, + "step": 1799 + }, + { + "epoch": 0.1354937051882798, + "grad_norm": 5.694082736968994, + "learning_rate": 9.714376250795007e-05, + "loss": 1.7971, + "step": 1800 + }, + { + "epoch": 0.13556897946893995, + "grad_norm": 4.768952369689941, + "learning_rate": 9.713969975998369e-05, + "loss": 2.0693, + "step": 1801 + }, + { + "epoch": 0.1356442537496001, + "grad_norm": 3.2732179164886475, + "learning_rate": 9.713563420969745e-05, + "loss": 2.2302, + "step": 1802 + }, + { + "epoch": 0.13571952803026027, + "grad_norm": 3.4827237129211426, + "learning_rate": 9.713156585733298e-05, + "loss": 1.9464, + "step": 1803 + }, + { + "epoch": 0.13579480231092042, + "grad_norm": 4.595655918121338, + "learning_rate": 9.712749470313216e-05, + "loss": 1.8714, + "step": 1804 + }, + { + "epoch": 0.13587007659158057, + "grad_norm": 6.143069744110107, + "learning_rate": 9.712342074733701e-05, + "loss": 1.8531, + "step": 1805 + }, + { + "epoch": 0.13594535087224072, + "grad_norm": 5.182194232940674, + "learning_rate": 9.711934399018969e-05, + "loss": 1.8819, + "step": 1806 + }, + { + "epoch": 0.1360206251529009, + "grad_norm": 6.4293999671936035, + "learning_rate": 9.711526443193259e-05, + "loss": 2.0137, + "step": 1807 + }, + { + "epoch": 0.13609589943356104, + "grad_norm": 7.438591480255127, + "learning_rate": 9.71111820728082e-05, + "loss": 2.3774, + "step": 1808 + }, + { + "epoch": 0.1361711737142212, + "grad_norm": 5.2735137939453125, + "learning_rate": 9.710709691305922e-05, + "loss": 2.3979, + "step": 1809 + }, + { + "epoch": 0.13624644799488134, + "grad_norm": 4.832973957061768, + "learning_rate": 9.71030089529285e-05, + "loss": 1.9479, + "step": 1810 + }, + { + "epoch": 0.1363217222755415, + "grad_norm": 5.121523380279541, + "learning_rate": 9.709891819265904e-05, + "loss": 1.9892, + "step": 1811 + }, + { + "epoch": 0.13639699655620166, + "grad_norm": 6.729373931884766, + "learning_rate": 9.709482463249403e-05, + "loss": 2.2766, + "step": 1812 + }, + { + "epoch": 0.1364722708368618, + "grad_norm": 4.503567218780518, + "learning_rate": 9.709072827267683e-05, + "loss": 2.5778, + "step": 1813 + }, + { + "epoch": 0.13654754511752198, + "grad_norm": 4.67728853225708, + "learning_rate": 9.708662911345096e-05, + "loss": 1.8506, + "step": 1814 + }, + { + "epoch": 0.13662281939818213, + "grad_norm": 4.27795934677124, + "learning_rate": 9.708252715506009e-05, + "loss": 2.414, + "step": 1815 + }, + { + "epoch": 0.13669809367884228, + "grad_norm": 4.99085807800293, + "learning_rate": 9.707842239774807e-05, + "loss": 1.778, + "step": 1816 + }, + { + "epoch": 0.13677336795950243, + "grad_norm": 4.5596818923950195, + "learning_rate": 9.707431484175893e-05, + "loss": 2.1777, + "step": 1817 + }, + { + "epoch": 0.1368486422401626, + "grad_norm": 3.5612947940826416, + "learning_rate": 9.707020448733684e-05, + "loss": 2.0014, + "step": 1818 + }, + { + "epoch": 0.13692391652082275, + "grad_norm": 6.679454326629639, + "learning_rate": 9.706609133472617e-05, + "loss": 2.4387, + "step": 1819 + }, + { + "epoch": 0.1369991908014829, + "grad_norm": 8.171709060668945, + "learning_rate": 9.706197538417139e-05, + "loss": 1.868, + "step": 1820 + }, + { + "epoch": 0.13707446508214305, + "grad_norm": 7.789645671844482, + "learning_rate": 9.705785663591722e-05, + "loss": 2.1866, + "step": 1821 + }, + { + "epoch": 0.13714973936280322, + "grad_norm": 4.307685852050781, + "learning_rate": 9.705373509020849e-05, + "loss": 2.2128, + "step": 1822 + }, + { + "epoch": 0.13722501364346337, + "grad_norm": 5.5334978103637695, + "learning_rate": 9.704961074729024e-05, + "loss": 2.3402, + "step": 1823 + }, + { + "epoch": 0.13730028792412352, + "grad_norm": 4.713198184967041, + "learning_rate": 9.704548360740762e-05, + "loss": 2.448, + "step": 1824 + }, + { + "epoch": 0.13737556220478367, + "grad_norm": 8.580302238464355, + "learning_rate": 9.7041353670806e-05, + "loss": 1.9568, + "step": 1825 + }, + { + "epoch": 0.13745083648544384, + "grad_norm": 5.269320011138916, + "learning_rate": 9.703722093773086e-05, + "loss": 2.2145, + "step": 1826 + }, + { + "epoch": 0.137526110766104, + "grad_norm": 5.430004119873047, + "learning_rate": 9.703308540842792e-05, + "loss": 1.8306, + "step": 1827 + }, + { + "epoch": 0.13760138504676414, + "grad_norm": 4.822761058807373, + "learning_rate": 9.7028947083143e-05, + "loss": 1.787, + "step": 1828 + }, + { + "epoch": 0.1376766593274243, + "grad_norm": 6.5863471031188965, + "learning_rate": 9.702480596212211e-05, + "loss": 2.1386, + "step": 1829 + }, + { + "epoch": 0.13775193360808446, + "grad_norm": 7.925605773925781, + "learning_rate": 9.702066204561144e-05, + "loss": 2.19, + "step": 1830 + }, + { + "epoch": 0.1378272078887446, + "grad_norm": 6.601686000823975, + "learning_rate": 9.701651533385731e-05, + "loss": 2.0439, + "step": 1831 + }, + { + "epoch": 0.13790248216940476, + "grad_norm": 7.559757232666016, + "learning_rate": 9.701236582710629e-05, + "loss": 2.0299, + "step": 1832 + }, + { + "epoch": 0.13797775645006494, + "grad_norm": 5.193469047546387, + "learning_rate": 9.700821352560498e-05, + "loss": 2.3234, + "step": 1833 + }, + { + "epoch": 0.13805303073072508, + "grad_norm": 7.745946407318115, + "learning_rate": 9.700405842960028e-05, + "loss": 2.3313, + "step": 1834 + }, + { + "epoch": 0.13812830501138523, + "grad_norm": 4.591289043426514, + "learning_rate": 9.699990053933916e-05, + "loss": 2.2, + "step": 1835 + }, + { + "epoch": 0.13820357929204538, + "grad_norm": 7.64949369430542, + "learning_rate": 9.699573985506881e-05, + "loss": 2.03, + "step": 1836 + }, + { + "epoch": 0.13827885357270556, + "grad_norm": 5.139095783233643, + "learning_rate": 9.699157637703657e-05, + "loss": 2.4074, + "step": 1837 + }, + { + "epoch": 0.1383541278533657, + "grad_norm": 6.202521800994873, + "learning_rate": 9.698741010548997e-05, + "loss": 2.2196, + "step": 1838 + }, + { + "epoch": 0.13842940213402585, + "grad_norm": 6.413371562957764, + "learning_rate": 9.698324104067664e-05, + "loss": 1.9609, + "step": 1839 + }, + { + "epoch": 0.138504676414686, + "grad_norm": 5.329407691955566, + "learning_rate": 9.697906918284445e-05, + "loss": 2.085, + "step": 1840 + }, + { + "epoch": 0.13857995069534618, + "grad_norm": 7.392550945281982, + "learning_rate": 9.69748945322414e-05, + "loss": 2.2143, + "step": 1841 + }, + { + "epoch": 0.13865522497600632, + "grad_norm": 6.214261054992676, + "learning_rate": 9.697071708911564e-05, + "loss": 2.0554, + "step": 1842 + }, + { + "epoch": 0.13873049925666647, + "grad_norm": 4.798659801483154, + "learning_rate": 9.696653685371553e-05, + "loss": 2.0461, + "step": 1843 + }, + { + "epoch": 0.13880577353732662, + "grad_norm": 5.882729530334473, + "learning_rate": 9.696235382628958e-05, + "loss": 2.0589, + "step": 1844 + }, + { + "epoch": 0.1388810478179868, + "grad_norm": 6.147152423858643, + "learning_rate": 9.695816800708642e-05, + "loss": 2.3985, + "step": 1845 + }, + { + "epoch": 0.13895632209864694, + "grad_norm": 4.156700611114502, + "learning_rate": 9.695397939635493e-05, + "loss": 2.0313, + "step": 1846 + }, + { + "epoch": 0.1390315963793071, + "grad_norm": 7.421755313873291, + "learning_rate": 9.694978799434409e-05, + "loss": 2.1407, + "step": 1847 + }, + { + "epoch": 0.13910687065996727, + "grad_norm": 5.052271366119385, + "learning_rate": 9.694559380130306e-05, + "loss": 2.1883, + "step": 1848 + }, + { + "epoch": 0.13918214494062742, + "grad_norm": 5.62962532043457, + "learning_rate": 9.694139681748118e-05, + "loss": 2.2528, + "step": 1849 + }, + { + "epoch": 0.13925741922128757, + "grad_norm": 6.184092998504639, + "learning_rate": 9.693719704312795e-05, + "loss": 2.051, + "step": 1850 + }, + { + "epoch": 0.1393326935019477, + "grad_norm": 4.593359470367432, + "learning_rate": 9.693299447849303e-05, + "loss": 2.2775, + "step": 1851 + }, + { + "epoch": 0.1394079677826079, + "grad_norm": 4.519968032836914, + "learning_rate": 9.692878912382625e-05, + "loss": 1.9706, + "step": 1852 + }, + { + "epoch": 0.13948324206326804, + "grad_norm": 6.722831726074219, + "learning_rate": 9.692458097937763e-05, + "loss": 2.4025, + "step": 1853 + }, + { + "epoch": 0.13955851634392819, + "grad_norm": 5.059573173522949, + "learning_rate": 9.692037004539732e-05, + "loss": 2.1064, + "step": 1854 + }, + { + "epoch": 0.13963379062458833, + "grad_norm": 4.556968688964844, + "learning_rate": 9.691615632213561e-05, + "loss": 1.7618, + "step": 1855 + }, + { + "epoch": 0.1397090649052485, + "grad_norm": 4.672435283660889, + "learning_rate": 9.691193980984303e-05, + "loss": 2.1174, + "step": 1856 + }, + { + "epoch": 0.13978433918590866, + "grad_norm": 5.852546215057373, + "learning_rate": 9.690772050877024e-05, + "loss": 1.9525, + "step": 1857 + }, + { + "epoch": 0.1398596134665688, + "grad_norm": 4.079924583435059, + "learning_rate": 9.690349841916806e-05, + "loss": 2.1488, + "step": 1858 + }, + { + "epoch": 0.13993488774722895, + "grad_norm": 4.969254970550537, + "learning_rate": 9.689927354128749e-05, + "loss": 2.3749, + "step": 1859 + }, + { + "epoch": 0.14001016202788913, + "grad_norm": 4.448051929473877, + "learning_rate": 9.689504587537967e-05, + "loss": 2.1747, + "step": 1860 + }, + { + "epoch": 0.14008543630854928, + "grad_norm": 5.967103958129883, + "learning_rate": 9.689081542169591e-05, + "loss": 2.215, + "step": 1861 + }, + { + "epoch": 0.14016071058920943, + "grad_norm": 5.0489959716796875, + "learning_rate": 9.688658218048774e-05, + "loss": 2.1, + "step": 1862 + }, + { + "epoch": 0.1402359848698696, + "grad_norm": 8.213915824890137, + "learning_rate": 9.68823461520068e-05, + "loss": 2.6792, + "step": 1863 + }, + { + "epoch": 0.14031125915052975, + "grad_norm": 5.846588611602783, + "learning_rate": 9.687810733650488e-05, + "loss": 2.1365, + "step": 1864 + }, + { + "epoch": 0.1403865334311899, + "grad_norm": 4.936132907867432, + "learning_rate": 9.687386573423401e-05, + "loss": 2.0129, + "step": 1865 + }, + { + "epoch": 0.14046180771185005, + "grad_norm": 5.1625165939331055, + "learning_rate": 9.68696213454463e-05, + "loss": 2.2631, + "step": 1866 + }, + { + "epoch": 0.14053708199251022, + "grad_norm": 6.869221210479736, + "learning_rate": 9.686537417039408e-05, + "loss": 2.3976, + "step": 1867 + }, + { + "epoch": 0.14061235627317037, + "grad_norm": 4.970541000366211, + "learning_rate": 9.686112420932984e-05, + "loss": 2.1959, + "step": 1868 + }, + { + "epoch": 0.14068763055383052, + "grad_norm": 6.011102676391602, + "learning_rate": 9.685687146250624e-05, + "loss": 2.1489, + "step": 1869 + }, + { + "epoch": 0.14076290483449067, + "grad_norm": 4.859211444854736, + "learning_rate": 9.685261593017607e-05, + "loss": 2.5562, + "step": 1870 + }, + { + "epoch": 0.14083817911515084, + "grad_norm": 3.5985958576202393, + "learning_rate": 9.684835761259232e-05, + "loss": 2.2823, + "step": 1871 + }, + { + "epoch": 0.140913453395811, + "grad_norm": 6.8388190269470215, + "learning_rate": 9.684409651000812e-05, + "loss": 1.9302, + "step": 1872 + }, + { + "epoch": 0.14098872767647114, + "grad_norm": 5.084116458892822, + "learning_rate": 9.683983262267682e-05, + "loss": 1.8937, + "step": 1873 + }, + { + "epoch": 0.14106400195713129, + "grad_norm": 4.719859600067139, + "learning_rate": 9.683556595085186e-05, + "loss": 1.9668, + "step": 1874 + }, + { + "epoch": 0.14113927623779146, + "grad_norm": 4.512369632720947, + "learning_rate": 9.683129649478689e-05, + "loss": 1.9114, + "step": 1875 + }, + { + "epoch": 0.1412145505184516, + "grad_norm": 4.319483280181885, + "learning_rate": 9.68270242547357e-05, + "loss": 2.4611, + "step": 1876 + }, + { + "epoch": 0.14128982479911176, + "grad_norm": 5.295613765716553, + "learning_rate": 9.682274923095229e-05, + "loss": 2.0023, + "step": 1877 + }, + { + "epoch": 0.1413650990797719, + "grad_norm": 6.274294376373291, + "learning_rate": 9.68184714236908e-05, + "loss": 1.9391, + "step": 1878 + }, + { + "epoch": 0.14144037336043208, + "grad_norm": 5.767858028411865, + "learning_rate": 9.68141908332055e-05, + "loss": 2.0379, + "step": 1879 + }, + { + "epoch": 0.14151564764109223, + "grad_norm": 4.43416166305542, + "learning_rate": 9.68099074597509e-05, + "loss": 2.7999, + "step": 1880 + }, + { + "epoch": 0.14159092192175238, + "grad_norm": 5.756229877471924, + "learning_rate": 9.68056213035816e-05, + "loss": 2.0056, + "step": 1881 + }, + { + "epoch": 0.14166619620241255, + "grad_norm": 5.336365222930908, + "learning_rate": 9.680133236495241e-05, + "loss": 2.1471, + "step": 1882 + }, + { + "epoch": 0.1417414704830727, + "grad_norm": 4.178652763366699, + "learning_rate": 9.679704064411832e-05, + "loss": 2.0595, + "step": 1883 + }, + { + "epoch": 0.14181674476373285, + "grad_norm": 5.193113327026367, + "learning_rate": 9.679274614133442e-05, + "loss": 2.5843, + "step": 1884 + }, + { + "epoch": 0.141892019044393, + "grad_norm": 5.874992847442627, + "learning_rate": 9.678844885685603e-05, + "loss": 2.2923, + "step": 1885 + }, + { + "epoch": 0.14196729332505317, + "grad_norm": 4.65516996383667, + "learning_rate": 9.678414879093862e-05, + "loss": 2.2086, + "step": 1886 + }, + { + "epoch": 0.14204256760571332, + "grad_norm": 4.876006603240967, + "learning_rate": 9.677984594383778e-05, + "loss": 2.1032, + "step": 1887 + }, + { + "epoch": 0.14211784188637347, + "grad_norm": 4.680221080780029, + "learning_rate": 9.677554031580935e-05, + "loss": 2.106, + "step": 1888 + }, + { + "epoch": 0.14219311616703362, + "grad_norm": 4.564823627471924, + "learning_rate": 9.677123190710925e-05, + "loss": 2.2426, + "step": 1889 + }, + { + "epoch": 0.1422683904476938, + "grad_norm": 5.39882755279541, + "learning_rate": 9.676692071799362e-05, + "loss": 1.8801, + "step": 1890 + }, + { + "epoch": 0.14234366472835394, + "grad_norm": 4.606545448303223, + "learning_rate": 9.676260674871874e-05, + "loss": 2.2484, + "step": 1891 + }, + { + "epoch": 0.1424189390090141, + "grad_norm": 4.663834095001221, + "learning_rate": 9.675828999954105e-05, + "loss": 2.2326, + "step": 1892 + }, + { + "epoch": 0.14249421328967424, + "grad_norm": 4.789903163909912, + "learning_rate": 9.67539704707172e-05, + "loss": 2.1222, + "step": 1893 + }, + { + "epoch": 0.1425694875703344, + "grad_norm": 4.491421699523926, + "learning_rate": 9.674964816250395e-05, + "loss": 1.9721, + "step": 1894 + }, + { + "epoch": 0.14264476185099456, + "grad_norm": 7.125611305236816, + "learning_rate": 9.674532307515827e-05, + "loss": 2.3517, + "step": 1895 + }, + { + "epoch": 0.1427200361316547, + "grad_norm": 6.573733806610107, + "learning_rate": 9.674099520893724e-05, + "loss": 2.1702, + "step": 1896 + }, + { + "epoch": 0.14279531041231489, + "grad_norm": 6.0808258056640625, + "learning_rate": 9.673666456409817e-05, + "loss": 2.2207, + "step": 1897 + }, + { + "epoch": 0.14287058469297503, + "grad_norm": 5.009258270263672, + "learning_rate": 9.67323311408985e-05, + "loss": 2.0732, + "step": 1898 + }, + { + "epoch": 0.14294585897363518, + "grad_norm": 4.410242080688477, + "learning_rate": 9.672799493959584e-05, + "loss": 2.0659, + "step": 1899 + }, + { + "epoch": 0.14302113325429533, + "grad_norm": 5.891568183898926, + "learning_rate": 9.672365596044795e-05, + "loss": 2.4044, + "step": 1900 + }, + { + "epoch": 0.1430964075349555, + "grad_norm": 4.866339206695557, + "learning_rate": 9.671931420371278e-05, + "loss": 1.9795, + "step": 1901 + }, + { + "epoch": 0.14317168181561565, + "grad_norm": 6.606231689453125, + "learning_rate": 9.671496966964842e-05, + "loss": 1.9754, + "step": 1902 + }, + { + "epoch": 0.1432469560962758, + "grad_norm": 7.27636194229126, + "learning_rate": 9.671062235851317e-05, + "loss": 1.7578, + "step": 1903 + }, + { + "epoch": 0.14332223037693595, + "grad_norm": 5.7700114250183105, + "learning_rate": 9.670627227056543e-05, + "loss": 2.7232, + "step": 1904 + }, + { + "epoch": 0.14339750465759613, + "grad_norm": 5.701020240783691, + "learning_rate": 9.670191940606384e-05, + "loss": 2.5565, + "step": 1905 + }, + { + "epoch": 0.14347277893825627, + "grad_norm": 5.346332550048828, + "learning_rate": 9.669756376526714e-05, + "loss": 2.4132, + "step": 1906 + }, + { + "epoch": 0.14354805321891642, + "grad_norm": 6.396316051483154, + "learning_rate": 9.669320534843425e-05, + "loss": 2.5714, + "step": 1907 + }, + { + "epoch": 0.14362332749957657, + "grad_norm": 6.035998344421387, + "learning_rate": 9.66888441558243e-05, + "loss": 2.0749, + "step": 1908 + }, + { + "epoch": 0.14369860178023675, + "grad_norm": 5.173978805541992, + "learning_rate": 9.668448018769652e-05, + "loss": 2.2366, + "step": 1909 + }, + { + "epoch": 0.1437738760608969, + "grad_norm": 6.03705358505249, + "learning_rate": 9.668011344431034e-05, + "loss": 2.4118, + "step": 1910 + }, + { + "epoch": 0.14384915034155704, + "grad_norm": 7.774417400360107, + "learning_rate": 9.667574392592537e-05, + "loss": 1.9568, + "step": 1911 + }, + { + "epoch": 0.14392442462221722, + "grad_norm": 5.122204303741455, + "learning_rate": 9.667137163280134e-05, + "loss": 2.3293, + "step": 1912 + }, + { + "epoch": 0.14399969890287737, + "grad_norm": 4.664328098297119, + "learning_rate": 9.666699656519817e-05, + "loss": 1.99, + "step": 1913 + }, + { + "epoch": 0.1440749731835375, + "grad_norm": 4.738907337188721, + "learning_rate": 9.666261872337599e-05, + "loss": 2.1834, + "step": 1914 + }, + { + "epoch": 0.14415024746419766, + "grad_norm": 5.164193630218506, + "learning_rate": 9.665823810759498e-05, + "loss": 2.1721, + "step": 1915 + }, + { + "epoch": 0.14422552174485784, + "grad_norm": 3.930130958557129, + "learning_rate": 9.665385471811562e-05, + "loss": 2.3643, + "step": 1916 + }, + { + "epoch": 0.14430079602551799, + "grad_norm": 10.628747940063477, + "learning_rate": 9.664946855519845e-05, + "loss": 2.1983, + "step": 1917 + }, + { + "epoch": 0.14437607030617813, + "grad_norm": 4.002389907836914, + "learning_rate": 9.664507961910423e-05, + "loss": 1.9855, + "step": 1918 + }, + { + "epoch": 0.14445134458683828, + "grad_norm": 5.094120025634766, + "learning_rate": 9.664068791009386e-05, + "loss": 2.0275, + "step": 1919 + }, + { + "epoch": 0.14452661886749846, + "grad_norm": 4.587681770324707, + "learning_rate": 9.663629342842843e-05, + "loss": 1.9947, + "step": 1920 + }, + { + "epoch": 0.1446018931481586, + "grad_norm": 6.098039150238037, + "learning_rate": 9.663189617436916e-05, + "loss": 1.9019, + "step": 1921 + }, + { + "epoch": 0.14467716742881875, + "grad_norm": 6.508772373199463, + "learning_rate": 9.662749614817748e-05, + "loss": 2.2373, + "step": 1922 + }, + { + "epoch": 0.1447524417094789, + "grad_norm": 5.737447738647461, + "learning_rate": 9.662309335011494e-05, + "loss": 1.957, + "step": 1923 + }, + { + "epoch": 0.14482771599013908, + "grad_norm": 5.731282711029053, + "learning_rate": 9.661868778044328e-05, + "loss": 1.9759, + "step": 1924 + }, + { + "epoch": 0.14490299027079923, + "grad_norm": 5.115933895111084, + "learning_rate": 9.661427943942437e-05, + "loss": 1.7477, + "step": 1925 + }, + { + "epoch": 0.14497826455145937, + "grad_norm": 6.987696170806885, + "learning_rate": 9.660986832732032e-05, + "loss": 2.0165, + "step": 1926 + }, + { + "epoch": 0.14505353883211952, + "grad_norm": 6.869017601013184, + "learning_rate": 9.660545444439333e-05, + "loss": 2.4173, + "step": 1927 + }, + { + "epoch": 0.1451288131127797, + "grad_norm": 6.631619930267334, + "learning_rate": 9.66010377909058e-05, + "loss": 2.3266, + "step": 1928 + }, + { + "epoch": 0.14520408739343985, + "grad_norm": 5.309915542602539, + "learning_rate": 9.659661836712028e-05, + "loss": 2.2489, + "step": 1929 + }, + { + "epoch": 0.1452793616741, + "grad_norm": 4.7900190353393555, + "learning_rate": 9.659219617329951e-05, + "loss": 2.2821, + "step": 1930 + }, + { + "epoch": 0.14535463595476017, + "grad_norm": 3.912114143371582, + "learning_rate": 9.658777120970637e-05, + "loss": 2.1911, + "step": 1931 + }, + { + "epoch": 0.14542991023542032, + "grad_norm": 4.142040252685547, + "learning_rate": 9.65833434766039e-05, + "loss": 2.5666, + "step": 1932 + }, + { + "epoch": 0.14550518451608047, + "grad_norm": 4.453502655029297, + "learning_rate": 9.657891297425531e-05, + "loss": 2.0453, + "step": 1933 + }, + { + "epoch": 0.1455804587967406, + "grad_norm": 4.587471961975098, + "learning_rate": 9.657447970292402e-05, + "loss": 2.0182, + "step": 1934 + }, + { + "epoch": 0.1456557330774008, + "grad_norm": 4.55674934387207, + "learning_rate": 9.657004366287352e-05, + "loss": 2.0423, + "step": 1935 + }, + { + "epoch": 0.14573100735806094, + "grad_norm": 5.007842540740967, + "learning_rate": 9.656560485436757e-05, + "loss": 2.3055, + "step": 1936 + }, + { + "epoch": 0.14580628163872109, + "grad_norm": 5.330198764801025, + "learning_rate": 9.656116327767002e-05, + "loss": 1.8905, + "step": 1937 + }, + { + "epoch": 0.14588155591938123, + "grad_norm": 6.192382335662842, + "learning_rate": 9.65567189330449e-05, + "loss": 2.4201, + "step": 1938 + }, + { + "epoch": 0.1459568302000414, + "grad_norm": 4.350824356079102, + "learning_rate": 9.655227182075643e-05, + "loss": 2.4094, + "step": 1939 + }, + { + "epoch": 0.14603210448070156, + "grad_norm": 5.458762168884277, + "learning_rate": 9.6547821941069e-05, + "loss": 2.2235, + "step": 1940 + }, + { + "epoch": 0.1461073787613617, + "grad_norm": 4.833422660827637, + "learning_rate": 9.654336929424708e-05, + "loss": 2.2301, + "step": 1941 + }, + { + "epoch": 0.14618265304202185, + "grad_norm": 5.182411193847656, + "learning_rate": 9.653891388055541e-05, + "loss": 1.8838, + "step": 1942 + }, + { + "epoch": 0.14625792732268203, + "grad_norm": 4.223660945892334, + "learning_rate": 9.653445570025887e-05, + "loss": 2.1262, + "step": 1943 + }, + { + "epoch": 0.14633320160334218, + "grad_norm": 5.680100917816162, + "learning_rate": 9.652999475362243e-05, + "loss": 2.0308, + "step": 1944 + }, + { + "epoch": 0.14640847588400233, + "grad_norm": 4.405364036560059, + "learning_rate": 9.652553104091133e-05, + "loss": 2.0664, + "step": 1945 + }, + { + "epoch": 0.1464837501646625, + "grad_norm": 5.610743045806885, + "learning_rate": 9.65210645623909e-05, + "loss": 2.199, + "step": 1946 + }, + { + "epoch": 0.14655902444532265, + "grad_norm": 5.353166580200195, + "learning_rate": 9.651659531832667e-05, + "loss": 1.8452, + "step": 1947 + }, + { + "epoch": 0.1466342987259828, + "grad_norm": 4.80667781829834, + "learning_rate": 9.651212330898432e-05, + "loss": 2.2945, + "step": 1948 + }, + { + "epoch": 0.14670957300664295, + "grad_norm": 7.225444793701172, + "learning_rate": 9.650764853462967e-05, + "loss": 2.0363, + "step": 1949 + }, + { + "epoch": 0.14678484728730312, + "grad_norm": 4.671341419219971, + "learning_rate": 9.650317099552879e-05, + "loss": 1.8784, + "step": 1950 + }, + { + "epoch": 0.14686012156796327, + "grad_norm": 7.252857685089111, + "learning_rate": 9.649869069194783e-05, + "loss": 2.3605, + "step": 1951 + }, + { + "epoch": 0.14693539584862342, + "grad_norm": 4.275567531585693, + "learning_rate": 9.649420762415311e-05, + "loss": 1.636, + "step": 1952 + }, + { + "epoch": 0.14701067012928357, + "grad_norm": 8.222707748413086, + "learning_rate": 9.648972179241116e-05, + "loss": 2.191, + "step": 1953 + }, + { + "epoch": 0.14708594440994374, + "grad_norm": 6.288633823394775, + "learning_rate": 9.648523319698865e-05, + "loss": 2.3571, + "step": 1954 + }, + { + "epoch": 0.1471612186906039, + "grad_norm": 5.934138298034668, + "learning_rate": 9.648074183815241e-05, + "loss": 2.4525, + "step": 1955 + }, + { + "epoch": 0.14723649297126404, + "grad_norm": 8.490248680114746, + "learning_rate": 9.647624771616944e-05, + "loss": 1.7362, + "step": 1956 + }, + { + "epoch": 0.14731176725192419, + "grad_norm": 6.234622955322266, + "learning_rate": 9.647175083130687e-05, + "loss": 1.7949, + "step": 1957 + }, + { + "epoch": 0.14738704153258436, + "grad_norm": 5.089983940124512, + "learning_rate": 9.646725118383207e-05, + "loss": 1.7567, + "step": 1958 + }, + { + "epoch": 0.1474623158132445, + "grad_norm": 4.100534915924072, + "learning_rate": 9.646274877401252e-05, + "loss": 1.9498, + "step": 1959 + }, + { + "epoch": 0.14753759009390466, + "grad_norm": 4.9055962562561035, + "learning_rate": 9.645824360211589e-05, + "loss": 1.8627, + "step": 1960 + }, + { + "epoch": 0.14761286437456483, + "grad_norm": 7.884844779968262, + "learning_rate": 9.645373566840994e-05, + "loss": 2.6448, + "step": 1961 + }, + { + "epoch": 0.14768813865522498, + "grad_norm": 8.933359146118164, + "learning_rate": 9.644922497316274e-05, + "loss": 2.6395, + "step": 1962 + }, + { + "epoch": 0.14776341293588513, + "grad_norm": 5.583507537841797, + "learning_rate": 9.644471151664236e-05, + "loss": 2.0274, + "step": 1963 + }, + { + "epoch": 0.14783868721654528, + "grad_norm": 4.9328999519348145, + "learning_rate": 9.644019529911716e-05, + "loss": 2.3124, + "step": 1964 + }, + { + "epoch": 0.14791396149720545, + "grad_norm": 6.132089138031006, + "learning_rate": 9.64356763208556e-05, + "loss": 1.9784, + "step": 1965 + }, + { + "epoch": 0.1479892357778656, + "grad_norm": 6.383988380432129, + "learning_rate": 9.643115458212631e-05, + "loss": 2.1371, + "step": 1966 + }, + { + "epoch": 0.14806451005852575, + "grad_norm": 6.754266262054443, + "learning_rate": 9.642663008319812e-05, + "loss": 2.2423, + "step": 1967 + }, + { + "epoch": 0.1481397843391859, + "grad_norm": 5.1077680587768555, + "learning_rate": 9.642210282433999e-05, + "loss": 2.2848, + "step": 1968 + }, + { + "epoch": 0.14821505861984607, + "grad_norm": 6.1306891441345215, + "learning_rate": 9.641757280582103e-05, + "loss": 2.0986, + "step": 1969 + }, + { + "epoch": 0.14829033290050622, + "grad_norm": 6.481471538543701, + "learning_rate": 9.641304002791057e-05, + "loss": 2.1875, + "step": 1970 + }, + { + "epoch": 0.14836560718116637, + "grad_norm": 5.6584577560424805, + "learning_rate": 9.640850449087804e-05, + "loss": 2.2309, + "step": 1971 + }, + { + "epoch": 0.14844088146182652, + "grad_norm": 6.017479419708252, + "learning_rate": 9.640396619499308e-05, + "loss": 1.9637, + "step": 1972 + }, + { + "epoch": 0.1485161557424867, + "grad_norm": 5.747828483581543, + "learning_rate": 9.639942514052548e-05, + "loss": 2.0723, + "step": 1973 + }, + { + "epoch": 0.14859143002314684, + "grad_norm": 4.990069389343262, + "learning_rate": 9.639488132774518e-05, + "loss": 2.1641, + "step": 1974 + }, + { + "epoch": 0.148666704303807, + "grad_norm": 6.481212615966797, + "learning_rate": 9.639033475692233e-05, + "loss": 2.1727, + "step": 1975 + }, + { + "epoch": 0.14874197858446717, + "grad_norm": 3.856231689453125, + "learning_rate": 9.638578542832717e-05, + "loss": 1.8302, + "step": 1976 + }, + { + "epoch": 0.14881725286512731, + "grad_norm": 4.363240718841553, + "learning_rate": 9.638123334223017e-05, + "loss": 2.1773, + "step": 1977 + }, + { + "epoch": 0.14889252714578746, + "grad_norm": 4.597896575927734, + "learning_rate": 9.637667849890193e-05, + "loss": 1.9484, + "step": 1978 + }, + { + "epoch": 0.1489678014264476, + "grad_norm": 4.198542594909668, + "learning_rate": 9.637212089861323e-05, + "loss": 2.0514, + "step": 1979 + }, + { + "epoch": 0.14904307570710779, + "grad_norm": 4.761568546295166, + "learning_rate": 9.636756054163498e-05, + "loss": 2.2641, + "step": 1980 + }, + { + "epoch": 0.14911834998776793, + "grad_norm": 4.66417932510376, + "learning_rate": 9.636299742823831e-05, + "loss": 2.1895, + "step": 1981 + }, + { + "epoch": 0.14919362426842808, + "grad_norm": 4.6133294105529785, + "learning_rate": 9.635843155869449e-05, + "loss": 2.118, + "step": 1982 + }, + { + "epoch": 0.14926889854908823, + "grad_norm": 4.578795433044434, + "learning_rate": 9.63538629332749e-05, + "loss": 1.9732, + "step": 1983 + }, + { + "epoch": 0.1493441728297484, + "grad_norm": 4.421432018280029, + "learning_rate": 9.634929155225118e-05, + "loss": 2.2925, + "step": 1984 + }, + { + "epoch": 0.14941944711040855, + "grad_norm": 5.2989397048950195, + "learning_rate": 9.634471741589505e-05, + "loss": 1.9553, + "step": 1985 + }, + { + "epoch": 0.1494947213910687, + "grad_norm": 4.826900482177734, + "learning_rate": 9.634014052447849e-05, + "loss": 2.2639, + "step": 1986 + }, + { + "epoch": 0.14956999567172885, + "grad_norm": 4.551970481872559, + "learning_rate": 9.63355608782735e-05, + "loss": 2.3918, + "step": 1987 + }, + { + "epoch": 0.14964526995238903, + "grad_norm": 5.172665596008301, + "learning_rate": 9.633097847755238e-05, + "loss": 2.6773, + "step": 1988 + }, + { + "epoch": 0.14972054423304917, + "grad_norm": 4.0801544189453125, + "learning_rate": 9.632639332258752e-05, + "loss": 2.0645, + "step": 1989 + }, + { + "epoch": 0.14979581851370932, + "grad_norm": 8.26967716217041, + "learning_rate": 9.632180541365152e-05, + "loss": 1.9689, + "step": 1990 + }, + { + "epoch": 0.14987109279436947, + "grad_norm": 6.616847515106201, + "learning_rate": 9.631721475101709e-05, + "loss": 1.9611, + "step": 1991 + }, + { + "epoch": 0.14994636707502965, + "grad_norm": 5.640640735626221, + "learning_rate": 9.631262133495714e-05, + "loss": 2.2252, + "step": 1992 + }, + { + "epoch": 0.1500216413556898, + "grad_norm": 5.297373294830322, + "learning_rate": 9.630802516574476e-05, + "loss": 1.8446, + "step": 1993 + }, + { + "epoch": 0.15009691563634994, + "grad_norm": 6.098952293395996, + "learning_rate": 9.630342624365311e-05, + "loss": 1.7872, + "step": 1994 + }, + { + "epoch": 0.15017218991701012, + "grad_norm": 5.614943027496338, + "learning_rate": 9.629882456895567e-05, + "loss": 1.877, + "step": 1995 + }, + { + "epoch": 0.15024746419767027, + "grad_norm": 4.242717742919922, + "learning_rate": 9.629422014192595e-05, + "loss": 2.2544, + "step": 1996 + }, + { + "epoch": 0.15032273847833041, + "grad_norm": 4.987455368041992, + "learning_rate": 9.628961296283766e-05, + "loss": 1.8918, + "step": 1997 + }, + { + "epoch": 0.15039801275899056, + "grad_norm": 6.457184791564941, + "learning_rate": 9.62850030319647e-05, + "loss": 2.3043, + "step": 1998 + }, + { + "epoch": 0.15047328703965074, + "grad_norm": 4.086604595184326, + "learning_rate": 9.628039034958113e-05, + "loss": 2.4515, + "step": 1999 + }, + { + "epoch": 0.1505485613203109, + "grad_norm": 3.4579966068267822, + "learning_rate": 9.627577491596113e-05, + "loss": 2.2173, + "step": 2000 + }, + { + "epoch": 0.15062383560097103, + "grad_norm": 4.692470073699951, + "learning_rate": 9.627115673137912e-05, + "loss": 1.9975, + "step": 2001 + }, + { + "epoch": 0.15069910988163118, + "grad_norm": 4.353622913360596, + "learning_rate": 9.626653579610958e-05, + "loss": 1.9004, + "step": 2002 + }, + { + "epoch": 0.15077438416229136, + "grad_norm": 6.3962860107421875, + "learning_rate": 9.626191211042726e-05, + "loss": 1.8652, + "step": 2003 + }, + { + "epoch": 0.1508496584429515, + "grad_norm": 4.919609546661377, + "learning_rate": 9.625728567460699e-05, + "loss": 1.95, + "step": 2004 + }, + { + "epoch": 0.15092493272361165, + "grad_norm": 4.541040420532227, + "learning_rate": 9.625265648892384e-05, + "loss": 2.0875, + "step": 2005 + }, + { + "epoch": 0.1510002070042718, + "grad_norm": 5.7784528732299805, + "learning_rate": 9.624802455365294e-05, + "loss": 2.4886, + "step": 2006 + }, + { + "epoch": 0.15107548128493198, + "grad_norm": 4.03879451751709, + "learning_rate": 9.624338986906969e-05, + "loss": 2.0802, + "step": 2007 + }, + { + "epoch": 0.15115075556559213, + "grad_norm": 4.763751983642578, + "learning_rate": 9.623875243544962e-05, + "loss": 2.195, + "step": 2008 + }, + { + "epoch": 0.15122602984625227, + "grad_norm": 4.0279107093811035, + "learning_rate": 9.623411225306837e-05, + "loss": 2.2525, + "step": 2009 + }, + { + "epoch": 0.15130130412691245, + "grad_norm": 5.816425323486328, + "learning_rate": 9.622946932220182e-05, + "loss": 1.7914, + "step": 2010 + }, + { + "epoch": 0.1513765784075726, + "grad_norm": 4.008151054382324, + "learning_rate": 9.622482364312594e-05, + "loss": 1.9762, + "step": 2011 + }, + { + "epoch": 0.15145185268823275, + "grad_norm": 4.618908405303955, + "learning_rate": 9.622017521611698e-05, + "loss": 2.0327, + "step": 2012 + }, + { + "epoch": 0.1515271269688929, + "grad_norm": 4.456994533538818, + "learning_rate": 9.621552404145118e-05, + "loss": 1.7788, + "step": 2013 + }, + { + "epoch": 0.15160240124955307, + "grad_norm": 6.221850395202637, + "learning_rate": 9.621087011940509e-05, + "loss": 2.0515, + "step": 2014 + }, + { + "epoch": 0.15167767553021322, + "grad_norm": 6.546451568603516, + "learning_rate": 9.620621345025538e-05, + "loss": 2.1616, + "step": 2015 + }, + { + "epoch": 0.15175294981087337, + "grad_norm": 7.714503288269043, + "learning_rate": 9.620155403427885e-05, + "loss": 2.0173, + "step": 2016 + }, + { + "epoch": 0.15182822409153351, + "grad_norm": 6.9931535720825195, + "learning_rate": 9.619689187175249e-05, + "loss": 2.39, + "step": 2017 + }, + { + "epoch": 0.1519034983721937, + "grad_norm": 6.245025634765625, + "learning_rate": 9.619222696295348e-05, + "loss": 2.109, + "step": 2018 + }, + { + "epoch": 0.15197877265285384, + "grad_norm": 5.842207431793213, + "learning_rate": 9.618755930815912e-05, + "loss": 2.2011, + "step": 2019 + }, + { + "epoch": 0.152054046933514, + "grad_norm": 6.096057415008545, + "learning_rate": 9.618288890764688e-05, + "loss": 2.1472, + "step": 2020 + }, + { + "epoch": 0.15212932121417413, + "grad_norm": 5.656440258026123, + "learning_rate": 9.617821576169439e-05, + "loss": 1.7721, + "step": 2021 + }, + { + "epoch": 0.1522045954948343, + "grad_norm": 6.263998508453369, + "learning_rate": 9.61735398705795e-05, + "loss": 2.0137, + "step": 2022 + }, + { + "epoch": 0.15227986977549446, + "grad_norm": 4.848899841308594, + "learning_rate": 9.616886123458013e-05, + "loss": 2.1483, + "step": 2023 + }, + { + "epoch": 0.1523551440561546, + "grad_norm": 5.665957927703857, + "learning_rate": 9.616417985397446e-05, + "loss": 2.1406, + "step": 2024 + }, + { + "epoch": 0.15243041833681478, + "grad_norm": 5.535228252410889, + "learning_rate": 9.615949572904073e-05, + "loss": 2.4557, + "step": 2025 + }, + { + "epoch": 0.15250569261747493, + "grad_norm": 5.200531959533691, + "learning_rate": 9.615480886005744e-05, + "loss": 2.1067, + "step": 2026 + }, + { + "epoch": 0.15258096689813508, + "grad_norm": 7.101623058319092, + "learning_rate": 9.61501192473032e-05, + "loss": 2.0752, + "step": 2027 + }, + { + "epoch": 0.15265624117879523, + "grad_norm": 5.383849620819092, + "learning_rate": 9.614542689105676e-05, + "loss": 1.8721, + "step": 2028 + }, + { + "epoch": 0.1527315154594554, + "grad_norm": 7.677318096160889, + "learning_rate": 9.614073179159713e-05, + "loss": 2.2993, + "step": 2029 + }, + { + "epoch": 0.15280678974011555, + "grad_norm": 5.782139301300049, + "learning_rate": 9.613603394920337e-05, + "loss": 2.1034, + "step": 2030 + }, + { + "epoch": 0.1528820640207757, + "grad_norm": 5.71937894821167, + "learning_rate": 9.613133336415478e-05, + "loss": 2.1637, + "step": 2031 + }, + { + "epoch": 0.15295733830143585, + "grad_norm": 4.102837085723877, + "learning_rate": 9.612663003673079e-05, + "loss": 2.2697, + "step": 2032 + }, + { + "epoch": 0.15303261258209602, + "grad_norm": 4.255248069763184, + "learning_rate": 9.612192396721099e-05, + "loss": 2.2544, + "step": 2033 + }, + { + "epoch": 0.15310788686275617, + "grad_norm": 4.723361015319824, + "learning_rate": 9.611721515587515e-05, + "loss": 1.8494, + "step": 2034 + }, + { + "epoch": 0.15318316114341632, + "grad_norm": 4.966095447540283, + "learning_rate": 9.611250360300319e-05, + "loss": 2.2271, + "step": 2035 + }, + { + "epoch": 0.15325843542407647, + "grad_norm": 4.73899507522583, + "learning_rate": 9.610778930887521e-05, + "loss": 1.9068, + "step": 2036 + }, + { + "epoch": 0.15333370970473664, + "grad_norm": 5.542885780334473, + "learning_rate": 9.610307227377145e-05, + "loss": 2.2295, + "step": 2037 + }, + { + "epoch": 0.1534089839853968, + "grad_norm": 4.914027214050293, + "learning_rate": 9.609835249797233e-05, + "loss": 1.9545, + "step": 2038 + }, + { + "epoch": 0.15348425826605694, + "grad_norm": 4.769974708557129, + "learning_rate": 9.609362998175843e-05, + "loss": 1.9436, + "step": 2039 + }, + { + "epoch": 0.1535595325467171, + "grad_norm": 4.500949382781982, + "learning_rate": 9.608890472541048e-05, + "loss": 2.1632, + "step": 2040 + }, + { + "epoch": 0.15363480682737726, + "grad_norm": 5.971304416656494, + "learning_rate": 9.608417672920939e-05, + "loss": 1.939, + "step": 2041 + }, + { + "epoch": 0.1537100811080374, + "grad_norm": 4.242741107940674, + "learning_rate": 9.60794459934362e-05, + "loss": 2.1406, + "step": 2042 + }, + { + "epoch": 0.15378535538869756, + "grad_norm": 5.656136512756348, + "learning_rate": 9.607471251837219e-05, + "loss": 1.9298, + "step": 2043 + }, + { + "epoch": 0.15386062966935773, + "grad_norm": 5.102395057678223, + "learning_rate": 9.606997630429872e-05, + "loss": 2.5265, + "step": 2044 + }, + { + "epoch": 0.15393590395001788, + "grad_norm": 4.860550403594971, + "learning_rate": 9.606523735149735e-05, + "loss": 2.3572, + "step": 2045 + }, + { + "epoch": 0.15401117823067803, + "grad_norm": 6.583255767822266, + "learning_rate": 9.60604956602498e-05, + "loss": 2.282, + "step": 2046 + }, + { + "epoch": 0.15408645251133818, + "grad_norm": 4.965795516967773, + "learning_rate": 9.605575123083793e-05, + "loss": 2.4158, + "step": 2047 + }, + { + "epoch": 0.15416172679199835, + "grad_norm": 4.764041423797607, + "learning_rate": 9.605100406354379e-05, + "loss": 2.0478, + "step": 2048 + }, + { + "epoch": 0.1542370010726585, + "grad_norm": 4.438436985015869, + "learning_rate": 9.604625415864962e-05, + "loss": 1.82, + "step": 2049 + }, + { + "epoch": 0.15431227535331865, + "grad_norm": 5.641543388366699, + "learning_rate": 9.604150151643775e-05, + "loss": 2.2091, + "step": 2050 + }, + { + "epoch": 0.1543875496339788, + "grad_norm": 5.498066425323486, + "learning_rate": 9.603674613719072e-05, + "loss": 2.3056, + "step": 2051 + }, + { + "epoch": 0.15446282391463897, + "grad_norm": 6.014181137084961, + "learning_rate": 9.603198802119122e-05, + "loss": 1.8255, + "step": 2052 + }, + { + "epoch": 0.15453809819529912, + "grad_norm": 7.212738037109375, + "learning_rate": 9.602722716872213e-05, + "loss": 2.4084, + "step": 2053 + }, + { + "epoch": 0.15461337247595927, + "grad_norm": 4.357029914855957, + "learning_rate": 9.602246358006643e-05, + "loss": 2.2233, + "step": 2054 + }, + { + "epoch": 0.15468864675661942, + "grad_norm": 7.110274791717529, + "learning_rate": 9.601769725550736e-05, + "loss": 2.1846, + "step": 2055 + }, + { + "epoch": 0.1547639210372796, + "grad_norm": 5.4508490562438965, + "learning_rate": 9.601292819532819e-05, + "loss": 1.9935, + "step": 2056 + }, + { + "epoch": 0.15483919531793974, + "grad_norm": 4.063355445861816, + "learning_rate": 9.600815639981249e-05, + "loss": 2.3804, + "step": 2057 + }, + { + "epoch": 0.1549144695985999, + "grad_norm": 6.705149173736572, + "learning_rate": 9.600338186924389e-05, + "loss": 1.9364, + "step": 2058 + }, + { + "epoch": 0.15498974387926007, + "grad_norm": 5.776592254638672, + "learning_rate": 9.599860460390624e-05, + "loss": 1.9305, + "step": 2059 + }, + { + "epoch": 0.15506501815992021, + "grad_norm": 4.8743720054626465, + "learning_rate": 9.599382460408353e-05, + "loss": 2.2689, + "step": 2060 + }, + { + "epoch": 0.15514029244058036, + "grad_norm": 4.975855350494385, + "learning_rate": 9.598904187005991e-05, + "loss": 2.2705, + "step": 2061 + }, + { + "epoch": 0.1552155667212405, + "grad_norm": 4.627737522125244, + "learning_rate": 9.598425640211972e-05, + "loss": 2.121, + "step": 2062 + }, + { + "epoch": 0.1552908410019007, + "grad_norm": 7.565857410430908, + "learning_rate": 9.597946820054743e-05, + "loss": 2.765, + "step": 2063 + }, + { + "epoch": 0.15536611528256083, + "grad_norm": 8.018056869506836, + "learning_rate": 9.597467726562768e-05, + "loss": 2.2892, + "step": 2064 + }, + { + "epoch": 0.15544138956322098, + "grad_norm": 5.626107215881348, + "learning_rate": 9.596988359764529e-05, + "loss": 1.9739, + "step": 2065 + }, + { + "epoch": 0.15551666384388113, + "grad_norm": 5.794229030609131, + "learning_rate": 9.596508719688522e-05, + "loss": 2.7347, + "step": 2066 + }, + { + "epoch": 0.1555919381245413, + "grad_norm": 4.8090620040893555, + "learning_rate": 9.59602880636326e-05, + "loss": 2.3456, + "step": 2067 + }, + { + "epoch": 0.15566721240520145, + "grad_norm": 6.086714744567871, + "learning_rate": 9.595548619817274e-05, + "loss": 1.9479, + "step": 2068 + }, + { + "epoch": 0.1557424866858616, + "grad_norm": 5.960855007171631, + "learning_rate": 9.595068160079107e-05, + "loss": 1.9085, + "step": 2069 + }, + { + "epoch": 0.15581776096652175, + "grad_norm": 3.8029263019561768, + "learning_rate": 9.594587427177324e-05, + "loss": 1.9268, + "step": 2070 + }, + { + "epoch": 0.15589303524718193, + "grad_norm": 4.244431972503662, + "learning_rate": 9.594106421140501e-05, + "loss": 2.1821, + "step": 2071 + }, + { + "epoch": 0.15596830952784208, + "grad_norm": 5.755577564239502, + "learning_rate": 9.593625141997234e-05, + "loss": 2.454, + "step": 2072 + }, + { + "epoch": 0.15604358380850222, + "grad_norm": 4.253504276275635, + "learning_rate": 9.593143589776133e-05, + "loss": 2.042, + "step": 2073 + }, + { + "epoch": 0.1561188580891624, + "grad_norm": 4.887152671813965, + "learning_rate": 9.592661764505824e-05, + "loss": 1.8759, + "step": 2074 + }, + { + "epoch": 0.15619413236982255, + "grad_norm": 5.830852031707764, + "learning_rate": 9.59217966621495e-05, + "loss": 2.2862, + "step": 2075 + }, + { + "epoch": 0.1562694066504827, + "grad_norm": 4.546017169952393, + "learning_rate": 9.591697294932173e-05, + "loss": 2.1105, + "step": 2076 + }, + { + "epoch": 0.15634468093114284, + "grad_norm": 4.672211170196533, + "learning_rate": 9.591214650686167e-05, + "loss": 1.9517, + "step": 2077 + }, + { + "epoch": 0.15641995521180302, + "grad_norm": 5.439480781555176, + "learning_rate": 9.590731733505622e-05, + "loss": 2.0202, + "step": 2078 + }, + { + "epoch": 0.15649522949246317, + "grad_norm": 4.746372699737549, + "learning_rate": 9.59024854341925e-05, + "loss": 1.8844, + "step": 2079 + }, + { + "epoch": 0.15657050377312332, + "grad_norm": 4.743064880371094, + "learning_rate": 9.58976508045577e-05, + "loss": 2.0359, + "step": 2080 + }, + { + "epoch": 0.15664577805378346, + "grad_norm": 4.023740291595459, + "learning_rate": 9.589281344643927e-05, + "loss": 2.3622, + "step": 2081 + }, + { + "epoch": 0.15672105233444364, + "grad_norm": 5.366828441619873, + "learning_rate": 9.588797336012477e-05, + "loss": 1.9732, + "step": 2082 + }, + { + "epoch": 0.1567963266151038, + "grad_norm": 6.032355308532715, + "learning_rate": 9.588313054590192e-05, + "loss": 2.2055, + "step": 2083 + }, + { + "epoch": 0.15687160089576394, + "grad_norm": 5.078389644622803, + "learning_rate": 9.58782850040586e-05, + "loss": 2.5387, + "step": 2084 + }, + { + "epoch": 0.15694687517642408, + "grad_norm": 5.1413655281066895, + "learning_rate": 9.58734367348829e-05, + "loss": 2.1256, + "step": 2085 + }, + { + "epoch": 0.15702214945708426, + "grad_norm": 5.05446195602417, + "learning_rate": 9.586858573866299e-05, + "loss": 2.0679, + "step": 2086 + }, + { + "epoch": 0.1570974237377444, + "grad_norm": 5.082911014556885, + "learning_rate": 9.586373201568728e-05, + "loss": 2.2685, + "step": 2087 + }, + { + "epoch": 0.15717269801840456, + "grad_norm": 5.023488521575928, + "learning_rate": 9.585887556624429e-05, + "loss": 1.9534, + "step": 2088 + }, + { + "epoch": 0.1572479722990647, + "grad_norm": 6.252662181854248, + "learning_rate": 9.585401639062273e-05, + "loss": 1.9425, + "step": 2089 + }, + { + "epoch": 0.15732324657972488, + "grad_norm": 4.720130443572998, + "learning_rate": 9.584915448911147e-05, + "loss": 1.8599, + "step": 2090 + }, + { + "epoch": 0.15739852086038503, + "grad_norm": 3.836473226547241, + "learning_rate": 9.584428986199954e-05, + "loss": 1.9142, + "step": 2091 + }, + { + "epoch": 0.15747379514104518, + "grad_norm": 6.064619064331055, + "learning_rate": 9.583942250957611e-05, + "loss": 2.7541, + "step": 2092 + }, + { + "epoch": 0.15754906942170535, + "grad_norm": 4.852019786834717, + "learning_rate": 9.583455243213054e-05, + "loss": 2.0423, + "step": 2093 + }, + { + "epoch": 0.1576243437023655, + "grad_norm": 6.162034511566162, + "learning_rate": 9.582967962995235e-05, + "loss": 2.1552, + "step": 2094 + }, + { + "epoch": 0.15769961798302565, + "grad_norm": 5.074926853179932, + "learning_rate": 9.58248041033312e-05, + "loss": 1.8848, + "step": 2095 + }, + { + "epoch": 0.1577748922636858, + "grad_norm": 6.245471000671387, + "learning_rate": 9.581992585255692e-05, + "loss": 2.2631, + "step": 2096 + }, + { + "epoch": 0.15785016654434597, + "grad_norm": 5.494795322418213, + "learning_rate": 9.581504487791954e-05, + "loss": 2.1619, + "step": 2097 + }, + { + "epoch": 0.15792544082500612, + "grad_norm": 6.342918395996094, + "learning_rate": 9.58101611797092e-05, + "loss": 2.0738, + "step": 2098 + }, + { + "epoch": 0.15800071510566627, + "grad_norm": 4.561079025268555, + "learning_rate": 9.580527475821621e-05, + "loss": 1.9883, + "step": 2099 + }, + { + "epoch": 0.15807598938632642, + "grad_norm": 5.45330286026001, + "learning_rate": 9.580038561373108e-05, + "loss": 2.3148, + "step": 2100 + }, + { + "epoch": 0.1581512636669866, + "grad_norm": 5.858702182769775, + "learning_rate": 9.579549374654442e-05, + "loss": 2.8602, + "step": 2101 + }, + { + "epoch": 0.15822653794764674, + "grad_norm": 6.087124824523926, + "learning_rate": 9.579059915694708e-05, + "loss": 1.9753, + "step": 2102 + }, + { + "epoch": 0.1583018122283069, + "grad_norm": 5.192907810211182, + "learning_rate": 9.578570184523e-05, + "loss": 2.1302, + "step": 2103 + }, + { + "epoch": 0.15837708650896704, + "grad_norm": 5.223175048828125, + "learning_rate": 9.578080181168432e-05, + "loss": 2.3049, + "step": 2104 + }, + { + "epoch": 0.1584523607896272, + "grad_norm": 5.7006049156188965, + "learning_rate": 9.577589905660133e-05, + "loss": 2.1251, + "step": 2105 + }, + { + "epoch": 0.15852763507028736, + "grad_norm": 6.055491924285889, + "learning_rate": 9.577099358027249e-05, + "loss": 2.019, + "step": 2106 + }, + { + "epoch": 0.1586029093509475, + "grad_norm": 6.3947930335998535, + "learning_rate": 9.57660853829894e-05, + "loss": 2.0258, + "step": 2107 + }, + { + "epoch": 0.15867818363160768, + "grad_norm": 4.594949245452881, + "learning_rate": 9.576117446504387e-05, + "loss": 1.6655, + "step": 2108 + }, + { + "epoch": 0.15875345791226783, + "grad_norm": 6.8482255935668945, + "learning_rate": 9.575626082672782e-05, + "loss": 2.6995, + "step": 2109 + }, + { + "epoch": 0.15882873219292798, + "grad_norm": 5.405508518218994, + "learning_rate": 9.575134446833333e-05, + "loss": 2.0901, + "step": 2110 + }, + { + "epoch": 0.15890400647358813, + "grad_norm": 5.126932144165039, + "learning_rate": 9.574642539015271e-05, + "loss": 2.0346, + "step": 2111 + }, + { + "epoch": 0.1589792807542483, + "grad_norm": 6.018233299255371, + "learning_rate": 9.574150359247835e-05, + "loss": 1.8024, + "step": 2112 + }, + { + "epoch": 0.15905455503490845, + "grad_norm": 4.829485893249512, + "learning_rate": 9.573657907560286e-05, + "loss": 2.8457, + "step": 2113 + }, + { + "epoch": 0.1591298293155686, + "grad_norm": 4.471799373626709, + "learning_rate": 9.573165183981897e-05, + "loss": 1.8593, + "step": 2114 + }, + { + "epoch": 0.15920510359622875, + "grad_norm": 7.830389976501465, + "learning_rate": 9.57267218854196e-05, + "loss": 2.0075, + "step": 2115 + }, + { + "epoch": 0.15928037787688892, + "grad_norm": 6.424320220947266, + "learning_rate": 9.572178921269781e-05, + "loss": 2.5253, + "step": 2116 + }, + { + "epoch": 0.15935565215754907, + "grad_norm": 4.256260871887207, + "learning_rate": 9.571685382194686e-05, + "loss": 2.2165, + "step": 2117 + }, + { + "epoch": 0.15943092643820922, + "grad_norm": 5.4897332191467285, + "learning_rate": 9.571191571346012e-05, + "loss": 2.0615, + "step": 2118 + }, + { + "epoch": 0.15950620071886937, + "grad_norm": 6.3781538009643555, + "learning_rate": 9.570697488753113e-05, + "loss": 1.7899, + "step": 2119 + }, + { + "epoch": 0.15958147499952954, + "grad_norm": 8.220921516418457, + "learning_rate": 9.570203134445366e-05, + "loss": 2.5155, + "step": 2120 + }, + { + "epoch": 0.1596567492801897, + "grad_norm": 6.042557239532471, + "learning_rate": 9.569708508452156e-05, + "loss": 2.3551, + "step": 2121 + }, + { + "epoch": 0.15973202356084984, + "grad_norm": 4.327997207641602, + "learning_rate": 9.569213610802885e-05, + "loss": 2.4328, + "step": 2122 + }, + { + "epoch": 0.15980729784151002, + "grad_norm": 4.309088230133057, + "learning_rate": 9.568718441526976e-05, + "loss": 1.9268, + "step": 2123 + }, + { + "epoch": 0.15988257212217016, + "grad_norm": 4.423697471618652, + "learning_rate": 9.568223000653866e-05, + "loss": 1.7265, + "step": 2124 + }, + { + "epoch": 0.1599578464028303, + "grad_norm": 4.880823612213135, + "learning_rate": 9.567727288213005e-05, + "loss": 1.8291, + "step": 2125 + }, + { + "epoch": 0.16003312068349046, + "grad_norm": 5.744210720062256, + "learning_rate": 9.567231304233863e-05, + "loss": 2.8858, + "step": 2126 + }, + { + "epoch": 0.16010839496415064, + "grad_norm": 5.3685407638549805, + "learning_rate": 9.566735048745926e-05, + "loss": 2.3238, + "step": 2127 + }, + { + "epoch": 0.16018366924481078, + "grad_norm": 6.981958389282227, + "learning_rate": 9.566238521778693e-05, + "loss": 1.9767, + "step": 2128 + }, + { + "epoch": 0.16025894352547093, + "grad_norm": 4.969970703125, + "learning_rate": 9.565741723361682e-05, + "loss": 2.1186, + "step": 2129 + }, + { + "epoch": 0.16033421780613108, + "grad_norm": 7.271712303161621, + "learning_rate": 9.565244653524426e-05, + "loss": 2.7254, + "step": 2130 + }, + { + "epoch": 0.16040949208679126, + "grad_norm": 6.812403678894043, + "learning_rate": 9.564747312296474e-05, + "loss": 2.2396, + "step": 2131 + }, + { + "epoch": 0.1604847663674514, + "grad_norm": 5.743902206420898, + "learning_rate": 9.564249699707394e-05, + "loss": 2.055, + "step": 2132 + }, + { + "epoch": 0.16056004064811155, + "grad_norm": 6.387364864349365, + "learning_rate": 9.563751815786764e-05, + "loss": 2.0375, + "step": 2133 + }, + { + "epoch": 0.1606353149287717, + "grad_norm": 7.67324686050415, + "learning_rate": 9.563253660564185e-05, + "loss": 1.9973, + "step": 2134 + }, + { + "epoch": 0.16071058920943188, + "grad_norm": 4.5309157371521, + "learning_rate": 9.562755234069267e-05, + "loss": 2.4186, + "step": 2135 + }, + { + "epoch": 0.16078586349009202, + "grad_norm": 7.804358005523682, + "learning_rate": 9.562256536331644e-05, + "loss": 2.1234, + "step": 2136 + }, + { + "epoch": 0.16086113777075217, + "grad_norm": 6.012933731079102, + "learning_rate": 9.56175756738096e-05, + "loss": 2.0281, + "step": 2137 + }, + { + "epoch": 0.16093641205141235, + "grad_norm": 6.326748847961426, + "learning_rate": 9.561258327246877e-05, + "loss": 2.3152, + "step": 2138 + }, + { + "epoch": 0.1610116863320725, + "grad_norm": 4.379335403442383, + "learning_rate": 9.560758815959074e-05, + "loss": 1.9833, + "step": 2139 + }, + { + "epoch": 0.16108696061273264, + "grad_norm": 4.508022785186768, + "learning_rate": 9.560259033547248e-05, + "loss": 2.2216, + "step": 2140 + }, + { + "epoch": 0.1611622348933928, + "grad_norm": 4.609071254730225, + "learning_rate": 9.559758980041105e-05, + "loss": 2.4108, + "step": 2141 + }, + { + "epoch": 0.16123750917405297, + "grad_norm": 5.058718204498291, + "learning_rate": 9.559258655470375e-05, + "loss": 2.0961, + "step": 2142 + }, + { + "epoch": 0.16131278345471312, + "grad_norm": 4.899115562438965, + "learning_rate": 9.5587580598648e-05, + "loss": 2.6409, + "step": 2143 + }, + { + "epoch": 0.16138805773537326, + "grad_norm": 4.495053291320801, + "learning_rate": 9.558257193254139e-05, + "loss": 2.0662, + "step": 2144 + }, + { + "epoch": 0.1614633320160334, + "grad_norm": 5.3127055168151855, + "learning_rate": 9.557756055668165e-05, + "loss": 2.0403, + "step": 2145 + }, + { + "epoch": 0.1615386062966936, + "grad_norm": 5.13474702835083, + "learning_rate": 9.557254647136675e-05, + "loss": 2.0244, + "step": 2146 + }, + { + "epoch": 0.16161388057735374, + "grad_norm": 5.256868362426758, + "learning_rate": 9.556752967689469e-05, + "loss": 2.0587, + "step": 2147 + }, + { + "epoch": 0.16168915485801388, + "grad_norm": 3.520779609680176, + "learning_rate": 9.556251017356375e-05, + "loss": 2.1936, + "step": 2148 + }, + { + "epoch": 0.16176442913867403, + "grad_norm": 6.549781322479248, + "learning_rate": 9.555748796167232e-05, + "loss": 2.394, + "step": 2149 + }, + { + "epoch": 0.1618397034193342, + "grad_norm": 4.351850509643555, + "learning_rate": 9.555246304151894e-05, + "loss": 2.051, + "step": 2150 + }, + { + "epoch": 0.16191497769999436, + "grad_norm": 4.4576640129089355, + "learning_rate": 9.554743541340234e-05, + "loss": 2.1816, + "step": 2151 + }, + { + "epoch": 0.1619902519806545, + "grad_norm": 4.653453826904297, + "learning_rate": 9.55424050776214e-05, + "loss": 2.317, + "step": 2152 + }, + { + "epoch": 0.16206552626131465, + "grad_norm": 5.437779903411865, + "learning_rate": 9.553737203447514e-05, + "loss": 1.8734, + "step": 2153 + }, + { + "epoch": 0.16214080054197483, + "grad_norm": 4.8841328620910645, + "learning_rate": 9.553233628426279e-05, + "loss": 2.2682, + "step": 2154 + }, + { + "epoch": 0.16221607482263498, + "grad_norm": 4.349809169769287, + "learning_rate": 9.552729782728369e-05, + "loss": 1.8595, + "step": 2155 + }, + { + "epoch": 0.16229134910329512, + "grad_norm": 5.053386688232422, + "learning_rate": 9.552225666383737e-05, + "loss": 2.2788, + "step": 2156 + }, + { + "epoch": 0.1623666233839553, + "grad_norm": 5.275552272796631, + "learning_rate": 9.551721279422351e-05, + "loss": 1.8819, + "step": 2157 + }, + { + "epoch": 0.16244189766461545, + "grad_norm": 4.67603063583374, + "learning_rate": 9.551216621874195e-05, + "loss": 1.9222, + "step": 2158 + }, + { + "epoch": 0.1625171719452756, + "grad_norm": 8.111873626708984, + "learning_rate": 9.55071169376927e-05, + "loss": 2.184, + "step": 2159 + }, + { + "epoch": 0.16259244622593574, + "grad_norm": 4.723062515258789, + "learning_rate": 9.55020649513759e-05, + "loss": 2.2724, + "step": 2160 + }, + { + "epoch": 0.16266772050659592, + "grad_norm": 5.697718143463135, + "learning_rate": 9.549701026009193e-05, + "loss": 2.1059, + "step": 2161 + }, + { + "epoch": 0.16274299478725607, + "grad_norm": 5.4652934074401855, + "learning_rate": 9.549195286414123e-05, + "loss": 2.0788, + "step": 2162 + }, + { + "epoch": 0.16281826906791622, + "grad_norm": 4.4733686447143555, + "learning_rate": 9.548689276382447e-05, + "loss": 1.7511, + "step": 2163 + }, + { + "epoch": 0.16289354334857636, + "grad_norm": 4.630800247192383, + "learning_rate": 9.548182995944244e-05, + "loss": 1.9563, + "step": 2164 + }, + { + "epoch": 0.16296881762923654, + "grad_norm": 5.446444511413574, + "learning_rate": 9.547676445129613e-05, + "loss": 1.9986, + "step": 2165 + }, + { + "epoch": 0.1630440919098967, + "grad_norm": 4.762897968292236, + "learning_rate": 9.547169623968665e-05, + "loss": 1.9952, + "step": 2166 + }, + { + "epoch": 0.16311936619055684, + "grad_norm": 18.56108283996582, + "learning_rate": 9.546662532491533e-05, + "loss": 2.2711, + "step": 2167 + }, + { + "epoch": 0.16319464047121698, + "grad_norm": 4.0759453773498535, + "learning_rate": 9.546155170728356e-05, + "loss": 2.4245, + "step": 2168 + }, + { + "epoch": 0.16326991475187716, + "grad_norm": 4.52295446395874, + "learning_rate": 9.5456475387093e-05, + "loss": 2.0197, + "step": 2169 + }, + { + "epoch": 0.1633451890325373, + "grad_norm": 4.870581150054932, + "learning_rate": 9.54513963646454e-05, + "loss": 2.1296, + "step": 2170 + }, + { + "epoch": 0.16342046331319746, + "grad_norm": 5.747099876403809, + "learning_rate": 9.544631464024271e-05, + "loss": 1.8936, + "step": 2171 + }, + { + "epoch": 0.16349573759385763, + "grad_norm": 4.687244892120361, + "learning_rate": 9.544123021418701e-05, + "loss": 2.2479, + "step": 2172 + }, + { + "epoch": 0.16357101187451778, + "grad_norm": 6.574528694152832, + "learning_rate": 9.543614308678057e-05, + "loss": 2.5086, + "step": 2173 + }, + { + "epoch": 0.16364628615517793, + "grad_norm": 4.977025508880615, + "learning_rate": 9.543105325832577e-05, + "loss": 2.087, + "step": 2174 + }, + { + "epoch": 0.16372156043583808, + "grad_norm": 4.470439434051514, + "learning_rate": 9.542596072912524e-05, + "loss": 2.4009, + "step": 2175 + }, + { + "epoch": 0.16379683471649825, + "grad_norm": 6.825066566467285, + "learning_rate": 9.542086549948167e-05, + "loss": 2.1708, + "step": 2176 + }, + { + "epoch": 0.1638721089971584, + "grad_norm": 5.020241737365723, + "learning_rate": 9.541576756969797e-05, + "loss": 1.9547, + "step": 2177 + }, + { + "epoch": 0.16394738327781855, + "grad_norm": 4.3161749839782715, + "learning_rate": 9.54106669400772e-05, + "loss": 1.7458, + "step": 2178 + }, + { + "epoch": 0.1640226575584787, + "grad_norm": 8.207012176513672, + "learning_rate": 9.54055636109226e-05, + "loss": 2.1673, + "step": 2179 + }, + { + "epoch": 0.16409793183913887, + "grad_norm": 4.703782558441162, + "learning_rate": 9.540045758253751e-05, + "loss": 1.9911, + "step": 2180 + }, + { + "epoch": 0.16417320611979902, + "grad_norm": 5.68269681930542, + "learning_rate": 9.539534885522547e-05, + "loss": 2.1083, + "step": 2181 + }, + { + "epoch": 0.16424848040045917, + "grad_norm": 4.880998134613037, + "learning_rate": 9.539023742929022e-05, + "loss": 1.7936, + "step": 2182 + }, + { + "epoch": 0.16432375468111932, + "grad_norm": 4.096438884735107, + "learning_rate": 9.538512330503557e-05, + "loss": 1.8505, + "step": 2183 + }, + { + "epoch": 0.1643990289617795, + "grad_norm": 4.104057312011719, + "learning_rate": 9.538000648276559e-05, + "loss": 2.259, + "step": 2184 + }, + { + "epoch": 0.16447430324243964, + "grad_norm": 4.332935333251953, + "learning_rate": 9.537488696278443e-05, + "loss": 1.9897, + "step": 2185 + }, + { + "epoch": 0.1645495775230998, + "grad_norm": 3.8722269535064697, + "learning_rate": 9.536976474539642e-05, + "loss": 1.96, + "step": 2186 + }, + { + "epoch": 0.16462485180375996, + "grad_norm": 4.648573398590088, + "learning_rate": 9.536463983090608e-05, + "loss": 2.1707, + "step": 2187 + }, + { + "epoch": 0.1647001260844201, + "grad_norm": 4.850244522094727, + "learning_rate": 9.535951221961807e-05, + "loss": 2.2453, + "step": 2188 + }, + { + "epoch": 0.16477540036508026, + "grad_norm": 4.94596004486084, + "learning_rate": 9.53543819118372e-05, + "loss": 2.0225, + "step": 2189 + }, + { + "epoch": 0.1648506746457404, + "grad_norm": 4.670311450958252, + "learning_rate": 9.534924890786849e-05, + "loss": 2.4088, + "step": 2190 + }, + { + "epoch": 0.16492594892640058, + "grad_norm": 4.737998008728027, + "learning_rate": 9.534411320801704e-05, + "loss": 1.904, + "step": 2191 + }, + { + "epoch": 0.16500122320706073, + "grad_norm": 3.6912128925323486, + "learning_rate": 9.533897481258815e-05, + "loss": 2.122, + "step": 2192 + }, + { + "epoch": 0.16507649748772088, + "grad_norm": 4.509467124938965, + "learning_rate": 9.533383372188731e-05, + "loss": 2.0205, + "step": 2193 + }, + { + "epoch": 0.16515177176838103, + "grad_norm": 4.352138996124268, + "learning_rate": 9.532868993622015e-05, + "loss": 2.0035, + "step": 2194 + }, + { + "epoch": 0.1652270460490412, + "grad_norm": 5.361912727355957, + "learning_rate": 9.532354345589241e-05, + "loss": 2.3839, + "step": 2195 + }, + { + "epoch": 0.16530232032970135, + "grad_norm": 4.971524238586426, + "learning_rate": 9.531839428121008e-05, + "loss": 2.0731, + "step": 2196 + }, + { + "epoch": 0.1653775946103615, + "grad_norm": 4.168862819671631, + "learning_rate": 9.531324241247924e-05, + "loss": 2.2333, + "step": 2197 + }, + { + "epoch": 0.16545286889102165, + "grad_norm": 9.913747787475586, + "learning_rate": 9.530808785000614e-05, + "loss": 2.121, + "step": 2198 + }, + { + "epoch": 0.16552814317168182, + "grad_norm": 5.684504985809326, + "learning_rate": 9.530293059409723e-05, + "loss": 1.8647, + "step": 2199 + }, + { + "epoch": 0.16560341745234197, + "grad_norm": 5.744393825531006, + "learning_rate": 9.52977706450591e-05, + "loss": 2.0306, + "step": 2200 + }, + { + "epoch": 0.16567869173300212, + "grad_norm": 4.807511806488037, + "learning_rate": 9.529260800319846e-05, + "loss": 1.954, + "step": 2201 + }, + { + "epoch": 0.16575396601366227, + "grad_norm": 4.251824378967285, + "learning_rate": 9.528744266882224e-05, + "loss": 1.8693, + "step": 2202 + }, + { + "epoch": 0.16582924029432244, + "grad_norm": 4.286069393157959, + "learning_rate": 9.528227464223749e-05, + "loss": 2.0393, + "step": 2203 + }, + { + "epoch": 0.1659045145749826, + "grad_norm": 5.116547584533691, + "learning_rate": 9.527710392375144e-05, + "loss": 1.8903, + "step": 2204 + }, + { + "epoch": 0.16597978885564274, + "grad_norm": 3.5704052448272705, + "learning_rate": 9.527193051367149e-05, + "loss": 1.991, + "step": 2205 + }, + { + "epoch": 0.16605506313630292, + "grad_norm": 5.593133926391602, + "learning_rate": 9.526675441230517e-05, + "loss": 2.131, + "step": 2206 + }, + { + "epoch": 0.16613033741696306, + "grad_norm": 3.9124293327331543, + "learning_rate": 9.526157561996018e-05, + "loss": 2.0755, + "step": 2207 + }, + { + "epoch": 0.1662056116976232, + "grad_norm": 5.039037227630615, + "learning_rate": 9.525639413694438e-05, + "loss": 1.8333, + "step": 2208 + }, + { + "epoch": 0.16628088597828336, + "grad_norm": 5.446913242340088, + "learning_rate": 9.525120996356582e-05, + "loss": 1.9668, + "step": 2209 + }, + { + "epoch": 0.16635616025894354, + "grad_norm": 4.832751750946045, + "learning_rate": 9.524602310013266e-05, + "loss": 2.0297, + "step": 2210 + }, + { + "epoch": 0.16643143453960368, + "grad_norm": 8.919572830200195, + "learning_rate": 9.524083354695325e-05, + "loss": 1.7866, + "step": 2211 + }, + { + "epoch": 0.16650670882026383, + "grad_norm": 3.999417543411255, + "learning_rate": 9.52356413043361e-05, + "loss": 2.1334, + "step": 2212 + }, + { + "epoch": 0.16658198310092398, + "grad_norm": 4.573214530944824, + "learning_rate": 9.523044637258986e-05, + "loss": 1.9966, + "step": 2213 + }, + { + "epoch": 0.16665725738158416, + "grad_norm": 4.394474983215332, + "learning_rate": 9.522524875202338e-05, + "loss": 2.1084, + "step": 2214 + }, + { + "epoch": 0.1667325316622443, + "grad_norm": 5.048755168914795, + "learning_rate": 9.522004844294563e-05, + "loss": 2.0974, + "step": 2215 + }, + { + "epoch": 0.16680780594290445, + "grad_norm": 5.61907434463501, + "learning_rate": 9.521484544566573e-05, + "loss": 2.8612, + "step": 2216 + }, + { + "epoch": 0.1668830802235646, + "grad_norm": 5.6824517250061035, + "learning_rate": 9.520963976049302e-05, + "loss": 2.7474, + "step": 2217 + }, + { + "epoch": 0.16695835450422478, + "grad_norm": 5.720795631408691, + "learning_rate": 9.520443138773694e-05, + "loss": 1.9172, + "step": 2218 + }, + { + "epoch": 0.16703362878488492, + "grad_norm": 4.822333812713623, + "learning_rate": 9.519922032770712e-05, + "loss": 1.9588, + "step": 2219 + }, + { + "epoch": 0.16710890306554507, + "grad_norm": 4.878042697906494, + "learning_rate": 9.519400658071336e-05, + "loss": 2.1059, + "step": 2220 + }, + { + "epoch": 0.16718417734620525, + "grad_norm": 4.809291839599609, + "learning_rate": 9.518879014706556e-05, + "loss": 1.9925, + "step": 2221 + }, + { + "epoch": 0.1672594516268654, + "grad_norm": 8.407268524169922, + "learning_rate": 9.518357102707387e-05, + "loss": 2.1908, + "step": 2222 + }, + { + "epoch": 0.16733472590752554, + "grad_norm": 4.998804569244385, + "learning_rate": 9.517834922104851e-05, + "loss": 2.2037, + "step": 2223 + }, + { + "epoch": 0.1674100001881857, + "grad_norm": 4.33091402053833, + "learning_rate": 9.517312472929992e-05, + "loss": 2.0227, + "step": 2224 + }, + { + "epoch": 0.16748527446884587, + "grad_norm": 4.6145920753479, + "learning_rate": 9.516789755213868e-05, + "loss": 1.7347, + "step": 2225 + }, + { + "epoch": 0.16756054874950602, + "grad_norm": 6.538087844848633, + "learning_rate": 9.516266768987555e-05, + "loss": 2.5261, + "step": 2226 + }, + { + "epoch": 0.16763582303016616, + "grad_norm": 9.319701194763184, + "learning_rate": 9.51574351428214e-05, + "loss": 2.242, + "step": 2227 + }, + { + "epoch": 0.1677110973108263, + "grad_norm": 5.501222133636475, + "learning_rate": 9.51521999112873e-05, + "loss": 1.8181, + "step": 2228 + }, + { + "epoch": 0.1677863715914865, + "grad_norm": 6.009037017822266, + "learning_rate": 9.514696199558448e-05, + "loss": 2.0195, + "step": 2229 + }, + { + "epoch": 0.16786164587214664, + "grad_norm": 5.068233966827393, + "learning_rate": 9.514172139602431e-05, + "loss": 1.9762, + "step": 2230 + }, + { + "epoch": 0.16793692015280678, + "grad_norm": 5.676066875457764, + "learning_rate": 9.513647811291832e-05, + "loss": 2.2958, + "step": 2231 + }, + { + "epoch": 0.16801219443346693, + "grad_norm": 4.411627292633057, + "learning_rate": 9.513123214657821e-05, + "loss": 2.1639, + "step": 2232 + }, + { + "epoch": 0.1680874687141271, + "grad_norm": 6.299969673156738, + "learning_rate": 9.512598349731588e-05, + "loss": 2.6517, + "step": 2233 + }, + { + "epoch": 0.16816274299478726, + "grad_norm": 5.330996513366699, + "learning_rate": 9.512073216544329e-05, + "loss": 1.8296, + "step": 2234 + }, + { + "epoch": 0.1682380172754474, + "grad_norm": 6.758583068847656, + "learning_rate": 9.511547815127262e-05, + "loss": 1.7756, + "step": 2235 + }, + { + "epoch": 0.16831329155610758, + "grad_norm": 5.848293304443359, + "learning_rate": 9.511022145511624e-05, + "loss": 1.8558, + "step": 2236 + }, + { + "epoch": 0.16838856583676773, + "grad_norm": 6.316586017608643, + "learning_rate": 9.510496207728663e-05, + "loss": 2.3719, + "step": 2237 + }, + { + "epoch": 0.16846384011742788, + "grad_norm": 4.921469688415527, + "learning_rate": 9.509970001809645e-05, + "loss": 2.1226, + "step": 2238 + }, + { + "epoch": 0.16853911439808802, + "grad_norm": 7.172194004058838, + "learning_rate": 9.509443527785851e-05, + "loss": 2.8482, + "step": 2239 + }, + { + "epoch": 0.1686143886787482, + "grad_norm": 4.468085289001465, + "learning_rate": 9.508916785688578e-05, + "loss": 2.1885, + "step": 2240 + }, + { + "epoch": 0.16868966295940835, + "grad_norm": 5.230997562408447, + "learning_rate": 9.50838977554914e-05, + "loss": 1.953, + "step": 2241 + }, + { + "epoch": 0.1687649372400685, + "grad_norm": 4.7813615798950195, + "learning_rate": 9.507862497398864e-05, + "loss": 2.0368, + "step": 2242 + }, + { + "epoch": 0.16884021152072864, + "grad_norm": 8.799346923828125, + "learning_rate": 9.5073349512691e-05, + "loss": 2.0197, + "step": 2243 + }, + { + "epoch": 0.16891548580138882, + "grad_norm": 5.089166641235352, + "learning_rate": 9.506807137191205e-05, + "loss": 1.9206, + "step": 2244 + }, + { + "epoch": 0.16899076008204897, + "grad_norm": 4.897778511047363, + "learning_rate": 9.506279055196556e-05, + "loss": 2.1226, + "step": 2245 + }, + { + "epoch": 0.16906603436270912, + "grad_norm": 4.697531223297119, + "learning_rate": 9.505750705316548e-05, + "loss": 2.0033, + "step": 2246 + }, + { + "epoch": 0.16914130864336926, + "grad_norm": 4.474527835845947, + "learning_rate": 9.505222087582589e-05, + "loss": 1.905, + "step": 2247 + }, + { + "epoch": 0.16921658292402944, + "grad_norm": 4.868900299072266, + "learning_rate": 9.504693202026104e-05, + "loss": 2.372, + "step": 2248 + }, + { + "epoch": 0.1692918572046896, + "grad_norm": 15.200536727905273, + "learning_rate": 9.504164048678533e-05, + "loss": 2.2141, + "step": 2249 + }, + { + "epoch": 0.16936713148534974, + "grad_norm": 3.8232109546661377, + "learning_rate": 9.503634627571333e-05, + "loss": 1.9466, + "step": 2250 + }, + { + "epoch": 0.16944240576600988, + "grad_norm": 5.026607036590576, + "learning_rate": 9.503104938735978e-05, + "loss": 1.9278, + "step": 2251 + }, + { + "epoch": 0.16951768004667006, + "grad_norm": 4.797671318054199, + "learning_rate": 9.502574982203956e-05, + "loss": 2.1581, + "step": 2252 + }, + { + "epoch": 0.1695929543273302, + "grad_norm": 5.253187656402588, + "learning_rate": 9.502044758006769e-05, + "loss": 1.9361, + "step": 2253 + }, + { + "epoch": 0.16966822860799036, + "grad_norm": 5.13231897354126, + "learning_rate": 9.501514266175938e-05, + "loss": 2.131, + "step": 2254 + }, + { + "epoch": 0.16974350288865053, + "grad_norm": 3.4525153636932373, + "learning_rate": 9.500983506743002e-05, + "loss": 2.0264, + "step": 2255 + }, + { + "epoch": 0.16981877716931068, + "grad_norm": 5.151804447174072, + "learning_rate": 9.50045247973951e-05, + "loss": 2.3831, + "step": 2256 + }, + { + "epoch": 0.16989405144997083, + "grad_norm": 6.492692947387695, + "learning_rate": 9.499921185197032e-05, + "loss": 2.1047, + "step": 2257 + }, + { + "epoch": 0.16996932573063098, + "grad_norm": 4.427255153656006, + "learning_rate": 9.499389623147151e-05, + "loss": 1.8544, + "step": 2258 + }, + { + "epoch": 0.17004460001129115, + "grad_norm": 4.369527339935303, + "learning_rate": 9.498857793621468e-05, + "loss": 2.1562, + "step": 2259 + }, + { + "epoch": 0.1701198742919513, + "grad_norm": 5.105477333068848, + "learning_rate": 9.498325696651597e-05, + "loss": 1.8788, + "step": 2260 + }, + { + "epoch": 0.17019514857261145, + "grad_norm": 6.311375617980957, + "learning_rate": 9.497793332269171e-05, + "loss": 2.2194, + "step": 2261 + }, + { + "epoch": 0.1702704228532716, + "grad_norm": 5.148840427398682, + "learning_rate": 9.497260700505837e-05, + "loss": 1.9661, + "step": 2262 + }, + { + "epoch": 0.17034569713393177, + "grad_norm": 6.63953971862793, + "learning_rate": 9.496727801393257e-05, + "loss": 2.2598, + "step": 2263 + }, + { + "epoch": 0.17042097141459192, + "grad_norm": 7.3747239112854, + "learning_rate": 9.496194634963113e-05, + "loss": 1.9716, + "step": 2264 + }, + { + "epoch": 0.17049624569525207, + "grad_norm": 6.000506401062012, + "learning_rate": 9.495661201247099e-05, + "loss": 2.2741, + "step": 2265 + }, + { + "epoch": 0.17057151997591222, + "grad_norm": 5.818872928619385, + "learning_rate": 9.495127500276926e-05, + "loss": 2.0073, + "step": 2266 + }, + { + "epoch": 0.1706467942565724, + "grad_norm": 4.766014575958252, + "learning_rate": 9.494593532084322e-05, + "loss": 1.9464, + "step": 2267 + }, + { + "epoch": 0.17072206853723254, + "grad_norm": 4.145430564880371, + "learning_rate": 9.494059296701027e-05, + "loss": 2.0597, + "step": 2268 + }, + { + "epoch": 0.1707973428178927, + "grad_norm": 3.792813777923584, + "learning_rate": 9.493524794158804e-05, + "loss": 2.0204, + "step": 2269 + }, + { + "epoch": 0.17087261709855286, + "grad_norm": 5.724621772766113, + "learning_rate": 9.492990024489423e-05, + "loss": 2.2551, + "step": 2270 + }, + { + "epoch": 0.170947891379213, + "grad_norm": 5.790171146392822, + "learning_rate": 9.492454987724678e-05, + "loss": 2.1761, + "step": 2271 + }, + { + "epoch": 0.17102316565987316, + "grad_norm": 4.4422607421875, + "learning_rate": 9.491919683896374e-05, + "loss": 2.0818, + "step": 2272 + }, + { + "epoch": 0.1710984399405333, + "grad_norm": 8.553842544555664, + "learning_rate": 9.491384113036335e-05, + "loss": 2.3337, + "step": 2273 + }, + { + "epoch": 0.17117371422119348, + "grad_norm": 5.848273277282715, + "learning_rate": 9.490848275176396e-05, + "loss": 2.3575, + "step": 2274 + }, + { + "epoch": 0.17124898850185363, + "grad_norm": 7.9377031326293945, + "learning_rate": 9.490312170348412e-05, + "loss": 1.9476, + "step": 2275 + }, + { + "epoch": 0.17132426278251378, + "grad_norm": 5.477145195007324, + "learning_rate": 9.489775798584255e-05, + "loss": 1.9218, + "step": 2276 + }, + { + "epoch": 0.17139953706317393, + "grad_norm": 4.5018134117126465, + "learning_rate": 9.48923915991581e-05, + "loss": 2.0845, + "step": 2277 + }, + { + "epoch": 0.1714748113438341, + "grad_norm": 5.394852638244629, + "learning_rate": 9.488702254374977e-05, + "loss": 2.0735, + "step": 2278 + }, + { + "epoch": 0.17155008562449425, + "grad_norm": 5.898357391357422, + "learning_rate": 9.488165081993674e-05, + "loss": 2.3753, + "step": 2279 + }, + { + "epoch": 0.1716253599051544, + "grad_norm": 5.274777889251709, + "learning_rate": 9.487627642803835e-05, + "loss": 1.9596, + "step": 2280 + }, + { + "epoch": 0.17170063418581455, + "grad_norm": 6.371806621551514, + "learning_rate": 9.48708993683741e-05, + "loss": 2.0667, + "step": 2281 + }, + { + "epoch": 0.17177590846647472, + "grad_norm": 4.789442539215088, + "learning_rate": 9.486551964126363e-05, + "loss": 2.144, + "step": 2282 + }, + { + "epoch": 0.17185118274713487, + "grad_norm": 4.008392810821533, + "learning_rate": 9.486013724702676e-05, + "loss": 1.9678, + "step": 2283 + }, + { + "epoch": 0.17192645702779502, + "grad_norm": 4.920111179351807, + "learning_rate": 9.485475218598343e-05, + "loss": 2.0479, + "step": 2284 + }, + { + "epoch": 0.1720017313084552, + "grad_norm": 4.9314351081848145, + "learning_rate": 9.48493644584538e-05, + "loss": 2.3151, + "step": 2285 + }, + { + "epoch": 0.17207700558911534, + "grad_norm": 6.626919269561768, + "learning_rate": 9.484397406475813e-05, + "loss": 1.996, + "step": 2286 + }, + { + "epoch": 0.1721522798697755, + "grad_norm": 6.398860931396484, + "learning_rate": 9.483858100521688e-05, + "loss": 2.1337, + "step": 2287 + }, + { + "epoch": 0.17222755415043564, + "grad_norm": 4.1914801597595215, + "learning_rate": 9.483318528015064e-05, + "loss": 1.9283, + "step": 2288 + }, + { + "epoch": 0.17230282843109582, + "grad_norm": 5.52365255355835, + "learning_rate": 9.482778688988018e-05, + "loss": 2.0351, + "step": 2289 + }, + { + "epoch": 0.17237810271175596, + "grad_norm": 7.355966567993164, + "learning_rate": 9.48223858347264e-05, + "loss": 1.9413, + "step": 2290 + }, + { + "epoch": 0.1724533769924161, + "grad_norm": 5.980371475219727, + "learning_rate": 9.481698211501042e-05, + "loss": 1.8498, + "step": 2291 + }, + { + "epoch": 0.17252865127307626, + "grad_norm": 4.228649139404297, + "learning_rate": 9.481157573105343e-05, + "loss": 2.5016, + "step": 2292 + }, + { + "epoch": 0.17260392555373644, + "grad_norm": 4.84653377532959, + "learning_rate": 9.480616668317685e-05, + "loss": 2.1955, + "step": 2293 + }, + { + "epoch": 0.17267919983439659, + "grad_norm": 5.076524257659912, + "learning_rate": 9.480075497170221e-05, + "loss": 2.6127, + "step": 2294 + }, + { + "epoch": 0.17275447411505673, + "grad_norm": 4.977198123931885, + "learning_rate": 9.479534059695125e-05, + "loss": 1.9281, + "step": 2295 + }, + { + "epoch": 0.17282974839571688, + "grad_norm": 5.648493766784668, + "learning_rate": 9.478992355924581e-05, + "loss": 1.9401, + "step": 2296 + }, + { + "epoch": 0.17290502267637706, + "grad_norm": 3.841317892074585, + "learning_rate": 9.478450385890793e-05, + "loss": 2.0583, + "step": 2297 + }, + { + "epoch": 0.1729802969570372, + "grad_norm": 4.604987144470215, + "learning_rate": 9.477908149625981e-05, + "loss": 2.0312, + "step": 2298 + }, + { + "epoch": 0.17305557123769735, + "grad_norm": 4.968191146850586, + "learning_rate": 9.477365647162377e-05, + "loss": 1.8733, + "step": 2299 + }, + { + "epoch": 0.17313084551835753, + "grad_norm": 4.658379077911377, + "learning_rate": 9.476822878532235e-05, + "loss": 1.7941, + "step": 2300 + }, + { + "epoch": 0.17320611979901768, + "grad_norm": 5.012067794799805, + "learning_rate": 9.476279843767814e-05, + "loss": 2.1138, + "step": 2301 + }, + { + "epoch": 0.17328139407967783, + "grad_norm": 3.835139274597168, + "learning_rate": 9.475736542901404e-05, + "loss": 1.7787, + "step": 2302 + }, + { + "epoch": 0.17335666836033797, + "grad_norm": 4.714152812957764, + "learning_rate": 9.475192975965296e-05, + "loss": 2.3215, + "step": 2303 + }, + { + "epoch": 0.17343194264099815, + "grad_norm": 5.5059332847595215, + "learning_rate": 9.474649142991808e-05, + "loss": 2.2355, + "step": 2304 + }, + { + "epoch": 0.1735072169216583, + "grad_norm": 4.933670520782471, + "learning_rate": 9.474105044013267e-05, + "loss": 1.8535, + "step": 2305 + }, + { + "epoch": 0.17358249120231845, + "grad_norm": 4.394619464874268, + "learning_rate": 9.473560679062018e-05, + "loss": 2.2636, + "step": 2306 + }, + { + "epoch": 0.1736577654829786, + "grad_norm": 4.728134632110596, + "learning_rate": 9.473016048170424e-05, + "loss": 2.2082, + "step": 2307 + }, + { + "epoch": 0.17373303976363877, + "grad_norm": 6.919857501983643, + "learning_rate": 9.472471151370861e-05, + "loss": 2.6794, + "step": 2308 + }, + { + "epoch": 0.17380831404429892, + "grad_norm": 5.255416393280029, + "learning_rate": 9.47192598869572e-05, + "loss": 1.9525, + "step": 2309 + }, + { + "epoch": 0.17388358832495907, + "grad_norm": 4.584347724914551, + "learning_rate": 9.47138056017741e-05, + "loss": 1.897, + "step": 2310 + }, + { + "epoch": 0.1739588626056192, + "grad_norm": 8.130692481994629, + "learning_rate": 9.470834865848357e-05, + "loss": 2.1597, + "step": 2311 + }, + { + "epoch": 0.1740341368862794, + "grad_norm": 4.825671672821045, + "learning_rate": 9.470288905740998e-05, + "loss": 2.0798, + "step": 2312 + }, + { + "epoch": 0.17410941116693954, + "grad_norm": 4.921777248382568, + "learning_rate": 9.469742679887791e-05, + "loss": 1.7776, + "step": 2313 + }, + { + "epoch": 0.17418468544759969, + "grad_norm": 4.85650634765625, + "learning_rate": 9.469196188321207e-05, + "loss": 2.0765, + "step": 2314 + }, + { + "epoch": 0.17425995972825983, + "grad_norm": 5.249808311462402, + "learning_rate": 9.468649431073733e-05, + "loss": 1.9294, + "step": 2315 + }, + { + "epoch": 0.17433523400892, + "grad_norm": 7.292625427246094, + "learning_rate": 9.468102408177871e-05, + "loss": 2.3196, + "step": 2316 + }, + { + "epoch": 0.17441050828958016, + "grad_norm": 3.958094596862793, + "learning_rate": 9.467555119666143e-05, + "loss": 2.1624, + "step": 2317 + }, + { + "epoch": 0.1744857825702403, + "grad_norm": 4.975834846496582, + "learning_rate": 9.467007565571081e-05, + "loss": 2.0123, + "step": 2318 + }, + { + "epoch": 0.17456105685090048, + "grad_norm": 5.5789594650268555, + "learning_rate": 9.466459745925237e-05, + "loss": 1.8141, + "step": 2319 + }, + { + "epoch": 0.17463633113156063, + "grad_norm": 5.731783866882324, + "learning_rate": 9.465911660761177e-05, + "loss": 2.4411, + "step": 2320 + }, + { + "epoch": 0.17471160541222078, + "grad_norm": 4.982248783111572, + "learning_rate": 9.465363310111483e-05, + "loss": 2.0105, + "step": 2321 + }, + { + "epoch": 0.17478687969288093, + "grad_norm": 5.72434139251709, + "learning_rate": 9.464814694008752e-05, + "loss": 2.2924, + "step": 2322 + }, + { + "epoch": 0.1748621539735411, + "grad_norm": 7.923213005065918, + "learning_rate": 9.464265812485598e-05, + "loss": 2.124, + "step": 2323 + }, + { + "epoch": 0.17493742825420125, + "grad_norm": 5.3210954666137695, + "learning_rate": 9.463716665574652e-05, + "loss": 2.033, + "step": 2324 + }, + { + "epoch": 0.1750127025348614, + "grad_norm": 6.259988784790039, + "learning_rate": 9.463167253308559e-05, + "loss": 1.8128, + "step": 2325 + }, + { + "epoch": 0.17508797681552155, + "grad_norm": 5.508382797241211, + "learning_rate": 9.462617575719977e-05, + "loss": 1.9617, + "step": 2326 + }, + { + "epoch": 0.17516325109618172, + "grad_norm": 6.951991081237793, + "learning_rate": 9.462067632841585e-05, + "loss": 2.3307, + "step": 2327 + }, + { + "epoch": 0.17523852537684187, + "grad_norm": 6.0590009689331055, + "learning_rate": 9.461517424706075e-05, + "loss": 1.7302, + "step": 2328 + }, + { + "epoch": 0.17531379965750202, + "grad_norm": 4.3615946769714355, + "learning_rate": 9.460966951346157e-05, + "loss": 2.05, + "step": 2329 + }, + { + "epoch": 0.17538907393816217, + "grad_norm": 5.940329074859619, + "learning_rate": 9.460416212794554e-05, + "loss": 2.4674, + "step": 2330 + }, + { + "epoch": 0.17546434821882234, + "grad_norm": 5.762747287750244, + "learning_rate": 9.459865209084004e-05, + "loss": 2.2994, + "step": 2331 + }, + { + "epoch": 0.1755396224994825, + "grad_norm": 7.069385051727295, + "learning_rate": 9.459313940247266e-05, + "loss": 2.1658, + "step": 2332 + }, + { + "epoch": 0.17561489678014264, + "grad_norm": 5.687751770019531, + "learning_rate": 9.458762406317107e-05, + "loss": 2.7086, + "step": 2333 + }, + { + "epoch": 0.1756901710608028, + "grad_norm": 5.619853496551514, + "learning_rate": 9.45821060732632e-05, + "loss": 1.6611, + "step": 2334 + }, + { + "epoch": 0.17576544534146296, + "grad_norm": 6.729658126831055, + "learning_rate": 9.457658543307701e-05, + "loss": 2.1789, + "step": 2335 + }, + { + "epoch": 0.1758407196221231, + "grad_norm": 4.319519519805908, + "learning_rate": 9.457106214294074e-05, + "loss": 2.1151, + "step": 2336 + }, + { + "epoch": 0.17591599390278326, + "grad_norm": 4.797735214233398, + "learning_rate": 9.456553620318269e-05, + "loss": 2.0082, + "step": 2337 + }, + { + "epoch": 0.17599126818344343, + "grad_norm": 4.516303539276123, + "learning_rate": 9.456000761413141e-05, + "loss": 2.0634, + "step": 2338 + }, + { + "epoch": 0.17606654246410358, + "grad_norm": 5.7466840744018555, + "learning_rate": 9.455447637611552e-05, + "loss": 2.1342, + "step": 2339 + }, + { + "epoch": 0.17614181674476373, + "grad_norm": 4.779585838317871, + "learning_rate": 9.454894248946386e-05, + "loss": 1.9679, + "step": 2340 + }, + { + "epoch": 0.17621709102542388, + "grad_norm": 5.47901725769043, + "learning_rate": 9.45434059545054e-05, + "loss": 2.6905, + "step": 2341 + }, + { + "epoch": 0.17629236530608405, + "grad_norm": 5.424152374267578, + "learning_rate": 9.453786677156927e-05, + "loss": 1.8476, + "step": 2342 + }, + { + "epoch": 0.1763676395867442, + "grad_norm": 4.945556163787842, + "learning_rate": 9.453232494098473e-05, + "loss": 2.1064, + "step": 2343 + }, + { + "epoch": 0.17644291386740435, + "grad_norm": 5.49737548828125, + "learning_rate": 9.452678046308126e-05, + "loss": 2.14, + "step": 2344 + }, + { + "epoch": 0.1765181881480645, + "grad_norm": 3.9528067111968994, + "learning_rate": 9.452123333818845e-05, + "loss": 2.1716, + "step": 2345 + }, + { + "epoch": 0.17659346242872467, + "grad_norm": 6.919625759124756, + "learning_rate": 9.451568356663607e-05, + "loss": 2.3537, + "step": 2346 + }, + { + "epoch": 0.17666873670938482, + "grad_norm": 4.273699760437012, + "learning_rate": 9.451013114875404e-05, + "loss": 1.8256, + "step": 2347 + }, + { + "epoch": 0.17674401099004497, + "grad_norm": 4.510996341705322, + "learning_rate": 9.450457608487242e-05, + "loss": 2.065, + "step": 2348 + }, + { + "epoch": 0.17681928527070515, + "grad_norm": 4.993044853210449, + "learning_rate": 9.449901837532145e-05, + "loss": 2.1107, + "step": 2349 + }, + { + "epoch": 0.1768945595513653, + "grad_norm": 4.57253885269165, + "learning_rate": 9.44934580204315e-05, + "loss": 1.9658, + "step": 2350 + }, + { + "epoch": 0.17696983383202544, + "grad_norm": 4.315614223480225, + "learning_rate": 9.448789502053316e-05, + "loss": 2.3603, + "step": 2351 + }, + { + "epoch": 0.1770451081126856, + "grad_norm": 6.130029678344727, + "learning_rate": 9.44823293759571e-05, + "loss": 2.1687, + "step": 2352 + }, + { + "epoch": 0.17712038239334577, + "grad_norm": 4.975474834442139, + "learning_rate": 9.447676108703421e-05, + "loss": 2.0759, + "step": 2353 + }, + { + "epoch": 0.1771956566740059, + "grad_norm": 5.403075218200684, + "learning_rate": 9.447119015409547e-05, + "loss": 2.2943, + "step": 2354 + }, + { + "epoch": 0.17727093095466606, + "grad_norm": 6.381655693054199, + "learning_rate": 9.446561657747209e-05, + "loss": 2.1639, + "step": 2355 + }, + { + "epoch": 0.1773462052353262, + "grad_norm": 5.660118579864502, + "learning_rate": 9.446004035749538e-05, + "loss": 2.4056, + "step": 2356 + }, + { + "epoch": 0.17742147951598639, + "grad_norm": 4.472799301147461, + "learning_rate": 9.445446149449686e-05, + "loss": 2.0724, + "step": 2357 + }, + { + "epoch": 0.17749675379664653, + "grad_norm": 5.539114475250244, + "learning_rate": 9.444887998880814e-05, + "loss": 1.9734, + "step": 2358 + }, + { + "epoch": 0.17757202807730668, + "grad_norm": 5.416542053222656, + "learning_rate": 9.444329584076104e-05, + "loss": 2.3994, + "step": 2359 + }, + { + "epoch": 0.17764730235796683, + "grad_norm": 6.170045375823975, + "learning_rate": 9.443770905068754e-05, + "loss": 2.278, + "step": 2360 + }, + { + "epoch": 0.177722576638627, + "grad_norm": 5.045105457305908, + "learning_rate": 9.443211961891973e-05, + "loss": 1.9535, + "step": 2361 + }, + { + "epoch": 0.17779785091928715, + "grad_norm": 7.952719688415527, + "learning_rate": 9.442652754578991e-05, + "loss": 2.0159, + "step": 2362 + }, + { + "epoch": 0.1778731251999473, + "grad_norm": 5.288589000701904, + "learning_rate": 9.44209328316305e-05, + "loss": 2.3552, + "step": 2363 + }, + { + "epoch": 0.17794839948060745, + "grad_norm": 4.723139762878418, + "learning_rate": 9.441533547677408e-05, + "loss": 2.0829, + "step": 2364 + }, + { + "epoch": 0.17802367376126763, + "grad_norm": 5.130797863006592, + "learning_rate": 9.440973548155342e-05, + "loss": 2.2269, + "step": 2365 + }, + { + "epoch": 0.17809894804192777, + "grad_norm": 5.036278247833252, + "learning_rate": 9.440413284630141e-05, + "loss": 1.839, + "step": 2366 + }, + { + "epoch": 0.17817422232258792, + "grad_norm": 5.3380608558654785, + "learning_rate": 9.439852757135111e-05, + "loss": 2.0908, + "step": 2367 + }, + { + "epoch": 0.1782494966032481, + "grad_norm": 5.248415470123291, + "learning_rate": 9.439291965703573e-05, + "loss": 2.3929, + "step": 2368 + }, + { + "epoch": 0.17832477088390825, + "grad_norm": 4.454289436340332, + "learning_rate": 9.438730910368867e-05, + "loss": 2.0924, + "step": 2369 + }, + { + "epoch": 0.1784000451645684, + "grad_norm": 5.5092082023620605, + "learning_rate": 9.438169591164343e-05, + "loss": 1.8028, + "step": 2370 + }, + { + "epoch": 0.17847531944522854, + "grad_norm": 5.50926399230957, + "learning_rate": 9.437608008123374e-05, + "loss": 2.4893, + "step": 2371 + }, + { + "epoch": 0.17855059372588872, + "grad_norm": 7.207761287689209, + "learning_rate": 9.437046161279339e-05, + "loss": 2.0744, + "step": 2372 + }, + { + "epoch": 0.17862586800654887, + "grad_norm": 5.470573425292969, + "learning_rate": 9.436484050665641e-05, + "loss": 2.095, + "step": 2373 + }, + { + "epoch": 0.178701142287209, + "grad_norm": 4.394279479980469, + "learning_rate": 9.435921676315698e-05, + "loss": 2.0534, + "step": 2374 + }, + { + "epoch": 0.17877641656786916, + "grad_norm": 4.637369632720947, + "learning_rate": 9.435359038262939e-05, + "loss": 1.8077, + "step": 2375 + }, + { + "epoch": 0.17885169084852934, + "grad_norm": 4.972540855407715, + "learning_rate": 9.43479613654081e-05, + "loss": 1.8897, + "step": 2376 + }, + { + "epoch": 0.17892696512918949, + "grad_norm": 5.140957832336426, + "learning_rate": 9.434232971182777e-05, + "loss": 2.1541, + "step": 2377 + }, + { + "epoch": 0.17900223940984963, + "grad_norm": 4.392958641052246, + "learning_rate": 9.433669542222317e-05, + "loss": 2.1104, + "step": 2378 + }, + { + "epoch": 0.17907751369050978, + "grad_norm": 4.119744300842285, + "learning_rate": 9.433105849692923e-05, + "loss": 2.0633, + "step": 2379 + }, + { + "epoch": 0.17915278797116996, + "grad_norm": 6.13436222076416, + "learning_rate": 9.432541893628107e-05, + "loss": 2.0722, + "step": 2380 + }, + { + "epoch": 0.1792280622518301, + "grad_norm": 5.207207202911377, + "learning_rate": 9.431977674061394e-05, + "loss": 2.0673, + "step": 2381 + }, + { + "epoch": 0.17930333653249025, + "grad_norm": 4.403334617614746, + "learning_rate": 9.431413191026325e-05, + "loss": 1.974, + "step": 2382 + }, + { + "epoch": 0.17937861081315043, + "grad_norm": 5.226065635681152, + "learning_rate": 9.430848444556457e-05, + "loss": 1.899, + "step": 2383 + }, + { + "epoch": 0.17945388509381058, + "grad_norm": 5.031151294708252, + "learning_rate": 9.430283434685363e-05, + "loss": 2.0024, + "step": 2384 + }, + { + "epoch": 0.17952915937447073, + "grad_norm": 6.219953536987305, + "learning_rate": 9.42971816144663e-05, + "loss": 2.0735, + "step": 2385 + }, + { + "epoch": 0.17960443365513087, + "grad_norm": 4.45499849319458, + "learning_rate": 9.429152624873863e-05, + "loss": 2.0663, + "step": 2386 + }, + { + "epoch": 0.17967970793579105, + "grad_norm": 6.559972763061523, + "learning_rate": 9.42858682500068e-05, + "loss": 2.1426, + "step": 2387 + }, + { + "epoch": 0.1797549822164512, + "grad_norm": 4.3940863609313965, + "learning_rate": 9.428020761860719e-05, + "loss": 2.0331, + "step": 2388 + }, + { + "epoch": 0.17983025649711135, + "grad_norm": 3.9855360984802246, + "learning_rate": 9.427454435487628e-05, + "loss": 2.0654, + "step": 2389 + }, + { + "epoch": 0.1799055307777715, + "grad_norm": 6.852569580078125, + "learning_rate": 9.426887845915074e-05, + "loss": 1.599, + "step": 2390 + }, + { + "epoch": 0.17998080505843167, + "grad_norm": 3.503969430923462, + "learning_rate": 9.426320993176742e-05, + "loss": 2.1259, + "step": 2391 + }, + { + "epoch": 0.18005607933909182, + "grad_norm": 6.220628261566162, + "learning_rate": 9.425753877306326e-05, + "loss": 1.8697, + "step": 2392 + }, + { + "epoch": 0.18013135361975197, + "grad_norm": 4.7325119972229, + "learning_rate": 9.42518649833754e-05, + "loss": 2.0619, + "step": 2393 + }, + { + "epoch": 0.18020662790041211, + "grad_norm": 3.3543930053710938, + "learning_rate": 9.424618856304115e-05, + "loss": 2.3206, + "step": 2394 + }, + { + "epoch": 0.1802819021810723, + "grad_norm": 4.114535331726074, + "learning_rate": 9.424050951239796e-05, + "loss": 1.9628, + "step": 2395 + }, + { + "epoch": 0.18035717646173244, + "grad_norm": 3.755035877227783, + "learning_rate": 9.42348278317834e-05, + "loss": 2.2212, + "step": 2396 + }, + { + "epoch": 0.18043245074239259, + "grad_norm": 4.5761518478393555, + "learning_rate": 9.422914352153524e-05, + "loss": 2.282, + "step": 2397 + }, + { + "epoch": 0.18050772502305276, + "grad_norm": 4.852680683135986, + "learning_rate": 9.422345658199144e-05, + "loss": 2.0651, + "step": 2398 + }, + { + "epoch": 0.1805829993037129, + "grad_norm": 6.706545352935791, + "learning_rate": 9.421776701349002e-05, + "loss": 1.7192, + "step": 2399 + }, + { + "epoch": 0.18065827358437306, + "grad_norm": 4.187545299530029, + "learning_rate": 9.421207481636922e-05, + "loss": 2.2583, + "step": 2400 + }, + { + "epoch": 0.1807335478650332, + "grad_norm": 4.551011562347412, + "learning_rate": 9.420637999096744e-05, + "loss": 1.9232, + "step": 2401 + }, + { + "epoch": 0.18080882214569338, + "grad_norm": 6.75311279296875, + "learning_rate": 9.42006825376232e-05, + "loss": 1.9148, + "step": 2402 + }, + { + "epoch": 0.18088409642635353, + "grad_norm": 4.482190132141113, + "learning_rate": 9.419498245667522e-05, + "loss": 1.8169, + "step": 2403 + }, + { + "epoch": 0.18095937070701368, + "grad_norm": 6.232021331787109, + "learning_rate": 9.418927974846234e-05, + "loss": 1.8865, + "step": 2404 + }, + { + "epoch": 0.18103464498767383, + "grad_norm": 6.51279354095459, + "learning_rate": 9.418357441332358e-05, + "loss": 2.8243, + "step": 2405 + }, + { + "epoch": 0.181109919268334, + "grad_norm": 4.804620742797852, + "learning_rate": 9.417786645159807e-05, + "loss": 1.9326, + "step": 2406 + }, + { + "epoch": 0.18118519354899415, + "grad_norm": 5.308732032775879, + "learning_rate": 9.417215586362518e-05, + "loss": 1.9298, + "step": 2407 + }, + { + "epoch": 0.1812604678296543, + "grad_norm": 4.766571044921875, + "learning_rate": 9.416644264974437e-05, + "loss": 1.8049, + "step": 2408 + }, + { + "epoch": 0.18133574211031445, + "grad_norm": 5.129100322723389, + "learning_rate": 9.416072681029526e-05, + "loss": 1.7084, + "step": 2409 + }, + { + "epoch": 0.18141101639097462, + "grad_norm": 5.225581645965576, + "learning_rate": 9.415500834561768e-05, + "loss": 2.0274, + "step": 2410 + }, + { + "epoch": 0.18148629067163477, + "grad_norm": 4.342881202697754, + "learning_rate": 9.414928725605152e-05, + "loss": 2.173, + "step": 2411 + }, + { + "epoch": 0.18156156495229492, + "grad_norm": 4.411501884460449, + "learning_rate": 9.414356354193692e-05, + "loss": 1.8581, + "step": 2412 + }, + { + "epoch": 0.18163683923295507, + "grad_norm": 5.18446159362793, + "learning_rate": 9.413783720361413e-05, + "loss": 2.0093, + "step": 2413 + }, + { + "epoch": 0.18171211351361524, + "grad_norm": 4.746987819671631, + "learning_rate": 9.413210824142358e-05, + "loss": 1.9913, + "step": 2414 + }, + { + "epoch": 0.1817873877942754, + "grad_norm": 5.700861930847168, + "learning_rate": 9.412637665570581e-05, + "loss": 2.158, + "step": 2415 + }, + { + "epoch": 0.18186266207493554, + "grad_norm": 4.8684868812561035, + "learning_rate": 9.412064244680154e-05, + "loss": 1.9624, + "step": 2416 + }, + { + "epoch": 0.18193793635559571, + "grad_norm": 6.228887557983398, + "learning_rate": 9.411490561505171e-05, + "loss": 2.1389, + "step": 2417 + }, + { + "epoch": 0.18201321063625586, + "grad_norm": 4.909162521362305, + "learning_rate": 9.41091661607973e-05, + "loss": 2.0372, + "step": 2418 + }, + { + "epoch": 0.182088484916916, + "grad_norm": 5.898225784301758, + "learning_rate": 9.410342408437953e-05, + "loss": 2.1102, + "step": 2419 + }, + { + "epoch": 0.18216375919757616, + "grad_norm": 5.5384840965271, + "learning_rate": 9.409767938613973e-05, + "loss": 1.7697, + "step": 2420 + }, + { + "epoch": 0.18223903347823633, + "grad_norm": 6.185940265655518, + "learning_rate": 9.409193206641944e-05, + "loss": 2.579, + "step": 2421 + }, + { + "epoch": 0.18231430775889648, + "grad_norm": 5.22868537902832, + "learning_rate": 9.408618212556031e-05, + "loss": 1.8882, + "step": 2422 + }, + { + "epoch": 0.18238958203955663, + "grad_norm": 4.065141201019287, + "learning_rate": 9.408042956390412e-05, + "loss": 1.975, + "step": 2423 + }, + { + "epoch": 0.18246485632021678, + "grad_norm": 5.238159656524658, + "learning_rate": 9.40746743817929e-05, + "loss": 1.8925, + "step": 2424 + }, + { + "epoch": 0.18254013060087695, + "grad_norm": 4.696328639984131, + "learning_rate": 9.406891657956875e-05, + "loss": 1.9198, + "step": 2425 + }, + { + "epoch": 0.1826154048815371, + "grad_norm": 4.402562618255615, + "learning_rate": 9.406315615757396e-05, + "loss": 2.0901, + "step": 2426 + }, + { + "epoch": 0.18269067916219725, + "grad_norm": 4.148708820343018, + "learning_rate": 9.405739311615094e-05, + "loss": 1.7181, + "step": 2427 + }, + { + "epoch": 0.1827659534428574, + "grad_norm": 5.534637451171875, + "learning_rate": 9.405162745564233e-05, + "loss": 2.0757, + "step": 2428 + }, + { + "epoch": 0.18284122772351757, + "grad_norm": 5.421005725860596, + "learning_rate": 9.404585917639088e-05, + "loss": 2.2557, + "step": 2429 + }, + { + "epoch": 0.18291650200417772, + "grad_norm": 5.158527374267578, + "learning_rate": 9.404008827873946e-05, + "loss": 2.0845, + "step": 2430 + }, + { + "epoch": 0.18299177628483787, + "grad_norm": 4.128539085388184, + "learning_rate": 9.403431476303118e-05, + "loss": 1.9502, + "step": 2431 + }, + { + "epoch": 0.18306705056549805, + "grad_norm": 4.437924861907959, + "learning_rate": 9.402853862960923e-05, + "loss": 2.3968, + "step": 2432 + }, + { + "epoch": 0.1831423248461582, + "grad_norm": 5.804951190948486, + "learning_rate": 9.402275987881698e-05, + "loss": 2.2585, + "step": 2433 + }, + { + "epoch": 0.18321759912681834, + "grad_norm": 5.684848785400391, + "learning_rate": 9.401697851099798e-05, + "loss": 2.1749, + "step": 2434 + }, + { + "epoch": 0.1832928734074785, + "grad_norm": 5.475544452667236, + "learning_rate": 9.40111945264959e-05, + "loss": 2.024, + "step": 2435 + }, + { + "epoch": 0.18336814768813867, + "grad_norm": 6.695219993591309, + "learning_rate": 9.400540792565459e-05, + "loss": 2.1222, + "step": 2436 + }, + { + "epoch": 0.18344342196879881, + "grad_norm": 5.737473011016846, + "learning_rate": 9.399961870881804e-05, + "loss": 1.8551, + "step": 2437 + }, + { + "epoch": 0.18351869624945896, + "grad_norm": 3.776228189468384, + "learning_rate": 9.399382687633042e-05, + "loss": 2.0056, + "step": 2438 + }, + { + "epoch": 0.1835939705301191, + "grad_norm": 4.5843119621276855, + "learning_rate": 9.398803242853601e-05, + "loss": 1.9414, + "step": 2439 + }, + { + "epoch": 0.1836692448107793, + "grad_norm": 5.952206134796143, + "learning_rate": 9.39822353657793e-05, + "loss": 2.2458, + "step": 2440 + }, + { + "epoch": 0.18374451909143943, + "grad_norm": 5.303598880767822, + "learning_rate": 9.397643568840488e-05, + "loss": 2.3213, + "step": 2441 + }, + { + "epoch": 0.18381979337209958, + "grad_norm": 5.525512218475342, + "learning_rate": 9.397063339675755e-05, + "loss": 2.0295, + "step": 2442 + }, + { + "epoch": 0.18389506765275973, + "grad_norm": 4.800266742706299, + "learning_rate": 9.396482849118224e-05, + "loss": 2.1283, + "step": 2443 + }, + { + "epoch": 0.1839703419334199, + "grad_norm": 6.879937648773193, + "learning_rate": 9.395902097202403e-05, + "loss": 1.5851, + "step": 2444 + }, + { + "epoch": 0.18404561621408005, + "grad_norm": 5.07206392288208, + "learning_rate": 9.395321083962813e-05, + "loss": 2.2683, + "step": 2445 + }, + { + "epoch": 0.1841208904947402, + "grad_norm": 7.102728843688965, + "learning_rate": 9.394739809433998e-05, + "loss": 2.1203, + "step": 2446 + }, + { + "epoch": 0.18419616477540038, + "grad_norm": 6.38200569152832, + "learning_rate": 9.39415827365051e-05, + "loss": 1.9123, + "step": 2447 + }, + { + "epoch": 0.18427143905606053, + "grad_norm": 6.431077480316162, + "learning_rate": 9.393576476646923e-05, + "loss": 2.0466, + "step": 2448 + }, + { + "epoch": 0.18434671333672067, + "grad_norm": 4.885560512542725, + "learning_rate": 9.39299441845782e-05, + "loss": 1.7296, + "step": 2449 + }, + { + "epoch": 0.18442198761738082, + "grad_norm": 4.1443657875061035, + "learning_rate": 9.392412099117802e-05, + "loss": 2.3149, + "step": 2450 + }, + { + "epoch": 0.184497261898041, + "grad_norm": 5.507977485656738, + "learning_rate": 9.39182951866149e-05, + "loss": 2.0893, + "step": 2451 + }, + { + "epoch": 0.18457253617870115, + "grad_norm": 3.817822217941284, + "learning_rate": 9.391246677123514e-05, + "loss": 2.2836, + "step": 2452 + }, + { + "epoch": 0.1846478104593613, + "grad_norm": 8.935853004455566, + "learning_rate": 9.390663574538523e-05, + "loss": 1.9204, + "step": 2453 + }, + { + "epoch": 0.18472308474002144, + "grad_norm": 4.0461106300354, + "learning_rate": 9.390080210941181e-05, + "loss": 2.3641, + "step": 2454 + }, + { + "epoch": 0.18479835902068162, + "grad_norm": 4.40086555480957, + "learning_rate": 9.389496586366167e-05, + "loss": 2.0577, + "step": 2455 + }, + { + "epoch": 0.18487363330134177, + "grad_norm": 3.8941714763641357, + "learning_rate": 9.388912700848176e-05, + "loss": 2.0675, + "step": 2456 + }, + { + "epoch": 0.18494890758200191, + "grad_norm": 4.479083061218262, + "learning_rate": 9.388328554421917e-05, + "loss": 2.2498, + "step": 2457 + }, + { + "epoch": 0.18502418186266206, + "grad_norm": 5.162126541137695, + "learning_rate": 9.387744147122117e-05, + "loss": 1.8108, + "step": 2458 + }, + { + "epoch": 0.18509945614332224, + "grad_norm": 4.297646999359131, + "learning_rate": 9.387159478983518e-05, + "loss": 1.5779, + "step": 2459 + }, + { + "epoch": 0.1851747304239824, + "grad_norm": 5.3939900398254395, + "learning_rate": 9.386574550040876e-05, + "loss": 2.1621, + "step": 2460 + }, + { + "epoch": 0.18525000470464253, + "grad_norm": 4.0228142738342285, + "learning_rate": 9.385989360328964e-05, + "loss": 2.0555, + "step": 2461 + }, + { + "epoch": 0.1853252789853027, + "grad_norm": 5.019871234893799, + "learning_rate": 9.385403909882568e-05, + "loss": 2.0627, + "step": 2462 + }, + { + "epoch": 0.18540055326596286, + "grad_norm": 5.415604114532471, + "learning_rate": 9.384818198736496e-05, + "loss": 2.2798, + "step": 2463 + }, + { + "epoch": 0.185475827546623, + "grad_norm": 5.6253557205200195, + "learning_rate": 9.384232226925561e-05, + "loss": 2.248, + "step": 2464 + }, + { + "epoch": 0.18555110182728315, + "grad_norm": 4.109618186950684, + "learning_rate": 9.3836459944846e-05, + "loss": 1.8366, + "step": 2465 + }, + { + "epoch": 0.18562637610794333, + "grad_norm": 4.437836647033691, + "learning_rate": 9.383059501448462e-05, + "loss": 1.8628, + "step": 2466 + }, + { + "epoch": 0.18570165038860348, + "grad_norm": 6.089954853057861, + "learning_rate": 9.382472747852013e-05, + "loss": 2.3294, + "step": 2467 + }, + { + "epoch": 0.18577692466926363, + "grad_norm": 4.874607563018799, + "learning_rate": 9.381885733730136e-05, + "loss": 2.0225, + "step": 2468 + }, + { + "epoch": 0.18585219894992377, + "grad_norm": 5.030183792114258, + "learning_rate": 9.381298459117723e-05, + "loss": 1.8525, + "step": 2469 + }, + { + "epoch": 0.18592747323058395, + "grad_norm": 4.4787678718566895, + "learning_rate": 9.380710924049689e-05, + "loss": 1.9289, + "step": 2470 + }, + { + "epoch": 0.1860027475112441, + "grad_norm": 5.273500442504883, + "learning_rate": 9.380123128560961e-05, + "loss": 1.8169, + "step": 2471 + }, + { + "epoch": 0.18607802179190425, + "grad_norm": 7.062505722045898, + "learning_rate": 9.379535072686479e-05, + "loss": 2.2818, + "step": 2472 + }, + { + "epoch": 0.1861532960725644, + "grad_norm": 9.19325065612793, + "learning_rate": 9.378946756461205e-05, + "loss": 2.0197, + "step": 2473 + }, + { + "epoch": 0.18622857035322457, + "grad_norm": 6.395091533660889, + "learning_rate": 9.378358179920111e-05, + "loss": 2.3257, + "step": 2474 + }, + { + "epoch": 0.18630384463388472, + "grad_norm": 6.374067306518555, + "learning_rate": 9.377769343098185e-05, + "loss": 2.4417, + "step": 2475 + }, + { + "epoch": 0.18637911891454487, + "grad_norm": 5.0456719398498535, + "learning_rate": 9.377180246030432e-05, + "loss": 2.2903, + "step": 2476 + }, + { + "epoch": 0.18645439319520501, + "grad_norm": 6.052149295806885, + "learning_rate": 9.376590888751875e-05, + "loss": 1.8877, + "step": 2477 + }, + { + "epoch": 0.1865296674758652, + "grad_norm": 6.807764530181885, + "learning_rate": 9.376001271297546e-05, + "loss": 1.9471, + "step": 2478 + }, + { + "epoch": 0.18660494175652534, + "grad_norm": 4.64505672454834, + "learning_rate": 9.375411393702497e-05, + "loss": 2.0154, + "step": 2479 + }, + { + "epoch": 0.1866802160371855, + "grad_norm": 8.420193672180176, + "learning_rate": 9.374821256001798e-05, + "loss": 1.8711, + "step": 2480 + }, + { + "epoch": 0.18675549031784566, + "grad_norm": 6.0355544090271, + "learning_rate": 9.374230858230525e-05, + "loss": 2.1877, + "step": 2481 + }, + { + "epoch": 0.1868307645985058, + "grad_norm": 5.193342685699463, + "learning_rate": 9.373640200423781e-05, + "loss": 1.8226, + "step": 2482 + }, + { + "epoch": 0.18690603887916596, + "grad_norm": 5.592663288116455, + "learning_rate": 9.373049282616674e-05, + "loss": 2.2782, + "step": 2483 + }, + { + "epoch": 0.1869813131598261, + "grad_norm": 6.613125324249268, + "learning_rate": 9.372458104844337e-05, + "loss": 1.856, + "step": 2484 + }, + { + "epoch": 0.18705658744048628, + "grad_norm": 5.781855583190918, + "learning_rate": 9.37186666714191e-05, + "loss": 2.0069, + "step": 2485 + }, + { + "epoch": 0.18713186172114643, + "grad_norm": 5.057727813720703, + "learning_rate": 9.371274969544556e-05, + "loss": 1.7957, + "step": 2486 + }, + { + "epoch": 0.18720713600180658, + "grad_norm": 4.636129856109619, + "learning_rate": 9.370683012087447e-05, + "loss": 2.0004, + "step": 2487 + }, + { + "epoch": 0.18728241028246673, + "grad_norm": 5.959572792053223, + "learning_rate": 9.370090794805772e-05, + "loss": 2.4633, + "step": 2488 + }, + { + "epoch": 0.1873576845631269, + "grad_norm": 4.953143119812012, + "learning_rate": 9.369498317734741e-05, + "loss": 2.2138, + "step": 2489 + }, + { + "epoch": 0.18743295884378705, + "grad_norm": 4.692575931549072, + "learning_rate": 9.368905580909571e-05, + "loss": 2.1595, + "step": 2490 + }, + { + "epoch": 0.1875082331244472, + "grad_norm": 5.977521896362305, + "learning_rate": 9.3683125843655e-05, + "loss": 1.9296, + "step": 2491 + }, + { + "epoch": 0.18758350740510735, + "grad_norm": 5.172298908233643, + "learning_rate": 9.36771932813778e-05, + "loss": 1.9639, + "step": 2492 + }, + { + "epoch": 0.18765878168576752, + "grad_norm": 3.814054489135742, + "learning_rate": 9.367125812261679e-05, + "loss": 2.2502, + "step": 2493 + }, + { + "epoch": 0.18773405596642767, + "grad_norm": 4.959407806396484, + "learning_rate": 9.366532036772479e-05, + "loss": 1.9256, + "step": 2494 + }, + { + "epoch": 0.18780933024708782, + "grad_norm": 4.248777389526367, + "learning_rate": 9.365938001705476e-05, + "loss": 2.1334, + "step": 2495 + }, + { + "epoch": 0.187884604527748, + "grad_norm": 6.672389984130859, + "learning_rate": 9.365343707095989e-05, + "loss": 2.2852, + "step": 2496 + }, + { + "epoch": 0.18795987880840814, + "grad_norm": 6.3475542068481445, + "learning_rate": 9.364749152979344e-05, + "loss": 1.8428, + "step": 2497 + }, + { + "epoch": 0.1880351530890683, + "grad_norm": 5.01655387878418, + "learning_rate": 9.364154339390884e-05, + "loss": 2.0705, + "step": 2498 + }, + { + "epoch": 0.18811042736972844, + "grad_norm": 3.661987781524658, + "learning_rate": 9.363559266365973e-05, + "loss": 2.2872, + "step": 2499 + }, + { + "epoch": 0.18818570165038861, + "grad_norm": 8.07694149017334, + "learning_rate": 9.362963933939982e-05, + "loss": 2.2464, + "step": 2500 + }, + { + "epoch": 0.18826097593104876, + "grad_norm": 4.878606796264648, + "learning_rate": 9.362368342148304e-05, + "loss": 2.1052, + "step": 2501 + }, + { + "epoch": 0.1883362502117089, + "grad_norm": 5.894618034362793, + "learning_rate": 9.361772491026347e-05, + "loss": 2.2804, + "step": 2502 + }, + { + "epoch": 0.18841152449236906, + "grad_norm": 4.599306106567383, + "learning_rate": 9.361176380609529e-05, + "loss": 2.1017, + "step": 2503 + }, + { + "epoch": 0.18848679877302923, + "grad_norm": 5.100125312805176, + "learning_rate": 9.36058001093329e-05, + "loss": 2.3843, + "step": 2504 + }, + { + "epoch": 0.18856207305368938, + "grad_norm": 4.907667636871338, + "learning_rate": 9.359983382033081e-05, + "loss": 2.603, + "step": 2505 + }, + { + "epoch": 0.18863734733434953, + "grad_norm": 5.436685562133789, + "learning_rate": 9.359386493944369e-05, + "loss": 2.0805, + "step": 2506 + }, + { + "epoch": 0.18871262161500968, + "grad_norm": 4.959691047668457, + "learning_rate": 9.35878934670264e-05, + "loss": 1.921, + "step": 2507 + }, + { + "epoch": 0.18878789589566985, + "grad_norm": 6.1402668952941895, + "learning_rate": 9.358191940343389e-05, + "loss": 2.2612, + "step": 2508 + }, + { + "epoch": 0.18886317017633, + "grad_norm": 4.842352390289307, + "learning_rate": 9.357594274902133e-05, + "loss": 2.2252, + "step": 2509 + }, + { + "epoch": 0.18893844445699015, + "grad_norm": 6.455037593841553, + "learning_rate": 9.356996350414402e-05, + "loss": 1.7572, + "step": 2510 + }, + { + "epoch": 0.18901371873765033, + "grad_norm": 6.422914028167725, + "learning_rate": 9.356398166915738e-05, + "loss": 1.8875, + "step": 2511 + }, + { + "epoch": 0.18908899301831048, + "grad_norm": 4.5036468505859375, + "learning_rate": 9.355799724441703e-05, + "loss": 2.0403, + "step": 2512 + }, + { + "epoch": 0.18916426729897062, + "grad_norm": 4.51779842376709, + "learning_rate": 9.355201023027876e-05, + "loss": 2.156, + "step": 2513 + }, + { + "epoch": 0.18923954157963077, + "grad_norm": 5.565335750579834, + "learning_rate": 9.35460206270984e-05, + "loss": 2.3657, + "step": 2514 + }, + { + "epoch": 0.18931481586029095, + "grad_norm": 4.541582107543945, + "learning_rate": 9.354002843523209e-05, + "loss": 1.8633, + "step": 2515 + }, + { + "epoch": 0.1893900901409511, + "grad_norm": 5.617457866668701, + "learning_rate": 9.3534033655036e-05, + "loss": 1.9177, + "step": 2516 + }, + { + "epoch": 0.18946536442161124, + "grad_norm": 3.8435323238372803, + "learning_rate": 9.352803628686653e-05, + "loss": 2.4385, + "step": 2517 + }, + { + "epoch": 0.1895406387022714, + "grad_norm": 5.022530555725098, + "learning_rate": 9.352203633108021e-05, + "loss": 2.0657, + "step": 2518 + }, + { + "epoch": 0.18961591298293157, + "grad_norm": 3.733973979949951, + "learning_rate": 9.35160337880337e-05, + "loss": 2.0543, + "step": 2519 + }, + { + "epoch": 0.18969118726359172, + "grad_norm": 4.168149948120117, + "learning_rate": 9.351002865808383e-05, + "loss": 2.0786, + "step": 2520 + }, + { + "epoch": 0.18976646154425186, + "grad_norm": 5.00193452835083, + "learning_rate": 9.350402094158761e-05, + "loss": 1.8459, + "step": 2521 + }, + { + "epoch": 0.189841735824912, + "grad_norm": 5.939724445343018, + "learning_rate": 9.349801063890217e-05, + "loss": 1.8992, + "step": 2522 + }, + { + "epoch": 0.1899170101055722, + "grad_norm": 6.302956581115723, + "learning_rate": 9.349199775038482e-05, + "loss": 1.7942, + "step": 2523 + }, + { + "epoch": 0.18999228438623234, + "grad_norm": 4.8762078285217285, + "learning_rate": 9.348598227639299e-05, + "loss": 2.2533, + "step": 2524 + }, + { + "epoch": 0.19006755866689248, + "grad_norm": 7.0286030769348145, + "learning_rate": 9.34799642172843e-05, + "loss": 1.9256, + "step": 2525 + }, + { + "epoch": 0.19014283294755263, + "grad_norm": 4.535068988800049, + "learning_rate": 9.347394357341648e-05, + "loss": 1.7866, + "step": 2526 + }, + { + "epoch": 0.1902181072282128, + "grad_norm": 5.935866832733154, + "learning_rate": 9.346792034514746e-05, + "loss": 2.012, + "step": 2527 + }, + { + "epoch": 0.19029338150887296, + "grad_norm": 4.551203727722168, + "learning_rate": 9.346189453283529e-05, + "loss": 1.8581, + "step": 2528 + }, + { + "epoch": 0.1903686557895331, + "grad_norm": 6.322862148284912, + "learning_rate": 9.34558661368382e-05, + "loss": 2.3131, + "step": 2529 + }, + { + "epoch": 0.19044393007019328, + "grad_norm": 5.096517562866211, + "learning_rate": 9.344983515751456e-05, + "loss": 2.1992, + "step": 2530 + }, + { + "epoch": 0.19051920435085343, + "grad_norm": 4.767030715942383, + "learning_rate": 9.344380159522289e-05, + "loss": 2.1253, + "step": 2531 + }, + { + "epoch": 0.19059447863151358, + "grad_norm": 5.3960957527160645, + "learning_rate": 9.343776545032188e-05, + "loss": 1.987, + "step": 2532 + }, + { + "epoch": 0.19066975291217372, + "grad_norm": 5.560890197753906, + "learning_rate": 9.343172672317034e-05, + "loss": 2.1318, + "step": 2533 + }, + { + "epoch": 0.1907450271928339, + "grad_norm": 4.249767780303955, + "learning_rate": 9.342568541412726e-05, + "loss": 1.9264, + "step": 2534 + }, + { + "epoch": 0.19082030147349405, + "grad_norm": 4.550135612487793, + "learning_rate": 9.34196415235518e-05, + "loss": 2.003, + "step": 2535 + }, + { + "epoch": 0.1908955757541542, + "grad_norm": 4.332376956939697, + "learning_rate": 9.341359505180322e-05, + "loss": 1.919, + "step": 2536 + }, + { + "epoch": 0.19097085003481434, + "grad_norm": 3.6475462913513184, + "learning_rate": 9.3407545999241e-05, + "loss": 1.8407, + "step": 2537 + }, + { + "epoch": 0.19104612431547452, + "grad_norm": 4.63602876663208, + "learning_rate": 9.340149436622471e-05, + "loss": 2.0405, + "step": 2538 + }, + { + "epoch": 0.19112139859613467, + "grad_norm": 5.155390739440918, + "learning_rate": 9.339544015311411e-05, + "loss": 2.12, + "step": 2539 + }, + { + "epoch": 0.19119667287679482, + "grad_norm": 6.083924293518066, + "learning_rate": 9.338938336026912e-05, + "loss": 1.9697, + "step": 2540 + }, + { + "epoch": 0.19127194715745496, + "grad_norm": 5.203934669494629, + "learning_rate": 9.338332398804976e-05, + "loss": 2.3053, + "step": 2541 + }, + { + "epoch": 0.19134722143811514, + "grad_norm": 5.525043487548828, + "learning_rate": 9.337726203681629e-05, + "loss": 2.0192, + "step": 2542 + }, + { + "epoch": 0.1914224957187753, + "grad_norm": 4.273123741149902, + "learning_rate": 9.337119750692906e-05, + "loss": 2.0807, + "step": 2543 + }, + { + "epoch": 0.19149776999943544, + "grad_norm": 4.4439826011657715, + "learning_rate": 9.336513039874859e-05, + "loss": 2.1466, + "step": 2544 + }, + { + "epoch": 0.1915730442800956, + "grad_norm": 4.851051330566406, + "learning_rate": 9.335906071263553e-05, + "loss": 1.7983, + "step": 2545 + }, + { + "epoch": 0.19164831856075576, + "grad_norm": 6.889603137969971, + "learning_rate": 9.335298844895072e-05, + "loss": 1.9036, + "step": 2546 + }, + { + "epoch": 0.1917235928414159, + "grad_norm": 4.5521321296691895, + "learning_rate": 9.334691360805516e-05, + "loss": 2.3457, + "step": 2547 + }, + { + "epoch": 0.19179886712207606, + "grad_norm": 5.788790702819824, + "learning_rate": 9.334083619030993e-05, + "loss": 2.0967, + "step": 2548 + }, + { + "epoch": 0.19187414140273623, + "grad_norm": 4.741682052612305, + "learning_rate": 9.333475619607636e-05, + "loss": 1.8788, + "step": 2549 + }, + { + "epoch": 0.19194941568339638, + "grad_norm": 5.019089221954346, + "learning_rate": 9.332867362571589e-05, + "loss": 2.0809, + "step": 2550 + }, + { + "epoch": 0.19202468996405653, + "grad_norm": 4.73291540145874, + "learning_rate": 9.33225884795901e-05, + "loss": 2.0432, + "step": 2551 + }, + { + "epoch": 0.19209996424471668, + "grad_norm": 5.9592695236206055, + "learning_rate": 9.331650075806071e-05, + "loss": 2.1632, + "step": 2552 + }, + { + "epoch": 0.19217523852537685, + "grad_norm": 3.855308771133423, + "learning_rate": 9.331041046148965e-05, + "loss": 2.2532, + "step": 2553 + }, + { + "epoch": 0.192250512806037, + "grad_norm": 5.603821277618408, + "learning_rate": 9.330431759023897e-05, + "loss": 1.9761, + "step": 2554 + }, + { + "epoch": 0.19232578708669715, + "grad_norm": 5.860295295715332, + "learning_rate": 9.329822214467085e-05, + "loss": 2.0572, + "step": 2555 + }, + { + "epoch": 0.1924010613673573, + "grad_norm": 4.44080114364624, + "learning_rate": 9.329212412514765e-05, + "loss": 1.6913, + "step": 2556 + }, + { + "epoch": 0.19247633564801747, + "grad_norm": 7.0723748207092285, + "learning_rate": 9.32860235320319e-05, + "loss": 1.7352, + "step": 2557 + }, + { + "epoch": 0.19255160992867762, + "grad_norm": 14.13379192352295, + "learning_rate": 9.327992036568626e-05, + "loss": 2.3754, + "step": 2558 + }, + { + "epoch": 0.19262688420933777, + "grad_norm": 5.579792022705078, + "learning_rate": 9.327381462647354e-05, + "loss": 1.7847, + "step": 2559 + }, + { + "epoch": 0.19270215848999794, + "grad_norm": 7.573391437530518, + "learning_rate": 9.32677063147567e-05, + "loss": 2.0068, + "step": 2560 + }, + { + "epoch": 0.1927774327706581, + "grad_norm": 7.85155725479126, + "learning_rate": 9.326159543089887e-05, + "loss": 2.1874, + "step": 2561 + }, + { + "epoch": 0.19285270705131824, + "grad_norm": 6.106492042541504, + "learning_rate": 9.32554819752633e-05, + "loss": 2.378, + "step": 2562 + }, + { + "epoch": 0.1929279813319784, + "grad_norm": 4.812530040740967, + "learning_rate": 9.324936594821348e-05, + "loss": 1.7921, + "step": 2563 + }, + { + "epoch": 0.19300325561263856, + "grad_norm": 5.572739601135254, + "learning_rate": 9.324324735011291e-05, + "loss": 2.041, + "step": 2564 + }, + { + "epoch": 0.1930785298932987, + "grad_norm": 8.406535148620605, + "learning_rate": 9.323712618132541e-05, + "loss": 2.0206, + "step": 2565 + }, + { + "epoch": 0.19315380417395886, + "grad_norm": 46.575382232666016, + "learning_rate": 9.323100244221478e-05, + "loss": 1.8225, + "step": 2566 + }, + { + "epoch": 0.193229078454619, + "grad_norm": 4.07697868347168, + "learning_rate": 9.322487613314512e-05, + "loss": 2.2756, + "step": 2567 + }, + { + "epoch": 0.19330435273527918, + "grad_norm": 5.623479843139648, + "learning_rate": 9.321874725448058e-05, + "loss": 2.0736, + "step": 2568 + }, + { + "epoch": 0.19337962701593933, + "grad_norm": 4.307704448699951, + "learning_rate": 9.321261580658556e-05, + "loss": 1.7371, + "step": 2569 + }, + { + "epoch": 0.19345490129659948, + "grad_norm": 5.875655174255371, + "learning_rate": 9.32064817898245e-05, + "loss": 1.7341, + "step": 2570 + }, + { + "epoch": 0.19353017557725963, + "grad_norm": 5.115437030792236, + "learning_rate": 9.320034520456207e-05, + "loss": 2.1255, + "step": 2571 + }, + { + "epoch": 0.1936054498579198, + "grad_norm": 6.8328046798706055, + "learning_rate": 9.319420605116307e-05, + "loss": 1.9479, + "step": 2572 + }, + { + "epoch": 0.19368072413857995, + "grad_norm": 11.18570327758789, + "learning_rate": 9.318806432999245e-05, + "loss": 1.97, + "step": 2573 + }, + { + "epoch": 0.1937559984192401, + "grad_norm": 4.179947376251221, + "learning_rate": 9.318192004141536e-05, + "loss": 1.9784, + "step": 2574 + }, + { + "epoch": 0.19383127269990025, + "grad_norm": 4.179947376251221, + "learning_rate": 9.318192004141536e-05, + "loss": 2.0508, + "step": 2575 + }, + { + "epoch": 0.19390654698056042, + "grad_norm": 4.179947376251221, + "learning_rate": 9.318192004141536e-05, + "loss": 2.1421, + "step": 2576 + }, + { + "epoch": 0.19398182126122057, + "grad_norm": 29.524860382080078, + "learning_rate": 9.3175773185797e-05, + "loss": 2.0533, + "step": 2577 + }, + { + "epoch": 0.19405709554188072, + "grad_norm": 62.209346771240234, + "learning_rate": 9.31696237635028e-05, + "loss": 1.9044, + "step": 2578 + }, + { + "epoch": 0.1941323698225409, + "grad_norm": 109.74150085449219, + "learning_rate": 9.316347177489835e-05, + "loss": 2.4393, + "step": 2579 + }, + { + "epoch": 0.19420764410320104, + "grad_norm": 8.23983097076416, + "learning_rate": 9.315731722034934e-05, + "loss": 2.1779, + "step": 2580 + }, + { + "epoch": 0.1942829183838612, + "grad_norm": 173.2615966796875, + "learning_rate": 9.315116010022168e-05, + "loss": 2.398, + "step": 2581 + }, + { + "epoch": 0.19435819266452134, + "grad_norm": 7.5856781005859375, + "learning_rate": 9.314500041488135e-05, + "loss": 1.8894, + "step": 2582 + }, + { + "epoch": 0.19443346694518152, + "grad_norm": 7.97536563873291, + "learning_rate": 9.313883816469455e-05, + "loss": 2.2363, + "step": 2583 + }, + { + "epoch": 0.19450874122584166, + "grad_norm": 7.2178955078125, + "learning_rate": 9.31326733500276e-05, + "loss": 2.1213, + "step": 2584 + }, + { + "epoch": 0.1945840155065018, + "grad_norm": 18.891969680786133, + "learning_rate": 9.312650597124697e-05, + "loss": 1.8975, + "step": 2585 + }, + { + "epoch": 0.19465928978716196, + "grad_norm": 10.45845890045166, + "learning_rate": 9.31203360287193e-05, + "loss": 2.0703, + "step": 2586 + }, + { + "epoch": 0.19473456406782214, + "grad_norm": 7.401883602142334, + "learning_rate": 9.31141635228114e-05, + "loss": 2.5109, + "step": 2587 + }, + { + "epoch": 0.19480983834848228, + "grad_norm": 26.361743927001953, + "learning_rate": 9.310798845389018e-05, + "loss": 1.9633, + "step": 2588 + }, + { + "epoch": 0.19488511262914243, + "grad_norm": 146.64105224609375, + "learning_rate": 9.310181082232272e-05, + "loss": 2.894, + "step": 2589 + }, + { + "epoch": 0.19496038690980258, + "grad_norm": 11.486780166625977, + "learning_rate": 9.309563062847628e-05, + "loss": 2.2627, + "step": 2590 + }, + { + "epoch": 0.19503566119046276, + "grad_norm": 6.356736660003662, + "learning_rate": 9.308944787271827e-05, + "loss": 2.6801, + "step": 2591 + }, + { + "epoch": 0.1951109354711229, + "grad_norm": 5.402707576751709, + "learning_rate": 9.308326255541621e-05, + "loss": 1.9609, + "step": 2592 + }, + { + "epoch": 0.19518620975178305, + "grad_norm": 7.862400531768799, + "learning_rate": 9.307707467693783e-05, + "loss": 2.0292, + "step": 2593 + }, + { + "epoch": 0.19526148403244323, + "grad_norm": 6.014866352081299, + "learning_rate": 9.307088423765096e-05, + "loss": 2.1721, + "step": 2594 + }, + { + "epoch": 0.19533675831310338, + "grad_norm": 5.5270161628723145, + "learning_rate": 9.306469123792358e-05, + "loss": 2.2049, + "step": 2595 + }, + { + "epoch": 0.19541203259376352, + "grad_norm": 5.358336925506592, + "learning_rate": 9.30584956781239e-05, + "loss": 2.1197, + "step": 2596 + }, + { + "epoch": 0.19548730687442367, + "grad_norm": 5.83590030670166, + "learning_rate": 9.305229755862019e-05, + "loss": 2.2495, + "step": 2597 + }, + { + "epoch": 0.19556258115508385, + "grad_norm": 7.403632164001465, + "learning_rate": 9.304609687978092e-05, + "loss": 2.1131, + "step": 2598 + }, + { + "epoch": 0.195637855435744, + "grad_norm": 5.968019008636475, + "learning_rate": 9.30398936419747e-05, + "loss": 2.0844, + "step": 2599 + }, + { + "epoch": 0.19571312971640414, + "grad_norm": 5.025341033935547, + "learning_rate": 9.30336878455703e-05, + "loss": 2.1027, + "step": 2600 + }, + { + "epoch": 0.1957884039970643, + "grad_norm": 6.875126838684082, + "learning_rate": 9.302747949093664e-05, + "loss": 1.8119, + "step": 2601 + }, + { + "epoch": 0.19586367827772447, + "grad_norm": 5.53953742980957, + "learning_rate": 9.302126857844279e-05, + "loss": 2.155, + "step": 2602 + }, + { + "epoch": 0.19593895255838462, + "grad_norm": 5.998725891113281, + "learning_rate": 9.301505510845795e-05, + "loss": 1.9574, + "step": 2603 + }, + { + "epoch": 0.19601422683904476, + "grad_norm": 5.870541572570801, + "learning_rate": 9.300883908135152e-05, + "loss": 1.7968, + "step": 2604 + }, + { + "epoch": 0.1960895011197049, + "grad_norm": 4.784199237823486, + "learning_rate": 9.3002620497493e-05, + "loss": 2.3796, + "step": 2605 + }, + { + "epoch": 0.1961647754003651, + "grad_norm": 6.009994983673096, + "learning_rate": 9.299639935725209e-05, + "loss": 2.6274, + "step": 2606 + }, + { + "epoch": 0.19624004968102524, + "grad_norm": 4.934391498565674, + "learning_rate": 9.299017566099861e-05, + "loss": 2.0971, + "step": 2607 + }, + { + "epoch": 0.19631532396168538, + "grad_norm": 4.732306957244873, + "learning_rate": 9.298394940910254e-05, + "loss": 1.7493, + "step": 2608 + }, + { + "epoch": 0.19639059824234556, + "grad_norm": 4.06329345703125, + "learning_rate": 9.297772060193399e-05, + "loss": 2.1547, + "step": 2609 + }, + { + "epoch": 0.1964658725230057, + "grad_norm": 4.611025333404541, + "learning_rate": 9.297148923986329e-05, + "loss": 1.8565, + "step": 2610 + }, + { + "epoch": 0.19654114680366586, + "grad_norm": 5.622647762298584, + "learning_rate": 9.296525532326085e-05, + "loss": 2.039, + "step": 2611 + }, + { + "epoch": 0.196616421084326, + "grad_norm": 4.891772270202637, + "learning_rate": 9.295901885249727e-05, + "loss": 2.0125, + "step": 2612 + }, + { + "epoch": 0.19669169536498618, + "grad_norm": 5.495362281799316, + "learning_rate": 9.295277982794327e-05, + "loss": 2.1419, + "step": 2613 + }, + { + "epoch": 0.19676696964564633, + "grad_norm": 4.266232013702393, + "learning_rate": 9.294653824996976e-05, + "loss": 1.9604, + "step": 2614 + }, + { + "epoch": 0.19684224392630648, + "grad_norm": 6.018433570861816, + "learning_rate": 9.294029411894779e-05, + "loss": 2.1215, + "step": 2615 + }, + { + "epoch": 0.19691751820696662, + "grad_norm": 6.057972431182861, + "learning_rate": 9.293404743524852e-05, + "loss": 1.9628, + "step": 2616 + }, + { + "epoch": 0.1969927924876268, + "grad_norm": 5.350480556488037, + "learning_rate": 9.292779819924333e-05, + "loss": 2.034, + "step": 2617 + }, + { + "epoch": 0.19706806676828695, + "grad_norm": 4.5111894607543945, + "learning_rate": 9.292154641130371e-05, + "loss": 2.2302, + "step": 2618 + }, + { + "epoch": 0.1971433410489471, + "grad_norm": 6.462106704711914, + "learning_rate": 9.291529207180132e-05, + "loss": 2.085, + "step": 2619 + }, + { + "epoch": 0.19721861532960724, + "grad_norm": 4.553351879119873, + "learning_rate": 9.290903518110793e-05, + "loss": 1.9856, + "step": 2620 + }, + { + "epoch": 0.19729388961026742, + "grad_norm": 6.696134090423584, + "learning_rate": 9.290277573959555e-05, + "loss": 2.1369, + "step": 2621 + }, + { + "epoch": 0.19736916389092757, + "grad_norm": 4.833719253540039, + "learning_rate": 9.289651374763624e-05, + "loss": 2.0621, + "step": 2622 + }, + { + "epoch": 0.19744443817158772, + "grad_norm": 7.3011040687561035, + "learning_rate": 9.289024920560228e-05, + "loss": 1.9717, + "step": 2623 + }, + { + "epoch": 0.1975197124522479, + "grad_norm": 5.408299922943115, + "learning_rate": 9.288398211386606e-05, + "loss": 2.3192, + "step": 2624 + }, + { + "epoch": 0.19759498673290804, + "grad_norm": 5.214601993560791, + "learning_rate": 9.287771247280014e-05, + "loss": 2.2721, + "step": 2625 + }, + { + "epoch": 0.1976702610135682, + "grad_norm": 5.570950508117676, + "learning_rate": 9.287144028277726e-05, + "loss": 1.9772, + "step": 2626 + }, + { + "epoch": 0.19774553529422834, + "grad_norm": 6.670877933502197, + "learning_rate": 9.286516554417028e-05, + "loss": 1.9799, + "step": 2627 + }, + { + "epoch": 0.1978208095748885, + "grad_norm": 6.343214988708496, + "learning_rate": 9.285888825735218e-05, + "loss": 1.8738, + "step": 2628 + }, + { + "epoch": 0.19789608385554866, + "grad_norm": 5.22285270690918, + "learning_rate": 9.285260842269616e-05, + "loss": 2.0352, + "step": 2629 + }, + { + "epoch": 0.1979713581362088, + "grad_norm": 4.319041728973389, + "learning_rate": 9.284632604057553e-05, + "loss": 2.1778, + "step": 2630 + }, + { + "epoch": 0.19804663241686896, + "grad_norm": 6.656712055206299, + "learning_rate": 9.284004111136374e-05, + "loss": 2.0763, + "step": 2631 + }, + { + "epoch": 0.19812190669752913, + "grad_norm": 4.44982385635376, + "learning_rate": 9.283375363543444e-05, + "loss": 2.1115, + "step": 2632 + }, + { + "epoch": 0.19819718097818928, + "grad_norm": 5.182038307189941, + "learning_rate": 9.28274636131614e-05, + "loss": 1.963, + "step": 2633 + }, + { + "epoch": 0.19827245525884943, + "grad_norm": 4.994046211242676, + "learning_rate": 9.282117104491852e-05, + "loss": 1.9471, + "step": 2634 + }, + { + "epoch": 0.19834772953950958, + "grad_norm": 5.139937400817871, + "learning_rate": 9.281487593107989e-05, + "loss": 2.3964, + "step": 2635 + }, + { + "epoch": 0.19842300382016975, + "grad_norm": 4.936031818389893, + "learning_rate": 9.280857827201973e-05, + "loss": 2.2073, + "step": 2636 + }, + { + "epoch": 0.1984982781008299, + "grad_norm": 9.204358100891113, + "learning_rate": 9.280227806811244e-05, + "loss": 1.9594, + "step": 2637 + }, + { + "epoch": 0.19857355238149005, + "grad_norm": 5.214263439178467, + "learning_rate": 9.279597531973252e-05, + "loss": 2.2581, + "step": 2638 + }, + { + "epoch": 0.1986488266621502, + "grad_norm": 4.837527275085449, + "learning_rate": 9.278967002725465e-05, + "loss": 2.0607, + "step": 2639 + }, + { + "epoch": 0.19872410094281037, + "grad_norm": 7.141783237457275, + "learning_rate": 9.27833621910537e-05, + "loss": 2.1656, + "step": 2640 + }, + { + "epoch": 0.19879937522347052, + "grad_norm": 5.579028129577637, + "learning_rate": 9.277705181150463e-05, + "loss": 2.1741, + "step": 2641 + }, + { + "epoch": 0.19887464950413067, + "grad_norm": 5.092233657836914, + "learning_rate": 9.277073888898255e-05, + "loss": 2.4771, + "step": 2642 + }, + { + "epoch": 0.19894992378479084, + "grad_norm": 3.783620595932007, + "learning_rate": 9.276442342386277e-05, + "loss": 2.1214, + "step": 2643 + }, + { + "epoch": 0.199025198065451, + "grad_norm": 5.664515018463135, + "learning_rate": 9.275810541652072e-05, + "loss": 1.8764, + "step": 2644 + }, + { + "epoch": 0.19910047234611114, + "grad_norm": 7.091597080230713, + "learning_rate": 9.275178486733201e-05, + "loss": 2.2658, + "step": 2645 + }, + { + "epoch": 0.1991757466267713, + "grad_norm": 5.548789978027344, + "learning_rate": 9.274546177667234e-05, + "loss": 2.3322, + "step": 2646 + }, + { + "epoch": 0.19925102090743146, + "grad_norm": 4.836780548095703, + "learning_rate": 9.273913614491761e-05, + "loss": 2.2259, + "step": 2647 + }, + { + "epoch": 0.1993262951880916, + "grad_norm": 5.501684665679932, + "learning_rate": 9.273280797244388e-05, + "loss": 1.7753, + "step": 2648 + }, + { + "epoch": 0.19940156946875176, + "grad_norm": 4.515477180480957, + "learning_rate": 9.272647725962733e-05, + "loss": 2.0664, + "step": 2649 + }, + { + "epoch": 0.1994768437494119, + "grad_norm": 6.437292575836182, + "learning_rate": 9.27201440068443e-05, + "loss": 2.0311, + "step": 2650 + }, + { + "epoch": 0.19955211803007208, + "grad_norm": 7.09950590133667, + "learning_rate": 9.271380821447129e-05, + "loss": 2.6957, + "step": 2651 + }, + { + "epoch": 0.19962739231073223, + "grad_norm": 6.830597877502441, + "learning_rate": 9.270746988288494e-05, + "loss": 2.4405, + "step": 2652 + }, + { + "epoch": 0.19970266659139238, + "grad_norm": 5.383710861206055, + "learning_rate": 9.270112901246207e-05, + "loss": 2.1742, + "step": 2653 + }, + { + "epoch": 0.19977794087205253, + "grad_norm": 5.5504608154296875, + "learning_rate": 9.269478560357958e-05, + "loss": 1.9415, + "step": 2654 + }, + { + "epoch": 0.1998532151527127, + "grad_norm": 4.703890323638916, + "learning_rate": 9.268843965661457e-05, + "loss": 1.7419, + "step": 2655 + }, + { + "epoch": 0.19992848943337285, + "grad_norm": 5.986147880554199, + "learning_rate": 9.268209117194435e-05, + "loss": 2.4277, + "step": 2656 + }, + { + "epoch": 0.200003763714033, + "grad_norm": 6.474613189697266, + "learning_rate": 9.267574014994626e-05, + "loss": 2.1247, + "step": 2657 + }, + { + "epoch": 0.20007903799469318, + "grad_norm": 4.989909648895264, + "learning_rate": 9.266938659099786e-05, + "loss": 2.6483, + "step": 2658 + }, + { + "epoch": 0.20015431227535332, + "grad_norm": 4.968101978302002, + "learning_rate": 9.266303049547687e-05, + "loss": 1.6893, + "step": 2659 + }, + { + "epoch": 0.20022958655601347, + "grad_norm": 6.139657497406006, + "learning_rate": 9.265667186376113e-05, + "loss": 2.4939, + "step": 2660 + }, + { + "epoch": 0.20030486083667362, + "grad_norm": 8.679938316345215, + "learning_rate": 9.265031069622865e-05, + "loss": 2.6472, + "step": 2661 + }, + { + "epoch": 0.2003801351173338, + "grad_norm": 5.3617472648620605, + "learning_rate": 9.264394699325757e-05, + "loss": 1.9723, + "step": 2662 + }, + { + "epoch": 0.20045540939799394, + "grad_norm": 4.935393810272217, + "learning_rate": 9.263758075522619e-05, + "loss": 2.3144, + "step": 2663 + }, + { + "epoch": 0.2005306836786541, + "grad_norm": 4.61316442489624, + "learning_rate": 9.263121198251299e-05, + "loss": 2.2842, + "step": 2664 + }, + { + "epoch": 0.20060595795931424, + "grad_norm": 4.69996976852417, + "learning_rate": 9.262484067549656e-05, + "loss": 2.112, + "step": 2665 + }, + { + "epoch": 0.20068123223997442, + "grad_norm": 6.210440158843994, + "learning_rate": 9.261846683455565e-05, + "loss": 2.6443, + "step": 2666 + }, + { + "epoch": 0.20075650652063456, + "grad_norm": 7.379510879516602, + "learning_rate": 9.261209046006918e-05, + "loss": 2.1906, + "step": 2667 + }, + { + "epoch": 0.2008317808012947, + "grad_norm": 4.913089275360107, + "learning_rate": 9.260571155241619e-05, + "loss": 2.0496, + "step": 2668 + }, + { + "epoch": 0.20090705508195486, + "grad_norm": 4.121845245361328, + "learning_rate": 9.259933011197591e-05, + "loss": 2.1324, + "step": 2669 + }, + { + "epoch": 0.20098232936261504, + "grad_norm": 4.907847881317139, + "learning_rate": 9.259294613912767e-05, + "loss": 2.0812, + "step": 2670 + }, + { + "epoch": 0.20105760364327518, + "grad_norm": 5.408309459686279, + "learning_rate": 9.2586559634251e-05, + "loss": 2.1487, + "step": 2671 + }, + { + "epoch": 0.20113287792393533, + "grad_norm": 7.610551834106445, + "learning_rate": 9.258017059772556e-05, + "loss": 2.0575, + "step": 2672 + }, + { + "epoch": 0.2012081522045955, + "grad_norm": 4.74695348739624, + "learning_rate": 9.257377902993114e-05, + "loss": 2.0271, + "step": 2673 + }, + { + "epoch": 0.20128342648525566, + "grad_norm": 4.0414252281188965, + "learning_rate": 9.256738493124773e-05, + "loss": 2.1029, + "step": 2674 + }, + { + "epoch": 0.2013587007659158, + "grad_norm": 5.899327754974365, + "learning_rate": 9.256098830205542e-05, + "loss": 2.4646, + "step": 2675 + }, + { + "epoch": 0.20143397504657595, + "grad_norm": 6.295183181762695, + "learning_rate": 9.255458914273448e-05, + "loss": 2.1643, + "step": 2676 + }, + { + "epoch": 0.20150924932723613, + "grad_norm": 5.623414993286133, + "learning_rate": 9.254818745366531e-05, + "loss": 2.1055, + "step": 2677 + }, + { + "epoch": 0.20158452360789628, + "grad_norm": 7.370398998260498, + "learning_rate": 9.25417832352285e-05, + "loss": 2.3803, + "step": 2678 + }, + { + "epoch": 0.20165979788855642, + "grad_norm": 4.623668670654297, + "learning_rate": 9.253537648780473e-05, + "loss": 2.2194, + "step": 2679 + }, + { + "epoch": 0.20173507216921657, + "grad_norm": 5.763810634613037, + "learning_rate": 9.252896721177487e-05, + "loss": 1.9493, + "step": 2680 + }, + { + "epoch": 0.20181034644987675, + "grad_norm": 5.256918430328369, + "learning_rate": 9.252255540751994e-05, + "loss": 2.0534, + "step": 2681 + }, + { + "epoch": 0.2018856207305369, + "grad_norm": 5.890371322631836, + "learning_rate": 9.25161410754211e-05, + "loss": 2.0003, + "step": 2682 + }, + { + "epoch": 0.20196089501119704, + "grad_norm": 4.927006244659424, + "learning_rate": 9.250972421585968e-05, + "loss": 1.9434, + "step": 2683 + }, + { + "epoch": 0.2020361692918572, + "grad_norm": 5.063210487365723, + "learning_rate": 9.250330482921712e-05, + "loss": 2.4138, + "step": 2684 + }, + { + "epoch": 0.20211144357251737, + "grad_norm": 4.575560569763184, + "learning_rate": 9.249688291587504e-05, + "loss": 2.1506, + "step": 2685 + }, + { + "epoch": 0.20218671785317752, + "grad_norm": 5.4028120040893555, + "learning_rate": 9.249045847621521e-05, + "loss": 1.7757, + "step": 2686 + }, + { + "epoch": 0.20226199213383766, + "grad_norm": 4.79414701461792, + "learning_rate": 9.248403151061956e-05, + "loss": 1.8104, + "step": 2687 + }, + { + "epoch": 0.2023372664144978, + "grad_norm": 6.444667816162109, + "learning_rate": 9.247760201947011e-05, + "loss": 1.921, + "step": 2688 + }, + { + "epoch": 0.202412540695158, + "grad_norm": 3.928278684616089, + "learning_rate": 9.247117000314911e-05, + "loss": 1.902, + "step": 2689 + }, + { + "epoch": 0.20248781497581814, + "grad_norm": 4.457923412322998, + "learning_rate": 9.246473546203893e-05, + "loss": 1.8489, + "step": 2690 + }, + { + "epoch": 0.20256308925647828, + "grad_norm": 5.076704502105713, + "learning_rate": 9.245829839652205e-05, + "loss": 2.3135, + "step": 2691 + }, + { + "epoch": 0.20263836353713846, + "grad_norm": 4.466028690338135, + "learning_rate": 9.245185880698118e-05, + "loss": 2.2815, + "step": 2692 + }, + { + "epoch": 0.2027136378177986, + "grad_norm": 6.199206352233887, + "learning_rate": 9.24454166937991e-05, + "loss": 1.807, + "step": 2693 + }, + { + "epoch": 0.20278891209845876, + "grad_norm": 6.835103511810303, + "learning_rate": 9.243897205735878e-05, + "loss": 2.0051, + "step": 2694 + }, + { + "epoch": 0.2028641863791189, + "grad_norm": 4.338747024536133, + "learning_rate": 9.243252489804335e-05, + "loss": 2.0584, + "step": 2695 + }, + { + "epoch": 0.20293946065977908, + "grad_norm": 5.767490386962891, + "learning_rate": 9.242607521623606e-05, + "loss": 2.162, + "step": 2696 + }, + { + "epoch": 0.20301473494043923, + "grad_norm": 3.73659610748291, + "learning_rate": 9.241962301232035e-05, + "loss": 1.9434, + "step": 2697 + }, + { + "epoch": 0.20309000922109938, + "grad_norm": 9.43851375579834, + "learning_rate": 9.241316828667976e-05, + "loss": 2.0812, + "step": 2698 + }, + { + "epoch": 0.20316528350175952, + "grad_norm": 4.68175745010376, + "learning_rate": 9.2406711039698e-05, + "loss": 1.652, + "step": 2699 + }, + { + "epoch": 0.2032405577824197, + "grad_norm": 6.658591270446777, + "learning_rate": 9.240025127175896e-05, + "loss": 2.2021, + "step": 2700 + }, + { + "epoch": 0.20331583206307985, + "grad_norm": 6.5034871101379395, + "learning_rate": 9.239378898324664e-05, + "loss": 2.0113, + "step": 2701 + }, + { + "epoch": 0.20339110634374, + "grad_norm": 5.065489292144775, + "learning_rate": 9.238732417454521e-05, + "loss": 1.7716, + "step": 2702 + }, + { + "epoch": 0.20346638062440014, + "grad_norm": 6.082492828369141, + "learning_rate": 9.238085684603897e-05, + "loss": 1.9677, + "step": 2703 + }, + { + "epoch": 0.20354165490506032, + "grad_norm": 6.415153503417969, + "learning_rate": 9.23743869981124e-05, + "loss": 2.4266, + "step": 2704 + }, + { + "epoch": 0.20361692918572047, + "grad_norm": 6.274682521820068, + "learning_rate": 9.23679146311501e-05, + "loss": 2.0143, + "step": 2705 + }, + { + "epoch": 0.20369220346638062, + "grad_norm": 5.247891902923584, + "learning_rate": 9.236143974553686e-05, + "loss": 2.1716, + "step": 2706 + }, + { + "epoch": 0.2037674777470408, + "grad_norm": 4.783785343170166, + "learning_rate": 9.235496234165756e-05, + "loss": 1.9166, + "step": 2707 + }, + { + "epoch": 0.20384275202770094, + "grad_norm": 5.3051042556762695, + "learning_rate": 9.234848241989729e-05, + "loss": 1.9551, + "step": 2708 + }, + { + "epoch": 0.2039180263083611, + "grad_norm": 5.2551469802856445, + "learning_rate": 9.234199998064125e-05, + "loss": 1.6698, + "step": 2709 + }, + { + "epoch": 0.20399330058902124, + "grad_norm": 6.545422077178955, + "learning_rate": 9.23355150242748e-05, + "loss": 1.9037, + "step": 2710 + }, + { + "epoch": 0.2040685748696814, + "grad_norm": 5.051024436950684, + "learning_rate": 9.232902755118345e-05, + "loss": 1.9418, + "step": 2711 + }, + { + "epoch": 0.20414384915034156, + "grad_norm": 6.801575660705566, + "learning_rate": 9.232253756175288e-05, + "loss": 2.5586, + "step": 2712 + }, + { + "epoch": 0.2042191234310017, + "grad_norm": 5.310962200164795, + "learning_rate": 9.231604505636889e-05, + "loss": 1.9654, + "step": 2713 + }, + { + "epoch": 0.20429439771166186, + "grad_norm": 6.696695804595947, + "learning_rate": 9.230955003541743e-05, + "loss": 2.1892, + "step": 2714 + }, + { + "epoch": 0.20436967199232203, + "grad_norm": 4.305474758148193, + "learning_rate": 9.230305249928461e-05, + "loss": 2.3555, + "step": 2715 + }, + { + "epoch": 0.20444494627298218, + "grad_norm": 7.1067962646484375, + "learning_rate": 9.229655244835673e-05, + "loss": 1.7765, + "step": 2716 + }, + { + "epoch": 0.20452022055364233, + "grad_norm": 7.4718852043151855, + "learning_rate": 9.229004988302014e-05, + "loss": 1.7959, + "step": 2717 + }, + { + "epoch": 0.20459549483430248, + "grad_norm": 6.000840187072754, + "learning_rate": 9.228354480366146e-05, + "loss": 1.8983, + "step": 2718 + }, + { + "epoch": 0.20467076911496265, + "grad_norm": 7.680751323699951, + "learning_rate": 9.227703721066734e-05, + "loss": 2.5392, + "step": 2719 + }, + { + "epoch": 0.2047460433956228, + "grad_norm": 8.486054420471191, + "learning_rate": 9.227052710442468e-05, + "loss": 2.5128, + "step": 2720 + }, + { + "epoch": 0.20482131767628295, + "grad_norm": 6.617682456970215, + "learning_rate": 9.226401448532045e-05, + "loss": 1.9968, + "step": 2721 + }, + { + "epoch": 0.20489659195694312, + "grad_norm": 7.17568826675415, + "learning_rate": 9.225749935374184e-05, + "loss": 2.5921, + "step": 2722 + }, + { + "epoch": 0.20497186623760327, + "grad_norm": 5.48948860168457, + "learning_rate": 9.225098171007615e-05, + "loss": 2.0952, + "step": 2723 + }, + { + "epoch": 0.20504714051826342, + "grad_norm": 6.631126403808594, + "learning_rate": 9.224446155471083e-05, + "loss": 1.9357, + "step": 2724 + }, + { + "epoch": 0.20512241479892357, + "grad_norm": 9.109574317932129, + "learning_rate": 9.223793888803347e-05, + "loss": 2.4113, + "step": 2725 + }, + { + "epoch": 0.20519768907958374, + "grad_norm": 5.081326961517334, + "learning_rate": 9.223141371043184e-05, + "loss": 1.982, + "step": 2726 + }, + { + "epoch": 0.2052729633602439, + "grad_norm": 12.93472957611084, + "learning_rate": 9.222488602229385e-05, + "loss": 1.9383, + "step": 2727 + }, + { + "epoch": 0.20534823764090404, + "grad_norm": 19.428529739379883, + "learning_rate": 9.221835582400754e-05, + "loss": 2.0212, + "step": 2728 + }, + { + "epoch": 0.2054235119215642, + "grad_norm": 5.183816432952881, + "learning_rate": 9.221182311596112e-05, + "loss": 1.9894, + "step": 2729 + }, + { + "epoch": 0.20549878620222436, + "grad_norm": 8.503732681274414, + "learning_rate": 9.220528789854293e-05, + "loss": 2.2597, + "step": 2730 + }, + { + "epoch": 0.2055740604828845, + "grad_norm": 7.898542404174805, + "learning_rate": 9.219875017214146e-05, + "loss": 2.4169, + "step": 2731 + }, + { + "epoch": 0.20564933476354466, + "grad_norm": 6.033634185791016, + "learning_rate": 9.219220993714539e-05, + "loss": 2.4431, + "step": 2732 + }, + { + "epoch": 0.2057246090442048, + "grad_norm": 6.405065059661865, + "learning_rate": 9.21856671939435e-05, + "loss": 2.0821, + "step": 2733 + }, + { + "epoch": 0.20579988332486499, + "grad_norm": 8.12083625793457, + "learning_rate": 9.217912194292474e-05, + "loss": 2.0825, + "step": 2734 + }, + { + "epoch": 0.20587515760552513, + "grad_norm": 4.670567989349365, + "learning_rate": 9.21725741844782e-05, + "loss": 2.0697, + "step": 2735 + }, + { + "epoch": 0.20595043188618528, + "grad_norm": 6.682703018188477, + "learning_rate": 9.216602391899314e-05, + "loss": 1.8252, + "step": 2736 + }, + { + "epoch": 0.20602570616684543, + "grad_norm": 7.722905158996582, + "learning_rate": 9.215947114685895e-05, + "loss": 2.0024, + "step": 2737 + }, + { + "epoch": 0.2061009804475056, + "grad_norm": 7.832669258117676, + "learning_rate": 9.215291586846516e-05, + "loss": 2.0779, + "step": 2738 + }, + { + "epoch": 0.20617625472816575, + "grad_norm": 5.905942916870117, + "learning_rate": 9.214635808420147e-05, + "loss": 2.1217, + "step": 2739 + }, + { + "epoch": 0.2062515290088259, + "grad_norm": 6.355848789215088, + "learning_rate": 9.213979779445771e-05, + "loss": 2.0646, + "step": 2740 + }, + { + "epoch": 0.20632680328948608, + "grad_norm": 17.832256317138672, + "learning_rate": 9.213323499962392e-05, + "loss": 2.2214, + "step": 2741 + }, + { + "epoch": 0.20640207757014623, + "grad_norm": 8.87271499633789, + "learning_rate": 9.212666970009017e-05, + "loss": 2.4674, + "step": 2742 + }, + { + "epoch": 0.20647735185080637, + "grad_norm": 12.146018981933594, + "learning_rate": 9.21201018962468e-05, + "loss": 2.0848, + "step": 2743 + }, + { + "epoch": 0.20655262613146652, + "grad_norm": 39.741024017333984, + "learning_rate": 9.211353158848423e-05, + "loss": 2.5141, + "step": 2744 + }, + { + "epoch": 0.2066279004121267, + "grad_norm": 57.770042419433594, + "learning_rate": 9.210695877719303e-05, + "loss": 2.3473, + "step": 2745 + }, + { + "epoch": 0.20670317469278685, + "grad_norm": 5.470195770263672, + "learning_rate": 9.210038346276395e-05, + "loss": 1.7979, + "step": 2746 + }, + { + "epoch": 0.206778448973447, + "grad_norm": 6.125443458557129, + "learning_rate": 9.20938056455879e-05, + "loss": 1.8414, + "step": 2747 + }, + { + "epoch": 0.20685372325410714, + "grad_norm": 6.663171291351318, + "learning_rate": 9.208722532605584e-05, + "loss": 2.0653, + "step": 2748 + }, + { + "epoch": 0.20692899753476732, + "grad_norm": 3.868773937225342, + "learning_rate": 9.208064250455904e-05, + "loss": 2.2828, + "step": 2749 + }, + { + "epoch": 0.20700427181542747, + "grad_norm": 5.874432563781738, + "learning_rate": 9.207405718148876e-05, + "loss": 2.4096, + "step": 2750 + }, + { + "epoch": 0.2070795460960876, + "grad_norm": 6.231694221496582, + "learning_rate": 9.206746935723652e-05, + "loss": 2.2324, + "step": 2751 + }, + { + "epoch": 0.20715482037674776, + "grad_norm": 4.707183837890625, + "learning_rate": 9.206087903219391e-05, + "loss": 2.1788, + "step": 2752 + }, + { + "epoch": 0.20723009465740794, + "grad_norm": 6.024470806121826, + "learning_rate": 9.205428620675274e-05, + "loss": 2.1909, + "step": 2753 + }, + { + "epoch": 0.20730536893806809, + "grad_norm": 4.514256477355957, + "learning_rate": 9.204769088130493e-05, + "loss": 1.9345, + "step": 2754 + }, + { + "epoch": 0.20738064321872823, + "grad_norm": 6.8949151039123535, + "learning_rate": 9.204109305624253e-05, + "loss": 1.9622, + "step": 2755 + }, + { + "epoch": 0.2074559174993884, + "grad_norm": 6.394634246826172, + "learning_rate": 9.20344927319578e-05, + "loss": 2.2074, + "step": 2756 + }, + { + "epoch": 0.20753119178004856, + "grad_norm": 7.276010990142822, + "learning_rate": 9.202788990884306e-05, + "loss": 1.8705, + "step": 2757 + }, + { + "epoch": 0.2076064660607087, + "grad_norm": 13.49619197845459, + "learning_rate": 9.202128458729087e-05, + "loss": 1.8273, + "step": 2758 + }, + { + "epoch": 0.20768174034136885, + "grad_norm": 5.098809242248535, + "learning_rate": 9.201467676769389e-05, + "loss": 2.2117, + "step": 2759 + }, + { + "epoch": 0.20775701462202903, + "grad_norm": 8.781702995300293, + "learning_rate": 9.200806645044492e-05, + "loss": 2.2965, + "step": 2760 + }, + { + "epoch": 0.20783228890268918, + "grad_norm": 5.467831611633301, + "learning_rate": 9.200145363593694e-05, + "loss": 1.8527, + "step": 2761 + }, + { + "epoch": 0.20790756318334933, + "grad_norm": 8.295465469360352, + "learning_rate": 9.199483832456305e-05, + "loss": 2.1276, + "step": 2762 + }, + { + "epoch": 0.20798283746400947, + "grad_norm": 7.839022636413574, + "learning_rate": 9.198822051671651e-05, + "loss": 1.9388, + "step": 2763 + }, + { + "epoch": 0.20805811174466965, + "grad_norm": 6.7225213050842285, + "learning_rate": 9.198160021279076e-05, + "loss": 1.8316, + "step": 2764 + }, + { + "epoch": 0.2081333860253298, + "grad_norm": 6.108051300048828, + "learning_rate": 9.197497741317933e-05, + "loss": 1.8628, + "step": 2765 + }, + { + "epoch": 0.20820866030598995, + "grad_norm": 4.686515808105469, + "learning_rate": 9.196835211827593e-05, + "loss": 1.8158, + "step": 2766 + }, + { + "epoch": 0.2082839345866501, + "grad_norm": 4.935640811920166, + "learning_rate": 9.19617243284744e-05, + "loss": 2.0659, + "step": 2767 + }, + { + "epoch": 0.20835920886731027, + "grad_norm": 12.99338150024414, + "learning_rate": 9.195509404416878e-05, + "loss": 1.9038, + "step": 2768 + }, + { + "epoch": 0.20843448314797042, + "grad_norm": 6.222795486450195, + "learning_rate": 9.194846126575321e-05, + "loss": 2.2448, + "step": 2769 + }, + { + "epoch": 0.20850975742863057, + "grad_norm": 6.961162567138672, + "learning_rate": 9.194182599362198e-05, + "loss": 1.8268, + "step": 2770 + }, + { + "epoch": 0.20858503170929074, + "grad_norm": 5.859527111053467, + "learning_rate": 9.193518822816952e-05, + "loss": 2.1144, + "step": 2771 + }, + { + "epoch": 0.2086603059899509, + "grad_norm": 6.363965034484863, + "learning_rate": 9.192854796979045e-05, + "loss": 1.9724, + "step": 2772 + }, + { + "epoch": 0.20873558027061104, + "grad_norm": 7.075411319732666, + "learning_rate": 9.192190521887951e-05, + "loss": 1.9863, + "step": 2773 + }, + { + "epoch": 0.20881085455127119, + "grad_norm": 6.07334041595459, + "learning_rate": 9.191525997583161e-05, + "loss": 2.043, + "step": 2774 + }, + { + "epoch": 0.20888612883193136, + "grad_norm": 6.767756462097168, + "learning_rate": 9.190861224104175e-05, + "loss": 2.3918, + "step": 2775 + }, + { + "epoch": 0.2089614031125915, + "grad_norm": 6.749766826629639, + "learning_rate": 9.190196201490516e-05, + "loss": 1.9757, + "step": 2776 + }, + { + "epoch": 0.20903667739325166, + "grad_norm": 5.400588512420654, + "learning_rate": 9.189530929781715e-05, + "loss": 2.1359, + "step": 2777 + }, + { + "epoch": 0.2091119516739118, + "grad_norm": 6.137955188751221, + "learning_rate": 9.18886540901732e-05, + "loss": 2.0881, + "step": 2778 + }, + { + "epoch": 0.20918722595457198, + "grad_norm": 6.915503025054932, + "learning_rate": 9.188199639236897e-05, + "loss": 2.0054, + "step": 2779 + }, + { + "epoch": 0.20926250023523213, + "grad_norm": 4.541962146759033, + "learning_rate": 9.187533620480023e-05, + "loss": 1.9086, + "step": 2780 + }, + { + "epoch": 0.20933777451589228, + "grad_norm": 4.5916008949279785, + "learning_rate": 9.18686735278629e-05, + "loss": 1.8054, + "step": 2781 + }, + { + "epoch": 0.20941304879655243, + "grad_norm": 4.811084747314453, + "learning_rate": 9.186200836195307e-05, + "loss": 1.9144, + "step": 2782 + }, + { + "epoch": 0.2094883230772126, + "grad_norm": 4.080664157867432, + "learning_rate": 9.185534070746695e-05, + "loss": 1.921, + "step": 2783 + }, + { + "epoch": 0.20956359735787275, + "grad_norm": 5.359609603881836, + "learning_rate": 9.184867056480093e-05, + "loss": 1.9487, + "step": 2784 + }, + { + "epoch": 0.2096388716385329, + "grad_norm": 4.728886127471924, + "learning_rate": 9.184199793435152e-05, + "loss": 1.9044, + "step": 2785 + }, + { + "epoch": 0.20971414591919307, + "grad_norm": 7.131093978881836, + "learning_rate": 9.183532281651539e-05, + "loss": 1.7852, + "step": 2786 + }, + { + "epoch": 0.20978942019985322, + "grad_norm": 5.858365058898926, + "learning_rate": 9.182864521168936e-05, + "loss": 1.8538, + "step": 2787 + }, + { + "epoch": 0.20986469448051337, + "grad_norm": 5.444089889526367, + "learning_rate": 9.182196512027039e-05, + "loss": 2.0137, + "step": 2788 + }, + { + "epoch": 0.20993996876117352, + "grad_norm": 6.798290252685547, + "learning_rate": 9.181528254265559e-05, + "loss": 2.0721, + "step": 2789 + }, + { + "epoch": 0.2100152430418337, + "grad_norm": 4.014065265655518, + "learning_rate": 9.180859747924223e-05, + "loss": 1.8177, + "step": 2790 + }, + { + "epoch": 0.21009051732249384, + "grad_norm": 5.252590656280518, + "learning_rate": 9.180190993042772e-05, + "loss": 1.7736, + "step": 2791 + }, + { + "epoch": 0.210165791603154, + "grad_norm": 4.12830114364624, + "learning_rate": 9.179521989660957e-05, + "loss": 1.9409, + "step": 2792 + }, + { + "epoch": 0.21024106588381414, + "grad_norm": 6.2386088371276855, + "learning_rate": 9.178852737818557e-05, + "loss": 1.7453, + "step": 2793 + }, + { + "epoch": 0.2103163401644743, + "grad_norm": 5.434654235839844, + "learning_rate": 9.178183237555349e-05, + "loss": 2.2594, + "step": 2794 + }, + { + "epoch": 0.21039161444513446, + "grad_norm": 4.272034645080566, + "learning_rate": 9.177513488911138e-05, + "loss": 1.6573, + "step": 2795 + }, + { + "epoch": 0.2104668887257946, + "grad_norm": 4.0649614334106445, + "learning_rate": 9.176843491925735e-05, + "loss": 1.8898, + "step": 2796 + }, + { + "epoch": 0.21054216300645476, + "grad_norm": 5.4743266105651855, + "learning_rate": 9.176173246638972e-05, + "loss": 2.7306, + "step": 2797 + }, + { + "epoch": 0.21061743728711493, + "grad_norm": 7.000906944274902, + "learning_rate": 9.175502753090692e-05, + "loss": 2.0197, + "step": 2798 + }, + { + "epoch": 0.21069271156777508, + "grad_norm": 6.028274059295654, + "learning_rate": 9.174832011320755e-05, + "loss": 2.2337, + "step": 2799 + }, + { + "epoch": 0.21076798584843523, + "grad_norm": 5.796188831329346, + "learning_rate": 9.174161021369033e-05, + "loss": 1.891, + "step": 2800 + }, + { + "epoch": 0.21084326012909538, + "grad_norm": 6.9204421043396, + "learning_rate": 9.173489783275415e-05, + "loss": 2.1039, + "step": 2801 + }, + { + "epoch": 0.21091853440975555, + "grad_norm": 5.936499118804932, + "learning_rate": 9.172818297079806e-05, + "loss": 1.8546, + "step": 2802 + }, + { + "epoch": 0.2109938086904157, + "grad_norm": 6.919144630432129, + "learning_rate": 9.172146562822121e-05, + "loss": 1.7803, + "step": 2803 + }, + { + "epoch": 0.21106908297107585, + "grad_norm": 5.432614803314209, + "learning_rate": 9.171474580542295e-05, + "loss": 2.3649, + "step": 2804 + }, + { + "epoch": 0.21114435725173603, + "grad_norm": 6.412766456604004, + "learning_rate": 9.170802350280274e-05, + "loss": 1.9449, + "step": 2805 + }, + { + "epoch": 0.21121963153239617, + "grad_norm": 4.37930154800415, + "learning_rate": 9.170129872076021e-05, + "loss": 2.2373, + "step": 2806 + }, + { + "epoch": 0.21129490581305632, + "grad_norm": 5.927270889282227, + "learning_rate": 9.169457145969512e-05, + "loss": 2.0682, + "step": 2807 + }, + { + "epoch": 0.21137018009371647, + "grad_norm": 5.18697452545166, + "learning_rate": 9.168784172000739e-05, + "loss": 2.26, + "step": 2808 + }, + { + "epoch": 0.21144545437437665, + "grad_norm": 5.354100704193115, + "learning_rate": 9.168110950209709e-05, + "loss": 2.6103, + "step": 2809 + }, + { + "epoch": 0.2115207286550368, + "grad_norm": 5.394639015197754, + "learning_rate": 9.167437480636443e-05, + "loss": 2.2797, + "step": 2810 + }, + { + "epoch": 0.21159600293569694, + "grad_norm": 7.1058573722839355, + "learning_rate": 9.166763763320976e-05, + "loss": 2.0719, + "step": 2811 + }, + { + "epoch": 0.2116712772163571, + "grad_norm": 8.41796588897705, + "learning_rate": 9.16608979830336e-05, + "loss": 1.9862, + "step": 2812 + }, + { + "epoch": 0.21174655149701727, + "grad_norm": 6.795712471008301, + "learning_rate": 9.165415585623658e-05, + "loss": 1.7848, + "step": 2813 + }, + { + "epoch": 0.2118218257776774, + "grad_norm": 5.602150917053223, + "learning_rate": 9.164741125321952e-05, + "loss": 2.0413, + "step": 2814 + }, + { + "epoch": 0.21189710005833756, + "grad_norm": 5.891603469848633, + "learning_rate": 9.164066417438335e-05, + "loss": 1.9109, + "step": 2815 + }, + { + "epoch": 0.2119723743389977, + "grad_norm": 4.621806621551514, + "learning_rate": 9.16339146201292e-05, + "loss": 2.1, + "step": 2816 + }, + { + "epoch": 0.21204764861965789, + "grad_norm": 7.511584281921387, + "learning_rate": 9.162716259085827e-05, + "loss": 2.1378, + "step": 2817 + }, + { + "epoch": 0.21212292290031803, + "grad_norm": 4.577740669250488, + "learning_rate": 9.162040808697196e-05, + "loss": 1.9146, + "step": 2818 + }, + { + "epoch": 0.21219819718097818, + "grad_norm": 4.936798095703125, + "learning_rate": 9.161365110887181e-05, + "loss": 1.7413, + "step": 2819 + }, + { + "epoch": 0.21227347146163836, + "grad_norm": 5.75478982925415, + "learning_rate": 9.160689165695952e-05, + "loss": 2.1173, + "step": 2820 + }, + { + "epoch": 0.2123487457422985, + "grad_norm": 3.4062232971191406, + "learning_rate": 9.16001297316369e-05, + "loss": 1.8037, + "step": 2821 + }, + { + "epoch": 0.21242402002295865, + "grad_norm": 5.263576984405518, + "learning_rate": 9.159336533330594e-05, + "loss": 2.0905, + "step": 2822 + }, + { + "epoch": 0.2124992943036188, + "grad_norm": 6.029422760009766, + "learning_rate": 9.158659846236875e-05, + "loss": 2.1237, + "step": 2823 + }, + { + "epoch": 0.21257456858427898, + "grad_norm": 6.338613986968994, + "learning_rate": 9.157982911922759e-05, + "loss": 2.1129, + "step": 2824 + }, + { + "epoch": 0.21264984286493913, + "grad_norm": 6.587202548980713, + "learning_rate": 9.15730573042849e-05, + "loss": 2.1206, + "step": 2825 + }, + { + "epoch": 0.21272511714559927, + "grad_norm": 6.677863121032715, + "learning_rate": 9.156628301794325e-05, + "loss": 2.6606, + "step": 2826 + }, + { + "epoch": 0.21280039142625942, + "grad_norm": 5.736962795257568, + "learning_rate": 9.155950626060535e-05, + "loss": 1.8916, + "step": 2827 + }, + { + "epoch": 0.2128756657069196, + "grad_norm": 7.5251665115356445, + "learning_rate": 9.155272703267403e-05, + "loss": 1.9961, + "step": 2828 + }, + { + "epoch": 0.21295093998757975, + "grad_norm": 5.149869441986084, + "learning_rate": 9.154594533455232e-05, + "loss": 2.0102, + "step": 2829 + }, + { + "epoch": 0.2130262142682399, + "grad_norm": 5.805475234985352, + "learning_rate": 9.153916116664338e-05, + "loss": 1.9, + "step": 2830 + }, + { + "epoch": 0.21310148854890004, + "grad_norm": 4.864974021911621, + "learning_rate": 9.153237452935049e-05, + "loss": 2.1139, + "step": 2831 + }, + { + "epoch": 0.21317676282956022, + "grad_norm": 6.045137405395508, + "learning_rate": 9.15255854230771e-05, + "loss": 2.1251, + "step": 2832 + }, + { + "epoch": 0.21325203711022037, + "grad_norm": 6.3861494064331055, + "learning_rate": 9.151879384822682e-05, + "loss": 2.2734, + "step": 2833 + }, + { + "epoch": 0.21332731139088051, + "grad_norm": 9.353327751159668, + "learning_rate": 9.151199980520336e-05, + "loss": 2.1349, + "step": 2834 + }, + { + "epoch": 0.2134025856715407, + "grad_norm": 4.276968479156494, + "learning_rate": 9.150520329441063e-05, + "loss": 1.9429, + "step": 2835 + }, + { + "epoch": 0.21347785995220084, + "grad_norm": 6.518537998199463, + "learning_rate": 9.149840431625266e-05, + "loss": 2.1551, + "step": 2836 + }, + { + "epoch": 0.21355313423286099, + "grad_norm": 5.258233070373535, + "learning_rate": 9.149160287113363e-05, + "loss": 2.2718, + "step": 2837 + }, + { + "epoch": 0.21362840851352113, + "grad_norm": 6.407668113708496, + "learning_rate": 9.148479895945784e-05, + "loss": 2.4814, + "step": 2838 + }, + { + "epoch": 0.2137036827941813, + "grad_norm": 5.157773017883301, + "learning_rate": 9.147799258162981e-05, + "loss": 1.9935, + "step": 2839 + }, + { + "epoch": 0.21377895707484146, + "grad_norm": 7.175652980804443, + "learning_rate": 9.147118373805413e-05, + "loss": 1.9145, + "step": 2840 + }, + { + "epoch": 0.2138542313555016, + "grad_norm": 5.1775031089782715, + "learning_rate": 9.146437242913556e-05, + "loss": 1.8842, + "step": 2841 + }, + { + "epoch": 0.21392950563616175, + "grad_norm": 6.460962772369385, + "learning_rate": 9.145755865527904e-05, + "loss": 2.2604, + "step": 2842 + }, + { + "epoch": 0.21400477991682193, + "grad_norm": 6.818923473358154, + "learning_rate": 9.145074241688959e-05, + "loss": 1.7658, + "step": 2843 + }, + { + "epoch": 0.21408005419748208, + "grad_norm": 24.45903205871582, + "learning_rate": 9.144392371437245e-05, + "loss": 2.5371, + "step": 2844 + }, + { + "epoch": 0.21415532847814223, + "grad_norm": 5.668989658355713, + "learning_rate": 9.143710254813296e-05, + "loss": 2.1553, + "step": 2845 + }, + { + "epoch": 0.21423060275880237, + "grad_norm": 5.1732659339904785, + "learning_rate": 9.143027891857663e-05, + "loss": 2.3208, + "step": 2846 + }, + { + "epoch": 0.21430587703946255, + "grad_norm": 5.166064262390137, + "learning_rate": 9.142345282610908e-05, + "loss": 1.9757, + "step": 2847 + }, + { + "epoch": 0.2143811513201227, + "grad_norm": 5.604634761810303, + "learning_rate": 9.141662427113613e-05, + "loss": 2.1508, + "step": 2848 + }, + { + "epoch": 0.21445642560078285, + "grad_norm": 4.952036380767822, + "learning_rate": 9.140979325406369e-05, + "loss": 2.147, + "step": 2849 + }, + { + "epoch": 0.214531699881443, + "grad_norm": 5.10736083984375, + "learning_rate": 9.140295977529788e-05, + "loss": 1.9929, + "step": 2850 + }, + { + "epoch": 0.21460697416210317, + "grad_norm": 5.1326823234558105, + "learning_rate": 9.139612383524489e-05, + "loss": 1.8989, + "step": 2851 + }, + { + "epoch": 0.21468224844276332, + "grad_norm": 6.51033878326416, + "learning_rate": 9.138928543431113e-05, + "loss": 2.8586, + "step": 2852 + }, + { + "epoch": 0.21475752272342347, + "grad_norm": 5.176274299621582, + "learning_rate": 9.138244457290311e-05, + "loss": 1.7273, + "step": 2853 + }, + { + "epoch": 0.21483279700408364, + "grad_norm": 6.623416423797607, + "learning_rate": 9.13756012514275e-05, + "loss": 2.0693, + "step": 2854 + }, + { + "epoch": 0.2149080712847438, + "grad_norm": 5.0646162033081055, + "learning_rate": 9.13687554702911e-05, + "loss": 1.8187, + "step": 2855 + }, + { + "epoch": 0.21498334556540394, + "grad_norm": 5.770161151885986, + "learning_rate": 9.13619072299009e-05, + "loss": 2.0489, + "step": 2856 + }, + { + "epoch": 0.2150586198460641, + "grad_norm": 5.196555137634277, + "learning_rate": 9.135505653066399e-05, + "loss": 2.0653, + "step": 2857 + }, + { + "epoch": 0.21513389412672426, + "grad_norm": 4.448337078094482, + "learning_rate": 9.134820337298763e-05, + "loss": 2.1849, + "step": 2858 + }, + { + "epoch": 0.2152091684073844, + "grad_norm": 11.007974624633789, + "learning_rate": 9.134134775727922e-05, + "loss": 2.2417, + "step": 2859 + }, + { + "epoch": 0.21528444268804456, + "grad_norm": 9.509227752685547, + "learning_rate": 9.133448968394631e-05, + "loss": 2.1448, + "step": 2860 + }, + { + "epoch": 0.2153597169687047, + "grad_norm": 5.097561359405518, + "learning_rate": 9.13276291533966e-05, + "loss": 2.0013, + "step": 2861 + }, + { + "epoch": 0.21543499124936488, + "grad_norm": 6.589043140411377, + "learning_rate": 9.132076616603791e-05, + "loss": 1.8597, + "step": 2862 + }, + { + "epoch": 0.21551026553002503, + "grad_norm": 4.653834819793701, + "learning_rate": 9.131390072227823e-05, + "loss": 2.3103, + "step": 2863 + }, + { + "epoch": 0.21558553981068518, + "grad_norm": 6.047309875488281, + "learning_rate": 9.13070328225257e-05, + "loss": 2.21, + "step": 2864 + }, + { + "epoch": 0.21566081409134533, + "grad_norm": 9.65843391418457, + "learning_rate": 9.13001624671886e-05, + "loss": 2.2719, + "step": 2865 + }, + { + "epoch": 0.2157360883720055, + "grad_norm": 6.0782365798950195, + "learning_rate": 9.129328965667534e-05, + "loss": 1.7926, + "step": 2866 + }, + { + "epoch": 0.21581136265266565, + "grad_norm": 5.803018569946289, + "learning_rate": 9.128641439139448e-05, + "loss": 1.7579, + "step": 2867 + }, + { + "epoch": 0.2158866369333258, + "grad_norm": 6.110564708709717, + "learning_rate": 9.127953667175476e-05, + "loss": 1.8037, + "step": 2868 + }, + { + "epoch": 0.21596191121398597, + "grad_norm": 6.716493129730225, + "learning_rate": 9.127265649816504e-05, + "loss": 2.1532, + "step": 2869 + }, + { + "epoch": 0.21603718549464612, + "grad_norm": 4.54202127456665, + "learning_rate": 9.12657738710343e-05, + "loss": 1.7375, + "step": 2870 + }, + { + "epoch": 0.21611245977530627, + "grad_norm": 5.8517255783081055, + "learning_rate": 9.12588887907717e-05, + "loss": 1.903, + "step": 2871 + }, + { + "epoch": 0.21618773405596642, + "grad_norm": 7.2497148513793945, + "learning_rate": 9.125200125778657e-05, + "loss": 2.1568, + "step": 2872 + }, + { + "epoch": 0.2162630083366266, + "grad_norm": 6.089284420013428, + "learning_rate": 9.124511127248832e-05, + "loss": 2.3233, + "step": 2873 + }, + { + "epoch": 0.21633828261728674, + "grad_norm": 4.993877410888672, + "learning_rate": 9.123821883528653e-05, + "loss": 2.0073, + "step": 2874 + }, + { + "epoch": 0.2164135568979469, + "grad_norm": 6.940545558929443, + "learning_rate": 9.123132394659098e-05, + "loss": 2.0568, + "step": 2875 + }, + { + "epoch": 0.21648883117860704, + "grad_norm": 14.741281509399414, + "learning_rate": 9.122442660681153e-05, + "loss": 2.1576, + "step": 2876 + }, + { + "epoch": 0.21656410545926721, + "grad_norm": 12.624621391296387, + "learning_rate": 9.12175268163582e-05, + "loss": 2.0387, + "step": 2877 + }, + { + "epoch": 0.21663937973992736, + "grad_norm": 17.01222038269043, + "learning_rate": 9.121062457564115e-05, + "loss": 2.0933, + "step": 2878 + }, + { + "epoch": 0.2167146540205875, + "grad_norm": 41.06978988647461, + "learning_rate": 9.120371988507073e-05, + "loss": 2.0585, + "step": 2879 + }, + { + "epoch": 0.21678992830124766, + "grad_norm": 72.18241119384766, + "learning_rate": 9.11968127450574e-05, + "loss": 4.1907, + "step": 2880 + }, + { + "epoch": 0.21686520258190783, + "grad_norm": 26.403972625732422, + "learning_rate": 9.118990315601175e-05, + "loss": 2.4191, + "step": 2881 + }, + { + "epoch": 0.21694047686256798, + "grad_norm": 17.679353713989258, + "learning_rate": 9.118299111834454e-05, + "loss": 2.2037, + "step": 2882 + }, + { + "epoch": 0.21701575114322813, + "grad_norm": 8.343015670776367, + "learning_rate": 9.117607663246668e-05, + "loss": 2.0227, + "step": 2883 + }, + { + "epoch": 0.2170910254238883, + "grad_norm": 9.201031684875488, + "learning_rate": 9.116915969878921e-05, + "loss": 2.3684, + "step": 2884 + }, + { + "epoch": 0.21716629970454845, + "grad_norm": 12.270792961120605, + "learning_rate": 9.116224031772331e-05, + "loss": 2.2878, + "step": 2885 + }, + { + "epoch": 0.2172415739852086, + "grad_norm": 9.179665565490723, + "learning_rate": 9.115531848968035e-05, + "loss": 1.8538, + "step": 2886 + }, + { + "epoch": 0.21731684826586875, + "grad_norm": 7.2675604820251465, + "learning_rate": 9.114839421507179e-05, + "loss": 1.9346, + "step": 2887 + }, + { + "epoch": 0.21739212254652893, + "grad_norm": 6.593596458435059, + "learning_rate": 9.114146749430926e-05, + "loss": 2.4198, + "step": 2888 + }, + { + "epoch": 0.21746739682718907, + "grad_norm": 11.048501014709473, + "learning_rate": 9.113453832780453e-05, + "loss": 2.6034, + "step": 2889 + }, + { + "epoch": 0.21754267110784922, + "grad_norm": 7.089968204498291, + "learning_rate": 9.112760671596953e-05, + "loss": 1.9015, + "step": 2890 + }, + { + "epoch": 0.21761794538850937, + "grad_norm": 6.570745468139648, + "learning_rate": 9.112067265921633e-05, + "loss": 2.0399, + "step": 2891 + }, + { + "epoch": 0.21769321966916955, + "grad_norm": 5.847081661224365, + "learning_rate": 9.111373615795711e-05, + "loss": 2.1214, + "step": 2892 + }, + { + "epoch": 0.2177684939498297, + "grad_norm": 12.5921630859375, + "learning_rate": 9.110679721260424e-05, + "loss": 2.4735, + "step": 2893 + }, + { + "epoch": 0.21784376823048984, + "grad_norm": 6.360659599304199, + "learning_rate": 9.109985582357024e-05, + "loss": 2.0158, + "step": 2894 + }, + { + "epoch": 0.21791904251115, + "grad_norm": 5.861877918243408, + "learning_rate": 9.109291199126774e-05, + "loss": 2.1871, + "step": 2895 + }, + { + "epoch": 0.21799431679181017, + "grad_norm": 5.769315242767334, + "learning_rate": 9.108596571610952e-05, + "loss": 1.9332, + "step": 2896 + }, + { + "epoch": 0.21806959107247031, + "grad_norm": 8.93451976776123, + "learning_rate": 9.107901699850853e-05, + "loss": 2.1849, + "step": 2897 + }, + { + "epoch": 0.21814486535313046, + "grad_norm": 5.1399970054626465, + "learning_rate": 9.107206583887786e-05, + "loss": 2.1824, + "step": 2898 + }, + { + "epoch": 0.2182201396337906, + "grad_norm": 5.9394636154174805, + "learning_rate": 9.106511223763073e-05, + "loss": 2.0504, + "step": 2899 + }, + { + "epoch": 0.2182954139144508, + "grad_norm": 5.526357173919678, + "learning_rate": 9.105815619518048e-05, + "loss": 2.075, + "step": 2900 + }, + { + "epoch": 0.21837068819511093, + "grad_norm": 5.283119201660156, + "learning_rate": 9.105119771194066e-05, + "loss": 1.9623, + "step": 2901 + }, + { + "epoch": 0.21844596247577108, + "grad_norm": 5.233930587768555, + "learning_rate": 9.104423678832494e-05, + "loss": 2.1217, + "step": 2902 + }, + { + "epoch": 0.21852123675643126, + "grad_norm": 6.292283535003662, + "learning_rate": 9.103727342474711e-05, + "loss": 1.7759, + "step": 2903 + }, + { + "epoch": 0.2185965110370914, + "grad_norm": 6.08415412902832, + "learning_rate": 9.103030762162112e-05, + "loss": 1.9917, + "step": 2904 + }, + { + "epoch": 0.21867178531775155, + "grad_norm": 6.479919910430908, + "learning_rate": 9.102333937936107e-05, + "loss": 2.191, + "step": 2905 + }, + { + "epoch": 0.2187470595984117, + "grad_norm": 5.013296604156494, + "learning_rate": 9.101636869838119e-05, + "loss": 2.0969, + "step": 2906 + }, + { + "epoch": 0.21882233387907188, + "grad_norm": 4.301442623138428, + "learning_rate": 9.10093955790959e-05, + "loss": 2.1664, + "step": 2907 + }, + { + "epoch": 0.21889760815973203, + "grad_norm": 5.953193187713623, + "learning_rate": 9.10024200219197e-05, + "loss": 1.9972, + "step": 2908 + }, + { + "epoch": 0.21897288244039217, + "grad_norm": 8.041834831237793, + "learning_rate": 9.099544202726728e-05, + "loss": 1.8265, + "step": 2909 + }, + { + "epoch": 0.21904815672105232, + "grad_norm": 5.585407733917236, + "learning_rate": 9.098846159555346e-05, + "loss": 2.477, + "step": 2910 + }, + { + "epoch": 0.2191234310017125, + "grad_norm": 3.5935590267181396, + "learning_rate": 9.098147872719321e-05, + "loss": 2.0636, + "step": 2911 + }, + { + "epoch": 0.21919870528237265, + "grad_norm": 4.802414417266846, + "learning_rate": 9.097449342260165e-05, + "loss": 2.1284, + "step": 2912 + }, + { + "epoch": 0.2192739795630328, + "grad_norm": 9.175202369689941, + "learning_rate": 9.0967505682194e-05, + "loss": 2.2005, + "step": 2913 + }, + { + "epoch": 0.21934925384369294, + "grad_norm": 6.273319244384766, + "learning_rate": 9.096051550638572e-05, + "loss": 1.8883, + "step": 2914 + }, + { + "epoch": 0.21942452812435312, + "grad_norm": 3.694929599761963, + "learning_rate": 9.095352289559228e-05, + "loss": 1.8947, + "step": 2915 + }, + { + "epoch": 0.21949980240501327, + "grad_norm": 6.053046703338623, + "learning_rate": 9.094652785022945e-05, + "loss": 1.9294, + "step": 2916 + }, + { + "epoch": 0.21957507668567341, + "grad_norm": 3.9093520641326904, + "learning_rate": 9.093953037071301e-05, + "loss": 1.888, + "step": 2917 + }, + { + "epoch": 0.2196503509663336, + "grad_norm": 10.43465805053711, + "learning_rate": 9.093253045745895e-05, + "loss": 1.9106, + "step": 2918 + }, + { + "epoch": 0.21972562524699374, + "grad_norm": 3.444300889968872, + "learning_rate": 9.092552811088342e-05, + "loss": 2.305, + "step": 2919 + }, + { + "epoch": 0.2198008995276539, + "grad_norm": 5.4698896408081055, + "learning_rate": 9.091852333140267e-05, + "loss": 1.8245, + "step": 2920 + }, + { + "epoch": 0.21987617380831403, + "grad_norm": 5.371891975402832, + "learning_rate": 9.091151611943312e-05, + "loss": 1.9341, + "step": 2921 + }, + { + "epoch": 0.2199514480889742, + "grad_norm": 4.319827556610107, + "learning_rate": 9.09045064753913e-05, + "loss": 1.7098, + "step": 2922 + }, + { + "epoch": 0.22002672236963436, + "grad_norm": 4.167977809906006, + "learning_rate": 9.089749439969398e-05, + "loss": 1.9787, + "step": 2923 + }, + { + "epoch": 0.2201019966502945, + "grad_norm": 6.262346267700195, + "learning_rate": 9.089047989275793e-05, + "loss": 1.8187, + "step": 2924 + }, + { + "epoch": 0.22017727093095465, + "grad_norm": 4.242522716522217, + "learning_rate": 9.088346295500019e-05, + "loss": 2.1949, + "step": 2925 + }, + { + "epoch": 0.22025254521161483, + "grad_norm": 5.393512725830078, + "learning_rate": 9.087644358683789e-05, + "loss": 2.2597, + "step": 2926 + }, + { + "epoch": 0.22032781949227498, + "grad_norm": 5.510080337524414, + "learning_rate": 9.086942178868829e-05, + "loss": 1.92, + "step": 2927 + }, + { + "epoch": 0.22040309377293513, + "grad_norm": 5.782841205596924, + "learning_rate": 9.086239756096886e-05, + "loss": 1.945, + "step": 2928 + }, + { + "epoch": 0.22047836805359528, + "grad_norm": 3.824327230453491, + "learning_rate": 9.085537090409712e-05, + "loss": 2.0536, + "step": 2929 + }, + { + "epoch": 0.22055364233425545, + "grad_norm": 4.9918365478515625, + "learning_rate": 9.084834181849081e-05, + "loss": 1.9754, + "step": 2930 + }, + { + "epoch": 0.2206289166149156, + "grad_norm": 5.566911220550537, + "learning_rate": 9.08413103045678e-05, + "loss": 1.9961, + "step": 2931 + }, + { + "epoch": 0.22070419089557575, + "grad_norm": 8.793004989624023, + "learning_rate": 9.083427636274606e-05, + "loss": 2.1145, + "step": 2932 + }, + { + "epoch": 0.22077946517623592, + "grad_norm": 6.476354598999023, + "learning_rate": 9.082723999344377e-05, + "loss": 2.3145, + "step": 2933 + }, + { + "epoch": 0.22085473945689607, + "grad_norm": 5.027369499206543, + "learning_rate": 9.082020119707921e-05, + "loss": 2.2283, + "step": 2934 + }, + { + "epoch": 0.22093001373755622, + "grad_norm": 5.488105773925781, + "learning_rate": 9.08131599740708e-05, + "loss": 1.9099, + "step": 2935 + }, + { + "epoch": 0.22100528801821637, + "grad_norm": 4.533041000366211, + "learning_rate": 9.080611632483714e-05, + "loss": 1.7026, + "step": 2936 + }, + { + "epoch": 0.22108056229887654, + "grad_norm": 6.061397075653076, + "learning_rate": 9.079907024979696e-05, + "loss": 2.3944, + "step": 2937 + }, + { + "epoch": 0.2211558365795367, + "grad_norm": 7.901887893676758, + "learning_rate": 9.079202174936912e-05, + "loss": 2.2042, + "step": 2938 + }, + { + "epoch": 0.22123111086019684, + "grad_norm": 4.920134544372559, + "learning_rate": 9.078497082397262e-05, + "loss": 2.2818, + "step": 2939 + }, + { + "epoch": 0.221306385140857, + "grad_norm": 6.0166015625, + "learning_rate": 9.077791747402664e-05, + "loss": 2.0055, + "step": 2940 + }, + { + "epoch": 0.22138165942151716, + "grad_norm": 4.905442714691162, + "learning_rate": 9.077086169995048e-05, + "loss": 1.8066, + "step": 2941 + }, + { + "epoch": 0.2214569337021773, + "grad_norm": 7.7746262550354, + "learning_rate": 9.076380350216355e-05, + "loss": 2.312, + "step": 2942 + }, + { + "epoch": 0.22153220798283746, + "grad_norm": 8.243760108947754, + "learning_rate": 9.075674288108549e-05, + "loss": 2.2926, + "step": 2943 + }, + { + "epoch": 0.2216074822634976, + "grad_norm": 5.692325592041016, + "learning_rate": 9.0749679837136e-05, + "loss": 2.1096, + "step": 2944 + }, + { + "epoch": 0.22168275654415778, + "grad_norm": 4.68093729019165, + "learning_rate": 9.074261437073497e-05, + "loss": 2.1276, + "step": 2945 + }, + { + "epoch": 0.22175803082481793, + "grad_norm": 6.712759971618652, + "learning_rate": 9.073554648230241e-05, + "loss": 1.899, + "step": 2946 + }, + { + "epoch": 0.22183330510547808, + "grad_norm": 5.239917278289795, + "learning_rate": 9.07284761722585e-05, + "loss": 2.0195, + "step": 2947 + }, + { + "epoch": 0.22190857938613825, + "grad_norm": 5.0789475440979, + "learning_rate": 9.072140344102355e-05, + "loss": 2.0013, + "step": 2948 + }, + { + "epoch": 0.2219838536667984, + "grad_norm": 5.248326301574707, + "learning_rate": 9.0714328289018e-05, + "loss": 1.9019, + "step": 2949 + }, + { + "epoch": 0.22205912794745855, + "grad_norm": 7.704656600952148, + "learning_rate": 9.070725071666248e-05, + "loss": 1.9397, + "step": 2950 + }, + { + "epoch": 0.2221344022281187, + "grad_norm": 4.7295403480529785, + "learning_rate": 9.070017072437769e-05, + "loss": 1.8184, + "step": 2951 + }, + { + "epoch": 0.22220967650877888, + "grad_norm": 9.1859769821167, + "learning_rate": 9.069308831258452e-05, + "loss": 2.281, + "step": 2952 + }, + { + "epoch": 0.22228495078943902, + "grad_norm": 6.104687213897705, + "learning_rate": 9.068600348170401e-05, + "loss": 2.0601, + "step": 2953 + }, + { + "epoch": 0.22236022507009917, + "grad_norm": 7.116252899169922, + "learning_rate": 9.067891623215737e-05, + "loss": 2.1145, + "step": 2954 + }, + { + "epoch": 0.22243549935075932, + "grad_norm": 6.117179870605469, + "learning_rate": 9.067182656436585e-05, + "loss": 2.0234, + "step": 2955 + }, + { + "epoch": 0.2225107736314195, + "grad_norm": 5.595031261444092, + "learning_rate": 9.066473447875094e-05, + "loss": 1.6895, + "step": 2956 + }, + { + "epoch": 0.22258604791207964, + "grad_norm": 6.132288932800293, + "learning_rate": 9.065763997573427e-05, + "loss": 2.0879, + "step": 2957 + }, + { + "epoch": 0.2226613221927398, + "grad_norm": 3.8455755710601807, + "learning_rate": 9.065054305573755e-05, + "loss": 2.1217, + "step": 2958 + }, + { + "epoch": 0.22273659647339994, + "grad_norm": 5.857288837432861, + "learning_rate": 9.064344371918267e-05, + "loss": 1.929, + "step": 2959 + }, + { + "epoch": 0.22281187075406012, + "grad_norm": 6.240818023681641, + "learning_rate": 9.06363419664917e-05, + "loss": 1.9883, + "step": 2960 + }, + { + "epoch": 0.22288714503472026, + "grad_norm": 5.995606899261475, + "learning_rate": 9.062923779808678e-05, + "loss": 2.2979, + "step": 2961 + }, + { + "epoch": 0.2229624193153804, + "grad_norm": 4.7246904373168945, + "learning_rate": 9.062213121439027e-05, + "loss": 1.9242, + "step": 2962 + }, + { + "epoch": 0.22303769359604056, + "grad_norm": 7.328646183013916, + "learning_rate": 9.061502221582461e-05, + "loss": 1.9112, + "step": 2963 + }, + { + "epoch": 0.22311296787670074, + "grad_norm": 5.471291542053223, + "learning_rate": 9.060791080281241e-05, + "loss": 2.0289, + "step": 2964 + }, + { + "epoch": 0.22318824215736088, + "grad_norm": 5.698328018188477, + "learning_rate": 9.060079697577644e-05, + "loss": 2.1358, + "step": 2965 + }, + { + "epoch": 0.22326351643802103, + "grad_norm": 4.959545135498047, + "learning_rate": 9.059368073513958e-05, + "loss": 1.9713, + "step": 2966 + }, + { + "epoch": 0.2233387907186812, + "grad_norm": 4.452411651611328, + "learning_rate": 9.058656208132487e-05, + "loss": 1.9847, + "step": 2967 + }, + { + "epoch": 0.22341406499934136, + "grad_norm": 4.108034610748291, + "learning_rate": 9.057944101475553e-05, + "loss": 2.3387, + "step": 2968 + }, + { + "epoch": 0.2234893392800015, + "grad_norm": 4.727132320404053, + "learning_rate": 9.057231753585483e-05, + "loss": 2.0478, + "step": 2969 + }, + { + "epoch": 0.22356461356066165, + "grad_norm": 4.835155963897705, + "learning_rate": 9.056519164504627e-05, + "loss": 2.1538, + "step": 2970 + }, + { + "epoch": 0.22363988784132183, + "grad_norm": 5.675017833709717, + "learning_rate": 9.055806334275347e-05, + "loss": 1.8676, + "step": 2971 + }, + { + "epoch": 0.22371516212198198, + "grad_norm": 7.372113227844238, + "learning_rate": 9.055093262940017e-05, + "loss": 2.2857, + "step": 2972 + }, + { + "epoch": 0.22379043640264212, + "grad_norm": 4.974536895751953, + "learning_rate": 9.054379950541028e-05, + "loss": 2.3455, + "step": 2973 + }, + { + "epoch": 0.22386571068330227, + "grad_norm": 5.916133880615234, + "learning_rate": 9.053666397120786e-05, + "loss": 2.259, + "step": 2974 + }, + { + "epoch": 0.22394098496396245, + "grad_norm": 4.214310646057129, + "learning_rate": 9.052952602721706e-05, + "loss": 2.3951, + "step": 2975 + }, + { + "epoch": 0.2240162592446226, + "grad_norm": 4.525818824768066, + "learning_rate": 9.052238567386224e-05, + "loss": 1.9346, + "step": 2976 + }, + { + "epoch": 0.22409153352528274, + "grad_norm": 5.634984016418457, + "learning_rate": 9.051524291156785e-05, + "loss": 1.9895, + "step": 2977 + }, + { + "epoch": 0.2241668078059429, + "grad_norm": 6.583395957946777, + "learning_rate": 9.050809774075853e-05, + "loss": 1.9479, + "step": 2978 + }, + { + "epoch": 0.22424208208660307, + "grad_norm": 3.937696933746338, + "learning_rate": 9.050095016185903e-05, + "loss": 2.0978, + "step": 2979 + }, + { + "epoch": 0.22431735636726322, + "grad_norm": 4.16509485244751, + "learning_rate": 9.049380017529426e-05, + "loss": 2.3338, + "step": 2980 + }, + { + "epoch": 0.22439263064792336, + "grad_norm": 5.84874963760376, + "learning_rate": 9.048664778148924e-05, + "loss": 2.0807, + "step": 2981 + }, + { + "epoch": 0.22446790492858354, + "grad_norm": 5.054769039154053, + "learning_rate": 9.047949298086919e-05, + "loss": 2.1072, + "step": 2982 + }, + { + "epoch": 0.2245431792092437, + "grad_norm": 4.882493019104004, + "learning_rate": 9.047233577385944e-05, + "loss": 1.9118, + "step": 2983 + }, + { + "epoch": 0.22461845348990384, + "grad_norm": 3.5406758785247803, + "learning_rate": 9.046517616088545e-05, + "loss": 2.0995, + "step": 2984 + }, + { + "epoch": 0.22469372777056398, + "grad_norm": 5.1863532066345215, + "learning_rate": 9.045801414237284e-05, + "loss": 2.3511, + "step": 2985 + }, + { + "epoch": 0.22476900205122416, + "grad_norm": 5.371186256408691, + "learning_rate": 9.045084971874738e-05, + "loss": 2.2297, + "step": 2986 + }, + { + "epoch": 0.2248442763318843, + "grad_norm": 5.072912216186523, + "learning_rate": 9.044368289043497e-05, + "loss": 2.0466, + "step": 2987 + }, + { + "epoch": 0.22491955061254446, + "grad_norm": 5.343015670776367, + "learning_rate": 9.043651365786166e-05, + "loss": 1.956, + "step": 2988 + }, + { + "epoch": 0.2249948248932046, + "grad_norm": 4.214582920074463, + "learning_rate": 9.042934202145364e-05, + "loss": 2.2071, + "step": 2989 + }, + { + "epoch": 0.22507009917386478, + "grad_norm": 4.679235458374023, + "learning_rate": 9.042216798163722e-05, + "loss": 2.6149, + "step": 2990 + }, + { + "epoch": 0.22514537345452493, + "grad_norm": 5.404105186462402, + "learning_rate": 9.041499153883893e-05, + "loss": 2.4987, + "step": 2991 + }, + { + "epoch": 0.22522064773518508, + "grad_norm": 4.6516828536987305, + "learning_rate": 9.040781269348533e-05, + "loss": 1.9114, + "step": 2992 + }, + { + "epoch": 0.22529592201584522, + "grad_norm": 4.465479850769043, + "learning_rate": 9.040063144600322e-05, + "loss": 1.801, + "step": 2993 + }, + { + "epoch": 0.2253711962965054, + "grad_norm": 4.730539321899414, + "learning_rate": 9.03934477968195e-05, + "loss": 2.0379, + "step": 2994 + }, + { + "epoch": 0.22544647057716555, + "grad_norm": 6.056190013885498, + "learning_rate": 9.03862617463612e-05, + "loss": 2.3048, + "step": 2995 + }, + { + "epoch": 0.2255217448578257, + "grad_norm": 5.658048152923584, + "learning_rate": 9.037907329505553e-05, + "loss": 2.166, + "step": 2996 + }, + { + "epoch": 0.22559701913848587, + "grad_norm": 4.583282470703125, + "learning_rate": 9.03718824433298e-05, + "loss": 1.9496, + "step": 2997 + }, + { + "epoch": 0.22567229341914602, + "grad_norm": 4.5869221687316895, + "learning_rate": 9.036468919161151e-05, + "loss": 1.9332, + "step": 2998 + }, + { + "epoch": 0.22574756769980617, + "grad_norm": 5.690463542938232, + "learning_rate": 9.035749354032825e-05, + "loss": 1.5926, + "step": 2999 + }, + { + "epoch": 0.22582284198046632, + "grad_norm": 9.193011283874512, + "learning_rate": 9.035029548990782e-05, + "loss": 2.0635, + "step": 3000 + }, + { + "epoch": 0.2258981162611265, + "grad_norm": 4.976571083068848, + "learning_rate": 9.034309504077809e-05, + "loss": 2.6961, + "step": 3001 + }, + { + "epoch": 0.22597339054178664, + "grad_norm": 6.667677402496338, + "learning_rate": 9.033589219336711e-05, + "loss": 2.3779, + "step": 3002 + }, + { + "epoch": 0.2260486648224468, + "grad_norm": 5.841045379638672, + "learning_rate": 9.03286869481031e-05, + "loss": 2.0346, + "step": 3003 + }, + { + "epoch": 0.22612393910310694, + "grad_norm": 9.624619483947754, + "learning_rate": 9.032147930541435e-05, + "loss": 2.1474, + "step": 3004 + }, + { + "epoch": 0.2261992133837671, + "grad_norm": 5.452056884765625, + "learning_rate": 9.031426926572935e-05, + "loss": 2.0176, + "step": 3005 + }, + { + "epoch": 0.22627448766442726, + "grad_norm": 6.226777076721191, + "learning_rate": 9.030705682947671e-05, + "loss": 1.919, + "step": 3006 + }, + { + "epoch": 0.2263497619450874, + "grad_norm": 3.857956886291504, + "learning_rate": 9.029984199708521e-05, + "loss": 2.1481, + "step": 3007 + }, + { + "epoch": 0.22642503622574756, + "grad_norm": 4.310341835021973, + "learning_rate": 9.029262476898372e-05, + "loss": 1.9097, + "step": 3008 + }, + { + "epoch": 0.22650031050640773, + "grad_norm": 4.636809825897217, + "learning_rate": 9.028540514560132e-05, + "loss": 2.2545, + "step": 3009 + }, + { + "epoch": 0.22657558478706788, + "grad_norm": 5.450639724731445, + "learning_rate": 9.027818312736717e-05, + "loss": 2.2092, + "step": 3010 + }, + { + "epoch": 0.22665085906772803, + "grad_norm": 6.2182416915893555, + "learning_rate": 9.02709587147106e-05, + "loss": 2.1127, + "step": 3011 + }, + { + "epoch": 0.22672613334838818, + "grad_norm": 5.326910018920898, + "learning_rate": 9.026373190806108e-05, + "loss": 2.1092, + "step": 3012 + }, + { + "epoch": 0.22680140762904835, + "grad_norm": 5.691418170928955, + "learning_rate": 9.025650270784823e-05, + "loss": 2.1836, + "step": 3013 + }, + { + "epoch": 0.2268766819097085, + "grad_norm": 6.693453788757324, + "learning_rate": 9.024927111450182e-05, + "loss": 2.1319, + "step": 3014 + }, + { + "epoch": 0.22695195619036865, + "grad_norm": 7.156260013580322, + "learning_rate": 9.02420371284517e-05, + "loss": 2.1571, + "step": 3015 + }, + { + "epoch": 0.22702723047102882, + "grad_norm": 4.796119689941406, + "learning_rate": 9.023480075012795e-05, + "loss": 1.8668, + "step": 3016 + }, + { + "epoch": 0.22710250475168897, + "grad_norm": 5.814871788024902, + "learning_rate": 9.022756197996077e-05, + "loss": 1.5281, + "step": 3017 + }, + { + "epoch": 0.22717777903234912, + "grad_norm": 5.232256889343262, + "learning_rate": 9.022032081838042e-05, + "loss": 2.0473, + "step": 3018 + }, + { + "epoch": 0.22725305331300927, + "grad_norm": 5.01918888092041, + "learning_rate": 9.021307726581743e-05, + "loss": 2.0042, + "step": 3019 + }, + { + "epoch": 0.22732832759366944, + "grad_norm": 4.870156764984131, + "learning_rate": 9.020583132270235e-05, + "loss": 2.3176, + "step": 3020 + }, + { + "epoch": 0.2274036018743296, + "grad_norm": 4.214104175567627, + "learning_rate": 9.019858298946598e-05, + "loss": 1.8847, + "step": 3021 + }, + { + "epoch": 0.22747887615498974, + "grad_norm": 5.851649284362793, + "learning_rate": 9.01913322665392e-05, + "loss": 2.2905, + "step": 3022 + }, + { + "epoch": 0.2275541504356499, + "grad_norm": 4.709156036376953, + "learning_rate": 9.018407915435302e-05, + "loss": 2.0474, + "step": 3023 + }, + { + "epoch": 0.22762942471631006, + "grad_norm": 4.935070514678955, + "learning_rate": 9.017682365333866e-05, + "loss": 1.7785, + "step": 3024 + }, + { + "epoch": 0.2277046989969702, + "grad_norm": 8.181000709533691, + "learning_rate": 9.01695657639274e-05, + "loss": 1.9314, + "step": 3025 + }, + { + "epoch": 0.22777997327763036, + "grad_norm": 6.059421062469482, + "learning_rate": 9.016230548655071e-05, + "loss": 2.119, + "step": 3026 + }, + { + "epoch": 0.2278552475582905, + "grad_norm": 4.362475872039795, + "learning_rate": 9.01550428216402e-05, + "loss": 2.0547, + "step": 3027 + }, + { + "epoch": 0.22793052183895068, + "grad_norm": 4.97053861618042, + "learning_rate": 9.014777776962762e-05, + "loss": 2.1051, + "step": 3028 + }, + { + "epoch": 0.22800579611961083, + "grad_norm": 5.443403720855713, + "learning_rate": 9.014051033094484e-05, + "loss": 2.1413, + "step": 3029 + }, + { + "epoch": 0.22808107040027098, + "grad_norm": 6.948510646820068, + "learning_rate": 9.013324050602391e-05, + "loss": 2.2686, + "step": 3030 + }, + { + "epoch": 0.22815634468093116, + "grad_norm": 7.151741027832031, + "learning_rate": 9.012596829529697e-05, + "loss": 2.1945, + "step": 3031 + }, + { + "epoch": 0.2282316189615913, + "grad_norm": 6.0929789543151855, + "learning_rate": 9.011869369919636e-05, + "loss": 2.0583, + "step": 3032 + }, + { + "epoch": 0.22830689324225145, + "grad_norm": 6.305862903594971, + "learning_rate": 9.011141671815452e-05, + "loss": 2.1604, + "step": 3033 + }, + { + "epoch": 0.2283821675229116, + "grad_norm": 4.326385498046875, + "learning_rate": 9.010413735260404e-05, + "loss": 1.8108, + "step": 3034 + }, + { + "epoch": 0.22845744180357178, + "grad_norm": 8.373851776123047, + "learning_rate": 9.009685560297768e-05, + "loss": 2.0602, + "step": 3035 + }, + { + "epoch": 0.22853271608423192, + "grad_norm": 5.387134075164795, + "learning_rate": 9.00895714697083e-05, + "loss": 1.9387, + "step": 3036 + }, + { + "epoch": 0.22860799036489207, + "grad_norm": 4.1983642578125, + "learning_rate": 9.008228495322892e-05, + "loss": 1.9623, + "step": 3037 + }, + { + "epoch": 0.22868326464555222, + "grad_norm": 5.065387725830078, + "learning_rate": 9.007499605397272e-05, + "loss": 1.8803, + "step": 3038 + }, + { + "epoch": 0.2287585389262124, + "grad_norm": 5.0604424476623535, + "learning_rate": 9.006770477237298e-05, + "loss": 1.9926, + "step": 3039 + }, + { + "epoch": 0.22883381320687254, + "grad_norm": 4.227555751800537, + "learning_rate": 9.006041110886317e-05, + "loss": 2.0794, + "step": 3040 + }, + { + "epoch": 0.2289090874875327, + "grad_norm": 4.723733425140381, + "learning_rate": 9.005311506387686e-05, + "loss": 2.1048, + "step": 3041 + }, + { + "epoch": 0.22898436176819284, + "grad_norm": 8.18472671508789, + "learning_rate": 9.00458166378478e-05, + "loss": 2.3078, + "step": 3042 + }, + { + "epoch": 0.22905963604885302, + "grad_norm": 6.331607341766357, + "learning_rate": 9.003851583120981e-05, + "loss": 2.2594, + "step": 3043 + }, + { + "epoch": 0.22913491032951316, + "grad_norm": 5.22850227355957, + "learning_rate": 9.003121264439697e-05, + "loss": 1.9172, + "step": 3044 + }, + { + "epoch": 0.2292101846101733, + "grad_norm": 5.810819149017334, + "learning_rate": 9.002390707784338e-05, + "loss": 2.0768, + "step": 3045 + }, + { + "epoch": 0.2292854588908335, + "grad_norm": 6.382132053375244, + "learning_rate": 9.001659913198336e-05, + "loss": 1.8162, + "step": 3046 + }, + { + "epoch": 0.22936073317149364, + "grad_norm": 6.316071033477783, + "learning_rate": 9.000928880725136e-05, + "loss": 2.1667, + "step": 3047 + }, + { + "epoch": 0.22943600745215378, + "grad_norm": 5.533540725708008, + "learning_rate": 9.000197610408192e-05, + "loss": 2.2605, + "step": 3048 + }, + { + "epoch": 0.22951128173281393, + "grad_norm": 6.315400123596191, + "learning_rate": 8.999466102290978e-05, + "loss": 2.0325, + "step": 3049 + }, + { + "epoch": 0.2295865560134741, + "grad_norm": 4.377425670623779, + "learning_rate": 8.998734356416981e-05, + "loss": 1.7614, + "step": 3050 + }, + { + "epoch": 0.22966183029413426, + "grad_norm": 5.1501617431640625, + "learning_rate": 8.998002372829699e-05, + "loss": 2.0406, + "step": 3051 + }, + { + "epoch": 0.2297371045747944, + "grad_norm": 4.402509689331055, + "learning_rate": 8.997270151572649e-05, + "loss": 1.8774, + "step": 3052 + }, + { + "epoch": 0.22981237885545455, + "grad_norm": 5.161309719085693, + "learning_rate": 8.996537692689357e-05, + "loss": 2.2286, + "step": 3053 + }, + { + "epoch": 0.22988765313611473, + "grad_norm": 4.752976417541504, + "learning_rate": 8.995804996223367e-05, + "loss": 2.2758, + "step": 3054 + }, + { + "epoch": 0.22996292741677488, + "grad_norm": 5.243102550506592, + "learning_rate": 8.995072062218235e-05, + "loss": 2.3885, + "step": 3055 + }, + { + "epoch": 0.23003820169743502, + "grad_norm": 6.9079813957214355, + "learning_rate": 8.99433889071753e-05, + "loss": 1.9722, + "step": 3056 + }, + { + "epoch": 0.23011347597809517, + "grad_norm": 4.501270294189453, + "learning_rate": 8.993605481764841e-05, + "loss": 2.0334, + "step": 3057 + }, + { + "epoch": 0.23018875025875535, + "grad_norm": 4.904798984527588, + "learning_rate": 8.992871835403767e-05, + "loss": 1.5457, + "step": 3058 + }, + { + "epoch": 0.2302640245394155, + "grad_norm": 4.343378067016602, + "learning_rate": 8.992137951677916e-05, + "loss": 1.9967, + "step": 3059 + }, + { + "epoch": 0.23033929882007564, + "grad_norm": 5.2575860023498535, + "learning_rate": 8.99140383063092e-05, + "loss": 2.3323, + "step": 3060 + }, + { + "epoch": 0.2304145731007358, + "grad_norm": 4.954637050628662, + "learning_rate": 8.99066947230642e-05, + "loss": 2.1293, + "step": 3061 + }, + { + "epoch": 0.23048984738139597, + "grad_norm": 3.721421480178833, + "learning_rate": 8.989934876748068e-05, + "loss": 1.9338, + "step": 3062 + }, + { + "epoch": 0.23056512166205612, + "grad_norm": 4.949873447418213, + "learning_rate": 8.989200043999537e-05, + "loss": 1.9092, + "step": 3063 + }, + { + "epoch": 0.23064039594271626, + "grad_norm": 6.674953460693359, + "learning_rate": 8.988464974104509e-05, + "loss": 1.8866, + "step": 3064 + }, + { + "epoch": 0.23071567022337644, + "grad_norm": 5.114140510559082, + "learning_rate": 8.987729667106683e-05, + "loss": 2.4098, + "step": 3065 + }, + { + "epoch": 0.2307909445040366, + "grad_norm": 6.162566184997559, + "learning_rate": 8.98699412304977e-05, + "loss": 1.9575, + "step": 3066 + }, + { + "epoch": 0.23086621878469674, + "grad_norm": 6.640736103057861, + "learning_rate": 8.986258341977498e-05, + "loss": 1.9576, + "step": 3067 + }, + { + "epoch": 0.23094149306535688, + "grad_norm": 8.195666313171387, + "learning_rate": 8.985522323933603e-05, + "loss": 2.2835, + "step": 3068 + }, + { + "epoch": 0.23101676734601706, + "grad_norm": 4.752565860748291, + "learning_rate": 8.984786068961843e-05, + "loss": 2.1665, + "step": 3069 + }, + { + "epoch": 0.2310920416266772, + "grad_norm": 3.592942237854004, + "learning_rate": 8.984049577105985e-05, + "loss": 2.0028, + "step": 3070 + }, + { + "epoch": 0.23116731590733736, + "grad_norm": 4.580357074737549, + "learning_rate": 8.983312848409811e-05, + "loss": 2.0182, + "step": 3071 + }, + { + "epoch": 0.2312425901879975, + "grad_norm": 4.446590423583984, + "learning_rate": 8.982575882917119e-05, + "loss": 2.0007, + "step": 3072 + }, + { + "epoch": 0.23131786446865768, + "grad_norm": 4.340010166168213, + "learning_rate": 8.981838680671715e-05, + "loss": 2.3792, + "step": 3073 + }, + { + "epoch": 0.23139313874931783, + "grad_norm": 5.936091899871826, + "learning_rate": 8.981101241717427e-05, + "loss": 2.7705, + "step": 3074 + }, + { + "epoch": 0.23146841302997798, + "grad_norm": 4.959737300872803, + "learning_rate": 8.980363566098096e-05, + "loss": 2.2305, + "step": 3075 + }, + { + "epoch": 0.23154368731063812, + "grad_norm": 3.9526758193969727, + "learning_rate": 8.97962565385757e-05, + "loss": 1.8715, + "step": 3076 + }, + { + "epoch": 0.2316189615912983, + "grad_norm": 5.736264705657959, + "learning_rate": 8.978887505039717e-05, + "loss": 2.0841, + "step": 3077 + }, + { + "epoch": 0.23169423587195845, + "grad_norm": 6.744086265563965, + "learning_rate": 8.97814911968842e-05, + "loss": 1.8275, + "step": 3078 + }, + { + "epoch": 0.2317695101526186, + "grad_norm": 6.316197395324707, + "learning_rate": 8.977410497847571e-05, + "loss": 2.1513, + "step": 3079 + }, + { + "epoch": 0.23184478443327877, + "grad_norm": 4.470517158508301, + "learning_rate": 8.976671639561082e-05, + "loss": 1.9101, + "step": 3080 + }, + { + "epoch": 0.23192005871393892, + "grad_norm": 4.95775842666626, + "learning_rate": 8.975932544872874e-05, + "loss": 1.8498, + "step": 3081 + }, + { + "epoch": 0.23199533299459907, + "grad_norm": 4.533265590667725, + "learning_rate": 8.975193213826885e-05, + "loss": 2.0362, + "step": 3082 + }, + { + "epoch": 0.23207060727525922, + "grad_norm": 5.092185974121094, + "learning_rate": 8.974453646467063e-05, + "loss": 1.4211, + "step": 3083 + }, + { + "epoch": 0.2321458815559194, + "grad_norm": 4.261809825897217, + "learning_rate": 8.973713842837378e-05, + "loss": 1.779, + "step": 3084 + }, + { + "epoch": 0.23222115583657954, + "grad_norm": 5.04252290725708, + "learning_rate": 8.972973802981805e-05, + "loss": 1.8475, + "step": 3085 + }, + { + "epoch": 0.2322964301172397, + "grad_norm": 5.632111072540283, + "learning_rate": 8.972233526944343e-05, + "loss": 2.1461, + "step": 3086 + }, + { + "epoch": 0.23237170439789984, + "grad_norm": 4.163288593292236, + "learning_rate": 8.971493014768991e-05, + "loss": 2.0373, + "step": 3087 + }, + { + "epoch": 0.23244697867856, + "grad_norm": 4.849682807922363, + "learning_rate": 8.970752266499778e-05, + "loss": 2.2187, + "step": 3088 + }, + { + "epoch": 0.23252225295922016, + "grad_norm": 3.7831506729125977, + "learning_rate": 8.970011282180734e-05, + "loss": 2.1326, + "step": 3089 + }, + { + "epoch": 0.2325975272398803, + "grad_norm": 4.984122276306152, + "learning_rate": 8.969270061855911e-05, + "loss": 2.3132, + "step": 3090 + }, + { + "epoch": 0.23267280152054046, + "grad_norm": 6.370550155639648, + "learning_rate": 8.968528605569373e-05, + "loss": 2.143, + "step": 3091 + }, + { + "epoch": 0.23274807580120063, + "grad_norm": 5.93715238571167, + "learning_rate": 8.967786913365195e-05, + "loss": 2.0035, + "step": 3092 + }, + { + "epoch": 0.23282335008186078, + "grad_norm": 5.566004753112793, + "learning_rate": 8.96704498528747e-05, + "loss": 2.0769, + "step": 3093 + }, + { + "epoch": 0.23289862436252093, + "grad_norm": 7.450303554534912, + "learning_rate": 8.966302821380304e-05, + "loss": 2.318, + "step": 3094 + }, + { + "epoch": 0.2329738986431811, + "grad_norm": 6.916574954986572, + "learning_rate": 8.965560421687816e-05, + "loss": 2.1454, + "step": 3095 + }, + { + "epoch": 0.23304917292384125, + "grad_norm": 5.600445747375488, + "learning_rate": 8.964817786254138e-05, + "loss": 1.8668, + "step": 3096 + }, + { + "epoch": 0.2331244472045014, + "grad_norm": 4.611108303070068, + "learning_rate": 8.964074915123421e-05, + "loss": 2.1131, + "step": 3097 + }, + { + "epoch": 0.23319972148516155, + "grad_norm": 4.783222675323486, + "learning_rate": 8.963331808339823e-05, + "loss": 1.9926, + "step": 3098 + }, + { + "epoch": 0.23327499576582172, + "grad_norm": 6.094036102294922, + "learning_rate": 8.962588465947522e-05, + "loss": 2.0015, + "step": 3099 + }, + { + "epoch": 0.23335027004648187, + "grad_norm": 5.402453422546387, + "learning_rate": 8.961844887990706e-05, + "loss": 1.9773, + "step": 3100 + }, + { + "epoch": 0.23342554432714202, + "grad_norm": 3.9886672496795654, + "learning_rate": 8.96110107451358e-05, + "loss": 1.8964, + "step": 3101 + }, + { + "epoch": 0.23350081860780217, + "grad_norm": 4.694164752960205, + "learning_rate": 8.960357025560359e-05, + "loss": 2.4702, + "step": 3102 + }, + { + "epoch": 0.23357609288846234, + "grad_norm": 6.339183330535889, + "learning_rate": 8.959612741175278e-05, + "loss": 2.2143, + "step": 3103 + }, + { + "epoch": 0.2336513671691225, + "grad_norm": 4.419780731201172, + "learning_rate": 8.958868221402581e-05, + "loss": 1.8231, + "step": 3104 + }, + { + "epoch": 0.23372664144978264, + "grad_norm": 3.7694437503814697, + "learning_rate": 8.958123466286528e-05, + "loss": 2.0554, + "step": 3105 + }, + { + "epoch": 0.2338019157304428, + "grad_norm": 4.415439605712891, + "learning_rate": 8.95737847587139e-05, + "loss": 1.8152, + "step": 3106 + }, + { + "epoch": 0.23387719001110296, + "grad_norm": 4.61833381652832, + "learning_rate": 8.956633250201457e-05, + "loss": 2.4684, + "step": 3107 + }, + { + "epoch": 0.2339524642917631, + "grad_norm": 4.725123405456543, + "learning_rate": 8.955887789321031e-05, + "loss": 2.1098, + "step": 3108 + }, + { + "epoch": 0.23402773857242326, + "grad_norm": 5.1417341232299805, + "learning_rate": 8.955142093274426e-05, + "loss": 2.1036, + "step": 3109 + }, + { + "epoch": 0.23410301285308344, + "grad_norm": 5.854405879974365, + "learning_rate": 8.954396162105974e-05, + "loss": 1.9563, + "step": 3110 + }, + { + "epoch": 0.23417828713374358, + "grad_norm": 5.214634418487549, + "learning_rate": 8.953649995860015e-05, + "loss": 2.3299, + "step": 3111 + }, + { + "epoch": 0.23425356141440373, + "grad_norm": 4.962255001068115, + "learning_rate": 8.952903594580908e-05, + "loss": 1.7914, + "step": 3112 + }, + { + "epoch": 0.23432883569506388, + "grad_norm": 4.279754161834717, + "learning_rate": 8.952156958313025e-05, + "loss": 1.8516, + "step": 3113 + }, + { + "epoch": 0.23440410997572406, + "grad_norm": 5.431236267089844, + "learning_rate": 8.95141008710075e-05, + "loss": 1.881, + "step": 3114 + }, + { + "epoch": 0.2344793842563842, + "grad_norm": 5.180300712585449, + "learning_rate": 8.950662980988483e-05, + "loss": 2.0101, + "step": 3115 + }, + { + "epoch": 0.23455465853704435, + "grad_norm": 5.300332546234131, + "learning_rate": 8.949915640020639e-05, + "loss": 1.7913, + "step": 3116 + }, + { + "epoch": 0.2346299328177045, + "grad_norm": 7.975987434387207, + "learning_rate": 8.949168064241643e-05, + "loss": 2.1514, + "step": 3117 + }, + { + "epoch": 0.23470520709836468, + "grad_norm": 6.148632526397705, + "learning_rate": 8.948420253695937e-05, + "loss": 2.3402, + "step": 3118 + }, + { + "epoch": 0.23478048137902482, + "grad_norm": 7.389118671417236, + "learning_rate": 8.947672208427976e-05, + "loss": 1.6371, + "step": 3119 + }, + { + "epoch": 0.23485575565968497, + "grad_norm": 5.129561424255371, + "learning_rate": 8.94692392848223e-05, + "loss": 2.1028, + "step": 3120 + }, + { + "epoch": 0.23493102994034512, + "grad_norm": 5.223440647125244, + "learning_rate": 8.946175413903181e-05, + "loss": 2.2671, + "step": 3121 + }, + { + "epoch": 0.2350063042210053, + "grad_norm": 5.282307147979736, + "learning_rate": 8.945426664735326e-05, + "loss": 2.195, + "step": 3122 + }, + { + "epoch": 0.23508157850166544, + "grad_norm": 7.3457255363464355, + "learning_rate": 8.944677681023176e-05, + "loss": 2.0344, + "step": 3123 + }, + { + "epoch": 0.2351568527823256, + "grad_norm": 5.0852742195129395, + "learning_rate": 8.943928462811259e-05, + "loss": 2.2336, + "step": 3124 + }, + { + "epoch": 0.23523212706298574, + "grad_norm": 4.484767913818359, + "learning_rate": 8.943179010144109e-05, + "loss": 2.0429, + "step": 3125 + }, + { + "epoch": 0.23530740134364592, + "grad_norm": 5.363630294799805, + "learning_rate": 8.942429323066282e-05, + "loss": 1.9117, + "step": 3126 + }, + { + "epoch": 0.23538267562430606, + "grad_norm": 4.567474365234375, + "learning_rate": 8.941679401622344e-05, + "loss": 1.8534, + "step": 3127 + }, + { + "epoch": 0.2354579499049662, + "grad_norm": 7.795666217803955, + "learning_rate": 8.940929245856875e-05, + "loss": 2.0451, + "step": 3128 + }, + { + "epoch": 0.2355332241856264, + "grad_norm": 5.815847396850586, + "learning_rate": 8.940178855814469e-05, + "loss": 2.1459, + "step": 3129 + }, + { + "epoch": 0.23560849846628654, + "grad_norm": 6.202424049377441, + "learning_rate": 8.939428231539738e-05, + "loss": 1.8795, + "step": 3130 + }, + { + "epoch": 0.23568377274694668, + "grad_norm": 5.760075569152832, + "learning_rate": 8.938677373077302e-05, + "loss": 1.7439, + "step": 3131 + }, + { + "epoch": 0.23575904702760683, + "grad_norm": 4.191190242767334, + "learning_rate": 8.937926280471796e-05, + "loss": 2.1132, + "step": 3132 + }, + { + "epoch": 0.235834321308267, + "grad_norm": 5.4813737869262695, + "learning_rate": 8.937174953767872e-05, + "loss": 1.8399, + "step": 3133 + }, + { + "epoch": 0.23590959558892716, + "grad_norm": 6.132549285888672, + "learning_rate": 8.936423393010194e-05, + "loss": 2.2918, + "step": 3134 + }, + { + "epoch": 0.2359848698695873, + "grad_norm": 5.623762607574463, + "learning_rate": 8.935671598243441e-05, + "loss": 2.1081, + "step": 3135 + }, + { + "epoch": 0.23606014415024745, + "grad_norm": 4.4616241455078125, + "learning_rate": 8.934919569512303e-05, + "loss": 1.8074, + "step": 3136 + }, + { + "epoch": 0.23613541843090763, + "grad_norm": 6.123564720153809, + "learning_rate": 8.934167306861488e-05, + "loss": 2.1566, + "step": 3137 + }, + { + "epoch": 0.23621069271156778, + "grad_norm": 4.065822601318359, + "learning_rate": 8.933414810335716e-05, + "loss": 2.2122, + "step": 3138 + }, + { + "epoch": 0.23628596699222792, + "grad_norm": 4.7490057945251465, + "learning_rate": 8.932662079979718e-05, + "loss": 1.993, + "step": 3139 + }, + { + "epoch": 0.23636124127288807, + "grad_norm": 4.373496055603027, + "learning_rate": 8.931909115838244e-05, + "loss": 1.7665, + "step": 3140 + }, + { + "epoch": 0.23643651555354825, + "grad_norm": 5.50691556930542, + "learning_rate": 8.931155917956055e-05, + "loss": 1.7927, + "step": 3141 + }, + { + "epoch": 0.2365117898342084, + "grad_norm": 4.592144012451172, + "learning_rate": 8.930402486377927e-05, + "loss": 1.7299, + "step": 3142 + }, + { + "epoch": 0.23658706411486854, + "grad_norm": 6.42816162109375, + "learning_rate": 8.929648821148648e-05, + "loss": 2.0241, + "step": 3143 + }, + { + "epoch": 0.23666233839552872, + "grad_norm": 3.823014974594116, + "learning_rate": 8.928894922313022e-05, + "loss": 2.1517, + "step": 3144 + }, + { + "epoch": 0.23673761267618887, + "grad_norm": 5.641438007354736, + "learning_rate": 8.928140789915866e-05, + "loss": 1.7568, + "step": 3145 + }, + { + "epoch": 0.23681288695684902, + "grad_norm": 5.736764430999756, + "learning_rate": 8.927386424002011e-05, + "loss": 1.9236, + "step": 3146 + }, + { + "epoch": 0.23688816123750916, + "grad_norm": 4.213385581970215, + "learning_rate": 8.926631824616303e-05, + "loss": 2.0643, + "step": 3147 + }, + { + "epoch": 0.23696343551816934, + "grad_norm": 6.429990291595459, + "learning_rate": 8.925876991803599e-05, + "loss": 2.064, + "step": 3148 + }, + { + "epoch": 0.2370387097988295, + "grad_norm": 4.186111927032471, + "learning_rate": 8.925121925608772e-05, + "loss": 2.025, + "step": 3149 + }, + { + "epoch": 0.23711398407948964, + "grad_norm": 4.240618705749512, + "learning_rate": 8.924366626076711e-05, + "loss": 2.0895, + "step": 3150 + }, + { + "epoch": 0.23718925836014979, + "grad_norm": 5.167217254638672, + "learning_rate": 8.923611093252313e-05, + "loss": 2.0591, + "step": 3151 + }, + { + "epoch": 0.23726453264080996, + "grad_norm": 4.502598285675049, + "learning_rate": 8.922855327180494e-05, + "loss": 1.73, + "step": 3152 + }, + { + "epoch": 0.2373398069214701, + "grad_norm": 5.017537593841553, + "learning_rate": 8.922099327906181e-05, + "loss": 2.0567, + "step": 3153 + }, + { + "epoch": 0.23741508120213026, + "grad_norm": 5.639168739318848, + "learning_rate": 8.921343095474317e-05, + "loss": 2.1158, + "step": 3154 + }, + { + "epoch": 0.2374903554827904, + "grad_norm": 5.212647438049316, + "learning_rate": 8.920586629929858e-05, + "loss": 2.2101, + "step": 3155 + }, + { + "epoch": 0.23756562976345058, + "grad_norm": 5.536198616027832, + "learning_rate": 8.919829931317774e-05, + "loss": 2.0169, + "step": 3156 + }, + { + "epoch": 0.23764090404411073, + "grad_norm": 6.055906295776367, + "learning_rate": 8.919072999683046e-05, + "loss": 2.3717, + "step": 3157 + }, + { + "epoch": 0.23771617832477088, + "grad_norm": 5.978962421417236, + "learning_rate": 8.918315835070674e-05, + "loss": 1.7444, + "step": 3158 + }, + { + "epoch": 0.23779145260543105, + "grad_norm": 7.151521682739258, + "learning_rate": 8.917558437525668e-05, + "loss": 2.1532, + "step": 3159 + }, + { + "epoch": 0.2378667268860912, + "grad_norm": 4.553358554840088, + "learning_rate": 8.916800807093056e-05, + "loss": 2.0399, + "step": 3160 + }, + { + "epoch": 0.23794200116675135, + "grad_norm": 5.821055889129639, + "learning_rate": 8.916042943817872e-05, + "loss": 2.2394, + "step": 3161 + }, + { + "epoch": 0.2380172754474115, + "grad_norm": 5.401861190795898, + "learning_rate": 8.915284847745171e-05, + "loss": 1.8511, + "step": 3162 + }, + { + "epoch": 0.23809254972807167, + "grad_norm": 4.792477607727051, + "learning_rate": 8.914526518920022e-05, + "loss": 2.1013, + "step": 3163 + }, + { + "epoch": 0.23816782400873182, + "grad_norm": 5.911551475524902, + "learning_rate": 8.913767957387502e-05, + "loss": 1.7795, + "step": 3164 + }, + { + "epoch": 0.23824309828939197, + "grad_norm": 4.274476051330566, + "learning_rate": 8.913009163192707e-05, + "loss": 1.9452, + "step": 3165 + }, + { + "epoch": 0.23831837257005212, + "grad_norm": 3.694766044616699, + "learning_rate": 8.912250136380746e-05, + "loss": 1.9824, + "step": 3166 + }, + { + "epoch": 0.2383936468507123, + "grad_norm": 5.7437238693237305, + "learning_rate": 8.91149087699674e-05, + "loss": 1.9185, + "step": 3167 + }, + { + "epoch": 0.23846892113137244, + "grad_norm": 4.847318649291992, + "learning_rate": 8.910731385085824e-05, + "loss": 1.7768, + "step": 3168 + }, + { + "epoch": 0.2385441954120326, + "grad_norm": 6.198723793029785, + "learning_rate": 8.909971660693148e-05, + "loss": 2.2064, + "step": 3169 + }, + { + "epoch": 0.23861946969269274, + "grad_norm": 5.455992221832275, + "learning_rate": 8.909211703863876e-05, + "loss": 1.8099, + "step": 3170 + }, + { + "epoch": 0.2386947439733529, + "grad_norm": 5.542971134185791, + "learning_rate": 8.908451514643185e-05, + "loss": 1.7015, + "step": 3171 + }, + { + "epoch": 0.23877001825401306, + "grad_norm": 4.432085037231445, + "learning_rate": 8.907691093076266e-05, + "loss": 2.1108, + "step": 3172 + }, + { + "epoch": 0.2388452925346732, + "grad_norm": 5.2662177085876465, + "learning_rate": 8.906930439208323e-05, + "loss": 1.888, + "step": 3173 + }, + { + "epoch": 0.23892056681533336, + "grad_norm": 4.880076885223389, + "learning_rate": 8.906169553084577e-05, + "loss": 1.5764, + "step": 3174 + }, + { + "epoch": 0.23899584109599353, + "grad_norm": 5.456822395324707, + "learning_rate": 8.905408434750259e-05, + "loss": 2.189, + "step": 3175 + }, + { + "epoch": 0.23907111537665368, + "grad_norm": 7.485317230224609, + "learning_rate": 8.904647084250615e-05, + "loss": 1.9685, + "step": 3176 + }, + { + "epoch": 0.23914638965731383, + "grad_norm": 5.173656940460205, + "learning_rate": 8.903885501630907e-05, + "loss": 1.7734, + "step": 3177 + }, + { + "epoch": 0.239221663937974, + "grad_norm": 4.7703752517700195, + "learning_rate": 8.903123686936406e-05, + "loss": 2.4197, + "step": 3178 + }, + { + "epoch": 0.23929693821863415, + "grad_norm": 5.125649452209473, + "learning_rate": 8.9023616402124e-05, + "loss": 1.7814, + "step": 3179 + }, + { + "epoch": 0.2393722124992943, + "grad_norm": 5.37459659576416, + "learning_rate": 8.901599361504194e-05, + "loss": 1.78, + "step": 3180 + }, + { + "epoch": 0.23944748677995445, + "grad_norm": 5.426682949066162, + "learning_rate": 8.900836850857099e-05, + "loss": 1.8954, + "step": 3181 + }, + { + "epoch": 0.23952276106061463, + "grad_norm": 4.460564613342285, + "learning_rate": 8.900074108316448e-05, + "loss": 1.9785, + "step": 3182 + }, + { + "epoch": 0.23959803534127477, + "grad_norm": 7.670148849487305, + "learning_rate": 8.899311133927579e-05, + "loss": 1.915, + "step": 3183 + }, + { + "epoch": 0.23967330962193492, + "grad_norm": 4.1189117431640625, + "learning_rate": 8.898547927735854e-05, + "loss": 1.873, + "step": 3184 + }, + { + "epoch": 0.23974858390259507, + "grad_norm": 6.375442981719971, + "learning_rate": 8.89778448978664e-05, + "loss": 1.9937, + "step": 3185 + }, + { + "epoch": 0.23982385818325525, + "grad_norm": 4.895740985870361, + "learning_rate": 8.897020820125322e-05, + "loss": 2.0181, + "step": 3186 + }, + { + "epoch": 0.2398991324639154, + "grad_norm": 6.858406066894531, + "learning_rate": 8.896256918797298e-05, + "loss": 2.4321, + "step": 3187 + }, + { + "epoch": 0.23997440674457554, + "grad_norm": 5.281861305236816, + "learning_rate": 8.89549278584798e-05, + "loss": 2.1442, + "step": 3188 + }, + { + "epoch": 0.2400496810252357, + "grad_norm": 5.286101818084717, + "learning_rate": 8.894728421322793e-05, + "loss": 2.2196, + "step": 3189 + }, + { + "epoch": 0.24012495530589587, + "grad_norm": 4.0317792892456055, + "learning_rate": 8.893963825267177e-05, + "loss": 1.6817, + "step": 3190 + }, + { + "epoch": 0.240200229586556, + "grad_norm": 5.065546035766602, + "learning_rate": 8.893198997726583e-05, + "loss": 2.3616, + "step": 3191 + }, + { + "epoch": 0.24027550386721616, + "grad_norm": 4.716331958770752, + "learning_rate": 8.892433938746483e-05, + "loss": 1.9798, + "step": 3192 + }, + { + "epoch": 0.24035077814787634, + "grad_norm": 5.989851951599121, + "learning_rate": 8.89166864837235e-05, + "loss": 2.092, + "step": 3193 + }, + { + "epoch": 0.24042605242853649, + "grad_norm": 4.019390106201172, + "learning_rate": 8.890903126649685e-05, + "loss": 1.8826, + "step": 3194 + }, + { + "epoch": 0.24050132670919663, + "grad_norm": 4.936091423034668, + "learning_rate": 8.890137373623992e-05, + "loss": 2.3794, + "step": 3195 + }, + { + "epoch": 0.24057660098985678, + "grad_norm": 4.485189437866211, + "learning_rate": 8.889371389340795e-05, + "loss": 1.8195, + "step": 3196 + }, + { + "epoch": 0.24065187527051696, + "grad_norm": 5.977455139160156, + "learning_rate": 8.888605173845628e-05, + "loss": 2.0448, + "step": 3197 + }, + { + "epoch": 0.2407271495511771, + "grad_norm": 4.3085527420043945, + "learning_rate": 8.887838727184041e-05, + "loss": 2.2646, + "step": 3198 + }, + { + "epoch": 0.24080242383183725, + "grad_norm": 7.7725067138671875, + "learning_rate": 8.887072049401599e-05, + "loss": 2.4822, + "step": 3199 + }, + { + "epoch": 0.2408776981124974, + "grad_norm": 5.251986503601074, + "learning_rate": 8.886305140543876e-05, + "loss": 2.0402, + "step": 3200 + }, + { + "epoch": 0.24095297239315758, + "grad_norm": 7.536415100097656, + "learning_rate": 8.885538000656462e-05, + "loss": 2.0054, + "step": 3201 + }, + { + "epoch": 0.24102824667381773, + "grad_norm": 4.278420448303223, + "learning_rate": 8.884770629784965e-05, + "loss": 1.8513, + "step": 3202 + }, + { + "epoch": 0.24110352095447787, + "grad_norm": 3.909768581390381, + "learning_rate": 8.884003027974999e-05, + "loss": 2.0533, + "step": 3203 + }, + { + "epoch": 0.24117879523513802, + "grad_norm": 4.468538284301758, + "learning_rate": 8.883235195272196e-05, + "loss": 2.0185, + "step": 3204 + }, + { + "epoch": 0.2412540695157982, + "grad_norm": 5.654131889343262, + "learning_rate": 8.882467131722206e-05, + "loss": 2.083, + "step": 3205 + }, + { + "epoch": 0.24132934379645835, + "grad_norm": 6.807154655456543, + "learning_rate": 8.881698837370685e-05, + "loss": 2.0659, + "step": 3206 + }, + { + "epoch": 0.2414046180771185, + "grad_norm": 6.067615985870361, + "learning_rate": 8.880930312263304e-05, + "loss": 2.2531, + "step": 3207 + }, + { + "epoch": 0.24147989235777867, + "grad_norm": 4.343033790588379, + "learning_rate": 8.880161556445751e-05, + "loss": 2.1423, + "step": 3208 + }, + { + "epoch": 0.24155516663843882, + "grad_norm": 4.236153602600098, + "learning_rate": 8.87939256996373e-05, + "loss": 2.0936, + "step": 3209 + }, + { + "epoch": 0.24163044091909897, + "grad_norm": 5.060676574707031, + "learning_rate": 8.87862335286295e-05, + "loss": 2.0666, + "step": 3210 + }, + { + "epoch": 0.2417057151997591, + "grad_norm": 5.189123630523682, + "learning_rate": 8.877853905189139e-05, + "loss": 1.8869, + "step": 3211 + }, + { + "epoch": 0.2417809894804193, + "grad_norm": 4.262923717498779, + "learning_rate": 8.877084226988043e-05, + "loss": 1.9798, + "step": 3212 + }, + { + "epoch": 0.24185626376107944, + "grad_norm": 4.129027366638184, + "learning_rate": 8.876314318305415e-05, + "loss": 2.22, + "step": 3213 + }, + { + "epoch": 0.24193153804173959, + "grad_norm": 4.295915126800537, + "learning_rate": 8.87554417918702e-05, + "loss": 1.8713, + "step": 3214 + }, + { + "epoch": 0.24200681232239973, + "grad_norm": 6.468725681304932, + "learning_rate": 8.874773809678644e-05, + "loss": 1.7633, + "step": 3215 + }, + { + "epoch": 0.2420820866030599, + "grad_norm": 5.065306663513184, + "learning_rate": 8.874003209826087e-05, + "loss": 2.4608, + "step": 3216 + }, + { + "epoch": 0.24215736088372006, + "grad_norm": 6.162162780761719, + "learning_rate": 8.873232379675151e-05, + "loss": 2.3163, + "step": 3217 + }, + { + "epoch": 0.2422326351643802, + "grad_norm": 4.50464391708374, + "learning_rate": 8.872461319271665e-05, + "loss": 2.0267, + "step": 3218 + }, + { + "epoch": 0.24230790944504035, + "grad_norm": 7.087899684906006, + "learning_rate": 8.871690028661465e-05, + "loss": 1.853, + "step": 3219 + }, + { + "epoch": 0.24238318372570053, + "grad_norm": 3.8965680599212646, + "learning_rate": 8.8709185078904e-05, + "loss": 2.1566, + "step": 3220 + }, + { + "epoch": 0.24245845800636068, + "grad_norm": 4.163898468017578, + "learning_rate": 8.870146757004338e-05, + "loss": 1.7985, + "step": 3221 + }, + { + "epoch": 0.24253373228702083, + "grad_norm": 3.990591049194336, + "learning_rate": 8.869374776049157e-05, + "loss": 1.8742, + "step": 3222 + }, + { + "epoch": 0.242609006567681, + "grad_norm": 5.649715423583984, + "learning_rate": 8.868602565070747e-05, + "loss": 1.8228, + "step": 3223 + }, + { + "epoch": 0.24268428084834115, + "grad_norm": 3.96374773979187, + "learning_rate": 8.867830124115016e-05, + "loss": 2.1965, + "step": 3224 + }, + { + "epoch": 0.2427595551290013, + "grad_norm": 6.239009380340576, + "learning_rate": 8.867057453227881e-05, + "loss": 2.061, + "step": 3225 + }, + { + "epoch": 0.24283482940966145, + "grad_norm": 5.888492107391357, + "learning_rate": 8.866284552455276e-05, + "loss": 2.049, + "step": 3226 + }, + { + "epoch": 0.24291010369032162, + "grad_norm": 5.878908634185791, + "learning_rate": 8.865511421843149e-05, + "loss": 2.189, + "step": 3227 + }, + { + "epoch": 0.24298537797098177, + "grad_norm": 4.29360294342041, + "learning_rate": 8.864738061437459e-05, + "loss": 1.967, + "step": 3228 + }, + { + "epoch": 0.24306065225164192, + "grad_norm": 5.251470565795898, + "learning_rate": 8.863964471284179e-05, + "loss": 2.501, + "step": 3229 + }, + { + "epoch": 0.24313592653230207, + "grad_norm": 6.030707359313965, + "learning_rate": 8.8631906514293e-05, + "loss": 2.2955, + "step": 3230 + }, + { + "epoch": 0.24321120081296224, + "grad_norm": 5.896524906158447, + "learning_rate": 8.862416601918822e-05, + "loss": 2.2549, + "step": 3231 + }, + { + "epoch": 0.2432864750936224, + "grad_norm": 4.806870460510254, + "learning_rate": 8.861642322798757e-05, + "loss": 2.0192, + "step": 3232 + }, + { + "epoch": 0.24336174937428254, + "grad_norm": 4.703171730041504, + "learning_rate": 8.860867814115138e-05, + "loss": 2.0472, + "step": 3233 + }, + { + "epoch": 0.24343702365494269, + "grad_norm": 5.212809085845947, + "learning_rate": 8.860093075914005e-05, + "loss": 1.9194, + "step": 3234 + }, + { + "epoch": 0.24351229793560286, + "grad_norm": 4.855073928833008, + "learning_rate": 8.859318108241414e-05, + "loss": 2.2624, + "step": 3235 + }, + { + "epoch": 0.243587572216263, + "grad_norm": 6.122040748596191, + "learning_rate": 8.858542911143437e-05, + "loss": 1.824, + "step": 3236 + }, + { + "epoch": 0.24366284649692316, + "grad_norm": 5.330134391784668, + "learning_rate": 8.857767484666153e-05, + "loss": 2.1992, + "step": 3237 + }, + { + "epoch": 0.2437381207775833, + "grad_norm": 5.156702041625977, + "learning_rate": 8.856991828855663e-05, + "loss": 2.1636, + "step": 3238 + }, + { + "epoch": 0.24381339505824348, + "grad_norm": 6.925411224365234, + "learning_rate": 8.856215943758076e-05, + "loss": 2.1884, + "step": 3239 + }, + { + "epoch": 0.24388866933890363, + "grad_norm": 4.721677780151367, + "learning_rate": 8.855439829419514e-05, + "loss": 1.995, + "step": 3240 + }, + { + "epoch": 0.24396394361956378, + "grad_norm": 4.826356887817383, + "learning_rate": 8.854663485886116e-05, + "loss": 2.115, + "step": 3241 + }, + { + "epoch": 0.24403921790022395, + "grad_norm": 4.6484575271606445, + "learning_rate": 8.853886913204037e-05, + "loss": 1.8325, + "step": 3242 + }, + { + "epoch": 0.2441144921808841, + "grad_norm": 4.887039661407471, + "learning_rate": 8.853110111419437e-05, + "loss": 2.127, + "step": 3243 + }, + { + "epoch": 0.24418976646154425, + "grad_norm": 5.806979179382324, + "learning_rate": 8.852333080578497e-05, + "loss": 2.2364, + "step": 3244 + }, + { + "epoch": 0.2442650407422044, + "grad_norm": 6.169309616088867, + "learning_rate": 8.85155582072741e-05, + "loss": 1.8519, + "step": 3245 + }, + { + "epoch": 0.24434031502286457, + "grad_norm": 4.662698745727539, + "learning_rate": 8.85077833191238e-05, + "loss": 2.0586, + "step": 3246 + }, + { + "epoch": 0.24441558930352472, + "grad_norm": 6.05176305770874, + "learning_rate": 8.850000614179627e-05, + "loss": 2.089, + "step": 3247 + }, + { + "epoch": 0.24449086358418487, + "grad_norm": 4.627222537994385, + "learning_rate": 8.849222667575384e-05, + "loss": 2.0815, + "step": 3248 + }, + { + "epoch": 0.24456613786484502, + "grad_norm": 5.106442451477051, + "learning_rate": 8.8484444921459e-05, + "loss": 2.1427, + "step": 3249 + }, + { + "epoch": 0.2446414121455052, + "grad_norm": 5.837242603302002, + "learning_rate": 8.847666087937432e-05, + "loss": 2.0585, + "step": 3250 + }, + { + "epoch": 0.24471668642616534, + "grad_norm": 4.978334426879883, + "learning_rate": 8.846887454996254e-05, + "loss": 2.1514, + "step": 3251 + }, + { + "epoch": 0.2447919607068255, + "grad_norm": 7.252320766448975, + "learning_rate": 8.846108593368656e-05, + "loss": 2.0872, + "step": 3252 + }, + { + "epoch": 0.24486723498748564, + "grad_norm": 5.338039398193359, + "learning_rate": 8.845329503100939e-05, + "loss": 1.8807, + "step": 3253 + }, + { + "epoch": 0.2449425092681458, + "grad_norm": 4.8889360427856445, + "learning_rate": 8.844550184239415e-05, + "loss": 1.8834, + "step": 3254 + }, + { + "epoch": 0.24501778354880596, + "grad_norm": 6.122738838195801, + "learning_rate": 8.843770636830415e-05, + "loss": 1.9559, + "step": 3255 + }, + { + "epoch": 0.2450930578294661, + "grad_norm": 4.613219261169434, + "learning_rate": 8.842990860920279e-05, + "loss": 1.9199, + "step": 3256 + }, + { + "epoch": 0.24516833211012629, + "grad_norm": 3.9468183517456055, + "learning_rate": 8.842210856555362e-05, + "loss": 2.0463, + "step": 3257 + }, + { + "epoch": 0.24524360639078643, + "grad_norm": 5.367069244384766, + "learning_rate": 8.841430623782035e-05, + "loss": 1.8373, + "step": 3258 + }, + { + "epoch": 0.24531888067144658, + "grad_norm": 4.730619430541992, + "learning_rate": 8.84065016264668e-05, + "loss": 1.8651, + "step": 3259 + }, + { + "epoch": 0.24539415495210673, + "grad_norm": 3.5489230155944824, + "learning_rate": 8.839869473195693e-05, + "loss": 1.8292, + "step": 3260 + }, + { + "epoch": 0.2454694292327669, + "grad_norm": 5.090607166290283, + "learning_rate": 8.839088555475484e-05, + "loss": 1.9293, + "step": 3261 + }, + { + "epoch": 0.24554470351342705, + "grad_norm": 5.776881217956543, + "learning_rate": 8.838307409532475e-05, + "loss": 2.9284, + "step": 3262 + }, + { + "epoch": 0.2456199777940872, + "grad_norm": 7.403339862823486, + "learning_rate": 8.837526035413104e-05, + "loss": 2.3229, + "step": 3263 + }, + { + "epoch": 0.24569525207474735, + "grad_norm": 3.538074254989624, + "learning_rate": 8.836744433163822e-05, + "loss": 1.8653, + "step": 3264 + }, + { + "epoch": 0.24577052635540753, + "grad_norm": 4.7638373374938965, + "learning_rate": 8.835962602831092e-05, + "loss": 2.3151, + "step": 3265 + }, + { + "epoch": 0.24584580063606767, + "grad_norm": 5.08485221862793, + "learning_rate": 8.835180544461391e-05, + "loss": 1.9342, + "step": 3266 + }, + { + "epoch": 0.24592107491672782, + "grad_norm": 5.612966060638428, + "learning_rate": 8.834398258101212e-05, + "loss": 1.9198, + "step": 3267 + }, + { + "epoch": 0.24599634919738797, + "grad_norm": 4.825185298919678, + "learning_rate": 8.833615743797058e-05, + "loss": 2.1121, + "step": 3268 + }, + { + "epoch": 0.24607162347804815, + "grad_norm": 4.684162616729736, + "learning_rate": 8.832833001595449e-05, + "loss": 1.8406, + "step": 3269 + }, + { + "epoch": 0.2461468977587083, + "grad_norm": 7.745378494262695, + "learning_rate": 8.832050031542915e-05, + "loss": 2.3693, + "step": 3270 + }, + { + "epoch": 0.24622217203936844, + "grad_norm": 5.790078639984131, + "learning_rate": 8.831266833686001e-05, + "loss": 2.1797, + "step": 3271 + }, + { + "epoch": 0.24629744632002862, + "grad_norm": 4.767457485198975, + "learning_rate": 8.830483408071269e-05, + "loss": 1.8253, + "step": 3272 + }, + { + "epoch": 0.24637272060068877, + "grad_norm": 8.394580841064453, + "learning_rate": 8.829699754745287e-05, + "loss": 2.1633, + "step": 3273 + }, + { + "epoch": 0.24644799488134891, + "grad_norm": 5.40822696685791, + "learning_rate": 8.828915873754644e-05, + "loss": 1.9166, + "step": 3274 + }, + { + "epoch": 0.24652326916200906, + "grad_norm": 5.291109085083008, + "learning_rate": 8.82813176514594e-05, + "loss": 2.3479, + "step": 3275 + }, + { + "epoch": 0.24659854344266924, + "grad_norm": 7.391654968261719, + "learning_rate": 8.827347428965785e-05, + "loss": 1.9447, + "step": 3276 + }, + { + "epoch": 0.24667381772332939, + "grad_norm": 5.714291572570801, + "learning_rate": 8.826562865260807e-05, + "loss": 2.0981, + "step": 3277 + }, + { + "epoch": 0.24674909200398953, + "grad_norm": 5.089010715484619, + "learning_rate": 8.825778074077649e-05, + "loss": 1.9184, + "step": 3278 + }, + { + "epoch": 0.24682436628464968, + "grad_norm": 4.725710868835449, + "learning_rate": 8.82499305546296e-05, + "loss": 2.0454, + "step": 3279 + }, + { + "epoch": 0.24689964056530986, + "grad_norm": 5.179872035980225, + "learning_rate": 8.82420780946341e-05, + "loss": 2.1509, + "step": 3280 + }, + { + "epoch": 0.24697491484597, + "grad_norm": 5.690318584442139, + "learning_rate": 8.823422336125677e-05, + "loss": 2.2186, + "step": 3281 + }, + { + "epoch": 0.24705018912663015, + "grad_norm": 8.24884033203125, + "learning_rate": 8.822636635496457e-05, + "loss": 2.2951, + "step": 3282 + }, + { + "epoch": 0.2471254634072903, + "grad_norm": 4.350593566894531, + "learning_rate": 8.821850707622458e-05, + "loss": 1.9575, + "step": 3283 + }, + { + "epoch": 0.24720073768795048, + "grad_norm": 4.112179279327393, + "learning_rate": 8.821064552550399e-05, + "loss": 1.8592, + "step": 3284 + }, + { + "epoch": 0.24727601196861063, + "grad_norm": 3.415997266769409, + "learning_rate": 8.820278170327018e-05, + "loss": 1.6822, + "step": 3285 + }, + { + "epoch": 0.24735128624927077, + "grad_norm": 3.7929773330688477, + "learning_rate": 8.819491560999062e-05, + "loss": 2.0962, + "step": 3286 + }, + { + "epoch": 0.24742656052993092, + "grad_norm": 5.0059638023376465, + "learning_rate": 8.818704724613291e-05, + "loss": 2.2084, + "step": 3287 + }, + { + "epoch": 0.2475018348105911, + "grad_norm": 5.448540210723877, + "learning_rate": 8.817917661216482e-05, + "loss": 1.764, + "step": 3288 + }, + { + "epoch": 0.24757710909125125, + "grad_norm": 4.011754989624023, + "learning_rate": 8.817130370855422e-05, + "loss": 1.7774, + "step": 3289 + }, + { + "epoch": 0.2476523833719114, + "grad_norm": 4.657879829406738, + "learning_rate": 8.816342853576913e-05, + "loss": 2.1121, + "step": 3290 + }, + { + "epoch": 0.24772765765257157, + "grad_norm": 4.80071496963501, + "learning_rate": 8.815555109427773e-05, + "loss": 1.6902, + "step": 3291 + }, + { + "epoch": 0.24780293193323172, + "grad_norm": 4.337059497833252, + "learning_rate": 8.814767138454832e-05, + "loss": 1.9276, + "step": 3292 + }, + { + "epoch": 0.24787820621389187, + "grad_norm": 5.983322620391846, + "learning_rate": 8.813978940704927e-05, + "loss": 2.2808, + "step": 3293 + }, + { + "epoch": 0.24795348049455201, + "grad_norm": 5.750428199768066, + "learning_rate": 8.813190516224919e-05, + "loss": 1.8586, + "step": 3294 + }, + { + "epoch": 0.2480287547752122, + "grad_norm": 7.652900695800781, + "learning_rate": 8.812401865061675e-05, + "loss": 2.7195, + "step": 3295 + }, + { + "epoch": 0.24810402905587234, + "grad_norm": 7.795409202575684, + "learning_rate": 8.811612987262079e-05, + "loss": 2.0709, + "step": 3296 + }, + { + "epoch": 0.2481793033365325, + "grad_norm": 4.371449947357178, + "learning_rate": 8.810823882873029e-05, + "loss": 1.9146, + "step": 3297 + }, + { + "epoch": 0.24825457761719263, + "grad_norm": 3.578740119934082, + "learning_rate": 8.810034551941432e-05, + "loss": 1.9311, + "step": 3298 + }, + { + "epoch": 0.2483298518978528, + "grad_norm": 4.870274543762207, + "learning_rate": 8.809244994514216e-05, + "loss": 2.2229, + "step": 3299 + }, + { + "epoch": 0.24840512617851296, + "grad_norm": 4.595224380493164, + "learning_rate": 8.808455210638313e-05, + "loss": 1.8853, + "step": 3300 + }, + { + "epoch": 0.2484804004591731, + "grad_norm": 3.7291858196258545, + "learning_rate": 8.807665200360675e-05, + "loss": 1.8263, + "step": 3301 + }, + { + "epoch": 0.24855567473983325, + "grad_norm": 6.123600959777832, + "learning_rate": 8.806874963728267e-05, + "loss": 2.3201, + "step": 3302 + }, + { + "epoch": 0.24863094902049343, + "grad_norm": 6.319217205047607, + "learning_rate": 8.806084500788065e-05, + "loss": 2.3282, + "step": 3303 + }, + { + "epoch": 0.24870622330115358, + "grad_norm": 6.808290481567383, + "learning_rate": 8.80529381158706e-05, + "loss": 2.132, + "step": 3304 + }, + { + "epoch": 0.24878149758181373, + "grad_norm": 3.7249703407287598, + "learning_rate": 8.80450289617226e-05, + "loss": 1.9323, + "step": 3305 + }, + { + "epoch": 0.2488567718624739, + "grad_norm": 7.34173059463501, + "learning_rate": 8.803711754590676e-05, + "loss": 2.374, + "step": 3306 + }, + { + "epoch": 0.24893204614313405, + "grad_norm": 3.6013126373291016, + "learning_rate": 8.802920386889341e-05, + "loss": 1.7299, + "step": 3307 + }, + { + "epoch": 0.2490073204237942, + "grad_norm": 3.4563629627227783, + "learning_rate": 8.802128793115303e-05, + "loss": 2.2702, + "step": 3308 + }, + { + "epoch": 0.24908259470445435, + "grad_norm": 3.9899401664733887, + "learning_rate": 8.801336973315619e-05, + "loss": 1.9353, + "step": 3309 + }, + { + "epoch": 0.24915786898511452, + "grad_norm": 3.9629344940185547, + "learning_rate": 8.800544927537356e-05, + "loss": 2.0647, + "step": 3310 + }, + { + "epoch": 0.24923314326577467, + "grad_norm": 3.396920919418335, + "learning_rate": 8.799752655827604e-05, + "loss": 1.9794, + "step": 3311 + }, + { + "epoch": 0.24930841754643482, + "grad_norm": 5.570250511169434, + "learning_rate": 8.798960158233458e-05, + "loss": 1.7047, + "step": 3312 + }, + { + "epoch": 0.24938369182709497, + "grad_norm": 6.726410388946533, + "learning_rate": 8.798167434802033e-05, + "loss": 2.053, + "step": 3313 + }, + { + "epoch": 0.24945896610775514, + "grad_norm": 3.5762672424316406, + "learning_rate": 8.797374485580451e-05, + "loss": 1.986, + "step": 3314 + }, + { + "epoch": 0.2495342403884153, + "grad_norm": 3.8634605407714844, + "learning_rate": 8.796581310615851e-05, + "loss": 1.8157, + "step": 3315 + }, + { + "epoch": 0.24960951466907544, + "grad_norm": 8.683192253112793, + "learning_rate": 8.795787909955387e-05, + "loss": 1.7897, + "step": 3316 + }, + { + "epoch": 0.2496847889497356, + "grad_norm": 4.00594425201416, + "learning_rate": 8.794994283646223e-05, + "loss": 1.9053, + "step": 3317 + }, + { + "epoch": 0.24976006323039576, + "grad_norm": 6.490563869476318, + "learning_rate": 8.794200431735537e-05, + "loss": 2.0428, + "step": 3318 + }, + { + "epoch": 0.2498353375110559, + "grad_norm": 4.518208026885986, + "learning_rate": 8.793406354270523e-05, + "loss": 1.9524, + "step": 3319 + }, + { + "epoch": 0.24991061179171606, + "grad_norm": 5.257863521575928, + "learning_rate": 8.792612051298385e-05, + "loss": 1.9707, + "step": 3320 + }, + { + "epoch": 0.24998588607237623, + "grad_norm": 6.141751289367676, + "learning_rate": 8.791817522866344e-05, + "loss": 2.2946, + "step": 3321 + }, + { + "epoch": 0.2500611603530364, + "grad_norm": 5.840966701507568, + "learning_rate": 8.79102276902163e-05, + "loss": 1.8801, + "step": 3322 + }, + { + "epoch": 0.25013643463369656, + "grad_norm": 3.7239162921905518, + "learning_rate": 8.790227789811489e-05, + "loss": 1.8929, + "step": 3323 + }, + { + "epoch": 0.2502117089143567, + "grad_norm": 4.691562175750732, + "learning_rate": 8.789432585283183e-05, + "loss": 2.1422, + "step": 3324 + }, + { + "epoch": 0.25028698319501685, + "grad_norm": 4.269082069396973, + "learning_rate": 8.788637155483982e-05, + "loss": 2.1052, + "step": 3325 + }, + { + "epoch": 0.250362257475677, + "grad_norm": 5.503195285797119, + "learning_rate": 8.787841500461173e-05, + "loss": 1.9413, + "step": 3326 + }, + { + "epoch": 0.25043753175633715, + "grad_norm": 5.027651786804199, + "learning_rate": 8.787045620262057e-05, + "loss": 2.0771, + "step": 3327 + }, + { + "epoch": 0.2505128060369973, + "grad_norm": 5.761382579803467, + "learning_rate": 8.786249514933944e-05, + "loss": 2.0676, + "step": 3328 + }, + { + "epoch": 0.25058808031765745, + "grad_norm": 5.439498424530029, + "learning_rate": 8.785453184524161e-05, + "loss": 1.9866, + "step": 3329 + }, + { + "epoch": 0.2506633545983176, + "grad_norm": 4.790576934814453, + "learning_rate": 8.784656629080048e-05, + "loss": 1.8084, + "step": 3330 + }, + { + "epoch": 0.2507386288789778, + "grad_norm": 5.286802768707275, + "learning_rate": 8.783859848648958e-05, + "loss": 1.9375, + "step": 3331 + }, + { + "epoch": 0.2508139031596379, + "grad_norm": 4.268539905548096, + "learning_rate": 8.783062843278257e-05, + "loss": 2.1788, + "step": 3332 + }, + { + "epoch": 0.2508891774402981, + "grad_norm": 4.8210954666137695, + "learning_rate": 8.782265613015325e-05, + "loss": 2.1334, + "step": 3333 + }, + { + "epoch": 0.2509644517209582, + "grad_norm": 5.283801555633545, + "learning_rate": 8.781468157907555e-05, + "loss": 2.0491, + "step": 3334 + }, + { + "epoch": 0.2510397260016184, + "grad_norm": 5.443617343902588, + "learning_rate": 8.780670478002353e-05, + "loss": 2.0865, + "step": 3335 + }, + { + "epoch": 0.25111500028227857, + "grad_norm": 4.314052581787109, + "learning_rate": 8.779872573347138e-05, + "loss": 1.8371, + "step": 3336 + }, + { + "epoch": 0.2511902745629387, + "grad_norm": 3.884768486022949, + "learning_rate": 8.779074443989346e-05, + "loss": 1.7564, + "step": 3337 + }, + { + "epoch": 0.25126554884359886, + "grad_norm": 3.7344601154327393, + "learning_rate": 8.778276089976421e-05, + "loss": 1.8519, + "step": 3338 + }, + { + "epoch": 0.25134082312425904, + "grad_norm": 5.608950614929199, + "learning_rate": 8.777477511355822e-05, + "loss": 1.9414, + "step": 3339 + }, + { + "epoch": 0.25141609740491916, + "grad_norm": 4.355217933654785, + "learning_rate": 8.776678708175025e-05, + "loss": 1.9682, + "step": 3340 + }, + { + "epoch": 0.25149137168557933, + "grad_norm": 5.5209879875183105, + "learning_rate": 8.775879680481516e-05, + "loss": 2.2802, + "step": 3341 + }, + { + "epoch": 0.2515666459662395, + "grad_norm": 4.159858226776123, + "learning_rate": 8.775080428322794e-05, + "loss": 1.8581, + "step": 3342 + }, + { + "epoch": 0.25164192024689963, + "grad_norm": 4.0683746337890625, + "learning_rate": 8.774280951746372e-05, + "loss": 2.1172, + "step": 3343 + }, + { + "epoch": 0.2517171945275598, + "grad_norm": 4.304447174072266, + "learning_rate": 8.773481250799777e-05, + "loss": 1.8689, + "step": 3344 + }, + { + "epoch": 0.2517924688082199, + "grad_norm": 4.066592693328857, + "learning_rate": 8.77268132553055e-05, + "loss": 2.0504, + "step": 3345 + }, + { + "epoch": 0.2518677430888801, + "grad_norm": 4.950205326080322, + "learning_rate": 8.77188117598624e-05, + "loss": 2.14, + "step": 3346 + }, + { + "epoch": 0.2519430173695403, + "grad_norm": 5.934423446655273, + "learning_rate": 8.77108080221442e-05, + "loss": 1.7997, + "step": 3347 + }, + { + "epoch": 0.2520182916502004, + "grad_norm": 6.897109508514404, + "learning_rate": 8.770280204262666e-05, + "loss": 1.8742, + "step": 3348 + }, + { + "epoch": 0.2520935659308606, + "grad_norm": 4.602880477905273, + "learning_rate": 8.769479382178572e-05, + "loss": 1.9619, + "step": 3349 + }, + { + "epoch": 0.25216884021152075, + "grad_norm": 6.491775035858154, + "learning_rate": 8.768678336009745e-05, + "loss": 1.9483, + "step": 3350 + }, + { + "epoch": 0.25224411449218087, + "grad_norm": 4.190241813659668, + "learning_rate": 8.767877065803803e-05, + "loss": 1.9654, + "step": 3351 + }, + { + "epoch": 0.25231938877284105, + "grad_norm": 3.8782293796539307, + "learning_rate": 8.767075571608383e-05, + "loss": 1.9688, + "step": 3352 + }, + { + "epoch": 0.2523946630535012, + "grad_norm": 4.169342041015625, + "learning_rate": 8.766273853471128e-05, + "loss": 1.8204, + "step": 3353 + }, + { + "epoch": 0.25246993733416134, + "grad_norm": 3.9439549446105957, + "learning_rate": 8.765471911439697e-05, + "loss": 2.0236, + "step": 3354 + }, + { + "epoch": 0.2525452116148215, + "grad_norm": 4.357481956481934, + "learning_rate": 8.764669745561768e-05, + "loss": 2.1817, + "step": 3355 + }, + { + "epoch": 0.25262048589548164, + "grad_norm": 4.370327949523926, + "learning_rate": 8.763867355885023e-05, + "loss": 2.4143, + "step": 3356 + }, + { + "epoch": 0.2526957601761418, + "grad_norm": 6.771965026855469, + "learning_rate": 8.763064742457165e-05, + "loss": 2.1607, + "step": 3357 + }, + { + "epoch": 0.252771034456802, + "grad_norm": 5.551456451416016, + "learning_rate": 8.762261905325905e-05, + "loss": 2.0823, + "step": 3358 + }, + { + "epoch": 0.2528463087374621, + "grad_norm": 5.793180465698242, + "learning_rate": 8.76145884453897e-05, + "loss": 2.6564, + "step": 3359 + }, + { + "epoch": 0.2529215830181223, + "grad_norm": 3.7761664390563965, + "learning_rate": 8.760655560144098e-05, + "loss": 2.0649, + "step": 3360 + }, + { + "epoch": 0.25299685729878246, + "grad_norm": 7.010573863983154, + "learning_rate": 8.759852052189045e-05, + "loss": 2.1021, + "step": 3361 + }, + { + "epoch": 0.2530721315794426, + "grad_norm": 4.954830169677734, + "learning_rate": 8.759048320721576e-05, + "loss": 2.3054, + "step": 3362 + }, + { + "epoch": 0.25314740586010276, + "grad_norm": 4.391373634338379, + "learning_rate": 8.75824436578947e-05, + "loss": 1.8874, + "step": 3363 + }, + { + "epoch": 0.2532226801407629, + "grad_norm": 4.38601016998291, + "learning_rate": 8.75744018744052e-05, + "loss": 2.07, + "step": 3364 + }, + { + "epoch": 0.25329795442142305, + "grad_norm": 4.215841293334961, + "learning_rate": 8.75663578572253e-05, + "loss": 1.7271, + "step": 3365 + }, + { + "epoch": 0.25337322870208323, + "grad_norm": 5.292981147766113, + "learning_rate": 8.755831160683325e-05, + "loss": 2.1078, + "step": 3366 + }, + { + "epoch": 0.25344850298274335, + "grad_norm": 6.2771315574646, + "learning_rate": 8.755026312370733e-05, + "loss": 2.2676, + "step": 3367 + }, + { + "epoch": 0.2535237772634035, + "grad_norm": 3.629342555999756, + "learning_rate": 8.754221240832599e-05, + "loss": 1.8312, + "step": 3368 + }, + { + "epoch": 0.2535990515440637, + "grad_norm": 4.629815101623535, + "learning_rate": 8.753415946116787e-05, + "loss": 2.0529, + "step": 3369 + }, + { + "epoch": 0.2536743258247238, + "grad_norm": 8.151969909667969, + "learning_rate": 8.752610428271165e-05, + "loss": 2.0582, + "step": 3370 + }, + { + "epoch": 0.253749600105384, + "grad_norm": 7.247431755065918, + "learning_rate": 8.751804687343621e-05, + "loss": 2.031, + "step": 3371 + }, + { + "epoch": 0.2538248743860442, + "grad_norm": 5.617743968963623, + "learning_rate": 8.750998723382055e-05, + "loss": 1.9237, + "step": 3372 + }, + { + "epoch": 0.2539001486667043, + "grad_norm": 5.700640678405762, + "learning_rate": 8.750192536434376e-05, + "loss": 1.6449, + "step": 3373 + }, + { + "epoch": 0.25397542294736447, + "grad_norm": 4.86699914932251, + "learning_rate": 8.749386126548512e-05, + "loss": 1.8457, + "step": 3374 + }, + { + "epoch": 0.2540506972280246, + "grad_norm": 5.3677754402160645, + "learning_rate": 8.7485794937724e-05, + "loss": 1.892, + "step": 3375 + }, + { + "epoch": 0.25412597150868477, + "grad_norm": 4.07196569442749, + "learning_rate": 8.747772638153992e-05, + "loss": 2.0197, + "step": 3376 + }, + { + "epoch": 0.25420124578934494, + "grad_norm": 4.9392781257629395, + "learning_rate": 8.746965559741257e-05, + "loss": 1.9424, + "step": 3377 + }, + { + "epoch": 0.25427652007000506, + "grad_norm": 4.943356513977051, + "learning_rate": 8.74615825858217e-05, + "loss": 2.3784, + "step": 3378 + }, + { + "epoch": 0.25435179435066524, + "grad_norm": 5.138035774230957, + "learning_rate": 8.745350734724723e-05, + "loss": 2.1062, + "step": 3379 + }, + { + "epoch": 0.2544270686313254, + "grad_norm": 5.938267230987549, + "learning_rate": 8.744542988216922e-05, + "loss": 2.1295, + "step": 3380 + }, + { + "epoch": 0.25450234291198554, + "grad_norm": 5.142238140106201, + "learning_rate": 8.743735019106782e-05, + "loss": 2.5063, + "step": 3381 + }, + { + "epoch": 0.2545776171926457, + "grad_norm": 5.301363945007324, + "learning_rate": 8.74292682744234e-05, + "loss": 1.7866, + "step": 3382 + }, + { + "epoch": 0.25465289147330583, + "grad_norm": 4.625992298126221, + "learning_rate": 8.742118413271637e-05, + "loss": 2.1576, + "step": 3383 + }, + { + "epoch": 0.254728165753966, + "grad_norm": 5.782331466674805, + "learning_rate": 8.741309776642732e-05, + "loss": 2.4397, + "step": 3384 + }, + { + "epoch": 0.2548034400346262, + "grad_norm": 5.359298229217529, + "learning_rate": 8.740500917603696e-05, + "loss": 1.9869, + "step": 3385 + }, + { + "epoch": 0.2548787143152863, + "grad_norm": 4.609222412109375, + "learning_rate": 8.739691836202613e-05, + "loss": 1.897, + "step": 3386 + }, + { + "epoch": 0.2549539885959465, + "grad_norm": 5.1042890548706055, + "learning_rate": 8.73888253248758e-05, + "loss": 1.6332, + "step": 3387 + }, + { + "epoch": 0.25502926287660665, + "grad_norm": 7.094058513641357, + "learning_rate": 8.738073006506708e-05, + "loss": 2.1224, + "step": 3388 + }, + { + "epoch": 0.2551045371572668, + "grad_norm": 4.3952178955078125, + "learning_rate": 8.737263258308122e-05, + "loss": 2.0529, + "step": 3389 + }, + { + "epoch": 0.25517981143792695, + "grad_norm": 4.321027755737305, + "learning_rate": 8.736453287939958e-05, + "loss": 2.2885, + "step": 3390 + }, + { + "epoch": 0.2552550857185871, + "grad_norm": 4.530011177062988, + "learning_rate": 8.73564309545037e-05, + "loss": 2.2276, + "step": 3391 + }, + { + "epoch": 0.25533035999924725, + "grad_norm": 4.836587429046631, + "learning_rate": 8.734832680887517e-05, + "loss": 1.9083, + "step": 3392 + }, + { + "epoch": 0.2554056342799074, + "grad_norm": 5.3503875732421875, + "learning_rate": 8.734022044299577e-05, + "loss": 2.2422, + "step": 3393 + }, + { + "epoch": 0.25548090856056754, + "grad_norm": 4.704232215881348, + "learning_rate": 8.73321118573474e-05, + "loss": 1.7447, + "step": 3394 + }, + { + "epoch": 0.2555561828412277, + "grad_norm": 5.155667781829834, + "learning_rate": 8.732400105241211e-05, + "loss": 2.1845, + "step": 3395 + }, + { + "epoch": 0.2556314571218879, + "grad_norm": 4.234272480010986, + "learning_rate": 8.731588802867205e-05, + "loss": 2.0498, + "step": 3396 + }, + { + "epoch": 0.255706731402548, + "grad_norm": 4.946651458740234, + "learning_rate": 8.730777278660951e-05, + "loss": 1.9334, + "step": 3397 + }, + { + "epoch": 0.2557820056832082, + "grad_norm": 3.620703935623169, + "learning_rate": 8.729965532670694e-05, + "loss": 1.858, + "step": 3398 + }, + { + "epoch": 0.25585727996386837, + "grad_norm": 4.361959934234619, + "learning_rate": 8.729153564944688e-05, + "loss": 1.8494, + "step": 3399 + }, + { + "epoch": 0.2559325542445285, + "grad_norm": 6.2169013023376465, + "learning_rate": 8.728341375531203e-05, + "loss": 2.1182, + "step": 3400 + }, + { + "epoch": 0.25600782852518866, + "grad_norm": 5.479050636291504, + "learning_rate": 8.72752896447852e-05, + "loss": 2.1807, + "step": 3401 + }, + { + "epoch": 0.25608310280584884, + "grad_norm": 6.934073448181152, + "learning_rate": 8.726716331834937e-05, + "loss": 1.7589, + "step": 3402 + }, + { + "epoch": 0.25615837708650896, + "grad_norm": 5.906223297119141, + "learning_rate": 8.72590347764876e-05, + "loss": 2.1234, + "step": 3403 + }, + { + "epoch": 0.25623365136716914, + "grad_norm": 4.269212245941162, + "learning_rate": 8.725090401968312e-05, + "loss": 2.3002, + "step": 3404 + }, + { + "epoch": 0.25630892564782926, + "grad_norm": 4.559053897857666, + "learning_rate": 8.724277104841929e-05, + "loss": 2.2314, + "step": 3405 + }, + { + "epoch": 0.25638419992848943, + "grad_norm": 4.994034290313721, + "learning_rate": 8.723463586317957e-05, + "loss": 1.9856, + "step": 3406 + }, + { + "epoch": 0.2564594742091496, + "grad_norm": 4.428234100341797, + "learning_rate": 8.72264984644476e-05, + "loss": 1.864, + "step": 3407 + }, + { + "epoch": 0.2565347484898097, + "grad_norm": 3.861332416534424, + "learning_rate": 8.72183588527071e-05, + "loss": 2.1549, + "step": 3408 + }, + { + "epoch": 0.2566100227704699, + "grad_norm": 4.773284435272217, + "learning_rate": 8.721021702844197e-05, + "loss": 2.1396, + "step": 3409 + }, + { + "epoch": 0.2566852970511301, + "grad_norm": 4.855329513549805, + "learning_rate": 8.72020729921362e-05, + "loss": 1.981, + "step": 3410 + }, + { + "epoch": 0.2567605713317902, + "grad_norm": 5.015233516693115, + "learning_rate": 8.719392674427394e-05, + "loss": 2.1748, + "step": 3411 + }, + { + "epoch": 0.2568358456124504, + "grad_norm": 3.997474431991577, + "learning_rate": 8.718577828533944e-05, + "loss": 1.7354, + "step": 3412 + }, + { + "epoch": 0.2569111198931105, + "grad_norm": 5.592644691467285, + "learning_rate": 8.717762761581714e-05, + "loss": 2.0347, + "step": 3413 + }, + { + "epoch": 0.25698639417377067, + "grad_norm": 4.816791534423828, + "learning_rate": 8.716947473619154e-05, + "loss": 1.91, + "step": 3414 + }, + { + "epoch": 0.25706166845443085, + "grad_norm": 5.616014003753662, + "learning_rate": 8.716131964694731e-05, + "loss": 1.9204, + "step": 3415 + }, + { + "epoch": 0.25713694273509097, + "grad_norm": 3.9044852256774902, + "learning_rate": 8.715316234856928e-05, + "loss": 1.8938, + "step": 3416 + }, + { + "epoch": 0.25721221701575114, + "grad_norm": 6.910111427307129, + "learning_rate": 8.714500284154232e-05, + "loss": 1.8142, + "step": 3417 + }, + { + "epoch": 0.2572874912964113, + "grad_norm": 5.297390460968018, + "learning_rate": 8.713684112635154e-05, + "loss": 1.9353, + "step": 3418 + }, + { + "epoch": 0.25736276557707144, + "grad_norm": 4.03819465637207, + "learning_rate": 8.712867720348212e-05, + "loss": 1.8041, + "step": 3419 + }, + { + "epoch": 0.2574380398577316, + "grad_norm": 9.238258361816406, + "learning_rate": 8.712051107341936e-05, + "loss": 1.8716, + "step": 3420 + }, + { + "epoch": 0.2575133141383918, + "grad_norm": 5.736218452453613, + "learning_rate": 8.711234273664874e-05, + "loss": 2.1221, + "step": 3421 + }, + { + "epoch": 0.2575885884190519, + "grad_norm": 7.050442695617676, + "learning_rate": 8.710417219365583e-05, + "loss": 1.9219, + "step": 3422 + }, + { + "epoch": 0.2576638626997121, + "grad_norm": 4.648888111114502, + "learning_rate": 8.709599944492635e-05, + "loss": 1.6816, + "step": 3423 + }, + { + "epoch": 0.2577391369803722, + "grad_norm": 3.7949411869049072, + "learning_rate": 8.708782449094613e-05, + "loss": 2.0155, + "step": 3424 + }, + { + "epoch": 0.2578144112610324, + "grad_norm": 4.470954895019531, + "learning_rate": 8.707964733220116e-05, + "loss": 1.9796, + "step": 3425 + }, + { + "epoch": 0.25788968554169256, + "grad_norm": 4.936312198638916, + "learning_rate": 8.707146796917756e-05, + "loss": 2.0802, + "step": 3426 + }, + { + "epoch": 0.2579649598223527, + "grad_norm": 4.233046054840088, + "learning_rate": 8.706328640236156e-05, + "loss": 2.1409, + "step": 3427 + }, + { + "epoch": 0.25804023410301286, + "grad_norm": 6.2435808181762695, + "learning_rate": 8.705510263223953e-05, + "loss": 1.6905, + "step": 3428 + }, + { + "epoch": 0.25811550838367303, + "grad_norm": 4.378723621368408, + "learning_rate": 8.704691665929797e-05, + "loss": 2.0402, + "step": 3429 + }, + { + "epoch": 0.25819078266433315, + "grad_norm": 5.030261516571045, + "learning_rate": 8.70387284840235e-05, + "loss": 1.9507, + "step": 3430 + }, + { + "epoch": 0.2582660569449933, + "grad_norm": 5.445355415344238, + "learning_rate": 8.703053810690292e-05, + "loss": 1.8044, + "step": 3431 + }, + { + "epoch": 0.25834133122565345, + "grad_norm": 4.9107842445373535, + "learning_rate": 8.702234552842307e-05, + "loss": 1.9331, + "step": 3432 + }, + { + "epoch": 0.2584166055063136, + "grad_norm": 5.036314010620117, + "learning_rate": 8.701415074907104e-05, + "loss": 2.0232, + "step": 3433 + }, + { + "epoch": 0.2584918797869738, + "grad_norm": 6.3447184562683105, + "learning_rate": 8.700595376933394e-05, + "loss": 1.921, + "step": 3434 + }, + { + "epoch": 0.2585671540676339, + "grad_norm": 5.951164722442627, + "learning_rate": 8.699775458969908e-05, + "loss": 2.2845, + "step": 3435 + }, + { + "epoch": 0.2586424283482941, + "grad_norm": 5.955033779144287, + "learning_rate": 8.698955321065386e-05, + "loss": 1.8527, + "step": 3436 + }, + { + "epoch": 0.25871770262895427, + "grad_norm": 3.9748892784118652, + "learning_rate": 8.698134963268583e-05, + "loss": 2.1347, + "step": 3437 + }, + { + "epoch": 0.2587929769096144, + "grad_norm": 6.248036861419678, + "learning_rate": 8.697314385628268e-05, + "loss": 2.0757, + "step": 3438 + }, + { + "epoch": 0.25886825119027457, + "grad_norm": 4.950955867767334, + "learning_rate": 8.696493588193222e-05, + "loss": 1.9437, + "step": 3439 + }, + { + "epoch": 0.25894352547093474, + "grad_norm": 4.586086273193359, + "learning_rate": 8.695672571012239e-05, + "loss": 2.0434, + "step": 3440 + }, + { + "epoch": 0.25901879975159486, + "grad_norm": 3.800874710083008, + "learning_rate": 8.694851334134125e-05, + "loss": 1.9291, + "step": 3441 + }, + { + "epoch": 0.25909407403225504, + "grad_norm": 6.324313640594482, + "learning_rate": 8.694029877607702e-05, + "loss": 2.1894, + "step": 3442 + }, + { + "epoch": 0.25916934831291516, + "grad_norm": 5.828389644622803, + "learning_rate": 8.693208201481799e-05, + "loss": 1.6579, + "step": 3443 + }, + { + "epoch": 0.25924462259357534, + "grad_norm": 3.8350470066070557, + "learning_rate": 8.692386305805269e-05, + "loss": 2.1038, + "step": 3444 + }, + { + "epoch": 0.2593198968742355, + "grad_norm": 5.236853122711182, + "learning_rate": 8.691564190626967e-05, + "loss": 1.9544, + "step": 3445 + }, + { + "epoch": 0.25939517115489563, + "grad_norm": 5.964720249176025, + "learning_rate": 8.690741855995765e-05, + "loss": 2.2523, + "step": 3446 + }, + { + "epoch": 0.2594704454355558, + "grad_norm": 4.067541599273682, + "learning_rate": 8.689919301960549e-05, + "loss": 1.5779, + "step": 3447 + }, + { + "epoch": 0.259545719716216, + "grad_norm": 4.843165397644043, + "learning_rate": 8.68909652857022e-05, + "loss": 2.003, + "step": 3448 + }, + { + "epoch": 0.2596209939968761, + "grad_norm": 5.822704792022705, + "learning_rate": 8.688273535873686e-05, + "loss": 1.7866, + "step": 3449 + }, + { + "epoch": 0.2596962682775363, + "grad_norm": 5.259697914123535, + "learning_rate": 8.687450323919875e-05, + "loss": 1.4391, + "step": 3450 + }, + { + "epoch": 0.25977154255819646, + "grad_norm": 4.767988204956055, + "learning_rate": 8.68662689275772e-05, + "loss": 1.9427, + "step": 3451 + }, + { + "epoch": 0.2598468168388566, + "grad_norm": 4.689269542694092, + "learning_rate": 8.685803242436177e-05, + "loss": 2.1226, + "step": 3452 + }, + { + "epoch": 0.25992209111951675, + "grad_norm": 6.059725284576416, + "learning_rate": 8.684979373004205e-05, + "loss": 1.7197, + "step": 3453 + }, + { + "epoch": 0.25999736540017687, + "grad_norm": 6.01936674118042, + "learning_rate": 8.684155284510785e-05, + "loss": 2.1568, + "step": 3454 + }, + { + "epoch": 0.26007263968083705, + "grad_norm": 5.624912261962891, + "learning_rate": 8.683330977004902e-05, + "loss": 2.1994, + "step": 3455 + }, + { + "epoch": 0.2601479139614972, + "grad_norm": 5.395583152770996, + "learning_rate": 8.682506450535563e-05, + "loss": 2.0581, + "step": 3456 + }, + { + "epoch": 0.26022318824215734, + "grad_norm": 5.645318031311035, + "learning_rate": 8.681681705151781e-05, + "loss": 1.797, + "step": 3457 + }, + { + "epoch": 0.2602984625228175, + "grad_norm": 4.6249494552612305, + "learning_rate": 8.680856740902585e-05, + "loss": 1.9708, + "step": 3458 + }, + { + "epoch": 0.2603737368034777, + "grad_norm": 5.455705642700195, + "learning_rate": 8.680031557837018e-05, + "loss": 2.0489, + "step": 3459 + }, + { + "epoch": 0.2604490110841378, + "grad_norm": 5.904580593109131, + "learning_rate": 8.679206156004134e-05, + "loss": 2.3004, + "step": 3460 + }, + { + "epoch": 0.260524285364798, + "grad_norm": 4.126626014709473, + "learning_rate": 8.678380535453e-05, + "loss": 1.8396, + "step": 3461 + }, + { + "epoch": 0.2605995596454581, + "grad_norm": 4.627285957336426, + "learning_rate": 8.677554696232699e-05, + "loss": 2.4359, + "step": 3462 + }, + { + "epoch": 0.2606748339261183, + "grad_norm": 5.121099948883057, + "learning_rate": 8.676728638392321e-05, + "loss": 2.0476, + "step": 3463 + }, + { + "epoch": 0.26075010820677846, + "grad_norm": 5.446684837341309, + "learning_rate": 8.675902361980979e-05, + "loss": 2.1324, + "step": 3464 + }, + { + "epoch": 0.2608253824874386, + "grad_norm": 5.130393981933594, + "learning_rate": 8.675075867047786e-05, + "loss": 1.7636, + "step": 3465 + }, + { + "epoch": 0.26090065676809876, + "grad_norm": 5.909648895263672, + "learning_rate": 8.674249153641878e-05, + "loss": 2.6776, + "step": 3466 + }, + { + "epoch": 0.26097593104875894, + "grad_norm": 3.702174425125122, + "learning_rate": 8.673422221812402e-05, + "loss": 1.9407, + "step": 3467 + }, + { + "epoch": 0.26105120532941906, + "grad_norm": 4.176087379455566, + "learning_rate": 8.672595071608513e-05, + "loss": 1.7643, + "step": 3468 + }, + { + "epoch": 0.26112647961007923, + "grad_norm": 4.569227695465088, + "learning_rate": 8.671767703079387e-05, + "loss": 2.1314, + "step": 3469 + }, + { + "epoch": 0.2612017538907394, + "grad_norm": 4.633419513702393, + "learning_rate": 8.670940116274205e-05, + "loss": 2.094, + "step": 3470 + }, + { + "epoch": 0.26127702817139953, + "grad_norm": 5.385413646697998, + "learning_rate": 8.670112311242166e-05, + "loss": 2.0975, + "step": 3471 + }, + { + "epoch": 0.2613523024520597, + "grad_norm": 4.056864261627197, + "learning_rate": 8.669284288032482e-05, + "loss": 1.8349, + "step": 3472 + }, + { + "epoch": 0.2614275767327198, + "grad_norm": 6.166830539703369, + "learning_rate": 8.668456046694376e-05, + "loss": 2.0174, + "step": 3473 + }, + { + "epoch": 0.26150285101338, + "grad_norm": 4.179583549499512, + "learning_rate": 8.667627587277082e-05, + "loss": 2.1595, + "step": 3474 + }, + { + "epoch": 0.2615781252940402, + "grad_norm": 5.407345771789551, + "learning_rate": 8.666798909829852e-05, + "loss": 1.8715, + "step": 3475 + }, + { + "epoch": 0.2616533995747003, + "grad_norm": 3.9955663681030273, + "learning_rate": 8.66597001440195e-05, + "loss": 1.6038, + "step": 3476 + }, + { + "epoch": 0.26172867385536047, + "grad_norm": 5.360552787780762, + "learning_rate": 8.665140901042647e-05, + "loss": 1.8243, + "step": 3477 + }, + { + "epoch": 0.26180394813602065, + "grad_norm": 6.48111629486084, + "learning_rate": 8.664311569801235e-05, + "loss": 2.1284, + "step": 3478 + }, + { + "epoch": 0.26187922241668077, + "grad_norm": 7.117701053619385, + "learning_rate": 8.663482020727014e-05, + "loss": 2.0903, + "step": 3479 + }, + { + "epoch": 0.26195449669734094, + "grad_norm": 5.6999006271362305, + "learning_rate": 8.662652253869299e-05, + "loss": 1.9183, + "step": 3480 + }, + { + "epoch": 0.26202977097800106, + "grad_norm": 7.994014263153076, + "learning_rate": 8.661822269277416e-05, + "loss": 2.3098, + "step": 3481 + }, + { + "epoch": 0.26210504525866124, + "grad_norm": 10.794896125793457, + "learning_rate": 8.660992067000707e-05, + "loss": 2.0551, + "step": 3482 + }, + { + "epoch": 0.2621803195393214, + "grad_norm": 4.697772979736328, + "learning_rate": 8.660161647088524e-05, + "loss": 1.8397, + "step": 3483 + }, + { + "epoch": 0.26225559381998154, + "grad_norm": 6.9357805252075195, + "learning_rate": 8.659331009590233e-05, + "loss": 2.3032, + "step": 3484 + }, + { + "epoch": 0.2623308681006417, + "grad_norm": 4.746249675750732, + "learning_rate": 8.658500154555214e-05, + "loss": 1.9519, + "step": 3485 + }, + { + "epoch": 0.2624061423813019, + "grad_norm": 7.342319488525391, + "learning_rate": 8.657669082032856e-05, + "loss": 2.031, + "step": 3486 + }, + { + "epoch": 0.262481416661962, + "grad_norm": 4.2688093185424805, + "learning_rate": 8.656837792072567e-05, + "loss": 1.9488, + "step": 3487 + }, + { + "epoch": 0.2625566909426222, + "grad_norm": 5.551198959350586, + "learning_rate": 8.656006284723765e-05, + "loss": 1.8499, + "step": 3488 + }, + { + "epoch": 0.26263196522328236, + "grad_norm": 5.501975059509277, + "learning_rate": 8.655174560035879e-05, + "loss": 1.8846, + "step": 3489 + }, + { + "epoch": 0.2627072395039425, + "grad_norm": 6.153310298919678, + "learning_rate": 8.654342618058354e-05, + "loss": 2.117, + "step": 3490 + }, + { + "epoch": 0.26278251378460266, + "grad_norm": 4.485688209533691, + "learning_rate": 8.653510458840645e-05, + "loss": 2.1777, + "step": 3491 + }, + { + "epoch": 0.2628577880652628, + "grad_norm": 5.17626428604126, + "learning_rate": 8.652678082432224e-05, + "loss": 2.1638, + "step": 3492 + }, + { + "epoch": 0.26293306234592295, + "grad_norm": 5.768205165863037, + "learning_rate": 8.651845488882569e-05, + "loss": 1.7789, + "step": 3493 + }, + { + "epoch": 0.26300833662658313, + "grad_norm": 4.6757988929748535, + "learning_rate": 8.65101267824118e-05, + "loss": 1.8145, + "step": 3494 + }, + { + "epoch": 0.26308361090724325, + "grad_norm": 5.137851715087891, + "learning_rate": 8.650179650557561e-05, + "loss": 2.0095, + "step": 3495 + }, + { + "epoch": 0.2631588851879034, + "grad_norm": 4.219796657562256, + "learning_rate": 8.649346405881237e-05, + "loss": 1.8502, + "step": 3496 + }, + { + "epoch": 0.2632341594685636, + "grad_norm": 5.7525224685668945, + "learning_rate": 8.648512944261741e-05, + "loss": 1.9097, + "step": 3497 + }, + { + "epoch": 0.2633094337492237, + "grad_norm": 4.988112926483154, + "learning_rate": 8.647679265748619e-05, + "loss": 1.9095, + "step": 3498 + }, + { + "epoch": 0.2633847080298839, + "grad_norm": 4.445570468902588, + "learning_rate": 8.64684537039143e-05, + "loss": 1.6983, + "step": 3499 + }, + { + "epoch": 0.26345998231054407, + "grad_norm": 7.690849781036377, + "learning_rate": 8.646011258239747e-05, + "loss": 2.5875, + "step": 3500 + }, + { + "epoch": 0.2635352565912042, + "grad_norm": 5.7418212890625, + "learning_rate": 8.645176929343159e-05, + "loss": 2.0387, + "step": 3501 + }, + { + "epoch": 0.26361053087186437, + "grad_norm": 3.50325083732605, + "learning_rate": 8.64434238375126e-05, + "loss": 1.7684, + "step": 3502 + }, + { + "epoch": 0.2636858051525245, + "grad_norm": 6.436196327209473, + "learning_rate": 8.643507621513663e-05, + "loss": 2.109, + "step": 3503 + }, + { + "epoch": 0.26376107943318466, + "grad_norm": 5.122573375701904, + "learning_rate": 8.642672642679991e-05, + "loss": 1.9419, + "step": 3504 + }, + { + "epoch": 0.26383635371384484, + "grad_norm": 4.171894073486328, + "learning_rate": 8.641837447299886e-05, + "loss": 1.8107, + "step": 3505 + }, + { + "epoch": 0.26391162799450496, + "grad_norm": 5.3728814125061035, + "learning_rate": 8.641002035422992e-05, + "loss": 2.0222, + "step": 3506 + }, + { + "epoch": 0.26398690227516514, + "grad_norm": 4.684506893157959, + "learning_rate": 8.640166407098974e-05, + "loss": 2.159, + "step": 3507 + }, + { + "epoch": 0.2640621765558253, + "grad_norm": 4.5756516456604, + "learning_rate": 8.639330562377508e-05, + "loss": 2.1024, + "step": 3508 + }, + { + "epoch": 0.26413745083648543, + "grad_norm": 4.232524871826172, + "learning_rate": 8.638494501308283e-05, + "loss": 2.0254, + "step": 3509 + }, + { + "epoch": 0.2642127251171456, + "grad_norm": 4.19370698928833, + "learning_rate": 8.637658223940999e-05, + "loss": 1.9617, + "step": 3510 + }, + { + "epoch": 0.26428799939780573, + "grad_norm": 6.348365783691406, + "learning_rate": 8.636821730325373e-05, + "loss": 1.9834, + "step": 3511 + }, + { + "epoch": 0.2643632736784659, + "grad_norm": 4.720755100250244, + "learning_rate": 8.635985020511129e-05, + "loss": 1.8069, + "step": 3512 + }, + { + "epoch": 0.2644385479591261, + "grad_norm": 5.0552520751953125, + "learning_rate": 8.635148094548008e-05, + "loss": 2.5948, + "step": 3513 + }, + { + "epoch": 0.2645138222397862, + "grad_norm": 5.435145378112793, + "learning_rate": 8.634310952485764e-05, + "loss": 2.1941, + "step": 3514 + }, + { + "epoch": 0.2645890965204464, + "grad_norm": 5.53412389755249, + "learning_rate": 8.633473594374162e-05, + "loss": 1.9143, + "step": 3515 + }, + { + "epoch": 0.26466437080110655, + "grad_norm": 4.5146636962890625, + "learning_rate": 8.632636020262981e-05, + "loss": 1.8848, + "step": 3516 + }, + { + "epoch": 0.2647396450817667, + "grad_norm": 4.362879276275635, + "learning_rate": 8.63179823020201e-05, + "loss": 2.0601, + "step": 3517 + }, + { + "epoch": 0.26481491936242685, + "grad_norm": 4.718861103057861, + "learning_rate": 8.630960224241055e-05, + "loss": 1.7878, + "step": 3518 + }, + { + "epoch": 0.264890193643087, + "grad_norm": 4.063456058502197, + "learning_rate": 8.630122002429934e-05, + "loss": 1.8209, + "step": 3519 + }, + { + "epoch": 0.26496546792374714, + "grad_norm": 6.880354881286621, + "learning_rate": 8.629283564818478e-05, + "loss": 2.3385, + "step": 3520 + }, + { + "epoch": 0.2650407422044073, + "grad_norm": 6.569374084472656, + "learning_rate": 8.628444911456524e-05, + "loss": 2.0725, + "step": 3521 + }, + { + "epoch": 0.26511601648506744, + "grad_norm": 8.423012733459473, + "learning_rate": 8.627606042393934e-05, + "loss": 2.0777, + "step": 3522 + }, + { + "epoch": 0.2651912907657276, + "grad_norm": 6.598509788513184, + "learning_rate": 8.626766957680572e-05, + "loss": 2.1543, + "step": 3523 + }, + { + "epoch": 0.2652665650463878, + "grad_norm": 5.725747108459473, + "learning_rate": 8.625927657366321e-05, + "loss": 2.163, + "step": 3524 + }, + { + "epoch": 0.2653418393270479, + "grad_norm": 4.334387302398682, + "learning_rate": 8.625088141501074e-05, + "loss": 1.8277, + "step": 3525 + }, + { + "epoch": 0.2654171136077081, + "grad_norm": 5.765000343322754, + "learning_rate": 8.624248410134739e-05, + "loss": 1.7981, + "step": 3526 + }, + { + "epoch": 0.26549238788836826, + "grad_norm": 6.698069095611572, + "learning_rate": 8.623408463317237e-05, + "loss": 1.8343, + "step": 3527 + }, + { + "epoch": 0.2655676621690284, + "grad_norm": 7.3356242179870605, + "learning_rate": 8.622568301098496e-05, + "loss": 1.8923, + "step": 3528 + }, + { + "epoch": 0.26564293644968856, + "grad_norm": 4.493832588195801, + "learning_rate": 8.621727923528465e-05, + "loss": 2.1371, + "step": 3529 + }, + { + "epoch": 0.2657182107303487, + "grad_norm": 5.8925065994262695, + "learning_rate": 8.620887330657102e-05, + "loss": 2.317, + "step": 3530 + }, + { + "epoch": 0.26579348501100886, + "grad_norm": 4.173810958862305, + "learning_rate": 8.620046522534376e-05, + "loss": 1.8889, + "step": 3531 + }, + { + "epoch": 0.26586875929166903, + "grad_norm": 5.806678295135498, + "learning_rate": 8.619205499210269e-05, + "loss": 1.9653, + "step": 3532 + }, + { + "epoch": 0.26594403357232915, + "grad_norm": 4.102508544921875, + "learning_rate": 8.618364260734781e-05, + "loss": 1.8887, + "step": 3533 + }, + { + "epoch": 0.26601930785298933, + "grad_norm": 4.817507266998291, + "learning_rate": 8.617522807157921e-05, + "loss": 1.9382, + "step": 3534 + }, + { + "epoch": 0.2660945821336495, + "grad_norm": 3.7136971950531006, + "learning_rate": 8.61668113852971e-05, + "loss": 1.6289, + "step": 3535 + }, + { + "epoch": 0.2661698564143096, + "grad_norm": 4.387873649597168, + "learning_rate": 8.615839254900182e-05, + "loss": 2.168, + "step": 3536 + }, + { + "epoch": 0.2662451306949698, + "grad_norm": 4.71859073638916, + "learning_rate": 8.614997156319385e-05, + "loss": 2.0897, + "step": 3537 + }, + { + "epoch": 0.26632040497563, + "grad_norm": 4.652045249938965, + "learning_rate": 8.61415484283738e-05, + "loss": 2.1505, + "step": 3538 + }, + { + "epoch": 0.2663956792562901, + "grad_norm": 6.98809814453125, + "learning_rate": 8.613312314504239e-05, + "loss": 1.9022, + "step": 3539 + }, + { + "epoch": 0.2664709535369503, + "grad_norm": 4.751323699951172, + "learning_rate": 8.61246957137005e-05, + "loss": 1.8344, + "step": 3540 + }, + { + "epoch": 0.2665462278176104, + "grad_norm": 5.730648517608643, + "learning_rate": 8.61162661348491e-05, + "loss": 1.9122, + "step": 3541 + }, + { + "epoch": 0.26662150209827057, + "grad_norm": 4.286025047302246, + "learning_rate": 8.610783440898931e-05, + "loss": 1.9165, + "step": 3542 + }, + { + "epoch": 0.26669677637893074, + "grad_norm": 5.575912952423096, + "learning_rate": 8.609940053662236e-05, + "loss": 1.8833, + "step": 3543 + }, + { + "epoch": 0.26677205065959086, + "grad_norm": 4.64895486831665, + "learning_rate": 8.609096451824962e-05, + "loss": 2.1597, + "step": 3544 + }, + { + "epoch": 0.26684732494025104, + "grad_norm": 6.224883079528809, + "learning_rate": 8.608252635437261e-05, + "loss": 1.7426, + "step": 3545 + }, + { + "epoch": 0.2669225992209112, + "grad_norm": 5.420910835266113, + "learning_rate": 8.607408604549295e-05, + "loss": 1.9942, + "step": 3546 + }, + { + "epoch": 0.26699787350157134, + "grad_norm": 3.900799036026001, + "learning_rate": 8.606564359211238e-05, + "loss": 2.0286, + "step": 3547 + }, + { + "epoch": 0.2670731477822315, + "grad_norm": 5.952523708343506, + "learning_rate": 8.605719899473277e-05, + "loss": 2.4114, + "step": 3548 + }, + { + "epoch": 0.2671484220628917, + "grad_norm": 6.622057914733887, + "learning_rate": 8.604875225385613e-05, + "loss": 1.6414, + "step": 3549 + }, + { + "epoch": 0.2672236963435518, + "grad_norm": 4.9649434089660645, + "learning_rate": 8.604030336998463e-05, + "loss": 2.3073, + "step": 3550 + }, + { + "epoch": 0.267298970624212, + "grad_norm": 5.571475505828857, + "learning_rate": 8.60318523436205e-05, + "loss": 2.2395, + "step": 3551 + }, + { + "epoch": 0.2673742449048721, + "grad_norm": 5.297370910644531, + "learning_rate": 8.602339917526612e-05, + "loss": 2.1662, + "step": 3552 + }, + { + "epoch": 0.2674495191855323, + "grad_norm": 4.768833160400391, + "learning_rate": 8.601494386542403e-05, + "loss": 1.8964, + "step": 3553 + }, + { + "epoch": 0.26752479346619246, + "grad_norm": 6.848176002502441, + "learning_rate": 8.600648641459688e-05, + "loss": 2.2476, + "step": 3554 + }, + { + "epoch": 0.2676000677468526, + "grad_norm": 4.6917009353637695, + "learning_rate": 8.599802682328742e-05, + "loss": 1.8052, + "step": 3555 + }, + { + "epoch": 0.26767534202751275, + "grad_norm": 4.27440881729126, + "learning_rate": 8.598956509199854e-05, + "loss": 1.8941, + "step": 3556 + }, + { + "epoch": 0.26775061630817293, + "grad_norm": 5.244617938995361, + "learning_rate": 8.598110122123329e-05, + "loss": 2.0806, + "step": 3557 + }, + { + "epoch": 0.26782589058883305, + "grad_norm": 5.404412746429443, + "learning_rate": 8.597263521149481e-05, + "loss": 1.6779, + "step": 3558 + }, + { + "epoch": 0.2679011648694932, + "grad_norm": 8.262093544006348, + "learning_rate": 8.596416706328639e-05, + "loss": 2.2713, + "step": 3559 + }, + { + "epoch": 0.26797643915015334, + "grad_norm": 5.4215474128723145, + "learning_rate": 8.595569677711143e-05, + "loss": 2.4786, + "step": 3560 + }, + { + "epoch": 0.2680517134308135, + "grad_norm": 3.528212070465088, + "learning_rate": 8.594722435347347e-05, + "loss": 2.0307, + "step": 3561 + }, + { + "epoch": 0.2681269877114737, + "grad_norm": 6.334159851074219, + "learning_rate": 8.593874979287615e-05, + "loss": 2.1337, + "step": 3562 + }, + { + "epoch": 0.2682022619921338, + "grad_norm": 5.581978797912598, + "learning_rate": 8.593027309582328e-05, + "loss": 1.9931, + "step": 3563 + }, + { + "epoch": 0.268277536272794, + "grad_norm": 5.736482620239258, + "learning_rate": 8.592179426281878e-05, + "loss": 2.3326, + "step": 3564 + }, + { + "epoch": 0.26835281055345417, + "grad_norm": 5.4885711669921875, + "learning_rate": 8.591331329436668e-05, + "loss": 2.4756, + "step": 3565 + }, + { + "epoch": 0.2684280848341143, + "grad_norm": 5.926138877868652, + "learning_rate": 8.590483019097114e-05, + "loss": 2.4839, + "step": 3566 + }, + { + "epoch": 0.26850335911477446, + "grad_norm": 5.43041467666626, + "learning_rate": 8.589634495313648e-05, + "loss": 2.0797, + "step": 3567 + }, + { + "epoch": 0.26857863339543464, + "grad_norm": 5.4391045570373535, + "learning_rate": 8.58878575813671e-05, + "loss": 2.0321, + "step": 3568 + }, + { + "epoch": 0.26865390767609476, + "grad_norm": 5.919775485992432, + "learning_rate": 8.587936807616756e-05, + "loss": 1.8071, + "step": 3569 + }, + { + "epoch": 0.26872918195675494, + "grad_norm": 6.478799343109131, + "learning_rate": 8.587087643804256e-05, + "loss": 2.2316, + "step": 3570 + }, + { + "epoch": 0.26880445623741506, + "grad_norm": 4.596625328063965, + "learning_rate": 8.586238266749686e-05, + "loss": 1.6863, + "step": 3571 + }, + { + "epoch": 0.26887973051807523, + "grad_norm": 5.807338714599609, + "learning_rate": 8.585388676503543e-05, + "loss": 2.1501, + "step": 3572 + }, + { + "epoch": 0.2689550047987354, + "grad_norm": 5.086215496063232, + "learning_rate": 8.584538873116328e-05, + "loss": 2.4629, + "step": 3573 + }, + { + "epoch": 0.26903027907939553, + "grad_norm": 4.623048305511475, + "learning_rate": 8.583688856638563e-05, + "loss": 1.882, + "step": 3574 + }, + { + "epoch": 0.2691055533600557, + "grad_norm": 4.823899745941162, + "learning_rate": 8.58283862712078e-05, + "loss": 1.8047, + "step": 3575 + }, + { + "epoch": 0.2691808276407159, + "grad_norm": 5.859494686126709, + "learning_rate": 8.58198818461352e-05, + "loss": 2.1771, + "step": 3576 + }, + { + "epoch": 0.269256101921376, + "grad_norm": 4.5490593910217285, + "learning_rate": 8.58113752916734e-05, + "loss": 2.1381, + "step": 3577 + }, + { + "epoch": 0.2693313762020362, + "grad_norm": 3.6705760955810547, + "learning_rate": 8.58028666083281e-05, + "loss": 1.9529, + "step": 3578 + }, + { + "epoch": 0.2694066504826963, + "grad_norm": 4.589728832244873, + "learning_rate": 8.57943557966051e-05, + "loss": 1.8813, + "step": 3579 + }, + { + "epoch": 0.2694819247633565, + "grad_norm": 4.147540092468262, + "learning_rate": 8.578584285701036e-05, + "loss": 2.336, + "step": 3580 + }, + { + "epoch": 0.26955719904401665, + "grad_norm": 4.7769904136657715, + "learning_rate": 8.577732779004995e-05, + "loss": 1.986, + "step": 3581 + }, + { + "epoch": 0.26963247332467677, + "grad_norm": 4.962817192077637, + "learning_rate": 8.576881059623003e-05, + "loss": 2.3573, + "step": 3582 + }, + { + "epoch": 0.26970774760533694, + "grad_norm": 11.536269187927246, + "learning_rate": 8.576029127605698e-05, + "loss": 1.6711, + "step": 3583 + }, + { + "epoch": 0.2697830218859971, + "grad_norm": 5.132732391357422, + "learning_rate": 8.575176983003721e-05, + "loss": 1.8952, + "step": 3584 + }, + { + "epoch": 0.26985829616665724, + "grad_norm": 3.878742218017578, + "learning_rate": 8.574324625867732e-05, + "loss": 1.7662, + "step": 3585 + }, + { + "epoch": 0.2699335704473174, + "grad_norm": 5.135368347167969, + "learning_rate": 8.573472056248399e-05, + "loss": 1.9926, + "step": 3586 + }, + { + "epoch": 0.2700088447279776, + "grad_norm": 4.444483280181885, + "learning_rate": 8.572619274196406e-05, + "loss": 2.1353, + "step": 3587 + }, + { + "epoch": 0.2700841190086377, + "grad_norm": 5.549820899963379, + "learning_rate": 8.571766279762448e-05, + "loss": 2.4059, + "step": 3588 + }, + { + "epoch": 0.2701593932892979, + "grad_norm": 5.201048374176025, + "learning_rate": 8.570913072997233e-05, + "loss": 1.7403, + "step": 3589 + }, + { + "epoch": 0.270234667569958, + "grad_norm": 6.496277332305908, + "learning_rate": 8.570059653951482e-05, + "loss": 2.1463, + "step": 3590 + }, + { + "epoch": 0.2703099418506182, + "grad_norm": 5.668715953826904, + "learning_rate": 8.569206022675927e-05, + "loss": 2.2219, + "step": 3591 + }, + { + "epoch": 0.27038521613127836, + "grad_norm": 7.486010551452637, + "learning_rate": 8.568352179221317e-05, + "loss": 2.1663, + "step": 3592 + }, + { + "epoch": 0.2704604904119385, + "grad_norm": 4.589876174926758, + "learning_rate": 8.567498123638408e-05, + "loss": 1.5951, + "step": 3593 + }, + { + "epoch": 0.27053576469259866, + "grad_norm": 8.431713104248047, + "learning_rate": 8.566643855977972e-05, + "loss": 2.0533, + "step": 3594 + }, + { + "epoch": 0.27061103897325883, + "grad_norm": 8.653324127197266, + "learning_rate": 8.565789376290793e-05, + "loss": 1.8559, + "step": 3595 + }, + { + "epoch": 0.27068631325391895, + "grad_norm": 5.465704917907715, + "learning_rate": 8.564934684627665e-05, + "loss": 2.0959, + "step": 3596 + }, + { + "epoch": 0.27076158753457913, + "grad_norm": 5.4900078773498535, + "learning_rate": 8.564079781039402e-05, + "loss": 2.0015, + "step": 3597 + }, + { + "epoch": 0.2708368618152393, + "grad_norm": 5.449493885040283, + "learning_rate": 8.56322466557682e-05, + "loss": 2.0003, + "step": 3598 + }, + { + "epoch": 0.2709121360958994, + "grad_norm": 4.321719169616699, + "learning_rate": 8.562369338290757e-05, + "loss": 1.875, + "step": 3599 + }, + { + "epoch": 0.2709874103765596, + "grad_norm": 4.791361331939697, + "learning_rate": 8.561513799232057e-05, + "loss": 2.0276, + "step": 3600 + }, + { + "epoch": 0.2710626846572197, + "grad_norm": 3.7765538692474365, + "learning_rate": 8.560658048451584e-05, + "loss": 1.9769, + "step": 3601 + }, + { + "epoch": 0.2711379589378799, + "grad_norm": 7.015016555786133, + "learning_rate": 8.559802086000204e-05, + "loss": 1.8367, + "step": 3602 + }, + { + "epoch": 0.2712132332185401, + "grad_norm": 5.300413131713867, + "learning_rate": 8.558945911928804e-05, + "loss": 1.9711, + "step": 3603 + }, + { + "epoch": 0.2712885074992002, + "grad_norm": 6.6701250076293945, + "learning_rate": 8.558089526288282e-05, + "loss": 1.8491, + "step": 3604 + }, + { + "epoch": 0.27136378177986037, + "grad_norm": 7.902855396270752, + "learning_rate": 8.557232929129547e-05, + "loss": 2.4689, + "step": 3605 + }, + { + "epoch": 0.27143905606052054, + "grad_norm": 5.063051223754883, + "learning_rate": 8.556376120503522e-05, + "loss": 1.8943, + "step": 3606 + }, + { + "epoch": 0.27151433034118067, + "grad_norm": 5.839184761047363, + "learning_rate": 8.55551910046114e-05, + "loss": 1.688, + "step": 3607 + }, + { + "epoch": 0.27158960462184084, + "grad_norm": 3.8771448135375977, + "learning_rate": 8.554661869053348e-05, + "loss": 1.7804, + "step": 3608 + }, + { + "epoch": 0.27166487890250096, + "grad_norm": 4.803805351257324, + "learning_rate": 8.55380442633111e-05, + "loss": 1.9435, + "step": 3609 + }, + { + "epoch": 0.27174015318316114, + "grad_norm": 5.550612449645996, + "learning_rate": 8.552946772345393e-05, + "loss": 1.9994, + "step": 3610 + }, + { + "epoch": 0.2718154274638213, + "grad_norm": 6.348362445831299, + "learning_rate": 8.552088907147189e-05, + "loss": 2.2774, + "step": 3611 + }, + { + "epoch": 0.27189070174448143, + "grad_norm": 5.97193717956543, + "learning_rate": 8.55123083078749e-05, + "loss": 2.0419, + "step": 3612 + }, + { + "epoch": 0.2719659760251416, + "grad_norm": 4.740328788757324, + "learning_rate": 8.550372543317307e-05, + "loss": 2.2142, + "step": 3613 + }, + { + "epoch": 0.2720412503058018, + "grad_norm": 5.986849308013916, + "learning_rate": 8.549514044787663e-05, + "loss": 1.9058, + "step": 3614 + }, + { + "epoch": 0.2721165245864619, + "grad_norm": 5.747989654541016, + "learning_rate": 8.548655335249593e-05, + "loss": 1.837, + "step": 3615 + }, + { + "epoch": 0.2721917988671221, + "grad_norm": 8.26144790649414, + "learning_rate": 8.547796414754148e-05, + "loss": 2.1932, + "step": 3616 + }, + { + "epoch": 0.27226707314778226, + "grad_norm": 4.700840473175049, + "learning_rate": 8.546937283352384e-05, + "loss": 2.1159, + "step": 3617 + }, + { + "epoch": 0.2723423474284424, + "grad_norm": 5.285219192504883, + "learning_rate": 8.546077941095376e-05, + "loss": 1.9688, + "step": 3618 + }, + { + "epoch": 0.27241762170910255, + "grad_norm": 5.66541862487793, + "learning_rate": 8.545218388034211e-05, + "loss": 1.861, + "step": 3619 + }, + { + "epoch": 0.2724928959897627, + "grad_norm": 4.491893768310547, + "learning_rate": 8.544358624219983e-05, + "loss": 1.9747, + "step": 3620 + }, + { + "epoch": 0.27256817027042285, + "grad_norm": 4.784849643707275, + "learning_rate": 8.543498649703807e-05, + "loss": 2.1304, + "step": 3621 + }, + { + "epoch": 0.272643444551083, + "grad_norm": 3.998591423034668, + "learning_rate": 8.542638464536801e-05, + "loss": 2.1174, + "step": 3622 + }, + { + "epoch": 0.27271871883174315, + "grad_norm": 5.706465244293213, + "learning_rate": 8.541778068770105e-05, + "loss": 2.2976, + "step": 3623 + }, + { + "epoch": 0.2727939931124033, + "grad_norm": 4.450465679168701, + "learning_rate": 8.540917462454865e-05, + "loss": 2.0152, + "step": 3624 + }, + { + "epoch": 0.2728692673930635, + "grad_norm": 5.024024486541748, + "learning_rate": 8.540056645642242e-05, + "loss": 1.8615, + "step": 3625 + }, + { + "epoch": 0.2729445416737236, + "grad_norm": 4.129462718963623, + "learning_rate": 8.539195618383409e-05, + "loss": 1.8735, + "step": 3626 + }, + { + "epoch": 0.2730198159543838, + "grad_norm": 3.9880435466766357, + "learning_rate": 8.538334380729552e-05, + "loss": 1.9508, + "step": 3627 + }, + { + "epoch": 0.27309509023504397, + "grad_norm": 5.950443744659424, + "learning_rate": 8.537472932731867e-05, + "loss": 2.0709, + "step": 3628 + }, + { + "epoch": 0.2731703645157041, + "grad_norm": 6.595627784729004, + "learning_rate": 8.536611274441567e-05, + "loss": 2.0653, + "step": 3629 + }, + { + "epoch": 0.27324563879636427, + "grad_norm": 5.733837604522705, + "learning_rate": 8.535749405909876e-05, + "loss": 2.5805, + "step": 3630 + }, + { + "epoch": 0.2733209130770244, + "grad_norm": 5.418771743774414, + "learning_rate": 8.534887327188026e-05, + "loss": 1.9418, + "step": 3631 + }, + { + "epoch": 0.27339618735768456, + "grad_norm": 7.218081474304199, + "learning_rate": 8.534025038327267e-05, + "loss": 1.7093, + "step": 3632 + }, + { + "epoch": 0.27347146163834474, + "grad_norm": 7.917801856994629, + "learning_rate": 8.533162539378861e-05, + "loss": 2.2601, + "step": 3633 + }, + { + "epoch": 0.27354673591900486, + "grad_norm": 5.845649719238281, + "learning_rate": 8.53229983039408e-05, + "loss": 2.3418, + "step": 3634 + }, + { + "epoch": 0.27362201019966503, + "grad_norm": 5.247650146484375, + "learning_rate": 8.531436911424209e-05, + "loss": 2.209, + "step": 3635 + }, + { + "epoch": 0.2736972844803252, + "grad_norm": 5.857186317443848, + "learning_rate": 8.530573782520546e-05, + "loss": 1.9542, + "step": 3636 + }, + { + "epoch": 0.27377255876098533, + "grad_norm": 4.87792444229126, + "learning_rate": 8.529710443734402e-05, + "loss": 1.935, + "step": 3637 + }, + { + "epoch": 0.2738478330416455, + "grad_norm": 5.064113140106201, + "learning_rate": 8.5288468951171e-05, + "loss": 2.2011, + "step": 3638 + }, + { + "epoch": 0.2739231073223056, + "grad_norm": 8.027982711791992, + "learning_rate": 8.527983136719977e-05, + "loss": 2.1941, + "step": 3639 + }, + { + "epoch": 0.2739983816029658, + "grad_norm": 4.49970817565918, + "learning_rate": 8.527119168594377e-05, + "loss": 1.8789, + "step": 3640 + }, + { + "epoch": 0.274073655883626, + "grad_norm": 6.224706172943115, + "learning_rate": 8.526254990791665e-05, + "loss": 2.509, + "step": 3641 + }, + { + "epoch": 0.2741489301642861, + "grad_norm": 5.270305156707764, + "learning_rate": 8.52539060336321e-05, + "loss": 2.0799, + "step": 3642 + }, + { + "epoch": 0.2742242044449463, + "grad_norm": 4.327517032623291, + "learning_rate": 8.524526006360401e-05, + "loss": 1.9182, + "step": 3643 + }, + { + "epoch": 0.27429947872560645, + "grad_norm": 3.7807674407958984, + "learning_rate": 8.523661199834631e-05, + "loss": 2.1888, + "step": 3644 + }, + { + "epoch": 0.27437475300626657, + "grad_norm": 5.376547336578369, + "learning_rate": 8.522796183837316e-05, + "loss": 2.0272, + "step": 3645 + }, + { + "epoch": 0.27445002728692675, + "grad_norm": 5.469082832336426, + "learning_rate": 8.521930958419874e-05, + "loss": 1.9191, + "step": 3646 + }, + { + "epoch": 0.2745253015675869, + "grad_norm": 5.068480014801025, + "learning_rate": 8.521065523633744e-05, + "loss": 1.8565, + "step": 3647 + }, + { + "epoch": 0.27460057584824704, + "grad_norm": 4.083733558654785, + "learning_rate": 8.52019987953037e-05, + "loss": 1.9283, + "step": 3648 + }, + { + "epoch": 0.2746758501289072, + "grad_norm": 5.149866580963135, + "learning_rate": 8.519334026161215e-05, + "loss": 1.9036, + "step": 3649 + }, + { + "epoch": 0.27475112440956734, + "grad_norm": 5.188506603240967, + "learning_rate": 8.51846796357775e-05, + "loss": 2.4333, + "step": 3650 + }, + { + "epoch": 0.2748263986902275, + "grad_norm": 4.111723899841309, + "learning_rate": 8.517601691831461e-05, + "loss": 2.0286, + "step": 3651 + }, + { + "epoch": 0.2749016729708877, + "grad_norm": 7.372827053070068, + "learning_rate": 8.516735210973841e-05, + "loss": 2.3751, + "step": 3652 + }, + { + "epoch": 0.2749769472515478, + "grad_norm": 3.6831037998199463, + "learning_rate": 8.515868521056407e-05, + "loss": 1.8743, + "step": 3653 + }, + { + "epoch": 0.275052221532208, + "grad_norm": 4.185559272766113, + "learning_rate": 8.515001622130676e-05, + "loss": 2.0677, + "step": 3654 + }, + { + "epoch": 0.27512749581286816, + "grad_norm": 5.531414985656738, + "learning_rate": 8.514134514248185e-05, + "loss": 1.8293, + "step": 3655 + }, + { + "epoch": 0.2752027700935283, + "grad_norm": 4.202442646026611, + "learning_rate": 8.51326719746048e-05, + "loss": 2.1917, + "step": 3656 + }, + { + "epoch": 0.27527804437418846, + "grad_norm": 6.174137592315674, + "learning_rate": 8.51239967181912e-05, + "loss": 2.0446, + "step": 3657 + }, + { + "epoch": 0.2753533186548486, + "grad_norm": 4.4919891357421875, + "learning_rate": 8.511531937375678e-05, + "loss": 2.0301, + "step": 3658 + }, + { + "epoch": 0.27542859293550875, + "grad_norm": 4.308017730712891, + "learning_rate": 8.510663994181739e-05, + "loss": 2.0194, + "step": 3659 + }, + { + "epoch": 0.27550386721616893, + "grad_norm": 6.911656379699707, + "learning_rate": 8.509795842288897e-05, + "loss": 2.3379, + "step": 3660 + }, + { + "epoch": 0.27557914149682905, + "grad_norm": 6.4530181884765625, + "learning_rate": 8.508927481748765e-05, + "loss": 2.1096, + "step": 3661 + }, + { + "epoch": 0.2756544157774892, + "grad_norm": 5.648556709289551, + "learning_rate": 8.508058912612961e-05, + "loss": 1.9491, + "step": 3662 + }, + { + "epoch": 0.2757296900581494, + "grad_norm": 6.496396541595459, + "learning_rate": 8.507190134933122e-05, + "loss": 2.0145, + "step": 3663 + }, + { + "epoch": 0.2758049643388095, + "grad_norm": 9.162127494812012, + "learning_rate": 8.506321148760891e-05, + "loss": 2.7957, + "step": 3664 + }, + { + "epoch": 0.2758802386194697, + "grad_norm": 4.8795342445373535, + "learning_rate": 8.50545195414793e-05, + "loss": 1.9935, + "step": 3665 + }, + { + "epoch": 0.2759555129001299, + "grad_norm": 4.557014465332031, + "learning_rate": 8.504582551145907e-05, + "loss": 1.7547, + "step": 3666 + }, + { + "epoch": 0.27603078718079, + "grad_norm": 6.251771450042725, + "learning_rate": 8.503712939806509e-05, + "loss": 2.089, + "step": 3667 + }, + { + "epoch": 0.27610606146145017, + "grad_norm": 4.0282487869262695, + "learning_rate": 8.50284312018143e-05, + "loss": 1.9248, + "step": 3668 + }, + { + "epoch": 0.2761813357421103, + "grad_norm": 6.955066204071045, + "learning_rate": 8.501973092322377e-05, + "loss": 1.9377, + "step": 3669 + }, + { + "epoch": 0.27625661002277047, + "grad_norm": 4.348524570465088, + "learning_rate": 8.501102856281072e-05, + "loss": 1.9906, + "step": 3670 + }, + { + "epoch": 0.27633188430343064, + "grad_norm": 3.873631000518799, + "learning_rate": 8.500232412109248e-05, + "loss": 2.0831, + "step": 3671 + }, + { + "epoch": 0.27640715858409076, + "grad_norm": 3.7730767726898193, + "learning_rate": 8.499361759858651e-05, + "loss": 1.9074, + "step": 3672 + }, + { + "epoch": 0.27648243286475094, + "grad_norm": 4.187933444976807, + "learning_rate": 8.498490899581037e-05, + "loss": 2.0774, + "step": 3673 + }, + { + "epoch": 0.2765577071454111, + "grad_norm": 4.137633800506592, + "learning_rate": 8.497619831328178e-05, + "loss": 1.8843, + "step": 3674 + }, + { + "epoch": 0.27663298142607123, + "grad_norm": 4.125125885009766, + "learning_rate": 8.496748555151855e-05, + "loss": 1.9533, + "step": 3675 + }, + { + "epoch": 0.2767082557067314, + "grad_norm": 3.998176097869873, + "learning_rate": 8.495877071103864e-05, + "loss": 2.4957, + "step": 3676 + }, + { + "epoch": 0.2767835299873916, + "grad_norm": 5.1686177253723145, + "learning_rate": 8.495005379236012e-05, + "loss": 2.0331, + "step": 3677 + }, + { + "epoch": 0.2768588042680517, + "grad_norm": 4.57703161239624, + "learning_rate": 8.494133479600121e-05, + "loss": 1.9479, + "step": 3678 + }, + { + "epoch": 0.2769340785487119, + "grad_norm": 5.521742343902588, + "learning_rate": 8.493261372248018e-05, + "loss": 2.0188, + "step": 3679 + }, + { + "epoch": 0.277009352829372, + "grad_norm": 3.9131381511688232, + "learning_rate": 8.492389057231549e-05, + "loss": 2.0012, + "step": 3680 + }, + { + "epoch": 0.2770846271100322, + "grad_norm": 6.39603853225708, + "learning_rate": 8.491516534602573e-05, + "loss": 1.5257, + "step": 3681 + }, + { + "epoch": 0.27715990139069235, + "grad_norm": 4.526966094970703, + "learning_rate": 8.490643804412956e-05, + "loss": 2.0316, + "step": 3682 + }, + { + "epoch": 0.2772351756713525, + "grad_norm": 4.71266508102417, + "learning_rate": 8.489770866714582e-05, + "loss": 2.1473, + "step": 3683 + }, + { + "epoch": 0.27731044995201265, + "grad_norm": 7.189401626586914, + "learning_rate": 8.488897721559343e-05, + "loss": 2.6405, + "step": 3684 + }, + { + "epoch": 0.2773857242326728, + "grad_norm": 4.840124130249023, + "learning_rate": 8.488024368999144e-05, + "loss": 1.7833, + "step": 3685 + }, + { + "epoch": 0.27746099851333295, + "grad_norm": 3.999760627746582, + "learning_rate": 8.487150809085907e-05, + "loss": 1.8218, + "step": 3686 + }, + { + "epoch": 0.2775362727939931, + "grad_norm": 4.553103923797607, + "learning_rate": 8.486277041871557e-05, + "loss": 1.9166, + "step": 3687 + }, + { + "epoch": 0.27761154707465324, + "grad_norm": 5.243570804595947, + "learning_rate": 8.485403067408042e-05, + "loss": 1.7033, + "step": 3688 + }, + { + "epoch": 0.2776868213553134, + "grad_norm": 5.408691883087158, + "learning_rate": 8.484528885747316e-05, + "loss": 2.3282, + "step": 3689 + }, + { + "epoch": 0.2777620956359736, + "grad_norm": 5.625600337982178, + "learning_rate": 8.483654496941344e-05, + "loss": 2.1593, + "step": 3690 + }, + { + "epoch": 0.2778373699166337, + "grad_norm": 5.035191059112549, + "learning_rate": 8.482779901042108e-05, + "loss": 1.9758, + "step": 3691 + }, + { + "epoch": 0.2779126441972939, + "grad_norm": 4.293606281280518, + "learning_rate": 8.481905098101601e-05, + "loss": 2.1281, + "step": 3692 + }, + { + "epoch": 0.27798791847795407, + "grad_norm": 5.93096399307251, + "learning_rate": 8.481030088171827e-05, + "loss": 2.0534, + "step": 3693 + }, + { + "epoch": 0.2780631927586142, + "grad_norm": 4.504866123199463, + "learning_rate": 8.480154871304802e-05, + "loss": 2.3662, + "step": 3694 + }, + { + "epoch": 0.27813846703927436, + "grad_norm": 3.9637231826782227, + "learning_rate": 8.479279447552557e-05, + "loss": 2.112, + "step": 3695 + }, + { + "epoch": 0.27821374131993454, + "grad_norm": 5.485760688781738, + "learning_rate": 8.47840381696713e-05, + "loss": 1.9176, + "step": 3696 + }, + { + "epoch": 0.27828901560059466, + "grad_norm": 3.9289305210113525, + "learning_rate": 8.477527979600577e-05, + "loss": 1.9123, + "step": 3697 + }, + { + "epoch": 0.27836428988125483, + "grad_norm": 6.370930194854736, + "learning_rate": 8.476651935504964e-05, + "loss": 2.0594, + "step": 3698 + }, + { + "epoch": 0.27843956416191495, + "grad_norm": 4.711078643798828, + "learning_rate": 8.475775684732369e-05, + "loss": 1.8304, + "step": 3699 + }, + { + "epoch": 0.27851483844257513, + "grad_norm": 10.954310417175293, + "learning_rate": 8.474899227334884e-05, + "loss": 1.8188, + "step": 3700 + }, + { + "epoch": 0.2785901127232353, + "grad_norm": 4.218547344207764, + "learning_rate": 8.47402256336461e-05, + "loss": 1.8373, + "step": 3701 + }, + { + "epoch": 0.2786653870038954, + "grad_norm": 4.795693397521973, + "learning_rate": 8.473145692873661e-05, + "loss": 1.9102, + "step": 3702 + }, + { + "epoch": 0.2787406612845556, + "grad_norm": 5.087682723999023, + "learning_rate": 8.472268615914168e-05, + "loss": 2.047, + "step": 3703 + }, + { + "epoch": 0.2788159355652158, + "grad_norm": 4.761455059051514, + "learning_rate": 8.471391332538268e-05, + "loss": 2.0498, + "step": 3704 + }, + { + "epoch": 0.2788912098458759, + "grad_norm": 5.433478355407715, + "learning_rate": 8.470513842798114e-05, + "loss": 2.1848, + "step": 3705 + }, + { + "epoch": 0.2789664841265361, + "grad_norm": 5.985280513763428, + "learning_rate": 8.46963614674587e-05, + "loss": 1.8056, + "step": 3706 + }, + { + "epoch": 0.2790417584071962, + "grad_norm": 4.757037162780762, + "learning_rate": 8.468758244433712e-05, + "loss": 1.9273, + "step": 3707 + }, + { + "epoch": 0.27911703268785637, + "grad_norm": 3.7146031856536865, + "learning_rate": 8.46788013591383e-05, + "loss": 1.8569, + "step": 3708 + }, + { + "epoch": 0.27919230696851655, + "grad_norm": 5.414200782775879, + "learning_rate": 8.467001821238422e-05, + "loss": 1.9257, + "step": 3709 + }, + { + "epoch": 0.27926758124917667, + "grad_norm": 4.104365348815918, + "learning_rate": 8.466123300459707e-05, + "loss": 2.2879, + "step": 3710 + }, + { + "epoch": 0.27934285552983684, + "grad_norm": 4.387416362762451, + "learning_rate": 8.465244573629907e-05, + "loss": 2.4506, + "step": 3711 + }, + { + "epoch": 0.279418129810497, + "grad_norm": 4.066411972045898, + "learning_rate": 8.464365640801259e-05, + "loss": 1.9848, + "step": 3712 + }, + { + "epoch": 0.27949340409115714, + "grad_norm": 4.027152061462402, + "learning_rate": 8.463486502026015e-05, + "loss": 1.875, + "step": 3713 + }, + { + "epoch": 0.2795686783718173, + "grad_norm": 5.357828617095947, + "learning_rate": 8.462607157356435e-05, + "loss": 2.0049, + "step": 3714 + }, + { + "epoch": 0.2796439526524775, + "grad_norm": 3.8154964447021484, + "learning_rate": 8.461727606844796e-05, + "loss": 2.1465, + "step": 3715 + }, + { + "epoch": 0.2797192269331376, + "grad_norm": 4.5639543533325195, + "learning_rate": 8.460847850543382e-05, + "loss": 2.1883, + "step": 3716 + }, + { + "epoch": 0.2797945012137978, + "grad_norm": 4.588010311126709, + "learning_rate": 8.459967888504494e-05, + "loss": 2.1676, + "step": 3717 + }, + { + "epoch": 0.2798697754944579, + "grad_norm": 4.198830604553223, + "learning_rate": 8.459087720780443e-05, + "loss": 2.212, + "step": 3718 + }, + { + "epoch": 0.2799450497751181, + "grad_norm": 4.541491508483887, + "learning_rate": 8.458207347423554e-05, + "loss": 2.2054, + "step": 3719 + }, + { + "epoch": 0.28002032405577826, + "grad_norm": 3.481940269470215, + "learning_rate": 8.457326768486159e-05, + "loss": 1.9229, + "step": 3720 + }, + { + "epoch": 0.2800955983364384, + "grad_norm": 5.106874942779541, + "learning_rate": 8.456445984020607e-05, + "loss": 1.7894, + "step": 3721 + }, + { + "epoch": 0.28017087261709855, + "grad_norm": 4.613595485687256, + "learning_rate": 8.455564994079261e-05, + "loss": 2.1641, + "step": 3722 + }, + { + "epoch": 0.28024614689775873, + "grad_norm": 3.6597654819488525, + "learning_rate": 8.45468379871449e-05, + "loss": 1.9853, + "step": 3723 + }, + { + "epoch": 0.28032142117841885, + "grad_norm": 4.6645402908325195, + "learning_rate": 8.453802397978681e-05, + "loss": 2.4116, + "step": 3724 + }, + { + "epoch": 0.280396695459079, + "grad_norm": 4.682445049285889, + "learning_rate": 8.452920791924228e-05, + "loss": 1.9803, + "step": 3725 + }, + { + "epoch": 0.2804719697397392, + "grad_norm": 4.133098125457764, + "learning_rate": 8.452038980603543e-05, + "loss": 2.1919, + "step": 3726 + }, + { + "epoch": 0.2805472440203993, + "grad_norm": 4.461101055145264, + "learning_rate": 8.451156964069043e-05, + "loss": 2.1125, + "step": 3727 + }, + { + "epoch": 0.2806225183010595, + "grad_norm": 4.334978103637695, + "learning_rate": 8.450274742373167e-05, + "loss": 1.933, + "step": 3728 + }, + { + "epoch": 0.2806977925817196, + "grad_norm": 4.740695953369141, + "learning_rate": 8.449392315568356e-05, + "loss": 1.9149, + "step": 3729 + }, + { + "epoch": 0.2807730668623798, + "grad_norm": 5.969768047332764, + "learning_rate": 8.448509683707069e-05, + "loss": 2.0345, + "step": 3730 + }, + { + "epoch": 0.28084834114303997, + "grad_norm": 4.868721961975098, + "learning_rate": 8.447626846841777e-05, + "loss": 1.9773, + "step": 3731 + }, + { + "epoch": 0.2809236154237001, + "grad_norm": 4.836376667022705, + "learning_rate": 8.44674380502496e-05, + "loss": 1.849, + "step": 3732 + }, + { + "epoch": 0.28099888970436027, + "grad_norm": 4.017833232879639, + "learning_rate": 8.445860558309116e-05, + "loss": 1.874, + "step": 3733 + }, + { + "epoch": 0.28107416398502044, + "grad_norm": 5.448207378387451, + "learning_rate": 8.444977106746748e-05, + "loss": 2.1061, + "step": 3734 + }, + { + "epoch": 0.28114943826568056, + "grad_norm": 5.075242519378662, + "learning_rate": 8.444093450390373e-05, + "loss": 1.8405, + "step": 3735 + }, + { + "epoch": 0.28122471254634074, + "grad_norm": 6.011194705963135, + "learning_rate": 8.44320958929253e-05, + "loss": 1.9754, + "step": 3736 + }, + { + "epoch": 0.28129998682700086, + "grad_norm": 6.286788463592529, + "learning_rate": 8.442325523505752e-05, + "loss": 1.5988, + "step": 3737 + }, + { + "epoch": 0.28137526110766103, + "grad_norm": 4.686147689819336, + "learning_rate": 8.441441253082601e-05, + "loss": 2.255, + "step": 3738 + }, + { + "epoch": 0.2814505353883212, + "grad_norm": 6.673200607299805, + "learning_rate": 8.440556778075641e-05, + "loss": 2.0409, + "step": 3739 + }, + { + "epoch": 0.28152580966898133, + "grad_norm": 5.125136375427246, + "learning_rate": 8.439672098537452e-05, + "loss": 2.5536, + "step": 3740 + }, + { + "epoch": 0.2816010839496415, + "grad_norm": 5.624234676361084, + "learning_rate": 8.438787214520626e-05, + "loss": 1.974, + "step": 3741 + }, + { + "epoch": 0.2816763582303017, + "grad_norm": 5.07732629776001, + "learning_rate": 8.43790212607777e-05, + "loss": 1.7633, + "step": 3742 + }, + { + "epoch": 0.2817516325109618, + "grad_norm": 6.201187610626221, + "learning_rate": 8.437016833261496e-05, + "loss": 1.8814, + "step": 3743 + }, + { + "epoch": 0.281826906791622, + "grad_norm": 4.50330924987793, + "learning_rate": 8.436131336124432e-05, + "loss": 1.7608, + "step": 3744 + }, + { + "epoch": 0.28190218107228215, + "grad_norm": 4.039889335632324, + "learning_rate": 8.43524563471922e-05, + "loss": 1.8488, + "step": 3745 + }, + { + "epoch": 0.2819774553529423, + "grad_norm": 4.408026695251465, + "learning_rate": 8.43435972909851e-05, + "loss": 1.8219, + "step": 3746 + }, + { + "epoch": 0.28205272963360245, + "grad_norm": 5.518854141235352, + "learning_rate": 8.433473619314972e-05, + "loss": 2.021, + "step": 3747 + }, + { + "epoch": 0.28212800391426257, + "grad_norm": 5.077434062957764, + "learning_rate": 8.432587305421278e-05, + "loss": 1.9762, + "step": 3748 + }, + { + "epoch": 0.28220327819492275, + "grad_norm": 5.609174728393555, + "learning_rate": 8.431700787470118e-05, + "loss": 2.1817, + "step": 3749 + }, + { + "epoch": 0.2822785524755829, + "grad_norm": 5.44873046875, + "learning_rate": 8.43081406551419e-05, + "loss": 1.8291, + "step": 3750 + }, + { + "epoch": 0.28235382675624304, + "grad_norm": 4.499638557434082, + "learning_rate": 8.429927139606213e-05, + "loss": 2.0681, + "step": 3751 + }, + { + "epoch": 0.2824291010369032, + "grad_norm": 5.210977077484131, + "learning_rate": 8.429040009798908e-05, + "loss": 2.0506, + "step": 3752 + }, + { + "epoch": 0.2825043753175634, + "grad_norm": 5.386653900146484, + "learning_rate": 8.428152676145013e-05, + "loss": 2.1869, + "step": 3753 + }, + { + "epoch": 0.2825796495982235, + "grad_norm": 5.557379722595215, + "learning_rate": 8.427265138697279e-05, + "loss": 1.9385, + "step": 3754 + }, + { + "epoch": 0.2826549238788837, + "grad_norm": 5.396453380584717, + "learning_rate": 8.426377397508465e-05, + "loss": 2.5755, + "step": 3755 + }, + { + "epoch": 0.2827301981595438, + "grad_norm": 4.60844087600708, + "learning_rate": 8.425489452631348e-05, + "loss": 2.0698, + "step": 3756 + }, + { + "epoch": 0.282805472440204, + "grad_norm": 6.2143120765686035, + "learning_rate": 8.424601304118711e-05, + "loss": 1.9856, + "step": 3757 + }, + { + "epoch": 0.28288074672086416, + "grad_norm": 7.1812872886657715, + "learning_rate": 8.423712952023353e-05, + "loss": 2.028, + "step": 3758 + }, + { + "epoch": 0.2829560210015243, + "grad_norm": 5.392536640167236, + "learning_rate": 8.422824396398082e-05, + "loss": 2.1509, + "step": 3759 + }, + { + "epoch": 0.28303129528218446, + "grad_norm": 8.016151428222656, + "learning_rate": 8.421935637295724e-05, + "loss": 2.1664, + "step": 3760 + }, + { + "epoch": 0.28310656956284463, + "grad_norm": 5.711176872253418, + "learning_rate": 8.42104667476911e-05, + "loss": 2.4585, + "step": 3761 + }, + { + "epoch": 0.28318184384350475, + "grad_norm": 5.761692047119141, + "learning_rate": 8.420157508871087e-05, + "loss": 2.203, + "step": 3762 + }, + { + "epoch": 0.28325711812416493, + "grad_norm": 5.780212879180908, + "learning_rate": 8.419268139654514e-05, + "loss": 2.2108, + "step": 3763 + }, + { + "epoch": 0.2833323924048251, + "grad_norm": 4.646653175354004, + "learning_rate": 8.418378567172262e-05, + "loss": 1.6395, + "step": 3764 + }, + { + "epoch": 0.2834076666854852, + "grad_norm": 4.893684387207031, + "learning_rate": 8.41748879147721e-05, + "loss": 2.3411, + "step": 3765 + }, + { + "epoch": 0.2834829409661454, + "grad_norm": 7.01469087600708, + "learning_rate": 8.416598812622259e-05, + "loss": 1.7463, + "step": 3766 + }, + { + "epoch": 0.2835582152468055, + "grad_norm": 7.009760856628418, + "learning_rate": 8.415708630660309e-05, + "loss": 2.2357, + "step": 3767 + }, + { + "epoch": 0.2836334895274657, + "grad_norm": 4.444606304168701, + "learning_rate": 8.414818245644283e-05, + "loss": 1.5604, + "step": 3768 + }, + { + "epoch": 0.2837087638081259, + "grad_norm": 4.191519260406494, + "learning_rate": 8.41392765762711e-05, + "loss": 1.9184, + "step": 3769 + }, + { + "epoch": 0.283784038088786, + "grad_norm": 3.9929373264312744, + "learning_rate": 8.413036866661733e-05, + "loss": 1.8614, + "step": 3770 + }, + { + "epoch": 0.28385931236944617, + "grad_norm": 3.9005722999572754, + "learning_rate": 8.412145872801107e-05, + "loss": 1.9857, + "step": 3771 + }, + { + "epoch": 0.28393458665010635, + "grad_norm": 4.063873291015625, + "learning_rate": 8.4112546760982e-05, + "loss": 1.9791, + "step": 3772 + }, + { + "epoch": 0.28400986093076647, + "grad_norm": 9.40646743774414, + "learning_rate": 8.410363276605992e-05, + "loss": 2.5313, + "step": 3773 + }, + { + "epoch": 0.28408513521142664, + "grad_norm": 4.796889305114746, + "learning_rate": 8.409471674377471e-05, + "loss": 2.3056, + "step": 3774 + }, + { + "epoch": 0.2841604094920868, + "grad_norm": 7.428408145904541, + "learning_rate": 8.408579869465641e-05, + "loss": 1.9011, + "step": 3775 + }, + { + "epoch": 0.28423568377274694, + "grad_norm": 5.819879055023193, + "learning_rate": 8.40768786192352e-05, + "loss": 2.0832, + "step": 3776 + }, + { + "epoch": 0.2843109580534071, + "grad_norm": 4.4307122230529785, + "learning_rate": 8.406795651804132e-05, + "loss": 2.1961, + "step": 3777 + }, + { + "epoch": 0.28438623233406723, + "grad_norm": 4.760819911956787, + "learning_rate": 8.40590323916052e-05, + "loss": 2.1567, + "step": 3778 + }, + { + "epoch": 0.2844615066147274, + "grad_norm": 4.578474044799805, + "learning_rate": 8.40501062404573e-05, + "loss": 1.952, + "step": 3779 + }, + { + "epoch": 0.2845367808953876, + "grad_norm": 4.203295707702637, + "learning_rate": 8.40411780651283e-05, + "loss": 2.0934, + "step": 3780 + }, + { + "epoch": 0.2846120551760477, + "grad_norm": 4.144057750701904, + "learning_rate": 8.403224786614894e-05, + "loss": 1.7741, + "step": 3781 + }, + { + "epoch": 0.2846873294567079, + "grad_norm": 7.02156925201416, + "learning_rate": 8.402331564405009e-05, + "loss": 1.6324, + "step": 3782 + }, + { + "epoch": 0.28476260373736806, + "grad_norm": 7.329413890838623, + "learning_rate": 8.401438139936275e-05, + "loss": 2.2842, + "step": 3783 + }, + { + "epoch": 0.2848378780180282, + "grad_norm": 5.4921345710754395, + "learning_rate": 8.400544513261803e-05, + "loss": 1.9874, + "step": 3784 + }, + { + "epoch": 0.28491315229868835, + "grad_norm": 5.664630889892578, + "learning_rate": 8.399650684434719e-05, + "loss": 2.0946, + "step": 3785 + }, + { + "epoch": 0.2849884265793485, + "grad_norm": 6.516373157501221, + "learning_rate": 8.398756653508154e-05, + "loss": 2.0939, + "step": 3786 + }, + { + "epoch": 0.28506370086000865, + "grad_norm": 4.918422698974609, + "learning_rate": 8.397862420535258e-05, + "loss": 1.8095, + "step": 3787 + }, + { + "epoch": 0.2851389751406688, + "grad_norm": 5.609355926513672, + "learning_rate": 8.396967985569192e-05, + "loss": 1.9581, + "step": 3788 + }, + { + "epoch": 0.28521424942132895, + "grad_norm": 3.8850326538085938, + "learning_rate": 8.396073348663126e-05, + "loss": 1.6692, + "step": 3789 + }, + { + "epoch": 0.2852895237019891, + "grad_norm": 5.608982086181641, + "learning_rate": 8.395178509870244e-05, + "loss": 1.7296, + "step": 3790 + }, + { + "epoch": 0.2853647979826493, + "grad_norm": 4.29400110244751, + "learning_rate": 8.39428346924374e-05, + "loss": 1.7776, + "step": 3791 + }, + { + "epoch": 0.2854400722633094, + "grad_norm": 4.845223426818848, + "learning_rate": 8.393388226836825e-05, + "loss": 2.293, + "step": 3792 + }, + { + "epoch": 0.2855153465439696, + "grad_norm": 5.966276168823242, + "learning_rate": 8.392492782702717e-05, + "loss": 2.1213, + "step": 3793 + }, + { + "epoch": 0.28559062082462977, + "grad_norm": 6.135119438171387, + "learning_rate": 8.391597136894645e-05, + "loss": 1.9188, + "step": 3794 + }, + { + "epoch": 0.2856658951052899, + "grad_norm": 4.353500843048096, + "learning_rate": 8.390701289465856e-05, + "loss": 1.8072, + "step": 3795 + }, + { + "epoch": 0.28574116938595007, + "grad_norm": 4.831733226776123, + "learning_rate": 8.389805240469604e-05, + "loss": 1.7808, + "step": 3796 + }, + { + "epoch": 0.2858164436666102, + "grad_norm": 5.825110912322998, + "learning_rate": 8.388908989959156e-05, + "loss": 2.2115, + "step": 3797 + }, + { + "epoch": 0.28589171794727036, + "grad_norm": 4.649841785430908, + "learning_rate": 8.388012537987795e-05, + "loss": 1.8913, + "step": 3798 + }, + { + "epoch": 0.28596699222793054, + "grad_norm": 5.0056471824646, + "learning_rate": 8.387115884608808e-05, + "loss": 2.0774, + "step": 3799 + }, + { + "epoch": 0.28604226650859066, + "grad_norm": 4.448461055755615, + "learning_rate": 8.386219029875502e-05, + "loss": 2.052, + "step": 3800 + }, + { + "epoch": 0.28611754078925083, + "grad_norm": 4.005342960357666, + "learning_rate": 8.385321973841189e-05, + "loss": 1.9199, + "step": 3801 + }, + { + "epoch": 0.286192815069911, + "grad_norm": 4.80564022064209, + "learning_rate": 8.384424716559198e-05, + "loss": 1.803, + "step": 3802 + }, + { + "epoch": 0.28626808935057113, + "grad_norm": 4.846798896789551, + "learning_rate": 8.38352725808287e-05, + "loss": 1.976, + "step": 3803 + }, + { + "epoch": 0.2863433636312313, + "grad_norm": 4.015679359436035, + "learning_rate": 8.382629598465555e-05, + "loss": 1.6681, + "step": 3804 + }, + { + "epoch": 0.2864186379118914, + "grad_norm": 5.652832984924316, + "learning_rate": 8.381731737760614e-05, + "loss": 2.0525, + "step": 3805 + }, + { + "epoch": 0.2864939121925516, + "grad_norm": 4.099508285522461, + "learning_rate": 8.380833676021427e-05, + "loss": 1.8231, + "step": 3806 + }, + { + "epoch": 0.2865691864732118, + "grad_norm": 4.781913757324219, + "learning_rate": 8.379935413301378e-05, + "loss": 1.9868, + "step": 3807 + }, + { + "epoch": 0.2866444607538719, + "grad_norm": 4.977044105529785, + "learning_rate": 8.379036949653867e-05, + "loss": 1.9808, + "step": 3808 + }, + { + "epoch": 0.2867197350345321, + "grad_norm": 4.7633256912231445, + "learning_rate": 8.378138285132305e-05, + "loss": 2.095, + "step": 3809 + }, + { + "epoch": 0.28679500931519225, + "grad_norm": 6.2208991050720215, + "learning_rate": 8.377239419790112e-05, + "loss": 1.8992, + "step": 3810 + }, + { + "epoch": 0.28687028359585237, + "grad_norm": 3.845478057861328, + "learning_rate": 8.37634035368073e-05, + "loss": 1.8591, + "step": 3811 + }, + { + "epoch": 0.28694555787651255, + "grad_norm": 4.4607977867126465, + "learning_rate": 8.3754410868576e-05, + "loss": 1.8387, + "step": 3812 + }, + { + "epoch": 0.2870208321571727, + "grad_norm": 4.289780139923096, + "learning_rate": 8.374541619374182e-05, + "loss": 1.919, + "step": 3813 + }, + { + "epoch": 0.28709610643783284, + "grad_norm": 4.584689617156982, + "learning_rate": 8.373641951283948e-05, + "loss": 1.7526, + "step": 3814 + }, + { + "epoch": 0.287171380718493, + "grad_norm": 4.672449111938477, + "learning_rate": 8.37274208264038e-05, + "loss": 2.1342, + "step": 3815 + }, + { + "epoch": 0.28724665499915314, + "grad_norm": 4.065759181976318, + "learning_rate": 8.371842013496973e-05, + "loss": 1.7796, + "step": 3816 + }, + { + "epoch": 0.2873219292798133, + "grad_norm": 5.574157238006592, + "learning_rate": 8.370941743907233e-05, + "loss": 1.8598, + "step": 3817 + }, + { + "epoch": 0.2873972035604735, + "grad_norm": 4.440821647644043, + "learning_rate": 8.370041273924678e-05, + "loss": 1.9419, + "step": 3818 + }, + { + "epoch": 0.2874724778411336, + "grad_norm": 4.1743879318237305, + "learning_rate": 8.36914060360284e-05, + "loss": 2.0588, + "step": 3819 + }, + { + "epoch": 0.2875477521217938, + "grad_norm": 5.459051609039307, + "learning_rate": 8.36823973299526e-05, + "loss": 2.1894, + "step": 3820 + }, + { + "epoch": 0.28762302640245396, + "grad_norm": 5.443028926849365, + "learning_rate": 8.367338662155493e-05, + "loss": 2.0621, + "step": 3821 + }, + { + "epoch": 0.2876983006831141, + "grad_norm": 4.028825759887695, + "learning_rate": 8.366437391137103e-05, + "loss": 1.9051, + "step": 3822 + }, + { + "epoch": 0.28777357496377426, + "grad_norm": 4.738773345947266, + "learning_rate": 8.36553591999367e-05, + "loss": 2.1645, + "step": 3823 + }, + { + "epoch": 0.28784884924443443, + "grad_norm": 5.067007064819336, + "learning_rate": 8.364634248778784e-05, + "loss": 2.2127, + "step": 3824 + }, + { + "epoch": 0.28792412352509456, + "grad_norm": 5.075727939605713, + "learning_rate": 8.363732377546045e-05, + "loss": 2.0149, + "step": 3825 + }, + { + "epoch": 0.28799939780575473, + "grad_norm": 5.6277384757995605, + "learning_rate": 8.36283030634907e-05, + "loss": 2.0865, + "step": 3826 + }, + { + "epoch": 0.28807467208641485, + "grad_norm": 4.5693678855896, + "learning_rate": 8.36192803524148e-05, + "loss": 1.8401, + "step": 3827 + }, + { + "epoch": 0.288149946367075, + "grad_norm": 6.34404993057251, + "learning_rate": 8.361025564276917e-05, + "loss": 2.2659, + "step": 3828 + }, + { + "epoch": 0.2882252206477352, + "grad_norm": 5.009212017059326, + "learning_rate": 8.360122893509026e-05, + "loss": 2.0759, + "step": 3829 + }, + { + "epoch": 0.2883004949283953, + "grad_norm": 6.073802947998047, + "learning_rate": 8.35922002299147e-05, + "loss": 1.9269, + "step": 3830 + }, + { + "epoch": 0.2883757692090555, + "grad_norm": 7.261332035064697, + "learning_rate": 8.358316952777925e-05, + "loss": 2.074, + "step": 3831 + }, + { + "epoch": 0.2884510434897157, + "grad_norm": 5.354355812072754, + "learning_rate": 8.357413682922071e-05, + "loss": 2.5389, + "step": 3832 + }, + { + "epoch": 0.2885263177703758, + "grad_norm": 4.396764755249023, + "learning_rate": 8.356510213477606e-05, + "loss": 2.0525, + "step": 3833 + }, + { + "epoch": 0.28860159205103597, + "grad_norm": 3.9522957801818848, + "learning_rate": 8.355606544498243e-05, + "loss": 1.7007, + "step": 3834 + }, + { + "epoch": 0.2886768663316961, + "grad_norm": 4.869263648986816, + "learning_rate": 8.354702676037697e-05, + "loss": 1.9716, + "step": 3835 + }, + { + "epoch": 0.28875214061235627, + "grad_norm": 6.5716142654418945, + "learning_rate": 8.353798608149704e-05, + "loss": 1.8174, + "step": 3836 + }, + { + "epoch": 0.28882741489301644, + "grad_norm": 5.1595659255981445, + "learning_rate": 8.352894340888005e-05, + "loss": 2.0736, + "step": 3837 + }, + { + "epoch": 0.28890268917367656, + "grad_norm": 4.367010116577148, + "learning_rate": 8.351989874306358e-05, + "loss": 2.1123, + "step": 3838 + }, + { + "epoch": 0.28897796345433674, + "grad_norm": 4.278624057769775, + "learning_rate": 8.351085208458534e-05, + "loss": 1.8711, + "step": 3839 + }, + { + "epoch": 0.2890532377349969, + "grad_norm": 4.627903938293457, + "learning_rate": 8.350180343398307e-05, + "loss": 2.1301, + "step": 3840 + }, + { + "epoch": 0.28912851201565704, + "grad_norm": 5.781001091003418, + "learning_rate": 8.349275279179471e-05, + "loss": 2.0391, + "step": 3841 + }, + { + "epoch": 0.2892037862963172, + "grad_norm": 5.771537780761719, + "learning_rate": 8.348370015855831e-05, + "loss": 2.217, + "step": 3842 + }, + { + "epoch": 0.2892790605769774, + "grad_norm": 4.254384994506836, + "learning_rate": 8.347464553481201e-05, + "loss": 1.8493, + "step": 3843 + }, + { + "epoch": 0.2893543348576375, + "grad_norm": 5.078681468963623, + "learning_rate": 8.346558892109409e-05, + "loss": 1.8544, + "step": 3844 + }, + { + "epoch": 0.2894296091382977, + "grad_norm": 4.085641384124756, + "learning_rate": 8.345653031794292e-05, + "loss": 1.9271, + "step": 3845 + }, + { + "epoch": 0.2895048834189578, + "grad_norm": 6.673791408538818, + "learning_rate": 8.344746972589702e-05, + "loss": 1.8395, + "step": 3846 + }, + { + "epoch": 0.289580157699618, + "grad_norm": 5.336939334869385, + "learning_rate": 8.343840714549502e-05, + "loss": 1.5987, + "step": 3847 + }, + { + "epoch": 0.28965543198027816, + "grad_norm": 5.2516326904296875, + "learning_rate": 8.342934257727566e-05, + "loss": 2.2035, + "step": 3848 + }, + { + "epoch": 0.2897307062609383, + "grad_norm": 4.194336891174316, + "learning_rate": 8.342027602177779e-05, + "loss": 1.9974, + "step": 3849 + }, + { + "epoch": 0.28980598054159845, + "grad_norm": 5.828739166259766, + "learning_rate": 8.341120747954043e-05, + "loss": 2.0415, + "step": 3850 + }, + { + "epoch": 0.2898812548222586, + "grad_norm": 4.858316421508789, + "learning_rate": 8.340213695110264e-05, + "loss": 2.1274, + "step": 3851 + }, + { + "epoch": 0.28995652910291875, + "grad_norm": 4.2831878662109375, + "learning_rate": 8.339306443700366e-05, + "loss": 2.0904, + "step": 3852 + }, + { + "epoch": 0.2900318033835789, + "grad_norm": 5.760950565338135, + "learning_rate": 8.33839899377828e-05, + "loss": 1.9026, + "step": 3853 + }, + { + "epoch": 0.29010707766423904, + "grad_norm": 6.153852939605713, + "learning_rate": 8.337491345397953e-05, + "loss": 1.859, + "step": 3854 + }, + { + "epoch": 0.2901823519448992, + "grad_norm": 4.654358863830566, + "learning_rate": 8.336583498613342e-05, + "loss": 2.0937, + "step": 3855 + }, + { + "epoch": 0.2902576262255594, + "grad_norm": 4.814719200134277, + "learning_rate": 8.335675453478416e-05, + "loss": 2.1774, + "step": 3856 + }, + { + "epoch": 0.2903329005062195, + "grad_norm": 4.641236782073975, + "learning_rate": 8.334767210047155e-05, + "loss": 2.0392, + "step": 3857 + }, + { + "epoch": 0.2904081747868797, + "grad_norm": 4.533456802368164, + "learning_rate": 8.333858768373552e-05, + "loss": 2.0107, + "step": 3858 + }, + { + "epoch": 0.29048344906753987, + "grad_norm": 8.164016723632812, + "learning_rate": 8.332950128511612e-05, + "loss": 2.51, + "step": 3859 + }, + { + "epoch": 0.2905587233482, + "grad_norm": 5.916752338409424, + "learning_rate": 8.332041290515349e-05, + "loss": 1.7555, + "step": 3860 + }, + { + "epoch": 0.29063399762886016, + "grad_norm": 16.761079788208008, + "learning_rate": 8.331132254438793e-05, + "loss": 1.9136, + "step": 3861 + }, + { + "epoch": 0.29070927190952034, + "grad_norm": 5.389075756072998, + "learning_rate": 8.330223020335982e-05, + "loss": 1.862, + "step": 3862 + }, + { + "epoch": 0.29078454619018046, + "grad_norm": 4.40911340713501, + "learning_rate": 8.329313588260968e-05, + "loss": 1.9847, + "step": 3863 + }, + { + "epoch": 0.29085982047084064, + "grad_norm": 4.051477909088135, + "learning_rate": 8.328403958267814e-05, + "loss": 1.8207, + "step": 3864 + }, + { + "epoch": 0.29093509475150076, + "grad_norm": 5.646363258361816, + "learning_rate": 8.327494130410595e-05, + "loss": 1.7035, + "step": 3865 + }, + { + "epoch": 0.29101036903216093, + "grad_norm": 4.3422017097473145, + "learning_rate": 8.326584104743398e-05, + "loss": 1.719, + "step": 3866 + }, + { + "epoch": 0.2910856433128211, + "grad_norm": 5.7380194664001465, + "learning_rate": 8.325673881320319e-05, + "loss": 1.8496, + "step": 3867 + }, + { + "epoch": 0.2911609175934812, + "grad_norm": 4.962455749511719, + "learning_rate": 8.324763460195472e-05, + "loss": 1.9531, + "step": 3868 + }, + { + "epoch": 0.2912361918741414, + "grad_norm": 5.762204647064209, + "learning_rate": 8.323852841422976e-05, + "loss": 2.2756, + "step": 3869 + }, + { + "epoch": 0.2913114661548016, + "grad_norm": 8.217690467834473, + "learning_rate": 8.322942025056968e-05, + "loss": 2.1949, + "step": 3870 + }, + { + "epoch": 0.2913867404354617, + "grad_norm": 5.117194652557373, + "learning_rate": 8.32203101115159e-05, + "loss": 2.4251, + "step": 3871 + }, + { + "epoch": 0.2914620147161219, + "grad_norm": 5.370823860168457, + "learning_rate": 8.321119799760999e-05, + "loss": 1.8887, + "step": 3872 + }, + { + "epoch": 0.29153728899678205, + "grad_norm": 5.795083045959473, + "learning_rate": 8.320208390939368e-05, + "loss": 1.7909, + "step": 3873 + }, + { + "epoch": 0.29161256327744217, + "grad_norm": 5.213447093963623, + "learning_rate": 8.319296784740873e-05, + "loss": 2.2363, + "step": 3874 + }, + { + "epoch": 0.29168783755810235, + "grad_norm": 5.183766841888428, + "learning_rate": 8.318384981219707e-05, + "loss": 1.8655, + "step": 3875 + }, + { + "epoch": 0.29176311183876247, + "grad_norm": 4.207417964935303, + "learning_rate": 8.317472980430079e-05, + "loss": 1.9247, + "step": 3876 + }, + { + "epoch": 0.29183838611942264, + "grad_norm": 5.6128106117248535, + "learning_rate": 8.316560782426199e-05, + "loss": 2.0628, + "step": 3877 + }, + { + "epoch": 0.2919136604000828, + "grad_norm": 5.02679967880249, + "learning_rate": 8.315648387262296e-05, + "loss": 1.9159, + "step": 3878 + }, + { + "epoch": 0.29198893468074294, + "grad_norm": 5.224948406219482, + "learning_rate": 8.314735794992612e-05, + "loss": 2.0118, + "step": 3879 + }, + { + "epoch": 0.2920642089614031, + "grad_norm": 4.2891645431518555, + "learning_rate": 8.313823005671396e-05, + "loss": 1.8389, + "step": 3880 + }, + { + "epoch": 0.2921394832420633, + "grad_norm": 4.979725360870361, + "learning_rate": 8.312910019352909e-05, + "loss": 1.8752, + "step": 3881 + }, + { + "epoch": 0.2922147575227234, + "grad_norm": 4.553825378417969, + "learning_rate": 8.311996836091427e-05, + "loss": 1.8522, + "step": 3882 + }, + { + "epoch": 0.2922900318033836, + "grad_norm": 5.251980781555176, + "learning_rate": 8.311083455941238e-05, + "loss": 1.9936, + "step": 3883 + }, + { + "epoch": 0.2923653060840437, + "grad_norm": 4.495422840118408, + "learning_rate": 8.310169878956638e-05, + "loss": 2.3615, + "step": 3884 + }, + { + "epoch": 0.2924405803647039, + "grad_norm": 4.727469444274902, + "learning_rate": 8.309256105191934e-05, + "loss": 2.0753, + "step": 3885 + }, + { + "epoch": 0.29251585464536406, + "grad_norm": 4.480971813201904, + "learning_rate": 8.308342134701452e-05, + "loss": 2.0304, + "step": 3886 + }, + { + "epoch": 0.2925911289260242, + "grad_norm": 5.546588897705078, + "learning_rate": 8.307427967539523e-05, + "loss": 2.2204, + "step": 3887 + }, + { + "epoch": 0.29266640320668436, + "grad_norm": 4.663414001464844, + "learning_rate": 8.30651360376049e-05, + "loss": 1.893, + "step": 3888 + }, + { + "epoch": 0.29274167748734453, + "grad_norm": 5.547967433929443, + "learning_rate": 8.305599043418712e-05, + "loss": 1.8702, + "step": 3889 + }, + { + "epoch": 0.29281695176800465, + "grad_norm": 3.9726219177246094, + "learning_rate": 8.304684286568556e-05, + "loss": 2.0076, + "step": 3890 + }, + { + "epoch": 0.2928922260486648, + "grad_norm": 6.071773529052734, + "learning_rate": 8.303769333264402e-05, + "loss": 1.9457, + "step": 3891 + }, + { + "epoch": 0.292967500329325, + "grad_norm": 4.126742839813232, + "learning_rate": 8.302854183560639e-05, + "loss": 1.8904, + "step": 3892 + }, + { + "epoch": 0.2930427746099851, + "grad_norm": 5.6514363288879395, + "learning_rate": 8.30193883751167e-05, + "loss": 2.2859, + "step": 3893 + }, + { + "epoch": 0.2931180488906453, + "grad_norm": 7.316897869110107, + "learning_rate": 8.301023295171916e-05, + "loss": 1.9531, + "step": 3894 + }, + { + "epoch": 0.2931933231713054, + "grad_norm": 3.8049962520599365, + "learning_rate": 8.300107556595796e-05, + "loss": 1.6219, + "step": 3895 + }, + { + "epoch": 0.2932685974519656, + "grad_norm": 6.120655536651611, + "learning_rate": 8.299191621837751e-05, + "loss": 2.0276, + "step": 3896 + }, + { + "epoch": 0.29334387173262577, + "grad_norm": 6.627539157867432, + "learning_rate": 8.298275490952232e-05, + "loss": 2.0559, + "step": 3897 + }, + { + "epoch": 0.2934191460132859, + "grad_norm": 5.482000350952148, + "learning_rate": 8.297359163993697e-05, + "loss": 1.7327, + "step": 3898 + }, + { + "epoch": 0.29349442029394607, + "grad_norm": 7.104825019836426, + "learning_rate": 8.296442641016622e-05, + "loss": 2.4138, + "step": 3899 + }, + { + "epoch": 0.29356969457460624, + "grad_norm": 5.38658332824707, + "learning_rate": 8.29552592207549e-05, + "loss": 1.6841, + "step": 3900 + }, + { + "epoch": 0.29364496885526636, + "grad_norm": 5.3317694664001465, + "learning_rate": 8.294609007224797e-05, + "loss": 2.0955, + "step": 3901 + }, + { + "epoch": 0.29372024313592654, + "grad_norm": 5.122278213500977, + "learning_rate": 8.293691896519053e-05, + "loss": 2.2212, + "step": 3902 + }, + { + "epoch": 0.29379551741658666, + "grad_norm": 12.64024543762207, + "learning_rate": 8.292774590012777e-05, + "loss": 2.11, + "step": 3903 + }, + { + "epoch": 0.29387079169724684, + "grad_norm": 5.043898105621338, + "learning_rate": 8.291857087760499e-05, + "loss": 1.6972, + "step": 3904 + }, + { + "epoch": 0.293946065977907, + "grad_norm": 6.9429850578308105, + "learning_rate": 8.290939389816763e-05, + "loss": 2.081, + "step": 3905 + }, + { + "epoch": 0.29402134025856713, + "grad_norm": 5.512428283691406, + "learning_rate": 8.290021496236122e-05, + "loss": 1.5824, + "step": 3906 + }, + { + "epoch": 0.2940966145392273, + "grad_norm": 9.171966552734375, + "learning_rate": 8.289103407073143e-05, + "loss": 2.1031, + "step": 3907 + }, + { + "epoch": 0.2941718888198875, + "grad_norm": 5.711566925048828, + "learning_rate": 8.288185122382405e-05, + "loss": 1.9277, + "step": 3908 + }, + { + "epoch": 0.2942471631005476, + "grad_norm": 7.213926315307617, + "learning_rate": 8.287266642218496e-05, + "loss": 2.2912, + "step": 3909 + }, + { + "epoch": 0.2943224373812078, + "grad_norm": 8.158979415893555, + "learning_rate": 8.286347966636016e-05, + "loss": 2.175, + "step": 3910 + }, + { + "epoch": 0.29439771166186796, + "grad_norm": 3.853114604949951, + "learning_rate": 8.285429095689581e-05, + "loss": 1.8702, + "step": 3911 + }, + { + "epoch": 0.2944729859425281, + "grad_norm": 4.604254245758057, + "learning_rate": 8.284510029433813e-05, + "loss": 1.7579, + "step": 3912 + }, + { + "epoch": 0.29454826022318825, + "grad_norm": 4.960995674133301, + "learning_rate": 8.283590767923347e-05, + "loss": 2.0526, + "step": 3913 + }, + { + "epoch": 0.29462353450384837, + "grad_norm": 4.931746959686279, + "learning_rate": 8.282671311212833e-05, + "loss": 2.0382, + "step": 3914 + }, + { + "epoch": 0.29469880878450855, + "grad_norm": 5.330249309539795, + "learning_rate": 8.281751659356926e-05, + "loss": 2.1246, + "step": 3915 + }, + { + "epoch": 0.2947740830651687, + "grad_norm": 3.997763156890869, + "learning_rate": 8.280831812410301e-05, + "loss": 2.3116, + "step": 3916 + }, + { + "epoch": 0.29484935734582884, + "grad_norm": 5.4841227531433105, + "learning_rate": 8.279911770427638e-05, + "loss": 2.0063, + "step": 3917 + }, + { + "epoch": 0.294924631626489, + "grad_norm": 6.070057392120361, + "learning_rate": 8.278991533463633e-05, + "loss": 2.1699, + "step": 3918 + }, + { + "epoch": 0.2949999059071492, + "grad_norm": 4.857336044311523, + "learning_rate": 8.278071101572988e-05, + "loss": 1.9054, + "step": 3919 + }, + { + "epoch": 0.2950751801878093, + "grad_norm": 4.851632595062256, + "learning_rate": 8.277150474810422e-05, + "loss": 2.0802, + "step": 3920 + }, + { + "epoch": 0.2951504544684695, + "grad_norm": 5.263962745666504, + "learning_rate": 8.276229653230665e-05, + "loss": 2.1138, + "step": 3921 + }, + { + "epoch": 0.29522572874912967, + "grad_norm": 5.567708969116211, + "learning_rate": 8.275308636888455e-05, + "loss": 1.8598, + "step": 3922 + }, + { + "epoch": 0.2953010030297898, + "grad_norm": 5.504866123199463, + "learning_rate": 8.274387425838543e-05, + "loss": 1.8673, + "step": 3923 + }, + { + "epoch": 0.29537627731044996, + "grad_norm": 4.360326290130615, + "learning_rate": 8.273466020135695e-05, + "loss": 1.8631, + "step": 3924 + }, + { + "epoch": 0.2954515515911101, + "grad_norm": 4.288941860198975, + "learning_rate": 8.272544419834685e-05, + "loss": 2.3168, + "step": 3925 + }, + { + "epoch": 0.29552682587177026, + "grad_norm": 4.796971321105957, + "learning_rate": 8.271622624990298e-05, + "loss": 1.9152, + "step": 3926 + }, + { + "epoch": 0.29560210015243044, + "grad_norm": 4.761312007904053, + "learning_rate": 8.270700635657334e-05, + "loss": 2.1955, + "step": 3927 + }, + { + "epoch": 0.29567737443309056, + "grad_norm": 5.798272132873535, + "learning_rate": 8.269778451890603e-05, + "loss": 1.864, + "step": 3928 + }, + { + "epoch": 0.29575264871375073, + "grad_norm": 5.7751078605651855, + "learning_rate": 8.268856073744924e-05, + "loss": 2.1239, + "step": 3929 + }, + { + "epoch": 0.2958279229944109, + "grad_norm": 5.696370601654053, + "learning_rate": 8.267933501275132e-05, + "loss": 2.1238, + "step": 3930 + }, + { + "epoch": 0.29590319727507103, + "grad_norm": 5.055115222930908, + "learning_rate": 8.267010734536067e-05, + "loss": 2.0881, + "step": 3931 + }, + { + "epoch": 0.2959784715557312, + "grad_norm": 4.647343158721924, + "learning_rate": 8.266087773582591e-05, + "loss": 1.8201, + "step": 3932 + }, + { + "epoch": 0.2960537458363913, + "grad_norm": 4.77907133102417, + "learning_rate": 8.265164618469567e-05, + "loss": 1.6277, + "step": 3933 + }, + { + "epoch": 0.2961290201170515, + "grad_norm": 6.428045272827148, + "learning_rate": 8.264241269251876e-05, + "loss": 2.1503, + "step": 3934 + }, + { + "epoch": 0.2962042943977117, + "grad_norm": 8.573118209838867, + "learning_rate": 8.263317725984406e-05, + "loss": 2.194, + "step": 3935 + }, + { + "epoch": 0.2962795686783718, + "grad_norm": 3.6545355319976807, + "learning_rate": 8.262393988722061e-05, + "loss": 1.6625, + "step": 3936 + }, + { + "epoch": 0.29635484295903197, + "grad_norm": 9.070815086364746, + "learning_rate": 8.261470057519755e-05, + "loss": 2.0998, + "step": 3937 + }, + { + "epoch": 0.29643011723969215, + "grad_norm": 5.5088582038879395, + "learning_rate": 8.260545932432413e-05, + "loss": 2.2391, + "step": 3938 + }, + { + "epoch": 0.29650539152035227, + "grad_norm": 3.3841552734375, + "learning_rate": 8.25962161351497e-05, + "loss": 2.1441, + "step": 3939 + }, + { + "epoch": 0.29658066580101244, + "grad_norm": 4.375843524932861, + "learning_rate": 8.258697100822376e-05, + "loss": 2.1479, + "step": 3940 + }, + { + "epoch": 0.2966559400816726, + "grad_norm": 6.604024410247803, + "learning_rate": 8.257772394409589e-05, + "loss": 2.2115, + "step": 3941 + }, + { + "epoch": 0.29673121436233274, + "grad_norm": 4.49370813369751, + "learning_rate": 8.256847494331581e-05, + "loss": 1.8916, + "step": 3942 + }, + { + "epoch": 0.2968064886429929, + "grad_norm": 5.889625549316406, + "learning_rate": 8.255922400643335e-05, + "loss": 2.075, + "step": 3943 + }, + { + "epoch": 0.29688176292365304, + "grad_norm": 6.818896293640137, + "learning_rate": 8.254997113399842e-05, + "loss": 1.8455, + "step": 3944 + }, + { + "epoch": 0.2969570372043132, + "grad_norm": 5.231121063232422, + "learning_rate": 8.254071632656113e-05, + "loss": 2.1155, + "step": 3945 + }, + { + "epoch": 0.2970323114849734, + "grad_norm": 4.874687194824219, + "learning_rate": 8.253145958467162e-05, + "loss": 1.7141, + "step": 3946 + }, + { + "epoch": 0.2971075857656335, + "grad_norm": 5.8896050453186035, + "learning_rate": 8.252220090888017e-05, + "loss": 1.981, + "step": 3947 + }, + { + "epoch": 0.2971828600462937, + "grad_norm": 4.113016128540039, + "learning_rate": 8.251294029973719e-05, + "loss": 1.6467, + "step": 3948 + }, + { + "epoch": 0.29725813432695386, + "grad_norm": 6.598508358001709, + "learning_rate": 8.25036777577932e-05, + "loss": 2.0384, + "step": 3949 + }, + { + "epoch": 0.297333408607614, + "grad_norm": 5.10032844543457, + "learning_rate": 8.249441328359883e-05, + "loss": 1.9433, + "step": 3950 + }, + { + "epoch": 0.29740868288827416, + "grad_norm": 6.028850078582764, + "learning_rate": 8.248514687770483e-05, + "loss": 1.7315, + "step": 3951 + }, + { + "epoch": 0.29748395716893433, + "grad_norm": 5.534749507904053, + "learning_rate": 8.247587854066207e-05, + "loss": 1.9396, + "step": 3952 + }, + { + "epoch": 0.29755923144959445, + "grad_norm": 4.4021711349487305, + "learning_rate": 8.24666082730215e-05, + "loss": 1.9053, + "step": 3953 + }, + { + "epoch": 0.29763450573025463, + "grad_norm": 4.532413005828857, + "learning_rate": 8.245733607533423e-05, + "loss": 1.9209, + "step": 3954 + }, + { + "epoch": 0.29770978001091475, + "grad_norm": 4.757065296173096, + "learning_rate": 8.244806194815146e-05, + "loss": 2.3268, + "step": 3955 + }, + { + "epoch": 0.2977850542915749, + "grad_norm": 7.829839706420898, + "learning_rate": 8.243878589202451e-05, + "loss": 1.8357, + "step": 3956 + }, + { + "epoch": 0.2978603285722351, + "grad_norm": 6.756631851196289, + "learning_rate": 8.242950790750484e-05, + "loss": 1.793, + "step": 3957 + }, + { + "epoch": 0.2979356028528952, + "grad_norm": 4.6028642654418945, + "learning_rate": 8.242022799514395e-05, + "loss": 2.1313, + "step": 3958 + }, + { + "epoch": 0.2980108771335554, + "grad_norm": 6.229007244110107, + "learning_rate": 8.241094615549353e-05, + "loss": 2.1468, + "step": 3959 + }, + { + "epoch": 0.29808615141421557, + "grad_norm": 6.420483112335205, + "learning_rate": 8.240166238910536e-05, + "loss": 2.1654, + "step": 3960 + }, + { + "epoch": 0.2981614256948757, + "grad_norm": 7.910793304443359, + "learning_rate": 8.239237669653134e-05, + "loss": 2.0879, + "step": 3961 + }, + { + "epoch": 0.29823669997553587, + "grad_norm": 5.2804460525512695, + "learning_rate": 8.238308907832347e-05, + "loss": 1.9275, + "step": 3962 + }, + { + "epoch": 0.298311974256196, + "grad_norm": 10.75217342376709, + "learning_rate": 8.237379953503386e-05, + "loss": 2.747, + "step": 3963 + }, + { + "epoch": 0.29838724853685616, + "grad_norm": 4.8242106437683105, + "learning_rate": 8.236450806721476e-05, + "loss": 1.8727, + "step": 3964 + }, + { + "epoch": 0.29846252281751634, + "grad_norm": 6.477041244506836, + "learning_rate": 8.235521467541852e-05, + "loss": 1.4766, + "step": 3965 + }, + { + "epoch": 0.29853779709817646, + "grad_norm": 7.087616443634033, + "learning_rate": 8.234591936019762e-05, + "loss": 2.3395, + "step": 3966 + }, + { + "epoch": 0.29861307137883664, + "grad_norm": 4.480384349822998, + "learning_rate": 8.233662212210461e-05, + "loss": 1.9084, + "step": 3967 + }, + { + "epoch": 0.2986883456594968, + "grad_norm": 4.715414524078369, + "learning_rate": 8.23273229616922e-05, + "loss": 2.1976, + "step": 3968 + }, + { + "epoch": 0.29876361994015693, + "grad_norm": 5.919058799743652, + "learning_rate": 8.231802187951321e-05, + "loss": 1.8901, + "step": 3969 + }, + { + "epoch": 0.2988388942208171, + "grad_norm": 5.027727127075195, + "learning_rate": 8.230871887612054e-05, + "loss": 1.9373, + "step": 3970 + }, + { + "epoch": 0.2989141685014773, + "grad_norm": 5.580582618713379, + "learning_rate": 8.229941395206724e-05, + "loss": 1.9368, + "step": 3971 + }, + { + "epoch": 0.2989894427821374, + "grad_norm": 4.887148857116699, + "learning_rate": 8.229010710790646e-05, + "loss": 1.8637, + "step": 3972 + }, + { + "epoch": 0.2990647170627976, + "grad_norm": 10.317251205444336, + "learning_rate": 8.228079834419145e-05, + "loss": 1.8815, + "step": 3973 + }, + { + "epoch": 0.2991399913434577, + "grad_norm": 6.12963342666626, + "learning_rate": 8.227148766147561e-05, + "loss": 1.9372, + "step": 3974 + }, + { + "epoch": 0.2992152656241179, + "grad_norm": 5.815849781036377, + "learning_rate": 8.226217506031243e-05, + "loss": 1.8988, + "step": 3975 + }, + { + "epoch": 0.29929053990477805, + "grad_norm": 43.546199798583984, + "learning_rate": 8.225286054125552e-05, + "loss": 1.7756, + "step": 3976 + }, + { + "epoch": 0.2993658141854382, + "grad_norm": 4.317806720733643, + "learning_rate": 8.224354410485857e-05, + "loss": 2.0748, + "step": 3977 + }, + { + "epoch": 0.29944108846609835, + "grad_norm": 6.013404846191406, + "learning_rate": 8.223422575167546e-05, + "loss": 2.2538, + "step": 3978 + }, + { + "epoch": 0.2995163627467585, + "grad_norm": 5.0074543952941895, + "learning_rate": 8.222490548226011e-05, + "loss": 1.9222, + "step": 3979 + }, + { + "epoch": 0.29959163702741864, + "grad_norm": 4.372267723083496, + "learning_rate": 8.221558329716661e-05, + "loss": 1.901, + "step": 3980 + }, + { + "epoch": 0.2996669113080788, + "grad_norm": 4.76335334777832, + "learning_rate": 8.220625919694909e-05, + "loss": 1.856, + "step": 3981 + }, + { + "epoch": 0.29974218558873894, + "grad_norm": 5.090167045593262, + "learning_rate": 8.21969331821619e-05, + "loss": 2.1163, + "step": 3982 + }, + { + "epoch": 0.2998174598693991, + "grad_norm": 5.983916759490967, + "learning_rate": 8.21876052533594e-05, + "loss": 2.3146, + "step": 3983 + }, + { + "epoch": 0.2998927341500593, + "grad_norm": 5.023091793060303, + "learning_rate": 8.217827541109612e-05, + "loss": 1.9151, + "step": 3984 + }, + { + "epoch": 0.2999680084307194, + "grad_norm": 8.244074821472168, + "learning_rate": 8.21689436559267e-05, + "loss": 2.22, + "step": 3985 + }, + { + "epoch": 0.3000432827113796, + "grad_norm": 4.760922431945801, + "learning_rate": 8.215960998840591e-05, + "loss": 2.2483, + "step": 3986 + }, + { + "epoch": 0.30011855699203976, + "grad_norm": 5.63252067565918, + "learning_rate": 8.215027440908856e-05, + "loss": 1.9458, + "step": 3987 + }, + { + "epoch": 0.3001938312726999, + "grad_norm": 4.972133636474609, + "learning_rate": 8.214093691852963e-05, + "loss": 2.0652, + "step": 3988 + }, + { + "epoch": 0.30026910555336006, + "grad_norm": 6.361783504486084, + "learning_rate": 8.213159751728424e-05, + "loss": 2.0875, + "step": 3989 + }, + { + "epoch": 0.30034437983402024, + "grad_norm": 5.10275936126709, + "learning_rate": 8.212225620590757e-05, + "loss": 1.8931, + "step": 3990 + }, + { + "epoch": 0.30041965411468036, + "grad_norm": 5.946813106536865, + "learning_rate": 8.211291298495493e-05, + "loss": 2.406, + "step": 3991 + }, + { + "epoch": 0.30049492839534053, + "grad_norm": 8.434186935424805, + "learning_rate": 8.210356785498178e-05, + "loss": 1.9255, + "step": 3992 + }, + { + "epoch": 0.30057020267600065, + "grad_norm": 8.311875343322754, + "learning_rate": 8.209422081654362e-05, + "loss": 2.2633, + "step": 3993 + }, + { + "epoch": 0.30064547695666083, + "grad_norm": 5.228137493133545, + "learning_rate": 8.208487187019614e-05, + "loss": 1.9318, + "step": 3994 + }, + { + "epoch": 0.300720751237321, + "grad_norm": 7.282024383544922, + "learning_rate": 8.207552101649506e-05, + "loss": 1.6633, + "step": 3995 + }, + { + "epoch": 0.3007960255179811, + "grad_norm": 6.58770227432251, + "learning_rate": 8.20661682559963e-05, + "loss": 1.8892, + "step": 3996 + }, + { + "epoch": 0.3008712997986413, + "grad_norm": 5.570758819580078, + "learning_rate": 8.205681358925585e-05, + "loss": 2.2074, + "step": 3997 + }, + { + "epoch": 0.3009465740793015, + "grad_norm": 5.938857555389404, + "learning_rate": 8.204745701682981e-05, + "loss": 2.3226, + "step": 3998 + }, + { + "epoch": 0.3010218483599616, + "grad_norm": 4.770346164703369, + "learning_rate": 8.20380985392744e-05, + "loss": 1.8245, + "step": 3999 + }, + { + "epoch": 0.3010971226406218, + "grad_norm": 6.840176105499268, + "learning_rate": 8.202873815714596e-05, + "loss": 2.0095, + "step": 4000 + }, + { + "epoch": 0.30117239692128195, + "grad_norm": 8.326154708862305, + "learning_rate": 8.201937587100095e-05, + "loss": 1.8612, + "step": 4001 + }, + { + "epoch": 0.30124767120194207, + "grad_norm": 4.345799922943115, + "learning_rate": 8.20100116813959e-05, + "loss": 1.7019, + "step": 4002 + }, + { + "epoch": 0.30132294548260224, + "grad_norm": 5.93306827545166, + "learning_rate": 8.200064558888753e-05, + "loss": 1.9735, + "step": 4003 + }, + { + "epoch": 0.30139821976326236, + "grad_norm": 8.172798156738281, + "learning_rate": 8.199127759403258e-05, + "loss": 1.7691, + "step": 4004 + }, + { + "epoch": 0.30147349404392254, + "grad_norm": 4.271193981170654, + "learning_rate": 8.198190769738797e-05, + "loss": 1.9111, + "step": 4005 + }, + { + "epoch": 0.3015487683245827, + "grad_norm": 4.693656921386719, + "learning_rate": 8.197253589951073e-05, + "loss": 1.9867, + "step": 4006 + }, + { + "epoch": 0.30162404260524284, + "grad_norm": 5.119705677032471, + "learning_rate": 8.196316220095795e-05, + "loss": 1.8506, + "step": 4007 + }, + { + "epoch": 0.301699316885903, + "grad_norm": 5.868583679199219, + "learning_rate": 8.19537866022869e-05, + "loss": 1.74, + "step": 4008 + }, + { + "epoch": 0.3017745911665632, + "grad_norm": 5.550439357757568, + "learning_rate": 8.194440910405491e-05, + "loss": 1.6325, + "step": 4009 + }, + { + "epoch": 0.3018498654472233, + "grad_norm": 5.357812404632568, + "learning_rate": 8.193502970681948e-05, + "loss": 2.2228, + "step": 4010 + }, + { + "epoch": 0.3019251397278835, + "grad_norm": 13.665534973144531, + "learning_rate": 8.192564841113815e-05, + "loss": 2.333, + "step": 4011 + }, + { + "epoch": 0.3020004140085436, + "grad_norm": 4.938703536987305, + "learning_rate": 8.191626521756863e-05, + "loss": 1.8872, + "step": 4012 + }, + { + "epoch": 0.3020756882892038, + "grad_norm": 6.1833930015563965, + "learning_rate": 8.190688012666871e-05, + "loss": 2.3025, + "step": 4013 + }, + { + "epoch": 0.30215096256986396, + "grad_norm": 7.422309875488281, + "learning_rate": 8.189749313899633e-05, + "loss": 2.4025, + "step": 4014 + }, + { + "epoch": 0.3022262368505241, + "grad_norm": 7.6599345207214355, + "learning_rate": 8.188810425510951e-05, + "loss": 1.9873, + "step": 4015 + }, + { + "epoch": 0.30230151113118425, + "grad_norm": 4.375296115875244, + "learning_rate": 8.187871347556638e-05, + "loss": 1.584, + "step": 4016 + }, + { + "epoch": 0.30237678541184443, + "grad_norm": 6.049117088317871, + "learning_rate": 8.186932080092521e-05, + "loss": 2.253, + "step": 4017 + }, + { + "epoch": 0.30245205969250455, + "grad_norm": 9.585309982299805, + "learning_rate": 8.185992623174437e-05, + "loss": 1.9379, + "step": 4018 + }, + { + "epoch": 0.3025273339731647, + "grad_norm": 6.24110221862793, + "learning_rate": 8.185052976858231e-05, + "loss": 1.6389, + "step": 4019 + }, + { + "epoch": 0.3026026082538249, + "grad_norm": 6.152796745300293, + "learning_rate": 8.184113141199766e-05, + "loss": 1.8865, + "step": 4020 + }, + { + "epoch": 0.302677882534485, + "grad_norm": 3.958066701889038, + "learning_rate": 8.183173116254912e-05, + "loss": 1.7768, + "step": 4021 + }, + { + "epoch": 0.3027531568151452, + "grad_norm": 8.104616165161133, + "learning_rate": 8.182232902079548e-05, + "loss": 2.2823, + "step": 4022 + }, + { + "epoch": 0.3028284310958053, + "grad_norm": 5.32259464263916, + "learning_rate": 8.18129249872957e-05, + "loss": 2.415, + "step": 4023 + }, + { + "epoch": 0.3029037053764655, + "grad_norm": 3.92617130279541, + "learning_rate": 8.180351906260879e-05, + "loss": 1.852, + "step": 4024 + }, + { + "epoch": 0.30297897965712567, + "grad_norm": 4.442535400390625, + "learning_rate": 8.179411124729395e-05, + "loss": 1.9233, + "step": 4025 + }, + { + "epoch": 0.3030542539377858, + "grad_norm": 5.805205345153809, + "learning_rate": 8.178470154191041e-05, + "loss": 1.6263, + "step": 4026 + }, + { + "epoch": 0.30312952821844596, + "grad_norm": 4.834534645080566, + "learning_rate": 8.177528994701758e-05, + "loss": 2.1779, + "step": 4027 + }, + { + "epoch": 0.30320480249910614, + "grad_norm": 3.872180223464966, + "learning_rate": 8.17658764631749e-05, + "loss": 1.9189, + "step": 4028 + }, + { + "epoch": 0.30328007677976626, + "grad_norm": 5.443620681762695, + "learning_rate": 8.175646109094205e-05, + "loss": 1.9836, + "step": 4029 + }, + { + "epoch": 0.30335535106042644, + "grad_norm": 4.5822272300720215, + "learning_rate": 8.174704383087868e-05, + "loss": 2.2317, + "step": 4030 + }, + { + "epoch": 0.30343062534108656, + "grad_norm": 4.165945053100586, + "learning_rate": 8.173762468354467e-05, + "loss": 1.9622, + "step": 4031 + }, + { + "epoch": 0.30350589962174673, + "grad_norm": 7.865018844604492, + "learning_rate": 8.172820364949992e-05, + "loss": 2.1418, + "step": 4032 + }, + { + "epoch": 0.3035811739024069, + "grad_norm": 4.647759914398193, + "learning_rate": 8.171878072930451e-05, + "loss": 1.8479, + "step": 4033 + }, + { + "epoch": 0.30365644818306703, + "grad_norm": 6.076625347137451, + "learning_rate": 8.17093559235186e-05, + "loss": 1.8357, + "step": 4034 + }, + { + "epoch": 0.3037317224637272, + "grad_norm": 5.57007360458374, + "learning_rate": 8.169992923270247e-05, + "loss": 1.6405, + "step": 4035 + }, + { + "epoch": 0.3038069967443874, + "grad_norm": 4.96959114074707, + "learning_rate": 8.169050065741648e-05, + "loss": 1.7833, + "step": 4036 + }, + { + "epoch": 0.3038822710250475, + "grad_norm": 4.817521095275879, + "learning_rate": 8.168107019822117e-05, + "loss": 1.6609, + "step": 4037 + }, + { + "epoch": 0.3039575453057077, + "grad_norm": 10.931699752807617, + "learning_rate": 8.167163785567716e-05, + "loss": 2.053, + "step": 4038 + }, + { + "epoch": 0.30403281958636785, + "grad_norm": 5.620011806488037, + "learning_rate": 8.166220363034513e-05, + "loss": 2.351, + "step": 4039 + }, + { + "epoch": 0.304108093867028, + "grad_norm": 5.115849018096924, + "learning_rate": 8.165276752278594e-05, + "loss": 1.4708, + "step": 4040 + }, + { + "epoch": 0.30418336814768815, + "grad_norm": 3.9933152198791504, + "learning_rate": 8.164332953356056e-05, + "loss": 2.0754, + "step": 4041 + }, + { + "epoch": 0.30425864242834827, + "grad_norm": 4.757380485534668, + "learning_rate": 8.163388966323002e-05, + "loss": 1.8448, + "step": 4042 + }, + { + "epoch": 0.30433391670900845, + "grad_norm": 4.821587562561035, + "learning_rate": 8.162444791235552e-05, + "loss": 2.0383, + "step": 4043 + }, + { + "epoch": 0.3044091909896686, + "grad_norm": 6.20637845993042, + "learning_rate": 8.161500428149833e-05, + "loss": 1.8472, + "step": 4044 + }, + { + "epoch": 0.30448446527032874, + "grad_norm": 5.767877578735352, + "learning_rate": 8.160555877121985e-05, + "loss": 1.9185, + "step": 4045 + }, + { + "epoch": 0.3045597395509889, + "grad_norm": 4.792327404022217, + "learning_rate": 8.159611138208157e-05, + "loss": 1.9444, + "step": 4046 + }, + { + "epoch": 0.3046350138316491, + "grad_norm": 5.566361904144287, + "learning_rate": 8.158666211464514e-05, + "loss": 1.997, + "step": 4047 + }, + { + "epoch": 0.3047102881123092, + "grad_norm": 4.853877067565918, + "learning_rate": 8.157721096947229e-05, + "loss": 1.9851, + "step": 4048 + }, + { + "epoch": 0.3047855623929694, + "grad_norm": 14.512285232543945, + "learning_rate": 8.156775794712485e-05, + "loss": 1.8716, + "step": 4049 + }, + { + "epoch": 0.30486083667362956, + "grad_norm": 5.715280055999756, + "learning_rate": 8.155830304816476e-05, + "loss": 2.0179, + "step": 4050 + }, + { + "epoch": 0.3049361109542897, + "grad_norm": 5.824142932891846, + "learning_rate": 8.154884627315412e-05, + "loss": 2.053, + "step": 4051 + }, + { + "epoch": 0.30501138523494986, + "grad_norm": 4.153370380401611, + "learning_rate": 8.153938762265509e-05, + "loss": 1.806, + "step": 4052 + }, + { + "epoch": 0.30508665951561, + "grad_norm": 3.8216633796691895, + "learning_rate": 8.152992709722996e-05, + "loss": 1.905, + "step": 4053 + }, + { + "epoch": 0.30516193379627016, + "grad_norm": 4.8293609619140625, + "learning_rate": 8.152046469744115e-05, + "loss": 1.9197, + "step": 4054 + }, + { + "epoch": 0.30523720807693033, + "grad_norm": 4.6087565422058105, + "learning_rate": 8.151100042385114e-05, + "loss": 2.1521, + "step": 4055 + }, + { + "epoch": 0.30531248235759045, + "grad_norm": 20.550830841064453, + "learning_rate": 8.150153427702256e-05, + "loss": 2.0916, + "step": 4056 + }, + { + "epoch": 0.30538775663825063, + "grad_norm": 5.00862979888916, + "learning_rate": 8.149206625751817e-05, + "loss": 1.9376, + "step": 4057 + }, + { + "epoch": 0.3054630309189108, + "grad_norm": 4.550467014312744, + "learning_rate": 8.14825963659008e-05, + "loss": 1.5024, + "step": 4058 + }, + { + "epoch": 0.3055383051995709, + "grad_norm": 5.312957763671875, + "learning_rate": 8.14731246027334e-05, + "loss": 2.1229, + "step": 4059 + }, + { + "epoch": 0.3056135794802311, + "grad_norm": 4.4095139503479, + "learning_rate": 8.146365096857907e-05, + "loss": 1.8054, + "step": 4060 + }, + { + "epoch": 0.3056888537608912, + "grad_norm": 5.797292232513428, + "learning_rate": 8.145417546400096e-05, + "loss": 1.8571, + "step": 4061 + }, + { + "epoch": 0.3057641280415514, + "grad_norm": 6.129823207855225, + "learning_rate": 8.144469808956235e-05, + "loss": 1.891, + "step": 4062 + }, + { + "epoch": 0.3058394023222116, + "grad_norm": 4.725674152374268, + "learning_rate": 8.143521884582668e-05, + "loss": 2.1579, + "step": 4063 + }, + { + "epoch": 0.3059146766028717, + "grad_norm": 6.563938617706299, + "learning_rate": 8.142573773335745e-05, + "loss": 1.9406, + "step": 4064 + }, + { + "epoch": 0.30598995088353187, + "grad_norm": 4.981137275695801, + "learning_rate": 8.141625475271828e-05, + "loss": 2.1859, + "step": 4065 + }, + { + "epoch": 0.30606522516419205, + "grad_norm": 4.24431848526001, + "learning_rate": 8.140676990447291e-05, + "loss": 1.822, + "step": 4066 + }, + { + "epoch": 0.30614049944485217, + "grad_norm": 4.437074661254883, + "learning_rate": 8.139728318918518e-05, + "loss": 1.5721, + "step": 4067 + }, + { + "epoch": 0.30621577372551234, + "grad_norm": 4.984560966491699, + "learning_rate": 8.138779460741906e-05, + "loss": 2.2105, + "step": 4068 + }, + { + "epoch": 0.3062910480061725, + "grad_norm": 4.792943477630615, + "learning_rate": 8.137830415973862e-05, + "loss": 1.9797, + "step": 4069 + }, + { + "epoch": 0.30636632228683264, + "grad_norm": 6.204592704772949, + "learning_rate": 8.136881184670802e-05, + "loss": 1.9241, + "step": 4070 + }, + { + "epoch": 0.3064415965674928, + "grad_norm": 6.4346022605896, + "learning_rate": 8.135931766889156e-05, + "loss": 1.7799, + "step": 4071 + }, + { + "epoch": 0.30651687084815293, + "grad_norm": 5.862424850463867, + "learning_rate": 8.134982162685365e-05, + "loss": 1.6179, + "step": 4072 + }, + { + "epoch": 0.3065921451288131, + "grad_norm": 4.588531494140625, + "learning_rate": 8.134032372115881e-05, + "loss": 1.9102, + "step": 4073 + }, + { + "epoch": 0.3066674194094733, + "grad_norm": 7.175743579864502, + "learning_rate": 8.133082395237163e-05, + "loss": 2.0638, + "step": 4074 + }, + { + "epoch": 0.3067426936901334, + "grad_norm": 5.835910797119141, + "learning_rate": 8.132132232105686e-05, + "loss": 2.6901, + "step": 4075 + }, + { + "epoch": 0.3068179679707936, + "grad_norm": 4.879258632659912, + "learning_rate": 8.131181882777938e-05, + "loss": 1.9987, + "step": 4076 + }, + { + "epoch": 0.30689324225145376, + "grad_norm": 4.265342712402344, + "learning_rate": 8.130231347310408e-05, + "loss": 1.9826, + "step": 4077 + }, + { + "epoch": 0.3069685165321139, + "grad_norm": 4.6916704177856445, + "learning_rate": 8.129280625759608e-05, + "loss": 2.0871, + "step": 4078 + }, + { + "epoch": 0.30704379081277405, + "grad_norm": 9.7210111618042, + "learning_rate": 8.128329718182052e-05, + "loss": 1.7594, + "step": 4079 + }, + { + "epoch": 0.3071190650934342, + "grad_norm": 5.560863971710205, + "learning_rate": 8.127378624634272e-05, + "loss": 2.2045, + "step": 4080 + }, + { + "epoch": 0.30719433937409435, + "grad_norm": 16.36822509765625, + "learning_rate": 8.126427345172806e-05, + "loss": 2.2942, + "step": 4081 + }, + { + "epoch": 0.3072696136547545, + "grad_norm": 5.423416614532471, + "learning_rate": 8.125475879854206e-05, + "loss": 1.6173, + "step": 4082 + }, + { + "epoch": 0.30734488793541465, + "grad_norm": 3.740196704864502, + "learning_rate": 8.124524228735031e-05, + "loss": 1.6365, + "step": 4083 + }, + { + "epoch": 0.3074201622160748, + "grad_norm": 5.55363655090332, + "learning_rate": 8.123572391871857e-05, + "loss": 1.907, + "step": 4084 + }, + { + "epoch": 0.307495436496735, + "grad_norm": 3.603593587875366, + "learning_rate": 8.122620369321267e-05, + "loss": 2.0254, + "step": 4085 + }, + { + "epoch": 0.3075707107773951, + "grad_norm": 5.502161979675293, + "learning_rate": 8.121668161139855e-05, + "loss": 1.9084, + "step": 4086 + }, + { + "epoch": 0.3076459850580553, + "grad_norm": 4.5866289138793945, + "learning_rate": 8.120715767384228e-05, + "loss": 1.9812, + "step": 4087 + }, + { + "epoch": 0.30772125933871547, + "grad_norm": 4.779272079467773, + "learning_rate": 8.119763188111002e-05, + "loss": 1.8406, + "step": 4088 + }, + { + "epoch": 0.3077965336193756, + "grad_norm": 4.330272674560547, + "learning_rate": 8.118810423376808e-05, + "loss": 1.627, + "step": 4089 + }, + { + "epoch": 0.30787180790003577, + "grad_norm": 5.465154647827148, + "learning_rate": 8.117857473238283e-05, + "loss": 2.2803, + "step": 4090 + }, + { + "epoch": 0.3079470821806959, + "grad_norm": 3.9878289699554443, + "learning_rate": 8.116904337752077e-05, + "loss": 1.9738, + "step": 4091 + }, + { + "epoch": 0.30802235646135606, + "grad_norm": 3.970031261444092, + "learning_rate": 8.115951016974851e-05, + "loss": 1.9696, + "step": 4092 + }, + { + "epoch": 0.30809763074201624, + "grad_norm": 4.773908615112305, + "learning_rate": 8.114997510963277e-05, + "loss": 1.8754, + "step": 4093 + }, + { + "epoch": 0.30817290502267636, + "grad_norm": 5.6028971672058105, + "learning_rate": 8.114043819774041e-05, + "loss": 1.8625, + "step": 4094 + }, + { + "epoch": 0.30824817930333653, + "grad_norm": 5.135190010070801, + "learning_rate": 8.113089943463834e-05, + "loss": 2.0212, + "step": 4095 + }, + { + "epoch": 0.3083234535839967, + "grad_norm": 5.350632190704346, + "learning_rate": 8.112135882089362e-05, + "loss": 1.9573, + "step": 4096 + }, + { + "epoch": 0.30839872786465683, + "grad_norm": 5.993723392486572, + "learning_rate": 8.111181635707343e-05, + "loss": 1.9924, + "step": 4097 + }, + { + "epoch": 0.308474002145317, + "grad_norm": 4.4464850425720215, + "learning_rate": 8.1102272043745e-05, + "loss": 2.0147, + "step": 4098 + }, + { + "epoch": 0.3085492764259772, + "grad_norm": 5.06343412399292, + "learning_rate": 8.109272588147578e-05, + "loss": 1.9729, + "step": 4099 + }, + { + "epoch": 0.3086245507066373, + "grad_norm": 4.458175182342529, + "learning_rate": 8.10831778708332e-05, + "loss": 1.8192, + "step": 4100 + }, + { + "epoch": 0.3086998249872975, + "grad_norm": 11.6445951461792, + "learning_rate": 8.107362801238487e-05, + "loss": 1.9158, + "step": 4101 + }, + { + "epoch": 0.3087750992679576, + "grad_norm": 4.281254291534424, + "learning_rate": 8.106407630669852e-05, + "loss": 1.7889, + "step": 4102 + }, + { + "epoch": 0.3088503735486178, + "grad_norm": 5.1972479820251465, + "learning_rate": 8.105452275434197e-05, + "loss": 1.9563, + "step": 4103 + }, + { + "epoch": 0.30892564782927795, + "grad_norm": 4.518148899078369, + "learning_rate": 8.104496735588316e-05, + "loss": 1.8562, + "step": 4104 + }, + { + "epoch": 0.30900092210993807, + "grad_norm": 5.564634323120117, + "learning_rate": 8.103541011189011e-05, + "loss": 1.8323, + "step": 4105 + }, + { + "epoch": 0.30907619639059825, + "grad_norm": 5.633904457092285, + "learning_rate": 8.102585102293098e-05, + "loss": 2.0392, + "step": 4106 + }, + { + "epoch": 0.3091514706712584, + "grad_norm": 6.375011444091797, + "learning_rate": 8.101629008957402e-05, + "loss": 1.8462, + "step": 4107 + }, + { + "epoch": 0.30922674495191854, + "grad_norm": 5.427424907684326, + "learning_rate": 8.100672731238762e-05, + "loss": 2.1734, + "step": 4108 + }, + { + "epoch": 0.3093020192325787, + "grad_norm": 5.865309715270996, + "learning_rate": 8.099716269194025e-05, + "loss": 1.7802, + "step": 4109 + }, + { + "epoch": 0.30937729351323884, + "grad_norm": 5.993786811828613, + "learning_rate": 8.09875962288005e-05, + "loss": 2.2357, + "step": 4110 + }, + { + "epoch": 0.309452567793899, + "grad_norm": 6.211700439453125, + "learning_rate": 8.097802792353705e-05, + "loss": 1.9379, + "step": 4111 + }, + { + "epoch": 0.3095278420745592, + "grad_norm": 4.29275369644165, + "learning_rate": 8.096845777671874e-05, + "loss": 1.7731, + "step": 4112 + }, + { + "epoch": 0.3096031163552193, + "grad_norm": 4.467304706573486, + "learning_rate": 8.095888578891449e-05, + "loss": 1.8467, + "step": 4113 + }, + { + "epoch": 0.3096783906358795, + "grad_norm": 4.096654415130615, + "learning_rate": 8.094931196069328e-05, + "loss": 2.0556, + "step": 4114 + }, + { + "epoch": 0.30975366491653966, + "grad_norm": 5.9918670654296875, + "learning_rate": 8.09397362926243e-05, + "loss": 1.9033, + "step": 4115 + }, + { + "epoch": 0.3098289391971998, + "grad_norm": 6.560250759124756, + "learning_rate": 8.093015878527676e-05, + "loss": 1.8551, + "step": 4116 + }, + { + "epoch": 0.30990421347785996, + "grad_norm": 4.530459880828857, + "learning_rate": 8.092057943922004e-05, + "loss": 1.9949, + "step": 4117 + }, + { + "epoch": 0.30997948775852013, + "grad_norm": 5.376959800720215, + "learning_rate": 8.091099825502359e-05, + "loss": 1.8608, + "step": 4118 + }, + { + "epoch": 0.31005476203918025, + "grad_norm": 5.280556678771973, + "learning_rate": 8.090141523325699e-05, + "loss": 2.0554, + "step": 4119 + }, + { + "epoch": 0.31013003631984043, + "grad_norm": 4.97694730758667, + "learning_rate": 8.089183037448991e-05, + "loss": 1.9863, + "step": 4120 + }, + { + "epoch": 0.31020531060050055, + "grad_norm": 4.214208602905273, + "learning_rate": 8.088224367929216e-05, + "loss": 1.7426, + "step": 4121 + }, + { + "epoch": 0.3102805848811607, + "grad_norm": 7.379026889801025, + "learning_rate": 8.087265514823362e-05, + "loss": 2.5219, + "step": 4122 + }, + { + "epoch": 0.3103558591618209, + "grad_norm": 5.828351020812988, + "learning_rate": 8.086306478188434e-05, + "loss": 1.9479, + "step": 4123 + }, + { + "epoch": 0.310431133442481, + "grad_norm": 6.026956558227539, + "learning_rate": 8.08534725808144e-05, + "loss": 2.1447, + "step": 4124 + }, + { + "epoch": 0.3105064077231412, + "grad_norm": 4.609555244445801, + "learning_rate": 8.084387854559403e-05, + "loss": 2.0614, + "step": 4125 + }, + { + "epoch": 0.3105816820038014, + "grad_norm": 3.7196404933929443, + "learning_rate": 8.08342826767936e-05, + "loss": 2.0403, + "step": 4126 + }, + { + "epoch": 0.3106569562844615, + "grad_norm": 5.789193153381348, + "learning_rate": 8.082468497498355e-05, + "loss": 2.0174, + "step": 4127 + }, + { + "epoch": 0.31073223056512167, + "grad_norm": 5.781473636627197, + "learning_rate": 8.08150854407344e-05, + "loss": 1.9081, + "step": 4128 + }, + { + "epoch": 0.3108075048457818, + "grad_norm": 4.734216213226318, + "learning_rate": 8.080548407461684e-05, + "loss": 1.6416, + "step": 4129 + }, + { + "epoch": 0.31088277912644197, + "grad_norm": 4.8850531578063965, + "learning_rate": 8.079588087720165e-05, + "loss": 1.9766, + "step": 4130 + }, + { + "epoch": 0.31095805340710214, + "grad_norm": 4.453812599182129, + "learning_rate": 8.078627584905971e-05, + "loss": 2.0357, + "step": 4131 + }, + { + "epoch": 0.31103332768776226, + "grad_norm": 6.009422302246094, + "learning_rate": 8.077666899076201e-05, + "loss": 2.157, + "step": 4132 + }, + { + "epoch": 0.31110860196842244, + "grad_norm": 9.53534984588623, + "learning_rate": 8.076706030287964e-05, + "loss": 2.1069, + "step": 4133 + }, + { + "epoch": 0.3111838762490826, + "grad_norm": 6.668169021606445, + "learning_rate": 8.075744978598381e-05, + "loss": 2.4968, + "step": 4134 + }, + { + "epoch": 0.31125915052974273, + "grad_norm": 4.888542652130127, + "learning_rate": 8.074783744064585e-05, + "loss": 2.0425, + "step": 4135 + }, + { + "epoch": 0.3113344248104029, + "grad_norm": 4.87699556350708, + "learning_rate": 8.073822326743718e-05, + "loss": 1.9302, + "step": 4136 + }, + { + "epoch": 0.3114096990910631, + "grad_norm": 5.116543769836426, + "learning_rate": 8.072860726692933e-05, + "loss": 1.683, + "step": 4137 + }, + { + "epoch": 0.3114849733717232, + "grad_norm": 3.7127389907836914, + "learning_rate": 8.071898943969396e-05, + "loss": 1.6109, + "step": 4138 + }, + { + "epoch": 0.3115602476523834, + "grad_norm": 6.477364540100098, + "learning_rate": 8.070936978630279e-05, + "loss": 1.7584, + "step": 4139 + }, + { + "epoch": 0.3116355219330435, + "grad_norm": 6.598007678985596, + "learning_rate": 8.069974830732772e-05, + "loss": 2.0366, + "step": 4140 + }, + { + "epoch": 0.3117107962137037, + "grad_norm": 5.051046371459961, + "learning_rate": 8.069012500334069e-05, + "loss": 1.9707, + "step": 4141 + }, + { + "epoch": 0.31178607049436385, + "grad_norm": 6.309785842895508, + "learning_rate": 8.06804998749138e-05, + "loss": 2.3291, + "step": 4142 + }, + { + "epoch": 0.311861344775024, + "grad_norm": 10.376910209655762, + "learning_rate": 8.067087292261921e-05, + "loss": 1.9281, + "step": 4143 + }, + { + "epoch": 0.31193661905568415, + "grad_norm": 4.858513832092285, + "learning_rate": 8.066124414702922e-05, + "loss": 1.6067, + "step": 4144 + }, + { + "epoch": 0.3120118933363443, + "grad_norm": 5.378974914550781, + "learning_rate": 8.065161354871627e-05, + "loss": 2.1062, + "step": 4145 + }, + { + "epoch": 0.31208716761700445, + "grad_norm": 4.165724277496338, + "learning_rate": 8.064198112825284e-05, + "loss": 1.9316, + "step": 4146 + }, + { + "epoch": 0.3121624418976646, + "grad_norm": 6.173158168792725, + "learning_rate": 8.063234688621152e-05, + "loss": 1.7153, + "step": 4147 + }, + { + "epoch": 0.3122377161783248, + "grad_norm": 4.256198406219482, + "learning_rate": 8.062271082316511e-05, + "loss": 2.0153, + "step": 4148 + }, + { + "epoch": 0.3123129904589849, + "grad_norm": 5.796307563781738, + "learning_rate": 8.06130729396864e-05, + "loss": 1.9214, + "step": 4149 + }, + { + "epoch": 0.3123882647396451, + "grad_norm": 4.783032417297363, + "learning_rate": 8.060343323634834e-05, + "loss": 2.2751, + "step": 4150 + }, + { + "epoch": 0.3124635390203052, + "grad_norm": 4.99452543258667, + "learning_rate": 8.059379171372399e-05, + "loss": 1.9327, + "step": 4151 + }, + { + "epoch": 0.3125388133009654, + "grad_norm": 4.076670169830322, + "learning_rate": 8.058414837238648e-05, + "loss": 1.5504, + "step": 4152 + }, + { + "epoch": 0.31261408758162557, + "grad_norm": 4.924299240112305, + "learning_rate": 8.057450321290913e-05, + "loss": 1.7901, + "step": 4153 + }, + { + "epoch": 0.3126893618622857, + "grad_norm": 5.414648056030273, + "learning_rate": 8.056485623586529e-05, + "loss": 2.0169, + "step": 4154 + }, + { + "epoch": 0.31276463614294586, + "grad_norm": 4.750243186950684, + "learning_rate": 8.055520744182845e-05, + "loss": 1.7254, + "step": 4155 + }, + { + "epoch": 0.31283991042360604, + "grad_norm": 4.201061725616455, + "learning_rate": 8.05455568313722e-05, + "loss": 2.1101, + "step": 4156 + }, + { + "epoch": 0.31291518470426616, + "grad_norm": 6.51814079284668, + "learning_rate": 8.053590440507022e-05, + "loss": 2.398, + "step": 4157 + }, + { + "epoch": 0.31299045898492633, + "grad_norm": 4.8121442794799805, + "learning_rate": 8.052625016349636e-05, + "loss": 1.8656, + "step": 4158 + }, + { + "epoch": 0.31306573326558645, + "grad_norm": 4.739930152893066, + "learning_rate": 8.05165941072245e-05, + "loss": 1.7758, + "step": 4159 + }, + { + "epoch": 0.31314100754624663, + "grad_norm": 5.414173126220703, + "learning_rate": 8.05069362368287e-05, + "loss": 1.6628, + "step": 4160 + }, + { + "epoch": 0.3132162818269068, + "grad_norm": 5.0939106941223145, + "learning_rate": 8.049727655288308e-05, + "loss": 1.6319, + "step": 4161 + }, + { + "epoch": 0.3132915561075669, + "grad_norm": 5.440403938293457, + "learning_rate": 8.048761505596185e-05, + "loss": 2.1747, + "step": 4162 + }, + { + "epoch": 0.3133668303882271, + "grad_norm": 5.314422607421875, + "learning_rate": 8.047795174663941e-05, + "loss": 1.7695, + "step": 4163 + }, + { + "epoch": 0.3134421046688873, + "grad_norm": 6.4251298904418945, + "learning_rate": 8.046828662549018e-05, + "loss": 2.0504, + "step": 4164 + }, + { + "epoch": 0.3135173789495474, + "grad_norm": 4.187405109405518, + "learning_rate": 8.045861969308872e-05, + "loss": 1.8168, + "step": 4165 + }, + { + "epoch": 0.3135926532302076, + "grad_norm": 5.1924052238464355, + "learning_rate": 8.044895095000972e-05, + "loss": 1.9361, + "step": 4166 + }, + { + "epoch": 0.31366792751086775, + "grad_norm": 5.534804821014404, + "learning_rate": 8.043928039682798e-05, + "loss": 1.9739, + "step": 4167 + }, + { + "epoch": 0.31374320179152787, + "grad_norm": 4.467015743255615, + "learning_rate": 8.042960803411835e-05, + "loss": 2.0965, + "step": 4168 + }, + { + "epoch": 0.31381847607218805, + "grad_norm": 5.400954246520996, + "learning_rate": 8.041993386245581e-05, + "loss": 2.1307, + "step": 4169 + }, + { + "epoch": 0.31389375035284817, + "grad_norm": 5.738590240478516, + "learning_rate": 8.04102578824155e-05, + "loss": 2.2427, + "step": 4170 + }, + { + "epoch": 0.31396902463350834, + "grad_norm": 4.002321720123291, + "learning_rate": 8.040058009457263e-05, + "loss": 1.9551, + "step": 4171 + }, + { + "epoch": 0.3140442989141685, + "grad_norm": 7.204328536987305, + "learning_rate": 8.039090049950249e-05, + "loss": 2.5783, + "step": 4172 + }, + { + "epoch": 0.31411957319482864, + "grad_norm": 5.399503231048584, + "learning_rate": 8.038121909778052e-05, + "loss": 1.8442, + "step": 4173 + }, + { + "epoch": 0.3141948474754888, + "grad_norm": 6.373347759246826, + "learning_rate": 8.037153588998224e-05, + "loss": 2.0175, + "step": 4174 + }, + { + "epoch": 0.314270121756149, + "grad_norm": 8.149158477783203, + "learning_rate": 8.036185087668329e-05, + "loss": 2.1099, + "step": 4175 + }, + { + "epoch": 0.3143453960368091, + "grad_norm": 4.7030720710754395, + "learning_rate": 8.035216405845945e-05, + "loss": 2.2035, + "step": 4176 + }, + { + "epoch": 0.3144206703174693, + "grad_norm": 5.598696231842041, + "learning_rate": 8.034247543588653e-05, + "loss": 2.0627, + "step": 4177 + }, + { + "epoch": 0.3144959445981294, + "grad_norm": 6.755463600158691, + "learning_rate": 8.033278500954051e-05, + "loss": 1.8745, + "step": 4178 + }, + { + "epoch": 0.3145712188787896, + "grad_norm": 7.252674102783203, + "learning_rate": 8.032309277999745e-05, + "loss": 2.1572, + "step": 4179 + }, + { + "epoch": 0.31464649315944976, + "grad_norm": 4.6608662605285645, + "learning_rate": 8.031339874783354e-05, + "loss": 1.7747, + "step": 4180 + }, + { + "epoch": 0.3147217674401099, + "grad_norm": 3.8207948207855225, + "learning_rate": 8.030370291362506e-05, + "loss": 2.0106, + "step": 4181 + }, + { + "epoch": 0.31479704172077005, + "grad_norm": 4.81209135055542, + "learning_rate": 8.029400527794837e-05, + "loss": 1.9476, + "step": 4182 + }, + { + "epoch": 0.31487231600143023, + "grad_norm": 4.371166229248047, + "learning_rate": 8.028430584138e-05, + "loss": 1.9218, + "step": 4183 + }, + { + "epoch": 0.31494759028209035, + "grad_norm": 4.531381607055664, + "learning_rate": 8.027460460449657e-05, + "loss": 1.9316, + "step": 4184 + }, + { + "epoch": 0.3150228645627505, + "grad_norm": 4.682920932769775, + "learning_rate": 8.026490156787473e-05, + "loss": 2.1219, + "step": 4185 + }, + { + "epoch": 0.3150981388434107, + "grad_norm": 5.957268714904785, + "learning_rate": 8.025519673209135e-05, + "loss": 1.7252, + "step": 4186 + }, + { + "epoch": 0.3151734131240708, + "grad_norm": 7.825656890869141, + "learning_rate": 8.024549009772331e-05, + "loss": 1.6832, + "step": 4187 + }, + { + "epoch": 0.315248687404731, + "grad_norm": 4.485478401184082, + "learning_rate": 8.023578166534769e-05, + "loss": 2.0064, + "step": 4188 + }, + { + "epoch": 0.3153239616853911, + "grad_norm": 4.233632564544678, + "learning_rate": 8.022607143554159e-05, + "loss": 1.6843, + "step": 4189 + }, + { + "epoch": 0.3153992359660513, + "grad_norm": 4.16924524307251, + "learning_rate": 8.02163594088823e-05, + "loss": 1.9498, + "step": 4190 + }, + { + "epoch": 0.31547451024671147, + "grad_norm": 6.420746803283691, + "learning_rate": 8.020664558594711e-05, + "loss": 1.9689, + "step": 4191 + }, + { + "epoch": 0.3155497845273716, + "grad_norm": 5.494634628295898, + "learning_rate": 8.019692996731354e-05, + "loss": 1.7451, + "step": 4192 + }, + { + "epoch": 0.31562505880803177, + "grad_norm": 3.9469048976898193, + "learning_rate": 8.018721255355911e-05, + "loss": 1.9535, + "step": 4193 + }, + { + "epoch": 0.31570033308869194, + "grad_norm": 7.2983784675598145, + "learning_rate": 8.017749334526152e-05, + "loss": 2.2216, + "step": 4194 + }, + { + "epoch": 0.31577560736935206, + "grad_norm": 4.404552459716797, + "learning_rate": 8.016777234299854e-05, + "loss": 1.8578, + "step": 4195 + }, + { + "epoch": 0.31585088165001224, + "grad_norm": 4.107496738433838, + "learning_rate": 8.015804954734805e-05, + "loss": 2.1892, + "step": 4196 + }, + { + "epoch": 0.3159261559306724, + "grad_norm": 4.96108865737915, + "learning_rate": 8.014832495888804e-05, + "loss": 2.0827, + "step": 4197 + }, + { + "epoch": 0.31600143021133253, + "grad_norm": 4.61102294921875, + "learning_rate": 8.013859857819664e-05, + "loss": 1.6061, + "step": 4198 + }, + { + "epoch": 0.3160767044919927, + "grad_norm": 4.23618745803833, + "learning_rate": 8.012887040585202e-05, + "loss": 2.013, + "step": 4199 + }, + { + "epoch": 0.31615197877265283, + "grad_norm": 4.68858528137207, + "learning_rate": 8.01191404424325e-05, + "loss": 1.956, + "step": 4200 + }, + { + "epoch": 0.316227253053313, + "grad_norm": 7.864339828491211, + "learning_rate": 8.01094086885165e-05, + "loss": 1.7851, + "step": 4201 + }, + { + "epoch": 0.3163025273339732, + "grad_norm": 5.506764888763428, + "learning_rate": 8.009967514468254e-05, + "loss": 1.8027, + "step": 4202 + }, + { + "epoch": 0.3163778016146333, + "grad_norm": 4.274631023406982, + "learning_rate": 8.008993981150928e-05, + "loss": 1.8115, + "step": 4203 + }, + { + "epoch": 0.3164530758952935, + "grad_norm": 4.904685974121094, + "learning_rate": 8.008020268957543e-05, + "loss": 2.0908, + "step": 4204 + }, + { + "epoch": 0.31652835017595365, + "grad_norm": 6.7580790519714355, + "learning_rate": 8.007046377945983e-05, + "loss": 2.2388, + "step": 4205 + }, + { + "epoch": 0.3166036244566138, + "grad_norm": 6.1108479499816895, + "learning_rate": 8.006072308174144e-05, + "loss": 1.7514, + "step": 4206 + }, + { + "epoch": 0.31667889873727395, + "grad_norm": 6.036064147949219, + "learning_rate": 8.005098059699932e-05, + "loss": 1.7028, + "step": 4207 + }, + { + "epoch": 0.31675417301793407, + "grad_norm": 5.992396831512451, + "learning_rate": 8.004123632581264e-05, + "loss": 1.8413, + "step": 4208 + }, + { + "epoch": 0.31682944729859425, + "grad_norm": 4.020727634429932, + "learning_rate": 8.003149026876064e-05, + "loss": 1.749, + "step": 4209 + }, + { + "epoch": 0.3169047215792544, + "grad_norm": 5.164257526397705, + "learning_rate": 8.002174242642273e-05, + "loss": 2.1207, + "step": 4210 + }, + { + "epoch": 0.31697999585991454, + "grad_norm": 5.45728874206543, + "learning_rate": 8.001199279937837e-05, + "loss": 1.7145, + "step": 4211 + }, + { + "epoch": 0.3170552701405747, + "grad_norm": 6.363954544067383, + "learning_rate": 8.000224138820715e-05, + "loss": 1.968, + "step": 4212 + }, + { + "epoch": 0.3171305444212349, + "grad_norm": 5.200278282165527, + "learning_rate": 7.999248819348877e-05, + "loss": 2.0076, + "step": 4213 + }, + { + "epoch": 0.317205818701895, + "grad_norm": 4.892804145812988, + "learning_rate": 7.998273321580302e-05, + "loss": 2.091, + "step": 4214 + }, + { + "epoch": 0.3172810929825552, + "grad_norm": 6.6856160163879395, + "learning_rate": 7.99729764557298e-05, + "loss": 1.9064, + "step": 4215 + }, + { + "epoch": 0.31735636726321537, + "grad_norm": 7.898743152618408, + "learning_rate": 7.996321791384915e-05, + "loss": 2.5648, + "step": 4216 + }, + { + "epoch": 0.3174316415438755, + "grad_norm": 6.225060939788818, + "learning_rate": 7.995345759074116e-05, + "loss": 1.8991, + "step": 4217 + }, + { + "epoch": 0.31750691582453566, + "grad_norm": 4.233827590942383, + "learning_rate": 7.994369548698608e-05, + "loss": 2.1027, + "step": 4218 + }, + { + "epoch": 0.3175821901051958, + "grad_norm": 4.947965145111084, + "learning_rate": 7.993393160316421e-05, + "loss": 1.9304, + "step": 4219 + }, + { + "epoch": 0.31765746438585596, + "grad_norm": 5.642895698547363, + "learning_rate": 7.992416593985599e-05, + "loss": 2.0962, + "step": 4220 + }, + { + "epoch": 0.31773273866651613, + "grad_norm": 6.483269691467285, + "learning_rate": 7.991439849764196e-05, + "loss": 2.1284, + "step": 4221 + }, + { + "epoch": 0.31780801294717625, + "grad_norm": 5.053190231323242, + "learning_rate": 7.990462927710278e-05, + "loss": 2.1316, + "step": 4222 + }, + { + "epoch": 0.31788328722783643, + "grad_norm": 14.06329345703125, + "learning_rate": 7.98948582788192e-05, + "loss": 2.515, + "step": 4223 + }, + { + "epoch": 0.3179585615084966, + "grad_norm": 4.306605815887451, + "learning_rate": 7.988508550337207e-05, + "loss": 1.7845, + "step": 4224 + }, + { + "epoch": 0.3180338357891567, + "grad_norm": 4.135372638702393, + "learning_rate": 7.987531095134236e-05, + "loss": 1.6746, + "step": 4225 + }, + { + "epoch": 0.3181091100698169, + "grad_norm": 5.3160929679870605, + "learning_rate": 7.986553462331114e-05, + "loss": 2.0002, + "step": 4226 + }, + { + "epoch": 0.318184384350477, + "grad_norm": 4.564939022064209, + "learning_rate": 7.985575651985957e-05, + "loss": 1.9668, + "step": 4227 + }, + { + "epoch": 0.3182596586311372, + "grad_norm": 4.8114213943481445, + "learning_rate": 7.984597664156895e-05, + "loss": 1.8584, + "step": 4228 + }, + { + "epoch": 0.3183349329117974, + "grad_norm": 3.955514430999756, + "learning_rate": 7.983619498902064e-05, + "loss": 1.861, + "step": 4229 + }, + { + "epoch": 0.3184102071924575, + "grad_norm": 4.580810546875, + "learning_rate": 7.982641156279615e-05, + "loss": 2.0389, + "step": 4230 + }, + { + "epoch": 0.31848548147311767, + "grad_norm": 4.941291809082031, + "learning_rate": 7.98166263634771e-05, + "loss": 2.4105, + "step": 4231 + }, + { + "epoch": 0.31856075575377785, + "grad_norm": 4.099060535430908, + "learning_rate": 7.980683939164515e-05, + "loss": 2.1526, + "step": 4232 + }, + { + "epoch": 0.31863603003443797, + "grad_norm": 3.2472481727600098, + "learning_rate": 7.979705064788212e-05, + "loss": 2.0892, + "step": 4233 + }, + { + "epoch": 0.31871130431509814, + "grad_norm": 3.9895503520965576, + "learning_rate": 7.978726013276994e-05, + "loss": 2.0634, + "step": 4234 + }, + { + "epoch": 0.3187865785957583, + "grad_norm": 3.8009819984436035, + "learning_rate": 7.97774678468906e-05, + "loss": 1.892, + "step": 4235 + }, + { + "epoch": 0.31886185287641844, + "grad_norm": 4.887508392333984, + "learning_rate": 7.976767379082623e-05, + "loss": 2.1029, + "step": 4236 + }, + { + "epoch": 0.3189371271570786, + "grad_norm": 4.2729997634887695, + "learning_rate": 7.975787796515907e-05, + "loss": 2.0362, + "step": 4237 + }, + { + "epoch": 0.31901240143773874, + "grad_norm": 4.386436939239502, + "learning_rate": 7.974808037047146e-05, + "loss": 1.6816, + "step": 4238 + }, + { + "epoch": 0.3190876757183989, + "grad_norm": 3.599407196044922, + "learning_rate": 7.973828100734584e-05, + "loss": 1.9369, + "step": 4239 + }, + { + "epoch": 0.3191629499990591, + "grad_norm": 5.868585109710693, + "learning_rate": 7.972847987636471e-05, + "loss": 2.0648, + "step": 4240 + }, + { + "epoch": 0.3192382242797192, + "grad_norm": 5.061997413635254, + "learning_rate": 7.971867697811079e-05, + "loss": 1.6749, + "step": 4241 + }, + { + "epoch": 0.3193134985603794, + "grad_norm": 4.263599872589111, + "learning_rate": 7.970887231316677e-05, + "loss": 1.6256, + "step": 4242 + }, + { + "epoch": 0.31938877284103956, + "grad_norm": 6.188817977905273, + "learning_rate": 7.969906588211554e-05, + "loss": 1.8375, + "step": 4243 + }, + { + "epoch": 0.3194640471216997, + "grad_norm": 7.300205230712891, + "learning_rate": 7.968925768554005e-05, + "loss": 2.1332, + "step": 4244 + }, + { + "epoch": 0.31953932140235985, + "grad_norm": 5.567032337188721, + "learning_rate": 7.96794477240234e-05, + "loss": 1.8722, + "step": 4245 + }, + { + "epoch": 0.31961459568302003, + "grad_norm": 4.394096851348877, + "learning_rate": 7.966963599814874e-05, + "loss": 1.8188, + "step": 4246 + }, + { + "epoch": 0.31968986996368015, + "grad_norm": 4.94636869430542, + "learning_rate": 7.965982250849934e-05, + "loss": 2.2043, + "step": 4247 + }, + { + "epoch": 0.3197651442443403, + "grad_norm": 4.666872978210449, + "learning_rate": 7.965000725565861e-05, + "loss": 2.0967, + "step": 4248 + }, + { + "epoch": 0.31984041852500045, + "grad_norm": 7.000183582305908, + "learning_rate": 7.964019024021001e-05, + "loss": 1.7779, + "step": 4249 + }, + { + "epoch": 0.3199156928056606, + "grad_norm": 5.916597366333008, + "learning_rate": 7.963037146273715e-05, + "loss": 1.9934, + "step": 4250 + }, + { + "epoch": 0.3199909670863208, + "grad_norm": 6.331494331359863, + "learning_rate": 7.962055092382372e-05, + "loss": 1.717, + "step": 4251 + }, + { + "epoch": 0.3200662413669809, + "grad_norm": 4.477627754211426, + "learning_rate": 7.961072862405354e-05, + "loss": 1.7684, + "step": 4252 + }, + { + "epoch": 0.3201415156476411, + "grad_norm": 4.503161907196045, + "learning_rate": 7.96009045640105e-05, + "loss": 2.0909, + "step": 4253 + }, + { + "epoch": 0.32021678992830127, + "grad_norm": 4.416711330413818, + "learning_rate": 7.959107874427863e-05, + "loss": 1.8588, + "step": 4254 + }, + { + "epoch": 0.3202920642089614, + "grad_norm": 5.016046524047852, + "learning_rate": 7.958125116544201e-05, + "loss": 2.0138, + "step": 4255 + }, + { + "epoch": 0.32036733848962157, + "grad_norm": 5.3633713722229, + "learning_rate": 7.95714218280849e-05, + "loss": 2.157, + "step": 4256 + }, + { + "epoch": 0.3204426127702817, + "grad_norm": 3.5470776557922363, + "learning_rate": 7.956159073279161e-05, + "loss": 1.8134, + "step": 4257 + }, + { + "epoch": 0.32051788705094186, + "grad_norm": 5.646305561065674, + "learning_rate": 7.955175788014658e-05, + "loss": 1.9291, + "step": 4258 + }, + { + "epoch": 0.32059316133160204, + "grad_norm": 4.526952266693115, + "learning_rate": 7.954192327073434e-05, + "loss": 1.5957, + "step": 4259 + }, + { + "epoch": 0.32066843561226216, + "grad_norm": 4.84018611907959, + "learning_rate": 7.953208690513952e-05, + "loss": 1.6235, + "step": 4260 + }, + { + "epoch": 0.32074370989292234, + "grad_norm": 5.1198410987854, + "learning_rate": 7.952224878394687e-05, + "loss": 2.012, + "step": 4261 + }, + { + "epoch": 0.3208189841735825, + "grad_norm": 3.514918088912964, + "learning_rate": 7.951240890774124e-05, + "loss": 2.1589, + "step": 4262 + }, + { + "epoch": 0.32089425845424263, + "grad_norm": 6.791342735290527, + "learning_rate": 7.950256727710758e-05, + "loss": 1.4098, + "step": 4263 + }, + { + "epoch": 0.3209695327349028, + "grad_norm": 4.368245601654053, + "learning_rate": 7.949272389263096e-05, + "loss": 2.2477, + "step": 4264 + }, + { + "epoch": 0.321044807015563, + "grad_norm": 6.970037937164307, + "learning_rate": 7.948287875489651e-05, + "loss": 1.8916, + "step": 4265 + }, + { + "epoch": 0.3211200812962231, + "grad_norm": 7.556192398071289, + "learning_rate": 7.947303186448952e-05, + "loss": 1.9569, + "step": 4266 + }, + { + "epoch": 0.3211953555768833, + "grad_norm": 5.274992942810059, + "learning_rate": 7.946318322199535e-05, + "loss": 2.0645, + "step": 4267 + }, + { + "epoch": 0.3212706298575434, + "grad_norm": 6.149960994720459, + "learning_rate": 7.94533328279995e-05, + "loss": 1.8364, + "step": 4268 + }, + { + "epoch": 0.3213459041382036, + "grad_norm": 5.816581726074219, + "learning_rate": 7.94434806830875e-05, + "loss": 2.3717, + "step": 4269 + }, + { + "epoch": 0.32142117841886375, + "grad_norm": 5.101383209228516, + "learning_rate": 7.943362678784505e-05, + "loss": 1.9202, + "step": 4270 + }, + { + "epoch": 0.32149645269952387, + "grad_norm": 6.094253063201904, + "learning_rate": 7.942377114285796e-05, + "loss": 1.8764, + "step": 4271 + }, + { + "epoch": 0.32157172698018405, + "grad_norm": 5.8568949699401855, + "learning_rate": 7.941391374871209e-05, + "loss": 1.8461, + "step": 4272 + }, + { + "epoch": 0.3216470012608442, + "grad_norm": 5.236979007720947, + "learning_rate": 7.940405460599345e-05, + "loss": 1.9667, + "step": 4273 + }, + { + "epoch": 0.32172227554150434, + "grad_norm": 4.718485355377197, + "learning_rate": 7.939419371528813e-05, + "loss": 1.8842, + "step": 4274 + }, + { + "epoch": 0.3217975498221645, + "grad_norm": 4.007236480712891, + "learning_rate": 7.938433107718235e-05, + "loss": 1.76, + "step": 4275 + }, + { + "epoch": 0.3218728241028247, + "grad_norm": 5.785971641540527, + "learning_rate": 7.937446669226238e-05, + "loss": 1.8413, + "step": 4276 + }, + { + "epoch": 0.3219480983834848, + "grad_norm": 4.267640113830566, + "learning_rate": 7.936460056111468e-05, + "loss": 1.9159, + "step": 4277 + }, + { + "epoch": 0.322023372664145, + "grad_norm": 4.950400352478027, + "learning_rate": 7.93547326843257e-05, + "loss": 1.4911, + "step": 4278 + }, + { + "epoch": 0.3220986469448051, + "grad_norm": 4.639499187469482, + "learning_rate": 7.93448630624821e-05, + "loss": 1.8057, + "step": 4279 + }, + { + "epoch": 0.3221739212254653, + "grad_norm": 4.913708209991455, + "learning_rate": 7.93349916961706e-05, + "loss": 1.7642, + "step": 4280 + }, + { + "epoch": 0.32224919550612546, + "grad_norm": 4.055058002471924, + "learning_rate": 7.932511858597801e-05, + "loss": 2.2728, + "step": 4281 + }, + { + "epoch": 0.3223244697867856, + "grad_norm": 8.192974090576172, + "learning_rate": 7.931524373249126e-05, + "loss": 1.9588, + "step": 4282 + }, + { + "epoch": 0.32239974406744576, + "grad_norm": 4.474048137664795, + "learning_rate": 7.93053671362974e-05, + "loss": 1.6972, + "step": 4283 + }, + { + "epoch": 0.32247501834810594, + "grad_norm": 17.71534538269043, + "learning_rate": 7.929548879798353e-05, + "loss": 1.809, + "step": 4284 + }, + { + "epoch": 0.32255029262876606, + "grad_norm": 3.9231841564178467, + "learning_rate": 7.928560871813692e-05, + "loss": 1.9298, + "step": 4285 + }, + { + "epoch": 0.32262556690942623, + "grad_norm": 6.1528472900390625, + "learning_rate": 7.927572689734489e-05, + "loss": 2.1274, + "step": 4286 + }, + { + "epoch": 0.32270084119008635, + "grad_norm": 8.023346900939941, + "learning_rate": 7.926584333619491e-05, + "loss": 2.2845, + "step": 4287 + }, + { + "epoch": 0.3227761154707465, + "grad_norm": 5.260461807250977, + "learning_rate": 7.92559580352745e-05, + "loss": 1.9441, + "step": 4288 + }, + { + "epoch": 0.3228513897514067, + "grad_norm": 4.4676384925842285, + "learning_rate": 7.924607099517135e-05, + "loss": 2.0106, + "step": 4289 + }, + { + "epoch": 0.3229266640320668, + "grad_norm": 6.301382541656494, + "learning_rate": 7.92361822164732e-05, + "loss": 2.0289, + "step": 4290 + }, + { + "epoch": 0.323001938312727, + "grad_norm": 5.506801128387451, + "learning_rate": 7.922629169976789e-05, + "loss": 2.0697, + "step": 4291 + }, + { + "epoch": 0.3230772125933872, + "grad_norm": 5.69395112991333, + "learning_rate": 7.92163994456434e-05, + "loss": 2.0842, + "step": 4292 + }, + { + "epoch": 0.3231524868740473, + "grad_norm": 5.484492301940918, + "learning_rate": 7.92065054546878e-05, + "loss": 1.5692, + "step": 4293 + }, + { + "epoch": 0.32322776115470747, + "grad_norm": 5.572556972503662, + "learning_rate": 7.919660972748923e-05, + "loss": 2.0186, + "step": 4294 + }, + { + "epoch": 0.32330303543536765, + "grad_norm": 5.251938819885254, + "learning_rate": 7.918671226463601e-05, + "loss": 1.7921, + "step": 4295 + }, + { + "epoch": 0.32337830971602777, + "grad_norm": 4.50363826751709, + "learning_rate": 7.91768130667165e-05, + "loss": 1.8323, + "step": 4296 + }, + { + "epoch": 0.32345358399668794, + "grad_norm": 6.516271114349365, + "learning_rate": 7.916691213431915e-05, + "loss": 1.9599, + "step": 4297 + }, + { + "epoch": 0.32352885827734806, + "grad_norm": 4.948450565338135, + "learning_rate": 7.915700946803257e-05, + "loss": 1.9721, + "step": 4298 + }, + { + "epoch": 0.32360413255800824, + "grad_norm": 6.039429664611816, + "learning_rate": 7.914710506844545e-05, + "loss": 1.6318, + "step": 4299 + }, + { + "epoch": 0.3236794068386684, + "grad_norm": 6.55450963973999, + "learning_rate": 7.913719893614655e-05, + "loss": 1.9959, + "step": 4300 + }, + { + "epoch": 0.32375468111932854, + "grad_norm": 4.4523797035217285, + "learning_rate": 7.912729107172479e-05, + "loss": 1.8988, + "step": 4301 + }, + { + "epoch": 0.3238299553999887, + "grad_norm": 7.182500839233398, + "learning_rate": 7.911738147576914e-05, + "loss": 1.7855, + "step": 4302 + }, + { + "epoch": 0.3239052296806489, + "grad_norm": 4.500375747680664, + "learning_rate": 7.91074701488687e-05, + "loss": 1.9905, + "step": 4303 + }, + { + "epoch": 0.323980503961309, + "grad_norm": 6.276652812957764, + "learning_rate": 7.90975570916127e-05, + "loss": 1.7189, + "step": 4304 + }, + { + "epoch": 0.3240557782419692, + "grad_norm": 3.9099531173706055, + "learning_rate": 7.90876423045904e-05, + "loss": 1.6428, + "step": 4305 + }, + { + "epoch": 0.3241310525226293, + "grad_norm": 5.5810136795043945, + "learning_rate": 7.907772578839125e-05, + "loss": 1.9573, + "step": 4306 + }, + { + "epoch": 0.3242063268032895, + "grad_norm": 4.071087837219238, + "learning_rate": 7.906780754360472e-05, + "loss": 2.0077, + "step": 4307 + }, + { + "epoch": 0.32428160108394966, + "grad_norm": 5.579949378967285, + "learning_rate": 7.905788757082044e-05, + "loss": 1.8202, + "step": 4308 + }, + { + "epoch": 0.3243568753646098, + "grad_norm": 4.931799411773682, + "learning_rate": 7.904796587062811e-05, + "loss": 2.4014, + "step": 4309 + }, + { + "epoch": 0.32443214964526995, + "grad_norm": 5.031786918640137, + "learning_rate": 7.903804244361757e-05, + "loss": 1.5249, + "step": 4310 + }, + { + "epoch": 0.3245074239259301, + "grad_norm": 5.338874816894531, + "learning_rate": 7.902811729037873e-05, + "loss": 1.7179, + "step": 4311 + }, + { + "epoch": 0.32458269820659025, + "grad_norm": 5.351528644561768, + "learning_rate": 7.90181904115016e-05, + "loss": 1.7539, + "step": 4312 + }, + { + "epoch": 0.3246579724872504, + "grad_norm": 6.5313639640808105, + "learning_rate": 7.900826180757632e-05, + "loss": 2.1249, + "step": 4313 + }, + { + "epoch": 0.3247332467679106, + "grad_norm": 5.3764543533325195, + "learning_rate": 7.89983314791931e-05, + "loss": 1.8128, + "step": 4314 + }, + { + "epoch": 0.3248085210485707, + "grad_norm": 5.10286808013916, + "learning_rate": 7.898839942694228e-05, + "loss": 1.9212, + "step": 4315 + }, + { + "epoch": 0.3248837953292309, + "grad_norm": 6.762510299682617, + "learning_rate": 7.89784656514143e-05, + "loss": 1.9741, + "step": 4316 + }, + { + "epoch": 0.324959069609891, + "grad_norm": 5.928401470184326, + "learning_rate": 7.896853015319967e-05, + "loss": 1.9812, + "step": 4317 + }, + { + "epoch": 0.3250343438905512, + "grad_norm": 5.54860258102417, + "learning_rate": 7.895859293288907e-05, + "loss": 1.5551, + "step": 4318 + }, + { + "epoch": 0.32510961817121137, + "grad_norm": 6.7443389892578125, + "learning_rate": 7.894865399107319e-05, + "loss": 1.9447, + "step": 4319 + }, + { + "epoch": 0.3251848924518715, + "grad_norm": 4.55783748626709, + "learning_rate": 7.893871332834288e-05, + "loss": 2.0529, + "step": 4320 + }, + { + "epoch": 0.32526016673253166, + "grad_norm": 5.007224082946777, + "learning_rate": 7.892877094528912e-05, + "loss": 2.0183, + "step": 4321 + }, + { + "epoch": 0.32533544101319184, + "grad_norm": 5.731069087982178, + "learning_rate": 7.891882684250292e-05, + "loss": 1.9083, + "step": 4322 + }, + { + "epoch": 0.32541071529385196, + "grad_norm": 8.349027633666992, + "learning_rate": 7.890888102057545e-05, + "loss": 1.7741, + "step": 4323 + }, + { + "epoch": 0.32548598957451214, + "grad_norm": 6.190571308135986, + "learning_rate": 7.889893348009796e-05, + "loss": 2.1441, + "step": 4324 + }, + { + "epoch": 0.3255612638551723, + "grad_norm": 5.127755165100098, + "learning_rate": 7.888898422166178e-05, + "loss": 1.9843, + "step": 4325 + }, + { + "epoch": 0.32563653813583243, + "grad_norm": 7.0966901779174805, + "learning_rate": 7.887903324585839e-05, + "loss": 1.5867, + "step": 4326 + }, + { + "epoch": 0.3257118124164926, + "grad_norm": 4.568636894226074, + "learning_rate": 7.886908055327933e-05, + "loss": 1.7065, + "step": 4327 + }, + { + "epoch": 0.32578708669715273, + "grad_norm": 4.765012741088867, + "learning_rate": 7.885912614451627e-05, + "loss": 2.1661, + "step": 4328 + }, + { + "epoch": 0.3258623609778129, + "grad_norm": 8.170002937316895, + "learning_rate": 7.884917002016097e-05, + "loss": 1.9449, + "step": 4329 + }, + { + "epoch": 0.3259376352584731, + "grad_norm": 4.757733345031738, + "learning_rate": 7.883921218080528e-05, + "loss": 2.1068, + "step": 4330 + }, + { + "epoch": 0.3260129095391332, + "grad_norm": 4.404750347137451, + "learning_rate": 7.882925262704118e-05, + "loss": 2.0577, + "step": 4331 + }, + { + "epoch": 0.3260881838197934, + "grad_norm": 6.385253429412842, + "learning_rate": 7.881929135946073e-05, + "loss": 1.8286, + "step": 4332 + }, + { + "epoch": 0.32616345810045355, + "grad_norm": 4.9481658935546875, + "learning_rate": 7.88093283786561e-05, + "loss": 1.8637, + "step": 4333 + }, + { + "epoch": 0.32623873238111367, + "grad_norm": 5.324496746063232, + "learning_rate": 7.879936368521957e-05, + "loss": 2.1201, + "step": 4334 + }, + { + "epoch": 0.32631400666177385, + "grad_norm": 5.3290324211120605, + "learning_rate": 7.87893972797435e-05, + "loss": 1.8331, + "step": 4335 + }, + { + "epoch": 0.32638928094243397, + "grad_norm": 4.49165678024292, + "learning_rate": 7.877942916282037e-05, + "loss": 1.5997, + "step": 4336 + }, + { + "epoch": 0.32646455522309414, + "grad_norm": 3.9053313732147217, + "learning_rate": 7.876945933504274e-05, + "loss": 1.6632, + "step": 4337 + }, + { + "epoch": 0.3265398295037543, + "grad_norm": 4.798660755157471, + "learning_rate": 7.875948779700331e-05, + "loss": 1.7641, + "step": 4338 + }, + { + "epoch": 0.32661510378441444, + "grad_norm": 4.344418048858643, + "learning_rate": 7.874951454929484e-05, + "loss": 2.303, + "step": 4339 + }, + { + "epoch": 0.3266903780650746, + "grad_norm": 5.180002689361572, + "learning_rate": 7.873953959251023e-05, + "loss": 1.807, + "step": 4340 + }, + { + "epoch": 0.3267656523457348, + "grad_norm": 4.177915573120117, + "learning_rate": 7.872956292724247e-05, + "loss": 1.9628, + "step": 4341 + }, + { + "epoch": 0.3268409266263949, + "grad_norm": 5.383688926696777, + "learning_rate": 7.871958455408461e-05, + "loss": 1.9528, + "step": 4342 + }, + { + "epoch": 0.3269162009070551, + "grad_norm": 5.294318675994873, + "learning_rate": 7.870960447362986e-05, + "loss": 2.1267, + "step": 4343 + }, + { + "epoch": 0.32699147518771526, + "grad_norm": 4.464460372924805, + "learning_rate": 7.869962268647149e-05, + "loss": 1.6185, + "step": 4344 + }, + { + "epoch": 0.3270667494683754, + "grad_norm": 6.695094108581543, + "learning_rate": 7.868963919320292e-05, + "loss": 1.7839, + "step": 4345 + }, + { + "epoch": 0.32714202374903556, + "grad_norm": 5.034485340118408, + "learning_rate": 7.867965399441762e-05, + "loss": 1.9879, + "step": 4346 + }, + { + "epoch": 0.3272172980296957, + "grad_norm": 4.960436820983887, + "learning_rate": 7.866966709070916e-05, + "loss": 2.1976, + "step": 4347 + }, + { + "epoch": 0.32729257231035586, + "grad_norm": 4.145730495452881, + "learning_rate": 7.865967848267126e-05, + "loss": 2.0367, + "step": 4348 + }, + { + "epoch": 0.32736784659101603, + "grad_norm": 5.931301593780518, + "learning_rate": 7.864968817089772e-05, + "loss": 2.2859, + "step": 4349 + }, + { + "epoch": 0.32744312087167615, + "grad_norm": 4.807189464569092, + "learning_rate": 7.86396961559824e-05, + "loss": 1.9464, + "step": 4350 + }, + { + "epoch": 0.32751839515233633, + "grad_norm": 5.6787567138671875, + "learning_rate": 7.862970243851935e-05, + "loss": 2.3946, + "step": 4351 + }, + { + "epoch": 0.3275936694329965, + "grad_norm": 4.692102432250977, + "learning_rate": 7.861970701910262e-05, + "loss": 1.7994, + "step": 4352 + }, + { + "epoch": 0.3276689437136566, + "grad_norm": 4.082855701446533, + "learning_rate": 7.860970989832644e-05, + "loss": 1.6221, + "step": 4353 + }, + { + "epoch": 0.3277442179943168, + "grad_norm": 6.853491306304932, + "learning_rate": 7.859971107678507e-05, + "loss": 1.9617, + "step": 4354 + }, + { + "epoch": 0.3278194922749769, + "grad_norm": 4.796528339385986, + "learning_rate": 7.858971055507295e-05, + "loss": 1.8724, + "step": 4355 + }, + { + "epoch": 0.3278947665556371, + "grad_norm": 4.104365825653076, + "learning_rate": 7.857970833378458e-05, + "loss": 2.0103, + "step": 4356 + }, + { + "epoch": 0.32797004083629727, + "grad_norm": 4.175367832183838, + "learning_rate": 7.856970441351456e-05, + "loss": 1.7348, + "step": 4357 + }, + { + "epoch": 0.3280453151169574, + "grad_norm": 6.235986709594727, + "learning_rate": 7.855969879485758e-05, + "loss": 1.8518, + "step": 4358 + }, + { + "epoch": 0.32812058939761757, + "grad_norm": 4.891356945037842, + "learning_rate": 7.854969147840845e-05, + "loss": 2.1433, + "step": 4359 + }, + { + "epoch": 0.32819586367827774, + "grad_norm": 5.494083404541016, + "learning_rate": 7.853968246476209e-05, + "loss": 2.0301, + "step": 4360 + }, + { + "epoch": 0.32827113795893786, + "grad_norm": 6.059650897979736, + "learning_rate": 7.852967175451349e-05, + "loss": 1.6915, + "step": 4361 + }, + { + "epoch": 0.32834641223959804, + "grad_norm": 5.156692981719971, + "learning_rate": 7.851965934825775e-05, + "loss": 1.8174, + "step": 4362 + }, + { + "epoch": 0.3284216865202582, + "grad_norm": 4.464126110076904, + "learning_rate": 7.850964524659013e-05, + "loss": 1.9067, + "step": 4363 + }, + { + "epoch": 0.32849696080091834, + "grad_norm": 4.730215072631836, + "learning_rate": 7.849962945010587e-05, + "loss": 1.9596, + "step": 4364 + }, + { + "epoch": 0.3285722350815785, + "grad_norm": 6.299679756164551, + "learning_rate": 7.848961195940043e-05, + "loss": 2.0148, + "step": 4365 + }, + { + "epoch": 0.32864750936223863, + "grad_norm": 3.656123638153076, + "learning_rate": 7.847959277506928e-05, + "loss": 1.8974, + "step": 4366 + }, + { + "epoch": 0.3287227836428988, + "grad_norm": 3.8951895236968994, + "learning_rate": 7.846957189770809e-05, + "loss": 1.8244, + "step": 4367 + }, + { + "epoch": 0.328798057923559, + "grad_norm": 5.9096999168396, + "learning_rate": 7.845954932791253e-05, + "loss": 2.1906, + "step": 4368 + }, + { + "epoch": 0.3288733322042191, + "grad_norm": 5.1403961181640625, + "learning_rate": 7.84495250662784e-05, + "loss": 2.1406, + "step": 4369 + }, + { + "epoch": 0.3289486064848793, + "grad_norm": 6.26041841506958, + "learning_rate": 7.843949911340168e-05, + "loss": 1.8222, + "step": 4370 + }, + { + "epoch": 0.32902388076553946, + "grad_norm": 4.345606327056885, + "learning_rate": 7.842947146987829e-05, + "loss": 1.7026, + "step": 4371 + }, + { + "epoch": 0.3290991550461996, + "grad_norm": 4.3956708908081055, + "learning_rate": 7.841944213630442e-05, + "loss": 2.4946, + "step": 4372 + }, + { + "epoch": 0.32917442932685975, + "grad_norm": 4.711329460144043, + "learning_rate": 7.840941111327626e-05, + "loss": 1.5827, + "step": 4373 + }, + { + "epoch": 0.32924970360751993, + "grad_norm": 5.626861095428467, + "learning_rate": 7.839937840139013e-05, + "loss": 1.9205, + "step": 4374 + }, + { + "epoch": 0.32932497788818005, + "grad_norm": 5.321366786956787, + "learning_rate": 7.838934400124242e-05, + "loss": 1.8466, + "step": 4375 + }, + { + "epoch": 0.3294002521688402, + "grad_norm": 8.001298904418945, + "learning_rate": 7.837930791342969e-05, + "loss": 1.7368, + "step": 4376 + }, + { + "epoch": 0.32947552644950034, + "grad_norm": 6.337734222412109, + "learning_rate": 7.836927013854853e-05, + "loss": 1.8489, + "step": 4377 + }, + { + "epoch": 0.3295508007301605, + "grad_norm": 4.335766315460205, + "learning_rate": 7.835923067719565e-05, + "loss": 2.4347, + "step": 4378 + }, + { + "epoch": 0.3296260750108207, + "grad_norm": 5.940581798553467, + "learning_rate": 7.834918952996788e-05, + "loss": 2.5098, + "step": 4379 + }, + { + "epoch": 0.3297013492914808, + "grad_norm": 3.8880066871643066, + "learning_rate": 7.833914669746214e-05, + "loss": 1.9349, + "step": 4380 + }, + { + "epoch": 0.329776623572141, + "grad_norm": 3.539337396621704, + "learning_rate": 7.832910218027545e-05, + "loss": 2.0907, + "step": 4381 + }, + { + "epoch": 0.32985189785280117, + "grad_norm": 5.623476982116699, + "learning_rate": 7.831905597900491e-05, + "loss": 1.9176, + "step": 4382 + }, + { + "epoch": 0.3299271721334613, + "grad_norm": 5.693856716156006, + "learning_rate": 7.830900809424775e-05, + "loss": 1.8655, + "step": 4383 + }, + { + "epoch": 0.33000244641412146, + "grad_norm": 8.664924621582031, + "learning_rate": 7.82989585266013e-05, + "loss": 2.2314, + "step": 4384 + }, + { + "epoch": 0.3300777206947816, + "grad_norm": 4.794835567474365, + "learning_rate": 7.828890727666294e-05, + "loss": 1.8743, + "step": 4385 + }, + { + "epoch": 0.33015299497544176, + "grad_norm": 4.37811279296875, + "learning_rate": 7.827885434503021e-05, + "loss": 1.8513, + "step": 4386 + }, + { + "epoch": 0.33022826925610194, + "grad_norm": 6.0059332847595215, + "learning_rate": 7.826879973230074e-05, + "loss": 1.7265, + "step": 4387 + }, + { + "epoch": 0.33030354353676206, + "grad_norm": 4.591790199279785, + "learning_rate": 7.825874343907226e-05, + "loss": 2.3534, + "step": 4388 + }, + { + "epoch": 0.33037881781742223, + "grad_norm": 6.406615257263184, + "learning_rate": 7.824868546594255e-05, + "loss": 1.8981, + "step": 4389 + }, + { + "epoch": 0.3304540920980824, + "grad_norm": 6.243903160095215, + "learning_rate": 7.823862581350954e-05, + "loss": 2.4515, + "step": 4390 + }, + { + "epoch": 0.33052936637874253, + "grad_norm": 4.454758167266846, + "learning_rate": 7.822856448237126e-05, + "loss": 2.0204, + "step": 4391 + }, + { + "epoch": 0.3306046406594027, + "grad_norm": 7.581486225128174, + "learning_rate": 7.821850147312579e-05, + "loss": 1.9584, + "step": 4392 + }, + { + "epoch": 0.3306799149400629, + "grad_norm": 5.681704998016357, + "learning_rate": 7.82084367863714e-05, + "loss": 2.4047, + "step": 4393 + }, + { + "epoch": 0.330755189220723, + "grad_norm": 4.480740547180176, + "learning_rate": 7.81983704227064e-05, + "loss": 2.1313, + "step": 4394 + }, + { + "epoch": 0.3308304635013832, + "grad_norm": 5.834089756011963, + "learning_rate": 7.818830238272916e-05, + "loss": 1.796, + "step": 4395 + }, + { + "epoch": 0.3309057377820433, + "grad_norm": 6.718515872955322, + "learning_rate": 7.817823266703823e-05, + "loss": 1.7199, + "step": 4396 + }, + { + "epoch": 0.3309810120627035, + "grad_norm": 5.722562313079834, + "learning_rate": 7.816816127623222e-05, + "loss": 2.2911, + "step": 4397 + }, + { + "epoch": 0.33105628634336365, + "grad_norm": 5.06497049331665, + "learning_rate": 7.815808821090986e-05, + "loss": 1.8426, + "step": 4398 + }, + { + "epoch": 0.33113156062402377, + "grad_norm": 5.794899940490723, + "learning_rate": 7.814801347166993e-05, + "loss": 1.8064, + "step": 4399 + }, + { + "epoch": 0.33120683490468394, + "grad_norm": 5.660994529724121, + "learning_rate": 7.813793705911138e-05, + "loss": 1.7601, + "step": 4400 + }, + { + "epoch": 0.3312821091853441, + "grad_norm": 6.4643659591674805, + "learning_rate": 7.812785897383319e-05, + "loss": 1.8739, + "step": 4401 + }, + { + "epoch": 0.33135738346600424, + "grad_norm": 6.46829891204834, + "learning_rate": 7.811777921643451e-05, + "loss": 1.9316, + "step": 4402 + }, + { + "epoch": 0.3314326577466644, + "grad_norm": 6.490556240081787, + "learning_rate": 7.810769778751453e-05, + "loss": 2.0583, + "step": 4403 + }, + { + "epoch": 0.33150793202732454, + "grad_norm": 7.723535537719727, + "learning_rate": 7.809761468767256e-05, + "loss": 1.7346, + "step": 4404 + }, + { + "epoch": 0.3315832063079847, + "grad_norm": 5.656012058258057, + "learning_rate": 7.808752991750803e-05, + "loss": 1.669, + "step": 4405 + }, + { + "epoch": 0.3316584805886449, + "grad_norm": 4.766302585601807, + "learning_rate": 7.807744347762042e-05, + "loss": 1.8033, + "step": 4406 + }, + { + "epoch": 0.331733754869305, + "grad_norm": 4.236072540283203, + "learning_rate": 7.806735536860939e-05, + "loss": 1.7656, + "step": 4407 + }, + { + "epoch": 0.3318090291499652, + "grad_norm": 4.497416973114014, + "learning_rate": 7.805726559107458e-05, + "loss": 2.2027, + "step": 4408 + }, + { + "epoch": 0.33188430343062536, + "grad_norm": 6.1186113357543945, + "learning_rate": 7.804717414561586e-05, + "loss": 1.7344, + "step": 4409 + }, + { + "epoch": 0.3319595777112855, + "grad_norm": 4.847209453582764, + "learning_rate": 7.803708103283311e-05, + "loss": 1.7571, + "step": 4410 + }, + { + "epoch": 0.33203485199194566, + "grad_norm": 3.990985631942749, + "learning_rate": 7.802698625332634e-05, + "loss": 1.8296, + "step": 4411 + }, + { + "epoch": 0.33211012627260583, + "grad_norm": 5.556617736816406, + "learning_rate": 7.801688980769564e-05, + "loss": 1.9587, + "step": 4412 + }, + { + "epoch": 0.33218540055326595, + "grad_norm": 3.861116647720337, + "learning_rate": 7.800679169654125e-05, + "loss": 1.9628, + "step": 4413 + }, + { + "epoch": 0.33226067483392613, + "grad_norm": 5.4574875831604, + "learning_rate": 7.799669192046345e-05, + "loss": 1.7131, + "step": 4414 + }, + { + "epoch": 0.33233594911458625, + "grad_norm": 3.9400134086608887, + "learning_rate": 7.798659048006265e-05, + "loss": 2.2265, + "step": 4415 + }, + { + "epoch": 0.3324112233952464, + "grad_norm": 3.8897926807403564, + "learning_rate": 7.797648737593934e-05, + "loss": 1.5631, + "step": 4416 + }, + { + "epoch": 0.3324864976759066, + "grad_norm": 4.028106212615967, + "learning_rate": 7.796638260869412e-05, + "loss": 1.751, + "step": 4417 + }, + { + "epoch": 0.3325617719565667, + "grad_norm": 5.7744622230529785, + "learning_rate": 7.79562761789277e-05, + "loss": 1.8144, + "step": 4418 + }, + { + "epoch": 0.3326370462372269, + "grad_norm": 4.825803756713867, + "learning_rate": 7.79461680872409e-05, + "loss": 2.009, + "step": 4419 + }, + { + "epoch": 0.3327123205178871, + "grad_norm": 7.509187698364258, + "learning_rate": 7.793605833423459e-05, + "loss": 2.0374, + "step": 4420 + }, + { + "epoch": 0.3327875947985472, + "grad_norm": 6.569687843322754, + "learning_rate": 7.792594692050975e-05, + "loss": 2.1918, + "step": 4421 + }, + { + "epoch": 0.33286286907920737, + "grad_norm": 5.515702247619629, + "learning_rate": 7.791583384666751e-05, + "loss": 1.7104, + "step": 4422 + }, + { + "epoch": 0.33293814335986754, + "grad_norm": 4.994546413421631, + "learning_rate": 7.790571911330904e-05, + "loss": 1.9651, + "step": 4423 + }, + { + "epoch": 0.33301341764052766, + "grad_norm": 11.993245124816895, + "learning_rate": 7.789560272103564e-05, + "loss": 2.1854, + "step": 4424 + }, + { + "epoch": 0.33308869192118784, + "grad_norm": 11.527641296386719, + "learning_rate": 7.788548467044869e-05, + "loss": 1.9319, + "step": 4425 + }, + { + "epoch": 0.33316396620184796, + "grad_norm": 4.878230094909668, + "learning_rate": 7.787536496214972e-05, + "loss": 2.1615, + "step": 4426 + }, + { + "epoch": 0.33323924048250814, + "grad_norm": 4.188399791717529, + "learning_rate": 7.786524359674026e-05, + "loss": 2.2222, + "step": 4427 + }, + { + "epoch": 0.3333145147631683, + "grad_norm": 4.418381690979004, + "learning_rate": 7.785512057482202e-05, + "loss": 1.7423, + "step": 4428 + }, + { + "epoch": 0.33338978904382843, + "grad_norm": 5.352053165435791, + "learning_rate": 7.784499589699681e-05, + "loss": 2.361, + "step": 4429 + }, + { + "epoch": 0.3334650633244886, + "grad_norm": 5.7637200355529785, + "learning_rate": 7.783486956386647e-05, + "loss": 2.0843, + "step": 4430 + }, + { + "epoch": 0.3335403376051488, + "grad_norm": 4.4905104637146, + "learning_rate": 7.782474157603301e-05, + "loss": 2.3424, + "step": 4431 + }, + { + "epoch": 0.3336156118858089, + "grad_norm": 5.229638576507568, + "learning_rate": 7.78146119340985e-05, + "loss": 2.2531, + "step": 4432 + }, + { + "epoch": 0.3336908861664691, + "grad_norm": 7.752872943878174, + "learning_rate": 7.780448063866511e-05, + "loss": 1.8707, + "step": 4433 + }, + { + "epoch": 0.3337661604471292, + "grad_norm": 3.7875256538391113, + "learning_rate": 7.779434769033514e-05, + "loss": 1.9006, + "step": 4434 + }, + { + "epoch": 0.3338414347277894, + "grad_norm": 4.182655334472656, + "learning_rate": 7.778421308971093e-05, + "loss": 1.7904, + "step": 4435 + }, + { + "epoch": 0.33391670900844955, + "grad_norm": 5.4185051918029785, + "learning_rate": 7.7774076837395e-05, + "loss": 2.1225, + "step": 4436 + }, + { + "epoch": 0.3339919832891097, + "grad_norm": 4.450523376464844, + "learning_rate": 7.776393893398988e-05, + "loss": 1.6702, + "step": 4437 + }, + { + "epoch": 0.33406725756976985, + "grad_norm": 4.7899017333984375, + "learning_rate": 7.775379938009826e-05, + "loss": 2.054, + "step": 4438 + }, + { + "epoch": 0.33414253185043, + "grad_norm": 4.676827907562256, + "learning_rate": 7.77436581763229e-05, + "loss": 1.6154, + "step": 4439 + }, + { + "epoch": 0.33421780613109014, + "grad_norm": 5.522284030914307, + "learning_rate": 7.773351532326667e-05, + "loss": 1.9686, + "step": 4440 + }, + { + "epoch": 0.3342930804117503, + "grad_norm": 5.002451419830322, + "learning_rate": 7.772337082153253e-05, + "loss": 1.7279, + "step": 4441 + }, + { + "epoch": 0.3343683546924105, + "grad_norm": 4.5740838050842285, + "learning_rate": 7.771322467172353e-05, + "loss": 1.6956, + "step": 4442 + }, + { + "epoch": 0.3344436289730706, + "grad_norm": 4.63417387008667, + "learning_rate": 7.770307687444284e-05, + "loss": 2.2485, + "step": 4443 + }, + { + "epoch": 0.3345189032537308, + "grad_norm": 3.716222047805786, + "learning_rate": 7.769292743029374e-05, + "loss": 1.9892, + "step": 4444 + }, + { + "epoch": 0.3345941775343909, + "grad_norm": 5.563720703125, + "learning_rate": 7.768277633987955e-05, + "loss": 2.0142, + "step": 4445 + }, + { + "epoch": 0.3346694518150511, + "grad_norm": 5.678319454193115, + "learning_rate": 7.767262360380374e-05, + "loss": 2.0882, + "step": 4446 + }, + { + "epoch": 0.33474472609571126, + "grad_norm": 4.13865327835083, + "learning_rate": 7.766246922266986e-05, + "loss": 1.9522, + "step": 4447 + }, + { + "epoch": 0.3348200003763714, + "grad_norm": 6.93359375, + "learning_rate": 7.765231319708156e-05, + "loss": 2.206, + "step": 4448 + }, + { + "epoch": 0.33489527465703156, + "grad_norm": 4.408860206604004, + "learning_rate": 7.764215552764258e-05, + "loss": 1.7965, + "step": 4449 + }, + { + "epoch": 0.33497054893769174, + "grad_norm": 5.0691046714782715, + "learning_rate": 7.763199621495676e-05, + "loss": 1.8665, + "step": 4450 + }, + { + "epoch": 0.33504582321835186, + "grad_norm": 5.447968006134033, + "learning_rate": 7.762183525962804e-05, + "loss": 2.3682, + "step": 4451 + }, + { + "epoch": 0.33512109749901203, + "grad_norm": 4.8373613357543945, + "learning_rate": 7.76116726622605e-05, + "loss": 1.9963, + "step": 4452 + }, + { + "epoch": 0.33519637177967215, + "grad_norm": 4.789731025695801, + "learning_rate": 7.760150842345823e-05, + "loss": 2.1488, + "step": 4453 + }, + { + "epoch": 0.33527164606033233, + "grad_norm": 6.808932304382324, + "learning_rate": 7.75913425438255e-05, + "loss": 1.958, + "step": 4454 + }, + { + "epoch": 0.3353469203409925, + "grad_norm": 5.378639221191406, + "learning_rate": 7.758117502396661e-05, + "loss": 1.9495, + "step": 4455 + }, + { + "epoch": 0.3354221946216526, + "grad_norm": 4.950909614562988, + "learning_rate": 7.7571005864486e-05, + "loss": 1.8209, + "step": 4456 + }, + { + "epoch": 0.3354974689023128, + "grad_norm": 4.565957069396973, + "learning_rate": 7.756083506598821e-05, + "loss": 2.1044, + "step": 4457 + }, + { + "epoch": 0.335572743182973, + "grad_norm": 5.829155921936035, + "learning_rate": 7.755066262907786e-05, + "loss": 1.9729, + "step": 4458 + }, + { + "epoch": 0.3356480174636331, + "grad_norm": 6.161239147186279, + "learning_rate": 7.754048855435965e-05, + "loss": 1.9769, + "step": 4459 + }, + { + "epoch": 0.3357232917442933, + "grad_norm": 5.43168830871582, + "learning_rate": 7.753031284243844e-05, + "loss": 1.5594, + "step": 4460 + }, + { + "epoch": 0.33579856602495345, + "grad_norm": 5.409886360168457, + "learning_rate": 7.75201354939191e-05, + "loss": 2.1743, + "step": 4461 + }, + { + "epoch": 0.33587384030561357, + "grad_norm": 4.20995569229126, + "learning_rate": 7.750995650940669e-05, + "loss": 1.7825, + "step": 4462 + }, + { + "epoch": 0.33594911458627374, + "grad_norm": 5.230589866638184, + "learning_rate": 7.74997758895063e-05, + "loss": 2.3679, + "step": 4463 + }, + { + "epoch": 0.33602438886693387, + "grad_norm": 6.875021457672119, + "learning_rate": 7.748959363482313e-05, + "loss": 2.2259, + "step": 4464 + }, + { + "epoch": 0.33609966314759404, + "grad_norm": 7.151522159576416, + "learning_rate": 7.747940974596248e-05, + "loss": 1.814, + "step": 4465 + }, + { + "epoch": 0.3361749374282542, + "grad_norm": 4.0738396644592285, + "learning_rate": 7.746922422352978e-05, + "loss": 2.0622, + "step": 4466 + }, + { + "epoch": 0.33625021170891434, + "grad_norm": 5.833353519439697, + "learning_rate": 7.745903706813054e-05, + "loss": 1.9367, + "step": 4467 + }, + { + "epoch": 0.3363254859895745, + "grad_norm": 5.48893928527832, + "learning_rate": 7.744884828037031e-05, + "loss": 1.6223, + "step": 4468 + }, + { + "epoch": 0.3364007602702347, + "grad_norm": 5.804195404052734, + "learning_rate": 7.743865786085482e-05, + "loss": 1.9882, + "step": 4469 + }, + { + "epoch": 0.3364760345508948, + "grad_norm": 4.664313793182373, + "learning_rate": 7.742846581018985e-05, + "loss": 1.6981, + "step": 4470 + }, + { + "epoch": 0.336551308831555, + "grad_norm": 4.123824596405029, + "learning_rate": 7.741827212898128e-05, + "loss": 1.8707, + "step": 4471 + }, + { + "epoch": 0.33662658311221516, + "grad_norm": 6.232227802276611, + "learning_rate": 7.740807681783511e-05, + "loss": 1.9994, + "step": 4472 + }, + { + "epoch": 0.3367018573928753, + "grad_norm": 4.497732639312744, + "learning_rate": 7.739787987735742e-05, + "loss": 1.891, + "step": 4473 + }, + { + "epoch": 0.33677713167353546, + "grad_norm": 5.1084208488464355, + "learning_rate": 7.73876813081544e-05, + "loss": 1.9052, + "step": 4474 + }, + { + "epoch": 0.3368524059541956, + "grad_norm": 6.030988693237305, + "learning_rate": 7.737748111083227e-05, + "loss": 2.1788, + "step": 4475 + }, + { + "epoch": 0.33692768023485575, + "grad_norm": 5.405174732208252, + "learning_rate": 7.736727928599748e-05, + "loss": 2.0606, + "step": 4476 + }, + { + "epoch": 0.33700295451551593, + "grad_norm": 7.762973308563232, + "learning_rate": 7.735707583425643e-05, + "loss": 1.5961, + "step": 4477 + }, + { + "epoch": 0.33707822879617605, + "grad_norm": 5.235033988952637, + "learning_rate": 7.734687075621574e-05, + "loss": 1.8943, + "step": 4478 + }, + { + "epoch": 0.3371535030768362, + "grad_norm": 4.461373329162598, + "learning_rate": 7.733666405248205e-05, + "loss": 2.4107, + "step": 4479 + }, + { + "epoch": 0.3372287773574964, + "grad_norm": 6.370680809020996, + "learning_rate": 7.73264557236621e-05, + "loss": 2.6139, + "step": 4480 + }, + { + "epoch": 0.3373040516381565, + "grad_norm": 6.864678859710693, + "learning_rate": 7.731624577036278e-05, + "loss": 1.9212, + "step": 4481 + }, + { + "epoch": 0.3373793259188167, + "grad_norm": 4.778921127319336, + "learning_rate": 7.730603419319103e-05, + "loss": 2.1765, + "step": 4482 + }, + { + "epoch": 0.3374546001994768, + "grad_norm": 5.433311462402344, + "learning_rate": 7.72958209927539e-05, + "loss": 1.9568, + "step": 4483 + }, + { + "epoch": 0.337529874480137, + "grad_norm": 4.287771224975586, + "learning_rate": 7.728560616965852e-05, + "loss": 1.6992, + "step": 4484 + }, + { + "epoch": 0.33760514876079717, + "grad_norm": 5.015481472015381, + "learning_rate": 7.727538972451216e-05, + "loss": 1.6258, + "step": 4485 + }, + { + "epoch": 0.3376804230414573, + "grad_norm": 5.2515788078308105, + "learning_rate": 7.726517165792213e-05, + "loss": 1.9712, + "step": 4486 + }, + { + "epoch": 0.33775569732211747, + "grad_norm": 4.657538890838623, + "learning_rate": 7.725495197049588e-05, + "loss": 2.318, + "step": 4487 + }, + { + "epoch": 0.33783097160277764, + "grad_norm": 3.9009292125701904, + "learning_rate": 7.724473066284095e-05, + "loss": 1.6222, + "step": 4488 + }, + { + "epoch": 0.33790624588343776, + "grad_norm": 6.828575134277344, + "learning_rate": 7.723450773556494e-05, + "loss": 1.9036, + "step": 4489 + }, + { + "epoch": 0.33798152016409794, + "grad_norm": 4.917755126953125, + "learning_rate": 7.72242831892756e-05, + "loss": 1.6515, + "step": 4490 + }, + { + "epoch": 0.3380567944447581, + "grad_norm": 4.706576347351074, + "learning_rate": 7.721405702458073e-05, + "loss": 1.8587, + "step": 4491 + }, + { + "epoch": 0.33813206872541823, + "grad_norm": 7.554698467254639, + "learning_rate": 7.720382924208827e-05, + "loss": 1.8584, + "step": 4492 + }, + { + "epoch": 0.3382073430060784, + "grad_norm": 4.775897979736328, + "learning_rate": 7.719359984240619e-05, + "loss": 1.8143, + "step": 4493 + }, + { + "epoch": 0.33828261728673853, + "grad_norm": 5.146100997924805, + "learning_rate": 7.718336882614265e-05, + "loss": 1.4794, + "step": 4494 + }, + { + "epoch": 0.3383578915673987, + "grad_norm": 4.315169334411621, + "learning_rate": 7.717313619390583e-05, + "loss": 2.261, + "step": 4495 + }, + { + "epoch": 0.3384331658480589, + "grad_norm": 3.768585205078125, + "learning_rate": 7.716290194630404e-05, + "loss": 2.1138, + "step": 4496 + }, + { + "epoch": 0.338508440128719, + "grad_norm": 3.8151021003723145, + "learning_rate": 7.715266608394565e-05, + "loss": 1.9683, + "step": 4497 + }, + { + "epoch": 0.3385837144093792, + "grad_norm": 4.162018775939941, + "learning_rate": 7.714242860743916e-05, + "loss": 1.7186, + "step": 4498 + }, + { + "epoch": 0.33865898869003935, + "grad_norm": 4.899353981018066, + "learning_rate": 7.71321895173932e-05, + "loss": 1.718, + "step": 4499 + }, + { + "epoch": 0.3387342629706995, + "grad_norm": 5.593964099884033, + "learning_rate": 7.712194881441641e-05, + "loss": 2.0851, + "step": 4500 + }, + { + "epoch": 0.33880953725135965, + "grad_norm": 5.4595232009887695, + "learning_rate": 7.711170649911759e-05, + "loss": 2.3321, + "step": 4501 + }, + { + "epoch": 0.33888481153201977, + "grad_norm": 4.0324273109436035, + "learning_rate": 7.710146257210561e-05, + "loss": 2.1683, + "step": 4502 + }, + { + "epoch": 0.33896008581267995, + "grad_norm": 4.341223239898682, + "learning_rate": 7.709121703398944e-05, + "loss": 2.218, + "step": 4503 + }, + { + "epoch": 0.3390353600933401, + "grad_norm": 4.462097644805908, + "learning_rate": 7.708096988537815e-05, + "loss": 1.9665, + "step": 4504 + }, + { + "epoch": 0.33911063437400024, + "grad_norm": 4.973935604095459, + "learning_rate": 7.707072112688091e-05, + "loss": 2.1368, + "step": 4505 + }, + { + "epoch": 0.3391859086546604, + "grad_norm": 5.048158168792725, + "learning_rate": 7.706047075910696e-05, + "loss": 2.3724, + "step": 4506 + }, + { + "epoch": 0.3392611829353206, + "grad_norm": 5.3810930252075195, + "learning_rate": 7.705021878266568e-05, + "loss": 1.742, + "step": 4507 + }, + { + "epoch": 0.3393364572159807, + "grad_norm": 5.172126770019531, + "learning_rate": 7.703996519816651e-05, + "loss": 1.5642, + "step": 4508 + }, + { + "epoch": 0.3394117314966409, + "grad_norm": 4.7284159660339355, + "learning_rate": 7.702971000621899e-05, + "loss": 1.9425, + "step": 4509 + }, + { + "epoch": 0.33948700577730107, + "grad_norm": 4.3659563064575195, + "learning_rate": 7.701945320743278e-05, + "loss": 1.9658, + "step": 4510 + }, + { + "epoch": 0.3395622800579612, + "grad_norm": 3.4475326538085938, + "learning_rate": 7.70091948024176e-05, + "loss": 2.0655, + "step": 4511 + }, + { + "epoch": 0.33963755433862136, + "grad_norm": 3.9356861114501953, + "learning_rate": 7.699893479178326e-05, + "loss": 1.5902, + "step": 4512 + }, + { + "epoch": 0.3397128286192815, + "grad_norm": 3.830548048019409, + "learning_rate": 7.698867317613974e-05, + "loss": 2.082, + "step": 4513 + }, + { + "epoch": 0.33978810289994166, + "grad_norm": 3.951875925064087, + "learning_rate": 7.697840995609703e-05, + "loss": 1.8252, + "step": 4514 + }, + { + "epoch": 0.33986337718060183, + "grad_norm": 8.053879737854004, + "learning_rate": 7.696814513226527e-05, + "loss": 1.703, + "step": 4515 + }, + { + "epoch": 0.33993865146126195, + "grad_norm": 5.434277534484863, + "learning_rate": 7.695787870525465e-05, + "loss": 2.2256, + "step": 4516 + }, + { + "epoch": 0.34001392574192213, + "grad_norm": 3.973673105239868, + "learning_rate": 7.694761067567551e-05, + "loss": 1.9406, + "step": 4517 + }, + { + "epoch": 0.3400892000225823, + "grad_norm": 4.5511088371276855, + "learning_rate": 7.693734104413821e-05, + "loss": 1.6732, + "step": 4518 + }, + { + "epoch": 0.3401644743032424, + "grad_norm": 8.532569885253906, + "learning_rate": 7.692706981125329e-05, + "loss": 1.5233, + "step": 4519 + }, + { + "epoch": 0.3402397485839026, + "grad_norm": 5.463710308074951, + "learning_rate": 7.691679697763133e-05, + "loss": 1.877, + "step": 4520 + }, + { + "epoch": 0.3403150228645628, + "grad_norm": 5.922534465789795, + "learning_rate": 7.690652254388303e-05, + "loss": 1.8277, + "step": 4521 + }, + { + "epoch": 0.3403902971452229, + "grad_norm": 5.1810808181762695, + "learning_rate": 7.689624651061916e-05, + "loss": 2.0457, + "step": 4522 + }, + { + "epoch": 0.3404655714258831, + "grad_norm": 4.89050817489624, + "learning_rate": 7.688596887845062e-05, + "loss": 1.9049, + "step": 4523 + }, + { + "epoch": 0.3405408457065432, + "grad_norm": 4.357202053070068, + "learning_rate": 7.687568964798836e-05, + "loss": 1.8235, + "step": 4524 + }, + { + "epoch": 0.34061611998720337, + "grad_norm": 5.119769096374512, + "learning_rate": 7.686540881984347e-05, + "loss": 1.9132, + "step": 4525 + }, + { + "epoch": 0.34069139426786355, + "grad_norm": 4.868556022644043, + "learning_rate": 7.685512639462711e-05, + "loss": 1.7918, + "step": 4526 + }, + { + "epoch": 0.34076666854852367, + "grad_norm": 4.073054313659668, + "learning_rate": 7.684484237295055e-05, + "loss": 1.9776, + "step": 4527 + }, + { + "epoch": 0.34084194282918384, + "grad_norm": 6.021386623382568, + "learning_rate": 7.683455675542515e-05, + "loss": 1.8122, + "step": 4528 + }, + { + "epoch": 0.340917217109844, + "grad_norm": 5.63744592666626, + "learning_rate": 7.682426954266231e-05, + "loss": 1.5184, + "step": 4529 + }, + { + "epoch": 0.34099249139050414, + "grad_norm": 4.044479846954346, + "learning_rate": 7.681398073527364e-05, + "loss": 2.1389, + "step": 4530 + }, + { + "epoch": 0.3410677656711643, + "grad_norm": 5.750081539154053, + "learning_rate": 7.680369033387073e-05, + "loss": 1.857, + "step": 4531 + }, + { + "epoch": 0.34114303995182443, + "grad_norm": 5.646541595458984, + "learning_rate": 7.679339833906537e-05, + "loss": 1.6025, + "step": 4532 + }, + { + "epoch": 0.3412183142324846, + "grad_norm": 4.2863359451293945, + "learning_rate": 7.678310475146935e-05, + "loss": 1.9404, + "step": 4533 + }, + { + "epoch": 0.3412935885131448, + "grad_norm": 4.333461284637451, + "learning_rate": 7.67728095716946e-05, + "loss": 2.177, + "step": 4534 + }, + { + "epoch": 0.3413688627938049, + "grad_norm": 4.863918304443359, + "learning_rate": 7.676251280035312e-05, + "loss": 1.7849, + "step": 4535 + }, + { + "epoch": 0.3414441370744651, + "grad_norm": 5.519361972808838, + "learning_rate": 7.675221443805706e-05, + "loss": 1.7584, + "step": 4536 + }, + { + "epoch": 0.34151941135512526, + "grad_norm": 4.397111892700195, + "learning_rate": 7.674191448541861e-05, + "loss": 1.8239, + "step": 4537 + }, + { + "epoch": 0.3415946856357854, + "grad_norm": 4.253359317779541, + "learning_rate": 7.673161294305008e-05, + "loss": 2.1059, + "step": 4538 + }, + { + "epoch": 0.34166995991644555, + "grad_norm": 5.803365230560303, + "learning_rate": 7.672130981156387e-05, + "loss": 1.7686, + "step": 4539 + }, + { + "epoch": 0.34174523419710573, + "grad_norm": 5.407712936401367, + "learning_rate": 7.671100509157243e-05, + "loss": 1.6283, + "step": 4540 + }, + { + "epoch": 0.34182050847776585, + "grad_norm": 4.533716678619385, + "learning_rate": 7.670069878368842e-05, + "loss": 1.9044, + "step": 4541 + }, + { + "epoch": 0.341895782758426, + "grad_norm": 4.260860919952393, + "learning_rate": 7.669039088852446e-05, + "loss": 1.8602, + "step": 4542 + }, + { + "epoch": 0.34197105703908615, + "grad_norm": 6.364029407501221, + "learning_rate": 7.668008140669335e-05, + "loss": 1.6424, + "step": 4543 + }, + { + "epoch": 0.3420463313197463, + "grad_norm": 5.045587539672852, + "learning_rate": 7.666977033880795e-05, + "loss": 2.014, + "step": 4544 + }, + { + "epoch": 0.3421216056004065, + "grad_norm": 6.761258602142334, + "learning_rate": 7.665945768548122e-05, + "loss": 1.7627, + "step": 4545 + }, + { + "epoch": 0.3421968798810666, + "grad_norm": 6.659358978271484, + "learning_rate": 7.664914344732623e-05, + "loss": 1.8894, + "step": 4546 + }, + { + "epoch": 0.3422721541617268, + "grad_norm": 5.460297107696533, + "learning_rate": 7.663882762495614e-05, + "loss": 2.0568, + "step": 4547 + }, + { + "epoch": 0.34234742844238697, + "grad_norm": 4.423230171203613, + "learning_rate": 7.662851021898417e-05, + "loss": 1.9571, + "step": 4548 + }, + { + "epoch": 0.3424227027230471, + "grad_norm": 4.094817638397217, + "learning_rate": 7.661819123002367e-05, + "loss": 2.0311, + "step": 4549 + }, + { + "epoch": 0.34249797700370727, + "grad_norm": 5.367455959320068, + "learning_rate": 7.66078706586881e-05, + "loss": 1.9117, + "step": 4550 + }, + { + "epoch": 0.3425732512843674, + "grad_norm": 3.6734142303466797, + "learning_rate": 7.659754850559095e-05, + "loss": 1.8692, + "step": 4551 + }, + { + "epoch": 0.34264852556502756, + "grad_norm": 5.371687889099121, + "learning_rate": 7.658722477134585e-05, + "loss": 2.0769, + "step": 4552 + }, + { + "epoch": 0.34272379984568774, + "grad_norm": 8.325722694396973, + "learning_rate": 7.657689945656654e-05, + "loss": 2.2041, + "step": 4553 + }, + { + "epoch": 0.34279907412634786, + "grad_norm": 5.142776966094971, + "learning_rate": 7.656657256186681e-05, + "loss": 2.1997, + "step": 4554 + }, + { + "epoch": 0.34287434840700803, + "grad_norm": 4.608386993408203, + "learning_rate": 7.655624408786058e-05, + "loss": 1.9562, + "step": 4555 + }, + { + "epoch": 0.3429496226876682, + "grad_norm": 4.82465934753418, + "learning_rate": 7.654591403516183e-05, + "loss": 1.7444, + "step": 4556 + }, + { + "epoch": 0.34302489696832833, + "grad_norm": 5.245750904083252, + "learning_rate": 7.653558240438463e-05, + "loss": 1.7359, + "step": 4557 + }, + { + "epoch": 0.3431001712489885, + "grad_norm": 4.25660514831543, + "learning_rate": 7.652524919614323e-05, + "loss": 1.7792, + "step": 4558 + }, + { + "epoch": 0.3431754455296487, + "grad_norm": 4.651368141174316, + "learning_rate": 7.651491441105188e-05, + "loss": 2.0639, + "step": 4559 + }, + { + "epoch": 0.3432507198103088, + "grad_norm": 3.9159963130950928, + "learning_rate": 7.650457804972493e-05, + "loss": 1.8275, + "step": 4560 + }, + { + "epoch": 0.343325994090969, + "grad_norm": 8.020207405090332, + "learning_rate": 7.649424011277686e-05, + "loss": 2.1808, + "step": 4561 + }, + { + "epoch": 0.3434012683716291, + "grad_norm": 4.764902591705322, + "learning_rate": 7.648390060082225e-05, + "loss": 1.8966, + "step": 4562 + }, + { + "epoch": 0.3434765426522893, + "grad_norm": 4.096408843994141, + "learning_rate": 7.647355951447572e-05, + "loss": 1.8486, + "step": 4563 + }, + { + "epoch": 0.34355181693294945, + "grad_norm": 4.379515647888184, + "learning_rate": 7.646321685435205e-05, + "loss": 1.9737, + "step": 4564 + }, + { + "epoch": 0.34362709121360957, + "grad_norm": 6.029566287994385, + "learning_rate": 7.645287262106607e-05, + "loss": 1.7706, + "step": 4565 + }, + { + "epoch": 0.34370236549426975, + "grad_norm": 5.008563995361328, + "learning_rate": 7.644252681523271e-05, + "loss": 1.9291, + "step": 4566 + }, + { + "epoch": 0.3437776397749299, + "grad_norm": 4.422732830047607, + "learning_rate": 7.643217943746703e-05, + "loss": 1.9844, + "step": 4567 + }, + { + "epoch": 0.34385291405559004, + "grad_norm": 7.713883876800537, + "learning_rate": 7.64218304883841e-05, + "loss": 1.7341, + "step": 4568 + }, + { + "epoch": 0.3439281883362502, + "grad_norm": 5.014001369476318, + "learning_rate": 7.641147996859916e-05, + "loss": 2.0306, + "step": 4569 + }, + { + "epoch": 0.3440034626169104, + "grad_norm": 4.733586311340332, + "learning_rate": 7.640112787872753e-05, + "loss": 1.7966, + "step": 4570 + }, + { + "epoch": 0.3440787368975705, + "grad_norm": 4.042888641357422, + "learning_rate": 7.639077421938459e-05, + "loss": 2.1577, + "step": 4571 + }, + { + "epoch": 0.3441540111782307, + "grad_norm": 4.308354377746582, + "learning_rate": 7.638041899118586e-05, + "loss": 1.7748, + "step": 4572 + }, + { + "epoch": 0.3442292854588908, + "grad_norm": 4.559813499450684, + "learning_rate": 7.63700621947469e-05, + "loss": 2.0353, + "step": 4573 + }, + { + "epoch": 0.344304559739551, + "grad_norm": 5.088639736175537, + "learning_rate": 7.635970383068341e-05, + "loss": 1.9867, + "step": 4574 + }, + { + "epoch": 0.34437983402021116, + "grad_norm": 5.16423225402832, + "learning_rate": 7.634934389961116e-05, + "loss": 2.1595, + "step": 4575 + }, + { + "epoch": 0.3444551083008713, + "grad_norm": 6.445690155029297, + "learning_rate": 7.633898240214603e-05, + "loss": 1.892, + "step": 4576 + }, + { + "epoch": 0.34453038258153146, + "grad_norm": 5.115585803985596, + "learning_rate": 7.632861933890397e-05, + "loss": 1.7214, + "step": 4577 + }, + { + "epoch": 0.34460565686219163, + "grad_norm": 5.469083309173584, + "learning_rate": 7.631825471050102e-05, + "loss": 2.2577, + "step": 4578 + }, + { + "epoch": 0.34468093114285175, + "grad_norm": 4.674691677093506, + "learning_rate": 7.630788851755336e-05, + "loss": 1.9273, + "step": 4579 + }, + { + "epoch": 0.34475620542351193, + "grad_norm": 5.9623823165893555, + "learning_rate": 7.62975207606772e-05, + "loss": 1.7731, + "step": 4580 + }, + { + "epoch": 0.34483147970417205, + "grad_norm": 4.163843631744385, + "learning_rate": 7.628715144048888e-05, + "loss": 1.9499, + "step": 4581 + }, + { + "epoch": 0.3449067539848322, + "grad_norm": 6.8330159187316895, + "learning_rate": 7.627678055760485e-05, + "loss": 1.7635, + "step": 4582 + }, + { + "epoch": 0.3449820282654924, + "grad_norm": 4.5820441246032715, + "learning_rate": 7.626640811264161e-05, + "loss": 2.4614, + "step": 4583 + }, + { + "epoch": 0.3450573025461525, + "grad_norm": 5.254438877105713, + "learning_rate": 7.625603410621577e-05, + "loss": 1.7733, + "step": 4584 + }, + { + "epoch": 0.3451325768268127, + "grad_norm": 5.338019847869873, + "learning_rate": 7.624565853894405e-05, + "loss": 1.7816, + "step": 4585 + }, + { + "epoch": 0.3452078511074729, + "grad_norm": 5.570078372955322, + "learning_rate": 7.623528141144322e-05, + "loss": 1.9732, + "step": 4586 + }, + { + "epoch": 0.345283125388133, + "grad_norm": 6.184515476226807, + "learning_rate": 7.62249027243302e-05, + "loss": 2.0564, + "step": 4587 + }, + { + "epoch": 0.34535839966879317, + "grad_norm": 3.894554615020752, + "learning_rate": 7.621452247822197e-05, + "loss": 1.6548, + "step": 4588 + }, + { + "epoch": 0.34543367394945335, + "grad_norm": 6.185800552368164, + "learning_rate": 7.620414067373558e-05, + "loss": 1.987, + "step": 4589 + }, + { + "epoch": 0.34550894823011347, + "grad_norm": 6.572473526000977, + "learning_rate": 7.619375731148823e-05, + "loss": 2.277, + "step": 4590 + }, + { + "epoch": 0.34558422251077364, + "grad_norm": 3.884626626968384, + "learning_rate": 7.618337239209715e-05, + "loss": 1.7471, + "step": 4591 + }, + { + "epoch": 0.34565949679143376, + "grad_norm": 5.173831462860107, + "learning_rate": 7.617298591617971e-05, + "loss": 1.972, + "step": 4592 + }, + { + "epoch": 0.34573477107209394, + "grad_norm": 6.169503211975098, + "learning_rate": 7.616259788435337e-05, + "loss": 2.2883, + "step": 4593 + }, + { + "epoch": 0.3458100453527541, + "grad_norm": 5.72142219543457, + "learning_rate": 7.615220829723563e-05, + "loss": 2.0167, + "step": 4594 + }, + { + "epoch": 0.34588531963341423, + "grad_norm": 6.513581275939941, + "learning_rate": 7.614181715544417e-05, + "loss": 1.8981, + "step": 4595 + }, + { + "epoch": 0.3459605939140744, + "grad_norm": 6.770571231842041, + "learning_rate": 7.613142445959668e-05, + "loss": 2.0815, + "step": 4596 + }, + { + "epoch": 0.3460358681947346, + "grad_norm": 6.776336193084717, + "learning_rate": 7.612103021031099e-05, + "loss": 2.1027, + "step": 4597 + }, + { + "epoch": 0.3461111424753947, + "grad_norm": 4.741491794586182, + "learning_rate": 7.611063440820501e-05, + "loss": 1.836, + "step": 4598 + }, + { + "epoch": 0.3461864167560549, + "grad_norm": 5.063950538635254, + "learning_rate": 7.610023705389673e-05, + "loss": 1.9709, + "step": 4599 + }, + { + "epoch": 0.34626169103671506, + "grad_norm": 3.7474453449249268, + "learning_rate": 7.608983814800423e-05, + "loss": 1.9985, + "step": 4600 + }, + { + "epoch": 0.3463369653173752, + "grad_norm": 4.869091987609863, + "learning_rate": 7.607943769114574e-05, + "loss": 1.5032, + "step": 4601 + }, + { + "epoch": 0.34641223959803535, + "grad_norm": 4.552026748657227, + "learning_rate": 7.606903568393948e-05, + "loss": 1.8636, + "step": 4602 + }, + { + "epoch": 0.3464875138786955, + "grad_norm": 6.490877151489258, + "learning_rate": 7.605863212700385e-05, + "loss": 1.839, + "step": 4603 + }, + { + "epoch": 0.34656278815935565, + "grad_norm": 4.372566223144531, + "learning_rate": 7.604822702095735e-05, + "loss": 1.8915, + "step": 4604 + }, + { + "epoch": 0.3466380624400158, + "grad_norm": 4.83139705657959, + "learning_rate": 7.603782036641846e-05, + "loss": 2.4893, + "step": 4605 + }, + { + "epoch": 0.34671333672067595, + "grad_norm": 5.271783828735352, + "learning_rate": 7.602741216400586e-05, + "loss": 1.739, + "step": 4606 + }, + { + "epoch": 0.3467886110013361, + "grad_norm": 7.364255905151367, + "learning_rate": 7.60170024143383e-05, + "loss": 1.7221, + "step": 4607 + }, + { + "epoch": 0.3468638852819963, + "grad_norm": 4.707937717437744, + "learning_rate": 7.600659111803458e-05, + "loss": 1.7394, + "step": 4608 + }, + { + "epoch": 0.3469391595626564, + "grad_norm": 6.612408638000488, + "learning_rate": 7.599617827571367e-05, + "loss": 1.9136, + "step": 4609 + }, + { + "epoch": 0.3470144338433166, + "grad_norm": 4.721059322357178, + "learning_rate": 7.598576388799452e-05, + "loss": 2.2428, + "step": 4610 + }, + { + "epoch": 0.3470897081239767, + "grad_norm": 3.8378398418426514, + "learning_rate": 7.59753479554963e-05, + "loss": 1.7345, + "step": 4611 + }, + { + "epoch": 0.3471649824046369, + "grad_norm": 5.509730815887451, + "learning_rate": 7.596493047883816e-05, + "loss": 2.0214, + "step": 4612 + }, + { + "epoch": 0.34724025668529707, + "grad_norm": 3.565896987915039, + "learning_rate": 7.595451145863938e-05, + "loss": 2.1282, + "step": 4613 + }, + { + "epoch": 0.3473155309659572, + "grad_norm": 8.077792167663574, + "learning_rate": 7.594409089551941e-05, + "loss": 2.0758, + "step": 4614 + }, + { + "epoch": 0.34739080524661736, + "grad_norm": 3.7963669300079346, + "learning_rate": 7.593366879009766e-05, + "loss": 1.8992, + "step": 4615 + }, + { + "epoch": 0.34746607952727754, + "grad_norm": 4.983345985412598, + "learning_rate": 7.592324514299371e-05, + "loss": 1.6256, + "step": 4616 + }, + { + "epoch": 0.34754135380793766, + "grad_norm": 4.405954837799072, + "learning_rate": 7.591281995482722e-05, + "loss": 2.1183, + "step": 4617 + }, + { + "epoch": 0.34761662808859783, + "grad_norm": 4.969959259033203, + "learning_rate": 7.590239322621792e-05, + "loss": 1.684, + "step": 4618 + }, + { + "epoch": 0.347691902369258, + "grad_norm": 4.020200729370117, + "learning_rate": 7.58919649577857e-05, + "loss": 2.1328, + "step": 4619 + }, + { + "epoch": 0.34776717664991813, + "grad_norm": 7.955776691436768, + "learning_rate": 7.588153515015043e-05, + "loss": 2.0194, + "step": 4620 + }, + { + "epoch": 0.3478424509305783, + "grad_norm": 5.230367183685303, + "learning_rate": 7.587110380393216e-05, + "loss": 2.2738, + "step": 4621 + }, + { + "epoch": 0.3479177252112384, + "grad_norm": 9.670056343078613, + "learning_rate": 7.586067091975103e-05, + "loss": 2.5352, + "step": 4622 + }, + { + "epoch": 0.3479929994918986, + "grad_norm": 3.5471746921539307, + "learning_rate": 7.585023649822717e-05, + "loss": 2.099, + "step": 4623 + }, + { + "epoch": 0.3480682737725588, + "grad_norm": 4.767022609710693, + "learning_rate": 7.583980053998095e-05, + "loss": 1.7201, + "step": 4624 + }, + { + "epoch": 0.3481435480532189, + "grad_norm": 4.282424449920654, + "learning_rate": 7.582936304563272e-05, + "loss": 2.1026, + "step": 4625 + }, + { + "epoch": 0.3482188223338791, + "grad_norm": 4.454834461212158, + "learning_rate": 7.581892401580297e-05, + "loss": 1.9721, + "step": 4626 + }, + { + "epoch": 0.34829409661453925, + "grad_norm": 4.503671169281006, + "learning_rate": 7.580848345111228e-05, + "loss": 1.599, + "step": 4627 + }, + { + "epoch": 0.34836937089519937, + "grad_norm": 4.410214424133301, + "learning_rate": 7.579804135218129e-05, + "loss": 1.9126, + "step": 4628 + }, + { + "epoch": 0.34844464517585955, + "grad_norm": 4.832303047180176, + "learning_rate": 7.578759771963077e-05, + "loss": 1.9689, + "step": 4629 + }, + { + "epoch": 0.34851991945651967, + "grad_norm": 9.535778045654297, + "learning_rate": 7.577715255408155e-05, + "loss": 1.7885, + "step": 4630 + }, + { + "epoch": 0.34859519373717984, + "grad_norm": 8.254091262817383, + "learning_rate": 7.576670585615459e-05, + "loss": 2.1989, + "step": 4631 + }, + { + "epoch": 0.34867046801784, + "grad_norm": 5.040248394012451, + "learning_rate": 7.57562576264709e-05, + "loss": 1.4962, + "step": 4632 + }, + { + "epoch": 0.34874574229850014, + "grad_norm": 5.14858341217041, + "learning_rate": 7.57458078656516e-05, + "loss": 2.3323, + "step": 4633 + }, + { + "epoch": 0.3488210165791603, + "grad_norm": 5.499851226806641, + "learning_rate": 7.573535657431788e-05, + "loss": 1.5993, + "step": 4634 + }, + { + "epoch": 0.3488962908598205, + "grad_norm": 5.463460922241211, + "learning_rate": 7.572490375309105e-05, + "loss": 1.7505, + "step": 4635 + }, + { + "epoch": 0.3489715651404806, + "grad_norm": 7.3944926261901855, + "learning_rate": 7.571444940259252e-05, + "loss": 2.1459, + "step": 4636 + }, + { + "epoch": 0.3490468394211408, + "grad_norm": 8.682705879211426, + "learning_rate": 7.570399352344375e-05, + "loss": 2.2236, + "step": 4637 + }, + { + "epoch": 0.34912211370180096, + "grad_norm": 6.601252555847168, + "learning_rate": 7.569353611626633e-05, + "loss": 1.8421, + "step": 4638 + }, + { + "epoch": 0.3491973879824611, + "grad_norm": 5.291766166687012, + "learning_rate": 7.568307718168189e-05, + "loss": 2.13, + "step": 4639 + }, + { + "epoch": 0.34927266226312126, + "grad_norm": 5.244277000427246, + "learning_rate": 7.56726167203122e-05, + "loss": 2.1281, + "step": 4640 + }, + { + "epoch": 0.3493479365437814, + "grad_norm": 5.088771343231201, + "learning_rate": 7.566215473277913e-05, + "loss": 1.9444, + "step": 4641 + }, + { + "epoch": 0.34942321082444155, + "grad_norm": 4.619437217712402, + "learning_rate": 7.565169121970459e-05, + "loss": 1.851, + "step": 4642 + }, + { + "epoch": 0.34949848510510173, + "grad_norm": 6.315009593963623, + "learning_rate": 7.564122618171061e-05, + "loss": 2.0517, + "step": 4643 + }, + { + "epoch": 0.34957375938576185, + "grad_norm": 6.088854789733887, + "learning_rate": 7.56307596194193e-05, + "loss": 1.6924, + "step": 4644 + }, + { + "epoch": 0.349649033666422, + "grad_norm": 5.432744026184082, + "learning_rate": 7.56202915334529e-05, + "loss": 1.7815, + "step": 4645 + }, + { + "epoch": 0.3497243079470822, + "grad_norm": 4.92105770111084, + "learning_rate": 7.560982192443364e-05, + "loss": 2.1509, + "step": 4646 + }, + { + "epoch": 0.3497995822277423, + "grad_norm": 5.1703925132751465, + "learning_rate": 7.559935079298397e-05, + "loss": 2.2104, + "step": 4647 + }, + { + "epoch": 0.3498748565084025, + "grad_norm": 6.527153491973877, + "learning_rate": 7.558887813972635e-05, + "loss": 2.4767, + "step": 4648 + }, + { + "epoch": 0.3499501307890627, + "grad_norm": 6.482638359069824, + "learning_rate": 7.557840396528334e-05, + "loss": 1.9808, + "step": 4649 + }, + { + "epoch": 0.3500254050697228, + "grad_norm": 6.609414577484131, + "learning_rate": 7.556792827027761e-05, + "loss": 1.9525, + "step": 4650 + }, + { + "epoch": 0.35010067935038297, + "grad_norm": 5.3686747550964355, + "learning_rate": 7.555745105533191e-05, + "loss": 1.8645, + "step": 4651 + }, + { + "epoch": 0.3501759536310431, + "grad_norm": 5.167562007904053, + "learning_rate": 7.554697232106906e-05, + "loss": 2.049, + "step": 4652 + }, + { + "epoch": 0.35025122791170327, + "grad_norm": 4.992973804473877, + "learning_rate": 7.553649206811204e-05, + "loss": 1.9439, + "step": 4653 + }, + { + "epoch": 0.35032650219236344, + "grad_norm": 5.324767112731934, + "learning_rate": 7.552601029708382e-05, + "loss": 1.8205, + "step": 4654 + }, + { + "epoch": 0.35040177647302356, + "grad_norm": 7.380729675292969, + "learning_rate": 7.551552700860753e-05, + "loss": 2.1826, + "step": 4655 + }, + { + "epoch": 0.35047705075368374, + "grad_norm": 4.950378894805908, + "learning_rate": 7.550504220330638e-05, + "loss": 1.8575, + "step": 4656 + }, + { + "epoch": 0.3505523250343439, + "grad_norm": 4.571684837341309, + "learning_rate": 7.549455588180363e-05, + "loss": 1.8922, + "step": 4657 + }, + { + "epoch": 0.35062759931500403, + "grad_norm": 3.9421005249023438, + "learning_rate": 7.548406804472271e-05, + "loss": 2.1249, + "step": 4658 + }, + { + "epoch": 0.3507028735956642, + "grad_norm": 4.3941473960876465, + "learning_rate": 7.547357869268705e-05, + "loss": 1.6238, + "step": 4659 + }, + { + "epoch": 0.35077814787632433, + "grad_norm": 4.860470771789551, + "learning_rate": 7.546308782632024e-05, + "loss": 1.7754, + "step": 4660 + }, + { + "epoch": 0.3508534221569845, + "grad_norm": 5.041442394256592, + "learning_rate": 7.545259544624592e-05, + "loss": 2.3265, + "step": 4661 + }, + { + "epoch": 0.3509286964376447, + "grad_norm": 4.453219890594482, + "learning_rate": 7.544210155308783e-05, + "loss": 1.7371, + "step": 4662 + }, + { + "epoch": 0.3510039707183048, + "grad_norm": 4.214534759521484, + "learning_rate": 7.54316061474698e-05, + "loss": 2.1597, + "step": 4663 + }, + { + "epoch": 0.351079244998965, + "grad_norm": 8.088193893432617, + "learning_rate": 7.542110923001576e-05, + "loss": 1.7023, + "step": 4664 + }, + { + "epoch": 0.35115451927962515, + "grad_norm": 4.413039207458496, + "learning_rate": 7.541061080134972e-05, + "loss": 1.7668, + "step": 4665 + }, + { + "epoch": 0.3512297935602853, + "grad_norm": 4.505979061126709, + "learning_rate": 7.540011086209578e-05, + "loss": 1.8772, + "step": 4666 + }, + { + "epoch": 0.35130506784094545, + "grad_norm": 5.304669380187988, + "learning_rate": 7.538960941287813e-05, + "loss": 1.8723, + "step": 4667 + }, + { + "epoch": 0.3513803421216056, + "grad_norm": 9.848307609558105, + "learning_rate": 7.537910645432105e-05, + "loss": 2.4731, + "step": 4668 + }, + { + "epoch": 0.35145561640226575, + "grad_norm": 4.413638114929199, + "learning_rate": 7.53686019870489e-05, + "loss": 2.003, + "step": 4669 + }, + { + "epoch": 0.3515308906829259, + "grad_norm": 3.5043060779571533, + "learning_rate": 7.535809601168617e-05, + "loss": 1.636, + "step": 4670 + }, + { + "epoch": 0.35160616496358604, + "grad_norm": 5.944123268127441, + "learning_rate": 7.53475885288574e-05, + "loss": 1.879, + "step": 4671 + }, + { + "epoch": 0.3516814392442462, + "grad_norm": 4.058021545410156, + "learning_rate": 7.53370795391872e-05, + "loss": 1.5877, + "step": 4672 + }, + { + "epoch": 0.3517567135249064, + "grad_norm": 4.630694389343262, + "learning_rate": 7.53265690433003e-05, + "loss": 1.6617, + "step": 4673 + }, + { + "epoch": 0.3518319878055665, + "grad_norm": 4.611077308654785, + "learning_rate": 7.531605704182159e-05, + "loss": 1.8215, + "step": 4674 + }, + { + "epoch": 0.3519072620862267, + "grad_norm": 6.053842544555664, + "learning_rate": 7.53055435353759e-05, + "loss": 2.4532, + "step": 4675 + }, + { + "epoch": 0.35198253636688687, + "grad_norm": 6.291989803314209, + "learning_rate": 7.529502852458828e-05, + "loss": 1.8404, + "step": 4676 + }, + { + "epoch": 0.352057810647547, + "grad_norm": 3.821798086166382, + "learning_rate": 7.528451201008378e-05, + "loss": 2.0933, + "step": 4677 + }, + { + "epoch": 0.35213308492820716, + "grad_norm": 4.469470977783203, + "learning_rate": 7.527399399248759e-05, + "loss": 1.9019, + "step": 4678 + }, + { + "epoch": 0.3522083592088673, + "grad_norm": 4.913358688354492, + "learning_rate": 7.526347447242496e-05, + "loss": 1.9761, + "step": 4679 + }, + { + "epoch": 0.35228363348952746, + "grad_norm": 6.259414196014404, + "learning_rate": 7.525295345052128e-05, + "loss": 1.8963, + "step": 4680 + }, + { + "epoch": 0.35235890777018763, + "grad_norm": 9.231456756591797, + "learning_rate": 7.524243092740198e-05, + "loss": 1.9634, + "step": 4681 + }, + { + "epoch": 0.35243418205084776, + "grad_norm": 6.984905242919922, + "learning_rate": 7.523190690369258e-05, + "loss": 1.9377, + "step": 4682 + }, + { + "epoch": 0.35250945633150793, + "grad_norm": 5.892779350280762, + "learning_rate": 7.522138138001873e-05, + "loss": 2.2882, + "step": 4683 + }, + { + "epoch": 0.3525847306121681, + "grad_norm": 8.263434410095215, + "learning_rate": 7.52108543570061e-05, + "loss": 2.1783, + "step": 4684 + }, + { + "epoch": 0.3526600048928282, + "grad_norm": 5.448480129241943, + "learning_rate": 7.520032583528052e-05, + "loss": 1.8391, + "step": 4685 + }, + { + "epoch": 0.3527352791734884, + "grad_norm": 6.509362697601318, + "learning_rate": 7.51897958154679e-05, + "loss": 2.0646, + "step": 4686 + }, + { + "epoch": 0.3528105534541486, + "grad_norm": 5.537726879119873, + "learning_rate": 7.517926429819418e-05, + "loss": 2.2786, + "step": 4687 + }, + { + "epoch": 0.3528858277348087, + "grad_norm": 7.844386100769043, + "learning_rate": 7.516873128408546e-05, + "loss": 2.162, + "step": 4688 + }, + { + "epoch": 0.3529611020154689, + "grad_norm": 4.243312358856201, + "learning_rate": 7.515819677376787e-05, + "loss": 1.9451, + "step": 4689 + }, + { + "epoch": 0.353036376296129, + "grad_norm": 5.570211887359619, + "learning_rate": 7.514766076786766e-05, + "loss": 2.2576, + "step": 4690 + }, + { + "epoch": 0.35311165057678917, + "grad_norm": 4.9283833503723145, + "learning_rate": 7.513712326701116e-05, + "loss": 2.1044, + "step": 4691 + }, + { + "epoch": 0.35318692485744935, + "grad_norm": 5.249383926391602, + "learning_rate": 7.512658427182484e-05, + "loss": 2.3646, + "step": 4692 + }, + { + "epoch": 0.35326219913810947, + "grad_norm": 6.563067436218262, + "learning_rate": 7.511604378293518e-05, + "loss": 1.6723, + "step": 4693 + }, + { + "epoch": 0.35333747341876964, + "grad_norm": 8.052225112915039, + "learning_rate": 7.510550180096877e-05, + "loss": 2.1962, + "step": 4694 + }, + { + "epoch": 0.3534127476994298, + "grad_norm": 7.162243843078613, + "learning_rate": 7.509495832655232e-05, + "loss": 1.7458, + "step": 4695 + }, + { + "epoch": 0.35348802198008994, + "grad_norm": 6.215378284454346, + "learning_rate": 7.50844133603126e-05, + "loss": 2.1654, + "step": 4696 + }, + { + "epoch": 0.3535632962607501, + "grad_norm": 4.970090866088867, + "learning_rate": 7.507386690287647e-05, + "loss": 2.0161, + "step": 4697 + }, + { + "epoch": 0.3536385705414103, + "grad_norm": 7.203294277191162, + "learning_rate": 7.506331895487092e-05, + "loss": 1.9721, + "step": 4698 + }, + { + "epoch": 0.3537138448220704, + "grad_norm": 4.93463659286499, + "learning_rate": 7.505276951692297e-05, + "loss": 2.126, + "step": 4699 + }, + { + "epoch": 0.3537891191027306, + "grad_norm": 5.6041460037231445, + "learning_rate": 7.504221858965975e-05, + "loss": 1.7942, + "step": 4700 + }, + { + "epoch": 0.3538643933833907, + "grad_norm": 4.612369537353516, + "learning_rate": 7.503166617370849e-05, + "loss": 1.6288, + "step": 4701 + }, + { + "epoch": 0.3539396676640509, + "grad_norm": 4.573864936828613, + "learning_rate": 7.50211122696965e-05, + "loss": 1.9428, + "step": 4702 + }, + { + "epoch": 0.35401494194471106, + "grad_norm": 3.730754852294922, + "learning_rate": 7.501055687825117e-05, + "loss": 2.0171, + "step": 4703 + }, + { + "epoch": 0.3540902162253712, + "grad_norm": 3.9611310958862305, + "learning_rate": 7.500000000000001e-05, + "loss": 1.9881, + "step": 4704 + }, + { + "epoch": 0.35416549050603136, + "grad_norm": 5.63819694519043, + "learning_rate": 7.498944163557059e-05, + "loss": 1.8067, + "step": 4705 + }, + { + "epoch": 0.35424076478669153, + "grad_norm": 5.431600570678711, + "learning_rate": 7.497888178559055e-05, + "loss": 2.0458, + "step": 4706 + }, + { + "epoch": 0.35431603906735165, + "grad_norm": 4.7816643714904785, + "learning_rate": 7.496832045068765e-05, + "loss": 1.974, + "step": 4707 + }, + { + "epoch": 0.3543913133480118, + "grad_norm": 5.847719669342041, + "learning_rate": 7.495775763148975e-05, + "loss": 1.8504, + "step": 4708 + }, + { + "epoch": 0.35446658762867195, + "grad_norm": 4.829690933227539, + "learning_rate": 7.494719332862478e-05, + "loss": 2.0692, + "step": 4709 + }, + { + "epoch": 0.3545418619093321, + "grad_norm": 3.6871941089630127, + "learning_rate": 7.493662754272075e-05, + "loss": 2.2323, + "step": 4710 + }, + { + "epoch": 0.3546171361899923, + "grad_norm": 5.387477874755859, + "learning_rate": 7.492606027440577e-05, + "loss": 1.4925, + "step": 4711 + }, + { + "epoch": 0.3546924104706524, + "grad_norm": 5.110841274261475, + "learning_rate": 7.491549152430801e-05, + "loss": 1.8387, + "step": 4712 + }, + { + "epoch": 0.3547676847513126, + "grad_norm": 5.340623378753662, + "learning_rate": 7.490492129305577e-05, + "loss": 1.7546, + "step": 4713 + }, + { + "epoch": 0.35484295903197277, + "grad_norm": 5.21928071975708, + "learning_rate": 7.489434958127744e-05, + "loss": 1.8928, + "step": 4714 + }, + { + "epoch": 0.3549182333126329, + "grad_norm": 5.114518165588379, + "learning_rate": 7.488377638960143e-05, + "loss": 1.6143, + "step": 4715 + }, + { + "epoch": 0.35499350759329307, + "grad_norm": 4.30167818069458, + "learning_rate": 7.487320171865633e-05, + "loss": 1.6975, + "step": 4716 + }, + { + "epoch": 0.35506878187395324, + "grad_norm": 5.553513050079346, + "learning_rate": 7.486262556907075e-05, + "loss": 1.9717, + "step": 4717 + }, + { + "epoch": 0.35514405615461336, + "grad_norm": 5.89702033996582, + "learning_rate": 7.485204794147343e-05, + "loss": 1.8398, + "step": 4718 + }, + { + "epoch": 0.35521933043527354, + "grad_norm": 3.8982880115509033, + "learning_rate": 7.484146883649316e-05, + "loss": 1.8755, + "step": 4719 + }, + { + "epoch": 0.35529460471593366, + "grad_norm": 5.744004249572754, + "learning_rate": 7.483088825475885e-05, + "loss": 2.085, + "step": 4720 + }, + { + "epoch": 0.35536987899659384, + "grad_norm": 4.891592502593994, + "learning_rate": 7.482030619689947e-05, + "loss": 1.855, + "step": 4721 + }, + { + "epoch": 0.355445153277254, + "grad_norm": 4.790428638458252, + "learning_rate": 7.480972266354411e-05, + "loss": 1.9218, + "step": 4722 + }, + { + "epoch": 0.35552042755791413, + "grad_norm": 5.237813472747803, + "learning_rate": 7.479913765532193e-05, + "loss": 2.0425, + "step": 4723 + }, + { + "epoch": 0.3555957018385743, + "grad_norm": 4.616795539855957, + "learning_rate": 7.478855117286217e-05, + "loss": 1.8318, + "step": 4724 + }, + { + "epoch": 0.3556709761192345, + "grad_norm": 5.187572479248047, + "learning_rate": 7.477796321679416e-05, + "loss": 2.0823, + "step": 4725 + }, + { + "epoch": 0.3557462503998946, + "grad_norm": 3.922170400619507, + "learning_rate": 7.476737378774735e-05, + "loss": 1.8995, + "step": 4726 + }, + { + "epoch": 0.3558215246805548, + "grad_norm": 4.256510257720947, + "learning_rate": 7.475678288635122e-05, + "loss": 1.7397, + "step": 4727 + }, + { + "epoch": 0.3558967989612149, + "grad_norm": 4.612951278686523, + "learning_rate": 7.474619051323539e-05, + "loss": 2.2163, + "step": 4728 + }, + { + "epoch": 0.3559720732418751, + "grad_norm": 3.8241257667541504, + "learning_rate": 7.473559666902954e-05, + "loss": 1.9591, + "step": 4729 + }, + { + "epoch": 0.35604734752253525, + "grad_norm": 4.98952579498291, + "learning_rate": 7.472500135436344e-05, + "loss": 1.8972, + "step": 4730 + }, + { + "epoch": 0.35612262180319537, + "grad_norm": 4.38181734085083, + "learning_rate": 7.471440456986695e-05, + "loss": 2.1073, + "step": 4731 + }, + { + "epoch": 0.35619789608385555, + "grad_norm": 5.273332595825195, + "learning_rate": 7.470380631617002e-05, + "loss": 2.0011, + "step": 4732 + }, + { + "epoch": 0.3562731703645157, + "grad_norm": 4.205855846405029, + "learning_rate": 7.46932065939027e-05, + "loss": 1.7867, + "step": 4733 + }, + { + "epoch": 0.35634844464517584, + "grad_norm": 10.717961311340332, + "learning_rate": 7.46826054036951e-05, + "loss": 1.6936, + "step": 4734 + }, + { + "epoch": 0.356423718925836, + "grad_norm": 4.712283611297607, + "learning_rate": 7.467200274617741e-05, + "loss": 1.8296, + "step": 4735 + }, + { + "epoch": 0.3564989932064962, + "grad_norm": 5.461143493652344, + "learning_rate": 7.466139862197996e-05, + "loss": 2.466, + "step": 4736 + }, + { + "epoch": 0.3565742674871563, + "grad_norm": 4.93215274810791, + "learning_rate": 7.465079303173312e-05, + "loss": 1.8435, + "step": 4737 + }, + { + "epoch": 0.3566495417678165, + "grad_norm": 7.37506103515625, + "learning_rate": 7.464018597606738e-05, + "loss": 1.8774, + "step": 4738 + }, + { + "epoch": 0.3567248160484766, + "grad_norm": 5.154730319976807, + "learning_rate": 7.462957745561328e-05, + "loss": 1.6731, + "step": 4739 + }, + { + "epoch": 0.3568000903291368, + "grad_norm": 5.998811721801758, + "learning_rate": 7.461896747100146e-05, + "loss": 1.7755, + "step": 4740 + }, + { + "epoch": 0.35687536460979696, + "grad_norm": 4.997064113616943, + "learning_rate": 7.460835602286265e-05, + "loss": 1.5909, + "step": 4741 + }, + { + "epoch": 0.3569506388904571, + "grad_norm": 3.6149983406066895, + "learning_rate": 7.45977431118277e-05, + "loss": 2.234, + "step": 4742 + }, + { + "epoch": 0.35702591317111726, + "grad_norm": 4.140749454498291, + "learning_rate": 7.458712873852752e-05, + "loss": 1.9614, + "step": 4743 + }, + { + "epoch": 0.35710118745177744, + "grad_norm": 4.971986293792725, + "learning_rate": 7.457651290359306e-05, + "loss": 1.7933, + "step": 4744 + }, + { + "epoch": 0.35717646173243756, + "grad_norm": 5.6855974197387695, + "learning_rate": 7.456589560765545e-05, + "loss": 1.7852, + "step": 4745 + }, + { + "epoch": 0.35725173601309773, + "grad_norm": 5.738282680511475, + "learning_rate": 7.455527685134581e-05, + "loss": 1.8617, + "step": 4746 + }, + { + "epoch": 0.3573270102937579, + "grad_norm": 5.753636360168457, + "learning_rate": 7.454465663529542e-05, + "loss": 1.7105, + "step": 4747 + }, + { + "epoch": 0.357402284574418, + "grad_norm": 3.909217596054077, + "learning_rate": 7.453403496013563e-05, + "loss": 1.7866, + "step": 4748 + }, + { + "epoch": 0.3574775588550782, + "grad_norm": 4.014737606048584, + "learning_rate": 7.452341182649786e-05, + "loss": 2.0668, + "step": 4749 + }, + { + "epoch": 0.3575528331357383, + "grad_norm": 5.340160369873047, + "learning_rate": 7.451278723501362e-05, + "loss": 2.0868, + "step": 4750 + }, + { + "epoch": 0.3576281074163985, + "grad_norm": 9.055809020996094, + "learning_rate": 7.45021611863145e-05, + "loss": 1.7201, + "step": 4751 + }, + { + "epoch": 0.3577033816970587, + "grad_norm": 4.3002400398254395, + "learning_rate": 7.449153368103222e-05, + "loss": 1.8106, + "step": 4752 + }, + { + "epoch": 0.3577786559777188, + "grad_norm": 6.857629776000977, + "learning_rate": 7.448090471979853e-05, + "loss": 2.0487, + "step": 4753 + }, + { + "epoch": 0.35785393025837897, + "grad_norm": 4.552512168884277, + "learning_rate": 7.44702743032453e-05, + "loss": 2.3641, + "step": 4754 + }, + { + "epoch": 0.35792920453903915, + "grad_norm": 5.023601055145264, + "learning_rate": 7.445964243200447e-05, + "loss": 1.975, + "step": 4755 + }, + { + "epoch": 0.35800447881969927, + "grad_norm": 4.242371082305908, + "learning_rate": 7.444900910670808e-05, + "loss": 2.2007, + "step": 4756 + }, + { + "epoch": 0.35807975310035944, + "grad_norm": 6.100505352020264, + "learning_rate": 7.443837432798826e-05, + "loss": 2.4539, + "step": 4757 + }, + { + "epoch": 0.35815502738101956, + "grad_norm": 4.475913047790527, + "learning_rate": 7.442773809647718e-05, + "loss": 1.7682, + "step": 4758 + }, + { + "epoch": 0.35823030166167974, + "grad_norm": 7.261290550231934, + "learning_rate": 7.44171004128072e-05, + "loss": 2.3687, + "step": 4759 + }, + { + "epoch": 0.3583055759423399, + "grad_norm": 6.279832363128662, + "learning_rate": 7.440646127761064e-05, + "loss": 2.0404, + "step": 4760 + }, + { + "epoch": 0.35838085022300004, + "grad_norm": 5.254789352416992, + "learning_rate": 7.439582069151999e-05, + "loss": 1.8819, + "step": 4761 + }, + { + "epoch": 0.3584561245036602, + "grad_norm": 5.2040629386901855, + "learning_rate": 7.43851786551678e-05, + "loss": 2.2314, + "step": 4762 + }, + { + "epoch": 0.3585313987843204, + "grad_norm": 5.418365955352783, + "learning_rate": 7.437453516918669e-05, + "loss": 1.9486, + "step": 4763 + }, + { + "epoch": 0.3586066730649805, + "grad_norm": 3.9953980445861816, + "learning_rate": 7.436389023420944e-05, + "loss": 2.0468, + "step": 4764 + }, + { + "epoch": 0.3586819473456407, + "grad_norm": 5.670870304107666, + "learning_rate": 7.43532438508688e-05, + "loss": 2.0555, + "step": 4765 + }, + { + "epoch": 0.35875722162630086, + "grad_norm": 4.724856853485107, + "learning_rate": 7.43425960197977e-05, + "loss": 1.8635, + "step": 4766 + }, + { + "epoch": 0.358832495906961, + "grad_norm": 5.276005268096924, + "learning_rate": 7.433194674162911e-05, + "loss": 1.904, + "step": 4767 + }, + { + "epoch": 0.35890777018762116, + "grad_norm": 6.899415016174316, + "learning_rate": 7.43212960169961e-05, + "loss": 1.5306, + "step": 4768 + }, + { + "epoch": 0.3589830444682813, + "grad_norm": 5.703815460205078, + "learning_rate": 7.431064384653182e-05, + "loss": 1.7717, + "step": 4769 + }, + { + "epoch": 0.35905831874894145, + "grad_norm": 5.5511064529418945, + "learning_rate": 7.429999023086953e-05, + "loss": 1.8758, + "step": 4770 + }, + { + "epoch": 0.3591335930296016, + "grad_norm": 3.3603591918945312, + "learning_rate": 7.428933517064254e-05, + "loss": 1.9781, + "step": 4771 + }, + { + "epoch": 0.35920886731026175, + "grad_norm": 6.898568630218506, + "learning_rate": 7.427867866648425e-05, + "loss": 2.1196, + "step": 4772 + }, + { + "epoch": 0.3592841415909219, + "grad_norm": 9.525616645812988, + "learning_rate": 7.42680207190282e-05, + "loss": 1.8503, + "step": 4773 + }, + { + "epoch": 0.3593594158715821, + "grad_norm": 6.236075401306152, + "learning_rate": 7.425736132890795e-05, + "loss": 1.6964, + "step": 4774 + }, + { + "epoch": 0.3594346901522422, + "grad_norm": 4.443072319030762, + "learning_rate": 7.424670049675716e-05, + "loss": 2.0045, + "step": 4775 + }, + { + "epoch": 0.3595099644329024, + "grad_norm": 5.724411487579346, + "learning_rate": 7.423603822320962e-05, + "loss": 2.1791, + "step": 4776 + }, + { + "epoch": 0.3595852387135625, + "grad_norm": 5.621659278869629, + "learning_rate": 7.422537450889913e-05, + "loss": 1.8719, + "step": 4777 + }, + { + "epoch": 0.3596605129942227, + "grad_norm": 4.912325382232666, + "learning_rate": 7.421470935445964e-05, + "loss": 1.9995, + "step": 4778 + }, + { + "epoch": 0.35973578727488287, + "grad_norm": 4.139276504516602, + "learning_rate": 7.420404276052517e-05, + "loss": 1.8926, + "step": 4779 + }, + { + "epoch": 0.359811061555543, + "grad_norm": 5.174743175506592, + "learning_rate": 7.419337472772978e-05, + "loss": 1.8217, + "step": 4780 + }, + { + "epoch": 0.35988633583620316, + "grad_norm": 6.568526744842529, + "learning_rate": 7.418270525670769e-05, + "loss": 2.0899, + "step": 4781 + }, + { + "epoch": 0.35996161011686334, + "grad_norm": 11.795207977294922, + "learning_rate": 7.417203434809317e-05, + "loss": 2.3107, + "step": 4782 + }, + { + "epoch": 0.36003688439752346, + "grad_norm": 5.63358211517334, + "learning_rate": 7.416136200252057e-05, + "loss": 1.6795, + "step": 4783 + }, + { + "epoch": 0.36011215867818364, + "grad_norm": 6.02707052230835, + "learning_rate": 7.415068822062432e-05, + "loss": 1.6998, + "step": 4784 + }, + { + "epoch": 0.3601874329588438, + "grad_norm": 3.5824978351593018, + "learning_rate": 7.414001300303894e-05, + "loss": 1.6323, + "step": 4785 + }, + { + "epoch": 0.36026270723950393, + "grad_norm": 5.2145094871521, + "learning_rate": 7.412933635039907e-05, + "loss": 2.334, + "step": 4786 + }, + { + "epoch": 0.3603379815201641, + "grad_norm": 3.8623034954071045, + "learning_rate": 7.411865826333937e-05, + "loss": 1.9674, + "step": 4787 + }, + { + "epoch": 0.36041325580082423, + "grad_norm": 3.8722481727600098, + "learning_rate": 7.410797874249464e-05, + "loss": 2.2147, + "step": 4788 + }, + { + "epoch": 0.3604885300814844, + "grad_norm": 4.582956790924072, + "learning_rate": 7.409729778849977e-05, + "loss": 2.0644, + "step": 4789 + }, + { + "epoch": 0.3605638043621446, + "grad_norm": 5.250698566436768, + "learning_rate": 7.408661540198968e-05, + "loss": 1.8164, + "step": 4790 + }, + { + "epoch": 0.3606390786428047, + "grad_norm": 4.265100955963135, + "learning_rate": 7.40759315835994e-05, + "loss": 2.0852, + "step": 4791 + }, + { + "epoch": 0.3607143529234649, + "grad_norm": 5.080120086669922, + "learning_rate": 7.40652463339641e-05, + "loss": 1.693, + "step": 4792 + }, + { + "epoch": 0.36078962720412505, + "grad_norm": 6.49351167678833, + "learning_rate": 7.405455965371892e-05, + "loss": 2.1535, + "step": 4793 + }, + { + "epoch": 0.36086490148478517, + "grad_norm": 3.9224488735198975, + "learning_rate": 7.40438715434992e-05, + "loss": 1.6959, + "step": 4794 + }, + { + "epoch": 0.36094017576544535, + "grad_norm": 5.992222309112549, + "learning_rate": 7.403318200394033e-05, + "loss": 2.6366, + "step": 4795 + }, + { + "epoch": 0.3610154500461055, + "grad_norm": 4.102953910827637, + "learning_rate": 7.402249103567773e-05, + "loss": 2.6105, + "step": 4796 + }, + { + "epoch": 0.36109072432676564, + "grad_norm": 4.68909215927124, + "learning_rate": 7.401179863934698e-05, + "loss": 2.0845, + "step": 4797 + }, + { + "epoch": 0.3611659986074258, + "grad_norm": 5.32451868057251, + "learning_rate": 7.400110481558367e-05, + "loss": 1.8689, + "step": 4798 + }, + { + "epoch": 0.36124127288808594, + "grad_norm": 4.771303653717041, + "learning_rate": 7.399040956502358e-05, + "loss": 1.8479, + "step": 4799 + }, + { + "epoch": 0.3613165471687461, + "grad_norm": 6.669151306152344, + "learning_rate": 7.397971288830246e-05, + "loss": 2.598, + "step": 4800 + }, + { + "epoch": 0.3613918214494063, + "grad_norm": 5.390741348266602, + "learning_rate": 7.39690147860562e-05, + "loss": 1.6848, + "step": 4801 + }, + { + "epoch": 0.3614670957300664, + "grad_norm": 4.276814937591553, + "learning_rate": 7.39583152589208e-05, + "loss": 1.9152, + "step": 4802 + }, + { + "epoch": 0.3615423700107266, + "grad_norm": 4.254633903503418, + "learning_rate": 7.394761430753232e-05, + "loss": 1.8913, + "step": 4803 + }, + { + "epoch": 0.36161764429138676, + "grad_norm": 5.3279032707214355, + "learning_rate": 7.393691193252685e-05, + "loss": 1.9118, + "step": 4804 + }, + { + "epoch": 0.3616929185720469, + "grad_norm": 4.776759624481201, + "learning_rate": 7.392620813454066e-05, + "loss": 1.9526, + "step": 4805 + }, + { + "epoch": 0.36176819285270706, + "grad_norm": 4.740059852600098, + "learning_rate": 7.391550291421007e-05, + "loss": 1.794, + "step": 4806 + }, + { + "epoch": 0.3618434671333672, + "grad_norm": 6.114710807800293, + "learning_rate": 7.390479627217143e-05, + "loss": 2.1396, + "step": 4807 + }, + { + "epoch": 0.36191874141402736, + "grad_norm": 5.867406368255615, + "learning_rate": 7.389408820906124e-05, + "loss": 2.0246, + "step": 4808 + }, + { + "epoch": 0.36199401569468753, + "grad_norm": 5.67816162109375, + "learning_rate": 7.388337872551607e-05, + "loss": 1.9247, + "step": 4809 + }, + { + "epoch": 0.36206928997534765, + "grad_norm": 3.9525246620178223, + "learning_rate": 7.387266782217258e-05, + "loss": 1.8749, + "step": 4810 + }, + { + "epoch": 0.36214456425600783, + "grad_norm": 4.5001373291015625, + "learning_rate": 7.386195549966747e-05, + "loss": 2.3851, + "step": 4811 + }, + { + "epoch": 0.362219838536668, + "grad_norm": 5.693944454193115, + "learning_rate": 7.385124175863759e-05, + "loss": 1.7969, + "step": 4812 + }, + { + "epoch": 0.3622951128173281, + "grad_norm": 4.589908599853516, + "learning_rate": 7.38405265997198e-05, + "loss": 1.8688, + "step": 4813 + }, + { + "epoch": 0.3623703870979883, + "grad_norm": 4.070955753326416, + "learning_rate": 7.382981002355111e-05, + "loss": 2.1463, + "step": 4814 + }, + { + "epoch": 0.3624456613786485, + "grad_norm": 3.5276036262512207, + "learning_rate": 7.38190920307686e-05, + "loss": 1.774, + "step": 4815 + }, + { + "epoch": 0.3625209356593086, + "grad_norm": 4.324839115142822, + "learning_rate": 7.380837262200943e-05, + "loss": 1.9547, + "step": 4816 + }, + { + "epoch": 0.36259620993996877, + "grad_norm": 3.9447531700134277, + "learning_rate": 7.379765179791081e-05, + "loss": 1.8451, + "step": 4817 + }, + { + "epoch": 0.3626714842206289, + "grad_norm": 4.021512985229492, + "learning_rate": 7.378692955911005e-05, + "loss": 1.7889, + "step": 4818 + }, + { + "epoch": 0.36274675850128907, + "grad_norm": 6.69473934173584, + "learning_rate": 7.377620590624462e-05, + "loss": 1.7344, + "step": 4819 + }, + { + "epoch": 0.36282203278194924, + "grad_norm": 4.31016206741333, + "learning_rate": 7.376548083995193e-05, + "loss": 1.9734, + "step": 4820 + }, + { + "epoch": 0.36289730706260936, + "grad_norm": 6.312428951263428, + "learning_rate": 7.375475436086961e-05, + "loss": 1.6103, + "step": 4821 + }, + { + "epoch": 0.36297258134326954, + "grad_norm": 5.24380350112915, + "learning_rate": 7.374402646963533e-05, + "loss": 1.6092, + "step": 4822 + }, + { + "epoch": 0.3630478556239297, + "grad_norm": 5.6303839683532715, + "learning_rate": 7.373329716688677e-05, + "loss": 2.0299, + "step": 4823 + }, + { + "epoch": 0.36312312990458984, + "grad_norm": 4.681593894958496, + "learning_rate": 7.37225664532618e-05, + "loss": 1.9404, + "step": 4824 + }, + { + "epoch": 0.36319840418525, + "grad_norm": 5.024601936340332, + "learning_rate": 7.371183432939832e-05, + "loss": 1.8365, + "step": 4825 + }, + { + "epoch": 0.36327367846591013, + "grad_norm": 4.32515287399292, + "learning_rate": 7.370110079593435e-05, + "loss": 2.0221, + "step": 4826 + }, + { + "epoch": 0.3633489527465703, + "grad_norm": 4.834447383880615, + "learning_rate": 7.369036585350792e-05, + "loss": 1.8903, + "step": 4827 + }, + { + "epoch": 0.3634242270272305, + "grad_norm": 4.819180488586426, + "learning_rate": 7.367962950275724e-05, + "loss": 1.7662, + "step": 4828 + }, + { + "epoch": 0.3634995013078906, + "grad_norm": 4.502471446990967, + "learning_rate": 7.36688917443205e-05, + "loss": 1.9072, + "step": 4829 + }, + { + "epoch": 0.3635747755885508, + "grad_norm": 5.838901519775391, + "learning_rate": 7.365815257883607e-05, + "loss": 1.9506, + "step": 4830 + }, + { + "epoch": 0.36365004986921096, + "grad_norm": 3.4789304733276367, + "learning_rate": 7.364741200694235e-05, + "loss": 2.0589, + "step": 4831 + }, + { + "epoch": 0.3637253241498711, + "grad_norm": 5.203699111938477, + "learning_rate": 7.363667002927785e-05, + "loss": 1.747, + "step": 4832 + }, + { + "epoch": 0.36380059843053125, + "grad_norm": 5.024288177490234, + "learning_rate": 7.362592664648113e-05, + "loss": 1.9765, + "step": 4833 + }, + { + "epoch": 0.36387587271119143, + "grad_norm": 9.029402732849121, + "learning_rate": 7.361518185919087e-05, + "loss": 1.7673, + "step": 4834 + }, + { + "epoch": 0.36395114699185155, + "grad_norm": 7.1428422927856445, + "learning_rate": 7.360443566804578e-05, + "loss": 1.4925, + "step": 4835 + }, + { + "epoch": 0.3640264212725117, + "grad_norm": 6.792303562164307, + "learning_rate": 7.359368807368474e-05, + "loss": 2.0013, + "step": 4836 + }, + { + "epoch": 0.36410169555317184, + "grad_norm": 4.788605690002441, + "learning_rate": 7.358293907674664e-05, + "loss": 1.7173, + "step": 4837 + }, + { + "epoch": 0.364176969833832, + "grad_norm": 4.90203857421875, + "learning_rate": 7.357218867787048e-05, + "loss": 2.0681, + "step": 4838 + }, + { + "epoch": 0.3642522441144922, + "grad_norm": 7.211824417114258, + "learning_rate": 7.356143687769534e-05, + "loss": 1.9513, + "step": 4839 + }, + { + "epoch": 0.3643275183951523, + "grad_norm": 4.8574604988098145, + "learning_rate": 7.355068367686039e-05, + "loss": 1.86, + "step": 4840 + }, + { + "epoch": 0.3644027926758125, + "grad_norm": 3.9704689979553223, + "learning_rate": 7.353992907600484e-05, + "loss": 1.7771, + "step": 4841 + }, + { + "epoch": 0.36447806695647267, + "grad_norm": 4.170836448669434, + "learning_rate": 7.352917307576808e-05, + "loss": 1.8302, + "step": 4842 + }, + { + "epoch": 0.3645533412371328, + "grad_norm": 4.580347537994385, + "learning_rate": 7.351841567678948e-05, + "loss": 1.6868, + "step": 4843 + }, + { + "epoch": 0.36462861551779296, + "grad_norm": 4.951285362243652, + "learning_rate": 7.350765687970856e-05, + "loss": 1.811, + "step": 4844 + }, + { + "epoch": 0.36470388979845314, + "grad_norm": 5.445378303527832, + "learning_rate": 7.349689668516488e-05, + "loss": 1.9264, + "step": 4845 + }, + { + "epoch": 0.36477916407911326, + "grad_norm": 6.375279903411865, + "learning_rate": 7.348613509379811e-05, + "loss": 2.2344, + "step": 4846 + }, + { + "epoch": 0.36485443835977344, + "grad_norm": 8.046308517456055, + "learning_rate": 7.347537210624802e-05, + "loss": 1.9793, + "step": 4847 + }, + { + "epoch": 0.36492971264043356, + "grad_norm": 6.517088890075684, + "learning_rate": 7.346460772315439e-05, + "loss": 1.8596, + "step": 4848 + }, + { + "epoch": 0.36500498692109373, + "grad_norm": 5.455810070037842, + "learning_rate": 7.345384194515719e-05, + "loss": 2.0994, + "step": 4849 + }, + { + "epoch": 0.3650802612017539, + "grad_norm": 5.893856048583984, + "learning_rate": 7.344307477289637e-05, + "loss": 1.9412, + "step": 4850 + }, + { + "epoch": 0.36515553548241403, + "grad_norm": 8.713329315185547, + "learning_rate": 7.343230620701199e-05, + "loss": 1.9419, + "step": 4851 + }, + { + "epoch": 0.3652308097630742, + "grad_norm": 5.897685527801514, + "learning_rate": 7.342153624814427e-05, + "loss": 1.8261, + "step": 4852 + }, + { + "epoch": 0.3653060840437344, + "grad_norm": 5.235374927520752, + "learning_rate": 7.34107648969334e-05, + "loss": 2.1485, + "step": 4853 + }, + { + "epoch": 0.3653813583243945, + "grad_norm": 5.366219520568848, + "learning_rate": 7.339999215401975e-05, + "loss": 2.246, + "step": 4854 + }, + { + "epoch": 0.3654566326050547, + "grad_norm": 4.131043434143066, + "learning_rate": 7.338921802004372e-05, + "loss": 1.6422, + "step": 4855 + }, + { + "epoch": 0.3655319068857148, + "grad_norm": 10.72499942779541, + "learning_rate": 7.337844249564577e-05, + "loss": 1.9426, + "step": 4856 + }, + { + "epoch": 0.365607181166375, + "grad_norm": 7.017213344573975, + "learning_rate": 7.33676655814665e-05, + "loss": 1.9315, + "step": 4857 + }, + { + "epoch": 0.36568245544703515, + "grad_norm": 4.934020519256592, + "learning_rate": 7.335688727814655e-05, + "loss": 1.6458, + "step": 4858 + }, + { + "epoch": 0.36575772972769527, + "grad_norm": 4.223771095275879, + "learning_rate": 7.33461075863267e-05, + "loss": 1.9538, + "step": 4859 + }, + { + "epoch": 0.36583300400835544, + "grad_norm": 5.260005474090576, + "learning_rate": 7.333532650664772e-05, + "loss": 1.7474, + "step": 4860 + }, + { + "epoch": 0.3659082782890156, + "grad_norm": 5.600438117980957, + "learning_rate": 7.332454403975054e-05, + "loss": 1.8184, + "step": 4861 + }, + { + "epoch": 0.36598355256967574, + "grad_norm": 4.714437961578369, + "learning_rate": 7.331376018627613e-05, + "loss": 1.9086, + "step": 4862 + }, + { + "epoch": 0.3660588268503359, + "grad_norm": 4.6220221519470215, + "learning_rate": 7.330297494686557e-05, + "loss": 1.6586, + "step": 4863 + }, + { + "epoch": 0.3661341011309961, + "grad_norm": 5.061426639556885, + "learning_rate": 7.329218832216003e-05, + "loss": 1.8926, + "step": 4864 + }, + { + "epoch": 0.3662093754116562, + "grad_norm": 5.159934997558594, + "learning_rate": 7.328140031280074e-05, + "loss": 1.7559, + "step": 4865 + }, + { + "epoch": 0.3662846496923164, + "grad_norm": 5.940570831298828, + "learning_rate": 7.327061091942897e-05, + "loss": 1.928, + "step": 4866 + }, + { + "epoch": 0.3663599239729765, + "grad_norm": 3.953958034515381, + "learning_rate": 7.325982014268617e-05, + "loss": 2.0391, + "step": 4867 + }, + { + "epoch": 0.3664351982536367, + "grad_norm": 5.253901481628418, + "learning_rate": 7.324902798321379e-05, + "loss": 1.7048, + "step": 4868 + }, + { + "epoch": 0.36651047253429686, + "grad_norm": 5.357714653015137, + "learning_rate": 7.32382344416534e-05, + "loss": 1.8385, + "step": 4869 + }, + { + "epoch": 0.366585746814957, + "grad_norm": 3.6766180992126465, + "learning_rate": 7.322743951864668e-05, + "loss": 2.0573, + "step": 4870 + }, + { + "epoch": 0.36666102109561716, + "grad_norm": 4.265013217926025, + "learning_rate": 7.321664321483531e-05, + "loss": 2.1387, + "step": 4871 + }, + { + "epoch": 0.36673629537627733, + "grad_norm": 5.417975425720215, + "learning_rate": 7.320584553086113e-05, + "loss": 2.232, + "step": 4872 + }, + { + "epoch": 0.36681156965693745, + "grad_norm": 4.751915454864502, + "learning_rate": 7.3195046467366e-05, + "loss": 1.6946, + "step": 4873 + }, + { + "epoch": 0.36688684393759763, + "grad_norm": 4.073288440704346, + "learning_rate": 7.318424602499192e-05, + "loss": 2.2058, + "step": 4874 + }, + { + "epoch": 0.3669621182182578, + "grad_norm": 5.565362453460693, + "learning_rate": 7.317344420438093e-05, + "loss": 2.0365, + "step": 4875 + }, + { + "epoch": 0.3670373924989179, + "grad_norm": 4.771597862243652, + "learning_rate": 7.316264100617518e-05, + "loss": 1.6729, + "step": 4876 + }, + { + "epoch": 0.3671126667795781, + "grad_norm": 6.351498126983643, + "learning_rate": 7.315183643101689e-05, + "loss": 1.9801, + "step": 4877 + }, + { + "epoch": 0.3671879410602382, + "grad_norm": 5.164133071899414, + "learning_rate": 7.314103047954834e-05, + "loss": 1.6907, + "step": 4878 + }, + { + "epoch": 0.3672632153408984, + "grad_norm": 5.954381465911865, + "learning_rate": 7.313022315241195e-05, + "loss": 1.5462, + "step": 4879 + }, + { + "epoch": 0.3673384896215586, + "grad_norm": 5.115471363067627, + "learning_rate": 7.311941445025014e-05, + "loss": 2.0229, + "step": 4880 + }, + { + "epoch": 0.3674137639022187, + "grad_norm": 5.037909984588623, + "learning_rate": 7.310860437370548e-05, + "loss": 1.9457, + "step": 4881 + }, + { + "epoch": 0.36748903818287887, + "grad_norm": 5.122122287750244, + "learning_rate": 7.30977929234206e-05, + "loss": 2.1897, + "step": 4882 + }, + { + "epoch": 0.36756431246353904, + "grad_norm": 4.460752010345459, + "learning_rate": 7.308698010003822e-05, + "loss": 1.6896, + "step": 4883 + }, + { + "epoch": 0.36763958674419916, + "grad_norm": 4.669825077056885, + "learning_rate": 7.30761659042011e-05, + "loss": 1.812, + "step": 4884 + }, + { + "epoch": 0.36771486102485934, + "grad_norm": 6.167023181915283, + "learning_rate": 7.306535033655212e-05, + "loss": 1.5578, + "step": 4885 + }, + { + "epoch": 0.36779013530551946, + "grad_norm": 5.33987283706665, + "learning_rate": 7.305453339773425e-05, + "loss": 2.0473, + "step": 4886 + }, + { + "epoch": 0.36786540958617964, + "grad_norm": 5.457620620727539, + "learning_rate": 7.304371508839053e-05, + "loss": 1.838, + "step": 4887 + }, + { + "epoch": 0.3679406838668398, + "grad_norm": 6.7394890785217285, + "learning_rate": 7.303289540916407e-05, + "loss": 1.8993, + "step": 4888 + }, + { + "epoch": 0.36801595814749993, + "grad_norm": 4.364380359649658, + "learning_rate": 7.302207436069807e-05, + "loss": 1.8335, + "step": 4889 + }, + { + "epoch": 0.3680912324281601, + "grad_norm": 5.04023551940918, + "learning_rate": 7.301125194363579e-05, + "loss": 1.8369, + "step": 4890 + }, + { + "epoch": 0.3681665067088203, + "grad_norm": 5.439718246459961, + "learning_rate": 7.300042815862062e-05, + "loss": 2.0089, + "step": 4891 + }, + { + "epoch": 0.3682417809894804, + "grad_norm": 4.9820170402526855, + "learning_rate": 7.298960300629598e-05, + "loss": 1.934, + "step": 4892 + }, + { + "epoch": 0.3683170552701406, + "grad_norm": 4.578033924102783, + "learning_rate": 7.297877648730542e-05, + "loss": 1.8763, + "step": 4893 + }, + { + "epoch": 0.36839232955080076, + "grad_norm": 7.880160808563232, + "learning_rate": 7.296794860229254e-05, + "loss": 2.0151, + "step": 4894 + }, + { + "epoch": 0.3684676038314609, + "grad_norm": 4.76216459274292, + "learning_rate": 7.295711935190099e-05, + "loss": 2.2402, + "step": 4895 + }, + { + "epoch": 0.36854287811212105, + "grad_norm": 7.618791103363037, + "learning_rate": 7.294628873677458e-05, + "loss": 2.3357, + "step": 4896 + }, + { + "epoch": 0.3686181523927812, + "grad_norm": 4.780711650848389, + "learning_rate": 7.293545675755716e-05, + "loss": 1.8094, + "step": 4897 + }, + { + "epoch": 0.36869342667344135, + "grad_norm": 4.7286906242370605, + "learning_rate": 7.292462341489262e-05, + "loss": 1.7599, + "step": 4898 + }, + { + "epoch": 0.3687687009541015, + "grad_norm": 5.050095558166504, + "learning_rate": 7.291378870942501e-05, + "loss": 1.8317, + "step": 4899 + }, + { + "epoch": 0.36884397523476165, + "grad_norm": 6.837618827819824, + "learning_rate": 7.29029526417984e-05, + "loss": 1.8809, + "step": 4900 + }, + { + "epoch": 0.3689192495154218, + "grad_norm": 6.075804710388184, + "learning_rate": 7.289211521265698e-05, + "loss": 1.7232, + "step": 4901 + }, + { + "epoch": 0.368994523796082, + "grad_norm": 6.544180870056152, + "learning_rate": 7.288127642264497e-05, + "loss": 1.509, + "step": 4902 + }, + { + "epoch": 0.3690697980767421, + "grad_norm": 4.537235736846924, + "learning_rate": 7.287043627240676e-05, + "loss": 2.1423, + "step": 4903 + }, + { + "epoch": 0.3691450723574023, + "grad_norm": 5.466938018798828, + "learning_rate": 7.285959476258673e-05, + "loss": 2.1236, + "step": 4904 + }, + { + "epoch": 0.3692203466380624, + "grad_norm": 5.205320835113525, + "learning_rate": 7.284875189382938e-05, + "loss": 1.8688, + "step": 4905 + }, + { + "epoch": 0.3692956209187226, + "grad_norm": 5.700517654418945, + "learning_rate": 7.283790766677931e-05, + "loss": 2.1438, + "step": 4906 + }, + { + "epoch": 0.36937089519938276, + "grad_norm": 4.471470832824707, + "learning_rate": 7.282706208208113e-05, + "loss": 1.909, + "step": 4907 + }, + { + "epoch": 0.3694461694800429, + "grad_norm": 4.895221710205078, + "learning_rate": 7.281621514037962e-05, + "loss": 1.8427, + "step": 4908 + }, + { + "epoch": 0.36952144376070306, + "grad_norm": 4.930835723876953, + "learning_rate": 7.280536684231957e-05, + "loss": 1.9311, + "step": 4909 + }, + { + "epoch": 0.36959671804136324, + "grad_norm": 5.3232574462890625, + "learning_rate": 7.279451718854592e-05, + "loss": 1.9029, + "step": 4910 + }, + { + "epoch": 0.36967199232202336, + "grad_norm": 6.255612850189209, + "learning_rate": 7.278366617970363e-05, + "loss": 1.8359, + "step": 4911 + }, + { + "epoch": 0.36974726660268353, + "grad_norm": 4.256734371185303, + "learning_rate": 7.277281381643774e-05, + "loss": 1.7317, + "step": 4912 + }, + { + "epoch": 0.3698225408833437, + "grad_norm": 5.098121643066406, + "learning_rate": 7.276196009939342e-05, + "loss": 1.9491, + "step": 4913 + }, + { + "epoch": 0.36989781516400383, + "grad_norm": 4.870434284210205, + "learning_rate": 7.275110502921588e-05, + "loss": 2.4705, + "step": 4914 + }, + { + "epoch": 0.369973089444664, + "grad_norm": 5.888759613037109, + "learning_rate": 7.274024860655044e-05, + "loss": 1.727, + "step": 4915 + }, + { + "epoch": 0.3700483637253241, + "grad_norm": 5.465101718902588, + "learning_rate": 7.272939083204246e-05, + "loss": 1.992, + "step": 4916 + }, + { + "epoch": 0.3701236380059843, + "grad_norm": 5.0672760009765625, + "learning_rate": 7.27185317063374e-05, + "loss": 2.1746, + "step": 4917 + }, + { + "epoch": 0.3701989122866445, + "grad_norm": 5.251686096191406, + "learning_rate": 7.270767123008083e-05, + "loss": 2.0802, + "step": 4918 + }, + { + "epoch": 0.3702741865673046, + "grad_norm": 3.9671287536621094, + "learning_rate": 7.269680940391836e-05, + "loss": 1.6651, + "step": 4919 + }, + { + "epoch": 0.3703494608479648, + "grad_norm": 6.918791770935059, + "learning_rate": 7.268594622849569e-05, + "loss": 2.0369, + "step": 4920 + }, + { + "epoch": 0.37042473512862495, + "grad_norm": 3.7368810176849365, + "learning_rate": 7.267508170445862e-05, + "loss": 1.9294, + "step": 4921 + }, + { + "epoch": 0.37050000940928507, + "grad_norm": 4.848521709442139, + "learning_rate": 7.266421583245299e-05, + "loss": 2.0866, + "step": 4922 + }, + { + "epoch": 0.37057528368994525, + "grad_norm": 4.5620808601379395, + "learning_rate": 7.265334861312479e-05, + "loss": 1.5791, + "step": 4923 + }, + { + "epoch": 0.3706505579706054, + "grad_norm": 4.118834495544434, + "learning_rate": 7.264248004711998e-05, + "loss": 1.7779, + "step": 4924 + }, + { + "epoch": 0.37072583225126554, + "grad_norm": 6.834249973297119, + "learning_rate": 7.263161013508471e-05, + "loss": 2.2724, + "step": 4925 + }, + { + "epoch": 0.3708011065319257, + "grad_norm": 4.401737213134766, + "learning_rate": 7.262073887766517e-05, + "loss": 1.6981, + "step": 4926 + }, + { + "epoch": 0.37087638081258584, + "grad_norm": 6.490218639373779, + "learning_rate": 7.26098662755076e-05, + "loss": 1.9167, + "step": 4927 + }, + { + "epoch": 0.370951655093246, + "grad_norm": 6.030299186706543, + "learning_rate": 7.259899232925835e-05, + "loss": 2.1184, + "step": 4928 + }, + { + "epoch": 0.3710269293739062, + "grad_norm": 5.808018207550049, + "learning_rate": 7.258811703956385e-05, + "loss": 2.0847, + "step": 4929 + }, + { + "epoch": 0.3711022036545663, + "grad_norm": 5.696561336517334, + "learning_rate": 7.257724040707061e-05, + "loss": 2.1196, + "step": 4930 + }, + { + "epoch": 0.3711774779352265, + "grad_norm": 4.17270565032959, + "learning_rate": 7.25663624324252e-05, + "loss": 1.8254, + "step": 4931 + }, + { + "epoch": 0.37125275221588666, + "grad_norm": 3.4029276371002197, + "learning_rate": 7.255548311627432e-05, + "loss": 1.8895, + "step": 4932 + }, + { + "epoch": 0.3713280264965468, + "grad_norm": 4.230874061584473, + "learning_rate": 7.254460245926466e-05, + "loss": 1.7805, + "step": 4933 + }, + { + "epoch": 0.37140330077720696, + "grad_norm": 6.310096263885498, + "learning_rate": 7.253372046204307e-05, + "loss": 1.7656, + "step": 4934 + }, + { + "epoch": 0.3714785750578671, + "grad_norm": 6.5092949867248535, + "learning_rate": 7.252283712525646e-05, + "loss": 2.3295, + "step": 4935 + }, + { + "epoch": 0.37155384933852725, + "grad_norm": 6.874963283538818, + "learning_rate": 7.251195244955183e-05, + "loss": 1.7833, + "step": 4936 + }, + { + "epoch": 0.37162912361918743, + "grad_norm": 4.862421989440918, + "learning_rate": 7.250106643557619e-05, + "loss": 2.0023, + "step": 4937 + }, + { + "epoch": 0.37170439789984755, + "grad_norm": 6.898500442504883, + "learning_rate": 7.249017908397673e-05, + "loss": 1.9153, + "step": 4938 + }, + { + "epoch": 0.3717796721805077, + "grad_norm": 4.9749674797058105, + "learning_rate": 7.247929039540066e-05, + "loss": 1.7077, + "step": 4939 + }, + { + "epoch": 0.3718549464611679, + "grad_norm": 4.958609104156494, + "learning_rate": 7.246840037049527e-05, + "loss": 1.6484, + "step": 4940 + }, + { + "epoch": 0.371930220741828, + "grad_norm": 4.116805553436279, + "learning_rate": 7.245750900990793e-05, + "loss": 2.0669, + "step": 4941 + }, + { + "epoch": 0.3720054950224882, + "grad_norm": 6.013229846954346, + "learning_rate": 7.244661631428614e-05, + "loss": 1.9816, + "step": 4942 + }, + { + "epoch": 0.3720807693031484, + "grad_norm": 4.165593147277832, + "learning_rate": 7.243572228427743e-05, + "loss": 2.0706, + "step": 4943 + }, + { + "epoch": 0.3721560435838085, + "grad_norm": 6.141819953918457, + "learning_rate": 7.242482692052936e-05, + "loss": 2.0099, + "step": 4944 + }, + { + "epoch": 0.37223131786446867, + "grad_norm": 4.315420150756836, + "learning_rate": 7.24139302236897e-05, + "loss": 1.7925, + "step": 4945 + }, + { + "epoch": 0.3723065921451288, + "grad_norm": 5.752084255218506, + "learning_rate": 7.24030321944062e-05, + "loss": 2.2001, + "step": 4946 + }, + { + "epoch": 0.37238186642578897, + "grad_norm": 4.279336929321289, + "learning_rate": 7.239213283332672e-05, + "loss": 1.8707, + "step": 4947 + }, + { + "epoch": 0.37245714070644914, + "grad_norm": 5.792545318603516, + "learning_rate": 7.23812321410992e-05, + "loss": 2.1505, + "step": 4948 + }, + { + "epoch": 0.37253241498710926, + "grad_norm": 4.409698963165283, + "learning_rate": 7.237033011837162e-05, + "loss": 1.385, + "step": 4949 + }, + { + "epoch": 0.37260768926776944, + "grad_norm": 6.312104225158691, + "learning_rate": 7.235942676579213e-05, + "loss": 1.8709, + "step": 4950 + }, + { + "epoch": 0.3726829635484296, + "grad_norm": 5.440578460693359, + "learning_rate": 7.234852208400886e-05, + "loss": 2.0849, + "step": 4951 + }, + { + "epoch": 0.37275823782908973, + "grad_norm": 5.374791145324707, + "learning_rate": 7.233761607367009e-05, + "loss": 2.121, + "step": 4952 + }, + { + "epoch": 0.3728335121097499, + "grad_norm": 4.096590995788574, + "learning_rate": 7.232670873542413e-05, + "loss": 1.8233, + "step": 4953 + }, + { + "epoch": 0.37290878639041003, + "grad_norm": 3.9346656799316406, + "learning_rate": 7.23158000699194e-05, + "loss": 1.8038, + "step": 4954 + }, + { + "epoch": 0.3729840606710702, + "grad_norm": 3.6787900924682617, + "learning_rate": 7.230489007780439e-05, + "loss": 1.615, + "step": 4955 + }, + { + "epoch": 0.3730593349517304, + "grad_norm": 3.48766827583313, + "learning_rate": 7.229397875972765e-05, + "loss": 1.6827, + "step": 4956 + }, + { + "epoch": 0.3731346092323905, + "grad_norm": 5.395848751068115, + "learning_rate": 7.228306611633787e-05, + "loss": 2.2841, + "step": 4957 + }, + { + "epoch": 0.3732098835130507, + "grad_norm": 4.3597211837768555, + "learning_rate": 7.22721521482837e-05, + "loss": 1.7393, + "step": 4958 + }, + { + "epoch": 0.37328515779371085, + "grad_norm": 4.139216423034668, + "learning_rate": 7.226123685621405e-05, + "loss": 2.0095, + "step": 4959 + }, + { + "epoch": 0.373360432074371, + "grad_norm": 4.122201442718506, + "learning_rate": 7.22503202407777e-05, + "loss": 2.1399, + "step": 4960 + }, + { + "epoch": 0.37343570635503115, + "grad_norm": 3.9273369312286377, + "learning_rate": 7.223940230262367e-05, + "loss": 1.9742, + "step": 4961 + }, + { + "epoch": 0.3735109806356913, + "grad_norm": 4.356639862060547, + "learning_rate": 7.2228483042401e-05, + "loss": 1.8394, + "step": 4962 + }, + { + "epoch": 0.37358625491635145, + "grad_norm": 3.919447898864746, + "learning_rate": 7.221756246075877e-05, + "loss": 1.7308, + "step": 4963 + }, + { + "epoch": 0.3736615291970116, + "grad_norm": 4.524787425994873, + "learning_rate": 7.22066405583462e-05, + "loss": 2.3422, + "step": 4964 + }, + { + "epoch": 0.37373680347767174, + "grad_norm": 7.1857218742370605, + "learning_rate": 7.219571733581257e-05, + "loss": 1.8943, + "step": 4965 + }, + { + "epoch": 0.3738120777583319, + "grad_norm": 6.601821422576904, + "learning_rate": 7.218479279380725e-05, + "loss": 1.8363, + "step": 4966 + }, + { + "epoch": 0.3738873520389921, + "grad_norm": 5.930509090423584, + "learning_rate": 7.217386693297963e-05, + "loss": 1.9624, + "step": 4967 + }, + { + "epoch": 0.3739626263196522, + "grad_norm": 6.823774337768555, + "learning_rate": 7.216293975397927e-05, + "loss": 2.5018, + "step": 4968 + }, + { + "epoch": 0.3740379006003124, + "grad_norm": 4.764377117156982, + "learning_rate": 7.21520112574557e-05, + "loss": 1.8609, + "step": 4969 + }, + { + "epoch": 0.37411317488097257, + "grad_norm": 5.6163530349731445, + "learning_rate": 7.214108144405865e-05, + "loss": 1.9354, + "step": 4970 + }, + { + "epoch": 0.3741884491616327, + "grad_norm": 7.429581642150879, + "learning_rate": 7.213015031443783e-05, + "loss": 1.9482, + "step": 4971 + }, + { + "epoch": 0.37426372344229286, + "grad_norm": 4.5262250900268555, + "learning_rate": 7.211921786924308e-05, + "loss": 2.0705, + "step": 4972 + }, + { + "epoch": 0.37433899772295304, + "grad_norm": 5.872339725494385, + "learning_rate": 7.210828410912428e-05, + "loss": 1.9288, + "step": 4973 + }, + { + "epoch": 0.37441427200361316, + "grad_norm": 4.778729438781738, + "learning_rate": 7.209734903473143e-05, + "loss": 1.8757, + "step": 4974 + }, + { + "epoch": 0.37448954628427333, + "grad_norm": 5.326509952545166, + "learning_rate": 7.208641264671458e-05, + "loss": 1.8579, + "step": 4975 + }, + { + "epoch": 0.37456482056493345, + "grad_norm": 6.47275447845459, + "learning_rate": 7.207547494572388e-05, + "loss": 1.9314, + "step": 4976 + }, + { + "epoch": 0.37464009484559363, + "grad_norm": 4.064967155456543, + "learning_rate": 7.206453593240954e-05, + "loss": 1.5717, + "step": 4977 + }, + { + "epoch": 0.3747153691262538, + "grad_norm": 5.22079610824585, + "learning_rate": 7.205359560742185e-05, + "loss": 2.0248, + "step": 4978 + }, + { + "epoch": 0.3747906434069139, + "grad_norm": 5.952698707580566, + "learning_rate": 7.204265397141116e-05, + "loss": 2.0085, + "step": 4979 + }, + { + "epoch": 0.3748659176875741, + "grad_norm": 5.409614086151123, + "learning_rate": 7.203171102502795e-05, + "loss": 1.9192, + "step": 4980 + }, + { + "epoch": 0.3749411919682343, + "grad_norm": 4.755834102630615, + "learning_rate": 7.202076676892273e-05, + "loss": 2.1745, + "step": 4981 + }, + { + "epoch": 0.3750164662488944, + "grad_norm": 4.638421535491943, + "learning_rate": 7.200982120374613e-05, + "loss": 2.0651, + "step": 4982 + }, + { + "epoch": 0.3750917405295546, + "grad_norm": 7.0502729415893555, + "learning_rate": 7.199887433014879e-05, + "loss": 1.8495, + "step": 4983 + }, + { + "epoch": 0.3751670148102147, + "grad_norm": 5.381472110748291, + "learning_rate": 7.19879261487815e-05, + "loss": 1.6865, + "step": 4984 + }, + { + "epoch": 0.37524228909087487, + "grad_norm": 5.210650444030762, + "learning_rate": 7.197697666029511e-05, + "loss": 1.9349, + "step": 4985 + }, + { + "epoch": 0.37531756337153505, + "grad_norm": 4.95017671585083, + "learning_rate": 7.196602586534049e-05, + "loss": 1.7596, + "step": 4986 + }, + { + "epoch": 0.37539283765219517, + "grad_norm": 16.76958465576172, + "learning_rate": 7.195507376456867e-05, + "loss": 1.9437, + "step": 4987 + }, + { + "epoch": 0.37546811193285534, + "grad_norm": 4.42504358291626, + "learning_rate": 7.194412035863072e-05, + "loss": 1.856, + "step": 4988 + }, + { + "epoch": 0.3755433862135155, + "grad_norm": 5.119406700134277, + "learning_rate": 7.193316564817776e-05, + "loss": 1.6027, + "step": 4989 + }, + { + "epoch": 0.37561866049417564, + "grad_norm": 4.139310836791992, + "learning_rate": 7.192220963386104e-05, + "loss": 1.7857, + "step": 4990 + }, + { + "epoch": 0.3756939347748358, + "grad_norm": 5.9897685050964355, + "learning_rate": 7.191125231633187e-05, + "loss": 1.922, + "step": 4991 + }, + { + "epoch": 0.375769209055496, + "grad_norm": 4.8305583000183105, + "learning_rate": 7.190029369624162e-05, + "loss": 1.766, + "step": 4992 + }, + { + "epoch": 0.3758444833361561, + "grad_norm": 3.980228900909424, + "learning_rate": 7.188933377424174e-05, + "loss": 1.6575, + "step": 4993 + }, + { + "epoch": 0.3759197576168163, + "grad_norm": 5.060513019561768, + "learning_rate": 7.187837255098379e-05, + "loss": 2.1242, + "step": 4994 + }, + { + "epoch": 0.3759950318974764, + "grad_norm": 4.601027488708496, + "learning_rate": 7.186741002711935e-05, + "loss": 1.8904, + "step": 4995 + }, + { + "epoch": 0.3760703061781366, + "grad_norm": 7.523019790649414, + "learning_rate": 7.185644620330014e-05, + "loss": 1.9301, + "step": 4996 + }, + { + "epoch": 0.37614558045879676, + "grad_norm": 5.8168134689331055, + "learning_rate": 7.18454810801779e-05, + "loss": 2.3101, + "step": 4997 + }, + { + "epoch": 0.3762208547394569, + "grad_norm": 4.306633949279785, + "learning_rate": 7.18345146584045e-05, + "loss": 2.1864, + "step": 4998 + }, + { + "epoch": 0.37629612902011705, + "grad_norm": 5.9881792068481445, + "learning_rate": 7.182354693863186e-05, + "loss": 2.0395, + "step": 4999 + }, + { + "epoch": 0.37637140330077723, + "grad_norm": 5.064429759979248, + "learning_rate": 7.181257792151197e-05, + "loss": 1.8886, + "step": 5000 + }, + { + "epoch": 0.37644667758143735, + "grad_norm": 6.932483196258545, + "learning_rate": 7.180160760769692e-05, + "loss": 1.7842, + "step": 5001 + }, + { + "epoch": 0.3765219518620975, + "grad_norm": 6.999981880187988, + "learning_rate": 7.179063599783884e-05, + "loss": 1.9502, + "step": 5002 + }, + { + "epoch": 0.37659722614275765, + "grad_norm": 6.2435832023620605, + "learning_rate": 7.177966309258998e-05, + "loss": 1.9877, + "step": 5003 + }, + { + "epoch": 0.3766725004234178, + "grad_norm": 4.435873031616211, + "learning_rate": 7.176868889260264e-05, + "loss": 1.9985, + "step": 5004 + }, + { + "epoch": 0.376747774704078, + "grad_norm": 4.1692023277282715, + "learning_rate": 7.17577133985292e-05, + "loss": 1.5973, + "step": 5005 + }, + { + "epoch": 0.3768230489847381, + "grad_norm": 3.5906853675842285, + "learning_rate": 7.174673661102214e-05, + "loss": 1.6607, + "step": 5006 + }, + { + "epoch": 0.3768983232653983, + "grad_norm": 4.7808837890625, + "learning_rate": 7.173575853073398e-05, + "loss": 1.6456, + "step": 5007 + }, + { + "epoch": 0.37697359754605847, + "grad_norm": 4.536974906921387, + "learning_rate": 7.172477915831734e-05, + "loss": 1.6632, + "step": 5008 + }, + { + "epoch": 0.3770488718267186, + "grad_norm": 6.354125022888184, + "learning_rate": 7.171379849442493e-05, + "loss": 1.7917, + "step": 5009 + }, + { + "epoch": 0.37712414610737877, + "grad_norm": 4.592216968536377, + "learning_rate": 7.170281653970949e-05, + "loss": 1.7737, + "step": 5010 + }, + { + "epoch": 0.37719942038803894, + "grad_norm": 5.580130577087402, + "learning_rate": 7.169183329482388e-05, + "loss": 1.8037, + "step": 5011 + }, + { + "epoch": 0.37727469466869906, + "grad_norm": 4.263558387756348, + "learning_rate": 7.168084876042103e-05, + "loss": 1.9776, + "step": 5012 + }, + { + "epoch": 0.37734996894935924, + "grad_norm": 4.869098663330078, + "learning_rate": 7.166986293715391e-05, + "loss": 2.1364, + "step": 5013 + }, + { + "epoch": 0.37742524323001936, + "grad_norm": 4.074914932250977, + "learning_rate": 7.165887582567563e-05, + "loss": 1.8067, + "step": 5014 + }, + { + "epoch": 0.37750051751067953, + "grad_norm": 3.929335594177246, + "learning_rate": 7.164788742663932e-05, + "loss": 1.779, + "step": 5015 + }, + { + "epoch": 0.3775757917913397, + "grad_norm": 4.660194396972656, + "learning_rate": 7.163689774069823e-05, + "loss": 1.859, + "step": 5016 + }, + { + "epoch": 0.37765106607199983, + "grad_norm": 4.2294020652771, + "learning_rate": 7.162590676850565e-05, + "loss": 1.6908, + "step": 5017 + }, + { + "epoch": 0.37772634035266, + "grad_norm": 5.060115814208984, + "learning_rate": 7.161491451071495e-05, + "loss": 1.8269, + "step": 5018 + }, + { + "epoch": 0.3778016146333202, + "grad_norm": 6.044243335723877, + "learning_rate": 7.160392096797963e-05, + "loss": 2.108, + "step": 5019 + }, + { + "epoch": 0.3778768889139803, + "grad_norm": 5.04624605178833, + "learning_rate": 7.159292614095318e-05, + "loss": 2.0017, + "step": 5020 + }, + { + "epoch": 0.3779521631946405, + "grad_norm": 4.736776351928711, + "learning_rate": 7.158193003028922e-05, + "loss": 1.8517, + "step": 5021 + }, + { + "epoch": 0.37802743747530065, + "grad_norm": 5.566265106201172, + "learning_rate": 7.157093263664147e-05, + "loss": 1.8342, + "step": 5022 + }, + { + "epoch": 0.3781027117559608, + "grad_norm": 4.249990940093994, + "learning_rate": 7.155993396066365e-05, + "loss": 1.9247, + "step": 5023 + }, + { + "epoch": 0.37817798603662095, + "grad_norm": 5.810561656951904, + "learning_rate": 7.154893400300962e-05, + "loss": 2.6265, + "step": 5024 + }, + { + "epoch": 0.37825326031728107, + "grad_norm": 4.715921878814697, + "learning_rate": 7.153793276433329e-05, + "loss": 1.9445, + "step": 5025 + }, + { + "epoch": 0.37832853459794125, + "grad_norm": 5.0739850997924805, + "learning_rate": 7.152693024528867e-05, + "loss": 1.8686, + "step": 5026 + }, + { + "epoch": 0.3784038088786014, + "grad_norm": 4.152922630310059, + "learning_rate": 7.15159264465298e-05, + "loss": 2.2341, + "step": 5027 + }, + { + "epoch": 0.37847908315926154, + "grad_norm": 5.276555061340332, + "learning_rate": 7.150492136871086e-05, + "loss": 1.955, + "step": 5028 + }, + { + "epoch": 0.3785543574399217, + "grad_norm": 3.6907103061676025, + "learning_rate": 7.149391501248602e-05, + "loss": 1.7365, + "step": 5029 + }, + { + "epoch": 0.3786296317205819, + "grad_norm": 4.784834384918213, + "learning_rate": 7.148290737850963e-05, + "loss": 1.7827, + "step": 5030 + }, + { + "epoch": 0.378704906001242, + "grad_norm": 5.167219638824463, + "learning_rate": 7.147189846743601e-05, + "loss": 1.5483, + "step": 5031 + }, + { + "epoch": 0.3787801802819022, + "grad_norm": 4.793018817901611, + "learning_rate": 7.146088827991966e-05, + "loss": 1.8407, + "step": 5032 + }, + { + "epoch": 0.3788554545625623, + "grad_norm": 5.662782192230225, + "learning_rate": 7.144987681661508e-05, + "loss": 1.8257, + "step": 5033 + }, + { + "epoch": 0.3789307288432225, + "grad_norm": 4.163815498352051, + "learning_rate": 7.143886407817686e-05, + "loss": 1.8658, + "step": 5034 + }, + { + "epoch": 0.37900600312388266, + "grad_norm": 8.020354270935059, + "learning_rate": 7.14278500652597e-05, + "loss": 1.6154, + "step": 5035 + }, + { + "epoch": 0.3790812774045428, + "grad_norm": 3.7433700561523438, + "learning_rate": 7.141683477851832e-05, + "loss": 2.1138, + "step": 5036 + }, + { + "epoch": 0.37915655168520296, + "grad_norm": 4.968786716461182, + "learning_rate": 7.140581821860757e-05, + "loss": 1.9247, + "step": 5037 + }, + { + "epoch": 0.37923182596586313, + "grad_norm": 4.661604404449463, + "learning_rate": 7.139480038618235e-05, + "loss": 1.7003, + "step": 5038 + }, + { + "epoch": 0.37930710024652325, + "grad_norm": 4.374044895172119, + "learning_rate": 7.138378128189763e-05, + "loss": 1.7717, + "step": 5039 + }, + { + "epoch": 0.37938237452718343, + "grad_norm": 4.064972400665283, + "learning_rate": 7.137276090640847e-05, + "loss": 1.6095, + "step": 5040 + }, + { + "epoch": 0.3794576488078436, + "grad_norm": 4.4079790115356445, + "learning_rate": 7.136173926037e-05, + "loss": 1.9175, + "step": 5041 + }, + { + "epoch": 0.3795329230885037, + "grad_norm": 7.431078910827637, + "learning_rate": 7.135071634443744e-05, + "loss": 1.7565, + "step": 5042 + }, + { + "epoch": 0.3796081973691639, + "grad_norm": 3.742321014404297, + "learning_rate": 7.133969215926604e-05, + "loss": 1.7643, + "step": 5043 + }, + { + "epoch": 0.379683471649824, + "grad_norm": 4.022798538208008, + "learning_rate": 7.132866670551117e-05, + "loss": 1.6815, + "step": 5044 + }, + { + "epoch": 0.3797587459304842, + "grad_norm": 4.9506635665893555, + "learning_rate": 7.131763998382828e-05, + "loss": 1.7932, + "step": 5045 + }, + { + "epoch": 0.3798340202111444, + "grad_norm": 5.923523426055908, + "learning_rate": 7.130661199487283e-05, + "loss": 1.6976, + "step": 5046 + }, + { + "epoch": 0.3799092944918045, + "grad_norm": 5.72121524810791, + "learning_rate": 7.129558273930043e-05, + "loss": 1.9434, + "step": 5047 + }, + { + "epoch": 0.37998456877246467, + "grad_norm": 3.9136242866516113, + "learning_rate": 7.128455221776677e-05, + "loss": 1.889, + "step": 5048 + }, + { + "epoch": 0.38005984305312485, + "grad_norm": 5.072514533996582, + "learning_rate": 7.127352043092755e-05, + "loss": 2.345, + "step": 5049 + }, + { + "epoch": 0.38013511733378497, + "grad_norm": 4.3012800216674805, + "learning_rate": 7.126248737943858e-05, + "loss": 1.8953, + "step": 5050 + }, + { + "epoch": 0.38021039161444514, + "grad_norm": 4.170708656311035, + "learning_rate": 7.125145306395574e-05, + "loss": 1.7583, + "step": 5051 + }, + { + "epoch": 0.38028566589510526, + "grad_norm": 4.584028244018555, + "learning_rate": 7.124041748513498e-05, + "loss": 1.9824, + "step": 5052 + }, + { + "epoch": 0.38036094017576544, + "grad_norm": 5.080622673034668, + "learning_rate": 7.122938064363236e-05, + "loss": 2.1239, + "step": 5053 + }, + { + "epoch": 0.3804362144564256, + "grad_norm": 4.556831359863281, + "learning_rate": 7.121834254010398e-05, + "loss": 1.6034, + "step": 5054 + }, + { + "epoch": 0.38051148873708573, + "grad_norm": 5.543886661529541, + "learning_rate": 7.120730317520601e-05, + "loss": 1.7441, + "step": 5055 + }, + { + "epoch": 0.3805867630177459, + "grad_norm": 5.811787128448486, + "learning_rate": 7.119626254959472e-05, + "loss": 1.8901, + "step": 5056 + }, + { + "epoch": 0.3806620372984061, + "grad_norm": 5.274604797363281, + "learning_rate": 7.118522066392644e-05, + "loss": 1.7342, + "step": 5057 + }, + { + "epoch": 0.3807373115790662, + "grad_norm": 4.057298183441162, + "learning_rate": 7.117417751885756e-05, + "loss": 2.0168, + "step": 5058 + }, + { + "epoch": 0.3808125858597264, + "grad_norm": 5.519357204437256, + "learning_rate": 7.11631331150446e-05, + "loss": 1.7751, + "step": 5059 + }, + { + "epoch": 0.38088786014038656, + "grad_norm": 3.3534770011901855, + "learning_rate": 7.11520874531441e-05, + "loss": 1.723, + "step": 5060 + }, + { + "epoch": 0.3809631344210467, + "grad_norm": 4.441788673400879, + "learning_rate": 7.114104053381269e-05, + "loss": 2.0983, + "step": 5061 + }, + { + "epoch": 0.38103840870170685, + "grad_norm": 3.7337827682495117, + "learning_rate": 7.112999235770708e-05, + "loss": 1.7813, + "step": 5062 + }, + { + "epoch": 0.381113682982367, + "grad_norm": 4.0228590965271, + "learning_rate": 7.111894292548405e-05, + "loss": 2.2271, + "step": 5063 + }, + { + "epoch": 0.38118895726302715, + "grad_norm": 5.539562225341797, + "learning_rate": 7.110789223780046e-05, + "loss": 2.1016, + "step": 5064 + }, + { + "epoch": 0.3812642315436873, + "grad_norm": 4.479678153991699, + "learning_rate": 7.109684029531326e-05, + "loss": 2.0075, + "step": 5065 + }, + { + "epoch": 0.38133950582434745, + "grad_norm": 6.788171768188477, + "learning_rate": 7.108578709867941e-05, + "loss": 1.7747, + "step": 5066 + }, + { + "epoch": 0.3814147801050076, + "grad_norm": 4.8256683349609375, + "learning_rate": 7.107473264855604e-05, + "loss": 1.8965, + "step": 5067 + }, + { + "epoch": 0.3814900543856678, + "grad_norm": 4.272019863128662, + "learning_rate": 7.106367694560028e-05, + "loss": 1.7369, + "step": 5068 + }, + { + "epoch": 0.3815653286663279, + "grad_norm": 6.109707355499268, + "learning_rate": 7.105261999046935e-05, + "loss": 1.8078, + "step": 5069 + }, + { + "epoch": 0.3816406029469881, + "grad_norm": 5.393454074859619, + "learning_rate": 7.10415617838206e-05, + "loss": 2.2093, + "step": 5070 + }, + { + "epoch": 0.38171587722764827, + "grad_norm": 4.1811723709106445, + "learning_rate": 7.103050232631134e-05, + "loss": 1.8611, + "step": 5071 + }, + { + "epoch": 0.3817911515083084, + "grad_norm": 6.992862701416016, + "learning_rate": 7.101944161859908e-05, + "loss": 1.7644, + "step": 5072 + }, + { + "epoch": 0.38186642578896857, + "grad_norm": 5.09332275390625, + "learning_rate": 7.100837966134133e-05, + "loss": 2.2209, + "step": 5073 + }, + { + "epoch": 0.3819417000696287, + "grad_norm": 4.536525726318359, + "learning_rate": 7.099731645519568e-05, + "loss": 1.7918, + "step": 5074 + }, + { + "epoch": 0.38201697435028886, + "grad_norm": 4.569726467132568, + "learning_rate": 7.098625200081982e-05, + "loss": 1.9886, + "step": 5075 + }, + { + "epoch": 0.38209224863094904, + "grad_norm": 7.287348747253418, + "learning_rate": 7.09751862988715e-05, + "loss": 2.1947, + "step": 5076 + }, + { + "epoch": 0.38216752291160916, + "grad_norm": 3.569044828414917, + "learning_rate": 7.096411935000853e-05, + "loss": 1.9898, + "step": 5077 + }, + { + "epoch": 0.38224279719226933, + "grad_norm": 7.5191802978515625, + "learning_rate": 7.09530511548888e-05, + "loss": 1.8534, + "step": 5078 + }, + { + "epoch": 0.3823180714729295, + "grad_norm": 4.257462978363037, + "learning_rate": 7.094198171417032e-05, + "loss": 1.8425, + "step": 5079 + }, + { + "epoch": 0.38239334575358963, + "grad_norm": 4.106302738189697, + "learning_rate": 7.093091102851112e-05, + "loss": 1.7538, + "step": 5080 + }, + { + "epoch": 0.3824686200342498, + "grad_norm": 4.261412620544434, + "learning_rate": 7.091983909856932e-05, + "loss": 2.1242, + "step": 5081 + }, + { + "epoch": 0.3825438943149099, + "grad_norm": 5.156669616699219, + "learning_rate": 7.09087659250031e-05, + "loss": 1.7003, + "step": 5082 + }, + { + "epoch": 0.3826191685955701, + "grad_norm": 5.811466693878174, + "learning_rate": 7.089769150847075e-05, + "loss": 2.1394, + "step": 5083 + }, + { + "epoch": 0.3826944428762303, + "grad_norm": 4.325088024139404, + "learning_rate": 7.08866158496306e-05, + "loss": 2.2933, + "step": 5084 + }, + { + "epoch": 0.3827697171568904, + "grad_norm": 5.000208854675293, + "learning_rate": 7.087553894914107e-05, + "loss": 2.0184, + "step": 5085 + }, + { + "epoch": 0.3828449914375506, + "grad_norm": 6.376129627227783, + "learning_rate": 7.086446080766063e-05, + "loss": 2.006, + "step": 5086 + }, + { + "epoch": 0.38292026571821075, + "grad_norm": 5.024622440338135, + "learning_rate": 7.085338142584788e-05, + "loss": 1.9236, + "step": 5087 + }, + { + "epoch": 0.38299553999887087, + "grad_norm": 8.324176788330078, + "learning_rate": 7.084230080436142e-05, + "loss": 2.1513, + "step": 5088 + }, + { + "epoch": 0.38307081427953105, + "grad_norm": 4.331027984619141, + "learning_rate": 7.083121894386e-05, + "loss": 1.9496, + "step": 5089 + }, + { + "epoch": 0.3831460885601912, + "grad_norm": 5.0929059982299805, + "learning_rate": 7.082013584500238e-05, + "loss": 1.557, + "step": 5090 + }, + { + "epoch": 0.38322136284085134, + "grad_norm": 4.041097164154053, + "learning_rate": 7.080905150844742e-05, + "loss": 1.6634, + "step": 5091 + }, + { + "epoch": 0.3832966371215115, + "grad_norm": 5.238592147827148, + "learning_rate": 7.079796593485405e-05, + "loss": 1.7168, + "step": 5092 + }, + { + "epoch": 0.38337191140217164, + "grad_norm": 5.367953300476074, + "learning_rate": 7.07868791248813e-05, + "loss": 1.9948, + "step": 5093 + }, + { + "epoch": 0.3834471856828318, + "grad_norm": 5.463464736938477, + "learning_rate": 7.077579107918821e-05, + "loss": 1.7237, + "step": 5094 + }, + { + "epoch": 0.383522459963492, + "grad_norm": 5.756312370300293, + "learning_rate": 7.076470179843396e-05, + "loss": 2.0185, + "step": 5095 + }, + { + "epoch": 0.3835977342441521, + "grad_norm": 5.99233341217041, + "learning_rate": 7.075361128327777e-05, + "loss": 2.0536, + "step": 5096 + }, + { + "epoch": 0.3836730085248123, + "grad_norm": 5.1531453132629395, + "learning_rate": 7.074251953437893e-05, + "loss": 2.0676, + "step": 5097 + }, + { + "epoch": 0.38374828280547246, + "grad_norm": 8.285211563110352, + "learning_rate": 7.073142655239684e-05, + "loss": 2.2254, + "step": 5098 + }, + { + "epoch": 0.3838235570861326, + "grad_norm": 6.0429511070251465, + "learning_rate": 7.072033233799091e-05, + "loss": 1.9226, + "step": 5099 + }, + { + "epoch": 0.38389883136679276, + "grad_norm": 5.888047218322754, + "learning_rate": 7.07092368918207e-05, + "loss": 1.8763, + "step": 5100 + }, + { + "epoch": 0.3839741056474529, + "grad_norm": 4.138872146606445, + "learning_rate": 7.069814021454576e-05, + "loss": 1.996, + "step": 5101 + }, + { + "epoch": 0.38404937992811305, + "grad_norm": 3.933856248855591, + "learning_rate": 7.068704230682576e-05, + "loss": 1.7675, + "step": 5102 + }, + { + "epoch": 0.38412465420877323, + "grad_norm": 4.470552444458008, + "learning_rate": 7.067594316932047e-05, + "loss": 1.9382, + "step": 5103 + }, + { + "epoch": 0.38419992848943335, + "grad_norm": 5.973843574523926, + "learning_rate": 7.066484280268968e-05, + "loss": 1.8682, + "step": 5104 + }, + { + "epoch": 0.3842752027700935, + "grad_norm": 6.167820453643799, + "learning_rate": 7.06537412075933e-05, + "loss": 1.9072, + "step": 5105 + }, + { + "epoch": 0.3843504770507537, + "grad_norm": 4.238295555114746, + "learning_rate": 7.064263838469124e-05, + "loss": 2.1036, + "step": 5106 + }, + { + "epoch": 0.3844257513314138, + "grad_norm": 4.321152687072754, + "learning_rate": 7.063153433464356e-05, + "loss": 1.6087, + "step": 5107 + }, + { + "epoch": 0.384501025612074, + "grad_norm": 6.048811912536621, + "learning_rate": 7.062042905811037e-05, + "loss": 1.8642, + "step": 5108 + }, + { + "epoch": 0.3845762998927342, + "grad_norm": 17.910160064697266, + "learning_rate": 7.060932255575183e-05, + "loss": 2.1403, + "step": 5109 + }, + { + "epoch": 0.3846515741733943, + "grad_norm": 4.2710795402526855, + "learning_rate": 7.059821482822821e-05, + "loss": 1.9381, + "step": 5110 + }, + { + "epoch": 0.38472684845405447, + "grad_norm": 4.475728988647461, + "learning_rate": 7.058710587619982e-05, + "loss": 1.7287, + "step": 5111 + }, + { + "epoch": 0.3848021227347146, + "grad_norm": 5.814591884613037, + "learning_rate": 7.057599570032706e-05, + "loss": 2.1481, + "step": 5112 + }, + { + "epoch": 0.38487739701537477, + "grad_norm": 3.5970354080200195, + "learning_rate": 7.056488430127039e-05, + "loss": 1.8507, + "step": 5113 + }, + { + "epoch": 0.38495267129603494, + "grad_norm": 3.6856136322021484, + "learning_rate": 7.055377167969035e-05, + "loss": 1.6381, + "step": 5114 + }, + { + "epoch": 0.38502794557669506, + "grad_norm": 4.186501502990723, + "learning_rate": 7.054265783624756e-05, + "loss": 1.9, + "step": 5115 + }, + { + "epoch": 0.38510321985735524, + "grad_norm": 5.797103404998779, + "learning_rate": 7.053154277160272e-05, + "loss": 2.2511, + "step": 5116 + }, + { + "epoch": 0.3851784941380154, + "grad_norm": 7.860724925994873, + "learning_rate": 7.052042648641656e-05, + "loss": 1.8557, + "step": 5117 + }, + { + "epoch": 0.38525376841867554, + "grad_norm": 6.823034286499023, + "learning_rate": 7.050930898134993e-05, + "loss": 2.2742, + "step": 5118 + }, + { + "epoch": 0.3853290426993357, + "grad_norm": 7.527514457702637, + "learning_rate": 7.049819025706373e-05, + "loss": 1.6855, + "step": 5119 + }, + { + "epoch": 0.3854043169799959, + "grad_norm": 4.418719291687012, + "learning_rate": 7.048707031421893e-05, + "loss": 2.3863, + "step": 5120 + }, + { + "epoch": 0.385479591260656, + "grad_norm": 4.135549068450928, + "learning_rate": 7.047594915347658e-05, + "loss": 1.6582, + "step": 5121 + }, + { + "epoch": 0.3855548655413162, + "grad_norm": 5.610385894775391, + "learning_rate": 7.046482677549782e-05, + "loss": 1.622, + "step": 5122 + }, + { + "epoch": 0.3856301398219763, + "grad_norm": 6.739068031311035, + "learning_rate": 7.045370318094382e-05, + "loss": 2.0758, + "step": 5123 + }, + { + "epoch": 0.3857054141026365, + "grad_norm": 4.734947681427002, + "learning_rate": 7.044257837047585e-05, + "loss": 1.9354, + "step": 5124 + }, + { + "epoch": 0.38578068838329665, + "grad_norm": 3.7672181129455566, + "learning_rate": 7.043145234475526e-05, + "loss": 1.6573, + "step": 5125 + }, + { + "epoch": 0.3858559626639568, + "grad_norm": 4.083711624145508, + "learning_rate": 7.042032510444343e-05, + "loss": 2.179, + "step": 5126 + }, + { + "epoch": 0.38593123694461695, + "grad_norm": 6.062718391418457, + "learning_rate": 7.04091966502019e-05, + "loss": 2.0957, + "step": 5127 + }, + { + "epoch": 0.3860065112252771, + "grad_norm": 5.726144313812256, + "learning_rate": 7.039806698269216e-05, + "loss": 1.7507, + "step": 5128 + }, + { + "epoch": 0.38608178550593725, + "grad_norm": 5.6691060066223145, + "learning_rate": 7.038693610257588e-05, + "loss": 1.7787, + "step": 5129 + }, + { + "epoch": 0.3861570597865974, + "grad_norm": 3.711555004119873, + "learning_rate": 7.037580401051474e-05, + "loss": 2.3203, + "step": 5130 + }, + { + "epoch": 0.38623233406725754, + "grad_norm": 3.887923240661621, + "learning_rate": 7.036467070717053e-05, + "loss": 1.8974, + "step": 5131 + }, + { + "epoch": 0.3863076083479177, + "grad_norm": 5.334774494171143, + "learning_rate": 7.035353619320507e-05, + "loss": 2.0965, + "step": 5132 + }, + { + "epoch": 0.3863828826285779, + "grad_norm": 6.633877754211426, + "learning_rate": 7.03424004692803e-05, + "loss": 2.0022, + "step": 5133 + }, + { + "epoch": 0.386458156909238, + "grad_norm": 5.614700794219971, + "learning_rate": 7.033126353605817e-05, + "loss": 1.8089, + "step": 5134 + }, + { + "epoch": 0.3865334311898982, + "grad_norm": 4.816235065460205, + "learning_rate": 7.032012539420076e-05, + "loss": 2.0747, + "step": 5135 + }, + { + "epoch": 0.38660870547055837, + "grad_norm": 4.977809906005859, + "learning_rate": 7.03089860443702e-05, + "loss": 2.0232, + "step": 5136 + }, + { + "epoch": 0.3866839797512185, + "grad_norm": 4.144922733306885, + "learning_rate": 7.029784548722871e-05, + "loss": 2.4187, + "step": 5137 + }, + { + "epoch": 0.38675925403187866, + "grad_norm": 5.243071556091309, + "learning_rate": 7.028670372343853e-05, + "loss": 2.0453, + "step": 5138 + }, + { + "epoch": 0.38683452831253884, + "grad_norm": 4.381501197814941, + "learning_rate": 7.027556075366202e-05, + "loss": 1.934, + "step": 5139 + }, + { + "epoch": 0.38690980259319896, + "grad_norm": 5.496699333190918, + "learning_rate": 7.026441657856162e-05, + "loss": 1.9844, + "step": 5140 + }, + { + "epoch": 0.38698507687385914, + "grad_norm": 3.5171704292297363, + "learning_rate": 7.025327119879979e-05, + "loss": 1.8437, + "step": 5141 + }, + { + "epoch": 0.38706035115451926, + "grad_norm": 5.689639568328857, + "learning_rate": 7.02421246150391e-05, + "loss": 1.8649, + "step": 5142 + }, + { + "epoch": 0.38713562543517943, + "grad_norm": 3.4029061794281006, + "learning_rate": 7.023097682794217e-05, + "loss": 1.6097, + "step": 5143 + }, + { + "epoch": 0.3872108997158396, + "grad_norm": 5.994120121002197, + "learning_rate": 7.021982783817172e-05, + "loss": 1.9895, + "step": 5144 + }, + { + "epoch": 0.3872861739964997, + "grad_norm": 4.245425701141357, + "learning_rate": 7.020867764639054e-05, + "loss": 1.7888, + "step": 5145 + }, + { + "epoch": 0.3873614482771599, + "grad_norm": 4.548585891723633, + "learning_rate": 7.019752625326145e-05, + "loss": 2.1959, + "step": 5146 + }, + { + "epoch": 0.3874367225578201, + "grad_norm": 4.81891393661499, + "learning_rate": 7.018637365944737e-05, + "loss": 2.2107, + "step": 5147 + }, + { + "epoch": 0.3875119968384802, + "grad_norm": 4.358880519866943, + "learning_rate": 7.017521986561131e-05, + "loss": 1.8422, + "step": 5148 + }, + { + "epoch": 0.3875872711191404, + "grad_norm": 6.989319801330566, + "learning_rate": 7.016406487241632e-05, + "loss": 1.6412, + "step": 5149 + }, + { + "epoch": 0.3876625453998005, + "grad_norm": 4.323236465454102, + "learning_rate": 7.01529086805255e-05, + "loss": 1.8184, + "step": 5150 + }, + { + "epoch": 0.38773781968046067, + "grad_norm": 4.896292686462402, + "learning_rate": 7.014175129060212e-05, + "loss": 1.6843, + "step": 5151 + }, + { + "epoch": 0.38781309396112085, + "grad_norm": 4.514595031738281, + "learning_rate": 7.01305927033094e-05, + "loss": 1.835, + "step": 5152 + }, + { + "epoch": 0.38788836824178097, + "grad_norm": 4.69317102432251, + "learning_rate": 7.011943291931071e-05, + "loss": 1.9063, + "step": 5153 + }, + { + "epoch": 0.38796364252244114, + "grad_norm": 4.969700813293457, + "learning_rate": 7.010827193926947e-05, + "loss": 2.0086, + "step": 5154 + }, + { + "epoch": 0.3880389168031013, + "grad_norm": 5.646577835083008, + "learning_rate": 7.009710976384915e-05, + "loss": 1.6582, + "step": 5155 + }, + { + "epoch": 0.38811419108376144, + "grad_norm": 6.167710304260254, + "learning_rate": 7.008594639371333e-05, + "loss": 1.7893, + "step": 5156 + }, + { + "epoch": 0.3881894653644216, + "grad_norm": 4.740093231201172, + "learning_rate": 7.00747818295256e-05, + "loss": 2.1039, + "step": 5157 + }, + { + "epoch": 0.3882647396450818, + "grad_norm": 4.954835414886475, + "learning_rate": 7.006361607194972e-05, + "loss": 2.0474, + "step": 5158 + }, + { + "epoch": 0.3883400139257419, + "grad_norm": 3.7083630561828613, + "learning_rate": 7.005244912164942e-05, + "loss": 1.8587, + "step": 5159 + }, + { + "epoch": 0.3884152882064021, + "grad_norm": 6.3856353759765625, + "learning_rate": 7.004128097928857e-05, + "loss": 1.9587, + "step": 5160 + }, + { + "epoch": 0.3884905624870622, + "grad_norm": 4.35089635848999, + "learning_rate": 7.003011164553107e-05, + "loss": 1.6856, + "step": 5161 + }, + { + "epoch": 0.3885658367677224, + "grad_norm": 5.854455947875977, + "learning_rate": 7.00189411210409e-05, + "loss": 1.907, + "step": 5162 + }, + { + "epoch": 0.38864111104838256, + "grad_norm": 4.369906902313232, + "learning_rate": 7.000776940648213e-05, + "loss": 1.7421, + "step": 5163 + }, + { + "epoch": 0.3887163853290427, + "grad_norm": 5.58068323135376, + "learning_rate": 6.999659650251885e-05, + "loss": 2.2024, + "step": 5164 + }, + { + "epoch": 0.38879165960970286, + "grad_norm": 5.518388271331787, + "learning_rate": 6.998542240981531e-05, + "loss": 1.958, + "step": 5165 + }, + { + "epoch": 0.38886693389036303, + "grad_norm": 4.063182353973389, + "learning_rate": 6.997424712903576e-05, + "loss": 1.6237, + "step": 5166 + }, + { + "epoch": 0.38894220817102315, + "grad_norm": 3.8410394191741943, + "learning_rate": 6.99630706608445e-05, + "loss": 1.6417, + "step": 5167 + }, + { + "epoch": 0.3890174824516833, + "grad_norm": 4.220770359039307, + "learning_rate": 6.995189300590599e-05, + "loss": 2.3121, + "step": 5168 + }, + { + "epoch": 0.3890927567323435, + "grad_norm": 3.782012701034546, + "learning_rate": 6.994071416488468e-05, + "loss": 1.9135, + "step": 5169 + }, + { + "epoch": 0.3891680310130036, + "grad_norm": 4.577845096588135, + "learning_rate": 6.992953413844514e-05, + "loss": 1.4614, + "step": 5170 + }, + { + "epoch": 0.3892433052936638, + "grad_norm": 5.705904006958008, + "learning_rate": 6.991835292725197e-05, + "loss": 2.326, + "step": 5171 + }, + { + "epoch": 0.3893185795743239, + "grad_norm": 5.35415506362915, + "learning_rate": 6.990717053196987e-05, + "loss": 1.9108, + "step": 5172 + }, + { + "epoch": 0.3893938538549841, + "grad_norm": 4.892650604248047, + "learning_rate": 6.989598695326363e-05, + "loss": 1.6482, + "step": 5173 + }, + { + "epoch": 0.38946912813564427, + "grad_norm": 4.593874931335449, + "learning_rate": 6.988480219179801e-05, + "loss": 1.7933, + "step": 5174 + }, + { + "epoch": 0.3895444024163044, + "grad_norm": 6.953530311584473, + "learning_rate": 6.987361624823797e-05, + "loss": 1.9065, + "step": 5175 + }, + { + "epoch": 0.38961967669696457, + "grad_norm": 5.592297077178955, + "learning_rate": 6.986242912324847e-05, + "loss": 1.8321, + "step": 5176 + }, + { + "epoch": 0.38969495097762474, + "grad_norm": 4.84185266494751, + "learning_rate": 6.985124081749457e-05, + "loss": 1.7783, + "step": 5177 + }, + { + "epoch": 0.38977022525828486, + "grad_norm": 7.100406646728516, + "learning_rate": 6.984005133164134e-05, + "loss": 2.6197, + "step": 5178 + }, + { + "epoch": 0.38984549953894504, + "grad_norm": 5.400269985198975, + "learning_rate": 6.982886066635399e-05, + "loss": 1.9053, + "step": 5179 + }, + { + "epoch": 0.38992077381960516, + "grad_norm": 5.158748149871826, + "learning_rate": 6.981766882229777e-05, + "loss": 1.9748, + "step": 5180 + }, + { + "epoch": 0.38999604810026534, + "grad_norm": 7.54788064956665, + "learning_rate": 6.9806475800138e-05, + "loss": 1.9783, + "step": 5181 + }, + { + "epoch": 0.3900713223809255, + "grad_norm": 7.641003608703613, + "learning_rate": 6.979528160054009e-05, + "loss": 2.2468, + "step": 5182 + }, + { + "epoch": 0.39014659666158563, + "grad_norm": 4.912811279296875, + "learning_rate": 6.978408622416948e-05, + "loss": 1.7204, + "step": 5183 + }, + { + "epoch": 0.3902218709422458, + "grad_norm": 6.236836910247803, + "learning_rate": 6.977288967169171e-05, + "loss": 2.3072, + "step": 5184 + }, + { + "epoch": 0.390297145222906, + "grad_norm": 3.4800755977630615, + "learning_rate": 6.976169194377238e-05, + "loss": 1.8945, + "step": 5185 + }, + { + "epoch": 0.3903724195035661, + "grad_norm": 5.417069435119629, + "learning_rate": 6.975049304107716e-05, + "loss": 2.0, + "step": 5186 + }, + { + "epoch": 0.3904476937842263, + "grad_norm": 4.192644119262695, + "learning_rate": 6.973929296427181e-05, + "loss": 2.0753, + "step": 5187 + }, + { + "epoch": 0.39052296806488646, + "grad_norm": 5.31760311126709, + "learning_rate": 6.972809171402213e-05, + "loss": 1.868, + "step": 5188 + }, + { + "epoch": 0.3905982423455466, + "grad_norm": 4.881211280822754, + "learning_rate": 6.9716889290994e-05, + "loss": 2.0271, + "step": 5189 + }, + { + "epoch": 0.39067351662620675, + "grad_norm": 5.626580715179443, + "learning_rate": 6.970568569585338e-05, + "loss": 1.8045, + "step": 5190 + }, + { + "epoch": 0.39074879090686687, + "grad_norm": 4.538501262664795, + "learning_rate": 6.969448092926629e-05, + "loss": 2.0077, + "step": 5191 + }, + { + "epoch": 0.39082406518752705, + "grad_norm": 4.080972671508789, + "learning_rate": 6.968327499189879e-05, + "loss": 2.152, + "step": 5192 + }, + { + "epoch": 0.3908993394681872, + "grad_norm": 5.495019435882568, + "learning_rate": 6.96720678844171e-05, + "loss": 1.8387, + "step": 5193 + }, + { + "epoch": 0.39097461374884734, + "grad_norm": 5.964603424072266, + "learning_rate": 6.966085960748741e-05, + "loss": 1.783, + "step": 5194 + }, + { + "epoch": 0.3910498880295075, + "grad_norm": 4.61346960067749, + "learning_rate": 6.964965016177603e-05, + "loss": 1.9008, + "step": 5195 + }, + { + "epoch": 0.3911251623101677, + "grad_norm": 9.353378295898438, + "learning_rate": 6.963843954794935e-05, + "loss": 1.7423, + "step": 5196 + }, + { + "epoch": 0.3912004365908278, + "grad_norm": 4.792913913726807, + "learning_rate": 6.962722776667376e-05, + "loss": 2.0395, + "step": 5197 + }, + { + "epoch": 0.391275710871488, + "grad_norm": 6.946119785308838, + "learning_rate": 6.961601481861581e-05, + "loss": 2.0825, + "step": 5198 + }, + { + "epoch": 0.39135098515214817, + "grad_norm": 5.736914157867432, + "learning_rate": 6.960480070444205e-05, + "loss": 1.7967, + "step": 5199 + }, + { + "epoch": 0.3914262594328083, + "grad_norm": 8.586437225341797, + "learning_rate": 6.959358542481915e-05, + "loss": 1.6642, + "step": 5200 + }, + { + "epoch": 0.39150153371346846, + "grad_norm": 5.748046875, + "learning_rate": 6.958236898041382e-05, + "loss": 2.0438, + "step": 5201 + }, + { + "epoch": 0.3915768079941286, + "grad_norm": 5.315900802612305, + "learning_rate": 6.957115137189285e-05, + "loss": 1.7355, + "step": 5202 + }, + { + "epoch": 0.39165208227478876, + "grad_norm": 6.88840913772583, + "learning_rate": 6.955993259992306e-05, + "loss": 1.9918, + "step": 5203 + }, + { + "epoch": 0.39172735655544894, + "grad_norm": 5.428752899169922, + "learning_rate": 6.954871266517143e-05, + "loss": 1.9874, + "step": 5204 + }, + { + "epoch": 0.39180263083610906, + "grad_norm": 5.024386405944824, + "learning_rate": 6.95374915683049e-05, + "loss": 1.7278, + "step": 5205 + }, + { + "epoch": 0.39187790511676923, + "grad_norm": 4.7459540367126465, + "learning_rate": 6.952626930999058e-05, + "loss": 2.464, + "step": 5206 + }, + { + "epoch": 0.3919531793974294, + "grad_norm": 4.329017639160156, + "learning_rate": 6.951504589089555e-05, + "loss": 1.7331, + "step": 5207 + }, + { + "epoch": 0.39202845367808953, + "grad_norm": 5.50597620010376, + "learning_rate": 6.950382131168705e-05, + "loss": 1.9198, + "step": 5208 + }, + { + "epoch": 0.3921037279587497, + "grad_norm": 4.4370245933532715, + "learning_rate": 6.949259557303233e-05, + "loss": 1.9569, + "step": 5209 + }, + { + "epoch": 0.3921790022394098, + "grad_norm": 5.53692626953125, + "learning_rate": 6.948136867559874e-05, + "loss": 2.2376, + "step": 5210 + }, + { + "epoch": 0.39225427652007, + "grad_norm": 4.983517646789551, + "learning_rate": 6.947014062005368e-05, + "loss": 1.7596, + "step": 5211 + }, + { + "epoch": 0.3923295508007302, + "grad_norm": 5.923350811004639, + "learning_rate": 6.945891140706462e-05, + "loss": 1.9224, + "step": 5212 + }, + { + "epoch": 0.3924048250813903, + "grad_norm": 3.438063621520996, + "learning_rate": 6.944768103729913e-05, + "loss": 2.043, + "step": 5213 + }, + { + "epoch": 0.39248009936205047, + "grad_norm": 5.481429100036621, + "learning_rate": 6.943644951142478e-05, + "loss": 2.0207, + "step": 5214 + }, + { + "epoch": 0.39255537364271065, + "grad_norm": 6.9620537757873535, + "learning_rate": 6.942521683010928e-05, + "loss": 2.1753, + "step": 5215 + }, + { + "epoch": 0.39263064792337077, + "grad_norm": 4.2170915603637695, + "learning_rate": 6.941398299402039e-05, + "loss": 2.0742, + "step": 5216 + }, + { + "epoch": 0.39270592220403094, + "grad_norm": 4.788837432861328, + "learning_rate": 6.940274800382591e-05, + "loss": 1.8934, + "step": 5217 + }, + { + "epoch": 0.3927811964846911, + "grad_norm": 5.696264743804932, + "learning_rate": 6.939151186019373e-05, + "loss": 1.8366, + "step": 5218 + }, + { + "epoch": 0.39285647076535124, + "grad_norm": 10.238476753234863, + "learning_rate": 6.938027456379182e-05, + "loss": 1.824, + "step": 5219 + }, + { + "epoch": 0.3929317450460114, + "grad_norm": 4.1892499923706055, + "learning_rate": 6.936903611528818e-05, + "loss": 1.848, + "step": 5220 + }, + { + "epoch": 0.39300701932667154, + "grad_norm": 5.8312811851501465, + "learning_rate": 6.935779651535093e-05, + "loss": 2.0635, + "step": 5221 + }, + { + "epoch": 0.3930822936073317, + "grad_norm": 5.127820014953613, + "learning_rate": 6.934655576464822e-05, + "loss": 1.9774, + "step": 5222 + }, + { + "epoch": 0.3931575678879919, + "grad_norm": 5.025149822235107, + "learning_rate": 6.933531386384827e-05, + "loss": 1.9584, + "step": 5223 + }, + { + "epoch": 0.393232842168652, + "grad_norm": 4.3378777503967285, + "learning_rate": 6.93240708136194e-05, + "loss": 1.6666, + "step": 5224 + }, + { + "epoch": 0.3933081164493122, + "grad_norm": 5.0351080894470215, + "learning_rate": 6.931282661462997e-05, + "loss": 1.8046, + "step": 5225 + }, + { + "epoch": 0.39338339072997236, + "grad_norm": 5.016655445098877, + "learning_rate": 6.93015812675484e-05, + "loss": 1.6566, + "step": 5226 + }, + { + "epoch": 0.3934586650106325, + "grad_norm": 4.996002674102783, + "learning_rate": 6.929033477304323e-05, + "loss": 2.2198, + "step": 5227 + }, + { + "epoch": 0.39353393929129266, + "grad_norm": 6.874370574951172, + "learning_rate": 6.927908713178299e-05, + "loss": 2.0046, + "step": 5228 + }, + { + "epoch": 0.3936092135719528, + "grad_norm": 8.90511417388916, + "learning_rate": 6.926783834443634e-05, + "loss": 2.6859, + "step": 5229 + }, + { + "epoch": 0.39368448785261295, + "grad_norm": 5.323373794555664, + "learning_rate": 6.925658841167197e-05, + "loss": 2.0725, + "step": 5230 + }, + { + "epoch": 0.39375976213327313, + "grad_norm": 3.8473477363586426, + "learning_rate": 6.92453373341587e-05, + "loss": 2.0402, + "step": 5231 + }, + { + "epoch": 0.39383503641393325, + "grad_norm": 6.837521076202393, + "learning_rate": 6.923408511256533e-05, + "loss": 1.5902, + "step": 5232 + }, + { + "epoch": 0.3939103106945934, + "grad_norm": 4.585453033447266, + "learning_rate": 6.922283174756081e-05, + "loss": 1.848, + "step": 5233 + }, + { + "epoch": 0.3939855849752536, + "grad_norm": 6.949504852294922, + "learning_rate": 6.921157723981408e-05, + "loss": 1.9013, + "step": 5234 + }, + { + "epoch": 0.3940608592559137, + "grad_norm": 5.052194118499756, + "learning_rate": 6.920032158999422e-05, + "loss": 1.9899, + "step": 5235 + }, + { + "epoch": 0.3941361335365739, + "grad_norm": 5.41015100479126, + "learning_rate": 6.918906479877032e-05, + "loss": 1.557, + "step": 5236 + }, + { + "epoch": 0.39421140781723407, + "grad_norm": 6.677452087402344, + "learning_rate": 6.917780686681158e-05, + "loss": 1.805, + "step": 5237 + }, + { + "epoch": 0.3942866820978942, + "grad_norm": 5.306216239929199, + "learning_rate": 6.916654779478725e-05, + "loss": 2.0667, + "step": 5238 + }, + { + "epoch": 0.39436195637855437, + "grad_norm": 4.794074058532715, + "learning_rate": 6.915528758336665e-05, + "loss": 1.7614, + "step": 5239 + }, + { + "epoch": 0.3944372306592145, + "grad_norm": 5.001025676727295, + "learning_rate": 6.914402623321916e-05, + "loss": 1.9539, + "step": 5240 + }, + { + "epoch": 0.39451250493987466, + "grad_norm": 6.838381290435791, + "learning_rate": 6.913276374501425e-05, + "loss": 2.2756, + "step": 5241 + }, + { + "epoch": 0.39458777922053484, + "grad_norm": 6.005486965179443, + "learning_rate": 6.912150011942143e-05, + "loss": 2.4479, + "step": 5242 + }, + { + "epoch": 0.39466305350119496, + "grad_norm": 4.807872772216797, + "learning_rate": 6.911023535711029e-05, + "loss": 1.7264, + "step": 5243 + }, + { + "epoch": 0.39473832778185514, + "grad_norm": 6.527614116668701, + "learning_rate": 6.90989694587505e-05, + "loss": 2.0519, + "step": 5244 + }, + { + "epoch": 0.3948136020625153, + "grad_norm": 3.5883874893188477, + "learning_rate": 6.908770242501176e-05, + "loss": 1.8678, + "step": 5245 + }, + { + "epoch": 0.39488887634317543, + "grad_norm": 4.086682319641113, + "learning_rate": 6.90764342565639e-05, + "loss": 1.8968, + "step": 5246 + }, + { + "epoch": 0.3949641506238356, + "grad_norm": 4.665533065795898, + "learning_rate": 6.906516495407674e-05, + "loss": 2.0486, + "step": 5247 + }, + { + "epoch": 0.3950394249044958, + "grad_norm": 4.139925956726074, + "learning_rate": 6.905389451822024e-05, + "loss": 1.7749, + "step": 5248 + }, + { + "epoch": 0.3951146991851559, + "grad_norm": 5.476803779602051, + "learning_rate": 6.90426229496644e-05, + "loss": 2.1279, + "step": 5249 + }, + { + "epoch": 0.3951899734658161, + "grad_norm": 4.958574295043945, + "learning_rate": 6.903135024907923e-05, + "loss": 1.8292, + "step": 5250 + }, + { + "epoch": 0.3952652477464762, + "grad_norm": 5.581496238708496, + "learning_rate": 6.902007641713492e-05, + "loss": 1.9829, + "step": 5251 + }, + { + "epoch": 0.3953405220271364, + "grad_norm": 5.000868320465088, + "learning_rate": 6.900880145450163e-05, + "loss": 2.0098, + "step": 5252 + }, + { + "epoch": 0.39541579630779655, + "grad_norm": 4.69998025894165, + "learning_rate": 6.899752536184964e-05, + "loss": 1.8519, + "step": 5253 + }, + { + "epoch": 0.3954910705884567, + "grad_norm": 4.44639253616333, + "learning_rate": 6.898624813984928e-05, + "loss": 2.0462, + "step": 5254 + }, + { + "epoch": 0.39556634486911685, + "grad_norm": 5.401144504547119, + "learning_rate": 6.897496978917093e-05, + "loss": 1.9909, + "step": 5255 + }, + { + "epoch": 0.395641619149777, + "grad_norm": 5.437026500701904, + "learning_rate": 6.896369031048508e-05, + "loss": 2.0208, + "step": 5256 + }, + { + "epoch": 0.39571689343043714, + "grad_norm": 6.105072021484375, + "learning_rate": 6.895240970446226e-05, + "loss": 2.3934, + "step": 5257 + }, + { + "epoch": 0.3957921677110973, + "grad_norm": 6.4207305908203125, + "learning_rate": 6.894112797177306e-05, + "loss": 2.3805, + "step": 5258 + }, + { + "epoch": 0.39586744199175744, + "grad_norm": 5.842775821685791, + "learning_rate": 6.892984511308814e-05, + "loss": 1.9846, + "step": 5259 + }, + { + "epoch": 0.3959427162724176, + "grad_norm": 4.6595306396484375, + "learning_rate": 6.891856112907827e-05, + "loss": 1.6674, + "step": 5260 + }, + { + "epoch": 0.3960179905530778, + "grad_norm": 4.580187797546387, + "learning_rate": 6.89072760204142e-05, + "loss": 1.724, + "step": 5261 + }, + { + "epoch": 0.3960932648337379, + "grad_norm": 5.219051361083984, + "learning_rate": 6.889598978776684e-05, + "loss": 1.9839, + "step": 5262 + }, + { + "epoch": 0.3961685391143981, + "grad_norm": 7.040702819824219, + "learning_rate": 6.88847024318071e-05, + "loss": 1.7476, + "step": 5263 + }, + { + "epoch": 0.39624381339505826, + "grad_norm": 4.006730079650879, + "learning_rate": 6.887341395320597e-05, + "loss": 1.6603, + "step": 5264 + }, + { + "epoch": 0.3963190876757184, + "grad_norm": 6.135868549346924, + "learning_rate": 6.886212435263455e-05, + "loss": 2.112, + "step": 5265 + }, + { + "epoch": 0.39639436195637856, + "grad_norm": 3.9821999073028564, + "learning_rate": 6.885083363076395e-05, + "loss": 1.9438, + "step": 5266 + }, + { + "epoch": 0.39646963623703874, + "grad_norm": 5.716285705566406, + "learning_rate": 6.88395417882654e-05, + "loss": 1.946, + "step": 5267 + }, + { + "epoch": 0.39654491051769886, + "grad_norm": 4.032278060913086, + "learning_rate": 6.882824882581013e-05, + "loss": 1.7596, + "step": 5268 + }, + { + "epoch": 0.39662018479835903, + "grad_norm": 4.436804294586182, + "learning_rate": 6.881695474406949e-05, + "loss": 1.8283, + "step": 5269 + }, + { + "epoch": 0.39669545907901915, + "grad_norm": 3.6373722553253174, + "learning_rate": 6.880565954371489e-05, + "loss": 1.5366, + "step": 5270 + }, + { + "epoch": 0.39677073335967933, + "grad_norm": 4.753200531005859, + "learning_rate": 6.87943632254178e-05, + "loss": 1.7075, + "step": 5271 + }, + { + "epoch": 0.3968460076403395, + "grad_norm": 4.538983345031738, + "learning_rate": 6.87830657898497e-05, + "loss": 1.7884, + "step": 5272 + }, + { + "epoch": 0.3969212819209996, + "grad_norm": 4.678380489349365, + "learning_rate": 6.877176723768228e-05, + "loss": 1.6964, + "step": 5273 + }, + { + "epoch": 0.3969965562016598, + "grad_norm": 4.305378437042236, + "learning_rate": 6.876046756958715e-05, + "loss": 1.7612, + "step": 5274 + }, + { + "epoch": 0.39707183048232, + "grad_norm": 9.505729675292969, + "learning_rate": 6.874916678623603e-05, + "loss": 2.0122, + "step": 5275 + }, + { + "epoch": 0.3971471047629801, + "grad_norm": 4.65559720993042, + "learning_rate": 6.873786488830076e-05, + "loss": 1.6936, + "step": 5276 + }, + { + "epoch": 0.3972223790436403, + "grad_norm": 4.853903293609619, + "learning_rate": 6.87265618764532e-05, + "loss": 2.1455, + "step": 5277 + }, + { + "epoch": 0.3972976533243004, + "grad_norm": 4.222773551940918, + "learning_rate": 6.871525775136525e-05, + "loss": 1.9502, + "step": 5278 + }, + { + "epoch": 0.39737292760496057, + "grad_norm": 5.905839920043945, + "learning_rate": 6.870395251370894e-05, + "loss": 1.8499, + "step": 5279 + }, + { + "epoch": 0.39744820188562074, + "grad_norm": 8.549711227416992, + "learning_rate": 6.869264616415629e-05, + "loss": 2.0759, + "step": 5280 + }, + { + "epoch": 0.39752347616628086, + "grad_norm": 3.7769014835357666, + "learning_rate": 6.86813387033795e-05, + "loss": 1.7907, + "step": 5281 + }, + { + "epoch": 0.39759875044694104, + "grad_norm": 4.713196277618408, + "learning_rate": 6.86700301320507e-05, + "loss": 1.8547, + "step": 5282 + }, + { + "epoch": 0.3976740247276012, + "grad_norm": 4.885081768035889, + "learning_rate": 6.865872045084219e-05, + "loss": 2.1806, + "step": 5283 + }, + { + "epoch": 0.39774929900826134, + "grad_norm": 7.624965190887451, + "learning_rate": 6.864740966042629e-05, + "loss": 2.2943, + "step": 5284 + }, + { + "epoch": 0.3978245732889215, + "grad_norm": 4.436729907989502, + "learning_rate": 6.863609776147539e-05, + "loss": 1.8709, + "step": 5285 + }, + { + "epoch": 0.3978998475695817, + "grad_norm": 5.042465686798096, + "learning_rate": 6.862478475466194e-05, + "loss": 1.5665, + "step": 5286 + }, + { + "epoch": 0.3979751218502418, + "grad_norm": 4.891488075256348, + "learning_rate": 6.861347064065849e-05, + "loss": 1.7946, + "step": 5287 + }, + { + "epoch": 0.398050396130902, + "grad_norm": 5.227749347686768, + "learning_rate": 6.860215542013761e-05, + "loss": 1.8152, + "step": 5288 + }, + { + "epoch": 0.3981256704115621, + "grad_norm": 6.2593770027160645, + "learning_rate": 6.859083909377197e-05, + "loss": 2.0069, + "step": 5289 + }, + { + "epoch": 0.3982009446922223, + "grad_norm": 9.46308708190918, + "learning_rate": 6.857952166223429e-05, + "loss": 2.1499, + "step": 5290 + }, + { + "epoch": 0.39827621897288246, + "grad_norm": 5.724310398101807, + "learning_rate": 6.856820312619735e-05, + "loss": 2.1076, + "step": 5291 + }, + { + "epoch": 0.3983514932535426, + "grad_norm": 4.571035861968994, + "learning_rate": 6.855688348633402e-05, + "loss": 1.7993, + "step": 5292 + }, + { + "epoch": 0.39842676753420275, + "grad_norm": 5.289876937866211, + "learning_rate": 6.85455627433172e-05, + "loss": 1.8123, + "step": 5293 + }, + { + "epoch": 0.39850204181486293, + "grad_norm": 4.67829704284668, + "learning_rate": 6.85342408978199e-05, + "loss": 1.5675, + "step": 5294 + }, + { + "epoch": 0.39857731609552305, + "grad_norm": 5.563104152679443, + "learning_rate": 6.852291795051518e-05, + "loss": 1.6215, + "step": 5295 + }, + { + "epoch": 0.3986525903761832, + "grad_norm": 6.449482440948486, + "learning_rate": 6.851159390207611e-05, + "loss": 1.992, + "step": 5296 + }, + { + "epoch": 0.3987278646568434, + "grad_norm": 3.546137571334839, + "learning_rate": 6.85002687531759e-05, + "loss": 2.2396, + "step": 5297 + }, + { + "epoch": 0.3988031389375035, + "grad_norm": 4.938421726226807, + "learning_rate": 6.848894250448778e-05, + "loss": 1.6918, + "step": 5298 + }, + { + "epoch": 0.3988784132181637, + "grad_norm": 6.101037502288818, + "learning_rate": 6.847761515668511e-05, + "loss": 1.7588, + "step": 5299 + }, + { + "epoch": 0.3989536874988238, + "grad_norm": 3.996248722076416, + "learning_rate": 6.846628671044121e-05, + "loss": 1.6859, + "step": 5300 + }, + { + "epoch": 0.399028961779484, + "grad_norm": 4.024343490600586, + "learning_rate": 6.845495716642958e-05, + "loss": 1.6624, + "step": 5301 + }, + { + "epoch": 0.39910423606014417, + "grad_norm": 4.989960670471191, + "learning_rate": 6.844362652532368e-05, + "loss": 2.4521, + "step": 5302 + }, + { + "epoch": 0.3991795103408043, + "grad_norm": 5.24813175201416, + "learning_rate": 6.843229478779712e-05, + "loss": 1.6548, + "step": 5303 + }, + { + "epoch": 0.39925478462146446, + "grad_norm": 5.8933539390563965, + "learning_rate": 6.84209619545235e-05, + "loss": 1.9301, + "step": 5304 + }, + { + "epoch": 0.39933005890212464, + "grad_norm": 5.07541561126709, + "learning_rate": 6.840962802617656e-05, + "loss": 2.1917, + "step": 5305 + }, + { + "epoch": 0.39940533318278476, + "grad_norm": 4.937519073486328, + "learning_rate": 6.839829300343006e-05, + "loss": 1.4381, + "step": 5306 + }, + { + "epoch": 0.39948060746344494, + "grad_norm": 4.6258015632629395, + "learning_rate": 6.838695688695782e-05, + "loss": 1.9739, + "step": 5307 + }, + { + "epoch": 0.39955588174410506, + "grad_norm": 4.504283428192139, + "learning_rate": 6.837561967743375e-05, + "loss": 1.6728, + "step": 5308 + }, + { + "epoch": 0.39963115602476523, + "grad_norm": 6.086696624755859, + "learning_rate": 6.836428137553184e-05, + "loss": 2.2676, + "step": 5309 + }, + { + "epoch": 0.3997064303054254, + "grad_norm": 4.89582633972168, + "learning_rate": 6.835294198192608e-05, + "loss": 1.8347, + "step": 5310 + }, + { + "epoch": 0.39978170458608553, + "grad_norm": 7.3717756271362305, + "learning_rate": 6.834160149729058e-05, + "loss": 1.6472, + "step": 5311 + }, + { + "epoch": 0.3998569788667457, + "grad_norm": 4.447671890258789, + "learning_rate": 6.83302599222995e-05, + "loss": 1.6426, + "step": 5312 + }, + { + "epoch": 0.3999322531474059, + "grad_norm": 5.7321624755859375, + "learning_rate": 6.831891725762706e-05, + "loss": 2.2593, + "step": 5313 + }, + { + "epoch": 0.400007527428066, + "grad_norm": 6.040215969085693, + "learning_rate": 6.830757350394755e-05, + "loss": 2.1071, + "step": 5314 + }, + { + "epoch": 0.4000828017087262, + "grad_norm": 5.973138809204102, + "learning_rate": 6.829622866193532e-05, + "loss": 1.8924, + "step": 5315 + }, + { + "epoch": 0.40015807598938635, + "grad_norm": 6.683164119720459, + "learning_rate": 6.82848827322648e-05, + "loss": 1.7196, + "step": 5316 + }, + { + "epoch": 0.4002333502700465, + "grad_norm": 5.338925361633301, + "learning_rate": 6.827353571561048e-05, + "loss": 1.9467, + "step": 5317 + }, + { + "epoch": 0.40030862455070665, + "grad_norm": 4.153974533081055, + "learning_rate": 6.826218761264689e-05, + "loss": 1.6475, + "step": 5318 + }, + { + "epoch": 0.40038389883136677, + "grad_norm": 4.400400638580322, + "learning_rate": 6.825083842404862e-05, + "loss": 1.7783, + "step": 5319 + }, + { + "epoch": 0.40045917311202694, + "grad_norm": 4.023130893707275, + "learning_rate": 6.823948815049039e-05, + "loss": 2.0207, + "step": 5320 + }, + { + "epoch": 0.4005344473926871, + "grad_norm": 4.367240905761719, + "learning_rate": 6.822813679264694e-05, + "loss": 1.9491, + "step": 5321 + }, + { + "epoch": 0.40060972167334724, + "grad_norm": 4.532545566558838, + "learning_rate": 6.821678435119304e-05, + "loss": 1.7451, + "step": 5322 + }, + { + "epoch": 0.4006849959540074, + "grad_norm": 5.647633075714111, + "learning_rate": 6.82054308268036e-05, + "loss": 1.6903, + "step": 5323 + }, + { + "epoch": 0.4007602702346676, + "grad_norm": 6.778362274169922, + "learning_rate": 6.819407622015353e-05, + "loss": 2.7986, + "step": 5324 + }, + { + "epoch": 0.4008355445153277, + "grad_norm": 4.840933322906494, + "learning_rate": 6.818272053191781e-05, + "loss": 1.9522, + "step": 5325 + }, + { + "epoch": 0.4009108187959879, + "grad_norm": 4.428386688232422, + "learning_rate": 6.817136376277154e-05, + "loss": 1.6789, + "step": 5326 + }, + { + "epoch": 0.400986093076648, + "grad_norm": 4.903996467590332, + "learning_rate": 6.816000591338985e-05, + "loss": 1.9061, + "step": 5327 + }, + { + "epoch": 0.4010613673573082, + "grad_norm": 3.8121678829193115, + "learning_rate": 6.81486469844479e-05, + "loss": 1.6681, + "step": 5328 + }, + { + "epoch": 0.40113664163796836, + "grad_norm": 5.033172607421875, + "learning_rate": 6.813728697662096e-05, + "loss": 1.5037, + "step": 5329 + }, + { + "epoch": 0.4012119159186285, + "grad_norm": 4.65513801574707, + "learning_rate": 6.812592589058435e-05, + "loss": 1.9911, + "step": 5330 + }, + { + "epoch": 0.40128719019928866, + "grad_norm": 3.5418541431427, + "learning_rate": 6.811456372701345e-05, + "loss": 1.8812, + "step": 5331 + }, + { + "epoch": 0.40136246447994883, + "grad_norm": 7.2162909507751465, + "learning_rate": 6.810320048658372e-05, + "loss": 2.3676, + "step": 5332 + }, + { + "epoch": 0.40143773876060895, + "grad_norm": 4.268293380737305, + "learning_rate": 6.809183616997068e-05, + "loss": 1.7626, + "step": 5333 + }, + { + "epoch": 0.40151301304126913, + "grad_norm": 5.3905792236328125, + "learning_rate": 6.808047077784988e-05, + "loss": 2.1274, + "step": 5334 + }, + { + "epoch": 0.4015882873219293, + "grad_norm": 7.440544605255127, + "learning_rate": 6.806910431089696e-05, + "loss": 1.7608, + "step": 5335 + }, + { + "epoch": 0.4016635616025894, + "grad_norm": 6.137508392333984, + "learning_rate": 6.805773676978764e-05, + "loss": 1.7673, + "step": 5336 + }, + { + "epoch": 0.4017388358832496, + "grad_norm": 6.605532646179199, + "learning_rate": 6.804636815519768e-05, + "loss": 2.0913, + "step": 5337 + }, + { + "epoch": 0.4018141101639097, + "grad_norm": 3.625389337539673, + "learning_rate": 6.803499846780292e-05, + "loss": 1.9003, + "step": 5338 + }, + { + "epoch": 0.4018893844445699, + "grad_norm": 4.9041643142700195, + "learning_rate": 6.802362770827926e-05, + "loss": 1.9452, + "step": 5339 + }, + { + "epoch": 0.4019646587252301, + "grad_norm": 4.29222297668457, + "learning_rate": 6.801225587730263e-05, + "loss": 2.0383, + "step": 5340 + }, + { + "epoch": 0.4020399330058902, + "grad_norm": 5.0526533126831055, + "learning_rate": 6.800088297554908e-05, + "loss": 1.6259, + "step": 5341 + }, + { + "epoch": 0.40211520728655037, + "grad_norm": 5.545862674713135, + "learning_rate": 6.798950900369469e-05, + "loss": 1.8551, + "step": 5342 + }, + { + "epoch": 0.40219048156721054, + "grad_norm": 5.352468967437744, + "learning_rate": 6.79781339624156e-05, + "loss": 1.8697, + "step": 5343 + }, + { + "epoch": 0.40226575584787067, + "grad_norm": 3.8349769115448, + "learning_rate": 6.796675785238804e-05, + "loss": 1.9894, + "step": 5344 + }, + { + "epoch": 0.40234103012853084, + "grad_norm": 4.386890411376953, + "learning_rate": 6.795538067428827e-05, + "loss": 2.0703, + "step": 5345 + }, + { + "epoch": 0.402416304409191, + "grad_norm": 4.886191368103027, + "learning_rate": 6.794400242879264e-05, + "loss": 1.7853, + "step": 5346 + }, + { + "epoch": 0.40249157868985114, + "grad_norm": 4.671879291534424, + "learning_rate": 6.793262311657755e-05, + "loss": 1.7396, + "step": 5347 + }, + { + "epoch": 0.4025668529705113, + "grad_norm": 4.590297222137451, + "learning_rate": 6.792124273831948e-05, + "loss": 1.5286, + "step": 5348 + }, + { + "epoch": 0.40264212725117143, + "grad_norm": 4.19608736038208, + "learning_rate": 6.790986129469494e-05, + "loss": 2.0806, + "step": 5349 + }, + { + "epoch": 0.4027174015318316, + "grad_norm": 4.767586708068848, + "learning_rate": 6.789847878638054e-05, + "loss": 2.0316, + "step": 5350 + }, + { + "epoch": 0.4027926758124918, + "grad_norm": 5.733920097351074, + "learning_rate": 6.788709521405295e-05, + "loss": 1.9693, + "step": 5351 + }, + { + "epoch": 0.4028679500931519, + "grad_norm": 4.402591228485107, + "learning_rate": 6.787571057838884e-05, + "loss": 2.2142, + "step": 5352 + }, + { + "epoch": 0.4029432243738121, + "grad_norm": 3.8515992164611816, + "learning_rate": 6.786432488006503e-05, + "loss": 2.1997, + "step": 5353 + }, + { + "epoch": 0.40301849865447226, + "grad_norm": 5.119091033935547, + "learning_rate": 6.785293811975838e-05, + "loss": 1.8369, + "step": 5354 + }, + { + "epoch": 0.4030937729351324, + "grad_norm": 5.049182415008545, + "learning_rate": 6.784155029814579e-05, + "loss": 1.7441, + "step": 5355 + }, + { + "epoch": 0.40316904721579255, + "grad_norm": 4.594454765319824, + "learning_rate": 6.783016141590422e-05, + "loss": 1.5844, + "step": 5356 + }, + { + "epoch": 0.4032443214964527, + "grad_norm": 3.905133008956909, + "learning_rate": 6.781877147371071e-05, + "loss": 2.0903, + "step": 5357 + }, + { + "epoch": 0.40331959577711285, + "grad_norm": 4.276846885681152, + "learning_rate": 6.780738047224237e-05, + "loss": 1.8605, + "step": 5358 + }, + { + "epoch": 0.403394870057773, + "grad_norm": 4.683049201965332, + "learning_rate": 6.779598841217636e-05, + "loss": 2.0306, + "step": 5359 + }, + { + "epoch": 0.40347014433843315, + "grad_norm": 4.889284610748291, + "learning_rate": 6.778459529418989e-05, + "loss": 1.9703, + "step": 5360 + }, + { + "epoch": 0.4035454186190933, + "grad_norm": 4.012659549713135, + "learning_rate": 6.777320111896027e-05, + "loss": 1.7099, + "step": 5361 + }, + { + "epoch": 0.4036206928997535, + "grad_norm": 5.005797863006592, + "learning_rate": 6.776180588716484e-05, + "loss": 1.9093, + "step": 5362 + }, + { + "epoch": 0.4036959671804136, + "grad_norm": 5.917949676513672, + "learning_rate": 6.775040959948101e-05, + "loss": 2.0289, + "step": 5363 + }, + { + "epoch": 0.4037712414610738, + "grad_norm": 4.17591667175293, + "learning_rate": 6.773901225658626e-05, + "loss": 1.9217, + "step": 5364 + }, + { + "epoch": 0.40384651574173397, + "grad_norm": 4.706019401550293, + "learning_rate": 6.772761385915812e-05, + "loss": 1.9777, + "step": 5365 + }, + { + "epoch": 0.4039217900223941, + "grad_norm": 5.509639263153076, + "learning_rate": 6.771621440787423e-05, + "loss": 1.9314, + "step": 5366 + }, + { + "epoch": 0.40399706430305427, + "grad_norm": 4.2420854568481445, + "learning_rate": 6.770481390341221e-05, + "loss": 2.0033, + "step": 5367 + }, + { + "epoch": 0.4040723385837144, + "grad_norm": 4.64718770980835, + "learning_rate": 6.76934123464498e-05, + "loss": 1.9052, + "step": 5368 + }, + { + "epoch": 0.40414761286437456, + "grad_norm": 4.042972564697266, + "learning_rate": 6.768200973766478e-05, + "loss": 1.6507, + "step": 5369 + }, + { + "epoch": 0.40422288714503474, + "grad_norm": 3.9866669178009033, + "learning_rate": 6.767060607773504e-05, + "loss": 1.9448, + "step": 5370 + }, + { + "epoch": 0.40429816142569486, + "grad_norm": 7.047448635101318, + "learning_rate": 6.765920136733848e-05, + "loss": 1.9653, + "step": 5371 + }, + { + "epoch": 0.40437343570635503, + "grad_norm": 4.919844627380371, + "learning_rate": 6.764779560715305e-05, + "loss": 2.3077, + "step": 5372 + }, + { + "epoch": 0.4044487099870152, + "grad_norm": 4.603641510009766, + "learning_rate": 6.763638879785681e-05, + "loss": 1.7392, + "step": 5373 + }, + { + "epoch": 0.40452398426767533, + "grad_norm": 4.3513503074646, + "learning_rate": 6.762498094012787e-05, + "loss": 1.9311, + "step": 5374 + }, + { + "epoch": 0.4045992585483355, + "grad_norm": 5.913522720336914, + "learning_rate": 6.761357203464436e-05, + "loss": 1.8406, + "step": 5375 + }, + { + "epoch": 0.4046745328289956, + "grad_norm": 4.463541030883789, + "learning_rate": 6.760216208208456e-05, + "loss": 1.7442, + "step": 5376 + }, + { + "epoch": 0.4047498071096558, + "grad_norm": 4.700443267822266, + "learning_rate": 6.759075108312672e-05, + "loss": 1.7575, + "step": 5377 + }, + { + "epoch": 0.404825081390316, + "grad_norm": 3.870957851409912, + "learning_rate": 6.757933903844921e-05, + "loss": 1.6329, + "step": 5378 + }, + { + "epoch": 0.4049003556709761, + "grad_norm": 5.883016109466553, + "learning_rate": 6.756792594873042e-05, + "loss": 1.8877, + "step": 5379 + }, + { + "epoch": 0.4049756299516363, + "grad_norm": 4.039511203765869, + "learning_rate": 6.755651181464885e-05, + "loss": 1.8635, + "step": 5380 + }, + { + "epoch": 0.40505090423229645, + "grad_norm": 5.450438022613525, + "learning_rate": 6.754509663688303e-05, + "loss": 2.1554, + "step": 5381 + }, + { + "epoch": 0.40512617851295657, + "grad_norm": 5.378197193145752, + "learning_rate": 6.753368041611156e-05, + "loss": 2.0284, + "step": 5382 + }, + { + "epoch": 0.40520145279361675, + "grad_norm": 5.459533214569092, + "learning_rate": 6.752226315301309e-05, + "loss": 1.8221, + "step": 5383 + }, + { + "epoch": 0.4052767270742769, + "grad_norm": 4.1703925132751465, + "learning_rate": 6.751084484826635e-05, + "loss": 1.8704, + "step": 5384 + }, + { + "epoch": 0.40535200135493704, + "grad_norm": 4.372258186340332, + "learning_rate": 6.749942550255013e-05, + "loss": 1.6398, + "step": 5385 + }, + { + "epoch": 0.4054272756355972, + "grad_norm": 8.171854972839355, + "learning_rate": 6.748800511654327e-05, + "loss": 2.0183, + "step": 5386 + }, + { + "epoch": 0.40550254991625734, + "grad_norm": 5.452788829803467, + "learning_rate": 6.74765836909247e-05, + "loss": 1.8824, + "step": 5387 + }, + { + "epoch": 0.4055778241969175, + "grad_norm": 6.154483318328857, + "learning_rate": 6.746516122637336e-05, + "loss": 1.9031, + "step": 5388 + }, + { + "epoch": 0.4056530984775777, + "grad_norm": 5.87261962890625, + "learning_rate": 6.745373772356832e-05, + "loss": 1.7002, + "step": 5389 + }, + { + "epoch": 0.4057283727582378, + "grad_norm": 5.224818706512451, + "learning_rate": 6.744231318318863e-05, + "loss": 1.8068, + "step": 5390 + }, + { + "epoch": 0.405803647038898, + "grad_norm": 4.689792156219482, + "learning_rate": 6.743088760591349e-05, + "loss": 1.6611, + "step": 5391 + }, + { + "epoch": 0.40587892131955816, + "grad_norm": 3.937889575958252, + "learning_rate": 6.741946099242206e-05, + "loss": 1.6875, + "step": 5392 + }, + { + "epoch": 0.4059541956002183, + "grad_norm": 5.278813362121582, + "learning_rate": 6.74080333433937e-05, + "loss": 1.8321, + "step": 5393 + }, + { + "epoch": 0.40602946988087846, + "grad_norm": 4.831874370574951, + "learning_rate": 6.739660465950769e-05, + "loss": 1.6056, + "step": 5394 + }, + { + "epoch": 0.40610474416153863, + "grad_norm": 7.1477437019348145, + "learning_rate": 6.738517494144346e-05, + "loss": 1.5942, + "step": 5395 + }, + { + "epoch": 0.40618001844219875, + "grad_norm": 4.22511100769043, + "learning_rate": 6.737374418988044e-05, + "loss": 2.167, + "step": 5396 + }, + { + "epoch": 0.40625529272285893, + "grad_norm": 6.920355319976807, + "learning_rate": 6.736231240549818e-05, + "loss": 1.6283, + "step": 5397 + }, + { + "epoch": 0.40633056700351905, + "grad_norm": 5.481980800628662, + "learning_rate": 6.735087958897628e-05, + "loss": 1.9042, + "step": 5398 + }, + { + "epoch": 0.4064058412841792, + "grad_norm": 5.518102645874023, + "learning_rate": 6.733944574099437e-05, + "loss": 1.5483, + "step": 5399 + }, + { + "epoch": 0.4064811155648394, + "grad_norm": 4.6662983894348145, + "learning_rate": 6.732801086223216e-05, + "loss": 1.8087, + "step": 5400 + }, + { + "epoch": 0.4065563898454995, + "grad_norm": 4.208428382873535, + "learning_rate": 6.731657495336942e-05, + "loss": 2.1168, + "step": 5401 + }, + { + "epoch": 0.4066316641261597, + "grad_norm": 6.083242416381836, + "learning_rate": 6.730513801508601e-05, + "loss": 1.6628, + "step": 5402 + }, + { + "epoch": 0.4067069384068199, + "grad_norm": 5.460692405700684, + "learning_rate": 6.729370004806176e-05, + "loss": 1.6781, + "step": 5403 + }, + { + "epoch": 0.40678221268748, + "grad_norm": 3.8394346237182617, + "learning_rate": 6.728226105297668e-05, + "loss": 1.8413, + "step": 5404 + }, + { + "epoch": 0.40685748696814017, + "grad_norm": 3.981407403945923, + "learning_rate": 6.727082103051077e-05, + "loss": 1.7685, + "step": 5405 + }, + { + "epoch": 0.4069327612488003, + "grad_norm": 4.696751594543457, + "learning_rate": 6.725937998134412e-05, + "loss": 1.9947, + "step": 5406 + }, + { + "epoch": 0.40700803552946047, + "grad_norm": 4.249602794647217, + "learning_rate": 6.724793790615685e-05, + "loss": 1.7482, + "step": 5407 + }, + { + "epoch": 0.40708330981012064, + "grad_norm": 8.267666816711426, + "learning_rate": 6.723649480562913e-05, + "loss": 2.0539, + "step": 5408 + }, + { + "epoch": 0.40715858409078076, + "grad_norm": 5.0732221603393555, + "learning_rate": 6.722505068044128e-05, + "loss": 1.5534, + "step": 5409 + }, + { + "epoch": 0.40723385837144094, + "grad_norm": 4.7796711921691895, + "learning_rate": 6.721360553127358e-05, + "loss": 2.423, + "step": 5410 + }, + { + "epoch": 0.4073091326521011, + "grad_norm": 5.815858364105225, + "learning_rate": 6.720215935880644e-05, + "loss": 2.2248, + "step": 5411 + }, + { + "epoch": 0.40738440693276123, + "grad_norm": 6.612654685974121, + "learning_rate": 6.719071216372027e-05, + "loss": 2.0987, + "step": 5412 + }, + { + "epoch": 0.4074596812134214, + "grad_norm": 5.158834457397461, + "learning_rate": 6.717926394669558e-05, + "loss": 1.904, + "step": 5413 + }, + { + "epoch": 0.4075349554940816, + "grad_norm": 5.117301940917969, + "learning_rate": 6.716781470841296e-05, + "loss": 1.8189, + "step": 5414 + }, + { + "epoch": 0.4076102297747417, + "grad_norm": 4.515567779541016, + "learning_rate": 6.7156364449553e-05, + "loss": 1.7384, + "step": 5415 + }, + { + "epoch": 0.4076855040554019, + "grad_norm": 6.042490005493164, + "learning_rate": 6.71449131707964e-05, + "loss": 1.7519, + "step": 5416 + }, + { + "epoch": 0.407760778336062, + "grad_norm": 4.776151657104492, + "learning_rate": 6.713346087282392e-05, + "loss": 1.9827, + "step": 5417 + }, + { + "epoch": 0.4078360526167222, + "grad_norm": 4.371926784515381, + "learning_rate": 6.712200755631634e-05, + "loss": 2.0774, + "step": 5418 + }, + { + "epoch": 0.40791132689738235, + "grad_norm": 5.890740394592285, + "learning_rate": 6.711055322195455e-05, + "loss": 2.1401, + "step": 5419 + }, + { + "epoch": 0.4079866011780425, + "grad_norm": 4.706520080566406, + "learning_rate": 6.709909787041945e-05, + "loss": 1.9691, + "step": 5420 + }, + { + "epoch": 0.40806187545870265, + "grad_norm": 7.046875476837158, + "learning_rate": 6.708764150239206e-05, + "loss": 2.0648, + "step": 5421 + }, + { + "epoch": 0.4081371497393628, + "grad_norm": 5.144106388092041, + "learning_rate": 6.70761841185534e-05, + "loss": 1.6877, + "step": 5422 + }, + { + "epoch": 0.40821242402002295, + "grad_norm": 7.4167866706848145, + "learning_rate": 6.706472571958461e-05, + "loss": 1.6675, + "step": 5423 + }, + { + "epoch": 0.4082876983006831, + "grad_norm": 5.987636089324951, + "learning_rate": 6.705326630616684e-05, + "loss": 2.186, + "step": 5424 + }, + { + "epoch": 0.40836297258134324, + "grad_norm": 4.898227691650391, + "learning_rate": 6.70418058789813e-05, + "loss": 2.3504, + "step": 5425 + }, + { + "epoch": 0.4084382468620034, + "grad_norm": 5.678811073303223, + "learning_rate": 6.70303444387093e-05, + "loss": 2.1565, + "step": 5426 + }, + { + "epoch": 0.4085135211426636, + "grad_norm": 4.878733158111572, + "learning_rate": 6.701888198603221e-05, + "loss": 2.0222, + "step": 5427 + }, + { + "epoch": 0.4085887954233237, + "grad_norm": 5.351839542388916, + "learning_rate": 6.70074185216314e-05, + "loss": 2.4616, + "step": 5428 + }, + { + "epoch": 0.4086640697039839, + "grad_norm": 3.618213653564453, + "learning_rate": 6.699595404618837e-05, + "loss": 2.2737, + "step": 5429 + }, + { + "epoch": 0.40873934398464407, + "grad_norm": 6.1762776374816895, + "learning_rate": 6.698448856038462e-05, + "loss": 1.9113, + "step": 5430 + }, + { + "epoch": 0.4088146182653042, + "grad_norm": 4.630792617797852, + "learning_rate": 6.697302206490178e-05, + "loss": 1.9024, + "step": 5431 + }, + { + "epoch": 0.40888989254596436, + "grad_norm": 4.821786880493164, + "learning_rate": 6.696155456042147e-05, + "loss": 1.9918, + "step": 5432 + }, + { + "epoch": 0.40896516682662454, + "grad_norm": 4.789692401885986, + "learning_rate": 6.69500860476254e-05, + "loss": 2.0478, + "step": 5433 + }, + { + "epoch": 0.40904044110728466, + "grad_norm": 4.95683479309082, + "learning_rate": 6.693861652719536e-05, + "loss": 1.8783, + "step": 5434 + }, + { + "epoch": 0.40911571538794483, + "grad_norm": 8.01365852355957, + "learning_rate": 6.692714599981318e-05, + "loss": 2.1239, + "step": 5435 + }, + { + "epoch": 0.40919098966860495, + "grad_norm": 5.053940296173096, + "learning_rate": 6.691567446616072e-05, + "loss": 1.8161, + "step": 5436 + }, + { + "epoch": 0.40926626394926513, + "grad_norm": 5.602110385894775, + "learning_rate": 6.690420192691996e-05, + "loss": 1.589, + "step": 5437 + }, + { + "epoch": 0.4093415382299253, + "grad_norm": 4.957159996032715, + "learning_rate": 6.68927283827729e-05, + "loss": 2.1941, + "step": 5438 + }, + { + "epoch": 0.4094168125105854, + "grad_norm": 4.957586288452148, + "learning_rate": 6.68812538344016e-05, + "loss": 2.0952, + "step": 5439 + }, + { + "epoch": 0.4094920867912456, + "grad_norm": 5.923855781555176, + "learning_rate": 6.686977828248822e-05, + "loss": 1.8752, + "step": 5440 + }, + { + "epoch": 0.4095673610719058, + "grad_norm": 3.693673849105835, + "learning_rate": 6.68583017277149e-05, + "loss": 2.0919, + "step": 5441 + }, + { + "epoch": 0.4096426353525659, + "grad_norm": 4.2660040855407715, + "learning_rate": 6.684682417076392e-05, + "loss": 1.7999, + "step": 5442 + }, + { + "epoch": 0.4097179096332261, + "grad_norm": 5.9830498695373535, + "learning_rate": 6.68353456123176e-05, + "loss": 1.8812, + "step": 5443 + }, + { + "epoch": 0.40979318391388625, + "grad_norm": 4.192532062530518, + "learning_rate": 6.682386605305827e-05, + "loss": 1.6391, + "step": 5444 + }, + { + "epoch": 0.40986845819454637, + "grad_norm": 5.0706562995910645, + "learning_rate": 6.681238549366838e-05, + "loss": 1.5646, + "step": 5445 + }, + { + "epoch": 0.40994373247520655, + "grad_norm": 3.545498847961426, + "learning_rate": 6.680090393483041e-05, + "loss": 2.1008, + "step": 5446 + }, + { + "epoch": 0.41001900675586667, + "grad_norm": 4.469223499298096, + "learning_rate": 6.678942137722691e-05, + "loss": 2.2487, + "step": 5447 + }, + { + "epoch": 0.41009428103652684, + "grad_norm": 3.9156947135925293, + "learning_rate": 6.677793782154049e-05, + "loss": 1.8026, + "step": 5448 + }, + { + "epoch": 0.410169555317187, + "grad_norm": 4.049769878387451, + "learning_rate": 6.67664532684538e-05, + "loss": 2.2555, + "step": 5449 + }, + { + "epoch": 0.41024482959784714, + "grad_norm": 6.026326656341553, + "learning_rate": 6.675496771864958e-05, + "loss": 1.961, + "step": 5450 + }, + { + "epoch": 0.4103201038785073, + "grad_norm": 5.659010410308838, + "learning_rate": 6.67434811728106e-05, + "loss": 1.5755, + "step": 5451 + }, + { + "epoch": 0.4103953781591675, + "grad_norm": 10.245710372924805, + "learning_rate": 6.67319936316197e-05, + "loss": 2.0027, + "step": 5452 + }, + { + "epoch": 0.4104706524398276, + "grad_norm": 4.772189140319824, + "learning_rate": 6.672050509575978e-05, + "loss": 1.7578, + "step": 5453 + }, + { + "epoch": 0.4105459267204878, + "grad_norm": 6.108232021331787, + "learning_rate": 6.670901556591383e-05, + "loss": 2.0779, + "step": 5454 + }, + { + "epoch": 0.4106212010011479, + "grad_norm": 4.190424919128418, + "learning_rate": 6.669752504276486e-05, + "loss": 1.8233, + "step": 5455 + }, + { + "epoch": 0.4106964752818081, + "grad_norm": 5.099961757659912, + "learning_rate": 6.668603352699592e-05, + "loss": 2.1098, + "step": 5456 + }, + { + "epoch": 0.41077174956246826, + "grad_norm": 3.5378541946411133, + "learning_rate": 6.667454101929016e-05, + "loss": 2.2029, + "step": 5457 + }, + { + "epoch": 0.4108470238431284, + "grad_norm": 4.6320366859436035, + "learning_rate": 6.666304752033076e-05, + "loss": 1.7842, + "step": 5458 + }, + { + "epoch": 0.41092229812378855, + "grad_norm": 3.7197091579437256, + "learning_rate": 6.665155303080104e-05, + "loss": 1.7905, + "step": 5459 + }, + { + "epoch": 0.41099757240444873, + "grad_norm": 5.463592529296875, + "learning_rate": 6.664005755138424e-05, + "loss": 1.8578, + "step": 5460 + }, + { + "epoch": 0.41107284668510885, + "grad_norm": 4.696115493774414, + "learning_rate": 6.662856108276379e-05, + "loss": 1.8259, + "step": 5461 + }, + { + "epoch": 0.411148120965769, + "grad_norm": 4.654155731201172, + "learning_rate": 6.661706362562308e-05, + "loss": 1.5253, + "step": 5462 + }, + { + "epoch": 0.4112233952464292, + "grad_norm": 4.110753536224365, + "learning_rate": 6.66055651806456e-05, + "loss": 1.9475, + "step": 5463 + }, + { + "epoch": 0.4112986695270893, + "grad_norm": 4.073593616485596, + "learning_rate": 6.659406574851494e-05, + "loss": 1.8752, + "step": 5464 + }, + { + "epoch": 0.4113739438077495, + "grad_norm": 4.415874004364014, + "learning_rate": 6.658256532991468e-05, + "loss": 1.7501, + "step": 5465 + }, + { + "epoch": 0.4114492180884096, + "grad_norm": 5.778043746948242, + "learning_rate": 6.65710639255285e-05, + "loss": 1.524, + "step": 5466 + }, + { + "epoch": 0.4115244923690698, + "grad_norm": 4.38038969039917, + "learning_rate": 6.65595615360401e-05, + "loss": 2.1217, + "step": 5467 + }, + { + "epoch": 0.41159976664972997, + "grad_norm": 35.54948425292969, + "learning_rate": 6.65480581621333e-05, + "loss": 1.9112, + "step": 5468 + }, + { + "epoch": 0.4116750409303901, + "grad_norm": 3.8536789417266846, + "learning_rate": 6.653655380449192e-05, + "loss": 1.914, + "step": 5469 + }, + { + "epoch": 0.41175031521105027, + "grad_norm": 3.9715640544891357, + "learning_rate": 6.652504846379984e-05, + "loss": 2.3539, + "step": 5470 + }, + { + "epoch": 0.41182558949171044, + "grad_norm": 6.405800819396973, + "learning_rate": 6.651354214074106e-05, + "loss": 2.0071, + "step": 5471 + }, + { + "epoch": 0.41190086377237056, + "grad_norm": 5.497026443481445, + "learning_rate": 6.65020348359996e-05, + "loss": 1.9456, + "step": 5472 + }, + { + "epoch": 0.41197613805303074, + "grad_norm": 3.540661573410034, + "learning_rate": 6.649052655025951e-05, + "loss": 1.9546, + "step": 5473 + }, + { + "epoch": 0.41205141233369086, + "grad_norm": 4.088535308837891, + "learning_rate": 6.647901728420494e-05, + "loss": 1.9123, + "step": 5474 + }, + { + "epoch": 0.41212668661435103, + "grad_norm": 7.440867900848389, + "learning_rate": 6.646750703852006e-05, + "loss": 2.2778, + "step": 5475 + }, + { + "epoch": 0.4122019608950112, + "grad_norm": 4.542690277099609, + "learning_rate": 6.645599581388913e-05, + "loss": 1.9164, + "step": 5476 + }, + { + "epoch": 0.41227723517567133, + "grad_norm": 7.522364139556885, + "learning_rate": 6.64444836109965e-05, + "loss": 2.0474, + "step": 5477 + }, + { + "epoch": 0.4123525094563315, + "grad_norm": 4.173257350921631, + "learning_rate": 6.643297043052647e-05, + "loss": 2.1191, + "step": 5478 + }, + { + "epoch": 0.4124277837369917, + "grad_norm": 4.747725963592529, + "learning_rate": 6.64214562731635e-05, + "loss": 1.813, + "step": 5479 + }, + { + "epoch": 0.4125030580176518, + "grad_norm": 4.226992607116699, + "learning_rate": 6.640994113959209e-05, + "loss": 1.9841, + "step": 5480 + }, + { + "epoch": 0.412578332298312, + "grad_norm": 5.388829231262207, + "learning_rate": 6.639842503049674e-05, + "loss": 2.1282, + "step": 5481 + }, + { + "epoch": 0.41265360657897215, + "grad_norm": 4.628787040710449, + "learning_rate": 6.63869079465621e-05, + "loss": 1.8845, + "step": 5482 + }, + { + "epoch": 0.4127288808596323, + "grad_norm": 4.202969074249268, + "learning_rate": 6.63753898884728e-05, + "loss": 1.6986, + "step": 5483 + }, + { + "epoch": 0.41280415514029245, + "grad_norm": 4.4819111824035645, + "learning_rate": 6.636387085691356e-05, + "loss": 1.9396, + "step": 5484 + }, + { + "epoch": 0.41287942942095257, + "grad_norm": 3.757169246673584, + "learning_rate": 6.635235085256914e-05, + "loss": 1.7243, + "step": 5485 + }, + { + "epoch": 0.41295470370161275, + "grad_norm": 4.030976295471191, + "learning_rate": 6.634082987612438e-05, + "loss": 1.7044, + "step": 5486 + }, + { + "epoch": 0.4130299779822729, + "grad_norm": 4.444151401519775, + "learning_rate": 6.632930792826417e-05, + "loss": 1.8077, + "step": 5487 + }, + { + "epoch": 0.41310525226293304, + "grad_norm": 5.865589618682861, + "learning_rate": 6.631778500967347e-05, + "loss": 1.9496, + "step": 5488 + }, + { + "epoch": 0.4131805265435932, + "grad_norm": 5.243549823760986, + "learning_rate": 6.630626112103727e-05, + "loss": 2.4104, + "step": 5489 + }, + { + "epoch": 0.4132558008242534, + "grad_norm": 3.929457902908325, + "learning_rate": 6.629473626304063e-05, + "loss": 2.0006, + "step": 5490 + }, + { + "epoch": 0.4133310751049135, + "grad_norm": 5.040304660797119, + "learning_rate": 6.628321043636868e-05, + "loss": 2.1344, + "step": 5491 + }, + { + "epoch": 0.4134063493855737, + "grad_norm": 3.9484832286834717, + "learning_rate": 6.627168364170659e-05, + "loss": 1.6634, + "step": 5492 + }, + { + "epoch": 0.41348162366623387, + "grad_norm": 4.7023091316223145, + "learning_rate": 6.626015587973958e-05, + "loss": 1.9543, + "step": 5493 + }, + { + "epoch": 0.413556897946894, + "grad_norm": 3.7054481506347656, + "learning_rate": 6.624862715115297e-05, + "loss": 1.6617, + "step": 5494 + }, + { + "epoch": 0.41363217222755416, + "grad_norm": 14.342565536499023, + "learning_rate": 6.623709745663212e-05, + "loss": 1.7839, + "step": 5495 + }, + { + "epoch": 0.4137074465082143, + "grad_norm": 3.685948133468628, + "learning_rate": 6.622556679686238e-05, + "loss": 1.6717, + "step": 5496 + }, + { + "epoch": 0.41378272078887446, + "grad_norm": 5.35618257522583, + "learning_rate": 6.621403517252926e-05, + "loss": 2.0249, + "step": 5497 + }, + { + "epoch": 0.41385799506953463, + "grad_norm": 6.446765422821045, + "learning_rate": 6.620250258431827e-05, + "loss": 1.7099, + "step": 5498 + }, + { + "epoch": 0.41393326935019475, + "grad_norm": 5.034029960632324, + "learning_rate": 6.6190969032915e-05, + "loss": 1.9267, + "step": 5499 + }, + { + "epoch": 0.41400854363085493, + "grad_norm": 5.23973274230957, + "learning_rate": 6.617943451900508e-05, + "loss": 2.0429, + "step": 5500 + }, + { + "epoch": 0.4140838179115151, + "grad_norm": 5.05288028717041, + "learning_rate": 6.616789904327421e-05, + "loss": 1.8314, + "step": 5501 + }, + { + "epoch": 0.4141590921921752, + "grad_norm": 3.8253533840179443, + "learning_rate": 6.615636260640812e-05, + "loss": 2.2482, + "step": 5502 + }, + { + "epoch": 0.4142343664728354, + "grad_norm": 4.567711353302002, + "learning_rate": 6.614482520909262e-05, + "loss": 1.623, + "step": 5503 + }, + { + "epoch": 0.4143096407534955, + "grad_norm": 5.241711616516113, + "learning_rate": 6.61332868520136e-05, + "loss": 2.1957, + "step": 5504 + }, + { + "epoch": 0.4143849150341557, + "grad_norm": 4.806895732879639, + "learning_rate": 6.612174753585697e-05, + "loss": 1.6227, + "step": 5505 + }, + { + "epoch": 0.4144601893148159, + "grad_norm": 5.966377258300781, + "learning_rate": 6.61102072613087e-05, + "loss": 1.6093, + "step": 5506 + }, + { + "epoch": 0.414535463595476, + "grad_norm": 4.479160308837891, + "learning_rate": 6.609866602905484e-05, + "loss": 1.8421, + "step": 5507 + }, + { + "epoch": 0.41461073787613617, + "grad_norm": 4.3704304695129395, + "learning_rate": 6.608712383978147e-05, + "loss": 1.7253, + "step": 5508 + }, + { + "epoch": 0.41468601215679635, + "grad_norm": 5.414557456970215, + "learning_rate": 6.607558069417476e-05, + "loss": 1.8455, + "step": 5509 + }, + { + "epoch": 0.41476128643745647, + "grad_norm": 5.060478687286377, + "learning_rate": 6.60640365929209e-05, + "loss": 1.849, + "step": 5510 + }, + { + "epoch": 0.41483656071811664, + "grad_norm": 6.708364963531494, + "learning_rate": 6.605249153670615e-05, + "loss": 1.6221, + "step": 5511 + }, + { + "epoch": 0.4149118349987768, + "grad_norm": 5.423747539520264, + "learning_rate": 6.604094552621685e-05, + "loss": 1.9274, + "step": 5512 + }, + { + "epoch": 0.41498710927943694, + "grad_norm": 5.680305480957031, + "learning_rate": 6.602939856213935e-05, + "loss": 1.9748, + "step": 5513 + }, + { + "epoch": 0.4150623835600971, + "grad_norm": 6.365753173828125, + "learning_rate": 6.601785064516011e-05, + "loss": 1.7393, + "step": 5514 + }, + { + "epoch": 0.41513765784075723, + "grad_norm": 4.265463352203369, + "learning_rate": 6.600630177596559e-05, + "loss": 1.7153, + "step": 5515 + }, + { + "epoch": 0.4152129321214174, + "grad_norm": 4.823423862457275, + "learning_rate": 6.599475195524237e-05, + "loss": 1.6959, + "step": 5516 + }, + { + "epoch": 0.4152882064020776, + "grad_norm": 4.547261714935303, + "learning_rate": 6.598320118367704e-05, + "loss": 1.7547, + "step": 5517 + }, + { + "epoch": 0.4153634806827377, + "grad_norm": 4.141543865203857, + "learning_rate": 6.597164946195627e-05, + "loss": 1.8481, + "step": 5518 + }, + { + "epoch": 0.4154387549633979, + "grad_norm": 21.588747024536133, + "learning_rate": 6.596009679076675e-05, + "loss": 1.685, + "step": 5519 + }, + { + "epoch": 0.41551402924405806, + "grad_norm": 3.7997870445251465, + "learning_rate": 6.594854317079529e-05, + "loss": 1.8551, + "step": 5520 + }, + { + "epoch": 0.4155893035247182, + "grad_norm": 5.04306173324585, + "learning_rate": 6.593698860272868e-05, + "loss": 1.9271, + "step": 5521 + }, + { + "epoch": 0.41566457780537835, + "grad_norm": 4.312346458435059, + "learning_rate": 6.592543308725383e-05, + "loss": 1.5891, + "step": 5522 + }, + { + "epoch": 0.41573985208603853, + "grad_norm": 3.8243563175201416, + "learning_rate": 6.591387662505769e-05, + "loss": 1.68, + "step": 5523 + }, + { + "epoch": 0.41581512636669865, + "grad_norm": 5.457446098327637, + "learning_rate": 6.590231921682724e-05, + "loss": 2.0408, + "step": 5524 + }, + { + "epoch": 0.4158904006473588, + "grad_norm": 4.260964393615723, + "learning_rate": 6.589076086324954e-05, + "loss": 1.723, + "step": 5525 + }, + { + "epoch": 0.41596567492801895, + "grad_norm": 11.838737487792969, + "learning_rate": 6.587920156501171e-05, + "loss": 2.1117, + "step": 5526 + }, + { + "epoch": 0.4160409492086791, + "grad_norm": 5.88382625579834, + "learning_rate": 6.58676413228009e-05, + "loss": 1.7678, + "step": 5527 + }, + { + "epoch": 0.4161162234893393, + "grad_norm": 6.338740348815918, + "learning_rate": 6.585608013730436e-05, + "loss": 1.7129, + "step": 5528 + }, + { + "epoch": 0.4161914977699994, + "grad_norm": 6.767484188079834, + "learning_rate": 6.584451800920933e-05, + "loss": 2.0666, + "step": 5529 + }, + { + "epoch": 0.4162667720506596, + "grad_norm": 5.871155738830566, + "learning_rate": 6.583295493920317e-05, + "loss": 1.811, + "step": 5530 + }, + { + "epoch": 0.41634204633131977, + "grad_norm": 5.648779392242432, + "learning_rate": 6.582139092797327e-05, + "loss": 2.0329, + "step": 5531 + }, + { + "epoch": 0.4164173206119799, + "grad_norm": 5.389750957489014, + "learning_rate": 6.580982597620709e-05, + "loss": 1.8111, + "step": 5532 + }, + { + "epoch": 0.41649259489264007, + "grad_norm": 6.9728899002075195, + "learning_rate": 6.57982600845921e-05, + "loss": 1.99, + "step": 5533 + }, + { + "epoch": 0.4165678691733002, + "grad_norm": 4.023341178894043, + "learning_rate": 6.57866932538159e-05, + "loss": 1.9943, + "step": 5534 + }, + { + "epoch": 0.41664314345396036, + "grad_norm": 4.421130180358887, + "learning_rate": 6.577512548456606e-05, + "loss": 1.8025, + "step": 5535 + }, + { + "epoch": 0.41671841773462054, + "grad_norm": 5.886690139770508, + "learning_rate": 6.576355677753028e-05, + "loss": 1.6133, + "step": 5536 + }, + { + "epoch": 0.41679369201528066, + "grad_norm": 4.669497489929199, + "learning_rate": 6.575198713339629e-05, + "loss": 1.5777, + "step": 5537 + }, + { + "epoch": 0.41686896629594083, + "grad_norm": 5.553472995758057, + "learning_rate": 6.574041655285185e-05, + "loss": 1.843, + "step": 5538 + }, + { + "epoch": 0.416944240576601, + "grad_norm": 6.0697197914123535, + "learning_rate": 6.572884503658483e-05, + "loss": 1.9854, + "step": 5539 + }, + { + "epoch": 0.41701951485726113, + "grad_norm": 6.354328155517578, + "learning_rate": 6.57172725852831e-05, + "loss": 1.4417, + "step": 5540 + }, + { + "epoch": 0.4170947891379213, + "grad_norm": 5.231034755706787, + "learning_rate": 6.57056991996346e-05, + "loss": 1.7787, + "step": 5541 + }, + { + "epoch": 0.4171700634185815, + "grad_norm": 5.058248519897461, + "learning_rate": 6.569412488032735e-05, + "loss": 1.9283, + "step": 5542 + }, + { + "epoch": 0.4172453376992416, + "grad_norm": 4.3786940574646, + "learning_rate": 6.568254962804941e-05, + "loss": 1.869, + "step": 5543 + }, + { + "epoch": 0.4173206119799018, + "grad_norm": 5.398001670837402, + "learning_rate": 6.567097344348889e-05, + "loss": 1.7065, + "step": 5544 + }, + { + "epoch": 0.4173958862605619, + "grad_norm": 5.586432456970215, + "learning_rate": 6.565939632733396e-05, + "loss": 1.9383, + "step": 5545 + }, + { + "epoch": 0.4174711605412221, + "grad_norm": 6.128942489624023, + "learning_rate": 6.564781828027285e-05, + "loss": 1.9079, + "step": 5546 + }, + { + "epoch": 0.41754643482188225, + "grad_norm": 6.857537269592285, + "learning_rate": 6.563623930299385e-05, + "loss": 2.1372, + "step": 5547 + }, + { + "epoch": 0.41762170910254237, + "grad_norm": 5.849401473999023, + "learning_rate": 6.562465939618529e-05, + "loss": 1.741, + "step": 5548 + }, + { + "epoch": 0.41769698338320255, + "grad_norm": 3.8377184867858887, + "learning_rate": 6.561307856053556e-05, + "loss": 1.8385, + "step": 5549 + }, + { + "epoch": 0.4177722576638627, + "grad_norm": 4.622677326202393, + "learning_rate": 6.56014967967331e-05, + "loss": 1.8681, + "step": 5550 + }, + { + "epoch": 0.41784753194452284, + "grad_norm": 7.1490325927734375, + "learning_rate": 6.558991410546644e-05, + "loss": 1.8109, + "step": 5551 + }, + { + "epoch": 0.417922806225183, + "grad_norm": 4.818822860717773, + "learning_rate": 6.55783304874241e-05, + "loss": 1.9789, + "step": 5552 + }, + { + "epoch": 0.41799808050584314, + "grad_norm": 8.209269523620605, + "learning_rate": 6.556674594329471e-05, + "loss": 2.0813, + "step": 5553 + }, + { + "epoch": 0.4180733547865033, + "grad_norm": 4.078526973724365, + "learning_rate": 6.555516047376696e-05, + "loss": 1.9266, + "step": 5554 + }, + { + "epoch": 0.4181486290671635, + "grad_norm": 5.389189720153809, + "learning_rate": 6.554357407952954e-05, + "loss": 1.9956, + "step": 5555 + }, + { + "epoch": 0.4182239033478236, + "grad_norm": 6.136822700500488, + "learning_rate": 6.553198676127124e-05, + "loss": 1.9567, + "step": 5556 + }, + { + "epoch": 0.4182991776284838, + "grad_norm": 4.67418909072876, + "learning_rate": 6.552039851968089e-05, + "loss": 1.9819, + "step": 5557 + }, + { + "epoch": 0.41837445190914396, + "grad_norm": 4.0010528564453125, + "learning_rate": 6.550880935544738e-05, + "loss": 1.558, + "step": 5558 + }, + { + "epoch": 0.4184497261898041, + "grad_norm": 4.692303657531738, + "learning_rate": 6.549721926925966e-05, + "loss": 1.6195, + "step": 5559 + }, + { + "epoch": 0.41852500047046426, + "grad_norm": 5.817411422729492, + "learning_rate": 6.548562826180672e-05, + "loss": 1.57, + "step": 5560 + }, + { + "epoch": 0.41860027475112443, + "grad_norm": 4.993537902832031, + "learning_rate": 6.547403633377762e-05, + "loss": 1.4889, + "step": 5561 + }, + { + "epoch": 0.41867554903178456, + "grad_norm": 7.376999855041504, + "learning_rate": 6.546244348586144e-05, + "loss": 1.6951, + "step": 5562 + }, + { + "epoch": 0.41875082331244473, + "grad_norm": 5.383813858032227, + "learning_rate": 6.545084971874738e-05, + "loss": 1.6847, + "step": 5563 + }, + { + "epoch": 0.41882609759310485, + "grad_norm": 4.478681564331055, + "learning_rate": 6.543925503312462e-05, + "loss": 1.5787, + "step": 5564 + }, + { + "epoch": 0.418901371873765, + "grad_norm": 5.5853095054626465, + "learning_rate": 6.542765942968246e-05, + "loss": 2.2442, + "step": 5565 + }, + { + "epoch": 0.4189766461544252, + "grad_norm": 5.215820789337158, + "learning_rate": 6.541606290911022e-05, + "loss": 1.7897, + "step": 5566 + }, + { + "epoch": 0.4190519204350853, + "grad_norm": 3.935844659805298, + "learning_rate": 6.540446547209727e-05, + "loss": 1.9023, + "step": 5567 + }, + { + "epoch": 0.4191271947157455, + "grad_norm": 6.1414995193481445, + "learning_rate": 6.539286711933304e-05, + "loss": 1.788, + "step": 5568 + }, + { + "epoch": 0.4192024689964057, + "grad_norm": 4.467300891876221, + "learning_rate": 6.538126785150705e-05, + "loss": 2.1221, + "step": 5569 + }, + { + "epoch": 0.4192777432770658, + "grad_norm": 7.4844465255737305, + "learning_rate": 6.536966766930879e-05, + "loss": 1.6666, + "step": 5570 + }, + { + "epoch": 0.41935301755772597, + "grad_norm": 4.857432842254639, + "learning_rate": 6.53580665734279e-05, + "loss": 2.0033, + "step": 5571 + }, + { + "epoch": 0.41942829183838615, + "grad_norm": 4.305044174194336, + "learning_rate": 6.534646456455402e-05, + "loss": 1.937, + "step": 5572 + }, + { + "epoch": 0.41950356611904627, + "grad_norm": 5.471940994262695, + "learning_rate": 6.533486164337686e-05, + "loss": 1.8116, + "step": 5573 + }, + { + "epoch": 0.41957884039970644, + "grad_norm": 5.607748985290527, + "learning_rate": 6.532325781058616e-05, + "loss": 2.2992, + "step": 5574 + }, + { + "epoch": 0.41965411468036656, + "grad_norm": 8.166545867919922, + "learning_rate": 6.531165306687177e-05, + "loss": 1.8951, + "step": 5575 + }, + { + "epoch": 0.41972938896102674, + "grad_norm": 4.563679218292236, + "learning_rate": 6.530004741292352e-05, + "loss": 1.7833, + "step": 5576 + }, + { + "epoch": 0.4198046632416869, + "grad_norm": 5.571282386779785, + "learning_rate": 6.528844084943137e-05, + "loss": 2.2633, + "step": 5577 + }, + { + "epoch": 0.41987993752234704, + "grad_norm": 4.154961109161377, + "learning_rate": 6.527683337708527e-05, + "loss": 1.7505, + "step": 5578 + }, + { + "epoch": 0.4199552118030072, + "grad_norm": 6.4554762840271, + "learning_rate": 6.526522499657526e-05, + "loss": 1.6982, + "step": 5579 + }, + { + "epoch": 0.4200304860836674, + "grad_norm": 4.554333686828613, + "learning_rate": 6.525361570859143e-05, + "loss": 1.8528, + "step": 5580 + }, + { + "epoch": 0.4201057603643275, + "grad_norm": 5.1822943687438965, + "learning_rate": 6.524200551382392e-05, + "loss": 1.8563, + "step": 5581 + }, + { + "epoch": 0.4201810346449877, + "grad_norm": 5.059783458709717, + "learning_rate": 6.523039441296289e-05, + "loss": 1.922, + "step": 5582 + }, + { + "epoch": 0.4202563089256478, + "grad_norm": 4.141258716583252, + "learning_rate": 6.521878240669863e-05, + "loss": 1.8548, + "step": 5583 + }, + { + "epoch": 0.420331583206308, + "grad_norm": 5.978043079376221, + "learning_rate": 6.520716949572143e-05, + "loss": 2.0701, + "step": 5584 + }, + { + "epoch": 0.42040685748696816, + "grad_norm": 5.271459579467773, + "learning_rate": 6.519555568072163e-05, + "loss": 1.786, + "step": 5585 + }, + { + "epoch": 0.4204821317676283, + "grad_norm": 3.97273325920105, + "learning_rate": 6.518394096238965e-05, + "loss": 2.0574, + "step": 5586 + }, + { + "epoch": 0.42055740604828845, + "grad_norm": 4.546340465545654, + "learning_rate": 6.517232534141594e-05, + "loss": 1.7852, + "step": 5587 + }, + { + "epoch": 0.4206326803289486, + "grad_norm": 4.233363151550293, + "learning_rate": 6.516070881849104e-05, + "loss": 1.8665, + "step": 5588 + }, + { + "epoch": 0.42070795460960875, + "grad_norm": 5.929393291473389, + "learning_rate": 6.514909139430549e-05, + "loss": 2.0954, + "step": 5589 + }, + { + "epoch": 0.4207832288902689, + "grad_norm": 4.584272861480713, + "learning_rate": 6.513747306954992e-05, + "loss": 1.6276, + "step": 5590 + }, + { + "epoch": 0.4208585031709291, + "grad_norm": 4.1197967529296875, + "learning_rate": 6.5125853844915e-05, + "loss": 2.1005, + "step": 5591 + }, + { + "epoch": 0.4209337774515892, + "grad_norm": 4.178314208984375, + "learning_rate": 6.511423372109149e-05, + "loss": 1.9358, + "step": 5592 + }, + { + "epoch": 0.4210090517322494, + "grad_norm": 5.186056137084961, + "learning_rate": 6.510261269877015e-05, + "loss": 1.8603, + "step": 5593 + }, + { + "epoch": 0.4210843260129095, + "grad_norm": 4.511165142059326, + "learning_rate": 6.50909907786418e-05, + "loss": 2.027, + "step": 5594 + }, + { + "epoch": 0.4211596002935697, + "grad_norm": 5.030954837799072, + "learning_rate": 6.507936796139737e-05, + "loss": 1.8356, + "step": 5595 + }, + { + "epoch": 0.42123487457422987, + "grad_norm": 5.619388103485107, + "learning_rate": 6.506774424772778e-05, + "loss": 1.9346, + "step": 5596 + }, + { + "epoch": 0.42131014885489, + "grad_norm": 5.624815940856934, + "learning_rate": 6.505611963832404e-05, + "loss": 2.3075, + "step": 5597 + }, + { + "epoch": 0.42138542313555016, + "grad_norm": 5.626185417175293, + "learning_rate": 6.504449413387715e-05, + "loss": 1.9424, + "step": 5598 + }, + { + "epoch": 0.42146069741621034, + "grad_norm": 3.945868730545044, + "learning_rate": 6.503286773507829e-05, + "loss": 1.7481, + "step": 5599 + }, + { + "epoch": 0.42153597169687046, + "grad_norm": 4.337491512298584, + "learning_rate": 6.502124044261855e-05, + "loss": 2.2041, + "step": 5600 + }, + { + "epoch": 0.42161124597753064, + "grad_norm": 4.975497245788574, + "learning_rate": 6.500961225718918e-05, + "loss": 1.5947, + "step": 5601 + }, + { + "epoch": 0.42168652025819076, + "grad_norm": 7.500434875488281, + "learning_rate": 6.499798317948143e-05, + "loss": 2.4515, + "step": 5602 + }, + { + "epoch": 0.42176179453885093, + "grad_norm": 6.894420146942139, + "learning_rate": 6.498635321018661e-05, + "loss": 2.0541, + "step": 5603 + }, + { + "epoch": 0.4218370688195111, + "grad_norm": 4.910295486450195, + "learning_rate": 6.497472234999608e-05, + "loss": 2.0103, + "step": 5604 + }, + { + "epoch": 0.4219123431001712, + "grad_norm": 9.935248374938965, + "learning_rate": 6.49630905996013e-05, + "loss": 1.4736, + "step": 5605 + }, + { + "epoch": 0.4219876173808314, + "grad_norm": 4.332150459289551, + "learning_rate": 6.495145795969371e-05, + "loss": 1.6238, + "step": 5606 + }, + { + "epoch": 0.4220628916614916, + "grad_norm": 8.34221076965332, + "learning_rate": 6.493982443096484e-05, + "loss": 2.1742, + "step": 5607 + }, + { + "epoch": 0.4221381659421517, + "grad_norm": 8.279303550720215, + "learning_rate": 6.492819001410627e-05, + "loss": 1.8716, + "step": 5608 + }, + { + "epoch": 0.4222134402228119, + "grad_norm": 5.307768821716309, + "learning_rate": 6.491655470980963e-05, + "loss": 2.1383, + "step": 5609 + }, + { + "epoch": 0.42228871450347205, + "grad_norm": 8.174683570861816, + "learning_rate": 6.490491851876663e-05, + "loss": 2.3397, + "step": 5610 + }, + { + "epoch": 0.42236398878413217, + "grad_norm": 7.190784931182861, + "learning_rate": 6.489328144166899e-05, + "loss": 1.5842, + "step": 5611 + }, + { + "epoch": 0.42243926306479235, + "grad_norm": 5.019043922424316, + "learning_rate": 6.48816434792085e-05, + "loss": 2.1723, + "step": 5612 + }, + { + "epoch": 0.42251453734545247, + "grad_norm": 5.491205215454102, + "learning_rate": 6.4870004632077e-05, + "loss": 1.8924, + "step": 5613 + }, + { + "epoch": 0.42258981162611264, + "grad_norm": 4.574029922485352, + "learning_rate": 6.48583649009664e-05, + "loss": 2.053, + "step": 5614 + }, + { + "epoch": 0.4226650859067728, + "grad_norm": 5.151193618774414, + "learning_rate": 6.484672428656862e-05, + "loss": 2.2426, + "step": 5615 + }, + { + "epoch": 0.42274036018743294, + "grad_norm": 6.66817569732666, + "learning_rate": 6.48350827895757e-05, + "loss": 1.7347, + "step": 5616 + }, + { + "epoch": 0.4228156344680931, + "grad_norm": 5.294521808624268, + "learning_rate": 6.482344041067969e-05, + "loss": 1.6096, + "step": 5617 + }, + { + "epoch": 0.4228909087487533, + "grad_norm": 7.428396224975586, + "learning_rate": 6.481179715057266e-05, + "loss": 1.7049, + "step": 5618 + }, + { + "epoch": 0.4229661830294134, + "grad_norm": 10.264471054077148, + "learning_rate": 6.480015300994678e-05, + "loss": 2.3491, + "step": 5619 + }, + { + "epoch": 0.4230414573100736, + "grad_norm": 4.98459529876709, + "learning_rate": 6.478850798949428e-05, + "loss": 1.9573, + "step": 5620 + }, + { + "epoch": 0.42311673159073376, + "grad_norm": 7.347560405731201, + "learning_rate": 6.477686208990741e-05, + "loss": 1.9452, + "step": 5621 + }, + { + "epoch": 0.4231920058713939, + "grad_norm": 5.4554009437561035, + "learning_rate": 6.47652153118785e-05, + "loss": 2.0807, + "step": 5622 + }, + { + "epoch": 0.42326728015205406, + "grad_norm": 6.793489933013916, + "learning_rate": 6.47535676560999e-05, + "loss": 1.8994, + "step": 5623 + }, + { + "epoch": 0.4233425544327142, + "grad_norm": 5.506731986999512, + "learning_rate": 6.474191912326404e-05, + "loss": 2.1581, + "step": 5624 + }, + { + "epoch": 0.42341782871337436, + "grad_norm": 6.161520481109619, + "learning_rate": 6.473026971406338e-05, + "loss": 2.375, + "step": 5625 + }, + { + "epoch": 0.42349310299403453, + "grad_norm": 3.9654464721679688, + "learning_rate": 6.471861942919043e-05, + "loss": 2.0113, + "step": 5626 + }, + { + "epoch": 0.42356837727469465, + "grad_norm": 4.8513665199279785, + "learning_rate": 6.470696826933782e-05, + "loss": 1.7888, + "step": 5627 + }, + { + "epoch": 0.4236436515553548, + "grad_norm": 7.403541088104248, + "learning_rate": 6.469531623519813e-05, + "loss": 2.1294, + "step": 5628 + }, + { + "epoch": 0.423718925836015, + "grad_norm": 4.857777118682861, + "learning_rate": 6.468366332746406e-05, + "loss": 1.7992, + "step": 5629 + }, + { + "epoch": 0.4237942001166751, + "grad_norm": 3.1128454208374023, + "learning_rate": 6.467200954682835e-05, + "loss": 1.6736, + "step": 5630 + }, + { + "epoch": 0.4238694743973353, + "grad_norm": 6.336752891540527, + "learning_rate": 6.466035489398374e-05, + "loss": 2.0828, + "step": 5631 + }, + { + "epoch": 0.4239447486779954, + "grad_norm": 5.065637588500977, + "learning_rate": 6.464869936962312e-05, + "loss": 2.1352, + "step": 5632 + }, + { + "epoch": 0.4240200229586556, + "grad_norm": 5.7641448974609375, + "learning_rate": 6.463704297443935e-05, + "loss": 1.3988, + "step": 5633 + }, + { + "epoch": 0.42409529723931577, + "grad_norm": 4.519149303436279, + "learning_rate": 6.462538570912539e-05, + "loss": 1.8315, + "step": 5634 + }, + { + "epoch": 0.4241705715199759, + "grad_norm": 6.273179054260254, + "learning_rate": 6.461372757437419e-05, + "loss": 1.8832, + "step": 5635 + }, + { + "epoch": 0.42424584580063607, + "grad_norm": 6.0949625968933105, + "learning_rate": 6.460206857087882e-05, + "loss": 1.6997, + "step": 5636 + }, + { + "epoch": 0.42432112008129624, + "grad_norm": 5.529043674468994, + "learning_rate": 6.459040869933238e-05, + "loss": 2.0176, + "step": 5637 + }, + { + "epoch": 0.42439639436195636, + "grad_norm": 5.034639358520508, + "learning_rate": 6.457874796042801e-05, + "loss": 1.8934, + "step": 5638 + }, + { + "epoch": 0.42447166864261654, + "grad_norm": 6.901305198669434, + "learning_rate": 6.45670863548589e-05, + "loss": 1.6252, + "step": 5639 + }, + { + "epoch": 0.4245469429232767, + "grad_norm": 7.03779411315918, + "learning_rate": 6.45554238833183e-05, + "loss": 1.8786, + "step": 5640 + }, + { + "epoch": 0.42462221720393684, + "grad_norm": 5.553764343261719, + "learning_rate": 6.454376054649951e-05, + "loss": 1.8197, + "step": 5641 + }, + { + "epoch": 0.424697491484597, + "grad_norm": 10.932304382324219, + "learning_rate": 6.45320963450959e-05, + "loss": 1.6143, + "step": 5642 + }, + { + "epoch": 0.42477276576525713, + "grad_norm": 9.545199394226074, + "learning_rate": 6.452043127980084e-05, + "loss": 2.4248, + "step": 5643 + }, + { + "epoch": 0.4248480400459173, + "grad_norm": 6.505753517150879, + "learning_rate": 6.450876535130782e-05, + "loss": 1.7174, + "step": 5644 + }, + { + "epoch": 0.4249233143265775, + "grad_norm": 5.837764739990234, + "learning_rate": 6.449709856031033e-05, + "loss": 2.0099, + "step": 5645 + }, + { + "epoch": 0.4249985886072376, + "grad_norm": 4.977766036987305, + "learning_rate": 6.448543090750193e-05, + "loss": 1.9528, + "step": 5646 + }, + { + "epoch": 0.4250738628878978, + "grad_norm": 4.035558223724365, + "learning_rate": 6.447376239357623e-05, + "loss": 1.9382, + "step": 5647 + }, + { + "epoch": 0.42514913716855796, + "grad_norm": 6.528450012207031, + "learning_rate": 6.446209301922686e-05, + "loss": 2.0046, + "step": 5648 + }, + { + "epoch": 0.4252244114492181, + "grad_norm": 4.8118109703063965, + "learning_rate": 6.445042278514758e-05, + "loss": 2.0711, + "step": 5649 + }, + { + "epoch": 0.42529968572987825, + "grad_norm": 5.4829888343811035, + "learning_rate": 6.443875169203213e-05, + "loss": 1.9214, + "step": 5650 + }, + { + "epoch": 0.42537496001053837, + "grad_norm": 3.779977321624756, + "learning_rate": 6.442707974057432e-05, + "loss": 1.7036, + "step": 5651 + }, + { + "epoch": 0.42545023429119855, + "grad_norm": 6.238905429840088, + "learning_rate": 6.441540693146799e-05, + "loss": 1.9936, + "step": 5652 + }, + { + "epoch": 0.4255255085718587, + "grad_norm": 6.67221212387085, + "learning_rate": 6.440373326540709e-05, + "loss": 2.3296, + "step": 5653 + }, + { + "epoch": 0.42560078285251884, + "grad_norm": 4.252877235412598, + "learning_rate": 6.43920587430856e-05, + "loss": 1.6862, + "step": 5654 + }, + { + "epoch": 0.425676057133179, + "grad_norm": 5.723069667816162, + "learning_rate": 6.438038336519749e-05, + "loss": 1.8604, + "step": 5655 + }, + { + "epoch": 0.4257513314138392, + "grad_norm": 6.1414594650268555, + "learning_rate": 6.436870713243687e-05, + "loss": 2.092, + "step": 5656 + }, + { + "epoch": 0.4258266056944993, + "grad_norm": 4.658812522888184, + "learning_rate": 6.435703004549782e-05, + "loss": 1.7544, + "step": 5657 + }, + { + "epoch": 0.4259018799751595, + "grad_norm": 5.491199016571045, + "learning_rate": 6.434535210507453e-05, + "loss": 2.0663, + "step": 5658 + }, + { + "epoch": 0.42597715425581967, + "grad_norm": 5.373559474945068, + "learning_rate": 6.433367331186122e-05, + "loss": 1.861, + "step": 5659 + }, + { + "epoch": 0.4260524285364798, + "grad_norm": 5.769242286682129, + "learning_rate": 6.432199366655217e-05, + "loss": 1.7569, + "step": 5660 + }, + { + "epoch": 0.42612770281713996, + "grad_norm": 4.196788787841797, + "learning_rate": 6.431031316984169e-05, + "loss": 1.6403, + "step": 5661 + }, + { + "epoch": 0.4262029770978001, + "grad_norm": 5.87326192855835, + "learning_rate": 6.429863182242415e-05, + "loss": 1.7997, + "step": 5662 + }, + { + "epoch": 0.42627825137846026, + "grad_norm": 6.26662015914917, + "learning_rate": 6.428694962499397e-05, + "loss": 1.829, + "step": 5663 + }, + { + "epoch": 0.42635352565912044, + "grad_norm": 6.6116814613342285, + "learning_rate": 6.427526657824563e-05, + "loss": 1.9503, + "step": 5664 + }, + { + "epoch": 0.42642879993978056, + "grad_norm": 5.3168253898620605, + "learning_rate": 6.426358268287366e-05, + "loss": 1.8199, + "step": 5665 + }, + { + "epoch": 0.42650407422044073, + "grad_norm": 4.946347236633301, + "learning_rate": 6.425189793957262e-05, + "loss": 2.0032, + "step": 5666 + }, + { + "epoch": 0.4265793485011009, + "grad_norm": 5.277747631072998, + "learning_rate": 6.424021234903714e-05, + "loss": 1.9961, + "step": 5667 + }, + { + "epoch": 0.42665462278176103, + "grad_norm": 6.945952415466309, + "learning_rate": 6.422852591196191e-05, + "loss": 1.7117, + "step": 5668 + }, + { + "epoch": 0.4267298970624212, + "grad_norm": 5.881615161895752, + "learning_rate": 6.421683862904163e-05, + "loss": 2.078, + "step": 5669 + }, + { + "epoch": 0.4268051713430814, + "grad_norm": 4.9390950202941895, + "learning_rate": 6.420515050097107e-05, + "loss": 1.7596, + "step": 5670 + }, + { + "epoch": 0.4268804456237415, + "grad_norm": 4.106928825378418, + "learning_rate": 6.41934615284451e-05, + "loss": 1.7451, + "step": 5671 + }, + { + "epoch": 0.4269557199044017, + "grad_norm": 4.78021764755249, + "learning_rate": 6.418177171215856e-05, + "loss": 1.5292, + "step": 5672 + }, + { + "epoch": 0.4270309941850618, + "grad_norm": 4.843875408172607, + "learning_rate": 6.417008105280637e-05, + "loss": 1.9038, + "step": 5673 + }, + { + "epoch": 0.42710626846572197, + "grad_norm": 5.9004011154174805, + "learning_rate": 6.415838955108353e-05, + "loss": 2.251, + "step": 5674 + }, + { + "epoch": 0.42718154274638215, + "grad_norm": 6.0372395515441895, + "learning_rate": 6.414669720768505e-05, + "loss": 1.9268, + "step": 5675 + }, + { + "epoch": 0.42725681702704227, + "grad_norm": 7.231194019317627, + "learning_rate": 6.413500402330602e-05, + "loss": 2.0608, + "step": 5676 + }, + { + "epoch": 0.42733209130770244, + "grad_norm": 3.8425676822662354, + "learning_rate": 6.412330999864155e-05, + "loss": 1.8284, + "step": 5677 + }, + { + "epoch": 0.4274073655883626, + "grad_norm": 4.921058654785156, + "learning_rate": 6.411161513438684e-05, + "loss": 1.5976, + "step": 5678 + }, + { + "epoch": 0.42748263986902274, + "grad_norm": 4.734769344329834, + "learning_rate": 6.409991943123709e-05, + "loss": 1.951, + "step": 5679 + }, + { + "epoch": 0.4275579141496829, + "grad_norm": 4.920373916625977, + "learning_rate": 6.408822288988758e-05, + "loss": 1.7672, + "step": 5680 + }, + { + "epoch": 0.42763318843034304, + "grad_norm": 6.122605323791504, + "learning_rate": 6.407652551103364e-05, + "loss": 1.8711, + "step": 5681 + }, + { + "epoch": 0.4277084627110032, + "grad_norm": 5.102845191955566, + "learning_rate": 6.406482729537068e-05, + "loss": 1.9579, + "step": 5682 + }, + { + "epoch": 0.4277837369916634, + "grad_norm": 7.131606578826904, + "learning_rate": 6.405312824359408e-05, + "loss": 1.6788, + "step": 5683 + }, + { + "epoch": 0.4278590112723235, + "grad_norm": 5.223738193511963, + "learning_rate": 6.404142835639931e-05, + "loss": 1.9324, + "step": 5684 + }, + { + "epoch": 0.4279342855529837, + "grad_norm": 5.89055871963501, + "learning_rate": 6.402972763448194e-05, + "loss": 1.822, + "step": 5685 + }, + { + "epoch": 0.42800955983364386, + "grad_norm": 4.947607517242432, + "learning_rate": 6.401802607853749e-05, + "loss": 1.7063, + "step": 5686 + }, + { + "epoch": 0.428084834114304, + "grad_norm": 4.10233736038208, + "learning_rate": 6.400632368926163e-05, + "loss": 1.7739, + "step": 5687 + }, + { + "epoch": 0.42816010839496416, + "grad_norm": 4.465114593505859, + "learning_rate": 6.399462046735001e-05, + "loss": 1.8599, + "step": 5688 + }, + { + "epoch": 0.42823538267562433, + "grad_norm": 4.857585906982422, + "learning_rate": 6.398291641349836e-05, + "loss": 2.0028, + "step": 5689 + }, + { + "epoch": 0.42831065695628445, + "grad_norm": 4.200544834136963, + "learning_rate": 6.397121152840245e-05, + "loss": 1.8917, + "step": 5690 + }, + { + "epoch": 0.42838593123694463, + "grad_norm": 7.14201021194458, + "learning_rate": 6.395950581275811e-05, + "loss": 2.1933, + "step": 5691 + }, + { + "epoch": 0.42846120551760475, + "grad_norm": 4.739650726318359, + "learning_rate": 6.394779926726117e-05, + "loss": 1.982, + "step": 5692 + }, + { + "epoch": 0.4285364797982649, + "grad_norm": 7.241650104522705, + "learning_rate": 6.393609189260762e-05, + "loss": 1.8229, + "step": 5693 + }, + { + "epoch": 0.4286117540789251, + "grad_norm": 4.344841480255127, + "learning_rate": 6.392438368949338e-05, + "loss": 1.6806, + "step": 5694 + }, + { + "epoch": 0.4286870283595852, + "grad_norm": 3.6050384044647217, + "learning_rate": 6.39126746586145e-05, + "loss": 2.2344, + "step": 5695 + }, + { + "epoch": 0.4287623026402454, + "grad_norm": 5.378622531890869, + "learning_rate": 6.390096480066703e-05, + "loss": 1.8744, + "step": 5696 + }, + { + "epoch": 0.42883757692090557, + "grad_norm": 4.53015661239624, + "learning_rate": 6.388925411634708e-05, + "loss": 2.2534, + "step": 5697 + }, + { + "epoch": 0.4289128512015657, + "grad_norm": 4.165298938751221, + "learning_rate": 6.387754260635081e-05, + "loss": 1.4701, + "step": 5698 + }, + { + "epoch": 0.42898812548222587, + "grad_norm": 4.665646553039551, + "learning_rate": 6.386583027137447e-05, + "loss": 1.9028, + "step": 5699 + }, + { + "epoch": 0.429063399762886, + "grad_norm": 4.572549343109131, + "learning_rate": 6.385411711211429e-05, + "loss": 1.9894, + "step": 5700 + }, + { + "epoch": 0.42913867404354616, + "grad_norm": 4.324965476989746, + "learning_rate": 6.384240312926663e-05, + "loss": 1.7791, + "step": 5701 + }, + { + "epoch": 0.42921394832420634, + "grad_norm": 7.79465389251709, + "learning_rate": 6.38306883235278e-05, + "loss": 2.2173, + "step": 5702 + }, + { + "epoch": 0.42928922260486646, + "grad_norm": 5.260470867156982, + "learning_rate": 6.381897269559423e-05, + "loss": 1.9281, + "step": 5703 + }, + { + "epoch": 0.42936449688552664, + "grad_norm": 3.7025718688964844, + "learning_rate": 6.38072562461624e-05, + "loss": 1.6125, + "step": 5704 + }, + { + "epoch": 0.4294397711661868, + "grad_norm": 4.8538899421691895, + "learning_rate": 6.37955389759288e-05, + "loss": 1.7189, + "step": 5705 + }, + { + "epoch": 0.42951504544684693, + "grad_norm": 4.803221702575684, + "learning_rate": 6.378382088558998e-05, + "loss": 1.7207, + "step": 5706 + }, + { + "epoch": 0.4295903197275071, + "grad_norm": 4.937625885009766, + "learning_rate": 6.377210197584256e-05, + "loss": 1.8523, + "step": 5707 + }, + { + "epoch": 0.4296655940081673, + "grad_norm": 4.236814498901367, + "learning_rate": 6.37603822473832e-05, + "loss": 1.9289, + "step": 5708 + }, + { + "epoch": 0.4297408682888274, + "grad_norm": 5.121965408325195, + "learning_rate": 6.374866170090858e-05, + "loss": 1.4851, + "step": 5709 + }, + { + "epoch": 0.4298161425694876, + "grad_norm": 5.451215744018555, + "learning_rate": 6.373694033711551e-05, + "loss": 2.0166, + "step": 5710 + }, + { + "epoch": 0.4298914168501477, + "grad_norm": 6.190569877624512, + "learning_rate": 6.372521815670072e-05, + "loss": 1.9272, + "step": 5711 + }, + { + "epoch": 0.4299666911308079, + "grad_norm": 3.8366334438323975, + "learning_rate": 6.371349516036111e-05, + "loss": 1.9214, + "step": 5712 + }, + { + "epoch": 0.43004196541146805, + "grad_norm": 5.434711456298828, + "learning_rate": 6.370177134879356e-05, + "loss": 1.9969, + "step": 5713 + }, + { + "epoch": 0.4301172396921282, + "grad_norm": 5.506935119628906, + "learning_rate": 6.3690046722695e-05, + "loss": 1.8554, + "step": 5714 + }, + { + "epoch": 0.43019251397278835, + "grad_norm": 5.937681674957275, + "learning_rate": 6.367832128276245e-05, + "loss": 1.6939, + "step": 5715 + }, + { + "epoch": 0.4302677882534485, + "grad_norm": 5.294760227203369, + "learning_rate": 6.366659502969297e-05, + "loss": 1.8444, + "step": 5716 + }, + { + "epoch": 0.43034306253410864, + "grad_norm": 6.36480712890625, + "learning_rate": 6.365486796418361e-05, + "loss": 1.7454, + "step": 5717 + }, + { + "epoch": 0.4304183368147688, + "grad_norm": 5.095760822296143, + "learning_rate": 6.364314008693154e-05, + "loss": 1.8437, + "step": 5718 + }, + { + "epoch": 0.430493611095429, + "grad_norm": 4.3236308097839355, + "learning_rate": 6.363141139863394e-05, + "loss": 1.9209, + "step": 5719 + }, + { + "epoch": 0.4305688853760891, + "grad_norm": 4.875301361083984, + "learning_rate": 6.361968189998802e-05, + "loss": 1.6004, + "step": 5720 + }, + { + "epoch": 0.4306441596567493, + "grad_norm": 3.9568426609039307, + "learning_rate": 6.360795159169111e-05, + "loss": 1.6029, + "step": 5721 + }, + { + "epoch": 0.4307194339374094, + "grad_norm": 5.453303337097168, + "learning_rate": 6.359622047444052e-05, + "loss": 1.9702, + "step": 5722 + }, + { + "epoch": 0.4307947082180696, + "grad_norm": 4.648316860198975, + "learning_rate": 6.358448854893363e-05, + "loss": 2.2022, + "step": 5723 + }, + { + "epoch": 0.43086998249872976, + "grad_norm": 8.667400360107422, + "learning_rate": 6.357275581586788e-05, + "loss": 1.9823, + "step": 5724 + }, + { + "epoch": 0.4309452567793899, + "grad_norm": 5.290044784545898, + "learning_rate": 6.356102227594075e-05, + "loss": 2.0993, + "step": 5725 + }, + { + "epoch": 0.43102053106005006, + "grad_norm": 5.3776655197143555, + "learning_rate": 6.354928792984973e-05, + "loss": 1.6284, + "step": 5726 + }, + { + "epoch": 0.43109580534071024, + "grad_norm": 4.132922649383545, + "learning_rate": 6.353755277829244e-05, + "loss": 1.7455, + "step": 5727 + }, + { + "epoch": 0.43117107962137036, + "grad_norm": 4.958800315856934, + "learning_rate": 6.352581682196648e-05, + "loss": 1.9029, + "step": 5728 + }, + { + "epoch": 0.43124635390203053, + "grad_norm": 5.599297523498535, + "learning_rate": 6.351408006156953e-05, + "loss": 1.5205, + "step": 5729 + }, + { + "epoch": 0.43132162818269065, + "grad_norm": 4.568748474121094, + "learning_rate": 6.350234249779929e-05, + "loss": 2.1351, + "step": 5730 + }, + { + "epoch": 0.43139690246335083, + "grad_norm": 4.051480293273926, + "learning_rate": 6.349060413135352e-05, + "loss": 1.7804, + "step": 5731 + }, + { + "epoch": 0.431472176744011, + "grad_norm": 5.410736560821533, + "learning_rate": 6.347886496293003e-05, + "loss": 2.1364, + "step": 5732 + }, + { + "epoch": 0.4315474510246711, + "grad_norm": 3.8320963382720947, + "learning_rate": 6.346712499322673e-05, + "loss": 2.1019, + "step": 5733 + }, + { + "epoch": 0.4316227253053313, + "grad_norm": 4.344722270965576, + "learning_rate": 6.345538422294148e-05, + "loss": 1.8742, + "step": 5734 + }, + { + "epoch": 0.4316979995859915, + "grad_norm": 6.909162998199463, + "learning_rate": 6.344364265277223e-05, + "loss": 2.1468, + "step": 5735 + }, + { + "epoch": 0.4317732738666516, + "grad_norm": 4.958521842956543, + "learning_rate": 6.3431900283417e-05, + "loss": 2.045, + "step": 5736 + }, + { + "epoch": 0.4318485481473118, + "grad_norm": 4.4357008934021, + "learning_rate": 6.342015711557385e-05, + "loss": 1.8831, + "step": 5737 + }, + { + "epoch": 0.43192382242797195, + "grad_norm": 4.679714202880859, + "learning_rate": 6.340841314994085e-05, + "loss": 2.1784, + "step": 5738 + }, + { + "epoch": 0.43199909670863207, + "grad_norm": 6.914356708526611, + "learning_rate": 6.339666838721618e-05, + "loss": 1.9508, + "step": 5739 + }, + { + "epoch": 0.43207437098929224, + "grad_norm": 4.594394207000732, + "learning_rate": 6.3384922828098e-05, + "loss": 1.6421, + "step": 5740 + }, + { + "epoch": 0.43214964526995236, + "grad_norm": 4.878486633300781, + "learning_rate": 6.337317647328458e-05, + "loss": 1.9852, + "step": 5741 + }, + { + "epoch": 0.43222491955061254, + "grad_norm": 4.727959632873535, + "learning_rate": 6.336142932347417e-05, + "loss": 1.8542, + "step": 5742 + }, + { + "epoch": 0.4323001938312727, + "grad_norm": 4.930661678314209, + "learning_rate": 6.334968137936514e-05, + "loss": 2.3205, + "step": 5743 + }, + { + "epoch": 0.43237546811193284, + "grad_norm": 4.600576877593994, + "learning_rate": 6.333793264165586e-05, + "loss": 1.6141, + "step": 5744 + }, + { + "epoch": 0.432450742392593, + "grad_norm": 4.216053485870361, + "learning_rate": 6.332618311104474e-05, + "loss": 1.6323, + "step": 5745 + }, + { + "epoch": 0.4325260166732532, + "grad_norm": 4.389832496643066, + "learning_rate": 6.331443278823029e-05, + "loss": 1.9289, + "step": 5746 + }, + { + "epoch": 0.4326012909539133, + "grad_norm": 6.9350409507751465, + "learning_rate": 6.3302681673911e-05, + "loss": 2.1289, + "step": 5747 + }, + { + "epoch": 0.4326765652345735, + "grad_norm": 4.836463451385498, + "learning_rate": 6.329092976878545e-05, + "loss": 1.646, + "step": 5748 + }, + { + "epoch": 0.4327518395152336, + "grad_norm": 5.18254280090332, + "learning_rate": 6.327917707355227e-05, + "loss": 2.3174, + "step": 5749 + }, + { + "epoch": 0.4328271137958938, + "grad_norm": 5.050939083099365, + "learning_rate": 6.326742358891011e-05, + "loss": 1.9059, + "step": 5750 + }, + { + "epoch": 0.43290238807655396, + "grad_norm": 5.284121990203857, + "learning_rate": 6.32556693155577e-05, + "loss": 1.9391, + "step": 5751 + }, + { + "epoch": 0.4329776623572141, + "grad_norm": 5.758852481842041, + "learning_rate": 6.324391425419379e-05, + "loss": 1.9554, + "step": 5752 + }, + { + "epoch": 0.43305293663787425, + "grad_norm": 4.930042266845703, + "learning_rate": 6.323215840551714e-05, + "loss": 1.9471, + "step": 5753 + }, + { + "epoch": 0.43312821091853443, + "grad_norm": 7.320718765258789, + "learning_rate": 6.322040177022666e-05, + "loss": 2.2778, + "step": 5754 + }, + { + "epoch": 0.43320348519919455, + "grad_norm": 5.494884490966797, + "learning_rate": 6.320864434902122e-05, + "loss": 1.7299, + "step": 5755 + }, + { + "epoch": 0.4332787594798547, + "grad_norm": 6.235262870788574, + "learning_rate": 6.319688614259979e-05, + "loss": 2.2201, + "step": 5756 + }, + { + "epoch": 0.4333540337605149, + "grad_norm": 5.126071453094482, + "learning_rate": 6.318512715166135e-05, + "loss": 1.9133, + "step": 5757 + }, + { + "epoch": 0.433429308041175, + "grad_norm": 5.697904109954834, + "learning_rate": 6.317336737690493e-05, + "loss": 2.0646, + "step": 5758 + }, + { + "epoch": 0.4335045823218352, + "grad_norm": 3.8304848670959473, + "learning_rate": 6.31616068190296e-05, + "loss": 1.6833, + "step": 5759 + }, + { + "epoch": 0.4335798566024953, + "grad_norm": 5.505464553833008, + "learning_rate": 6.314984547873451e-05, + "loss": 2.0852, + "step": 5760 + }, + { + "epoch": 0.4336551308831555, + "grad_norm": 4.02780818939209, + "learning_rate": 6.313808335671886e-05, + "loss": 1.7287, + "step": 5761 + }, + { + "epoch": 0.43373040516381567, + "grad_norm": 4.194763660430908, + "learning_rate": 6.312632045368184e-05, + "loss": 1.5593, + "step": 5762 + }, + { + "epoch": 0.4338056794444758, + "grad_norm": 5.325915813446045, + "learning_rate": 6.311455677032274e-05, + "loss": 1.7176, + "step": 5763 + }, + { + "epoch": 0.43388095372513596, + "grad_norm": 5.318713188171387, + "learning_rate": 6.310279230734084e-05, + "loss": 1.9114, + "step": 5764 + }, + { + "epoch": 0.43395622800579614, + "grad_norm": 4.497608184814453, + "learning_rate": 6.309102706543556e-05, + "loss": 1.9409, + "step": 5765 + }, + { + "epoch": 0.43403150228645626, + "grad_norm": 4.352036476135254, + "learning_rate": 6.307926104530629e-05, + "loss": 1.9054, + "step": 5766 + }, + { + "epoch": 0.43410677656711644, + "grad_norm": 3.287715435028076, + "learning_rate": 6.306749424765246e-05, + "loss": 1.8715, + "step": 5767 + }, + { + "epoch": 0.4341820508477766, + "grad_norm": 6.548951148986816, + "learning_rate": 6.30557266731736e-05, + "loss": 2.27, + "step": 5768 + }, + { + "epoch": 0.43425732512843673, + "grad_norm": 5.26406192779541, + "learning_rate": 6.304395832256925e-05, + "loss": 1.6347, + "step": 5769 + }, + { + "epoch": 0.4343325994090969, + "grad_norm": 4.818812370300293, + "learning_rate": 6.3032189196539e-05, + "loss": 1.7376, + "step": 5770 + }, + { + "epoch": 0.43440787368975703, + "grad_norm": 3.4784011840820312, + "learning_rate": 6.30204192957825e-05, + "loss": 2.0994, + "step": 5771 + }, + { + "epoch": 0.4344831479704172, + "grad_norm": 5.408201217651367, + "learning_rate": 6.300864862099943e-05, + "loss": 2.0886, + "step": 5772 + }, + { + "epoch": 0.4345584222510774, + "grad_norm": 6.009886741638184, + "learning_rate": 6.299687717288953e-05, + "loss": 2.0543, + "step": 5773 + }, + { + "epoch": 0.4346336965317375, + "grad_norm": 5.662917613983154, + "learning_rate": 6.298510495215258e-05, + "loss": 1.7531, + "step": 5774 + }, + { + "epoch": 0.4347089708123977, + "grad_norm": 4.763823986053467, + "learning_rate": 6.29733319594884e-05, + "loss": 2.2657, + "step": 5775 + }, + { + "epoch": 0.43478424509305785, + "grad_norm": 5.582851409912109, + "learning_rate": 6.296155819559684e-05, + "loss": 1.7746, + "step": 5776 + }, + { + "epoch": 0.434859519373718, + "grad_norm": 6.4988274574279785, + "learning_rate": 6.294978366117786e-05, + "loss": 1.9416, + "step": 5777 + }, + { + "epoch": 0.43493479365437815, + "grad_norm": 6.25943660736084, + "learning_rate": 6.293800835693137e-05, + "loss": 1.7444, + "step": 5778 + }, + { + "epoch": 0.43501006793503827, + "grad_norm": 5.957331657409668, + "learning_rate": 6.292623228355743e-05, + "loss": 1.8579, + "step": 5779 + }, + { + "epoch": 0.43508534221569845, + "grad_norm": 5.184590816497803, + "learning_rate": 6.291445544175607e-05, + "loss": 1.6959, + "step": 5780 + }, + { + "epoch": 0.4351606164963586, + "grad_norm": 5.2324652671813965, + "learning_rate": 6.290267783222737e-05, + "loss": 1.8863, + "step": 5781 + }, + { + "epoch": 0.43523589077701874, + "grad_norm": 7.171288967132568, + "learning_rate": 6.289089945567151e-05, + "loss": 1.9448, + "step": 5782 + }, + { + "epoch": 0.4353111650576789, + "grad_norm": 4.723790168762207, + "learning_rate": 6.287912031278868e-05, + "loss": 1.8153, + "step": 5783 + }, + { + "epoch": 0.4353864393383391, + "grad_norm": 4.438058376312256, + "learning_rate": 6.286734040427908e-05, + "loss": 1.8438, + "step": 5784 + }, + { + "epoch": 0.4354617136189992, + "grad_norm": 4.053075313568115, + "learning_rate": 6.285555973084302e-05, + "loss": 1.491, + "step": 5785 + }, + { + "epoch": 0.4355369878996594, + "grad_norm": 6.447706699371338, + "learning_rate": 6.28437782931808e-05, + "loss": 1.5686, + "step": 5786 + }, + { + "epoch": 0.43561226218031956, + "grad_norm": 7.388658046722412, + "learning_rate": 6.283199609199285e-05, + "loss": 1.8914, + "step": 5787 + }, + { + "epoch": 0.4356875364609797, + "grad_norm": 7.218966484069824, + "learning_rate": 6.282021312797952e-05, + "loss": 1.9529, + "step": 5788 + }, + { + "epoch": 0.43576281074163986, + "grad_norm": 8.045075416564941, + "learning_rate": 6.280842940184133e-05, + "loss": 1.69, + "step": 5789 + }, + { + "epoch": 0.4358380850223, + "grad_norm": 5.286870956420898, + "learning_rate": 6.279664491427875e-05, + "loss": 1.9768, + "step": 5790 + }, + { + "epoch": 0.43591335930296016, + "grad_norm": 4.490273475646973, + "learning_rate": 6.278485966599235e-05, + "loss": 1.8576, + "step": 5791 + }, + { + "epoch": 0.43598863358362033, + "grad_norm": 7.251227378845215, + "learning_rate": 6.277307365768273e-05, + "loss": 1.5268, + "step": 5792 + }, + { + "epoch": 0.43606390786428045, + "grad_norm": 4.111464023590088, + "learning_rate": 6.276128689005053e-05, + "loss": 1.7285, + "step": 5793 + }, + { + "epoch": 0.43613918214494063, + "grad_norm": 5.580585479736328, + "learning_rate": 6.274949936379644e-05, + "loss": 1.8677, + "step": 5794 + }, + { + "epoch": 0.4362144564256008, + "grad_norm": 5.235788345336914, + "learning_rate": 6.273771107962119e-05, + "loss": 2.1583, + "step": 5795 + }, + { + "epoch": 0.4362897307062609, + "grad_norm": 4.15110969543457, + "learning_rate": 6.272592203822558e-05, + "loss": 1.963, + "step": 5796 + }, + { + "epoch": 0.4363650049869211, + "grad_norm": 4.457510471343994, + "learning_rate": 6.27141322403104e-05, + "loss": 1.7447, + "step": 5797 + }, + { + "epoch": 0.4364402792675812, + "grad_norm": 6.652022838592529, + "learning_rate": 6.270234168657653e-05, + "loss": 2.0384, + "step": 5798 + }, + { + "epoch": 0.4365155535482414, + "grad_norm": 4.389620304107666, + "learning_rate": 6.269055037772492e-05, + "loss": 1.7412, + "step": 5799 + }, + { + "epoch": 0.4365908278289016, + "grad_norm": 6.0143609046936035, + "learning_rate": 6.26787583144565e-05, + "loss": 1.9418, + "step": 5800 + }, + { + "epoch": 0.4366661021095617, + "grad_norm": 4.486854076385498, + "learning_rate": 6.266696549747227e-05, + "loss": 1.778, + "step": 5801 + }, + { + "epoch": 0.43674137639022187, + "grad_norm": 4.908323764801025, + "learning_rate": 6.26551719274733e-05, + "loss": 2.0187, + "step": 5802 + }, + { + "epoch": 0.43681665067088205, + "grad_norm": 5.330374717712402, + "learning_rate": 6.264337760516064e-05, + "loss": 2.2538, + "step": 5803 + }, + { + "epoch": 0.43689192495154217, + "grad_norm": 4.781247615814209, + "learning_rate": 6.263158253123548e-05, + "loss": 2.0503, + "step": 5804 + }, + { + "epoch": 0.43696719923220234, + "grad_norm": 5.201683521270752, + "learning_rate": 6.261978670639899e-05, + "loss": 1.7548, + "step": 5805 + }, + { + "epoch": 0.4370424735128625, + "grad_norm": 4.287737846374512, + "learning_rate": 6.260799013135237e-05, + "loss": 1.7191, + "step": 5806 + }, + { + "epoch": 0.43711774779352264, + "grad_norm": 4.6271796226501465, + "learning_rate": 6.259619280679694e-05, + "loss": 1.673, + "step": 5807 + }, + { + "epoch": 0.4371930220741828, + "grad_norm": 5.017197608947754, + "learning_rate": 6.258439473343398e-05, + "loss": 1.7914, + "step": 5808 + }, + { + "epoch": 0.43726829635484293, + "grad_norm": 4.485762119293213, + "learning_rate": 6.257259591196484e-05, + "loss": 1.6493, + "step": 5809 + }, + { + "epoch": 0.4373435706355031, + "grad_norm": 4.729482650756836, + "learning_rate": 6.256079634309098e-05, + "loss": 2.2761, + "step": 5810 + }, + { + "epoch": 0.4374188449161633, + "grad_norm": 4.202962398529053, + "learning_rate": 6.25489960275138e-05, + "loss": 2.0549, + "step": 5811 + }, + { + "epoch": 0.4374941191968234, + "grad_norm": 5.038227081298828, + "learning_rate": 6.253719496593482e-05, + "loss": 2.0185, + "step": 5812 + }, + { + "epoch": 0.4375693934774836, + "grad_norm": 5.413717746734619, + "learning_rate": 6.252539315905557e-05, + "loss": 2.1271, + "step": 5813 + }, + { + "epoch": 0.43764466775814376, + "grad_norm": 5.178262233734131, + "learning_rate": 6.251359060757762e-05, + "loss": 2.089, + "step": 5814 + }, + { + "epoch": 0.4377199420388039, + "grad_norm": 5.374349594116211, + "learning_rate": 6.250178731220265e-05, + "loss": 1.9301, + "step": 5815 + }, + { + "epoch": 0.43779521631946405, + "grad_norm": 4.188270092010498, + "learning_rate": 6.248998327363227e-05, + "loss": 1.4729, + "step": 5816 + }, + { + "epoch": 0.43787049060012423, + "grad_norm": 7.420822620391846, + "learning_rate": 6.247817849256823e-05, + "loss": 1.7689, + "step": 5817 + }, + { + "epoch": 0.43794576488078435, + "grad_norm": 5.440687656402588, + "learning_rate": 6.246637296971228e-05, + "loss": 1.6989, + "step": 5818 + }, + { + "epoch": 0.4380210391614445, + "grad_norm": 4.3115339279174805, + "learning_rate": 6.245456670576621e-05, + "loss": 1.7496, + "step": 5819 + }, + { + "epoch": 0.43809631344210465, + "grad_norm": 3.707275390625, + "learning_rate": 6.244275970143191e-05, + "loss": 1.5772, + "step": 5820 + }, + { + "epoch": 0.4381715877227648, + "grad_norm": 5.517369747161865, + "learning_rate": 6.243095195741124e-05, + "loss": 1.4858, + "step": 5821 + }, + { + "epoch": 0.438246862003425, + "grad_norm": 4.949464321136475, + "learning_rate": 6.241914347440614e-05, + "loss": 1.9462, + "step": 5822 + }, + { + "epoch": 0.4383221362840851, + "grad_norm": 3.8528895378112793, + "learning_rate": 6.240733425311859e-05, + "loss": 1.7908, + "step": 5823 + }, + { + "epoch": 0.4383974105647453, + "grad_norm": 5.060612201690674, + "learning_rate": 6.239552429425063e-05, + "loss": 1.7569, + "step": 5824 + }, + { + "epoch": 0.43847268484540547, + "grad_norm": 5.978921890258789, + "learning_rate": 6.238371359850433e-05, + "loss": 1.9004, + "step": 5825 + }, + { + "epoch": 0.4385479591260656, + "grad_norm": 4.387962818145752, + "learning_rate": 6.237190216658177e-05, + "loss": 1.6098, + "step": 5826 + }, + { + "epoch": 0.43862323340672577, + "grad_norm": 5.254219055175781, + "learning_rate": 6.236008999918514e-05, + "loss": 2.0547, + "step": 5827 + }, + { + "epoch": 0.4386985076873859, + "grad_norm": 4.570847511291504, + "learning_rate": 6.234827709701662e-05, + "loss": 1.9328, + "step": 5828 + }, + { + "epoch": 0.43877378196804606, + "grad_norm": 6.545477390289307, + "learning_rate": 6.233646346077844e-05, + "loss": 1.5737, + "step": 5829 + }, + { + "epoch": 0.43884905624870624, + "grad_norm": 5.854374885559082, + "learning_rate": 6.232464909117292e-05, + "loss": 1.712, + "step": 5830 + }, + { + "epoch": 0.43892433052936636, + "grad_norm": 7.668844699859619, + "learning_rate": 6.231283398890237e-05, + "loss": 2.141, + "step": 5831 + }, + { + "epoch": 0.43899960481002653, + "grad_norm": 4.709848880767822, + "learning_rate": 6.230101815466917e-05, + "loss": 1.8147, + "step": 5832 + }, + { + "epoch": 0.4390748790906867, + "grad_norm": 4.478626251220703, + "learning_rate": 6.228920158917574e-05, + "loss": 1.8843, + "step": 5833 + }, + { + "epoch": 0.43915015337134683, + "grad_norm": 6.493508338928223, + "learning_rate": 6.227738429312456e-05, + "loss": 1.8751, + "step": 5834 + }, + { + "epoch": 0.439225427652007, + "grad_norm": 4.667327404022217, + "learning_rate": 6.226556626721808e-05, + "loss": 1.9067, + "step": 5835 + }, + { + "epoch": 0.4393007019326672, + "grad_norm": 6.406425952911377, + "learning_rate": 6.225374751215888e-05, + "loss": 1.9351, + "step": 5836 + }, + { + "epoch": 0.4393759762133273, + "grad_norm": 4.937656402587891, + "learning_rate": 6.224192802864955e-05, + "loss": 2.1119, + "step": 5837 + }, + { + "epoch": 0.4394512504939875, + "grad_norm": 6.021147727966309, + "learning_rate": 6.223010781739273e-05, + "loss": 1.9465, + "step": 5838 + }, + { + "epoch": 0.4395265247746476, + "grad_norm": 4.826620578765869, + "learning_rate": 6.221828687909109e-05, + "loss": 1.9447, + "step": 5839 + }, + { + "epoch": 0.4396017990553078, + "grad_norm": 5.151264667510986, + "learning_rate": 6.220646521444736e-05, + "loss": 1.9302, + "step": 5840 + }, + { + "epoch": 0.43967707333596795, + "grad_norm": 5.776466369628906, + "learning_rate": 6.21946428241643e-05, + "loss": 2.1637, + "step": 5841 + }, + { + "epoch": 0.43975234761662807, + "grad_norm": 4.829813480377197, + "learning_rate": 6.21828197089447e-05, + "loss": 2.0044, + "step": 5842 + }, + { + "epoch": 0.43982762189728825, + "grad_norm": 4.402285575866699, + "learning_rate": 6.217099586949143e-05, + "loss": 1.6925, + "step": 5843 + }, + { + "epoch": 0.4399028961779484, + "grad_norm": 5.168896675109863, + "learning_rate": 6.215917130650739e-05, + "loss": 2.3146, + "step": 5844 + }, + { + "epoch": 0.43997817045860854, + "grad_norm": 4.062685489654541, + "learning_rate": 6.21473460206955e-05, + "loss": 1.5988, + "step": 5845 + }, + { + "epoch": 0.4400534447392687, + "grad_norm": 5.211696147918701, + "learning_rate": 6.213552001275874e-05, + "loss": 1.657, + "step": 5846 + }, + { + "epoch": 0.4401287190199289, + "grad_norm": 8.855429649353027, + "learning_rate": 6.212369328340014e-05, + "loss": 1.9531, + "step": 5847 + }, + { + "epoch": 0.440203993300589, + "grad_norm": 4.897368907928467, + "learning_rate": 6.211186583332275e-05, + "loss": 1.8066, + "step": 5848 + }, + { + "epoch": 0.4402792675812492, + "grad_norm": 6.428262710571289, + "learning_rate": 6.210003766322969e-05, + "loss": 2.0787, + "step": 5849 + }, + { + "epoch": 0.4403545418619093, + "grad_norm": 5.9427809715271, + "learning_rate": 6.208820877382414e-05, + "loss": 1.9592, + "step": 5850 + }, + { + "epoch": 0.4404298161425695, + "grad_norm": 6.859930992126465, + "learning_rate": 6.207637916580923e-05, + "loss": 1.7693, + "step": 5851 + }, + { + "epoch": 0.44050509042322966, + "grad_norm": 5.213363170623779, + "learning_rate": 6.206454883988823e-05, + "loss": 1.6205, + "step": 5852 + }, + { + "epoch": 0.4405803647038898, + "grad_norm": 4.418485164642334, + "learning_rate": 6.205271779676443e-05, + "loss": 1.5481, + "step": 5853 + }, + { + "epoch": 0.44065563898454996, + "grad_norm": 4.274722576141357, + "learning_rate": 6.204088603714113e-05, + "loss": 1.4668, + "step": 5854 + }, + { + "epoch": 0.44073091326521013, + "grad_norm": 4.087955474853516, + "learning_rate": 6.202905356172171e-05, + "loss": 1.7904, + "step": 5855 + }, + { + "epoch": 0.44080618754587025, + "grad_norm": 5.659778594970703, + "learning_rate": 6.201722037120957e-05, + "loss": 1.7158, + "step": 5856 + }, + { + "epoch": 0.44088146182653043, + "grad_norm": 5.042832374572754, + "learning_rate": 6.200538646630817e-05, + "loss": 1.9718, + "step": 5857 + }, + { + "epoch": 0.44095673610719055, + "grad_norm": 5.757094860076904, + "learning_rate": 6.199355184772099e-05, + "loss": 2.4797, + "step": 5858 + }, + { + "epoch": 0.4410320103878507, + "grad_norm": 5.852606296539307, + "learning_rate": 6.198171651615155e-05, + "loss": 1.7007, + "step": 5859 + }, + { + "epoch": 0.4411072846685109, + "grad_norm": 4.6757001876831055, + "learning_rate": 6.196988047230345e-05, + "loss": 1.9204, + "step": 5860 + }, + { + "epoch": 0.441182558949171, + "grad_norm": 8.15085220336914, + "learning_rate": 6.195804371688031e-05, + "loss": 1.8268, + "step": 5861 + }, + { + "epoch": 0.4412578332298312, + "grad_norm": 4.679993152618408, + "learning_rate": 6.194620625058578e-05, + "loss": 1.6113, + "step": 5862 + }, + { + "epoch": 0.4413331075104914, + "grad_norm": 4.145971775054932, + "learning_rate": 6.193436807412356e-05, + "loss": 1.5334, + "step": 5863 + }, + { + "epoch": 0.4414083817911515, + "grad_norm": 5.400669097900391, + "learning_rate": 6.19225291881974e-05, + "loss": 1.913, + "step": 5864 + }, + { + "epoch": 0.44148365607181167, + "grad_norm": 4.492130279541016, + "learning_rate": 6.19106895935111e-05, + "loss": 2.0012, + "step": 5865 + }, + { + "epoch": 0.44155893035247185, + "grad_norm": 6.411062240600586, + "learning_rate": 6.189884929076847e-05, + "loss": 1.9604, + "step": 5866 + }, + { + "epoch": 0.44163420463313197, + "grad_norm": 5.837465763092041, + "learning_rate": 6.18870082806734e-05, + "loss": 1.857, + "step": 5867 + }, + { + "epoch": 0.44170947891379214, + "grad_norm": 5.866978645324707, + "learning_rate": 6.187516656392978e-05, + "loss": 1.528, + "step": 5868 + }, + { + "epoch": 0.44178475319445226, + "grad_norm": 4.540149688720703, + "learning_rate": 6.18633241412416e-05, + "loss": 1.7805, + "step": 5869 + }, + { + "epoch": 0.44186002747511244, + "grad_norm": 3.9436843395233154, + "learning_rate": 6.185148101331281e-05, + "loss": 1.4298, + "step": 5870 + }, + { + "epoch": 0.4419353017557726, + "grad_norm": 4.274122714996338, + "learning_rate": 6.183963718084752e-05, + "loss": 1.8796, + "step": 5871 + }, + { + "epoch": 0.44201057603643273, + "grad_norm": 4.931369304656982, + "learning_rate": 6.182779264454974e-05, + "loss": 1.7588, + "step": 5872 + }, + { + "epoch": 0.4420858503170929, + "grad_norm": 4.517200946807861, + "learning_rate": 6.181594740512365e-05, + "loss": 1.9286, + "step": 5873 + }, + { + "epoch": 0.4421611245977531, + "grad_norm": 6.394038677215576, + "learning_rate": 6.180410146327336e-05, + "loss": 1.9484, + "step": 5874 + }, + { + "epoch": 0.4422363988784132, + "grad_norm": 4.643242359161377, + "learning_rate": 6.179225481970313e-05, + "loss": 1.9721, + "step": 5875 + }, + { + "epoch": 0.4423116731590734, + "grad_norm": 6.796786308288574, + "learning_rate": 6.178040747511717e-05, + "loss": 1.6678, + "step": 5876 + }, + { + "epoch": 0.4423869474397335, + "grad_norm": 5.876829147338867, + "learning_rate": 6.176855943021981e-05, + "loss": 2.2837, + "step": 5877 + }, + { + "epoch": 0.4424622217203937, + "grad_norm": 5.323246002197266, + "learning_rate": 6.175671068571535e-05, + "loss": 1.8694, + "step": 5878 + }, + { + "epoch": 0.44253749600105385, + "grad_norm": 4.678639888763428, + "learning_rate": 6.174486124230817e-05, + "loss": 1.9097, + "step": 5879 + }, + { + "epoch": 0.442612770281714, + "grad_norm": 5.8317952156066895, + "learning_rate": 6.17330111007027e-05, + "loss": 1.8571, + "step": 5880 + }, + { + "epoch": 0.44268804456237415, + "grad_norm": 3.9560675621032715, + "learning_rate": 6.172116026160337e-05, + "loss": 1.9481, + "step": 5881 + }, + { + "epoch": 0.4427633188430343, + "grad_norm": 5.007081508636475, + "learning_rate": 6.17093087257147e-05, + "loss": 1.7933, + "step": 5882 + }, + { + "epoch": 0.44283859312369445, + "grad_norm": 6.855524063110352, + "learning_rate": 6.169745649374122e-05, + "loss": 1.7125, + "step": 5883 + }, + { + "epoch": 0.4429138674043546, + "grad_norm": 4.754982948303223, + "learning_rate": 6.168560356638752e-05, + "loss": 2.0244, + "step": 5884 + }, + { + "epoch": 0.4429891416850148, + "grad_norm": 6.652046203613281, + "learning_rate": 6.167374994435822e-05, + "loss": 1.834, + "step": 5885 + }, + { + "epoch": 0.4430644159656749, + "grad_norm": 5.552329063415527, + "learning_rate": 6.166189562835798e-05, + "loss": 2.2416, + "step": 5886 + }, + { + "epoch": 0.4431396902463351, + "grad_norm": 5.925650596618652, + "learning_rate": 6.16500406190915e-05, + "loss": 2.0492, + "step": 5887 + }, + { + "epoch": 0.4432149645269952, + "grad_norm": 4.0246357917785645, + "learning_rate": 6.163818491726354e-05, + "loss": 1.5696, + "step": 5888 + }, + { + "epoch": 0.4432902388076554, + "grad_norm": 4.192280292510986, + "learning_rate": 6.162632852357887e-05, + "loss": 1.842, + "step": 5889 + }, + { + "epoch": 0.44336551308831557, + "grad_norm": 7.952633857727051, + "learning_rate": 6.161447143874234e-05, + "loss": 1.8114, + "step": 5890 + }, + { + "epoch": 0.4434407873689757, + "grad_norm": 5.156842231750488, + "learning_rate": 6.160261366345882e-05, + "loss": 1.8423, + "step": 5891 + }, + { + "epoch": 0.44351606164963586, + "grad_norm": 5.8629961013793945, + "learning_rate": 6.159075519843319e-05, + "loss": 1.737, + "step": 5892 + }, + { + "epoch": 0.44359133593029604, + "grad_norm": 4.871035575866699, + "learning_rate": 6.157889604437043e-05, + "loss": 1.8111, + "step": 5893 + }, + { + "epoch": 0.44366661021095616, + "grad_norm": 5.2656426429748535, + "learning_rate": 6.156703620197553e-05, + "loss": 1.7904, + "step": 5894 + }, + { + "epoch": 0.44374188449161633, + "grad_norm": 6.103933811187744, + "learning_rate": 6.155517567195351e-05, + "loss": 1.971, + "step": 5895 + }, + { + "epoch": 0.4438171587722765, + "grad_norm": 5.573460578918457, + "learning_rate": 6.154331445500945e-05, + "loss": 1.6584, + "step": 5896 + }, + { + "epoch": 0.44389243305293663, + "grad_norm": 4.668985843658447, + "learning_rate": 6.153145255184846e-05, + "loss": 1.807, + "step": 5897 + }, + { + "epoch": 0.4439677073335968, + "grad_norm": 4.926974296569824, + "learning_rate": 6.15195899631757e-05, + "loss": 1.5456, + "step": 5898 + }, + { + "epoch": 0.4440429816142569, + "grad_norm": 5.2956624031066895, + "learning_rate": 6.150772668969639e-05, + "loss": 2.0118, + "step": 5899 + }, + { + "epoch": 0.4441182558949171, + "grad_norm": 6.440892696380615, + "learning_rate": 6.149586273211573e-05, + "loss": 1.7713, + "step": 5900 + }, + { + "epoch": 0.4441935301755773, + "grad_norm": 4.950819492340088, + "learning_rate": 6.148399809113903e-05, + "loss": 1.546, + "step": 5901 + }, + { + "epoch": 0.4442688044562374, + "grad_norm": 4.544276237487793, + "learning_rate": 6.147213276747159e-05, + "loss": 1.9092, + "step": 5902 + }, + { + "epoch": 0.4443440787368976, + "grad_norm": 5.3630571365356445, + "learning_rate": 6.146026676181877e-05, + "loss": 1.7542, + "step": 5903 + }, + { + "epoch": 0.44441935301755775, + "grad_norm": 4.764751434326172, + "learning_rate": 6.144840007488598e-05, + "loss": 1.7094, + "step": 5904 + }, + { + "epoch": 0.44449462729821787, + "grad_norm": 4.419123649597168, + "learning_rate": 6.143653270737866e-05, + "loss": 1.8387, + "step": 5905 + }, + { + "epoch": 0.44456990157887805, + "grad_norm": 3.8902711868286133, + "learning_rate": 6.142466466000228e-05, + "loss": 1.8304, + "step": 5906 + }, + { + "epoch": 0.44464517585953817, + "grad_norm": 5.639704704284668, + "learning_rate": 6.141279593346237e-05, + "loss": 1.5237, + "step": 5907 + }, + { + "epoch": 0.44472045014019834, + "grad_norm": 4.762472629547119, + "learning_rate": 6.140092652846448e-05, + "loss": 1.8729, + "step": 5908 + }, + { + "epoch": 0.4447957244208585, + "grad_norm": 4.487603187561035, + "learning_rate": 6.138905644571421e-05, + "loss": 1.8431, + "step": 5909 + }, + { + "epoch": 0.44487099870151864, + "grad_norm": 5.679095268249512, + "learning_rate": 6.137718568591722e-05, + "loss": 1.9558, + "step": 5910 + }, + { + "epoch": 0.4449462729821788, + "grad_norm": 9.267931938171387, + "learning_rate": 6.13653142497792e-05, + "loss": 2.1104, + "step": 5911 + }, + { + "epoch": 0.445021547262839, + "grad_norm": 6.3070197105407715, + "learning_rate": 6.135344213800586e-05, + "loss": 2.0746, + "step": 5912 + }, + { + "epoch": 0.4450968215434991, + "grad_norm": 4.995018482208252, + "learning_rate": 6.134156935130296e-05, + "loss": 2.0203, + "step": 5913 + }, + { + "epoch": 0.4451720958241593, + "grad_norm": 5.359880447387695, + "learning_rate": 6.13296958903763e-05, + "loss": 2.2601, + "step": 5914 + }, + { + "epoch": 0.44524737010481946, + "grad_norm": 4.871240615844727, + "learning_rate": 6.131782175593172e-05, + "loss": 2.0277, + "step": 5915 + }, + { + "epoch": 0.4453226443854796, + "grad_norm": 5.7709197998046875, + "learning_rate": 6.130594694867512e-05, + "loss": 2.0326, + "step": 5916 + }, + { + "epoch": 0.44539791866613976, + "grad_norm": 5.7118682861328125, + "learning_rate": 6.12940714693124e-05, + "loss": 1.832, + "step": 5917 + }, + { + "epoch": 0.4454731929467999, + "grad_norm": 6.593611717224121, + "learning_rate": 6.128219531854956e-05, + "loss": 2.0637, + "step": 5918 + }, + { + "epoch": 0.44554846722746005, + "grad_norm": 7.002303600311279, + "learning_rate": 6.127031849709257e-05, + "loss": 1.9767, + "step": 5919 + }, + { + "epoch": 0.44562374150812023, + "grad_norm": 4.957760810852051, + "learning_rate": 6.125844100564746e-05, + "loss": 1.8056, + "step": 5920 + }, + { + "epoch": 0.44569901578878035, + "grad_norm": 4.455676078796387, + "learning_rate": 6.124656284492036e-05, + "loss": 1.9146, + "step": 5921 + }, + { + "epoch": 0.4457742900694405, + "grad_norm": 4.740127086639404, + "learning_rate": 6.123468401561738e-05, + "loss": 1.842, + "step": 5922 + }, + { + "epoch": 0.4458495643501007, + "grad_norm": 3.9294497966766357, + "learning_rate": 6.122280451844467e-05, + "loss": 1.7425, + "step": 5923 + }, + { + "epoch": 0.4459248386307608, + "grad_norm": 4.513614177703857, + "learning_rate": 6.121092435410841e-05, + "loss": 1.9787, + "step": 5924 + }, + { + "epoch": 0.446000112911421, + "grad_norm": 4.204850196838379, + "learning_rate": 6.119904352331488e-05, + "loss": 1.7417, + "step": 5925 + }, + { + "epoch": 0.4460753871920811, + "grad_norm": 5.066445350646973, + "learning_rate": 6.118716202677033e-05, + "loss": 1.9894, + "step": 5926 + }, + { + "epoch": 0.4461506614727413, + "grad_norm": 4.937731742858887, + "learning_rate": 6.117527986518113e-05, + "loss": 1.8921, + "step": 5927 + }, + { + "epoch": 0.44622593575340147, + "grad_norm": 5.468081474304199, + "learning_rate": 6.116339703925358e-05, + "loss": 1.9164, + "step": 5928 + }, + { + "epoch": 0.4463012100340616, + "grad_norm": 6.904363632202148, + "learning_rate": 6.115151354969413e-05, + "loss": 1.8795, + "step": 5929 + }, + { + "epoch": 0.44637648431472177, + "grad_norm": 6.607165336608887, + "learning_rate": 6.113962939720918e-05, + "loss": 2.0806, + "step": 5930 + }, + { + "epoch": 0.44645175859538194, + "grad_norm": 5.793832302093506, + "learning_rate": 6.112774458250522e-05, + "loss": 1.9448, + "step": 5931 + }, + { + "epoch": 0.44652703287604206, + "grad_norm": 5.159279823303223, + "learning_rate": 6.111585910628878e-05, + "loss": 2.0086, + "step": 5932 + }, + { + "epoch": 0.44660230715670224, + "grad_norm": 6.321938991546631, + "learning_rate": 6.110397296926641e-05, + "loss": 1.8954, + "step": 5933 + }, + { + "epoch": 0.4466775814373624, + "grad_norm": 4.06212854385376, + "learning_rate": 6.10920861721447e-05, + "loss": 2.1138, + "step": 5934 + }, + { + "epoch": 0.44675285571802253, + "grad_norm": 4.8073344230651855, + "learning_rate": 6.108019871563031e-05, + "loss": 1.7812, + "step": 5935 + }, + { + "epoch": 0.4468281299986827, + "grad_norm": 6.7949676513671875, + "learning_rate": 6.10683106004299e-05, + "loss": 1.5813, + "step": 5936 + }, + { + "epoch": 0.44690340427934283, + "grad_norm": 5.068625450134277, + "learning_rate": 6.105642182725017e-05, + "loss": 1.8129, + "step": 5937 + }, + { + "epoch": 0.446978678560003, + "grad_norm": 5.655060768127441, + "learning_rate": 6.104453239679789e-05, + "loss": 2.0068, + "step": 5938 + }, + { + "epoch": 0.4470539528406632, + "grad_norm": 6.825038433074951, + "learning_rate": 6.103264230977986e-05, + "loss": 1.6842, + "step": 5939 + }, + { + "epoch": 0.4471292271213233, + "grad_norm": 6.412866592407227, + "learning_rate": 6.10207515669029e-05, + "loss": 1.5939, + "step": 5940 + }, + { + "epoch": 0.4472045014019835, + "grad_norm": 6.018184661865234, + "learning_rate": 6.100886016887387e-05, + "loss": 1.8987, + "step": 5941 + }, + { + "epoch": 0.44727977568264365, + "grad_norm": 5.690323829650879, + "learning_rate": 6.0996968116399686e-05, + "loss": 1.7042, + "step": 5942 + }, + { + "epoch": 0.4473550499633038, + "grad_norm": 5.217639446258545, + "learning_rate": 6.098507541018732e-05, + "loss": 1.996, + "step": 5943 + }, + { + "epoch": 0.44743032424396395, + "grad_norm": 4.900861740112305, + "learning_rate": 6.097318205094373e-05, + "loss": 2.2944, + "step": 5944 + }, + { + "epoch": 0.4475055985246241, + "grad_norm": 3.287893533706665, + "learning_rate": 6.096128803937596e-05, + "loss": 1.6287, + "step": 5945 + }, + { + "epoch": 0.44758087280528425, + "grad_norm": 4.29261589050293, + "learning_rate": 6.0949393376191066e-05, + "loss": 1.724, + "step": 5946 + }, + { + "epoch": 0.4476561470859444, + "grad_norm": 3.900416851043701, + "learning_rate": 6.0937498062096146e-05, + "loss": 1.6034, + "step": 5947 + }, + { + "epoch": 0.44773142136660454, + "grad_norm": 5.113426685333252, + "learning_rate": 6.092560209779835e-05, + "loss": 1.8158, + "step": 5948 + }, + { + "epoch": 0.4478066956472647, + "grad_norm": 5.223568439483643, + "learning_rate": 6.091370548400486e-05, + "loss": 1.7778, + "step": 5949 + }, + { + "epoch": 0.4478819699279249, + "grad_norm": 6.56968879699707, + "learning_rate": 6.0901808221422885e-05, + "loss": 1.9295, + "step": 5950 + }, + { + "epoch": 0.447957244208585, + "grad_norm": 5.605441093444824, + "learning_rate": 6.0889910310759714e-05, + "loss": 1.9088, + "step": 5951 + }, + { + "epoch": 0.4480325184892452, + "grad_norm": 5.2693586349487305, + "learning_rate": 6.087801175272261e-05, + "loss": 2.3691, + "step": 5952 + }, + { + "epoch": 0.44810779276990537, + "grad_norm": 5.270655155181885, + "learning_rate": 6.0866112548018915e-05, + "loss": 1.7201, + "step": 5953 + }, + { + "epoch": 0.4481830670505655, + "grad_norm": 4.41894006729126, + "learning_rate": 6.085421269735601e-05, + "loss": 1.9381, + "step": 5954 + }, + { + "epoch": 0.44825834133122566, + "grad_norm": 5.286181926727295, + "learning_rate": 6.084231220144131e-05, + "loss": 2.3921, + "step": 5955 + }, + { + "epoch": 0.4483336156118858, + "grad_norm": 6.145561218261719, + "learning_rate": 6.083041106098225e-05, + "loss": 1.5353, + "step": 5956 + }, + { + "epoch": 0.44840888989254596, + "grad_norm": 5.020909786224365, + "learning_rate": 6.081850927668634e-05, + "loss": 2.2206, + "step": 5957 + }, + { + "epoch": 0.44848416417320613, + "grad_norm": 5.309340000152588, + "learning_rate": 6.0806606849261096e-05, + "loss": 1.69, + "step": 5958 + }, + { + "epoch": 0.44855943845386625, + "grad_norm": 5.251892566680908, + "learning_rate": 6.079470377941407e-05, + "loss": 1.6676, + "step": 5959 + }, + { + "epoch": 0.44863471273452643, + "grad_norm": 6.448060989379883, + "learning_rate": 6.0782800067852886e-05, + "loss": 1.8315, + "step": 5960 + }, + { + "epoch": 0.4487099870151866, + "grad_norm": 4.818769931793213, + "learning_rate": 6.077089571528517e-05, + "loss": 1.95, + "step": 5961 + }, + { + "epoch": 0.4487852612958467, + "grad_norm": 4.409201622009277, + "learning_rate": 6.075899072241862e-05, + "loss": 1.8656, + "step": 5962 + }, + { + "epoch": 0.4488605355765069, + "grad_norm": 8.301618576049805, + "learning_rate": 6.074708508996094e-05, + "loss": 1.6854, + "step": 5963 + }, + { + "epoch": 0.4489358098571671, + "grad_norm": 8.407438278198242, + "learning_rate": 6.073517881861988e-05, + "loss": 1.8531, + "step": 5964 + }, + { + "epoch": 0.4490110841378272, + "grad_norm": 7.966956615447998, + "learning_rate": 6.072327190910323e-05, + "loss": 1.8034, + "step": 5965 + }, + { + "epoch": 0.4490863584184874, + "grad_norm": 6.825535774230957, + "learning_rate": 6.071136436211886e-05, + "loss": 1.9072, + "step": 5966 + }, + { + "epoch": 0.4491616326991475, + "grad_norm": 5.65238618850708, + "learning_rate": 6.0699456178374605e-05, + "loss": 1.8153, + "step": 5967 + }, + { + "epoch": 0.44923690697980767, + "grad_norm": 4.933606147766113, + "learning_rate": 6.068754735857838e-05, + "loss": 1.7513, + "step": 5968 + }, + { + "epoch": 0.44931218126046785, + "grad_norm": 5.864858150482178, + "learning_rate": 6.067563790343813e-05, + "loss": 2.1341, + "step": 5969 + }, + { + "epoch": 0.44938745554112797, + "grad_norm": 4.867324352264404, + "learning_rate": 6.066372781366183e-05, + "loss": 1.7382, + "step": 5970 + }, + { + "epoch": 0.44946272982178814, + "grad_norm": 5.430710792541504, + "learning_rate": 6.065181708995752e-05, + "loss": 2.1509, + "step": 5971 + }, + { + "epoch": 0.4495380041024483, + "grad_norm": 5.0609259605407715, + "learning_rate": 6.0639905733033245e-05, + "loss": 2.0265, + "step": 5972 + }, + { + "epoch": 0.44961327838310844, + "grad_norm": 8.913156509399414, + "learning_rate": 6.06279937435971e-05, + "loss": 1.903, + "step": 5973 + }, + { + "epoch": 0.4496885526637686, + "grad_norm": 5.114089488983154, + "learning_rate": 6.0616081122357235e-05, + "loss": 1.6698, + "step": 5974 + }, + { + "epoch": 0.44976382694442874, + "grad_norm": 7.96455192565918, + "learning_rate": 6.06041678700218e-05, + "loss": 2.0036, + "step": 5975 + }, + { + "epoch": 0.4498391012250889, + "grad_norm": 5.157546043395996, + "learning_rate": 6.0592253987299e-05, + "loss": 2.2272, + "step": 5976 + }, + { + "epoch": 0.4499143755057491, + "grad_norm": 4.556109428405762, + "learning_rate": 6.0580339474897106e-05, + "loss": 1.9602, + "step": 5977 + }, + { + "epoch": 0.4499896497864092, + "grad_norm": 4.611568927764893, + "learning_rate": 6.0568424333524386e-05, + "loss": 1.5033, + "step": 5978 + }, + { + "epoch": 0.4500649240670694, + "grad_norm": 4.080313205718994, + "learning_rate": 6.055650856388917e-05, + "loss": 1.8218, + "step": 5979 + }, + { + "epoch": 0.45014019834772956, + "grad_norm": 9.208446502685547, + "learning_rate": 6.05445921666998e-05, + "loss": 2.0877, + "step": 5980 + }, + { + "epoch": 0.4502154726283897, + "grad_norm": 7.896475315093994, + "learning_rate": 6.053267514266468e-05, + "loss": 2.0519, + "step": 5981 + }, + { + "epoch": 0.45029074690904985, + "grad_norm": 4.031863689422607, + "learning_rate": 6.0520757492492244e-05, + "loss": 1.7566, + "step": 5982 + }, + { + "epoch": 0.45036602118971003, + "grad_norm": 5.065053462982178, + "learning_rate": 6.0508839216890964e-05, + "loss": 1.6911, + "step": 5983 + }, + { + "epoch": 0.45044129547037015, + "grad_norm": 4.021406650543213, + "learning_rate": 6.049692031656935e-05, + "loss": 1.5459, + "step": 5984 + }, + { + "epoch": 0.4505165697510303, + "grad_norm": 6.9904398918151855, + "learning_rate": 6.048500079223595e-05, + "loss": 2.2804, + "step": 5985 + }, + { + "epoch": 0.45059184403169045, + "grad_norm": 4.259911060333252, + "learning_rate": 6.047308064459933e-05, + "loss": 1.9706, + "step": 5986 + }, + { + "epoch": 0.4506671183123506, + "grad_norm": 4.773449897766113, + "learning_rate": 6.04611598743681e-05, + "loss": 1.7772, + "step": 5987 + }, + { + "epoch": 0.4507423925930108, + "grad_norm": 3.993645429611206, + "learning_rate": 6.044923848225096e-05, + "loss": 2.0062, + "step": 5988 + }, + { + "epoch": 0.4508176668736709, + "grad_norm": 4.864718914031982, + "learning_rate": 6.043731646895656e-05, + "loss": 1.7299, + "step": 5989 + }, + { + "epoch": 0.4508929411543311, + "grad_norm": 4.546704292297363, + "learning_rate": 6.042539383519364e-05, + "loss": 1.5175, + "step": 5990 + }, + { + "epoch": 0.45096821543499127, + "grad_norm": 4.0287089347839355, + "learning_rate": 6.041347058167098e-05, + "loss": 2.0031, + "step": 5991 + }, + { + "epoch": 0.4510434897156514, + "grad_norm": 4.2088117599487305, + "learning_rate": 6.0401546709097366e-05, + "loss": 1.9018, + "step": 5992 + }, + { + "epoch": 0.45111876399631157, + "grad_norm": 5.0298285484313965, + "learning_rate": 6.038962221818165e-05, + "loss": 2.0547, + "step": 5993 + }, + { + "epoch": 0.45119403827697174, + "grad_norm": 5.067275047302246, + "learning_rate": 6.0377697109632716e-05, + "loss": 1.6897, + "step": 5994 + }, + { + "epoch": 0.45126931255763186, + "grad_norm": 5.217893600463867, + "learning_rate": 6.036577138415945e-05, + "loss": 1.7962, + "step": 5995 + }, + { + "epoch": 0.45134458683829204, + "grad_norm": 4.910605430603027, + "learning_rate": 6.035384504247082e-05, + "loss": 1.8039, + "step": 5996 + }, + { + "epoch": 0.45141986111895216, + "grad_norm": 4.302911758422852, + "learning_rate": 6.03419180852758e-05, + "loss": 1.76, + "step": 5997 + }, + { + "epoch": 0.45149513539961234, + "grad_norm": 5.301052093505859, + "learning_rate": 6.032999051328345e-05, + "loss": 2.1016, + "step": 5998 + }, + { + "epoch": 0.4515704096802725, + "grad_norm": 4.598941326141357, + "learning_rate": 6.031806232720277e-05, + "loss": 1.7921, + "step": 5999 + }, + { + "epoch": 0.45164568396093263, + "grad_norm": 4.367154598236084, + "learning_rate": 6.030613352774293e-05, + "loss": 2.5775, + "step": 6000 + }, + { + "epoch": 0.4517209582415928, + "grad_norm": 4.537741184234619, + "learning_rate": 6.0294204115613e-05, + "loss": 1.8805, + "step": 6001 + }, + { + "epoch": 0.451796232522253, + "grad_norm": 7.140305042266846, + "learning_rate": 6.0282274091522174e-05, + "loss": 1.8169, + "step": 6002 + }, + { + "epoch": 0.4518715068029131, + "grad_norm": 4.525583744049072, + "learning_rate": 6.0270343456179666e-05, + "loss": 1.6755, + "step": 6003 + }, + { + "epoch": 0.4519467810835733, + "grad_norm": 4.482814311981201, + "learning_rate": 6.025841221029469e-05, + "loss": 2.0646, + "step": 6004 + }, + { + "epoch": 0.4520220553642334, + "grad_norm": 5.176870822906494, + "learning_rate": 6.024648035457656e-05, + "loss": 2.1803, + "step": 6005 + }, + { + "epoch": 0.4520973296448936, + "grad_norm": 7.216761589050293, + "learning_rate": 6.023454788973457e-05, + "loss": 2.263, + "step": 6006 + }, + { + "epoch": 0.45217260392555375, + "grad_norm": 3.947472095489502, + "learning_rate": 6.022261481647808e-05, + "loss": 2.0027, + "step": 6007 + }, + { + "epoch": 0.45224787820621387, + "grad_norm": 5.846710205078125, + "learning_rate": 6.021068113551645e-05, + "loss": 2.2466, + "step": 6008 + }, + { + "epoch": 0.45232315248687405, + "grad_norm": 4.835168361663818, + "learning_rate": 6.019874684755914e-05, + "loss": 2.016, + "step": 6009 + }, + { + "epoch": 0.4523984267675342, + "grad_norm": 7.244464874267578, + "learning_rate": 6.0186811953315593e-05, + "loss": 1.5864, + "step": 6010 + }, + { + "epoch": 0.45247370104819434, + "grad_norm": 5.961076259613037, + "learning_rate": 6.01748764534953e-05, + "loss": 1.6673, + "step": 6011 + }, + { + "epoch": 0.4525489753288545, + "grad_norm": 5.057140350341797, + "learning_rate": 6.0162940348807804e-05, + "loss": 1.6193, + "step": 6012 + }, + { + "epoch": 0.4526242496095147, + "grad_norm": 5.632834434509277, + "learning_rate": 6.0151003639962654e-05, + "loss": 1.9544, + "step": 6013 + }, + { + "epoch": 0.4526995238901748, + "grad_norm": 4.595800399780273, + "learning_rate": 6.0139066327669466e-05, + "loss": 1.6372, + "step": 6014 + }, + { + "epoch": 0.452774798170835, + "grad_norm": 5.394411563873291, + "learning_rate": 6.012712841263788e-05, + "loss": 2.0719, + "step": 6015 + }, + { + "epoch": 0.4528500724514951, + "grad_norm": 4.8479084968566895, + "learning_rate": 6.011518989557757e-05, + "loss": 1.8402, + "step": 6016 + }, + { + "epoch": 0.4529253467321553, + "grad_norm": 4.494967937469482, + "learning_rate": 6.010325077719825e-05, + "loss": 1.9344, + "step": 6017 + }, + { + "epoch": 0.45300062101281546, + "grad_norm": 5.047435283660889, + "learning_rate": 6.009131105820965e-05, + "loss": 1.5327, + "step": 6018 + }, + { + "epoch": 0.4530758952934756, + "grad_norm": 4.637040138244629, + "learning_rate": 6.007937073932157e-05, + "loss": 1.9608, + "step": 6019 + }, + { + "epoch": 0.45315116957413576, + "grad_norm": 5.525001525878906, + "learning_rate": 6.0067429821243816e-05, + "loss": 2.0637, + "step": 6020 + }, + { + "epoch": 0.45322644385479594, + "grad_norm": 6.5304765701293945, + "learning_rate": 6.005548830468625e-05, + "loss": 1.8389, + "step": 6021 + }, + { + "epoch": 0.45330171813545606, + "grad_norm": 5.200170993804932, + "learning_rate": 6.004354619035876e-05, + "loss": 1.6527, + "step": 6022 + }, + { + "epoch": 0.45337699241611623, + "grad_norm": 5.130229949951172, + "learning_rate": 6.003160347897126e-05, + "loss": 1.5322, + "step": 6023 + }, + { + "epoch": 0.45345226669677635, + "grad_norm": 4.31912088394165, + "learning_rate": 6.0019660171233715e-05, + "loss": 1.7071, + "step": 6024 + }, + { + "epoch": 0.4535275409774365, + "grad_norm": 8.483233451843262, + "learning_rate": 6.0007716267856126e-05, + "loss": 1.8813, + "step": 6025 + }, + { + "epoch": 0.4536028152580967, + "grad_norm": 7.1665520668029785, + "learning_rate": 5.9995771769548514e-05, + "loss": 1.5141, + "step": 6026 + }, + { + "epoch": 0.4536780895387568, + "grad_norm": 4.0948872566223145, + "learning_rate": 5.998382667702096e-05, + "loss": 2.0954, + "step": 6027 + }, + { + "epoch": 0.453753363819417, + "grad_norm": 6.821392059326172, + "learning_rate": 5.997188099098357e-05, + "loss": 1.5671, + "step": 6028 + }, + { + "epoch": 0.4538286381000772, + "grad_norm": 4.044174671173096, + "learning_rate": 5.995993471214645e-05, + "loss": 1.8528, + "step": 6029 + }, + { + "epoch": 0.4539039123807373, + "grad_norm": 6.528314590454102, + "learning_rate": 5.9947987841219774e-05, + "loss": 2.3047, + "step": 6030 + }, + { + "epoch": 0.45397918666139747, + "grad_norm": 6.535315036773682, + "learning_rate": 5.9936040378913785e-05, + "loss": 1.6642, + "step": 6031 + }, + { + "epoch": 0.45405446094205765, + "grad_norm": 6.73472261428833, + "learning_rate": 5.9924092325938694e-05, + "loss": 1.7797, + "step": 6032 + }, + { + "epoch": 0.45412973522271777, + "grad_norm": 6.7068891525268555, + "learning_rate": 5.991214368300481e-05, + "loss": 1.7158, + "step": 6033 + }, + { + "epoch": 0.45420500950337794, + "grad_norm": 7.279703617095947, + "learning_rate": 5.9900194450822413e-05, + "loss": 1.6758, + "step": 6034 + }, + { + "epoch": 0.45428028378403806, + "grad_norm": 4.736883640289307, + "learning_rate": 5.988824463010187e-05, + "loss": 1.8204, + "step": 6035 + }, + { + "epoch": 0.45435555806469824, + "grad_norm": 6.897228240966797, + "learning_rate": 5.987629422155354e-05, + "loss": 1.8004, + "step": 6036 + }, + { + "epoch": 0.4544308323453584, + "grad_norm": 4.4168524742126465, + "learning_rate": 5.9864343225887864e-05, + "loss": 1.8317, + "step": 6037 + }, + { + "epoch": 0.45450610662601854, + "grad_norm": 6.016454219818115, + "learning_rate": 5.985239164381529e-05, + "loss": 1.7513, + "step": 6038 + }, + { + "epoch": 0.4545813809066787, + "grad_norm": 4.701673984527588, + "learning_rate": 5.98404394760463e-05, + "loss": 1.4949, + "step": 6039 + }, + { + "epoch": 0.4546566551873389, + "grad_norm": 6.244337558746338, + "learning_rate": 5.9828486723291435e-05, + "loss": 2.028, + "step": 6040 + }, + { + "epoch": 0.454731929467999, + "grad_norm": 4.369786739349365, + "learning_rate": 5.981653338626122e-05, + "loss": 1.7528, + "step": 6041 + }, + { + "epoch": 0.4548072037486592, + "grad_norm": 5.680680274963379, + "learning_rate": 5.980457946566625e-05, + "loss": 2.2813, + "step": 6042 + }, + { + "epoch": 0.45488247802931936, + "grad_norm": 5.053941249847412, + "learning_rate": 5.979262496221718e-05, + "loss": 1.7401, + "step": 6043 + }, + { + "epoch": 0.4549577523099795, + "grad_norm": 7.308879375457764, + "learning_rate": 5.978066987662465e-05, + "loss": 1.9016, + "step": 6044 + }, + { + "epoch": 0.45503302659063966, + "grad_norm": 6.568202495574951, + "learning_rate": 5.976871420959936e-05, + "loss": 1.7812, + "step": 6045 + }, + { + "epoch": 0.4551083008712998, + "grad_norm": 3.8854751586914062, + "learning_rate": 5.975675796185204e-05, + "loss": 1.6295, + "step": 6046 + }, + { + "epoch": 0.45518357515195995, + "grad_norm": 4.687932968139648, + "learning_rate": 5.9744801134093454e-05, + "loss": 1.8919, + "step": 6047 + }, + { + "epoch": 0.4552588494326201, + "grad_norm": 4.6100239753723145, + "learning_rate": 5.9732843727034395e-05, + "loss": 1.9737, + "step": 6048 + }, + { + "epoch": 0.45533412371328025, + "grad_norm": 4.947805881500244, + "learning_rate": 5.972088574138571e-05, + "loss": 1.8602, + "step": 6049 + }, + { + "epoch": 0.4554093979939404, + "grad_norm": 4.51487922668457, + "learning_rate": 5.9708927177858265e-05, + "loss": 1.6657, + "step": 6050 + }, + { + "epoch": 0.4554846722746006, + "grad_norm": 6.710210800170898, + "learning_rate": 5.969696803716295e-05, + "loss": 1.9876, + "step": 6051 + }, + { + "epoch": 0.4555599465552607, + "grad_norm": 4.834372043609619, + "learning_rate": 5.968500832001072e-05, + "loss": 1.7301, + "step": 6052 + }, + { + "epoch": 0.4556352208359209, + "grad_norm": 8.133435249328613, + "learning_rate": 5.967304802711252e-05, + "loss": 1.6668, + "step": 6053 + }, + { + "epoch": 0.455710495116581, + "grad_norm": 5.9300537109375, + "learning_rate": 5.966108715917937e-05, + "loss": 2.0845, + "step": 6054 + }, + { + "epoch": 0.4557857693972412, + "grad_norm": 4.685609817504883, + "learning_rate": 5.9649125716922316e-05, + "loss": 2.1073, + "step": 6055 + }, + { + "epoch": 0.45586104367790137, + "grad_norm": 4.934604167938232, + "learning_rate": 5.9637163701052434e-05, + "loss": 1.9403, + "step": 6056 + }, + { + "epoch": 0.4559363179585615, + "grad_norm": 5.846461772918701, + "learning_rate": 5.9625201112280806e-05, + "loss": 2.3661, + "step": 6057 + }, + { + "epoch": 0.45601159223922166, + "grad_norm": 4.534073829650879, + "learning_rate": 5.96132379513186e-05, + "loss": 1.9814, + "step": 6058 + }, + { + "epoch": 0.45608686651988184, + "grad_norm": 5.712651252746582, + "learning_rate": 5.960127421887697e-05, + "loss": 1.8173, + "step": 6059 + }, + { + "epoch": 0.45616214080054196, + "grad_norm": 5.031944274902344, + "learning_rate": 5.9589309915667146e-05, + "loss": 1.3554, + "step": 6060 + }, + { + "epoch": 0.45623741508120214, + "grad_norm": 5.242394924163818, + "learning_rate": 5.9577345042400355e-05, + "loss": 1.9039, + "step": 6061 + }, + { + "epoch": 0.4563126893618623, + "grad_norm": 6.488272666931152, + "learning_rate": 5.9565379599787885e-05, + "loss": 1.6434, + "step": 6062 + }, + { + "epoch": 0.45638796364252243, + "grad_norm": 5.953472137451172, + "learning_rate": 5.955341358854105e-05, + "loss": 1.915, + "step": 6063 + }, + { + "epoch": 0.4564632379231826, + "grad_norm": 4.9955973625183105, + "learning_rate": 5.954144700937119e-05, + "loss": 1.7853, + "step": 6064 + }, + { + "epoch": 0.45653851220384273, + "grad_norm": 6.209583759307861, + "learning_rate": 5.952947986298967e-05, + "loss": 1.9479, + "step": 6065 + }, + { + "epoch": 0.4566137864845029, + "grad_norm": 8.908929824829102, + "learning_rate": 5.9517512150107926e-05, + "loss": 1.7243, + "step": 6066 + }, + { + "epoch": 0.4566890607651631, + "grad_norm": 7.247470378875732, + "learning_rate": 5.950554387143739e-05, + "loss": 1.7124, + "step": 6067 + }, + { + "epoch": 0.4567643350458232, + "grad_norm": 4.193312644958496, + "learning_rate": 5.949357502768954e-05, + "loss": 1.79, + "step": 6068 + }, + { + "epoch": 0.4568396093264834, + "grad_norm": 4.503009796142578, + "learning_rate": 5.948160561957591e-05, + "loss": 1.7226, + "step": 6069 + }, + { + "epoch": 0.45691488360714355, + "grad_norm": 6.05814790725708, + "learning_rate": 5.9469635647808006e-05, + "loss": 2.0284, + "step": 6070 + }, + { + "epoch": 0.45699015788780367, + "grad_norm": 5.404448509216309, + "learning_rate": 5.945766511309745e-05, + "loss": 1.7335, + "step": 6071 + }, + { + "epoch": 0.45706543216846385, + "grad_norm": 6.091677188873291, + "learning_rate": 5.944569401615585e-05, + "loss": 1.717, + "step": 6072 + }, + { + "epoch": 0.45714070644912397, + "grad_norm": 4.675161361694336, + "learning_rate": 5.943372235769483e-05, + "loss": 1.7547, + "step": 6073 + }, + { + "epoch": 0.45721598072978414, + "grad_norm": 4.086429119110107, + "learning_rate": 5.942175013842609e-05, + "loss": 1.5028, + "step": 6074 + }, + { + "epoch": 0.4572912550104443, + "grad_norm": 3.3575029373168945, + "learning_rate": 5.9409777359061324e-05, + "loss": 1.6663, + "step": 6075 + }, + { + "epoch": 0.45736652929110444, + "grad_norm": 4.562759876251221, + "learning_rate": 5.939780402031232e-05, + "loss": 1.9863, + "step": 6076 + }, + { + "epoch": 0.4574418035717646, + "grad_norm": 5.538679122924805, + "learning_rate": 5.938583012289082e-05, + "loss": 1.9407, + "step": 6077 + }, + { + "epoch": 0.4575170778524248, + "grad_norm": 5.690635681152344, + "learning_rate": 5.937385566750864e-05, + "loss": 1.743, + "step": 6078 + }, + { + "epoch": 0.4575923521330849, + "grad_norm": 6.556675434112549, + "learning_rate": 5.9361880654877645e-05, + "loss": 1.8004, + "step": 6079 + }, + { + "epoch": 0.4576676264137451, + "grad_norm": 5.503957271575928, + "learning_rate": 5.934990508570971e-05, + "loss": 1.7745, + "step": 6080 + }, + { + "epoch": 0.45774290069440526, + "grad_norm": 3.4711291790008545, + "learning_rate": 5.9337928960716746e-05, + "loss": 1.9403, + "step": 6081 + }, + { + "epoch": 0.4578181749750654, + "grad_norm": 4.22915506362915, + "learning_rate": 5.932595228061069e-05, + "loss": 2.0392, + "step": 6082 + }, + { + "epoch": 0.45789344925572556, + "grad_norm": 4.395815372467041, + "learning_rate": 5.931397504610353e-05, + "loss": 1.883, + "step": 6083 + }, + { + "epoch": 0.4579687235363857, + "grad_norm": 4.615835189819336, + "learning_rate": 5.9301997257907296e-05, + "loss": 1.8139, + "step": 6084 + }, + { + "epoch": 0.45804399781704586, + "grad_norm": 5.618629455566406, + "learning_rate": 5.9290018916734e-05, + "loss": 1.9482, + "step": 6085 + }, + { + "epoch": 0.45811927209770603, + "grad_norm": 5.938513278961182, + "learning_rate": 5.9278040023295744e-05, + "loss": 1.8022, + "step": 6086 + }, + { + "epoch": 0.45819454637836615, + "grad_norm": 4.427749156951904, + "learning_rate": 5.926606057830462e-05, + "loss": 1.9146, + "step": 6087 + }, + { + "epoch": 0.45826982065902633, + "grad_norm": 8.553178787231445, + "learning_rate": 5.925408058247278e-05, + "loss": 1.6609, + "step": 6088 + }, + { + "epoch": 0.4583450949396865, + "grad_norm": 4.470408916473389, + "learning_rate": 5.924210003651242e-05, + "loss": 1.9157, + "step": 6089 + }, + { + "epoch": 0.4584203692203466, + "grad_norm": 4.592541694641113, + "learning_rate": 5.923011894113573e-05, + "loss": 2.2466, + "step": 6090 + }, + { + "epoch": 0.4584956435010068, + "grad_norm": 6.455626010894775, + "learning_rate": 5.9218137297054945e-05, + "loss": 1.7034, + "step": 6091 + }, + { + "epoch": 0.458570917781667, + "grad_norm": 5.766184329986572, + "learning_rate": 5.9206155104982333e-05, + "loss": 2.1367, + "step": 6092 + }, + { + "epoch": 0.4586461920623271, + "grad_norm": 4.370254039764404, + "learning_rate": 5.919417236563023e-05, + "loss": 1.7197, + "step": 6093 + }, + { + "epoch": 0.45872146634298727, + "grad_norm": 6.85117769241333, + "learning_rate": 5.918218907971095e-05, + "loss": 2.101, + "step": 6094 + }, + { + "epoch": 0.4587967406236474, + "grad_norm": 5.252208232879639, + "learning_rate": 5.9170205247936896e-05, + "loss": 1.8725, + "step": 6095 + }, + { + "epoch": 0.45887201490430757, + "grad_norm": 4.018035888671875, + "learning_rate": 5.915822087102044e-05, + "loss": 1.9098, + "step": 6096 + }, + { + "epoch": 0.45894728918496774, + "grad_norm": 4.710295677185059, + "learning_rate": 5.914623594967404e-05, + "loss": 1.6791, + "step": 6097 + }, + { + "epoch": 0.45902256346562786, + "grad_norm": 5.938717365264893, + "learning_rate": 5.913425048461013e-05, + "loss": 1.9013, + "step": 6098 + }, + { + "epoch": 0.45909783774628804, + "grad_norm": 4.47329044342041, + "learning_rate": 5.912226447654127e-05, + "loss": 1.7194, + "step": 6099 + }, + { + "epoch": 0.4591731120269482, + "grad_norm": 4.890882968902588, + "learning_rate": 5.911027792617995e-05, + "loss": 1.9075, + "step": 6100 + }, + { + "epoch": 0.45924838630760834, + "grad_norm": 4.940886974334717, + "learning_rate": 5.909829083423875e-05, + "loss": 1.6068, + "step": 6101 + }, + { + "epoch": 0.4593236605882685, + "grad_norm": 4.328497409820557, + "learning_rate": 5.908630320143026e-05, + "loss": 1.9103, + "step": 6102 + }, + { + "epoch": 0.45939893486892863, + "grad_norm": 4.51425313949585, + "learning_rate": 5.907431502846712e-05, + "loss": 1.8118, + "step": 6103 + }, + { + "epoch": 0.4594742091495888, + "grad_norm": 5.966202259063721, + "learning_rate": 5.906232631606198e-05, + "loss": 2.1494, + "step": 6104 + }, + { + "epoch": 0.459549483430249, + "grad_norm": 6.022815704345703, + "learning_rate": 5.905033706492755e-05, + "loss": 1.7501, + "step": 6105 + }, + { + "epoch": 0.4596247577109091, + "grad_norm": 5.151634216308594, + "learning_rate": 5.903834727577656e-05, + "loss": 1.7337, + "step": 6106 + }, + { + "epoch": 0.4597000319915693, + "grad_norm": 4.636327743530273, + "learning_rate": 5.902635694932175e-05, + "loss": 2.1326, + "step": 6107 + }, + { + "epoch": 0.45977530627222946, + "grad_norm": 5.286526679992676, + "learning_rate": 5.901436608627592e-05, + "loss": 1.9134, + "step": 6108 + }, + { + "epoch": 0.4598505805528896, + "grad_norm": 7.091437816619873, + "learning_rate": 5.900237468735188e-05, + "loss": 1.9734, + "step": 6109 + }, + { + "epoch": 0.45992585483354975, + "grad_norm": 5.15493631362915, + "learning_rate": 5.89903827532625e-05, + "loss": 1.8421, + "step": 6110 + }, + { + "epoch": 0.46000112911420993, + "grad_norm": 4.623770236968994, + "learning_rate": 5.897839028472066e-05, + "loss": 2.0416, + "step": 6111 + }, + { + "epoch": 0.46007640339487005, + "grad_norm": 5.258423328399658, + "learning_rate": 5.896639728243929e-05, + "loss": 1.4041, + "step": 6112 + }, + { + "epoch": 0.4601516776755302, + "grad_norm": 4.992006778717041, + "learning_rate": 5.895440374713133e-05, + "loss": 2.0742, + "step": 6113 + }, + { + "epoch": 0.46022695195619034, + "grad_norm": 9.411041259765625, + "learning_rate": 5.894240967950976e-05, + "loss": 2.4886, + "step": 6114 + }, + { + "epoch": 0.4603022262368505, + "grad_norm": 6.868233680725098, + "learning_rate": 5.8930415080287585e-05, + "loss": 1.8167, + "step": 6115 + }, + { + "epoch": 0.4603775005175107, + "grad_norm": 5.179319381713867, + "learning_rate": 5.8918419950177864e-05, + "loss": 2.0037, + "step": 6116 + }, + { + "epoch": 0.4604527747981708, + "grad_norm": 4.42066764831543, + "learning_rate": 5.890642428989366e-05, + "loss": 1.7899, + "step": 6117 + }, + { + "epoch": 0.460528049078831, + "grad_norm": 5.01406717300415, + "learning_rate": 5.889442810014809e-05, + "loss": 2.1132, + "step": 6118 + }, + { + "epoch": 0.46060332335949117, + "grad_norm": 4.943274974822998, + "learning_rate": 5.888243138165429e-05, + "loss": 2.0505, + "step": 6119 + }, + { + "epoch": 0.4606785976401513, + "grad_norm": 5.13903284072876, + "learning_rate": 5.887043413512543e-05, + "loss": 1.6401, + "step": 6120 + }, + { + "epoch": 0.46075387192081146, + "grad_norm": 6.994175434112549, + "learning_rate": 5.8858436361274714e-05, + "loss": 1.4858, + "step": 6121 + }, + { + "epoch": 0.4608291462014716, + "grad_norm": 5.462165832519531, + "learning_rate": 5.884643806081538e-05, + "loss": 1.6786, + "step": 6122 + }, + { + "epoch": 0.46090442048213176, + "grad_norm": 4.041025638580322, + "learning_rate": 5.883443923446068e-05, + "loss": 1.827, + "step": 6123 + }, + { + "epoch": 0.46097969476279194, + "grad_norm": 5.105112552642822, + "learning_rate": 5.882243988292393e-05, + "loss": 1.7969, + "step": 6124 + }, + { + "epoch": 0.46105496904345206, + "grad_norm": 4.128626823425293, + "learning_rate": 5.881044000691842e-05, + "loss": 1.9256, + "step": 6125 + }, + { + "epoch": 0.46113024332411223, + "grad_norm": 5.253990650177002, + "learning_rate": 5.879843960715754e-05, + "loss": 1.7567, + "step": 6126 + }, + { + "epoch": 0.4612055176047724, + "grad_norm": 5.437035083770752, + "learning_rate": 5.878643868435467e-05, + "loss": 1.7301, + "step": 6127 + }, + { + "epoch": 0.46128079188543253, + "grad_norm": 5.518754482269287, + "learning_rate": 5.877443723922323e-05, + "loss": 1.7715, + "step": 6128 + }, + { + "epoch": 0.4613560661660927, + "grad_norm": 5.965641021728516, + "learning_rate": 5.876243527247669e-05, + "loss": 2.1059, + "step": 6129 + }, + { + "epoch": 0.4614313404467529, + "grad_norm": 5.311532497406006, + "learning_rate": 5.87504327848285e-05, + "loss": 1.918, + "step": 6130 + }, + { + "epoch": 0.461506614727413, + "grad_norm": 4.383765697479248, + "learning_rate": 5.87384297769922e-05, + "loss": 1.7555, + "step": 6131 + }, + { + "epoch": 0.4615818890080732, + "grad_norm": 4.188746929168701, + "learning_rate": 5.8726426249681296e-05, + "loss": 1.9346, + "step": 6132 + }, + { + "epoch": 0.4616571632887333, + "grad_norm": 6.274206161499023, + "learning_rate": 5.871442220360942e-05, + "loss": 1.8332, + "step": 6133 + }, + { + "epoch": 0.4617324375693935, + "grad_norm": 7.281796455383301, + "learning_rate": 5.870241763949014e-05, + "loss": 2.2115, + "step": 6134 + }, + { + "epoch": 0.46180771185005365, + "grad_norm": 5.126307487487793, + "learning_rate": 5.86904125580371e-05, + "loss": 1.6952, + "step": 6135 + }, + { + "epoch": 0.46188298613071377, + "grad_norm": 4.986194610595703, + "learning_rate": 5.8678406959963974e-05, + "loss": 1.4031, + "step": 6136 + }, + { + "epoch": 0.46195826041137394, + "grad_norm": 5.2310380935668945, + "learning_rate": 5.866640084598444e-05, + "loss": 1.8141, + "step": 6137 + }, + { + "epoch": 0.4620335346920341, + "grad_norm": 5.924704074859619, + "learning_rate": 5.865439421681226e-05, + "loss": 2.1446, + "step": 6138 + }, + { + "epoch": 0.46210880897269424, + "grad_norm": 7.179225444793701, + "learning_rate": 5.864238707316117e-05, + "loss": 1.4637, + "step": 6139 + }, + { + "epoch": 0.4621840832533544, + "grad_norm": 5.906728744506836, + "learning_rate": 5.863037941574496e-05, + "loss": 1.6153, + "step": 6140 + }, + { + "epoch": 0.4622593575340146, + "grad_norm": 9.076240539550781, + "learning_rate": 5.8618371245277474e-05, + "loss": 1.9877, + "step": 6141 + }, + { + "epoch": 0.4623346318146747, + "grad_norm": 4.587173938751221, + "learning_rate": 5.860636256247252e-05, + "loss": 1.715, + "step": 6142 + }, + { + "epoch": 0.4624099060953349, + "grad_norm": 5.917949676513672, + "learning_rate": 5.859435336804401e-05, + "loss": 1.5084, + "step": 6143 + }, + { + "epoch": 0.462485180375995, + "grad_norm": 4.921531677246094, + "learning_rate": 5.858234366270586e-05, + "loss": 1.8317, + "step": 6144 + }, + { + "epoch": 0.4625604546566552, + "grad_norm": 5.478975296020508, + "learning_rate": 5.8570333447172024e-05, + "loss": 1.662, + "step": 6145 + }, + { + "epoch": 0.46263572893731536, + "grad_norm": 5.567662715911865, + "learning_rate": 5.8558322722156437e-05, + "loss": 1.8929, + "step": 6146 + }, + { + "epoch": 0.4627110032179755, + "grad_norm": 4.7706098556518555, + "learning_rate": 5.854631148837312e-05, + "loss": 2.0294, + "step": 6147 + }, + { + "epoch": 0.46278627749863566, + "grad_norm": 6.8486151695251465, + "learning_rate": 5.8534299746536103e-05, + "loss": 2.0833, + "step": 6148 + }, + { + "epoch": 0.46286155177929583, + "grad_norm": 4.756955146789551, + "learning_rate": 5.852228749735946e-05, + "loss": 2.0286, + "step": 6149 + }, + { + "epoch": 0.46293682605995595, + "grad_norm": 4.3472771644592285, + "learning_rate": 5.851027474155728e-05, + "loss": 1.9251, + "step": 6150 + }, + { + "epoch": 0.46301210034061613, + "grad_norm": 4.984425067901611, + "learning_rate": 5.8498261479843685e-05, + "loss": 1.8483, + "step": 6151 + }, + { + "epoch": 0.46308737462127625, + "grad_norm": 6.893495082855225, + "learning_rate": 5.848624771293284e-05, + "loss": 1.6982, + "step": 6152 + }, + { + "epoch": 0.4631626489019364, + "grad_norm": 6.063901901245117, + "learning_rate": 5.847423344153891e-05, + "loss": 2.0692, + "step": 6153 + }, + { + "epoch": 0.4632379231825966, + "grad_norm": 6.491004943847656, + "learning_rate": 5.84622186663761e-05, + "loss": 1.9783, + "step": 6154 + }, + { + "epoch": 0.4633131974632567, + "grad_norm": 4.137994289398193, + "learning_rate": 5.845020338815869e-05, + "loss": 1.8018, + "step": 6155 + }, + { + "epoch": 0.4633884717439169, + "grad_norm": 4.295070648193359, + "learning_rate": 5.8438187607600935e-05, + "loss": 2.0033, + "step": 6156 + }, + { + "epoch": 0.4634637460245771, + "grad_norm": 3.9471449851989746, + "learning_rate": 5.8426171325417136e-05, + "loss": 1.9153, + "step": 6157 + }, + { + "epoch": 0.4635390203052372, + "grad_norm": 7.037964344024658, + "learning_rate": 5.841415454232162e-05, + "loss": 1.8796, + "step": 6158 + }, + { + "epoch": 0.46361429458589737, + "grad_norm": 5.8768792152404785, + "learning_rate": 5.840213725902877e-05, + "loss": 1.7865, + "step": 6159 + }, + { + "epoch": 0.46368956886655754, + "grad_norm": 4.794296741485596, + "learning_rate": 5.839011947625295e-05, + "loss": 1.9562, + "step": 6160 + }, + { + "epoch": 0.46376484314721766, + "grad_norm": 5.852258205413818, + "learning_rate": 5.8378101194708614e-05, + "loss": 1.7414, + "step": 6161 + }, + { + "epoch": 0.46384011742787784, + "grad_norm": 5.527842998504639, + "learning_rate": 5.8366082415110215e-05, + "loss": 2.1941, + "step": 6162 + }, + { + "epoch": 0.46391539170853796, + "grad_norm": 6.627048015594482, + "learning_rate": 5.83540631381722e-05, + "loss": 2.0013, + "step": 6163 + }, + { + "epoch": 0.46399066598919814, + "grad_norm": 5.102452754974365, + "learning_rate": 5.834204336460911e-05, + "loss": 1.9073, + "step": 6164 + }, + { + "epoch": 0.4640659402698583, + "grad_norm": 5.403213024139404, + "learning_rate": 5.8330023095135476e-05, + "loss": 2.0079, + "step": 6165 + }, + { + "epoch": 0.46414121455051843, + "grad_norm": 5.336402893066406, + "learning_rate": 5.8318002330465884e-05, + "loss": 1.7969, + "step": 6166 + }, + { + "epoch": 0.4642164888311786, + "grad_norm": 4.320074558258057, + "learning_rate": 5.830598107131491e-05, + "loss": 2.0137, + "step": 6167 + }, + { + "epoch": 0.4642917631118388, + "grad_norm": 4.862398624420166, + "learning_rate": 5.8293959318397194e-05, + "loss": 1.9063, + "step": 6168 + }, + { + "epoch": 0.4643670373924989, + "grad_norm": 5.347936630249023, + "learning_rate": 5.82819370724274e-05, + "loss": 1.8387, + "step": 6169 + }, + { + "epoch": 0.4644423116731591, + "grad_norm": 4.824131488800049, + "learning_rate": 5.826991433412019e-05, + "loss": 1.7827, + "step": 6170 + }, + { + "epoch": 0.46451758595381926, + "grad_norm": 6.199734687805176, + "learning_rate": 5.825789110419032e-05, + "loss": 1.4471, + "step": 6171 + }, + { + "epoch": 0.4645928602344794, + "grad_norm": 5.9510345458984375, + "learning_rate": 5.8245867383352516e-05, + "loss": 2.0952, + "step": 6172 + }, + { + "epoch": 0.46466813451513955, + "grad_norm": 4.034305095672607, + "learning_rate": 5.8233843172321564e-05, + "loss": 1.952, + "step": 6173 + }, + { + "epoch": 0.4647434087957997, + "grad_norm": 4.026094436645508, + "learning_rate": 5.822181847181225e-05, + "loss": 1.5948, + "step": 6174 + }, + { + "epoch": 0.46481868307645985, + "grad_norm": 5.044445037841797, + "learning_rate": 5.8209793282539414e-05, + "loss": 1.9876, + "step": 6175 + }, + { + "epoch": 0.46489395735712, + "grad_norm": 4.365877151489258, + "learning_rate": 5.819776760521793e-05, + "loss": 1.942, + "step": 6176 + }, + { + "epoch": 0.46496923163778014, + "grad_norm": 5.579507827758789, + "learning_rate": 5.818574144056268e-05, + "loss": 1.8556, + "step": 6177 + }, + { + "epoch": 0.4650445059184403, + "grad_norm": 4.639021873474121, + "learning_rate": 5.8173714789288604e-05, + "loss": 1.7235, + "step": 6178 + }, + { + "epoch": 0.4651197801991005, + "grad_norm": 4.101806640625, + "learning_rate": 5.816168765211063e-05, + "loss": 1.8189, + "step": 6179 + }, + { + "epoch": 0.4651950544797606, + "grad_norm": 6.123523235321045, + "learning_rate": 5.814966002974374e-05, + "loss": 2.5076, + "step": 6180 + }, + { + "epoch": 0.4652703287604208, + "grad_norm": 5.491883754730225, + "learning_rate": 5.8137631922902944e-05, + "loss": 1.7997, + "step": 6181 + }, + { + "epoch": 0.4653456030410809, + "grad_norm": 5.771058559417725, + "learning_rate": 5.812560333230328e-05, + "loss": 1.803, + "step": 6182 + }, + { + "epoch": 0.4654208773217411, + "grad_norm": 4.064331531524658, + "learning_rate": 5.811357425865981e-05, + "loss": 1.9535, + "step": 6183 + }, + { + "epoch": 0.46549615160240126, + "grad_norm": 7.299584865570068, + "learning_rate": 5.8101544702687636e-05, + "loss": 2.3137, + "step": 6184 + }, + { + "epoch": 0.4655714258830614, + "grad_norm": 5.733707427978516, + "learning_rate": 5.808951466510188e-05, + "loss": 1.8183, + "step": 6185 + }, + { + "epoch": 0.46564670016372156, + "grad_norm": 4.6124749183654785, + "learning_rate": 5.807748414661769e-05, + "loss": 2.3463, + "step": 6186 + }, + { + "epoch": 0.46572197444438174, + "grad_norm": 4.0095534324646, + "learning_rate": 5.806545314795022e-05, + "loss": 1.6059, + "step": 6187 + }, + { + "epoch": 0.46579724872504186, + "grad_norm": 4.9799323081970215, + "learning_rate": 5.805342166981472e-05, + "loss": 1.7927, + "step": 6188 + }, + { + "epoch": 0.46587252300570203, + "grad_norm": 4.3228607177734375, + "learning_rate": 5.804138971292642e-05, + "loss": 1.8421, + "step": 6189 + }, + { + "epoch": 0.4659477972863622, + "grad_norm": 4.3227105140686035, + "learning_rate": 5.802935727800056e-05, + "loss": 1.5631, + "step": 6190 + }, + { + "epoch": 0.46602307156702233, + "grad_norm": 5.812267780303955, + "learning_rate": 5.801732436575245e-05, + "loss": 2.1101, + "step": 6191 + }, + { + "epoch": 0.4660983458476825, + "grad_norm": 7.444596767425537, + "learning_rate": 5.800529097689742e-05, + "loss": 1.6422, + "step": 6192 + }, + { + "epoch": 0.4661736201283426, + "grad_norm": 5.47690486907959, + "learning_rate": 5.799325711215079e-05, + "loss": 2.0732, + "step": 6193 + }, + { + "epoch": 0.4662488944090028, + "grad_norm": 4.791956424713135, + "learning_rate": 5.7981222772227985e-05, + "loss": 1.8203, + "step": 6194 + }, + { + "epoch": 0.466324168689663, + "grad_norm": 4.943728923797607, + "learning_rate": 5.796918795784437e-05, + "loss": 1.7633, + "step": 6195 + }, + { + "epoch": 0.4663994429703231, + "grad_norm": 4.251983642578125, + "learning_rate": 5.7957152669715406e-05, + "loss": 2.0494, + "step": 6196 + }, + { + "epoch": 0.4664747172509833, + "grad_norm": 3.836134672164917, + "learning_rate": 5.7945116908556554e-05, + "loss": 1.9262, + "step": 6197 + }, + { + "epoch": 0.46654999153164345, + "grad_norm": 4.378943920135498, + "learning_rate": 5.793308067508328e-05, + "loss": 1.4464, + "step": 6198 + }, + { + "epoch": 0.46662526581230357, + "grad_norm": 4.998561382293701, + "learning_rate": 5.7921043970011134e-05, + "loss": 2.0506, + "step": 6199 + }, + { + "epoch": 0.46670054009296374, + "grad_norm": 5.273262977600098, + "learning_rate": 5.790900679405565e-05, + "loss": 1.6084, + "step": 6200 + }, + { + "epoch": 0.46677581437362387, + "grad_norm": 5.735292434692383, + "learning_rate": 5.7896969147932414e-05, + "loss": 1.3559, + "step": 6201 + }, + { + "epoch": 0.46685108865428404, + "grad_norm": 4.062127590179443, + "learning_rate": 5.7884931032357015e-05, + "loss": 1.6339, + "step": 6202 + }, + { + "epoch": 0.4669263629349442, + "grad_norm": 5.837065696716309, + "learning_rate": 5.787289244804509e-05, + "loss": 1.7537, + "step": 6203 + }, + { + "epoch": 0.46700163721560434, + "grad_norm": 6.040595531463623, + "learning_rate": 5.786085339571229e-05, + "loss": 1.6798, + "step": 6204 + }, + { + "epoch": 0.4670769114962645, + "grad_norm": 5.041119575500488, + "learning_rate": 5.784881387607433e-05, + "loss": 2.5128, + "step": 6205 + }, + { + "epoch": 0.4671521857769247, + "grad_norm": 5.269215106964111, + "learning_rate": 5.783677388984689e-05, + "loss": 1.5857, + "step": 6206 + }, + { + "epoch": 0.4672274600575848, + "grad_norm": 6.77852201461792, + "learning_rate": 5.7824733437745725e-05, + "loss": 2.0272, + "step": 6207 + }, + { + "epoch": 0.467302734338245, + "grad_norm": 5.967972278594971, + "learning_rate": 5.781269252048662e-05, + "loss": 1.832, + "step": 6208 + }, + { + "epoch": 0.46737800861890516, + "grad_norm": 5.950111389160156, + "learning_rate": 5.780065113878537e-05, + "loss": 1.7212, + "step": 6209 + }, + { + "epoch": 0.4674532828995653, + "grad_norm": 4.375077247619629, + "learning_rate": 5.778860929335777e-05, + "loss": 1.8869, + "step": 6210 + }, + { + "epoch": 0.46752855718022546, + "grad_norm": 5.185276508331299, + "learning_rate": 5.77765669849197e-05, + "loss": 1.9397, + "step": 6211 + }, + { + "epoch": 0.4676038314608856, + "grad_norm": 4.100444793701172, + "learning_rate": 5.7764524214187044e-05, + "loss": 2.2237, + "step": 6212 + }, + { + "epoch": 0.46767910574154575, + "grad_norm": 4.235504150390625, + "learning_rate": 5.775248098187571e-05, + "loss": 1.5765, + "step": 6213 + }, + { + "epoch": 0.46775438002220593, + "grad_norm": 5.668567657470703, + "learning_rate": 5.774043728870162e-05, + "loss": 2.155, + "step": 6214 + }, + { + "epoch": 0.46782965430286605, + "grad_norm": 4.007643222808838, + "learning_rate": 5.772839313538073e-05, + "loss": 1.9581, + "step": 6215 + }, + { + "epoch": 0.4679049285835262, + "grad_norm": 7.456630706787109, + "learning_rate": 5.771634852262906e-05, + "loss": 2.2587, + "step": 6216 + }, + { + "epoch": 0.4679802028641864, + "grad_norm": 4.913119316101074, + "learning_rate": 5.77043034511626e-05, + "loss": 1.7175, + "step": 6217 + }, + { + "epoch": 0.4680554771448465, + "grad_norm": 6.211736679077148, + "learning_rate": 5.769225792169741e-05, + "loss": 1.6308, + "step": 6218 + }, + { + "epoch": 0.4681307514255067, + "grad_norm": 4.763060092926025, + "learning_rate": 5.768021193494957e-05, + "loss": 2.3263, + "step": 6219 + }, + { + "epoch": 0.4682060257061669, + "grad_norm": 4.36004638671875, + "learning_rate": 5.766816549163514e-05, + "loss": 2.1853, + "step": 6220 + }, + { + "epoch": 0.468281299986827, + "grad_norm": 5.033383369445801, + "learning_rate": 5.76561185924703e-05, + "loss": 2.0834, + "step": 6221 + }, + { + "epoch": 0.46835657426748717, + "grad_norm": 6.124760627746582, + "learning_rate": 5.764407123817116e-05, + "loss": 1.8302, + "step": 6222 + }, + { + "epoch": 0.4684318485481473, + "grad_norm": 5.201492786407471, + "learning_rate": 5.763202342945392e-05, + "loss": 1.9125, + "step": 6223 + }, + { + "epoch": 0.46850712282880747, + "grad_norm": 7.515429496765137, + "learning_rate": 5.761997516703479e-05, + "loss": 1.736, + "step": 6224 + }, + { + "epoch": 0.46858239710946764, + "grad_norm": 3.5897014141082764, + "learning_rate": 5.760792645163001e-05, + "loss": 1.8038, + "step": 6225 + }, + { + "epoch": 0.46865767139012776, + "grad_norm": 4.168317794799805, + "learning_rate": 5.7595877283955814e-05, + "loss": 1.8019, + "step": 6226 + }, + { + "epoch": 0.46873294567078794, + "grad_norm": 5.778686046600342, + "learning_rate": 5.7583827664728516e-05, + "loss": 2.2082, + "step": 6227 + }, + { + "epoch": 0.4688082199514481, + "grad_norm": 5.232272148132324, + "learning_rate": 5.7571777594664434e-05, + "loss": 1.7172, + "step": 6228 + }, + { + "epoch": 0.46888349423210823, + "grad_norm": 3.8763198852539062, + "learning_rate": 5.7559727074479896e-05, + "loss": 1.7522, + "step": 6229 + }, + { + "epoch": 0.4689587685127684, + "grad_norm": 3.6818134784698486, + "learning_rate": 5.754767610489127e-05, + "loss": 1.8533, + "step": 6230 + }, + { + "epoch": 0.46903404279342853, + "grad_norm": 4.650472640991211, + "learning_rate": 5.753562468661497e-05, + "loss": 1.889, + "step": 6231 + }, + { + "epoch": 0.4691093170740887, + "grad_norm": 5.427741527557373, + "learning_rate": 5.752357282036738e-05, + "loss": 2.1, + "step": 6232 + }, + { + "epoch": 0.4691845913547489, + "grad_norm": 4.919838905334473, + "learning_rate": 5.751152050686498e-05, + "loss": 2.0246, + "step": 6233 + }, + { + "epoch": 0.469259865635409, + "grad_norm": 3.6855101585388184, + "learning_rate": 5.749946774682425e-05, + "loss": 1.5149, + "step": 6234 + }, + { + "epoch": 0.4693351399160692, + "grad_norm": 4.93560791015625, + "learning_rate": 5.7487414540961684e-05, + "loss": 1.8101, + "step": 6235 + }, + { + "epoch": 0.46941041419672935, + "grad_norm": 3.966005325317383, + "learning_rate": 5.7475360889993793e-05, + "loss": 1.6947, + "step": 6236 + }, + { + "epoch": 0.4694856884773895, + "grad_norm": 3.5620124340057373, + "learning_rate": 5.746330679463714e-05, + "loss": 1.7124, + "step": 6237 + }, + { + "epoch": 0.46956096275804965, + "grad_norm": 4.8148112297058105, + "learning_rate": 5.745125225560833e-05, + "loss": 2.0517, + "step": 6238 + }, + { + "epoch": 0.4696362370387098, + "grad_norm": 4.781932353973389, + "learning_rate": 5.743919727362395e-05, + "loss": 1.5108, + "step": 6239 + }, + { + "epoch": 0.46971151131936995, + "grad_norm": 4.899031162261963, + "learning_rate": 5.7427141849400625e-05, + "loss": 1.8786, + "step": 6240 + }, + { + "epoch": 0.4697867856000301, + "grad_norm": 4.892386436462402, + "learning_rate": 5.7415085983655036e-05, + "loss": 1.77, + "step": 6241 + }, + { + "epoch": 0.46986205988069024, + "grad_norm": 4.124124050140381, + "learning_rate": 5.740302967710387e-05, + "loss": 1.7454, + "step": 6242 + }, + { + "epoch": 0.4699373341613504, + "grad_norm": 5.4768805503845215, + "learning_rate": 5.739097293046382e-05, + "loss": 1.8605, + "step": 6243 + }, + { + "epoch": 0.4700126084420106, + "grad_norm": 4.206995010375977, + "learning_rate": 5.737891574445164e-05, + "loss": 2.0251, + "step": 6244 + }, + { + "epoch": 0.4700878827226707, + "grad_norm": 7.894028186798096, + "learning_rate": 5.736685811978409e-05, + "loss": 1.7893, + "step": 6245 + }, + { + "epoch": 0.4701631570033309, + "grad_norm": 4.827877521514893, + "learning_rate": 5.735480005717797e-05, + "loss": 2.1469, + "step": 6246 + }, + { + "epoch": 0.47023843128399107, + "grad_norm": 4.310880184173584, + "learning_rate": 5.7342741557350086e-05, + "loss": 1.8166, + "step": 6247 + }, + { + "epoch": 0.4703137055646512, + "grad_norm": 4.257561206817627, + "learning_rate": 5.733068262101728e-05, + "loss": 1.9137, + "step": 6248 + }, + { + "epoch": 0.47038897984531136, + "grad_norm": 5.7693657875061035, + "learning_rate": 5.7318623248896444e-05, + "loss": 1.7353, + "step": 6249 + }, + { + "epoch": 0.4704642541259715, + "grad_norm": 6.433371543884277, + "learning_rate": 5.730656344170444e-05, + "loss": 1.3982, + "step": 6250 + }, + { + "epoch": 0.47053952840663166, + "grad_norm": 4.9939470291137695, + "learning_rate": 5.729450320015821e-05, + "loss": 1.9409, + "step": 6251 + }, + { + "epoch": 0.47061480268729183, + "grad_norm": 4.814608097076416, + "learning_rate": 5.728244252497469e-05, + "loss": 1.7835, + "step": 6252 + }, + { + "epoch": 0.47069007696795195, + "grad_norm": 6.235801696777344, + "learning_rate": 5.7270381416870844e-05, + "loss": 1.5418, + "step": 6253 + }, + { + "epoch": 0.47076535124861213, + "grad_norm": 5.513654708862305, + "learning_rate": 5.7258319876563706e-05, + "loss": 1.7118, + "step": 6254 + }, + { + "epoch": 0.4708406255292723, + "grad_norm": 4.927364826202393, + "learning_rate": 5.724625790477026e-05, + "loss": 2.0866, + "step": 6255 + }, + { + "epoch": 0.4709158998099324, + "grad_norm": 5.673543930053711, + "learning_rate": 5.723419550220759e-05, + "loss": 1.9164, + "step": 6256 + }, + { + "epoch": 0.4709911740905926, + "grad_norm": 4.091364860534668, + "learning_rate": 5.7222132669592755e-05, + "loss": 1.7135, + "step": 6257 + }, + { + "epoch": 0.4710664483712528, + "grad_norm": 5.272764682769775, + "learning_rate": 5.7210069407642844e-05, + "loss": 1.6601, + "step": 6258 + }, + { + "epoch": 0.4711417226519129, + "grad_norm": 7.811646461486816, + "learning_rate": 5.719800571707501e-05, + "loss": 1.6027, + "step": 6259 + }, + { + "epoch": 0.4712169969325731, + "grad_norm": 3.998684883117676, + "learning_rate": 5.7185941598606366e-05, + "loss": 1.8905, + "step": 6260 + }, + { + "epoch": 0.4712922712132332, + "grad_norm": 3.674417495727539, + "learning_rate": 5.717387705295413e-05, + "loss": 1.7514, + "step": 6261 + }, + { + "epoch": 0.47136754549389337, + "grad_norm": 7.495507717132568, + "learning_rate": 5.716181208083548e-05, + "loss": 1.5572, + "step": 6262 + }, + { + "epoch": 0.47144281977455355, + "grad_norm": 7.455479145050049, + "learning_rate": 5.7149746682967665e-05, + "loss": 1.9323, + "step": 6263 + }, + { + "epoch": 0.47151809405521367, + "grad_norm": 6.661562919616699, + "learning_rate": 5.713768086006791e-05, + "loss": 1.622, + "step": 6264 + }, + { + "epoch": 0.47159336833587384, + "grad_norm": 4.245311737060547, + "learning_rate": 5.71256146128535e-05, + "loss": 1.7079, + "step": 6265 + }, + { + "epoch": 0.471668642616534, + "grad_norm": 4.117136478424072, + "learning_rate": 5.711354794204177e-05, + "loss": 1.7702, + "step": 6266 + }, + { + "epoch": 0.47174391689719414, + "grad_norm": 4.412699222564697, + "learning_rate": 5.7101480848350016e-05, + "loss": 1.7739, + "step": 6267 + }, + { + "epoch": 0.4718191911778543, + "grad_norm": 5.006507396697998, + "learning_rate": 5.70894133324956e-05, + "loss": 2.0974, + "step": 6268 + }, + { + "epoch": 0.4718944654585145, + "grad_norm": 4.3134355545043945, + "learning_rate": 5.707734539519591e-05, + "loss": 1.8849, + "step": 6269 + }, + { + "epoch": 0.4719697397391746, + "grad_norm": 3.327134132385254, + "learning_rate": 5.706527703716833e-05, + "loss": 1.622, + "step": 6270 + }, + { + "epoch": 0.4720450140198348, + "grad_norm": 4.596324443817139, + "learning_rate": 5.7053208259130296e-05, + "loss": 1.9659, + "step": 6271 + }, + { + "epoch": 0.4721202883004949, + "grad_norm": 4.160121917724609, + "learning_rate": 5.7041139061799285e-05, + "loss": 1.9261, + "step": 6272 + }, + { + "epoch": 0.4721955625811551, + "grad_norm": 4.924027442932129, + "learning_rate": 5.702906944589277e-05, + "loss": 1.4914, + "step": 6273 + }, + { + "epoch": 0.47227083686181526, + "grad_norm": 4.450446128845215, + "learning_rate": 5.7016999412128235e-05, + "loss": 1.7632, + "step": 6274 + }, + { + "epoch": 0.4723461111424754, + "grad_norm": 4.421329975128174, + "learning_rate": 5.7004928961223224e-05, + "loss": 2.0668, + "step": 6275 + }, + { + "epoch": 0.47242138542313555, + "grad_norm": 4.183878421783447, + "learning_rate": 5.699285809389526e-05, + "loss": 2.1978, + "step": 6276 + }, + { + "epoch": 0.47249665970379573, + "grad_norm": 3.641120433807373, + "learning_rate": 5.6980786810861974e-05, + "loss": 1.5163, + "step": 6277 + }, + { + "epoch": 0.47257193398445585, + "grad_norm": 5.562851428985596, + "learning_rate": 5.696871511284094e-05, + "loss": 1.8955, + "step": 6278 + }, + { + "epoch": 0.472647208265116, + "grad_norm": 4.711536884307861, + "learning_rate": 5.695664300054978e-05, + "loss": 1.9267, + "step": 6279 + }, + { + "epoch": 0.47272248254577615, + "grad_norm": 5.501180648803711, + "learning_rate": 5.694457047470616e-05, + "loss": 1.8941, + "step": 6280 + }, + { + "epoch": 0.4727977568264363, + "grad_norm": 5.5967817306518555, + "learning_rate": 5.6932497536027754e-05, + "loss": 1.7214, + "step": 6281 + }, + { + "epoch": 0.4728730311070965, + "grad_norm": 3.654402732849121, + "learning_rate": 5.6920424185232245e-05, + "loss": 1.8692, + "step": 6282 + }, + { + "epoch": 0.4729483053877566, + "grad_norm": 4.3889946937561035, + "learning_rate": 5.6908350423037394e-05, + "loss": 1.6314, + "step": 6283 + }, + { + "epoch": 0.4730235796684168, + "grad_norm": 4.013479232788086, + "learning_rate": 5.6896276250160915e-05, + "loss": 1.6361, + "step": 6284 + }, + { + "epoch": 0.47309885394907697, + "grad_norm": 5.948092937469482, + "learning_rate": 5.6884201667320626e-05, + "loss": 2.3148, + "step": 6285 + }, + { + "epoch": 0.4731741282297371, + "grad_norm": 4.355929851531982, + "learning_rate": 5.687212667523428e-05, + "loss": 1.969, + "step": 6286 + }, + { + "epoch": 0.47324940251039727, + "grad_norm": 5.659989833831787, + "learning_rate": 5.686005127461972e-05, + "loss": 1.8024, + "step": 6287 + }, + { + "epoch": 0.47332467679105744, + "grad_norm": 4.907593727111816, + "learning_rate": 5.684797546619479e-05, + "loss": 2.0196, + "step": 6288 + }, + { + "epoch": 0.47339995107171756, + "grad_norm": 4.597088813781738, + "learning_rate": 5.683589925067738e-05, + "loss": 1.7223, + "step": 6289 + }, + { + "epoch": 0.47347522535237774, + "grad_norm": 4.602256774902344, + "learning_rate": 5.6823822628785384e-05, + "loss": 1.6796, + "step": 6290 + }, + { + "epoch": 0.47355049963303786, + "grad_norm": 6.001948356628418, + "learning_rate": 5.68117456012367e-05, + "loss": 1.9124, + "step": 6291 + }, + { + "epoch": 0.47362577391369803, + "grad_norm": 4.915574550628662, + "learning_rate": 5.679966816874929e-05, + "loss": 1.8903, + "step": 6292 + }, + { + "epoch": 0.4737010481943582, + "grad_norm": 4.261869430541992, + "learning_rate": 5.678759033204111e-05, + "loss": 1.959, + "step": 6293 + }, + { + "epoch": 0.47377632247501833, + "grad_norm": 3.481807231903076, + "learning_rate": 5.677551209183016e-05, + "loss": 1.93, + "step": 6294 + }, + { + "epoch": 0.4738515967556785, + "grad_norm": 6.329556941986084, + "learning_rate": 5.676343344883447e-05, + "loss": 1.5879, + "step": 6295 + }, + { + "epoch": 0.4739268710363387, + "grad_norm": 3.572904348373413, + "learning_rate": 5.675135440377206e-05, + "loss": 1.8167, + "step": 6296 + }, + { + "epoch": 0.4740021453169988, + "grad_norm": 4.3822197914123535, + "learning_rate": 5.673927495736101e-05, + "loss": 1.7666, + "step": 6297 + }, + { + "epoch": 0.474077419597659, + "grad_norm": 4.4206037521362305, + "learning_rate": 5.672719511031939e-05, + "loss": 1.9821, + "step": 6298 + }, + { + "epoch": 0.4741526938783191, + "grad_norm": 3.5047097206115723, + "learning_rate": 5.6715114863365315e-05, + "loss": 1.711, + "step": 6299 + }, + { + "epoch": 0.4742279681589793, + "grad_norm": 4.298370838165283, + "learning_rate": 5.6703034217216946e-05, + "loss": 1.9796, + "step": 6300 + }, + { + "epoch": 0.47430324243963945, + "grad_norm": 6.757082462310791, + "learning_rate": 5.669095317259243e-05, + "loss": 1.9949, + "step": 6301 + }, + { + "epoch": 0.47437851672029957, + "grad_norm": 6.489284038543701, + "learning_rate": 5.667887173020994e-05, + "loss": 1.6705, + "step": 6302 + }, + { + "epoch": 0.47445379100095975, + "grad_norm": 5.902678489685059, + "learning_rate": 5.6666789890787675e-05, + "loss": 1.6508, + "step": 6303 + }, + { + "epoch": 0.4745290652816199, + "grad_norm": 4.955329895019531, + "learning_rate": 5.6654707655043894e-05, + "loss": 1.8029, + "step": 6304 + }, + { + "epoch": 0.47460433956228004, + "grad_norm": 6.021631240844727, + "learning_rate": 5.6642625023696825e-05, + "loss": 1.932, + "step": 6305 + }, + { + "epoch": 0.4746796138429402, + "grad_norm": 5.821576118469238, + "learning_rate": 5.663054199746477e-05, + "loss": 2.217, + "step": 6306 + }, + { + "epoch": 0.4747548881236004, + "grad_norm": 3.9503846168518066, + "learning_rate": 5.661845857706602e-05, + "loss": 2.0514, + "step": 6307 + }, + { + "epoch": 0.4748301624042605, + "grad_norm": 5.549496650695801, + "learning_rate": 5.6606374763218905e-05, + "loss": 1.8548, + "step": 6308 + }, + { + "epoch": 0.4749054366849207, + "grad_norm": 6.011651039123535, + "learning_rate": 5.659429055664176e-05, + "loss": 1.8539, + "step": 6309 + }, + { + "epoch": 0.4749807109655808, + "grad_norm": 4.154627799987793, + "learning_rate": 5.658220595805295e-05, + "loss": 1.964, + "step": 6310 + }, + { + "epoch": 0.475055985246241, + "grad_norm": 4.367033004760742, + "learning_rate": 5.6570120968170894e-05, + "loss": 1.9429, + "step": 6311 + }, + { + "epoch": 0.47513125952690116, + "grad_norm": 4.456181049346924, + "learning_rate": 5.6558035587714e-05, + "loss": 2.0603, + "step": 6312 + }, + { + "epoch": 0.4752065338075613, + "grad_norm": 3.9151809215545654, + "learning_rate": 5.65459498174007e-05, + "loss": 1.9993, + "step": 6313 + }, + { + "epoch": 0.47528180808822146, + "grad_norm": 7.8175740242004395, + "learning_rate": 5.6533863657949474e-05, + "loss": 2.0334, + "step": 6314 + }, + { + "epoch": 0.47535708236888163, + "grad_norm": 4.566813945770264, + "learning_rate": 5.652177711007878e-05, + "loss": 1.775, + "step": 6315 + }, + { + "epoch": 0.47543235664954175, + "grad_norm": 6.143244743347168, + "learning_rate": 5.650969017450717e-05, + "loss": 1.9767, + "step": 6316 + }, + { + "epoch": 0.47550763093020193, + "grad_norm": 5.214820861816406, + "learning_rate": 5.6497602851953156e-05, + "loss": 1.9134, + "step": 6317 + }, + { + "epoch": 0.4755829052108621, + "grad_norm": 3.975825309753418, + "learning_rate": 5.6485515143135294e-05, + "loss": 1.8284, + "step": 6318 + }, + { + "epoch": 0.4756581794915222, + "grad_norm": 4.907965183258057, + "learning_rate": 5.6473427048772165e-05, + "loss": 1.63, + "step": 6319 + }, + { + "epoch": 0.4757334537721824, + "grad_norm": 6.942389011383057, + "learning_rate": 5.646133856958237e-05, + "loss": 1.8056, + "step": 6320 + }, + { + "epoch": 0.4758087280528425, + "grad_norm": 4.986257553100586, + "learning_rate": 5.6449249706284534e-05, + "loss": 1.7189, + "step": 6321 + }, + { + "epoch": 0.4758840023335027, + "grad_norm": 5.157688140869141, + "learning_rate": 5.643716045959732e-05, + "loss": 1.7119, + "step": 6322 + }, + { + "epoch": 0.4759592766141629, + "grad_norm": 5.648739337921143, + "learning_rate": 5.642507083023938e-05, + "loss": 2.3893, + "step": 6323 + }, + { + "epoch": 0.476034550894823, + "grad_norm": 4.401586532592773, + "learning_rate": 5.6412980818929406e-05, + "loss": 2.0876, + "step": 6324 + }, + { + "epoch": 0.47610982517548317, + "grad_norm": 5.251168727874756, + "learning_rate": 5.640089042638614e-05, + "loss": 1.8618, + "step": 6325 + }, + { + "epoch": 0.47618509945614335, + "grad_norm": 4.695756912231445, + "learning_rate": 5.6388799653328284e-05, + "loss": 1.5462, + "step": 6326 + }, + { + "epoch": 0.47626037373680347, + "grad_norm": 4.998029708862305, + "learning_rate": 5.637670850047464e-05, + "loss": 1.927, + "step": 6327 + }, + { + "epoch": 0.47633564801746364, + "grad_norm": 5.373469352722168, + "learning_rate": 5.6364616968543973e-05, + "loss": 1.9827, + "step": 6328 + }, + { + "epoch": 0.47641092229812376, + "grad_norm": 5.843819618225098, + "learning_rate": 5.635252505825508e-05, + "loss": 1.6792, + "step": 6329 + }, + { + "epoch": 0.47648619657878394, + "grad_norm": 4.7509684562683105, + "learning_rate": 5.63404327703268e-05, + "loss": 1.8805, + "step": 6330 + }, + { + "epoch": 0.4765614708594441, + "grad_norm": 4.758246421813965, + "learning_rate": 5.632834010547798e-05, + "loss": 1.7291, + "step": 6331 + }, + { + "epoch": 0.47663674514010423, + "grad_norm": 5.0231032371521, + "learning_rate": 5.6316247064427504e-05, + "loss": 1.8874, + "step": 6332 + }, + { + "epoch": 0.4767120194207644, + "grad_norm": 3.9548182487487793, + "learning_rate": 5.630415364789426e-05, + "loss": 1.8076, + "step": 6333 + }, + { + "epoch": 0.4767872937014246, + "grad_norm": 6.8431243896484375, + "learning_rate": 5.629205985659718e-05, + "loss": 1.7819, + "step": 6334 + }, + { + "epoch": 0.4768625679820847, + "grad_norm": 3.5298619270324707, + "learning_rate": 5.62799656912552e-05, + "loss": 1.8879, + "step": 6335 + }, + { + "epoch": 0.4769378422627449, + "grad_norm": 6.355266094207764, + "learning_rate": 5.626787115258726e-05, + "loss": 2.1417, + "step": 6336 + }, + { + "epoch": 0.47701311654340506, + "grad_norm": 6.099227428436279, + "learning_rate": 5.6255776241312374e-05, + "loss": 1.4807, + "step": 6337 + }, + { + "epoch": 0.4770883908240652, + "grad_norm": 4.470182418823242, + "learning_rate": 5.6243680958149525e-05, + "loss": 1.5581, + "step": 6338 + }, + { + "epoch": 0.47716366510472535, + "grad_norm": 6.215469837188721, + "learning_rate": 5.623158530381778e-05, + "loss": 1.8208, + "step": 6339 + }, + { + "epoch": 0.4772389393853855, + "grad_norm": 4.599560260772705, + "learning_rate": 5.621948927903616e-05, + "loss": 2.3083, + "step": 6340 + }, + { + "epoch": 0.47731421366604565, + "grad_norm": 4.091424465179443, + "learning_rate": 5.6207392884523755e-05, + "loss": 1.5819, + "step": 6341 + }, + { + "epoch": 0.4773894879467058, + "grad_norm": 6.120739936828613, + "learning_rate": 5.619529612099965e-05, + "loss": 1.5834, + "step": 6342 + }, + { + "epoch": 0.47746476222736595, + "grad_norm": 3.402621269226074, + "learning_rate": 5.618319898918296e-05, + "loss": 1.8164, + "step": 6343 + }, + { + "epoch": 0.4775400365080261, + "grad_norm": 6.812966823577881, + "learning_rate": 5.6171101489792835e-05, + "loss": 2.4337, + "step": 6344 + }, + { + "epoch": 0.4776153107886863, + "grad_norm": 4.409587860107422, + "learning_rate": 5.6159003623548444e-05, + "loss": 1.6103, + "step": 6345 + }, + { + "epoch": 0.4776905850693464, + "grad_norm": 3.555950403213501, + "learning_rate": 5.6146905391168946e-05, + "loss": 1.6161, + "step": 6346 + }, + { + "epoch": 0.4777658593500066, + "grad_norm": 5.879570960998535, + "learning_rate": 5.613480679337357e-05, + "loss": 1.8633, + "step": 6347 + }, + { + "epoch": 0.4778411336306667, + "grad_norm": 4.932384490966797, + "learning_rate": 5.6122707830881516e-05, + "loss": 1.9702, + "step": 6348 + }, + { + "epoch": 0.4779164079113269, + "grad_norm": 6.0971221923828125, + "learning_rate": 5.6110608504412064e-05, + "loss": 2.1296, + "step": 6349 + }, + { + "epoch": 0.47799168219198707, + "grad_norm": 5.147326946258545, + "learning_rate": 5.609850881468447e-05, + "loss": 1.8963, + "step": 6350 + }, + { + "epoch": 0.4780669564726472, + "grad_norm": 5.641437530517578, + "learning_rate": 5.608640876241803e-05, + "loss": 1.6259, + "step": 6351 + }, + { + "epoch": 0.47814223075330736, + "grad_norm": 5.286806106567383, + "learning_rate": 5.607430834833204e-05, + "loss": 1.851, + "step": 6352 + }, + { + "epoch": 0.47821750503396754, + "grad_norm": 6.859378337860107, + "learning_rate": 5.606220757314585e-05, + "loss": 1.3199, + "step": 6353 + }, + { + "epoch": 0.47829277931462766, + "grad_norm": 5.167545795440674, + "learning_rate": 5.605010643757881e-05, + "loss": 1.8508, + "step": 6354 + }, + { + "epoch": 0.47836805359528783, + "grad_norm": 4.577126502990723, + "learning_rate": 5.603800494235032e-05, + "loss": 1.8609, + "step": 6355 + }, + { + "epoch": 0.478443327875948, + "grad_norm": 4.978440284729004, + "learning_rate": 5.6025903088179745e-05, + "loss": 1.8213, + "step": 6356 + }, + { + "epoch": 0.47851860215660813, + "grad_norm": 5.525322914123535, + "learning_rate": 5.601380087578654e-05, + "loss": 1.8484, + "step": 6357 + }, + { + "epoch": 0.4785938764372683, + "grad_norm": 5.149740695953369, + "learning_rate": 5.600169830589012e-05, + "loss": 1.4979, + "step": 6358 + }, + { + "epoch": 0.4786691507179284, + "grad_norm": 5.002951145172119, + "learning_rate": 5.5989595379209945e-05, + "loss": 1.5378, + "step": 6359 + }, + { + "epoch": 0.4787444249985886, + "grad_norm": 5.281551361083984, + "learning_rate": 5.597749209646551e-05, + "loss": 2.0733, + "step": 6360 + }, + { + "epoch": 0.4788196992792488, + "grad_norm": 4.627199649810791, + "learning_rate": 5.5965388458376337e-05, + "loss": 1.7909, + "step": 6361 + }, + { + "epoch": 0.4788949735599089, + "grad_norm": 4.703307151794434, + "learning_rate": 5.595328446566193e-05, + "loss": 1.7941, + "step": 6362 + }, + { + "epoch": 0.4789702478405691, + "grad_norm": 5.197971343994141, + "learning_rate": 5.5941180119041845e-05, + "loss": 2.0226, + "step": 6363 + }, + { + "epoch": 0.47904552212122925, + "grad_norm": 5.668840408325195, + "learning_rate": 5.592907541923564e-05, + "loss": 1.6802, + "step": 6364 + }, + { + "epoch": 0.47912079640188937, + "grad_norm": 5.576542377471924, + "learning_rate": 5.5916970366962914e-05, + "loss": 1.9668, + "step": 6365 + }, + { + "epoch": 0.47919607068254955, + "grad_norm": 4.479678153991699, + "learning_rate": 5.5904864962943296e-05, + "loss": 1.7578, + "step": 6366 + }, + { + "epoch": 0.4792713449632097, + "grad_norm": 4.594120979309082, + "learning_rate": 5.5892759207896386e-05, + "loss": 1.9269, + "step": 6367 + }, + { + "epoch": 0.47934661924386984, + "grad_norm": 4.9382829666137695, + "learning_rate": 5.588065310254185e-05, + "loss": 1.7342, + "step": 6368 + }, + { + "epoch": 0.47942189352453, + "grad_norm": 6.230596542358398, + "learning_rate": 5.586854664759935e-05, + "loss": 1.8827, + "step": 6369 + }, + { + "epoch": 0.47949716780519014, + "grad_norm": 4.649636268615723, + "learning_rate": 5.585643984378861e-05, + "loss": 1.6683, + "step": 6370 + }, + { + "epoch": 0.4795724420858503, + "grad_norm": 3.524915933609009, + "learning_rate": 5.58443326918293e-05, + "loss": 1.9179, + "step": 6371 + }, + { + "epoch": 0.4796477163665105, + "grad_norm": 5.869900226593018, + "learning_rate": 5.58322251924412e-05, + "loss": 1.8786, + "step": 6372 + }, + { + "epoch": 0.4797229906471706, + "grad_norm": 4.81140661239624, + "learning_rate": 5.5820117346344045e-05, + "loss": 1.703, + "step": 6373 + }, + { + "epoch": 0.4797982649278308, + "grad_norm": 5.101838111877441, + "learning_rate": 5.5808009154257625e-05, + "loss": 1.9166, + "step": 6374 + }, + { + "epoch": 0.47987353920849096, + "grad_norm": 4.833708763122559, + "learning_rate": 5.579590061690171e-05, + "loss": 1.8307, + "step": 6375 + }, + { + "epoch": 0.4799488134891511, + "grad_norm": 4.464544773101807, + "learning_rate": 5.5783791734996124e-05, + "loss": 2.0349, + "step": 6376 + }, + { + "epoch": 0.48002408776981126, + "grad_norm": 7.395505905151367, + "learning_rate": 5.5771682509260735e-05, + "loss": 1.4664, + "step": 6377 + }, + { + "epoch": 0.4800993620504714, + "grad_norm": 4.106167316436768, + "learning_rate": 5.5759572940415386e-05, + "loss": 1.7813, + "step": 6378 + }, + { + "epoch": 0.48017463633113155, + "grad_norm": 4.954022407531738, + "learning_rate": 5.574746302917994e-05, + "loss": 1.6302, + "step": 6379 + }, + { + "epoch": 0.48024991061179173, + "grad_norm": 4.396544456481934, + "learning_rate": 5.5735352776274306e-05, + "loss": 2.1225, + "step": 6380 + }, + { + "epoch": 0.48032518489245185, + "grad_norm": 4.40151309967041, + "learning_rate": 5.57232421824184e-05, + "loss": 1.5712, + "step": 6381 + }, + { + "epoch": 0.480400459173112, + "grad_norm": 5.896833896636963, + "learning_rate": 5.5711131248332185e-05, + "loss": 1.7259, + "step": 6382 + }, + { + "epoch": 0.4804757334537722, + "grad_norm": 4.436867713928223, + "learning_rate": 5.56990199747356e-05, + "loss": 1.6595, + "step": 6383 + }, + { + "epoch": 0.4805510077344323, + "grad_norm": 5.116188049316406, + "learning_rate": 5.568690836234861e-05, + "loss": 1.7467, + "step": 6384 + }, + { + "epoch": 0.4806262820150925, + "grad_norm": 4.1552910804748535, + "learning_rate": 5.567479641189126e-05, + "loss": 1.7323, + "step": 6385 + }, + { + "epoch": 0.4807015562957527, + "grad_norm": 5.479462623596191, + "learning_rate": 5.5662684124083545e-05, + "loss": 1.9029, + "step": 6386 + }, + { + "epoch": 0.4807768305764128, + "grad_norm": 4.882214069366455, + "learning_rate": 5.56505714996455e-05, + "loss": 1.7055, + "step": 6387 + }, + { + "epoch": 0.48085210485707297, + "grad_norm": 4.657861709594727, + "learning_rate": 5.563845853929718e-05, + "loss": 1.6248, + "step": 6388 + }, + { + "epoch": 0.4809273791377331, + "grad_norm": 4.639128684997559, + "learning_rate": 5.562634524375869e-05, + "loss": 1.7625, + "step": 6389 + }, + { + "epoch": 0.48100265341839327, + "grad_norm": 5.799036502838135, + "learning_rate": 5.5614231613750124e-05, + "loss": 1.9053, + "step": 6390 + }, + { + "epoch": 0.48107792769905344, + "grad_norm": 4.536380767822266, + "learning_rate": 5.560211764999159e-05, + "loss": 1.7578, + "step": 6391 + }, + { + "epoch": 0.48115320197971356, + "grad_norm": 4.304374694824219, + "learning_rate": 5.559000335320325e-05, + "loss": 1.5284, + "step": 6392 + }, + { + "epoch": 0.48122847626037374, + "grad_norm": 4.0411295890808105, + "learning_rate": 5.557788872410523e-05, + "loss": 1.6953, + "step": 6393 + }, + { + "epoch": 0.4813037505410339, + "grad_norm": 4.699453353881836, + "learning_rate": 5.556577376341774e-05, + "loss": 1.7151, + "step": 6394 + }, + { + "epoch": 0.48137902482169403, + "grad_norm": 6.213388442993164, + "learning_rate": 5.555365847186097e-05, + "loss": 2.0732, + "step": 6395 + }, + { + "epoch": 0.4814542991023542, + "grad_norm": 4.9000983238220215, + "learning_rate": 5.5541542850155136e-05, + "loss": 1.7422, + "step": 6396 + }, + { + "epoch": 0.48152957338301433, + "grad_norm": 6.778110980987549, + "learning_rate": 5.5529426899020484e-05, + "loss": 2.0417, + "step": 6397 + }, + { + "epoch": 0.4816048476636745, + "grad_norm": 4.21738338470459, + "learning_rate": 5.551731061917727e-05, + "loss": 1.8993, + "step": 6398 + }, + { + "epoch": 0.4816801219443347, + "grad_norm": 6.4587273597717285, + "learning_rate": 5.5505194011345764e-05, + "loss": 2.1121, + "step": 6399 + }, + { + "epoch": 0.4817553962249948, + "grad_norm": 4.497090816497803, + "learning_rate": 5.549307707624627e-05, + "loss": 1.7928, + "step": 6400 + }, + { + "epoch": 0.481830670505655, + "grad_norm": 4.304078578948975, + "learning_rate": 5.548095981459911e-05, + "loss": 1.7145, + "step": 6401 + }, + { + "epoch": 0.48190594478631515, + "grad_norm": 4.139160633087158, + "learning_rate": 5.5468842227124615e-05, + "loss": 1.7378, + "step": 6402 + }, + { + "epoch": 0.4819812190669753, + "grad_norm": 4.138605117797852, + "learning_rate": 5.545672431454314e-05, + "loss": 1.5571, + "step": 6403 + }, + { + "epoch": 0.48205649334763545, + "grad_norm": 6.147515296936035, + "learning_rate": 5.5444606077575056e-05, + "loss": 1.883, + "step": 6404 + }, + { + "epoch": 0.4821317676282956, + "grad_norm": 4.5211992263793945, + "learning_rate": 5.5432487516940765e-05, + "loss": 1.5521, + "step": 6405 + }, + { + "epoch": 0.48220704190895575, + "grad_norm": 5.152327537536621, + "learning_rate": 5.5420368633360695e-05, + "loss": 1.5057, + "step": 6406 + }, + { + "epoch": 0.4822823161896159, + "grad_norm": 4.975186347961426, + "learning_rate": 5.5408249427555245e-05, + "loss": 2.2254, + "step": 6407 + }, + { + "epoch": 0.48235759047027604, + "grad_norm": 3.747297525405884, + "learning_rate": 5.5396129900244894e-05, + "loss": 1.7276, + "step": 6408 + }, + { + "epoch": 0.4824328647509362, + "grad_norm": 6.560164928436279, + "learning_rate": 5.53840100521501e-05, + "loss": 1.8389, + "step": 6409 + }, + { + "epoch": 0.4825081390315964, + "grad_norm": 4.867674827575684, + "learning_rate": 5.5371889883991354e-05, + "loss": 1.805, + "step": 6410 + }, + { + "epoch": 0.4825834133122565, + "grad_norm": 4.912373065948486, + "learning_rate": 5.535976939648917e-05, + "loss": 1.7426, + "step": 6411 + }, + { + "epoch": 0.4826586875929167, + "grad_norm": 5.571805000305176, + "learning_rate": 5.5347648590364085e-05, + "loss": 1.8546, + "step": 6412 + }, + { + "epoch": 0.48273396187357687, + "grad_norm": 4.405371189117432, + "learning_rate": 5.5335527466336625e-05, + "loss": 1.8094, + "step": 6413 + }, + { + "epoch": 0.482809236154237, + "grad_norm": 8.700578689575195, + "learning_rate": 5.5323406025127364e-05, + "loss": 2.0228, + "step": 6414 + }, + { + "epoch": 0.48288451043489716, + "grad_norm": 3.563631534576416, + "learning_rate": 5.5311284267456884e-05, + "loss": 1.7234, + "step": 6415 + }, + { + "epoch": 0.48295978471555734, + "grad_norm": 6.115181922912598, + "learning_rate": 5.5299162194045806e-05, + "loss": 1.6211, + "step": 6416 + }, + { + "epoch": 0.48303505899621746, + "grad_norm": 3.7864468097686768, + "learning_rate": 5.528703980561474e-05, + "loss": 1.7361, + "step": 6417 + }, + { + "epoch": 0.48311033327687763, + "grad_norm": 5.8733601570129395, + "learning_rate": 5.527491710288433e-05, + "loss": 1.7937, + "step": 6418 + }, + { + "epoch": 0.48318560755753776, + "grad_norm": 5.387686729431152, + "learning_rate": 5.526279408657522e-05, + "loss": 2.0094, + "step": 6419 + }, + { + "epoch": 0.48326088183819793, + "grad_norm": 5.263628005981445, + "learning_rate": 5.525067075740812e-05, + "loss": 1.7669, + "step": 6420 + }, + { + "epoch": 0.4833361561188581, + "grad_norm": 4.078954696655273, + "learning_rate": 5.523854711610369e-05, + "loss": 1.7247, + "step": 6421 + }, + { + "epoch": 0.4834114303995182, + "grad_norm": 5.080759525299072, + "learning_rate": 5.522642316338268e-05, + "loss": 1.9066, + "step": 6422 + }, + { + "epoch": 0.4834867046801784, + "grad_norm": 6.2059125900268555, + "learning_rate": 5.52142988999658e-05, + "loss": 1.4491, + "step": 6423 + }, + { + "epoch": 0.4835619789608386, + "grad_norm": 5.986764907836914, + "learning_rate": 5.520217432657382e-05, + "loss": 1.7427, + "step": 6424 + }, + { + "epoch": 0.4836372532414987, + "grad_norm": 5.207749843597412, + "learning_rate": 5.519004944392751e-05, + "loss": 1.7915, + "step": 6425 + }, + { + "epoch": 0.4837125275221589, + "grad_norm": 6.247087478637695, + "learning_rate": 5.517792425274762e-05, + "loss": 1.6063, + "step": 6426 + }, + { + "epoch": 0.483787801802819, + "grad_norm": 4.744610786437988, + "learning_rate": 5.516579875375502e-05, + "loss": 1.938, + "step": 6427 + }, + { + "epoch": 0.48386307608347917, + "grad_norm": 5.761016368865967, + "learning_rate": 5.51536729476705e-05, + "loss": 1.474, + "step": 6428 + }, + { + "epoch": 0.48393835036413935, + "grad_norm": 6.9674482345581055, + "learning_rate": 5.5141546835214906e-05, + "loss": 1.9723, + "step": 6429 + }, + { + "epoch": 0.48401362464479947, + "grad_norm": 4.266446113586426, + "learning_rate": 5.5129420417109126e-05, + "loss": 2.0187, + "step": 6430 + }, + { + "epoch": 0.48408889892545964, + "grad_norm": 5.372589111328125, + "learning_rate": 5.511729369407399e-05, + "loss": 1.7445, + "step": 6431 + }, + { + "epoch": 0.4841641732061198, + "grad_norm": 4.984195709228516, + "learning_rate": 5.5105166666830444e-05, + "loss": 2.0603, + "step": 6432 + }, + { + "epoch": 0.48423944748677994, + "grad_norm": 4.876062870025635, + "learning_rate": 5.509303933609937e-05, + "loss": 1.6717, + "step": 6433 + }, + { + "epoch": 0.4843147217674401, + "grad_norm": 4.779151916503906, + "learning_rate": 5.5080911702601734e-05, + "loss": 1.7792, + "step": 6434 + }, + { + "epoch": 0.4843899960481003, + "grad_norm": 4.071149826049805, + "learning_rate": 5.5068783767058486e-05, + "loss": 2.0487, + "step": 6435 + }, + { + "epoch": 0.4844652703287604, + "grad_norm": 5.55618953704834, + "learning_rate": 5.5056655530190584e-05, + "loss": 1.6815, + "step": 6436 + }, + { + "epoch": 0.4845405446094206, + "grad_norm": 6.163825035095215, + "learning_rate": 5.504452699271901e-05, + "loss": 2.1977, + "step": 6437 + }, + { + "epoch": 0.4846158188900807, + "grad_norm": 5.055025100708008, + "learning_rate": 5.5032398155364775e-05, + "loss": 1.7242, + "step": 6438 + }, + { + "epoch": 0.4846910931707409, + "grad_norm": 4.641054153442383, + "learning_rate": 5.5020269018848923e-05, + "loss": 1.9273, + "step": 6439 + }, + { + "epoch": 0.48476636745140106, + "grad_norm": 4.056416034698486, + "learning_rate": 5.5008139583892485e-05, + "loss": 1.8215, + "step": 6440 + }, + { + "epoch": 0.4848416417320612, + "grad_norm": 6.019015789031982, + "learning_rate": 5.4996009851216515e-05, + "loss": 1.6476, + "step": 6441 + }, + { + "epoch": 0.48491691601272136, + "grad_norm": 4.817643642425537, + "learning_rate": 5.49838798215421e-05, + "loss": 1.9228, + "step": 6442 + }, + { + "epoch": 0.48499219029338153, + "grad_norm": 5.686418533325195, + "learning_rate": 5.49717494955903e-05, + "loss": 1.7242, + "step": 6443 + }, + { + "epoch": 0.48506746457404165, + "grad_norm": 5.147515773773193, + "learning_rate": 5.4959618874082284e-05, + "loss": 1.4893, + "step": 6444 + }, + { + "epoch": 0.4851427388547018, + "grad_norm": 5.002187728881836, + "learning_rate": 5.494748795773915e-05, + "loss": 1.7204, + "step": 6445 + }, + { + "epoch": 0.485218013135362, + "grad_norm": 5.279694557189941, + "learning_rate": 5.493535674728206e-05, + "loss": 1.6255, + "step": 6446 + }, + { + "epoch": 0.4852932874160221, + "grad_norm": 3.8234703540802, + "learning_rate": 5.492322524343218e-05, + "loss": 1.5834, + "step": 6447 + }, + { + "epoch": 0.4853685616966823, + "grad_norm": 4.459449291229248, + "learning_rate": 5.4911093446910654e-05, + "loss": 1.8874, + "step": 6448 + }, + { + "epoch": 0.4854438359773424, + "grad_norm": 5.751888275146484, + "learning_rate": 5.4898961358438726e-05, + "loss": 1.5702, + "step": 6449 + }, + { + "epoch": 0.4855191102580026, + "grad_norm": 4.8012776374816895, + "learning_rate": 5.488682897873761e-05, + "loss": 1.8261, + "step": 6450 + }, + { + "epoch": 0.48559438453866277, + "grad_norm": 6.291077613830566, + "learning_rate": 5.487469630852854e-05, + "loss": 1.7759, + "step": 6451 + }, + { + "epoch": 0.4856696588193229, + "grad_norm": 5.167383670806885, + "learning_rate": 5.486256334853276e-05, + "loss": 1.9764, + "step": 6452 + }, + { + "epoch": 0.48574493309998307, + "grad_norm": 4.0530924797058105, + "learning_rate": 5.4850430099471526e-05, + "loss": 1.8007, + "step": 6453 + }, + { + "epoch": 0.48582020738064324, + "grad_norm": 5.158261775970459, + "learning_rate": 5.483829656206615e-05, + "loss": 1.8812, + "step": 6454 + }, + { + "epoch": 0.48589548166130336, + "grad_norm": 5.156190395355225, + "learning_rate": 5.482616273703792e-05, + "loss": 1.7035, + "step": 6455 + }, + { + "epoch": 0.48597075594196354, + "grad_norm": 5.8077473640441895, + "learning_rate": 5.481402862510817e-05, + "loss": 1.8151, + "step": 6456 + }, + { + "epoch": 0.48604603022262366, + "grad_norm": 6.451691627502441, + "learning_rate": 5.480189422699824e-05, + "loss": 2.1713, + "step": 6457 + }, + { + "epoch": 0.48612130450328384, + "grad_norm": 4.777563571929932, + "learning_rate": 5.478975954342947e-05, + "loss": 1.8797, + "step": 6458 + }, + { + "epoch": 0.486196578783944, + "grad_norm": 6.515157222747803, + "learning_rate": 5.477762457512323e-05, + "loss": 2.2031, + "step": 6459 + }, + { + "epoch": 0.48627185306460413, + "grad_norm": 4.274823188781738, + "learning_rate": 5.476548932280092e-05, + "loss": 1.7447, + "step": 6460 + }, + { + "epoch": 0.4863471273452643, + "grad_norm": 6.736222743988037, + "learning_rate": 5.4753353787183945e-05, + "loss": 1.7745, + "step": 6461 + }, + { + "epoch": 0.4864224016259245, + "grad_norm": 7.330405235290527, + "learning_rate": 5.474121796899373e-05, + "loss": 2.3253, + "step": 6462 + }, + { + "epoch": 0.4864976759065846, + "grad_norm": 6.102550983428955, + "learning_rate": 5.4729081868951706e-05, + "loss": 1.689, + "step": 6463 + }, + { + "epoch": 0.4865729501872448, + "grad_norm": 4.754802703857422, + "learning_rate": 5.471694548777934e-05, + "loss": 1.6984, + "step": 6464 + }, + { + "epoch": 0.48664822446790496, + "grad_norm": 4.367542266845703, + "learning_rate": 5.4704808826198085e-05, + "loss": 1.8377, + "step": 6465 + }, + { + "epoch": 0.4867234987485651, + "grad_norm": 4.747076988220215, + "learning_rate": 5.4692671884929445e-05, + "loss": 1.7135, + "step": 6466 + }, + { + "epoch": 0.48679877302922525, + "grad_norm": 4.93787956237793, + "learning_rate": 5.4680534664694935e-05, + "loss": 2.2613, + "step": 6467 + }, + { + "epoch": 0.48687404730988537, + "grad_norm": 5.272012233734131, + "learning_rate": 5.466839716621608e-05, + "loss": 1.8556, + "step": 6468 + }, + { + "epoch": 0.48694932159054555, + "grad_norm": 4.7758564949035645, + "learning_rate": 5.4656259390214406e-05, + "loss": 1.611, + "step": 6469 + }, + { + "epoch": 0.4870245958712057, + "grad_norm": 5.5447096824646, + "learning_rate": 5.464412133741146e-05, + "loss": 1.5768, + "step": 6470 + }, + { + "epoch": 0.48709987015186584, + "grad_norm": 4.3626627922058105, + "learning_rate": 5.463198300852882e-05, + "loss": 1.8809, + "step": 6471 + }, + { + "epoch": 0.487175144432526, + "grad_norm": 5.845844268798828, + "learning_rate": 5.46198444042881e-05, + "loss": 1.6375, + "step": 6472 + }, + { + "epoch": 0.4872504187131862, + "grad_norm": 4.96574592590332, + "learning_rate": 5.460770552541088e-05, + "loss": 2.0007, + "step": 6473 + }, + { + "epoch": 0.4873256929938463, + "grad_norm": 5.362820148468018, + "learning_rate": 5.4595566372618775e-05, + "loss": 1.6671, + "step": 6474 + }, + { + "epoch": 0.4874009672745065, + "grad_norm": 4.479405879974365, + "learning_rate": 5.458342694663345e-05, + "loss": 1.64, + "step": 6475 + }, + { + "epoch": 0.4874762415551666, + "grad_norm": 4.854231834411621, + "learning_rate": 5.457128724817654e-05, + "loss": 2.5307, + "step": 6476 + }, + { + "epoch": 0.4875515158358268, + "grad_norm": 6.621882438659668, + "learning_rate": 5.455914727796971e-05, + "loss": 1.9304, + "step": 6477 + }, + { + "epoch": 0.48762679011648696, + "grad_norm": 6.617377758026123, + "learning_rate": 5.454700703673468e-05, + "loss": 2.0139, + "step": 6478 + }, + { + "epoch": 0.4877020643971471, + "grad_norm": 6.697943687438965, + "learning_rate": 5.453486652519311e-05, + "loss": 2.125, + "step": 6479 + }, + { + "epoch": 0.48777733867780726, + "grad_norm": 5.2741851806640625, + "learning_rate": 5.4522725744066735e-05, + "loss": 1.9523, + "step": 6480 + }, + { + "epoch": 0.48785261295846744, + "grad_norm": 5.0718793869018555, + "learning_rate": 5.451058469407729e-05, + "loss": 1.6653, + "step": 6481 + }, + { + "epoch": 0.48792788723912756, + "grad_norm": 6.898507595062256, + "learning_rate": 5.449844337594654e-05, + "loss": 1.8239, + "step": 6482 + }, + { + "epoch": 0.48800316151978773, + "grad_norm": 5.654229640960693, + "learning_rate": 5.448630179039622e-05, + "loss": 1.8119, + "step": 6483 + }, + { + "epoch": 0.4880784358004479, + "grad_norm": 4.211135387420654, + "learning_rate": 5.447415993814815e-05, + "loss": 1.6966, + "step": 6484 + }, + { + "epoch": 0.488153710081108, + "grad_norm": 5.623507022857666, + "learning_rate": 5.4462017819924106e-05, + "loss": 1.7453, + "step": 6485 + }, + { + "epoch": 0.4882289843617682, + "grad_norm": 4.709688663482666, + "learning_rate": 5.444987543644591e-05, + "loss": 2.1499, + "step": 6486 + }, + { + "epoch": 0.4883042586424283, + "grad_norm": 4.681134223937988, + "learning_rate": 5.443773278843538e-05, + "loss": 1.4425, + "step": 6487 + }, + { + "epoch": 0.4883795329230885, + "grad_norm": 7.0858612060546875, + "learning_rate": 5.4425589876614366e-05, + "loss": 1.9622, + "step": 6488 + }, + { + "epoch": 0.4884548072037487, + "grad_norm": 6.826773166656494, + "learning_rate": 5.441344670170474e-05, + "loss": 1.9337, + "step": 6489 + }, + { + "epoch": 0.4885300814844088, + "grad_norm": 5.040750980377197, + "learning_rate": 5.4401303264428364e-05, + "loss": 1.6698, + "step": 6490 + }, + { + "epoch": 0.48860535576506897, + "grad_norm": 4.9824628829956055, + "learning_rate": 5.438915956550714e-05, + "loss": 2.1412, + "step": 6491 + }, + { + "epoch": 0.48868063004572915, + "grad_norm": 5.018540859222412, + "learning_rate": 5.4377015605662986e-05, + "loss": 1.7639, + "step": 6492 + }, + { + "epoch": 0.48875590432638927, + "grad_norm": 5.104889392852783, + "learning_rate": 5.4364871385617786e-05, + "loss": 2.1173, + "step": 6493 + }, + { + "epoch": 0.48883117860704944, + "grad_norm": 5.299907207489014, + "learning_rate": 5.435272690609353e-05, + "loss": 2.0223, + "step": 6494 + }, + { + "epoch": 0.4889064528877096, + "grad_norm": 4.892288684844971, + "learning_rate": 5.434058216781214e-05, + "loss": 1.7359, + "step": 6495 + }, + { + "epoch": 0.48898172716836974, + "grad_norm": 5.492304801940918, + "learning_rate": 5.4328437171495604e-05, + "loss": 1.978, + "step": 6496 + }, + { + "epoch": 0.4890570014490299, + "grad_norm": 3.6452531814575195, + "learning_rate": 5.4316291917865883e-05, + "loss": 2.022, + "step": 6497 + }, + { + "epoch": 0.48913227572969004, + "grad_norm": 3.7482430934906006, + "learning_rate": 5.4304146407644994e-05, + "loss": 1.6646, + "step": 6498 + }, + { + "epoch": 0.4892075500103502, + "grad_norm": 6.455443859100342, + "learning_rate": 5.429200064155495e-05, + "loss": 1.9341, + "step": 6499 + }, + { + "epoch": 0.4892828242910104, + "grad_norm": 3.981693744659424, + "learning_rate": 5.427985462031779e-05, + "loss": 2.0385, + "step": 6500 + }, + { + "epoch": 0.4893580985716705, + "grad_norm": 4.170321464538574, + "learning_rate": 5.4267708344655556e-05, + "loss": 1.2996, + "step": 6501 + }, + { + "epoch": 0.4894333728523307, + "grad_norm": 4.601936340332031, + "learning_rate": 5.42555618152903e-05, + "loss": 1.9928, + "step": 6502 + }, + { + "epoch": 0.48950864713299086, + "grad_norm": 8.707098960876465, + "learning_rate": 5.424341503294411e-05, + "loss": 2.7734, + "step": 6503 + }, + { + "epoch": 0.489583921413651, + "grad_norm": 3.9739444255828857, + "learning_rate": 5.4231267998339066e-05, + "loss": 1.7302, + "step": 6504 + }, + { + "epoch": 0.48965919569431116, + "grad_norm": 5.3024702072143555, + "learning_rate": 5.421912071219728e-05, + "loss": 1.9438, + "step": 6505 + }, + { + "epoch": 0.4897344699749713, + "grad_norm": 5.4958648681640625, + "learning_rate": 5.420697317524088e-05, + "loss": 1.6608, + "step": 6506 + }, + { + "epoch": 0.48980974425563145, + "grad_norm": 3.842341899871826, + "learning_rate": 5.4194825388191995e-05, + "loss": 2.1779, + "step": 6507 + }, + { + "epoch": 0.4898850185362916, + "grad_norm": 4.176170349121094, + "learning_rate": 5.418267735177278e-05, + "loss": 2.0459, + "step": 6508 + }, + { + "epoch": 0.48996029281695175, + "grad_norm": 6.65064811706543, + "learning_rate": 5.41705290667054e-05, + "loss": 1.7718, + "step": 6509 + }, + { + "epoch": 0.4900355670976119, + "grad_norm": 6.64405632019043, + "learning_rate": 5.4158380533712016e-05, + "loss": 1.9684, + "step": 6510 + }, + { + "epoch": 0.4901108413782721, + "grad_norm": 4.931152820587158, + "learning_rate": 5.414623175351486e-05, + "loss": 1.867, + "step": 6511 + }, + { + "epoch": 0.4901861156589322, + "grad_norm": 4.482016563415527, + "learning_rate": 5.413408272683611e-05, + "loss": 2.1954, + "step": 6512 + }, + { + "epoch": 0.4902613899395924, + "grad_norm": 5.712460517883301, + "learning_rate": 5.4121933454398e-05, + "loss": 1.6091, + "step": 6513 + }, + { + "epoch": 0.49033666422025257, + "grad_norm": 6.5110182762146, + "learning_rate": 5.410978393692279e-05, + "loss": 2.1167, + "step": 6514 + }, + { + "epoch": 0.4904119385009127, + "grad_norm": 4.4460625648498535, + "learning_rate": 5.40976341751327e-05, + "loss": 1.5091, + "step": 6515 + }, + { + "epoch": 0.49048721278157287, + "grad_norm": 5.397545337677002, + "learning_rate": 5.408548416975002e-05, + "loss": 1.795, + "step": 6516 + }, + { + "epoch": 0.490562487062233, + "grad_norm": 4.741124629974365, + "learning_rate": 5.4073333921497036e-05, + "loss": 1.9955, + "step": 6517 + }, + { + "epoch": 0.49063776134289316, + "grad_norm": 4.442574501037598, + "learning_rate": 5.406118343109604e-05, + "loss": 1.8472, + "step": 6518 + }, + { + "epoch": 0.49071303562355334, + "grad_norm": 3.9879074096679688, + "learning_rate": 5.404903269926933e-05, + "loss": 1.505, + "step": 6519 + }, + { + "epoch": 0.49078830990421346, + "grad_norm": 5.394278049468994, + "learning_rate": 5.4036881726739265e-05, + "loss": 2.1498, + "step": 6520 + }, + { + "epoch": 0.49086358418487364, + "grad_norm": 5.392609596252441, + "learning_rate": 5.402473051422814e-05, + "loss": 1.821, + "step": 6521 + }, + { + "epoch": 0.4909388584655338, + "grad_norm": 5.036492824554443, + "learning_rate": 5.401257906245835e-05, + "loss": 2.5078, + "step": 6522 + }, + { + "epoch": 0.49101413274619393, + "grad_norm": 4.226395130157471, + "learning_rate": 5.400042737215224e-05, + "loss": 1.955, + "step": 6523 + }, + { + "epoch": 0.4910894070268541, + "grad_norm": 4.382475852966309, + "learning_rate": 5.398827544403221e-05, + "loss": 1.9809, + "step": 6524 + }, + { + "epoch": 0.49116468130751423, + "grad_norm": 6.398223400115967, + "learning_rate": 5.397612327882064e-05, + "loss": 2.2126, + "step": 6525 + }, + { + "epoch": 0.4912399555881744, + "grad_norm": 5.323082447052002, + "learning_rate": 5.396397087723994e-05, + "loss": 1.9895, + "step": 6526 + }, + { + "epoch": 0.4913152298688346, + "grad_norm": 5.594727516174316, + "learning_rate": 5.395181824001256e-05, + "loss": 1.8942, + "step": 6527 + }, + { + "epoch": 0.4913905041494947, + "grad_norm": 4.759037494659424, + "learning_rate": 5.3939665367860926e-05, + "loss": 2.1054, + "step": 6528 + }, + { + "epoch": 0.4914657784301549, + "grad_norm": 5.543865203857422, + "learning_rate": 5.3927512261507485e-05, + "loss": 1.9349, + "step": 6529 + }, + { + "epoch": 0.49154105271081505, + "grad_norm": 3.381861448287964, + "learning_rate": 5.391535892167471e-05, + "loss": 1.7595, + "step": 6530 + }, + { + "epoch": 0.49161632699147517, + "grad_norm": 7.823299407958984, + "learning_rate": 5.390320534908508e-05, + "loss": 2.4135, + "step": 6531 + }, + { + "epoch": 0.49169160127213535, + "grad_norm": 5.579688549041748, + "learning_rate": 5.389105154446109e-05, + "loss": 1.9716, + "step": 6532 + }, + { + "epoch": 0.4917668755527955, + "grad_norm": 4.819231033325195, + "learning_rate": 5.3878897508525265e-05, + "loss": 1.8395, + "step": 6533 + }, + { + "epoch": 0.49184214983345564, + "grad_norm": 5.305383205413818, + "learning_rate": 5.386674324200009e-05, + "loss": 1.644, + "step": 6534 + }, + { + "epoch": 0.4919174241141158, + "grad_norm": 3.493340492248535, + "learning_rate": 5.385458874560815e-05, + "loss": 2.0046, + "step": 6535 + }, + { + "epoch": 0.49199269839477594, + "grad_norm": 5.164981365203857, + "learning_rate": 5.3842434020071954e-05, + "loss": 1.9758, + "step": 6536 + }, + { + "epoch": 0.4920679726754361, + "grad_norm": 4.031330585479736, + "learning_rate": 5.383027906611409e-05, + "loss": 1.603, + "step": 6537 + }, + { + "epoch": 0.4921432469560963, + "grad_norm": 6.878929615020752, + "learning_rate": 5.381812388445711e-05, + "loss": 1.7612, + "step": 6538 + }, + { + "epoch": 0.4922185212367564, + "grad_norm": 5.020673751831055, + "learning_rate": 5.380596847582363e-05, + "loss": 2.0279, + "step": 6539 + }, + { + "epoch": 0.4922937955174166, + "grad_norm": 4.605405330657959, + "learning_rate": 5.379381284093624e-05, + "loss": 1.5668, + "step": 6540 + }, + { + "epoch": 0.49236906979807676, + "grad_norm": 4.904942035675049, + "learning_rate": 5.378165698051757e-05, + "loss": 1.6866, + "step": 6541 + }, + { + "epoch": 0.4924443440787369, + "grad_norm": 7.233217716217041, + "learning_rate": 5.3769500895290245e-05, + "loss": 1.7758, + "step": 6542 + }, + { + "epoch": 0.49251961835939706, + "grad_norm": 4.8885369300842285, + "learning_rate": 5.3757344585976886e-05, + "loss": 1.6643, + "step": 6543 + }, + { + "epoch": 0.49259489264005724, + "grad_norm": 5.4514875411987305, + "learning_rate": 5.37451880533002e-05, + "loss": 2.121, + "step": 6544 + }, + { + "epoch": 0.49267016692071736, + "grad_norm": 4.041563034057617, + "learning_rate": 5.3733031297982816e-05, + "loss": 1.6436, + "step": 6545 + }, + { + "epoch": 0.49274544120137753, + "grad_norm": 3.6761107444763184, + "learning_rate": 5.372087432074743e-05, + "loss": 1.9028, + "step": 6546 + }, + { + "epoch": 0.49282071548203765, + "grad_norm": 4.125627517700195, + "learning_rate": 5.3708717122316744e-05, + "loss": 1.5955, + "step": 6547 + }, + { + "epoch": 0.49289598976269783, + "grad_norm": 4.784101486206055, + "learning_rate": 5.3696559703413474e-05, + "loss": 2.1934, + "step": 6548 + }, + { + "epoch": 0.492971264043358, + "grad_norm": 5.15186071395874, + "learning_rate": 5.368440206476032e-05, + "loss": 1.9535, + "step": 6549 + }, + { + "epoch": 0.4930465383240181, + "grad_norm": 4.076193332672119, + "learning_rate": 5.3672244207080045e-05, + "loss": 1.9229, + "step": 6550 + }, + { + "epoch": 0.4931218126046783, + "grad_norm": 7.115160942077637, + "learning_rate": 5.36600861310954e-05, + "loss": 1.7446, + "step": 6551 + }, + { + "epoch": 0.4931970868853385, + "grad_norm": 5.7246832847595215, + "learning_rate": 5.3647927837529135e-05, + "loss": 1.5731, + "step": 6552 + }, + { + "epoch": 0.4932723611659986, + "grad_norm": 4.658170700073242, + "learning_rate": 5.363576932710401e-05, + "loss": 1.849, + "step": 6553 + }, + { + "epoch": 0.49334763544665877, + "grad_norm": 5.130527019500732, + "learning_rate": 5.362361060054284e-05, + "loss": 1.7915, + "step": 6554 + }, + { + "epoch": 0.4934229097273189, + "grad_norm": 5.57821798324585, + "learning_rate": 5.3611451658568414e-05, + "loss": 2.2171, + "step": 6555 + }, + { + "epoch": 0.49349818400797907, + "grad_norm": 5.558706760406494, + "learning_rate": 5.359929250190355e-05, + "loss": 1.5748, + "step": 6556 + }, + { + "epoch": 0.49357345828863924, + "grad_norm": 4.314650058746338, + "learning_rate": 5.358713313127108e-05, + "loss": 2.018, + "step": 6557 + }, + { + "epoch": 0.49364873256929936, + "grad_norm": 4.296151161193848, + "learning_rate": 5.357497354739385e-05, + "loss": 1.833, + "step": 6558 + }, + { + "epoch": 0.49372400684995954, + "grad_norm": 4.954275608062744, + "learning_rate": 5.356281375099467e-05, + "loss": 2.0446, + "step": 6559 + }, + { + "epoch": 0.4937992811306197, + "grad_norm": 3.6054606437683105, + "learning_rate": 5.355065374279645e-05, + "loss": 1.6044, + "step": 6560 + }, + { + "epoch": 0.49387455541127984, + "grad_norm": 6.607327938079834, + "learning_rate": 5.353849352352206e-05, + "loss": 1.9429, + "step": 6561 + }, + { + "epoch": 0.49394982969194, + "grad_norm": 5.233273506164551, + "learning_rate": 5.352633309389437e-05, + "loss": 1.86, + "step": 6562 + }, + { + "epoch": 0.4940251039726002, + "grad_norm": 5.43840217590332, + "learning_rate": 5.35141724546363e-05, + "loss": 1.7199, + "step": 6563 + }, + { + "epoch": 0.4941003782532603, + "grad_norm": 7.027149200439453, + "learning_rate": 5.350201160647077e-05, + "loss": 1.8494, + "step": 6564 + }, + { + "epoch": 0.4941756525339205, + "grad_norm": 5.242697715759277, + "learning_rate": 5.348985055012069e-05, + "loss": 1.6428, + "step": 6565 + }, + { + "epoch": 0.4942509268145806, + "grad_norm": 6.472156047821045, + "learning_rate": 5.3477689286308996e-05, + "loss": 1.846, + "step": 6566 + }, + { + "epoch": 0.4943262010952408, + "grad_norm": 5.145436763763428, + "learning_rate": 5.3465527815758674e-05, + "loss": 1.8041, + "step": 6567 + }, + { + "epoch": 0.49440147537590096, + "grad_norm": 4.717484474182129, + "learning_rate": 5.345336613919266e-05, + "loss": 1.9798, + "step": 6568 + }, + { + "epoch": 0.4944767496565611, + "grad_norm": 5.332404136657715, + "learning_rate": 5.3441204257333946e-05, + "loss": 1.8313, + "step": 6569 + }, + { + "epoch": 0.49455202393722125, + "grad_norm": 6.965898513793945, + "learning_rate": 5.342904217090549e-05, + "loss": 1.8866, + "step": 6570 + }, + { + "epoch": 0.49462729821788143, + "grad_norm": 5.574611186981201, + "learning_rate": 5.3416879880630336e-05, + "loss": 1.7331, + "step": 6571 + }, + { + "epoch": 0.49470257249854155, + "grad_norm": 5.662327289581299, + "learning_rate": 5.340471738723147e-05, + "loss": 1.6738, + "step": 6572 + }, + { + "epoch": 0.4947778467792017, + "grad_norm": 4.898918151855469, + "learning_rate": 5.3392554691431926e-05, + "loss": 1.8864, + "step": 6573 + }, + { + "epoch": 0.49485312105986184, + "grad_norm": 5.1412672996521, + "learning_rate": 5.338039179395474e-05, + "loss": 1.5666, + "step": 6574 + }, + { + "epoch": 0.494928395340522, + "grad_norm": 8.831100463867188, + "learning_rate": 5.336822869552297e-05, + "loss": 2.4161, + "step": 6575 + }, + { + "epoch": 0.4950036696211822, + "grad_norm": 3.8222408294677734, + "learning_rate": 5.335606539685964e-05, + "loss": 1.81, + "step": 6576 + }, + { + "epoch": 0.4950789439018423, + "grad_norm": 6.073613166809082, + "learning_rate": 5.3343901898687875e-05, + "loss": 1.9677, + "step": 6577 + }, + { + "epoch": 0.4951542181825025, + "grad_norm": 4.634353160858154, + "learning_rate": 5.333173820173073e-05, + "loss": 1.9349, + "step": 6578 + }, + { + "epoch": 0.49522949246316267, + "grad_norm": 5.763169288635254, + "learning_rate": 5.331957430671132e-05, + "loss": 1.7652, + "step": 6579 + }, + { + "epoch": 0.4953047667438228, + "grad_norm": 4.172359943389893, + "learning_rate": 5.3307410214352724e-05, + "loss": 2.1371, + "step": 6580 + }, + { + "epoch": 0.49538004102448296, + "grad_norm": 8.34048843383789, + "learning_rate": 5.32952459253781e-05, + "loss": 1.7159, + "step": 6581 + }, + { + "epoch": 0.49545531530514314, + "grad_norm": 5.233797550201416, + "learning_rate": 5.328308144051054e-05, + "loss": 1.5947, + "step": 6582 + }, + { + "epoch": 0.49553058958580326, + "grad_norm": 4.1640825271606445, + "learning_rate": 5.327091676047322e-05, + "loss": 1.8269, + "step": 6583 + }, + { + "epoch": 0.49560586386646344, + "grad_norm": 3.9838075637817383, + "learning_rate": 5.325875188598929e-05, + "loss": 1.829, + "step": 6584 + }, + { + "epoch": 0.49568113814712356, + "grad_norm": 4.379800319671631, + "learning_rate": 5.3246586817781906e-05, + "loss": 1.8176, + "step": 6585 + }, + { + "epoch": 0.49575641242778373, + "grad_norm": 6.511457920074463, + "learning_rate": 5.3234421556574254e-05, + "loss": 1.6071, + "step": 6586 + }, + { + "epoch": 0.4958316867084439, + "grad_norm": 6.1335859298706055, + "learning_rate": 5.322225610308952e-05, + "loss": 1.8574, + "step": 6587 + }, + { + "epoch": 0.49590696098910403, + "grad_norm": 5.041165828704834, + "learning_rate": 5.32100904580509e-05, + "loss": 1.7299, + "step": 6588 + }, + { + "epoch": 0.4959822352697642, + "grad_norm": 17.57342529296875, + "learning_rate": 5.3197924622181614e-05, + "loss": 2.4252, + "step": 6589 + }, + { + "epoch": 0.4960575095504244, + "grad_norm": 5.864713191986084, + "learning_rate": 5.3185758596204896e-05, + "loss": 1.9627, + "step": 6590 + }, + { + "epoch": 0.4961327838310845, + "grad_norm": 6.689698696136475, + "learning_rate": 5.3173592380843963e-05, + "loss": 1.6327, + "step": 6591 + }, + { + "epoch": 0.4962080581117447, + "grad_norm": 4.820821285247803, + "learning_rate": 5.316142597682207e-05, + "loss": 1.582, + "step": 6592 + }, + { + "epoch": 0.49628333239240485, + "grad_norm": 5.506748676300049, + "learning_rate": 5.314925938486246e-05, + "loss": 1.7574, + "step": 6593 + }, + { + "epoch": 0.496358606673065, + "grad_norm": 5.227021217346191, + "learning_rate": 5.313709260568842e-05, + "loss": 1.7312, + "step": 6594 + }, + { + "epoch": 0.49643388095372515, + "grad_norm": 5.703372478485107, + "learning_rate": 5.312492564002324e-05, + "loss": 1.6921, + "step": 6595 + }, + { + "epoch": 0.49650915523438527, + "grad_norm": 5.794672966003418, + "learning_rate": 5.311275848859019e-05, + "loss": 1.98, + "step": 6596 + }, + { + "epoch": 0.49658442951504544, + "grad_norm": 6.087855815887451, + "learning_rate": 5.310059115211259e-05, + "loss": 1.7984, + "step": 6597 + }, + { + "epoch": 0.4966597037957056, + "grad_norm": 6.00978946685791, + "learning_rate": 5.3088423631313735e-05, + "loss": 2.0187, + "step": 6598 + }, + { + "epoch": 0.49673497807636574, + "grad_norm": 3.9443798065185547, + "learning_rate": 5.307625592691694e-05, + "loss": 2.0072, + "step": 6599 + }, + { + "epoch": 0.4968102523570259, + "grad_norm": 4.7089667320251465, + "learning_rate": 5.306408803964557e-05, + "loss": 1.9774, + "step": 6600 + }, + { + "epoch": 0.4968855266376861, + "grad_norm": 4.21376895904541, + "learning_rate": 5.3051919970222964e-05, + "loss": 1.817, + "step": 6601 + }, + { + "epoch": 0.4969608009183462, + "grad_norm": 4.394554615020752, + "learning_rate": 5.303975171937248e-05, + "loss": 1.9268, + "step": 6602 + }, + { + "epoch": 0.4970360751990064, + "grad_norm": 5.122894287109375, + "learning_rate": 5.302758328781746e-05, + "loss": 2.2246, + "step": 6603 + }, + { + "epoch": 0.4971113494796665, + "grad_norm": 5.243992328643799, + "learning_rate": 5.30154146762813e-05, + "loss": 2.0363, + "step": 6604 + }, + { + "epoch": 0.4971866237603267, + "grad_norm": 5.476206302642822, + "learning_rate": 5.3003245885487393e-05, + "loss": 1.6873, + "step": 6605 + }, + { + "epoch": 0.49726189804098686, + "grad_norm": 4.071695804595947, + "learning_rate": 5.2991076916159146e-05, + "loss": 1.8928, + "step": 6606 + }, + { + "epoch": 0.497337172321647, + "grad_norm": 8.421332359313965, + "learning_rate": 5.297890776901996e-05, + "loss": 1.8091, + "step": 6607 + }, + { + "epoch": 0.49741244660230716, + "grad_norm": 5.12196159362793, + "learning_rate": 5.296673844479325e-05, + "loss": 1.7918, + "step": 6608 + }, + { + "epoch": 0.49748772088296733, + "grad_norm": 5.93121862411499, + "learning_rate": 5.2954568944202434e-05, + "loss": 2.0013, + "step": 6609 + }, + { + "epoch": 0.49756299516362745, + "grad_norm": 5.241358757019043, + "learning_rate": 5.294239926797099e-05, + "loss": 2.6958, + "step": 6610 + }, + { + "epoch": 0.49763826944428763, + "grad_norm": 4.241708755493164, + "learning_rate": 5.293022941682234e-05, + "loss": 1.4672, + "step": 6611 + }, + { + "epoch": 0.4977135437249478, + "grad_norm": 4.95985221862793, + "learning_rate": 5.2918059391479965e-05, + "loss": 1.7102, + "step": 6612 + }, + { + "epoch": 0.4977888180056079, + "grad_norm": 4.776228427886963, + "learning_rate": 5.290588919266735e-05, + "loss": 1.6408, + "step": 6613 + }, + { + "epoch": 0.4978640922862681, + "grad_norm": 4.282431125640869, + "learning_rate": 5.289371882110794e-05, + "loss": 1.908, + "step": 6614 + }, + { + "epoch": 0.4979393665669282, + "grad_norm": 4.2445878982543945, + "learning_rate": 5.288154827752526e-05, + "loss": 1.9812, + "step": 6615 + }, + { + "epoch": 0.4980146408475884, + "grad_norm": 4.978115558624268, + "learning_rate": 5.286937756264278e-05, + "loss": 1.5286, + "step": 6616 + }, + { + "epoch": 0.4980899151282486, + "grad_norm": 4.761786460876465, + "learning_rate": 5.285720667718406e-05, + "loss": 1.8652, + "step": 6617 + }, + { + "epoch": 0.4981651894089087, + "grad_norm": 4.414857864379883, + "learning_rate": 5.28450356218726e-05, + "loss": 1.8395, + "step": 6618 + }, + { + "epoch": 0.49824046368956887, + "grad_norm": 4.052631855010986, + "learning_rate": 5.283286439743193e-05, + "loss": 1.6264, + "step": 6619 + }, + { + "epoch": 0.49831573797022904, + "grad_norm": 3.8466341495513916, + "learning_rate": 5.282069300458561e-05, + "loss": 1.8972, + "step": 6620 + }, + { + "epoch": 0.49839101225088916, + "grad_norm": 5.695672988891602, + "learning_rate": 5.280852144405717e-05, + "loss": 1.9272, + "step": 6621 + }, + { + "epoch": 0.49846628653154934, + "grad_norm": 8.128326416015625, + "learning_rate": 5.279634971657019e-05, + "loss": 2.1078, + "step": 6622 + }, + { + "epoch": 0.49854156081220946, + "grad_norm": 4.437495231628418, + "learning_rate": 5.2784177822848245e-05, + "loss": 1.8393, + "step": 6623 + }, + { + "epoch": 0.49861683509286964, + "grad_norm": 4.087892532348633, + "learning_rate": 5.277200576361493e-05, + "loss": 1.8082, + "step": 6624 + }, + { + "epoch": 0.4986921093735298, + "grad_norm": 3.9791553020477295, + "learning_rate": 5.2759833539593814e-05, + "loss": 1.7425, + "step": 6625 + }, + { + "epoch": 0.49876738365418993, + "grad_norm": 5.6482720375061035, + "learning_rate": 5.274766115150851e-05, + "loss": 2.1118, + "step": 6626 + }, + { + "epoch": 0.4988426579348501, + "grad_norm": 3.805830717086792, + "learning_rate": 5.273548860008263e-05, + "loss": 1.7573, + "step": 6627 + }, + { + "epoch": 0.4989179322155103, + "grad_norm": 4.6420183181762695, + "learning_rate": 5.27233158860398e-05, + "loss": 1.8032, + "step": 6628 + }, + { + "epoch": 0.4989932064961704, + "grad_norm": 4.150326728820801, + "learning_rate": 5.271114301010368e-05, + "loss": 1.777, + "step": 6629 + }, + { + "epoch": 0.4990684807768306, + "grad_norm": 6.890900135040283, + "learning_rate": 5.2698969972997866e-05, + "loss": 2.4232, + "step": 6630 + }, + { + "epoch": 0.49914375505749076, + "grad_norm": 4.103460311889648, + "learning_rate": 5.268679677544605e-05, + "loss": 1.6978, + "step": 6631 + }, + { + "epoch": 0.4992190293381509, + "grad_norm": 7.450624942779541, + "learning_rate": 5.267462341817185e-05, + "loss": 1.8601, + "step": 6632 + }, + { + "epoch": 0.49929430361881105, + "grad_norm": 6.134131908416748, + "learning_rate": 5.266244990189898e-05, + "loss": 1.7238, + "step": 6633 + }, + { + "epoch": 0.4993695778994712, + "grad_norm": 8.058239936828613, + "learning_rate": 5.265027622735109e-05, + "loss": 1.9083, + "step": 6634 + }, + { + "epoch": 0.49944485218013135, + "grad_norm": 5.0254034996032715, + "learning_rate": 5.26381023952519e-05, + "loss": 1.8105, + "step": 6635 + }, + { + "epoch": 0.4995201264607915, + "grad_norm": 5.9632568359375, + "learning_rate": 5.262592840632509e-05, + "loss": 1.5693, + "step": 6636 + }, + { + "epoch": 0.49959540074145165, + "grad_norm": 5.356999397277832, + "learning_rate": 5.261375426129436e-05, + "loss": 1.7606, + "step": 6637 + }, + { + "epoch": 0.4996706750221118, + "grad_norm": 9.331753730773926, + "learning_rate": 5.260157996088343e-05, + "loss": 2.0345, + "step": 6638 + }, + { + "epoch": 0.499745949302772, + "grad_norm": 8.355175971984863, + "learning_rate": 5.258940550581606e-05, + "loss": 2.2206, + "step": 6639 + }, + { + "epoch": 0.4998212235834321, + "grad_norm": 7.343987464904785, + "learning_rate": 5.2577230896815944e-05, + "loss": 1.7228, + "step": 6640 + }, + { + "epoch": 0.4998964978640923, + "grad_norm": 7.6332831382751465, + "learning_rate": 5.256505613460686e-05, + "loss": 1.6809, + "step": 6641 + }, + { + "epoch": 0.49997177214475247, + "grad_norm": 7.6332831382751465, + "learning_rate": 5.256505613460686e-05, + "loss": 1.9205, + "step": 6642 + }, + { + "epoch": 0.5000470464254126, + "grad_norm": 15.233363151550293, + "learning_rate": 5.255288121991253e-05, + "loss": 1.4448, + "step": 6643 + }, + { + "epoch": 0.5001223207060728, + "grad_norm": 9.014907836914062, + "learning_rate": 5.2540706153456755e-05, + "loss": 1.7863, + "step": 6644 + }, + { + "epoch": 0.5001975949867329, + "grad_norm": 14.538765907287598, + "learning_rate": 5.2528530935963274e-05, + "loss": 1.7883, + "step": 6645 + }, + { + "epoch": 0.5002728692673931, + "grad_norm": 5.758622169494629, + "learning_rate": 5.2516355568155895e-05, + "loss": 1.679, + "step": 6646 + }, + { + "epoch": 0.5003481435480532, + "grad_norm": 6.037545680999756, + "learning_rate": 5.250418005075839e-05, + "loss": 2.0086, + "step": 6647 + }, + { + "epoch": 0.5004234178287134, + "grad_norm": 9.560413360595703, + "learning_rate": 5.2492004384494576e-05, + "loss": 1.8725, + "step": 6648 + }, + { + "epoch": 0.5004986921093735, + "grad_norm": 4.456220626831055, + "learning_rate": 5.247982857008825e-05, + "loss": 1.7012, + "step": 6649 + }, + { + "epoch": 0.5005739663900337, + "grad_norm": 6.109070777893066, + "learning_rate": 5.246765260826322e-05, + "loss": 1.7325, + "step": 6650 + }, + { + "epoch": 0.5006492406706938, + "grad_norm": 6.128081321716309, + "learning_rate": 5.245547649974334e-05, + "loss": 2.2765, + "step": 6651 + }, + { + "epoch": 0.500724514951354, + "grad_norm": 5.9204182624816895, + "learning_rate": 5.244330024525241e-05, + "loss": 2.4219, + "step": 6652 + }, + { + "epoch": 0.5007997892320142, + "grad_norm": 5.951528072357178, + "learning_rate": 5.243112384551431e-05, + "loss": 1.8586, + "step": 6653 + }, + { + "epoch": 0.5008750635126743, + "grad_norm": 5.156544208526611, + "learning_rate": 5.241894730125286e-05, + "loss": 1.9027, + "step": 6654 + }, + { + "epoch": 0.5009503377933344, + "grad_norm": 6.597646713256836, + "learning_rate": 5.240677061319194e-05, + "loss": 1.9693, + "step": 6655 + }, + { + "epoch": 0.5010256120739947, + "grad_norm": 5.793175220489502, + "learning_rate": 5.23945937820554e-05, + "loss": 1.5685, + "step": 6656 + }, + { + "epoch": 0.5011008863546548, + "grad_norm": 6.648615837097168, + "learning_rate": 5.238241680856715e-05, + "loss": 1.6234, + "step": 6657 + }, + { + "epoch": 0.5011761606353149, + "grad_norm": 4.223020076751709, + "learning_rate": 5.237023969345104e-05, + "loss": 1.902, + "step": 6658 + }, + { + "epoch": 0.5012514349159751, + "grad_norm": 5.66109561920166, + "learning_rate": 5.2358062437430974e-05, + "loss": 1.6387, + "step": 6659 + }, + { + "epoch": 0.5013267091966352, + "grad_norm": 6.670844554901123, + "learning_rate": 5.234588504123089e-05, + "loss": 2.1146, + "step": 6660 + }, + { + "epoch": 0.5014019834772954, + "grad_norm": 5.465707778930664, + "learning_rate": 5.2333707505574656e-05, + "loss": 1.6081, + "step": 6661 + }, + { + "epoch": 0.5014772577579556, + "grad_norm": 5.694941520690918, + "learning_rate": 5.23215298311862e-05, + "loss": 1.9031, + "step": 6662 + }, + { + "epoch": 0.5015525320386157, + "grad_norm": 5.355263710021973, + "learning_rate": 5.230935201878947e-05, + "loss": 1.679, + "step": 6663 + }, + { + "epoch": 0.5016278063192758, + "grad_norm": 5.01465368270874, + "learning_rate": 5.229717406910838e-05, + "loss": 1.9775, + "step": 6664 + }, + { + "epoch": 0.5017030805999361, + "grad_norm": 3.7221438884735107, + "learning_rate": 5.228499598286689e-05, + "loss": 1.5676, + "step": 6665 + }, + { + "epoch": 0.5017783548805962, + "grad_norm": 5.206873416900635, + "learning_rate": 5.227281776078895e-05, + "loss": 1.8844, + "step": 6666 + }, + { + "epoch": 0.5018536291612563, + "grad_norm": 5.92972993850708, + "learning_rate": 5.22606394035985e-05, + "loss": 1.8293, + "step": 6667 + }, + { + "epoch": 0.5019289034419164, + "grad_norm": 4.845220565795898, + "learning_rate": 5.224846091201953e-05, + "loss": 1.9794, + "step": 6668 + }, + { + "epoch": 0.5020041777225767, + "grad_norm": 4.447528839111328, + "learning_rate": 5.223628228677602e-05, + "loss": 1.8585, + "step": 6669 + }, + { + "epoch": 0.5020794520032368, + "grad_norm": 6.166228294372559, + "learning_rate": 5.222410352859193e-05, + "loss": 2.0312, + "step": 6670 + }, + { + "epoch": 0.5021547262838969, + "grad_norm": 7.675826549530029, + "learning_rate": 5.221192463819128e-05, + "loss": 2.2022, + "step": 6671 + }, + { + "epoch": 0.5022300005645571, + "grad_norm": 6.467051982879639, + "learning_rate": 5.219974561629803e-05, + "loss": 1.8413, + "step": 6672 + }, + { + "epoch": 0.5023052748452173, + "grad_norm": 4.609240531921387, + "learning_rate": 5.218756646363625e-05, + "loss": 1.7654, + "step": 6673 + }, + { + "epoch": 0.5023805491258774, + "grad_norm": 6.232847690582275, + "learning_rate": 5.21753871809299e-05, + "loss": 1.7535, + "step": 6674 + }, + { + "epoch": 0.5024558234065376, + "grad_norm": 9.90231704711914, + "learning_rate": 5.2163207768903036e-05, + "loss": 2.0324, + "step": 6675 + }, + { + "epoch": 0.5025310976871977, + "grad_norm": 5.207597732543945, + "learning_rate": 5.215102822827966e-05, + "loss": 1.6822, + "step": 6676 + }, + { + "epoch": 0.5026063719678578, + "grad_norm": 5.305007457733154, + "learning_rate": 5.2138848559783835e-05, + "loss": 1.7012, + "step": 6677 + }, + { + "epoch": 0.5026816462485181, + "grad_norm": 4.0028581619262695, + "learning_rate": 5.212666876413961e-05, + "loss": 1.8811, + "step": 6678 + }, + { + "epoch": 0.5027569205291782, + "grad_norm": 4.931353569030762, + "learning_rate": 5.211448884207102e-05, + "loss": 1.9463, + "step": 6679 + }, + { + "epoch": 0.5028321948098383, + "grad_norm": 4.481631278991699, + "learning_rate": 5.210230879430214e-05, + "loss": 1.7501, + "step": 6680 + }, + { + "epoch": 0.5029074690904985, + "grad_norm": 4.481631278991699, + "learning_rate": 5.210230879430214e-05, + "loss": 1.6467, + "step": 6681 + }, + { + "epoch": 0.5029827433711587, + "grad_norm": 6.17559289932251, + "learning_rate": 5.209012862155703e-05, + "loss": 1.948, + "step": 6682 + }, + { + "epoch": 0.5030580176518188, + "grad_norm": 3.8646605014801025, + "learning_rate": 5.207794832455978e-05, + "loss": 1.7035, + "step": 6683 + }, + { + "epoch": 0.503133291932479, + "grad_norm": 6.362034320831299, + "learning_rate": 5.2065767904034445e-05, + "loss": 1.912, + "step": 6684 + }, + { + "epoch": 0.5032085662131391, + "grad_norm": 13.488008499145508, + "learning_rate": 5.2053587360705156e-05, + "loss": 1.8702, + "step": 6685 + }, + { + "epoch": 0.5032838404937993, + "grad_norm": 7.346495151519775, + "learning_rate": 5.204140669529598e-05, + "loss": 2.0898, + "step": 6686 + }, + { + "epoch": 0.5033591147744594, + "grad_norm": 8.827823638916016, + "learning_rate": 5.2029225908531044e-05, + "loss": 1.8896, + "step": 6687 + }, + { + "epoch": 0.5034343890551196, + "grad_norm": 8.501490592956543, + "learning_rate": 5.2017045001134445e-05, + "loss": 2.0539, + "step": 6688 + }, + { + "epoch": 0.5035096633357797, + "grad_norm": 9.503661155700684, + "learning_rate": 5.200486397383031e-05, + "loss": 1.7324, + "step": 6689 + }, + { + "epoch": 0.5035849376164399, + "grad_norm": 15.624320983886719, + "learning_rate": 5.199268282734276e-05, + "loss": 2.0127, + "step": 6690 + }, + { + "epoch": 0.5036602118971001, + "grad_norm": 15.573356628417969, + "learning_rate": 5.1980501562395925e-05, + "loss": 1.9564, + "step": 6691 + }, + { + "epoch": 0.5037354861777602, + "grad_norm": 7.790936470031738, + "learning_rate": 5.196832017971397e-05, + "loss": 1.4427, + "step": 6692 + }, + { + "epoch": 0.5038107604584203, + "grad_norm": 7.703174114227295, + "learning_rate": 5.195613868002101e-05, + "loss": 2.0064, + "step": 6693 + }, + { + "epoch": 0.5038860347390806, + "grad_norm": 7.939530372619629, + "learning_rate": 5.1943957064041225e-05, + "loss": 1.7036, + "step": 6694 + }, + { + "epoch": 0.5039613090197407, + "grad_norm": 3.687410831451416, + "learning_rate": 5.193177533249879e-05, + "loss": 1.6375, + "step": 6695 + }, + { + "epoch": 0.5040365833004008, + "grad_norm": 5.230550765991211, + "learning_rate": 5.1919593486117824e-05, + "loss": 1.8778, + "step": 6696 + }, + { + "epoch": 0.504111857581061, + "grad_norm": 7.871644973754883, + "learning_rate": 5.190741152562255e-05, + "loss": 2.0188, + "step": 6697 + }, + { + "epoch": 0.5041871318617211, + "grad_norm": 10.600655555725098, + "learning_rate": 5.1895229451737135e-05, + "loss": 1.6422, + "step": 6698 + }, + { + "epoch": 0.5042624061423813, + "grad_norm": 6.835219383239746, + "learning_rate": 5.188304726518577e-05, + "loss": 2.0206, + "step": 6699 + }, + { + "epoch": 0.5043376804230415, + "grad_norm": 6.835219383239746, + "learning_rate": 5.188304726518577e-05, + "loss": 1.9358, + "step": 6700 + }, + { + "epoch": 0.5044129547037016, + "grad_norm": 9.150716781616211, + "learning_rate": 5.187086496669265e-05, + "loss": 1.6978, + "step": 6701 + }, + { + "epoch": 0.5044882289843617, + "grad_norm": 11.637362480163574, + "learning_rate": 5.1858682556981954e-05, + "loss": 1.819, + "step": 6702 + }, + { + "epoch": 0.504563503265022, + "grad_norm": 11.420654296875, + "learning_rate": 5.184650003677793e-05, + "loss": 1.6971, + "step": 6703 + }, + { + "epoch": 0.5046387775456821, + "grad_norm": 10.267974853515625, + "learning_rate": 5.183431740680478e-05, + "loss": 1.5735, + "step": 6704 + }, + { + "epoch": 0.5047140518263422, + "grad_norm": 5.717667579650879, + "learning_rate": 5.1822134667786714e-05, + "loss": 1.5218, + "step": 6705 + }, + { + "epoch": 0.5047893261070024, + "grad_norm": 8.817151069641113, + "learning_rate": 5.180995182044798e-05, + "loss": 1.8984, + "step": 6706 + }, + { + "epoch": 0.5048646003876626, + "grad_norm": 5.223846435546875, + "learning_rate": 5.1797768865512794e-05, + "loss": 2.0337, + "step": 6707 + }, + { + "epoch": 0.5049398746683227, + "grad_norm": 5.034576892852783, + "learning_rate": 5.1785585803705416e-05, + "loss": 1.6223, + "step": 6708 + }, + { + "epoch": 0.5050151489489828, + "grad_norm": 8.774874687194824, + "learning_rate": 5.177340263575008e-05, + "loss": 1.5931, + "step": 6709 + }, + { + "epoch": 0.505090423229643, + "grad_norm": 8.774874687194824, + "learning_rate": 5.177340263575008e-05, + "loss": 1.7888, + "step": 6710 + }, + { + "epoch": 0.5051656975103032, + "grad_norm": 9.311935424804688, + "learning_rate": 5.1761219362371036e-05, + "loss": 1.7851, + "step": 6711 + }, + { + "epoch": 0.5052409717909633, + "grad_norm": 6.983391284942627, + "learning_rate": 5.1749035984292574e-05, + "loss": 1.9447, + "step": 6712 + }, + { + "epoch": 0.5053162460716235, + "grad_norm": 10.279643058776855, + "learning_rate": 5.173685250223893e-05, + "loss": 2.1614, + "step": 6713 + }, + { + "epoch": 0.5053915203522836, + "grad_norm": 5.187672138214111, + "learning_rate": 5.1724668916934384e-05, + "loss": 2.2896, + "step": 6714 + }, + { + "epoch": 0.5054667946329438, + "grad_norm": 20.772737503051758, + "learning_rate": 5.1712485229103224e-05, + "loss": 2.0697, + "step": 6715 + }, + { + "epoch": 0.505542068913604, + "grad_norm": 14.30375862121582, + "learning_rate": 5.1700301439469736e-05, + "loss": 1.8487, + "step": 6716 + }, + { + "epoch": 0.5056173431942641, + "grad_norm": 5.290216445922852, + "learning_rate": 5.168811754875821e-05, + "loss": 1.7995, + "step": 6717 + }, + { + "epoch": 0.5056926174749242, + "grad_norm": 5.640997886657715, + "learning_rate": 5.167593355769294e-05, + "loss": 1.8875, + "step": 6718 + }, + { + "epoch": 0.5057678917555845, + "grad_norm": 10.728387832641602, + "learning_rate": 5.166374946699821e-05, + "loss": 1.5672, + "step": 6719 + }, + { + "epoch": 0.5058431660362446, + "grad_norm": 14.613763809204102, + "learning_rate": 5.165156527739836e-05, + "loss": 2.2401, + "step": 6720 + }, + { + "epoch": 0.5059184403169047, + "grad_norm": 5.957528591156006, + "learning_rate": 5.1639380989617694e-05, + "loss": 1.9183, + "step": 6721 + }, + { + "epoch": 0.5059937145975649, + "grad_norm": 5.298984527587891, + "learning_rate": 5.1627196604380534e-05, + "loss": 1.9872, + "step": 6722 + }, + { + "epoch": 0.506068988878225, + "grad_norm": 5.935640811920166, + "learning_rate": 5.16150121224112e-05, + "loss": 1.9586, + "step": 6723 + }, + { + "epoch": 0.5061442631588852, + "grad_norm": 6.94568395614624, + "learning_rate": 5.160282754443404e-05, + "loss": 1.5733, + "step": 6724 + }, + { + "epoch": 0.5062195374395454, + "grad_norm": 7.91921854019165, + "learning_rate": 5.1590642871173354e-05, + "loss": 1.9728, + "step": 6725 + }, + { + "epoch": 0.5062948117202055, + "grad_norm": 23.871170043945312, + "learning_rate": 5.157845810335353e-05, + "loss": 1.7838, + "step": 6726 + }, + { + "epoch": 0.5063700860008656, + "grad_norm": 4.906027793884277, + "learning_rate": 5.15662732416989e-05, + "loss": 1.6954, + "step": 6727 + }, + { + "epoch": 0.5064453602815258, + "grad_norm": 7.631262302398682, + "learning_rate": 5.155408828693383e-05, + "loss": 1.4316, + "step": 6728 + }, + { + "epoch": 0.506520634562186, + "grad_norm": 21.19890594482422, + "learning_rate": 5.1541903239782664e-05, + "loss": 2.3925, + "step": 6729 + }, + { + "epoch": 0.5065959088428461, + "grad_norm": 7.129833698272705, + "learning_rate": 5.152971810096977e-05, + "loss": 2.1276, + "step": 6730 + }, + { + "epoch": 0.5066711831235062, + "grad_norm": 6.705801010131836, + "learning_rate": 5.151753287121952e-05, + "loss": 1.8777, + "step": 6731 + }, + { + "epoch": 0.5067464574041665, + "grad_norm": 6.604841232299805, + "learning_rate": 5.150534755125631e-05, + "loss": 1.681, + "step": 6732 + }, + { + "epoch": 0.5068217316848266, + "grad_norm": 6.598692417144775, + "learning_rate": 5.149316214180451e-05, + "loss": 2.2476, + "step": 6733 + }, + { + "epoch": 0.5068970059654867, + "grad_norm": 6.718410491943359, + "learning_rate": 5.1480976643588486e-05, + "loss": 1.9058, + "step": 6734 + }, + { + "epoch": 0.5069722802461469, + "grad_norm": 8.54813289642334, + "learning_rate": 5.1468791057332665e-05, + "loss": 2.159, + "step": 6735 + }, + { + "epoch": 0.507047554526807, + "grad_norm": 4.509205341339111, + "learning_rate": 5.1456605383761414e-05, + "loss": 1.9215, + "step": 6736 + }, + { + "epoch": 0.5071228288074672, + "grad_norm": 32.04656219482422, + "learning_rate": 5.144441962359917e-05, + "loss": 1.8219, + "step": 6737 + }, + { + "epoch": 0.5071981030881274, + "grad_norm": 6.573288440704346, + "learning_rate": 5.143223377757032e-05, + "loss": 2.1535, + "step": 6738 + }, + { + "epoch": 0.5072733773687875, + "grad_norm": 5.543346405029297, + "learning_rate": 5.142004784639928e-05, + "loss": 1.8713, + "step": 6739 + }, + { + "epoch": 0.5073486516494476, + "grad_norm": 9.760968208312988, + "learning_rate": 5.140786183081046e-05, + "loss": 1.7709, + "step": 6740 + }, + { + "epoch": 0.5074239259301079, + "grad_norm": 4.373596668243408, + "learning_rate": 5.13956757315283e-05, + "loss": 1.5273, + "step": 6741 + }, + { + "epoch": 0.507499200210768, + "grad_norm": 5.02488899230957, + "learning_rate": 5.138348954927723e-05, + "loss": 1.9912, + "step": 6742 + }, + { + "epoch": 0.5075744744914281, + "grad_norm": 6.548547267913818, + "learning_rate": 5.137130328478166e-05, + "loss": 1.8661, + "step": 6743 + }, + { + "epoch": 0.5076497487720883, + "grad_norm": 4.559741020202637, + "learning_rate": 5.135911693876606e-05, + "loss": 1.7527, + "step": 6744 + }, + { + "epoch": 0.5077250230527485, + "grad_norm": 9.573232650756836, + "learning_rate": 5.134693051195485e-05, + "loss": 1.9646, + "step": 6745 + }, + { + "epoch": 0.5078002973334086, + "grad_norm": 4.267605781555176, + "learning_rate": 5.133474400507249e-05, + "loss": 2.0713, + "step": 6746 + }, + { + "epoch": 0.5078755716140687, + "grad_norm": 8.955902099609375, + "learning_rate": 5.132255741884343e-05, + "loss": 1.7363, + "step": 6747 + }, + { + "epoch": 0.5079508458947289, + "grad_norm": 6.133431911468506, + "learning_rate": 5.131037075399212e-05, + "loss": 1.6846, + "step": 6748 + }, + { + "epoch": 0.5080261201753891, + "grad_norm": 4.861812114715576, + "learning_rate": 5.129818401124303e-05, + "loss": 2.1018, + "step": 6749 + }, + { + "epoch": 0.5081013944560492, + "grad_norm": 4.734555721282959, + "learning_rate": 5.1285997191320635e-05, + "loss": 1.6544, + "step": 6750 + }, + { + "epoch": 0.5081766687367094, + "grad_norm": 5.478392601013184, + "learning_rate": 5.1273810294949386e-05, + "loss": 2.0477, + "step": 6751 + }, + { + "epoch": 0.5082519430173695, + "grad_norm": 4.242016315460205, + "learning_rate": 5.126162332285378e-05, + "loss": 1.8273, + "step": 6752 + }, + { + "epoch": 0.5083272172980297, + "grad_norm": 5.264090538024902, + "learning_rate": 5.124943627575827e-05, + "loss": 1.9828, + "step": 6753 + }, + { + "epoch": 0.5084024915786899, + "grad_norm": 7.781677722930908, + "learning_rate": 5.1237249154387366e-05, + "loss": 2.093, + "step": 6754 + }, + { + "epoch": 0.50847776585935, + "grad_norm": 5.341714859008789, + "learning_rate": 5.122506195946556e-05, + "loss": 1.6081, + "step": 6755 + }, + { + "epoch": 0.5085530401400101, + "grad_norm": 4.535377025604248, + "learning_rate": 5.121287469171733e-05, + "loss": 1.981, + "step": 6756 + }, + { + "epoch": 0.5086283144206704, + "grad_norm": 6.521196365356445, + "learning_rate": 5.120068735186718e-05, + "loss": 1.8668, + "step": 6757 + }, + { + "epoch": 0.5087035887013305, + "grad_norm": 4.384036540985107, + "learning_rate": 5.11884999406396e-05, + "loss": 1.7383, + "step": 6758 + }, + { + "epoch": 0.5087788629819906, + "grad_norm": 4.779227256774902, + "learning_rate": 5.1176312458759115e-05, + "loss": 1.8343, + "step": 6759 + }, + { + "epoch": 0.5088541372626508, + "grad_norm": 4.828497886657715, + "learning_rate": 5.116412490695023e-05, + "loss": 2.0024, + "step": 6760 + }, + { + "epoch": 0.508929411543311, + "grad_norm": 6.186923980712891, + "learning_rate": 5.115193728593747e-05, + "loss": 1.7744, + "step": 6761 + }, + { + "epoch": 0.5090046858239711, + "grad_norm": 6.6489338874816895, + "learning_rate": 5.113974959644534e-05, + "loss": 1.8532, + "step": 6762 + }, + { + "epoch": 0.5090799601046313, + "grad_norm": 4.300166606903076, + "learning_rate": 5.112756183919837e-05, + "loss": 1.8369, + "step": 6763 + }, + { + "epoch": 0.5091552343852914, + "grad_norm": 7.359951019287109, + "learning_rate": 5.111537401492106e-05, + "loss": 1.8562, + "step": 6764 + }, + { + "epoch": 0.5092305086659515, + "grad_norm": 5.082156658172607, + "learning_rate": 5.110318612433799e-05, + "loss": 2.0292, + "step": 6765 + }, + { + "epoch": 0.5093057829466117, + "grad_norm": 4.218928337097168, + "learning_rate": 5.1090998168173666e-05, + "loss": 1.778, + "step": 6766 + }, + { + "epoch": 0.5093810572272719, + "grad_norm": 4.600281715393066, + "learning_rate": 5.107881014715263e-05, + "loss": 2.063, + "step": 6767 + }, + { + "epoch": 0.509456331507932, + "grad_norm": 6.268326282501221, + "learning_rate": 5.106662206199943e-05, + "loss": 1.4494, + "step": 6768 + }, + { + "epoch": 0.5095316057885921, + "grad_norm": 4.664248466491699, + "learning_rate": 5.10544339134386e-05, + "loss": 1.5458, + "step": 6769 + }, + { + "epoch": 0.5096068800692524, + "grad_norm": 7.142730236053467, + "learning_rate": 5.104224570219469e-05, + "loss": 1.9964, + "step": 6770 + }, + { + "epoch": 0.5096821543499125, + "grad_norm": 6.422486305236816, + "learning_rate": 5.1030057428992284e-05, + "loss": 1.7838, + "step": 6771 + }, + { + "epoch": 0.5097574286305726, + "grad_norm": 5.357710838317871, + "learning_rate": 5.1017869094555914e-05, + "loss": 1.7604, + "step": 6772 + }, + { + "epoch": 0.5098327029112328, + "grad_norm": 6.114521503448486, + "learning_rate": 5.1005680699610146e-05, + "loss": 1.7789, + "step": 6773 + }, + { + "epoch": 0.509907977191893, + "grad_norm": 5.207308292388916, + "learning_rate": 5.0993492244879546e-05, + "loss": 1.6026, + "step": 6774 + }, + { + "epoch": 0.5099832514725531, + "grad_norm": 4.577164173126221, + "learning_rate": 5.098130373108867e-05, + "loss": 1.8836, + "step": 6775 + }, + { + "epoch": 0.5100585257532133, + "grad_norm": 5.214762210845947, + "learning_rate": 5.096911515896211e-05, + "loss": 1.4482, + "step": 6776 + }, + { + "epoch": 0.5101338000338734, + "grad_norm": 11.214588165283203, + "learning_rate": 5.095692652922445e-05, + "loss": 1.9607, + "step": 6777 + }, + { + "epoch": 0.5102090743145336, + "grad_norm": 5.782665729522705, + "learning_rate": 5.094473784260023e-05, + "loss": 1.9772, + "step": 6778 + }, + { + "epoch": 0.5102843485951938, + "grad_norm": 4.505326271057129, + "learning_rate": 5.093254909981407e-05, + "loss": 1.8873, + "step": 6779 + }, + { + "epoch": 0.5103596228758539, + "grad_norm": 8.090843200683594, + "learning_rate": 5.092036030159055e-05, + "loss": 1.6799, + "step": 6780 + }, + { + "epoch": 0.510434897156514, + "grad_norm": 5.039333820343018, + "learning_rate": 5.090817144865423e-05, + "loss": 1.6468, + "step": 6781 + }, + { + "epoch": 0.5105101714371743, + "grad_norm": 4.458166599273682, + "learning_rate": 5.089598254172974e-05, + "loss": 1.5768, + "step": 6782 + }, + { + "epoch": 0.5105854457178344, + "grad_norm": 8.188796997070312, + "learning_rate": 5.088379358154165e-05, + "loss": 1.7088, + "step": 6783 + }, + { + "epoch": 0.5106607199984945, + "grad_norm": 6.392370223999023, + "learning_rate": 5.087160456881458e-05, + "loss": 1.7747, + "step": 6784 + }, + { + "epoch": 0.5107359942791546, + "grad_norm": 4.820289611816406, + "learning_rate": 5.0859415504273114e-05, + "loss": 2.0435, + "step": 6785 + }, + { + "epoch": 0.5108112685598148, + "grad_norm": 6.314679145812988, + "learning_rate": 5.0847226388641845e-05, + "loss": 1.6003, + "step": 6786 + }, + { + "epoch": 0.510886542840475, + "grad_norm": 4.878520965576172, + "learning_rate": 5.083503722264541e-05, + "loss": 1.8787, + "step": 6787 + }, + { + "epoch": 0.5109618171211351, + "grad_norm": 4.144894123077393, + "learning_rate": 5.0822848007008414e-05, + "loss": 1.8292, + "step": 6788 + }, + { + "epoch": 0.5110370914017953, + "grad_norm": 4.306694507598877, + "learning_rate": 5.0810658742455464e-05, + "loss": 1.7037, + "step": 6789 + }, + { + "epoch": 0.5111123656824554, + "grad_norm": 7.533092021942139, + "learning_rate": 5.0798469429711185e-05, + "loss": 1.7379, + "step": 6790 + }, + { + "epoch": 0.5111876399631156, + "grad_norm": 5.421972274780273, + "learning_rate": 5.0786280069500155e-05, + "loss": 1.9746, + "step": 6791 + }, + { + "epoch": 0.5112629142437758, + "grad_norm": 5.211759567260742, + "learning_rate": 5.0774090662547055e-05, + "loss": 2.0444, + "step": 6792 + }, + { + "epoch": 0.5113381885244359, + "grad_norm": 5.798937797546387, + "learning_rate": 5.076190120957649e-05, + "loss": 2.0116, + "step": 6793 + }, + { + "epoch": 0.511413462805096, + "grad_norm": 5.778769493103027, + "learning_rate": 5.074971171131308e-05, + "loss": 1.9934, + "step": 6794 + }, + { + "epoch": 0.5114887370857563, + "grad_norm": 7.405422210693359, + "learning_rate": 5.0737522168481453e-05, + "loss": 1.4891, + "step": 6795 + }, + { + "epoch": 0.5115640113664164, + "grad_norm": 5.873201370239258, + "learning_rate": 5.0725332581806265e-05, + "loss": 1.9569, + "step": 6796 + }, + { + "epoch": 0.5116392856470765, + "grad_norm": 4.395761013031006, + "learning_rate": 5.0713142952012126e-05, + "loss": 1.9427, + "step": 6797 + }, + { + "epoch": 0.5117145599277367, + "grad_norm": 8.078096389770508, + "learning_rate": 5.070095327982368e-05, + "loss": 1.9581, + "step": 6798 + }, + { + "epoch": 0.5117898342083969, + "grad_norm": 3.766544818878174, + "learning_rate": 5.068876356596558e-05, + "loss": 1.7896, + "step": 6799 + }, + { + "epoch": 0.511865108489057, + "grad_norm": 5.806560039520264, + "learning_rate": 5.0676573811162465e-05, + "loss": 2.2869, + "step": 6800 + }, + { + "epoch": 0.5119403827697172, + "grad_norm": 4.015898704528809, + "learning_rate": 5.066438401613898e-05, + "loss": 1.4404, + "step": 6801 + }, + { + "epoch": 0.5120156570503773, + "grad_norm": 12.29764461517334, + "learning_rate": 5.0652194181619774e-05, + "loss": 1.8282, + "step": 6802 + }, + { + "epoch": 0.5120909313310374, + "grad_norm": 3.7685940265655518, + "learning_rate": 5.064000430832947e-05, + "loss": 1.9904, + "step": 6803 + }, + { + "epoch": 0.5121662056116977, + "grad_norm": 4.312082767486572, + "learning_rate": 5.062781439699277e-05, + "loss": 1.8836, + "step": 6804 + }, + { + "epoch": 0.5122414798923578, + "grad_norm": 4.480302810668945, + "learning_rate": 5.06156244483343e-05, + "loss": 1.8825, + "step": 6805 + }, + { + "epoch": 0.5123167541730179, + "grad_norm": 6.031010150909424, + "learning_rate": 5.060343446307872e-05, + "loss": 1.6072, + "step": 6806 + }, + { + "epoch": 0.512392028453678, + "grad_norm": 5.377457141876221, + "learning_rate": 5.05912444419507e-05, + "loss": 1.5968, + "step": 6807 + }, + { + "epoch": 0.5124673027343383, + "grad_norm": 5.374217987060547, + "learning_rate": 5.057905438567488e-05, + "loss": 2.0023, + "step": 6808 + }, + { + "epoch": 0.5125425770149984, + "grad_norm": 4.311944484710693, + "learning_rate": 5.056686429497594e-05, + "loss": 1.8141, + "step": 6809 + }, + { + "epoch": 0.5126178512956585, + "grad_norm": 4.842700958251953, + "learning_rate": 5.055467417057855e-05, + "loss": 2.329, + "step": 6810 + }, + { + "epoch": 0.5126931255763187, + "grad_norm": 4.983015060424805, + "learning_rate": 5.054248401320738e-05, + "loss": 1.659, + "step": 6811 + }, + { + "epoch": 0.5127683998569789, + "grad_norm": 5.216240882873535, + "learning_rate": 5.053029382358708e-05, + "loss": 1.7487, + "step": 6812 + }, + { + "epoch": 0.512843674137639, + "grad_norm": 6.023610591888428, + "learning_rate": 5.0518103602442346e-05, + "loss": 2.4389, + "step": 6813 + }, + { + "epoch": 0.5129189484182992, + "grad_norm": 5.3524556159973145, + "learning_rate": 5.0505913350497825e-05, + "loss": 1.8467, + "step": 6814 + }, + { + "epoch": 0.5129942226989593, + "grad_norm": 4.955999374389648, + "learning_rate": 5.049372306847821e-05, + "loss": 1.9026, + "step": 6815 + }, + { + "epoch": 0.5130694969796195, + "grad_norm": 10.45556640625, + "learning_rate": 5.0481532757108184e-05, + "loss": 2.1947, + "step": 6816 + }, + { + "epoch": 0.5131447712602797, + "grad_norm": 6.260288715362549, + "learning_rate": 5.046934241711242e-05, + "loss": 1.5467, + "step": 6817 + }, + { + "epoch": 0.5132200455409398, + "grad_norm": 7.2607011795043945, + "learning_rate": 5.045715204921559e-05, + "loss": 2.3121, + "step": 6818 + }, + { + "epoch": 0.5132953198215999, + "grad_norm": 7.091415882110596, + "learning_rate": 5.0444961654142385e-05, + "loss": 2.3638, + "step": 6819 + }, + { + "epoch": 0.5133705941022602, + "grad_norm": 5.887112140655518, + "learning_rate": 5.0432771232617484e-05, + "loss": 1.8906, + "step": 6820 + }, + { + "epoch": 0.5134458683829203, + "grad_norm": 5.019084453582764, + "learning_rate": 5.042058078536557e-05, + "loss": 2.0279, + "step": 6821 + }, + { + "epoch": 0.5135211426635804, + "grad_norm": 5.128907203674316, + "learning_rate": 5.040839031311134e-05, + "loss": 1.9665, + "step": 6822 + }, + { + "epoch": 0.5135964169442406, + "grad_norm": 5.571674823760986, + "learning_rate": 5.0396199816579485e-05, + "loss": 1.7181, + "step": 6823 + }, + { + "epoch": 0.5136716912249008, + "grad_norm": 4.340588569641113, + "learning_rate": 5.038400929649466e-05, + "loss": 1.6045, + "step": 6824 + }, + { + "epoch": 0.5137469655055609, + "grad_norm": 4.549193859100342, + "learning_rate": 5.037181875358161e-05, + "loss": 1.9299, + "step": 6825 + }, + { + "epoch": 0.513822239786221, + "grad_norm": 4.369811534881592, + "learning_rate": 5.035962818856499e-05, + "loss": 1.8864, + "step": 6826 + }, + { + "epoch": 0.5138975140668812, + "grad_norm": 5.870934009552002, + "learning_rate": 5.034743760216951e-05, + "loss": 1.8036, + "step": 6827 + }, + { + "epoch": 0.5139727883475413, + "grad_norm": 5.8785858154296875, + "learning_rate": 5.0335246995119864e-05, + "loss": 2.1221, + "step": 6828 + }, + { + "epoch": 0.5140480626282015, + "grad_norm": 9.000893592834473, + "learning_rate": 5.032305636814075e-05, + "loss": 2.2892, + "step": 6829 + }, + { + "epoch": 0.5141233369088617, + "grad_norm": 4.833107948303223, + "learning_rate": 5.0310865721956854e-05, + "loss": 1.9481, + "step": 6830 + }, + { + "epoch": 0.5141986111895218, + "grad_norm": 6.291653156280518, + "learning_rate": 5.029867505729289e-05, + "loss": 1.872, + "step": 6831 + }, + { + "epoch": 0.5142738854701819, + "grad_norm": 4.730236530303955, + "learning_rate": 5.028648437487354e-05, + "loss": 1.8228, + "step": 6832 + }, + { + "epoch": 0.5143491597508422, + "grad_norm": 8.443548202514648, + "learning_rate": 5.0274293675423525e-05, + "loss": 1.9007, + "step": 6833 + }, + { + "epoch": 0.5144244340315023, + "grad_norm": 4.788818836212158, + "learning_rate": 5.0262102959667546e-05, + "loss": 2.1504, + "step": 6834 + }, + { + "epoch": 0.5144997083121624, + "grad_norm": 4.468110084533691, + "learning_rate": 5.024991222833029e-05, + "loss": 1.8219, + "step": 6835 + }, + { + "epoch": 0.5145749825928226, + "grad_norm": 3.749366521835327, + "learning_rate": 5.023772148213646e-05, + "loss": 2.1225, + "step": 6836 + }, + { + "epoch": 0.5146502568734828, + "grad_norm": 4.526574611663818, + "learning_rate": 5.022553072181078e-05, + "loss": 1.8104, + "step": 6837 + }, + { + "epoch": 0.5147255311541429, + "grad_norm": 6.3095550537109375, + "learning_rate": 5.021333994807794e-05, + "loss": 1.742, + "step": 6838 + }, + { + "epoch": 0.5148008054348031, + "grad_norm": 4.514925956726074, + "learning_rate": 5.020114916166265e-05, + "loss": 1.8194, + "step": 6839 + }, + { + "epoch": 0.5148760797154632, + "grad_norm": 4.73873233795166, + "learning_rate": 5.018895836328962e-05, + "loss": 1.6268, + "step": 6840 + }, + { + "epoch": 0.5149513539961234, + "grad_norm": 4.981995582580566, + "learning_rate": 5.017676755368357e-05, + "loss": 1.8439, + "step": 6841 + }, + { + "epoch": 0.5150266282767836, + "grad_norm": 6.351907730102539, + "learning_rate": 5.016457673356917e-05, + "loss": 2.2033, + "step": 6842 + }, + { + "epoch": 0.5151019025574437, + "grad_norm": 4.076376438140869, + "learning_rate": 5.015238590367117e-05, + "loss": 2.0261, + "step": 6843 + }, + { + "epoch": 0.5151771768381038, + "grad_norm": 4.447979927062988, + "learning_rate": 5.014019506471428e-05, + "loss": 1.8087, + "step": 6844 + }, + { + "epoch": 0.5152524511187639, + "grad_norm": 4.561493396759033, + "learning_rate": 5.012800421742318e-05, + "loss": 1.85, + "step": 6845 + }, + { + "epoch": 0.5153277253994242, + "grad_norm": 7.562438488006592, + "learning_rate": 5.0115813362522615e-05, + "loss": 2.1704, + "step": 6846 + }, + { + "epoch": 0.5154029996800843, + "grad_norm": 5.949526309967041, + "learning_rate": 5.0103622500737255e-05, + "loss": 1.629, + "step": 6847 + }, + { + "epoch": 0.5154782739607444, + "grad_norm": 4.262274742126465, + "learning_rate": 5.009143163279184e-05, + "loss": 1.8219, + "step": 6848 + }, + { + "epoch": 0.5155535482414046, + "grad_norm": 5.930945873260498, + "learning_rate": 5.0079240759411085e-05, + "loss": 1.7948, + "step": 6849 + }, + { + "epoch": 0.5156288225220648, + "grad_norm": 7.679771423339844, + "learning_rate": 5.00670498813197e-05, + "loss": 2.1051, + "step": 6850 + }, + { + "epoch": 0.5157040968027249, + "grad_norm": 5.022866249084473, + "learning_rate": 5.005485899924237e-05, + "loss": 1.6155, + "step": 6851 + }, + { + "epoch": 0.5157793710833851, + "grad_norm": 8.224024772644043, + "learning_rate": 5.004266811390385e-05, + "loss": 2.289, + "step": 6852 + }, + { + "epoch": 0.5158546453640452, + "grad_norm": 8.162196159362793, + "learning_rate": 5.0030477226028816e-05, + "loss": 1.8882, + "step": 6853 + }, + { + "epoch": 0.5159299196447054, + "grad_norm": 4.712340831756592, + "learning_rate": 5.0018286336342e-05, + "loss": 1.9848, + "step": 6854 + }, + { + "epoch": 0.5160051939253656, + "grad_norm": 6.082491397857666, + "learning_rate": 5.0006095445568124e-05, + "loss": 2.1184, + "step": 6855 + }, + { + "epoch": 0.5160804682060257, + "grad_norm": 4.365261554718018, + "learning_rate": 4.9993904554431894e-05, + "loss": 1.5409, + "step": 6856 + }, + { + "epoch": 0.5161557424866858, + "grad_norm": 6.525937080383301, + "learning_rate": 4.9981713663658004e-05, + "loss": 1.8849, + "step": 6857 + }, + { + "epoch": 0.5162310167673461, + "grad_norm": 5.986301898956299, + "learning_rate": 4.99695227739712e-05, + "loss": 2.0817, + "step": 6858 + }, + { + "epoch": 0.5163062910480062, + "grad_norm": 5.401782989501953, + "learning_rate": 4.9957331886096164e-05, + "loss": 2.1331, + "step": 6859 + }, + { + "epoch": 0.5163815653286663, + "grad_norm": 6.509993076324463, + "learning_rate": 4.994514100075765e-05, + "loss": 1.7427, + "step": 6860 + }, + { + "epoch": 0.5164568396093265, + "grad_norm": 3.6936700344085693, + "learning_rate": 4.993295011868032e-05, + "loss": 1.8602, + "step": 6861 + }, + { + "epoch": 0.5165321138899867, + "grad_norm": 3.835908889770508, + "learning_rate": 4.992075924058894e-05, + "loss": 1.4762, + "step": 6862 + }, + { + "epoch": 0.5166073881706468, + "grad_norm": 4.697462558746338, + "learning_rate": 4.990856836720817e-05, + "loss": 1.9856, + "step": 6863 + }, + { + "epoch": 0.5166826624513069, + "grad_norm": 6.305080890655518, + "learning_rate": 4.989637749926275e-05, + "loss": 2.0387, + "step": 6864 + }, + { + "epoch": 0.5167579367319671, + "grad_norm": 4.466582775115967, + "learning_rate": 4.9884186637477404e-05, + "loss": 1.6545, + "step": 6865 + }, + { + "epoch": 0.5168332110126272, + "grad_norm": 4.952460289001465, + "learning_rate": 4.9871995782576816e-05, + "loss": 1.7808, + "step": 6866 + }, + { + "epoch": 0.5169084852932874, + "grad_norm": 3.505147933959961, + "learning_rate": 4.9859804935285734e-05, + "loss": 1.7967, + "step": 6867 + }, + { + "epoch": 0.5169837595739476, + "grad_norm": 6.266757965087891, + "learning_rate": 4.984761409632882e-05, + "loss": 2.1579, + "step": 6868 + }, + { + "epoch": 0.5170590338546077, + "grad_norm": 5.485719203948975, + "learning_rate": 4.9835423266430836e-05, + "loss": 2.216, + "step": 6869 + }, + { + "epoch": 0.5171343081352678, + "grad_norm": 5.597513675689697, + "learning_rate": 4.982323244631645e-05, + "loss": 1.6185, + "step": 6870 + }, + { + "epoch": 0.5172095824159281, + "grad_norm": 5.2896599769592285, + "learning_rate": 4.981104163671039e-05, + "loss": 2.0571, + "step": 6871 + }, + { + "epoch": 0.5172848566965882, + "grad_norm": 4.4692559242248535, + "learning_rate": 4.979885083833736e-05, + "loss": 2.026, + "step": 6872 + }, + { + "epoch": 0.5173601309772483, + "grad_norm": 6.023098468780518, + "learning_rate": 4.978666005192208e-05, + "loss": 2.1947, + "step": 6873 + }, + { + "epoch": 0.5174354052579085, + "grad_norm": 4.804433822631836, + "learning_rate": 4.977446927818924e-05, + "loss": 2.0929, + "step": 6874 + }, + { + "epoch": 0.5175106795385687, + "grad_norm": 5.772067546844482, + "learning_rate": 4.976227851786356e-05, + "loss": 1.8992, + "step": 6875 + }, + { + "epoch": 0.5175859538192288, + "grad_norm": 4.229405403137207, + "learning_rate": 4.975008777166973e-05, + "loss": 1.7926, + "step": 6876 + }, + { + "epoch": 0.517661228099889, + "grad_norm": 5.544678688049316, + "learning_rate": 4.973789704033248e-05, + "loss": 1.9406, + "step": 6877 + }, + { + "epoch": 0.5177365023805491, + "grad_norm": 6.918322563171387, + "learning_rate": 4.972570632457648e-05, + "loss": 2.792, + "step": 6878 + }, + { + "epoch": 0.5178117766612093, + "grad_norm": 5.6526618003845215, + "learning_rate": 4.9713515625126475e-05, + "loss": 2.0253, + "step": 6879 + }, + { + "epoch": 0.5178870509418695, + "grad_norm": 4.612674236297607, + "learning_rate": 4.9701324942707125e-05, + "loss": 1.7817, + "step": 6880 + }, + { + "epoch": 0.5179623252225296, + "grad_norm": 3.971609592437744, + "learning_rate": 4.968913427804315e-05, + "loss": 1.5204, + "step": 6881 + }, + { + "epoch": 0.5180375995031897, + "grad_norm": 7.523239612579346, + "learning_rate": 4.967694363185926e-05, + "loss": 1.7766, + "step": 6882 + }, + { + "epoch": 0.5181128737838498, + "grad_norm": 6.461738586425781, + "learning_rate": 4.9664753004880134e-05, + "loss": 1.9031, + "step": 6883 + }, + { + "epoch": 0.5181881480645101, + "grad_norm": 6.4210205078125, + "learning_rate": 4.96525623978305e-05, + "loss": 1.6346, + "step": 6884 + }, + { + "epoch": 0.5182634223451702, + "grad_norm": 5.8029465675354, + "learning_rate": 4.964037181143501e-05, + "loss": 2.3429, + "step": 6885 + }, + { + "epoch": 0.5183386966258303, + "grad_norm": 8.132143020629883, + "learning_rate": 4.96281812464184e-05, + "loss": 1.9798, + "step": 6886 + }, + { + "epoch": 0.5184139709064906, + "grad_norm": 4.619959354400635, + "learning_rate": 4.9615990703505335e-05, + "loss": 1.6911, + "step": 6887 + }, + { + "epoch": 0.5184892451871507, + "grad_norm": 4.716886043548584, + "learning_rate": 4.960380018342055e-05, + "loss": 1.9042, + "step": 6888 + }, + { + "epoch": 0.5185645194678108, + "grad_norm": 4.738988876342773, + "learning_rate": 4.9591609686888676e-05, + "loss": 1.7335, + "step": 6889 + }, + { + "epoch": 0.518639793748471, + "grad_norm": 5.591885089874268, + "learning_rate": 4.9579419214634456e-05, + "loss": 2.0569, + "step": 6890 + }, + { + "epoch": 0.5187150680291311, + "grad_norm": 5.754671096801758, + "learning_rate": 4.9567228767382535e-05, + "loss": 1.6285, + "step": 6891 + }, + { + "epoch": 0.5187903423097913, + "grad_norm": 4.7687668800354, + "learning_rate": 4.955503834585763e-05, + "loss": 2.2987, + "step": 6892 + }, + { + "epoch": 0.5188656165904515, + "grad_norm": 5.1124162673950195, + "learning_rate": 4.954284795078442e-05, + "loss": 1.6412, + "step": 6893 + }, + { + "epoch": 0.5189408908711116, + "grad_norm": 4.056662082672119, + "learning_rate": 4.95306575828876e-05, + "loss": 1.6366, + "step": 6894 + }, + { + "epoch": 0.5190161651517717, + "grad_norm": 6.833881855010986, + "learning_rate": 4.951846724289182e-05, + "loss": 2.1004, + "step": 6895 + }, + { + "epoch": 0.519091439432432, + "grad_norm": 7.1685662269592285, + "learning_rate": 4.9506276931521785e-05, + "loss": 1.8244, + "step": 6896 + }, + { + "epoch": 0.5191667137130921, + "grad_norm": 5.5675950050354, + "learning_rate": 4.949408664950219e-05, + "loss": 1.9033, + "step": 6897 + }, + { + "epoch": 0.5192419879937522, + "grad_norm": 6.878044605255127, + "learning_rate": 4.9481896397557665e-05, + "loss": 2.0798, + "step": 6898 + }, + { + "epoch": 0.5193172622744124, + "grad_norm": 3.768016815185547, + "learning_rate": 4.946970617641292e-05, + "loss": 1.5171, + "step": 6899 + }, + { + "epoch": 0.5193925365550726, + "grad_norm": 3.9038679599761963, + "learning_rate": 4.9457515986792624e-05, + "loss": 1.5763, + "step": 6900 + }, + { + "epoch": 0.5194678108357327, + "grad_norm": 7.663052558898926, + "learning_rate": 4.944532582942146e-05, + "loss": 2.4522, + "step": 6901 + }, + { + "epoch": 0.5195430851163929, + "grad_norm": 4.954900741577148, + "learning_rate": 4.943313570502407e-05, + "loss": 2.4479, + "step": 6902 + }, + { + "epoch": 0.519618359397053, + "grad_norm": 4.997289657592773, + "learning_rate": 4.942094561432513e-05, + "loss": 1.7567, + "step": 6903 + }, + { + "epoch": 0.5196936336777132, + "grad_norm": 6.063836097717285, + "learning_rate": 4.940875555804931e-05, + "loss": 1.9728, + "step": 6904 + }, + { + "epoch": 0.5197689079583733, + "grad_norm": 3.666879415512085, + "learning_rate": 4.93965655369213e-05, + "loss": 1.7522, + "step": 6905 + }, + { + "epoch": 0.5198441822390335, + "grad_norm": 5.604748725891113, + "learning_rate": 4.9384375551665714e-05, + "loss": 2.0205, + "step": 6906 + }, + { + "epoch": 0.5199194565196936, + "grad_norm": 5.2609992027282715, + "learning_rate": 4.9372185603007255e-05, + "loss": 1.7705, + "step": 6907 + }, + { + "epoch": 0.5199947308003537, + "grad_norm": 7.6664509773254395, + "learning_rate": 4.935999569167054e-05, + "loss": 1.9849, + "step": 6908 + }, + { + "epoch": 0.520070005081014, + "grad_norm": 4.170504570007324, + "learning_rate": 4.934780581838026e-05, + "loss": 1.9235, + "step": 6909 + }, + { + "epoch": 0.5201452793616741, + "grad_norm": 5.818812370300293, + "learning_rate": 4.9335615983861034e-05, + "loss": 1.9803, + "step": 6910 + }, + { + "epoch": 0.5202205536423342, + "grad_norm": 18.383562088012695, + "learning_rate": 4.932342618883756e-05, + "loss": 1.7371, + "step": 6911 + }, + { + "epoch": 0.5202958279229944, + "grad_norm": 4.735395908355713, + "learning_rate": 4.931123643403443e-05, + "loss": 1.6755, + "step": 6912 + }, + { + "epoch": 0.5203711022036546, + "grad_norm": 5.558811664581299, + "learning_rate": 4.9299046720176315e-05, + "loss": 1.8114, + "step": 6913 + }, + { + "epoch": 0.5204463764843147, + "grad_norm": 4.932512283325195, + "learning_rate": 4.9286857047987886e-05, + "loss": 1.937, + "step": 6914 + }, + { + "epoch": 0.5205216507649749, + "grad_norm": 3.746288537979126, + "learning_rate": 4.927466741819373e-05, + "loss": 1.8413, + "step": 6915 + }, + { + "epoch": 0.520596925045635, + "grad_norm": 6.8798418045043945, + "learning_rate": 4.926247783151855e-05, + "loss": 1.7361, + "step": 6916 + }, + { + "epoch": 0.5206721993262952, + "grad_norm": 3.8318376541137695, + "learning_rate": 4.925028828868693e-05, + "loss": 1.6954, + "step": 6917 + }, + { + "epoch": 0.5207474736069554, + "grad_norm": 8.702962875366211, + "learning_rate": 4.9238098790423526e-05, + "loss": 2.0857, + "step": 6918 + }, + { + "epoch": 0.5208227478876155, + "grad_norm": 5.833158016204834, + "learning_rate": 4.922590933745295e-05, + "loss": 1.887, + "step": 6919 + }, + { + "epoch": 0.5208980221682756, + "grad_norm": 4.21670389175415, + "learning_rate": 4.921371993049985e-05, + "loss": 2.1758, + "step": 6920 + }, + { + "epoch": 0.5209732964489359, + "grad_norm": 4.305966854095459, + "learning_rate": 4.9201530570288834e-05, + "loss": 1.4069, + "step": 6921 + }, + { + "epoch": 0.521048570729596, + "grad_norm": 7.04738187789917, + "learning_rate": 4.918934125754456e-05, + "loss": 1.6969, + "step": 6922 + }, + { + "epoch": 0.5211238450102561, + "grad_norm": 4.116695880889893, + "learning_rate": 4.917715199299159e-05, + "loss": 1.7867, + "step": 6923 + }, + { + "epoch": 0.5211991192909162, + "grad_norm": 4.401083946228027, + "learning_rate": 4.9164962777354605e-05, + "loss": 1.7388, + "step": 6924 + }, + { + "epoch": 0.5212743935715765, + "grad_norm": 4.131354808807373, + "learning_rate": 4.915277361135817e-05, + "loss": 1.6782, + "step": 6925 + }, + { + "epoch": 0.5213496678522366, + "grad_norm": 11.5966215133667, + "learning_rate": 4.914058449572691e-05, + "loss": 2.3563, + "step": 6926 + }, + { + "epoch": 0.5214249421328967, + "grad_norm": 6.715978145599365, + "learning_rate": 4.912839543118543e-05, + "loss": 1.5925, + "step": 6927 + }, + { + "epoch": 0.5215002164135569, + "grad_norm": 7.157362461090088, + "learning_rate": 4.9116206418458364e-05, + "loss": 1.7972, + "step": 6928 + }, + { + "epoch": 0.521575490694217, + "grad_norm": 8.35208797454834, + "learning_rate": 4.910401745827027e-05, + "loss": 2.2069, + "step": 6929 + }, + { + "epoch": 0.5216507649748772, + "grad_norm": 6.615951061248779, + "learning_rate": 4.909182855134577e-05, + "loss": 2.0758, + "step": 6930 + }, + { + "epoch": 0.5217260392555374, + "grad_norm": 4.61112642288208, + "learning_rate": 4.907963969840946e-05, + "loss": 1.6757, + "step": 6931 + }, + { + "epoch": 0.5218013135361975, + "grad_norm": 4.089064121246338, + "learning_rate": 4.9067450900185926e-05, + "loss": 1.6415, + "step": 6932 + }, + { + "epoch": 0.5218765878168576, + "grad_norm": 5.252660751342773, + "learning_rate": 4.9055262157399776e-05, + "loss": 1.9207, + "step": 6933 + }, + { + "epoch": 0.5219518620975179, + "grad_norm": 6.988577365875244, + "learning_rate": 4.9043073470775556e-05, + "loss": 1.9933, + "step": 6934 + }, + { + "epoch": 0.522027136378178, + "grad_norm": 4.907176971435547, + "learning_rate": 4.90308848410379e-05, + "loss": 1.6469, + "step": 6935 + }, + { + "epoch": 0.5221024106588381, + "grad_norm": 4.790060997009277, + "learning_rate": 4.901869626891133e-05, + "loss": 2.0633, + "step": 6936 + }, + { + "epoch": 0.5221776849394983, + "grad_norm": 6.5458550453186035, + "learning_rate": 4.900650775512047e-05, + "loss": 2.2284, + "step": 6937 + }, + { + "epoch": 0.5222529592201585, + "grad_norm": 7.152644157409668, + "learning_rate": 4.8994319300389865e-05, + "loss": 1.8101, + "step": 6938 + }, + { + "epoch": 0.5223282335008186, + "grad_norm": 6.585931301116943, + "learning_rate": 4.8982130905444104e-05, + "loss": 1.6948, + "step": 6939 + }, + { + "epoch": 0.5224035077814788, + "grad_norm": 6.136294841766357, + "learning_rate": 4.896994257100773e-05, + "loss": 1.9636, + "step": 6940 + }, + { + "epoch": 0.5224787820621389, + "grad_norm": 5.393958568572998, + "learning_rate": 4.8957754297805314e-05, + "loss": 2.249, + "step": 6941 + }, + { + "epoch": 0.5225540563427991, + "grad_norm": 5.075571537017822, + "learning_rate": 4.894556608656141e-05, + "loss": 2.2963, + "step": 6942 + }, + { + "epoch": 0.5226293306234592, + "grad_norm": 8.331381797790527, + "learning_rate": 4.89333779380006e-05, + "loss": 1.5739, + "step": 6943 + }, + { + "epoch": 0.5227046049041194, + "grad_norm": 6.831275939941406, + "learning_rate": 4.892118985284738e-05, + "loss": 1.9118, + "step": 6944 + }, + { + "epoch": 0.5227798791847795, + "grad_norm": 10.060151100158691, + "learning_rate": 4.890900183182633e-05, + "loss": 1.5767, + "step": 6945 + }, + { + "epoch": 0.5228551534654396, + "grad_norm": 11.265748023986816, + "learning_rate": 4.889681387566202e-05, + "loss": 1.9145, + "step": 6946 + }, + { + "epoch": 0.5229304277460999, + "grad_norm": 20.67662811279297, + "learning_rate": 4.888462598507893e-05, + "loss": 2.3029, + "step": 6947 + }, + { + "epoch": 0.52300570202676, + "grad_norm": 10.647335052490234, + "learning_rate": 4.887243816080165e-05, + "loss": 1.6152, + "step": 6948 + }, + { + "epoch": 0.5230809763074201, + "grad_norm": 7.7786784172058105, + "learning_rate": 4.8860250403554665e-05, + "loss": 1.7386, + "step": 6949 + }, + { + "epoch": 0.5231562505880804, + "grad_norm": 66.79920959472656, + "learning_rate": 4.8848062714062544e-05, + "loss": 1.8861, + "step": 6950 + }, + { + "epoch": 0.5232315248687405, + "grad_norm": 16.194293975830078, + "learning_rate": 4.8835875093049764e-05, + "loss": 1.9081, + "step": 6951 + }, + { + "epoch": 0.5233067991494006, + "grad_norm": 10.99862003326416, + "learning_rate": 4.8823687541240896e-05, + "loss": 1.8302, + "step": 6952 + }, + { + "epoch": 0.5233820734300608, + "grad_norm": 6.475612163543701, + "learning_rate": 4.881150005936041e-05, + "loss": 1.8786, + "step": 6953 + }, + { + "epoch": 0.5234573477107209, + "grad_norm": 5.452008247375488, + "learning_rate": 4.879931264813284e-05, + "loss": 1.8106, + "step": 6954 + }, + { + "epoch": 0.5235326219913811, + "grad_norm": 6.2017436027526855, + "learning_rate": 4.8787125308282684e-05, + "loss": 2.0323, + "step": 6955 + }, + { + "epoch": 0.5236078962720413, + "grad_norm": 5.7541279792785645, + "learning_rate": 4.877493804053446e-05, + "loss": 1.8911, + "step": 6956 + }, + { + "epoch": 0.5236831705527014, + "grad_norm": 8.359824180603027, + "learning_rate": 4.8762750845612646e-05, + "loss": 1.7161, + "step": 6957 + }, + { + "epoch": 0.5237584448333615, + "grad_norm": 4.140836238861084, + "learning_rate": 4.8750563724241745e-05, + "loss": 1.6101, + "step": 6958 + }, + { + "epoch": 0.5238337191140218, + "grad_norm": 6.034945011138916, + "learning_rate": 4.8738376677146234e-05, + "loss": 2.0993, + "step": 6959 + }, + { + "epoch": 0.5239089933946819, + "grad_norm": 4.626430511474609, + "learning_rate": 4.872618970505063e-05, + "loss": 1.752, + "step": 6960 + }, + { + "epoch": 0.523984267675342, + "grad_norm": 3.849564552307129, + "learning_rate": 4.8714002808679384e-05, + "loss": 1.6206, + "step": 6961 + }, + { + "epoch": 0.5240595419560021, + "grad_norm": 5.5037455558776855, + "learning_rate": 4.870181598875697e-05, + "loss": 1.7479, + "step": 6962 + }, + { + "epoch": 0.5241348162366624, + "grad_norm": 4.773678302764893, + "learning_rate": 4.8689629246007894e-05, + "loss": 2.0511, + "step": 6963 + }, + { + "epoch": 0.5242100905173225, + "grad_norm": 5.401362895965576, + "learning_rate": 4.867744258115658e-05, + "loss": 1.8812, + "step": 6964 + }, + { + "epoch": 0.5242853647979826, + "grad_norm": 5.2645063400268555, + "learning_rate": 4.8665255994927515e-05, + "loss": 1.7538, + "step": 6965 + }, + { + "epoch": 0.5243606390786428, + "grad_norm": 5.374499797821045, + "learning_rate": 4.865306948804515e-05, + "loss": 1.5407, + "step": 6966 + }, + { + "epoch": 0.524435913359303, + "grad_norm": 5.339637756347656, + "learning_rate": 4.8640883061233954e-05, + "loss": 1.8745, + "step": 6967 + }, + { + "epoch": 0.5245111876399631, + "grad_norm": 4.9777936935424805, + "learning_rate": 4.8628696715218344e-05, + "loss": 1.548, + "step": 6968 + }, + { + "epoch": 0.5245864619206233, + "grad_norm": 4.010859489440918, + "learning_rate": 4.8616510450722784e-05, + "loss": 1.9453, + "step": 6969 + }, + { + "epoch": 0.5246617362012834, + "grad_norm": 4.428409099578857, + "learning_rate": 4.860432426847171e-05, + "loss": 1.8392, + "step": 6970 + }, + { + "epoch": 0.5247370104819435, + "grad_norm": 4.118762016296387, + "learning_rate": 4.8592138169189555e-05, + "loss": 1.6973, + "step": 6971 + }, + { + "epoch": 0.5248122847626038, + "grad_norm": 5.352478981018066, + "learning_rate": 4.857995215360073e-05, + "loss": 1.7932, + "step": 6972 + }, + { + "epoch": 0.5248875590432639, + "grad_norm": 7.0646562576293945, + "learning_rate": 4.8567766222429706e-05, + "loss": 2.2301, + "step": 6973 + }, + { + "epoch": 0.524962833323924, + "grad_norm": 5.086212158203125, + "learning_rate": 4.855558037640085e-05, + "loss": 2.142, + "step": 6974 + }, + { + "epoch": 0.5250381076045842, + "grad_norm": 5.054366588592529, + "learning_rate": 4.85433946162386e-05, + "loss": 1.6863, + "step": 6975 + }, + { + "epoch": 0.5251133818852444, + "grad_norm": 5.074423789978027, + "learning_rate": 4.853120894266735e-05, + "loss": 1.8816, + "step": 6976 + }, + { + "epoch": 0.5251886561659045, + "grad_norm": 4.513974189758301, + "learning_rate": 4.8519023356411505e-05, + "loss": 1.6312, + "step": 6977 + }, + { + "epoch": 0.5252639304465647, + "grad_norm": 4.997650623321533, + "learning_rate": 4.850683785819551e-05, + "loss": 1.8712, + "step": 6978 + }, + { + "epoch": 0.5253392047272248, + "grad_norm": 6.093069553375244, + "learning_rate": 4.849465244874369e-05, + "loss": 2.0122, + "step": 6979 + }, + { + "epoch": 0.525414479007885, + "grad_norm": 6.122801303863525, + "learning_rate": 4.848246712878048e-05, + "loss": 1.8779, + "step": 6980 + }, + { + "epoch": 0.5254897532885452, + "grad_norm": 5.72458028793335, + "learning_rate": 4.847028189903024e-05, + "loss": 1.9705, + "step": 6981 + }, + { + "epoch": 0.5255650275692053, + "grad_norm": 5.365975379943848, + "learning_rate": 4.845809676021735e-05, + "loss": 1.6608, + "step": 6982 + }, + { + "epoch": 0.5256403018498654, + "grad_norm": 4.129518032073975, + "learning_rate": 4.8445911713066176e-05, + "loss": 1.7577, + "step": 6983 + }, + { + "epoch": 0.5257155761305256, + "grad_norm": 4.705109119415283, + "learning_rate": 4.8433726758301104e-05, + "loss": 1.6422, + "step": 6984 + }, + { + "epoch": 0.5257908504111858, + "grad_norm": 6.8828558921813965, + "learning_rate": 4.8421541896646475e-05, + "loss": 1.6365, + "step": 6985 + }, + { + "epoch": 0.5258661246918459, + "grad_norm": 5.2355804443359375, + "learning_rate": 4.840935712882666e-05, + "loss": 1.6857, + "step": 6986 + }, + { + "epoch": 0.525941398972506, + "grad_norm": 4.676090240478516, + "learning_rate": 4.839717245556597e-05, + "loss": 1.8799, + "step": 6987 + }, + { + "epoch": 0.5260166732531663, + "grad_norm": 4.370206832885742, + "learning_rate": 4.8384987877588823e-05, + "loss": 1.9023, + "step": 6988 + }, + { + "epoch": 0.5260919475338264, + "grad_norm": 4.9542341232299805, + "learning_rate": 4.8372803395619484e-05, + "loss": 1.8961, + "step": 6989 + }, + { + "epoch": 0.5261672218144865, + "grad_norm": 5.3305816650390625, + "learning_rate": 4.836061901038233e-05, + "loss": 1.8018, + "step": 6990 + }, + { + "epoch": 0.5262424960951467, + "grad_norm": 6.1477789878845215, + "learning_rate": 4.834843472260165e-05, + "loss": 2.1026, + "step": 6991 + }, + { + "epoch": 0.5263177703758068, + "grad_norm": 4.607006549835205, + "learning_rate": 4.83362505330018e-05, + "loss": 2.1647, + "step": 6992 + }, + { + "epoch": 0.526393044656467, + "grad_norm": 4.899542808532715, + "learning_rate": 4.832406644230708e-05, + "loss": 1.8749, + "step": 6993 + }, + { + "epoch": 0.5264683189371272, + "grad_norm": 5.408762454986572, + "learning_rate": 4.831188245124179e-05, + "loss": 2.1989, + "step": 6994 + }, + { + "epoch": 0.5265435932177873, + "grad_norm": 5.334831237792969, + "learning_rate": 4.829969856053027e-05, + "loss": 1.7056, + "step": 6995 + }, + { + "epoch": 0.5266188674984474, + "grad_norm": 4.125281810760498, + "learning_rate": 4.8287514770896774e-05, + "loss": 1.7697, + "step": 6996 + }, + { + "epoch": 0.5266941417791077, + "grad_norm": 4.881189823150635, + "learning_rate": 4.827533108306563e-05, + "loss": 1.9636, + "step": 6997 + }, + { + "epoch": 0.5267694160597678, + "grad_norm": 6.621417999267578, + "learning_rate": 4.826314749776108e-05, + "loss": 1.6707, + "step": 6998 + }, + { + "epoch": 0.5268446903404279, + "grad_norm": 5.049466133117676, + "learning_rate": 4.825096401570744e-05, + "loss": 1.8097, + "step": 6999 + }, + { + "epoch": 0.5269199646210881, + "grad_norm": 4.564596176147461, + "learning_rate": 4.823878063762896e-05, + "loss": 1.7809, + "step": 7000 + }, + { + "epoch": 0.5269952389017483, + "grad_norm": 6.0483269691467285, + "learning_rate": 4.822659736424994e-05, + "loss": 1.7232, + "step": 7001 + }, + { + "epoch": 0.5270705131824084, + "grad_norm": 4.390514850616455, + "learning_rate": 4.82144141962946e-05, + "loss": 1.6616, + "step": 7002 + }, + { + "epoch": 0.5271457874630685, + "grad_norm": 3.8221426010131836, + "learning_rate": 4.820223113448722e-05, + "loss": 1.6716, + "step": 7003 + }, + { + "epoch": 0.5272210617437287, + "grad_norm": 5.722057819366455, + "learning_rate": 4.819004817955203e-05, + "loss": 1.6935, + "step": 7004 + }, + { + "epoch": 0.5272963360243889, + "grad_norm": 6.698046684265137, + "learning_rate": 4.8177865332213304e-05, + "loss": 1.7554, + "step": 7005 + }, + { + "epoch": 0.527371610305049, + "grad_norm": 3.6555323600769043, + "learning_rate": 4.8165682593195234e-05, + "loss": 1.6882, + "step": 7006 + }, + { + "epoch": 0.5274468845857092, + "grad_norm": 5.007680416107178, + "learning_rate": 4.815349996322209e-05, + "loss": 2.0708, + "step": 7007 + }, + { + "epoch": 0.5275221588663693, + "grad_norm": 3.7382116317749023, + "learning_rate": 4.814131744301806e-05, + "loss": 1.7423, + "step": 7008 + }, + { + "epoch": 0.5275974331470294, + "grad_norm": 7.774674415588379, + "learning_rate": 4.8129135033307364e-05, + "loss": 2.0451, + "step": 7009 + }, + { + "epoch": 0.5276727074276897, + "grad_norm": 6.460090637207031, + "learning_rate": 4.8116952734814244e-05, + "loss": 2.2039, + "step": 7010 + }, + { + "epoch": 0.5277479817083498, + "grad_norm": 4.0191731452941895, + "learning_rate": 4.810477054826286e-05, + "loss": 1.9601, + "step": 7011 + }, + { + "epoch": 0.5278232559890099, + "grad_norm": 5.635463714599609, + "learning_rate": 4.8092588474377456e-05, + "loss": 2.2001, + "step": 7012 + }, + { + "epoch": 0.5278985302696702, + "grad_norm": 5.282304763793945, + "learning_rate": 4.8080406513882175e-05, + "loss": 1.8133, + "step": 7013 + }, + { + "epoch": 0.5279738045503303, + "grad_norm": 5.59666109085083, + "learning_rate": 4.8068224667501225e-05, + "loss": 1.8089, + "step": 7014 + }, + { + "epoch": 0.5280490788309904, + "grad_norm": 5.204546928405762, + "learning_rate": 4.8056042935958766e-05, + "loss": 1.7156, + "step": 7015 + }, + { + "epoch": 0.5281243531116506, + "grad_norm": 5.108616352081299, + "learning_rate": 4.8043861319979e-05, + "loss": 1.5924, + "step": 7016 + }, + { + "epoch": 0.5281996273923107, + "grad_norm": 6.9872026443481445, + "learning_rate": 4.8031679820286044e-05, + "loss": 1.7737, + "step": 7017 + }, + { + "epoch": 0.5282749016729709, + "grad_norm": 4.594210147857666, + "learning_rate": 4.80194984376041e-05, + "loss": 1.6782, + "step": 7018 + }, + { + "epoch": 0.5283501759536311, + "grad_norm": 8.959563255310059, + "learning_rate": 4.800731717265726e-05, + "loss": 2.4159, + "step": 7019 + }, + { + "epoch": 0.5284254502342912, + "grad_norm": 6.792599678039551, + "learning_rate": 4.7995136026169716e-05, + "loss": 1.6504, + "step": 7020 + }, + { + "epoch": 0.5285007245149513, + "grad_norm": 4.816525459289551, + "learning_rate": 4.798295499886557e-05, + "loss": 1.6979, + "step": 7021 + }, + { + "epoch": 0.5285759987956115, + "grad_norm": 4.310568332672119, + "learning_rate": 4.797077409146898e-05, + "loss": 1.9567, + "step": 7022 + }, + { + "epoch": 0.5286512730762717, + "grad_norm": 3.7488651275634766, + "learning_rate": 4.795859330470402e-05, + "loss": 1.7453, + "step": 7023 + }, + { + "epoch": 0.5287265473569318, + "grad_norm": 3.877856969833374, + "learning_rate": 4.794641263929487e-05, + "loss": 1.5974, + "step": 7024 + }, + { + "epoch": 0.5288018216375919, + "grad_norm": 4.42359733581543, + "learning_rate": 4.793423209596557e-05, + "loss": 1.8277, + "step": 7025 + }, + { + "epoch": 0.5288770959182522, + "grad_norm": 4.096857070922852, + "learning_rate": 4.792205167544023e-05, + "loss": 1.7405, + "step": 7026 + }, + { + "epoch": 0.5289523701989123, + "grad_norm": 4.363656044006348, + "learning_rate": 4.790987137844298e-05, + "loss": 1.576, + "step": 7027 + }, + { + "epoch": 0.5290276444795724, + "grad_norm": 4.442607402801514, + "learning_rate": 4.789769120569786e-05, + "loss": 1.2785, + "step": 7028 + }, + { + "epoch": 0.5291029187602326, + "grad_norm": 4.370973110198975, + "learning_rate": 4.788551115792899e-05, + "loss": 1.6854, + "step": 7029 + }, + { + "epoch": 0.5291781930408928, + "grad_norm": 4.262331008911133, + "learning_rate": 4.7873331235860396e-05, + "loss": 1.8263, + "step": 7030 + }, + { + "epoch": 0.5292534673215529, + "grad_norm": 4.139841556549072, + "learning_rate": 4.786115144021617e-05, + "loss": 1.9565, + "step": 7031 + }, + { + "epoch": 0.5293287416022131, + "grad_norm": 5.7718377113342285, + "learning_rate": 4.784897177172034e-05, + "loss": 2.336, + "step": 7032 + }, + { + "epoch": 0.5294040158828732, + "grad_norm": 4.2104716300964355, + "learning_rate": 4.783679223109699e-05, + "loss": 1.6549, + "step": 7033 + }, + { + "epoch": 0.5294792901635333, + "grad_norm": 4.143003940582275, + "learning_rate": 4.782461281907011e-05, + "loss": 1.3662, + "step": 7034 + }, + { + "epoch": 0.5295545644441936, + "grad_norm": 6.713183403015137, + "learning_rate": 4.781243353636378e-05, + "loss": 2.0556, + "step": 7035 + }, + { + "epoch": 0.5296298387248537, + "grad_norm": 4.65440559387207, + "learning_rate": 4.780025438370197e-05, + "loss": 1.7405, + "step": 7036 + }, + { + "epoch": 0.5297051130055138, + "grad_norm": 6.032905101776123, + "learning_rate": 4.7788075361808743e-05, + "loss": 1.9212, + "step": 7037 + }, + { + "epoch": 0.529780387286174, + "grad_norm": 3.625398635864258, + "learning_rate": 4.7775896471408076e-05, + "loss": 1.5621, + "step": 7038 + }, + { + "epoch": 0.5298556615668342, + "grad_norm": 5.455428600311279, + "learning_rate": 4.776371771322401e-05, + "loss": 1.9143, + "step": 7039 + }, + { + "epoch": 0.5299309358474943, + "grad_norm": 5.555819034576416, + "learning_rate": 4.7751539087980484e-05, + "loss": 1.5025, + "step": 7040 + }, + { + "epoch": 0.5300062101281544, + "grad_norm": 5.401242256164551, + "learning_rate": 4.7739360596401505e-05, + "loss": 1.9289, + "step": 7041 + }, + { + "epoch": 0.5300814844088146, + "grad_norm": 6.3931732177734375, + "learning_rate": 4.7727182239211066e-05, + "loss": 1.8073, + "step": 7042 + }, + { + "epoch": 0.5301567586894748, + "grad_norm": 7.2550272941589355, + "learning_rate": 4.771500401713311e-05, + "loss": 1.9789, + "step": 7043 + }, + { + "epoch": 0.5302320329701349, + "grad_norm": 6.63098669052124, + "learning_rate": 4.770282593089162e-05, + "loss": 1.8565, + "step": 7044 + }, + { + "epoch": 0.5303073072507951, + "grad_norm": 4.709261417388916, + "learning_rate": 4.769064798121054e-05, + "loss": 1.9746, + "step": 7045 + }, + { + "epoch": 0.5303825815314552, + "grad_norm": 4.143810749053955, + "learning_rate": 4.767847016881381e-05, + "loss": 1.8047, + "step": 7046 + }, + { + "epoch": 0.5304578558121154, + "grad_norm": 6.855062484741211, + "learning_rate": 4.7666292494425355e-05, + "loss": 1.8527, + "step": 7047 + }, + { + "epoch": 0.5305331300927756, + "grad_norm": 4.31651496887207, + "learning_rate": 4.765411495876912e-05, + "loss": 1.7567, + "step": 7048 + }, + { + "epoch": 0.5306084043734357, + "grad_norm": 5.476384162902832, + "learning_rate": 4.764193756256902e-05, + "loss": 1.4019, + "step": 7049 + }, + { + "epoch": 0.5306836786540958, + "grad_norm": 4.62022590637207, + "learning_rate": 4.7629760306548975e-05, + "loss": 1.8531, + "step": 7050 + }, + { + "epoch": 0.5307589529347561, + "grad_norm": 4.353903770446777, + "learning_rate": 4.761758319143287e-05, + "loss": 1.6894, + "step": 7051 + }, + { + "epoch": 0.5308342272154162, + "grad_norm": 4.936910629272461, + "learning_rate": 4.760540621794462e-05, + "loss": 1.7424, + "step": 7052 + }, + { + "epoch": 0.5309095014960763, + "grad_norm": 5.773194789886475, + "learning_rate": 4.759322938680808e-05, + "loss": 1.6638, + "step": 7053 + }, + { + "epoch": 0.5309847757767365, + "grad_norm": 5.201620578765869, + "learning_rate": 4.758105269874716e-05, + "loss": 1.5271, + "step": 7054 + }, + { + "epoch": 0.5310600500573966, + "grad_norm": 4.860100269317627, + "learning_rate": 4.75688761544857e-05, + "loss": 1.822, + "step": 7055 + }, + { + "epoch": 0.5311353243380568, + "grad_norm": 4.516215801239014, + "learning_rate": 4.7556699754747605e-05, + "loss": 1.8609, + "step": 7056 + }, + { + "epoch": 0.531210598618717, + "grad_norm": 5.856453895568848, + "learning_rate": 4.754452350025668e-05, + "loss": 1.5506, + "step": 7057 + }, + { + "epoch": 0.5312858728993771, + "grad_norm": 6.175862789154053, + "learning_rate": 4.753234739173678e-05, + "loss": 2.1066, + "step": 7058 + }, + { + "epoch": 0.5313611471800372, + "grad_norm": 5.485408306121826, + "learning_rate": 4.7520171429911755e-05, + "loss": 1.9859, + "step": 7059 + }, + { + "epoch": 0.5314364214606974, + "grad_norm": 6.727893352508545, + "learning_rate": 4.750799561550542e-05, + "loss": 1.7683, + "step": 7060 + }, + { + "epoch": 0.5315116957413576, + "grad_norm": 5.950839042663574, + "learning_rate": 4.749581994924161e-05, + "loss": 2.1551, + "step": 7061 + }, + { + "epoch": 0.5315869700220177, + "grad_norm": 4.195001602172852, + "learning_rate": 4.748364443184411e-05, + "loss": 1.9197, + "step": 7062 + }, + { + "epoch": 0.5316622443026778, + "grad_norm": 7.756577014923096, + "learning_rate": 4.747146906403674e-05, + "loss": 1.8342, + "step": 7063 + }, + { + "epoch": 0.5317375185833381, + "grad_norm": 5.636590003967285, + "learning_rate": 4.7459293846543256e-05, + "loss": 2.1996, + "step": 7064 + }, + { + "epoch": 0.5318127928639982, + "grad_norm": 4.685662746429443, + "learning_rate": 4.744711878008748e-05, + "loss": 1.836, + "step": 7065 + }, + { + "epoch": 0.5318880671446583, + "grad_norm": 4.575371742248535, + "learning_rate": 4.743494386539316e-05, + "loss": 2.1747, + "step": 7066 + }, + { + "epoch": 0.5319633414253185, + "grad_norm": 5.661198616027832, + "learning_rate": 4.7422769103184074e-05, + "loss": 2.2177, + "step": 7067 + }, + { + "epoch": 0.5320386157059787, + "grad_norm": 4.978537082672119, + "learning_rate": 4.741059449418396e-05, + "loss": 2.3386, + "step": 7068 + }, + { + "epoch": 0.5321138899866388, + "grad_norm": 3.9368650913238525, + "learning_rate": 4.7398420039116576e-05, + "loss": 1.4963, + "step": 7069 + }, + { + "epoch": 0.532189164267299, + "grad_norm": 5.474009990692139, + "learning_rate": 4.738624573870565e-05, + "loss": 1.591, + "step": 7070 + }, + { + "epoch": 0.5322644385479591, + "grad_norm": 5.0814127922058105, + "learning_rate": 4.737407159367494e-05, + "loss": 1.9219, + "step": 7071 + }, + { + "epoch": 0.5323397128286192, + "grad_norm": 5.447683334350586, + "learning_rate": 4.7361897604748114e-05, + "loss": 1.9954, + "step": 7072 + }, + { + "epoch": 0.5324149871092795, + "grad_norm": 5.200130462646484, + "learning_rate": 4.734972377264892e-05, + "loss": 1.8311, + "step": 7073 + }, + { + "epoch": 0.5324902613899396, + "grad_norm": 9.71096134185791, + "learning_rate": 4.7337550098101034e-05, + "loss": 1.8673, + "step": 7074 + }, + { + "epoch": 0.5325655356705997, + "grad_norm": 10.20805549621582, + "learning_rate": 4.732537658182815e-05, + "loss": 2.2654, + "step": 7075 + }, + { + "epoch": 0.53264080995126, + "grad_norm": 5.923792362213135, + "learning_rate": 4.7313203224553965e-05, + "loss": 1.7081, + "step": 7076 + }, + { + "epoch": 0.5327160842319201, + "grad_norm": 5.313742637634277, + "learning_rate": 4.730103002700213e-05, + "loss": 1.9851, + "step": 7077 + }, + { + "epoch": 0.5327913585125802, + "grad_norm": 4.714290618896484, + "learning_rate": 4.7288856989896336e-05, + "loss": 1.5335, + "step": 7078 + }, + { + "epoch": 0.5328666327932404, + "grad_norm": 5.395437240600586, + "learning_rate": 4.727668411396019e-05, + "loss": 1.6475, + "step": 7079 + }, + { + "epoch": 0.5329419070739005, + "grad_norm": 4.499417781829834, + "learning_rate": 4.726451139991738e-05, + "loss": 1.8595, + "step": 7080 + }, + { + "epoch": 0.5330171813545607, + "grad_norm": 4.509920597076416, + "learning_rate": 4.725233884849151e-05, + "loss": 2.1105, + "step": 7081 + }, + { + "epoch": 0.5330924556352208, + "grad_norm": 4.62795877456665, + "learning_rate": 4.724016646040621e-05, + "loss": 1.7669, + "step": 7082 + }, + { + "epoch": 0.533167729915881, + "grad_norm": 4.693404674530029, + "learning_rate": 4.722799423638509e-05, + "loss": 1.7674, + "step": 7083 + }, + { + "epoch": 0.5332430041965411, + "grad_norm": 3.549790382385254, + "learning_rate": 4.721582217715177e-05, + "loss": 1.8806, + "step": 7084 + }, + { + "epoch": 0.5333182784772013, + "grad_norm": 5.290420055389404, + "learning_rate": 4.720365028342982e-05, + "loss": 1.8818, + "step": 7085 + }, + { + "epoch": 0.5333935527578615, + "grad_norm": 4.46742582321167, + "learning_rate": 4.719147855594285e-05, + "loss": 1.9148, + "step": 7086 + }, + { + "epoch": 0.5334688270385216, + "grad_norm": 5.571162223815918, + "learning_rate": 4.7179306995414404e-05, + "loss": 1.9953, + "step": 7087 + }, + { + "epoch": 0.5335441013191817, + "grad_norm": 4.64400577545166, + "learning_rate": 4.716713560256809e-05, + "loss": 1.8739, + "step": 7088 + }, + { + "epoch": 0.533619375599842, + "grad_norm": 6.43136739730835, + "learning_rate": 4.715496437812741e-05, + "loss": 2.093, + "step": 7089 + }, + { + "epoch": 0.5336946498805021, + "grad_norm": 4.113124370574951, + "learning_rate": 4.714279332281594e-05, + "loss": 1.8901, + "step": 7090 + }, + { + "epoch": 0.5337699241611622, + "grad_norm": 5.140697956085205, + "learning_rate": 4.713062243735722e-05, + "loss": 1.8444, + "step": 7091 + }, + { + "epoch": 0.5338451984418224, + "grad_norm": 5.672872543334961, + "learning_rate": 4.711845172247475e-05, + "loss": 1.9507, + "step": 7092 + }, + { + "epoch": 0.5339204727224826, + "grad_norm": 4.760066986083984, + "learning_rate": 4.710628117889207e-05, + "loss": 1.6674, + "step": 7093 + }, + { + "epoch": 0.5339957470031427, + "grad_norm": 4.440094470977783, + "learning_rate": 4.7094110807332656e-05, + "loss": 1.9026, + "step": 7094 + }, + { + "epoch": 0.5340710212838029, + "grad_norm": 5.391696453094482, + "learning_rate": 4.708194060852004e-05, + "loss": 1.9004, + "step": 7095 + }, + { + "epoch": 0.534146295564463, + "grad_norm": 5.722980976104736, + "learning_rate": 4.706977058317766e-05, + "loss": 1.8074, + "step": 7096 + }, + { + "epoch": 0.5342215698451231, + "grad_norm": 4.9427900314331055, + "learning_rate": 4.7057600732029016e-05, + "loss": 1.691, + "step": 7097 + }, + { + "epoch": 0.5342968441257834, + "grad_norm": 5.230165004730225, + "learning_rate": 4.704543105579757e-05, + "loss": 2.0287, + "step": 7098 + }, + { + "epoch": 0.5343721184064435, + "grad_norm": 6.101943016052246, + "learning_rate": 4.7033261555206776e-05, + "loss": 1.9988, + "step": 7099 + }, + { + "epoch": 0.5344473926871036, + "grad_norm": 6.258819580078125, + "learning_rate": 4.702109223098005e-05, + "loss": 1.6022, + "step": 7100 + }, + { + "epoch": 0.5345226669677637, + "grad_norm": 5.994137287139893, + "learning_rate": 4.700892308384087e-05, + "loss": 2.2714, + "step": 7101 + }, + { + "epoch": 0.534597941248424, + "grad_norm": 4.317666530609131, + "learning_rate": 4.699675411451261e-05, + "loss": 1.8352, + "step": 7102 + }, + { + "epoch": 0.5346732155290841, + "grad_norm": 4.5896315574646, + "learning_rate": 4.698458532371871e-05, + "loss": 1.9402, + "step": 7103 + }, + { + "epoch": 0.5347484898097442, + "grad_norm": 7.177721977233887, + "learning_rate": 4.6972416712182546e-05, + "loss": 2.0471, + "step": 7104 + }, + { + "epoch": 0.5348237640904044, + "grad_norm": 4.990874767303467, + "learning_rate": 4.696024828062755e-05, + "loss": 1.975, + "step": 7105 + }, + { + "epoch": 0.5348990383710646, + "grad_norm": 3.862558364868164, + "learning_rate": 4.694808002977704e-05, + "loss": 1.7455, + "step": 7106 + }, + { + "epoch": 0.5349743126517247, + "grad_norm": 5.713701248168945, + "learning_rate": 4.6935911960354425e-05, + "loss": 1.8903, + "step": 7107 + }, + { + "epoch": 0.5350495869323849, + "grad_norm": 4.977960109710693, + "learning_rate": 4.692374407308307e-05, + "loss": 1.488, + "step": 7108 + }, + { + "epoch": 0.535124861213045, + "grad_norm": 6.108043193817139, + "learning_rate": 4.6911576368686284e-05, + "loss": 1.7804, + "step": 7109 + }, + { + "epoch": 0.5352001354937052, + "grad_norm": 6.555980682373047, + "learning_rate": 4.689940884788743e-05, + "loss": 1.6373, + "step": 7110 + }, + { + "epoch": 0.5352754097743654, + "grad_norm": 6.220038890838623, + "learning_rate": 4.688724151140981e-05, + "loss": 1.9791, + "step": 7111 + }, + { + "epoch": 0.5353506840550255, + "grad_norm": 6.495673656463623, + "learning_rate": 4.6875074359976775e-05, + "loss": 1.7855, + "step": 7112 + }, + { + "epoch": 0.5354259583356856, + "grad_norm": 4.918026447296143, + "learning_rate": 4.686290739431158e-05, + "loss": 2.7851, + "step": 7113 + }, + { + "epoch": 0.5355012326163459, + "grad_norm": 4.212944507598877, + "learning_rate": 4.685074061513755e-05, + "loss": 1.5775, + "step": 7114 + }, + { + "epoch": 0.535576506897006, + "grad_norm": 5.814725875854492, + "learning_rate": 4.6838574023177936e-05, + "loss": 2.0929, + "step": 7115 + }, + { + "epoch": 0.5356517811776661, + "grad_norm": 6.479356288909912, + "learning_rate": 4.682640761915606e-05, + "loss": 1.8572, + "step": 7116 + }, + { + "epoch": 0.5357270554583263, + "grad_norm": 5.799812316894531, + "learning_rate": 4.6814241403795116e-05, + "loss": 1.9715, + "step": 7117 + }, + { + "epoch": 0.5358023297389864, + "grad_norm": 4.480240821838379, + "learning_rate": 4.6802075377818404e-05, + "loss": 1.8021, + "step": 7118 + }, + { + "epoch": 0.5358776040196466, + "grad_norm": 4.874342918395996, + "learning_rate": 4.678990954194911e-05, + "loss": 1.7762, + "step": 7119 + }, + { + "epoch": 0.5359528783003067, + "grad_norm": 6.1596293449401855, + "learning_rate": 4.67777438969105e-05, + "loss": 1.8206, + "step": 7120 + }, + { + "epoch": 0.5360281525809669, + "grad_norm": 6.481925964355469, + "learning_rate": 4.676557844342576e-05, + "loss": 1.9858, + "step": 7121 + }, + { + "epoch": 0.536103426861627, + "grad_norm": 4.077095985412598, + "learning_rate": 4.675341318221809e-05, + "loss": 1.8822, + "step": 7122 + }, + { + "epoch": 0.5361787011422872, + "grad_norm": 4.741276264190674, + "learning_rate": 4.674124811401071e-05, + "loss": 1.8454, + "step": 7123 + }, + { + "epoch": 0.5362539754229474, + "grad_norm": 7.0525970458984375, + "learning_rate": 4.6729083239526775e-05, + "loss": 1.7784, + "step": 7124 + }, + { + "epoch": 0.5363292497036075, + "grad_norm": 5.131325721740723, + "learning_rate": 4.671691855948947e-05, + "loss": 2.0409, + "step": 7125 + }, + { + "epoch": 0.5364045239842676, + "grad_norm": 4.6899638175964355, + "learning_rate": 4.670475407462191e-05, + "loss": 1.8881, + "step": 7126 + }, + { + "epoch": 0.5364797982649279, + "grad_norm": 4.911111831665039, + "learning_rate": 4.669258978564728e-05, + "loss": 2.0297, + "step": 7127 + }, + { + "epoch": 0.536555072545588, + "grad_norm": 3.9518191814422607, + "learning_rate": 4.6680425693288684e-05, + "loss": 1.7745, + "step": 7128 + }, + { + "epoch": 0.5366303468262481, + "grad_norm": 4.593074798583984, + "learning_rate": 4.666826179826928e-05, + "loss": 1.7241, + "step": 7129 + }, + { + "epoch": 0.5367056211069083, + "grad_norm": 4.520386219024658, + "learning_rate": 4.665609810131213e-05, + "loss": 1.6282, + "step": 7130 + }, + { + "epoch": 0.5367808953875685, + "grad_norm": 5.2434468269348145, + "learning_rate": 4.664393460314036e-05, + "loss": 1.964, + "step": 7131 + }, + { + "epoch": 0.5368561696682286, + "grad_norm": 7.866587162017822, + "learning_rate": 4.663177130447705e-05, + "loss": 2.1877, + "step": 7132 + }, + { + "epoch": 0.5369314439488888, + "grad_norm": 6.069332599639893, + "learning_rate": 4.6619608206045276e-05, + "loss": 2.3246, + "step": 7133 + }, + { + "epoch": 0.5370067182295489, + "grad_norm": 5.403286457061768, + "learning_rate": 4.6607445308568085e-05, + "loss": 2.083, + "step": 7134 + }, + { + "epoch": 0.537081992510209, + "grad_norm": 4.690673828125, + "learning_rate": 4.659528261276855e-05, + "loss": 2.0188, + "step": 7135 + }, + { + "epoch": 0.5371572667908693, + "grad_norm": 4.797272205352783, + "learning_rate": 4.658312011936968e-05, + "loss": 1.8803, + "step": 7136 + }, + { + "epoch": 0.5372325410715294, + "grad_norm": 5.583282947540283, + "learning_rate": 4.6570957829094525e-05, + "loss": 1.7108, + "step": 7137 + }, + { + "epoch": 0.5373078153521895, + "grad_norm": 6.557413578033447, + "learning_rate": 4.655879574266607e-05, + "loss": 1.8773, + "step": 7138 + }, + { + "epoch": 0.5373830896328496, + "grad_norm": 4.59459924697876, + "learning_rate": 4.6546633860807346e-05, + "loss": 1.9706, + "step": 7139 + }, + { + "epoch": 0.5374583639135099, + "grad_norm": 3.5619723796844482, + "learning_rate": 4.653447218424134e-05, + "loss": 1.8086, + "step": 7140 + }, + { + "epoch": 0.53753363819417, + "grad_norm": 5.119499206542969, + "learning_rate": 4.6522310713690995e-05, + "loss": 2.0314, + "step": 7141 + }, + { + "epoch": 0.5376089124748301, + "grad_norm": 5.700216770172119, + "learning_rate": 4.6510149449879325e-05, + "loss": 2.0707, + "step": 7142 + }, + { + "epoch": 0.5376841867554903, + "grad_norm": 4.273297309875488, + "learning_rate": 4.649798839352923e-05, + "loss": 1.9882, + "step": 7143 + }, + { + "epoch": 0.5377594610361505, + "grad_norm": 4.523004055023193, + "learning_rate": 4.648582754536372e-05, + "loss": 1.6166, + "step": 7144 + }, + { + "epoch": 0.5378347353168106, + "grad_norm": 4.488810062408447, + "learning_rate": 4.647366690610564e-05, + "loss": 1.9809, + "step": 7145 + }, + { + "epoch": 0.5379100095974708, + "grad_norm": 4.9414215087890625, + "learning_rate": 4.6461506476477966e-05, + "loss": 1.3765, + "step": 7146 + }, + { + "epoch": 0.5379852838781309, + "grad_norm": 5.432328224182129, + "learning_rate": 4.6449346257203555e-05, + "loss": 2.0161, + "step": 7147 + }, + { + "epoch": 0.5380605581587911, + "grad_norm": 4.182375431060791, + "learning_rate": 4.643718624900534e-05, + "loss": 1.8227, + "step": 7148 + }, + { + "epoch": 0.5381358324394513, + "grad_norm": 5.107208728790283, + "learning_rate": 4.642502645260617e-05, + "loss": 1.6544, + "step": 7149 + }, + { + "epoch": 0.5382111067201114, + "grad_norm": 4.173213958740234, + "learning_rate": 4.6412866868728935e-05, + "loss": 2.0312, + "step": 7150 + }, + { + "epoch": 0.5382863810007715, + "grad_norm": 5.704531669616699, + "learning_rate": 4.6400707498096454e-05, + "loss": 1.8912, + "step": 7151 + }, + { + "epoch": 0.5383616552814318, + "grad_norm": 13.378335952758789, + "learning_rate": 4.6388548341431604e-05, + "loss": 1.78, + "step": 7152 + }, + { + "epoch": 0.5384369295620919, + "grad_norm": 6.626962184906006, + "learning_rate": 4.6376389399457176e-05, + "loss": 1.9232, + "step": 7153 + }, + { + "epoch": 0.538512203842752, + "grad_norm": 5.496099948883057, + "learning_rate": 4.6364230672896e-05, + "loss": 1.6237, + "step": 7154 + }, + { + "epoch": 0.5385874781234122, + "grad_norm": 5.707422256469727, + "learning_rate": 4.6352072162470883e-05, + "loss": 1.9289, + "step": 7155 + }, + { + "epoch": 0.5386627524040724, + "grad_norm": 7.062283992767334, + "learning_rate": 4.63399138689046e-05, + "loss": 1.7421, + "step": 7156 + }, + { + "epoch": 0.5387380266847325, + "grad_norm": 6.010374069213867, + "learning_rate": 4.632775579291996e-05, + "loss": 2.2133, + "step": 7157 + }, + { + "epoch": 0.5388133009653926, + "grad_norm": 4.583510398864746, + "learning_rate": 4.631559793523968e-05, + "loss": 1.8985, + "step": 7158 + }, + { + "epoch": 0.5388885752460528, + "grad_norm": 6.212681770324707, + "learning_rate": 4.630344029658654e-05, + "loss": 1.7679, + "step": 7159 + }, + { + "epoch": 0.538963849526713, + "grad_norm": 3.843825101852417, + "learning_rate": 4.6291282877683254e-05, + "loss": 1.963, + "step": 7160 + }, + { + "epoch": 0.5390391238073731, + "grad_norm": 4.492417335510254, + "learning_rate": 4.627912567925259e-05, + "loss": 1.8701, + "step": 7161 + }, + { + "epoch": 0.5391143980880333, + "grad_norm": 4.7527289390563965, + "learning_rate": 4.6266968702017195e-05, + "loss": 2.2984, + "step": 7162 + }, + { + "epoch": 0.5391896723686934, + "grad_norm": 6.014012813568115, + "learning_rate": 4.6254811946699824e-05, + "loss": 1.9207, + "step": 7163 + }, + { + "epoch": 0.5392649466493535, + "grad_norm": 3.7802605628967285, + "learning_rate": 4.6242655414023125e-05, + "loss": 1.8224, + "step": 7164 + }, + { + "epoch": 0.5393402209300138, + "grad_norm": 4.799820899963379, + "learning_rate": 4.623049910470978e-05, + "loss": 1.911, + "step": 7165 + }, + { + "epoch": 0.5394154952106739, + "grad_norm": 5.492232322692871, + "learning_rate": 4.621834301948244e-05, + "loss": 1.8215, + "step": 7166 + }, + { + "epoch": 0.539490769491334, + "grad_norm": 3.970005989074707, + "learning_rate": 4.620618715906378e-05, + "loss": 1.7968, + "step": 7167 + }, + { + "epoch": 0.5395660437719942, + "grad_norm": 3.3712337017059326, + "learning_rate": 4.619403152417638e-05, + "loss": 1.7028, + "step": 7168 + }, + { + "epoch": 0.5396413180526544, + "grad_norm": 4.6120829582214355, + "learning_rate": 4.6181876115542904e-05, + "loss": 1.6412, + "step": 7169 + }, + { + "epoch": 0.5397165923333145, + "grad_norm": 4.709830284118652, + "learning_rate": 4.6169720933885924e-05, + "loss": 1.6691, + "step": 7170 + }, + { + "epoch": 0.5397918666139747, + "grad_norm": 5.849932670593262, + "learning_rate": 4.6157565979928044e-05, + "loss": 1.6677, + "step": 7171 + }, + { + "epoch": 0.5398671408946348, + "grad_norm": 4.436333656311035, + "learning_rate": 4.614541125439186e-05, + "loss": 2.0023, + "step": 7172 + }, + { + "epoch": 0.539942415175295, + "grad_norm": 6.389732360839844, + "learning_rate": 4.61332567579999e-05, + "loss": 1.8701, + "step": 7173 + }, + { + "epoch": 0.5400176894559552, + "grad_norm": 4.789639949798584, + "learning_rate": 4.6121102491474754e-05, + "loss": 1.8854, + "step": 7174 + }, + { + "epoch": 0.5400929637366153, + "grad_norm": 5.201119422912598, + "learning_rate": 4.6108948455538906e-05, + "loss": 1.6388, + "step": 7175 + }, + { + "epoch": 0.5401682380172754, + "grad_norm": 4.099886894226074, + "learning_rate": 4.609679465091493e-05, + "loss": 2.0143, + "step": 7176 + }, + { + "epoch": 0.5402435122979357, + "grad_norm": 6.517070293426514, + "learning_rate": 4.608464107832529e-05, + "loss": 1.8052, + "step": 7177 + }, + { + "epoch": 0.5403187865785958, + "grad_norm": 4.246268272399902, + "learning_rate": 4.607248773849253e-05, + "loss": 1.931, + "step": 7178 + }, + { + "epoch": 0.5403940608592559, + "grad_norm": 5.61403751373291, + "learning_rate": 4.606033463213908e-05, + "loss": 1.9052, + "step": 7179 + }, + { + "epoch": 0.540469335139916, + "grad_norm": 4.424510478973389, + "learning_rate": 4.6048181759987456e-05, + "loss": 1.8366, + "step": 7180 + }, + { + "epoch": 0.5405446094205762, + "grad_norm": 6.226020812988281, + "learning_rate": 4.603602912276007e-05, + "loss": 1.7517, + "step": 7181 + }, + { + "epoch": 0.5406198837012364, + "grad_norm": 5.004098892211914, + "learning_rate": 4.602387672117938e-05, + "loss": 1.97, + "step": 7182 + }, + { + "epoch": 0.5406951579818965, + "grad_norm": 4.143091678619385, + "learning_rate": 4.6011724555967806e-05, + "loss": 1.5954, + "step": 7183 + }, + { + "epoch": 0.5407704322625567, + "grad_norm": 5.554112434387207, + "learning_rate": 4.599957262784778e-05, + "loss": 1.9317, + "step": 7184 + }, + { + "epoch": 0.5408457065432168, + "grad_norm": 6.292996883392334, + "learning_rate": 4.5987420937541664e-05, + "loss": 1.927, + "step": 7185 + }, + { + "epoch": 0.540920980823877, + "grad_norm": 4.358442306518555, + "learning_rate": 4.597526948577188e-05, + "loss": 1.7427, + "step": 7186 + }, + { + "epoch": 0.5409962551045372, + "grad_norm": 4.794389724731445, + "learning_rate": 4.596311827326075e-05, + "loss": 1.9363, + "step": 7187 + }, + { + "epoch": 0.5410715293851973, + "grad_norm": 5.894508361816406, + "learning_rate": 4.595096730073066e-05, + "loss": 1.7981, + "step": 7188 + }, + { + "epoch": 0.5411468036658574, + "grad_norm": 4.176530361175537, + "learning_rate": 4.593881656890397e-05, + "loss": 2.0038, + "step": 7189 + }, + { + "epoch": 0.5412220779465177, + "grad_norm": 4.829484939575195, + "learning_rate": 4.592666607850297e-05, + "loss": 1.8977, + "step": 7190 + }, + { + "epoch": 0.5412973522271778, + "grad_norm": 4.035079002380371, + "learning_rate": 4.5914515830249996e-05, + "loss": 1.6441, + "step": 7191 + }, + { + "epoch": 0.5413726265078379, + "grad_norm": 4.085139751434326, + "learning_rate": 4.590236582486731e-05, + "loss": 1.8679, + "step": 7192 + }, + { + "epoch": 0.5414479007884981, + "grad_norm": 5.611950874328613, + "learning_rate": 4.589021606307723e-05, + "loss": 1.6502, + "step": 7193 + }, + { + "epoch": 0.5415231750691583, + "grad_norm": 4.763374328613281, + "learning_rate": 4.5878066545602006e-05, + "loss": 1.4502, + "step": 7194 + }, + { + "epoch": 0.5415984493498184, + "grad_norm": 6.742323398590088, + "learning_rate": 4.5865917273163915e-05, + "loss": 2.183, + "step": 7195 + }, + { + "epoch": 0.5416737236304786, + "grad_norm": 5.428110599517822, + "learning_rate": 4.585376824648516e-05, + "loss": 1.5844, + "step": 7196 + }, + { + "epoch": 0.5417489979111387, + "grad_norm": 4.162417888641357, + "learning_rate": 4.5841619466288e-05, + "loss": 1.7878, + "step": 7197 + }, + { + "epoch": 0.5418242721917989, + "grad_norm": 4.169439792633057, + "learning_rate": 4.582947093329462e-05, + "loss": 1.8564, + "step": 7198 + }, + { + "epoch": 0.541899546472459, + "grad_norm": 5.597404956817627, + "learning_rate": 4.581732264822724e-05, + "loss": 1.861, + "step": 7199 + }, + { + "epoch": 0.5419748207531192, + "grad_norm": 4.662982940673828, + "learning_rate": 4.580517461180801e-05, + "loss": 1.8278, + "step": 7200 + }, + { + "epoch": 0.5420500950337793, + "grad_norm": 5.7214765548706055, + "learning_rate": 4.5793026824759136e-05, + "loss": 2.1227, + "step": 7201 + }, + { + "epoch": 0.5421253693144394, + "grad_norm": 5.172969341278076, + "learning_rate": 4.578087928780273e-05, + "loss": 1.6182, + "step": 7202 + }, + { + "epoch": 0.5422006435950997, + "grad_norm": 4.198230743408203, + "learning_rate": 4.576873200166094e-05, + "loss": 1.9745, + "step": 7203 + }, + { + "epoch": 0.5422759178757598, + "grad_norm": 6.017120838165283, + "learning_rate": 4.57565849670559e-05, + "loss": 1.9459, + "step": 7204 + }, + { + "epoch": 0.5423511921564199, + "grad_norm": 5.469533443450928, + "learning_rate": 4.5744438184709696e-05, + "loss": 1.9369, + "step": 7205 + }, + { + "epoch": 0.5424264664370801, + "grad_norm": 4.9822540283203125, + "learning_rate": 4.5732291655344456e-05, + "loss": 1.6067, + "step": 7206 + }, + { + "epoch": 0.5425017407177403, + "grad_norm": 5.304908275604248, + "learning_rate": 4.572014537968221e-05, + "loss": 1.5948, + "step": 7207 + }, + { + "epoch": 0.5425770149984004, + "grad_norm": 7.2243781089782715, + "learning_rate": 4.5707999358445066e-05, + "loss": 1.713, + "step": 7208 + }, + { + "epoch": 0.5426522892790606, + "grad_norm": 4.796379566192627, + "learning_rate": 4.569585359235502e-05, + "loss": 1.8851, + "step": 7209 + }, + { + "epoch": 0.5427275635597207, + "grad_norm": 7.041388988494873, + "learning_rate": 4.5683708082134135e-05, + "loss": 1.7858, + "step": 7210 + }, + { + "epoch": 0.5428028378403809, + "grad_norm": 19.674196243286133, + "learning_rate": 4.567156282850441e-05, + "loss": 1.7125, + "step": 7211 + }, + { + "epoch": 0.5428781121210411, + "grad_norm": 3.8531394004821777, + "learning_rate": 4.565941783218788e-05, + "loss": 1.9153, + "step": 7212 + }, + { + "epoch": 0.5429533864017012, + "grad_norm": 4.803952217102051, + "learning_rate": 4.564727309390648e-05, + "loss": 1.8591, + "step": 7213 + }, + { + "epoch": 0.5430286606823613, + "grad_norm": 5.400869369506836, + "learning_rate": 4.563512861438222e-05, + "loss": 1.6223, + "step": 7214 + }, + { + "epoch": 0.5431039349630216, + "grad_norm": 4.910324573516846, + "learning_rate": 4.562298439433703e-05, + "loss": 1.4528, + "step": 7215 + }, + { + "epoch": 0.5431792092436817, + "grad_norm": 4.223775863647461, + "learning_rate": 4.561084043449287e-05, + "loss": 1.6992, + "step": 7216 + }, + { + "epoch": 0.5432544835243418, + "grad_norm": 3.9501447677612305, + "learning_rate": 4.559869673557164e-05, + "loss": 1.7347, + "step": 7217 + }, + { + "epoch": 0.5433297578050019, + "grad_norm": 4.67562198638916, + "learning_rate": 4.558655329829529e-05, + "loss": 1.9098, + "step": 7218 + }, + { + "epoch": 0.5434050320856622, + "grad_norm": 5.404338359832764, + "learning_rate": 4.5574410123385645e-05, + "loss": 1.8064, + "step": 7219 + }, + { + "epoch": 0.5434803063663223, + "grad_norm": 6.5876874923706055, + "learning_rate": 4.556226721156463e-05, + "loss": 2.3587, + "step": 7220 + }, + { + "epoch": 0.5435555806469824, + "grad_norm": 5.806699275970459, + "learning_rate": 4.55501245635541e-05, + "loss": 1.8675, + "step": 7221 + }, + { + "epoch": 0.5436308549276426, + "grad_norm": 7.903731822967529, + "learning_rate": 4.553798218007589e-05, + "loss": 1.9947, + "step": 7222 + }, + { + "epoch": 0.5437061292083027, + "grad_norm": 4.872975826263428, + "learning_rate": 4.552584006185186e-05, + "loss": 1.8651, + "step": 7223 + }, + { + "epoch": 0.5437814034889629, + "grad_norm": 4.901376724243164, + "learning_rate": 4.551369820960377e-05, + "loss": 1.6136, + "step": 7224 + }, + { + "epoch": 0.5438566777696231, + "grad_norm": 4.038053035736084, + "learning_rate": 4.550155662405347e-05, + "loss": 1.8519, + "step": 7225 + }, + { + "epoch": 0.5439319520502832, + "grad_norm": 4.550975322723389, + "learning_rate": 4.548941530592272e-05, + "loss": 2.1055, + "step": 7226 + }, + { + "epoch": 0.5440072263309433, + "grad_norm": 4.840604305267334, + "learning_rate": 4.5477274255933276e-05, + "loss": 1.742, + "step": 7227 + }, + { + "epoch": 0.5440825006116036, + "grad_norm": 5.355466842651367, + "learning_rate": 4.5465133474806903e-05, + "loss": 2.0793, + "step": 7228 + }, + { + "epoch": 0.5441577748922637, + "grad_norm": 5.605018138885498, + "learning_rate": 4.545299296326535e-05, + "loss": 1.8814, + "step": 7229 + }, + { + "epoch": 0.5442330491729238, + "grad_norm": 6.136038303375244, + "learning_rate": 4.5440852722030294e-05, + "loss": 1.8699, + "step": 7230 + }, + { + "epoch": 0.544308323453584, + "grad_norm": 6.588369369506836, + "learning_rate": 4.5428712751823476e-05, + "loss": 2.2187, + "step": 7231 + }, + { + "epoch": 0.5443835977342442, + "grad_norm": 4.299859523773193, + "learning_rate": 4.541657305336656e-05, + "loss": 1.9892, + "step": 7232 + }, + { + "epoch": 0.5444588720149043, + "grad_norm": 3.8395161628723145, + "learning_rate": 4.5404433627381236e-05, + "loss": 1.7461, + "step": 7233 + }, + { + "epoch": 0.5445341462955645, + "grad_norm": 4.47541618347168, + "learning_rate": 4.539229447458914e-05, + "loss": 1.716, + "step": 7234 + }, + { + "epoch": 0.5446094205762246, + "grad_norm": 5.200368404388428, + "learning_rate": 4.53801555957119e-05, + "loss": 1.947, + "step": 7235 + }, + { + "epoch": 0.5446846948568848, + "grad_norm": 4.562351703643799, + "learning_rate": 4.536801699147119e-05, + "loss": 1.9171, + "step": 7236 + }, + { + "epoch": 0.5447599691375449, + "grad_norm": 8.490396499633789, + "learning_rate": 4.535587866258855e-05, + "loss": 1.766, + "step": 7237 + }, + { + "epoch": 0.5448352434182051, + "grad_norm": 4.597094535827637, + "learning_rate": 4.534374060978561e-05, + "loss": 1.5614, + "step": 7238 + }, + { + "epoch": 0.5449105176988652, + "grad_norm": 4.932806491851807, + "learning_rate": 4.533160283378392e-05, + "loss": 2.2603, + "step": 7239 + }, + { + "epoch": 0.5449857919795253, + "grad_norm": 4.407392501831055, + "learning_rate": 4.531946533530507e-05, + "loss": 1.7965, + "step": 7240 + }, + { + "epoch": 0.5450610662601856, + "grad_norm": 3.764080762863159, + "learning_rate": 4.530732811507055e-05, + "loss": 1.6974, + "step": 7241 + }, + { + "epoch": 0.5451363405408457, + "grad_norm": 4.483845233917236, + "learning_rate": 4.529519117380192e-05, + "loss": 1.759, + "step": 7242 + }, + { + "epoch": 0.5452116148215058, + "grad_norm": 5.474569797515869, + "learning_rate": 4.5283054512220665e-05, + "loss": 1.9536, + "step": 7243 + }, + { + "epoch": 0.545286889102166, + "grad_norm": 6.2152252197265625, + "learning_rate": 4.527091813104831e-05, + "loss": 1.8565, + "step": 7244 + }, + { + "epoch": 0.5453621633828262, + "grad_norm": 5.316666603088379, + "learning_rate": 4.525878203100628e-05, + "loss": 1.8908, + "step": 7245 + }, + { + "epoch": 0.5454374376634863, + "grad_norm": 5.8045878410339355, + "learning_rate": 4.524664621281608e-05, + "loss": 1.8136, + "step": 7246 + }, + { + "epoch": 0.5455127119441465, + "grad_norm": 4.972594738006592, + "learning_rate": 4.52345106771991e-05, + "loss": 1.7589, + "step": 7247 + }, + { + "epoch": 0.5455879862248066, + "grad_norm": 5.450202465057373, + "learning_rate": 4.522237542487679e-05, + "loss": 1.5791, + "step": 7248 + }, + { + "epoch": 0.5456632605054668, + "grad_norm": 3.8361589908599854, + "learning_rate": 4.5210240456570544e-05, + "loss": 1.869, + "step": 7249 + }, + { + "epoch": 0.545738534786127, + "grad_norm": 6.235348701477051, + "learning_rate": 4.5198105773001784e-05, + "loss": 1.5913, + "step": 7250 + }, + { + "epoch": 0.5458138090667871, + "grad_norm": 5.294140338897705, + "learning_rate": 4.5185971374891834e-05, + "loss": 1.8294, + "step": 7251 + }, + { + "epoch": 0.5458890833474472, + "grad_norm": 5.349215030670166, + "learning_rate": 4.5173837262962075e-05, + "loss": 1.8865, + "step": 7252 + }, + { + "epoch": 0.5459643576281075, + "grad_norm": 4.613503932952881, + "learning_rate": 4.516170343793386e-05, + "loss": 1.5004, + "step": 7253 + }, + { + "epoch": 0.5460396319087676, + "grad_norm": 5.381326675415039, + "learning_rate": 4.514956990052848e-05, + "loss": 1.8151, + "step": 7254 + }, + { + "epoch": 0.5461149061894277, + "grad_norm": 4.5260090827941895, + "learning_rate": 4.513743665146726e-05, + "loss": 1.7312, + "step": 7255 + }, + { + "epoch": 0.5461901804700879, + "grad_norm": 5.9940338134765625, + "learning_rate": 4.512530369147146e-05, + "loss": 2.001, + "step": 7256 + }, + { + "epoch": 0.5462654547507481, + "grad_norm": 4.149104118347168, + "learning_rate": 4.51131710212624e-05, + "loss": 1.9411, + "step": 7257 + }, + { + "epoch": 0.5463407290314082, + "grad_norm": 4.28159761428833, + "learning_rate": 4.510103864156127e-05, + "loss": 1.9475, + "step": 7258 + }, + { + "epoch": 0.5464160033120683, + "grad_norm": 5.195186138153076, + "learning_rate": 4.508890655308936e-05, + "loss": 1.634, + "step": 7259 + }, + { + "epoch": 0.5464912775927285, + "grad_norm": 3.783459424972534, + "learning_rate": 4.5076774756567834e-05, + "loss": 1.6379, + "step": 7260 + }, + { + "epoch": 0.5465665518733887, + "grad_norm": 4.282258033752441, + "learning_rate": 4.506464325271796e-05, + "loss": 1.5783, + "step": 7261 + }, + { + "epoch": 0.5466418261540488, + "grad_norm": 6.5915350914001465, + "learning_rate": 4.5052512042260854e-05, + "loss": 1.6986, + "step": 7262 + }, + { + "epoch": 0.546717100434709, + "grad_norm": 5.597428321838379, + "learning_rate": 4.5040381125917734e-05, + "loss": 1.6255, + "step": 7263 + }, + { + "epoch": 0.5467923747153691, + "grad_norm": 4.7702741622924805, + "learning_rate": 4.5028250504409706e-05, + "loss": 1.588, + "step": 7264 + }, + { + "epoch": 0.5468676489960292, + "grad_norm": 5.140831470489502, + "learning_rate": 4.5016120178457935e-05, + "loss": 1.8663, + "step": 7265 + }, + { + "epoch": 0.5469429232766895, + "grad_norm": 5.149367809295654, + "learning_rate": 4.50039901487835e-05, + "loss": 1.9889, + "step": 7266 + }, + { + "epoch": 0.5470181975573496, + "grad_norm": 4.4787139892578125, + "learning_rate": 4.499186041610752e-05, + "loss": 1.7752, + "step": 7267 + }, + { + "epoch": 0.5470934718380097, + "grad_norm": 3.79844069480896, + "learning_rate": 4.497973098115109e-05, + "loss": 1.9258, + "step": 7268 + }, + { + "epoch": 0.54716874611867, + "grad_norm": 4.899024486541748, + "learning_rate": 4.496760184463522e-05, + "loss": 1.8554, + "step": 7269 + }, + { + "epoch": 0.5472440203993301, + "grad_norm": 4.692483425140381, + "learning_rate": 4.4955473007281e-05, + "loss": 1.6704, + "step": 7270 + }, + { + "epoch": 0.5473192946799902, + "grad_norm": 6.104963779449463, + "learning_rate": 4.494334446980942e-05, + "loss": 2.1492, + "step": 7271 + }, + { + "epoch": 0.5473945689606504, + "grad_norm": 4.283858299255371, + "learning_rate": 4.4931216232941526e-05, + "loss": 1.4139, + "step": 7272 + }, + { + "epoch": 0.5474698432413105, + "grad_norm": 4.8414812088012695, + "learning_rate": 4.491908829739826e-05, + "loss": 2.1027, + "step": 7273 + }, + { + "epoch": 0.5475451175219707, + "grad_norm": 5.531759262084961, + "learning_rate": 4.4906960663900636e-05, + "loss": 1.8018, + "step": 7274 + }, + { + "epoch": 0.5476203918026309, + "grad_norm": 4.972339630126953, + "learning_rate": 4.4894833333169574e-05, + "loss": 1.5577, + "step": 7275 + }, + { + "epoch": 0.547695666083291, + "grad_norm": 5.616275310516357, + "learning_rate": 4.4882706305926026e-05, + "loss": 1.7957, + "step": 7276 + }, + { + "epoch": 0.5477709403639511, + "grad_norm": 5.516354084014893, + "learning_rate": 4.487057958289089e-05, + "loss": 2.5014, + "step": 7277 + }, + { + "epoch": 0.5478462146446113, + "grad_norm": 6.476902008056641, + "learning_rate": 4.4858453164785105e-05, + "loss": 2.162, + "step": 7278 + }, + { + "epoch": 0.5479214889252715, + "grad_norm": 5.6654744148254395, + "learning_rate": 4.4846327052329504e-05, + "loss": 2.1678, + "step": 7279 + }, + { + "epoch": 0.5479967632059316, + "grad_norm": 4.00792121887207, + "learning_rate": 4.4834201246245e-05, + "loss": 1.9374, + "step": 7280 + }, + { + "epoch": 0.5480720374865917, + "grad_norm": 5.483493328094482, + "learning_rate": 4.482207574725238e-05, + "loss": 2.2121, + "step": 7281 + }, + { + "epoch": 0.548147311767252, + "grad_norm": 5.4148454666137695, + "learning_rate": 4.4809950556072525e-05, + "loss": 2.1685, + "step": 7282 + }, + { + "epoch": 0.5482225860479121, + "grad_norm": 6.117647647857666, + "learning_rate": 4.4797825673426194e-05, + "loss": 2.022, + "step": 7283 + }, + { + "epoch": 0.5482978603285722, + "grad_norm": 4.982205867767334, + "learning_rate": 4.47857011000342e-05, + "loss": 1.7208, + "step": 7284 + }, + { + "epoch": 0.5483731346092324, + "grad_norm": 3.6697936058044434, + "learning_rate": 4.477357683661734e-05, + "loss": 1.5918, + "step": 7285 + }, + { + "epoch": 0.5484484088898925, + "grad_norm": 4.153434753417969, + "learning_rate": 4.476145288389631e-05, + "loss": 1.8734, + "step": 7286 + }, + { + "epoch": 0.5485236831705527, + "grad_norm": 5.237375259399414, + "learning_rate": 4.47493292425919e-05, + "loss": 2.1259, + "step": 7287 + }, + { + "epoch": 0.5485989574512129, + "grad_norm": 5.294926166534424, + "learning_rate": 4.473720591342478e-05, + "loss": 1.7614, + "step": 7288 + }, + { + "epoch": 0.548674231731873, + "grad_norm": 4.6781158447265625, + "learning_rate": 4.472508289711569e-05, + "loss": 1.9955, + "step": 7289 + }, + { + "epoch": 0.5487495060125331, + "grad_norm": 4.195505142211914, + "learning_rate": 4.471296019438527e-05, + "loss": 1.8132, + "step": 7290 + }, + { + "epoch": 0.5488247802931934, + "grad_norm": 6.255605220794678, + "learning_rate": 4.470083780595421e-05, + "loss": 1.6445, + "step": 7291 + }, + { + "epoch": 0.5489000545738535, + "grad_norm": 4.516093730926514, + "learning_rate": 4.468871573254312e-05, + "loss": 1.9043, + "step": 7292 + }, + { + "epoch": 0.5489753288545136, + "grad_norm": 4.334864139556885, + "learning_rate": 4.467659397487265e-05, + "loss": 1.6518, + "step": 7293 + }, + { + "epoch": 0.5490506031351738, + "grad_norm": 3.3741214275360107, + "learning_rate": 4.466447253366338e-05, + "loss": 1.8086, + "step": 7294 + }, + { + "epoch": 0.549125877415834, + "grad_norm": 3.6362617015838623, + "learning_rate": 4.465235140963594e-05, + "loss": 1.6579, + "step": 7295 + }, + { + "epoch": 0.5492011516964941, + "grad_norm": 3.998081684112549, + "learning_rate": 4.4640230603510835e-05, + "loss": 1.706, + "step": 7296 + }, + { + "epoch": 0.5492764259771542, + "grad_norm": 4.017053604125977, + "learning_rate": 4.462811011600865e-05, + "loss": 1.599, + "step": 7297 + }, + { + "epoch": 0.5493517002578144, + "grad_norm": 4.432136535644531, + "learning_rate": 4.46159899478499e-05, + "loss": 1.6721, + "step": 7298 + }, + { + "epoch": 0.5494269745384746, + "grad_norm": 5.504012584686279, + "learning_rate": 4.4603870099755104e-05, + "loss": 1.6547, + "step": 7299 + }, + { + "epoch": 0.5495022488191347, + "grad_norm": 4.690577030181885, + "learning_rate": 4.459175057244476e-05, + "loss": 1.8855, + "step": 7300 + }, + { + "epoch": 0.5495775230997949, + "grad_norm": 4.181679725646973, + "learning_rate": 4.457963136663931e-05, + "loss": 1.8668, + "step": 7301 + }, + { + "epoch": 0.549652797380455, + "grad_norm": 4.668452262878418, + "learning_rate": 4.456751248305924e-05, + "loss": 1.5617, + "step": 7302 + }, + { + "epoch": 0.5497280716611151, + "grad_norm": 6.058993339538574, + "learning_rate": 4.455539392242494e-05, + "loss": 1.6426, + "step": 7303 + }, + { + "epoch": 0.5498033459417754, + "grad_norm": 5.4947943687438965, + "learning_rate": 4.454327568545687e-05, + "loss": 1.7664, + "step": 7304 + }, + { + "epoch": 0.5498786202224355, + "grad_norm": 5.9989728927612305, + "learning_rate": 4.453115777287539e-05, + "loss": 2.1876, + "step": 7305 + }, + { + "epoch": 0.5499538945030956, + "grad_norm": 5.636870861053467, + "learning_rate": 4.4519040185400904e-05, + "loss": 1.6717, + "step": 7306 + }, + { + "epoch": 0.5500291687837559, + "grad_norm": 5.44373083114624, + "learning_rate": 4.4506922923753735e-05, + "loss": 1.6202, + "step": 7307 + }, + { + "epoch": 0.550104443064416, + "grad_norm": 4.024693965911865, + "learning_rate": 4.449480598865426e-05, + "loss": 1.8654, + "step": 7308 + }, + { + "epoch": 0.5501797173450761, + "grad_norm": 4.129490852355957, + "learning_rate": 4.4482689380822744e-05, + "loss": 1.7647, + "step": 7309 + }, + { + "epoch": 0.5502549916257363, + "grad_norm": 5.428489685058594, + "learning_rate": 4.447057310097953e-05, + "loss": 2.1559, + "step": 7310 + }, + { + "epoch": 0.5503302659063964, + "grad_norm": 5.7311296463012695, + "learning_rate": 4.445845714984487e-05, + "loss": 1.656, + "step": 7311 + }, + { + "epoch": 0.5504055401870566, + "grad_norm": 5.774123191833496, + "learning_rate": 4.444634152813905e-05, + "loss": 2.2453, + "step": 7312 + }, + { + "epoch": 0.5504808144677168, + "grad_norm": 5.238942623138428, + "learning_rate": 4.443422623658227e-05, + "loss": 1.8695, + "step": 7313 + }, + { + "epoch": 0.5505560887483769, + "grad_norm": 4.722099304199219, + "learning_rate": 4.4422111275894785e-05, + "loss": 1.7761, + "step": 7314 + }, + { + "epoch": 0.550631363029037, + "grad_norm": 4.443918228149414, + "learning_rate": 4.440999664679676e-05, + "loss": 1.7145, + "step": 7315 + }, + { + "epoch": 0.5507066373096972, + "grad_norm": 4.346336841583252, + "learning_rate": 4.439788235000841e-05, + "loss": 1.7461, + "step": 7316 + }, + { + "epoch": 0.5507819115903574, + "grad_norm": 4.5712432861328125, + "learning_rate": 4.438576838624989e-05, + "loss": 1.9465, + "step": 7317 + }, + { + "epoch": 0.5508571858710175, + "grad_norm": 5.216814994812012, + "learning_rate": 4.4373654756241314e-05, + "loss": 1.6134, + "step": 7318 + }, + { + "epoch": 0.5509324601516776, + "grad_norm": 8.374541282653809, + "learning_rate": 4.436154146070283e-05, + "loss": 1.9638, + "step": 7319 + }, + { + "epoch": 0.5510077344323379, + "grad_norm": 4.869935989379883, + "learning_rate": 4.4349428500354514e-05, + "loss": 1.3676, + "step": 7320 + }, + { + "epoch": 0.551083008712998, + "grad_norm": 5.3260345458984375, + "learning_rate": 4.4337315875916473e-05, + "loss": 1.6899, + "step": 7321 + }, + { + "epoch": 0.5511582829936581, + "grad_norm": 6.002689838409424, + "learning_rate": 4.432520358810875e-05, + "loss": 1.8503, + "step": 7322 + }, + { + "epoch": 0.5512335572743183, + "grad_norm": 4.692911148071289, + "learning_rate": 4.43130916376514e-05, + "loss": 1.7429, + "step": 7323 + }, + { + "epoch": 0.5513088315549785, + "grad_norm": 4.0181732177734375, + "learning_rate": 4.430098002526442e-05, + "loss": 1.8699, + "step": 7324 + }, + { + "epoch": 0.5513841058356386, + "grad_norm": 4.17948579788208, + "learning_rate": 4.4288868751667826e-05, + "loss": 1.9062, + "step": 7325 + }, + { + "epoch": 0.5514593801162988, + "grad_norm": 4.210275650024414, + "learning_rate": 4.427675781758161e-05, + "loss": 1.557, + "step": 7326 + }, + { + "epoch": 0.5515346543969589, + "grad_norm": 5.996506214141846, + "learning_rate": 4.426464722372571e-05, + "loss": 1.8201, + "step": 7327 + }, + { + "epoch": 0.551609928677619, + "grad_norm": 4.769120693206787, + "learning_rate": 4.425253697082007e-05, + "loss": 1.6263, + "step": 7328 + }, + { + "epoch": 0.5516852029582793, + "grad_norm": 5.9188618659973145, + "learning_rate": 4.4240427059584646e-05, + "loss": 1.8107, + "step": 7329 + }, + { + "epoch": 0.5517604772389394, + "grad_norm": 3.2366561889648438, + "learning_rate": 4.4228317490739276e-05, + "loss": 1.7488, + "step": 7330 + }, + { + "epoch": 0.5518357515195995, + "grad_norm": 4.70269775390625, + "learning_rate": 4.421620826500388e-05, + "loss": 1.562, + "step": 7331 + }, + { + "epoch": 0.5519110258002597, + "grad_norm": 4.698917865753174, + "learning_rate": 4.4204099383098294e-05, + "loss": 1.8538, + "step": 7332 + }, + { + "epoch": 0.5519863000809199, + "grad_norm": 5.996650695800781, + "learning_rate": 4.419199084574237e-05, + "loss": 1.855, + "step": 7333 + }, + { + "epoch": 0.55206157436158, + "grad_norm": 5.515308856964111, + "learning_rate": 4.417988265365596e-05, + "loss": 1.9484, + "step": 7334 + }, + { + "epoch": 0.5521368486422401, + "grad_norm": 7.892343997955322, + "learning_rate": 4.4167774807558796e-05, + "loss": 1.8699, + "step": 7335 + }, + { + "epoch": 0.5522121229229003, + "grad_norm": 4.509010314941406, + "learning_rate": 4.415566730817071e-05, + "loss": 2.0394, + "step": 7336 + }, + { + "epoch": 0.5522873972035605, + "grad_norm": 6.582646369934082, + "learning_rate": 4.414356015621141e-05, + "loss": 1.8539, + "step": 7337 + }, + { + "epoch": 0.5523626714842206, + "grad_norm": 4.764413356781006, + "learning_rate": 4.413145335240066e-05, + "loss": 1.4583, + "step": 7338 + }, + { + "epoch": 0.5524379457648808, + "grad_norm": 6.172499656677246, + "learning_rate": 4.4119346897458156e-05, + "loss": 1.8544, + "step": 7339 + }, + { + "epoch": 0.5525132200455409, + "grad_norm": 5.4019904136657715, + "learning_rate": 4.410724079210364e-05, + "loss": 2.069, + "step": 7340 + }, + { + "epoch": 0.552588494326201, + "grad_norm": 4.147262096405029, + "learning_rate": 4.4095135037056715e-05, + "loss": 1.8534, + "step": 7341 + }, + { + "epoch": 0.5526637686068613, + "grad_norm": 4.645547866821289, + "learning_rate": 4.408302963303709e-05, + "loss": 1.9137, + "step": 7342 + }, + { + "epoch": 0.5527390428875214, + "grad_norm": 3.386406660079956, + "learning_rate": 4.407092458076436e-05, + "loss": 1.6478, + "step": 7343 + }, + { + "epoch": 0.5528143171681815, + "grad_norm": 4.044482231140137, + "learning_rate": 4.405881988095818e-05, + "loss": 1.7621, + "step": 7344 + }, + { + "epoch": 0.5528895914488418, + "grad_norm": 5.414804935455322, + "learning_rate": 4.4046715534338084e-05, + "loss": 1.8401, + "step": 7345 + }, + { + "epoch": 0.5529648657295019, + "grad_norm": 6.917354106903076, + "learning_rate": 4.403461154162369e-05, + "loss": 2.8522, + "step": 7346 + }, + { + "epoch": 0.553040140010162, + "grad_norm": 4.381288051605225, + "learning_rate": 4.40225079035345e-05, + "loss": 1.8826, + "step": 7347 + }, + { + "epoch": 0.5531154142908222, + "grad_norm": 5.965206146240234, + "learning_rate": 4.4010404620790066e-05, + "loss": 1.791, + "step": 7348 + }, + { + "epoch": 0.5531906885714823, + "grad_norm": 3.955334186553955, + "learning_rate": 4.39983016941099e-05, + "loss": 1.5135, + "step": 7349 + }, + { + "epoch": 0.5532659628521425, + "grad_norm": 4.960020065307617, + "learning_rate": 4.3986199124213465e-05, + "loss": 1.9935, + "step": 7350 + }, + { + "epoch": 0.5533412371328027, + "grad_norm": 5.991844177246094, + "learning_rate": 4.397409691182026e-05, + "loss": 1.9516, + "step": 7351 + }, + { + "epoch": 0.5534165114134628, + "grad_norm": 5.176320552825928, + "learning_rate": 4.396199505764968e-05, + "loss": 2.092, + "step": 7352 + }, + { + "epoch": 0.5534917856941229, + "grad_norm": 5.308154106140137, + "learning_rate": 4.3949893562421195e-05, + "loss": 1.6836, + "step": 7353 + }, + { + "epoch": 0.5535670599747832, + "grad_norm": 4.589962482452393, + "learning_rate": 4.393779242685416e-05, + "loss": 1.9806, + "step": 7354 + }, + { + "epoch": 0.5536423342554433, + "grad_norm": 4.495156764984131, + "learning_rate": 4.392569165166798e-05, + "loss": 1.6884, + "step": 7355 + }, + { + "epoch": 0.5537176085361034, + "grad_norm": 6.182584285736084, + "learning_rate": 4.391359123758198e-05, + "loss": 1.9364, + "step": 7356 + }, + { + "epoch": 0.5537928828167635, + "grad_norm": 3.5868709087371826, + "learning_rate": 4.3901491185315544e-05, + "loss": 1.7965, + "step": 7357 + }, + { + "epoch": 0.5538681570974238, + "grad_norm": 4.117193222045898, + "learning_rate": 4.388939149558795e-05, + "loss": 1.763, + "step": 7358 + }, + { + "epoch": 0.5539434313780839, + "grad_norm": 4.263593673706055, + "learning_rate": 4.387729216911849e-05, + "loss": 1.6851, + "step": 7359 + }, + { + "epoch": 0.554018705658744, + "grad_norm": 5.544510364532471, + "learning_rate": 4.3865193206626446e-05, + "loss": 2.0027, + "step": 7360 + }, + { + "epoch": 0.5540939799394042, + "grad_norm": 7.832817077636719, + "learning_rate": 4.385309460883107e-05, + "loss": 1.8839, + "step": 7361 + }, + { + "epoch": 0.5541692542200644, + "grad_norm": 5.9470438957214355, + "learning_rate": 4.3840996376451574e-05, + "loss": 2.5062, + "step": 7362 + }, + { + "epoch": 0.5542445285007245, + "grad_norm": 4.780316352844238, + "learning_rate": 4.382889851020718e-05, + "loss": 1.8254, + "step": 7363 + }, + { + "epoch": 0.5543198027813847, + "grad_norm": 4.252252101898193, + "learning_rate": 4.381680101081706e-05, + "loss": 1.6875, + "step": 7364 + }, + { + "epoch": 0.5543950770620448, + "grad_norm": 5.962366580963135, + "learning_rate": 4.380470387900036e-05, + "loss": 1.6471, + "step": 7365 + }, + { + "epoch": 0.554470351342705, + "grad_norm": 6.075410842895508, + "learning_rate": 4.3792607115476256e-05, + "loss": 2.1169, + "step": 7366 + }, + { + "epoch": 0.5545456256233652, + "grad_norm": 5.119984149932861, + "learning_rate": 4.378051072096384e-05, + "loss": 1.6036, + "step": 7367 + }, + { + "epoch": 0.5546208999040253, + "grad_norm": 5.708016395568848, + "learning_rate": 4.376841469618224e-05, + "loss": 1.7511, + "step": 7368 + }, + { + "epoch": 0.5546961741846854, + "grad_norm": 5.144496440887451, + "learning_rate": 4.375631904185047e-05, + "loss": 1.6978, + "step": 7369 + }, + { + "epoch": 0.5547714484653457, + "grad_norm": 4.935191631317139, + "learning_rate": 4.3744223758687645e-05, + "loss": 1.9764, + "step": 7370 + }, + { + "epoch": 0.5548467227460058, + "grad_norm": 5.738244533538818, + "learning_rate": 4.373212884741275e-05, + "loss": 1.8449, + "step": 7371 + }, + { + "epoch": 0.5549219970266659, + "grad_norm": 17.346952438354492, + "learning_rate": 4.3720034308744825e-05, + "loss": 1.7223, + "step": 7372 + }, + { + "epoch": 0.5549972713073261, + "grad_norm": 5.4458818435668945, + "learning_rate": 4.3707940143402825e-05, + "loss": 2.0466, + "step": 7373 + }, + { + "epoch": 0.5550725455879862, + "grad_norm": 4.200076103210449, + "learning_rate": 4.369584635210575e-05, + "loss": 2.086, + "step": 7374 + }, + { + "epoch": 0.5551478198686464, + "grad_norm": 5.461789608001709, + "learning_rate": 4.368375293557251e-05, + "loss": 2.151, + "step": 7375 + }, + { + "epoch": 0.5552230941493065, + "grad_norm": 4.160572528839111, + "learning_rate": 4.367165989452203e-05, + "loss": 1.8176, + "step": 7376 + }, + { + "epoch": 0.5552983684299667, + "grad_norm": 4.739264965057373, + "learning_rate": 4.365956722967321e-05, + "loss": 1.9772, + "step": 7377 + }, + { + "epoch": 0.5553736427106268, + "grad_norm": 8.766257286071777, + "learning_rate": 4.3647474941744947e-05, + "loss": 1.4902, + "step": 7378 + }, + { + "epoch": 0.555448916991287, + "grad_norm": 4.3162665367126465, + "learning_rate": 4.3635383031456045e-05, + "loss": 2.2551, + "step": 7379 + }, + { + "epoch": 0.5555241912719472, + "grad_norm": 19.630680084228516, + "learning_rate": 4.3623291499525356e-05, + "loss": 2.0508, + "step": 7380 + }, + { + "epoch": 0.5555994655526073, + "grad_norm": 6.041289329528809, + "learning_rate": 4.361120034667172e-05, + "loss": 1.7392, + "step": 7381 + }, + { + "epoch": 0.5556747398332674, + "grad_norm": 5.298127174377441, + "learning_rate": 4.3599109573613875e-05, + "loss": 2.1195, + "step": 7382 + }, + { + "epoch": 0.5557500141139277, + "grad_norm": 5.084386348724365, + "learning_rate": 4.35870191810706e-05, + "loss": 2.0705, + "step": 7383 + }, + { + "epoch": 0.5558252883945878, + "grad_norm": 4.2747392654418945, + "learning_rate": 4.357492916976062e-05, + "loss": 1.7711, + "step": 7384 + }, + { + "epoch": 0.5559005626752479, + "grad_norm": 4.177096366882324, + "learning_rate": 4.35628395404027e-05, + "loss": 1.9132, + "step": 7385 + }, + { + "epoch": 0.5559758369559081, + "grad_norm": 3.7384274005889893, + "learning_rate": 4.355075029371547e-05, + "loss": 1.9329, + "step": 7386 + }, + { + "epoch": 0.5560511112365683, + "grad_norm": 7.845738887786865, + "learning_rate": 4.353866143041764e-05, + "loss": 2.0035, + "step": 7387 + }, + { + "epoch": 0.5561263855172284, + "grad_norm": 6.672183036804199, + "learning_rate": 4.352657295122784e-05, + "loss": 1.7596, + "step": 7388 + }, + { + "epoch": 0.5562016597978886, + "grad_norm": 5.343553066253662, + "learning_rate": 4.3514484856864724e-05, + "loss": 1.5301, + "step": 7389 + }, + { + "epoch": 0.5562769340785487, + "grad_norm": 5.855565547943115, + "learning_rate": 4.3502397148046855e-05, + "loss": 1.6712, + "step": 7390 + }, + { + "epoch": 0.5563522083592088, + "grad_norm": 6.186790943145752, + "learning_rate": 4.349030982549285e-05, + "loss": 1.7094, + "step": 7391 + }, + { + "epoch": 0.5564274826398691, + "grad_norm": 4.008334636688232, + "learning_rate": 4.347822288992123e-05, + "loss": 1.6008, + "step": 7392 + }, + { + "epoch": 0.5565027569205292, + "grad_norm": 6.670836448669434, + "learning_rate": 4.346613634205055e-05, + "loss": 1.7821, + "step": 7393 + }, + { + "epoch": 0.5565780312011893, + "grad_norm": 4.638400554656982, + "learning_rate": 4.3454050182599315e-05, + "loss": 1.8854, + "step": 7394 + }, + { + "epoch": 0.5566533054818494, + "grad_norm": 5.600529670715332, + "learning_rate": 4.3441964412286026e-05, + "loss": 1.6546, + "step": 7395 + }, + { + "epoch": 0.5567285797625097, + "grad_norm": 5.600529670715332, + "learning_rate": 4.3441964412286026e-05, + "loss": 1.631, + "step": 7396 + }, + { + "epoch": 0.5568038540431698, + "grad_norm": 5.415933132171631, + "learning_rate": 4.3429879031829125e-05, + "loss": 1.5816, + "step": 7397 + }, + { + "epoch": 0.5568791283238299, + "grad_norm": 5.31972074508667, + "learning_rate": 4.341779404194706e-05, + "loss": 1.683, + "step": 7398 + }, + { + "epoch": 0.5569544026044901, + "grad_norm": 6.651749610900879, + "learning_rate": 4.3405709443358256e-05, + "loss": 1.7278, + "step": 7399 + }, + { + "epoch": 0.5570296768851503, + "grad_norm": 3.8869152069091797, + "learning_rate": 4.33936252367811e-05, + "loss": 1.8289, + "step": 7400 + }, + { + "epoch": 0.5571049511658104, + "grad_norm": 4.794390678405762, + "learning_rate": 4.338154142293398e-05, + "loss": 1.7762, + "step": 7401 + }, + { + "epoch": 0.5571802254464706, + "grad_norm": 6.339361190795898, + "learning_rate": 4.336945800253522e-05, + "loss": 2.0466, + "step": 7402 + }, + { + "epoch": 0.5572554997271307, + "grad_norm": 4.306975364685059, + "learning_rate": 4.335737497630318e-05, + "loss": 1.938, + "step": 7403 + }, + { + "epoch": 0.5573307740077909, + "grad_norm": 5.262393474578857, + "learning_rate": 4.334529234495612e-05, + "loss": 1.881, + "step": 7404 + }, + { + "epoch": 0.5574060482884511, + "grad_norm": 4.171533107757568, + "learning_rate": 4.3333210109212336e-05, + "loss": 1.6922, + "step": 7405 + }, + { + "epoch": 0.5574813225691112, + "grad_norm": 4.779244899749756, + "learning_rate": 4.3321128269790074e-05, + "loss": 1.8641, + "step": 7406 + }, + { + "epoch": 0.5575565968497713, + "grad_norm": 4.942127227783203, + "learning_rate": 4.3309046827407594e-05, + "loss": 1.6782, + "step": 7407 + }, + { + "epoch": 0.5576318711304316, + "grad_norm": 6.362834453582764, + "learning_rate": 4.329696578278306e-05, + "loss": 1.8551, + "step": 7408 + }, + { + "epoch": 0.5577071454110917, + "grad_norm": 5.308552265167236, + "learning_rate": 4.3284885136634704e-05, + "loss": 1.6864, + "step": 7409 + }, + { + "epoch": 0.5577824196917518, + "grad_norm": 4.01266622543335, + "learning_rate": 4.327280488968063e-05, + "loss": 1.9965, + "step": 7410 + }, + { + "epoch": 0.557857693972412, + "grad_norm": 4.613207817077637, + "learning_rate": 4.3260725042639014e-05, + "loss": 1.5381, + "step": 7411 + }, + { + "epoch": 0.5579329682530721, + "grad_norm": 3.8639886379241943, + "learning_rate": 4.324864559622795e-05, + "loss": 1.7362, + "step": 7412 + }, + { + "epoch": 0.5580082425337323, + "grad_norm": 5.024290084838867, + "learning_rate": 4.323656655116553e-05, + "loss": 1.7859, + "step": 7413 + }, + { + "epoch": 0.5580835168143924, + "grad_norm": 8.777493476867676, + "learning_rate": 4.3224487908169844e-05, + "loss": 2.1262, + "step": 7414 + }, + { + "epoch": 0.5581587910950526, + "grad_norm": 5.892402648925781, + "learning_rate": 4.32124096679589e-05, + "loss": 2.0683, + "step": 7415 + }, + { + "epoch": 0.5582340653757127, + "grad_norm": 4.961948394775391, + "learning_rate": 4.320033183125072e-05, + "loss": 1.9883, + "step": 7416 + }, + { + "epoch": 0.5583093396563729, + "grad_norm": 6.471138000488281, + "learning_rate": 4.31882543987633e-05, + "loss": 1.8035, + "step": 7417 + }, + { + "epoch": 0.5583846139370331, + "grad_norm": 4.507258892059326, + "learning_rate": 4.3176177371214634e-05, + "loss": 1.7907, + "step": 7418 + }, + { + "epoch": 0.5584598882176932, + "grad_norm": 4.751652240753174, + "learning_rate": 4.316410074932262e-05, + "loss": 1.9768, + "step": 7419 + }, + { + "epoch": 0.5585351624983533, + "grad_norm": 5.327757835388184, + "learning_rate": 4.315202453380522e-05, + "loss": 1.7059, + "step": 7420 + }, + { + "epoch": 0.5586104367790136, + "grad_norm": 5.68921422958374, + "learning_rate": 4.3139948725380295e-05, + "loss": 1.8097, + "step": 7421 + }, + { + "epoch": 0.5586857110596737, + "grad_norm": 4.355867862701416, + "learning_rate": 4.312787332476574e-05, + "loss": 1.9244, + "step": 7422 + }, + { + "epoch": 0.5587609853403338, + "grad_norm": 5.760314464569092, + "learning_rate": 4.311579833267939e-05, + "loss": 1.7182, + "step": 7423 + }, + { + "epoch": 0.558836259620994, + "grad_norm": 5.257676124572754, + "learning_rate": 4.3103723749839096e-05, + "loss": 1.7564, + "step": 7424 + }, + { + "epoch": 0.5589115339016542, + "grad_norm": 5.390875816345215, + "learning_rate": 4.3091649576962624e-05, + "loss": 1.7063, + "step": 7425 + }, + { + "epoch": 0.5589868081823143, + "grad_norm": 6.912785530090332, + "learning_rate": 4.3079575814767766e-05, + "loss": 2.0889, + "step": 7426 + }, + { + "epoch": 0.5590620824629745, + "grad_norm": 6.88408088684082, + "learning_rate": 4.306750246397225e-05, + "loss": 1.8458, + "step": 7427 + }, + { + "epoch": 0.5591373567436346, + "grad_norm": 4.158571243286133, + "learning_rate": 4.305542952529386e-05, + "loss": 1.6018, + "step": 7428 + }, + { + "epoch": 0.5592126310242947, + "grad_norm": 4.697673320770264, + "learning_rate": 4.304335699945023e-05, + "loss": 1.6856, + "step": 7429 + }, + { + "epoch": 0.559287905304955, + "grad_norm": 4.52990198135376, + "learning_rate": 4.3031284887159065e-05, + "loss": 1.6021, + "step": 7430 + }, + { + "epoch": 0.5593631795856151, + "grad_norm": 4.265514850616455, + "learning_rate": 4.301921318913804e-05, + "loss": 1.6793, + "step": 7431 + }, + { + "epoch": 0.5594384538662752, + "grad_norm": 18.841873168945312, + "learning_rate": 4.300714190610473e-05, + "loss": 1.8684, + "step": 7432 + }, + { + "epoch": 0.5595137281469353, + "grad_norm": 4.621226787567139, + "learning_rate": 4.2995071038776794e-05, + "loss": 1.9078, + "step": 7433 + }, + { + "epoch": 0.5595890024275956, + "grad_norm": 4.262931823730469, + "learning_rate": 4.2983000587871763e-05, + "loss": 2.108, + "step": 7434 + }, + { + "epoch": 0.5596642767082557, + "grad_norm": 4.973301410675049, + "learning_rate": 4.2970930554107244e-05, + "loss": 1.8347, + "step": 7435 + }, + { + "epoch": 0.5597395509889158, + "grad_norm": 4.143371105194092, + "learning_rate": 4.2958860938200706e-05, + "loss": 1.7814, + "step": 7436 + }, + { + "epoch": 0.559814825269576, + "grad_norm": 4.87202262878418, + "learning_rate": 4.294679174086971e-05, + "loss": 1.9359, + "step": 7437 + }, + { + "epoch": 0.5598900995502362, + "grad_norm": 4.949337959289551, + "learning_rate": 4.293472296283168e-05, + "loss": 1.5853, + "step": 7438 + }, + { + "epoch": 0.5599653738308963, + "grad_norm": 5.291871547698975, + "learning_rate": 4.292265460480411e-05, + "loss": 1.6179, + "step": 7439 + }, + { + "epoch": 0.5600406481115565, + "grad_norm": 3.6330108642578125, + "learning_rate": 4.2910586667504406e-05, + "loss": 1.6022, + "step": 7440 + }, + { + "epoch": 0.5601159223922166, + "grad_norm": 3.868661642074585, + "learning_rate": 4.289851915165001e-05, + "loss": 1.4004, + "step": 7441 + }, + { + "epoch": 0.5601911966728768, + "grad_norm": 4.826730728149414, + "learning_rate": 4.288645205795824e-05, + "loss": 1.7722, + "step": 7442 + }, + { + "epoch": 0.560266470953537, + "grad_norm": 6.208520889282227, + "learning_rate": 4.28743853871465e-05, + "loss": 1.8908, + "step": 7443 + }, + { + "epoch": 0.5603417452341971, + "grad_norm": 4.353363513946533, + "learning_rate": 4.28623191399321e-05, + "loss": 1.6369, + "step": 7444 + }, + { + "epoch": 0.5604170195148572, + "grad_norm": 10.165128707885742, + "learning_rate": 4.285025331703236e-05, + "loss": 1.9703, + "step": 7445 + }, + { + "epoch": 0.5604922937955175, + "grad_norm": 5.548494338989258, + "learning_rate": 4.283818791916453e-05, + "loss": 2.2073, + "step": 7446 + }, + { + "epoch": 0.5605675680761776, + "grad_norm": 5.090147972106934, + "learning_rate": 4.2826122947045875e-05, + "loss": 1.7724, + "step": 7447 + }, + { + "epoch": 0.5606428423568377, + "grad_norm": 4.495290279388428, + "learning_rate": 4.281405840139364e-05, + "loss": 2.1729, + "step": 7448 + }, + { + "epoch": 0.5607181166374979, + "grad_norm": 3.614408493041992, + "learning_rate": 4.280199428292501e-05, + "loss": 1.6867, + "step": 7449 + }, + { + "epoch": 0.560793390918158, + "grad_norm": 3.771042823791504, + "learning_rate": 4.278993059235716e-05, + "loss": 1.5991, + "step": 7450 + }, + { + "epoch": 0.5608686651988182, + "grad_norm": 5.69525671005249, + "learning_rate": 4.277786733040725e-05, + "loss": 1.7602, + "step": 7451 + }, + { + "epoch": 0.5609439394794784, + "grad_norm": 4.18727970123291, + "learning_rate": 4.276580449779242e-05, + "loss": 1.5962, + "step": 7452 + }, + { + "epoch": 0.5610192137601385, + "grad_norm": 5.1569390296936035, + "learning_rate": 4.275374209522974e-05, + "loss": 1.691, + "step": 7453 + }, + { + "epoch": 0.5610944880407986, + "grad_norm": 3.6384122371673584, + "learning_rate": 4.27416801234363e-05, + "loss": 1.8882, + "step": 7454 + }, + { + "epoch": 0.5611697623214588, + "grad_norm": 6.092841148376465, + "learning_rate": 4.272961858312916e-05, + "loss": 2.1804, + "step": 7455 + }, + { + "epoch": 0.561245036602119, + "grad_norm": 4.804257392883301, + "learning_rate": 4.2717557475025335e-05, + "loss": 1.7789, + "step": 7456 + }, + { + "epoch": 0.5613203108827791, + "grad_norm": 7.01716423034668, + "learning_rate": 4.27054967998418e-05, + "loss": 1.5826, + "step": 7457 + }, + { + "epoch": 0.5613955851634392, + "grad_norm": 5.15637731552124, + "learning_rate": 4.269343655829558e-05, + "loss": 2.0327, + "step": 7458 + }, + { + "epoch": 0.5614708594440995, + "grad_norm": 3.420675277709961, + "learning_rate": 4.2681376751103575e-05, + "loss": 1.6229, + "step": 7459 + }, + { + "epoch": 0.5615461337247596, + "grad_norm": 6.5957159996032715, + "learning_rate": 4.266931737898273e-05, + "loss": 1.7094, + "step": 7460 + }, + { + "epoch": 0.5616214080054197, + "grad_norm": 4.626439094543457, + "learning_rate": 4.265725844264992e-05, + "loss": 1.9657, + "step": 7461 + }, + { + "epoch": 0.5616966822860799, + "grad_norm": 4.969316005706787, + "learning_rate": 4.264519994282202e-05, + "loss": 1.799, + "step": 7462 + }, + { + "epoch": 0.5617719565667401, + "grad_norm": 4.947000980377197, + "learning_rate": 4.263314188021591e-05, + "loss": 2.0143, + "step": 7463 + }, + { + "epoch": 0.5618472308474002, + "grad_norm": 5.604894638061523, + "learning_rate": 4.2621084255548363e-05, + "loss": 2.0187, + "step": 7464 + }, + { + "epoch": 0.5619225051280604, + "grad_norm": 5.0242486000061035, + "learning_rate": 4.2609027069536186e-05, + "loss": 1.6759, + "step": 7465 + }, + { + "epoch": 0.5619977794087205, + "grad_norm": 4.64635705947876, + "learning_rate": 4.259697032289613e-05, + "loss": 1.475, + "step": 7466 + }, + { + "epoch": 0.5620730536893807, + "grad_norm": 7.2053542137146, + "learning_rate": 4.258491401634497e-05, + "loss": 2.2857, + "step": 7467 + }, + { + "epoch": 0.5621483279700409, + "grad_norm": 4.491894721984863, + "learning_rate": 4.2572858150599374e-05, + "loss": 1.5445, + "step": 7468 + }, + { + "epoch": 0.562223602250701, + "grad_norm": 7.9561448097229, + "learning_rate": 4.256080272637607e-05, + "loss": 1.5742, + "step": 7469 + }, + { + "epoch": 0.5622988765313611, + "grad_norm": 4.057933330535889, + "learning_rate": 4.254874774439168e-05, + "loss": 1.7919, + "step": 7470 + }, + { + "epoch": 0.5623741508120214, + "grad_norm": 5.072565078735352, + "learning_rate": 4.253669320536286e-05, + "loss": 2.0377, + "step": 7471 + }, + { + "epoch": 0.5624494250926815, + "grad_norm": 4.860074043273926, + "learning_rate": 4.252463911000621e-05, + "loss": 1.7523, + "step": 7472 + }, + { + "epoch": 0.5625246993733416, + "grad_norm": 6.180167198181152, + "learning_rate": 4.251258545903835e-05, + "loss": 1.9421, + "step": 7473 + }, + { + "epoch": 0.5625999736540017, + "grad_norm": 5.108168601989746, + "learning_rate": 4.250053225317576e-05, + "loss": 1.9517, + "step": 7474 + }, + { + "epoch": 0.562675247934662, + "grad_norm": 3.6376171112060547, + "learning_rate": 4.2488479493135034e-05, + "loss": 1.9135, + "step": 7475 + }, + { + "epoch": 0.5627505222153221, + "grad_norm": 21.876197814941406, + "learning_rate": 4.2476427179632635e-05, + "loss": 2.1445, + "step": 7476 + }, + { + "epoch": 0.5628257964959822, + "grad_norm": 5.575896739959717, + "learning_rate": 4.246437531338506e-05, + "loss": 2.0888, + "step": 7477 + }, + { + "epoch": 0.5629010707766424, + "grad_norm": 5.1024909019470215, + "learning_rate": 4.245232389510874e-05, + "loss": 1.9425, + "step": 7478 + }, + { + "epoch": 0.5629763450573025, + "grad_norm": 5.275901794433594, + "learning_rate": 4.24402729255201e-05, + "loss": 2.0351, + "step": 7479 + }, + { + "epoch": 0.5630516193379627, + "grad_norm": 7.1715850830078125, + "learning_rate": 4.242822240533558e-05, + "loss": 2.2203, + "step": 7480 + }, + { + "epoch": 0.5631268936186229, + "grad_norm": 4.021682262420654, + "learning_rate": 4.2416172335271476e-05, + "loss": 1.4009, + "step": 7481 + }, + { + "epoch": 0.563202167899283, + "grad_norm": 5.005763053894043, + "learning_rate": 4.240412271604419e-05, + "loss": 1.8137, + "step": 7482 + }, + { + "epoch": 0.5632774421799431, + "grad_norm": 5.314718246459961, + "learning_rate": 4.239207354837e-05, + "loss": 1.8505, + "step": 7483 + }, + { + "epoch": 0.5633527164606034, + "grad_norm": 5.31954288482666, + "learning_rate": 4.238002483296522e-05, + "loss": 1.9272, + "step": 7484 + }, + { + "epoch": 0.5634279907412635, + "grad_norm": 4.3423380851745605, + "learning_rate": 4.236797657054608e-05, + "loss": 1.6048, + "step": 7485 + }, + { + "epoch": 0.5635032650219236, + "grad_norm": 9.00117301940918, + "learning_rate": 4.2355928761828855e-05, + "loss": 1.6651, + "step": 7486 + }, + { + "epoch": 0.5635785393025838, + "grad_norm": 4.542453289031982, + "learning_rate": 4.234388140752972e-05, + "loss": 1.7389, + "step": 7487 + }, + { + "epoch": 0.563653813583244, + "grad_norm": 5.218205451965332, + "learning_rate": 4.233183450836487e-05, + "loss": 1.7961, + "step": 7488 + }, + { + "epoch": 0.5637290878639041, + "grad_norm": 4.660144329071045, + "learning_rate": 4.2319788065050445e-05, + "loss": 1.9375, + "step": 7489 + }, + { + "epoch": 0.5638043621445643, + "grad_norm": 5.165487289428711, + "learning_rate": 4.230774207830261e-05, + "loss": 1.9622, + "step": 7490 + }, + { + "epoch": 0.5638796364252244, + "grad_norm": 4.344994068145752, + "learning_rate": 4.229569654883741e-05, + "loss": 1.6855, + "step": 7491 + }, + { + "epoch": 0.5639549107058845, + "grad_norm": 6.870647430419922, + "learning_rate": 4.2283651477370966e-05, + "loss": 1.5097, + "step": 7492 + }, + { + "epoch": 0.5640301849865447, + "grad_norm": 4.5190935134887695, + "learning_rate": 4.227160686461928e-05, + "loss": 1.7008, + "step": 7493 + }, + { + "epoch": 0.5641054592672049, + "grad_norm": 22.18250274658203, + "learning_rate": 4.22595627112984e-05, + "loss": 1.8461, + "step": 7494 + }, + { + "epoch": 0.564180733547865, + "grad_norm": 5.605548858642578, + "learning_rate": 4.2247519018124306e-05, + "loss": 1.8153, + "step": 7495 + }, + { + "epoch": 0.5642560078285251, + "grad_norm": 4.350778102874756, + "learning_rate": 4.223547578581295e-05, + "loss": 1.6657, + "step": 7496 + }, + { + "epoch": 0.5643312821091854, + "grad_norm": 5.072646617889404, + "learning_rate": 4.2223433015080314e-05, + "loss": 1.7613, + "step": 7497 + }, + { + "epoch": 0.5644065563898455, + "grad_norm": 5.001066207885742, + "learning_rate": 4.221139070664224e-05, + "loss": 1.4896, + "step": 7498 + }, + { + "epoch": 0.5644818306705056, + "grad_norm": 6.147504806518555, + "learning_rate": 4.219934886121465e-05, + "loss": 1.812, + "step": 7499 + }, + { + "epoch": 0.5645571049511658, + "grad_norm": 6.255466461181641, + "learning_rate": 4.218730747951338e-05, + "loss": 2.0394, + "step": 7500 + }, + { + "epoch": 0.564632379231826, + "grad_norm": 7.103241443634033, + "learning_rate": 4.2175266562254287e-05, + "loss": 1.9296, + "step": 7501 + }, + { + "epoch": 0.5647076535124861, + "grad_norm": 7.3898749351501465, + "learning_rate": 4.216322611015312e-05, + "loss": 1.8299, + "step": 7502 + }, + { + "epoch": 0.5647829277931463, + "grad_norm": 5.107294082641602, + "learning_rate": 4.21511861239257e-05, + "loss": 1.5601, + "step": 7503 + }, + { + "epoch": 0.5648582020738064, + "grad_norm": 7.923689365386963, + "learning_rate": 4.2139146604287715e-05, + "loss": 1.9649, + "step": 7504 + }, + { + "epoch": 0.5649334763544666, + "grad_norm": 7.2628912925720215, + "learning_rate": 4.212710755195493e-05, + "loss": 1.8243, + "step": 7505 + }, + { + "epoch": 0.5650087506351268, + "grad_norm": 4.212297439575195, + "learning_rate": 4.2115068967643e-05, + "loss": 1.7723, + "step": 7506 + }, + { + "epoch": 0.5650840249157869, + "grad_norm": 5.4501142501831055, + "learning_rate": 4.2103030852067605e-05, + "loss": 1.8017, + "step": 7507 + }, + { + "epoch": 0.565159299196447, + "grad_norm": 4.330233573913574, + "learning_rate": 4.209099320594436e-05, + "loss": 2.0909, + "step": 7508 + }, + { + "epoch": 0.5652345734771073, + "grad_norm": 5.734197616577148, + "learning_rate": 4.2078956029988884e-05, + "loss": 1.8515, + "step": 7509 + }, + { + "epoch": 0.5653098477577674, + "grad_norm": 4.397119045257568, + "learning_rate": 4.206691932491673e-05, + "loss": 1.5727, + "step": 7510 + }, + { + "epoch": 0.5653851220384275, + "grad_norm": 5.802783489227295, + "learning_rate": 4.205488309144346e-05, + "loss": 1.6944, + "step": 7511 + }, + { + "epoch": 0.5654603963190876, + "grad_norm": 6.3564019203186035, + "learning_rate": 4.20428473302846e-05, + "loss": 1.8573, + "step": 7512 + }, + { + "epoch": 0.5655356705997479, + "grad_norm": 4.0578765869140625, + "learning_rate": 4.203081204215563e-05, + "loss": 1.5931, + "step": 7513 + }, + { + "epoch": 0.565610944880408, + "grad_norm": 4.7927117347717285, + "learning_rate": 4.2018777227772034e-05, + "loss": 2.0905, + "step": 7514 + }, + { + "epoch": 0.5656862191610681, + "grad_norm": 6.430075168609619, + "learning_rate": 4.20067428878492e-05, + "loss": 1.9327, + "step": 7515 + }, + { + "epoch": 0.5657614934417283, + "grad_norm": 4.997286796569824, + "learning_rate": 4.19947090231026e-05, + "loss": 1.932, + "step": 7516 + }, + { + "epoch": 0.5658367677223884, + "grad_norm": 5.49184513092041, + "learning_rate": 4.198267563424755e-05, + "loss": 2.0093, + "step": 7517 + }, + { + "epoch": 0.5659120420030486, + "grad_norm": 6.221096992492676, + "learning_rate": 4.1970642721999455e-05, + "loss": 1.7121, + "step": 7518 + }, + { + "epoch": 0.5659873162837088, + "grad_norm": 5.395580291748047, + "learning_rate": 4.195861028707359e-05, + "loss": 1.8832, + "step": 7519 + }, + { + "epoch": 0.5660625905643689, + "grad_norm": 4.471389293670654, + "learning_rate": 4.194657833018529e-05, + "loss": 1.9942, + "step": 7520 + }, + { + "epoch": 0.566137864845029, + "grad_norm": 5.878729343414307, + "learning_rate": 4.193454685204979e-05, + "loss": 1.9613, + "step": 7521 + }, + { + "epoch": 0.5662131391256893, + "grad_norm": 6.629793167114258, + "learning_rate": 4.192251585338234e-05, + "loss": 1.9033, + "step": 7522 + }, + { + "epoch": 0.5662884134063494, + "grad_norm": 5.211687088012695, + "learning_rate": 4.191048533489813e-05, + "loss": 1.8494, + "step": 7523 + }, + { + "epoch": 0.5663636876870095, + "grad_norm": 6.793416976928711, + "learning_rate": 4.189845529731238e-05, + "loss": 1.8274, + "step": 7524 + }, + { + "epoch": 0.5664389619676697, + "grad_norm": 4.75369119644165, + "learning_rate": 4.18864257413402e-05, + "loss": 2.1421, + "step": 7525 + }, + { + "epoch": 0.5665142362483299, + "grad_norm": 5.9005446434021, + "learning_rate": 4.187439666769672e-05, + "loss": 1.8238, + "step": 7526 + }, + { + "epoch": 0.56658951052899, + "grad_norm": 35.89638137817383, + "learning_rate": 4.186236807709706e-05, + "loss": 2.2183, + "step": 7527 + }, + { + "epoch": 0.5666647848096502, + "grad_norm": 6.612427711486816, + "learning_rate": 4.1850339970256266e-05, + "loss": 1.7168, + "step": 7528 + }, + { + "epoch": 0.5667400590903103, + "grad_norm": 4.51029109954834, + "learning_rate": 4.183831234788938e-05, + "loss": 1.6069, + "step": 7529 + }, + { + "epoch": 0.5668153333709705, + "grad_norm": 4.103471279144287, + "learning_rate": 4.18262852107114e-05, + "loss": 1.8781, + "step": 7530 + }, + { + "epoch": 0.5668906076516306, + "grad_norm": 6.257298946380615, + "learning_rate": 4.1814258559437325e-05, + "loss": 1.8776, + "step": 7531 + }, + { + "epoch": 0.5669658819322908, + "grad_norm": 5.5712199211120605, + "learning_rate": 4.180223239478208e-05, + "loss": 2.1598, + "step": 7532 + }, + { + "epoch": 0.5670411562129509, + "grad_norm": 5.68540620803833, + "learning_rate": 4.17902067174606e-05, + "loss": 2.3442, + "step": 7533 + }, + { + "epoch": 0.567116430493611, + "grad_norm": 5.268666744232178, + "learning_rate": 4.177818152818776e-05, + "loss": 2.0087, + "step": 7534 + }, + { + "epoch": 0.5671917047742713, + "grad_norm": 3.7052841186523438, + "learning_rate": 4.176615682767846e-05, + "loss": 1.863, + "step": 7535 + }, + { + "epoch": 0.5672669790549314, + "grad_norm": 7.956150054931641, + "learning_rate": 4.175413261664749e-05, + "loss": 1.6416, + "step": 7536 + }, + { + "epoch": 0.5673422533355915, + "grad_norm": 4.613858222961426, + "learning_rate": 4.17421088958097e-05, + "loss": 1.7873, + "step": 7537 + }, + { + "epoch": 0.5674175276162517, + "grad_norm": 5.168544769287109, + "learning_rate": 4.1730085665879814e-05, + "loss": 2.1064, + "step": 7538 + }, + { + "epoch": 0.5674928018969119, + "grad_norm": 5.967532157897949, + "learning_rate": 4.171806292757263e-05, + "loss": 1.9516, + "step": 7539 + }, + { + "epoch": 0.567568076177572, + "grad_norm": 4.7017364501953125, + "learning_rate": 4.170604068160282e-05, + "loss": 1.8806, + "step": 7540 + }, + { + "epoch": 0.5676433504582322, + "grad_norm": 3.658191204071045, + "learning_rate": 4.1694018928685114e-05, + "loss": 1.9929, + "step": 7541 + }, + { + "epoch": 0.5677186247388923, + "grad_norm": 4.997767448425293, + "learning_rate": 4.1681997669534134e-05, + "loss": 1.8761, + "step": 7542 + }, + { + "epoch": 0.5677938990195525, + "grad_norm": 4.4882049560546875, + "learning_rate": 4.166997690486452e-05, + "loss": 1.6011, + "step": 7543 + }, + { + "epoch": 0.5678691733002127, + "grad_norm": 4.326699733734131, + "learning_rate": 4.1657956635390896e-05, + "loss": 1.5773, + "step": 7544 + }, + { + "epoch": 0.5679444475808728, + "grad_norm": 5.023290157318115, + "learning_rate": 4.164593686182779e-05, + "loss": 2.1886, + "step": 7545 + }, + { + "epoch": 0.5680197218615329, + "grad_norm": 5.108522891998291, + "learning_rate": 4.1633917584889804e-05, + "loss": 1.7776, + "step": 7546 + }, + { + "epoch": 0.5680949961421932, + "grad_norm": 5.90935754776001, + "learning_rate": 4.1621898805291384e-05, + "loss": 1.9849, + "step": 7547 + }, + { + "epoch": 0.5681702704228533, + "grad_norm": 5.0524797439575195, + "learning_rate": 4.160988052374706e-05, + "loss": 2.0358, + "step": 7548 + }, + { + "epoch": 0.5682455447035134, + "grad_norm": 6.703489303588867, + "learning_rate": 4.159786274097125e-05, + "loss": 1.9873, + "step": 7549 + }, + { + "epoch": 0.5683208189841736, + "grad_norm": 4.991318225860596, + "learning_rate": 4.15858454576784e-05, + "loss": 1.6501, + "step": 7550 + }, + { + "epoch": 0.5683960932648338, + "grad_norm": 4.4793009757995605, + "learning_rate": 4.157382867458288e-05, + "loss": 1.9308, + "step": 7551 + }, + { + "epoch": 0.5684713675454939, + "grad_norm": 4.188916206359863, + "learning_rate": 4.156181239239909e-05, + "loss": 1.8858, + "step": 7552 + }, + { + "epoch": 0.568546641826154, + "grad_norm": 9.955094337463379, + "learning_rate": 4.154979661184132e-05, + "loss": 1.9399, + "step": 7553 + }, + { + "epoch": 0.5686219161068142, + "grad_norm": 4.845600605010986, + "learning_rate": 4.153778133362391e-05, + "loss": 2.0156, + "step": 7554 + }, + { + "epoch": 0.5686971903874743, + "grad_norm": 6.118020057678223, + "learning_rate": 4.15257665584611e-05, + "loss": 1.8297, + "step": 7555 + }, + { + "epoch": 0.5687724646681345, + "grad_norm": 4.025101184844971, + "learning_rate": 4.151375228706719e-05, + "loss": 2.2015, + "step": 7556 + }, + { + "epoch": 0.5688477389487947, + "grad_norm": 4.272671699523926, + "learning_rate": 4.150173852015632e-05, + "loss": 1.9844, + "step": 7557 + }, + { + "epoch": 0.5689230132294548, + "grad_norm": 3.4581785202026367, + "learning_rate": 4.148972525844272e-05, + "loss": 1.706, + "step": 7558 + }, + { + "epoch": 0.5689982875101149, + "grad_norm": 6.242349624633789, + "learning_rate": 4.147771250264055e-05, + "loss": 2.1029, + "step": 7559 + }, + { + "epoch": 0.5690735617907752, + "grad_norm": 10.583616256713867, + "learning_rate": 4.1465700253463895e-05, + "loss": 2.2592, + "step": 7560 + }, + { + "epoch": 0.5691488360714353, + "grad_norm": 4.287550926208496, + "learning_rate": 4.145368851162689e-05, + "loss": 1.9225, + "step": 7561 + }, + { + "epoch": 0.5692241103520954, + "grad_norm": 18.562786102294922, + "learning_rate": 4.144167727784357e-05, + "loss": 2.2432, + "step": 7562 + }, + { + "epoch": 0.5692993846327556, + "grad_norm": 3.963780403137207, + "learning_rate": 4.1429666552827994e-05, + "loss": 1.7815, + "step": 7563 + }, + { + "epoch": 0.5693746589134158, + "grad_norm": 6.12351655960083, + "learning_rate": 4.141765633729413e-05, + "loss": 1.949, + "step": 7564 + }, + { + "epoch": 0.5694499331940759, + "grad_norm": 12.092477798461914, + "learning_rate": 4.1405646631955996e-05, + "loss": 1.8686, + "step": 7565 + }, + { + "epoch": 0.5695252074747361, + "grad_norm": 4.8221917152404785, + "learning_rate": 4.139363743752749e-05, + "loss": 1.554, + "step": 7566 + }, + { + "epoch": 0.5696004817553962, + "grad_norm": 5.134352207183838, + "learning_rate": 4.138162875472256e-05, + "loss": 2.1465, + "step": 7567 + }, + { + "epoch": 0.5696757560360564, + "grad_norm": 8.267267227172852, + "learning_rate": 4.136962058425505e-05, + "loss": 1.513, + "step": 7568 + }, + { + "epoch": 0.5697510303167166, + "grad_norm": 5.418541431427002, + "learning_rate": 4.135761292683886e-05, + "loss": 1.9171, + "step": 7569 + }, + { + "epoch": 0.5698263045973767, + "grad_norm": 5.002322196960449, + "learning_rate": 4.134560578318776e-05, + "loss": 1.7065, + "step": 7570 + }, + { + "epoch": 0.5699015788780368, + "grad_norm": 4.396917819976807, + "learning_rate": 4.1333599154015564e-05, + "loss": 2.0018, + "step": 7571 + }, + { + "epoch": 0.569976853158697, + "grad_norm": 4.024701118469238, + "learning_rate": 4.132159304003604e-05, + "loss": 1.9425, + "step": 7572 + }, + { + "epoch": 0.5700521274393572, + "grad_norm": 5.082156181335449, + "learning_rate": 4.130958744196291e-05, + "loss": 1.4771, + "step": 7573 + }, + { + "epoch": 0.5701274017200173, + "grad_norm": 5.736774921417236, + "learning_rate": 4.129758236050987e-05, + "loss": 1.8898, + "step": 7574 + }, + { + "epoch": 0.5702026760006774, + "grad_norm": 7.4611382484436035, + "learning_rate": 4.1285577796390576e-05, + "loss": 1.8959, + "step": 7575 + }, + { + "epoch": 0.5702779502813377, + "grad_norm": 4.995553493499756, + "learning_rate": 4.127357375031871e-05, + "loss": 2.125, + "step": 7576 + }, + { + "epoch": 0.5703532245619978, + "grad_norm": 5.859127044677734, + "learning_rate": 4.126157022300781e-05, + "loss": 1.8727, + "step": 7577 + }, + { + "epoch": 0.5704284988426579, + "grad_norm": 4.033148765563965, + "learning_rate": 4.124956721517151e-05, + "loss": 1.4259, + "step": 7578 + }, + { + "epoch": 0.5705037731233181, + "grad_norm": 4.1409149169921875, + "learning_rate": 4.1237564727523315e-05, + "loss": 1.728, + "step": 7579 + }, + { + "epoch": 0.5705790474039782, + "grad_norm": 6.867090225219727, + "learning_rate": 4.122556276077677e-05, + "loss": 1.8162, + "step": 7580 + }, + { + "epoch": 0.5706543216846384, + "grad_norm": 5.356757640838623, + "learning_rate": 4.121356131564533e-05, + "loss": 1.8797, + "step": 7581 + }, + { + "epoch": 0.5707295959652986, + "grad_norm": 5.843905925750732, + "learning_rate": 4.120156039284246e-05, + "loss": 1.5183, + "step": 7582 + }, + { + "epoch": 0.5708048702459587, + "grad_norm": 5.252926826477051, + "learning_rate": 4.11895599930816e-05, + "loss": 1.896, + "step": 7583 + }, + { + "epoch": 0.5708801445266188, + "grad_norm": 4.3138813972473145, + "learning_rate": 4.1177560117076095e-05, + "loss": 1.521, + "step": 7584 + }, + { + "epoch": 0.5709554188072791, + "grad_norm": 4.822055339813232, + "learning_rate": 4.1165560765539326e-05, + "loss": 1.9798, + "step": 7585 + }, + { + "epoch": 0.5710306930879392, + "grad_norm": 5.2819743156433105, + "learning_rate": 4.115356193918464e-05, + "loss": 1.9232, + "step": 7586 + }, + { + "epoch": 0.5711059673685993, + "grad_norm": 7.974135875701904, + "learning_rate": 4.11415636387253e-05, + "loss": 1.4495, + "step": 7587 + }, + { + "epoch": 0.5711812416492595, + "grad_norm": 5.3852128982543945, + "learning_rate": 4.112956586487458e-05, + "loss": 1.5679, + "step": 7588 + }, + { + "epoch": 0.5712565159299197, + "grad_norm": 9.800442695617676, + "learning_rate": 4.1117568618345713e-05, + "loss": 1.8617, + "step": 7589 + }, + { + "epoch": 0.5713317902105798, + "grad_norm": 4.913433074951172, + "learning_rate": 4.110557189985193e-05, + "loss": 1.7764, + "step": 7590 + }, + { + "epoch": 0.5714070644912399, + "grad_norm": 4.534649848937988, + "learning_rate": 4.109357571010635e-05, + "loss": 1.6252, + "step": 7591 + }, + { + "epoch": 0.5714823387719001, + "grad_norm": 5.556998252868652, + "learning_rate": 4.108158004982214e-05, + "loss": 1.992, + "step": 7592 + }, + { + "epoch": 0.5715576130525603, + "grad_norm": 4.160196781158447, + "learning_rate": 4.106958491971243e-05, + "loss": 1.9837, + "step": 7593 + }, + { + "epoch": 0.5716328873332204, + "grad_norm": 8.108696937561035, + "learning_rate": 4.1057590320490254e-05, + "loss": 1.6993, + "step": 7594 + }, + { + "epoch": 0.5717081616138806, + "grad_norm": 6.701841831207275, + "learning_rate": 4.104559625286868e-05, + "loss": 1.9985, + "step": 7595 + }, + { + "epoch": 0.5717834358945407, + "grad_norm": 4.406612396240234, + "learning_rate": 4.103360271756071e-05, + "loss": 1.4843, + "step": 7596 + }, + { + "epoch": 0.5718587101752008, + "grad_norm": 5.234290599822998, + "learning_rate": 4.102160971527935e-05, + "loss": 1.8815, + "step": 7597 + }, + { + "epoch": 0.5719339844558611, + "grad_norm": 5.806726455688477, + "learning_rate": 4.10096172467375e-05, + "loss": 1.984, + "step": 7598 + }, + { + "epoch": 0.5720092587365212, + "grad_norm": 6.453545093536377, + "learning_rate": 4.099762531264813e-05, + "loss": 2.0246, + "step": 7599 + }, + { + "epoch": 0.5720845330171813, + "grad_norm": 8.040641784667969, + "learning_rate": 4.098563391372409e-05, + "loss": 1.9438, + "step": 7600 + }, + { + "epoch": 0.5721598072978415, + "grad_norm": 8.319119453430176, + "learning_rate": 4.0973643050678274e-05, + "loss": 1.6654, + "step": 7601 + }, + { + "epoch": 0.5722350815785017, + "grad_norm": 24.514068603515625, + "learning_rate": 4.0961652724223456e-05, + "loss": 1.6127, + "step": 7602 + }, + { + "epoch": 0.5723103558591618, + "grad_norm": 4.91010856628418, + "learning_rate": 4.094966293507246e-05, + "loss": 1.898, + "step": 7603 + }, + { + "epoch": 0.572385630139822, + "grad_norm": 3.823251485824585, + "learning_rate": 4.0937673683938026e-05, + "loss": 1.7655, + "step": 7604 + }, + { + "epoch": 0.5724609044204821, + "grad_norm": 7.659752368927002, + "learning_rate": 4.0925684971532896e-05, + "loss": 1.6663, + "step": 7605 + }, + { + "epoch": 0.5725361787011423, + "grad_norm": 4.408021926879883, + "learning_rate": 4.091369679856975e-05, + "loss": 1.9852, + "step": 7606 + }, + { + "epoch": 0.5726114529818025, + "grad_norm": 4.5239973068237305, + "learning_rate": 4.090170916576125e-05, + "loss": 1.8239, + "step": 7607 + }, + { + "epoch": 0.5726867272624626, + "grad_norm": 5.948333740234375, + "learning_rate": 4.088972207382006e-05, + "loss": 1.7147, + "step": 7608 + }, + { + "epoch": 0.5727620015431227, + "grad_norm": 5.049926280975342, + "learning_rate": 4.087773552345873e-05, + "loss": 1.7191, + "step": 7609 + }, + { + "epoch": 0.5728372758237829, + "grad_norm": 6.151418685913086, + "learning_rate": 4.086574951538987e-05, + "loss": 2.213, + "step": 7610 + }, + { + "epoch": 0.5729125501044431, + "grad_norm": 11.300714492797852, + "learning_rate": 4.0853764050325975e-05, + "loss": 1.7782, + "step": 7611 + }, + { + "epoch": 0.5729878243851032, + "grad_norm": 10.53197193145752, + "learning_rate": 4.084177912897957e-05, + "loss": 1.6997, + "step": 7612 + }, + { + "epoch": 0.5730630986657633, + "grad_norm": 12.738458633422852, + "learning_rate": 4.082979475206311e-05, + "loss": 2.061, + "step": 7613 + }, + { + "epoch": 0.5731383729464236, + "grad_norm": 8.15504264831543, + "learning_rate": 4.0817810920289054e-05, + "loss": 2.0792, + "step": 7614 + }, + { + "epoch": 0.5732136472270837, + "grad_norm": 10.665327072143555, + "learning_rate": 4.080582763436978e-05, + "loss": 1.9999, + "step": 7615 + }, + { + "epoch": 0.5732889215077438, + "grad_norm": 6.009346961975098, + "learning_rate": 4.079384489501768e-05, + "loss": 1.8116, + "step": 7616 + }, + { + "epoch": 0.573364195788404, + "grad_norm": 6.944472312927246, + "learning_rate": 4.078186270294507e-05, + "loss": 1.8726, + "step": 7617 + }, + { + "epoch": 0.5734394700690641, + "grad_norm": 7.390512466430664, + "learning_rate": 4.07698810588643e-05, + "loss": 1.8692, + "step": 7618 + }, + { + "epoch": 0.5735147443497243, + "grad_norm": 7.753500938415527, + "learning_rate": 4.075789996348759e-05, + "loss": 1.8387, + "step": 7619 + }, + { + "epoch": 0.5735900186303845, + "grad_norm": 6.456392288208008, + "learning_rate": 4.074591941752724e-05, + "loss": 1.8525, + "step": 7620 + }, + { + "epoch": 0.5736652929110446, + "grad_norm": 15.365704536437988, + "learning_rate": 4.073393942169539e-05, + "loss": 2.39, + "step": 7621 + }, + { + "epoch": 0.5737405671917047, + "grad_norm": 5.82509183883667, + "learning_rate": 4.0721959976704274e-05, + "loss": 1.8625, + "step": 7622 + }, + { + "epoch": 0.573815841472365, + "grad_norm": 6.718338966369629, + "learning_rate": 4.0709981083266005e-05, + "loss": 1.5799, + "step": 7623 + }, + { + "epoch": 0.5738911157530251, + "grad_norm": 4.886624336242676, + "learning_rate": 4.069800274209271e-05, + "loss": 2.006, + "step": 7624 + }, + { + "epoch": 0.5739663900336852, + "grad_norm": 6.082042694091797, + "learning_rate": 4.0686024953896476e-05, + "loss": 1.6803, + "step": 7625 + }, + { + "epoch": 0.5740416643143454, + "grad_norm": 33.16362380981445, + "learning_rate": 4.0674047719389316e-05, + "loss": 1.8149, + "step": 7626 + }, + { + "epoch": 0.5741169385950056, + "grad_norm": 6.0648651123046875, + "learning_rate": 4.0662071039283266e-05, + "loss": 1.6786, + "step": 7627 + }, + { + "epoch": 0.5741922128756657, + "grad_norm": 4.450899600982666, + "learning_rate": 4.065009491429029e-05, + "loss": 1.6123, + "step": 7628 + }, + { + "epoch": 0.5742674871563259, + "grad_norm": 11.947538375854492, + "learning_rate": 4.0638119345122367e-05, + "loss": 1.5006, + "step": 7629 + }, + { + "epoch": 0.574342761436986, + "grad_norm": 5.139440059661865, + "learning_rate": 4.0626144332491366e-05, + "loss": 1.7439, + "step": 7630 + }, + { + "epoch": 0.5744180357176462, + "grad_norm": 5.500138282775879, + "learning_rate": 4.061416987710921e-05, + "loss": 1.9787, + "step": 7631 + }, + { + "epoch": 0.5744933099983063, + "grad_norm": 64.40674591064453, + "learning_rate": 4.0602195979687694e-05, + "loss": 2.1109, + "step": 7632 + }, + { + "epoch": 0.5745685842789665, + "grad_norm": 5.353816509246826, + "learning_rate": 4.059022264093868e-05, + "loss": 1.5944, + "step": 7633 + }, + { + "epoch": 0.5746438585596266, + "grad_norm": 4.684369087219238, + "learning_rate": 4.057824986157392e-05, + "loss": 1.8784, + "step": 7634 + }, + { + "epoch": 0.5747191328402868, + "grad_norm": 89.47626495361328, + "learning_rate": 4.056627764230519e-05, + "loss": 1.8921, + "step": 7635 + }, + { + "epoch": 0.574794407120947, + "grad_norm": 10.382705688476562, + "learning_rate": 4.055430598384416e-05, + "loss": 1.7594, + "step": 7636 + }, + { + "epoch": 0.5748696814016071, + "grad_norm": 5.9232869148254395, + "learning_rate": 4.0542334886902564e-05, + "loss": 1.7076, + "step": 7637 + }, + { + "epoch": 0.5749449556822672, + "grad_norm": 16.931175231933594, + "learning_rate": 4.0530364352192e-05, + "loss": 1.996, + "step": 7638 + }, + { + "epoch": 0.5750202299629275, + "grad_norm": 5.3834991455078125, + "learning_rate": 4.0518394380424105e-05, + "loss": 1.9526, + "step": 7639 + }, + { + "epoch": 0.5750955042435876, + "grad_norm": 6.087472438812256, + "learning_rate": 4.050642497231047e-05, + "loss": 1.7579, + "step": 7640 + }, + { + "epoch": 0.5751707785242477, + "grad_norm": 15.0820951461792, + "learning_rate": 4.0494456128562616e-05, + "loss": 1.7045, + "step": 7641 + }, + { + "epoch": 0.5752460528049079, + "grad_norm": 7.258744239807129, + "learning_rate": 4.048248784989209e-05, + "loss": 1.9194, + "step": 7642 + }, + { + "epoch": 0.575321327085568, + "grad_norm": 7.857977867126465, + "learning_rate": 4.047052013701034e-05, + "loss": 1.7645, + "step": 7643 + }, + { + "epoch": 0.5753966013662282, + "grad_norm": 7.7916107177734375, + "learning_rate": 4.045855299062883e-05, + "loss": 1.6341, + "step": 7644 + }, + { + "epoch": 0.5754718756468884, + "grad_norm": 7.143621444702148, + "learning_rate": 4.0446586411458955e-05, + "loss": 1.5135, + "step": 7645 + }, + { + "epoch": 0.5755471499275485, + "grad_norm": 5.207786560058594, + "learning_rate": 4.0434620400212126e-05, + "loss": 1.8042, + "step": 7646 + }, + { + "epoch": 0.5756224242082086, + "grad_norm": 7.3629913330078125, + "learning_rate": 4.042265495759965e-05, + "loss": 2.0324, + "step": 7647 + }, + { + "epoch": 0.5756976984888689, + "grad_norm": 8.182631492614746, + "learning_rate": 4.041069008433287e-05, + "loss": 1.9298, + "step": 7648 + }, + { + "epoch": 0.575772972769529, + "grad_norm": 5.825046539306641, + "learning_rate": 4.039872578112304e-05, + "loss": 1.6621, + "step": 7649 + }, + { + "epoch": 0.5758482470501891, + "grad_norm": 6.1910529136657715, + "learning_rate": 4.038676204868142e-05, + "loss": 1.8166, + "step": 7650 + }, + { + "epoch": 0.5759235213308492, + "grad_norm": 13.692588806152344, + "learning_rate": 4.03747988877192e-05, + "loss": 1.3486, + "step": 7651 + }, + { + "epoch": 0.5759987956115095, + "grad_norm": 6.388610363006592, + "learning_rate": 4.036283629894759e-05, + "loss": 1.8297, + "step": 7652 + }, + { + "epoch": 0.5760740698921696, + "grad_norm": 5.442832946777344, + "learning_rate": 4.035087428307769e-05, + "loss": 1.7994, + "step": 7653 + }, + { + "epoch": 0.5761493441728297, + "grad_norm": 5.421885013580322, + "learning_rate": 4.033891284082064e-05, + "loss": 1.8196, + "step": 7654 + }, + { + "epoch": 0.5762246184534899, + "grad_norm": 5.447461128234863, + "learning_rate": 4.032695197288748e-05, + "loss": 1.6935, + "step": 7655 + }, + { + "epoch": 0.57629989273415, + "grad_norm": 4.517678260803223, + "learning_rate": 4.0314991679989286e-05, + "loss": 1.7598, + "step": 7656 + }, + { + "epoch": 0.5763751670148102, + "grad_norm": 5.556516170501709, + "learning_rate": 4.030303196283706e-05, + "loss": 1.9433, + "step": 7657 + }, + { + "epoch": 0.5764504412954704, + "grad_norm": 5.049991607666016, + "learning_rate": 4.029107282214174e-05, + "loss": 1.7958, + "step": 7658 + }, + { + "epoch": 0.5765257155761305, + "grad_norm": 5.4705963134765625, + "learning_rate": 4.02791142586143e-05, + "loss": 2.1544, + "step": 7659 + }, + { + "epoch": 0.5766009898567906, + "grad_norm": 5.582090854644775, + "learning_rate": 4.026715627296561e-05, + "loss": 1.9455, + "step": 7660 + }, + { + "epoch": 0.5766762641374509, + "grad_norm": 6.407038688659668, + "learning_rate": 4.025519886590656e-05, + "loss": 1.9572, + "step": 7661 + }, + { + "epoch": 0.576751538418111, + "grad_norm": 4.496862888336182, + "learning_rate": 4.024324203814797e-05, + "loss": 1.7405, + "step": 7662 + }, + { + "epoch": 0.5768268126987711, + "grad_norm": 5.327667713165283, + "learning_rate": 4.023128579040066e-05, + "loss": 2.0863, + "step": 7663 + }, + { + "epoch": 0.5769020869794313, + "grad_norm": 4.400580883026123, + "learning_rate": 4.021933012337536e-05, + "loss": 1.53, + "step": 7664 + }, + { + "epoch": 0.5769773612600915, + "grad_norm": 9.315071105957031, + "learning_rate": 4.020737503778284e-05, + "loss": 2.0147, + "step": 7665 + }, + { + "epoch": 0.5770526355407516, + "grad_norm": 4.649260520935059, + "learning_rate": 4.019542053433376e-05, + "loss": 1.584, + "step": 7666 + }, + { + "epoch": 0.5771279098214118, + "grad_norm": 9.710540771484375, + "learning_rate": 4.018346661373881e-05, + "loss": 1.6173, + "step": 7667 + }, + { + "epoch": 0.5772031841020719, + "grad_norm": 6.309180736541748, + "learning_rate": 4.017151327670858e-05, + "loss": 1.6811, + "step": 7668 + }, + { + "epoch": 0.5772784583827321, + "grad_norm": 4.205515384674072, + "learning_rate": 4.0159560523953715e-05, + "loss": 1.7661, + "step": 7669 + }, + { + "epoch": 0.5773537326633922, + "grad_norm": 6.521707534790039, + "learning_rate": 4.014760835618472e-05, + "loss": 1.6006, + "step": 7670 + }, + { + "epoch": 0.5774290069440524, + "grad_norm": 12.977082252502441, + "learning_rate": 4.013565677411214e-05, + "loss": 1.9718, + "step": 7671 + }, + { + "epoch": 0.5775042812247125, + "grad_norm": 5.760602951049805, + "learning_rate": 4.0123705778446464e-05, + "loss": 1.7383, + "step": 7672 + }, + { + "epoch": 0.5775795555053727, + "grad_norm": 8.371894836425781, + "learning_rate": 4.011175536989814e-05, + "loss": 2.0667, + "step": 7673 + }, + { + "epoch": 0.5776548297860329, + "grad_norm": 7.219283580780029, + "learning_rate": 4.00998055491776e-05, + "loss": 2.0219, + "step": 7674 + }, + { + "epoch": 0.577730104066693, + "grad_norm": 6.496978282928467, + "learning_rate": 4.0087856316995195e-05, + "loss": 1.6115, + "step": 7675 + }, + { + "epoch": 0.5778053783473531, + "grad_norm": 5.030826568603516, + "learning_rate": 4.007590767406131e-05, + "loss": 1.3646, + "step": 7676 + }, + { + "epoch": 0.5778806526280134, + "grad_norm": 10.293353080749512, + "learning_rate": 4.006395962108622e-05, + "loss": 1.7044, + "step": 7677 + }, + { + "epoch": 0.5779559269086735, + "grad_norm": 6.060354232788086, + "learning_rate": 4.005201215878023e-05, + "loss": 1.9537, + "step": 7678 + }, + { + "epoch": 0.5780312011893336, + "grad_norm": 7.648204803466797, + "learning_rate": 4.004006528785357e-05, + "loss": 1.5125, + "step": 7679 + }, + { + "epoch": 0.5781064754699938, + "grad_norm": 9.064006805419922, + "learning_rate": 4.0028119009016465e-05, + "loss": 1.9324, + "step": 7680 + }, + { + "epoch": 0.578181749750654, + "grad_norm": 6.551533222198486, + "learning_rate": 4.001617332297905e-05, + "loss": 1.7619, + "step": 7681 + }, + { + "epoch": 0.5782570240313141, + "grad_norm": 4.220593452453613, + "learning_rate": 4.00042282304515e-05, + "loss": 1.8916, + "step": 7682 + }, + { + "epoch": 0.5783322983119743, + "grad_norm": 8.592142105102539, + "learning_rate": 3.999228373214388e-05, + "loss": 1.457, + "step": 7683 + }, + { + "epoch": 0.5784075725926344, + "grad_norm": 6.103160381317139, + "learning_rate": 3.99803398287663e-05, + "loss": 1.7634, + "step": 7684 + }, + { + "epoch": 0.5784828468732945, + "grad_norm": 6.902022838592529, + "learning_rate": 3.9968396521028754e-05, + "loss": 1.747, + "step": 7685 + }, + { + "epoch": 0.5785581211539548, + "grad_norm": 6.451738357543945, + "learning_rate": 3.995645380964127e-05, + "loss": 1.7657, + "step": 7686 + }, + { + "epoch": 0.5786333954346149, + "grad_norm": 8.722206115722656, + "learning_rate": 3.994451169531376e-05, + "loss": 2.0234, + "step": 7687 + }, + { + "epoch": 0.578708669715275, + "grad_norm": 5.986215591430664, + "learning_rate": 3.993257017875619e-05, + "loss": 1.668, + "step": 7688 + }, + { + "epoch": 0.5787839439959351, + "grad_norm": 4.706733703613281, + "learning_rate": 3.992062926067844e-05, + "loss": 2.0764, + "step": 7689 + }, + { + "epoch": 0.5788592182765954, + "grad_norm": 4.810065746307373, + "learning_rate": 3.990868894179035e-05, + "loss": 1.7711, + "step": 7690 + }, + { + "epoch": 0.5789344925572555, + "grad_norm": 5.2552995681762695, + "learning_rate": 3.9896749222801765e-05, + "loss": 1.919, + "step": 7691 + }, + { + "epoch": 0.5790097668379156, + "grad_norm": 8.421819686889648, + "learning_rate": 3.9884810104422434e-05, + "loss": 2.88, + "step": 7692 + }, + { + "epoch": 0.5790850411185758, + "grad_norm": 29.928903579711914, + "learning_rate": 3.987287158736213e-05, + "loss": 1.9934, + "step": 7693 + }, + { + "epoch": 0.579160315399236, + "grad_norm": 9.155946731567383, + "learning_rate": 3.986093367233054e-05, + "loss": 2.1684, + "step": 7694 + }, + { + "epoch": 0.5792355896798961, + "grad_norm": 6.801254749298096, + "learning_rate": 3.984899636003736e-05, + "loss": 1.7315, + "step": 7695 + }, + { + "epoch": 0.5793108639605563, + "grad_norm": 8.98769760131836, + "learning_rate": 3.983705965119221e-05, + "loss": 1.9001, + "step": 7696 + }, + { + "epoch": 0.5793861382412164, + "grad_norm": 7.122663497924805, + "learning_rate": 3.982512354650472e-05, + "loss": 1.621, + "step": 7697 + }, + { + "epoch": 0.5794614125218766, + "grad_norm": 6.252668857574463, + "learning_rate": 3.981318804668442e-05, + "loss": 1.8167, + "step": 7698 + }, + { + "epoch": 0.5795366868025368, + "grad_norm": 5.9064812660217285, + "learning_rate": 3.980125315244088e-05, + "loss": 1.3788, + "step": 7699 + }, + { + "epoch": 0.5796119610831969, + "grad_norm": 102.83370971679688, + "learning_rate": 3.978931886448355e-05, + "loss": 1.5207, + "step": 7700 + }, + { + "epoch": 0.579687235363857, + "grad_norm": 7.330103397369385, + "learning_rate": 3.977738518352195e-05, + "loss": 1.7526, + "step": 7701 + }, + { + "epoch": 0.5797625096445173, + "grad_norm": 7.552342891693115, + "learning_rate": 3.976545211026545e-05, + "loss": 2.1627, + "step": 7702 + }, + { + "epoch": 0.5798377839251774, + "grad_norm": 7.449044227600098, + "learning_rate": 3.975351964542346e-05, + "loss": 1.6889, + "step": 7703 + }, + { + "epoch": 0.5799130582058375, + "grad_norm": 6.8733038902282715, + "learning_rate": 3.974158778970532e-05, + "loss": 1.6129, + "step": 7704 + }, + { + "epoch": 0.5799883324864977, + "grad_norm": 5.719249725341797, + "learning_rate": 3.9729656543820346e-05, + "loss": 1.929, + "step": 7705 + }, + { + "epoch": 0.5800636067671578, + "grad_norm": 6.353752136230469, + "learning_rate": 3.971772590847783e-05, + "loss": 2.2031, + "step": 7706 + }, + { + "epoch": 0.580138881047818, + "grad_norm": 6.378872394561768, + "learning_rate": 3.970579588438701e-05, + "loss": 1.6825, + "step": 7707 + }, + { + "epoch": 0.5802141553284781, + "grad_norm": 8.938995361328125, + "learning_rate": 3.969386647225709e-05, + "loss": 1.8722, + "step": 7708 + }, + { + "epoch": 0.5802894296091383, + "grad_norm": 5.455897808074951, + "learning_rate": 3.968193767279722e-05, + "loss": 1.7259, + "step": 7709 + }, + { + "epoch": 0.5803647038897984, + "grad_norm": 10.626827239990234, + "learning_rate": 3.9670009486716566e-05, + "loss": 2.1713, + "step": 7710 + }, + { + "epoch": 0.5804399781704586, + "grad_norm": 8.435165405273438, + "learning_rate": 3.96580819147242e-05, + "loss": 1.5515, + "step": 7711 + }, + { + "epoch": 0.5805152524511188, + "grad_norm": 18.149642944335938, + "learning_rate": 3.9646154957529196e-05, + "loss": 1.5665, + "step": 7712 + }, + { + "epoch": 0.5805905267317789, + "grad_norm": 5.684018611907959, + "learning_rate": 3.963422861584056e-05, + "loss": 2.0705, + "step": 7713 + }, + { + "epoch": 0.580665801012439, + "grad_norm": 4.610129356384277, + "learning_rate": 3.9622302890367316e-05, + "loss": 1.5745, + "step": 7714 + }, + { + "epoch": 0.5807410752930993, + "grad_norm": 10.838143348693848, + "learning_rate": 3.9610377781818356e-05, + "loss": 1.8308, + "step": 7715 + }, + { + "epoch": 0.5808163495737594, + "grad_norm": 5.506286144256592, + "learning_rate": 3.9598453290902646e-05, + "loss": 1.8067, + "step": 7716 + }, + { + "epoch": 0.5808916238544195, + "grad_norm": 52.67744445800781, + "learning_rate": 3.9586529418329024e-05, + "loss": 1.9501, + "step": 7717 + }, + { + "epoch": 0.5809668981350797, + "grad_norm": 5.856502056121826, + "learning_rate": 3.957460616480637e-05, + "loss": 1.7384, + "step": 7718 + }, + { + "epoch": 0.5810421724157399, + "grad_norm": 6.297516345977783, + "learning_rate": 3.956268353104346e-05, + "loss": 1.7496, + "step": 7719 + }, + { + "epoch": 0.5811174466964, + "grad_norm": 6.518642902374268, + "learning_rate": 3.9550761517749046e-05, + "loss": 2.0549, + "step": 7720 + }, + { + "epoch": 0.5811927209770602, + "grad_norm": 7.070342063903809, + "learning_rate": 3.9538840125631904e-05, + "loss": 1.8289, + "step": 7721 + }, + { + "epoch": 0.5812679952577203, + "grad_norm": 25.646183013916016, + "learning_rate": 3.952691935540068e-05, + "loss": 2.117, + "step": 7722 + }, + { + "epoch": 0.5813432695383804, + "grad_norm": 7.958316802978516, + "learning_rate": 3.9514999207764064e-05, + "loss": 1.9843, + "step": 7723 + }, + { + "epoch": 0.5814185438190407, + "grad_norm": 8.61415958404541, + "learning_rate": 3.950307968343065e-05, + "loss": 2.1666, + "step": 7724 + }, + { + "epoch": 0.5814938180997008, + "grad_norm": 9.46127700805664, + "learning_rate": 3.949116078310905e-05, + "loss": 1.7044, + "step": 7725 + }, + { + "epoch": 0.5815690923803609, + "grad_norm": 16.115154266357422, + "learning_rate": 3.947924250750776e-05, + "loss": 1.6184, + "step": 7726 + }, + { + "epoch": 0.5816443666610212, + "grad_norm": 4.614068508148193, + "learning_rate": 3.946732485733533e-05, + "loss": 1.7835, + "step": 7727 + }, + { + "epoch": 0.5817196409416813, + "grad_norm": 11.254007339477539, + "learning_rate": 3.9455407833300206e-05, + "loss": 1.8339, + "step": 7728 + }, + { + "epoch": 0.5817949152223414, + "grad_norm": 112.46536254882812, + "learning_rate": 3.9443491436110856e-05, + "loss": 1.6812, + "step": 7729 + }, + { + "epoch": 0.5818701895030015, + "grad_norm": 5.2983317375183105, + "learning_rate": 3.9431575666475625e-05, + "loss": 1.9183, + "step": 7730 + }, + { + "epoch": 0.5819454637836617, + "grad_norm": 5.490381240844727, + "learning_rate": 3.941966052510292e-05, + "loss": 1.8509, + "step": 7731 + }, + { + "epoch": 0.5820207380643219, + "grad_norm": 6.620068550109863, + "learning_rate": 3.940774601270101e-05, + "loss": 1.7066, + "step": 7732 + }, + { + "epoch": 0.582096012344982, + "grad_norm": 6.261382102966309, + "learning_rate": 3.939583212997823e-05, + "loss": 1.6854, + "step": 7733 + }, + { + "epoch": 0.5821712866256422, + "grad_norm": 17.490434646606445, + "learning_rate": 3.938391887764278e-05, + "loss": 1.774, + "step": 7734 + }, + { + "epoch": 0.5822465609063023, + "grad_norm": 11.957138061523438, + "learning_rate": 3.9372006256402916e-05, + "loss": 2.0122, + "step": 7735 + }, + { + "epoch": 0.5823218351869625, + "grad_norm": 5.503768444061279, + "learning_rate": 3.9360094266966767e-05, + "loss": 2.0174, + "step": 7736 + }, + { + "epoch": 0.5823971094676227, + "grad_norm": 6.774337291717529, + "learning_rate": 3.934818291004248e-05, + "loss": 1.5783, + "step": 7737 + }, + { + "epoch": 0.5824723837482828, + "grad_norm": 6.354311466217041, + "learning_rate": 3.933627218633818e-05, + "loss": 1.9314, + "step": 7738 + }, + { + "epoch": 0.5825476580289429, + "grad_norm": 8.407363891601562, + "learning_rate": 3.9324362096561887e-05, + "loss": 1.7868, + "step": 7739 + }, + { + "epoch": 0.5826229323096032, + "grad_norm": 8.599726676940918, + "learning_rate": 3.931245264142163e-05, + "loss": 1.586, + "step": 7740 + }, + { + "epoch": 0.5826982065902633, + "grad_norm": 14.862786293029785, + "learning_rate": 3.93005438216254e-05, + "loss": 1.7719, + "step": 7741 + }, + { + "epoch": 0.5827734808709234, + "grad_norm": 7.204057693481445, + "learning_rate": 3.928863563788116e-05, + "loss": 1.9382, + "step": 7742 + }, + { + "epoch": 0.5828487551515836, + "grad_norm": 8.053291320800781, + "learning_rate": 3.9276728090896765e-05, + "loss": 1.9003, + "step": 7743 + }, + { + "epoch": 0.5829240294322438, + "grad_norm": 6.424757480621338, + "learning_rate": 3.926482118138013e-05, + "loss": 1.5787, + "step": 7744 + }, + { + "epoch": 0.5829993037129039, + "grad_norm": 18.074607849121094, + "learning_rate": 3.9252914910039076e-05, + "loss": 2.0957, + "step": 7745 + }, + { + "epoch": 0.5830745779935641, + "grad_norm": 7.47772216796875, + "learning_rate": 3.92410092775814e-05, + "loss": 1.9102, + "step": 7746 + }, + { + "epoch": 0.5831498522742242, + "grad_norm": 8.709019660949707, + "learning_rate": 3.9229104284714836e-05, + "loss": 1.7954, + "step": 7747 + }, + { + "epoch": 0.5832251265548843, + "grad_norm": 10.641573905944824, + "learning_rate": 3.921719993214714e-05, + "loss": 1.9079, + "step": 7748 + }, + { + "epoch": 0.5833004008355445, + "grad_norm": 86.91674041748047, + "learning_rate": 3.9205296220585944e-05, + "loss": 2.2703, + "step": 7749 + }, + { + "epoch": 0.5833756751162047, + "grad_norm": 8.8544340133667, + "learning_rate": 3.919339315073893e-05, + "loss": 1.9014, + "step": 7750 + }, + { + "epoch": 0.5834509493968648, + "grad_norm": 10.9514799118042, + "learning_rate": 3.918149072331367e-05, + "loss": 1.8172, + "step": 7751 + }, + { + "epoch": 0.5835262236775249, + "grad_norm": 7.188732624053955, + "learning_rate": 3.916958893901774e-05, + "loss": 1.8594, + "step": 7752 + }, + { + "epoch": 0.5836014979581852, + "grad_norm": 19.22016143798828, + "learning_rate": 3.9157687798558706e-05, + "loss": 1.7865, + "step": 7753 + }, + { + "epoch": 0.5836767722388453, + "grad_norm": 7.07549524307251, + "learning_rate": 3.9145787302643996e-05, + "loss": 1.8274, + "step": 7754 + }, + { + "epoch": 0.5837520465195054, + "grad_norm": 20.174047470092773, + "learning_rate": 3.9133887451981096e-05, + "loss": 1.7724, + "step": 7755 + }, + { + "epoch": 0.5838273208001656, + "grad_norm": 10.372930526733398, + "learning_rate": 3.9121988247277394e-05, + "loss": 1.5365, + "step": 7756 + }, + { + "epoch": 0.5839025950808258, + "grad_norm": 4.67086935043335, + "learning_rate": 3.9110089689240304e-05, + "loss": 1.7607, + "step": 7757 + }, + { + "epoch": 0.5839778693614859, + "grad_norm": 5.123300552368164, + "learning_rate": 3.9098191778577106e-05, + "loss": 1.7086, + "step": 7758 + }, + { + "epoch": 0.5840531436421461, + "grad_norm": 8.166887283325195, + "learning_rate": 3.908629451599516e-05, + "loss": 2.0545, + "step": 7759 + }, + { + "epoch": 0.5841284179228062, + "grad_norm": 7.1622090339660645, + "learning_rate": 3.907439790220166e-05, + "loss": 1.8178, + "step": 7760 + }, + { + "epoch": 0.5842036922034664, + "grad_norm": 10.619863510131836, + "learning_rate": 3.906250193790387e-05, + "loss": 1.5488, + "step": 7761 + }, + { + "epoch": 0.5842789664841266, + "grad_norm": 73.42439270019531, + "learning_rate": 3.9050606623808945e-05, + "loss": 1.664, + "step": 7762 + }, + { + "epoch": 0.5843542407647867, + "grad_norm": 9.640317916870117, + "learning_rate": 3.903871196062406e-05, + "loss": 1.9813, + "step": 7763 + }, + { + "epoch": 0.5844295150454468, + "grad_norm": 6.3367156982421875, + "learning_rate": 3.9026817949056274e-05, + "loss": 1.6749, + "step": 7764 + }, + { + "epoch": 0.584504789326107, + "grad_norm": 6.502419948577881, + "learning_rate": 3.90149245898127e-05, + "loss": 1.9261, + "step": 7765 + }, + { + "epoch": 0.5845800636067672, + "grad_norm": 7.96453857421875, + "learning_rate": 3.900303188360032e-05, + "loss": 2.4439, + "step": 7766 + }, + { + "epoch": 0.5846553378874273, + "grad_norm": 10.199264526367188, + "learning_rate": 3.899113983112614e-05, + "loss": 1.7262, + "step": 7767 + }, + { + "epoch": 0.5847306121680874, + "grad_norm": 7.787285804748535, + "learning_rate": 3.897924843309711e-05, + "loss": 1.5831, + "step": 7768 + }, + { + "epoch": 0.5848058864487476, + "grad_norm": 10.597495079040527, + "learning_rate": 3.896735769022014e-05, + "loss": 1.6023, + "step": 7769 + }, + { + "epoch": 0.5848811607294078, + "grad_norm": 8.17022705078125, + "learning_rate": 3.895546760320212e-05, + "loss": 1.7257, + "step": 7770 + }, + { + "epoch": 0.5849564350100679, + "grad_norm": 6.265396595001221, + "learning_rate": 3.8943578172749826e-05, + "loss": 1.6589, + "step": 7771 + }, + { + "epoch": 0.5850317092907281, + "grad_norm": 6.709798812866211, + "learning_rate": 3.893168939957011e-05, + "loss": 2.0947, + "step": 7772 + }, + { + "epoch": 0.5851069835713882, + "grad_norm": 6.752137184143066, + "learning_rate": 3.8919801284369685e-05, + "loss": 1.6306, + "step": 7773 + }, + { + "epoch": 0.5851822578520484, + "grad_norm": 12.63714599609375, + "learning_rate": 3.8907913827855304e-05, + "loss": 2.0672, + "step": 7774 + }, + { + "epoch": 0.5852575321327086, + "grad_norm": 8.344486236572266, + "learning_rate": 3.8896027030733597e-05, + "loss": 1.4701, + "step": 7775 + }, + { + "epoch": 0.5853328064133687, + "grad_norm": 14.580890655517578, + "learning_rate": 3.888414089371124e-05, + "loss": 1.9305, + "step": 7776 + }, + { + "epoch": 0.5854080806940288, + "grad_norm": 10.79031753540039, + "learning_rate": 3.8872255417494795e-05, + "loss": 1.8953, + "step": 7777 + }, + { + "epoch": 0.5854833549746891, + "grad_norm": 5.265921115875244, + "learning_rate": 3.8860370602790844e-05, + "loss": 1.8284, + "step": 7778 + }, + { + "epoch": 0.5855586292553492, + "grad_norm": 4.571259021759033, + "learning_rate": 3.884848645030589e-05, + "loss": 1.6352, + "step": 7779 + }, + { + "epoch": 0.5856339035360093, + "grad_norm": 5.80585241317749, + "learning_rate": 3.8836602960746435e-05, + "loss": 1.6203, + "step": 7780 + }, + { + "epoch": 0.5857091778166695, + "grad_norm": 5.599833011627197, + "learning_rate": 3.882472013481889e-05, + "loss": 1.7617, + "step": 7781 + }, + { + "epoch": 0.5857844520973297, + "grad_norm": 8.281386375427246, + "learning_rate": 3.881283797322967e-05, + "loss": 1.9712, + "step": 7782 + }, + { + "epoch": 0.5858597263779898, + "grad_norm": 6.664051532745361, + "learning_rate": 3.880095647668512e-05, + "loss": 2.2647, + "step": 7783 + }, + { + "epoch": 0.58593500065865, + "grad_norm": 6.239869594573975, + "learning_rate": 3.878907564589158e-05, + "loss": 1.6236, + "step": 7784 + }, + { + "epoch": 0.5860102749393101, + "grad_norm": 4.133721828460693, + "learning_rate": 3.877719548155535e-05, + "loss": 2.0619, + "step": 7785 + }, + { + "epoch": 0.5860855492199702, + "grad_norm": 5.634904861450195, + "learning_rate": 3.876531598438262e-05, + "loss": 1.9969, + "step": 7786 + }, + { + "epoch": 0.5861608235006304, + "grad_norm": 8.61093807220459, + "learning_rate": 3.875343715507964e-05, + "loss": 2.0777, + "step": 7787 + }, + { + "epoch": 0.5862360977812906, + "grad_norm": 5.1009087562561035, + "learning_rate": 3.8741558994352536e-05, + "loss": 1.8357, + "step": 7788 + }, + { + "epoch": 0.5863113720619507, + "grad_norm": 8.159534454345703, + "learning_rate": 3.872968150290745e-05, + "loss": 1.8776, + "step": 7789 + }, + { + "epoch": 0.5863866463426108, + "grad_norm": 8.548595428466797, + "learning_rate": 3.8717804681450456e-05, + "loss": 1.8362, + "step": 7790 + }, + { + "epoch": 0.5864619206232711, + "grad_norm": 6.970361232757568, + "learning_rate": 3.870592853068761e-05, + "loss": 1.7311, + "step": 7791 + }, + { + "epoch": 0.5865371949039312, + "grad_norm": 6.835873126983643, + "learning_rate": 3.8694053051324895e-05, + "loss": 1.6037, + "step": 7792 + }, + { + "epoch": 0.5866124691845913, + "grad_norm": 8.510467529296875, + "learning_rate": 3.868217824406831e-05, + "loss": 1.5934, + "step": 7793 + }, + { + "epoch": 0.5866877434652515, + "grad_norm": 7.392192363739014, + "learning_rate": 3.867030410962372e-05, + "loss": 1.9757, + "step": 7794 + }, + { + "epoch": 0.5867630177459117, + "grad_norm": 5.656592845916748, + "learning_rate": 3.865843064869706e-05, + "loss": 1.8046, + "step": 7795 + }, + { + "epoch": 0.5868382920265718, + "grad_norm": 5.134749889373779, + "learning_rate": 3.864655786199415e-05, + "loss": 1.9017, + "step": 7796 + }, + { + "epoch": 0.586913566307232, + "grad_norm": 5.325922966003418, + "learning_rate": 3.863468575022081e-05, + "loss": 1.7018, + "step": 7797 + }, + { + "epoch": 0.5869888405878921, + "grad_norm": 4.971122741699219, + "learning_rate": 3.8622814314082775e-05, + "loss": 2.0384, + "step": 7798 + }, + { + "epoch": 0.5870641148685523, + "grad_norm": 4.277151584625244, + "learning_rate": 3.86109435542858e-05, + "loss": 1.6948, + "step": 7799 + }, + { + "epoch": 0.5871393891492125, + "grad_norm": 6.318285942077637, + "learning_rate": 3.859907347153554e-05, + "loss": 1.684, + "step": 7800 + }, + { + "epoch": 0.5872146634298726, + "grad_norm": 5.20002555847168, + "learning_rate": 3.858720406653764e-05, + "loss": 1.8195, + "step": 7801 + }, + { + "epoch": 0.5872899377105327, + "grad_norm": 7.05336856842041, + "learning_rate": 3.8575335339997736e-05, + "loss": 1.7054, + "step": 7802 + }, + { + "epoch": 0.587365211991193, + "grad_norm": 3.8951804637908936, + "learning_rate": 3.856346729262135e-05, + "loss": 2.0091, + "step": 7803 + }, + { + "epoch": 0.5874404862718531, + "grad_norm": 6.10189151763916, + "learning_rate": 3.855159992511403e-05, + "loss": 2.3809, + "step": 7804 + }, + { + "epoch": 0.5875157605525132, + "grad_norm": 7.5449910163879395, + "learning_rate": 3.853973323818123e-05, + "loss": 1.4777, + "step": 7805 + }, + { + "epoch": 0.5875910348331733, + "grad_norm": 4.103111743927002, + "learning_rate": 3.852786723252842e-05, + "loss": 1.6717, + "step": 7806 + }, + { + "epoch": 0.5876663091138336, + "grad_norm": 4.2307820320129395, + "learning_rate": 3.851600190886096e-05, + "loss": 1.6303, + "step": 7807 + }, + { + "epoch": 0.5877415833944937, + "grad_norm": 4.756314754486084, + "learning_rate": 3.850413726788428e-05, + "loss": 2.0387, + "step": 7808 + }, + { + "epoch": 0.5878168576751538, + "grad_norm": 7.391165733337402, + "learning_rate": 3.849227331030362e-05, + "loss": 1.6792, + "step": 7809 + }, + { + "epoch": 0.587892131955814, + "grad_norm": 5.620707035064697, + "learning_rate": 3.84804100368243e-05, + "loss": 1.9753, + "step": 7810 + }, + { + "epoch": 0.5879674062364741, + "grad_norm": 6.869284629821777, + "learning_rate": 3.8468547448151554e-05, + "loss": 1.4429, + "step": 7811 + }, + { + "epoch": 0.5880426805171343, + "grad_norm": 7.211230278015137, + "learning_rate": 3.845668554499057e-05, + "loss": 1.7411, + "step": 7812 + }, + { + "epoch": 0.5881179547977945, + "grad_norm": 6.20987606048584, + "learning_rate": 3.84448243280465e-05, + "loss": 1.6849, + "step": 7813 + }, + { + "epoch": 0.5881932290784546, + "grad_norm": 6.503393173217773, + "learning_rate": 3.84329637980245e-05, + "loss": 2.011, + "step": 7814 + }, + { + "epoch": 0.5882685033591147, + "grad_norm": 8.304048538208008, + "learning_rate": 3.8421103955629576e-05, + "loss": 1.9289, + "step": 7815 + }, + { + "epoch": 0.588343777639775, + "grad_norm": 6.682769298553467, + "learning_rate": 3.840924480156682e-05, + "loss": 1.7291, + "step": 7816 + }, + { + "epoch": 0.5884190519204351, + "grad_norm": 5.163976669311523, + "learning_rate": 3.839738633654119e-05, + "loss": 1.718, + "step": 7817 + }, + { + "epoch": 0.5884943262010952, + "grad_norm": 7.390124797821045, + "learning_rate": 3.838552856125764e-05, + "loss": 2.0468, + "step": 7818 + }, + { + "epoch": 0.5885696004817554, + "grad_norm": 5.315319061279297, + "learning_rate": 3.8373671476421124e-05, + "loss": 2.1188, + "step": 7819 + }, + { + "epoch": 0.5886448747624156, + "grad_norm": 7.298669815063477, + "learning_rate": 3.836181508273646e-05, + "loss": 1.693, + "step": 7820 + }, + { + "epoch": 0.5887201490430757, + "grad_norm": 5.967959403991699, + "learning_rate": 3.834995938090851e-05, + "loss": 1.7822, + "step": 7821 + }, + { + "epoch": 0.5887954233237359, + "grad_norm": 6.1331706047058105, + "learning_rate": 3.8338104371642023e-05, + "loss": 1.6159, + "step": 7822 + }, + { + "epoch": 0.588870697604396, + "grad_norm": 4.626715183258057, + "learning_rate": 3.8326250055641787e-05, + "loss": 1.6983, + "step": 7823 + }, + { + "epoch": 0.5889459718850562, + "grad_norm": 4.874824523925781, + "learning_rate": 3.831439643361248e-05, + "loss": 2.0086, + "step": 7824 + }, + { + "epoch": 0.5890212461657164, + "grad_norm": 6.665626525878906, + "learning_rate": 3.830254350625879e-05, + "loss": 1.762, + "step": 7825 + }, + { + "epoch": 0.5890965204463765, + "grad_norm": 6.5086445808410645, + "learning_rate": 3.829069127428531e-05, + "loss": 2.0195, + "step": 7826 + }, + { + "epoch": 0.5891717947270366, + "grad_norm": 8.164224624633789, + "learning_rate": 3.8278839738396645e-05, + "loss": 1.8401, + "step": 7827 + }, + { + "epoch": 0.5892470690076967, + "grad_norm": 7.549623966217041, + "learning_rate": 3.826698889929731e-05, + "loss": 2.1557, + "step": 7828 + }, + { + "epoch": 0.589322343288357, + "grad_norm": 9.807738304138184, + "learning_rate": 3.8255138757691845e-05, + "loss": 1.9996, + "step": 7829 + }, + { + "epoch": 0.5893976175690171, + "grad_norm": 8.33513355255127, + "learning_rate": 3.824328931428466e-05, + "loss": 1.7231, + "step": 7830 + }, + { + "epoch": 0.5894728918496772, + "grad_norm": 6.582488059997559, + "learning_rate": 3.823144056978021e-05, + "loss": 1.8628, + "step": 7831 + }, + { + "epoch": 0.5895481661303374, + "grad_norm": 6.319664478302002, + "learning_rate": 3.821959252488283e-05, + "loss": 2.105, + "step": 7832 + }, + { + "epoch": 0.5896234404109976, + "grad_norm": 7.326380729675293, + "learning_rate": 3.820774518029687e-05, + "loss": 1.5313, + "step": 7833 + }, + { + "epoch": 0.5896987146916577, + "grad_norm": 8.140748023986816, + "learning_rate": 3.819589853672664e-05, + "loss": 2.3046, + "step": 7834 + }, + { + "epoch": 0.5897739889723179, + "grad_norm": 4.614751815795898, + "learning_rate": 3.818405259487636e-05, + "loss": 1.4783, + "step": 7835 + }, + { + "epoch": 0.589849263252978, + "grad_norm": 5.540757179260254, + "learning_rate": 3.817220735545027e-05, + "loss": 1.6479, + "step": 7836 + }, + { + "epoch": 0.5899245375336382, + "grad_norm": 4.285771369934082, + "learning_rate": 3.8160362819152496e-05, + "loss": 1.6789, + "step": 7837 + }, + { + "epoch": 0.5899998118142984, + "grad_norm": 4.895909309387207, + "learning_rate": 3.81485189866872e-05, + "loss": 1.902, + "step": 7838 + }, + { + "epoch": 0.5900750860949585, + "grad_norm": 6.41829776763916, + "learning_rate": 3.813667585875842e-05, + "loss": 2.0968, + "step": 7839 + }, + { + "epoch": 0.5901503603756186, + "grad_norm": 4.605316638946533, + "learning_rate": 3.812483343607023e-05, + "loss": 1.5405, + "step": 7840 + }, + { + "epoch": 0.5902256346562789, + "grad_norm": 6.022290229797363, + "learning_rate": 3.811299171932661e-05, + "loss": 1.9434, + "step": 7841 + }, + { + "epoch": 0.590300908936939, + "grad_norm": 9.94079875946045, + "learning_rate": 3.8101150709231546e-05, + "loss": 1.5875, + "step": 7842 + }, + { + "epoch": 0.5903761832175991, + "grad_norm": 9.434564590454102, + "learning_rate": 3.808931040648892e-05, + "loss": 1.8614, + "step": 7843 + }, + { + "epoch": 0.5904514574982593, + "grad_norm": 6.522921562194824, + "learning_rate": 3.807747081180261e-05, + "loss": 1.6028, + "step": 7844 + }, + { + "epoch": 0.5905267317789195, + "grad_norm": 4.857781887054443, + "learning_rate": 3.806563192587645e-05, + "loss": 1.6105, + "step": 7845 + }, + { + "epoch": 0.5906020060595796, + "grad_norm": 6.424694538116455, + "learning_rate": 3.805379374941425e-05, + "loss": 2.3181, + "step": 7846 + }, + { + "epoch": 0.5906772803402397, + "grad_norm": 7.388171672821045, + "learning_rate": 3.80419562831197e-05, + "loss": 1.6591, + "step": 7847 + }, + { + "epoch": 0.5907525546208999, + "grad_norm": 5.525968074798584, + "learning_rate": 3.803011952769657e-05, + "loss": 1.7299, + "step": 7848 + }, + { + "epoch": 0.59082782890156, + "grad_norm": 5.514792442321777, + "learning_rate": 3.8018283483848464e-05, + "loss": 1.8752, + "step": 7849 + }, + { + "epoch": 0.5909031031822202, + "grad_norm": 5.844470500946045, + "learning_rate": 3.800644815227903e-05, + "loss": 1.9748, + "step": 7850 + }, + { + "epoch": 0.5909783774628804, + "grad_norm": 5.275660991668701, + "learning_rate": 3.7994613533691844e-05, + "loss": 1.7545, + "step": 7851 + }, + { + "epoch": 0.5910536517435405, + "grad_norm": 8.844895362854004, + "learning_rate": 3.798277962879043e-05, + "loss": 1.7559, + "step": 7852 + }, + { + "epoch": 0.5911289260242006, + "grad_norm": 5.269638538360596, + "learning_rate": 3.7970946438278305e-05, + "loss": 1.397, + "step": 7853 + }, + { + "epoch": 0.5912042003048609, + "grad_norm": 6.199509143829346, + "learning_rate": 3.795911396285888e-05, + "loss": 2.1492, + "step": 7854 + }, + { + "epoch": 0.591279474585521, + "grad_norm": 4.309825897216797, + "learning_rate": 3.794728220323559e-05, + "loss": 1.8073, + "step": 7855 + }, + { + "epoch": 0.5913547488661811, + "grad_norm": 7.432586193084717, + "learning_rate": 3.7935451160111774e-05, + "loss": 2.2177, + "step": 7856 + }, + { + "epoch": 0.5914300231468413, + "grad_norm": 40.442604064941406, + "learning_rate": 3.79236208341908e-05, + "loss": 1.9635, + "step": 7857 + }, + { + "epoch": 0.5915052974275015, + "grad_norm": 6.739380359649658, + "learning_rate": 3.791179122617588e-05, + "loss": 1.7764, + "step": 7858 + }, + { + "epoch": 0.5915805717081616, + "grad_norm": 5.147418975830078, + "learning_rate": 3.789996233677032e-05, + "loss": 1.7619, + "step": 7859 + }, + { + "epoch": 0.5916558459888218, + "grad_norm": 5.606623649597168, + "learning_rate": 3.788813416667727e-05, + "loss": 1.701, + "step": 7860 + }, + { + "epoch": 0.5917311202694819, + "grad_norm": 5.424412727355957, + "learning_rate": 3.787630671659989e-05, + "loss": 1.771, + "step": 7861 + }, + { + "epoch": 0.5918063945501421, + "grad_norm": 7.910769462585449, + "learning_rate": 3.7864479987241266e-05, + "loss": 1.785, + "step": 7862 + }, + { + "epoch": 0.5918816688308023, + "grad_norm": 6.933977127075195, + "learning_rate": 3.785265397930452e-05, + "loss": 1.9439, + "step": 7863 + }, + { + "epoch": 0.5919569431114624, + "grad_norm": 9.293656349182129, + "learning_rate": 3.784082869349262e-05, + "loss": 1.8421, + "step": 7864 + }, + { + "epoch": 0.5920322173921225, + "grad_norm": 5.7107744216918945, + "learning_rate": 3.7829004130508564e-05, + "loss": 2.0165, + "step": 7865 + }, + { + "epoch": 0.5921074916727826, + "grad_norm": 9.340198516845703, + "learning_rate": 3.781718029105531e-05, + "loss": 1.7173, + "step": 7866 + }, + { + "epoch": 0.5921827659534429, + "grad_norm": 5.240962982177734, + "learning_rate": 3.780535717583571e-05, + "loss": 1.904, + "step": 7867 + }, + { + "epoch": 0.592258040234103, + "grad_norm": 7.411725044250488, + "learning_rate": 3.779353478555265e-05, + "loss": 1.6533, + "step": 7868 + }, + { + "epoch": 0.5923333145147631, + "grad_norm": 6.767391681671143, + "learning_rate": 3.7781713120908905e-05, + "loss": 1.8388, + "step": 7869 + }, + { + "epoch": 0.5924085887954234, + "grad_norm": 4.411859035491943, + "learning_rate": 3.776989218260728e-05, + "loss": 1.8294, + "step": 7870 + }, + { + "epoch": 0.5924838630760835, + "grad_norm": 8.716769218444824, + "learning_rate": 3.7758071971350456e-05, + "loss": 1.682, + "step": 7871 + }, + { + "epoch": 0.5925591373567436, + "grad_norm": 11.558558464050293, + "learning_rate": 3.7746252487841135e-05, + "loss": 1.7559, + "step": 7872 + }, + { + "epoch": 0.5926344116374038, + "grad_norm": 5.734289169311523, + "learning_rate": 3.773443373278193e-05, + "loss": 1.7234, + "step": 7873 + }, + { + "epoch": 0.5927096859180639, + "grad_norm": 5.654652118682861, + "learning_rate": 3.7722615706875474e-05, + "loss": 1.7824, + "step": 7874 + }, + { + "epoch": 0.5927849601987241, + "grad_norm": 5.810328483581543, + "learning_rate": 3.771079841082426e-05, + "loss": 1.8483, + "step": 7875 + }, + { + "epoch": 0.5928602344793843, + "grad_norm": 5.487584114074707, + "learning_rate": 3.769898184533084e-05, + "loss": 1.931, + "step": 7876 + }, + { + "epoch": 0.5929355087600444, + "grad_norm": 4.513000011444092, + "learning_rate": 3.768716601109764e-05, + "loss": 1.4377, + "step": 7877 + }, + { + "epoch": 0.5930107830407045, + "grad_norm": 16.523670196533203, + "learning_rate": 3.767535090882709e-05, + "loss": 1.9914, + "step": 7878 + }, + { + "epoch": 0.5930860573213648, + "grad_norm": 7.6347784996032715, + "learning_rate": 3.766353653922156e-05, + "loss": 1.7766, + "step": 7879 + }, + { + "epoch": 0.5931613316020249, + "grad_norm": 6.243397235870361, + "learning_rate": 3.765172290298341e-05, + "loss": 1.5956, + "step": 7880 + }, + { + "epoch": 0.593236605882685, + "grad_norm": 4.8335747718811035, + "learning_rate": 3.763991000081488e-05, + "loss": 1.3649, + "step": 7881 + }, + { + "epoch": 0.5933118801633452, + "grad_norm": 7.388392448425293, + "learning_rate": 3.762809783341823e-05, + "loss": 1.9113, + "step": 7882 + }, + { + "epoch": 0.5933871544440054, + "grad_norm": 6.164877891540527, + "learning_rate": 3.761628640149567e-05, + "loss": 1.8034, + "step": 7883 + }, + { + "epoch": 0.5934624287246655, + "grad_norm": 6.600752353668213, + "learning_rate": 3.7604475705749356e-05, + "loss": 1.6975, + "step": 7884 + }, + { + "epoch": 0.5935377030053256, + "grad_norm": 5.02759313583374, + "learning_rate": 3.7592665746881406e-05, + "loss": 1.6081, + "step": 7885 + }, + { + "epoch": 0.5936129772859858, + "grad_norm": 5.703899383544922, + "learning_rate": 3.758085652559386e-05, + "loss": 1.9267, + "step": 7886 + }, + { + "epoch": 0.593688251566646, + "grad_norm": 6.274411201477051, + "learning_rate": 3.756904804258878e-05, + "loss": 2.2004, + "step": 7887 + }, + { + "epoch": 0.5937635258473061, + "grad_norm": 5.550094127655029, + "learning_rate": 3.75572402985681e-05, + "loss": 1.7734, + "step": 7888 + }, + { + "epoch": 0.5938388001279663, + "grad_norm": 5.130786895751953, + "learning_rate": 3.754543329423379e-05, + "loss": 1.775, + "step": 7889 + }, + { + "epoch": 0.5939140744086264, + "grad_norm": 15.95766830444336, + "learning_rate": 3.753362703028773e-05, + "loss": 2.0867, + "step": 7890 + }, + { + "epoch": 0.5939893486892865, + "grad_norm": 7.361571788787842, + "learning_rate": 3.75218215074318e-05, + "loss": 1.6491, + "step": 7891 + }, + { + "epoch": 0.5940646229699468, + "grad_norm": 5.177577972412109, + "learning_rate": 3.7510016726367746e-05, + "loss": 1.7267, + "step": 7892 + }, + { + "epoch": 0.5941398972506069, + "grad_norm": 6.116597652435303, + "learning_rate": 3.7498212687797386e-05, + "loss": 1.7385, + "step": 7893 + }, + { + "epoch": 0.594215171531267, + "grad_norm": 8.254535675048828, + "learning_rate": 3.748640939242238e-05, + "loss": 2.0837, + "step": 7894 + }, + { + "epoch": 0.5942904458119272, + "grad_norm": 16.961008071899414, + "learning_rate": 3.7474606840944446e-05, + "loss": 1.7142, + "step": 7895 + }, + { + "epoch": 0.5943657200925874, + "grad_norm": 5.936086654663086, + "learning_rate": 3.7462805034065195e-05, + "loss": 1.815, + "step": 7896 + }, + { + "epoch": 0.5944409943732475, + "grad_norm": 12.696560859680176, + "learning_rate": 3.74510039724862e-05, + "loss": 2.0108, + "step": 7897 + }, + { + "epoch": 0.5945162686539077, + "grad_norm": 5.704233646392822, + "learning_rate": 3.743920365690904e-05, + "loss": 2.0651, + "step": 7898 + }, + { + "epoch": 0.5945915429345678, + "grad_norm": 4.409765720367432, + "learning_rate": 3.7427404088035154e-05, + "loss": 1.6282, + "step": 7899 + }, + { + "epoch": 0.594666817215228, + "grad_norm": 5.390868663787842, + "learning_rate": 3.7415605266566034e-05, + "loss": 1.881, + "step": 7900 + }, + { + "epoch": 0.5947420914958882, + "grad_norm": 16.65022850036621, + "learning_rate": 3.740380719320307e-05, + "loss": 2.0287, + "step": 7901 + }, + { + "epoch": 0.5948173657765483, + "grad_norm": 5.610833168029785, + "learning_rate": 3.739200986864763e-05, + "loss": 2.2095, + "step": 7902 + }, + { + "epoch": 0.5948926400572084, + "grad_norm": 6.843104839324951, + "learning_rate": 3.738021329360102e-05, + "loss": 2.1499, + "step": 7903 + }, + { + "epoch": 0.5949679143378687, + "grad_norm": 13.994860649108887, + "learning_rate": 3.7368417468764526e-05, + "loss": 1.8148, + "step": 7904 + }, + { + "epoch": 0.5950431886185288, + "grad_norm": 7.777004241943359, + "learning_rate": 3.735662239483936e-05, + "loss": 1.9148, + "step": 7905 + }, + { + "epoch": 0.5951184628991889, + "grad_norm": 6.914122104644775, + "learning_rate": 3.734482807252673e-05, + "loss": 1.8312, + "step": 7906 + }, + { + "epoch": 0.595193737179849, + "grad_norm": 5.754234790802002, + "learning_rate": 3.733303450252773e-05, + "loss": 1.5458, + "step": 7907 + }, + { + "epoch": 0.5952690114605093, + "grad_norm": 5.34912109375, + "learning_rate": 3.732124168554352e-05, + "loss": 1.6264, + "step": 7908 + }, + { + "epoch": 0.5953442857411694, + "grad_norm": 5.44113302230835, + "learning_rate": 3.7309449622275085e-05, + "loss": 2.5788, + "step": 7909 + }, + { + "epoch": 0.5954195600218295, + "grad_norm": 4.614048957824707, + "learning_rate": 3.729765831342347e-05, + "loss": 2.1583, + "step": 7910 + }, + { + "epoch": 0.5954948343024897, + "grad_norm": 6.298909664154053, + "learning_rate": 3.7285867759689605e-05, + "loss": 1.7276, + "step": 7911 + }, + { + "epoch": 0.5955701085831498, + "grad_norm": 5.208887577056885, + "learning_rate": 3.727407796177445e-05, + "loss": 1.8435, + "step": 7912 + }, + { + "epoch": 0.59564538286381, + "grad_norm": 5.651218891143799, + "learning_rate": 3.7262288920378815e-05, + "loss": 1.6454, + "step": 7913 + }, + { + "epoch": 0.5957206571444702, + "grad_norm": 5.447145938873291, + "learning_rate": 3.7250500636203565e-05, + "loss": 1.8268, + "step": 7914 + }, + { + "epoch": 0.5957959314251303, + "grad_norm": 4.8585991859436035, + "learning_rate": 3.7238713109949486e-05, + "loss": 1.5718, + "step": 7915 + }, + { + "epoch": 0.5958712057057904, + "grad_norm": 5.589688777923584, + "learning_rate": 3.722692634231728e-05, + "loss": 2.0318, + "step": 7916 + }, + { + "epoch": 0.5959464799864507, + "grad_norm": 5.98341703414917, + "learning_rate": 3.721514033400766e-05, + "loss": 1.8352, + "step": 7917 + }, + { + "epoch": 0.5960217542671108, + "grad_norm": 3.825155258178711, + "learning_rate": 3.7203355085721246e-05, + "loss": 1.7144, + "step": 7918 + }, + { + "epoch": 0.5960970285477709, + "grad_norm": 6.559853553771973, + "learning_rate": 3.7191570598158685e-05, + "loss": 1.435, + "step": 7919 + }, + { + "epoch": 0.5961723028284311, + "grad_norm": 6.952791213989258, + "learning_rate": 3.7179786872020474e-05, + "loss": 1.8183, + "step": 7920 + }, + { + "epoch": 0.5962475771090913, + "grad_norm": 4.6901726722717285, + "learning_rate": 3.716800390800718e-05, + "loss": 1.6564, + "step": 7921 + }, + { + "epoch": 0.5963228513897514, + "grad_norm": 5.896546363830566, + "learning_rate": 3.71562217068192e-05, + "loss": 1.8689, + "step": 7922 + }, + { + "epoch": 0.5963981256704116, + "grad_norm": 10.33854866027832, + "learning_rate": 3.7144440269157e-05, + "loss": 2.0965, + "step": 7923 + }, + { + "epoch": 0.5964733999510717, + "grad_norm": 4.577197551727295, + "learning_rate": 3.7132659595720934e-05, + "loss": 2.1768, + "step": 7924 + }, + { + "epoch": 0.5965486742317319, + "grad_norm": 8.719362258911133, + "learning_rate": 3.712087968721135e-05, + "loss": 1.9673, + "step": 7925 + }, + { + "epoch": 0.596623948512392, + "grad_norm": 4.364465713500977, + "learning_rate": 3.71091005443285e-05, + "loss": 1.3295, + "step": 7926 + }, + { + "epoch": 0.5966992227930522, + "grad_norm": 5.006140232086182, + "learning_rate": 3.709732216777264e-05, + "loss": 1.8035, + "step": 7927 + }, + { + "epoch": 0.5967744970737123, + "grad_norm": 11.312521934509277, + "learning_rate": 3.708554455824394e-05, + "loss": 2.0853, + "step": 7928 + }, + { + "epoch": 0.5968497713543724, + "grad_norm": 5.651864051818848, + "learning_rate": 3.7073767716442565e-05, + "loss": 1.6085, + "step": 7929 + }, + { + "epoch": 0.5969250456350327, + "grad_norm": 6.8333563804626465, + "learning_rate": 3.706199164306863e-05, + "loss": 1.9544, + "step": 7930 + }, + { + "epoch": 0.5970003199156928, + "grad_norm": 9.202512741088867, + "learning_rate": 3.7050216338822154e-05, + "loss": 1.9953, + "step": 7931 + }, + { + "epoch": 0.5970755941963529, + "grad_norm": 4.917809009552002, + "learning_rate": 3.703844180440317e-05, + "loss": 1.7483, + "step": 7932 + }, + { + "epoch": 0.5971508684770132, + "grad_norm": 7.653840065002441, + "learning_rate": 3.7026668040511616e-05, + "loss": 1.8228, + "step": 7933 + }, + { + "epoch": 0.5972261427576733, + "grad_norm": 8.099530220031738, + "learning_rate": 3.701489504784743e-05, + "loss": 1.7004, + "step": 7934 + }, + { + "epoch": 0.5973014170383334, + "grad_norm": 5.8279595375061035, + "learning_rate": 3.7003122827110466e-05, + "loss": 1.701, + "step": 7935 + }, + { + "epoch": 0.5973766913189936, + "grad_norm": 4.222233772277832, + "learning_rate": 3.6991351379000575e-05, + "loss": 1.6755, + "step": 7936 + }, + { + "epoch": 0.5974519655996537, + "grad_norm": 5.0771379470825195, + "learning_rate": 3.69795807042175e-05, + "loss": 1.6316, + "step": 7937 + }, + { + "epoch": 0.5975272398803139, + "grad_norm": 4.183101177215576, + "learning_rate": 3.6967810803461006e-05, + "loss": 1.9059, + "step": 7938 + }, + { + "epoch": 0.5976025141609741, + "grad_norm": 5.993053913116455, + "learning_rate": 3.695604167743076e-05, + "loss": 1.9763, + "step": 7939 + }, + { + "epoch": 0.5976777884416342, + "grad_norm": 4.631672382354736, + "learning_rate": 3.694427332682642e-05, + "loss": 1.6653, + "step": 7940 + }, + { + "epoch": 0.5977530627222943, + "grad_norm": 4.304272651672363, + "learning_rate": 3.6932505752347545e-05, + "loss": 2.0, + "step": 7941 + }, + { + "epoch": 0.5978283370029546, + "grad_norm": 6.853264808654785, + "learning_rate": 3.692073895469373e-05, + "loss": 1.6307, + "step": 7942 + }, + { + "epoch": 0.5979036112836147, + "grad_norm": 6.397177219390869, + "learning_rate": 3.690897293456444e-05, + "loss": 1.7743, + "step": 7943 + }, + { + "epoch": 0.5979788855642748, + "grad_norm": 6.35953950881958, + "learning_rate": 3.689720769265916e-05, + "loss": 2.0189, + "step": 7944 + }, + { + "epoch": 0.5980541598449349, + "grad_norm": 5.3479413986206055, + "learning_rate": 3.688544322967728e-05, + "loss": 1.6231, + "step": 7945 + }, + { + "epoch": 0.5981294341255952, + "grad_norm": 5.162478923797607, + "learning_rate": 3.687367954631816e-05, + "loss": 2.0133, + "step": 7946 + }, + { + "epoch": 0.5982047084062553, + "grad_norm": 5.8216071128845215, + "learning_rate": 3.686191664328116e-05, + "loss": 1.8014, + "step": 7947 + }, + { + "epoch": 0.5982799826869154, + "grad_norm": 4.807302474975586, + "learning_rate": 3.685015452126548e-05, + "loss": 1.4395, + "step": 7948 + }, + { + "epoch": 0.5983552569675756, + "grad_norm": 4.11058235168457, + "learning_rate": 3.6838393180970416e-05, + "loss": 1.4983, + "step": 7949 + }, + { + "epoch": 0.5984305312482358, + "grad_norm": 4.955685615539551, + "learning_rate": 3.682663262309509e-05, + "loss": 1.8072, + "step": 7950 + }, + { + "epoch": 0.5985058055288959, + "grad_norm": 6.777763843536377, + "learning_rate": 3.681487284833867e-05, + "loss": 1.3496, + "step": 7951 + }, + { + "epoch": 0.5985810798095561, + "grad_norm": 6.210040092468262, + "learning_rate": 3.6803113857400206e-05, + "loss": 1.5608, + "step": 7952 + }, + { + "epoch": 0.5986563540902162, + "grad_norm": 4.810294151306152, + "learning_rate": 3.679135565097878e-05, + "loss": 1.8268, + "step": 7953 + }, + { + "epoch": 0.5987316283708763, + "grad_norm": 7.854957580566406, + "learning_rate": 3.6779598229773344e-05, + "loss": 2.4505, + "step": 7954 + }, + { + "epoch": 0.5988069026515366, + "grad_norm": 17.12396812438965, + "learning_rate": 3.676784159448287e-05, + "loss": 1.7445, + "step": 7955 + }, + { + "epoch": 0.5988821769321967, + "grad_norm": 5.653016567230225, + "learning_rate": 3.675608574580623e-05, + "loss": 1.9367, + "step": 7956 + }, + { + "epoch": 0.5989574512128568, + "grad_norm": 9.522991180419922, + "learning_rate": 3.674433068444232e-05, + "loss": 1.8378, + "step": 7957 + }, + { + "epoch": 0.599032725493517, + "grad_norm": 4.22638463973999, + "learning_rate": 3.673257641108989e-05, + "loss": 1.5451, + "step": 7958 + }, + { + "epoch": 0.5991079997741772, + "grad_norm": 6.512850284576416, + "learning_rate": 3.672082292644775e-05, + "loss": 1.5608, + "step": 7959 + }, + { + "epoch": 0.5991832740548373, + "grad_norm": 4.190865993499756, + "learning_rate": 3.670907023121456e-05, + "loss": 1.9132, + "step": 7960 + }, + { + "epoch": 0.5992585483354975, + "grad_norm": 6.298935413360596, + "learning_rate": 3.669731832608902e-05, + "loss": 1.6428, + "step": 7961 + }, + { + "epoch": 0.5993338226161576, + "grad_norm": 5.427552223205566, + "learning_rate": 3.6685567211769726e-05, + "loss": 2.2609, + "step": 7962 + }, + { + "epoch": 0.5994090968968178, + "grad_norm": 7.042741775512695, + "learning_rate": 3.6673816888955256e-05, + "loss": 1.8596, + "step": 7963 + }, + { + "epoch": 0.5994843711774779, + "grad_norm": 5.713334083557129, + "learning_rate": 3.666206735834415e-05, + "loss": 1.8944, + "step": 7964 + }, + { + "epoch": 0.5995596454581381, + "grad_norm": 4.512503147125244, + "learning_rate": 3.6650318620634864e-05, + "loss": 1.8201, + "step": 7965 + }, + { + "epoch": 0.5996349197387982, + "grad_norm": 5.618650913238525, + "learning_rate": 3.663857067652584e-05, + "loss": 1.9081, + "step": 7966 + }, + { + "epoch": 0.5997101940194584, + "grad_norm": 8.329724311828613, + "learning_rate": 3.662682352671543e-05, + "loss": 1.4886, + "step": 7967 + }, + { + "epoch": 0.5997854683001186, + "grad_norm": 4.864678859710693, + "learning_rate": 3.6615077171902e-05, + "loss": 1.8321, + "step": 7968 + }, + { + "epoch": 0.5998607425807787, + "grad_norm": 14.594117164611816, + "learning_rate": 3.660333161278382e-05, + "loss": 1.7641, + "step": 7969 + }, + { + "epoch": 0.5999360168614388, + "grad_norm": 5.460437774658203, + "learning_rate": 3.659158685005916e-05, + "loss": 1.9168, + "step": 7970 + }, + { + "epoch": 0.6000112911420991, + "grad_norm": 6.388253688812256, + "learning_rate": 3.6579842884426165e-05, + "loss": 2.1474, + "step": 7971 + }, + { + "epoch": 0.6000865654227592, + "grad_norm": 17.14667510986328, + "learning_rate": 3.656809971658301e-05, + "loss": 1.8553, + "step": 7972 + }, + { + "epoch": 0.6001618397034193, + "grad_norm": 5.064866065979004, + "learning_rate": 3.655635734722778e-05, + "loss": 1.4612, + "step": 7973 + }, + { + "epoch": 0.6002371139840795, + "grad_norm": 4.627748012542725, + "learning_rate": 3.654461577705855e-05, + "loss": 1.8051, + "step": 7974 + }, + { + "epoch": 0.6003123882647396, + "grad_norm": 5.306166172027588, + "learning_rate": 3.6532875006773284e-05, + "loss": 1.6079, + "step": 7975 + }, + { + "epoch": 0.6003876625453998, + "grad_norm": 6.5008440017700195, + "learning_rate": 3.652113503706998e-05, + "loss": 2.0493, + "step": 7976 + }, + { + "epoch": 0.60046293682606, + "grad_norm": 9.011292457580566, + "learning_rate": 3.6509395868646503e-05, + "loss": 1.7143, + "step": 7977 + }, + { + "epoch": 0.6005382111067201, + "grad_norm": 7.110915660858154, + "learning_rate": 3.649765750220073e-05, + "loss": 1.807, + "step": 7978 + }, + { + "epoch": 0.6006134853873802, + "grad_norm": 7.633163928985596, + "learning_rate": 3.648591993843049e-05, + "loss": 2.1113, + "step": 7979 + }, + { + "epoch": 0.6006887596680405, + "grad_norm": 4.713283061981201, + "learning_rate": 3.6474183178033516e-05, + "loss": 1.6939, + "step": 7980 + }, + { + "epoch": 0.6007640339487006, + "grad_norm": 4.894254684448242, + "learning_rate": 3.646244722170756e-05, + "loss": 1.8362, + "step": 7981 + }, + { + "epoch": 0.6008393082293607, + "grad_norm": 10.824357032775879, + "learning_rate": 3.645071207015026e-05, + "loss": 2.0984, + "step": 7982 + }, + { + "epoch": 0.6009145825100208, + "grad_norm": 9.78145694732666, + "learning_rate": 3.6438977724059266e-05, + "loss": 1.8573, + "step": 7983 + }, + { + "epoch": 0.6009898567906811, + "grad_norm": 4.926133155822754, + "learning_rate": 3.642724418413211e-05, + "loss": 1.617, + "step": 7984 + }, + { + "epoch": 0.6010651310713412, + "grad_norm": 23.463472366333008, + "learning_rate": 3.641551145106638e-05, + "loss": 1.5609, + "step": 7985 + }, + { + "epoch": 0.6011404053520013, + "grad_norm": 6.883307933807373, + "learning_rate": 3.640377952555949e-05, + "loss": 2.0238, + "step": 7986 + }, + { + "epoch": 0.6012156796326615, + "grad_norm": 5.299291133880615, + "learning_rate": 3.639204840830891e-05, + "loss": 1.9907, + "step": 7987 + }, + { + "epoch": 0.6012909539133217, + "grad_norm": 5.825193405151367, + "learning_rate": 3.638031810001199e-05, + "loss": 1.8336, + "step": 7988 + }, + { + "epoch": 0.6013662281939818, + "grad_norm": 5.397491931915283, + "learning_rate": 3.636858860136609e-05, + "loss": 2.0342, + "step": 7989 + }, + { + "epoch": 0.601441502474642, + "grad_norm": 6.165040016174316, + "learning_rate": 3.6356859913068475e-05, + "loss": 2.0, + "step": 7990 + }, + { + "epoch": 0.6015167767553021, + "grad_norm": 7.328851222991943, + "learning_rate": 3.634513203581641e-05, + "loss": 1.5466, + "step": 7991 + }, + { + "epoch": 0.6015920510359622, + "grad_norm": 5.763233184814453, + "learning_rate": 3.6333404970307044e-05, + "loss": 1.754, + "step": 7992 + }, + { + "epoch": 0.6016673253166225, + "grad_norm": 5.853240489959717, + "learning_rate": 3.6321678717237564e-05, + "loss": 1.966, + "step": 7993 + }, + { + "epoch": 0.6017425995972826, + "grad_norm": 5.526442527770996, + "learning_rate": 3.630995327730501e-05, + "loss": 2.0525, + "step": 7994 + }, + { + "epoch": 0.6018178738779427, + "grad_norm": 5.30607795715332, + "learning_rate": 3.629822865120646e-05, + "loss": 1.8564, + "step": 7995 + }, + { + "epoch": 0.601893148158603, + "grad_norm": 4.887218952178955, + "learning_rate": 3.62865048396389e-05, + "loss": 1.7685, + "step": 7996 + }, + { + "epoch": 0.6019684224392631, + "grad_norm": 6.019776821136475, + "learning_rate": 3.6274781843299275e-05, + "loss": 2.0926, + "step": 7997 + }, + { + "epoch": 0.6020436967199232, + "grad_norm": 7.026263236999512, + "learning_rate": 3.626305966288451e-05, + "loss": 1.8379, + "step": 7998 + }, + { + "epoch": 0.6021189710005834, + "grad_norm": 11.796557426452637, + "learning_rate": 3.625133829909141e-05, + "loss": 1.9731, + "step": 7999 + }, + { + "epoch": 0.6021942452812435, + "grad_norm": 7.872086048126221, + "learning_rate": 3.623961775261682e-05, + "loss": 1.5359, + "step": 8000 + }, + { + "epoch": 0.6022695195619037, + "grad_norm": 4.217902183532715, + "learning_rate": 3.622789802415745e-05, + "loss": 1.7633, + "step": 8001 + }, + { + "epoch": 0.6023447938425639, + "grad_norm": 6.429264068603516, + "learning_rate": 3.6216179114410044e-05, + "loss": 1.7467, + "step": 8002 + }, + { + "epoch": 0.602420068123224, + "grad_norm": 7.515525817871094, + "learning_rate": 3.620446102407122e-05, + "loss": 1.7931, + "step": 8003 + }, + { + "epoch": 0.6024953424038841, + "grad_norm": 4.938969612121582, + "learning_rate": 3.619274375383763e-05, + "loss": 1.8082, + "step": 8004 + }, + { + "epoch": 0.6025706166845443, + "grad_norm": 4.357238292694092, + "learning_rate": 3.618102730440578e-05, + "loss": 1.7753, + "step": 8005 + }, + { + "epoch": 0.6026458909652045, + "grad_norm": 4.71003532409668, + "learning_rate": 3.616931167647222e-05, + "loss": 1.6215, + "step": 8006 + }, + { + "epoch": 0.6027211652458646, + "grad_norm": 4.695444107055664, + "learning_rate": 3.615759687073339e-05, + "loss": 1.9408, + "step": 8007 + }, + { + "epoch": 0.6027964395265247, + "grad_norm": 4.16632604598999, + "learning_rate": 3.614588288788572e-05, + "loss": 1.5767, + "step": 8008 + }, + { + "epoch": 0.602871713807185, + "grad_norm": 7.27195405960083, + "learning_rate": 3.6134169728625546e-05, + "loss": 1.4922, + "step": 8009 + }, + { + "epoch": 0.6029469880878451, + "grad_norm": 6.277414321899414, + "learning_rate": 3.612245739364919e-05, + "loss": 2.1323, + "step": 8010 + }, + { + "epoch": 0.6030222623685052, + "grad_norm": 8.176581382751465, + "learning_rate": 3.611074588365294e-05, + "loss": 2.1511, + "step": 8011 + }, + { + "epoch": 0.6030975366491654, + "grad_norm": 6.605081081390381, + "learning_rate": 3.609903519933298e-05, + "loss": 1.5329, + "step": 8012 + }, + { + "epoch": 0.6031728109298256, + "grad_norm": 6.708236217498779, + "learning_rate": 3.6087325341385514e-05, + "loss": 1.6915, + "step": 8013 + }, + { + "epoch": 0.6032480852104857, + "grad_norm": 4.941124439239502, + "learning_rate": 3.607561631050661e-05, + "loss": 1.3294, + "step": 8014 + }, + { + "epoch": 0.6033233594911459, + "grad_norm": 4.750990867614746, + "learning_rate": 3.606390810739239e-05, + "loss": 1.8748, + "step": 8015 + }, + { + "epoch": 0.603398633771806, + "grad_norm": 4.168507099151611, + "learning_rate": 3.605220073273882e-05, + "loss": 1.6881, + "step": 8016 + }, + { + "epoch": 0.6034739080524661, + "grad_norm": 4.999948978424072, + "learning_rate": 3.604049418724191e-05, + "loss": 1.8046, + "step": 8017 + }, + { + "epoch": 0.6035491823331264, + "grad_norm": 6.029209613800049, + "learning_rate": 3.602878847159755e-05, + "loss": 1.9789, + "step": 8018 + }, + { + "epoch": 0.6036244566137865, + "grad_norm": 8.745284080505371, + "learning_rate": 3.601708358650166e-05, + "loss": 1.6268, + "step": 8019 + }, + { + "epoch": 0.6036997308944466, + "grad_norm": 51.208824157714844, + "learning_rate": 3.6005379532649996e-05, + "loss": 2.1388, + "step": 8020 + }, + { + "epoch": 0.6037750051751068, + "grad_norm": 5.057849884033203, + "learning_rate": 3.5993676310738396e-05, + "loss": 1.7145, + "step": 8021 + }, + { + "epoch": 0.603850279455767, + "grad_norm": 6.256768226623535, + "learning_rate": 3.598197392146252e-05, + "loss": 2.0481, + "step": 8022 + }, + { + "epoch": 0.6039255537364271, + "grad_norm": 6.3571648597717285, + "learning_rate": 3.597027236551809e-05, + "loss": 1.9657, + "step": 8023 + }, + { + "epoch": 0.6040008280170872, + "grad_norm": 5.093692779541016, + "learning_rate": 3.59585716436007e-05, + "loss": 1.9584, + "step": 8024 + }, + { + "epoch": 0.6040761022977474, + "grad_norm": 5.285855770111084, + "learning_rate": 3.594687175640595e-05, + "loss": 1.6453, + "step": 8025 + }, + { + "epoch": 0.6041513765784076, + "grad_norm": 5.955439567565918, + "learning_rate": 3.593517270462934e-05, + "loss": 2.0053, + "step": 8026 + }, + { + "epoch": 0.6042266508590677, + "grad_norm": 4.958759784698486, + "learning_rate": 3.592347448896635e-05, + "loss": 2.002, + "step": 8027 + }, + { + "epoch": 0.6043019251397279, + "grad_norm": 4.557804107666016, + "learning_rate": 3.591177711011242e-05, + "loss": 2.257, + "step": 8028 + }, + { + "epoch": 0.604377199420388, + "grad_norm": 5.139565944671631, + "learning_rate": 3.5900080568762916e-05, + "loss": 1.8749, + "step": 8029 + }, + { + "epoch": 0.6044524737010482, + "grad_norm": 5.143902778625488, + "learning_rate": 3.588838486561318e-05, + "loss": 1.7796, + "step": 8030 + }, + { + "epoch": 0.6045277479817084, + "grad_norm": 4.591400623321533, + "learning_rate": 3.5876690001358445e-05, + "loss": 1.5792, + "step": 8031 + }, + { + "epoch": 0.6046030222623685, + "grad_norm": 5.710268974304199, + "learning_rate": 3.5864995976694e-05, + "loss": 1.4823, + "step": 8032 + }, + { + "epoch": 0.6046782965430286, + "grad_norm": 5.014245510101318, + "learning_rate": 3.585330279231496e-05, + "loss": 1.6984, + "step": 8033 + }, + { + "epoch": 0.6047535708236889, + "grad_norm": 4.509344100952148, + "learning_rate": 3.5841610448916484e-05, + "loss": 1.5616, + "step": 8034 + }, + { + "epoch": 0.604828845104349, + "grad_norm": 4.033265590667725, + "learning_rate": 3.5829918947193634e-05, + "loss": 1.3959, + "step": 8035 + }, + { + "epoch": 0.6049041193850091, + "grad_norm": 6.812567234039307, + "learning_rate": 3.5818228287841466e-05, + "loss": 2.0578, + "step": 8036 + }, + { + "epoch": 0.6049793936656693, + "grad_norm": 5.77276086807251, + "learning_rate": 3.580653847155491e-05, + "loss": 1.7571, + "step": 8037 + }, + { + "epoch": 0.6050546679463294, + "grad_norm": 4.42315149307251, + "learning_rate": 3.579484949902893e-05, + "loss": 1.8428, + "step": 8038 + }, + { + "epoch": 0.6051299422269896, + "grad_norm": 6.85744571685791, + "learning_rate": 3.578316137095838e-05, + "loss": 1.7037, + "step": 8039 + }, + { + "epoch": 0.6052052165076498, + "grad_norm": 4.550130844116211, + "learning_rate": 3.5771474088038114e-05, + "loss": 1.7322, + "step": 8040 + }, + { + "epoch": 0.6052804907883099, + "grad_norm": 6.225490093231201, + "learning_rate": 3.575978765096286e-05, + "loss": 1.695, + "step": 8041 + }, + { + "epoch": 0.60535576506897, + "grad_norm": 6.0857319831848145, + "learning_rate": 3.5748102060427374e-05, + "loss": 1.9571, + "step": 8042 + }, + { + "epoch": 0.6054310393496302, + "grad_norm": 5.0007100105285645, + "learning_rate": 3.573641731712635e-05, + "loss": 1.5728, + "step": 8043 + }, + { + "epoch": 0.6055063136302904, + "grad_norm": 6.7718892097473145, + "learning_rate": 3.5724733421754365e-05, + "loss": 1.9892, + "step": 8044 + }, + { + "epoch": 0.6055815879109505, + "grad_norm": 5.31088924407959, + "learning_rate": 3.571305037500603e-05, + "loss": 1.5263, + "step": 8045 + }, + { + "epoch": 0.6056568621916106, + "grad_norm": 6.791069507598877, + "learning_rate": 3.570136817757585e-05, + "loss": 1.8175, + "step": 8046 + }, + { + "epoch": 0.6057321364722709, + "grad_norm": 4.031097888946533, + "learning_rate": 3.568968683015832e-05, + "loss": 1.7603, + "step": 8047 + }, + { + "epoch": 0.605807410752931, + "grad_norm": 4.455870628356934, + "learning_rate": 3.567800633344783e-05, + "loss": 1.9016, + "step": 8048 + }, + { + "epoch": 0.6058826850335911, + "grad_norm": 5.4805731773376465, + "learning_rate": 3.566632668813879e-05, + "loss": 1.5804, + "step": 8049 + }, + { + "epoch": 0.6059579593142513, + "grad_norm": 4.379767417907715, + "learning_rate": 3.5654647894925475e-05, + "loss": 1.6012, + "step": 8050 + }, + { + "epoch": 0.6060332335949115, + "grad_norm": 6.464620590209961, + "learning_rate": 3.564296995450219e-05, + "loss": 2.3095, + "step": 8051 + }, + { + "epoch": 0.6061085078755716, + "grad_norm": 8.686850547790527, + "learning_rate": 3.5631292867563146e-05, + "loss": 1.9917, + "step": 8052 + }, + { + "epoch": 0.6061837821562318, + "grad_norm": 4.714382171630859, + "learning_rate": 3.5619616634802524e-05, + "loss": 1.8046, + "step": 8053 + }, + { + "epoch": 0.6062590564368919, + "grad_norm": 6.38838005065918, + "learning_rate": 3.560794125691441e-05, + "loss": 2.3639, + "step": 8054 + }, + { + "epoch": 0.606334330717552, + "grad_norm": 5.940616607666016, + "learning_rate": 3.559626673459291e-05, + "loss": 1.7498, + "step": 8055 + }, + { + "epoch": 0.6064096049982123, + "grad_norm": 4.436683654785156, + "learning_rate": 3.558459306853201e-05, + "loss": 1.6991, + "step": 8056 + }, + { + "epoch": 0.6064848792788724, + "grad_norm": 5.870072364807129, + "learning_rate": 3.5572920259425715e-05, + "loss": 1.284, + "step": 8057 + }, + { + "epoch": 0.6065601535595325, + "grad_norm": 4.202273845672607, + "learning_rate": 3.556124830796789e-05, + "loss": 1.7791, + "step": 8058 + }, + { + "epoch": 0.6066354278401928, + "grad_norm": 3.9480738639831543, + "learning_rate": 3.554957721485242e-05, + "loss": 1.34, + "step": 8059 + }, + { + "epoch": 0.6067107021208529, + "grad_norm": 4.205045223236084, + "learning_rate": 3.553790698077315e-05, + "loss": 1.5577, + "step": 8060 + }, + { + "epoch": 0.606785976401513, + "grad_norm": 4.402333736419678, + "learning_rate": 3.5526237606423786e-05, + "loss": 1.7104, + "step": 8061 + }, + { + "epoch": 0.6068612506821731, + "grad_norm": 5.593390464782715, + "learning_rate": 3.551456909249808e-05, + "loss": 1.979, + "step": 8062 + }, + { + "epoch": 0.6069365249628333, + "grad_norm": 4.244074821472168, + "learning_rate": 3.5502901439689664e-05, + "loss": 1.5256, + "step": 8063 + }, + { + "epoch": 0.6070117992434935, + "grad_norm": 4.680155277252197, + "learning_rate": 3.549123464869218e-05, + "loss": 1.7962, + "step": 8064 + }, + { + "epoch": 0.6070870735241536, + "grad_norm": 4.564580917358398, + "learning_rate": 3.547956872019915e-05, + "loss": 1.6722, + "step": 8065 + }, + { + "epoch": 0.6071623478048138, + "grad_norm": 10.013934135437012, + "learning_rate": 3.546790365490411e-05, + "loss": 2.0294, + "step": 8066 + }, + { + "epoch": 0.6072376220854739, + "grad_norm": 3.786233425140381, + "learning_rate": 3.545623945350049e-05, + "loss": 1.6733, + "step": 8067 + }, + { + "epoch": 0.6073128963661341, + "grad_norm": 6.8180999755859375, + "learning_rate": 3.544457611668171e-05, + "loss": 1.954, + "step": 8068 + }, + { + "epoch": 0.6073881706467943, + "grad_norm": 4.573334693908691, + "learning_rate": 3.5432913645141106e-05, + "loss": 1.71, + "step": 8069 + }, + { + "epoch": 0.6074634449274544, + "grad_norm": 3.6010117530822754, + "learning_rate": 3.542125203957202e-05, + "loss": 1.642, + "step": 8070 + }, + { + "epoch": 0.6075387192081145, + "grad_norm": 6.272981643676758, + "learning_rate": 3.5409591300667636e-05, + "loss": 1.7795, + "step": 8071 + }, + { + "epoch": 0.6076139934887748, + "grad_norm": 4.059992790222168, + "learning_rate": 3.539793142912119e-05, + "loss": 1.7403, + "step": 8072 + }, + { + "epoch": 0.6076892677694349, + "grad_norm": 4.270183086395264, + "learning_rate": 3.5386272425625816e-05, + "loss": 1.6402, + "step": 8073 + }, + { + "epoch": 0.607764542050095, + "grad_norm": 5.349389553070068, + "learning_rate": 3.537461429087464e-05, + "loss": 1.5877, + "step": 8074 + }, + { + "epoch": 0.6078398163307552, + "grad_norm": 4.138882160186768, + "learning_rate": 3.536295702556065e-05, + "loss": 1.769, + "step": 8075 + }, + { + "epoch": 0.6079150906114154, + "grad_norm": 5.533018589019775, + "learning_rate": 3.5351300630376876e-05, + "loss": 1.6796, + "step": 8076 + }, + { + "epoch": 0.6079903648920755, + "grad_norm": 6.179644584655762, + "learning_rate": 3.533964510601626e-05, + "loss": 1.8955, + "step": 8077 + }, + { + "epoch": 0.6080656391727357, + "grad_norm": 6.139017581939697, + "learning_rate": 3.5327990453171663e-05, + "loss": 2.0319, + "step": 8078 + }, + { + "epoch": 0.6081409134533958, + "grad_norm": 6.271912097930908, + "learning_rate": 3.5316336672535945e-05, + "loss": 1.3298, + "step": 8079 + }, + { + "epoch": 0.608216187734056, + "grad_norm": 6.237033843994141, + "learning_rate": 3.530468376480187e-05, + "loss": 1.6628, + "step": 8080 + }, + { + "epoch": 0.6082914620147161, + "grad_norm": 3.975836753845215, + "learning_rate": 3.5293031730662195e-05, + "loss": 1.8774, + "step": 8081 + }, + { + "epoch": 0.6083667362953763, + "grad_norm": 3.5257015228271484, + "learning_rate": 3.528138057080957e-05, + "loss": 1.7437, + "step": 8082 + }, + { + "epoch": 0.6084420105760364, + "grad_norm": 4.711452960968018, + "learning_rate": 3.526973028593664e-05, + "loss": 1.7632, + "step": 8083 + }, + { + "epoch": 0.6085172848566965, + "grad_norm": 5.078463554382324, + "learning_rate": 3.525808087673598e-05, + "loss": 1.7084, + "step": 8084 + }, + { + "epoch": 0.6085925591373568, + "grad_norm": 5.767451286315918, + "learning_rate": 3.524643234390012e-05, + "loss": 1.8479, + "step": 8085 + }, + { + "epoch": 0.6086678334180169, + "grad_norm": 4.351364612579346, + "learning_rate": 3.5234784688121516e-05, + "loss": 1.8607, + "step": 8086 + }, + { + "epoch": 0.608743107698677, + "grad_norm": 6.21993350982666, + "learning_rate": 3.5223137910092605e-05, + "loss": 2.0673, + "step": 8087 + }, + { + "epoch": 0.6088183819793372, + "grad_norm": 4.6438822746276855, + "learning_rate": 3.5211492010505734e-05, + "loss": 1.8567, + "step": 8088 + }, + { + "epoch": 0.6088936562599974, + "grad_norm": 4.6182332038879395, + "learning_rate": 3.5199846990053234e-05, + "loss": 1.9892, + "step": 8089 + }, + { + "epoch": 0.6089689305406575, + "grad_norm": 8.021430969238281, + "learning_rate": 3.5188202849427355e-05, + "loss": 2.3467, + "step": 8090 + }, + { + "epoch": 0.6090442048213177, + "grad_norm": 5.784464359283447, + "learning_rate": 3.5176559589320326e-05, + "loss": 1.7767, + "step": 8091 + }, + { + "epoch": 0.6091194791019778, + "grad_norm": 6.09599494934082, + "learning_rate": 3.516491721042431e-05, + "loss": 2.5454, + "step": 8092 + }, + { + "epoch": 0.609194753382638, + "grad_norm": 6.063644886016846, + "learning_rate": 3.5153275713431375e-05, + "loss": 2.3278, + "step": 8093 + }, + { + "epoch": 0.6092700276632982, + "grad_norm": 6.2047929763793945, + "learning_rate": 3.514163509903362e-05, + "loss": 1.6538, + "step": 8094 + }, + { + "epoch": 0.6093453019439583, + "grad_norm": 4.3374505043029785, + "learning_rate": 3.512999536792301e-05, + "loss": 1.7604, + "step": 8095 + }, + { + "epoch": 0.6094205762246184, + "grad_norm": 4.688549518585205, + "learning_rate": 3.511835652079151e-05, + "loss": 1.708, + "step": 8096 + }, + { + "epoch": 0.6094958505052787, + "grad_norm": 4.737636089324951, + "learning_rate": 3.5106718558331006e-05, + "loss": 1.5878, + "step": 8097 + }, + { + "epoch": 0.6095711247859388, + "grad_norm": 4.31265926361084, + "learning_rate": 3.5095081481233384e-05, + "loss": 2.057, + "step": 8098 + }, + { + "epoch": 0.6096463990665989, + "grad_norm": 5.311634540557861, + "learning_rate": 3.5083445290190364e-05, + "loss": 1.7763, + "step": 8099 + }, + { + "epoch": 0.6097216733472591, + "grad_norm": 5.816988468170166, + "learning_rate": 3.507180998589374e-05, + "loss": 1.8476, + "step": 8100 + }, + { + "epoch": 0.6097969476279193, + "grad_norm": 8.308273315429688, + "learning_rate": 3.506017556903517e-05, + "loss": 1.8243, + "step": 8101 + }, + { + "epoch": 0.6098722219085794, + "grad_norm": 5.233309268951416, + "learning_rate": 3.504854204030631e-05, + "loss": 1.9434, + "step": 8102 + }, + { + "epoch": 0.6099474961892395, + "grad_norm": 4.769708156585693, + "learning_rate": 3.503690940039871e-05, + "loss": 1.7205, + "step": 8103 + }, + { + "epoch": 0.6100227704698997, + "grad_norm": 6.045488357543945, + "learning_rate": 3.502527765000393e-05, + "loss": 1.6621, + "step": 8104 + }, + { + "epoch": 0.6100980447505598, + "grad_norm": 5.941781044006348, + "learning_rate": 3.5013646789813404e-05, + "loss": 2.0352, + "step": 8105 + }, + { + "epoch": 0.61017331903122, + "grad_norm": 5.315737724304199, + "learning_rate": 3.500201682051859e-05, + "loss": 2.0206, + "step": 8106 + }, + { + "epoch": 0.6102485933118802, + "grad_norm": 4.679686546325684, + "learning_rate": 3.499038774281083e-05, + "loss": 1.4654, + "step": 8107 + }, + { + "epoch": 0.6103238675925403, + "grad_norm": 4.6445722579956055, + "learning_rate": 3.4978759557381444e-05, + "loss": 1.6942, + "step": 8108 + }, + { + "epoch": 0.6103991418732004, + "grad_norm": 6.710294246673584, + "learning_rate": 3.496713226492173e-05, + "loss": 1.9981, + "step": 8109 + }, + { + "epoch": 0.6104744161538607, + "grad_norm": 6.312389373779297, + "learning_rate": 3.495550586612285e-05, + "loss": 1.8949, + "step": 8110 + }, + { + "epoch": 0.6105496904345208, + "grad_norm": 4.057049751281738, + "learning_rate": 3.494388036167598e-05, + "loss": 1.9033, + "step": 8111 + }, + { + "epoch": 0.6106249647151809, + "grad_norm": 4.585483074188232, + "learning_rate": 3.493225575227222e-05, + "loss": 1.846, + "step": 8112 + }, + { + "epoch": 0.6107002389958411, + "grad_norm": 5.755495071411133, + "learning_rate": 3.492063203860264e-05, + "loss": 1.8582, + "step": 8113 + }, + { + "epoch": 0.6107755132765013, + "grad_norm": 5.890713214874268, + "learning_rate": 3.49090092213582e-05, + "loss": 1.7666, + "step": 8114 + }, + { + "epoch": 0.6108507875571614, + "grad_norm": 4.833436489105225, + "learning_rate": 3.489738730122988e-05, + "loss": 1.7351, + "step": 8115 + }, + { + "epoch": 0.6109260618378216, + "grad_norm": 6.042886257171631, + "learning_rate": 3.4885766278908526e-05, + "loss": 1.8573, + "step": 8116 + }, + { + "epoch": 0.6110013361184817, + "grad_norm": 3.989191770553589, + "learning_rate": 3.487414615508501e-05, + "loss": 1.3383, + "step": 8117 + }, + { + "epoch": 0.6110766103991419, + "grad_norm": 5.329035758972168, + "learning_rate": 3.48625269304501e-05, + "loss": 2.0238, + "step": 8118 + }, + { + "epoch": 0.6111518846798021, + "grad_norm": 5.113417625427246, + "learning_rate": 3.485090860569454e-05, + "loss": 1.617, + "step": 8119 + }, + { + "epoch": 0.6112271589604622, + "grad_norm": 9.029998779296875, + "learning_rate": 3.483929118150898e-05, + "loss": 1.91, + "step": 8120 + }, + { + "epoch": 0.6113024332411223, + "grad_norm": 7.3666839599609375, + "learning_rate": 3.482767465858408e-05, + "loss": 1.8286, + "step": 8121 + }, + { + "epoch": 0.6113777075217824, + "grad_norm": 4.620846271514893, + "learning_rate": 3.4816059037610365e-05, + "loss": 1.6997, + "step": 8122 + }, + { + "epoch": 0.6114529818024427, + "grad_norm": 5.611097812652588, + "learning_rate": 3.480444431927837e-05, + "loss": 1.9035, + "step": 8123 + }, + { + "epoch": 0.6115282560831028, + "grad_norm": 4.780721664428711, + "learning_rate": 3.479283050427858e-05, + "loss": 1.6846, + "step": 8124 + }, + { + "epoch": 0.6116035303637629, + "grad_norm": 5.277527332305908, + "learning_rate": 3.478121759330136e-05, + "loss": 1.5705, + "step": 8125 + }, + { + "epoch": 0.6116788046444231, + "grad_norm": 7.533792972564697, + "learning_rate": 3.476960558703711e-05, + "loss": 2.0774, + "step": 8126 + }, + { + "epoch": 0.6117540789250833, + "grad_norm": 3.64604115486145, + "learning_rate": 3.4757994486176094e-05, + "loss": 1.7476, + "step": 8127 + }, + { + "epoch": 0.6118293532057434, + "grad_norm": 5.177612781524658, + "learning_rate": 3.474638429140858e-05, + "loss": 1.5629, + "step": 8128 + }, + { + "epoch": 0.6119046274864036, + "grad_norm": 4.776829242706299, + "learning_rate": 3.4734775003424734e-05, + "loss": 1.9089, + "step": 8129 + }, + { + "epoch": 0.6119799017670637, + "grad_norm": 5.867118835449219, + "learning_rate": 3.472316662291474e-05, + "loss": 1.7578, + "step": 8130 + }, + { + "epoch": 0.6120551760477239, + "grad_norm": 4.7869768142700195, + "learning_rate": 3.4711559150568634e-05, + "loss": 1.7473, + "step": 8131 + }, + { + "epoch": 0.6121304503283841, + "grad_norm": 4.424525260925293, + "learning_rate": 3.469995258707649e-05, + "loss": 1.9923, + "step": 8132 + }, + { + "epoch": 0.6122057246090442, + "grad_norm": 5.202546119689941, + "learning_rate": 3.4688346933128246e-05, + "loss": 1.6578, + "step": 8133 + }, + { + "epoch": 0.6122809988897043, + "grad_norm": 5.164725303649902, + "learning_rate": 3.467674218941385e-05, + "loss": 1.9647, + "step": 8134 + }, + { + "epoch": 0.6123562731703646, + "grad_norm": 5.317021369934082, + "learning_rate": 3.466513835662315e-05, + "loss": 1.647, + "step": 8135 + }, + { + "epoch": 0.6124315474510247, + "grad_norm": 5.3344831466674805, + "learning_rate": 3.4653535435446e-05, + "loss": 2.1972, + "step": 8136 + }, + { + "epoch": 0.6125068217316848, + "grad_norm": 4.284879684448242, + "learning_rate": 3.464193342657211e-05, + "loss": 1.9931, + "step": 8137 + }, + { + "epoch": 0.612582096012345, + "grad_norm": 4.506369113922119, + "learning_rate": 3.4630332330691224e-05, + "loss": 2.0826, + "step": 8138 + }, + { + "epoch": 0.6126573702930052, + "grad_norm": 6.041220664978027, + "learning_rate": 3.461873214849297e-05, + "loss": 1.8367, + "step": 8139 + }, + { + "epoch": 0.6127326445736653, + "grad_norm": 5.328061580657959, + "learning_rate": 3.4607132880666956e-05, + "loss": 1.6496, + "step": 8140 + }, + { + "epoch": 0.6128079188543254, + "grad_norm": 4.81540584564209, + "learning_rate": 3.459553452790274e-05, + "loss": 1.7206, + "step": 8141 + }, + { + "epoch": 0.6128831931349856, + "grad_norm": 4.256353855133057, + "learning_rate": 3.458393709088979e-05, + "loss": 1.8251, + "step": 8142 + }, + { + "epoch": 0.6129584674156457, + "grad_norm": 4.865283489227295, + "learning_rate": 3.457234057031755e-05, + "loss": 2.1317, + "step": 8143 + }, + { + "epoch": 0.6130337416963059, + "grad_norm": 4.786359786987305, + "learning_rate": 3.456074496687538e-05, + "loss": 1.5756, + "step": 8144 + }, + { + "epoch": 0.6131090159769661, + "grad_norm": 4.32356071472168, + "learning_rate": 3.4549150281252636e-05, + "loss": 1.7171, + "step": 8145 + }, + { + "epoch": 0.6131842902576262, + "grad_norm": 5.075093746185303, + "learning_rate": 3.4537556514138556e-05, + "loss": 1.7837, + "step": 8146 + }, + { + "epoch": 0.6132595645382863, + "grad_norm": 4.299962520599365, + "learning_rate": 3.452596366622241e-05, + "loss": 1.8348, + "step": 8147 + }, + { + "epoch": 0.6133348388189466, + "grad_norm": 3.900052309036255, + "learning_rate": 3.451437173819329e-05, + "loss": 1.7232, + "step": 8148 + }, + { + "epoch": 0.6134101130996067, + "grad_norm": 5.519255638122559, + "learning_rate": 3.450278073074036e-05, + "loss": 1.8058, + "step": 8149 + }, + { + "epoch": 0.6134853873802668, + "grad_norm": 4.785489082336426, + "learning_rate": 3.449119064455263e-05, + "loss": 1.2532, + "step": 8150 + }, + { + "epoch": 0.613560661660927, + "grad_norm": 4.234131813049316, + "learning_rate": 3.447960148031913e-05, + "loss": 2.0709, + "step": 8151 + }, + { + "epoch": 0.6136359359415872, + "grad_norm": 6.209066867828369, + "learning_rate": 3.4468013238728774e-05, + "loss": 1.9193, + "step": 8152 + }, + { + "epoch": 0.6137112102222473, + "grad_norm": 5.110276222229004, + "learning_rate": 3.445642592047049e-05, + "loss": 2.1567, + "step": 8153 + }, + { + "epoch": 0.6137864845029075, + "grad_norm": 6.174053192138672, + "learning_rate": 3.444483952623306e-05, + "loss": 1.7745, + "step": 8154 + }, + { + "epoch": 0.6138617587835676, + "grad_norm": 5.744781494140625, + "learning_rate": 3.443325405670529e-05, + "loss": 1.9298, + "step": 8155 + }, + { + "epoch": 0.6139370330642278, + "grad_norm": 5.062314033508301, + "learning_rate": 3.442166951257591e-05, + "loss": 2.0628, + "step": 8156 + }, + { + "epoch": 0.614012307344888, + "grad_norm": 3.9385316371917725, + "learning_rate": 3.4410085894533576e-05, + "loss": 1.5186, + "step": 8157 + }, + { + "epoch": 0.6140875816255481, + "grad_norm": 4.121107578277588, + "learning_rate": 3.439850320326691e-05, + "loss": 1.7981, + "step": 8158 + }, + { + "epoch": 0.6141628559062082, + "grad_norm": 4.6892781257629395, + "learning_rate": 3.438692143946445e-05, + "loss": 1.8421, + "step": 8159 + }, + { + "epoch": 0.6142381301868683, + "grad_norm": 4.884023666381836, + "learning_rate": 3.437534060381473e-05, + "loss": 1.9118, + "step": 8160 + }, + { + "epoch": 0.6143134044675286, + "grad_norm": 5.3741774559021, + "learning_rate": 3.4363760697006155e-05, + "loss": 1.861, + "step": 8161 + }, + { + "epoch": 0.6143886787481887, + "grad_norm": 4.460343837738037, + "learning_rate": 3.4352181719727156e-05, + "loss": 1.7782, + "step": 8162 + }, + { + "epoch": 0.6144639530288488, + "grad_norm": 5.736788272857666, + "learning_rate": 3.434060367266604e-05, + "loss": 1.9163, + "step": 8163 + }, + { + "epoch": 0.614539227309509, + "grad_norm": 4.548046588897705, + "learning_rate": 3.4329026556511137e-05, + "loss": 1.8157, + "step": 8164 + }, + { + "epoch": 0.6146145015901692, + "grad_norm": 4.101787567138672, + "learning_rate": 3.431745037195061e-05, + "loss": 1.715, + "step": 8165 + }, + { + "epoch": 0.6146897758708293, + "grad_norm": 5.348918437957764, + "learning_rate": 3.430587511967267e-05, + "loss": 1.7904, + "step": 8166 + }, + { + "epoch": 0.6147650501514895, + "grad_norm": 4.291496276855469, + "learning_rate": 3.429430080036541e-05, + "loss": 1.8053, + "step": 8167 + }, + { + "epoch": 0.6148403244321496, + "grad_norm": 3.503713846206665, + "learning_rate": 3.428272741471693e-05, + "loss": 1.8766, + "step": 8168 + }, + { + "epoch": 0.6149155987128098, + "grad_norm": 5.3673882484436035, + "learning_rate": 3.427115496341518e-05, + "loss": 1.5653, + "step": 8169 + }, + { + "epoch": 0.61499087299347, + "grad_norm": 3.673436403274536, + "learning_rate": 3.425958344714816e-05, + "loss": 1.5551, + "step": 8170 + }, + { + "epoch": 0.6150661472741301, + "grad_norm": 4.699519634246826, + "learning_rate": 3.424801286660372e-05, + "loss": 1.9083, + "step": 8171 + }, + { + "epoch": 0.6151414215547902, + "grad_norm": 5.202142715454102, + "learning_rate": 3.4236443222469715e-05, + "loss": 1.9237, + "step": 8172 + }, + { + "epoch": 0.6152166958354505, + "grad_norm": 4.560394763946533, + "learning_rate": 3.422487451543395e-05, + "loss": 1.6084, + "step": 8173 + }, + { + "epoch": 0.6152919701161106, + "grad_norm": 5.937926769256592, + "learning_rate": 3.421330674618411e-05, + "loss": 1.6355, + "step": 8174 + }, + { + "epoch": 0.6153672443967707, + "grad_norm": 6.353554725646973, + "learning_rate": 3.420173991540791e-05, + "loss": 1.9213, + "step": 8175 + }, + { + "epoch": 0.6154425186774309, + "grad_norm": 4.914665699005127, + "learning_rate": 3.419017402379292e-05, + "loss": 1.8267, + "step": 8176 + }, + { + "epoch": 0.6155177929580911, + "grad_norm": 6.191343784332275, + "learning_rate": 3.4178609072026745e-05, + "loss": 2.3829, + "step": 8177 + }, + { + "epoch": 0.6155930672387512, + "grad_norm": 7.114634037017822, + "learning_rate": 3.416704506079684e-05, + "loss": 1.9127, + "step": 8178 + }, + { + "epoch": 0.6156683415194114, + "grad_norm": 5.051496982574463, + "learning_rate": 3.415548199079069e-05, + "loss": 1.9752, + "step": 8179 + }, + { + "epoch": 0.6157436158000715, + "grad_norm": 4.595468521118164, + "learning_rate": 3.414391986269566e-05, + "loss": 1.8462, + "step": 8180 + }, + { + "epoch": 0.6158188900807317, + "grad_norm": 4.51190710067749, + "learning_rate": 3.413235867719912e-05, + "loss": 1.5839, + "step": 8181 + }, + { + "epoch": 0.6158941643613918, + "grad_norm": 4.797574520111084, + "learning_rate": 3.4120798434988296e-05, + "loss": 1.6178, + "step": 8182 + }, + { + "epoch": 0.615969438642052, + "grad_norm": 5.4998393058776855, + "learning_rate": 3.410923913675047e-05, + "loss": 1.7397, + "step": 8183 + }, + { + "epoch": 0.6160447129227121, + "grad_norm": 5.5877604484558105, + "learning_rate": 3.4097680783172764e-05, + "loss": 1.7773, + "step": 8184 + }, + { + "epoch": 0.6161199872033722, + "grad_norm": 5.176278591156006, + "learning_rate": 3.408612337494233e-05, + "loss": 1.7342, + "step": 8185 + }, + { + "epoch": 0.6161952614840325, + "grad_norm": 4.746948719024658, + "learning_rate": 3.407456691274618e-05, + "loss": 1.4581, + "step": 8186 + }, + { + "epoch": 0.6162705357646926, + "grad_norm": 5.017001628875732, + "learning_rate": 3.406301139727132e-05, + "loss": 1.5443, + "step": 8187 + }, + { + "epoch": 0.6163458100453527, + "grad_norm": 5.200531959533691, + "learning_rate": 3.405145682920473e-05, + "loss": 2.6941, + "step": 8188 + }, + { + "epoch": 0.616421084326013, + "grad_norm": 4.859975337982178, + "learning_rate": 3.403990320923325e-05, + "loss": 1.743, + "step": 8189 + }, + { + "epoch": 0.6164963586066731, + "grad_norm": 4.141367435455322, + "learning_rate": 3.402835053804374e-05, + "loss": 1.7978, + "step": 8190 + }, + { + "epoch": 0.6165716328873332, + "grad_norm": 5.767589569091797, + "learning_rate": 3.401679881632296e-05, + "loss": 1.6312, + "step": 8191 + }, + { + "epoch": 0.6166469071679934, + "grad_norm": 3.8327958583831787, + "learning_rate": 3.400524804475764e-05, + "loss": 1.8651, + "step": 8192 + }, + { + "epoch": 0.6167221814486535, + "grad_norm": 5.357612133026123, + "learning_rate": 3.3993698224034415e-05, + "loss": 1.9488, + "step": 8193 + }, + { + "epoch": 0.6167974557293137, + "grad_norm": 4.806764602661133, + "learning_rate": 3.398214935483991e-05, + "loss": 1.7941, + "step": 8194 + }, + { + "epoch": 0.6168727300099739, + "grad_norm": 5.690446853637695, + "learning_rate": 3.397060143786067e-05, + "loss": 1.618, + "step": 8195 + }, + { + "epoch": 0.616948004290634, + "grad_norm": 4.265924453735352, + "learning_rate": 3.3959054473783175e-05, + "loss": 1.7575, + "step": 8196 + }, + { + "epoch": 0.6170232785712941, + "grad_norm": 8.809532165527344, + "learning_rate": 3.394750846329386e-05, + "loss": 2.0642, + "step": 8197 + }, + { + "epoch": 0.6170985528519544, + "grad_norm": 5.366862773895264, + "learning_rate": 3.393596340707912e-05, + "loss": 1.9248, + "step": 8198 + }, + { + "epoch": 0.6171738271326145, + "grad_norm": 3.813216209411621, + "learning_rate": 3.392441930582525e-05, + "loss": 1.8163, + "step": 8199 + }, + { + "epoch": 0.6172491014132746, + "grad_norm": 6.4618330001831055, + "learning_rate": 3.391287616021853e-05, + "loss": 1.6525, + "step": 8200 + }, + { + "epoch": 0.6173243756939347, + "grad_norm": 16.57746696472168, + "learning_rate": 3.390133397094516e-05, + "loss": 1.6273, + "step": 8201 + }, + { + "epoch": 0.617399649974595, + "grad_norm": 3.996077060699463, + "learning_rate": 3.3889792738691314e-05, + "loss": 1.4443, + "step": 8202 + }, + { + "epoch": 0.6174749242552551, + "grad_norm": 5.756370544433594, + "learning_rate": 3.387825246414303e-05, + "loss": 2.1152, + "step": 8203 + }, + { + "epoch": 0.6175501985359152, + "grad_norm": 6.613095283508301, + "learning_rate": 3.3866713147986394e-05, + "loss": 2.1753, + "step": 8204 + }, + { + "epoch": 0.6176254728165754, + "grad_norm": 6.042783260345459, + "learning_rate": 3.385517479090738e-05, + "loss": 1.8942, + "step": 8205 + }, + { + "epoch": 0.6177007470972355, + "grad_norm": 5.89695405960083, + "learning_rate": 3.3843637393591894e-05, + "loss": 2.1273, + "step": 8206 + }, + { + "epoch": 0.6177760213778957, + "grad_norm": 4.44822883605957, + "learning_rate": 3.383210095672581e-05, + "loss": 1.3664, + "step": 8207 + }, + { + "epoch": 0.6178512956585559, + "grad_norm": 4.939016819000244, + "learning_rate": 3.3820565480994913e-05, + "loss": 2.0909, + "step": 8208 + }, + { + "epoch": 0.617926569939216, + "grad_norm": 4.091365337371826, + "learning_rate": 3.380903096708501e-05, + "loss": 1.7525, + "step": 8209 + }, + { + "epoch": 0.6180018442198761, + "grad_norm": 4.0339508056640625, + "learning_rate": 3.3797497415681725e-05, + "loss": 1.554, + "step": 8210 + }, + { + "epoch": 0.6180771185005364, + "grad_norm": 4.657739639282227, + "learning_rate": 3.378596482747075e-05, + "loss": 1.6208, + "step": 8211 + }, + { + "epoch": 0.6181523927811965, + "grad_norm": 5.9851813316345215, + "learning_rate": 3.377443320313762e-05, + "loss": 1.8359, + "step": 8212 + }, + { + "epoch": 0.6182276670618566, + "grad_norm": 5.4093241691589355, + "learning_rate": 3.3762902543367916e-05, + "loss": 1.8806, + "step": 8213 + }, + { + "epoch": 0.6183029413425168, + "grad_norm": 4.459803581237793, + "learning_rate": 3.375137284884703e-05, + "loss": 1.8015, + "step": 8214 + }, + { + "epoch": 0.618378215623177, + "grad_norm": 4.538405895233154, + "learning_rate": 3.3739844120260436e-05, + "loss": 1.9921, + "step": 8215 + }, + { + "epoch": 0.6184534899038371, + "grad_norm": 3.7780160903930664, + "learning_rate": 3.372831635829343e-05, + "loss": 1.6092, + "step": 8216 + }, + { + "epoch": 0.6185287641844973, + "grad_norm": 4.539201736450195, + "learning_rate": 3.3716789563631334e-05, + "loss": 1.5892, + "step": 8217 + }, + { + "epoch": 0.6186040384651574, + "grad_norm": 4.376361846923828, + "learning_rate": 3.370526373695937e-05, + "loss": 1.6221, + "step": 8218 + }, + { + "epoch": 0.6186793127458176, + "grad_norm": 5.478304862976074, + "learning_rate": 3.3693738878962745e-05, + "loss": 1.6096, + "step": 8219 + }, + { + "epoch": 0.6187545870264777, + "grad_norm": 4.415644645690918, + "learning_rate": 3.368221499032653e-05, + "loss": 1.9493, + "step": 8220 + }, + { + "epoch": 0.6188298613071379, + "grad_norm": 4.649485111236572, + "learning_rate": 3.3670692071735824e-05, + "loss": 1.9579, + "step": 8221 + }, + { + "epoch": 0.618905135587798, + "grad_norm": 4.2577972412109375, + "learning_rate": 3.365917012387563e-05, + "loss": 1.6833, + "step": 8222 + }, + { + "epoch": 0.6189804098684581, + "grad_norm": 7.096913814544678, + "learning_rate": 3.364764914743087e-05, + "loss": 1.7063, + "step": 8223 + }, + { + "epoch": 0.6190556841491184, + "grad_norm": 5.64223051071167, + "learning_rate": 3.363612914308645e-05, + "loss": 1.9535, + "step": 8224 + }, + { + "epoch": 0.6191309584297785, + "grad_norm": 5.372496128082275, + "learning_rate": 3.36246101115272e-05, + "loss": 1.6014, + "step": 8225 + }, + { + "epoch": 0.6192062327104386, + "grad_norm": 5.196044921875, + "learning_rate": 3.36130920534379e-05, + "loss": 2.0847, + "step": 8226 + }, + { + "epoch": 0.6192815069910989, + "grad_norm": 4.410834312438965, + "learning_rate": 3.3601574969503255e-05, + "loss": 1.5573, + "step": 8227 + }, + { + "epoch": 0.619356781271759, + "grad_norm": 5.691237449645996, + "learning_rate": 3.359005886040792e-05, + "loss": 1.7361, + "step": 8228 + }, + { + "epoch": 0.6194320555524191, + "grad_norm": 6.057751655578613, + "learning_rate": 3.35785437268365e-05, + "loss": 1.5906, + "step": 8229 + }, + { + "epoch": 0.6195073298330793, + "grad_norm": 3.959517478942871, + "learning_rate": 3.356702956947355e-05, + "loss": 1.6366, + "step": 8230 + }, + { + "epoch": 0.6195826041137394, + "grad_norm": 5.356793403625488, + "learning_rate": 3.3555516389003525e-05, + "loss": 2.0475, + "step": 8231 + }, + { + "epoch": 0.6196578783943996, + "grad_norm": 5.596253395080566, + "learning_rate": 3.3544004186110886e-05, + "loss": 1.9068, + "step": 8232 + }, + { + "epoch": 0.6197331526750598, + "grad_norm": 5.67264986038208, + "learning_rate": 3.353249296147997e-05, + "loss": 2.2231, + "step": 8233 + }, + { + "epoch": 0.6198084269557199, + "grad_norm": 4.520387649536133, + "learning_rate": 3.352098271579509e-05, + "loss": 2.0095, + "step": 8234 + }, + { + "epoch": 0.61988370123638, + "grad_norm": 6.100481033325195, + "learning_rate": 3.35094734497405e-05, + "loss": 1.761, + "step": 8235 + }, + { + "epoch": 0.6199589755170403, + "grad_norm": 6.019735813140869, + "learning_rate": 3.34979651640004e-05, + "loss": 1.628, + "step": 8236 + }, + { + "epoch": 0.6200342497977004, + "grad_norm": 4.518129825592041, + "learning_rate": 3.3486457859258934e-05, + "loss": 2.0593, + "step": 8237 + }, + { + "epoch": 0.6201095240783605, + "grad_norm": 4.630537509918213, + "learning_rate": 3.347495153620015e-05, + "loss": 1.5638, + "step": 8238 + }, + { + "epoch": 0.6201847983590206, + "grad_norm": 5.115309238433838, + "learning_rate": 3.3463446195508094e-05, + "loss": 1.9213, + "step": 8239 + }, + { + "epoch": 0.6202600726396809, + "grad_norm": 4.890008926391602, + "learning_rate": 3.34519418378667e-05, + "loss": 1.5088, + "step": 8240 + }, + { + "epoch": 0.620335346920341, + "grad_norm": 5.8107452392578125, + "learning_rate": 3.344043846395991e-05, + "loss": 2.489, + "step": 8241 + }, + { + "epoch": 0.6204106212010011, + "grad_norm": 4.9344964027404785, + "learning_rate": 3.342893607447151e-05, + "loss": 2.2474, + "step": 8242 + }, + { + "epoch": 0.6204858954816613, + "grad_norm": 4.316576957702637, + "learning_rate": 3.3417434670085335e-05, + "loss": 1.8245, + "step": 8243 + }, + { + "epoch": 0.6205611697623215, + "grad_norm": 6.2932844161987305, + "learning_rate": 3.340593425148507e-05, + "loss": 1.5421, + "step": 8244 + }, + { + "epoch": 0.6206364440429816, + "grad_norm": 5.49137020111084, + "learning_rate": 3.3394434819354404e-05, + "loss": 1.7211, + "step": 8245 + }, + { + "epoch": 0.6207117183236418, + "grad_norm": 5.316649436950684, + "learning_rate": 3.3382936374376935e-05, + "loss": 2.2811, + "step": 8246 + }, + { + "epoch": 0.6207869926043019, + "grad_norm": 5.549999237060547, + "learning_rate": 3.337143891723624e-05, + "loss": 1.8084, + "step": 8247 + }, + { + "epoch": 0.620862266884962, + "grad_norm": 4.585305690765381, + "learning_rate": 3.335994244861577e-05, + "loss": 1.5884, + "step": 8248 + }, + { + "epoch": 0.6209375411656223, + "grad_norm": 4.191547393798828, + "learning_rate": 3.3348446969198984e-05, + "loss": 1.7825, + "step": 8249 + }, + { + "epoch": 0.6210128154462824, + "grad_norm": 6.276517391204834, + "learning_rate": 3.3336952479669234e-05, + "loss": 1.5819, + "step": 8250 + }, + { + "epoch": 0.6210880897269425, + "grad_norm": 4.5018391609191895, + "learning_rate": 3.332545898070987e-05, + "loss": 1.6155, + "step": 8251 + }, + { + "epoch": 0.6211633640076027, + "grad_norm": 6.358250141143799, + "learning_rate": 3.3313966473004096e-05, + "loss": 1.6911, + "step": 8252 + }, + { + "epoch": 0.6212386382882629, + "grad_norm": 4.8963470458984375, + "learning_rate": 3.330247495723515e-05, + "loss": 1.7713, + "step": 8253 + }, + { + "epoch": 0.621313912568923, + "grad_norm": 5.280465602874756, + "learning_rate": 3.329098443408617e-05, + "loss": 2.0679, + "step": 8254 + }, + { + "epoch": 0.6213891868495832, + "grad_norm": 5.506618022918701, + "learning_rate": 3.3279494904240206e-05, + "loss": 1.5009, + "step": 8255 + }, + { + "epoch": 0.6214644611302433, + "grad_norm": 5.569920539855957, + "learning_rate": 3.3268006368380304e-05, + "loss": 1.8057, + "step": 8256 + }, + { + "epoch": 0.6215397354109035, + "grad_norm": 4.997191905975342, + "learning_rate": 3.32565188271894e-05, + "loss": 1.8979, + "step": 8257 + }, + { + "epoch": 0.6216150096915636, + "grad_norm": 4.7654242515563965, + "learning_rate": 3.3245032281350435e-05, + "loss": 2.0739, + "step": 8258 + }, + { + "epoch": 0.6216902839722238, + "grad_norm": 5.233922958374023, + "learning_rate": 3.3233546731546204e-05, + "loss": 1.8729, + "step": 8259 + }, + { + "epoch": 0.6217655582528839, + "grad_norm": 5.415831565856934, + "learning_rate": 3.3222062178459526e-05, + "loss": 1.8151, + "step": 8260 + }, + { + "epoch": 0.621840832533544, + "grad_norm": 3.984008312225342, + "learning_rate": 3.32105786227731e-05, + "loss": 1.3166, + "step": 8261 + }, + { + "epoch": 0.6219161068142043, + "grad_norm": 8.059233665466309, + "learning_rate": 3.31990960651696e-05, + "loss": 2.1252, + "step": 8262 + }, + { + "epoch": 0.6219913810948644, + "grad_norm": 5.580278396606445, + "learning_rate": 3.318761450633163e-05, + "loss": 2.274, + "step": 8263 + }, + { + "epoch": 0.6220666553755245, + "grad_norm": 5.7492780685424805, + "learning_rate": 3.317613394694175e-05, + "loss": 1.3134, + "step": 8264 + }, + { + "epoch": 0.6221419296561848, + "grad_norm": 6.129329681396484, + "learning_rate": 3.316465438768242e-05, + "loss": 1.9404, + "step": 8265 + }, + { + "epoch": 0.6222172039368449, + "grad_norm": 5.890538692474365, + "learning_rate": 3.3153175829236085e-05, + "loss": 1.9933, + "step": 8266 + }, + { + "epoch": 0.622292478217505, + "grad_norm": 5.701658725738525, + "learning_rate": 3.3141698272285107e-05, + "loss": 1.7508, + "step": 8267 + }, + { + "epoch": 0.6223677524981652, + "grad_norm": 5.047867774963379, + "learning_rate": 3.313022171751179e-05, + "loss": 1.5584, + "step": 8268 + }, + { + "epoch": 0.6224430267788253, + "grad_norm": 4.532959938049316, + "learning_rate": 3.31187461655984e-05, + "loss": 1.8344, + "step": 8269 + }, + { + "epoch": 0.6225183010594855, + "grad_norm": 4.06094217300415, + "learning_rate": 3.3107271617227096e-05, + "loss": 1.7074, + "step": 8270 + }, + { + "epoch": 0.6225935753401457, + "grad_norm": 5.323122501373291, + "learning_rate": 3.3095798073080055e-05, + "loss": 1.5828, + "step": 8271 + }, + { + "epoch": 0.6226688496208058, + "grad_norm": 3.8890061378479004, + "learning_rate": 3.3084325533839275e-05, + "loss": 1.7797, + "step": 8272 + }, + { + "epoch": 0.6227441239014659, + "grad_norm": 4.8923492431640625, + "learning_rate": 3.3072854000186836e-05, + "loss": 1.835, + "step": 8273 + }, + { + "epoch": 0.6228193981821262, + "grad_norm": 4.122373104095459, + "learning_rate": 3.306138347280464e-05, + "loss": 1.9805, + "step": 8274 + }, + { + "epoch": 0.6228946724627863, + "grad_norm": 5.760934352874756, + "learning_rate": 3.304991395237461e-05, + "loss": 1.9075, + "step": 8275 + }, + { + "epoch": 0.6229699467434464, + "grad_norm": 4.359913349151611, + "learning_rate": 3.3038445439578544e-05, + "loss": 1.7145, + "step": 8276 + }, + { + "epoch": 0.6230452210241066, + "grad_norm": 5.756412982940674, + "learning_rate": 3.302697793509825e-05, + "loss": 1.7743, + "step": 8277 + }, + { + "epoch": 0.6231204953047668, + "grad_norm": 6.323239326477051, + "learning_rate": 3.3015511439615384e-05, + "loss": 1.7402, + "step": 8278 + }, + { + "epoch": 0.6231957695854269, + "grad_norm": 8.151937484741211, + "learning_rate": 3.300404595381165e-05, + "loss": 2.1713, + "step": 8279 + }, + { + "epoch": 0.623271043866087, + "grad_norm": 5.953207492828369, + "learning_rate": 3.2992581478368614e-05, + "loss": 1.9522, + "step": 8280 + }, + { + "epoch": 0.6233463181467472, + "grad_norm": 4.825040817260742, + "learning_rate": 3.2981118013967815e-05, + "loss": 1.5557, + "step": 8281 + }, + { + "epoch": 0.6234215924274074, + "grad_norm": 4.547284126281738, + "learning_rate": 3.29696555612907e-05, + "loss": 1.5958, + "step": 8282 + }, + { + "epoch": 0.6234968667080675, + "grad_norm": 5.367687702178955, + "learning_rate": 3.295819412101872e-05, + "loss": 1.8126, + "step": 8283 + }, + { + "epoch": 0.6235721409887277, + "grad_norm": 5.942622661590576, + "learning_rate": 3.294673369383318e-05, + "loss": 2.03, + "step": 8284 + }, + { + "epoch": 0.6236474152693878, + "grad_norm": 4.396617412567139, + "learning_rate": 3.2935274280415385e-05, + "loss": 1.8626, + "step": 8285 + }, + { + "epoch": 0.623722689550048, + "grad_norm": 4.370640277862549, + "learning_rate": 3.2923815881446594e-05, + "loss": 1.7118, + "step": 8286 + }, + { + "epoch": 0.6237979638307082, + "grad_norm": 6.063573837280273, + "learning_rate": 3.291235849760794e-05, + "loss": 2.3146, + "step": 8287 + }, + { + "epoch": 0.6238732381113683, + "grad_norm": 5.1595234870910645, + "learning_rate": 3.290090212958056e-05, + "loss": 1.866, + "step": 8288 + }, + { + "epoch": 0.6239485123920284, + "grad_norm": 5.361724853515625, + "learning_rate": 3.2889446778045464e-05, + "loss": 2.0147, + "step": 8289 + }, + { + "epoch": 0.6240237866726887, + "grad_norm": 4.5459980964660645, + "learning_rate": 3.287799244368367e-05, + "loss": 1.8177, + "step": 8290 + }, + { + "epoch": 0.6240990609533488, + "grad_norm": 5.594852447509766, + "learning_rate": 3.286653912717609e-05, + "loss": 1.7873, + "step": 8291 + }, + { + "epoch": 0.6241743352340089, + "grad_norm": 5.766581058502197, + "learning_rate": 3.285508682920361e-05, + "loss": 1.7964, + "step": 8292 + }, + { + "epoch": 0.6242496095146691, + "grad_norm": 5.8725152015686035, + "learning_rate": 3.2843635550447004e-05, + "loss": 2.0339, + "step": 8293 + }, + { + "epoch": 0.6243248837953292, + "grad_norm": 6.279059410095215, + "learning_rate": 3.283218529158706e-05, + "loss": 1.6742, + "step": 8294 + }, + { + "epoch": 0.6244001580759894, + "grad_norm": 3.8164103031158447, + "learning_rate": 3.282073605330443e-05, + "loss": 1.7464, + "step": 8295 + }, + { + "epoch": 0.6244754323566496, + "grad_norm": 6.469735145568848, + "learning_rate": 3.280928783627975e-05, + "loss": 2.2422, + "step": 8296 + }, + { + "epoch": 0.6245507066373097, + "grad_norm": 5.138152122497559, + "learning_rate": 3.279784064119357e-05, + "loss": 1.422, + "step": 8297 + }, + { + "epoch": 0.6246259809179698, + "grad_norm": 4.320432662963867, + "learning_rate": 3.2786394468726436e-05, + "loss": 1.7409, + "step": 8298 + }, + { + "epoch": 0.62470125519863, + "grad_norm": 4.7534284591674805, + "learning_rate": 3.277494931955873e-05, + "loss": 1.9486, + "step": 8299 + }, + { + "epoch": 0.6247765294792902, + "grad_norm": 4.9402594566345215, + "learning_rate": 3.2763505194370866e-05, + "loss": 1.6897, + "step": 8300 + }, + { + "epoch": 0.6248518037599503, + "grad_norm": 4.532849311828613, + "learning_rate": 3.275206209384317e-05, + "loss": 1.8942, + "step": 8301 + }, + { + "epoch": 0.6249270780406104, + "grad_norm": 9.393890380859375, + "learning_rate": 3.2740620018655885e-05, + "loss": 1.6423, + "step": 8302 + }, + { + "epoch": 0.6250023523212707, + "grad_norm": 5.9799675941467285, + "learning_rate": 3.272917896948923e-05, + "loss": 2.2211, + "step": 8303 + }, + { + "epoch": 0.6250776266019308, + "grad_norm": 6.8729071617126465, + "learning_rate": 3.2717738947023314e-05, + "loss": 1.7444, + "step": 8304 + }, + { + "epoch": 0.6251529008825909, + "grad_norm": 4.415032863616943, + "learning_rate": 3.2706299951938255e-05, + "loss": 1.5718, + "step": 8305 + }, + { + "epoch": 0.6252281751632511, + "grad_norm": 9.217084884643555, + "learning_rate": 3.269486198491402e-05, + "loss": 1.924, + "step": 8306 + }, + { + "epoch": 0.6253034494439113, + "grad_norm": 5.921759605407715, + "learning_rate": 3.268342504663059e-05, + "loss": 1.705, + "step": 8307 + }, + { + "epoch": 0.6253787237245714, + "grad_norm": 5.970406532287598, + "learning_rate": 3.267198913776785e-05, + "loss": 1.8171, + "step": 8308 + }, + { + "epoch": 0.6254539980052316, + "grad_norm": 4.7461934089660645, + "learning_rate": 3.266055425900565e-05, + "loss": 1.7467, + "step": 8309 + }, + { + "epoch": 0.6255292722858917, + "grad_norm": 15.243721961975098, + "learning_rate": 3.2649120411023734e-05, + "loss": 2.2232, + "step": 8310 + }, + { + "epoch": 0.6256045465665518, + "grad_norm": 5.61964750289917, + "learning_rate": 3.263768759450183e-05, + "loss": 1.9535, + "step": 8311 + }, + { + "epoch": 0.6256798208472121, + "grad_norm": 5.71324348449707, + "learning_rate": 3.262625581011957e-05, + "loss": 1.5552, + "step": 8312 + }, + { + "epoch": 0.6257550951278722, + "grad_norm": 3.868100166320801, + "learning_rate": 3.2614825058556573e-05, + "loss": 1.8502, + "step": 8313 + }, + { + "epoch": 0.6258303694085323, + "grad_norm": 4.370196342468262, + "learning_rate": 3.260339534049232e-05, + "loss": 1.7184, + "step": 8314 + }, + { + "epoch": 0.6259056436891925, + "grad_norm": 5.04323148727417, + "learning_rate": 3.259196665660632e-05, + "loss": 2.1143, + "step": 8315 + }, + { + "epoch": 0.6259809179698527, + "grad_norm": 4.476436138153076, + "learning_rate": 3.2580539007577934e-05, + "loss": 1.6678, + "step": 8316 + }, + { + "epoch": 0.6260561922505128, + "grad_norm": 4.9494171142578125, + "learning_rate": 3.256911239408652e-05, + "loss": 1.6163, + "step": 8317 + }, + { + "epoch": 0.6261314665311729, + "grad_norm": 5.270737171173096, + "learning_rate": 3.255768681681137e-05, + "loss": 1.458, + "step": 8318 + }, + { + "epoch": 0.6262067408118331, + "grad_norm": 5.886576175689697, + "learning_rate": 3.254626227643168e-05, + "loss": 1.5343, + "step": 8319 + }, + { + "epoch": 0.6262820150924933, + "grad_norm": 5.326073169708252, + "learning_rate": 3.253483877362664e-05, + "loss": 1.4423, + "step": 8320 + }, + { + "epoch": 0.6263572893731534, + "grad_norm": 4.253442764282227, + "learning_rate": 3.252341630907531e-05, + "loss": 1.4805, + "step": 8321 + }, + { + "epoch": 0.6264325636538136, + "grad_norm": 6.536440849304199, + "learning_rate": 3.251199488345674e-05, + "loss": 1.6546, + "step": 8322 + }, + { + "epoch": 0.6265078379344737, + "grad_norm": 5.27125358581543, + "learning_rate": 3.2500574497449874e-05, + "loss": 2.3648, + "step": 8323 + }, + { + "epoch": 0.6265831122151339, + "grad_norm": 5.5030035972595215, + "learning_rate": 3.2489155151733665e-05, + "loss": 1.8989, + "step": 8324 + }, + { + "epoch": 0.6266583864957941, + "grad_norm": 5.477755546569824, + "learning_rate": 3.2477736846986924e-05, + "loss": 1.7228, + "step": 8325 + }, + { + "epoch": 0.6267336607764542, + "grad_norm": 6.4154133796691895, + "learning_rate": 3.2466319583888464e-05, + "loss": 1.6662, + "step": 8326 + }, + { + "epoch": 0.6268089350571143, + "grad_norm": 4.027299404144287, + "learning_rate": 3.245490336311698e-05, + "loss": 1.7546, + "step": 8327 + }, + { + "epoch": 0.6268842093377746, + "grad_norm": 5.473293304443359, + "learning_rate": 3.2443488185351154e-05, + "loss": 1.8159, + "step": 8328 + }, + { + "epoch": 0.6269594836184347, + "grad_norm": 4.334332466125488, + "learning_rate": 3.2432074051269576e-05, + "loss": 1.7733, + "step": 8329 + }, + { + "epoch": 0.6270347578990948, + "grad_norm": 5.058533191680908, + "learning_rate": 3.242066096155081e-05, + "loss": 1.8312, + "step": 8330 + }, + { + "epoch": 0.627110032179755, + "grad_norm": 5.62025260925293, + "learning_rate": 3.240924891687328e-05, + "loss": 1.9789, + "step": 8331 + }, + { + "epoch": 0.6271853064604151, + "grad_norm": 4.268112659454346, + "learning_rate": 3.239783791791546e-05, + "loss": 1.6075, + "step": 8332 + }, + { + "epoch": 0.6272605807410753, + "grad_norm": 5.520972728729248, + "learning_rate": 3.2386427965355636e-05, + "loss": 2.0835, + "step": 8333 + }, + { + "epoch": 0.6273358550217355, + "grad_norm": 6.268272876739502, + "learning_rate": 3.2375019059872144e-05, + "loss": 1.8408, + "step": 8334 + }, + { + "epoch": 0.6274111293023956, + "grad_norm": 5.816622257232666, + "learning_rate": 3.236361120214319e-05, + "loss": 1.7554, + "step": 8335 + }, + { + "epoch": 0.6274864035830557, + "grad_norm": 4.832163333892822, + "learning_rate": 3.235220439284695e-05, + "loss": 1.5541, + "step": 8336 + }, + { + "epoch": 0.6275616778637159, + "grad_norm": 5.548486232757568, + "learning_rate": 3.234079863266154e-05, + "loss": 1.5873, + "step": 8337 + }, + { + "epoch": 0.6276369521443761, + "grad_norm": 5.206502437591553, + "learning_rate": 3.232939392226496e-05, + "loss": 1.681, + "step": 8338 + }, + { + "epoch": 0.6277122264250362, + "grad_norm": 5.786398887634277, + "learning_rate": 3.2317990262335215e-05, + "loss": 1.9467, + "step": 8339 + }, + { + "epoch": 0.6277875007056963, + "grad_norm": 4.6580657958984375, + "learning_rate": 3.230658765355021e-05, + "loss": 1.9014, + "step": 8340 + }, + { + "epoch": 0.6278627749863566, + "grad_norm": 3.8492088317871094, + "learning_rate": 3.2295186096587815e-05, + "loss": 1.7569, + "step": 8341 + }, + { + "epoch": 0.6279380492670167, + "grad_norm": 5.562602996826172, + "learning_rate": 3.2283785592125785e-05, + "loss": 1.9212, + "step": 8342 + }, + { + "epoch": 0.6280133235476768, + "grad_norm": 4.065419673919678, + "learning_rate": 3.2272386140841895e-05, + "loss": 1.4302, + "step": 8343 + }, + { + "epoch": 0.628088597828337, + "grad_norm": 3.9851157665252686, + "learning_rate": 3.226098774341376e-05, + "loss": 1.5262, + "step": 8344 + }, + { + "epoch": 0.6281638721089972, + "grad_norm": 6.33390474319458, + "learning_rate": 3.224959040051902e-05, + "loss": 1.7523, + "step": 8345 + }, + { + "epoch": 0.6282391463896573, + "grad_norm": 3.9492480754852295, + "learning_rate": 3.223819411283517e-05, + "loss": 1.839, + "step": 8346 + }, + { + "epoch": 0.6283144206703175, + "grad_norm": 5.650598526000977, + "learning_rate": 3.222679888103975e-05, + "loss": 2.1154, + "step": 8347 + }, + { + "epoch": 0.6283896949509776, + "grad_norm": 4.282275676727295, + "learning_rate": 3.221540470581011e-05, + "loss": 1.7995, + "step": 8348 + }, + { + "epoch": 0.6284649692316377, + "grad_norm": 5.363541603088379, + "learning_rate": 3.220401158782365e-05, + "loss": 1.4928, + "step": 8349 + }, + { + "epoch": 0.628540243512298, + "grad_norm": 5.535744667053223, + "learning_rate": 3.219261952775764e-05, + "loss": 1.9825, + "step": 8350 + }, + { + "epoch": 0.6286155177929581, + "grad_norm": 4.997323989868164, + "learning_rate": 3.218122852628929e-05, + "loss": 1.4421, + "step": 8351 + }, + { + "epoch": 0.6286907920736182, + "grad_norm": 6.069633960723877, + "learning_rate": 3.216983858409579e-05, + "loss": 1.6157, + "step": 8352 + }, + { + "epoch": 0.6287660663542785, + "grad_norm": 5.873281002044678, + "learning_rate": 3.215844970185421e-05, + "loss": 1.8869, + "step": 8353 + }, + { + "epoch": 0.6288413406349386, + "grad_norm": 5.071971893310547, + "learning_rate": 3.214706188024162e-05, + "loss": 1.8392, + "step": 8354 + }, + { + "epoch": 0.6289166149155987, + "grad_norm": 3.8708982467651367, + "learning_rate": 3.213567511993497e-05, + "loss": 1.702, + "step": 8355 + }, + { + "epoch": 0.6289918891962588, + "grad_norm": 4.605964183807373, + "learning_rate": 3.2124289421611175e-05, + "loss": 1.7435, + "step": 8356 + }, + { + "epoch": 0.629067163476919, + "grad_norm": 4.5636210441589355, + "learning_rate": 3.211290478594707e-05, + "loss": 1.4221, + "step": 8357 + }, + { + "epoch": 0.6291424377575792, + "grad_norm": 4.302135467529297, + "learning_rate": 3.210152121361948e-05, + "loss": 1.4778, + "step": 8358 + }, + { + "epoch": 0.6292177120382393, + "grad_norm": 5.387387752532959, + "learning_rate": 3.209013870530507e-05, + "loss": 1.8696, + "step": 8359 + }, + { + "epoch": 0.6292929863188995, + "grad_norm": 4.852499485015869, + "learning_rate": 3.207875726168055e-05, + "loss": 1.8526, + "step": 8360 + }, + { + "epoch": 0.6293682605995596, + "grad_norm": 6.312399387359619, + "learning_rate": 3.2067376883422464e-05, + "loss": 1.5264, + "step": 8361 + }, + { + "epoch": 0.6294435348802198, + "grad_norm": 5.809385776519775, + "learning_rate": 3.205599757120737e-05, + "loss": 1.5664, + "step": 8362 + }, + { + "epoch": 0.62951880916088, + "grad_norm": 9.201595306396484, + "learning_rate": 3.204461932571174e-05, + "loss": 2.1274, + "step": 8363 + }, + { + "epoch": 0.6295940834415401, + "grad_norm": 4.501537322998047, + "learning_rate": 3.203324214761198e-05, + "loss": 1.9544, + "step": 8364 + }, + { + "epoch": 0.6296693577222002, + "grad_norm": 4.940958023071289, + "learning_rate": 3.202186603758442e-05, + "loss": 1.7369, + "step": 8365 + }, + { + "epoch": 0.6297446320028605, + "grad_norm": 6.034842014312744, + "learning_rate": 3.201049099630532e-05, + "loss": 1.8851, + "step": 8366 + }, + { + "epoch": 0.6298199062835206, + "grad_norm": 4.616048336029053, + "learning_rate": 3.1999117024450924e-05, + "loss": 1.6926, + "step": 8367 + }, + { + "epoch": 0.6298951805641807, + "grad_norm": 5.375426292419434, + "learning_rate": 3.198774412269736e-05, + "loss": 1.8714, + "step": 8368 + }, + { + "epoch": 0.6299704548448409, + "grad_norm": 6.863966941833496, + "learning_rate": 3.197637229172076e-05, + "loss": 1.9756, + "step": 8369 + }, + { + "epoch": 0.630045729125501, + "grad_norm": 4.060893535614014, + "learning_rate": 3.1965001532197075e-05, + "loss": 1.6985, + "step": 8370 + }, + { + "epoch": 0.6301210034061612, + "grad_norm": 6.251177787780762, + "learning_rate": 3.1953631844802327e-05, + "loss": 1.7256, + "step": 8371 + }, + { + "epoch": 0.6301962776868214, + "grad_norm": 5.243875503540039, + "learning_rate": 3.194226323021237e-05, + "loss": 1.4514, + "step": 8372 + }, + { + "epoch": 0.6302715519674815, + "grad_norm": 4.302337646484375, + "learning_rate": 3.1930895689103054e-05, + "loss": 1.5191, + "step": 8373 + }, + { + "epoch": 0.6303468262481416, + "grad_norm": 5.064497947692871, + "learning_rate": 3.1919529222150136e-05, + "loss": 1.7131, + "step": 8374 + }, + { + "epoch": 0.6304221005288019, + "grad_norm": 3.6568796634674072, + "learning_rate": 3.190816383002934e-05, + "loss": 1.5786, + "step": 8375 + }, + { + "epoch": 0.630497374809462, + "grad_norm": 7.440704345703125, + "learning_rate": 3.189679951341628e-05, + "loss": 1.7698, + "step": 8376 + }, + { + "epoch": 0.6305726490901221, + "grad_norm": 4.935954570770264, + "learning_rate": 3.1885436272986566e-05, + "loss": 1.7981, + "step": 8377 + }, + { + "epoch": 0.6306479233707822, + "grad_norm": 3.5592689514160156, + "learning_rate": 3.1874074109415666e-05, + "loss": 1.8108, + "step": 8378 + }, + { + "epoch": 0.6307231976514425, + "grad_norm": 5.209213733673096, + "learning_rate": 3.186271302337906e-05, + "loss": 1.8393, + "step": 8379 + }, + { + "epoch": 0.6307984719321026, + "grad_norm": 6.28550386428833, + "learning_rate": 3.185135301555212e-05, + "loss": 1.8224, + "step": 8380 + }, + { + "epoch": 0.6308737462127627, + "grad_norm": 5.0310282707214355, + "learning_rate": 3.183999408661016e-05, + "loss": 1.9601, + "step": 8381 + }, + { + "epoch": 0.6309490204934229, + "grad_norm": 4.4893293380737305, + "learning_rate": 3.182863623722847e-05, + "loss": 1.856, + "step": 8382 + }, + { + "epoch": 0.6310242947740831, + "grad_norm": 4.414626598358154, + "learning_rate": 3.18172794680822e-05, + "loss": 1.7328, + "step": 8383 + }, + { + "epoch": 0.6310995690547432, + "grad_norm": 4.429747104644775, + "learning_rate": 3.180592377984649e-05, + "loss": 1.9052, + "step": 8384 + }, + { + "epoch": 0.6311748433354034, + "grad_norm": 4.949826717376709, + "learning_rate": 3.179456917319641e-05, + "loss": 1.8485, + "step": 8385 + }, + { + "epoch": 0.6312501176160635, + "grad_norm": 3.2864019870758057, + "learning_rate": 3.178321564880696e-05, + "loss": 1.6907, + "step": 8386 + }, + { + "epoch": 0.6313253918967237, + "grad_norm": 3.3971188068389893, + "learning_rate": 3.177186320735306e-05, + "loss": 1.8963, + "step": 8387 + }, + { + "epoch": 0.6314006661773839, + "grad_norm": 4.255220890045166, + "learning_rate": 3.1760511849509616e-05, + "loss": 2.0426, + "step": 8388 + }, + { + "epoch": 0.631475940458044, + "grad_norm": 8.138797760009766, + "learning_rate": 3.174916157595138e-05, + "loss": 1.5441, + "step": 8389 + }, + { + "epoch": 0.6315512147387041, + "grad_norm": 4.774603366851807, + "learning_rate": 3.173781238735313e-05, + "loss": 1.5635, + "step": 8390 + }, + { + "epoch": 0.6316264890193644, + "grad_norm": 5.166033744812012, + "learning_rate": 3.1726464284389527e-05, + "loss": 1.9124, + "step": 8391 + }, + { + "epoch": 0.6317017633000245, + "grad_norm": 9.103131294250488, + "learning_rate": 3.171511726773521e-05, + "loss": 2.1462, + "step": 8392 + }, + { + "epoch": 0.6317770375806846, + "grad_norm": 5.8270583152771, + "learning_rate": 3.170377133806469e-05, + "loss": 2.0279, + "step": 8393 + }, + { + "epoch": 0.6318523118613448, + "grad_norm": 4.923410415649414, + "learning_rate": 3.1692426496052465e-05, + "loss": 1.5148, + "step": 8394 + }, + { + "epoch": 0.631927586142005, + "grad_norm": 6.492119789123535, + "learning_rate": 3.1681082742372955e-05, + "loss": 1.9183, + "step": 8395 + }, + { + "epoch": 0.6320028604226651, + "grad_norm": 5.552718162536621, + "learning_rate": 3.166974007770053e-05, + "loss": 1.7525, + "step": 8396 + }, + { + "epoch": 0.6320781347033252, + "grad_norm": 5.070193767547607, + "learning_rate": 3.1658398502709436e-05, + "loss": 1.6527, + "step": 8397 + }, + { + "epoch": 0.6321534089839854, + "grad_norm": 6.275478363037109, + "learning_rate": 3.164705801807393e-05, + "loss": 1.4792, + "step": 8398 + }, + { + "epoch": 0.6322286832646455, + "grad_norm": 4.463521957397461, + "learning_rate": 3.163571862446818e-05, + "loss": 1.8394, + "step": 8399 + }, + { + "epoch": 0.6323039575453057, + "grad_norm": 5.131325721740723, + "learning_rate": 3.162438032256625e-05, + "loss": 1.6528, + "step": 8400 + }, + { + "epoch": 0.6323792318259659, + "grad_norm": 4.390284061431885, + "learning_rate": 3.1613043113042186e-05, + "loss": 1.8403, + "step": 8401 + }, + { + "epoch": 0.632454506106626, + "grad_norm": 6.3855462074279785, + "learning_rate": 3.160170699656995e-05, + "loss": 1.8047, + "step": 8402 + }, + { + "epoch": 0.6325297803872861, + "grad_norm": 5.283062934875488, + "learning_rate": 3.1590371973823455e-05, + "loss": 1.7814, + "step": 8403 + }, + { + "epoch": 0.6326050546679464, + "grad_norm": 4.808558464050293, + "learning_rate": 3.157903804547651e-05, + "loss": 1.9955, + "step": 8404 + }, + { + "epoch": 0.6326803289486065, + "grad_norm": 4.953303813934326, + "learning_rate": 3.1567705212202915e-05, + "loss": 2.5542, + "step": 8405 + }, + { + "epoch": 0.6327556032292666, + "grad_norm": 4.593639850616455, + "learning_rate": 3.155637347467633e-05, + "loss": 1.9645, + "step": 8406 + }, + { + "epoch": 0.6328308775099268, + "grad_norm": 4.8060126304626465, + "learning_rate": 3.1545042833570435e-05, + "loss": 1.7422, + "step": 8407 + }, + { + "epoch": 0.632906151790587, + "grad_norm": 4.549798965454102, + "learning_rate": 3.1533713289558784e-05, + "loss": 1.7909, + "step": 8408 + }, + { + "epoch": 0.6329814260712471, + "grad_norm": 5.148681640625, + "learning_rate": 3.152238484331491e-05, + "loss": 1.3784, + "step": 8409 + }, + { + "epoch": 0.6330567003519073, + "grad_norm": 3.6379642486572266, + "learning_rate": 3.151105749551222e-05, + "loss": 1.8599, + "step": 8410 + }, + { + "epoch": 0.6331319746325674, + "grad_norm": 4.729899883270264, + "learning_rate": 3.1499731246824124e-05, + "loss": 2.0417, + "step": 8411 + }, + { + "epoch": 0.6332072489132275, + "grad_norm": 4.906039714813232, + "learning_rate": 3.1488406097923905e-05, + "loss": 2.0639, + "step": 8412 + }, + { + "epoch": 0.6332825231938878, + "grad_norm": 4.288619518280029, + "learning_rate": 3.147708204948483e-05, + "loss": 1.603, + "step": 8413 + }, + { + "epoch": 0.6333577974745479, + "grad_norm": 4.182696342468262, + "learning_rate": 3.1465759102180106e-05, + "loss": 1.8269, + "step": 8414 + }, + { + "epoch": 0.633433071755208, + "grad_norm": 5.559972763061523, + "learning_rate": 3.145443725668279e-05, + "loss": 1.8969, + "step": 8415 + }, + { + "epoch": 0.6335083460358681, + "grad_norm": 5.187122821807861, + "learning_rate": 3.1443116513665995e-05, + "loss": 1.465, + "step": 8416 + }, + { + "epoch": 0.6335836203165284, + "grad_norm": 5.475125789642334, + "learning_rate": 3.143179687380265e-05, + "loss": 2.0753, + "step": 8417 + }, + { + "epoch": 0.6336588945971885, + "grad_norm": 4.6722798347473145, + "learning_rate": 3.1420478337765726e-05, + "loss": 1.6511, + "step": 8418 + }, + { + "epoch": 0.6337341688778486, + "grad_norm": 4.518062114715576, + "learning_rate": 3.1409160906228036e-05, + "loss": 1.7954, + "step": 8419 + }, + { + "epoch": 0.6338094431585088, + "grad_norm": 3.9041812419891357, + "learning_rate": 3.1397844579862404e-05, + "loss": 1.6306, + "step": 8420 + }, + { + "epoch": 0.633884717439169, + "grad_norm": 3.719054698944092, + "learning_rate": 3.138652935934152e-05, + "loss": 1.6765, + "step": 8421 + }, + { + "epoch": 0.6339599917198291, + "grad_norm": 6.022618293762207, + "learning_rate": 3.137521524533807e-05, + "loss": 1.6126, + "step": 8422 + }, + { + "epoch": 0.6340352660004893, + "grad_norm": 7.237251281738281, + "learning_rate": 3.1363902238524625e-05, + "loss": 1.7232, + "step": 8423 + }, + { + "epoch": 0.6341105402811494, + "grad_norm": 4.522614002227783, + "learning_rate": 3.135259033957373e-05, + "loss": 1.7325, + "step": 8424 + }, + { + "epoch": 0.6341858145618096, + "grad_norm": 7.520346641540527, + "learning_rate": 3.134127954915782e-05, + "loss": 1.6413, + "step": 8425 + }, + { + "epoch": 0.6342610888424698, + "grad_norm": 5.1252899169921875, + "learning_rate": 3.1329969867949316e-05, + "loss": 1.9181, + "step": 8426 + }, + { + "epoch": 0.6343363631231299, + "grad_norm": 4.5424933433532715, + "learning_rate": 3.131866129662052e-05, + "loss": 1.9148, + "step": 8427 + }, + { + "epoch": 0.63441163740379, + "grad_norm": 4.337066173553467, + "learning_rate": 3.130735383584372e-05, + "loss": 1.7209, + "step": 8428 + }, + { + "epoch": 0.6344869116844503, + "grad_norm": 6.601905345916748, + "learning_rate": 3.129604748629108e-05, + "loss": 2.2253, + "step": 8429 + }, + { + "epoch": 0.6345621859651104, + "grad_norm": 5.095047950744629, + "learning_rate": 3.1284742248634755e-05, + "loss": 1.7484, + "step": 8430 + }, + { + "epoch": 0.6346374602457705, + "grad_norm": 6.78175163269043, + "learning_rate": 3.127343812354682e-05, + "loss": 1.8814, + "step": 8431 + }, + { + "epoch": 0.6347127345264307, + "grad_norm": 4.827413558959961, + "learning_rate": 3.126213511169924e-05, + "loss": 2.1398, + "step": 8432 + }, + { + "epoch": 0.6347880088070909, + "grad_norm": 6.343512535095215, + "learning_rate": 3.125083321376398e-05, + "loss": 1.5024, + "step": 8433 + }, + { + "epoch": 0.634863283087751, + "grad_norm": 5.82692289352417, + "learning_rate": 3.1239532430412866e-05, + "loss": 1.9298, + "step": 8434 + }, + { + "epoch": 0.6349385573684111, + "grad_norm": 5.086280345916748, + "learning_rate": 3.1228232762317734e-05, + "loss": 1.5825, + "step": 8435 + }, + { + "epoch": 0.6350138316490713, + "grad_norm": 5.925399303436279, + "learning_rate": 3.1216934210150286e-05, + "loss": 1.8492, + "step": 8436 + }, + { + "epoch": 0.6350891059297314, + "grad_norm": 6.042376518249512, + "learning_rate": 3.120563677458224e-05, + "loss": 1.873, + "step": 8437 + }, + { + "epoch": 0.6351643802103916, + "grad_norm": 4.674612998962402, + "learning_rate": 3.119434045628512e-05, + "loss": 1.6808, + "step": 8438 + }, + { + "epoch": 0.6352396544910518, + "grad_norm": 5.1638007164001465, + "learning_rate": 3.118304525593052e-05, + "loss": 1.801, + "step": 8439 + }, + { + "epoch": 0.6353149287717119, + "grad_norm": 4.807487487792969, + "learning_rate": 3.117175117418988e-05, + "loss": 1.8879, + "step": 8440 + }, + { + "epoch": 0.635390203052372, + "grad_norm": 6.7060112953186035, + "learning_rate": 3.116045821173462e-05, + "loss": 1.8532, + "step": 8441 + }, + { + "epoch": 0.6354654773330323, + "grad_norm": 4.874916076660156, + "learning_rate": 3.114916636923605e-05, + "loss": 1.9274, + "step": 8442 + }, + { + "epoch": 0.6355407516136924, + "grad_norm": 8.288866996765137, + "learning_rate": 3.1137875647365466e-05, + "loss": 1.7894, + "step": 8443 + }, + { + "epoch": 0.6356160258943525, + "grad_norm": 5.885791778564453, + "learning_rate": 3.112658604679404e-05, + "loss": 1.8041, + "step": 8444 + }, + { + "epoch": 0.6356913001750127, + "grad_norm": 5.1199951171875, + "learning_rate": 3.1115297568192915e-05, + "loss": 1.4772, + "step": 8445 + }, + { + "epoch": 0.6357665744556729, + "grad_norm": 4.650871753692627, + "learning_rate": 3.110401021223317e-05, + "loss": 1.3804, + "step": 8446 + }, + { + "epoch": 0.635841848736333, + "grad_norm": 6.74761438369751, + "learning_rate": 3.1092723979585795e-05, + "loss": 1.9824, + "step": 8447 + }, + { + "epoch": 0.6359171230169932, + "grad_norm": 6.541318416595459, + "learning_rate": 3.108143887092175e-05, + "loss": 1.732, + "step": 8448 + }, + { + "epoch": 0.6359923972976533, + "grad_norm": 5.873243808746338, + "learning_rate": 3.107015488691185e-05, + "loss": 2.0946, + "step": 8449 + }, + { + "epoch": 0.6360676715783135, + "grad_norm": 4.697175979614258, + "learning_rate": 3.105887202822696e-05, + "loss": 1.6158, + "step": 8450 + }, + { + "epoch": 0.6361429458589737, + "grad_norm": 4.365893363952637, + "learning_rate": 3.1047590295537754e-05, + "loss": 1.4763, + "step": 8451 + }, + { + "epoch": 0.6362182201396338, + "grad_norm": 7.688378810882568, + "learning_rate": 3.103630968951493e-05, + "loss": 1.859, + "step": 8452 + }, + { + "epoch": 0.6362934944202939, + "grad_norm": 4.652393341064453, + "learning_rate": 3.102503021082907e-05, + "loss": 1.9668, + "step": 8453 + }, + { + "epoch": 0.636368768700954, + "grad_norm": 3.991792917251587, + "learning_rate": 3.101375186015075e-05, + "loss": 1.8417, + "step": 8454 + }, + { + "epoch": 0.6364440429816143, + "grad_norm": 4.749658584594727, + "learning_rate": 3.1002474638150376e-05, + "loss": 1.873, + "step": 8455 + }, + { + "epoch": 0.6365193172622744, + "grad_norm": 4.839898586273193, + "learning_rate": 3.099119854549839e-05, + "loss": 2.2917, + "step": 8456 + }, + { + "epoch": 0.6365945915429345, + "grad_norm": 4.51483678817749, + "learning_rate": 3.097992358286509e-05, + "loss": 1.7918, + "step": 8457 + }, + { + "epoch": 0.6366698658235947, + "grad_norm": 5.897151470184326, + "learning_rate": 3.0968649750920784e-05, + "loss": 1.9302, + "step": 8458 + }, + { + "epoch": 0.6367451401042549, + "grad_norm": 5.364502906799316, + "learning_rate": 3.0957377050335624e-05, + "loss": 2.0121, + "step": 8459 + }, + { + "epoch": 0.636820414384915, + "grad_norm": 4.192001819610596, + "learning_rate": 3.0946105481779776e-05, + "loss": 1.9458, + "step": 8460 + }, + { + "epoch": 0.6368956886655752, + "grad_norm": 5.836835861206055, + "learning_rate": 3.093483504592326e-05, + "loss": 1.8303, + "step": 8461 + }, + { + "epoch": 0.6369709629462353, + "grad_norm": 6.19293212890625, + "learning_rate": 3.092356574343611e-05, + "loss": 2.2205, + "step": 8462 + }, + { + "epoch": 0.6370462372268955, + "grad_norm": 5.912998676300049, + "learning_rate": 3.091229757498825e-05, + "loss": 1.551, + "step": 8463 + }, + { + "epoch": 0.6371215115075557, + "grad_norm": 7.025411605834961, + "learning_rate": 3.090103054124951e-05, + "loss": 1.3883, + "step": 8464 + }, + { + "epoch": 0.6371967857882158, + "grad_norm": 4.694859981536865, + "learning_rate": 3.0889764642889725e-05, + "loss": 1.919, + "step": 8465 + }, + { + "epoch": 0.6372720600688759, + "grad_norm": 4.134038925170898, + "learning_rate": 3.087849988057858e-05, + "loss": 1.7212, + "step": 8466 + }, + { + "epoch": 0.6373473343495362, + "grad_norm": 5.514676570892334, + "learning_rate": 3.0867236254985764e-05, + "loss": 1.8228, + "step": 8467 + }, + { + "epoch": 0.6374226086301963, + "grad_norm": 5.282869338989258, + "learning_rate": 3.085597376678084e-05, + "loss": 1.9211, + "step": 8468 + }, + { + "epoch": 0.6374978829108564, + "grad_norm": 5.633350372314453, + "learning_rate": 3.084471241663337e-05, + "loss": 2.0835, + "step": 8469 + }, + { + "epoch": 0.6375731571915166, + "grad_norm": 5.962558746337891, + "learning_rate": 3.083345220521276e-05, + "loss": 1.948, + "step": 8470 + }, + { + "epoch": 0.6376484314721768, + "grad_norm": 4.047971725463867, + "learning_rate": 3.082219313318844e-05, + "loss": 1.7437, + "step": 8471 + }, + { + "epoch": 0.6377237057528369, + "grad_norm": 4.637302398681641, + "learning_rate": 3.0810935201229694e-05, + "loss": 1.6338, + "step": 8472 + }, + { + "epoch": 0.6377989800334971, + "grad_norm": 6.315282344818115, + "learning_rate": 3.0799678410005806e-05, + "loss": 2.0675, + "step": 8473 + }, + { + "epoch": 0.6378742543141572, + "grad_norm": 4.907542705535889, + "learning_rate": 3.0788422760185926e-05, + "loss": 1.9194, + "step": 8474 + }, + { + "epoch": 0.6379495285948173, + "grad_norm": 5.113279342651367, + "learning_rate": 3.0777168252439217e-05, + "loss": 2.0411, + "step": 8475 + }, + { + "epoch": 0.6380248028754775, + "grad_norm": 6.089837551116943, + "learning_rate": 3.0765914887434665e-05, + "loss": 1.6365, + "step": 8476 + }, + { + "epoch": 0.6381000771561377, + "grad_norm": 5.753455638885498, + "learning_rate": 3.075466266584132e-05, + "loss": 1.6322, + "step": 8477 + }, + { + "epoch": 0.6381753514367978, + "grad_norm": 5.757205963134766, + "learning_rate": 3.074341158832803e-05, + "loss": 1.9458, + "step": 8478 + }, + { + "epoch": 0.6382506257174579, + "grad_norm": 4.642826557159424, + "learning_rate": 3.073216165556367e-05, + "loss": 1.889, + "step": 8479 + }, + { + "epoch": 0.6383258999981182, + "grad_norm": 5.545953273773193, + "learning_rate": 3.072091286821702e-05, + "loss": 1.4403, + "step": 8480 + }, + { + "epoch": 0.6384011742787783, + "grad_norm": 5.29569149017334, + "learning_rate": 3.070966522695677e-05, + "loss": 1.6189, + "step": 8481 + }, + { + "epoch": 0.6384764485594384, + "grad_norm": 7.771848201751709, + "learning_rate": 3.069841873245161e-05, + "loss": 2.1126, + "step": 8482 + }, + { + "epoch": 0.6385517228400986, + "grad_norm": 5.504758358001709, + "learning_rate": 3.068717338537004e-05, + "loss": 1.8735, + "step": 8483 + }, + { + "epoch": 0.6386269971207588, + "grad_norm": 5.3497633934021, + "learning_rate": 3.06759291863806e-05, + "loss": 1.6839, + "step": 8484 + }, + { + "epoch": 0.6387022714014189, + "grad_norm": 4.654297351837158, + "learning_rate": 3.066468613615173e-05, + "loss": 1.7848, + "step": 8485 + }, + { + "epoch": 0.6387775456820791, + "grad_norm": 4.21811056137085, + "learning_rate": 3.0653444235351805e-05, + "loss": 1.7197, + "step": 8486 + }, + { + "epoch": 0.6388528199627392, + "grad_norm": 5.228713512420654, + "learning_rate": 3.064220348464908e-05, + "loss": 1.895, + "step": 8487 + }, + { + "epoch": 0.6389280942433994, + "grad_norm": 4.425973892211914, + "learning_rate": 3.0630963884711836e-05, + "loss": 1.6014, + "step": 8488 + }, + { + "epoch": 0.6390033685240596, + "grad_norm": 5.489468097686768, + "learning_rate": 3.06197254362082e-05, + "loss": 1.5466, + "step": 8489 + }, + { + "epoch": 0.6390786428047197, + "grad_norm": 5.966813087463379, + "learning_rate": 3.060848813980628e-05, + "loss": 2.0535, + "step": 8490 + }, + { + "epoch": 0.6391539170853798, + "grad_norm": 5.415655136108398, + "learning_rate": 3.05972519961741e-05, + "loss": 1.7442, + "step": 8491 + }, + { + "epoch": 0.6392291913660401, + "grad_norm": 8.164666175842285, + "learning_rate": 3.058601700597963e-05, + "loss": 2.2542, + "step": 8492 + }, + { + "epoch": 0.6393044656467002, + "grad_norm": 4.809131622314453, + "learning_rate": 3.0574783169890724e-05, + "loss": 1.637, + "step": 8493 + }, + { + "epoch": 0.6393797399273603, + "grad_norm": 5.912126064300537, + "learning_rate": 3.056355048857522e-05, + "loss": 1.861, + "step": 8494 + }, + { + "epoch": 0.6394550142080204, + "grad_norm": 5.254271984100342, + "learning_rate": 3.0552318962700885e-05, + "loss": 1.5742, + "step": 8495 + }, + { + "epoch": 0.6395302884886807, + "grad_norm": 4.5365447998046875, + "learning_rate": 3.0541088592935374e-05, + "loss": 1.8213, + "step": 8496 + }, + { + "epoch": 0.6396055627693408, + "grad_norm": 6.108835697174072, + "learning_rate": 3.0529859379946334e-05, + "loss": 1.6501, + "step": 8497 + }, + { + "epoch": 0.6396808370500009, + "grad_norm": 4.741199493408203, + "learning_rate": 3.051863132440126e-05, + "loss": 1.4316, + "step": 8498 + }, + { + "epoch": 0.6397561113306611, + "grad_norm": 4.340286731719971, + "learning_rate": 3.0507404426967685e-05, + "loss": 1.7586, + "step": 8499 + }, + { + "epoch": 0.6398313856113212, + "grad_norm": 3.8744378089904785, + "learning_rate": 3.049617868831296e-05, + "loss": 1.7396, + "step": 8500 + }, + { + "epoch": 0.6399066598919814, + "grad_norm": 5.0242228507995605, + "learning_rate": 3.0484954109104462e-05, + "loss": 1.6887, + "step": 8501 + }, + { + "epoch": 0.6399819341726416, + "grad_norm": 4.608253479003906, + "learning_rate": 3.047373069000944e-05, + "loss": 1.7822, + "step": 8502 + }, + { + "epoch": 0.6400572084533017, + "grad_norm": 3.910228967666626, + "learning_rate": 3.0462508431695112e-05, + "loss": 1.747, + "step": 8503 + }, + { + "epoch": 0.6401324827339618, + "grad_norm": 5.269796371459961, + "learning_rate": 3.045128733482859e-05, + "loss": 1.6867, + "step": 8504 + }, + { + "epoch": 0.6402077570146221, + "grad_norm": 4.4679059982299805, + "learning_rate": 3.0440067400076954e-05, + "loss": 1.7267, + "step": 8505 + }, + { + "epoch": 0.6402830312952822, + "grad_norm": 4.875297546386719, + "learning_rate": 3.0428848628107176e-05, + "loss": 1.5967, + "step": 8506 + }, + { + "epoch": 0.6403583055759423, + "grad_norm": 6.765722751617432, + "learning_rate": 3.0417631019586197e-05, + "loss": 1.6587, + "step": 8507 + }, + { + "epoch": 0.6404335798566025, + "grad_norm": 7.07462215423584, + "learning_rate": 3.0406414575180853e-05, + "loss": 1.7844, + "step": 8508 + }, + { + "epoch": 0.6405088541372627, + "grad_norm": 6.472033500671387, + "learning_rate": 3.0395199295557965e-05, + "loss": 1.3962, + "step": 8509 + }, + { + "epoch": 0.6405841284179228, + "grad_norm": 5.995518684387207, + "learning_rate": 3.0383985181384213e-05, + "loss": 1.9082, + "step": 8510 + }, + { + "epoch": 0.640659402698583, + "grad_norm": 3.9193172454833984, + "learning_rate": 3.0372772233326242e-05, + "loss": 1.9017, + "step": 8511 + }, + { + "epoch": 0.6407346769792431, + "grad_norm": 5.115964889526367, + "learning_rate": 3.0361560452050664e-05, + "loss": 1.7711, + "step": 8512 + }, + { + "epoch": 0.6408099512599033, + "grad_norm": 4.943985939025879, + "learning_rate": 3.0350349838223958e-05, + "loss": 1.9811, + "step": 8513 + }, + { + "epoch": 0.6408852255405634, + "grad_norm": 6.502768516540527, + "learning_rate": 3.03391403925126e-05, + "loss": 1.8597, + "step": 8514 + }, + { + "epoch": 0.6409604998212236, + "grad_norm": 6.17203426361084, + "learning_rate": 3.0327932115582903e-05, + "loss": 1.9055, + "step": 8515 + }, + { + "epoch": 0.6410357741018837, + "grad_norm": 5.492644309997559, + "learning_rate": 3.0316725008101215e-05, + "loss": 1.9197, + "step": 8516 + }, + { + "epoch": 0.6411110483825438, + "grad_norm": 4.3855485916137695, + "learning_rate": 3.0305519070733734e-05, + "loss": 1.5706, + "step": 8517 + }, + { + "epoch": 0.6411863226632041, + "grad_norm": 6.03162956237793, + "learning_rate": 3.0294314304146642e-05, + "loss": 2.1461, + "step": 8518 + }, + { + "epoch": 0.6412615969438642, + "grad_norm": 4.356393337249756, + "learning_rate": 3.0283110709006014e-05, + "loss": 1.7449, + "step": 8519 + }, + { + "epoch": 0.6413368712245243, + "grad_norm": 7.34688138961792, + "learning_rate": 3.027190828597789e-05, + "loss": 1.634, + "step": 8520 + }, + { + "epoch": 0.6414121455051845, + "grad_norm": 4.727262020111084, + "learning_rate": 3.0260707035728208e-05, + "loss": 2.0758, + "step": 8521 + }, + { + "epoch": 0.6414874197858447, + "grad_norm": 6.444448947906494, + "learning_rate": 3.0249506958922857e-05, + "loss": 1.7117, + "step": 8522 + }, + { + "epoch": 0.6415626940665048, + "grad_norm": 5.084204196929932, + "learning_rate": 3.0238308056227633e-05, + "loss": 1.752, + "step": 8523 + }, + { + "epoch": 0.641637968347165, + "grad_norm": 4.34086275100708, + "learning_rate": 3.022711032830832e-05, + "loss": 1.8848, + "step": 8524 + }, + { + "epoch": 0.6417132426278251, + "grad_norm": 7.672061443328857, + "learning_rate": 3.0215913775830533e-05, + "loss": 1.908, + "step": 8525 + }, + { + "epoch": 0.6417885169084853, + "grad_norm": 4.4123663902282715, + "learning_rate": 3.0204718399459913e-05, + "loss": 1.502, + "step": 8526 + }, + { + "epoch": 0.6418637911891455, + "grad_norm": 5.0550856590271, + "learning_rate": 3.0193524199862e-05, + "loss": 2.139, + "step": 8527 + }, + { + "epoch": 0.6419390654698056, + "grad_norm": 5.212312698364258, + "learning_rate": 3.018233117770223e-05, + "loss": 1.972, + "step": 8528 + }, + { + "epoch": 0.6420143397504657, + "grad_norm": 4.040462970733643, + "learning_rate": 3.0171139333646014e-05, + "loss": 1.4669, + "step": 8529 + }, + { + "epoch": 0.642089614031126, + "grad_norm": 16.13506507873535, + "learning_rate": 3.0159948668358662e-05, + "loss": 1.2675, + "step": 8530 + }, + { + "epoch": 0.6421648883117861, + "grad_norm": 6.759674072265625, + "learning_rate": 3.014875918250545e-05, + "loss": 1.8255, + "step": 8531 + }, + { + "epoch": 0.6422401625924462, + "grad_norm": 4.3888325691223145, + "learning_rate": 3.0137570876751524e-05, + "loss": 1.6966, + "step": 8532 + }, + { + "epoch": 0.6423154368731063, + "grad_norm": 5.11987829208374, + "learning_rate": 3.0126383751762043e-05, + "loss": 1.7512, + "step": 8533 + }, + { + "epoch": 0.6423907111537666, + "grad_norm": 6.475493907928467, + "learning_rate": 3.0115197808202e-05, + "loss": 1.6936, + "step": 8534 + }, + { + "epoch": 0.6424659854344267, + "grad_norm": 4.794989585876465, + "learning_rate": 3.0104013046736402e-05, + "loss": 1.4118, + "step": 8535 + }, + { + "epoch": 0.6425412597150868, + "grad_norm": 5.625105381011963, + "learning_rate": 3.0092829468030125e-05, + "loss": 1.6277, + "step": 8536 + }, + { + "epoch": 0.642616533995747, + "grad_norm": 4.115213394165039, + "learning_rate": 3.0081647072748043e-05, + "loss": 1.7945, + "step": 8537 + }, + { + "epoch": 0.6426918082764072, + "grad_norm": 5.181756496429443, + "learning_rate": 3.0070465861554874e-05, + "loss": 1.5837, + "step": 8538 + }, + { + "epoch": 0.6427670825570673, + "grad_norm": 8.046330451965332, + "learning_rate": 3.005928583511533e-05, + "loss": 2.0312, + "step": 8539 + }, + { + "epoch": 0.6428423568377275, + "grad_norm": 5.965188026428223, + "learning_rate": 3.0048106994094016e-05, + "loss": 1.8734, + "step": 8540 + }, + { + "epoch": 0.6429176311183876, + "grad_norm": 4.9047956466674805, + "learning_rate": 3.003692933915552e-05, + "loss": 1.599, + "step": 8541 + }, + { + "epoch": 0.6429929053990477, + "grad_norm": 4.987523078918457, + "learning_rate": 3.0025752870964262e-05, + "loss": 1.9271, + "step": 8542 + }, + { + "epoch": 0.643068179679708, + "grad_norm": 6.196382999420166, + "learning_rate": 3.001457759018469e-05, + "loss": 1.6547, + "step": 8543 + }, + { + "epoch": 0.6431434539603681, + "grad_norm": 8.652907371520996, + "learning_rate": 3.0003403497481152e-05, + "loss": 1.8071, + "step": 8544 + }, + { + "epoch": 0.6432187282410282, + "grad_norm": 5.0724406242370605, + "learning_rate": 2.999223059351789e-05, + "loss": 1.8669, + "step": 8545 + }, + { + "epoch": 0.6432940025216884, + "grad_norm": 6.797928810119629, + "learning_rate": 2.998105887895911e-05, + "loss": 1.7042, + "step": 8546 + }, + { + "epoch": 0.6433692768023486, + "grad_norm": 5.89650821685791, + "learning_rate": 2.9969888354468933e-05, + "loss": 1.861, + "step": 8547 + }, + { + "epoch": 0.6434445510830087, + "grad_norm": 6.362919807434082, + "learning_rate": 2.995871902071144e-05, + "loss": 1.5493, + "step": 8548 + }, + { + "epoch": 0.6435198253636689, + "grad_norm": 3.9971113204956055, + "learning_rate": 2.9947550878350572e-05, + "loss": 1.458, + "step": 8549 + }, + { + "epoch": 0.643595099644329, + "grad_norm": 5.029232501983643, + "learning_rate": 2.9936383928050284e-05, + "loss": 1.4297, + "step": 8550 + }, + { + "epoch": 0.6436703739249892, + "grad_norm": 6.111117839813232, + "learning_rate": 2.9925218170474402e-05, + "loss": 1.9348, + "step": 8551 + }, + { + "epoch": 0.6437456482056494, + "grad_norm": 5.975658416748047, + "learning_rate": 2.9914053606286695e-05, + "loss": 1.9114, + "step": 8552 + }, + { + "epoch": 0.6438209224863095, + "grad_norm": 5.5311455726623535, + "learning_rate": 2.990289023615086e-05, + "loss": 1.7824, + "step": 8553 + }, + { + "epoch": 0.6438961967669696, + "grad_norm": 4.59359073638916, + "learning_rate": 2.9891728060730555e-05, + "loss": 2.1552, + "step": 8554 + }, + { + "epoch": 0.6439714710476298, + "grad_norm": 4.412354469299316, + "learning_rate": 2.9880567080689303e-05, + "loss": 1.7128, + "step": 8555 + }, + { + "epoch": 0.64404674532829, + "grad_norm": 4.536726474761963, + "learning_rate": 2.986940729669061e-05, + "loss": 1.7408, + "step": 8556 + }, + { + "epoch": 0.6441220196089501, + "grad_norm": 5.419759273529053, + "learning_rate": 2.985824870939789e-05, + "loss": 1.6995, + "step": 8557 + }, + { + "epoch": 0.6441972938896102, + "grad_norm": 5.157083988189697, + "learning_rate": 2.9847091319474485e-05, + "loss": 2.0251, + "step": 8558 + }, + { + "epoch": 0.6442725681702705, + "grad_norm": 7.159788608551025, + "learning_rate": 2.9835935127583703e-05, + "loss": 2.1951, + "step": 8559 + }, + { + "epoch": 0.6443478424509306, + "grad_norm": 4.922720432281494, + "learning_rate": 2.9824780134388697e-05, + "loss": 1.576, + "step": 8560 + }, + { + "epoch": 0.6444231167315907, + "grad_norm": 8.038802146911621, + "learning_rate": 2.981362634055265e-05, + "loss": 2.0754, + "step": 8561 + }, + { + "epoch": 0.6444983910122509, + "grad_norm": 8.4837007522583, + "learning_rate": 2.9802473746738562e-05, + "loss": 1.8285, + "step": 8562 + }, + { + "epoch": 0.644573665292911, + "grad_norm": 6.366405963897705, + "learning_rate": 2.9791322353609473e-05, + "loss": 1.6207, + "step": 8563 + }, + { + "epoch": 0.6446489395735712, + "grad_norm": 6.312903881072998, + "learning_rate": 2.978017216182828e-05, + "loss": 2.2482, + "step": 8564 + }, + { + "epoch": 0.6447242138542314, + "grad_norm": 4.105279922485352, + "learning_rate": 2.976902317205785e-05, + "loss": 1.3769, + "step": 8565 + }, + { + "epoch": 0.6447994881348915, + "grad_norm": 4.707040786743164, + "learning_rate": 2.975787538496092e-05, + "loss": 1.7346, + "step": 8566 + }, + { + "epoch": 0.6448747624155516, + "grad_norm": 5.906081676483154, + "learning_rate": 2.974672880120023e-05, + "loss": 1.5794, + "step": 8567 + }, + { + "epoch": 0.6449500366962119, + "grad_norm": 6.209989547729492, + "learning_rate": 2.973558342143839e-05, + "loss": 2.2461, + "step": 8568 + }, + { + "epoch": 0.645025310976872, + "grad_norm": 4.54546594619751, + "learning_rate": 2.9724439246337987e-05, + "loss": 1.745, + "step": 8569 + }, + { + "epoch": 0.6451005852575321, + "grad_norm": 5.0926384925842285, + "learning_rate": 2.971329627656148e-05, + "loss": 1.8897, + "step": 8570 + }, + { + "epoch": 0.6451758595381923, + "grad_norm": 6.7401533126831055, + "learning_rate": 2.970215451277132e-05, + "loss": 1.8301, + "step": 8571 + }, + { + "epoch": 0.6452511338188525, + "grad_norm": 8.800064086914062, + "learning_rate": 2.9691013955629808e-05, + "loss": 1.8959, + "step": 8572 + }, + { + "epoch": 0.6453264080995126, + "grad_norm": 5.441192626953125, + "learning_rate": 2.9679874605799257e-05, + "loss": 1.8238, + "step": 8573 + }, + { + "epoch": 0.6454016823801727, + "grad_norm": 4.746504783630371, + "learning_rate": 2.9668736463941844e-05, + "loss": 2.0758, + "step": 8574 + }, + { + "epoch": 0.6454769566608329, + "grad_norm": 6.459043979644775, + "learning_rate": 2.9657599530719714e-05, + "loss": 2.0284, + "step": 8575 + }, + { + "epoch": 0.645552230941493, + "grad_norm": 5.312110424041748, + "learning_rate": 2.964646380679494e-05, + "loss": 1.4591, + "step": 8576 + }, + { + "epoch": 0.6456275052221532, + "grad_norm": 7.225520133972168, + "learning_rate": 2.963532929282947e-05, + "loss": 1.8167, + "step": 8577 + }, + { + "epoch": 0.6457027795028134, + "grad_norm": 6.189176559448242, + "learning_rate": 2.9624195989485264e-05, + "loss": 1.9349, + "step": 8578 + }, + { + "epoch": 0.6457780537834735, + "grad_norm": 7.673556327819824, + "learning_rate": 2.961306389742412e-05, + "loss": 1.5972, + "step": 8579 + }, + { + "epoch": 0.6458533280641336, + "grad_norm": 7.309333324432373, + "learning_rate": 2.9601933017307847e-05, + "loss": 1.7686, + "step": 8580 + }, + { + "epoch": 0.6459286023447939, + "grad_norm": 4.640376567840576, + "learning_rate": 2.959080334979811e-05, + "loss": 1.9322, + "step": 8581 + }, + { + "epoch": 0.646003876625454, + "grad_norm": 6.656774997711182, + "learning_rate": 2.9579674895556574e-05, + "loss": 1.8778, + "step": 8582 + }, + { + "epoch": 0.6460791509061141, + "grad_norm": 5.470804691314697, + "learning_rate": 2.956854765524476e-05, + "loss": 1.8805, + "step": 8583 + }, + { + "epoch": 0.6461544251867744, + "grad_norm": 6.081241130828857, + "learning_rate": 2.9557421629524163e-05, + "loss": 1.6681, + "step": 8584 + }, + { + "epoch": 0.6462296994674345, + "grad_norm": 6.941330909729004, + "learning_rate": 2.9546296819056192e-05, + "loss": 1.8574, + "step": 8585 + }, + { + "epoch": 0.6463049737480946, + "grad_norm": 5.426976680755615, + "learning_rate": 2.95351732245022e-05, + "loss": 1.8432, + "step": 8586 + }, + { + "epoch": 0.6463802480287548, + "grad_norm": 5.855489730834961, + "learning_rate": 2.9524050846523427e-05, + "loss": 1.9034, + "step": 8587 + }, + { + "epoch": 0.6464555223094149, + "grad_norm": 5.917722702026367, + "learning_rate": 2.9512929685781092e-05, + "loss": 2.2343, + "step": 8588 + }, + { + "epoch": 0.6465307965900751, + "grad_norm": 6.118962287902832, + "learning_rate": 2.9501809742936292e-05, + "loss": 1.642, + "step": 8589 + }, + { + "epoch": 0.6466060708707353, + "grad_norm": 6.862045764923096, + "learning_rate": 2.9490691018650084e-05, + "loss": 1.9824, + "step": 8590 + }, + { + "epoch": 0.6466813451513954, + "grad_norm": 3.6708619594573975, + "learning_rate": 2.9479573513583446e-05, + "loss": 1.6212, + "step": 8591 + }, + { + "epoch": 0.6467566194320555, + "grad_norm": 4.685216903686523, + "learning_rate": 2.9468457228397284e-05, + "loss": 1.6677, + "step": 8592 + }, + { + "epoch": 0.6468318937127157, + "grad_norm": 4.831169605255127, + "learning_rate": 2.9457342163752443e-05, + "loss": 1.7129, + "step": 8593 + }, + { + "epoch": 0.6469071679933759, + "grad_norm": 4.635401248931885, + "learning_rate": 2.9446228320309655e-05, + "loss": 1.6859, + "step": 8594 + }, + { + "epoch": 0.646982442274036, + "grad_norm": 4.3091864585876465, + "learning_rate": 2.943511569872962e-05, + "loss": 1.7232, + "step": 8595 + }, + { + "epoch": 0.6470577165546961, + "grad_norm": 5.619930744171143, + "learning_rate": 2.9424004299672948e-05, + "loss": 1.566, + "step": 8596 + }, + { + "epoch": 0.6471329908353564, + "grad_norm": 5.065615653991699, + "learning_rate": 2.9412894123800195e-05, + "loss": 1.5106, + "step": 8597 + }, + { + "epoch": 0.6472082651160165, + "grad_norm": 4.372215747833252, + "learning_rate": 2.940178517177179e-05, + "loss": 1.8035, + "step": 8598 + }, + { + "epoch": 0.6472835393966766, + "grad_norm": 5.182182788848877, + "learning_rate": 2.939067744424818e-05, + "loss": 1.595, + "step": 8599 + }, + { + "epoch": 0.6473588136773368, + "grad_norm": 7.073099613189697, + "learning_rate": 2.9379570941889643e-05, + "loss": 1.5772, + "step": 8600 + }, + { + "epoch": 0.647434087957997, + "grad_norm": 5.8310956954956055, + "learning_rate": 2.9368465665356448e-05, + "loss": 1.7128, + "step": 8601 + }, + { + "epoch": 0.6475093622386571, + "grad_norm": 4.4995436668396, + "learning_rate": 2.935736161530877e-05, + "loss": 1.667, + "step": 8602 + }, + { + "epoch": 0.6475846365193173, + "grad_norm": 5.390491485595703, + "learning_rate": 2.9346258792406734e-05, + "loss": 1.7406, + "step": 8603 + }, + { + "epoch": 0.6476599107999774, + "grad_norm": 5.841893672943115, + "learning_rate": 2.9335157197310327e-05, + "loss": 1.8724, + "step": 8604 + }, + { + "epoch": 0.6477351850806375, + "grad_norm": 4.496160984039307, + "learning_rate": 2.932405683067955e-05, + "loss": 1.8703, + "step": 8605 + }, + { + "epoch": 0.6478104593612978, + "grad_norm": 4.957056045532227, + "learning_rate": 2.931295769317425e-05, + "loss": 1.6683, + "step": 8606 + }, + { + "epoch": 0.6478857336419579, + "grad_norm": 6.544439315795898, + "learning_rate": 2.9301859785454254e-05, + "loss": 1.979, + "step": 8607 + }, + { + "epoch": 0.647961007922618, + "grad_norm": 6.328692436218262, + "learning_rate": 2.929076310817932e-05, + "loss": 2.0248, + "step": 8608 + }, + { + "epoch": 0.6480362822032782, + "grad_norm": 6.0852484703063965, + "learning_rate": 2.927966766200908e-05, + "loss": 1.8166, + "step": 8609 + }, + { + "epoch": 0.6481115564839384, + "grad_norm": 5.802111625671387, + "learning_rate": 2.926857344760317e-05, + "loss": 1.6243, + "step": 8610 + }, + { + "epoch": 0.6481868307645985, + "grad_norm": 6.164817810058594, + "learning_rate": 2.9257480465621063e-05, + "loss": 2.1907, + "step": 8611 + }, + { + "epoch": 0.6482621050452586, + "grad_norm": 5.88037109375, + "learning_rate": 2.9246388716722233e-05, + "loss": 1.9438, + "step": 8612 + }, + { + "epoch": 0.6483373793259188, + "grad_norm": 4.45822811126709, + "learning_rate": 2.9235298201566037e-05, + "loss": 1.9145, + "step": 8613 + }, + { + "epoch": 0.648412653606579, + "grad_norm": 5.1589813232421875, + "learning_rate": 2.9224208920811803e-05, + "loss": 1.6335, + "step": 8614 + }, + { + "epoch": 0.6484879278872391, + "grad_norm": 4.337166786193848, + "learning_rate": 2.9213120875118715e-05, + "loss": 1.888, + "step": 8615 + }, + { + "epoch": 0.6485632021678993, + "grad_norm": 4.947061061859131, + "learning_rate": 2.9202034065145967e-05, + "loss": 2.1875, + "step": 8616 + }, + { + "epoch": 0.6486384764485594, + "grad_norm": 5.6075005531311035, + "learning_rate": 2.9190948491552596e-05, + "loss": 1.7302, + "step": 8617 + }, + { + "epoch": 0.6487137507292196, + "grad_norm": 8.494487762451172, + "learning_rate": 2.917986415499765e-05, + "loss": 2.1678, + "step": 8618 + }, + { + "epoch": 0.6487890250098798, + "grad_norm": 4.403011322021484, + "learning_rate": 2.916878105614002e-05, + "loss": 1.8067, + "step": 8619 + }, + { + "epoch": 0.6488642992905399, + "grad_norm": 5.305337429046631, + "learning_rate": 2.9157699195638588e-05, + "loss": 1.6376, + "step": 8620 + }, + { + "epoch": 0.6489395735712, + "grad_norm": 3.4083411693573, + "learning_rate": 2.9146618574152128e-05, + "loss": 1.9024, + "step": 8621 + }, + { + "epoch": 0.6490148478518603, + "grad_norm": 5.2513604164123535, + "learning_rate": 2.913553919233939e-05, + "loss": 1.9188, + "step": 8622 + }, + { + "epoch": 0.6490901221325204, + "grad_norm": 4.839473247528076, + "learning_rate": 2.9124461050858954e-05, + "loss": 1.9511, + "step": 8623 + }, + { + "epoch": 0.6491653964131805, + "grad_norm": 4.708089351654053, + "learning_rate": 2.9113384150369406e-05, + "loss": 1.9867, + "step": 8624 + }, + { + "epoch": 0.6492406706938407, + "grad_norm": 5.063660144805908, + "learning_rate": 2.910230849152926e-05, + "loss": 1.7267, + "step": 8625 + }, + { + "epoch": 0.6493159449745008, + "grad_norm": 4.762404918670654, + "learning_rate": 2.9091234074996897e-05, + "loss": 1.767, + "step": 8626 + }, + { + "epoch": 0.649391219255161, + "grad_norm": 5.119905471801758, + "learning_rate": 2.9080160901430698e-05, + "loss": 1.8115, + "step": 8627 + }, + { + "epoch": 0.6494664935358212, + "grad_norm": 4.938631534576416, + "learning_rate": 2.9069088971488877e-05, + "loss": 1.4761, + "step": 8628 + }, + { + "epoch": 0.6495417678164813, + "grad_norm": 5.724737644195557, + "learning_rate": 2.905801828582969e-05, + "loss": 1.7815, + "step": 8629 + }, + { + "epoch": 0.6496170420971414, + "grad_norm": 5.352190971374512, + "learning_rate": 2.9046948845111198e-05, + "loss": 1.9752, + "step": 8630 + }, + { + "epoch": 0.6496923163778016, + "grad_norm": 8.342714309692383, + "learning_rate": 2.9035880649991487e-05, + "loss": 1.9821, + "step": 8631 + }, + { + "epoch": 0.6497675906584618, + "grad_norm": 5.685028076171875, + "learning_rate": 2.9024813701128506e-05, + "loss": 1.6406, + "step": 8632 + }, + { + "epoch": 0.6498428649391219, + "grad_norm": 5.1816582679748535, + "learning_rate": 2.9013747999180195e-05, + "loss": 1.7914, + "step": 8633 + }, + { + "epoch": 0.649918139219782, + "grad_norm": 5.811161041259766, + "learning_rate": 2.900268354480432e-05, + "loss": 1.8281, + "step": 8634 + }, + { + "epoch": 0.6499934135004423, + "grad_norm": 4.335180282592773, + "learning_rate": 2.8991620338658687e-05, + "loss": 1.6229, + "step": 8635 + }, + { + "epoch": 0.6500686877811024, + "grad_norm": 6.94002628326416, + "learning_rate": 2.8980558381400925e-05, + "loss": 1.9868, + "step": 8636 + }, + { + "epoch": 0.6501439620617625, + "grad_norm": 5.186955451965332, + "learning_rate": 2.8969497673688674e-05, + "loss": 1.9077, + "step": 8637 + }, + { + "epoch": 0.6502192363424227, + "grad_norm": 5.300660133361816, + "learning_rate": 2.8958438216179425e-05, + "loss": 1.94, + "step": 8638 + }, + { + "epoch": 0.6502945106230829, + "grad_norm": 4.873750686645508, + "learning_rate": 2.8947380009530644e-05, + "loss": 1.6035, + "step": 8639 + }, + { + "epoch": 0.650369784903743, + "grad_norm": 5.39937162399292, + "learning_rate": 2.8936323054399738e-05, + "loss": 1.8245, + "step": 8640 + }, + { + "epoch": 0.6504450591844032, + "grad_norm": 4.3305792808532715, + "learning_rate": 2.892526735144397e-05, + "loss": 1.9254, + "step": 8641 + }, + { + "epoch": 0.6505203334650633, + "grad_norm": 4.445459365844727, + "learning_rate": 2.8914212901320605e-05, + "loss": 1.5404, + "step": 8642 + }, + { + "epoch": 0.6505956077457234, + "grad_norm": 4.570689678192139, + "learning_rate": 2.8903159704686762e-05, + "loss": 1.4882, + "step": 8643 + }, + { + "epoch": 0.6506708820263837, + "grad_norm": 5.089055061340332, + "learning_rate": 2.8892107762199538e-05, + "loss": 1.8677, + "step": 8644 + }, + { + "epoch": 0.6507461563070438, + "grad_norm": 4.328354358673096, + "learning_rate": 2.888105707451595e-05, + "loss": 1.5696, + "step": 8645 + }, + { + "epoch": 0.6508214305877039, + "grad_norm": 7.792619228363037, + "learning_rate": 2.8870007642292933e-05, + "loss": 1.9269, + "step": 8646 + }, + { + "epoch": 0.6508967048683642, + "grad_norm": 5.175947189331055, + "learning_rate": 2.8858959466187317e-05, + "loss": 1.9349, + "step": 8647 + }, + { + "epoch": 0.6509719791490243, + "grad_norm": 5.960355758666992, + "learning_rate": 2.884791254685592e-05, + "loss": 2.0185, + "step": 8648 + }, + { + "epoch": 0.6510472534296844, + "grad_norm": 4.274903297424316, + "learning_rate": 2.883686688495541e-05, + "loss": 1.9235, + "step": 8649 + }, + { + "epoch": 0.6511225277103446, + "grad_norm": 4.283148765563965, + "learning_rate": 2.882582248114246e-05, + "loss": 1.8648, + "step": 8650 + }, + { + "epoch": 0.6511978019910047, + "grad_norm": 4.214829444885254, + "learning_rate": 2.881477933607359e-05, + "loss": 2.0684, + "step": 8651 + }, + { + "epoch": 0.6512730762716649, + "grad_norm": 6.859586238861084, + "learning_rate": 2.8803737450405322e-05, + "loss": 1.9442, + "step": 8652 + }, + { + "epoch": 0.651348350552325, + "grad_norm": 4.5531415939331055, + "learning_rate": 2.8792696824794018e-05, + "loss": 1.7516, + "step": 8653 + }, + { + "epoch": 0.6514236248329852, + "grad_norm": 6.204555988311768, + "learning_rate": 2.878165745989604e-05, + "loss": 1.8646, + "step": 8654 + }, + { + "epoch": 0.6514988991136453, + "grad_norm": 4.691348552703857, + "learning_rate": 2.8770619356367646e-05, + "loss": 1.8374, + "step": 8655 + }, + { + "epoch": 0.6515741733943055, + "grad_norm": 5.44064998626709, + "learning_rate": 2.8759582514865012e-05, + "loss": 1.943, + "step": 8656 + }, + { + "epoch": 0.6516494476749657, + "grad_norm": 5.013979434967041, + "learning_rate": 2.8748546936044275e-05, + "loss": 1.88, + "step": 8657 + }, + { + "epoch": 0.6517247219556258, + "grad_norm": 3.9379639625549316, + "learning_rate": 2.8737512620561423e-05, + "loss": 1.379, + "step": 8658 + }, + { + "epoch": 0.6517999962362859, + "grad_norm": 4.876352310180664, + "learning_rate": 2.872647956907246e-05, + "loss": 1.5299, + "step": 8659 + }, + { + "epoch": 0.6518752705169462, + "grad_norm": 3.9959232807159424, + "learning_rate": 2.8715447782233227e-05, + "loss": 1.6813, + "step": 8660 + }, + { + "epoch": 0.6519505447976063, + "grad_norm": 7.477232933044434, + "learning_rate": 2.870441726069957e-05, + "loss": 1.8957, + "step": 8661 + }, + { + "epoch": 0.6520258190782664, + "grad_norm": 5.5284552574157715, + "learning_rate": 2.869338800512718e-05, + "loss": 1.6133, + "step": 8662 + }, + { + "epoch": 0.6521010933589266, + "grad_norm": 4.665445804595947, + "learning_rate": 2.8682360016171762e-05, + "loss": 1.8148, + "step": 8663 + }, + { + "epoch": 0.6521763676395868, + "grad_norm": 7.617617130279541, + "learning_rate": 2.867133329448885e-05, + "loss": 2.1744, + "step": 8664 + }, + { + "epoch": 0.6522516419202469, + "grad_norm": 5.930420398712158, + "learning_rate": 2.8660307840733973e-05, + "loss": 1.8305, + "step": 8665 + }, + { + "epoch": 0.6523269162009071, + "grad_norm": 7.521519660949707, + "learning_rate": 2.864928365556257e-05, + "loss": 2.3467, + "step": 8666 + }, + { + "epoch": 0.6524021904815672, + "grad_norm": 5.112128257751465, + "learning_rate": 2.8638260739630007e-05, + "loss": 1.5662, + "step": 8667 + }, + { + "epoch": 0.6524774647622273, + "grad_norm": 7.980247497558594, + "learning_rate": 2.8627239093591536e-05, + "loss": 1.9166, + "step": 8668 + }, + { + "epoch": 0.6525527390428876, + "grad_norm": 5.541444301605225, + "learning_rate": 2.8616218718102388e-05, + "loss": 1.8907, + "step": 8669 + }, + { + "epoch": 0.6526280133235477, + "grad_norm": 5.688579082489014, + "learning_rate": 2.8605199613817668e-05, + "loss": 1.7098, + "step": 8670 + }, + { + "epoch": 0.6527032876042078, + "grad_norm": 4.370813846588135, + "learning_rate": 2.8594181781392437e-05, + "loss": 1.817, + "step": 8671 + }, + { + "epoch": 0.6527785618848679, + "grad_norm": 7.396319389343262, + "learning_rate": 2.85831652214817e-05, + "loss": 1.3468, + "step": 8672 + }, + { + "epoch": 0.6528538361655282, + "grad_norm": 5.487939834594727, + "learning_rate": 2.8572149934740317e-05, + "loss": 1.5988, + "step": 8673 + }, + { + "epoch": 0.6529291104461883, + "grad_norm": 5.727562427520752, + "learning_rate": 2.8561135921823157e-05, + "loss": 1.7862, + "step": 8674 + }, + { + "epoch": 0.6530043847268484, + "grad_norm": 4.359500885009766, + "learning_rate": 2.8550123183384936e-05, + "loss": 1.6676, + "step": 8675 + }, + { + "epoch": 0.6530796590075086, + "grad_norm": 6.9994306564331055, + "learning_rate": 2.8539111720080343e-05, + "loss": 2.003, + "step": 8676 + }, + { + "epoch": 0.6531549332881688, + "grad_norm": 6.189208030700684, + "learning_rate": 2.8528101532563978e-05, + "loss": 2.2584, + "step": 8677 + }, + { + "epoch": 0.6532302075688289, + "grad_norm": 5.228851318359375, + "learning_rate": 2.8517092621490388e-05, + "loss": 2.043, + "step": 8678 + }, + { + "epoch": 0.6533054818494891, + "grad_norm": 5.385641098022461, + "learning_rate": 2.8506084987513983e-05, + "loss": 1.5296, + "step": 8679 + }, + { + "epoch": 0.6533807561301492, + "grad_norm": 5.640270233154297, + "learning_rate": 2.8495078631289167e-05, + "loss": 1.6052, + "step": 8680 + }, + { + "epoch": 0.6534560304108094, + "grad_norm": 5.341486930847168, + "learning_rate": 2.8484073553470203e-05, + "loss": 1.89, + "step": 8681 + }, + { + "epoch": 0.6535313046914696, + "grad_norm": 5.480493545532227, + "learning_rate": 2.8473069754711356e-05, + "loss": 1.7674, + "step": 8682 + }, + { + "epoch": 0.6536065789721297, + "grad_norm": 5.01368522644043, + "learning_rate": 2.8462067235666724e-05, + "loss": 1.9275, + "step": 8683 + }, + { + "epoch": 0.6536818532527898, + "grad_norm": 3.596524715423584, + "learning_rate": 2.845106599699041e-05, + "loss": 1.625, + "step": 8684 + }, + { + "epoch": 0.65375712753345, + "grad_norm": 4.796699047088623, + "learning_rate": 2.8440066039336372e-05, + "loss": 1.6971, + "step": 8685 + }, + { + "epoch": 0.6538324018141102, + "grad_norm": 4.636098384857178, + "learning_rate": 2.8429067363358545e-05, + "loss": 1.3821, + "step": 8686 + }, + { + "epoch": 0.6539076760947703, + "grad_norm": 5.4085822105407715, + "learning_rate": 2.8418069969710776e-05, + "loss": 1.9129, + "step": 8687 + }, + { + "epoch": 0.6539829503754305, + "grad_norm": 7.012679576873779, + "learning_rate": 2.840707385904682e-05, + "loss": 1.9035, + "step": 8688 + }, + { + "epoch": 0.6540582246560906, + "grad_norm": 6.826591968536377, + "learning_rate": 2.8396079032020383e-05, + "loss": 1.9101, + "step": 8689 + }, + { + "epoch": 0.6541334989367508, + "grad_norm": 5.054472923278809, + "learning_rate": 2.8385085489285034e-05, + "loss": 1.6556, + "step": 8690 + }, + { + "epoch": 0.6542087732174109, + "grad_norm": 5.786702632904053, + "learning_rate": 2.837409323149436e-05, + "loss": 1.7376, + "step": 8691 + }, + { + "epoch": 0.6542840474980711, + "grad_norm": 6.182795524597168, + "learning_rate": 2.836310225930177e-05, + "loss": 2.0521, + "step": 8692 + }, + { + "epoch": 0.6543593217787312, + "grad_norm": 5.056280612945557, + "learning_rate": 2.835211257336069e-05, + "loss": 1.8635, + "step": 8693 + }, + { + "epoch": 0.6544345960593914, + "grad_norm": 6.025611400604248, + "learning_rate": 2.8341124174324372e-05, + "loss": 1.5967, + "step": 8694 + }, + { + "epoch": 0.6545098703400516, + "grad_norm": 5.763370990753174, + "learning_rate": 2.8330137062846107e-05, + "loss": 1.8443, + "step": 8695 + }, + { + "epoch": 0.6545851446207117, + "grad_norm": 8.152851104736328, + "learning_rate": 2.8319151239578994e-05, + "loss": 1.9465, + "step": 8696 + }, + { + "epoch": 0.6546604189013718, + "grad_norm": 3.914912462234497, + "learning_rate": 2.8308166705176147e-05, + "loss": 1.7677, + "step": 8697 + }, + { + "epoch": 0.6547356931820321, + "grad_norm": 4.402346134185791, + "learning_rate": 2.8297183460290532e-05, + "loss": 1.6815, + "step": 8698 + }, + { + "epoch": 0.6548109674626922, + "grad_norm": 4.883076190948486, + "learning_rate": 2.8286201505575084e-05, + "loss": 1.6569, + "step": 8699 + }, + { + "epoch": 0.6548862417433523, + "grad_norm": 3.7037272453308105, + "learning_rate": 2.8275220841682658e-05, + "loss": 1.6894, + "step": 8700 + }, + { + "epoch": 0.6549615160240125, + "grad_norm": 5.254941463470459, + "learning_rate": 2.8264241469266033e-05, + "loss": 1.5933, + "step": 8701 + }, + { + "epoch": 0.6550367903046727, + "grad_norm": 4.173793792724609, + "learning_rate": 2.825326338897787e-05, + "loss": 1.5768, + "step": 8702 + }, + { + "epoch": 0.6551120645853328, + "grad_norm": 4.200857639312744, + "learning_rate": 2.8242286601470795e-05, + "loss": 1.8999, + "step": 8703 + }, + { + "epoch": 0.655187338865993, + "grad_norm": 7.90508508682251, + "learning_rate": 2.8231311107397373e-05, + "loss": 1.8072, + "step": 8704 + }, + { + "epoch": 0.6552626131466531, + "grad_norm": 3.8276569843292236, + "learning_rate": 2.8220336907410028e-05, + "loss": 1.7257, + "step": 8705 + }, + { + "epoch": 0.6553378874273132, + "grad_norm": 5.258414268493652, + "learning_rate": 2.8209364002161177e-05, + "loss": 1.84, + "step": 8706 + }, + { + "epoch": 0.6554131617079735, + "grad_norm": 5.518962860107422, + "learning_rate": 2.81983923923031e-05, + "loss": 1.6715, + "step": 8707 + }, + { + "epoch": 0.6554884359886336, + "grad_norm": 4.878246784210205, + "learning_rate": 2.8187422078488045e-05, + "loss": 1.6819, + "step": 8708 + }, + { + "epoch": 0.6555637102692937, + "grad_norm": 5.995737552642822, + "learning_rate": 2.8176453061368145e-05, + "loss": 1.9267, + "step": 8709 + }, + { + "epoch": 0.6556389845499538, + "grad_norm": 9.327507019042969, + "learning_rate": 2.8165485341595498e-05, + "loss": 2.281, + "step": 8710 + }, + { + "epoch": 0.6557142588306141, + "grad_norm": 5.783663272857666, + "learning_rate": 2.8154518919822092e-05, + "loss": 1.8807, + "step": 8711 + }, + { + "epoch": 0.6557895331112742, + "grad_norm": 6.752278804779053, + "learning_rate": 2.8143553796699872e-05, + "loss": 1.9318, + "step": 8712 + }, + { + "epoch": 0.6558648073919343, + "grad_norm": 4.767989158630371, + "learning_rate": 2.8132589972880653e-05, + "loss": 1.6484, + "step": 8713 + }, + { + "epoch": 0.6559400816725945, + "grad_norm": 7.460727691650391, + "learning_rate": 2.812162744901623e-05, + "loss": 1.738, + "step": 8714 + }, + { + "epoch": 0.6560153559532547, + "grad_norm": 6.6065850257873535, + "learning_rate": 2.8110666225758264e-05, + "loss": 2.214, + "step": 8715 + }, + { + "epoch": 0.6560906302339148, + "grad_norm": 3.660745859146118, + "learning_rate": 2.80997063037584e-05, + "loss": 1.6317, + "step": 8716 + }, + { + "epoch": 0.656165904514575, + "grad_norm": 5.311392784118652, + "learning_rate": 2.8088747683668138e-05, + "loss": 1.8431, + "step": 8717 + }, + { + "epoch": 0.6562411787952351, + "grad_norm": 5.0982184410095215, + "learning_rate": 2.8077790366138977e-05, + "loss": 1.8254, + "step": 8718 + }, + { + "epoch": 0.6563164530758953, + "grad_norm": 4.19042444229126, + "learning_rate": 2.8066834351822257e-05, + "loss": 1.5987, + "step": 8719 + }, + { + "epoch": 0.6563917273565555, + "grad_norm": 3.981968402862549, + "learning_rate": 2.8055879641369298e-05, + "loss": 1.6814, + "step": 8720 + }, + { + "epoch": 0.6564670016372156, + "grad_norm": 8.813477516174316, + "learning_rate": 2.8044926235431335e-05, + "loss": 1.5869, + "step": 8721 + }, + { + "epoch": 0.6565422759178757, + "grad_norm": 6.0203423500061035, + "learning_rate": 2.80339741346595e-05, + "loss": 1.5511, + "step": 8722 + }, + { + "epoch": 0.656617550198536, + "grad_norm": 6.520727157592773, + "learning_rate": 2.8023023339704907e-05, + "loss": 1.8609, + "step": 8723 + }, + { + "epoch": 0.6566928244791961, + "grad_norm": 7.125439167022705, + "learning_rate": 2.801207385121849e-05, + "loss": 2.0156, + "step": 8724 + }, + { + "epoch": 0.6567680987598562, + "grad_norm": 7.163008213043213, + "learning_rate": 2.800112566985122e-05, + "loss": 1.8355, + "step": 8725 + }, + { + "epoch": 0.6568433730405164, + "grad_norm": 5.797338962554932, + "learning_rate": 2.799017879625388e-05, + "loss": 1.5828, + "step": 8726 + }, + { + "epoch": 0.6569186473211766, + "grad_norm": 5.205743312835693, + "learning_rate": 2.7979233231077278e-05, + "loss": 1.5995, + "step": 8727 + }, + { + "epoch": 0.6569939216018367, + "grad_norm": 5.32490348815918, + "learning_rate": 2.7968288974972058e-05, + "loss": 1.8256, + "step": 8728 + }, + { + "epoch": 0.6570691958824968, + "grad_norm": 4.363036632537842, + "learning_rate": 2.795734602858886e-05, + "loss": 1.6274, + "step": 8729 + }, + { + "epoch": 0.657144470163157, + "grad_norm": 4.83621072769165, + "learning_rate": 2.794640439257818e-05, + "loss": 1.6522, + "step": 8730 + }, + { + "epoch": 0.6572197444438171, + "grad_norm": 5.710203170776367, + "learning_rate": 2.7935464067590473e-05, + "loss": 2.1365, + "step": 8731 + }, + { + "epoch": 0.6572950187244773, + "grad_norm": 6.177736759185791, + "learning_rate": 2.7924525054276118e-05, + "loss": 1.6826, + "step": 8732 + }, + { + "epoch": 0.6573702930051375, + "grad_norm": 4.1032867431640625, + "learning_rate": 2.7913587353285432e-05, + "loss": 1.753, + "step": 8733 + }, + { + "epoch": 0.6574455672857976, + "grad_norm": 9.235843658447266, + "learning_rate": 2.7902650965268572e-05, + "loss": 1.9237, + "step": 8734 + }, + { + "epoch": 0.6575208415664577, + "grad_norm": 4.227553367614746, + "learning_rate": 2.7891715890875742e-05, + "loss": 1.4215, + "step": 8735 + }, + { + "epoch": 0.657596115847118, + "grad_norm": 5.74335241317749, + "learning_rate": 2.7880782130756937e-05, + "loss": 1.7132, + "step": 8736 + }, + { + "epoch": 0.6576713901277781, + "grad_norm": 6.310643196105957, + "learning_rate": 2.786984968556218e-05, + "loss": 1.6527, + "step": 8737 + }, + { + "epoch": 0.6577466644084382, + "grad_norm": 4.480103492736816, + "learning_rate": 2.7858918555941365e-05, + "loss": 1.6175, + "step": 8738 + }, + { + "epoch": 0.6578219386890984, + "grad_norm": 4.663218975067139, + "learning_rate": 2.7847988742544297e-05, + "loss": 1.5242, + "step": 8739 + }, + { + "epoch": 0.6578972129697586, + "grad_norm": 5.051090240478516, + "learning_rate": 2.7837060246020762e-05, + "loss": 1.5483, + "step": 8740 + }, + { + "epoch": 0.6579724872504187, + "grad_norm": 6.499863147735596, + "learning_rate": 2.7826133067020375e-05, + "loss": 2.0834, + "step": 8741 + }, + { + "epoch": 0.6580477615310789, + "grad_norm": 4.248459815979004, + "learning_rate": 2.7815207206192783e-05, + "loss": 1.7828, + "step": 8742 + }, + { + "epoch": 0.658123035811739, + "grad_norm": 5.525918483734131, + "learning_rate": 2.7804282664187443e-05, + "loss": 1.8055, + "step": 8743 + }, + { + "epoch": 0.6581983100923992, + "grad_norm": 6.875134468078613, + "learning_rate": 2.779335944165381e-05, + "loss": 1.7754, + "step": 8744 + }, + { + "epoch": 0.6582735843730594, + "grad_norm": 5.824242115020752, + "learning_rate": 2.7782437539241233e-05, + "loss": 2.098, + "step": 8745 + }, + { + "epoch": 0.6583488586537195, + "grad_norm": 4.301902770996094, + "learning_rate": 2.777151695759903e-05, + "loss": 1.7413, + "step": 8746 + }, + { + "epoch": 0.6584241329343796, + "grad_norm": 3.784511089324951, + "learning_rate": 2.7760597697376335e-05, + "loss": 1.9452, + "step": 8747 + }, + { + "epoch": 0.6584994072150399, + "grad_norm": 4.715965747833252, + "learning_rate": 2.7749679759222314e-05, + "loss": 1.7355, + "step": 8748 + }, + { + "epoch": 0.6585746814957, + "grad_norm": 4.41172456741333, + "learning_rate": 2.7738763143785972e-05, + "loss": 2.0268, + "step": 8749 + }, + { + "epoch": 0.6586499557763601, + "grad_norm": 4.197338104248047, + "learning_rate": 2.7727847851716305e-05, + "loss": 1.7489, + "step": 8750 + }, + { + "epoch": 0.6587252300570202, + "grad_norm": 5.126345634460449, + "learning_rate": 2.7716933883662154e-05, + "loss": 1.8483, + "step": 8751 + }, + { + "epoch": 0.6588005043376804, + "grad_norm": 5.135181903839111, + "learning_rate": 2.770602124027235e-05, + "loss": 1.8513, + "step": 8752 + }, + { + "epoch": 0.6588757786183406, + "grad_norm": 4.417708396911621, + "learning_rate": 2.7695109922195632e-05, + "loss": 1.6875, + "step": 8753 + }, + { + "epoch": 0.6589510528990007, + "grad_norm": 4.860291004180908, + "learning_rate": 2.7684199930080613e-05, + "loss": 1.6024, + "step": 8754 + }, + { + "epoch": 0.6590263271796609, + "grad_norm": 4.549044609069824, + "learning_rate": 2.7673291264575874e-05, + "loss": 1.7823, + "step": 8755 + }, + { + "epoch": 0.659101601460321, + "grad_norm": 6.025548458099365, + "learning_rate": 2.7662383926329903e-05, + "loss": 1.5586, + "step": 8756 + }, + { + "epoch": 0.6591768757409812, + "grad_norm": 6.463230133056641, + "learning_rate": 2.765147791599114e-05, + "loss": 1.6067, + "step": 8757 + }, + { + "epoch": 0.6592521500216414, + "grad_norm": 6.396263599395752, + "learning_rate": 2.7640573234207866e-05, + "loss": 1.9241, + "step": 8758 + }, + { + "epoch": 0.6593274243023015, + "grad_norm": 6.237919330596924, + "learning_rate": 2.7629669881628384e-05, + "loss": 1.8979, + "step": 8759 + }, + { + "epoch": 0.6594026985829616, + "grad_norm": 6.155825138092041, + "learning_rate": 2.7618767858900818e-05, + "loss": 2.0217, + "step": 8760 + }, + { + "epoch": 0.6594779728636219, + "grad_norm": 6.062699317932129, + "learning_rate": 2.7607867166673296e-05, + "loss": 1.7237, + "step": 8761 + }, + { + "epoch": 0.659553247144282, + "grad_norm": 4.322330951690674, + "learning_rate": 2.759696780559381e-05, + "loss": 1.7834, + "step": 8762 + }, + { + "epoch": 0.6596285214249421, + "grad_norm": 4.465962886810303, + "learning_rate": 2.758606977631032e-05, + "loss": 1.7007, + "step": 8763 + }, + { + "epoch": 0.6597037957056023, + "grad_norm": 4.836397647857666, + "learning_rate": 2.757517307947065e-05, + "loss": 1.8184, + "step": 8764 + }, + { + "epoch": 0.6597790699862625, + "grad_norm": 4.468504905700684, + "learning_rate": 2.7564277715722596e-05, + "loss": 2.0884, + "step": 8765 + }, + { + "epoch": 0.6598543442669226, + "grad_norm": 5.0097832679748535, + "learning_rate": 2.7553383685713862e-05, + "loss": 1.5737, + "step": 8766 + }, + { + "epoch": 0.6599296185475828, + "grad_norm": 4.7780537605285645, + "learning_rate": 2.7542490990092074e-05, + "loss": 2.0759, + "step": 8767 + }, + { + "epoch": 0.6600048928282429, + "grad_norm": 4.698875904083252, + "learning_rate": 2.7531599629504745e-05, + "loss": 1.9418, + "step": 8768 + }, + { + "epoch": 0.660080167108903, + "grad_norm": 5.682632923126221, + "learning_rate": 2.752070960459934e-05, + "loss": 1.5591, + "step": 8769 + }, + { + "epoch": 0.6601554413895632, + "grad_norm": 5.964789867401123, + "learning_rate": 2.7509820916023276e-05, + "loss": 1.8142, + "step": 8770 + }, + { + "epoch": 0.6602307156702234, + "grad_norm": 6.310579776763916, + "learning_rate": 2.749893356442381e-05, + "loss": 1.8981, + "step": 8771 + }, + { + "epoch": 0.6603059899508835, + "grad_norm": 5.43596076965332, + "learning_rate": 2.7488047550448193e-05, + "loss": 1.7978, + "step": 8772 + }, + { + "epoch": 0.6603812642315436, + "grad_norm": 7.301657199859619, + "learning_rate": 2.747716287474354e-05, + "loss": 2.096, + "step": 8773 + }, + { + "epoch": 0.6604565385122039, + "grad_norm": 5.815987586975098, + "learning_rate": 2.7466279537956944e-05, + "loss": 1.8819, + "step": 8774 + }, + { + "epoch": 0.660531812792864, + "grad_norm": 4.713632583618164, + "learning_rate": 2.745539754073536e-05, + "loss": 1.9333, + "step": 8775 + }, + { + "epoch": 0.6606070870735241, + "grad_norm": 5.6835432052612305, + "learning_rate": 2.74445168837257e-05, + "loss": 2.1965, + "step": 8776 + }, + { + "epoch": 0.6606823613541843, + "grad_norm": 5.595894813537598, + "learning_rate": 2.7433637567574788e-05, + "loss": 1.7818, + "step": 8777 + }, + { + "epoch": 0.6607576356348445, + "grad_norm": 3.613860607147217, + "learning_rate": 2.7422759592929405e-05, + "loss": 1.7596, + "step": 8778 + }, + { + "epoch": 0.6608329099155046, + "grad_norm": 5.724103927612305, + "learning_rate": 2.7411882960436153e-05, + "loss": 1.5363, + "step": 8779 + }, + { + "epoch": 0.6609081841961648, + "grad_norm": 5.551961421966553, + "learning_rate": 2.7401007670741667e-05, + "loss": 1.6305, + "step": 8780 + }, + { + "epoch": 0.6609834584768249, + "grad_norm": 4.2294816970825195, + "learning_rate": 2.7390133724492417e-05, + "loss": 1.4462, + "step": 8781 + }, + { + "epoch": 0.6610587327574851, + "grad_norm": 5.838983535766602, + "learning_rate": 2.7379261122334855e-05, + "loss": 2.1848, + "step": 8782 + }, + { + "epoch": 0.6611340070381453, + "grad_norm": 5.823403358459473, + "learning_rate": 2.7368389864915294e-05, + "loss": 1.7284, + "step": 8783 + }, + { + "epoch": 0.6612092813188054, + "grad_norm": 6.538461685180664, + "learning_rate": 2.7357519952880023e-05, + "loss": 1.7012, + "step": 8784 + }, + { + "epoch": 0.6612845555994655, + "grad_norm": 7.819999694824219, + "learning_rate": 2.7346651386875237e-05, + "loss": 1.7066, + "step": 8785 + }, + { + "epoch": 0.6613598298801258, + "grad_norm": 4.031497001647949, + "learning_rate": 2.7335784167547006e-05, + "loss": 1.9733, + "step": 8786 + }, + { + "epoch": 0.6614351041607859, + "grad_norm": 7.550850868225098, + "learning_rate": 2.732491829554138e-05, + "loss": 1.6939, + "step": 8787 + }, + { + "epoch": 0.661510378441446, + "grad_norm": 7.595545291900635, + "learning_rate": 2.73140537715043e-05, + "loss": 1.7113, + "step": 8788 + }, + { + "epoch": 0.6615856527221061, + "grad_norm": 5.982686996459961, + "learning_rate": 2.7303190596081645e-05, + "loss": 2.2336, + "step": 8789 + }, + { + "epoch": 0.6616609270027664, + "grad_norm": 4.109394073486328, + "learning_rate": 2.7292328769919172e-05, + "loss": 1.4977, + "step": 8790 + }, + { + "epoch": 0.6617362012834265, + "grad_norm": 3.759683847427368, + "learning_rate": 2.7281468293662608e-05, + "loss": 1.5384, + "step": 8791 + }, + { + "epoch": 0.6618114755640866, + "grad_norm": 6.634958267211914, + "learning_rate": 2.727060916795755e-05, + "loss": 1.7587, + "step": 8792 + }, + { + "epoch": 0.6618867498447468, + "grad_norm": 6.004629135131836, + "learning_rate": 2.7259751393449584e-05, + "loss": 1.8345, + "step": 8793 + }, + { + "epoch": 0.661962024125407, + "grad_norm": 5.197699069976807, + "learning_rate": 2.724889497078413e-05, + "loss": 2.264, + "step": 8794 + }, + { + "epoch": 0.6620372984060671, + "grad_norm": 4.483395576477051, + "learning_rate": 2.7238039900606605e-05, + "loss": 2.005, + "step": 8795 + }, + { + "epoch": 0.6621125726867273, + "grad_norm": 5.399251461029053, + "learning_rate": 2.7227186183562276e-05, + "loss": 1.563, + "step": 8796 + }, + { + "epoch": 0.6621878469673874, + "grad_norm": 5.074873924255371, + "learning_rate": 2.7216333820296402e-05, + "loss": 1.7355, + "step": 8797 + }, + { + "epoch": 0.6622631212480475, + "grad_norm": 4.188551425933838, + "learning_rate": 2.7205482811454097e-05, + "loss": 1.7052, + "step": 8798 + }, + { + "epoch": 0.6623383955287078, + "grad_norm": 4.108470916748047, + "learning_rate": 2.7194633157680434e-05, + "loss": 1.9538, + "step": 8799 + }, + { + "epoch": 0.6624136698093679, + "grad_norm": 7.338245868682861, + "learning_rate": 2.7183784859620387e-05, + "loss": 1.753, + "step": 8800 + }, + { + "epoch": 0.662488944090028, + "grad_norm": 4.754312038421631, + "learning_rate": 2.7172937917918868e-05, + "loss": 1.7548, + "step": 8801 + }, + { + "epoch": 0.6625642183706882, + "grad_norm": 8.20932388305664, + "learning_rate": 2.7162092333220712e-05, + "loss": 1.8648, + "step": 8802 + }, + { + "epoch": 0.6626394926513484, + "grad_norm": 5.048293590545654, + "learning_rate": 2.7151248106170613e-05, + "loss": 2.0186, + "step": 8803 + }, + { + "epoch": 0.6627147669320085, + "grad_norm": 5.247382640838623, + "learning_rate": 2.714040523741328e-05, + "loss": 1.6981, + "step": 8804 + }, + { + "epoch": 0.6627900412126687, + "grad_norm": 5.969110012054443, + "learning_rate": 2.712956372759324e-05, + "loss": 1.7547, + "step": 8805 + }, + { + "epoch": 0.6628653154933288, + "grad_norm": 4.111473083496094, + "learning_rate": 2.7118723577355033e-05, + "loss": 1.6477, + "step": 8806 + }, + { + "epoch": 0.662940589773989, + "grad_norm": 7.6356587409973145, + "learning_rate": 2.7107884787343035e-05, + "loss": 1.7452, + "step": 8807 + }, + { + "epoch": 0.6630158640546491, + "grad_norm": 5.619043350219727, + "learning_rate": 2.709704735820162e-05, + "loss": 1.6748, + "step": 8808 + }, + { + "epoch": 0.6630911383353093, + "grad_norm": 4.394221782684326, + "learning_rate": 2.708621129057501e-05, + "loss": 1.6388, + "step": 8809 + }, + { + "epoch": 0.6631664126159694, + "grad_norm": 4.229480743408203, + "learning_rate": 2.7075376585107388e-05, + "loss": 1.6175, + "step": 8810 + }, + { + "epoch": 0.6632416868966295, + "grad_norm": 5.275230407714844, + "learning_rate": 2.706454324244285e-05, + "loss": 1.7506, + "step": 8811 + }, + { + "epoch": 0.6633169611772898, + "grad_norm": 4.691878318786621, + "learning_rate": 2.7053711263225427e-05, + "loss": 1.8894, + "step": 8812 + }, + { + "epoch": 0.6633922354579499, + "grad_norm": 5.101324081420898, + "learning_rate": 2.7042880648099013e-05, + "loss": 1.5453, + "step": 8813 + }, + { + "epoch": 0.66346750973861, + "grad_norm": 4.267333030700684, + "learning_rate": 2.7032051397707492e-05, + "loss": 1.7482, + "step": 8814 + }, + { + "epoch": 0.6635427840192702, + "grad_norm": 6.043514251708984, + "learning_rate": 2.7021223512694587e-05, + "loss": 1.9816, + "step": 8815 + }, + { + "epoch": 0.6636180582999304, + "grad_norm": 5.69575309753418, + "learning_rate": 2.701039699370401e-05, + "loss": 1.8079, + "step": 8816 + }, + { + "epoch": 0.6636933325805905, + "grad_norm": 4.062493324279785, + "learning_rate": 2.6999571841379394e-05, + "loss": 1.6159, + "step": 8817 + }, + { + "epoch": 0.6637686068612507, + "grad_norm": 4.54690408706665, + "learning_rate": 2.6988748056364214e-05, + "loss": 1.7475, + "step": 8818 + }, + { + "epoch": 0.6638438811419108, + "grad_norm": 6.173492431640625, + "learning_rate": 2.697792563930196e-05, + "loss": 1.7627, + "step": 8819 + }, + { + "epoch": 0.663919155422571, + "grad_norm": 6.645335674285889, + "learning_rate": 2.696710459083594e-05, + "loss": 1.5357, + "step": 8820 + }, + { + "epoch": 0.6639944297032312, + "grad_norm": 6.449656009674072, + "learning_rate": 2.695628491160947e-05, + "loss": 1.5488, + "step": 8821 + }, + { + "epoch": 0.6640697039838913, + "grad_norm": 4.287746906280518, + "learning_rate": 2.694546660226574e-05, + "loss": 1.7928, + "step": 8822 + }, + { + "epoch": 0.6641449782645514, + "grad_norm": 6.8692708015441895, + "learning_rate": 2.6934649663447885e-05, + "loss": 1.5845, + "step": 8823 + }, + { + "epoch": 0.6642202525452117, + "grad_norm": 4.393044471740723, + "learning_rate": 2.6923834095798916e-05, + "loss": 1.8204, + "step": 8824 + }, + { + "epoch": 0.6642955268258718, + "grad_norm": 10.407302856445312, + "learning_rate": 2.691301989996181e-05, + "loss": 1.7352, + "step": 8825 + }, + { + "epoch": 0.6643708011065319, + "grad_norm": 4.676828861236572, + "learning_rate": 2.6902207076579406e-05, + "loss": 1.3686, + "step": 8826 + }, + { + "epoch": 0.6644460753871921, + "grad_norm": 4.771677494049072, + "learning_rate": 2.6891395626294536e-05, + "loss": 1.7782, + "step": 8827 + }, + { + "epoch": 0.6645213496678523, + "grad_norm": 4.710468769073486, + "learning_rate": 2.6880585549749875e-05, + "loss": 2.027, + "step": 8828 + }, + { + "epoch": 0.6645966239485124, + "grad_norm": 6.521298885345459, + "learning_rate": 2.6869776847588078e-05, + "loss": 1.867, + "step": 8829 + }, + { + "epoch": 0.6646718982291725, + "grad_norm": 4.38245964050293, + "learning_rate": 2.685896952045167e-05, + "loss": 1.6619, + "step": 8830 + }, + { + "epoch": 0.6647471725098327, + "grad_norm": 5.583203315734863, + "learning_rate": 2.684816356898312e-05, + "loss": 1.8872, + "step": 8831 + }, + { + "epoch": 0.6648224467904928, + "grad_norm": 4.860591411590576, + "learning_rate": 2.6837358993824814e-05, + "loss": 1.8094, + "step": 8832 + }, + { + "epoch": 0.664897721071153, + "grad_norm": 6.729776859283447, + "learning_rate": 2.6826555795619056e-05, + "loss": 2.4783, + "step": 8833 + }, + { + "epoch": 0.6649729953518132, + "grad_norm": 3.9188456535339355, + "learning_rate": 2.6815753975008085e-05, + "loss": 1.5631, + "step": 8834 + }, + { + "epoch": 0.6650482696324733, + "grad_norm": 4.6805620193481445, + "learning_rate": 2.6804953532634e-05, + "loss": 1.7168, + "step": 8835 + }, + { + "epoch": 0.6651235439131334, + "grad_norm": 6.026560306549072, + "learning_rate": 2.6794154469138887e-05, + "loss": 1.936, + "step": 8836 + }, + { + "epoch": 0.6651988181937937, + "grad_norm": 4.962831020355225, + "learning_rate": 2.6783356785164686e-05, + "loss": 1.7152, + "step": 8837 + }, + { + "epoch": 0.6652740924744538, + "grad_norm": 4.919339656829834, + "learning_rate": 2.6772560481353332e-05, + "loss": 1.5892, + "step": 8838 + }, + { + "epoch": 0.6653493667551139, + "grad_norm": 5.3777174949646, + "learning_rate": 2.6761765558346585e-05, + "loss": 1.7404, + "step": 8839 + }, + { + "epoch": 0.6654246410357741, + "grad_norm": 5.235607147216797, + "learning_rate": 2.6750972016786223e-05, + "loss": 1.8679, + "step": 8840 + }, + { + "epoch": 0.6654999153164343, + "grad_norm": 4.862646102905273, + "learning_rate": 2.6740179857313845e-05, + "loss": 1.8517, + "step": 8841 + }, + { + "epoch": 0.6655751895970944, + "grad_norm": 5.399610996246338, + "learning_rate": 2.6729389080571033e-05, + "loss": 1.7782, + "step": 8842 + }, + { + "epoch": 0.6656504638777546, + "grad_norm": 9.928549766540527, + "learning_rate": 2.6718599687199293e-05, + "loss": 2.2446, + "step": 8843 + }, + { + "epoch": 0.6657257381584147, + "grad_norm": 6.3774943351745605, + "learning_rate": 2.6707811677839978e-05, + "loss": 1.8751, + "step": 8844 + }, + { + "epoch": 0.6658010124390749, + "grad_norm": 4.686183452606201, + "learning_rate": 2.669702505313442e-05, + "loss": 1.8164, + "step": 8845 + }, + { + "epoch": 0.6658762867197351, + "grad_norm": 8.357050895690918, + "learning_rate": 2.668623981372389e-05, + "loss": 1.6654, + "step": 8846 + }, + { + "epoch": 0.6659515610003952, + "grad_norm": 4.379011154174805, + "learning_rate": 2.6675455960249478e-05, + "loss": 1.4854, + "step": 8847 + }, + { + "epoch": 0.6660268352810553, + "grad_norm": 5.247550010681152, + "learning_rate": 2.666467349335231e-05, + "loss": 1.8402, + "step": 8848 + }, + { + "epoch": 0.6661021095617154, + "grad_norm": 5.4151291847229, + "learning_rate": 2.665389241367332e-05, + "loss": 1.696, + "step": 8849 + }, + { + "epoch": 0.6661773838423757, + "grad_norm": 4.565914154052734, + "learning_rate": 2.6643112721853446e-05, + "loss": 1.8568, + "step": 8850 + }, + { + "epoch": 0.6662526581230358, + "grad_norm": 5.331752777099609, + "learning_rate": 2.6632334418533516e-05, + "loss": 1.716, + "step": 8851 + }, + { + "epoch": 0.6663279324036959, + "grad_norm": 7.889148712158203, + "learning_rate": 2.6621557504354237e-05, + "loss": 1.497, + "step": 8852 + }, + { + "epoch": 0.6664032066843562, + "grad_norm": 4.140315055847168, + "learning_rate": 2.6610781979956307e-05, + "loss": 1.5817, + "step": 8853 + }, + { + "epoch": 0.6664784809650163, + "grad_norm": 5.344308853149414, + "learning_rate": 2.6600007845980257e-05, + "loss": 1.7633, + "step": 8854 + }, + { + "epoch": 0.6665537552456764, + "grad_norm": 6.488978385925293, + "learning_rate": 2.6589235103066595e-05, + "loss": 1.6702, + "step": 8855 + }, + { + "epoch": 0.6666290295263366, + "grad_norm": 5.172380447387695, + "learning_rate": 2.657846375185573e-05, + "loss": 1.6459, + "step": 8856 + }, + { + "epoch": 0.6667043038069967, + "grad_norm": 5.646066188812256, + "learning_rate": 2.6567693792988017e-05, + "loss": 1.8875, + "step": 8857 + }, + { + "epoch": 0.6667795780876569, + "grad_norm": 6.2224650382995605, + "learning_rate": 2.6556925227103656e-05, + "loss": 1.5776, + "step": 8858 + }, + { + "epoch": 0.6668548523683171, + "grad_norm": 4.641350269317627, + "learning_rate": 2.654615805484284e-05, + "loss": 1.7257, + "step": 8859 + }, + { + "epoch": 0.6669301266489772, + "grad_norm": 5.486895561218262, + "learning_rate": 2.6535392276845612e-05, + "loss": 1.7098, + "step": 8860 + }, + { + "epoch": 0.6670054009296373, + "grad_norm": 5.808675765991211, + "learning_rate": 2.652462789375201e-05, + "loss": 1.809, + "step": 8861 + }, + { + "epoch": 0.6670806752102976, + "grad_norm": 4.312739372253418, + "learning_rate": 2.651386490620189e-05, + "loss": 1.7275, + "step": 8862 + }, + { + "epoch": 0.6671559494909577, + "grad_norm": 4.1838250160217285, + "learning_rate": 2.6503103314835142e-05, + "loss": 1.4376, + "step": 8863 + }, + { + "epoch": 0.6672312237716178, + "grad_norm": 4.25212287902832, + "learning_rate": 2.6492343120291453e-05, + "loss": 1.7681, + "step": 8864 + }, + { + "epoch": 0.667306498052278, + "grad_norm": 4.632696151733398, + "learning_rate": 2.6481584323210524e-05, + "loss": 1.9863, + "step": 8865 + }, + { + "epoch": 0.6673817723329382, + "grad_norm": 5.620511054992676, + "learning_rate": 2.6470826924231918e-05, + "loss": 1.9272, + "step": 8866 + }, + { + "epoch": 0.6674570466135983, + "grad_norm": 4.017658710479736, + "learning_rate": 2.646007092399514e-05, + "loss": 1.4344, + "step": 8867 + }, + { + "epoch": 0.6675323208942584, + "grad_norm": 5.515852928161621, + "learning_rate": 2.644931632313963e-05, + "loss": 2.1625, + "step": 8868 + }, + { + "epoch": 0.6676075951749186, + "grad_norm": 4.282059192657471, + "learning_rate": 2.643856312230466e-05, + "loss": 1.9126, + "step": 8869 + }, + { + "epoch": 0.6676828694555788, + "grad_norm": 5.449529647827148, + "learning_rate": 2.6427811322129525e-05, + "loss": 1.7935, + "step": 8870 + }, + { + "epoch": 0.6677581437362389, + "grad_norm": 4.128555774688721, + "learning_rate": 2.641706092325336e-05, + "loss": 1.803, + "step": 8871 + }, + { + "epoch": 0.6678334180168991, + "grad_norm": 5.660778999328613, + "learning_rate": 2.6406311926315273e-05, + "loss": 1.5873, + "step": 8872 + }, + { + "epoch": 0.6679086922975592, + "grad_norm": 5.0545220375061035, + "learning_rate": 2.6395564331954225e-05, + "loss": 1.8072, + "step": 8873 + }, + { + "epoch": 0.6679839665782193, + "grad_norm": 5.5421833992004395, + "learning_rate": 2.6384818140809166e-05, + "loss": 1.9646, + "step": 8874 + }, + { + "epoch": 0.6680592408588796, + "grad_norm": 4.955144882202148, + "learning_rate": 2.6374073353518892e-05, + "loss": 1.7933, + "step": 8875 + }, + { + "epoch": 0.6681345151395397, + "grad_norm": 4.172619342803955, + "learning_rate": 2.636332997072216e-05, + "loss": 1.8326, + "step": 8876 + }, + { + "epoch": 0.6682097894201998, + "grad_norm": 3.8146655559539795, + "learning_rate": 2.6352587993057646e-05, + "loss": 1.7013, + "step": 8877 + }, + { + "epoch": 0.66828506370086, + "grad_norm": 4.290432929992676, + "learning_rate": 2.6341847421163946e-05, + "loss": 1.6764, + "step": 8878 + }, + { + "epoch": 0.6683603379815202, + "grad_norm": 4.940219879150391, + "learning_rate": 2.633110825567951e-05, + "loss": 1.6026, + "step": 8879 + }, + { + "epoch": 0.6684356122621803, + "grad_norm": 5.315835952758789, + "learning_rate": 2.6320370497242798e-05, + "loss": 1.6759, + "step": 8880 + }, + { + "epoch": 0.6685108865428405, + "grad_norm": 5.418793678283691, + "learning_rate": 2.630963414649209e-05, + "loss": 1.6034, + "step": 8881 + }, + { + "epoch": 0.6685861608235006, + "grad_norm": 6.137773036956787, + "learning_rate": 2.6298899204065657e-05, + "loss": 2.0568, + "step": 8882 + }, + { + "epoch": 0.6686614351041608, + "grad_norm": 4.720584869384766, + "learning_rate": 2.6288165670601684e-05, + "loss": 1.9455, + "step": 8883 + }, + { + "epoch": 0.668736709384821, + "grad_norm": 5.469613552093506, + "learning_rate": 2.6277433546738206e-05, + "loss": 1.5402, + "step": 8884 + }, + { + "epoch": 0.6688119836654811, + "grad_norm": 5.493709087371826, + "learning_rate": 2.626670283311325e-05, + "loss": 1.9048, + "step": 8885 + }, + { + "epoch": 0.6688872579461412, + "grad_norm": 5.28349494934082, + "learning_rate": 2.6255973530364696e-05, + "loss": 1.8301, + "step": 8886 + }, + { + "epoch": 0.6689625322268014, + "grad_norm": 4.45078706741333, + "learning_rate": 2.6245245639130388e-05, + "loss": 1.542, + "step": 8887 + }, + { + "epoch": 0.6690378065074616, + "grad_norm": 6.717918872833252, + "learning_rate": 2.6234519160048064e-05, + "loss": 1.87, + "step": 8888 + }, + { + "epoch": 0.6691130807881217, + "grad_norm": 5.85417366027832, + "learning_rate": 2.6223794093755405e-05, + "loss": 1.664, + "step": 8889 + }, + { + "epoch": 0.6691883550687818, + "grad_norm": 6.811474323272705, + "learning_rate": 2.6213070440889942e-05, + "loss": 2.1546, + "step": 8890 + }, + { + "epoch": 0.6692636293494421, + "grad_norm": 6.377946853637695, + "learning_rate": 2.6202348202089216e-05, + "loss": 2.0215, + "step": 8891 + }, + { + "epoch": 0.6693389036301022, + "grad_norm": 6.582273483276367, + "learning_rate": 2.6191627377990586e-05, + "loss": 1.9936, + "step": 8892 + }, + { + "epoch": 0.6694141779107623, + "grad_norm": 5.34724760055542, + "learning_rate": 2.618090796923141e-05, + "loss": 1.7541, + "step": 8893 + }, + { + "epoch": 0.6694894521914225, + "grad_norm": 4.509660243988037, + "learning_rate": 2.6170189976448894e-05, + "loss": 2.2674, + "step": 8894 + }, + { + "epoch": 0.6695647264720826, + "grad_norm": 5.12171745300293, + "learning_rate": 2.6159473400280227e-05, + "loss": 1.9887, + "step": 8895 + }, + { + "epoch": 0.6696400007527428, + "grad_norm": 6.933834552764893, + "learning_rate": 2.6148758241362436e-05, + "loss": 1.8252, + "step": 8896 + }, + { + "epoch": 0.669715275033403, + "grad_norm": 6.463094711303711, + "learning_rate": 2.6138044500332536e-05, + "loss": 1.7883, + "step": 8897 + }, + { + "epoch": 0.6697905493140631, + "grad_norm": 5.302150726318359, + "learning_rate": 2.6127332177827446e-05, + "loss": 1.4326, + "step": 8898 + }, + { + "epoch": 0.6698658235947232, + "grad_norm": 6.862728595733643, + "learning_rate": 2.611662127448393e-05, + "loss": 1.7649, + "step": 8899 + }, + { + "epoch": 0.6699410978753835, + "grad_norm": 4.771916389465332, + "learning_rate": 2.6105911790938763e-05, + "loss": 1.6605, + "step": 8900 + }, + { + "epoch": 0.6700163721560436, + "grad_norm": 5.966783046722412, + "learning_rate": 2.609520372782857e-05, + "loss": 1.8541, + "step": 8901 + }, + { + "epoch": 0.6700916464367037, + "grad_norm": 4.28339958190918, + "learning_rate": 2.608449708578995e-05, + "loss": 1.6107, + "step": 8902 + }, + { + "epoch": 0.670166920717364, + "grad_norm": 7.336152076721191, + "learning_rate": 2.607379186545933e-05, + "loss": 1.6183, + "step": 8903 + }, + { + "epoch": 0.6702421949980241, + "grad_norm": 5.136387348175049, + "learning_rate": 2.6063088067473156e-05, + "loss": 1.45, + "step": 8904 + }, + { + "epoch": 0.6703174692786842, + "grad_norm": 4.165783405303955, + "learning_rate": 2.6052385692467697e-05, + "loss": 1.6781, + "step": 8905 + }, + { + "epoch": 0.6703927435593443, + "grad_norm": 6.601078987121582, + "learning_rate": 2.604168474107921e-05, + "loss": 1.7538, + "step": 8906 + }, + { + "epoch": 0.6704680178400045, + "grad_norm": 3.8668179512023926, + "learning_rate": 2.6030985213943803e-05, + "loss": 1.6392, + "step": 8907 + }, + { + "epoch": 0.6705432921206647, + "grad_norm": 5.337393283843994, + "learning_rate": 2.602028711169757e-05, + "loss": 1.2807, + "step": 8908 + }, + { + "epoch": 0.6706185664013248, + "grad_norm": 7.130876541137695, + "learning_rate": 2.6009590434976445e-05, + "loss": 1.7257, + "step": 8909 + }, + { + "epoch": 0.670693840681985, + "grad_norm": 5.920907974243164, + "learning_rate": 2.5998895184416327e-05, + "loss": 1.9523, + "step": 8910 + }, + { + "epoch": 0.6707691149626451, + "grad_norm": 4.62687873840332, + "learning_rate": 2.598820136065303e-05, + "loss": 1.6335, + "step": 8911 + }, + { + "epoch": 0.6708443892433053, + "grad_norm": 4.449464797973633, + "learning_rate": 2.5977508964322282e-05, + "loss": 2.0018, + "step": 8912 + }, + { + "epoch": 0.6709196635239655, + "grad_norm": 5.300284385681152, + "learning_rate": 2.5966817996059677e-05, + "loss": 1.8395, + "step": 8913 + }, + { + "epoch": 0.6709949378046256, + "grad_norm": 5.117275238037109, + "learning_rate": 2.5956128456500784e-05, + "loss": 2.1261, + "step": 8914 + }, + { + "epoch": 0.6710702120852857, + "grad_norm": 4.4601874351501465, + "learning_rate": 2.594544034628108e-05, + "loss": 1.8862, + "step": 8915 + }, + { + "epoch": 0.671145486365946, + "grad_norm": 5.206698417663574, + "learning_rate": 2.5934753666035917e-05, + "loss": 1.7572, + "step": 8916 + }, + { + "epoch": 0.6712207606466061, + "grad_norm": 8.460914611816406, + "learning_rate": 2.592406841640061e-05, + "loss": 1.5933, + "step": 8917 + }, + { + "epoch": 0.6712960349272662, + "grad_norm": 5.655177116394043, + "learning_rate": 2.5913384598010337e-05, + "loss": 2.1689, + "step": 8918 + }, + { + "epoch": 0.6713713092079264, + "grad_norm": 4.573807716369629, + "learning_rate": 2.590270221150025e-05, + "loss": 1.8257, + "step": 8919 + }, + { + "epoch": 0.6714465834885865, + "grad_norm": 4.637154579162598, + "learning_rate": 2.589202125750536e-05, + "loss": 1.6038, + "step": 8920 + }, + { + "epoch": 0.6715218577692467, + "grad_norm": 4.816969871520996, + "learning_rate": 2.5881341736660635e-05, + "loss": 1.6088, + "step": 8921 + }, + { + "epoch": 0.6715971320499069, + "grad_norm": 5.132904052734375, + "learning_rate": 2.587066364960094e-05, + "loss": 1.86, + "step": 8922 + }, + { + "epoch": 0.671672406330567, + "grad_norm": 5.0843024253845215, + "learning_rate": 2.5859986996961074e-05, + "loss": 1.4324, + "step": 8923 + }, + { + "epoch": 0.6717476806112271, + "grad_norm": 4.941413879394531, + "learning_rate": 2.5849311779375696e-05, + "loss": 1.667, + "step": 8924 + }, + { + "epoch": 0.6718229548918874, + "grad_norm": 4.7580389976501465, + "learning_rate": 2.5838637997479454e-05, + "loss": 1.748, + "step": 8925 + }, + { + "epoch": 0.6718982291725475, + "grad_norm": 6.587122440338135, + "learning_rate": 2.5827965651906838e-05, + "loss": 1.6381, + "step": 8926 + }, + { + "epoch": 0.6719735034532076, + "grad_norm": 4.850554466247559, + "learning_rate": 2.5817294743292324e-05, + "loss": 1.651, + "step": 8927 + }, + { + "epoch": 0.6720487777338677, + "grad_norm": 6.054388999938965, + "learning_rate": 2.5806625272270236e-05, + "loss": 1.8364, + "step": 8928 + }, + { + "epoch": 0.672124052014528, + "grad_norm": 4.8603363037109375, + "learning_rate": 2.5795957239474856e-05, + "loss": 1.4958, + "step": 8929 + }, + { + "epoch": 0.6721993262951881, + "grad_norm": 6.259040832519531, + "learning_rate": 2.5785290645540382e-05, + "loss": 1.7718, + "step": 8930 + }, + { + "epoch": 0.6722746005758482, + "grad_norm": 4.31606912612915, + "learning_rate": 2.5774625491100877e-05, + "loss": 1.802, + "step": 8931 + }, + { + "epoch": 0.6723498748565084, + "grad_norm": 4.259108543395996, + "learning_rate": 2.576396177679039e-05, + "loss": 1.6813, + "step": 8932 + }, + { + "epoch": 0.6724251491371686, + "grad_norm": 6.823403358459473, + "learning_rate": 2.5753299503242823e-05, + "loss": 1.9162, + "step": 8933 + }, + { + "epoch": 0.6725004234178287, + "grad_norm": 6.004079818725586, + "learning_rate": 2.5742638671092058e-05, + "loss": 1.8791, + "step": 8934 + }, + { + "epoch": 0.6725756976984889, + "grad_norm": 6.041334629058838, + "learning_rate": 2.5731979280971795e-05, + "loss": 1.8691, + "step": 8935 + }, + { + "epoch": 0.672650971979149, + "grad_norm": 4.036243915557861, + "learning_rate": 2.572132133351575e-05, + "loss": 1.5077, + "step": 8936 + }, + { + "epoch": 0.6727262462598091, + "grad_norm": 4.807054042816162, + "learning_rate": 2.5710664829357473e-05, + "loss": 1.8773, + "step": 8937 + }, + { + "epoch": 0.6728015205404694, + "grad_norm": 4.213299751281738, + "learning_rate": 2.5700009769130496e-05, + "loss": 1.8359, + "step": 8938 + }, + { + "epoch": 0.6728767948211295, + "grad_norm": 5.245260715484619, + "learning_rate": 2.5689356153468192e-05, + "loss": 1.8355, + "step": 8939 + }, + { + "epoch": 0.6729520691017896, + "grad_norm": 4.809926986694336, + "learning_rate": 2.567870398300393e-05, + "loss": 1.616, + "step": 8940 + }, + { + "epoch": 0.6730273433824498, + "grad_norm": 6.881416320800781, + "learning_rate": 2.5668053258370918e-05, + "loss": 1.8143, + "step": 8941 + }, + { + "epoch": 0.67310261766311, + "grad_norm": 6.4238386154174805, + "learning_rate": 2.565740398020231e-05, + "loss": 1.3436, + "step": 8942 + }, + { + "epoch": 0.6731778919437701, + "grad_norm": 6.750030994415283, + "learning_rate": 2.5646756149131202e-05, + "loss": 1.4291, + "step": 8943 + }, + { + "epoch": 0.6732531662244303, + "grad_norm": 5.476352691650391, + "learning_rate": 2.5636109765790573e-05, + "loss": 1.8626, + "step": 8944 + }, + { + "epoch": 0.6733284405050904, + "grad_norm": 5.122496604919434, + "learning_rate": 2.5625464830813295e-05, + "loss": 1.5922, + "step": 8945 + }, + { + "epoch": 0.6734037147857506, + "grad_norm": 5.0122904777526855, + "learning_rate": 2.5614821344832197e-05, + "loss": 1.8137, + "step": 8946 + }, + { + "epoch": 0.6734789890664107, + "grad_norm": 5.461144924163818, + "learning_rate": 2.5604179308480018e-05, + "loss": 1.7167, + "step": 8947 + }, + { + "epoch": 0.6735542633470709, + "grad_norm": 5.308995246887207, + "learning_rate": 2.5593538722389364e-05, + "loss": 1.8742, + "step": 8948 + }, + { + "epoch": 0.673629537627731, + "grad_norm": 4.136376857757568, + "learning_rate": 2.558289958719282e-05, + "loss": 1.5212, + "step": 8949 + }, + { + "epoch": 0.6737048119083912, + "grad_norm": 5.871334552764893, + "learning_rate": 2.557226190352281e-05, + "loss": 1.6403, + "step": 8950 + }, + { + "epoch": 0.6737800861890514, + "grad_norm": 4.9417290687561035, + "learning_rate": 2.5561625672011767e-05, + "loss": 1.8095, + "step": 8951 + }, + { + "epoch": 0.6738553604697115, + "grad_norm": 4.988206386566162, + "learning_rate": 2.5550990893291933e-05, + "loss": 2.1889, + "step": 8952 + }, + { + "epoch": 0.6739306347503716, + "grad_norm": 6.624125957489014, + "learning_rate": 2.5540357567995554e-05, + "loss": 1.9031, + "step": 8953 + }, + { + "epoch": 0.6740059090310319, + "grad_norm": 5.214404106140137, + "learning_rate": 2.5529725696754726e-05, + "loss": 1.5437, + "step": 8954 + }, + { + "epoch": 0.674081183311692, + "grad_norm": 5.45963716506958, + "learning_rate": 2.551909528020148e-05, + "loss": 1.5732, + "step": 8955 + }, + { + "epoch": 0.6741564575923521, + "grad_norm": 5.8915910720825195, + "learning_rate": 2.550846631896778e-05, + "loss": 1.777, + "step": 8956 + }, + { + "epoch": 0.6742317318730123, + "grad_norm": 5.44212532043457, + "learning_rate": 2.5497838813685503e-05, + "loss": 1.7981, + "step": 8957 + }, + { + "epoch": 0.6743070061536725, + "grad_norm": 5.495545864105225, + "learning_rate": 2.548721276498639e-05, + "loss": 1.8196, + "step": 8958 + }, + { + "epoch": 0.6743822804343326, + "grad_norm": 7.3368144035339355, + "learning_rate": 2.547658817350216e-05, + "loss": 1.7229, + "step": 8959 + }, + { + "epoch": 0.6744575547149928, + "grad_norm": 13.75881290435791, + "learning_rate": 2.546596503986437e-05, + "loss": 2.1297, + "step": 8960 + }, + { + "epoch": 0.6745328289956529, + "grad_norm": 4.820085048675537, + "learning_rate": 2.5455343364704575e-05, + "loss": 1.5043, + "step": 8961 + }, + { + "epoch": 0.674608103276313, + "grad_norm": 5.784001350402832, + "learning_rate": 2.5444723148654204e-05, + "loss": 1.8998, + "step": 8962 + }, + { + "epoch": 0.6746833775569733, + "grad_norm": 7.111554145812988, + "learning_rate": 2.543410439234456e-05, + "loss": 1.7091, + "step": 8963 + }, + { + "epoch": 0.6747586518376334, + "grad_norm": 5.689934730529785, + "learning_rate": 2.542348709640695e-05, + "loss": 1.8417, + "step": 8964 + }, + { + "epoch": 0.6748339261182935, + "grad_norm": 4.593084812164307, + "learning_rate": 2.5412871261472494e-05, + "loss": 1.8021, + "step": 8965 + }, + { + "epoch": 0.6749092003989536, + "grad_norm": 3.721564292907715, + "learning_rate": 2.5402256888172282e-05, + "loss": 1.5523, + "step": 8966 + }, + { + "epoch": 0.6749844746796139, + "grad_norm": 4.551478862762451, + "learning_rate": 2.539164397713733e-05, + "loss": 1.5206, + "step": 8967 + }, + { + "epoch": 0.675059748960274, + "grad_norm": 4.444461345672607, + "learning_rate": 2.538103252899855e-05, + "loss": 1.6235, + "step": 8968 + }, + { + "epoch": 0.6751350232409341, + "grad_norm": 3.860729217529297, + "learning_rate": 2.537042254438673e-05, + "loss": 1.8192, + "step": 8969 + }, + { + "epoch": 0.6752102975215943, + "grad_norm": 4.60359525680542, + "learning_rate": 2.5359814023932637e-05, + "loss": 1.8313, + "step": 8970 + }, + { + "epoch": 0.6752855718022545, + "grad_norm": 5.170285224914551, + "learning_rate": 2.534920696826688e-05, + "loss": 1.5194, + "step": 8971 + }, + { + "epoch": 0.6753608460829146, + "grad_norm": 5.5495734214782715, + "learning_rate": 2.533860137802006e-05, + "loss": 2.2173, + "step": 8972 + }, + { + "epoch": 0.6754361203635748, + "grad_norm": 7.279989242553711, + "learning_rate": 2.5327997253822605e-05, + "loss": 1.6887, + "step": 8973 + }, + { + "epoch": 0.6755113946442349, + "grad_norm": 5.887800216674805, + "learning_rate": 2.5317394596304943e-05, + "loss": 2.0943, + "step": 8974 + }, + { + "epoch": 0.675586668924895, + "grad_norm": 4.7322211265563965, + "learning_rate": 2.530679340609733e-05, + "loss": 1.7826, + "step": 8975 + }, + { + "epoch": 0.6756619432055553, + "grad_norm": 6.192080974578857, + "learning_rate": 2.5296193683829987e-05, + "loss": 1.9582, + "step": 8976 + }, + { + "epoch": 0.6757372174862154, + "grad_norm": 4.433382987976074, + "learning_rate": 2.5285595430133058e-05, + "loss": 2.0646, + "step": 8977 + }, + { + "epoch": 0.6758124917668755, + "grad_norm": 6.091145992279053, + "learning_rate": 2.527499864563656e-05, + "loss": 1.8453, + "step": 8978 + }, + { + "epoch": 0.6758877660475358, + "grad_norm": 5.364508152008057, + "learning_rate": 2.526440333097047e-05, + "loss": 1.9142, + "step": 8979 + }, + { + "epoch": 0.6759630403281959, + "grad_norm": 5.312218189239502, + "learning_rate": 2.5253809486764614e-05, + "loss": 1.9455, + "step": 8980 + }, + { + "epoch": 0.676038314608856, + "grad_norm": 4.552134990692139, + "learning_rate": 2.5243217113648788e-05, + "loss": 1.9862, + "step": 8981 + }, + { + "epoch": 0.6761135888895162, + "grad_norm": 6.616587162017822, + "learning_rate": 2.5232626212252657e-05, + "loss": 1.798, + "step": 8982 + }, + { + "epoch": 0.6761888631701763, + "grad_norm": 6.512275218963623, + "learning_rate": 2.522203678320585e-05, + "loss": 1.9935, + "step": 8983 + }, + { + "epoch": 0.6762641374508365, + "grad_norm": 6.578413009643555, + "learning_rate": 2.521144882713784e-05, + "loss": 2.0257, + "step": 8984 + }, + { + "epoch": 0.6763394117314966, + "grad_norm": 6.5681939125061035, + "learning_rate": 2.520086234467809e-05, + "loss": 1.9837, + "step": 8985 + }, + { + "epoch": 0.6764146860121568, + "grad_norm": 6.156750202178955, + "learning_rate": 2.5190277336455902e-05, + "loss": 1.6867, + "step": 8986 + }, + { + "epoch": 0.6764899602928169, + "grad_norm": 11.211443901062012, + "learning_rate": 2.5179693803100536e-05, + "loss": 2.0208, + "step": 8987 + }, + { + "epoch": 0.6765652345734771, + "grad_norm": 5.609131336212158, + "learning_rate": 2.516911174524116e-05, + "loss": 1.548, + "step": 8988 + }, + { + "epoch": 0.6766405088541373, + "grad_norm": 4.331786155700684, + "learning_rate": 2.5158531163506854e-05, + "loss": 1.7258, + "step": 8989 + }, + { + "epoch": 0.6767157831347974, + "grad_norm": 4.298157215118408, + "learning_rate": 2.514795205852658e-05, + "loss": 1.2919, + "step": 8990 + }, + { + "epoch": 0.6767910574154575, + "grad_norm": 4.455791473388672, + "learning_rate": 2.5137374430929262e-05, + "loss": 1.6229, + "step": 8991 + }, + { + "epoch": 0.6768663316961178, + "grad_norm": 6.435390949249268, + "learning_rate": 2.5126798281343677e-05, + "loss": 1.8519, + "step": 8992 + }, + { + "epoch": 0.6769416059767779, + "grad_norm": 6.2262139320373535, + "learning_rate": 2.511622361039858e-05, + "loss": 2.0097, + "step": 8993 + }, + { + "epoch": 0.677016880257438, + "grad_norm": 6.057564735412598, + "learning_rate": 2.510565041872258e-05, + "loss": 1.8266, + "step": 8994 + }, + { + "epoch": 0.6770921545380982, + "grad_norm": 6.164272785186768, + "learning_rate": 2.509507870694423e-05, + "loss": 2.1945, + "step": 8995 + }, + { + "epoch": 0.6771674288187584, + "grad_norm": 5.387622356414795, + "learning_rate": 2.5084508475692008e-05, + "loss": 1.9489, + "step": 8996 + }, + { + "epoch": 0.6772427030994185, + "grad_norm": 4.734719276428223, + "learning_rate": 2.5073939725594248e-05, + "loss": 2.0393, + "step": 8997 + }, + { + "epoch": 0.6773179773800787, + "grad_norm": 3.627415895462036, + "learning_rate": 2.5063372457279266e-05, + "loss": 1.7087, + "step": 8998 + }, + { + "epoch": 0.6773932516607388, + "grad_norm": 5.981908798217773, + "learning_rate": 2.505280667137523e-05, + "loss": 1.758, + "step": 8999 + }, + { + "epoch": 0.677468525941399, + "grad_norm": 4.462006092071533, + "learning_rate": 2.504224236851025e-05, + "loss": 1.6143, + "step": 9000 + }, + { + "epoch": 0.6775438002220592, + "grad_norm": 5.129408836364746, + "learning_rate": 2.5031679549312347e-05, + "loss": 1.9339, + "step": 9001 + }, + { + "epoch": 0.6776190745027193, + "grad_norm": 6.876696586608887, + "learning_rate": 2.5021118214409473e-05, + "loss": 1.5552, + "step": 9002 + }, + { + "epoch": 0.6776943487833794, + "grad_norm": 5.253693580627441, + "learning_rate": 2.5010558364429433e-05, + "loss": 1.8826, + "step": 9003 + }, + { + "epoch": 0.6777696230640395, + "grad_norm": 4.006175994873047, + "learning_rate": 2.500000000000001e-05, + "loss": 1.9282, + "step": 9004 + }, + { + "epoch": 0.6778448973446998, + "grad_norm": 4.476637363433838, + "learning_rate": 2.498944312174884e-05, + "loss": 1.5152, + "step": 9005 + }, + { + "epoch": 0.6779201716253599, + "grad_norm": 5.97061014175415, + "learning_rate": 2.497888773030353e-05, + "loss": 1.4737, + "step": 9006 + }, + { + "epoch": 0.67799544590602, + "grad_norm": 4.747392177581787, + "learning_rate": 2.496833382629153e-05, + "loss": 1.587, + "step": 9007 + }, + { + "epoch": 0.6780707201866802, + "grad_norm": 5.084542274475098, + "learning_rate": 2.495778141034028e-05, + "loss": 1.9394, + "step": 9008 + }, + { + "epoch": 0.6781459944673404, + "grad_norm": 4.760303020477295, + "learning_rate": 2.4947230483077048e-05, + "loss": 1.7394, + "step": 9009 + }, + { + "epoch": 0.6782212687480005, + "grad_norm": 4.503913879394531, + "learning_rate": 2.493668104512909e-05, + "loss": 1.6716, + "step": 9010 + }, + { + "epoch": 0.6782965430286607, + "grad_norm": 5.278449535369873, + "learning_rate": 2.492613309712352e-05, + "loss": 2.1111, + "step": 9011 + }, + { + "epoch": 0.6783718173093208, + "grad_norm": 5.737161159515381, + "learning_rate": 2.4915586639687394e-05, + "loss": 1.4511, + "step": 9012 + }, + { + "epoch": 0.678447091589981, + "grad_norm": 7.096086502075195, + "learning_rate": 2.490504167344769e-05, + "loss": 1.2409, + "step": 9013 + }, + { + "epoch": 0.6785223658706412, + "grad_norm": 6.587225914001465, + "learning_rate": 2.489449819903123e-05, + "loss": 1.9241, + "step": 9014 + }, + { + "epoch": 0.6785976401513013, + "grad_norm": 3.9985365867614746, + "learning_rate": 2.4883956217064834e-05, + "loss": 1.5651, + "step": 9015 + }, + { + "epoch": 0.6786729144319614, + "grad_norm": 4.432419776916504, + "learning_rate": 2.487341572817516e-05, + "loss": 2.0156, + "step": 9016 + }, + { + "epoch": 0.6787481887126217, + "grad_norm": 4.6513214111328125, + "learning_rate": 2.4862876732988844e-05, + "loss": 1.9355, + "step": 9017 + }, + { + "epoch": 0.6788234629932818, + "grad_norm": 5.872947692871094, + "learning_rate": 2.485233923213236e-05, + "loss": 1.7047, + "step": 9018 + }, + { + "epoch": 0.6788987372739419, + "grad_norm": 7.708502769470215, + "learning_rate": 2.4841803226232168e-05, + "loss": 1.8101, + "step": 9019 + }, + { + "epoch": 0.6789740115546021, + "grad_norm": 5.772946357727051, + "learning_rate": 2.4831268715914568e-05, + "loss": 1.4017, + "step": 9020 + }, + { + "epoch": 0.6790492858352623, + "grad_norm": 5.85705041885376, + "learning_rate": 2.482073570180583e-05, + "loss": 1.9735, + "step": 9021 + }, + { + "epoch": 0.6791245601159224, + "grad_norm": 4.456969738006592, + "learning_rate": 2.4810204184532104e-05, + "loss": 1.9099, + "step": 9022 + }, + { + "epoch": 0.6791998343965826, + "grad_norm": 5.9123454093933105, + "learning_rate": 2.4799674164719483e-05, + "loss": 1.4918, + "step": 9023 + }, + { + "epoch": 0.6792751086772427, + "grad_norm": 4.112330436706543, + "learning_rate": 2.4789145642993904e-05, + "loss": 1.9637, + "step": 9024 + }, + { + "epoch": 0.6793503829579028, + "grad_norm": 5.251736164093018, + "learning_rate": 2.4778618619981296e-05, + "loss": 1.6792, + "step": 9025 + }, + { + "epoch": 0.679425657238563, + "grad_norm": 6.009305953979492, + "learning_rate": 2.4768093096307428e-05, + "loss": 2.3858, + "step": 9026 + }, + { + "epoch": 0.6795009315192232, + "grad_norm": 5.897009372711182, + "learning_rate": 2.4757569072598025e-05, + "loss": 2.0942, + "step": 9027 + }, + { + "epoch": 0.6795762057998833, + "grad_norm": 6.150177478790283, + "learning_rate": 2.474704654947873e-05, + "loss": 1.6733, + "step": 9028 + }, + { + "epoch": 0.6796514800805434, + "grad_norm": 5.053952217102051, + "learning_rate": 2.4736525527575038e-05, + "loss": 1.9128, + "step": 9029 + }, + { + "epoch": 0.6797267543612037, + "grad_norm": 5.430047512054443, + "learning_rate": 2.4726006007512437e-05, + "loss": 2.4255, + "step": 9030 + }, + { + "epoch": 0.6798020286418638, + "grad_norm": 7.376884937286377, + "learning_rate": 2.4715487989916237e-05, + "loss": 1.734, + "step": 9031 + }, + { + "epoch": 0.6798773029225239, + "grad_norm": 4.462740421295166, + "learning_rate": 2.470497147541173e-05, + "loss": 1.7504, + "step": 9032 + }, + { + "epoch": 0.6799525772031841, + "grad_norm": 7.46880578994751, + "learning_rate": 2.469445646462409e-05, + "loss": 1.9191, + "step": 9033 + }, + { + "epoch": 0.6800278514838443, + "grad_norm": 4.844864845275879, + "learning_rate": 2.468394295817842e-05, + "loss": 1.7111, + "step": 9034 + }, + { + "epoch": 0.6801031257645044, + "grad_norm": 5.50860595703125, + "learning_rate": 2.4673430956699684e-05, + "loss": 1.7701, + "step": 9035 + }, + { + "epoch": 0.6801784000451646, + "grad_norm": 5.091622829437256, + "learning_rate": 2.466292046081282e-05, + "loss": 1.699, + "step": 9036 + }, + { + "epoch": 0.6802536743258247, + "grad_norm": 5.141727924346924, + "learning_rate": 2.4652411471142624e-05, + "loss": 1.8141, + "step": 9037 + }, + { + "epoch": 0.6803289486064849, + "grad_norm": 4.802796363830566, + "learning_rate": 2.4641903988313852e-05, + "loss": 1.9413, + "step": 9038 + }, + { + "epoch": 0.6804042228871451, + "grad_norm": 3.8956284523010254, + "learning_rate": 2.4631398012951107e-05, + "loss": 1.6964, + "step": 9039 + }, + { + "epoch": 0.6804794971678052, + "grad_norm": 5.410317897796631, + "learning_rate": 2.462089354567898e-05, + "loss": 1.9749, + "step": 9040 + }, + { + "epoch": 0.6805547714484653, + "grad_norm": 4.489328861236572, + "learning_rate": 2.461039058712189e-05, + "loss": 1.4522, + "step": 9041 + }, + { + "epoch": 0.6806300457291256, + "grad_norm": 5.2307820320129395, + "learning_rate": 2.459988913790423e-05, + "loss": 2.0446, + "step": 9042 + }, + { + "epoch": 0.6807053200097857, + "grad_norm": 5.295603275299072, + "learning_rate": 2.4589389198650282e-05, + "loss": 1.5994, + "step": 9043 + }, + { + "epoch": 0.6807805942904458, + "grad_norm": 4.425142288208008, + "learning_rate": 2.457889076998423e-05, + "loss": 1.7597, + "step": 9044 + }, + { + "epoch": 0.6808558685711059, + "grad_norm": 3.9996602535247803, + "learning_rate": 2.4568393852530202e-05, + "loss": 1.47, + "step": 9045 + }, + { + "epoch": 0.6809311428517661, + "grad_norm": 5.020233631134033, + "learning_rate": 2.455789844691217e-05, + "loss": 1.8243, + "step": 9046 + }, + { + "epoch": 0.6810064171324263, + "grad_norm": 4.329055309295654, + "learning_rate": 2.454740455375409e-05, + "loss": 2.0197, + "step": 9047 + }, + { + "epoch": 0.6810816914130864, + "grad_norm": 4.769218921661377, + "learning_rate": 2.453691217367976e-05, + "loss": 1.6264, + "step": 9048 + }, + { + "epoch": 0.6811569656937466, + "grad_norm": 4.689718246459961, + "learning_rate": 2.452642130731296e-05, + "loss": 2.0406, + "step": 9049 + }, + { + "epoch": 0.6812322399744067, + "grad_norm": 6.0476250648498535, + "learning_rate": 2.45159319552773e-05, + "loss": 1.7601, + "step": 9050 + }, + { + "epoch": 0.6813075142550669, + "grad_norm": 4.0572428703308105, + "learning_rate": 2.450544411819638e-05, + "loss": 1.596, + "step": 9051 + }, + { + "epoch": 0.6813827885357271, + "grad_norm": 4.599985122680664, + "learning_rate": 2.449495779669364e-05, + "loss": 1.6318, + "step": 9052 + }, + { + "epoch": 0.6814580628163872, + "grad_norm": 14.866327285766602, + "learning_rate": 2.4484472991392497e-05, + "loss": 1.624, + "step": 9053 + }, + { + "epoch": 0.6815333370970473, + "grad_norm": 6.0913238525390625, + "learning_rate": 2.44739897029162e-05, + "loss": 1.5806, + "step": 9054 + }, + { + "epoch": 0.6816086113777076, + "grad_norm": 6.88595724105835, + "learning_rate": 2.4463507931887976e-05, + "loss": 1.6799, + "step": 9055 + }, + { + "epoch": 0.6816838856583677, + "grad_norm": 6.878454685211182, + "learning_rate": 2.445302767893093e-05, + "loss": 1.8902, + "step": 9056 + }, + { + "epoch": 0.6817591599390278, + "grad_norm": 4.573638916015625, + "learning_rate": 2.4442548944668107e-05, + "loss": 1.8534, + "step": 9057 + }, + { + "epoch": 0.681834434219688, + "grad_norm": 4.2600297927856445, + "learning_rate": 2.44320717297224e-05, + "loss": 1.7514, + "step": 9058 + }, + { + "epoch": 0.6819097085003482, + "grad_norm": 5.892971515655518, + "learning_rate": 2.442159603471666e-05, + "loss": 1.7874, + "step": 9059 + }, + { + "epoch": 0.6819849827810083, + "grad_norm": 7.999608516693115, + "learning_rate": 2.4411121860273666e-05, + "loss": 1.955, + "step": 9060 + }, + { + "epoch": 0.6820602570616685, + "grad_norm": 7.40668249130249, + "learning_rate": 2.4400649207016037e-05, + "loss": 1.3391, + "step": 9061 + }, + { + "epoch": 0.6821355313423286, + "grad_norm": 5.527510166168213, + "learning_rate": 2.4390178075566373e-05, + "loss": 1.8384, + "step": 9062 + }, + { + "epoch": 0.6822108056229887, + "grad_norm": 5.405130386352539, + "learning_rate": 2.4379708466547125e-05, + "loss": 1.6556, + "step": 9063 + }, + { + "epoch": 0.6822860799036489, + "grad_norm": 6.154996395111084, + "learning_rate": 2.4369240380580716e-05, + "loss": 1.5456, + "step": 9064 + }, + { + "epoch": 0.6823613541843091, + "grad_norm": 4.708168983459473, + "learning_rate": 2.43587738182894e-05, + "loss": 1.5539, + "step": 9065 + }, + { + "epoch": 0.6824366284649692, + "grad_norm": 6.514187335968018, + "learning_rate": 2.4348308780295408e-05, + "loss": 1.8811, + "step": 9066 + }, + { + "epoch": 0.6825119027456293, + "grad_norm": 5.899097919464111, + "learning_rate": 2.433784526722086e-05, + "loss": 1.5233, + "step": 9067 + }, + { + "epoch": 0.6825871770262896, + "grad_norm": 4.958945274353027, + "learning_rate": 2.43273832796878e-05, + "loss": 1.8005, + "step": 9068 + }, + { + "epoch": 0.6826624513069497, + "grad_norm": 5.436830043792725, + "learning_rate": 2.4316922818318115e-05, + "loss": 2.0136, + "step": 9069 + }, + { + "epoch": 0.6827377255876098, + "grad_norm": 4.498725891113281, + "learning_rate": 2.4306463883733693e-05, + "loss": 1.8262, + "step": 9070 + }, + { + "epoch": 0.68281299986827, + "grad_norm": 5.130438327789307, + "learning_rate": 2.4296006476556256e-05, + "loss": 1.8237, + "step": 9071 + }, + { + "epoch": 0.6828882741489302, + "grad_norm": 4.566993713378906, + "learning_rate": 2.4285550597407503e-05, + "loss": 1.8994, + "step": 9072 + }, + { + "epoch": 0.6829635484295903, + "grad_norm": 5.661617279052734, + "learning_rate": 2.427509624690896e-05, + "loss": 1.8046, + "step": 9073 + }, + { + "epoch": 0.6830388227102505, + "grad_norm": 5.407983779907227, + "learning_rate": 2.4264643425682133e-05, + "loss": 1.4927, + "step": 9074 + }, + { + "epoch": 0.6831140969909106, + "grad_norm": 5.694133758544922, + "learning_rate": 2.4254192134348425e-05, + "loss": 1.4831, + "step": 9075 + }, + { + "epoch": 0.6831893712715708, + "grad_norm": 7.230705738067627, + "learning_rate": 2.424374237352911e-05, + "loss": 1.9631, + "step": 9076 + }, + { + "epoch": 0.683264645552231, + "grad_norm": 5.2818169593811035, + "learning_rate": 2.4233294143845403e-05, + "loss": 1.7565, + "step": 9077 + }, + { + "epoch": 0.6833399198328911, + "grad_norm": 4.694523811340332, + "learning_rate": 2.4222847445918428e-05, + "loss": 1.433, + "step": 9078 + }, + { + "epoch": 0.6834151941135512, + "grad_norm": 5.643838882446289, + "learning_rate": 2.4212402280369235e-05, + "loss": 1.8091, + "step": 9079 + }, + { + "epoch": 0.6834904683942115, + "grad_norm": 6.0183491706848145, + "learning_rate": 2.4201958647818707e-05, + "loss": 2.1274, + "step": 9080 + }, + { + "epoch": 0.6835657426748716, + "grad_norm": 4.095691680908203, + "learning_rate": 2.4191516548887732e-05, + "loss": 1.7002, + "step": 9081 + }, + { + "epoch": 0.6836410169555317, + "grad_norm": 5.108181953430176, + "learning_rate": 2.4181075984197034e-05, + "loss": 1.7636, + "step": 9082 + }, + { + "epoch": 0.6837162912361918, + "grad_norm": 5.371180534362793, + "learning_rate": 2.41706369543673e-05, + "loss": 1.5803, + "step": 9083 + }, + { + "epoch": 0.683791565516852, + "grad_norm": 5.243228435516357, + "learning_rate": 2.4160199460019067e-05, + "loss": 1.842, + "step": 9084 + }, + { + "epoch": 0.6838668397975122, + "grad_norm": 5.379593849182129, + "learning_rate": 2.4149763501772847e-05, + "loss": 1.45, + "step": 9085 + }, + { + "epoch": 0.6839421140781723, + "grad_norm": 3.944453716278076, + "learning_rate": 2.4139329080249006e-05, + "loss": 1.5573, + "step": 9086 + }, + { + "epoch": 0.6840173883588325, + "grad_norm": 7.069557189941406, + "learning_rate": 2.412889619606784e-05, + "loss": 1.4579, + "step": 9087 + }, + { + "epoch": 0.6840926626394926, + "grad_norm": 6.9610185623168945, + "learning_rate": 2.411846484984957e-05, + "loss": 1.5905, + "step": 9088 + }, + { + "epoch": 0.6841679369201528, + "grad_norm": 5.010219573974609, + "learning_rate": 2.4108035042214316e-05, + "loss": 1.823, + "step": 9089 + }, + { + "epoch": 0.684243211200813, + "grad_norm": 4.76895809173584, + "learning_rate": 2.409760677378207e-05, + "loss": 1.8095, + "step": 9090 + }, + { + "epoch": 0.6843184854814731, + "grad_norm": 6.045764446258545, + "learning_rate": 2.408718004517278e-05, + "loss": 2.0616, + "step": 9091 + }, + { + "epoch": 0.6843937597621332, + "grad_norm": 6.840459823608398, + "learning_rate": 2.40767548570063e-05, + "loss": 1.543, + "step": 9092 + }, + { + "epoch": 0.6844690340427935, + "grad_norm": 6.660374641418457, + "learning_rate": 2.4066331209902342e-05, + "loss": 1.8116, + "step": 9093 + }, + { + "epoch": 0.6845443083234536, + "grad_norm": 4.9909138679504395, + "learning_rate": 2.4055909104480602e-05, + "loss": 1.8671, + "step": 9094 + }, + { + "epoch": 0.6846195826041137, + "grad_norm": 6.588348865509033, + "learning_rate": 2.4045488541360606e-05, + "loss": 1.881, + "step": 9095 + }, + { + "epoch": 0.6846948568847739, + "grad_norm": 4.804275035858154, + "learning_rate": 2.4035069521161864e-05, + "loss": 1.56, + "step": 9096 + }, + { + "epoch": 0.6847701311654341, + "grad_norm": 4.575013637542725, + "learning_rate": 2.4024652044503713e-05, + "loss": 1.4501, + "step": 9097 + }, + { + "epoch": 0.6848454054460942, + "grad_norm": 5.886621475219727, + "learning_rate": 2.4014236112005474e-05, + "loss": 1.516, + "step": 9098 + }, + { + "epoch": 0.6849206797267544, + "grad_norm": 4.221965312957764, + "learning_rate": 2.4003821724286353e-05, + "loss": 1.7367, + "step": 9099 + }, + { + "epoch": 0.6849959540074145, + "grad_norm": 5.929372787475586, + "learning_rate": 2.3993408881965418e-05, + "loss": 1.7175, + "step": 9100 + }, + { + "epoch": 0.6850712282880747, + "grad_norm": 4.118820667266846, + "learning_rate": 2.3982997585661703e-05, + "loss": 1.674, + "step": 9101 + }, + { + "epoch": 0.6851465025687348, + "grad_norm": 4.19256591796875, + "learning_rate": 2.397258783599415e-05, + "loss": 1.5431, + "step": 9102 + }, + { + "epoch": 0.685221776849395, + "grad_norm": 6.534456729888916, + "learning_rate": 2.396217963358155e-05, + "loss": 1.6074, + "step": 9103 + }, + { + "epoch": 0.6852970511300551, + "grad_norm": 6.444225311279297, + "learning_rate": 2.395177297904268e-05, + "loss": 1.8397, + "step": 9104 + }, + { + "epoch": 0.6853723254107152, + "grad_norm": 4.366914749145508, + "learning_rate": 2.3941367872996144e-05, + "loss": 1.6823, + "step": 9105 + }, + { + "epoch": 0.6854475996913755, + "grad_norm": 6.486684322357178, + "learning_rate": 2.393096431606054e-05, + "loss": 2.2661, + "step": 9106 + }, + { + "epoch": 0.6855228739720356, + "grad_norm": 4.471291542053223, + "learning_rate": 2.392056230885429e-05, + "loss": 1.744, + "step": 9107 + }, + { + "epoch": 0.6855981482526957, + "grad_norm": 4.8174591064453125, + "learning_rate": 2.3910161851995778e-05, + "loss": 1.8921, + "step": 9108 + }, + { + "epoch": 0.685673422533356, + "grad_norm": 5.309037685394287, + "learning_rate": 2.38997629461033e-05, + "loss": 1.7261, + "step": 9109 + }, + { + "epoch": 0.6857486968140161, + "grad_norm": 6.324955940246582, + "learning_rate": 2.388936559179501e-05, + "loss": 1.9158, + "step": 9110 + }, + { + "epoch": 0.6858239710946762, + "grad_norm": 5.957499027252197, + "learning_rate": 2.387896978968901e-05, + "loss": 1.8795, + "step": 9111 + }, + { + "epoch": 0.6858992453753364, + "grad_norm": 9.591164588928223, + "learning_rate": 2.386857554040331e-05, + "loss": 1.7037, + "step": 9112 + }, + { + "epoch": 0.6859745196559965, + "grad_norm": 7.181317329406738, + "learning_rate": 2.3858182844555832e-05, + "loss": 1.7483, + "step": 9113 + }, + { + "epoch": 0.6860497939366567, + "grad_norm": 5.994622230529785, + "learning_rate": 2.3847791702764362e-05, + "loss": 1.4644, + "step": 9114 + }, + { + "epoch": 0.6861250682173169, + "grad_norm": 4.924421310424805, + "learning_rate": 2.3837402115646647e-05, + "loss": 1.8404, + "step": 9115 + }, + { + "epoch": 0.686200342497977, + "grad_norm": 6.665289878845215, + "learning_rate": 2.3827014083820292e-05, + "loss": 1.5744, + "step": 9116 + }, + { + "epoch": 0.6862756167786371, + "grad_norm": 3.5703861713409424, + "learning_rate": 2.381662760790287e-05, + "loss": 1.4793, + "step": 9117 + }, + { + "epoch": 0.6863508910592974, + "grad_norm": 4.324224472045898, + "learning_rate": 2.3806242688511786e-05, + "loss": 1.7794, + "step": 9118 + }, + { + "epoch": 0.6864261653399575, + "grad_norm": 9.157154083251953, + "learning_rate": 2.379585932626444e-05, + "loss": 1.8435, + "step": 9119 + }, + { + "epoch": 0.6865014396206176, + "grad_norm": 5.736857891082764, + "learning_rate": 2.378547752177805e-05, + "loss": 1.4894, + "step": 9120 + }, + { + "epoch": 0.6865767139012778, + "grad_norm": 4.471385478973389, + "learning_rate": 2.37750972756698e-05, + "loss": 1.7953, + "step": 9121 + }, + { + "epoch": 0.686651988181938, + "grad_norm": 6.882564544677734, + "learning_rate": 2.3764718588556773e-05, + "loss": 1.8281, + "step": 9122 + }, + { + "epoch": 0.6867272624625981, + "grad_norm": 3.907404899597168, + "learning_rate": 2.3754341461055947e-05, + "loss": 1.3766, + "step": 9123 + }, + { + "epoch": 0.6868025367432582, + "grad_norm": 5.0541486740112305, + "learning_rate": 2.3743965893784225e-05, + "loss": 1.7818, + "step": 9124 + }, + { + "epoch": 0.6868778110239184, + "grad_norm": 5.510163307189941, + "learning_rate": 2.3733591887358385e-05, + "loss": 1.5318, + "step": 9125 + }, + { + "epoch": 0.6869530853045785, + "grad_norm": 3.9574410915374756, + "learning_rate": 2.3723219442395156e-05, + "loss": 1.9685, + "step": 9126 + }, + { + "epoch": 0.6870283595852387, + "grad_norm": 4.755228042602539, + "learning_rate": 2.3712848559511112e-05, + "loss": 2.0521, + "step": 9127 + }, + { + "epoch": 0.6871036338658989, + "grad_norm": 5.204033851623535, + "learning_rate": 2.3702479239322818e-05, + "loss": 1.4387, + "step": 9128 + }, + { + "epoch": 0.687178908146559, + "grad_norm": 6.228471755981445, + "learning_rate": 2.369211148244666e-05, + "loss": 1.696, + "step": 9129 + }, + { + "epoch": 0.6872541824272191, + "grad_norm": 5.8581461906433105, + "learning_rate": 2.3681745289499002e-05, + "loss": 1.8198, + "step": 9130 + }, + { + "epoch": 0.6873294567078794, + "grad_norm": 3.6106932163238525, + "learning_rate": 2.3671380661096053e-05, + "loss": 1.7352, + "step": 9131 + }, + { + "epoch": 0.6874047309885395, + "grad_norm": 7.152736186981201, + "learning_rate": 2.3661017597853987e-05, + "loss": 1.3093, + "step": 9132 + }, + { + "epoch": 0.6874800052691996, + "grad_norm": 4.191873550415039, + "learning_rate": 2.365065610038884e-05, + "loss": 1.8602, + "step": 9133 + }, + { + "epoch": 0.6875552795498598, + "grad_norm": 4.579094409942627, + "learning_rate": 2.3640296169316604e-05, + "loss": 1.631, + "step": 9134 + }, + { + "epoch": 0.68763055383052, + "grad_norm": 5.931415557861328, + "learning_rate": 2.362993780525311e-05, + "loss": 1.5701, + "step": 9135 + }, + { + "epoch": 0.6877058281111801, + "grad_norm": 6.446444511413574, + "learning_rate": 2.361958100881417e-05, + "loss": 1.7242, + "step": 9136 + }, + { + "epoch": 0.6877811023918403, + "grad_norm": 6.846824645996094, + "learning_rate": 2.360922578061542e-05, + "loss": 1.8371, + "step": 9137 + }, + { + "epoch": 0.6878563766725004, + "grad_norm": 4.564132213592529, + "learning_rate": 2.3598872121272498e-05, + "loss": 1.6937, + "step": 9138 + }, + { + "epoch": 0.6879316509531606, + "grad_norm": 5.120004177093506, + "learning_rate": 2.358852003140085e-05, + "loss": 1.8039, + "step": 9139 + }, + { + "epoch": 0.6880069252338208, + "grad_norm": 5.18104362487793, + "learning_rate": 2.3578169511615912e-05, + "loss": 1.6413, + "step": 9140 + }, + { + "epoch": 0.6880821995144809, + "grad_norm": 6.644830703735352, + "learning_rate": 2.3567820562532994e-05, + "loss": 1.5443, + "step": 9141 + }, + { + "epoch": 0.688157473795141, + "grad_norm": 5.391605854034424, + "learning_rate": 2.3557473184767287e-05, + "loss": 1.7052, + "step": 9142 + }, + { + "epoch": 0.6882327480758011, + "grad_norm": 6.080070972442627, + "learning_rate": 2.3547127378933925e-05, + "loss": 1.8115, + "step": 9143 + }, + { + "epoch": 0.6883080223564614, + "grad_norm": 4.348923683166504, + "learning_rate": 2.353678314564794e-05, + "loss": 1.6486, + "step": 9144 + }, + { + "epoch": 0.6883832966371215, + "grad_norm": 4.040170192718506, + "learning_rate": 2.352644048552428e-05, + "loss": 1.9937, + "step": 9145 + }, + { + "epoch": 0.6884585709177816, + "grad_norm": 5.374321460723877, + "learning_rate": 2.351609939917776e-05, + "loss": 2.1787, + "step": 9146 + }, + { + "epoch": 0.6885338451984419, + "grad_norm": 5.156060218811035, + "learning_rate": 2.3505759887223155e-05, + "loss": 1.7299, + "step": 9147 + }, + { + "epoch": 0.688609119479102, + "grad_norm": 4.476626396179199, + "learning_rate": 2.3495421950275086e-05, + "loss": 1.7923, + "step": 9148 + }, + { + "epoch": 0.6886843937597621, + "grad_norm": 7.028581619262695, + "learning_rate": 2.3485085588948153e-05, + "loss": 1.9865, + "step": 9149 + }, + { + "epoch": 0.6887596680404223, + "grad_norm": 5.202090263366699, + "learning_rate": 2.3474750803856782e-05, + "loss": 1.9552, + "step": 9150 + }, + { + "epoch": 0.6888349423210824, + "grad_norm": 5.547637462615967, + "learning_rate": 2.346441759561538e-05, + "loss": 1.6691, + "step": 9151 + }, + { + "epoch": 0.6889102166017426, + "grad_norm": 6.704097270965576, + "learning_rate": 2.34540859648382e-05, + "loss": 1.9233, + "step": 9152 + }, + { + "epoch": 0.6889854908824028, + "grad_norm": 5.791963577270508, + "learning_rate": 2.3443755912139458e-05, + "loss": 1.6412, + "step": 9153 + }, + { + "epoch": 0.6890607651630629, + "grad_norm": 6.554966449737549, + "learning_rate": 2.343342743813321e-05, + "loss": 1.9647, + "step": 9154 + }, + { + "epoch": 0.689136039443723, + "grad_norm": 4.742175579071045, + "learning_rate": 2.342310054343347e-05, + "loss": 1.7511, + "step": 9155 + }, + { + "epoch": 0.6892113137243833, + "grad_norm": 8.150256156921387, + "learning_rate": 2.3412775228654147e-05, + "loss": 2.302, + "step": 9156 + }, + { + "epoch": 0.6892865880050434, + "grad_norm": 5.036172866821289, + "learning_rate": 2.340245149440905e-05, + "loss": 1.6255, + "step": 9157 + }, + { + "epoch": 0.6893618622857035, + "grad_norm": 4.9711456298828125, + "learning_rate": 2.3392129341311914e-05, + "loss": 1.6857, + "step": 9158 + }, + { + "epoch": 0.6894371365663637, + "grad_norm": 4.807013034820557, + "learning_rate": 2.3381808769976323e-05, + "loss": 1.7928, + "step": 9159 + }, + { + "epoch": 0.6895124108470239, + "grad_norm": 5.212289810180664, + "learning_rate": 2.3371489781015838e-05, + "loss": 2.1028, + "step": 9160 + }, + { + "epoch": 0.689587685127684, + "grad_norm": 4.676896572113037, + "learning_rate": 2.3361172375043867e-05, + "loss": 1.6098, + "step": 9161 + }, + { + "epoch": 0.6896629594083441, + "grad_norm": 4.520888328552246, + "learning_rate": 2.335085655267378e-05, + "loss": 1.5006, + "step": 9162 + }, + { + "epoch": 0.6897382336890043, + "grad_norm": 5.282129287719727, + "learning_rate": 2.3340542314518786e-05, + "loss": 1.5047, + "step": 9163 + }, + { + "epoch": 0.6898135079696645, + "grad_norm": 4.305469989776611, + "learning_rate": 2.3330229661192077e-05, + "loss": 1.4217, + "step": 9164 + }, + { + "epoch": 0.6898887822503246, + "grad_norm": 5.6384758949279785, + "learning_rate": 2.3319918593306673e-05, + "loss": 1.6183, + "step": 9165 + }, + { + "epoch": 0.6899640565309848, + "grad_norm": 5.203912258148193, + "learning_rate": 2.330960911147555e-05, + "loss": 1.9253, + "step": 9166 + }, + { + "epoch": 0.6900393308116449, + "grad_norm": 6.500441074371338, + "learning_rate": 2.3299301216311586e-05, + "loss": 1.6001, + "step": 9167 + }, + { + "epoch": 0.690114605092305, + "grad_norm": 4.630136489868164, + "learning_rate": 2.328899490842757e-05, + "loss": 1.7604, + "step": 9168 + }, + { + "epoch": 0.6901898793729653, + "grad_norm": 6.14787483215332, + "learning_rate": 2.3278690188436147e-05, + "loss": 2.0143, + "step": 9169 + }, + { + "epoch": 0.6902651536536254, + "grad_norm": 5.346184253692627, + "learning_rate": 2.3268387056949935e-05, + "loss": 1.7744, + "step": 9170 + }, + { + "epoch": 0.6903404279342855, + "grad_norm": 4.308267593383789, + "learning_rate": 2.3258085514581395e-05, + "loss": 1.8318, + "step": 9171 + }, + { + "epoch": 0.6904157022149457, + "grad_norm": 5.004668712615967, + "learning_rate": 2.324778556194294e-05, + "loss": 1.392, + "step": 9172 + }, + { + "epoch": 0.6904909764956059, + "grad_norm": 5.011069297790527, + "learning_rate": 2.3237487199646896e-05, + "loss": 1.5038, + "step": 9173 + }, + { + "epoch": 0.690566250776266, + "grad_norm": 4.422870635986328, + "learning_rate": 2.3227190428305425e-05, + "loss": 1.7832, + "step": 9174 + }, + { + "epoch": 0.6906415250569262, + "grad_norm": 7.220592975616455, + "learning_rate": 2.321689524853068e-05, + "loss": 1.8406, + "step": 9175 + }, + { + "epoch": 0.6907167993375863, + "grad_norm": 5.2014479637146, + "learning_rate": 2.320660166093464e-05, + "loss": 1.7574, + "step": 9176 + }, + { + "epoch": 0.6907920736182465, + "grad_norm": 7.197844982147217, + "learning_rate": 2.3196309666129263e-05, + "loss": 1.5406, + "step": 9177 + }, + { + "epoch": 0.6908673478989067, + "grad_norm": 4.710906028747559, + "learning_rate": 2.318601926472636e-05, + "loss": 2.0996, + "step": 9178 + }, + { + "epoch": 0.6909426221795668, + "grad_norm": 10.995844841003418, + "learning_rate": 2.3175730457337697e-05, + "loss": 2.0979, + "step": 9179 + }, + { + "epoch": 0.6910178964602269, + "grad_norm": 4.889793872833252, + "learning_rate": 2.316544324457487e-05, + "loss": 1.3697, + "step": 9180 + }, + { + "epoch": 0.691093170740887, + "grad_norm": 4.081658363342285, + "learning_rate": 2.3155157627049462e-05, + "loss": 1.6905, + "step": 9181 + }, + { + "epoch": 0.6911684450215473, + "grad_norm": 6.739316463470459, + "learning_rate": 2.314487360537289e-05, + "loss": 2.1387, + "step": 9182 + }, + { + "epoch": 0.6912437193022074, + "grad_norm": 5.182290077209473, + "learning_rate": 2.3134591180156545e-05, + "loss": 1.4409, + "step": 9183 + }, + { + "epoch": 0.6913189935828675, + "grad_norm": 4.573348522186279, + "learning_rate": 2.3124310352011652e-05, + "loss": 1.5495, + "step": 9184 + }, + { + "epoch": 0.6913942678635278, + "grad_norm": 5.446428298950195, + "learning_rate": 2.3114031121549405e-05, + "loss": 1.5756, + "step": 9185 + }, + { + "epoch": 0.6914695421441879, + "grad_norm": 5.0014448165893555, + "learning_rate": 2.3103753489380853e-05, + "loss": 2.1781, + "step": 9186 + }, + { + "epoch": 0.691544816424848, + "grad_norm": 5.167144775390625, + "learning_rate": 2.3093477456116978e-05, + "loss": 1.4128, + "step": 9187 + }, + { + "epoch": 0.6916200907055082, + "grad_norm": 5.072972297668457, + "learning_rate": 2.3083203022368667e-05, + "loss": 1.6413, + "step": 9188 + }, + { + "epoch": 0.6916953649861683, + "grad_norm": 5.502506256103516, + "learning_rate": 2.30729301887467e-05, + "loss": 1.468, + "step": 9189 + }, + { + "epoch": 0.6917706392668285, + "grad_norm": 4.651703357696533, + "learning_rate": 2.3062658955861794e-05, + "loss": 1.9904, + "step": 9190 + }, + { + "epoch": 0.6918459135474887, + "grad_norm": 4.388789653778076, + "learning_rate": 2.3052389324324504e-05, + "loss": 1.6403, + "step": 9191 + }, + { + "epoch": 0.6919211878281488, + "grad_norm": 6.176862716674805, + "learning_rate": 2.3042121294745356e-05, + "loss": 2.0999, + "step": 9192 + }, + { + "epoch": 0.6919964621088089, + "grad_norm": 6.360662460327148, + "learning_rate": 2.3031854867734737e-05, + "loss": 1.4859, + "step": 9193 + }, + { + "epoch": 0.6920717363894692, + "grad_norm": 5.236613750457764, + "learning_rate": 2.302159004390298e-05, + "loss": 1.7031, + "step": 9194 + }, + { + "epoch": 0.6921470106701293, + "grad_norm": 4.803183555603027, + "learning_rate": 2.301132682386027e-05, + "loss": 1.6113, + "step": 9195 + }, + { + "epoch": 0.6922222849507894, + "grad_norm": 5.111410617828369, + "learning_rate": 2.3001065208216755e-05, + "loss": 1.8398, + "step": 9196 + }, + { + "epoch": 0.6922975592314496, + "grad_norm": 7.739950180053711, + "learning_rate": 2.2990805197582426e-05, + "loss": 1.7123, + "step": 9197 + }, + { + "epoch": 0.6923728335121098, + "grad_norm": 9.180806159973145, + "learning_rate": 2.298054679256724e-05, + "loss": 2.3827, + "step": 9198 + }, + { + "epoch": 0.6924481077927699, + "grad_norm": 9.780746459960938, + "learning_rate": 2.2970289993781025e-05, + "loss": 2.0686, + "step": 9199 + }, + { + "epoch": 0.6925233820734301, + "grad_norm": 6.493659019470215, + "learning_rate": 2.29600348018335e-05, + "loss": 1.6725, + "step": 9200 + }, + { + "epoch": 0.6925986563540902, + "grad_norm": 4.05678653717041, + "learning_rate": 2.2949781217334322e-05, + "loss": 1.5804, + "step": 9201 + }, + { + "epoch": 0.6926739306347504, + "grad_norm": 3.992006778717041, + "learning_rate": 2.293952924089305e-05, + "loss": 1.358, + "step": 9202 + }, + { + "epoch": 0.6927492049154105, + "grad_norm": 5.253567695617676, + "learning_rate": 2.2929278873119103e-05, + "loss": 1.6375, + "step": 9203 + }, + { + "epoch": 0.6928244791960707, + "grad_norm": 5.085446357727051, + "learning_rate": 2.2919030114621848e-05, + "loss": 2.0361, + "step": 9204 + }, + { + "epoch": 0.6928997534767308, + "grad_norm": 6.214475154876709, + "learning_rate": 2.2908782966010574e-05, + "loss": 1.993, + "step": 9205 + }, + { + "epoch": 0.692975027757391, + "grad_norm": 6.2569475173950195, + "learning_rate": 2.2898537427894396e-05, + "loss": 1.8131, + "step": 9206 + }, + { + "epoch": 0.6930503020380512, + "grad_norm": 5.323792457580566, + "learning_rate": 2.2888293500882423e-05, + "loss": 2.0126, + "step": 9207 + }, + { + "epoch": 0.6931255763187113, + "grad_norm": 4.812386989593506, + "learning_rate": 2.2878051185583594e-05, + "loss": 1.5517, + "step": 9208 + }, + { + "epoch": 0.6932008505993714, + "grad_norm": 4.390308856964111, + "learning_rate": 2.2867810482606816e-05, + "loss": 1.5801, + "step": 9209 + }, + { + "epoch": 0.6932761248800317, + "grad_norm": 4.9885969161987305, + "learning_rate": 2.2857571392560838e-05, + "loss": 1.3427, + "step": 9210 + }, + { + "epoch": 0.6933513991606918, + "grad_norm": 4.702757358551025, + "learning_rate": 2.284733391605436e-05, + "loss": 1.7395, + "step": 9211 + }, + { + "epoch": 0.6934266734413519, + "grad_norm": 5.381412982940674, + "learning_rate": 2.283709805369597e-05, + "loss": 1.9432, + "step": 9212 + }, + { + "epoch": 0.6935019477220121, + "grad_norm": 4.8568620681762695, + "learning_rate": 2.282686380609418e-05, + "loss": 1.7734, + "step": 9213 + }, + { + "epoch": 0.6935772220026722, + "grad_norm": 8.742486000061035, + "learning_rate": 2.281663117385735e-05, + "loss": 1.8303, + "step": 9214 + }, + { + "epoch": 0.6936524962833324, + "grad_norm": 5.089756965637207, + "learning_rate": 2.2806400157593816e-05, + "loss": 2.0048, + "step": 9215 + }, + { + "epoch": 0.6937277705639926, + "grad_norm": 8.57791519165039, + "learning_rate": 2.279617075791175e-05, + "loss": 2.1358, + "step": 9216 + }, + { + "epoch": 0.6938030448446527, + "grad_norm": 7.224941730499268, + "learning_rate": 2.2785942975419293e-05, + "loss": 1.6037, + "step": 9217 + }, + { + "epoch": 0.6938783191253128, + "grad_norm": 6.479065895080566, + "learning_rate": 2.2775716810724422e-05, + "loss": 2.0925, + "step": 9218 + }, + { + "epoch": 0.6939535934059731, + "grad_norm": 4.162958145141602, + "learning_rate": 2.276549226443509e-05, + "loss": 1.7294, + "step": 9219 + }, + { + "epoch": 0.6940288676866332, + "grad_norm": 5.649865627288818, + "learning_rate": 2.275526933715908e-05, + "loss": 1.8782, + "step": 9220 + }, + { + "epoch": 0.6941041419672933, + "grad_norm": 8.647049903869629, + "learning_rate": 2.274504802950413e-05, + "loss": 1.9675, + "step": 9221 + }, + { + "epoch": 0.6941794162479534, + "grad_norm": 5.656197547912598, + "learning_rate": 2.2734828342077875e-05, + "loss": 1.9168, + "step": 9222 + }, + { + "epoch": 0.6942546905286137, + "grad_norm": 5.337955951690674, + "learning_rate": 2.2724610275487844e-05, + "loss": 1.9066, + "step": 9223 + }, + { + "epoch": 0.6943299648092738, + "grad_norm": 5.077858924865723, + "learning_rate": 2.2714393830341483e-05, + "loss": 1.8422, + "step": 9224 + }, + { + "epoch": 0.6944052390899339, + "grad_norm": 5.149392604827881, + "learning_rate": 2.2704179007246107e-05, + "loss": 1.7214, + "step": 9225 + }, + { + "epoch": 0.6944805133705941, + "grad_norm": 5.166655540466309, + "learning_rate": 2.269396580680898e-05, + "loss": 1.4478, + "step": 9226 + }, + { + "epoch": 0.6945557876512543, + "grad_norm": 4.211627960205078, + "learning_rate": 2.268375422963722e-05, + "loss": 1.8313, + "step": 9227 + }, + { + "epoch": 0.6946310619319144, + "grad_norm": 6.946881294250488, + "learning_rate": 2.2673544276337915e-05, + "loss": 1.9191, + "step": 9228 + }, + { + "epoch": 0.6947063362125746, + "grad_norm": 5.517900466918945, + "learning_rate": 2.2663335947517973e-05, + "loss": 1.7898, + "step": 9229 + }, + { + "epoch": 0.6947816104932347, + "grad_norm": 5.154735088348389, + "learning_rate": 2.265312924378429e-05, + "loss": 1.84, + "step": 9230 + }, + { + "epoch": 0.6948568847738948, + "grad_norm": 5.251152992248535, + "learning_rate": 2.2642924165743584e-05, + "loss": 1.6716, + "step": 9231 + }, + { + "epoch": 0.6949321590545551, + "grad_norm": 5.098364353179932, + "learning_rate": 2.2632720714002542e-05, + "loss": 1.8855, + "step": 9232 + }, + { + "epoch": 0.6950074333352152, + "grad_norm": 6.923643112182617, + "learning_rate": 2.2622518889167725e-05, + "loss": 1.7016, + "step": 9233 + }, + { + "epoch": 0.6950827076158753, + "grad_norm": 4.4657392501831055, + "learning_rate": 2.2612318691845628e-05, + "loss": 1.6098, + "step": 9234 + }, + { + "epoch": 0.6951579818965355, + "grad_norm": 5.778563022613525, + "learning_rate": 2.2602120122642585e-05, + "loss": 1.4312, + "step": 9235 + }, + { + "epoch": 0.6952332561771957, + "grad_norm": 5.23717737197876, + "learning_rate": 2.259192318216488e-05, + "loss": 1.967, + "step": 9236 + }, + { + "epoch": 0.6953085304578558, + "grad_norm": 4.6560492515563965, + "learning_rate": 2.2581727871018726e-05, + "loss": 2.0135, + "step": 9237 + }, + { + "epoch": 0.695383804738516, + "grad_norm": 4.112236976623535, + "learning_rate": 2.257153418981015e-05, + "loss": 1.8031, + "step": 9238 + }, + { + "epoch": 0.6954590790191761, + "grad_norm": 3.8167972564697266, + "learning_rate": 2.256134213914519e-05, + "loss": 1.9649, + "step": 9239 + }, + { + "epoch": 0.6955343532998363, + "grad_norm": 8.365281105041504, + "learning_rate": 2.2551151719629693e-05, + "loss": 1.8492, + "step": 9240 + }, + { + "epoch": 0.6956096275804964, + "grad_norm": 6.150554180145264, + "learning_rate": 2.254096293186948e-05, + "loss": 1.8858, + "step": 9241 + }, + { + "epoch": 0.6956849018611566, + "grad_norm": 4.381303787231445, + "learning_rate": 2.253077577647022e-05, + "loss": 1.8058, + "step": 9242 + }, + { + "epoch": 0.6957601761418167, + "grad_norm": 6.580338478088379, + "learning_rate": 2.2520590254037516e-05, + "loss": 1.893, + "step": 9243 + }, + { + "epoch": 0.6958354504224769, + "grad_norm": 4.404387474060059, + "learning_rate": 2.251040636517688e-05, + "loss": 1.731, + "step": 9244 + }, + { + "epoch": 0.6959107247031371, + "grad_norm": 5.647733211517334, + "learning_rate": 2.2500224110493717e-05, + "loss": 1.5891, + "step": 9245 + }, + { + "epoch": 0.6959859989837972, + "grad_norm": 5.918029308319092, + "learning_rate": 2.2490043490593315e-05, + "loss": 1.6628, + "step": 9246 + }, + { + "epoch": 0.6960612732644573, + "grad_norm": 6.325699806213379, + "learning_rate": 2.247986450608091e-05, + "loss": 2.1293, + "step": 9247 + }, + { + "epoch": 0.6961365475451176, + "grad_norm": 5.987757682800293, + "learning_rate": 2.2469687157561576e-05, + "loss": 1.4418, + "step": 9248 + }, + { + "epoch": 0.6962118218257777, + "grad_norm": 6.244060516357422, + "learning_rate": 2.2459511445640362e-05, + "loss": 2.1044, + "step": 9249 + }, + { + "epoch": 0.6962870961064378, + "grad_norm": 4.552266597747803, + "learning_rate": 2.2449337370922158e-05, + "loss": 2.1373, + "step": 9250 + }, + { + "epoch": 0.696362370387098, + "grad_norm": 6.966643333435059, + "learning_rate": 2.2439164934011815e-05, + "loss": 1.7356, + "step": 9251 + }, + { + "epoch": 0.6964376446677581, + "grad_norm": 4.6088361740112305, + "learning_rate": 2.2428994135514015e-05, + "loss": 1.7247, + "step": 9252 + }, + { + "epoch": 0.6965129189484183, + "grad_norm": 5.5677103996276855, + "learning_rate": 2.2418824976033397e-05, + "loss": 1.7217, + "step": 9253 + }, + { + "epoch": 0.6965881932290785, + "grad_norm": 4.105062484741211, + "learning_rate": 2.240865745617452e-05, + "loss": 1.7487, + "step": 9254 + }, + { + "epoch": 0.6966634675097386, + "grad_norm": 5.527612686157227, + "learning_rate": 2.239849157654177e-05, + "loss": 1.7733, + "step": 9255 + }, + { + "epoch": 0.6967387417903987, + "grad_norm": 5.76947021484375, + "learning_rate": 2.23883273377395e-05, + "loss": 1.5663, + "step": 9256 + }, + { + "epoch": 0.696814016071059, + "grad_norm": 4.906215190887451, + "learning_rate": 2.2378164740371936e-05, + "loss": 1.6728, + "step": 9257 + }, + { + "epoch": 0.6968892903517191, + "grad_norm": 4.5309906005859375, + "learning_rate": 2.2368003785043253e-05, + "loss": 1.6026, + "step": 9258 + }, + { + "epoch": 0.6969645646323792, + "grad_norm": 5.8111982345581055, + "learning_rate": 2.2357844472357432e-05, + "loss": 1.4393, + "step": 9259 + }, + { + "epoch": 0.6970398389130393, + "grad_norm": 5.151187896728516, + "learning_rate": 2.234768680291846e-05, + "loss": 1.9922, + "step": 9260 + }, + { + "epoch": 0.6971151131936996, + "grad_norm": 7.043175220489502, + "learning_rate": 2.233753077733015e-05, + "loss": 1.9925, + "step": 9261 + }, + { + "epoch": 0.6971903874743597, + "grad_norm": 4.817233085632324, + "learning_rate": 2.232737639619628e-05, + "loss": 1.7242, + "step": 9262 + }, + { + "epoch": 0.6972656617550198, + "grad_norm": 6.31687068939209, + "learning_rate": 2.2317223660120464e-05, + "loss": 1.7103, + "step": 9263 + }, + { + "epoch": 0.69734093603568, + "grad_norm": 4.4964985847473145, + "learning_rate": 2.230707256970629e-05, + "loss": 1.8345, + "step": 9264 + }, + { + "epoch": 0.6974162103163402, + "grad_norm": 5.479279041290283, + "learning_rate": 2.2296923125557167e-05, + "loss": 1.3243, + "step": 9265 + }, + { + "epoch": 0.6974914845970003, + "grad_norm": 7.7540388107299805, + "learning_rate": 2.2286775328276482e-05, + "loss": 1.536, + "step": 9266 + }, + { + "epoch": 0.6975667588776605, + "grad_norm": 7.980586051940918, + "learning_rate": 2.2276629178467483e-05, + "loss": 1.6434, + "step": 9267 + }, + { + "epoch": 0.6976420331583206, + "grad_norm": 4.286620140075684, + "learning_rate": 2.2266484676733328e-05, + "loss": 1.5265, + "step": 9268 + }, + { + "epoch": 0.6977173074389807, + "grad_norm": 4.886096000671387, + "learning_rate": 2.2256341823677106e-05, + "loss": 1.9481, + "step": 9269 + }, + { + "epoch": 0.697792581719641, + "grad_norm": 5.949970722198486, + "learning_rate": 2.224620061990174e-05, + "loss": 2.0638, + "step": 9270 + }, + { + "epoch": 0.6978678560003011, + "grad_norm": 4.383805274963379, + "learning_rate": 2.2236061066010127e-05, + "loss": 1.3111, + "step": 9271 + }, + { + "epoch": 0.6979431302809612, + "grad_norm": 5.51016902923584, + "learning_rate": 2.2225923162605006e-05, + "loss": 1.7864, + "step": 9272 + }, + { + "epoch": 0.6980184045616215, + "grad_norm": 5.812095642089844, + "learning_rate": 2.2215786910289076e-05, + "loss": 1.888, + "step": 9273 + }, + { + "epoch": 0.6980936788422816, + "grad_norm": 4.68532133102417, + "learning_rate": 2.2205652309664876e-05, + "loss": 1.8049, + "step": 9274 + }, + { + "epoch": 0.6981689531229417, + "grad_norm": 3.9553608894348145, + "learning_rate": 2.219551936133491e-05, + "loss": 1.6567, + "step": 9275 + }, + { + "epoch": 0.6982442274036019, + "grad_norm": 5.245287895202637, + "learning_rate": 2.2185388065901518e-05, + "loss": 2.0843, + "step": 9276 + }, + { + "epoch": 0.698319501684262, + "grad_norm": 4.831108570098877, + "learning_rate": 2.2175258423967003e-05, + "loss": 1.6006, + "step": 9277 + }, + { + "epoch": 0.6983947759649222, + "grad_norm": 4.776740074157715, + "learning_rate": 2.2165130436133535e-05, + "loss": 1.8107, + "step": 9278 + }, + { + "epoch": 0.6984700502455823, + "grad_norm": 6.1328630447387695, + "learning_rate": 2.2155004103003206e-05, + "loss": 1.7686, + "step": 9279 + }, + { + "epoch": 0.6985453245262425, + "grad_norm": 5.434392929077148, + "learning_rate": 2.2144879425177977e-05, + "loss": 1.9137, + "step": 9280 + }, + { + "epoch": 0.6986205988069026, + "grad_norm": 5.613449573516846, + "learning_rate": 2.213475640325976e-05, + "loss": 1.987, + "step": 9281 + }, + { + "epoch": 0.6986958730875628, + "grad_norm": 6.249162673950195, + "learning_rate": 2.21246350378503e-05, + "loss": 1.9819, + "step": 9282 + }, + { + "epoch": 0.698771147368223, + "grad_norm": 5.588217735290527, + "learning_rate": 2.2114515329551315e-05, + "loss": 1.5138, + "step": 9283 + }, + { + "epoch": 0.6988464216488831, + "grad_norm": 6.83455753326416, + "learning_rate": 2.2104397278964372e-05, + "loss": 1.9257, + "step": 9284 + }, + { + "epoch": 0.6989216959295432, + "grad_norm": 4.094822406768799, + "learning_rate": 2.2094280886690967e-05, + "loss": 1.8233, + "step": 9285 + }, + { + "epoch": 0.6989969702102035, + "grad_norm": 5.607545852661133, + "learning_rate": 2.2084166153332508e-05, + "loss": 1.7432, + "step": 9286 + }, + { + "epoch": 0.6990722444908636, + "grad_norm": 5.21363639831543, + "learning_rate": 2.2074053079490255e-05, + "loss": 1.6966, + "step": 9287 + }, + { + "epoch": 0.6991475187715237, + "grad_norm": 4.50490140914917, + "learning_rate": 2.206394166576542e-05, + "loss": 1.5315, + "step": 9288 + }, + { + "epoch": 0.6992227930521839, + "grad_norm": 4.661284923553467, + "learning_rate": 2.2053831912759093e-05, + "loss": 2.015, + "step": 9289 + }, + { + "epoch": 0.699298067332844, + "grad_norm": 4.059357643127441, + "learning_rate": 2.2043723821072292e-05, + "loss": 1.8894, + "step": 9290 + }, + { + "epoch": 0.6993733416135042, + "grad_norm": 6.154763221740723, + "learning_rate": 2.2033617391305876e-05, + "loss": 2.4171, + "step": 9291 + }, + { + "epoch": 0.6994486158941644, + "grad_norm": 5.474515438079834, + "learning_rate": 2.2023512624060678e-05, + "loss": 1.5316, + "step": 9292 + }, + { + "epoch": 0.6995238901748245, + "grad_norm": 6.8122711181640625, + "learning_rate": 2.201340951993736e-05, + "loss": 1.7568, + "step": 9293 + }, + { + "epoch": 0.6995991644554846, + "grad_norm": 5.8783159255981445, + "learning_rate": 2.2003308079536572e-05, + "loss": 1.8163, + "step": 9294 + }, + { + "epoch": 0.6996744387361449, + "grad_norm": 5.432812690734863, + "learning_rate": 2.1993208303458762e-05, + "loss": 1.7845, + "step": 9295 + }, + { + "epoch": 0.699749713016805, + "grad_norm": 5.120323181152344, + "learning_rate": 2.1983110192304374e-05, + "loss": 2.0576, + "step": 9296 + }, + { + "epoch": 0.6998249872974651, + "grad_norm": 3.621363878250122, + "learning_rate": 2.1973013746673682e-05, + "loss": 1.7208, + "step": 9297 + }, + { + "epoch": 0.6999002615781253, + "grad_norm": 5.142535209655762, + "learning_rate": 2.19629189671669e-05, + "loss": 1.9196, + "step": 9298 + }, + { + "epoch": 0.6999755358587855, + "grad_norm": 4.326256275177002, + "learning_rate": 2.195282585438414e-05, + "loss": 1.6816, + "step": 9299 + }, + { + "epoch": 0.7000508101394456, + "grad_norm": 4.8200249671936035, + "learning_rate": 2.1942734408925425e-05, + "loss": 1.6447, + "step": 9300 + }, + { + "epoch": 0.7001260844201057, + "grad_norm": 6.406703948974609, + "learning_rate": 2.1932644631390625e-05, + "loss": 1.4646, + "step": 9301 + }, + { + "epoch": 0.7002013587007659, + "grad_norm": 5.095361232757568, + "learning_rate": 2.1922556522379573e-05, + "loss": 1.9541, + "step": 9302 + }, + { + "epoch": 0.7002766329814261, + "grad_norm": 6.016397953033447, + "learning_rate": 2.191247008249198e-05, + "loss": 1.6406, + "step": 9303 + }, + { + "epoch": 0.7003519072620862, + "grad_norm": 3.97965145111084, + "learning_rate": 2.190238531232744e-05, + "loss": 1.7068, + "step": 9304 + }, + { + "epoch": 0.7004271815427464, + "grad_norm": 4.538355827331543, + "learning_rate": 2.189230221248549e-05, + "loss": 1.5554, + "step": 9305 + }, + { + "epoch": 0.7005024558234065, + "grad_norm": 7.378587245941162, + "learning_rate": 2.18822207835655e-05, + "loss": 1.7149, + "step": 9306 + }, + { + "epoch": 0.7005777301040667, + "grad_norm": 3.9917564392089844, + "learning_rate": 2.1872141026166825e-05, + "loss": 1.8094, + "step": 9307 + }, + { + "epoch": 0.7006530043847269, + "grad_norm": 4.944794178009033, + "learning_rate": 2.1862062940888638e-05, + "loss": 1.5551, + "step": 9308 + }, + { + "epoch": 0.700728278665387, + "grad_norm": 4.743811130523682, + "learning_rate": 2.1851986528330094e-05, + "loss": 1.5143, + "step": 9309 + }, + { + "epoch": 0.7008035529460471, + "grad_norm": 5.676422595977783, + "learning_rate": 2.1841911789090163e-05, + "loss": 1.9789, + "step": 9310 + }, + { + "epoch": 0.7008788272267074, + "grad_norm": 6.224275588989258, + "learning_rate": 2.183183872376779e-05, + "loss": 1.5431, + "step": 9311 + }, + { + "epoch": 0.7009541015073675, + "grad_norm": 4.299226760864258, + "learning_rate": 2.1821767332961772e-05, + "loss": 1.9939, + "step": 9312 + }, + { + "epoch": 0.7010293757880276, + "grad_norm": 7.578540802001953, + "learning_rate": 2.1811697617270854e-05, + "loss": 2.3154, + "step": 9313 + }, + { + "epoch": 0.7011046500686878, + "grad_norm": 5.21992826461792, + "learning_rate": 2.180162957729362e-05, + "loss": 1.8349, + "step": 9314 + }, + { + "epoch": 0.701179924349348, + "grad_norm": 4.469162940979004, + "learning_rate": 2.1791563213628606e-05, + "loss": 1.839, + "step": 9315 + }, + { + "epoch": 0.7012551986300081, + "grad_norm": 4.552899360656738, + "learning_rate": 2.1781498526874205e-05, + "loss": 1.7132, + "step": 9316 + }, + { + "epoch": 0.7013304729106683, + "grad_norm": 7.298826217651367, + "learning_rate": 2.177143551762875e-05, + "loss": 1.6412, + "step": 9317 + }, + { + "epoch": 0.7014057471913284, + "grad_norm": 5.657158851623535, + "learning_rate": 2.176137418649048e-05, + "loss": 2.0058, + "step": 9318 + }, + { + "epoch": 0.7014810214719885, + "grad_norm": 4.395684242248535, + "learning_rate": 2.175131453405746e-05, + "loss": 1.752, + "step": 9319 + }, + { + "epoch": 0.7015562957526487, + "grad_norm": 5.283894062042236, + "learning_rate": 2.1741256560927763e-05, + "loss": 1.5043, + "step": 9320 + }, + { + "epoch": 0.7016315700333089, + "grad_norm": 5.907847881317139, + "learning_rate": 2.173120026769926e-05, + "loss": 1.8634, + "step": 9321 + }, + { + "epoch": 0.701706844313969, + "grad_norm": 8.078811645507812, + "learning_rate": 2.172114565496979e-05, + "loss": 1.6235, + "step": 9322 + }, + { + "epoch": 0.7017821185946291, + "grad_norm": 4.851768970489502, + "learning_rate": 2.171109272333706e-05, + "loss": 1.8159, + "step": 9323 + }, + { + "epoch": 0.7018573928752894, + "grad_norm": 5.2085652351379395, + "learning_rate": 2.1701041473398725e-05, + "loss": 2.0303, + "step": 9324 + }, + { + "epoch": 0.7019326671559495, + "grad_norm": 4.886053085327148, + "learning_rate": 2.1690991905752254e-05, + "loss": 1.7685, + "step": 9325 + }, + { + "epoch": 0.7020079414366096, + "grad_norm": 4.854077339172363, + "learning_rate": 2.168094402099511e-05, + "loss": 1.8122, + "step": 9326 + }, + { + "epoch": 0.7020832157172698, + "grad_norm": 5.279056072235107, + "learning_rate": 2.1670897819724562e-05, + "loss": 2.1024, + "step": 9327 + }, + { + "epoch": 0.70215848999793, + "grad_norm": 5.010796070098877, + "learning_rate": 2.1660853302537874e-05, + "loss": 1.8607, + "step": 9328 + }, + { + "epoch": 0.7022337642785901, + "grad_norm": 4.8485212326049805, + "learning_rate": 2.165081047003213e-05, + "loss": 1.611, + "step": 9329 + }, + { + "epoch": 0.7023090385592503, + "grad_norm": 4.5974555015563965, + "learning_rate": 2.164076932280437e-05, + "loss": 1.7285, + "step": 9330 + }, + { + "epoch": 0.7023843128399104, + "grad_norm": 7.602117538452148, + "learning_rate": 2.1630729861451493e-05, + "loss": 1.6344, + "step": 9331 + }, + { + "epoch": 0.7024595871205705, + "grad_norm": 5.322581768035889, + "learning_rate": 2.162069208657032e-05, + "loss": 1.4554, + "step": 9332 + }, + { + "epoch": 0.7025348614012308, + "grad_norm": 5.387753486633301, + "learning_rate": 2.161065599875757e-05, + "loss": 1.9233, + "step": 9333 + }, + { + "epoch": 0.7026101356818909, + "grad_norm": 4.501760959625244, + "learning_rate": 2.1600621598609865e-05, + "loss": 1.663, + "step": 9334 + }, + { + "epoch": 0.702685409962551, + "grad_norm": 5.881781578063965, + "learning_rate": 2.159058888672374e-05, + "loss": 1.6888, + "step": 9335 + }, + { + "epoch": 0.7027606842432113, + "grad_norm": 4.83047342300415, + "learning_rate": 2.1580557863695573e-05, + "loss": 1.4525, + "step": 9336 + }, + { + "epoch": 0.7028359585238714, + "grad_norm": 4.738838195800781, + "learning_rate": 2.157052853012171e-05, + "loss": 1.7855, + "step": 9337 + }, + { + "epoch": 0.7029112328045315, + "grad_norm": 6.7153401374816895, + "learning_rate": 2.156050088659834e-05, + "loss": 1.9179, + "step": 9338 + }, + { + "epoch": 0.7029865070851916, + "grad_norm": 6.461137771606445, + "learning_rate": 2.1550474933721605e-05, + "loss": 2.0043, + "step": 9339 + }, + { + "epoch": 0.7030617813658518, + "grad_norm": 7.266298294067383, + "learning_rate": 2.1540450672087482e-05, + "loss": 1.9271, + "step": 9340 + }, + { + "epoch": 0.703137055646512, + "grad_norm": 5.673678874969482, + "learning_rate": 2.1530428102291927e-05, + "loss": 1.9622, + "step": 9341 + }, + { + "epoch": 0.7032123299271721, + "grad_norm": 8.27978229522705, + "learning_rate": 2.152040722493072e-05, + "loss": 1.7324, + "step": 9342 + }, + { + "epoch": 0.7032876042078323, + "grad_norm": 4.629066467285156, + "learning_rate": 2.1510388040599584e-05, + "loss": 1.8469, + "step": 9343 + }, + { + "epoch": 0.7033628784884924, + "grad_norm": 8.153464317321777, + "learning_rate": 2.1500370549894135e-05, + "loss": 1.9278, + "step": 9344 + }, + { + "epoch": 0.7034381527691526, + "grad_norm": 5.454888343811035, + "learning_rate": 2.1490354753409892e-05, + "loss": 1.6325, + "step": 9345 + }, + { + "epoch": 0.7035134270498128, + "grad_norm": 4.374019622802734, + "learning_rate": 2.148034065174224e-05, + "loss": 1.7229, + "step": 9346 + }, + { + "epoch": 0.7035887013304729, + "grad_norm": 6.546783447265625, + "learning_rate": 2.1470328245486533e-05, + "loss": 1.6478, + "step": 9347 + }, + { + "epoch": 0.703663975611133, + "grad_norm": 7.149482727050781, + "learning_rate": 2.1460317535237922e-05, + "loss": 1.9492, + "step": 9348 + }, + { + "epoch": 0.7037392498917933, + "grad_norm": 7.192654609680176, + "learning_rate": 2.145030852159155e-05, + "loss": 1.5616, + "step": 9349 + }, + { + "epoch": 0.7038145241724534, + "grad_norm": 3.957714080810547, + "learning_rate": 2.1440301205142433e-05, + "loss": 1.8837, + "step": 9350 + }, + { + "epoch": 0.7038897984531135, + "grad_norm": 4.212893962860107, + "learning_rate": 2.1430295586485443e-05, + "loss": 1.6644, + "step": 9351 + }, + { + "epoch": 0.7039650727337737, + "grad_norm": 5.551982402801514, + "learning_rate": 2.1420291666215426e-05, + "loss": 1.5535, + "step": 9352 + }, + { + "epoch": 0.7040403470144339, + "grad_norm": 5.363429546356201, + "learning_rate": 2.1410289444927045e-05, + "loss": 1.3214, + "step": 9353 + }, + { + "epoch": 0.704115621295094, + "grad_norm": 4.432673454284668, + "learning_rate": 2.140028892321494e-05, + "loss": 1.5352, + "step": 9354 + }, + { + "epoch": 0.7041908955757542, + "grad_norm": 7.830320835113525, + "learning_rate": 2.1390290101673583e-05, + "loss": 1.9992, + "step": 9355 + }, + { + "epoch": 0.7042661698564143, + "grad_norm": 4.241793632507324, + "learning_rate": 2.1380292980897388e-05, + "loss": 1.8128, + "step": 9356 + }, + { + "epoch": 0.7043414441370744, + "grad_norm": 4.632193565368652, + "learning_rate": 2.137029756148065e-05, + "loss": 1.4276, + "step": 9357 + }, + { + "epoch": 0.7044167184177346, + "grad_norm": 4.696080684661865, + "learning_rate": 2.13603038440176e-05, + "loss": 1.5902, + "step": 9358 + }, + { + "epoch": 0.7044919926983948, + "grad_norm": 6.631053924560547, + "learning_rate": 2.1350311829102288e-05, + "loss": 1.904, + "step": 9359 + }, + { + "epoch": 0.7045672669790549, + "grad_norm": 5.032156467437744, + "learning_rate": 2.1340321517328754e-05, + "loss": 1.5573, + "step": 9360 + }, + { + "epoch": 0.704642541259715, + "grad_norm": 3.662193775177002, + "learning_rate": 2.133033290929085e-05, + "loss": 1.646, + "step": 9361 + }, + { + "epoch": 0.7047178155403753, + "grad_norm": 6.491400241851807, + "learning_rate": 2.132034600558241e-05, + "loss": 1.6885, + "step": 9362 + }, + { + "epoch": 0.7047930898210354, + "grad_norm": 5.359855651855469, + "learning_rate": 2.1310360806797092e-05, + "loss": 1.9654, + "step": 9363 + }, + { + "epoch": 0.7048683641016955, + "grad_norm": 3.7614152431488037, + "learning_rate": 2.1300377313528524e-05, + "loss": 2.1606, + "step": 9364 + }, + { + "epoch": 0.7049436383823557, + "grad_norm": 5.248484134674072, + "learning_rate": 2.1290395526370155e-05, + "loss": 1.9529, + "step": 9365 + }, + { + "epoch": 0.7050189126630159, + "grad_norm": 5.110313892364502, + "learning_rate": 2.1280415445915397e-05, + "loss": 1.6505, + "step": 9366 + }, + { + "epoch": 0.705094186943676, + "grad_norm": 4.277717590332031, + "learning_rate": 2.1270437072757538e-05, + "loss": 1.9946, + "step": 9367 + }, + { + "epoch": 0.7051694612243362, + "grad_norm": 3.5830435752868652, + "learning_rate": 2.1260460407489758e-05, + "loss": 1.8986, + "step": 9368 + }, + { + "epoch": 0.7052447355049963, + "grad_norm": 4.895545959472656, + "learning_rate": 2.125048545070516e-05, + "loss": 2.0643, + "step": 9369 + }, + { + "epoch": 0.7053200097856565, + "grad_norm": 4.324821949005127, + "learning_rate": 2.1240512202996695e-05, + "loss": 1.8781, + "step": 9370 + }, + { + "epoch": 0.7053952840663167, + "grad_norm": 6.403765678405762, + "learning_rate": 2.123054066495727e-05, + "loss": 1.6515, + "step": 9371 + }, + { + "epoch": 0.7054705583469768, + "grad_norm": 5.25103235244751, + "learning_rate": 2.1220570837179643e-05, + "loss": 1.6929, + "step": 9372 + }, + { + "epoch": 0.7055458326276369, + "grad_norm": 4.886552810668945, + "learning_rate": 2.1210602720256522e-05, + "loss": 1.7995, + "step": 9373 + }, + { + "epoch": 0.7056211069082972, + "grad_norm": 4.412656784057617, + "learning_rate": 2.1200636314780442e-05, + "loss": 1.6064, + "step": 9374 + }, + { + "epoch": 0.7056963811889573, + "grad_norm": 7.8165364265441895, + "learning_rate": 2.1190671621343916e-05, + "loss": 1.554, + "step": 9375 + }, + { + "epoch": 0.7057716554696174, + "grad_norm": 3.77953839302063, + "learning_rate": 2.118070864053928e-05, + "loss": 1.6703, + "step": 9376 + }, + { + "epoch": 0.7058469297502775, + "grad_norm": 5.9822893142700195, + "learning_rate": 2.1170747372958828e-05, + "loss": 1.8626, + "step": 9377 + }, + { + "epoch": 0.7059222040309377, + "grad_norm": 3.8279495239257812, + "learning_rate": 2.1160787819194715e-05, + "loss": 1.6277, + "step": 9378 + }, + { + "epoch": 0.7059974783115979, + "grad_norm": 5.431642532348633, + "learning_rate": 2.1150829979839043e-05, + "loss": 2.2763, + "step": 9379 + }, + { + "epoch": 0.706072752592258, + "grad_norm": 4.592985153198242, + "learning_rate": 2.1140873855483733e-05, + "loss": 1.786, + "step": 9380 + }, + { + "epoch": 0.7061480268729182, + "grad_norm": 6.455079078674316, + "learning_rate": 2.1130919446720667e-05, + "loss": 1.691, + "step": 9381 + }, + { + "epoch": 0.7062233011535783, + "grad_norm": 5.9986114501953125, + "learning_rate": 2.112096675414162e-05, + "loss": 1.554, + "step": 9382 + }, + { + "epoch": 0.7062985754342385, + "grad_norm": 4.72442102432251, + "learning_rate": 2.1111015778338215e-05, + "loss": 1.8014, + "step": 9383 + }, + { + "epoch": 0.7063738497148987, + "grad_norm": 6.363242149353027, + "learning_rate": 2.110106651990205e-05, + "loss": 1.5556, + "step": 9384 + }, + { + "epoch": 0.7064491239955588, + "grad_norm": 5.460268974304199, + "learning_rate": 2.109111897942455e-05, + "loss": 1.6968, + "step": 9385 + }, + { + "epoch": 0.7065243982762189, + "grad_norm": 5.110371112823486, + "learning_rate": 2.1081173157497086e-05, + "loss": 1.7673, + "step": 9386 + }, + { + "epoch": 0.7065996725568792, + "grad_norm": 5.427928924560547, + "learning_rate": 2.107122905471089e-05, + "loss": 2.2465, + "step": 9387 + }, + { + "epoch": 0.7066749468375393, + "grad_norm": 4.6043219566345215, + "learning_rate": 2.1061286671657116e-05, + "loss": 1.8022, + "step": 9388 + }, + { + "epoch": 0.7067502211181994, + "grad_norm": 5.905806064605713, + "learning_rate": 2.1051346008926815e-05, + "loss": 1.6227, + "step": 9389 + }, + { + "epoch": 0.7068254953988596, + "grad_norm": 5.884878635406494, + "learning_rate": 2.1041407067110953e-05, + "loss": 1.8365, + "step": 9390 + }, + { + "epoch": 0.7069007696795198, + "grad_norm": 6.200535297393799, + "learning_rate": 2.1031469846800327e-05, + "loss": 1.8344, + "step": 9391 + }, + { + "epoch": 0.7069760439601799, + "grad_norm": 5.801346302032471, + "learning_rate": 2.1021534348585718e-05, + "loss": 1.7325, + "step": 9392 + }, + { + "epoch": 0.7070513182408401, + "grad_norm": 8.003877639770508, + "learning_rate": 2.1011600573057722e-05, + "loss": 1.7194, + "step": 9393 + }, + { + "epoch": 0.7071265925215002, + "grad_norm": 6.339446067810059, + "learning_rate": 2.1001668520806917e-05, + "loss": 1.7208, + "step": 9394 + }, + { + "epoch": 0.7072018668021604, + "grad_norm": 3.8153862953186035, + "learning_rate": 2.0991738192423695e-05, + "loss": 1.5206, + "step": 9395 + }, + { + "epoch": 0.7072771410828206, + "grad_norm": 4.933321952819824, + "learning_rate": 2.0981809588498418e-05, + "loss": 1.7913, + "step": 9396 + }, + { + "epoch": 0.7073524153634807, + "grad_norm": 4.997800350189209, + "learning_rate": 2.0971882709621288e-05, + "loss": 2.251, + "step": 9397 + }, + { + "epoch": 0.7074276896441408, + "grad_norm": 4.7735595703125, + "learning_rate": 2.0961957556382434e-05, + "loss": 1.8686, + "step": 9398 + }, + { + "epoch": 0.7075029639248009, + "grad_norm": 4.735471725463867, + "learning_rate": 2.0952034129371884e-05, + "loss": 1.5492, + "step": 9399 + }, + { + "epoch": 0.7075782382054612, + "grad_norm": 5.052907943725586, + "learning_rate": 2.0942112429179556e-05, + "loss": 2.1842, + "step": 9400 + }, + { + "epoch": 0.7076535124861213, + "grad_norm": 4.847354412078857, + "learning_rate": 2.0932192456395288e-05, + "loss": 1.7986, + "step": 9401 + }, + { + "epoch": 0.7077287867667814, + "grad_norm": 4.7606425285339355, + "learning_rate": 2.0922274211608756e-05, + "loss": 1.6958, + "step": 9402 + }, + { + "epoch": 0.7078040610474416, + "grad_norm": 4.672359943389893, + "learning_rate": 2.09123576954096e-05, + "loss": 1.4159, + "step": 9403 + }, + { + "epoch": 0.7078793353281018, + "grad_norm": 5.7129693031311035, + "learning_rate": 2.090244290838731e-05, + "loss": 1.4977, + "step": 9404 + }, + { + "epoch": 0.7079546096087619, + "grad_norm": 5.1066670417785645, + "learning_rate": 2.089252985113131e-05, + "loss": 1.4826, + "step": 9405 + }, + { + "epoch": 0.7080298838894221, + "grad_norm": 4.685865879058838, + "learning_rate": 2.0882618524230875e-05, + "loss": 1.7515, + "step": 9406 + }, + { + "epoch": 0.7081051581700822, + "grad_norm": 4.7058281898498535, + "learning_rate": 2.087270892827524e-05, + "loss": 1.9802, + "step": 9407 + }, + { + "epoch": 0.7081804324507424, + "grad_norm": 8.86685562133789, + "learning_rate": 2.0862801063853466e-05, + "loss": 2.0712, + "step": 9408 + }, + { + "epoch": 0.7082557067314026, + "grad_norm": 5.86874532699585, + "learning_rate": 2.0852894931554586e-05, + "loss": 1.5738, + "step": 9409 + }, + { + "epoch": 0.7083309810120627, + "grad_norm": 4.495116233825684, + "learning_rate": 2.0842990531967444e-05, + "loss": 1.6954, + "step": 9410 + }, + { + "epoch": 0.7084062552927228, + "grad_norm": 5.553970813751221, + "learning_rate": 2.0833087865680855e-05, + "loss": 2.175, + "step": 9411 + }, + { + "epoch": 0.7084815295733831, + "grad_norm": 4.52925443649292, + "learning_rate": 2.0823186933283513e-05, + "loss": 1.4447, + "step": 9412 + }, + { + "epoch": 0.7085568038540432, + "grad_norm": 6.302709102630615, + "learning_rate": 2.081328773536398e-05, + "loss": 1.935, + "step": 9413 + }, + { + "epoch": 0.7086320781347033, + "grad_norm": 5.351108551025391, + "learning_rate": 2.0803390272510764e-05, + "loss": 1.7254, + "step": 9414 + }, + { + "epoch": 0.7087073524153635, + "grad_norm": 4.4544572830200195, + "learning_rate": 2.079349454531221e-05, + "loss": 1.7225, + "step": 9415 + }, + { + "epoch": 0.7087826266960237, + "grad_norm": 6.059580326080322, + "learning_rate": 2.0783600554356615e-05, + "loss": 1.5477, + "step": 9416 + }, + { + "epoch": 0.7088579009766838, + "grad_norm": 4.927691459655762, + "learning_rate": 2.0773708300232118e-05, + "loss": 1.7489, + "step": 9417 + }, + { + "epoch": 0.7089331752573439, + "grad_norm": 5.823075771331787, + "learning_rate": 2.0763817783526824e-05, + "loss": 1.8722, + "step": 9418 + }, + { + "epoch": 0.7090084495380041, + "grad_norm": 4.6229248046875, + "learning_rate": 2.0753929004828654e-05, + "loss": 1.4568, + "step": 9419 + }, + { + "epoch": 0.7090837238186642, + "grad_norm": 5.61832857131958, + "learning_rate": 2.0744041964725508e-05, + "loss": 1.4044, + "step": 9420 + }, + { + "epoch": 0.7091589980993244, + "grad_norm": 6.5735602378845215, + "learning_rate": 2.0734156663805103e-05, + "loss": 1.7421, + "step": 9421 + }, + { + "epoch": 0.7092342723799846, + "grad_norm": 5.72095251083374, + "learning_rate": 2.072427310265511e-05, + "loss": 1.9738, + "step": 9422 + }, + { + "epoch": 0.7093095466606447, + "grad_norm": 6.4231085777282715, + "learning_rate": 2.071439128186309e-05, + "loss": 1.7157, + "step": 9423 + }, + { + "epoch": 0.7093848209413048, + "grad_norm": 5.852846622467041, + "learning_rate": 2.0704511202016485e-05, + "loss": 1.9646, + "step": 9424 + }, + { + "epoch": 0.7094600952219651, + "grad_norm": 5.215975284576416, + "learning_rate": 2.0694632863702618e-05, + "loss": 1.5632, + "step": 9425 + }, + { + "epoch": 0.7095353695026252, + "grad_norm": 5.809387683868408, + "learning_rate": 2.0684756267508755e-05, + "loss": 1.8481, + "step": 9426 + }, + { + "epoch": 0.7096106437832853, + "grad_norm": 5.000277996063232, + "learning_rate": 2.0674881414022002e-05, + "loss": 1.5543, + "step": 9427 + }, + { + "epoch": 0.7096859180639455, + "grad_norm": 5.496491432189941, + "learning_rate": 2.066500830382942e-05, + "loss": 1.8817, + "step": 9428 + }, + { + "epoch": 0.7097611923446057, + "grad_norm": 5.164965629577637, + "learning_rate": 2.0655136937517906e-05, + "loss": 1.8411, + "step": 9429 + }, + { + "epoch": 0.7098364666252658, + "grad_norm": 6.205110549926758, + "learning_rate": 2.0645267315674306e-05, + "loss": 1.7306, + "step": 9430 + }, + { + "epoch": 0.709911740905926, + "grad_norm": 5.23795747756958, + "learning_rate": 2.063539943888535e-05, + "loss": 2.1281, + "step": 9431 + }, + { + "epoch": 0.7099870151865861, + "grad_norm": 7.166508674621582, + "learning_rate": 2.062553330773762e-05, + "loss": 1.9502, + "step": 9432 + }, + { + "epoch": 0.7100622894672463, + "grad_norm": 5.010339260101318, + "learning_rate": 2.0615668922817655e-05, + "loss": 1.8963, + "step": 9433 + }, + { + "epoch": 0.7101375637479065, + "grad_norm": 6.603209495544434, + "learning_rate": 2.0605806284711858e-05, + "loss": 1.7303, + "step": 9434 + }, + { + "epoch": 0.7102128380285666, + "grad_norm": 5.158009052276611, + "learning_rate": 2.0595945394006554e-05, + "loss": 1.4614, + "step": 9435 + }, + { + "epoch": 0.7102881123092267, + "grad_norm": 4.6790008544921875, + "learning_rate": 2.0586086251287907e-05, + "loss": 1.8434, + "step": 9436 + }, + { + "epoch": 0.7103633865898868, + "grad_norm": 7.089626312255859, + "learning_rate": 2.0576228857142056e-05, + "loss": 1.4228, + "step": 9437 + }, + { + "epoch": 0.7104386608705471, + "grad_norm": 5.4256792068481445, + "learning_rate": 2.0566373212154956e-05, + "loss": 1.7713, + "step": 9438 + }, + { + "epoch": 0.7105139351512072, + "grad_norm": 4.999897480010986, + "learning_rate": 2.0556519316912526e-05, + "loss": 1.894, + "step": 9439 + }, + { + "epoch": 0.7105892094318673, + "grad_norm": 4.260997295379639, + "learning_rate": 2.0546667172000524e-05, + "loss": 1.591, + "step": 9440 + }, + { + "epoch": 0.7106644837125276, + "grad_norm": 4.535533905029297, + "learning_rate": 2.0536816778004665e-05, + "loss": 1.3605, + "step": 9441 + }, + { + "epoch": 0.7107397579931877, + "grad_norm": 4.561544895172119, + "learning_rate": 2.0526968135510493e-05, + "loss": 1.7287, + "step": 9442 + }, + { + "epoch": 0.7108150322738478, + "grad_norm": 4.750445365905762, + "learning_rate": 2.05171212451035e-05, + "loss": 1.5893, + "step": 9443 + }, + { + "epoch": 0.710890306554508, + "grad_norm": 4.174236297607422, + "learning_rate": 2.050727610736905e-05, + "loss": 2.0115, + "step": 9444 + }, + { + "epoch": 0.7109655808351681, + "grad_norm": 5.502784252166748, + "learning_rate": 2.0497432722892414e-05, + "loss": 1.6432, + "step": 9445 + }, + { + "epoch": 0.7110408551158283, + "grad_norm": 4.573144912719727, + "learning_rate": 2.0487591092258763e-05, + "loss": 2.0313, + "step": 9446 + }, + { + "epoch": 0.7111161293964885, + "grad_norm": 5.890664100646973, + "learning_rate": 2.0477751216053126e-05, + "loss": 1.8027, + "step": 9447 + }, + { + "epoch": 0.7111914036771486, + "grad_norm": 4.696347713470459, + "learning_rate": 2.0467913094860487e-05, + "loss": 1.9201, + "step": 9448 + }, + { + "epoch": 0.7112666779578087, + "grad_norm": 7.478809356689453, + "learning_rate": 2.045807672926566e-05, + "loss": 1.735, + "step": 9449 + }, + { + "epoch": 0.711341952238469, + "grad_norm": 4.8346405029296875, + "learning_rate": 2.0448242119853427e-05, + "loss": 1.7149, + "step": 9450 + }, + { + "epoch": 0.7114172265191291, + "grad_norm": 5.449156761169434, + "learning_rate": 2.0438409267208387e-05, + "loss": 1.9531, + "step": 9451 + }, + { + "epoch": 0.7114925007997892, + "grad_norm": 5.970377445220947, + "learning_rate": 2.0428578171915107e-05, + "loss": 1.719, + "step": 9452 + }, + { + "epoch": 0.7115677750804494, + "grad_norm": 5.865736484527588, + "learning_rate": 2.0418748834557995e-05, + "loss": 1.7465, + "step": 9453 + }, + { + "epoch": 0.7116430493611096, + "grad_norm": 4.304711818695068, + "learning_rate": 2.040892125572138e-05, + "loss": 1.744, + "step": 9454 + }, + { + "epoch": 0.7117183236417697, + "grad_norm": 4.753633975982666, + "learning_rate": 2.0399095435989514e-05, + "loss": 1.6585, + "step": 9455 + }, + { + "epoch": 0.7117935979224298, + "grad_norm": 5.238807201385498, + "learning_rate": 2.038927137594647e-05, + "loss": 1.6146, + "step": 9456 + }, + { + "epoch": 0.71186887220309, + "grad_norm": 4.112076759338379, + "learning_rate": 2.0379449076176276e-05, + "loss": 1.5485, + "step": 9457 + }, + { + "epoch": 0.7119441464837502, + "grad_norm": 4.629601955413818, + "learning_rate": 2.0369628537262863e-05, + "loss": 1.6979, + "step": 9458 + }, + { + "epoch": 0.7120194207644103, + "grad_norm": 5.083816051483154, + "learning_rate": 2.035980975979e-05, + "loss": 1.8969, + "step": 9459 + }, + { + "epoch": 0.7120946950450705, + "grad_norm": 6.646997928619385, + "learning_rate": 2.034999274434142e-05, + "loss": 1.8507, + "step": 9460 + }, + { + "epoch": 0.7121699693257306, + "grad_norm": 7.1772589683532715, + "learning_rate": 2.034017749150067e-05, + "loss": 1.9429, + "step": 9461 + }, + { + "epoch": 0.7122452436063907, + "grad_norm": 5.474501132965088, + "learning_rate": 2.0330364001851264e-05, + "loss": 1.6245, + "step": 9462 + }, + { + "epoch": 0.712320517887051, + "grad_norm": 4.639760971069336, + "learning_rate": 2.032055227597661e-05, + "loss": 1.9059, + "step": 9463 + }, + { + "epoch": 0.7123957921677111, + "grad_norm": 4.972564220428467, + "learning_rate": 2.0310742314459945e-05, + "loss": 1.9239, + "step": 9464 + }, + { + "epoch": 0.7124710664483712, + "grad_norm": 2.9669077396392822, + "learning_rate": 2.030093411788448e-05, + "loss": 1.5783, + "step": 9465 + }, + { + "epoch": 0.7125463407290314, + "grad_norm": 5.708822250366211, + "learning_rate": 2.0291127686833244e-05, + "loss": 1.7934, + "step": 9466 + }, + { + "epoch": 0.7126216150096916, + "grad_norm": 5.984284400939941, + "learning_rate": 2.028132302188922e-05, + "loss": 1.5982, + "step": 9467 + }, + { + "epoch": 0.7126968892903517, + "grad_norm": 5.550839424133301, + "learning_rate": 2.027152012363528e-05, + "loss": 2.0099, + "step": 9468 + }, + { + "epoch": 0.7127721635710119, + "grad_norm": 5.770566463470459, + "learning_rate": 2.0261718992654176e-05, + "loss": 1.6989, + "step": 9469 + }, + { + "epoch": 0.712847437851672, + "grad_norm": 5.042080402374268, + "learning_rate": 2.0251919629528538e-05, + "loss": 1.6973, + "step": 9470 + }, + { + "epoch": 0.7129227121323322, + "grad_norm": 4.344055652618408, + "learning_rate": 2.0242122034840932e-05, + "loss": 1.4789, + "step": 9471 + }, + { + "epoch": 0.7129979864129924, + "grad_norm": 5.328199863433838, + "learning_rate": 2.0232326209173774e-05, + "loss": 1.7635, + "step": 9472 + }, + { + "epoch": 0.7130732606936525, + "grad_norm": 4.057014465332031, + "learning_rate": 2.022253215310943e-05, + "loss": 1.8215, + "step": 9473 + }, + { + "epoch": 0.7131485349743126, + "grad_norm": 4.870148658752441, + "learning_rate": 2.0212739867230086e-05, + "loss": 1.7192, + "step": 9474 + }, + { + "epoch": 0.7132238092549729, + "grad_norm": 5.1111040115356445, + "learning_rate": 2.0202949352117905e-05, + "loss": 1.3829, + "step": 9475 + }, + { + "epoch": 0.713299083535633, + "grad_norm": 8.54655933380127, + "learning_rate": 2.019316060835487e-05, + "loss": 1.5274, + "step": 9476 + }, + { + "epoch": 0.7133743578162931, + "grad_norm": 4.405472278594971, + "learning_rate": 2.0183373636522912e-05, + "loss": 1.6771, + "step": 9477 + }, + { + "epoch": 0.7134496320969532, + "grad_norm": 4.744894981384277, + "learning_rate": 2.0173588437203838e-05, + "loss": 2.0716, + "step": 9478 + }, + { + "epoch": 0.7135249063776135, + "grad_norm": 4.948408126831055, + "learning_rate": 2.016380501097935e-05, + "loss": 1.9256, + "step": 9479 + }, + { + "epoch": 0.7136001806582736, + "grad_norm": 4.117366313934326, + "learning_rate": 2.015402335843107e-05, + "loss": 1.6762, + "step": 9480 + }, + { + "epoch": 0.7136754549389337, + "grad_norm": 5.52851676940918, + "learning_rate": 2.0144243480140433e-05, + "loss": 1.9827, + "step": 9481 + }, + { + "epoch": 0.7137507292195939, + "grad_norm": 5.923187255859375, + "learning_rate": 2.0134465376688877e-05, + "loss": 1.4375, + "step": 9482 + }, + { + "epoch": 0.713826003500254, + "grad_norm": 8.230806350708008, + "learning_rate": 2.0124689048657646e-05, + "loss": 1.7676, + "step": 9483 + }, + { + "epoch": 0.7139012777809142, + "grad_norm": 6.832233428955078, + "learning_rate": 2.0114914496627944e-05, + "loss": 1.8914, + "step": 9484 + }, + { + "epoch": 0.7139765520615744, + "grad_norm": 4.676365375518799, + "learning_rate": 2.010514172118081e-05, + "loss": 1.9585, + "step": 9485 + }, + { + "epoch": 0.7140518263422345, + "grad_norm": 4.52425479888916, + "learning_rate": 2.009537072289724e-05, + "loss": 2.1216, + "step": 9486 + }, + { + "epoch": 0.7141271006228946, + "grad_norm": 4.042599678039551, + "learning_rate": 2.0085601502358052e-05, + "loss": 1.4409, + "step": 9487 + }, + { + "epoch": 0.7142023749035549, + "grad_norm": 6.027821063995361, + "learning_rate": 2.0075834060144027e-05, + "loss": 1.6192, + "step": 9488 + }, + { + "epoch": 0.714277649184215, + "grad_norm": 6.656979084014893, + "learning_rate": 2.0066068396835803e-05, + "loss": 1.6756, + "step": 9489 + }, + { + "epoch": 0.7143529234648751, + "grad_norm": 9.03135871887207, + "learning_rate": 2.005630451301394e-05, + "loss": 1.7149, + "step": 9490 + }, + { + "epoch": 0.7144281977455353, + "grad_norm": 5.291358470916748, + "learning_rate": 2.004654240925884e-05, + "loss": 1.7536, + "step": 9491 + }, + { + "epoch": 0.7145034720261955, + "grad_norm": 4.695072650909424, + "learning_rate": 2.0036782086150862e-05, + "loss": 1.488, + "step": 9492 + }, + { + "epoch": 0.7145787463068556, + "grad_norm": 4.534849166870117, + "learning_rate": 2.0027023544270195e-05, + "loss": 1.7671, + "step": 9493 + }, + { + "epoch": 0.7146540205875158, + "grad_norm": 4.625223159790039, + "learning_rate": 2.001726678419698e-05, + "loss": 1.7271, + "step": 9494 + }, + { + "epoch": 0.7147292948681759, + "grad_norm": 4.820035934448242, + "learning_rate": 2.000751180651124e-05, + "loss": 1.8365, + "step": 9495 + }, + { + "epoch": 0.714804569148836, + "grad_norm": 5.447136402130127, + "learning_rate": 1.9997758611792855e-05, + "loss": 1.5714, + "step": 9496 + }, + { + "epoch": 0.7148798434294962, + "grad_norm": 4.278191566467285, + "learning_rate": 1.9988007200621646e-05, + "loss": 1.746, + "step": 9497 + }, + { + "epoch": 0.7149551177101564, + "grad_norm": 4.140084743499756, + "learning_rate": 1.9978257573577275e-05, + "loss": 1.4296, + "step": 9498 + }, + { + "epoch": 0.7150303919908165, + "grad_norm": 4.942025184631348, + "learning_rate": 1.9968509731239354e-05, + "loss": 2.2987, + "step": 9499 + }, + { + "epoch": 0.7151056662714766, + "grad_norm": 4.47216796875, + "learning_rate": 1.9958763674187358e-05, + "loss": 1.7226, + "step": 9500 + }, + { + "epoch": 0.7151809405521369, + "grad_norm": 4.665167331695557, + "learning_rate": 1.994901940300068e-05, + "loss": 1.5928, + "step": 9501 + }, + { + "epoch": 0.715256214832797, + "grad_norm": 7.131214618682861, + "learning_rate": 1.9939276918258554e-05, + "loss": 1.5387, + "step": 9502 + }, + { + "epoch": 0.7153314891134571, + "grad_norm": 5.923559665679932, + "learning_rate": 1.992953622054018e-05, + "loss": 2.2767, + "step": 9503 + }, + { + "epoch": 0.7154067633941174, + "grad_norm": 5.1621246337890625, + "learning_rate": 1.9919797310424582e-05, + "loss": 1.633, + "step": 9504 + }, + { + "epoch": 0.7154820376747775, + "grad_norm": 5.4784064292907715, + "learning_rate": 1.9910060188490738e-05, + "loss": 2.1133, + "step": 9505 + }, + { + "epoch": 0.7155573119554376, + "grad_norm": 4.585684776306152, + "learning_rate": 1.990032485531746e-05, + "loss": 1.8184, + "step": 9506 + }, + { + "epoch": 0.7156325862360978, + "grad_norm": 5.093260765075684, + "learning_rate": 1.989059131148352e-05, + "loss": 1.6507, + "step": 9507 + }, + { + "epoch": 0.7157078605167579, + "grad_norm": 4.967860698699951, + "learning_rate": 1.9880859557567517e-05, + "loss": 2.1291, + "step": 9508 + }, + { + "epoch": 0.7157831347974181, + "grad_norm": 5.453738212585449, + "learning_rate": 1.9871129594148015e-05, + "loss": 2.3147, + "step": 9509 + }, + { + "epoch": 0.7158584090780783, + "grad_norm": 5.732362747192383, + "learning_rate": 1.9861401421803382e-05, + "loss": 1.7411, + "step": 9510 + }, + { + "epoch": 0.7159336833587384, + "grad_norm": 5.740506172180176, + "learning_rate": 1.985167504111196e-05, + "loss": 1.581, + "step": 9511 + }, + { + "epoch": 0.7160089576393985, + "grad_norm": 5.705687999725342, + "learning_rate": 1.984195045265195e-05, + "loss": 1.6964, + "step": 9512 + }, + { + "epoch": 0.7160842319200588, + "grad_norm": 5.087823867797852, + "learning_rate": 1.983222765700146e-05, + "loss": 1.6311, + "step": 9513 + }, + { + "epoch": 0.7161595062007189, + "grad_norm": 4.713496208190918, + "learning_rate": 1.9822506654738486e-05, + "loss": 1.5901, + "step": 9514 + }, + { + "epoch": 0.716234780481379, + "grad_norm": 4.316542625427246, + "learning_rate": 1.981278744644089e-05, + "loss": 1.6445, + "step": 9515 + }, + { + "epoch": 0.7163100547620391, + "grad_norm": 6.0693793296813965, + "learning_rate": 1.9803070032686478e-05, + "loss": 1.3688, + "step": 9516 + }, + { + "epoch": 0.7163853290426994, + "grad_norm": 5.99887752532959, + "learning_rate": 1.979335441405289e-05, + "loss": 1.9262, + "step": 9517 + }, + { + "epoch": 0.7164606033233595, + "grad_norm": 8.41556453704834, + "learning_rate": 1.9783640591117725e-05, + "loss": 2.0765, + "step": 9518 + }, + { + "epoch": 0.7165358776040196, + "grad_norm": 5.457451820373535, + "learning_rate": 1.977392856445841e-05, + "loss": 1.5663, + "step": 9519 + }, + { + "epoch": 0.7166111518846798, + "grad_norm": 4.922848701477051, + "learning_rate": 1.9764218334652335e-05, + "loss": 1.9317, + "step": 9520 + }, + { + "epoch": 0.71668642616534, + "grad_norm": 5.458019733428955, + "learning_rate": 1.9754509902276704e-05, + "loss": 1.6215, + "step": 9521 + }, + { + "epoch": 0.7167617004460001, + "grad_norm": 5.272371292114258, + "learning_rate": 1.9744803267908674e-05, + "loss": 1.6082, + "step": 9522 + }, + { + "epoch": 0.7168369747266603, + "grad_norm": 5.434672832489014, + "learning_rate": 1.9735098432125277e-05, + "loss": 1.6452, + "step": 9523 + }, + { + "epoch": 0.7169122490073204, + "grad_norm": 4.719828128814697, + "learning_rate": 1.972539539550346e-05, + "loss": 1.6626, + "step": 9524 + }, + { + "epoch": 0.7169875232879805, + "grad_norm": 5.53287410736084, + "learning_rate": 1.9715694158619996e-05, + "loss": 1.7293, + "step": 9525 + }, + { + "epoch": 0.7170627975686408, + "grad_norm": 3.6611194610595703, + "learning_rate": 1.9705994722051624e-05, + "loss": 1.5275, + "step": 9526 + }, + { + "epoch": 0.7171380718493009, + "grad_norm": 5.8244147300720215, + "learning_rate": 1.969629708637496e-05, + "loss": 1.5163, + "step": 9527 + }, + { + "epoch": 0.717213346129961, + "grad_norm": 4.741801738739014, + "learning_rate": 1.968660125216646e-05, + "loss": 1.7332, + "step": 9528 + }, + { + "epoch": 0.7172886204106212, + "grad_norm": 5.14818811416626, + "learning_rate": 1.967690722000256e-05, + "loss": 2.0653, + "step": 9529 + }, + { + "epoch": 0.7173638946912814, + "grad_norm": 4.6582112312316895, + "learning_rate": 1.9667214990459497e-05, + "loss": 1.8544, + "step": 9530 + }, + { + "epoch": 0.7174391689719415, + "grad_norm": 4.6874823570251465, + "learning_rate": 1.965752456411349e-05, + "loss": 1.5251, + "step": 9531 + }, + { + "epoch": 0.7175144432526017, + "grad_norm": 7.363040447235107, + "learning_rate": 1.9647835941540565e-05, + "loss": 2.0454, + "step": 9532 + }, + { + "epoch": 0.7175897175332618, + "grad_norm": 5.054626941680908, + "learning_rate": 1.9638149123316706e-05, + "loss": 1.5206, + "step": 9533 + }, + { + "epoch": 0.717664991813922, + "grad_norm": 5.105851650238037, + "learning_rate": 1.9628464110017757e-05, + "loss": 1.8735, + "step": 9534 + }, + { + "epoch": 0.7177402660945821, + "grad_norm": 5.140401363372803, + "learning_rate": 1.9618780902219496e-05, + "loss": 1.8935, + "step": 9535 + }, + { + "epoch": 0.7178155403752423, + "grad_norm": 6.223511695861816, + "learning_rate": 1.9609099500497517e-05, + "loss": 1.7419, + "step": 9536 + }, + { + "epoch": 0.7178908146559024, + "grad_norm": 4.152301788330078, + "learning_rate": 1.959941990542739e-05, + "loss": 1.6642, + "step": 9537 + }, + { + "epoch": 0.7179660889365626, + "grad_norm": 4.429924964904785, + "learning_rate": 1.9589742117584502e-05, + "loss": 2.1958, + "step": 9538 + }, + { + "epoch": 0.7180413632172228, + "grad_norm": 5.276669025421143, + "learning_rate": 1.9580066137544208e-05, + "loss": 1.6808, + "step": 9539 + }, + { + "epoch": 0.7181166374978829, + "grad_norm": 6.9061760902404785, + "learning_rate": 1.9570391965881675e-05, + "loss": 1.4457, + "step": 9540 + }, + { + "epoch": 0.718191911778543, + "grad_norm": 5.002562999725342, + "learning_rate": 1.9560719603172046e-05, + "loss": 1.8005, + "step": 9541 + }, + { + "epoch": 0.7182671860592033, + "grad_norm": 5.824179172515869, + "learning_rate": 1.955104904999028e-05, + "loss": 1.4862, + "step": 9542 + }, + { + "epoch": 0.7183424603398634, + "grad_norm": 4.308703422546387, + "learning_rate": 1.9541380306911276e-05, + "loss": 1.85, + "step": 9543 + }, + { + "epoch": 0.7184177346205235, + "grad_norm": 4.935798645019531, + "learning_rate": 1.9531713374509824e-05, + "loss": 1.6276, + "step": 9544 + }, + { + "epoch": 0.7184930089011837, + "grad_norm": 4.291970729827881, + "learning_rate": 1.9522048253360587e-05, + "loss": 1.8772, + "step": 9545 + }, + { + "epoch": 0.7185682831818438, + "grad_norm": 6.710949897766113, + "learning_rate": 1.9512384944038148e-05, + "loss": 2.0798, + "step": 9546 + }, + { + "epoch": 0.718643557462504, + "grad_norm": 4.978475570678711, + "learning_rate": 1.9502723447116927e-05, + "loss": 2.1588, + "step": 9547 + }, + { + "epoch": 0.7187188317431642, + "grad_norm": 5.360921859741211, + "learning_rate": 1.949306376317131e-05, + "loss": 1.5614, + "step": 9548 + }, + { + "epoch": 0.7187941060238243, + "grad_norm": 6.072622299194336, + "learning_rate": 1.9483405892775495e-05, + "loss": 2.1187, + "step": 9549 + }, + { + "epoch": 0.7188693803044844, + "grad_norm": 5.077350616455078, + "learning_rate": 1.9473749836503657e-05, + "loss": 1.2447, + "step": 9550 + }, + { + "epoch": 0.7189446545851447, + "grad_norm": 6.823755264282227, + "learning_rate": 1.9464095594929788e-05, + "loss": 2.0138, + "step": 9551 + }, + { + "epoch": 0.7190199288658048, + "grad_norm": 4.885786533355713, + "learning_rate": 1.9454443168627833e-05, + "loss": 1.775, + "step": 9552 + }, + { + "epoch": 0.7190952031464649, + "grad_norm": 5.651880741119385, + "learning_rate": 1.944479255817157e-05, + "loss": 1.7802, + "step": 9553 + }, + { + "epoch": 0.719170477427125, + "grad_norm": 7.052658557891846, + "learning_rate": 1.9435143764134718e-05, + "loss": 2.1312, + "step": 9554 + }, + { + "epoch": 0.7192457517077853, + "grad_norm": 4.349106788635254, + "learning_rate": 1.942549678709087e-05, + "loss": 1.8467, + "step": 9555 + }, + { + "epoch": 0.7193210259884454, + "grad_norm": 4.216656684875488, + "learning_rate": 1.9415851627613525e-05, + "loss": 1.5086, + "step": 9556 + }, + { + "epoch": 0.7193963002691055, + "grad_norm": 7.126668453216553, + "learning_rate": 1.9406208286276024e-05, + "loss": 1.9477, + "step": 9557 + }, + { + "epoch": 0.7194715745497657, + "grad_norm": 5.728780746459961, + "learning_rate": 1.9396566763651658e-05, + "loss": 1.5526, + "step": 9558 + }, + { + "epoch": 0.7195468488304259, + "grad_norm": 6.8929033279418945, + "learning_rate": 1.9386927060313607e-05, + "loss": 1.2448, + "step": 9559 + }, + { + "epoch": 0.719622123111086, + "grad_norm": 5.203591346740723, + "learning_rate": 1.9377289176834884e-05, + "loss": 1.7012, + "step": 9560 + }, + { + "epoch": 0.7196973973917462, + "grad_norm": 5.053387641906738, + "learning_rate": 1.9367653113788474e-05, + "loss": 1.5202, + "step": 9561 + }, + { + "epoch": 0.7197726716724063, + "grad_norm": 7.34931755065918, + "learning_rate": 1.9358018871747176e-05, + "loss": 1.6993, + "step": 9562 + }, + { + "epoch": 0.7198479459530664, + "grad_norm": 4.3718366622924805, + "learning_rate": 1.9348386451283746e-05, + "loss": 1.5393, + "step": 9563 + }, + { + "epoch": 0.7199232202337267, + "grad_norm": 4.59337043762207, + "learning_rate": 1.933875585297078e-05, + "loss": 1.5855, + "step": 9564 + }, + { + "epoch": 0.7199984945143868, + "grad_norm": 5.202847957611084, + "learning_rate": 1.9329127077380815e-05, + "loss": 1.8224, + "step": 9565 + }, + { + "epoch": 0.7200737687950469, + "grad_norm": 7.64265775680542, + "learning_rate": 1.9319500125086228e-05, + "loss": 2.4433, + "step": 9566 + }, + { + "epoch": 0.7201490430757072, + "grad_norm": 5.492217063903809, + "learning_rate": 1.9309874996659317e-05, + "loss": 1.5909, + "step": 9567 + }, + { + "epoch": 0.7202243173563673, + "grad_norm": 7.977818012237549, + "learning_rate": 1.9300251692672283e-05, + "loss": 2.2989, + "step": 9568 + }, + { + "epoch": 0.7202995916370274, + "grad_norm": 4.588866710662842, + "learning_rate": 1.929063021369722e-05, + "loss": 1.6004, + "step": 9569 + }, + { + "epoch": 0.7203748659176876, + "grad_norm": 5.409287929534912, + "learning_rate": 1.928101056030605e-05, + "loss": 1.7466, + "step": 9570 + }, + { + "epoch": 0.7204501401983477, + "grad_norm": 5.490162372589111, + "learning_rate": 1.9271392733070686e-05, + "loss": 1.7118, + "step": 9571 + }, + { + "epoch": 0.7205254144790079, + "grad_norm": 5.397826194763184, + "learning_rate": 1.926177673256283e-05, + "loss": 1.6493, + "step": 9572 + }, + { + "epoch": 0.7206006887596681, + "grad_norm": 4.545472145080566, + "learning_rate": 1.925216255935417e-05, + "loss": 1.5836, + "step": 9573 + }, + { + "epoch": 0.7206759630403282, + "grad_norm": 5.01995325088501, + "learning_rate": 1.9242550214016204e-05, + "loss": 1.8362, + "step": 9574 + }, + { + "epoch": 0.7207512373209883, + "grad_norm": 5.7508544921875, + "learning_rate": 1.9232939697120374e-05, + "loss": 1.7253, + "step": 9575 + }, + { + "epoch": 0.7208265116016485, + "grad_norm": 4.7725090980529785, + "learning_rate": 1.922333100923801e-05, + "loss": 1.8204, + "step": 9576 + }, + { + "epoch": 0.7209017858823087, + "grad_norm": 5.888092994689941, + "learning_rate": 1.92137241509403e-05, + "loss": 2.2328, + "step": 9577 + }, + { + "epoch": 0.7209770601629688, + "grad_norm": 5.053342819213867, + "learning_rate": 1.9204119122798347e-05, + "loss": 1.7214, + "step": 9578 + }, + { + "epoch": 0.7210523344436289, + "grad_norm": 6.660046100616455, + "learning_rate": 1.919451592538315e-05, + "loss": 2.006, + "step": 9579 + }, + { + "epoch": 0.7211276087242892, + "grad_norm": 3.9272639751434326, + "learning_rate": 1.918491455926561e-05, + "loss": 1.7116, + "step": 9580 + }, + { + "epoch": 0.7212028830049493, + "grad_norm": 6.261285305023193, + "learning_rate": 1.9175315025016464e-05, + "loss": 1.8894, + "step": 9581 + }, + { + "epoch": 0.7212781572856094, + "grad_norm": 4.320059776306152, + "learning_rate": 1.9165717323206412e-05, + "loss": 1.5943, + "step": 9582 + }, + { + "epoch": 0.7213534315662696, + "grad_norm": 3.753704786300659, + "learning_rate": 1.9156121454405968e-05, + "loss": 2.1705, + "step": 9583 + }, + { + "epoch": 0.7214287058469298, + "grad_norm": 5.66665506362915, + "learning_rate": 1.914652741918563e-05, + "loss": 1.5607, + "step": 9584 + }, + { + "epoch": 0.7215039801275899, + "grad_norm": 4.248273849487305, + "learning_rate": 1.9136935218115682e-05, + "loss": 1.7863, + "step": 9585 + }, + { + "epoch": 0.7215792544082501, + "grad_norm": 5.0871992111206055, + "learning_rate": 1.91273448517664e-05, + "loss": 1.7524, + "step": 9586 + }, + { + "epoch": 0.7216545286889102, + "grad_norm": 7.492625713348389, + "learning_rate": 1.9117756320707865e-05, + "loss": 1.8665, + "step": 9587 + }, + { + "epoch": 0.7217298029695703, + "grad_norm": 5.716367721557617, + "learning_rate": 1.9108169625510107e-05, + "loss": 1.6759, + "step": 9588 + }, + { + "epoch": 0.7218050772502306, + "grad_norm": 4.533812522888184, + "learning_rate": 1.9098584766743026e-05, + "loss": 1.6845, + "step": 9589 + }, + { + "epoch": 0.7218803515308907, + "grad_norm": 4.361556053161621, + "learning_rate": 1.9089001744976413e-05, + "loss": 1.8053, + "step": 9590 + }, + { + "epoch": 0.7219556258115508, + "grad_norm": 5.910179615020752, + "learning_rate": 1.907942056077997e-05, + "loss": 1.8072, + "step": 9591 + }, + { + "epoch": 0.722030900092211, + "grad_norm": 6.202566146850586, + "learning_rate": 1.906984121472324e-05, + "loss": 1.9143, + "step": 9592 + }, + { + "epoch": 0.7221061743728712, + "grad_norm": 4.104458332061768, + "learning_rate": 1.9060263707375713e-05, + "loss": 2.2607, + "step": 9593 + }, + { + "epoch": 0.7221814486535313, + "grad_norm": 6.638364315032959, + "learning_rate": 1.9050688039306717e-05, + "loss": 1.5661, + "step": 9594 + }, + { + "epoch": 0.7222567229341914, + "grad_norm": 5.149166584014893, + "learning_rate": 1.9041114211085538e-05, + "loss": 1.64, + "step": 9595 + }, + { + "epoch": 0.7223319972148516, + "grad_norm": 6.063036918640137, + "learning_rate": 1.9031542223281263e-05, + "loss": 1.6034, + "step": 9596 + }, + { + "epoch": 0.7224072714955118, + "grad_norm": 5.441456317901611, + "learning_rate": 1.9021972076462964e-05, + "loss": 1.9747, + "step": 9597 + }, + { + "epoch": 0.7224825457761719, + "grad_norm": 5.859633445739746, + "learning_rate": 1.9012403771199517e-05, + "loss": 1.819, + "step": 9598 + }, + { + "epoch": 0.7225578200568321, + "grad_norm": 7.120880126953125, + "learning_rate": 1.9002837308059755e-05, + "loss": 1.8416, + "step": 9599 + }, + { + "epoch": 0.7226330943374922, + "grad_norm": 4.530207633972168, + "learning_rate": 1.8993272687612378e-05, + "loss": 1.8415, + "step": 9600 + }, + { + "epoch": 0.7227083686181524, + "grad_norm": 3.640749931335449, + "learning_rate": 1.898370991042599e-05, + "loss": 1.9136, + "step": 9601 + }, + { + "epoch": 0.7227836428988126, + "grad_norm": 3.6428956985473633, + "learning_rate": 1.897414897706903e-05, + "loss": 1.431, + "step": 9602 + }, + { + "epoch": 0.7228589171794727, + "grad_norm": 4.3585710525512695, + "learning_rate": 1.8964589888109906e-05, + "loss": 1.6025, + "step": 9603 + }, + { + "epoch": 0.7229341914601328, + "grad_norm": 4.545854091644287, + "learning_rate": 1.8955032644116848e-05, + "loss": 1.4056, + "step": 9604 + }, + { + "epoch": 0.7230094657407931, + "grad_norm": 6.152764320373535, + "learning_rate": 1.8945477245658038e-05, + "loss": 1.5699, + "step": 9605 + }, + { + "epoch": 0.7230847400214532, + "grad_norm": 4.762167930603027, + "learning_rate": 1.8935923693301488e-05, + "loss": 1.9865, + "step": 9606 + }, + { + "epoch": 0.7231600143021133, + "grad_norm": 4.807002544403076, + "learning_rate": 1.8926371987615137e-05, + "loss": 1.8307, + "step": 9607 + }, + { + "epoch": 0.7232352885827735, + "grad_norm": 5.177624225616455, + "learning_rate": 1.8916822129166827e-05, + "loss": 1.7896, + "step": 9608 + }, + { + "epoch": 0.7233105628634336, + "grad_norm": 4.05759334564209, + "learning_rate": 1.8907274118524236e-05, + "loss": 1.6902, + "step": 9609 + }, + { + "epoch": 0.7233858371440938, + "grad_norm": 4.612060070037842, + "learning_rate": 1.8897727956255007e-05, + "loss": 1.5394, + "step": 9610 + }, + { + "epoch": 0.723461111424754, + "grad_norm": 5.294699192047119, + "learning_rate": 1.888818364292659e-05, + "loss": 1.6194, + "step": 9611 + }, + { + "epoch": 0.7235363857054141, + "grad_norm": 9.087019920349121, + "learning_rate": 1.8878641179106375e-05, + "loss": 1.5872, + "step": 9612 + }, + { + "epoch": 0.7236116599860742, + "grad_norm": 5.4589643478393555, + "learning_rate": 1.8869100565361657e-05, + "loss": 1.2749, + "step": 9613 + }, + { + "epoch": 0.7236869342667344, + "grad_norm": 4.834293842315674, + "learning_rate": 1.8859561802259597e-05, + "loss": 1.6789, + "step": 9614 + }, + { + "epoch": 0.7237622085473946, + "grad_norm": 5.562376976013184, + "learning_rate": 1.8850024890367224e-05, + "loss": 1.8636, + "step": 9615 + }, + { + "epoch": 0.7238374828280547, + "grad_norm": 5.088202476501465, + "learning_rate": 1.88404898302515e-05, + "loss": 1.8992, + "step": 9616 + }, + { + "epoch": 0.7239127571087148, + "grad_norm": 4.807105541229248, + "learning_rate": 1.883095662247924e-05, + "loss": 1.7993, + "step": 9617 + }, + { + "epoch": 0.7239880313893751, + "grad_norm": 3.7315304279327393, + "learning_rate": 1.8821425267617187e-05, + "loss": 2.0528, + "step": 9618 + }, + { + "epoch": 0.7240633056700352, + "grad_norm": 5.1169962882995605, + "learning_rate": 1.8811895766231928e-05, + "loss": 1.492, + "step": 9619 + }, + { + "epoch": 0.7241385799506953, + "grad_norm": 4.946092128753662, + "learning_rate": 1.880236811888999e-05, + "loss": 2.0553, + "step": 9620 + }, + { + "epoch": 0.7242138542313555, + "grad_norm": 4.723944187164307, + "learning_rate": 1.879284232615774e-05, + "loss": 1.8918, + "step": 9621 + }, + { + "epoch": 0.7242891285120157, + "grad_norm": 4.518237590789795, + "learning_rate": 1.8783318388601463e-05, + "loss": 1.619, + "step": 9622 + }, + { + "epoch": 0.7243644027926758, + "grad_norm": 5.302652359008789, + "learning_rate": 1.8773796306787343e-05, + "loss": 1.7057, + "step": 9623 + }, + { + "epoch": 0.724439677073336, + "grad_norm": 4.588483810424805, + "learning_rate": 1.8764276081281428e-05, + "loss": 1.7534, + "step": 9624 + }, + { + "epoch": 0.7245149513539961, + "grad_norm": 3.4573464393615723, + "learning_rate": 1.87547577126497e-05, + "loss": 1.8733, + "step": 9625 + }, + { + "epoch": 0.7245902256346562, + "grad_norm": 6.490103721618652, + "learning_rate": 1.8745241201457954e-05, + "loss": 1.9235, + "step": 9626 + }, + { + "epoch": 0.7246654999153165, + "grad_norm": 4.740297794342041, + "learning_rate": 1.8735726548271947e-05, + "loss": 1.7179, + "step": 9627 + }, + { + "epoch": 0.7247407741959766, + "grad_norm": 6.027243137359619, + "learning_rate": 1.8726213753657278e-05, + "loss": 1.7502, + "step": 9628 + }, + { + "epoch": 0.7248160484766367, + "grad_norm": 7.567425727844238, + "learning_rate": 1.8716702818179487e-05, + "loss": 1.8367, + "step": 9629 + }, + { + "epoch": 0.724891322757297, + "grad_norm": 6.478000164031982, + "learning_rate": 1.8707193742403935e-05, + "loss": 1.5227, + "step": 9630 + }, + { + "epoch": 0.7249665970379571, + "grad_norm": 5.904613018035889, + "learning_rate": 1.869768652689594e-05, + "loss": 2.163, + "step": 9631 + }, + { + "epoch": 0.7250418713186172, + "grad_norm": 4.555305004119873, + "learning_rate": 1.868818117222065e-05, + "loss": 1.8282, + "step": 9632 + }, + { + "epoch": 0.7251171455992773, + "grad_norm": 7.761171817779541, + "learning_rate": 1.867867767894314e-05, + "loss": 2.0681, + "step": 9633 + }, + { + "epoch": 0.7251924198799375, + "grad_norm": 5.859729290008545, + "learning_rate": 1.8669176047628373e-05, + "loss": 1.7864, + "step": 9634 + }, + { + "epoch": 0.7252676941605977, + "grad_norm": 5.73459529876709, + "learning_rate": 1.8659676278841215e-05, + "loss": 1.5884, + "step": 9635 + }, + { + "epoch": 0.7253429684412578, + "grad_norm": 3.531911849975586, + "learning_rate": 1.8650178373146355e-05, + "loss": 1.8256, + "step": 9636 + }, + { + "epoch": 0.725418242721918, + "grad_norm": 4.602025032043457, + "learning_rate": 1.864068233110845e-05, + "loss": 1.6444, + "step": 9637 + }, + { + "epoch": 0.7254935170025781, + "grad_norm": 4.323998928070068, + "learning_rate": 1.863118815329199e-05, + "loss": 1.8074, + "step": 9638 + }, + { + "epoch": 0.7255687912832383, + "grad_norm": 4.131141185760498, + "learning_rate": 1.8621695840261388e-05, + "loss": 1.725, + "step": 9639 + }, + { + "epoch": 0.7256440655638985, + "grad_norm": 7.421967029571533, + "learning_rate": 1.8612205392580945e-05, + "loss": 1.8095, + "step": 9640 + }, + { + "epoch": 0.7257193398445586, + "grad_norm": 5.806467056274414, + "learning_rate": 1.860271681081482e-05, + "loss": 1.6861, + "step": 9641 + }, + { + "epoch": 0.7257946141252187, + "grad_norm": 5.778765678405762, + "learning_rate": 1.8593230095527104e-05, + "loss": 1.3898, + "step": 9642 + }, + { + "epoch": 0.725869888405879, + "grad_norm": 5.036045551300049, + "learning_rate": 1.8583745247281724e-05, + "loss": 1.8877, + "step": 9643 + }, + { + "epoch": 0.7259451626865391, + "grad_norm": 4.849656105041504, + "learning_rate": 1.8574262266642552e-05, + "loss": 1.8789, + "step": 9644 + }, + { + "epoch": 0.7260204369671992, + "grad_norm": 4.712091445922852, + "learning_rate": 1.8564781154173312e-05, + "loss": 1.4646, + "step": 9645 + }, + { + "epoch": 0.7260957112478594, + "grad_norm": 4.8300042152404785, + "learning_rate": 1.855530191043765e-05, + "loss": 1.5845, + "step": 9646 + }, + { + "epoch": 0.7261709855285196, + "grad_norm": 5.635161876678467, + "learning_rate": 1.854582453599905e-05, + "loss": 1.3405, + "step": 9647 + }, + { + "epoch": 0.7262462598091797, + "grad_norm": 4.434541702270508, + "learning_rate": 1.8536349031420953e-05, + "loss": 1.7525, + "step": 9648 + }, + { + "epoch": 0.7263215340898399, + "grad_norm": 6.554038047790527, + "learning_rate": 1.8526875397266603e-05, + "loss": 1.963, + "step": 9649 + }, + { + "epoch": 0.7263968083705, + "grad_norm": 6.015099048614502, + "learning_rate": 1.8517403634099217e-05, + "loss": 1.6413, + "step": 9650 + }, + { + "epoch": 0.7264720826511601, + "grad_norm": 3.994659185409546, + "learning_rate": 1.850793374248184e-05, + "loss": 1.5221, + "step": 9651 + }, + { + "epoch": 0.7265473569318203, + "grad_norm": 6.556643486022949, + "learning_rate": 1.849846572297746e-05, + "loss": 1.8464, + "step": 9652 + }, + { + "epoch": 0.7266226312124805, + "grad_norm": 5.417644023895264, + "learning_rate": 1.8488999576148886e-05, + "loss": 1.8422, + "step": 9653 + }, + { + "epoch": 0.7266979054931406, + "grad_norm": 6.207387924194336, + "learning_rate": 1.8479535302558872e-05, + "loss": 1.9829, + "step": 9654 + }, + { + "epoch": 0.7267731797738007, + "grad_norm": 5.782745838165283, + "learning_rate": 1.8470072902770037e-05, + "loss": 2.1712, + "step": 9655 + }, + { + "epoch": 0.726848454054461, + "grad_norm": 4.438650131225586, + "learning_rate": 1.846061237734492e-05, + "loss": 1.9534, + "step": 9656 + }, + { + "epoch": 0.7269237283351211, + "grad_norm": 5.083627700805664, + "learning_rate": 1.8451153726845882e-05, + "loss": 1.7894, + "step": 9657 + }, + { + "epoch": 0.7269990026157812, + "grad_norm": 5.782078266143799, + "learning_rate": 1.8441696951835235e-05, + "loss": 2.0339, + "step": 9658 + }, + { + "epoch": 0.7270742768964414, + "grad_norm": 7.519563674926758, + "learning_rate": 1.8432242052875165e-05, + "loss": 1.9038, + "step": 9659 + }, + { + "epoch": 0.7271495511771016, + "grad_norm": 5.788137435913086, + "learning_rate": 1.8422789030527714e-05, + "loss": 2.0252, + "step": 9660 + }, + { + "epoch": 0.7272248254577617, + "grad_norm": 5.3881964683532715, + "learning_rate": 1.8413337885354866e-05, + "loss": 1.7703, + "step": 9661 + }, + { + "epoch": 0.7273000997384219, + "grad_norm": 6.170146465301514, + "learning_rate": 1.840388861791843e-05, + "loss": 1.4191, + "step": 9662 + }, + { + "epoch": 0.727375374019082, + "grad_norm": 7.8503828048706055, + "learning_rate": 1.8394441228780174e-05, + "loss": 1.5262, + "step": 9663 + }, + { + "epoch": 0.7274506482997422, + "grad_norm": 4.970616817474365, + "learning_rate": 1.8384995718501685e-05, + "loss": 1.5646, + "step": 9664 + }, + { + "epoch": 0.7275259225804024, + "grad_norm": 6.081812858581543, + "learning_rate": 1.8375552087644497e-05, + "loss": 2.199, + "step": 9665 + }, + { + "epoch": 0.7276011968610625, + "grad_norm": 5.441454887390137, + "learning_rate": 1.8366110336769987e-05, + "loss": 1.9577, + "step": 9666 + }, + { + "epoch": 0.7276764711417226, + "grad_norm": 5.96256685256958, + "learning_rate": 1.8356670466439446e-05, + "loss": 1.6481, + "step": 9667 + }, + { + "epoch": 0.7277517454223829, + "grad_norm": 5.123640060424805, + "learning_rate": 1.8347232477214053e-05, + "loss": 2.1115, + "step": 9668 + }, + { + "epoch": 0.727827019703043, + "grad_norm": 4.599729061126709, + "learning_rate": 1.8337796369654887e-05, + "loss": 1.7672, + "step": 9669 + }, + { + "epoch": 0.7279022939837031, + "grad_norm": 5.109320163726807, + "learning_rate": 1.832836214432286e-05, + "loss": 1.7815, + "step": 9670 + }, + { + "epoch": 0.7279775682643633, + "grad_norm": 4.701651573181152, + "learning_rate": 1.8318929801778824e-05, + "loss": 1.4773, + "step": 9671 + }, + { + "epoch": 0.7280528425450234, + "grad_norm": 5.379658222198486, + "learning_rate": 1.8309499342583525e-05, + "loss": 1.4647, + "step": 9672 + }, + { + "epoch": 0.7281281168256836, + "grad_norm": 4.223718166351318, + "learning_rate": 1.830007076729754e-05, + "loss": 1.5286, + "step": 9673 + }, + { + "epoch": 0.7282033911063437, + "grad_norm": 5.562403678894043, + "learning_rate": 1.829064407648141e-05, + "loss": 2.0676, + "step": 9674 + }, + { + "epoch": 0.7282786653870039, + "grad_norm": 4.373466968536377, + "learning_rate": 1.8281219270695493e-05, + "loss": 1.7127, + "step": 9675 + }, + { + "epoch": 0.728353939667664, + "grad_norm": 6.073401927947998, + "learning_rate": 1.827179635050009e-05, + "loss": 1.8229, + "step": 9676 + }, + { + "epoch": 0.7284292139483242, + "grad_norm": 5.517831325531006, + "learning_rate": 1.8262375316455343e-05, + "loss": 1.8406, + "step": 9677 + }, + { + "epoch": 0.7285044882289844, + "grad_norm": 4.2269673347473145, + "learning_rate": 1.8252956169121315e-05, + "loss": 1.7179, + "step": 9678 + }, + { + "epoch": 0.7285797625096445, + "grad_norm": 5.474207401275635, + "learning_rate": 1.8243538909057954e-05, + "loss": 1.8164, + "step": 9679 + }, + { + "epoch": 0.7286550367903046, + "grad_norm": 4.730332851409912, + "learning_rate": 1.82341235368251e-05, + "loss": 1.5695, + "step": 9680 + }, + { + "epoch": 0.7287303110709649, + "grad_norm": 5.244772911071777, + "learning_rate": 1.8224710052982435e-05, + "loss": 1.6626, + "step": 9681 + }, + { + "epoch": 0.728805585351625, + "grad_norm": 6.432041168212891, + "learning_rate": 1.8215298458089607e-05, + "loss": 1.6311, + "step": 9682 + }, + { + "epoch": 0.7288808596322851, + "grad_norm": 4.665215492248535, + "learning_rate": 1.8205888752706062e-05, + "loss": 2.0177, + "step": 9683 + }, + { + "epoch": 0.7289561339129453, + "grad_norm": 7.389789581298828, + "learning_rate": 1.8196480937391223e-05, + "loss": 2.1725, + "step": 9684 + }, + { + "epoch": 0.7290314081936055, + "grad_norm": 4.367339134216309, + "learning_rate": 1.8187075012704324e-05, + "loss": 1.3324, + "step": 9685 + }, + { + "epoch": 0.7291066824742656, + "grad_norm": 4.733195781707764, + "learning_rate": 1.8177670979204546e-05, + "loss": 2.1111, + "step": 9686 + }, + { + "epoch": 0.7291819567549258, + "grad_norm": 6.2214674949646, + "learning_rate": 1.8168268837450907e-05, + "loss": 2.127, + "step": 9687 + }, + { + "epoch": 0.7292572310355859, + "grad_norm": 5.486142635345459, + "learning_rate": 1.815886858800235e-05, + "loss": 1.9937, + "step": 9688 + }, + { + "epoch": 0.729332505316246, + "grad_norm": 6.762661457061768, + "learning_rate": 1.8149470231417686e-05, + "loss": 1.57, + "step": 9689 + }, + { + "epoch": 0.7294077795969063, + "grad_norm": 5.990834712982178, + "learning_rate": 1.8140073768255633e-05, + "loss": 1.9127, + "step": 9690 + }, + { + "epoch": 0.7294830538775664, + "grad_norm": 5.218294620513916, + "learning_rate": 1.8130679199074798e-05, + "loss": 2.1038, + "step": 9691 + }, + { + "epoch": 0.7295583281582265, + "grad_norm": 4.288217544555664, + "learning_rate": 1.8121286524433616e-05, + "loss": 1.5043, + "step": 9692 + }, + { + "epoch": 0.7296336024388866, + "grad_norm": 7.890143394470215, + "learning_rate": 1.81118957448905e-05, + "loss": 1.9132, + "step": 9693 + }, + { + "epoch": 0.7297088767195469, + "grad_norm": 5.704329490661621, + "learning_rate": 1.810250686100367e-05, + "loss": 1.8966, + "step": 9694 + }, + { + "epoch": 0.729784151000207, + "grad_norm": 6.121559143066406, + "learning_rate": 1.8093119873331297e-05, + "loss": 1.6732, + "step": 9695 + }, + { + "epoch": 0.7298594252808671, + "grad_norm": 7.57781457901001, + "learning_rate": 1.808373478243138e-05, + "loss": 1.9503, + "step": 9696 + }, + { + "epoch": 0.7299346995615273, + "grad_norm": 5.74470329284668, + "learning_rate": 1.8074351588861876e-05, + "loss": 1.7946, + "step": 9697 + }, + { + "epoch": 0.7300099738421875, + "grad_norm": 6.562565326690674, + "learning_rate": 1.8064970293180545e-05, + "loss": 2.2749, + "step": 9698 + }, + { + "epoch": 0.7300852481228476, + "grad_norm": 4.184543132781982, + "learning_rate": 1.805559089594509e-05, + "loss": 1.797, + "step": 9699 + }, + { + "epoch": 0.7301605224035078, + "grad_norm": 6.909560680389404, + "learning_rate": 1.8046213397713108e-05, + "loss": 2.0133, + "step": 9700 + }, + { + "epoch": 0.7302357966841679, + "grad_norm": 5.985777378082275, + "learning_rate": 1.803683779904206e-05, + "loss": 1.5298, + "step": 9701 + }, + { + "epoch": 0.7303110709648281, + "grad_norm": 6.221324443817139, + "learning_rate": 1.8027464100489283e-05, + "loss": 1.646, + "step": 9702 + }, + { + "epoch": 0.7303863452454883, + "grad_norm": 8.32680892944336, + "learning_rate": 1.801809230261203e-05, + "loss": 1.7285, + "step": 9703 + }, + { + "epoch": 0.7304616195261484, + "grad_norm": 5.151854038238525, + "learning_rate": 1.800872240596743e-05, + "loss": 1.9519, + "step": 9704 + }, + { + "epoch": 0.7305368938068085, + "grad_norm": 4.809290885925293, + "learning_rate": 1.799935441111248e-05, + "loss": 1.7571, + "step": 9705 + }, + { + "epoch": 0.7306121680874688, + "grad_norm": 5.311567783355713, + "learning_rate": 1.79899883186041e-05, + "loss": 1.7058, + "step": 9706 + }, + { + "epoch": 0.7306874423681289, + "grad_norm": 5.552243709564209, + "learning_rate": 1.7980624128999056e-05, + "loss": 1.871, + "step": 9707 + }, + { + "epoch": 0.730762716648789, + "grad_norm": 5.807781219482422, + "learning_rate": 1.7971261842854048e-05, + "loss": 2.0432, + "step": 9708 + }, + { + "epoch": 0.7308379909294492, + "grad_norm": 4.923655033111572, + "learning_rate": 1.7961901460725605e-05, + "loss": 1.6863, + "step": 9709 + }, + { + "epoch": 0.7309132652101094, + "grad_norm": 4.194489002227783, + "learning_rate": 1.7952542983170213e-05, + "loss": 1.4531, + "step": 9710 + }, + { + "epoch": 0.7309885394907695, + "grad_norm": 4.47189474105835, + "learning_rate": 1.794318641074417e-05, + "loss": 2.0195, + "step": 9711 + }, + { + "epoch": 0.7310638137714296, + "grad_norm": 5.246617794036865, + "learning_rate": 1.793383174400371e-05, + "loss": 1.6853, + "step": 9712 + }, + { + "epoch": 0.7311390880520898, + "grad_norm": 5.105777263641357, + "learning_rate": 1.7924478983504945e-05, + "loss": 1.833, + "step": 9713 + }, + { + "epoch": 0.73121436233275, + "grad_norm": 5.721435070037842, + "learning_rate": 1.7915128129803886e-05, + "loss": 1.5561, + "step": 9714 + }, + { + "epoch": 0.7312896366134101, + "grad_norm": 4.135849952697754, + "learning_rate": 1.7905779183456382e-05, + "loss": 1.9762, + "step": 9715 + }, + { + "epoch": 0.7313649108940703, + "grad_norm": 3.937908411026001, + "learning_rate": 1.789643214501824e-05, + "loss": 1.6637, + "step": 9716 + }, + { + "epoch": 0.7314401851747304, + "grad_norm": 4.459949970245361, + "learning_rate": 1.788708701504506e-05, + "loss": 1.8036, + "step": 9717 + }, + { + "epoch": 0.7315154594553905, + "grad_norm": 4.623850345611572, + "learning_rate": 1.7877743794092444e-05, + "loss": 1.6307, + "step": 9718 + }, + { + "epoch": 0.7315907337360508, + "grad_norm": 4.850218296051025, + "learning_rate": 1.7868402482715768e-05, + "loss": 1.7574, + "step": 9719 + }, + { + "epoch": 0.7316660080167109, + "grad_norm": 5.499302864074707, + "learning_rate": 1.7859063081470372e-05, + "loss": 2.0765, + "step": 9720 + }, + { + "epoch": 0.731741282297371, + "grad_norm": 3.726660966873169, + "learning_rate": 1.784972559091147e-05, + "loss": 1.3902, + "step": 9721 + }, + { + "epoch": 0.7318165565780312, + "grad_norm": 4.583266258239746, + "learning_rate": 1.784039001159411e-05, + "loss": 1.7323, + "step": 9722 + }, + { + "epoch": 0.7318918308586914, + "grad_norm": 4.42882776260376, + "learning_rate": 1.7831056344073293e-05, + "loss": 1.6156, + "step": 9723 + }, + { + "epoch": 0.7319671051393515, + "grad_norm": 4.100247383117676, + "learning_rate": 1.782172458890387e-05, + "loss": 1.7967, + "step": 9724 + }, + { + "epoch": 0.7320423794200117, + "grad_norm": 4.469841957092285, + "learning_rate": 1.781239474664061e-05, + "loss": 1.5035, + "step": 9725 + }, + { + "epoch": 0.7321176537006718, + "grad_norm": 4.846963882446289, + "learning_rate": 1.7803066817838103e-05, + "loss": 1.9044, + "step": 9726 + }, + { + "epoch": 0.732192927981332, + "grad_norm": 7.62241792678833, + "learning_rate": 1.779374080305092e-05, + "loss": 1.5622, + "step": 9727 + }, + { + "epoch": 0.7322682022619922, + "grad_norm": 6.226736068725586, + "learning_rate": 1.7784416702833407e-05, + "loss": 1.3241, + "step": 9728 + }, + { + "epoch": 0.7323434765426523, + "grad_norm": 5.41347074508667, + "learning_rate": 1.7775094517739903e-05, + "loss": 1.9273, + "step": 9729 + }, + { + "epoch": 0.7324187508233124, + "grad_norm": 3.618593454360962, + "learning_rate": 1.776577424832455e-05, + "loss": 1.4416, + "step": 9730 + }, + { + "epoch": 0.7324940251039725, + "grad_norm": 4.909494400024414, + "learning_rate": 1.775645589514145e-05, + "loss": 1.445, + "step": 9731 + }, + { + "epoch": 0.7325692993846328, + "grad_norm": 5.621603012084961, + "learning_rate": 1.7747139458744504e-05, + "loss": 1.7553, + "step": 9732 + }, + { + "epoch": 0.7326445736652929, + "grad_norm": 5.214666366577148, + "learning_rate": 1.773782493968758e-05, + "loss": 2.0385, + "step": 9733 + }, + { + "epoch": 0.732719847945953, + "grad_norm": 5.866427898406982, + "learning_rate": 1.772851233852439e-05, + "loss": 1.931, + "step": 9734 + }, + { + "epoch": 0.7327951222266132, + "grad_norm": 5.117861747741699, + "learning_rate": 1.7719201655808565e-05, + "loss": 1.8607, + "step": 9735 + }, + { + "epoch": 0.7328703965072734, + "grad_norm": 4.039570331573486, + "learning_rate": 1.7709892892093554e-05, + "loss": 1.9952, + "step": 9736 + }, + { + "epoch": 0.7329456707879335, + "grad_norm": 8.141228675842285, + "learning_rate": 1.7700586047932764e-05, + "loss": 1.8911, + "step": 9737 + }, + { + "epoch": 0.7330209450685937, + "grad_norm": 5.21773624420166, + "learning_rate": 1.7691281123879472e-05, + "loss": 1.7537, + "step": 9738 + }, + { + "epoch": 0.7330962193492538, + "grad_norm": 4.155019760131836, + "learning_rate": 1.76819781204868e-05, + "loss": 1.9437, + "step": 9739 + }, + { + "epoch": 0.733171493629914, + "grad_norm": 4.629144668579102, + "learning_rate": 1.7672677038307812e-05, + "loss": 1.4311, + "step": 9740 + }, + { + "epoch": 0.7332467679105742, + "grad_norm": 3.8458335399627686, + "learning_rate": 1.7663377877895397e-05, + "loss": 1.6115, + "step": 9741 + }, + { + "epoch": 0.7333220421912343, + "grad_norm": 6.913977146148682, + "learning_rate": 1.76540806398024e-05, + "loss": 1.4588, + "step": 9742 + }, + { + "epoch": 0.7333973164718944, + "grad_norm": 5.407314777374268, + "learning_rate": 1.7644785324581487e-05, + "loss": 1.8244, + "step": 9743 + }, + { + "epoch": 0.7334725907525547, + "grad_norm": 6.0443243980407715, + "learning_rate": 1.763549193278524e-05, + "loss": 2.0189, + "step": 9744 + }, + { + "epoch": 0.7335478650332148, + "grad_norm": 6.7276930809021, + "learning_rate": 1.762620046496614e-05, + "loss": 2.0751, + "step": 9745 + }, + { + "epoch": 0.7336231393138749, + "grad_norm": 5.041492462158203, + "learning_rate": 1.7616910921676548e-05, + "loss": 2.0146, + "step": 9746 + }, + { + "epoch": 0.7336984135945351, + "grad_norm": 7.375744342803955, + "learning_rate": 1.760762330346867e-05, + "loss": 1.5188, + "step": 9747 + }, + { + "epoch": 0.7337736878751953, + "grad_norm": 5.783930778503418, + "learning_rate": 1.759833761089465e-05, + "loss": 1.531, + "step": 9748 + }, + { + "epoch": 0.7338489621558554, + "grad_norm": 7.4266462326049805, + "learning_rate": 1.7589053844506477e-05, + "loss": 1.3724, + "step": 9749 + }, + { + "epoch": 0.7339242364365156, + "grad_norm": 6.140977382659912, + "learning_rate": 1.757977200485607e-05, + "loss": 1.8341, + "step": 9750 + }, + { + "epoch": 0.7339995107171757, + "grad_norm": 4.919197082519531, + "learning_rate": 1.7570492092495183e-05, + "loss": 1.9131, + "step": 9751 + }, + { + "epoch": 0.7340747849978358, + "grad_norm": 4.211818695068359, + "learning_rate": 1.7561214107975483e-05, + "loss": 1.5347, + "step": 9752 + }, + { + "epoch": 0.734150059278496, + "grad_norm": 4.7609453201293945, + "learning_rate": 1.7551938051848548e-05, + "loss": 1.5824, + "step": 9753 + }, + { + "epoch": 0.7342253335591562, + "grad_norm": 5.219226360321045, + "learning_rate": 1.7542663924665776e-05, + "loss": 1.7613, + "step": 9754 + }, + { + "epoch": 0.7343006078398163, + "grad_norm": 4.831563472747803, + "learning_rate": 1.7533391726978498e-05, + "loss": 1.4846, + "step": 9755 + }, + { + "epoch": 0.7343758821204764, + "grad_norm": 4.237647533416748, + "learning_rate": 1.752412145933793e-05, + "loss": 1.6977, + "step": 9756 + }, + { + "epoch": 0.7344511564011367, + "grad_norm": 4.2744903564453125, + "learning_rate": 1.751485312229517e-05, + "loss": 1.4849, + "step": 9757 + }, + { + "epoch": 0.7345264306817968, + "grad_norm": 5.690421104431152, + "learning_rate": 1.7505586716401167e-05, + "loss": 1.8325, + "step": 9758 + }, + { + "epoch": 0.7346017049624569, + "grad_norm": 3.9903831481933594, + "learning_rate": 1.7496322242206815e-05, + "loss": 1.8537, + "step": 9759 + }, + { + "epoch": 0.7346769792431171, + "grad_norm": 5.13058614730835, + "learning_rate": 1.7487059700262824e-05, + "loss": 1.656, + "step": 9760 + }, + { + "epoch": 0.7347522535237773, + "grad_norm": 6.390394687652588, + "learning_rate": 1.7477799091119862e-05, + "loss": 2.1095, + "step": 9761 + }, + { + "epoch": 0.7348275278044374, + "grad_norm": 5.219070911407471, + "learning_rate": 1.7468540415328406e-05, + "loss": 1.887, + "step": 9762 + }, + { + "epoch": 0.7349028020850976, + "grad_norm": 4.456096172332764, + "learning_rate": 1.74592836734389e-05, + "loss": 1.5901, + "step": 9763 + }, + { + "epoch": 0.7349780763657577, + "grad_norm": 6.045371055603027, + "learning_rate": 1.745002886600159e-05, + "loss": 1.5719, + "step": 9764 + }, + { + "epoch": 0.7350533506464179, + "grad_norm": 6.996026039123535, + "learning_rate": 1.744077599356669e-05, + "loss": 1.7844, + "step": 9765 + }, + { + "epoch": 0.7351286249270781, + "grad_norm": 6.744865417480469, + "learning_rate": 1.7431525056684213e-05, + "loss": 1.5956, + "step": 9766 + }, + { + "epoch": 0.7352038992077382, + "grad_norm": 4.752002239227295, + "learning_rate": 1.742227605590412e-05, + "loss": 1.807, + "step": 9767 + }, + { + "epoch": 0.7352791734883983, + "grad_norm": 4.950200080871582, + "learning_rate": 1.7413028991776242e-05, + "loss": 1.8698, + "step": 9768 + }, + { + "epoch": 0.7353544477690586, + "grad_norm": 5.587347984313965, + "learning_rate": 1.7403783864850286e-05, + "loss": 1.4786, + "step": 9769 + }, + { + "epoch": 0.7354297220497187, + "grad_norm": 7.138562202453613, + "learning_rate": 1.7394540675675868e-05, + "loss": 1.8078, + "step": 9770 + }, + { + "epoch": 0.7355049963303788, + "grad_norm": 3.674849271774292, + "learning_rate": 1.7385299424802437e-05, + "loss": 1.6273, + "step": 9771 + }, + { + "epoch": 0.7355802706110389, + "grad_norm": 4.589719772338867, + "learning_rate": 1.737606011277939e-05, + "loss": 1.7456, + "step": 9772 + }, + { + "epoch": 0.7356555448916992, + "grad_norm": 5.046091556549072, + "learning_rate": 1.736682274015594e-05, + "loss": 1.4479, + "step": 9773 + }, + { + "epoch": 0.7357308191723593, + "grad_norm": 6.175132751464844, + "learning_rate": 1.7357587307481267e-05, + "loss": 1.821, + "step": 9774 + }, + { + "epoch": 0.7358060934530194, + "grad_norm": 5.633509635925293, + "learning_rate": 1.7348353815304343e-05, + "loss": 1.8415, + "step": 9775 + }, + { + "epoch": 0.7358813677336796, + "grad_norm": 3.9048843383789062, + "learning_rate": 1.7339122264174118e-05, + "loss": 1.7794, + "step": 9776 + }, + { + "epoch": 0.7359566420143397, + "grad_norm": 4.080097198486328, + "learning_rate": 1.732989265463934e-05, + "loss": 1.6807, + "step": 9777 + }, + { + "epoch": 0.7360319162949999, + "grad_norm": 3.6625871658325195, + "learning_rate": 1.73206649872487e-05, + "loss": 1.3563, + "step": 9778 + }, + { + "epoch": 0.7361071905756601, + "grad_norm": 6.474795341491699, + "learning_rate": 1.7311439262550766e-05, + "loss": 1.8562, + "step": 9779 + }, + { + "epoch": 0.7361824648563202, + "grad_norm": 6.221617221832275, + "learning_rate": 1.7302215481093985e-05, + "loss": 1.92, + "step": 9780 + }, + { + "epoch": 0.7362577391369803, + "grad_norm": 4.830767631530762, + "learning_rate": 1.7292993643426657e-05, + "loss": 1.7613, + "step": 9781 + }, + { + "epoch": 0.7363330134176406, + "grad_norm": 4.695906162261963, + "learning_rate": 1.728377375009703e-05, + "loss": 2.1972, + "step": 9782 + }, + { + "epoch": 0.7364082876983007, + "grad_norm": 5.0376691818237305, + "learning_rate": 1.7274555801653163e-05, + "loss": 1.6508, + "step": 9783 + }, + { + "epoch": 0.7364835619789608, + "grad_norm": 4.550879001617432, + "learning_rate": 1.7265339798643048e-05, + "loss": 1.7659, + "step": 9784 + }, + { + "epoch": 0.736558836259621, + "grad_norm": 6.037420749664307, + "learning_rate": 1.725612574161458e-05, + "loss": 1.7059, + "step": 9785 + }, + { + "epoch": 0.7366341105402812, + "grad_norm": 4.423812389373779, + "learning_rate": 1.7246913631115465e-05, + "loss": 1.4304, + "step": 9786 + }, + { + "epoch": 0.7367093848209413, + "grad_norm": 4.588987350463867, + "learning_rate": 1.7237703467693366e-05, + "loss": 1.7778, + "step": 9787 + }, + { + "epoch": 0.7367846591016015, + "grad_norm": 5.821567058563232, + "learning_rate": 1.7228495251895783e-05, + "loss": 1.7919, + "step": 9788 + }, + { + "epoch": 0.7368599333822616, + "grad_norm": 5.66328239440918, + "learning_rate": 1.721928898427012e-05, + "loss": 1.2932, + "step": 9789 + }, + { + "epoch": 0.7369352076629218, + "grad_norm": 5.165868282318115, + "learning_rate": 1.7210084665363668e-05, + "loss": 1.6868, + "step": 9790 + }, + { + "epoch": 0.7370104819435819, + "grad_norm": 3.956026554107666, + "learning_rate": 1.720088229572362e-05, + "loss": 1.405, + "step": 9791 + }, + { + "epoch": 0.7370857562242421, + "grad_norm": 4.367198467254639, + "learning_rate": 1.7191681875896986e-05, + "loss": 2.0717, + "step": 9792 + }, + { + "epoch": 0.7371610305049022, + "grad_norm": 4.24954891204834, + "learning_rate": 1.7182483406430745e-05, + "loss": 1.9531, + "step": 9793 + }, + { + "epoch": 0.7372363047855623, + "grad_norm": 3.9967422485351562, + "learning_rate": 1.7173286887871687e-05, + "loss": 1.5159, + "step": 9794 + }, + { + "epoch": 0.7373115790662226, + "grad_norm": 5.425422191619873, + "learning_rate": 1.7164092320766544e-05, + "loss": 2.0148, + "step": 9795 + }, + { + "epoch": 0.7373868533468827, + "grad_norm": 5.593850612640381, + "learning_rate": 1.7154899705661886e-05, + "loss": 1.7512, + "step": 9796 + }, + { + "epoch": 0.7374621276275428, + "grad_norm": 4.212367057800293, + "learning_rate": 1.7145709043104208e-05, + "loss": 1.8903, + "step": 9797 + }, + { + "epoch": 0.737537401908203, + "grad_norm": 5.770796298980713, + "learning_rate": 1.7136520333639843e-05, + "loss": 1.5749, + "step": 9798 + }, + { + "epoch": 0.7376126761888632, + "grad_norm": 5.1938323974609375, + "learning_rate": 1.7127333577815053e-05, + "loss": 1.5588, + "step": 9799 + }, + { + "epoch": 0.7376879504695233, + "grad_norm": 5.713388442993164, + "learning_rate": 1.711814877617595e-05, + "loss": 1.7488, + "step": 9800 + }, + { + "epoch": 0.7377632247501835, + "grad_norm": 4.531913757324219, + "learning_rate": 1.710896592926856e-05, + "loss": 1.8826, + "step": 9801 + }, + { + "epoch": 0.7378384990308436, + "grad_norm": 5.700654029846191, + "learning_rate": 1.7099785037638787e-05, + "loss": 1.7525, + "step": 9802 + }, + { + "epoch": 0.7379137733115038, + "grad_norm": 4.287419319152832, + "learning_rate": 1.709060610183238e-05, + "loss": 1.74, + "step": 9803 + }, + { + "epoch": 0.737989047592164, + "grad_norm": 5.163872718811035, + "learning_rate": 1.7081429122395022e-05, + "loss": 1.6885, + "step": 9804 + }, + { + "epoch": 0.7380643218728241, + "grad_norm": 4.883586883544922, + "learning_rate": 1.707225409987224e-05, + "loss": 1.5009, + "step": 9805 + }, + { + "epoch": 0.7381395961534842, + "grad_norm": 4.405183792114258, + "learning_rate": 1.706308103480948e-05, + "loss": 1.681, + "step": 9806 + }, + { + "epoch": 0.7382148704341445, + "grad_norm": 5.201859951019287, + "learning_rate": 1.7053909927752034e-05, + "loss": 1.4759, + "step": 9807 + }, + { + "epoch": 0.7382901447148046, + "grad_norm": 6.630100250244141, + "learning_rate": 1.7044740779245123e-05, + "loss": 2.0427, + "step": 9808 + }, + { + "epoch": 0.7383654189954647, + "grad_norm": 6.883792877197266, + "learning_rate": 1.7035573589833798e-05, + "loss": 1.7807, + "step": 9809 + }, + { + "epoch": 0.7384406932761248, + "grad_norm": 6.82187557220459, + "learning_rate": 1.7026408360063035e-05, + "loss": 1.5923, + "step": 9810 + }, + { + "epoch": 0.7385159675567851, + "grad_norm": 4.41224479675293, + "learning_rate": 1.70172450904777e-05, + "loss": 1.869, + "step": 9811 + }, + { + "epoch": 0.7385912418374452, + "grad_norm": 5.27655029296875, + "learning_rate": 1.7008083781622496e-05, + "loss": 1.7183, + "step": 9812 + }, + { + "epoch": 0.7386665161181053, + "grad_norm": 6.023240566253662, + "learning_rate": 1.699892443404204e-05, + "loss": 1.632, + "step": 9813 + }, + { + "epoch": 0.7387417903987655, + "grad_norm": 5.507640838623047, + "learning_rate": 1.698976704828086e-05, + "loss": 2.0998, + "step": 9814 + }, + { + "epoch": 0.7388170646794257, + "grad_norm": 6.4250335693359375, + "learning_rate": 1.6980611624883285e-05, + "loss": 1.3663, + "step": 9815 + }, + { + "epoch": 0.7388923389600858, + "grad_norm": 7.229237079620361, + "learning_rate": 1.6971458164393618e-05, + "loss": 1.7778, + "step": 9816 + }, + { + "epoch": 0.738967613240746, + "grad_norm": 4.294672966003418, + "learning_rate": 1.6962306667355997e-05, + "loss": 1.5861, + "step": 9817 + }, + { + "epoch": 0.7390428875214061, + "grad_norm": 5.85977840423584, + "learning_rate": 1.695315713431444e-05, + "loss": 2.0294, + "step": 9818 + }, + { + "epoch": 0.7391181618020662, + "grad_norm": 5.06605863571167, + "learning_rate": 1.6944009565812886e-05, + "loss": 1.8145, + "step": 9819 + }, + { + "epoch": 0.7391934360827265, + "grad_norm": 6.896977424621582, + "learning_rate": 1.69348639623951e-05, + "loss": 1.9041, + "step": 9820 + }, + { + "epoch": 0.7392687103633866, + "grad_norm": 6.381049633026123, + "learning_rate": 1.692572032460479e-05, + "loss": 1.6344, + "step": 9821 + }, + { + "epoch": 0.7393439846440467, + "grad_norm": 4.932339191436768, + "learning_rate": 1.6916578652985488e-05, + "loss": 2.0262, + "step": 9822 + }, + { + "epoch": 0.739419258924707, + "grad_norm": 4.843674182891846, + "learning_rate": 1.690743894808066e-05, + "loss": 1.6609, + "step": 9823 + }, + { + "epoch": 0.7394945332053671, + "grad_norm": 7.585464954376221, + "learning_rate": 1.6898301210433633e-05, + "loss": 1.6574, + "step": 9824 + }, + { + "epoch": 0.7395698074860272, + "grad_norm": 5.226008415222168, + "learning_rate": 1.6889165440587635e-05, + "loss": 1.5134, + "step": 9825 + }, + { + "epoch": 0.7396450817666874, + "grad_norm": 5.085031509399414, + "learning_rate": 1.688003163908573e-05, + "loss": 1.5474, + "step": 9826 + }, + { + "epoch": 0.7397203560473475, + "grad_norm": 5.649744987487793, + "learning_rate": 1.687089980647093e-05, + "loss": 1.5104, + "step": 9827 + }, + { + "epoch": 0.7397956303280077, + "grad_norm": 4.8033447265625, + "learning_rate": 1.6861769943286065e-05, + "loss": 1.5578, + "step": 9828 + }, + { + "epoch": 0.7398709046086678, + "grad_norm": 4.751375198364258, + "learning_rate": 1.68526420500739e-05, + "loss": 1.6794, + "step": 9829 + }, + { + "epoch": 0.739946178889328, + "grad_norm": 6.169285297393799, + "learning_rate": 1.6843516127377044e-05, + "loss": 1.6183, + "step": 9830 + }, + { + "epoch": 0.7400214531699881, + "grad_norm": 4.804197788238525, + "learning_rate": 1.683439217573804e-05, + "loss": 1.7831, + "step": 9831 + }, + { + "epoch": 0.7400967274506483, + "grad_norm": 5.225752353668213, + "learning_rate": 1.682527019569923e-05, + "loss": 2.3008, + "step": 9832 + }, + { + "epoch": 0.7401720017313085, + "grad_norm": 6.870513916015625, + "learning_rate": 1.6816150187802925e-05, + "loss": 1.7917, + "step": 9833 + }, + { + "epoch": 0.7402472760119686, + "grad_norm": 5.141991138458252, + "learning_rate": 1.680703215259128e-05, + "loss": 1.6125, + "step": 9834 + }, + { + "epoch": 0.7403225502926287, + "grad_norm": 6.870355606079102, + "learning_rate": 1.6797916090606326e-05, + "loss": 2.2078, + "step": 9835 + }, + { + "epoch": 0.740397824573289, + "grad_norm": 5.971714973449707, + "learning_rate": 1.6788802002390008e-05, + "loss": 2.0104, + "step": 9836 + }, + { + "epoch": 0.7404730988539491, + "grad_norm": 5.547500133514404, + "learning_rate": 1.677968988848411e-05, + "loss": 1.8608, + "step": 9837 + }, + { + "epoch": 0.7405483731346092, + "grad_norm": 5.365405559539795, + "learning_rate": 1.677057974943033e-05, + "loss": 1.8265, + "step": 9838 + }, + { + "epoch": 0.7406236474152694, + "grad_norm": 4.378012657165527, + "learning_rate": 1.6761471585770232e-05, + "loss": 1.9151, + "step": 9839 + }, + { + "epoch": 0.7406989216959295, + "grad_norm": 4.3325934410095215, + "learning_rate": 1.6752365398045295e-05, + "loss": 1.9368, + "step": 9840 + }, + { + "epoch": 0.7407741959765897, + "grad_norm": 4.515381813049316, + "learning_rate": 1.674326118679681e-05, + "loss": 1.5082, + "step": 9841 + }, + { + "epoch": 0.7408494702572499, + "grad_norm": 3.7970385551452637, + "learning_rate": 1.6734158952566048e-05, + "loss": 1.6191, + "step": 9842 + }, + { + "epoch": 0.74092474453791, + "grad_norm": 4.837187767028809, + "learning_rate": 1.672505869589407e-05, + "loss": 1.9662, + "step": 9843 + }, + { + "epoch": 0.7410000188185701, + "grad_norm": 4.181177139282227, + "learning_rate": 1.671596041732187e-05, + "loss": 1.8354, + "step": 9844 + }, + { + "epoch": 0.7410752930992304, + "grad_norm": 4.596796989440918, + "learning_rate": 1.6706864117390326e-05, + "loss": 1.7701, + "step": 9845 + }, + { + "epoch": 0.7411505673798905, + "grad_norm": 4.650289535522461, + "learning_rate": 1.6697769796640196e-05, + "loss": 1.2754, + "step": 9846 + }, + { + "epoch": 0.7412258416605506, + "grad_norm": 4.834753513336182, + "learning_rate": 1.668867745561208e-05, + "loss": 1.5619, + "step": 9847 + }, + { + "epoch": 0.7413011159412108, + "grad_norm": 5.085977077484131, + "learning_rate": 1.6679587094846504e-05, + "loss": 1.9584, + "step": 9848 + }, + { + "epoch": 0.741376390221871, + "grad_norm": 4.147677898406982, + "learning_rate": 1.667049871488389e-05, + "loss": 2.0329, + "step": 9849 + }, + { + "epoch": 0.7414516645025311, + "grad_norm": 5.9029693603515625, + "learning_rate": 1.666141231626448e-05, + "loss": 1.4683, + "step": 9850 + }, + { + "epoch": 0.7415269387831912, + "grad_norm": 5.1416826248168945, + "learning_rate": 1.6652327899528458e-05, + "loss": 1.657, + "step": 9851 + }, + { + "epoch": 0.7416022130638514, + "grad_norm": 4.5896782875061035, + "learning_rate": 1.6643245465215845e-05, + "loss": 1.8417, + "step": 9852 + }, + { + "epoch": 0.7416774873445116, + "grad_norm": 5.31846284866333, + "learning_rate": 1.6634165013866592e-05, + "loss": 2.0711, + "step": 9853 + }, + { + "epoch": 0.7417527616251717, + "grad_norm": 4.897181034088135, + "learning_rate": 1.662508654602048e-05, + "loss": 1.5818, + "step": 9854 + }, + { + "epoch": 0.7418280359058319, + "grad_norm": 4.997043132781982, + "learning_rate": 1.6616010062217208e-05, + "loss": 1.6459, + "step": 9855 + }, + { + "epoch": 0.741903310186492, + "grad_norm": 5.569971561431885, + "learning_rate": 1.6606935562996346e-05, + "loss": 2.0158, + "step": 9856 + }, + { + "epoch": 0.7419785844671521, + "grad_norm": 4.525067329406738, + "learning_rate": 1.6597863048897367e-05, + "loss": 1.8028, + "step": 9857 + }, + { + "epoch": 0.7420538587478124, + "grad_norm": 4.395412445068359, + "learning_rate": 1.658879252045957e-05, + "loss": 1.5657, + "step": 9858 + }, + { + "epoch": 0.7421291330284725, + "grad_norm": 4.703008651733398, + "learning_rate": 1.6579723978222213e-05, + "loss": 1.9342, + "step": 9859 + }, + { + "epoch": 0.7422044073091326, + "grad_norm": 7.443033218383789, + "learning_rate": 1.6570657422724345e-05, + "loss": 1.6266, + "step": 9860 + }, + { + "epoch": 0.7422796815897929, + "grad_norm": 5.142846584320068, + "learning_rate": 1.6561592854504996e-05, + "loss": 1.576, + "step": 9861 + }, + { + "epoch": 0.742354955870453, + "grad_norm": 4.719917297363281, + "learning_rate": 1.655253027410299e-05, + "loss": 2.0376, + "step": 9862 + }, + { + "epoch": 0.7424302301511131, + "grad_norm": 4.248937129974365, + "learning_rate": 1.6543469682057106e-05, + "loss": 1.8473, + "step": 9863 + }, + { + "epoch": 0.7425055044317733, + "grad_norm": 7.620512008666992, + "learning_rate": 1.6534411078905932e-05, + "loss": 1.8284, + "step": 9864 + }, + { + "epoch": 0.7425807787124334, + "grad_norm": 5.996993064880371, + "learning_rate": 1.6525354465187998e-05, + "loss": 1.863, + "step": 9865 + }, + { + "epoch": 0.7426560529930936, + "grad_norm": 4.5782880783081055, + "learning_rate": 1.6516299841441707e-05, + "loss": 1.6022, + "step": 9866 + }, + { + "epoch": 0.7427313272737538, + "grad_norm": 4.695895195007324, + "learning_rate": 1.6507247208205295e-05, + "loss": 1.6695, + "step": 9867 + }, + { + "epoch": 0.7428066015544139, + "grad_norm": 4.661893367767334, + "learning_rate": 1.649819656601694e-05, + "loss": 1.7295, + "step": 9868 + }, + { + "epoch": 0.742881875835074, + "grad_norm": 5.824334621429443, + "learning_rate": 1.6489147915414672e-05, + "loss": 1.5382, + "step": 9869 + }, + { + "epoch": 0.7429571501157342, + "grad_norm": 3.8496460914611816, + "learning_rate": 1.648010125693642e-05, + "loss": 1.5605, + "step": 9870 + }, + { + "epoch": 0.7430324243963944, + "grad_norm": 5.330741882324219, + "learning_rate": 1.6471056591119955e-05, + "loss": 1.6525, + "step": 9871 + }, + { + "epoch": 0.7431076986770545, + "grad_norm": 5.867516040802002, + "learning_rate": 1.6462013918502983e-05, + "loss": 2.7417, + "step": 9872 + }, + { + "epoch": 0.7431829729577146, + "grad_norm": 6.273266792297363, + "learning_rate": 1.645297323962304e-05, + "loss": 1.9553, + "step": 9873 + }, + { + "epoch": 0.7432582472383749, + "grad_norm": 5.626605033874512, + "learning_rate": 1.6443934555017593e-05, + "loss": 1.8489, + "step": 9874 + }, + { + "epoch": 0.743333521519035, + "grad_norm": 4.875461578369141, + "learning_rate": 1.643489786522394e-05, + "loss": 1.8658, + "step": 9875 + }, + { + "epoch": 0.7434087957996951, + "grad_norm": 4.712279796600342, + "learning_rate": 1.6425863170779315e-05, + "loss": 1.758, + "step": 9876 + }, + { + "epoch": 0.7434840700803553, + "grad_norm": 5.885703086853027, + "learning_rate": 1.641683047222078e-05, + "loss": 1.7856, + "step": 9877 + }, + { + "epoch": 0.7435593443610155, + "grad_norm": 5.22415828704834, + "learning_rate": 1.6407799770085303e-05, + "loss": 1.8832, + "step": 9878 + }, + { + "epoch": 0.7436346186416756, + "grad_norm": 5.288778305053711, + "learning_rate": 1.6398771064909744e-05, + "loss": 1.7586, + "step": 9879 + }, + { + "epoch": 0.7437098929223358, + "grad_norm": 5.386417388916016, + "learning_rate": 1.6389744357230857e-05, + "loss": 1.8506, + "step": 9880 + }, + { + "epoch": 0.7437851672029959, + "grad_norm": 6.601144313812256, + "learning_rate": 1.6380719647585202e-05, + "loss": 1.8024, + "step": 9881 + }, + { + "epoch": 0.743860441483656, + "grad_norm": 5.505392074584961, + "learning_rate": 1.6371696936509307e-05, + "loss": 1.9418, + "step": 9882 + }, + { + "epoch": 0.7439357157643163, + "grad_norm": 3.752809524536133, + "learning_rate": 1.6362676224539553e-05, + "loss": 1.5012, + "step": 9883 + }, + { + "epoch": 0.7440109900449764, + "grad_norm": 5.4744367599487305, + "learning_rate": 1.6353657512212168e-05, + "loss": 1.7722, + "step": 9884 + }, + { + "epoch": 0.7440862643256365, + "grad_norm": 7.035309314727783, + "learning_rate": 1.634464080006331e-05, + "loss": 1.9359, + "step": 9885 + }, + { + "epoch": 0.7441615386062967, + "grad_norm": 4.583143711090088, + "learning_rate": 1.6335626088628982e-05, + "loss": 1.3352, + "step": 9886 + }, + { + "epoch": 0.7442368128869569, + "grad_norm": 5.726820468902588, + "learning_rate": 1.6326613378445094e-05, + "loss": 1.8148, + "step": 9887 + }, + { + "epoch": 0.744312087167617, + "grad_norm": 5.830718517303467, + "learning_rate": 1.6317602670047412e-05, + "loss": 1.4756, + "step": 9888 + }, + { + "epoch": 0.7443873614482771, + "grad_norm": 6.600234031677246, + "learning_rate": 1.63085939639716e-05, + "loss": 1.842, + "step": 9889 + }, + { + "epoch": 0.7444626357289373, + "grad_norm": 4.266689777374268, + "learning_rate": 1.6299587260753214e-05, + "loss": 1.489, + "step": 9890 + }, + { + "epoch": 0.7445379100095975, + "grad_norm": 4.250828742980957, + "learning_rate": 1.629058256092768e-05, + "loss": 1.693, + "step": 9891 + }, + { + "epoch": 0.7446131842902576, + "grad_norm": 5.014566421508789, + "learning_rate": 1.6281579865030272e-05, + "loss": 1.6572, + "step": 9892 + }, + { + "epoch": 0.7446884585709178, + "grad_norm": 7.314173221588135, + "learning_rate": 1.6272579173596207e-05, + "loss": 1.8965, + "step": 9893 + }, + { + "epoch": 0.7447637328515779, + "grad_norm": 7.061277866363525, + "learning_rate": 1.6263580487160523e-05, + "loss": 1.837, + "step": 9894 + }, + { + "epoch": 0.744839007132238, + "grad_norm": 5.2542524337768555, + "learning_rate": 1.6254583806258196e-05, + "loss": 2.2552, + "step": 9895 + }, + { + "epoch": 0.7449142814128983, + "grad_norm": 4.493199825286865, + "learning_rate": 1.6245589131424015e-05, + "loss": 1.556, + "step": 9896 + }, + { + "epoch": 0.7449895556935584, + "grad_norm": 4.95303201675415, + "learning_rate": 1.6236596463192712e-05, + "loss": 1.8854, + "step": 9897 + }, + { + "epoch": 0.7450648299742185, + "grad_norm": 5.3555707931518555, + "learning_rate": 1.6227605802098883e-05, + "loss": 1.9172, + "step": 9898 + }, + { + "epoch": 0.7451401042548788, + "grad_norm": 6.785427093505859, + "learning_rate": 1.6218617148676966e-05, + "loss": 1.8183, + "step": 9899 + }, + { + "epoch": 0.7452153785355389, + "grad_norm": 4.2658185958862305, + "learning_rate": 1.6209630503461337e-05, + "loss": 1.4228, + "step": 9900 + }, + { + "epoch": 0.745290652816199, + "grad_norm": 5.293824195861816, + "learning_rate": 1.620064586698622e-05, + "loss": 2.2091, + "step": 9901 + }, + { + "epoch": 0.7453659270968592, + "grad_norm": 7.266054630279541, + "learning_rate": 1.619166323978574e-05, + "loss": 1.9825, + "step": 9902 + }, + { + "epoch": 0.7454412013775193, + "grad_norm": 6.64616060256958, + "learning_rate": 1.618268262239385e-05, + "loss": 2.3791, + "step": 9903 + }, + { + "epoch": 0.7455164756581795, + "grad_norm": 4.592991352081299, + "learning_rate": 1.6173704015344464e-05, + "loss": 1.6131, + "step": 9904 + }, + { + "epoch": 0.7455917499388397, + "grad_norm": 6.452118396759033, + "learning_rate": 1.6164727419171305e-05, + "loss": 2.1458, + "step": 9905 + }, + { + "epoch": 0.7456670242194998, + "grad_norm": 5.426454067230225, + "learning_rate": 1.6155752834408027e-05, + "loss": 1.8225, + "step": 9906 + }, + { + "epoch": 0.7457422985001599, + "grad_norm": 5.5808329582214355, + "learning_rate": 1.6146780261588123e-05, + "loss": 1.7889, + "step": 9907 + }, + { + "epoch": 0.7458175727808201, + "grad_norm": 6.885249614715576, + "learning_rate": 1.613780970124501e-05, + "loss": 1.5035, + "step": 9908 + }, + { + "epoch": 0.7458928470614803, + "grad_norm": 5.570834159851074, + "learning_rate": 1.6128841153911934e-05, + "loss": 1.8849, + "step": 9909 + }, + { + "epoch": 0.7459681213421404, + "grad_norm": 5.213533401489258, + "learning_rate": 1.611987462012206e-05, + "loss": 1.7008, + "step": 9910 + }, + { + "epoch": 0.7460433956228005, + "grad_norm": 4.122905254364014, + "learning_rate": 1.6110910100408427e-05, + "loss": 1.8105, + "step": 9911 + }, + { + "epoch": 0.7461186699034608, + "grad_norm": 5.627893924713135, + "learning_rate": 1.610194759530397e-05, + "loss": 1.528, + "step": 9912 + }, + { + "epoch": 0.7461939441841209, + "grad_norm": 4.984193801879883, + "learning_rate": 1.6092987105341443e-05, + "loss": 1.5068, + "step": 9913 + }, + { + "epoch": 0.746269218464781, + "grad_norm": 5.867680549621582, + "learning_rate": 1.608402863105355e-05, + "loss": 1.6347, + "step": 9914 + }, + { + "epoch": 0.7463444927454412, + "grad_norm": 5.676518440246582, + "learning_rate": 1.607507217297285e-05, + "loss": 1.7946, + "step": 9915 + }, + { + "epoch": 0.7464197670261014, + "grad_norm": 5.440070629119873, + "learning_rate": 1.6066117731631754e-05, + "loss": 2.0962, + "step": 9916 + }, + { + "epoch": 0.7464950413067615, + "grad_norm": 3.949209213256836, + "learning_rate": 1.60571653075626e-05, + "loss": 1.7183, + "step": 9917 + }, + { + "epoch": 0.7465703155874217, + "grad_norm": 5.474831581115723, + "learning_rate": 1.6048214901297566e-05, + "loss": 1.9147, + "step": 9918 + }, + { + "epoch": 0.7466455898680818, + "grad_norm": 4.86569356918335, + "learning_rate": 1.6039266513368757e-05, + "loss": 2.0999, + "step": 9919 + }, + { + "epoch": 0.746720864148742, + "grad_norm": 5.1166887283325195, + "learning_rate": 1.6030320144308088e-05, + "loss": 1.6953, + "step": 9920 + }, + { + "epoch": 0.7467961384294022, + "grad_norm": 5.321571350097656, + "learning_rate": 1.602137579464743e-05, + "loss": 1.8286, + "step": 9921 + }, + { + "epoch": 0.7468714127100623, + "grad_norm": 4.872490882873535, + "learning_rate": 1.6012433464918475e-05, + "loss": 2.0017, + "step": 9922 + }, + { + "epoch": 0.7469466869907224, + "grad_norm": 4.994636535644531, + "learning_rate": 1.6003493155652825e-05, + "loss": 1.9069, + "step": 9923 + }, + { + "epoch": 0.7470219612713827, + "grad_norm": 5.14051628112793, + "learning_rate": 1.5994554867381967e-05, + "loss": 1.8245, + "step": 9924 + }, + { + "epoch": 0.7470972355520428, + "grad_norm": 5.342349052429199, + "learning_rate": 1.5985618600637257e-05, + "loss": 1.5742, + "step": 9925 + }, + { + "epoch": 0.7471725098327029, + "grad_norm": 6.364773750305176, + "learning_rate": 1.597668435594991e-05, + "loss": 1.8386, + "step": 9926 + }, + { + "epoch": 0.747247784113363, + "grad_norm": 4.553479194641113, + "learning_rate": 1.5967752133851076e-05, + "loss": 1.7948, + "step": 9927 + }, + { + "epoch": 0.7473230583940232, + "grad_norm": 6.005308628082275, + "learning_rate": 1.595882193487171e-05, + "loss": 1.7371, + "step": 9928 + }, + { + "epoch": 0.7473983326746834, + "grad_norm": 6.273256778717041, + "learning_rate": 1.59498937595427e-05, + "loss": 1.8991, + "step": 9929 + }, + { + "epoch": 0.7474736069553435, + "grad_norm": 4.796781063079834, + "learning_rate": 1.5940967608394823e-05, + "loss": 1.6124, + "step": 9930 + }, + { + "epoch": 0.7475488812360037, + "grad_norm": 4.505614280700684, + "learning_rate": 1.593204348195868e-05, + "loss": 1.8508, + "step": 9931 + }, + { + "epoch": 0.7476241555166638, + "grad_norm": 5.095390796661377, + "learning_rate": 1.5923121380764818e-05, + "loss": 1.7337, + "step": 9932 + }, + { + "epoch": 0.747699429797324, + "grad_norm": 5.091737270355225, + "learning_rate": 1.591420130534359e-05, + "loss": 2.1031, + "step": 9933 + }, + { + "epoch": 0.7477747040779842, + "grad_norm": 7.829418182373047, + "learning_rate": 1.59052832562253e-05, + "loss": 1.9929, + "step": 9934 + }, + { + "epoch": 0.7478499783586443, + "grad_norm": 4.928241729736328, + "learning_rate": 1.5896367233940084e-05, + "loss": 1.7883, + "step": 9935 + }, + { + "epoch": 0.7479252526393044, + "grad_norm": 5.002897262573242, + "learning_rate": 1.5887453239018e-05, + "loss": 1.9792, + "step": 9936 + }, + { + "epoch": 0.7480005269199647, + "grad_norm": 4.8554768562316895, + "learning_rate": 1.587854127198893e-05, + "loss": 1.7349, + "step": 9937 + }, + { + "epoch": 0.7480758012006248, + "grad_norm": 7.002180099487305, + "learning_rate": 1.5869631333382684e-05, + "loss": 1.9243, + "step": 9938 + }, + { + "epoch": 0.7481510754812849, + "grad_norm": 5.728883743286133, + "learning_rate": 1.586072342372891e-05, + "loss": 1.7415, + "step": 9939 + }, + { + "epoch": 0.7482263497619451, + "grad_norm": 5.124384880065918, + "learning_rate": 1.5851817543557196e-05, + "loss": 2.088, + "step": 9940 + }, + { + "epoch": 0.7483016240426053, + "grad_norm": 5.529357433319092, + "learning_rate": 1.5842913693396928e-05, + "loss": 1.3941, + "step": 9941 + }, + { + "epoch": 0.7483768983232654, + "grad_norm": 3.9997756481170654, + "learning_rate": 1.5834011873777445e-05, + "loss": 1.8718, + "step": 9942 + }, + { + "epoch": 0.7484521726039256, + "grad_norm": 5.735321044921875, + "learning_rate": 1.5825112085227904e-05, + "loss": 1.8631, + "step": 9943 + }, + { + "epoch": 0.7485274468845857, + "grad_norm": 5.311409950256348, + "learning_rate": 1.5816214328277395e-05, + "loss": 1.8433, + "step": 9944 + }, + { + "epoch": 0.7486027211652458, + "grad_norm": 5.071771144866943, + "learning_rate": 1.5807318603454864e-05, + "loss": 1.7934, + "step": 9945 + }, + { + "epoch": 0.7486779954459061, + "grad_norm": 5.063529968261719, + "learning_rate": 1.5798424911289128e-05, + "loss": 1.6195, + "step": 9946 + }, + { + "epoch": 0.7487532697265662, + "grad_norm": 4.125574588775635, + "learning_rate": 1.5789533252308908e-05, + "loss": 1.4737, + "step": 9947 + }, + { + "epoch": 0.7488285440072263, + "grad_norm": 5.837564945220947, + "learning_rate": 1.5780643627042763e-05, + "loss": 1.7738, + "step": 9948 + }, + { + "epoch": 0.7489038182878864, + "grad_norm": 5.077790260314941, + "learning_rate": 1.5771756036019185e-05, + "loss": 1.8853, + "step": 9949 + }, + { + "epoch": 0.7489790925685467, + "grad_norm": 6.3469462394714355, + "learning_rate": 1.5762870479766484e-05, + "loss": 1.8312, + "step": 9950 + }, + { + "epoch": 0.7490543668492068, + "grad_norm": 4.351106643676758, + "learning_rate": 1.575398695881291e-05, + "loss": 1.6348, + "step": 9951 + }, + { + "epoch": 0.7491296411298669, + "grad_norm": 6.104426383972168, + "learning_rate": 1.5745105473686533e-05, + "loss": 1.7368, + "step": 9952 + }, + { + "epoch": 0.7492049154105271, + "grad_norm": 4.603887557983398, + "learning_rate": 1.5736226024915364e-05, + "loss": 1.5682, + "step": 9953 + }, + { + "epoch": 0.7492801896911873, + "grad_norm": 4.979403972625732, + "learning_rate": 1.572734861302722e-05, + "loss": 1.7404, + "step": 9954 + }, + { + "epoch": 0.7493554639718474, + "grad_norm": 5.704952716827393, + "learning_rate": 1.571847323854987e-05, + "loss": 2.1988, + "step": 9955 + }, + { + "epoch": 0.7494307382525076, + "grad_norm": 6.086668491363525, + "learning_rate": 1.570959990201092e-05, + "loss": 1.8179, + "step": 9956 + }, + { + "epoch": 0.7495060125331677, + "grad_norm": 5.470081806182861, + "learning_rate": 1.570072860393788e-05, + "loss": 1.6791, + "step": 9957 + }, + { + "epoch": 0.7495812868138279, + "grad_norm": 4.402439117431641, + "learning_rate": 1.5691859344858096e-05, + "loss": 1.5306, + "step": 9958 + }, + { + "epoch": 0.7496565610944881, + "grad_norm": 5.325098514556885, + "learning_rate": 1.5682992125298847e-05, + "loss": 2.0148, + "step": 9959 + }, + { + "epoch": 0.7497318353751482, + "grad_norm": 4.851621627807617, + "learning_rate": 1.5674126945787232e-05, + "loss": 2.1176, + "step": 9960 + }, + { + "epoch": 0.7498071096558083, + "grad_norm": 5.519502639770508, + "learning_rate": 1.5665263806850276e-05, + "loss": 1.7965, + "step": 9961 + }, + { + "epoch": 0.7498823839364686, + "grad_norm": 5.295783996582031, + "learning_rate": 1.565640270901489e-05, + "loss": 1.9005, + "step": 9962 + }, + { + "epoch": 0.7499576582171287, + "grad_norm": 7.355525493621826, + "learning_rate": 1.5647543652807804e-05, + "loss": 1.702, + "step": 9963 + }, + { + "epoch": 0.7500329324977888, + "grad_norm": 5.001543998718262, + "learning_rate": 1.5638686638755696e-05, + "loss": 1.6658, + "step": 9964 + }, + { + "epoch": 0.750108206778449, + "grad_norm": 5.500482559204102, + "learning_rate": 1.5629831667385054e-05, + "loss": 1.7302, + "step": 9965 + }, + { + "epoch": 0.7501834810591091, + "grad_norm": 5.5395283699035645, + "learning_rate": 1.562097873922232e-05, + "loss": 1.834, + "step": 9966 + }, + { + "epoch": 0.7502587553397693, + "grad_norm": 5.235549449920654, + "learning_rate": 1.5612127854793734e-05, + "loss": 1.8471, + "step": 9967 + }, + { + "epoch": 0.7503340296204294, + "grad_norm": 4.581087112426758, + "learning_rate": 1.5603279014625484e-05, + "loss": 1.6731, + "step": 9968 + }, + { + "epoch": 0.7504093039010896, + "grad_norm": 4.876650333404541, + "learning_rate": 1.5594432219243598e-05, + "loss": 1.7112, + "step": 9969 + }, + { + "epoch": 0.7504845781817497, + "grad_norm": 5.286788463592529, + "learning_rate": 1.5585587469174012e-05, + "loss": 1.4978, + "step": 9970 + }, + { + "epoch": 0.7505598524624099, + "grad_norm": 6.490696430206299, + "learning_rate": 1.557674476494249e-05, + "loss": 1.6403, + "step": 9971 + }, + { + "epoch": 0.7506351267430701, + "grad_norm": 6.099976062774658, + "learning_rate": 1.556790410707473e-05, + "loss": 1.6556, + "step": 9972 + }, + { + "epoch": 0.7507104010237302, + "grad_norm": 4.853091239929199, + "learning_rate": 1.555906549609627e-05, + "loss": 1.7186, + "step": 9973 + }, + { + "epoch": 0.7507856753043903, + "grad_norm": 5.023470878601074, + "learning_rate": 1.555022893253255e-05, + "loss": 1.5079, + "step": 9974 + }, + { + "epoch": 0.7508609495850506, + "grad_norm": 4.34914493560791, + "learning_rate": 1.554139441690886e-05, + "loss": 1.4418, + "step": 9975 + }, + { + "epoch": 0.7509362238657107, + "grad_norm": 4.561581611633301, + "learning_rate": 1.5532561949750418e-05, + "loss": 1.7428, + "step": 9976 + }, + { + "epoch": 0.7510114981463708, + "grad_norm": 4.435133934020996, + "learning_rate": 1.552373153158225e-05, + "loss": 1.9861, + "step": 9977 + }, + { + "epoch": 0.751086772427031, + "grad_norm": 6.253811836242676, + "learning_rate": 1.5514903162929323e-05, + "loss": 2.3957, + "step": 9978 + }, + { + "epoch": 0.7511620467076912, + "grad_norm": 6.178959846496582, + "learning_rate": 1.5506076844316446e-05, + "loss": 1.5617, + "step": 9979 + }, + { + "epoch": 0.7512373209883513, + "grad_norm": 5.523349285125732, + "learning_rate": 1.5497252576268336e-05, + "loss": 1.9068, + "step": 9980 + }, + { + "epoch": 0.7513125952690115, + "grad_norm": 5.803079605102539, + "learning_rate": 1.5488430359309568e-05, + "loss": 1.7659, + "step": 9981 + }, + { + "epoch": 0.7513878695496716, + "grad_norm": 4.786619663238525, + "learning_rate": 1.5479610193964582e-05, + "loss": 1.3783, + "step": 9982 + }, + { + "epoch": 0.7514631438303317, + "grad_norm": 3.7122581005096436, + "learning_rate": 1.5470792080757733e-05, + "loss": 1.4453, + "step": 9983 + }, + { + "epoch": 0.751538418110992, + "grad_norm": 6.226945400238037, + "learning_rate": 1.5461976020213204e-05, + "loss": 1.8423, + "step": 9984 + }, + { + "epoch": 0.7516136923916521, + "grad_norm": 6.400366306304932, + "learning_rate": 1.5453162012855117e-05, + "loss": 1.6239, + "step": 9985 + }, + { + "epoch": 0.7516889666723122, + "grad_norm": 3.9581212997436523, + "learning_rate": 1.54443500592074e-05, + "loss": 1.6621, + "step": 9986 + }, + { + "epoch": 0.7517642409529723, + "grad_norm": 5.008101940155029, + "learning_rate": 1.5435540159793942e-05, + "loss": 1.6239, + "step": 9987 + }, + { + "epoch": 0.7518395152336326, + "grad_norm": 6.00190544128418, + "learning_rate": 1.5426732315138425e-05, + "loss": 1.7272, + "step": 9988 + }, + { + "epoch": 0.7519147895142927, + "grad_norm": 6.756215572357178, + "learning_rate": 1.5417926525764475e-05, + "loss": 2.1489, + "step": 9989 + }, + { + "epoch": 0.7519900637949528, + "grad_norm": 6.139777660369873, + "learning_rate": 1.540912279219556e-05, + "loss": 1.5925, + "step": 9990 + }, + { + "epoch": 0.752065338075613, + "grad_norm": 5.313515663146973, + "learning_rate": 1.540032111495507e-05, + "loss": 1.7574, + "step": 9991 + }, + { + "epoch": 0.7521406123562732, + "grad_norm": 5.057952880859375, + "learning_rate": 1.5391521494566184e-05, + "loss": 1.8048, + "step": 9992 + }, + { + "epoch": 0.7522158866369333, + "grad_norm": 5.165274143218994, + "learning_rate": 1.5382723931552067e-05, + "loss": 1.6223, + "step": 9993 + }, + { + "epoch": 0.7522911609175935, + "grad_norm": 6.147108554840088, + "learning_rate": 1.5373928426435663e-05, + "loss": 1.6667, + "step": 9994 + }, + { + "epoch": 0.7523664351982536, + "grad_norm": 4.717916488647461, + "learning_rate": 1.5365134979739858e-05, + "loss": 1.9927, + "step": 9995 + }, + { + "epoch": 0.7524417094789138, + "grad_norm": 4.8238935470581055, + "learning_rate": 1.5356343591987422e-05, + "loss": 1.7482, + "step": 9996 + }, + { + "epoch": 0.752516983759574, + "grad_norm": 4.508865833282471, + "learning_rate": 1.5347554263700936e-05, + "loss": 1.8188, + "step": 9997 + }, + { + "epoch": 0.7525922580402341, + "grad_norm": 6.686875820159912, + "learning_rate": 1.533876699540294e-05, + "loss": 1.82, + "step": 9998 + }, + { + "epoch": 0.7526675323208942, + "grad_norm": 6.506516933441162, + "learning_rate": 1.532998178761577e-05, + "loss": 1.9982, + "step": 9999 + }, + { + "epoch": 0.7527428066015545, + "grad_norm": 4.9058942794799805, + "learning_rate": 1.5321198640861712e-05, + "loss": 1.7071, + "step": 10000 + }, + { + "epoch": 0.7528180808822146, + "grad_norm": 4.993441104888916, + "learning_rate": 1.5312417555662882e-05, + "loss": 1.5984, + "step": 10001 + }, + { + "epoch": 0.7528933551628747, + "grad_norm": 4.646112442016602, + "learning_rate": 1.530363853254132e-05, + "loss": 1.813, + "step": 10002 + }, + { + "epoch": 0.7529686294435349, + "grad_norm": 7.877852439880371, + "learning_rate": 1.529486157201887e-05, + "loss": 2.0974, + "step": 10003 + }, + { + "epoch": 0.753043903724195, + "grad_norm": 5.016733646392822, + "learning_rate": 1.528608667461734e-05, + "loss": 2.0004, + "step": 10004 + }, + { + "epoch": 0.7531191780048552, + "grad_norm": 5.312817573547363, + "learning_rate": 1.5277313840858337e-05, + "loss": 1.6183, + "step": 10005 + }, + { + "epoch": 0.7531944522855153, + "grad_norm": 6.48883581161499, + "learning_rate": 1.5268543071263408e-05, + "loss": 1.9897, + "step": 10006 + }, + { + "epoch": 0.7532697265661755, + "grad_norm": 6.557301044464111, + "learning_rate": 1.5259774366353925e-05, + "loss": 1.5934, + "step": 10007 + }, + { + "epoch": 0.7533450008468356, + "grad_norm": 4.494949817657471, + "learning_rate": 1.5251007726651184e-05, + "loss": 1.6282, + "step": 10008 + }, + { + "epoch": 0.7534202751274958, + "grad_norm": 4.333489418029785, + "learning_rate": 1.5242243152676317e-05, + "loss": 1.5445, + "step": 10009 + }, + { + "epoch": 0.753495549408156, + "grad_norm": 4.993746280670166, + "learning_rate": 1.5233480644950365e-05, + "loss": 1.5356, + "step": 10010 + }, + { + "epoch": 0.7535708236888161, + "grad_norm": 5.311154365539551, + "learning_rate": 1.5224720203994231e-05, + "loss": 1.8532, + "step": 10011 + }, + { + "epoch": 0.7536460979694762, + "grad_norm": 5.699016571044922, + "learning_rate": 1.5215961830328695e-05, + "loss": 1.9042, + "step": 10012 + }, + { + "epoch": 0.7537213722501365, + "grad_norm": 7.998257637023926, + "learning_rate": 1.5207205524474443e-05, + "loss": 2.1436, + "step": 10013 + }, + { + "epoch": 0.7537966465307966, + "grad_norm": 4.56911039352417, + "learning_rate": 1.5198451286951975e-05, + "loss": 1.7483, + "step": 10014 + }, + { + "epoch": 0.7538719208114567, + "grad_norm": 4.595117568969727, + "learning_rate": 1.5189699118281736e-05, + "loss": 1.4343, + "step": 10015 + }, + { + "epoch": 0.7539471950921169, + "grad_norm": 7.130337238311768, + "learning_rate": 1.5180949018983987e-05, + "loss": 1.7832, + "step": 10016 + }, + { + "epoch": 0.7540224693727771, + "grad_norm": 4.053917407989502, + "learning_rate": 1.5172200989578921e-05, + "loss": 1.6529, + "step": 10017 + }, + { + "epoch": 0.7540977436534372, + "grad_norm": 4.876270771026611, + "learning_rate": 1.5163455030586565e-05, + "loss": 1.5952, + "step": 10018 + }, + { + "epoch": 0.7541730179340974, + "grad_norm": 5.201292991638184, + "learning_rate": 1.5154711142526867e-05, + "loss": 1.3094, + "step": 10019 + }, + { + "epoch": 0.7542482922147575, + "grad_norm": 6.455628395080566, + "learning_rate": 1.514596932591959e-05, + "loss": 1.7467, + "step": 10020 + }, + { + "epoch": 0.7543235664954177, + "grad_norm": 6.481708526611328, + "learning_rate": 1.5137229581284445e-05, + "loss": 1.4359, + "step": 10021 + }, + { + "epoch": 0.7543988407760779, + "grad_norm": 4.381015300750732, + "learning_rate": 1.5128491909140952e-05, + "loss": 1.9735, + "step": 10022 + }, + { + "epoch": 0.754474115056738, + "grad_norm": 6.0889787673950195, + "learning_rate": 1.511975631000856e-05, + "loss": 1.7595, + "step": 10023 + }, + { + "epoch": 0.7545493893373981, + "grad_norm": 4.869718074798584, + "learning_rate": 1.5111022784406575e-05, + "loss": 1.6829, + "step": 10024 + }, + { + "epoch": 0.7546246636180582, + "grad_norm": 5.428213596343994, + "learning_rate": 1.510229133285419e-05, + "loss": 1.5676, + "step": 10025 + }, + { + "epoch": 0.7546999378987185, + "grad_norm": 5.847835540771484, + "learning_rate": 1.5093561955870439e-05, + "loss": 1.6258, + "step": 10026 + }, + { + "epoch": 0.7547752121793786, + "grad_norm": 5.718438625335693, + "learning_rate": 1.5084834653974267e-05, + "loss": 1.7579, + "step": 10027 + }, + { + "epoch": 0.7548504864600387, + "grad_norm": 4.168840408325195, + "learning_rate": 1.5076109427684515e-05, + "loss": 1.8391, + "step": 10028 + }, + { + "epoch": 0.754925760740699, + "grad_norm": 8.060935020446777, + "learning_rate": 1.506738627751983e-05, + "loss": 1.9102, + "step": 10029 + }, + { + "epoch": 0.7550010350213591, + "grad_norm": 5.792908191680908, + "learning_rate": 1.5058665203998812e-05, + "loss": 1.7805, + "step": 10030 + }, + { + "epoch": 0.7550763093020192, + "grad_norm": 7.352843284606934, + "learning_rate": 1.5049946207639875e-05, + "loss": 1.899, + "step": 10031 + }, + { + "epoch": 0.7551515835826794, + "grad_norm": 4.981710910797119, + "learning_rate": 1.5041229288961367e-05, + "loss": 1.988, + "step": 10032 + }, + { + "epoch": 0.7552268578633395, + "grad_norm": 10.141742706298828, + "learning_rate": 1.503251444848146e-05, + "loss": 1.8241, + "step": 10033 + }, + { + "epoch": 0.7553021321439997, + "grad_norm": 5.5042595863342285, + "learning_rate": 1.5023801686718231e-05, + "loss": 1.7092, + "step": 10034 + }, + { + "epoch": 0.7553774064246599, + "grad_norm": 4.59608268737793, + "learning_rate": 1.5015091004189635e-05, + "loss": 1.5714, + "step": 10035 + }, + { + "epoch": 0.75545268070532, + "grad_norm": 5.842303276062012, + "learning_rate": 1.500638240141351e-05, + "loss": 1.8281, + "step": 10036 + }, + { + "epoch": 0.7555279549859801, + "grad_norm": 5.421987533569336, + "learning_rate": 1.499767587890753e-05, + "loss": 1.9733, + "step": 10037 + }, + { + "epoch": 0.7556032292666404, + "grad_norm": 4.80499267578125, + "learning_rate": 1.4988971437189304e-05, + "loss": 1.8722, + "step": 10038 + }, + { + "epoch": 0.7556785035473005, + "grad_norm": 4.836743354797363, + "learning_rate": 1.4980269076776249e-05, + "loss": 1.7338, + "step": 10039 + }, + { + "epoch": 0.7557537778279606, + "grad_norm": 6.06622314453125, + "learning_rate": 1.4971568798185731e-05, + "loss": 1.6825, + "step": 10040 + }, + { + "epoch": 0.7558290521086208, + "grad_norm": 9.080888748168945, + "learning_rate": 1.4962870601934924e-05, + "loss": 2.5524, + "step": 10041 + }, + { + "epoch": 0.755904326389281, + "grad_norm": 5.807736873626709, + "learning_rate": 1.4954174488540923e-05, + "loss": 1.6447, + "step": 10042 + }, + { + "epoch": 0.7559796006699411, + "grad_norm": 6.850009918212891, + "learning_rate": 1.4945480458520711e-05, + "loss": 1.7889, + "step": 10043 + }, + { + "epoch": 0.7560548749506013, + "grad_norm": 8.626649856567383, + "learning_rate": 1.4936788512391087e-05, + "loss": 1.6266, + "step": 10044 + }, + { + "epoch": 0.7561301492312614, + "grad_norm": 5.807698726654053, + "learning_rate": 1.4928098650668782e-05, + "loss": 1.5468, + "step": 10045 + }, + { + "epoch": 0.7562054235119215, + "grad_norm": 3.84612774848938, + "learning_rate": 1.4919410873870377e-05, + "loss": 1.7902, + "step": 10046 + }, + { + "epoch": 0.7562806977925817, + "grad_norm": 6.826109886169434, + "learning_rate": 1.4910725182512354e-05, + "loss": 1.963, + "step": 10047 + }, + { + "epoch": 0.7563559720732419, + "grad_norm": 4.686533451080322, + "learning_rate": 1.4902041577111014e-05, + "loss": 1.6806, + "step": 10048 + }, + { + "epoch": 0.756431246353902, + "grad_norm": 4.075978755950928, + "learning_rate": 1.4893360058182621e-05, + "loss": 1.6779, + "step": 10049 + }, + { + "epoch": 0.7565065206345621, + "grad_norm": 4.039658069610596, + "learning_rate": 1.4884680626243219e-05, + "loss": 1.6225, + "step": 10050 + }, + { + "epoch": 0.7565817949152224, + "grad_norm": 4.5650458335876465, + "learning_rate": 1.487600328180881e-05, + "loss": 1.4947, + "step": 10051 + }, + { + "epoch": 0.7566570691958825, + "grad_norm": 6.31627893447876, + "learning_rate": 1.486732802539521e-05, + "loss": 1.7032, + "step": 10052 + }, + { + "epoch": 0.7567323434765426, + "grad_norm": 4.108381271362305, + "learning_rate": 1.4858654857518173e-05, + "loss": 1.6666, + "step": 10053 + }, + { + "epoch": 0.7568076177572028, + "grad_norm": 5.454526901245117, + "learning_rate": 1.484998377869325e-05, + "loss": 1.4915, + "step": 10054 + }, + { + "epoch": 0.756882892037863, + "grad_norm": 4.754133701324463, + "learning_rate": 1.484131478943594e-05, + "loss": 1.8371, + "step": 10055 + }, + { + "epoch": 0.7569581663185231, + "grad_norm": 5.136195659637451, + "learning_rate": 1.483264789026158e-05, + "loss": 1.6624, + "step": 10056 + }, + { + "epoch": 0.7570334405991833, + "grad_norm": 5.0071330070495605, + "learning_rate": 1.4823983081685417e-05, + "loss": 1.7636, + "step": 10057 + }, + { + "epoch": 0.7571087148798434, + "grad_norm": 5.209502696990967, + "learning_rate": 1.4815320364222507e-05, + "loss": 1.9075, + "step": 10058 + }, + { + "epoch": 0.7571839891605036, + "grad_norm": 7.703102111816406, + "learning_rate": 1.4806659738387846e-05, + "loss": 2.0394, + "step": 10059 + }, + { + "epoch": 0.7572592634411638, + "grad_norm": 6.7194342613220215, + "learning_rate": 1.4798001204696305e-05, + "loss": 1.6936, + "step": 10060 + }, + { + "epoch": 0.7573345377218239, + "grad_norm": 5.501828193664551, + "learning_rate": 1.4789344763662565e-05, + "loss": 1.4453, + "step": 10061 + }, + { + "epoch": 0.757409812002484, + "grad_norm": 5.3522162437438965, + "learning_rate": 1.4780690415801262e-05, + "loss": 1.6884, + "step": 10062 + }, + { + "epoch": 0.7574850862831443, + "grad_norm": 5.664185047149658, + "learning_rate": 1.4772038161626845e-05, + "loss": 1.5604, + "step": 10063 + }, + { + "epoch": 0.7575603605638044, + "grad_norm": 5.101410388946533, + "learning_rate": 1.47633880016537e-05, + "loss": 1.6012, + "step": 10064 + }, + { + "epoch": 0.7576356348444645, + "grad_norm": 8.094220161437988, + "learning_rate": 1.4754739936396012e-05, + "loss": 2.1059, + "step": 10065 + }, + { + "epoch": 0.7577109091251246, + "grad_norm": 6.133645534515381, + "learning_rate": 1.474609396636792e-05, + "loss": 1.8676, + "step": 10066 + }, + { + "epoch": 0.7577861834057849, + "grad_norm": 4.60461950302124, + "learning_rate": 1.4737450092083371e-05, + "loss": 1.6885, + "step": 10067 + }, + { + "epoch": 0.757861457686445, + "grad_norm": 4.129703998565674, + "learning_rate": 1.4728808314056236e-05, + "loss": 1.6083, + "step": 10068 + }, + { + "epoch": 0.7579367319671051, + "grad_norm": 5.100508689880371, + "learning_rate": 1.472016863280024e-05, + "loss": 1.7562, + "step": 10069 + }, + { + "epoch": 0.7580120062477653, + "grad_norm": 5.066123008728027, + "learning_rate": 1.4711531048829008e-05, + "loss": 1.8232, + "step": 10070 + }, + { + "epoch": 0.7580872805284254, + "grad_norm": 5.168105125427246, + "learning_rate": 1.4702895562655983e-05, + "loss": 1.4935, + "step": 10071 + }, + { + "epoch": 0.7581625548090856, + "grad_norm": 5.9550933837890625, + "learning_rate": 1.469426217479456e-05, + "loss": 1.7039, + "step": 10072 + }, + { + "epoch": 0.7582378290897458, + "grad_norm": 3.7155375480651855, + "learning_rate": 1.4685630885757922e-05, + "loss": 1.6539, + "step": 10073 + }, + { + "epoch": 0.7583131033704059, + "grad_norm": 5.259271144866943, + "learning_rate": 1.4677001696059206e-05, + "loss": 1.8085, + "step": 10074 + }, + { + "epoch": 0.758388377651066, + "grad_norm": 5.260867595672607, + "learning_rate": 1.46683746062114e-05, + "loss": 1.4816, + "step": 10075 + }, + { + "epoch": 0.7584636519317263, + "grad_norm": 9.010892868041992, + "learning_rate": 1.465974961672733e-05, + "loss": 1.6957, + "step": 10076 + }, + { + "epoch": 0.7585389262123864, + "grad_norm": 6.998414039611816, + "learning_rate": 1.4651126728119763e-05, + "loss": 1.8098, + "step": 10077 + }, + { + "epoch": 0.7586142004930465, + "grad_norm": 4.8501458168029785, + "learning_rate": 1.4642505940901263e-05, + "loss": 1.9624, + "step": 10078 + }, + { + "epoch": 0.7586894747737067, + "grad_norm": 5.795738697052002, + "learning_rate": 1.4633887255584333e-05, + "loss": 1.4809, + "step": 10079 + }, + { + "epoch": 0.7587647490543669, + "grad_norm": 5.152638912200928, + "learning_rate": 1.4625270672681329e-05, + "loss": 1.7397, + "step": 10080 + }, + { + "epoch": 0.758840023335027, + "grad_norm": 4.586065769195557, + "learning_rate": 1.4616656192704503e-05, + "loss": 1.961, + "step": 10081 + }, + { + "epoch": 0.7589152976156872, + "grad_norm": 6.066518306732178, + "learning_rate": 1.4608043816165922e-05, + "loss": 1.9905, + "step": 10082 + }, + { + "epoch": 0.7589905718963473, + "grad_norm": 5.896439552307129, + "learning_rate": 1.4599433543577595e-05, + "loss": 1.6447, + "step": 10083 + }, + { + "epoch": 0.7590658461770075, + "grad_norm": 5.810495853424072, + "learning_rate": 1.4590825375451361e-05, + "loss": 1.7837, + "step": 10084 + }, + { + "epoch": 0.7591411204576676, + "grad_norm": 4.890543460845947, + "learning_rate": 1.4582219312298967e-05, + "loss": 1.8365, + "step": 10085 + }, + { + "epoch": 0.7592163947383278, + "grad_norm": 9.255048751831055, + "learning_rate": 1.4573615354631997e-05, + "loss": 2.0827, + "step": 10086 + }, + { + "epoch": 0.7592916690189879, + "grad_norm": 4.650705337524414, + "learning_rate": 1.4565013502961961e-05, + "loss": 1.5162, + "step": 10087 + }, + { + "epoch": 0.759366943299648, + "grad_norm": 4.950584888458252, + "learning_rate": 1.4556413757800185e-05, + "loss": 1.6253, + "step": 10088 + }, + { + "epoch": 0.7594422175803083, + "grad_norm": 5.099827766418457, + "learning_rate": 1.4547816119657909e-05, + "loss": 2.091, + "step": 10089 + }, + { + "epoch": 0.7595174918609684, + "grad_norm": 5.303583145141602, + "learning_rate": 1.4539220589046237e-05, + "loss": 1.5854, + "step": 10090 + }, + { + "epoch": 0.7595927661416285, + "grad_norm": 4.705607891082764, + "learning_rate": 1.4530627166476157e-05, + "loss": 1.4776, + "step": 10091 + }, + { + "epoch": 0.7596680404222887, + "grad_norm": 5.103209018707275, + "learning_rate": 1.4522035852458532e-05, + "loss": 1.9415, + "step": 10092 + }, + { + "epoch": 0.7597433147029489, + "grad_norm": 4.16525411605835, + "learning_rate": 1.4513446647504064e-05, + "loss": 1.8676, + "step": 10093 + }, + { + "epoch": 0.759818588983609, + "grad_norm": 4.694868564605713, + "learning_rate": 1.450485955212339e-05, + "loss": 1.6125, + "step": 10094 + }, + { + "epoch": 0.7598938632642692, + "grad_norm": 4.706785202026367, + "learning_rate": 1.4496274566826945e-05, + "loss": 1.7359, + "step": 10095 + }, + { + "epoch": 0.7599691375449293, + "grad_norm": 4.521183013916016, + "learning_rate": 1.4487691692125126e-05, + "loss": 1.4624, + "step": 10096 + }, + { + "epoch": 0.7600444118255895, + "grad_norm": 5.8799591064453125, + "learning_rate": 1.4479110928528122e-05, + "loss": 1.8318, + "step": 10097 + }, + { + "epoch": 0.7601196861062497, + "grad_norm": 3.8151891231536865, + "learning_rate": 1.4470532276546073e-05, + "loss": 1.5768, + "step": 10098 + }, + { + "epoch": 0.7601949603869098, + "grad_norm": 6.5606560707092285, + "learning_rate": 1.4461955736688915e-05, + "loss": 2.0879, + "step": 10099 + }, + { + "epoch": 0.7602702346675699, + "grad_norm": 3.7395894527435303, + "learning_rate": 1.4453381309466519e-05, + "loss": 1.5066, + "step": 10100 + }, + { + "epoch": 0.7603455089482302, + "grad_norm": 4.756248950958252, + "learning_rate": 1.444480899538861e-05, + "loss": 1.6712, + "step": 10101 + }, + { + "epoch": 0.7604207832288903, + "grad_norm": 5.572134971618652, + "learning_rate": 1.4436238794964802e-05, + "loss": 1.5674, + "step": 10102 + }, + { + "epoch": 0.7604960575095504, + "grad_norm": 4.928898811340332, + "learning_rate": 1.4427670708704532e-05, + "loss": 1.6759, + "step": 10103 + }, + { + "epoch": 0.7605713317902105, + "grad_norm": 3.838778257369995, + "learning_rate": 1.4419104737117195e-05, + "loss": 1.7176, + "step": 10104 + }, + { + "epoch": 0.7606466060708708, + "grad_norm": 4.890132904052734, + "learning_rate": 1.441054088071197e-05, + "loss": 1.4518, + "step": 10105 + }, + { + "epoch": 0.7607218803515309, + "grad_norm": 4.720916271209717, + "learning_rate": 1.4401979139997968e-05, + "loss": 1.6793, + "step": 10106 + }, + { + "epoch": 0.760797154632191, + "grad_norm": 4.491049766540527, + "learning_rate": 1.4393419515484186e-05, + "loss": 1.7992, + "step": 10107 + }, + { + "epoch": 0.7608724289128512, + "grad_norm": 6.02352237701416, + "learning_rate": 1.4384862007679429e-05, + "loss": 1.7626, + "step": 10108 + }, + { + "epoch": 0.7609477031935113, + "grad_norm": 5.63279914855957, + "learning_rate": 1.4376306617092445e-05, + "loss": 2.0995, + "step": 10109 + }, + { + "epoch": 0.7610229774741715, + "grad_norm": 7.28151798248291, + "learning_rate": 1.4367753344231804e-05, + "loss": 1.7465, + "step": 10110 + }, + { + "epoch": 0.7610982517548317, + "grad_norm": 5.273720741271973, + "learning_rate": 1.4359202189605991e-05, + "loss": 1.6489, + "step": 10111 + }, + { + "epoch": 0.7611735260354918, + "grad_norm": 6.151078224182129, + "learning_rate": 1.4350653153723337e-05, + "loss": 1.7647, + "step": 10112 + }, + { + "epoch": 0.7612488003161519, + "grad_norm": 7.303998947143555, + "learning_rate": 1.4342106237092085e-05, + "loss": 2.1778, + "step": 10113 + }, + { + "epoch": 0.7613240745968122, + "grad_norm": 4.938976764678955, + "learning_rate": 1.4333561440220283e-05, + "loss": 1.8573, + "step": 10114 + }, + { + "epoch": 0.7613993488774723, + "grad_norm": 5.373554229736328, + "learning_rate": 1.4325018763615933e-05, + "loss": 1.9773, + "step": 10115 + }, + { + "epoch": 0.7614746231581324, + "grad_norm": 5.634899616241455, + "learning_rate": 1.4316478207786837e-05, + "loss": 2.3235, + "step": 10116 + }, + { + "epoch": 0.7615498974387926, + "grad_norm": 4.085782527923584, + "learning_rate": 1.4307939773240735e-05, + "loss": 1.4427, + "step": 10117 + }, + { + "epoch": 0.7616251717194528, + "grad_norm": 5.606118202209473, + "learning_rate": 1.4299403460485195e-05, + "loss": 1.5548, + "step": 10118 + }, + { + "epoch": 0.7617004460001129, + "grad_norm": 6.036867141723633, + "learning_rate": 1.4290869270027696e-05, + "loss": 2.1235, + "step": 10119 + }, + { + "epoch": 0.7617757202807731, + "grad_norm": 5.042972087860107, + "learning_rate": 1.428233720237554e-05, + "loss": 1.5364, + "step": 10120 + }, + { + "epoch": 0.7618509945614332, + "grad_norm": 4.725385665893555, + "learning_rate": 1.4273807258035964e-05, + "loss": 1.7187, + "step": 10121 + }, + { + "epoch": 0.7619262688420934, + "grad_norm": 4.96973991394043, + "learning_rate": 1.4265279437516027e-05, + "loss": 2.192, + "step": 10122 + }, + { + "epoch": 0.7620015431227536, + "grad_norm": 5.463183403015137, + "learning_rate": 1.4256753741322692e-05, + "loss": 1.7678, + "step": 10123 + }, + { + "epoch": 0.7620768174034137, + "grad_norm": 4.306041717529297, + "learning_rate": 1.424823016996279e-05, + "loss": 1.9103, + "step": 10124 + }, + { + "epoch": 0.7621520916840738, + "grad_norm": 5.003134727478027, + "learning_rate": 1.4239708723943018e-05, + "loss": 1.4808, + "step": 10125 + }, + { + "epoch": 0.762227365964734, + "grad_norm": 6.2968950271606445, + "learning_rate": 1.4231189403769967e-05, + "loss": 1.5603, + "step": 10126 + }, + { + "epoch": 0.7623026402453942, + "grad_norm": 5.072829246520996, + "learning_rate": 1.4222672209950066e-05, + "loss": 1.2633, + "step": 10127 + }, + { + "epoch": 0.7623779145260543, + "grad_norm": 7.920516014099121, + "learning_rate": 1.4214157142989654e-05, + "loss": 1.5244, + "step": 10128 + }, + { + "epoch": 0.7624531888067144, + "grad_norm": 4.0125555992126465, + "learning_rate": 1.4205644203394908e-05, + "loss": 1.5937, + "step": 10129 + }, + { + "epoch": 0.7625284630873747, + "grad_norm": 4.432816982269287, + "learning_rate": 1.4197133391671925e-05, + "loss": 1.9464, + "step": 10130 + }, + { + "epoch": 0.7626037373680348, + "grad_norm": 4.216919422149658, + "learning_rate": 1.4188624708326609e-05, + "loss": 1.7704, + "step": 10131 + }, + { + "epoch": 0.7626790116486949, + "grad_norm": 5.988671779632568, + "learning_rate": 1.4180118153864818e-05, + "loss": 1.9277, + "step": 10132 + }, + { + "epoch": 0.7627542859293551, + "grad_norm": 5.29292106628418, + "learning_rate": 1.4171613728792215e-05, + "loss": 1.9177, + "step": 10133 + }, + { + "epoch": 0.7628295602100152, + "grad_norm": 4.31779670715332, + "learning_rate": 1.4163111433614369e-05, + "loss": 1.3979, + "step": 10134 + }, + { + "epoch": 0.7629048344906754, + "grad_norm": 6.334811687469482, + "learning_rate": 1.4154611268836714e-05, + "loss": 2.2132, + "step": 10135 + }, + { + "epoch": 0.7629801087713356, + "grad_norm": 4.904697895050049, + "learning_rate": 1.4146113234964592e-05, + "loss": 1.9821, + "step": 10136 + }, + { + "epoch": 0.7630553830519957, + "grad_norm": 6.991318225860596, + "learning_rate": 1.4137617332503144e-05, + "loss": 1.8405, + "step": 10137 + }, + { + "epoch": 0.7631306573326558, + "grad_norm": 5.191012382507324, + "learning_rate": 1.4129123561957457e-05, + "loss": 1.8925, + "step": 10138 + }, + { + "epoch": 0.7632059316133161, + "grad_norm": 4.45994234085083, + "learning_rate": 1.4120631923832433e-05, + "loss": 1.6965, + "step": 10139 + }, + { + "epoch": 0.7632812058939762, + "grad_norm": 7.204648017883301, + "learning_rate": 1.4112142418632895e-05, + "loss": 1.9773, + "step": 10140 + }, + { + "epoch": 0.7633564801746363, + "grad_norm": 6.436873435974121, + "learning_rate": 1.4103655046863535e-05, + "loss": 1.8566, + "step": 10141 + }, + { + "epoch": 0.7634317544552965, + "grad_norm": 4.682058811187744, + "learning_rate": 1.4095169809028863e-05, + "loss": 1.7463, + "step": 10142 + }, + { + "epoch": 0.7635070287359567, + "grad_norm": 5.8324408531188965, + "learning_rate": 1.408668670563334e-05, + "loss": 2.1097, + "step": 10143 + }, + { + "epoch": 0.7635823030166168, + "grad_norm": 5.299108505249023, + "learning_rate": 1.4078205737181233e-05, + "loss": 1.7499, + "step": 10144 + }, + { + "epoch": 0.7636575772972769, + "grad_norm": 6.168453216552734, + "learning_rate": 1.4069726904176722e-05, + "loss": 1.8839, + "step": 10145 + }, + { + "epoch": 0.7637328515779371, + "grad_norm": 5.390091896057129, + "learning_rate": 1.4061250207123849e-05, + "loss": 1.4335, + "step": 10146 + }, + { + "epoch": 0.7638081258585973, + "grad_norm": 5.720200538635254, + "learning_rate": 1.405277564652655e-05, + "loss": 1.6969, + "step": 10147 + }, + { + "epoch": 0.7638834001392574, + "grad_norm": 7.397523880004883, + "learning_rate": 1.4044303222888577e-05, + "loss": 2.0308, + "step": 10148 + }, + { + "epoch": 0.7639586744199176, + "grad_norm": 4.855720520019531, + "learning_rate": 1.4035832936713622e-05, + "loss": 1.7079, + "step": 10149 + }, + { + "epoch": 0.7640339487005777, + "grad_norm": 4.96509313583374, + "learning_rate": 1.4027364788505199e-05, + "loss": 1.327, + "step": 10150 + }, + { + "epoch": 0.7641092229812378, + "grad_norm": 4.938605308532715, + "learning_rate": 1.4018898778766732e-05, + "loss": 2.0292, + "step": 10151 + }, + { + "epoch": 0.7641844972618981, + "grad_norm": 5.345151901245117, + "learning_rate": 1.4010434908001474e-05, + "loss": 1.5294, + "step": 10152 + }, + { + "epoch": 0.7642597715425582, + "grad_norm": 4.048271179199219, + "learning_rate": 1.4001973176712613e-05, + "loss": 1.5688, + "step": 10153 + }, + { + "epoch": 0.7643350458232183, + "grad_norm": 4.621281623840332, + "learning_rate": 1.399351358540314e-05, + "loss": 2.1397, + "step": 10154 + }, + { + "epoch": 0.7644103201038785, + "grad_norm": 5.173277378082275, + "learning_rate": 1.3985056134575975e-05, + "loss": 1.7013, + "step": 10155 + }, + { + "epoch": 0.7644855943845387, + "grad_norm": 7.028407096862793, + "learning_rate": 1.3976600824733876e-05, + "loss": 1.8077, + "step": 10156 + }, + { + "epoch": 0.7645608686651988, + "grad_norm": 8.822701454162598, + "learning_rate": 1.3968147656379498e-05, + "loss": 1.82, + "step": 10157 + }, + { + "epoch": 0.764636142945859, + "grad_norm": 4.47874641418457, + "learning_rate": 1.3959696630015373e-05, + "loss": 1.6453, + "step": 10158 + }, + { + "epoch": 0.7647114172265191, + "grad_norm": 4.895484924316406, + "learning_rate": 1.3951247746143859e-05, + "loss": 1.8053, + "step": 10159 + }, + { + "epoch": 0.7647866915071793, + "grad_norm": 4.803922653198242, + "learning_rate": 1.3942801005267241e-05, + "loss": 1.6819, + "step": 10160 + }, + { + "epoch": 0.7648619657878395, + "grad_norm": 7.20355224609375, + "learning_rate": 1.3934356407887633e-05, + "loss": 1.7368, + "step": 10161 + }, + { + "epoch": 0.7649372400684996, + "grad_norm": 5.272319316864014, + "learning_rate": 1.3925913954507063e-05, + "loss": 1.8015, + "step": 10162 + }, + { + "epoch": 0.7650125143491597, + "grad_norm": 5.768642902374268, + "learning_rate": 1.3917473645627387e-05, + "loss": 1.7408, + "step": 10163 + }, + { + "epoch": 0.7650877886298199, + "grad_norm": 4.686328887939453, + "learning_rate": 1.3909035481750388e-05, + "loss": 2.0013, + "step": 10164 + }, + { + "epoch": 0.7651630629104801, + "grad_norm": 6.504966735839844, + "learning_rate": 1.3900599463377655e-05, + "loss": 1.7226, + "step": 10165 + }, + { + "epoch": 0.7652383371911402, + "grad_norm": 4.107400417327881, + "learning_rate": 1.3892165591010703e-05, + "loss": 1.9509, + "step": 10166 + }, + { + "epoch": 0.7653136114718003, + "grad_norm": 4.027732849121094, + "learning_rate": 1.3883733865150917e-05, + "loss": 1.6336, + "step": 10167 + }, + { + "epoch": 0.7653888857524606, + "grad_norm": 5.05764627456665, + "learning_rate": 1.3875304286299511e-05, + "loss": 1.8627, + "step": 10168 + }, + { + "epoch": 0.7654641600331207, + "grad_norm": 9.101910591125488, + "learning_rate": 1.3866876854957612e-05, + "loss": 1.6842, + "step": 10169 + }, + { + "epoch": 0.7655394343137808, + "grad_norm": 4.698602676391602, + "learning_rate": 1.385845157162622e-05, + "loss": 2.1931, + "step": 10170 + }, + { + "epoch": 0.765614708594441, + "grad_norm": 4.0162882804870605, + "learning_rate": 1.3850028436806163e-05, + "loss": 2.0744, + "step": 10171 + }, + { + "epoch": 0.7656899828751011, + "grad_norm": 5.0005950927734375, + "learning_rate": 1.384160745099819e-05, + "loss": 1.7237, + "step": 10172 + }, + { + "epoch": 0.7657652571557613, + "grad_norm": 4.594305992126465, + "learning_rate": 1.3833188614702918e-05, + "loss": 1.7381, + "step": 10173 + }, + { + "epoch": 0.7658405314364215, + "grad_norm": 6.005731582641602, + "learning_rate": 1.3824771928420798e-05, + "loss": 1.8101, + "step": 10174 + }, + { + "epoch": 0.7659158057170816, + "grad_norm": 5.330007553100586, + "learning_rate": 1.3816357392652202e-05, + "loss": 2.0456, + "step": 10175 + }, + { + "epoch": 0.7659910799977417, + "grad_norm": 5.196441173553467, + "learning_rate": 1.3807945007897315e-05, + "loss": 1.8135, + "step": 10176 + }, + { + "epoch": 0.766066354278402, + "grad_norm": 5.656313896179199, + "learning_rate": 1.3799534774656276e-05, + "loss": 2.0188, + "step": 10177 + }, + { + "epoch": 0.7661416285590621, + "grad_norm": 4.673975467681885, + "learning_rate": 1.3791126693429001e-05, + "loss": 1.707, + "step": 10178 + }, + { + "epoch": 0.7662169028397222, + "grad_norm": 4.612622261047363, + "learning_rate": 1.3782720764715357e-05, + "loss": 1.5591, + "step": 10179 + }, + { + "epoch": 0.7662921771203824, + "grad_norm": 5.464699745178223, + "learning_rate": 1.3774316989015034e-05, + "loss": 1.8789, + "step": 10180 + }, + { + "epoch": 0.7663674514010426, + "grad_norm": 4.1278533935546875, + "learning_rate": 1.3765915366827648e-05, + "loss": 1.58, + "step": 10181 + }, + { + "epoch": 0.7664427256817027, + "grad_norm": 4.506363391876221, + "learning_rate": 1.3757515898652607e-05, + "loss": 1.5439, + "step": 10182 + }, + { + "epoch": 0.7665179999623628, + "grad_norm": 3.9853479862213135, + "learning_rate": 1.3749118584989267e-05, + "loss": 1.6594, + "step": 10183 + }, + { + "epoch": 0.766593274243023, + "grad_norm": 3.770150899887085, + "learning_rate": 1.3740723426336799e-05, + "loss": 1.3656, + "step": 10184 + }, + { + "epoch": 0.7666685485236832, + "grad_norm": 3.9209647178649902, + "learning_rate": 1.37323304231943e-05, + "loss": 1.7156, + "step": 10185 + }, + { + "epoch": 0.7667438228043433, + "grad_norm": 4.824875831604004, + "learning_rate": 1.372393957606068e-05, + "loss": 1.6057, + "step": 10186 + }, + { + "epoch": 0.7668190970850035, + "grad_norm": 5.079814910888672, + "learning_rate": 1.3715550885434758e-05, + "loss": 1.544, + "step": 10187 + }, + { + "epoch": 0.7668943713656636, + "grad_norm": 5.106378078460693, + "learning_rate": 1.3707164351815244e-05, + "loss": 1.5633, + "step": 10188 + }, + { + "epoch": 0.7669696456463237, + "grad_norm": 5.742199897766113, + "learning_rate": 1.369877997570066e-05, + "loss": 1.3998, + "step": 10189 + }, + { + "epoch": 0.767044919926984, + "grad_norm": 5.798011779785156, + "learning_rate": 1.369039775758944e-05, + "loss": 1.7006, + "step": 10190 + }, + { + "epoch": 0.7671201942076441, + "grad_norm": 4.587980270385742, + "learning_rate": 1.3682017697979899e-05, + "loss": 1.6186, + "step": 10191 + }, + { + "epoch": 0.7671954684883042, + "grad_norm": 4.735659122467041, + "learning_rate": 1.3673639797370202e-05, + "loss": 1.7001, + "step": 10192 + }, + { + "epoch": 0.7672707427689645, + "grad_norm": 4.876805782318115, + "learning_rate": 1.3665264056258381e-05, + "loss": 1.4508, + "step": 10193 + }, + { + "epoch": 0.7673460170496246, + "grad_norm": 5.445727348327637, + "learning_rate": 1.3656890475142365e-05, + "loss": 1.6739, + "step": 10194 + }, + { + "epoch": 0.7674212913302847, + "grad_norm": 4.230088233947754, + "learning_rate": 1.3648519054519921e-05, + "loss": 1.7102, + "step": 10195 + }, + { + "epoch": 0.7674965656109449, + "grad_norm": 5.762754917144775, + "learning_rate": 1.3640149794888724e-05, + "loss": 1.6971, + "step": 10196 + }, + { + "epoch": 0.767571839891605, + "grad_norm": 6.53095817565918, + "learning_rate": 1.3631782696746281e-05, + "loss": 1.7859, + "step": 10197 + }, + { + "epoch": 0.7676471141722652, + "grad_norm": 6.96212100982666, + "learning_rate": 1.3623417760590019e-05, + "loss": 1.7841, + "step": 10198 + }, + { + "epoch": 0.7677223884529254, + "grad_norm": 5.955032825469971, + "learning_rate": 1.3615054986917181e-05, + "loss": 1.7942, + "step": 10199 + }, + { + "epoch": 0.7677976627335855, + "grad_norm": 5.478219985961914, + "learning_rate": 1.3606694376224927e-05, + "loss": 1.404, + "step": 10200 + }, + { + "epoch": 0.7678729370142456, + "grad_norm": 6.48700475692749, + "learning_rate": 1.3598335929010264e-05, + "loss": 2.182, + "step": 10201 + }, + { + "epoch": 0.7679482112949058, + "grad_norm": 5.597201347351074, + "learning_rate": 1.3589979645770095e-05, + "loss": 1.5984, + "step": 10202 + }, + { + "epoch": 0.768023485575566, + "grad_norm": 4.210601329803467, + "learning_rate": 1.3581625527001152e-05, + "loss": 1.7306, + "step": 10203 + }, + { + "epoch": 0.7680987598562261, + "grad_norm": 5.506786346435547, + "learning_rate": 1.3573273573200074e-05, + "loss": 1.719, + "step": 10204 + }, + { + "epoch": 0.7681740341368862, + "grad_norm": 5.6637492179870605, + "learning_rate": 1.3564923784863381e-05, + "loss": 1.6162, + "step": 10205 + }, + { + "epoch": 0.7682493084175465, + "grad_norm": 4.858002662658691, + "learning_rate": 1.3556576162487405e-05, + "loss": 1.7073, + "step": 10206 + }, + { + "epoch": 0.7683245826982066, + "grad_norm": 5.349017143249512, + "learning_rate": 1.3548230706568427e-05, + "loss": 1.7948, + "step": 10207 + }, + { + "epoch": 0.7683998569788667, + "grad_norm": 5.1434502601623535, + "learning_rate": 1.3539887417602525e-05, + "loss": 1.8034, + "step": 10208 + }, + { + "epoch": 0.7684751312595269, + "grad_norm": 5.197366714477539, + "learning_rate": 1.3531546296085718e-05, + "loss": 1.6545, + "step": 10209 + }, + { + "epoch": 0.768550405540187, + "grad_norm": 4.205633640289307, + "learning_rate": 1.3523207342513827e-05, + "loss": 1.6724, + "step": 10210 + }, + { + "epoch": 0.7686256798208472, + "grad_norm": 4.71999454498291, + "learning_rate": 1.3514870557382592e-05, + "loss": 1.712, + "step": 10211 + }, + { + "epoch": 0.7687009541015074, + "grad_norm": 7.27725887298584, + "learning_rate": 1.350653594118762e-05, + "loss": 1.7551, + "step": 10212 + }, + { + "epoch": 0.7687762283821675, + "grad_norm": 5.577297210693359, + "learning_rate": 1.3498203494424389e-05, + "loss": 1.6733, + "step": 10213 + }, + { + "epoch": 0.7688515026628276, + "grad_norm": 5.161733150482178, + "learning_rate": 1.3489873217588206e-05, + "loss": 1.3555, + "step": 10214 + }, + { + "epoch": 0.7689267769434879, + "grad_norm": 4.906425952911377, + "learning_rate": 1.348154511117432e-05, + "loss": 1.6006, + "step": 10215 + }, + { + "epoch": 0.769002051224148, + "grad_norm": 5.485410690307617, + "learning_rate": 1.3473219175677776e-05, + "loss": 1.2479, + "step": 10216 + }, + { + "epoch": 0.7690773255048081, + "grad_norm": 6.863162040710449, + "learning_rate": 1.3464895411593565e-05, + "loss": 1.8076, + "step": 10217 + }, + { + "epoch": 0.7691525997854683, + "grad_norm": 6.820272922515869, + "learning_rate": 1.3456573819416468e-05, + "loss": 2.1578, + "step": 10218 + }, + { + "epoch": 0.7692278740661285, + "grad_norm": 5.093837738037109, + "learning_rate": 1.3448254399641207e-05, + "loss": 1.6191, + "step": 10219 + }, + { + "epoch": 0.7693031483467886, + "grad_norm": 4.812144756317139, + "learning_rate": 1.3439937152762361e-05, + "loss": 1.922, + "step": 10220 + }, + { + "epoch": 0.7693784226274488, + "grad_norm": 5.317289352416992, + "learning_rate": 1.343162207927433e-05, + "loss": 1.6296, + "step": 10221 + }, + { + "epoch": 0.7694536969081089, + "grad_norm": 4.122788429260254, + "learning_rate": 1.3423309179671457e-05, + "loss": 1.6758, + "step": 10222 + }, + { + "epoch": 0.7695289711887691, + "grad_norm": 4.622232913970947, + "learning_rate": 1.3414998454447886e-05, + "loss": 2.0713, + "step": 10223 + }, + { + "epoch": 0.7696042454694292, + "grad_norm": 5.705224990844727, + "learning_rate": 1.340668990409768e-05, + "loss": 1.7215, + "step": 10224 + }, + { + "epoch": 0.7696795197500894, + "grad_norm": 4.129983425140381, + "learning_rate": 1.3398383529114766e-05, + "loss": 1.5216, + "step": 10225 + }, + { + "epoch": 0.7697547940307495, + "grad_norm": 16.65015983581543, + "learning_rate": 1.3390079329992943e-05, + "loss": 1.8478, + "step": 10226 + }, + { + "epoch": 0.7698300683114097, + "grad_norm": 6.217301368713379, + "learning_rate": 1.338177730722584e-05, + "loss": 1.6934, + "step": 10227 + }, + { + "epoch": 0.7699053425920699, + "grad_norm": 6.5831217765808105, + "learning_rate": 1.3373477461307027e-05, + "loss": 1.2938, + "step": 10228 + }, + { + "epoch": 0.76998061687273, + "grad_norm": 5.313953876495361, + "learning_rate": 1.3365179792729871e-05, + "loss": 1.711, + "step": 10229 + }, + { + "epoch": 0.7700558911533901, + "grad_norm": 5.7974853515625, + "learning_rate": 1.3356884301987671e-05, + "loss": 1.9951, + "step": 10230 + }, + { + "epoch": 0.7701311654340504, + "grad_norm": 5.088039398193359, + "learning_rate": 1.3348590989573544e-05, + "loss": 1.6943, + "step": 10231 + }, + { + "epoch": 0.7702064397147105, + "grad_norm": 5.849719047546387, + "learning_rate": 1.3340299855980531e-05, + "loss": 1.7053, + "step": 10232 + }, + { + "epoch": 0.7702817139953706, + "grad_norm": 5.1266584396362305, + "learning_rate": 1.3332010901701492e-05, + "loss": 1.8402, + "step": 10233 + }, + { + "epoch": 0.7703569882760308, + "grad_norm": 4.686154365539551, + "learning_rate": 1.3323724127229192e-05, + "loss": 1.7119, + "step": 10234 + }, + { + "epoch": 0.770432262556691, + "grad_norm": 6.998249530792236, + "learning_rate": 1.3315439533056251e-05, + "loss": 2.0737, + "step": 10235 + }, + { + "epoch": 0.7705075368373511, + "grad_norm": 7.463515758514404, + "learning_rate": 1.3307157119675179e-05, + "loss": 1.9611, + "step": 10236 + }, + { + "epoch": 0.7705828111180113, + "grad_norm": 4.3849968910217285, + "learning_rate": 1.3298876887578338e-05, + "loss": 1.8781, + "step": 10237 + }, + { + "epoch": 0.7706580853986714, + "grad_norm": 6.642724514007568, + "learning_rate": 1.329059883725795e-05, + "loss": 1.9034, + "step": 10238 + }, + { + "epoch": 0.7707333596793315, + "grad_norm": 5.543706893920898, + "learning_rate": 1.3282322969206145e-05, + "loss": 2.0299, + "step": 10239 + }, + { + "epoch": 0.7708086339599918, + "grad_norm": 4.8008131980896, + "learning_rate": 1.3274049283914869e-05, + "loss": 2.2268, + "step": 10240 + }, + { + "epoch": 0.7708839082406519, + "grad_norm": 6.801697731018066, + "learning_rate": 1.3265777781875998e-05, + "loss": 1.9507, + "step": 10241 + }, + { + "epoch": 0.770959182521312, + "grad_norm": 4.281809329986572, + "learning_rate": 1.325750846358122e-05, + "loss": 1.7693, + "step": 10242 + }, + { + "epoch": 0.7710344568019721, + "grad_norm": 4.42861795425415, + "learning_rate": 1.3249241329522155e-05, + "loss": 1.6532, + "step": 10243 + }, + { + "epoch": 0.7711097310826324, + "grad_norm": 8.229764938354492, + "learning_rate": 1.3240976380190229e-05, + "loss": 2.4722, + "step": 10244 + }, + { + "epoch": 0.7711850053632925, + "grad_norm": 6.613121509552002, + "learning_rate": 1.323271361607678e-05, + "loss": 1.8756, + "step": 10245 + }, + { + "epoch": 0.7712602796439526, + "grad_norm": 7.833305358886719, + "learning_rate": 1.3224453037673012e-05, + "loss": 1.9431, + "step": 10246 + }, + { + "epoch": 0.7713355539246128, + "grad_norm": 5.7198896408081055, + "learning_rate": 1.3216194645470003e-05, + "loss": 1.5606, + "step": 10247 + }, + { + "epoch": 0.771410828205273, + "grad_norm": 5.165931701660156, + "learning_rate": 1.3207938439958667e-05, + "loss": 1.51, + "step": 10248 + }, + { + "epoch": 0.7714861024859331, + "grad_norm": 5.363079071044922, + "learning_rate": 1.3199684421629833e-05, + "loss": 1.5776, + "step": 10249 + }, + { + "epoch": 0.7715613767665933, + "grad_norm": 4.452789783477783, + "learning_rate": 1.3191432590974157e-05, + "loss": 1.6926, + "step": 10250 + }, + { + "epoch": 0.7716366510472534, + "grad_norm": 5.525335311889648, + "learning_rate": 1.3183182948482209e-05, + "loss": 1.7787, + "step": 10251 + }, + { + "epoch": 0.7717119253279136, + "grad_norm": 7.703904628753662, + "learning_rate": 1.3174935494644385e-05, + "loss": 2.1641, + "step": 10252 + }, + { + "epoch": 0.7717871996085738, + "grad_norm": 7.517100811004639, + "learning_rate": 1.3166690229950978e-05, + "loss": 2.1448, + "step": 10253 + }, + { + "epoch": 0.7718624738892339, + "grad_norm": 6.480628490447998, + "learning_rate": 1.3158447154892168e-05, + "loss": 2.1454, + "step": 10254 + }, + { + "epoch": 0.771937748169894, + "grad_norm": 6.072985649108887, + "learning_rate": 1.3150206269957948e-05, + "loss": 1.9743, + "step": 10255 + }, + { + "epoch": 0.7720130224505543, + "grad_norm": 5.4649834632873535, + "learning_rate": 1.3141967575638231e-05, + "loss": 1.9911, + "step": 10256 + }, + { + "epoch": 0.7720882967312144, + "grad_norm": 4.9850592613220215, + "learning_rate": 1.313373107242279e-05, + "loss": 1.819, + "step": 10257 + }, + { + "epoch": 0.7721635710118745, + "grad_norm": 4.890212535858154, + "learning_rate": 1.3125496760801265e-05, + "loss": 1.9531, + "step": 10258 + }, + { + "epoch": 0.7722388452925347, + "grad_norm": 4.969103813171387, + "learning_rate": 1.3117264641263139e-05, + "loss": 1.4442, + "step": 10259 + }, + { + "epoch": 0.7723141195731948, + "grad_norm": 4.332989692687988, + "learning_rate": 1.3109034714297813e-05, + "loss": 1.5818, + "step": 10260 + }, + { + "epoch": 0.772389393853855, + "grad_norm": 4.930012226104736, + "learning_rate": 1.3100806980394508e-05, + "loss": 1.6395, + "step": 10261 + }, + { + "epoch": 0.7724646681345151, + "grad_norm": 5.588300704956055, + "learning_rate": 1.3092581440042368e-05, + "loss": 1.5416, + "step": 10262 + }, + { + "epoch": 0.7725399424151753, + "grad_norm": 7.19502067565918, + "learning_rate": 1.3084358093730348e-05, + "loss": 1.9466, + "step": 10263 + }, + { + "epoch": 0.7726152166958354, + "grad_norm": 4.257158279418945, + "learning_rate": 1.3076136941947331e-05, + "loss": 1.5019, + "step": 10264 + }, + { + "epoch": 0.7726904909764956, + "grad_norm": 4.666217803955078, + "learning_rate": 1.3067917985182016e-05, + "loss": 1.8647, + "step": 10265 + }, + { + "epoch": 0.7727657652571558, + "grad_norm": 5.680887699127197, + "learning_rate": 1.3059701223923e-05, + "loss": 2.2243, + "step": 10266 + }, + { + "epoch": 0.7728410395378159, + "grad_norm": 4.806848049163818, + "learning_rate": 1.3051486658658756e-05, + "loss": 1.8854, + "step": 10267 + }, + { + "epoch": 0.772916313818476, + "grad_norm": 5.246175765991211, + "learning_rate": 1.3043274289877627e-05, + "loss": 1.7991, + "step": 10268 + }, + { + "epoch": 0.7729915880991363, + "grad_norm": 7.053308963775635, + "learning_rate": 1.3035064118067785e-05, + "loss": 1.6427, + "step": 10269 + }, + { + "epoch": 0.7730668623797964, + "grad_norm": 5.682214260101318, + "learning_rate": 1.3026856143717314e-05, + "loss": 2.0361, + "step": 10270 + }, + { + "epoch": 0.7731421366604565, + "grad_norm": 6.381186008453369, + "learning_rate": 1.3018650367314179e-05, + "loss": 1.7591, + "step": 10271 + }, + { + "epoch": 0.7732174109411167, + "grad_norm": 3.792112112045288, + "learning_rate": 1.3010446789346149e-05, + "loss": 1.8476, + "step": 10272 + }, + { + "epoch": 0.7732926852217769, + "grad_norm": 4.088170051574707, + "learning_rate": 1.3002245410300939e-05, + "loss": 1.5566, + "step": 10273 + }, + { + "epoch": 0.773367959502437, + "grad_norm": 5.690225601196289, + "learning_rate": 1.2994046230666068e-05, + "loss": 1.9006, + "step": 10274 + }, + { + "epoch": 0.7734432337830972, + "grad_norm": 5.58751106262207, + "learning_rate": 1.2985849250928978e-05, + "loss": 1.9268, + "step": 10275 + }, + { + "epoch": 0.7735185080637573, + "grad_norm": 7.11445951461792, + "learning_rate": 1.2977654471576929e-05, + "loss": 1.6542, + "step": 10276 + }, + { + "epoch": 0.7735937823444174, + "grad_norm": 5.366473197937012, + "learning_rate": 1.2969461893097113e-05, + "loss": 2.2286, + "step": 10277 + }, + { + "epoch": 0.7736690566250777, + "grad_norm": 4.1333842277526855, + "learning_rate": 1.2961271515976515e-05, + "loss": 1.6497, + "step": 10278 + }, + { + "epoch": 0.7737443309057378, + "grad_norm": 4.328958034515381, + "learning_rate": 1.2953083340702049e-05, + "loss": 1.8362, + "step": 10279 + }, + { + "epoch": 0.7738196051863979, + "grad_norm": 4.567979335784912, + "learning_rate": 1.2944897367760478e-05, + "loss": 1.6662, + "step": 10280 + }, + { + "epoch": 0.773894879467058, + "grad_norm": 4.012460708618164, + "learning_rate": 1.2936713597638456e-05, + "loss": 1.9246, + "step": 10281 + }, + { + "epoch": 0.7739701537477183, + "grad_norm": 5.220430374145508, + "learning_rate": 1.2928532030822444e-05, + "loss": 1.6656, + "step": 10282 + }, + { + "epoch": 0.7740454280283784, + "grad_norm": 5.515970230102539, + "learning_rate": 1.2920352667798852e-05, + "loss": 1.7491, + "step": 10283 + }, + { + "epoch": 0.7741207023090385, + "grad_norm": 4.307553768157959, + "learning_rate": 1.291217550905388e-05, + "loss": 1.7732, + "step": 10284 + }, + { + "epoch": 0.7741959765896987, + "grad_norm": 8.74333667755127, + "learning_rate": 1.2904000555073664e-05, + "loss": 1.7607, + "step": 10285 + }, + { + "epoch": 0.7742712508703589, + "grad_norm": 4.565000534057617, + "learning_rate": 1.2895827806344185e-05, + "loss": 1.8625, + "step": 10286 + }, + { + "epoch": 0.774346525151019, + "grad_norm": 4.800649166107178, + "learning_rate": 1.2887657263351265e-05, + "loss": 1.7607, + "step": 10287 + }, + { + "epoch": 0.7744217994316792, + "grad_norm": 6.736042499542236, + "learning_rate": 1.2879488926580647e-05, + "loss": 2.1323, + "step": 10288 + }, + { + "epoch": 0.7744970737123393, + "grad_norm": 4.857367038726807, + "learning_rate": 1.2871322796517888e-05, + "loss": 1.368, + "step": 10289 + }, + { + "epoch": 0.7745723479929995, + "grad_norm": 5.044548511505127, + "learning_rate": 1.2863158873648456e-05, + "loss": 1.6316, + "step": 10290 + }, + { + "epoch": 0.7746476222736597, + "grad_norm": 4.925876140594482, + "learning_rate": 1.285499715845767e-05, + "loss": 1.3224, + "step": 10291 + }, + { + "epoch": 0.7747228965543198, + "grad_norm": 4.4432573318481445, + "learning_rate": 1.2846837651430737e-05, + "loss": 2.0812, + "step": 10292 + }, + { + "epoch": 0.7747981708349799, + "grad_norm": 3.8878657817840576, + "learning_rate": 1.283868035305269e-05, + "loss": 1.4908, + "step": 10293 + }, + { + "epoch": 0.7748734451156402, + "grad_norm": 4.609302997589111, + "learning_rate": 1.283052526380848e-05, + "loss": 2.2326, + "step": 10294 + }, + { + "epoch": 0.7749487193963003, + "grad_norm": 5.825728893280029, + "learning_rate": 1.2822372384182874e-05, + "loss": 1.7323, + "step": 10295 + }, + { + "epoch": 0.7750239936769604, + "grad_norm": 4.553735733032227, + "learning_rate": 1.2814221714660574e-05, + "loss": 1.8489, + "step": 10296 + }, + { + "epoch": 0.7750992679576206, + "grad_norm": 5.288209915161133, + "learning_rate": 1.2806073255726076e-05, + "loss": 1.9439, + "step": 10297 + }, + { + "epoch": 0.7751745422382808, + "grad_norm": 4.695043087005615, + "learning_rate": 1.2797927007863819e-05, + "loss": 1.5055, + "step": 10298 + }, + { + "epoch": 0.7752498165189409, + "grad_norm": 4.3415069580078125, + "learning_rate": 1.2789782971558046e-05, + "loss": 1.8138, + "step": 10299 + }, + { + "epoch": 0.775325090799601, + "grad_norm": 4.5453691482543945, + "learning_rate": 1.27816411472929e-05, + "loss": 1.6747, + "step": 10300 + }, + { + "epoch": 0.7754003650802612, + "grad_norm": 6.97796630859375, + "learning_rate": 1.2773501535552402e-05, + "loss": 1.8747, + "step": 10301 + }, + { + "epoch": 0.7754756393609213, + "grad_norm": 4.79071044921875, + "learning_rate": 1.2765364136820418e-05, + "loss": 1.6734, + "step": 10302 + }, + { + "epoch": 0.7755509136415815, + "grad_norm": 3.522183656692505, + "learning_rate": 1.2757228951580718e-05, + "loss": 1.2688, + "step": 10303 + }, + { + "epoch": 0.7756261879222417, + "grad_norm": 5.459575653076172, + "learning_rate": 1.2749095980316877e-05, + "loss": 1.7673, + "step": 10304 + }, + { + "epoch": 0.7757014622029018, + "grad_norm": 4.961472511291504, + "learning_rate": 1.2740965223512413e-05, + "loss": 1.6318, + "step": 10305 + }, + { + "epoch": 0.7757767364835619, + "grad_norm": 4.215363025665283, + "learning_rate": 1.273283668165064e-05, + "loss": 1.6714, + "step": 10306 + }, + { + "epoch": 0.7758520107642222, + "grad_norm": 4.462075233459473, + "learning_rate": 1.2724710355214809e-05, + "loss": 1.6308, + "step": 10307 + }, + { + "epoch": 0.7759272850448823, + "grad_norm": 6.183042049407959, + "learning_rate": 1.2716586244687984e-05, + "loss": 1.5209, + "step": 10308 + }, + { + "epoch": 0.7760025593255424, + "grad_norm": 5.079890251159668, + "learning_rate": 1.2708464350553134e-05, + "loss": 2.012, + "step": 10309 + }, + { + "epoch": 0.7760778336062026, + "grad_norm": 6.207006454467773, + "learning_rate": 1.2700344673293069e-05, + "loss": 2.215, + "step": 10310 + }, + { + "epoch": 0.7761531078868628, + "grad_norm": 4.307170867919922, + "learning_rate": 1.269222721339049e-05, + "loss": 1.4973, + "step": 10311 + }, + { + "epoch": 0.7762283821675229, + "grad_norm": 5.595042705535889, + "learning_rate": 1.268411197132795e-05, + "loss": 1.7722, + "step": 10312 + }, + { + "epoch": 0.7763036564481831, + "grad_norm": 4.818110466003418, + "learning_rate": 1.2675998947587898e-05, + "loss": 1.9686, + "step": 10313 + }, + { + "epoch": 0.7763789307288432, + "grad_norm": 6.0062761306762695, + "learning_rate": 1.2667888142652601e-05, + "loss": 1.7757, + "step": 10314 + }, + { + "epoch": 0.7764542050095034, + "grad_norm": 6.283889293670654, + "learning_rate": 1.2659779557004248e-05, + "loss": 1.5556, + "step": 10315 + }, + { + "epoch": 0.7765294792901636, + "grad_norm": 5.00290060043335, + "learning_rate": 1.2651673191124847e-05, + "loss": 1.9766, + "step": 10316 + }, + { + "epoch": 0.7766047535708237, + "grad_norm": 6.145565032958984, + "learning_rate": 1.2643569045496312e-05, + "loss": 1.8794, + "step": 10317 + }, + { + "epoch": 0.7766800278514838, + "grad_norm": 6.193139553070068, + "learning_rate": 1.263546712060042e-05, + "loss": 1.7614, + "step": 10318 + }, + { + "epoch": 0.776755302132144, + "grad_norm": 5.160417079925537, + "learning_rate": 1.2627367416918783e-05, + "loss": 1.5131, + "step": 10319 + }, + { + "epoch": 0.7768305764128042, + "grad_norm": 7.251806735992432, + "learning_rate": 1.2619269934932937e-05, + "loss": 1.8154, + "step": 10320 + }, + { + "epoch": 0.7769058506934643, + "grad_norm": 3.9753565788269043, + "learning_rate": 1.2611174675124222e-05, + "loss": 1.5316, + "step": 10321 + }, + { + "epoch": 0.7769811249741244, + "grad_norm": 5.273367404937744, + "learning_rate": 1.26030816379739e-05, + "loss": 1.7175, + "step": 10322 + }, + { + "epoch": 0.7770563992547846, + "grad_norm": 6.550223350524902, + "learning_rate": 1.259499082396306e-05, + "loss": 1.5607, + "step": 10323 + }, + { + "epoch": 0.7771316735354448, + "grad_norm": 4.198886871337891, + "learning_rate": 1.258690223357269e-05, + "loss": 1.1894, + "step": 10324 + }, + { + "epoch": 0.7772069478161049, + "grad_norm": 5.8040361404418945, + "learning_rate": 1.2578815867283628e-05, + "loss": 1.9106, + "step": 10325 + }, + { + "epoch": 0.7772822220967651, + "grad_norm": 5.107682704925537, + "learning_rate": 1.2570731725576606e-05, + "loss": 1.6954, + "step": 10326 + }, + { + "epoch": 0.7773574963774252, + "grad_norm": 4.525354862213135, + "learning_rate": 1.2562649808932175e-05, + "loss": 1.3459, + "step": 10327 + }, + { + "epoch": 0.7774327706580854, + "grad_norm": 5.908242702484131, + "learning_rate": 1.2554570117830805e-05, + "loss": 1.8911, + "step": 10328 + }, + { + "epoch": 0.7775080449387456, + "grad_norm": 5.483054161071777, + "learning_rate": 1.2546492652752784e-05, + "loss": 1.6569, + "step": 10329 + }, + { + "epoch": 0.7775833192194057, + "grad_norm": 4.690734386444092, + "learning_rate": 1.2538417414178322e-05, + "loss": 1.601, + "step": 10330 + }, + { + "epoch": 0.7776585935000658, + "grad_norm": 4.324377059936523, + "learning_rate": 1.2530344402587441e-05, + "loss": 1.5411, + "step": 10331 + }, + { + "epoch": 0.7777338677807261, + "grad_norm": 6.816080093383789, + "learning_rate": 1.2522273618460073e-05, + "loss": 1.7946, + "step": 10332 + }, + { + "epoch": 0.7778091420613862, + "grad_norm": 4.970005512237549, + "learning_rate": 1.2514205062276019e-05, + "loss": 1.9737, + "step": 10333 + }, + { + "epoch": 0.7778844163420463, + "grad_norm": 4.52910852432251, + "learning_rate": 1.25061387345149e-05, + "loss": 1.7355, + "step": 10334 + }, + { + "epoch": 0.7779596906227065, + "grad_norm": 6.592383861541748, + "learning_rate": 1.2498074635656248e-05, + "loss": 2.3875, + "step": 10335 + }, + { + "epoch": 0.7780349649033667, + "grad_norm": 3.89489483833313, + "learning_rate": 1.249001276617946e-05, + "loss": 1.6784, + "step": 10336 + }, + { + "epoch": 0.7781102391840268, + "grad_norm": 4.704974174499512, + "learning_rate": 1.2481953126563795e-05, + "loss": 1.7346, + "step": 10337 + }, + { + "epoch": 0.778185513464687, + "grad_norm": 4.8796916007995605, + "learning_rate": 1.2473895717288353e-05, + "loss": 1.4454, + "step": 10338 + }, + { + "epoch": 0.7782607877453471, + "grad_norm": 4.560452938079834, + "learning_rate": 1.2465840538832147e-05, + "loss": 1.4695, + "step": 10339 + }, + { + "epoch": 0.7783360620260072, + "grad_norm": 7.7489118576049805, + "learning_rate": 1.2457787591674014e-05, + "loss": 1.9832, + "step": 10340 + }, + { + "epoch": 0.7784113363066674, + "grad_norm": 5.141170501708984, + "learning_rate": 1.2449736876292695e-05, + "loss": 1.9783, + "step": 10341 + }, + { + "epoch": 0.7784866105873276, + "grad_norm": 5.449611663818359, + "learning_rate": 1.2441688393166767e-05, + "loss": 1.7474, + "step": 10342 + }, + { + "epoch": 0.7785618848679877, + "grad_norm": 5.13983678817749, + "learning_rate": 1.243364214277471e-05, + "loss": 1.4331, + "step": 10343 + }, + { + "epoch": 0.7786371591486478, + "grad_norm": 3.841327428817749, + "learning_rate": 1.2425598125594823e-05, + "loss": 1.5642, + "step": 10344 + }, + { + "epoch": 0.7787124334293081, + "grad_norm": 5.303977012634277, + "learning_rate": 1.2417556342105314e-05, + "loss": 1.6515, + "step": 10345 + }, + { + "epoch": 0.7787877077099682, + "grad_norm": 5.259059429168701, + "learning_rate": 1.2409516792784242e-05, + "loss": 1.4326, + "step": 10346 + }, + { + "epoch": 0.7788629819906283, + "grad_norm": 4.944968223571777, + "learning_rate": 1.2401479478109557e-05, + "loss": 1.6456, + "step": 10347 + }, + { + "epoch": 0.7789382562712885, + "grad_norm": 5.618319988250732, + "learning_rate": 1.2393444398559018e-05, + "loss": 2.1742, + "step": 10348 + }, + { + "epoch": 0.7790135305519487, + "grad_norm": 7.4382853507995605, + "learning_rate": 1.2385411554610304e-05, + "loss": 1.6208, + "step": 10349 + }, + { + "epoch": 0.7790888048326088, + "grad_norm": 6.550363540649414, + "learning_rate": 1.2377380946740958e-05, + "loss": 1.8913, + "step": 10350 + }, + { + "epoch": 0.779164079113269, + "grad_norm": 5.512263774871826, + "learning_rate": 1.236935257542835e-05, + "loss": 1.7068, + "step": 10351 + }, + { + "epoch": 0.7792393533939291, + "grad_norm": 4.1389594078063965, + "learning_rate": 1.2361326441149773e-05, + "loss": 1.5726, + "step": 10352 + }, + { + "epoch": 0.7793146276745893, + "grad_norm": 5.506958961486816, + "learning_rate": 1.2353302544382329e-05, + "loss": 1.7796, + "step": 10353 + }, + { + "epoch": 0.7793899019552495, + "grad_norm": 5.596493244171143, + "learning_rate": 1.2345280885603039e-05, + "loss": 1.7706, + "step": 10354 + }, + { + "epoch": 0.7794651762359096, + "grad_norm": 6.378200054168701, + "learning_rate": 1.2337261465288742e-05, + "loss": 1.9278, + "step": 10355 + }, + { + "epoch": 0.7795404505165697, + "grad_norm": 5.696508407592773, + "learning_rate": 1.2329244283916187e-05, + "loss": 1.7747, + "step": 10356 + }, + { + "epoch": 0.77961572479723, + "grad_norm": 5.246788501739502, + "learning_rate": 1.2321229341961965e-05, + "loss": 2.0656, + "step": 10357 + }, + { + "epoch": 0.7796909990778901, + "grad_norm": 5.020092010498047, + "learning_rate": 1.2313216639902564e-05, + "loss": 1.4931, + "step": 10358 + }, + { + "epoch": 0.7797662733585502, + "grad_norm": 6.718818187713623, + "learning_rate": 1.2305206178214279e-05, + "loss": 1.6398, + "step": 10359 + }, + { + "epoch": 0.7798415476392103, + "grad_norm": 5.6989288330078125, + "learning_rate": 1.2297197957373353e-05, + "loss": 1.5864, + "step": 10360 + }, + { + "epoch": 0.7799168219198706, + "grad_norm": 6.278674602508545, + "learning_rate": 1.2289191977855801e-05, + "loss": 2.1765, + "step": 10361 + }, + { + "epoch": 0.7799920962005307, + "grad_norm": 4.680496692657471, + "learning_rate": 1.2281188240137604e-05, + "loss": 1.4982, + "step": 10362 + }, + { + "epoch": 0.7800673704811908, + "grad_norm": 4.018574237823486, + "learning_rate": 1.2273186744694521e-05, + "loss": 1.8108, + "step": 10363 + }, + { + "epoch": 0.780142644761851, + "grad_norm": 5.31124210357666, + "learning_rate": 1.2265187492002239e-05, + "loss": 1.5725, + "step": 10364 + }, + { + "epoch": 0.7802179190425111, + "grad_norm": 8.885973930358887, + "learning_rate": 1.2257190482536296e-05, + "loss": 2.0078, + "step": 10365 + }, + { + "epoch": 0.7802931933231713, + "grad_norm": 6.205285549163818, + "learning_rate": 1.2249195716772071e-05, + "loss": 1.7303, + "step": 10366 + }, + { + "epoch": 0.7803684676038315, + "grad_norm": 6.261410236358643, + "learning_rate": 1.2241203195184842e-05, + "loss": 1.6018, + "step": 10367 + }, + { + "epoch": 0.7804437418844916, + "grad_norm": 4.895419597625732, + "learning_rate": 1.2233212918249742e-05, + "loss": 2.0266, + "step": 10368 + }, + { + "epoch": 0.7805190161651517, + "grad_norm": 4.210794925689697, + "learning_rate": 1.2225224886441784e-05, + "loss": 1.7217, + "step": 10369 + }, + { + "epoch": 0.780594290445812, + "grad_norm": 5.944062232971191, + "learning_rate": 1.22172391002358e-05, + "loss": 1.7784, + "step": 10370 + }, + { + "epoch": 0.7806695647264721, + "grad_norm": 6.118096828460693, + "learning_rate": 1.220925556010656e-05, + "loss": 1.6184, + "step": 10371 + }, + { + "epoch": 0.7807448390071322, + "grad_norm": 4.016359806060791, + "learning_rate": 1.2201274266528622e-05, + "loss": 1.369, + "step": 10372 + }, + { + "epoch": 0.7808201132877924, + "grad_norm": 7.931426048278809, + "learning_rate": 1.2193295219976491e-05, + "loss": 2.1792, + "step": 10373 + }, + { + "epoch": 0.7808953875684526, + "grad_norm": 5.14262056350708, + "learning_rate": 1.2185318420924462e-05, + "loss": 1.6773, + "step": 10374 + }, + { + "epoch": 0.7809706618491127, + "grad_norm": 6.011925220489502, + "learning_rate": 1.2177343869846769e-05, + "loss": 1.7462, + "step": 10375 + }, + { + "epoch": 0.7810459361297729, + "grad_norm": 5.608316421508789, + "learning_rate": 1.216937156721744e-05, + "loss": 1.8358, + "step": 10376 + }, + { + "epoch": 0.781121210410433, + "grad_norm": 4.704582214355469, + "learning_rate": 1.2161401513510439e-05, + "loss": 1.7629, + "step": 10377 + }, + { + "epoch": 0.7811964846910932, + "grad_norm": 6.235286235809326, + "learning_rate": 1.2153433709199536e-05, + "loss": 1.6791, + "step": 10378 + }, + { + "epoch": 0.7812717589717533, + "grad_norm": 7.8474040031433105, + "learning_rate": 1.21454681547584e-05, + "loss": 1.5677, + "step": 10379 + }, + { + "epoch": 0.7813470332524135, + "grad_norm": 4.421538352966309, + "learning_rate": 1.2137504850660564e-05, + "loss": 1.8301, + "step": 10380 + }, + { + "epoch": 0.7814223075330736, + "grad_norm": 5.945563793182373, + "learning_rate": 1.2129543797379428e-05, + "loss": 1.6735, + "step": 10381 + }, + { + "epoch": 0.7814975818137337, + "grad_norm": 5.697854995727539, + "learning_rate": 1.2121584995388268e-05, + "loss": 1.6392, + "step": 10382 + }, + { + "epoch": 0.781572856094394, + "grad_norm": 5.222346305847168, + "learning_rate": 1.2113628445160175e-05, + "loss": 1.5816, + "step": 10383 + }, + { + "epoch": 0.7816481303750541, + "grad_norm": 5.863215446472168, + "learning_rate": 1.2105674147168177e-05, + "loss": 1.7197, + "step": 10384 + }, + { + "epoch": 0.7817234046557142, + "grad_norm": 5.299314975738525, + "learning_rate": 1.2097722101885112e-05, + "loss": 1.5862, + "step": 10385 + }, + { + "epoch": 0.7817986789363744, + "grad_norm": 5.9684739112854, + "learning_rate": 1.208977230978372e-05, + "loss": 1.5973, + "step": 10386 + }, + { + "epoch": 0.7818739532170346, + "grad_norm": 5.788741588592529, + "learning_rate": 1.208182477133658e-05, + "loss": 2.0576, + "step": 10387 + }, + { + "epoch": 0.7819492274976947, + "grad_norm": 4.03679895401001, + "learning_rate": 1.2073879487016171e-05, + "loss": 1.594, + "step": 10388 + }, + { + "epoch": 0.7820245017783549, + "grad_norm": 5.378968715667725, + "learning_rate": 1.2065936457294785e-05, + "loss": 1.6345, + "step": 10389 + }, + { + "epoch": 0.782099776059015, + "grad_norm": 4.676007270812988, + "learning_rate": 1.205799568264464e-05, + "loss": 1.6658, + "step": 10390 + }, + { + "epoch": 0.7821750503396752, + "grad_norm": 4.683546543121338, + "learning_rate": 1.2050057163537775e-05, + "loss": 1.729, + "step": 10391 + }, + { + "epoch": 0.7822503246203354, + "grad_norm": 4.563548564910889, + "learning_rate": 1.2042120900446141e-05, + "loss": 1.918, + "step": 10392 + }, + { + "epoch": 0.7823255989009955, + "grad_norm": 5.3870038986206055, + "learning_rate": 1.2034186893841493e-05, + "loss": 1.8164, + "step": 10393 + }, + { + "epoch": 0.7824008731816556, + "grad_norm": 4.784528732299805, + "learning_rate": 1.2026255144195508e-05, + "loss": 1.6157, + "step": 10394 + }, + { + "epoch": 0.7824761474623159, + "grad_norm": 6.122600555419922, + "learning_rate": 1.2018325651979684e-05, + "loss": 1.7472, + "step": 10395 + }, + { + "epoch": 0.782551421742976, + "grad_norm": 8.900561332702637, + "learning_rate": 1.201039841766543e-05, + "loss": 2.0104, + "step": 10396 + }, + { + "epoch": 0.7826266960236361, + "grad_norm": 5.1237311363220215, + "learning_rate": 1.2002473441723972e-05, + "loss": 1.5203, + "step": 10397 + }, + { + "epoch": 0.7827019703042963, + "grad_norm": 7.0345540046691895, + "learning_rate": 1.1994550724626441e-05, + "loss": 1.706, + "step": 10398 + }, + { + "epoch": 0.7827772445849565, + "grad_norm": 6.087735652923584, + "learning_rate": 1.1986630266843835e-05, + "loss": 1.5862, + "step": 10399 + }, + { + "epoch": 0.7828525188656166, + "grad_norm": 5.51188850402832, + "learning_rate": 1.1978712068846976e-05, + "loss": 1.1813, + "step": 10400 + }, + { + "epoch": 0.7829277931462767, + "grad_norm": 9.195707321166992, + "learning_rate": 1.1970796131106582e-05, + "loss": 2.1399, + "step": 10401 + }, + { + "epoch": 0.7830030674269369, + "grad_norm": 5.205798625946045, + "learning_rate": 1.1962882454093244e-05, + "loss": 1.899, + "step": 10402 + }, + { + "epoch": 0.783078341707597, + "grad_norm": 5.538637161254883, + "learning_rate": 1.1954971038277418e-05, + "loss": 1.4458, + "step": 10403 + }, + { + "epoch": 0.7831536159882572, + "grad_norm": 8.486610412597656, + "learning_rate": 1.1947061884129385e-05, + "loss": 2.2277, + "step": 10404 + }, + { + "epoch": 0.7832288902689174, + "grad_norm": 4.626307964324951, + "learning_rate": 1.1939154992119356e-05, + "loss": 1.5518, + "step": 10405 + }, + { + "epoch": 0.7833041645495775, + "grad_norm": 4.804617404937744, + "learning_rate": 1.1931250362717333e-05, + "loss": 1.9471, + "step": 10406 + }, + { + "epoch": 0.7833794388302376, + "grad_norm": 6.58779239654541, + "learning_rate": 1.1923347996393259e-05, + "loss": 1.3891, + "step": 10407 + }, + { + "epoch": 0.7834547131108979, + "grad_norm": 6.769383907318115, + "learning_rate": 1.1915447893616883e-05, + "loss": 1.5826, + "step": 10408 + }, + { + "epoch": 0.783529987391558, + "grad_norm": 7.992859363555908, + "learning_rate": 1.1907550054857863e-05, + "loss": 1.9358, + "step": 10409 + }, + { + "epoch": 0.7836052616722181, + "grad_norm": 5.812442779541016, + "learning_rate": 1.189965448058568e-05, + "loss": 1.4119, + "step": 10410 + }, + { + "epoch": 0.7836805359528783, + "grad_norm": 4.696333408355713, + "learning_rate": 1.1891761171269717e-05, + "loss": 1.6086, + "step": 10411 + }, + { + "epoch": 0.7837558102335385, + "grad_norm": 7.6028337478637695, + "learning_rate": 1.1883870127379204e-05, + "loss": 1.7211, + "step": 10412 + }, + { + "epoch": 0.7838310845141986, + "grad_norm": 6.111557960510254, + "learning_rate": 1.1875981349383248e-05, + "loss": 1.3461, + "step": 10413 + }, + { + "epoch": 0.7839063587948588, + "grad_norm": 4.8355631828308105, + "learning_rate": 1.186809483775082e-05, + "loss": 1.5982, + "step": 10414 + }, + { + "epoch": 0.7839816330755189, + "grad_norm": 4.156749248504639, + "learning_rate": 1.1860210592950732e-05, + "loss": 1.5075, + "step": 10415 + }, + { + "epoch": 0.7840569073561791, + "grad_norm": 5.9238057136535645, + "learning_rate": 1.1852328615451702e-05, + "loss": 1.8323, + "step": 10416 + }, + { + "epoch": 0.7841321816368393, + "grad_norm": 6.757015228271484, + "learning_rate": 1.1844448905722267e-05, + "loss": 1.3312, + "step": 10417 + }, + { + "epoch": 0.7842074559174994, + "grad_norm": 6.875500202178955, + "learning_rate": 1.1836571464230873e-05, + "loss": 1.9363, + "step": 10418 + }, + { + "epoch": 0.7842827301981595, + "grad_norm": 7.752608776092529, + "learning_rate": 1.182869629144579e-05, + "loss": 1.8928, + "step": 10419 + }, + { + "epoch": 0.7843580044788196, + "grad_norm": 6.00040864944458, + "learning_rate": 1.18208233878352e-05, + "loss": 1.8727, + "step": 10420 + }, + { + "epoch": 0.7844332787594799, + "grad_norm": 6.621996879577637, + "learning_rate": 1.1812952753867096e-05, + "loss": 1.7605, + "step": 10421 + }, + { + "epoch": 0.78450855304014, + "grad_norm": 4.64190673828125, + "learning_rate": 1.180508439000938e-05, + "loss": 2.1531, + "step": 10422 + }, + { + "epoch": 0.7845838273208001, + "grad_norm": 4.6876044273376465, + "learning_rate": 1.1797218296729823e-05, + "loss": 1.5145, + "step": 10423 + }, + { + "epoch": 0.7846591016014604, + "grad_norm": 4.145970821380615, + "learning_rate": 1.1789354474496e-05, + "loss": 1.5211, + "step": 10424 + }, + { + "epoch": 0.7847343758821205, + "grad_norm": 5.84623908996582, + "learning_rate": 1.1781492923775422e-05, + "loss": 1.7697, + "step": 10425 + }, + { + "epoch": 0.7848096501627806, + "grad_norm": 5.659975051879883, + "learning_rate": 1.1773633645035443e-05, + "loss": 2.0242, + "step": 10426 + }, + { + "epoch": 0.7848849244434408, + "grad_norm": 5.609220027923584, + "learning_rate": 1.1765776638743242e-05, + "loss": 1.6666, + "step": 10427 + }, + { + "epoch": 0.7849601987241009, + "grad_norm": 5.019374847412109, + "learning_rate": 1.1757921905365927e-05, + "loss": 1.6319, + "step": 10428 + }, + { + "epoch": 0.7850354730047611, + "grad_norm": 7.090348720550537, + "learning_rate": 1.175006944537041e-05, + "loss": 1.8514, + "step": 10429 + }, + { + "epoch": 0.7851107472854213, + "grad_norm": 4.777589797973633, + "learning_rate": 1.174221925922352e-05, + "loss": 1.7572, + "step": 10430 + }, + { + "epoch": 0.7851860215660814, + "grad_norm": 4.995297431945801, + "learning_rate": 1.1734371347391932e-05, + "loss": 1.6413, + "step": 10431 + }, + { + "epoch": 0.7852612958467415, + "grad_norm": 4.389878273010254, + "learning_rate": 1.1726525710342157e-05, + "loss": 1.6195, + "step": 10432 + }, + { + "epoch": 0.7853365701274018, + "grad_norm": 5.9035797119140625, + "learning_rate": 1.1718682348540621e-05, + "loss": 2.0086, + "step": 10433 + }, + { + "epoch": 0.7854118444080619, + "grad_norm": 4.817549705505371, + "learning_rate": 1.1710841262453564e-05, + "loss": 1.9537, + "step": 10434 + }, + { + "epoch": 0.785487118688722, + "grad_norm": 5.770604133605957, + "learning_rate": 1.1703002452547135e-05, + "loss": 1.5293, + "step": 10435 + }, + { + "epoch": 0.7855623929693822, + "grad_norm": 5.3528666496276855, + "learning_rate": 1.1695165919287321e-05, + "loss": 1.9388, + "step": 10436 + }, + { + "epoch": 0.7856376672500424, + "grad_norm": 7.174671173095703, + "learning_rate": 1.1687331663139994e-05, + "loss": 1.9033, + "step": 10437 + }, + { + "epoch": 0.7857129415307025, + "grad_norm": 4.868191719055176, + "learning_rate": 1.167949968457086e-05, + "loss": 1.6811, + "step": 10438 + }, + { + "epoch": 0.7857882158113626, + "grad_norm": 6.7687153816223145, + "learning_rate": 1.167166998404553e-05, + "loss": 1.6018, + "step": 10439 + }, + { + "epoch": 0.7858634900920228, + "grad_norm": 5.85630989074707, + "learning_rate": 1.1663842562029425e-05, + "loss": 2.1513, + "step": 10440 + }, + { + "epoch": 0.785938764372683, + "grad_norm": 5.93109655380249, + "learning_rate": 1.1656017418987896e-05, + "loss": 1.4615, + "step": 10441 + }, + { + "epoch": 0.7860140386533431, + "grad_norm": 4.774758815765381, + "learning_rate": 1.16481945553861e-05, + "loss": 1.7843, + "step": 10442 + }, + { + "epoch": 0.7860893129340033, + "grad_norm": 4.823338508605957, + "learning_rate": 1.1640373971689106e-05, + "loss": 1.6991, + "step": 10443 + }, + { + "epoch": 0.7861645872146634, + "grad_norm": 5.284474849700928, + "learning_rate": 1.1632555668361799e-05, + "loss": 1.6173, + "step": 10444 + }, + { + "epoch": 0.7862398614953235, + "grad_norm": 6.030603885650635, + "learning_rate": 1.1624739645868966e-05, + "loss": 1.9631, + "step": 10445 + }, + { + "epoch": 0.7863151357759838, + "grad_norm": 6.263217926025391, + "learning_rate": 1.1616925904675253e-05, + "loss": 1.7531, + "step": 10446 + }, + { + "epoch": 0.7863904100566439, + "grad_norm": 4.388792037963867, + "learning_rate": 1.1609114445245161e-05, + "loss": 1.6163, + "step": 10447 + }, + { + "epoch": 0.786465684337304, + "grad_norm": 6.712930202484131, + "learning_rate": 1.1601305268043079e-05, + "loss": 1.5744, + "step": 10448 + }, + { + "epoch": 0.7865409586179642, + "grad_norm": 6.377499103546143, + "learning_rate": 1.1593498373533196e-05, + "loss": 2.0643, + "step": 10449 + }, + { + "epoch": 0.7866162328986244, + "grad_norm": 4.666498184204102, + "learning_rate": 1.1585693762179656e-05, + "loss": 2.0171, + "step": 10450 + }, + { + "epoch": 0.7866915071792845, + "grad_norm": 4.165910243988037, + "learning_rate": 1.157789143444638e-05, + "loss": 1.732, + "step": 10451 + }, + { + "epoch": 0.7867667814599447, + "grad_norm": 4.550856113433838, + "learning_rate": 1.1570091390797232e-05, + "loss": 1.6186, + "step": 10452 + }, + { + "epoch": 0.7868420557406048, + "grad_norm": 5.810037136077881, + "learning_rate": 1.1562293631695864e-05, + "loss": 1.6703, + "step": 10453 + }, + { + "epoch": 0.786917330021265, + "grad_norm": 7.292099952697754, + "learning_rate": 1.1554498157605864e-05, + "loss": 1.8828, + "step": 10454 + }, + { + "epoch": 0.7869926043019252, + "grad_norm": 4.417516708374023, + "learning_rate": 1.1546704968990623e-05, + "loss": 1.6324, + "step": 10455 + }, + { + "epoch": 0.7870678785825853, + "grad_norm": 6.819991111755371, + "learning_rate": 1.153891406631344e-05, + "loss": 1.8652, + "step": 10456 + }, + { + "epoch": 0.7871431528632454, + "grad_norm": 5.585198402404785, + "learning_rate": 1.1531125450037455e-05, + "loss": 1.9052, + "step": 10457 + }, + { + "epoch": 0.7872184271439056, + "grad_norm": 5.671231269836426, + "learning_rate": 1.1523339120625698e-05, + "loss": 1.7976, + "step": 10458 + }, + { + "epoch": 0.7872937014245658, + "grad_norm": 4.499709129333496, + "learning_rate": 1.1515555078541012e-05, + "loss": 1.6856, + "step": 10459 + }, + { + "epoch": 0.7873689757052259, + "grad_norm": 4.429848670959473, + "learning_rate": 1.1507773324246168e-05, + "loss": 1.7027, + "step": 10460 + }, + { + "epoch": 0.787444249985886, + "grad_norm": 5.3119096755981445, + "learning_rate": 1.1499993858203739e-05, + "loss": 1.5158, + "step": 10461 + }, + { + "epoch": 0.7875195242665463, + "grad_norm": 4.101273536682129, + "learning_rate": 1.1492216680876212e-05, + "loss": 1.3369, + "step": 10462 + }, + { + "epoch": 0.7875947985472064, + "grad_norm": 4.9043755531311035, + "learning_rate": 1.148444179272592e-05, + "loss": 1.7397, + "step": 10463 + }, + { + "epoch": 0.7876700728278665, + "grad_norm": 7.090843200683594, + "learning_rate": 1.1476669194215034e-05, + "loss": 1.7017, + "step": 10464 + }, + { + "epoch": 0.7877453471085267, + "grad_norm": 4.314177989959717, + "learning_rate": 1.1468898885805646e-05, + "loss": 1.534, + "step": 10465 + }, + { + "epoch": 0.7878206213891868, + "grad_norm": 4.344799041748047, + "learning_rate": 1.1461130867959647e-05, + "loss": 1.8008, + "step": 10466 + }, + { + "epoch": 0.787895895669847, + "grad_norm": 6.150382995605469, + "learning_rate": 1.1453365141138834e-05, + "loss": 1.7104, + "step": 10467 + }, + { + "epoch": 0.7879711699505072, + "grad_norm": 4.730415344238281, + "learning_rate": 1.1445601705804864e-05, + "loss": 2.0103, + "step": 10468 + }, + { + "epoch": 0.7880464442311673, + "grad_norm": 4.7263689041137695, + "learning_rate": 1.143784056241926e-05, + "loss": 1.818, + "step": 10469 + }, + { + "epoch": 0.7881217185118274, + "grad_norm": 5.751407623291016, + "learning_rate": 1.1430081711443375e-05, + "loss": 1.5443, + "step": 10470 + }, + { + "epoch": 0.7881969927924877, + "grad_norm": 7.114349842071533, + "learning_rate": 1.1422325153338475e-05, + "loss": 1.9503, + "step": 10471 + }, + { + "epoch": 0.7882722670731478, + "grad_norm": 5.2256903648376465, + "learning_rate": 1.141457088856564e-05, + "loss": 1.6957, + "step": 10472 + }, + { + "epoch": 0.7883475413538079, + "grad_norm": 5.743153095245361, + "learning_rate": 1.1406818917585865e-05, + "loss": 1.7534, + "step": 10473 + }, + { + "epoch": 0.7884228156344681, + "grad_norm": 7.917233943939209, + "learning_rate": 1.1399069240859955e-05, + "loss": 1.8375, + "step": 10474 + }, + { + "epoch": 0.7884980899151283, + "grad_norm": 5.567706108093262, + "learning_rate": 1.1391321858848636e-05, + "loss": 1.3812, + "step": 10475 + }, + { + "epoch": 0.7885733641957884, + "grad_norm": 4.238580226898193, + "learning_rate": 1.1383576772012438e-05, + "loss": 1.8107, + "step": 10476 + }, + { + "epoch": 0.7886486384764485, + "grad_norm": 5.030584812164307, + "learning_rate": 1.1375833980811795e-05, + "loss": 1.4146, + "step": 10477 + }, + { + "epoch": 0.7887239127571087, + "grad_norm": 4.106687545776367, + "learning_rate": 1.1368093485707015e-05, + "loss": 1.5048, + "step": 10478 + }, + { + "epoch": 0.7887991870377689, + "grad_norm": 5.378469944000244, + "learning_rate": 1.1360355287158214e-05, + "loss": 1.8093, + "step": 10479 + }, + { + "epoch": 0.788874461318429, + "grad_norm": 6.50471305847168, + "learning_rate": 1.1352619385625418e-05, + "loss": 1.6644, + "step": 10480 + }, + { + "epoch": 0.7889497355990892, + "grad_norm": 5.928462028503418, + "learning_rate": 1.134488578156851e-05, + "loss": 2.2287, + "step": 10481 + }, + { + "epoch": 0.7890250098797493, + "grad_norm": 4.056450366973877, + "learning_rate": 1.1337154475447243e-05, + "loss": 1.8395, + "step": 10482 + }, + { + "epoch": 0.7891002841604094, + "grad_norm": 7.611124038696289, + "learning_rate": 1.1329425467721194e-05, + "loss": 1.8706, + "step": 10483 + }, + { + "epoch": 0.7891755584410697, + "grad_norm": 4.591465473175049, + "learning_rate": 1.1321698758849853e-05, + "loss": 1.4513, + "step": 10484 + }, + { + "epoch": 0.7892508327217298, + "grad_norm": 5.608588695526123, + "learning_rate": 1.1313974349292528e-05, + "loss": 1.7426, + "step": 10485 + }, + { + "epoch": 0.7893261070023899, + "grad_norm": 5.723721981048584, + "learning_rate": 1.1306252239508441e-05, + "loss": 1.8174, + "step": 10486 + }, + { + "epoch": 0.7894013812830502, + "grad_norm": 6.48117733001709, + "learning_rate": 1.1298532429956615e-05, + "loss": 1.6998, + "step": 10487 + }, + { + "epoch": 0.7894766555637103, + "grad_norm": 5.236823558807373, + "learning_rate": 1.1290814921096011e-05, + "loss": 1.6647, + "step": 10488 + }, + { + "epoch": 0.7895519298443704, + "grad_norm": 6.0537638664245605, + "learning_rate": 1.128309971338537e-05, + "loss": 1.6636, + "step": 10489 + }, + { + "epoch": 0.7896272041250306, + "grad_norm": 5.2724609375, + "learning_rate": 1.1275386807283362e-05, + "loss": 2.0006, + "step": 10490 + }, + { + "epoch": 0.7897024784056907, + "grad_norm": 5.128378391265869, + "learning_rate": 1.126767620324849e-05, + "loss": 2.0647, + "step": 10491 + }, + { + "epoch": 0.7897777526863509, + "grad_norm": 4.980754852294922, + "learning_rate": 1.1259967901739155e-05, + "loss": 1.6357, + "step": 10492 + }, + { + "epoch": 0.7898530269670111, + "grad_norm": 4.9206132888793945, + "learning_rate": 1.1252261903213552e-05, + "loss": 1.6635, + "step": 10493 + }, + { + "epoch": 0.7899283012476712, + "grad_norm": 4.8630523681640625, + "learning_rate": 1.1244558208129796e-05, + "loss": 1.8803, + "step": 10494 + }, + { + "epoch": 0.7900035755283313, + "grad_norm": 5.814865589141846, + "learning_rate": 1.1236856816945873e-05, + "loss": 1.6901, + "step": 10495 + }, + { + "epoch": 0.7900788498089916, + "grad_norm": 5.6628804206848145, + "learning_rate": 1.1229157730119571e-05, + "loss": 2.3944, + "step": 10496 + }, + { + "epoch": 0.7901541240896517, + "grad_norm": 6.422629356384277, + "learning_rate": 1.122146094810861e-05, + "loss": 1.6935, + "step": 10497 + }, + { + "epoch": 0.7902293983703118, + "grad_norm": 6.330454349517822, + "learning_rate": 1.1213766471370513e-05, + "loss": 1.6972, + "step": 10498 + }, + { + "epoch": 0.7903046726509719, + "grad_norm": 5.49129581451416, + "learning_rate": 1.1206074300362723e-05, + "loss": 1.8726, + "step": 10499 + }, + { + "epoch": 0.7903799469316322, + "grad_norm": 7.602344989776611, + "learning_rate": 1.1198384435542492e-05, + "loss": 1.8896, + "step": 10500 + }, + { + "epoch": 0.7904552212122923, + "grad_norm": 5.057015419006348, + "learning_rate": 1.119069687736697e-05, + "loss": 1.8293, + "step": 10501 + }, + { + "epoch": 0.7905304954929524, + "grad_norm": 6.302358627319336, + "learning_rate": 1.118301162629316e-05, + "loss": 2.0665, + "step": 10502 + }, + { + "epoch": 0.7906057697736126, + "grad_norm": 7.878444671630859, + "learning_rate": 1.1175328682777952e-05, + "loss": 1.6431, + "step": 10503 + }, + { + "epoch": 0.7906810440542728, + "grad_norm": 6.101637363433838, + "learning_rate": 1.1167648047278034e-05, + "loss": 1.5395, + "step": 10504 + }, + { + "epoch": 0.7907563183349329, + "grad_norm": 5.802186965942383, + "learning_rate": 1.115996972025003e-05, + "loss": 1.9857, + "step": 10505 + }, + { + "epoch": 0.7908315926155931, + "grad_norm": 6.742974281311035, + "learning_rate": 1.1152293702150368e-05, + "loss": 1.5822, + "step": 10506 + }, + { + "epoch": 0.7909068668962532, + "grad_norm": 5.875470161437988, + "learning_rate": 1.1144619993435396e-05, + "loss": 1.9987, + "step": 10507 + }, + { + "epoch": 0.7909821411769133, + "grad_norm": 3.6113080978393555, + "learning_rate": 1.1136948594561258e-05, + "loss": 1.7487, + "step": 10508 + }, + { + "epoch": 0.7910574154575736, + "grad_norm": 4.340760707855225, + "learning_rate": 1.1129279505984036e-05, + "loss": 1.9162, + "step": 10509 + }, + { + "epoch": 0.7911326897382337, + "grad_norm": 4.842530250549316, + "learning_rate": 1.1121612728159598e-05, + "loss": 1.6663, + "step": 10510 + }, + { + "epoch": 0.7912079640188938, + "grad_norm": 5.240428447723389, + "learning_rate": 1.1113948261543727e-05, + "loss": 1.6863, + "step": 10511 + }, + { + "epoch": 0.791283238299554, + "grad_norm": 5.86461877822876, + "learning_rate": 1.1106286106592057e-05, + "loss": 1.7796, + "step": 10512 + }, + { + "epoch": 0.7913585125802142, + "grad_norm": 5.585118770599365, + "learning_rate": 1.1098626263760077e-05, + "loss": 1.7166, + "step": 10513 + }, + { + "epoch": 0.7914337868608743, + "grad_norm": 5.257789134979248, + "learning_rate": 1.109096873350316e-05, + "loss": 1.5808, + "step": 10514 + }, + { + "epoch": 0.7915090611415345, + "grad_norm": 4.84629487991333, + "learning_rate": 1.1083313516276495e-05, + "loss": 1.9518, + "step": 10515 + }, + { + "epoch": 0.7915843354221946, + "grad_norm": 4.674193859100342, + "learning_rate": 1.1075660612535193e-05, + "loss": 1.6503, + "step": 10516 + }, + { + "epoch": 0.7916596097028548, + "grad_norm": 6.107690811157227, + "learning_rate": 1.1068010022734165e-05, + "loss": 1.6162, + "step": 10517 + }, + { + "epoch": 0.7917348839835149, + "grad_norm": 4.9589715003967285, + "learning_rate": 1.1060361747328247e-05, + "loss": 1.9681, + "step": 10518 + }, + { + "epoch": 0.7918101582641751, + "grad_norm": 7.172971725463867, + "learning_rate": 1.1052715786772077e-05, + "loss": 1.5179, + "step": 10519 + }, + { + "epoch": 0.7918854325448352, + "grad_norm": 5.10906457901001, + "learning_rate": 1.1045072141520219e-05, + "loss": 1.7195, + "step": 10520 + }, + { + "epoch": 0.7919607068254954, + "grad_norm": 6.275109767913818, + "learning_rate": 1.1037430812027033e-05, + "loss": 1.7023, + "step": 10521 + }, + { + "epoch": 0.7920359811061556, + "grad_norm": 6.711030960083008, + "learning_rate": 1.1029791798746792e-05, + "loss": 1.7476, + "step": 10522 + }, + { + "epoch": 0.7921112553868157, + "grad_norm": 7.630378723144531, + "learning_rate": 1.1022155102133603e-05, + "loss": 1.7564, + "step": 10523 + }, + { + "epoch": 0.7921865296674758, + "grad_norm": 5.615964412689209, + "learning_rate": 1.1014520722641476e-05, + "loss": 1.9682, + "step": 10524 + }, + { + "epoch": 0.7922618039481361, + "grad_norm": 3.756969928741455, + "learning_rate": 1.1006888660724207e-05, + "loss": 1.4586, + "step": 10525 + }, + { + "epoch": 0.7923370782287962, + "grad_norm": 4.800335884094238, + "learning_rate": 1.0999258916835531e-05, + "loss": 1.8983, + "step": 10526 + }, + { + "epoch": 0.7924123525094563, + "grad_norm": 5.853816509246826, + "learning_rate": 1.0991631491429017e-05, + "loss": 1.6133, + "step": 10527 + }, + { + "epoch": 0.7924876267901165, + "grad_norm": 5.256314277648926, + "learning_rate": 1.0984006384958068e-05, + "loss": 1.8579, + "step": 10528 + }, + { + "epoch": 0.7925629010707766, + "grad_norm": 5.243255615234375, + "learning_rate": 1.0976383597876006e-05, + "loss": 1.9268, + "step": 10529 + }, + { + "epoch": 0.7926381753514368, + "grad_norm": 6.024759292602539, + "learning_rate": 1.0968763130635951e-05, + "loss": 1.4863, + "step": 10530 + }, + { + "epoch": 0.792713449632097, + "grad_norm": 5.020583152770996, + "learning_rate": 1.0961144983690952e-05, + "loss": 2.0377, + "step": 10531 + }, + { + "epoch": 0.7927887239127571, + "grad_norm": 5.0521979331970215, + "learning_rate": 1.0953529157493852e-05, + "loss": 2.0494, + "step": 10532 + }, + { + "epoch": 0.7928639981934172, + "grad_norm": 4.836261749267578, + "learning_rate": 1.0945915652497424e-05, + "loss": 1.677, + "step": 10533 + }, + { + "epoch": 0.7929392724740775, + "grad_norm": 8.282684326171875, + "learning_rate": 1.0938304469154237e-05, + "loss": 1.6697, + "step": 10534 + }, + { + "epoch": 0.7930145467547376, + "grad_norm": 4.360495567321777, + "learning_rate": 1.093069560791677e-05, + "loss": 1.8186, + "step": 10535 + }, + { + "epoch": 0.7930898210353977, + "grad_norm": 6.021212100982666, + "learning_rate": 1.0923089069237341e-05, + "loss": 1.5067, + "step": 10536 + }, + { + "epoch": 0.7931650953160578, + "grad_norm": 5.404700756072998, + "learning_rate": 1.0915484853568164e-05, + "loss": 1.709, + "step": 10537 + }, + { + "epoch": 0.7932403695967181, + "grad_norm": 4.677717685699463, + "learning_rate": 1.0907882961361248e-05, + "loss": 1.6103, + "step": 10538 + }, + { + "epoch": 0.7933156438773782, + "grad_norm": 7.666770935058594, + "learning_rate": 1.090028339306854e-05, + "loss": 1.904, + "step": 10539 + }, + { + "epoch": 0.7933909181580383, + "grad_norm": 7.285512924194336, + "learning_rate": 1.0892686149141773e-05, + "loss": 1.849, + "step": 10540 + }, + { + "epoch": 0.7934661924386985, + "grad_norm": 6.228544235229492, + "learning_rate": 1.0885091230032624e-05, + "loss": 1.7822, + "step": 10541 + }, + { + "epoch": 0.7935414667193587, + "grad_norm": 6.802058696746826, + "learning_rate": 1.0877498636192552e-05, + "loss": 2.0611, + "step": 10542 + }, + { + "epoch": 0.7936167410000188, + "grad_norm": 5.491922378540039, + "learning_rate": 1.0869908368072928e-05, + "loss": 1.789, + "step": 10543 + }, + { + "epoch": 0.793692015280679, + "grad_norm": 5.2152276039123535, + "learning_rate": 1.0862320426124989e-05, + "loss": 2.018, + "step": 10544 + }, + { + "epoch": 0.7937672895613391, + "grad_norm": 3.8597824573516846, + "learning_rate": 1.0854734810799794e-05, + "loss": 1.8055, + "step": 10545 + }, + { + "epoch": 0.7938425638419992, + "grad_norm": 7.639402389526367, + "learning_rate": 1.0847151522548287e-05, + "loss": 1.7085, + "step": 10546 + }, + { + "epoch": 0.7939178381226595, + "grad_norm": 5.059473037719727, + "learning_rate": 1.0839570561821282e-05, + "loss": 1.8607, + "step": 10547 + }, + { + "epoch": 0.7939931124033196, + "grad_norm": 5.818789958953857, + "learning_rate": 1.0831991929069463e-05, + "loss": 1.9702, + "step": 10548 + }, + { + "epoch": 0.7940683866839797, + "grad_norm": 4.596042156219482, + "learning_rate": 1.0824415624743316e-05, + "loss": 1.5096, + "step": 10549 + }, + { + "epoch": 0.79414366096464, + "grad_norm": 6.282876968383789, + "learning_rate": 1.0816841649293275e-05, + "loss": 2.1012, + "step": 10550 + }, + { + "epoch": 0.7942189352453001, + "grad_norm": 6.446961879730225, + "learning_rate": 1.0809270003169548e-05, + "loss": 1.4965, + "step": 10551 + }, + { + "epoch": 0.7942942095259602, + "grad_norm": 5.011605739593506, + "learning_rate": 1.0801700686822286e-05, + "loss": 1.8478, + "step": 10552 + }, + { + "epoch": 0.7943694838066204, + "grad_norm": 6.0295209884643555, + "learning_rate": 1.0794133700701432e-05, + "loss": 1.7817, + "step": 10553 + }, + { + "epoch": 0.7944447580872805, + "grad_norm": 6.072133541107178, + "learning_rate": 1.0786569045256845e-05, + "loss": 1.8002, + "step": 10554 + }, + { + "epoch": 0.7945200323679407, + "grad_norm": 4.951912879943848, + "learning_rate": 1.0779006720938201e-05, + "loss": 1.6483, + "step": 10555 + }, + { + "epoch": 0.7945953066486008, + "grad_norm": 4.589295387268066, + "learning_rate": 1.0771446728195073e-05, + "loss": 1.9174, + "step": 10556 + }, + { + "epoch": 0.794670580929261, + "grad_norm": 4.080289840698242, + "learning_rate": 1.0763889067476873e-05, + "loss": 1.727, + "step": 10557 + }, + { + "epoch": 0.7947458552099211, + "grad_norm": 5.067260265350342, + "learning_rate": 1.0756333739232888e-05, + "loss": 1.6684, + "step": 10558 + }, + { + "epoch": 0.7948211294905813, + "grad_norm": 5.672039031982422, + "learning_rate": 1.0748780743912273e-05, + "loss": 1.9667, + "step": 10559 + }, + { + "epoch": 0.7948964037712415, + "grad_norm": 5.210699558258057, + "learning_rate": 1.0741230081964004e-05, + "loss": 1.6591, + "step": 10560 + }, + { + "epoch": 0.7949716780519016, + "grad_norm": 5.158468246459961, + "learning_rate": 1.0733681753836977e-05, + "loss": 1.7553, + "step": 10561 + }, + { + "epoch": 0.7950469523325617, + "grad_norm": 5.988271236419678, + "learning_rate": 1.0726135759979888e-05, + "loss": 1.6791, + "step": 10562 + }, + { + "epoch": 0.795122226613222, + "grad_norm": 4.158828258514404, + "learning_rate": 1.0718592100841352e-05, + "loss": 1.9253, + "step": 10563 + }, + { + "epoch": 0.7951975008938821, + "grad_norm": 4.432088851928711, + "learning_rate": 1.0711050776869791e-05, + "loss": 1.57, + "step": 10564 + }, + { + "epoch": 0.7952727751745422, + "grad_norm": 4.330204010009766, + "learning_rate": 1.0703511788513543e-05, + "loss": 1.5973, + "step": 10565 + }, + { + "epoch": 0.7953480494552024, + "grad_norm": 7.869935035705566, + "learning_rate": 1.0695975136220749e-05, + "loss": 1.9457, + "step": 10566 + }, + { + "epoch": 0.7954233237358626, + "grad_norm": 4.189110279083252, + "learning_rate": 1.0688440820439455e-05, + "loss": 1.6745, + "step": 10567 + }, + { + "epoch": 0.7954985980165227, + "grad_norm": 4.747684955596924, + "learning_rate": 1.0680908841617559e-05, + "loss": 1.8506, + "step": 10568 + }, + { + "epoch": 0.7955738722971829, + "grad_norm": 7.34932804107666, + "learning_rate": 1.0673379200202831e-05, + "loss": 1.477, + "step": 10569 + }, + { + "epoch": 0.795649146577843, + "grad_norm": 5.344128131866455, + "learning_rate": 1.0665851896642854e-05, + "loss": 2.0512, + "step": 10570 + }, + { + "epoch": 0.7957244208585031, + "grad_norm": 4.930959224700928, + "learning_rate": 1.0658326931385127e-05, + "loss": 1.7322, + "step": 10571 + }, + { + "epoch": 0.7957996951391634, + "grad_norm": 6.277360916137695, + "learning_rate": 1.065080430487697e-05, + "loss": 2.0128, + "step": 10572 + }, + { + "epoch": 0.7958749694198235, + "grad_norm": 5.415355205535889, + "learning_rate": 1.0643284017565608e-05, + "loss": 1.8397, + "step": 10573 + }, + { + "epoch": 0.7959502437004836, + "grad_norm": 4.9344563484191895, + "learning_rate": 1.0635766069898068e-05, + "loss": 1.4473, + "step": 10574 + }, + { + "epoch": 0.7960255179811437, + "grad_norm": 5.091531753540039, + "learning_rate": 1.0628250462321287e-05, + "loss": 1.8385, + "step": 10575 + }, + { + "epoch": 0.796100792261804, + "grad_norm": 4.807434558868408, + "learning_rate": 1.062073719528206e-05, + "loss": 1.6607, + "step": 10576 + }, + { + "epoch": 0.7961760665424641, + "grad_norm": 5.729763984680176, + "learning_rate": 1.0613226269227e-05, + "loss": 1.8739, + "step": 10577 + }, + { + "epoch": 0.7962513408231242, + "grad_norm": 4.4684224128723145, + "learning_rate": 1.060571768460264e-05, + "loss": 1.4431, + "step": 10578 + }, + { + "epoch": 0.7963266151037844, + "grad_norm": 7.565558433532715, + "learning_rate": 1.0598211441855311e-05, + "loss": 1.4743, + "step": 10579 + }, + { + "epoch": 0.7964018893844446, + "grad_norm": 4.719305992126465, + "learning_rate": 1.059070754143126e-05, + "loss": 1.5595, + "step": 10580 + }, + { + "epoch": 0.7964771636651047, + "grad_norm": 6.272445201873779, + "learning_rate": 1.0583205983776567e-05, + "loss": 2.0568, + "step": 10581 + }, + { + "epoch": 0.7965524379457649, + "grad_norm": 4.44451904296875, + "learning_rate": 1.057570676933719e-05, + "loss": 1.797, + "step": 10582 + }, + { + "epoch": 0.796627712226425, + "grad_norm": 4.996301174163818, + "learning_rate": 1.0568209898558911e-05, + "loss": 1.4681, + "step": 10583 + }, + { + "epoch": 0.7967029865070852, + "grad_norm": 5.9832258224487305, + "learning_rate": 1.0560715371887425e-05, + "loss": 1.8983, + "step": 10584 + }, + { + "epoch": 0.7967782607877454, + "grad_norm": 5.333082675933838, + "learning_rate": 1.0553223189768235e-05, + "loss": 1.6785, + "step": 10585 + }, + { + "epoch": 0.7968535350684055, + "grad_norm": 4.810013771057129, + "learning_rate": 1.0545733352646753e-05, + "loss": 1.5289, + "step": 10586 + }, + { + "epoch": 0.7969288093490656, + "grad_norm": 4.3170247077941895, + "learning_rate": 1.0538245860968204e-05, + "loss": 1.6403, + "step": 10587 + }, + { + "epoch": 0.7970040836297259, + "grad_norm": 5.658699989318848, + "learning_rate": 1.0530760715177724e-05, + "loss": 2.0403, + "step": 10588 + }, + { + "epoch": 0.797079357910386, + "grad_norm": 5.508332252502441, + "learning_rate": 1.0523277915720253e-05, + "loss": 1.6767, + "step": 10589 + }, + { + "epoch": 0.7971546321910461, + "grad_norm": 5.7640767097473145, + "learning_rate": 1.0515797463040639e-05, + "loss": 1.9817, + "step": 10590 + }, + { + "epoch": 0.7972299064717063, + "grad_norm": 5.14919900894165, + "learning_rate": 1.0508319357583574e-05, + "loss": 2.1085, + "step": 10591 + }, + { + "epoch": 0.7973051807523664, + "grad_norm": 5.494036674499512, + "learning_rate": 1.0500843599793609e-05, + "loss": 1.4134, + "step": 10592 + }, + { + "epoch": 0.7973804550330266, + "grad_norm": 4.990346908569336, + "learning_rate": 1.0493370190115171e-05, + "loss": 1.6858, + "step": 10593 + }, + { + "epoch": 0.7974557293136868, + "grad_norm": 6.158915996551514, + "learning_rate": 1.0485899128992499e-05, + "loss": 1.6698, + "step": 10594 + }, + { + "epoch": 0.7975310035943469, + "grad_norm": 4.069199562072754, + "learning_rate": 1.0478430416869767e-05, + "loss": 1.6355, + "step": 10595 + }, + { + "epoch": 0.797606277875007, + "grad_norm": 4.592220783233643, + "learning_rate": 1.0470964054190928e-05, + "loss": 1.582, + "step": 10596 + }, + { + "epoch": 0.7976815521556672, + "grad_norm": 7.295780181884766, + "learning_rate": 1.0463500041399866e-05, + "loss": 1.8742, + "step": 10597 + }, + { + "epoch": 0.7977568264363274, + "grad_norm": 6.256105422973633, + "learning_rate": 1.0456038378940276e-05, + "loss": 1.9315, + "step": 10598 + }, + { + "epoch": 0.7978321007169875, + "grad_norm": 6.621603488922119, + "learning_rate": 1.0448579067255749e-05, + "loss": 1.7559, + "step": 10599 + }, + { + "epoch": 0.7979073749976476, + "grad_norm": 5.420297145843506, + "learning_rate": 1.0441122106789697e-05, + "loss": 1.9411, + "step": 10600 + }, + { + "epoch": 0.7979826492783079, + "grad_norm": 4.679011821746826, + "learning_rate": 1.0433667497985433e-05, + "loss": 1.4724, + "step": 10601 + }, + { + "epoch": 0.798057923558968, + "grad_norm": 4.576089859008789, + "learning_rate": 1.0426215241286103e-05, + "loss": 1.5912, + "step": 10602 + }, + { + "epoch": 0.7981331978396281, + "grad_norm": 6.404575824737549, + "learning_rate": 1.041876533713474e-05, + "loss": 1.8148, + "step": 10603 + }, + { + "epoch": 0.7982084721202883, + "grad_norm": 5.479586601257324, + "learning_rate": 1.0411317785974196e-05, + "loss": 1.6784, + "step": 10604 + }, + { + "epoch": 0.7982837464009485, + "grad_norm": 4.6277055740356445, + "learning_rate": 1.0403872588247232e-05, + "loss": 1.547, + "step": 10605 + }, + { + "epoch": 0.7983590206816086, + "grad_norm": 6.0654826164245605, + "learning_rate": 1.0396429744396414e-05, + "loss": 1.4457, + "step": 10606 + }, + { + "epoch": 0.7984342949622688, + "grad_norm": 6.441137313842773, + "learning_rate": 1.0388989254864206e-05, + "loss": 2.0086, + "step": 10607 + }, + { + "epoch": 0.7985095692429289, + "grad_norm": 3.926325798034668, + "learning_rate": 1.0381551120092953e-05, + "loss": 1.9872, + "step": 10608 + }, + { + "epoch": 0.798584843523589, + "grad_norm": 5.805916786193848, + "learning_rate": 1.0374115340524787e-05, + "loss": 1.9349, + "step": 10609 + }, + { + "epoch": 0.7986601178042493, + "grad_norm": 5.70231294631958, + "learning_rate": 1.0366681916601784e-05, + "loss": 1.5689, + "step": 10610 + }, + { + "epoch": 0.7987353920849094, + "grad_norm": 5.908403396606445, + "learning_rate": 1.03592508487658e-05, + "loss": 1.7823, + "step": 10611 + }, + { + "epoch": 0.7988106663655695, + "grad_norm": 5.6760687828063965, + "learning_rate": 1.0351822137458617e-05, + "loss": 1.427, + "step": 10612 + }, + { + "epoch": 0.7988859406462298, + "grad_norm": 6.1604814529418945, + "learning_rate": 1.0344395783121842e-05, + "loss": 1.6533, + "step": 10613 + }, + { + "epoch": 0.7989612149268899, + "grad_norm": 5.252334117889404, + "learning_rate": 1.0336971786196964e-05, + "loss": 1.5114, + "step": 10614 + }, + { + "epoch": 0.79903648920755, + "grad_norm": 6.688009262084961, + "learning_rate": 1.03295501471253e-05, + "loss": 2.0561, + "step": 10615 + }, + { + "epoch": 0.7991117634882101, + "grad_norm": 5.16076135635376, + "learning_rate": 1.0322130866348063e-05, + "loss": 1.5292, + "step": 10616 + }, + { + "epoch": 0.7991870377688703, + "grad_norm": 5.181064128875732, + "learning_rate": 1.031471394430628e-05, + "loss": 1.9962, + "step": 10617 + }, + { + "epoch": 0.7992623120495305, + "grad_norm": 7.301278114318848, + "learning_rate": 1.0307299381440904e-05, + "loss": 1.9521, + "step": 10618 + }, + { + "epoch": 0.7993375863301906, + "grad_norm": 6.100028038024902, + "learning_rate": 1.0299887178192668e-05, + "loss": 1.6906, + "step": 10619 + }, + { + "epoch": 0.7994128606108508, + "grad_norm": 7.162051200866699, + "learning_rate": 1.0292477335002243e-05, + "loss": 1.7066, + "step": 10620 + }, + { + "epoch": 0.7994881348915109, + "grad_norm": 4.326688289642334, + "learning_rate": 1.0285069852310097e-05, + "loss": 1.6699, + "step": 10621 + }, + { + "epoch": 0.7995634091721711, + "grad_norm": 8.665959358215332, + "learning_rate": 1.0277664730556591e-05, + "loss": 1.9696, + "step": 10622 + }, + { + "epoch": 0.7996386834528313, + "grad_norm": 4.6314496994018555, + "learning_rate": 1.0270261970181938e-05, + "loss": 1.6467, + "step": 10623 + }, + { + "epoch": 0.7997139577334914, + "grad_norm": 4.487320423126221, + "learning_rate": 1.0262861571626232e-05, + "loss": 2.1179, + "step": 10624 + }, + { + "epoch": 0.7997892320141515, + "grad_norm": 4.4540910720825195, + "learning_rate": 1.0255463535329369e-05, + "loss": 2.1424, + "step": 10625 + }, + { + "epoch": 0.7998645062948118, + "grad_norm": 6.571934223175049, + "learning_rate": 1.0248067861731158e-05, + "loss": 1.5611, + "step": 10626 + }, + { + "epoch": 0.7999397805754719, + "grad_norm": 4.979409217834473, + "learning_rate": 1.0240674551271267e-05, + "loss": 2.2006, + "step": 10627 + }, + { + "epoch": 0.800015054856132, + "grad_norm": 5.225123405456543, + "learning_rate": 1.0233283604389183e-05, + "loss": 1.5194, + "step": 10628 + }, + { + "epoch": 0.8000903291367922, + "grad_norm": 5.426865100860596, + "learning_rate": 1.0225895021524289e-05, + "loss": 1.3784, + "step": 10629 + }, + { + "epoch": 0.8001656034174524, + "grad_norm": 5.305206298828125, + "learning_rate": 1.0218508803115806e-05, + "loss": 1.9126, + "step": 10630 + }, + { + "epoch": 0.8002408776981125, + "grad_norm": 5.936051845550537, + "learning_rate": 1.021112494960284e-05, + "loss": 1.6457, + "step": 10631 + }, + { + "epoch": 0.8003161519787727, + "grad_norm": 5.725304126739502, + "learning_rate": 1.0203743461424315e-05, + "loss": 1.6587, + "step": 10632 + }, + { + "epoch": 0.8003914262594328, + "grad_norm": 5.414479732513428, + "learning_rate": 1.0196364339019065e-05, + "loss": 1.6272, + "step": 10633 + }, + { + "epoch": 0.800466700540093, + "grad_norm": 5.1898884773254395, + "learning_rate": 1.0188987582825732e-05, + "loss": 1.7255, + "step": 10634 + }, + { + "epoch": 0.8005419748207531, + "grad_norm": 5.3717217445373535, + "learning_rate": 1.0181613193282857e-05, + "loss": 1.7785, + "step": 10635 + }, + { + "epoch": 0.8006172491014133, + "grad_norm": 4.629612922668457, + "learning_rate": 1.0174241170828824e-05, + "loss": 1.4838, + "step": 10636 + }, + { + "epoch": 0.8006925233820734, + "grad_norm": 5.608335971832275, + "learning_rate": 1.01668715159019e-05, + "loss": 1.5831, + "step": 10637 + }, + { + "epoch": 0.8007677976627335, + "grad_norm": 5.094991207122803, + "learning_rate": 1.0159504228940154e-05, + "loss": 1.5637, + "step": 10638 + }, + { + "epoch": 0.8008430719433938, + "grad_norm": 4.878345966339111, + "learning_rate": 1.0152139310381565e-05, + "loss": 1.8037, + "step": 10639 + }, + { + "epoch": 0.8009183462240539, + "grad_norm": 6.904033660888672, + "learning_rate": 1.0144776760663972e-05, + "loss": 1.8108, + "step": 10640 + }, + { + "epoch": 0.800993620504714, + "grad_norm": 7.314725399017334, + "learning_rate": 1.0137416580225029e-05, + "loss": 1.7323, + "step": 10641 + }, + { + "epoch": 0.8010688947853742, + "grad_norm": 3.854630708694458, + "learning_rate": 1.0130058769502304e-05, + "loss": 1.418, + "step": 10642 + }, + { + "epoch": 0.8011441690660344, + "grad_norm": 5.137583255767822, + "learning_rate": 1.0122703328933175e-05, + "loss": 2.2821, + "step": 10643 + }, + { + "epoch": 0.8012194433466945, + "grad_norm": 5.078144073486328, + "learning_rate": 1.0115350258954926e-05, + "loss": 1.4482, + "step": 10644 + }, + { + "epoch": 0.8012947176273547, + "grad_norm": 6.147450923919678, + "learning_rate": 1.0107999560004643e-05, + "loss": 1.8671, + "step": 10645 + }, + { + "epoch": 0.8013699919080148, + "grad_norm": 5.67170524597168, + "learning_rate": 1.0100651232519325e-05, + "loss": 1.6432, + "step": 10646 + }, + { + "epoch": 0.801445266188675, + "grad_norm": 4.897963523864746, + "learning_rate": 1.009330527693581e-05, + "loss": 1.7613, + "step": 10647 + }, + { + "epoch": 0.8015205404693352, + "grad_norm": 4.8016438484191895, + "learning_rate": 1.008596169369081e-05, + "loss": 1.5796, + "step": 10648 + }, + { + "epoch": 0.8015958147499953, + "grad_norm": 4.5629706382751465, + "learning_rate": 1.0078620483220841e-05, + "loss": 1.5226, + "step": 10649 + }, + { + "epoch": 0.8016710890306554, + "grad_norm": 5.971982479095459, + "learning_rate": 1.007128164596235e-05, + "loss": 1.6648, + "step": 10650 + }, + { + "epoch": 0.8017463633113157, + "grad_norm": 5.117178916931152, + "learning_rate": 1.006394518235158e-05, + "loss": 1.495, + "step": 10651 + }, + { + "epoch": 0.8018216375919758, + "grad_norm": 6.995295524597168, + "learning_rate": 1.0056611092824702e-05, + "loss": 1.5306, + "step": 10652 + }, + { + "epoch": 0.8018969118726359, + "grad_norm": 4.581061840057373, + "learning_rate": 1.0049279377817667e-05, + "loss": 1.584, + "step": 10653 + }, + { + "epoch": 0.801972186153296, + "grad_norm": 4.181276798248291, + "learning_rate": 1.0041950037766351e-05, + "loss": 1.7099, + "step": 10654 + }, + { + "epoch": 0.8020474604339562, + "grad_norm": 3.886526346206665, + "learning_rate": 1.0034623073106441e-05, + "loss": 2.0714, + "step": 10655 + }, + { + "epoch": 0.8021227347146164, + "grad_norm": 10.13980484008789, + "learning_rate": 1.0027298484273518e-05, + "loss": 2.1488, + "step": 10656 + }, + { + "epoch": 0.8021980089952765, + "grad_norm": 5.43254280090332, + "learning_rate": 1.0019976271703007e-05, + "loss": 1.683, + "step": 10657 + }, + { + "epoch": 0.8022732832759367, + "grad_norm": 6.171635150909424, + "learning_rate": 1.0012656435830186e-05, + "loss": 1.8215, + "step": 10658 + }, + { + "epoch": 0.8023485575565968, + "grad_norm": 6.717435359954834, + "learning_rate": 1.0005338977090223e-05, + "loss": 1.7821, + "step": 10659 + }, + { + "epoch": 0.802423831837257, + "grad_norm": 4.962304592132568, + "learning_rate": 9.99802389591808e-06, + "loss": 1.653, + "step": 10660 + }, + { + "epoch": 0.8024991061179172, + "grad_norm": 6.326074123382568, + "learning_rate": 9.990711192748658e-06, + "loss": 1.7713, + "step": 10661 + }, + { + "epoch": 0.8025743803985773, + "grad_norm": 6.142279624938965, + "learning_rate": 9.98340086801664e-06, + "loss": 1.7752, + "step": 10662 + }, + { + "epoch": 0.8026496546792374, + "grad_norm": 4.611369609832764, + "learning_rate": 9.976092922156632e-06, + "loss": 1.6849, + "step": 10663 + }, + { + "epoch": 0.8027249289598977, + "grad_norm": 4.548069953918457, + "learning_rate": 9.968787355603044e-06, + "loss": 1.5126, + "step": 10664 + }, + { + "epoch": 0.8028002032405578, + "grad_norm": 6.732028007507324, + "learning_rate": 9.961484168790197e-06, + "loss": 1.6549, + "step": 10665 + }, + { + "epoch": 0.8028754775212179, + "grad_norm": 4.674823760986328, + "learning_rate": 9.954183362152225e-06, + "loss": 1.636, + "step": 10666 + }, + { + "epoch": 0.8029507518018781, + "grad_norm": 7.17686128616333, + "learning_rate": 9.946884936123147e-06, + "loss": 1.5867, + "step": 10667 + }, + { + "epoch": 0.8030260260825383, + "grad_norm": 4.899192810058594, + "learning_rate": 9.93958889113683e-06, + "loss": 1.706, + "step": 10668 + }, + { + "epoch": 0.8031013003631984, + "grad_norm": 5.161313533782959, + "learning_rate": 9.932295227627026e-06, + "loss": 1.7566, + "step": 10669 + }, + { + "epoch": 0.8031765746438586, + "grad_norm": 5.347611904144287, + "learning_rate": 9.925003946027284e-06, + "loss": 1.6103, + "step": 10670 + }, + { + "epoch": 0.8032518489245187, + "grad_norm": 6.055444240570068, + "learning_rate": 9.917715046771075e-06, + "loss": 1.7651, + "step": 10671 + }, + { + "epoch": 0.8033271232051789, + "grad_norm": 6.4514265060424805, + "learning_rate": 9.91042853029171e-06, + "loss": 1.9849, + "step": 10672 + }, + { + "epoch": 0.803402397485839, + "grad_norm": 4.989435195922852, + "learning_rate": 9.903144397022324e-06, + "loss": 1.6067, + "step": 10673 + }, + { + "epoch": 0.8034776717664992, + "grad_norm": 8.791582107543945, + "learning_rate": 9.895862647395964e-06, + "loss": 2.0, + "step": 10674 + }, + { + "epoch": 0.8035529460471593, + "grad_norm": 5.445896148681641, + "learning_rate": 9.888583281845487e-06, + "loss": 1.8492, + "step": 10675 + }, + { + "epoch": 0.8036282203278194, + "grad_norm": 6.909423828125, + "learning_rate": 9.881306300803656e-06, + "loss": 2.1479, + "step": 10676 + }, + { + "epoch": 0.8037034946084797, + "grad_norm": 6.366767406463623, + "learning_rate": 9.874031704703035e-06, + "loss": 2.6708, + "step": 10677 + }, + { + "epoch": 0.8037787688891398, + "grad_norm": 5.221425533294678, + "learning_rate": 9.866759493976113e-06, + "loss": 1.5831, + "step": 10678 + }, + { + "epoch": 0.8038540431697999, + "grad_norm": 4.941837310791016, + "learning_rate": 9.859489669055166e-06, + "loss": 1.7053, + "step": 10679 + }, + { + "epoch": 0.8039293174504601, + "grad_norm": 6.212662220001221, + "learning_rate": 9.852222230372388e-06, + "loss": 1.6218, + "step": 10680 + }, + { + "epoch": 0.8040045917311203, + "grad_norm": 7.193216323852539, + "learning_rate": 9.844957178359798e-06, + "loss": 2.1572, + "step": 10681 + }, + { + "epoch": 0.8040798660117804, + "grad_norm": 3.970851182937622, + "learning_rate": 9.837694513449297e-06, + "loss": 1.7016, + "step": 10682 + }, + { + "epoch": 0.8041551402924406, + "grad_norm": 5.798356056213379, + "learning_rate": 9.830434236072605e-06, + "loss": 1.4327, + "step": 10683 + }, + { + "epoch": 0.8042304145731007, + "grad_norm": 6.288734436035156, + "learning_rate": 9.823176346661356e-06, + "loss": 2.3856, + "step": 10684 + }, + { + "epoch": 0.8043056888537609, + "grad_norm": 3.9786925315856934, + "learning_rate": 9.815920845646976e-06, + "loss": 1.6482, + "step": 10685 + }, + { + "epoch": 0.8043809631344211, + "grad_norm": 8.27253532409668, + "learning_rate": 9.80866773346082e-06, + "loss": 1.4883, + "step": 10686 + }, + { + "epoch": 0.8044562374150812, + "grad_norm": 5.0289177894592285, + "learning_rate": 9.801417010534026e-06, + "loss": 1.578, + "step": 10687 + }, + { + "epoch": 0.8045315116957413, + "grad_norm": 4.757718086242676, + "learning_rate": 9.794168677297649e-06, + "loss": 1.8336, + "step": 10688 + }, + { + "epoch": 0.8046067859764016, + "grad_norm": 4.743627071380615, + "learning_rate": 9.786922734182597e-06, + "loss": 1.7996, + "step": 10689 + }, + { + "epoch": 0.8046820602570617, + "grad_norm": 5.902846336364746, + "learning_rate": 9.779679181619583e-06, + "loss": 2.0891, + "step": 10690 + }, + { + "epoch": 0.8047573345377218, + "grad_norm": 7.6224565505981445, + "learning_rate": 9.772438020039243e-06, + "loss": 1.7325, + "step": 10691 + }, + { + "epoch": 0.804832608818382, + "grad_norm": 4.660473346710205, + "learning_rate": 9.765199249872038e-06, + "loss": 1.6968, + "step": 10692 + }, + { + "epoch": 0.8049078830990422, + "grad_norm": 4.547463893890381, + "learning_rate": 9.757962871548305e-06, + "loss": 1.3432, + "step": 10693 + }, + { + "epoch": 0.8049831573797023, + "grad_norm": 4.737615585327148, + "learning_rate": 9.750728885498195e-06, + "loss": 1.6027, + "step": 10694 + }, + { + "epoch": 0.8050584316603624, + "grad_norm": 4.18674373626709, + "learning_rate": 9.743497292151776e-06, + "loss": 1.2952, + "step": 10695 + }, + { + "epoch": 0.8051337059410226, + "grad_norm": 4.588990211486816, + "learning_rate": 9.736268091938927e-06, + "loss": 1.6648, + "step": 10696 + }, + { + "epoch": 0.8052089802216827, + "grad_norm": 6.996903896331787, + "learning_rate": 9.72904128528942e-06, + "loss": 1.6836, + "step": 10697 + }, + { + "epoch": 0.8052842545023429, + "grad_norm": 5.54869270324707, + "learning_rate": 9.721816872632844e-06, + "loss": 1.5177, + "step": 10698 + }, + { + "epoch": 0.8053595287830031, + "grad_norm": 5.713213920593262, + "learning_rate": 9.714594854398695e-06, + "loss": 1.3814, + "step": 10699 + }, + { + "epoch": 0.8054348030636632, + "grad_norm": 6.0902228355407715, + "learning_rate": 9.707375231016286e-06, + "loss": 2.0545, + "step": 10700 + }, + { + "epoch": 0.8055100773443233, + "grad_norm": 6.508362293243408, + "learning_rate": 9.7001580029148e-06, + "loss": 1.7278, + "step": 10701 + }, + { + "epoch": 0.8055853516249836, + "grad_norm": 5.344238758087158, + "learning_rate": 9.692943170523288e-06, + "loss": 1.5212, + "step": 10702 + }, + { + "epoch": 0.8056606259056437, + "grad_norm": 6.159590244293213, + "learning_rate": 9.685730734270654e-06, + "loss": 2.2059, + "step": 10703 + }, + { + "epoch": 0.8057359001863038, + "grad_norm": 4.705336570739746, + "learning_rate": 9.678520694585664e-06, + "loss": 1.5378, + "step": 10704 + }, + { + "epoch": 0.805811174466964, + "grad_norm": 5.453982353210449, + "learning_rate": 9.671313051896908e-06, + "loss": 1.6365, + "step": 10705 + }, + { + "epoch": 0.8058864487476242, + "grad_norm": 5.272101402282715, + "learning_rate": 9.664107806632888e-06, + "loss": 1.5563, + "step": 10706 + }, + { + "epoch": 0.8059617230282843, + "grad_norm": 5.203165531158447, + "learning_rate": 9.656904959221912e-06, + "loss": 1.9751, + "step": 10707 + }, + { + "epoch": 0.8060369973089445, + "grad_norm": 5.922636985778809, + "learning_rate": 9.649704510092195e-06, + "loss": 1.7906, + "step": 10708 + }, + { + "epoch": 0.8061122715896046, + "grad_norm": 4.592273712158203, + "learning_rate": 9.642506459671746e-06, + "loss": 1.7691, + "step": 10709 + }, + { + "epoch": 0.8061875458702648, + "grad_norm": 4.961813449859619, + "learning_rate": 9.635310808388509e-06, + "loss": 1.9511, + "step": 10710 + }, + { + "epoch": 0.806262820150925, + "grad_norm": 5.237392902374268, + "learning_rate": 9.628117556670207e-06, + "loss": 1.3525, + "step": 10711 + }, + { + "epoch": 0.8063380944315851, + "grad_norm": 4.152876377105713, + "learning_rate": 9.620926704944482e-06, + "loss": 1.4131, + "step": 10712 + }, + { + "epoch": 0.8064133687122452, + "grad_norm": 6.025280952453613, + "learning_rate": 9.613738253638798e-06, + "loss": 2.1607, + "step": 10713 + }, + { + "epoch": 0.8064886429929053, + "grad_norm": 5.101561546325684, + "learning_rate": 9.606552203180513e-06, + "loss": 1.7789, + "step": 10714 + }, + { + "epoch": 0.8065639172735656, + "grad_norm": 4.042206287384033, + "learning_rate": 9.599368553996779e-06, + "loss": 1.4833, + "step": 10715 + }, + { + "epoch": 0.8066391915542257, + "grad_norm": 5.61211633682251, + "learning_rate": 9.592187306514676e-06, + "loss": 1.7118, + "step": 10716 + }, + { + "epoch": 0.8067144658348858, + "grad_norm": 4.726149082183838, + "learning_rate": 9.585008461161082e-06, + "loss": 1.7268, + "step": 10717 + }, + { + "epoch": 0.806789740115546, + "grad_norm": 6.354880332946777, + "learning_rate": 9.577832018362788e-06, + "loss": 1.6667, + "step": 10718 + }, + { + "epoch": 0.8068650143962062, + "grad_norm": 4.31047248840332, + "learning_rate": 9.570657978546383e-06, + "loss": 1.4686, + "step": 10719 + }, + { + "epoch": 0.8069402886768663, + "grad_norm": 5.139489650726318, + "learning_rate": 9.56348634213835e-06, + "loss": 1.552, + "step": 10720 + }, + { + "epoch": 0.8070155629575265, + "grad_norm": 5.7096662521362305, + "learning_rate": 9.55631710956505e-06, + "loss": 1.2927, + "step": 10721 + }, + { + "epoch": 0.8070908372381866, + "grad_norm": 5.0845627784729, + "learning_rate": 9.549150281252633e-06, + "loss": 1.5906, + "step": 10722 + }, + { + "epoch": 0.8071661115188468, + "grad_norm": 4.14543342590332, + "learning_rate": 9.541985857627166e-06, + "loss": 1.6516, + "step": 10723 + }, + { + "epoch": 0.807241385799507, + "grad_norm": 6.379011631011963, + "learning_rate": 9.534823839114553e-06, + "loss": 2.0315, + "step": 10724 + }, + { + "epoch": 0.8073166600801671, + "grad_norm": 4.089724063873291, + "learning_rate": 9.52766422614057e-06, + "loss": 1.9729, + "step": 10725 + }, + { + "epoch": 0.8073919343608272, + "grad_norm": 5.798945426940918, + "learning_rate": 9.520507019130804e-06, + "loss": 1.5667, + "step": 10726 + }, + { + "epoch": 0.8074672086414875, + "grad_norm": 4.755006790161133, + "learning_rate": 9.513352218510763e-06, + "loss": 1.5349, + "step": 10727 + }, + { + "epoch": 0.8075424829221476, + "grad_norm": 5.690619945526123, + "learning_rate": 9.506199824705753e-06, + "loss": 1.9181, + "step": 10728 + }, + { + "epoch": 0.8076177572028077, + "grad_norm": 4.4464569091796875, + "learning_rate": 9.499049838140982e-06, + "loss": 1.8954, + "step": 10729 + }, + { + "epoch": 0.8076930314834679, + "grad_norm": 4.454349517822266, + "learning_rate": 9.491902259241476e-06, + "loss": 1.8409, + "step": 10730 + }, + { + "epoch": 0.8077683057641281, + "grad_norm": 5.359729290008545, + "learning_rate": 9.484757088432161e-06, + "loss": 1.2949, + "step": 10731 + }, + { + "epoch": 0.8078435800447882, + "grad_norm": 5.450061798095703, + "learning_rate": 9.477614326137773e-06, + "loss": 1.5636, + "step": 10732 + }, + { + "epoch": 0.8079188543254483, + "grad_norm": 4.664270877838135, + "learning_rate": 9.470473972782962e-06, + "loss": 1.7716, + "step": 10733 + }, + { + "epoch": 0.8079941286061085, + "grad_norm": 5.994903087615967, + "learning_rate": 9.463336028792157e-06, + "loss": 1.4817, + "step": 10734 + }, + { + "epoch": 0.8080694028867687, + "grad_norm": 4.6994147300720215, + "learning_rate": 9.456200494589723e-06, + "loss": 1.8198, + "step": 10735 + }, + { + "epoch": 0.8081446771674288, + "grad_norm": 4.183773517608643, + "learning_rate": 9.449067370599829e-06, + "loss": 1.8369, + "step": 10736 + }, + { + "epoch": 0.808219951448089, + "grad_norm": 5.278270244598389, + "learning_rate": 9.44193665724653e-06, + "loss": 1.51, + "step": 10737 + }, + { + "epoch": 0.8082952257287491, + "grad_norm": 5.589204788208008, + "learning_rate": 9.434808354953734e-06, + "loss": 1.5561, + "step": 10738 + }, + { + "epoch": 0.8083705000094092, + "grad_norm": 6.377546310424805, + "learning_rate": 9.427682464145172e-06, + "loss": 1.5708, + "step": 10739 + }, + { + "epoch": 0.8084457742900695, + "grad_norm": 6.885293960571289, + "learning_rate": 9.420558985244488e-06, + "loss": 2.1308, + "step": 10740 + }, + { + "epoch": 0.8085210485707296, + "grad_norm": 4.487792491912842, + "learning_rate": 9.413437918675123e-06, + "loss": 1.6929, + "step": 10741 + }, + { + "epoch": 0.8085963228513897, + "grad_norm": 5.957722187042236, + "learning_rate": 9.40631926486043e-06, + "loss": 1.7808, + "step": 10742 + }, + { + "epoch": 0.80867159713205, + "grad_norm": 6.010234832763672, + "learning_rate": 9.399203024223568e-06, + "loss": 1.8448, + "step": 10743 + }, + { + "epoch": 0.8087468714127101, + "grad_norm": 5.487905502319336, + "learning_rate": 9.392089197187604e-06, + "loss": 1.6634, + "step": 10744 + }, + { + "epoch": 0.8088221456933702, + "grad_norm": 5.758172035217285, + "learning_rate": 9.384977784175403e-06, + "loss": 1.5261, + "step": 10745 + }, + { + "epoch": 0.8088974199740304, + "grad_norm": 4.373035907745361, + "learning_rate": 9.37786878560974e-06, + "loss": 1.6, + "step": 10746 + }, + { + "epoch": 0.8089726942546905, + "grad_norm": 5.505105495452881, + "learning_rate": 9.37076220191322e-06, + "loss": 1.7087, + "step": 10747 + }, + { + "epoch": 0.8090479685353507, + "grad_norm": 5.450742721557617, + "learning_rate": 9.363658033508315e-06, + "loss": 1.3421, + "step": 10748 + }, + { + "epoch": 0.8091232428160109, + "grad_norm": 4.443560600280762, + "learning_rate": 9.356556280817335e-06, + "loss": 1.5061, + "step": 10749 + }, + { + "epoch": 0.809198517096671, + "grad_norm": 5.444678783416748, + "learning_rate": 9.349456944262474e-06, + "loss": 1.7152, + "step": 10750 + }, + { + "epoch": 0.8092737913773311, + "grad_norm": 6.038509368896484, + "learning_rate": 9.34236002426575e-06, + "loss": 1.4727, + "step": 10751 + }, + { + "epoch": 0.8093490656579913, + "grad_norm": 4.136316776275635, + "learning_rate": 9.335265521249059e-06, + "loss": 1.5126, + "step": 10752 + }, + { + "epoch": 0.8094243399386515, + "grad_norm": 6.54505729675293, + "learning_rate": 9.328173435634164e-06, + "loss": 1.5576, + "step": 10753 + }, + { + "epoch": 0.8094996142193116, + "grad_norm": 7.999358177185059, + "learning_rate": 9.321083767842648e-06, + "loss": 2.1663, + "step": 10754 + }, + { + "epoch": 0.8095748884999717, + "grad_norm": 5.532820224761963, + "learning_rate": 9.313996518295993e-06, + "loss": 1.8447, + "step": 10755 + }, + { + "epoch": 0.809650162780632, + "grad_norm": 6.342843532562256, + "learning_rate": 9.306911687415488e-06, + "loss": 1.6293, + "step": 10756 + }, + { + "epoch": 0.8097254370612921, + "grad_norm": 4.488259792327881, + "learning_rate": 9.299829275622323e-06, + "loss": 1.6924, + "step": 10757 + }, + { + "epoch": 0.8098007113419522, + "grad_norm": 4.587261199951172, + "learning_rate": 9.292749283337533e-06, + "loss": 1.5904, + "step": 10758 + }, + { + "epoch": 0.8098759856226124, + "grad_norm": 5.1121368408203125, + "learning_rate": 9.285671710981997e-06, + "loss": 1.5995, + "step": 10759 + }, + { + "epoch": 0.8099512599032725, + "grad_norm": 5.67233419418335, + "learning_rate": 9.27859655897645e-06, + "loss": 1.9992, + "step": 10760 + }, + { + "epoch": 0.8100265341839327, + "grad_norm": 6.744446277618408, + "learning_rate": 9.271523827741508e-06, + "loss": 1.6726, + "step": 10761 + }, + { + "epoch": 0.8101018084645929, + "grad_norm": 4.304105758666992, + "learning_rate": 9.264453517697597e-06, + "loss": 1.9408, + "step": 10762 + }, + { + "epoch": 0.810177082745253, + "grad_norm": 4.234796047210693, + "learning_rate": 9.257385629265053e-06, + "loss": 1.4792, + "step": 10763 + }, + { + "epoch": 0.8102523570259131, + "grad_norm": 10.539286613464355, + "learning_rate": 9.250320162864012e-06, + "loss": 1.8107, + "step": 10764 + }, + { + "epoch": 0.8103276313065734, + "grad_norm": 5.138901233673096, + "learning_rate": 9.243257118914533e-06, + "loss": 1.7529, + "step": 10765 + }, + { + "epoch": 0.8104029055872335, + "grad_norm": 4.357977867126465, + "learning_rate": 9.236196497836458e-06, + "loss": 1.5923, + "step": 10766 + }, + { + "epoch": 0.8104781798678936, + "grad_norm": 5.026484966278076, + "learning_rate": 9.22913830004954e-06, + "loss": 1.5597, + "step": 10767 + }, + { + "epoch": 0.8105534541485538, + "grad_norm": 6.011439323425293, + "learning_rate": 9.222082525973364e-06, + "loss": 1.8472, + "step": 10768 + }, + { + "epoch": 0.810628728429214, + "grad_norm": 5.101469993591309, + "learning_rate": 9.215029176027373e-06, + "loss": 1.4364, + "step": 10769 + }, + { + "epoch": 0.8107040027098741, + "grad_norm": 6.551079750061035, + "learning_rate": 9.207978250630888e-06, + "loss": 1.6188, + "step": 10770 + }, + { + "epoch": 0.8107792769905343, + "grad_norm": 4.4180145263671875, + "learning_rate": 9.20092975020304e-06, + "loss": 1.2674, + "step": 10771 + }, + { + "epoch": 0.8108545512711944, + "grad_norm": 5.633283615112305, + "learning_rate": 9.193883675162863e-06, + "loss": 1.7919, + "step": 10772 + }, + { + "epoch": 0.8109298255518546, + "grad_norm": 5.37089204788208, + "learning_rate": 9.186840025929206e-06, + "loss": 1.4768, + "step": 10773 + }, + { + "epoch": 0.8110050998325147, + "grad_norm": 5.3983845710754395, + "learning_rate": 9.179798802920814e-06, + "loss": 1.7804, + "step": 10774 + }, + { + "epoch": 0.8110803741131749, + "grad_norm": 4.682229042053223, + "learning_rate": 9.172760006556242e-06, + "loss": 1.8913, + "step": 10775 + }, + { + "epoch": 0.811155648393835, + "grad_norm": 5.739368915557861, + "learning_rate": 9.165723637253953e-06, + "loss": 1.5409, + "step": 10776 + }, + { + "epoch": 0.8112309226744951, + "grad_norm": 4.172613620758057, + "learning_rate": 9.158689695432221e-06, + "loss": 1.6334, + "step": 10777 + }, + { + "epoch": 0.8113061969551554, + "grad_norm": 5.450798511505127, + "learning_rate": 9.151658181509194e-06, + "loss": 1.8833, + "step": 10778 + }, + { + "epoch": 0.8113814712358155, + "grad_norm": 4.6598358154296875, + "learning_rate": 9.144629095902896e-06, + "loss": 1.6871, + "step": 10779 + }, + { + "epoch": 0.8114567455164756, + "grad_norm": 6.442360877990723, + "learning_rate": 9.137602439031157e-06, + "loss": 2.0413, + "step": 10780 + }, + { + "epoch": 0.8115320197971359, + "grad_norm": 4.727507591247559, + "learning_rate": 9.130578211311708e-06, + "loss": 1.6409, + "step": 10781 + }, + { + "epoch": 0.811607294077796, + "grad_norm": 5.5395636558532715, + "learning_rate": 9.123556413162126e-06, + "loss": 1.7509, + "step": 10782 + }, + { + "epoch": 0.8116825683584561, + "grad_norm": 6.314138889312744, + "learning_rate": 9.116537044999817e-06, + "loss": 1.8439, + "step": 10783 + }, + { + "epoch": 0.8117578426391163, + "grad_norm": 5.2358503341674805, + "learning_rate": 9.10952010724207e-06, + "loss": 2.136, + "step": 10784 + }, + { + "epoch": 0.8118331169197764, + "grad_norm": 5.34305477142334, + "learning_rate": 9.102505600306045e-06, + "loss": 2.0532, + "step": 10785 + }, + { + "epoch": 0.8119083912004366, + "grad_norm": 5.926006317138672, + "learning_rate": 9.095493524608694e-06, + "loss": 1.5635, + "step": 10786 + }, + { + "epoch": 0.8119836654810968, + "grad_norm": 4.744370460510254, + "learning_rate": 9.0884838805669e-06, + "loss": 2.1624, + "step": 10787 + }, + { + "epoch": 0.8120589397617569, + "grad_norm": 5.013854503631592, + "learning_rate": 9.081476668597338e-06, + "loss": 1.7757, + "step": 10788 + }, + { + "epoch": 0.812134214042417, + "grad_norm": 4.619051933288574, + "learning_rate": 9.074471889116593e-06, + "loss": 1.6179, + "step": 10789 + }, + { + "epoch": 0.8122094883230773, + "grad_norm": 4.140326976776123, + "learning_rate": 9.067469542541051e-06, + "loss": 1.7695, + "step": 10790 + }, + { + "epoch": 0.8122847626037374, + "grad_norm": 3.8155040740966797, + "learning_rate": 9.060469629287e-06, + "loss": 1.4484, + "step": 10791 + }, + { + "epoch": 0.8123600368843975, + "grad_norm": 6.427689075469971, + "learning_rate": 9.053472149770559e-06, + "loss": 2.1325, + "step": 10792 + }, + { + "epoch": 0.8124353111650576, + "grad_norm": 6.895628929138184, + "learning_rate": 9.04647710440772e-06, + "loss": 1.6923, + "step": 10793 + }, + { + "epoch": 0.8125105854457179, + "grad_norm": 7.723230838775635, + "learning_rate": 9.039484493614298e-06, + "loss": 1.8225, + "step": 10794 + }, + { + "epoch": 0.812585859726378, + "grad_norm": 4.482957363128662, + "learning_rate": 9.032494317806007e-06, + "loss": 1.8363, + "step": 10795 + }, + { + "epoch": 0.8126611340070381, + "grad_norm": 4.407188892364502, + "learning_rate": 9.025506577398362e-06, + "loss": 1.657, + "step": 10796 + }, + { + "epoch": 0.8127364082876983, + "grad_norm": 8.488208770751953, + "learning_rate": 9.0185212728068e-06, + "loss": 1.9447, + "step": 10797 + }, + { + "epoch": 0.8128116825683585, + "grad_norm": 5.23155403137207, + "learning_rate": 9.011538404446546e-06, + "loss": 1.709, + "step": 10798 + }, + { + "epoch": 0.8128869568490186, + "grad_norm": 5.007590293884277, + "learning_rate": 9.00455797273274e-06, + "loss": 1.8046, + "step": 10799 + }, + { + "epoch": 0.8129622311296788, + "grad_norm": 7.946403980255127, + "learning_rate": 8.997579978080317e-06, + "loss": 1.8746, + "step": 10800 + }, + { + "epoch": 0.8130375054103389, + "grad_norm": 5.797719955444336, + "learning_rate": 8.990604420904115e-06, + "loss": 1.7205, + "step": 10801 + }, + { + "epoch": 0.813112779690999, + "grad_norm": 5.198882102966309, + "learning_rate": 8.983631301618812e-06, + "loss": 1.5435, + "step": 10802 + }, + { + "epoch": 0.8131880539716593, + "grad_norm": 6.69896125793457, + "learning_rate": 8.976660620638939e-06, + "loss": 2.0884, + "step": 10803 + }, + { + "epoch": 0.8132633282523194, + "grad_norm": 5.018105506896973, + "learning_rate": 8.969692378378897e-06, + "loss": 1.592, + "step": 10804 + }, + { + "epoch": 0.8133386025329795, + "grad_norm": 5.290376663208008, + "learning_rate": 8.9627265752529e-06, + "loss": 2.0723, + "step": 10805 + }, + { + "epoch": 0.8134138768136397, + "grad_norm": 6.265124320983887, + "learning_rate": 8.955763211675072e-06, + "loss": 1.855, + "step": 10806 + }, + { + "epoch": 0.8134891510942999, + "grad_norm": 7.196878910064697, + "learning_rate": 8.948802288059338e-06, + "loss": 1.8538, + "step": 10807 + }, + { + "epoch": 0.81356442537496, + "grad_norm": 4.254331111907959, + "learning_rate": 8.941843804819533e-06, + "loss": 1.7619, + "step": 10808 + }, + { + "epoch": 0.8136396996556202, + "grad_norm": 5.06824254989624, + "learning_rate": 8.934887762369293e-06, + "loss": 1.8977, + "step": 10809 + }, + { + "epoch": 0.8137149739362803, + "grad_norm": 5.001463890075684, + "learning_rate": 8.927934161122154e-06, + "loss": 2.4393, + "step": 10810 + }, + { + "epoch": 0.8137902482169405, + "grad_norm": 6.348660469055176, + "learning_rate": 8.920983001491473e-06, + "loss": 1.4893, + "step": 10811 + }, + { + "epoch": 0.8138655224976006, + "grad_norm": 6.043994426727295, + "learning_rate": 8.914034283890483e-06, + "loss": 2.0927, + "step": 10812 + }, + { + "epoch": 0.8139407967782608, + "grad_norm": 6.441689491271973, + "learning_rate": 8.907088008732261e-06, + "loss": 1.6517, + "step": 10813 + }, + { + "epoch": 0.8140160710589209, + "grad_norm": 4.943453311920166, + "learning_rate": 8.900144176429765e-06, + "loss": 1.7683, + "step": 10814 + }, + { + "epoch": 0.814091345339581, + "grad_norm": 5.463819980621338, + "learning_rate": 8.893202787395755e-06, + "loss": 1.646, + "step": 10815 + }, + { + "epoch": 0.8141666196202413, + "grad_norm": 3.7891833782196045, + "learning_rate": 8.886263842042892e-06, + "loss": 1.6337, + "step": 10816 + }, + { + "epoch": 0.8142418939009014, + "grad_norm": 5.5398383140563965, + "learning_rate": 8.879327340783688e-06, + "loss": 1.879, + "step": 10817 + }, + { + "epoch": 0.8143171681815615, + "grad_norm": 4.9219255447387695, + "learning_rate": 8.872393284030472e-06, + "loss": 1.8492, + "step": 10818 + }, + { + "epoch": 0.8143924424622218, + "grad_norm": 6.905791759490967, + "learning_rate": 8.865461672195479e-06, + "loss": 1.9149, + "step": 10819 + }, + { + "epoch": 0.8144677167428819, + "grad_norm": 6.123511791229248, + "learning_rate": 8.858532505690747e-06, + "loss": 1.6254, + "step": 10820 + }, + { + "epoch": 0.814542991023542, + "grad_norm": 3.921410083770752, + "learning_rate": 8.851605784928224e-06, + "loss": 1.7646, + "step": 10821 + }, + { + "epoch": 0.8146182653042022, + "grad_norm": 5.23663854598999, + "learning_rate": 8.844681510319658e-06, + "loss": 1.8701, + "step": 10822 + }, + { + "epoch": 0.8146935395848623, + "grad_norm": 6.305942535400391, + "learning_rate": 8.837759682276685e-06, + "loss": 1.6075, + "step": 10823 + }, + { + "epoch": 0.8147688138655225, + "grad_norm": 4.474752426147461, + "learning_rate": 8.830840301210796e-06, + "loss": 1.8221, + "step": 10824 + }, + { + "epoch": 0.8148440881461827, + "grad_norm": 5.417909145355225, + "learning_rate": 8.823923367533332e-06, + "loss": 1.8152, + "step": 10825 + }, + { + "epoch": 0.8149193624268428, + "grad_norm": 5.269543170928955, + "learning_rate": 8.817008881655464e-06, + "loss": 2.0432, + "step": 10826 + }, + { + "epoch": 0.8149946367075029, + "grad_norm": 7.617185592651367, + "learning_rate": 8.810096843988263e-06, + "loss": 2.0973, + "step": 10827 + }, + { + "epoch": 0.8150699109881632, + "grad_norm": 4.337515354156494, + "learning_rate": 8.803187254942607e-06, + "loss": 1.5825, + "step": 10828 + }, + { + "epoch": 0.8151451852688233, + "grad_norm": 5.0963358879089355, + "learning_rate": 8.796280114929274e-06, + "loss": 1.6098, + "step": 10829 + }, + { + "epoch": 0.8152204595494834, + "grad_norm": 5.3562211990356445, + "learning_rate": 8.789375424358847e-06, + "loss": 2.2276, + "step": 10830 + }, + { + "epoch": 0.8152957338301435, + "grad_norm": 4.748007297515869, + "learning_rate": 8.78247318364182e-06, + "loss": 1.6319, + "step": 10831 + }, + { + "epoch": 0.8153710081108038, + "grad_norm": 5.295051574707031, + "learning_rate": 8.775573393188485e-06, + "loss": 1.4072, + "step": 10832 + }, + { + "epoch": 0.8154462823914639, + "grad_norm": 4.093663215637207, + "learning_rate": 8.76867605340902e-06, + "loss": 1.5891, + "step": 10833 + }, + { + "epoch": 0.815521556672124, + "grad_norm": 5.201910018920898, + "learning_rate": 8.761781164713472e-06, + "loss": 1.7125, + "step": 10834 + }, + { + "epoch": 0.8155968309527842, + "grad_norm": 7.028207778930664, + "learning_rate": 8.754888727511696e-06, + "loss": 1.6056, + "step": 10835 + }, + { + "epoch": 0.8156721052334444, + "grad_norm": 6.011172294616699, + "learning_rate": 8.747998742213443e-06, + "loss": 1.8434, + "step": 10836 + }, + { + "epoch": 0.8157473795141045, + "grad_norm": 4.462532043457031, + "learning_rate": 8.741111209228292e-06, + "loss": 1.4937, + "step": 10837 + }, + { + "epoch": 0.8158226537947647, + "grad_norm": 4.796829700469971, + "learning_rate": 8.73422612896571e-06, + "loss": 1.4131, + "step": 10838 + }, + { + "epoch": 0.8158979280754248, + "grad_norm": 5.935859203338623, + "learning_rate": 8.727343501834972e-06, + "loss": 1.612, + "step": 10839 + }, + { + "epoch": 0.815973202356085, + "grad_norm": 5.869561672210693, + "learning_rate": 8.720463328245248e-06, + "loss": 1.5326, + "step": 10840 + }, + { + "epoch": 0.8160484766367452, + "grad_norm": 8.488271713256836, + "learning_rate": 8.713585608605524e-06, + "loss": 2.3256, + "step": 10841 + }, + { + "epoch": 0.8161237509174053, + "grad_norm": 4.954436779022217, + "learning_rate": 8.706710343324681e-06, + "loss": 1.313, + "step": 10842 + }, + { + "epoch": 0.8161990251980654, + "grad_norm": 5.284755706787109, + "learning_rate": 8.69983753281141e-06, + "loss": 1.7639, + "step": 10843 + }, + { + "epoch": 0.8162742994787257, + "grad_norm": 6.778896808624268, + "learning_rate": 8.69296717747431e-06, + "loss": 1.4852, + "step": 10844 + }, + { + "epoch": 0.8163495737593858, + "grad_norm": 4.2122578620910645, + "learning_rate": 8.686099277721777e-06, + "loss": 1.868, + "step": 10845 + }, + { + "epoch": 0.8164248480400459, + "grad_norm": 5.115288734436035, + "learning_rate": 8.679233833962097e-06, + "loss": 1.8222, + "step": 10846 + }, + { + "epoch": 0.8165001223207061, + "grad_norm": 6.201122760772705, + "learning_rate": 8.672370846603406e-06, + "loss": 1.4463, + "step": 10847 + }, + { + "epoch": 0.8165753966013662, + "grad_norm": 4.541061878204346, + "learning_rate": 8.665510316053682e-06, + "loss": 1.7211, + "step": 10848 + }, + { + "epoch": 0.8166506708820264, + "grad_norm": 5.629924297332764, + "learning_rate": 8.658652242720777e-06, + "loss": 1.7897, + "step": 10849 + }, + { + "epoch": 0.8167259451626865, + "grad_norm": 4.42575216293335, + "learning_rate": 8.651796627012371e-06, + "loss": 1.4235, + "step": 10850 + }, + { + "epoch": 0.8168012194433467, + "grad_norm": 4.409134387969971, + "learning_rate": 8.64494346933602e-06, + "loss": 1.8178, + "step": 10851 + }, + { + "epoch": 0.8168764937240068, + "grad_norm": 4.608599662780762, + "learning_rate": 8.63809277009911e-06, + "loss": 1.9263, + "step": 10852 + }, + { + "epoch": 0.816951768004667, + "grad_norm": 6.109070777893066, + "learning_rate": 8.631244529708915e-06, + "loss": 2.0224, + "step": 10853 + }, + { + "epoch": 0.8170270422853272, + "grad_norm": 5.422711372375488, + "learning_rate": 8.62439874857252e-06, + "loss": 1.4519, + "step": 10854 + }, + { + "epoch": 0.8171023165659873, + "grad_norm": 4.467102527618408, + "learning_rate": 8.617555427096913e-06, + "loss": 1.6747, + "step": 10855 + }, + { + "epoch": 0.8171775908466474, + "grad_norm": 3.8936846256256104, + "learning_rate": 8.610714565688882e-06, + "loss": 1.7327, + "step": 10856 + }, + { + "epoch": 0.8172528651273077, + "grad_norm": 5.262109279632568, + "learning_rate": 8.60387616475511e-06, + "loss": 1.6653, + "step": 10857 + }, + { + "epoch": 0.8173281394079678, + "grad_norm": 5.685330867767334, + "learning_rate": 8.597040224702124e-06, + "loss": 2.1393, + "step": 10858 + }, + { + "epoch": 0.8174034136886279, + "grad_norm": 5.750579357147217, + "learning_rate": 8.590206745936308e-06, + "loss": 1.8151, + "step": 10859 + }, + { + "epoch": 0.8174786879692881, + "grad_norm": 7.088691234588623, + "learning_rate": 8.583375728863874e-06, + "loss": 2.1563, + "step": 10860 + }, + { + "epoch": 0.8175539622499483, + "grad_norm": 5.558090686798096, + "learning_rate": 8.576547173890926e-06, + "loss": 1.6044, + "step": 10861 + }, + { + "epoch": 0.8176292365306084, + "grad_norm": 5.482133865356445, + "learning_rate": 8.569721081423376e-06, + "loss": 1.7457, + "step": 10862 + }, + { + "epoch": 0.8177045108112686, + "grad_norm": 4.279702186584473, + "learning_rate": 8.562897451867047e-06, + "loss": 1.7408, + "step": 10863 + }, + { + "epoch": 0.8177797850919287, + "grad_norm": 5.656026363372803, + "learning_rate": 8.556076285627557e-06, + "loss": 1.5561, + "step": 10864 + }, + { + "epoch": 0.8178550593725888, + "grad_norm": 6.071460723876953, + "learning_rate": 8.549257583110415e-06, + "loss": 1.7851, + "step": 10865 + }, + { + "epoch": 0.8179303336532491, + "grad_norm": 5.292352676391602, + "learning_rate": 8.54244134472098e-06, + "loss": 1.879, + "step": 10866 + }, + { + "epoch": 0.8180056079339092, + "grad_norm": 5.435975074768066, + "learning_rate": 8.535627570864447e-06, + "loss": 1.3742, + "step": 10867 + }, + { + "epoch": 0.8180808822145693, + "grad_norm": 4.488597393035889, + "learning_rate": 8.528816261945877e-06, + "loss": 1.6002, + "step": 10868 + }, + { + "epoch": 0.8181561564952295, + "grad_norm": 3.987753391265869, + "learning_rate": 8.522007418370188e-06, + "loss": 1.6502, + "step": 10869 + }, + { + "epoch": 0.8182314307758897, + "grad_norm": 5.061254024505615, + "learning_rate": 8.515201040542158e-06, + "loss": 1.5702, + "step": 10870 + }, + { + "epoch": 0.8183067050565498, + "grad_norm": 4.399049758911133, + "learning_rate": 8.50839712886638e-06, + "loss": 1.6182, + "step": 10871 + }, + { + "epoch": 0.8183819793372099, + "grad_norm": 5.0424628257751465, + "learning_rate": 8.501595683747348e-06, + "loss": 1.8637, + "step": 10872 + }, + { + "epoch": 0.8184572536178701, + "grad_norm": 5.050430774688721, + "learning_rate": 8.49479670558937e-06, + "loss": 1.6919, + "step": 10873 + }, + { + "epoch": 0.8185325278985303, + "grad_norm": 5.877337455749512, + "learning_rate": 8.488000194796647e-06, + "loss": 1.5564, + "step": 10874 + }, + { + "epoch": 0.8186078021791904, + "grad_norm": 4.292812347412109, + "learning_rate": 8.48120615177319e-06, + "loss": 1.2814, + "step": 10875 + }, + { + "epoch": 0.8186830764598506, + "grad_norm": 4.522998332977295, + "learning_rate": 8.474414576922912e-06, + "loss": 1.5369, + "step": 10876 + }, + { + "epoch": 0.8187583507405107, + "grad_norm": 5.359824180603027, + "learning_rate": 8.467625470649526e-06, + "loss": 2.1323, + "step": 10877 + }, + { + "epoch": 0.8188336250211709, + "grad_norm": 4.371697425842285, + "learning_rate": 8.460838833356632e-06, + "loss": 1.6965, + "step": 10878 + }, + { + "epoch": 0.8189088993018311, + "grad_norm": 4.70720911026001, + "learning_rate": 8.454054665447681e-06, + "loss": 1.4837, + "step": 10879 + }, + { + "epoch": 0.8189841735824912, + "grad_norm": 5.8965959548950195, + "learning_rate": 8.447272967325981e-06, + "loss": 1.9186, + "step": 10880 + }, + { + "epoch": 0.8190594478631513, + "grad_norm": 5.415875434875488, + "learning_rate": 8.440493739394668e-06, + "loss": 1.6566, + "step": 10881 + }, + { + "epoch": 0.8191347221438116, + "grad_norm": 6.362188816070557, + "learning_rate": 8.433716982056749e-06, + "loss": 1.8009, + "step": 10882 + }, + { + "epoch": 0.8192099964244717, + "grad_norm": 5.833646297454834, + "learning_rate": 8.426942695715101e-06, + "loss": 1.9785, + "step": 10883 + }, + { + "epoch": 0.8192852707051318, + "grad_norm": 5.876897811889648, + "learning_rate": 8.420170880772415e-06, + "loss": 1.8089, + "step": 10884 + }, + { + "epoch": 0.819360544985792, + "grad_norm": 5.744025230407715, + "learning_rate": 8.413401537631277e-06, + "loss": 1.6437, + "step": 10885 + }, + { + "epoch": 0.8194358192664521, + "grad_norm": 8.794206619262695, + "learning_rate": 8.406634666694074e-06, + "loss": 1.7701, + "step": 10886 + }, + { + "epoch": 0.8195110935471123, + "grad_norm": 5.116382122039795, + "learning_rate": 8.399870268363113e-06, + "loss": 1.8833, + "step": 10887 + }, + { + "epoch": 0.8195863678277725, + "grad_norm": 4.325708866119385, + "learning_rate": 8.393108343040484e-06, + "loss": 1.675, + "step": 10888 + }, + { + "epoch": 0.8196616421084326, + "grad_norm": 4.20037841796875, + "learning_rate": 8.386348891128198e-06, + "loss": 1.3599, + "step": 10889 + }, + { + "epoch": 0.8197369163890927, + "grad_norm": 5.545212745666504, + "learning_rate": 8.379591913028051e-06, + "loss": 2.0006, + "step": 10890 + }, + { + "epoch": 0.8198121906697529, + "grad_norm": 5.9127678871154785, + "learning_rate": 8.372837409141744e-06, + "loss": 1.7678, + "step": 10891 + }, + { + "epoch": 0.8198874649504131, + "grad_norm": 5.372143745422363, + "learning_rate": 8.366085379870814e-06, + "loss": 1.6145, + "step": 10892 + }, + { + "epoch": 0.8199627392310732, + "grad_norm": 8.43078327178955, + "learning_rate": 8.359335825616655e-06, + "loss": 1.8953, + "step": 10893 + }, + { + "epoch": 0.8200380135117333, + "grad_norm": 5.326043605804443, + "learning_rate": 8.352588746780488e-06, + "loss": 1.4639, + "step": 10894 + }, + { + "epoch": 0.8201132877923936, + "grad_norm": 7.153926849365234, + "learning_rate": 8.345844143763437e-06, + "loss": 1.5647, + "step": 10895 + }, + { + "epoch": 0.8201885620730537, + "grad_norm": 5.018762111663818, + "learning_rate": 8.339102016966415e-06, + "loss": 1.5729, + "step": 10896 + }, + { + "epoch": 0.8202638363537138, + "grad_norm": 4.839263916015625, + "learning_rate": 8.332362366790241e-06, + "loss": 1.7049, + "step": 10897 + }, + { + "epoch": 0.820339110634374, + "grad_norm": 5.453345775604248, + "learning_rate": 8.325625193635583e-06, + "loss": 1.3305, + "step": 10898 + }, + { + "epoch": 0.8204143849150342, + "grad_norm": 8.34306812286377, + "learning_rate": 8.318890497902914e-06, + "loss": 1.7974, + "step": 10899 + }, + { + "epoch": 0.8204896591956943, + "grad_norm": 3.804720640182495, + "learning_rate": 8.31215827999262e-06, + "loss": 1.5273, + "step": 10900 + }, + { + "epoch": 0.8205649334763545, + "grad_norm": 6.422311782836914, + "learning_rate": 8.305428540304889e-06, + "loss": 1.5448, + "step": 10901 + }, + { + "epoch": 0.8206402077570146, + "grad_norm": 7.073173522949219, + "learning_rate": 8.298701279239795e-06, + "loss": 1.7702, + "step": 10902 + }, + { + "epoch": 0.8207154820376747, + "grad_norm": 3.5602216720581055, + "learning_rate": 8.291976497197263e-06, + "loss": 1.5583, + "step": 10903 + }, + { + "epoch": 0.820790756318335, + "grad_norm": 4.932760715484619, + "learning_rate": 8.285254194577057e-06, + "loss": 1.7468, + "step": 10904 + }, + { + "epoch": 0.8208660305989951, + "grad_norm": 4.4945573806762695, + "learning_rate": 8.27853437177879e-06, + "loss": 1.6441, + "step": 10905 + }, + { + "epoch": 0.8209413048796552, + "grad_norm": 4.575950622558594, + "learning_rate": 8.27181702920195e-06, + "loss": 1.5961, + "step": 10906 + }, + { + "epoch": 0.8210165791603155, + "grad_norm": 5.490642070770264, + "learning_rate": 8.265102167245848e-06, + "loss": 1.5985, + "step": 10907 + }, + { + "epoch": 0.8210918534409756, + "grad_norm": 5.020941734313965, + "learning_rate": 8.258389786309677e-06, + "loss": 2.0604, + "step": 10908 + }, + { + "epoch": 0.8211671277216357, + "grad_norm": 4.401211261749268, + "learning_rate": 8.251679886792457e-06, + "loss": 1.9484, + "step": 10909 + }, + { + "epoch": 0.8212424020022958, + "grad_norm": 4.599873065948486, + "learning_rate": 8.244972469093092e-06, + "loss": 1.5293, + "step": 10910 + }, + { + "epoch": 0.821317676282956, + "grad_norm": 6.028941631317139, + "learning_rate": 8.23826753361029e-06, + "loss": 1.7849, + "step": 10911 + }, + { + "epoch": 0.8213929505636162, + "grad_norm": 4.834048271179199, + "learning_rate": 8.231565080742654e-06, + "loss": 1.712, + "step": 10912 + }, + { + "epoch": 0.8214682248442763, + "grad_norm": 5.334903240203857, + "learning_rate": 8.22486511088863e-06, + "loss": 1.5723, + "step": 10913 + }, + { + "epoch": 0.8215434991249365, + "grad_norm": 4.641783237457275, + "learning_rate": 8.218167624446504e-06, + "loss": 1.6751, + "step": 10914 + }, + { + "epoch": 0.8216187734055966, + "grad_norm": 4.336653709411621, + "learning_rate": 8.211472621814441e-06, + "loss": 1.7782, + "step": 10915 + }, + { + "epoch": 0.8216940476862568, + "grad_norm": 4.06304407119751, + "learning_rate": 8.204780103390414e-06, + "loss": 1.9177, + "step": 10916 + }, + { + "epoch": 0.821769321966917, + "grad_norm": 4.336757659912109, + "learning_rate": 8.198090069572301e-06, + "loss": 1.7821, + "step": 10917 + }, + { + "epoch": 0.8218445962475771, + "grad_norm": 4.985970497131348, + "learning_rate": 8.191402520757773e-06, + "loss": 1.9833, + "step": 10918 + }, + { + "epoch": 0.8219198705282372, + "grad_norm": 5.235047817230225, + "learning_rate": 8.184717457344421e-06, + "loss": 1.9888, + "step": 10919 + }, + { + "epoch": 0.8219951448088975, + "grad_norm": 3.7938363552093506, + "learning_rate": 8.178034879729618e-06, + "loss": 1.6152, + "step": 10920 + }, + { + "epoch": 0.8220704190895576, + "grad_norm": 4.5220441818237305, + "learning_rate": 8.171354788310659e-06, + "loss": 1.6875, + "step": 10921 + }, + { + "epoch": 0.8221456933702177, + "grad_norm": 4.037940502166748, + "learning_rate": 8.16467718348462e-06, + "loss": 1.3536, + "step": 10922 + }, + { + "epoch": 0.8222209676508779, + "grad_norm": 5.319381237030029, + "learning_rate": 8.158002065648485e-06, + "loss": 2.0584, + "step": 10923 + }, + { + "epoch": 0.822296241931538, + "grad_norm": 4.723677635192871, + "learning_rate": 8.151329435199068e-06, + "loss": 1.96, + "step": 10924 + }, + { + "epoch": 0.8223715162121982, + "grad_norm": 4.269428730010986, + "learning_rate": 8.144659292533052e-06, + "loss": 1.7691, + "step": 10925 + }, + { + "epoch": 0.8224467904928584, + "grad_norm": 7.120105266571045, + "learning_rate": 8.137991638046932e-06, + "loss": 1.8422, + "step": 10926 + }, + { + "epoch": 0.8225220647735185, + "grad_norm": 3.9309520721435547, + "learning_rate": 8.131326472137107e-06, + "loss": 1.3894, + "step": 10927 + }, + { + "epoch": 0.8225973390541786, + "grad_norm": 4.960661888122559, + "learning_rate": 8.124663795199771e-06, + "loss": 1.845, + "step": 10928 + }, + { + "epoch": 0.8226726133348388, + "grad_norm": 6.035215854644775, + "learning_rate": 8.118003607631025e-06, + "loss": 1.5816, + "step": 10929 + }, + { + "epoch": 0.822747887615499, + "grad_norm": 4.923506736755371, + "learning_rate": 8.111345909826801e-06, + "loss": 1.9743, + "step": 10930 + }, + { + "epoch": 0.8228231618961591, + "grad_norm": 7.376336097717285, + "learning_rate": 8.10469070218286e-06, + "loss": 1.3924, + "step": 10931 + }, + { + "epoch": 0.8228984361768192, + "grad_norm": 4.756871223449707, + "learning_rate": 8.098037985094858e-06, + "loss": 1.9125, + "step": 10932 + }, + { + "epoch": 0.8229737104574795, + "grad_norm": 4.2428154945373535, + "learning_rate": 8.091387758958253e-06, + "loss": 1.606, + "step": 10933 + }, + { + "epoch": 0.8230489847381396, + "grad_norm": 4.157013893127441, + "learning_rate": 8.084740024168408e-06, + "loss": 1.9736, + "step": 10934 + }, + { + "epoch": 0.8231242590187997, + "grad_norm": 5.077691078186035, + "learning_rate": 8.078094781120494e-06, + "loss": 1.8644, + "step": 10935 + }, + { + "epoch": 0.8231995332994599, + "grad_norm": 4.817681789398193, + "learning_rate": 8.071452030209553e-06, + "loss": 1.6616, + "step": 10936 + }, + { + "epoch": 0.8232748075801201, + "grad_norm": 7.545267581939697, + "learning_rate": 8.064811771830483e-06, + "loss": 1.8113, + "step": 10937 + }, + { + "epoch": 0.8233500818607802, + "grad_norm": 9.952936172485352, + "learning_rate": 8.058174006378039e-06, + "loss": 1.6782, + "step": 10938 + }, + { + "epoch": 0.8234253561414404, + "grad_norm": 4.724859237670898, + "learning_rate": 8.051538734246793e-06, + "loss": 2.1885, + "step": 10939 + }, + { + "epoch": 0.8235006304221005, + "grad_norm": 7.638442039489746, + "learning_rate": 8.044905955831223e-06, + "loss": 1.8009, + "step": 10940 + }, + { + "epoch": 0.8235759047027607, + "grad_norm": 5.1861958503723145, + "learning_rate": 8.038275671525592e-06, + "loss": 1.9533, + "step": 10941 + }, + { + "epoch": 0.8236511789834209, + "grad_norm": 5.1349968910217285, + "learning_rate": 8.031647881724086e-06, + "loss": 1.7181, + "step": 10942 + }, + { + "epoch": 0.823726453264081, + "grad_norm": 4.116723537445068, + "learning_rate": 8.025022586820679e-06, + "loss": 1.727, + "step": 10943 + }, + { + "epoch": 0.8238017275447411, + "grad_norm": 5.33433723449707, + "learning_rate": 8.018399787209258e-06, + "loss": 1.9008, + "step": 10944 + }, + { + "epoch": 0.8238770018254014, + "grad_norm": 7.582970142364502, + "learning_rate": 8.011779483283494e-06, + "loss": 1.6893, + "step": 10945 + }, + { + "epoch": 0.8239522761060615, + "grad_norm": 4.536684513092041, + "learning_rate": 8.005161675436961e-06, + "loss": 1.5235, + "step": 10946 + }, + { + "epoch": 0.8240275503867216, + "grad_norm": 9.125519752502441, + "learning_rate": 7.998546364063069e-06, + "loss": 2.0258, + "step": 10947 + }, + { + "epoch": 0.8241028246673817, + "grad_norm": 7.238245010375977, + "learning_rate": 7.99193354955508e-06, + "loss": 1.7037, + "step": 10948 + }, + { + "epoch": 0.824178098948042, + "grad_norm": 4.888298034667969, + "learning_rate": 7.985323232306124e-06, + "loss": 1.6628, + "step": 10949 + }, + { + "epoch": 0.8242533732287021, + "grad_norm": 5.797140121459961, + "learning_rate": 7.978715412709131e-06, + "loss": 1.5902, + "step": 10950 + }, + { + "epoch": 0.8243286475093622, + "grad_norm": 5.052073001861572, + "learning_rate": 7.972110091156947e-06, + "loss": 1.8259, + "step": 10951 + }, + { + "epoch": 0.8244039217900224, + "grad_norm": 4.93002986907959, + "learning_rate": 7.965507268042217e-06, + "loss": 1.8239, + "step": 10952 + }, + { + "epoch": 0.8244791960706825, + "grad_norm": 7.035369396209717, + "learning_rate": 7.958906943757483e-06, + "loss": 1.9364, + "step": 10953 + }, + { + "epoch": 0.8245544703513427, + "grad_norm": 5.951178550720215, + "learning_rate": 7.952309118695084e-06, + "loss": 1.8852, + "step": 10954 + }, + { + "epoch": 0.8246297446320029, + "grad_norm": 4.959742069244385, + "learning_rate": 7.945713793247273e-06, + "loss": 1.7807, + "step": 10955 + }, + { + "epoch": 0.824705018912663, + "grad_norm": 6.198773384094238, + "learning_rate": 7.939120967806101e-06, + "loss": 1.7916, + "step": 10956 + }, + { + "epoch": 0.8247802931933231, + "grad_norm": 4.590382099151611, + "learning_rate": 7.932530642763498e-06, + "loss": 1.508, + "step": 10957 + }, + { + "epoch": 0.8248555674739834, + "grad_norm": 7.815896511077881, + "learning_rate": 7.925942818511245e-06, + "loss": 1.5852, + "step": 10958 + }, + { + "epoch": 0.8249308417546435, + "grad_norm": 5.780311584472656, + "learning_rate": 7.919357495440977e-06, + "loss": 1.9684, + "step": 10959 + }, + { + "epoch": 0.8250061160353036, + "grad_norm": 8.305814743041992, + "learning_rate": 7.912774673944157e-06, + "loss": 1.8048, + "step": 10960 + }, + { + "epoch": 0.8250813903159638, + "grad_norm": 6.1196465492248535, + "learning_rate": 7.906194354412117e-06, + "loss": 1.5855, + "step": 10961 + }, + { + "epoch": 0.825156664596624, + "grad_norm": 3.809445858001709, + "learning_rate": 7.89961653723605e-06, + "loss": 1.565, + "step": 10962 + }, + { + "epoch": 0.8252319388772841, + "grad_norm": 5.269323825836182, + "learning_rate": 7.893041222806975e-06, + "loss": 2.1431, + "step": 10963 + }, + { + "epoch": 0.8253072131579443, + "grad_norm": 4.550458908081055, + "learning_rate": 7.886468411515784e-06, + "loss": 1.9105, + "step": 10964 + }, + { + "epoch": 0.8253824874386044, + "grad_norm": 5.480938911437988, + "learning_rate": 7.879898103753208e-06, + "loss": 1.7384, + "step": 10965 + }, + { + "epoch": 0.8254577617192645, + "grad_norm": 4.501259803771973, + "learning_rate": 7.873330299909837e-06, + "loss": 1.8431, + "step": 10966 + }, + { + "epoch": 0.8255330359999248, + "grad_norm": 5.292892932891846, + "learning_rate": 7.866765000376098e-06, + "loss": 1.6566, + "step": 10967 + }, + { + "epoch": 0.8256083102805849, + "grad_norm": 3.9994585514068604, + "learning_rate": 7.860202205542282e-06, + "loss": 1.7348, + "step": 10968 + }, + { + "epoch": 0.825683584561245, + "grad_norm": 5.7271728515625, + "learning_rate": 7.853641915798532e-06, + "loss": 1.6322, + "step": 10969 + }, + { + "epoch": 0.8257588588419051, + "grad_norm": 6.063449382781982, + "learning_rate": 7.847084131534855e-06, + "loss": 1.6248, + "step": 10970 + }, + { + "epoch": 0.8258341331225654, + "grad_norm": 7.751720428466797, + "learning_rate": 7.84052885314106e-06, + "loss": 1.9196, + "step": 10971 + }, + { + "epoch": 0.8259094074032255, + "grad_norm": 5.197169303894043, + "learning_rate": 7.833976081006873e-06, + "loss": 1.9748, + "step": 10972 + }, + { + "epoch": 0.8259846816838856, + "grad_norm": 4.158738613128662, + "learning_rate": 7.827425815521804e-06, + "loss": 1.8588, + "step": 10973 + }, + { + "epoch": 0.8260599559645458, + "grad_norm": 6.7579193115234375, + "learning_rate": 7.820878057075275e-06, + "loss": 1.8351, + "step": 10974 + }, + { + "epoch": 0.826135230245206, + "grad_norm": 5.700201034545898, + "learning_rate": 7.814332806056507e-06, + "loss": 1.9827, + "step": 10975 + }, + { + "epoch": 0.8262105045258661, + "grad_norm": 5.586737632751465, + "learning_rate": 7.807790062854625e-06, + "loss": 1.5592, + "step": 10976 + }, + { + "epoch": 0.8262857788065263, + "grad_norm": 7.50944709777832, + "learning_rate": 7.801249827858547e-06, + "loss": 1.8614, + "step": 10977 + }, + { + "epoch": 0.8263610530871864, + "grad_norm": 4.703793525695801, + "learning_rate": 7.794712101457086e-06, + "loss": 1.6359, + "step": 10978 + }, + { + "epoch": 0.8264363273678466, + "grad_norm": 7.1648759841918945, + "learning_rate": 7.788176884038889e-06, + "loss": 2.4626, + "step": 10979 + }, + { + "epoch": 0.8265116016485068, + "grad_norm": 5.00128173828125, + "learning_rate": 7.78164417599246e-06, + "loss": 1.6202, + "step": 10980 + }, + { + "epoch": 0.8265868759291669, + "grad_norm": 6.689926624298096, + "learning_rate": 7.77511397770615e-06, + "loss": 2.0002, + "step": 10981 + }, + { + "epoch": 0.826662150209827, + "grad_norm": 4.7410125732421875, + "learning_rate": 7.768586289568153e-06, + "loss": 1.845, + "step": 10982 + }, + { + "epoch": 0.8267374244904873, + "grad_norm": 5.61693811416626, + "learning_rate": 7.762061111966534e-06, + "loss": 1.9447, + "step": 10983 + }, + { + "epoch": 0.8268126987711474, + "grad_norm": 5.026223659515381, + "learning_rate": 7.755538445289179e-06, + "loss": 1.8375, + "step": 10984 + }, + { + "epoch": 0.8268879730518075, + "grad_norm": 5.361608505249023, + "learning_rate": 7.74901828992386e-06, + "loss": 1.9564, + "step": 10985 + }, + { + "epoch": 0.8269632473324677, + "grad_norm": 5.447265625, + "learning_rate": 7.74250064625816e-06, + "loss": 1.7766, + "step": 10986 + }, + { + "epoch": 0.8270385216131279, + "grad_norm": 4.481755256652832, + "learning_rate": 7.735985514679561e-06, + "loss": 1.6674, + "step": 10987 + }, + { + "epoch": 0.827113795893788, + "grad_norm": 6.55386209487915, + "learning_rate": 7.729472895575341e-06, + "loss": 1.8225, + "step": 10988 + }, + { + "epoch": 0.8271890701744481, + "grad_norm": 6.0485663414001465, + "learning_rate": 7.722962789332676e-06, + "loss": 1.8584, + "step": 10989 + }, + { + "epoch": 0.8272643444551083, + "grad_norm": 5.985475540161133, + "learning_rate": 7.716455196338563e-06, + "loss": 1.5388, + "step": 10990 + }, + { + "epoch": 0.8273396187357684, + "grad_norm": 8.02165412902832, + "learning_rate": 7.709950116979858e-06, + "loss": 1.6634, + "step": 10991 + }, + { + "epoch": 0.8274148930164286, + "grad_norm": 4.710300445556641, + "learning_rate": 7.703447551643278e-06, + "loss": 1.7326, + "step": 10992 + }, + { + "epoch": 0.8274901672970888, + "grad_norm": 4.7654218673706055, + "learning_rate": 7.69694750071538e-06, + "loss": 1.5626, + "step": 10993 + }, + { + "epoch": 0.8275654415777489, + "grad_norm": 4.788698673248291, + "learning_rate": 7.69044996458258e-06, + "loss": 1.5371, + "step": 10994 + }, + { + "epoch": 0.827640715858409, + "grad_norm": 4.505056858062744, + "learning_rate": 7.683954943631116e-06, + "loss": 1.8602, + "step": 10995 + }, + { + "epoch": 0.8277159901390693, + "grad_norm": 4.957911014556885, + "learning_rate": 7.677462438247129e-06, + "loss": 1.7271, + "step": 10996 + }, + { + "epoch": 0.8277912644197294, + "grad_norm": 5.344749927520752, + "learning_rate": 7.67097244881655e-06, + "loss": 1.623, + "step": 10997 + }, + { + "epoch": 0.8278665387003895, + "grad_norm": 6.659008979797363, + "learning_rate": 7.664484975725211e-06, + "loss": 1.5782, + "step": 10998 + }, + { + "epoch": 0.8279418129810497, + "grad_norm": 4.466121673583984, + "learning_rate": 7.658000019358764e-06, + "loss": 1.519, + "step": 10999 + }, + { + "epoch": 0.8280170872617099, + "grad_norm": 4.804609775543213, + "learning_rate": 7.651517580102724e-06, + "loss": 1.3442, + "step": 11000 + }, + { + "epoch": 0.82809236154237, + "grad_norm": 4.663994312286377, + "learning_rate": 7.645037658342447e-06, + "loss": 1.9341, + "step": 11001 + }, + { + "epoch": 0.8281676358230302, + "grad_norm": 4.295934677124023, + "learning_rate": 7.63856025446315e-06, + "loss": 1.5768, + "step": 11002 + }, + { + "epoch": 0.8282429101036903, + "grad_norm": 5.220107555389404, + "learning_rate": 7.632085368849896e-06, + "loss": 1.5171, + "step": 11003 + }, + { + "epoch": 0.8283181843843505, + "grad_norm": 4.193447113037109, + "learning_rate": 7.62561300188761e-06, + "loss": 1.4782, + "step": 11004 + }, + { + "epoch": 0.8283934586650107, + "grad_norm": 8.150671005249023, + "learning_rate": 7.619143153961039e-06, + "loss": 1.6729, + "step": 11005 + }, + { + "epoch": 0.8284687329456708, + "grad_norm": 5.071077346801758, + "learning_rate": 7.6126758254548125e-06, + "loss": 1.7875, + "step": 11006 + }, + { + "epoch": 0.8285440072263309, + "grad_norm": 4.6814656257629395, + "learning_rate": 7.6062110167533726e-06, + "loss": 1.7586, + "step": 11007 + }, + { + "epoch": 0.828619281506991, + "grad_norm": 4.4523749351501465, + "learning_rate": 7.599748728241057e-06, + "loss": 1.5667, + "step": 11008 + }, + { + "epoch": 0.8286945557876513, + "grad_norm": 4.9366912841796875, + "learning_rate": 7.593288960302008e-06, + "loss": 1.8477, + "step": 11009 + }, + { + "epoch": 0.8287698300683114, + "grad_norm": 4.7316203117370605, + "learning_rate": 7.586831713320253e-06, + "loss": 1.2793, + "step": 11010 + }, + { + "epoch": 0.8288451043489715, + "grad_norm": 6.039381980895996, + "learning_rate": 7.580376987679666e-06, + "loss": 1.4809, + "step": 11011 + }, + { + "epoch": 0.8289203786296317, + "grad_norm": 5.554289817810059, + "learning_rate": 7.573924783763942e-06, + "loss": 1.4709, + "step": 11012 + }, + { + "epoch": 0.8289956529102919, + "grad_norm": 8.146241188049316, + "learning_rate": 7.567475101956656e-06, + "loss": 1.6761, + "step": 11013 + }, + { + "epoch": 0.829070927190952, + "grad_norm": 5.214634895324707, + "learning_rate": 7.561027942641219e-06, + "loss": 2.1128, + "step": 11014 + }, + { + "epoch": 0.8291462014716122, + "grad_norm": 4.8079376220703125, + "learning_rate": 7.554583306200913e-06, + "loss": 1.7158, + "step": 11015 + }, + { + "epoch": 0.8292214757522723, + "grad_norm": 4.704328536987305, + "learning_rate": 7.548141193018832e-06, + "loss": 1.7376, + "step": 11016 + }, + { + "epoch": 0.8292967500329325, + "grad_norm": 5.261148929595947, + "learning_rate": 7.541701603477957e-06, + "loss": 1.4864, + "step": 11017 + }, + { + "epoch": 0.8293720243135927, + "grad_norm": 6.521105766296387, + "learning_rate": 7.535264537961084e-06, + "loss": 1.4563, + "step": 11018 + }, + { + "epoch": 0.8294472985942528, + "grad_norm": 5.155236721038818, + "learning_rate": 7.5288299968509e-06, + "loss": 1.6743, + "step": 11019 + }, + { + "epoch": 0.8295225728749129, + "grad_norm": 7.313258171081543, + "learning_rate": 7.5223979805299e-06, + "loss": 1.7916, + "step": 11020 + }, + { + "epoch": 0.8295978471555732, + "grad_norm": 3.989315986633301, + "learning_rate": 7.515968489380465e-06, + "loss": 1.7118, + "step": 11021 + }, + { + "epoch": 0.8296731214362333, + "grad_norm": 5.877418041229248, + "learning_rate": 7.509541523784797e-06, + "loss": 1.9989, + "step": 11022 + }, + { + "epoch": 0.8297483957168934, + "grad_norm": 6.289952278137207, + "learning_rate": 7.503117084124961e-06, + "loss": 2.3036, + "step": 11023 + }, + { + "epoch": 0.8298236699975536, + "grad_norm": 4.3154730796813965, + "learning_rate": 7.49669517078288e-06, + "loss": 1.6588, + "step": 11024 + }, + { + "epoch": 0.8298989442782138, + "grad_norm": 5.762095928192139, + "learning_rate": 7.4902757841403325e-06, + "loss": 1.5326, + "step": 11025 + }, + { + "epoch": 0.8299742185588739, + "grad_norm": 5.923062324523926, + "learning_rate": 7.483858924578896e-06, + "loss": 2.2313, + "step": 11026 + }, + { + "epoch": 0.830049492839534, + "grad_norm": 6.4805450439453125, + "learning_rate": 7.47744459248006e-06, + "loss": 1.5764, + "step": 11027 + }, + { + "epoch": 0.8301247671201942, + "grad_norm": 5.429744720458984, + "learning_rate": 7.4710327882251355e-06, + "loss": 1.969, + "step": 11028 + }, + { + "epoch": 0.8302000414008543, + "grad_norm": 4.730030059814453, + "learning_rate": 7.464623512195279e-06, + "loss": 1.7627, + "step": 11029 + }, + { + "epoch": 0.8302753156815145, + "grad_norm": 5.404567718505859, + "learning_rate": 7.458216764771514e-06, + "loss": 1.741, + "step": 11030 + }, + { + "epoch": 0.8303505899621747, + "grad_norm": 5.660772800445557, + "learning_rate": 7.4518125463346855e-06, + "loss": 1.9058, + "step": 11031 + }, + { + "epoch": 0.8304258642428348, + "grad_norm": 4.867333889007568, + "learning_rate": 7.445410857265528e-06, + "loss": 1.8546, + "step": 11032 + }, + { + "epoch": 0.8305011385234949, + "grad_norm": 6.74031400680542, + "learning_rate": 7.439011697944581e-06, + "loss": 1.415, + "step": 11033 + }, + { + "epoch": 0.8305764128041552, + "grad_norm": 3.620102882385254, + "learning_rate": 7.43261506875228e-06, + "loss": 1.8443, + "step": 11034 + }, + { + "epoch": 0.8306516870848153, + "grad_norm": 6.375827312469482, + "learning_rate": 7.426220970068864e-06, + "loss": 1.5704, + "step": 11035 + }, + { + "epoch": 0.8307269613654754, + "grad_norm": 5.2237629890441895, + "learning_rate": 7.419829402274453e-06, + "loss": 1.7401, + "step": 11036 + }, + { + "epoch": 0.8308022356461356, + "grad_norm": 5.559440612792969, + "learning_rate": 7.413440365749002e-06, + "loss": 1.8119, + "step": 11037 + }, + { + "epoch": 0.8308775099267958, + "grad_norm": 5.466170310974121, + "learning_rate": 7.407053860872343e-06, + "loss": 1.8112, + "step": 11038 + }, + { + "epoch": 0.8309527842074559, + "grad_norm": 3.8664166927337646, + "learning_rate": 7.4006698880241084e-06, + "loss": 1.783, + "step": 11039 + }, + { + "epoch": 0.8310280584881161, + "grad_norm": 7.441871643066406, + "learning_rate": 7.394288447583825e-06, + "loss": 1.6649, + "step": 11040 + }, + { + "epoch": 0.8311033327687762, + "grad_norm": 9.133502960205078, + "learning_rate": 7.3879095399308375e-06, + "loss": 1.9666, + "step": 11041 + }, + { + "epoch": 0.8311786070494364, + "grad_norm": 5.8501152992248535, + "learning_rate": 7.381533165444355e-06, + "loss": 1.9769, + "step": 11042 + }, + { + "epoch": 0.8312538813300966, + "grad_norm": 5.106383800506592, + "learning_rate": 7.375159324503456e-06, + "loss": 1.663, + "step": 11043 + }, + { + "epoch": 0.8313291556107567, + "grad_norm": 4.815539836883545, + "learning_rate": 7.368788017487016e-06, + "loss": 1.8211, + "step": 11044 + }, + { + "epoch": 0.8314044298914168, + "grad_norm": 5.38843297958374, + "learning_rate": 7.362419244773816e-06, + "loss": 1.89, + "step": 11045 + }, + { + "epoch": 0.8314797041720771, + "grad_norm": 5.356052398681641, + "learning_rate": 7.356053006742442e-06, + "loss": 1.8514, + "step": 11046 + }, + { + "epoch": 0.8315549784527372, + "grad_norm": 5.468636512756348, + "learning_rate": 7.3496893037713564e-06, + "loss": 1.8794, + "step": 11047 + }, + { + "epoch": 0.8316302527333973, + "grad_norm": 7.057224273681641, + "learning_rate": 7.343328136238869e-06, + "loss": 1.3001, + "step": 11048 + }, + { + "epoch": 0.8317055270140574, + "grad_norm": 4.931948184967041, + "learning_rate": 7.336969504523133e-06, + "loss": 1.4663, + "step": 11049 + }, + { + "epoch": 0.8317808012947177, + "grad_norm": 4.959153175354004, + "learning_rate": 7.330613409002135e-06, + "loss": 1.2662, + "step": 11050 + }, + { + "epoch": 0.8318560755753778, + "grad_norm": 7.052262783050537, + "learning_rate": 7.324259850053755e-06, + "loss": 2.5178, + "step": 11051 + }, + { + "epoch": 0.8319313498560379, + "grad_norm": 5.001206398010254, + "learning_rate": 7.317908828055659e-06, + "loss": 1.8402, + "step": 11052 + }, + { + "epoch": 0.8320066241366981, + "grad_norm": 4.8166961669921875, + "learning_rate": 7.311560343385432e-06, + "loss": 1.6132, + "step": 11053 + }, + { + "epoch": 0.8320818984173582, + "grad_norm": 5.357537269592285, + "learning_rate": 7.30521439642044e-06, + "loss": 1.5618, + "step": 11054 + }, + { + "epoch": 0.8321571726980184, + "grad_norm": 5.318265438079834, + "learning_rate": 7.298870987537959e-06, + "loss": 1.5966, + "step": 11055 + }, + { + "epoch": 0.8322324469786786, + "grad_norm": 6.288393497467041, + "learning_rate": 7.292530117115059e-06, + "loss": 1.974, + "step": 11056 + }, + { + "epoch": 0.8323077212593387, + "grad_norm": 5.921321868896484, + "learning_rate": 7.286191785528707e-06, + "loss": 2.0694, + "step": 11057 + }, + { + "epoch": 0.8323829955399988, + "grad_norm": 5.27810001373291, + "learning_rate": 7.2798559931556934e-06, + "loss": 1.4968, + "step": 11058 + }, + { + "epoch": 0.8324582698206591, + "grad_norm": 4.912023544311523, + "learning_rate": 7.273522740372662e-06, + "loss": 1.7914, + "step": 11059 + }, + { + "epoch": 0.8325335441013192, + "grad_norm": 5.61682653427124, + "learning_rate": 7.267192027556119e-06, + "loss": 1.5717, + "step": 11060 + }, + { + "epoch": 0.8326088183819793, + "grad_norm": 5.883847236633301, + "learning_rate": 7.260863855082389e-06, + "loss": 1.6645, + "step": 11061 + }, + { + "epoch": 0.8326840926626395, + "grad_norm": 4.574014663696289, + "learning_rate": 7.254538223327678e-06, + "loss": 1.9285, + "step": 11062 + }, + { + "epoch": 0.8327593669432997, + "grad_norm": 5.6800642013549805, + "learning_rate": 7.248215132668007e-06, + "loss": 1.513, + "step": 11063 + }, + { + "epoch": 0.8328346412239598, + "grad_norm": 6.786117076873779, + "learning_rate": 7.241894583479286e-06, + "loss": 1.5505, + "step": 11064 + }, + { + "epoch": 0.83290991550462, + "grad_norm": 4.412558555603027, + "learning_rate": 7.235576576137243e-06, + "loss": 1.824, + "step": 11065 + }, + { + "epoch": 0.8329851897852801, + "grad_norm": 5.002243518829346, + "learning_rate": 7.229261111017471e-06, + "loss": 1.5395, + "step": 11066 + }, + { + "epoch": 0.8330604640659403, + "grad_norm": 6.369983196258545, + "learning_rate": 7.222948188495393e-06, + "loss": 1.9259, + "step": 11067 + }, + { + "epoch": 0.8331357383466004, + "grad_norm": 5.593514442443848, + "learning_rate": 7.216637808946308e-06, + "loss": 1.5783, + "step": 11068 + }, + { + "epoch": 0.8332110126272606, + "grad_norm": 4.803321838378906, + "learning_rate": 7.210329972745344e-06, + "loss": 1.691, + "step": 11069 + }, + { + "epoch": 0.8332862869079207, + "grad_norm": 6.01563024520874, + "learning_rate": 7.204024680267496e-06, + "loss": 1.6997, + "step": 11070 + }, + { + "epoch": 0.8333615611885808, + "grad_norm": 5.081418514251709, + "learning_rate": 7.197721931887574e-06, + "loss": 1.6954, + "step": 11071 + }, + { + "epoch": 0.8334368354692411, + "grad_norm": 4.742153167724609, + "learning_rate": 7.19142172798028e-06, + "loss": 1.6538, + "step": 11072 + }, + { + "epoch": 0.8335121097499012, + "grad_norm": 4.234041213989258, + "learning_rate": 7.185124068920124e-06, + "loss": 1.6028, + "step": 11073 + }, + { + "epoch": 0.8335873840305613, + "grad_norm": 6.410834789276123, + "learning_rate": 7.17882895508149e-06, + "loss": 2.0493, + "step": 11074 + }, + { + "epoch": 0.8336626583112215, + "grad_norm": 5.497403621673584, + "learning_rate": 7.1725363868386185e-06, + "loss": 1.5671, + "step": 11075 + }, + { + "epoch": 0.8337379325918817, + "grad_norm": 4.742766857147217, + "learning_rate": 7.166246364565565e-06, + "loss": 1.5208, + "step": 11076 + }, + { + "epoch": 0.8338132068725418, + "grad_norm": 6.510270118713379, + "learning_rate": 7.15995888863627e-06, + "loss": 1.8001, + "step": 11077 + }, + { + "epoch": 0.833888481153202, + "grad_norm": 5.075628280639648, + "learning_rate": 7.153673959424484e-06, + "loss": 1.8106, + "step": 11078 + }, + { + "epoch": 0.8339637554338621, + "grad_norm": 4.539682388305664, + "learning_rate": 7.147391577303847e-06, + "loss": 1.5485, + "step": 11079 + }, + { + "epoch": 0.8340390297145223, + "grad_norm": 7.159042835235596, + "learning_rate": 7.141111742647816e-06, + "loss": 2.097, + "step": 11080 + }, + { + "epoch": 0.8341143039951825, + "grad_norm": 8.984960556030273, + "learning_rate": 7.134834455829731e-06, + "loss": 1.4693, + "step": 11081 + }, + { + "epoch": 0.8341895782758426, + "grad_norm": 4.777451038360596, + "learning_rate": 7.1285597172227325e-06, + "loss": 1.6546, + "step": 11082 + }, + { + "epoch": 0.8342648525565027, + "grad_norm": 3.711937665939331, + "learning_rate": 7.122287527199861e-06, + "loss": 1.6194, + "step": 11083 + }, + { + "epoch": 0.834340126837163, + "grad_norm": 4.29139518737793, + "learning_rate": 7.116017886133946e-06, + "loss": 1.5071, + "step": 11084 + }, + { + "epoch": 0.8344154011178231, + "grad_norm": 6.242701530456543, + "learning_rate": 7.1097507943977405e-06, + "loss": 1.9474, + "step": 11085 + }, + { + "epoch": 0.8344906753984832, + "grad_norm": 5.977292537689209, + "learning_rate": 7.103486252363767e-06, + "loss": 1.877, + "step": 11086 + }, + { + "epoch": 0.8345659496791433, + "grad_norm": 4.64522123336792, + "learning_rate": 7.097224260404467e-06, + "loss": 1.5543, + "step": 11087 + }, + { + "epoch": 0.8346412239598036, + "grad_norm": 3.896138906478882, + "learning_rate": 7.090964818892071e-06, + "loss": 1.801, + "step": 11088 + }, + { + "epoch": 0.8347164982404637, + "grad_norm": 5.946915626525879, + "learning_rate": 7.084707928198703e-06, + "loss": 1.5494, + "step": 11089 + }, + { + "epoch": 0.8347917725211238, + "grad_norm": 6.362826347351074, + "learning_rate": 7.078453588696304e-06, + "loss": 1.6414, + "step": 11090 + }, + { + "epoch": 0.834867046801784, + "grad_norm": 7.737717151641846, + "learning_rate": 7.072201800756684e-06, + "loss": 1.731, + "step": 11091 + }, + { + "epoch": 0.8349423210824441, + "grad_norm": 6.174107551574707, + "learning_rate": 7.065952564751488e-06, + "loss": 1.8562, + "step": 11092 + }, + { + "epoch": 0.8350175953631043, + "grad_norm": 4.8028059005737305, + "learning_rate": 7.0597058810522255e-06, + "loss": 1.9719, + "step": 11093 + }, + { + "epoch": 0.8350928696437645, + "grad_norm": 5.0389604568481445, + "learning_rate": 7.053461750030249e-06, + "loss": 1.8084, + "step": 11094 + }, + { + "epoch": 0.8351681439244246, + "grad_norm": 6.150721073150635, + "learning_rate": 7.047220172056734e-06, + "loss": 2.0972, + "step": 11095 + }, + { + "epoch": 0.8352434182050847, + "grad_norm": 5.63960075378418, + "learning_rate": 7.040981147502746e-06, + "loss": 1.9313, + "step": 11096 + }, + { + "epoch": 0.835318692485745, + "grad_norm": 6.209543228149414, + "learning_rate": 7.034744676739152e-06, + "loss": 1.9807, + "step": 11097 + }, + { + "epoch": 0.8353939667664051, + "grad_norm": 8.983195304870605, + "learning_rate": 7.028510760136719e-06, + "loss": 1.9088, + "step": 11098 + }, + { + "epoch": 0.8354692410470652, + "grad_norm": 5.123426914215088, + "learning_rate": 7.022279398066006e-06, + "loss": 1.6754, + "step": 11099 + }, + { + "epoch": 0.8355445153277254, + "grad_norm": 5.828625202178955, + "learning_rate": 7.016050590897482e-06, + "loss": 1.4463, + "step": 11100 + }, + { + "epoch": 0.8356197896083856, + "grad_norm": 4.167364597320557, + "learning_rate": 7.009824339001403e-06, + "loss": 1.7247, + "step": 11101 + }, + { + "epoch": 0.8356950638890457, + "grad_norm": 6.106855869293213, + "learning_rate": 7.0036006427479154e-06, + "loss": 1.8565, + "step": 11102 + }, + { + "epoch": 0.8357703381697059, + "grad_norm": 5.279453277587891, + "learning_rate": 6.997379502507001e-06, + "loss": 1.6274, + "step": 11103 + }, + { + "epoch": 0.835845612450366, + "grad_norm": 4.252303123474121, + "learning_rate": 6.991160918648493e-06, + "loss": 1.732, + "step": 11104 + }, + { + "epoch": 0.8359208867310262, + "grad_norm": 7.787482738494873, + "learning_rate": 6.9849448915420555e-06, + "loss": 2.1317, + "step": 11105 + }, + { + "epoch": 0.8359961610116863, + "grad_norm": 4.159592151641846, + "learning_rate": 6.9787314215572165e-06, + "loss": 1.5805, + "step": 11106 + }, + { + "epoch": 0.8360714352923465, + "grad_norm": 4.559919834136963, + "learning_rate": 6.972520509063363e-06, + "loss": 1.7205, + "step": 11107 + }, + { + "epoch": 0.8361467095730066, + "grad_norm": 4.523076057434082, + "learning_rate": 6.966312154429699e-06, + "loss": 1.6352, + "step": 11108 + }, + { + "epoch": 0.8362219838536668, + "grad_norm": 5.078846454620361, + "learning_rate": 6.9601063580253086e-06, + "loss": 1.245, + "step": 11109 + }, + { + "epoch": 0.836297258134327, + "grad_norm": 5.903271675109863, + "learning_rate": 6.953903120219091e-06, + "loss": 1.3959, + "step": 11110 + }, + { + "epoch": 0.8363725324149871, + "grad_norm": 5.042324066162109, + "learning_rate": 6.947702441379828e-06, + "loss": 1.6825, + "step": 11111 + }, + { + "epoch": 0.8364478066956472, + "grad_norm": 4.117861270904541, + "learning_rate": 6.9415043218761136e-06, + "loss": 1.8086, + "step": 11112 + }, + { + "epoch": 0.8365230809763075, + "grad_norm": 8.842025756835938, + "learning_rate": 6.935308762076415e-06, + "loss": 2.5337, + "step": 11113 + }, + { + "epoch": 0.8365983552569676, + "grad_norm": 7.153327941894531, + "learning_rate": 6.9291157623490475e-06, + "loss": 1.5316, + "step": 11114 + }, + { + "epoch": 0.8366736295376277, + "grad_norm": 4.12716007232666, + "learning_rate": 6.92292532306218e-06, + "loss": 1.6695, + "step": 11115 + }, + { + "epoch": 0.8367489038182879, + "grad_norm": 5.79695463180542, + "learning_rate": 6.916737444583782e-06, + "loss": 1.6195, + "step": 11116 + }, + { + "epoch": 0.836824178098948, + "grad_norm": 4.951187610626221, + "learning_rate": 6.9105521272817384e-06, + "loss": 1.7018, + "step": 11117 + }, + { + "epoch": 0.8368994523796082, + "grad_norm": 6.289691925048828, + "learning_rate": 6.9043693715237165e-06, + "loss": 1.7198, + "step": 11118 + }, + { + "epoch": 0.8369747266602684, + "grad_norm": 4.718573570251465, + "learning_rate": 6.898189177677295e-06, + "loss": 1.9348, + "step": 11119 + }, + { + "epoch": 0.8370500009409285, + "grad_norm": 7.739538669586182, + "learning_rate": 6.8920115461098445e-06, + "loss": 1.5703, + "step": 11120 + }, + { + "epoch": 0.8371252752215886, + "grad_norm": 5.575514793395996, + "learning_rate": 6.885836477188629e-06, + "loss": 1.7773, + "step": 11121 + }, + { + "epoch": 0.8372005495022489, + "grad_norm": 6.485258102416992, + "learning_rate": 6.879663971280709e-06, + "loss": 1.5495, + "step": 11122 + }, + { + "epoch": 0.837275823782909, + "grad_norm": 4.962364673614502, + "learning_rate": 6.873494028753041e-06, + "loss": 1.614, + "step": 11123 + }, + { + "epoch": 0.8373510980635691, + "grad_norm": 4.118284225463867, + "learning_rate": 6.8673266499724166e-06, + "loss": 1.4532, + "step": 11124 + }, + { + "epoch": 0.8374263723442292, + "grad_norm": 4.268768787384033, + "learning_rate": 6.861161835305452e-06, + "loss": 1.6754, + "step": 11125 + }, + { + "epoch": 0.8375016466248895, + "grad_norm": 4.877648830413818, + "learning_rate": 6.854999585118654e-06, + "loss": 1.6147, + "step": 11126 + }, + { + "epoch": 0.8375769209055496, + "grad_norm": 4.90310525894165, + "learning_rate": 6.8488398997783195e-06, + "loss": 1.5458, + "step": 11127 + }, + { + "epoch": 0.8376521951862097, + "grad_norm": 6.428467273712158, + "learning_rate": 6.842682779650655e-06, + "loss": 1.3369, + "step": 11128 + }, + { + "epoch": 0.8377274694668699, + "grad_norm": 5.383645534515381, + "learning_rate": 6.8365282251016515e-06, + "loss": 1.5801, + "step": 11129 + }, + { + "epoch": 0.83780274374753, + "grad_norm": 4.903599262237549, + "learning_rate": 6.830376236497205e-06, + "loss": 1.7061, + "step": 11130 + }, + { + "epoch": 0.8378780180281902, + "grad_norm": 3.8339924812316895, + "learning_rate": 6.824226814203017e-06, + "loss": 1.681, + "step": 11131 + }, + { + "epoch": 0.8379532923088504, + "grad_norm": 4.178893566131592, + "learning_rate": 6.8180799585846665e-06, + "loss": 1.6656, + "step": 11132 + }, + { + "epoch": 0.8380285665895105, + "grad_norm": 4.922976970672607, + "learning_rate": 6.811935670007552e-06, + "loss": 1.5009, + "step": 11133 + }, + { + "epoch": 0.8381038408701706, + "grad_norm": 5.118055820465088, + "learning_rate": 6.805793948836941e-06, + "loss": 2.0673, + "step": 11134 + }, + { + "epoch": 0.8381791151508309, + "grad_norm": 6.452218055725098, + "learning_rate": 6.799654795437949e-06, + "loss": 1.6623, + "step": 11135 + }, + { + "epoch": 0.838254389431491, + "grad_norm": 4.6666364669799805, + "learning_rate": 6.793518210175515e-06, + "loss": 1.9366, + "step": 11136 + }, + { + "epoch": 0.8383296637121511, + "grad_norm": 4.66019344329834, + "learning_rate": 6.787384193414453e-06, + "loss": 1.4549, + "step": 11137 + }, + { + "epoch": 0.8384049379928113, + "grad_norm": 3.937755823135376, + "learning_rate": 6.781252745519417e-06, + "loss": 1.8971, + "step": 11138 + }, + { + "epoch": 0.8384802122734715, + "grad_norm": 5.8036065101623535, + "learning_rate": 6.775123866854888e-06, + "loss": 1.6427, + "step": 11139 + }, + { + "epoch": 0.8385554865541316, + "grad_norm": 6.429412364959717, + "learning_rate": 6.768997557785217e-06, + "loss": 1.9028, + "step": 11140 + }, + { + "epoch": 0.8386307608347918, + "grad_norm": 5.78550386428833, + "learning_rate": 6.762873818674609e-06, + "loss": 2.0287, + "step": 11141 + }, + { + "epoch": 0.8387060351154519, + "grad_norm": 5.572829246520996, + "learning_rate": 6.756752649887082e-06, + "loss": 1.9933, + "step": 11142 + }, + { + "epoch": 0.8387813093961121, + "grad_norm": 7.585248947143555, + "learning_rate": 6.750634051786542e-06, + "loss": 2.0518, + "step": 11143 + }, + { + "epoch": 0.8388565836767723, + "grad_norm": 5.612165451049805, + "learning_rate": 6.744518024736696e-06, + "loss": 1.5131, + "step": 11144 + }, + { + "epoch": 0.8389318579574324, + "grad_norm": 6.8556084632873535, + "learning_rate": 6.738404569101153e-06, + "loss": 1.6808, + "step": 11145 + }, + { + "epoch": 0.8390071322380925, + "grad_norm": 6.01460599899292, + "learning_rate": 6.7322936852433184e-06, + "loss": 1.872, + "step": 11146 + }, + { + "epoch": 0.8390824065187527, + "grad_norm": 6.867294788360596, + "learning_rate": 6.726185373526473e-06, + "loss": 1.5567, + "step": 11147 + }, + { + "epoch": 0.8391576807994129, + "grad_norm": 4.484410285949707, + "learning_rate": 6.720079634313742e-06, + "loss": 1.2322, + "step": 11148 + }, + { + "epoch": 0.839232955080073, + "grad_norm": 5.141690254211426, + "learning_rate": 6.713976467968103e-06, + "loss": 2.1124, + "step": 11149 + }, + { + "epoch": 0.8393082293607331, + "grad_norm": 6.083642482757568, + "learning_rate": 6.707875874852348e-06, + "loss": 1.4595, + "step": 11150 + }, + { + "epoch": 0.8393835036413934, + "grad_norm": 5.788231372833252, + "learning_rate": 6.701777855329167e-06, + "loss": 1.626, + "step": 11151 + }, + { + "epoch": 0.8394587779220535, + "grad_norm": 7.060763835906982, + "learning_rate": 6.695682409761045e-06, + "loss": 1.5856, + "step": 11152 + }, + { + "epoch": 0.8395340522027136, + "grad_norm": 4.986818313598633, + "learning_rate": 6.6895895385103605e-06, + "loss": 1.8896, + "step": 11153 + }, + { + "epoch": 0.8396093264833738, + "grad_norm": 6.275311470031738, + "learning_rate": 6.683499241939295e-06, + "loss": 1.9622, + "step": 11154 + }, + { + "epoch": 0.839684600764034, + "grad_norm": 5.7394914627075195, + "learning_rate": 6.67741152040991e-06, + "loss": 1.5168, + "step": 11155 + }, + { + "epoch": 0.8397598750446941, + "grad_norm": 6.20206880569458, + "learning_rate": 6.6713263742841145e-06, + "loss": 1.9609, + "step": 11156 + }, + { + "epoch": 0.8398351493253543, + "grad_norm": 6.096729755401611, + "learning_rate": 6.665243803923632e-06, + "loss": 1.6326, + "step": 11157 + }, + { + "epoch": 0.8399104236060144, + "grad_norm": 4.3786516189575195, + "learning_rate": 6.659163809690067e-06, + "loss": 1.6732, + "step": 11158 + }, + { + "epoch": 0.8399856978866745, + "grad_norm": 5.004992961883545, + "learning_rate": 6.653086391944852e-06, + "loss": 1.5563, + "step": 11159 + }, + { + "epoch": 0.8400609721673348, + "grad_norm": 5.620204448699951, + "learning_rate": 6.647011551049287e-06, + "loss": 1.4618, + "step": 11160 + }, + { + "epoch": 0.8401362464479949, + "grad_norm": 4.507925987243652, + "learning_rate": 6.640939287364478e-06, + "loss": 1.6447, + "step": 11161 + }, + { + "epoch": 0.840211520728655, + "grad_norm": 4.962132453918457, + "learning_rate": 6.634869601251426e-06, + "loss": 1.4089, + "step": 11162 + }, + { + "epoch": 0.8402867950093152, + "grad_norm": 5.6625189781188965, + "learning_rate": 6.62880249307094e-06, + "loss": 1.3758, + "step": 11163 + }, + { + "epoch": 0.8403620692899754, + "grad_norm": 6.484951972961426, + "learning_rate": 6.622737963183712e-06, + "loss": 2.223, + "step": 11164 + }, + { + "epoch": 0.8404373435706355, + "grad_norm": 4.909332752227783, + "learning_rate": 6.6166760119502405e-06, + "loss": 1.817, + "step": 11165 + }, + { + "epoch": 0.8405126178512956, + "grad_norm": 7.888582706451416, + "learning_rate": 6.6106166397309045e-06, + "loss": 2.0353, + "step": 11166 + }, + { + "epoch": 0.8405878921319558, + "grad_norm": 4.517070770263672, + "learning_rate": 6.6045598468858996e-06, + "loss": 1.5444, + "step": 11167 + }, + { + "epoch": 0.840663166412616, + "grad_norm": 4.5210652351379395, + "learning_rate": 6.598505633775304e-06, + "loss": 1.845, + "step": 11168 + }, + { + "epoch": 0.8407384406932761, + "grad_norm": 4.667665958404541, + "learning_rate": 6.5924540007590076e-06, + "loss": 1.6682, + "step": 11169 + }, + { + "epoch": 0.8408137149739363, + "grad_norm": 4.05859375, + "learning_rate": 6.5864049481967834e-06, + "loss": 1.6899, + "step": 11170 + }, + { + "epoch": 0.8408889892545964, + "grad_norm": 5.745716571807861, + "learning_rate": 6.580358476448212e-06, + "loss": 1.5289, + "step": 11171 + }, + { + "epoch": 0.8409642635352566, + "grad_norm": 9.191686630249023, + "learning_rate": 6.574314585872738e-06, + "loss": 1.7182, + "step": 11172 + }, + { + "epoch": 0.8410395378159168, + "grad_norm": 4.4797139167785645, + "learning_rate": 6.5682732768296725e-06, + "loss": 1.8096, + "step": 11173 + }, + { + "epoch": 0.8411148120965769, + "grad_norm": 4.803218364715576, + "learning_rate": 6.5622345496781315e-06, + "loss": 1.7938, + "step": 11174 + }, + { + "epoch": 0.841190086377237, + "grad_norm": 5.984793663024902, + "learning_rate": 6.556198404777119e-06, + "loss": 1.774, + "step": 11175 + }, + { + "epoch": 0.8412653606578973, + "grad_norm": 7.745728969573975, + "learning_rate": 6.550164842485445e-06, + "loss": 1.5113, + "step": 11176 + }, + { + "epoch": 0.8413406349385574, + "grad_norm": 4.81554651260376, + "learning_rate": 6.544133863161811e-06, + "loss": 2.1828, + "step": 11177 + }, + { + "epoch": 0.8414159092192175, + "grad_norm": 5.009761333465576, + "learning_rate": 6.538105467164718e-06, + "loss": 1.4728, + "step": 11178 + }, + { + "epoch": 0.8414911834998777, + "grad_norm": 4.34937858581543, + "learning_rate": 6.532079654852552e-06, + "loss": 1.2945, + "step": 11179 + }, + { + "epoch": 0.8415664577805378, + "grad_norm": 5.538344383239746, + "learning_rate": 6.526056426583526e-06, + "loss": 1.8349, + "step": 11180 + }, + { + "epoch": 0.841641732061198, + "grad_norm": 4.853887557983398, + "learning_rate": 6.520035782715716e-06, + "loss": 1.6384, + "step": 11181 + }, + { + "epoch": 0.8417170063418582, + "grad_norm": 7.687852382659912, + "learning_rate": 6.514017723607007e-06, + "loss": 1.9673, + "step": 11182 + }, + { + "epoch": 0.8417922806225183, + "grad_norm": 4.518214702606201, + "learning_rate": 6.508002249615186e-06, + "loss": 1.8865, + "step": 11183 + }, + { + "epoch": 0.8418675549031784, + "grad_norm": 4.735904216766357, + "learning_rate": 6.501989361097821e-06, + "loss": 1.7322, + "step": 11184 + }, + { + "epoch": 0.8419428291838386, + "grad_norm": 5.055391788482666, + "learning_rate": 6.4959790584123934e-06, + "loss": 1.6865, + "step": 11185 + }, + { + "epoch": 0.8420181034644988, + "grad_norm": 6.8694376945495605, + "learning_rate": 6.4899713419161735e-06, + "loss": 1.7476, + "step": 11186 + }, + { + "epoch": 0.8420933777451589, + "grad_norm": 5.501788139343262, + "learning_rate": 6.483966211966308e-06, + "loss": 1.7496, + "step": 11187 + }, + { + "epoch": 0.842168652025819, + "grad_norm": 4.376001834869385, + "learning_rate": 6.477963668919806e-06, + "loss": 1.6578, + "step": 11188 + }, + { + "epoch": 0.8422439263064793, + "grad_norm": 5.062139511108398, + "learning_rate": 6.471963713133472e-06, + "loss": 1.7292, + "step": 11189 + }, + { + "epoch": 0.8423192005871394, + "grad_norm": 4.50846004486084, + "learning_rate": 6.465966344964014e-06, + "loss": 1.8718, + "step": 11190 + }, + { + "epoch": 0.8423944748677995, + "grad_norm": 6.001620769500732, + "learning_rate": 6.459971564767925e-06, + "loss": 1.6182, + "step": 11191 + }, + { + "epoch": 0.8424697491484597, + "grad_norm": 4.871519565582275, + "learning_rate": 6.453979372901603e-06, + "loss": 1.3407, + "step": 11192 + }, + { + "epoch": 0.8425450234291199, + "grad_norm": 4.48479700088501, + "learning_rate": 6.4479897697212574e-06, + "loss": 1.7747, + "step": 11193 + }, + { + "epoch": 0.84262029770978, + "grad_norm": 4.681257724761963, + "learning_rate": 6.442002755582966e-06, + "loss": 1.8296, + "step": 11194 + }, + { + "epoch": 0.8426955719904402, + "grad_norm": 4.2211408615112305, + "learning_rate": 6.436018330842619e-06, + "loss": 1.6402, + "step": 11195 + }, + { + "epoch": 0.8427708462711003, + "grad_norm": 4.731535911560059, + "learning_rate": 6.43003649585599e-06, + "loss": 2.138, + "step": 11196 + }, + { + "epoch": 0.8428461205517604, + "grad_norm": 4.571580410003662, + "learning_rate": 6.424057250978671e-06, + "loss": 1.7857, + "step": 11197 + }, + { + "epoch": 0.8429213948324207, + "grad_norm": 5.964311122894287, + "learning_rate": 6.418080596566123e-06, + "loss": 1.8946, + "step": 11198 + }, + { + "epoch": 0.8429966691130808, + "grad_norm": 6.811120510101318, + "learning_rate": 6.4121065329736165e-06, + "loss": 1.8943, + "step": 11199 + }, + { + "epoch": 0.8430719433937409, + "grad_norm": 5.971794128417969, + "learning_rate": 6.406135060556329e-06, + "loss": 1.7446, + "step": 11200 + }, + { + "epoch": 0.8431472176744012, + "grad_norm": 5.61641263961792, + "learning_rate": 6.400166179669209e-06, + "loss": 1.8669, + "step": 11201 + }, + { + "epoch": 0.8432224919550613, + "grad_norm": 6.132184028625488, + "learning_rate": 6.394199890667113e-06, + "loss": 1.5679, + "step": 11202 + }, + { + "epoch": 0.8432977662357214, + "grad_norm": 5.616762161254883, + "learning_rate": 6.388236193904712e-06, + "loss": 1.88, + "step": 11203 + }, + { + "epoch": 0.8433730405163815, + "grad_norm": 4.172910690307617, + "learning_rate": 6.382275089736534e-06, + "loss": 1.6733, + "step": 11204 + }, + { + "epoch": 0.8434483147970417, + "grad_norm": 4.799108028411865, + "learning_rate": 6.376316578516955e-06, + "loss": 2.1099, + "step": 11205 + }, + { + "epoch": 0.8435235890777019, + "grad_norm": 7.142563343048096, + "learning_rate": 6.370360660600178e-06, + "loss": 1.7779, + "step": 11206 + }, + { + "epoch": 0.843598863358362, + "grad_norm": 5.616652488708496, + "learning_rate": 6.3644073363402836e-06, + "loss": 1.6552, + "step": 11207 + }, + { + "epoch": 0.8436741376390222, + "grad_norm": 7.383673191070557, + "learning_rate": 6.3584566060911565e-06, + "loss": 1.6729, + "step": 11208 + }, + { + "epoch": 0.8437494119196823, + "grad_norm": 7.80656099319458, + "learning_rate": 6.3525084702065754e-06, + "loss": 1.6252, + "step": 11209 + }, + { + "epoch": 0.8438246862003425, + "grad_norm": 7.280595302581787, + "learning_rate": 6.346562929040112e-06, + "loss": 1.7012, + "step": 11210 + }, + { + "epoch": 0.8438999604810027, + "grad_norm": 5.628237724304199, + "learning_rate": 6.340619982945239e-06, + "loss": 1.9386, + "step": 11211 + }, + { + "epoch": 0.8439752347616628, + "grad_norm": 5.039392948150635, + "learning_rate": 6.334679632275226e-06, + "loss": 1.9509, + "step": 11212 + }, + { + "epoch": 0.8440505090423229, + "grad_norm": 4.84282922744751, + "learning_rate": 6.328741877383221e-06, + "loss": 1.8118, + "step": 11213 + }, + { + "epoch": 0.8441257833229832, + "grad_norm": 8.12214183807373, + "learning_rate": 6.322806718622204e-06, + "loss": 1.9314, + "step": 11214 + }, + { + "epoch": 0.8442010576036433, + "grad_norm": 4.691908836364746, + "learning_rate": 6.3168741563450095e-06, + "loss": 1.6397, + "step": 11215 + }, + { + "epoch": 0.8442763318843034, + "grad_norm": 5.326754093170166, + "learning_rate": 6.3109441909043e-06, + "loss": 1.7776, + "step": 11216 + }, + { + "epoch": 0.8443516061649636, + "grad_norm": 4.921647071838379, + "learning_rate": 6.305016822652609e-06, + "loss": 1.5997, + "step": 11217 + }, + { + "epoch": 0.8444268804456238, + "grad_norm": 4.4140448570251465, + "learning_rate": 6.299092051942279e-06, + "loss": 1.5259, + "step": 11218 + }, + { + "epoch": 0.8445021547262839, + "grad_norm": 6.338119029998779, + "learning_rate": 6.293169879125538e-06, + "loss": 1.6065, + "step": 11219 + }, + { + "epoch": 0.8445774290069441, + "grad_norm": 4.332146167755127, + "learning_rate": 6.287250304554448e-06, + "loss": 1.9074, + "step": 11220 + }, + { + "epoch": 0.8446527032876042, + "grad_norm": 6.503258228302002, + "learning_rate": 6.281333328580896e-06, + "loss": 2.0955, + "step": 11221 + }, + { + "epoch": 0.8447279775682643, + "grad_norm": 4.90545129776001, + "learning_rate": 6.275418951556639e-06, + "loss": 1.5922, + "step": 11222 + }, + { + "epoch": 0.8448032518489245, + "grad_norm": 5.027221202850342, + "learning_rate": 6.2695071738332555e-06, + "loss": 1.684, + "step": 11223 + }, + { + "epoch": 0.8448785261295847, + "grad_norm": 6.59862756729126, + "learning_rate": 6.263597995762199e-06, + "loss": 1.3753, + "step": 11224 + }, + { + "epoch": 0.8449538004102448, + "grad_norm": 4.585446834564209, + "learning_rate": 6.257691417694739e-06, + "loss": 1.7602, + "step": 11225 + }, + { + "epoch": 0.8450290746909049, + "grad_norm": 7.058629035949707, + "learning_rate": 6.251787439982032e-06, + "loss": 1.7624, + "step": 11226 + }, + { + "epoch": 0.8451043489715652, + "grad_norm": 3.844776153564453, + "learning_rate": 6.245886062975021e-06, + "loss": 1.8411, + "step": 11227 + }, + { + "epoch": 0.8451796232522253, + "grad_norm": 4.824901580810547, + "learning_rate": 6.239987287024546e-06, + "loss": 2.0247, + "step": 11228 + }, + { + "epoch": 0.8452548975328854, + "grad_norm": 4.562021732330322, + "learning_rate": 6.2340911124812605e-06, + "loss": 1.5139, + "step": 11229 + }, + { + "epoch": 0.8453301718135456, + "grad_norm": 7.2056708335876465, + "learning_rate": 6.228197539695685e-06, + "loss": 1.8279, + "step": 11230 + }, + { + "epoch": 0.8454054460942058, + "grad_norm": 7.355616569519043, + "learning_rate": 6.222306569018166e-06, + "loss": 1.7975, + "step": 11231 + }, + { + "epoch": 0.8454807203748659, + "grad_norm": 6.44072151184082, + "learning_rate": 6.2164182007989124e-06, + "loss": 1.6639, + "step": 11232 + }, + { + "epoch": 0.8455559946555261, + "grad_norm": 5.348602771759033, + "learning_rate": 6.2105324353879615e-06, + "loss": 1.4814, + "step": 11233 + }, + { + "epoch": 0.8456312689361862, + "grad_norm": 6.094944477081299, + "learning_rate": 6.20464927313521e-06, + "loss": 1.899, + "step": 11234 + }, + { + "epoch": 0.8457065432168464, + "grad_norm": 4.532393455505371, + "learning_rate": 6.198768714390396e-06, + "loss": 1.4413, + "step": 11235 + }, + { + "epoch": 0.8457818174975066, + "grad_norm": 5.502945423126221, + "learning_rate": 6.192890759503117e-06, + "loss": 1.697, + "step": 11236 + }, + { + "epoch": 0.8458570917781667, + "grad_norm": 4.7064995765686035, + "learning_rate": 6.187015408822772e-06, + "loss": 2.1542, + "step": 11237 + }, + { + "epoch": 0.8459323660588268, + "grad_norm": 4.368133068084717, + "learning_rate": 6.181142662698647e-06, + "loss": 1.5646, + "step": 11238 + }, + { + "epoch": 0.846007640339487, + "grad_norm": 6.451797008514404, + "learning_rate": 6.175272521479869e-06, + "loss": 1.3642, + "step": 11239 + }, + { + "epoch": 0.8460829146201472, + "grad_norm": 5.179734230041504, + "learning_rate": 6.169404985515381e-06, + "loss": 2.2393, + "step": 11240 + }, + { + "epoch": 0.8461581889008073, + "grad_norm": 5.3309783935546875, + "learning_rate": 6.163540055154016e-06, + "loss": 1.73, + "step": 11241 + }, + { + "epoch": 0.8462334631814675, + "grad_norm": 7.391524791717529, + "learning_rate": 6.1576777307444046e-06, + "loss": 1.6768, + "step": 11242 + }, + { + "epoch": 0.8463087374621276, + "grad_norm": 6.062099456787109, + "learning_rate": 6.151818012635064e-06, + "loss": 1.7797, + "step": 11243 + }, + { + "epoch": 0.8463840117427878, + "grad_norm": 4.093652725219727, + "learning_rate": 6.145960901174314e-06, + "loss": 1.6052, + "step": 11244 + }, + { + "epoch": 0.8464592860234479, + "grad_norm": 5.368590831756592, + "learning_rate": 6.140106396710371e-06, + "loss": 1.5607, + "step": 11245 + }, + { + "epoch": 0.8465345603041081, + "grad_norm": 4.866500377655029, + "learning_rate": 6.134254499591247e-06, + "loss": 1.6595, + "step": 11246 + }, + { + "epoch": 0.8466098345847682, + "grad_norm": 4.729656219482422, + "learning_rate": 6.128405210164823e-06, + "loss": 2.2454, + "step": 11247 + }, + { + "epoch": 0.8466851088654284, + "grad_norm": 4.635674953460693, + "learning_rate": 6.122558528778826e-06, + "loss": 1.9665, + "step": 11248 + }, + { + "epoch": 0.8467603831460886, + "grad_norm": 6.337672710418701, + "learning_rate": 6.116714455780842e-06, + "loss": 1.6717, + "step": 11249 + }, + { + "epoch": 0.8468356574267487, + "grad_norm": 5.941047668457031, + "learning_rate": 6.110872991518251e-06, + "loss": 1.7067, + "step": 11250 + }, + { + "epoch": 0.8469109317074088, + "grad_norm": 4.131974220275879, + "learning_rate": 6.105034136338333e-06, + "loss": 1.4491, + "step": 11251 + }, + { + "epoch": 0.8469862059880691, + "grad_norm": 7.867033958435059, + "learning_rate": 6.099197890588199e-06, + "loss": 1.6296, + "step": 11252 + }, + { + "epoch": 0.8470614802687292, + "grad_norm": 5.8019938468933105, + "learning_rate": 6.093364254614775e-06, + "loss": 1.6926, + "step": 11253 + }, + { + "epoch": 0.8471367545493893, + "grad_norm": 6.510514736175537, + "learning_rate": 6.087533228764869e-06, + "loss": 1.6824, + "step": 11254 + }, + { + "epoch": 0.8472120288300495, + "grad_norm": 8.191925048828125, + "learning_rate": 6.081704813385109e-06, + "loss": 1.8462, + "step": 11255 + }, + { + "epoch": 0.8472873031107097, + "grad_norm": 5.8642706871032715, + "learning_rate": 6.0758790088219885e-06, + "loss": 1.7432, + "step": 11256 + }, + { + "epoch": 0.8473625773913698, + "grad_norm": 3.8678395748138428, + "learning_rate": 6.070055815421816e-06, + "loss": 1.682, + "step": 11257 + }, + { + "epoch": 0.84743785167203, + "grad_norm": 7.020726203918457, + "learning_rate": 6.064235233530779e-06, + "loss": 2.1519, + "step": 11258 + }, + { + "epoch": 0.8475131259526901, + "grad_norm": 4.905362606048584, + "learning_rate": 6.0584172634948925e-06, + "loss": 1.8492, + "step": 11259 + }, + { + "epoch": 0.8475884002333502, + "grad_norm": 6.480381965637207, + "learning_rate": 6.052601905660027e-06, + "loss": 1.5367, + "step": 11260 + }, + { + "epoch": 0.8476636745140105, + "grad_norm": 5.769245147705078, + "learning_rate": 6.0467891603718686e-06, + "loss": 1.6573, + "step": 11261 + }, + { + "epoch": 0.8477389487946706, + "grad_norm": 5.388595104217529, + "learning_rate": 6.04097902797599e-06, + "loss": 1.8247, + "step": 11262 + }, + { + "epoch": 0.8478142230753307, + "grad_norm": 7.229869365692139, + "learning_rate": 6.035171508817766e-06, + "loss": 1.5251, + "step": 11263 + }, + { + "epoch": 0.8478894973559908, + "grad_norm": 6.41273832321167, + "learning_rate": 6.029366603242453e-06, + "loss": 1.4877, + "step": 11264 + }, + { + "epoch": 0.8479647716366511, + "grad_norm": 4.079343795776367, + "learning_rate": 6.02356431159512e-06, + "loss": 1.8844, + "step": 11265 + }, + { + "epoch": 0.8480400459173112, + "grad_norm": 3.910313367843628, + "learning_rate": 6.017764634220719e-06, + "loss": 1.8569, + "step": 11266 + }, + { + "epoch": 0.8481153201979713, + "grad_norm": 4.8918938636779785, + "learning_rate": 6.011967571464e-06, + "loss": 1.6251, + "step": 11267 + }, + { + "epoch": 0.8481905944786315, + "grad_norm": 5.043403625488281, + "learning_rate": 6.00617312366959e-06, + "loss": 1.8034, + "step": 11268 + }, + { + "epoch": 0.8482658687592917, + "grad_norm": 5.6443891525268555, + "learning_rate": 6.0003812911819615e-06, + "loss": 1.9552, + "step": 11269 + }, + { + "epoch": 0.8483411430399518, + "grad_norm": 7.913328647613525, + "learning_rate": 5.994592074345412e-06, + "loss": 1.9104, + "step": 11270 + }, + { + "epoch": 0.848416417320612, + "grad_norm": 5.491076946258545, + "learning_rate": 5.988805473504106e-06, + "loss": 1.8495, + "step": 11271 + }, + { + "epoch": 0.8484916916012721, + "grad_norm": 7.318009376525879, + "learning_rate": 5.9830214890020274e-06, + "loss": 1.9661, + "step": 11272 + }, + { + "epoch": 0.8485669658819323, + "grad_norm": 6.952710151672363, + "learning_rate": 5.9772401211830306e-06, + "loss": 2.0904, + "step": 11273 + }, + { + "epoch": 0.8486422401625925, + "grad_norm": 5.428725719451904, + "learning_rate": 5.971461370390779e-06, + "loss": 1.7761, + "step": 11274 + }, + { + "epoch": 0.8487175144432526, + "grad_norm": 4.671530246734619, + "learning_rate": 5.965685236968832e-06, + "loss": 1.8911, + "step": 11275 + }, + { + "epoch": 0.8487927887239127, + "grad_norm": 6.262880325317383, + "learning_rate": 5.959911721260541e-06, + "loss": 1.7644, + "step": 11276 + }, + { + "epoch": 0.848868063004573, + "grad_norm": 6.2670135498046875, + "learning_rate": 5.954140823609139e-06, + "loss": 2.0092, + "step": 11277 + }, + { + "epoch": 0.8489433372852331, + "grad_norm": 6.2670135498046875, + "learning_rate": 5.954140823609139e-06, + "loss": 1.6408, + "step": 11278 + }, + { + "epoch": 0.8490186115658932, + "grad_norm": 4.753340721130371, + "learning_rate": 5.948372544357672e-06, + "loss": 1.4227, + "step": 11279 + }, + { + "epoch": 0.8490938858465534, + "grad_norm": 7.392508506774902, + "learning_rate": 5.942606883849061e-06, + "loss": 1.8656, + "step": 11280 + }, + { + "epoch": 0.8491691601272136, + "grad_norm": 5.248170852661133, + "learning_rate": 5.936843842426054e-06, + "loss": 1.613, + "step": 11281 + }, + { + "epoch": 0.8492444344078737, + "grad_norm": 4.482975006103516, + "learning_rate": 5.93108342043126e-06, + "loss": 1.6833, + "step": 11282 + }, + { + "epoch": 0.8493197086885338, + "grad_norm": 4.513645648956299, + "learning_rate": 5.925325618207101e-06, + "loss": 1.8635, + "step": 11283 + }, + { + "epoch": 0.849394982969194, + "grad_norm": 5.6021013259887695, + "learning_rate": 5.919570436095878e-06, + "loss": 1.641, + "step": 11284 + }, + { + "epoch": 0.8494702572498541, + "grad_norm": 6.515329360961914, + "learning_rate": 5.9138178744397036e-06, + "loss": 2.0755, + "step": 11285 + }, + { + "epoch": 0.8495455315305143, + "grad_norm": 11.327410697937012, + "learning_rate": 5.908067933580558e-06, + "loss": 2.0185, + "step": 11286 + }, + { + "epoch": 0.8496208058111745, + "grad_norm": 5.503057956695557, + "learning_rate": 5.902320613860268e-06, + "loss": 1.671, + "step": 11287 + }, + { + "epoch": 0.8496960800918346, + "grad_norm": 6.435186862945557, + "learning_rate": 5.896575915620478e-06, + "loss": 1.5572, + "step": 11288 + }, + { + "epoch": 0.8497713543724947, + "grad_norm": 6.620285987854004, + "learning_rate": 5.890833839202714e-06, + "loss": 1.2494, + "step": 11289 + }, + { + "epoch": 0.849846628653155, + "grad_norm": 6.316104412078857, + "learning_rate": 5.885094384948303e-06, + "loss": 1.6142, + "step": 11290 + }, + { + "epoch": 0.8499219029338151, + "grad_norm": 4.289783000946045, + "learning_rate": 5.879357553198461e-06, + "loss": 1.7146, + "step": 11291 + }, + { + "epoch": 0.8499971772144752, + "grad_norm": 6.927998065948486, + "learning_rate": 5.87362334429421e-06, + "loss": 2.0255, + "step": 11292 + }, + { + "epoch": 0.8500724514951354, + "grad_norm": 5.464117527008057, + "learning_rate": 5.867891758576432e-06, + "loss": 1.8702, + "step": 11293 + }, + { + "epoch": 0.8501477257757956, + "grad_norm": 5.7342634201049805, + "learning_rate": 5.862162796385867e-06, + "loss": 1.5493, + "step": 11294 + }, + { + "epoch": 0.8502230000564557, + "grad_norm": 8.534250259399414, + "learning_rate": 5.856436458063086e-06, + "loss": 1.5649, + "step": 11295 + }, + { + "epoch": 0.8502982743371159, + "grad_norm": 7.186662673950195, + "learning_rate": 5.850712743948483e-06, + "loss": 1.8795, + "step": 11296 + }, + { + "epoch": 0.850373548617776, + "grad_norm": 4.4900221824646, + "learning_rate": 5.844991654382337e-06, + "loss": 1.8705, + "step": 11297 + }, + { + "epoch": 0.8504488228984362, + "grad_norm": 4.691919326782227, + "learning_rate": 5.839273189704736e-06, + "loss": 1.4768, + "step": 11298 + }, + { + "epoch": 0.8505240971790964, + "grad_norm": 4.956110954284668, + "learning_rate": 5.833557350255642e-06, + "loss": 2.294, + "step": 11299 + }, + { + "epoch": 0.8505993714597565, + "grad_norm": 5.126801490783691, + "learning_rate": 5.827844136374827e-06, + "loss": 1.3174, + "step": 11300 + }, + { + "epoch": 0.8506746457404166, + "grad_norm": 5.413674354553223, + "learning_rate": 5.822133548401931e-06, + "loss": 1.8377, + "step": 11301 + }, + { + "epoch": 0.8507499200210767, + "grad_norm": 4.41970682144165, + "learning_rate": 5.816425586676444e-06, + "loss": 1.7476, + "step": 11302 + }, + { + "epoch": 0.850825194301737, + "grad_norm": 4.973052978515625, + "learning_rate": 5.810720251537671e-06, + "loss": 1.6316, + "step": 11303 + }, + { + "epoch": 0.8509004685823971, + "grad_norm": 5.457945346832275, + "learning_rate": 5.8050175433247864e-06, + "loss": 1.5139, + "step": 11304 + }, + { + "epoch": 0.8509757428630572, + "grad_norm": 5.019343376159668, + "learning_rate": 5.799317462376802e-06, + "loss": 1.675, + "step": 11305 + }, + { + "epoch": 0.8510510171437174, + "grad_norm": 5.423277854919434, + "learning_rate": 5.793620009032574e-06, + "loss": 1.3282, + "step": 11306 + }, + { + "epoch": 0.8511262914243776, + "grad_norm": 4.6853837966918945, + "learning_rate": 5.787925183630788e-06, + "loss": 1.9548, + "step": 11307 + }, + { + "epoch": 0.8512015657050377, + "grad_norm": 4.376720905303955, + "learning_rate": 5.782232986509995e-06, + "loss": 2.004, + "step": 11308 + }, + { + "epoch": 0.8512768399856979, + "grad_norm": 5.314483642578125, + "learning_rate": 5.776543418008573e-06, + "loss": 1.6641, + "step": 11309 + }, + { + "epoch": 0.851352114266358, + "grad_norm": 5.720130920410156, + "learning_rate": 5.770856478464759e-06, + "loss": 1.8305, + "step": 11310 + }, + { + "epoch": 0.8514273885470182, + "grad_norm": 4.887152194976807, + "learning_rate": 5.765172168216609e-06, + "loss": 1.6619, + "step": 11311 + }, + { + "epoch": 0.8515026628276784, + "grad_norm": 6.36875057220459, + "learning_rate": 5.759490487602065e-06, + "loss": 1.5929, + "step": 11312 + }, + { + "epoch": 0.8515779371083385, + "grad_norm": 4.069787502288818, + "learning_rate": 5.753811436958856e-06, + "loss": 2.0068, + "step": 11313 + }, + { + "epoch": 0.8516532113889986, + "grad_norm": 4.5081400871276855, + "learning_rate": 5.7481350166246004e-06, + "loss": 2.0092, + "step": 11314 + }, + { + "epoch": 0.8517284856696589, + "grad_norm": 4.4899420738220215, + "learning_rate": 5.742461226936746e-06, + "loss": 1.6442, + "step": 11315 + }, + { + "epoch": 0.851803759950319, + "grad_norm": 5.834133625030518, + "learning_rate": 5.736790068232595e-06, + "loss": 1.8345, + "step": 11316 + }, + { + "epoch": 0.8518790342309791, + "grad_norm": 8.381048202514648, + "learning_rate": 5.7311215408492605e-06, + "loss": 1.7994, + "step": 11317 + }, + { + "epoch": 0.8519543085116393, + "grad_norm": 4.379827499389648, + "learning_rate": 5.725455645123723e-06, + "loss": 2.1029, + "step": 11318 + }, + { + "epoch": 0.8520295827922995, + "grad_norm": 4.135026454925537, + "learning_rate": 5.719792381392824e-06, + "loss": 1.7511, + "step": 11319 + }, + { + "epoch": 0.8521048570729596, + "grad_norm": 6.45878791809082, + "learning_rate": 5.714131749993201e-06, + "loss": 1.5649, + "step": 11320 + }, + { + "epoch": 0.8521801313536198, + "grad_norm": 5.381940841674805, + "learning_rate": 5.708473751261384e-06, + "loss": 2.0128, + "step": 11321 + }, + { + "epoch": 0.8522554056342799, + "grad_norm": 4.772152423858643, + "learning_rate": 5.7028183855337095e-06, + "loss": 1.6736, + "step": 11322 + }, + { + "epoch": 0.85233067991494, + "grad_norm": 4.930591583251953, + "learning_rate": 5.697165653146386e-06, + "loss": 1.5916, + "step": 11323 + }, + { + "epoch": 0.8524059541956002, + "grad_norm": 5.021491050720215, + "learning_rate": 5.691515554435439e-06, + "loss": 1.3997, + "step": 11324 + }, + { + "epoch": 0.8524812284762604, + "grad_norm": 4.731684684753418, + "learning_rate": 5.685868089736757e-06, + "loss": 1.9967, + "step": 11325 + }, + { + "epoch": 0.8525565027569205, + "grad_norm": 4.944148540496826, + "learning_rate": 5.680223259386064e-06, + "loss": 1.4439, + "step": 11326 + }, + { + "epoch": 0.8526317770375806, + "grad_norm": 4.647091388702393, + "learning_rate": 5.6745810637189364e-06, + "loss": 1.4827, + "step": 11327 + }, + { + "epoch": 0.8527070513182409, + "grad_norm": 6.359358787536621, + "learning_rate": 5.668941503070773e-06, + "loss": 1.7635, + "step": 11328 + }, + { + "epoch": 0.852782325598901, + "grad_norm": 7.544530391693115, + "learning_rate": 5.663304577776851e-06, + "loss": 1.5772, + "step": 11329 + }, + { + "epoch": 0.8528575998795611, + "grad_norm": 5.096128463745117, + "learning_rate": 5.657670288172246e-06, + "loss": 1.7751, + "step": 11330 + }, + { + "epoch": 0.8529328741602213, + "grad_norm": 4.855006217956543, + "learning_rate": 5.652038634591916e-06, + "loss": 1.8739, + "step": 11331 + }, + { + "epoch": 0.8530081484408815, + "grad_norm": 7.162740707397461, + "learning_rate": 5.64640961737063e-06, + "loss": 1.8219, + "step": 11332 + }, + { + "epoch": 0.8530834227215416, + "grad_norm": 5.542574882507324, + "learning_rate": 5.640783236843028e-06, + "loss": 1.368, + "step": 11333 + }, + { + "epoch": 0.8531586970022018, + "grad_norm": 5.108410835266113, + "learning_rate": 5.635159493343595e-06, + "loss": 1.7203, + "step": 11334 + }, + { + "epoch": 0.8532339712828619, + "grad_norm": 5.540926933288574, + "learning_rate": 5.629538387206617e-06, + "loss": 1.867, + "step": 11335 + }, + { + "epoch": 0.8533092455635221, + "grad_norm": 6.137965202331543, + "learning_rate": 5.6239199187662735e-06, + "loss": 2.0165, + "step": 11336 + }, + { + "epoch": 0.8533845198441823, + "grad_norm": 4.016019821166992, + "learning_rate": 5.618304088356563e-06, + "loss": 1.717, + "step": 11337 + }, + { + "epoch": 0.8534597941248424, + "grad_norm": 6.203599452972412, + "learning_rate": 5.612690896311334e-06, + "loss": 1.4609, + "step": 11338 + }, + { + "epoch": 0.8535350684055025, + "grad_norm": 5.111523151397705, + "learning_rate": 5.607080342964266e-06, + "loss": 1.4237, + "step": 11339 + }, + { + "epoch": 0.8536103426861628, + "grad_norm": 4.699853897094727, + "learning_rate": 5.601472428648902e-06, + "loss": 1.4911, + "step": 11340 + }, + { + "epoch": 0.8536856169668229, + "grad_norm": 6.21340274810791, + "learning_rate": 5.595867153698597e-06, + "loss": 1.6598, + "step": 11341 + }, + { + "epoch": 0.853760891247483, + "grad_norm": 5.056273460388184, + "learning_rate": 5.590264518446586e-06, + "loss": 1.8033, + "step": 11342 + }, + { + "epoch": 0.8538361655281431, + "grad_norm": 4.697856426239014, + "learning_rate": 5.584664523225918e-06, + "loss": 2.2798, + "step": 11343 + }, + { + "epoch": 0.8539114398088034, + "grad_norm": 8.03392505645752, + "learning_rate": 5.579067168369511e-06, + "loss": 2.0526, + "step": 11344 + }, + { + "epoch": 0.8539867140894635, + "grad_norm": 5.68810510635376, + "learning_rate": 5.573472454210099e-06, + "loss": 1.5942, + "step": 11345 + }, + { + "epoch": 0.8540619883701236, + "grad_norm": 4.943951606750488, + "learning_rate": 5.567880381080276e-06, + "loss": 1.7857, + "step": 11346 + }, + { + "epoch": 0.8541372626507838, + "grad_norm": 4.0935540199279785, + "learning_rate": 5.562290949312471e-06, + "loss": 1.8435, + "step": 11347 + }, + { + "epoch": 0.8542125369314439, + "grad_norm": 5.209892749786377, + "learning_rate": 5.5567041592389624e-06, + "loss": 1.6656, + "step": 11348 + }, + { + "epoch": 0.8542878112121041, + "grad_norm": 6.323092460632324, + "learning_rate": 5.551120011191868e-06, + "loss": 1.8089, + "step": 11349 + }, + { + "epoch": 0.8543630854927643, + "grad_norm": 5.032996654510498, + "learning_rate": 5.54553850550315e-06, + "loss": 2.1649, + "step": 11350 + }, + { + "epoch": 0.8544383597734244, + "grad_norm": 7.199209690093994, + "learning_rate": 5.539959642504622e-06, + "loss": 1.4405, + "step": 11351 + }, + { + "epoch": 0.8545136340540845, + "grad_norm": 8.381277084350586, + "learning_rate": 5.534383422527917e-06, + "loss": 2.0797, + "step": 11352 + }, + { + "epoch": 0.8545889083347448, + "grad_norm": 4.558521747589111, + "learning_rate": 5.528809845904537e-06, + "loss": 1.8159, + "step": 11353 + }, + { + "epoch": 0.8546641826154049, + "grad_norm": 4.3714165687561035, + "learning_rate": 5.523238912965806e-06, + "loss": 1.7214, + "step": 11354 + }, + { + "epoch": 0.854739456896065, + "grad_norm": 5.791597366333008, + "learning_rate": 5.517670624042909e-06, + "loss": 1.7511, + "step": 11355 + }, + { + "epoch": 0.8548147311767252, + "grad_norm": 4.845280647277832, + "learning_rate": 5.512104979466848e-06, + "loss": 1.7795, + "step": 11356 + }, + { + "epoch": 0.8548900054573854, + "grad_norm": 5.3342084884643555, + "learning_rate": 5.506541979568508e-06, + "loss": 1.5808, + "step": 11357 + }, + { + "epoch": 0.8549652797380455, + "grad_norm": 6.236125469207764, + "learning_rate": 5.50098162467857e-06, + "loss": 1.7899, + "step": 11358 + }, + { + "epoch": 0.8550405540187057, + "grad_norm": 4.513321399688721, + "learning_rate": 5.495423915127596e-06, + "loss": 1.8479, + "step": 11359 + }, + { + "epoch": 0.8551158282993658, + "grad_norm": 6.374955177307129, + "learning_rate": 5.489868851245966e-06, + "loss": 1.9707, + "step": 11360 + }, + { + "epoch": 0.855191102580026, + "grad_norm": 4.890799522399902, + "learning_rate": 5.4843164333639344e-06, + "loss": 1.8111, + "step": 11361 + }, + { + "epoch": 0.8552663768606861, + "grad_norm": 4.762050628662109, + "learning_rate": 5.478766661811552e-06, + "loss": 1.6816, + "step": 11362 + }, + { + "epoch": 0.8553416511413463, + "grad_norm": 5.99746036529541, + "learning_rate": 5.47321953691875e-06, + "loss": 1.5941, + "step": 11363 + }, + { + "epoch": 0.8554169254220064, + "grad_norm": 4.764297962188721, + "learning_rate": 5.467675059015276e-06, + "loss": 1.7951, + "step": 11364 + }, + { + "epoch": 0.8554921997026665, + "grad_norm": 5.52287483215332, + "learning_rate": 5.462133228430749e-06, + "loss": 1.6265, + "step": 11365 + }, + { + "epoch": 0.8555674739833268, + "grad_norm": 4.4598541259765625, + "learning_rate": 5.456594045494612e-06, + "loss": 1.4421, + "step": 11366 + }, + { + "epoch": 0.8556427482639869, + "grad_norm": 4.356582164764404, + "learning_rate": 5.451057510536139e-06, + "loss": 1.6927, + "step": 11367 + }, + { + "epoch": 0.855718022544647, + "grad_norm": 4.479369163513184, + "learning_rate": 5.445523623884485e-06, + "loss": 1.6539, + "step": 11368 + }, + { + "epoch": 0.8557932968253072, + "grad_norm": 5.862975597381592, + "learning_rate": 5.439992385868598e-06, + "loss": 1.9353, + "step": 11369 + }, + { + "epoch": 0.8558685711059674, + "grad_norm": 6.235518455505371, + "learning_rate": 5.434463796817307e-06, + "loss": 1.4789, + "step": 11370 + }, + { + "epoch": 0.8559438453866275, + "grad_norm": 6.50362491607666, + "learning_rate": 5.428937857059274e-06, + "loss": 1.5927, + "step": 11371 + }, + { + "epoch": 0.8560191196672877, + "grad_norm": 5.840924263000488, + "learning_rate": 5.423414566923002e-06, + "loss": 1.6784, + "step": 11372 + }, + { + "epoch": 0.8560943939479478, + "grad_norm": 4.376617908477783, + "learning_rate": 5.4178939267368226e-06, + "loss": 1.8526, + "step": 11373 + }, + { + "epoch": 0.856169668228608, + "grad_norm": 6.432030200958252, + "learning_rate": 5.4123759368289336e-06, + "loss": 1.7891, + "step": 11374 + }, + { + "epoch": 0.8562449425092682, + "grad_norm": 4.809435844421387, + "learning_rate": 5.4068605975273525e-06, + "loss": 1.9444, + "step": 11375 + }, + { + "epoch": 0.8563202167899283, + "grad_norm": 6.031755447387695, + "learning_rate": 5.401347909159971e-06, + "loss": 1.7005, + "step": 11376 + }, + { + "epoch": 0.8563954910705884, + "grad_norm": 6.395382881164551, + "learning_rate": 5.395837872054471e-06, + "loss": 1.6492, + "step": 11377 + }, + { + "epoch": 0.8564707653512487, + "grad_norm": 5.86688756942749, + "learning_rate": 5.390330486538442e-06, + "loss": 1.5799, + "step": 11378 + }, + { + "epoch": 0.8565460396319088, + "grad_norm": 7.687005519866943, + "learning_rate": 5.384825752939254e-06, + "loss": 1.5125, + "step": 11379 + }, + { + "epoch": 0.8566213139125689, + "grad_norm": 9.41920280456543, + "learning_rate": 5.3793236715841574e-06, + "loss": 2.1686, + "step": 11380 + }, + { + "epoch": 0.856696588193229, + "grad_norm": 5.602234840393066, + "learning_rate": 5.373824242800241e-06, + "loss": 1.7412, + "step": 11381 + }, + { + "epoch": 0.8567718624738893, + "grad_norm": 4.371423721313477, + "learning_rate": 5.368327466914425e-06, + "loss": 1.715, + "step": 11382 + }, + { + "epoch": 0.8568471367545494, + "grad_norm": 4.858772277832031, + "learning_rate": 5.362833344253487e-06, + "loss": 1.7312, + "step": 11383 + }, + { + "epoch": 0.8569224110352095, + "grad_norm": 5.651652812957764, + "learning_rate": 5.3573418751440175e-06, + "loss": 1.6394, + "step": 11384 + }, + { + "epoch": 0.8569976853158697, + "grad_norm": 4.63021993637085, + "learning_rate": 5.351853059912493e-06, + "loss": 1.5859, + "step": 11385 + }, + { + "epoch": 0.8570729595965298, + "grad_norm": 5.911614894866943, + "learning_rate": 5.34636689888518e-06, + "loss": 2.047, + "step": 11386 + }, + { + "epoch": 0.85714823387719, + "grad_norm": 4.582193374633789, + "learning_rate": 5.340883392388246e-06, + "loss": 1.7452, + "step": 11387 + }, + { + "epoch": 0.8572235081578502, + "grad_norm": 4.856236457824707, + "learning_rate": 5.335402540747636e-06, + "loss": 1.4459, + "step": 11388 + }, + { + "epoch": 0.8572987824385103, + "grad_norm": 4.626628875732422, + "learning_rate": 5.3299243442892e-06, + "loss": 1.8276, + "step": 11389 + }, + { + "epoch": 0.8573740567191704, + "grad_norm": 5.238063335418701, + "learning_rate": 5.324448803338577e-06, + "loss": 1.8916, + "step": 11390 + }, + { + "epoch": 0.8574493309998307, + "grad_norm": 5.1228556632995605, + "learning_rate": 5.318975918221292e-06, + "loss": 2.0314, + "step": 11391 + }, + { + "epoch": 0.8575246052804908, + "grad_norm": 6.85102653503418, + "learning_rate": 5.313505689262688e-06, + "loss": 1.45, + "step": 11392 + }, + { + "epoch": 0.8575998795611509, + "grad_norm": 4.33017635345459, + "learning_rate": 5.308038116787939e-06, + "loss": 1.6836, + "step": 11393 + }, + { + "epoch": 0.8576751538418111, + "grad_norm": 6.9080023765563965, + "learning_rate": 5.302573201122091e-06, + "loss": 1.3402, + "step": 11394 + }, + { + "epoch": 0.8577504281224713, + "grad_norm": 5.097667694091797, + "learning_rate": 5.297110942590028e-06, + "loss": 1.6407, + "step": 11395 + }, + { + "epoch": 0.8578257024031314, + "grad_norm": 6.422938346862793, + "learning_rate": 5.29165134151644e-06, + "loss": 1.7441, + "step": 11396 + }, + { + "epoch": 0.8579009766837916, + "grad_norm": 6.0826616287231445, + "learning_rate": 5.2861943982259025e-06, + "loss": 2.12, + "step": 11397 + }, + { + "epoch": 0.8579762509644517, + "grad_norm": 5.136501789093018, + "learning_rate": 5.280740113042803e-06, + "loss": 1.6658, + "step": 11398 + }, + { + "epoch": 0.8580515252451119, + "grad_norm": 5.7545905113220215, + "learning_rate": 5.275288486291391e-06, + "loss": 1.9354, + "step": 11399 + }, + { + "epoch": 0.858126799525772, + "grad_norm": 6.818969249725342, + "learning_rate": 5.2698395182957605e-06, + "loss": 1.9808, + "step": 11400 + }, + { + "epoch": 0.8582020738064322, + "grad_norm": 5.798734188079834, + "learning_rate": 5.264393209379814e-06, + "loss": 1.7621, + "step": 11401 + }, + { + "epoch": 0.8582773480870923, + "grad_norm": 5.587700366973877, + "learning_rate": 5.258949559867338e-06, + "loss": 1.4785, + "step": 11402 + }, + { + "epoch": 0.8583526223677524, + "grad_norm": 5.608959197998047, + "learning_rate": 5.253508570081928e-06, + "loss": 1.5538, + "step": 11403 + }, + { + "epoch": 0.8584278966484127, + "grad_norm": 5.877653121948242, + "learning_rate": 5.2480702403470415e-06, + "loss": 1.7898, + "step": 11404 + }, + { + "epoch": 0.8585031709290728, + "grad_norm": 5.400816917419434, + "learning_rate": 5.242634570985966e-06, + "loss": 1.4356, + "step": 11405 + }, + { + "epoch": 0.8585784452097329, + "grad_norm": 5.04696798324585, + "learning_rate": 5.237201562321858e-06, + "loss": 1.619, + "step": 11406 + }, + { + "epoch": 0.8586537194903932, + "grad_norm": 4.8833112716674805, + "learning_rate": 5.231771214677667e-06, + "loss": 1.8486, + "step": 11407 + }, + { + "epoch": 0.8587289937710533, + "grad_norm": 5.007887840270996, + "learning_rate": 5.226343528376226e-06, + "loss": 1.7479, + "step": 11408 + }, + { + "epoch": 0.8588042680517134, + "grad_norm": 4.869284629821777, + "learning_rate": 5.220918503740191e-06, + "loss": 1.6167, + "step": 11409 + }, + { + "epoch": 0.8588795423323736, + "grad_norm": 5.048999309539795, + "learning_rate": 5.215496141092075e-06, + "loss": 1.7723, + "step": 11410 + }, + { + "epoch": 0.8589548166130337, + "grad_norm": 8.82483959197998, + "learning_rate": 5.210076440754197e-06, + "loss": 1.7855, + "step": 11411 + }, + { + "epoch": 0.8590300908936939, + "grad_norm": 5.169203758239746, + "learning_rate": 5.204659403048767e-06, + "loss": 1.8576, + "step": 11412 + }, + { + "epoch": 0.8591053651743541, + "grad_norm": 5.934205055236816, + "learning_rate": 5.1992450282978e-06, + "loss": 1.8864, + "step": 11413 + }, + { + "epoch": 0.8591806394550142, + "grad_norm": 5.172539710998535, + "learning_rate": 5.1938333168231656e-06, + "loss": 1.5112, + "step": 11414 + }, + { + "epoch": 0.8592559137356743, + "grad_norm": 6.376372814178467, + "learning_rate": 5.188424268946573e-06, + "loss": 1.9112, + "step": 11415 + }, + { + "epoch": 0.8593311880163346, + "grad_norm": 5.443299770355225, + "learning_rate": 5.183017884989583e-06, + "loss": 2.0688, + "step": 11416 + }, + { + "epoch": 0.8594064622969947, + "grad_norm": 6.428813934326172, + "learning_rate": 5.177614165273597e-06, + "loss": 1.6477, + "step": 11417 + }, + { + "epoch": 0.8594817365776548, + "grad_norm": 5.511814594268799, + "learning_rate": 5.172213110119823e-06, + "loss": 1.4918, + "step": 11418 + }, + { + "epoch": 0.859557010858315, + "grad_norm": 6.020489692687988, + "learning_rate": 5.166814719849372e-06, + "loss": 1.8941, + "step": 11419 + }, + { + "epoch": 0.8596322851389752, + "grad_norm": 5.4443888664245605, + "learning_rate": 5.161418994783129e-06, + "loss": 1.8701, + "step": 11420 + }, + { + "epoch": 0.8597075594196353, + "grad_norm": 5.209005832672119, + "learning_rate": 5.156025935241881e-06, + "loss": 1.8178, + "step": 11421 + }, + { + "epoch": 0.8597828337002954, + "grad_norm": 4.597990989685059, + "learning_rate": 5.150635541546211e-06, + "loss": 1.6371, + "step": 11422 + }, + { + "epoch": 0.8598581079809556, + "grad_norm": 5.521476745605469, + "learning_rate": 5.145247814016579e-06, + "loss": 1.9045, + "step": 11423 + }, + { + "epoch": 0.8599333822616158, + "grad_norm": 8.375419616699219, + "learning_rate": 5.139862752973257e-06, + "loss": 1.9795, + "step": 11424 + }, + { + "epoch": 0.8600086565422759, + "grad_norm": 5.825814723968506, + "learning_rate": 5.134480358736371e-06, + "loss": 1.6392, + "step": 11425 + }, + { + "epoch": 0.8600839308229361, + "grad_norm": 5.737662315368652, + "learning_rate": 5.129100631625894e-06, + "loss": 2.0108, + "step": 11426 + }, + { + "epoch": 0.8601592051035962, + "grad_norm": 4.838788032531738, + "learning_rate": 5.123723571961647e-06, + "loss": 1.5939, + "step": 11427 + }, + { + "epoch": 0.8602344793842563, + "grad_norm": 5.150355339050293, + "learning_rate": 5.118349180063259e-06, + "loss": 2.2611, + "step": 11428 + }, + { + "epoch": 0.8603097536649166, + "grad_norm": 7.665215015411377, + "learning_rate": 5.112977456250245e-06, + "loss": 2.1151, + "step": 11429 + }, + { + "epoch": 0.8603850279455767, + "grad_norm": 7.475167751312256, + "learning_rate": 5.107608400841913e-06, + "loss": 1.7025, + "step": 11430 + }, + { + "epoch": 0.8604603022262368, + "grad_norm": 6.16090726852417, + "learning_rate": 5.102242014157449e-06, + "loss": 1.6492, + "step": 11431 + }, + { + "epoch": 0.860535576506897, + "grad_norm": 5.874026298522949, + "learning_rate": 5.096878296515883e-06, + "loss": 1.6016, + "step": 11432 + }, + { + "epoch": 0.8606108507875572, + "grad_norm": 4.7230119705200195, + "learning_rate": 5.0915172482360556e-06, + "loss": 1.871, + "step": 11433 + }, + { + "epoch": 0.8606861250682173, + "grad_norm": 4.541685581207275, + "learning_rate": 5.086158869636676e-06, + "loss": 1.4608, + "step": 11434 + }, + { + "epoch": 0.8607613993488775, + "grad_norm": 4.855148792266846, + "learning_rate": 5.080803161036268e-06, + "loss": 2.007, + "step": 11435 + }, + { + "epoch": 0.8608366736295376, + "grad_norm": 6.244253158569336, + "learning_rate": 5.075450122753228e-06, + "loss": 1.9192, + "step": 11436 + }, + { + "epoch": 0.8609119479101978, + "grad_norm": 4.192311763763428, + "learning_rate": 5.070099755105772e-06, + "loss": 1.8702, + "step": 11437 + }, + { + "epoch": 0.860987222190858, + "grad_norm": 5.897631645202637, + "learning_rate": 5.064752058411975e-06, + "loss": 1.6425, + "step": 11438 + }, + { + "epoch": 0.8610624964715181, + "grad_norm": 10.919322967529297, + "learning_rate": 5.059407032989733e-06, + "loss": 1.8024, + "step": 11439 + }, + { + "epoch": 0.8611377707521782, + "grad_norm": 6.1647539138793945, + "learning_rate": 5.054064679156795e-06, + "loss": 1.8826, + "step": 11440 + }, + { + "epoch": 0.8612130450328384, + "grad_norm": 4.032434463500977, + "learning_rate": 5.0487249972307415e-06, + "loss": 1.9875, + "step": 11441 + }, + { + "epoch": 0.8612883193134986, + "grad_norm": 5.4214277267456055, + "learning_rate": 5.043387987529019e-06, + "loss": 1.5636, + "step": 11442 + }, + { + "epoch": 0.8613635935941587, + "grad_norm": 5.4388041496276855, + "learning_rate": 5.038053650368874e-06, + "loss": 1.8962, + "step": 11443 + }, + { + "epoch": 0.8614388678748188, + "grad_norm": 5.1301093101501465, + "learning_rate": 5.03272198606744e-06, + "loss": 1.9354, + "step": 11444 + }, + { + "epoch": 0.8615141421554791, + "grad_norm": 6.499573230743408, + "learning_rate": 5.027392994941643e-06, + "loss": 2.0367, + "step": 11445 + }, + { + "epoch": 0.8615894164361392, + "grad_norm": 5.132656097412109, + "learning_rate": 5.022066677308296e-06, + "loss": 1.7381, + "step": 11446 + }, + { + "epoch": 0.8616646907167993, + "grad_norm": 5.67801570892334, + "learning_rate": 5.016743033484039e-06, + "loss": 1.9379, + "step": 11447 + }, + { + "epoch": 0.8617399649974595, + "grad_norm": 4.897541522979736, + "learning_rate": 5.011422063785332e-06, + "loss": 1.7281, + "step": 11448 + }, + { + "epoch": 0.8618152392781196, + "grad_norm": 7.580458641052246, + "learning_rate": 5.006103768528486e-06, + "loss": 1.897, + "step": 11449 + }, + { + "epoch": 0.8618905135587798, + "grad_norm": 4.290918350219727, + "learning_rate": 5.000788148029678e-06, + "loss": 1.4936, + "step": 11450 + }, + { + "epoch": 0.86196578783944, + "grad_norm": 4.917287349700928, + "learning_rate": 4.995475202604905e-06, + "loss": 1.9685, + "step": 11451 + }, + { + "epoch": 0.8620410621201001, + "grad_norm": 4.892531394958496, + "learning_rate": 4.990164932569985e-06, + "loss": 1.6736, + "step": 11452 + }, + { + "epoch": 0.8621163364007602, + "grad_norm": 5.9600067138671875, + "learning_rate": 4.984857338240623e-06, + "loss": 1.6307, + "step": 11453 + }, + { + "epoch": 0.8621916106814205, + "grad_norm": 6.309071063995361, + "learning_rate": 4.979552419932327e-06, + "loss": 1.7524, + "step": 11454 + }, + { + "epoch": 0.8622668849620806, + "grad_norm": 4.425530433654785, + "learning_rate": 4.97425017796046e-06, + "loss": 2.0199, + "step": 11455 + }, + { + "epoch": 0.8623421592427407, + "grad_norm": 4.341009616851807, + "learning_rate": 4.9689506126402216e-06, + "loss": 1.7633, + "step": 11456 + }, + { + "epoch": 0.862417433523401, + "grad_norm": 6.090918064117432, + "learning_rate": 4.963653724286671e-06, + "loss": 1.8364, + "step": 11457 + }, + { + "epoch": 0.8624927078040611, + "grad_norm": 4.086336135864258, + "learning_rate": 4.958359513214678e-06, + "loss": 1.4659, + "step": 11458 + }, + { + "epoch": 0.8625679820847212, + "grad_norm": 5.7656402587890625, + "learning_rate": 4.953067979738968e-06, + "loss": 2.0907, + "step": 11459 + }, + { + "epoch": 0.8626432563653813, + "grad_norm": 5.543933391571045, + "learning_rate": 4.947779124174112e-06, + "loss": 1.6493, + "step": 11460 + }, + { + "epoch": 0.8627185306460415, + "grad_norm": 5.281326770782471, + "learning_rate": 4.942492946834526e-06, + "loss": 1.6564, + "step": 11461 + }, + { + "epoch": 0.8627938049267017, + "grad_norm": 6.59226131439209, + "learning_rate": 4.937209448034441e-06, + "loss": 2.1814, + "step": 11462 + }, + { + "epoch": 0.8628690792073618, + "grad_norm": 5.21685791015625, + "learning_rate": 4.931928628087956e-06, + "loss": 1.5481, + "step": 11463 + }, + { + "epoch": 0.862944353488022, + "grad_norm": 5.575679779052734, + "learning_rate": 4.926650487309009e-06, + "loss": 1.6405, + "step": 11464 + }, + { + "epoch": 0.8630196277686821, + "grad_norm": 6.082661151885986, + "learning_rate": 4.921375026011349e-06, + "loss": 1.8297, + "step": 11465 + }, + { + "epoch": 0.8630949020493422, + "grad_norm": 4.544851779937744, + "learning_rate": 4.916102244508614e-06, + "loss": 1.7704, + "step": 11466 + }, + { + "epoch": 0.8631701763300025, + "grad_norm": 4.473884582519531, + "learning_rate": 4.910832143114225e-06, + "loss": 1.509, + "step": 11467 + }, + { + "epoch": 0.8632454506106626, + "grad_norm": 4.714999198913574, + "learning_rate": 4.905564722141498e-06, + "loss": 1.5481, + "step": 11468 + }, + { + "epoch": 0.8633207248913227, + "grad_norm": 5.141282558441162, + "learning_rate": 4.900299981903556e-06, + "loss": 1.5963, + "step": 11469 + }, + { + "epoch": 0.863395999171983, + "grad_norm": 6.951136589050293, + "learning_rate": 4.895037922713369e-06, + "loss": 1.6883, + "step": 11470 + }, + { + "epoch": 0.8634712734526431, + "grad_norm": 7.007379055023193, + "learning_rate": 4.8897785448837555e-06, + "loss": 1.4552, + "step": 11471 + }, + { + "epoch": 0.8635465477333032, + "grad_norm": 7.2951130867004395, + "learning_rate": 4.884521848727386e-06, + "loss": 1.6371, + "step": 11472 + }, + { + "epoch": 0.8636218220139634, + "grad_norm": 4.26556396484375, + "learning_rate": 4.879267834556728e-06, + "loss": 1.997, + "step": 11473 + }, + { + "epoch": 0.8636970962946235, + "grad_norm": 5.4073967933654785, + "learning_rate": 4.874016502684142e-06, + "loss": 1.6618, + "step": 11474 + }, + { + "epoch": 0.8637723705752837, + "grad_norm": 4.163093566894531, + "learning_rate": 4.868767853421785e-06, + "loss": 1.7336, + "step": 11475 + }, + { + "epoch": 0.8638476448559439, + "grad_norm": 7.2789106369018555, + "learning_rate": 4.86352188708169e-06, + "loss": 1.9649, + "step": 11476 + }, + { + "epoch": 0.863922919136604, + "grad_norm": 5.7152485847473145, + "learning_rate": 4.8582786039757025e-06, + "loss": 1.8087, + "step": 11477 + }, + { + "epoch": 0.8639981934172641, + "grad_norm": 4.291051864624023, + "learning_rate": 4.8530380044155274e-06, + "loss": 1.491, + "step": 11478 + }, + { + "epoch": 0.8640734676979243, + "grad_norm": 6.267467021942139, + "learning_rate": 4.847800088712706e-06, + "loss": 1.7059, + "step": 11479 + }, + { + "epoch": 0.8641487419785845, + "grad_norm": 4.5645341873168945, + "learning_rate": 4.842564857178605e-06, + "loss": 1.5515, + "step": 11480 + }, + { + "epoch": 0.8642240162592446, + "grad_norm": 5.338619232177734, + "learning_rate": 4.837332310124454e-06, + "loss": 1.8482, + "step": 11481 + }, + { + "epoch": 0.8642992905399047, + "grad_norm": 3.888235330581665, + "learning_rate": 4.832102447861309e-06, + "loss": 1.9462, + "step": 11482 + }, + { + "epoch": 0.864374564820565, + "grad_norm": 4.248838424682617, + "learning_rate": 4.826875270700082e-06, + "loss": 1.9611, + "step": 11483 + }, + { + "epoch": 0.8644498391012251, + "grad_norm": 4.586554527282715, + "learning_rate": 4.821650778951492e-06, + "loss": 1.7218, + "step": 11484 + }, + { + "epoch": 0.8645251133818852, + "grad_norm": 5.409714698791504, + "learning_rate": 4.816428972926146e-06, + "loss": 1.6721, + "step": 11485 + }, + { + "epoch": 0.8646003876625454, + "grad_norm": 4.3038458824157715, + "learning_rate": 4.8112098529344395e-06, + "loss": 1.5473, + "step": 11486 + }, + { + "epoch": 0.8646756619432056, + "grad_norm": 4.266467571258545, + "learning_rate": 4.805993419286658e-06, + "loss": 1.8832, + "step": 11487 + }, + { + "epoch": 0.8647509362238657, + "grad_norm": 6.867487907409668, + "learning_rate": 4.800779672292882e-06, + "loss": 1.7353, + "step": 11488 + }, + { + "epoch": 0.8648262105045259, + "grad_norm": 4.510806560516357, + "learning_rate": 4.7955686122630685e-06, + "loss": 1.5294, + "step": 11489 + }, + { + "epoch": 0.864901484785186, + "grad_norm": 5.671181678771973, + "learning_rate": 4.790360239506986e-06, + "loss": 1.7955, + "step": 11490 + }, + { + "epoch": 0.8649767590658461, + "grad_norm": 6.436161994934082, + "learning_rate": 4.78515455433427e-06, + "loss": 1.7936, + "step": 11491 + }, + { + "epoch": 0.8650520333465064, + "grad_norm": 5.025091171264648, + "learning_rate": 4.779951557054391e-06, + "loss": 2.011, + "step": 11492 + }, + { + "epoch": 0.8651273076271665, + "grad_norm": 6.753842830657959, + "learning_rate": 4.774751247976628e-06, + "loss": 2.0031, + "step": 11493 + }, + { + "epoch": 0.8652025819078266, + "grad_norm": 5.108877658843994, + "learning_rate": 4.769553627410134e-06, + "loss": 1.5295, + "step": 11494 + }, + { + "epoch": 0.8652778561884868, + "grad_norm": 5.935607433319092, + "learning_rate": 4.7643586956639005e-06, + "loss": 1.5896, + "step": 11495 + }, + { + "epoch": 0.865353130469147, + "grad_norm": 4.038177967071533, + "learning_rate": 4.759166453046754e-06, + "loss": 1.9277, + "step": 11496 + }, + { + "epoch": 0.8654284047498071, + "grad_norm": 5.444188117980957, + "learning_rate": 4.753976899867346e-06, + "loss": 1.7296, + "step": 11497 + }, + { + "epoch": 0.8655036790304672, + "grad_norm": 4.549103260040283, + "learning_rate": 4.748790036434192e-06, + "loss": 1.8206, + "step": 11498 + }, + { + "epoch": 0.8655789533111274, + "grad_norm": 6.194704532623291, + "learning_rate": 4.7436058630556225e-06, + "loss": 1.6857, + "step": 11499 + }, + { + "epoch": 0.8656542275917876, + "grad_norm": 6.0705037117004395, + "learning_rate": 4.738424380039835e-06, + "loss": 1.6978, + "step": 11500 + }, + { + "epoch": 0.8657295018724477, + "grad_norm": 4.544860363006592, + "learning_rate": 4.733245587694846e-06, + "loss": 1.5882, + "step": 11501 + }, + { + "epoch": 0.8658047761531079, + "grad_norm": 8.552252769470215, + "learning_rate": 4.728069486328524e-06, + "loss": 2.0228, + "step": 11502 + }, + { + "epoch": 0.865880050433768, + "grad_norm": 5.126590251922607, + "learning_rate": 4.722896076248568e-06, + "loss": 2.0219, + "step": 11503 + }, + { + "epoch": 0.8659553247144282, + "grad_norm": 6.823358058929443, + "learning_rate": 4.71772535776252e-06, + "loss": 2.0208, + "step": 11504 + }, + { + "epoch": 0.8660305989950884, + "grad_norm": 5.678152084350586, + "learning_rate": 4.712557331177775e-06, + "loss": 1.6206, + "step": 11505 + }, + { + "epoch": 0.8661058732757485, + "grad_norm": 7.036484718322754, + "learning_rate": 4.707391996801558e-06, + "loss": 1.5981, + "step": 11506 + }, + { + "epoch": 0.8661811475564086, + "grad_norm": 5.370208740234375, + "learning_rate": 4.702229354940918e-06, + "loss": 1.5341, + "step": 11507 + }, + { + "epoch": 0.8662564218370689, + "grad_norm": 7.007867813110352, + "learning_rate": 4.697069405902782e-06, + "loss": 1.8734, + "step": 11508 + }, + { + "epoch": 0.866331696117729, + "grad_norm": 4.613592147827148, + "learning_rate": 4.691912149993866e-06, + "loss": 1.4372, + "step": 11509 + }, + { + "epoch": 0.8664069703983891, + "grad_norm": 4.310728073120117, + "learning_rate": 4.686757587520774e-06, + "loss": 1.4544, + "step": 11510 + }, + { + "epoch": 0.8664822446790493, + "grad_norm": 4.814352512359619, + "learning_rate": 4.681605718789933e-06, + "loss": 2.0673, + "step": 11511 + }, + { + "epoch": 0.8665575189597094, + "grad_norm": 4.7128987312316895, + "learning_rate": 4.6764565441075935e-06, + "loss": 1.4206, + "step": 11512 + }, + { + "epoch": 0.8666327932403696, + "grad_norm": 4.8484883308410645, + "learning_rate": 4.671310063779871e-06, + "loss": 1.6717, + "step": 11513 + }, + { + "epoch": 0.8667080675210298, + "grad_norm": 5.862910270690918, + "learning_rate": 4.666166278112688e-06, + "loss": 1.4175, + "step": 11514 + }, + { + "epoch": 0.8667833418016899, + "grad_norm": 5.032711505889893, + "learning_rate": 4.661025187411849e-06, + "loss": 1.589, + "step": 11515 + }, + { + "epoch": 0.86685861608235, + "grad_norm": 4.635819911956787, + "learning_rate": 4.655886791982972e-06, + "loss": 1.4409, + "step": 11516 + }, + { + "epoch": 0.8669338903630103, + "grad_norm": 4.501954555511475, + "learning_rate": 4.650751092131523e-06, + "loss": 1.7449, + "step": 11517 + }, + { + "epoch": 0.8670091646436704, + "grad_norm": 5.654611587524414, + "learning_rate": 4.64561808816279e-06, + "loss": 1.9805, + "step": 11518 + }, + { + "epoch": 0.8670844389243305, + "grad_norm": 9.355611801147461, + "learning_rate": 4.640487780381936e-06, + "loss": 1.7205, + "step": 11519 + }, + { + "epoch": 0.8671597132049906, + "grad_norm": 5.797630310058594, + "learning_rate": 4.635360169093928e-06, + "loss": 1.4642, + "step": 11520 + }, + { + "epoch": 0.8672349874856509, + "grad_norm": 4.173593521118164, + "learning_rate": 4.6302352546035935e-06, + "loss": 1.5482, + "step": 11521 + }, + { + "epoch": 0.867310261766311, + "grad_norm": 6.74575662612915, + "learning_rate": 4.625113037215589e-06, + "loss": 2.0897, + "step": 11522 + }, + { + "epoch": 0.8673855360469711, + "grad_norm": 5.9245381355285645, + "learning_rate": 4.619993517234428e-06, + "loss": 1.6388, + "step": 11523 + }, + { + "epoch": 0.8674608103276313, + "grad_norm": 4.278258323669434, + "learning_rate": 4.614876694964432e-06, + "loss": 1.8343, + "step": 11524 + }, + { + "epoch": 0.8675360846082915, + "grad_norm": 3.93045711517334, + "learning_rate": 4.6097625707097914e-06, + "loss": 1.3582, + "step": 11525 + }, + { + "epoch": 0.8676113588889516, + "grad_norm": 4.08480978012085, + "learning_rate": 4.604651144774524e-06, + "loss": 1.8225, + "step": 11526 + }, + { + "epoch": 0.8676866331696118, + "grad_norm": 4.731809139251709, + "learning_rate": 4.599542417462499e-06, + "loss": 1.5408, + "step": 11527 + }, + { + "epoch": 0.8677619074502719, + "grad_norm": 6.7461676597595215, + "learning_rate": 4.594436389077411e-06, + "loss": 1.4332, + "step": 11528 + }, + { + "epoch": 0.867837181730932, + "grad_norm": 4.909346103668213, + "learning_rate": 4.58933305992279e-06, + "loss": 2.0213, + "step": 11529 + }, + { + "epoch": 0.8679124560115923, + "grad_norm": 6.06249475479126, + "learning_rate": 4.584232430302033e-06, + "loss": 2.0043, + "step": 11530 + }, + { + "epoch": 0.8679877302922524, + "grad_norm": 5.091506004333496, + "learning_rate": 4.579134500518334e-06, + "loss": 1.7015, + "step": 11531 + }, + { + "epoch": 0.8680630045729125, + "grad_norm": 6.5612993240356445, + "learning_rate": 4.574039270874775e-06, + "loss": 2.2226, + "step": 11532 + }, + { + "epoch": 0.8681382788535728, + "grad_norm": 4.921543598175049, + "learning_rate": 4.5689467416742234e-06, + "loss": 1.5816, + "step": 11533 + }, + { + "epoch": 0.8682135531342329, + "grad_norm": 6.254297733306885, + "learning_rate": 4.5638569132194446e-06, + "loss": 1.5923, + "step": 11534 + }, + { + "epoch": 0.868288827414893, + "grad_norm": 5.603431701660156, + "learning_rate": 4.558769785812994e-06, + "loss": 1.4275, + "step": 11535 + }, + { + "epoch": 0.8683641016955532, + "grad_norm": 4.965296268463135, + "learning_rate": 4.553685359757287e-06, + "loss": 1.3709, + "step": 11536 + }, + { + "epoch": 0.8684393759762133, + "grad_norm": 7.3806471824646, + "learning_rate": 4.548603635354592e-06, + "loss": 1.9007, + "step": 11537 + }, + { + "epoch": 0.8685146502568735, + "grad_norm": 6.049238681793213, + "learning_rate": 4.543524612907002e-06, + "loss": 1.8835, + "step": 11538 + }, + { + "epoch": 0.8685899245375336, + "grad_norm": 6.568185806274414, + "learning_rate": 4.538448292716441e-06, + "loss": 2.0711, + "step": 11539 + }, + { + "epoch": 0.8686651988181938, + "grad_norm": 4.970886707305908, + "learning_rate": 4.533374675084689e-06, + "loss": 1.6045, + "step": 11540 + }, + { + "epoch": 0.8687404730988539, + "grad_norm": 5.119251251220703, + "learning_rate": 4.528303760313346e-06, + "loss": 1.811, + "step": 11541 + }, + { + "epoch": 0.8688157473795141, + "grad_norm": 7.422088146209717, + "learning_rate": 4.52323554870388e-06, + "loss": 1.6024, + "step": 11542 + }, + { + "epoch": 0.8688910216601743, + "grad_norm": 5.681402683258057, + "learning_rate": 4.518170040557568e-06, + "loss": 1.6538, + "step": 11543 + }, + { + "epoch": 0.8689662959408344, + "grad_norm": 5.216109275817871, + "learning_rate": 4.51310723617554e-06, + "loss": 1.6513, + "step": 11544 + }, + { + "epoch": 0.8690415702214945, + "grad_norm": 6.236734390258789, + "learning_rate": 4.508047135858778e-06, + "loss": 1.7925, + "step": 11545 + }, + { + "epoch": 0.8691168445021548, + "grad_norm": 4.702597141265869, + "learning_rate": 4.502989739908081e-06, + "loss": 1.7424, + "step": 11546 + }, + { + "epoch": 0.8691921187828149, + "grad_norm": 5.18388032913208, + "learning_rate": 4.497935048624102e-06, + "loss": 1.6439, + "step": 11547 + }, + { + "epoch": 0.869267393063475, + "grad_norm": 6.318454265594482, + "learning_rate": 4.492883062307318e-06, + "loss": 1.6407, + "step": 11548 + }, + { + "epoch": 0.8693426673441352, + "grad_norm": 4.4423065185546875, + "learning_rate": 4.4878337812580605e-06, + "loss": 1.6859, + "step": 11549 + }, + { + "epoch": 0.8694179416247954, + "grad_norm": 4.101556777954102, + "learning_rate": 4.482787205776495e-06, + "loss": 1.6305, + "step": 11550 + }, + { + "epoch": 0.8694932159054555, + "grad_norm": 5.198694705963135, + "learning_rate": 4.477743336162638e-06, + "loss": 1.7561, + "step": 11551 + }, + { + "epoch": 0.8695684901861157, + "grad_norm": 3.577151298522949, + "learning_rate": 4.472702172716309e-06, + "loss": 1.8221, + "step": 11552 + }, + { + "epoch": 0.8696437644667758, + "grad_norm": 4.889743328094482, + "learning_rate": 4.467663715737214e-06, + "loss": 1.7028, + "step": 11553 + }, + { + "epoch": 0.869719038747436, + "grad_norm": 5.674793243408203, + "learning_rate": 4.462627965524851e-06, + "loss": 1.8969, + "step": 11554 + }, + { + "epoch": 0.8697943130280962, + "grad_norm": 8.243905067443848, + "learning_rate": 4.4575949223786105e-06, + "loss": 2.041, + "step": 11555 + }, + { + "epoch": 0.8698695873087563, + "grad_norm": 4.411231994628906, + "learning_rate": 4.452564586597663e-06, + "loss": 1.9132, + "step": 11556 + }, + { + "epoch": 0.8699448615894164, + "grad_norm": 3.9965784549713135, + "learning_rate": 4.447536958481069e-06, + "loss": 1.3917, + "step": 11557 + }, + { + "epoch": 0.8700201358700765, + "grad_norm": 6.580601692199707, + "learning_rate": 4.44251203832769e-06, + "loss": 1.8265, + "step": 11558 + }, + { + "epoch": 0.8700954101507368, + "grad_norm": 5.445290565490723, + "learning_rate": 4.437489826436253e-06, + "loss": 1.8683, + "step": 11559 + }, + { + "epoch": 0.8701706844313969, + "grad_norm": 5.180593013763428, + "learning_rate": 4.432470323105309e-06, + "loss": 1.9632, + "step": 11560 + }, + { + "epoch": 0.870245958712057, + "grad_norm": 6.6374735832214355, + "learning_rate": 4.427453528633263e-06, + "loss": 1.6555, + "step": 11561 + }, + { + "epoch": 0.8703212329927172, + "grad_norm": 5.852902889251709, + "learning_rate": 4.422439443318343e-06, + "loss": 2.0719, + "step": 11562 + }, + { + "epoch": 0.8703965072733774, + "grad_norm": 6.008835792541504, + "learning_rate": 4.417428067458618e-06, + "loss": 1.4886, + "step": 11563 + }, + { + "epoch": 0.8704717815540375, + "grad_norm": 6.359923362731934, + "learning_rate": 4.412419401352008e-06, + "loss": 2.0294, + "step": 11564 + }, + { + "epoch": 0.8705470558346977, + "grad_norm": 4.297892093658447, + "learning_rate": 4.407413445296255e-06, + "loss": 1.7915, + "step": 11565 + }, + { + "epoch": 0.8706223301153578, + "grad_norm": 5.2801594734191895, + "learning_rate": 4.402410199588958e-06, + "loss": 1.9035, + "step": 11566 + }, + { + "epoch": 0.870697604396018, + "grad_norm": 4.736752033233643, + "learning_rate": 4.3974096645275354e-06, + "loss": 1.319, + "step": 11567 + }, + { + "epoch": 0.8707728786766782, + "grad_norm": 4.760752201080322, + "learning_rate": 4.392411840409266e-06, + "loss": 1.5775, + "step": 11568 + }, + { + "epoch": 0.8708481529573383, + "grad_norm": 5.854061603546143, + "learning_rate": 4.38741672753124e-06, + "loss": 1.7196, + "step": 11569 + }, + { + "epoch": 0.8709234272379984, + "grad_norm": 5.108160972595215, + "learning_rate": 4.382424326190415e-06, + "loss": 1.7086, + "step": 11570 + }, + { + "epoch": 0.8709987015186587, + "grad_norm": 5.345641613006592, + "learning_rate": 4.37743463668357e-06, + "loss": 1.5626, + "step": 11571 + }, + { + "epoch": 0.8710739757993188, + "grad_norm": 5.871346473693848, + "learning_rate": 4.372447659307338e-06, + "loss": 1.2776, + "step": 11572 + }, + { + "epoch": 0.8711492500799789, + "grad_norm": 7.1747212409973145, + "learning_rate": 4.367463394358168e-06, + "loss": 1.9189, + "step": 11573 + }, + { + "epoch": 0.8712245243606391, + "grad_norm": 4.06326961517334, + "learning_rate": 4.362481842132371e-06, + "loss": 1.3152, + "step": 11574 + }, + { + "epoch": 0.8712997986412993, + "grad_norm": 5.53347635269165, + "learning_rate": 4.357503002926072e-06, + "loss": 1.7382, + "step": 11575 + }, + { + "epoch": 0.8713750729219594, + "grad_norm": 4.354653835296631, + "learning_rate": 4.352526877035257e-06, + "loss": 1.5702, + "step": 11576 + }, + { + "epoch": 0.8714503472026195, + "grad_norm": 4.4479169845581055, + "learning_rate": 4.347553464755749e-06, + "loss": 1.4155, + "step": 11577 + }, + { + "epoch": 0.8715256214832797, + "grad_norm": 5.20963716506958, + "learning_rate": 4.342582766383185e-06, + "loss": 1.9486, + "step": 11578 + }, + { + "epoch": 0.8716008957639398, + "grad_norm": 5.951019763946533, + "learning_rate": 4.337614782213079e-06, + "loss": 1.8345, + "step": 11579 + }, + { + "epoch": 0.8716761700446, + "grad_norm": 5.686764240264893, + "learning_rate": 4.332649512540748e-06, + "loss": 1.6027, + "step": 11580 + }, + { + "epoch": 0.8717514443252602, + "grad_norm": 4.664113998413086, + "learning_rate": 4.32768695766137e-06, + "loss": 1.3727, + "step": 11581 + }, + { + "epoch": 0.8718267186059203, + "grad_norm": 5.536030292510986, + "learning_rate": 4.322727117869951e-06, + "loss": 1.9266, + "step": 11582 + }, + { + "epoch": 0.8719019928865804, + "grad_norm": 4.608307361602783, + "learning_rate": 4.317769993461351e-06, + "loss": 1.8797, + "step": 11583 + }, + { + "epoch": 0.8719772671672407, + "grad_norm": 4.955003261566162, + "learning_rate": 4.31281558473024e-06, + "loss": 1.9135, + "step": 11584 + }, + { + "epoch": 0.8720525414479008, + "grad_norm": 4.039622783660889, + "learning_rate": 4.307863891971164e-06, + "loss": 1.7485, + "step": 11585 + }, + { + "epoch": 0.8721278157285609, + "grad_norm": 6.942947864532471, + "learning_rate": 4.302914915478462e-06, + "loss": 2.0713, + "step": 11586 + }, + { + "epoch": 0.8722030900092211, + "grad_norm": 3.8982932567596436, + "learning_rate": 4.297968655546353e-06, + "loss": 1.6418, + "step": 11587 + }, + { + "epoch": 0.8722783642898813, + "grad_norm": 4.942421913146973, + "learning_rate": 4.293025112468868e-06, + "loss": 1.658, + "step": 11588 + }, + { + "epoch": 0.8723536385705414, + "grad_norm": 4.122900485992432, + "learning_rate": 4.288084286539906e-06, + "loss": 1.7429, + "step": 11589 + }, + { + "epoch": 0.8724289128512016, + "grad_norm": 7.055030345916748, + "learning_rate": 4.283146178053155e-06, + "loss": 1.6566, + "step": 11590 + }, + { + "epoch": 0.8725041871318617, + "grad_norm": 5.566137313842773, + "learning_rate": 4.278210787302189e-06, + "loss": 1.5428, + "step": 11591 + }, + { + "epoch": 0.8725794614125219, + "grad_norm": 6.710389137268066, + "learning_rate": 4.273278114580403e-06, + "loss": 1.2971, + "step": 11592 + }, + { + "epoch": 0.8726547356931821, + "grad_norm": 4.844457149505615, + "learning_rate": 4.268348160181035e-06, + "loss": 1.5797, + "step": 11593 + }, + { + "epoch": 0.8727300099738422, + "grad_norm": 6.99944543838501, + "learning_rate": 4.263420924397143e-06, + "loss": 1.8378, + "step": 11594 + }, + { + "epoch": 0.8728052842545023, + "grad_norm": 5.550914287567139, + "learning_rate": 4.2584964075216446e-06, + "loss": 1.6948, + "step": 11595 + }, + { + "epoch": 0.8728805585351624, + "grad_norm": 5.969269752502441, + "learning_rate": 4.2535746098473e-06, + "loss": 1.6399, + "step": 11596 + }, + { + "epoch": 0.8729558328158227, + "grad_norm": 4.773648738861084, + "learning_rate": 4.248655531666668e-06, + "loss": 1.4704, + "step": 11597 + }, + { + "epoch": 0.8730311070964828, + "grad_norm": 4.324612617492676, + "learning_rate": 4.2437391732721985e-06, + "loss": 1.6364, + "step": 11598 + }, + { + "epoch": 0.8731063813771429, + "grad_norm": 5.689640522003174, + "learning_rate": 4.238825534956142e-06, + "loss": 1.7344, + "step": 11599 + }, + { + "epoch": 0.8731816556578031, + "grad_norm": 5.320837020874023, + "learning_rate": 4.233914617010609e-06, + "loss": 1.6702, + "step": 11600 + }, + { + "epoch": 0.8732569299384633, + "grad_norm": 4.304983615875244, + "learning_rate": 4.229006419727527e-06, + "loss": 1.8566, + "step": 11601 + }, + { + "epoch": 0.8733322042191234, + "grad_norm": 4.6210551261901855, + "learning_rate": 4.22410094339869e-06, + "loss": 1.915, + "step": 11602 + }, + { + "epoch": 0.8734074784997836, + "grad_norm": 4.775902271270752, + "learning_rate": 4.2191981883156985e-06, + "loss": 1.7336, + "step": 11603 + }, + { + "epoch": 0.8734827527804437, + "grad_norm": 4.701263904571533, + "learning_rate": 4.214298154770013e-06, + "loss": 1.6854, + "step": 11604 + }, + { + "epoch": 0.8735580270611039, + "grad_norm": 5.8190836906433105, + "learning_rate": 4.209400843052924e-06, + "loss": 1.6473, + "step": 11605 + }, + { + "epoch": 0.8736333013417641, + "grad_norm": 9.0632963180542, + "learning_rate": 4.204506253455582e-06, + "loss": 1.4826, + "step": 11606 + }, + { + "epoch": 0.8737085756224242, + "grad_norm": 7.049914836883545, + "learning_rate": 4.1996143862689294e-06, + "loss": 2.0397, + "step": 11607 + }, + { + "epoch": 0.8737838499030843, + "grad_norm": 8.300387382507324, + "learning_rate": 4.194725241783792e-06, + "loss": 1.8738, + "step": 11608 + }, + { + "epoch": 0.8738591241837446, + "grad_norm": 4.721436023712158, + "learning_rate": 4.189838820290809e-06, + "loss": 1.1764, + "step": 11609 + }, + { + "epoch": 0.8739343984644047, + "grad_norm": 6.0229597091674805, + "learning_rate": 4.184955122080458e-06, + "loss": 1.5799, + "step": 11610 + }, + { + "epoch": 0.8740096727450648, + "grad_norm": 5.022579193115234, + "learning_rate": 4.180074147443081e-06, + "loss": 1.9015, + "step": 11611 + }, + { + "epoch": 0.874084947025725, + "grad_norm": 7.2051591873168945, + "learning_rate": 4.17519589666881e-06, + "loss": 1.8852, + "step": 11612 + }, + { + "epoch": 0.8741602213063852, + "grad_norm": 9.31006145477295, + "learning_rate": 4.170320370047665e-06, + "loss": 1.7434, + "step": 11613 + }, + { + "epoch": 0.8742354955870453, + "grad_norm": 4.9991841316223145, + "learning_rate": 4.1654475678694686e-06, + "loss": 1.628, + "step": 11614 + }, + { + "epoch": 0.8743107698677055, + "grad_norm": 5.307773590087891, + "learning_rate": 4.160577490423895e-06, + "loss": 1.9827, + "step": 11615 + }, + { + "epoch": 0.8743860441483656, + "grad_norm": 3.837818145751953, + "learning_rate": 4.155710138000468e-06, + "loss": 1.6727, + "step": 11616 + }, + { + "epoch": 0.8744613184290257, + "grad_norm": 7.097601890563965, + "learning_rate": 4.1508455108885344e-06, + "loss": 1.7093, + "step": 11617 + }, + { + "epoch": 0.8745365927096859, + "grad_norm": 4.821915626525879, + "learning_rate": 4.145983609377274e-06, + "loss": 1.6164, + "step": 11618 + }, + { + "epoch": 0.8746118669903461, + "grad_norm": 5.598127841949463, + "learning_rate": 4.141124433755722e-06, + "loss": 1.3991, + "step": 11619 + }, + { + "epoch": 0.8746871412710062, + "grad_norm": 5.766859531402588, + "learning_rate": 4.136267984312736e-06, + "loss": 1.5847, + "step": 11620 + }, + { + "epoch": 0.8747624155516663, + "grad_norm": 4.895109176635742, + "learning_rate": 4.131414261337024e-06, + "loss": 1.7144, + "step": 11621 + }, + { + "epoch": 0.8748376898323266, + "grad_norm": 6.280261039733887, + "learning_rate": 4.126563265117117e-06, + "loss": 2.0421, + "step": 11622 + }, + { + "epoch": 0.8749129641129867, + "grad_norm": 5.280960559844971, + "learning_rate": 4.1217149959414e-06, + "loss": 2.1281, + "step": 11623 + }, + { + "epoch": 0.8749882383936468, + "grad_norm": 6.34357213973999, + "learning_rate": 4.116869454098093e-06, + "loss": 1.5751, + "step": 11624 + }, + { + "epoch": 0.875063512674307, + "grad_norm": 6.598411560058594, + "learning_rate": 4.112026639875233e-06, + "loss": 2.0574, + "step": 11625 + }, + { + "epoch": 0.8751387869549672, + "grad_norm": 4.268938064575195, + "learning_rate": 4.1071865535607266e-06, + "loss": 1.5755, + "step": 11626 + }, + { + "epoch": 0.8752140612356273, + "grad_norm": 5.984579086303711, + "learning_rate": 4.1023491954422955e-06, + "loss": 1.3913, + "step": 11627 + }, + { + "epoch": 0.8752893355162875, + "grad_norm": 6.586937427520752, + "learning_rate": 4.097514565807514e-06, + "loss": 1.3842, + "step": 11628 + }, + { + "epoch": 0.8753646097969476, + "grad_norm": 4.75636625289917, + "learning_rate": 4.092682664943781e-06, + "loss": 1.581, + "step": 11629 + }, + { + "epoch": 0.8754398840776078, + "grad_norm": 4.57895040512085, + "learning_rate": 4.087853493138344e-06, + "loss": 1.863, + "step": 11630 + }, + { + "epoch": 0.875515158358268, + "grad_norm": 4.619965076446533, + "learning_rate": 4.083027050678279e-06, + "loss": 1.6153, + "step": 11631 + }, + { + "epoch": 0.8755904326389281, + "grad_norm": 6.6473307609558105, + "learning_rate": 4.078203337850506e-06, + "loss": 1.5541, + "step": 11632 + }, + { + "epoch": 0.8756657069195882, + "grad_norm": 6.655191898345947, + "learning_rate": 4.0733823549417736e-06, + "loss": 1.5841, + "step": 11633 + }, + { + "epoch": 0.8757409812002485, + "grad_norm": 5.713079929351807, + "learning_rate": 4.068564102238692e-06, + "loss": 1.7949, + "step": 11634 + }, + { + "epoch": 0.8758162554809086, + "grad_norm": 5.624162673950195, + "learning_rate": 4.063748580027676e-06, + "loss": 2.0115, + "step": 11635 + }, + { + "epoch": 0.8758915297615687, + "grad_norm": 5.819950103759766, + "learning_rate": 4.0589357885949975e-06, + "loss": 2.1572, + "step": 11636 + }, + { + "epoch": 0.8759668040422288, + "grad_norm": 7.705456256866455, + "learning_rate": 4.054125728226765e-06, + "loss": 1.8177, + "step": 11637 + }, + { + "epoch": 0.876042078322889, + "grad_norm": 3.90543270111084, + "learning_rate": 4.049318399208935e-06, + "loss": 1.682, + "step": 11638 + }, + { + "epoch": 0.8761173526035492, + "grad_norm": 7.418792724609375, + "learning_rate": 4.044513801827271e-06, + "loss": 1.5995, + "step": 11639 + }, + { + "epoch": 0.8761926268842093, + "grad_norm": 9.873350143432617, + "learning_rate": 4.039711936367402e-06, + "loss": 2.1189, + "step": 11640 + }, + { + "epoch": 0.8762679011648695, + "grad_norm": 4.566072463989258, + "learning_rate": 4.034912803114793e-06, + "loss": 1.7768, + "step": 11641 + }, + { + "epoch": 0.8763431754455296, + "grad_norm": 4.642801284790039, + "learning_rate": 4.030116402354717e-06, + "loss": 1.7531, + "step": 11642 + }, + { + "epoch": 0.8764184497261898, + "grad_norm": 4.306826114654541, + "learning_rate": 4.025322734372328e-06, + "loss": 1.9671, + "step": 11643 + }, + { + "epoch": 0.87649372400685, + "grad_norm": 4.958789348602295, + "learning_rate": 4.020531799452576e-06, + "loss": 1.769, + "step": 11644 + }, + { + "epoch": 0.8765689982875101, + "grad_norm": 6.042473316192627, + "learning_rate": 4.015743597880289e-06, + "loss": 1.6892, + "step": 11645 + }, + { + "epoch": 0.8766442725681702, + "grad_norm": 7.725668430328369, + "learning_rate": 4.010958129940096e-06, + "loss": 1.8807, + "step": 11646 + }, + { + "epoch": 0.8767195468488305, + "grad_norm": 4.606240749359131, + "learning_rate": 4.006175395916489e-06, + "loss": 1.4182, + "step": 11647 + }, + { + "epoch": 0.8767948211294906, + "grad_norm": 6.348077297210693, + "learning_rate": 4.001395396093777e-06, + "loss": 1.6075, + "step": 11648 + }, + { + "epoch": 0.8768700954101507, + "grad_norm": 4.901601791381836, + "learning_rate": 3.996618130756119e-06, + "loss": 1.5807, + "step": 11649 + }, + { + "epoch": 0.8769453696908109, + "grad_norm": 5.004312515258789, + "learning_rate": 3.991843600187522e-06, + "loss": 1.6093, + "step": 11650 + }, + { + "epoch": 0.8770206439714711, + "grad_norm": 5.340473175048828, + "learning_rate": 3.987071804671816e-06, + "loss": 1.7903, + "step": 11651 + }, + { + "epoch": 0.8770959182521312, + "grad_norm": 4.167001247406006, + "learning_rate": 3.982302744492655e-06, + "loss": 1.5913, + "step": 11652 + }, + { + "epoch": 0.8771711925327914, + "grad_norm": 5.036463737487793, + "learning_rate": 3.97753641993357e-06, + "loss": 1.3365, + "step": 11653 + }, + { + "epoch": 0.8772464668134515, + "grad_norm": 4.699038028717041, + "learning_rate": 3.9727728312778765e-06, + "loss": 1.5388, + "step": 11654 + }, + { + "epoch": 0.8773217410941117, + "grad_norm": 5.600727081298828, + "learning_rate": 3.9680119788087835e-06, + "loss": 1.7237, + "step": 11655 + }, + { + "epoch": 0.8773970153747718, + "grad_norm": 5.301457405090332, + "learning_rate": 3.96325386280929e-06, + "loss": 1.4352, + "step": 11656 + }, + { + "epoch": 0.877472289655432, + "grad_norm": 6.565226078033447, + "learning_rate": 3.958498483562262e-06, + "loss": 1.5271, + "step": 11657 + }, + { + "epoch": 0.8775475639360921, + "grad_norm": 6.191504955291748, + "learning_rate": 3.953745841350393e-06, + "loss": 1.6714, + "step": 11658 + }, + { + "epoch": 0.8776228382167522, + "grad_norm": 4.479940414428711, + "learning_rate": 3.948995936456207e-06, + "loss": 1.8599, + "step": 11659 + }, + { + "epoch": 0.8776981124974125, + "grad_norm": 4.914144992828369, + "learning_rate": 3.9442487691620785e-06, + "loss": 1.6284, + "step": 11660 + }, + { + "epoch": 0.8777733867780726, + "grad_norm": 5.820263862609863, + "learning_rate": 3.939504339750211e-06, + "loss": 1.9864, + "step": 11661 + }, + { + "epoch": 0.8778486610587327, + "grad_norm": 6.093573093414307, + "learning_rate": 3.93476264850266e-06, + "loss": 1.5193, + "step": 11662 + }, + { + "epoch": 0.877923935339393, + "grad_norm": 6.339012622833252, + "learning_rate": 3.9300236957012795e-06, + "loss": 1.6429, + "step": 11663 + }, + { + "epoch": 0.8779992096200531, + "grad_norm": 4.103133201599121, + "learning_rate": 3.92528748162781e-06, + "loss": 1.5805, + "step": 11664 + }, + { + "epoch": 0.8780744839007132, + "grad_norm": 5.911979675292969, + "learning_rate": 3.920554006563792e-06, + "loss": 1.5007, + "step": 11665 + }, + { + "epoch": 0.8781497581813734, + "grad_norm": 6.187437534332275, + "learning_rate": 3.915823270790625e-06, + "loss": 1.6198, + "step": 11666 + }, + { + "epoch": 0.8782250324620335, + "grad_norm": 5.643014430999756, + "learning_rate": 3.9110952745895325e-06, + "loss": 2.2238, + "step": 11667 + }, + { + "epoch": 0.8783003067426937, + "grad_norm": 4.993788242340088, + "learning_rate": 3.906370018241584e-06, + "loss": 1.589, + "step": 11668 + }, + { + "epoch": 0.8783755810233539, + "grad_norm": 5.978770732879639, + "learning_rate": 3.9016475020276774e-06, + "loss": 1.6596, + "step": 11669 + }, + { + "epoch": 0.878450855304014, + "grad_norm": 4.27323055267334, + "learning_rate": 3.89692772622855e-06, + "loss": 1.7956, + "step": 11670 + }, + { + "epoch": 0.8785261295846741, + "grad_norm": 5.560185432434082, + "learning_rate": 3.892210691124787e-06, + "loss": 1.5247, + "step": 11671 + }, + { + "epoch": 0.8786014038653344, + "grad_norm": 8.784761428833008, + "learning_rate": 3.887496396996803e-06, + "loss": 1.562, + "step": 11672 + }, + { + "epoch": 0.8786766781459945, + "grad_norm": 5.015218257904053, + "learning_rate": 3.8827848441248535e-06, + "loss": 1.615, + "step": 11673 + }, + { + "epoch": 0.8787519524266546, + "grad_norm": 4.807821750640869, + "learning_rate": 3.8780760327890074e-06, + "loss": 1.4541, + "step": 11674 + }, + { + "epoch": 0.8788272267073147, + "grad_norm": 5.571457862854004, + "learning_rate": 3.873369963269219e-06, + "loss": 2.1872, + "step": 11675 + }, + { + "epoch": 0.878902500987975, + "grad_norm": 5.210233211517334, + "learning_rate": 3.8686666358452196e-06, + "loss": 1.677, + "step": 11676 + }, + { + "epoch": 0.8789777752686351, + "grad_norm": 6.442724704742432, + "learning_rate": 3.863966050796635e-06, + "loss": 1.5309, + "step": 11677 + }, + { + "epoch": 0.8790530495492952, + "grad_norm": 5.60943603515625, + "learning_rate": 3.859268208402877e-06, + "loss": 2.0106, + "step": 11678 + }, + { + "epoch": 0.8791283238299554, + "grad_norm": 4.648293495178223, + "learning_rate": 3.854573108943244e-06, + "loss": 1.7532, + "step": 11679 + }, + { + "epoch": 0.8792035981106155, + "grad_norm": 6.230894088745117, + "learning_rate": 3.849880752696822e-06, + "loss": 1.7911, + "step": 11680 + }, + { + "epoch": 0.8792788723912757, + "grad_norm": 5.838742733001709, + "learning_rate": 3.8451911399425725e-06, + "loss": 1.8798, + "step": 11681 + }, + { + "epoch": 0.8793541466719359, + "grad_norm": 6.311849594116211, + "learning_rate": 3.840504270959272e-06, + "loss": 1.6725, + "step": 11682 + }, + { + "epoch": 0.879429420952596, + "grad_norm": 5.469592094421387, + "learning_rate": 3.835820146025559e-06, + "loss": 1.8966, + "step": 11683 + }, + { + "epoch": 0.8795046952332561, + "grad_norm": 5.354036331176758, + "learning_rate": 3.831138765419867e-06, + "loss": 1.4313, + "step": 11684 + }, + { + "epoch": 0.8795799695139164, + "grad_norm": 7.604939937591553, + "learning_rate": 3.8264601294205114e-06, + "loss": 2.2046, + "step": 11685 + }, + { + "epoch": 0.8796552437945765, + "grad_norm": 4.8442559242248535, + "learning_rate": 3.821784238305609e-06, + "loss": 1.8863, + "step": 11686 + }, + { + "epoch": 0.8797305180752366, + "grad_norm": 4.711925029754639, + "learning_rate": 3.817111092353138e-06, + "loss": 1.636, + "step": 11687 + }, + { + "epoch": 0.8798057923558968, + "grad_norm": 4.527335166931152, + "learning_rate": 3.8124406918408962e-06, + "loss": 1.7394, + "step": 11688 + }, + { + "epoch": 0.879881066636557, + "grad_norm": 5.3554301261901855, + "learning_rate": 3.8077730370465247e-06, + "loss": 1.6984, + "step": 11689 + }, + { + "epoch": 0.8799563409172171, + "grad_norm": 5.890391826629639, + "learning_rate": 3.8031081282475122e-06, + "loss": 1.4783, + "step": 11690 + }, + { + "epoch": 0.8800316151978773, + "grad_norm": 7.108761310577393, + "learning_rate": 3.798445965721159e-06, + "loss": 1.6675, + "step": 11691 + }, + { + "epoch": 0.8801068894785374, + "grad_norm": 5.666106700897217, + "learning_rate": 3.793786549744627e-06, + "loss": 1.6277, + "step": 11692 + }, + { + "epoch": 0.8801821637591976, + "grad_norm": 6.613912105560303, + "learning_rate": 3.789129880594905e-06, + "loss": 1.6352, + "step": 11693 + }, + { + "epoch": 0.8802574380398578, + "grad_norm": 5.371757507324219, + "learning_rate": 3.7844759585488276e-06, + "loss": 1.4573, + "step": 11694 + }, + { + "epoch": 0.8803327123205179, + "grad_norm": 6.298776626586914, + "learning_rate": 3.7798247838830403e-06, + "loss": 1.6934, + "step": 11695 + }, + { + "epoch": 0.880407986601178, + "grad_norm": 5.35821533203125, + "learning_rate": 3.775176356874055e-06, + "loss": 1.862, + "step": 11696 + }, + { + "epoch": 0.8804832608818381, + "grad_norm": 7.1741414070129395, + "learning_rate": 3.7705306777981896e-06, + "loss": 2.0966, + "step": 11697 + }, + { + "epoch": 0.8805585351624984, + "grad_norm": 5.849855899810791, + "learning_rate": 3.76588774693164e-06, + "loss": 1.6965, + "step": 11698 + }, + { + "epoch": 0.8806338094431585, + "grad_norm": 5.199145317077637, + "learning_rate": 3.7612475645503963e-06, + "loss": 1.4261, + "step": 11699 + }, + { + "epoch": 0.8807090837238186, + "grad_norm": 4.1813883781433105, + "learning_rate": 3.756610130930316e-06, + "loss": 1.6016, + "step": 11700 + }, + { + "epoch": 0.8807843580044789, + "grad_norm": 3.929568290710449, + "learning_rate": 3.7519754463470725e-06, + "loss": 1.3947, + "step": 11701 + }, + { + "epoch": 0.880859632285139, + "grad_norm": 4.311989784240723, + "learning_rate": 3.7473435110761903e-06, + "loss": 1.83, + "step": 11702 + }, + { + "epoch": 0.8809349065657991, + "grad_norm": 4.188756942749023, + "learning_rate": 3.7427143253930207e-06, + "loss": 1.8414, + "step": 11703 + }, + { + "epoch": 0.8810101808464593, + "grad_norm": 7.10547399520874, + "learning_rate": 3.73808788957275e-06, + "loss": 1.8294, + "step": 11704 + }, + { + "epoch": 0.8810854551271194, + "grad_norm": 7.271067142486572, + "learning_rate": 3.733464203890419e-06, + "loss": 1.9018, + "step": 11705 + }, + { + "epoch": 0.8811607294077796, + "grad_norm": 5.098751068115234, + "learning_rate": 3.7288432686208853e-06, + "loss": 1.6468, + "step": 11706 + }, + { + "epoch": 0.8812360036884398, + "grad_norm": 4.3407697677612305, + "learning_rate": 3.7242250840388625e-06, + "loss": 1.6873, + "step": 11707 + }, + { + "epoch": 0.8813112779690999, + "grad_norm": 4.938584804534912, + "learning_rate": 3.7196096504188705e-06, + "loss": 1.4899, + "step": 11708 + }, + { + "epoch": 0.88138655224976, + "grad_norm": 4.2281365394592285, + "learning_rate": 3.7149969680353e-06, + "loss": 1.6732, + "step": 11709 + }, + { + "epoch": 0.8814618265304203, + "grad_norm": 4.876209259033203, + "learning_rate": 3.710387037162344e-06, + "loss": 1.7241, + "step": 11710 + }, + { + "epoch": 0.8815371008110804, + "grad_norm": 6.2961883544921875, + "learning_rate": 3.7057798580740656e-06, + "loss": 1.6042, + "step": 11711 + }, + { + "epoch": 0.8816123750917405, + "grad_norm": 4.273378372192383, + "learning_rate": 3.7011754310443347e-06, + "loss": 2.2216, + "step": 11712 + }, + { + "epoch": 0.8816876493724007, + "grad_norm": 6.431707859039307, + "learning_rate": 3.6965737563468884e-06, + "loss": 1.9265, + "step": 11713 + }, + { + "epoch": 0.8817629236530609, + "grad_norm": 6.290647029876709, + "learning_rate": 3.691974834255263e-06, + "loss": 1.7623, + "step": 11714 + }, + { + "epoch": 0.881838197933721, + "grad_norm": 5.260566711425781, + "learning_rate": 3.687378665042862e-06, + "loss": 1.7577, + "step": 11715 + }, + { + "epoch": 0.8819134722143811, + "grad_norm": 5.3276143074035645, + "learning_rate": 3.682785248982912e-06, + "loss": 1.7501, + "step": 11716 + }, + { + "epoch": 0.8819887464950413, + "grad_norm": 5.1861958503723145, + "learning_rate": 3.678194586348488e-06, + "loss": 1.7142, + "step": 11717 + }, + { + "epoch": 0.8820640207757015, + "grad_norm": 5.255173683166504, + "learning_rate": 3.6736066774124778e-06, + "loss": 1.5375, + "step": 11718 + }, + { + "epoch": 0.8821392950563616, + "grad_norm": 4.203284740447998, + "learning_rate": 3.6690215224476297e-06, + "loss": 1.7789, + "step": 11719 + }, + { + "epoch": 0.8822145693370218, + "grad_norm": 6.088117599487305, + "learning_rate": 3.6644391217265094e-06, + "loss": 1.6451, + "step": 11720 + }, + { + "epoch": 0.8822898436176819, + "grad_norm": 5.710407257080078, + "learning_rate": 3.6598594755215266e-06, + "loss": 1.8608, + "step": 11721 + }, + { + "epoch": 0.882365117898342, + "grad_norm": 5.321258544921875, + "learning_rate": 3.6552825841049464e-06, + "loss": 2.0566, + "step": 11722 + }, + { + "epoch": 0.8824403921790023, + "grad_norm": 5.131575107574463, + "learning_rate": 3.6507084477488295e-06, + "loss": 1.7693, + "step": 11723 + }, + { + "epoch": 0.8825156664596624, + "grad_norm": 4.4243903160095215, + "learning_rate": 3.646137066725108e-06, + "loss": 1.5854, + "step": 11724 + }, + { + "epoch": 0.8825909407403225, + "grad_norm": 4.345744609832764, + "learning_rate": 3.6415684413055317e-06, + "loss": 1.6932, + "step": 11725 + }, + { + "epoch": 0.8826662150209827, + "grad_norm": 5.765183448791504, + "learning_rate": 3.6370025717616886e-06, + "loss": 2.0257, + "step": 11726 + }, + { + "epoch": 0.8827414893016429, + "grad_norm": 6.967600345611572, + "learning_rate": 3.632439458365017e-06, + "loss": 1.5581, + "step": 11727 + }, + { + "epoch": 0.882816763582303, + "grad_norm": 5.070700168609619, + "learning_rate": 3.627879101386783e-06, + "loss": 1.8778, + "step": 11728 + }, + { + "epoch": 0.8828920378629632, + "grad_norm": 5.085634231567383, + "learning_rate": 3.6233215010980704e-06, + "loss": 1.9464, + "step": 11729 + }, + { + "epoch": 0.8829673121436233, + "grad_norm": 4.410702228546143, + "learning_rate": 3.6187666577698344e-06, + "loss": 1.7233, + "step": 11730 + }, + { + "epoch": 0.8830425864242835, + "grad_norm": 4.733056545257568, + "learning_rate": 3.6142145716728305e-06, + "loss": 1.7679, + "step": 11731 + }, + { + "epoch": 0.8831178607049437, + "grad_norm": 5.505751132965088, + "learning_rate": 3.609665243077681e-06, + "loss": 1.8032, + "step": 11732 + }, + { + "epoch": 0.8831931349856038, + "grad_norm": 6.331247329711914, + "learning_rate": 3.60511867225482e-06, + "loss": 2.1625, + "step": 11733 + }, + { + "epoch": 0.8832684092662639, + "grad_norm": 6.491484642028809, + "learning_rate": 3.600574859474537e-06, + "loss": 1.6657, + "step": 11734 + }, + { + "epoch": 0.883343683546924, + "grad_norm": 4.10695219039917, + "learning_rate": 3.5960338050069318e-06, + "loss": 1.9303, + "step": 11735 + }, + { + "epoch": 0.8834189578275843, + "grad_norm": 6.300858497619629, + "learning_rate": 3.5914955091219727e-06, + "loss": 1.6, + "step": 11736 + }, + { + "epoch": 0.8834942321082444, + "grad_norm": 5.158595085144043, + "learning_rate": 3.5869599720894486e-06, + "loss": 1.4763, + "step": 11737 + }, + { + "epoch": 0.8835695063889045, + "grad_norm": 5.257260799407959, + "learning_rate": 3.582427194178972e-06, + "loss": 1.8055, + "step": 11738 + }, + { + "epoch": 0.8836447806695648, + "grad_norm": 7.28010892868042, + "learning_rate": 3.5778971756600277e-06, + "loss": 1.6662, + "step": 11739 + }, + { + "epoch": 0.8837200549502249, + "grad_norm": 4.828343391418457, + "learning_rate": 3.5733699168018832e-06, + "loss": 1.6488, + "step": 11740 + }, + { + "epoch": 0.883795329230885, + "grad_norm": 5.949958801269531, + "learning_rate": 3.5688454178736954e-06, + "loss": 1.5758, + "step": 11741 + }, + { + "epoch": 0.8838706035115452, + "grad_norm": 5.839580535888672, + "learning_rate": 3.56432367914441e-06, + "loss": 1.4823, + "step": 11742 + }, + { + "epoch": 0.8839458777922053, + "grad_norm": 6.548633575439453, + "learning_rate": 3.5598047008828517e-06, + "loss": 1.6498, + "step": 11743 + }, + { + "epoch": 0.8840211520728655, + "grad_norm": 4.428357124328613, + "learning_rate": 3.555288483357644e-06, + "loss": 1.4869, + "step": 11744 + }, + { + "epoch": 0.8840964263535257, + "grad_norm": 5.908418655395508, + "learning_rate": 3.550775026837283e-06, + "loss": 2.0765, + "step": 11745 + }, + { + "epoch": 0.8841717006341858, + "grad_norm": 4.651230335235596, + "learning_rate": 3.5462643315900602e-06, + "loss": 1.9432, + "step": 11746 + }, + { + "epoch": 0.8842469749148459, + "grad_norm": 5.599878787994385, + "learning_rate": 3.5417563978841273e-06, + "loss": 1.8916, + "step": 11747 + }, + { + "epoch": 0.8843222491955062, + "grad_norm": 6.476396560668945, + "learning_rate": 3.5372512259874867e-06, + "loss": 1.6926, + "step": 11748 + }, + { + "epoch": 0.8843975234761663, + "grad_norm": 4.777995586395264, + "learning_rate": 3.53274881616793e-06, + "loss": 1.7362, + "step": 11749 + }, + { + "epoch": 0.8844727977568264, + "grad_norm": 5.074110984802246, + "learning_rate": 3.5282491686931317e-06, + "loss": 1.6087, + "step": 11750 + }, + { + "epoch": 0.8845480720374866, + "grad_norm": 4.8917717933654785, + "learning_rate": 3.523752283830584e-06, + "loss": 1.7535, + "step": 11751 + }, + { + "epoch": 0.8846233463181468, + "grad_norm": 5.364163398742676, + "learning_rate": 3.5192581618475997e-06, + "loss": 1.7661, + "step": 11752 + }, + { + "epoch": 0.8846986205988069, + "grad_norm": 5.70516300201416, + "learning_rate": 3.514766803011349e-06, + "loss": 1.7951, + "step": 11753 + }, + { + "epoch": 0.884773894879467, + "grad_norm": 4.460023880004883, + "learning_rate": 3.5102782075888407e-06, + "loss": 1.8055, + "step": 11754 + }, + { + "epoch": 0.8848491691601272, + "grad_norm": 3.985265016555786, + "learning_rate": 3.5057923758468893e-06, + "loss": 1.9297, + "step": 11755 + }, + { + "epoch": 0.8849244434407874, + "grad_norm": 7.1369500160217285, + "learning_rate": 3.5013093080521808e-06, + "loss": 1.8269, + "step": 11756 + }, + { + "epoch": 0.8849997177214475, + "grad_norm": 4.976137638092041, + "learning_rate": 3.496829004471208e-06, + "loss": 1.8256, + "step": 11757 + }, + { + "epoch": 0.8850749920021077, + "grad_norm": 5.7405900955200195, + "learning_rate": 3.4923514653703247e-06, + "loss": 1.8753, + "step": 11758 + }, + { + "epoch": 0.8851502662827678, + "grad_norm": 5.189647674560547, + "learning_rate": 3.487876691015696e-06, + "loss": 1.6399, + "step": 11759 + }, + { + "epoch": 0.885225540563428, + "grad_norm": 5.781796932220459, + "learning_rate": 3.483404681673341e-06, + "loss": 1.6308, + "step": 11760 + }, + { + "epoch": 0.8853008148440882, + "grad_norm": 4.491673946380615, + "learning_rate": 3.4789354376091043e-06, + "loss": 1.3869, + "step": 11761 + }, + { + "epoch": 0.8853760891247483, + "grad_norm": 6.065124034881592, + "learning_rate": 3.474468959088678e-06, + "loss": 1.6242, + "step": 11762 + }, + { + "epoch": 0.8854513634054084, + "grad_norm": 5.406561851501465, + "learning_rate": 3.470005246377572e-06, + "loss": 1.7458, + "step": 11763 + }, + { + "epoch": 0.8855266376860687, + "grad_norm": 6.974289894104004, + "learning_rate": 3.465544299741147e-06, + "loss": 1.69, + "step": 11764 + }, + { + "epoch": 0.8856019119667288, + "grad_norm": 5.5609211921691895, + "learning_rate": 3.4610861194445897e-06, + "loss": 1.5639, + "step": 11765 + }, + { + "epoch": 0.8856771862473889, + "grad_norm": 5.059272766113281, + "learning_rate": 3.456630705752928e-06, + "loss": 1.7774, + "step": 11766 + }, + { + "epoch": 0.8857524605280491, + "grad_norm": 7.404388427734375, + "learning_rate": 3.4521780589310213e-06, + "loss": 2.0379, + "step": 11767 + }, + { + "epoch": 0.8858277348087092, + "grad_norm": 4.628129482269287, + "learning_rate": 3.4477281792435646e-06, + "loss": 1.6093, + "step": 11768 + }, + { + "epoch": 0.8859030090893694, + "grad_norm": 3.706904172897339, + "learning_rate": 3.443281066955106e-06, + "loss": 1.7138, + "step": 11769 + }, + { + "epoch": 0.8859782833700296, + "grad_norm": 7.007409572601318, + "learning_rate": 3.438836722329991e-06, + "loss": 1.876, + "step": 11770 + }, + { + "epoch": 0.8860535576506897, + "grad_norm": 5.222723960876465, + "learning_rate": 3.434395145632435e-06, + "loss": 1.8289, + "step": 11771 + }, + { + "epoch": 0.8861288319313498, + "grad_norm": 7.494507312774658, + "learning_rate": 3.429956337126472e-06, + "loss": 1.5502, + "step": 11772 + }, + { + "epoch": 0.88620410621201, + "grad_norm": 5.389138698577881, + "learning_rate": 3.42552029707599e-06, + "loss": 1.6866, + "step": 11773 + }, + { + "epoch": 0.8862793804926702, + "grad_norm": 7.180854320526123, + "learning_rate": 3.4210870257446847e-06, + "loss": 1.4546, + "step": 11774 + }, + { + "epoch": 0.8863546547733303, + "grad_norm": 5.274043083190918, + "learning_rate": 3.4166565233961113e-06, + "loss": 1.6604, + "step": 11775 + }, + { + "epoch": 0.8864299290539904, + "grad_norm": 4.56284236907959, + "learning_rate": 3.412228790293637e-06, + "loss": 1.5328, + "step": 11776 + }, + { + "epoch": 0.8865052033346507, + "grad_norm": 4.2645673751831055, + "learning_rate": 3.4078038267004953e-06, + "loss": 2.0537, + "step": 11777 + }, + { + "epoch": 0.8865804776153108, + "grad_norm": 4.076914310455322, + "learning_rate": 3.403381632879721e-06, + "loss": 1.6361, + "step": 11778 + }, + { + "epoch": 0.8866557518959709, + "grad_norm": 7.227497577667236, + "learning_rate": 3.3989622090942087e-06, + "loss": 1.6755, + "step": 11779 + }, + { + "epoch": 0.8867310261766311, + "grad_norm": 6.30675745010376, + "learning_rate": 3.394545555606682e-06, + "loss": 2.0847, + "step": 11780 + }, + { + "epoch": 0.8868063004572913, + "grad_norm": 5.158139705657959, + "learning_rate": 3.390131672679686e-06, + "loss": 1.7836, + "step": 11781 + }, + { + "epoch": 0.8868815747379514, + "grad_norm": 4.040538787841797, + "learning_rate": 3.385720560575628e-06, + "loss": 1.3484, + "step": 11782 + }, + { + "epoch": 0.8869568490186116, + "grad_norm": 6.937635898590088, + "learning_rate": 3.3813122195567426e-06, + "loss": 1.5636, + "step": 11783 + }, + { + "epoch": 0.8870321232992717, + "grad_norm": 6.234410285949707, + "learning_rate": 3.37690664988507e-06, + "loss": 1.8675, + "step": 11784 + }, + { + "epoch": 0.8871073975799318, + "grad_norm": 7.553998947143555, + "learning_rate": 3.3725038518225228e-06, + "loss": 1.7981, + "step": 11785 + }, + { + "epoch": 0.8871826718605921, + "grad_norm": 4.448619842529297, + "learning_rate": 3.368103825630836e-06, + "loss": 1.5365, + "step": 11786 + }, + { + "epoch": 0.8872579461412522, + "grad_norm": 5.437499523162842, + "learning_rate": 3.3637065715715676e-06, + "loss": 1.2683, + "step": 11787 + }, + { + "epoch": 0.8873332204219123, + "grad_norm": 8.597917556762695, + "learning_rate": 3.3593120899061413e-06, + "loss": 1.6052, + "step": 11788 + }, + { + "epoch": 0.8874084947025725, + "grad_norm": 4.290695667266846, + "learning_rate": 3.354920380895771e-06, + "loss": 1.413, + "step": 11789 + }, + { + "epoch": 0.8874837689832327, + "grad_norm": 6.231933116912842, + "learning_rate": 3.350531444801558e-06, + "loss": 1.7449, + "step": 11790 + }, + { + "epoch": 0.8875590432638928, + "grad_norm": 5.409153461456299, + "learning_rate": 3.3461452818843887e-06, + "loss": 2.0625, + "step": 11791 + }, + { + "epoch": 0.887634317544553, + "grad_norm": 4.630675315856934, + "learning_rate": 3.3417618924050153e-06, + "loss": 1.7052, + "step": 11792 + }, + { + "epoch": 0.8877095918252131, + "grad_norm": 4.727572917938232, + "learning_rate": 3.3373812766240188e-06, + "loss": 1.8587, + "step": 11793 + }, + { + "epoch": 0.8877848661058733, + "grad_norm": 3.757991075515747, + "learning_rate": 3.3330034348018235e-06, + "loss": 1.8437, + "step": 11794 + }, + { + "epoch": 0.8878601403865334, + "grad_norm": 4.6337175369262695, + "learning_rate": 3.3286283671986664e-06, + "loss": 1.8376, + "step": 11795 + }, + { + "epoch": 0.8879354146671936, + "grad_norm": 8.868522644042969, + "learning_rate": 3.324256074074644e-06, + "loss": 1.6372, + "step": 11796 + }, + { + "epoch": 0.8880106889478537, + "grad_norm": 4.06243896484375, + "learning_rate": 3.319886555689661e-06, + "loss": 1.4519, + "step": 11797 + }, + { + "epoch": 0.8880859632285139, + "grad_norm": 4.288581848144531, + "learning_rate": 3.315519812303491e-06, + "loss": 1.9517, + "step": 11798 + }, + { + "epoch": 0.8881612375091741, + "grad_norm": 5.379961967468262, + "learning_rate": 3.311155844175712e-06, + "loss": 1.7662, + "step": 11799 + }, + { + "epoch": 0.8882365117898342, + "grad_norm": 5.306863307952881, + "learning_rate": 3.3067946515657532e-06, + "loss": 1.5555, + "step": 11800 + }, + { + "epoch": 0.8883117860704943, + "grad_norm": 4.886042594909668, + "learning_rate": 3.3024362347328697e-06, + "loss": 1.9256, + "step": 11801 + }, + { + "epoch": 0.8883870603511546, + "grad_norm": 4.271335124969482, + "learning_rate": 3.2980805939361646e-06, + "loss": 1.4134, + "step": 11802 + }, + { + "epoch": 0.8884623346318147, + "grad_norm": 5.149145126342773, + "learning_rate": 3.2937277294345703e-06, + "loss": 1.6608, + "step": 11803 + }, + { + "epoch": 0.8885376089124748, + "grad_norm": 4.669297218322754, + "learning_rate": 3.2893776414868406e-06, + "loss": 1.6267, + "step": 11804 + }, + { + "epoch": 0.888612883193135, + "grad_norm": 6.247199535369873, + "learning_rate": 3.2850303303515797e-06, + "loss": 1.9736, + "step": 11805 + }, + { + "epoch": 0.8886881574737951, + "grad_norm": 4.843697547912598, + "learning_rate": 3.280685796287225e-06, + "loss": 1.7359, + "step": 11806 + }, + { + "epoch": 0.8887634317544553, + "grad_norm": 4.9426589012146, + "learning_rate": 3.2763440395520593e-06, + "loss": 1.9896, + "step": 11807 + }, + { + "epoch": 0.8888387060351155, + "grad_norm": 5.840628147125244, + "learning_rate": 3.272005060404165e-06, + "loss": 1.7038, + "step": 11808 + }, + { + "epoch": 0.8889139803157756, + "grad_norm": 5.470430850982666, + "learning_rate": 3.267668859101497e-06, + "loss": 1.7663, + "step": 11809 + }, + { + "epoch": 0.8889892545964357, + "grad_norm": 6.2844343185424805, + "learning_rate": 3.263335435901821e-06, + "loss": 1.937, + "step": 11810 + }, + { + "epoch": 0.889064528877096, + "grad_norm": 5.550494194030762, + "learning_rate": 3.2590047910627585e-06, + "loss": 1.6852, + "step": 11811 + }, + { + "epoch": 0.8891398031577561, + "grad_norm": 4.293384552001953, + "learning_rate": 3.2546769248417374e-06, + "loss": 1.448, + "step": 11812 + }, + { + "epoch": 0.8892150774384162, + "grad_norm": 6.422075271606445, + "learning_rate": 3.250351837496057e-06, + "loss": 2.3203, + "step": 11813 + }, + { + "epoch": 0.8892903517190763, + "grad_norm": 4.269379138946533, + "learning_rate": 3.2460295292828067e-06, + "loss": 1.6588, + "step": 11814 + }, + { + "epoch": 0.8893656259997366, + "grad_norm": 4.266817569732666, + "learning_rate": 3.241710000458953e-06, + "loss": 1.7064, + "step": 11815 + }, + { + "epoch": 0.8894409002803967, + "grad_norm": 4.602149486541748, + "learning_rate": 3.2373932512812733e-06, + "loss": 1.7132, + "step": 11816 + }, + { + "epoch": 0.8895161745610568, + "grad_norm": 6.119515419006348, + "learning_rate": 3.2330792820063905e-06, + "loss": 1.8436, + "step": 11817 + }, + { + "epoch": 0.889591448841717, + "grad_norm": 4.6813063621521, + "learning_rate": 3.228768092890755e-06, + "loss": 1.6285, + "step": 11818 + }, + { + "epoch": 0.8896667231223772, + "grad_norm": 6.6129536628723145, + "learning_rate": 3.224459684190656e-06, + "loss": 2.095, + "step": 11819 + }, + { + "epoch": 0.8897419974030373, + "grad_norm": 6.721310615539551, + "learning_rate": 3.220154056162217e-06, + "loss": 1.7597, + "step": 11820 + }, + { + "epoch": 0.8898172716836975, + "grad_norm": 6.64024019241333, + "learning_rate": 3.215851209061388e-06, + "loss": 1.7932, + "step": 11821 + }, + { + "epoch": 0.8898925459643576, + "grad_norm": 7.984348297119141, + "learning_rate": 3.2115511431439704e-06, + "loss": 1.7888, + "step": 11822 + }, + { + "epoch": 0.8899678202450177, + "grad_norm": 4.032537460327148, + "learning_rate": 3.207253858665582e-06, + "loss": 1.6596, + "step": 11823 + }, + { + "epoch": 0.890043094525678, + "grad_norm": 5.3724260330200195, + "learning_rate": 3.202959355881696e-06, + "loss": 1.7009, + "step": 11824 + }, + { + "epoch": 0.8901183688063381, + "grad_norm": 3.8592283725738525, + "learning_rate": 3.198667635047592e-06, + "loss": 1.2274, + "step": 11825 + }, + { + "epoch": 0.8901936430869982, + "grad_norm": 5.176328659057617, + "learning_rate": 3.1943786964184043e-06, + "loss": 1.676, + "step": 11826 + }, + { + "epoch": 0.8902689173676585, + "grad_norm": 4.628762245178223, + "learning_rate": 3.1900925402491067e-06, + "loss": 1.8527, + "step": 11827 + }, + { + "epoch": 0.8903441916483186, + "grad_norm": 5.457942008972168, + "learning_rate": 3.185809166794507e-06, + "loss": 1.605, + "step": 11828 + }, + { + "epoch": 0.8904194659289787, + "grad_norm": 4.664469242095947, + "learning_rate": 3.1815285763092118e-06, + "loss": 1.7968, + "step": 11829 + }, + { + "epoch": 0.8904947402096389, + "grad_norm": 4.056907653808594, + "learning_rate": 3.177250769047718e-06, + "loss": 2.1007, + "step": 11830 + }, + { + "epoch": 0.890570014490299, + "grad_norm": 5.603760242462158, + "learning_rate": 3.172975745264306e-06, + "loss": 1.7209, + "step": 11831 + }, + { + "epoch": 0.8906452887709592, + "grad_norm": 5.925732135772705, + "learning_rate": 3.168703505213133e-06, + "loss": 2.0265, + "step": 11832 + }, + { + "epoch": 0.8907205630516193, + "grad_norm": 3.605818748474121, + "learning_rate": 3.1644340491481574e-06, + "loss": 1.67, + "step": 11833 + }, + { + "epoch": 0.8907958373322795, + "grad_norm": 4.680509567260742, + "learning_rate": 3.1601673773231867e-06, + "loss": 1.8419, + "step": 11834 + }, + { + "epoch": 0.8908711116129396, + "grad_norm": 4.67205810546875, + "learning_rate": 3.1559034899918794e-06, + "loss": 1.7164, + "step": 11835 + }, + { + "epoch": 0.8909463858935998, + "grad_norm": 5.491727828979492, + "learning_rate": 3.1516423874076886e-06, + "loss": 1.7162, + "step": 11836 + }, + { + "epoch": 0.89102166017426, + "grad_norm": 5.674481391906738, + "learning_rate": 3.1473840698239334e-06, + "loss": 1.8317, + "step": 11837 + }, + { + "epoch": 0.8910969344549201, + "grad_norm": 5.7415313720703125, + "learning_rate": 3.1431285374937614e-06, + "loss": 1.8102, + "step": 11838 + }, + { + "epoch": 0.8911722087355802, + "grad_norm": 5.425180912017822, + "learning_rate": 3.1388757906701595e-06, + "loss": 1.7347, + "step": 11839 + }, + { + "epoch": 0.8912474830162405, + "grad_norm": 4.5430450439453125, + "learning_rate": 3.134625829605925e-06, + "loss": 1.6552, + "step": 11840 + }, + { + "epoch": 0.8913227572969006, + "grad_norm": 7.66649055480957, + "learning_rate": 3.1303786545537173e-06, + "loss": 1.4269, + "step": 11841 + }, + { + "epoch": 0.8913980315775607, + "grad_norm": 5.277303695678711, + "learning_rate": 3.1261342657660063e-06, + "loss": 2.0535, + "step": 11842 + }, + { + "epoch": 0.8914733058582209, + "grad_norm": 6.212246417999268, + "learning_rate": 3.1218926634951296e-06, + "loss": 1.6592, + "step": 11843 + }, + { + "epoch": 0.891548580138881, + "grad_norm": 5.850680351257324, + "learning_rate": 3.117653847993213e-06, + "loss": 1.4848, + "step": 11844 + }, + { + "epoch": 0.8916238544195412, + "grad_norm": 5.032403469085693, + "learning_rate": 3.1134178195122653e-06, + "loss": 1.6089, + "step": 11845 + }, + { + "epoch": 0.8916991287002014, + "grad_norm": 5.041711330413818, + "learning_rate": 3.109184578304086e-06, + "loss": 1.5844, + "step": 11846 + }, + { + "epoch": 0.8917744029808615, + "grad_norm": 5.2173051834106445, + "learning_rate": 3.1049541246203406e-06, + "loss": 1.7889, + "step": 11847 + }, + { + "epoch": 0.8918496772615216, + "grad_norm": 4.667613506317139, + "learning_rate": 3.1007264587125163e-06, + "loss": 1.3501, + "step": 11848 + }, + { + "epoch": 0.8919249515421819, + "grad_norm": 5.336594104766846, + "learning_rate": 3.09650158083194e-06, + "loss": 1.7557, + "step": 11849 + }, + { + "epoch": 0.892000225822842, + "grad_norm": 5.6623358726501465, + "learning_rate": 3.0922794912297605e-06, + "loss": 1.4947, + "step": 11850 + }, + { + "epoch": 0.8920755001035021, + "grad_norm": 5.622028350830078, + "learning_rate": 3.088060190156966e-06, + "loss": 1.337, + "step": 11851 + }, + { + "epoch": 0.8921507743841622, + "grad_norm": 5.765834331512451, + "learning_rate": 3.0838436778643954e-06, + "loss": 1.5459, + "step": 11852 + }, + { + "epoch": 0.8922260486648225, + "grad_norm": 5.946274757385254, + "learning_rate": 3.0796299546026976e-06, + "loss": 1.438, + "step": 11853 + }, + { + "epoch": 0.8923013229454826, + "grad_norm": 6.284672737121582, + "learning_rate": 3.0754190206223776e-06, + "loss": 1.7243, + "step": 11854 + }, + { + "epoch": 0.8923765972261427, + "grad_norm": 6.272509574890137, + "learning_rate": 3.0712108761737413e-06, + "loss": 1.5053, + "step": 11855 + }, + { + "epoch": 0.8924518715068029, + "grad_norm": 5.620674133300781, + "learning_rate": 3.0670055215069772e-06, + "loss": 1.7141, + "step": 11856 + }, + { + "epoch": 0.8925271457874631, + "grad_norm": 4.726962566375732, + "learning_rate": 3.062802956872057e-06, + "loss": 1.8393, + "step": 11857 + }, + { + "epoch": 0.8926024200681232, + "grad_norm": 6.394321441650391, + "learning_rate": 3.058603182518832e-06, + "loss": 1.6707, + "step": 11858 + }, + { + "epoch": 0.8926776943487834, + "grad_norm": 4.937551975250244, + "learning_rate": 3.054406198696952e-06, + "loss": 2.3569, + "step": 11859 + }, + { + "epoch": 0.8927529686294435, + "grad_norm": 4.248887062072754, + "learning_rate": 3.0502120056559223e-06, + "loss": 1.9027, + "step": 11860 + }, + { + "epoch": 0.8928282429101037, + "grad_norm": 4.80689001083374, + "learning_rate": 3.0460206036450724e-06, + "loss": 1.791, + "step": 11861 + }, + { + "epoch": 0.8929035171907639, + "grad_norm": 5.4363837242126465, + "learning_rate": 3.0418319929135854e-06, + "loss": 1.8721, + "step": 11862 + }, + { + "epoch": 0.892978791471424, + "grad_norm": 5.42356014251709, + "learning_rate": 3.0376461737104345e-06, + "loss": 1.7802, + "step": 11863 + }, + { + "epoch": 0.8930540657520841, + "grad_norm": 7.600926876068115, + "learning_rate": 3.0334631462844766e-06, + "loss": 1.7742, + "step": 11864 + }, + { + "epoch": 0.8931293400327444, + "grad_norm": 5.78744649887085, + "learning_rate": 3.0292829108843625e-06, + "loss": 1.7462, + "step": 11865 + }, + { + "epoch": 0.8932046143134045, + "grad_norm": 4.564485549926758, + "learning_rate": 3.02510546775861e-06, + "loss": 1.4648, + "step": 11866 + }, + { + "epoch": 0.8932798885940646, + "grad_norm": 6.6460466384887695, + "learning_rate": 3.0209308171555593e-06, + "loss": 1.7858, + "step": 11867 + }, + { + "epoch": 0.8933551628747248, + "grad_norm": 5.321741580963135, + "learning_rate": 3.0167589593233624e-06, + "loss": 1.991, + "step": 11868 + }, + { + "epoch": 0.893430437155385, + "grad_norm": 5.145423889160156, + "learning_rate": 3.0125898945100427e-06, + "loss": 1.8985, + "step": 11869 + }, + { + "epoch": 0.8935057114360451, + "grad_norm": 5.762793064117432, + "learning_rate": 3.0084236229634244e-06, + "loss": 1.7517, + "step": 11870 + }, + { + "epoch": 0.8935809857167052, + "grad_norm": 4.454794406890869, + "learning_rate": 3.0042601449311925e-06, + "loss": 2.0009, + "step": 11871 + }, + { + "epoch": 0.8936562599973654, + "grad_norm": 4.987224578857422, + "learning_rate": 3.0000994606608433e-06, + "loss": 1.9479, + "step": 11872 + }, + { + "epoch": 0.8937315342780255, + "grad_norm": 6.412914752960205, + "learning_rate": 2.995941570399735e-06, + "loss": 1.8571, + "step": 11873 + }, + { + "epoch": 0.8938068085586857, + "grad_norm": 5.8690667152404785, + "learning_rate": 2.9917864743950195e-06, + "loss": 1.7564, + "step": 11874 + }, + { + "epoch": 0.8938820828393459, + "grad_norm": 6.174225807189941, + "learning_rate": 2.987634172893727e-06, + "loss": 1.525, + "step": 11875 + }, + { + "epoch": 0.893957357120006, + "grad_norm": 6.714494228363037, + "learning_rate": 2.983484666142683e-06, + "loss": 1.8933, + "step": 11876 + }, + { + "epoch": 0.8940326314006661, + "grad_norm": 5.083178997039795, + "learning_rate": 2.9793379543885724e-06, + "loss": 1.6603, + "step": 11877 + }, + { + "epoch": 0.8941079056813264, + "grad_norm": 5.738973140716553, + "learning_rate": 2.9751940378778986e-06, + "loss": 1.8367, + "step": 11878 + }, + { + "epoch": 0.8941831799619865, + "grad_norm": 6.122857570648193, + "learning_rate": 2.97105291685702e-06, + "loss": 1.7418, + "step": 11879 + }, + { + "epoch": 0.8942584542426466, + "grad_norm": 6.506328582763672, + "learning_rate": 2.96691459157209e-06, + "loss": 1.9127, + "step": 11880 + }, + { + "epoch": 0.8943337285233068, + "grad_norm": 6.465460777282715, + "learning_rate": 2.9627790622691387e-06, + "loss": 1.456, + "step": 11881 + }, + { + "epoch": 0.894409002803967, + "grad_norm": 6.305689334869385, + "learning_rate": 2.9586463291940093e-06, + "loss": 1.6368, + "step": 11882 + }, + { + "epoch": 0.8944842770846271, + "grad_norm": 4.671209335327148, + "learning_rate": 2.9545163925923767e-06, + "loss": 1.7731, + "step": 11883 + }, + { + "epoch": 0.8945595513652873, + "grad_norm": 5.7320876121521, + "learning_rate": 2.950389252709762e-06, + "loss": 1.5774, + "step": 11884 + }, + { + "epoch": 0.8946348256459474, + "grad_norm": 7.709319114685059, + "learning_rate": 2.946264909791502e-06, + "loss": 1.7231, + "step": 11885 + }, + { + "epoch": 0.8947100999266075, + "grad_norm": 4.983735084533691, + "learning_rate": 2.9421433640827834e-06, + "loss": 2.1062, + "step": 11886 + }, + { + "epoch": 0.8947853742072678, + "grad_norm": 6.776435375213623, + "learning_rate": 2.9380246158286105e-06, + "loss": 2.1543, + "step": 11887 + }, + { + "epoch": 0.8948606484879279, + "grad_norm": 4.080688953399658, + "learning_rate": 2.933908665273849e-06, + "loss": 1.5094, + "step": 11888 + }, + { + "epoch": 0.894935922768588, + "grad_norm": 4.9377055168151855, + "learning_rate": 2.929795512663164e-06, + "loss": 1.5374, + "step": 11889 + }, + { + "epoch": 0.8950111970492483, + "grad_norm": 9.383986473083496, + "learning_rate": 2.925685158241076e-06, + "loss": 1.453, + "step": 11890 + }, + { + "epoch": 0.8950864713299084, + "grad_norm": 4.8620781898498535, + "learning_rate": 2.9215776022519347e-06, + "loss": 1.4244, + "step": 11891 + }, + { + "epoch": 0.8951617456105685, + "grad_norm": 4.56439733505249, + "learning_rate": 2.917472844939917e-06, + "loss": 1.8379, + "step": 11892 + }, + { + "epoch": 0.8952370198912286, + "grad_norm": 5.891862869262695, + "learning_rate": 2.9133708865490438e-06, + "loss": 1.9073, + "step": 11893 + }, + { + "epoch": 0.8953122941718888, + "grad_norm": 4.024796485900879, + "learning_rate": 2.9092717273231764e-06, + "loss": 1.7676, + "step": 11894 + }, + { + "epoch": 0.895387568452549, + "grad_norm": 6.878586292266846, + "learning_rate": 2.9051753675059745e-06, + "loss": 1.4376, + "step": 11895 + }, + { + "epoch": 0.8954628427332091, + "grad_norm": 5.703959941864014, + "learning_rate": 2.901081807340977e-06, + "loss": 1.7734, + "step": 11896 + }, + { + "epoch": 0.8955381170138693, + "grad_norm": 5.176022529602051, + "learning_rate": 2.8969910470715122e-06, + "loss": 1.8948, + "step": 11897 + }, + { + "epoch": 0.8956133912945294, + "grad_norm": 6.328505992889404, + "learning_rate": 2.892903086940779e-06, + "loss": 1.7422, + "step": 11898 + }, + { + "epoch": 0.8956886655751896, + "grad_norm": 5.516473293304443, + "learning_rate": 2.8888179271918e-06, + "loss": 1.5715, + "step": 11899 + }, + { + "epoch": 0.8957639398558498, + "grad_norm": 6.584245681762695, + "learning_rate": 2.8847355680674094e-06, + "loss": 1.7064, + "step": 11900 + }, + { + "epoch": 0.8958392141365099, + "grad_norm": 3.959855079650879, + "learning_rate": 2.8806560098103117e-06, + "loss": 1.6046, + "step": 11901 + }, + { + "epoch": 0.89591448841717, + "grad_norm": 6.0848493576049805, + "learning_rate": 2.876579252663003e-06, + "loss": 1.4078, + "step": 11902 + }, + { + "epoch": 0.8959897626978303, + "grad_norm": 6.37382698059082, + "learning_rate": 2.8725052968678502e-06, + "loss": 1.6613, + "step": 11903 + }, + { + "epoch": 0.8960650369784904, + "grad_norm": 5.751524925231934, + "learning_rate": 2.8684341426670316e-06, + "loss": 1.6956, + "step": 11904 + }, + { + "epoch": 0.8961403112591505, + "grad_norm": 6.415385723114014, + "learning_rate": 2.8643657903025647e-06, + "loss": 1.3696, + "step": 11905 + }, + { + "epoch": 0.8962155855398107, + "grad_norm": 8.60664176940918, + "learning_rate": 2.8603002400163003e-06, + "loss": 1.6802, + "step": 11906 + }, + { + "epoch": 0.8962908598204709, + "grad_norm": 6.551266193389893, + "learning_rate": 2.85623749204994e-06, + "loss": 1.4695, + "step": 11907 + }, + { + "epoch": 0.896366134101131, + "grad_norm": 5.4599127769470215, + "learning_rate": 2.852177546644985e-06, + "loss": 1.6816, + "step": 11908 + }, + { + "epoch": 0.8964414083817912, + "grad_norm": 5.290742874145508, + "learning_rate": 2.8481204040427923e-06, + "loss": 1.5421, + "step": 11909 + }, + { + "epoch": 0.8965166826624513, + "grad_norm": 5.019072532653809, + "learning_rate": 2.844066064484546e-06, + "loss": 1.7126, + "step": 11910 + }, + { + "epoch": 0.8965919569431114, + "grad_norm": 4.828590393066406, + "learning_rate": 2.8400145282112713e-06, + "loss": 1.7533, + "step": 11911 + }, + { + "epoch": 0.8966672312237716, + "grad_norm": 4.919812202453613, + "learning_rate": 2.835965795463813e-06, + "loss": 1.6237, + "step": 11912 + }, + { + "epoch": 0.8967425055044318, + "grad_norm": 8.098947525024414, + "learning_rate": 2.8319198664828628e-06, + "loss": 2.2638, + "step": 11913 + }, + { + "epoch": 0.8968177797850919, + "grad_norm": 4.93231725692749, + "learning_rate": 2.827876741508928e-06, + "loss": 1.5447, + "step": 11914 + }, + { + "epoch": 0.896893054065752, + "grad_norm": 5.701282501220703, + "learning_rate": 2.823836420782372e-06, + "loss": 1.477, + "step": 11915 + }, + { + "epoch": 0.8969683283464123, + "grad_norm": 5.879255771636963, + "learning_rate": 2.819798904543375e-06, + "loss": 2.0108, + "step": 11916 + }, + { + "epoch": 0.8970436026270724, + "grad_norm": 5.179291725158691, + "learning_rate": 2.8157641930319613e-06, + "loss": 1.677, + "step": 11917 + }, + { + "epoch": 0.8971188769077325, + "grad_norm": 7.153129577636719, + "learning_rate": 2.8117322864879834e-06, + "loss": 2.0414, + "step": 11918 + }, + { + "epoch": 0.8971941511883927, + "grad_norm": 5.386348724365234, + "learning_rate": 2.8077031851511173e-06, + "loss": 1.825, + "step": 11919 + }, + { + "epoch": 0.8972694254690529, + "grad_norm": 6.56143045425415, + "learning_rate": 2.8036768892608924e-06, + "loss": 1.6851, + "step": 11920 + }, + { + "epoch": 0.897344699749713, + "grad_norm": 7.8744049072265625, + "learning_rate": 2.7996533990566454e-06, + "loss": 1.6488, + "step": 11921 + }, + { + "epoch": 0.8974199740303732, + "grad_norm": 4.648986339569092, + "learning_rate": 2.795632714777585e-06, + "loss": 1.7521, + "step": 11922 + }, + { + "epoch": 0.8974952483110333, + "grad_norm": 5.307143688201904, + "learning_rate": 2.791614836662704e-06, + "loss": 1.7356, + "step": 11923 + }, + { + "epoch": 0.8975705225916935, + "grad_norm": 6.285475254058838, + "learning_rate": 2.7875997649508713e-06, + "loss": 1.8785, + "step": 11924 + }, + { + "epoch": 0.8976457968723537, + "grad_norm": 7.4974446296691895, + "learning_rate": 2.7835874998807576e-06, + "loss": 1.8508, + "step": 11925 + }, + { + "epoch": 0.8977210711530138, + "grad_norm": 6.573554992675781, + "learning_rate": 2.779578041690889e-06, + "loss": 2.2806, + "step": 11926 + }, + { + "epoch": 0.8977963454336739, + "grad_norm": 5.30454683303833, + "learning_rate": 2.7755713906196134e-06, + "loss": 1.8508, + "step": 11927 + }, + { + "epoch": 0.8978716197143342, + "grad_norm": 4.32321834564209, + "learning_rate": 2.7715675469051293e-06, + "loss": 1.8609, + "step": 11928 + }, + { + "epoch": 0.8979468939949943, + "grad_norm": 6.064350128173828, + "learning_rate": 2.7675665107854298e-06, + "loss": 1.8117, + "step": 11929 + }, + { + "epoch": 0.8980221682756544, + "grad_norm": 6.371723175048828, + "learning_rate": 2.7635682824983743e-06, + "loss": 1.4551, + "step": 11930 + }, + { + "epoch": 0.8980974425563145, + "grad_norm": 5.655940532684326, + "learning_rate": 2.759572862281662e-06, + "loss": 1.7116, + "step": 11931 + }, + { + "epoch": 0.8981727168369747, + "grad_norm": 5.6205902099609375, + "learning_rate": 2.75558025037278e-06, + "loss": 2.4975, + "step": 11932 + }, + { + "epoch": 0.8982479911176349, + "grad_norm": 4.355203151702881, + "learning_rate": 2.751590447009106e-06, + "loss": 1.5173, + "step": 11933 + }, + { + "epoch": 0.898323265398295, + "grad_norm": 7.522546768188477, + "learning_rate": 2.747603452427794e-06, + "loss": 1.3424, + "step": 11934 + }, + { + "epoch": 0.8983985396789552, + "grad_norm": 7.461538791656494, + "learning_rate": 2.7436192668658876e-06, + "loss": 1.7813, + "step": 11935 + }, + { + "epoch": 0.8984738139596153, + "grad_norm": 4.0410542488098145, + "learning_rate": 2.739637890560215e-06, + "loss": 1.7704, + "step": 11936 + }, + { + "epoch": 0.8985490882402755, + "grad_norm": 5.376936912536621, + "learning_rate": 2.7356593237474636e-06, + "loss": 1.4698, + "step": 11937 + }, + { + "epoch": 0.8986243625209357, + "grad_norm": 5.933337688446045, + "learning_rate": 2.7316835666641504e-06, + "loss": 1.8077, + "step": 11938 + }, + { + "epoch": 0.8986996368015958, + "grad_norm": 5.833147048950195, + "learning_rate": 2.727710619546625e-06, + "loss": 1.9333, + "step": 11939 + }, + { + "epoch": 0.8987749110822559, + "grad_norm": 4.419341564178467, + "learning_rate": 2.72374048263106e-06, + "loss": 1.9527, + "step": 11940 + }, + { + "epoch": 0.8988501853629162, + "grad_norm": 5.924996376037598, + "learning_rate": 2.7197731561534767e-06, + "loss": 1.6099, + "step": 11941 + }, + { + "epoch": 0.8989254596435763, + "grad_norm": 6.665846824645996, + "learning_rate": 2.7158086403497097e-06, + "loss": 1.6098, + "step": 11942 + }, + { + "epoch": 0.8990007339242364, + "grad_norm": 6.3641767501831055, + "learning_rate": 2.711846935455453e-06, + "loss": 2.033, + "step": 11943 + }, + { + "epoch": 0.8990760082048966, + "grad_norm": 4.893651485443115, + "learning_rate": 2.707888041706208e-06, + "loss": 1.6455, + "step": 11944 + }, + { + "epoch": 0.8991512824855568, + "grad_norm": 5.841111183166504, + "learning_rate": 2.7039319593373246e-06, + "loss": 1.9816, + "step": 11945 + }, + { + "epoch": 0.8992265567662169, + "grad_norm": 5.636909008026123, + "learning_rate": 2.699978688583976e-06, + "loss": 1.4573, + "step": 11946 + }, + { + "epoch": 0.8993018310468771, + "grad_norm": 5.647894382476807, + "learning_rate": 2.696028229681169e-06, + "loss": 1.9749, + "step": 11947 + }, + { + "epoch": 0.8993771053275372, + "grad_norm": 6.1011881828308105, + "learning_rate": 2.69208058286376e-06, + "loss": 1.8956, + "step": 11948 + }, + { + "epoch": 0.8994523796081973, + "grad_norm": 6.474798679351807, + "learning_rate": 2.6881357483664216e-06, + "loss": 2.0522, + "step": 11949 + }, + { + "epoch": 0.8995276538888575, + "grad_norm": 5.42941427230835, + "learning_rate": 2.684193726423656e-06, + "loss": 1.5721, + "step": 11950 + }, + { + "epoch": 0.8996029281695177, + "grad_norm": 5.032866477966309, + "learning_rate": 2.680254517269809e-06, + "loss": 2.2334, + "step": 11951 + }, + { + "epoch": 0.8996782024501778, + "grad_norm": 6.370822429656982, + "learning_rate": 2.6763181211390596e-06, + "loss": 1.8079, + "step": 11952 + }, + { + "epoch": 0.8997534767308379, + "grad_norm": 4.981447219848633, + "learning_rate": 2.6723845382654043e-06, + "loss": 1.601, + "step": 11953 + }, + { + "epoch": 0.8998287510114982, + "grad_norm": 8.518513679504395, + "learning_rate": 2.6684537688827006e-06, + "loss": 1.7817, + "step": 11954 + }, + { + "epoch": 0.8999040252921583, + "grad_norm": 6.082557201385498, + "learning_rate": 2.6645258132245997e-06, + "loss": 1.6746, + "step": 11955 + }, + { + "epoch": 0.8999792995728184, + "grad_norm": 5.060088157653809, + "learning_rate": 2.6606006715246268e-06, + "loss": 1.9312, + "step": 11956 + }, + { + "epoch": 0.9000545738534786, + "grad_norm": 5.059937477111816, + "learning_rate": 2.656678344016106e-06, + "loss": 1.754, + "step": 11957 + }, + { + "epoch": 0.9001298481341388, + "grad_norm": 5.644674777984619, + "learning_rate": 2.6527588309322226e-06, + "loss": 1.369, + "step": 11958 + }, + { + "epoch": 0.9002051224147989, + "grad_norm": 5.513916015625, + "learning_rate": 2.648842132505963e-06, + "loss": 2.0142, + "step": 11959 + }, + { + "epoch": 0.9002803966954591, + "grad_norm": 4.713348388671875, + "learning_rate": 2.6449282489701744e-06, + "loss": 1.4611, + "step": 11960 + }, + { + "epoch": 0.9003556709761192, + "grad_norm": 6.027116775512695, + "learning_rate": 2.6410171805575203e-06, + "loss": 1.8902, + "step": 11961 + }, + { + "epoch": 0.9004309452567794, + "grad_norm": 6.251559734344482, + "learning_rate": 2.637108927500509e-06, + "loss": 1.4345, + "step": 11962 + }, + { + "epoch": 0.9005062195374396, + "grad_norm": 5.564704895019531, + "learning_rate": 2.6332034900314827e-06, + "loss": 1.9362, + "step": 11963 + }, + { + "epoch": 0.9005814938180997, + "grad_norm": 5.13485050201416, + "learning_rate": 2.6293008683825893e-06, + "loss": 1.5188, + "step": 11964 + }, + { + "epoch": 0.9006567680987598, + "grad_norm": 4.393579959869385, + "learning_rate": 2.6254010627858426e-06, + "loss": 1.3775, + "step": 11965 + }, + { + "epoch": 0.9007320423794201, + "grad_norm": 5.7572760581970215, + "learning_rate": 2.6215040734730633e-06, + "loss": 1.3876, + "step": 11966 + }, + { + "epoch": 0.9008073166600802, + "grad_norm": 5.004550933837891, + "learning_rate": 2.6176099006759324e-06, + "loss": 1.9273, + "step": 11967 + }, + { + "epoch": 0.9008825909407403, + "grad_norm": 4.437585353851318, + "learning_rate": 2.6137185446259315e-06, + "loss": 1.7927, + "step": 11968 + }, + { + "epoch": 0.9009578652214005, + "grad_norm": 5.320894718170166, + "learning_rate": 2.6098300055543978e-06, + "loss": 2.158, + "step": 11969 + }, + { + "epoch": 0.9010331395020607, + "grad_norm": 5.74613618850708, + "learning_rate": 2.6059442836924906e-06, + "loss": 1.7708, + "step": 11970 + }, + { + "epoch": 0.9011084137827208, + "grad_norm": 6.124148368835449, + "learning_rate": 2.6020613792712032e-06, + "loss": 1.6836, + "step": 11971 + }, + { + "epoch": 0.9011836880633809, + "grad_norm": 6.6365485191345215, + "learning_rate": 2.598181292521373e-06, + "loss": 1.8699, + "step": 11972 + }, + { + "epoch": 0.9012589623440411, + "grad_norm": 4.937188148498535, + "learning_rate": 2.594304023673655e-06, + "loss": 1.6145, + "step": 11973 + }, + { + "epoch": 0.9013342366247012, + "grad_norm": 4.8288397789001465, + "learning_rate": 2.590429572958536e-06, + "loss": 1.1911, + "step": 11974 + }, + { + "epoch": 0.9014095109053614, + "grad_norm": 4.601717948913574, + "learning_rate": 2.5865579406063488e-06, + "loss": 1.4361, + "step": 11975 + }, + { + "epoch": 0.9014847851860216, + "grad_norm": 5.0262651443481445, + "learning_rate": 2.5826891268472485e-06, + "loss": 1.8421, + "step": 11976 + }, + { + "epoch": 0.9015600594666817, + "grad_norm": 5.285348892211914, + "learning_rate": 2.578823131911223e-06, + "loss": 1.5288, + "step": 11977 + }, + { + "epoch": 0.9016353337473418, + "grad_norm": 7.302457809448242, + "learning_rate": 2.5749599560280947e-06, + "loss": 1.4515, + "step": 11978 + }, + { + "epoch": 0.9017106080280021, + "grad_norm": 5.04168176651001, + "learning_rate": 2.5710995994275176e-06, + "loss": 1.7501, + "step": 11979 + }, + { + "epoch": 0.9017858823086622, + "grad_norm": 4.656704902648926, + "learning_rate": 2.567242062338987e-06, + "loss": 1.823, + "step": 11980 + }, + { + "epoch": 0.9018611565893223, + "grad_norm": 5.510639190673828, + "learning_rate": 2.563387344991808e-06, + "loss": 1.6198, + "step": 11981 + }, + { + "epoch": 0.9019364308699825, + "grad_norm": 7.097165584564209, + "learning_rate": 2.5595354476151413e-06, + "loss": 1.8351, + "step": 11982 + }, + { + "epoch": 0.9020117051506427, + "grad_norm": 5.188054084777832, + "learning_rate": 2.555686370437971e-06, + "loss": 1.826, + "step": 11983 + }, + { + "epoch": 0.9020869794313028, + "grad_norm": 5.7781171798706055, + "learning_rate": 2.5518401136891134e-06, + "loss": 1.6859, + "step": 11984 + }, + { + "epoch": 0.902162253711963, + "grad_norm": 4.948212623596191, + "learning_rate": 2.5479966775972198e-06, + "loss": 1.434, + "step": 11985 + }, + { + "epoch": 0.9022375279926231, + "grad_norm": 5.777616500854492, + "learning_rate": 2.544156062390768e-06, + "loss": 1.5527, + "step": 11986 + }, + { + "epoch": 0.9023128022732833, + "grad_norm": 5.043802261352539, + "learning_rate": 2.54031826829807e-06, + "loss": 1.4769, + "step": 11987 + }, + { + "epoch": 0.9023880765539435, + "grad_norm": 7.5450029373168945, + "learning_rate": 2.536483295547282e-06, + "loss": 2.1653, + "step": 11988 + }, + { + "epoch": 0.9024633508346036, + "grad_norm": 5.307740211486816, + "learning_rate": 2.53265114436636e-06, + "loss": 1.6227, + "step": 11989 + }, + { + "epoch": 0.9025386251152637, + "grad_norm": 5.527013301849365, + "learning_rate": 2.52882181498314e-06, + "loss": 1.5562, + "step": 11990 + }, + { + "epoch": 0.9026138993959238, + "grad_norm": 4.846912384033203, + "learning_rate": 2.5249953076252496e-06, + "loss": 1.7455, + "step": 11991 + }, + { + "epoch": 0.9026891736765841, + "grad_norm": 4.152447700500488, + "learning_rate": 2.5211716225201632e-06, + "loss": 1.7088, + "step": 11992 + }, + { + "epoch": 0.9027644479572442, + "grad_norm": 5.484810829162598, + "learning_rate": 2.5173507598951927e-06, + "loss": 1.6365, + "step": 11993 + }, + { + "epoch": 0.9028397222379043, + "grad_norm": 4.570249557495117, + "learning_rate": 2.513532719977474e-06, + "loss": 1.8197, + "step": 11994 + }, + { + "epoch": 0.9029149965185645, + "grad_norm": 6.993124485015869, + "learning_rate": 2.509717502993991e-06, + "loss": 1.7005, + "step": 11995 + }, + { + "epoch": 0.9029902707992247, + "grad_norm": 4.189418315887451, + "learning_rate": 2.5059051091715303e-06, + "loss": 1.6685, + "step": 11996 + }, + { + "epoch": 0.9030655450798848, + "grad_norm": 4.5827250480651855, + "learning_rate": 2.5020955387367372e-06, + "loss": 1.5126, + "step": 11997 + }, + { + "epoch": 0.903140819360545, + "grad_norm": 4.651432037353516, + "learning_rate": 2.498288791916076e-06, + "loss": 1.9041, + "step": 11998 + }, + { + "epoch": 0.9032160936412051, + "grad_norm": 4.879321098327637, + "learning_rate": 2.4944848689358534e-06, + "loss": 1.7801, + "step": 11999 + }, + { + "epoch": 0.9032913679218653, + "grad_norm": 4.520389556884766, + "learning_rate": 2.4906837700221896e-06, + "loss": 1.7018, + "step": 12000 + }, + { + "epoch": 0.9033666422025255, + "grad_norm": 5.30260705947876, + "learning_rate": 2.4868854954010645e-06, + "loss": 1.8589, + "step": 12001 + }, + { + "epoch": 0.9034419164831856, + "grad_norm": 5.72796630859375, + "learning_rate": 2.4830900452982587e-06, + "loss": 1.4862, + "step": 12002 + }, + { + "epoch": 0.9035171907638457, + "grad_norm": 5.147854328155518, + "learning_rate": 2.479297419939419e-06, + "loss": 1.7129, + "step": 12003 + }, + { + "epoch": 0.903592465044506, + "grad_norm": 5.8284406661987305, + "learning_rate": 2.475507619549983e-06, + "loss": 1.7856, + "step": 12004 + }, + { + "epoch": 0.9036677393251661, + "grad_norm": 5.082967281341553, + "learning_rate": 2.471720644355258e-06, + "loss": 1.9227, + "step": 12005 + }, + { + "epoch": 0.9037430136058262, + "grad_norm": 5.6202521324157715, + "learning_rate": 2.467936494580364e-06, + "loss": 1.731, + "step": 12006 + }, + { + "epoch": 0.9038182878864864, + "grad_norm": 4.830552577972412, + "learning_rate": 2.464155170450272e-06, + "loss": 1.482, + "step": 12007 + }, + { + "epoch": 0.9038935621671466, + "grad_norm": 8.249223709106445, + "learning_rate": 2.4603766721897513e-06, + "loss": 1.5037, + "step": 12008 + }, + { + "epoch": 0.9039688364478067, + "grad_norm": 4.685480117797852, + "learning_rate": 2.4566010000234397e-06, + "loss": 1.6392, + "step": 12009 + }, + { + "epoch": 0.9040441107284668, + "grad_norm": 5.3261494636535645, + "learning_rate": 2.452828154175768e-06, + "loss": 1.6821, + "step": 12010 + }, + { + "epoch": 0.904119385009127, + "grad_norm": 6.012526988983154, + "learning_rate": 2.4490581348710408e-06, + "loss": 1.6481, + "step": 12011 + }, + { + "epoch": 0.9041946592897872, + "grad_norm": 5.432146072387695, + "learning_rate": 2.4452909423333723e-06, + "loss": 1.8019, + "step": 12012 + }, + { + "epoch": 0.9042699335704473, + "grad_norm": 5.548080921173096, + "learning_rate": 2.4415265767867014e-06, + "loss": 1.7308, + "step": 12013 + }, + { + "epoch": 0.9043452078511075, + "grad_norm": 9.11673641204834, + "learning_rate": 2.43776503845482e-06, + "loss": 1.9102, + "step": 12014 + }, + { + "epoch": 0.9044204821317676, + "grad_norm": 4.765735626220703, + "learning_rate": 2.4340063275613335e-06, + "loss": 1.5548, + "step": 12015 + }, + { + "epoch": 0.9044957564124277, + "grad_norm": 4.7274909019470215, + "learning_rate": 2.4302504443296848e-06, + "loss": 1.7932, + "step": 12016 + }, + { + "epoch": 0.904571030693088, + "grad_norm": 5.0326666831970215, + "learning_rate": 2.4264973889831565e-06, + "loss": 1.6462, + "step": 12017 + }, + { + "epoch": 0.9046463049737481, + "grad_norm": 4.561138153076172, + "learning_rate": 2.4227471617448596e-06, + "loss": 1.5056, + "step": 12018 + }, + { + "epoch": 0.9047215792544082, + "grad_norm": 6.14715051651001, + "learning_rate": 2.4189997628377205e-06, + "loss": 2.3344, + "step": 12019 + }, + { + "epoch": 0.9047968535350684, + "grad_norm": 8.388001441955566, + "learning_rate": 2.4152551924845334e-06, + "loss": 1.8963, + "step": 12020 + }, + { + "epoch": 0.9048721278157286, + "grad_norm": 7.677555561065674, + "learning_rate": 2.411513450907876e-06, + "loss": 2.1121, + "step": 12021 + }, + { + "epoch": 0.9049474020963887, + "grad_norm": 5.5138325691223145, + "learning_rate": 2.4077745383302097e-06, + "loss": 1.6134, + "step": 12022 + }, + { + "epoch": 0.9050226763770489, + "grad_norm": 5.14309024810791, + "learning_rate": 2.404038454973778e-06, + "loss": 1.7937, + "step": 12023 + }, + { + "epoch": 0.905097950657709, + "grad_norm": 5.361375331878662, + "learning_rate": 2.400305201060704e-06, + "loss": 1.5505, + "step": 12024 + }, + { + "epoch": 0.9051732249383692, + "grad_norm": 5.180974006652832, + "learning_rate": 2.3965747768128933e-06, + "loss": 1.941, + "step": 12025 + }, + { + "epoch": 0.9052484992190294, + "grad_norm": 5.114887237548828, + "learning_rate": 2.3928471824521293e-06, + "loss": 2.0865, + "step": 12026 + }, + { + "epoch": 0.9053237734996895, + "grad_norm": 5.358067512512207, + "learning_rate": 2.389122418199996e-06, + "loss": 1.6241, + "step": 12027 + }, + { + "epoch": 0.9053990477803496, + "grad_norm": 6.268847465515137, + "learning_rate": 2.3854004842779277e-06, + "loss": 1.8622, + "step": 12028 + }, + { + "epoch": 0.9054743220610098, + "grad_norm": 7.79710578918457, + "learning_rate": 2.38168138090718e-06, + "loss": 1.7554, + "step": 12029 + }, + { + "epoch": 0.90554959634167, + "grad_norm": 6.9985032081604, + "learning_rate": 2.3779651083088384e-06, + "loss": 1.5325, + "step": 12030 + }, + { + "epoch": 0.9056248706223301, + "grad_norm": 6.117090225219727, + "learning_rate": 2.374251666703836e-06, + "loss": 1.6224, + "step": 12031 + }, + { + "epoch": 0.9057001449029902, + "grad_norm": 4.519183158874512, + "learning_rate": 2.3705410563129137e-06, + "loss": 1.7283, + "step": 12032 + }, + { + "epoch": 0.9057754191836505, + "grad_norm": 6.158343315124512, + "learning_rate": 2.3668332773566617e-06, + "loss": 1.7579, + "step": 12033 + }, + { + "epoch": 0.9058506934643106, + "grad_norm": 6.365387916564941, + "learning_rate": 2.363128330055492e-06, + "loss": 1.8, + "step": 12034 + }, + { + "epoch": 0.9059259677449707, + "grad_norm": 4.843330383300781, + "learning_rate": 2.3594262146296676e-06, + "loss": 1.6792, + "step": 12035 + }, + { + "epoch": 0.9060012420256309, + "grad_norm": 4.804835319519043, + "learning_rate": 2.3557269312992514e-06, + "loss": 1.8734, + "step": 12036 + }, + { + "epoch": 0.906076516306291, + "grad_norm": 5.723232269287109, + "learning_rate": 2.352030480284162e-06, + "loss": 1.9366, + "step": 12037 + }, + { + "epoch": 0.9061517905869512, + "grad_norm": 3.8502132892608643, + "learning_rate": 2.3483368618041403e-06, + "loss": 1.5554, + "step": 12038 + }, + { + "epoch": 0.9062270648676114, + "grad_norm": 5.516935348510742, + "learning_rate": 2.3446460760787713e-06, + "loss": 1.8526, + "step": 12039 + }, + { + "epoch": 0.9063023391482715, + "grad_norm": 6.163309574127197, + "learning_rate": 2.3409581233274468e-06, + "loss": 1.674, + "step": 12040 + }, + { + "epoch": 0.9063776134289316, + "grad_norm": 4.91181755065918, + "learning_rate": 2.3372730037694245e-06, + "loss": 1.7659, + "step": 12041 + }, + { + "epoch": 0.9064528877095919, + "grad_norm": 5.466695308685303, + "learning_rate": 2.3335907176237513e-06, + "loss": 1.7764, + "step": 12042 + }, + { + "epoch": 0.906528161990252, + "grad_norm": 5.5561957359313965, + "learning_rate": 2.3299112651093414e-06, + "loss": 1.5113, + "step": 12043 + }, + { + "epoch": 0.9066034362709121, + "grad_norm": 5.407498836517334, + "learning_rate": 2.3262346464449358e-06, + "loss": 1.5861, + "step": 12044 + }, + { + "epoch": 0.9066787105515723, + "grad_norm": 4.9287028312683105, + "learning_rate": 2.3225608618490766e-06, + "loss": 1.5724, + "step": 12045 + }, + { + "epoch": 0.9067539848322325, + "grad_norm": 5.3105549812316895, + "learning_rate": 2.318889911540184e-06, + "loss": 1.661, + "step": 12046 + }, + { + "epoch": 0.9068292591128926, + "grad_norm": 4.929981708526611, + "learning_rate": 2.3152217957364607e-06, + "loss": 2.1262, + "step": 12047 + }, + { + "epoch": 0.9069045333935527, + "grad_norm": 7.421399116516113, + "learning_rate": 2.3115565146559826e-06, + "loss": 1.8894, + "step": 12048 + }, + { + "epoch": 0.9069798076742129, + "grad_norm": 5.4301862716674805, + "learning_rate": 2.3078940685166364e-06, + "loss": 1.808, + "step": 12049 + }, + { + "epoch": 0.907055081954873, + "grad_norm": 4.186066150665283, + "learning_rate": 2.304234457536153e-06, + "loss": 2.0256, + "step": 12050 + }, + { + "epoch": 0.9071303562355332, + "grad_norm": 5.003772258758545, + "learning_rate": 2.3005776819320648e-06, + "loss": 1.8191, + "step": 12051 + }, + { + "epoch": 0.9072056305161934, + "grad_norm": 6.509251594543457, + "learning_rate": 2.2969237419217802e-06, + "loss": 1.512, + "step": 12052 + }, + { + "epoch": 0.9072809047968535, + "grad_norm": 6.0294880867004395, + "learning_rate": 2.293272637722493e-06, + "loss": 2.0606, + "step": 12053 + }, + { + "epoch": 0.9073561790775136, + "grad_norm": 4.9922194480896, + "learning_rate": 2.289624369551269e-06, + "loss": 1.547, + "step": 12054 + }, + { + "epoch": 0.9074314533581739, + "grad_norm": 6.61724328994751, + "learning_rate": 2.2859789376249718e-06, + "loss": 1.7804, + "step": 12055 + }, + { + "epoch": 0.907506727638834, + "grad_norm": 7.011882305145264, + "learning_rate": 2.2823363421603294e-06, + "loss": 1.8639, + "step": 12056 + }, + { + "epoch": 0.9075820019194941, + "grad_norm": 5.678738594055176, + "learning_rate": 2.2786965833738684e-06, + "loss": 1.5468, + "step": 12057 + }, + { + "epoch": 0.9076572762001544, + "grad_norm": 4.543079376220703, + "learning_rate": 2.2750596614819708e-06, + "loss": 1.7992, + "step": 12058 + }, + { + "epoch": 0.9077325504808145, + "grad_norm": 5.744643211364746, + "learning_rate": 2.271425576700831e-06, + "loss": 1.9835, + "step": 12059 + }, + { + "epoch": 0.9078078247614746, + "grad_norm": 5.307352066040039, + "learning_rate": 2.267794329246498e-06, + "loss": 1.43, + "step": 12060 + }, + { + "epoch": 0.9078830990421348, + "grad_norm": 5.3971452713012695, + "learning_rate": 2.264165919334826e-06, + "loss": 1.5944, + "step": 12061 + }, + { + "epoch": 0.9079583733227949, + "grad_norm": 4.742458820343018, + "learning_rate": 2.2605403471815278e-06, + "loss": 1.6461, + "step": 12062 + }, + { + "epoch": 0.9080336476034551, + "grad_norm": 5.145166397094727, + "learning_rate": 2.256917613002124e-06, + "loss": 1.6588, + "step": 12063 + }, + { + "epoch": 0.9081089218841153, + "grad_norm": 5.413640022277832, + "learning_rate": 2.2532977170119764e-06, + "loss": 1.8302, + "step": 12064 + }, + { + "epoch": 0.9081841961647754, + "grad_norm": 4.933236598968506, + "learning_rate": 2.2496806594262842e-06, + "loss": 1.8759, + "step": 12065 + }, + { + "epoch": 0.9082594704454355, + "grad_norm": 4.591185569763184, + "learning_rate": 2.246066440460065e-06, + "loss": 1.7334, + "step": 12066 + }, + { + "epoch": 0.9083347447260958, + "grad_norm": 4.764317035675049, + "learning_rate": 2.2424550603281746e-06, + "loss": 1.5602, + "step": 12067 + }, + { + "epoch": 0.9084100190067559, + "grad_norm": 6.722568988800049, + "learning_rate": 2.2388465192452914e-06, + "loss": 1.7871, + "step": 12068 + }, + { + "epoch": 0.908485293287416, + "grad_norm": 4.362168312072754, + "learning_rate": 2.2352408174259487e-06, + "loss": 1.6478, + "step": 12069 + }, + { + "epoch": 0.9085605675680761, + "grad_norm": 5.946048736572266, + "learning_rate": 2.2316379550844815e-06, + "loss": 1.7711, + "step": 12070 + }, + { + "epoch": 0.9086358418487364, + "grad_norm": 5.6206464767456055, + "learning_rate": 2.2280379324350785e-06, + "loss": 1.9079, + "step": 12071 + }, + { + "epoch": 0.9087111161293965, + "grad_norm": 6.771544456481934, + "learning_rate": 2.224440749691742e-06, + "loss": 1.4976, + "step": 12072 + }, + { + "epoch": 0.9087863904100566, + "grad_norm": 7.210448741912842, + "learning_rate": 2.220846407068328e-06, + "loss": 1.8177, + "step": 12073 + }, + { + "epoch": 0.9088616646907168, + "grad_norm": 4.924471378326416, + "learning_rate": 2.217254904778493e-06, + "loss": 1.5207, + "step": 12074 + }, + { + "epoch": 0.908936938971377, + "grad_norm": 4.953563690185547, + "learning_rate": 2.21366624303575e-06, + "loss": 1.4749, + "step": 12075 + }, + { + "epoch": 0.9090122132520371, + "grad_norm": 3.6346771717071533, + "learning_rate": 2.2100804220534454e-06, + "loss": 1.515, + "step": 12076 + }, + { + "epoch": 0.9090874875326973, + "grad_norm": 5.833499908447266, + "learning_rate": 2.206497442044725e-06, + "loss": 2.204, + "step": 12077 + }, + { + "epoch": 0.9091627618133574, + "grad_norm": 4.451202869415283, + "learning_rate": 2.202917303222607e-06, + "loss": 1.6256, + "step": 12078 + }, + { + "epoch": 0.9092380360940175, + "grad_norm": 4.531048774719238, + "learning_rate": 2.1993400057998994e-06, + "loss": 1.7475, + "step": 12079 + }, + { + "epoch": 0.9093133103746778, + "grad_norm": 6.64673376083374, + "learning_rate": 2.195765549989276e-06, + "loss": 1.7005, + "step": 12080 + }, + { + "epoch": 0.9093885846553379, + "grad_norm": 6.584712505340576, + "learning_rate": 2.192193936003223e-06, + "loss": 1.6248, + "step": 12081 + }, + { + "epoch": 0.909463858935998, + "grad_norm": 5.862979412078857, + "learning_rate": 2.1886251640540643e-06, + "loss": 1.8505, + "step": 12082 + }, + { + "epoch": 0.9095391332166582, + "grad_norm": 5.620852947235107, + "learning_rate": 2.1850592343539532e-06, + "loss": 1.6635, + "step": 12083 + }, + { + "epoch": 0.9096144074973184, + "grad_norm": 5.809586524963379, + "learning_rate": 2.181496147114881e-06, + "loss": 1.9142, + "step": 12084 + }, + { + "epoch": 0.9096896817779785, + "grad_norm": 4.972949504852295, + "learning_rate": 2.1779359025486503e-06, + "loss": 2.0363, + "step": 12085 + }, + { + "epoch": 0.9097649560586387, + "grad_norm": 6.226766109466553, + "learning_rate": 2.1743785008669194e-06, + "loss": 1.7471, + "step": 12086 + }, + { + "epoch": 0.9098402303392988, + "grad_norm": 6.337262153625488, + "learning_rate": 2.1708239422811525e-06, + "loss": 1.5184, + "step": 12087 + }, + { + "epoch": 0.909915504619959, + "grad_norm": 4.261514663696289, + "learning_rate": 2.167272227002676e-06, + "loss": 1.2308, + "step": 12088 + }, + { + "epoch": 0.9099907789006191, + "grad_norm": 6.280251502990723, + "learning_rate": 2.1637233552426028e-06, + "loss": 1.3626, + "step": 12089 + }, + { + "epoch": 0.9100660531812793, + "grad_norm": 6.412320137023926, + "learning_rate": 2.160177327211932e-06, + "loss": 1.4087, + "step": 12090 + }, + { + "epoch": 0.9101413274619394, + "grad_norm": 6.003942489624023, + "learning_rate": 2.1566341431214443e-06, + "loss": 1.2439, + "step": 12091 + }, + { + "epoch": 0.9102166017425996, + "grad_norm": 4.7945966720581055, + "learning_rate": 2.1530938031817714e-06, + "loss": 1.9089, + "step": 12092 + }, + { + "epoch": 0.9102918760232598, + "grad_norm": 6.132021427154541, + "learning_rate": 2.1495563076033896e-06, + "loss": 1.8737, + "step": 12093 + }, + { + "epoch": 0.9103671503039199, + "grad_norm": 4.359644889831543, + "learning_rate": 2.146021656596586e-06, + "loss": 1.8186, + "step": 12094 + }, + { + "epoch": 0.91044242458458, + "grad_norm": 5.697770118713379, + "learning_rate": 2.1424898503714985e-06, + "loss": 1.7652, + "step": 12095 + }, + { + "epoch": 0.9105176988652403, + "grad_norm": 4.348025798797607, + "learning_rate": 2.1389608891380584e-06, + "loss": 1.7078, + "step": 12096 + }, + { + "epoch": 0.9105929731459004, + "grad_norm": 5.43589448928833, + "learning_rate": 2.1354347731060707e-06, + "loss": 1.8083, + "step": 12097 + }, + { + "epoch": 0.9106682474265605, + "grad_norm": 5.264846324920654, + "learning_rate": 2.13191150248514e-06, + "loss": 1.744, + "step": 12098 + }, + { + "epoch": 0.9107435217072207, + "grad_norm": 5.35165548324585, + "learning_rate": 2.1283910774847315e-06, + "loss": 1.5764, + "step": 12099 + }, + { + "epoch": 0.9108187959878808, + "grad_norm": 4.15131139755249, + "learning_rate": 2.124873498314106e-06, + "loss": 1.4299, + "step": 12100 + }, + { + "epoch": 0.910894070268541, + "grad_norm": 5.66389274597168, + "learning_rate": 2.121358765182385e-06, + "loss": 1.8705, + "step": 12101 + }, + { + "epoch": 0.9109693445492012, + "grad_norm": 5.436544895172119, + "learning_rate": 2.1178468782985074e-06, + "loss": 1.7606, + "step": 12102 + }, + { + "epoch": 0.9110446188298613, + "grad_norm": 6.734651565551758, + "learning_rate": 2.1143378378712388e-06, + "loss": 1.7211, + "step": 12103 + }, + { + "epoch": 0.9111198931105214, + "grad_norm": 5.556075096130371, + "learning_rate": 2.1108316441091904e-06, + "loss": 1.7475, + "step": 12104 + }, + { + "epoch": 0.9111951673911817, + "grad_norm": 4.562472343444824, + "learning_rate": 2.10732829722079e-06, + "loss": 1.6336, + "step": 12105 + }, + { + "epoch": 0.9112704416718418, + "grad_norm": 6.423446178436279, + "learning_rate": 2.1038277974142982e-06, + "loss": 1.8875, + "step": 12106 + }, + { + "epoch": 0.9113457159525019, + "grad_norm": 4.994909763336182, + "learning_rate": 2.100330144897822e-06, + "loss": 1.4374, + "step": 12107 + }, + { + "epoch": 0.911420990233162, + "grad_norm": 4.016394138336182, + "learning_rate": 2.0968353398792773e-06, + "loss": 1.525, + "step": 12108 + }, + { + "epoch": 0.9114962645138223, + "grad_norm": 4.11951208114624, + "learning_rate": 2.0933433825664206e-06, + "loss": 1.5314, + "step": 12109 + }, + { + "epoch": 0.9115715387944824, + "grad_norm": 6.115981101989746, + "learning_rate": 2.0898542731668415e-06, + "loss": 1.7548, + "step": 12110 + }, + { + "epoch": 0.9116468130751425, + "grad_norm": 5.9816789627075195, + "learning_rate": 2.086368011887957e-06, + "loss": 1.7514, + "step": 12111 + }, + { + "epoch": 0.9117220873558027, + "grad_norm": 5.767557144165039, + "learning_rate": 2.0828845989370127e-06, + "loss": 1.5362, + "step": 12112 + }, + { + "epoch": 0.9117973616364629, + "grad_norm": 4.936404705047607, + "learning_rate": 2.0794040345210874e-06, + "loss": 1.9594, + "step": 12113 + }, + { + "epoch": 0.911872635917123, + "grad_norm": 4.490144729614258, + "learning_rate": 2.075926318847099e-06, + "loss": 1.9167, + "step": 12114 + }, + { + "epoch": 0.9119479101977832, + "grad_norm": 7.167920112609863, + "learning_rate": 2.0724514521217764e-06, + "loss": 1.7929, + "step": 12115 + }, + { + "epoch": 0.9120231844784433, + "grad_norm": 5.424349784851074, + "learning_rate": 2.0689794345516935e-06, + "loss": 1.5981, + "step": 12116 + }, + { + "epoch": 0.9120984587591034, + "grad_norm": 4.81858491897583, + "learning_rate": 2.065510266343257e-06, + "loss": 1.5832, + "step": 12117 + }, + { + "epoch": 0.9121737330397637, + "grad_norm": 7.445239543914795, + "learning_rate": 2.0620439477026966e-06, + "loss": 1.8212, + "step": 12118 + }, + { + "epoch": 0.9122490073204238, + "grad_norm": 6.772849082946777, + "learning_rate": 2.0585804788360696e-06, + "loss": 1.9515, + "step": 12119 + }, + { + "epoch": 0.9123242816010839, + "grad_norm": 5.177568435668945, + "learning_rate": 2.055119859949284e-06, + "loss": 1.9221, + "step": 12120 + }, + { + "epoch": 0.9123995558817442, + "grad_norm": 5.158840179443359, + "learning_rate": 2.051662091248041e-06, + "loss": 1.495, + "step": 12121 + }, + { + "epoch": 0.9124748301624043, + "grad_norm": 5.418978214263916, + "learning_rate": 2.048207172937916e-06, + "loss": 1.3839, + "step": 12122 + }, + { + "epoch": 0.9125501044430644, + "grad_norm": 5.125420093536377, + "learning_rate": 2.044755105224283e-06, + "loss": 1.3722, + "step": 12123 + }, + { + "epoch": 0.9126253787237246, + "grad_norm": 5.5098795890808105, + "learning_rate": 2.041305888312356e-06, + "loss": 2.0321, + "step": 12124 + }, + { + "epoch": 0.9127006530043847, + "grad_norm": 4.938101291656494, + "learning_rate": 2.037859522407193e-06, + "loss": 1.6111, + "step": 12125 + }, + { + "epoch": 0.9127759272850449, + "grad_norm": 5.382662773132324, + "learning_rate": 2.034416007713652e-06, + "loss": 1.3861, + "step": 12126 + }, + { + "epoch": 0.912851201565705, + "grad_norm": 5.148458003997803, + "learning_rate": 2.0309753444364533e-06, + "loss": 1.6609, + "step": 12127 + }, + { + "epoch": 0.9129264758463652, + "grad_norm": 6.09207010269165, + "learning_rate": 2.027537532780133e-06, + "loss": 1.6672, + "step": 12128 + }, + { + "epoch": 0.9130017501270253, + "grad_norm": 5.972532272338867, + "learning_rate": 2.024102572949066e-06, + "loss": 1.5395, + "step": 12129 + }, + { + "epoch": 0.9130770244076855, + "grad_norm": 5.418562412261963, + "learning_rate": 2.020670465147434e-06, + "loss": 1.8867, + "step": 12130 + }, + { + "epoch": 0.9131522986883457, + "grad_norm": 5.558815002441406, + "learning_rate": 2.0172412095792793e-06, + "loss": 2.0467, + "step": 12131 + }, + { + "epoch": 0.9132275729690058, + "grad_norm": 4.814827919006348, + "learning_rate": 2.013814806448455e-06, + "loss": 1.4894, + "step": 12132 + }, + { + "epoch": 0.9133028472496659, + "grad_norm": 5.758498191833496, + "learning_rate": 2.0103912559586545e-06, + "loss": 2.1396, + "step": 12133 + }, + { + "epoch": 0.9133781215303262, + "grad_norm": 5.368686199188232, + "learning_rate": 2.0069705583133926e-06, + "loss": 1.4753, + "step": 12134 + }, + { + "epoch": 0.9134533958109863, + "grad_norm": 5.155279636383057, + "learning_rate": 2.0035527137160284e-06, + "loss": 1.4479, + "step": 12135 + }, + { + "epoch": 0.9135286700916464, + "grad_norm": 6.514254093170166, + "learning_rate": 2.000137722369733e-06, + "loss": 2.3788, + "step": 12136 + }, + { + "epoch": 0.9136039443723066, + "grad_norm": 5.442849636077881, + "learning_rate": 1.996725584477521e-06, + "loss": 1.7622, + "step": 12137 + }, + { + "epoch": 0.9136792186529668, + "grad_norm": 5.402738571166992, + "learning_rate": 1.9933163002422373e-06, + "loss": 1.8042, + "step": 12138 + }, + { + "epoch": 0.9137544929336269, + "grad_norm": 5.796487331390381, + "learning_rate": 1.9899098698665574e-06, + "loss": 1.6853, + "step": 12139 + }, + { + "epoch": 0.9138297672142871, + "grad_norm": 7.907670974731445, + "learning_rate": 1.986506293552981e-06, + "loss": 1.9604, + "step": 12140 + }, + { + "epoch": 0.9139050414949472, + "grad_norm": 5.485021114349365, + "learning_rate": 1.9831055715038293e-06, + "loss": 1.5177, + "step": 12141 + }, + { + "epoch": 0.9139803157756073, + "grad_norm": 4.917305946350098, + "learning_rate": 1.979707703921285e-06, + "loss": 1.7208, + "step": 12142 + }, + { + "epoch": 0.9140555900562676, + "grad_norm": 4.9062604904174805, + "learning_rate": 1.9763126910073315e-06, + "loss": 1.9048, + "step": 12143 + }, + { + "epoch": 0.9141308643369277, + "grad_norm": 5.854250907897949, + "learning_rate": 1.9729205329637902e-06, + "loss": 1.9475, + "step": 12144 + }, + { + "epoch": 0.9142061386175878, + "grad_norm": 4.369939804077148, + "learning_rate": 1.9695312299923165e-06, + "loss": 1.5317, + "step": 12145 + }, + { + "epoch": 0.9142814128982479, + "grad_norm": 4.852906703948975, + "learning_rate": 1.9661447822944047e-06, + "loss": 1.7866, + "step": 12146 + }, + { + "epoch": 0.9143566871789082, + "grad_norm": 5.385268688201904, + "learning_rate": 1.9627611900713495e-06, + "loss": 1.4388, + "step": 12147 + }, + { + "epoch": 0.9144319614595683, + "grad_norm": 4.07965612411499, + "learning_rate": 1.959380453524312e-06, + "loss": 1.7746, + "step": 12148 + }, + { + "epoch": 0.9145072357402284, + "grad_norm": 6.136497974395752, + "learning_rate": 1.95600257285426e-06, + "loss": 1.7557, + "step": 12149 + }, + { + "epoch": 0.9145825100208886, + "grad_norm": 5.008094787597656, + "learning_rate": 1.9526275482620095e-06, + "loss": 1.7943, + "step": 12150 + }, + { + "epoch": 0.9146577843015488, + "grad_norm": 5.1837334632873535, + "learning_rate": 1.949255379948178e-06, + "loss": 1.9581, + "step": 12151 + }, + { + "epoch": 0.9147330585822089, + "grad_norm": 5.017820835113525, + "learning_rate": 1.9458860681132506e-06, + "loss": 1.4835, + "step": 12152 + }, + { + "epoch": 0.9148083328628691, + "grad_norm": 4.911017417907715, + "learning_rate": 1.942519612957505e-06, + "loss": 1.8279, + "step": 12153 + }, + { + "epoch": 0.9148836071435292, + "grad_norm": 5.315505504608154, + "learning_rate": 1.9391560146810872e-06, + "loss": 1.5972, + "step": 12154 + }, + { + "epoch": 0.9149588814241894, + "grad_norm": 4.267319202423096, + "learning_rate": 1.9357952734839314e-06, + "loss": 1.8427, + "step": 12155 + }, + { + "epoch": 0.9150341557048496, + "grad_norm": 5.728069305419922, + "learning_rate": 1.932437389565833e-06, + "loss": 1.7168, + "step": 12156 + }, + { + "epoch": 0.9151094299855097, + "grad_norm": 4.643542766571045, + "learning_rate": 1.929082363126422e-06, + "loss": 1.7374, + "step": 12157 + }, + { + "epoch": 0.9151847042661698, + "grad_norm": 5.494507789611816, + "learning_rate": 1.925730194365127e-06, + "loss": 1.724, + "step": 12158 + }, + { + "epoch": 0.91525997854683, + "grad_norm": 6.585967540740967, + "learning_rate": 1.922380883481234e-06, + "loss": 1.6298, + "step": 12159 + }, + { + "epoch": 0.9153352528274902, + "grad_norm": 5.267186641693115, + "learning_rate": 1.919034430673844e-06, + "loss": 1.714, + "step": 12160 + }, + { + "epoch": 0.9154105271081503, + "grad_norm": 4.434470176696777, + "learning_rate": 1.9156908361418924e-06, + "loss": 1.78, + "step": 12161 + }, + { + "epoch": 0.9154858013888105, + "grad_norm": 4.448185920715332, + "learning_rate": 1.9123501000841594e-06, + "loss": 2.0354, + "step": 12162 + }, + { + "epoch": 0.9155610756694706, + "grad_norm": 7.0453948974609375, + "learning_rate": 1.909012222699236e-06, + "loss": 1.5976, + "step": 12163 + }, + { + "epoch": 0.9156363499501308, + "grad_norm": 5.426478862762451, + "learning_rate": 1.9056772041855408e-06, + "loss": 1.8676, + "step": 12164 + }, + { + "epoch": 0.915711624230791, + "grad_norm": 6.679986953735352, + "learning_rate": 1.9023450447413439e-06, + "loss": 1.4884, + "step": 12165 + }, + { + "epoch": 0.9157868985114511, + "grad_norm": 5.238773345947266, + "learning_rate": 1.8990157445647195e-06, + "loss": 1.5711, + "step": 12166 + }, + { + "epoch": 0.9158621727921112, + "grad_norm": 5.7694478034973145, + "learning_rate": 1.8956893038535983e-06, + "loss": 1.5851, + "step": 12167 + }, + { + "epoch": 0.9159374470727714, + "grad_norm": 5.07877254486084, + "learning_rate": 1.8923657228057168e-06, + "loss": 2.0668, + "step": 12168 + }, + { + "epoch": 0.9160127213534316, + "grad_norm": 5.141547203063965, + "learning_rate": 1.8890450016186611e-06, + "loss": 1.7363, + "step": 12169 + }, + { + "epoch": 0.9160879956340917, + "grad_norm": 6.566463470458984, + "learning_rate": 1.8857271404898292e-06, + "loss": 1.7442, + "step": 12170 + }, + { + "epoch": 0.9161632699147518, + "grad_norm": 5.055227279663086, + "learning_rate": 1.8824121396164628e-06, + "loss": 1.7583, + "step": 12171 + }, + { + "epoch": 0.9162385441954121, + "grad_norm": 8.883407592773438, + "learning_rate": 1.8790999991956327e-06, + "loss": 1.9036, + "step": 12172 + }, + { + "epoch": 0.9163138184760722, + "grad_norm": 4.488308429718018, + "learning_rate": 1.8757907194242307e-06, + "loss": 1.6855, + "step": 12173 + }, + { + "epoch": 0.9163890927567323, + "grad_norm": 5.276883602142334, + "learning_rate": 1.8724843004989944e-06, + "loss": 1.5994, + "step": 12174 + }, + { + "epoch": 0.9164643670373925, + "grad_norm": 3.9721500873565674, + "learning_rate": 1.869180742616461e-06, + "loss": 1.9524, + "step": 12175 + }, + { + "epoch": 0.9165396413180527, + "grad_norm": 6.090517044067383, + "learning_rate": 1.8658800459730451e-06, + "loss": 1.5121, + "step": 12176 + }, + { + "epoch": 0.9166149155987128, + "grad_norm": 5.494152069091797, + "learning_rate": 1.862582210764935e-06, + "loss": 1.8729, + "step": 12177 + }, + { + "epoch": 0.916690189879373, + "grad_norm": 5.577969074249268, + "learning_rate": 1.8592872371882009e-06, + "loss": 1.7699, + "step": 12178 + }, + { + "epoch": 0.9167654641600331, + "grad_norm": 6.000285625457764, + "learning_rate": 1.8559951254386975e-06, + "loss": 1.6191, + "step": 12179 + }, + { + "epoch": 0.9168407384406932, + "grad_norm": 6.428563117980957, + "learning_rate": 1.8527058757121574e-06, + "loss": 1.8143, + "step": 12180 + }, + { + "epoch": 0.9169160127213535, + "grad_norm": 4.437234878540039, + "learning_rate": 1.8494194882040905e-06, + "loss": 1.7165, + "step": 12181 + }, + { + "epoch": 0.9169912870020136, + "grad_norm": 5.975449085235596, + "learning_rate": 1.8461359631098796e-06, + "loss": 1.6618, + "step": 12182 + }, + { + "epoch": 0.9170665612826737, + "grad_norm": 6.742742538452148, + "learning_rate": 1.8428553006247129e-06, + "loss": 2.3385, + "step": 12183 + }, + { + "epoch": 0.917141835563334, + "grad_norm": 6.922664642333984, + "learning_rate": 1.8395775009436288e-06, + "loss": 1.6849, + "step": 12184 + }, + { + "epoch": 0.9172171098439941, + "grad_norm": 4.397456645965576, + "learning_rate": 1.8363025642614662e-06, + "loss": 1.5654, + "step": 12185 + }, + { + "epoch": 0.9172923841246542, + "grad_norm": 5.630561828613281, + "learning_rate": 1.8330304907729302e-06, + "loss": 1.6441, + "step": 12186 + }, + { + "epoch": 0.9173676584053143, + "grad_norm": 6.6440324783325195, + "learning_rate": 1.8297612806725151e-06, + "loss": 1.8448, + "step": 12187 + }, + { + "epoch": 0.9174429326859745, + "grad_norm": 6.135951519012451, + "learning_rate": 1.8264949341545768e-06, + "loss": 1.4995, + "step": 12188 + }, + { + "epoch": 0.9175182069666347, + "grad_norm": 5.361112117767334, + "learning_rate": 1.823231451413293e-06, + "loss": 1.8675, + "step": 12189 + }, + { + "epoch": 0.9175934812472948, + "grad_norm": 5.832495212554932, + "learning_rate": 1.8199708326426644e-06, + "loss": 1.9665, + "step": 12190 + }, + { + "epoch": 0.917668755527955, + "grad_norm": 4.473899841308594, + "learning_rate": 1.8167130780365248e-06, + "loss": 1.7678, + "step": 12191 + }, + { + "epoch": 0.9177440298086151, + "grad_norm": 3.823052167892456, + "learning_rate": 1.813458187788536e-06, + "loss": 1.7738, + "step": 12192 + }, + { + "epoch": 0.9178193040892753, + "grad_norm": 7.666285991668701, + "learning_rate": 1.8102061620921984e-06, + "loss": 1.2345, + "step": 12193 + }, + { + "epoch": 0.9178945783699355, + "grad_norm": 4.310316562652588, + "learning_rate": 1.8069570011408298e-06, + "loss": 1.9532, + "step": 12194 + }, + { + "epoch": 0.9179698526505956, + "grad_norm": 6.366835117340088, + "learning_rate": 1.8037107051275926e-06, + "loss": 1.9232, + "step": 12195 + }, + { + "epoch": 0.9180451269312557, + "grad_norm": 6.490205764770508, + "learning_rate": 1.8004672742454598e-06, + "loss": 1.663, + "step": 12196 + }, + { + "epoch": 0.918120401211916, + "grad_norm": 4.824207305908203, + "learning_rate": 1.7972267086872495e-06, + "loss": 2.1084, + "step": 12197 + }, + { + "epoch": 0.9181956754925761, + "grad_norm": 5.91863489151001, + "learning_rate": 1.793989008645597e-06, + "loss": 1.6556, + "step": 12198 + }, + { + "epoch": 0.9182709497732362, + "grad_norm": 4.996007919311523, + "learning_rate": 1.7907541743129864e-06, + "loss": 1.371, + "step": 12199 + }, + { + "epoch": 0.9183462240538964, + "grad_norm": 8.245656967163086, + "learning_rate": 1.787522205881703e-06, + "loss": 1.7829, + "step": 12200 + }, + { + "epoch": 0.9184214983345566, + "grad_norm": 5.744636535644531, + "learning_rate": 1.7842931035438992e-06, + "loss": 2.1329, + "step": 12201 + }, + { + "epoch": 0.9184967726152167, + "grad_norm": 7.146065711975098, + "learning_rate": 1.7810668674915154e-06, + "loss": 1.6686, + "step": 12202 + }, + { + "epoch": 0.9185720468958769, + "grad_norm": 5.422779560089111, + "learning_rate": 1.7778434979163483e-06, + "loss": 2.03, + "step": 12203 + }, + { + "epoch": 0.918647321176537, + "grad_norm": 3.983621120452881, + "learning_rate": 1.7746229950100223e-06, + "loss": 1.9129, + "step": 12204 + }, + { + "epoch": 0.9187225954571971, + "grad_norm": 5.484464645385742, + "learning_rate": 1.77140535896399e-06, + "loss": 1.8603, + "step": 12205 + }, + { + "epoch": 0.9187978697378573, + "grad_norm": 6.660750389099121, + "learning_rate": 1.7681905899695206e-06, + "loss": 2.0861, + "step": 12206 + }, + { + "epoch": 0.9188731440185175, + "grad_norm": 4.562093257904053, + "learning_rate": 1.7649786882177277e-06, + "loss": 1.6932, + "step": 12207 + }, + { + "epoch": 0.9189484182991776, + "grad_norm": 5.688755989074707, + "learning_rate": 1.7617696538995587e-06, + "loss": 1.8248, + "step": 12208 + }, + { + "epoch": 0.9190236925798377, + "grad_norm": 4.942871570587158, + "learning_rate": 1.7585634872057665e-06, + "loss": 1.6688, + "step": 12209 + }, + { + "epoch": 0.919098966860498, + "grad_norm": 5.830164432525635, + "learning_rate": 1.75536018832696e-06, + "loss": 1.7803, + "step": 12210 + }, + { + "epoch": 0.9191742411411581, + "grad_norm": 4.164713382720947, + "learning_rate": 1.752159757453553e-06, + "loss": 1.6047, + "step": 12211 + }, + { + "epoch": 0.9192495154218182, + "grad_norm": 5.69412899017334, + "learning_rate": 1.748962194775816e-06, + "loss": 2.0252, + "step": 12212 + }, + { + "epoch": 0.9193247897024784, + "grad_norm": 4.335793495178223, + "learning_rate": 1.7457675004838247e-06, + "loss": 1.707, + "step": 12213 + }, + { + "epoch": 0.9194000639831386, + "grad_norm": 4.493879795074463, + "learning_rate": 1.7425756747674992e-06, + "loss": 1.9527, + "step": 12214 + }, + { + "epoch": 0.9194753382637987, + "grad_norm": 6.0843186378479, + "learning_rate": 1.7393867178165823e-06, + "loss": 1.9063, + "step": 12215 + }, + { + "epoch": 0.9195506125444589, + "grad_norm": 5.394599437713623, + "learning_rate": 1.73620062982065e-06, + "loss": 1.5675, + "step": 12216 + }, + { + "epoch": 0.919625886825119, + "grad_norm": 6.0314788818359375, + "learning_rate": 1.7330174109691067e-06, + "loss": 1.7645, + "step": 12217 + }, + { + "epoch": 0.9197011611057792, + "grad_norm": 4.311070919036865, + "learning_rate": 1.7298370614511895e-06, + "loss": 1.638, + "step": 12218 + }, + { + "epoch": 0.9197764353864394, + "grad_norm": 5.176286220550537, + "learning_rate": 1.7266595814559472e-06, + "loss": 1.5722, + "step": 12219 + }, + { + "epoch": 0.9198517096670995, + "grad_norm": 5.046456813812256, + "learning_rate": 1.7234849711722845e-06, + "loss": 1.8054, + "step": 12220 + }, + { + "epoch": 0.9199269839477596, + "grad_norm": 4.80660343170166, + "learning_rate": 1.720313230788928e-06, + "loss": 1.6777, + "step": 12221 + }, + { + "epoch": 0.9200022582284199, + "grad_norm": 4.196972846984863, + "learning_rate": 1.7171443604944104e-06, + "loss": 1.5336, + "step": 12222 + }, + { + "epoch": 0.92007753250908, + "grad_norm": 5.110964298248291, + "learning_rate": 1.7139783604771253e-06, + "loss": 1.9501, + "step": 12223 + }, + { + "epoch": 0.9201528067897401, + "grad_norm": 4.71935510635376, + "learning_rate": 1.7108152309252778e-06, + "loss": 1.8142, + "step": 12224 + }, + { + "epoch": 0.9202280810704002, + "grad_norm": 9.157746315002441, + "learning_rate": 1.707654972026912e-06, + "loss": 1.7646, + "step": 12225 + }, + { + "epoch": 0.9203033553510604, + "grad_norm": 4.6988844871521, + "learning_rate": 1.704497583969883e-06, + "loss": 1.1773, + "step": 12226 + }, + { + "epoch": 0.9203786296317206, + "grad_norm": 5.397796154022217, + "learning_rate": 1.7013430669419018e-06, + "loss": 1.5783, + "step": 12227 + }, + { + "epoch": 0.9204539039123807, + "grad_norm": 5.24730920791626, + "learning_rate": 1.698191421130485e-06, + "loss": 2.1331, + "step": 12228 + }, + { + "epoch": 0.9205291781930409, + "grad_norm": 5.70337438583374, + "learning_rate": 1.695042646723005e-06, + "loss": 1.4865, + "step": 12229 + }, + { + "epoch": 0.920604452473701, + "grad_norm": 5.092368125915527, + "learning_rate": 1.6918967439066346e-06, + "loss": 1.7773, + "step": 12230 + }, + { + "epoch": 0.9206797267543612, + "grad_norm": 6.473546028137207, + "learning_rate": 1.6887537128683905e-06, + "loss": 1.6467, + "step": 12231 + }, + { + "epoch": 0.9207550010350214, + "grad_norm": 4.975739479064941, + "learning_rate": 1.6856135537951123e-06, + "loss": 2.0076, + "step": 12232 + }, + { + "epoch": 0.9208302753156815, + "grad_norm": 4.614065170288086, + "learning_rate": 1.6824762668734895e-06, + "loss": 1.4596, + "step": 12233 + }, + { + "epoch": 0.9209055495963416, + "grad_norm": 5.10197114944458, + "learning_rate": 1.6793418522900062e-06, + "loss": 1.4368, + "step": 12234 + }, + { + "epoch": 0.9209808238770019, + "grad_norm": 4.685549736022949, + "learning_rate": 1.6762103102310078e-06, + "loss": 1.2427, + "step": 12235 + }, + { + "epoch": 0.921056098157662, + "grad_norm": 5.914480686187744, + "learning_rate": 1.67308164088264e-06, + "loss": 1.5994, + "step": 12236 + }, + { + "epoch": 0.9211313724383221, + "grad_norm": 5.779835224151611, + "learning_rate": 1.6699558444309094e-06, + "loss": 1.7868, + "step": 12237 + }, + { + "epoch": 0.9212066467189823, + "grad_norm": 6.497752666473389, + "learning_rate": 1.6668329210616229e-06, + "loss": 1.4605, + "step": 12238 + }, + { + "epoch": 0.9212819209996425, + "grad_norm": 5.44590425491333, + "learning_rate": 1.6637128709604432e-06, + "loss": 1.3929, + "step": 12239 + }, + { + "epoch": 0.9213571952803026, + "grad_norm": 4.850203990936279, + "learning_rate": 1.6605956943128443e-06, + "loss": 1.751, + "step": 12240 + }, + { + "epoch": 0.9214324695609628, + "grad_norm": 6.7934699058532715, + "learning_rate": 1.657481391304122e-06, + "loss": 1.5936, + "step": 12241 + }, + { + "epoch": 0.9215077438416229, + "grad_norm": 6.179891586303711, + "learning_rate": 1.6543699621194286e-06, + "loss": 1.8999, + "step": 12242 + }, + { + "epoch": 0.921583018122283, + "grad_norm": 4.753556728363037, + "learning_rate": 1.651261406943716e-06, + "loss": 1.5237, + "step": 12243 + }, + { + "epoch": 0.9216582924029432, + "grad_norm": 4.914474010467529, + "learning_rate": 1.6481557259617864e-06, + "loss": 1.8511, + "step": 12244 + }, + { + "epoch": 0.9217335666836034, + "grad_norm": 4.9062113761901855, + "learning_rate": 1.6450529193582587e-06, + "loss": 1.8184, + "step": 12245 + }, + { + "epoch": 0.9218088409642635, + "grad_norm": 9.640711784362793, + "learning_rate": 1.6419529873175964e-06, + "loss": 1.7705, + "step": 12246 + }, + { + "epoch": 0.9218841152449236, + "grad_norm": 7.6342291831970215, + "learning_rate": 1.6388559300240692e-06, + "loss": 1.8647, + "step": 12247 + }, + { + "epoch": 0.9219593895255839, + "grad_norm": 4.452996730804443, + "learning_rate": 1.6357617476617904e-06, + "loss": 1.98, + "step": 12248 + }, + { + "epoch": 0.922034663806244, + "grad_norm": 5.374137878417969, + "learning_rate": 1.632670440414702e-06, + "loss": 1.9002, + "step": 12249 + }, + { + "epoch": 0.9221099380869041, + "grad_norm": 6.792808532714844, + "learning_rate": 1.6295820084665847e-06, + "loss": 1.733, + "step": 12250 + }, + { + "epoch": 0.9221852123675643, + "grad_norm": 4.465142250061035, + "learning_rate": 1.6264964520010195e-06, + "loss": 1.8956, + "step": 12251 + }, + { + "epoch": 0.9222604866482245, + "grad_norm": 5.383513927459717, + "learning_rate": 1.623413771201443e-06, + "loss": 1.7553, + "step": 12252 + }, + { + "epoch": 0.9223357609288846, + "grad_norm": 5.288760662078857, + "learning_rate": 1.6203339662511086e-06, + "loss": 1.8845, + "step": 12253 + }, + { + "epoch": 0.9224110352095448, + "grad_norm": 8.443306922912598, + "learning_rate": 1.617257037333103e-06, + "loss": 2.2085, + "step": 12254 + }, + { + "epoch": 0.9224863094902049, + "grad_norm": 4.722512722015381, + "learning_rate": 1.614182984630347e-06, + "loss": 1.4702, + "step": 12255 + }, + { + "epoch": 0.9225615837708651, + "grad_norm": 5.945209980010986, + "learning_rate": 1.6111118083255717e-06, + "loss": 1.8746, + "step": 12256 + }, + { + "epoch": 0.9226368580515253, + "grad_norm": 4.329065799713135, + "learning_rate": 1.608043508601359e-06, + "loss": 1.7203, + "step": 12257 + }, + { + "epoch": 0.9227121323321854, + "grad_norm": 5.7689409255981445, + "learning_rate": 1.604978085640102e-06, + "loss": 1.7159, + "step": 12258 + }, + { + "epoch": 0.9227874066128455, + "grad_norm": 5.388646602630615, + "learning_rate": 1.6019155396240437e-06, + "loss": 1.8853, + "step": 12259 + }, + { + "epoch": 0.9228626808935058, + "grad_norm": 5.030128002166748, + "learning_rate": 1.598855870735233e-06, + "loss": 1.8776, + "step": 12260 + }, + { + "epoch": 0.9229379551741659, + "grad_norm": 4.348973274230957, + "learning_rate": 1.5957990791555633e-06, + "loss": 2.0141, + "step": 12261 + }, + { + "epoch": 0.923013229454826, + "grad_norm": 5.563854694366455, + "learning_rate": 1.5927451650667446e-06, + "loss": 1.9544, + "step": 12262 + }, + { + "epoch": 0.9230885037354862, + "grad_norm": 4.292466640472412, + "learning_rate": 1.589694128650343e-06, + "loss": 1.3908, + "step": 12263 + }, + { + "epoch": 0.9231637780161464, + "grad_norm": 5.419643878936768, + "learning_rate": 1.5866459700877135e-06, + "loss": 1.6841, + "step": 12264 + }, + { + "epoch": 0.9232390522968065, + "grad_norm": 4.999896049499512, + "learning_rate": 1.583600689560072e-06, + "loss": 1.6132, + "step": 12265 + }, + { + "epoch": 0.9233143265774666, + "grad_norm": 5.763510704040527, + "learning_rate": 1.5805582872484404e-06, + "loss": 2.3052, + "step": 12266 + }, + { + "epoch": 0.9233896008581268, + "grad_norm": 5.501773834228516, + "learning_rate": 1.5775187633336907e-06, + "loss": 1.6491, + "step": 12267 + }, + { + "epoch": 0.923464875138787, + "grad_norm": 5.015240669250488, + "learning_rate": 1.574482117996512e-06, + "loss": 1.7718, + "step": 12268 + }, + { + "epoch": 0.9235401494194471, + "grad_norm": 4.600796699523926, + "learning_rate": 1.5714483514174204e-06, + "loss": 1.7315, + "step": 12269 + }, + { + "epoch": 0.9236154237001073, + "grad_norm": 4.388249397277832, + "learning_rate": 1.5684174637767724e-06, + "loss": 1.9078, + "step": 12270 + }, + { + "epoch": 0.9236906979807674, + "grad_norm": 6.586758613586426, + "learning_rate": 1.5653894552547344e-06, + "loss": 1.7139, + "step": 12271 + }, + { + "epoch": 0.9237659722614275, + "grad_norm": 8.650191307067871, + "learning_rate": 1.562364326031318e-06, + "loss": 1.6562, + "step": 12272 + }, + { + "epoch": 0.9238412465420878, + "grad_norm": 6.165069103240967, + "learning_rate": 1.5593420762863575e-06, + "loss": 1.7134, + "step": 12273 + }, + { + "epoch": 0.9239165208227479, + "grad_norm": 9.19767951965332, + "learning_rate": 1.5563227061995256e-06, + "loss": 1.9511, + "step": 12274 + }, + { + "epoch": 0.923991795103408, + "grad_norm": 4.214312553405762, + "learning_rate": 1.5533062159503064e-06, + "loss": 1.847, + "step": 12275 + }, + { + "epoch": 0.9240670693840682, + "grad_norm": 4.82616662979126, + "learning_rate": 1.550292605718029e-06, + "loss": 1.2446, + "step": 12276 + }, + { + "epoch": 0.9241423436647284, + "grad_norm": 4.3085150718688965, + "learning_rate": 1.5472818756818274e-06, + "loss": 1.6227, + "step": 12277 + }, + { + "epoch": 0.9242176179453885, + "grad_norm": 4.585626602172852, + "learning_rate": 1.5442740260207034e-06, + "loss": 1.492, + "step": 12278 + }, + { + "epoch": 0.9242928922260487, + "grad_norm": 4.837274074554443, + "learning_rate": 1.5412690569134413e-06, + "loss": 1.5806, + "step": 12279 + }, + { + "epoch": 0.9243681665067088, + "grad_norm": 5.985339164733887, + "learning_rate": 1.538266968538704e-06, + "loss": 1.7914, + "step": 12280 + }, + { + "epoch": 0.924443440787369, + "grad_norm": 4.571539878845215, + "learning_rate": 1.5352677610749321e-06, + "loss": 1.8326, + "step": 12281 + }, + { + "epoch": 0.9245187150680292, + "grad_norm": 6.083450794219971, + "learning_rate": 1.532271434700433e-06, + "loss": 1.9743, + "step": 12282 + }, + { + "epoch": 0.9245939893486893, + "grad_norm": 7.210784912109375, + "learning_rate": 1.5292779895933252e-06, + "loss": 1.7333, + "step": 12283 + }, + { + "epoch": 0.9246692636293494, + "grad_norm": 5.155959606170654, + "learning_rate": 1.5262874259315663e-06, + "loss": 1.8311, + "step": 12284 + }, + { + "epoch": 0.9247445379100095, + "grad_norm": 4.488932132720947, + "learning_rate": 1.5232997438929308e-06, + "loss": 1.9897, + "step": 12285 + }, + { + "epoch": 0.9248198121906698, + "grad_norm": 5.681082248687744, + "learning_rate": 1.5203149436550324e-06, + "loss": 1.5771, + "step": 12286 + }, + { + "epoch": 0.9248950864713299, + "grad_norm": 7.093104839324951, + "learning_rate": 1.517333025395312e-06, + "loss": 1.6844, + "step": 12287 + }, + { + "epoch": 0.92497036075199, + "grad_norm": 6.076920986175537, + "learning_rate": 1.514353989291023e-06, + "loss": 2.2778, + "step": 12288 + }, + { + "epoch": 0.9250456350326502, + "grad_norm": 6.106997489929199, + "learning_rate": 1.5113778355192731e-06, + "loss": 1.5718, + "step": 12289 + }, + { + "epoch": 0.9251209093133104, + "grad_norm": 5.919914245605469, + "learning_rate": 1.5084045642569765e-06, + "loss": 1.8211, + "step": 12290 + }, + { + "epoch": 0.9251961835939705, + "grad_norm": 6.355270862579346, + "learning_rate": 1.5054341756808976e-06, + "loss": 1.898, + "step": 12291 + }, + { + "epoch": 0.9252714578746307, + "grad_norm": 4.474854469299316, + "learning_rate": 1.5024666699676059e-06, + "loss": 1.6779, + "step": 12292 + }, + { + "epoch": 0.9253467321552908, + "grad_norm": 6.288714408874512, + "learning_rate": 1.499502047293516e-06, + "loss": 1.8599, + "step": 12293 + }, + { + "epoch": 0.925422006435951, + "grad_norm": 7.064754009246826, + "learning_rate": 1.496540307834865e-06, + "loss": 1.8186, + "step": 12294 + }, + { + "epoch": 0.9254972807166112, + "grad_norm": 5.304087162017822, + "learning_rate": 1.4935814517677226e-06, + "loss": 1.7796, + "step": 12295 + }, + { + "epoch": 0.9255725549972713, + "grad_norm": 4.559020042419434, + "learning_rate": 1.490625479267982e-06, + "loss": 1.8039, + "step": 12296 + }, + { + "epoch": 0.9256478292779314, + "grad_norm": 5.901546478271484, + "learning_rate": 1.4876723905113688e-06, + "loss": 1.7986, + "step": 12297 + }, + { + "epoch": 0.9257231035585917, + "grad_norm": 5.7770609855651855, + "learning_rate": 1.4847221856734317e-06, + "loss": 1.7017, + "step": 12298 + }, + { + "epoch": 0.9257983778392518, + "grad_norm": 4.706702709197998, + "learning_rate": 1.4817748649295581e-06, + "loss": 1.6068, + "step": 12299 + }, + { + "epoch": 0.9258736521199119, + "grad_norm": 5.018117904663086, + "learning_rate": 1.478830428454947e-06, + "loss": 1.7014, + "step": 12300 + }, + { + "epoch": 0.9259489264005721, + "grad_norm": 6.656970024108887, + "learning_rate": 1.475888876424647e-06, + "loss": 1.7619, + "step": 12301 + }, + { + "epoch": 0.9260242006812323, + "grad_norm": 4.830564022064209, + "learning_rate": 1.4729502090135295e-06, + "loss": 1.7941, + "step": 12302 + }, + { + "epoch": 0.9260994749618924, + "grad_norm": 5.256102561950684, + "learning_rate": 1.4700144263962767e-06, + "loss": 1.6108, + "step": 12303 + }, + { + "epoch": 0.9261747492425525, + "grad_norm": 4.669799327850342, + "learning_rate": 1.467081528747416e-06, + "loss": 1.5463, + "step": 12304 + }, + { + "epoch": 0.9262500235232127, + "grad_norm": 6.318775177001953, + "learning_rate": 1.464151516241302e-06, + "loss": 1.8985, + "step": 12305 + }, + { + "epoch": 0.9263252978038728, + "grad_norm": 4.087380886077881, + "learning_rate": 1.4612243890521238e-06, + "loss": 1.6545, + "step": 12306 + }, + { + "epoch": 0.926400572084533, + "grad_norm": 4.970261573791504, + "learning_rate": 1.4583001473538693e-06, + "loss": 1.6994, + "step": 12307 + }, + { + "epoch": 0.9264758463651932, + "grad_norm": 6.173431873321533, + "learning_rate": 1.4553787913203999e-06, + "loss": 1.9022, + "step": 12308 + }, + { + "epoch": 0.9265511206458533, + "grad_norm": 4.611196517944336, + "learning_rate": 1.4524603211253651e-06, + "loss": 1.7756, + "step": 12309 + }, + { + "epoch": 0.9266263949265134, + "grad_norm": 8.14472484588623, + "learning_rate": 1.4495447369422766e-06, + "loss": 2.2904, + "step": 12310 + }, + { + "epoch": 0.9267016692071737, + "grad_norm": 6.003556251525879, + "learning_rate": 1.4466320389444343e-06, + "loss": 1.6783, + "step": 12311 + }, + { + "epoch": 0.9267769434878338, + "grad_norm": 5.917142868041992, + "learning_rate": 1.4437222273050112e-06, + "loss": 1.8021, + "step": 12312 + }, + { + "epoch": 0.9268522177684939, + "grad_norm": 6.692000389099121, + "learning_rate": 1.4408153021969683e-06, + "loss": 1.7285, + "step": 12313 + }, + { + "epoch": 0.9269274920491541, + "grad_norm": 4.290324687957764, + "learning_rate": 1.4379112637931292e-06, + "loss": 1.7827, + "step": 12314 + }, + { + "epoch": 0.9270027663298143, + "grad_norm": 5.0580315589904785, + "learning_rate": 1.435010112266122e-06, + "loss": 2.1214, + "step": 12315 + }, + { + "epoch": 0.9270780406104744, + "grad_norm": 7.565629482269287, + "learning_rate": 1.4321118477884199e-06, + "loss": 1.436, + "step": 12316 + }, + { + "epoch": 0.9271533148911346, + "grad_norm": 4.361331939697266, + "learning_rate": 1.4292164705323075e-06, + "loss": 1.5064, + "step": 12317 + }, + { + "epoch": 0.9272285891717947, + "grad_norm": 6.161437034606934, + "learning_rate": 1.426323980669908e-06, + "loss": 1.5386, + "step": 12318 + }, + { + "epoch": 0.9273038634524549, + "grad_norm": 6.603729248046875, + "learning_rate": 1.423434378373184e-06, + "loss": 1.7413, + "step": 12319 + }, + { + "epoch": 0.9273791377331151, + "grad_norm": 4.6165595054626465, + "learning_rate": 1.4205476638138926e-06, + "loss": 1.6272, + "step": 12320 + }, + { + "epoch": 0.9274544120137752, + "grad_norm": 4.574575424194336, + "learning_rate": 1.417663837163663e-06, + "loss": 1.4373, + "step": 12321 + }, + { + "epoch": 0.9275296862944353, + "grad_norm": 6.774166107177734, + "learning_rate": 1.4147828985939139e-06, + "loss": 1.7896, + "step": 12322 + }, + { + "epoch": 0.9276049605750954, + "grad_norm": 4.1649065017700195, + "learning_rate": 1.4119048482759189e-06, + "loss": 1.3807, + "step": 12323 + }, + { + "epoch": 0.9276802348557557, + "grad_norm": 8.064258575439453, + "learning_rate": 1.4090296863807638e-06, + "loss": 1.6739, + "step": 12324 + }, + { + "epoch": 0.9277555091364158, + "grad_norm": 4.821267127990723, + "learning_rate": 1.406157413079373e-06, + "loss": 1.4055, + "step": 12325 + }, + { + "epoch": 0.9278307834170759, + "grad_norm": 6.588372230529785, + "learning_rate": 1.4032880285424877e-06, + "loss": 1.3516, + "step": 12326 + }, + { + "epoch": 0.9279060576977362, + "grad_norm": 5.196451663970947, + "learning_rate": 1.4004215329406933e-06, + "loss": 1.4895, + "step": 12327 + }, + { + "epoch": 0.9279813319783963, + "grad_norm": 7.899312973022461, + "learning_rate": 1.3975579264443872e-06, + "loss": 1.9651, + "step": 12328 + }, + { + "epoch": 0.9280566062590564, + "grad_norm": 4.531418323516846, + "learning_rate": 1.3946972092238108e-06, + "loss": 1.9487, + "step": 12329 + }, + { + "epoch": 0.9281318805397166, + "grad_norm": 5.499183177947998, + "learning_rate": 1.3918393814490172e-06, + "loss": 1.6444, + "step": 12330 + }, + { + "epoch": 0.9282071548203767, + "grad_norm": 5.589671611785889, + "learning_rate": 1.3889844432899035e-06, + "loss": 1.7745, + "step": 12331 + }, + { + "epoch": 0.9282824291010369, + "grad_norm": 6.294485569000244, + "learning_rate": 1.3861323949161787e-06, + "loss": 1.7791, + "step": 12332 + }, + { + "epoch": 0.9283577033816971, + "grad_norm": 5.882016181945801, + "learning_rate": 1.3832832364973959e-06, + "loss": 1.6313, + "step": 12333 + }, + { + "epoch": 0.9284329776623572, + "grad_norm": 4.849218845367432, + "learning_rate": 1.3804369682029362e-06, + "loss": 1.9802, + "step": 12334 + }, + { + "epoch": 0.9285082519430173, + "grad_norm": 4.862195014953613, + "learning_rate": 1.377593590201981e-06, + "loss": 1.8791, + "step": 12335 + }, + { + "epoch": 0.9285835262236776, + "grad_norm": 5.179335594177246, + "learning_rate": 1.3747531026635841e-06, + "loss": 1.6424, + "step": 12336 + }, + { + "epoch": 0.9286588005043377, + "grad_norm": 5.868218421936035, + "learning_rate": 1.371915505756588e-06, + "loss": 1.8969, + "step": 12337 + }, + { + "epoch": 0.9287340747849978, + "grad_norm": 5.639591217041016, + "learning_rate": 1.3690807996496858e-06, + "loss": 1.6854, + "step": 12338 + }, + { + "epoch": 0.928809349065658, + "grad_norm": 4.688801288604736, + "learning_rate": 1.3662489845113923e-06, + "loss": 1.8411, + "step": 12339 + }, + { + "epoch": 0.9288846233463182, + "grad_norm": 5.562274932861328, + "learning_rate": 1.3634200605100511e-06, + "loss": 1.8952, + "step": 12340 + }, + { + "epoch": 0.9289598976269783, + "grad_norm": 4.749614715576172, + "learning_rate": 1.3605940278138328e-06, + "loss": 1.8899, + "step": 12341 + }, + { + "epoch": 0.9290351719076385, + "grad_norm": 4.972977638244629, + "learning_rate": 1.3577708865907424e-06, + "loss": 1.7877, + "step": 12342 + }, + { + "epoch": 0.9291104461882986, + "grad_norm": 8.933000564575195, + "learning_rate": 1.3549506370085952e-06, + "loss": 1.8573, + "step": 12343 + }, + { + "epoch": 0.9291857204689588, + "grad_norm": 6.871606349945068, + "learning_rate": 1.3521332792350627e-06, + "loss": 2.0229, + "step": 12344 + }, + { + "epoch": 0.9292609947496189, + "grad_norm": 4.4091973304748535, + "learning_rate": 1.3493188134376112e-06, + "loss": 1.5757, + "step": 12345 + }, + { + "epoch": 0.9293362690302791, + "grad_norm": 5.878164291381836, + "learning_rate": 1.3465072397835733e-06, + "loss": 2.2906, + "step": 12346 + }, + { + "epoch": 0.9294115433109392, + "grad_norm": 4.191864490509033, + "learning_rate": 1.3436985584400652e-06, + "loss": 1.7239, + "step": 12347 + }, + { + "epoch": 0.9294868175915993, + "grad_norm": 7.327856540679932, + "learning_rate": 1.3408927695740703e-06, + "loss": 1.6402, + "step": 12348 + }, + { + "epoch": 0.9295620918722596, + "grad_norm": 5.747330188751221, + "learning_rate": 1.3380898733523828e-06, + "loss": 1.7923, + "step": 12349 + }, + { + "epoch": 0.9296373661529197, + "grad_norm": 5.146080493927002, + "learning_rate": 1.335289869941625e-06, + "loss": 1.4958, + "step": 12350 + }, + { + "epoch": 0.9297126404335798, + "grad_norm": 5.231537342071533, + "learning_rate": 1.3324927595082526e-06, + "loss": 1.9381, + "step": 12351 + }, + { + "epoch": 0.92978791471424, + "grad_norm": 5.865573406219482, + "learning_rate": 1.3296985422185383e-06, + "loss": 1.5873, + "step": 12352 + }, + { + "epoch": 0.9298631889949002, + "grad_norm": 4.612595081329346, + "learning_rate": 1.3269072182386044e-06, + "loss": 1.7128, + "step": 12353 + }, + { + "epoch": 0.9299384632755603, + "grad_norm": 5.4627156257629395, + "learning_rate": 1.3241187877343687e-06, + "loss": 1.5128, + "step": 12354 + }, + { + "epoch": 0.9300137375562205, + "grad_norm": 6.94678258895874, + "learning_rate": 1.3213332508716093e-06, + "loss": 1.3282, + "step": 12355 + }, + { + "epoch": 0.9300890118368806, + "grad_norm": 5.569812774658203, + "learning_rate": 1.318550607815905e-06, + "loss": 1.6859, + "step": 12356 + }, + { + "epoch": 0.9301642861175408, + "grad_norm": 5.834296226501465, + "learning_rate": 1.3157708587326955e-06, + "loss": 1.2278, + "step": 12357 + }, + { + "epoch": 0.930239560398201, + "grad_norm": 4.061436653137207, + "learning_rate": 1.31299400378721e-06, + "loss": 1.7411, + "step": 12358 + }, + { + "epoch": 0.9303148346788611, + "grad_norm": 4.4513373374938965, + "learning_rate": 1.3102200431445332e-06, + "loss": 2.0215, + "step": 12359 + }, + { + "epoch": 0.9303901089595212, + "grad_norm": 5.512117385864258, + "learning_rate": 1.307448976969572e-06, + "loss": 1.7271, + "step": 12360 + }, + { + "epoch": 0.9304653832401815, + "grad_norm": 4.888034820556641, + "learning_rate": 1.3046808054270498e-06, + "loss": 1.7194, + "step": 12361 + }, + { + "epoch": 0.9305406575208416, + "grad_norm": 6.136234283447266, + "learning_rate": 1.3019155286815354e-06, + "loss": 2.0031, + "step": 12362 + }, + { + "epoch": 0.9306159318015017, + "grad_norm": 4.862524509429932, + "learning_rate": 1.299153146897414e-06, + "loss": 1.7416, + "step": 12363 + }, + { + "epoch": 0.9306912060821618, + "grad_norm": 6.69742488861084, + "learning_rate": 1.2963936602388926e-06, + "loss": 2.0654, + "step": 12364 + }, + { + "epoch": 0.9307664803628221, + "grad_norm": 8.008426666259766, + "learning_rate": 1.293637068870024e-06, + "loss": 1.8037, + "step": 12365 + }, + { + "epoch": 0.9308417546434822, + "grad_norm": 6.133461952209473, + "learning_rate": 1.2908833729546822e-06, + "loss": 1.4561, + "step": 12366 + }, + { + "epoch": 0.9309170289241423, + "grad_norm": 6.209114074707031, + "learning_rate": 1.288132572656553e-06, + "loss": 1.5562, + "step": 12367 + }, + { + "epoch": 0.9309923032048025, + "grad_norm": 6.522866725921631, + "learning_rate": 1.2853846681391835e-06, + "loss": 1.6408, + "step": 12368 + }, + { + "epoch": 0.9310675774854626, + "grad_norm": 4.693703651428223, + "learning_rate": 1.2826396595659097e-06, + "loss": 1.7546, + "step": 12369 + }, + { + "epoch": 0.9311428517661228, + "grad_norm": 5.042492866516113, + "learning_rate": 1.2798975470999286e-06, + "loss": 1.6797, + "step": 12370 + }, + { + "epoch": 0.931218126046783, + "grad_norm": 6.170299053192139, + "learning_rate": 1.2771583309042378e-06, + "loss": 2.1038, + "step": 12371 + }, + { + "epoch": 0.9312934003274431, + "grad_norm": 5.040173053741455, + "learning_rate": 1.2744220111416793e-06, + "loss": 1.6481, + "step": 12372 + }, + { + "epoch": 0.9313686746081032, + "grad_norm": 5.482685565948486, + "learning_rate": 1.2716885879749286e-06, + "loss": 1.5797, + "step": 12373 + }, + { + "epoch": 0.9314439488887635, + "grad_norm": 6.3766679763793945, + "learning_rate": 1.2689580615664775e-06, + "loss": 1.6212, + "step": 12374 + }, + { + "epoch": 0.9315192231694236, + "grad_norm": 4.745019912719727, + "learning_rate": 1.2662304320786412e-06, + "loss": 1.8213, + "step": 12375 + }, + { + "epoch": 0.9315944974500837, + "grad_norm": 4.777936935424805, + "learning_rate": 1.2635056996735783e-06, + "loss": 1.958, + "step": 12376 + }, + { + "epoch": 0.931669771730744, + "grad_norm": 3.75307559967041, + "learning_rate": 1.2607838645132486e-06, + "loss": 1.6302, + "step": 12377 + }, + { + "epoch": 0.9317450460114041, + "grad_norm": 5.264604091644287, + "learning_rate": 1.2580649267594835e-06, + "loss": 1.768, + "step": 12378 + }, + { + "epoch": 0.9318203202920642, + "grad_norm": 4.239240646362305, + "learning_rate": 1.2553488865738927e-06, + "loss": 1.8226, + "step": 12379 + }, + { + "epoch": 0.9318955945727244, + "grad_norm": 7.2942023277282715, + "learning_rate": 1.2526357441179527e-06, + "loss": 1.7995, + "step": 12380 + }, + { + "epoch": 0.9319708688533845, + "grad_norm": 4.348647117614746, + "learning_rate": 1.2499254995529397e-06, + "loss": 1.4049, + "step": 12381 + }, + { + "epoch": 0.9320461431340447, + "grad_norm": 4.719681739807129, + "learning_rate": 1.247218153039975e-06, + "loss": 1.8017, + "step": 12382 + }, + { + "epoch": 0.9321214174147048, + "grad_norm": 5.846291542053223, + "learning_rate": 1.2445137047400079e-06, + "loss": 1.6934, + "step": 12383 + }, + { + "epoch": 0.932196691695365, + "grad_norm": 6.471892356872559, + "learning_rate": 1.241812154813804e-06, + "loss": 1.6251, + "step": 12384 + }, + { + "epoch": 0.9322719659760251, + "grad_norm": 5.613927841186523, + "learning_rate": 1.239113503421968e-06, + "loss": 1.883, + "step": 12385 + }, + { + "epoch": 0.9323472402566853, + "grad_norm": 5.0293450355529785, + "learning_rate": 1.2364177507249219e-06, + "loss": 1.9882, + "step": 12386 + }, + { + "epoch": 0.9324225145373455, + "grad_norm": 6.841736316680908, + "learning_rate": 1.2337248968829262e-06, + "loss": 2.158, + "step": 12387 + }, + { + "epoch": 0.9324977888180056, + "grad_norm": 5.236517429351807, + "learning_rate": 1.231034942056053e-06, + "loss": 1.7109, + "step": 12388 + }, + { + "epoch": 0.9325730630986657, + "grad_norm": 4.217904567718506, + "learning_rate": 1.2283478864042243e-06, + "loss": 1.7164, + "step": 12389 + }, + { + "epoch": 0.932648337379326, + "grad_norm": 5.619619369506836, + "learning_rate": 1.2256637300871621e-06, + "loss": 2.1378, + "step": 12390 + }, + { + "epoch": 0.9327236116599861, + "grad_norm": 4.520730018615723, + "learning_rate": 1.22298247326445e-06, + "loss": 1.482, + "step": 12391 + }, + { + "epoch": 0.9327988859406462, + "grad_norm": 5.719366550445557, + "learning_rate": 1.220304116095472e-06, + "loss": 1.7828, + "step": 12392 + }, + { + "epoch": 0.9328741602213064, + "grad_norm": 5.427941799163818, + "learning_rate": 1.2176286587394447e-06, + "loss": 1.5228, + "step": 12393 + }, + { + "epoch": 0.9329494345019665, + "grad_norm": 5.557382583618164, + "learning_rate": 1.2149561013554245e-06, + "loss": 1.913, + "step": 12394 + }, + { + "epoch": 0.9330247087826267, + "grad_norm": 5.168394088745117, + "learning_rate": 1.2122864441022896e-06, + "loss": 1.5502, + "step": 12395 + }, + { + "epoch": 0.9330999830632869, + "grad_norm": 4.020081996917725, + "learning_rate": 1.20961968713873e-06, + "loss": 1.4035, + "step": 12396 + }, + { + "epoch": 0.933175257343947, + "grad_norm": 6.088656902313232, + "learning_rate": 1.2069558306232853e-06, + "loss": 1.6016, + "step": 12397 + }, + { + "epoch": 0.9332505316246071, + "grad_norm": 6.082274436950684, + "learning_rate": 1.2042948747143235e-06, + "loss": 1.4098, + "step": 12398 + }, + { + "epoch": 0.9333258059052674, + "grad_norm": 5.805689811706543, + "learning_rate": 1.2016368195700122e-06, + "loss": 1.5958, + "step": 12399 + }, + { + "epoch": 0.9334010801859275, + "grad_norm": 4.40662145614624, + "learning_rate": 1.1989816653483755e-06, + "loss": 2.0892, + "step": 12400 + }, + { + "epoch": 0.9334763544665876, + "grad_norm": 6.134064197540283, + "learning_rate": 1.1963294122072533e-06, + "loss": 1.3814, + "step": 12401 + }, + { + "epoch": 0.9335516287472477, + "grad_norm": 5.066925525665283, + "learning_rate": 1.1936800603043141e-06, + "loss": 1.2358, + "step": 12402 + }, + { + "epoch": 0.933626903027908, + "grad_norm": 5.94294548034668, + "learning_rate": 1.1910336097970541e-06, + "loss": 2.0095, + "step": 12403 + }, + { + "epoch": 0.9337021773085681, + "grad_norm": 9.314642906188965, + "learning_rate": 1.1883900608427924e-06, + "loss": 1.5994, + "step": 12404 + }, + { + "epoch": 0.9337774515892282, + "grad_norm": 5.044084548950195, + "learning_rate": 1.1857494135986858e-06, + "loss": 1.7027, + "step": 12405 + }, + { + "epoch": 0.9338527258698884, + "grad_norm": 5.547286033630371, + "learning_rate": 1.1831116682217203e-06, + "loss": 1.6115, + "step": 12406 + }, + { + "epoch": 0.9339280001505486, + "grad_norm": 4.696609020233154, + "learning_rate": 1.1804768248686926e-06, + "loss": 2.2242, + "step": 12407 + }, + { + "epoch": 0.9340032744312087, + "grad_norm": 7.525778293609619, + "learning_rate": 1.1778448836962385e-06, + "loss": 1.3349, + "step": 12408 + }, + { + "epoch": 0.9340785487118689, + "grad_norm": 5.140316486358643, + "learning_rate": 1.1752158448608164e-06, + "loss": 1.9508, + "step": 12409 + }, + { + "epoch": 0.934153822992529, + "grad_norm": 4.8282318115234375, + "learning_rate": 1.1725897085187231e-06, + "loss": 1.7799, + "step": 12410 + }, + { + "epoch": 0.9342290972731891, + "grad_norm": 6.452484607696533, + "learning_rate": 1.1699664748260676e-06, + "loss": 1.9593, + "step": 12411 + }, + { + "epoch": 0.9343043715538494, + "grad_norm": 5.4392290115356445, + "learning_rate": 1.1673461439387967e-06, + "loss": 1.6865, + "step": 12412 + }, + { + "epoch": 0.9343796458345095, + "grad_norm": 6.571662902832031, + "learning_rate": 1.1647287160126807e-06, + "loss": 1.9043, + "step": 12413 + }, + { + "epoch": 0.9344549201151696, + "grad_norm": 6.11766242980957, + "learning_rate": 1.1621141912033173e-06, + "loss": 1.7401, + "step": 12414 + }, + { + "epoch": 0.9345301943958298, + "grad_norm": 4.414893627166748, + "learning_rate": 1.1595025696661376e-06, + "loss": 1.9431, + "step": 12415 + }, + { + "epoch": 0.93460546867649, + "grad_norm": 5.226499080657959, + "learning_rate": 1.1568938515563843e-06, + "loss": 1.7418, + "step": 12416 + }, + { + "epoch": 0.9346807429571501, + "grad_norm": 3.7228245735168457, + "learning_rate": 1.1542880370291443e-06, + "loss": 1.6627, + "step": 12417 + }, + { + "epoch": 0.9347560172378103, + "grad_norm": 5.262889862060547, + "learning_rate": 1.1516851262393325e-06, + "loss": 1.8866, + "step": 12418 + }, + { + "epoch": 0.9348312915184704, + "grad_norm": 7.188180446624756, + "learning_rate": 1.1490851193416751e-06, + "loss": 2.1864, + "step": 12419 + }, + { + "epoch": 0.9349065657991306, + "grad_norm": 4.463619232177734, + "learning_rate": 1.1464880164907377e-06, + "loss": 1.4231, + "step": 12420 + }, + { + "epoch": 0.9349818400797907, + "grad_norm": 6.451642036437988, + "learning_rate": 1.1438938178409187e-06, + "loss": 1.7827, + "step": 12421 + }, + { + "epoch": 0.9350571143604509, + "grad_norm": 4.099438667297363, + "learning_rate": 1.1413025235464226e-06, + "loss": 1.6598, + "step": 12422 + }, + { + "epoch": 0.935132388641111, + "grad_norm": 6.34583044052124, + "learning_rate": 1.1387141337612983e-06, + "loss": 1.635, + "step": 12423 + }, + { + "epoch": 0.9352076629217712, + "grad_norm": 8.350417137145996, + "learning_rate": 1.136128648639423e-06, + "loss": 1.8617, + "step": 12424 + }, + { + "epoch": 0.9352829372024314, + "grad_norm": 5.344479560852051, + "learning_rate": 1.1335460683344957e-06, + "loss": 1.8442, + "step": 12425 + }, + { + "epoch": 0.9353582114830915, + "grad_norm": 5.231409072875977, + "learning_rate": 1.1309663930000324e-06, + "loss": 1.8191, + "step": 12426 + }, + { + "epoch": 0.9354334857637516, + "grad_norm": 5.588863849639893, + "learning_rate": 1.1283896227893997e-06, + "loss": 1.6049, + "step": 12427 + }, + { + "epoch": 0.9355087600444119, + "grad_norm": 7.030704975128174, + "learning_rate": 1.1258157578557804e-06, + "loss": 1.7882, + "step": 12428 + }, + { + "epoch": 0.935584034325072, + "grad_norm": 6.795058250427246, + "learning_rate": 1.1232447983521743e-06, + "loss": 2.164, + "step": 12429 + }, + { + "epoch": 0.9356593086057321, + "grad_norm": 5.804948806762695, + "learning_rate": 1.1206767444314204e-06, + "loss": 1.3897, + "step": 12430 + }, + { + "epoch": 0.9357345828863923, + "grad_norm": 4.454458713531494, + "learning_rate": 1.1181115962461853e-06, + "loss": 1.7913, + "step": 12431 + }, + { + "epoch": 0.9358098571670525, + "grad_norm": 5.039211273193359, + "learning_rate": 1.1155493539489636e-06, + "loss": 1.6498, + "step": 12432 + }, + { + "epoch": 0.9358851314477126, + "grad_norm": 6.152379512786865, + "learning_rate": 1.1129900176920616e-06, + "loss": 2.0145, + "step": 12433 + }, + { + "epoch": 0.9359604057283728, + "grad_norm": 4.326107025146484, + "learning_rate": 1.110433587627635e-06, + "loss": 1.771, + "step": 12434 + }, + { + "epoch": 0.9360356800090329, + "grad_norm": 6.058328628540039, + "learning_rate": 1.1078800639076458e-06, + "loss": 1.9418, + "step": 12435 + }, + { + "epoch": 0.936110954289693, + "grad_norm": 6.08373498916626, + "learning_rate": 1.105329446683906e-06, + "loss": 1.6828, + "step": 12436 + }, + { + "epoch": 0.9361862285703533, + "grad_norm": 6.416456699371338, + "learning_rate": 1.1027817361080273e-06, + "loss": 1.6574, + "step": 12437 + }, + { + "epoch": 0.9362615028510134, + "grad_norm": 4.993826866149902, + "learning_rate": 1.1002369323314777e-06, + "loss": 1.6932, + "step": 12438 + }, + { + "epoch": 0.9363367771316735, + "grad_norm": 4.442711353302002, + "learning_rate": 1.0976950355055304e-06, + "loss": 1.3363, + "step": 12439 + }, + { + "epoch": 0.9364120514123337, + "grad_norm": 7.723465442657471, + "learning_rate": 1.0951560457813036e-06, + "loss": 1.8088, + "step": 12440 + }, + { + "epoch": 0.9364873256929939, + "grad_norm": 4.392434597015381, + "learning_rate": 1.0926199633097157e-06, + "loss": 1.9165, + "step": 12441 + }, + { + "epoch": 0.936562599973654, + "grad_norm": 7.253666400909424, + "learning_rate": 1.090086788241551e-06, + "loss": 1.9432, + "step": 12442 + }, + { + "epoch": 0.9366378742543141, + "grad_norm": 5.460792064666748, + "learning_rate": 1.087556520727384e-06, + "loss": 1.7116, + "step": 12443 + }, + { + "epoch": 0.9367131485349743, + "grad_norm": 6.114422798156738, + "learning_rate": 1.0850291609176387e-06, + "loss": 1.6893, + "step": 12444 + }, + { + "epoch": 0.9367884228156345, + "grad_norm": 5.0753278732299805, + "learning_rate": 1.0825047089625506e-06, + "loss": 1.3846, + "step": 12445 + }, + { + "epoch": 0.9368636970962946, + "grad_norm": 4.737733364105225, + "learning_rate": 1.0799831650121995e-06, + "loss": 1.6372, + "step": 12446 + }, + { + "epoch": 0.9369389713769548, + "grad_norm": 6.026360988616943, + "learning_rate": 1.0774645292164875e-06, + "loss": 1.7137, + "step": 12447 + }, + { + "epoch": 0.9370142456576149, + "grad_norm": 6.874166011810303, + "learning_rate": 1.0749488017251286e-06, + "loss": 2.1847, + "step": 12448 + }, + { + "epoch": 0.937089519938275, + "grad_norm": 5.302225589752197, + "learning_rate": 1.0724359826876806e-06, + "loss": 1.8939, + "step": 12449 + }, + { + "epoch": 0.9371647942189353, + "grad_norm": 5.188690662384033, + "learning_rate": 1.0699260722535242e-06, + "loss": 1.7143, + "step": 12450 + }, + { + "epoch": 0.9372400684995954, + "grad_norm": 5.56906270980835, + "learning_rate": 1.0674190705718733e-06, + "loss": 1.5983, + "step": 12451 + }, + { + "epoch": 0.9373153427802555, + "grad_norm": 5.406866550445557, + "learning_rate": 1.0649149777917532e-06, + "loss": 1.6205, + "step": 12452 + }, + { + "epoch": 0.9373906170609158, + "grad_norm": 4.310051918029785, + "learning_rate": 1.0624137940620283e-06, + "loss": 1.8467, + "step": 12453 + }, + { + "epoch": 0.9374658913415759, + "grad_norm": 5.226302146911621, + "learning_rate": 1.0599155195313847e-06, + "loss": 1.812, + "step": 12454 + }, + { + "epoch": 0.937541165622236, + "grad_norm": 4.872749328613281, + "learning_rate": 1.0574201543483375e-06, + "loss": 1.7247, + "step": 12455 + }, + { + "epoch": 0.9376164399028962, + "grad_norm": 5.026918888092041, + "learning_rate": 1.0549276986612288e-06, + "loss": 1.4934, + "step": 12456 + }, + { + "epoch": 0.9376917141835563, + "grad_norm": 6.810827255249023, + "learning_rate": 1.0524381526182347e-06, + "loss": 1.5994, + "step": 12457 + }, + { + "epoch": 0.9377669884642165, + "grad_norm": 4.407581329345703, + "learning_rate": 1.0499515163673423e-06, + "loss": 1.6185, + "step": 12458 + }, + { + "epoch": 0.9378422627448767, + "grad_norm": 5.270987510681152, + "learning_rate": 1.0474677900563723e-06, + "loss": 1.3497, + "step": 12459 + }, + { + "epoch": 0.9379175370255368, + "grad_norm": 6.135046482086182, + "learning_rate": 1.0449869738329953e-06, + "loss": 1.9524, + "step": 12460 + }, + { + "epoch": 0.9379928113061969, + "grad_norm": 4.757420063018799, + "learning_rate": 1.0425090678446658e-06, + "loss": 1.9275, + "step": 12461 + }, + { + "epoch": 0.9380680855868571, + "grad_norm": 7.520183086395264, + "learning_rate": 1.0400340722386936e-06, + "loss": 1.8929, + "step": 12462 + }, + { + "epoch": 0.9381433598675173, + "grad_norm": 5.079556941986084, + "learning_rate": 1.0375619871622222e-06, + "loss": 1.6177, + "step": 12463 + }, + { + "epoch": 0.9382186341481774, + "grad_norm": 4.4423723220825195, + "learning_rate": 1.0350928127622005e-06, + "loss": 1.7271, + "step": 12464 + }, + { + "epoch": 0.9382939084288375, + "grad_norm": 4.50895881652832, + "learning_rate": 1.0326265491854114e-06, + "loss": 1.8119, + "step": 12465 + }, + { + "epoch": 0.9383691827094978, + "grad_norm": 7.6383442878723145, + "learning_rate": 1.0301631965784763e-06, + "loss": 1.311, + "step": 12466 + }, + { + "epoch": 0.9384444569901579, + "grad_norm": 8.528254508972168, + "learning_rate": 1.0277027550878226e-06, + "loss": 1.7583, + "step": 12467 + }, + { + "epoch": 0.938519731270818, + "grad_norm": 6.644458770751953, + "learning_rate": 1.0252452248597277e-06, + "loss": 1.9075, + "step": 12468 + }, + { + "epoch": 0.9385950055514782, + "grad_norm": 6.614968299865723, + "learning_rate": 1.0227906060402747e-06, + "loss": 1.6182, + "step": 12469 + }, + { + "epoch": 0.9386702798321384, + "grad_norm": 6.287567138671875, + "learning_rate": 1.0203388987753971e-06, + "loss": 1.6834, + "step": 12470 + }, + { + "epoch": 0.9387455541127985, + "grad_norm": 5.54484224319458, + "learning_rate": 1.0178901032108278e-06, + "loss": 1.6684, + "step": 12471 + }, + { + "epoch": 0.9388208283934587, + "grad_norm": 4.640774726867676, + "learning_rate": 1.0154442194921455e-06, + "loss": 1.9592, + "step": 12472 + }, + { + "epoch": 0.9388961026741188, + "grad_norm": 5.380571365356445, + "learning_rate": 1.0130012477647499e-06, + "loss": 1.7724, + "step": 12473 + }, + { + "epoch": 0.938971376954779, + "grad_norm": 5.134807586669922, + "learning_rate": 1.0105611881738808e-06, + "loss": 1.9337, + "step": 12474 + }, + { + "epoch": 0.9390466512354392, + "grad_norm": 4.549726486206055, + "learning_rate": 1.008124040864572e-06, + "loss": 2.0434, + "step": 12475 + }, + { + "epoch": 0.9391219255160993, + "grad_norm": 6.080228328704834, + "learning_rate": 1.0056898059817245e-06, + "loss": 1.854, + "step": 12476 + }, + { + "epoch": 0.9391971997967594, + "grad_norm": 5.546905994415283, + "learning_rate": 1.0032584836700277e-06, + "loss": 1.6351, + "step": 12477 + }, + { + "epoch": 0.9392724740774197, + "grad_norm": 5.720434665679932, + "learning_rate": 1.0008300740740328e-06, + "loss": 1.9311, + "step": 12478 + }, + { + "epoch": 0.9393477483580798, + "grad_norm": 5.755777359008789, + "learning_rate": 9.984045773380968e-07, + "loss": 1.4386, + "step": 12479 + }, + { + "epoch": 0.9394230226387399, + "grad_norm": 5.842292785644531, + "learning_rate": 9.95981993606404e-07, + "loss": 1.7596, + "step": 12480 + }, + { + "epoch": 0.9394982969194, + "grad_norm": 5.842292785644531, + "learning_rate": 9.95981993606404e-07, + "loss": 2.0823, + "step": 12481 + }, + { + "epoch": 0.9395735712000602, + "grad_norm": 7.733410835266113, + "learning_rate": 9.93562323022973e-07, + "loss": 2.2521, + "step": 12482 + }, + { + "epoch": 0.9396488454807204, + "grad_norm": 8.049281120300293, + "learning_rate": 9.911455657316492e-07, + "loss": 1.6223, + "step": 12483 + }, + { + "epoch": 0.9397241197613805, + "grad_norm": 5.2476582527160645, + "learning_rate": 9.88731721876096e-07, + "loss": 1.7754, + "step": 12484 + }, + { + "epoch": 0.9397993940420407, + "grad_norm": 4.549846172332764, + "learning_rate": 9.86320791599815e-07, + "loss": 1.6596, + "step": 12485 + }, + { + "epoch": 0.9398746683227008, + "grad_norm": 4.986956596374512, + "learning_rate": 9.839127750461308e-07, + "loss": 2.0868, + "step": 12486 + }, + { + "epoch": 0.939949942603361, + "grad_norm": 4.548063278198242, + "learning_rate": 9.815076723581784e-07, + "loss": 1.74, + "step": 12487 + }, + { + "epoch": 0.9400252168840212, + "grad_norm": 5.637622356414795, + "learning_rate": 9.791054836789549e-07, + "loss": 1.2324, + "step": 12488 + }, + { + "epoch": 0.9401004911646813, + "grad_norm": 6.698195457458496, + "learning_rate": 9.767062091512514e-07, + "loss": 1.476, + "step": 12489 + }, + { + "epoch": 0.9401757654453414, + "grad_norm": 5.241351127624512, + "learning_rate": 9.743098489177037e-07, + "loss": 1.5097, + "step": 12490 + }, + { + "epoch": 0.9402510397260017, + "grad_norm": 9.292669296264648, + "learning_rate": 9.719164031207594e-07, + "loss": 1.7098, + "step": 12491 + }, + { + "epoch": 0.9403263140066618, + "grad_norm": 4.180777072906494, + "learning_rate": 9.6952587190271e-07, + "loss": 1.6337, + "step": 12492 + }, + { + "epoch": 0.9404015882873219, + "grad_norm": 5.366047382354736, + "learning_rate": 9.671382554056585e-07, + "loss": 1.6953, + "step": 12493 + }, + { + "epoch": 0.9404768625679821, + "grad_norm": 6.424968719482422, + "learning_rate": 9.647535537715524e-07, + "loss": 1.9156, + "step": 12494 + }, + { + "epoch": 0.9405521368486423, + "grad_norm": 5.5772857666015625, + "learning_rate": 9.623717671421451e-07, + "loss": 1.4928, + "step": 12495 + }, + { + "epoch": 0.9406274111293024, + "grad_norm": 5.922527313232422, + "learning_rate": 9.599928956590343e-07, + "loss": 1.6548, + "step": 12496 + }, + { + "epoch": 0.9407026854099626, + "grad_norm": 4.799578666687012, + "learning_rate": 9.576169394636348e-07, + "loss": 1.9037, + "step": 12497 + }, + { + "epoch": 0.9407779596906227, + "grad_norm": 5.528852462768555, + "learning_rate": 9.55243898697189e-07, + "loss": 1.7065, + "step": 12498 + }, + { + "epoch": 0.9408532339712828, + "grad_norm": 6.137616157531738, + "learning_rate": 9.528737735007675e-07, + "loss": 1.7302, + "step": 12499 + }, + { + "epoch": 0.940928508251943, + "grad_norm": 6.736990451812744, + "learning_rate": 9.505065640152689e-07, + "loss": 1.6853, + "step": 12500 + }, + { + "epoch": 0.9410037825326032, + "grad_norm": 4.613751411437988, + "learning_rate": 9.481422703814191e-07, + "loss": 1.8414, + "step": 12501 + }, + { + "epoch": 0.9410790568132633, + "grad_norm": 4.368471145629883, + "learning_rate": 9.457808927397616e-07, + "loss": 1.5764, + "step": 12502 + }, + { + "epoch": 0.9411543310939234, + "grad_norm": 5.797604560852051, + "learning_rate": 9.434224312306839e-07, + "loss": 1.5189, + "step": 12503 + }, + { + "epoch": 0.9412296053745837, + "grad_norm": 4.951598167419434, + "learning_rate": 9.410668859943738e-07, + "loss": 1.6908, + "step": 12504 + }, + { + "epoch": 0.9413048796552438, + "grad_norm": 5.668099880218506, + "learning_rate": 9.387142571708751e-07, + "loss": 2.0259, + "step": 12505 + }, + { + "epoch": 0.9413801539359039, + "grad_norm": 6.4489545822143555, + "learning_rate": 9.363645449000425e-07, + "loss": 1.4728, + "step": 12506 + }, + { + "epoch": 0.9414554282165641, + "grad_norm": 4.458563327789307, + "learning_rate": 9.340177493215641e-07, + "loss": 1.7233, + "step": 12507 + }, + { + "epoch": 0.9415307024972243, + "grad_norm": 4.0902934074401855, + "learning_rate": 9.31673870574934e-07, + "loss": 1.654, + "step": 12508 + }, + { + "epoch": 0.9416059767778844, + "grad_norm": 5.992631912231445, + "learning_rate": 9.293329087995073e-07, + "loss": 1.9809, + "step": 12509 + }, + { + "epoch": 0.9416812510585446, + "grad_norm": 7.571398735046387, + "learning_rate": 9.26994864134434e-07, + "loss": 2.0451, + "step": 12510 + }, + { + "epoch": 0.9417565253392047, + "grad_norm": 5.377318859100342, + "learning_rate": 9.246597367187138e-07, + "loss": 1.9563, + "step": 12511 + }, + { + "epoch": 0.9418317996198649, + "grad_norm": 6.769304275512695, + "learning_rate": 9.22327526691158e-07, + "loss": 1.9248, + "step": 12512 + }, + { + "epoch": 0.9419070739005251, + "grad_norm": 4.998494625091553, + "learning_rate": 9.199982341904112e-07, + "loss": 1.4112, + "step": 12513 + }, + { + "epoch": 0.9419823481811852, + "grad_norm": 5.737831115722656, + "learning_rate": 9.176718593549461e-07, + "loss": 1.6631, + "step": 12514 + }, + { + "epoch": 0.9420576224618453, + "grad_norm": 4.237371444702148, + "learning_rate": 9.153484023230519e-07, + "loss": 1.8976, + "step": 12515 + }, + { + "epoch": 0.9421328967425056, + "grad_norm": 8.039645195007324, + "learning_rate": 9.13027863232857e-07, + "loss": 1.7093, + "step": 12516 + }, + { + "epoch": 0.9422081710231657, + "grad_norm": 8.183951377868652, + "learning_rate": 9.107102422223069e-07, + "loss": 1.8546, + "step": 12517 + }, + { + "epoch": 0.9422834453038258, + "grad_norm": 4.903064250946045, + "learning_rate": 9.083955394291799e-07, + "loss": 1.4752, + "step": 12518 + }, + { + "epoch": 0.9423587195844859, + "grad_norm": 5.741955757141113, + "learning_rate": 9.060837549910772e-07, + "loss": 1.7708, + "step": 12519 + }, + { + "epoch": 0.9424339938651461, + "grad_norm": 5.609581470489502, + "learning_rate": 9.03774889045439e-07, + "loss": 1.519, + "step": 12520 + }, + { + "epoch": 0.9425092681458063, + "grad_norm": 8.574150085449219, + "learning_rate": 9.014689417294997e-07, + "loss": 1.8016, + "step": 12521 + }, + { + "epoch": 0.9425845424264664, + "grad_norm": 6.401511192321777, + "learning_rate": 8.99165913180361e-07, + "loss": 1.6493, + "step": 12522 + }, + { + "epoch": 0.9426598167071266, + "grad_norm": 5.3528008460998535, + "learning_rate": 8.968658035349131e-07, + "loss": 2.1504, + "step": 12523 + }, + { + "epoch": 0.9427350909877867, + "grad_norm": 7.316405773162842, + "learning_rate": 8.94568612929908e-07, + "loss": 1.6506, + "step": 12524 + }, + { + "epoch": 0.9428103652684469, + "grad_norm": 4.656257629394531, + "learning_rate": 8.922743415018975e-07, + "loss": 1.4228, + "step": 12525 + }, + { + "epoch": 0.9428856395491071, + "grad_norm": 5.314303398132324, + "learning_rate": 8.899829893872724e-07, + "loss": 1.7005, + "step": 12526 + }, + { + "epoch": 0.9429609138297672, + "grad_norm": 6.6532769203186035, + "learning_rate": 8.876945567222461e-07, + "loss": 1.644, + "step": 12527 + }, + { + "epoch": 0.9430361881104273, + "grad_norm": 5.868844032287598, + "learning_rate": 8.854090436428542e-07, + "loss": 2.0362, + "step": 12528 + }, + { + "epoch": 0.9431114623910876, + "grad_norm": 5.348581790924072, + "learning_rate": 8.831264502849712e-07, + "loss": 1.5585, + "step": 12529 + }, + { + "epoch": 0.9431867366717477, + "grad_norm": 4.28386926651001, + "learning_rate": 8.808467767842887e-07, + "loss": 1.4932, + "step": 12530 + }, + { + "epoch": 0.9432620109524078, + "grad_norm": 5.758905410766602, + "learning_rate": 8.785700232763316e-07, + "loss": 1.862, + "step": 12531 + }, + { + "epoch": 0.943337285233068, + "grad_norm": 4.322707176208496, + "learning_rate": 8.762961898964362e-07, + "loss": 1.5659, + "step": 12532 + }, + { + "epoch": 0.9434125595137282, + "grad_norm": 4.687629222869873, + "learning_rate": 8.740252767797885e-07, + "loss": 1.3565, + "step": 12533 + }, + { + "epoch": 0.9434878337943883, + "grad_norm": 5.1688408851623535, + "learning_rate": 8.717572840613697e-07, + "loss": 1.7934, + "step": 12534 + }, + { + "epoch": 0.9435631080750485, + "grad_norm": 7.117114067077637, + "learning_rate": 8.694922118760273e-07, + "loss": 1.8326, + "step": 12535 + }, + { + "epoch": 0.9436383823557086, + "grad_norm": 5.018606662750244, + "learning_rate": 8.672300603583927e-07, + "loss": 1.7247, + "step": 12536 + }, + { + "epoch": 0.9437136566363687, + "grad_norm": 6.573191165924072, + "learning_rate": 8.649708296429526e-07, + "loss": 1.5378, + "step": 12537 + }, + { + "epoch": 0.943788930917029, + "grad_norm": 5.426792144775391, + "learning_rate": 8.627145198640163e-07, + "loss": 1.6134, + "step": 12538 + }, + { + "epoch": 0.9438642051976891, + "grad_norm": 5.405853748321533, + "learning_rate": 8.604611311557098e-07, + "loss": 1.9442, + "step": 12539 + }, + { + "epoch": 0.9439394794783492, + "grad_norm": 4.964509963989258, + "learning_rate": 8.582106636519871e-07, + "loss": 1.4679, + "step": 12540 + }, + { + "epoch": 0.9440147537590093, + "grad_norm": 4.2952704429626465, + "learning_rate": 8.559631174866467e-07, + "loss": 1.4856, + "step": 12541 + }, + { + "epoch": 0.9440900280396696, + "grad_norm": 6.918556213378906, + "learning_rate": 8.537184927932818e-07, + "loss": 1.5914, + "step": 12542 + }, + { + "epoch": 0.9441653023203297, + "grad_norm": 6.630629062652588, + "learning_rate": 8.514767897053411e-07, + "loss": 1.7433, + "step": 12543 + }, + { + "epoch": 0.9442405766009898, + "grad_norm": 4.634683609008789, + "learning_rate": 8.492380083560736e-07, + "loss": 1.6828, + "step": 12544 + }, + { + "epoch": 0.94431585088165, + "grad_norm": 5.763222694396973, + "learning_rate": 8.470021488785839e-07, + "loss": 1.5053, + "step": 12545 + }, + { + "epoch": 0.9443911251623102, + "grad_norm": 6.340846061706543, + "learning_rate": 8.447692114057826e-07, + "loss": 1.865, + "step": 12546 + }, + { + "epoch": 0.9444663994429703, + "grad_norm": 6.015748500823975, + "learning_rate": 8.425391960704021e-07, + "loss": 1.7199, + "step": 12547 + }, + { + "epoch": 0.9445416737236305, + "grad_norm": 5.575201511383057, + "learning_rate": 8.403121030050254e-07, + "loss": 1.9596, + "step": 12548 + }, + { + "epoch": 0.9446169480042906, + "grad_norm": 5.872047424316406, + "learning_rate": 8.380879323420354e-07, + "loss": 2.0706, + "step": 12549 + }, + { + "epoch": 0.9446922222849508, + "grad_norm": 5.524749279022217, + "learning_rate": 8.358666842136542e-07, + "loss": 1.842, + "step": 12550 + }, + { + "epoch": 0.944767496565611, + "grad_norm": 4.896964073181152, + "learning_rate": 8.336483587519317e-07, + "loss": 1.7122, + "step": 12551 + }, + { + "epoch": 0.9448427708462711, + "grad_norm": 6.14225959777832, + "learning_rate": 8.314329560887458e-07, + "loss": 1.7822, + "step": 12552 + }, + { + "epoch": 0.9449180451269312, + "grad_norm": 5.46946907043457, + "learning_rate": 8.292204763557854e-07, + "loss": 1.4848, + "step": 12553 + }, + { + "epoch": 0.9449933194075915, + "grad_norm": 5.759443283081055, + "learning_rate": 8.270109196845843e-07, + "loss": 1.8836, + "step": 12554 + }, + { + "epoch": 0.9450685936882516, + "grad_norm": 4.762637138366699, + "learning_rate": 8.248042862064875e-07, + "loss": 1.5756, + "step": 12555 + }, + { + "epoch": 0.9451438679689117, + "grad_norm": 5.927667617797852, + "learning_rate": 8.226005760526789e-07, + "loss": 1.5246, + "step": 12556 + }, + { + "epoch": 0.9452191422495719, + "grad_norm": 4.547461986541748, + "learning_rate": 8.203997893541593e-07, + "loss": 1.7886, + "step": 12557 + }, + { + "epoch": 0.945294416530232, + "grad_norm": 5.653580188751221, + "learning_rate": 8.182019262417628e-07, + "loss": 1.6881, + "step": 12558 + }, + { + "epoch": 0.9453696908108922, + "grad_norm": 6.114114761352539, + "learning_rate": 8.160069868461406e-07, + "loss": 1.7012, + "step": 12559 + }, + { + "epoch": 0.9454449650915523, + "grad_norm": 4.542468070983887, + "learning_rate": 8.138149712977716e-07, + "loss": 1.9193, + "step": 12560 + }, + { + "epoch": 0.9455202393722125, + "grad_norm": 5.340787887573242, + "learning_rate": 8.116258797269793e-07, + "loss": 1.819, + "step": 12561 + }, + { + "epoch": 0.9455955136528726, + "grad_norm": 4.6120123863220215, + "learning_rate": 8.09439712263893e-07, + "loss": 1.2782, + "step": 12562 + }, + { + "epoch": 0.9456707879335328, + "grad_norm": 7.416088581085205, + "learning_rate": 8.072564690384643e-07, + "loss": 1.5048, + "step": 12563 + }, + { + "epoch": 0.945746062214193, + "grad_norm": 7.504977226257324, + "learning_rate": 8.050761501804949e-07, + "loss": 1.5301, + "step": 12564 + }, + { + "epoch": 0.9458213364948531, + "grad_norm": 4.926298141479492, + "learning_rate": 8.028987558195922e-07, + "loss": 1.8725, + "step": 12565 + }, + { + "epoch": 0.9458966107755132, + "grad_norm": 8.05074691772461, + "learning_rate": 8.007242860851971e-07, + "loss": 1.9804, + "step": 12566 + }, + { + "epoch": 0.9459718850561735, + "grad_norm": 6.948409080505371, + "learning_rate": 7.98552741106573e-07, + "loss": 1.8467, + "step": 12567 + }, + { + "epoch": 0.9460471593368336, + "grad_norm": 6.183065414428711, + "learning_rate": 7.963841210128109e-07, + "loss": 1.8517, + "step": 12568 + }, + { + "epoch": 0.9461224336174937, + "grad_norm": 5.389442443847656, + "learning_rate": 7.942184259328356e-07, + "loss": 1.4457, + "step": 12569 + }, + { + "epoch": 0.9461977078981539, + "grad_norm": 5.673714637756348, + "learning_rate": 7.920556559953884e-07, + "loss": 1.6746, + "step": 12570 + }, + { + "epoch": 0.9462729821788141, + "grad_norm": 4.9728899002075195, + "learning_rate": 7.898958113290389e-07, + "loss": 1.3762, + "step": 12571 + }, + { + "epoch": 0.9463482564594742, + "grad_norm": 6.527997016906738, + "learning_rate": 7.877388920621786e-07, + "loss": 2.1061, + "step": 12572 + }, + { + "epoch": 0.9464235307401344, + "grad_norm": 5.880069732666016, + "learning_rate": 7.855848983230385e-07, + "loss": 1.7438, + "step": 12573 + }, + { + "epoch": 0.9464988050207945, + "grad_norm": 7.144503116607666, + "learning_rate": 7.834338302396604e-07, + "loss": 1.7178, + "step": 12574 + }, + { + "epoch": 0.9465740793014547, + "grad_norm": 6.0495405197143555, + "learning_rate": 7.812856879399311e-07, + "loss": 1.9783, + "step": 12575 + }, + { + "epoch": 0.9466493535821149, + "grad_norm": 5.908311367034912, + "learning_rate": 7.791404715515372e-07, + "loss": 1.5966, + "step": 12576 + }, + { + "epoch": 0.946724627862775, + "grad_norm": 4.650298118591309, + "learning_rate": 7.7699818120201e-07, + "loss": 1.5503, + "step": 12577 + }, + { + "epoch": 0.9467999021434351, + "grad_norm": 4.715789318084717, + "learning_rate": 7.748588170187087e-07, + "loss": 1.4444, + "step": 12578 + }, + { + "epoch": 0.9468751764240952, + "grad_norm": 5.299562454223633, + "learning_rate": 7.727223791288097e-07, + "loss": 2.0255, + "step": 12579 + }, + { + "epoch": 0.9469504507047555, + "grad_norm": 5.761845111846924, + "learning_rate": 7.705888676593165e-07, + "loss": 1.7115, + "step": 12580 + }, + { + "epoch": 0.9470257249854156, + "grad_norm": 5.574798583984375, + "learning_rate": 7.684582827370557e-07, + "loss": 1.7198, + "step": 12581 + }, + { + "epoch": 0.9471009992660757, + "grad_norm": 6.217630863189697, + "learning_rate": 7.663306244886925e-07, + "loss": 1.8472, + "step": 12582 + }, + { + "epoch": 0.947176273546736, + "grad_norm": 4.493381500244141, + "learning_rate": 7.642058930406981e-07, + "loss": 1.6004, + "step": 12583 + }, + { + "epoch": 0.9472515478273961, + "grad_norm": 5.0270586013793945, + "learning_rate": 7.620840885193992e-07, + "loss": 1.8077, + "step": 12584 + }, + { + "epoch": 0.9473268221080562, + "grad_norm": 5.161044120788574, + "learning_rate": 7.59965211050917e-07, + "loss": 1.8267, + "step": 12585 + }, + { + "epoch": 0.9474020963887164, + "grad_norm": 4.769935131072998, + "learning_rate": 7.57849260761223e-07, + "loss": 1.424, + "step": 12586 + }, + { + "epoch": 0.9474773706693765, + "grad_norm": 4.501280307769775, + "learning_rate": 7.55736237776089e-07, + "loss": 1.5655, + "step": 12587 + }, + { + "epoch": 0.9475526449500367, + "grad_norm": 6.53385066986084, + "learning_rate": 7.536261422211477e-07, + "loss": 1.6743, + "step": 12588 + }, + { + "epoch": 0.9476279192306969, + "grad_norm": 5.590780258178711, + "learning_rate": 7.515189742218265e-07, + "loss": 1.654, + "step": 12589 + }, + { + "epoch": 0.947703193511357, + "grad_norm": 5.566118240356445, + "learning_rate": 7.494147339033919e-07, + "loss": 1.7755, + "step": 12590 + }, + { + "epoch": 0.9477784677920171, + "grad_norm": 5.123452663421631, + "learning_rate": 7.473134213909327e-07, + "loss": 1.7205, + "step": 12591 + }, + { + "epoch": 0.9478537420726774, + "grad_norm": 5.632142066955566, + "learning_rate": 7.452150368093713e-07, + "loss": 1.7436, + "step": 12592 + }, + { + "epoch": 0.9479290163533375, + "grad_norm": 5.397055149078369, + "learning_rate": 7.431195802834468e-07, + "loss": 1.5878, + "step": 12593 + }, + { + "epoch": 0.9480042906339976, + "grad_norm": 5.040858268737793, + "learning_rate": 7.410270519377316e-07, + "loss": 1.7953, + "step": 12594 + }, + { + "epoch": 0.9480795649146578, + "grad_norm": 4.766727924346924, + "learning_rate": 7.389374518966152e-07, + "loss": 1.2923, + "step": 12595 + }, + { + "epoch": 0.948154839195318, + "grad_norm": 4.738611221313477, + "learning_rate": 7.368507802843205e-07, + "loss": 1.9148, + "step": 12596 + }, + { + "epoch": 0.9482301134759781, + "grad_norm": 5.1082916259765625, + "learning_rate": 7.347670372249038e-07, + "loss": 1.7276, + "step": 12597 + }, + { + "epoch": 0.9483053877566382, + "grad_norm": 4.379059791564941, + "learning_rate": 7.326862228422216e-07, + "loss": 1.6177, + "step": 12598 + }, + { + "epoch": 0.9483806620372984, + "grad_norm": 4.747915267944336, + "learning_rate": 7.306083372599804e-07, + "loss": 2.0814, + "step": 12599 + }, + { + "epoch": 0.9484559363179585, + "grad_norm": 6.735932350158691, + "learning_rate": 7.285333806017036e-07, + "loss": 1.6567, + "step": 12600 + }, + { + "epoch": 0.9485312105986187, + "grad_norm": 4.8582282066345215, + "learning_rate": 7.264613529907482e-07, + "loss": 1.4624, + "step": 12601 + }, + { + "epoch": 0.9486064848792789, + "grad_norm": 3.982478618621826, + "learning_rate": 7.243922545502768e-07, + "loss": 1.8557, + "step": 12602 + }, + { + "epoch": 0.948681759159939, + "grad_norm": 6.0325469970703125, + "learning_rate": 7.223260854033021e-07, + "loss": 1.7344, + "step": 12603 + }, + { + "epoch": 0.9487570334405991, + "grad_norm": 5.877956867218018, + "learning_rate": 7.202628456726423e-07, + "loss": 1.8891, + "step": 12604 + }, + { + "epoch": 0.9488323077212594, + "grad_norm": 5.365961074829102, + "learning_rate": 7.182025354809607e-07, + "loss": 1.5835, + "step": 12605 + }, + { + "epoch": 0.9489075820019195, + "grad_norm": 5.662789344787598, + "learning_rate": 7.161451549507314e-07, + "loss": 1.7199, + "step": 12606 + }, + { + "epoch": 0.9489828562825796, + "grad_norm": 5.25111722946167, + "learning_rate": 7.140907042042621e-07, + "loss": 1.5117, + "step": 12607 + }, + { + "epoch": 0.9490581305632398, + "grad_norm": 4.914377689361572, + "learning_rate": 7.12039183363683e-07, + "loss": 1.6782, + "step": 12608 + }, + { + "epoch": 0.9491334048439, + "grad_norm": 5.842033386230469, + "learning_rate": 7.09990592550952e-07, + "loss": 1.5695, + "step": 12609 + }, + { + "epoch": 0.9492086791245601, + "grad_norm": 5.765605449676514, + "learning_rate": 7.07944931887855e-07, + "loss": 1.4335, + "step": 12610 + }, + { + "epoch": 0.9492839534052203, + "grad_norm": 5.255399227142334, + "learning_rate": 7.059022014959893e-07, + "loss": 1.663, + "step": 12611 + }, + { + "epoch": 0.9493592276858804, + "grad_norm": 8.746234893798828, + "learning_rate": 7.03862401496802e-07, + "loss": 1.786, + "step": 12612 + }, + { + "epoch": 0.9494345019665406, + "grad_norm": 5.831923961639404, + "learning_rate": 7.018255320115519e-07, + "loss": 1.7251, + "step": 12613 + }, + { + "epoch": 0.9495097762472008, + "grad_norm": 4.88585901260376, + "learning_rate": 6.997915931613197e-07, + "loss": 1.8299, + "step": 12614 + }, + { + "epoch": 0.9495850505278609, + "grad_norm": 5.934195518493652, + "learning_rate": 6.977605850670144e-07, + "loss": 1.612, + "step": 12615 + }, + { + "epoch": 0.949660324808521, + "grad_norm": 4.965317249298096, + "learning_rate": 6.957325078493837e-07, + "loss": 1.8125, + "step": 12616 + }, + { + "epoch": 0.9497355990891813, + "grad_norm": 5.332989692687988, + "learning_rate": 6.937073616289813e-07, + "loss": 1.7354, + "step": 12617 + }, + { + "epoch": 0.9498108733698414, + "grad_norm": 6.912236213684082, + "learning_rate": 6.916851465261997e-07, + "loss": 1.5647, + "step": 12618 + }, + { + "epoch": 0.9498861476505015, + "grad_norm": 7.173828601837158, + "learning_rate": 6.89665862661254e-07, + "loss": 1.9432, + "step": 12619 + }, + { + "epoch": 0.9499614219311616, + "grad_norm": 5.719157695770264, + "learning_rate": 6.876495101541924e-07, + "loss": 1.9049, + "step": 12620 + }, + { + "epoch": 0.9500366962118219, + "grad_norm": 5.528200149536133, + "learning_rate": 6.856360891248692e-07, + "loss": 1.7393, + "step": 12621 + }, + { + "epoch": 0.950111970492482, + "grad_norm": 6.372963905334473, + "learning_rate": 6.836255996929886e-07, + "loss": 1.7865, + "step": 12622 + }, + { + "epoch": 0.9501872447731421, + "grad_norm": 4.901663780212402, + "learning_rate": 6.816180419780493e-07, + "loss": 1.9239, + "step": 12623 + }, + { + "epoch": 0.9502625190538023, + "grad_norm": 5.7198486328125, + "learning_rate": 6.796134160994172e-07, + "loss": 1.7213, + "step": 12624 + }, + { + "epoch": 0.9503377933344624, + "grad_norm": 7.771454334259033, + "learning_rate": 6.776117221762468e-07, + "loss": 1.8762, + "step": 12625 + }, + { + "epoch": 0.9504130676151226, + "grad_norm": 5.403398513793945, + "learning_rate": 6.756129603275374e-07, + "loss": 1.8571, + "step": 12626 + }, + { + "epoch": 0.9504883418957828, + "grad_norm": 4.727996826171875, + "learning_rate": 6.736171306721162e-07, + "loss": 1.5952, + "step": 12627 + }, + { + "epoch": 0.9505636161764429, + "grad_norm": 4.673222064971924, + "learning_rate": 6.71624233328616e-07, + "loss": 1.5117, + "step": 12628 + }, + { + "epoch": 0.950638890457103, + "grad_norm": 5.851140022277832, + "learning_rate": 6.696342684155199e-07, + "loss": 1.4182, + "step": 12629 + }, + { + "epoch": 0.9507141647377633, + "grad_norm": 4.836675643920898, + "learning_rate": 6.676472360511165e-07, + "loss": 1.9661, + "step": 12630 + }, + { + "epoch": 0.9507894390184234, + "grad_norm": 5.107874870300293, + "learning_rate": 6.656631363535448e-07, + "loss": 2.1448, + "step": 12631 + }, + { + "epoch": 0.9508647132990835, + "grad_norm": 4.364999294281006, + "learning_rate": 6.636819694407382e-07, + "loss": 1.8065, + "step": 12632 + }, + { + "epoch": 0.9509399875797437, + "grad_norm": 4.818136215209961, + "learning_rate": 6.617037354304745e-07, + "loss": 1.4799, + "step": 12633 + }, + { + "epoch": 0.9510152618604039, + "grad_norm": 4.360340118408203, + "learning_rate": 6.597284344403599e-07, + "loss": 1.4148, + "step": 12634 + }, + { + "epoch": 0.951090536141064, + "grad_norm": 6.0234880447387695, + "learning_rate": 6.577560665878169e-07, + "loss": 1.4179, + "step": 12635 + }, + { + "epoch": 0.9511658104217242, + "grad_norm": 7.2181925773620605, + "learning_rate": 6.557866319900907e-07, + "loss": 1.6164, + "step": 12636 + }, + { + "epoch": 0.9512410847023843, + "grad_norm": 6.3350114822387695, + "learning_rate": 6.538201307642711e-07, + "loss": 1.5292, + "step": 12637 + }, + { + "epoch": 0.9513163589830445, + "grad_norm": 5.853308200836182, + "learning_rate": 6.518565630272533e-07, + "loss": 1.7489, + "step": 12638 + }, + { + "epoch": 0.9513916332637046, + "grad_norm": 5.933257102966309, + "learning_rate": 6.498959288957662e-07, + "loss": 1.9363, + "step": 12639 + }, + { + "epoch": 0.9514669075443648, + "grad_norm": 6.689883232116699, + "learning_rate": 6.479382284863611e-07, + "loss": 1.6647, + "step": 12640 + }, + { + "epoch": 0.9515421818250249, + "grad_norm": 5.111749172210693, + "learning_rate": 6.459834619154282e-07, + "loss": 1.9243, + "step": 12641 + }, + { + "epoch": 0.951617456105685, + "grad_norm": 4.389624118804932, + "learning_rate": 6.440316292991633e-07, + "loss": 1.2527, + "step": 12642 + }, + { + "epoch": 0.9516927303863453, + "grad_norm": 6.181106090545654, + "learning_rate": 6.420827307535959e-07, + "loss": 1.9714, + "step": 12643 + }, + { + "epoch": 0.9517680046670054, + "grad_norm": 5.142221450805664, + "learning_rate": 6.401367663945945e-07, + "loss": 1.6255, + "step": 12644 + }, + { + "epoch": 0.9518432789476655, + "grad_norm": 6.7326459884643555, + "learning_rate": 6.381937363378277e-07, + "loss": 1.947, + "step": 12645 + }, + { + "epoch": 0.9519185532283257, + "grad_norm": 5.060942649841309, + "learning_rate": 6.362536406988084e-07, + "loss": 1.567, + "step": 12646 + }, + { + "epoch": 0.9519938275089859, + "grad_norm": 6.392467975616455, + "learning_rate": 6.343164795928724e-07, + "loss": 1.5161, + "step": 12647 + }, + { + "epoch": 0.952069101789646, + "grad_norm": 4.735790252685547, + "learning_rate": 6.323822531351775e-07, + "loss": 1.5167, + "step": 12648 + }, + { + "epoch": 0.9521443760703062, + "grad_norm": 6.459567546844482, + "learning_rate": 6.304509614406984e-07, + "loss": 1.552, + "step": 12649 + }, + { + "epoch": 0.9522196503509663, + "grad_norm": 4.889157295227051, + "learning_rate": 6.285226046242543e-07, + "loss": 1.3057, + "step": 12650 + }, + { + "epoch": 0.9522949246316265, + "grad_norm": 5.364915370941162, + "learning_rate": 6.265971828004813e-07, + "loss": 1.6994, + "step": 12651 + }, + { + "epoch": 0.9523701989122867, + "grad_norm": 4.556305408477783, + "learning_rate": 6.246746960838379e-07, + "loss": 1.5721, + "step": 12652 + }, + { + "epoch": 0.9524454731929468, + "grad_norm": 7.982117176055908, + "learning_rate": 6.227551445886104e-07, + "loss": 1.5765, + "step": 12653 + }, + { + "epoch": 0.9525207474736069, + "grad_norm": 7.070639610290527, + "learning_rate": 6.208385284289131e-07, + "loss": 2.07, + "step": 12654 + }, + { + "epoch": 0.9525960217542672, + "grad_norm": 4.184110164642334, + "learning_rate": 6.189248477186715e-07, + "loss": 2.1231, + "step": 12655 + }, + { + "epoch": 0.9526712960349273, + "grad_norm": 5.4122161865234375, + "learning_rate": 6.170141025716669e-07, + "loss": 1.3802, + "step": 12656 + }, + { + "epoch": 0.9527465703155874, + "grad_norm": 4.412859916687012, + "learning_rate": 6.151062931014751e-07, + "loss": 1.9731, + "step": 12657 + }, + { + "epoch": 0.9528218445962475, + "grad_norm": 5.209953784942627, + "learning_rate": 6.132014194215108e-07, + "loss": 1.4634, + "step": 12658 + }, + { + "epoch": 0.9528971188769078, + "grad_norm": 5.093353271484375, + "learning_rate": 6.112994816450169e-07, + "loss": 2.078, + "step": 12659 + }, + { + "epoch": 0.9529723931575679, + "grad_norm": 5.430246829986572, + "learning_rate": 6.094004798850584e-07, + "loss": 1.662, + "step": 12660 + }, + { + "epoch": 0.953047667438228, + "grad_norm": 4.798196792602539, + "learning_rate": 6.075044142545172e-07, + "loss": 1.8175, + "step": 12661 + }, + { + "epoch": 0.9531229417188882, + "grad_norm": 6.064657211303711, + "learning_rate": 6.056112848661199e-07, + "loss": 1.9005, + "step": 12662 + }, + { + "epoch": 0.9531982159995483, + "grad_norm": 4.359076499938965, + "learning_rate": 6.037210918324099e-07, + "loss": 1.581, + "step": 12663 + }, + { + "epoch": 0.9532734902802085, + "grad_norm": 4.044832706451416, + "learning_rate": 6.01833835265736e-07, + "loss": 1.7297, + "step": 12664 + }, + { + "epoch": 0.9533487645608687, + "grad_norm": 4.811741352081299, + "learning_rate": 5.999495152783086e-07, + "loss": 1.6842, + "step": 12665 + }, + { + "epoch": 0.9534240388415288, + "grad_norm": 6.12009859085083, + "learning_rate": 5.980681319821379e-07, + "loss": 1.9095, + "step": 12666 + }, + { + "epoch": 0.9534993131221889, + "grad_norm": 6.275171756744385, + "learning_rate": 5.961896854890681e-07, + "loss": 1.8664, + "step": 12667 + }, + { + "epoch": 0.9535745874028492, + "grad_norm": 5.184971809387207, + "learning_rate": 5.943141759107595e-07, + "loss": 2.1915, + "step": 12668 + }, + { + "epoch": 0.9536498616835093, + "grad_norm": 4.3995137214660645, + "learning_rate": 5.924416033587177e-07, + "loss": 1.636, + "step": 12669 + }, + { + "epoch": 0.9537251359641694, + "grad_norm": 7.882792949676514, + "learning_rate": 5.905719679442534e-07, + "loss": 1.5945, + "step": 12670 + }, + { + "epoch": 0.9538004102448296, + "grad_norm": 4.9927778244018555, + "learning_rate": 5.887052697785167e-07, + "loss": 1.6996, + "step": 12671 + }, + { + "epoch": 0.9538756845254898, + "grad_norm": 4.428548336029053, + "learning_rate": 5.868415089724688e-07, + "loss": 1.6615, + "step": 12672 + }, + { + "epoch": 0.9539509588061499, + "grad_norm": 4.27783203125, + "learning_rate": 5.849806856369156e-07, + "loss": 1.9074, + "step": 12673 + }, + { + "epoch": 0.9540262330868101, + "grad_norm": 5.507120609283447, + "learning_rate": 5.83122799882474e-07, + "loss": 1.4479, + "step": 12674 + }, + { + "epoch": 0.9541015073674702, + "grad_norm": 4.442719459533691, + "learning_rate": 5.812678518195836e-07, + "loss": 1.5942, + "step": 12675 + }, + { + "epoch": 0.9541767816481304, + "grad_norm": 5.206497669219971, + "learning_rate": 5.794158415585282e-07, + "loss": 1.5815, + "step": 12676 + }, + { + "epoch": 0.9542520559287905, + "grad_norm": 5.101005554199219, + "learning_rate": 5.775667692093922e-07, + "loss": 1.741, + "step": 12677 + }, + { + "epoch": 0.9543273302094507, + "grad_norm": 6.174216270446777, + "learning_rate": 5.757206348821098e-07, + "loss": 1.6413, + "step": 12678 + }, + { + "epoch": 0.9544026044901108, + "grad_norm": 6.751343250274658, + "learning_rate": 5.738774386864154e-07, + "loss": 1.5605, + "step": 12679 + }, + { + "epoch": 0.954477878770771, + "grad_norm": 4.732336044311523, + "learning_rate": 5.720371807318881e-07, + "loss": 1.7258, + "step": 12680 + }, + { + "epoch": 0.9545531530514312, + "grad_norm": 5.747982978820801, + "learning_rate": 5.701998611279291e-07, + "loss": 1.6925, + "step": 12681 + }, + { + "epoch": 0.9546284273320913, + "grad_norm": 5.427239418029785, + "learning_rate": 5.683654799837567e-07, + "loss": 1.6573, + "step": 12682 + }, + { + "epoch": 0.9547037016127514, + "grad_norm": 6.149512767791748, + "learning_rate": 5.665340374084172e-07, + "loss": 1.5456, + "step": 12683 + }, + { + "epoch": 0.9547789758934117, + "grad_norm": 5.5960469245910645, + "learning_rate": 5.647055335107954e-07, + "loss": 1.9983, + "step": 12684 + }, + { + "epoch": 0.9548542501740718, + "grad_norm": 6.095613479614258, + "learning_rate": 5.628799683995767e-07, + "loss": 1.7953, + "step": 12685 + }, + { + "epoch": 0.9549295244547319, + "grad_norm": 4.696746826171875, + "learning_rate": 5.610573421832965e-07, + "loss": 1.8766, + "step": 12686 + }, + { + "epoch": 0.9550047987353921, + "grad_norm": 6.406050682067871, + "learning_rate": 5.592376549703015e-07, + "loss": 1.9763, + "step": 12687 + }, + { + "epoch": 0.9550800730160522, + "grad_norm": 7.507273197174072, + "learning_rate": 5.574209068687719e-07, + "loss": 2.0079, + "step": 12688 + }, + { + "epoch": 0.9551553472967124, + "grad_norm": 5.4612226486206055, + "learning_rate": 5.556070979866934e-07, + "loss": 1.8129, + "step": 12689 + }, + { + "epoch": 0.9552306215773726, + "grad_norm": 5.730344772338867, + "learning_rate": 5.537962284319021e-07, + "loss": 1.7734, + "step": 12690 + }, + { + "epoch": 0.9553058958580327, + "grad_norm": 4.310399532318115, + "learning_rate": 5.519882983120561e-07, + "loss": 1.3763, + "step": 12691 + }, + { + "epoch": 0.9553811701386928, + "grad_norm": 4.23614501953125, + "learning_rate": 5.501833077346142e-07, + "loss": 1.8208, + "step": 12692 + }, + { + "epoch": 0.9554564444193531, + "grad_norm": 5.807472229003906, + "learning_rate": 5.483812568068958e-07, + "loss": 1.9063, + "step": 12693 + }, + { + "epoch": 0.9555317187000132, + "grad_norm": 5.356497287750244, + "learning_rate": 5.465821456360098e-07, + "loss": 1.6296, + "step": 12694 + }, + { + "epoch": 0.9556069929806733, + "grad_norm": 8.384337425231934, + "learning_rate": 5.447859743289208e-07, + "loss": 2.2468, + "step": 12695 + }, + { + "epoch": 0.9556822672613334, + "grad_norm": 4.483689308166504, + "learning_rate": 5.429927429924042e-07, + "loss": 1.9173, + "step": 12696 + }, + { + "epoch": 0.9557575415419937, + "grad_norm": 5.285558700561523, + "learning_rate": 5.412024517330638e-07, + "loss": 2.0464, + "step": 12697 + }, + { + "epoch": 0.9558328158226538, + "grad_norm": 4.871053218841553, + "learning_rate": 5.3941510065732e-07, + "loss": 1.6155, + "step": 12698 + }, + { + "epoch": 0.9559080901033139, + "grad_norm": 7.079250812530518, + "learning_rate": 5.376306898714322e-07, + "loss": 1.7695, + "step": 12699 + }, + { + "epoch": 0.9559833643839741, + "grad_norm": 5.054591655731201, + "learning_rate": 5.358492194814768e-07, + "loss": 1.8037, + "step": 12700 + }, + { + "epoch": 0.9560586386646343, + "grad_norm": 5.048430919647217, + "learning_rate": 5.340706895933578e-07, + "loss": 1.8318, + "step": 12701 + }, + { + "epoch": 0.9561339129452944, + "grad_norm": 5.564990043640137, + "learning_rate": 5.322951003127963e-07, + "loss": 1.6238, + "step": 12702 + }, + { + "epoch": 0.9562091872259546, + "grad_norm": 5.284124374389648, + "learning_rate": 5.305224517453577e-07, + "loss": 1.4887, + "step": 12703 + }, + { + "epoch": 0.9562844615066147, + "grad_norm": 4.962721824645996, + "learning_rate": 5.287527439964135e-07, + "loss": 1.6165, + "step": 12704 + }, + { + "epoch": 0.9563597357872748, + "grad_norm": 6.835129261016846, + "learning_rate": 5.269859771711683e-07, + "loss": 2.2373, + "step": 12705 + }, + { + "epoch": 0.9564350100679351, + "grad_norm": 4.361907005310059, + "learning_rate": 5.252221513746547e-07, + "loss": 1.5867, + "step": 12706 + }, + { + "epoch": 0.9565102843485952, + "grad_norm": 4.861889362335205, + "learning_rate": 5.234612667117278e-07, + "loss": 1.6194, + "step": 12707 + }, + { + "epoch": 0.9565855586292553, + "grad_norm": 5.289926528930664, + "learning_rate": 5.217033232870594e-07, + "loss": 1.5487, + "step": 12708 + }, + { + "epoch": 0.9566608329099155, + "grad_norm": 4.22956657409668, + "learning_rate": 5.199483212051659e-07, + "loss": 1.9352, + "step": 12709 + }, + { + "epoch": 0.9567361071905757, + "grad_norm": 4.0802483558654785, + "learning_rate": 5.181962605703639e-07, + "loss": 1.7447, + "step": 12710 + }, + { + "epoch": 0.9568113814712358, + "grad_norm": 5.962789058685303, + "learning_rate": 5.1644714148682e-07, + "loss": 1.8126, + "step": 12711 + }, + { + "epoch": 0.956886655751896, + "grad_norm": 4.925459384918213, + "learning_rate": 5.147009640585065e-07, + "loss": 1.4307, + "step": 12712 + }, + { + "epoch": 0.9569619300325561, + "grad_norm": 6.229773044586182, + "learning_rate": 5.129577283892295e-07, + "loss": 1.7581, + "step": 12713 + }, + { + "epoch": 0.9570372043132163, + "grad_norm": 5.008143901824951, + "learning_rate": 5.112174345826281e-07, + "loss": 1.674, + "step": 12714 + }, + { + "epoch": 0.9571124785938765, + "grad_norm": 5.549509048461914, + "learning_rate": 5.094800827421475e-07, + "loss": 1.8888, + "step": 12715 + }, + { + "epoch": 0.9571877528745366, + "grad_norm": 4.771287441253662, + "learning_rate": 5.077456729710661e-07, + "loss": 1.4084, + "step": 12716 + }, + { + "epoch": 0.9572630271551967, + "grad_norm": 4.510410785675049, + "learning_rate": 5.060142053725014e-07, + "loss": 1.6189, + "step": 12717 + }, + { + "epoch": 0.9573383014358569, + "grad_norm": 4.811689853668213, + "learning_rate": 5.042856800493768e-07, + "loss": 1.9651, + "step": 12718 + }, + { + "epoch": 0.9574135757165171, + "grad_norm": 5.329996585845947, + "learning_rate": 5.025600971044486e-07, + "loss": 1.8558, + "step": 12719 + }, + { + "epoch": 0.9574888499971772, + "grad_norm": 4.806715965270996, + "learning_rate": 5.008374566403018e-07, + "loss": 1.6839, + "step": 12720 + }, + { + "epoch": 0.9575641242778373, + "grad_norm": 4.990218639373779, + "learning_rate": 4.991177587593321e-07, + "loss": 1.6452, + "step": 12721 + }, + { + "epoch": 0.9576393985584976, + "grad_norm": 6.543436527252197, + "learning_rate": 4.974010035637855e-07, + "loss": 1.5381, + "step": 12722 + }, + { + "epoch": 0.9577146728391577, + "grad_norm": 5.1081767082214355, + "learning_rate": 4.956871911557081e-07, + "loss": 1.5925, + "step": 12723 + }, + { + "epoch": 0.9577899471198178, + "grad_norm": 5.018471717834473, + "learning_rate": 4.939763216369797e-07, + "loss": 1.7178, + "step": 12724 + }, + { + "epoch": 0.957865221400478, + "grad_norm": 4.7650556564331055, + "learning_rate": 4.922683951093133e-07, + "loss": 1.945, + "step": 12725 + }, + { + "epoch": 0.9579404956811381, + "grad_norm": 5.736057281494141, + "learning_rate": 4.905634116742386e-07, + "loss": 1.4811, + "step": 12726 + }, + { + "epoch": 0.9580157699617983, + "grad_norm": 6.507660865783691, + "learning_rate": 4.888613714331081e-07, + "loss": 1.8229, + "step": 12727 + }, + { + "epoch": 0.9580910442424585, + "grad_norm": 5.225996017456055, + "learning_rate": 4.871622744871074e-07, + "loss": 1.7795, + "step": 12728 + }, + { + "epoch": 0.9581663185231186, + "grad_norm": 5.069697856903076, + "learning_rate": 4.854661209372391e-07, + "loss": 1.8151, + "step": 12729 + }, + { + "epoch": 0.9582415928037787, + "grad_norm": 3.9640355110168457, + "learning_rate": 4.837729108843336e-07, + "loss": 1.6215, + "step": 12730 + }, + { + "epoch": 0.958316867084439, + "grad_norm": 4.68833589553833, + "learning_rate": 4.820826444290605e-07, + "loss": 1.7128, + "step": 12731 + }, + { + "epoch": 0.9583921413650991, + "grad_norm": 5.0856475830078125, + "learning_rate": 4.803953216718837e-07, + "loss": 1.8493, + "step": 12732 + }, + { + "epoch": 0.9584674156457592, + "grad_norm": 4.266898155212402, + "learning_rate": 4.787109427131175e-07, + "loss": 1.9668, + "step": 12733 + }, + { + "epoch": 0.9585426899264194, + "grad_norm": 4.884166240692139, + "learning_rate": 4.770295076528985e-07, + "loss": 1.7062, + "step": 12734 + }, + { + "epoch": 0.9586179642070796, + "grad_norm": 5.035543918609619, + "learning_rate": 4.7535101659117453e-07, + "loss": 1.7581, + "step": 12735 + }, + { + "epoch": 0.9586932384877397, + "grad_norm": 5.708413124084473, + "learning_rate": 4.736754696277268e-07, + "loss": 1.8908, + "step": 12736 + }, + { + "epoch": 0.9587685127683998, + "grad_norm": 4.90730094909668, + "learning_rate": 4.720028668621701e-07, + "loss": 1.4379, + "step": 12737 + }, + { + "epoch": 0.95884378704906, + "grad_norm": 5.6391682624816895, + "learning_rate": 4.7033320839393045e-07, + "loss": 1.8864, + "step": 12738 + }, + { + "epoch": 0.9589190613297202, + "grad_norm": 5.905911445617676, + "learning_rate": 4.6866649432226185e-07, + "loss": 1.8899, + "step": 12739 + }, + { + "epoch": 0.9589943356103803, + "grad_norm": 4.411929607391357, + "learning_rate": 4.670027247462516e-07, + "loss": 1.608, + "step": 12740 + }, + { + "epoch": 0.9590696098910405, + "grad_norm": 6.870064735412598, + "learning_rate": 4.65341899764804e-07, + "loss": 2.0341, + "step": 12741 + }, + { + "epoch": 0.9591448841717006, + "grad_norm": 7.483013153076172, + "learning_rate": 4.6368401947665117e-07, + "loss": 1.4863, + "step": 12742 + }, + { + "epoch": 0.9592201584523607, + "grad_norm": 4.806339263916016, + "learning_rate": 4.6202908398034203e-07, + "loss": 1.5378, + "step": 12743 + }, + { + "epoch": 0.959295432733021, + "grad_norm": 6.128211975097656, + "learning_rate": 4.603770933742702e-07, + "loss": 1.6042, + "step": 12744 + }, + { + "epoch": 0.9593707070136811, + "grad_norm": 5.991851806640625, + "learning_rate": 4.5872804775662916e-07, + "loss": 1.3748, + "step": 12745 + }, + { + "epoch": 0.9594459812943412, + "grad_norm": 5.846717357635498, + "learning_rate": 4.5708194722546284e-07, + "loss": 1.7164, + "step": 12746 + }, + { + "epoch": 0.9595212555750015, + "grad_norm": 5.387139797210693, + "learning_rate": 4.5543879187861517e-07, + "loss": 1.796, + "step": 12747 + }, + { + "epoch": 0.9595965298556616, + "grad_norm": 4.864641189575195, + "learning_rate": 4.5379858181377466e-07, + "loss": 2.1802, + "step": 12748 + }, + { + "epoch": 0.9596718041363217, + "grad_norm": 4.4707865715026855, + "learning_rate": 4.521613171284411e-07, + "loss": 1.3434, + "step": 12749 + }, + { + "epoch": 0.9597470784169819, + "grad_norm": 4.4707865715026855, + "learning_rate": 4.521613171284411e-07, + "loss": 2.0174, + "step": 12750 + }, + { + "epoch": 0.959822352697642, + "grad_norm": 5.0891828536987305, + "learning_rate": 4.505269979199478e-07, + "loss": 1.7849, + "step": 12751 + }, + { + "epoch": 0.9598976269783022, + "grad_norm": 6.626373767852783, + "learning_rate": 4.4889562428545027e-07, + "loss": 1.5173, + "step": 12752 + }, + { + "epoch": 0.9599729012589624, + "grad_norm": 5.3148674964904785, + "learning_rate": 4.472671963219377e-07, + "loss": 2.04, + "step": 12753 + }, + { + "epoch": 0.9600481755396225, + "grad_norm": 4.4283061027526855, + "learning_rate": 4.4564171412619926e-07, + "loss": 1.7598, + "step": 12754 + }, + { + "epoch": 0.9601234498202826, + "grad_norm": 5.9036173820495605, + "learning_rate": 4.4401917779487436e-07, + "loss": 1.5959, + "step": 12755 + }, + { + "epoch": 0.9601987241009428, + "grad_norm": 5.72764253616333, + "learning_rate": 4.423995874244247e-07, + "loss": 1.4162, + "step": 12756 + }, + { + "epoch": 0.960273998381603, + "grad_norm": 5.874645233154297, + "learning_rate": 4.407829431111177e-07, + "loss": 1.8502, + "step": 12757 + }, + { + "epoch": 0.9603492726622631, + "grad_norm": 6.777151107788086, + "learning_rate": 4.3916924495106537e-07, + "loss": 1.8303, + "step": 12758 + }, + { + "epoch": 0.9604245469429232, + "grad_norm": 5.144296169281006, + "learning_rate": 4.37558493040191e-07, + "loss": 1.5879, + "step": 12759 + }, + { + "epoch": 0.9604998212235835, + "grad_norm": 4.937539100646973, + "learning_rate": 4.359506874742569e-07, + "loss": 1.5883, + "step": 12760 + }, + { + "epoch": 0.9605750955042436, + "grad_norm": 5.9273362159729, + "learning_rate": 4.343458283488366e-07, + "loss": 1.653, + "step": 12761 + }, + { + "epoch": 0.9606503697849037, + "grad_norm": 5.750225067138672, + "learning_rate": 4.3274391575933715e-07, + "loss": 1.6408, + "step": 12762 + }, + { + "epoch": 0.9607256440655639, + "grad_norm": 5.339879989624023, + "learning_rate": 4.3114494980098785e-07, + "loss": 1.5289, + "step": 12763 + }, + { + "epoch": 0.960800918346224, + "grad_norm": 6.042171001434326, + "learning_rate": 4.2954893056884606e-07, + "loss": 1.4628, + "step": 12764 + }, + { + "epoch": 0.9608761926268842, + "grad_norm": 4.973211765289307, + "learning_rate": 4.279558581577858e-07, + "loss": 1.8237, + "step": 12765 + }, + { + "epoch": 0.9609514669075444, + "grad_norm": 4.382627964019775, + "learning_rate": 4.2636573266251476e-07, + "loss": 1.5081, + "step": 12766 + }, + { + "epoch": 0.9610267411882045, + "grad_norm": 5.47891092300415, + "learning_rate": 4.2477855417755173e-07, + "loss": 1.7905, + "step": 12767 + }, + { + "epoch": 0.9611020154688646, + "grad_norm": 5.99039888381958, + "learning_rate": 4.231943227972601e-07, + "loss": 1.1773, + "step": 12768 + }, + { + "epoch": 0.9611772897495249, + "grad_norm": 5.750499725341797, + "learning_rate": 4.2161303861580903e-07, + "loss": 1.6254, + "step": 12769 + }, + { + "epoch": 0.961252564030185, + "grad_norm": 4.780459403991699, + "learning_rate": 4.2003470172721213e-07, + "loss": 2.1193, + "step": 12770 + }, + { + "epoch": 0.9613278383108451, + "grad_norm": 5.220534801483154, + "learning_rate": 4.184593122252889e-07, + "loss": 1.4776, + "step": 12771 + }, + { + "epoch": 0.9614031125915053, + "grad_norm": 5.915187835693359, + "learning_rate": 4.168868702036921e-07, + "loss": 2.002, + "step": 12772 + }, + { + "epoch": 0.9614783868721655, + "grad_norm": 8.071564674377441, + "learning_rate": 4.1531737575590813e-07, + "loss": 1.4587, + "step": 12773 + }, + { + "epoch": 0.9615536611528256, + "grad_norm": 4.235130786895752, + "learning_rate": 4.1375082897522344e-07, + "loss": 1.3438, + "step": 12774 + }, + { + "epoch": 0.9616289354334857, + "grad_norm": 4.875904083251953, + "learning_rate": 4.1218722995477463e-07, + "loss": 1.7017, + "step": 12775 + }, + { + "epoch": 0.9617042097141459, + "grad_norm": 4.517753601074219, + "learning_rate": 4.106265787875152e-07, + "loss": 1.7427, + "step": 12776 + }, + { + "epoch": 0.9617794839948061, + "grad_norm": 5.332659721374512, + "learning_rate": 4.090688755662153e-07, + "loss": 1.7206, + "step": 12777 + }, + { + "epoch": 0.9618547582754662, + "grad_norm": 4.5537214279174805, + "learning_rate": 4.075141203834787e-07, + "loss": 1.6129, + "step": 12778 + }, + { + "epoch": 0.9619300325561264, + "grad_norm": 6.118309497833252, + "learning_rate": 4.059623133317314e-07, + "loss": 1.7482, + "step": 12779 + }, + { + "epoch": 0.9620053068367865, + "grad_norm": 5.080042839050293, + "learning_rate": 4.0441345450322745e-07, + "loss": 1.6767, + "step": 12780 + }, + { + "epoch": 0.9620805811174467, + "grad_norm": 6.272345542907715, + "learning_rate": 4.02867543990032e-07, + "loss": 1.7058, + "step": 12781 + }, + { + "epoch": 0.9621558553981069, + "grad_norm": 5.104844093322754, + "learning_rate": 4.0132458188405495e-07, + "loss": 1.5619, + "step": 12782 + }, + { + "epoch": 0.962231129678767, + "grad_norm": 5.087047100067139, + "learning_rate": 3.9978456827701185e-07, + "loss": 1.4748, + "step": 12783 + }, + { + "epoch": 0.9623064039594271, + "grad_norm": 5.018366813659668, + "learning_rate": 3.982475032604627e-07, + "loss": 1.6243, + "step": 12784 + }, + { + "epoch": 0.9623816782400874, + "grad_norm": 5.7456159591674805, + "learning_rate": 3.967133869257733e-07, + "loss": 1.6521, + "step": 12785 + }, + { + "epoch": 0.9624569525207475, + "grad_norm": 7.355575084686279, + "learning_rate": 3.9518221936414304e-07, + "loss": 1.6972, + "step": 12786 + }, + { + "epoch": 0.9625322268014076, + "grad_norm": 4.517178535461426, + "learning_rate": 3.93654000666599e-07, + "loss": 1.5206, + "step": 12787 + }, + { + "epoch": 0.9626075010820678, + "grad_norm": 5.357083320617676, + "learning_rate": 3.921287309239907e-07, + "loss": 1.9129, + "step": 12788 + }, + { + "epoch": 0.962682775362728, + "grad_norm": 5.312506675720215, + "learning_rate": 3.9060641022698464e-07, + "loss": 1.9362, + "step": 12789 + }, + { + "epoch": 0.9627580496433881, + "grad_norm": 6.706018447875977, + "learning_rate": 3.8908703866608606e-07, + "loss": 1.8119, + "step": 12790 + }, + { + "epoch": 0.9628333239240483, + "grad_norm": 7.562875747680664, + "learning_rate": 3.8757061633161174e-07, + "loss": 1.6422, + "step": 12791 + }, + { + "epoch": 0.9629085982047084, + "grad_norm": 5.181768894195557, + "learning_rate": 3.860571433137117e-07, + "loss": 1.9383, + "step": 12792 + }, + { + "epoch": 0.9629838724853685, + "grad_norm": 4.47805643081665, + "learning_rate": 3.8454661970235284e-07, + "loss": 1.6609, + "step": 12793 + }, + { + "epoch": 0.9630591467660287, + "grad_norm": 4.423263072967529, + "learning_rate": 3.8303904558734117e-07, + "loss": 1.8387, + "step": 12794 + }, + { + "epoch": 0.9631344210466889, + "grad_norm": 4.938326835632324, + "learning_rate": 3.8153442105828275e-07, + "loss": 1.754, + "step": 12795 + }, + { + "epoch": 0.963209695327349, + "grad_norm": 4.799740314483643, + "learning_rate": 3.800327462046338e-07, + "loss": 1.6828, + "step": 12796 + }, + { + "epoch": 0.9632849696080091, + "grad_norm": 4.493744850158691, + "learning_rate": 3.7853402111566184e-07, + "loss": 1.3974, + "step": 12797 + }, + { + "epoch": 0.9633602438886694, + "grad_norm": 4.173511028289795, + "learning_rate": 3.7703824588046775e-07, + "loss": 2.1499, + "step": 12798 + }, + { + "epoch": 0.9634355181693295, + "grad_norm": 5.556063652038574, + "learning_rate": 3.7554542058796385e-07, + "loss": 2.0209, + "step": 12799 + }, + { + "epoch": 0.9635107924499896, + "grad_norm": 4.924504280090332, + "learning_rate": 3.740555453268957e-07, + "loss": 1.8456, + "step": 12800 + }, + { + "epoch": 0.9635860667306498, + "grad_norm": 7.22938346862793, + "learning_rate": 3.725686201858314e-07, + "loss": 1.8772, + "step": 12801 + }, + { + "epoch": 0.96366134101131, + "grad_norm": 6.452375411987305, + "learning_rate": 3.710846452531669e-07, + "loss": 1.5424, + "step": 12802 + }, + { + "epoch": 0.9637366152919701, + "grad_norm": 4.909130096435547, + "learning_rate": 3.6960362061711494e-07, + "loss": 2.0875, + "step": 12803 + }, + { + "epoch": 0.9638118895726303, + "grad_norm": 5.063354015350342, + "learning_rate": 3.681255463657274e-07, + "loss": 1.4524, + "step": 12804 + }, + { + "epoch": 0.9638871638532904, + "grad_norm": 6.484045028686523, + "learning_rate": 3.6665042258686166e-07, + "loss": 1.5862, + "step": 12805 + }, + { + "epoch": 0.9639624381339505, + "grad_norm": 4.589997291564941, + "learning_rate": 3.651782493682199e-07, + "loss": 1.364, + "step": 12806 + }, + { + "epoch": 0.9640377124146108, + "grad_norm": 4.875156879425049, + "learning_rate": 3.637090267973042e-07, + "loss": 1.6493, + "step": 12807 + }, + { + "epoch": 0.9641129866952709, + "grad_norm": 4.830776691436768, + "learning_rate": 3.6224275496147263e-07, + "loss": 1.6902, + "step": 12808 + }, + { + "epoch": 0.964188260975931, + "grad_norm": 4.292070388793945, + "learning_rate": 3.607794339478776e-07, + "loss": 1.5618, + "step": 12809 + }, + { + "epoch": 0.9642635352565913, + "grad_norm": 4.975305080413818, + "learning_rate": 3.5931906384351623e-07, + "loss": 1.3542, + "step": 12810 + }, + { + "epoch": 0.9643388095372514, + "grad_norm": 5.7770280838012695, + "learning_rate": 3.5786164473520235e-07, + "loss": 1.7314, + "step": 12811 + }, + { + "epoch": 0.9644140838179115, + "grad_norm": 5.053014755249023, + "learning_rate": 3.5640717670957223e-07, + "loss": 1.4206, + "step": 12812 + }, + { + "epoch": 0.9644893580985717, + "grad_norm": 5.223371505737305, + "learning_rate": 3.5495565985309566e-07, + "loss": 1.6191, + "step": 12813 + }, + { + "epoch": 0.9645646323792318, + "grad_norm": 5.122992515563965, + "learning_rate": 3.535070942520591e-07, + "loss": 1.8021, + "step": 12814 + }, + { + "epoch": 0.964639906659892, + "grad_norm": 5.406673908233643, + "learning_rate": 3.5206147999257143e-07, + "loss": 1.8231, + "step": 12815 + }, + { + "epoch": 0.9647151809405521, + "grad_norm": 6.0550079345703125, + "learning_rate": 3.5061881716056954e-07, + "loss": 1.9092, + "step": 12816 + }, + { + "epoch": 0.9647904552212123, + "grad_norm": 5.839788913726807, + "learning_rate": 3.491791058418181e-07, + "loss": 1.9117, + "step": 12817 + }, + { + "epoch": 0.9648657295018724, + "grad_norm": 7.533646583557129, + "learning_rate": 3.477423461219098e-07, + "loss": 1.5877, + "step": 12818 + }, + { + "epoch": 0.9649410037825326, + "grad_norm": 7.256137371063232, + "learning_rate": 3.463085380862485e-07, + "loss": 1.6428, + "step": 12819 + }, + { + "epoch": 0.9650162780631928, + "grad_norm": 5.78106164932251, + "learning_rate": 3.4487768182007163e-07, + "loss": 1.7498, + "step": 12820 + }, + { + "epoch": 0.9650915523438529, + "grad_norm": 4.756579399108887, + "learning_rate": 3.4344977740843887e-07, + "loss": 1.854, + "step": 12821 + }, + { + "epoch": 0.965166826624513, + "grad_norm": 5.204861164093018, + "learning_rate": 3.4202482493623787e-07, + "loss": 1.7757, + "step": 12822 + }, + { + "epoch": 0.9652421009051733, + "grad_norm": 5.986173152923584, + "learning_rate": 3.406028244881787e-07, + "loss": 2.0117, + "step": 12823 + }, + { + "epoch": 0.9653173751858334, + "grad_norm": 5.37211275100708, + "learning_rate": 3.391837761487937e-07, + "loss": 2.113, + "step": 12824 + }, + { + "epoch": 0.9653926494664935, + "grad_norm": 5.7370805740356445, + "learning_rate": 3.377676800024321e-07, + "loss": 1.7308, + "step": 12825 + }, + { + "epoch": 0.9654679237471537, + "grad_norm": 4.459313869476318, + "learning_rate": 3.3635453613329314e-07, + "loss": 1.2555, + "step": 12826 + }, + { + "epoch": 0.9655431980278139, + "grad_norm": 4.01496696472168, + "learning_rate": 3.349443446253708e-07, + "loss": 1.9142, + "step": 12827 + }, + { + "epoch": 0.965618472308474, + "grad_norm": 4.9404296875, + "learning_rate": 3.3353710556250917e-07, + "loss": 1.5522, + "step": 12828 + }, + { + "epoch": 0.9656937465891342, + "grad_norm": 5.495742321014404, + "learning_rate": 3.3213281902835237e-07, + "loss": 2.0353, + "step": 12829 + }, + { + "epoch": 0.9657690208697943, + "grad_norm": 5.374664783477783, + "learning_rate": 3.307314851063836e-07, + "loss": 1.7758, + "step": 12830 + }, + { + "epoch": 0.9658442951504544, + "grad_norm": 5.570564270019531, + "learning_rate": 3.293331038799141e-07, + "loss": 1.6268, + "step": 12831 + }, + { + "epoch": 0.9659195694311147, + "grad_norm": 4.828046798706055, + "learning_rate": 3.2793767543207176e-07, + "loss": 1.8683, + "step": 12832 + }, + { + "epoch": 0.9659948437117748, + "grad_norm": 5.624510288238525, + "learning_rate": 3.265451998458069e-07, + "loss": 1.6801, + "step": 12833 + }, + { + "epoch": 0.9660701179924349, + "grad_norm": 5.807058334350586, + "learning_rate": 3.251556772039033e-07, + "loss": 1.6605, + "step": 12834 + }, + { + "epoch": 0.966145392273095, + "grad_norm": 6.272126197814941, + "learning_rate": 3.237691075889615e-07, + "loss": 1.6154, + "step": 12835 + }, + { + "epoch": 0.9662206665537553, + "grad_norm": 4.713158130645752, + "learning_rate": 3.2238549108340455e-07, + "loss": 1.6714, + "step": 12836 + }, + { + "epoch": 0.9662959408344154, + "grad_norm": 4.318984031677246, + "learning_rate": 3.2100482776949435e-07, + "loss": 1.7028, + "step": 12837 + }, + { + "epoch": 0.9663712151150755, + "grad_norm": 4.251882553100586, + "learning_rate": 3.1962711772929863e-07, + "loss": 1.7487, + "step": 12838 + }, + { + "epoch": 0.9664464893957357, + "grad_norm": 5.758731365203857, + "learning_rate": 3.182523610447241e-07, + "loss": 1.6493, + "step": 12839 + }, + { + "epoch": 0.9665217636763959, + "grad_norm": 5.774416446685791, + "learning_rate": 3.1688055779749425e-07, + "loss": 1.6216, + "step": 12840 + }, + { + "epoch": 0.966597037957056, + "grad_norm": 5.681101322174072, + "learning_rate": 3.15511708069155e-07, + "loss": 1.9636, + "step": 12841 + }, + { + "epoch": 0.9666723122377162, + "grad_norm": 5.927643775939941, + "learning_rate": 3.141458119410856e-07, + "loss": 1.6767, + "step": 12842 + }, + { + "epoch": 0.9667475865183763, + "grad_norm": 6.631484508514404, + "learning_rate": 3.127828694944879e-07, + "loss": 1.7838, + "step": 12843 + }, + { + "epoch": 0.9668228607990365, + "grad_norm": 6.1152215003967285, + "learning_rate": 3.114228808103747e-07, + "loss": 1.8677, + "step": 12844 + }, + { + "epoch": 0.9668981350796967, + "grad_norm": 5.246252059936523, + "learning_rate": 3.100658459695982e-07, + "loss": 1.8115, + "step": 12845 + }, + { + "epoch": 0.9669734093603568, + "grad_norm": 6.18123722076416, + "learning_rate": 3.0871176505283263e-07, + "loss": 1.8144, + "step": 12846 + }, + { + "epoch": 0.9670486836410169, + "grad_norm": 4.367028713226318, + "learning_rate": 3.073606381405747e-07, + "loss": 1.612, + "step": 12847 + }, + { + "epoch": 0.9671239579216772, + "grad_norm": 5.881445407867432, + "learning_rate": 3.06012465313138e-07, + "loss": 2.0238, + "step": 12848 + }, + { + "epoch": 0.9671992322023373, + "grad_norm": 4.773110866546631, + "learning_rate": 3.0466724665067504e-07, + "loss": 1.5765, + "step": 12849 + }, + { + "epoch": 0.9672745064829974, + "grad_norm": 4.50179386138916, + "learning_rate": 3.0332498223314964e-07, + "loss": 1.7561, + "step": 12850 + }, + { + "epoch": 0.9673497807636576, + "grad_norm": 6.259771347045898, + "learning_rate": 3.019856721403591e-07, + "loss": 2.3133, + "step": 12851 + }, + { + "epoch": 0.9674250550443177, + "grad_norm": 5.690469741821289, + "learning_rate": 3.006493164519231e-07, + "loss": 1.6591, + "step": 12852 + }, + { + "epoch": 0.9675003293249779, + "grad_norm": 5.143002986907959, + "learning_rate": 2.99315915247278e-07, + "loss": 1.6979, + "step": 12853 + }, + { + "epoch": 0.967575603605638, + "grad_norm": 5.152135372161865, + "learning_rate": 2.979854686056993e-07, + "loss": 1.5917, + "step": 12854 + }, + { + "epoch": 0.9676508778862982, + "grad_norm": 4.120881080627441, + "learning_rate": 2.9665797660626825e-07, + "loss": 1.6993, + "step": 12855 + }, + { + "epoch": 0.9677261521669583, + "grad_norm": 6.325472354888916, + "learning_rate": 2.9533343932791056e-07, + "loss": 1.4777, + "step": 12856 + }, + { + "epoch": 0.9678014264476185, + "grad_norm": 6.165053367614746, + "learning_rate": 2.940118568493577e-07, + "loss": 1.5892, + "step": 12857 + }, + { + "epoch": 0.9678767007282787, + "grad_norm": 5.428379535675049, + "learning_rate": 2.9269322924918573e-07, + "loss": 1.7299, + "step": 12858 + }, + { + "epoch": 0.9679519750089388, + "grad_norm": 6.150330066680908, + "learning_rate": 2.913775566057653e-07, + "loss": 1.8178, + "step": 12859 + }, + { + "epoch": 0.9680272492895989, + "grad_norm": 5.36868143081665, + "learning_rate": 2.900648389973226e-07, + "loss": 1.7349, + "step": 12860 + }, + { + "epoch": 0.9681025235702592, + "grad_norm": 5.713620662689209, + "learning_rate": 2.887550765018898e-07, + "loss": 1.3487, + "step": 12861 + }, + { + "epoch": 0.9681777978509193, + "grad_norm": 5.377217769622803, + "learning_rate": 2.874482691973324e-07, + "loss": 1.6173, + "step": 12862 + }, + { + "epoch": 0.9682530721315794, + "grad_norm": 5.350352764129639, + "learning_rate": 2.8614441716133254e-07, + "loss": 1.6766, + "step": 12863 + }, + { + "epoch": 0.9683283464122396, + "grad_norm": 4.306874752044678, + "learning_rate": 2.8484352047140063e-07, + "loss": 1.7165, + "step": 12864 + }, + { + "epoch": 0.9684036206928998, + "grad_norm": 4.555135726928711, + "learning_rate": 2.835455792048747e-07, + "loss": 1.8007, + "step": 12865 + }, + { + "epoch": 0.9684788949735599, + "grad_norm": 5.005427837371826, + "learning_rate": 2.822505934389097e-07, + "loss": 1.4207, + "step": 12866 + }, + { + "epoch": 0.9685541692542201, + "grad_norm": 7.8824462890625, + "learning_rate": 2.809585632504941e-07, + "loss": 1.642, + "step": 12867 + }, + { + "epoch": 0.9686294435348802, + "grad_norm": 3.909339189529419, + "learning_rate": 2.796694887164275e-07, + "loss": 1.7001, + "step": 12868 + }, + { + "epoch": 0.9687047178155404, + "grad_norm": 5.610635280609131, + "learning_rate": 2.783833699133542e-07, + "loss": 1.7563, + "step": 12869 + }, + { + "epoch": 0.9687799920962006, + "grad_norm": 4.308082103729248, + "learning_rate": 2.771002069177131e-07, + "loss": 1.3855, + "step": 12870 + }, + { + "epoch": 0.9688552663768607, + "grad_norm": 5.778499603271484, + "learning_rate": 2.7581999980579863e-07, + "loss": 2.1166, + "step": 12871 + }, + { + "epoch": 0.9689305406575208, + "grad_norm": 4.197400093078613, + "learning_rate": 2.745427486537111e-07, + "loss": 1.5451, + "step": 12872 + }, + { + "epoch": 0.9690058149381809, + "grad_norm": 4.780872821807861, + "learning_rate": 2.732684535373786e-07, + "loss": 2.052, + "step": 12873 + }, + { + "epoch": 0.9690810892188412, + "grad_norm": 5.577661037445068, + "learning_rate": 2.719971145325517e-07, + "loss": 1.7506, + "step": 12874 + }, + { + "epoch": 0.9691563634995013, + "grad_norm": 6.526430130004883, + "learning_rate": 2.707287317148088e-07, + "loss": 1.5767, + "step": 12875 + }, + { + "epoch": 0.9692316377801614, + "grad_norm": 4.576663970947266, + "learning_rate": 2.694633051595563e-07, + "loss": 1.7079, + "step": 12876 + }, + { + "epoch": 0.9693069120608216, + "grad_norm": 4.87368106842041, + "learning_rate": 2.682008349420173e-07, + "loss": 1.6087, + "step": 12877 + }, + { + "epoch": 0.9693821863414818, + "grad_norm": 7.903324127197266, + "learning_rate": 2.669413211372429e-07, + "loss": 1.8065, + "step": 12878 + }, + { + "epoch": 0.9694574606221419, + "grad_norm": 5.2060136795043945, + "learning_rate": 2.6568476382010656e-07, + "loss": 1.6779, + "step": 12879 + }, + { + "epoch": 0.9695327349028021, + "grad_norm": 5.165953159332275, + "learning_rate": 2.6443116306530955e-07, + "loss": 1.7052, + "step": 12880 + }, + { + "epoch": 0.9696080091834622, + "grad_norm": 5.682737350463867, + "learning_rate": 2.6318051894737017e-07, + "loss": 1.5715, + "step": 12881 + }, + { + "epoch": 0.9696832834641224, + "grad_norm": 4.7289934158325195, + "learning_rate": 2.6193283154063997e-07, + "loss": 1.7209, + "step": 12882 + }, + { + "epoch": 0.9697585577447826, + "grad_norm": 5.737270832061768, + "learning_rate": 2.6068810091928187e-07, + "loss": 1.5889, + "step": 12883 + }, + { + "epoch": 0.9698338320254427, + "grad_norm": 4.942068576812744, + "learning_rate": 2.594463271573033e-07, + "loss": 1.4881, + "step": 12884 + }, + { + "epoch": 0.9699091063061028, + "grad_norm": 4.215206146240234, + "learning_rate": 2.5820751032851755e-07, + "loss": 1.3702, + "step": 12885 + }, + { + "epoch": 0.9699843805867631, + "grad_norm": 4.6445136070251465, + "learning_rate": 2.5697165050657113e-07, + "loss": 1.7753, + "step": 12886 + }, + { + "epoch": 0.9700596548674232, + "grad_norm": 5.8361496925354, + "learning_rate": 2.557387477649331e-07, + "loss": 1.4644, + "step": 12887 + }, + { + "epoch": 0.9701349291480833, + "grad_norm": 5.14605712890625, + "learning_rate": 2.5450880217688935e-07, + "loss": 1.796, + "step": 12888 + }, + { + "epoch": 0.9702102034287435, + "grad_norm": 5.978387832641602, + "learning_rate": 2.532818138155646e-07, + "loss": 1.7977, + "step": 12889 + }, + { + "epoch": 0.9702854777094037, + "grad_norm": 5.54160737991333, + "learning_rate": 2.5205778275390056e-07, + "loss": 1.8962, + "step": 12890 + }, + { + "epoch": 0.9703607519900638, + "grad_norm": 6.199498653411865, + "learning_rate": 2.508367090646502e-07, + "loss": 1.2902, + "step": 12891 + }, + { + "epoch": 0.970436026270724, + "grad_norm": 4.247002601623535, + "learning_rate": 2.4961859282041643e-07, + "loss": 1.6683, + "step": 12892 + }, + { + "epoch": 0.9705113005513841, + "grad_norm": 4.69089412689209, + "learning_rate": 2.484034340936081e-07, + "loss": 1.6091, + "step": 12893 + }, + { + "epoch": 0.9705865748320442, + "grad_norm": 4.586596488952637, + "learning_rate": 2.471912329564618e-07, + "loss": 1.9038, + "step": 12894 + }, + { + "epoch": 0.9706618491127044, + "grad_norm": 6.109058380126953, + "learning_rate": 2.459819894810367e-07, + "loss": 1.9022, + "step": 12895 + }, + { + "epoch": 0.9707371233933646, + "grad_norm": 5.328130722045898, + "learning_rate": 2.447757037392251e-07, + "loss": 1.6564, + "step": 12896 + }, + { + "epoch": 0.9708123976740247, + "grad_norm": 6.030274391174316, + "learning_rate": 2.4357237580273084e-07, + "loss": 1.7612, + "step": 12897 + }, + { + "epoch": 0.9708876719546848, + "grad_norm": 4.516773700714111, + "learning_rate": 2.423720057430967e-07, + "loss": 1.719, + "step": 12898 + }, + { + "epoch": 0.9709629462353451, + "grad_norm": 5.488609313964844, + "learning_rate": 2.411745936316712e-07, + "loss": 1.8407, + "step": 12899 + }, + { + "epoch": 0.9710382205160052, + "grad_norm": 4.606796741485596, + "learning_rate": 2.399801395396417e-07, + "loss": 1.677, + "step": 12900 + }, + { + "epoch": 0.9711134947966653, + "grad_norm": 5.437780380249023, + "learning_rate": 2.387886435380182e-07, + "loss": 1.7001, + "step": 12901 + }, + { + "epoch": 0.9711887690773255, + "grad_norm": 4.013258457183838, + "learning_rate": 2.3760010569762737e-07, + "loss": 1.4984, + "step": 12902 + }, + { + "epoch": 0.9712640433579857, + "grad_norm": 5.091400146484375, + "learning_rate": 2.364145260891293e-07, + "loss": 1.4741, + "step": 12903 + }, + { + "epoch": 0.9713393176386458, + "grad_norm": 4.874166965484619, + "learning_rate": 2.352319047829954e-07, + "loss": 1.8073, + "step": 12904 + }, + { + "epoch": 0.971414591919306, + "grad_norm": 5.469064712524414, + "learning_rate": 2.3405224184954166e-07, + "loss": 1.8371, + "step": 12905 + }, + { + "epoch": 0.9714898661999661, + "grad_norm": 4.165596008300781, + "learning_rate": 2.328755373588787e-07, + "loss": 1.5299, + "step": 12906 + }, + { + "epoch": 0.9715651404806263, + "grad_norm": 4.864057540893555, + "learning_rate": 2.317017913809727e-07, + "loss": 1.7596, + "step": 12907 + }, + { + "epoch": 0.9716404147612865, + "grad_norm": 5.076822757720947, + "learning_rate": 2.3053100398559015e-07, + "loss": 1.2872, + "step": 12908 + }, + { + "epoch": 0.9717156890419466, + "grad_norm": 5.097784996032715, + "learning_rate": 2.293631752423364e-07, + "loss": 1.7277, + "step": 12909 + }, + { + "epoch": 0.9717909633226067, + "grad_norm": 5.238947868347168, + "learning_rate": 2.281983052206338e-07, + "loss": 1.7465, + "step": 12910 + }, + { + "epoch": 0.971866237603267, + "grad_norm": 6.354681491851807, + "learning_rate": 2.2703639398973242e-07, + "loss": 1.9878, + "step": 12911 + }, + { + "epoch": 0.9719415118839271, + "grad_norm": 5.712283611297607, + "learning_rate": 2.2587744161869927e-07, + "loss": 1.8883, + "step": 12912 + }, + { + "epoch": 0.9720167861645872, + "grad_norm": 5.845961570739746, + "learning_rate": 2.2472144817643482e-07, + "loss": 1.8732, + "step": 12913 + }, + { + "epoch": 0.9720920604452473, + "grad_norm": 7.510724067687988, + "learning_rate": 2.2356841373166182e-07, + "loss": 1.6988, + "step": 12914 + }, + { + "epoch": 0.9721673347259076, + "grad_norm": 4.078990459442139, + "learning_rate": 2.224183383529199e-07, + "loss": 1.3829, + "step": 12915 + }, + { + "epoch": 0.9722426090065677, + "grad_norm": 7.452385425567627, + "learning_rate": 2.2127122210857663e-07, + "loss": 1.651, + "step": 12916 + }, + { + "epoch": 0.9723178832872278, + "grad_norm": 4.549737930297852, + "learning_rate": 2.20127065066833e-07, + "loss": 1.8032, + "step": 12917 + }, + { + "epoch": 0.972393157567888, + "grad_norm": 5.105953216552734, + "learning_rate": 2.1898586729570126e-07, + "loss": 1.6681, + "step": 12918 + }, + { + "epoch": 0.9724684318485481, + "grad_norm": 5.604067325592041, + "learning_rate": 2.1784762886302157e-07, + "loss": 1.5157, + "step": 12919 + }, + { + "epoch": 0.9725437061292083, + "grad_norm": 4.463692665100098, + "learning_rate": 2.167123498364565e-07, + "loss": 1.5276, + "step": 12920 + }, + { + "epoch": 0.9726189804098685, + "grad_norm": 6.033914566040039, + "learning_rate": 2.1558003028349648e-07, + "loss": 1.9292, + "step": 12921 + }, + { + "epoch": 0.9726942546905286, + "grad_norm": 5.758117198944092, + "learning_rate": 2.1445067027145994e-07, + "loss": 1.4229, + "step": 12922 + }, + { + "epoch": 0.9727695289711887, + "grad_norm": 4.967009544372559, + "learning_rate": 2.1332426986747644e-07, + "loss": 1.6084, + "step": 12923 + }, + { + "epoch": 0.972844803251849, + "grad_norm": 7.284834384918213, + "learning_rate": 2.1220082913852024e-07, + "loss": 1.5554, + "step": 12924 + }, + { + "epoch": 0.9729200775325091, + "grad_norm": 5.290718078613281, + "learning_rate": 2.1108034815136013e-07, + "loss": 1.7649, + "step": 12925 + }, + { + "epoch": 0.9729953518131692, + "grad_norm": 7.140928745269775, + "learning_rate": 2.0996282697261505e-07, + "loss": 1.8844, + "step": 12926 + }, + { + "epoch": 0.9730706260938294, + "grad_norm": 5.510298728942871, + "learning_rate": 2.0884826566871517e-07, + "loss": 1.7051, + "step": 12927 + }, + { + "epoch": 0.9731459003744896, + "grad_norm": 5.675850868225098, + "learning_rate": 2.0773666430591866e-07, + "loss": 1.5329, + "step": 12928 + }, + { + "epoch": 0.9732211746551497, + "grad_norm": 5.625651836395264, + "learning_rate": 2.0662802295030592e-07, + "loss": 1.7409, + "step": 12929 + }, + { + "epoch": 0.9732964489358099, + "grad_norm": 4.719574928283691, + "learning_rate": 2.055223416677854e-07, + "loss": 1.8129, + "step": 12930 + }, + { + "epoch": 0.97337172321647, + "grad_norm": 4.777347087860107, + "learning_rate": 2.0441962052408782e-07, + "loss": 1.8035, + "step": 12931 + }, + { + "epoch": 0.9734469974971302, + "grad_norm": 4.947113513946533, + "learning_rate": 2.0331985958476075e-07, + "loss": 2.1781, + "step": 12932 + }, + { + "epoch": 0.9735222717777903, + "grad_norm": 6.180505275726318, + "learning_rate": 2.0222305891518521e-07, + "loss": 1.6594, + "step": 12933 + }, + { + "epoch": 0.9735975460584505, + "grad_norm": 5.926185607910156, + "learning_rate": 2.0112921858056466e-07, + "loss": 1.9934, + "step": 12934 + }, + { + "epoch": 0.9736728203391106, + "grad_norm": 5.52576208114624, + "learning_rate": 2.0003833864592482e-07, + "loss": 1.9768, + "step": 12935 + }, + { + "epoch": 0.9737480946197707, + "grad_norm": 7.180680274963379, + "learning_rate": 1.989504191761138e-07, + "loss": 1.8272, + "step": 12936 + }, + { + "epoch": 0.973823368900431, + "grad_norm": 6.3857808113098145, + "learning_rate": 1.978654602358021e-07, + "loss": 1.6979, + "step": 12937 + }, + { + "epoch": 0.9738986431810911, + "grad_norm": 4.431358337402344, + "learning_rate": 1.967834618894937e-07, + "loss": 1.8563, + "step": 12938 + }, + { + "epoch": 0.9739739174617512, + "grad_norm": 4.656952381134033, + "learning_rate": 1.957044242015038e-07, + "loss": 1.9013, + "step": 12939 + }, + { + "epoch": 0.9740491917424114, + "grad_norm": 7.515347003936768, + "learning_rate": 1.946283472359811e-07, + "loss": 1.6379, + "step": 12940 + }, + { + "epoch": 0.9741244660230716, + "grad_norm": 5.863698482513428, + "learning_rate": 1.9355523105689666e-07, + "loss": 2.0243, + "step": 12941 + }, + { + "epoch": 0.9741997403037317, + "grad_norm": 5.547372817993164, + "learning_rate": 1.9248507572804385e-07, + "loss": 1.742, + "step": 12942 + }, + { + "epoch": 0.9742750145843919, + "grad_norm": 7.607717990875244, + "learning_rate": 1.9141788131303851e-07, + "loss": 2.2946, + "step": 12943 + }, + { + "epoch": 0.974350288865052, + "grad_norm": 5.372774600982666, + "learning_rate": 1.9035364787532427e-07, + "loss": 1.6303, + "step": 12944 + }, + { + "epoch": 0.9744255631457122, + "grad_norm": 6.622222423553467, + "learning_rate": 1.8929237547816726e-07, + "loss": 1.3414, + "step": 12945 + }, + { + "epoch": 0.9745008374263724, + "grad_norm": 3.733778953552246, + "learning_rate": 1.8823406418465028e-07, + "loss": 1.9528, + "step": 12946 + }, + { + "epoch": 0.9745761117070325, + "grad_norm": 6.550518989562988, + "learning_rate": 1.8717871405769526e-07, + "loss": 2.2974, + "step": 12947 + }, + { + "epoch": 0.9746513859876926, + "grad_norm": 6.448490142822266, + "learning_rate": 1.8612632516004092e-07, + "loss": 1.9443, + "step": 12948 + }, + { + "epoch": 0.9747266602683529, + "grad_norm": 6.662453651428223, + "learning_rate": 1.850768975542372e-07, + "loss": 1.8339, + "step": 12949 + }, + { + "epoch": 0.974801934549013, + "grad_norm": 6.098504543304443, + "learning_rate": 1.840304313026786e-07, + "loss": 1.5332, + "step": 12950 + }, + { + "epoch": 0.9748772088296731, + "grad_norm": 4.197260856628418, + "learning_rate": 1.8298692646757653e-07, + "loss": 1.7926, + "step": 12951 + }, + { + "epoch": 0.9749524831103332, + "grad_norm": 5.539402008056641, + "learning_rate": 1.8194638311095914e-07, + "loss": 1.6992, + "step": 12952 + }, + { + "epoch": 0.9750277573909935, + "grad_norm": 4.922698020935059, + "learning_rate": 1.8090880129468246e-07, + "loss": 1.6961, + "step": 12953 + }, + { + "epoch": 0.9751030316716536, + "grad_norm": 5.984186172485352, + "learning_rate": 1.7987418108043608e-07, + "loss": 1.741, + "step": 12954 + }, + { + "epoch": 0.9751783059523137, + "grad_norm": 4.5789079666137695, + "learning_rate": 1.7884252252971524e-07, + "loss": 1.7496, + "step": 12955 + }, + { + "epoch": 0.9752535802329739, + "grad_norm": 6.276437759399414, + "learning_rate": 1.7781382570385418e-07, + "loss": 1.8477, + "step": 12956 + }, + { + "epoch": 0.975328854513634, + "grad_norm": 5.271674156188965, + "learning_rate": 1.7678809066400405e-07, + "loss": 1.7218, + "step": 12957 + }, + { + "epoch": 0.9754041287942942, + "grad_norm": 5.132743835449219, + "learning_rate": 1.7576531747114378e-07, + "loss": 1.3096, + "step": 12958 + }, + { + "epoch": 0.9754794030749544, + "grad_norm": 4.780306816101074, + "learning_rate": 1.7474550618607477e-07, + "loss": 1.5559, + "step": 12959 + }, + { + "epoch": 0.9755546773556145, + "grad_norm": 6.03981351852417, + "learning_rate": 1.7372865686942075e-07, + "loss": 1.7755, + "step": 12960 + }, + { + "epoch": 0.9756299516362746, + "grad_norm": 4.649022579193115, + "learning_rate": 1.7271476958163336e-07, + "loss": 1.7137, + "step": 12961 + }, + { + "epoch": 0.9757052259169349, + "grad_norm": 7.028926372528076, + "learning_rate": 1.717038443829755e-07, + "loss": 1.5326, + "step": 12962 + }, + { + "epoch": 0.975780500197595, + "grad_norm": 5.268937587738037, + "learning_rate": 1.7069588133355464e-07, + "loss": 2.0288, + "step": 12963 + }, + { + "epoch": 0.9758557744782551, + "grad_norm": 10.64756965637207, + "learning_rate": 1.6969088049328398e-07, + "loss": 1.6751, + "step": 12964 + }, + { + "epoch": 0.9759310487589153, + "grad_norm": 6.178853988647461, + "learning_rate": 1.686888419219157e-07, + "loss": 1.965, + "step": 12965 + }, + { + "epoch": 0.9760063230395755, + "grad_norm": 5.278573513031006, + "learning_rate": 1.6768976567901328e-07, + "loss": 1.9171, + "step": 12966 + }, + { + "epoch": 0.9760815973202356, + "grad_norm": 7.354477882385254, + "learning_rate": 1.666936518239681e-07, + "loss": 1.846, + "step": 12967 + }, + { + "epoch": 0.9761568716008958, + "grad_norm": 5.221993923187256, + "learning_rate": 1.657005004159995e-07, + "loss": 1.646, + "step": 12968 + }, + { + "epoch": 0.9762321458815559, + "grad_norm": 4.843984127044678, + "learning_rate": 1.647103115141435e-07, + "loss": 1.9226, + "step": 12969 + }, + { + "epoch": 0.976307420162216, + "grad_norm": 5.810163974761963, + "learning_rate": 1.6372308517726976e-07, + "loss": 1.8495, + "step": 12970 + }, + { + "epoch": 0.9763826944428762, + "grad_norm": 5.0624189376831055, + "learning_rate": 1.6273882146405904e-07, + "loss": 1.6248, + "step": 12971 + }, + { + "epoch": 0.9764579687235364, + "grad_norm": 6.379784107208252, + "learning_rate": 1.6175752043302573e-07, + "loss": 2.0395, + "step": 12972 + }, + { + "epoch": 0.9765332430041965, + "grad_norm": 4.916862487792969, + "learning_rate": 1.6077918214250642e-07, + "loss": 1.7323, + "step": 12973 + }, + { + "epoch": 0.9766085172848566, + "grad_norm": 4.556856632232666, + "learning_rate": 1.598038066506602e-07, + "loss": 1.5505, + "step": 12974 + }, + { + "epoch": 0.9766837915655169, + "grad_norm": 4.7181620597839355, + "learning_rate": 1.5883139401546844e-07, + "loss": 1.8615, + "step": 12975 + }, + { + "epoch": 0.976759065846177, + "grad_norm": 6.4315080642700195, + "learning_rate": 1.5786194429474044e-07, + "loss": 2.0256, + "step": 12976 + }, + { + "epoch": 0.9768343401268371, + "grad_norm": 7.1936540603637695, + "learning_rate": 1.5689545754610792e-07, + "loss": 1.7461, + "step": 12977 + }, + { + "epoch": 0.9769096144074974, + "grad_norm": 6.024977207183838, + "learning_rate": 1.5593193382702486e-07, + "loss": 1.7274, + "step": 12978 + }, + { + "epoch": 0.9769848886881575, + "grad_norm": 4.834726333618164, + "learning_rate": 1.5497137319476773e-07, + "loss": 1.8512, + "step": 12979 + }, + { + "epoch": 0.9770601629688176, + "grad_norm": 4.230078220367432, + "learning_rate": 1.5401377570644083e-07, + "loss": 1.7278, + "step": 12980 + }, + { + "epoch": 0.9771354372494778, + "grad_norm": 5.981341361999512, + "learning_rate": 1.5305914141897082e-07, + "loss": 1.9992, + "step": 12981 + }, + { + "epoch": 0.9772107115301379, + "grad_norm": 7.254819393157959, + "learning_rate": 1.5210747038911234e-07, + "loss": 1.5953, + "step": 12982 + }, + { + "epoch": 0.9772859858107981, + "grad_norm": 4.417457103729248, + "learning_rate": 1.511587626734312e-07, + "loss": 1.5504, + "step": 12983 + }, + { + "epoch": 0.9773612600914583, + "grad_norm": 7.3231425285339355, + "learning_rate": 1.5021301832833235e-07, + "loss": 1.5914, + "step": 12984 + }, + { + "epoch": 0.9774365343721184, + "grad_norm": 4.309480667114258, + "learning_rate": 1.4927023741003187e-07, + "loss": 1.4203, + "step": 12985 + }, + { + "epoch": 0.9775118086527785, + "grad_norm": 5.741574764251709, + "learning_rate": 1.4833041997457942e-07, + "loss": 1.5366, + "step": 12986 + }, + { + "epoch": 0.9775870829334388, + "grad_norm": 4.996456623077393, + "learning_rate": 1.473935660778414e-07, + "loss": 2.0576, + "step": 12987 + }, + { + "epoch": 0.9776623572140989, + "grad_norm": 5.0791096687316895, + "learning_rate": 1.4645967577551212e-07, + "loss": 1.6859, + "step": 12988 + }, + { + "epoch": 0.977737631494759, + "grad_norm": 8.104195594787598, + "learning_rate": 1.4552874912310832e-07, + "loss": 1.6833, + "step": 12989 + }, + { + "epoch": 0.9778129057754192, + "grad_norm": 5.378783702850342, + "learning_rate": 1.4460078617597462e-07, + "loss": 1.3848, + "step": 12990 + }, + { + "epoch": 0.9778881800560794, + "grad_norm": 6.668229103088379, + "learning_rate": 1.4367578698927242e-07, + "loss": 1.7759, + "step": 12991 + }, + { + "epoch": 0.9779634543367395, + "grad_norm": 5.935540676116943, + "learning_rate": 1.427537516179911e-07, + "loss": 1.6215, + "step": 12992 + }, + { + "epoch": 0.9780387286173996, + "grad_norm": 5.124584197998047, + "learning_rate": 1.4183468011694235e-07, + "loss": 1.7608, + "step": 12993 + }, + { + "epoch": 0.9781140028980598, + "grad_norm": 5.161470413208008, + "learning_rate": 1.409185725407658e-07, + "loss": 1.5508, + "step": 12994 + }, + { + "epoch": 0.97818927717872, + "grad_norm": 6.0180344581604, + "learning_rate": 1.4000542894391232e-07, + "loss": 1.5922, + "step": 12995 + }, + { + "epoch": 0.9782645514593801, + "grad_norm": 6.332244396209717, + "learning_rate": 1.390952493806774e-07, + "loss": 1.8204, + "step": 12996 + }, + { + "epoch": 0.9783398257400403, + "grad_norm": 5.8722968101501465, + "learning_rate": 1.3818803390515667e-07, + "loss": 1.9245, + "step": 12997 + }, + { + "epoch": 0.9784151000207004, + "grad_norm": 8.521466255187988, + "learning_rate": 1.3728378257129027e-07, + "loss": 1.3932, + "step": 12998 + }, + { + "epoch": 0.9784903743013605, + "grad_norm": 5.747220039367676, + "learning_rate": 1.3638249543283522e-07, + "loss": 1.8866, + "step": 12999 + }, + { + "epoch": 0.9785656485820208, + "grad_norm": 4.722435474395752, + "learning_rate": 1.3548417254336532e-07, + "loss": 1.3835, + "step": 13000 + }, + { + "epoch": 0.9786409228626809, + "grad_norm": 7.290378570556641, + "learning_rate": 1.345888139562823e-07, + "loss": 2.3858, + "step": 13001 + }, + { + "epoch": 0.978716197143341, + "grad_norm": 6.358278274536133, + "learning_rate": 1.336964197248103e-07, + "loss": 1.8686, + "step": 13002 + }, + { + "epoch": 0.9787914714240012, + "grad_norm": 6.795258045196533, + "learning_rate": 1.328069899020068e-07, + "loss": 1.9679, + "step": 13003 + }, + { + "epoch": 0.9788667457046614, + "grad_norm": 4.248903751373291, + "learning_rate": 1.319205245407462e-07, + "loss": 1.9973, + "step": 13004 + }, + { + "epoch": 0.9789420199853215, + "grad_norm": 3.9946954250335693, + "learning_rate": 1.3103702369371973e-07, + "loss": 1.6456, + "step": 13005 + }, + { + "epoch": 0.9790172942659817, + "grad_norm": 5.1285014152526855, + "learning_rate": 1.3015648741345199e-07, + "loss": 1.5785, + "step": 13006 + }, + { + "epoch": 0.9790925685466418, + "grad_norm": 5.78769588470459, + "learning_rate": 1.2927891575228446e-07, + "loss": 1.656, + "step": 13007 + }, + { + "epoch": 0.979167842827302, + "grad_norm": 5.299044609069824, + "learning_rate": 1.2840430876239206e-07, + "loss": 1.7985, + "step": 13008 + }, + { + "epoch": 0.9792431171079622, + "grad_norm": 6.17142391204834, + "learning_rate": 1.275326664957721e-07, + "loss": 1.6126, + "step": 13009 + }, + { + "epoch": 0.9793183913886223, + "grad_norm": 6.033590316772461, + "learning_rate": 1.266639890042276e-07, + "loss": 1.9633, + "step": 13010 + }, + { + "epoch": 0.9793936656692824, + "grad_norm": 5.743236541748047, + "learning_rate": 1.2579827633941166e-07, + "loss": 1.5454, + "step": 13011 + }, + { + "epoch": 0.9794689399499426, + "grad_norm": 5.568728923797607, + "learning_rate": 1.2493552855278313e-07, + "loss": 1.8759, + "step": 13012 + }, + { + "epoch": 0.9795442142306028, + "grad_norm": 7.057471752166748, + "learning_rate": 1.240757456956232e-07, + "loss": 1.8611, + "step": 13013 + }, + { + "epoch": 0.9796194885112629, + "grad_norm": 5.332674980163574, + "learning_rate": 1.2321892781905763e-07, + "loss": 1.5087, + "step": 13014 + }, + { + "epoch": 0.979694762791923, + "grad_norm": 4.490135669708252, + "learning_rate": 1.223650749740124e-07, + "loss": 1.3191, + "step": 13015 + }, + { + "epoch": 0.9797700370725833, + "grad_norm": 6.370522499084473, + "learning_rate": 1.2151418721124684e-07, + "loss": 1.5535, + "step": 13016 + }, + { + "epoch": 0.9798453113532434, + "grad_norm": 6.171915054321289, + "learning_rate": 1.2066626458134278e-07, + "loss": 1.9122, + "step": 13017 + }, + { + "epoch": 0.9799205856339035, + "grad_norm": 7.4021196365356445, + "learning_rate": 1.198213071347154e-07, + "loss": 1.905, + "step": 13018 + }, + { + "epoch": 0.9799958599145637, + "grad_norm": 5.191743850708008, + "learning_rate": 1.1897931492158565e-07, + "loss": 1.7342, + "step": 13019 + }, + { + "epoch": 0.9800711341952238, + "grad_norm": 5.4447736740112305, + "learning_rate": 1.1814028799201348e-07, + "loss": 1.7532, + "step": 13020 + }, + { + "epoch": 0.980146408475884, + "grad_norm": 4.939035415649414, + "learning_rate": 1.1730422639587013e-07, + "loss": 1.6722, + "step": 13021 + }, + { + "epoch": 0.9802216827565442, + "grad_norm": 5.296854496002197, + "learning_rate": 1.1647113018286027e-07, + "loss": 1.9805, + "step": 13022 + }, + { + "epoch": 0.9802969570372043, + "grad_norm": 4.869935035705566, + "learning_rate": 1.156409994025165e-07, + "loss": 1.6448, + "step": 13023 + }, + { + "epoch": 0.9803722313178644, + "grad_norm": 6.449895858764648, + "learning_rate": 1.1481383410417712e-07, + "loss": 1.9441, + "step": 13024 + }, + { + "epoch": 0.9804475055985247, + "grad_norm": 4.578142166137695, + "learning_rate": 1.1398963433701393e-07, + "loss": 1.9716, + "step": 13025 + }, + { + "epoch": 0.9805227798791848, + "grad_norm": 5.157728672027588, + "learning_rate": 1.1316840015003772e-07, + "loss": 1.7029, + "step": 13026 + }, + { + "epoch": 0.9805980541598449, + "grad_norm": 5.1253533363342285, + "learning_rate": 1.1235013159205387e-07, + "loss": 1.512, + "step": 13027 + }, + { + "epoch": 0.9806733284405051, + "grad_norm": 6.157623767852783, + "learning_rate": 1.1153482871170684e-07, + "loss": 1.7635, + "step": 13028 + }, + { + "epoch": 0.9807486027211653, + "grad_norm": 5.447888374328613, + "learning_rate": 1.107224915574745e-07, + "loss": 1.6854, + "step": 13029 + }, + { + "epoch": 0.9808238770018254, + "grad_norm": 8.986708641052246, + "learning_rate": 1.0991312017764044e-07, + "loss": 1.8057, + "step": 13030 + }, + { + "epoch": 0.9808991512824855, + "grad_norm": 4.813953399658203, + "learning_rate": 1.0910671462031618e-07, + "loss": 2.0215, + "step": 13031 + }, + { + "epoch": 0.9809744255631457, + "grad_norm": 6.0057477951049805, + "learning_rate": 1.0830327493344672e-07, + "loss": 2.0875, + "step": 13032 + }, + { + "epoch": 0.9810496998438059, + "grad_norm": 5.363242149353027, + "learning_rate": 1.0750280116479383e-07, + "loss": 1.8018, + "step": 13033 + }, + { + "epoch": 0.981124974124466, + "grad_norm": 4.823620319366455, + "learning_rate": 1.0670529336194168e-07, + "loss": 1.606, + "step": 13034 + }, + { + "epoch": 0.9812002484051262, + "grad_norm": 6.737373352050781, + "learning_rate": 1.0591075157230235e-07, + "loss": 1.697, + "step": 13035 + }, + { + "epoch": 0.9812755226857863, + "grad_norm": 6.440965175628662, + "learning_rate": 1.0511917584310472e-07, + "loss": 1.6982, + "step": 13036 + }, + { + "epoch": 0.9813507969664464, + "grad_norm": 5.122552394866943, + "learning_rate": 1.0433056622140558e-07, + "loss": 1.8574, + "step": 13037 + }, + { + "epoch": 0.9814260712471067, + "grad_norm": 5.1819963455200195, + "learning_rate": 1.0354492275408966e-07, + "loss": 1.5558, + "step": 13038 + }, + { + "epoch": 0.9815013455277668, + "grad_norm": 5.199045658111572, + "learning_rate": 1.027622454878585e-07, + "loss": 1.9707, + "step": 13039 + }, + { + "epoch": 0.9815766198084269, + "grad_norm": 5.493001461029053, + "learning_rate": 1.01982534469236e-07, + "loss": 1.8542, + "step": 13040 + }, + { + "epoch": 0.9816518940890872, + "grad_norm": 5.978510856628418, + "learning_rate": 1.0120578974458506e-07, + "loss": 1.5781, + "step": 13041 + }, + { + "epoch": 0.9817271683697473, + "grad_norm": 6.871249198913574, + "learning_rate": 1.0043201136006874e-07, + "loss": 2.1672, + "step": 13042 + }, + { + "epoch": 0.9818024426504074, + "grad_norm": 3.9805896282196045, + "learning_rate": 9.966119936170026e-08, + "loss": 1.7804, + "step": 13043 + }, + { + "epoch": 0.9818777169310676, + "grad_norm": 5.285131454467773, + "learning_rate": 9.889335379528186e-08, + "loss": 1.7595, + "step": 13044 + }, + { + "epoch": 0.9819529912117277, + "grad_norm": 5.9241766929626465, + "learning_rate": 9.812847470647702e-08, + "loss": 1.7324, + "step": 13045 + }, + { + "epoch": 0.9820282654923879, + "grad_norm": 4.808355331420898, + "learning_rate": 9.73665621407549e-08, + "loss": 1.5791, + "step": 13046 + }, + { + "epoch": 0.9821035397730481, + "grad_norm": 5.59078311920166, + "learning_rate": 9.660761614339598e-08, + "loss": 1.9391, + "step": 13047 + }, + { + "epoch": 0.9821788140537082, + "grad_norm": 11.050440788269043, + "learning_rate": 9.58516367595308e-08, + "loss": 2.0352, + "step": 13048 + }, + { + "epoch": 0.9822540883343683, + "grad_norm": 5.303092956542969, + "learning_rate": 9.50986240340901e-08, + "loss": 1.8439, + "step": 13049 + }, + { + "epoch": 0.9823293626150285, + "grad_norm": 3.9743611812591553, + "learning_rate": 9.434857801184915e-08, + "loss": 1.64, + "step": 13050 + }, + { + "epoch": 0.9824046368956887, + "grad_norm": 7.176499366760254, + "learning_rate": 9.360149873738345e-08, + "loss": 1.7053, + "step": 13051 + }, + { + "epoch": 0.9824799111763488, + "grad_norm": 5.9079508781433105, + "learning_rate": 9.285738625511298e-08, + "loss": 1.7981, + "step": 13052 + }, + { + "epoch": 0.9825551854570089, + "grad_norm": 4.518915176391602, + "learning_rate": 9.211624060926904e-08, + "loss": 1.4843, + "step": 13053 + }, + { + "epoch": 0.9826304597376692, + "grad_norm": 5.704377174377441, + "learning_rate": 9.137806184391639e-08, + "loss": 1.9328, + "step": 13054 + }, + { + "epoch": 0.9827057340183293, + "grad_norm": 5.946772575378418, + "learning_rate": 9.06428500029366e-08, + "loss": 1.8884, + "step": 13055 + }, + { + "epoch": 0.9827810082989894, + "grad_norm": 5.229925632476807, + "learning_rate": 8.991060513002803e-08, + "loss": 1.4348, + "step": 13056 + }, + { + "epoch": 0.9828562825796496, + "grad_norm": 5.110372543334961, + "learning_rate": 8.918132726872808e-08, + "loss": 1.502, + "step": 13057 + }, + { + "epoch": 0.9829315568603098, + "grad_norm": 4.785898685455322, + "learning_rate": 8.845501646239096e-08, + "loss": 1.6833, + "step": 13058 + }, + { + "epoch": 0.9830068311409699, + "grad_norm": 4.3589558601379395, + "learning_rate": 8.773167275418214e-08, + "loss": 1.5899, + "step": 13059 + }, + { + "epoch": 0.9830821054216301, + "grad_norm": 6.032839775085449, + "learning_rate": 8.701129618712279e-08, + "loss": 2.0903, + "step": 13060 + }, + { + "epoch": 0.9831573797022902, + "grad_norm": 5.521963596343994, + "learning_rate": 8.62938868040175e-08, + "loss": 1.744, + "step": 13061 + }, + { + "epoch": 0.9832326539829503, + "grad_norm": 5.807338237762451, + "learning_rate": 8.55794446475211e-08, + "loss": 1.5956, + "step": 13062 + }, + { + "epoch": 0.9833079282636106, + "grad_norm": 7.117595672607422, + "learning_rate": 8.486796976011069e-08, + "loss": 1.5686, + "step": 13063 + }, + { + "epoch": 0.9833832025442707, + "grad_norm": 4.9173431396484375, + "learning_rate": 8.415946218407466e-08, + "loss": 1.4706, + "step": 13064 + }, + { + "epoch": 0.9834584768249308, + "grad_norm": 4.793614387512207, + "learning_rate": 8.345392196153489e-08, + "loss": 1.5263, + "step": 13065 + }, + { + "epoch": 0.983533751105591, + "grad_norm": 5.021864891052246, + "learning_rate": 8.275134913443005e-08, + "loss": 1.7084, + "step": 13066 + }, + { + "epoch": 0.9836090253862512, + "grad_norm": 5.586772441864014, + "learning_rate": 8.205174374453228e-08, + "loss": 1.6972, + "step": 13067 + }, + { + "epoch": 0.9836842996669113, + "grad_norm": 5.8311567306518555, + "learning_rate": 8.135510583342498e-08, + "loss": 2.0313, + "step": 13068 + }, + { + "epoch": 0.9837595739475714, + "grad_norm": 4.036192893981934, + "learning_rate": 8.066143544253058e-08, + "loss": 1.7055, + "step": 13069 + }, + { + "epoch": 0.9838348482282316, + "grad_norm": 5.202116012573242, + "learning_rate": 7.997073261307164e-08, + "loss": 1.6789, + "step": 13070 + }, + { + "epoch": 0.9839101225088918, + "grad_norm": 4.484016418457031, + "learning_rate": 7.928299738612088e-08, + "loss": 1.8332, + "step": 13071 + }, + { + "epoch": 0.9839853967895519, + "grad_norm": 6.615151405334473, + "learning_rate": 7.859822980255115e-08, + "loss": 1.8901, + "step": 13072 + }, + { + "epoch": 0.9840606710702121, + "grad_norm": 5.3288254737854, + "learning_rate": 7.79164299030799e-08, + "loss": 1.8249, + "step": 13073 + }, + { + "epoch": 0.9841359453508722, + "grad_norm": 5.11895751953125, + "learning_rate": 7.72375977282358e-08, + "loss": 1.6614, + "step": 13074 + }, + { + "epoch": 0.9842112196315324, + "grad_norm": 6.499939441680908, + "learning_rate": 7.656173331836991e-08, + "loss": 1.9715, + "step": 13075 + }, + { + "epoch": 0.9842864939121926, + "grad_norm": 4.265520095825195, + "learning_rate": 7.588883671366675e-08, + "loss": 1.5722, + "step": 13076 + }, + { + "epoch": 0.9843617681928527, + "grad_norm": 4.985448837280273, + "learning_rate": 7.521890795411657e-08, + "loss": 1.6902, + "step": 13077 + }, + { + "epoch": 0.9844370424735128, + "grad_norm": 4.991160869598389, + "learning_rate": 7.45519470795597e-08, + "loss": 1.7498, + "step": 13078 + }, + { + "epoch": 0.9845123167541731, + "grad_norm": 8.067404747009277, + "learning_rate": 7.388795412963112e-08, + "loss": 1.5335, + "step": 13079 + }, + { + "epoch": 0.9845875910348332, + "grad_norm": 5.131585597991943, + "learning_rate": 7.32269291438159e-08, + "loss": 1.4492, + "step": 13080 + }, + { + "epoch": 0.9846628653154933, + "grad_norm": 5.241518020629883, + "learning_rate": 7.256887216139929e-08, + "loss": 1.6936, + "step": 13081 + }, + { + "epoch": 0.9847381395961535, + "grad_norm": 5.683465957641602, + "learning_rate": 7.191378322150555e-08, + "loss": 1.8145, + "step": 13082 + }, + { + "epoch": 0.9848134138768136, + "grad_norm": 6.25462007522583, + "learning_rate": 7.126166236307575e-08, + "loss": 1.9503, + "step": 13083 + }, + { + "epoch": 0.9848886881574738, + "grad_norm": 6.322662353515625, + "learning_rate": 7.061250962488441e-08, + "loss": 1.4981, + "step": 13084 + }, + { + "epoch": 0.984963962438134, + "grad_norm": 4.374819278717041, + "learning_rate": 6.996632504551182e-08, + "loss": 1.7794, + "step": 13085 + }, + { + "epoch": 0.9850392367187941, + "grad_norm": 4.685973644256592, + "learning_rate": 6.932310866337721e-08, + "loss": 1.599, + "step": 13086 + }, + { + "epoch": 0.9851145109994542, + "grad_norm": 4.924495220184326, + "learning_rate": 6.868286051671669e-08, + "loss": 1.7593, + "step": 13087 + }, + { + "epoch": 0.9851897852801145, + "grad_norm": 5.370654582977295, + "learning_rate": 6.804558064358868e-08, + "loss": 1.5076, + "step": 13088 + }, + { + "epoch": 0.9852650595607746, + "grad_norm": 6.819989204406738, + "learning_rate": 6.741126908187956e-08, + "loss": 1.4984, + "step": 13089 + }, + { + "epoch": 0.9853403338414347, + "grad_norm": 4.890023231506348, + "learning_rate": 6.67799258693036e-08, + "loss": 1.4644, + "step": 13090 + }, + { + "epoch": 0.9854156081220948, + "grad_norm": 5.000364303588867, + "learning_rate": 6.615155104338077e-08, + "loss": 1.4204, + "step": 13091 + }, + { + "epoch": 0.9854908824027551, + "grad_norm": 5.84981632232666, + "learning_rate": 6.552614464147566e-08, + "loss": 1.4479, + "step": 13092 + }, + { + "epoch": 0.9855661566834152, + "grad_norm": 4.8147735595703125, + "learning_rate": 6.490370670076407e-08, + "loss": 1.6987, + "step": 13093 + }, + { + "epoch": 0.9856414309640753, + "grad_norm": 6.752495288848877, + "learning_rate": 6.428423725824417e-08, + "loss": 2.0106, + "step": 13094 + }, + { + "epoch": 0.9857167052447355, + "grad_norm": 6.254847526550293, + "learning_rate": 6.366773635074208e-08, + "loss": 2.0797, + "step": 13095 + }, + { + "epoch": 0.9857919795253957, + "grad_norm": 5.815625190734863, + "learning_rate": 6.30542040149118e-08, + "loss": 1.5863, + "step": 13096 + }, + { + "epoch": 0.9858672538060558, + "grad_norm": 6.244816780090332, + "learning_rate": 6.244364028722971e-08, + "loss": 1.5153, + "step": 13097 + }, + { + "epoch": 0.985942528086716, + "grad_norm": 4.518695831298828, + "learning_rate": 6.18360452039779e-08, + "loss": 1.7786, + "step": 13098 + }, + { + "epoch": 0.9860178023673761, + "grad_norm": 5.306760311126709, + "learning_rate": 6.123141880128857e-08, + "loss": 1.6481, + "step": 13099 + }, + { + "epoch": 0.9860930766480362, + "grad_norm": 4.418492794036865, + "learning_rate": 6.062976111509966e-08, + "loss": 1.7768, + "step": 13100 + }, + { + "epoch": 0.9861683509286965, + "grad_norm": 5.4592976570129395, + "learning_rate": 6.003107218118253e-08, + "loss": 1.8049, + "step": 13101 + }, + { + "epoch": 0.9862436252093566, + "grad_norm": 4.977929592132568, + "learning_rate": 5.94353520351254e-08, + "loss": 1.9898, + "step": 13102 + }, + { + "epoch": 0.9863188994900167, + "grad_norm": 7.750016689300537, + "learning_rate": 5.8842600712338825e-08, + "loss": 1.6663, + "step": 13103 + }, + { + "epoch": 0.986394173770677, + "grad_norm": 6.001031875610352, + "learning_rate": 5.825281824805573e-08, + "loss": 1.9044, + "step": 13104 + }, + { + "epoch": 0.9864694480513371, + "grad_norm": 6.493915557861328, + "learning_rate": 5.7666004677353616e-08, + "loss": 1.8797, + "step": 13105 + }, + { + "epoch": 0.9865447223319972, + "grad_norm": 5.944760799407959, + "learning_rate": 5.708216003509903e-08, + "loss": 1.7292, + "step": 13106 + }, + { + "epoch": 0.9866199966126574, + "grad_norm": 5.876149654388428, + "learning_rate": 5.65012843560142e-08, + "loss": 1.664, + "step": 13107 + }, + { + "epoch": 0.9866952708933175, + "grad_norm": 6.851894378662109, + "learning_rate": 5.592337767462152e-08, + "loss": 1.2876, + "step": 13108 + }, + { + "epoch": 0.9867705451739777, + "grad_norm": 6.240671634674072, + "learning_rate": 5.5348440025276835e-08, + "loss": 1.7208, + "step": 13109 + }, + { + "epoch": 0.9868458194546378, + "grad_norm": 4.443357467651367, + "learning_rate": 5.477647144216391e-08, + "loss": 1.9254, + "step": 13110 + }, + { + "epoch": 0.986921093735298, + "grad_norm": 7.245244026184082, + "learning_rate": 5.420747195927778e-08, + "loss": 1.8466, + "step": 13111 + }, + { + "epoch": 0.9869963680159581, + "grad_norm": 6.102886199951172, + "learning_rate": 5.364144161044693e-08, + "loss": 1.5019, + "step": 13112 + }, + { + "epoch": 0.9870716422966183, + "grad_norm": 6.471378803253174, + "learning_rate": 5.307838042931667e-08, + "loss": 1.7059, + "step": 13113 + }, + { + "epoch": 0.9871469165772785, + "grad_norm": 6.312297344207764, + "learning_rate": 5.251828844937135e-08, + "loss": 1.706, + "step": 13114 + }, + { + "epoch": 0.9872221908579386, + "grad_norm": 4.498809814453125, + "learning_rate": 5.1961165703889866e-08, + "loss": 1.7317, + "step": 13115 + }, + { + "epoch": 0.9872974651385987, + "grad_norm": 4.326809883117676, + "learning_rate": 5.1407012226006855e-08, + "loss": 1.6492, + "step": 13116 + }, + { + "epoch": 0.987372739419259, + "grad_norm": 7.009565830230713, + "learning_rate": 5.0855828048657073e-08, + "loss": 1.5545, + "step": 13117 + }, + { + "epoch": 0.9874480136999191, + "grad_norm": 4.513393878936768, + "learning_rate": 5.03076132046032e-08, + "loss": 1.2998, + "step": 13118 + }, + { + "epoch": 0.9875232879805792, + "grad_norm": 4.883440971374512, + "learning_rate": 4.9762367726446935e-08, + "loss": 1.6765, + "step": 13119 + }, + { + "epoch": 0.9875985622612394, + "grad_norm": 5.346716403961182, + "learning_rate": 4.922009164659014e-08, + "loss": 1.9156, + "step": 13120 + }, + { + "epoch": 0.9876738365418996, + "grad_norm": 6.416129112243652, + "learning_rate": 4.8680784997273684e-08, + "loss": 1.5486, + "step": 13121 + }, + { + "epoch": 0.9877491108225597, + "grad_norm": 5.799234867095947, + "learning_rate": 4.8144447810560825e-08, + "loss": 2.2425, + "step": 13122 + }, + { + "epoch": 0.9878243851032199, + "grad_norm": 4.730310916900635, + "learning_rate": 4.7611080118331596e-08, + "loss": 1.8047, + "step": 13123 + }, + { + "epoch": 0.98789965938388, + "grad_norm": 6.363705635070801, + "learning_rate": 4.708068195229398e-08, + "loss": 1.634, + "step": 13124 + }, + { + "epoch": 0.9879749336645401, + "grad_norm": 14.274803161621094, + "learning_rate": 4.655325334397831e-08, + "loss": 1.7679, + "step": 13125 + }, + { + "epoch": 0.9880502079452004, + "grad_norm": 6.044187068939209, + "learning_rate": 4.602879432473728e-08, + "loss": 1.6974, + "step": 13126 + }, + { + "epoch": 0.9881254822258605, + "grad_norm": 5.682431697845459, + "learning_rate": 4.550730492575705e-08, + "loss": 1.6306, + "step": 13127 + }, + { + "epoch": 0.9882007565065206, + "grad_norm": 4.307530879974365, + "learning_rate": 4.498878517802396e-08, + "loss": 1.7235, + "step": 13128 + }, + { + "epoch": 0.9882760307871807, + "grad_norm": 4.414935111999512, + "learning_rate": 4.447323511237999e-08, + "loss": 1.4991, + "step": 13129 + }, + { + "epoch": 0.988351305067841, + "grad_norm": 4.00349760055542, + "learning_rate": 4.396065475945621e-08, + "loss": 1.5763, + "step": 13130 + }, + { + "epoch": 0.9884265793485011, + "grad_norm": 4.511332988739014, + "learning_rate": 4.3451044149733777e-08, + "loss": 1.4512, + "step": 13131 + }, + { + "epoch": 0.9885018536291612, + "grad_norm": 5.25727653503418, + "learning_rate": 4.2944403313505135e-08, + "loss": 1.7225, + "step": 13132 + }, + { + "epoch": 0.9885771279098214, + "grad_norm": 6.089904308319092, + "learning_rate": 4.2440732280885075e-08, + "loss": 1.5906, + "step": 13133 + }, + { + "epoch": 0.9886524021904816, + "grad_norm": 6.592550277709961, + "learning_rate": 4.194003108182742e-08, + "loss": 1.6953, + "step": 13134 + }, + { + "epoch": 0.9887276764711417, + "grad_norm": 4.942586421966553, + "learning_rate": 4.1442299746080603e-08, + "loss": 1.3553, + "step": 13135 + }, + { + "epoch": 0.9888029507518019, + "grad_norm": 5.673553943634033, + "learning_rate": 4.094753830324871e-08, + "loss": 1.8644, + "step": 13136 + }, + { + "epoch": 0.988878225032462, + "grad_norm": 6.152290344238281, + "learning_rate": 4.0455746782736007e-08, + "loss": 1.693, + "step": 13137 + }, + { + "epoch": 0.9889534993131222, + "grad_norm": 4.717894077301025, + "learning_rate": 3.9966925213774654e-08, + "loss": 1.7388, + "step": 13138 + }, + { + "epoch": 0.9890287735937824, + "grad_norm": 4.000372409820557, + "learning_rate": 3.9481073625430296e-08, + "loss": 1.7355, + "step": 13139 + }, + { + "epoch": 0.9891040478744425, + "grad_norm": 5.228072643280029, + "learning_rate": 3.8998192046585394e-08, + "loss": 1.4317, + "step": 13140 + }, + { + "epoch": 0.9891793221551026, + "grad_norm": 6.17822265625, + "learning_rate": 3.851828050594475e-08, + "loss": 1.8829, + "step": 13141 + }, + { + "epoch": 0.9892545964357629, + "grad_norm": 4.330573558807373, + "learning_rate": 3.804133903203e-08, + "loss": 1.9422, + "step": 13142 + }, + { + "epoch": 0.989329870716423, + "grad_norm": 10.764684677124023, + "learning_rate": 3.756736765320734e-08, + "loss": 2.1166, + "step": 13143 + }, + { + "epoch": 0.9894051449970831, + "grad_norm": 4.638550281524658, + "learning_rate": 3.709636639764313e-08, + "loss": 1.6455, + "step": 13144 + }, + { + "epoch": 0.9894804192777433, + "grad_norm": 6.8160786628723145, + "learning_rate": 3.662833529334275e-08, + "loss": 1.5211, + "step": 13145 + }, + { + "epoch": 0.9895556935584034, + "grad_norm": 4.503322601318359, + "learning_rate": 3.6163274368122834e-08, + "loss": 1.8195, + "step": 13146 + }, + { + "epoch": 0.9896309678390636, + "grad_norm": 4.949244022369385, + "learning_rate": 3.570118364963904e-08, + "loss": 1.2814, + "step": 13147 + }, + { + "epoch": 0.9897062421197237, + "grad_norm": 5.407182216644287, + "learning_rate": 3.5242063165352725e-08, + "loss": 2.1482, + "step": 13148 + }, + { + "epoch": 0.9897815164003839, + "grad_norm": 4.733443260192871, + "learning_rate": 3.478591294256428e-08, + "loss": 1.7436, + "step": 13149 + }, + { + "epoch": 0.989856790681044, + "grad_norm": 8.241961479187012, + "learning_rate": 3.43327330083798e-08, + "loss": 1.8737, + "step": 13150 + }, + { + "epoch": 0.9899320649617042, + "grad_norm": 4.252402305603027, + "learning_rate": 3.3882523389755504e-08, + "loss": 1.9455, + "step": 13151 + }, + { + "epoch": 0.9900073392423644, + "grad_norm": 5.956196308135986, + "learning_rate": 3.343528411344221e-08, + "loss": 1.5691, + "step": 13152 + }, + { + "epoch": 0.9900826135230245, + "grad_norm": 6.419072151184082, + "learning_rate": 3.299101520602976e-08, + "loss": 1.5605, + "step": 13153 + }, + { + "epoch": 0.9901578878036846, + "grad_norm": 3.8831958770751953, + "learning_rate": 3.2549716693930365e-08, + "loss": 1.7606, + "step": 13154 + }, + { + "epoch": 0.9902331620843449, + "grad_norm": 5.0645527839660645, + "learning_rate": 3.2111388603378586e-08, + "loss": 1.4158, + "step": 13155 + }, + { + "epoch": 0.990308436365005, + "grad_norm": 4.285924434661865, + "learning_rate": 3.167603096043137e-08, + "loss": 1.3323, + "step": 13156 + }, + { + "epoch": 0.9903837106456651, + "grad_norm": 4.4667510986328125, + "learning_rate": 3.1243643790968e-08, + "loss": 1.8686, + "step": 13157 + }, + { + "epoch": 0.9904589849263253, + "grad_norm": 4.28079891204834, + "learning_rate": 3.0814227120695707e-08, + "loss": 1.8314, + "step": 13158 + }, + { + "epoch": 0.9905342592069855, + "grad_norm": 4.143362045288086, + "learning_rate": 3.0387780975138505e-08, + "loss": 1.6542, + "step": 13159 + }, + { + "epoch": 0.9906095334876456, + "grad_norm": 6.253848552703857, + "learning_rate": 2.996430537964834e-08, + "loss": 1.7356, + "step": 13160 + }, + { + "epoch": 0.9906848077683058, + "grad_norm": 6.70048189163208, + "learning_rate": 2.954380035939952e-08, + "loss": 1.6953, + "step": 13161 + }, + { + "epoch": 0.9907600820489659, + "grad_norm": 7.164137363433838, + "learning_rate": 2.912626593938872e-08, + "loss": 1.971, + "step": 13162 + }, + { + "epoch": 0.990835356329626, + "grad_norm": 5.263330459594727, + "learning_rate": 2.8711702144446074e-08, + "loss": 1.7556, + "step": 13163 + }, + { + "epoch": 0.9909106306102863, + "grad_norm": 3.901093006134033, + "learning_rate": 2.830010899920188e-08, + "loss": 1.4832, + "step": 13164 + }, + { + "epoch": 0.9909859048909464, + "grad_norm": 5.758511543273926, + "learning_rate": 2.7891486528136558e-08, + "loss": 1.81, + "step": 13165 + }, + { + "epoch": 0.9910611791716065, + "grad_norm": 5.488779544830322, + "learning_rate": 2.7485834755530683e-08, + "loss": 1.9614, + "step": 13166 + }, + { + "epoch": 0.9911364534522666, + "grad_norm": 5.147634983062744, + "learning_rate": 2.70831537055094e-08, + "loss": 1.4763, + "step": 13167 + }, + { + "epoch": 0.9912117277329269, + "grad_norm": 4.163125038146973, + "learning_rate": 2.668344340200357e-08, + "loss": 1.7668, + "step": 13168 + }, + { + "epoch": 0.991287002013587, + "grad_norm": 5.76678991317749, + "learning_rate": 2.6286703868777518e-08, + "loss": 1.7387, + "step": 13169 + }, + { + "epoch": 0.9913622762942471, + "grad_norm": 3.8791990280151367, + "learning_rate": 2.5892935129417927e-08, + "loss": 1.5529, + "step": 13170 + }, + { + "epoch": 0.9914375505749073, + "grad_norm": 4.593095779418945, + "learning_rate": 2.5502137207333853e-08, + "loss": 1.7283, + "step": 13171 + }, + { + "epoch": 0.9915128248555675, + "grad_norm": 4.07630729675293, + "learning_rate": 2.5114310125751162e-08, + "loss": 1.4735, + "step": 13172 + }, + { + "epoch": 0.9915880991362276, + "grad_norm": 8.618682861328125, + "learning_rate": 2.4729453907729182e-08, + "loss": 1.7506, + "step": 13173 + }, + { + "epoch": 0.9916633734168878, + "grad_norm": 4.966878890991211, + "learning_rate": 2.4347568576144064e-08, + "loss": 1.9417, + "step": 13174 + }, + { + "epoch": 0.9917386476975479, + "grad_norm": 8.414887428283691, + "learning_rate": 2.3968654153699866e-08, + "loss": 1.9812, + "step": 13175 + }, + { + "epoch": 0.9918139219782081, + "grad_norm": 6.286038398742676, + "learning_rate": 2.3592710662923012e-08, + "loss": 1.5767, + "step": 13176 + }, + { + "epoch": 0.9918891962588683, + "grad_norm": 4.6140289306640625, + "learning_rate": 2.3219738126162293e-08, + "loss": 1.7424, + "step": 13177 + }, + { + "epoch": 0.9919644705395284, + "grad_norm": 4.626462936401367, + "learning_rate": 2.2849736565588865e-08, + "loss": 1.7685, + "step": 13178 + }, + { + "epoch": 0.9920397448201885, + "grad_norm": 4.816788673400879, + "learning_rate": 2.2482706003201794e-08, + "loss": 1.7274, + "step": 13179 + }, + { + "epoch": 0.9921150191008488, + "grad_norm": 5.877460956573486, + "learning_rate": 2.211864646081141e-08, + "loss": 1.3039, + "step": 13180 + }, + { + "epoch": 0.9921902933815089, + "grad_norm": 4.755939960479736, + "learning_rate": 2.1757557960061513e-08, + "loss": 2.1589, + "step": 13181 + }, + { + "epoch": 0.992265567662169, + "grad_norm": 7.746996879577637, + "learning_rate": 2.139944052242937e-08, + "loss": 1.546, + "step": 13182 + }, + { + "epoch": 0.9923408419428292, + "grad_norm": 8.059784889221191, + "learning_rate": 2.104429416918685e-08, + "loss": 2.1399, + "step": 13183 + }, + { + "epoch": 0.9924161162234894, + "grad_norm": 6.785238265991211, + "learning_rate": 2.06921189214615e-08, + "loss": 1.6153, + "step": 13184 + }, + { + "epoch": 0.9924913905041495, + "grad_norm": 4.743091583251953, + "learning_rate": 2.0342914800181022e-08, + "loss": 1.9528, + "step": 13185 + }, + { + "epoch": 0.9925666647848097, + "grad_norm": 5.474056243896484, + "learning_rate": 1.999668182610659e-08, + "loss": 1.4623, + "step": 13186 + }, + { + "epoch": 0.9926419390654698, + "grad_norm": 6.484755039215088, + "learning_rate": 1.9653420019821735e-08, + "loss": 1.8063, + "step": 13187 + }, + { + "epoch": 0.99271721334613, + "grad_norm": 6.0270843505859375, + "learning_rate": 1.9313129401732355e-08, + "loss": 1.4616, + "step": 13188 + }, + { + "epoch": 0.9927924876267901, + "grad_norm": 4.8150434494018555, + "learning_rate": 1.897580999206672e-08, + "loss": 1.67, + "step": 13189 + }, + { + "epoch": 0.9928677619074503, + "grad_norm": 4.706716060638428, + "learning_rate": 1.864146181087545e-08, + "loss": 1.671, + "step": 13190 + }, + { + "epoch": 0.9929430361881104, + "grad_norm": 4.276401519775391, + "learning_rate": 1.8310084878037093e-08, + "loss": 1.5748, + "step": 13191 + }, + { + "epoch": 0.9930183104687705, + "grad_norm": 5.668478488922119, + "learning_rate": 1.7981679213247005e-08, + "loss": 1.8071, + "step": 13192 + }, + { + "epoch": 0.9930935847494308, + "grad_norm": 4.245683193206787, + "learning_rate": 1.765624483603956e-08, + "loss": 1.2246, + "step": 13193 + }, + { + "epoch": 0.9931688590300909, + "grad_norm": 5.264678955078125, + "learning_rate": 1.733378176574929e-08, + "loss": 1.4358, + "step": 13194 + }, + { + "epoch": 0.993244133310751, + "grad_norm": 7.62374210357666, + "learning_rate": 1.70142900215442e-08, + "loss": 1.7747, + "step": 13195 + }, + { + "epoch": 0.9933194075914112, + "grad_norm": 4.2350664138793945, + "learning_rate": 1.6697769622431303e-08, + "loss": 1.818, + "step": 13196 + }, + { + "epoch": 0.9933946818720714, + "grad_norm": 5.310856342315674, + "learning_rate": 1.6384220587212228e-08, + "loss": 1.7176, + "step": 13197 + }, + { + "epoch": 0.9934699561527315, + "grad_norm": 6.541186332702637, + "learning_rate": 1.6073642934533172e-08, + "loss": 1.9554, + "step": 13198 + }, + { + "epoch": 0.9935452304333917, + "grad_norm": 6.157875061035156, + "learning_rate": 1.5766036682857142e-08, + "loss": 1.4948, + "step": 13199 + }, + { + "epoch": 0.9936205047140518, + "grad_norm": 6.176355361938477, + "learning_rate": 1.5461401850463964e-08, + "loss": 1.6552, + "step": 13200 + }, + { + "epoch": 0.993695778994712, + "grad_norm": 4.338268280029297, + "learning_rate": 1.5159738455478022e-08, + "loss": 1.5791, + "step": 13201 + }, + { + "epoch": 0.9937710532753722, + "grad_norm": 4.909199237823486, + "learning_rate": 1.4861046515818322e-08, + "loss": 2.1224, + "step": 13202 + }, + { + "epoch": 0.9938463275560323, + "grad_norm": 5.564504623413086, + "learning_rate": 1.4565326049242877e-08, + "loss": 1.579, + "step": 13203 + }, + { + "epoch": 0.9939216018366924, + "grad_norm": 6.53950309753418, + "learning_rate": 1.4272577073337624e-08, + "loss": 1.5207, + "step": 13204 + }, + { + "epoch": 0.9939968761173527, + "grad_norm": 4.690648078918457, + "learning_rate": 1.3982799605505303e-08, + "loss": 1.8126, + "step": 13205 + }, + { + "epoch": 0.9940721503980128, + "grad_norm": 6.9517316818237305, + "learning_rate": 1.3695993662965478e-08, + "loss": 1.7874, + "step": 13206 + }, + { + "epoch": 0.9941474246786729, + "grad_norm": 7.462461948394775, + "learning_rate": 1.3412159262771174e-08, + "loss": 1.6961, + "step": 13207 + }, + { + "epoch": 0.994222698959333, + "grad_norm": 6.088038921356201, + "learning_rate": 1.3131296421797778e-08, + "loss": 1.9404, + "step": 13208 + }, + { + "epoch": 0.9942979732399932, + "grad_norm": 4.753822326660156, + "learning_rate": 1.2853405156743048e-08, + "loss": 2.0245, + "step": 13209 + }, + { + "epoch": 0.9943732475206534, + "grad_norm": 4.652671813964844, + "learning_rate": 1.257848548412155e-08, + "loss": 1.8277, + "step": 13210 + }, + { + "epoch": 0.9944485218013135, + "grad_norm": 6.292595863342285, + "learning_rate": 1.2306537420281316e-08, + "loss": 1.5606, + "step": 13211 + }, + { + "epoch": 0.9945237960819737, + "grad_norm": 5.574264049530029, + "learning_rate": 1.2037560981381646e-08, + "loss": 1.7578, + "step": 13212 + }, + { + "epoch": 0.9945990703626338, + "grad_norm": 4.798356533050537, + "learning_rate": 1.177155618342085e-08, + "loss": 1.4871, + "step": 13213 + }, + { + "epoch": 0.994674344643294, + "grad_norm": 4.483429908752441, + "learning_rate": 1.1508523042214058e-08, + "loss": 1.7969, + "step": 13214 + }, + { + "epoch": 0.9947496189239542, + "grad_norm": 5.526252746582031, + "learning_rate": 1.1248461573382108e-08, + "loss": 1.9078, + "step": 13215 + }, + { + "epoch": 0.9948248932046143, + "grad_norm": 5.312394618988037, + "learning_rate": 1.0991371792401506e-08, + "loss": 1.9027, + "step": 13216 + }, + { + "epoch": 0.9949001674852744, + "grad_norm": 5.9386467933654785, + "learning_rate": 1.0737253714548923e-08, + "loss": 1.389, + "step": 13217 + }, + { + "epoch": 0.9949754417659347, + "grad_norm": 4.037280082702637, + "learning_rate": 1.0486107354928942e-08, + "loss": 1.6321, + "step": 13218 + }, + { + "epoch": 0.9950507160465948, + "grad_norm": 6.371796131134033, + "learning_rate": 1.0237932728479616e-08, + "loss": 1.879, + "step": 13219 + }, + { + "epoch": 0.9951259903272549, + "grad_norm": 7.081063270568848, + "learning_rate": 9.992729849944704e-09, + "loss": 1.5863, + "step": 13220 + }, + { + "epoch": 0.9952012646079151, + "grad_norm": 4.47339391708374, + "learning_rate": 9.750498733901436e-09, + "loss": 1.6295, + "step": 13221 + }, + { + "epoch": 0.9952765388885753, + "grad_norm": 4.612218379974365, + "learning_rate": 9.511239394754955e-09, + "loss": 1.9312, + "step": 13222 + }, + { + "epoch": 0.9953518131692354, + "grad_norm": 4.369223594665527, + "learning_rate": 9.27495184672722e-09, + "loss": 1.6171, + "step": 13223 + }, + { + "epoch": 0.9954270874498956, + "grad_norm": 7.309564113616943, + "learning_rate": 9.04163610386255e-09, + "loss": 2.064, + "step": 13224 + }, + { + "epoch": 0.9955023617305557, + "grad_norm": 4.090071678161621, + "learning_rate": 8.811292180038733e-09, + "loss": 1.5285, + "step": 13225 + }, + { + "epoch": 0.9955776360112158, + "grad_norm": 6.835105895996094, + "learning_rate": 8.583920088933717e-09, + "loss": 1.5129, + "step": 13226 + }, + { + "epoch": 0.995652910291876, + "grad_norm": 4.884006500244141, + "learning_rate": 8.35951984408112e-09, + "loss": 1.5977, + "step": 13227 + }, + { + "epoch": 0.9957281845725362, + "grad_norm": 6.8290863037109375, + "learning_rate": 8.138091458809172e-09, + "loss": 1.8848, + "step": 13228 + }, + { + "epoch": 0.9958034588531963, + "grad_norm": 8.762774467468262, + "learning_rate": 7.919634946285115e-09, + "loss": 1.9743, + "step": 13229 + }, + { + "epoch": 0.9958787331338564, + "grad_norm": 5.470524787902832, + "learning_rate": 7.704150319493008e-09, + "loss": 2.0906, + "step": 13230 + }, + { + "epoch": 0.9959540074145167, + "grad_norm": 5.067729949951172, + "learning_rate": 7.491637591250378e-09, + "loss": 1.4118, + "step": 13231 + }, + { + "epoch": 0.9960292816951768, + "grad_norm": 5.369177341461182, + "learning_rate": 7.282096774180458e-09, + "loss": 1.8995, + "step": 13232 + }, + { + "epoch": 0.9961045559758369, + "grad_norm": 8.151293754577637, + "learning_rate": 7.075527880751054e-09, + "loss": 1.9689, + "step": 13233 + }, + { + "epoch": 0.9961798302564971, + "grad_norm": 4.490253925323486, + "learning_rate": 6.87193092323013e-09, + "loss": 1.737, + "step": 13234 + }, + { + "epoch": 0.9962551045371573, + "grad_norm": 4.734788417816162, + "learning_rate": 6.671305913724668e-09, + "loss": 1.6175, + "step": 13235 + }, + { + "epoch": 0.9963303788178174, + "grad_norm": 4.815192699432373, + "learning_rate": 6.473652864169566e-09, + "loss": 2.1594, + "step": 13236 + }, + { + "epoch": 0.9964056530984776, + "grad_norm": 5.325466156005859, + "learning_rate": 6.278971786305432e-09, + "loss": 1.4389, + "step": 13237 + }, + { + "epoch": 0.9964809273791377, + "grad_norm": 5.544800281524658, + "learning_rate": 6.087262691706341e-09, + "loss": 1.6178, + "step": 13238 + }, + { + "epoch": 0.9965562016597979, + "grad_norm": 6.2679266929626465, + "learning_rate": 5.898525591774284e-09, + "loss": 1.5716, + "step": 13239 + }, + { + "epoch": 0.9966314759404581, + "grad_norm": 6.059432029724121, + "learning_rate": 5.712760497722514e-09, + "loss": 1.6418, + "step": 13240 + }, + { + "epoch": 0.9967067502211182, + "grad_norm": 7.546961307525635, + "learning_rate": 5.529967420597748e-09, + "loss": 1.7474, + "step": 13241 + }, + { + "epoch": 0.9967820245017783, + "grad_norm": 5.917884349822998, + "learning_rate": 5.350146371269071e-09, + "loss": 2.1466, + "step": 13242 + }, + { + "epoch": 0.9968572987824386, + "grad_norm": 5.187016010284424, + "learning_rate": 5.17329736042238e-09, + "loss": 2.0016, + "step": 13243 + }, + { + "epoch": 0.9969325730630987, + "grad_norm": 5.7227606773376465, + "learning_rate": 4.999420398571486e-09, + "loss": 1.7272, + "step": 13244 + }, + { + "epoch": 0.9970078473437588, + "grad_norm": 5.843156814575195, + "learning_rate": 4.828515496052566e-09, + "loss": 2.0979, + "step": 13245 + }, + { + "epoch": 0.9970831216244189, + "grad_norm": 6.642384052276611, + "learning_rate": 4.66058266302416e-09, + "loss": 2.0105, + "step": 13246 + }, + { + "epoch": 0.9971583959050792, + "grad_norm": 5.34022331237793, + "learning_rate": 4.495621909478276e-09, + "loss": 1.658, + "step": 13247 + }, + { + "epoch": 0.9972336701857393, + "grad_norm": 5.263411521911621, + "learning_rate": 4.3336332452070805e-09, + "loss": 1.635, + "step": 13248 + }, + { + "epoch": 0.9973089444663994, + "grad_norm": 7.458767414093018, + "learning_rate": 4.174616679852861e-09, + "loss": 1.8832, + "step": 13249 + }, + { + "epoch": 0.9973842187470596, + "grad_norm": 5.247926235198975, + "learning_rate": 4.018572222858064e-09, + "loss": 1.83, + "step": 13250 + }, + { + "epoch": 0.9974594930277197, + "grad_norm": 4.568474292755127, + "learning_rate": 3.865499883509704e-09, + "loss": 1.5774, + "step": 13251 + }, + { + "epoch": 0.9975347673083799, + "grad_norm": 4.635526657104492, + "learning_rate": 3.71539967090051e-09, + "loss": 1.5594, + "step": 13252 + }, + { + "epoch": 0.9976100415890401, + "grad_norm": 6.023122310638428, + "learning_rate": 3.5682715939566737e-09, + "loss": 1.2635, + "step": 13253 + }, + { + "epoch": 0.9976853158697002, + "grad_norm": 5.974853038787842, + "learning_rate": 3.424115661421201e-09, + "loss": 1.7406, + "step": 13254 + }, + { + "epoch": 0.9977605901503603, + "grad_norm": 5.682826042175293, + "learning_rate": 3.2829318818650144e-09, + "loss": 1.656, + "step": 13255 + }, + { + "epoch": 0.9978358644310206, + "grad_norm": 4.203394889831543, + "learning_rate": 3.144720263686951e-09, + "loss": 1.2487, + "step": 13256 + }, + { + "epoch": 0.9979111387116807, + "grad_norm": 7.662532329559326, + "learning_rate": 3.009480815091559e-09, + "loss": 2.0305, + "step": 13257 + }, + { + "epoch": 0.9979864129923408, + "grad_norm": 6.0161213874816895, + "learning_rate": 2.8772135441279546e-09, + "loss": 1.6108, + "step": 13258 + }, + { + "epoch": 0.998061687273001, + "grad_norm": 8.421697616577148, + "learning_rate": 2.7479184586620688e-09, + "loss": 1.9172, + "step": 13259 + }, + { + "epoch": 0.9981369615536612, + "grad_norm": 7.240542888641357, + "learning_rate": 2.621595566365542e-09, + "loss": 1.4668, + "step": 13260 + }, + { + "epoch": 0.9982122358343213, + "grad_norm": 6.9801435470581055, + "learning_rate": 2.498244874760136e-09, + "loss": 1.6899, + "step": 13261 + }, + { + "epoch": 0.9982875101149815, + "grad_norm": 4.888098239898682, + "learning_rate": 2.3778663911733222e-09, + "loss": 1.5554, + "step": 13262 + }, + { + "epoch": 0.9983627843956416, + "grad_norm": 5.449404239654541, + "learning_rate": 2.2604601227660394e-09, + "loss": 1.7882, + "step": 13263 + }, + { + "epoch": 0.9984380586763018, + "grad_norm": 5.648730278015137, + "learning_rate": 2.146026076510488e-09, + "loss": 1.8965, + "step": 13264 + }, + { + "epoch": 0.998513332956962, + "grad_norm": 6.069187164306641, + "learning_rate": 2.034564259217886e-09, + "loss": 1.8282, + "step": 13265 + }, + { + "epoch": 0.9985886072376221, + "grad_norm": 5.236371040344238, + "learning_rate": 1.926074677510714e-09, + "loss": 1.6175, + "step": 13266 + }, + { + "epoch": 0.9986638815182822, + "grad_norm": 7.428731441497803, + "learning_rate": 1.820557337833817e-09, + "loss": 1.5625, + "step": 13267 + }, + { + "epoch": 0.9987391557989423, + "grad_norm": 6.343943119049072, + "learning_rate": 1.7180122464655057e-09, + "loss": 1.7361, + "step": 13268 + }, + { + "epoch": 0.9988144300796026, + "grad_norm": 4.76800537109375, + "learning_rate": 1.6184394095009048e-09, + "loss": 1.3939, + "step": 13269 + }, + { + "epoch": 0.9988897043602627, + "grad_norm": 5.303952693939209, + "learning_rate": 1.521838832863054e-09, + "loss": 1.8384, + "step": 13270 + }, + { + "epoch": 0.9989649786409228, + "grad_norm": 5.326231002807617, + "learning_rate": 1.4282105222862553e-09, + "loss": 1.9399, + "step": 13271 + }, + { + "epoch": 0.999040252921583, + "grad_norm": 4.362345218658447, + "learning_rate": 1.3375544833382769e-09, + "loss": 1.8648, + "step": 13272 + }, + { + "epoch": 0.9991155272022432, + "grad_norm": 5.289018630981445, + "learning_rate": 1.249870721409252e-09, + "loss": 1.6075, + "step": 13273 + }, + { + "epoch": 0.9991908014829033, + "grad_norm": 7.3560028076171875, + "learning_rate": 1.1651592417172285e-09, + "loss": 1.9866, + "step": 13274 + }, + { + "epoch": 0.9992660757635635, + "grad_norm": 6.750360488891602, + "learning_rate": 1.0834200492915169e-09, + "loss": 2.0521, + "step": 13275 + }, + { + "epoch": 0.9993413500442236, + "grad_norm": 4.349361896514893, + "learning_rate": 1.004653148994894e-09, + "loss": 1.7502, + "step": 13276 + }, + { + "epoch": 0.9994166243248838, + "grad_norm": 4.751714706420898, + "learning_rate": 9.288585455069498e-10, + "loss": 1.8379, + "step": 13277 + }, + { + "epoch": 0.999491898605544, + "grad_norm": 3.7782351970672607, + "learning_rate": 8.560362433351898e-10, + "loss": 1.7802, + "step": 13278 + }, + { + "epoch": 0.9995671728862041, + "grad_norm": 6.511214256286621, + "learning_rate": 7.861862468094838e-10, + "loss": 1.531, + "step": 13279 + }, + { + "epoch": 0.9996424471668642, + "grad_norm": 5.865057945251465, + "learning_rate": 7.193085600820659e-10, + "loss": 1.6823, + "step": 13280 + }, + { + "epoch": 0.9997177214475245, + "grad_norm": 6.277799606323242, + "learning_rate": 6.554031871275345e-10, + "loss": 1.9129, + "step": 13281 + }, + { + "epoch": 0.9997929957281846, + "grad_norm": 5.464688777923584, + "learning_rate": 5.944701317428525e-10, + "loss": 1.752, + "step": 13282 + }, + { + "epoch": 0.9998682700088447, + "grad_norm": 4.103088855743408, + "learning_rate": 5.365093975528979e-10, + "loss": 1.6882, + "step": 13283 + }, + { + "epoch": 0.9999435442895049, + "grad_norm": 6.382748603820801, + "learning_rate": 4.815209880049132e-10, + "loss": 1.7416, + "step": 13284 + }, + { + "epoch": 0.9999435442895049, + "step": 13284, + "total_flos": 3.033057184768957e+19, + "train_loss": 0.0, + "train_runtime": 0.029, + "train_samples_per_second": 7332847.669, + "train_steps_per_second": 458279.262 + } + ], + "logging_steps": 1.0, + "max_steps": 13284, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.033057184768957e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}