diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,93030 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999435442895049, + "eval_steps": 500, + "global_step": 13284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.527428066015545e-05, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.3058, + "step": 1 + }, + { + "epoch": 0.0001505485613203109, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.2167, + "step": 2 + }, + { + "epoch": 0.0002258228419804663, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.0158, + "step": 3 + }, + { + "epoch": 0.0003010971226406218, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 10.9581, + "step": 4 + }, + { + "epoch": 0.00037637140330077723, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 11.0368, + "step": 5 + }, + { + "epoch": 0.0004516456839609326, + "grad_norm": 101.22171020507812, + "learning_rate": 2.506265664160401e-07, + "loss": 11.0195, + "step": 6 + }, + { + "epoch": 0.0005269199646210881, + "grad_norm": 101.00434875488281, + "learning_rate": 5.012531328320802e-07, + "loss": 10.8547, + "step": 7 + }, + { + "epoch": 0.0006021942452812436, + "grad_norm": 101.00434875488281, + "learning_rate": 5.012531328320802e-07, + "loss": 11.2062, + "step": 8 + }, + { + "epoch": 0.000677468525941399, + "grad_norm": 100.9647216796875, + "learning_rate": 7.518796992481203e-07, + "loss": 11.0756, + "step": 9 + }, + { + "epoch": 0.0007527428066015545, + "grad_norm": 98.53367614746094, + "learning_rate": 1.0025062656641603e-06, + "loss": 10.9468, + "step": 10 + }, + { + "epoch": 0.0008280170872617098, + "grad_norm": 103.55094146728516, + "learning_rate": 1.2531328320802005e-06, + "loss": 10.6497, + "step": 11 + }, + { + "epoch": 0.0009032913679218653, + "grad_norm": 101.98058319091797, + "learning_rate": 1.5037593984962406e-06, + "loss": 11.0655, + "step": 12 + }, + { + "epoch": 0.0009785656485820208, + "grad_norm": 99.74136352539062, + "learning_rate": 1.7543859649122807e-06, + "loss": 11.0013, + "step": 13 + }, + { + "epoch": 0.0010538399292421763, + "grad_norm": 102.27731323242188, + "learning_rate": 2.0050125313283207e-06, + "loss": 11.1922, + "step": 14 + }, + { + "epoch": 0.0011291142099023317, + "grad_norm": 103.02958679199219, + "learning_rate": 2.255639097744361e-06, + "loss": 10.9353, + "step": 15 + }, + { + "epoch": 0.0012043884905624871, + "grad_norm": 102.86016082763672, + "learning_rate": 2.506265664160401e-06, + "loss": 10.8083, + "step": 16 + }, + { + "epoch": 0.0012796627712226426, + "grad_norm": 99.78475952148438, + "learning_rate": 2.7568922305764413e-06, + "loss": 10.789, + "step": 17 + }, + { + "epoch": 0.001354937051882798, + "grad_norm": 97.88682556152344, + "learning_rate": 3.007518796992481e-06, + "loss": 10.6574, + "step": 18 + }, + { + "epoch": 0.0014302113325429535, + "grad_norm": 94.35863494873047, + "learning_rate": 3.258145363408521e-06, + "loss": 10.2311, + "step": 19 + }, + { + "epoch": 0.001505485613203109, + "grad_norm": 97.56143951416016, + "learning_rate": 3.5087719298245615e-06, + "loss": 10.3029, + "step": 20 + }, + { + "epoch": 0.0015807598938632642, + "grad_norm": 93.58026123046875, + "learning_rate": 3.7593984962406014e-06, + "loss": 10.1692, + "step": 21 + }, + { + "epoch": 0.0016560341745234196, + "grad_norm": 94.10589599609375, + "learning_rate": 4.010025062656641e-06, + "loss": 9.9597, + "step": 22 + }, + { + "epoch": 0.001731308455183575, + "grad_norm": 91.92644500732422, + "learning_rate": 4.260651629072682e-06, + "loss": 9.6604, + "step": 23 + }, + { + "epoch": 0.0018065827358437305, + "grad_norm": 91.24312591552734, + "learning_rate": 4.511278195488722e-06, + "loss": 9.8507, + "step": 24 + }, + { + "epoch": 0.001881857016503886, + "grad_norm": 91.24312591552734, + "learning_rate": 4.511278195488722e-06, + "loss": 9.0144, + "step": 25 + }, + { + "epoch": 0.0019571312971640416, + "grad_norm": 91.44805145263672, + "learning_rate": 4.7619047619047615e-06, + "loss": 9.6564, + "step": 26 + }, + { + "epoch": 0.002032405577824197, + "grad_norm": 88.8994140625, + "learning_rate": 5.012531328320802e-06, + "loss": 9.2601, + "step": 27 + }, + { + "epoch": 0.0021076798584843525, + "grad_norm": 85.07931518554688, + "learning_rate": 5.263157894736842e-06, + "loss": 9.1414, + "step": 28 + }, + { + "epoch": 0.002182954139144508, + "grad_norm": 91.28398132324219, + "learning_rate": 5.5137844611528826e-06, + "loss": 9.0763, + "step": 29 + }, + { + "epoch": 0.0022582284198046634, + "grad_norm": 82.28803253173828, + "learning_rate": 5.764411027568922e-06, + "loss": 8.6692, + "step": 30 + }, + { + "epoch": 0.002333502700464819, + "grad_norm": 80.47472381591797, + "learning_rate": 6.015037593984962e-06, + "loss": 8.3735, + "step": 31 + }, + { + "epoch": 0.0024087769811249743, + "grad_norm": 80.88319396972656, + "learning_rate": 6.265664160401003e-06, + "loss": 7.9273, + "step": 32 + }, + { + "epoch": 0.0024840512617851297, + "grad_norm": 77.2528076171875, + "learning_rate": 6.516290726817042e-06, + "loss": 8.8538, + "step": 33 + }, + { + "epoch": 0.002559325542445285, + "grad_norm": 80.05760192871094, + "learning_rate": 6.766917293233083e-06, + "loss": 8.0052, + "step": 34 + }, + { + "epoch": 0.0026345998231054406, + "grad_norm": 77.98321533203125, + "learning_rate": 7.017543859649123e-06, + "loss": 7.9527, + "step": 35 + }, + { + "epoch": 0.002709874103765596, + "grad_norm": 76.9040756225586, + "learning_rate": 7.2681704260651625e-06, + "loss": 7.5808, + "step": 36 + }, + { + "epoch": 0.0027851483844257515, + "grad_norm": 74.9203109741211, + "learning_rate": 7.518796992481203e-06, + "loss": 7.2887, + "step": 37 + }, + { + "epoch": 0.002860422665085907, + "grad_norm": 71.68769836425781, + "learning_rate": 7.769423558897243e-06, + "loss": 7.2107, + "step": 38 + }, + { + "epoch": 0.0029356969457460624, + "grad_norm": 72.97769927978516, + "learning_rate": 8.020050125313283e-06, + "loss": 6.7856, + "step": 39 + }, + { + "epoch": 0.003010971226406218, + "grad_norm": 67.87655639648438, + "learning_rate": 8.270676691729324e-06, + "loss": 7.0611, + "step": 40 + }, + { + "epoch": 0.0030862455070663733, + "grad_norm": 70.81007385253906, + "learning_rate": 8.521303258145363e-06, + "loss": 6.8754, + "step": 41 + }, + { + "epoch": 0.0031615197877265283, + "grad_norm": 65.32673645019531, + "learning_rate": 8.771929824561403e-06, + "loss": 6.6728, + "step": 42 + }, + { + "epoch": 0.0032367940683866838, + "grad_norm": 61.19737243652344, + "learning_rate": 9.022556390977444e-06, + "loss": 6.531, + "step": 43 + }, + { + "epoch": 0.0033120683490468392, + "grad_norm": 59.31134033203125, + "learning_rate": 9.273182957393484e-06, + "loss": 6.202, + "step": 44 + }, + { + "epoch": 0.0033873426297069947, + "grad_norm": 63.7019157409668, + "learning_rate": 9.523809523809523e-06, + "loss": 6.1788, + "step": 45 + }, + { + "epoch": 0.00346261691036715, + "grad_norm": 56.24123764038086, + "learning_rate": 9.774436090225564e-06, + "loss": 6.3614, + "step": 46 + }, + { + "epoch": 0.0035378911910273056, + "grad_norm": 54.28101348876953, + "learning_rate": 1.0025062656641604e-05, + "loss": 5.5329, + "step": 47 + }, + { + "epoch": 0.003613165471687461, + "grad_norm": 54.67324447631836, + "learning_rate": 1.0275689223057643e-05, + "loss": 5.4588, + "step": 48 + }, + { + "epoch": 0.0036884397523476165, + "grad_norm": 55.15860366821289, + "learning_rate": 1.0526315789473684e-05, + "loss": 5.4524, + "step": 49 + }, + { + "epoch": 0.003763714033007772, + "grad_norm": 51.7996711730957, + "learning_rate": 1.0776942355889724e-05, + "loss": 5.0809, + "step": 50 + }, + { + "epoch": 0.0038389883136679273, + "grad_norm": 50.0368537902832, + "learning_rate": 1.1027568922305765e-05, + "loss": 4.9969, + "step": 51 + }, + { + "epoch": 0.003914262594328083, + "grad_norm": 51.43173599243164, + "learning_rate": 1.1278195488721805e-05, + "loss": 4.9582, + "step": 52 + }, + { + "epoch": 0.003989536874988238, + "grad_norm": 48.52267837524414, + "learning_rate": 1.1528822055137844e-05, + "loss": 5.2144, + "step": 53 + }, + { + "epoch": 0.004064811155648394, + "grad_norm": 45.86253356933594, + "learning_rate": 1.1779448621553885e-05, + "loss": 4.963, + "step": 54 + }, + { + "epoch": 0.004140085436308549, + "grad_norm": 46.87076187133789, + "learning_rate": 1.2030075187969925e-05, + "loss": 4.8069, + "step": 55 + }, + { + "epoch": 0.004215359716968705, + "grad_norm": 43.505149841308594, + "learning_rate": 1.2280701754385964e-05, + "loss": 4.4433, + "step": 56 + }, + { + "epoch": 0.00429063399762886, + "grad_norm": 42.096683502197266, + "learning_rate": 1.2531328320802006e-05, + "loss": 4.5141, + "step": 57 + }, + { + "epoch": 0.004365908278289016, + "grad_norm": 41.87490463256836, + "learning_rate": 1.2781954887218045e-05, + "loss": 4.4574, + "step": 58 + }, + { + "epoch": 0.004441182558949171, + "grad_norm": 40.193180084228516, + "learning_rate": 1.3032581453634085e-05, + "loss": 3.9994, + "step": 59 + }, + { + "epoch": 0.004516456839609327, + "grad_norm": 42.521270751953125, + "learning_rate": 1.3283208020050126e-05, + "loss": 4.0917, + "step": 60 + }, + { + "epoch": 0.004591731120269482, + "grad_norm": 37.18898010253906, + "learning_rate": 1.3533834586466165e-05, + "loss": 4.3226, + "step": 61 + }, + { + "epoch": 0.004667005400929638, + "grad_norm": 36.022308349609375, + "learning_rate": 1.3784461152882205e-05, + "loss": 4.0419, + "step": 62 + }, + { + "epoch": 0.004742279681589793, + "grad_norm": 30.037837982177734, + "learning_rate": 1.4035087719298246e-05, + "loss": 4.033, + "step": 63 + }, + { + "epoch": 0.004817553962249949, + "grad_norm": 30.106727600097656, + "learning_rate": 1.4285714285714285e-05, + "loss": 3.7006, + "step": 64 + }, + { + "epoch": 0.004892828242910104, + "grad_norm": 27.95155143737793, + "learning_rate": 1.4536340852130325e-05, + "loss": 4.0383, + "step": 65 + }, + { + "epoch": 0.0049681025235702595, + "grad_norm": 31.840641021728516, + "learning_rate": 1.4786967418546366e-05, + "loss": 3.7578, + "step": 66 + }, + { + "epoch": 0.0050433768042304145, + "grad_norm": 26.964170455932617, + "learning_rate": 1.5037593984962406e-05, + "loss": 3.8204, + "step": 67 + }, + { + "epoch": 0.00511865108489057, + "grad_norm": 28.508447647094727, + "learning_rate": 1.5288220551378447e-05, + "loss": 4.0668, + "step": 68 + }, + { + "epoch": 0.005193925365550725, + "grad_norm": 27.509992599487305, + "learning_rate": 1.5538847117794486e-05, + "loss": 3.3308, + "step": 69 + }, + { + "epoch": 0.005269199646210881, + "grad_norm": 27.621871948242188, + "learning_rate": 1.5789473684210526e-05, + "loss": 3.569, + "step": 70 + }, + { + "epoch": 0.005344473926871036, + "grad_norm": 26.775339126586914, + "learning_rate": 1.6040100250626565e-05, + "loss": 3.6699, + "step": 71 + }, + { + "epoch": 0.005419748207531192, + "grad_norm": 27.39186668395996, + "learning_rate": 1.6290726817042605e-05, + "loss": 3.4824, + "step": 72 + }, + { + "epoch": 0.005495022488191347, + "grad_norm": 26.232194900512695, + "learning_rate": 1.6541353383458648e-05, + "loss": 3.3197, + "step": 73 + }, + { + "epoch": 0.005570296768851503, + "grad_norm": 26.70931053161621, + "learning_rate": 1.6791979949874687e-05, + "loss": 3.3354, + "step": 74 + }, + { + "epoch": 0.005645571049511658, + "grad_norm": 28.280254364013672, + "learning_rate": 1.7042606516290727e-05, + "loss": 3.1711, + "step": 75 + }, + { + "epoch": 0.005720845330171814, + "grad_norm": 25.6878662109375, + "learning_rate": 1.7293233082706766e-05, + "loss": 3.3231, + "step": 76 + }, + { + "epoch": 0.005796119610831969, + "grad_norm": 26.246875762939453, + "learning_rate": 1.7543859649122806e-05, + "loss": 3.171, + "step": 77 + }, + { + "epoch": 0.005871393891492125, + "grad_norm": 26.477882385253906, + "learning_rate": 1.779448621553885e-05, + "loss": 2.823, + "step": 78 + }, + { + "epoch": 0.00594666817215228, + "grad_norm": 24.533912658691406, + "learning_rate": 1.8045112781954888e-05, + "loss": 3.3136, + "step": 79 + }, + { + "epoch": 0.006021942452812436, + "grad_norm": 22.89409065246582, + "learning_rate": 1.8295739348370928e-05, + "loss": 2.9865, + "step": 80 + }, + { + "epoch": 0.006097216733472591, + "grad_norm": 23.302785873413086, + "learning_rate": 1.8546365914786967e-05, + "loss": 2.8005, + "step": 81 + }, + { + "epoch": 0.006172491014132747, + "grad_norm": 24.13874626159668, + "learning_rate": 1.8796992481203007e-05, + "loss": 2.8054, + "step": 82 + }, + { + "epoch": 0.006247765294792902, + "grad_norm": 21.652523040771484, + "learning_rate": 1.9047619047619046e-05, + "loss": 3.3006, + "step": 83 + }, + { + "epoch": 0.006323039575453057, + "grad_norm": 23.442100524902344, + "learning_rate": 1.929824561403509e-05, + "loss": 3.1333, + "step": 84 + }, + { + "epoch": 0.0063983138561132125, + "grad_norm": 20.342926025390625, + "learning_rate": 1.954887218045113e-05, + "loss": 3.1004, + "step": 85 + }, + { + "epoch": 0.0064735881367733675, + "grad_norm": 21.158906936645508, + "learning_rate": 1.9799498746867168e-05, + "loss": 2.8283, + "step": 86 + }, + { + "epoch": 0.006548862417433523, + "grad_norm": 21.205425262451172, + "learning_rate": 2.0050125313283208e-05, + "loss": 2.8407, + "step": 87 + }, + { + "epoch": 0.0066241366980936784, + "grad_norm": 16.539318084716797, + "learning_rate": 2.0300751879699247e-05, + "loss": 2.992, + "step": 88 + }, + { + "epoch": 0.006699410978753834, + "grad_norm": 15.958436012268066, + "learning_rate": 2.0551378446115287e-05, + "loss": 2.7528, + "step": 89 + }, + { + "epoch": 0.006774685259413989, + "grad_norm": 16.10943603515625, + "learning_rate": 2.080200501253133e-05, + "loss": 2.9206, + "step": 90 + }, + { + "epoch": 0.006849959540074145, + "grad_norm": 13.623184204101562, + "learning_rate": 2.105263157894737e-05, + "loss": 2.8539, + "step": 91 + }, + { + "epoch": 0.0069252338207343, + "grad_norm": 13.710868835449219, + "learning_rate": 2.130325814536341e-05, + "loss": 2.8377, + "step": 92 + }, + { + "epoch": 0.007000508101394456, + "grad_norm": 14.358534812927246, + "learning_rate": 2.1553884711779448e-05, + "loss": 2.971, + "step": 93 + }, + { + "epoch": 0.007075782382054611, + "grad_norm": 13.918020248413086, + "learning_rate": 2.1804511278195487e-05, + "loss": 2.6541, + "step": 94 + }, + { + "epoch": 0.007151056662714767, + "grad_norm": 12.453470230102539, + "learning_rate": 2.205513784461153e-05, + "loss": 2.9524, + "step": 95 + }, + { + "epoch": 0.007226330943374922, + "grad_norm": 10.925068855285645, + "learning_rate": 2.230576441102757e-05, + "loss": 2.789, + "step": 96 + }, + { + "epoch": 0.007301605224035078, + "grad_norm": 11.36191177368164, + "learning_rate": 2.255639097744361e-05, + "loss": 2.7477, + "step": 97 + }, + { + "epoch": 0.007376879504695233, + "grad_norm": 9.909281730651855, + "learning_rate": 2.280701754385965e-05, + "loss": 2.981, + "step": 98 + }, + { + "epoch": 0.007452153785355389, + "grad_norm": 11.188608169555664, + "learning_rate": 2.3057644110275688e-05, + "loss": 2.6559, + "step": 99 + }, + { + "epoch": 0.007527428066015544, + "grad_norm": 12.857466697692871, + "learning_rate": 2.3308270676691728e-05, + "loss": 2.6402, + "step": 100 + }, + { + "epoch": 0.0076027023466757, + "grad_norm": 12.022490501403809, + "learning_rate": 2.355889724310777e-05, + "loss": 2.5045, + "step": 101 + }, + { + "epoch": 0.007677976627335855, + "grad_norm": 9.543107986450195, + "learning_rate": 2.380952380952381e-05, + "loss": 2.876, + "step": 102 + }, + { + "epoch": 0.007753250907996011, + "grad_norm": 11.801715850830078, + "learning_rate": 2.406015037593985e-05, + "loss": 2.9806, + "step": 103 + }, + { + "epoch": 0.007828525188656166, + "grad_norm": 13.703426361083984, + "learning_rate": 2.431077694235589e-05, + "loss": 2.642, + "step": 104 + }, + { + "epoch": 0.007903799469316321, + "grad_norm": 11.499256134033203, + "learning_rate": 2.456140350877193e-05, + "loss": 2.8738, + "step": 105 + }, + { + "epoch": 0.007979073749976476, + "grad_norm": 12.320425987243652, + "learning_rate": 2.4812030075187968e-05, + "loss": 2.8757, + "step": 106 + }, + { + "epoch": 0.008054348030636631, + "grad_norm": 14.50576114654541, + "learning_rate": 2.506265664160401e-05, + "loss": 2.8903, + "step": 107 + }, + { + "epoch": 0.008129622311296788, + "grad_norm": 11.007486343383789, + "learning_rate": 2.531328320802005e-05, + "loss": 2.7056, + "step": 108 + }, + { + "epoch": 0.008204896591956943, + "grad_norm": 11.293839454650879, + "learning_rate": 2.556390977443609e-05, + "loss": 2.6704, + "step": 109 + }, + { + "epoch": 0.008280170872617098, + "grad_norm": 8.651300430297852, + "learning_rate": 2.581453634085213e-05, + "loss": 2.6453, + "step": 110 + }, + { + "epoch": 0.008355445153277253, + "grad_norm": 12.535172462463379, + "learning_rate": 2.606516290726817e-05, + "loss": 2.7358, + "step": 111 + }, + { + "epoch": 0.00843071943393741, + "grad_norm": 8.519335746765137, + "learning_rate": 2.6315789473684212e-05, + "loss": 2.9246, + "step": 112 + }, + { + "epoch": 0.008505993714597565, + "grad_norm": 9.05390739440918, + "learning_rate": 2.656641604010025e-05, + "loss": 2.8547, + "step": 113 + }, + { + "epoch": 0.00858126799525772, + "grad_norm": 11.525436401367188, + "learning_rate": 2.681704260651629e-05, + "loss": 2.6, + "step": 114 + }, + { + "epoch": 0.008656542275917875, + "grad_norm": 10.789581298828125, + "learning_rate": 2.706766917293233e-05, + "loss": 2.6484, + "step": 115 + }, + { + "epoch": 0.008731816556578032, + "grad_norm": 12.913826942443848, + "learning_rate": 2.731829573934837e-05, + "loss": 2.8547, + "step": 116 + }, + { + "epoch": 0.008807090837238187, + "grad_norm": 11.780866622924805, + "learning_rate": 2.756892230576441e-05, + "loss": 2.6838, + "step": 117 + }, + { + "epoch": 0.008882365117898342, + "grad_norm": 8.270967483520508, + "learning_rate": 2.7819548872180452e-05, + "loss": 2.5647, + "step": 118 + }, + { + "epoch": 0.008957639398558497, + "grad_norm": 7.76688814163208, + "learning_rate": 2.8070175438596492e-05, + "loss": 2.686, + "step": 119 + }, + { + "epoch": 0.009032913679218654, + "grad_norm": 9.221709251403809, + "learning_rate": 2.832080200501253e-05, + "loss": 2.9965, + "step": 120 + }, + { + "epoch": 0.009108187959878809, + "grad_norm": 11.874032974243164, + "learning_rate": 2.857142857142857e-05, + "loss": 3.1813, + "step": 121 + }, + { + "epoch": 0.009183462240538964, + "grad_norm": 12.074503898620605, + "learning_rate": 2.882205513784461e-05, + "loss": 3.005, + "step": 122 + }, + { + "epoch": 0.009258736521199119, + "grad_norm": 12.125741004943848, + "learning_rate": 2.907268170426065e-05, + "loss": 3.0457, + "step": 123 + }, + { + "epoch": 0.009334010801859275, + "grad_norm": 13.402585983276367, + "learning_rate": 2.9323308270676693e-05, + "loss": 2.9726, + "step": 124 + }, + { + "epoch": 0.00940928508251943, + "grad_norm": 7.172392845153809, + "learning_rate": 2.9573934837092732e-05, + "loss": 2.773, + "step": 125 + }, + { + "epoch": 0.009484559363179585, + "grad_norm": 8.764741897583008, + "learning_rate": 2.9824561403508772e-05, + "loss": 2.3307, + "step": 126 + }, + { + "epoch": 0.00955983364383974, + "grad_norm": 10.728087425231934, + "learning_rate": 3.007518796992481e-05, + "loss": 2.9004, + "step": 127 + }, + { + "epoch": 0.009635107924499897, + "grad_norm": 9.693150520324707, + "learning_rate": 3.032581453634085e-05, + "loss": 2.942, + "step": 128 + }, + { + "epoch": 0.009710382205160052, + "grad_norm": 11.793756484985352, + "learning_rate": 3.0576441102756894e-05, + "loss": 2.7575, + "step": 129 + }, + { + "epoch": 0.009785656485820207, + "grad_norm": 11.24767017364502, + "learning_rate": 3.082706766917293e-05, + "loss": 2.6284, + "step": 130 + }, + { + "epoch": 0.009860930766480362, + "grad_norm": 9.41115665435791, + "learning_rate": 3.107769423558897e-05, + "loss": 2.8344, + "step": 131 + }, + { + "epoch": 0.009936205047140519, + "grad_norm": 10.862749099731445, + "learning_rate": 3.132832080200501e-05, + "loss": 2.4733, + "step": 132 + }, + { + "epoch": 0.010011479327800674, + "grad_norm": 10.923384666442871, + "learning_rate": 3.157894736842105e-05, + "loss": 2.4103, + "step": 133 + }, + { + "epoch": 0.010086753608460829, + "grad_norm": 10.988842010498047, + "learning_rate": 3.182957393483709e-05, + "loss": 2.599, + "step": 134 + }, + { + "epoch": 0.010162027889120984, + "grad_norm": 10.643759727478027, + "learning_rate": 3.208020050125313e-05, + "loss": 3.0312, + "step": 135 + }, + { + "epoch": 0.01023730216978114, + "grad_norm": 9.693270683288574, + "learning_rate": 3.233082706766917e-05, + "loss": 2.7399, + "step": 136 + }, + { + "epoch": 0.010312576450441296, + "grad_norm": 10.290359497070312, + "learning_rate": 3.258145363408521e-05, + "loss": 2.8732, + "step": 137 + }, + { + "epoch": 0.01038785073110145, + "grad_norm": 12.482595443725586, + "learning_rate": 3.2832080200501256e-05, + "loss": 2.8354, + "step": 138 + }, + { + "epoch": 0.010463125011761606, + "grad_norm": 9.462326049804688, + "learning_rate": 3.3082706766917295e-05, + "loss": 2.6542, + "step": 139 + }, + { + "epoch": 0.010538399292421763, + "grad_norm": 10.323150634765625, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.433, + "step": 140 + }, + { + "epoch": 0.010613673573081918, + "grad_norm": 10.48134708404541, + "learning_rate": 3.3583959899749374e-05, + "loss": 2.6072, + "step": 141 + }, + { + "epoch": 0.010688947853742073, + "grad_norm": 14.575450897216797, + "learning_rate": 3.3834586466165414e-05, + "loss": 2.7018, + "step": 142 + }, + { + "epoch": 0.010764222134402228, + "grad_norm": 10.16996955871582, + "learning_rate": 3.4085213032581453e-05, + "loss": 2.5096, + "step": 143 + }, + { + "epoch": 0.010839496415062384, + "grad_norm": 14.478660583496094, + "learning_rate": 3.433583959899749e-05, + "loss": 3.074, + "step": 144 + }, + { + "epoch": 0.01091477069572254, + "grad_norm": 11.558063507080078, + "learning_rate": 3.458646616541353e-05, + "loss": 2.9335, + "step": 145 + }, + { + "epoch": 0.010990044976382694, + "grad_norm": 12.873056411743164, + "learning_rate": 3.483709273182957e-05, + "loss": 2.7112, + "step": 146 + }, + { + "epoch": 0.01106531925704285, + "grad_norm": 10.242127418518066, + "learning_rate": 3.508771929824561e-05, + "loss": 2.8176, + "step": 147 + }, + { + "epoch": 0.011140593537703006, + "grad_norm": 9.07247257232666, + "learning_rate": 3.533834586466165e-05, + "loss": 2.6257, + "step": 148 + }, + { + "epoch": 0.011215867818363161, + "grad_norm": 11.077366828918457, + "learning_rate": 3.55889724310777e-05, + "loss": 2.4143, + "step": 149 + }, + { + "epoch": 0.011291142099023316, + "grad_norm": 10.589329719543457, + "learning_rate": 3.583959899749374e-05, + "loss": 2.7472, + "step": 150 + }, + { + "epoch": 0.011366416379683471, + "grad_norm": 9.62755012512207, + "learning_rate": 3.6090225563909776e-05, + "loss": 2.802, + "step": 151 + }, + { + "epoch": 0.011441690660343628, + "grad_norm": 8.788288116455078, + "learning_rate": 3.6340852130325816e-05, + "loss": 2.4896, + "step": 152 + }, + { + "epoch": 0.011516964941003783, + "grad_norm": 10.032755851745605, + "learning_rate": 3.6591478696741855e-05, + "loss": 2.5718, + "step": 153 + }, + { + "epoch": 0.011592239221663938, + "grad_norm": 9.229397773742676, + "learning_rate": 3.6842105263157895e-05, + "loss": 2.7517, + "step": 154 + }, + { + "epoch": 0.011667513502324093, + "grad_norm": 8.434051513671875, + "learning_rate": 3.7092731829573934e-05, + "loss": 2.8381, + "step": 155 + }, + { + "epoch": 0.01174278778298425, + "grad_norm": 9.783924102783203, + "learning_rate": 3.7343358395989974e-05, + "loss": 2.7413, + "step": 156 + }, + { + "epoch": 0.011818062063644405, + "grad_norm": 9.114348411560059, + "learning_rate": 3.759398496240601e-05, + "loss": 2.7403, + "step": 157 + }, + { + "epoch": 0.01189333634430456, + "grad_norm": 12.778593063354492, + "learning_rate": 3.784461152882205e-05, + "loss": 2.7325, + "step": 158 + }, + { + "epoch": 0.011968610624964715, + "grad_norm": 8.32952880859375, + "learning_rate": 3.809523809523809e-05, + "loss": 2.8156, + "step": 159 + }, + { + "epoch": 0.012043884905624871, + "grad_norm": 12.271199226379395, + "learning_rate": 3.834586466165413e-05, + "loss": 2.7965, + "step": 160 + }, + { + "epoch": 0.012119159186285026, + "grad_norm": 11.134658813476562, + "learning_rate": 3.859649122807018e-05, + "loss": 2.5964, + "step": 161 + }, + { + "epoch": 0.012194433466945181, + "grad_norm": 9.325178146362305, + "learning_rate": 3.884711779448622e-05, + "loss": 2.6213, + "step": 162 + }, + { + "epoch": 0.012269707747605337, + "grad_norm": 12.493197441101074, + "learning_rate": 3.909774436090226e-05, + "loss": 2.4914, + "step": 163 + }, + { + "epoch": 0.012344982028265493, + "grad_norm": 9.506828308105469, + "learning_rate": 3.9348370927318297e-05, + "loss": 2.5746, + "step": 164 + }, + { + "epoch": 0.012420256308925648, + "grad_norm": 10.369872093200684, + "learning_rate": 3.9598997493734336e-05, + "loss": 2.7721, + "step": 165 + }, + { + "epoch": 0.012495530589585803, + "grad_norm": 10.101872444152832, + "learning_rate": 3.9849624060150376e-05, + "loss": 2.5736, + "step": 166 + }, + { + "epoch": 0.012570804870245958, + "grad_norm": 17.092546463012695, + "learning_rate": 4.0100250626566415e-05, + "loss": 2.598, + "step": 167 + }, + { + "epoch": 0.012646079150906113, + "grad_norm": 11.833017349243164, + "learning_rate": 4.0350877192982455e-05, + "loss": 2.6022, + "step": 168 + }, + { + "epoch": 0.01272135343156627, + "grad_norm": 7.747100830078125, + "learning_rate": 4.0601503759398494e-05, + "loss": 2.6336, + "step": 169 + }, + { + "epoch": 0.012796627712226425, + "grad_norm": 9.564743995666504, + "learning_rate": 4.0852130325814534e-05, + "loss": 2.5899, + "step": 170 + }, + { + "epoch": 0.01287190199288658, + "grad_norm": 9.064987182617188, + "learning_rate": 4.110275689223057e-05, + "loss": 2.7125, + "step": 171 + }, + { + "epoch": 0.012947176273546735, + "grad_norm": 10.471091270446777, + "learning_rate": 4.135338345864662e-05, + "loss": 2.4207, + "step": 172 + }, + { + "epoch": 0.013022450554206892, + "grad_norm": 10.878631591796875, + "learning_rate": 4.160401002506266e-05, + "loss": 2.6163, + "step": 173 + }, + { + "epoch": 0.013097724834867047, + "grad_norm": 8.91076946258545, + "learning_rate": 4.18546365914787e-05, + "loss": 2.4706, + "step": 174 + }, + { + "epoch": 0.013172999115527202, + "grad_norm": 9.40295124053955, + "learning_rate": 4.210526315789474e-05, + "loss": 2.6412, + "step": 175 + }, + { + "epoch": 0.013248273396187357, + "grad_norm": 9.639464378356934, + "learning_rate": 4.235588972431078e-05, + "loss": 2.4719, + "step": 176 + }, + { + "epoch": 0.013323547676847514, + "grad_norm": 7.9790825843811035, + "learning_rate": 4.260651629072682e-05, + "loss": 3.1457, + "step": 177 + }, + { + "epoch": 0.013398821957507669, + "grad_norm": 8.203713417053223, + "learning_rate": 4.2857142857142856e-05, + "loss": 2.6524, + "step": 178 + }, + { + "epoch": 0.013474096238167824, + "grad_norm": 9.808259010314941, + "learning_rate": 4.3107769423558896e-05, + "loss": 2.5888, + "step": 179 + }, + { + "epoch": 0.013549370518827979, + "grad_norm": 7.837316513061523, + "learning_rate": 4.3358395989974935e-05, + "loss": 2.6199, + "step": 180 + }, + { + "epoch": 0.013624644799488135, + "grad_norm": 10.121833801269531, + "learning_rate": 4.3609022556390975e-05, + "loss": 2.5769, + "step": 181 + }, + { + "epoch": 0.01369991908014829, + "grad_norm": 7.981498718261719, + "learning_rate": 4.3859649122807014e-05, + "loss": 2.7497, + "step": 182 + }, + { + "epoch": 0.013775193360808445, + "grad_norm": 8.599218368530273, + "learning_rate": 4.411027568922306e-05, + "loss": 2.9126, + "step": 183 + }, + { + "epoch": 0.0138504676414686, + "grad_norm": 5.973028659820557, + "learning_rate": 4.43609022556391e-05, + "loss": 2.8026, + "step": 184 + }, + { + "epoch": 0.013925741922128757, + "grad_norm": 9.95548152923584, + "learning_rate": 4.461152882205514e-05, + "loss": 2.5834, + "step": 185 + }, + { + "epoch": 0.014001016202788912, + "grad_norm": 6.851253986358643, + "learning_rate": 4.486215538847118e-05, + "loss": 2.7454, + "step": 186 + }, + { + "epoch": 0.014076290483449067, + "grad_norm": 10.43259048461914, + "learning_rate": 4.511278195488722e-05, + "loss": 2.5623, + "step": 187 + }, + { + "epoch": 0.014151564764109222, + "grad_norm": 7.847858905792236, + "learning_rate": 4.536340852130326e-05, + "loss": 2.6478, + "step": 188 + }, + { + "epoch": 0.014226839044769379, + "grad_norm": 9.846750259399414, + "learning_rate": 4.56140350877193e-05, + "loss": 2.7244, + "step": 189 + }, + { + "epoch": 0.014302113325429534, + "grad_norm": 9.090900421142578, + "learning_rate": 4.586466165413534e-05, + "loss": 2.7087, + "step": 190 + }, + { + "epoch": 0.014377387606089689, + "grad_norm": 13.029911994934082, + "learning_rate": 4.6115288220551377e-05, + "loss": 2.6135, + "step": 191 + }, + { + "epoch": 0.014452661886749844, + "grad_norm": 10.575517654418945, + "learning_rate": 4.6365914786967416e-05, + "loss": 2.6086, + "step": 192 + }, + { + "epoch": 0.01452793616741, + "grad_norm": 11.485733985900879, + "learning_rate": 4.6616541353383456e-05, + "loss": 2.9375, + "step": 193 + }, + { + "epoch": 0.014603210448070156, + "grad_norm": 7.707833290100098, + "learning_rate": 4.6867167919799495e-05, + "loss": 2.6695, + "step": 194 + }, + { + "epoch": 0.01467848472873031, + "grad_norm": 7.827512264251709, + "learning_rate": 4.711779448621554e-05, + "loss": 2.5223, + "step": 195 + }, + { + "epoch": 0.014753759009390466, + "grad_norm": 7.267850875854492, + "learning_rate": 4.736842105263158e-05, + "loss": 2.4527, + "step": 196 + }, + { + "epoch": 0.014829033290050623, + "grad_norm": 7.232391357421875, + "learning_rate": 4.761904761904762e-05, + "loss": 2.5753, + "step": 197 + }, + { + "epoch": 0.014904307570710778, + "grad_norm": 12.583212852478027, + "learning_rate": 4.786967418546366e-05, + "loss": 2.8173, + "step": 198 + }, + { + "epoch": 0.014979581851370933, + "grad_norm": 10.620282173156738, + "learning_rate": 4.81203007518797e-05, + "loss": 2.6762, + "step": 199 + }, + { + "epoch": 0.015054856132031088, + "grad_norm": 9.189391136169434, + "learning_rate": 4.837092731829574e-05, + "loss": 2.7449, + "step": 200 + }, + { + "epoch": 0.015130130412691244, + "grad_norm": 8.411532402038574, + "learning_rate": 4.862155388471178e-05, + "loss": 2.4211, + "step": 201 + }, + { + "epoch": 0.0152054046933514, + "grad_norm": 12.243152618408203, + "learning_rate": 4.887218045112782e-05, + "loss": 2.7006, + "step": 202 + }, + { + "epoch": 0.015280678974011554, + "grad_norm": 8.701164245605469, + "learning_rate": 4.912280701754386e-05, + "loss": 2.5448, + "step": 203 + }, + { + "epoch": 0.01535595325467171, + "grad_norm": 10.85342788696289, + "learning_rate": 4.93734335839599e-05, + "loss": 3.0048, + "step": 204 + }, + { + "epoch": 0.015431227535331866, + "grad_norm": 10.478662490844727, + "learning_rate": 4.9624060150375936e-05, + "loss": 2.5254, + "step": 205 + }, + { + "epoch": 0.015506501815992021, + "grad_norm": 11.290699005126953, + "learning_rate": 4.987468671679198e-05, + "loss": 2.478, + "step": 206 + }, + { + "epoch": 0.015581776096652176, + "grad_norm": 8.114274024963379, + "learning_rate": 5.012531328320802e-05, + "loss": 2.565, + "step": 207 + }, + { + "epoch": 0.015657050377312333, + "grad_norm": 9.683647155761719, + "learning_rate": 5.037593984962407e-05, + "loss": 2.3578, + "step": 208 + }, + { + "epoch": 0.015732324657972488, + "grad_norm": 9.260629653930664, + "learning_rate": 5.06265664160401e-05, + "loss": 2.6074, + "step": 209 + }, + { + "epoch": 0.015807598938632643, + "grad_norm": 10.380516052246094, + "learning_rate": 5.087719298245615e-05, + "loss": 2.6925, + "step": 210 + }, + { + "epoch": 0.015882873219292798, + "grad_norm": 9.815593719482422, + "learning_rate": 5.112781954887218e-05, + "loss": 2.3392, + "step": 211 + }, + { + "epoch": 0.015958147499952953, + "grad_norm": 9.420886993408203, + "learning_rate": 5.1378446115288226e-05, + "loss": 2.8057, + "step": 212 + }, + { + "epoch": 0.016033421780613108, + "grad_norm": 11.434077262878418, + "learning_rate": 5.162907268170426e-05, + "loss": 2.5541, + "step": 213 + }, + { + "epoch": 0.016108696061273263, + "grad_norm": 8.445125579833984, + "learning_rate": 5.1879699248120305e-05, + "loss": 2.4893, + "step": 214 + }, + { + "epoch": 0.01618397034193342, + "grad_norm": 6.411423206329346, + "learning_rate": 5.213032581453634e-05, + "loss": 2.6418, + "step": 215 + }, + { + "epoch": 0.016259244622593576, + "grad_norm": 8.493948936462402, + "learning_rate": 5.2380952380952384e-05, + "loss": 2.9823, + "step": 216 + }, + { + "epoch": 0.01633451890325373, + "grad_norm": 7.110140323638916, + "learning_rate": 5.2631578947368424e-05, + "loss": 2.7274, + "step": 217 + }, + { + "epoch": 0.016409793183913886, + "grad_norm": 10.618717193603516, + "learning_rate": 5.2882205513784463e-05, + "loss": 2.7715, + "step": 218 + }, + { + "epoch": 0.01648506746457404, + "grad_norm": 6.806361198425293, + "learning_rate": 5.31328320802005e-05, + "loss": 2.7524, + "step": 219 + }, + { + "epoch": 0.016560341745234197, + "grad_norm": 11.365694046020508, + "learning_rate": 5.338345864661655e-05, + "loss": 2.6783, + "step": 220 + }, + { + "epoch": 0.01663561602589435, + "grad_norm": 8.204473495483398, + "learning_rate": 5.363408521303258e-05, + "loss": 2.779, + "step": 221 + }, + { + "epoch": 0.016710890306554507, + "grad_norm": 7.241642475128174, + "learning_rate": 5.388471177944863e-05, + "loss": 2.4447, + "step": 222 + }, + { + "epoch": 0.016786164587214665, + "grad_norm": 8.607502937316895, + "learning_rate": 5.413533834586466e-05, + "loss": 2.674, + "step": 223 + }, + { + "epoch": 0.01686143886787482, + "grad_norm": 11.882096290588379, + "learning_rate": 5.438596491228071e-05, + "loss": 2.5251, + "step": 224 + }, + { + "epoch": 0.016936713148534975, + "grad_norm": 7.958731174468994, + "learning_rate": 5.463659147869674e-05, + "loss": 2.6136, + "step": 225 + }, + { + "epoch": 0.01701198742919513, + "grad_norm": 9.484212875366211, + "learning_rate": 5.4887218045112786e-05, + "loss": 2.6626, + "step": 226 + }, + { + "epoch": 0.017087261709855285, + "grad_norm": 7.906175136566162, + "learning_rate": 5.513784461152882e-05, + "loss": 2.8029, + "step": 227 + }, + { + "epoch": 0.01716253599051544, + "grad_norm": 8.0032377243042, + "learning_rate": 5.5388471177944865e-05, + "loss": 2.646, + "step": 228 + }, + { + "epoch": 0.017237810271175595, + "grad_norm": 9.094749450683594, + "learning_rate": 5.5639097744360905e-05, + "loss": 2.5874, + "step": 229 + }, + { + "epoch": 0.01731308455183575, + "grad_norm": 10.920360565185547, + "learning_rate": 5.5889724310776944e-05, + "loss": 2.7446, + "step": 230 + }, + { + "epoch": 0.01738835883249591, + "grad_norm": 7.0707573890686035, + "learning_rate": 5.6140350877192984e-05, + "loss": 2.3312, + "step": 231 + }, + { + "epoch": 0.017463633113156064, + "grad_norm": 8.162520408630371, + "learning_rate": 5.639097744360903e-05, + "loss": 2.7798, + "step": 232 + }, + { + "epoch": 0.01753890739381622, + "grad_norm": 7.946226596832275, + "learning_rate": 5.664160401002506e-05, + "loss": 2.6714, + "step": 233 + }, + { + "epoch": 0.017614181674476374, + "grad_norm": 6.753323554992676, + "learning_rate": 5.689223057644111e-05, + "loss": 2.5322, + "step": 234 + }, + { + "epoch": 0.01768945595513653, + "grad_norm": 8.795600891113281, + "learning_rate": 5.714285714285714e-05, + "loss": 2.6777, + "step": 235 + }, + { + "epoch": 0.017764730235796684, + "grad_norm": 7.2303786277771, + "learning_rate": 5.739348370927319e-05, + "loss": 2.5091, + "step": 236 + }, + { + "epoch": 0.01784000451645684, + "grad_norm": 8.029302597045898, + "learning_rate": 5.764411027568922e-05, + "loss": 3.0638, + "step": 237 + }, + { + "epoch": 0.017915278797116994, + "grad_norm": 7.138152599334717, + "learning_rate": 5.789473684210527e-05, + "loss": 2.4998, + "step": 238 + }, + { + "epoch": 0.017990553077777152, + "grad_norm": 8.6770601272583, + "learning_rate": 5.81453634085213e-05, + "loss": 2.6019, + "step": 239 + }, + { + "epoch": 0.018065827358437307, + "grad_norm": 9.012789726257324, + "learning_rate": 5.8395989974937346e-05, + "loss": 2.4086, + "step": 240 + }, + { + "epoch": 0.018141101639097462, + "grad_norm": 7.267442226409912, + "learning_rate": 5.8646616541353386e-05, + "loss": 2.6218, + "step": 241 + }, + { + "epoch": 0.018216375919757617, + "grad_norm": 11.098615646362305, + "learning_rate": 5.889724310776943e-05, + "loss": 2.6493, + "step": 242 + }, + { + "epoch": 0.018291650200417772, + "grad_norm": 8.830709457397461, + "learning_rate": 5.9147869674185465e-05, + "loss": 3.1504, + "step": 243 + }, + { + "epoch": 0.018366924481077927, + "grad_norm": 7.9406609535217285, + "learning_rate": 5.939849624060151e-05, + "loss": 2.4455, + "step": 244 + }, + { + "epoch": 0.018442198761738082, + "grad_norm": 7.834328651428223, + "learning_rate": 5.9649122807017544e-05, + "loss": 2.7414, + "step": 245 + }, + { + "epoch": 0.018517473042398237, + "grad_norm": 10.794530868530273, + "learning_rate": 5.989974937343359e-05, + "loss": 2.5237, + "step": 246 + }, + { + "epoch": 0.018592747323058396, + "grad_norm": 10.709030151367188, + "learning_rate": 6.015037593984962e-05, + "loss": 2.4774, + "step": 247 + }, + { + "epoch": 0.01866802160371855, + "grad_norm": 10.267542839050293, + "learning_rate": 6.040100250626567e-05, + "loss": 2.6907, + "step": 248 + }, + { + "epoch": 0.018743295884378706, + "grad_norm": 8.143807411193848, + "learning_rate": 6.06516290726817e-05, + "loss": 2.5935, + "step": 249 + }, + { + "epoch": 0.01881857016503886, + "grad_norm": 10.164816856384277, + "learning_rate": 6.090225563909775e-05, + "loss": 2.5394, + "step": 250 + }, + { + "epoch": 0.018893844445699016, + "grad_norm": 14.251152038574219, + "learning_rate": 6.115288220551379e-05, + "loss": 2.5156, + "step": 251 + }, + { + "epoch": 0.01896911872635917, + "grad_norm": 8.305621147155762, + "learning_rate": 6.140350877192983e-05, + "loss": 2.414, + "step": 252 + }, + { + "epoch": 0.019044393007019326, + "grad_norm": 6.562389373779297, + "learning_rate": 6.165413533834587e-05, + "loss": 2.6894, + "step": 253 + }, + { + "epoch": 0.01911966728767948, + "grad_norm": 9.983012199401855, + "learning_rate": 6.19047619047619e-05, + "loss": 2.35, + "step": 254 + }, + { + "epoch": 0.019194941568339636, + "grad_norm": 18.99936294555664, + "learning_rate": 6.215538847117795e-05, + "loss": 3.0117, + "step": 255 + }, + { + "epoch": 0.019270215848999794, + "grad_norm": 10.677833557128906, + "learning_rate": 6.240601503759398e-05, + "loss": 2.578, + "step": 256 + }, + { + "epoch": 0.01934549012965995, + "grad_norm": 8.067180633544922, + "learning_rate": 6.265664160401002e-05, + "loss": 2.4961, + "step": 257 + }, + { + "epoch": 0.019420764410320104, + "grad_norm": 10.744783401489258, + "learning_rate": 6.290726817042606e-05, + "loss": 2.6725, + "step": 258 + }, + { + "epoch": 0.01949603869098026, + "grad_norm": 14.301708221435547, + "learning_rate": 6.31578947368421e-05, + "loss": 2.7495, + "step": 259 + }, + { + "epoch": 0.019571312971640414, + "grad_norm": 6.461829662322998, + "learning_rate": 6.340852130325816e-05, + "loss": 2.4634, + "step": 260 + }, + { + "epoch": 0.01964658725230057, + "grad_norm": 12.466805458068848, + "learning_rate": 6.365914786967418e-05, + "loss": 3.057, + "step": 261 + }, + { + "epoch": 0.019721861532960724, + "grad_norm": 9.61961555480957, + "learning_rate": 6.390977443609024e-05, + "loss": 2.3606, + "step": 262 + }, + { + "epoch": 0.01979713581362088, + "grad_norm": 7.8552374839782715, + "learning_rate": 6.416040100250626e-05, + "loss": 2.3515, + "step": 263 + }, + { + "epoch": 0.019872410094281038, + "grad_norm": 8.121237754821777, + "learning_rate": 6.441102756892231e-05, + "loss": 2.6421, + "step": 264 + }, + { + "epoch": 0.019947684374941193, + "grad_norm": 7.811336040496826, + "learning_rate": 6.466165413533834e-05, + "loss": 2.4611, + "step": 265 + }, + { + "epoch": 0.020022958655601348, + "grad_norm": 12.549189567565918, + "learning_rate": 6.49122807017544e-05, + "loss": 2.8462, + "step": 266 + }, + { + "epoch": 0.020098232936261503, + "grad_norm": 9.146442413330078, + "learning_rate": 6.516290726817042e-05, + "loss": 2.9353, + "step": 267 + }, + { + "epoch": 0.020173507216921658, + "grad_norm": 9.0011625289917, + "learning_rate": 6.541353383458647e-05, + "loss": 2.4484, + "step": 268 + }, + { + "epoch": 0.020248781497581813, + "grad_norm": 8.126823425292969, + "learning_rate": 6.566416040100251e-05, + "loss": 2.5692, + "step": 269 + }, + { + "epoch": 0.020324055778241968, + "grad_norm": 8.504871368408203, + "learning_rate": 6.591478696741855e-05, + "loss": 2.6727, + "step": 270 + }, + { + "epoch": 0.020399330058902123, + "grad_norm": 18.124483108520508, + "learning_rate": 6.616541353383459e-05, + "loss": 2.672, + "step": 271 + }, + { + "epoch": 0.02047460433956228, + "grad_norm": 9.791601181030273, + "learning_rate": 6.641604010025063e-05, + "loss": 2.5762, + "step": 272 + }, + { + "epoch": 0.020549878620222436, + "grad_norm": 10.024748802185059, + "learning_rate": 6.666666666666667e-05, + "loss": 2.6214, + "step": 273 + }, + { + "epoch": 0.02062515290088259, + "grad_norm": 8.394241333007812, + "learning_rate": 6.691729323308271e-05, + "loss": 2.5758, + "step": 274 + }, + { + "epoch": 0.020700427181542747, + "grad_norm": 12.312226295471191, + "learning_rate": 6.716791979949875e-05, + "loss": 3.2458, + "step": 275 + }, + { + "epoch": 0.0207757014622029, + "grad_norm": 11.922405242919922, + "learning_rate": 6.741854636591479e-05, + "loss": 2.4109, + "step": 276 + }, + { + "epoch": 0.020850975742863057, + "grad_norm": 8.170721054077148, + "learning_rate": 6.766917293233083e-05, + "loss": 2.7055, + "step": 277 + }, + { + "epoch": 0.02092625002352321, + "grad_norm": 11.160572052001953, + "learning_rate": 6.791979949874687e-05, + "loss": 2.749, + "step": 278 + }, + { + "epoch": 0.021001524304183367, + "grad_norm": 8.44627857208252, + "learning_rate": 6.817042606516291e-05, + "loss": 2.5876, + "step": 279 + }, + { + "epoch": 0.021076798584843525, + "grad_norm": 8.555981636047363, + "learning_rate": 6.842105263157895e-05, + "loss": 2.3791, + "step": 280 + }, + { + "epoch": 0.02115207286550368, + "grad_norm": 6.683128356933594, + "learning_rate": 6.867167919799499e-05, + "loss": 2.3495, + "step": 281 + }, + { + "epoch": 0.021227347146163835, + "grad_norm": 7.898855209350586, + "learning_rate": 6.892230576441104e-05, + "loss": 2.8104, + "step": 282 + }, + { + "epoch": 0.02130262142682399, + "grad_norm": 10.912748336791992, + "learning_rate": 6.917293233082706e-05, + "loss": 2.8165, + "step": 283 + }, + { + "epoch": 0.021377895707484145, + "grad_norm": 11.60110855102539, + "learning_rate": 6.942355889724312e-05, + "loss": 2.4285, + "step": 284 + }, + { + "epoch": 0.0214531699881443, + "grad_norm": 7.183400630950928, + "learning_rate": 6.967418546365914e-05, + "loss": 2.4517, + "step": 285 + }, + { + "epoch": 0.021528444268804455, + "grad_norm": 8.255367279052734, + "learning_rate": 6.99248120300752e-05, + "loss": 2.6391, + "step": 286 + }, + { + "epoch": 0.02160371854946461, + "grad_norm": 9.572988510131836, + "learning_rate": 7.017543859649122e-05, + "loss": 2.6038, + "step": 287 + }, + { + "epoch": 0.02167899283012477, + "grad_norm": 5.990320205688477, + "learning_rate": 7.042606516290728e-05, + "loss": 2.3011, + "step": 288 + }, + { + "epoch": 0.021754267110784924, + "grad_norm": 12.906543731689453, + "learning_rate": 7.06766917293233e-05, + "loss": 2.4855, + "step": 289 + }, + { + "epoch": 0.02182954139144508, + "grad_norm": 8.411698341369629, + "learning_rate": 7.092731829573935e-05, + "loss": 2.5813, + "step": 290 + }, + { + "epoch": 0.021904815672105234, + "grad_norm": 8.958061218261719, + "learning_rate": 7.11779448621554e-05, + "loss": 2.7643, + "step": 291 + }, + { + "epoch": 0.02198008995276539, + "grad_norm": 5.848999500274658, + "learning_rate": 7.142857142857143e-05, + "loss": 2.542, + "step": 292 + }, + { + "epoch": 0.022055364233425544, + "grad_norm": 10.549403190612793, + "learning_rate": 7.167919799498747e-05, + "loss": 2.6225, + "step": 293 + }, + { + "epoch": 0.0221306385140857, + "grad_norm": 7.328732490539551, + "learning_rate": 7.192982456140351e-05, + "loss": 2.3358, + "step": 294 + }, + { + "epoch": 0.022205912794745854, + "grad_norm": 8.090675354003906, + "learning_rate": 7.218045112781955e-05, + "loss": 2.4583, + "step": 295 + }, + { + "epoch": 0.022281187075406012, + "grad_norm": 4.750062942504883, + "learning_rate": 7.243107769423559e-05, + "loss": 2.5265, + "step": 296 + }, + { + "epoch": 0.022356461356066167, + "grad_norm": 7.802643775939941, + "learning_rate": 7.268170426065163e-05, + "loss": 2.9576, + "step": 297 + }, + { + "epoch": 0.022431735636726322, + "grad_norm": 6.483747482299805, + "learning_rate": 7.293233082706767e-05, + "loss": 2.6836, + "step": 298 + }, + { + "epoch": 0.022507009917386477, + "grad_norm": 11.573009490966797, + "learning_rate": 7.318295739348371e-05, + "loss": 2.3012, + "step": 299 + }, + { + "epoch": 0.022582284198046632, + "grad_norm": 8.898382186889648, + "learning_rate": 7.343358395989975e-05, + "loss": 2.3735, + "step": 300 + }, + { + "epoch": 0.022657558478706787, + "grad_norm": 5.705074787139893, + "learning_rate": 7.368421052631579e-05, + "loss": 2.4902, + "step": 301 + }, + { + "epoch": 0.022732832759366942, + "grad_norm": 8.153017044067383, + "learning_rate": 7.393483709273183e-05, + "loss": 2.5585, + "step": 302 + }, + { + "epoch": 0.022808107040027097, + "grad_norm": 9.58834457397461, + "learning_rate": 7.418546365914787e-05, + "loss": 2.6702, + "step": 303 + }, + { + "epoch": 0.022883381320687256, + "grad_norm": 7.1557230949401855, + "learning_rate": 7.443609022556392e-05, + "loss": 2.4311, + "step": 304 + }, + { + "epoch": 0.02295865560134741, + "grad_norm": 7.851171016693115, + "learning_rate": 7.468671679197995e-05, + "loss": 2.5695, + "step": 305 + }, + { + "epoch": 0.023033929882007566, + "grad_norm": 10.56496810913086, + "learning_rate": 7.4937343358396e-05, + "loss": 2.608, + "step": 306 + }, + { + "epoch": 0.02310920416266772, + "grad_norm": 10.43893051147461, + "learning_rate": 7.518796992481203e-05, + "loss": 2.4521, + "step": 307 + }, + { + "epoch": 0.023184478443327876, + "grad_norm": 15.015542030334473, + "learning_rate": 7.543859649122808e-05, + "loss": 2.6188, + "step": 308 + }, + { + "epoch": 0.02325975272398803, + "grad_norm": 7.0824151039123535, + "learning_rate": 7.56892230576441e-05, + "loss": 2.3985, + "step": 309 + }, + { + "epoch": 0.023335027004648186, + "grad_norm": 11.571524620056152, + "learning_rate": 7.593984962406016e-05, + "loss": 2.8705, + "step": 310 + }, + { + "epoch": 0.02341030128530834, + "grad_norm": 13.608711242675781, + "learning_rate": 7.619047619047618e-05, + "loss": 2.6108, + "step": 311 + }, + { + "epoch": 0.0234855755659685, + "grad_norm": 12.766798973083496, + "learning_rate": 7.644110275689224e-05, + "loss": 2.7538, + "step": 312 + }, + { + "epoch": 0.023560849846628654, + "grad_norm": 10.92900562286377, + "learning_rate": 7.669172932330826e-05, + "loss": 2.5924, + "step": 313 + }, + { + "epoch": 0.02363612412728881, + "grad_norm": 7.872305393218994, + "learning_rate": 7.694235588972432e-05, + "loss": 2.4474, + "step": 314 + }, + { + "epoch": 0.023711398407948964, + "grad_norm": 8.653833389282227, + "learning_rate": 7.719298245614036e-05, + "loss": 2.764, + "step": 315 + }, + { + "epoch": 0.02378667268860912, + "grad_norm": 8.444396018981934, + "learning_rate": 7.74436090225564e-05, + "loss": 2.5794, + "step": 316 + }, + { + "epoch": 0.023861946969269274, + "grad_norm": 9.17378044128418, + "learning_rate": 7.769423558897244e-05, + "loss": 2.4611, + "step": 317 + }, + { + "epoch": 0.02393722124992943, + "grad_norm": 5.68281364440918, + "learning_rate": 7.794486215538847e-05, + "loss": 2.7726, + "step": 318 + }, + { + "epoch": 0.024012495530589584, + "grad_norm": 9.510702133178711, + "learning_rate": 7.819548872180451e-05, + "loss": 2.4254, + "step": 319 + }, + { + "epoch": 0.024087769811249743, + "grad_norm": 9.245924949645996, + "learning_rate": 7.844611528822055e-05, + "loss": 2.4872, + "step": 320 + }, + { + "epoch": 0.024163044091909898, + "grad_norm": 6.058887958526611, + "learning_rate": 7.869674185463659e-05, + "loss": 2.5338, + "step": 321 + }, + { + "epoch": 0.024238318372570053, + "grad_norm": 9.469134330749512, + "learning_rate": 7.894736842105263e-05, + "loss": 2.9507, + "step": 322 + }, + { + "epoch": 0.024313592653230208, + "grad_norm": 8.840911865234375, + "learning_rate": 7.919799498746867e-05, + "loss": 2.9634, + "step": 323 + }, + { + "epoch": 0.024388866933890363, + "grad_norm": 6.134981155395508, + "learning_rate": 7.944862155388471e-05, + "loss": 2.5341, + "step": 324 + }, + { + "epoch": 0.024464141214550518, + "grad_norm": 7.281913757324219, + "learning_rate": 7.969924812030075e-05, + "loss": 2.6995, + "step": 325 + }, + { + "epoch": 0.024539415495210673, + "grad_norm": 8.727099418640137, + "learning_rate": 7.994987468671679e-05, + "loss": 2.355, + "step": 326 + }, + { + "epoch": 0.024614689775870828, + "grad_norm": 7.811582088470459, + "learning_rate": 8.020050125313283e-05, + "loss": 2.3913, + "step": 327 + }, + { + "epoch": 0.024689964056530986, + "grad_norm": 8.502856254577637, + "learning_rate": 8.045112781954888e-05, + "loss": 2.5857, + "step": 328 + }, + { + "epoch": 0.02476523833719114, + "grad_norm": 10.060704231262207, + "learning_rate": 8.070175438596491e-05, + "loss": 2.7334, + "step": 329 + }, + { + "epoch": 0.024840512617851297, + "grad_norm": 9.777992248535156, + "learning_rate": 8.095238095238096e-05, + "loss": 2.3778, + "step": 330 + }, + { + "epoch": 0.02491578689851145, + "grad_norm": 9.373239517211914, + "learning_rate": 8.120300751879699e-05, + "loss": 2.5264, + "step": 331 + }, + { + "epoch": 0.024991061179171607, + "grad_norm": 7.8832221031188965, + "learning_rate": 8.145363408521304e-05, + "loss": 2.702, + "step": 332 + }, + { + "epoch": 0.02506633545983176, + "grad_norm": 9.64803695678711, + "learning_rate": 8.170426065162907e-05, + "loss": 2.4452, + "step": 333 + }, + { + "epoch": 0.025141609740491917, + "grad_norm": 12.649718284606934, + "learning_rate": 8.195488721804512e-05, + "loss": 2.5841, + "step": 334 + }, + { + "epoch": 0.02521688402115207, + "grad_norm": 10.74282455444336, + "learning_rate": 8.220551378446115e-05, + "loss": 2.5936, + "step": 335 + }, + { + "epoch": 0.025292158301812227, + "grad_norm": 10.858030319213867, + "learning_rate": 8.24561403508772e-05, + "loss": 2.4946, + "step": 336 + }, + { + "epoch": 0.025367432582472385, + "grad_norm": 11.46617603302002, + "learning_rate": 8.270676691729324e-05, + "loss": 2.234, + "step": 337 + }, + { + "epoch": 0.02544270686313254, + "grad_norm": 9.468840599060059, + "learning_rate": 8.295739348370928e-05, + "loss": 2.7549, + "step": 338 + }, + { + "epoch": 0.025517981143792695, + "grad_norm": 13.022171020507812, + "learning_rate": 8.320802005012532e-05, + "loss": 2.4938, + "step": 339 + }, + { + "epoch": 0.02559325542445285, + "grad_norm": 7.186497211456299, + "learning_rate": 8.345864661654136e-05, + "loss": 2.6162, + "step": 340 + }, + { + "epoch": 0.025668529705113005, + "grad_norm": 9.196660995483398, + "learning_rate": 8.37092731829574e-05, + "loss": 2.6004, + "step": 341 + }, + { + "epoch": 0.02574380398577316, + "grad_norm": 8.185606956481934, + "learning_rate": 8.395989974937344e-05, + "loss": 2.3878, + "step": 342 + }, + { + "epoch": 0.025819078266433315, + "grad_norm": 11.060405731201172, + "learning_rate": 8.421052631578948e-05, + "loss": 2.6286, + "step": 343 + }, + { + "epoch": 0.02589435254709347, + "grad_norm": 9.861959457397461, + "learning_rate": 8.446115288220552e-05, + "loss": 3.0686, + "step": 344 + }, + { + "epoch": 0.02596962682775363, + "grad_norm": 5.27044677734375, + "learning_rate": 8.471177944862155e-05, + "loss": 2.4063, + "step": 345 + }, + { + "epoch": 0.026044901108413784, + "grad_norm": 6.576343059539795, + "learning_rate": 8.49624060150376e-05, + "loss": 2.7093, + "step": 346 + }, + { + "epoch": 0.02612017538907394, + "grad_norm": 7.733982086181641, + "learning_rate": 8.521303258145363e-05, + "loss": 3.3062, + "step": 347 + }, + { + "epoch": 0.026195449669734094, + "grad_norm": 8.054815292358398, + "learning_rate": 8.546365914786967e-05, + "loss": 2.5026, + "step": 348 + }, + { + "epoch": 0.02627072395039425, + "grad_norm": 6.866470813751221, + "learning_rate": 8.571428571428571e-05, + "loss": 2.2814, + "step": 349 + }, + { + "epoch": 0.026345998231054404, + "grad_norm": 9.631629943847656, + "learning_rate": 8.596491228070177e-05, + "loss": 2.5933, + "step": 350 + }, + { + "epoch": 0.02642127251171456, + "grad_norm": 7.4486846923828125, + "learning_rate": 8.621553884711779e-05, + "loss": 3.0249, + "step": 351 + }, + { + "epoch": 0.026496546792374714, + "grad_norm": 8.702048301696777, + "learning_rate": 8.646616541353384e-05, + "loss": 2.5208, + "step": 352 + }, + { + "epoch": 0.026571821073034872, + "grad_norm": 9.063924789428711, + "learning_rate": 8.671679197994987e-05, + "loss": 2.4645, + "step": 353 + }, + { + "epoch": 0.026647095353695027, + "grad_norm": 8.293697357177734, + "learning_rate": 8.696741854636592e-05, + "loss": 2.5806, + "step": 354 + }, + { + "epoch": 0.026722369634355182, + "grad_norm": 12.996391296386719, + "learning_rate": 8.721804511278195e-05, + "loss": 2.4446, + "step": 355 + }, + { + "epoch": 0.026797643915015337, + "grad_norm": 9.56811809539795, + "learning_rate": 8.7468671679198e-05, + "loss": 2.1486, + "step": 356 + }, + { + "epoch": 0.026872918195675492, + "grad_norm": 9.908924102783203, + "learning_rate": 8.771929824561403e-05, + "loss": 2.6213, + "step": 357 + }, + { + "epoch": 0.026948192476335647, + "grad_norm": 5.640969276428223, + "learning_rate": 8.796992481203008e-05, + "loss": 2.2379, + "step": 358 + }, + { + "epoch": 0.027023466756995802, + "grad_norm": 9.026790618896484, + "learning_rate": 8.822055137844612e-05, + "loss": 2.6115, + "step": 359 + }, + { + "epoch": 0.027098741037655957, + "grad_norm": 8.025802612304688, + "learning_rate": 8.847117794486216e-05, + "loss": 2.6743, + "step": 360 + }, + { + "epoch": 0.027174015318316116, + "grad_norm": 6.087635040283203, + "learning_rate": 8.87218045112782e-05, + "loss": 2.4731, + "step": 361 + }, + { + "epoch": 0.02724928959897627, + "grad_norm": 7.93490743637085, + "learning_rate": 8.897243107769424e-05, + "loss": 2.2696, + "step": 362 + }, + { + "epoch": 0.027324563879636426, + "grad_norm": 8.029409408569336, + "learning_rate": 8.922305764411028e-05, + "loss": 2.5261, + "step": 363 + }, + { + "epoch": 0.02739983816029658, + "grad_norm": 11.829414367675781, + "learning_rate": 8.947368421052632e-05, + "loss": 2.4563, + "step": 364 + }, + { + "epoch": 0.027475112440956736, + "grad_norm": 7.845546722412109, + "learning_rate": 8.972431077694236e-05, + "loss": 2.4929, + "step": 365 + }, + { + "epoch": 0.02755038672161689, + "grad_norm": 9.62540054321289, + "learning_rate": 8.99749373433584e-05, + "loss": 2.4691, + "step": 366 + }, + { + "epoch": 0.027625661002277046, + "grad_norm": 7.262996196746826, + "learning_rate": 9.022556390977444e-05, + "loss": 2.1627, + "step": 367 + }, + { + "epoch": 0.0277009352829372, + "grad_norm": 8.31999397277832, + "learning_rate": 9.047619047619048e-05, + "loss": 2.9411, + "step": 368 + }, + { + "epoch": 0.02777620956359736, + "grad_norm": 8.52043342590332, + "learning_rate": 9.072681704260652e-05, + "loss": 2.3104, + "step": 369 + }, + { + "epoch": 0.027851483844257514, + "grad_norm": 6.144811153411865, + "learning_rate": 9.097744360902256e-05, + "loss": 2.4447, + "step": 370 + }, + { + "epoch": 0.02792675812491767, + "grad_norm": 7.1753363609313965, + "learning_rate": 9.12280701754386e-05, + "loss": 2.3539, + "step": 371 + }, + { + "epoch": 0.028002032405577824, + "grad_norm": 9.527101516723633, + "learning_rate": 9.147869674185465e-05, + "loss": 2.9253, + "step": 372 + }, + { + "epoch": 0.02807730668623798, + "grad_norm": 13.049726486206055, + "learning_rate": 9.172932330827067e-05, + "loss": 2.6301, + "step": 373 + }, + { + "epoch": 0.028152580966898134, + "grad_norm": 10.012778282165527, + "learning_rate": 9.197994987468673e-05, + "loss": 2.4109, + "step": 374 + }, + { + "epoch": 0.02822785524755829, + "grad_norm": 9.397246360778809, + "learning_rate": 9.223057644110275e-05, + "loss": 2.5127, + "step": 375 + }, + { + "epoch": 0.028303129528218444, + "grad_norm": 6.507575988769531, + "learning_rate": 9.24812030075188e-05, + "loss": 2.3723, + "step": 376 + }, + { + "epoch": 0.028378403808878603, + "grad_norm": 8.265238761901855, + "learning_rate": 9.273182957393483e-05, + "loss": 2.4718, + "step": 377 + }, + { + "epoch": 0.028453678089538758, + "grad_norm": 6.205020904541016, + "learning_rate": 9.298245614035089e-05, + "loss": 2.3485, + "step": 378 + }, + { + "epoch": 0.028528952370198913, + "grad_norm": 9.341041564941406, + "learning_rate": 9.323308270676691e-05, + "loss": 2.6597, + "step": 379 + }, + { + "epoch": 0.028604226650859068, + "grad_norm": 8.819635391235352, + "learning_rate": 9.348370927318296e-05, + "loss": 2.3982, + "step": 380 + }, + { + "epoch": 0.028679500931519223, + "grad_norm": 7.3793721199035645, + "learning_rate": 9.373433583959899e-05, + "loss": 2.6471, + "step": 381 + }, + { + "epoch": 0.028754775212179378, + "grad_norm": 8.588555335998535, + "learning_rate": 9.398496240601504e-05, + "loss": 2.5609, + "step": 382 + }, + { + "epoch": 0.028830049492839533, + "grad_norm": 10.27634334564209, + "learning_rate": 9.423558897243108e-05, + "loss": 2.7713, + "step": 383 + }, + { + "epoch": 0.028905323773499688, + "grad_norm": 8.250353813171387, + "learning_rate": 9.448621553884712e-05, + "loss": 2.8336, + "step": 384 + }, + { + "epoch": 0.028980598054159847, + "grad_norm": 10.630231857299805, + "learning_rate": 9.473684210526316e-05, + "loss": 2.5987, + "step": 385 + }, + { + "epoch": 0.02905587233482, + "grad_norm": 6.692311763763428, + "learning_rate": 9.49874686716792e-05, + "loss": 2.9426, + "step": 386 + }, + { + "epoch": 0.029131146615480157, + "grad_norm": 9.565740585327148, + "learning_rate": 9.523809523809524e-05, + "loss": 2.5801, + "step": 387 + }, + { + "epoch": 0.02920642089614031, + "grad_norm": 10.207667350769043, + "learning_rate": 9.548872180451128e-05, + "loss": 2.5359, + "step": 388 + }, + { + "epoch": 0.029281695176800467, + "grad_norm": 8.448806762695312, + "learning_rate": 9.573934837092732e-05, + "loss": 2.7342, + "step": 389 + }, + { + "epoch": 0.02935696945746062, + "grad_norm": 8.73253059387207, + "learning_rate": 9.598997493734336e-05, + "loss": 2.7241, + "step": 390 + }, + { + "epoch": 0.029432243738120777, + "grad_norm": 5.377964973449707, + "learning_rate": 9.62406015037594e-05, + "loss": 2.406, + "step": 391 + }, + { + "epoch": 0.02950751801878093, + "grad_norm": 8.039495468139648, + "learning_rate": 9.649122807017544e-05, + "loss": 3.0968, + "step": 392 + }, + { + "epoch": 0.02958279229944109, + "grad_norm": 7.784538745880127, + "learning_rate": 9.674185463659148e-05, + "loss": 2.2749, + "step": 393 + }, + { + "epoch": 0.029658066580101245, + "grad_norm": 6.550390720367432, + "learning_rate": 9.699248120300752e-05, + "loss": 2.5131, + "step": 394 + }, + { + "epoch": 0.0297333408607614, + "grad_norm": 7.988730430603027, + "learning_rate": 9.724310776942356e-05, + "loss": 2.3518, + "step": 395 + }, + { + "epoch": 0.029808615141421555, + "grad_norm": 6.855724811553955, + "learning_rate": 9.749373433583961e-05, + "loss": 2.2968, + "step": 396 + }, + { + "epoch": 0.02988388942208171, + "grad_norm": 9.838045120239258, + "learning_rate": 9.774436090225564e-05, + "loss": 2.6223, + "step": 397 + }, + { + "epoch": 0.029959163702741865, + "grad_norm": 11.083606719970703, + "learning_rate": 9.799498746867169e-05, + "loss": 2.3675, + "step": 398 + }, + { + "epoch": 0.03003443798340202, + "grad_norm": 7.736291408538818, + "learning_rate": 9.824561403508771e-05, + "loss": 2.2007, + "step": 399 + }, + { + "epoch": 0.030109712264062175, + "grad_norm": 7.533214092254639, + "learning_rate": 9.849624060150377e-05, + "loss": 2.8185, + "step": 400 + }, + { + "epoch": 0.030184986544722334, + "grad_norm": 5.824602127075195, + "learning_rate": 9.87468671679198e-05, + "loss": 2.4325, + "step": 401 + }, + { + "epoch": 0.03026026082538249, + "grad_norm": 7.297369480133057, + "learning_rate": 9.899749373433585e-05, + "loss": 2.1774, + "step": 402 + }, + { + "epoch": 0.030335535106042644, + "grad_norm": 5.556282043457031, + "learning_rate": 9.924812030075187e-05, + "loss": 2.2991, + "step": 403 + }, + { + "epoch": 0.0304108093867028, + "grad_norm": 7.1707987785339355, + "learning_rate": 9.949874686716793e-05, + "loss": 2.7968, + "step": 404 + }, + { + "epoch": 0.030486083667362954, + "grad_norm": 7.452920913696289, + "learning_rate": 9.974937343358397e-05, + "loss": 2.6478, + "step": 405 + }, + { + "epoch": 0.03056135794802311, + "grad_norm": 5.904439926147461, + "learning_rate": 0.0001, + "loss": 2.4775, + "step": 406 + }, + { + "epoch": 0.030636632228683264, + "grad_norm": 7.584089279174805, + "learning_rate": 9.999999851382173e-05, + "loss": 2.7436, + "step": 407 + }, + { + "epoch": 0.03071190650934342, + "grad_norm": 5.895624160766602, + "learning_rate": 9.999999405528702e-05, + "loss": 3.1167, + "step": 408 + }, + { + "epoch": 0.030787180790003577, + "grad_norm": 7.877616882324219, + "learning_rate": 9.999998662439613e-05, + "loss": 2.5746, + "step": 409 + }, + { + "epoch": 0.030862455070663732, + "grad_norm": 10.939014434814453, + "learning_rate": 9.99999762211495e-05, + "loss": 2.3587, + "step": 410 + }, + { + "epoch": 0.030937729351323887, + "grad_norm": 5.0304694175720215, + "learning_rate": 9.999996284554776e-05, + "loss": 2.5545, + "step": 411 + }, + { + "epoch": 0.031013003631984042, + "grad_norm": 10.43912124633789, + "learning_rate": 9.999994649759168e-05, + "loss": 2.5543, + "step": 412 + }, + { + "epoch": 0.031088277912644197, + "grad_norm": 11.310480117797852, + "learning_rate": 9.999992717728223e-05, + "loss": 3.2997, + "step": 413 + }, + { + "epoch": 0.031163552193304352, + "grad_norm": 5.6342620849609375, + "learning_rate": 9.99999048846206e-05, + "loss": 2.4806, + "step": 414 + }, + { + "epoch": 0.031238826473964507, + "grad_norm": 7.379421710968018, + "learning_rate": 9.999987961960808e-05, + "loss": 2.4209, + "step": 415 + }, + { + "epoch": 0.031314100754624666, + "grad_norm": 9.048238754272461, + "learning_rate": 9.999985138224619e-05, + "loss": 2.5663, + "step": 416 + }, + { + "epoch": 0.03138937503528482, + "grad_norm": 14.7123384475708, + "learning_rate": 9.99998201725366e-05, + "loss": 2.9186, + "step": 417 + }, + { + "epoch": 0.031464649315944976, + "grad_norm": 11.715332984924316, + "learning_rate": 9.999978599048117e-05, + "loss": 2.6159, + "step": 418 + }, + { + "epoch": 0.03153992359660513, + "grad_norm": 15.586795806884766, + "learning_rate": 9.999974883608192e-05, + "loss": 2.5011, + "step": 419 + }, + { + "epoch": 0.031615197877265286, + "grad_norm": 9.806571960449219, + "learning_rate": 9.999970870934108e-05, + "loss": 2.9515, + "step": 420 + }, + { + "epoch": 0.03169047215792544, + "grad_norm": 11.321080207824707, + "learning_rate": 9.9999665610261e-05, + "loss": 2.4308, + "step": 421 + }, + { + "epoch": 0.031765746438585596, + "grad_norm": 6.57304573059082, + "learning_rate": 9.999961953884427e-05, + "loss": 2.487, + "step": 422 + }, + { + "epoch": 0.03184102071924575, + "grad_norm": 5.4669365882873535, + "learning_rate": 9.999957049509365e-05, + "loss": 2.6643, + "step": 423 + }, + { + "epoch": 0.031916294999905906, + "grad_norm": 8.392806053161621, + "learning_rate": 9.9999518479012e-05, + "loss": 3.0091, + "step": 424 + }, + { + "epoch": 0.03199156928056606, + "grad_norm": 8.882760047912598, + "learning_rate": 9.999946349060245e-05, + "loss": 2.6367, + "step": 425 + }, + { + "epoch": 0.032066843561226216, + "grad_norm": 8.551156044006348, + "learning_rate": 9.999940552986826e-05, + "loss": 2.4453, + "step": 426 + }, + { + "epoch": 0.03214211784188637, + "grad_norm": 4.792285442352295, + "learning_rate": 9.999934459681288e-05, + "loss": 2.4126, + "step": 427 + }, + { + "epoch": 0.032217392122546526, + "grad_norm": 6.7595953941345215, + "learning_rate": 9.999928069143993e-05, + "loss": 2.6518, + "step": 428 + }, + { + "epoch": 0.03229266640320668, + "grad_norm": 10.124109268188477, + "learning_rate": 9.999921381375319e-05, + "loss": 2.7516, + "step": 429 + }, + { + "epoch": 0.03236794068386684, + "grad_norm": 6.628558158874512, + "learning_rate": 9.999914396375668e-05, + "loss": 2.462, + "step": 430 + }, + { + "epoch": 0.032443214964527, + "grad_norm": 8.301045417785645, + "learning_rate": 9.99990711414545e-05, + "loss": 2.8795, + "step": 431 + }, + { + "epoch": 0.03251848924518715, + "grad_norm": 5.132826328277588, + "learning_rate": 9.999899534685102e-05, + "loss": 2.3893, + "step": 432 + }, + { + "epoch": 0.03259376352584731, + "grad_norm": 9.406960487365723, + "learning_rate": 9.99989165799507e-05, + "loss": 2.5055, + "step": 433 + }, + { + "epoch": 0.03266903780650746, + "grad_norm": 10.371724128723145, + "learning_rate": 9.999883484075829e-05, + "loss": 2.6004, + "step": 434 + }, + { + "epoch": 0.03274431208716762, + "grad_norm": 7.023213863372803, + "learning_rate": 9.99987501292786e-05, + "loss": 2.7704, + "step": 435 + }, + { + "epoch": 0.03281958636782777, + "grad_norm": 7.946764945983887, + "learning_rate": 9.999866244551667e-05, + "loss": 2.2468, + "step": 436 + }, + { + "epoch": 0.03289486064848793, + "grad_norm": 6.187266826629639, + "learning_rate": 9.999857178947772e-05, + "loss": 2.7072, + "step": 437 + }, + { + "epoch": 0.03297013492914808, + "grad_norm": 5.986959934234619, + "learning_rate": 9.999847816116714e-05, + "loss": 2.4356, + "step": 438 + }, + { + "epoch": 0.03304540920980824, + "grad_norm": 7.843387126922607, + "learning_rate": 9.999838156059051e-05, + "loss": 2.5055, + "step": 439 + }, + { + "epoch": 0.03312068349046839, + "grad_norm": 7.710484981536865, + "learning_rate": 9.999828198775354e-05, + "loss": 2.4021, + "step": 440 + }, + { + "epoch": 0.03319595777112855, + "grad_norm": 12.383356094360352, + "learning_rate": 9.999817944266216e-05, + "loss": 2.6619, + "step": 441 + }, + { + "epoch": 0.0332712320517887, + "grad_norm": 6.836391925811768, + "learning_rate": 9.99980739253225e-05, + "loss": 2.5544, + "step": 442 + }, + { + "epoch": 0.03334650633244886, + "grad_norm": 11.41148567199707, + "learning_rate": 9.999796543574078e-05, + "loss": 2.6145, + "step": 443 + }, + { + "epoch": 0.03342178061310901, + "grad_norm": 5.887909412384033, + "learning_rate": 9.999785397392349e-05, + "loss": 2.5155, + "step": 444 + }, + { + "epoch": 0.03349705489376917, + "grad_norm": 9.66697883605957, + "learning_rate": 9.999773953987724e-05, + "loss": 2.7728, + "step": 445 + }, + { + "epoch": 0.03357232917442933, + "grad_norm": 6.084951877593994, + "learning_rate": 9.999762213360884e-05, + "loss": 2.5111, + "step": 446 + }, + { + "epoch": 0.033647603455089485, + "grad_norm": 7.673906326293945, + "learning_rate": 9.999750175512524e-05, + "loss": 2.5937, + "step": 447 + }, + { + "epoch": 0.03372287773574964, + "grad_norm": 6.4027485847473145, + "learning_rate": 9.999737840443364e-05, + "loss": 2.4019, + "step": 448 + }, + { + "epoch": 0.033798152016409795, + "grad_norm": 7.2442522048950195, + "learning_rate": 9.999725208154135e-05, + "loss": 2.6147, + "step": 449 + }, + { + "epoch": 0.03387342629706995, + "grad_norm": 5.990758895874023, + "learning_rate": 9.999712278645587e-05, + "loss": 2.4489, + "step": 450 + }, + { + "epoch": 0.033948700577730105, + "grad_norm": 4.980669021606445, + "learning_rate": 9.999699051918491e-05, + "loss": 2.5905, + "step": 451 + }, + { + "epoch": 0.03402397485839026, + "grad_norm": 4.490603446960449, + "learning_rate": 9.999685527973633e-05, + "loss": 2.7177, + "step": 452 + }, + { + "epoch": 0.034099249139050415, + "grad_norm": 9.191030502319336, + "learning_rate": 9.999671706811814e-05, + "loss": 2.5565, + "step": 453 + }, + { + "epoch": 0.03417452341971057, + "grad_norm": 6.982939720153809, + "learning_rate": 9.999657588433858e-05, + "loss": 2.479, + "step": 454 + }, + { + "epoch": 0.034249797700370725, + "grad_norm": 6.823544979095459, + "learning_rate": 9.999643172840605e-05, + "loss": 2.2048, + "step": 455 + }, + { + "epoch": 0.03432507198103088, + "grad_norm": 6.251356601715088, + "learning_rate": 9.99962846003291e-05, + "loss": 2.5377, + "step": 456 + }, + { + "epoch": 0.034400346261691035, + "grad_norm": 7.133268356323242, + "learning_rate": 9.999613450011648e-05, + "loss": 2.3033, + "step": 457 + }, + { + "epoch": 0.03447562054235119, + "grad_norm": 5.284430503845215, + "learning_rate": 9.999598142777715e-05, + "loss": 2.7628, + "step": 458 + }, + { + "epoch": 0.034550894823011345, + "grad_norm": 6.167489528656006, + "learning_rate": 9.999582538332016e-05, + "loss": 2.7558, + "step": 459 + }, + { + "epoch": 0.0346261691036715, + "grad_norm": 5.280205726623535, + "learning_rate": 9.999566636675479e-05, + "loss": 2.1525, + "step": 460 + }, + { + "epoch": 0.034701443384331655, + "grad_norm": 6.544081687927246, + "learning_rate": 9.999550437809053e-05, + "loss": 2.6361, + "step": 461 + }, + { + "epoch": 0.03477671766499182, + "grad_norm": 5.885289192199707, + "learning_rate": 9.999533941733699e-05, + "loss": 2.3613, + "step": 462 + }, + { + "epoch": 0.03485199194565197, + "grad_norm": 5.703323841094971, + "learning_rate": 9.999517148450395e-05, + "loss": 2.5808, + "step": 463 + }, + { + "epoch": 0.03492726622631213, + "grad_norm": 9.864120483398438, + "learning_rate": 9.999500057960142e-05, + "loss": 2.6428, + "step": 464 + }, + { + "epoch": 0.03500254050697228, + "grad_norm": 9.22010612487793, + "learning_rate": 9.999482670263958e-05, + "loss": 2.4964, + "step": 465 + }, + { + "epoch": 0.03507781478763244, + "grad_norm": 8.119236946105957, + "learning_rate": 9.999464985362875e-05, + "loss": 2.7858, + "step": 466 + }, + { + "epoch": 0.03515308906829259, + "grad_norm": 6.575767993927002, + "learning_rate": 9.99944700325794e-05, + "loss": 2.7414, + "step": 467 + }, + { + "epoch": 0.03522836334895275, + "grad_norm": 7.580944061279297, + "learning_rate": 9.999428723950228e-05, + "loss": 2.6357, + "step": 468 + }, + { + "epoch": 0.0353036376296129, + "grad_norm": 6.015957355499268, + "learning_rate": 9.999410147440823e-05, + "loss": 2.5034, + "step": 469 + }, + { + "epoch": 0.03537891191027306, + "grad_norm": 5.732180118560791, + "learning_rate": 9.999391273730829e-05, + "loss": 2.5332, + "step": 470 + }, + { + "epoch": 0.03545418619093321, + "grad_norm": 4.958330154418945, + "learning_rate": 9.99937210282137e-05, + "loss": 2.2508, + "step": 471 + }, + { + "epoch": 0.03552946047159337, + "grad_norm": 6.343937397003174, + "learning_rate": 9.999352634713584e-05, + "loss": 2.3269, + "step": 472 + }, + { + "epoch": 0.03560473475225352, + "grad_norm": 6.58331298828125, + "learning_rate": 9.999332869408629e-05, + "loss": 2.6838, + "step": 473 + }, + { + "epoch": 0.03568000903291368, + "grad_norm": 11.521378517150879, + "learning_rate": 9.999312806907676e-05, + "loss": 2.7741, + "step": 474 + }, + { + "epoch": 0.03575528331357383, + "grad_norm": 8.342028617858887, + "learning_rate": 9.999292447211926e-05, + "loss": 2.6406, + "step": 475 + }, + { + "epoch": 0.03583055759423399, + "grad_norm": 5.67268180847168, + "learning_rate": 9.999271790322581e-05, + "loss": 2.4836, + "step": 476 + }, + { + "epoch": 0.03590583187489414, + "grad_norm": 7.215313911437988, + "learning_rate": 9.999250836240876e-05, + "loss": 2.5318, + "step": 477 + }, + { + "epoch": 0.035981106155554304, + "grad_norm": 6.308455944061279, + "learning_rate": 9.99922958496805e-05, + "loss": 2.3813, + "step": 478 + }, + { + "epoch": 0.03605638043621446, + "grad_norm": 8.246892929077148, + "learning_rate": 9.999208036505372e-05, + "loss": 2.7915, + "step": 479 + }, + { + "epoch": 0.036131654716874614, + "grad_norm": 7.0365071296691895, + "learning_rate": 9.99918619085412e-05, + "loss": 2.3779, + "step": 480 + }, + { + "epoch": 0.03620692899753477, + "grad_norm": 5.719037055969238, + "learning_rate": 9.999164048015593e-05, + "loss": 2.7042, + "step": 481 + }, + { + "epoch": 0.036282203278194924, + "grad_norm": 6.5956315994262695, + "learning_rate": 9.999141607991107e-05, + "loss": 2.3586, + "step": 482 + }, + { + "epoch": 0.03635747755885508, + "grad_norm": 5.437718868255615, + "learning_rate": 9.999118870781996e-05, + "loss": 2.3513, + "step": 483 + }, + { + "epoch": 0.036432751839515234, + "grad_norm": 8.259861946105957, + "learning_rate": 9.999095836389613e-05, + "loss": 2.2873, + "step": 484 + }, + { + "epoch": 0.03650802612017539, + "grad_norm": 7.971058368682861, + "learning_rate": 9.999072504815328e-05, + "loss": 2.4196, + "step": 485 + }, + { + "epoch": 0.036583300400835544, + "grad_norm": 9.012106895446777, + "learning_rate": 9.999048876060525e-05, + "loss": 2.3551, + "step": 486 + }, + { + "epoch": 0.0366585746814957, + "grad_norm": 6.393646717071533, + "learning_rate": 9.999024950126609e-05, + "loss": 2.5493, + "step": 487 + }, + { + "epoch": 0.036733848962155854, + "grad_norm": 8.119710922241211, + "learning_rate": 9.999000727015007e-05, + "loss": 2.2621, + "step": 488 + }, + { + "epoch": 0.03680912324281601, + "grad_norm": 10.802091598510742, + "learning_rate": 9.998976206727153e-05, + "loss": 2.5148, + "step": 489 + }, + { + "epoch": 0.036884397523476165, + "grad_norm": 6.845826148986816, + "learning_rate": 9.998951389264508e-05, + "loss": 2.7917, + "step": 490 + }, + { + "epoch": 0.03695967180413632, + "grad_norm": 7.117809295654297, + "learning_rate": 9.998926274628546e-05, + "loss": 2.5097, + "step": 491 + }, + { + "epoch": 0.037034946084796475, + "grad_norm": 6.615505218505859, + "learning_rate": 9.99890086282076e-05, + "loss": 2.5368, + "step": 492 + }, + { + "epoch": 0.03711022036545663, + "grad_norm": 6.764005661010742, + "learning_rate": 9.998875153842662e-05, + "loss": 2.3979, + "step": 493 + }, + { + "epoch": 0.03718549464611679, + "grad_norm": 6.176251411437988, + "learning_rate": 9.99884914769578e-05, + "loss": 2.4214, + "step": 494 + }, + { + "epoch": 0.037260768926776947, + "grad_norm": 7.364236831665039, + "learning_rate": 9.998822844381659e-05, + "loss": 2.5111, + "step": 495 + }, + { + "epoch": 0.0373360432074371, + "grad_norm": 7.308414936065674, + "learning_rate": 9.998796243901862e-05, + "loss": 2.555, + "step": 496 + }, + { + "epoch": 0.03741131748809726, + "grad_norm": 8.038701057434082, + "learning_rate": 9.998769346257972e-05, + "loss": 2.6026, + "step": 497 + }, + { + "epoch": 0.03748659176875741, + "grad_norm": 7.312729835510254, + "learning_rate": 9.998742151451588e-05, + "loss": 2.8019, + "step": 498 + }, + { + "epoch": 0.03756186604941757, + "grad_norm": 7.3715128898620605, + "learning_rate": 9.998714659484326e-05, + "loss": 2.475, + "step": 499 + }, + { + "epoch": 0.03763714033007772, + "grad_norm": 8.535462379455566, + "learning_rate": 9.99868687035782e-05, + "loss": 2.2464, + "step": 500 + }, + { + "epoch": 0.03771241461073788, + "grad_norm": 5.346155166625977, + "learning_rate": 9.998658784073723e-05, + "loss": 2.3705, + "step": 501 + }, + { + "epoch": 0.03778768889139803, + "grad_norm": 6.561291217803955, + "learning_rate": 9.998630400633705e-05, + "loss": 2.5572, + "step": 502 + }, + { + "epoch": 0.03786296317205819, + "grad_norm": 6.337287902832031, + "learning_rate": 9.99860172003945e-05, + "loss": 2.2467, + "step": 503 + }, + { + "epoch": 0.03793823745271834, + "grad_norm": 6.656157493591309, + "learning_rate": 9.998572742292666e-05, + "loss": 2.1208, + "step": 504 + }, + { + "epoch": 0.0380135117333785, + "grad_norm": 5.564158916473389, + "learning_rate": 9.998543467395076e-05, + "loss": 2.4658, + "step": 505 + }, + { + "epoch": 0.03808878601403865, + "grad_norm": 8.458843231201172, + "learning_rate": 9.998513895348418e-05, + "loss": 3.1493, + "step": 506 + }, + { + "epoch": 0.03816406029469881, + "grad_norm": 6.628011703491211, + "learning_rate": 9.998484026154453e-05, + "loss": 2.367, + "step": 507 + }, + { + "epoch": 0.03823933457535896, + "grad_norm": 6.011812210083008, + "learning_rate": 9.998453859814953e-05, + "loss": 2.2363, + "step": 508 + }, + { + "epoch": 0.03831460885601912, + "grad_norm": 4.7522077560424805, + "learning_rate": 9.998423396331715e-05, + "loss": 2.5032, + "step": 509 + }, + { + "epoch": 0.03838988313667927, + "grad_norm": 5.3855133056640625, + "learning_rate": 9.998392635706547e-05, + "loss": 2.7719, + "step": 510 + }, + { + "epoch": 0.038465157417339434, + "grad_norm": 6.51707124710083, + "learning_rate": 9.998361577941279e-05, + "loss": 2.4928, + "step": 511 + }, + { + "epoch": 0.03854043169799959, + "grad_norm": 5.331462860107422, + "learning_rate": 9.998330223037757e-05, + "loss": 2.3371, + "step": 512 + }, + { + "epoch": 0.038615705978659744, + "grad_norm": 7.951931476593018, + "learning_rate": 9.998298570997846e-05, + "loss": 2.4478, + "step": 513 + }, + { + "epoch": 0.0386909802593199, + "grad_norm": 7.166647911071777, + "learning_rate": 9.998266621823426e-05, + "loss": 2.3564, + "step": 514 + }, + { + "epoch": 0.038766254539980054, + "grad_norm": 9.120553970336914, + "learning_rate": 9.998234375516396e-05, + "loss": 2.6303, + "step": 515 + }, + { + "epoch": 0.03884152882064021, + "grad_norm": 6.1128058433532715, + "learning_rate": 9.998201832078675e-05, + "loss": 2.7501, + "step": 516 + }, + { + "epoch": 0.038916803101300364, + "grad_norm": 7.445671081542969, + "learning_rate": 9.998168991512197e-05, + "loss": 2.4277, + "step": 517 + }, + { + "epoch": 0.03899207738196052, + "grad_norm": 6.254639625549316, + "learning_rate": 9.998135853818913e-05, + "loss": 2.7413, + "step": 518 + }, + { + "epoch": 0.039067351662620674, + "grad_norm": 8.722423553466797, + "learning_rate": 9.998102419000794e-05, + "loss": 2.341, + "step": 519 + }, + { + "epoch": 0.03914262594328083, + "grad_norm": 7.148705959320068, + "learning_rate": 9.998068687059827e-05, + "loss": 2.5418, + "step": 520 + }, + { + "epoch": 0.039217900223940984, + "grad_norm": 4.841706275939941, + "learning_rate": 9.998034657998018e-05, + "loss": 2.4428, + "step": 521 + }, + { + "epoch": 0.03929317450460114, + "grad_norm": 4.844259262084961, + "learning_rate": 9.99800033181739e-05, + "loss": 2.4732, + "step": 522 + }, + { + "epoch": 0.039368448785261294, + "grad_norm": 7.524613857269287, + "learning_rate": 9.997965708519984e-05, + "loss": 2.4265, + "step": 523 + }, + { + "epoch": 0.03944372306592145, + "grad_norm": 7.401834964752197, + "learning_rate": 9.997930788107854e-05, + "loss": 2.2409, + "step": 524 + }, + { + "epoch": 0.039518997346581604, + "grad_norm": 5.294851779937744, + "learning_rate": 9.997895570583082e-05, + "loss": 2.4899, + "step": 525 + }, + { + "epoch": 0.03959427162724176, + "grad_norm": 12.990023612976074, + "learning_rate": 9.997860055947758e-05, + "loss": 2.5265, + "step": 526 + }, + { + "epoch": 0.03966954590790192, + "grad_norm": 7.998562335968018, + "learning_rate": 9.997824244203995e-05, + "loss": 2.6798, + "step": 527 + }, + { + "epoch": 0.039744820188562076, + "grad_norm": 6.187966823577881, + "learning_rate": 9.997788135353919e-05, + "loss": 2.3066, + "step": 528 + }, + { + "epoch": 0.03982009446922223, + "grad_norm": 6.218991756439209, + "learning_rate": 9.99775172939968e-05, + "loss": 2.2707, + "step": 529 + }, + { + "epoch": 0.039895368749882386, + "grad_norm": 5.636419773101807, + "learning_rate": 9.997715026343442e-05, + "loss": 2.5545, + "step": 530 + }, + { + "epoch": 0.03997064303054254, + "grad_norm": 5.747351169586182, + "learning_rate": 9.997678026187385e-05, + "loss": 2.6234, + "step": 531 + }, + { + "epoch": 0.040045917311202696, + "grad_norm": 9.34416675567627, + "learning_rate": 9.997640728933708e-05, + "loss": 2.9354, + "step": 532 + }, + { + "epoch": 0.04012119159186285, + "grad_norm": 10.553180694580078, + "learning_rate": 9.99760313458463e-05, + "loss": 2.6615, + "step": 533 + }, + { + "epoch": 0.040196465872523006, + "grad_norm": 6.410879135131836, + "learning_rate": 9.997565243142386e-05, + "loss": 2.7898, + "step": 534 + }, + { + "epoch": 0.04027174015318316, + "grad_norm": 8.0240478515625, + "learning_rate": 9.997527054609229e-05, + "loss": 2.5834, + "step": 535 + }, + { + "epoch": 0.040347014433843316, + "grad_norm": 6.889857292175293, + "learning_rate": 9.997488568987425e-05, + "loss": 2.5845, + "step": 536 + }, + { + "epoch": 0.04042228871450347, + "grad_norm": 6.3225178718566895, + "learning_rate": 9.997449786279267e-05, + "loss": 2.807, + "step": 537 + }, + { + "epoch": 0.040497562995163626, + "grad_norm": 5.587867736816406, + "learning_rate": 9.997410706487058e-05, + "loss": 2.3666, + "step": 538 + }, + { + "epoch": 0.04057283727582378, + "grad_norm": 10.935953140258789, + "learning_rate": 9.997371329613123e-05, + "loss": 2.3211, + "step": 539 + }, + { + "epoch": 0.040648111556483936, + "grad_norm": 6.5122246742248535, + "learning_rate": 9.9973316556598e-05, + "loss": 2.411, + "step": 540 + }, + { + "epoch": 0.04072338583714409, + "grad_norm": 9.348227500915527, + "learning_rate": 9.99729168462945e-05, + "loss": 2.5361, + "step": 541 + }, + { + "epoch": 0.040798660117804246, + "grad_norm": 8.161140441894531, + "learning_rate": 9.997251416524446e-05, + "loss": 2.4348, + "step": 542 + }, + { + "epoch": 0.04087393439846441, + "grad_norm": 8.295262336730957, + "learning_rate": 9.997210851347187e-05, + "loss": 3.0491, + "step": 543 + }, + { + "epoch": 0.04094920867912456, + "grad_norm": 5.279495716094971, + "learning_rate": 9.997169989100081e-05, + "loss": 2.5017, + "step": 544 + }, + { + "epoch": 0.04102448295978472, + "grad_norm": 7.351101875305176, + "learning_rate": 9.997128829785555e-05, + "loss": 2.4604, + "step": 545 + }, + { + "epoch": 0.04109975724044487, + "grad_norm": 7.537226676940918, + "learning_rate": 9.997087373406063e-05, + "loss": 2.6514, + "step": 546 + }, + { + "epoch": 0.04117503152110503, + "grad_norm": 6.526947021484375, + "learning_rate": 9.997045619964061e-05, + "loss": 2.143, + "step": 547 + }, + { + "epoch": 0.04125030580176518, + "grad_norm": 8.374302864074707, + "learning_rate": 9.997003569462036e-05, + "loss": 2.5148, + "step": 548 + }, + { + "epoch": 0.04132558008242534, + "grad_norm": 6.060415744781494, + "learning_rate": 9.996961221902487e-05, + "loss": 2.3173, + "step": 549 + }, + { + "epoch": 0.04140085436308549, + "grad_norm": 6.49837589263916, + "learning_rate": 9.99691857728793e-05, + "loss": 2.5405, + "step": 550 + }, + { + "epoch": 0.04147612864374565, + "grad_norm": 5.977110862731934, + "learning_rate": 9.996875635620903e-05, + "loss": 2.3883, + "step": 551 + }, + { + "epoch": 0.0415514029244058, + "grad_norm": 5.554208278656006, + "learning_rate": 9.996832396903957e-05, + "loss": 2.4085, + "step": 552 + }, + { + "epoch": 0.04162667720506596, + "grad_norm": 7.002029895782471, + "learning_rate": 9.996788861139664e-05, + "loss": 2.039, + "step": 553 + }, + { + "epoch": 0.04170195148572611, + "grad_norm": 5.461437702178955, + "learning_rate": 9.996745028330607e-05, + "loss": 2.3954, + "step": 554 + }, + { + "epoch": 0.04177722576638627, + "grad_norm": 6.573638439178467, + "learning_rate": 9.996700898479397e-05, + "loss": 2.9218, + "step": 555 + }, + { + "epoch": 0.04185250004704642, + "grad_norm": 6.566964149475098, + "learning_rate": 9.996656471588657e-05, + "loss": 2.6249, + "step": 556 + }, + { + "epoch": 0.04192777432770658, + "grad_norm": 6.387213706970215, + "learning_rate": 9.996611747661024e-05, + "loss": 2.7764, + "step": 557 + }, + { + "epoch": 0.04200304860836673, + "grad_norm": 5.735657215118408, + "learning_rate": 9.996566726699163e-05, + "loss": 2.665, + "step": 558 + }, + { + "epoch": 0.042078322889026895, + "grad_norm": 7.310241222381592, + "learning_rate": 9.996521408705744e-05, + "loss": 2.5025, + "step": 559 + }, + { + "epoch": 0.04215359716968705, + "grad_norm": 5.92233419418335, + "learning_rate": 9.996475793683465e-05, + "loss": 2.5994, + "step": 560 + }, + { + "epoch": 0.042228871450347205, + "grad_norm": 5.580022811889648, + "learning_rate": 9.996429881635036e-05, + "loss": 2.6675, + "step": 561 + }, + { + "epoch": 0.04230414573100736, + "grad_norm": 6.15976095199585, + "learning_rate": 9.996383672563187e-05, + "loss": 2.6914, + "step": 562 + }, + { + "epoch": 0.042379420011667515, + "grad_norm": 4.10883903503418, + "learning_rate": 9.996337166470667e-05, + "loss": 2.5672, + "step": 563 + }, + { + "epoch": 0.04245469429232767, + "grad_norm": 5.744843006134033, + "learning_rate": 9.996290363360237e-05, + "loss": 3.0443, + "step": 564 + }, + { + "epoch": 0.042529968572987825, + "grad_norm": 7.262097358703613, + "learning_rate": 9.996243263234679e-05, + "loss": 2.4417, + "step": 565 + }, + { + "epoch": 0.04260524285364798, + "grad_norm": 6.471399784088135, + "learning_rate": 9.996195866096798e-05, + "loss": 2.6812, + "step": 566 + }, + { + "epoch": 0.042680517134308135, + "grad_norm": 5.091197490692139, + "learning_rate": 9.996148171949406e-05, + "loss": 2.5856, + "step": 567 + }, + { + "epoch": 0.04275579141496829, + "grad_norm": 5.39486837387085, + "learning_rate": 9.996100180795342e-05, + "loss": 2.2875, + "step": 568 + }, + { + "epoch": 0.042831065695628445, + "grad_norm": 4.359315395355225, + "learning_rate": 9.996051892637457e-05, + "loss": 2.6801, + "step": 569 + }, + { + "epoch": 0.0429063399762886, + "grad_norm": 5.395928859710693, + "learning_rate": 9.996003307478624e-05, + "loss": 2.4626, + "step": 570 + }, + { + "epoch": 0.042981614256948755, + "grad_norm": 5.215150356292725, + "learning_rate": 9.995954425321728e-05, + "loss": 2.383, + "step": 571 + }, + { + "epoch": 0.04305688853760891, + "grad_norm": 8.549391746520996, + "learning_rate": 9.995905246169675e-05, + "loss": 2.4032, + "step": 572 + }, + { + "epoch": 0.043132162818269065, + "grad_norm": 6.394818305969238, + "learning_rate": 9.995855770025393e-05, + "loss": 2.3881, + "step": 573 + }, + { + "epoch": 0.04320743709892922, + "grad_norm": 5.099479675292969, + "learning_rate": 9.995805996891819e-05, + "loss": 2.598, + "step": 574 + }, + { + "epoch": 0.04328271137958938, + "grad_norm": 8.931200981140137, + "learning_rate": 9.995755926771912e-05, + "loss": 2.2237, + "step": 575 + }, + { + "epoch": 0.04335798566024954, + "grad_norm": 8.167283058166504, + "learning_rate": 9.99570555966865e-05, + "loss": 2.5803, + "step": 576 + }, + { + "epoch": 0.04343325994090969, + "grad_norm": 6.921133518218994, + "learning_rate": 9.995654895585028e-05, + "loss": 2.4995, + "step": 577 + }, + { + "epoch": 0.04350853422156985, + "grad_norm": 5.398970603942871, + "learning_rate": 9.995603934524054e-05, + "loss": 2.6149, + "step": 578 + }, + { + "epoch": 0.04358380850223, + "grad_norm": 4.879952907562256, + "learning_rate": 9.995552676488762e-05, + "loss": 2.0798, + "step": 579 + }, + { + "epoch": 0.04365908278289016, + "grad_norm": 4.6440749168396, + "learning_rate": 9.995501121482199e-05, + "loss": 2.4392, + "step": 580 + }, + { + "epoch": 0.04373435706355031, + "grad_norm": 5.459920883178711, + "learning_rate": 9.995449269507425e-05, + "loss": 2.3939, + "step": 581 + }, + { + "epoch": 0.04380963134421047, + "grad_norm": 7.002266883850098, + "learning_rate": 9.995397120567527e-05, + "loss": 2.614, + "step": 582 + }, + { + "epoch": 0.04388490562487062, + "grad_norm": 7.5849127769470215, + "learning_rate": 9.995344674665604e-05, + "loss": 2.3912, + "step": 583 + }, + { + "epoch": 0.04396017990553078, + "grad_norm": 6.636555194854736, + "learning_rate": 9.995291931804771e-05, + "loss": 2.9888, + "step": 584 + }, + { + "epoch": 0.04403545418619093, + "grad_norm": 6.017383098602295, + "learning_rate": 9.995238891988167e-05, + "loss": 2.5443, + "step": 585 + }, + { + "epoch": 0.04411072846685109, + "grad_norm": 5.583050727844238, + "learning_rate": 9.995185555218944e-05, + "loss": 2.5157, + "step": 586 + }, + { + "epoch": 0.04418600274751124, + "grad_norm": 5.433871746063232, + "learning_rate": 9.995131921500273e-05, + "loss": 2.5615, + "step": 587 + }, + { + "epoch": 0.0442612770281714, + "grad_norm": 5.141054153442383, + "learning_rate": 9.995077990835342e-05, + "loss": 2.4829, + "step": 588 + }, + { + "epoch": 0.04433655130883155, + "grad_norm": 7.2861647605896, + "learning_rate": 9.995023763227356e-05, + "loss": 2.766, + "step": 589 + }, + { + "epoch": 0.04441182558949171, + "grad_norm": 5.399280071258545, + "learning_rate": 9.99496923867954e-05, + "loss": 2.3811, + "step": 590 + }, + { + "epoch": 0.04448709987015186, + "grad_norm": 6.386356830596924, + "learning_rate": 9.994914417195134e-05, + "loss": 2.4847, + "step": 591 + }, + { + "epoch": 0.044562374150812024, + "grad_norm": 6.586709976196289, + "learning_rate": 9.9948592987774e-05, + "loss": 2.5281, + "step": 592 + }, + { + "epoch": 0.04463764843147218, + "grad_norm": 7.796422004699707, + "learning_rate": 9.994803883429612e-05, + "loss": 2.7682, + "step": 593 + }, + { + "epoch": 0.044712922712132334, + "grad_norm": 5.9834113121032715, + "learning_rate": 9.994748171155064e-05, + "loss": 2.2488, + "step": 594 + }, + { + "epoch": 0.04478819699279249, + "grad_norm": 7.149685382843018, + "learning_rate": 9.99469216195707e-05, + "loss": 2.2831, + "step": 595 + }, + { + "epoch": 0.044863471273452644, + "grad_norm": 8.159541130065918, + "learning_rate": 9.994635855838956e-05, + "loss": 2.2218, + "step": 596 + }, + { + "epoch": 0.0449387455541128, + "grad_norm": 7.774938106536865, + "learning_rate": 9.994579252804073e-05, + "loss": 3.0546, + "step": 597 + }, + { + "epoch": 0.045014019834772954, + "grad_norm": 6.304027557373047, + "learning_rate": 9.994522352855784e-05, + "loss": 2.4802, + "step": 598 + }, + { + "epoch": 0.04508929411543311, + "grad_norm": 4.762304306030273, + "learning_rate": 9.994465155997472e-05, + "loss": 2.4738, + "step": 599 + }, + { + "epoch": 0.045164568396093265, + "grad_norm": 8.871561050415039, + "learning_rate": 9.994407662232539e-05, + "loss": 2.4376, + "step": 600 + }, + { + "epoch": 0.04523984267675342, + "grad_norm": 6.56936502456665, + "learning_rate": 9.994349871564399e-05, + "loss": 2.5849, + "step": 601 + }, + { + "epoch": 0.045315116957413575, + "grad_norm": 5.996016502380371, + "learning_rate": 9.99429178399649e-05, + "loss": 2.148, + "step": 602 + }, + { + "epoch": 0.04539039123807373, + "grad_norm": 6.419641971588135, + "learning_rate": 9.994233399532265e-05, + "loss": 2.2824, + "step": 603 + }, + { + "epoch": 0.045465665518733885, + "grad_norm": 6.68547248840332, + "learning_rate": 9.994174718175196e-05, + "loss": 2.5177, + "step": 604 + }, + { + "epoch": 0.04554093979939404, + "grad_norm": 6.640247821807861, + "learning_rate": 9.994115739928767e-05, + "loss": 3.0279, + "step": 605 + }, + { + "epoch": 0.045616214080054195, + "grad_norm": 10.76980972290039, + "learning_rate": 9.994056464796488e-05, + "loss": 2.5157, + "step": 606 + }, + { + "epoch": 0.04569148836071435, + "grad_norm": 5.20290994644165, + "learning_rate": 9.993996892781883e-05, + "loss": 2.2782, + "step": 607 + }, + { + "epoch": 0.04576676264137451, + "grad_norm": 6.822776794433594, + "learning_rate": 9.993937023888491e-05, + "loss": 2.3409, + "step": 608 + }, + { + "epoch": 0.04584203692203467, + "grad_norm": 5.55907678604126, + "learning_rate": 9.993876858119872e-05, + "loss": 2.843, + "step": 609 + }, + { + "epoch": 0.04591731120269482, + "grad_norm": 5.106736183166504, + "learning_rate": 9.993816395479603e-05, + "loss": 2.216, + "step": 610 + }, + { + "epoch": 0.04599258548335498, + "grad_norm": 8.372129440307617, + "learning_rate": 9.993755635971278e-05, + "loss": 2.1318, + "step": 611 + }, + { + "epoch": 0.04606785976401513, + "grad_norm": 3.449436664581299, + "learning_rate": 9.99369457959851e-05, + "loss": 2.6827, + "step": 612 + }, + { + "epoch": 0.04614313404467529, + "grad_norm": 5.324493408203125, + "learning_rate": 9.993633226364927e-05, + "loss": 2.618, + "step": 613 + }, + { + "epoch": 0.04621840832533544, + "grad_norm": 5.592950344085693, + "learning_rate": 9.993571576274177e-05, + "loss": 2.3601, + "step": 614 + }, + { + "epoch": 0.0462936826059956, + "grad_norm": 5.400594711303711, + "learning_rate": 9.993509629329925e-05, + "loss": 2.4891, + "step": 615 + }, + { + "epoch": 0.04636895688665575, + "grad_norm": 5.7456583976745605, + "learning_rate": 9.993447385535853e-05, + "loss": 2.5015, + "step": 616 + }, + { + "epoch": 0.04644423116731591, + "grad_norm": 5.353075981140137, + "learning_rate": 9.993384844895662e-05, + "loss": 2.773, + "step": 617 + }, + { + "epoch": 0.04651950544797606, + "grad_norm": 5.943024158477783, + "learning_rate": 9.99332200741307e-05, + "loss": 2.1428, + "step": 618 + }, + { + "epoch": 0.04659477972863622, + "grad_norm": 4.641613483428955, + "learning_rate": 9.993258873091813e-05, + "loss": 2.5845, + "step": 619 + }, + { + "epoch": 0.04667005400929637, + "grad_norm": 7.78437614440918, + "learning_rate": 9.993195441935642e-05, + "loss": 2.4693, + "step": 620 + }, + { + "epoch": 0.04674532828995653, + "grad_norm": 6.8994669914245605, + "learning_rate": 9.99313171394833e-05, + "loss": 3.0794, + "step": 621 + }, + { + "epoch": 0.04682060257061668, + "grad_norm": 4.69108772277832, + "learning_rate": 9.993067689133663e-05, + "loss": 2.355, + "step": 622 + }, + { + "epoch": 0.04689587685127684, + "grad_norm": 7.404256343841553, + "learning_rate": 9.99300336749545e-05, + "loss": 2.4818, + "step": 623 + }, + { + "epoch": 0.046971151131937, + "grad_norm": 5.823089599609375, + "learning_rate": 9.992938749037513e-05, + "loss": 2.4112, + "step": 624 + }, + { + "epoch": 0.047046425412597154, + "grad_norm": 5.970458507537842, + "learning_rate": 9.992873833763693e-05, + "loss": 2.3352, + "step": 625 + }, + { + "epoch": 0.04712169969325731, + "grad_norm": 6.043220520019531, + "learning_rate": 9.992808621677849e-05, + "loss": 2.6431, + "step": 626 + }, + { + "epoch": 0.047196973973917464, + "grad_norm": 6.244069576263428, + "learning_rate": 9.99274311278386e-05, + "loss": 2.2774, + "step": 627 + }, + { + "epoch": 0.04727224825457762, + "grad_norm": 5.437317371368408, + "learning_rate": 9.992677307085618e-05, + "loss": 2.4144, + "step": 628 + }, + { + "epoch": 0.047347522535237774, + "grad_norm": 5.5632829666137695, + "learning_rate": 9.992611204587036e-05, + "loss": 2.3307, + "step": 629 + }, + { + "epoch": 0.04742279681589793, + "grad_norm": 6.590501308441162, + "learning_rate": 9.992544805292044e-05, + "loss": 2.6062, + "step": 630 + }, + { + "epoch": 0.047498071096558084, + "grad_norm": 8.09814739227295, + "learning_rate": 9.992478109204589e-05, + "loss": 2.6395, + "step": 631 + }, + { + "epoch": 0.04757334537721824, + "grad_norm": 7.408868312835693, + "learning_rate": 9.992411116328634e-05, + "loss": 1.9976, + "step": 632 + }, + { + "epoch": 0.047648619657878394, + "grad_norm": 6.7912702560424805, + "learning_rate": 9.992343826668164e-05, + "loss": 2.6207, + "step": 633 + }, + { + "epoch": 0.04772389393853855, + "grad_norm": 6.9812726974487305, + "learning_rate": 9.992276240227176e-05, + "loss": 2.7207, + "step": 634 + }, + { + "epoch": 0.047799168219198704, + "grad_norm": 5.685427665710449, + "learning_rate": 9.992208357009692e-05, + "loss": 2.4244, + "step": 635 + }, + { + "epoch": 0.04787444249985886, + "grad_norm": 5.536280632019043, + "learning_rate": 9.992140177019745e-05, + "loss": 2.44, + "step": 636 + }, + { + "epoch": 0.047949716780519014, + "grad_norm": 9.64120864868164, + "learning_rate": 9.992071700261389e-05, + "loss": 2.6631, + "step": 637 + }, + { + "epoch": 0.04802499106117917, + "grad_norm": 8.762601852416992, + "learning_rate": 9.992002926738693e-05, + "loss": 2.4953, + "step": 638 + }, + { + "epoch": 0.048100265341839324, + "grad_norm": 5.968412399291992, + "learning_rate": 9.991933856455747e-05, + "loss": 2.5909, + "step": 639 + }, + { + "epoch": 0.048175539622499486, + "grad_norm": 6.247552871704102, + "learning_rate": 9.991864489416658e-05, + "loss": 2.2363, + "step": 640 + }, + { + "epoch": 0.04825081390315964, + "grad_norm": 4.866089344024658, + "learning_rate": 9.991794825625547e-05, + "loss": 2.3514, + "step": 641 + }, + { + "epoch": 0.048326088183819796, + "grad_norm": 6.439634799957275, + "learning_rate": 9.991724865086558e-05, + "loss": 2.387, + "step": 642 + }, + { + "epoch": 0.04840136246447995, + "grad_norm": 6.257725238800049, + "learning_rate": 9.991654607803847e-05, + "loss": 2.5005, + "step": 643 + }, + { + "epoch": 0.048476636745140106, + "grad_norm": 7.805270195007324, + "learning_rate": 9.991584053781593e-05, + "loss": 2.2804, + "step": 644 + }, + { + "epoch": 0.04855191102580026, + "grad_norm": 7.503831386566162, + "learning_rate": 9.99151320302399e-05, + "loss": 2.5358, + "step": 645 + }, + { + "epoch": 0.048627185306460416, + "grad_norm": 5.44814920425415, + "learning_rate": 9.991442055535248e-05, + "loss": 2.2454, + "step": 646 + }, + { + "epoch": 0.04870245958712057, + "grad_norm": 9.13061809539795, + "learning_rate": 9.9913706113196e-05, + "loss": 2.3681, + "step": 647 + }, + { + "epoch": 0.048777733867780726, + "grad_norm": 5.287869930267334, + "learning_rate": 9.991298870381289e-05, + "loss": 2.446, + "step": 648 + }, + { + "epoch": 0.04885300814844088, + "grad_norm": 5.838399410247803, + "learning_rate": 9.991226832724583e-05, + "loss": 2.2934, + "step": 649 + }, + { + "epoch": 0.048928282429101036, + "grad_norm": 5.242301940917969, + "learning_rate": 9.991154498353761e-05, + "loss": 2.5359, + "step": 650 + }, + { + "epoch": 0.04900355670976119, + "grad_norm": 4.780924320220947, + "learning_rate": 9.991081867273127e-05, + "loss": 2.543, + "step": 651 + }, + { + "epoch": 0.049078830990421346, + "grad_norm": 5.0087995529174805, + "learning_rate": 9.991008939486998e-05, + "loss": 2.3301, + "step": 652 + }, + { + "epoch": 0.0491541052710815, + "grad_norm": 5.926429748535156, + "learning_rate": 9.990935714999707e-05, + "loss": 2.6174, + "step": 653 + }, + { + "epoch": 0.049229379551741656, + "grad_norm": 4.909923553466797, + "learning_rate": 9.990862193815609e-05, + "loss": 2.2662, + "step": 654 + }, + { + "epoch": 0.04930465383240181, + "grad_norm": 6.0996623039245605, + "learning_rate": 9.990788375939073e-05, + "loss": 2.4263, + "step": 655 + }, + { + "epoch": 0.04937992811306197, + "grad_norm": 7.067746639251709, + "learning_rate": 9.99071426137449e-05, + "loss": 2.7269, + "step": 656 + }, + { + "epoch": 0.04945520239372213, + "grad_norm": 6.930025100708008, + "learning_rate": 9.990639850126262e-05, + "loss": 2.2417, + "step": 657 + }, + { + "epoch": 0.04953047667438228, + "grad_norm": 8.447489738464355, + "learning_rate": 9.990565142198816e-05, + "loss": 2.51, + "step": 658 + }, + { + "epoch": 0.04960575095504244, + "grad_norm": 5.6237077713012695, + "learning_rate": 9.99049013759659e-05, + "loss": 2.523, + "step": 659 + }, + { + "epoch": 0.04968102523570259, + "grad_norm": 6.538666248321533, + "learning_rate": 9.990414836324047e-05, + "loss": 2.2978, + "step": 660 + }, + { + "epoch": 0.04975629951636275, + "grad_norm": 7.247674942016602, + "learning_rate": 9.99033923838566e-05, + "loss": 2.3974, + "step": 661 + }, + { + "epoch": 0.0498315737970229, + "grad_norm": 6.012288570404053, + "learning_rate": 9.990263343785925e-05, + "loss": 2.9195, + "step": 662 + }, + { + "epoch": 0.04990684807768306, + "grad_norm": 7.676389217376709, + "learning_rate": 9.990187152529353e-05, + "loss": 2.4498, + "step": 663 + }, + { + "epoch": 0.04998212235834321, + "grad_norm": 9.682546615600586, + "learning_rate": 9.990110664620472e-05, + "loss": 2.3964, + "step": 664 + }, + { + "epoch": 0.05005739663900337, + "grad_norm": 6.515074729919434, + "learning_rate": 9.990033880063832e-05, + "loss": 2.3845, + "step": 665 + }, + { + "epoch": 0.05013267091966352, + "grad_norm": 9.541638374328613, + "learning_rate": 9.989956798863993e-05, + "loss": 3.5656, + "step": 666 + }, + { + "epoch": 0.05020794520032368, + "grad_norm": 7.824437141418457, + "learning_rate": 9.989879421025542e-05, + "loss": 2.9887, + "step": 667 + }, + { + "epoch": 0.05028321948098383, + "grad_norm": 9.796915054321289, + "learning_rate": 9.989801746553077e-05, + "loss": 2.9921, + "step": 668 + }, + { + "epoch": 0.05035849376164399, + "grad_norm": 6.329982757568359, + "learning_rate": 9.989723775451215e-05, + "loss": 2.5484, + "step": 669 + }, + { + "epoch": 0.05043376804230414, + "grad_norm": 6.734201908111572, + "learning_rate": 9.989645507724593e-05, + "loss": 2.5668, + "step": 670 + }, + { + "epoch": 0.0505090423229643, + "grad_norm": 5.232598781585693, + "learning_rate": 9.98956694337786e-05, + "loss": 2.4472, + "step": 671 + }, + { + "epoch": 0.05058431660362445, + "grad_norm": 5.023258209228516, + "learning_rate": 9.98948808241569e-05, + "loss": 2.3763, + "step": 672 + }, + { + "epoch": 0.050659590884284615, + "grad_norm": 8.550032615661621, + "learning_rate": 9.98940892484277e-05, + "loss": 2.3337, + "step": 673 + }, + { + "epoch": 0.05073486516494477, + "grad_norm": 5.128488540649414, + "learning_rate": 9.989329470663807e-05, + "loss": 2.6499, + "step": 674 + }, + { + "epoch": 0.050810139445604925, + "grad_norm": 5.089874267578125, + "learning_rate": 9.989249719883522e-05, + "loss": 2.5086, + "step": 675 + }, + { + "epoch": 0.05088541372626508, + "grad_norm": 6.769781589508057, + "learning_rate": 9.989169672506655e-05, + "loss": 2.683, + "step": 676 + }, + { + "epoch": 0.050960688006925235, + "grad_norm": 4.525973320007324, + "learning_rate": 9.989089328537969e-05, + "loss": 2.3601, + "step": 677 + }, + { + "epoch": 0.05103596228758539, + "grad_norm": 5.513026714324951, + "learning_rate": 9.989008687982238e-05, + "loss": 2.37, + "step": 678 + }, + { + "epoch": 0.051111236568245545, + "grad_norm": 6.706972599029541, + "learning_rate": 9.988927750844253e-05, + "loss": 2.0829, + "step": 679 + }, + { + "epoch": 0.0511865108489057, + "grad_norm": 5.544755935668945, + "learning_rate": 9.988846517128829e-05, + "loss": 2.9128, + "step": 680 + }, + { + "epoch": 0.051261785129565855, + "grad_norm": 5.722530841827393, + "learning_rate": 9.988764986840795e-05, + "loss": 2.5245, + "step": 681 + }, + { + "epoch": 0.05133705941022601, + "grad_norm": 4.756669998168945, + "learning_rate": 9.988683159984998e-05, + "loss": 2.3442, + "step": 682 + }, + { + "epoch": 0.051412333690886165, + "grad_norm": 7.017010688781738, + "learning_rate": 9.988601036566298e-05, + "loss": 2.2975, + "step": 683 + }, + { + "epoch": 0.05148760797154632, + "grad_norm": 4.922911643981934, + "learning_rate": 9.988518616589583e-05, + "loss": 2.5241, + "step": 684 + }, + { + "epoch": 0.051562882252206475, + "grad_norm": 4.759385108947754, + "learning_rate": 9.98843590005975e-05, + "loss": 2.1971, + "step": 685 + }, + { + "epoch": 0.05163815653286663, + "grad_norm": 5.39924430847168, + "learning_rate": 9.988352886981714e-05, + "loss": 2.6424, + "step": 686 + }, + { + "epoch": 0.051713430813526785, + "grad_norm": 5.887570381164551, + "learning_rate": 9.988269577360414e-05, + "loss": 2.3709, + "step": 687 + }, + { + "epoch": 0.05178870509418694, + "grad_norm": 4.887275218963623, + "learning_rate": 9.988185971200799e-05, + "loss": 2.395, + "step": 688 + }, + { + "epoch": 0.0518639793748471, + "grad_norm": 5.4255781173706055, + "learning_rate": 9.988102068507842e-05, + "loss": 2.4802, + "step": 689 + }, + { + "epoch": 0.05193925365550726, + "grad_norm": 5.614969253540039, + "learning_rate": 9.988017869286529e-05, + "loss": 2.1813, + "step": 690 + }, + { + "epoch": 0.05201452793616741, + "grad_norm": 7.493210792541504, + "learning_rate": 9.987933373541866e-05, + "loss": 2.3615, + "step": 691 + }, + { + "epoch": 0.05208980221682757, + "grad_norm": 5.189207077026367, + "learning_rate": 9.987848581278876e-05, + "loss": 2.2156, + "step": 692 + }, + { + "epoch": 0.05216507649748772, + "grad_norm": 6.868730068206787, + "learning_rate": 9.9877634925026e-05, + "loss": 2.5814, + "step": 693 + }, + { + "epoch": 0.05224035077814788, + "grad_norm": 3.981825590133667, + "learning_rate": 9.987678107218095e-05, + "loss": 2.4568, + "step": 694 + }, + { + "epoch": 0.05231562505880803, + "grad_norm": 6.582211971282959, + "learning_rate": 9.987592425430439e-05, + "loss": 2.4367, + "step": 695 + }, + { + "epoch": 0.05239089933946819, + "grad_norm": 5.212049961090088, + "learning_rate": 9.987506447144723e-05, + "loss": 2.5464, + "step": 696 + }, + { + "epoch": 0.05246617362012834, + "grad_norm": 6.014098644256592, + "learning_rate": 9.98742017236606e-05, + "loss": 2.1987, + "step": 697 + }, + { + "epoch": 0.0525414479007885, + "grad_norm": 6.738082408905029, + "learning_rate": 9.987333601099577e-05, + "loss": 2.4259, + "step": 698 + }, + { + "epoch": 0.05261672218144865, + "grad_norm": 5.633063316345215, + "learning_rate": 9.987246733350423e-05, + "loss": 2.5601, + "step": 699 + }, + { + "epoch": 0.05269199646210881, + "grad_norm": 8.220179557800293, + "learning_rate": 9.987159569123761e-05, + "loss": 2.6459, + "step": 700 + }, + { + "epoch": 0.05276727074276896, + "grad_norm": 5.723289966583252, + "learning_rate": 9.987072108424772e-05, + "loss": 2.4444, + "step": 701 + }, + { + "epoch": 0.05284254502342912, + "grad_norm": 5.5899481773376465, + "learning_rate": 9.986984351258656e-05, + "loss": 2.6005, + "step": 702 + }, + { + "epoch": 0.05291781930408927, + "grad_norm": 6.062432765960693, + "learning_rate": 9.98689629763063e-05, + "loss": 2.2026, + "step": 703 + }, + { + "epoch": 0.05299309358474943, + "grad_norm": 7.311080455780029, + "learning_rate": 9.986807947545927e-05, + "loss": 2.318, + "step": 704 + }, + { + "epoch": 0.05306836786540959, + "grad_norm": 5.180997848510742, + "learning_rate": 9.986719301009799e-05, + "loss": 2.4562, + "step": 705 + }, + { + "epoch": 0.053143642146069744, + "grad_norm": 10.06629467010498, + "learning_rate": 9.98663035802752e-05, + "loss": 2.4868, + "step": 706 + }, + { + "epoch": 0.0532189164267299, + "grad_norm": 6.045038223266602, + "learning_rate": 9.986541118604373e-05, + "loss": 2.4337, + "step": 707 + }, + { + "epoch": 0.053294190707390054, + "grad_norm": 4.921200752258301, + "learning_rate": 9.986451582745664e-05, + "loss": 2.4036, + "step": 708 + }, + { + "epoch": 0.05336946498805021, + "grad_norm": 4.26608419418335, + "learning_rate": 9.986361750456716e-05, + "loss": 2.2131, + "step": 709 + }, + { + "epoch": 0.053444739268710365, + "grad_norm": 5.476470470428467, + "learning_rate": 9.986271621742871e-05, + "loss": 2.7903, + "step": 710 + }, + { + "epoch": 0.05352001354937052, + "grad_norm": 7.235992908477783, + "learning_rate": 9.986181196609485e-05, + "loss": 2.2002, + "step": 711 + }, + { + "epoch": 0.053595287830030675, + "grad_norm": 4.889042854309082, + "learning_rate": 9.986090475061934e-05, + "loss": 2.3387, + "step": 712 + }, + { + "epoch": 0.05367056211069083, + "grad_norm": 5.935046672821045, + "learning_rate": 9.98599945710561e-05, + "loss": 2.3095, + "step": 713 + }, + { + "epoch": 0.053745836391350985, + "grad_norm": 4.836548805236816, + "learning_rate": 9.985908142745924e-05, + "loss": 2.3641, + "step": 714 + }, + { + "epoch": 0.05382111067201114, + "grad_norm": 6.320348262786865, + "learning_rate": 9.985816531988306e-05, + "loss": 2.4294, + "step": 715 + }, + { + "epoch": 0.053896384952671295, + "grad_norm": 6.6849446296691895, + "learning_rate": 9.985724624838201e-05, + "loss": 2.3695, + "step": 716 + }, + { + "epoch": 0.05397165923333145, + "grad_norm": 6.945517063140869, + "learning_rate": 9.985632421301073e-05, + "loss": 2.6486, + "step": 717 + }, + { + "epoch": 0.054046933513991605, + "grad_norm": 10.14905071258545, + "learning_rate": 9.985539921382402e-05, + "loss": 2.2833, + "step": 718 + }, + { + "epoch": 0.05412220779465176, + "grad_norm": 8.127327919006348, + "learning_rate": 9.98544712508769e-05, + "loss": 2.3559, + "step": 719 + }, + { + "epoch": 0.054197482075311915, + "grad_norm": 6.484206676483154, + "learning_rate": 9.98535403242245e-05, + "loss": 2.3251, + "step": 720 + }, + { + "epoch": 0.05427275635597208, + "grad_norm": 7.569386959075928, + "learning_rate": 9.985260643392215e-05, + "loss": 2.4524, + "step": 721 + }, + { + "epoch": 0.05434803063663223, + "grad_norm": 4.585761547088623, + "learning_rate": 9.985166958002543e-05, + "loss": 2.4823, + "step": 722 + }, + { + "epoch": 0.05442330491729239, + "grad_norm": 4.589804172515869, + "learning_rate": 9.985072976258999e-05, + "loss": 2.3001, + "step": 723 + }, + { + "epoch": 0.05449857919795254, + "grad_norm": 6.320410251617432, + "learning_rate": 9.984978698167167e-05, + "loss": 2.3685, + "step": 724 + }, + { + "epoch": 0.0545738534786127, + "grad_norm": 6.319190979003906, + "learning_rate": 9.984884123732657e-05, + "loss": 2.1191, + "step": 725 + }, + { + "epoch": 0.05464912775927285, + "grad_norm": 7.725295543670654, + "learning_rate": 9.984789252961088e-05, + "loss": 2.4234, + "step": 726 + }, + { + "epoch": 0.05472440203993301, + "grad_norm": 5.5601959228515625, + "learning_rate": 9.984694085858105e-05, + "loss": 2.5882, + "step": 727 + }, + { + "epoch": 0.05479967632059316, + "grad_norm": 6.580722332000732, + "learning_rate": 9.984598622429356e-05, + "loss": 2.7962, + "step": 728 + }, + { + "epoch": 0.05487495060125332, + "grad_norm": 6.733530521392822, + "learning_rate": 9.984502862680524e-05, + "loss": 2.2311, + "step": 729 + }, + { + "epoch": 0.05495022488191347, + "grad_norm": 6.550483226776123, + "learning_rate": 9.984406806617298e-05, + "loss": 2.6386, + "step": 730 + }, + { + "epoch": 0.05502549916257363, + "grad_norm": 8.7761812210083, + "learning_rate": 9.984310454245389e-05, + "loss": 3.1246, + "step": 731 + }, + { + "epoch": 0.05510077344323378, + "grad_norm": 5.58363151550293, + "learning_rate": 9.984213805570527e-05, + "loss": 2.9234, + "step": 732 + }, + { + "epoch": 0.05517604772389394, + "grad_norm": 7.577469348907471, + "learning_rate": 9.984116860598454e-05, + "loss": 2.532, + "step": 733 + }, + { + "epoch": 0.05525132200455409, + "grad_norm": 6.627183437347412, + "learning_rate": 9.984019619334935e-05, + "loss": 2.4154, + "step": 734 + }, + { + "epoch": 0.05532659628521425, + "grad_norm": 6.652490139007568, + "learning_rate": 9.98392208178575e-05, + "loss": 2.1275, + "step": 735 + }, + { + "epoch": 0.0554018705658744, + "grad_norm": 6.850701808929443, + "learning_rate": 9.983824247956699e-05, + "loss": 2.0242, + "step": 736 + }, + { + "epoch": 0.055477144846534564, + "grad_norm": 8.733121871948242, + "learning_rate": 9.983726117853595e-05, + "loss": 2.6301, + "step": 737 + }, + { + "epoch": 0.05555241912719472, + "grad_norm": 5.286831855773926, + "learning_rate": 9.983627691482273e-05, + "loss": 2.8511, + "step": 738 + }, + { + "epoch": 0.055627693407854874, + "grad_norm": 7.479588985443115, + "learning_rate": 9.983528968848585e-05, + "loss": 2.3986, + "step": 739 + }, + { + "epoch": 0.05570296768851503, + "grad_norm": 4.2901153564453125, + "learning_rate": 9.983429949958401e-05, + "loss": 2.4922, + "step": 740 + }, + { + "epoch": 0.055778241969175184, + "grad_norm": 4.9687323570251465, + "learning_rate": 9.983330634817604e-05, + "loss": 2.6474, + "step": 741 + }, + { + "epoch": 0.05585351624983534, + "grad_norm": 5.911922454833984, + "learning_rate": 9.9832310234321e-05, + "loss": 2.2, + "step": 742 + }, + { + "epoch": 0.055928790530495494, + "grad_norm": 5.424359321594238, + "learning_rate": 9.983131115807809e-05, + "loss": 2.464, + "step": 743 + }, + { + "epoch": 0.05600406481115565, + "grad_norm": 6.540555000305176, + "learning_rate": 9.983030911950671e-05, + "loss": 2.2448, + "step": 744 + }, + { + "epoch": 0.056079339091815804, + "grad_norm": 5.263029098510742, + "learning_rate": 9.982930411866644e-05, + "loss": 2.3292, + "step": 745 + }, + { + "epoch": 0.05615461337247596, + "grad_norm": 4.780468940734863, + "learning_rate": 9.982829615561703e-05, + "loss": 2.4589, + "step": 746 + }, + { + "epoch": 0.056229887653136114, + "grad_norm": 7.426937103271484, + "learning_rate": 9.982728523041838e-05, + "loss": 2.0081, + "step": 747 + }, + { + "epoch": 0.05630516193379627, + "grad_norm": 5.775896072387695, + "learning_rate": 9.982627134313058e-05, + "loss": 2.369, + "step": 748 + }, + { + "epoch": 0.056380436214456424, + "grad_norm": 6.047148704528809, + "learning_rate": 9.982525449381392e-05, + "loss": 2.1239, + "step": 749 + }, + { + "epoch": 0.05645571049511658, + "grad_norm": 5.515091419219971, + "learning_rate": 9.982423468252886e-05, + "loss": 2.5432, + "step": 750 + }, + { + "epoch": 0.056530984775776734, + "grad_norm": 5.722801685333252, + "learning_rate": 9.9823211909336e-05, + "loss": 2.8707, + "step": 751 + }, + { + "epoch": 0.05660625905643689, + "grad_norm": 4.457740783691406, + "learning_rate": 9.982218617429615e-05, + "loss": 2.4601, + "step": 752 + }, + { + "epoch": 0.056681533337097044, + "grad_norm": 6.8544182777404785, + "learning_rate": 9.982115747747029e-05, + "loss": 2.0331, + "step": 753 + }, + { + "epoch": 0.056756807617757206, + "grad_norm": 7.249621868133545, + "learning_rate": 9.982012581891958e-05, + "loss": 2.2857, + "step": 754 + }, + { + "epoch": 0.05683208189841736, + "grad_norm": 6.304576396942139, + "learning_rate": 9.981909119870531e-05, + "loss": 2.5246, + "step": 755 + }, + { + "epoch": 0.056907356179077516, + "grad_norm": 5.6655497550964355, + "learning_rate": 9.981805361688906e-05, + "loss": 2.3135, + "step": 756 + }, + { + "epoch": 0.05698263045973767, + "grad_norm": 7.412888050079346, + "learning_rate": 9.981701307353243e-05, + "loss": 2.6853, + "step": 757 + }, + { + "epoch": 0.057057904740397826, + "grad_norm": 6.149228096008301, + "learning_rate": 9.981596956869732e-05, + "loss": 2.2792, + "step": 758 + }, + { + "epoch": 0.05713317902105798, + "grad_norm": 4.461668968200684, + "learning_rate": 9.981492310244576e-05, + "loss": 2.429, + "step": 759 + }, + { + "epoch": 0.057208453301718136, + "grad_norm": 8.047645568847656, + "learning_rate": 9.981387367483997e-05, + "loss": 2.5823, + "step": 760 + }, + { + "epoch": 0.05728372758237829, + "grad_norm": 5.71897029876709, + "learning_rate": 9.981282128594232e-05, + "loss": 2.3675, + "step": 761 + }, + { + "epoch": 0.057359001863038446, + "grad_norm": 5.366356372833252, + "learning_rate": 9.981176593581537e-05, + "loss": 2.2436, + "step": 762 + }, + { + "epoch": 0.0574342761436986, + "grad_norm": 6.393449306488037, + "learning_rate": 9.981070762452184e-05, + "loss": 1.9928, + "step": 763 + }, + { + "epoch": 0.057509550424358756, + "grad_norm": 5.603578090667725, + "learning_rate": 9.980964635212469e-05, + "loss": 2.5008, + "step": 764 + }, + { + "epoch": 0.05758482470501891, + "grad_norm": 5.958922863006592, + "learning_rate": 9.980858211868697e-05, + "loss": 2.295, + "step": 765 + }, + { + "epoch": 0.057660098985679066, + "grad_norm": 7.110422134399414, + "learning_rate": 9.980751492427197e-05, + "loss": 2.0474, + "step": 766 + }, + { + "epoch": 0.05773537326633922, + "grad_norm": 7.016932964324951, + "learning_rate": 9.980644476894311e-05, + "loss": 2.3051, + "step": 767 + }, + { + "epoch": 0.057810647546999376, + "grad_norm": 7.762566089630127, + "learning_rate": 9.980537165276402e-05, + "loss": 2.3683, + "step": 768 + }, + { + "epoch": 0.05788592182765953, + "grad_norm": 7.152153491973877, + "learning_rate": 9.98042955757985e-05, + "loss": 2.5303, + "step": 769 + }, + { + "epoch": 0.05796119610831969, + "grad_norm": 9.040239334106445, + "learning_rate": 9.980321653811051e-05, + "loss": 2.4154, + "step": 770 + }, + { + "epoch": 0.05803647038897985, + "grad_norm": 6.318729877471924, + "learning_rate": 9.980213453976421e-05, + "loss": 2.3276, + "step": 771 + }, + { + "epoch": 0.05811174466964, + "grad_norm": 7.08339262008667, + "learning_rate": 9.980104958082389e-05, + "loss": 2.3597, + "step": 772 + }, + { + "epoch": 0.05818701895030016, + "grad_norm": 6.9691386222839355, + "learning_rate": 9.979996166135408e-05, + "loss": 2.5083, + "step": 773 + }, + { + "epoch": 0.05826229323096031, + "grad_norm": 7.71740198135376, + "learning_rate": 9.979887078141945e-05, + "loss": 2.1033, + "step": 774 + }, + { + "epoch": 0.05833756751162047, + "grad_norm": 4.721714019775391, + "learning_rate": 9.979777694108483e-05, + "loss": 2.2884, + "step": 775 + }, + { + "epoch": 0.05841284179228062, + "grad_norm": 4.897715091705322, + "learning_rate": 9.979668014041524e-05, + "loss": 2.4906, + "step": 776 + }, + { + "epoch": 0.05848811607294078, + "grad_norm": 5.300808429718018, + "learning_rate": 9.979558037947591e-05, + "loss": 2.2899, + "step": 777 + }, + { + "epoch": 0.05856339035360093, + "grad_norm": 6.205648899078369, + "learning_rate": 9.979447765833221e-05, + "loss": 2.0926, + "step": 778 + }, + { + "epoch": 0.05863866463426109, + "grad_norm": 5.6532464027404785, + "learning_rate": 9.979337197704969e-05, + "loss": 2.4447, + "step": 779 + }, + { + "epoch": 0.05871393891492124, + "grad_norm": 8.568648338317871, + "learning_rate": 9.97922633356941e-05, + "loss": 2.1264, + "step": 780 + }, + { + "epoch": 0.0587892131955814, + "grad_norm": 5.700965404510498, + "learning_rate": 9.979115173433128e-05, + "loss": 2.491, + "step": 781 + }, + { + "epoch": 0.05886448747624155, + "grad_norm": 7.4272894859313965, + "learning_rate": 9.97900371730274e-05, + "loss": 2.3558, + "step": 782 + }, + { + "epoch": 0.05893976175690171, + "grad_norm": 5.166018009185791, + "learning_rate": 9.978891965184864e-05, + "loss": 2.5376, + "step": 783 + }, + { + "epoch": 0.05901503603756186, + "grad_norm": 8.077032089233398, + "learning_rate": 9.97877991708615e-05, + "loss": 2.1797, + "step": 784 + }, + { + "epoch": 0.05909031031822202, + "grad_norm": 7.215895652770996, + "learning_rate": 9.978667573013253e-05, + "loss": 2.5612, + "step": 785 + }, + { + "epoch": 0.05916558459888218, + "grad_norm": 8.063216209411621, + "learning_rate": 9.978554932972855e-05, + "loss": 2.4893, + "step": 786 + }, + { + "epoch": 0.059240858879542335, + "grad_norm": 9.267261505126953, + "learning_rate": 9.97844199697165e-05, + "loss": 2.4537, + "step": 787 + }, + { + "epoch": 0.05931613316020249, + "grad_norm": 9.832077980041504, + "learning_rate": 9.978328765016355e-05, + "loss": 2.5387, + "step": 788 + }, + { + "epoch": 0.059391407440862645, + "grad_norm": 5.216586112976074, + "learning_rate": 9.978215237113699e-05, + "loss": 2.5711, + "step": 789 + }, + { + "epoch": 0.0594666817215228, + "grad_norm": 6.500823974609375, + "learning_rate": 9.978101413270431e-05, + "loss": 2.7454, + "step": 790 + }, + { + "epoch": 0.059541956002182955, + "grad_norm": 6.540772914886475, + "learning_rate": 9.977987293493317e-05, + "loss": 1.9508, + "step": 791 + }, + { + "epoch": 0.05961723028284311, + "grad_norm": 6.135433197021484, + "learning_rate": 9.977872877789143e-05, + "loss": 2.1823, + "step": 792 + }, + { + "epoch": 0.059692504563503265, + "grad_norm": 6.370469093322754, + "learning_rate": 9.977758166164707e-05, + "loss": 2.2792, + "step": 793 + }, + { + "epoch": 0.05976777884416342, + "grad_norm": 5.538809299468994, + "learning_rate": 9.977643158626835e-05, + "loss": 2.4347, + "step": 794 + }, + { + "epoch": 0.059843053124823575, + "grad_norm": 7.072920799255371, + "learning_rate": 9.977527855182357e-05, + "loss": 2.2672, + "step": 795 + }, + { + "epoch": 0.05991832740548373, + "grad_norm": 7.1014509201049805, + "learning_rate": 9.97741225583813e-05, + "loss": 3.158, + "step": 796 + }, + { + "epoch": 0.059993601686143885, + "grad_norm": 6.903295040130615, + "learning_rate": 9.977296360601027e-05, + "loss": 2.5023, + "step": 797 + }, + { + "epoch": 0.06006887596680404, + "grad_norm": 5.6039323806762695, + "learning_rate": 9.977180169477937e-05, + "loss": 2.6403, + "step": 798 + }, + { + "epoch": 0.060144150247464195, + "grad_norm": 5.642335891723633, + "learning_rate": 9.977063682475767e-05, + "loss": 2.513, + "step": 799 + }, + { + "epoch": 0.06021942452812435, + "grad_norm": 7.554859638214111, + "learning_rate": 9.976946899601441e-05, + "loss": 2.1054, + "step": 800 + }, + { + "epoch": 0.060294698808784505, + "grad_norm": 7.001617431640625, + "learning_rate": 9.976829820861904e-05, + "loss": 2.3187, + "step": 801 + }, + { + "epoch": 0.06036997308944467, + "grad_norm": 6.228675842285156, + "learning_rate": 9.976712446264113e-05, + "loss": 2.068, + "step": 802 + }, + { + "epoch": 0.06044524737010482, + "grad_norm": 6.299452304840088, + "learning_rate": 9.976594775815047e-05, + "loss": 2.3287, + "step": 803 + }, + { + "epoch": 0.06052052165076498, + "grad_norm": 6.488040924072266, + "learning_rate": 9.9764768095217e-05, + "loss": 2.3386, + "step": 804 + }, + { + "epoch": 0.06059579593142513, + "grad_norm": 7.455612659454346, + "learning_rate": 9.976358547391088e-05, + "loss": 2.4758, + "step": 805 + }, + { + "epoch": 0.06067107021208529, + "grad_norm": 5.75909423828125, + "learning_rate": 9.976239989430239e-05, + "loss": 2.2389, + "step": 806 + }, + { + "epoch": 0.06074634449274544, + "grad_norm": 6.22749662399292, + "learning_rate": 9.976121135646199e-05, + "loss": 2.7626, + "step": 807 + }, + { + "epoch": 0.0608216187734056, + "grad_norm": 6.732685565948486, + "learning_rate": 9.976001986046036e-05, + "loss": 3.0899, + "step": 808 + }, + { + "epoch": 0.06089689305406575, + "grad_norm": 7.681735992431641, + "learning_rate": 9.975882540636834e-05, + "loss": 2.4637, + "step": 809 + }, + { + "epoch": 0.06097216733472591, + "grad_norm": 4.422317981719971, + "learning_rate": 9.975762799425693e-05, + "loss": 2.2814, + "step": 810 + }, + { + "epoch": 0.06104744161538606, + "grad_norm": 9.311334609985352, + "learning_rate": 9.975642762419728e-05, + "loss": 2.5256, + "step": 811 + }, + { + "epoch": 0.06112271589604622, + "grad_norm": 7.12771463394165, + "learning_rate": 9.975522429626079e-05, + "loss": 2.5477, + "step": 812 + }, + { + "epoch": 0.06119799017670637, + "grad_norm": 5.93684720993042, + "learning_rate": 9.975401801051897e-05, + "loss": 2.3224, + "step": 813 + }, + { + "epoch": 0.06127326445736653, + "grad_norm": 5.034492015838623, + "learning_rate": 9.975280876704353e-05, + "loss": 2.382, + "step": 814 + }, + { + "epoch": 0.06134853873802668, + "grad_norm": 7.287402629852295, + "learning_rate": 9.97515965659064e-05, + "loss": 2.6205, + "step": 815 + }, + { + "epoch": 0.06142381301868684, + "grad_norm": 6.875100612640381, + "learning_rate": 9.975038140717959e-05, + "loss": 2.4561, + "step": 816 + }, + { + "epoch": 0.06149908729934699, + "grad_norm": 6.032425403594971, + "learning_rate": 9.974916329093535e-05, + "loss": 2.4004, + "step": 817 + }, + { + "epoch": 0.061574361580007154, + "grad_norm": 7.3820295333862305, + "learning_rate": 9.97479422172461e-05, + "loss": 2.1098, + "step": 818 + }, + { + "epoch": 0.06164963586066731, + "grad_norm": 4.1829142570495605, + "learning_rate": 9.974671818618444e-05, + "loss": 2.3091, + "step": 819 + }, + { + "epoch": 0.061724910141327465, + "grad_norm": 5.894301414489746, + "learning_rate": 9.974549119782311e-05, + "loss": 2.4662, + "step": 820 + }, + { + "epoch": 0.06180018442198762, + "grad_norm": 5.369170665740967, + "learning_rate": 9.974426125223507e-05, + "loss": 2.2282, + "step": 821 + }, + { + "epoch": 0.061875458702647775, + "grad_norm": 5.439139366149902, + "learning_rate": 9.974302834949343e-05, + "loss": 2.5761, + "step": 822 + }, + { + "epoch": 0.06195073298330793, + "grad_norm": 5.655567169189453, + "learning_rate": 9.974179248967148e-05, + "loss": 2.155, + "step": 823 + }, + { + "epoch": 0.062026007263968085, + "grad_norm": 7.799777030944824, + "learning_rate": 9.97405536728427e-05, + "loss": 2.4568, + "step": 824 + }, + { + "epoch": 0.06210128154462824, + "grad_norm": 6.679040431976318, + "learning_rate": 9.973931189908073e-05, + "loss": 2.6116, + "step": 825 + }, + { + "epoch": 0.062176555825288395, + "grad_norm": 4.565517902374268, + "learning_rate": 9.973806716845937e-05, + "loss": 2.2689, + "step": 826 + }, + { + "epoch": 0.06225183010594855, + "grad_norm": 8.25964069366455, + "learning_rate": 9.973681948105263e-05, + "loss": 2.2068, + "step": 827 + }, + { + "epoch": 0.062327104386608705, + "grad_norm": 5.433253765106201, + "learning_rate": 9.973556883693469e-05, + "loss": 2.4395, + "step": 828 + }, + { + "epoch": 0.06240237866726886, + "grad_norm": 7.11344575881958, + "learning_rate": 9.97343152361799e-05, + "loss": 2.2614, + "step": 829 + }, + { + "epoch": 0.062477652947929015, + "grad_norm": 5.474208354949951, + "learning_rate": 9.973305867886276e-05, + "loss": 2.152, + "step": 830 + }, + { + "epoch": 0.06255292722858917, + "grad_norm": 6.635583877563477, + "learning_rate": 9.9731799165058e-05, + "loss": 2.3749, + "step": 831 + }, + { + "epoch": 0.06262820150924933, + "grad_norm": 6.661943435668945, + "learning_rate": 9.973053669484045e-05, + "loss": 2.354, + "step": 832 + }, + { + "epoch": 0.06270347578990948, + "grad_norm": 5.574874401092529, + "learning_rate": 9.97292712682852e-05, + "loss": 2.0282, + "step": 833 + }, + { + "epoch": 0.06277875007056964, + "grad_norm": 5.962992191314697, + "learning_rate": 9.972800288546746e-05, + "loss": 2.44, + "step": 834 + }, + { + "epoch": 0.06285402435122979, + "grad_norm": 6.058572292327881, + "learning_rate": 9.972673154646264e-05, + "loss": 2.4435, + "step": 835 + }, + { + "epoch": 0.06292929863188995, + "grad_norm": 6.082131385803223, + "learning_rate": 9.97254572513463e-05, + "loss": 2.4655, + "step": 836 + }, + { + "epoch": 0.0630045729125501, + "grad_norm": 6.920094966888428, + "learning_rate": 9.972418000019422e-05, + "loss": 2.2728, + "step": 837 + }, + { + "epoch": 0.06307984719321026, + "grad_norm": 6.013792514801025, + "learning_rate": 9.972289979308229e-05, + "loss": 2.2486, + "step": 838 + }, + { + "epoch": 0.06315512147387041, + "grad_norm": 6.150375843048096, + "learning_rate": 9.972161663008666e-05, + "loss": 2.2234, + "step": 839 + }, + { + "epoch": 0.06323039575453057, + "grad_norm": 6.13953971862793, + "learning_rate": 9.972033051128357e-05, + "loss": 2.2833, + "step": 840 + }, + { + "epoch": 0.06330567003519072, + "grad_norm": 6.005297660827637, + "learning_rate": 9.971904143674951e-05, + "loss": 2.6507, + "step": 841 + }, + { + "epoch": 0.06338094431585088, + "grad_norm": 5.4421772956848145, + "learning_rate": 9.971774940656109e-05, + "loss": 2.6106, + "step": 842 + }, + { + "epoch": 0.06345621859651104, + "grad_norm": 6.671179294586182, + "learning_rate": 9.971645442079513e-05, + "loss": 2.743, + "step": 843 + }, + { + "epoch": 0.06353149287717119, + "grad_norm": 4.749128818511963, + "learning_rate": 9.971515647952861e-05, + "loss": 2.8267, + "step": 844 + }, + { + "epoch": 0.06360676715783135, + "grad_norm": 4.922209739685059, + "learning_rate": 9.971385558283868e-05, + "loss": 2.6258, + "step": 845 + }, + { + "epoch": 0.0636820414384915, + "grad_norm": 7.677456378936768, + "learning_rate": 9.971255173080267e-05, + "loss": 2.4213, + "step": 846 + }, + { + "epoch": 0.06375731571915166, + "grad_norm": 5.307931423187256, + "learning_rate": 9.971124492349812e-05, + "loss": 2.7683, + "step": 847 + }, + { + "epoch": 0.06383258999981181, + "grad_norm": 6.525881767272949, + "learning_rate": 9.970993516100269e-05, + "loss": 2.2817, + "step": 848 + }, + { + "epoch": 0.06390786428047197, + "grad_norm": 6.345904350280762, + "learning_rate": 9.970862244339425e-05, + "loss": 2.1466, + "step": 849 + }, + { + "epoch": 0.06398313856113212, + "grad_norm": 5.944136142730713, + "learning_rate": 9.970730677075082e-05, + "loss": 2.3249, + "step": 850 + }, + { + "epoch": 0.06405841284179228, + "grad_norm": 6.007492542266846, + "learning_rate": 9.970598814315065e-05, + "loss": 2.3649, + "step": 851 + }, + { + "epoch": 0.06413368712245243, + "grad_norm": 5.802694320678711, + "learning_rate": 9.97046665606721e-05, + "loss": 2.7615, + "step": 852 + }, + { + "epoch": 0.0642089614031126, + "grad_norm": 9.016657829284668, + "learning_rate": 9.970334202339373e-05, + "loss": 2.37, + "step": 853 + }, + { + "epoch": 0.06428423568377274, + "grad_norm": 6.420614242553711, + "learning_rate": 9.97020145313943e-05, + "loss": 2.1526, + "step": 854 + }, + { + "epoch": 0.0643595099644329, + "grad_norm": 6.139922142028809, + "learning_rate": 9.970068408475272e-05, + "loss": 2.6131, + "step": 855 + }, + { + "epoch": 0.06443478424509305, + "grad_norm": 5.153894901275635, + "learning_rate": 9.969935068354807e-05, + "loss": 2.3414, + "step": 856 + }, + { + "epoch": 0.06451005852575321, + "grad_norm": 8.590038299560547, + "learning_rate": 9.969801432785965e-05, + "loss": 2.5912, + "step": 857 + }, + { + "epoch": 0.06458533280641336, + "grad_norm": 5.471381664276123, + "learning_rate": 9.969667501776685e-05, + "loss": 2.1089, + "step": 858 + }, + { + "epoch": 0.06466060708707352, + "grad_norm": 4.682456970214844, + "learning_rate": 9.969533275334933e-05, + "loss": 2.4857, + "step": 859 + }, + { + "epoch": 0.06473588136773369, + "grad_norm": 6.165603160858154, + "learning_rate": 9.969398753468686e-05, + "loss": 2.5575, + "step": 860 + }, + { + "epoch": 0.06481115564839383, + "grad_norm": 6.946765422821045, + "learning_rate": 9.969263936185943e-05, + "loss": 2.5268, + "step": 861 + }, + { + "epoch": 0.064886429929054, + "grad_norm": 5.101597309112549, + "learning_rate": 9.969128823494717e-05, + "loss": 2.6535, + "step": 862 + }, + { + "epoch": 0.06496170420971414, + "grad_norm": 6.410821437835693, + "learning_rate": 9.96899341540304e-05, + "loss": 2.6071, + "step": 863 + }, + { + "epoch": 0.0650369784903743, + "grad_norm": 5.042365074157715, + "learning_rate": 9.968857711918963e-05, + "loss": 2.4744, + "step": 864 + }, + { + "epoch": 0.06511225277103445, + "grad_norm": 5.671431064605713, + "learning_rate": 9.968721713050552e-05, + "loss": 2.4302, + "step": 865 + }, + { + "epoch": 0.06518752705169462, + "grad_norm": 5.558021068572998, + "learning_rate": 9.968585418805891e-05, + "loss": 2.2342, + "step": 866 + }, + { + "epoch": 0.06526280133235476, + "grad_norm": 5.795393466949463, + "learning_rate": 9.968448829193085e-05, + "loss": 2.639, + "step": 867 + }, + { + "epoch": 0.06533807561301493, + "grad_norm": 5.995419025421143, + "learning_rate": 9.968311944220251e-05, + "loss": 2.3098, + "step": 868 + }, + { + "epoch": 0.06541334989367507, + "grad_norm": 4.014678001403809, + "learning_rate": 9.968174763895529e-05, + "loss": 1.9536, + "step": 869 + }, + { + "epoch": 0.06548862417433524, + "grad_norm": 4.1848859786987305, + "learning_rate": 9.96803728822707e-05, + "loss": 2.2224, + "step": 870 + }, + { + "epoch": 0.06556389845499538, + "grad_norm": 5.406251430511475, + "learning_rate": 9.967899517223051e-05, + "loss": 2.2532, + "step": 871 + }, + { + "epoch": 0.06563917273565555, + "grad_norm": 7.1709465980529785, + "learning_rate": 9.96776145089166e-05, + "loss": 2.3149, + "step": 872 + }, + { + "epoch": 0.0657144470163157, + "grad_norm": 5.1394195556640625, + "learning_rate": 9.967623089241105e-05, + "loss": 2.332, + "step": 873 + }, + { + "epoch": 0.06578972129697586, + "grad_norm": 5.810478687286377, + "learning_rate": 9.967484432279611e-05, + "loss": 2.4215, + "step": 874 + }, + { + "epoch": 0.06586499557763602, + "grad_norm": 5.735139846801758, + "learning_rate": 9.96734548001542e-05, + "loss": 2.2825, + "step": 875 + }, + { + "epoch": 0.06594026985829617, + "grad_norm": 6.34000301361084, + "learning_rate": 9.967206232456794e-05, + "loss": 2.3447, + "step": 876 + }, + { + "epoch": 0.06601554413895633, + "grad_norm": 6.4331889152526855, + "learning_rate": 9.967066689612009e-05, + "loss": 2.6042, + "step": 877 + }, + { + "epoch": 0.06609081841961648, + "grad_norm": 6.684754848480225, + "learning_rate": 9.966926851489363e-05, + "loss": 2.1852, + "step": 878 + }, + { + "epoch": 0.06616609270027664, + "grad_norm": 6.291850566864014, + "learning_rate": 9.966786718097165e-05, + "loss": 2.0725, + "step": 879 + }, + { + "epoch": 0.06624136698093679, + "grad_norm": 7.072674751281738, + "learning_rate": 9.96664628944375e-05, + "loss": 2.4599, + "step": 880 + }, + { + "epoch": 0.06631664126159695, + "grad_norm": 5.791036605834961, + "learning_rate": 9.966505565537463e-05, + "loss": 2.5734, + "step": 881 + }, + { + "epoch": 0.0663919155422571, + "grad_norm": 7.2464919090271, + "learning_rate": 9.96636454638667e-05, + "loss": 2.8932, + "step": 882 + }, + { + "epoch": 0.06646718982291726, + "grad_norm": 5.751157760620117, + "learning_rate": 9.966223231999757e-05, + "loss": 2.1895, + "step": 883 + }, + { + "epoch": 0.0665424641035774, + "grad_norm": 5.123445510864258, + "learning_rate": 9.966081622385122e-05, + "loss": 2.0499, + "step": 884 + }, + { + "epoch": 0.06661773838423757, + "grad_norm": 6.297735691070557, + "learning_rate": 9.965939717551183e-05, + "loss": 2.4749, + "step": 885 + }, + { + "epoch": 0.06669301266489772, + "grad_norm": 4.268171310424805, + "learning_rate": 9.965797517506377e-05, + "loss": 2.5821, + "step": 886 + }, + { + "epoch": 0.06676828694555788, + "grad_norm": 7.6090474128723145, + "learning_rate": 9.965655022259157e-05, + "loss": 2.2455, + "step": 887 + }, + { + "epoch": 0.06684356122621803, + "grad_norm": 5.460054397583008, + "learning_rate": 9.965512231817992e-05, + "loss": 2.3015, + "step": 888 + }, + { + "epoch": 0.06691883550687819, + "grad_norm": 5.82401180267334, + "learning_rate": 9.965369146191376e-05, + "loss": 2.156, + "step": 889 + }, + { + "epoch": 0.06699410978753834, + "grad_norm": 7.957887172698975, + "learning_rate": 9.965225765387809e-05, + "loss": 2.4905, + "step": 890 + }, + { + "epoch": 0.0670693840681985, + "grad_norm": 6.796222686767578, + "learning_rate": 9.965082089415819e-05, + "loss": 2.3148, + "step": 891 + }, + { + "epoch": 0.06714465834885866, + "grad_norm": 4.408276557922363, + "learning_rate": 9.964938118283944e-05, + "loss": 2.1697, + "step": 892 + }, + { + "epoch": 0.06721993262951881, + "grad_norm": 6.927859306335449, + "learning_rate": 9.964793852000744e-05, + "loss": 2.0336, + "step": 893 + }, + { + "epoch": 0.06729520691017897, + "grad_norm": 6.1446146965026855, + "learning_rate": 9.964649290574794e-05, + "loss": 2.3938, + "step": 894 + }, + { + "epoch": 0.06737048119083912, + "grad_norm": 6.0834832191467285, + "learning_rate": 9.964504434014691e-05, + "loss": 2.4393, + "step": 895 + }, + { + "epoch": 0.06744575547149928, + "grad_norm": 5.697766304016113, + "learning_rate": 9.964359282329044e-05, + "loss": 2.2426, + "step": 896 + }, + { + "epoch": 0.06752102975215943, + "grad_norm": 7.894393444061279, + "learning_rate": 9.96421383552648e-05, + "loss": 2.5878, + "step": 897 + }, + { + "epoch": 0.06759630403281959, + "grad_norm": 5.909679412841797, + "learning_rate": 9.964068093615649e-05, + "loss": 2.3413, + "step": 898 + }, + { + "epoch": 0.06767157831347974, + "grad_norm": 6.938018798828125, + "learning_rate": 9.963922056605213e-05, + "loss": 2.2678, + "step": 899 + }, + { + "epoch": 0.0677468525941399, + "grad_norm": 8.607162475585938, + "learning_rate": 9.963775724503853e-05, + "loss": 2.7139, + "step": 900 + }, + { + "epoch": 0.06782212687480005, + "grad_norm": 6.983379364013672, + "learning_rate": 9.96362909732027e-05, + "loss": 2.3579, + "step": 901 + }, + { + "epoch": 0.06789740115546021, + "grad_norm": 7.071916103363037, + "learning_rate": 9.963482175063179e-05, + "loss": 2.7323, + "step": 902 + }, + { + "epoch": 0.06797267543612036, + "grad_norm": 5.51076602935791, + "learning_rate": 9.963334957741315e-05, + "loss": 2.4123, + "step": 903 + }, + { + "epoch": 0.06804794971678052, + "grad_norm": 6.5850653648376465, + "learning_rate": 9.963187445363428e-05, + "loss": 2.4006, + "step": 904 + }, + { + "epoch": 0.06812322399744067, + "grad_norm": 4.215870380401611, + "learning_rate": 9.96303963793829e-05, + "loss": 2.3117, + "step": 905 + }, + { + "epoch": 0.06819849827810083, + "grad_norm": 6.121519565582275, + "learning_rate": 9.962891535474685e-05, + "loss": 2.3227, + "step": 906 + }, + { + "epoch": 0.06827377255876099, + "grad_norm": 6.652968883514404, + "learning_rate": 9.962743137981417e-05, + "loss": 2.0054, + "step": 907 + }, + { + "epoch": 0.06834904683942114, + "grad_norm": 6.960452079772949, + "learning_rate": 9.96259444546731e-05, + "loss": 2.5732, + "step": 908 + }, + { + "epoch": 0.0684243211200813, + "grad_norm": 5.3289713859558105, + "learning_rate": 9.962445457941204e-05, + "loss": 2.3569, + "step": 909 + }, + { + "epoch": 0.06849959540074145, + "grad_norm": 6.253427028656006, + "learning_rate": 9.962296175411954e-05, + "loss": 2.4584, + "step": 910 + }, + { + "epoch": 0.06857486968140161, + "grad_norm": 6.991831302642822, + "learning_rate": 9.962146597888434e-05, + "loss": 2.2947, + "step": 911 + }, + { + "epoch": 0.06865014396206176, + "grad_norm": 5.575552940368652, + "learning_rate": 9.961996725379537e-05, + "loss": 2.3399, + "step": 912 + }, + { + "epoch": 0.06872541824272192, + "grad_norm": 6.069404602050781, + "learning_rate": 9.961846557894173e-05, + "loss": 2.357, + "step": 913 + }, + { + "epoch": 0.06880069252338207, + "grad_norm": 6.642460346221924, + "learning_rate": 9.961696095441266e-05, + "loss": 2.5456, + "step": 914 + }, + { + "epoch": 0.06887596680404223, + "grad_norm": 7.879169940948486, + "learning_rate": 9.961545338029765e-05, + "loss": 2.3147, + "step": 915 + }, + { + "epoch": 0.06895124108470238, + "grad_norm": 5.864014625549316, + "learning_rate": 9.961394285668629e-05, + "loss": 2.2817, + "step": 916 + }, + { + "epoch": 0.06902651536536254, + "grad_norm": 5.330665111541748, + "learning_rate": 9.96124293836684e-05, + "loss": 2.3965, + "step": 917 + }, + { + "epoch": 0.06910178964602269, + "grad_norm": 6.035168170928955, + "learning_rate": 9.961091296133391e-05, + "loss": 2.4772, + "step": 918 + }, + { + "epoch": 0.06917706392668285, + "grad_norm": 5.027256488800049, + "learning_rate": 9.960939358977301e-05, + "loss": 2.3338, + "step": 919 + }, + { + "epoch": 0.069252338207343, + "grad_norm": 6.097067356109619, + "learning_rate": 9.960787126907602e-05, + "loss": 2.4752, + "step": 920 + }, + { + "epoch": 0.06932761248800316, + "grad_norm": 5.557241439819336, + "learning_rate": 9.96063459993334e-05, + "loss": 2.3751, + "step": 921 + }, + { + "epoch": 0.06940288676866331, + "grad_norm": 7.162276268005371, + "learning_rate": 9.960481778063586e-05, + "loss": 2.0992, + "step": 922 + }, + { + "epoch": 0.06947816104932347, + "grad_norm": 5.234118938446045, + "learning_rate": 9.960328661307423e-05, + "loss": 2.4994, + "step": 923 + }, + { + "epoch": 0.06955343532998363, + "grad_norm": 6.395406246185303, + "learning_rate": 9.960175249673956e-05, + "loss": 2.2585, + "step": 924 + }, + { + "epoch": 0.06962870961064378, + "grad_norm": 7.6956329345703125, + "learning_rate": 9.960021543172299e-05, + "loss": 2.4022, + "step": 925 + }, + { + "epoch": 0.06970398389130394, + "grad_norm": 5.352781772613525, + "learning_rate": 9.959867541811596e-05, + "loss": 2.1947, + "step": 926 + }, + { + "epoch": 0.06977925817196409, + "grad_norm": 6.12201452255249, + "learning_rate": 9.959713245600996e-05, + "loss": 1.9226, + "step": 927 + }, + { + "epoch": 0.06985453245262425, + "grad_norm": 8.42553424835205, + "learning_rate": 9.95955865454968e-05, + "loss": 2.1042, + "step": 928 + }, + { + "epoch": 0.0699298067332844, + "grad_norm": 7.184745788574219, + "learning_rate": 9.959403768666828e-05, + "loss": 2.092, + "step": 929 + }, + { + "epoch": 0.07000508101394456, + "grad_norm": 7.369794845581055, + "learning_rate": 9.959248587961652e-05, + "loss": 2.1832, + "step": 930 + }, + { + "epoch": 0.07008035529460471, + "grad_norm": 5.493983268737793, + "learning_rate": 9.959093112443378e-05, + "loss": 2.313, + "step": 931 + }, + { + "epoch": 0.07015562957526487, + "grad_norm": 7.570969104766846, + "learning_rate": 9.958937342121249e-05, + "loss": 2.6078, + "step": 932 + }, + { + "epoch": 0.07023090385592502, + "grad_norm": 7.0704216957092285, + "learning_rate": 9.958781277004523e-05, + "loss": 2.3047, + "step": 933 + }, + { + "epoch": 0.07030617813658518, + "grad_norm": 5.484086513519287, + "learning_rate": 9.958624917102479e-05, + "loss": 2.2524, + "step": 934 + }, + { + "epoch": 0.07038145241724533, + "grad_norm": 8.462700843811035, + "learning_rate": 9.958468262424409e-05, + "loss": 2.0558, + "step": 935 + }, + { + "epoch": 0.0704567266979055, + "grad_norm": 6.400140285491943, + "learning_rate": 9.958311312979631e-05, + "loss": 3.0189, + "step": 936 + }, + { + "epoch": 0.07053200097856564, + "grad_norm": 6.533931255340576, + "learning_rate": 9.958154068777473e-05, + "loss": 2.7761, + "step": 937 + }, + { + "epoch": 0.0706072752592258, + "grad_norm": 5.321656227111816, + "learning_rate": 9.95799652982728e-05, + "loss": 2.0451, + "step": 938 + }, + { + "epoch": 0.07068254953988595, + "grad_norm": 4.879770755767822, + "learning_rate": 9.95783869613842e-05, + "loss": 2.1618, + "step": 939 + }, + { + "epoch": 0.07075782382054611, + "grad_norm": 6.507859230041504, + "learning_rate": 9.957680567720275e-05, + "loss": 2.1516, + "step": 940 + }, + { + "epoch": 0.07083309810120628, + "grad_norm": 9.765229225158691, + "learning_rate": 9.957522144582245e-05, + "loss": 3.2496, + "step": 941 + }, + { + "epoch": 0.07090837238186642, + "grad_norm": 6.177280426025391, + "learning_rate": 9.957363426733751e-05, + "loss": 2.2814, + "step": 942 + }, + { + "epoch": 0.07098364666252659, + "grad_norm": 6.372262477874756, + "learning_rate": 9.957204414184223e-05, + "loss": 2.453, + "step": 943 + }, + { + "epoch": 0.07105892094318673, + "grad_norm": 5.343485355377197, + "learning_rate": 9.957045106943115e-05, + "loss": 2.3823, + "step": 944 + }, + { + "epoch": 0.0711341952238469, + "grad_norm": 5.006737232208252, + "learning_rate": 9.956885505019901e-05, + "loss": 2.457, + "step": 945 + }, + { + "epoch": 0.07120946950450704, + "grad_norm": 4.654512405395508, + "learning_rate": 9.956725608424066e-05, + "loss": 2.2822, + "step": 946 + }, + { + "epoch": 0.0712847437851672, + "grad_norm": 5.320466041564941, + "learning_rate": 9.956565417165117e-05, + "loss": 2.1134, + "step": 947 + }, + { + "epoch": 0.07136001806582735, + "grad_norm": 4.9236249923706055, + "learning_rate": 9.956404931252574e-05, + "loss": 2.4609, + "step": 948 + }, + { + "epoch": 0.07143529234648752, + "grad_norm": 5.646245002746582, + "learning_rate": 9.956244150695981e-05, + "loss": 2.3735, + "step": 949 + }, + { + "epoch": 0.07151056662714766, + "grad_norm": 5.308044910430908, + "learning_rate": 9.956083075504894e-05, + "loss": 2.4017, + "step": 950 + }, + { + "epoch": 0.07158584090780783, + "grad_norm": 6.247456073760986, + "learning_rate": 9.955921705688889e-05, + "loss": 2.3095, + "step": 951 + }, + { + "epoch": 0.07166111518846797, + "grad_norm": 5.998600006103516, + "learning_rate": 9.955760041257558e-05, + "loss": 2.2016, + "step": 952 + }, + { + "epoch": 0.07173638946912814, + "grad_norm": 5.084904670715332, + "learning_rate": 9.955598082220513e-05, + "loss": 2.5664, + "step": 953 + }, + { + "epoch": 0.07181166374978828, + "grad_norm": 6.240522384643555, + "learning_rate": 9.95543582858738e-05, + "loss": 2.1274, + "step": 954 + }, + { + "epoch": 0.07188693803044845, + "grad_norm": 5.399087429046631, + "learning_rate": 9.955273280367807e-05, + "loss": 2.204, + "step": 955 + }, + { + "epoch": 0.07196221231110861, + "grad_norm": 4.183984756469727, + "learning_rate": 9.955110437571456e-05, + "loss": 2.179, + "step": 956 + }, + { + "epoch": 0.07203748659176876, + "grad_norm": 5.170600891113281, + "learning_rate": 9.954947300208007e-05, + "loss": 2.455, + "step": 957 + }, + { + "epoch": 0.07211276087242892, + "grad_norm": 6.372200012207031, + "learning_rate": 9.954783868287157e-05, + "loss": 2.2084, + "step": 958 + }, + { + "epoch": 0.07218803515308907, + "grad_norm": 4.859469413757324, + "learning_rate": 9.954620141818623e-05, + "loss": 2.421, + "step": 959 + }, + { + "epoch": 0.07226330943374923, + "grad_norm": 5.183671474456787, + "learning_rate": 9.954456120812139e-05, + "loss": 2.4034, + "step": 960 + }, + { + "epoch": 0.07233858371440938, + "grad_norm": 7.792542457580566, + "learning_rate": 9.954291805277455e-05, + "loss": 2.2416, + "step": 961 + }, + { + "epoch": 0.07241385799506954, + "grad_norm": 4.288693904876709, + "learning_rate": 9.954127195224337e-05, + "loss": 1.9048, + "step": 962 + }, + { + "epoch": 0.07248913227572969, + "grad_norm": 5.301598072052002, + "learning_rate": 9.953962290662573e-05, + "loss": 2.4557, + "step": 963 + }, + { + "epoch": 0.07256440655638985, + "grad_norm": 5.720681190490723, + "learning_rate": 9.953797091601967e-05, + "loss": 2.4226, + "step": 964 + }, + { + "epoch": 0.07263968083705, + "grad_norm": 4.384246349334717, + "learning_rate": 9.953631598052335e-05, + "loss": 2.1714, + "step": 965 + }, + { + "epoch": 0.07271495511771016, + "grad_norm": 6.421280860900879, + "learning_rate": 9.95346581002352e-05, + "loss": 2.2805, + "step": 966 + }, + { + "epoch": 0.0727902293983703, + "grad_norm": 6.632420539855957, + "learning_rate": 9.953299727525376e-05, + "loss": 2.2177, + "step": 967 + }, + { + "epoch": 0.07286550367903047, + "grad_norm": 5.725123405456543, + "learning_rate": 9.953133350567774e-05, + "loss": 2.5709, + "step": 968 + }, + { + "epoch": 0.07294077795969062, + "grad_norm": 5.322400093078613, + "learning_rate": 9.952966679160606e-05, + "loss": 2.1169, + "step": 969 + }, + { + "epoch": 0.07301605224035078, + "grad_norm": 6.844789981842041, + "learning_rate": 9.952799713313783e-05, + "loss": 2.2264, + "step": 970 + }, + { + "epoch": 0.07309132652101093, + "grad_norm": 5.416439056396484, + "learning_rate": 9.952632453037227e-05, + "loss": 2.4751, + "step": 971 + }, + { + "epoch": 0.07316660080167109, + "grad_norm": 6.6972856521606445, + "learning_rate": 9.952464898340883e-05, + "loss": 2.471, + "step": 972 + }, + { + "epoch": 0.07324187508233125, + "grad_norm": 6.15932035446167, + "learning_rate": 9.952297049234712e-05, + "loss": 2.2028, + "step": 973 + }, + { + "epoch": 0.0733171493629914, + "grad_norm": 6.374386310577393, + "learning_rate": 9.952128905728688e-05, + "loss": 2.2344, + "step": 974 + }, + { + "epoch": 0.07339242364365156, + "grad_norm": 4.6305694580078125, + "learning_rate": 9.951960467832813e-05, + "loss": 2.2047, + "step": 975 + }, + { + "epoch": 0.07346769792431171, + "grad_norm": 5.737475395202637, + "learning_rate": 9.951791735557094e-05, + "loss": 2.0497, + "step": 976 + }, + { + "epoch": 0.07354297220497187, + "grad_norm": 7.1963019371032715, + "learning_rate": 9.951622708911567e-05, + "loss": 2.3527, + "step": 977 + }, + { + "epoch": 0.07361824648563202, + "grad_norm": 6.804144382476807, + "learning_rate": 9.951453387906277e-05, + "loss": 2.3246, + "step": 978 + }, + { + "epoch": 0.07369352076629218, + "grad_norm": 7.030019760131836, + "learning_rate": 9.951283772551289e-05, + "loss": 2.1328, + "step": 979 + }, + { + "epoch": 0.07376879504695233, + "grad_norm": 9.475345611572266, + "learning_rate": 9.95111386285669e-05, + "loss": 2.3515, + "step": 980 + }, + { + "epoch": 0.07384406932761249, + "grad_norm": 4.585552215576172, + "learning_rate": 9.950943658832577e-05, + "loss": 1.99, + "step": 981 + }, + { + "epoch": 0.07391934360827264, + "grad_norm": 7.107287883758545, + "learning_rate": 9.950773160489069e-05, + "loss": 2.7024, + "step": 982 + }, + { + "epoch": 0.0739946178889328, + "grad_norm": 6.360047817230225, + "learning_rate": 9.950602367836303e-05, + "loss": 2.6684, + "step": 983 + }, + { + "epoch": 0.07406989216959295, + "grad_norm": 7.048748970031738, + "learning_rate": 9.950431280884429e-05, + "loss": 2.547, + "step": 984 + }, + { + "epoch": 0.07414516645025311, + "grad_norm": 5.756119728088379, + "learning_rate": 9.950259899643622e-05, + "loss": 2.2265, + "step": 985 + }, + { + "epoch": 0.07422044073091326, + "grad_norm": 4.780413627624512, + "learning_rate": 9.950088224124068e-05, + "loss": 2.3658, + "step": 986 + }, + { + "epoch": 0.07429571501157342, + "grad_norm": 5.234473705291748, + "learning_rate": 9.94991625433597e-05, + "loss": 2.1723, + "step": 987 + }, + { + "epoch": 0.07437098929223358, + "grad_norm": 5.499043941497803, + "learning_rate": 9.949743990289555e-05, + "loss": 2.6353, + "step": 988 + }, + { + "epoch": 0.07444626357289373, + "grad_norm": 6.970423698425293, + "learning_rate": 9.949571431995062e-05, + "loss": 2.3523, + "step": 989 + }, + { + "epoch": 0.07452153785355389, + "grad_norm": 5.077739238739014, + "learning_rate": 9.949398579462749e-05, + "loss": 2.3519, + "step": 990 + }, + { + "epoch": 0.07459681213421404, + "grad_norm": 3.6100821495056152, + "learning_rate": 9.949225432702893e-05, + "loss": 2.2624, + "step": 991 + }, + { + "epoch": 0.0746720864148742, + "grad_norm": 6.948885917663574, + "learning_rate": 9.949051991725786e-05, + "loss": 2.4027, + "step": 992 + }, + { + "epoch": 0.07474736069553435, + "grad_norm": 8.069746017456055, + "learning_rate": 9.948878256541738e-05, + "loss": 2.3889, + "step": 993 + }, + { + "epoch": 0.07482263497619451, + "grad_norm": 5.7547197341918945, + "learning_rate": 9.948704227161078e-05, + "loss": 2.4589, + "step": 994 + }, + { + "epoch": 0.07489790925685466, + "grad_norm": 4.529244422912598, + "learning_rate": 9.94852990359415e-05, + "loss": 2.3446, + "step": 995 + }, + { + "epoch": 0.07497318353751482, + "grad_norm": 5.505538463592529, + "learning_rate": 9.948355285851318e-05, + "loss": 2.1697, + "step": 996 + }, + { + "epoch": 0.07504845781817497, + "grad_norm": 4.478546142578125, + "learning_rate": 9.948180373942963e-05, + "loss": 1.9482, + "step": 997 + }, + { + "epoch": 0.07512373209883513, + "grad_norm": 7.179813861846924, + "learning_rate": 9.948005167879485e-05, + "loss": 2.4936, + "step": 998 + }, + { + "epoch": 0.07519900637949528, + "grad_norm": 6.788231372833252, + "learning_rate": 9.947829667671294e-05, + "loss": 2.0441, + "step": 999 + }, + { + "epoch": 0.07527428066015544, + "grad_norm": 6.892848014831543, + "learning_rate": 9.947653873328828e-05, + "loss": 2.2042, + "step": 1000 + }, + { + "epoch": 0.07534955494081559, + "grad_norm": 6.119129180908203, + "learning_rate": 9.947477784862536e-05, + "loss": 2.4574, + "step": 1001 + }, + { + "epoch": 0.07542482922147575, + "grad_norm": 4.81908655166626, + "learning_rate": 9.947301402282884e-05, + "loss": 2.275, + "step": 1002 + }, + { + "epoch": 0.0755001035021359, + "grad_norm": 5.189309597015381, + "learning_rate": 9.947124725600359e-05, + "loss": 2.3772, + "step": 1003 + }, + { + "epoch": 0.07557537778279606, + "grad_norm": 7.3246636390686035, + "learning_rate": 9.946947754825464e-05, + "loss": 2.6769, + "step": 1004 + }, + { + "epoch": 0.07565065206345623, + "grad_norm": 4.19852876663208, + "learning_rate": 9.94677048996872e-05, + "loss": 2.3871, + "step": 1005 + }, + { + "epoch": 0.07572592634411637, + "grad_norm": 7.656721591949463, + "learning_rate": 9.946592931040666e-05, + "loss": 2.063, + "step": 1006 + }, + { + "epoch": 0.07580120062477654, + "grad_norm": 6.189587593078613, + "learning_rate": 9.946415078051853e-05, + "loss": 2.3819, + "step": 1007 + }, + { + "epoch": 0.07587647490543668, + "grad_norm": 6.411081790924072, + "learning_rate": 9.946236931012858e-05, + "loss": 2.4864, + "step": 1008 + }, + { + "epoch": 0.07595174918609685, + "grad_norm": 5.585498332977295, + "learning_rate": 9.946058489934269e-05, + "loss": 2.2988, + "step": 1009 + }, + { + "epoch": 0.076027023466757, + "grad_norm": 8.20356559753418, + "learning_rate": 9.945879754826695e-05, + "loss": 2.4969, + "step": 1010 + }, + { + "epoch": 0.07610229774741716, + "grad_norm": 4.942919731140137, + "learning_rate": 9.94570072570076e-05, + "loss": 2.7698, + "step": 1011 + }, + { + "epoch": 0.0761775720280773, + "grad_norm": 6.5601043701171875, + "learning_rate": 9.945521402567109e-05, + "loss": 2.3243, + "step": 1012 + }, + { + "epoch": 0.07625284630873747, + "grad_norm": 5.879470348358154, + "learning_rate": 9.9453417854364e-05, + "loss": 2.1877, + "step": 1013 + }, + { + "epoch": 0.07632812058939761, + "grad_norm": 5.421281337738037, + "learning_rate": 9.945161874319311e-05, + "loss": 2.2029, + "step": 1014 + }, + { + "epoch": 0.07640339487005778, + "grad_norm": 5.667712211608887, + "learning_rate": 9.944981669226539e-05, + "loss": 2.1336, + "step": 1015 + }, + { + "epoch": 0.07647866915071792, + "grad_norm": 8.215184211730957, + "learning_rate": 9.944801170168796e-05, + "loss": 2.1728, + "step": 1016 + }, + { + "epoch": 0.07655394343137809, + "grad_norm": 5.038661479949951, + "learning_rate": 9.94462037715681e-05, + "loss": 2.5447, + "step": 1017 + }, + { + "epoch": 0.07662921771203823, + "grad_norm": 5.737669944763184, + "learning_rate": 9.944439290201331e-05, + "loss": 2.335, + "step": 1018 + }, + { + "epoch": 0.0767044919926984, + "grad_norm": 4.581604957580566, + "learning_rate": 9.944257909313124e-05, + "loss": 2.2242, + "step": 1019 + }, + { + "epoch": 0.07677976627335854, + "grad_norm": 7.627722263336182, + "learning_rate": 9.94407623450297e-05, + "loss": 2.6063, + "step": 1020 + }, + { + "epoch": 0.0768550405540187, + "grad_norm": 4.925320148468018, + "learning_rate": 9.94389426578167e-05, + "loss": 2.3832, + "step": 1021 + }, + { + "epoch": 0.07693031483467887, + "grad_norm": 6.603039741516113, + "learning_rate": 9.943712003160043e-05, + "loss": 2.2791, + "step": 1022 + }, + { + "epoch": 0.07700558911533902, + "grad_norm": 4.396380424499512, + "learning_rate": 9.943529446648921e-05, + "loss": 2.1878, + "step": 1023 + }, + { + "epoch": 0.07708086339599918, + "grad_norm": 4.721848011016846, + "learning_rate": 9.943346596259159e-05, + "loss": 2.3043, + "step": 1024 + }, + { + "epoch": 0.07715613767665933, + "grad_norm": 6.192013740539551, + "learning_rate": 9.943163452001625e-05, + "loss": 2.3955, + "step": 1025 + }, + { + "epoch": 0.07723141195731949, + "grad_norm": 6.906729221343994, + "learning_rate": 9.942980013887207e-05, + "loss": 2.4838, + "step": 1026 + }, + { + "epoch": 0.07730668623797964, + "grad_norm": 7.221069812774658, + "learning_rate": 9.942796281926812e-05, + "loss": 2.2178, + "step": 1027 + }, + { + "epoch": 0.0773819605186398, + "grad_norm": 6.018310070037842, + "learning_rate": 9.942612256131359e-05, + "loss": 2.6929, + "step": 1028 + }, + { + "epoch": 0.07745723479929995, + "grad_norm": 5.666262149810791, + "learning_rate": 9.94242793651179e-05, + "loss": 2.4844, + "step": 1029 + }, + { + "epoch": 0.07753250907996011, + "grad_norm": 7.3587260246276855, + "learning_rate": 9.942243323079061e-05, + "loss": 2.0608, + "step": 1030 + }, + { + "epoch": 0.07760778336062026, + "grad_norm": 5.538628101348877, + "learning_rate": 9.942058415844147e-05, + "loss": 2.1712, + "step": 1031 + }, + { + "epoch": 0.07768305764128042, + "grad_norm": 6.028646945953369, + "learning_rate": 9.941873214818042e-05, + "loss": 2.4612, + "step": 1032 + }, + { + "epoch": 0.07775833192194057, + "grad_norm": 7.019837379455566, + "learning_rate": 9.941687720011753e-05, + "loss": 2.2944, + "step": 1033 + }, + { + "epoch": 0.07783360620260073, + "grad_norm": 7.470941543579102, + "learning_rate": 9.941501931436309e-05, + "loss": 2.0832, + "step": 1034 + }, + { + "epoch": 0.07790888048326088, + "grad_norm": 6.686201095581055, + "learning_rate": 9.941315849102753e-05, + "loss": 2.1468, + "step": 1035 + }, + { + "epoch": 0.07798415476392104, + "grad_norm": 5.702566146850586, + "learning_rate": 9.94112947302215e-05, + "loss": 2.7865, + "step": 1036 + }, + { + "epoch": 0.0780594290445812, + "grad_norm": 5.05864953994751, + "learning_rate": 9.940942803205575e-05, + "loss": 2.2278, + "step": 1037 + }, + { + "epoch": 0.07813470332524135, + "grad_norm": 6.330860614776611, + "learning_rate": 9.940755839664128e-05, + "loss": 2.2412, + "step": 1038 + }, + { + "epoch": 0.07820997760590151, + "grad_norm": 4.715969085693359, + "learning_rate": 9.940568582408924e-05, + "loss": 2.3136, + "step": 1039 + }, + { + "epoch": 0.07828525188656166, + "grad_norm": 6.241091251373291, + "learning_rate": 9.940381031451094e-05, + "loss": 2.1582, + "step": 1040 + }, + { + "epoch": 0.07836052616722182, + "grad_norm": 4.697853088378906, + "learning_rate": 9.940193186801787e-05, + "loss": 2.4335, + "step": 1041 + }, + { + "epoch": 0.07843580044788197, + "grad_norm": 4.922056674957275, + "learning_rate": 9.94000504847217e-05, + "loss": 2.1039, + "step": 1042 + }, + { + "epoch": 0.07851107472854213, + "grad_norm": 5.535480976104736, + "learning_rate": 9.939816616473427e-05, + "loss": 2.151, + "step": 1043 + }, + { + "epoch": 0.07858634900920228, + "grad_norm": 4.560698986053467, + "learning_rate": 9.93962789081676e-05, + "loss": 2.4929, + "step": 1044 + }, + { + "epoch": 0.07866162328986244, + "grad_norm": 5.94008207321167, + "learning_rate": 9.939438871513388e-05, + "loss": 2.3912, + "step": 1045 + }, + { + "epoch": 0.07873689757052259, + "grad_norm": 5.139965057373047, + "learning_rate": 9.939249558574548e-05, + "loss": 1.9442, + "step": 1046 + }, + { + "epoch": 0.07881217185118275, + "grad_norm": 5.666169166564941, + "learning_rate": 9.939059952011494e-05, + "loss": 2.258, + "step": 1047 + }, + { + "epoch": 0.0788874461318429, + "grad_norm": 4.650025844573975, + "learning_rate": 9.938870051835498e-05, + "loss": 2.4298, + "step": 1048 + }, + { + "epoch": 0.07896272041250306, + "grad_norm": 5.10502815246582, + "learning_rate": 9.938679858057848e-05, + "loss": 2.3377, + "step": 1049 + }, + { + "epoch": 0.07903799469316321, + "grad_norm": 5.629189491271973, + "learning_rate": 9.938489370689853e-05, + "loss": 2.4516, + "step": 1050 + }, + { + "epoch": 0.07911326897382337, + "grad_norm": 5.972681522369385, + "learning_rate": 9.938298589742834e-05, + "loss": 2.4691, + "step": 1051 + }, + { + "epoch": 0.07918854325448352, + "grad_norm": 4.494705677032471, + "learning_rate": 9.938107515228133e-05, + "loss": 2.5649, + "step": 1052 + }, + { + "epoch": 0.07926381753514368, + "grad_norm": 4.915561199188232, + "learning_rate": 9.93791614715711e-05, + "loss": 2.1048, + "step": 1053 + }, + { + "epoch": 0.07933909181580384, + "grad_norm": 4.736732006072998, + "learning_rate": 9.93772448554114e-05, + "loss": 2.4447, + "step": 1054 + }, + { + "epoch": 0.07941436609646399, + "grad_norm": 3.9177684783935547, + "learning_rate": 9.937532530391617e-05, + "loss": 2.2234, + "step": 1055 + }, + { + "epoch": 0.07948964037712415, + "grad_norm": 5.615268707275391, + "learning_rate": 9.937340281719952e-05, + "loss": 2.35, + "step": 1056 + }, + { + "epoch": 0.0795649146577843, + "grad_norm": 4.6389007568359375, + "learning_rate": 9.937147739537576e-05, + "loss": 2.2979, + "step": 1057 + }, + { + "epoch": 0.07964018893844446, + "grad_norm": 4.469980239868164, + "learning_rate": 9.93695490385593e-05, + "loss": 2.4946, + "step": 1058 + }, + { + "epoch": 0.07971546321910461, + "grad_norm": 4.239686965942383, + "learning_rate": 9.936761774686482e-05, + "loss": 1.9174, + "step": 1059 + }, + { + "epoch": 0.07979073749976477, + "grad_norm": 4.816890239715576, + "learning_rate": 9.936568352040714e-05, + "loss": 2.5983, + "step": 1060 + }, + { + "epoch": 0.07986601178042492, + "grad_norm": 5.86488151550293, + "learning_rate": 9.936374635930119e-05, + "loss": 2.3391, + "step": 1061 + }, + { + "epoch": 0.07994128606108508, + "grad_norm": 5.060385704040527, + "learning_rate": 9.936180626366218e-05, + "loss": 2.1666, + "step": 1062 + }, + { + "epoch": 0.08001656034174523, + "grad_norm": 5.208842754364014, + "learning_rate": 9.935986323360541e-05, + "loss": 2.2212, + "step": 1063 + }, + { + "epoch": 0.08009183462240539, + "grad_norm": 4.977725982666016, + "learning_rate": 9.935791726924642e-05, + "loss": 2.3255, + "step": 1064 + }, + { + "epoch": 0.08016710890306554, + "grad_norm": 4.706028461456299, + "learning_rate": 9.935596837070085e-05, + "loss": 2.0912, + "step": 1065 + }, + { + "epoch": 0.0802423831837257, + "grad_norm": 4.670478343963623, + "learning_rate": 9.935401653808458e-05, + "loss": 2.1627, + "step": 1066 + }, + { + "epoch": 0.08031765746438585, + "grad_norm": 7.468380928039551, + "learning_rate": 9.935206177151364e-05, + "loss": 2.2187, + "step": 1067 + }, + { + "epoch": 0.08039293174504601, + "grad_norm": 4.354898929595947, + "learning_rate": 9.935010407110424e-05, + "loss": 1.9525, + "step": 1068 + }, + { + "epoch": 0.08046820602570617, + "grad_norm": 5.351611614227295, + "learning_rate": 9.934814343697275e-05, + "loss": 2.3182, + "step": 1069 + }, + { + "epoch": 0.08054348030636632, + "grad_norm": 5.501046657562256, + "learning_rate": 9.934617986923574e-05, + "loss": 2.111, + "step": 1070 + }, + { + "epoch": 0.08061875458702648, + "grad_norm": 4.17952299118042, + "learning_rate": 9.934421336800992e-05, + "loss": 2.5661, + "step": 1071 + }, + { + "epoch": 0.08069402886768663, + "grad_norm": 5.963723659515381, + "learning_rate": 9.934224393341219e-05, + "loss": 2.3498, + "step": 1072 + }, + { + "epoch": 0.0807693031483468, + "grad_norm": 5.3424224853515625, + "learning_rate": 9.934027156555965e-05, + "loss": 2.3266, + "step": 1073 + }, + { + "epoch": 0.08084457742900694, + "grad_norm": 5.322235107421875, + "learning_rate": 9.933829626456953e-05, + "loss": 2.3117, + "step": 1074 + }, + { + "epoch": 0.0809198517096671, + "grad_norm": 6.136697292327881, + "learning_rate": 9.933631803055927e-05, + "loss": 2.2734, + "step": 1075 + }, + { + "epoch": 0.08099512599032725, + "grad_norm": 5.181178092956543, + "learning_rate": 9.933433686364646e-05, + "loss": 2.1142, + "step": 1076 + }, + { + "epoch": 0.08107040027098741, + "grad_norm": 4.634887218475342, + "learning_rate": 9.933235276394887e-05, + "loss": 2.4871, + "step": 1077 + }, + { + "epoch": 0.08114567455164756, + "grad_norm": 9.338444709777832, + "learning_rate": 9.933036573158449e-05, + "loss": 2.3129, + "step": 1078 + }, + { + "epoch": 0.08122094883230772, + "grad_norm": 5.21658182144165, + "learning_rate": 9.93283757666714e-05, + "loss": 2.3391, + "step": 1079 + }, + { + "epoch": 0.08129622311296787, + "grad_norm": 4.9668989181518555, + "learning_rate": 9.93263828693279e-05, + "loss": 2.2192, + "step": 1080 + }, + { + "epoch": 0.08137149739362803, + "grad_norm": 8.038995742797852, + "learning_rate": 9.932438703967246e-05, + "loss": 2.3, + "step": 1081 + }, + { + "epoch": 0.08144677167428818, + "grad_norm": 6.618760108947754, + "learning_rate": 9.932238827782376e-05, + "loss": 2.393, + "step": 1082 + }, + { + "epoch": 0.08152204595494834, + "grad_norm": 6.852845668792725, + "learning_rate": 9.93203865839006e-05, + "loss": 2.0772, + "step": 1083 + }, + { + "epoch": 0.08159732023560849, + "grad_norm": 8.266192436218262, + "learning_rate": 9.931838195802196e-05, + "loss": 2.2675, + "step": 1084 + }, + { + "epoch": 0.08167259451626865, + "grad_norm": 5.701829433441162, + "learning_rate": 9.931637440030702e-05, + "loss": 2.1978, + "step": 1085 + }, + { + "epoch": 0.08174786879692882, + "grad_norm": 4.683379173278809, + "learning_rate": 9.931436391087514e-05, + "loss": 2.2586, + "step": 1086 + }, + { + "epoch": 0.08182314307758896, + "grad_norm": 7.1637091636657715, + "learning_rate": 9.931235048984582e-05, + "loss": 2.0183, + "step": 1087 + }, + { + "epoch": 0.08189841735824913, + "grad_norm": 5.0980658531188965, + "learning_rate": 9.931033413733874e-05, + "loss": 2.2851, + "step": 1088 + }, + { + "epoch": 0.08197369163890927, + "grad_norm": 6.3092756271362305, + "learning_rate": 9.930831485347381e-05, + "loss": 2.6583, + "step": 1089 + }, + { + "epoch": 0.08204896591956944, + "grad_norm": 5.924673080444336, + "learning_rate": 9.930629263837103e-05, + "loss": 2.0791, + "step": 1090 + }, + { + "epoch": 0.08212424020022958, + "grad_norm": 4.839828014373779, + "learning_rate": 9.930426749215063e-05, + "loss": 2.121, + "step": 1091 + }, + { + "epoch": 0.08219951448088975, + "grad_norm": 5.012866497039795, + "learning_rate": 9.9302239414933e-05, + "loss": 2.3123, + "step": 1092 + }, + { + "epoch": 0.0822747887615499, + "grad_norm": 4.831496715545654, + "learning_rate": 9.930020840683869e-05, + "loss": 2.2346, + "step": 1093 + }, + { + "epoch": 0.08235006304221006, + "grad_norm": 4.405677795410156, + "learning_rate": 9.929817446798846e-05, + "loss": 2.4625, + "step": 1094 + }, + { + "epoch": 0.0824253373228702, + "grad_norm": 6.026737213134766, + "learning_rate": 9.929613759850321e-05, + "loss": 2.3199, + "step": 1095 + }, + { + "epoch": 0.08250061160353037, + "grad_norm": 5.9357171058654785, + "learning_rate": 9.929409779850401e-05, + "loss": 2.7255, + "step": 1096 + }, + { + "epoch": 0.08257588588419051, + "grad_norm": 7.4474592208862305, + "learning_rate": 9.929205506811215e-05, + "loss": 2.1615, + "step": 1097 + }, + { + "epoch": 0.08265116016485068, + "grad_norm": 6.040008544921875, + "learning_rate": 9.929000940744906e-05, + "loss": 2.0836, + "step": 1098 + }, + { + "epoch": 0.08272643444551082, + "grad_norm": 5.82507848739624, + "learning_rate": 9.928796081663632e-05, + "loss": 1.9006, + "step": 1099 + }, + { + "epoch": 0.08280170872617099, + "grad_norm": 5.737878322601318, + "learning_rate": 9.928590929579575e-05, + "loss": 2.336, + "step": 1100 + }, + { + "epoch": 0.08287698300683113, + "grad_norm": 5.825507640838623, + "learning_rate": 9.928385484504927e-05, + "loss": 2.5359, + "step": 1101 + }, + { + "epoch": 0.0829522572874913, + "grad_norm": 6.403767108917236, + "learning_rate": 9.928179746451905e-05, + "loss": 2.2275, + "step": 1102 + }, + { + "epoch": 0.08302753156815146, + "grad_norm": 7.845687389373779, + "learning_rate": 9.927973715432736e-05, + "loss": 2.7856, + "step": 1103 + }, + { + "epoch": 0.0831028058488116, + "grad_norm": 6.293184280395508, + "learning_rate": 9.92776739145967e-05, + "loss": 2.4, + "step": 1104 + }, + { + "epoch": 0.08317808012947177, + "grad_norm": 7.6609344482421875, + "learning_rate": 9.927560774544972e-05, + "loss": 2.2605, + "step": 1105 + }, + { + "epoch": 0.08325335441013192, + "grad_norm": 4.736534118652344, + "learning_rate": 9.927353864700926e-05, + "loss": 2.4199, + "step": 1106 + }, + { + "epoch": 0.08332862869079208, + "grad_norm": 4.435877323150635, + "learning_rate": 9.92714666193983e-05, + "loss": 2.1503, + "step": 1107 + }, + { + "epoch": 0.08340390297145223, + "grad_norm": 6.6472320556640625, + "learning_rate": 9.926939166274003e-05, + "loss": 2.235, + "step": 1108 + }, + { + "epoch": 0.08347917725211239, + "grad_norm": 5.518762111663818, + "learning_rate": 9.926731377715779e-05, + "loss": 2.0932, + "step": 1109 + }, + { + "epoch": 0.08355445153277254, + "grad_norm": 4.8642449378967285, + "learning_rate": 9.926523296277511e-05, + "loss": 2.0227, + "step": 1110 + }, + { + "epoch": 0.0836297258134327, + "grad_norm": 3.95300555229187, + "learning_rate": 9.926314921971567e-05, + "loss": 2.0899, + "step": 1111 + }, + { + "epoch": 0.08370500009409285, + "grad_norm": 4.438868522644043, + "learning_rate": 9.926106254810339e-05, + "loss": 2.5073, + "step": 1112 + }, + { + "epoch": 0.08378027437475301, + "grad_norm": 4.2434163093566895, + "learning_rate": 9.925897294806228e-05, + "loss": 2.16, + "step": 1113 + }, + { + "epoch": 0.08385554865541316, + "grad_norm": 6.885189533233643, + "learning_rate": 9.925688041971656e-05, + "loss": 2.0897, + "step": 1114 + }, + { + "epoch": 0.08393082293607332, + "grad_norm": 4.5535454750061035, + "learning_rate": 9.925478496319063e-05, + "loss": 2.0444, + "step": 1115 + }, + { + "epoch": 0.08400609721673347, + "grad_norm": 5.977106094360352, + "learning_rate": 9.925268657860906e-05, + "loss": 2.353, + "step": 1116 + }, + { + "epoch": 0.08408137149739363, + "grad_norm": 5.5683369636535645, + "learning_rate": 9.925058526609661e-05, + "loss": 2.3858, + "step": 1117 + }, + { + "epoch": 0.08415664577805379, + "grad_norm": 7.104760646820068, + "learning_rate": 9.924848102577818e-05, + "loss": 2.127, + "step": 1118 + }, + { + "epoch": 0.08423192005871394, + "grad_norm": 5.28377103805542, + "learning_rate": 9.924637385777886e-05, + "loss": 2.0708, + "step": 1119 + }, + { + "epoch": 0.0843071943393741, + "grad_norm": 5.864218711853027, + "learning_rate": 9.924426376222391e-05, + "loss": 2.5374, + "step": 1120 + }, + { + "epoch": 0.08438246862003425, + "grad_norm": 7.219125270843506, + "learning_rate": 9.92421507392388e-05, + "loss": 2.5879, + "step": 1121 + }, + { + "epoch": 0.08445774290069441, + "grad_norm": 6.50026798248291, + "learning_rate": 9.924003478894908e-05, + "loss": 2.2759, + "step": 1122 + }, + { + "epoch": 0.08453301718135456, + "grad_norm": 5.509936809539795, + "learning_rate": 9.923791591148061e-05, + "loss": 2.2141, + "step": 1123 + }, + { + "epoch": 0.08460829146201472, + "grad_norm": 6.062098979949951, + "learning_rate": 9.923579410695932e-05, + "loss": 2.3936, + "step": 1124 + }, + { + "epoch": 0.08468356574267487, + "grad_norm": 6.7604756355285645, + "learning_rate": 9.923366937551132e-05, + "loss": 2.3012, + "step": 1125 + }, + { + "epoch": 0.08475884002333503, + "grad_norm": 6.552176475524902, + "learning_rate": 9.923154171726295e-05, + "loss": 2.4841, + "step": 1126 + }, + { + "epoch": 0.08483411430399518, + "grad_norm": 5.604454517364502, + "learning_rate": 9.92294111323407e-05, + "loss": 2.0627, + "step": 1127 + }, + { + "epoch": 0.08490938858465534, + "grad_norm": 5.521183967590332, + "learning_rate": 9.92272776208712e-05, + "loss": 2.4608, + "step": 1128 + }, + { + "epoch": 0.08498466286531549, + "grad_norm": 5.1209611892700195, + "learning_rate": 9.92251411829813e-05, + "loss": 2.2051, + "step": 1129 + }, + { + "epoch": 0.08505993714597565, + "grad_norm": 5.43388557434082, + "learning_rate": 9.9223001818798e-05, + "loss": 2.2708, + "step": 1130 + }, + { + "epoch": 0.0851352114266358, + "grad_norm": 5.455743312835693, + "learning_rate": 9.922085952844846e-05, + "loss": 2.1273, + "step": 1131 + }, + { + "epoch": 0.08521048570729596, + "grad_norm": 4.575812816619873, + "learning_rate": 9.921871431206007e-05, + "loss": 2.5016, + "step": 1132 + }, + { + "epoch": 0.08528575998795611, + "grad_norm": 7.7491230964660645, + "learning_rate": 9.921656616976033e-05, + "loss": 1.8787, + "step": 1133 + }, + { + "epoch": 0.08536103426861627, + "grad_norm": 5.662252426147461, + "learning_rate": 9.921441510167697e-05, + "loss": 2.6668, + "step": 1134 + }, + { + "epoch": 0.08543630854927643, + "grad_norm": 4.987123966217041, + "learning_rate": 9.921226110793783e-05, + "loss": 2.8943, + "step": 1135 + }, + { + "epoch": 0.08551158282993658, + "grad_norm": 6.202556610107422, + "learning_rate": 9.921010418867098e-05, + "loss": 2.2573, + "step": 1136 + }, + { + "epoch": 0.08558685711059674, + "grad_norm": 4.842884540557861, + "learning_rate": 9.920794434400462e-05, + "loss": 2.0757, + "step": 1137 + }, + { + "epoch": 0.08566213139125689, + "grad_norm": 5.519599437713623, + "learning_rate": 9.920578157406717e-05, + "loss": 2.3174, + "step": 1138 + }, + { + "epoch": 0.08573740567191705, + "grad_norm": 7.333906650543213, + "learning_rate": 9.920361587898719e-05, + "loss": 2.1983, + "step": 1139 + }, + { + "epoch": 0.0858126799525772, + "grad_norm": 6.405233860015869, + "learning_rate": 9.920144725889343e-05, + "loss": 2.1087, + "step": 1140 + }, + { + "epoch": 0.08588795423323736, + "grad_norm": 6.9802656173706055, + "learning_rate": 9.91992757139148e-05, + "loss": 2.2193, + "step": 1141 + }, + { + "epoch": 0.08596322851389751, + "grad_norm": 7.72580099105835, + "learning_rate": 9.919710124418042e-05, + "loss": 2.4775, + "step": 1142 + }, + { + "epoch": 0.08603850279455767, + "grad_norm": 5.433417320251465, + "learning_rate": 9.919492384981951e-05, + "loss": 2.1343, + "step": 1143 + }, + { + "epoch": 0.08611377707521782, + "grad_norm": 6.366405487060547, + "learning_rate": 9.919274353096154e-05, + "loss": 2.1731, + "step": 1144 + }, + { + "epoch": 0.08618905135587798, + "grad_norm": 5.061887741088867, + "learning_rate": 9.91905602877361e-05, + "loss": 2.3821, + "step": 1145 + }, + { + "epoch": 0.08626432563653813, + "grad_norm": 5.18080997467041, + "learning_rate": 9.918837412027302e-05, + "loss": 2.221, + "step": 1146 + }, + { + "epoch": 0.08633959991719829, + "grad_norm": 5.952714920043945, + "learning_rate": 9.918618502870222e-05, + "loss": 2.3506, + "step": 1147 + }, + { + "epoch": 0.08641487419785844, + "grad_norm": 7.845677852630615, + "learning_rate": 9.918399301315386e-05, + "loss": 2.0601, + "step": 1148 + }, + { + "epoch": 0.0864901484785186, + "grad_norm": 6.018703460693359, + "learning_rate": 9.918179807375824e-05, + "loss": 2.2542, + "step": 1149 + }, + { + "epoch": 0.08656542275917876, + "grad_norm": 4.599795341491699, + "learning_rate": 9.917960021064585e-05, + "loss": 1.9858, + "step": 1150 + }, + { + "epoch": 0.08664069703983891, + "grad_norm": 8.23704719543457, + "learning_rate": 9.917739942394733e-05, + "loss": 2.432, + "step": 1151 + }, + { + "epoch": 0.08671597132049907, + "grad_norm": 6.213704586029053, + "learning_rate": 9.917519571379352e-05, + "loss": 2.4168, + "step": 1152 + }, + { + "epoch": 0.08679124560115922, + "grad_norm": 5.583656311035156, + "learning_rate": 9.917298908031543e-05, + "loss": 2.0159, + "step": 1153 + }, + { + "epoch": 0.08686651988181938, + "grad_norm": 5.6313276290893555, + "learning_rate": 9.917077952364422e-05, + "loss": 2.0108, + "step": 1154 + }, + { + "epoch": 0.08694179416247953, + "grad_norm": 6.999698162078857, + "learning_rate": 9.916856704391127e-05, + "loss": 2.5267, + "step": 1155 + }, + { + "epoch": 0.0870170684431397, + "grad_norm": 5.772253036499023, + "learning_rate": 9.916635164124807e-05, + "loss": 2.2743, + "step": 1156 + }, + { + "epoch": 0.08709234272379984, + "grad_norm": 4.270077705383301, + "learning_rate": 9.916413331578635e-05, + "loss": 2.1398, + "step": 1157 + }, + { + "epoch": 0.08716761700446, + "grad_norm": 5.124973773956299, + "learning_rate": 9.916191206765797e-05, + "loss": 2.3097, + "step": 1158 + }, + { + "epoch": 0.08724289128512015, + "grad_norm": 5.555698394775391, + "learning_rate": 9.915968789699499e-05, + "loss": 2.2625, + "step": 1159 + }, + { + "epoch": 0.08731816556578031, + "grad_norm": 4.2281270027160645, + "learning_rate": 9.91574608039296e-05, + "loss": 2.3018, + "step": 1160 + }, + { + "epoch": 0.08739343984644046, + "grad_norm": 5.254685401916504, + "learning_rate": 9.915523078859423e-05, + "loss": 2.4533, + "step": 1161 + }, + { + "epoch": 0.08746871412710062, + "grad_norm": 6.163359642028809, + "learning_rate": 9.915299785112142e-05, + "loss": 2.1292, + "step": 1162 + }, + { + "epoch": 0.08754398840776077, + "grad_norm": 5.744691371917725, + "learning_rate": 9.915076199164392e-05, + "loss": 2.2701, + "step": 1163 + }, + { + "epoch": 0.08761926268842093, + "grad_norm": 5.872090816497803, + "learning_rate": 9.914852321029466e-05, + "loss": 2.6751, + "step": 1164 + }, + { + "epoch": 0.08769453696908108, + "grad_norm": 5.114642143249512, + "learning_rate": 9.914628150720674e-05, + "loss": 2.2849, + "step": 1165 + }, + { + "epoch": 0.08776981124974124, + "grad_norm": 4.612166404724121, + "learning_rate": 9.914403688251337e-05, + "loss": 2.5745, + "step": 1166 + }, + { + "epoch": 0.0878450855304014, + "grad_norm": 8.587389945983887, + "learning_rate": 9.914178933634801e-05, + "loss": 2.4016, + "step": 1167 + }, + { + "epoch": 0.08792035981106155, + "grad_norm": 5.873881816864014, + "learning_rate": 9.913953886884431e-05, + "loss": 2.2368, + "step": 1168 + }, + { + "epoch": 0.08799563409172172, + "grad_norm": 6.308575630187988, + "learning_rate": 9.913728548013599e-05, + "loss": 2.4197, + "step": 1169 + }, + { + "epoch": 0.08807090837238186, + "grad_norm": 4.444754123687744, + "learning_rate": 9.913502917035705e-05, + "loss": 2.1971, + "step": 1170 + }, + { + "epoch": 0.08814618265304203, + "grad_norm": 4.840965270996094, + "learning_rate": 9.913276993964162e-05, + "loss": 2.3044, + "step": 1171 + }, + { + "epoch": 0.08822145693370217, + "grad_norm": 5.132396697998047, + "learning_rate": 9.913050778812399e-05, + "loss": 2.1907, + "step": 1172 + }, + { + "epoch": 0.08829673121436234, + "grad_norm": 4.840834617614746, + "learning_rate": 9.912824271593863e-05, + "loss": 2.0675, + "step": 1173 + }, + { + "epoch": 0.08837200549502248, + "grad_norm": 6.998414993286133, + "learning_rate": 9.912597472322021e-05, + "loss": 2.4654, + "step": 1174 + }, + { + "epoch": 0.08844727977568265, + "grad_norm": 6.046200275421143, + "learning_rate": 9.912370381010356e-05, + "loss": 2.3616, + "step": 1175 + }, + { + "epoch": 0.0885225540563428, + "grad_norm": 4.645432472229004, + "learning_rate": 9.912142997672366e-05, + "loss": 2.3322, + "step": 1176 + }, + { + "epoch": 0.08859782833700296, + "grad_norm": 7.393093585968018, + "learning_rate": 9.911915322321572e-05, + "loss": 2.041, + "step": 1177 + }, + { + "epoch": 0.0886731026176631, + "grad_norm": 7.361612319946289, + "learning_rate": 9.911687354971504e-05, + "loss": 2.3988, + "step": 1178 + }, + { + "epoch": 0.08874837689832327, + "grad_norm": 5.41740083694458, + "learning_rate": 9.911459095635716e-05, + "loss": 2.1069, + "step": 1179 + }, + { + "epoch": 0.08882365117898341, + "grad_norm": 5.164453506469727, + "learning_rate": 9.911230544327776e-05, + "loss": 2.1698, + "step": 1180 + }, + { + "epoch": 0.08889892545964358, + "grad_norm": 5.7849955558776855, + "learning_rate": 9.911001701061273e-05, + "loss": 2.002, + "step": 1181 + }, + { + "epoch": 0.08897419974030372, + "grad_norm": 4.980674743652344, + "learning_rate": 9.91077256584981e-05, + "loss": 2.3879, + "step": 1182 + }, + { + "epoch": 0.08904947402096389, + "grad_norm": 8.146739959716797, + "learning_rate": 9.910543138707009e-05, + "loss": 2.3295, + "step": 1183 + }, + { + "epoch": 0.08912474830162405, + "grad_norm": 5.663999557495117, + "learning_rate": 9.910313419646509e-05, + "loss": 2.2329, + "step": 1184 + }, + { + "epoch": 0.0892000225822842, + "grad_norm": 4.857204437255859, + "learning_rate": 9.910083408681966e-05, + "loss": 2.3024, + "step": 1185 + }, + { + "epoch": 0.08927529686294436, + "grad_norm": 5.403531074523926, + "learning_rate": 9.909853105827051e-05, + "loss": 2.2123, + "step": 1186 + }, + { + "epoch": 0.0893505711436045, + "grad_norm": 4.682754039764404, + "learning_rate": 9.909622511095458e-05, + "loss": 2.0178, + "step": 1187 + }, + { + "epoch": 0.08942584542426467, + "grad_norm": 7.898589611053467, + "learning_rate": 9.909391624500892e-05, + "loss": 2.0381, + "step": 1188 + }, + { + "epoch": 0.08950111970492482, + "grad_norm": 4.3345746994018555, + "learning_rate": 9.909160446057082e-05, + "loss": 1.9817, + "step": 1189 + }, + { + "epoch": 0.08957639398558498, + "grad_norm": 5.2663774490356445, + "learning_rate": 9.90892897577777e-05, + "loss": 2.2422, + "step": 1190 + }, + { + "epoch": 0.08965166826624513, + "grad_norm": 4.381959438323975, + "learning_rate": 9.908697213676716e-05, + "loss": 2.5133, + "step": 1191 + }, + { + "epoch": 0.08972694254690529, + "grad_norm": 5.167469024658203, + "learning_rate": 9.908465159767695e-05, + "loss": 2.0555, + "step": 1192 + }, + { + "epoch": 0.08980221682756544, + "grad_norm": 5.803744316101074, + "learning_rate": 9.908232814064507e-05, + "loss": 2.2211, + "step": 1193 + }, + { + "epoch": 0.0898774911082256, + "grad_norm": 5.2488813400268555, + "learning_rate": 9.908000176580959e-05, + "loss": 2.2053, + "step": 1194 + }, + { + "epoch": 0.08995276538888575, + "grad_norm": 4.878261566162109, + "learning_rate": 9.907767247330885e-05, + "loss": 2.1682, + "step": 1195 + }, + { + "epoch": 0.09002803966954591, + "grad_norm": 5.3838725090026855, + "learning_rate": 9.907534026328129e-05, + "loss": 2.2657, + "step": 1196 + }, + { + "epoch": 0.09010331395020606, + "grad_norm": 7.655898571014404, + "learning_rate": 9.907300513586557e-05, + "loss": 2.3684, + "step": 1197 + }, + { + "epoch": 0.09017858823086622, + "grad_norm": 7.286603927612305, + "learning_rate": 9.90706670912005e-05, + "loss": 2.2987, + "step": 1198 + }, + { + "epoch": 0.09025386251152638, + "grad_norm": 5.679907321929932, + "learning_rate": 9.906832612942507e-05, + "loss": 2.5172, + "step": 1199 + }, + { + "epoch": 0.09032913679218653, + "grad_norm": 7.662585735321045, + "learning_rate": 9.906598225067845e-05, + "loss": 2.8452, + "step": 1200 + }, + { + "epoch": 0.09040441107284669, + "grad_norm": 4.499806880950928, + "learning_rate": 9.906363545509995e-05, + "loss": 1.9919, + "step": 1201 + }, + { + "epoch": 0.09047968535350684, + "grad_norm": 5.981786251068115, + "learning_rate": 9.906128574282913e-05, + "loss": 2.2924, + "step": 1202 + }, + { + "epoch": 0.090554959634167, + "grad_norm": 4.9308977127075195, + "learning_rate": 9.905893311400563e-05, + "loss": 2.3659, + "step": 1203 + }, + { + "epoch": 0.09063023391482715, + "grad_norm": 7.844737529754639, + "learning_rate": 9.905657756876932e-05, + "loss": 2.3044, + "step": 1204 + }, + { + "epoch": 0.09070550819548731, + "grad_norm": 4.43223762512207, + "learning_rate": 9.905421910726025e-05, + "loss": 2.5299, + "step": 1205 + }, + { + "epoch": 0.09078078247614746, + "grad_norm": 6.778882026672363, + "learning_rate": 9.905185772961858e-05, + "loss": 2.1524, + "step": 1206 + }, + { + "epoch": 0.09085605675680762, + "grad_norm": 4.246809482574463, + "learning_rate": 9.904949343598473e-05, + "loss": 2.2371, + "step": 1207 + }, + { + "epoch": 0.09093133103746777, + "grad_norm": 6.284351348876953, + "learning_rate": 9.904712622649923e-05, + "loss": 2.246, + "step": 1208 + }, + { + "epoch": 0.09100660531812793, + "grad_norm": 6.6328020095825195, + "learning_rate": 9.904475610130282e-05, + "loss": 2.2712, + "step": 1209 + }, + { + "epoch": 0.09108187959878808, + "grad_norm": 6.602048397064209, + "learning_rate": 9.904238306053637e-05, + "loss": 2.1394, + "step": 1210 + }, + { + "epoch": 0.09115715387944824, + "grad_norm": 5.839536190032959, + "learning_rate": 9.904000710434097e-05, + "loss": 2.1277, + "step": 1211 + }, + { + "epoch": 0.09123242816010839, + "grad_norm": 4.945462226867676, + "learning_rate": 9.903762823285786e-05, + "loss": 2.1361, + "step": 1212 + }, + { + "epoch": 0.09130770244076855, + "grad_norm": 4.412333011627197, + "learning_rate": 9.903524644622846e-05, + "loss": 2.2587, + "step": 1213 + }, + { + "epoch": 0.0913829767214287, + "grad_norm": 6.278764724731445, + "learning_rate": 9.903286174459434e-05, + "loss": 2.3017, + "step": 1214 + }, + { + "epoch": 0.09145825100208886, + "grad_norm": 4.224057197570801, + "learning_rate": 9.903047412809729e-05, + "loss": 2.0379, + "step": 1215 + }, + { + "epoch": 0.09153352528274902, + "grad_norm": 4.2989091873168945, + "learning_rate": 9.902808359687926e-05, + "loss": 2.0845, + "step": 1216 + }, + { + "epoch": 0.09160879956340917, + "grad_norm": 4.08694314956665, + "learning_rate": 9.90256901510823e-05, + "loss": 2.3598, + "step": 1217 + }, + { + "epoch": 0.09168407384406933, + "grad_norm": 5.269775390625, + "learning_rate": 9.902329379084876e-05, + "loss": 2.3412, + "step": 1218 + }, + { + "epoch": 0.09175934812472948, + "grad_norm": 4.4063029289245605, + "learning_rate": 9.902089451632105e-05, + "loss": 2.3307, + "step": 1219 + }, + { + "epoch": 0.09183462240538964, + "grad_norm": 5.829647541046143, + "learning_rate": 9.901849232764182e-05, + "loss": 1.9702, + "step": 1220 + }, + { + "epoch": 0.09190989668604979, + "grad_norm": 6.281095027923584, + "learning_rate": 9.901608722495388e-05, + "loss": 2.2451, + "step": 1221 + }, + { + "epoch": 0.09198517096670995, + "grad_norm": 4.600857734680176, + "learning_rate": 9.901367920840018e-05, + "loss": 2.4667, + "step": 1222 + }, + { + "epoch": 0.0920604452473701, + "grad_norm": 4.4568586349487305, + "learning_rate": 9.901126827812391e-05, + "loss": 2.0653, + "step": 1223 + }, + { + "epoch": 0.09213571952803026, + "grad_norm": 5.564261436462402, + "learning_rate": 9.900885443426834e-05, + "loss": 2.5238, + "step": 1224 + }, + { + "epoch": 0.09221099380869041, + "grad_norm": 5.048755645751953, + "learning_rate": 9.900643767697704e-05, + "loss": 2.2748, + "step": 1225 + }, + { + "epoch": 0.09228626808935057, + "grad_norm": 5.369926452636719, + "learning_rate": 9.90040180063936e-05, + "loss": 2.7051, + "step": 1226 + }, + { + "epoch": 0.09236154237001072, + "grad_norm": 7.708915710449219, + "learning_rate": 9.900159542266192e-05, + "loss": 2.458, + "step": 1227 + }, + { + "epoch": 0.09243681665067088, + "grad_norm": 4.9571919441223145, + "learning_rate": 9.899916992592597e-05, + "loss": 2.5832, + "step": 1228 + }, + { + "epoch": 0.09251209093133103, + "grad_norm": 5.185609817504883, + "learning_rate": 9.899674151632999e-05, + "loss": 2.0539, + "step": 1229 + }, + { + "epoch": 0.0925873652119912, + "grad_norm": 4.914243221282959, + "learning_rate": 9.899431019401828e-05, + "loss": 2.371, + "step": 1230 + }, + { + "epoch": 0.09266263949265136, + "grad_norm": 6.205069541931152, + "learning_rate": 9.899187595913544e-05, + "loss": 2.1225, + "step": 1231 + }, + { + "epoch": 0.0927379137733115, + "grad_norm": 4.880532264709473, + "learning_rate": 9.898943881182613e-05, + "loss": 1.9952, + "step": 1232 + }, + { + "epoch": 0.09281318805397167, + "grad_norm": 4.993622303009033, + "learning_rate": 9.898699875223524e-05, + "loss": 2.5224, + "step": 1233 + }, + { + "epoch": 0.09288846233463181, + "grad_norm": 5.312560558319092, + "learning_rate": 9.898455578050787e-05, + "loss": 2.2535, + "step": 1234 + }, + { + "epoch": 0.09296373661529198, + "grad_norm": 5.254156589508057, + "learning_rate": 9.898210989678917e-05, + "loss": 2.437, + "step": 1235 + }, + { + "epoch": 0.09303901089595212, + "grad_norm": 6.612512111663818, + "learning_rate": 9.89796611012246e-05, + "loss": 1.9526, + "step": 1236 + }, + { + "epoch": 0.09311428517661229, + "grad_norm": 4.762012004852295, + "learning_rate": 9.897720939395973e-05, + "loss": 2.0193, + "step": 1237 + }, + { + "epoch": 0.09318955945727243, + "grad_norm": 8.582109451293945, + "learning_rate": 9.897475477514027e-05, + "loss": 2.2787, + "step": 1238 + }, + { + "epoch": 0.0932648337379326, + "grad_norm": 4.477728366851807, + "learning_rate": 9.897229724491218e-05, + "loss": 2.2228, + "step": 1239 + }, + { + "epoch": 0.09334010801859274, + "grad_norm": 5.659603118896484, + "learning_rate": 9.896983680342154e-05, + "loss": 2.1808, + "step": 1240 + }, + { + "epoch": 0.0934153822992529, + "grad_norm": 5.629698753356934, + "learning_rate": 9.896737345081459e-05, + "loss": 2.1982, + "step": 1241 + }, + { + "epoch": 0.09349065657991305, + "grad_norm": 4.196998596191406, + "learning_rate": 9.896490718723782e-05, + "loss": 2.1435, + "step": 1242 + }, + { + "epoch": 0.09356593086057322, + "grad_norm": 4.0380167961120605, + "learning_rate": 9.896243801283777e-05, + "loss": 2.0633, + "step": 1243 + }, + { + "epoch": 0.09364120514123336, + "grad_norm": 5.888142108917236, + "learning_rate": 9.89599659277613e-05, + "loss": 3.1702, + "step": 1244 + }, + { + "epoch": 0.09371647942189353, + "grad_norm": 5.672674655914307, + "learning_rate": 9.895749093215534e-05, + "loss": 1.9467, + "step": 1245 + }, + { + "epoch": 0.09379175370255367, + "grad_norm": 5.711461544036865, + "learning_rate": 9.895501302616701e-05, + "loss": 2.7244, + "step": 1246 + }, + { + "epoch": 0.09386702798321384, + "grad_norm": 5.369699954986572, + "learning_rate": 9.895253220994362e-05, + "loss": 1.8389, + "step": 1247 + }, + { + "epoch": 0.093942302263874, + "grad_norm": 5.1924920082092285, + "learning_rate": 9.895004848363267e-05, + "loss": 2.2727, + "step": 1248 + }, + { + "epoch": 0.09401757654453415, + "grad_norm": 5.881622791290283, + "learning_rate": 9.894756184738177e-05, + "loss": 2.2776, + "step": 1249 + }, + { + "epoch": 0.09409285082519431, + "grad_norm": 5.337461471557617, + "learning_rate": 9.894507230133878e-05, + "loss": 2.1022, + "step": 1250 + }, + { + "epoch": 0.09416812510585446, + "grad_norm": 4.679605960845947, + "learning_rate": 9.894257984565167e-05, + "loss": 2.4345, + "step": 1251 + }, + { + "epoch": 0.09424339938651462, + "grad_norm": 4.815038204193115, + "learning_rate": 9.894008448046863e-05, + "loss": 2.2069, + "step": 1252 + }, + { + "epoch": 0.09431867366717477, + "grad_norm": 8.264766693115234, + "learning_rate": 9.893758620593798e-05, + "loss": 2.1542, + "step": 1253 + }, + { + "epoch": 0.09439394794783493, + "grad_norm": 5.655462265014648, + "learning_rate": 9.893508502220825e-05, + "loss": 2.3207, + "step": 1254 + }, + { + "epoch": 0.09446922222849508, + "grad_norm": 4.800930023193359, + "learning_rate": 9.893258092942813e-05, + "loss": 1.9531, + "step": 1255 + }, + { + "epoch": 0.09454449650915524, + "grad_norm": 6.131793975830078, + "learning_rate": 9.893007392774647e-05, + "loss": 2.3465, + "step": 1256 + }, + { + "epoch": 0.09461977078981539, + "grad_norm": 6.297909736633301, + "learning_rate": 9.892756401731233e-05, + "loss": 1.9844, + "step": 1257 + }, + { + "epoch": 0.09469504507047555, + "grad_norm": 4.924347400665283, + "learning_rate": 9.892505119827487e-05, + "loss": 2.3395, + "step": 1258 + }, + { + "epoch": 0.0947703193511357, + "grad_norm": 6.420069694519043, + "learning_rate": 9.892253547078352e-05, + "loss": 2.3433, + "step": 1259 + }, + { + "epoch": 0.09484559363179586, + "grad_norm": 4.504122734069824, + "learning_rate": 9.892001683498781e-05, + "loss": 2.5711, + "step": 1260 + }, + { + "epoch": 0.094920867912456, + "grad_norm": 5.376524925231934, + "learning_rate": 9.891749529103746e-05, + "loss": 2.4909, + "step": 1261 + }, + { + "epoch": 0.09499614219311617, + "grad_norm": 4.911278247833252, + "learning_rate": 9.891497083908237e-05, + "loss": 2.3502, + "step": 1262 + }, + { + "epoch": 0.09507141647377632, + "grad_norm": 7.62767219543457, + "learning_rate": 9.891244347927262e-05, + "loss": 2.4731, + "step": 1263 + }, + { + "epoch": 0.09514669075443648, + "grad_norm": 4.864021301269531, + "learning_rate": 9.890991321175845e-05, + "loss": 2.0387, + "step": 1264 + }, + { + "epoch": 0.09522196503509664, + "grad_norm": 4.829661846160889, + "learning_rate": 9.890738003669029e-05, + "loss": 2.1206, + "step": 1265 + }, + { + "epoch": 0.09529723931575679, + "grad_norm": 7.437132835388184, + "learning_rate": 9.890484395421871e-05, + "loss": 2.2334, + "step": 1266 + }, + { + "epoch": 0.09537251359641695, + "grad_norm": 5.918606758117676, + "learning_rate": 9.890230496449448e-05, + "loss": 2.9684, + "step": 1267 + }, + { + "epoch": 0.0954477878770771, + "grad_norm": 4.703569412231445, + "learning_rate": 9.889976306766853e-05, + "loss": 1.8511, + "step": 1268 + }, + { + "epoch": 0.09552306215773726, + "grad_norm": 4.625489711761475, + "learning_rate": 9.889721826389197e-05, + "loss": 2.2033, + "step": 1269 + }, + { + "epoch": 0.09559833643839741, + "grad_norm": 5.362358570098877, + "learning_rate": 9.88946705533161e-05, + "loss": 2.3028, + "step": 1270 + }, + { + "epoch": 0.09567361071905757, + "grad_norm": 6.535942077636719, + "learning_rate": 9.889211993609235e-05, + "loss": 2.0545, + "step": 1271 + }, + { + "epoch": 0.09574888499971772, + "grad_norm": 5.676138877868652, + "learning_rate": 9.888956641237238e-05, + "loss": 2.2467, + "step": 1272 + }, + { + "epoch": 0.09582415928037788, + "grad_norm": 6.557033061981201, + "learning_rate": 9.888700998230794e-05, + "loss": 1.9677, + "step": 1273 + }, + { + "epoch": 0.09589943356103803, + "grad_norm": 4.518818378448486, + "learning_rate": 9.888445064605106e-05, + "loss": 2.2068, + "step": 1274 + }, + { + "epoch": 0.09597470784169819, + "grad_norm": 5.886898994445801, + "learning_rate": 9.888188840375382e-05, + "loss": 2.05, + "step": 1275 + }, + { + "epoch": 0.09604998212235834, + "grad_norm": 5.915098190307617, + "learning_rate": 9.88793232555686e-05, + "loss": 2.6569, + "step": 1276 + }, + { + "epoch": 0.0961252564030185, + "grad_norm": 6.290450572967529, + "learning_rate": 9.887675520164783e-05, + "loss": 2.191, + "step": 1277 + }, + { + "epoch": 0.09620053068367865, + "grad_norm": 5.214074611663818, + "learning_rate": 9.887418424214423e-05, + "loss": 2.3441, + "step": 1278 + }, + { + "epoch": 0.09627580496433881, + "grad_norm": 4.566655158996582, + "learning_rate": 9.88716103772106e-05, + "loss": 2.1281, + "step": 1279 + }, + { + "epoch": 0.09635107924499897, + "grad_norm": 4.753139972686768, + "learning_rate": 9.886903360699997e-05, + "loss": 2.4563, + "step": 1280 + }, + { + "epoch": 0.09642635352565912, + "grad_norm": 6.502361297607422, + "learning_rate": 9.886645393166552e-05, + "loss": 2.4556, + "step": 1281 + }, + { + "epoch": 0.09650162780631928, + "grad_norm": 5.372557640075684, + "learning_rate": 9.886387135136058e-05, + "loss": 2.1031, + "step": 1282 + }, + { + "epoch": 0.09657690208697943, + "grad_norm": 5.277848720550537, + "learning_rate": 9.886128586623871e-05, + "loss": 2.5858, + "step": 1283 + }, + { + "epoch": 0.09665217636763959, + "grad_norm": 5.626523494720459, + "learning_rate": 9.88586974764536e-05, + "loss": 2.5502, + "step": 1284 + }, + { + "epoch": 0.09672745064829974, + "grad_norm": 4.654241561889648, + "learning_rate": 9.88561061821591e-05, + "loss": 2.5378, + "step": 1285 + }, + { + "epoch": 0.0968027249289599, + "grad_norm": 3.7818820476531982, + "learning_rate": 9.885351198350927e-05, + "loss": 2.1636, + "step": 1286 + }, + { + "epoch": 0.09687799920962005, + "grad_norm": 4.863179683685303, + "learning_rate": 9.885091488065832e-05, + "loss": 2.4674, + "step": 1287 + }, + { + "epoch": 0.09695327349028021, + "grad_norm": 5.401482582092285, + "learning_rate": 9.884831487376068e-05, + "loss": 2.33, + "step": 1288 + }, + { + "epoch": 0.09702854777094036, + "grad_norm": 5.601634979248047, + "learning_rate": 9.884571196297086e-05, + "loss": 2.341, + "step": 1289 + }, + { + "epoch": 0.09710382205160052, + "grad_norm": 6.169212341308594, + "learning_rate": 9.884310614844363e-05, + "loss": 2.5662, + "step": 1290 + }, + { + "epoch": 0.09717909633226067, + "grad_norm": 6.100691318511963, + "learning_rate": 9.884049743033388e-05, + "loss": 2.3231, + "step": 1291 + }, + { + "epoch": 0.09725437061292083, + "grad_norm": 5.652121543884277, + "learning_rate": 9.883788580879669e-05, + "loss": 1.8696, + "step": 1292 + }, + { + "epoch": 0.09732964489358098, + "grad_norm": 5.417090892791748, + "learning_rate": 9.883527128398732e-05, + "loss": 2.3028, + "step": 1293 + }, + { + "epoch": 0.09740491917424114, + "grad_norm": 4.444824695587158, + "learning_rate": 9.883265385606122e-05, + "loss": 2.1503, + "step": 1294 + }, + { + "epoch": 0.09748019345490129, + "grad_norm": 6.258047580718994, + "learning_rate": 9.883003352517394e-05, + "loss": 2.302, + "step": 1295 + }, + { + "epoch": 0.09755546773556145, + "grad_norm": 6.353456020355225, + "learning_rate": 9.882741029148129e-05, + "loss": 2.4965, + "step": 1296 + }, + { + "epoch": 0.09763074201622161, + "grad_norm": 6.08019495010376, + "learning_rate": 9.882478415513918e-05, + "loss": 2.1724, + "step": 1297 + }, + { + "epoch": 0.09770601629688176, + "grad_norm": 5.589347839355469, + "learning_rate": 9.882215511630378e-05, + "loss": 2.1164, + "step": 1298 + }, + { + "epoch": 0.09778129057754192, + "grad_norm": 5.3560357093811035, + "learning_rate": 9.881952317513131e-05, + "loss": 2.1525, + "step": 1299 + }, + { + "epoch": 0.09785656485820207, + "grad_norm": 5.481121063232422, + "learning_rate": 9.881688833177829e-05, + "loss": 2.1835, + "step": 1300 + }, + { + "epoch": 0.09793183913886223, + "grad_norm": 4.726559638977051, + "learning_rate": 9.881425058640131e-05, + "loss": 2.0267, + "step": 1301 + }, + { + "epoch": 0.09800711341952238, + "grad_norm": 5.2286834716796875, + "learning_rate": 9.881160993915722e-05, + "loss": 2.1971, + "step": 1302 + }, + { + "epoch": 0.09808238770018254, + "grad_norm": 5.180140018463135, + "learning_rate": 9.880896639020296e-05, + "loss": 2.2377, + "step": 1303 + }, + { + "epoch": 0.09815766198084269, + "grad_norm": 4.529655456542969, + "learning_rate": 9.88063199396957e-05, + "loss": 2.1725, + "step": 1304 + }, + { + "epoch": 0.09823293626150285, + "grad_norm": 5.000154972076416, + "learning_rate": 9.880367058779276e-05, + "loss": 2.2472, + "step": 1305 + }, + { + "epoch": 0.098308210542163, + "grad_norm": 5.002303600311279, + "learning_rate": 9.880101833465163e-05, + "loss": 2.5147, + "step": 1306 + }, + { + "epoch": 0.09838348482282316, + "grad_norm": 8.194414138793945, + "learning_rate": 9.879836318042998e-05, + "loss": 2.345, + "step": 1307 + }, + { + "epoch": 0.09845875910348331, + "grad_norm": 6.684850215911865, + "learning_rate": 9.879570512528568e-05, + "loss": 2.2136, + "step": 1308 + }, + { + "epoch": 0.09853403338414347, + "grad_norm": 5.606630802154541, + "learning_rate": 9.879304416937671e-05, + "loss": 2.188, + "step": 1309 + }, + { + "epoch": 0.09860930766480362, + "grad_norm": 4.944584369659424, + "learning_rate": 9.879038031286128e-05, + "loss": 2.0588, + "step": 1310 + }, + { + "epoch": 0.09868458194546378, + "grad_norm": 7.088937282562256, + "learning_rate": 9.87877135558977e-05, + "loss": 2.0422, + "step": 1311 + }, + { + "epoch": 0.09875985622612395, + "grad_norm": 4.895090579986572, + "learning_rate": 9.878504389864458e-05, + "loss": 2.1093, + "step": 1312 + }, + { + "epoch": 0.0988351305067841, + "grad_norm": 4.786828517913818, + "learning_rate": 9.878237134126056e-05, + "loss": 2.1436, + "step": 1313 + }, + { + "epoch": 0.09891040478744426, + "grad_norm": 6.79971170425415, + "learning_rate": 9.877969588390453e-05, + "loss": 2.3396, + "step": 1314 + }, + { + "epoch": 0.0989856790681044, + "grad_norm": 7.508633613586426, + "learning_rate": 9.877701752673556e-05, + "loss": 1.9994, + "step": 1315 + }, + { + "epoch": 0.09906095334876457, + "grad_norm": 7.53606653213501, + "learning_rate": 9.877433626991283e-05, + "loss": 2.4583, + "step": 1316 + }, + { + "epoch": 0.09913622762942471, + "grad_norm": 5.509038925170898, + "learning_rate": 9.877165211359578e-05, + "loss": 2.2688, + "step": 1317 + }, + { + "epoch": 0.09921150191008488, + "grad_norm": 4.700623989105225, + "learning_rate": 9.876896505794395e-05, + "loss": 2.2001, + "step": 1318 + }, + { + "epoch": 0.09928677619074502, + "grad_norm": 5.297270774841309, + "learning_rate": 9.87662751031171e-05, + "loss": 2.2062, + "step": 1319 + }, + { + "epoch": 0.09936205047140519, + "grad_norm": 8.08660888671875, + "learning_rate": 9.876358224927508e-05, + "loss": 2.2233, + "step": 1320 + }, + { + "epoch": 0.09943732475206533, + "grad_norm": 5.491371154785156, + "learning_rate": 9.876088649657805e-05, + "loss": 2.0448, + "step": 1321 + }, + { + "epoch": 0.0995125990327255, + "grad_norm": 5.495194911956787, + "learning_rate": 9.87581878451862e-05, + "loss": 2.028, + "step": 1322 + }, + { + "epoch": 0.09958787331338564, + "grad_norm": 5.543384075164795, + "learning_rate": 9.875548629525999e-05, + "loss": 2.176, + "step": 1323 + }, + { + "epoch": 0.0996631475940458, + "grad_norm": 4.787388801574707, + "learning_rate": 9.875278184696002e-05, + "loss": 1.8804, + "step": 1324 + }, + { + "epoch": 0.09973842187470595, + "grad_norm": 4.401033401489258, + "learning_rate": 9.875007450044706e-05, + "loss": 2.2221, + "step": 1325 + }, + { + "epoch": 0.09981369615536612, + "grad_norm": 6.260168552398682, + "learning_rate": 9.874736425588206e-05, + "loss": 2.2908, + "step": 1326 + }, + { + "epoch": 0.09988897043602626, + "grad_norm": 4.534674167633057, + "learning_rate": 9.874465111342612e-05, + "loss": 2.2074, + "step": 1327 + }, + { + "epoch": 0.09996424471668643, + "grad_norm": 6.710474014282227, + "learning_rate": 9.874193507324053e-05, + "loss": 2.2408, + "step": 1328 + }, + { + "epoch": 0.10003951899734659, + "grad_norm": 5.775381565093994, + "learning_rate": 9.873921613548675e-05, + "loss": 2.2554, + "step": 1329 + }, + { + "epoch": 0.10011479327800674, + "grad_norm": 5.60185432434082, + "learning_rate": 9.873649430032643e-05, + "loss": 2.2604, + "step": 1330 + }, + { + "epoch": 0.1001900675586669, + "grad_norm": 4.707239151000977, + "learning_rate": 9.873376956792137e-05, + "loss": 2.4742, + "step": 1331 + }, + { + "epoch": 0.10026534183932705, + "grad_norm": 6.17344856262207, + "learning_rate": 9.873104193843352e-05, + "loss": 2.2377, + "step": 1332 + }, + { + "epoch": 0.10034061611998721, + "grad_norm": 5.033746242523193, + "learning_rate": 9.872831141202508e-05, + "loss": 2.5008, + "step": 1333 + }, + { + "epoch": 0.10041589040064736, + "grad_norm": 4.9681010246276855, + "learning_rate": 9.872557798885833e-05, + "loss": 2.1991, + "step": 1334 + }, + { + "epoch": 0.10049116468130752, + "grad_norm": 5.089704990386963, + "learning_rate": 9.872284166909578e-05, + "loss": 2.2982, + "step": 1335 + }, + { + "epoch": 0.10056643896196767, + "grad_norm": 4.939492702484131, + "learning_rate": 9.872010245290009e-05, + "loss": 2.2759, + "step": 1336 + }, + { + "epoch": 0.10064171324262783, + "grad_norm": 7.233314037322998, + "learning_rate": 9.87173603404341e-05, + "loss": 2.1301, + "step": 1337 + }, + { + "epoch": 0.10071698752328798, + "grad_norm": 4.265902042388916, + "learning_rate": 9.871461533186083e-05, + "loss": 2.1499, + "step": 1338 + }, + { + "epoch": 0.10079226180394814, + "grad_norm": 5.709811210632324, + "learning_rate": 9.871186742734345e-05, + "loss": 2.3327, + "step": 1339 + }, + { + "epoch": 0.10086753608460829, + "grad_norm": 6.284304141998291, + "learning_rate": 9.870911662704532e-05, + "loss": 2.1441, + "step": 1340 + }, + { + "epoch": 0.10094281036526845, + "grad_norm": 4.845076084136963, + "learning_rate": 9.870636293112999e-05, + "loss": 2.5536, + "step": 1341 + }, + { + "epoch": 0.1010180846459286, + "grad_norm": 5.140475749969482, + "learning_rate": 9.870360633976111e-05, + "loss": 2.0125, + "step": 1342 + }, + { + "epoch": 0.10109335892658876, + "grad_norm": 6.250215530395508, + "learning_rate": 9.870084685310259e-05, + "loss": 2.3919, + "step": 1343 + }, + { + "epoch": 0.1011686332072489, + "grad_norm": 5.205529689788818, + "learning_rate": 9.869808447131847e-05, + "loss": 1.9747, + "step": 1344 + }, + { + "epoch": 0.10124390748790907, + "grad_norm": 5.132730007171631, + "learning_rate": 9.869531919457296e-05, + "loss": 2.1715, + "step": 1345 + }, + { + "epoch": 0.10131918176856923, + "grad_norm": 6.4714508056640625, + "learning_rate": 9.869255102303044e-05, + "loss": 2.6872, + "step": 1346 + }, + { + "epoch": 0.10139445604922938, + "grad_norm": 5.070152759552002, + "learning_rate": 9.868977995685547e-05, + "loss": 1.7995, + "step": 1347 + }, + { + "epoch": 0.10146973032988954, + "grad_norm": 4.949240684509277, + "learning_rate": 9.868700599621279e-05, + "loss": 2.0117, + "step": 1348 + }, + { + "epoch": 0.10154500461054969, + "grad_norm": 4.624660968780518, + "learning_rate": 9.868422914126731e-05, + "loss": 2.2539, + "step": 1349 + }, + { + "epoch": 0.10162027889120985, + "grad_norm": 4.8336052894592285, + "learning_rate": 9.86814493921841e-05, + "loss": 2.418, + "step": 1350 + }, + { + "epoch": 0.10169555317187, + "grad_norm": 6.691450595855713, + "learning_rate": 9.86786667491284e-05, + "loss": 2.0609, + "step": 1351 + }, + { + "epoch": 0.10177082745253016, + "grad_norm": 5.7258806228637695, + "learning_rate": 9.867588121226563e-05, + "loss": 2.1486, + "step": 1352 + }, + { + "epoch": 0.10184610173319031, + "grad_norm": 7.780943870544434, + "learning_rate": 9.86730927817614e-05, + "loss": 2.1588, + "step": 1353 + }, + { + "epoch": 0.10192137601385047, + "grad_norm": 5.153010368347168, + "learning_rate": 9.867030145778147e-05, + "loss": 2.3842, + "step": 1354 + }, + { + "epoch": 0.10199665029451062, + "grad_norm": 8.258830070495605, + "learning_rate": 9.866750724049175e-05, + "loss": 2.2651, + "step": 1355 + }, + { + "epoch": 0.10207192457517078, + "grad_norm": 5.167582988739014, + "learning_rate": 9.866471013005838e-05, + "loss": 2.0271, + "step": 1356 + }, + { + "epoch": 0.10214719885583093, + "grad_norm": 8.057304382324219, + "learning_rate": 9.866191012664763e-05, + "loss": 2.4244, + "step": 1357 + }, + { + "epoch": 0.10222247313649109, + "grad_norm": 6.899253845214844, + "learning_rate": 9.865910723042593e-05, + "loss": 2.2179, + "step": 1358 + }, + { + "epoch": 0.10229774741715124, + "grad_norm": 4.171437740325928, + "learning_rate": 9.865630144155994e-05, + "loss": 2.4336, + "step": 1359 + }, + { + "epoch": 0.1023730216978114, + "grad_norm": 4.7057414054870605, + "learning_rate": 9.865349276021644e-05, + "loss": 2.007, + "step": 1360 + }, + { + "epoch": 0.10244829597847156, + "grad_norm": 6.168985843658447, + "learning_rate": 9.86506811865624e-05, + "loss": 2.0854, + "step": 1361 + }, + { + "epoch": 0.10252357025913171, + "grad_norm": 5.492602348327637, + "learning_rate": 9.864786672076495e-05, + "loss": 1.8508, + "step": 1362 + }, + { + "epoch": 0.10259884453979187, + "grad_norm": 4.3562164306640625, + "learning_rate": 9.864504936299141e-05, + "loss": 2.13, + "step": 1363 + }, + { + "epoch": 0.10267411882045202, + "grad_norm": 4.802323818206787, + "learning_rate": 9.864222911340926e-05, + "loss": 2.0242, + "step": 1364 + }, + { + "epoch": 0.10274939310111218, + "grad_norm": 4.20440149307251, + "learning_rate": 9.863940597218617e-05, + "loss": 2.3471, + "step": 1365 + }, + { + "epoch": 0.10282466738177233, + "grad_norm": 4.615967273712158, + "learning_rate": 9.863657993948995e-05, + "loss": 1.9809, + "step": 1366 + }, + { + "epoch": 0.10289994166243249, + "grad_norm": 6.9775919914245605, + "learning_rate": 9.863375101548862e-05, + "loss": 1.9167, + "step": 1367 + }, + { + "epoch": 0.10297521594309264, + "grad_norm": 4.219887733459473, + "learning_rate": 9.863091920035033e-05, + "loss": 2.0872, + "step": 1368 + }, + { + "epoch": 0.1030504902237528, + "grad_norm": 4.816910266876221, + "learning_rate": 9.862808449424341e-05, + "loss": 2.1472, + "step": 1369 + }, + { + "epoch": 0.10312576450441295, + "grad_norm": 5.884079933166504, + "learning_rate": 9.862524689733644e-05, + "loss": 2.058, + "step": 1370 + }, + { + "epoch": 0.10320103878507311, + "grad_norm": 7.90699577331543, + "learning_rate": 9.862240640979803e-05, + "loss": 2.2629, + "step": 1371 + }, + { + "epoch": 0.10327631306573326, + "grad_norm": 6.91544771194458, + "learning_rate": 9.861956303179707e-05, + "loss": 2.0087, + "step": 1372 + }, + { + "epoch": 0.10335158734639342, + "grad_norm": 7.192230701446533, + "learning_rate": 9.86167167635026e-05, + "loss": 2.0257, + "step": 1373 + }, + { + "epoch": 0.10342686162705357, + "grad_norm": 5.880402565002441, + "learning_rate": 9.861386760508382e-05, + "loss": 2.5124, + "step": 1374 + }, + { + "epoch": 0.10350213590771373, + "grad_norm": 5.4257283210754395, + "learning_rate": 9.86110155567101e-05, + "loss": 2.2905, + "step": 1375 + }, + { + "epoch": 0.10357741018837388, + "grad_norm": 5.242028713226318, + "learning_rate": 9.860816061855099e-05, + "loss": 2.1021, + "step": 1376 + }, + { + "epoch": 0.10365268446903404, + "grad_norm": 6.060970306396484, + "learning_rate": 9.86053027907762e-05, + "loss": 2.3967, + "step": 1377 + }, + { + "epoch": 0.1037279587496942, + "grad_norm": 5.679944038391113, + "learning_rate": 9.860244207355562e-05, + "loss": 2.2894, + "step": 1378 + }, + { + "epoch": 0.10380323303035435, + "grad_norm": 6.1499104499816895, + "learning_rate": 9.859957846705931e-05, + "loss": 2.0166, + "step": 1379 + }, + { + "epoch": 0.10387850731101451, + "grad_norm": 5.7871198654174805, + "learning_rate": 9.859671197145753e-05, + "loss": 2.6949, + "step": 1380 + }, + { + "epoch": 0.10395378159167466, + "grad_norm": 4.203625202178955, + "learning_rate": 9.859384258692064e-05, + "loss": 2.3076, + "step": 1381 + }, + { + "epoch": 0.10402905587233482, + "grad_norm": 7.567563533782959, + "learning_rate": 9.859097031361924e-05, + "loss": 2.1866, + "step": 1382 + }, + { + "epoch": 0.10410433015299497, + "grad_norm": 3.7766213417053223, + "learning_rate": 9.858809515172409e-05, + "loss": 2.1515, + "step": 1383 + }, + { + "epoch": 0.10417960443365513, + "grad_norm": 6.01621675491333, + "learning_rate": 9.858521710140609e-05, + "loss": 1.9643, + "step": 1384 + }, + { + "epoch": 0.10425487871431528, + "grad_norm": 5.394628047943115, + "learning_rate": 9.858233616283634e-05, + "loss": 1.8145, + "step": 1385 + }, + { + "epoch": 0.10433015299497544, + "grad_norm": 5.330570697784424, + "learning_rate": 9.85794523361861e-05, + "loss": 2.1916, + "step": 1386 + }, + { + "epoch": 0.10440542727563559, + "grad_norm": 4.793075084686279, + "learning_rate": 9.857656562162683e-05, + "loss": 2.3454, + "step": 1387 + }, + { + "epoch": 0.10448070155629575, + "grad_norm": 7.11953592300415, + "learning_rate": 9.85736760193301e-05, + "loss": 2.3438, + "step": 1388 + }, + { + "epoch": 0.1045559758369559, + "grad_norm": 5.012604713439941, + "learning_rate": 9.85707835294677e-05, + "loss": 2.3994, + "step": 1389 + }, + { + "epoch": 0.10463125011761606, + "grad_norm": 9.111664772033691, + "learning_rate": 9.856788815221159e-05, + "loss": 2.4086, + "step": 1390 + }, + { + "epoch": 0.10470652439827621, + "grad_norm": 5.258410930633545, + "learning_rate": 9.856498988773389e-05, + "loss": 2.0269, + "step": 1391 + }, + { + "epoch": 0.10478179867893637, + "grad_norm": 8.604662895202637, + "learning_rate": 9.856208873620688e-05, + "loss": 2.3107, + "step": 1392 + }, + { + "epoch": 0.10485707295959654, + "grad_norm": 5.09596061706543, + "learning_rate": 9.855918469780303e-05, + "loss": 2.0576, + "step": 1393 + }, + { + "epoch": 0.10493234724025668, + "grad_norm": 5.101263999938965, + "learning_rate": 9.8556277772695e-05, + "loss": 2.2593, + "step": 1394 + }, + { + "epoch": 0.10500762152091685, + "grad_norm": 4.718286514282227, + "learning_rate": 9.855336796105556e-05, + "loss": 2.382, + "step": 1395 + }, + { + "epoch": 0.105082895801577, + "grad_norm": 5.197150230407715, + "learning_rate": 9.855045526305773e-05, + "loss": 2.3176, + "step": 1396 + }, + { + "epoch": 0.10515817008223716, + "grad_norm": 5.26522970199585, + "learning_rate": 9.854753967887465e-05, + "loss": 2.1254, + "step": 1397 + }, + { + "epoch": 0.1052334443628973, + "grad_norm": 6.0825419425964355, + "learning_rate": 9.85446212086796e-05, + "loss": 1.9642, + "step": 1398 + }, + { + "epoch": 0.10530871864355747, + "grad_norm": 4.064992904663086, + "learning_rate": 9.854169985264614e-05, + "loss": 2.183, + "step": 1399 + }, + { + "epoch": 0.10538399292421761, + "grad_norm": 5.792464256286621, + "learning_rate": 9.853877561094789e-05, + "loss": 2.3714, + "step": 1400 + }, + { + "epoch": 0.10545926720487778, + "grad_norm": 5.2079758644104, + "learning_rate": 9.85358484837587e-05, + "loss": 2.5722, + "step": 1401 + }, + { + "epoch": 0.10553454148553792, + "grad_norm": 4.400840759277344, + "learning_rate": 9.853291847125258e-05, + "loss": 2.7732, + "step": 1402 + }, + { + "epoch": 0.10560981576619809, + "grad_norm": 5.1285834312438965, + "learning_rate": 9.852998557360373e-05, + "loss": 2.3474, + "step": 1403 + }, + { + "epoch": 0.10568509004685823, + "grad_norm": 4.393636703491211, + "learning_rate": 9.852704979098648e-05, + "loss": 2.2061, + "step": 1404 + }, + { + "epoch": 0.1057603643275184, + "grad_norm": 6.350622653961182, + "learning_rate": 9.852411112357536e-05, + "loss": 2.113, + "step": 1405 + }, + { + "epoch": 0.10583563860817854, + "grad_norm": 6.430047035217285, + "learning_rate": 9.852116957154505e-05, + "loss": 2.0917, + "step": 1406 + }, + { + "epoch": 0.1059109128888387, + "grad_norm": 5.039078712463379, + "learning_rate": 9.851822513507045e-05, + "loss": 2.4053, + "step": 1407 + }, + { + "epoch": 0.10598618716949885, + "grad_norm": 4.880955696105957, + "learning_rate": 9.851527781432657e-05, + "loss": 2.3186, + "step": 1408 + }, + { + "epoch": 0.10606146145015902, + "grad_norm": 5.812760829925537, + "learning_rate": 9.851232760948864e-05, + "loss": 1.9694, + "step": 1409 + }, + { + "epoch": 0.10613673573081918, + "grad_norm": 4.599240303039551, + "learning_rate": 9.850937452073203e-05, + "loss": 2.4078, + "step": 1410 + }, + { + "epoch": 0.10621201001147933, + "grad_norm": 4.970304489135742, + "learning_rate": 9.850641854823228e-05, + "loss": 2.14, + "step": 1411 + }, + { + "epoch": 0.10628728429213949, + "grad_norm": 5.6756086349487305, + "learning_rate": 9.850345969216514e-05, + "loss": 2.157, + "step": 1412 + }, + { + "epoch": 0.10636255857279964, + "grad_norm": 4.906312465667725, + "learning_rate": 9.850049795270649e-05, + "loss": 2.2399, + "step": 1413 + }, + { + "epoch": 0.1064378328534598, + "grad_norm": 4.625144958496094, + "learning_rate": 9.84975333300324e-05, + "loss": 2.2531, + "step": 1414 + }, + { + "epoch": 0.10651310713411995, + "grad_norm": 4.526831150054932, + "learning_rate": 9.84945658243191e-05, + "loss": 2.4922, + "step": 1415 + }, + { + "epoch": 0.10658838141478011, + "grad_norm": 5.445694923400879, + "learning_rate": 9.849159543574302e-05, + "loss": 2.0667, + "step": 1416 + }, + { + "epoch": 0.10666365569544026, + "grad_norm": 3.5116755962371826, + "learning_rate": 9.848862216448074e-05, + "loss": 2.1409, + "step": 1417 + }, + { + "epoch": 0.10673892997610042, + "grad_norm": 7.167372703552246, + "learning_rate": 9.848564601070897e-05, + "loss": 2.0867, + "step": 1418 + }, + { + "epoch": 0.10681420425676057, + "grad_norm": 5.482428073883057, + "learning_rate": 9.848266697460469e-05, + "loss": 2.4357, + "step": 1419 + }, + { + "epoch": 0.10688947853742073, + "grad_norm": 5.8753180503845215, + "learning_rate": 9.847968505634497e-05, + "loss": 2.3368, + "step": 1420 + }, + { + "epoch": 0.10696475281808088, + "grad_norm": 4.832120895385742, + "learning_rate": 9.847670025610707e-05, + "loss": 1.996, + "step": 1421 + }, + { + "epoch": 0.10704002709874104, + "grad_norm": 6.677201747894287, + "learning_rate": 9.847371257406843e-05, + "loss": 2.5662, + "step": 1422 + }, + { + "epoch": 0.10711530137940119, + "grad_norm": 4.344008445739746, + "learning_rate": 9.847072201040667e-05, + "loss": 2.2108, + "step": 1423 + }, + { + "epoch": 0.10719057566006135, + "grad_norm": 6.102686405181885, + "learning_rate": 9.846772856529958e-05, + "loss": 2.3959, + "step": 1424 + }, + { + "epoch": 0.1072658499407215, + "grad_norm": 7.902398586273193, + "learning_rate": 9.846473223892508e-05, + "loss": 2.1278, + "step": 1425 + }, + { + "epoch": 0.10734112422138166, + "grad_norm": 5.588354110717773, + "learning_rate": 9.846173303146131e-05, + "loss": 2.3202, + "step": 1426 + }, + { + "epoch": 0.10741639850204182, + "grad_norm": 6.013638973236084, + "learning_rate": 9.845873094308656e-05, + "loss": 2.1313, + "step": 1427 + }, + { + "epoch": 0.10749167278270197, + "grad_norm": 4.5509443283081055, + "learning_rate": 9.84557259739793e-05, + "loss": 2.388, + "step": 1428 + }, + { + "epoch": 0.10756694706336213, + "grad_norm": 4.707016468048096, + "learning_rate": 9.845271812431817e-05, + "loss": 2.4723, + "step": 1429 + }, + { + "epoch": 0.10764222134402228, + "grad_norm": 6.561839580535889, + "learning_rate": 9.844970739428199e-05, + "loss": 2.0649, + "step": 1430 + }, + { + "epoch": 0.10771749562468244, + "grad_norm": 5.726894378662109, + "learning_rate": 9.844669378404969e-05, + "loss": 2.1675, + "step": 1431 + }, + { + "epoch": 0.10779276990534259, + "grad_norm": 7.741239547729492, + "learning_rate": 9.844367729380049e-05, + "loss": 2.2878, + "step": 1432 + }, + { + "epoch": 0.10786804418600275, + "grad_norm": 10.629728317260742, + "learning_rate": 9.844065792371364e-05, + "loss": 2.5216, + "step": 1433 + }, + { + "epoch": 0.1079433184666629, + "grad_norm": 4.772575378417969, + "learning_rate": 9.843763567396868e-05, + "loss": 2.0712, + "step": 1434 + }, + { + "epoch": 0.10801859274732306, + "grad_norm": 4.807673931121826, + "learning_rate": 9.843461054474528e-05, + "loss": 2.0015, + "step": 1435 + }, + { + "epoch": 0.10809386702798321, + "grad_norm": 5.960400104522705, + "learning_rate": 9.843158253622325e-05, + "loss": 2.1179, + "step": 1436 + }, + { + "epoch": 0.10816914130864337, + "grad_norm": 4.638011932373047, + "learning_rate": 9.84285516485826e-05, + "loss": 2.0644, + "step": 1437 + }, + { + "epoch": 0.10824441558930352, + "grad_norm": 5.544718265533447, + "learning_rate": 9.84255178820035e-05, + "loss": 2.3577, + "step": 1438 + }, + { + "epoch": 0.10831968986996368, + "grad_norm": 5.5586371421813965, + "learning_rate": 9.842248123666632e-05, + "loss": 2.5347, + "step": 1439 + }, + { + "epoch": 0.10839496415062383, + "grad_norm": 4.557663440704346, + "learning_rate": 9.841944171275157e-05, + "loss": 2.1664, + "step": 1440 + }, + { + "epoch": 0.10847023843128399, + "grad_norm": 4.60097599029541, + "learning_rate": 9.841639931043994e-05, + "loss": 2.2154, + "step": 1441 + }, + { + "epoch": 0.10854551271194415, + "grad_norm": 4.8958282470703125, + "learning_rate": 9.84133540299123e-05, + "loss": 2.2027, + "step": 1442 + }, + { + "epoch": 0.1086207869926043, + "grad_norm": 6.68956995010376, + "learning_rate": 9.841030587134967e-05, + "loss": 2.2456, + "step": 1443 + }, + { + "epoch": 0.10869606127326446, + "grad_norm": 4.5504889488220215, + "learning_rate": 9.840725483493325e-05, + "loss": 2.1168, + "step": 1444 + }, + { + "epoch": 0.10877133555392461, + "grad_norm": 4.70060920715332, + "learning_rate": 9.840420092084443e-05, + "loss": 2.0425, + "step": 1445 + }, + { + "epoch": 0.10884660983458477, + "grad_norm": 4.551839828491211, + "learning_rate": 9.840114412926478e-05, + "loss": 2.1641, + "step": 1446 + }, + { + "epoch": 0.10892188411524492, + "grad_norm": 5.133751392364502, + "learning_rate": 9.839808446037598e-05, + "loss": 2.4, + "step": 1447 + }, + { + "epoch": 0.10899715839590508, + "grad_norm": 4.662445068359375, + "learning_rate": 9.83950219143599e-05, + "loss": 2.261, + "step": 1448 + }, + { + "epoch": 0.10907243267656523, + "grad_norm": 7.807523250579834, + "learning_rate": 9.839195649139864e-05, + "loss": 2.3798, + "step": 1449 + }, + { + "epoch": 0.1091477069572254, + "grad_norm": 5.682362079620361, + "learning_rate": 9.838888819167443e-05, + "loss": 2.3878, + "step": 1450 + }, + { + "epoch": 0.10922298123788554, + "grad_norm": 5.008208274841309, + "learning_rate": 9.838581701536967e-05, + "loss": 2.1507, + "step": 1451 + }, + { + "epoch": 0.1092982555185457, + "grad_norm": 5.632689952850342, + "learning_rate": 9.83827429626669e-05, + "loss": 1.7746, + "step": 1452 + }, + { + "epoch": 0.10937352979920585, + "grad_norm": 3.6657118797302246, + "learning_rate": 9.837966603374889e-05, + "loss": 2.3033, + "step": 1453 + }, + { + "epoch": 0.10944880407986601, + "grad_norm": 5.824965000152588, + "learning_rate": 9.837658622879856e-05, + "loss": 2.4717, + "step": 1454 + }, + { + "epoch": 0.10952407836052616, + "grad_norm": 4.90222692489624, + "learning_rate": 9.8373503547999e-05, + "loss": 2.1955, + "step": 1455 + }, + { + "epoch": 0.10959935264118632, + "grad_norm": 3.9360909461975098, + "learning_rate": 9.837041799153342e-05, + "loss": 2.1443, + "step": 1456 + }, + { + "epoch": 0.10967462692184647, + "grad_norm": 5.997777938842773, + "learning_rate": 9.83673295595853e-05, + "loss": 2.4089, + "step": 1457 + }, + { + "epoch": 0.10974990120250663, + "grad_norm": 7.144850730895996, + "learning_rate": 9.836423825233821e-05, + "loss": 2.2587, + "step": 1458 + }, + { + "epoch": 0.1098251754831668, + "grad_norm": 6.629261493682861, + "learning_rate": 9.836114406997594e-05, + "loss": 2.4092, + "step": 1459 + }, + { + "epoch": 0.10990044976382694, + "grad_norm": 5.653792858123779, + "learning_rate": 9.835804701268241e-05, + "loss": 2.1819, + "step": 1460 + }, + { + "epoch": 0.1099757240444871, + "grad_norm": 5.176329612731934, + "learning_rate": 9.835494708064175e-05, + "loss": 2.0457, + "step": 1461 + }, + { + "epoch": 0.11005099832514725, + "grad_norm": 4.583652973175049, + "learning_rate": 9.835184427403822e-05, + "loss": 2.1199, + "step": 1462 + }, + { + "epoch": 0.11012627260580742, + "grad_norm": 7.92254638671875, + "learning_rate": 9.834873859305629e-05, + "loss": 2.831, + "step": 1463 + }, + { + "epoch": 0.11020154688646756, + "grad_norm": 4.404693126678467, + "learning_rate": 9.834563003788057e-05, + "loss": 2.0809, + "step": 1464 + }, + { + "epoch": 0.11027682116712773, + "grad_norm": 3.798464775085449, + "learning_rate": 9.834251860869588e-05, + "loss": 2.3996, + "step": 1465 + }, + { + "epoch": 0.11035209544778787, + "grad_norm": 6.599026203155518, + "learning_rate": 9.833940430568715e-05, + "loss": 2.1482, + "step": 1466 + }, + { + "epoch": 0.11042736972844804, + "grad_norm": 5.410624980926514, + "learning_rate": 9.833628712903957e-05, + "loss": 2.271, + "step": 1467 + }, + { + "epoch": 0.11050264400910818, + "grad_norm": 4.607469081878662, + "learning_rate": 9.833316707893838e-05, + "loss": 2.2432, + "step": 1468 + }, + { + "epoch": 0.11057791828976835, + "grad_norm": 5.609137535095215, + "learning_rate": 9.83300441555691e-05, + "loss": 2.2847, + "step": 1469 + }, + { + "epoch": 0.1106531925704285, + "grad_norm": 4.124760627746582, + "learning_rate": 9.832691835911736e-05, + "loss": 2.2725, + "step": 1470 + }, + { + "epoch": 0.11072846685108866, + "grad_norm": 6.28630256652832, + "learning_rate": 9.832378968976901e-05, + "loss": 2.1676, + "step": 1471 + }, + { + "epoch": 0.1108037411317488, + "grad_norm": 5.398787975311279, + "learning_rate": 9.832065814771e-05, + "loss": 2.1924, + "step": 1472 + }, + { + "epoch": 0.11087901541240897, + "grad_norm": 4.385289192199707, + "learning_rate": 9.831752373312653e-05, + "loss": 2.2452, + "step": 1473 + }, + { + "epoch": 0.11095428969306913, + "grad_norm": 4.326237201690674, + "learning_rate": 9.831438644620489e-05, + "loss": 2.2244, + "step": 1474 + }, + { + "epoch": 0.11102956397372928, + "grad_norm": 5.806609153747559, + "learning_rate": 9.831124628713161e-05, + "loss": 1.7286, + "step": 1475 + }, + { + "epoch": 0.11110483825438944, + "grad_norm": 5.2352399826049805, + "learning_rate": 9.830810325609337e-05, + "loss": 2.1993, + "step": 1476 + }, + { + "epoch": 0.11118011253504959, + "grad_norm": 5.491091728210449, + "learning_rate": 9.8304957353277e-05, + "loss": 2.1426, + "step": 1477 + }, + { + "epoch": 0.11125538681570975, + "grad_norm": 4.986297130584717, + "learning_rate": 9.830180857886952e-05, + "loss": 2.0211, + "step": 1478 + }, + { + "epoch": 0.1113306610963699, + "grad_norm": 4.85268497467041, + "learning_rate": 9.829865693305811e-05, + "loss": 2.2682, + "step": 1479 + }, + { + "epoch": 0.11140593537703006, + "grad_norm": 5.306323051452637, + "learning_rate": 9.829550241603012e-05, + "loss": 2.2754, + "step": 1480 + }, + { + "epoch": 0.1114812096576902, + "grad_norm": 4.7514777183532715, + "learning_rate": 9.82923450279731e-05, + "loss": 2.2996, + "step": 1481 + }, + { + "epoch": 0.11155648393835037, + "grad_norm": 7.218596935272217, + "learning_rate": 9.828918476907472e-05, + "loss": 2.0388, + "step": 1482 + }, + { + "epoch": 0.11163175821901052, + "grad_norm": 5.108249664306641, + "learning_rate": 9.828602163952287e-05, + "loss": 2.0017, + "step": 1483 + }, + { + "epoch": 0.11170703249967068, + "grad_norm": 6.2356109619140625, + "learning_rate": 9.828285563950558e-05, + "loss": 2.1725, + "step": 1484 + }, + { + "epoch": 0.11178230678033083, + "grad_norm": 5.727863311767578, + "learning_rate": 9.827968676921108e-05, + "loss": 2.8907, + "step": 1485 + }, + { + "epoch": 0.11185758106099099, + "grad_norm": 5.71699333190918, + "learning_rate": 9.827651502882771e-05, + "loss": 2.2345, + "step": 1486 + }, + { + "epoch": 0.11193285534165114, + "grad_norm": 6.761249542236328, + "learning_rate": 9.827334041854406e-05, + "loss": 2.2687, + "step": 1487 + }, + { + "epoch": 0.1120081296223113, + "grad_norm": 6.661614418029785, + "learning_rate": 9.827016293854882e-05, + "loss": 2.1393, + "step": 1488 + }, + { + "epoch": 0.11208340390297145, + "grad_norm": 6.173034191131592, + "learning_rate": 9.82669825890309e-05, + "loss": 2.2199, + "step": 1489 + }, + { + "epoch": 0.11215867818363161, + "grad_norm": 4.6556620597839355, + "learning_rate": 9.826379937017936e-05, + "loss": 3.0586, + "step": 1490 + }, + { + "epoch": 0.11223395246429177, + "grad_norm": 4.349430084228516, + "learning_rate": 9.826061328218342e-05, + "loss": 2.5344, + "step": 1491 + }, + { + "epoch": 0.11230922674495192, + "grad_norm": 5.721402645111084, + "learning_rate": 9.825742432523251e-05, + "loss": 2.1835, + "step": 1492 + }, + { + "epoch": 0.11238450102561208, + "grad_norm": 5.224651336669922, + "learning_rate": 9.825423249951618e-05, + "loss": 2.1935, + "step": 1493 + }, + { + "epoch": 0.11245977530627223, + "grad_norm": 5.001589775085449, + "learning_rate": 9.82510378052242e-05, + "loss": 2.1997, + "step": 1494 + }, + { + "epoch": 0.11253504958693239, + "grad_norm": 5.430053234100342, + "learning_rate": 9.824784024254645e-05, + "loss": 2.1947, + "step": 1495 + }, + { + "epoch": 0.11261032386759254, + "grad_norm": 6.980978965759277, + "learning_rate": 9.824463981167305e-05, + "loss": 2.3193, + "step": 1496 + }, + { + "epoch": 0.1126855981482527, + "grad_norm": 6.289052486419678, + "learning_rate": 9.824143651279424e-05, + "loss": 1.9883, + "step": 1497 + }, + { + "epoch": 0.11276087242891285, + "grad_norm": 4.463650703430176, + "learning_rate": 9.823823034610045e-05, + "loss": 1.9558, + "step": 1498 + }, + { + "epoch": 0.11283614670957301, + "grad_norm": 5.06191349029541, + "learning_rate": 9.823502131178226e-05, + "loss": 2.2293, + "step": 1499 + }, + { + "epoch": 0.11291142099023316, + "grad_norm": 4.8617167472839355, + "learning_rate": 9.823180941003048e-05, + "loss": 2.194, + "step": 1500 + }, + { + "epoch": 0.11298669527089332, + "grad_norm": 4.87706184387207, + "learning_rate": 9.822859464103602e-05, + "loss": 2.1378, + "step": 1501 + }, + { + "epoch": 0.11306196955155347, + "grad_norm": 6.628759860992432, + "learning_rate": 9.822537700498999e-05, + "loss": 2.1634, + "step": 1502 + }, + { + "epoch": 0.11313724383221363, + "grad_norm": 4.992720603942871, + "learning_rate": 9.822215650208367e-05, + "loss": 1.8693, + "step": 1503 + }, + { + "epoch": 0.11321251811287378, + "grad_norm": 4.385207653045654, + "learning_rate": 9.821893313250849e-05, + "loss": 2.4103, + "step": 1504 + }, + { + "epoch": 0.11328779239353394, + "grad_norm": 4.952878475189209, + "learning_rate": 9.821570689645612e-05, + "loss": 2.3154, + "step": 1505 + }, + { + "epoch": 0.11336306667419409, + "grad_norm": 7.591663837432861, + "learning_rate": 9.82124777941183e-05, + "loss": 2.9471, + "step": 1506 + }, + { + "epoch": 0.11343834095485425, + "grad_norm": 5.995678901672363, + "learning_rate": 9.820924582568703e-05, + "loss": 2.2399, + "step": 1507 + }, + { + "epoch": 0.11351361523551441, + "grad_norm": 5.2290496826171875, + "learning_rate": 9.820601099135441e-05, + "loss": 2.1638, + "step": 1508 + }, + { + "epoch": 0.11358888951617456, + "grad_norm": 6.682706832885742, + "learning_rate": 9.820277329131276e-05, + "loss": 2.3476, + "step": 1509 + }, + { + "epoch": 0.11366416379683472, + "grad_norm": 5.347593784332275, + "learning_rate": 9.819953272575455e-05, + "loss": 1.9392, + "step": 1510 + }, + { + "epoch": 0.11373943807749487, + "grad_norm": 6.090190410614014, + "learning_rate": 9.81962892948724e-05, + "loss": 2.4825, + "step": 1511 + }, + { + "epoch": 0.11381471235815503, + "grad_norm": 5.386474609375, + "learning_rate": 9.819304299885918e-05, + "loss": 2.4159, + "step": 1512 + }, + { + "epoch": 0.11388998663881518, + "grad_norm": 7.41466760635376, + "learning_rate": 9.818979383790781e-05, + "loss": 2.2016, + "step": 1513 + }, + { + "epoch": 0.11396526091947534, + "grad_norm": 6.078017711639404, + "learning_rate": 9.818654181221147e-05, + "loss": 1.9737, + "step": 1514 + }, + { + "epoch": 0.11404053520013549, + "grad_norm": 5.459829807281494, + "learning_rate": 9.818328692196347e-05, + "loss": 2.411, + "step": 1515 + }, + { + "epoch": 0.11411580948079565, + "grad_norm": 4.553897857666016, + "learning_rate": 9.818002916735734e-05, + "loss": 2.336, + "step": 1516 + }, + { + "epoch": 0.1141910837614558, + "grad_norm": 6.528814792633057, + "learning_rate": 9.81767685485867e-05, + "loss": 2.4098, + "step": 1517 + }, + { + "epoch": 0.11426635804211596, + "grad_norm": 12.733213424682617, + "learning_rate": 9.817350506584543e-05, + "loss": 2.3403, + "step": 1518 + }, + { + "epoch": 0.11434163232277611, + "grad_norm": 6.481761455535889, + "learning_rate": 9.817023871932749e-05, + "loss": 2.187, + "step": 1519 + }, + { + "epoch": 0.11441690660343627, + "grad_norm": 5.830627918243408, + "learning_rate": 9.816696950922709e-05, + "loss": 2.1685, + "step": 1520 + }, + { + "epoch": 0.11449218088409642, + "grad_norm": 4.158799171447754, + "learning_rate": 9.816369743573853e-05, + "loss": 2.2313, + "step": 1521 + }, + { + "epoch": 0.11456745516475658, + "grad_norm": 5.076837062835693, + "learning_rate": 9.816042249905637e-05, + "loss": 2.0637, + "step": 1522 + }, + { + "epoch": 0.11464272944541674, + "grad_norm": 4.538496017456055, + "learning_rate": 9.815714469937529e-05, + "loss": 2.045, + "step": 1523 + }, + { + "epoch": 0.11471800372607689, + "grad_norm": 5.56032657623291, + "learning_rate": 9.815386403689013e-05, + "loss": 1.7054, + "step": 1524 + }, + { + "epoch": 0.11479327800673705, + "grad_norm": 4.467593193054199, + "learning_rate": 9.815058051179591e-05, + "loss": 2.1045, + "step": 1525 + }, + { + "epoch": 0.1148685522873972, + "grad_norm": 4.002077579498291, + "learning_rate": 9.814729412428785e-05, + "loss": 2.0444, + "step": 1526 + }, + { + "epoch": 0.11494382656805736, + "grad_norm": 4.604103088378906, + "learning_rate": 9.81440048745613e-05, + "loss": 2.5774, + "step": 1527 + }, + { + "epoch": 0.11501910084871751, + "grad_norm": 4.999310493469238, + "learning_rate": 9.81407127628118e-05, + "loss": 2.2775, + "step": 1528 + }, + { + "epoch": 0.11509437512937767, + "grad_norm": 4.501382827758789, + "learning_rate": 9.813741778923506e-05, + "loss": 2.1369, + "step": 1529 + }, + { + "epoch": 0.11516964941003782, + "grad_norm": 6.101622581481934, + "learning_rate": 9.813411995402697e-05, + "loss": 2.5277, + "step": 1530 + }, + { + "epoch": 0.11524492369069798, + "grad_norm": 4.6172566413879395, + "learning_rate": 9.813081925738354e-05, + "loss": 2.0067, + "step": 1531 + }, + { + "epoch": 0.11532019797135813, + "grad_norm": 5.320746421813965, + "learning_rate": 9.812751569950101e-05, + "loss": 2.1142, + "step": 1532 + }, + { + "epoch": 0.1153954722520183, + "grad_norm": 7.294499397277832, + "learning_rate": 9.812420928057578e-05, + "loss": 2.0137, + "step": 1533 + }, + { + "epoch": 0.11547074653267844, + "grad_norm": 6.202102184295654, + "learning_rate": 9.812090000080437e-05, + "loss": 3.0938, + "step": 1534 + }, + { + "epoch": 0.1155460208133386, + "grad_norm": 4.939585208892822, + "learning_rate": 9.811758786038354e-05, + "loss": 2.2617, + "step": 1535 + }, + { + "epoch": 0.11562129509399875, + "grad_norm": 6.0335187911987305, + "learning_rate": 9.811427285951018e-05, + "loss": 2.3668, + "step": 1536 + }, + { + "epoch": 0.11569656937465891, + "grad_norm": 5.094693183898926, + "learning_rate": 9.811095499838134e-05, + "loss": 2.2623, + "step": 1537 + }, + { + "epoch": 0.11577184365531906, + "grad_norm": 5.980704307556152, + "learning_rate": 9.81076342771943e-05, + "loss": 2.188, + "step": 1538 + }, + { + "epoch": 0.11584711793597922, + "grad_norm": 5.815098762512207, + "learning_rate": 9.81043106961464e-05, + "loss": 2.2217, + "step": 1539 + }, + { + "epoch": 0.11592239221663939, + "grad_norm": 4.305836200714111, + "learning_rate": 9.810098425543528e-05, + "loss": 2.3161, + "step": 1540 + }, + { + "epoch": 0.11599766649729953, + "grad_norm": 7.828603744506836, + "learning_rate": 9.809765495525867e-05, + "loss": 2.0805, + "step": 1541 + }, + { + "epoch": 0.1160729407779597, + "grad_norm": 6.011185169219971, + "learning_rate": 9.809432279581446e-05, + "loss": 2.4162, + "step": 1542 + }, + { + "epoch": 0.11614821505861984, + "grad_norm": 5.189256191253662, + "learning_rate": 9.809098777730077e-05, + "loss": 2.3483, + "step": 1543 + }, + { + "epoch": 0.11622348933928, + "grad_norm": 5.920383453369141, + "learning_rate": 9.808764989991585e-05, + "loss": 2.3059, + "step": 1544 + }, + { + "epoch": 0.11629876361994015, + "grad_norm": 8.073580741882324, + "learning_rate": 9.808430916385811e-05, + "loss": 2.5592, + "step": 1545 + }, + { + "epoch": 0.11637403790060032, + "grad_norm": 5.3328680992126465, + "learning_rate": 9.808096556932616e-05, + "loss": 2.3284, + "step": 1546 + }, + { + "epoch": 0.11644931218126046, + "grad_norm": 5.027093410491943, + "learning_rate": 9.807761911651878e-05, + "loss": 2.0594, + "step": 1547 + }, + { + "epoch": 0.11652458646192063, + "grad_norm": 4.8395209312438965, + "learning_rate": 9.807426980563488e-05, + "loss": 2.3464, + "step": 1548 + }, + { + "epoch": 0.11659986074258077, + "grad_norm": 5.231376647949219, + "learning_rate": 9.807091763687359e-05, + "loss": 2.1779, + "step": 1549 + }, + { + "epoch": 0.11667513502324094, + "grad_norm": 4.496547698974609, + "learning_rate": 9.806756261043417e-05, + "loss": 2.2298, + "step": 1550 + }, + { + "epoch": 0.11675040930390108, + "grad_norm": 5.417749404907227, + "learning_rate": 9.806420472651607e-05, + "loss": 2.4479, + "step": 1551 + }, + { + "epoch": 0.11682568358456125, + "grad_norm": 6.970733642578125, + "learning_rate": 9.806084398531892e-05, + "loss": 2.2433, + "step": 1552 + }, + { + "epoch": 0.1169009578652214, + "grad_norm": 4.662734508514404, + "learning_rate": 9.80574803870425e-05, + "loss": 2.1779, + "step": 1553 + }, + { + "epoch": 0.11697623214588156, + "grad_norm": 3.979790210723877, + "learning_rate": 9.805411393188676e-05, + "loss": 2.1901, + "step": 1554 + }, + { + "epoch": 0.11705150642654172, + "grad_norm": 5.445489883422852, + "learning_rate": 9.805074462005183e-05, + "loss": 2.3631, + "step": 1555 + }, + { + "epoch": 0.11712678070720187, + "grad_norm": 5.91757345199585, + "learning_rate": 9.804737245173801e-05, + "loss": 2.2131, + "step": 1556 + }, + { + "epoch": 0.11720205498786203, + "grad_norm": 5.672887325286865, + "learning_rate": 9.804399742714574e-05, + "loss": 2.2574, + "step": 1557 + }, + { + "epoch": 0.11727732926852218, + "grad_norm": 6.305829048156738, + "learning_rate": 9.80406195464757e-05, + "loss": 2.2388, + "step": 1558 + }, + { + "epoch": 0.11735260354918234, + "grad_norm": 6.497957706451416, + "learning_rate": 9.803723880992866e-05, + "loss": 2.2928, + "step": 1559 + }, + { + "epoch": 0.11742787782984249, + "grad_norm": 4.699962139129639, + "learning_rate": 9.80338552177056e-05, + "loss": 1.9581, + "step": 1560 + }, + { + "epoch": 0.11750315211050265, + "grad_norm": 5.287069320678711, + "learning_rate": 9.803046877000768e-05, + "loss": 2.7058, + "step": 1561 + }, + { + "epoch": 0.1175784263911628, + "grad_norm": 5.612594127655029, + "learning_rate": 9.802707946703622e-05, + "loss": 2.0045, + "step": 1562 + }, + { + "epoch": 0.11765370067182296, + "grad_norm": 8.759663581848145, + "learning_rate": 9.802368730899267e-05, + "loss": 1.9407, + "step": 1563 + }, + { + "epoch": 0.1177289749524831, + "grad_norm": 3.793367624282837, + "learning_rate": 9.802029229607873e-05, + "loss": 2.0367, + "step": 1564 + }, + { + "epoch": 0.11780424923314327, + "grad_norm": 4.237112045288086, + "learning_rate": 9.801689442849617e-05, + "loss": 2.8146, + "step": 1565 + }, + { + "epoch": 0.11787952351380342, + "grad_norm": 6.132881164550781, + "learning_rate": 9.801349370644702e-05, + "loss": 2.2959, + "step": 1566 + }, + { + "epoch": 0.11795479779446358, + "grad_norm": 4.572925090789795, + "learning_rate": 9.801009013013345e-05, + "loss": 2.534, + "step": 1567 + }, + { + "epoch": 0.11803007207512373, + "grad_norm": 8.123010635375977, + "learning_rate": 9.800668369975776e-05, + "loss": 2.5581, + "step": 1568 + }, + { + "epoch": 0.11810534635578389, + "grad_norm": 4.87898588180542, + "learning_rate": 9.800327441552248e-05, + "loss": 2.8425, + "step": 1569 + }, + { + "epoch": 0.11818062063644404, + "grad_norm": 5.832376003265381, + "learning_rate": 9.799986227763027e-05, + "loss": 1.9416, + "step": 1570 + }, + { + "epoch": 0.1182558949171042, + "grad_norm": 5.254724025726318, + "learning_rate": 9.799644728628398e-05, + "loss": 2.1531, + "step": 1571 + }, + { + "epoch": 0.11833116919776436, + "grad_norm": 8.972786903381348, + "learning_rate": 9.799302944168662e-05, + "loss": 2.2018, + "step": 1572 + }, + { + "epoch": 0.11840644347842451, + "grad_norm": 5.709095478057861, + "learning_rate": 9.798960874404135e-05, + "loss": 2.0954, + "step": 1573 + }, + { + "epoch": 0.11848171775908467, + "grad_norm": 5.663673400878906, + "learning_rate": 9.798618519355155e-05, + "loss": 2.52, + "step": 1574 + }, + { + "epoch": 0.11855699203974482, + "grad_norm": 6.007000923156738, + "learning_rate": 9.798275879042073e-05, + "loss": 2.2824, + "step": 1575 + }, + { + "epoch": 0.11863226632040498, + "grad_norm": 3.155287742614746, + "learning_rate": 9.797932953485258e-05, + "loss": 1.9951, + "step": 1576 + }, + { + "epoch": 0.11870754060106513, + "grad_norm": 5.0395050048828125, + "learning_rate": 9.797589742705094e-05, + "loss": 2.5841, + "step": 1577 + }, + { + "epoch": 0.11878281488172529, + "grad_norm": 3.9909863471984863, + "learning_rate": 9.797246246721988e-05, + "loss": 2.1333, + "step": 1578 + }, + { + "epoch": 0.11885808916238544, + "grad_norm": 5.659982204437256, + "learning_rate": 9.796902465556354e-05, + "loss": 1.9128, + "step": 1579 + }, + { + "epoch": 0.1189333634430456, + "grad_norm": 5.3229780197143555, + "learning_rate": 9.796558399228635e-05, + "loss": 2.0977, + "step": 1580 + }, + { + "epoch": 0.11900863772370575, + "grad_norm": 4.665555000305176, + "learning_rate": 9.796214047759282e-05, + "loss": 2.0713, + "step": 1581 + }, + { + "epoch": 0.11908391200436591, + "grad_norm": 5.370392322540283, + "learning_rate": 9.795869411168766e-05, + "loss": 2.1872, + "step": 1582 + }, + { + "epoch": 0.11915918628502606, + "grad_norm": 5.251527786254883, + "learning_rate": 9.795524489477572e-05, + "loss": 2.2613, + "step": 1583 + }, + { + "epoch": 0.11923446056568622, + "grad_norm": 6.221897125244141, + "learning_rate": 9.79517928270621e-05, + "loss": 2.9688, + "step": 1584 + }, + { + "epoch": 0.11930973484634637, + "grad_norm": 6.268673419952393, + "learning_rate": 9.794833790875197e-05, + "loss": 2.2185, + "step": 1585 + }, + { + "epoch": 0.11938500912700653, + "grad_norm": 4.565062999725342, + "learning_rate": 9.794488014005072e-05, + "loss": 2.1612, + "step": 1586 + }, + { + "epoch": 0.11946028340766668, + "grad_norm": 5.256009578704834, + "learning_rate": 9.794141952116393e-05, + "loss": 2.1432, + "step": 1587 + }, + { + "epoch": 0.11953555768832684, + "grad_norm": 6.327003002166748, + "learning_rate": 9.793795605229731e-05, + "loss": 2.2175, + "step": 1588 + }, + { + "epoch": 0.119610831968987, + "grad_norm": 4.936118125915527, + "learning_rate": 9.793448973365676e-05, + "loss": 2.7965, + "step": 1589 + }, + { + "epoch": 0.11968610624964715, + "grad_norm": 4.425882339477539, + "learning_rate": 9.793102056544832e-05, + "loss": 2.0861, + "step": 1590 + }, + { + "epoch": 0.11976138053030731, + "grad_norm": 3.899171829223633, + "learning_rate": 9.792754854787824e-05, + "loss": 2.0353, + "step": 1591 + }, + { + "epoch": 0.11983665481096746, + "grad_norm": 4.647331237792969, + "learning_rate": 9.79240736811529e-05, + "loss": 2.1953, + "step": 1592 + }, + { + "epoch": 0.11991192909162762, + "grad_norm": 3.512587308883667, + "learning_rate": 9.792059596547891e-05, + "loss": 2.0129, + "step": 1593 + }, + { + "epoch": 0.11998720337228777, + "grad_norm": 5.037900447845459, + "learning_rate": 9.791711540106299e-05, + "loss": 2.5432, + "step": 1594 + }, + { + "epoch": 0.12006247765294793, + "grad_norm": 4.464351654052734, + "learning_rate": 9.791363198811205e-05, + "loss": 2.0256, + "step": 1595 + }, + { + "epoch": 0.12013775193360808, + "grad_norm": 5.208583831787109, + "learning_rate": 9.791014572683316e-05, + "loss": 2.499, + "step": 1596 + }, + { + "epoch": 0.12021302621426824, + "grad_norm": 5.325770854949951, + "learning_rate": 9.79066566174336e-05, + "loss": 2.0296, + "step": 1597 + }, + { + "epoch": 0.12028830049492839, + "grad_norm": 4.36183500289917, + "learning_rate": 9.790316466012073e-05, + "loss": 2.17, + "step": 1598 + }, + { + "epoch": 0.12036357477558855, + "grad_norm": 5.266951084136963, + "learning_rate": 9.789966985510218e-05, + "loss": 2.0156, + "step": 1599 + }, + { + "epoch": 0.1204388490562487, + "grad_norm": 6.464771747589111, + "learning_rate": 9.789617220258571e-05, + "loss": 2.1139, + "step": 1600 + }, + { + "epoch": 0.12051412333690886, + "grad_norm": 6.859620094299316, + "learning_rate": 9.789267170277922e-05, + "loss": 2.3087, + "step": 1601 + }, + { + "epoch": 0.12058939761756901, + "grad_norm": 5.349453926086426, + "learning_rate": 9.788916835589082e-05, + "loss": 1.9592, + "step": 1602 + }, + { + "epoch": 0.12066467189822917, + "grad_norm": 4.6302876472473145, + "learning_rate": 9.788566216212876e-05, + "loss": 2.2156, + "step": 1603 + }, + { + "epoch": 0.12073994617888933, + "grad_norm": 5.097748756408691, + "learning_rate": 9.78821531217015e-05, + "loss": 2.0748, + "step": 1604 + }, + { + "epoch": 0.12081522045954948, + "grad_norm": 7.0460333824157715, + "learning_rate": 9.787864123481763e-05, + "loss": 2.5018, + "step": 1605 + }, + { + "epoch": 0.12089049474020964, + "grad_norm": 7.261343955993652, + "learning_rate": 9.78751265016859e-05, + "loss": 2.2595, + "step": 1606 + }, + { + "epoch": 0.12096576902086979, + "grad_norm": 5.592549800872803, + "learning_rate": 9.787160892251527e-05, + "loss": 2.4875, + "step": 1607 + }, + { + "epoch": 0.12104104330152995, + "grad_norm": 6.423880577087402, + "learning_rate": 9.786808849751486e-05, + "loss": 2.3915, + "step": 1608 + }, + { + "epoch": 0.1211163175821901, + "grad_norm": 8.358610153198242, + "learning_rate": 9.786456522689393e-05, + "loss": 2.2105, + "step": 1609 + }, + { + "epoch": 0.12119159186285026, + "grad_norm": 5.989588737487793, + "learning_rate": 9.786103911086195e-05, + "loss": 2.0286, + "step": 1610 + }, + { + "epoch": 0.12126686614351041, + "grad_norm": 7.00147008895874, + "learning_rate": 9.785751014962852e-05, + "loss": 2.4252, + "step": 1611 + }, + { + "epoch": 0.12134214042417057, + "grad_norm": 4.66259241104126, + "learning_rate": 9.785397834340341e-05, + "loss": 2.148, + "step": 1612 + }, + { + "epoch": 0.12141741470483072, + "grad_norm": 4.299710273742676, + "learning_rate": 9.785044369239662e-05, + "loss": 2.4166, + "step": 1613 + }, + { + "epoch": 0.12149268898549088, + "grad_norm": 5.795466899871826, + "learning_rate": 9.784690619681824e-05, + "loss": 2.4665, + "step": 1614 + }, + { + "epoch": 0.12156796326615103, + "grad_norm": 4.6185150146484375, + "learning_rate": 9.784336585687857e-05, + "loss": 2.0793, + "step": 1615 + }, + { + "epoch": 0.1216432375468112, + "grad_norm": 4.8778815269470215, + "learning_rate": 9.783982267278808e-05, + "loss": 1.7552, + "step": 1616 + }, + { + "epoch": 0.12171851182747134, + "grad_norm": 6.437533855438232, + "learning_rate": 9.783627664475741e-05, + "loss": 2.0403, + "step": 1617 + }, + { + "epoch": 0.1217937861081315, + "grad_norm": 3.989184856414795, + "learning_rate": 9.783272777299734e-05, + "loss": 2.3687, + "step": 1618 + }, + { + "epoch": 0.12186906038879165, + "grad_norm": 8.013960838317871, + "learning_rate": 9.782917605771885e-05, + "loss": 2.0486, + "step": 1619 + }, + { + "epoch": 0.12194433466945181, + "grad_norm": 4.957374095916748, + "learning_rate": 9.782562149913307e-05, + "loss": 2.2626, + "step": 1620 + }, + { + "epoch": 0.12201960895011198, + "grad_norm": 4.5848822593688965, + "learning_rate": 9.782206409745135e-05, + "loss": 1.8456, + "step": 1621 + }, + { + "epoch": 0.12209488323077212, + "grad_norm": 4.23613166809082, + "learning_rate": 9.781850385288512e-05, + "loss": 2.4278, + "step": 1622 + }, + { + "epoch": 0.12217015751143229, + "grad_norm": 4.207071304321289, + "learning_rate": 9.781494076564604e-05, + "loss": 2.0765, + "step": 1623 + }, + { + "epoch": 0.12224543179209243, + "grad_norm": 4.560945510864258, + "learning_rate": 9.781137483594594e-05, + "loss": 2.2401, + "step": 1624 + }, + { + "epoch": 0.1223207060727526, + "grad_norm": 4.446397304534912, + "learning_rate": 9.780780606399678e-05, + "loss": 1.9131, + "step": 1625 + }, + { + "epoch": 0.12239598035341274, + "grad_norm": 6.349095344543457, + "learning_rate": 9.780423445001073e-05, + "loss": 2.4463, + "step": 1626 + }, + { + "epoch": 0.1224712546340729, + "grad_norm": 5.460348129272461, + "learning_rate": 9.780065999420011e-05, + "loss": 2.0926, + "step": 1627 + }, + { + "epoch": 0.12254652891473305, + "grad_norm": 6.743232727050781, + "learning_rate": 9.77970826967774e-05, + "loss": 2.1142, + "step": 1628 + }, + { + "epoch": 0.12262180319539322, + "grad_norm": 6.462011814117432, + "learning_rate": 9.779350255795527e-05, + "loss": 2.0189, + "step": 1629 + }, + { + "epoch": 0.12269707747605337, + "grad_norm": 7.219926834106445, + "learning_rate": 9.778991957794657e-05, + "loss": 2.2696, + "step": 1630 + }, + { + "epoch": 0.12277235175671353, + "grad_norm": 4.853704452514648, + "learning_rate": 9.778633375696425e-05, + "loss": 2.1161, + "step": 1631 + }, + { + "epoch": 0.12284762603737368, + "grad_norm": 4.4934797286987305, + "learning_rate": 9.778274509522152e-05, + "loss": 2.0316, + "step": 1632 + }, + { + "epoch": 0.12292290031803384, + "grad_norm": 5.992727756500244, + "learning_rate": 9.777915359293168e-05, + "loss": 2.0836, + "step": 1633 + }, + { + "epoch": 0.12299817459869399, + "grad_norm": 6.169079780578613, + "learning_rate": 9.777555925030826e-05, + "loss": 2.3188, + "step": 1634 + }, + { + "epoch": 0.12307344887935415, + "grad_norm": 4.415220737457275, + "learning_rate": 9.777196206756493e-05, + "loss": 2.3539, + "step": 1635 + }, + { + "epoch": 0.12314872316001431, + "grad_norm": 6.755588531494141, + "learning_rate": 9.776836204491553e-05, + "loss": 2.129, + "step": 1636 + }, + { + "epoch": 0.12322399744067446, + "grad_norm": 4.62690544128418, + "learning_rate": 9.776475918257406e-05, + "loss": 1.9415, + "step": 1637 + }, + { + "epoch": 0.12329927172133462, + "grad_norm": 7.091202259063721, + "learning_rate": 9.776115348075472e-05, + "loss": 2.2476, + "step": 1638 + }, + { + "epoch": 0.12337454600199477, + "grad_norm": 5.287752151489258, + "learning_rate": 9.775754493967184e-05, + "loss": 2.0417, + "step": 1639 + }, + { + "epoch": 0.12344982028265493, + "grad_norm": 3.6862854957580566, + "learning_rate": 9.775393355953995e-05, + "loss": 2.3695, + "step": 1640 + }, + { + "epoch": 0.12352509456331508, + "grad_norm": 4.684847354888916, + "learning_rate": 9.775031934057372e-05, + "loss": 1.9462, + "step": 1641 + }, + { + "epoch": 0.12360036884397524, + "grad_norm": 4.057544708251953, + "learning_rate": 9.774670228298803e-05, + "loss": 2.294, + "step": 1642 + }, + { + "epoch": 0.12367564312463539, + "grad_norm": 6.539106369018555, + "learning_rate": 9.774308238699787e-05, + "loss": 1.9845, + "step": 1643 + }, + { + "epoch": 0.12375091740529555, + "grad_norm": 5.352054119110107, + "learning_rate": 9.773945965281847e-05, + "loss": 2.0626, + "step": 1644 + }, + { + "epoch": 0.1238261916859557, + "grad_norm": 5.649844646453857, + "learning_rate": 9.773583408066518e-05, + "loss": 2.2001, + "step": 1645 + }, + { + "epoch": 0.12390146596661586, + "grad_norm": 5.47548770904541, + "learning_rate": 9.773220567075352e-05, + "loss": 2.0255, + "step": 1646 + }, + { + "epoch": 0.12397674024727601, + "grad_norm": 4.692183017730713, + "learning_rate": 9.772857442329917e-05, + "loss": 2.1694, + "step": 1647 + }, + { + "epoch": 0.12405201452793617, + "grad_norm": 5.586865425109863, + "learning_rate": 9.772494033851805e-05, + "loss": 1.8244, + "step": 1648 + }, + { + "epoch": 0.12412728880859632, + "grad_norm": 7.032715797424316, + "learning_rate": 9.772130341662614e-05, + "loss": 2.4628, + "step": 1649 + }, + { + "epoch": 0.12420256308925648, + "grad_norm": 4.874334335327148, + "learning_rate": 9.771766365783967e-05, + "loss": 2.4653, + "step": 1650 + }, + { + "epoch": 0.12427783736991663, + "grad_norm": 4.872899532318115, + "learning_rate": 9.771402106237504e-05, + "loss": 2.181, + "step": 1651 + }, + { + "epoch": 0.12435311165057679, + "grad_norm": 5.087370872497559, + "learning_rate": 9.771037563044874e-05, + "loss": 2.1197, + "step": 1652 + }, + { + "epoch": 0.12442838593123695, + "grad_norm": 7.149764537811279, + "learning_rate": 9.770672736227751e-05, + "loss": 2.3339, + "step": 1653 + }, + { + "epoch": 0.1245036602118971, + "grad_norm": 5.5925092697143555, + "learning_rate": 9.770307625807823e-05, + "loss": 2.1305, + "step": 1654 + }, + { + "epoch": 0.12457893449255726, + "grad_norm": 5.180748462677002, + "learning_rate": 9.769942231806794e-05, + "loss": 1.9459, + "step": 1655 + }, + { + "epoch": 0.12465420877321741, + "grad_norm": 8.0724458694458, + "learning_rate": 9.769576554246384e-05, + "loss": 2.1494, + "step": 1656 + }, + { + "epoch": 0.12472948305387757, + "grad_norm": 5.70414924621582, + "learning_rate": 9.769210593148337e-05, + "loss": 2.3089, + "step": 1657 + }, + { + "epoch": 0.12480475733453772, + "grad_norm": 6.4600605964660645, + "learning_rate": 9.768844348534403e-05, + "loss": 2.1489, + "step": 1658 + }, + { + "epoch": 0.12488003161519788, + "grad_norm": 4.639852523803711, + "learning_rate": 9.768477820426354e-05, + "loss": 2.2941, + "step": 1659 + }, + { + "epoch": 0.12495530589585803, + "grad_norm": 5.862159729003906, + "learning_rate": 9.768111008845982e-05, + "loss": 1.9134, + "step": 1660 + }, + { + "epoch": 0.1250305801765182, + "grad_norm": 4.392872333526611, + "learning_rate": 9.767743913815093e-05, + "loss": 2.0711, + "step": 1661 + }, + { + "epoch": 0.12510585445717834, + "grad_norm": 5.746190547943115, + "learning_rate": 9.767376535355508e-05, + "loss": 2.1864, + "step": 1662 + }, + { + "epoch": 0.1251811287378385, + "grad_norm": 3.662151575088501, + "learning_rate": 9.767008873489067e-05, + "loss": 1.904, + "step": 1663 + }, + { + "epoch": 0.12525640301849866, + "grad_norm": 5.1756911277771, + "learning_rate": 9.766640928237624e-05, + "loss": 2.312, + "step": 1664 + }, + { + "epoch": 0.1253316772991588, + "grad_norm": 4.287230968475342, + "learning_rate": 9.766272699623058e-05, + "loss": 2.0945, + "step": 1665 + }, + { + "epoch": 0.12540695157981896, + "grad_norm": 4.961116790771484, + "learning_rate": 9.765904187667255e-05, + "loss": 2.1049, + "step": 1666 + }, + { + "epoch": 0.1254822258604791, + "grad_norm": 6.955589771270752, + "learning_rate": 9.765535392392123e-05, + "loss": 2.6384, + "step": 1667 + }, + { + "epoch": 0.12555750014113928, + "grad_norm": 5.326606273651123, + "learning_rate": 9.765166313819588e-05, + "loss": 2.1886, + "step": 1668 + }, + { + "epoch": 0.12563277442179943, + "grad_norm": 4.608530044555664, + "learning_rate": 9.764796951971585e-05, + "loss": 2.1499, + "step": 1669 + }, + { + "epoch": 0.12570804870245958, + "grad_norm": 5.765841484069824, + "learning_rate": 9.764427306870076e-05, + "loss": 2.3587, + "step": 1670 + }, + { + "epoch": 0.12578332298311976, + "grad_norm": 5.100683212280273, + "learning_rate": 9.764057378537035e-05, + "loss": 2.3403, + "step": 1671 + }, + { + "epoch": 0.1258585972637799, + "grad_norm": 5.592106342315674, + "learning_rate": 9.76368716699445e-05, + "loss": 2.2112, + "step": 1672 + }, + { + "epoch": 0.12593387154444005, + "grad_norm": 5.025146484375, + "learning_rate": 9.763316672264336e-05, + "loss": 2.1422, + "step": 1673 + }, + { + "epoch": 0.1260091458251002, + "grad_norm": 4.339875221252441, + "learning_rate": 9.762945894368709e-05, + "loss": 2.6697, + "step": 1674 + }, + { + "epoch": 0.12608442010576038, + "grad_norm": 3.809053421020508, + "learning_rate": 9.762574833329617e-05, + "loss": 1.9938, + "step": 1675 + }, + { + "epoch": 0.12615969438642052, + "grad_norm": 7.578778266906738, + "learning_rate": 9.762203489169116e-05, + "loss": 2.1337, + "step": 1676 + }, + { + "epoch": 0.12623496866708067, + "grad_norm": 4.791518211364746, + "learning_rate": 9.761831861909283e-05, + "loss": 2.1314, + "step": 1677 + }, + { + "epoch": 0.12631024294774082, + "grad_norm": 4.78886079788208, + "learning_rate": 9.761459951572208e-05, + "loss": 2.1539, + "step": 1678 + }, + { + "epoch": 0.126385517228401, + "grad_norm": 4.933764934539795, + "learning_rate": 9.76108775818e-05, + "loss": 2.3209, + "step": 1679 + }, + { + "epoch": 0.12646079150906114, + "grad_norm": 4.993045330047607, + "learning_rate": 9.760715281754787e-05, + "loss": 2.1642, + "step": 1680 + }, + { + "epoch": 0.1265360657897213, + "grad_norm": 4.818905830383301, + "learning_rate": 9.76034252231871e-05, + "loss": 2.6079, + "step": 1681 + }, + { + "epoch": 0.12661134007038144, + "grad_norm": 6.519049167633057, + "learning_rate": 9.759969479893931e-05, + "loss": 2.4723, + "step": 1682 + }, + { + "epoch": 0.12668661435104162, + "grad_norm": 7.517443656921387, + "learning_rate": 9.759596154502622e-05, + "loss": 2.9614, + "step": 1683 + }, + { + "epoch": 0.12676188863170176, + "grad_norm": 4.484193325042725, + "learning_rate": 9.75922254616698e-05, + "loss": 2.2104, + "step": 1684 + }, + { + "epoch": 0.1268371629123619, + "grad_norm": 5.735039234161377, + "learning_rate": 9.758848654909213e-05, + "loss": 2.0253, + "step": 1685 + }, + { + "epoch": 0.1269124371930221, + "grad_norm": 5.4627838134765625, + "learning_rate": 9.758474480751547e-05, + "loss": 2.4451, + "step": 1686 + }, + { + "epoch": 0.12698771147368224, + "grad_norm": 4.789186954498291, + "learning_rate": 9.758100023716227e-05, + "loss": 2.6446, + "step": 1687 + }, + { + "epoch": 0.12706298575434238, + "grad_norm": 6.586690425872803, + "learning_rate": 9.757725283825514e-05, + "loss": 2.4926, + "step": 1688 + }, + { + "epoch": 0.12713826003500253, + "grad_norm": 6.562723159790039, + "learning_rate": 9.757350261101685e-05, + "loss": 1.8865, + "step": 1689 + }, + { + "epoch": 0.1272135343156627, + "grad_norm": 5.989397048950195, + "learning_rate": 9.756974955567033e-05, + "loss": 2.3004, + "step": 1690 + }, + { + "epoch": 0.12728880859632286, + "grad_norm": 6.762864112854004, + "learning_rate": 9.756599367243869e-05, + "loss": 2.0279, + "step": 1691 + }, + { + "epoch": 0.127364082876983, + "grad_norm": 4.570464611053467, + "learning_rate": 9.75622349615452e-05, + "loss": 2.0751, + "step": 1692 + }, + { + "epoch": 0.12743935715764315, + "grad_norm": 7.553378105163574, + "learning_rate": 9.75584734232133e-05, + "loss": 2.3705, + "step": 1693 + }, + { + "epoch": 0.12751463143830333, + "grad_norm": 5.850673198699951, + "learning_rate": 9.755470905766663e-05, + "loss": 2.1451, + "step": 1694 + }, + { + "epoch": 0.12758990571896348, + "grad_norm": 3.9360392093658447, + "learning_rate": 9.755094186512897e-05, + "loss": 2.1379, + "step": 1695 + }, + { + "epoch": 0.12766517999962362, + "grad_norm": 4.635295391082764, + "learning_rate": 9.754717184582424e-05, + "loss": 2.5343, + "step": 1696 + }, + { + "epoch": 0.12774045428028377, + "grad_norm": 6.2629852294921875, + "learning_rate": 9.754339899997657e-05, + "loss": 2.0976, + "step": 1697 + }, + { + "epoch": 0.12781572856094395, + "grad_norm": 4.538017272949219, + "learning_rate": 9.753962332781025e-05, + "loss": 2.1238, + "step": 1698 + }, + { + "epoch": 0.1278910028416041, + "grad_norm": 5.178086757659912, + "learning_rate": 9.753584482954974e-05, + "loss": 2.112, + "step": 1699 + }, + { + "epoch": 0.12796627712226424, + "grad_norm": 6.888523578643799, + "learning_rate": 9.753206350541963e-05, + "loss": 2.4774, + "step": 1700 + }, + { + "epoch": 0.12804155140292442, + "grad_norm": 5.4148430824279785, + "learning_rate": 9.752827935564474e-05, + "loss": 2.0926, + "step": 1701 + }, + { + "epoch": 0.12811682568358457, + "grad_norm": 4.234797477722168, + "learning_rate": 9.752449238045002e-05, + "loss": 2.2639, + "step": 1702 + }, + { + "epoch": 0.12819209996424472, + "grad_norm": 5.121828079223633, + "learning_rate": 9.752070258006059e-05, + "loss": 2.0641, + "step": 1703 + }, + { + "epoch": 0.12826737424490486, + "grad_norm": 4.237938404083252, + "learning_rate": 9.751690995470175e-05, + "loss": 1.9471, + "step": 1704 + }, + { + "epoch": 0.12834264852556504, + "grad_norm": 5.727453231811523, + "learning_rate": 9.751311450459894e-05, + "loss": 2.1931, + "step": 1705 + }, + { + "epoch": 0.1284179228062252, + "grad_norm": 5.09385347366333, + "learning_rate": 9.750931622997781e-05, + "loss": 2.1248, + "step": 1706 + }, + { + "epoch": 0.12849319708688534, + "grad_norm": 4.489762306213379, + "learning_rate": 9.750551513106416e-05, + "loss": 2.2304, + "step": 1707 + }, + { + "epoch": 0.12856847136754548, + "grad_norm": 5.428772926330566, + "learning_rate": 9.750171120808394e-05, + "loss": 2.5218, + "step": 1708 + }, + { + "epoch": 0.12864374564820566, + "grad_norm": 6.32047700881958, + "learning_rate": 9.749790446126327e-05, + "loss": 2.313, + "step": 1709 + }, + { + "epoch": 0.1287190199288658, + "grad_norm": 4.1708879470825195, + "learning_rate": 9.749409489082848e-05, + "loss": 2.337, + "step": 1710 + }, + { + "epoch": 0.12879429420952596, + "grad_norm": 6.77269983291626, + "learning_rate": 9.749028249700602e-05, + "loss": 2.9133, + "step": 1711 + }, + { + "epoch": 0.1288695684901861, + "grad_norm": 4.5942702293396, + "learning_rate": 9.748646728002253e-05, + "loss": 2.5118, + "step": 1712 + }, + { + "epoch": 0.12894484277084628, + "grad_norm": 4.483979225158691, + "learning_rate": 9.748264924010482e-05, + "loss": 2.17, + "step": 1713 + }, + { + "epoch": 0.12902011705150643, + "grad_norm": 6.06091833114624, + "learning_rate": 9.747882837747984e-05, + "loss": 2.4806, + "step": 1714 + }, + { + "epoch": 0.12909539133216658, + "grad_norm": 4.3725433349609375, + "learning_rate": 9.747500469237476e-05, + "loss": 2.8324, + "step": 1715 + }, + { + "epoch": 0.12917066561282672, + "grad_norm": 4.649985313415527, + "learning_rate": 9.747117818501687e-05, + "loss": 2.0885, + "step": 1716 + }, + { + "epoch": 0.1292459398934869, + "grad_norm": 5.056002140045166, + "learning_rate": 9.746734885563365e-05, + "loss": 2.2877, + "step": 1717 + }, + { + "epoch": 0.12932121417414705, + "grad_norm": 4.823305130004883, + "learning_rate": 9.746351670445273e-05, + "loss": 2.351, + "step": 1718 + }, + { + "epoch": 0.1293964884548072, + "grad_norm": 5.800633430480957, + "learning_rate": 9.745968173170194e-05, + "loss": 2.358, + "step": 1719 + }, + { + "epoch": 0.12947176273546737, + "grad_norm": 5.244085788726807, + "learning_rate": 9.745584393760923e-05, + "loss": 2.243, + "step": 1720 + }, + { + "epoch": 0.12954703701612752, + "grad_norm": 7.268054485321045, + "learning_rate": 9.745200332240278e-05, + "loss": 2.1576, + "step": 1721 + }, + { + "epoch": 0.12962231129678767, + "grad_norm": 5.814276218414307, + "learning_rate": 9.744815988631089e-05, + "loss": 2.1023, + "step": 1722 + }, + { + "epoch": 0.12969758557744782, + "grad_norm": 4.464752674102783, + "learning_rate": 9.744431362956203e-05, + "loss": 2.0206, + "step": 1723 + }, + { + "epoch": 0.129772859858108, + "grad_norm": 4.056717872619629, + "learning_rate": 9.744046455238487e-05, + "loss": 1.9266, + "step": 1724 + }, + { + "epoch": 0.12984813413876814, + "grad_norm": 5.691699981689453, + "learning_rate": 9.743661265500821e-05, + "loss": 2.5917, + "step": 1725 + }, + { + "epoch": 0.1299234084194283, + "grad_norm": 6.967303276062012, + "learning_rate": 9.743275793766102e-05, + "loss": 2.2194, + "step": 1726 + }, + { + "epoch": 0.12999868270008844, + "grad_norm": 7.970575332641602, + "learning_rate": 9.742890040057249e-05, + "loss": 3.0811, + "step": 1727 + }, + { + "epoch": 0.1300739569807486, + "grad_norm": 4.193321228027344, + "learning_rate": 9.742504004397192e-05, + "loss": 2.3907, + "step": 1728 + }, + { + "epoch": 0.13014923126140876, + "grad_norm": 5.682349681854248, + "learning_rate": 9.742117686808878e-05, + "loss": 2.174, + "step": 1729 + }, + { + "epoch": 0.1302245055420689, + "grad_norm": 5.465896129608154, + "learning_rate": 9.741731087315276e-05, + "loss": 2.28, + "step": 1730 + }, + { + "epoch": 0.13029977982272906, + "grad_norm": 4.960921287536621, + "learning_rate": 9.741344205939366e-05, + "loss": 2.2761, + "step": 1731 + }, + { + "epoch": 0.13037505410338923, + "grad_norm": 7.062646389007568, + "learning_rate": 9.740957042704146e-05, + "loss": 2.2874, + "step": 1732 + }, + { + "epoch": 0.13045032838404938, + "grad_norm": 5.2693257331848145, + "learning_rate": 9.740569597632635e-05, + "loss": 2.1913, + "step": 1733 + }, + { + "epoch": 0.13052560266470953, + "grad_norm": 4.980762004852295, + "learning_rate": 9.740181870747863e-05, + "loss": 2.2419, + "step": 1734 + }, + { + "epoch": 0.1306008769453697, + "grad_norm": 6.031246662139893, + "learning_rate": 9.73979386207288e-05, + "loss": 2.2241, + "step": 1735 + }, + { + "epoch": 0.13067615122602985, + "grad_norm": 7.926592826843262, + "learning_rate": 9.739405571630751e-05, + "loss": 2.2495, + "step": 1736 + }, + { + "epoch": 0.13075142550669, + "grad_norm": 8.379772186279297, + "learning_rate": 9.739016999444562e-05, + "loss": 2.3224, + "step": 1737 + }, + { + "epoch": 0.13082669978735015, + "grad_norm": 4.8269243240356445, + "learning_rate": 9.738628145537407e-05, + "loss": 2.0694, + "step": 1738 + }, + { + "epoch": 0.13090197406801032, + "grad_norm": 4.838050842285156, + "learning_rate": 9.738239009932408e-05, + "loss": 2.4132, + "step": 1739 + }, + { + "epoch": 0.13097724834867047, + "grad_norm": 7.829267978668213, + "learning_rate": 9.737849592652695e-05, + "loss": 2.7272, + "step": 1740 + }, + { + "epoch": 0.13105252262933062, + "grad_norm": 4.662322044372559, + "learning_rate": 9.737459893721416e-05, + "loss": 2.0681, + "step": 1741 + }, + { + "epoch": 0.13112779690999077, + "grad_norm": 4.9359564781188965, + "learning_rate": 9.737069913161741e-05, + "loss": 2.1911, + "step": 1742 + }, + { + "epoch": 0.13120307119065094, + "grad_norm": 5.634368896484375, + "learning_rate": 9.736679650996851e-05, + "loss": 2.5004, + "step": 1743 + }, + { + "epoch": 0.1312783454713111, + "grad_norm": 6.821021556854248, + "learning_rate": 9.73628910724995e-05, + "loss": 2.5504, + "step": 1744 + }, + { + "epoch": 0.13135361975197124, + "grad_norm": 4.493718147277832, + "learning_rate": 9.735898281944249e-05, + "loss": 2.1869, + "step": 1745 + }, + { + "epoch": 0.1314288940326314, + "grad_norm": 5.231554985046387, + "learning_rate": 9.735507175102983e-05, + "loss": 2.0704, + "step": 1746 + }, + { + "epoch": 0.13150416831329156, + "grad_norm": 4.544377326965332, + "learning_rate": 9.735115786749404e-05, + "loss": 2.5327, + "step": 1747 + }, + { + "epoch": 0.1315794425939517, + "grad_norm": 5.713136196136475, + "learning_rate": 9.734724116906779e-05, + "loss": 1.8814, + "step": 1748 + }, + { + "epoch": 0.13165471687461186, + "grad_norm": 7.447630405426025, + "learning_rate": 9.73433216559839e-05, + "loss": 2.411, + "step": 1749 + }, + { + "epoch": 0.13172999115527204, + "grad_norm": 4.947197914123535, + "learning_rate": 9.733939932847538e-05, + "loss": 2.3038, + "step": 1750 + }, + { + "epoch": 0.13180526543593218, + "grad_norm": 5.95836067199707, + "learning_rate": 9.73354741867754e-05, + "loss": 2.5787, + "step": 1751 + }, + { + "epoch": 0.13188053971659233, + "grad_norm": 5.138620853424072, + "learning_rate": 9.733154623111731e-05, + "loss": 2.1285, + "step": 1752 + }, + { + "epoch": 0.13195581399725248, + "grad_norm": 4.807033538818359, + "learning_rate": 9.73276154617346e-05, + "loss": 1.9861, + "step": 1753 + }, + { + "epoch": 0.13203108827791266, + "grad_norm": 5.524970531463623, + "learning_rate": 9.732368187886095e-05, + "loss": 2.215, + "step": 1754 + }, + { + "epoch": 0.1321063625585728, + "grad_norm": 6.054365634918213, + "learning_rate": 9.73197454827302e-05, + "loss": 2.3715, + "step": 1755 + }, + { + "epoch": 0.13218163683923295, + "grad_norm": 4.977542400360107, + "learning_rate": 9.731580627357635e-05, + "loss": 2.4739, + "step": 1756 + }, + { + "epoch": 0.1322569111198931, + "grad_norm": 5.680325984954834, + "learning_rate": 9.731186425163359e-05, + "loss": 1.9051, + "step": 1757 + }, + { + "epoch": 0.13233218540055328, + "grad_norm": 6.17794942855835, + "learning_rate": 9.730791941713624e-05, + "loss": 2.3579, + "step": 1758 + }, + { + "epoch": 0.13240745968121342, + "grad_norm": 5.49123477935791, + "learning_rate": 9.730397177031884e-05, + "loss": 2.1278, + "step": 1759 + }, + { + "epoch": 0.13248273396187357, + "grad_norm": 5.485405445098877, + "learning_rate": 9.730002131141603e-05, + "loss": 2.0819, + "step": 1760 + }, + { + "epoch": 0.13255800824253372, + "grad_norm": 9.210233688354492, + "learning_rate": 9.729606804066269e-05, + "loss": 2.5269, + "step": 1761 + }, + { + "epoch": 0.1326332825231939, + "grad_norm": 5.67719841003418, + "learning_rate": 9.72921119582938e-05, + "loss": 2.5882, + "step": 1762 + }, + { + "epoch": 0.13270855680385404, + "grad_norm": 8.652382850646973, + "learning_rate": 9.728815306454455e-05, + "loss": 2.1637, + "step": 1763 + }, + { + "epoch": 0.1327838310845142, + "grad_norm": 5.790106296539307, + "learning_rate": 9.72841913596503e-05, + "loss": 2.3963, + "step": 1764 + }, + { + "epoch": 0.13285910536517434, + "grad_norm": 5.1820831298828125, + "learning_rate": 9.728022684384654e-05, + "loss": 2.2085, + "step": 1765 + }, + { + "epoch": 0.13293437964583452, + "grad_norm": 4.50135612487793, + "learning_rate": 9.727625951736893e-05, + "loss": 2.2223, + "step": 1766 + }, + { + "epoch": 0.13300965392649466, + "grad_norm": 5.442310810089111, + "learning_rate": 9.727228938045338e-05, + "loss": 2.5634, + "step": 1767 + }, + { + "epoch": 0.1330849282071548, + "grad_norm": 5.8599138259887695, + "learning_rate": 9.726831643333585e-05, + "loss": 2.1358, + "step": 1768 + }, + { + "epoch": 0.133160202487815, + "grad_norm": 4.942758083343506, + "learning_rate": 9.726434067625254e-05, + "loss": 2.7683, + "step": 1769 + }, + { + "epoch": 0.13323547676847514, + "grad_norm": 4.242802619934082, + "learning_rate": 9.726036210943978e-05, + "loss": 2.1248, + "step": 1770 + }, + { + "epoch": 0.13331075104913528, + "grad_norm": 5.054646015167236, + "learning_rate": 9.725638073313413e-05, + "loss": 2.5606, + "step": 1771 + }, + { + "epoch": 0.13338602532979543, + "grad_norm": 4.6928486824035645, + "learning_rate": 9.725239654757221e-05, + "loss": 2.5103, + "step": 1772 + }, + { + "epoch": 0.1334612996104556, + "grad_norm": 5.823727607727051, + "learning_rate": 9.72484095529909e-05, + "loss": 1.9912, + "step": 1773 + }, + { + "epoch": 0.13353657389111576, + "grad_norm": 8.489828109741211, + "learning_rate": 9.724441974962722e-05, + "loss": 2.4317, + "step": 1774 + }, + { + "epoch": 0.1336118481717759, + "grad_norm": 8.048446655273438, + "learning_rate": 9.724042713771835e-05, + "loss": 1.9961, + "step": 1775 + }, + { + "epoch": 0.13368712245243605, + "grad_norm": 4.8349409103393555, + "learning_rate": 9.723643171750163e-05, + "loss": 2.1962, + "step": 1776 + }, + { + "epoch": 0.13376239673309623, + "grad_norm": 4.24368143081665, + "learning_rate": 9.723243348921458e-05, + "loss": 2.1277, + "step": 1777 + }, + { + "epoch": 0.13383767101375638, + "grad_norm": 5.371838569641113, + "learning_rate": 9.722843245309487e-05, + "loss": 2.3456, + "step": 1778 + }, + { + "epoch": 0.13391294529441652, + "grad_norm": 4.18806266784668, + "learning_rate": 9.722442860938039e-05, + "loss": 2.2133, + "step": 1779 + }, + { + "epoch": 0.13398821957507667, + "grad_norm": 5.439259052276611, + "learning_rate": 9.722042195830912e-05, + "loss": 2.2445, + "step": 1780 + }, + { + "epoch": 0.13406349385573685, + "grad_norm": 4.135159492492676, + "learning_rate": 9.721641250011925e-05, + "loss": 2.0865, + "step": 1781 + }, + { + "epoch": 0.134138768136397, + "grad_norm": 3.7987565994262695, + "learning_rate": 9.721240023504914e-05, + "loss": 2.1844, + "step": 1782 + }, + { + "epoch": 0.13421404241705714, + "grad_norm": 4.4955735206604, + "learning_rate": 9.72083851633373e-05, + "loss": 2.0196, + "step": 1783 + }, + { + "epoch": 0.13428931669771732, + "grad_norm": 4.269046306610107, + "learning_rate": 9.720436728522242e-05, + "loss": 2.2484, + "step": 1784 + }, + { + "epoch": 0.13436459097837747, + "grad_norm": 5.672484874725342, + "learning_rate": 9.720034660094336e-05, + "loss": 2.1519, + "step": 1785 + }, + { + "epoch": 0.13443986525903762, + "grad_norm": 4.3847880363464355, + "learning_rate": 9.719632311073911e-05, + "loss": 2.0127, + "step": 1786 + }, + { + "epoch": 0.13451513953969776, + "grad_norm": 6.430187225341797, + "learning_rate": 9.719229681484889e-05, + "loss": 2.43, + "step": 1787 + }, + { + "epoch": 0.13459041382035794, + "grad_norm": 5.318563938140869, + "learning_rate": 9.718826771351202e-05, + "loss": 2.236, + "step": 1788 + }, + { + "epoch": 0.1346656881010181, + "grad_norm": 6.8275580406188965, + "learning_rate": 9.718423580696805e-05, + "loss": 2.3024, + "step": 1789 + }, + { + "epoch": 0.13474096238167824, + "grad_norm": 4.084136486053467, + "learning_rate": 9.718020109545663e-05, + "loss": 2.1722, + "step": 1790 + }, + { + "epoch": 0.13481623666233838, + "grad_norm": 4.49135160446167, + "learning_rate": 9.717616357921764e-05, + "loss": 2.3923, + "step": 1791 + }, + { + "epoch": 0.13489151094299856, + "grad_norm": 5.338467121124268, + "learning_rate": 9.717212325849107e-05, + "loss": 2.0977, + "step": 1792 + }, + { + "epoch": 0.1349667852236587, + "grad_norm": 5.188724517822266, + "learning_rate": 9.716808013351715e-05, + "loss": 2.0666, + "step": 1793 + }, + { + "epoch": 0.13504205950431886, + "grad_norm": 6.0009074211120605, + "learning_rate": 9.71640342045362e-05, + "loss": 2.0435, + "step": 1794 + }, + { + "epoch": 0.135117333784979, + "grad_norm": 3.454115152359009, + "learning_rate": 9.715998547178874e-05, + "loss": 2.2185, + "step": 1795 + }, + { + "epoch": 0.13519260806563918, + "grad_norm": 4.948105812072754, + "learning_rate": 9.715593393551546e-05, + "loss": 2.1565, + "step": 1796 + }, + { + "epoch": 0.13526788234629933, + "grad_norm": 4.150262355804443, + "learning_rate": 9.715187959595722e-05, + "loss": 2.0609, + "step": 1797 + }, + { + "epoch": 0.13534315662695948, + "grad_norm": 4.311717987060547, + "learning_rate": 9.714782245335502e-05, + "loss": 1.9061, + "step": 1798 + }, + { + "epoch": 0.13541843090761965, + "grad_norm": 4.890169143676758, + "learning_rate": 9.714376250795007e-05, + "loss": 2.4948, + "step": 1799 + }, + { + "epoch": 0.1354937051882798, + "grad_norm": 4.548721790313721, + "learning_rate": 9.713969975998369e-05, + "loss": 1.8287, + "step": 1800 + }, + { + "epoch": 0.13556897946893995, + "grad_norm": 5.311888694763184, + "learning_rate": 9.713563420969745e-05, + "loss": 2.6011, + "step": 1801 + }, + { + "epoch": 0.1356442537496001, + "grad_norm": 3.4806299209594727, + "learning_rate": 9.713156585733298e-05, + "loss": 2.3599, + "step": 1802 + }, + { + "epoch": 0.13571952803026027, + "grad_norm": 4.0148138999938965, + "learning_rate": 9.712749470313216e-05, + "loss": 2.0326, + "step": 1803 + }, + { + "epoch": 0.13579480231092042, + "grad_norm": 7.7831902503967285, + "learning_rate": 9.712342074733701e-05, + "loss": 2.1464, + "step": 1804 + }, + { + "epoch": 0.13587007659158057, + "grad_norm": 5.510692119598389, + "learning_rate": 9.711934399018969e-05, + "loss": 1.9611, + "step": 1805 + }, + { + "epoch": 0.13594535087224072, + "grad_norm": 4.311880588531494, + "learning_rate": 9.711526443193259e-05, + "loss": 2.1199, + "step": 1806 + }, + { + "epoch": 0.1360206251529009, + "grad_norm": 5.091075897216797, + "learning_rate": 9.71111820728082e-05, + "loss": 2.1322, + "step": 1807 + }, + { + "epoch": 0.13609589943356104, + "grad_norm": 5.07310152053833, + "learning_rate": 9.710709691305922e-05, + "loss": 2.1444, + "step": 1808 + }, + { + "epoch": 0.1361711737142212, + "grad_norm": 4.195400714874268, + "learning_rate": 9.71030089529285e-05, + "loss": 2.3226, + "step": 1809 + }, + { + "epoch": 0.13624644799488134, + "grad_norm": 4.664185523986816, + "learning_rate": 9.709891819265904e-05, + "loss": 2.19, + "step": 1810 + }, + { + "epoch": 0.1363217222755415, + "grad_norm": 4.323750019073486, + "learning_rate": 9.709482463249403e-05, + "loss": 2.0303, + "step": 1811 + }, + { + "epoch": 0.13639699655620166, + "grad_norm": 8.341972351074219, + "learning_rate": 9.709072827267683e-05, + "loss": 2.2671, + "step": 1812 + }, + { + "epoch": 0.1364722708368618, + "grad_norm": 3.623511791229248, + "learning_rate": 9.708662911345096e-05, + "loss": 2.5169, + "step": 1813 + }, + { + "epoch": 0.13654754511752198, + "grad_norm": 4.627481937408447, + "learning_rate": 9.708252715506009e-05, + "loss": 1.9635, + "step": 1814 + }, + { + "epoch": 0.13662281939818213, + "grad_norm": 5.3794379234313965, + "learning_rate": 9.707842239774807e-05, + "loss": 2.5748, + "step": 1815 + }, + { + "epoch": 0.13669809367884228, + "grad_norm": 6.208539009094238, + "learning_rate": 9.707431484175893e-05, + "loss": 1.8715, + "step": 1816 + }, + { + "epoch": 0.13677336795950243, + "grad_norm": 4.358168601989746, + "learning_rate": 9.707020448733684e-05, + "loss": 1.9982, + "step": 1817 + }, + { + "epoch": 0.1368486422401626, + "grad_norm": 3.709925889968872, + "learning_rate": 9.706609133472617e-05, + "loss": 1.9796, + "step": 1818 + }, + { + "epoch": 0.13692391652082275, + "grad_norm": 5.560945510864258, + "learning_rate": 9.706197538417139e-05, + "loss": 2.5432, + "step": 1819 + }, + { + "epoch": 0.1369991908014829, + "grad_norm": 4.2365851402282715, + "learning_rate": 9.705785663591722e-05, + "loss": 1.9914, + "step": 1820 + }, + { + "epoch": 0.13707446508214305, + "grad_norm": 5.010767936706543, + "learning_rate": 9.705373509020849e-05, + "loss": 2.2454, + "step": 1821 + }, + { + "epoch": 0.13714973936280322, + "grad_norm": 3.9499197006225586, + "learning_rate": 9.704961074729024e-05, + "loss": 2.2639, + "step": 1822 + }, + { + "epoch": 0.13722501364346337, + "grad_norm": 4.281479835510254, + "learning_rate": 9.704548360740762e-05, + "loss": 2.2585, + "step": 1823 + }, + { + "epoch": 0.13730028792412352, + "grad_norm": 5.20364236831665, + "learning_rate": 9.7041353670806e-05, + "loss": 2.7264, + "step": 1824 + }, + { + "epoch": 0.13737556220478367, + "grad_norm": 5.149622440338135, + "learning_rate": 9.703722093773086e-05, + "loss": 1.9872, + "step": 1825 + }, + { + "epoch": 0.13745083648544384, + "grad_norm": 5.009570121765137, + "learning_rate": 9.703308540842792e-05, + "loss": 2.4357, + "step": 1826 + }, + { + "epoch": 0.137526110766104, + "grad_norm": 7.407719612121582, + "learning_rate": 9.7028947083143e-05, + "loss": 2.0713, + "step": 1827 + }, + { + "epoch": 0.13760138504676414, + "grad_norm": 3.7702975273132324, + "learning_rate": 9.702480596212211e-05, + "loss": 1.9476, + "step": 1828 + }, + { + "epoch": 0.1376766593274243, + "grad_norm": 5.987146377563477, + "learning_rate": 9.702066204561144e-05, + "loss": 2.3998, + "step": 1829 + }, + { + "epoch": 0.13775193360808446, + "grad_norm": 6.7265400886535645, + "learning_rate": 9.701651533385731e-05, + "loss": 2.3342, + "step": 1830 + }, + { + "epoch": 0.1378272078887446, + "grad_norm": 6.512685775756836, + "learning_rate": 9.701236582710629e-05, + "loss": 2.1574, + "step": 1831 + }, + { + "epoch": 0.13790248216940476, + "grad_norm": 8.839797019958496, + "learning_rate": 9.700821352560498e-05, + "loss": 2.3089, + "step": 1832 + }, + { + "epoch": 0.13797775645006494, + "grad_norm": 6.385513782501221, + "learning_rate": 9.700405842960028e-05, + "loss": 2.4704, + "step": 1833 + }, + { + "epoch": 0.13805303073072508, + "grad_norm": 8.119880676269531, + "learning_rate": 9.699990053933916e-05, + "loss": 2.4203, + "step": 1834 + }, + { + "epoch": 0.13812830501138523, + "grad_norm": 4.073723316192627, + "learning_rate": 9.699573985506881e-05, + "loss": 2.3897, + "step": 1835 + }, + { + "epoch": 0.13820357929204538, + "grad_norm": 6.010770797729492, + "learning_rate": 9.699157637703657e-05, + "loss": 2.2526, + "step": 1836 + }, + { + "epoch": 0.13827885357270556, + "grad_norm": 5.169813632965088, + "learning_rate": 9.698741010548997e-05, + "loss": 2.4182, + "step": 1837 + }, + { + "epoch": 0.1383541278533657, + "grad_norm": 4.972817897796631, + "learning_rate": 9.698324104067664e-05, + "loss": 2.2069, + "step": 1838 + }, + { + "epoch": 0.13842940213402585, + "grad_norm": 7.212032794952393, + "learning_rate": 9.697906918284445e-05, + "loss": 2.1537, + "step": 1839 + }, + { + "epoch": 0.138504676414686, + "grad_norm": 5.314699649810791, + "learning_rate": 9.69748945322414e-05, + "loss": 2.2994, + "step": 1840 + }, + { + "epoch": 0.13857995069534618, + "grad_norm": 5.680018424987793, + "learning_rate": 9.697071708911564e-05, + "loss": 2.2162, + "step": 1841 + }, + { + "epoch": 0.13865522497600632, + "grad_norm": 6.512355327606201, + "learning_rate": 9.696653685371553e-05, + "loss": 2.2615, + "step": 1842 + }, + { + "epoch": 0.13873049925666647, + "grad_norm": 4.901081085205078, + "learning_rate": 9.696235382628958e-05, + "loss": 2.1403, + "step": 1843 + }, + { + "epoch": 0.13880577353732662, + "grad_norm": 4.775032997131348, + "learning_rate": 9.695816800708642e-05, + "loss": 1.9616, + "step": 1844 + }, + { + "epoch": 0.1388810478179868, + "grad_norm": 4.621478080749512, + "learning_rate": 9.695397939635493e-05, + "loss": 2.4736, + "step": 1845 + }, + { + "epoch": 0.13895632209864694, + "grad_norm": 4.7510085105896, + "learning_rate": 9.694978799434409e-05, + "loss": 2.1455, + "step": 1846 + }, + { + "epoch": 0.1390315963793071, + "grad_norm": 3.76348876953125, + "learning_rate": 9.694559380130306e-05, + "loss": 1.9571, + "step": 1847 + }, + { + "epoch": 0.13910687065996727, + "grad_norm": 5.012405872344971, + "learning_rate": 9.694139681748118e-05, + "loss": 2.2396, + "step": 1848 + }, + { + "epoch": 0.13918214494062742, + "grad_norm": 4.353190898895264, + "learning_rate": 9.693719704312795e-05, + "loss": 2.2327, + "step": 1849 + }, + { + "epoch": 0.13925741922128757, + "grad_norm": 5.981847763061523, + "learning_rate": 9.693299447849303e-05, + "loss": 2.169, + "step": 1850 + }, + { + "epoch": 0.1393326935019477, + "grad_norm": 4.708183765411377, + "learning_rate": 9.692878912382625e-05, + "loss": 2.2332, + "step": 1851 + }, + { + "epoch": 0.1394079677826079, + "grad_norm": 4.729650497436523, + "learning_rate": 9.692458097937763e-05, + "loss": 1.9346, + "step": 1852 + }, + { + "epoch": 0.13948324206326804, + "grad_norm": 4.960033416748047, + "learning_rate": 9.692037004539732e-05, + "loss": 2.4869, + "step": 1853 + }, + { + "epoch": 0.13955851634392819, + "grad_norm": 3.768878698348999, + "learning_rate": 9.691615632213561e-05, + "loss": 1.9531, + "step": 1854 + }, + { + "epoch": 0.13963379062458833, + "grad_norm": 5.850656509399414, + "learning_rate": 9.691193980984303e-05, + "loss": 1.8112, + "step": 1855 + }, + { + "epoch": 0.1397090649052485, + "grad_norm": 6.420674800872803, + "learning_rate": 9.690772050877024e-05, + "loss": 2.2319, + "step": 1856 + }, + { + "epoch": 0.13978433918590866, + "grad_norm": 6.607874870300293, + "learning_rate": 9.690349841916806e-05, + "loss": 2.1272, + "step": 1857 + }, + { + "epoch": 0.1398596134665688, + "grad_norm": 4.868158340454102, + "learning_rate": 9.689927354128749e-05, + "loss": 2.2246, + "step": 1858 + }, + { + "epoch": 0.13993488774722895, + "grad_norm": 5.244241714477539, + "learning_rate": 9.689504587537967e-05, + "loss": 2.3166, + "step": 1859 + }, + { + "epoch": 0.14001016202788913, + "grad_norm": 6.573191165924072, + "learning_rate": 9.689081542169591e-05, + "loss": 1.9857, + "step": 1860 + }, + { + "epoch": 0.14008543630854928, + "grad_norm": 4.720640182495117, + "learning_rate": 9.688658218048774e-05, + "loss": 2.1274, + "step": 1861 + }, + { + "epoch": 0.14016071058920943, + "grad_norm": 4.303944110870361, + "learning_rate": 9.68823461520068e-05, + "loss": 2.2675, + "step": 1862 + }, + { + "epoch": 0.1402359848698696, + "grad_norm": 6.73034143447876, + "learning_rate": 9.687810733650488e-05, + "loss": 2.5371, + "step": 1863 + }, + { + "epoch": 0.14031125915052975, + "grad_norm": 6.235661029815674, + "learning_rate": 9.687386573423401e-05, + "loss": 2.2235, + "step": 1864 + }, + { + "epoch": 0.1403865334311899, + "grad_norm": 5.523322105407715, + "learning_rate": 9.68696213454463e-05, + "loss": 1.9954, + "step": 1865 + }, + { + "epoch": 0.14046180771185005, + "grad_norm": 5.003766059875488, + "learning_rate": 9.686537417039408e-05, + "loss": 2.1717, + "step": 1866 + }, + { + "epoch": 0.14053708199251022, + "grad_norm": 5.235233783721924, + "learning_rate": 9.686112420932984e-05, + "loss": 2.3419, + "step": 1867 + }, + { + "epoch": 0.14061235627317037, + "grad_norm": 5.51898193359375, + "learning_rate": 9.685687146250624e-05, + "loss": 2.2538, + "step": 1868 + }, + { + "epoch": 0.14068763055383052, + "grad_norm": 5.234782695770264, + "learning_rate": 9.685261593017607e-05, + "loss": 2.0887, + "step": 1869 + }, + { + "epoch": 0.14076290483449067, + "grad_norm": 4.609278678894043, + "learning_rate": 9.684835761259232e-05, + "loss": 2.5293, + "step": 1870 + }, + { + "epoch": 0.14083817911515084, + "grad_norm": 4.051346778869629, + "learning_rate": 9.684409651000812e-05, + "loss": 2.324, + "step": 1871 + }, + { + "epoch": 0.140913453395811, + "grad_norm": 4.640045642852783, + "learning_rate": 9.683983262267682e-05, + "loss": 1.8394, + "step": 1872 + }, + { + "epoch": 0.14098872767647114, + "grad_norm": 6.540625095367432, + "learning_rate": 9.683556595085186e-05, + "loss": 1.9364, + "step": 1873 + }, + { + "epoch": 0.14106400195713129, + "grad_norm": 4.141967296600342, + "learning_rate": 9.683129649478689e-05, + "loss": 2.0072, + "step": 1874 + }, + { + "epoch": 0.14113927623779146, + "grad_norm": 4.262126445770264, + "learning_rate": 9.68270242547357e-05, + "loss": 2.2954, + "step": 1875 + }, + { + "epoch": 0.1412145505184516, + "grad_norm": 4.7414703369140625, + "learning_rate": 9.682274923095229e-05, + "loss": 2.5297, + "step": 1876 + }, + { + "epoch": 0.14128982479911176, + "grad_norm": 3.738071918487549, + "learning_rate": 9.68184714236908e-05, + "loss": 2.2613, + "step": 1877 + }, + { + "epoch": 0.1413650990797719, + "grad_norm": 5.414370059967041, + "learning_rate": 9.68141908332055e-05, + "loss": 2.0167, + "step": 1878 + }, + { + "epoch": 0.14144037336043208, + "grad_norm": 4.328585624694824, + "learning_rate": 9.68099074597509e-05, + "loss": 2.0192, + "step": 1879 + }, + { + "epoch": 0.14151564764109223, + "grad_norm": 3.526667833328247, + "learning_rate": 9.68056213035816e-05, + "loss": 2.7083, + "step": 1880 + }, + { + "epoch": 0.14159092192175238, + "grad_norm": 4.046936988830566, + "learning_rate": 9.680133236495241e-05, + "loss": 2.1185, + "step": 1881 + }, + { + "epoch": 0.14166619620241255, + "grad_norm": 4.649966716766357, + "learning_rate": 9.679704064411832e-05, + "loss": 2.2994, + "step": 1882 + }, + { + "epoch": 0.1417414704830727, + "grad_norm": 5.140581130981445, + "learning_rate": 9.679274614133442e-05, + "loss": 2.1596, + "step": 1883 + }, + { + "epoch": 0.14181674476373285, + "grad_norm": 4.729349136352539, + "learning_rate": 9.678844885685603e-05, + "loss": 2.5428, + "step": 1884 + }, + { + "epoch": 0.141892019044393, + "grad_norm": 6.481583118438721, + "learning_rate": 9.678414879093862e-05, + "loss": 2.284, + "step": 1885 + }, + { + "epoch": 0.14196729332505317, + "grad_norm": 4.541432857513428, + "learning_rate": 9.677984594383778e-05, + "loss": 2.1312, + "step": 1886 + }, + { + "epoch": 0.14204256760571332, + "grad_norm": 3.671532154083252, + "learning_rate": 9.677554031580935e-05, + "loss": 2.1774, + "step": 1887 + }, + { + "epoch": 0.14211784188637347, + "grad_norm": 4.682297229766846, + "learning_rate": 9.677123190710925e-05, + "loss": 2.0723, + "step": 1888 + }, + { + "epoch": 0.14219311616703362, + "grad_norm": 4.896402359008789, + "learning_rate": 9.676692071799362e-05, + "loss": 2.2624, + "step": 1889 + }, + { + "epoch": 0.1422683904476938, + "grad_norm": 5.723679542541504, + "learning_rate": 9.676260674871874e-05, + "loss": 1.9701, + "step": 1890 + }, + { + "epoch": 0.14234366472835394, + "grad_norm": 3.2864460945129395, + "learning_rate": 9.675828999954105e-05, + "loss": 2.2517, + "step": 1891 + }, + { + "epoch": 0.1424189390090141, + "grad_norm": 3.565432548522949, + "learning_rate": 9.67539704707172e-05, + "loss": 2.2873, + "step": 1892 + }, + { + "epoch": 0.14249421328967424, + "grad_norm": 5.5910515785217285, + "learning_rate": 9.674964816250395e-05, + "loss": 2.0594, + "step": 1893 + }, + { + "epoch": 0.1425694875703344, + "grad_norm": 3.683396100997925, + "learning_rate": 9.674532307515827e-05, + "loss": 2.0167, + "step": 1894 + }, + { + "epoch": 0.14264476185099456, + "grad_norm": 4.884602069854736, + "learning_rate": 9.674099520893724e-05, + "loss": 2.5024, + "step": 1895 + }, + { + "epoch": 0.1427200361316547, + "grad_norm": 5.192716598510742, + "learning_rate": 9.673666456409817e-05, + "loss": 2.2091, + "step": 1896 + }, + { + "epoch": 0.14279531041231489, + "grad_norm": 7.458247184753418, + "learning_rate": 9.67323311408985e-05, + "loss": 2.0761, + "step": 1897 + }, + { + "epoch": 0.14287058469297503, + "grad_norm": 4.641127109527588, + "learning_rate": 9.672799493959584e-05, + "loss": 2.1959, + "step": 1898 + }, + { + "epoch": 0.14294585897363518, + "grad_norm": 6.1288862228393555, + "learning_rate": 9.672365596044795e-05, + "loss": 2.1263, + "step": 1899 + }, + { + "epoch": 0.14302113325429533, + "grad_norm": 4.811298370361328, + "learning_rate": 9.671931420371278e-05, + "loss": 2.5735, + "step": 1900 + }, + { + "epoch": 0.1430964075349555, + "grad_norm": 3.9263174533843994, + "learning_rate": 9.671496966964842e-05, + "loss": 1.8406, + "step": 1901 + }, + { + "epoch": 0.14317168181561565, + "grad_norm": 7.9855804443359375, + "learning_rate": 9.671062235851317e-05, + "loss": 2.0001, + "step": 1902 + }, + { + "epoch": 0.1432469560962758, + "grad_norm": 5.881470203399658, + "learning_rate": 9.670627227056543e-05, + "loss": 2.0452, + "step": 1903 + }, + { + "epoch": 0.14332223037693595, + "grad_norm": 6.645844459533691, + "learning_rate": 9.670191940606384e-05, + "loss": 2.7314, + "step": 1904 + }, + { + "epoch": 0.14339750465759613, + "grad_norm": 6.873920917510986, + "learning_rate": 9.669756376526714e-05, + "loss": 2.6188, + "step": 1905 + }, + { + "epoch": 0.14347277893825627, + "grad_norm": 5.633041858673096, + "learning_rate": 9.669320534843425e-05, + "loss": 2.445, + "step": 1906 + }, + { + "epoch": 0.14354805321891642, + "grad_norm": 5.7597126960754395, + "learning_rate": 9.66888441558243e-05, + "loss": 2.3496, + "step": 1907 + }, + { + "epoch": 0.14362332749957657, + "grad_norm": 5.59921932220459, + "learning_rate": 9.668448018769652e-05, + "loss": 2.1463, + "step": 1908 + }, + { + "epoch": 0.14369860178023675, + "grad_norm": 4.217930793762207, + "learning_rate": 9.668011344431034e-05, + "loss": 2.4306, + "step": 1909 + }, + { + "epoch": 0.1437738760608969, + "grad_norm": 5.275418281555176, + "learning_rate": 9.667574392592537e-05, + "loss": 2.5023, + "step": 1910 + }, + { + "epoch": 0.14384915034155704, + "grad_norm": 5.853795051574707, + "learning_rate": 9.667137163280134e-05, + "loss": 2.203, + "step": 1911 + }, + { + "epoch": 0.14392442462221722, + "grad_norm": 4.18360710144043, + "learning_rate": 9.666699656519817e-05, + "loss": 2.3513, + "step": 1912 + }, + { + "epoch": 0.14399969890287737, + "grad_norm": 5.436791896820068, + "learning_rate": 9.666261872337599e-05, + "loss": 2.0364, + "step": 1913 + }, + { + "epoch": 0.1440749731835375, + "grad_norm": 4.552731513977051, + "learning_rate": 9.665823810759498e-05, + "loss": 2.1606, + "step": 1914 + }, + { + "epoch": 0.14415024746419766, + "grad_norm": 4.0237717628479, + "learning_rate": 9.665385471811562e-05, + "loss": 2.2284, + "step": 1915 + }, + { + "epoch": 0.14422552174485784, + "grad_norm": 4.607576370239258, + "learning_rate": 9.664946855519845e-05, + "loss": 2.2797, + "step": 1916 + }, + { + "epoch": 0.14430079602551799, + "grad_norm": 6.482448577880859, + "learning_rate": 9.664507961910423e-05, + "loss": 2.1925, + "step": 1917 + }, + { + "epoch": 0.14437607030617813, + "grad_norm": 5.397335529327393, + "learning_rate": 9.664068791009386e-05, + "loss": 2.0366, + "step": 1918 + }, + { + "epoch": 0.14445134458683828, + "grad_norm": 6.092636585235596, + "learning_rate": 9.663629342842843e-05, + "loss": 1.9911, + "step": 1919 + }, + { + "epoch": 0.14452661886749846, + "grad_norm": 11.476844787597656, + "learning_rate": 9.663189617436916e-05, + "loss": 2.1739, + "step": 1920 + }, + { + "epoch": 0.1446018931481586, + "grad_norm": 4.44120979309082, + "learning_rate": 9.662749614817748e-05, + "loss": 2.1389, + "step": 1921 + }, + { + "epoch": 0.14467716742881875, + "grad_norm": 5.036876201629639, + "learning_rate": 9.662309335011494e-05, + "loss": 2.2473, + "step": 1922 + }, + { + "epoch": 0.1447524417094789, + "grad_norm": 5.119384288787842, + "learning_rate": 9.661868778044328e-05, + "loss": 2.2134, + "step": 1923 + }, + { + "epoch": 0.14482771599013908, + "grad_norm": 4.753607273101807, + "learning_rate": 9.661427943942437e-05, + "loss": 2.1122, + "step": 1924 + }, + { + "epoch": 0.14490299027079923, + "grad_norm": 5.219104290008545, + "learning_rate": 9.660986832732032e-05, + "loss": 1.8996, + "step": 1925 + }, + { + "epoch": 0.14497826455145937, + "grad_norm": 4.637187957763672, + "learning_rate": 9.660545444439333e-05, + "loss": 2.1506, + "step": 1926 + }, + { + "epoch": 0.14505353883211952, + "grad_norm": 5.25298547744751, + "learning_rate": 9.66010377909058e-05, + "loss": 2.2684, + "step": 1927 + }, + { + "epoch": 0.1451288131127797, + "grad_norm": 5.068880081176758, + "learning_rate": 9.659661836712028e-05, + "loss": 2.174, + "step": 1928 + }, + { + "epoch": 0.14520408739343985, + "grad_norm": 3.5758395195007324, + "learning_rate": 9.659219617329951e-05, + "loss": 2.2494, + "step": 1929 + }, + { + "epoch": 0.1452793616741, + "grad_norm": 5.59762716293335, + "learning_rate": 9.658777120970637e-05, + "loss": 2.2994, + "step": 1930 + }, + { + "epoch": 0.14535463595476017, + "grad_norm": 4.5902299880981445, + "learning_rate": 9.65833434766039e-05, + "loss": 2.1852, + "step": 1931 + }, + { + "epoch": 0.14542991023542032, + "grad_norm": 4.434948921203613, + "learning_rate": 9.657891297425531e-05, + "loss": 2.5772, + "step": 1932 + }, + { + "epoch": 0.14550518451608047, + "grad_norm": 4.257136821746826, + "learning_rate": 9.657447970292402e-05, + "loss": 2.2725, + "step": 1933 + }, + { + "epoch": 0.1455804587967406, + "grad_norm": 4.4683098793029785, + "learning_rate": 9.657004366287352e-05, + "loss": 2.2595, + "step": 1934 + }, + { + "epoch": 0.1456557330774008, + "grad_norm": 5.881546497344971, + "learning_rate": 9.656560485436757e-05, + "loss": 1.9656, + "step": 1935 + }, + { + "epoch": 0.14573100735806094, + "grad_norm": 5.927746772766113, + "learning_rate": 9.656116327767002e-05, + "loss": 2.3611, + "step": 1936 + }, + { + "epoch": 0.14580628163872109, + "grad_norm": 6.368885517120361, + "learning_rate": 9.65567189330449e-05, + "loss": 2.0469, + "step": 1937 + }, + { + "epoch": 0.14588155591938123, + "grad_norm": 9.653582572937012, + "learning_rate": 9.655227182075643e-05, + "loss": 2.3902, + "step": 1938 + }, + { + "epoch": 0.1459568302000414, + "grad_norm": 5.578766822814941, + "learning_rate": 9.6547821941069e-05, + "loss": 2.4388, + "step": 1939 + }, + { + "epoch": 0.14603210448070156, + "grad_norm": 6.3838043212890625, + "learning_rate": 9.654336929424708e-05, + "loss": 2.1256, + "step": 1940 + }, + { + "epoch": 0.1461073787613617, + "grad_norm": 6.19985818862915, + "learning_rate": 9.653891388055541e-05, + "loss": 2.2711, + "step": 1941 + }, + { + "epoch": 0.14618265304202185, + "grad_norm": 5.388345241546631, + "learning_rate": 9.653445570025887e-05, + "loss": 1.9546, + "step": 1942 + }, + { + "epoch": 0.14625792732268203, + "grad_norm": 5.75406551361084, + "learning_rate": 9.652999475362243e-05, + "loss": 1.9034, + "step": 1943 + }, + { + "epoch": 0.14633320160334218, + "grad_norm": 5.268181800842285, + "learning_rate": 9.652553104091133e-05, + "loss": 2.354, + "step": 1944 + }, + { + "epoch": 0.14640847588400233, + "grad_norm": 4.4514336585998535, + "learning_rate": 9.65210645623909e-05, + "loss": 2.2878, + "step": 1945 + }, + { + "epoch": 0.1464837501646625, + "grad_norm": 5.7580156326293945, + "learning_rate": 9.651659531832667e-05, + "loss": 2.0802, + "step": 1946 + }, + { + "epoch": 0.14655902444532265, + "grad_norm": 4.718121528625488, + "learning_rate": 9.651212330898432e-05, + "loss": 1.9684, + "step": 1947 + }, + { + "epoch": 0.1466342987259828, + "grad_norm": 3.99812912940979, + "learning_rate": 9.650764853462967e-05, + "loss": 2.5342, + "step": 1948 + }, + { + "epoch": 0.14670957300664295, + "grad_norm": 4.740177154541016, + "learning_rate": 9.650317099552879e-05, + "loss": 2.2826, + "step": 1949 + }, + { + "epoch": 0.14678484728730312, + "grad_norm": 5.82108211517334, + "learning_rate": 9.649869069194783e-05, + "loss": 2.0493, + "step": 1950 + }, + { + "epoch": 0.14686012156796327, + "grad_norm": 4.381932258605957, + "learning_rate": 9.649420762415311e-05, + "loss": 2.236, + "step": 1951 + }, + { + "epoch": 0.14693539584862342, + "grad_norm": 4.588272571563721, + "learning_rate": 9.648972179241116e-05, + "loss": 2.0181, + "step": 1952 + }, + { + "epoch": 0.14701067012928357, + "grad_norm": 5.516083240509033, + "learning_rate": 9.648523319698865e-05, + "loss": 2.3241, + "step": 1953 + }, + { + "epoch": 0.14708594440994374, + "grad_norm": 9.62438678741455, + "learning_rate": 9.648074183815241e-05, + "loss": 2.4048, + "step": 1954 + }, + { + "epoch": 0.1471612186906039, + "grad_norm": 5.418132305145264, + "learning_rate": 9.647624771616944e-05, + "loss": 2.3005, + "step": 1955 + }, + { + "epoch": 0.14723649297126404, + "grad_norm": 7.159862518310547, + "learning_rate": 9.647175083130687e-05, + "loss": 2.0388, + "step": 1956 + }, + { + "epoch": 0.14731176725192419, + "grad_norm": 5.345907211303711, + "learning_rate": 9.646725118383207e-05, + "loss": 2.2907, + "step": 1957 + }, + { + "epoch": 0.14738704153258436, + "grad_norm": 4.737625598907471, + "learning_rate": 9.646274877401252e-05, + "loss": 1.9337, + "step": 1958 + }, + { + "epoch": 0.1474623158132445, + "grad_norm": 5.6605119705200195, + "learning_rate": 9.645824360211589e-05, + "loss": 2.3023, + "step": 1959 + }, + { + "epoch": 0.14753759009390466, + "grad_norm": 4.911399841308594, + "learning_rate": 9.645373566840994e-05, + "loss": 1.931, + "step": 1960 + }, + { + "epoch": 0.14761286437456483, + "grad_norm": 5.157008647918701, + "learning_rate": 9.644922497316274e-05, + "loss": 2.6263, + "step": 1961 + }, + { + "epoch": 0.14768813865522498, + "grad_norm": 7.209107875823975, + "learning_rate": 9.644471151664236e-05, + "loss": 2.3765, + "step": 1962 + }, + { + "epoch": 0.14776341293588513, + "grad_norm": 5.182798862457275, + "learning_rate": 9.644019529911716e-05, + "loss": 2.3124, + "step": 1963 + }, + { + "epoch": 0.14783868721654528, + "grad_norm": 4.9227471351623535, + "learning_rate": 9.64356763208556e-05, + "loss": 2.3799, + "step": 1964 + }, + { + "epoch": 0.14791396149720545, + "grad_norm": 5.272462844848633, + "learning_rate": 9.643115458212631e-05, + "loss": 2.0561, + "step": 1965 + }, + { + "epoch": 0.1479892357778656, + "grad_norm": 6.156679630279541, + "learning_rate": 9.642663008319812e-05, + "loss": 2.2961, + "step": 1966 + }, + { + "epoch": 0.14806451005852575, + "grad_norm": 5.657508373260498, + "learning_rate": 9.642210282433999e-05, + "loss": 2.4496, + "step": 1967 + }, + { + "epoch": 0.1481397843391859, + "grad_norm": 5.954901218414307, + "learning_rate": 9.641757280582103e-05, + "loss": 2.4012, + "step": 1968 + }, + { + "epoch": 0.14821505861984607, + "grad_norm": 5.273434638977051, + "learning_rate": 9.641304002791057e-05, + "loss": 2.2418, + "step": 1969 + }, + { + "epoch": 0.14829033290050622, + "grad_norm": 4.450619697570801, + "learning_rate": 9.640850449087804e-05, + "loss": 2.1015, + "step": 1970 + }, + { + "epoch": 0.14836560718116637, + "grad_norm": 4.208042144775391, + "learning_rate": 9.640396619499308e-05, + "loss": 2.3728, + "step": 1971 + }, + { + "epoch": 0.14844088146182652, + "grad_norm": 5.491754531860352, + "learning_rate": 9.639942514052548e-05, + "loss": 2.0012, + "step": 1972 + }, + { + "epoch": 0.1485161557424867, + "grad_norm": 4.738384246826172, + "learning_rate": 9.639488132774518e-05, + "loss": 2.137, + "step": 1973 + }, + { + "epoch": 0.14859143002314684, + "grad_norm": 4.606748104095459, + "learning_rate": 9.639033475692233e-05, + "loss": 2.244, + "step": 1974 + }, + { + "epoch": 0.148666704303807, + "grad_norm": 5.480950355529785, + "learning_rate": 9.638578542832717e-05, + "loss": 2.397, + "step": 1975 + }, + { + "epoch": 0.14874197858446717, + "grad_norm": 4.1901350021362305, + "learning_rate": 9.638123334223017e-05, + "loss": 1.8605, + "step": 1976 + }, + { + "epoch": 0.14881725286512731, + "grad_norm": 4.445580005645752, + "learning_rate": 9.637667849890193e-05, + "loss": 2.2525, + "step": 1977 + }, + { + "epoch": 0.14889252714578746, + "grad_norm": 3.7742133140563965, + "learning_rate": 9.637212089861323e-05, + "loss": 2.1167, + "step": 1978 + }, + { + "epoch": 0.1489678014264476, + "grad_norm": 4.743167877197266, + "learning_rate": 9.636756054163498e-05, + "loss": 2.1636, + "step": 1979 + }, + { + "epoch": 0.14904307570710779, + "grad_norm": 4.316456317901611, + "learning_rate": 9.636299742823831e-05, + "loss": 2.2178, + "step": 1980 + }, + { + "epoch": 0.14911834998776793, + "grad_norm": 3.9753384590148926, + "learning_rate": 9.635843155869449e-05, + "loss": 2.2348, + "step": 1981 + }, + { + "epoch": 0.14919362426842808, + "grad_norm": 4.181934356689453, + "learning_rate": 9.63538629332749e-05, + "loss": 2.1192, + "step": 1982 + }, + { + "epoch": 0.14926889854908823, + "grad_norm": 4.687638282775879, + "learning_rate": 9.634929155225118e-05, + "loss": 2.0832, + "step": 1983 + }, + { + "epoch": 0.1493441728297484, + "grad_norm": 3.862661361694336, + "learning_rate": 9.634471741589505e-05, + "loss": 2.4405, + "step": 1984 + }, + { + "epoch": 0.14941944711040855, + "grad_norm": 3.4980392456054688, + "learning_rate": 9.634014052447849e-05, + "loss": 2.0628, + "step": 1985 + }, + { + "epoch": 0.1494947213910687, + "grad_norm": 4.194029331207275, + "learning_rate": 9.63355608782735e-05, + "loss": 2.2522, + "step": 1986 + }, + { + "epoch": 0.14956999567172885, + "grad_norm": 5.431275367736816, + "learning_rate": 9.633097847755238e-05, + "loss": 2.3805, + "step": 1987 + }, + { + "epoch": 0.14964526995238903, + "grad_norm": 5.273715019226074, + "learning_rate": 9.632639332258752e-05, + "loss": 2.6288, + "step": 1988 + }, + { + "epoch": 0.14972054423304917, + "grad_norm": 4.149738311767578, + "learning_rate": 9.632180541365152e-05, + "loss": 2.0956, + "step": 1989 + }, + { + "epoch": 0.14979581851370932, + "grad_norm": 5.5944647789001465, + "learning_rate": 9.631721475101709e-05, + "loss": 2.1588, + "step": 1990 + }, + { + "epoch": 0.14987109279436947, + "grad_norm": 5.478260040283203, + "learning_rate": 9.631262133495714e-05, + "loss": 1.9695, + "step": 1991 + }, + { + "epoch": 0.14994636707502965, + "grad_norm": 5.873048305511475, + "learning_rate": 9.630802516574476e-05, + "loss": 2.498, + "step": 1992 + }, + { + "epoch": 0.1500216413556898, + "grad_norm": 5.332206726074219, + "learning_rate": 9.630342624365311e-05, + "loss": 1.9052, + "step": 1993 + }, + { + "epoch": 0.15009691563634994, + "grad_norm": 5.516310691833496, + "learning_rate": 9.629882456895567e-05, + "loss": 1.893, + "step": 1994 + }, + { + "epoch": 0.15017218991701012, + "grad_norm": 5.302465438842773, + "learning_rate": 9.629422014192595e-05, + "loss": 1.9819, + "step": 1995 + }, + { + "epoch": 0.15024746419767027, + "grad_norm": 5.737462997436523, + "learning_rate": 9.628961296283766e-05, + "loss": 2.0859, + "step": 1996 + }, + { + "epoch": 0.15032273847833041, + "grad_norm": 5.143144607543945, + "learning_rate": 9.62850030319647e-05, + "loss": 1.9948, + "step": 1997 + }, + { + "epoch": 0.15039801275899056, + "grad_norm": 4.926069259643555, + "learning_rate": 9.628039034958113e-05, + "loss": 2.1829, + "step": 1998 + }, + { + "epoch": 0.15047328703965074, + "grad_norm": 4.048475742340088, + "learning_rate": 9.627577491596113e-05, + "loss": 2.4922, + "step": 1999 + }, + { + "epoch": 0.1505485613203109, + "grad_norm": 4.139350891113281, + "learning_rate": 9.627115673137912e-05, + "loss": 2.2655, + "step": 2000 + }, + { + "epoch": 0.15062383560097103, + "grad_norm": 5.4989142417907715, + "learning_rate": 9.626653579610958e-05, + "loss": 2.0027, + "step": 2001 + }, + { + "epoch": 0.15069910988163118, + "grad_norm": 4.3663530349731445, + "learning_rate": 9.626191211042726e-05, + "loss": 1.8043, + "step": 2002 + }, + { + "epoch": 0.15077438416229136, + "grad_norm": 6.856672763824463, + "learning_rate": 9.625728567460699e-05, + "loss": 2.0533, + "step": 2003 + }, + { + "epoch": 0.1508496584429515, + "grad_norm": 5.059989929199219, + "learning_rate": 9.625265648892384e-05, + "loss": 2.2027, + "step": 2004 + }, + { + "epoch": 0.15092493272361165, + "grad_norm": 4.941856861114502, + "learning_rate": 9.624802455365294e-05, + "loss": 2.081, + "step": 2005 + }, + { + "epoch": 0.1510002070042718, + "grad_norm": 3.969909906387329, + "learning_rate": 9.624338986906969e-05, + "loss": 2.5115, + "step": 2006 + }, + { + "epoch": 0.15107548128493198, + "grad_norm": 4.537139892578125, + "learning_rate": 9.623875243544962e-05, + "loss": 2.1718, + "step": 2007 + }, + { + "epoch": 0.15115075556559213, + "grad_norm": 3.777068614959717, + "learning_rate": 9.623411225306837e-05, + "loss": 2.2531, + "step": 2008 + }, + { + "epoch": 0.15122602984625227, + "grad_norm": 3.402850389480591, + "learning_rate": 9.622946932220182e-05, + "loss": 2.3155, + "step": 2009 + }, + { + "epoch": 0.15130130412691245, + "grad_norm": 4.730953693389893, + "learning_rate": 9.622482364312594e-05, + "loss": 2.1493, + "step": 2010 + }, + { + "epoch": 0.1513765784075726, + "grad_norm": 5.613648891448975, + "learning_rate": 9.622017521611698e-05, + "loss": 2.0707, + "step": 2011 + }, + { + "epoch": 0.15145185268823275, + "grad_norm": 3.657475233078003, + "learning_rate": 9.621552404145118e-05, + "loss": 2.3353, + "step": 2012 + }, + { + "epoch": 0.1515271269688929, + "grad_norm": 3.5609402656555176, + "learning_rate": 9.621087011940509e-05, + "loss": 1.8909, + "step": 2013 + }, + { + "epoch": 0.15160240124955307, + "grad_norm": 4.627619743347168, + "learning_rate": 9.620621345025538e-05, + "loss": 2.186, + "step": 2014 + }, + { + "epoch": 0.15167767553021322, + "grad_norm": 4.744054317474365, + "learning_rate": 9.620155403427885e-05, + "loss": 2.2233, + "step": 2015 + }, + { + "epoch": 0.15175294981087337, + "grad_norm": 4.01196813583374, + "learning_rate": 9.619689187175249e-05, + "loss": 1.9883, + "step": 2016 + }, + { + "epoch": 0.15182822409153351, + "grad_norm": 5.643701076507568, + "learning_rate": 9.619222696295348e-05, + "loss": 2.3755, + "step": 2017 + }, + { + "epoch": 0.1519034983721937, + "grad_norm": 4.8457536697387695, + "learning_rate": 9.618755930815912e-05, + "loss": 2.0702, + "step": 2018 + }, + { + "epoch": 0.15197877265285384, + "grad_norm": 4.948049068450928, + "learning_rate": 9.618288890764688e-05, + "loss": 2.0424, + "step": 2019 + }, + { + "epoch": 0.152054046933514, + "grad_norm": 5.378200054168701, + "learning_rate": 9.617821576169439e-05, + "loss": 2.2498, + "step": 2020 + }, + { + "epoch": 0.15212932121417413, + "grad_norm": 5.266604423522949, + "learning_rate": 9.61735398705795e-05, + "loss": 1.7246, + "step": 2021 + }, + { + "epoch": 0.1522045954948343, + "grad_norm": 5.899149417877197, + "learning_rate": 9.616886123458013e-05, + "loss": 2.2373, + "step": 2022 + }, + { + "epoch": 0.15227986977549446, + "grad_norm": 5.359463691711426, + "learning_rate": 9.616417985397446e-05, + "loss": 2.2125, + "step": 2023 + }, + { + "epoch": 0.1523551440561546, + "grad_norm": 5.404905796051025, + "learning_rate": 9.615949572904073e-05, + "loss": 2.3286, + "step": 2024 + }, + { + "epoch": 0.15243041833681478, + "grad_norm": 5.987763404846191, + "learning_rate": 9.615480886005744e-05, + "loss": 2.466, + "step": 2025 + }, + { + "epoch": 0.15250569261747493, + "grad_norm": 6.86370849609375, + "learning_rate": 9.61501192473032e-05, + "loss": 2.1007, + "step": 2026 + }, + { + "epoch": 0.15258096689813508, + "grad_norm": 7.341048240661621, + "learning_rate": 9.614542689105676e-05, + "loss": 2.1356, + "step": 2027 + }, + { + "epoch": 0.15265624117879523, + "grad_norm": 4.952999591827393, + "learning_rate": 9.614073179159713e-05, + "loss": 1.9574, + "step": 2028 + }, + { + "epoch": 0.1527315154594554, + "grad_norm": 6.23982572555542, + "learning_rate": 9.613603394920337e-05, + "loss": 2.1264, + "step": 2029 + }, + { + "epoch": 0.15280678974011555, + "grad_norm": 5.975821495056152, + "learning_rate": 9.613133336415478e-05, + "loss": 2.0535, + "step": 2030 + }, + { + "epoch": 0.1528820640207757, + "grad_norm": 5.267685413360596, + "learning_rate": 9.612663003673079e-05, + "loss": 2.0035, + "step": 2031 + }, + { + "epoch": 0.15295733830143585, + "grad_norm": 5.806224822998047, + "learning_rate": 9.612192396721099e-05, + "loss": 2.2577, + "step": 2032 + }, + { + "epoch": 0.15303261258209602, + "grad_norm": 5.5962982177734375, + "learning_rate": 9.611721515587515e-05, + "loss": 2.34, + "step": 2033 + }, + { + "epoch": 0.15310788686275617, + "grad_norm": 4.431849002838135, + "learning_rate": 9.611250360300319e-05, + "loss": 1.9954, + "step": 2034 + }, + { + "epoch": 0.15318316114341632, + "grad_norm": 4.238766193389893, + "learning_rate": 9.610778930887521e-05, + "loss": 2.2883, + "step": 2035 + }, + { + "epoch": 0.15325843542407647, + "grad_norm": 4.898646831512451, + "learning_rate": 9.610307227377145e-05, + "loss": 1.9973, + "step": 2036 + }, + { + "epoch": 0.15333370970473664, + "grad_norm": 5.965308666229248, + "learning_rate": 9.609835249797233e-05, + "loss": 2.2187, + "step": 2037 + }, + { + "epoch": 0.1534089839853968, + "grad_norm": 5.081641674041748, + "learning_rate": 9.609362998175843e-05, + "loss": 1.9027, + "step": 2038 + }, + { + "epoch": 0.15348425826605694, + "grad_norm": 4.4746856689453125, + "learning_rate": 9.608890472541048e-05, + "loss": 2.1957, + "step": 2039 + }, + { + "epoch": 0.1535595325467171, + "grad_norm": 3.936408042907715, + "learning_rate": 9.608417672920939e-05, + "loss": 1.9187, + "step": 2040 + }, + { + "epoch": 0.15363480682737726, + "grad_norm": 5.039267539978027, + "learning_rate": 9.60794459934362e-05, + "loss": 1.919, + "step": 2041 + }, + { + "epoch": 0.1537100811080374, + "grad_norm": 4.060781478881836, + "learning_rate": 9.607471251837219e-05, + "loss": 2.1018, + "step": 2042 + }, + { + "epoch": 0.15378535538869756, + "grad_norm": 4.91633415222168, + "learning_rate": 9.606997630429872e-05, + "loss": 1.9919, + "step": 2043 + }, + { + "epoch": 0.15386062966935773, + "grad_norm": 4.289859771728516, + "learning_rate": 9.606523735149735e-05, + "loss": 2.5446, + "step": 2044 + }, + { + "epoch": 0.15393590395001788, + "grad_norm": 3.936551094055176, + "learning_rate": 9.60604956602498e-05, + "loss": 2.3704, + "step": 2045 + }, + { + "epoch": 0.15401117823067803, + "grad_norm": 10.058585166931152, + "learning_rate": 9.605575123083793e-05, + "loss": 2.2148, + "step": 2046 + }, + { + "epoch": 0.15408645251133818, + "grad_norm": 5.448790073394775, + "learning_rate": 9.605100406354379e-05, + "loss": 2.2353, + "step": 2047 + }, + { + "epoch": 0.15416172679199835, + "grad_norm": 3.798121929168701, + "learning_rate": 9.604625415864962e-05, + "loss": 2.2022, + "step": 2048 + }, + { + "epoch": 0.1542370010726585, + "grad_norm": 3.8325226306915283, + "learning_rate": 9.604150151643775e-05, + "loss": 1.9418, + "step": 2049 + }, + { + "epoch": 0.15431227535331865, + "grad_norm": 6.15515661239624, + "learning_rate": 9.603674613719072e-05, + "loss": 2.3556, + "step": 2050 + }, + { + "epoch": 0.1543875496339788, + "grad_norm": 4.198747158050537, + "learning_rate": 9.603198802119122e-05, + "loss": 2.1127, + "step": 2051 + }, + { + "epoch": 0.15446282391463897, + "grad_norm": 4.799349784851074, + "learning_rate": 9.602722716872213e-05, + "loss": 1.9679, + "step": 2052 + }, + { + "epoch": 0.15453809819529912, + "grad_norm": 6.379265785217285, + "learning_rate": 9.602246358006643e-05, + "loss": 2.4261, + "step": 2053 + }, + { + "epoch": 0.15461337247595927, + "grad_norm": 7.0591607093811035, + "learning_rate": 9.601769725550736e-05, + "loss": 2.2622, + "step": 2054 + }, + { + "epoch": 0.15468864675661942, + "grad_norm": 4.3434929847717285, + "learning_rate": 9.601292819532819e-05, + "loss": 2.3136, + "step": 2055 + }, + { + "epoch": 0.1547639210372796, + "grad_norm": 7.49066162109375, + "learning_rate": 9.600815639981249e-05, + "loss": 2.1691, + "step": 2056 + }, + { + "epoch": 0.15483919531793974, + "grad_norm": 4.92397403717041, + "learning_rate": 9.600338186924389e-05, + "loss": 2.5493, + "step": 2057 + }, + { + "epoch": 0.1549144695985999, + "grad_norm": 4.865357875823975, + "learning_rate": 9.599860460390624e-05, + "loss": 1.9386, + "step": 2058 + }, + { + "epoch": 0.15498974387926007, + "grad_norm": 4.770902633666992, + "learning_rate": 9.599382460408353e-05, + "loss": 2.0517, + "step": 2059 + }, + { + "epoch": 0.15506501815992021, + "grad_norm": 5.135201930999756, + "learning_rate": 9.598904187005991e-05, + "loss": 2.2801, + "step": 2060 + }, + { + "epoch": 0.15514029244058036, + "grad_norm": 6.234017848968506, + "learning_rate": 9.598425640211972e-05, + "loss": 2.3472, + "step": 2061 + }, + { + "epoch": 0.1552155667212405, + "grad_norm": 5.580723762512207, + "learning_rate": 9.597946820054743e-05, + "loss": 2.0529, + "step": 2062 + }, + { + "epoch": 0.1552908410019007, + "grad_norm": 5.65152645111084, + "learning_rate": 9.597467726562768e-05, + "loss": 2.7563, + "step": 2063 + }, + { + "epoch": 0.15536611528256083, + "grad_norm": 6.707641124725342, + "learning_rate": 9.596988359764529e-05, + "loss": 2.2833, + "step": 2064 + }, + { + "epoch": 0.15544138956322098, + "grad_norm": 6.38559627532959, + "learning_rate": 9.596508719688522e-05, + "loss": 2.1667, + "step": 2065 + }, + { + "epoch": 0.15551666384388113, + "grad_norm": 5.363674640655518, + "learning_rate": 9.59602880636326e-05, + "loss": 2.6474, + "step": 2066 + }, + { + "epoch": 0.1555919381245413, + "grad_norm": 4.697733402252197, + "learning_rate": 9.595548619817274e-05, + "loss": 2.236, + "step": 2067 + }, + { + "epoch": 0.15566721240520145, + "grad_norm": 4.307929992675781, + "learning_rate": 9.595068160079107e-05, + "loss": 2.0107, + "step": 2068 + }, + { + "epoch": 0.1557424866858616, + "grad_norm": 5.606794834136963, + "learning_rate": 9.594587427177324e-05, + "loss": 2.0769, + "step": 2069 + }, + { + "epoch": 0.15581776096652175, + "grad_norm": 3.8274621963500977, + "learning_rate": 9.594106421140501e-05, + "loss": 1.966, + "step": 2070 + }, + { + "epoch": 0.15589303524718193, + "grad_norm": 4.292691707611084, + "learning_rate": 9.593625141997234e-05, + "loss": 2.1974, + "step": 2071 + }, + { + "epoch": 0.15596830952784208, + "grad_norm": 4.167516231536865, + "learning_rate": 9.593143589776133e-05, + "loss": 2.3495, + "step": 2072 + }, + { + "epoch": 0.15604358380850222, + "grad_norm": 4.153419017791748, + "learning_rate": 9.592661764505824e-05, + "loss": 2.172, + "step": 2073 + }, + { + "epoch": 0.1561188580891624, + "grad_norm": 3.9185049533843994, + "learning_rate": 9.59217966621495e-05, + "loss": 1.9352, + "step": 2074 + }, + { + "epoch": 0.15619413236982255, + "grad_norm": 4.827195644378662, + "learning_rate": 9.591697294932173e-05, + "loss": 2.2603, + "step": 2075 + }, + { + "epoch": 0.1562694066504827, + "grad_norm": 4.187567710876465, + "learning_rate": 9.591214650686167e-05, + "loss": 2.0342, + "step": 2076 + }, + { + "epoch": 0.15634468093114284, + "grad_norm": 4.472141742706299, + "learning_rate": 9.590731733505622e-05, + "loss": 2.0691, + "step": 2077 + }, + { + "epoch": 0.15641995521180302, + "grad_norm": 4.0354695320129395, + "learning_rate": 9.59024854341925e-05, + "loss": 2.0649, + "step": 2078 + }, + { + "epoch": 0.15649522949246317, + "grad_norm": 3.723557233810425, + "learning_rate": 9.58976508045577e-05, + "loss": 2.0413, + "step": 2079 + }, + { + "epoch": 0.15657050377312332, + "grad_norm": 4.510951042175293, + "learning_rate": 9.589281344643927e-05, + "loss": 2.0874, + "step": 2080 + }, + { + "epoch": 0.15664577805378346, + "grad_norm": 3.7920453548431396, + "learning_rate": 9.588797336012477e-05, + "loss": 2.5182, + "step": 2081 + }, + { + "epoch": 0.15672105233444364, + "grad_norm": 4.730495452880859, + "learning_rate": 9.588313054590192e-05, + "loss": 2.1439, + "step": 2082 + }, + { + "epoch": 0.1567963266151038, + "grad_norm": 6.249584197998047, + "learning_rate": 9.58782850040586e-05, + "loss": 2.3361, + "step": 2083 + }, + { + "epoch": 0.15687160089576394, + "grad_norm": 3.6124255657196045, + "learning_rate": 9.58734367348829e-05, + "loss": 2.5142, + "step": 2084 + }, + { + "epoch": 0.15694687517642408, + "grad_norm": 5.7303290367126465, + "learning_rate": 9.586858573866299e-05, + "loss": 2.1577, + "step": 2085 + }, + { + "epoch": 0.15702214945708426, + "grad_norm": 5.092860221862793, + "learning_rate": 9.586373201568728e-05, + "loss": 2.1525, + "step": 2086 + }, + { + "epoch": 0.1570974237377444, + "grad_norm": 3.9583547115325928, + "learning_rate": 9.585887556624429e-05, + "loss": 2.346, + "step": 2087 + }, + { + "epoch": 0.15717269801840456, + "grad_norm": 4.521017551422119, + "learning_rate": 9.585401639062273e-05, + "loss": 1.8967, + "step": 2088 + }, + { + "epoch": 0.1572479722990647, + "grad_norm": 5.252394676208496, + "learning_rate": 9.584915448911147e-05, + "loss": 2.201, + "step": 2089 + }, + { + "epoch": 0.15732324657972488, + "grad_norm": 5.868988037109375, + "learning_rate": 9.584428986199954e-05, + "loss": 1.8643, + "step": 2090 + }, + { + "epoch": 0.15739852086038503, + "grad_norm": 4.457388877868652, + "learning_rate": 9.583942250957611e-05, + "loss": 2.035, + "step": 2091 + }, + { + "epoch": 0.15747379514104518, + "grad_norm": 6.086800575256348, + "learning_rate": 9.583455243213054e-05, + "loss": 2.6587, + "step": 2092 + }, + { + "epoch": 0.15754906942170535, + "grad_norm": 4.44578218460083, + "learning_rate": 9.582967962995235e-05, + "loss": 2.2085, + "step": 2093 + }, + { + "epoch": 0.1576243437023655, + "grad_norm": 5.932287216186523, + "learning_rate": 9.58248041033312e-05, + "loss": 2.0856, + "step": 2094 + }, + { + "epoch": 0.15769961798302565, + "grad_norm": 4.791714191436768, + "learning_rate": 9.581992585255692e-05, + "loss": 2.0318, + "step": 2095 + }, + { + "epoch": 0.1577748922636858, + "grad_norm": 6.2426981925964355, + "learning_rate": 9.581504487791954e-05, + "loss": 2.2789, + "step": 2096 + }, + { + "epoch": 0.15785016654434597, + "grad_norm": 5.699377536773682, + "learning_rate": 9.58101611797092e-05, + "loss": 2.3412, + "step": 2097 + }, + { + "epoch": 0.15792544082500612, + "grad_norm": 7.385111331939697, + "learning_rate": 9.580527475821621e-05, + "loss": 2.247, + "step": 2098 + }, + { + "epoch": 0.15800071510566627, + "grad_norm": 5.266499996185303, + "learning_rate": 9.580038561373108e-05, + "loss": 2.0686, + "step": 2099 + }, + { + "epoch": 0.15807598938632642, + "grad_norm": 4.207569599151611, + "learning_rate": 9.579549374654442e-05, + "loss": 2.3634, + "step": 2100 + }, + { + "epoch": 0.1581512636669866, + "grad_norm": 7.277801513671875, + "learning_rate": 9.579059915694708e-05, + "loss": 2.9515, + "step": 2101 + }, + { + "epoch": 0.15822653794764674, + "grad_norm": 6.7272047996521, + "learning_rate": 9.578570184523e-05, + "loss": 1.9149, + "step": 2102 + }, + { + "epoch": 0.1583018122283069, + "grad_norm": 4.62692403793335, + "learning_rate": 9.578080181168432e-05, + "loss": 2.1776, + "step": 2103 + }, + { + "epoch": 0.15837708650896704, + "grad_norm": 4.971329689025879, + "learning_rate": 9.577589905660133e-05, + "loss": 2.2639, + "step": 2104 + }, + { + "epoch": 0.1584523607896272, + "grad_norm": 5.457459926605225, + "learning_rate": 9.577099358027249e-05, + "loss": 2.1835, + "step": 2105 + }, + { + "epoch": 0.15852763507028736, + "grad_norm": 4.928334712982178, + "learning_rate": 9.57660853829894e-05, + "loss": 2.1405, + "step": 2106 + }, + { + "epoch": 0.1586029093509475, + "grad_norm": 4.502702713012695, + "learning_rate": 9.576117446504387e-05, + "loss": 2.0504, + "step": 2107 + }, + { + "epoch": 0.15867818363160768, + "grad_norm": 7.410353183746338, + "learning_rate": 9.575626082672782e-05, + "loss": 1.8989, + "step": 2108 + }, + { + "epoch": 0.15875345791226783, + "grad_norm": 5.7210493087768555, + "learning_rate": 9.575134446833333e-05, + "loss": 2.695, + "step": 2109 + }, + { + "epoch": 0.15882873219292798, + "grad_norm": 4.770233154296875, + "learning_rate": 9.574642539015271e-05, + "loss": 2.205, + "step": 2110 + }, + { + "epoch": 0.15890400647358813, + "grad_norm": 6.2118072509765625, + "learning_rate": 9.574150359247835e-05, + "loss": 2.2313, + "step": 2111 + }, + { + "epoch": 0.1589792807542483, + "grad_norm": 5.2125563621521, + "learning_rate": 9.573657907560286e-05, + "loss": 1.9515, + "step": 2112 + }, + { + "epoch": 0.15905455503490845, + "grad_norm": 3.98368501663208, + "learning_rate": 9.573165183981897e-05, + "loss": 2.9803, + "step": 2113 + }, + { + "epoch": 0.1591298293155686, + "grad_norm": 4.522790431976318, + "learning_rate": 9.57267218854196e-05, + "loss": 2.0788, + "step": 2114 + }, + { + "epoch": 0.15920510359622875, + "grad_norm": 4.263027667999268, + "learning_rate": 9.572178921269781e-05, + "loss": 2.0058, + "step": 2115 + }, + { + "epoch": 0.15928037787688892, + "grad_norm": 6.23157262802124, + "learning_rate": 9.571685382194686e-05, + "loss": 2.6592, + "step": 2116 + }, + { + "epoch": 0.15935565215754907, + "grad_norm": 3.7246224880218506, + "learning_rate": 9.571191571346012e-05, + "loss": 2.1737, + "step": 2117 + }, + { + "epoch": 0.15943092643820922, + "grad_norm": 4.581549644470215, + "learning_rate": 9.570697488753113e-05, + "loss": 2.1702, + "step": 2118 + }, + { + "epoch": 0.15950620071886937, + "grad_norm": 4.184098720550537, + "learning_rate": 9.570203134445366e-05, + "loss": 1.9539, + "step": 2119 + }, + { + "epoch": 0.15958147499952954, + "grad_norm": 5.316369533538818, + "learning_rate": 9.569708508452156e-05, + "loss": 2.5667, + "step": 2120 + }, + { + "epoch": 0.1596567492801897, + "grad_norm": 6.95085334777832, + "learning_rate": 9.569213610802885e-05, + "loss": 2.3969, + "step": 2121 + }, + { + "epoch": 0.15973202356084984, + "grad_norm": 4.367122173309326, + "learning_rate": 9.568718441526976e-05, + "loss": 2.4738, + "step": 2122 + }, + { + "epoch": 0.15980729784151002, + "grad_norm": 4.955066680908203, + "learning_rate": 9.568223000653866e-05, + "loss": 2.2542, + "step": 2123 + }, + { + "epoch": 0.15988257212217016, + "grad_norm": 4.541550636291504, + "learning_rate": 9.567727288213005e-05, + "loss": 1.8783, + "step": 2124 + }, + { + "epoch": 0.1599578464028303, + "grad_norm": 3.552600383758545, + "learning_rate": 9.567231304233863e-05, + "loss": 1.9578, + "step": 2125 + }, + { + "epoch": 0.16003312068349046, + "grad_norm": 4.2720627784729, + "learning_rate": 9.566735048745926e-05, + "loss": 2.7564, + "step": 2126 + }, + { + "epoch": 0.16010839496415064, + "grad_norm": 4.8603644371032715, + "learning_rate": 9.566238521778693e-05, + "loss": 2.3159, + "step": 2127 + }, + { + "epoch": 0.16018366924481078, + "grad_norm": 5.021570205688477, + "learning_rate": 9.565741723361682e-05, + "loss": 2.0601, + "step": 2128 + }, + { + "epoch": 0.16025894352547093, + "grad_norm": 4.532212734222412, + "learning_rate": 9.565244653524426e-05, + "loss": 1.8818, + "step": 2129 + }, + { + "epoch": 0.16033421780613108, + "grad_norm": 6.954552173614502, + "learning_rate": 9.564747312296474e-05, + "loss": 2.9218, + "step": 2130 + }, + { + "epoch": 0.16040949208679126, + "grad_norm": 6.202856540679932, + "learning_rate": 9.564249699707394e-05, + "loss": 2.3562, + "step": 2131 + }, + { + "epoch": 0.1604847663674514, + "grad_norm": 6.85939359664917, + "learning_rate": 9.563751815786764e-05, + "loss": 2.2643, + "step": 2132 + }, + { + "epoch": 0.16056004064811155, + "grad_norm": 5.891876697540283, + "learning_rate": 9.563253660564185e-05, + "loss": 2.1593, + "step": 2133 + }, + { + "epoch": 0.1606353149287717, + "grad_norm": 8.343067169189453, + "learning_rate": 9.562755234069267e-05, + "loss": 1.9639, + "step": 2134 + }, + { + "epoch": 0.16071058920943188, + "grad_norm": 4.527833938598633, + "learning_rate": 9.562256536331644e-05, + "loss": 2.5304, + "step": 2135 + }, + { + "epoch": 0.16078586349009202, + "grad_norm": 3.9452993869781494, + "learning_rate": 9.56175756738096e-05, + "loss": 2.1859, + "step": 2136 + }, + { + "epoch": 0.16086113777075217, + "grad_norm": 6.359346389770508, + "learning_rate": 9.561258327246877e-05, + "loss": 2.0809, + "step": 2137 + }, + { + "epoch": 0.16093641205141235, + "grad_norm": 4.81974458694458, + "learning_rate": 9.560758815959074e-05, + "loss": 2.2716, + "step": 2138 + }, + { + "epoch": 0.1610116863320725, + "grad_norm": 4.2938313484191895, + "learning_rate": 9.560259033547248e-05, + "loss": 2.1051, + "step": 2139 + }, + { + "epoch": 0.16108696061273264, + "grad_norm": 3.4313347339630127, + "learning_rate": 9.559758980041105e-05, + "loss": 2.2185, + "step": 2140 + }, + { + "epoch": 0.1611622348933928, + "grad_norm": 4.162536144256592, + "learning_rate": 9.559258655470375e-05, + "loss": 2.53, + "step": 2141 + }, + { + "epoch": 0.16123750917405297, + "grad_norm": 5.780637741088867, + "learning_rate": 9.5587580598648e-05, + "loss": 2.126, + "step": 2142 + }, + { + "epoch": 0.16131278345471312, + "grad_norm": 4.964845180511475, + "learning_rate": 9.558257193254139e-05, + "loss": 2.7349, + "step": 2143 + }, + { + "epoch": 0.16138805773537326, + "grad_norm": 4.01690149307251, + "learning_rate": 9.557756055668165e-05, + "loss": 2.0901, + "step": 2144 + }, + { + "epoch": 0.1614633320160334, + "grad_norm": 5.37811803817749, + "learning_rate": 9.557254647136675e-05, + "loss": 2.3779, + "step": 2145 + }, + { + "epoch": 0.1615386062966936, + "grad_norm": 6.000019550323486, + "learning_rate": 9.556752967689469e-05, + "loss": 2.0882, + "step": 2146 + }, + { + "epoch": 0.16161388057735374, + "grad_norm": 4.8202643394470215, + "learning_rate": 9.556251017356375e-05, + "loss": 2.279, + "step": 2147 + }, + { + "epoch": 0.16168915485801388, + "grad_norm": 4.633787155151367, + "learning_rate": 9.555748796167232e-05, + "loss": 2.2039, + "step": 2148 + }, + { + "epoch": 0.16176442913867403, + "grad_norm": 5.664914608001709, + "learning_rate": 9.555246304151894e-05, + "loss": 2.306, + "step": 2149 + }, + { + "epoch": 0.1618397034193342, + "grad_norm": 4.398691177368164, + "learning_rate": 9.554743541340234e-05, + "loss": 2.1348, + "step": 2150 + }, + { + "epoch": 0.16191497769999436, + "grad_norm": 4.392038345336914, + "learning_rate": 9.55424050776214e-05, + "loss": 2.182, + "step": 2151 + }, + { + "epoch": 0.1619902519806545, + "grad_norm": 5.23166036605835, + "learning_rate": 9.553737203447514e-05, + "loss": 2.3464, + "step": 2152 + }, + { + "epoch": 0.16206552626131465, + "grad_norm": 4.836452960968018, + "learning_rate": 9.553233628426279e-05, + "loss": 2.0934, + "step": 2153 + }, + { + "epoch": 0.16214080054197483, + "grad_norm": 5.13286828994751, + "learning_rate": 9.552729782728369e-05, + "loss": 2.1849, + "step": 2154 + }, + { + "epoch": 0.16221607482263498, + "grad_norm": 4.531123161315918, + "learning_rate": 9.552225666383737e-05, + "loss": 1.9598, + "step": 2155 + }, + { + "epoch": 0.16229134910329512, + "grad_norm": 5.338104248046875, + "learning_rate": 9.551721279422351e-05, + "loss": 2.4238, + "step": 2156 + }, + { + "epoch": 0.1623666233839553, + "grad_norm": 4.85247802734375, + "learning_rate": 9.551216621874195e-05, + "loss": 1.9163, + "step": 2157 + }, + { + "epoch": 0.16244189766461545, + "grad_norm": 3.82311749458313, + "learning_rate": 9.55071169376927e-05, + "loss": 1.9752, + "step": 2158 + }, + { + "epoch": 0.1625171719452756, + "grad_norm": 5.538928031921387, + "learning_rate": 9.55020649513759e-05, + "loss": 2.2083, + "step": 2159 + }, + { + "epoch": 0.16259244622593574, + "grad_norm": 5.192752838134766, + "learning_rate": 9.549701026009193e-05, + "loss": 2.237, + "step": 2160 + }, + { + "epoch": 0.16266772050659592, + "grad_norm": 4.371562957763672, + "learning_rate": 9.549195286414123e-05, + "loss": 2.1165, + "step": 2161 + }, + { + "epoch": 0.16274299478725607, + "grad_norm": 6.093799591064453, + "learning_rate": 9.548689276382447e-05, + "loss": 2.1418, + "step": 2162 + }, + { + "epoch": 0.16281826906791622, + "grad_norm": 4.156737327575684, + "learning_rate": 9.548182995944244e-05, + "loss": 1.9759, + "step": 2163 + }, + { + "epoch": 0.16289354334857636, + "grad_norm": 4.536795139312744, + "learning_rate": 9.547676445129613e-05, + "loss": 1.8443, + "step": 2164 + }, + { + "epoch": 0.16296881762923654, + "grad_norm": 5.529335975646973, + "learning_rate": 9.547169623968665e-05, + "loss": 1.8961, + "step": 2165 + }, + { + "epoch": 0.1630440919098967, + "grad_norm": 7.771794319152832, + "learning_rate": 9.546662532491533e-05, + "loss": 2.0879, + "step": 2166 + }, + { + "epoch": 0.16311936619055684, + "grad_norm": 5.329034328460693, + "learning_rate": 9.546155170728356e-05, + "loss": 2.2964, + "step": 2167 + }, + { + "epoch": 0.16319464047121698, + "grad_norm": 3.9763481616973877, + "learning_rate": 9.5456475387093e-05, + "loss": 2.6385, + "step": 2168 + }, + { + "epoch": 0.16326991475187716, + "grad_norm": 4.062328338623047, + "learning_rate": 9.54513963646454e-05, + "loss": 1.9477, + "step": 2169 + }, + { + "epoch": 0.1633451890325373, + "grad_norm": 4.845366954803467, + "learning_rate": 9.544631464024271e-05, + "loss": 2.2485, + "step": 2170 + }, + { + "epoch": 0.16342046331319746, + "grad_norm": 6.792723178863525, + "learning_rate": 9.544123021418701e-05, + "loss": 1.8903, + "step": 2171 + }, + { + "epoch": 0.16349573759385763, + "grad_norm": 4.109110355377197, + "learning_rate": 9.543614308678057e-05, + "loss": 2.359, + "step": 2172 + }, + { + "epoch": 0.16357101187451778, + "grad_norm": 5.6095356941223145, + "learning_rate": 9.543105325832577e-05, + "loss": 2.4139, + "step": 2173 + }, + { + "epoch": 0.16364628615517793, + "grad_norm": 4.7631754875183105, + "learning_rate": 9.542596072912524e-05, + "loss": 2.0527, + "step": 2174 + }, + { + "epoch": 0.16372156043583808, + "grad_norm": 3.9870476722717285, + "learning_rate": 9.542086549948167e-05, + "loss": 2.5087, + "step": 2175 + }, + { + "epoch": 0.16379683471649825, + "grad_norm": 4.602846622467041, + "learning_rate": 9.541576756969797e-05, + "loss": 2.2427, + "step": 2176 + }, + { + "epoch": 0.1638721089971584, + "grad_norm": 4.71099328994751, + "learning_rate": 9.54106669400772e-05, + "loss": 2.1087, + "step": 2177 + }, + { + "epoch": 0.16394738327781855, + "grad_norm": 5.087553024291992, + "learning_rate": 9.54055636109226e-05, + "loss": 1.7692, + "step": 2178 + }, + { + "epoch": 0.1640226575584787, + "grad_norm": 8.089323997497559, + "learning_rate": 9.540045758253751e-05, + "loss": 2.2295, + "step": 2179 + }, + { + "epoch": 0.16409793183913887, + "grad_norm": 5.659903526306152, + "learning_rate": 9.539534885522547e-05, + "loss": 2.0684, + "step": 2180 + }, + { + "epoch": 0.16417320611979902, + "grad_norm": 5.205198287963867, + "learning_rate": 9.539023742929022e-05, + "loss": 2.5589, + "step": 2181 + }, + { + "epoch": 0.16424848040045917, + "grad_norm": 6.978159427642822, + "learning_rate": 9.538512330503557e-05, + "loss": 2.1062, + "step": 2182 + }, + { + "epoch": 0.16432375468111932, + "grad_norm": 4.487802028656006, + "learning_rate": 9.538000648276559e-05, + "loss": 2.0254, + "step": 2183 + }, + { + "epoch": 0.1643990289617795, + "grad_norm": 3.3947670459747314, + "learning_rate": 9.537488696278443e-05, + "loss": 2.7997, + "step": 2184 + }, + { + "epoch": 0.16447430324243964, + "grad_norm": 4.046313762664795, + "learning_rate": 9.536976474539642e-05, + "loss": 1.9636, + "step": 2185 + }, + { + "epoch": 0.1645495775230998, + "grad_norm": 5.125520706176758, + "learning_rate": 9.536463983090608e-05, + "loss": 2.0246, + "step": 2186 + }, + { + "epoch": 0.16462485180375996, + "grad_norm": 4.133129596710205, + "learning_rate": 9.535951221961807e-05, + "loss": 2.3225, + "step": 2187 + }, + { + "epoch": 0.1647001260844201, + "grad_norm": 4.609368801116943, + "learning_rate": 9.53543819118372e-05, + "loss": 2.1459, + "step": 2188 + }, + { + "epoch": 0.16477540036508026, + "grad_norm": 5.235116004943848, + "learning_rate": 9.534924890786849e-05, + "loss": 2.0638, + "step": 2189 + }, + { + "epoch": 0.1648506746457404, + "grad_norm": 3.448789596557617, + "learning_rate": 9.534411320801704e-05, + "loss": 2.3114, + "step": 2190 + }, + { + "epoch": 0.16492594892640058, + "grad_norm": 5.083763599395752, + "learning_rate": 9.533897481258815e-05, + "loss": 1.968, + "step": 2191 + }, + { + "epoch": 0.16500122320706073, + "grad_norm": 3.7599713802337646, + "learning_rate": 9.533383372188731e-05, + "loss": 2.072, + "step": 2192 + }, + { + "epoch": 0.16507649748772088, + "grad_norm": 5.607828617095947, + "learning_rate": 9.532868993622015e-05, + "loss": 1.9586, + "step": 2193 + }, + { + "epoch": 0.16515177176838103, + "grad_norm": 6.540659427642822, + "learning_rate": 9.532354345589241e-05, + "loss": 2.1076, + "step": 2194 + }, + { + "epoch": 0.1652270460490412, + "grad_norm": 4.928903579711914, + "learning_rate": 9.531839428121008e-05, + "loss": 2.429, + "step": 2195 + }, + { + "epoch": 0.16530232032970135, + "grad_norm": 4.7645368576049805, + "learning_rate": 9.531324241247924e-05, + "loss": 2.0913, + "step": 2196 + }, + { + "epoch": 0.1653775946103615, + "grad_norm": 5.299357891082764, + "learning_rate": 9.530808785000614e-05, + "loss": 2.4863, + "step": 2197 + }, + { + "epoch": 0.16545286889102165, + "grad_norm": 5.263016223907471, + "learning_rate": 9.530293059409723e-05, + "loss": 2.1709, + "step": 2198 + }, + { + "epoch": 0.16552814317168182, + "grad_norm": 5.059325218200684, + "learning_rate": 9.52977706450591e-05, + "loss": 1.9798, + "step": 2199 + }, + { + "epoch": 0.16560341745234197, + "grad_norm": 5.113597393035889, + "learning_rate": 9.529260800319846e-05, + "loss": 2.1529, + "step": 2200 + }, + { + "epoch": 0.16567869173300212, + "grad_norm": 5.493249893188477, + "learning_rate": 9.528744266882224e-05, + "loss": 1.9849, + "step": 2201 + }, + { + "epoch": 0.16575396601366227, + "grad_norm": 3.821795701980591, + "learning_rate": 9.528227464223749e-05, + "loss": 1.9154, + "step": 2202 + }, + { + "epoch": 0.16582924029432244, + "grad_norm": 3.966338634490967, + "learning_rate": 9.527710392375144e-05, + "loss": 2.1736, + "step": 2203 + }, + { + "epoch": 0.1659045145749826, + "grad_norm": 4.234589576721191, + "learning_rate": 9.527193051367149e-05, + "loss": 1.9443, + "step": 2204 + }, + { + "epoch": 0.16597978885564274, + "grad_norm": 4.575554370880127, + "learning_rate": 9.526675441230517e-05, + "loss": 1.9432, + "step": 2205 + }, + { + "epoch": 0.16605506313630292, + "grad_norm": 4.4436540603637695, + "learning_rate": 9.526157561996018e-05, + "loss": 2.1861, + "step": 2206 + }, + { + "epoch": 0.16613033741696306, + "grad_norm": 3.480534553527832, + "learning_rate": 9.525639413694438e-05, + "loss": 2.171, + "step": 2207 + }, + { + "epoch": 0.1662056116976232, + "grad_norm": 4.550610065460205, + "learning_rate": 9.525120996356582e-05, + "loss": 1.8931, + "step": 2208 + }, + { + "epoch": 0.16628088597828336, + "grad_norm": 5.82284688949585, + "learning_rate": 9.524602310013266e-05, + "loss": 2.2064, + "step": 2209 + }, + { + "epoch": 0.16635616025894354, + "grad_norm": 5.232897758483887, + "learning_rate": 9.524083354695325e-05, + "loss": 2.0327, + "step": 2210 + }, + { + "epoch": 0.16643143453960368, + "grad_norm": 6.37138032913208, + "learning_rate": 9.52356413043361e-05, + "loss": 1.8247, + "step": 2211 + }, + { + "epoch": 0.16650670882026383, + "grad_norm": 5.429400444030762, + "learning_rate": 9.523044637258986e-05, + "loss": 2.1906, + "step": 2212 + }, + { + "epoch": 0.16658198310092398, + "grad_norm": 4.930382251739502, + "learning_rate": 9.522524875202338e-05, + "loss": 2.2628, + "step": 2213 + }, + { + "epoch": 0.16665725738158416, + "grad_norm": 5.107860565185547, + "learning_rate": 9.522004844294563e-05, + "loss": 2.3101, + "step": 2214 + }, + { + "epoch": 0.1667325316622443, + "grad_norm": 4.301032543182373, + "learning_rate": 9.521484544566573e-05, + "loss": 2.1433, + "step": 2215 + }, + { + "epoch": 0.16680780594290445, + "grad_norm": 6.7353739738464355, + "learning_rate": 9.520963976049302e-05, + "loss": 3.0137, + "step": 2216 + }, + { + "epoch": 0.1668830802235646, + "grad_norm": 4.559290885925293, + "learning_rate": 9.520443138773694e-05, + "loss": 2.8433, + "step": 2217 + }, + { + "epoch": 0.16695835450422478, + "grad_norm": 4.9988298416137695, + "learning_rate": 9.519922032770712e-05, + "loss": 2.1131, + "step": 2218 + }, + { + "epoch": 0.16703362878488492, + "grad_norm": 4.082030296325684, + "learning_rate": 9.519400658071336e-05, + "loss": 2.1254, + "step": 2219 + }, + { + "epoch": 0.16710890306554507, + "grad_norm": 4.1938323974609375, + "learning_rate": 9.518879014706556e-05, + "loss": 2.1296, + "step": 2220 + }, + { + "epoch": 0.16718417734620525, + "grad_norm": 4.549106121063232, + "learning_rate": 9.518357102707387e-05, + "loss": 2.1653, + "step": 2221 + }, + { + "epoch": 0.1672594516268654, + "grad_norm": 4.568601131439209, + "learning_rate": 9.517834922104851e-05, + "loss": 2.2272, + "step": 2222 + }, + { + "epoch": 0.16733472590752554, + "grad_norm": 4.738588809967041, + "learning_rate": 9.517312472929992e-05, + "loss": 2.3205, + "step": 2223 + }, + { + "epoch": 0.1674100001881857, + "grad_norm": 3.195651054382324, + "learning_rate": 9.516789755213868e-05, + "loss": 2.2214, + "step": 2224 + }, + { + "epoch": 0.16748527446884587, + "grad_norm": 4.565551280975342, + "learning_rate": 9.516266768987555e-05, + "loss": 1.8288, + "step": 2225 + }, + { + "epoch": 0.16756054874950602, + "grad_norm": 7.472075939178467, + "learning_rate": 9.51574351428214e-05, + "loss": 2.6066, + "step": 2226 + }, + { + "epoch": 0.16763582303016616, + "grad_norm": 5.391735553741455, + "learning_rate": 9.51521999112873e-05, + "loss": 2.3681, + "step": 2227 + }, + { + "epoch": 0.1677110973108263, + "grad_norm": 6.187355041503906, + "learning_rate": 9.514696199558448e-05, + "loss": 1.8019, + "step": 2228 + }, + { + "epoch": 0.1677863715914865, + "grad_norm": 4.559253692626953, + "learning_rate": 9.514172139602431e-05, + "loss": 1.9727, + "step": 2229 + }, + { + "epoch": 0.16786164587214664, + "grad_norm": 3.7746024131774902, + "learning_rate": 9.513647811291832e-05, + "loss": 2.0619, + "step": 2230 + }, + { + "epoch": 0.16793692015280678, + "grad_norm": 4.132672309875488, + "learning_rate": 9.513123214657821e-05, + "loss": 2.5438, + "step": 2231 + }, + { + "epoch": 0.16801219443346693, + "grad_norm": 3.707167148590088, + "learning_rate": 9.512598349731588e-05, + "loss": 2.1179, + "step": 2232 + }, + { + "epoch": 0.1680874687141271, + "grad_norm": 5.947897911071777, + "learning_rate": 9.512073216544329e-05, + "loss": 2.3137, + "step": 2233 + }, + { + "epoch": 0.16816274299478726, + "grad_norm": 4.157155513763428, + "learning_rate": 9.511547815127262e-05, + "loss": 1.8269, + "step": 2234 + }, + { + "epoch": 0.1682380172754474, + "grad_norm": 5.308967590332031, + "learning_rate": 9.511022145511624e-05, + "loss": 1.9455, + "step": 2235 + }, + { + "epoch": 0.16831329155610758, + "grad_norm": 5.097304344177246, + "learning_rate": 9.510496207728663e-05, + "loss": 2.0311, + "step": 2236 + }, + { + "epoch": 0.16838856583676773, + "grad_norm": 4.092731475830078, + "learning_rate": 9.509970001809645e-05, + "loss": 2.1319, + "step": 2237 + }, + { + "epoch": 0.16846384011742788, + "grad_norm": 4.651304721832275, + "learning_rate": 9.509443527785851e-05, + "loss": 2.1493, + "step": 2238 + }, + { + "epoch": 0.16853911439808802, + "grad_norm": 4.787350654602051, + "learning_rate": 9.508916785688578e-05, + "loss": 2.5233, + "step": 2239 + }, + { + "epoch": 0.1686143886787482, + "grad_norm": 4.076807022094727, + "learning_rate": 9.50838977554914e-05, + "loss": 2.1228, + "step": 2240 + }, + { + "epoch": 0.16868966295940835, + "grad_norm": 5.558131694793701, + "learning_rate": 9.507862497398864e-05, + "loss": 2.2416, + "step": 2241 + }, + { + "epoch": 0.1687649372400685, + "grad_norm": 6.108593463897705, + "learning_rate": 9.5073349512691e-05, + "loss": 1.9382, + "step": 2242 + }, + { + "epoch": 0.16884021152072864, + "grad_norm": 5.862201690673828, + "learning_rate": 9.506807137191205e-05, + "loss": 2.1214, + "step": 2243 + }, + { + "epoch": 0.16891548580138882, + "grad_norm": 4.9014763832092285, + "learning_rate": 9.506279055196556e-05, + "loss": 2.1792, + "step": 2244 + }, + { + "epoch": 0.16899076008204897, + "grad_norm": 4.522253036499023, + "learning_rate": 9.505750705316548e-05, + "loss": 2.0104, + "step": 2245 + }, + { + "epoch": 0.16906603436270912, + "grad_norm": 4.980876922607422, + "learning_rate": 9.505222087582589e-05, + "loss": 2.2206, + "step": 2246 + }, + { + "epoch": 0.16914130864336926, + "grad_norm": 4.8022050857543945, + "learning_rate": 9.504693202026104e-05, + "loss": 2.1094, + "step": 2247 + }, + { + "epoch": 0.16921658292402944, + "grad_norm": 6.1085896492004395, + "learning_rate": 9.504164048678533e-05, + "loss": 2.3155, + "step": 2248 + }, + { + "epoch": 0.1692918572046896, + "grad_norm": 4.865021705627441, + "learning_rate": 9.503634627571333e-05, + "loss": 2.1186, + "step": 2249 + }, + { + "epoch": 0.16936713148534974, + "grad_norm": 5.2590651512146, + "learning_rate": 9.503104938735978e-05, + "loss": 2.0465, + "step": 2250 + }, + { + "epoch": 0.16944240576600988, + "grad_norm": 4.784515857696533, + "learning_rate": 9.502574982203956e-05, + "loss": 2.0709, + "step": 2251 + }, + { + "epoch": 0.16951768004667006, + "grad_norm": 5.401655197143555, + "learning_rate": 9.502044758006769e-05, + "loss": 2.0731, + "step": 2252 + }, + { + "epoch": 0.1695929543273302, + "grad_norm": 3.9704253673553467, + "learning_rate": 9.501514266175938e-05, + "loss": 1.7908, + "step": 2253 + }, + { + "epoch": 0.16966822860799036, + "grad_norm": 4.363543510437012, + "learning_rate": 9.500983506743002e-05, + "loss": 2.0285, + "step": 2254 + }, + { + "epoch": 0.16974350288865053, + "grad_norm": 5.469841003417969, + "learning_rate": 9.50045247973951e-05, + "loss": 2.0921, + "step": 2255 + }, + { + "epoch": 0.16981877716931068, + "grad_norm": 5.638004302978516, + "learning_rate": 9.499921185197032e-05, + "loss": 2.2864, + "step": 2256 + }, + { + "epoch": 0.16989405144997083, + "grad_norm": 5.755684852600098, + "learning_rate": 9.499389623147151e-05, + "loss": 2.2282, + "step": 2257 + }, + { + "epoch": 0.16996932573063098, + "grad_norm": 4.243750095367432, + "learning_rate": 9.498857793621468e-05, + "loss": 1.8188, + "step": 2258 + }, + { + "epoch": 0.17004460001129115, + "grad_norm": 4.503023624420166, + "learning_rate": 9.498325696651597e-05, + "loss": 2.126, + "step": 2259 + }, + { + "epoch": 0.1701198742919513, + "grad_norm": 4.63602876663208, + "learning_rate": 9.497793332269171e-05, + "loss": 1.9809, + "step": 2260 + }, + { + "epoch": 0.17019514857261145, + "grad_norm": 5.731786251068115, + "learning_rate": 9.497260700505837e-05, + "loss": 2.1542, + "step": 2261 + }, + { + "epoch": 0.1702704228532716, + "grad_norm": 4.860467433929443, + "learning_rate": 9.496727801393257e-05, + "loss": 2.0546, + "step": 2262 + }, + { + "epoch": 0.17034569713393177, + "grad_norm": 6.777553081512451, + "learning_rate": 9.496194634963113e-05, + "loss": 2.1452, + "step": 2263 + }, + { + "epoch": 0.17042097141459192, + "grad_norm": 7.032948017120361, + "learning_rate": 9.495661201247099e-05, + "loss": 2.0653, + "step": 2264 + }, + { + "epoch": 0.17049624569525207, + "grad_norm": 5.93035888671875, + "learning_rate": 9.495127500276926e-05, + "loss": 2.2643, + "step": 2265 + }, + { + "epoch": 0.17057151997591222, + "grad_norm": 5.123088359832764, + "learning_rate": 9.494593532084322e-05, + "loss": 2.2344, + "step": 2266 + }, + { + "epoch": 0.1706467942565724, + "grad_norm": 3.9339468479156494, + "learning_rate": 9.494059296701027e-05, + "loss": 2.0217, + "step": 2267 + }, + { + "epoch": 0.17072206853723254, + "grad_norm": 4.163431167602539, + "learning_rate": 9.493524794158804e-05, + "loss": 2.0717, + "step": 2268 + }, + { + "epoch": 0.1707973428178927, + "grad_norm": 4.021878719329834, + "learning_rate": 9.492990024489423e-05, + "loss": 2.0992, + "step": 2269 + }, + { + "epoch": 0.17087261709855286, + "grad_norm": 4.295801162719727, + "learning_rate": 9.492454987724678e-05, + "loss": 2.1383, + "step": 2270 + }, + { + "epoch": 0.170947891379213, + "grad_norm": 7.184655666351318, + "learning_rate": 9.491919683896374e-05, + "loss": 2.3497, + "step": 2271 + }, + { + "epoch": 0.17102316565987316, + "grad_norm": 5.332753658294678, + "learning_rate": 9.491384113036335e-05, + "loss": 2.2226, + "step": 2272 + }, + { + "epoch": 0.1710984399405333, + "grad_norm": 6.286576747894287, + "learning_rate": 9.490848275176396e-05, + "loss": 2.6511, + "step": 2273 + }, + { + "epoch": 0.17117371422119348, + "grad_norm": 4.57448148727417, + "learning_rate": 9.490312170348412e-05, + "loss": 2.454, + "step": 2274 + }, + { + "epoch": 0.17124898850185363, + "grad_norm": 5.415542125701904, + "learning_rate": 9.489775798584255e-05, + "loss": 2.0048, + "step": 2275 + }, + { + "epoch": 0.17132426278251378, + "grad_norm": 5.4753594398498535, + "learning_rate": 9.48923915991581e-05, + "loss": 1.9631, + "step": 2276 + }, + { + "epoch": 0.17139953706317393, + "grad_norm": 3.6613900661468506, + "learning_rate": 9.488702254374977e-05, + "loss": 2.0597, + "step": 2277 + }, + { + "epoch": 0.1714748113438341, + "grad_norm": 3.686659574508667, + "learning_rate": 9.488165081993674e-05, + "loss": 1.9906, + "step": 2278 + }, + { + "epoch": 0.17155008562449425, + "grad_norm": 6.281643390655518, + "learning_rate": 9.487627642803835e-05, + "loss": 2.2516, + "step": 2279 + }, + { + "epoch": 0.1716253599051544, + "grad_norm": 5.62980318069458, + "learning_rate": 9.48708993683741e-05, + "loss": 2.0339, + "step": 2280 + }, + { + "epoch": 0.17170063418581455, + "grad_norm": 6.123843193054199, + "learning_rate": 9.486551964126363e-05, + "loss": 2.0877, + "step": 2281 + }, + { + "epoch": 0.17177590846647472, + "grad_norm": 5.146125316619873, + "learning_rate": 9.486013724702676e-05, + "loss": 2.2186, + "step": 2282 + }, + { + "epoch": 0.17185118274713487, + "grad_norm": 3.640549659729004, + "learning_rate": 9.485475218598343e-05, + "loss": 2.0595, + "step": 2283 + }, + { + "epoch": 0.17192645702779502, + "grad_norm": 4.005462169647217, + "learning_rate": 9.48493644584538e-05, + "loss": 2.0645, + "step": 2284 + }, + { + "epoch": 0.1720017313084552, + "grad_norm": 4.827098369598389, + "learning_rate": 9.484397406475813e-05, + "loss": 2.2185, + "step": 2285 + }, + { + "epoch": 0.17207700558911534, + "grad_norm": 5.123477935791016, + "learning_rate": 9.483858100521688e-05, + "loss": 1.9404, + "step": 2286 + }, + { + "epoch": 0.1721522798697755, + "grad_norm": 5.968159198760986, + "learning_rate": 9.483318528015064e-05, + "loss": 2.3006, + "step": 2287 + }, + { + "epoch": 0.17222755415043564, + "grad_norm": 4.824171543121338, + "learning_rate": 9.482778688988018e-05, + "loss": 1.9657, + "step": 2288 + }, + { + "epoch": 0.17230282843109582, + "grad_norm": 8.66939926147461, + "learning_rate": 9.48223858347264e-05, + "loss": 2.1769, + "step": 2289 + }, + { + "epoch": 0.17237810271175596, + "grad_norm": 10.327978134155273, + "learning_rate": 9.481698211501042e-05, + "loss": 2.022, + "step": 2290 + }, + { + "epoch": 0.1724533769924161, + "grad_norm": 5.406477928161621, + "learning_rate": 9.481157573105343e-05, + "loss": 1.9567, + "step": 2291 + }, + { + "epoch": 0.17252865127307626, + "grad_norm": 4.529332160949707, + "learning_rate": 9.480616668317685e-05, + "loss": 2.5469, + "step": 2292 + }, + { + "epoch": 0.17260392555373644, + "grad_norm": 5.656266212463379, + "learning_rate": 9.480075497170221e-05, + "loss": 2.2001, + "step": 2293 + }, + { + "epoch": 0.17267919983439659, + "grad_norm": 4.836466312408447, + "learning_rate": 9.479534059695125e-05, + "loss": 2.7274, + "step": 2294 + }, + { + "epoch": 0.17275447411505673, + "grad_norm": 3.5666630268096924, + "learning_rate": 9.478992355924581e-05, + "loss": 1.9948, + "step": 2295 + }, + { + "epoch": 0.17282974839571688, + "grad_norm": 4.9752516746521, + "learning_rate": 9.478450385890793e-05, + "loss": 2.088, + "step": 2296 + }, + { + "epoch": 0.17290502267637706, + "grad_norm": 3.6577799320220947, + "learning_rate": 9.477908149625981e-05, + "loss": 2.086, + "step": 2297 + }, + { + "epoch": 0.1729802969570372, + "grad_norm": 3.805631160736084, + "learning_rate": 9.477365647162377e-05, + "loss": 1.9465, + "step": 2298 + }, + { + "epoch": 0.17305557123769735, + "grad_norm": 4.894316673278809, + "learning_rate": 9.476822878532235e-05, + "loss": 2.0717, + "step": 2299 + }, + { + "epoch": 0.17313084551835753, + "grad_norm": 3.622702121734619, + "learning_rate": 9.476279843767814e-05, + "loss": 1.9883, + "step": 2300 + }, + { + "epoch": 0.17320611979901768, + "grad_norm": 3.7990994453430176, + "learning_rate": 9.475736542901404e-05, + "loss": 2.0218, + "step": 2301 + }, + { + "epoch": 0.17328139407967783, + "grad_norm": 3.273437976837158, + "learning_rate": 9.475192975965296e-05, + "loss": 1.9075, + "step": 2302 + }, + { + "epoch": 0.17335666836033797, + "grad_norm": 4.26989221572876, + "learning_rate": 9.474649142991808e-05, + "loss": 2.4987, + "step": 2303 + }, + { + "epoch": 0.17343194264099815, + "grad_norm": 4.140346527099609, + "learning_rate": 9.474105044013267e-05, + "loss": 2.2862, + "step": 2304 + }, + { + "epoch": 0.1735072169216583, + "grad_norm": 4.889487266540527, + "learning_rate": 9.473560679062018e-05, + "loss": 1.8517, + "step": 2305 + }, + { + "epoch": 0.17358249120231845, + "grad_norm": 3.732370615005493, + "learning_rate": 9.473016048170424e-05, + "loss": 2.1845, + "step": 2306 + }, + { + "epoch": 0.1736577654829786, + "grad_norm": 4.43004846572876, + "learning_rate": 9.472471151370861e-05, + "loss": 2.192, + "step": 2307 + }, + { + "epoch": 0.17373303976363877, + "grad_norm": 5.592849254608154, + "learning_rate": 9.47192598869572e-05, + "loss": 2.6197, + "step": 2308 + }, + { + "epoch": 0.17380831404429892, + "grad_norm": 6.032291412353516, + "learning_rate": 9.47138056017741e-05, + "loss": 2.0181, + "step": 2309 + }, + { + "epoch": 0.17388358832495907, + "grad_norm": 5.228835582733154, + "learning_rate": 9.470834865848357e-05, + "loss": 2.2855, + "step": 2310 + }, + { + "epoch": 0.1739588626056192, + "grad_norm": 5.930988311767578, + "learning_rate": 9.470288905740998e-05, + "loss": 2.1199, + "step": 2311 + }, + { + "epoch": 0.1740341368862794, + "grad_norm": 5.700383186340332, + "learning_rate": 9.469742679887791e-05, + "loss": 2.2224, + "step": 2312 + }, + { + "epoch": 0.17410941116693954, + "grad_norm": 5.302191257476807, + "learning_rate": 9.469196188321207e-05, + "loss": 2.0117, + "step": 2313 + }, + { + "epoch": 0.17418468544759969, + "grad_norm": 5.366429805755615, + "learning_rate": 9.468649431073733e-05, + "loss": 2.1309, + "step": 2314 + }, + { + "epoch": 0.17425995972825983, + "grad_norm": 4.630857944488525, + "learning_rate": 9.468102408177871e-05, + "loss": 2.1334, + "step": 2315 + }, + { + "epoch": 0.17433523400892, + "grad_norm": 4.050517559051514, + "learning_rate": 9.467555119666143e-05, + "loss": 2.3049, + "step": 2316 + }, + { + "epoch": 0.17441050828958016, + "grad_norm": 4.166477203369141, + "learning_rate": 9.467007565571081e-05, + "loss": 2.1528, + "step": 2317 + }, + { + "epoch": 0.1744857825702403, + "grad_norm": 3.5567290782928467, + "learning_rate": 9.466459745925237e-05, + "loss": 2.0358, + "step": 2318 + }, + { + "epoch": 0.17456105685090048, + "grad_norm": 3.938577175140381, + "learning_rate": 9.465911660761177e-05, + "loss": 1.953, + "step": 2319 + }, + { + "epoch": 0.17463633113156063, + "grad_norm": 4.731121063232422, + "learning_rate": 9.465363310111483e-05, + "loss": 2.2991, + "step": 2320 + }, + { + "epoch": 0.17471160541222078, + "grad_norm": 4.032708168029785, + "learning_rate": 9.464814694008752e-05, + "loss": 1.9237, + "step": 2321 + }, + { + "epoch": 0.17478687969288093, + "grad_norm": 4.967084884643555, + "learning_rate": 9.464265812485598e-05, + "loss": 2.3047, + "step": 2322 + }, + { + "epoch": 0.1748621539735411, + "grad_norm": 4.150388240814209, + "learning_rate": 9.463716665574652e-05, + "loss": 2.2523, + "step": 2323 + }, + { + "epoch": 0.17493742825420125, + "grad_norm": 5.334815979003906, + "learning_rate": 9.463167253308559e-05, + "loss": 2.1501, + "step": 2324 + }, + { + "epoch": 0.1750127025348614, + "grad_norm": 6.422043800354004, + "learning_rate": 9.462617575719977e-05, + "loss": 1.9845, + "step": 2325 + }, + { + "epoch": 0.17508797681552155, + "grad_norm": 4.570641040802002, + "learning_rate": 9.462067632841585e-05, + "loss": 2.1799, + "step": 2326 + }, + { + "epoch": 0.17516325109618172, + "grad_norm": 5.045809268951416, + "learning_rate": 9.461517424706075e-05, + "loss": 2.415, + "step": 2327 + }, + { + "epoch": 0.17523852537684187, + "grad_norm": 4.2256388664245605, + "learning_rate": 9.460966951346157e-05, + "loss": 1.8744, + "step": 2328 + }, + { + "epoch": 0.17531379965750202, + "grad_norm": 4.779211044311523, + "learning_rate": 9.460416212794554e-05, + "loss": 2.1445, + "step": 2329 + }, + { + "epoch": 0.17538907393816217, + "grad_norm": 5.90493106842041, + "learning_rate": 9.459865209084004e-05, + "loss": 2.6916, + "step": 2330 + }, + { + "epoch": 0.17546434821882234, + "grad_norm": 5.32578706741333, + "learning_rate": 9.459313940247266e-05, + "loss": 2.2656, + "step": 2331 + }, + { + "epoch": 0.1755396224994825, + "grad_norm": 6.139587879180908, + "learning_rate": 9.458762406317107e-05, + "loss": 2.2693, + "step": 2332 + }, + { + "epoch": 0.17561489678014264, + "grad_norm": 5.8783674240112305, + "learning_rate": 9.45821060732632e-05, + "loss": 2.7173, + "step": 2333 + }, + { + "epoch": 0.1756901710608028, + "grad_norm": 5.068629741668701, + "learning_rate": 9.457658543307701e-05, + "loss": 1.6876, + "step": 2334 + }, + { + "epoch": 0.17576544534146296, + "grad_norm": 4.986094951629639, + "learning_rate": 9.457106214294074e-05, + "loss": 2.3307, + "step": 2335 + }, + { + "epoch": 0.1758407196221231, + "grad_norm": 5.165501594543457, + "learning_rate": 9.456553620318269e-05, + "loss": 2.2215, + "step": 2336 + }, + { + "epoch": 0.17591599390278326, + "grad_norm": 5.441031455993652, + "learning_rate": 9.456000761413141e-05, + "loss": 2.0878, + "step": 2337 + }, + { + "epoch": 0.17599126818344343, + "grad_norm": 5.335861682891846, + "learning_rate": 9.455447637611552e-05, + "loss": 2.1241, + "step": 2338 + }, + { + "epoch": 0.17606654246410358, + "grad_norm": 6.4383544921875, + "learning_rate": 9.454894248946386e-05, + "loss": 2.2103, + "step": 2339 + }, + { + "epoch": 0.17614181674476373, + "grad_norm": 8.162375450134277, + "learning_rate": 9.45434059545054e-05, + "loss": 2.0437, + "step": 2340 + }, + { + "epoch": 0.17621709102542388, + "grad_norm": 5.421572208404541, + "learning_rate": 9.453786677156927e-05, + "loss": 2.5892, + "step": 2341 + }, + { + "epoch": 0.17629236530608405, + "grad_norm": 6.447846412658691, + "learning_rate": 9.453232494098473e-05, + "loss": 2.038, + "step": 2342 + }, + { + "epoch": 0.1763676395867442, + "grad_norm": 6.637684345245361, + "learning_rate": 9.452678046308126e-05, + "loss": 2.1189, + "step": 2343 + }, + { + "epoch": 0.17644291386740435, + "grad_norm": 4.693938732147217, + "learning_rate": 9.452123333818845e-05, + "loss": 2.3551, + "step": 2344 + }, + { + "epoch": 0.1765181881480645, + "grad_norm": 4.605899333953857, + "learning_rate": 9.451568356663607e-05, + "loss": 2.2943, + "step": 2345 + }, + { + "epoch": 0.17659346242872467, + "grad_norm": 5.670785903930664, + "learning_rate": 9.451013114875404e-05, + "loss": 2.3798, + "step": 2346 + }, + { + "epoch": 0.17666873670938482, + "grad_norm": 4.882007122039795, + "learning_rate": 9.450457608487242e-05, + "loss": 1.9727, + "step": 2347 + }, + { + "epoch": 0.17674401099004497, + "grad_norm": 4.417944431304932, + "learning_rate": 9.449901837532145e-05, + "loss": 2.0738, + "step": 2348 + }, + { + "epoch": 0.17681928527070515, + "grad_norm": 4.64418363571167, + "learning_rate": 9.44934580204315e-05, + "loss": 2.189, + "step": 2349 + }, + { + "epoch": 0.1768945595513653, + "grad_norm": 3.882991075515747, + "learning_rate": 9.448789502053316e-05, + "loss": 1.847, + "step": 2350 + }, + { + "epoch": 0.17696983383202544, + "grad_norm": 5.01652717590332, + "learning_rate": 9.44823293759571e-05, + "loss": 2.2999, + "step": 2351 + }, + { + "epoch": 0.1770451081126856, + "grad_norm": 4.760298728942871, + "learning_rate": 9.447676108703421e-05, + "loss": 2.1113, + "step": 2352 + }, + { + "epoch": 0.17712038239334577, + "grad_norm": 4.580170631408691, + "learning_rate": 9.447119015409547e-05, + "loss": 2.2249, + "step": 2353 + }, + { + "epoch": 0.1771956566740059, + "grad_norm": 6.8296895027160645, + "learning_rate": 9.446561657747209e-05, + "loss": 2.3861, + "step": 2354 + }, + { + "epoch": 0.17727093095466606, + "grad_norm": 5.590836048126221, + "learning_rate": 9.446004035749538e-05, + "loss": 2.2823, + "step": 2355 + }, + { + "epoch": 0.1773462052353262, + "grad_norm": 7.718230724334717, + "learning_rate": 9.445446149449686e-05, + "loss": 2.2516, + "step": 2356 + }, + { + "epoch": 0.17742147951598639, + "grad_norm": 3.219867706298828, + "learning_rate": 9.444887998880814e-05, + "loss": 2.2606, + "step": 2357 + }, + { + "epoch": 0.17749675379664653, + "grad_norm": 4.374912261962891, + "learning_rate": 9.444329584076104e-05, + "loss": 1.9019, + "step": 2358 + }, + { + "epoch": 0.17757202807730668, + "grad_norm": 4.683414459228516, + "learning_rate": 9.443770905068754e-05, + "loss": 2.2066, + "step": 2359 + }, + { + "epoch": 0.17764730235796683, + "grad_norm": 4.491998672485352, + "learning_rate": 9.443211961891973e-05, + "loss": 2.3096, + "step": 2360 + }, + { + "epoch": 0.177722576638627, + "grad_norm": 4.711297988891602, + "learning_rate": 9.442652754578991e-05, + "loss": 1.9365, + "step": 2361 + }, + { + "epoch": 0.17779785091928715, + "grad_norm": 7.7094292640686035, + "learning_rate": 9.44209328316305e-05, + "loss": 2.1713, + "step": 2362 + }, + { + "epoch": 0.1778731251999473, + "grad_norm": 5.9597063064575195, + "learning_rate": 9.441533547677408e-05, + "loss": 2.408, + "step": 2363 + }, + { + "epoch": 0.17794839948060745, + "grad_norm": 4.834662437438965, + "learning_rate": 9.440973548155342e-05, + "loss": 2.2714, + "step": 2364 + }, + { + "epoch": 0.17802367376126763, + "grad_norm": 5.837653636932373, + "learning_rate": 9.440413284630141e-05, + "loss": 2.1192, + "step": 2365 + }, + { + "epoch": 0.17809894804192777, + "grad_norm": 3.679480791091919, + "learning_rate": 9.439852757135111e-05, + "loss": 1.9432, + "step": 2366 + }, + { + "epoch": 0.17817422232258792, + "grad_norm": 5.310233116149902, + "learning_rate": 9.439291965703573e-05, + "loss": 2.2702, + "step": 2367 + }, + { + "epoch": 0.1782494966032481, + "grad_norm": 8.406325340270996, + "learning_rate": 9.438730910368867e-05, + "loss": 2.366, + "step": 2368 + }, + { + "epoch": 0.17832477088390825, + "grad_norm": 4.908152103424072, + "learning_rate": 9.438169591164343e-05, + "loss": 2.0483, + "step": 2369 + }, + { + "epoch": 0.1784000451645684, + "grad_norm": 4.152422904968262, + "learning_rate": 9.437608008123374e-05, + "loss": 1.8626, + "step": 2370 + }, + { + "epoch": 0.17847531944522854, + "grad_norm": 4.660198211669922, + "learning_rate": 9.437046161279339e-05, + "loss": 2.3298, + "step": 2371 + }, + { + "epoch": 0.17855059372588872, + "grad_norm": 5.6455979347229, + "learning_rate": 9.436484050665641e-05, + "loss": 2.0798, + "step": 2372 + }, + { + "epoch": 0.17862586800654887, + "grad_norm": 4.440837383270264, + "learning_rate": 9.435921676315698e-05, + "loss": 1.9895, + "step": 2373 + }, + { + "epoch": 0.178701142287209, + "grad_norm": 4.986636638641357, + "learning_rate": 9.435359038262939e-05, + "loss": 2.1823, + "step": 2374 + }, + { + "epoch": 0.17877641656786916, + "grad_norm": 5.736190319061279, + "learning_rate": 9.43479613654081e-05, + "loss": 1.9994, + "step": 2375 + }, + { + "epoch": 0.17885169084852934, + "grad_norm": 6.821258068084717, + "learning_rate": 9.434232971182777e-05, + "loss": 1.9361, + "step": 2376 + }, + { + "epoch": 0.17892696512918949, + "grad_norm": 4.035789966583252, + "learning_rate": 9.433669542222317e-05, + "loss": 2.0181, + "step": 2377 + }, + { + "epoch": 0.17900223940984963, + "grad_norm": 4.947262763977051, + "learning_rate": 9.433105849692923e-05, + "loss": 2.2581, + "step": 2378 + }, + { + "epoch": 0.17907751369050978, + "grad_norm": 5.483085632324219, + "learning_rate": 9.432541893628107e-05, + "loss": 1.9159, + "step": 2379 + }, + { + "epoch": 0.17915278797116996, + "grad_norm": 6.753114700317383, + "learning_rate": 9.431977674061394e-05, + "loss": 2.3465, + "step": 2380 + }, + { + "epoch": 0.1792280622518301, + "grad_norm": 5.069437026977539, + "learning_rate": 9.431413191026325e-05, + "loss": 2.2337, + "step": 2381 + }, + { + "epoch": 0.17930333653249025, + "grad_norm": 4.258182048797607, + "learning_rate": 9.430848444556457e-05, + "loss": 2.0306, + "step": 2382 + }, + { + "epoch": 0.17937861081315043, + "grad_norm": 5.638947010040283, + "learning_rate": 9.430283434685363e-05, + "loss": 1.8044, + "step": 2383 + }, + { + "epoch": 0.17945388509381058, + "grad_norm": 4.735197067260742, + "learning_rate": 9.42971816144663e-05, + "loss": 2.0935, + "step": 2384 + }, + { + "epoch": 0.17952915937447073, + "grad_norm": 5.030169486999512, + "learning_rate": 9.429152624873863e-05, + "loss": 2.1498, + "step": 2385 + }, + { + "epoch": 0.17960443365513087, + "grad_norm": 4.605495929718018, + "learning_rate": 9.42858682500068e-05, + "loss": 2.2142, + "step": 2386 + }, + { + "epoch": 0.17967970793579105, + "grad_norm": 4.7468485832214355, + "learning_rate": 9.428020761860719e-05, + "loss": 2.1551, + "step": 2387 + }, + { + "epoch": 0.1797549822164512, + "grad_norm": 4.07126522064209, + "learning_rate": 9.427454435487628e-05, + "loss": 2.2395, + "step": 2388 + }, + { + "epoch": 0.17983025649711135, + "grad_norm": 3.910022258758545, + "learning_rate": 9.426887845915074e-05, + "loss": 2.0808, + "step": 2389 + }, + { + "epoch": 0.1799055307777715, + "grad_norm": 4.715250492095947, + "learning_rate": 9.426320993176742e-05, + "loss": 1.5938, + "step": 2390 + }, + { + "epoch": 0.17998080505843167, + "grad_norm": 4.89539098739624, + "learning_rate": 9.425753877306326e-05, + "loss": 2.1834, + "step": 2391 + }, + { + "epoch": 0.18005607933909182, + "grad_norm": 4.629598140716553, + "learning_rate": 9.42518649833754e-05, + "loss": 2.038, + "step": 2392 + }, + { + "epoch": 0.18013135361975197, + "grad_norm": 6.25762939453125, + "learning_rate": 9.424618856304115e-05, + "loss": 2.1019, + "step": 2393 + }, + { + "epoch": 0.18020662790041211, + "grad_norm": 4.406968593597412, + "learning_rate": 9.424050951239796e-05, + "loss": 2.128, + "step": 2394 + }, + { + "epoch": 0.1802819021810723, + "grad_norm": 3.9285688400268555, + "learning_rate": 9.42348278317834e-05, + "loss": 1.7804, + "step": 2395 + }, + { + "epoch": 0.18035717646173244, + "grad_norm": 4.2842631340026855, + "learning_rate": 9.422914352153524e-05, + "loss": 2.0389, + "step": 2396 + }, + { + "epoch": 0.18043245074239259, + "grad_norm": 6.087997913360596, + "learning_rate": 9.422345658199144e-05, + "loss": 2.2996, + "step": 2397 + }, + { + "epoch": 0.18050772502305276, + "grad_norm": 4.773106098175049, + "learning_rate": 9.421776701349002e-05, + "loss": 2.0469, + "step": 2398 + }, + { + "epoch": 0.1805829993037129, + "grad_norm": 5.146775722503662, + "learning_rate": 9.421207481636922e-05, + "loss": 1.7992, + "step": 2399 + }, + { + "epoch": 0.18065827358437306, + "grad_norm": 5.000457286834717, + "learning_rate": 9.420637999096744e-05, + "loss": 2.3456, + "step": 2400 + }, + { + "epoch": 0.1807335478650332, + "grad_norm": 6.762832164764404, + "learning_rate": 9.42006825376232e-05, + "loss": 1.8799, + "step": 2401 + }, + { + "epoch": 0.18080882214569338, + "grad_norm": 5.668283462524414, + "learning_rate": 9.419498245667522e-05, + "loss": 2.0409, + "step": 2402 + }, + { + "epoch": 0.18088409642635353, + "grad_norm": 5.641218185424805, + "learning_rate": 9.418927974846234e-05, + "loss": 1.8974, + "step": 2403 + }, + { + "epoch": 0.18095937070701368, + "grad_norm": 6.720699787139893, + "learning_rate": 9.418357441332358e-05, + "loss": 2.246, + "step": 2404 + }, + { + "epoch": 0.18103464498767383, + "grad_norm": 7.127935409545898, + "learning_rate": 9.417786645159807e-05, + "loss": 2.9731, + "step": 2405 + }, + { + "epoch": 0.181109919268334, + "grad_norm": 4.489436149597168, + "learning_rate": 9.417215586362518e-05, + "loss": 2.1494, + "step": 2406 + }, + { + "epoch": 0.18118519354899415, + "grad_norm": 4.5200910568237305, + "learning_rate": 9.416644264974437e-05, + "loss": 2.3272, + "step": 2407 + }, + { + "epoch": 0.1812604678296543, + "grad_norm": 5.34370756149292, + "learning_rate": 9.416072681029526e-05, + "loss": 1.9206, + "step": 2408 + }, + { + "epoch": 0.18133574211031445, + "grad_norm": 4.176974296569824, + "learning_rate": 9.415500834561768e-05, + "loss": 1.9707, + "step": 2409 + }, + { + "epoch": 0.18141101639097462, + "grad_norm": 4.307436943054199, + "learning_rate": 9.414928725605152e-05, + "loss": 2.0193, + "step": 2410 + }, + { + "epoch": 0.18148629067163477, + "grad_norm": 4.275622844696045, + "learning_rate": 9.414356354193692e-05, + "loss": 2.0408, + "step": 2411 + }, + { + "epoch": 0.18156156495229492, + "grad_norm": 4.892441749572754, + "learning_rate": 9.413783720361413e-05, + "loss": 2.0104, + "step": 2412 + }, + { + "epoch": 0.18163683923295507, + "grad_norm": 4.65931510925293, + "learning_rate": 9.413210824142358e-05, + "loss": 2.3706, + "step": 2413 + }, + { + "epoch": 0.18171211351361524, + "grad_norm": 3.8563642501831055, + "learning_rate": 9.412637665570581e-05, + "loss": 2.2299, + "step": 2414 + }, + { + "epoch": 0.1817873877942754, + "grad_norm": 5.556540489196777, + "learning_rate": 9.412064244680154e-05, + "loss": 2.1063, + "step": 2415 + }, + { + "epoch": 0.18186266207493554, + "grad_norm": 6.540408611297607, + "learning_rate": 9.411490561505171e-05, + "loss": 2.0972, + "step": 2416 + }, + { + "epoch": 0.18193793635559571, + "grad_norm": 4.668656349182129, + "learning_rate": 9.41091661607973e-05, + "loss": 2.2057, + "step": 2417 + }, + { + "epoch": 0.18201321063625586, + "grad_norm": 4.944366931915283, + "learning_rate": 9.410342408437953e-05, + "loss": 2.0218, + "step": 2418 + }, + { + "epoch": 0.182088484916916, + "grad_norm": 6.270229339599609, + "learning_rate": 9.409767938613973e-05, + "loss": 2.0377, + "step": 2419 + }, + { + "epoch": 0.18216375919757616, + "grad_norm": 5.974524974822998, + "learning_rate": 9.409193206641944e-05, + "loss": 1.8561, + "step": 2420 + }, + { + "epoch": 0.18223903347823633, + "grad_norm": 5.791412353515625, + "learning_rate": 9.408618212556031e-05, + "loss": 2.6372, + "step": 2421 + }, + { + "epoch": 0.18231430775889648, + "grad_norm": 4.222814083099365, + "learning_rate": 9.408042956390412e-05, + "loss": 1.8803, + "step": 2422 + }, + { + "epoch": 0.18238958203955663, + "grad_norm": 4.217424392700195, + "learning_rate": 9.40746743817929e-05, + "loss": 2.1173, + "step": 2423 + }, + { + "epoch": 0.18246485632021678, + "grad_norm": 4.1311798095703125, + "learning_rate": 9.406891657956875e-05, + "loss": 1.9785, + "step": 2424 + }, + { + "epoch": 0.18254013060087695, + "grad_norm": 4.623928546905518, + "learning_rate": 9.406315615757396e-05, + "loss": 2.0211, + "step": 2425 + }, + { + "epoch": 0.1826154048815371, + "grad_norm": 4.721377849578857, + "learning_rate": 9.405739311615094e-05, + "loss": 2.1413, + "step": 2426 + }, + { + "epoch": 0.18269067916219725, + "grad_norm": 3.957151174545288, + "learning_rate": 9.405162745564233e-05, + "loss": 1.9126, + "step": 2427 + }, + { + "epoch": 0.1827659534428574, + "grad_norm": 4.023303985595703, + "learning_rate": 9.404585917639088e-05, + "loss": 2.2212, + "step": 2428 + }, + { + "epoch": 0.18284122772351757, + "grad_norm": 3.649458885192871, + "learning_rate": 9.404008827873946e-05, + "loss": 1.9352, + "step": 2429 + }, + { + "epoch": 0.18291650200417772, + "grad_norm": 4.543708801269531, + "learning_rate": 9.403431476303118e-05, + "loss": 2.0626, + "step": 2430 + }, + { + "epoch": 0.18299177628483787, + "grad_norm": 4.4156107902526855, + "learning_rate": 9.402853862960923e-05, + "loss": 2.1769, + "step": 2431 + }, + { + "epoch": 0.18306705056549805, + "grad_norm": 4.563872337341309, + "learning_rate": 9.402275987881698e-05, + "loss": 2.5336, + "step": 2432 + }, + { + "epoch": 0.1831423248461582, + "grad_norm": 6.249185085296631, + "learning_rate": 9.401697851099798e-05, + "loss": 2.2064, + "step": 2433 + }, + { + "epoch": 0.18321759912681834, + "grad_norm": 3.949712038040161, + "learning_rate": 9.40111945264959e-05, + "loss": 2.1051, + "step": 2434 + }, + { + "epoch": 0.1832928734074785, + "grad_norm": 4.53810453414917, + "learning_rate": 9.400540792565459e-05, + "loss": 2.0903, + "step": 2435 + }, + { + "epoch": 0.18336814768813867, + "grad_norm": 5.742558002471924, + "learning_rate": 9.399961870881804e-05, + "loss": 2.2207, + "step": 2436 + }, + { + "epoch": 0.18344342196879881, + "grad_norm": 4.680918216705322, + "learning_rate": 9.399382687633042e-05, + "loss": 1.8058, + "step": 2437 + }, + { + "epoch": 0.18351869624945896, + "grad_norm": 5.94005012512207, + "learning_rate": 9.398803242853601e-05, + "loss": 2.2041, + "step": 2438 + }, + { + "epoch": 0.1835939705301191, + "grad_norm": 7.358811855316162, + "learning_rate": 9.39822353657793e-05, + "loss": 1.9529, + "step": 2439 + }, + { + "epoch": 0.1836692448107793, + "grad_norm": 5.252130031585693, + "learning_rate": 9.397643568840488e-05, + "loss": 2.3559, + "step": 2440 + }, + { + "epoch": 0.18374451909143943, + "grad_norm": 4.878549098968506, + "learning_rate": 9.397063339675755e-05, + "loss": 2.1199, + "step": 2441 + }, + { + "epoch": 0.18381979337209958, + "grad_norm": 3.991593599319458, + "learning_rate": 9.396482849118224e-05, + "loss": 2.0138, + "step": 2442 + }, + { + "epoch": 0.18389506765275973, + "grad_norm": 5.3004865646362305, + "learning_rate": 9.395902097202403e-05, + "loss": 2.137, + "step": 2443 + }, + { + "epoch": 0.1839703419334199, + "grad_norm": 6.8371782302856445, + "learning_rate": 9.395321083962813e-05, + "loss": 1.7962, + "step": 2444 + }, + { + "epoch": 0.18404561621408005, + "grad_norm": 6.035629749298096, + "learning_rate": 9.394739809433998e-05, + "loss": 2.2543, + "step": 2445 + }, + { + "epoch": 0.1841208904947402, + "grad_norm": 4.844130516052246, + "learning_rate": 9.39415827365051e-05, + "loss": 2.0448, + "step": 2446 + }, + { + "epoch": 0.18419616477540038, + "grad_norm": 6.140071392059326, + "learning_rate": 9.393576476646923e-05, + "loss": 2.2053, + "step": 2447 + }, + { + "epoch": 0.18427143905606053, + "grad_norm": 5.780943393707275, + "learning_rate": 9.39299441845782e-05, + "loss": 2.0894, + "step": 2448 + }, + { + "epoch": 0.18434671333672067, + "grad_norm": 4.227553367614746, + "learning_rate": 9.392412099117802e-05, + "loss": 1.7292, + "step": 2449 + }, + { + "epoch": 0.18442198761738082, + "grad_norm": 4.283457279205322, + "learning_rate": 9.39182951866149e-05, + "loss": 2.121, + "step": 2450 + }, + { + "epoch": 0.184497261898041, + "grad_norm": 4.520118236541748, + "learning_rate": 9.391246677123514e-05, + "loss": 2.16, + "step": 2451 + }, + { + "epoch": 0.18457253617870115, + "grad_norm": 3.396616220474243, + "learning_rate": 9.390663574538523e-05, + "loss": 2.1485, + "step": 2452 + }, + { + "epoch": 0.1846478104593613, + "grad_norm": 4.4132866859436035, + "learning_rate": 9.390080210941181e-05, + "loss": 1.9867, + "step": 2453 + }, + { + "epoch": 0.18472308474002144, + "grad_norm": 3.8098809719085693, + "learning_rate": 9.389496586366167e-05, + "loss": 2.4707, + "step": 2454 + }, + { + "epoch": 0.18479835902068162, + "grad_norm": 4.23969030380249, + "learning_rate": 9.388912700848176e-05, + "loss": 2.2455, + "step": 2455 + }, + { + "epoch": 0.18487363330134177, + "grad_norm": 5.872971057891846, + "learning_rate": 9.388328554421917e-05, + "loss": 2.1556, + "step": 2456 + }, + { + "epoch": 0.18494890758200191, + "grad_norm": 5.10668420791626, + "learning_rate": 9.387744147122117e-05, + "loss": 2.2551, + "step": 2457 + }, + { + "epoch": 0.18502418186266206, + "grad_norm": 5.751786231994629, + "learning_rate": 9.387159478983518e-05, + "loss": 2.0982, + "step": 2458 + }, + { + "epoch": 0.18509945614332224, + "grad_norm": 5.4257612228393555, + "learning_rate": 9.386574550040876e-05, + "loss": 1.9487, + "step": 2459 + }, + { + "epoch": 0.1851747304239824, + "grad_norm": 5.04498291015625, + "learning_rate": 9.385989360328964e-05, + "loss": 2.1612, + "step": 2460 + }, + { + "epoch": 0.18525000470464253, + "grad_norm": 4.236330986022949, + "learning_rate": 9.385403909882568e-05, + "loss": 2.2415, + "step": 2461 + }, + { + "epoch": 0.1853252789853027, + "grad_norm": 5.090738773345947, + "learning_rate": 9.384818198736496e-05, + "loss": 2.2152, + "step": 2462 + }, + { + "epoch": 0.18540055326596286, + "grad_norm": 4.343742847442627, + "learning_rate": 9.384232226925561e-05, + "loss": 2.2333, + "step": 2463 + }, + { + "epoch": 0.185475827546623, + "grad_norm": 4.728951930999756, + "learning_rate": 9.3836459944846e-05, + "loss": 2.2967, + "step": 2464 + }, + { + "epoch": 0.18555110182728315, + "grad_norm": 3.680865526199341, + "learning_rate": 9.383059501448462e-05, + "loss": 1.9406, + "step": 2465 + }, + { + "epoch": 0.18562637610794333, + "grad_norm": 5.822722911834717, + "learning_rate": 9.382472747852013e-05, + "loss": 1.9613, + "step": 2466 + }, + { + "epoch": 0.18570165038860348, + "grad_norm": 5.067307949066162, + "learning_rate": 9.381885733730136e-05, + "loss": 2.5257, + "step": 2467 + }, + { + "epoch": 0.18577692466926363, + "grad_norm": 4.1995530128479, + "learning_rate": 9.381298459117723e-05, + "loss": 2.0076, + "step": 2468 + }, + { + "epoch": 0.18585219894992377, + "grad_norm": 5.420987129211426, + "learning_rate": 9.380710924049689e-05, + "loss": 1.9664, + "step": 2469 + }, + { + "epoch": 0.18592747323058395, + "grad_norm": 4.720597267150879, + "learning_rate": 9.380123128560961e-05, + "loss": 1.7115, + "step": 2470 + }, + { + "epoch": 0.1860027475112441, + "grad_norm": 5.392365455627441, + "learning_rate": 9.379535072686479e-05, + "loss": 1.8641, + "step": 2471 + }, + { + "epoch": 0.18607802179190425, + "grad_norm": 5.885098934173584, + "learning_rate": 9.378946756461205e-05, + "loss": 2.2107, + "step": 2472 + }, + { + "epoch": 0.1861532960725644, + "grad_norm": 5.320254325866699, + "learning_rate": 9.378358179920111e-05, + "loss": 1.9729, + "step": 2473 + }, + { + "epoch": 0.18622857035322457, + "grad_norm": 4.235252857208252, + "learning_rate": 9.377769343098185e-05, + "loss": 2.2764, + "step": 2474 + }, + { + "epoch": 0.18630384463388472, + "grad_norm": 6.214223384857178, + "learning_rate": 9.377180246030432e-05, + "loss": 2.4978, + "step": 2475 + }, + { + "epoch": 0.18637911891454487, + "grad_norm": 7.149608135223389, + "learning_rate": 9.376590888751875e-05, + "loss": 2.4284, + "step": 2476 + }, + { + "epoch": 0.18645439319520501, + "grad_norm": 7.12389612197876, + "learning_rate": 9.376001271297546e-05, + "loss": 1.8931, + "step": 2477 + }, + { + "epoch": 0.1865296674758652, + "grad_norm": 5.371740341186523, + "learning_rate": 9.375411393702497e-05, + "loss": 2.0803, + "step": 2478 + }, + { + "epoch": 0.18660494175652534, + "grad_norm": 4.696757793426514, + "learning_rate": 9.374821256001798e-05, + "loss": 2.2489, + "step": 2479 + }, + { + "epoch": 0.1866802160371855, + "grad_norm": 4.32313871383667, + "learning_rate": 9.374230858230525e-05, + "loss": 2.1105, + "step": 2480 + }, + { + "epoch": 0.18675549031784566, + "grad_norm": 4.14240026473999, + "learning_rate": 9.373640200423781e-05, + "loss": 2.3715, + "step": 2481 + }, + { + "epoch": 0.1868307645985058, + "grad_norm": 4.030320167541504, + "learning_rate": 9.373049282616674e-05, + "loss": 1.932, + "step": 2482 + }, + { + "epoch": 0.18690603887916596, + "grad_norm": 4.575912952423096, + "learning_rate": 9.372458104844337e-05, + "loss": 2.2811, + "step": 2483 + }, + { + "epoch": 0.1869813131598261, + "grad_norm": 4.238185882568359, + "learning_rate": 9.37186666714191e-05, + "loss": 1.9182, + "step": 2484 + }, + { + "epoch": 0.18705658744048628, + "grad_norm": 5.361331462860107, + "learning_rate": 9.371274969544556e-05, + "loss": 2.0284, + "step": 2485 + }, + { + "epoch": 0.18713186172114643, + "grad_norm": 4.422515869140625, + "learning_rate": 9.370683012087447e-05, + "loss": 1.9103, + "step": 2486 + }, + { + "epoch": 0.18720713600180658, + "grad_norm": 4.045429229736328, + "learning_rate": 9.370090794805772e-05, + "loss": 2.238, + "step": 2487 + }, + { + "epoch": 0.18728241028246673, + "grad_norm": 4.757269859313965, + "learning_rate": 9.369498317734741e-05, + "loss": 2.5913, + "step": 2488 + }, + { + "epoch": 0.1873576845631269, + "grad_norm": 4.8929643630981445, + "learning_rate": 9.368905580909571e-05, + "loss": 2.2632, + "step": 2489 + }, + { + "epoch": 0.18743295884378705, + "grad_norm": 3.796651840209961, + "learning_rate": 9.3683125843655e-05, + "loss": 2.1742, + "step": 2490 + }, + { + "epoch": 0.1875082331244472, + "grad_norm": 4.369163513183594, + "learning_rate": 9.36771932813778e-05, + "loss": 1.9691, + "step": 2491 + }, + { + "epoch": 0.18758350740510735, + "grad_norm": 5.370289325714111, + "learning_rate": 9.367125812261679e-05, + "loss": 2.1115, + "step": 2492 + }, + { + "epoch": 0.18765878168576752, + "grad_norm": 3.3248627185821533, + "learning_rate": 9.366532036772479e-05, + "loss": 2.1965, + "step": 2493 + }, + { + "epoch": 0.18773405596642767, + "grad_norm": 4.258205413818359, + "learning_rate": 9.365938001705476e-05, + "loss": 2.0163, + "step": 2494 + }, + { + "epoch": 0.18780933024708782, + "grad_norm": 4.4000468254089355, + "learning_rate": 9.365343707095989e-05, + "loss": 2.2695, + "step": 2495 + }, + { + "epoch": 0.187884604527748, + "grad_norm": 5.0567827224731445, + "learning_rate": 9.364749152979344e-05, + "loss": 2.2224, + "step": 2496 + }, + { + "epoch": 0.18795987880840814, + "grad_norm": 4.769439220428467, + "learning_rate": 9.364154339390884e-05, + "loss": 2.0068, + "step": 2497 + }, + { + "epoch": 0.1880351530890683, + "grad_norm": 3.545689105987549, + "learning_rate": 9.363559266365973e-05, + "loss": 2.0759, + "step": 2498 + }, + { + "epoch": 0.18811042736972844, + "grad_norm": 3.941669225692749, + "learning_rate": 9.362963933939982e-05, + "loss": 2.2694, + "step": 2499 + }, + { + "epoch": 0.18818570165038861, + "grad_norm": 7.426388740539551, + "learning_rate": 9.362368342148304e-05, + "loss": 2.248, + "step": 2500 + }, + { + "epoch": 0.18826097593104876, + "grad_norm": 3.741877317428589, + "learning_rate": 9.361772491026347e-05, + "loss": 2.0463, + "step": 2501 + }, + { + "epoch": 0.1883362502117089, + "grad_norm": 5.458040237426758, + "learning_rate": 9.361176380609529e-05, + "loss": 2.3928, + "step": 2502 + }, + { + "epoch": 0.18841152449236906, + "grad_norm": 4.791382312774658, + "learning_rate": 9.36058001093329e-05, + "loss": 2.2111, + "step": 2503 + }, + { + "epoch": 0.18848679877302923, + "grad_norm": 4.490954399108887, + "learning_rate": 9.359983382033081e-05, + "loss": 2.4891, + "step": 2504 + }, + { + "epoch": 0.18856207305368938, + "grad_norm": 5.043197154998779, + "learning_rate": 9.359386493944369e-05, + "loss": 2.5872, + "step": 2505 + }, + { + "epoch": 0.18863734733434953, + "grad_norm": 5.550272464752197, + "learning_rate": 9.35878934670264e-05, + "loss": 2.136, + "step": 2506 + }, + { + "epoch": 0.18871262161500968, + "grad_norm": 5.618335247039795, + "learning_rate": 9.358191940343389e-05, + "loss": 1.9605, + "step": 2507 + }, + { + "epoch": 0.18878789589566985, + "grad_norm": 4.352292537689209, + "learning_rate": 9.357594274902133e-05, + "loss": 2.4042, + "step": 2508 + }, + { + "epoch": 0.18886317017633, + "grad_norm": 4.100133419036865, + "learning_rate": 9.356996350414402e-05, + "loss": 2.3039, + "step": 2509 + }, + { + "epoch": 0.18893844445699015, + "grad_norm": 4.744510650634766, + "learning_rate": 9.356398166915738e-05, + "loss": 1.9309, + "step": 2510 + }, + { + "epoch": 0.18901371873765033, + "grad_norm": 5.219865322113037, + "learning_rate": 9.355799724441703e-05, + "loss": 2.0551, + "step": 2511 + }, + { + "epoch": 0.18908899301831048, + "grad_norm": 4.375669479370117, + "learning_rate": 9.355201023027876e-05, + "loss": 2.0971, + "step": 2512 + }, + { + "epoch": 0.18916426729897062, + "grad_norm": 3.9628045558929443, + "learning_rate": 9.35460206270984e-05, + "loss": 2.137, + "step": 2513 + }, + { + "epoch": 0.18923954157963077, + "grad_norm": 3.951970338821411, + "learning_rate": 9.354002843523209e-05, + "loss": 2.1666, + "step": 2514 + }, + { + "epoch": 0.18931481586029095, + "grad_norm": 5.941013813018799, + "learning_rate": 9.3534033655036e-05, + "loss": 1.9261, + "step": 2515 + }, + { + "epoch": 0.1893900901409511, + "grad_norm": 5.416547775268555, + "learning_rate": 9.352803628686653e-05, + "loss": 2.0508, + "step": 2516 + }, + { + "epoch": 0.18946536442161124, + "grad_norm": 4.743608474731445, + "learning_rate": 9.352203633108021e-05, + "loss": 2.484, + "step": 2517 + }, + { + "epoch": 0.1895406387022714, + "grad_norm": 4.451670169830322, + "learning_rate": 9.35160337880337e-05, + "loss": 2.0116, + "step": 2518 + }, + { + "epoch": 0.18961591298293157, + "grad_norm": 4.249894618988037, + "learning_rate": 9.351002865808383e-05, + "loss": 2.2439, + "step": 2519 + }, + { + "epoch": 0.18969118726359172, + "grad_norm": 5.0988688468933105, + "learning_rate": 9.350402094158761e-05, + "loss": 2.179, + "step": 2520 + }, + { + "epoch": 0.18976646154425186, + "grad_norm": 5.777320861816406, + "learning_rate": 9.349801063890217e-05, + "loss": 1.8623, + "step": 2521 + }, + { + "epoch": 0.189841735824912, + "grad_norm": 5.133855819702148, + "learning_rate": 9.349199775038482e-05, + "loss": 1.999, + "step": 2522 + }, + { + "epoch": 0.1899170101055722, + "grad_norm": 6.937568664550781, + "learning_rate": 9.348598227639299e-05, + "loss": 1.9257, + "step": 2523 + }, + { + "epoch": 0.18999228438623234, + "grad_norm": 4.837934970855713, + "learning_rate": 9.34799642172843e-05, + "loss": 2.1202, + "step": 2524 + }, + { + "epoch": 0.19006755866689248, + "grad_norm": 8.245290756225586, + "learning_rate": 9.347394357341648e-05, + "loss": 1.9762, + "step": 2525 + }, + { + "epoch": 0.19014283294755263, + "grad_norm": 5.214762210845947, + "learning_rate": 9.346792034514746e-05, + "loss": 1.7786, + "step": 2526 + }, + { + "epoch": 0.1902181072282128, + "grad_norm": 4.201835632324219, + "learning_rate": 9.346189453283529e-05, + "loss": 2.0194, + "step": 2527 + }, + { + "epoch": 0.19029338150887296, + "grad_norm": 4.834220886230469, + "learning_rate": 9.34558661368382e-05, + "loss": 2.0185, + "step": 2528 + }, + { + "epoch": 0.1903686557895331, + "grad_norm": 4.203024864196777, + "learning_rate": 9.344983515751456e-05, + "loss": 2.3082, + "step": 2529 + }, + { + "epoch": 0.19044393007019328, + "grad_norm": 4.904376029968262, + "learning_rate": 9.344380159522289e-05, + "loss": 2.2924, + "step": 2530 + }, + { + "epoch": 0.19051920435085343, + "grad_norm": 4.9724273681640625, + "learning_rate": 9.343776545032188e-05, + "loss": 2.0996, + "step": 2531 + }, + { + "epoch": 0.19059447863151358, + "grad_norm": 4.520656585693359, + "learning_rate": 9.343172672317034e-05, + "loss": 2.0701, + "step": 2532 + }, + { + "epoch": 0.19066975291217372, + "grad_norm": 4.859874248504639, + "learning_rate": 9.342568541412726e-05, + "loss": 2.1396, + "step": 2533 + }, + { + "epoch": 0.1907450271928339, + "grad_norm": 4.253837585449219, + "learning_rate": 9.34196415235518e-05, + "loss": 1.9302, + "step": 2534 + }, + { + "epoch": 0.19082030147349405, + "grad_norm": 5.5314459800720215, + "learning_rate": 9.341359505180322e-05, + "loss": 1.978, + "step": 2535 + }, + { + "epoch": 0.1908955757541542, + "grad_norm": 4.943543910980225, + "learning_rate": 9.3407545999241e-05, + "loss": 2.0852, + "step": 2536 + }, + { + "epoch": 0.19097085003481434, + "grad_norm": 5.548428058624268, + "learning_rate": 9.340149436622471e-05, + "loss": 1.8628, + "step": 2537 + }, + { + "epoch": 0.19104612431547452, + "grad_norm": 5.43626594543457, + "learning_rate": 9.339544015311411e-05, + "loss": 2.2098, + "step": 2538 + }, + { + "epoch": 0.19112139859613467, + "grad_norm": 6.1091437339782715, + "learning_rate": 9.338938336026912e-05, + "loss": 2.1499, + "step": 2539 + }, + { + "epoch": 0.19119667287679482, + "grad_norm": 4.8484697341918945, + "learning_rate": 9.338332398804976e-05, + "loss": 2.0968, + "step": 2540 + }, + { + "epoch": 0.19127194715745496, + "grad_norm": 5.473712921142578, + "learning_rate": 9.337726203681629e-05, + "loss": 2.2765, + "step": 2541 + }, + { + "epoch": 0.19134722143811514, + "grad_norm": 3.590080738067627, + "learning_rate": 9.337119750692906e-05, + "loss": 2.1073, + "step": 2542 + }, + { + "epoch": 0.1914224957187753, + "grad_norm": 4.980856895446777, + "learning_rate": 9.336513039874859e-05, + "loss": 2.2553, + "step": 2543 + }, + { + "epoch": 0.19149776999943544, + "grad_norm": 4.057771682739258, + "learning_rate": 9.335906071263553e-05, + "loss": 2.1213, + "step": 2544 + }, + { + "epoch": 0.1915730442800956, + "grad_norm": 5.293370723724365, + "learning_rate": 9.335298844895072e-05, + "loss": 2.1464, + "step": 2545 + }, + { + "epoch": 0.19164831856075576, + "grad_norm": 5.37392520904541, + "learning_rate": 9.334691360805516e-05, + "loss": 2.0742, + "step": 2546 + }, + { + "epoch": 0.1917235928414159, + "grad_norm": 5.438200950622559, + "learning_rate": 9.334083619030993e-05, + "loss": 2.5388, + "step": 2547 + }, + { + "epoch": 0.19179886712207606, + "grad_norm": 7.548996925354004, + "learning_rate": 9.333475619607636e-05, + "loss": 1.9418, + "step": 2548 + }, + { + "epoch": 0.19187414140273623, + "grad_norm": 5.1043291091918945, + "learning_rate": 9.332867362571589e-05, + "loss": 2.1118, + "step": 2549 + }, + { + "epoch": 0.19194941568339638, + "grad_norm": 4.160706043243408, + "learning_rate": 9.33225884795901e-05, + "loss": 2.1057, + "step": 2550 + }, + { + "epoch": 0.19202468996405653, + "grad_norm": 4.977852821350098, + "learning_rate": 9.331650075806071e-05, + "loss": 2.131, + "step": 2551 + }, + { + "epoch": 0.19209996424471668, + "grad_norm": 4.698365211486816, + "learning_rate": 9.331041046148965e-05, + "loss": 2.3714, + "step": 2552 + }, + { + "epoch": 0.19217523852537685, + "grad_norm": 4.620189189910889, + "learning_rate": 9.330431759023897e-05, + "loss": 2.1871, + "step": 2553 + }, + { + "epoch": 0.192250512806037, + "grad_norm": 4.794996738433838, + "learning_rate": 9.329822214467085e-05, + "loss": 2.3205, + "step": 2554 + }, + { + "epoch": 0.19232578708669715, + "grad_norm": 4.8759050369262695, + "learning_rate": 9.329212412514765e-05, + "loss": 2.1524, + "step": 2555 + }, + { + "epoch": 0.1924010613673573, + "grad_norm": 3.981215476989746, + "learning_rate": 9.32860235320319e-05, + "loss": 1.8509, + "step": 2556 + }, + { + "epoch": 0.19247633564801747, + "grad_norm": 4.774701118469238, + "learning_rate": 9.327992036568626e-05, + "loss": 2.0014, + "step": 2557 + }, + { + "epoch": 0.19255160992867762, + "grad_norm": 5.7614240646362305, + "learning_rate": 9.327381462647354e-05, + "loss": 2.3369, + "step": 2558 + }, + { + "epoch": 0.19262688420933777, + "grad_norm": 5.10483455657959, + "learning_rate": 9.32677063147567e-05, + "loss": 1.8325, + "step": 2559 + }, + { + "epoch": 0.19270215848999794, + "grad_norm": 4.313691139221191, + "learning_rate": 9.326159543089887e-05, + "loss": 1.8342, + "step": 2560 + }, + { + "epoch": 0.1927774327706581, + "grad_norm": 4.264848709106445, + "learning_rate": 9.32554819752633e-05, + "loss": 2.0532, + "step": 2561 + }, + { + "epoch": 0.19285270705131824, + "grad_norm": 4.446906566619873, + "learning_rate": 9.324936594821348e-05, + "loss": 2.6164, + "step": 2562 + }, + { + "epoch": 0.1929279813319784, + "grad_norm": 6.16718864440918, + "learning_rate": 9.324324735011291e-05, + "loss": 1.956, + "step": 2563 + }, + { + "epoch": 0.19300325561263856, + "grad_norm": 5.927128314971924, + "learning_rate": 9.323712618132541e-05, + "loss": 2.1769, + "step": 2564 + }, + { + "epoch": 0.1930785298932987, + "grad_norm": 6.929825782775879, + "learning_rate": 9.323100244221478e-05, + "loss": 2.1547, + "step": 2565 + }, + { + "epoch": 0.19315380417395886, + "grad_norm": 5.624950408935547, + "learning_rate": 9.322487613314512e-05, + "loss": 2.0811, + "step": 2566 + }, + { + "epoch": 0.193229078454619, + "grad_norm": 4.538450717926025, + "learning_rate": 9.321874725448058e-05, + "loss": 2.1482, + "step": 2567 + }, + { + "epoch": 0.19330435273527918, + "grad_norm": 4.675336837768555, + "learning_rate": 9.321261580658556e-05, + "loss": 2.0677, + "step": 2568 + }, + { + "epoch": 0.19337962701593933, + "grad_norm": 5.141780853271484, + "learning_rate": 9.32064817898245e-05, + "loss": 1.8231, + "step": 2569 + }, + { + "epoch": 0.19345490129659948, + "grad_norm": 3.757761001586914, + "learning_rate": 9.320034520456207e-05, + "loss": 1.8837, + "step": 2570 + }, + { + "epoch": 0.19353017557725963, + "grad_norm": 4.213200092315674, + "learning_rate": 9.319420605116307e-05, + "loss": 2.166, + "step": 2571 + }, + { + "epoch": 0.1936054498579198, + "grad_norm": 5.930081844329834, + "learning_rate": 9.318806432999245e-05, + "loss": 1.8438, + "step": 2572 + }, + { + "epoch": 0.19368072413857995, + "grad_norm": 6.261200428009033, + "learning_rate": 9.318192004141536e-05, + "loss": 2.0288, + "step": 2573 + }, + { + "epoch": 0.1937559984192401, + "grad_norm": 5.915249824523926, + "learning_rate": 9.3175773185797e-05, + "loss": 2.0577, + "step": 2574 + }, + { + "epoch": 0.19383127269990025, + "grad_norm": 6.85219144821167, + "learning_rate": 9.31696237635028e-05, + "loss": 2.0642, + "step": 2575 + }, + { + "epoch": 0.19390654698056042, + "grad_norm": 4.204634666442871, + "learning_rate": 9.316347177489835e-05, + "loss": 2.2449, + "step": 2576 + }, + { + "epoch": 0.19398182126122057, + "grad_norm": 4.536286354064941, + "learning_rate": 9.315731722034934e-05, + "loss": 1.9782, + "step": 2577 + }, + { + "epoch": 0.19405709554188072, + "grad_norm": 5.71628999710083, + "learning_rate": 9.315116010022168e-05, + "loss": 1.98, + "step": 2578 + }, + { + "epoch": 0.1941323698225409, + "grad_norm": 5.147578239440918, + "learning_rate": 9.314500041488135e-05, + "loss": 2.0296, + "step": 2579 + }, + { + "epoch": 0.19420764410320104, + "grad_norm": 4.791040897369385, + "learning_rate": 9.313883816469455e-05, + "loss": 2.0343, + "step": 2580 + }, + { + "epoch": 0.1942829183838612, + "grad_norm": 5.327045440673828, + "learning_rate": 9.31326733500276e-05, + "loss": 2.1418, + "step": 2581 + }, + { + "epoch": 0.19435819266452134, + "grad_norm": 4.423271179199219, + "learning_rate": 9.312650597124697e-05, + "loss": 2.1174, + "step": 2582 + }, + { + "epoch": 0.19443346694518152, + "grad_norm": 3.8413028717041016, + "learning_rate": 9.31203360287193e-05, + "loss": 2.1181, + "step": 2583 + }, + { + "epoch": 0.19450874122584166, + "grad_norm": 4.535932540893555, + "learning_rate": 9.31141635228114e-05, + "loss": 2.3185, + "step": 2584 + }, + { + "epoch": 0.1945840155065018, + "grad_norm": 4.861112117767334, + "learning_rate": 9.310798845389018e-05, + "loss": 1.9685, + "step": 2585 + }, + { + "epoch": 0.19465928978716196, + "grad_norm": 5.055726051330566, + "learning_rate": 9.310181082232272e-05, + "loss": 2.2517, + "step": 2586 + }, + { + "epoch": 0.19473456406782214, + "grad_norm": 5.35298490524292, + "learning_rate": 9.309563062847628e-05, + "loss": 2.6957, + "step": 2587 + }, + { + "epoch": 0.19480983834848228, + "grad_norm": 6.587187767028809, + "learning_rate": 9.308944787271827e-05, + "loss": 2.0718, + "step": 2588 + }, + { + "epoch": 0.19488511262914243, + "grad_norm": 4.75013542175293, + "learning_rate": 9.308326255541621e-05, + "loss": 2.0937, + "step": 2589 + }, + { + "epoch": 0.19496038690980258, + "grad_norm": 4.977231025695801, + "learning_rate": 9.307707467693783e-05, + "loss": 2.3111, + "step": 2590 + }, + { + "epoch": 0.19503566119046276, + "grad_norm": 4.176503658294678, + "learning_rate": 9.307088423765096e-05, + "loss": 2.8169, + "step": 2591 + }, + { + "epoch": 0.1951109354711229, + "grad_norm": 5.028183937072754, + "learning_rate": 9.306469123792358e-05, + "loss": 2.2027, + "step": 2592 + }, + { + "epoch": 0.19518620975178305, + "grad_norm": 4.943946361541748, + "learning_rate": 9.30584956781239e-05, + "loss": 1.9997, + "step": 2593 + }, + { + "epoch": 0.19526148403244323, + "grad_norm": 3.7517528533935547, + "learning_rate": 9.305229755862019e-05, + "loss": 2.2685, + "step": 2594 + }, + { + "epoch": 0.19533675831310338, + "grad_norm": 5.388548374176025, + "learning_rate": 9.304609687978092e-05, + "loss": 2.201, + "step": 2595 + }, + { + "epoch": 0.19541203259376352, + "grad_norm": 5.044585227966309, + "learning_rate": 9.30398936419747e-05, + "loss": 1.9716, + "step": 2596 + }, + { + "epoch": 0.19548730687442367, + "grad_norm": 4.254343032836914, + "learning_rate": 9.30336878455703e-05, + "loss": 2.3274, + "step": 2597 + }, + { + "epoch": 0.19556258115508385, + "grad_norm": 5.093156814575195, + "learning_rate": 9.302747949093664e-05, + "loss": 2.1795, + "step": 2598 + }, + { + "epoch": 0.195637855435744, + "grad_norm": 3.9735777378082275, + "learning_rate": 9.302126857844279e-05, + "loss": 2.0776, + "step": 2599 + }, + { + "epoch": 0.19571312971640414, + "grad_norm": 4.32735538482666, + "learning_rate": 9.301505510845795e-05, + "loss": 2.1883, + "step": 2600 + }, + { + "epoch": 0.1957884039970643, + "grad_norm": 4.864895343780518, + "learning_rate": 9.300883908135152e-05, + "loss": 1.837, + "step": 2601 + }, + { + "epoch": 0.19586367827772447, + "grad_norm": 4.4535298347473145, + "learning_rate": 9.3002620497493e-05, + "loss": 2.1299, + "step": 2602 + }, + { + "epoch": 0.19593895255838462, + "grad_norm": 4.141602993011475, + "learning_rate": 9.299639935725209e-05, + "loss": 1.9555, + "step": 2603 + }, + { + "epoch": 0.19601422683904476, + "grad_norm": 3.575546979904175, + "learning_rate": 9.299017566099861e-05, + "loss": 1.7796, + "step": 2604 + }, + { + "epoch": 0.1960895011197049, + "grad_norm": 4.551851749420166, + "learning_rate": 9.298394940910254e-05, + "loss": 2.3008, + "step": 2605 + }, + { + "epoch": 0.1961647754003651, + "grad_norm": 4.713071346282959, + "learning_rate": 9.297772060193399e-05, + "loss": 2.7555, + "step": 2606 + }, + { + "epoch": 0.19624004968102524, + "grad_norm": 5.464794158935547, + "learning_rate": 9.297148923986329e-05, + "loss": 2.045, + "step": 2607 + }, + { + "epoch": 0.19631532396168538, + "grad_norm": 4.088005065917969, + "learning_rate": 9.296525532326085e-05, + "loss": 1.8355, + "step": 2608 + }, + { + "epoch": 0.19639059824234556, + "grad_norm": 4.126147747039795, + "learning_rate": 9.295901885249727e-05, + "loss": 2.1773, + "step": 2609 + }, + { + "epoch": 0.1964658725230057, + "grad_norm": 3.7973711490631104, + "learning_rate": 9.295277982794327e-05, + "loss": 1.8913, + "step": 2610 + }, + { + "epoch": 0.19654114680366586, + "grad_norm": 5.4946208000183105, + "learning_rate": 9.294653824996976e-05, + "loss": 2.1021, + "step": 2611 + }, + { + "epoch": 0.196616421084326, + "grad_norm": 6.627760410308838, + "learning_rate": 9.294029411894779e-05, + "loss": 2.0899, + "step": 2612 + }, + { + "epoch": 0.19669169536498618, + "grad_norm": 4.387209892272949, + "learning_rate": 9.293404743524852e-05, + "loss": 2.2512, + "step": 2613 + }, + { + "epoch": 0.19676696964564633, + "grad_norm": 4.849071025848389, + "learning_rate": 9.292779819924333e-05, + "loss": 1.9134, + "step": 2614 + }, + { + "epoch": 0.19684224392630648, + "grad_norm": 5.269917011260986, + "learning_rate": 9.292154641130371e-05, + "loss": 1.9464, + "step": 2615 + }, + { + "epoch": 0.19691751820696662, + "grad_norm": 6.315644264221191, + "learning_rate": 9.291529207180132e-05, + "loss": 2.2925, + "step": 2616 + }, + { + "epoch": 0.1969927924876268, + "grad_norm": 5.099985122680664, + "learning_rate": 9.290903518110793e-05, + "loss": 2.2075, + "step": 2617 + }, + { + "epoch": 0.19706806676828695, + "grad_norm": 5.856556415557861, + "learning_rate": 9.290277573959555e-05, + "loss": 2.341, + "step": 2618 + }, + { + "epoch": 0.1971433410489471, + "grad_norm": 6.284870147705078, + "learning_rate": 9.289651374763624e-05, + "loss": 2.2362, + "step": 2619 + }, + { + "epoch": 0.19721861532960724, + "grad_norm": 6.153741359710693, + "learning_rate": 9.289024920560228e-05, + "loss": 2.1497, + "step": 2620 + }, + { + "epoch": 0.19729388961026742, + "grad_norm": 5.033062934875488, + "learning_rate": 9.288398211386606e-05, + "loss": 2.0753, + "step": 2621 + }, + { + "epoch": 0.19736916389092757, + "grad_norm": 4.078958034515381, + "learning_rate": 9.287771247280014e-05, + "loss": 2.136, + "step": 2622 + }, + { + "epoch": 0.19744443817158772, + "grad_norm": 7.0571980476379395, + "learning_rate": 9.287144028277726e-05, + "loss": 2.0564, + "step": 2623 + }, + { + "epoch": 0.1975197124522479, + "grad_norm": 4.978921890258789, + "learning_rate": 9.286516554417028e-05, + "loss": 2.348, + "step": 2624 + }, + { + "epoch": 0.19759498673290804, + "grad_norm": 3.79028582572937, + "learning_rate": 9.285888825735218e-05, + "loss": 2.2694, + "step": 2625 + }, + { + "epoch": 0.1976702610135682, + "grad_norm": 4.624883651733398, + "learning_rate": 9.285260842269616e-05, + "loss": 1.9855, + "step": 2626 + }, + { + "epoch": 0.19774553529422834, + "grad_norm": 6.384089469909668, + "learning_rate": 9.284632604057553e-05, + "loss": 2.0368, + "step": 2627 + }, + { + "epoch": 0.1978208095748885, + "grad_norm": 6.101902484893799, + "learning_rate": 9.284004111136374e-05, + "loss": 2.0705, + "step": 2628 + }, + { + "epoch": 0.19789608385554866, + "grad_norm": 5.448572158813477, + "learning_rate": 9.283375363543444e-05, + "loss": 2.2335, + "step": 2629 + }, + { + "epoch": 0.1979713581362088, + "grad_norm": 5.265905380249023, + "learning_rate": 9.28274636131614e-05, + "loss": 2.3178, + "step": 2630 + }, + { + "epoch": 0.19804663241686896, + "grad_norm": 7.9123148918151855, + "learning_rate": 9.282117104491852e-05, + "loss": 2.1305, + "step": 2631 + }, + { + "epoch": 0.19812190669752913, + "grad_norm": 7.143202781677246, + "learning_rate": 9.281487593107989e-05, + "loss": 2.4103, + "step": 2632 + }, + { + "epoch": 0.19819718097818928, + "grad_norm": 4.973522663116455, + "learning_rate": 9.280857827201973e-05, + "loss": 1.8422, + "step": 2633 + }, + { + "epoch": 0.19827245525884943, + "grad_norm": 4.00935173034668, + "learning_rate": 9.280227806811244e-05, + "loss": 2.0586, + "step": 2634 + }, + { + "epoch": 0.19834772953950958, + "grad_norm": 3.9648549556732178, + "learning_rate": 9.279597531973252e-05, + "loss": 2.3299, + "step": 2635 + }, + { + "epoch": 0.19842300382016975, + "grad_norm": 3.629316568374634, + "learning_rate": 9.278967002725465e-05, + "loss": 2.3687, + "step": 2636 + }, + { + "epoch": 0.1984982781008299, + "grad_norm": 3.7593750953674316, + "learning_rate": 9.27833621910537e-05, + "loss": 1.9443, + "step": 2637 + }, + { + "epoch": 0.19857355238149005, + "grad_norm": 4.642870903015137, + "learning_rate": 9.277705181150463e-05, + "loss": 2.5142, + "step": 2638 + }, + { + "epoch": 0.1986488266621502, + "grad_norm": 4.226221561431885, + "learning_rate": 9.277073888898255e-05, + "loss": 2.0096, + "step": 2639 + }, + { + "epoch": 0.19872410094281037, + "grad_norm": 4.961756229400635, + "learning_rate": 9.276442342386277e-05, + "loss": 2.0554, + "step": 2640 + }, + { + "epoch": 0.19879937522347052, + "grad_norm": 5.621178150177002, + "learning_rate": 9.275810541652072e-05, + "loss": 2.1673, + "step": 2641 + }, + { + "epoch": 0.19887464950413067, + "grad_norm": 5.010156631469727, + "learning_rate": 9.275178486733201e-05, + "loss": 2.4134, + "step": 2642 + }, + { + "epoch": 0.19894992378479084, + "grad_norm": 3.99113392829895, + "learning_rate": 9.274546177667234e-05, + "loss": 2.2111, + "step": 2643 + }, + { + "epoch": 0.199025198065451, + "grad_norm": 4.587311267852783, + "learning_rate": 9.273913614491761e-05, + "loss": 2.0292, + "step": 2644 + }, + { + "epoch": 0.19910047234611114, + "grad_norm": 4.786252975463867, + "learning_rate": 9.273280797244388e-05, + "loss": 2.186, + "step": 2645 + }, + { + "epoch": 0.1991757466267713, + "grad_norm": 5.825490474700928, + "learning_rate": 9.272647725962733e-05, + "loss": 2.4339, + "step": 2646 + }, + { + "epoch": 0.19925102090743146, + "grad_norm": 3.9713294506073, + "learning_rate": 9.27201440068443e-05, + "loss": 2.105, + "step": 2647 + }, + { + "epoch": 0.1993262951880916, + "grad_norm": 4.747962474822998, + "learning_rate": 9.271380821447129e-05, + "loss": 1.911, + "step": 2648 + }, + { + "epoch": 0.19940156946875176, + "grad_norm": 3.773463726043701, + "learning_rate": 9.270746988288494e-05, + "loss": 2.0269, + "step": 2649 + }, + { + "epoch": 0.1994768437494119, + "grad_norm": 4.771273612976074, + "learning_rate": 9.270112901246207e-05, + "loss": 2.0478, + "step": 2650 + }, + { + "epoch": 0.19955211803007208, + "grad_norm": 6.746914386749268, + "learning_rate": 9.269478560357958e-05, + "loss": 2.4765, + "step": 2651 + }, + { + "epoch": 0.19962739231073223, + "grad_norm": 6.104433059692383, + "learning_rate": 9.268843965661457e-05, + "loss": 2.2402, + "step": 2652 + }, + { + "epoch": 0.19970266659139238, + "grad_norm": 4.703186511993408, + "learning_rate": 9.268209117194435e-05, + "loss": 2.1815, + "step": 2653 + }, + { + "epoch": 0.19977794087205253, + "grad_norm": 7.270728588104248, + "learning_rate": 9.267574014994626e-05, + "loss": 2.0614, + "step": 2654 + }, + { + "epoch": 0.1998532151527127, + "grad_norm": 5.134958267211914, + "learning_rate": 9.266938659099786e-05, + "loss": 1.6658, + "step": 2655 + }, + { + "epoch": 0.19992848943337285, + "grad_norm": 6.875425338745117, + "learning_rate": 9.266303049547687e-05, + "loss": 2.5154, + "step": 2656 + }, + { + "epoch": 0.200003763714033, + "grad_norm": 4.7156572341918945, + "learning_rate": 9.265667186376113e-05, + "loss": 2.3171, + "step": 2657 + }, + { + "epoch": 0.20007903799469318, + "grad_norm": 7.363480567932129, + "learning_rate": 9.265031069622865e-05, + "loss": 2.6819, + "step": 2658 + }, + { + "epoch": 0.20015431227535332, + "grad_norm": 7.363480567932129, + "learning_rate": 9.265031069622865e-05, + "loss": 1.7411, + "step": 2659 + }, + { + "epoch": 0.20022958655601347, + "grad_norm": 5.607661724090576, + "learning_rate": 9.264394699325757e-05, + "loss": 2.4929, + "step": 2660 + }, + { + "epoch": 0.20030486083667362, + "grad_norm": 7.142682075500488, + "learning_rate": 9.263758075522619e-05, + "loss": 2.4227, + "step": 2661 + }, + { + "epoch": 0.2003801351173338, + "grad_norm": 6.157449722290039, + "learning_rate": 9.263121198251299e-05, + "loss": 2.2881, + "step": 2662 + }, + { + "epoch": 0.20045540939799394, + "grad_norm": 4.961678981781006, + "learning_rate": 9.262484067549656e-05, + "loss": 2.3077, + "step": 2663 + }, + { + "epoch": 0.2005306836786541, + "grad_norm": 4.743991851806641, + "learning_rate": 9.261846683455565e-05, + "loss": 2.2727, + "step": 2664 + }, + { + "epoch": 0.20060595795931424, + "grad_norm": 6.180283069610596, + "learning_rate": 9.261209046006918e-05, + "loss": 2.1534, + "step": 2665 + }, + { + "epoch": 0.20068123223997442, + "grad_norm": 7.317596435546875, + "learning_rate": 9.260571155241619e-05, + "loss": 2.449, + "step": 2666 + }, + { + "epoch": 0.20075650652063456, + "grad_norm": 3.833103895187378, + "learning_rate": 9.259933011197591e-05, + "loss": 2.2059, + "step": 2667 + }, + { + "epoch": 0.2008317808012947, + "grad_norm": 4.415266513824463, + "learning_rate": 9.259294613912767e-05, + "loss": 2.0813, + "step": 2668 + }, + { + "epoch": 0.20090705508195486, + "grad_norm": 4.395266532897949, + "learning_rate": 9.2586559634251e-05, + "loss": 2.1295, + "step": 2669 + }, + { + "epoch": 0.20098232936261504, + "grad_norm": 7.380612373352051, + "learning_rate": 9.258017059772556e-05, + "loss": 2.2435, + "step": 2670 + }, + { + "epoch": 0.20105760364327518, + "grad_norm": 4.570094108581543, + "learning_rate": 9.257377902993114e-05, + "loss": 2.2224, + "step": 2671 + }, + { + "epoch": 0.20113287792393533, + "grad_norm": 9.569217681884766, + "learning_rate": 9.256738493124773e-05, + "loss": 2.3579, + "step": 2672 + }, + { + "epoch": 0.2012081522045955, + "grad_norm": 5.0647149085998535, + "learning_rate": 9.256098830205542e-05, + "loss": 2.1132, + "step": 2673 + }, + { + "epoch": 0.20128342648525566, + "grad_norm": 4.9264020919799805, + "learning_rate": 9.255458914273448e-05, + "loss": 2.1071, + "step": 2674 + }, + { + "epoch": 0.2013587007659158, + "grad_norm": 5.490030288696289, + "learning_rate": 9.254818745366531e-05, + "loss": 2.5883, + "step": 2675 + }, + { + "epoch": 0.20143397504657595, + "grad_norm": 5.552890777587891, + "learning_rate": 9.25417832352285e-05, + "loss": 2.2194, + "step": 2676 + }, + { + "epoch": 0.20150924932723613, + "grad_norm": 6.219507694244385, + "learning_rate": 9.253537648780473e-05, + "loss": 2.2628, + "step": 2677 + }, + { + "epoch": 0.20158452360789628, + "grad_norm": 6.691783428192139, + "learning_rate": 9.252896721177487e-05, + "loss": 2.5432, + "step": 2678 + }, + { + "epoch": 0.20165979788855642, + "grad_norm": 4.836513996124268, + "learning_rate": 9.252255540751994e-05, + "loss": 2.2211, + "step": 2679 + }, + { + "epoch": 0.20173507216921657, + "grad_norm": 4.348185062408447, + "learning_rate": 9.25161410754211e-05, + "loss": 1.993, + "step": 2680 + }, + { + "epoch": 0.20181034644987675, + "grad_norm": 5.402199745178223, + "learning_rate": 9.250972421585968e-05, + "loss": 2.1034, + "step": 2681 + }, + { + "epoch": 0.2018856207305369, + "grad_norm": 3.2253646850585938, + "learning_rate": 9.250330482921712e-05, + "loss": 2.0738, + "step": 2682 + }, + { + "epoch": 0.20196089501119704, + "grad_norm": 4.809782028198242, + "learning_rate": 9.249688291587504e-05, + "loss": 1.9647, + "step": 2683 + }, + { + "epoch": 0.2020361692918572, + "grad_norm": 4.990761756896973, + "learning_rate": 9.249045847621521e-05, + "loss": 2.5667, + "step": 2684 + }, + { + "epoch": 0.20211144357251737, + "grad_norm": 5.1938347816467285, + "learning_rate": 9.248403151061956e-05, + "loss": 2.138, + "step": 2685 + }, + { + "epoch": 0.20218671785317752, + "grad_norm": 5.125086307525635, + "learning_rate": 9.247760201947011e-05, + "loss": 1.888, + "step": 2686 + }, + { + "epoch": 0.20226199213383766, + "grad_norm": 4.409528732299805, + "learning_rate": 9.247117000314911e-05, + "loss": 1.848, + "step": 2687 + }, + { + "epoch": 0.2023372664144978, + "grad_norm": 5.727984428405762, + "learning_rate": 9.246473546203893e-05, + "loss": 2.3168, + "step": 2688 + }, + { + "epoch": 0.202412540695158, + "grad_norm": 3.619250535964966, + "learning_rate": 9.245829839652205e-05, + "loss": 1.9843, + "step": 2689 + }, + { + "epoch": 0.20248781497581814, + "grad_norm": 6.49954080581665, + "learning_rate": 9.245185880698118e-05, + "loss": 1.9022, + "step": 2690 + }, + { + "epoch": 0.20256308925647828, + "grad_norm": 4.470929145812988, + "learning_rate": 9.24454166937991e-05, + "loss": 2.5562, + "step": 2691 + }, + { + "epoch": 0.20263836353713846, + "grad_norm": 5.310934066772461, + "learning_rate": 9.243897205735878e-05, + "loss": 2.31, + "step": 2692 + }, + { + "epoch": 0.2027136378177986, + "grad_norm": 5.04948616027832, + "learning_rate": 9.243252489804335e-05, + "loss": 1.917, + "step": 2693 + }, + { + "epoch": 0.20278891209845876, + "grad_norm": 4.324390411376953, + "learning_rate": 9.242607521623606e-05, + "loss": 2.0204, + "step": 2694 + }, + { + "epoch": 0.2028641863791189, + "grad_norm": 4.0668230056762695, + "learning_rate": 9.241962301232035e-05, + "loss": 2.2603, + "step": 2695 + }, + { + "epoch": 0.20293946065977908, + "grad_norm": 5.163023471832275, + "learning_rate": 9.241316828667976e-05, + "loss": 2.2552, + "step": 2696 + }, + { + "epoch": 0.20301473494043923, + "grad_norm": 3.842123031616211, + "learning_rate": 9.2406711039698e-05, + "loss": 2.0686, + "step": 2697 + }, + { + "epoch": 0.20309000922109938, + "grad_norm": 3.7394609451293945, + "learning_rate": 9.240025127175896e-05, + "loss": 2.2229, + "step": 2698 + }, + { + "epoch": 0.20316528350175952, + "grad_norm": 5.072762966156006, + "learning_rate": 9.239378898324664e-05, + "loss": 1.7493, + "step": 2699 + }, + { + "epoch": 0.2032405577824197, + "grad_norm": 3.9831929206848145, + "learning_rate": 9.238732417454521e-05, + "loss": 2.0991, + "step": 2700 + }, + { + "epoch": 0.20331583206307985, + "grad_norm": 5.643798828125, + "learning_rate": 9.238085684603897e-05, + "loss": 2.1445, + "step": 2701 + }, + { + "epoch": 0.20339110634374, + "grad_norm": 4.979661464691162, + "learning_rate": 9.23743869981124e-05, + "loss": 1.9424, + "step": 2702 + }, + { + "epoch": 0.20346638062440014, + "grad_norm": 4.533657550811768, + "learning_rate": 9.23679146311501e-05, + "loss": 2.1678, + "step": 2703 + }, + { + "epoch": 0.20354165490506032, + "grad_norm": 4.680571556091309, + "learning_rate": 9.236143974553686e-05, + "loss": 2.5833, + "step": 2704 + }, + { + "epoch": 0.20361692918572047, + "grad_norm": 5.625328063964844, + "learning_rate": 9.235496234165756e-05, + "loss": 2.228, + "step": 2705 + }, + { + "epoch": 0.20369220346638062, + "grad_norm": 4.053057670593262, + "learning_rate": 9.234848241989729e-05, + "loss": 2.4346, + "step": 2706 + }, + { + "epoch": 0.2037674777470408, + "grad_norm": 4.140114784240723, + "learning_rate": 9.234199998064125e-05, + "loss": 2.103, + "step": 2707 + }, + { + "epoch": 0.20384275202770094, + "grad_norm": 5.398926258087158, + "learning_rate": 9.23355150242748e-05, + "loss": 2.1543, + "step": 2708 + }, + { + "epoch": 0.2039180263083611, + "grad_norm": 5.658435344696045, + "learning_rate": 9.232902755118345e-05, + "loss": 1.9006, + "step": 2709 + }, + { + "epoch": 0.20399330058902124, + "grad_norm": 4.941132545471191, + "learning_rate": 9.232253756175288e-05, + "loss": 1.902, + "step": 2710 + }, + { + "epoch": 0.2040685748696814, + "grad_norm": 4.231126308441162, + "learning_rate": 9.231604505636889e-05, + "loss": 1.9513, + "step": 2711 + }, + { + "epoch": 0.20414384915034156, + "grad_norm": 6.0593037605285645, + "learning_rate": 9.230955003541743e-05, + "loss": 2.6645, + "step": 2712 + }, + { + "epoch": 0.2042191234310017, + "grad_norm": 5.5126471519470215, + "learning_rate": 9.230305249928461e-05, + "loss": 2.1858, + "step": 2713 + }, + { + "epoch": 0.20429439771166186, + "grad_norm": 5.879733085632324, + "learning_rate": 9.229655244835673e-05, + "loss": 2.278, + "step": 2714 + }, + { + "epoch": 0.20436967199232203, + "grad_norm": 4.923147678375244, + "learning_rate": 9.229004988302014e-05, + "loss": 2.5746, + "step": 2715 + }, + { + "epoch": 0.20444494627298218, + "grad_norm": 7.514196872711182, + "learning_rate": 9.228354480366146e-05, + "loss": 1.9377, + "step": 2716 + }, + { + "epoch": 0.20452022055364233, + "grad_norm": 7.064095497131348, + "learning_rate": 9.227703721066734e-05, + "loss": 2.0156, + "step": 2717 + }, + { + "epoch": 0.20459549483430248, + "grad_norm": 4.517171382904053, + "learning_rate": 9.227052710442468e-05, + "loss": 2.0787, + "step": 2718 + }, + { + "epoch": 0.20467076911496265, + "grad_norm": 7.329111099243164, + "learning_rate": 9.226401448532045e-05, + "loss": 2.3843, + "step": 2719 + }, + { + "epoch": 0.2047460433956228, + "grad_norm": 5.9707183837890625, + "learning_rate": 9.225749935374184e-05, + "loss": 2.3209, + "step": 2720 + }, + { + "epoch": 0.20482131767628295, + "grad_norm": 5.696382999420166, + "learning_rate": 9.225098171007615e-05, + "loss": 2.2724, + "step": 2721 + }, + { + "epoch": 0.20489659195694312, + "grad_norm": 5.391494274139404, + "learning_rate": 9.224446155471083e-05, + "loss": 2.6625, + "step": 2722 + }, + { + "epoch": 0.20497186623760327, + "grad_norm": 5.725043296813965, + "learning_rate": 9.223793888803347e-05, + "loss": 2.3145, + "step": 2723 + }, + { + "epoch": 0.20504714051826342, + "grad_norm": 4.924393653869629, + "learning_rate": 9.223141371043184e-05, + "loss": 2.1112, + "step": 2724 + }, + { + "epoch": 0.20512241479892357, + "grad_norm": 5.879674911499023, + "learning_rate": 9.222488602229385e-05, + "loss": 2.3642, + "step": 2725 + }, + { + "epoch": 0.20519768907958374, + "grad_norm": 4.345478057861328, + "learning_rate": 9.221835582400754e-05, + "loss": 2.0124, + "step": 2726 + }, + { + "epoch": 0.2052729633602439, + "grad_norm": 4.756162643432617, + "learning_rate": 9.221182311596112e-05, + "loss": 2.0019, + "step": 2727 + }, + { + "epoch": 0.20534823764090404, + "grad_norm": 5.437966346740723, + "learning_rate": 9.220528789854293e-05, + "loss": 1.9805, + "step": 2728 + }, + { + "epoch": 0.2054235119215642, + "grad_norm": 5.083278656005859, + "learning_rate": 9.219875017214146e-05, + "loss": 2.108, + "step": 2729 + }, + { + "epoch": 0.20549878620222436, + "grad_norm": 4.584100246429443, + "learning_rate": 9.219220993714539e-05, + "loss": 2.2582, + "step": 2730 + }, + { + "epoch": 0.2055740604828845, + "grad_norm": 7.760411739349365, + "learning_rate": 9.21856671939435e-05, + "loss": 2.346, + "step": 2731 + }, + { + "epoch": 0.20564933476354466, + "grad_norm": 5.776886940002441, + "learning_rate": 9.217912194292474e-05, + "loss": 2.4974, + "step": 2732 + }, + { + "epoch": 0.2057246090442048, + "grad_norm": 5.723376750946045, + "learning_rate": 9.21725741844782e-05, + "loss": 2.1521, + "step": 2733 + }, + { + "epoch": 0.20579988332486499, + "grad_norm": 6.4057135581970215, + "learning_rate": 9.216602391899314e-05, + "loss": 2.1174, + "step": 2734 + }, + { + "epoch": 0.20587515760552513, + "grad_norm": 4.838561534881592, + "learning_rate": 9.215947114685895e-05, + "loss": 2.1417, + "step": 2735 + }, + { + "epoch": 0.20595043188618528, + "grad_norm": 6.190004825592041, + "learning_rate": 9.215291586846516e-05, + "loss": 1.854, + "step": 2736 + }, + { + "epoch": 0.20602570616684543, + "grad_norm": 10.128652572631836, + "learning_rate": 9.214635808420147e-05, + "loss": 2.1287, + "step": 2737 + }, + { + "epoch": 0.2061009804475056, + "grad_norm": 7.153855800628662, + "learning_rate": 9.213979779445771e-05, + "loss": 2.3407, + "step": 2738 + }, + { + "epoch": 0.20617625472816575, + "grad_norm": 5.416784763336182, + "learning_rate": 9.213323499962392e-05, + "loss": 2.1244, + "step": 2739 + }, + { + "epoch": 0.2062515290088259, + "grad_norm": 4.468263149261475, + "learning_rate": 9.212666970009017e-05, + "loss": 2.0533, + "step": 2740 + }, + { + "epoch": 0.20632680328948608, + "grad_norm": 4.061378479003906, + "learning_rate": 9.21201018962468e-05, + "loss": 1.997, + "step": 2741 + }, + { + "epoch": 0.20640207757014623, + "grad_norm": 4.568488121032715, + "learning_rate": 9.211353158848423e-05, + "loss": 2.4459, + "step": 2742 + }, + { + "epoch": 0.20647735185080637, + "grad_norm": 6.1667256355285645, + "learning_rate": 9.210695877719303e-05, + "loss": 2.138, + "step": 2743 + }, + { + "epoch": 0.20655262613146652, + "grad_norm": 4.548626899719238, + "learning_rate": 9.210038346276395e-05, + "loss": 2.3282, + "step": 2744 + }, + { + "epoch": 0.2066279004121267, + "grad_norm": 4.237787246704102, + "learning_rate": 9.20938056455879e-05, + "loss": 2.1, + "step": 2745 + }, + { + "epoch": 0.20670317469278685, + "grad_norm": 4.625013828277588, + "learning_rate": 9.208722532605584e-05, + "loss": 1.9612, + "step": 2746 + }, + { + "epoch": 0.206778448973447, + "grad_norm": 5.302488327026367, + "learning_rate": 9.208064250455904e-05, + "loss": 2.2188, + "step": 2747 + }, + { + "epoch": 0.20685372325410714, + "grad_norm": 6.519745349884033, + "learning_rate": 9.207405718148876e-05, + "loss": 1.943, + "step": 2748 + }, + { + "epoch": 0.20692899753476732, + "grad_norm": 4.911252975463867, + "learning_rate": 9.206746935723652e-05, + "loss": 2.1185, + "step": 2749 + }, + { + "epoch": 0.20700427181542747, + "grad_norm": 6.025754928588867, + "learning_rate": 9.206087903219391e-05, + "loss": 2.4712, + "step": 2750 + }, + { + "epoch": 0.2070795460960876, + "grad_norm": 4.421285629272461, + "learning_rate": 9.205428620675274e-05, + "loss": 2.3608, + "step": 2751 + }, + { + "epoch": 0.20715482037674776, + "grad_norm": 6.1203179359436035, + "learning_rate": 9.204769088130493e-05, + "loss": 2.3081, + "step": 2752 + }, + { + "epoch": 0.20723009465740794, + "grad_norm": 4.159307479858398, + "learning_rate": 9.204109305624253e-05, + "loss": 2.0839, + "step": 2753 + }, + { + "epoch": 0.20730536893806809, + "grad_norm": 4.70167350769043, + "learning_rate": 9.20344927319578e-05, + "loss": 2.0895, + "step": 2754 + }, + { + "epoch": 0.20738064321872823, + "grad_norm": 4.928372383117676, + "learning_rate": 9.202788990884306e-05, + "loss": 2.1404, + "step": 2755 + }, + { + "epoch": 0.2074559174993884, + "grad_norm": 4.6017327308654785, + "learning_rate": 9.202128458729087e-05, + "loss": 2.1956, + "step": 2756 + }, + { + "epoch": 0.20753119178004856, + "grad_norm": 6.752647876739502, + "learning_rate": 9.201467676769389e-05, + "loss": 2.1492, + "step": 2757 + }, + { + "epoch": 0.2076064660607087, + "grad_norm": 4.317784309387207, + "learning_rate": 9.200806645044492e-05, + "loss": 1.8446, + "step": 2758 + }, + { + "epoch": 0.20768174034136885, + "grad_norm": 4.698585510253906, + "learning_rate": 9.200145363593694e-05, + "loss": 2.2889, + "step": 2759 + }, + { + "epoch": 0.20775701462202903, + "grad_norm": 5.974823474884033, + "learning_rate": 9.199483832456305e-05, + "loss": 2.2835, + "step": 2760 + }, + { + "epoch": 0.20783228890268918, + "grad_norm": 4.162746429443359, + "learning_rate": 9.198822051671651e-05, + "loss": 1.8286, + "step": 2761 + }, + { + "epoch": 0.20790756318334933, + "grad_norm": 6.423340797424316, + "learning_rate": 9.198160021279076e-05, + "loss": 2.1872, + "step": 2762 + }, + { + "epoch": 0.20798283746400947, + "grad_norm": 5.672008991241455, + "learning_rate": 9.197497741317933e-05, + "loss": 2.2306, + "step": 2763 + }, + { + "epoch": 0.20805811174466965, + "grad_norm": 7.741572856903076, + "learning_rate": 9.196835211827593e-05, + "loss": 2.1358, + "step": 2764 + }, + { + "epoch": 0.2081333860253298, + "grad_norm": 5.3536577224731445, + "learning_rate": 9.19617243284744e-05, + "loss": 1.8629, + "step": 2765 + }, + { + "epoch": 0.20820866030598995, + "grad_norm": 4.840548038482666, + "learning_rate": 9.195509404416878e-05, + "loss": 1.9415, + "step": 2766 + }, + { + "epoch": 0.2082839345866501, + "grad_norm": 5.354896545410156, + "learning_rate": 9.194846126575321e-05, + "loss": 2.087, + "step": 2767 + }, + { + "epoch": 0.20835920886731027, + "grad_norm": 3.991562604904175, + "learning_rate": 9.194182599362198e-05, + "loss": 2.1069, + "step": 2768 + }, + { + "epoch": 0.20843448314797042, + "grad_norm": 6.801267623901367, + "learning_rate": 9.193518822816952e-05, + "loss": 2.2929, + "step": 2769 + }, + { + "epoch": 0.20850975742863057, + "grad_norm": 4.3830790519714355, + "learning_rate": 9.192854796979045e-05, + "loss": 2.0509, + "step": 2770 + }, + { + "epoch": 0.20858503170929074, + "grad_norm": 4.92519998550415, + "learning_rate": 9.192190521887951e-05, + "loss": 2.3104, + "step": 2771 + }, + { + "epoch": 0.2086603059899509, + "grad_norm": 5.735259532928467, + "learning_rate": 9.191525997583161e-05, + "loss": 2.0312, + "step": 2772 + }, + { + "epoch": 0.20873558027061104, + "grad_norm": 5.806834697723389, + "learning_rate": 9.190861224104175e-05, + "loss": 1.8885, + "step": 2773 + }, + { + "epoch": 0.20881085455127119, + "grad_norm": 3.834347724914551, + "learning_rate": 9.190196201490516e-05, + "loss": 2.0853, + "step": 2774 + }, + { + "epoch": 0.20888612883193136, + "grad_norm": 3.846677780151367, + "learning_rate": 9.189530929781715e-05, + "loss": 2.4739, + "step": 2775 + }, + { + "epoch": 0.2089614031125915, + "grad_norm": 5.2583489418029785, + "learning_rate": 9.18886540901732e-05, + "loss": 2.1837, + "step": 2776 + }, + { + "epoch": 0.20903667739325166, + "grad_norm": 3.9179093837738037, + "learning_rate": 9.188199639236897e-05, + "loss": 2.0571, + "step": 2777 + }, + { + "epoch": 0.2091119516739118, + "grad_norm": 4.368052005767822, + "learning_rate": 9.187533620480023e-05, + "loss": 2.0461, + "step": 2778 + }, + { + "epoch": 0.20918722595457198, + "grad_norm": 5.238763809204102, + "learning_rate": 9.18686735278629e-05, + "loss": 2.065, + "step": 2779 + }, + { + "epoch": 0.20926250023523213, + "grad_norm": 3.870419502258301, + "learning_rate": 9.186200836195307e-05, + "loss": 1.9187, + "step": 2780 + }, + { + "epoch": 0.20933777451589228, + "grad_norm": 5.346678733825684, + "learning_rate": 9.185534070746695e-05, + "loss": 2.0468, + "step": 2781 + }, + { + "epoch": 0.20941304879655243, + "grad_norm": 4.438832759857178, + "learning_rate": 9.184867056480093e-05, + "loss": 2.1511, + "step": 2782 + }, + { + "epoch": 0.2094883230772126, + "grad_norm": 7.27894401550293, + "learning_rate": 9.184199793435152e-05, + "loss": 1.8806, + "step": 2783 + }, + { + "epoch": 0.20956359735787275, + "grad_norm": 4.2289838790893555, + "learning_rate": 9.183532281651539e-05, + "loss": 1.9441, + "step": 2784 + }, + { + "epoch": 0.2096388716385329, + "grad_norm": 4.106243133544922, + "learning_rate": 9.182864521168936e-05, + "loss": 2.0805, + "step": 2785 + }, + { + "epoch": 0.20971414591919307, + "grad_norm": 6.5418219566345215, + "learning_rate": 9.182196512027039e-05, + "loss": 1.7826, + "step": 2786 + }, + { + "epoch": 0.20978942019985322, + "grad_norm": 5.005733966827393, + "learning_rate": 9.181528254265559e-05, + "loss": 2.0603, + "step": 2787 + }, + { + "epoch": 0.20986469448051337, + "grad_norm": 5.310815811157227, + "learning_rate": 9.180859747924223e-05, + "loss": 2.2185, + "step": 2788 + }, + { + "epoch": 0.20993996876117352, + "grad_norm": 5.525910377502441, + "learning_rate": 9.180190993042772e-05, + "loss": 1.9102, + "step": 2789 + }, + { + "epoch": 0.2100152430418337, + "grad_norm": 4.9078497886657715, + "learning_rate": 9.179521989660957e-05, + "loss": 2.064, + "step": 2790 + }, + { + "epoch": 0.21009051732249384, + "grad_norm": 4.449695587158203, + "learning_rate": 9.178852737818557e-05, + "loss": 1.9704, + "step": 2791 + }, + { + "epoch": 0.210165791603154, + "grad_norm": 4.521839618682861, + "learning_rate": 9.178183237555349e-05, + "loss": 2.1179, + "step": 2792 + }, + { + "epoch": 0.21024106588381414, + "grad_norm": 5.896556377410889, + "learning_rate": 9.177513488911138e-05, + "loss": 1.7694, + "step": 2793 + }, + { + "epoch": 0.2103163401644743, + "grad_norm": 3.947960615158081, + "learning_rate": 9.176843491925735e-05, + "loss": 2.2439, + "step": 2794 + }, + { + "epoch": 0.21039161444513446, + "grad_norm": 4.381667137145996, + "learning_rate": 9.176173246638972e-05, + "loss": 1.7337, + "step": 2795 + }, + { + "epoch": 0.2104668887257946, + "grad_norm": 4.278743267059326, + "learning_rate": 9.175502753090692e-05, + "loss": 2.1427, + "step": 2796 + }, + { + "epoch": 0.21054216300645476, + "grad_norm": 5.845564365386963, + "learning_rate": 9.174832011320755e-05, + "loss": 2.5555, + "step": 2797 + }, + { + "epoch": 0.21061743728711493, + "grad_norm": 5.500210285186768, + "learning_rate": 9.174161021369033e-05, + "loss": 2.0677, + "step": 2798 + }, + { + "epoch": 0.21069271156777508, + "grad_norm": 4.367582321166992, + "learning_rate": 9.173489783275415e-05, + "loss": 2.443, + "step": 2799 + }, + { + "epoch": 0.21076798584843523, + "grad_norm": 4.06942081451416, + "learning_rate": 9.172818297079806e-05, + "loss": 1.7442, + "step": 2800 + }, + { + "epoch": 0.21084326012909538, + "grad_norm": 4.764585494995117, + "learning_rate": 9.172146562822121e-05, + "loss": 1.9274, + "step": 2801 + }, + { + "epoch": 0.21091853440975555, + "grad_norm": 5.261363506317139, + "learning_rate": 9.171474580542295e-05, + "loss": 1.8798, + "step": 2802 + }, + { + "epoch": 0.2109938086904157, + "grad_norm": 4.888810634613037, + "learning_rate": 9.170802350280274e-05, + "loss": 2.0509, + "step": 2803 + }, + { + "epoch": 0.21106908297107585, + "grad_norm": 4.8682942390441895, + "learning_rate": 9.170129872076021e-05, + "loss": 2.5575, + "step": 2804 + }, + { + "epoch": 0.21114435725173603, + "grad_norm": 5.021416187286377, + "learning_rate": 9.169457145969512e-05, + "loss": 2.0261, + "step": 2805 + }, + { + "epoch": 0.21121963153239617, + "grad_norm": 5.399176597595215, + "learning_rate": 9.168784172000739e-05, + "loss": 2.4254, + "step": 2806 + }, + { + "epoch": 0.21129490581305632, + "grad_norm": 5.843966484069824, + "learning_rate": 9.168110950209709e-05, + "loss": 1.9917, + "step": 2807 + }, + { + "epoch": 0.21137018009371647, + "grad_norm": 4.437451362609863, + "learning_rate": 9.167437480636443e-05, + "loss": 2.2837, + "step": 2808 + }, + { + "epoch": 0.21144545437437665, + "grad_norm": 5.064668655395508, + "learning_rate": 9.166763763320976e-05, + "loss": 2.5844, + "step": 2809 + }, + { + "epoch": 0.2115207286550368, + "grad_norm": 5.718889236450195, + "learning_rate": 9.16608979830336e-05, + "loss": 2.149, + "step": 2810 + }, + { + "epoch": 0.21159600293569694, + "grad_norm": 4.556615352630615, + "learning_rate": 9.165415585623658e-05, + "loss": 2.1195, + "step": 2811 + }, + { + "epoch": 0.2116712772163571, + "grad_norm": 6.086655616760254, + "learning_rate": 9.164741125321952e-05, + "loss": 2.2139, + "step": 2812 + }, + { + "epoch": 0.21174655149701727, + "grad_norm": 4.455840110778809, + "learning_rate": 9.164066417438335e-05, + "loss": 1.8764, + "step": 2813 + }, + { + "epoch": 0.2118218257776774, + "grad_norm": 4.952279567718506, + "learning_rate": 9.16339146201292e-05, + "loss": 2.007, + "step": 2814 + }, + { + "epoch": 0.21189710005833756, + "grad_norm": 5.059940338134766, + "learning_rate": 9.162716259085827e-05, + "loss": 2.0489, + "step": 2815 + }, + { + "epoch": 0.2119723743389977, + "grad_norm": 5.385522365570068, + "learning_rate": 9.162040808697196e-05, + "loss": 2.1337, + "step": 2816 + }, + { + "epoch": 0.21204764861965789, + "grad_norm": 8.4082670211792, + "learning_rate": 9.161365110887181e-05, + "loss": 2.4877, + "step": 2817 + }, + { + "epoch": 0.21212292290031803, + "grad_norm": 5.099520206451416, + "learning_rate": 9.160689165695952e-05, + "loss": 2.0511, + "step": 2818 + }, + { + "epoch": 0.21219819718097818, + "grad_norm": 4.262336730957031, + "learning_rate": 9.16001297316369e-05, + "loss": 1.7989, + "step": 2819 + }, + { + "epoch": 0.21227347146163836, + "grad_norm": 4.891003131866455, + "learning_rate": 9.159336533330594e-05, + "loss": 2.367, + "step": 2820 + }, + { + "epoch": 0.2123487457422985, + "grad_norm": 4.341526031494141, + "learning_rate": 9.158659846236875e-05, + "loss": 1.8421, + "step": 2821 + }, + { + "epoch": 0.21242402002295865, + "grad_norm": 3.5332159996032715, + "learning_rate": 9.157982911922759e-05, + "loss": 2.0078, + "step": 2822 + }, + { + "epoch": 0.2124992943036188, + "grad_norm": 5.055265426635742, + "learning_rate": 9.15730573042849e-05, + "loss": 2.344, + "step": 2823 + }, + { + "epoch": 0.21257456858427898, + "grad_norm": 5.544818878173828, + "learning_rate": 9.156628301794325e-05, + "loss": 1.9671, + "step": 2824 + }, + { + "epoch": 0.21264984286493913, + "grad_norm": 5.053716659545898, + "learning_rate": 9.155950626060535e-05, + "loss": 2.0973, + "step": 2825 + }, + { + "epoch": 0.21272511714559927, + "grad_norm": 5.481313228607178, + "learning_rate": 9.155272703267403e-05, + "loss": 2.9367, + "step": 2826 + }, + { + "epoch": 0.21280039142625942, + "grad_norm": 7.212437152862549, + "learning_rate": 9.154594533455232e-05, + "loss": 2.1106, + "step": 2827 + }, + { + "epoch": 0.2128756657069196, + "grad_norm": 4.365868091583252, + "learning_rate": 9.153916116664338e-05, + "loss": 2.0575, + "step": 2828 + }, + { + "epoch": 0.21295093998757975, + "grad_norm": 5.271259784698486, + "learning_rate": 9.153237452935049e-05, + "loss": 2.0631, + "step": 2829 + }, + { + "epoch": 0.2130262142682399, + "grad_norm": 3.9800784587860107, + "learning_rate": 9.15255854230771e-05, + "loss": 1.9935, + "step": 2830 + }, + { + "epoch": 0.21310148854890004, + "grad_norm": 4.016927719116211, + "learning_rate": 9.151879384822682e-05, + "loss": 1.9925, + "step": 2831 + }, + { + "epoch": 0.21317676282956022, + "grad_norm": 5.193508148193359, + "learning_rate": 9.151199980520336e-05, + "loss": 2.2573, + "step": 2832 + }, + { + "epoch": 0.21325203711022037, + "grad_norm": 5.365655899047852, + "learning_rate": 9.150520329441063e-05, + "loss": 2.2443, + "step": 2833 + }, + { + "epoch": 0.21332731139088051, + "grad_norm": 5.702099800109863, + "learning_rate": 9.149840431625266e-05, + "loss": 2.0334, + "step": 2834 + }, + { + "epoch": 0.2134025856715407, + "grad_norm": 4.0529890060424805, + "learning_rate": 9.149160287113363e-05, + "loss": 1.9494, + "step": 2835 + }, + { + "epoch": 0.21347785995220084, + "grad_norm": 4.112748622894287, + "learning_rate": 9.148479895945784e-05, + "loss": 2.094, + "step": 2836 + }, + { + "epoch": 0.21355313423286099, + "grad_norm": 4.626742839813232, + "learning_rate": 9.147799258162981e-05, + "loss": 2.2898, + "step": 2837 + }, + { + "epoch": 0.21362840851352113, + "grad_norm": 7.863895893096924, + "learning_rate": 9.147118373805413e-05, + "loss": 2.5628, + "step": 2838 + }, + { + "epoch": 0.2137036827941813, + "grad_norm": 4.257025718688965, + "learning_rate": 9.146437242913556e-05, + "loss": 1.898, + "step": 2839 + }, + { + "epoch": 0.21377895707484146, + "grad_norm": 5.227463245391846, + "learning_rate": 9.145755865527904e-05, + "loss": 1.8511, + "step": 2840 + }, + { + "epoch": 0.2138542313555016, + "grad_norm": 4.386064052581787, + "learning_rate": 9.145074241688959e-05, + "loss": 1.9155, + "step": 2841 + }, + { + "epoch": 0.21392950563616175, + "grad_norm": 4.630362510681152, + "learning_rate": 9.144392371437245e-05, + "loss": 2.5548, + "step": 2842 + }, + { + "epoch": 0.21400477991682193, + "grad_norm": 4.202327728271484, + "learning_rate": 9.143710254813296e-05, + "loss": 1.8846, + "step": 2843 + }, + { + "epoch": 0.21408005419748208, + "grad_norm": 4.9407477378845215, + "learning_rate": 9.143027891857663e-05, + "loss": 2.4482, + "step": 2844 + }, + { + "epoch": 0.21415532847814223, + "grad_norm": 4.4251017570495605, + "learning_rate": 9.142345282610908e-05, + "loss": 2.0174, + "step": 2845 + }, + { + "epoch": 0.21423060275880237, + "grad_norm": 4.769384384155273, + "learning_rate": 9.141662427113613e-05, + "loss": 2.1288, + "step": 2846 + }, + { + "epoch": 0.21430587703946255, + "grad_norm": 5.329495906829834, + "learning_rate": 9.140979325406369e-05, + "loss": 1.8539, + "step": 2847 + }, + { + "epoch": 0.2143811513201227, + "grad_norm": 4.017544746398926, + "learning_rate": 9.140295977529788e-05, + "loss": 2.0459, + "step": 2848 + }, + { + "epoch": 0.21445642560078285, + "grad_norm": 4.645512580871582, + "learning_rate": 9.139612383524489e-05, + "loss": 2.3162, + "step": 2849 + }, + { + "epoch": 0.214531699881443, + "grad_norm": 4.684594631195068, + "learning_rate": 9.138928543431113e-05, + "loss": 1.9761, + "step": 2850 + }, + { + "epoch": 0.21460697416210317, + "grad_norm": 4.723893165588379, + "learning_rate": 9.138244457290311e-05, + "loss": 2.077, + "step": 2851 + }, + { + "epoch": 0.21468224844276332, + "grad_norm": 4.869959831237793, + "learning_rate": 9.13756012514275e-05, + "loss": 2.4885, + "step": 2852 + }, + { + "epoch": 0.21475752272342347, + "grad_norm": 4.7527689933776855, + "learning_rate": 9.13687554702911e-05, + "loss": 1.9081, + "step": 2853 + }, + { + "epoch": 0.21483279700408364, + "grad_norm": 5.106403827667236, + "learning_rate": 9.13619072299009e-05, + "loss": 2.1309, + "step": 2854 + }, + { + "epoch": 0.2149080712847438, + "grad_norm": 4.389427661895752, + "learning_rate": 9.135505653066399e-05, + "loss": 1.6671, + "step": 2855 + }, + { + "epoch": 0.21498334556540394, + "grad_norm": 3.9092814922332764, + "learning_rate": 9.134820337298763e-05, + "loss": 2.1034, + "step": 2856 + }, + { + "epoch": 0.2150586198460641, + "grad_norm": 4.435222148895264, + "learning_rate": 9.134134775727922e-05, + "loss": 2.2736, + "step": 2857 + }, + { + "epoch": 0.21513389412672426, + "grad_norm": 4.515613079071045, + "learning_rate": 9.133448968394631e-05, + "loss": 2.2219, + "step": 2858 + }, + { + "epoch": 0.2152091684073844, + "grad_norm": 6.38145112991333, + "learning_rate": 9.13276291533966e-05, + "loss": 1.9501, + "step": 2859 + }, + { + "epoch": 0.21528444268804456, + "grad_norm": 10.269143104553223, + "learning_rate": 9.132076616603791e-05, + "loss": 2.1356, + "step": 2860 + }, + { + "epoch": 0.2153597169687047, + "grad_norm": 4.484152793884277, + "learning_rate": 9.131390072227823e-05, + "loss": 2.3876, + "step": 2861 + }, + { + "epoch": 0.21543499124936488, + "grad_norm": 3.530357837677002, + "learning_rate": 9.13070328225257e-05, + "loss": 1.8473, + "step": 2862 + }, + { + "epoch": 0.21551026553002503, + "grad_norm": 4.104977130889893, + "learning_rate": 9.13001624671886e-05, + "loss": 2.3885, + "step": 2863 + }, + { + "epoch": 0.21558553981068518, + "grad_norm": 5.4687299728393555, + "learning_rate": 9.129328965667534e-05, + "loss": 2.3085, + "step": 2864 + }, + { + "epoch": 0.21566081409134533, + "grad_norm": 6.70385217666626, + "learning_rate": 9.128641439139448e-05, + "loss": 2.4711, + "step": 2865 + }, + { + "epoch": 0.2157360883720055, + "grad_norm": 4.164482116699219, + "learning_rate": 9.127953667175476e-05, + "loss": 1.967, + "step": 2866 + }, + { + "epoch": 0.21581136265266565, + "grad_norm": 8.239279747009277, + "learning_rate": 9.127265649816504e-05, + "loss": 1.9405, + "step": 2867 + }, + { + "epoch": 0.2158866369333258, + "grad_norm": 5.549882411956787, + "learning_rate": 9.12657738710343e-05, + "loss": 1.9267, + "step": 2868 + }, + { + "epoch": 0.21596191121398597, + "grad_norm": 4.876378059387207, + "learning_rate": 9.12588887907717e-05, + "loss": 2.2171, + "step": 2869 + }, + { + "epoch": 0.21603718549464612, + "grad_norm": 4.760179042816162, + "learning_rate": 9.125200125778657e-05, + "loss": 1.9083, + "step": 2870 + }, + { + "epoch": 0.21611245977530627, + "grad_norm": 4.280489921569824, + "learning_rate": 9.124511127248832e-05, + "loss": 1.9738, + "step": 2871 + }, + { + "epoch": 0.21618773405596642, + "grad_norm": 4.674266338348389, + "learning_rate": 9.123821883528653e-05, + "loss": 2.0772, + "step": 2872 + }, + { + "epoch": 0.2162630083366266, + "grad_norm": 4.658254623413086, + "learning_rate": 9.123132394659098e-05, + "loss": 2.5261, + "step": 2873 + }, + { + "epoch": 0.21633828261728674, + "grad_norm": 4.049352645874023, + "learning_rate": 9.122442660681153e-05, + "loss": 2.1706, + "step": 2874 + }, + { + "epoch": 0.2164135568979469, + "grad_norm": 3.8615479469299316, + "learning_rate": 9.12175268163582e-05, + "loss": 2.0985, + "step": 2875 + }, + { + "epoch": 0.21648883117860704, + "grad_norm": 4.295295715332031, + "learning_rate": 9.121062457564115e-05, + "loss": 2.2902, + "step": 2876 + }, + { + "epoch": 0.21656410545926721, + "grad_norm": 4.983851909637451, + "learning_rate": 9.120371988507073e-05, + "loss": 2.0875, + "step": 2877 + }, + { + "epoch": 0.21663937973992736, + "grad_norm": 4.82524299621582, + "learning_rate": 9.11968127450574e-05, + "loss": 2.1808, + "step": 2878 + }, + { + "epoch": 0.2167146540205875, + "grad_norm": 3.676748752593994, + "learning_rate": 9.118990315601175e-05, + "loss": 1.6644, + "step": 2879 + }, + { + "epoch": 0.21678992830124766, + "grad_norm": 6.100306510925293, + "learning_rate": 9.118299111834454e-05, + "loss": 2.0209, + "step": 2880 + }, + { + "epoch": 0.21686520258190783, + "grad_norm": 3.7935211658477783, + "learning_rate": 9.117607663246668e-05, + "loss": 2.1315, + "step": 2881 + }, + { + "epoch": 0.21694047686256798, + "grad_norm": 6.240111827850342, + "learning_rate": 9.116915969878921e-05, + "loss": 2.0366, + "step": 2882 + }, + { + "epoch": 0.21701575114322813, + "grad_norm": 5.97177267074585, + "learning_rate": 9.116224031772331e-05, + "loss": 2.0558, + "step": 2883 + }, + { + "epoch": 0.2170910254238883, + "grad_norm": 4.682295799255371, + "learning_rate": 9.115531848968035e-05, + "loss": 2.5279, + "step": 2884 + }, + { + "epoch": 0.21716629970454845, + "grad_norm": 5.760948657989502, + "learning_rate": 9.114839421507179e-05, + "loss": 2.1068, + "step": 2885 + }, + { + "epoch": 0.2172415739852086, + "grad_norm": 3.304828405380249, + "learning_rate": 9.114146749430926e-05, + "loss": 1.9157, + "step": 2886 + }, + { + "epoch": 0.21731684826586875, + "grad_norm": 4.40485143661499, + "learning_rate": 9.113453832780453e-05, + "loss": 2.0141, + "step": 2887 + }, + { + "epoch": 0.21739212254652893, + "grad_norm": 4.610844612121582, + "learning_rate": 9.112760671596953e-05, + "loss": 2.394, + "step": 2888 + }, + { + "epoch": 0.21746739682718907, + "grad_norm": 9.409422874450684, + "learning_rate": 9.112067265921633e-05, + "loss": 2.0827, + "step": 2889 + }, + { + "epoch": 0.21754267110784922, + "grad_norm": 4.706923484802246, + "learning_rate": 9.111373615795711e-05, + "loss": 1.9139, + "step": 2890 + }, + { + "epoch": 0.21761794538850937, + "grad_norm": 4.656936168670654, + "learning_rate": 9.110679721260424e-05, + "loss": 1.9504, + "step": 2891 + }, + { + "epoch": 0.21769321966916955, + "grad_norm": 4.418980121612549, + "learning_rate": 9.109985582357024e-05, + "loss": 1.9901, + "step": 2892 + }, + { + "epoch": 0.2177684939498297, + "grad_norm": 5.177255153656006, + "learning_rate": 9.109291199126774e-05, + "loss": 2.331, + "step": 2893 + }, + { + "epoch": 0.21784376823048984, + "grad_norm": 6.097462177276611, + "learning_rate": 9.108596571610952e-05, + "loss": 1.9097, + "step": 2894 + }, + { + "epoch": 0.21791904251115, + "grad_norm": 5.419043064117432, + "learning_rate": 9.107901699850853e-05, + "loss": 2.1482, + "step": 2895 + }, + { + "epoch": 0.21799431679181017, + "grad_norm": 4.536962985992432, + "learning_rate": 9.107206583887786e-05, + "loss": 2.1616, + "step": 2896 + }, + { + "epoch": 0.21806959107247031, + "grad_norm": 4.24278450012207, + "learning_rate": 9.106511223763073e-05, + "loss": 2.313, + "step": 2897 + }, + { + "epoch": 0.21814486535313046, + "grad_norm": 4.091751575469971, + "learning_rate": 9.105815619518048e-05, + "loss": 2.0003, + "step": 2898 + }, + { + "epoch": 0.2182201396337906, + "grad_norm": 5.067935943603516, + "learning_rate": 9.105119771194066e-05, + "loss": 2.1062, + "step": 2899 + }, + { + "epoch": 0.2182954139144508, + "grad_norm": 6.963771820068359, + "learning_rate": 9.104423678832494e-05, + "loss": 2.3203, + "step": 2900 + }, + { + "epoch": 0.21837068819511093, + "grad_norm": 4.553853988647461, + "learning_rate": 9.103727342474711e-05, + "loss": 2.0733, + "step": 2901 + }, + { + "epoch": 0.21844596247577108, + "grad_norm": 4.527489185333252, + "learning_rate": 9.103030762162112e-05, + "loss": 1.8942, + "step": 2902 + }, + { + "epoch": 0.21852123675643126, + "grad_norm": 5.539656162261963, + "learning_rate": 9.102333937936107e-05, + "loss": 1.6873, + "step": 2903 + }, + { + "epoch": 0.2185965110370914, + "grad_norm": 5.065639495849609, + "learning_rate": 9.101636869838119e-05, + "loss": 2.0554, + "step": 2904 + }, + { + "epoch": 0.21867178531775155, + "grad_norm": 5.687830448150635, + "learning_rate": 9.10093955790959e-05, + "loss": 2.0371, + "step": 2905 + }, + { + "epoch": 0.2187470595984117, + "grad_norm": 4.382602214813232, + "learning_rate": 9.10024200219197e-05, + "loss": 2.0513, + "step": 2906 + }, + { + "epoch": 0.21882233387907188, + "grad_norm": 4.833364486694336, + "learning_rate": 9.099544202726728e-05, + "loss": 1.9644, + "step": 2907 + }, + { + "epoch": 0.21889760815973203, + "grad_norm": 5.483856201171875, + "learning_rate": 9.098846159555346e-05, + "loss": 2.0015, + "step": 2908 + }, + { + "epoch": 0.21897288244039217, + "grad_norm": 5.608835697174072, + "learning_rate": 9.098147872719321e-05, + "loss": 1.7457, + "step": 2909 + }, + { + "epoch": 0.21904815672105232, + "grad_norm": 5.9853949546813965, + "learning_rate": 9.097449342260165e-05, + "loss": 2.6678, + "step": 2910 + }, + { + "epoch": 0.2191234310017125, + "grad_norm": 4.045847415924072, + "learning_rate": 9.0967505682194e-05, + "loss": 1.9462, + "step": 2911 + }, + { + "epoch": 0.21919870528237265, + "grad_norm": 5.401683807373047, + "learning_rate": 9.096051550638572e-05, + "loss": 2.281, + "step": 2912 + }, + { + "epoch": 0.2192739795630328, + "grad_norm": 6.834976673126221, + "learning_rate": 9.095352289559228e-05, + "loss": 2.0505, + "step": 2913 + }, + { + "epoch": 0.21934925384369294, + "grad_norm": 5.75833797454834, + "learning_rate": 9.094652785022945e-05, + "loss": 1.8767, + "step": 2914 + }, + { + "epoch": 0.21942452812435312, + "grad_norm": 3.7877635955810547, + "learning_rate": 9.093953037071301e-05, + "loss": 2.2009, + "step": 2915 + }, + { + "epoch": 0.21949980240501327, + "grad_norm": 5.442332744598389, + "learning_rate": 9.093253045745895e-05, + "loss": 1.9677, + "step": 2916 + }, + { + "epoch": 0.21957507668567341, + "grad_norm": 4.543300628662109, + "learning_rate": 9.092552811088342e-05, + "loss": 1.8725, + "step": 2917 + }, + { + "epoch": 0.2196503509663336, + "grad_norm": 6.140537261962891, + "learning_rate": 9.091852333140267e-05, + "loss": 1.9658, + "step": 2918 + }, + { + "epoch": 0.21972562524699374, + "grad_norm": 6.194490432739258, + "learning_rate": 9.091151611943312e-05, + "loss": 2.2637, + "step": 2919 + }, + { + "epoch": 0.2198008995276539, + "grad_norm": 5.404478073120117, + "learning_rate": 9.09045064753913e-05, + "loss": 1.8551, + "step": 2920 + }, + { + "epoch": 0.21987617380831403, + "grad_norm": 4.314300537109375, + "learning_rate": 9.089749439969398e-05, + "loss": 1.8512, + "step": 2921 + }, + { + "epoch": 0.2199514480889742, + "grad_norm": 4.811122894287109, + "learning_rate": 9.089047989275793e-05, + "loss": 1.7737, + "step": 2922 + }, + { + "epoch": 0.22002672236963436, + "grad_norm": 4.302212238311768, + "learning_rate": 9.088346295500019e-05, + "loss": 2.1659, + "step": 2923 + }, + { + "epoch": 0.2201019966502945, + "grad_norm": 5.338231563568115, + "learning_rate": 9.087644358683789e-05, + "loss": 2.0363, + "step": 2924 + }, + { + "epoch": 0.22017727093095465, + "grad_norm": 5.035805702209473, + "learning_rate": 9.086942178868829e-05, + "loss": 2.1317, + "step": 2925 + }, + { + "epoch": 0.22025254521161483, + "grad_norm": 5.151651382446289, + "learning_rate": 9.086239756096886e-05, + "loss": 2.3377, + "step": 2926 + }, + { + "epoch": 0.22032781949227498, + "grad_norm": 4.626562595367432, + "learning_rate": 9.085537090409712e-05, + "loss": 2.0775, + "step": 2927 + }, + { + "epoch": 0.22040309377293513, + "grad_norm": 6.436605930328369, + "learning_rate": 9.084834181849081e-05, + "loss": 1.9339, + "step": 2928 + }, + { + "epoch": 0.22047836805359528, + "grad_norm": 5.9805450439453125, + "learning_rate": 9.08413103045678e-05, + "loss": 2.0937, + "step": 2929 + }, + { + "epoch": 0.22055364233425545, + "grad_norm": 4.39972448348999, + "learning_rate": 9.083427636274606e-05, + "loss": 1.9688, + "step": 2930 + }, + { + "epoch": 0.2206289166149156, + "grad_norm": 4.032160758972168, + "learning_rate": 9.082723999344377e-05, + "loss": 2.2035, + "step": 2931 + }, + { + "epoch": 0.22070419089557575, + "grad_norm": 7.017184734344482, + "learning_rate": 9.082020119707921e-05, + "loss": 1.9799, + "step": 2932 + }, + { + "epoch": 0.22077946517623592, + "grad_norm": 5.398980617523193, + "learning_rate": 9.08131599740708e-05, + "loss": 2.1431, + "step": 2933 + }, + { + "epoch": 0.22085473945689607, + "grad_norm": 4.855225563049316, + "learning_rate": 9.080611632483714e-05, + "loss": 2.2772, + "step": 2934 + }, + { + "epoch": 0.22093001373755622, + "grad_norm": 5.293283462524414, + "learning_rate": 9.079907024979696e-05, + "loss": 2.1082, + "step": 2935 + }, + { + "epoch": 0.22100528801821637, + "grad_norm": 5.850619316101074, + "learning_rate": 9.079202174936912e-05, + "loss": 1.7753, + "step": 2936 + }, + { + "epoch": 0.22108056229887654, + "grad_norm": 4.3017191886901855, + "learning_rate": 9.078497082397262e-05, + "loss": 2.2775, + "step": 2937 + }, + { + "epoch": 0.2211558365795367, + "grad_norm": 4.076857566833496, + "learning_rate": 9.077791747402664e-05, + "loss": 2.1259, + "step": 2938 + }, + { + "epoch": 0.22123111086019684, + "grad_norm": 4.446213722229004, + "learning_rate": 9.077086169995048e-05, + "loss": 2.1826, + "step": 2939 + }, + { + "epoch": 0.221306385140857, + "grad_norm": 4.013009548187256, + "learning_rate": 9.076380350216355e-05, + "loss": 2.0302, + "step": 2940 + }, + { + "epoch": 0.22138165942151716, + "grad_norm": 4.595061302185059, + "learning_rate": 9.075674288108549e-05, + "loss": 1.7048, + "step": 2941 + }, + { + "epoch": 0.2214569337021773, + "grad_norm": 8.099377632141113, + "learning_rate": 9.0749679837136e-05, + "loss": 2.5522, + "step": 2942 + }, + { + "epoch": 0.22153220798283746, + "grad_norm": 8.246782302856445, + "learning_rate": 9.074261437073497e-05, + "loss": 2.1225, + "step": 2943 + }, + { + "epoch": 0.2216074822634976, + "grad_norm": 4.502189636230469, + "learning_rate": 9.073554648230241e-05, + "loss": 2.2436, + "step": 2944 + }, + { + "epoch": 0.22168275654415778, + "grad_norm": 5.9204816818237305, + "learning_rate": 9.07284761722585e-05, + "loss": 2.0374, + "step": 2945 + }, + { + "epoch": 0.22175803082481793, + "grad_norm": 4.8395538330078125, + "learning_rate": 9.072140344102355e-05, + "loss": 2.0299, + "step": 2946 + }, + { + "epoch": 0.22183330510547808, + "grad_norm": 6.0673675537109375, + "learning_rate": 9.0714328289018e-05, + "loss": 2.2062, + "step": 2947 + }, + { + "epoch": 0.22190857938613825, + "grad_norm": 4.742865562438965, + "learning_rate": 9.070725071666248e-05, + "loss": 1.9603, + "step": 2948 + }, + { + "epoch": 0.2219838536667984, + "grad_norm": 5.113400459289551, + "learning_rate": 9.070017072437769e-05, + "loss": 2.0069, + "step": 2949 + }, + { + "epoch": 0.22205912794745855, + "grad_norm": 4.952245712280273, + "learning_rate": 9.069308831258452e-05, + "loss": 2.2253, + "step": 2950 + }, + { + "epoch": 0.2221344022281187, + "grad_norm": 5.182546615600586, + "learning_rate": 9.068600348170401e-05, + "loss": 1.9472, + "step": 2951 + }, + { + "epoch": 0.22220967650877888, + "grad_norm": 9.822628021240234, + "learning_rate": 9.067891623215737e-05, + "loss": 2.3668, + "step": 2952 + }, + { + "epoch": 0.22228495078943902, + "grad_norm": 6.70409631729126, + "learning_rate": 9.067182656436585e-05, + "loss": 2.1276, + "step": 2953 + }, + { + "epoch": 0.22236022507009917, + "grad_norm": 5.011364459991455, + "learning_rate": 9.066473447875094e-05, + "loss": 2.1604, + "step": 2954 + }, + { + "epoch": 0.22243549935075932, + "grad_norm": 6.013547420501709, + "learning_rate": 9.065763997573427e-05, + "loss": 1.9803, + "step": 2955 + }, + { + "epoch": 0.2225107736314195, + "grad_norm": 5.591979026794434, + "learning_rate": 9.065054305573755e-05, + "loss": 1.8404, + "step": 2956 + }, + { + "epoch": 0.22258604791207964, + "grad_norm": 5.186059474945068, + "learning_rate": 9.064344371918267e-05, + "loss": 1.9471, + "step": 2957 + }, + { + "epoch": 0.2226613221927398, + "grad_norm": 5.612191200256348, + "learning_rate": 9.06363419664917e-05, + "loss": 2.2213, + "step": 2958 + }, + { + "epoch": 0.22273659647339994, + "grad_norm": 4.949213027954102, + "learning_rate": 9.062923779808678e-05, + "loss": 1.7411, + "step": 2959 + }, + { + "epoch": 0.22281187075406012, + "grad_norm": 4.895546913146973, + "learning_rate": 9.062213121439027e-05, + "loss": 2.1532, + "step": 2960 + }, + { + "epoch": 0.22288714503472026, + "grad_norm": 6.915483474731445, + "learning_rate": 9.061502221582461e-05, + "loss": 2.4376, + "step": 2961 + }, + { + "epoch": 0.2229624193153804, + "grad_norm": 4.932807445526123, + "learning_rate": 9.060791080281241e-05, + "loss": 1.9923, + "step": 2962 + }, + { + "epoch": 0.22303769359604056, + "grad_norm": 6.073930263519287, + "learning_rate": 9.060079697577644e-05, + "loss": 1.876, + "step": 2963 + }, + { + "epoch": 0.22311296787670074, + "grad_norm": 5.532196998596191, + "learning_rate": 9.059368073513958e-05, + "loss": 2.0465, + "step": 2964 + }, + { + "epoch": 0.22318824215736088, + "grad_norm": 4.24959135055542, + "learning_rate": 9.058656208132487e-05, + "loss": 2.2441, + "step": 2965 + }, + { + "epoch": 0.22326351643802103, + "grad_norm": 4.8348236083984375, + "learning_rate": 9.057944101475553e-05, + "loss": 2.1498, + "step": 2966 + }, + { + "epoch": 0.2233387907186812, + "grad_norm": 4.467230319976807, + "learning_rate": 9.057231753585483e-05, + "loss": 1.8951, + "step": 2967 + }, + { + "epoch": 0.22341406499934136, + "grad_norm": 4.355803966522217, + "learning_rate": 9.056519164504627e-05, + "loss": 2.3531, + "step": 2968 + }, + { + "epoch": 0.2234893392800015, + "grad_norm": 5.565178394317627, + "learning_rate": 9.055806334275347e-05, + "loss": 2.0837, + "step": 2969 + }, + { + "epoch": 0.22356461356066165, + "grad_norm": 3.871276617050171, + "learning_rate": 9.055093262940017e-05, + "loss": 2.2883, + "step": 2970 + }, + { + "epoch": 0.22363988784132183, + "grad_norm": 4.407426357269287, + "learning_rate": 9.054379950541028e-05, + "loss": 1.9799, + "step": 2971 + }, + { + "epoch": 0.22371516212198198, + "grad_norm": 7.752662181854248, + "learning_rate": 9.053666397120786e-05, + "loss": 2.3833, + "step": 2972 + }, + { + "epoch": 0.22379043640264212, + "grad_norm": 8.185113906860352, + "learning_rate": 9.052952602721706e-05, + "loss": 2.2716, + "step": 2973 + }, + { + "epoch": 0.22386571068330227, + "grad_norm": 5.350937366485596, + "learning_rate": 9.052238567386224e-05, + "loss": 2.2734, + "step": 2974 + }, + { + "epoch": 0.22394098496396245, + "grad_norm": 4.131227970123291, + "learning_rate": 9.051524291156785e-05, + "loss": 2.2203, + "step": 2975 + }, + { + "epoch": 0.2240162592446226, + "grad_norm": 3.5607070922851562, + "learning_rate": 9.050809774075853e-05, + "loss": 2.0199, + "step": 2976 + }, + { + "epoch": 0.22409153352528274, + "grad_norm": 4.787525653839111, + "learning_rate": 9.050095016185903e-05, + "loss": 1.9822, + "step": 2977 + }, + { + "epoch": 0.2241668078059429, + "grad_norm": 5.674337863922119, + "learning_rate": 9.049380017529426e-05, + "loss": 2.0224, + "step": 2978 + }, + { + "epoch": 0.22424208208660307, + "grad_norm": 3.7880828380584717, + "learning_rate": 9.048664778148924e-05, + "loss": 2.1697, + "step": 2979 + }, + { + "epoch": 0.22431735636726322, + "grad_norm": 5.682531356811523, + "learning_rate": 9.047949298086919e-05, + "loss": 2.4035, + "step": 2980 + }, + { + "epoch": 0.22439263064792336, + "grad_norm": 4.9303178787231445, + "learning_rate": 9.047233577385944e-05, + "loss": 2.0629, + "step": 2981 + }, + { + "epoch": 0.22446790492858354, + "grad_norm": 5.042529582977295, + "learning_rate": 9.046517616088545e-05, + "loss": 2.221, + "step": 2982 + }, + { + "epoch": 0.2245431792092437, + "grad_norm": 4.45997953414917, + "learning_rate": 9.045801414237284e-05, + "loss": 1.9773, + "step": 2983 + }, + { + "epoch": 0.22461845348990384, + "grad_norm": 4.140063285827637, + "learning_rate": 9.045084971874738e-05, + "loss": 1.9895, + "step": 2984 + }, + { + "epoch": 0.22469372777056398, + "grad_norm": 5.330562591552734, + "learning_rate": 9.044368289043497e-05, + "loss": 2.411, + "step": 2985 + }, + { + "epoch": 0.22476900205122416, + "grad_norm": 4.238677024841309, + "learning_rate": 9.043651365786166e-05, + "loss": 2.2728, + "step": 2986 + }, + { + "epoch": 0.2248442763318843, + "grad_norm": 4.427063465118408, + "learning_rate": 9.042934202145364e-05, + "loss": 2.1105, + "step": 2987 + }, + { + "epoch": 0.22491955061254446, + "grad_norm": 4.295475959777832, + "learning_rate": 9.042216798163722e-05, + "loss": 1.9844, + "step": 2988 + }, + { + "epoch": 0.2249948248932046, + "grad_norm": 4.440731048583984, + "learning_rate": 9.041499153883893e-05, + "loss": 2.2493, + "step": 2989 + }, + { + "epoch": 0.22507009917386478, + "grad_norm": 5.3758063316345215, + "learning_rate": 9.040781269348533e-05, + "loss": 2.4395, + "step": 2990 + }, + { + "epoch": 0.22514537345452493, + "grad_norm": 4.190469264984131, + "learning_rate": 9.040063144600322e-05, + "loss": 2.3388, + "step": 2991 + }, + { + "epoch": 0.22522064773518508, + "grad_norm": 4.746899127960205, + "learning_rate": 9.03934477968195e-05, + "loss": 1.9926, + "step": 2992 + }, + { + "epoch": 0.22529592201584522, + "grad_norm": 4.665939807891846, + "learning_rate": 9.03862617463612e-05, + "loss": 2.0094, + "step": 2993 + }, + { + "epoch": 0.2253711962965054, + "grad_norm": 3.8885819911956787, + "learning_rate": 9.037907329505553e-05, + "loss": 1.9834, + "step": 2994 + }, + { + "epoch": 0.22544647057716555, + "grad_norm": 5.015994548797607, + "learning_rate": 9.03718824433298e-05, + "loss": 2.1075, + "step": 2995 + }, + { + "epoch": 0.2255217448578257, + "grad_norm": 6.6822333335876465, + "learning_rate": 9.036468919161151e-05, + "loss": 2.2278, + "step": 2996 + }, + { + "epoch": 0.22559701913848587, + "grad_norm": 4.36100435256958, + "learning_rate": 9.035749354032825e-05, + "loss": 1.9734, + "step": 2997 + }, + { + "epoch": 0.22567229341914602, + "grad_norm": 3.667426586151123, + "learning_rate": 9.035029548990782e-05, + "loss": 2.008, + "step": 2998 + }, + { + "epoch": 0.22574756769980617, + "grad_norm": 4.259113788604736, + "learning_rate": 9.034309504077809e-05, + "loss": 1.6353, + "step": 2999 + }, + { + "epoch": 0.22582284198046632, + "grad_norm": 3.9176275730133057, + "learning_rate": 9.033589219336711e-05, + "loss": 1.97, + "step": 3000 + }, + { + "epoch": 0.2258981162611265, + "grad_norm": 6.052648544311523, + "learning_rate": 9.03286869481031e-05, + "loss": 2.7073, + "step": 3001 + }, + { + "epoch": 0.22597339054178664, + "grad_norm": 6.663978576660156, + "learning_rate": 9.032147930541435e-05, + "loss": 2.4969, + "step": 3002 + }, + { + "epoch": 0.2260486648224468, + "grad_norm": 8.080892562866211, + "learning_rate": 9.031426926572935e-05, + "loss": 2.1463, + "step": 3003 + }, + { + "epoch": 0.22612393910310694, + "grad_norm": 6.703983306884766, + "learning_rate": 9.030705682947671e-05, + "loss": 1.9333, + "step": 3004 + }, + { + "epoch": 0.2261992133837671, + "grad_norm": 7.132099151611328, + "learning_rate": 9.029984199708521e-05, + "loss": 2.0013, + "step": 3005 + }, + { + "epoch": 0.22627448766442726, + "grad_norm": 4.324505805969238, + "learning_rate": 9.029262476898372e-05, + "loss": 1.8208, + "step": 3006 + }, + { + "epoch": 0.2263497619450874, + "grad_norm": 4.36423921585083, + "learning_rate": 9.028540514560132e-05, + "loss": 2.0609, + "step": 3007 + }, + { + "epoch": 0.22642503622574756, + "grad_norm": 6.873592376708984, + "learning_rate": 9.027818312736717e-05, + "loss": 1.9287, + "step": 3008 + }, + { + "epoch": 0.22650031050640773, + "grad_norm": 6.495251655578613, + "learning_rate": 9.02709587147106e-05, + "loss": 2.2841, + "step": 3009 + }, + { + "epoch": 0.22657558478706788, + "grad_norm": 4.629186153411865, + "learning_rate": 9.026373190806108e-05, + "loss": 2.2215, + "step": 3010 + }, + { + "epoch": 0.22665085906772803, + "grad_norm": 4.751518249511719, + "learning_rate": 9.025650270784823e-05, + "loss": 1.9765, + "step": 3011 + }, + { + "epoch": 0.22672613334838818, + "grad_norm": 4.488245964050293, + "learning_rate": 9.024927111450182e-05, + "loss": 2.1709, + "step": 3012 + }, + { + "epoch": 0.22680140762904835, + "grad_norm": 4.00231409072876, + "learning_rate": 9.02420371284517e-05, + "loss": 2.157, + "step": 3013 + }, + { + "epoch": 0.2268766819097085, + "grad_norm": 7.076344966888428, + "learning_rate": 9.023480075012795e-05, + "loss": 2.1229, + "step": 3014 + }, + { + "epoch": 0.22695195619036865, + "grad_norm": 6.816217422485352, + "learning_rate": 9.022756197996077e-05, + "loss": 2.0886, + "step": 3015 + }, + { + "epoch": 0.22702723047102882, + "grad_norm": 4.8998260498046875, + "learning_rate": 9.022032081838042e-05, + "loss": 2.1428, + "step": 3016 + }, + { + "epoch": 0.22710250475168897, + "grad_norm": 4.486428737640381, + "learning_rate": 9.021307726581743e-05, + "loss": 1.7001, + "step": 3017 + }, + { + "epoch": 0.22717777903234912, + "grad_norm": 5.397170543670654, + "learning_rate": 9.020583132270235e-05, + "loss": 2.277, + "step": 3018 + }, + { + "epoch": 0.22725305331300927, + "grad_norm": 4.8494791984558105, + "learning_rate": 9.019858298946598e-05, + "loss": 2.0772, + "step": 3019 + }, + { + "epoch": 0.22732832759366944, + "grad_norm": 6.169023036956787, + "learning_rate": 9.01913322665392e-05, + "loss": 2.2601, + "step": 3020 + }, + { + "epoch": 0.2274036018743296, + "grad_norm": 4.9365997314453125, + "learning_rate": 9.018407915435302e-05, + "loss": 1.977, + "step": 3021 + }, + { + "epoch": 0.22747887615498974, + "grad_norm": 6.302662372589111, + "learning_rate": 9.017682365333866e-05, + "loss": 2.5235, + "step": 3022 + }, + { + "epoch": 0.2275541504356499, + "grad_norm": 8.961931228637695, + "learning_rate": 9.01695657639274e-05, + "loss": 2.1748, + "step": 3023 + }, + { + "epoch": 0.22762942471631006, + "grad_norm": 4.763588905334473, + "learning_rate": 9.016230548655071e-05, + "loss": 1.9832, + "step": 3024 + }, + { + "epoch": 0.2277046989969702, + "grad_norm": 6.08983039855957, + "learning_rate": 9.01550428216402e-05, + "loss": 2.0133, + "step": 3025 + }, + { + "epoch": 0.22777997327763036, + "grad_norm": 4.464398384094238, + "learning_rate": 9.014777776962762e-05, + "loss": 2.1519, + "step": 3026 + }, + { + "epoch": 0.2278552475582905, + "grad_norm": 4.90877628326416, + "learning_rate": 9.014051033094484e-05, + "loss": 2.276, + "step": 3027 + }, + { + "epoch": 0.22793052183895068, + "grad_norm": 3.8603129386901855, + "learning_rate": 9.013324050602391e-05, + "loss": 1.9222, + "step": 3028 + }, + { + "epoch": 0.22800579611961083, + "grad_norm": 4.96784782409668, + "learning_rate": 9.012596829529697e-05, + "loss": 2.118, + "step": 3029 + }, + { + "epoch": 0.22808107040027098, + "grad_norm": 7.061264514923096, + "learning_rate": 9.011869369919636e-05, + "loss": 2.3798, + "step": 3030 + }, + { + "epoch": 0.22815634468093116, + "grad_norm": 4.7067766189575195, + "learning_rate": 9.011141671815452e-05, + "loss": 2.07, + "step": 3031 + }, + { + "epoch": 0.2282316189615913, + "grad_norm": 4.883420467376709, + "learning_rate": 9.010413735260404e-05, + "loss": 2.2647, + "step": 3032 + }, + { + "epoch": 0.22830689324225145, + "grad_norm": 5.399378776550293, + "learning_rate": 9.009685560297768e-05, + "loss": 2.0837, + "step": 3033 + }, + { + "epoch": 0.2283821675229116, + "grad_norm": 4.923521041870117, + "learning_rate": 9.00895714697083e-05, + "loss": 1.9732, + "step": 3034 + }, + { + "epoch": 0.22845744180357178, + "grad_norm": 5.1057939529418945, + "learning_rate": 9.008228495322892e-05, + "loss": 2.117, + "step": 3035 + }, + { + "epoch": 0.22853271608423192, + "grad_norm": 3.8005971908569336, + "learning_rate": 9.007499605397272e-05, + "loss": 2.1132, + "step": 3036 + }, + { + "epoch": 0.22860799036489207, + "grad_norm": 4.073781967163086, + "learning_rate": 9.006770477237298e-05, + "loss": 2.2012, + "step": 3037 + }, + { + "epoch": 0.22868326464555222, + "grad_norm": 4.875649452209473, + "learning_rate": 9.006041110886317e-05, + "loss": 1.9903, + "step": 3038 + }, + { + "epoch": 0.2287585389262124, + "grad_norm": 4.754485607147217, + "learning_rate": 9.005311506387686e-05, + "loss": 2.1516, + "step": 3039 + }, + { + "epoch": 0.22883381320687254, + "grad_norm": 4.883895397186279, + "learning_rate": 9.00458166378478e-05, + "loss": 2.2613, + "step": 3040 + }, + { + "epoch": 0.2289090874875327, + "grad_norm": 4.565127849578857, + "learning_rate": 9.003851583120981e-05, + "loss": 2.1924, + "step": 3041 + }, + { + "epoch": 0.22898436176819284, + "grad_norm": 5.656824111938477, + "learning_rate": 9.003121264439697e-05, + "loss": 2.3375, + "step": 3042 + }, + { + "epoch": 0.22905963604885302, + "grad_norm": 4.560620307922363, + "learning_rate": 9.002390707784338e-05, + "loss": 2.3094, + "step": 3043 + }, + { + "epoch": 0.22913491032951316, + "grad_norm": 5.578507423400879, + "learning_rate": 9.001659913198336e-05, + "loss": 1.891, + "step": 3044 + }, + { + "epoch": 0.2292101846101733, + "grad_norm": 4.357603549957275, + "learning_rate": 9.000928880725136e-05, + "loss": 2.0809, + "step": 3045 + }, + { + "epoch": 0.2292854588908335, + "grad_norm": 3.7798662185668945, + "learning_rate": 9.000197610408192e-05, + "loss": 1.7311, + "step": 3046 + }, + { + "epoch": 0.22936073317149364, + "grad_norm": 4.265188694000244, + "learning_rate": 8.999466102290978e-05, + "loss": 2.3679, + "step": 3047 + }, + { + "epoch": 0.22943600745215378, + "grad_norm": 7.112925052642822, + "learning_rate": 8.998734356416981e-05, + "loss": 2.2524, + "step": 3048 + }, + { + "epoch": 0.22951128173281393, + "grad_norm": 3.864196300506592, + "learning_rate": 8.998002372829699e-05, + "loss": 2.159, + "step": 3049 + }, + { + "epoch": 0.2295865560134741, + "grad_norm": 4.573678970336914, + "learning_rate": 8.997270151572649e-05, + "loss": 1.925, + "step": 3050 + }, + { + "epoch": 0.22966183029413426, + "grad_norm": 4.890881538391113, + "learning_rate": 8.996537692689357e-05, + "loss": 1.9123, + "step": 3051 + }, + { + "epoch": 0.2297371045747944, + "grad_norm": 5.622613430023193, + "learning_rate": 8.995804996223367e-05, + "loss": 1.9269, + "step": 3052 + }, + { + "epoch": 0.22981237885545455, + "grad_norm": 6.246424674987793, + "learning_rate": 8.995072062218235e-05, + "loss": 2.2979, + "step": 3053 + }, + { + "epoch": 0.22988765313611473, + "grad_norm": 4.573148250579834, + "learning_rate": 8.99433889071753e-05, + "loss": 2.0967, + "step": 3054 + }, + { + "epoch": 0.22996292741677488, + "grad_norm": 4.3591179847717285, + "learning_rate": 8.993605481764841e-05, + "loss": 2.2305, + "step": 3055 + }, + { + "epoch": 0.23003820169743502, + "grad_norm": 5.699021339416504, + "learning_rate": 8.992871835403767e-05, + "loss": 1.93, + "step": 3056 + }, + { + "epoch": 0.23011347597809517, + "grad_norm": 4.548457622528076, + "learning_rate": 8.992137951677916e-05, + "loss": 2.0882, + "step": 3057 + }, + { + "epoch": 0.23018875025875535, + "grad_norm": 6.617466449737549, + "learning_rate": 8.99140383063092e-05, + "loss": 1.6243, + "step": 3058 + }, + { + "epoch": 0.2302640245394155, + "grad_norm": 4.906440734863281, + "learning_rate": 8.99066947230642e-05, + "loss": 2.2427, + "step": 3059 + }, + { + "epoch": 0.23033929882007564, + "grad_norm": 4.505703926086426, + "learning_rate": 8.989934876748068e-05, + "loss": 2.402, + "step": 3060 + }, + { + "epoch": 0.2304145731007358, + "grad_norm": 5.4420928955078125, + "learning_rate": 8.989200043999537e-05, + "loss": 2.0293, + "step": 3061 + }, + { + "epoch": 0.23048984738139597, + "grad_norm": 4.512494087219238, + "learning_rate": 8.988464974104509e-05, + "loss": 1.9191, + "step": 3062 + }, + { + "epoch": 0.23056512166205612, + "grad_norm": 4.833025932312012, + "learning_rate": 8.987729667106683e-05, + "loss": 2.0507, + "step": 3063 + }, + { + "epoch": 0.23064039594271626, + "grad_norm": 5.658215045928955, + "learning_rate": 8.98699412304977e-05, + "loss": 2.0965, + "step": 3064 + }, + { + "epoch": 0.23071567022337644, + "grad_norm": 6.153744697570801, + "learning_rate": 8.986258341977498e-05, + "loss": 2.4311, + "step": 3065 + }, + { + "epoch": 0.2307909445040366, + "grad_norm": 5.608066558837891, + "learning_rate": 8.985522323933603e-05, + "loss": 2.1879, + "step": 3066 + }, + { + "epoch": 0.23086621878469674, + "grad_norm": 4.241041660308838, + "learning_rate": 8.984786068961843e-05, + "loss": 2.2134, + "step": 3067 + }, + { + "epoch": 0.23094149306535688, + "grad_norm": 5.847661018371582, + "learning_rate": 8.984049577105985e-05, + "loss": 2.1156, + "step": 3068 + }, + { + "epoch": 0.23101676734601706, + "grad_norm": 4.819242000579834, + "learning_rate": 8.983312848409811e-05, + "loss": 2.2144, + "step": 3069 + }, + { + "epoch": 0.2310920416266772, + "grad_norm": 4.251035690307617, + "learning_rate": 8.982575882917119e-05, + "loss": 2.1086, + "step": 3070 + }, + { + "epoch": 0.23116731590733736, + "grad_norm": 5.436933994293213, + "learning_rate": 8.981838680671715e-05, + "loss": 1.9431, + "step": 3071 + }, + { + "epoch": 0.2312425901879975, + "grad_norm": 4.8714399337768555, + "learning_rate": 8.981101241717427e-05, + "loss": 2.123, + "step": 3072 + }, + { + "epoch": 0.23131786446865768, + "grad_norm": 4.5080671310424805, + "learning_rate": 8.980363566098096e-05, + "loss": 2.2184, + "step": 3073 + }, + { + "epoch": 0.23139313874931783, + "grad_norm": 5.637725830078125, + "learning_rate": 8.97962565385757e-05, + "loss": 2.7501, + "step": 3074 + }, + { + "epoch": 0.23146841302997798, + "grad_norm": 4.687811374664307, + "learning_rate": 8.978887505039717e-05, + "loss": 2.127, + "step": 3075 + }, + { + "epoch": 0.23154368731063812, + "grad_norm": 4.3032355308532715, + "learning_rate": 8.97814911968842e-05, + "loss": 1.8478, + "step": 3076 + }, + { + "epoch": 0.2316189615912983, + "grad_norm": 6.097776889801025, + "learning_rate": 8.977410497847571e-05, + "loss": 2.492, + "step": 3077 + }, + { + "epoch": 0.23169423587195845, + "grad_norm": 5.614762783050537, + "learning_rate": 8.976671639561082e-05, + "loss": 1.8767, + "step": 3078 + }, + { + "epoch": 0.2317695101526186, + "grad_norm": 7.298680782318115, + "learning_rate": 8.975932544872874e-05, + "loss": 2.2084, + "step": 3079 + }, + { + "epoch": 0.23184478443327877, + "grad_norm": 5.520145416259766, + "learning_rate": 8.975193213826885e-05, + "loss": 1.9535, + "step": 3080 + }, + { + "epoch": 0.23192005871393892, + "grad_norm": 4.96815824508667, + "learning_rate": 8.974453646467063e-05, + "loss": 1.8722, + "step": 3081 + }, + { + "epoch": 0.23199533299459907, + "grad_norm": 4.702606201171875, + "learning_rate": 8.973713842837378e-05, + "loss": 2.298, + "step": 3082 + }, + { + "epoch": 0.23207060727525922, + "grad_norm": 6.2534284591674805, + "learning_rate": 8.972973802981805e-05, + "loss": 1.5298, + "step": 3083 + }, + { + "epoch": 0.2321458815559194, + "grad_norm": 4.1786394119262695, + "learning_rate": 8.972233526944343e-05, + "loss": 1.8238, + "step": 3084 + }, + { + "epoch": 0.23222115583657954, + "grad_norm": 5.316036701202393, + "learning_rate": 8.971493014768991e-05, + "loss": 1.9646, + "step": 3085 + }, + { + "epoch": 0.2322964301172397, + "grad_norm": 3.139407157897949, + "learning_rate": 8.970752266499778e-05, + "loss": 2.2641, + "step": 3086 + }, + { + "epoch": 0.23237170439789984, + "grad_norm": 5.529141902923584, + "learning_rate": 8.970011282180734e-05, + "loss": 2.1762, + "step": 3087 + }, + { + "epoch": 0.23244697867856, + "grad_norm": 4.312731742858887, + "learning_rate": 8.969270061855911e-05, + "loss": 2.1871, + "step": 3088 + }, + { + "epoch": 0.23252225295922016, + "grad_norm": 4.658344745635986, + "learning_rate": 8.968528605569373e-05, + "loss": 2.0451, + "step": 3089 + }, + { + "epoch": 0.2325975272398803, + "grad_norm": 5.684056282043457, + "learning_rate": 8.967786913365195e-05, + "loss": 2.4996, + "step": 3090 + }, + { + "epoch": 0.23267280152054046, + "grad_norm": 5.3817458152771, + "learning_rate": 8.96704498528747e-05, + "loss": 2.2046, + "step": 3091 + }, + { + "epoch": 0.23274807580120063, + "grad_norm": 5.300045967102051, + "learning_rate": 8.966302821380304e-05, + "loss": 1.9514, + "step": 3092 + }, + { + "epoch": 0.23282335008186078, + "grad_norm": 3.844289541244507, + "learning_rate": 8.965560421687816e-05, + "loss": 1.7736, + "step": 3093 + }, + { + "epoch": 0.23289862436252093, + "grad_norm": 4.255853652954102, + "learning_rate": 8.964817786254138e-05, + "loss": 1.8253, + "step": 3094 + }, + { + "epoch": 0.2329738986431811, + "grad_norm": 6.417258262634277, + "learning_rate": 8.964074915123421e-05, + "loss": 2.1792, + "step": 3095 + }, + { + "epoch": 0.23304917292384125, + "grad_norm": 5.512781143188477, + "learning_rate": 8.963331808339823e-05, + "loss": 2.0537, + "step": 3096 + }, + { + "epoch": 0.2331244472045014, + "grad_norm": 4.924642562866211, + "learning_rate": 8.962588465947522e-05, + "loss": 2.0586, + "step": 3097 + }, + { + "epoch": 0.23319972148516155, + "grad_norm": 4.606882095336914, + "learning_rate": 8.961844887990706e-05, + "loss": 2.0768, + "step": 3098 + }, + { + "epoch": 0.23327499576582172, + "grad_norm": 4.865094184875488, + "learning_rate": 8.96110107451358e-05, + "loss": 1.9203, + "step": 3099 + }, + { + "epoch": 0.23335027004648187, + "grad_norm": 4.5715837478637695, + "learning_rate": 8.960357025560359e-05, + "loss": 2.0566, + "step": 3100 + }, + { + "epoch": 0.23342554432714202, + "grad_norm": 3.477773904800415, + "learning_rate": 8.959612741175278e-05, + "loss": 1.776, + "step": 3101 + }, + { + "epoch": 0.23350081860780217, + "grad_norm": 4.101693153381348, + "learning_rate": 8.958868221402581e-05, + "loss": 2.4927, + "step": 3102 + }, + { + "epoch": 0.23357609288846234, + "grad_norm": 4.088027000427246, + "learning_rate": 8.958123466286528e-05, + "loss": 2.0329, + "step": 3103 + }, + { + "epoch": 0.2336513671691225, + "grad_norm": 4.115071773529053, + "learning_rate": 8.95737847587139e-05, + "loss": 1.8868, + "step": 3104 + }, + { + "epoch": 0.23372664144978264, + "grad_norm": 3.3214151859283447, + "learning_rate": 8.956633250201457e-05, + "loss": 2.2385, + "step": 3105 + }, + { + "epoch": 0.2338019157304428, + "grad_norm": 4.605666637420654, + "learning_rate": 8.955887789321031e-05, + "loss": 1.8063, + "step": 3106 + }, + { + "epoch": 0.23387719001110296, + "grad_norm": 4.58540678024292, + "learning_rate": 8.955142093274426e-05, + "loss": 2.394, + "step": 3107 + }, + { + "epoch": 0.2339524642917631, + "grad_norm": 3.4408278465270996, + "learning_rate": 8.954396162105974e-05, + "loss": 1.9515, + "step": 3108 + }, + { + "epoch": 0.23402773857242326, + "grad_norm": 6.240958213806152, + "learning_rate": 8.953649995860015e-05, + "loss": 2.2987, + "step": 3109 + }, + { + "epoch": 0.23410301285308344, + "grad_norm": 4.513462066650391, + "learning_rate": 8.952903594580908e-05, + "loss": 2.0325, + "step": 3110 + }, + { + "epoch": 0.23417828713374358, + "grad_norm": 4.148117542266846, + "learning_rate": 8.952156958313025e-05, + "loss": 2.3954, + "step": 3111 + }, + { + "epoch": 0.23425356141440373, + "grad_norm": 4.6144609451293945, + "learning_rate": 8.95141008710075e-05, + "loss": 2.0167, + "step": 3112 + }, + { + "epoch": 0.23432883569506388, + "grad_norm": 3.2475714683532715, + "learning_rate": 8.950662980988483e-05, + "loss": 1.8625, + "step": 3113 + }, + { + "epoch": 0.23440410997572406, + "grad_norm": 3.3102614879608154, + "learning_rate": 8.949915640020639e-05, + "loss": 1.84, + "step": 3114 + }, + { + "epoch": 0.2344793842563842, + "grad_norm": 4.453986644744873, + "learning_rate": 8.949168064241643e-05, + "loss": 2.0672, + "step": 3115 + }, + { + "epoch": 0.23455465853704435, + "grad_norm": 5.168568134307861, + "learning_rate": 8.948420253695937e-05, + "loss": 1.8874, + "step": 3116 + }, + { + "epoch": 0.2346299328177045, + "grad_norm": 5.015261173248291, + "learning_rate": 8.947672208427976e-05, + "loss": 2.2946, + "step": 3117 + }, + { + "epoch": 0.23470520709836468, + "grad_norm": 6.297513961791992, + "learning_rate": 8.94692392848223e-05, + "loss": 1.9489, + "step": 3118 + }, + { + "epoch": 0.23478048137902482, + "grad_norm": 4.652873992919922, + "learning_rate": 8.946175413903181e-05, + "loss": 1.7196, + "step": 3119 + }, + { + "epoch": 0.23485575565968497, + "grad_norm": 5.195075511932373, + "learning_rate": 8.945426664735326e-05, + "loss": 2.1111, + "step": 3120 + }, + { + "epoch": 0.23493102994034512, + "grad_norm": 4.507716178894043, + "learning_rate": 8.944677681023176e-05, + "loss": 2.232, + "step": 3121 + }, + { + "epoch": 0.2350063042210053, + "grad_norm": 5.152048110961914, + "learning_rate": 8.943928462811259e-05, + "loss": 2.3662, + "step": 3122 + }, + { + "epoch": 0.23508157850166544, + "grad_norm": 6.105501174926758, + "learning_rate": 8.943179010144109e-05, + "loss": 2.0616, + "step": 3123 + }, + { + "epoch": 0.2351568527823256, + "grad_norm": 4.871524810791016, + "learning_rate": 8.942429323066282e-05, + "loss": 2.1278, + "step": 3124 + }, + { + "epoch": 0.23523212706298574, + "grad_norm": 6.164617538452148, + "learning_rate": 8.941679401622344e-05, + "loss": 1.8989, + "step": 3125 + }, + { + "epoch": 0.23530740134364592, + "grad_norm": 5.468104839324951, + "learning_rate": 8.940929245856875e-05, + "loss": 2.038, + "step": 3126 + }, + { + "epoch": 0.23538267562430606, + "grad_norm": 4.448368072509766, + "learning_rate": 8.940178855814469e-05, + "loss": 1.8849, + "step": 3127 + }, + { + "epoch": 0.2354579499049662, + "grad_norm": 6.080448150634766, + "learning_rate": 8.939428231539738e-05, + "loss": 1.8613, + "step": 3128 + }, + { + "epoch": 0.2355332241856264, + "grad_norm": 6.833067893981934, + "learning_rate": 8.938677373077302e-05, + "loss": 2.4226, + "step": 3129 + }, + { + "epoch": 0.23560849846628654, + "grad_norm": 9.775557518005371, + "learning_rate": 8.937926280471796e-05, + "loss": 2.1885, + "step": 3130 + }, + { + "epoch": 0.23568377274694668, + "grad_norm": 4.300649166107178, + "learning_rate": 8.937174953767872e-05, + "loss": 1.9099, + "step": 3131 + }, + { + "epoch": 0.23575904702760683, + "grad_norm": 4.808737754821777, + "learning_rate": 8.936423393010194e-05, + "loss": 1.9384, + "step": 3132 + }, + { + "epoch": 0.235834321308267, + "grad_norm": 5.373995304107666, + "learning_rate": 8.935671598243441e-05, + "loss": 2.1004, + "step": 3133 + }, + { + "epoch": 0.23590959558892716, + "grad_norm": 5.3765645027160645, + "learning_rate": 8.934919569512303e-05, + "loss": 2.1998, + "step": 3134 + }, + { + "epoch": 0.2359848698695873, + "grad_norm": 4.715597629547119, + "learning_rate": 8.934167306861488e-05, + "loss": 2.1741, + "step": 3135 + }, + { + "epoch": 0.23606014415024745, + "grad_norm": 5.471828937530518, + "learning_rate": 8.933414810335716e-05, + "loss": 2.1155, + "step": 3136 + }, + { + "epoch": 0.23613541843090763, + "grad_norm": 5.577611923217773, + "learning_rate": 8.932662079979718e-05, + "loss": 2.01, + "step": 3137 + }, + { + "epoch": 0.23621069271156778, + "grad_norm": 4.3506059646606445, + "learning_rate": 8.931909115838244e-05, + "loss": 2.1198, + "step": 3138 + }, + { + "epoch": 0.23628596699222792, + "grad_norm": 3.760789632797241, + "learning_rate": 8.931155917956055e-05, + "loss": 2.2164, + "step": 3139 + }, + { + "epoch": 0.23636124127288807, + "grad_norm": 5.615194320678711, + "learning_rate": 8.930402486377927e-05, + "loss": 2.0517, + "step": 3140 + }, + { + "epoch": 0.23643651555354825, + "grad_norm": 4.123208999633789, + "learning_rate": 8.929648821148648e-05, + "loss": 1.8561, + "step": 3141 + }, + { + "epoch": 0.2365117898342084, + "grad_norm": 4.531689167022705, + "learning_rate": 8.928894922313022e-05, + "loss": 2.0198, + "step": 3142 + }, + { + "epoch": 0.23658706411486854, + "grad_norm": 6.550711154937744, + "learning_rate": 8.928140789915866e-05, + "loss": 2.0334, + "step": 3143 + }, + { + "epoch": 0.23666233839552872, + "grad_norm": 4.847607135772705, + "learning_rate": 8.927386424002011e-05, + "loss": 2.4329, + "step": 3144 + }, + { + "epoch": 0.23673761267618887, + "grad_norm": 4.96705436706543, + "learning_rate": 8.926631824616303e-05, + "loss": 2.1111, + "step": 3145 + }, + { + "epoch": 0.23681288695684902, + "grad_norm": 5.4653120040893555, + "learning_rate": 8.925876991803599e-05, + "loss": 1.7965, + "step": 3146 + }, + { + "epoch": 0.23688816123750916, + "grad_norm": 3.8454763889312744, + "learning_rate": 8.925121925608772e-05, + "loss": 2.2273, + "step": 3147 + }, + { + "epoch": 0.23696343551816934, + "grad_norm": 5.1777663230896, + "learning_rate": 8.924366626076711e-05, + "loss": 2.1696, + "step": 3148 + }, + { + "epoch": 0.2370387097988295, + "grad_norm": 4.664035320281982, + "learning_rate": 8.923611093252313e-05, + "loss": 1.9461, + "step": 3149 + }, + { + "epoch": 0.23711398407948964, + "grad_norm": 4.680776596069336, + "learning_rate": 8.922855327180494e-05, + "loss": 2.1509, + "step": 3150 + }, + { + "epoch": 0.23718925836014979, + "grad_norm": 4.545822620391846, + "learning_rate": 8.922099327906181e-05, + "loss": 2.1153, + "step": 3151 + }, + { + "epoch": 0.23726453264080996, + "grad_norm": 4.652801036834717, + "learning_rate": 8.921343095474317e-05, + "loss": 1.9073, + "step": 3152 + }, + { + "epoch": 0.2373398069214701, + "grad_norm": 3.912487506866455, + "learning_rate": 8.920586629929858e-05, + "loss": 2.0896, + "step": 3153 + }, + { + "epoch": 0.23741508120213026, + "grad_norm": 3.8954620361328125, + "learning_rate": 8.919829931317774e-05, + "loss": 2.1027, + "step": 3154 + }, + { + "epoch": 0.2374903554827904, + "grad_norm": 5.0088982582092285, + "learning_rate": 8.919072999683046e-05, + "loss": 2.1641, + "step": 3155 + }, + { + "epoch": 0.23756562976345058, + "grad_norm": 4.781058311462402, + "learning_rate": 8.918315835070674e-05, + "loss": 2.0671, + "step": 3156 + }, + { + "epoch": 0.23764090404411073, + "grad_norm": 3.9386327266693115, + "learning_rate": 8.917558437525668e-05, + "loss": 2.2063, + "step": 3157 + }, + { + "epoch": 0.23771617832477088, + "grad_norm": 4.667530059814453, + "learning_rate": 8.916800807093056e-05, + "loss": 1.9672, + "step": 3158 + }, + { + "epoch": 0.23779145260543105, + "grad_norm": 3.55936598777771, + "learning_rate": 8.916042943817872e-05, + "loss": 2.3099, + "step": 3159 + }, + { + "epoch": 0.2378667268860912, + "grad_norm": 5.442907333374023, + "learning_rate": 8.915284847745171e-05, + "loss": 2.1016, + "step": 3160 + }, + { + "epoch": 0.23794200116675135, + "grad_norm": 5.0765252113342285, + "learning_rate": 8.914526518920022e-05, + "loss": 2.1358, + "step": 3161 + }, + { + "epoch": 0.2380172754474115, + "grad_norm": 5.0043559074401855, + "learning_rate": 8.913767957387502e-05, + "loss": 1.9611, + "step": 3162 + }, + { + "epoch": 0.23809254972807167, + "grad_norm": 4.878602981567383, + "learning_rate": 8.913009163192707e-05, + "loss": 2.0599, + "step": 3163 + }, + { + "epoch": 0.23816782400873182, + "grad_norm": 5.485286235809326, + "learning_rate": 8.912250136380746e-05, + "loss": 1.6888, + "step": 3164 + }, + { + "epoch": 0.23824309828939197, + "grad_norm": 4.859185695648193, + "learning_rate": 8.91149087699674e-05, + "loss": 1.624, + "step": 3165 + }, + { + "epoch": 0.23831837257005212, + "grad_norm": 4.253639221191406, + "learning_rate": 8.910731385085824e-05, + "loss": 1.9612, + "step": 3166 + }, + { + "epoch": 0.2383936468507123, + "grad_norm": 5.439131259918213, + "learning_rate": 8.909971660693148e-05, + "loss": 1.9577, + "step": 3167 + }, + { + "epoch": 0.23846892113137244, + "grad_norm": 6.699115753173828, + "learning_rate": 8.909211703863876e-05, + "loss": 1.7809, + "step": 3168 + }, + { + "epoch": 0.2385441954120326, + "grad_norm": 5.474216938018799, + "learning_rate": 8.908451514643185e-05, + "loss": 2.4552, + "step": 3169 + }, + { + "epoch": 0.23861946969269274, + "grad_norm": 4.317144870758057, + "learning_rate": 8.907691093076266e-05, + "loss": 1.899, + "step": 3170 + }, + { + "epoch": 0.2386947439733529, + "grad_norm": 5.524565696716309, + "learning_rate": 8.906930439208323e-05, + "loss": 1.7165, + "step": 3171 + }, + { + "epoch": 0.23877001825401306, + "grad_norm": 5.491680145263672, + "learning_rate": 8.906169553084577e-05, + "loss": 2.2379, + "step": 3172 + }, + { + "epoch": 0.2388452925346732, + "grad_norm": 4.068700790405273, + "learning_rate": 8.905408434750259e-05, + "loss": 1.9099, + "step": 3173 + }, + { + "epoch": 0.23892056681533336, + "grad_norm": 5.242240905761719, + "learning_rate": 8.904647084250615e-05, + "loss": 1.7492, + "step": 3174 + }, + { + "epoch": 0.23899584109599353, + "grad_norm": 5.466667652130127, + "learning_rate": 8.903885501630907e-05, + "loss": 2.2747, + "step": 3175 + }, + { + "epoch": 0.23907111537665368, + "grad_norm": 5.807854175567627, + "learning_rate": 8.903123686936406e-05, + "loss": 1.9065, + "step": 3176 + }, + { + "epoch": 0.23914638965731383, + "grad_norm": 5.579805850982666, + "learning_rate": 8.9023616402124e-05, + "loss": 1.7062, + "step": 3177 + }, + { + "epoch": 0.239221663937974, + "grad_norm": 4.477570056915283, + "learning_rate": 8.901599361504194e-05, + "loss": 2.6109, + "step": 3178 + }, + { + "epoch": 0.23929693821863415, + "grad_norm": 3.5368828773498535, + "learning_rate": 8.900836850857099e-05, + "loss": 1.9125, + "step": 3179 + }, + { + "epoch": 0.2393722124992943, + "grad_norm": 5.75269079208374, + "learning_rate": 8.900074108316448e-05, + "loss": 1.7974, + "step": 3180 + }, + { + "epoch": 0.23944748677995445, + "grad_norm": 4.291672706604004, + "learning_rate": 8.899311133927579e-05, + "loss": 2.0582, + "step": 3181 + }, + { + "epoch": 0.23952276106061463, + "grad_norm": 3.6255385875701904, + "learning_rate": 8.898547927735854e-05, + "loss": 2.144, + "step": 3182 + }, + { + "epoch": 0.23959803534127477, + "grad_norm": 6.381038188934326, + "learning_rate": 8.89778448978664e-05, + "loss": 1.8225, + "step": 3183 + }, + { + "epoch": 0.23967330962193492, + "grad_norm": 4.766451358795166, + "learning_rate": 8.897020820125322e-05, + "loss": 2.016, + "step": 3184 + }, + { + "epoch": 0.23974858390259507, + "grad_norm": 5.317236423492432, + "learning_rate": 8.896256918797298e-05, + "loss": 1.9968, + "step": 3185 + }, + { + "epoch": 0.23982385818325525, + "grad_norm": 5.136428356170654, + "learning_rate": 8.89549278584798e-05, + "loss": 2.1573, + "step": 3186 + }, + { + "epoch": 0.2398991324639154, + "grad_norm": 6.299529552459717, + "learning_rate": 8.894728421322793e-05, + "loss": 2.4789, + "step": 3187 + }, + { + "epoch": 0.23997440674457554, + "grad_norm": 6.992971420288086, + "learning_rate": 8.893963825267177e-05, + "loss": 2.433, + "step": 3188 + }, + { + "epoch": 0.2400496810252357, + "grad_norm": 4.698974609375, + "learning_rate": 8.893198997726583e-05, + "loss": 2.0936, + "step": 3189 + }, + { + "epoch": 0.24012495530589587, + "grad_norm": 5.291773319244385, + "learning_rate": 8.892433938746483e-05, + "loss": 1.9175, + "step": 3190 + }, + { + "epoch": 0.240200229586556, + "grad_norm": 7.637009620666504, + "learning_rate": 8.89166864837235e-05, + "loss": 2.3605, + "step": 3191 + }, + { + "epoch": 0.24027550386721616, + "grad_norm": 4.988747596740723, + "learning_rate": 8.890903126649685e-05, + "loss": 1.9909, + "step": 3192 + }, + { + "epoch": 0.24035077814787634, + "grad_norm": 4.667901039123535, + "learning_rate": 8.890137373623992e-05, + "loss": 2.093, + "step": 3193 + }, + { + "epoch": 0.24042605242853649, + "grad_norm": 4.370298862457275, + "learning_rate": 8.889371389340795e-05, + "loss": 1.8265, + "step": 3194 + }, + { + "epoch": 0.24050132670919663, + "grad_norm": 5.158820152282715, + "learning_rate": 8.888605173845628e-05, + "loss": 2.4405, + "step": 3195 + }, + { + "epoch": 0.24057660098985678, + "grad_norm": 4.049282550811768, + "learning_rate": 8.887838727184041e-05, + "loss": 2.0967, + "step": 3196 + }, + { + "epoch": 0.24065187527051696, + "grad_norm": 5.029526710510254, + "learning_rate": 8.887072049401599e-05, + "loss": 2.2693, + "step": 3197 + }, + { + "epoch": 0.2407271495511771, + "grad_norm": 4.659379482269287, + "learning_rate": 8.886305140543876e-05, + "loss": 2.2833, + "step": 3198 + }, + { + "epoch": 0.24080242383183725, + "grad_norm": 7.6814141273498535, + "learning_rate": 8.885538000656462e-05, + "loss": 2.7273, + "step": 3199 + }, + { + "epoch": 0.2408776981124974, + "grad_norm": 4.647086143493652, + "learning_rate": 8.884770629784965e-05, + "loss": 2.2066, + "step": 3200 + }, + { + "epoch": 0.24095297239315758, + "grad_norm": 6.569813251495361, + "learning_rate": 8.884003027974999e-05, + "loss": 2.1592, + "step": 3201 + }, + { + "epoch": 0.24102824667381773, + "grad_norm": 5.138356685638428, + "learning_rate": 8.883235195272196e-05, + "loss": 1.9341, + "step": 3202 + }, + { + "epoch": 0.24110352095447787, + "grad_norm": 4.743891716003418, + "learning_rate": 8.882467131722206e-05, + "loss": 2.1359, + "step": 3203 + }, + { + "epoch": 0.24117879523513802, + "grad_norm": 3.7779014110565186, + "learning_rate": 8.881698837370685e-05, + "loss": 1.9735, + "step": 3204 + }, + { + "epoch": 0.2412540695157982, + "grad_norm": 6.693047046661377, + "learning_rate": 8.880930312263304e-05, + "loss": 2.2381, + "step": 3205 + }, + { + "epoch": 0.24132934379645835, + "grad_norm": 7.864299774169922, + "learning_rate": 8.880161556445751e-05, + "loss": 2.1537, + "step": 3206 + }, + { + "epoch": 0.2414046180771185, + "grad_norm": 4.908127307891846, + "learning_rate": 8.87939256996373e-05, + "loss": 2.1698, + "step": 3207 + }, + { + "epoch": 0.24147989235777867, + "grad_norm": 4.617619037628174, + "learning_rate": 8.87862335286295e-05, + "loss": 2.33, + "step": 3208 + }, + { + "epoch": 0.24155516663843882, + "grad_norm": 5.731082916259766, + "learning_rate": 8.877853905189139e-05, + "loss": 1.8775, + "step": 3209 + }, + { + "epoch": 0.24163044091909897, + "grad_norm": 5.916258811950684, + "learning_rate": 8.877084226988043e-05, + "loss": 2.1458, + "step": 3210 + }, + { + "epoch": 0.2417057151997591, + "grad_norm": 4.0186920166015625, + "learning_rate": 8.876314318305415e-05, + "loss": 1.8395, + "step": 3211 + }, + { + "epoch": 0.2417809894804193, + "grad_norm": 4.932983875274658, + "learning_rate": 8.87554417918702e-05, + "loss": 2.1925, + "step": 3212 + }, + { + "epoch": 0.24185626376107944, + "grad_norm": 4.3356194496154785, + "learning_rate": 8.874773809678644e-05, + "loss": 2.1817, + "step": 3213 + }, + { + "epoch": 0.24193153804173959, + "grad_norm": 5.016956806182861, + "learning_rate": 8.874003209826087e-05, + "loss": 2.0678, + "step": 3214 + }, + { + "epoch": 0.24200681232239973, + "grad_norm": 4.181794166564941, + "learning_rate": 8.873232379675151e-05, + "loss": 1.9835, + "step": 3215 + }, + { + "epoch": 0.2420820866030599, + "grad_norm": 5.459475040435791, + "learning_rate": 8.872461319271665e-05, + "loss": 2.3716, + "step": 3216 + }, + { + "epoch": 0.24215736088372006, + "grad_norm": 5.499965667724609, + "learning_rate": 8.871690028661465e-05, + "loss": 2.4414, + "step": 3217 + }, + { + "epoch": 0.2422326351643802, + "grad_norm": 4.132140159606934, + "learning_rate": 8.8709185078904e-05, + "loss": 2.0363, + "step": 3218 + }, + { + "epoch": 0.24230790944504035, + "grad_norm": 6.4035258293151855, + "learning_rate": 8.870146757004338e-05, + "loss": 1.8633, + "step": 3219 + }, + { + "epoch": 0.24238318372570053, + "grad_norm": 3.8113491535186768, + "learning_rate": 8.869374776049157e-05, + "loss": 2.4163, + "step": 3220 + }, + { + "epoch": 0.24245845800636068, + "grad_norm": 3.9124670028686523, + "learning_rate": 8.868602565070747e-05, + "loss": 1.9247, + "step": 3221 + }, + { + "epoch": 0.24253373228702083, + "grad_norm": 4.697439670562744, + "learning_rate": 8.867830124115016e-05, + "loss": 1.9543, + "step": 3222 + }, + { + "epoch": 0.242609006567681, + "grad_norm": 5.2570271492004395, + "learning_rate": 8.867057453227881e-05, + "loss": 1.7818, + "step": 3223 + }, + { + "epoch": 0.24268428084834115, + "grad_norm": 4.84550142288208, + "learning_rate": 8.866284552455276e-05, + "loss": 2.4094, + "step": 3224 + }, + { + "epoch": 0.2427595551290013, + "grad_norm": 4.9503912925720215, + "learning_rate": 8.865511421843149e-05, + "loss": 2.222, + "step": 3225 + }, + { + "epoch": 0.24283482940966145, + "grad_norm": 5.553617000579834, + "learning_rate": 8.864738061437459e-05, + "loss": 2.0724, + "step": 3226 + }, + { + "epoch": 0.24291010369032162, + "grad_norm": 4.538109302520752, + "learning_rate": 8.863964471284179e-05, + "loss": 2.1492, + "step": 3227 + }, + { + "epoch": 0.24298537797098177, + "grad_norm": 3.849200487136841, + "learning_rate": 8.8631906514293e-05, + "loss": 2.0135, + "step": 3228 + }, + { + "epoch": 0.24306065225164192, + "grad_norm": 5.269128322601318, + "learning_rate": 8.862416601918822e-05, + "loss": 2.5036, + "step": 3229 + }, + { + "epoch": 0.24313592653230207, + "grad_norm": 4.566773891448975, + "learning_rate": 8.861642322798757e-05, + "loss": 2.2918, + "step": 3230 + }, + { + "epoch": 0.24321120081296224, + "grad_norm": 5.218326568603516, + "learning_rate": 8.860867814115138e-05, + "loss": 2.1897, + "step": 3231 + }, + { + "epoch": 0.2432864750936224, + "grad_norm": 5.25179386138916, + "learning_rate": 8.860093075914005e-05, + "loss": 1.9741, + "step": 3232 + }, + { + "epoch": 0.24336174937428254, + "grad_norm": 4.57682991027832, + "learning_rate": 8.859318108241414e-05, + "loss": 2.5103, + "step": 3233 + }, + { + "epoch": 0.24343702365494269, + "grad_norm": 3.9245426654815674, + "learning_rate": 8.858542911143437e-05, + "loss": 2.0015, + "step": 3234 + }, + { + "epoch": 0.24351229793560286, + "grad_norm": 5.288936138153076, + "learning_rate": 8.857767484666153e-05, + "loss": 2.2405, + "step": 3235 + }, + { + "epoch": 0.243587572216263, + "grad_norm": 4.212207317352295, + "learning_rate": 8.856991828855663e-05, + "loss": 1.9943, + "step": 3236 + }, + { + "epoch": 0.24366284649692316, + "grad_norm": 4.822325229644775, + "learning_rate": 8.856215943758076e-05, + "loss": 2.3207, + "step": 3237 + }, + { + "epoch": 0.2437381207775833, + "grad_norm": 4.394848823547363, + "learning_rate": 8.855439829419514e-05, + "loss": 2.0547, + "step": 3238 + }, + { + "epoch": 0.24381339505824348, + "grad_norm": 6.581297397613525, + "learning_rate": 8.854663485886116e-05, + "loss": 1.9425, + "step": 3239 + }, + { + "epoch": 0.24388866933890363, + "grad_norm": 3.85432767868042, + "learning_rate": 8.853886913204037e-05, + "loss": 1.9968, + "step": 3240 + }, + { + "epoch": 0.24396394361956378, + "grad_norm": 5.349886417388916, + "learning_rate": 8.853110111419437e-05, + "loss": 2.0066, + "step": 3241 + }, + { + "epoch": 0.24403921790022395, + "grad_norm": 4.470263481140137, + "learning_rate": 8.852333080578497e-05, + "loss": 1.9458, + "step": 3242 + }, + { + "epoch": 0.2441144921808841, + "grad_norm": 6.385467052459717, + "learning_rate": 8.85155582072741e-05, + "loss": 2.3326, + "step": 3243 + }, + { + "epoch": 0.24418976646154425, + "grad_norm": 6.3552422523498535, + "learning_rate": 8.85077833191238e-05, + "loss": 2.4346, + "step": 3244 + }, + { + "epoch": 0.2442650407422044, + "grad_norm": 5.7520060539245605, + "learning_rate": 8.850000614179627e-05, + "loss": 1.9974, + "step": 3245 + }, + { + "epoch": 0.24434031502286457, + "grad_norm": 5.259435176849365, + "learning_rate": 8.849222667575384e-05, + "loss": 2.2463, + "step": 3246 + }, + { + "epoch": 0.24441558930352472, + "grad_norm": 6.514979839324951, + "learning_rate": 8.8484444921459e-05, + "loss": 2.0718, + "step": 3247 + }, + { + "epoch": 0.24449086358418487, + "grad_norm": 4.0444560050964355, + "learning_rate": 8.847666087937432e-05, + "loss": 2.1929, + "step": 3248 + }, + { + "epoch": 0.24456613786484502, + "grad_norm": 6.167499542236328, + "learning_rate": 8.846887454996254e-05, + "loss": 1.8931, + "step": 3249 + }, + { + "epoch": 0.2446414121455052, + "grad_norm": 5.100831031799316, + "learning_rate": 8.846108593368656e-05, + "loss": 2.031, + "step": 3250 + }, + { + "epoch": 0.24471668642616534, + "grad_norm": 4.207341194152832, + "learning_rate": 8.845329503100939e-05, + "loss": 2.1415, + "step": 3251 + }, + { + "epoch": 0.2447919607068255, + "grad_norm": 5.961008071899414, + "learning_rate": 8.844550184239415e-05, + "loss": 2.0773, + "step": 3252 + }, + { + "epoch": 0.24486723498748564, + "grad_norm": 4.484932899475098, + "learning_rate": 8.843770636830415e-05, + "loss": 1.792, + "step": 3253 + }, + { + "epoch": 0.2449425092681458, + "grad_norm": 6.538995265960693, + "learning_rate": 8.842990860920279e-05, + "loss": 1.9059, + "step": 3254 + }, + { + "epoch": 0.24501778354880596, + "grad_norm": 5.3386149406433105, + "learning_rate": 8.842210856555362e-05, + "loss": 2.0881, + "step": 3255 + }, + { + "epoch": 0.2450930578294661, + "grad_norm": 3.9098312854766846, + "learning_rate": 8.841430623782035e-05, + "loss": 1.9012, + "step": 3256 + }, + { + "epoch": 0.24516833211012629, + "grad_norm": 4.072517395019531, + "learning_rate": 8.84065016264668e-05, + "loss": 2.1567, + "step": 3257 + }, + { + "epoch": 0.24524360639078643, + "grad_norm": 3.917480707168579, + "learning_rate": 8.839869473195693e-05, + "loss": 1.7459, + "step": 3258 + }, + { + "epoch": 0.24531888067144658, + "grad_norm": 4.662839412689209, + "learning_rate": 8.839088555475484e-05, + "loss": 1.9131, + "step": 3259 + }, + { + "epoch": 0.24539415495210673, + "grad_norm": 4.041963577270508, + "learning_rate": 8.838307409532475e-05, + "loss": 2.0014, + "step": 3260 + }, + { + "epoch": 0.2454694292327669, + "grad_norm": 5.345216751098633, + "learning_rate": 8.837526035413104e-05, + "loss": 2.0544, + "step": 3261 + }, + { + "epoch": 0.24554470351342705, + "grad_norm": 4.985344409942627, + "learning_rate": 8.836744433163822e-05, + "loss": 2.9741, + "step": 3262 + }, + { + "epoch": 0.2456199777940872, + "grad_norm": 6.469967365264893, + "learning_rate": 8.835962602831092e-05, + "loss": 2.3676, + "step": 3263 + }, + { + "epoch": 0.24569525207474735, + "grad_norm": 3.80484938621521, + "learning_rate": 8.835180544461391e-05, + "loss": 1.828, + "step": 3264 + }, + { + "epoch": 0.24577052635540753, + "grad_norm": 3.9377801418304443, + "learning_rate": 8.834398258101212e-05, + "loss": 2.4294, + "step": 3265 + }, + { + "epoch": 0.24584580063606767, + "grad_norm": 5.02417516708374, + "learning_rate": 8.833615743797058e-05, + "loss": 2.1795, + "step": 3266 + }, + { + "epoch": 0.24592107491672782, + "grad_norm": 3.6834230422973633, + "learning_rate": 8.832833001595449e-05, + "loss": 1.9889, + "step": 3267 + }, + { + "epoch": 0.24599634919738797, + "grad_norm": 4.532587051391602, + "learning_rate": 8.832050031542915e-05, + "loss": 2.2274, + "step": 3268 + }, + { + "epoch": 0.24607162347804815, + "grad_norm": 3.6254334449768066, + "learning_rate": 8.831266833686001e-05, + "loss": 1.7468, + "step": 3269 + }, + { + "epoch": 0.2461468977587083, + "grad_norm": 5.678330421447754, + "learning_rate": 8.830483408071269e-05, + "loss": 2.2031, + "step": 3270 + }, + { + "epoch": 0.24622217203936844, + "grad_norm": 4.265519142150879, + "learning_rate": 8.829699754745287e-05, + "loss": 2.078, + "step": 3271 + }, + { + "epoch": 0.24629744632002862, + "grad_norm": 7.205012321472168, + "learning_rate": 8.828915873754644e-05, + "loss": 2.0402, + "step": 3272 + }, + { + "epoch": 0.24637272060068877, + "grad_norm": 6.666203498840332, + "learning_rate": 8.82813176514594e-05, + "loss": 2.0742, + "step": 3273 + }, + { + "epoch": 0.24644799488134891, + "grad_norm": 6.356449604034424, + "learning_rate": 8.827347428965785e-05, + "loss": 2.1263, + "step": 3274 + }, + { + "epoch": 0.24652326916200906, + "grad_norm": 4.000171661376953, + "learning_rate": 8.826562865260807e-05, + "loss": 2.2328, + "step": 3275 + }, + { + "epoch": 0.24659854344266924, + "grad_norm": 5.625492572784424, + "learning_rate": 8.825778074077649e-05, + "loss": 2.0938, + "step": 3276 + }, + { + "epoch": 0.24667381772332939, + "grad_norm": 5.216203212738037, + "learning_rate": 8.82499305546296e-05, + "loss": 2.0113, + "step": 3277 + }, + { + "epoch": 0.24674909200398953, + "grad_norm": 5.553715229034424, + "learning_rate": 8.82420780946341e-05, + "loss": 2.0522, + "step": 3278 + }, + { + "epoch": 0.24682436628464968, + "grad_norm": 4.141896724700928, + "learning_rate": 8.823422336125677e-05, + "loss": 2.1696, + "step": 3279 + }, + { + "epoch": 0.24689964056530986, + "grad_norm": 4.694346904754639, + "learning_rate": 8.822636635496457e-05, + "loss": 2.2712, + "step": 3280 + }, + { + "epoch": 0.24697491484597, + "grad_norm": 5.814883708953857, + "learning_rate": 8.821850707622458e-05, + "loss": 2.3358, + "step": 3281 + }, + { + "epoch": 0.24705018912663015, + "grad_norm": 5.676123142242432, + "learning_rate": 8.821064552550399e-05, + "loss": 2.1686, + "step": 3282 + }, + { + "epoch": 0.2471254634072903, + "grad_norm": 4.168556213378906, + "learning_rate": 8.820278170327018e-05, + "loss": 2.0238, + "step": 3283 + }, + { + "epoch": 0.24720073768795048, + "grad_norm": 5.1784257888793945, + "learning_rate": 8.819491560999062e-05, + "loss": 2.2653, + "step": 3284 + }, + { + "epoch": 0.24727601196861063, + "grad_norm": 6.038859844207764, + "learning_rate": 8.818704724613291e-05, + "loss": 2.0224, + "step": 3285 + }, + { + "epoch": 0.24735128624927077, + "grad_norm": 4.4210686683654785, + "learning_rate": 8.817917661216482e-05, + "loss": 2.1923, + "step": 3286 + }, + { + "epoch": 0.24742656052993092, + "grad_norm": 4.8354411125183105, + "learning_rate": 8.817130370855422e-05, + "loss": 2.2924, + "step": 3287 + }, + { + "epoch": 0.2475018348105911, + "grad_norm": 4.830492973327637, + "learning_rate": 8.816342853576913e-05, + "loss": 1.903, + "step": 3288 + }, + { + "epoch": 0.24757710909125125, + "grad_norm": 4.170045375823975, + "learning_rate": 8.815555109427773e-05, + "loss": 1.7754, + "step": 3289 + }, + { + "epoch": 0.2476523833719114, + "grad_norm": 4.566800594329834, + "learning_rate": 8.814767138454832e-05, + "loss": 2.1097, + "step": 3290 + }, + { + "epoch": 0.24772765765257157, + "grad_norm": 4.780071258544922, + "learning_rate": 8.813978940704927e-05, + "loss": 1.6647, + "step": 3291 + }, + { + "epoch": 0.24780293193323172, + "grad_norm": 5.604640483856201, + "learning_rate": 8.813190516224919e-05, + "loss": 1.8964, + "step": 3292 + }, + { + "epoch": 0.24787820621389187, + "grad_norm": 6.145547389984131, + "learning_rate": 8.812401865061675e-05, + "loss": 2.3407, + "step": 3293 + }, + { + "epoch": 0.24795348049455201, + "grad_norm": 6.518482208251953, + "learning_rate": 8.811612987262079e-05, + "loss": 2.0306, + "step": 3294 + }, + { + "epoch": 0.2480287547752122, + "grad_norm": 6.609365940093994, + "learning_rate": 8.810823882873029e-05, + "loss": 2.6311, + "step": 3295 + }, + { + "epoch": 0.24810402905587234, + "grad_norm": 6.722716808319092, + "learning_rate": 8.810034551941432e-05, + "loss": 2.2381, + "step": 3296 + }, + { + "epoch": 0.2481793033365325, + "grad_norm": 4.61924409866333, + "learning_rate": 8.809244994514216e-05, + "loss": 1.8746, + "step": 3297 + }, + { + "epoch": 0.24825457761719263, + "grad_norm": 4.667189121246338, + "learning_rate": 8.808455210638313e-05, + "loss": 2.0654, + "step": 3298 + }, + { + "epoch": 0.2483298518978528, + "grad_norm": 4.460404396057129, + "learning_rate": 8.807665200360675e-05, + "loss": 2.2116, + "step": 3299 + }, + { + "epoch": 0.24840512617851296, + "grad_norm": 3.8346314430236816, + "learning_rate": 8.806874963728267e-05, + "loss": 2.0176, + "step": 3300 + }, + { + "epoch": 0.2484804004591731, + "grad_norm": 4.841716289520264, + "learning_rate": 8.806084500788065e-05, + "loss": 1.6538, + "step": 3301 + }, + { + "epoch": 0.24855567473983325, + "grad_norm": 6.082234859466553, + "learning_rate": 8.80529381158706e-05, + "loss": 2.4185, + "step": 3302 + }, + { + "epoch": 0.24863094902049343, + "grad_norm": 5.5185394287109375, + "learning_rate": 8.80450289617226e-05, + "loss": 2.261, + "step": 3303 + }, + { + "epoch": 0.24870622330115358, + "grad_norm": 5.971257209777832, + "learning_rate": 8.803711754590676e-05, + "loss": 2.0064, + "step": 3304 + }, + { + "epoch": 0.24878149758181373, + "grad_norm": 4.2381744384765625, + "learning_rate": 8.802920386889341e-05, + "loss": 2.1358, + "step": 3305 + }, + { + "epoch": 0.2488567718624739, + "grad_norm": 5.927640914916992, + "learning_rate": 8.802128793115303e-05, + "loss": 2.4818, + "step": 3306 + }, + { + "epoch": 0.24893204614313405, + "grad_norm": 3.640798568725586, + "learning_rate": 8.801336973315619e-05, + "loss": 1.7848, + "step": 3307 + }, + { + "epoch": 0.2490073204237942, + "grad_norm": 3.7607762813568115, + "learning_rate": 8.800544927537356e-05, + "loss": 2.4525, + "step": 3308 + }, + { + "epoch": 0.24908259470445435, + "grad_norm": 4.451687812805176, + "learning_rate": 8.799752655827604e-05, + "loss": 1.9191, + "step": 3309 + }, + { + "epoch": 0.24915786898511452, + "grad_norm": 4.012941360473633, + "learning_rate": 8.798960158233458e-05, + "loss": 2.0683, + "step": 3310 + }, + { + "epoch": 0.24923314326577467, + "grad_norm": 2.915116310119629, + "learning_rate": 8.798167434802033e-05, + "loss": 2.0742, + "step": 3311 + }, + { + "epoch": 0.24930841754643482, + "grad_norm": 3.7754080295562744, + "learning_rate": 8.797374485580451e-05, + "loss": 1.7844, + "step": 3312 + }, + { + "epoch": 0.24938369182709497, + "grad_norm": 3.72605299949646, + "learning_rate": 8.796581310615851e-05, + "loss": 2.0949, + "step": 3313 + }, + { + "epoch": 0.24945896610775514, + "grad_norm": 4.392784118652344, + "learning_rate": 8.795787909955387e-05, + "loss": 2.0711, + "step": 3314 + }, + { + "epoch": 0.2495342403884153, + "grad_norm": 4.005949974060059, + "learning_rate": 8.794994283646223e-05, + "loss": 1.9918, + "step": 3315 + }, + { + "epoch": 0.24960951466907544, + "grad_norm": 5.2048845291137695, + "learning_rate": 8.794200431735537e-05, + "loss": 2.0352, + "step": 3316 + }, + { + "epoch": 0.2496847889497356, + "grad_norm": 3.9257380962371826, + "learning_rate": 8.793406354270523e-05, + "loss": 1.9416, + "step": 3317 + }, + { + "epoch": 0.24976006323039576, + "grad_norm": 6.32564640045166, + "learning_rate": 8.792612051298385e-05, + "loss": 2.3855, + "step": 3318 + }, + { + "epoch": 0.2498353375110559, + "grad_norm": 4.4817891120910645, + "learning_rate": 8.791817522866344e-05, + "loss": 2.1471, + "step": 3319 + }, + { + "epoch": 0.24991061179171606, + "grad_norm": 4.599247455596924, + "learning_rate": 8.79102276902163e-05, + "loss": 2.0043, + "step": 3320 + }, + { + "epoch": 0.24998588607237623, + "grad_norm": 5.344282627105713, + "learning_rate": 8.790227789811489e-05, + "loss": 2.2064, + "step": 3321 + }, + { + "epoch": 0.2500611603530364, + "grad_norm": 4.306174278259277, + "learning_rate": 8.789432585283183e-05, + "loss": 1.9538, + "step": 3322 + }, + { + "epoch": 0.25013643463369656, + "grad_norm": 5.088106632232666, + "learning_rate": 8.788637155483982e-05, + "loss": 1.8962, + "step": 3323 + }, + { + "epoch": 0.2502117089143567, + "grad_norm": 3.599647283554077, + "learning_rate": 8.787841500461173e-05, + "loss": 2.264, + "step": 3324 + }, + { + "epoch": 0.25028698319501685, + "grad_norm": 3.4975473880767822, + "learning_rate": 8.787045620262057e-05, + "loss": 1.9251, + "step": 3325 + }, + { + "epoch": 0.250362257475677, + "grad_norm": 4.483765602111816, + "learning_rate": 8.786249514933944e-05, + "loss": 2.0298, + "step": 3326 + }, + { + "epoch": 0.25043753175633715, + "grad_norm": 5.014871597290039, + "learning_rate": 8.785453184524161e-05, + "loss": 1.9818, + "step": 3327 + }, + { + "epoch": 0.2505128060369973, + "grad_norm": 5.549981117248535, + "learning_rate": 8.784656629080048e-05, + "loss": 1.9732, + "step": 3328 + }, + { + "epoch": 0.25058808031765745, + "grad_norm": 3.8736355304718018, + "learning_rate": 8.783859848648958e-05, + "loss": 1.9212, + "step": 3329 + }, + { + "epoch": 0.2506633545983176, + "grad_norm": 5.429457664489746, + "learning_rate": 8.783062843278257e-05, + "loss": 1.8222, + "step": 3330 + }, + { + "epoch": 0.2507386288789778, + "grad_norm": 5.866755962371826, + "learning_rate": 8.782265613015325e-05, + "loss": 2.2229, + "step": 3331 + }, + { + "epoch": 0.2508139031596379, + "grad_norm": 4.262095928192139, + "learning_rate": 8.781468157907555e-05, + "loss": 2.2646, + "step": 3332 + }, + { + "epoch": 0.2508891774402981, + "grad_norm": 4.359179973602295, + "learning_rate": 8.780670478002353e-05, + "loss": 2.0286, + "step": 3333 + }, + { + "epoch": 0.2509644517209582, + "grad_norm": 4.44980525970459, + "learning_rate": 8.779872573347138e-05, + "loss": 2.1966, + "step": 3334 + }, + { + "epoch": 0.2510397260016184, + "grad_norm": 4.281689167022705, + "learning_rate": 8.779074443989346e-05, + "loss": 2.0202, + "step": 3335 + }, + { + "epoch": 0.25111500028227857, + "grad_norm": 4.681268692016602, + "learning_rate": 8.778276089976421e-05, + "loss": 1.9536, + "step": 3336 + }, + { + "epoch": 0.2511902745629387, + "grad_norm": 3.7675046920776367, + "learning_rate": 8.777477511355822e-05, + "loss": 1.8321, + "step": 3337 + }, + { + "epoch": 0.25126554884359886, + "grad_norm": 4.948992729187012, + "learning_rate": 8.776678708175025e-05, + "loss": 1.9032, + "step": 3338 + }, + { + "epoch": 0.25134082312425904, + "grad_norm": 5.282908916473389, + "learning_rate": 8.775879680481516e-05, + "loss": 1.8272, + "step": 3339 + }, + { + "epoch": 0.25141609740491916, + "grad_norm": 5.23888635635376, + "learning_rate": 8.775080428322794e-05, + "loss": 2.1043, + "step": 3340 + }, + { + "epoch": 0.25149137168557933, + "grad_norm": 3.9256439208984375, + "learning_rate": 8.774280951746372e-05, + "loss": 2.1799, + "step": 3341 + }, + { + "epoch": 0.2515666459662395, + "grad_norm": 3.6742818355560303, + "learning_rate": 8.773481250799777e-05, + "loss": 2.0432, + "step": 3342 + }, + { + "epoch": 0.25164192024689963, + "grad_norm": 4.291045188903809, + "learning_rate": 8.77268132553055e-05, + "loss": 2.2712, + "step": 3343 + }, + { + "epoch": 0.2517171945275598, + "grad_norm": 4.5728678703308105, + "learning_rate": 8.77188117598624e-05, + "loss": 2.0552, + "step": 3344 + }, + { + "epoch": 0.2517924688082199, + "grad_norm": 3.9421279430389404, + "learning_rate": 8.77108080221442e-05, + "loss": 2.1023, + "step": 3345 + }, + { + "epoch": 0.2518677430888801, + "grad_norm": 5.261859893798828, + "learning_rate": 8.770280204262666e-05, + "loss": 2.2573, + "step": 3346 + }, + { + "epoch": 0.2519430173695403, + "grad_norm": 4.453205108642578, + "learning_rate": 8.769479382178572e-05, + "loss": 1.7899, + "step": 3347 + }, + { + "epoch": 0.2520182916502004, + "grad_norm": 4.459711074829102, + "learning_rate": 8.768678336009745e-05, + "loss": 1.8279, + "step": 3348 + }, + { + "epoch": 0.2520935659308606, + "grad_norm": 3.706254482269287, + "learning_rate": 8.767877065803803e-05, + "loss": 1.9549, + "step": 3349 + }, + { + "epoch": 0.25216884021152075, + "grad_norm": 3.9402592182159424, + "learning_rate": 8.767075571608383e-05, + "loss": 1.971, + "step": 3350 + }, + { + "epoch": 0.25224411449218087, + "grad_norm": 4.983983516693115, + "learning_rate": 8.766273853471128e-05, + "loss": 1.9869, + "step": 3351 + }, + { + "epoch": 0.25231938877284105, + "grad_norm": 6.2729315757751465, + "learning_rate": 8.765471911439697e-05, + "loss": 2.2604, + "step": 3352 + }, + { + "epoch": 0.2523946630535012, + "grad_norm": 4.499894618988037, + "learning_rate": 8.764669745561768e-05, + "loss": 1.7951, + "step": 3353 + }, + { + "epoch": 0.25246993733416134, + "grad_norm": 3.6646013259887695, + "learning_rate": 8.763867355885023e-05, + "loss": 1.9109, + "step": 3354 + }, + { + "epoch": 0.2525452116148215, + "grad_norm": 3.9780445098876953, + "learning_rate": 8.763064742457165e-05, + "loss": 2.2108, + "step": 3355 + }, + { + "epoch": 0.25262048589548164, + "grad_norm": 5.8783721923828125, + "learning_rate": 8.762261905325905e-05, + "loss": 2.2732, + "step": 3356 + }, + { + "epoch": 0.2526957601761418, + "grad_norm": 5.000054836273193, + "learning_rate": 8.76145884453897e-05, + "loss": 2.1449, + "step": 3357 + }, + { + "epoch": 0.252771034456802, + "grad_norm": 6.021858215332031, + "learning_rate": 8.760655560144098e-05, + "loss": 2.1501, + "step": 3358 + }, + { + "epoch": 0.2528463087374621, + "grad_norm": 5.94951868057251, + "learning_rate": 8.759852052189045e-05, + "loss": 2.8932, + "step": 3359 + }, + { + "epoch": 0.2529215830181223, + "grad_norm": 4.1834845542907715, + "learning_rate": 8.759048320721576e-05, + "loss": 2.1667, + "step": 3360 + }, + { + "epoch": 0.25299685729878246, + "grad_norm": 6.927559852600098, + "learning_rate": 8.75824436578947e-05, + "loss": 2.1306, + "step": 3361 + }, + { + "epoch": 0.2530721315794426, + "grad_norm": 3.9181342124938965, + "learning_rate": 8.75744018744052e-05, + "loss": 2.539, + "step": 3362 + }, + { + "epoch": 0.25314740586010276, + "grad_norm": 4.221496105194092, + "learning_rate": 8.75663578572253e-05, + "loss": 1.9139, + "step": 3363 + }, + { + "epoch": 0.2532226801407629, + "grad_norm": 6.087464332580566, + "learning_rate": 8.755831160683325e-05, + "loss": 2.0362, + "step": 3364 + }, + { + "epoch": 0.25329795442142305, + "grad_norm": 4.116762638092041, + "learning_rate": 8.755026312370733e-05, + "loss": 1.8901, + "step": 3365 + }, + { + "epoch": 0.25337322870208323, + "grad_norm": 5.038180351257324, + "learning_rate": 8.754221240832599e-05, + "loss": 2.2415, + "step": 3366 + }, + { + "epoch": 0.25344850298274335, + "grad_norm": 4.341926097869873, + "learning_rate": 8.753415946116787e-05, + "loss": 2.4661, + "step": 3367 + }, + { + "epoch": 0.2535237772634035, + "grad_norm": 4.785672187805176, + "learning_rate": 8.752610428271165e-05, + "loss": 1.9654, + "step": 3368 + }, + { + "epoch": 0.2535990515440637, + "grad_norm": 5.808557510375977, + "learning_rate": 8.751804687343621e-05, + "loss": 2.1148, + "step": 3369 + }, + { + "epoch": 0.2536743258247238, + "grad_norm": 5.877425670623779, + "learning_rate": 8.750998723382055e-05, + "loss": 2.0915, + "step": 3370 + }, + { + "epoch": 0.253749600105384, + "grad_norm": 4.9935994148254395, + "learning_rate": 8.750192536434376e-05, + "loss": 2.1862, + "step": 3371 + }, + { + "epoch": 0.2538248743860442, + "grad_norm": 9.6919527053833, + "learning_rate": 8.749386126548512e-05, + "loss": 2.2075, + "step": 3372 + }, + { + "epoch": 0.2539001486667043, + "grad_norm": 4.741062164306641, + "learning_rate": 8.7485794937724e-05, + "loss": 1.7482, + "step": 3373 + }, + { + "epoch": 0.25397542294736447, + "grad_norm": 5.2496442794799805, + "learning_rate": 8.747772638153992e-05, + "loss": 1.8747, + "step": 3374 + }, + { + "epoch": 0.2540506972280246, + "grad_norm": 4.504026889801025, + "learning_rate": 8.746965559741257e-05, + "loss": 1.9155, + "step": 3375 + }, + { + "epoch": 0.25412597150868477, + "grad_norm": 4.862549304962158, + "learning_rate": 8.74615825858217e-05, + "loss": 1.9825, + "step": 3376 + }, + { + "epoch": 0.25420124578934494, + "grad_norm": 5.344687461853027, + "learning_rate": 8.745350734724723e-05, + "loss": 1.8682, + "step": 3377 + }, + { + "epoch": 0.25427652007000506, + "grad_norm": 5.085996627807617, + "learning_rate": 8.744542988216922e-05, + "loss": 2.223, + "step": 3378 + }, + { + "epoch": 0.25435179435066524, + "grad_norm": 4.684676170349121, + "learning_rate": 8.743735019106782e-05, + "loss": 2.1211, + "step": 3379 + }, + { + "epoch": 0.2544270686313254, + "grad_norm": 4.374932289123535, + "learning_rate": 8.74292682744234e-05, + "loss": 2.1404, + "step": 3380 + }, + { + "epoch": 0.25450234291198554, + "grad_norm": 6.876060962677002, + "learning_rate": 8.742118413271637e-05, + "loss": 2.1856, + "step": 3381 + }, + { + "epoch": 0.2545776171926457, + "grad_norm": 5.487707614898682, + "learning_rate": 8.741309776642732e-05, + "loss": 1.9215, + "step": 3382 + }, + { + "epoch": 0.25465289147330583, + "grad_norm": 5.342986106872559, + "learning_rate": 8.740500917603696e-05, + "loss": 2.2173, + "step": 3383 + }, + { + "epoch": 0.254728165753966, + "grad_norm": 5.712308883666992, + "learning_rate": 8.739691836202613e-05, + "loss": 2.436, + "step": 3384 + }, + { + "epoch": 0.2548034400346262, + "grad_norm": 4.376852035522461, + "learning_rate": 8.73888253248758e-05, + "loss": 2.094, + "step": 3385 + }, + { + "epoch": 0.2548787143152863, + "grad_norm": 4.699362754821777, + "learning_rate": 8.738073006506708e-05, + "loss": 1.9965, + "step": 3386 + }, + { + "epoch": 0.2549539885959465, + "grad_norm": 5.363994121551514, + "learning_rate": 8.737263258308122e-05, + "loss": 1.822, + "step": 3387 + }, + { + "epoch": 0.25502926287660665, + "grad_norm": 5.242997169494629, + "learning_rate": 8.736453287939958e-05, + "loss": 2.462, + "step": 3388 + }, + { + "epoch": 0.2551045371572668, + "grad_norm": 7.264503479003906, + "learning_rate": 8.73564309545037e-05, + "loss": 2.1427, + "step": 3389 + }, + { + "epoch": 0.25517981143792695, + "grad_norm": 5.701240062713623, + "learning_rate": 8.734832680887517e-05, + "loss": 2.3662, + "step": 3390 + }, + { + "epoch": 0.2552550857185871, + "grad_norm": 6.550506591796875, + "learning_rate": 8.734022044299577e-05, + "loss": 2.2641, + "step": 3391 + }, + { + "epoch": 0.25533035999924725, + "grad_norm": 4.799621105194092, + "learning_rate": 8.73321118573474e-05, + "loss": 1.7875, + "step": 3392 + }, + { + "epoch": 0.2554056342799074, + "grad_norm": 6.353862762451172, + "learning_rate": 8.732400105241211e-05, + "loss": 2.4473, + "step": 3393 + }, + { + "epoch": 0.25548090856056754, + "grad_norm": 6.884500026702881, + "learning_rate": 8.731588802867205e-05, + "loss": 2.0167, + "step": 3394 + }, + { + "epoch": 0.2555561828412277, + "grad_norm": 6.473781585693359, + "learning_rate": 8.730777278660951e-05, + "loss": 2.4102, + "step": 3395 + }, + { + "epoch": 0.2556314571218879, + "grad_norm": 9.139266967773438, + "learning_rate": 8.729965532670694e-05, + "loss": 2.3445, + "step": 3396 + }, + { + "epoch": 0.255706731402548, + "grad_norm": 4.931876182556152, + "learning_rate": 8.729153564944688e-05, + "loss": 2.0216, + "step": 3397 + }, + { + "epoch": 0.2557820056832082, + "grad_norm": 3.9296631813049316, + "learning_rate": 8.728341375531203e-05, + "loss": 1.9161, + "step": 3398 + }, + { + "epoch": 0.25585727996386837, + "grad_norm": 4.26688289642334, + "learning_rate": 8.72752896447852e-05, + "loss": 2.0306, + "step": 3399 + }, + { + "epoch": 0.2559325542445285, + "grad_norm": 4.134580135345459, + "learning_rate": 8.726716331834937e-05, + "loss": 2.3585, + "step": 3400 + }, + { + "epoch": 0.25600782852518866, + "grad_norm": 5.008721351623535, + "learning_rate": 8.72590347764876e-05, + "loss": 2.2548, + "step": 3401 + }, + { + "epoch": 0.25608310280584884, + "grad_norm": 4.195300102233887, + "learning_rate": 8.725090401968312e-05, + "loss": 1.9368, + "step": 3402 + }, + { + "epoch": 0.25615837708650896, + "grad_norm": 5.666923999786377, + "learning_rate": 8.724277104841929e-05, + "loss": 2.0947, + "step": 3403 + }, + { + "epoch": 0.25623365136716914, + "grad_norm": 4.095748424530029, + "learning_rate": 8.723463586317957e-05, + "loss": 2.4304, + "step": 3404 + }, + { + "epoch": 0.25630892564782926, + "grad_norm": 4.958882808685303, + "learning_rate": 8.72264984644476e-05, + "loss": 2.4885, + "step": 3405 + }, + { + "epoch": 0.25638419992848943, + "grad_norm": 4.125391483306885, + "learning_rate": 8.72183588527071e-05, + "loss": 2.1755, + "step": 3406 + }, + { + "epoch": 0.2564594742091496, + "grad_norm": 5.554819107055664, + "learning_rate": 8.721021702844197e-05, + "loss": 2.2817, + "step": 3407 + }, + { + "epoch": 0.2565347484898097, + "grad_norm": 3.444105386734009, + "learning_rate": 8.72020729921362e-05, + "loss": 2.3285, + "step": 3408 + }, + { + "epoch": 0.2566100227704699, + "grad_norm": 4.626033782958984, + "learning_rate": 8.719392674427394e-05, + "loss": 2.104, + "step": 3409 + }, + { + "epoch": 0.2566852970511301, + "grad_norm": 3.7324564456939697, + "learning_rate": 8.718577828533944e-05, + "loss": 1.9851, + "step": 3410 + }, + { + "epoch": 0.2567605713317902, + "grad_norm": 4.411436557769775, + "learning_rate": 8.717762761581714e-05, + "loss": 2.301, + "step": 3411 + }, + { + "epoch": 0.2568358456124504, + "grad_norm": 3.9267332553863525, + "learning_rate": 8.716947473619154e-05, + "loss": 2.0016, + "step": 3412 + }, + { + "epoch": 0.2569111198931105, + "grad_norm": 4.398698329925537, + "learning_rate": 8.716131964694731e-05, + "loss": 2.0531, + "step": 3413 + }, + { + "epoch": 0.25698639417377067, + "grad_norm": 4.13516092300415, + "learning_rate": 8.715316234856928e-05, + "loss": 1.8714, + "step": 3414 + }, + { + "epoch": 0.25706166845443085, + "grad_norm": 4.921574115753174, + "learning_rate": 8.714500284154232e-05, + "loss": 1.9893, + "step": 3415 + }, + { + "epoch": 0.25713694273509097, + "grad_norm": 4.164416790008545, + "learning_rate": 8.713684112635154e-05, + "loss": 2.0498, + "step": 3416 + }, + { + "epoch": 0.25721221701575114, + "grad_norm": 4.605637073516846, + "learning_rate": 8.712867720348212e-05, + "loss": 1.9556, + "step": 3417 + }, + { + "epoch": 0.2572874912964113, + "grad_norm": 3.7386839389801025, + "learning_rate": 8.712051107341936e-05, + "loss": 2.0976, + "step": 3418 + }, + { + "epoch": 0.25736276557707144, + "grad_norm": 5.0655035972595215, + "learning_rate": 8.711234273664874e-05, + "loss": 1.9368, + "step": 3419 + }, + { + "epoch": 0.2574380398577316, + "grad_norm": 5.053414344787598, + "learning_rate": 8.710417219365583e-05, + "loss": 1.9514, + "step": 3420 + }, + { + "epoch": 0.2575133141383918, + "grad_norm": 4.10591983795166, + "learning_rate": 8.709599944492635e-05, + "loss": 2.2746, + "step": 3421 + }, + { + "epoch": 0.2575885884190519, + "grad_norm": 4.1164960861206055, + "learning_rate": 8.708782449094613e-05, + "loss": 2.0249, + "step": 3422 + }, + { + "epoch": 0.2576638626997121, + "grad_norm": 5.9446492195129395, + "learning_rate": 8.707964733220116e-05, + "loss": 1.8944, + "step": 3423 + }, + { + "epoch": 0.2577391369803722, + "grad_norm": 4.235020637512207, + "learning_rate": 8.707146796917756e-05, + "loss": 2.0211, + "step": 3424 + }, + { + "epoch": 0.2578144112610324, + "grad_norm": 4.341794013977051, + "learning_rate": 8.706328640236156e-05, + "loss": 2.1195, + "step": 3425 + }, + { + "epoch": 0.25788968554169256, + "grad_norm": 4.268988609313965, + "learning_rate": 8.705510263223953e-05, + "loss": 2.0087, + "step": 3426 + }, + { + "epoch": 0.2579649598223527, + "grad_norm": 4.792752742767334, + "learning_rate": 8.704691665929797e-05, + "loss": 2.2867, + "step": 3427 + }, + { + "epoch": 0.25804023410301286, + "grad_norm": 5.365433216094971, + "learning_rate": 8.70387284840235e-05, + "loss": 1.9162, + "step": 3428 + }, + { + "epoch": 0.25811550838367303, + "grad_norm": 4.760412693023682, + "learning_rate": 8.703053810690292e-05, + "loss": 2.1497, + "step": 3429 + }, + { + "epoch": 0.25819078266433315, + "grad_norm": 3.640106201171875, + "learning_rate": 8.702234552842307e-05, + "loss": 1.9194, + "step": 3430 + }, + { + "epoch": 0.2582660569449933, + "grad_norm": 6.287134170532227, + "learning_rate": 8.701415074907104e-05, + "loss": 1.9811, + "step": 3431 + }, + { + "epoch": 0.25834133122565345, + "grad_norm": 4.32661247253418, + "learning_rate": 8.700595376933394e-05, + "loss": 2.1024, + "step": 3432 + }, + { + "epoch": 0.2584166055063136, + "grad_norm": 5.4799933433532715, + "learning_rate": 8.699775458969908e-05, + "loss": 2.1458, + "step": 3433 + }, + { + "epoch": 0.2584918797869738, + "grad_norm": 5.0961809158325195, + "learning_rate": 8.698955321065386e-05, + "loss": 2.0579, + "step": 3434 + }, + { + "epoch": 0.2585671540676339, + "grad_norm": 5.822303295135498, + "learning_rate": 8.698134963268583e-05, + "loss": 2.2369, + "step": 3435 + }, + { + "epoch": 0.2586424283482941, + "grad_norm": 4.892825126647949, + "learning_rate": 8.697314385628268e-05, + "loss": 1.9262, + "step": 3436 + }, + { + "epoch": 0.25871770262895427, + "grad_norm": 5.237296104431152, + "learning_rate": 8.696493588193222e-05, + "loss": 2.1281, + "step": 3437 + }, + { + "epoch": 0.2587929769096144, + "grad_norm": 4.93479061126709, + "learning_rate": 8.695672571012239e-05, + "loss": 1.9472, + "step": 3438 + }, + { + "epoch": 0.25886825119027457, + "grad_norm": 5.76990270614624, + "learning_rate": 8.694851334134125e-05, + "loss": 1.9941, + "step": 3439 + }, + { + "epoch": 0.25894352547093474, + "grad_norm": 4.854992389678955, + "learning_rate": 8.694029877607702e-05, + "loss": 2.1369, + "step": 3440 + }, + { + "epoch": 0.25901879975159486, + "grad_norm": 3.966226816177368, + "learning_rate": 8.693208201481799e-05, + "loss": 2.0117, + "step": 3441 + }, + { + "epoch": 0.25909407403225504, + "grad_norm": 6.378432273864746, + "learning_rate": 8.692386305805269e-05, + "loss": 2.1988, + "step": 3442 + }, + { + "epoch": 0.25916934831291516, + "grad_norm": 4.8376898765563965, + "learning_rate": 8.691564190626967e-05, + "loss": 1.618, + "step": 3443 + }, + { + "epoch": 0.25924462259357534, + "grad_norm": 4.570425987243652, + "learning_rate": 8.690741855995765e-05, + "loss": 2.0759, + "step": 3444 + }, + { + "epoch": 0.2593198968742355, + "grad_norm": 4.492746353149414, + "learning_rate": 8.689919301960549e-05, + "loss": 1.854, + "step": 3445 + }, + { + "epoch": 0.25939517115489563, + "grad_norm": 5.882372856140137, + "learning_rate": 8.68909652857022e-05, + "loss": 2.1512, + "step": 3446 + }, + { + "epoch": 0.2594704454355558, + "grad_norm": 4.4794793128967285, + "learning_rate": 8.688273535873686e-05, + "loss": 1.6465, + "step": 3447 + }, + { + "epoch": 0.259545719716216, + "grad_norm": 3.931464195251465, + "learning_rate": 8.687450323919875e-05, + "loss": 1.864, + "step": 3448 + }, + { + "epoch": 0.2596209939968761, + "grad_norm": 4.325509071350098, + "learning_rate": 8.68662689275772e-05, + "loss": 1.903, + "step": 3449 + }, + { + "epoch": 0.2596962682775363, + "grad_norm": 4.37838077545166, + "learning_rate": 8.685803242436177e-05, + "loss": 1.6749, + "step": 3450 + }, + { + "epoch": 0.25977154255819646, + "grad_norm": 4.602680206298828, + "learning_rate": 8.684979373004205e-05, + "loss": 2.022, + "step": 3451 + }, + { + "epoch": 0.2598468168388566, + "grad_norm": 3.8039469718933105, + "learning_rate": 8.684155284510785e-05, + "loss": 2.1983, + "step": 3452 + }, + { + "epoch": 0.25992209111951675, + "grad_norm": 7.304414749145508, + "learning_rate": 8.683330977004902e-05, + "loss": 2.0093, + "step": 3453 + }, + { + "epoch": 0.25999736540017687, + "grad_norm": 6.7230329513549805, + "learning_rate": 8.682506450535563e-05, + "loss": 2.6703, + "step": 3454 + }, + { + "epoch": 0.26007263968083705, + "grad_norm": 10.431265830993652, + "learning_rate": 8.681681705151781e-05, + "loss": 2.2693, + "step": 3455 + }, + { + "epoch": 0.2601479139614972, + "grad_norm": 5.38029146194458, + "learning_rate": 8.680856740902585e-05, + "loss": 2.18, + "step": 3456 + }, + { + "epoch": 0.26022318824215734, + "grad_norm": 3.5006840229034424, + "learning_rate": 8.680031557837018e-05, + "loss": 1.7489, + "step": 3457 + }, + { + "epoch": 0.2602984625228175, + "grad_norm": 4.128906726837158, + "learning_rate": 8.679206156004134e-05, + "loss": 1.8874, + "step": 3458 + }, + { + "epoch": 0.2603737368034777, + "grad_norm": 5.0077314376831055, + "learning_rate": 8.678380535453e-05, + "loss": 2.1523, + "step": 3459 + }, + { + "epoch": 0.2604490110841378, + "grad_norm": 5.8345818519592285, + "learning_rate": 8.677554696232699e-05, + "loss": 2.1446, + "step": 3460 + }, + { + "epoch": 0.260524285364798, + "grad_norm": 5.284924507141113, + "learning_rate": 8.676728638392321e-05, + "loss": 1.9388, + "step": 3461 + }, + { + "epoch": 0.2605995596454581, + "grad_norm": 5.802793502807617, + "learning_rate": 8.675902361980979e-05, + "loss": 2.5715, + "step": 3462 + }, + { + "epoch": 0.2606748339261183, + "grad_norm": 4.506278038024902, + "learning_rate": 8.675075867047786e-05, + "loss": 2.046, + "step": 3463 + }, + { + "epoch": 0.26075010820677846, + "grad_norm": 6.851126670837402, + "learning_rate": 8.674249153641878e-05, + "loss": 2.4547, + "step": 3464 + }, + { + "epoch": 0.2608253824874386, + "grad_norm": 4.001614093780518, + "learning_rate": 8.673422221812402e-05, + "loss": 1.8197, + "step": 3465 + }, + { + "epoch": 0.26090065676809876, + "grad_norm": 5.8435797691345215, + "learning_rate": 8.672595071608513e-05, + "loss": 2.684, + "step": 3466 + }, + { + "epoch": 0.26097593104875894, + "grad_norm": 5.1660943031311035, + "learning_rate": 8.671767703079387e-05, + "loss": 1.7656, + "step": 3467 + }, + { + "epoch": 0.26105120532941906, + "grad_norm": 4.083738803863525, + "learning_rate": 8.670940116274205e-05, + "loss": 1.8932, + "step": 3468 + }, + { + "epoch": 0.26112647961007923, + "grad_norm": 4.045709133148193, + "learning_rate": 8.670112311242166e-05, + "loss": 2.1297, + "step": 3469 + }, + { + "epoch": 0.2612017538907394, + "grad_norm": 3.737746000289917, + "learning_rate": 8.669284288032482e-05, + "loss": 2.1864, + "step": 3470 + }, + { + "epoch": 0.26127702817139953, + "grad_norm": 4.691811561584473, + "learning_rate": 8.668456046694376e-05, + "loss": 2.2395, + "step": 3471 + }, + { + "epoch": 0.2613523024520597, + "grad_norm": 4.320754051208496, + "learning_rate": 8.667627587277082e-05, + "loss": 1.904, + "step": 3472 + }, + { + "epoch": 0.2614275767327198, + "grad_norm": 6.436110019683838, + "learning_rate": 8.666798909829852e-05, + "loss": 2.1441, + "step": 3473 + }, + { + "epoch": 0.26150285101338, + "grad_norm": 4.605068683624268, + "learning_rate": 8.66597001440195e-05, + "loss": 2.2468, + "step": 3474 + }, + { + "epoch": 0.2615781252940402, + "grad_norm": 4.703996658325195, + "learning_rate": 8.665140901042647e-05, + "loss": 1.8968, + "step": 3475 + }, + { + "epoch": 0.2616533995747003, + "grad_norm": 4.316329002380371, + "learning_rate": 8.664311569801235e-05, + "loss": 1.7415, + "step": 3476 + }, + { + "epoch": 0.26172867385536047, + "grad_norm": 4.848236560821533, + "learning_rate": 8.663482020727014e-05, + "loss": 1.9466, + "step": 3477 + }, + { + "epoch": 0.26180394813602065, + "grad_norm": 3.2008492946624756, + "learning_rate": 8.662652253869299e-05, + "loss": 2.1173, + "step": 3478 + }, + { + "epoch": 0.26187922241668077, + "grad_norm": 5.0430073738098145, + "learning_rate": 8.661822269277416e-05, + "loss": 2.2131, + "step": 3479 + }, + { + "epoch": 0.26195449669734094, + "grad_norm": 5.203665733337402, + "learning_rate": 8.660992067000707e-05, + "loss": 1.938, + "step": 3480 + }, + { + "epoch": 0.26202977097800106, + "grad_norm": 6.658170700073242, + "learning_rate": 8.660161647088524e-05, + "loss": 2.6746, + "step": 3481 + }, + { + "epoch": 0.26210504525866124, + "grad_norm": 5.447011470794678, + "learning_rate": 8.659331009590233e-05, + "loss": 2.0946, + "step": 3482 + }, + { + "epoch": 0.2621803195393214, + "grad_norm": 4.5602641105651855, + "learning_rate": 8.658500154555214e-05, + "loss": 1.882, + "step": 3483 + }, + { + "epoch": 0.26225559381998154, + "grad_norm": 6.496964454650879, + "learning_rate": 8.657669082032856e-05, + "loss": 2.3845, + "step": 3484 + }, + { + "epoch": 0.2623308681006417, + "grad_norm": 4.06599235534668, + "learning_rate": 8.656837792072567e-05, + "loss": 2.2797, + "step": 3485 + }, + { + "epoch": 0.2624061423813019, + "grad_norm": 4.25238561630249, + "learning_rate": 8.656006284723765e-05, + "loss": 2.2503, + "step": 3486 + }, + { + "epoch": 0.262481416661962, + "grad_norm": 3.3910272121429443, + "learning_rate": 8.655174560035879e-05, + "loss": 1.9028, + "step": 3487 + }, + { + "epoch": 0.2625566909426222, + "grad_norm": 5.794406414031982, + "learning_rate": 8.654342618058354e-05, + "loss": 2.0506, + "step": 3488 + }, + { + "epoch": 0.26263196522328236, + "grad_norm": 4.569392681121826, + "learning_rate": 8.653510458840645e-05, + "loss": 2.0771, + "step": 3489 + }, + { + "epoch": 0.2627072395039425, + "grad_norm": 4.954533576965332, + "learning_rate": 8.652678082432224e-05, + "loss": 2.2721, + "step": 3490 + }, + { + "epoch": 0.26278251378460266, + "grad_norm": 3.7053730487823486, + "learning_rate": 8.651845488882569e-05, + "loss": 2.2615, + "step": 3491 + }, + { + "epoch": 0.2628577880652628, + "grad_norm": 4.0285325050354, + "learning_rate": 8.65101267824118e-05, + "loss": 2.0601, + "step": 3492 + }, + { + "epoch": 0.26293306234592295, + "grad_norm": 4.186265468597412, + "learning_rate": 8.650179650557561e-05, + "loss": 1.749, + "step": 3493 + }, + { + "epoch": 0.26300833662658313, + "grad_norm": 6.563839912414551, + "learning_rate": 8.649346405881237e-05, + "loss": 2.0697, + "step": 3494 + }, + { + "epoch": 0.26308361090724325, + "grad_norm": 4.802326202392578, + "learning_rate": 8.648512944261741e-05, + "loss": 2.0839, + "step": 3495 + }, + { + "epoch": 0.2631588851879034, + "grad_norm": 5.419128894805908, + "learning_rate": 8.647679265748619e-05, + "loss": 1.7791, + "step": 3496 + }, + { + "epoch": 0.2632341594685636, + "grad_norm": 4.4524431228637695, + "learning_rate": 8.64684537039143e-05, + "loss": 1.9806, + "step": 3497 + }, + { + "epoch": 0.2633094337492237, + "grad_norm": 7.896785259246826, + "learning_rate": 8.646011258239747e-05, + "loss": 1.9088, + "step": 3498 + }, + { + "epoch": 0.2633847080298839, + "grad_norm": 4.490203380584717, + "learning_rate": 8.645176929343159e-05, + "loss": 1.8399, + "step": 3499 + }, + { + "epoch": 0.26345998231054407, + "grad_norm": 6.67177152633667, + "learning_rate": 8.64434238375126e-05, + "loss": 2.4361, + "step": 3500 + }, + { + "epoch": 0.2635352565912042, + "grad_norm": 4.74786376953125, + "learning_rate": 8.643507621513663e-05, + "loss": 1.8581, + "step": 3501 + }, + { + "epoch": 0.26361053087186437, + "grad_norm": 4.057636260986328, + "learning_rate": 8.642672642679991e-05, + "loss": 1.8163, + "step": 3502 + }, + { + "epoch": 0.2636858051525245, + "grad_norm": 6.529108047485352, + "learning_rate": 8.641837447299886e-05, + "loss": 2.1189, + "step": 3503 + }, + { + "epoch": 0.26376107943318466, + "grad_norm": 5.8578619956970215, + "learning_rate": 8.641002035422992e-05, + "loss": 1.8417, + "step": 3504 + }, + { + "epoch": 0.26383635371384484, + "grad_norm": 4.551418304443359, + "learning_rate": 8.640166407098974e-05, + "loss": 2.0364, + "step": 3505 + }, + { + "epoch": 0.26391162799450496, + "grad_norm": 4.891948699951172, + "learning_rate": 8.639330562377508e-05, + "loss": 2.0196, + "step": 3506 + }, + { + "epoch": 0.26398690227516514, + "grad_norm": 4.2468671798706055, + "learning_rate": 8.638494501308283e-05, + "loss": 2.0781, + "step": 3507 + }, + { + "epoch": 0.2640621765558253, + "grad_norm": 4.8525285720825195, + "learning_rate": 8.637658223940999e-05, + "loss": 2.0506, + "step": 3508 + }, + { + "epoch": 0.26413745083648543, + "grad_norm": 6.0103631019592285, + "learning_rate": 8.636821730325373e-05, + "loss": 2.0097, + "step": 3509 + }, + { + "epoch": 0.2642127251171456, + "grad_norm": 4.095498561859131, + "learning_rate": 8.635985020511129e-05, + "loss": 1.8606, + "step": 3510 + }, + { + "epoch": 0.26428799939780573, + "grad_norm": 6.583241939544678, + "learning_rate": 8.635148094548008e-05, + "loss": 2.2384, + "step": 3511 + }, + { + "epoch": 0.2643632736784659, + "grad_norm": 6.626463890075684, + "learning_rate": 8.634310952485764e-05, + "loss": 1.849, + "step": 3512 + }, + { + "epoch": 0.2644385479591261, + "grad_norm": 4.025700092315674, + "learning_rate": 8.633473594374162e-05, + "loss": 2.4964, + "step": 3513 + }, + { + "epoch": 0.2645138222397862, + "grad_norm": 9.353435516357422, + "learning_rate": 8.632636020262981e-05, + "loss": 2.3538, + "step": 3514 + }, + { + "epoch": 0.2645890965204464, + "grad_norm": 5.180330753326416, + "learning_rate": 8.63179823020201e-05, + "loss": 2.0134, + "step": 3515 + }, + { + "epoch": 0.26466437080110655, + "grad_norm": 4.322362422943115, + "learning_rate": 8.630960224241055e-05, + "loss": 2.0556, + "step": 3516 + }, + { + "epoch": 0.2647396450817667, + "grad_norm": 5.330920219421387, + "learning_rate": 8.630122002429934e-05, + "loss": 2.0454, + "step": 3517 + }, + { + "epoch": 0.26481491936242685, + "grad_norm": 4.6195759773254395, + "learning_rate": 8.629283564818478e-05, + "loss": 1.915, + "step": 3518 + }, + { + "epoch": 0.264890193643087, + "grad_norm": 5.498222827911377, + "learning_rate": 8.628444911456524e-05, + "loss": 1.8984, + "step": 3519 + }, + { + "epoch": 0.26496546792374714, + "grad_norm": 5.760746002197266, + "learning_rate": 8.627606042393934e-05, + "loss": 2.4864, + "step": 3520 + }, + { + "epoch": 0.2650407422044073, + "grad_norm": 5.845561504364014, + "learning_rate": 8.626766957680572e-05, + "loss": 2.2566, + "step": 3521 + }, + { + "epoch": 0.26511601648506744, + "grad_norm": 7.052467346191406, + "learning_rate": 8.625927657366321e-05, + "loss": 2.1368, + "step": 3522 + }, + { + "epoch": 0.2651912907657276, + "grad_norm": 4.2269768714904785, + "learning_rate": 8.625088141501074e-05, + "loss": 2.1631, + "step": 3523 + }, + { + "epoch": 0.2652665650463878, + "grad_norm": 4.330885887145996, + "learning_rate": 8.624248410134739e-05, + "loss": 2.4266, + "step": 3524 + }, + { + "epoch": 0.2653418393270479, + "grad_norm": 4.239013671875, + "learning_rate": 8.623408463317237e-05, + "loss": 1.9877, + "step": 3525 + }, + { + "epoch": 0.2654171136077081, + "grad_norm": 5.516305446624756, + "learning_rate": 8.622568301098496e-05, + "loss": 1.7916, + "step": 3526 + }, + { + "epoch": 0.26549238788836826, + "grad_norm": 5.711477279663086, + "learning_rate": 8.621727923528465e-05, + "loss": 1.8021, + "step": 3527 + }, + { + "epoch": 0.2655676621690284, + "grad_norm": 6.629284381866455, + "learning_rate": 8.620887330657102e-05, + "loss": 1.9709, + "step": 3528 + }, + { + "epoch": 0.26564293644968856, + "grad_norm": 4.86653470993042, + "learning_rate": 8.620046522534376e-05, + "loss": 2.1331, + "step": 3529 + }, + { + "epoch": 0.2657182107303487, + "grad_norm": 7.280300140380859, + "learning_rate": 8.619205499210269e-05, + "loss": 2.4143, + "step": 3530 + }, + { + "epoch": 0.26579348501100886, + "grad_norm": 6.372810363769531, + "learning_rate": 8.618364260734781e-05, + "loss": 2.0764, + "step": 3531 + }, + { + "epoch": 0.26586875929166903, + "grad_norm": 4.045786380767822, + "learning_rate": 8.617522807157921e-05, + "loss": 2.0302, + "step": 3532 + }, + { + "epoch": 0.26594403357232915, + "grad_norm": 4.7104291915893555, + "learning_rate": 8.61668113852971e-05, + "loss": 2.0957, + "step": 3533 + }, + { + "epoch": 0.26601930785298933, + "grad_norm": 4.618283748626709, + "learning_rate": 8.615839254900182e-05, + "loss": 1.9602, + "step": 3534 + }, + { + "epoch": 0.2660945821336495, + "grad_norm": 3.7331349849700928, + "learning_rate": 8.614997156319385e-05, + "loss": 1.8533, + "step": 3535 + }, + { + "epoch": 0.2661698564143096, + "grad_norm": 3.600839853286743, + "learning_rate": 8.61415484283738e-05, + "loss": 2.2455, + "step": 3536 + }, + { + "epoch": 0.2662451306949698, + "grad_norm": 4.755758285522461, + "learning_rate": 8.613312314504239e-05, + "loss": 2.0226, + "step": 3537 + }, + { + "epoch": 0.26632040497563, + "grad_norm": 4.29447603225708, + "learning_rate": 8.61246957137005e-05, + "loss": 2.1633, + "step": 3538 + }, + { + "epoch": 0.2663956792562901, + "grad_norm": 7.697479248046875, + "learning_rate": 8.61162661348491e-05, + "loss": 2.0379, + "step": 3539 + }, + { + "epoch": 0.2664709535369503, + "grad_norm": 4.47981595993042, + "learning_rate": 8.610783440898931e-05, + "loss": 1.8475, + "step": 3540 + }, + { + "epoch": 0.2665462278176104, + "grad_norm": 5.289915084838867, + "learning_rate": 8.609940053662236e-05, + "loss": 2.1634, + "step": 3541 + }, + { + "epoch": 0.26662150209827057, + "grad_norm": 7.253913879394531, + "learning_rate": 8.609096451824962e-05, + "loss": 1.8922, + "step": 3542 + }, + { + "epoch": 0.26669677637893074, + "grad_norm": 3.510399341583252, + "learning_rate": 8.608252635437261e-05, + "loss": 1.8736, + "step": 3543 + }, + { + "epoch": 0.26677205065959086, + "grad_norm": 6.3173604011535645, + "learning_rate": 8.607408604549295e-05, + "loss": 2.3701, + "step": 3544 + }, + { + "epoch": 0.26684732494025104, + "grad_norm": 5.826380729675293, + "learning_rate": 8.606564359211238e-05, + "loss": 1.9431, + "step": 3545 + }, + { + "epoch": 0.2669225992209112, + "grad_norm": 5.121246814727783, + "learning_rate": 8.605719899473277e-05, + "loss": 2.0404, + "step": 3546 + }, + { + "epoch": 0.26699787350157134, + "grad_norm": 4.491133689880371, + "learning_rate": 8.604875225385613e-05, + "loss": 2.1306, + "step": 3547 + }, + { + "epoch": 0.2670731477822315, + "grad_norm": 5.903306484222412, + "learning_rate": 8.604030336998463e-05, + "loss": 2.4523, + "step": 3548 + }, + { + "epoch": 0.2671484220628917, + "grad_norm": 7.432837009429932, + "learning_rate": 8.60318523436205e-05, + "loss": 1.9176, + "step": 3549 + }, + { + "epoch": 0.2672236963435518, + "grad_norm": 5.928546905517578, + "learning_rate": 8.602339917526612e-05, + "loss": 2.1684, + "step": 3550 + }, + { + "epoch": 0.267298970624212, + "grad_norm": 6.407994270324707, + "learning_rate": 8.601494386542403e-05, + "loss": 2.2811, + "step": 3551 + }, + { + "epoch": 0.2673742449048721, + "grad_norm": 5.582583904266357, + "learning_rate": 8.600648641459688e-05, + "loss": 2.3498, + "step": 3552 + }, + { + "epoch": 0.2674495191855323, + "grad_norm": 9.848580360412598, + "learning_rate": 8.599802682328742e-05, + "loss": 2.0346, + "step": 3553 + }, + { + "epoch": 0.26752479346619246, + "grad_norm": 7.439355850219727, + "learning_rate": 8.598956509199854e-05, + "loss": 2.2476, + "step": 3554 + }, + { + "epoch": 0.2676000677468526, + "grad_norm": 4.979750156402588, + "learning_rate": 8.598110122123329e-05, + "loss": 1.8053, + "step": 3555 + }, + { + "epoch": 0.26767534202751275, + "grad_norm": 4.2381510734558105, + "learning_rate": 8.597263521149481e-05, + "loss": 2.0136, + "step": 3556 + }, + { + "epoch": 0.26775061630817293, + "grad_norm": 3.3402321338653564, + "learning_rate": 8.596416706328639e-05, + "loss": 2.2251, + "step": 3557 + }, + { + "epoch": 0.26782589058883305, + "grad_norm": 4.999721527099609, + "learning_rate": 8.595569677711143e-05, + "loss": 1.7848, + "step": 3558 + }, + { + "epoch": 0.2679011648694932, + "grad_norm": 6.978513240814209, + "learning_rate": 8.594722435347347e-05, + "loss": 2.1537, + "step": 3559 + }, + { + "epoch": 0.26797643915015334, + "grad_norm": 5.452956199645996, + "learning_rate": 8.593874979287615e-05, + "loss": 2.7631, + "step": 3560 + }, + { + "epoch": 0.2680517134308135, + "grad_norm": 3.072632312774658, + "learning_rate": 8.593027309582328e-05, + "loss": 2.0021, + "step": 3561 + }, + { + "epoch": 0.2681269877114737, + "grad_norm": 5.195108413696289, + "learning_rate": 8.592179426281878e-05, + "loss": 2.1097, + "step": 3562 + }, + { + "epoch": 0.2682022619921338, + "grad_norm": 6.555400371551514, + "learning_rate": 8.591331329436668e-05, + "loss": 1.9364, + "step": 3563 + }, + { + "epoch": 0.268277536272794, + "grad_norm": 8.0423002243042, + "learning_rate": 8.590483019097114e-05, + "loss": 2.3436, + "step": 3564 + }, + { + "epoch": 0.26835281055345417, + "grad_norm": 4.884520053863525, + "learning_rate": 8.589634495313648e-05, + "loss": 2.1912, + "step": 3565 + }, + { + "epoch": 0.2684280848341143, + "grad_norm": 4.626718521118164, + "learning_rate": 8.58878575813671e-05, + "loss": 2.5206, + "step": 3566 + }, + { + "epoch": 0.26850335911477446, + "grad_norm": 3.974411725997925, + "learning_rate": 8.587936807616756e-05, + "loss": 2.1137, + "step": 3567 + }, + { + "epoch": 0.26857863339543464, + "grad_norm": 4.4176483154296875, + "learning_rate": 8.587087643804256e-05, + "loss": 2.0215, + "step": 3568 + }, + { + "epoch": 0.26865390767609476, + "grad_norm": 5.508111953735352, + "learning_rate": 8.586238266749686e-05, + "loss": 1.8853, + "step": 3569 + }, + { + "epoch": 0.26872918195675494, + "grad_norm": 4.688547134399414, + "learning_rate": 8.585388676503543e-05, + "loss": 2.128, + "step": 3570 + }, + { + "epoch": 0.26880445623741506, + "grad_norm": 3.828927993774414, + "learning_rate": 8.584538873116328e-05, + "loss": 1.676, + "step": 3571 + }, + { + "epoch": 0.26887973051807523, + "grad_norm": 6.4143290519714355, + "learning_rate": 8.583688856638563e-05, + "loss": 1.9208, + "step": 3572 + }, + { + "epoch": 0.2689550047987354, + "grad_norm": 5.6260905265808105, + "learning_rate": 8.58283862712078e-05, + "loss": 2.5867, + "step": 3573 + }, + { + "epoch": 0.26903027907939553, + "grad_norm": 6.218117713928223, + "learning_rate": 8.58198818461352e-05, + "loss": 2.0441, + "step": 3574 + }, + { + "epoch": 0.2691055533600557, + "grad_norm": 7.069939136505127, + "learning_rate": 8.58113752916734e-05, + "loss": 1.9179, + "step": 3575 + }, + { + "epoch": 0.2691808276407159, + "grad_norm": 5.20361328125, + "learning_rate": 8.58028666083281e-05, + "loss": 2.1202, + "step": 3576 + }, + { + "epoch": 0.269256101921376, + "grad_norm": 5.410080432891846, + "learning_rate": 8.57943557966051e-05, + "loss": 2.0853, + "step": 3577 + }, + { + "epoch": 0.2693313762020362, + "grad_norm": 4.13230037689209, + "learning_rate": 8.578584285701036e-05, + "loss": 1.792, + "step": 3578 + }, + { + "epoch": 0.2694066504826963, + "grad_norm": 4.529636859893799, + "learning_rate": 8.577732779004995e-05, + "loss": 2.1396, + "step": 3579 + }, + { + "epoch": 0.2694819247633565, + "grad_norm": 4.05285120010376, + "learning_rate": 8.576881059623003e-05, + "loss": 2.2955, + "step": 3580 + }, + { + "epoch": 0.26955719904401665, + "grad_norm": 4.4058685302734375, + "learning_rate": 8.576029127605698e-05, + "loss": 2.061, + "step": 3581 + }, + { + "epoch": 0.26963247332467677, + "grad_norm": 5.240222454071045, + "learning_rate": 8.575176983003721e-05, + "loss": 2.3837, + "step": 3582 + }, + { + "epoch": 0.26970774760533694, + "grad_norm": 4.010353088378906, + "learning_rate": 8.574324625867732e-05, + "loss": 1.7741, + "step": 3583 + }, + { + "epoch": 0.2697830218859971, + "grad_norm": 6.166420936584473, + "learning_rate": 8.573472056248399e-05, + "loss": 2.0576, + "step": 3584 + }, + { + "epoch": 0.26985829616665724, + "grad_norm": 4.106327056884766, + "learning_rate": 8.572619274196406e-05, + "loss": 2.069, + "step": 3585 + }, + { + "epoch": 0.2699335704473174, + "grad_norm": 3.5963661670684814, + "learning_rate": 8.571766279762448e-05, + "loss": 2.0419, + "step": 3586 + }, + { + "epoch": 0.2700088447279776, + "grad_norm": 4.46986198425293, + "learning_rate": 8.570913072997233e-05, + "loss": 2.2244, + "step": 3587 + }, + { + "epoch": 0.2700841190086377, + "grad_norm": 6.361734867095947, + "learning_rate": 8.570059653951482e-05, + "loss": 2.4293, + "step": 3588 + }, + { + "epoch": 0.2701593932892979, + "grad_norm": 4.877951622009277, + "learning_rate": 8.569206022675927e-05, + "loss": 1.6311, + "step": 3589 + }, + { + "epoch": 0.270234667569958, + "grad_norm": 7.91477108001709, + "learning_rate": 8.568352179221317e-05, + "loss": 2.2059, + "step": 3590 + }, + { + "epoch": 0.2703099418506182, + "grad_norm": 6.4916090965271, + "learning_rate": 8.567498123638408e-05, + "loss": 2.0631, + "step": 3591 + }, + { + "epoch": 0.27038521613127836, + "grad_norm": 6.365734577178955, + "learning_rate": 8.566643855977972e-05, + "loss": 2.248, + "step": 3592 + }, + { + "epoch": 0.2704604904119385, + "grad_norm": 3.8709022998809814, + "learning_rate": 8.565789376290793e-05, + "loss": 1.9263, + "step": 3593 + }, + { + "epoch": 0.27053576469259866, + "grad_norm": 5.647452354431152, + "learning_rate": 8.564934684627665e-05, + "loss": 2.0827, + "step": 3594 + }, + { + "epoch": 0.27061103897325883, + "grad_norm": 8.403019905090332, + "learning_rate": 8.564079781039402e-05, + "loss": 1.8089, + "step": 3595 + }, + { + "epoch": 0.27068631325391895, + "grad_norm": 7.931789875030518, + "learning_rate": 8.56322466557682e-05, + "loss": 2.3045, + "step": 3596 + }, + { + "epoch": 0.27076158753457913, + "grad_norm": 7.294676303863525, + "learning_rate": 8.562369338290757e-05, + "loss": 2.3273, + "step": 3597 + }, + { + "epoch": 0.2708368618152393, + "grad_norm": 6.213169574737549, + "learning_rate": 8.561513799232057e-05, + "loss": 2.2608, + "step": 3598 + }, + { + "epoch": 0.2709121360958994, + "grad_norm": 6.195013046264648, + "learning_rate": 8.560658048451584e-05, + "loss": 1.9316, + "step": 3599 + }, + { + "epoch": 0.2709874103765596, + "grad_norm": 4.21322774887085, + "learning_rate": 8.559802086000204e-05, + "loss": 2.1797, + "step": 3600 + }, + { + "epoch": 0.2710626846572197, + "grad_norm": 4.740867614746094, + "learning_rate": 8.558945911928804e-05, + "loss": 1.993, + "step": 3601 + }, + { + "epoch": 0.2711379589378799, + "grad_norm": 5.991433143615723, + "learning_rate": 8.558089526288282e-05, + "loss": 1.7565, + "step": 3602 + }, + { + "epoch": 0.2712132332185401, + "grad_norm": 4.477705955505371, + "learning_rate": 8.557232929129547e-05, + "loss": 1.9191, + "step": 3603 + }, + { + "epoch": 0.2712885074992002, + "grad_norm": 4.275289535522461, + "learning_rate": 8.556376120503522e-05, + "loss": 2.0778, + "step": 3604 + }, + { + "epoch": 0.27136378177986037, + "grad_norm": 7.188881874084473, + "learning_rate": 8.55551910046114e-05, + "loss": 2.343, + "step": 3605 + }, + { + "epoch": 0.27143905606052054, + "grad_norm": 4.844945430755615, + "learning_rate": 8.554661869053348e-05, + "loss": 2.0136, + "step": 3606 + }, + { + "epoch": 0.27151433034118067, + "grad_norm": 4.660032272338867, + "learning_rate": 8.55380442633111e-05, + "loss": 2.0537, + "step": 3607 + }, + { + "epoch": 0.27158960462184084, + "grad_norm": 4.035862922668457, + "learning_rate": 8.552946772345393e-05, + "loss": 2.0713, + "step": 3608 + }, + { + "epoch": 0.27166487890250096, + "grad_norm": 4.965854644775391, + "learning_rate": 8.552088907147189e-05, + "loss": 2.03, + "step": 3609 + }, + { + "epoch": 0.27174015318316114, + "grad_norm": 5.072443962097168, + "learning_rate": 8.55123083078749e-05, + "loss": 1.9861, + "step": 3610 + }, + { + "epoch": 0.2718154274638213, + "grad_norm": 5.708595275878906, + "learning_rate": 8.550372543317307e-05, + "loss": 2.65, + "step": 3611 + }, + { + "epoch": 0.27189070174448143, + "grad_norm": 5.120248317718506, + "learning_rate": 8.549514044787663e-05, + "loss": 2.047, + "step": 3612 + }, + { + "epoch": 0.2719659760251416, + "grad_norm": 4.392369270324707, + "learning_rate": 8.548655335249593e-05, + "loss": 2.5038, + "step": 3613 + }, + { + "epoch": 0.2720412503058018, + "grad_norm": 5.40472412109375, + "learning_rate": 8.547796414754148e-05, + "loss": 2.2558, + "step": 3614 + }, + { + "epoch": 0.2721165245864619, + "grad_norm": 6.653119087219238, + "learning_rate": 8.546937283352384e-05, + "loss": 1.9401, + "step": 3615 + }, + { + "epoch": 0.2721917988671221, + "grad_norm": 6.963906764984131, + "learning_rate": 8.546077941095376e-05, + "loss": 2.2188, + "step": 3616 + }, + { + "epoch": 0.27226707314778226, + "grad_norm": 3.746129035949707, + "learning_rate": 8.545218388034211e-05, + "loss": 2.3688, + "step": 3617 + }, + { + "epoch": 0.2723423474284424, + "grad_norm": 4.360978603363037, + "learning_rate": 8.544358624219983e-05, + "loss": 1.9599, + "step": 3618 + }, + { + "epoch": 0.27241762170910255, + "grad_norm": 5.566946029663086, + "learning_rate": 8.543498649703807e-05, + "loss": 1.9973, + "step": 3619 + }, + { + "epoch": 0.2724928959897627, + "grad_norm": 4.58626127243042, + "learning_rate": 8.542638464536801e-05, + "loss": 1.924, + "step": 3620 + }, + { + "epoch": 0.27256817027042285, + "grad_norm": 5.034262180328369, + "learning_rate": 8.541778068770105e-05, + "loss": 2.2181, + "step": 3621 + }, + { + "epoch": 0.272643444551083, + "grad_norm": 3.571824550628662, + "learning_rate": 8.540917462454865e-05, + "loss": 2.126, + "step": 3622 + }, + { + "epoch": 0.27271871883174315, + "grad_norm": 5.0311503410339355, + "learning_rate": 8.540056645642242e-05, + "loss": 2.2951, + "step": 3623 + }, + { + "epoch": 0.2727939931124033, + "grad_norm": 4.832629680633545, + "learning_rate": 8.539195618383409e-05, + "loss": 2.0846, + "step": 3624 + }, + { + "epoch": 0.2728692673930635, + "grad_norm": 5.541370391845703, + "learning_rate": 8.538334380729552e-05, + "loss": 1.9308, + "step": 3625 + }, + { + "epoch": 0.2729445416737236, + "grad_norm": 4.064426898956299, + "learning_rate": 8.537472932731867e-05, + "loss": 1.9538, + "step": 3626 + }, + { + "epoch": 0.2730198159543838, + "grad_norm": 4.208832263946533, + "learning_rate": 8.536611274441567e-05, + "loss": 2.1032, + "step": 3627 + }, + { + "epoch": 0.27309509023504397, + "grad_norm": 3.8191335201263428, + "learning_rate": 8.535749405909876e-05, + "loss": 2.1858, + "step": 3628 + }, + { + "epoch": 0.2731703645157041, + "grad_norm": 6.1279520988464355, + "learning_rate": 8.534887327188026e-05, + "loss": 2.2174, + "step": 3629 + }, + { + "epoch": 0.27324563879636427, + "grad_norm": 7.215233325958252, + "learning_rate": 8.534025038327267e-05, + "loss": 2.6492, + "step": 3630 + }, + { + "epoch": 0.2733209130770244, + "grad_norm": 3.9133596420288086, + "learning_rate": 8.533162539378861e-05, + "loss": 2.0271, + "step": 3631 + }, + { + "epoch": 0.27339618735768456, + "grad_norm": 5.657726287841797, + "learning_rate": 8.53229983039408e-05, + "loss": 1.6412, + "step": 3632 + }, + { + "epoch": 0.27347146163834474, + "grad_norm": 5.177445411682129, + "learning_rate": 8.531436911424209e-05, + "loss": 2.0748, + "step": 3633 + }, + { + "epoch": 0.27354673591900486, + "grad_norm": 4.1661696434021, + "learning_rate": 8.530573782520546e-05, + "loss": 2.3361, + "step": 3634 + }, + { + "epoch": 0.27362201019966503, + "grad_norm": 6.5816874504089355, + "learning_rate": 8.529710443734402e-05, + "loss": 2.3101, + "step": 3635 + }, + { + "epoch": 0.2736972844803252, + "grad_norm": 4.619372367858887, + "learning_rate": 8.5288468951171e-05, + "loss": 1.917, + "step": 3636 + }, + { + "epoch": 0.27377255876098533, + "grad_norm": 3.834791660308838, + "learning_rate": 8.527983136719977e-05, + "loss": 1.8601, + "step": 3637 + }, + { + "epoch": 0.2738478330416455, + "grad_norm": 6.1451215744018555, + "learning_rate": 8.527119168594377e-05, + "loss": 2.1909, + "step": 3638 + }, + { + "epoch": 0.2739231073223056, + "grad_norm": 6.895201206207275, + "learning_rate": 8.526254990791665e-05, + "loss": 2.3125, + "step": 3639 + }, + { + "epoch": 0.2739983816029658, + "grad_norm": 4.202020168304443, + "learning_rate": 8.52539060336321e-05, + "loss": 2.054, + "step": 3640 + }, + { + "epoch": 0.274073655883626, + "grad_norm": 5.718653202056885, + "learning_rate": 8.524526006360401e-05, + "loss": 2.4088, + "step": 3641 + }, + { + "epoch": 0.2741489301642861, + "grad_norm": 5.545661449432373, + "learning_rate": 8.523661199834631e-05, + "loss": 2.1962, + "step": 3642 + }, + { + "epoch": 0.2742242044449463, + "grad_norm": 3.9309816360473633, + "learning_rate": 8.522796183837316e-05, + "loss": 1.969, + "step": 3643 + }, + { + "epoch": 0.27429947872560645, + "grad_norm": 5.553758144378662, + "learning_rate": 8.521930958419874e-05, + "loss": 2.3714, + "step": 3644 + }, + { + "epoch": 0.27437475300626657, + "grad_norm": 4.314330101013184, + "learning_rate": 8.521065523633744e-05, + "loss": 2.1054, + "step": 3645 + }, + { + "epoch": 0.27445002728692675, + "grad_norm": 5.760187149047852, + "learning_rate": 8.52019987953037e-05, + "loss": 1.8984, + "step": 3646 + }, + { + "epoch": 0.2745253015675869, + "grad_norm": 4.427959442138672, + "learning_rate": 8.519334026161215e-05, + "loss": 1.9055, + "step": 3647 + }, + { + "epoch": 0.27460057584824704, + "grad_norm": 4.975903511047363, + "learning_rate": 8.51846796357775e-05, + "loss": 1.8556, + "step": 3648 + }, + { + "epoch": 0.2746758501289072, + "grad_norm": 4.018494129180908, + "learning_rate": 8.517601691831461e-05, + "loss": 1.9764, + "step": 3649 + }, + { + "epoch": 0.27475112440956734, + "grad_norm": 4.055217266082764, + "learning_rate": 8.516735210973841e-05, + "loss": 2.4727, + "step": 3650 + }, + { + "epoch": 0.2748263986902275, + "grad_norm": 4.514999866485596, + "learning_rate": 8.515868521056407e-05, + "loss": 2.041, + "step": 3651 + }, + { + "epoch": 0.2749016729708877, + "grad_norm": 5.443676948547363, + "learning_rate": 8.515001622130676e-05, + "loss": 2.4615, + "step": 3652 + }, + { + "epoch": 0.2749769472515478, + "grad_norm": 3.3675954341888428, + "learning_rate": 8.514134514248185e-05, + "loss": 2.0067, + "step": 3653 + }, + { + "epoch": 0.275052221532208, + "grad_norm": 4.267234802246094, + "learning_rate": 8.51326719746048e-05, + "loss": 2.1175, + "step": 3654 + }, + { + "epoch": 0.27512749581286816, + "grad_norm": 4.728448867797852, + "learning_rate": 8.51239967181912e-05, + "loss": 1.9433, + "step": 3655 + }, + { + "epoch": 0.2752027700935283, + "grad_norm": 3.8933749198913574, + "learning_rate": 8.511531937375678e-05, + "loss": 2.2744, + "step": 3656 + }, + { + "epoch": 0.27527804437418846, + "grad_norm": 4.720108985900879, + "learning_rate": 8.510663994181739e-05, + "loss": 2.1416, + "step": 3657 + }, + { + "epoch": 0.2753533186548486, + "grad_norm": 4.744030952453613, + "learning_rate": 8.509795842288897e-05, + "loss": 2.0895, + "step": 3658 + }, + { + "epoch": 0.27542859293550875, + "grad_norm": 4.030458450317383, + "learning_rate": 8.508927481748765e-05, + "loss": 2.0314, + "step": 3659 + }, + { + "epoch": 0.27550386721616893, + "grad_norm": 4.617257595062256, + "learning_rate": 8.508058912612961e-05, + "loss": 2.7968, + "step": 3660 + }, + { + "epoch": 0.27557914149682905, + "grad_norm": 5.676261901855469, + "learning_rate": 8.507190134933122e-05, + "loss": 2.2711, + "step": 3661 + }, + { + "epoch": 0.2756544157774892, + "grad_norm": 4.699383735656738, + "learning_rate": 8.506321148760891e-05, + "loss": 2.1165, + "step": 3662 + }, + { + "epoch": 0.2757296900581494, + "grad_norm": 4.686986923217773, + "learning_rate": 8.50545195414793e-05, + "loss": 2.1272, + "step": 3663 + }, + { + "epoch": 0.2758049643388095, + "grad_norm": 7.716921806335449, + "learning_rate": 8.504582551145907e-05, + "loss": 2.843, + "step": 3664 + }, + { + "epoch": 0.2758802386194697, + "grad_norm": 7.023846626281738, + "learning_rate": 8.503712939806509e-05, + "loss": 2.061, + "step": 3665 + }, + { + "epoch": 0.2759555129001299, + "grad_norm": 4.08076810836792, + "learning_rate": 8.50284312018143e-05, + "loss": 1.8785, + "step": 3666 + }, + { + "epoch": 0.27603078718079, + "grad_norm": 6.744572639465332, + "learning_rate": 8.501973092322377e-05, + "loss": 2.2279, + "step": 3667 + }, + { + "epoch": 0.27610606146145017, + "grad_norm": 5.839383125305176, + "learning_rate": 8.501102856281072e-05, + "loss": 1.923, + "step": 3668 + }, + { + "epoch": 0.2761813357421103, + "grad_norm": 5.463138103485107, + "learning_rate": 8.500232412109248e-05, + "loss": 1.9304, + "step": 3669 + }, + { + "epoch": 0.27625661002277047, + "grad_norm": 4.074203968048096, + "learning_rate": 8.499361759858651e-05, + "loss": 2.0525, + "step": 3670 + }, + { + "epoch": 0.27633188430343064, + "grad_norm": 4.893757343292236, + "learning_rate": 8.498490899581037e-05, + "loss": 2.1287, + "step": 3671 + }, + { + "epoch": 0.27640715858409076, + "grad_norm": 3.9854490756988525, + "learning_rate": 8.497619831328178e-05, + "loss": 2.0592, + "step": 3672 + }, + { + "epoch": 0.27648243286475094, + "grad_norm": 4.770266532897949, + "learning_rate": 8.496748555151855e-05, + "loss": 1.9451, + "step": 3673 + }, + { + "epoch": 0.2765577071454111, + "grad_norm": 4.641678333282471, + "learning_rate": 8.495877071103864e-05, + "loss": 2.0983, + "step": 3674 + }, + { + "epoch": 0.27663298142607123, + "grad_norm": 3.688652515411377, + "learning_rate": 8.495005379236012e-05, + "loss": 2.0546, + "step": 3675 + }, + { + "epoch": 0.2767082557067314, + "grad_norm": 4.314660549163818, + "learning_rate": 8.494133479600121e-05, + "loss": 2.5765, + "step": 3676 + }, + { + "epoch": 0.2767835299873916, + "grad_norm": 5.093321323394775, + "learning_rate": 8.493261372248018e-05, + "loss": 1.8994, + "step": 3677 + }, + { + "epoch": 0.2768588042680517, + "grad_norm": 6.219925403594971, + "learning_rate": 8.492389057231549e-05, + "loss": 2.0182, + "step": 3678 + }, + { + "epoch": 0.2769340785487119, + "grad_norm": 4.212336540222168, + "learning_rate": 8.491516534602573e-05, + "loss": 2.0456, + "step": 3679 + }, + { + "epoch": 0.277009352829372, + "grad_norm": 5.203505992889404, + "learning_rate": 8.490643804412956e-05, + "loss": 1.9374, + "step": 3680 + }, + { + "epoch": 0.2770846271100322, + "grad_norm": 4.704999923706055, + "learning_rate": 8.489770866714582e-05, + "loss": 1.6675, + "step": 3681 + }, + { + "epoch": 0.27715990139069235, + "grad_norm": 5.450031280517578, + "learning_rate": 8.488897721559343e-05, + "loss": 2.1451, + "step": 3682 + }, + { + "epoch": 0.2772351756713525, + "grad_norm": 6.107594966888428, + "learning_rate": 8.488024368999144e-05, + "loss": 2.1799, + "step": 3683 + }, + { + "epoch": 0.27731044995201265, + "grad_norm": 6.630837440490723, + "learning_rate": 8.487150809085907e-05, + "loss": 2.6763, + "step": 3684 + }, + { + "epoch": 0.2773857242326728, + "grad_norm": 4.827672481536865, + "learning_rate": 8.486277041871557e-05, + "loss": 1.8602, + "step": 3685 + }, + { + "epoch": 0.27746099851333295, + "grad_norm": 3.5622708797454834, + "learning_rate": 8.485403067408042e-05, + "loss": 1.9435, + "step": 3686 + }, + { + "epoch": 0.2775362727939931, + "grad_norm": 4.234933376312256, + "learning_rate": 8.484528885747316e-05, + "loss": 1.861, + "step": 3687 + }, + { + "epoch": 0.27761154707465324, + "grad_norm": 4.909501552581787, + "learning_rate": 8.483654496941344e-05, + "loss": 1.9997, + "step": 3688 + }, + { + "epoch": 0.2776868213553134, + "grad_norm": 5.760375499725342, + "learning_rate": 8.482779901042108e-05, + "loss": 2.3467, + "step": 3689 + }, + { + "epoch": 0.2777620956359736, + "grad_norm": 6.31289529800415, + "learning_rate": 8.481905098101601e-05, + "loss": 2.2078, + "step": 3690 + }, + { + "epoch": 0.2778373699166337, + "grad_norm": 4.647454261779785, + "learning_rate": 8.481030088171827e-05, + "loss": 1.9932, + "step": 3691 + }, + { + "epoch": 0.2779126441972939, + "grad_norm": 5.118434906005859, + "learning_rate": 8.480154871304802e-05, + "loss": 2.2114, + "step": 3692 + }, + { + "epoch": 0.27798791847795407, + "grad_norm": 5.353497505187988, + "learning_rate": 8.479279447552557e-05, + "loss": 2.077, + "step": 3693 + }, + { + "epoch": 0.2780631927586142, + "grad_norm": 5.585944652557373, + "learning_rate": 8.47840381696713e-05, + "loss": 2.3376, + "step": 3694 + }, + { + "epoch": 0.27813846703927436, + "grad_norm": 4.524191379547119, + "learning_rate": 8.477527979600577e-05, + "loss": 2.2071, + "step": 3695 + }, + { + "epoch": 0.27821374131993454, + "grad_norm": 4.1141581535339355, + "learning_rate": 8.476651935504964e-05, + "loss": 1.8742, + "step": 3696 + }, + { + "epoch": 0.27828901560059466, + "grad_norm": 4.320529937744141, + "learning_rate": 8.475775684732369e-05, + "loss": 1.821, + "step": 3697 + }, + { + "epoch": 0.27836428988125483, + "grad_norm": 6.32377290725708, + "learning_rate": 8.474899227334884e-05, + "loss": 2.3346, + "step": 3698 + }, + { + "epoch": 0.27843956416191495, + "grad_norm": 4.690769672393799, + "learning_rate": 8.47402256336461e-05, + "loss": 2.014, + "step": 3699 + }, + { + "epoch": 0.27851483844257513, + "grad_norm": 4.876758098602295, + "learning_rate": 8.473145692873661e-05, + "loss": 1.8336, + "step": 3700 + }, + { + "epoch": 0.2785901127232353, + "grad_norm": 3.833322763442993, + "learning_rate": 8.472268615914168e-05, + "loss": 1.9416, + "step": 3701 + }, + { + "epoch": 0.2786653870038954, + "grad_norm": 3.9122402667999268, + "learning_rate": 8.471391332538268e-05, + "loss": 2.0796, + "step": 3702 + }, + { + "epoch": 0.2787406612845556, + "grad_norm": 4.947221279144287, + "learning_rate": 8.470513842798114e-05, + "loss": 2.0523, + "step": 3703 + }, + { + "epoch": 0.2788159355652158, + "grad_norm": 5.889840126037598, + "learning_rate": 8.46963614674587e-05, + "loss": 2.1409, + "step": 3704 + }, + { + "epoch": 0.2788912098458759, + "grad_norm": 6.707186698913574, + "learning_rate": 8.468758244433712e-05, + "loss": 2.2097, + "step": 3705 + }, + { + "epoch": 0.2789664841265361, + "grad_norm": 6.559521198272705, + "learning_rate": 8.46788013591383e-05, + "loss": 1.7735, + "step": 3706 + }, + { + "epoch": 0.2790417584071962, + "grad_norm": 5.264425754547119, + "learning_rate": 8.467001821238422e-05, + "loss": 2.0271, + "step": 3707 + }, + { + "epoch": 0.27911703268785637, + "grad_norm": 4.324186325073242, + "learning_rate": 8.466123300459707e-05, + "loss": 1.9381, + "step": 3708 + }, + { + "epoch": 0.27919230696851655, + "grad_norm": 4.952320575714111, + "learning_rate": 8.465244573629907e-05, + "loss": 1.9165, + "step": 3709 + }, + { + "epoch": 0.27926758124917667, + "grad_norm": 4.5556182861328125, + "learning_rate": 8.464365640801259e-05, + "loss": 2.3048, + "step": 3710 + }, + { + "epoch": 0.27934285552983684, + "grad_norm": 4.639623641967773, + "learning_rate": 8.463486502026015e-05, + "loss": 2.4237, + "step": 3711 + }, + { + "epoch": 0.279418129810497, + "grad_norm": 4.315785884857178, + "learning_rate": 8.462607157356435e-05, + "loss": 1.9636, + "step": 3712 + }, + { + "epoch": 0.27949340409115714, + "grad_norm": 3.648529052734375, + "learning_rate": 8.461727606844796e-05, + "loss": 1.7965, + "step": 3713 + }, + { + "epoch": 0.2795686783718173, + "grad_norm": 5.383721828460693, + "learning_rate": 8.460847850543382e-05, + "loss": 2.2302, + "step": 3714 + }, + { + "epoch": 0.2796439526524775, + "grad_norm": 3.9807448387145996, + "learning_rate": 8.459967888504494e-05, + "loss": 2.054, + "step": 3715 + }, + { + "epoch": 0.2797192269331376, + "grad_norm": 5.0536017417907715, + "learning_rate": 8.459087720780443e-05, + "loss": 2.2137, + "step": 3716 + }, + { + "epoch": 0.2797945012137978, + "grad_norm": 5.559696197509766, + "learning_rate": 8.458207347423554e-05, + "loss": 2.1715, + "step": 3717 + }, + { + "epoch": 0.2798697754944579, + "grad_norm": 4.65275239944458, + "learning_rate": 8.457326768486159e-05, + "loss": 2.2653, + "step": 3718 + }, + { + "epoch": 0.2799450497751181, + "grad_norm": 5.178258895874023, + "learning_rate": 8.456445984020607e-05, + "loss": 1.9544, + "step": 3719 + }, + { + "epoch": 0.28002032405577826, + "grad_norm": 4.327636241912842, + "learning_rate": 8.455564994079261e-05, + "loss": 2.1668, + "step": 3720 + }, + { + "epoch": 0.2800955983364384, + "grad_norm": 4.348104476928711, + "learning_rate": 8.45468379871449e-05, + "loss": 1.9429, + "step": 3721 + }, + { + "epoch": 0.28017087261709855, + "grad_norm": 5.069277286529541, + "learning_rate": 8.453802397978681e-05, + "loss": 2.3457, + "step": 3722 + }, + { + "epoch": 0.28024614689775873, + "grad_norm": 4.621848106384277, + "learning_rate": 8.452920791924228e-05, + "loss": 2.2044, + "step": 3723 + }, + { + "epoch": 0.28032142117841885, + "grad_norm": 5.038558483123779, + "learning_rate": 8.452038980603543e-05, + "loss": 2.4712, + "step": 3724 + }, + { + "epoch": 0.280396695459079, + "grad_norm": 5.322809219360352, + "learning_rate": 8.451156964069043e-05, + "loss": 2.0299, + "step": 3725 + }, + { + "epoch": 0.2804719697397392, + "grad_norm": 3.8125722408294678, + "learning_rate": 8.450274742373167e-05, + "loss": 2.2145, + "step": 3726 + }, + { + "epoch": 0.2805472440203993, + "grad_norm": 5.403464317321777, + "learning_rate": 8.449392315568356e-05, + "loss": 2.3177, + "step": 3727 + }, + { + "epoch": 0.2806225183010595, + "grad_norm": 4.7079854011535645, + "learning_rate": 8.448509683707069e-05, + "loss": 1.9227, + "step": 3728 + }, + { + "epoch": 0.2806977925817196, + "grad_norm": 5.48919153213501, + "learning_rate": 8.447626846841777e-05, + "loss": 2.0462, + "step": 3729 + }, + { + "epoch": 0.2807730668623798, + "grad_norm": 4.870682716369629, + "learning_rate": 8.44674380502496e-05, + "loss": 2.2351, + "step": 3730 + }, + { + "epoch": 0.28084834114303997, + "grad_norm": 4.92102575302124, + "learning_rate": 8.445860558309116e-05, + "loss": 1.9376, + "step": 3731 + }, + { + "epoch": 0.2809236154237001, + "grad_norm": 3.806340456008911, + "learning_rate": 8.444977106746748e-05, + "loss": 1.9041, + "step": 3732 + }, + { + "epoch": 0.28099888970436027, + "grad_norm": 4.492342472076416, + "learning_rate": 8.444093450390373e-05, + "loss": 2.0126, + "step": 3733 + }, + { + "epoch": 0.28107416398502044, + "grad_norm": 4.713959693908691, + "learning_rate": 8.44320958929253e-05, + "loss": 2.1837, + "step": 3734 + }, + { + "epoch": 0.28114943826568056, + "grad_norm": 4.521111011505127, + "learning_rate": 8.442325523505752e-05, + "loss": 1.8736, + "step": 3735 + }, + { + "epoch": 0.28122471254634074, + "grad_norm": 3.839892864227295, + "learning_rate": 8.441441253082601e-05, + "loss": 2.0659, + "step": 3736 + }, + { + "epoch": 0.28129998682700086, + "grad_norm": 5.555484771728516, + "learning_rate": 8.440556778075641e-05, + "loss": 1.7145, + "step": 3737 + }, + { + "epoch": 0.28137526110766103, + "grad_norm": 4.659602642059326, + "learning_rate": 8.439672098537452e-05, + "loss": 2.2435, + "step": 3738 + }, + { + "epoch": 0.2814505353883212, + "grad_norm": 5.173230171203613, + "learning_rate": 8.438787214520626e-05, + "loss": 2.0285, + "step": 3739 + }, + { + "epoch": 0.28152580966898133, + "grad_norm": 4.528423309326172, + "learning_rate": 8.43790212607777e-05, + "loss": 2.591, + "step": 3740 + }, + { + "epoch": 0.2816010839496415, + "grad_norm": 5.994011878967285, + "learning_rate": 8.437016833261496e-05, + "loss": 1.8713, + "step": 3741 + }, + { + "epoch": 0.2816763582303017, + "grad_norm": 4.154655456542969, + "learning_rate": 8.436131336124432e-05, + "loss": 2.0949, + "step": 3742 + }, + { + "epoch": 0.2817516325109618, + "grad_norm": 4.845334529876709, + "learning_rate": 8.43524563471922e-05, + "loss": 1.9901, + "step": 3743 + }, + { + "epoch": 0.281826906791622, + "grad_norm": 5.715048313140869, + "learning_rate": 8.43435972909851e-05, + "loss": 1.8887, + "step": 3744 + }, + { + "epoch": 0.28190218107228215, + "grad_norm": 3.6951839923858643, + "learning_rate": 8.433473619314972e-05, + "loss": 1.9527, + "step": 3745 + }, + { + "epoch": 0.2819774553529423, + "grad_norm": 4.321444034576416, + "learning_rate": 8.432587305421278e-05, + "loss": 1.9878, + "step": 3746 + }, + { + "epoch": 0.28205272963360245, + "grad_norm": 4.282951354980469, + "learning_rate": 8.431700787470118e-05, + "loss": 2.0862, + "step": 3747 + }, + { + "epoch": 0.28212800391426257, + "grad_norm": 5.607655048370361, + "learning_rate": 8.43081406551419e-05, + "loss": 2.1657, + "step": 3748 + }, + { + "epoch": 0.28220327819492275, + "grad_norm": 5.064688682556152, + "learning_rate": 8.429927139606213e-05, + "loss": 2.1957, + "step": 3749 + }, + { + "epoch": 0.2822785524755829, + "grad_norm": 5.386007308959961, + "learning_rate": 8.429040009798908e-05, + "loss": 2.0302, + "step": 3750 + }, + { + "epoch": 0.28235382675624304, + "grad_norm": 6.051460266113281, + "learning_rate": 8.428152676145013e-05, + "loss": 2.2934, + "step": 3751 + }, + { + "epoch": 0.2824291010369032, + "grad_norm": 5.103939056396484, + "learning_rate": 8.427265138697279e-05, + "loss": 2.291, + "step": 3752 + }, + { + "epoch": 0.2825043753175634, + "grad_norm": 5.415882587432861, + "learning_rate": 8.426377397508465e-05, + "loss": 2.2452, + "step": 3753 + }, + { + "epoch": 0.2825796495982235, + "grad_norm": 6.235812187194824, + "learning_rate": 8.425489452631348e-05, + "loss": 1.9409, + "step": 3754 + }, + { + "epoch": 0.2826549238788837, + "grad_norm": 5.436208248138428, + "learning_rate": 8.424601304118711e-05, + "loss": 2.2726, + "step": 3755 + }, + { + "epoch": 0.2827301981595438, + "grad_norm": 4.568819046020508, + "learning_rate": 8.423712952023353e-05, + "loss": 2.0822, + "step": 3756 + }, + { + "epoch": 0.282805472440204, + "grad_norm": 4.734124183654785, + "learning_rate": 8.422824396398082e-05, + "loss": 1.9301, + "step": 3757 + }, + { + "epoch": 0.28288074672086416, + "grad_norm": 5.557109355926514, + "learning_rate": 8.421935637295724e-05, + "loss": 1.9187, + "step": 3758 + }, + { + "epoch": 0.2829560210015243, + "grad_norm": 4.806347846984863, + "learning_rate": 8.42104667476911e-05, + "loss": 2.0105, + "step": 3759 + }, + { + "epoch": 0.28303129528218446, + "grad_norm": 7.374860763549805, + "learning_rate": 8.420157508871087e-05, + "loss": 2.0197, + "step": 3760 + }, + { + "epoch": 0.28310656956284463, + "grad_norm": 5.27902889251709, + "learning_rate": 8.419268139654514e-05, + "loss": 2.4943, + "step": 3761 + }, + { + "epoch": 0.28318184384350475, + "grad_norm": 4.831271648406982, + "learning_rate": 8.418378567172262e-05, + "loss": 2.3162, + "step": 3762 + }, + { + "epoch": 0.28325711812416493, + "grad_norm": 4.960517883300781, + "learning_rate": 8.41748879147721e-05, + "loss": 2.2269, + "step": 3763 + }, + { + "epoch": 0.2833323924048251, + "grad_norm": 6.046313285827637, + "learning_rate": 8.416598812622259e-05, + "loss": 1.7854, + "step": 3764 + }, + { + "epoch": 0.2834076666854852, + "grad_norm": 5.182833671569824, + "learning_rate": 8.415708630660309e-05, + "loss": 2.4509, + "step": 3765 + }, + { + "epoch": 0.2834829409661454, + "grad_norm": 4.593726634979248, + "learning_rate": 8.414818245644283e-05, + "loss": 1.7849, + "step": 3766 + }, + { + "epoch": 0.2835582152468055, + "grad_norm": 5.483936786651611, + "learning_rate": 8.41392765762711e-05, + "loss": 2.188, + "step": 3767 + }, + { + "epoch": 0.2836334895274657, + "grad_norm": 3.8305039405822754, + "learning_rate": 8.413036866661733e-05, + "loss": 1.6908, + "step": 3768 + }, + { + "epoch": 0.2837087638081259, + "grad_norm": 4.9925432205200195, + "learning_rate": 8.412145872801107e-05, + "loss": 2.1793, + "step": 3769 + }, + { + "epoch": 0.283784038088786, + "grad_norm": 4.195642471313477, + "learning_rate": 8.4112546760982e-05, + "loss": 1.9961, + "step": 3770 + }, + { + "epoch": 0.28385931236944617, + "grad_norm": 4.224948406219482, + "learning_rate": 8.410363276605992e-05, + "loss": 2.0325, + "step": 3771 + }, + { + "epoch": 0.28393458665010635, + "grad_norm": 3.92268443107605, + "learning_rate": 8.409471674377471e-05, + "loss": 2.0449, + "step": 3772 + }, + { + "epoch": 0.28400986093076647, + "grad_norm": 10.248950004577637, + "learning_rate": 8.408579869465641e-05, + "loss": 2.6393, + "step": 3773 + }, + { + "epoch": 0.28408513521142664, + "grad_norm": 5.0474467277526855, + "learning_rate": 8.40768786192352e-05, + "loss": 2.6215, + "step": 3774 + }, + { + "epoch": 0.2841604094920868, + "grad_norm": 6.735265254974365, + "learning_rate": 8.406795651804132e-05, + "loss": 1.957, + "step": 3775 + }, + { + "epoch": 0.28423568377274694, + "grad_norm": 6.065521240234375, + "learning_rate": 8.40590323916052e-05, + "loss": 1.916, + "step": 3776 + }, + { + "epoch": 0.2843109580534071, + "grad_norm": 4.69582462310791, + "learning_rate": 8.40501062404573e-05, + "loss": 2.258, + "step": 3777 + }, + { + "epoch": 0.28438623233406723, + "grad_norm": 3.578555107116699, + "learning_rate": 8.40411780651283e-05, + "loss": 2.1657, + "step": 3778 + }, + { + "epoch": 0.2844615066147274, + "grad_norm": 4.259191989898682, + "learning_rate": 8.403224786614894e-05, + "loss": 2.0313, + "step": 3779 + }, + { + "epoch": 0.2845367808953876, + "grad_norm": 3.659712076187134, + "learning_rate": 8.402331564405009e-05, + "loss": 2.2301, + "step": 3780 + }, + { + "epoch": 0.2846120551760477, + "grad_norm": 4.338321685791016, + "learning_rate": 8.401438139936275e-05, + "loss": 1.9096, + "step": 3781 + }, + { + "epoch": 0.2846873294567079, + "grad_norm": 5.078385353088379, + "learning_rate": 8.400544513261803e-05, + "loss": 1.8121, + "step": 3782 + }, + { + "epoch": 0.28476260373736806, + "grad_norm": 7.463127613067627, + "learning_rate": 8.399650684434719e-05, + "loss": 2.3373, + "step": 3783 + }, + { + "epoch": 0.2848378780180282, + "grad_norm": 4.69392728805542, + "learning_rate": 8.398756653508154e-05, + "loss": 2.2265, + "step": 3784 + }, + { + "epoch": 0.28491315229868835, + "grad_norm": 6.1369733810424805, + "learning_rate": 8.397862420535258e-05, + "loss": 2.0806, + "step": 3785 + }, + { + "epoch": 0.2849884265793485, + "grad_norm": 5.64315938949585, + "learning_rate": 8.396967985569192e-05, + "loss": 2.4179, + "step": 3786 + }, + { + "epoch": 0.28506370086000865, + "grad_norm": 4.775173664093018, + "learning_rate": 8.396073348663126e-05, + "loss": 1.7027, + "step": 3787 + }, + { + "epoch": 0.2851389751406688, + "grad_norm": 3.9329686164855957, + "learning_rate": 8.395178509870244e-05, + "loss": 2.1827, + "step": 3788 + }, + { + "epoch": 0.28521424942132895, + "grad_norm": 3.681978940963745, + "learning_rate": 8.39428346924374e-05, + "loss": 1.6844, + "step": 3789 + }, + { + "epoch": 0.2852895237019891, + "grad_norm": 4.753894329071045, + "learning_rate": 8.393388226836825e-05, + "loss": 1.8726, + "step": 3790 + }, + { + "epoch": 0.2853647979826493, + "grad_norm": 4.579850196838379, + "learning_rate": 8.392492782702717e-05, + "loss": 2.0369, + "step": 3791 + }, + { + "epoch": 0.2854400722633094, + "grad_norm": 6.411602020263672, + "learning_rate": 8.391597136894645e-05, + "loss": 2.2986, + "step": 3792 + }, + { + "epoch": 0.2855153465439696, + "grad_norm": 6.695382595062256, + "learning_rate": 8.390701289465856e-05, + "loss": 2.0889, + "step": 3793 + }, + { + "epoch": 0.28559062082462977, + "grad_norm": 7.453019142150879, + "learning_rate": 8.389805240469604e-05, + "loss": 2.0118, + "step": 3794 + }, + { + "epoch": 0.2856658951052899, + "grad_norm": 6.5703206062316895, + "learning_rate": 8.388908989959156e-05, + "loss": 1.8328, + "step": 3795 + }, + { + "epoch": 0.28574116938595007, + "grad_norm": 4.400692939758301, + "learning_rate": 8.388012537987795e-05, + "loss": 1.791, + "step": 3796 + }, + { + "epoch": 0.2858164436666102, + "grad_norm": 4.215025901794434, + "learning_rate": 8.387115884608808e-05, + "loss": 2.4272, + "step": 3797 + }, + { + "epoch": 0.28589171794727036, + "grad_norm": 4.3364338874816895, + "learning_rate": 8.386219029875502e-05, + "loss": 2.0135, + "step": 3798 + }, + { + "epoch": 0.28596699222793054, + "grad_norm": 4.847531318664551, + "learning_rate": 8.385321973841189e-05, + "loss": 1.9906, + "step": 3799 + }, + { + "epoch": 0.28604226650859066, + "grad_norm": 4.136904716491699, + "learning_rate": 8.384424716559198e-05, + "loss": 2.1823, + "step": 3800 + }, + { + "epoch": 0.28611754078925083, + "grad_norm": 4.262138366699219, + "learning_rate": 8.38352725808287e-05, + "loss": 2.1362, + "step": 3801 + }, + { + "epoch": 0.286192815069911, + "grad_norm": 3.498966932296753, + "learning_rate": 8.382629598465555e-05, + "loss": 1.8396, + "step": 3802 + }, + { + "epoch": 0.28626808935057113, + "grad_norm": 4.954427242279053, + "learning_rate": 8.381731737760614e-05, + "loss": 1.9453, + "step": 3803 + }, + { + "epoch": 0.2863433636312313, + "grad_norm": 4.326083660125732, + "learning_rate": 8.380833676021427e-05, + "loss": 1.7592, + "step": 3804 + }, + { + "epoch": 0.2864186379118914, + "grad_norm": 5.140336036682129, + "learning_rate": 8.379935413301378e-05, + "loss": 2.3247, + "step": 3805 + }, + { + "epoch": 0.2864939121925516, + "grad_norm": 3.8094191551208496, + "learning_rate": 8.379036949653867e-05, + "loss": 2.1055, + "step": 3806 + }, + { + "epoch": 0.2865691864732118, + "grad_norm": 4.065224647521973, + "learning_rate": 8.378138285132305e-05, + "loss": 2.0397, + "step": 3807 + }, + { + "epoch": 0.2866444607538719, + "grad_norm": 5.094583034515381, + "learning_rate": 8.377239419790112e-05, + "loss": 2.0621, + "step": 3808 + }, + { + "epoch": 0.2867197350345321, + "grad_norm": 4.215206146240234, + "learning_rate": 8.37634035368073e-05, + "loss": 2.0845, + "step": 3809 + }, + { + "epoch": 0.28679500931519225, + "grad_norm": 5.886216163635254, + "learning_rate": 8.3754410868576e-05, + "loss": 2.1409, + "step": 3810 + }, + { + "epoch": 0.28687028359585237, + "grad_norm": 4.233997344970703, + "learning_rate": 8.374541619374182e-05, + "loss": 2.0214, + "step": 3811 + }, + { + "epoch": 0.28694555787651255, + "grad_norm": 4.8626909255981445, + "learning_rate": 8.373641951283948e-05, + "loss": 1.8788, + "step": 3812 + }, + { + "epoch": 0.2870208321571727, + "grad_norm": 3.4109678268432617, + "learning_rate": 8.37274208264038e-05, + "loss": 1.9001, + "step": 3813 + }, + { + "epoch": 0.28709610643783284, + "grad_norm": 4.922783374786377, + "learning_rate": 8.371842013496973e-05, + "loss": 1.6678, + "step": 3814 + }, + { + "epoch": 0.287171380718493, + "grad_norm": 5.090536117553711, + "learning_rate": 8.370941743907233e-05, + "loss": 2.2334, + "step": 3815 + }, + { + "epoch": 0.28724665499915314, + "grad_norm": 4.591648578643799, + "learning_rate": 8.370041273924678e-05, + "loss": 2.0385, + "step": 3816 + }, + { + "epoch": 0.2873219292798133, + "grad_norm": 4.662242412567139, + "learning_rate": 8.36914060360284e-05, + "loss": 1.9126, + "step": 3817 + }, + { + "epoch": 0.2873972035604735, + "grad_norm": 4.626654148101807, + "learning_rate": 8.36823973299526e-05, + "loss": 2.0897, + "step": 3818 + }, + { + "epoch": 0.2874724778411336, + "grad_norm": 3.912527322769165, + "learning_rate": 8.367338662155493e-05, + "loss": 2.136, + "step": 3819 + }, + { + "epoch": 0.2875477521217938, + "grad_norm": 4.67393159866333, + "learning_rate": 8.366437391137103e-05, + "loss": 2.1881, + "step": 3820 + }, + { + "epoch": 0.28762302640245396, + "grad_norm": 4.673575401306152, + "learning_rate": 8.36553591999367e-05, + "loss": 2.0971, + "step": 3821 + }, + { + "epoch": 0.2876983006831141, + "grad_norm": 4.837671279907227, + "learning_rate": 8.364634248778784e-05, + "loss": 2.1025, + "step": 3822 + }, + { + "epoch": 0.28777357496377426, + "grad_norm": 4.0610785484313965, + "learning_rate": 8.363732377546045e-05, + "loss": 2.1179, + "step": 3823 + }, + { + "epoch": 0.28784884924443443, + "grad_norm": 4.773620128631592, + "learning_rate": 8.36283030634907e-05, + "loss": 2.0521, + "step": 3824 + }, + { + "epoch": 0.28792412352509456, + "grad_norm": 5.910046100616455, + "learning_rate": 8.36192803524148e-05, + "loss": 2.0464, + "step": 3825 + }, + { + "epoch": 0.28799939780575473, + "grad_norm": 4.970592975616455, + "learning_rate": 8.361025564276917e-05, + "loss": 2.2635, + "step": 3826 + }, + { + "epoch": 0.28807467208641485, + "grad_norm": 5.785548686981201, + "learning_rate": 8.360122893509026e-05, + "loss": 2.0841, + "step": 3827 + }, + { + "epoch": 0.288149946367075, + "grad_norm": 4.501217842102051, + "learning_rate": 8.35922002299147e-05, + "loss": 2.3656, + "step": 3828 + }, + { + "epoch": 0.2882252206477352, + "grad_norm": 4.644845485687256, + "learning_rate": 8.358316952777925e-05, + "loss": 2.2716, + "step": 3829 + }, + { + "epoch": 0.2883004949283953, + "grad_norm": 4.391740322113037, + "learning_rate": 8.357413682922071e-05, + "loss": 1.7138, + "step": 3830 + }, + { + "epoch": 0.2883757692090555, + "grad_norm": 5.401401042938232, + "learning_rate": 8.356510213477606e-05, + "loss": 2.0579, + "step": 3831 + }, + { + "epoch": 0.2884510434897157, + "grad_norm": 5.10807466506958, + "learning_rate": 8.355606544498243e-05, + "loss": 2.3897, + "step": 3832 + }, + { + "epoch": 0.2885263177703758, + "grad_norm": 5.3836493492126465, + "learning_rate": 8.354702676037697e-05, + "loss": 2.1215, + "step": 3833 + }, + { + "epoch": 0.28860159205103597, + "grad_norm": 3.7758097648620605, + "learning_rate": 8.353798608149704e-05, + "loss": 1.7526, + "step": 3834 + }, + { + "epoch": 0.2886768663316961, + "grad_norm": 6.716507434844971, + "learning_rate": 8.352894340888005e-05, + "loss": 2.0362, + "step": 3835 + }, + { + "epoch": 0.28875214061235627, + "grad_norm": 5.690709590911865, + "learning_rate": 8.351989874306358e-05, + "loss": 1.9573, + "step": 3836 + }, + { + "epoch": 0.28882741489301644, + "grad_norm": 4.09605598449707, + "learning_rate": 8.351085208458534e-05, + "loss": 2.1092, + "step": 3837 + }, + { + "epoch": 0.28890268917367656, + "grad_norm": 4.334811210632324, + "learning_rate": 8.350180343398307e-05, + "loss": 2.2697, + "step": 3838 + }, + { + "epoch": 0.28897796345433674, + "grad_norm": 5.101006507873535, + "learning_rate": 8.349275279179471e-05, + "loss": 2.0569, + "step": 3839 + }, + { + "epoch": 0.2890532377349969, + "grad_norm": 3.7114970684051514, + "learning_rate": 8.348370015855831e-05, + "loss": 2.1174, + "step": 3840 + }, + { + "epoch": 0.28912851201565704, + "grad_norm": 4.862429618835449, + "learning_rate": 8.347464553481201e-05, + "loss": 2.0633, + "step": 3841 + }, + { + "epoch": 0.2892037862963172, + "grad_norm": 4.648384094238281, + "learning_rate": 8.346558892109409e-05, + "loss": 2.0382, + "step": 3842 + }, + { + "epoch": 0.2892790605769774, + "grad_norm": 3.9312198162078857, + "learning_rate": 8.345653031794292e-05, + "loss": 1.9585, + "step": 3843 + }, + { + "epoch": 0.2893543348576375, + "grad_norm": 4.682003498077393, + "learning_rate": 8.344746972589702e-05, + "loss": 1.9414, + "step": 3844 + }, + { + "epoch": 0.2894296091382977, + "grad_norm": 4.717390060424805, + "learning_rate": 8.343840714549502e-05, + "loss": 1.9941, + "step": 3845 + }, + { + "epoch": 0.2895048834189578, + "grad_norm": 8.018993377685547, + "learning_rate": 8.342934257727566e-05, + "loss": 2.0114, + "step": 3846 + }, + { + "epoch": 0.289580157699618, + "grad_norm": 4.741693019866943, + "learning_rate": 8.342027602177779e-05, + "loss": 1.5556, + "step": 3847 + }, + { + "epoch": 0.28965543198027816, + "grad_norm": 5.723662376403809, + "learning_rate": 8.341120747954043e-05, + "loss": 2.2284, + "step": 3848 + }, + { + "epoch": 0.2897307062609383, + "grad_norm": 3.611459255218506, + "learning_rate": 8.340213695110264e-05, + "loss": 2.0701, + "step": 3849 + }, + { + "epoch": 0.28980598054159845, + "grad_norm": 4.317535400390625, + "learning_rate": 8.339306443700366e-05, + "loss": 2.0111, + "step": 3850 + }, + { + "epoch": 0.2898812548222586, + "grad_norm": 4.843621253967285, + "learning_rate": 8.33839899377828e-05, + "loss": 2.1514, + "step": 3851 + }, + { + "epoch": 0.28995652910291875, + "grad_norm": 5.786384105682373, + "learning_rate": 8.337491345397953e-05, + "loss": 2.1834, + "step": 3852 + }, + { + "epoch": 0.2900318033835789, + "grad_norm": 5.805883884429932, + "learning_rate": 8.336583498613342e-05, + "loss": 2.3418, + "step": 3853 + }, + { + "epoch": 0.29010707766423904, + "grad_norm": 4.941905975341797, + "learning_rate": 8.335675453478416e-05, + "loss": 2.0093, + "step": 3854 + }, + { + "epoch": 0.2901823519448992, + "grad_norm": 4.880833148956299, + "learning_rate": 8.334767210047155e-05, + "loss": 2.1342, + "step": 3855 + }, + { + "epoch": 0.2902576262255594, + "grad_norm": 5.400149822235107, + "learning_rate": 8.333858768373552e-05, + "loss": 2.247, + "step": 3856 + }, + { + "epoch": 0.2903329005062195, + "grad_norm": 4.728665828704834, + "learning_rate": 8.332950128511612e-05, + "loss": 1.9277, + "step": 3857 + }, + { + "epoch": 0.2904081747868797, + "grad_norm": 4.006535530090332, + "learning_rate": 8.332041290515349e-05, + "loss": 2.048, + "step": 3858 + }, + { + "epoch": 0.29048344906753987, + "grad_norm": 5.73699951171875, + "learning_rate": 8.331132254438793e-05, + "loss": 2.4664, + "step": 3859 + }, + { + "epoch": 0.2905587233482, + "grad_norm": 4.849059104919434, + "learning_rate": 8.330223020335982e-05, + "loss": 1.9854, + "step": 3860 + }, + { + "epoch": 0.29063399762886016, + "grad_norm": 4.367104530334473, + "learning_rate": 8.329313588260968e-05, + "loss": 2.075, + "step": 3861 + }, + { + "epoch": 0.29070927190952034, + "grad_norm": 6.0469560623168945, + "learning_rate": 8.328403958267814e-05, + "loss": 1.9521, + "step": 3862 + }, + { + "epoch": 0.29078454619018046, + "grad_norm": 4.310194492340088, + "learning_rate": 8.327494130410595e-05, + "loss": 1.9982, + "step": 3863 + }, + { + "epoch": 0.29085982047084064, + "grad_norm": 5.066767692565918, + "learning_rate": 8.326584104743398e-05, + "loss": 1.9326, + "step": 3864 + }, + { + "epoch": 0.29093509475150076, + "grad_norm": 6.129978656768799, + "learning_rate": 8.325673881320319e-05, + "loss": 1.8982, + "step": 3865 + }, + { + "epoch": 0.29101036903216093, + "grad_norm": 4.554304599761963, + "learning_rate": 8.324763460195472e-05, + "loss": 1.9654, + "step": 3866 + }, + { + "epoch": 0.2910856433128211, + "grad_norm": 4.857050895690918, + "learning_rate": 8.323852841422976e-05, + "loss": 2.1171, + "step": 3867 + }, + { + "epoch": 0.2911609175934812, + "grad_norm": 5.321245193481445, + "learning_rate": 8.322942025056968e-05, + "loss": 2.0208, + "step": 3868 + }, + { + "epoch": 0.2912361918741414, + "grad_norm": 5.192969799041748, + "learning_rate": 8.32203101115159e-05, + "loss": 2.2445, + "step": 3869 + }, + { + "epoch": 0.2913114661548016, + "grad_norm": 7.29888916015625, + "learning_rate": 8.321119799760999e-05, + "loss": 2.1943, + "step": 3870 + }, + { + "epoch": 0.2913867404354617, + "grad_norm": 5.519054412841797, + "learning_rate": 8.320208390939368e-05, + "loss": 2.2425, + "step": 3871 + }, + { + "epoch": 0.2914620147161219, + "grad_norm": 5.460089683532715, + "learning_rate": 8.319296784740873e-05, + "loss": 2.0266, + "step": 3872 + }, + { + "epoch": 0.29153728899678205, + "grad_norm": 6.9658203125, + "learning_rate": 8.318384981219707e-05, + "loss": 1.7287, + "step": 3873 + }, + { + "epoch": 0.29161256327744217, + "grad_norm": 4.43731689453125, + "learning_rate": 8.317472980430079e-05, + "loss": 1.8946, + "step": 3874 + }, + { + "epoch": 0.29168783755810235, + "grad_norm": 3.5282974243164062, + "learning_rate": 8.316560782426199e-05, + "loss": 1.8885, + "step": 3875 + }, + { + "epoch": 0.29176311183876247, + "grad_norm": 3.992851734161377, + "learning_rate": 8.315648387262296e-05, + "loss": 1.8385, + "step": 3876 + }, + { + "epoch": 0.29183838611942264, + "grad_norm": 4.900772571563721, + "learning_rate": 8.314735794992612e-05, + "loss": 2.0619, + "step": 3877 + }, + { + "epoch": 0.2919136604000828, + "grad_norm": 4.7528977394104, + "learning_rate": 8.313823005671396e-05, + "loss": 2.067, + "step": 3878 + }, + { + "epoch": 0.29198893468074294, + "grad_norm": 4.103478908538818, + "learning_rate": 8.312910019352909e-05, + "loss": 2.0187, + "step": 3879 + }, + { + "epoch": 0.2920642089614031, + "grad_norm": 4.086658954620361, + "learning_rate": 8.311996836091427e-05, + "loss": 2.0212, + "step": 3880 + }, + { + "epoch": 0.2921394832420633, + "grad_norm": 3.4795148372650146, + "learning_rate": 8.311083455941238e-05, + "loss": 1.9303, + "step": 3881 + }, + { + "epoch": 0.2922147575227234, + "grad_norm": 3.36259388923645, + "learning_rate": 8.310169878956638e-05, + "loss": 1.8607, + "step": 3882 + }, + { + "epoch": 0.2922900318033836, + "grad_norm": 3.9996771812438965, + "learning_rate": 8.309256105191934e-05, + "loss": 2.0218, + "step": 3883 + }, + { + "epoch": 0.2923653060840437, + "grad_norm": 4.0418009757995605, + "learning_rate": 8.308342134701452e-05, + "loss": 2.3364, + "step": 3884 + }, + { + "epoch": 0.2924405803647039, + "grad_norm": 3.996657609939575, + "learning_rate": 8.307427967539523e-05, + "loss": 2.1476, + "step": 3885 + }, + { + "epoch": 0.29251585464536406, + "grad_norm": 3.3307595252990723, + "learning_rate": 8.30651360376049e-05, + "loss": 2.1576, + "step": 3886 + }, + { + "epoch": 0.2925911289260242, + "grad_norm": 3.8001067638397217, + "learning_rate": 8.305599043418712e-05, + "loss": 2.344, + "step": 3887 + }, + { + "epoch": 0.29266640320668436, + "grad_norm": 3.9414944648742676, + "learning_rate": 8.304684286568556e-05, + "loss": 2.0307, + "step": 3888 + }, + { + "epoch": 0.29274167748734453, + "grad_norm": 5.2964348793029785, + "learning_rate": 8.303769333264402e-05, + "loss": 1.8305, + "step": 3889 + }, + { + "epoch": 0.29281695176800465, + "grad_norm": 4.15656852722168, + "learning_rate": 8.302854183560639e-05, + "loss": 2.107, + "step": 3890 + }, + { + "epoch": 0.2928922260486648, + "grad_norm": 5.4341325759887695, + "learning_rate": 8.30193883751167e-05, + "loss": 1.9815, + "step": 3891 + }, + { + "epoch": 0.292967500329325, + "grad_norm": 3.7687902450561523, + "learning_rate": 8.301023295171916e-05, + "loss": 2.0175, + "step": 3892 + }, + { + "epoch": 0.2930427746099851, + "grad_norm": 4.858816623687744, + "learning_rate": 8.300107556595796e-05, + "loss": 2.3404, + "step": 3893 + }, + { + "epoch": 0.2931180488906453, + "grad_norm": 5.89325475692749, + "learning_rate": 8.299191621837751e-05, + "loss": 2.329, + "step": 3894 + }, + { + "epoch": 0.2931933231713054, + "grad_norm": 4.135165691375732, + "learning_rate": 8.298275490952232e-05, + "loss": 1.7683, + "step": 3895 + }, + { + "epoch": 0.2932685974519656, + "grad_norm": 3.811086654663086, + "learning_rate": 8.297359163993697e-05, + "loss": 1.8282, + "step": 3896 + }, + { + "epoch": 0.29334387173262577, + "grad_norm": 5.477607727050781, + "learning_rate": 8.296442641016622e-05, + "loss": 2.4105, + "step": 3897 + }, + { + "epoch": 0.2934191460132859, + "grad_norm": 5.256962776184082, + "learning_rate": 8.29552592207549e-05, + "loss": 1.8768, + "step": 3898 + }, + { + "epoch": 0.29349442029394607, + "grad_norm": 5.292032241821289, + "learning_rate": 8.294609007224797e-05, + "loss": 2.0916, + "step": 3899 + }, + { + "epoch": 0.29356969457460624, + "grad_norm": 5.821744918823242, + "learning_rate": 8.293691896519053e-05, + "loss": 1.9116, + "step": 3900 + }, + { + "epoch": 0.29364496885526636, + "grad_norm": 5.460099220275879, + "learning_rate": 8.292774590012777e-05, + "loss": 2.1305, + "step": 3901 + }, + { + "epoch": 0.29372024313592654, + "grad_norm": 4.575508117675781, + "learning_rate": 8.291857087760499e-05, + "loss": 2.1291, + "step": 3902 + }, + { + "epoch": 0.29379551741658666, + "grad_norm": 5.542977333068848, + "learning_rate": 8.290939389816763e-05, + "loss": 2.0904, + "step": 3903 + }, + { + "epoch": 0.29387079169724684, + "grad_norm": 5.345617771148682, + "learning_rate": 8.290021496236122e-05, + "loss": 1.8152, + "step": 3904 + }, + { + "epoch": 0.293946065977907, + "grad_norm": 4.5423784255981445, + "learning_rate": 8.289103407073143e-05, + "loss": 1.9969, + "step": 3905 + }, + { + "epoch": 0.29402134025856713, + "grad_norm": 4.751160621643066, + "learning_rate": 8.288185122382405e-05, + "loss": 1.6903, + "step": 3906 + }, + { + "epoch": 0.2940966145392273, + "grad_norm": 5.917749404907227, + "learning_rate": 8.287266642218496e-05, + "loss": 2.1263, + "step": 3907 + }, + { + "epoch": 0.2941718888198875, + "grad_norm": 4.357872486114502, + "learning_rate": 8.286347966636016e-05, + "loss": 1.9129, + "step": 3908 + }, + { + "epoch": 0.2942471631005476, + "grad_norm": 6.8260650634765625, + "learning_rate": 8.285429095689581e-05, + "loss": 2.4091, + "step": 3909 + }, + { + "epoch": 0.2943224373812078, + "grad_norm": 9.451496124267578, + "learning_rate": 8.284510029433813e-05, + "loss": 2.3008, + "step": 3910 + }, + { + "epoch": 0.29439771166186796, + "grad_norm": 4.148449897766113, + "learning_rate": 8.283590767923347e-05, + "loss": 1.9655, + "step": 3911 + }, + { + "epoch": 0.2944729859425281, + "grad_norm": 4.271643161773682, + "learning_rate": 8.282671311212833e-05, + "loss": 1.9186, + "step": 3912 + }, + { + "epoch": 0.29454826022318825, + "grad_norm": 4.254988193511963, + "learning_rate": 8.281751659356926e-05, + "loss": 2.1378, + "step": 3913 + }, + { + "epoch": 0.29462353450384837, + "grad_norm": 4.778247356414795, + "learning_rate": 8.280831812410301e-05, + "loss": 2.0003, + "step": 3914 + }, + { + "epoch": 0.29469880878450855, + "grad_norm": 5.063285827636719, + "learning_rate": 8.279911770427638e-05, + "loss": 2.2523, + "step": 3915 + }, + { + "epoch": 0.2947740830651687, + "grad_norm": 3.722421884536743, + "learning_rate": 8.278991533463633e-05, + "loss": 2.4155, + "step": 3916 + }, + { + "epoch": 0.29484935734582884, + "grad_norm": 5.895635604858398, + "learning_rate": 8.278071101572988e-05, + "loss": 2.0962, + "step": 3917 + }, + { + "epoch": 0.294924631626489, + "grad_norm": 4.910404682159424, + "learning_rate": 8.277150474810422e-05, + "loss": 2.0297, + "step": 3918 + }, + { + "epoch": 0.2949999059071492, + "grad_norm": 4.950763702392578, + "learning_rate": 8.276229653230665e-05, + "loss": 2.1183, + "step": 3919 + }, + { + "epoch": 0.2950751801878093, + "grad_norm": 4.1348419189453125, + "learning_rate": 8.275308636888455e-05, + "loss": 2.3009, + "step": 3920 + }, + { + "epoch": 0.2951504544684695, + "grad_norm": 3.937880277633667, + "learning_rate": 8.274387425838543e-05, + "loss": 2.1799, + "step": 3921 + }, + { + "epoch": 0.29522572874912967, + "grad_norm": 4.720341682434082, + "learning_rate": 8.273466020135695e-05, + "loss": 2.0935, + "step": 3922 + }, + { + "epoch": 0.2953010030297898, + "grad_norm": 4.910231590270996, + "learning_rate": 8.272544419834685e-05, + "loss": 2.0942, + "step": 3923 + }, + { + "epoch": 0.29537627731044996, + "grad_norm": 4.611443519592285, + "learning_rate": 8.271622624990298e-05, + "loss": 1.8372, + "step": 3924 + }, + { + "epoch": 0.2954515515911101, + "grad_norm": 5.8901143074035645, + "learning_rate": 8.270700635657334e-05, + "loss": 2.3443, + "step": 3925 + }, + { + "epoch": 0.29552682587177026, + "grad_norm": 4.361662864685059, + "learning_rate": 8.269778451890603e-05, + "loss": 1.7892, + "step": 3926 + }, + { + "epoch": 0.29560210015243044, + "grad_norm": 4.859438896179199, + "learning_rate": 8.268856073744924e-05, + "loss": 2.0249, + "step": 3927 + }, + { + "epoch": 0.29567737443309056, + "grad_norm": 6.353734016418457, + "learning_rate": 8.267933501275132e-05, + "loss": 2.0772, + "step": 3928 + }, + { + "epoch": 0.29575264871375073, + "grad_norm": 4.042829990386963, + "learning_rate": 8.267010734536067e-05, + "loss": 2.0966, + "step": 3929 + }, + { + "epoch": 0.2958279229944109, + "grad_norm": 5.235006809234619, + "learning_rate": 8.266087773582591e-05, + "loss": 2.1075, + "step": 3930 + }, + { + "epoch": 0.29590319727507103, + "grad_norm": 6.5151448249816895, + "learning_rate": 8.265164618469567e-05, + "loss": 2.1292, + "step": 3931 + }, + { + "epoch": 0.2959784715557312, + "grad_norm": 5.033535480499268, + "learning_rate": 8.264241269251876e-05, + "loss": 1.837, + "step": 3932 + }, + { + "epoch": 0.2960537458363913, + "grad_norm": 5.000491619110107, + "learning_rate": 8.263317725984406e-05, + "loss": 1.8514, + "step": 3933 + }, + { + "epoch": 0.2961290201170515, + "grad_norm": 5.6992669105529785, + "learning_rate": 8.262393988722061e-05, + "loss": 2.3944, + "step": 3934 + }, + { + "epoch": 0.2962042943977117, + "grad_norm": 6.751134395599365, + "learning_rate": 8.261470057519755e-05, + "loss": 2.3705, + "step": 3935 + }, + { + "epoch": 0.2962795686783718, + "grad_norm": 4.841224670410156, + "learning_rate": 8.260545932432413e-05, + "loss": 1.7609, + "step": 3936 + }, + { + "epoch": 0.29635484295903197, + "grad_norm": 4.674503803253174, + "learning_rate": 8.25962161351497e-05, + "loss": 2.2483, + "step": 3937 + }, + { + "epoch": 0.29643011723969215, + "grad_norm": 6.809699535369873, + "learning_rate": 8.258697100822376e-05, + "loss": 2.394, + "step": 3938 + }, + { + "epoch": 0.29650539152035227, + "grad_norm": 4.707481384277344, + "learning_rate": 8.257772394409589e-05, + "loss": 2.2682, + "step": 3939 + }, + { + "epoch": 0.29658066580101244, + "grad_norm": 4.450935363769531, + "learning_rate": 8.256847494331581e-05, + "loss": 2.2971, + "step": 3940 + }, + { + "epoch": 0.2966559400816726, + "grad_norm": 6.442799091339111, + "learning_rate": 8.255922400643335e-05, + "loss": 2.1986, + "step": 3941 + }, + { + "epoch": 0.29673121436233274, + "grad_norm": 4.342957019805908, + "learning_rate": 8.254997113399842e-05, + "loss": 1.8359, + "step": 3942 + }, + { + "epoch": 0.2968064886429929, + "grad_norm": 4.233155250549316, + "learning_rate": 8.254071632656113e-05, + "loss": 2.1798, + "step": 3943 + }, + { + "epoch": 0.29688176292365304, + "grad_norm": 4.547600746154785, + "learning_rate": 8.253145958467162e-05, + "loss": 1.9461, + "step": 3944 + }, + { + "epoch": 0.2969570372043132, + "grad_norm": 5.658600330352783, + "learning_rate": 8.252220090888017e-05, + "loss": 2.2108, + "step": 3945 + }, + { + "epoch": 0.2970323114849734, + "grad_norm": 4.1030192375183105, + "learning_rate": 8.251294029973719e-05, + "loss": 1.8221, + "step": 3946 + }, + { + "epoch": 0.2971075857656335, + "grad_norm": 5.113779067993164, + "learning_rate": 8.25036777577932e-05, + "loss": 2.3138, + "step": 3947 + }, + { + "epoch": 0.2971828600462937, + "grad_norm": 5.227575778961182, + "learning_rate": 8.249441328359883e-05, + "loss": 1.8268, + "step": 3948 + }, + { + "epoch": 0.29725813432695386, + "grad_norm": 4.392734050750732, + "learning_rate": 8.248514687770483e-05, + "loss": 2.0917, + "step": 3949 + }, + { + "epoch": 0.297333408607614, + "grad_norm": 4.64680814743042, + "learning_rate": 8.247587854066207e-05, + "loss": 2.0378, + "step": 3950 + }, + { + "epoch": 0.29740868288827416, + "grad_norm": 6.5726423263549805, + "learning_rate": 8.24666082730215e-05, + "loss": 1.8285, + "step": 3951 + }, + { + "epoch": 0.29748395716893433, + "grad_norm": 5.788646697998047, + "learning_rate": 8.245733607533423e-05, + "loss": 2.1962, + "step": 3952 + }, + { + "epoch": 0.29755923144959445, + "grad_norm": 5.60465669631958, + "learning_rate": 8.244806194815146e-05, + "loss": 1.8553, + "step": 3953 + }, + { + "epoch": 0.29763450573025463, + "grad_norm": 3.659080982208252, + "learning_rate": 8.243878589202451e-05, + "loss": 2.0637, + "step": 3954 + }, + { + "epoch": 0.29770978001091475, + "grad_norm": 4.029128551483154, + "learning_rate": 8.242950790750484e-05, + "loss": 2.5333, + "step": 3955 + }, + { + "epoch": 0.2977850542915749, + "grad_norm": 6.846841812133789, + "learning_rate": 8.242022799514395e-05, + "loss": 1.9938, + "step": 3956 + }, + { + "epoch": 0.2978603285722351, + "grad_norm": 4.650421619415283, + "learning_rate": 8.241094615549353e-05, + "loss": 1.8515, + "step": 3957 + }, + { + "epoch": 0.2979356028528952, + "grad_norm": 4.748678684234619, + "learning_rate": 8.240166238910536e-05, + "loss": 2.25, + "step": 3958 + }, + { + "epoch": 0.2980108771335554, + "grad_norm": 3.416999340057373, + "learning_rate": 8.239237669653134e-05, + "loss": 2.2199, + "step": 3959 + }, + { + "epoch": 0.29808615141421557, + "grad_norm": 4.143542766571045, + "learning_rate": 8.238308907832347e-05, + "loss": 2.2873, + "step": 3960 + }, + { + "epoch": 0.2981614256948757, + "grad_norm": 6.439436435699463, + "learning_rate": 8.237379953503386e-05, + "loss": 2.4096, + "step": 3961 + }, + { + "epoch": 0.29823669997553587, + "grad_norm": 4.001052379608154, + "learning_rate": 8.236450806721476e-05, + "loss": 1.9737, + "step": 3962 + }, + { + "epoch": 0.298311974256196, + "grad_norm": 7.826466083526611, + "learning_rate": 8.235521467541852e-05, + "loss": 3.3204, + "step": 3963 + }, + { + "epoch": 0.29838724853685616, + "grad_norm": 4.821740627288818, + "learning_rate": 8.234591936019762e-05, + "loss": 2.0892, + "step": 3964 + }, + { + "epoch": 0.29846252281751634, + "grad_norm": 4.715775012969971, + "learning_rate": 8.233662212210461e-05, + "loss": 1.5054, + "step": 3965 + }, + { + "epoch": 0.29853779709817646, + "grad_norm": 4.925595283508301, + "learning_rate": 8.23273229616922e-05, + "loss": 2.6118, + "step": 3966 + }, + { + "epoch": 0.29861307137883664, + "grad_norm": 5.416860580444336, + "learning_rate": 8.231802187951321e-05, + "loss": 2.0367, + "step": 3967 + }, + { + "epoch": 0.2986883456594968, + "grad_norm": 4.345701694488525, + "learning_rate": 8.230871887612054e-05, + "loss": 2.4256, + "step": 3968 + }, + { + "epoch": 0.29876361994015693, + "grad_norm": 3.9337241649627686, + "learning_rate": 8.229941395206724e-05, + "loss": 1.9631, + "step": 3969 + }, + { + "epoch": 0.2988388942208171, + "grad_norm": 3.07364821434021, + "learning_rate": 8.229010710790646e-05, + "loss": 1.9359, + "step": 3970 + }, + { + "epoch": 0.2989141685014773, + "grad_norm": 5.827083110809326, + "learning_rate": 8.228079834419145e-05, + "loss": 1.8755, + "step": 3971 + }, + { + "epoch": 0.2989894427821374, + "grad_norm": 3.811436891555786, + "learning_rate": 8.227148766147561e-05, + "loss": 1.9425, + "step": 3972 + }, + { + "epoch": 0.2990647170627976, + "grad_norm": 3.9005682468414307, + "learning_rate": 8.226217506031243e-05, + "loss": 1.7227, + "step": 3973 + }, + { + "epoch": 0.2991399913434577, + "grad_norm": 4.720652103424072, + "learning_rate": 8.225286054125552e-05, + "loss": 2.0385, + "step": 3974 + }, + { + "epoch": 0.2992152656241179, + "grad_norm": 5.91552734375, + "learning_rate": 8.224354410485857e-05, + "loss": 1.8691, + "step": 3975 + }, + { + "epoch": 0.29929053990477805, + "grad_norm": 4.188562393188477, + "learning_rate": 8.223422575167546e-05, + "loss": 1.7629, + "step": 3976 + }, + { + "epoch": 0.2993658141854382, + "grad_norm": 3.4777920246124268, + "learning_rate": 8.222490548226011e-05, + "loss": 2.0878, + "step": 3977 + }, + { + "epoch": 0.29944108846609835, + "grad_norm": 5.007256031036377, + "learning_rate": 8.221558329716661e-05, + "loss": 2.2845, + "step": 3978 + }, + { + "epoch": 0.2995163627467585, + "grad_norm": 6.493385314941406, + "learning_rate": 8.220625919694909e-05, + "loss": 1.9871, + "step": 3979 + }, + { + "epoch": 0.29959163702741864, + "grad_norm": 3.5783050060272217, + "learning_rate": 8.21969331821619e-05, + "loss": 1.9078, + "step": 3980 + }, + { + "epoch": 0.2996669113080788, + "grad_norm": 3.807652235031128, + "learning_rate": 8.21876052533594e-05, + "loss": 1.7729, + "step": 3981 + }, + { + "epoch": 0.29974218558873894, + "grad_norm": 4.732152938842773, + "learning_rate": 8.217827541109612e-05, + "loss": 2.0942, + "step": 3982 + }, + { + "epoch": 0.2998174598693991, + "grad_norm": 5.426209926605225, + "learning_rate": 8.21689436559267e-05, + "loss": 2.2869, + "step": 3983 + }, + { + "epoch": 0.2998927341500593, + "grad_norm": 5.603438854217529, + "learning_rate": 8.215960998840591e-05, + "loss": 2.0067, + "step": 3984 + }, + { + "epoch": 0.2999680084307194, + "grad_norm": 8.527303695678711, + "learning_rate": 8.215027440908856e-05, + "loss": 2.1007, + "step": 3985 + }, + { + "epoch": 0.3000432827113796, + "grad_norm": 4.717874050140381, + "learning_rate": 8.214093691852963e-05, + "loss": 2.2188, + "step": 3986 + }, + { + "epoch": 0.30011855699203976, + "grad_norm": 4.590381622314453, + "learning_rate": 8.213159751728424e-05, + "loss": 1.9118, + "step": 3987 + }, + { + "epoch": 0.3001938312726999, + "grad_norm": 3.9194700717926025, + "learning_rate": 8.212225620590757e-05, + "loss": 2.1885, + "step": 3988 + }, + { + "epoch": 0.30026910555336006, + "grad_norm": 6.475456714630127, + "learning_rate": 8.211291298495493e-05, + "loss": 2.0232, + "step": 3989 + }, + { + "epoch": 0.30034437983402024, + "grad_norm": 4.130027770996094, + "learning_rate": 8.210356785498178e-05, + "loss": 2.1068, + "step": 3990 + }, + { + "epoch": 0.30041965411468036, + "grad_norm": 5.896829605102539, + "learning_rate": 8.209422081654362e-05, + "loss": 2.5909, + "step": 3991 + }, + { + "epoch": 0.30049492839534053, + "grad_norm": 4.915403366088867, + "learning_rate": 8.208487187019614e-05, + "loss": 1.8748, + "step": 3992 + }, + { + "epoch": 0.30057020267600065, + "grad_norm": 6.936680316925049, + "learning_rate": 8.207552101649506e-05, + "loss": 2.5195, + "step": 3993 + }, + { + "epoch": 0.30064547695666083, + "grad_norm": 5.8221259117126465, + "learning_rate": 8.20661682559963e-05, + "loss": 2.1315, + "step": 3994 + }, + { + "epoch": 0.300720751237321, + "grad_norm": 6.580040454864502, + "learning_rate": 8.205681358925585e-05, + "loss": 1.6014, + "step": 3995 + }, + { + "epoch": 0.3007960255179811, + "grad_norm": 3.8563807010650635, + "learning_rate": 8.204745701682981e-05, + "loss": 2.0251, + "step": 3996 + }, + { + "epoch": 0.3008712997986413, + "grad_norm": 9.69757080078125, + "learning_rate": 8.20380985392744e-05, + "loss": 2.3376, + "step": 3997 + }, + { + "epoch": 0.3009465740793015, + "grad_norm": 5.109683513641357, + "learning_rate": 8.202873815714596e-05, + "loss": 2.4456, + "step": 3998 + }, + { + "epoch": 0.3010218483599616, + "grad_norm": 4.2439351081848145, + "learning_rate": 8.201937587100095e-05, + "loss": 1.9209, + "step": 3999 + }, + { + "epoch": 0.3010971226406218, + "grad_norm": 5.059563636779785, + "learning_rate": 8.20100116813959e-05, + "loss": 2.0953, + "step": 4000 + }, + { + "epoch": 0.30117239692128195, + "grad_norm": 4.447500228881836, + "learning_rate": 8.200064558888753e-05, + "loss": 2.2077, + "step": 4001 + }, + { + "epoch": 0.30124767120194207, + "grad_norm": 5.637600421905518, + "learning_rate": 8.199127759403258e-05, + "loss": 1.6544, + "step": 4002 + }, + { + "epoch": 0.30132294548260224, + "grad_norm": 4.083150386810303, + "learning_rate": 8.198190769738797e-05, + "loss": 2.1035, + "step": 4003 + }, + { + "epoch": 0.30139821976326236, + "grad_norm": 5.844494342803955, + "learning_rate": 8.197253589951073e-05, + "loss": 1.9702, + "step": 4004 + }, + { + "epoch": 0.30147349404392254, + "grad_norm": 3.5449769496917725, + "learning_rate": 8.196316220095795e-05, + "loss": 1.8437, + "step": 4005 + }, + { + "epoch": 0.3015487683245827, + "grad_norm": 3.938633918762207, + "learning_rate": 8.19537866022869e-05, + "loss": 2.2647, + "step": 4006 + }, + { + "epoch": 0.30162404260524284, + "grad_norm": 4.594608306884766, + "learning_rate": 8.194440910405491e-05, + "loss": 2.1227, + "step": 4007 + }, + { + "epoch": 0.301699316885903, + "grad_norm": 4.835352420806885, + "learning_rate": 8.193502970681948e-05, + "loss": 2.04, + "step": 4008 + }, + { + "epoch": 0.3017745911665632, + "grad_norm": 4.093810558319092, + "learning_rate": 8.192564841113815e-05, + "loss": 1.6861, + "step": 4009 + }, + { + "epoch": 0.3018498654472233, + "grad_norm": 4.382516384124756, + "learning_rate": 8.191626521756863e-05, + "loss": 2.6152, + "step": 4010 + }, + { + "epoch": 0.3019251397278835, + "grad_norm": 4.418197154998779, + "learning_rate": 8.190688012666871e-05, + "loss": 2.3922, + "step": 4011 + }, + { + "epoch": 0.3020004140085436, + "grad_norm": 3.890061616897583, + "learning_rate": 8.189749313899633e-05, + "loss": 2.067, + "step": 4012 + }, + { + "epoch": 0.3020756882892038, + "grad_norm": 5.5481953620910645, + "learning_rate": 8.188810425510951e-05, + "loss": 2.3599, + "step": 4013 + }, + { + "epoch": 0.30215096256986396, + "grad_norm": 5.106924057006836, + "learning_rate": 8.187871347556638e-05, + "loss": 2.2143, + "step": 4014 + }, + { + "epoch": 0.3022262368505241, + "grad_norm": 4.832911968231201, + "learning_rate": 8.186932080092521e-05, + "loss": 1.9913, + "step": 4015 + }, + { + "epoch": 0.30230151113118425, + "grad_norm": 5.2827677726745605, + "learning_rate": 8.185992623174437e-05, + "loss": 1.8147, + "step": 4016 + }, + { + "epoch": 0.30237678541184443, + "grad_norm": 4.7907023429870605, + "learning_rate": 8.185052976858231e-05, + "loss": 2.3666, + "step": 4017 + }, + { + "epoch": 0.30245205969250455, + "grad_norm": 4.429478168487549, + "learning_rate": 8.184113141199766e-05, + "loss": 2.1214, + "step": 4018 + }, + { + "epoch": 0.3025273339731647, + "grad_norm": 5.64173698425293, + "learning_rate": 8.183173116254912e-05, + "loss": 1.7993, + "step": 4019 + }, + { + "epoch": 0.3026026082538249, + "grad_norm": 4.29542350769043, + "learning_rate": 8.182232902079548e-05, + "loss": 1.8399, + "step": 4020 + }, + { + "epoch": 0.302677882534485, + "grad_norm": 6.586565017700195, + "learning_rate": 8.18129249872957e-05, + "loss": 1.7283, + "step": 4021 + }, + { + "epoch": 0.3027531568151452, + "grad_norm": 4.294366359710693, + "learning_rate": 8.180351906260879e-05, + "loss": 2.7436, + "step": 4022 + }, + { + "epoch": 0.3028284310958053, + "grad_norm": 4.890934467315674, + "learning_rate": 8.179411124729395e-05, + "loss": 2.5896, + "step": 4023 + }, + { + "epoch": 0.3029037053764655, + "grad_norm": 2.9330251216888428, + "learning_rate": 8.178470154191041e-05, + "loss": 2.0704, + "step": 4024 + }, + { + "epoch": 0.30297897965712567, + "grad_norm": 4.444950103759766, + "learning_rate": 8.177528994701758e-05, + "loss": 2.2147, + "step": 4025 + }, + { + "epoch": 0.3030542539377858, + "grad_norm": 3.5154833793640137, + "learning_rate": 8.17658764631749e-05, + "loss": 1.7689, + "step": 4026 + }, + { + "epoch": 0.30312952821844596, + "grad_norm": 5.468313217163086, + "learning_rate": 8.175646109094205e-05, + "loss": 2.2068, + "step": 4027 + }, + { + "epoch": 0.30320480249910614, + "grad_norm": 5.573143482208252, + "learning_rate": 8.174704383087868e-05, + "loss": 1.9208, + "step": 4028 + }, + { + "epoch": 0.30328007677976626, + "grad_norm": 4.63287353515625, + "learning_rate": 8.173762468354467e-05, + "loss": 2.0323, + "step": 4029 + }, + { + "epoch": 0.30335535106042644, + "grad_norm": 4.022366046905518, + "learning_rate": 8.172820364949992e-05, + "loss": 2.3292, + "step": 4030 + }, + { + "epoch": 0.30343062534108656, + "grad_norm": 3.474485397338867, + "learning_rate": 8.171878072930451e-05, + "loss": 2.1012, + "step": 4031 + }, + { + "epoch": 0.30350589962174673, + "grad_norm": 4.458362579345703, + "learning_rate": 8.17093559235186e-05, + "loss": 2.0234, + "step": 4032 + }, + { + "epoch": 0.3035811739024069, + "grad_norm": 5.3365397453308105, + "learning_rate": 8.169992923270247e-05, + "loss": 1.8182, + "step": 4033 + }, + { + "epoch": 0.30365644818306703, + "grad_norm": 4.710349082946777, + "learning_rate": 8.169050065741648e-05, + "loss": 1.9494, + "step": 4034 + }, + { + "epoch": 0.3037317224637272, + "grad_norm": 4.603903293609619, + "learning_rate": 8.168107019822117e-05, + "loss": 1.6216, + "step": 4035 + }, + { + "epoch": 0.3038069967443874, + "grad_norm": 4.38110876083374, + "learning_rate": 8.167163785567716e-05, + "loss": 1.9569, + "step": 4036 + }, + { + "epoch": 0.3038822710250475, + "grad_norm": 5.198643684387207, + "learning_rate": 8.166220363034513e-05, + "loss": 1.9497, + "step": 4037 + }, + { + "epoch": 0.3039575453057077, + "grad_norm": 6.538378715515137, + "learning_rate": 8.165276752278594e-05, + "loss": 2.3177, + "step": 4038 + }, + { + "epoch": 0.30403281958636785, + "grad_norm": 3.8377621173858643, + "learning_rate": 8.164332953356056e-05, + "loss": 2.2456, + "step": 4039 + }, + { + "epoch": 0.304108093867028, + "grad_norm": 3.472177505493164, + "learning_rate": 8.163388966323002e-05, + "loss": 1.6802, + "step": 4040 + }, + { + "epoch": 0.30418336814768815, + "grad_norm": 3.353297233581543, + "learning_rate": 8.162444791235552e-05, + "loss": 2.1592, + "step": 4041 + }, + { + "epoch": 0.30425864242834827, + "grad_norm": 4.183670520782471, + "learning_rate": 8.161500428149833e-05, + "loss": 1.9398, + "step": 4042 + }, + { + "epoch": 0.30433391670900845, + "grad_norm": 5.333328723907471, + "learning_rate": 8.160555877121985e-05, + "loss": 1.9265, + "step": 4043 + }, + { + "epoch": 0.3044091909896686, + "grad_norm": 5.0812907218933105, + "learning_rate": 8.159611138208157e-05, + "loss": 1.8548, + "step": 4044 + }, + { + "epoch": 0.30448446527032874, + "grad_norm": 4.2538909912109375, + "learning_rate": 8.158666211464514e-05, + "loss": 1.9934, + "step": 4045 + }, + { + "epoch": 0.3045597395509889, + "grad_norm": 5.934788703918457, + "learning_rate": 8.157721096947229e-05, + "loss": 1.842, + "step": 4046 + }, + { + "epoch": 0.3046350138316491, + "grad_norm": 4.272953510284424, + "learning_rate": 8.156775794712485e-05, + "loss": 2.1654, + "step": 4047 + }, + { + "epoch": 0.3047102881123092, + "grad_norm": 4.149127960205078, + "learning_rate": 8.155830304816476e-05, + "loss": 2.0902, + "step": 4048 + }, + { + "epoch": 0.3047855623929694, + "grad_norm": 5.6880598068237305, + "learning_rate": 8.154884627315412e-05, + "loss": 1.9925, + "step": 4049 + }, + { + "epoch": 0.30486083667362956, + "grad_norm": 4.037545680999756, + "learning_rate": 8.153938762265509e-05, + "loss": 2.0993, + "step": 4050 + }, + { + "epoch": 0.3049361109542897, + "grad_norm": 4.277050495147705, + "learning_rate": 8.152992709722996e-05, + "loss": 2.2155, + "step": 4051 + }, + { + "epoch": 0.30501138523494986, + "grad_norm": 4.304996967315674, + "learning_rate": 8.152046469744115e-05, + "loss": 1.8723, + "step": 4052 + }, + { + "epoch": 0.30508665951561, + "grad_norm": 3.7426083087921143, + "learning_rate": 8.151100042385114e-05, + "loss": 1.9863, + "step": 4053 + }, + { + "epoch": 0.30516193379627016, + "grad_norm": 3.8212766647338867, + "learning_rate": 8.150153427702256e-05, + "loss": 2.0204, + "step": 4054 + }, + { + "epoch": 0.30523720807693033, + "grad_norm": 3.710747003555298, + "learning_rate": 8.149206625751817e-05, + "loss": 2.0361, + "step": 4055 + }, + { + "epoch": 0.30531248235759045, + "grad_norm": 4.176271438598633, + "learning_rate": 8.14825963659008e-05, + "loss": 2.0997, + "step": 4056 + }, + { + "epoch": 0.30538775663825063, + "grad_norm": 4.722878932952881, + "learning_rate": 8.14731246027334e-05, + "loss": 2.0295, + "step": 4057 + }, + { + "epoch": 0.3054630309189108, + "grad_norm": 4.019648551940918, + "learning_rate": 8.146365096857907e-05, + "loss": 1.4909, + "step": 4058 + }, + { + "epoch": 0.3055383051995709, + "grad_norm": 5.106976509094238, + "learning_rate": 8.145417546400096e-05, + "loss": 2.1997, + "step": 4059 + }, + { + "epoch": 0.3056135794802311, + "grad_norm": 6.042627811431885, + "learning_rate": 8.144469808956235e-05, + "loss": 1.7898, + "step": 4060 + }, + { + "epoch": 0.3056888537608912, + "grad_norm": 3.81748628616333, + "learning_rate": 8.143521884582668e-05, + "loss": 1.9809, + "step": 4061 + }, + { + "epoch": 0.3057641280415514, + "grad_norm": 6.871935844421387, + "learning_rate": 8.142573773335745e-05, + "loss": 1.9752, + "step": 4062 + }, + { + "epoch": 0.3058394023222116, + "grad_norm": 3.6774086952209473, + "learning_rate": 8.141625475271828e-05, + "loss": 2.0542, + "step": 4063 + }, + { + "epoch": 0.3059146766028717, + "grad_norm": 5.08273983001709, + "learning_rate": 8.140676990447291e-05, + "loss": 2.0862, + "step": 4064 + }, + { + "epoch": 0.30598995088353187, + "grad_norm": 4.048882484436035, + "learning_rate": 8.139728318918518e-05, + "loss": 1.9852, + "step": 4065 + }, + { + "epoch": 0.30606522516419205, + "grad_norm": 4.194674968719482, + "learning_rate": 8.138779460741906e-05, + "loss": 1.9804, + "step": 4066 + }, + { + "epoch": 0.30614049944485217, + "grad_norm": 4.660661697387695, + "learning_rate": 8.137830415973862e-05, + "loss": 1.7398, + "step": 4067 + }, + { + "epoch": 0.30621577372551234, + "grad_norm": 4.434919834136963, + "learning_rate": 8.136881184670802e-05, + "loss": 2.2419, + "step": 4068 + }, + { + "epoch": 0.3062910480061725, + "grad_norm": 5.208791732788086, + "learning_rate": 8.135931766889156e-05, + "loss": 2.0253, + "step": 4069 + }, + { + "epoch": 0.30636632228683264, + "grad_norm": 4.941207408905029, + "learning_rate": 8.134982162685365e-05, + "loss": 2.0767, + "step": 4070 + }, + { + "epoch": 0.3064415965674928, + "grad_norm": 3.9456770420074463, + "learning_rate": 8.134032372115881e-05, + "loss": 1.9677, + "step": 4071 + }, + { + "epoch": 0.30651687084815293, + "grad_norm": 4.965690612792969, + "learning_rate": 8.133082395237163e-05, + "loss": 1.8762, + "step": 4072 + }, + { + "epoch": 0.3065921451288131, + "grad_norm": 6.1574907302856445, + "learning_rate": 8.132132232105686e-05, + "loss": 1.9853, + "step": 4073 + }, + { + "epoch": 0.3066674194094733, + "grad_norm": 5.204733371734619, + "learning_rate": 8.131181882777938e-05, + "loss": 2.0672, + "step": 4074 + }, + { + "epoch": 0.3067426936901334, + "grad_norm": 5.487968921661377, + "learning_rate": 8.130231347310408e-05, + "loss": 2.6792, + "step": 4075 + }, + { + "epoch": 0.3068179679707936, + "grad_norm": 4.597421646118164, + "learning_rate": 8.129280625759608e-05, + "loss": 2.1712, + "step": 4076 + }, + { + "epoch": 0.30689324225145376, + "grad_norm": 4.876674175262451, + "learning_rate": 8.128329718182052e-05, + "loss": 2.0335, + "step": 4077 + }, + { + "epoch": 0.3069685165321139, + "grad_norm": 4.4856438636779785, + "learning_rate": 8.127378624634272e-05, + "loss": 2.1583, + "step": 4078 + }, + { + "epoch": 0.30704379081277405, + "grad_norm": 4.017351150512695, + "learning_rate": 8.126427345172806e-05, + "loss": 1.8377, + "step": 4079 + }, + { + "epoch": 0.3071190650934342, + "grad_norm": 6.131873607635498, + "learning_rate": 8.125475879854206e-05, + "loss": 2.1403, + "step": 4080 + }, + { + "epoch": 0.30719433937409435, + "grad_norm": 5.1919846534729, + "learning_rate": 8.124524228735031e-05, + "loss": 2.3897, + "step": 4081 + }, + { + "epoch": 0.3072696136547545, + "grad_norm": 6.324389934539795, + "learning_rate": 8.123572391871857e-05, + "loss": 1.6265, + "step": 4082 + }, + { + "epoch": 0.30734488793541465, + "grad_norm": 5.253021717071533, + "learning_rate": 8.122620369321267e-05, + "loss": 1.7724, + "step": 4083 + }, + { + "epoch": 0.3074201622160748, + "grad_norm": 4.45188045501709, + "learning_rate": 8.121668161139855e-05, + "loss": 1.9286, + "step": 4084 + }, + { + "epoch": 0.307495436496735, + "grad_norm": 4.46063756942749, + "learning_rate": 8.120715767384228e-05, + "loss": 2.0184, + "step": 4085 + }, + { + "epoch": 0.3075707107773951, + "grad_norm": 4.661442756652832, + "learning_rate": 8.119763188111002e-05, + "loss": 2.1171, + "step": 4086 + }, + { + "epoch": 0.3076459850580553, + "grad_norm": 5.498867511749268, + "learning_rate": 8.118810423376808e-05, + "loss": 2.0624, + "step": 4087 + }, + { + "epoch": 0.30772125933871547, + "grad_norm": 5.268554210662842, + "learning_rate": 8.117857473238283e-05, + "loss": 1.779, + "step": 4088 + }, + { + "epoch": 0.3077965336193756, + "grad_norm": 5.048746585845947, + "learning_rate": 8.116904337752077e-05, + "loss": 1.8637, + "step": 4089 + }, + { + "epoch": 0.30787180790003577, + "grad_norm": 6.81883430480957, + "learning_rate": 8.115951016974851e-05, + "loss": 2.3365, + "step": 4090 + }, + { + "epoch": 0.3079470821806959, + "grad_norm": 4.037649154663086, + "learning_rate": 8.114997510963277e-05, + "loss": 2.0255, + "step": 4091 + }, + { + "epoch": 0.30802235646135606, + "grad_norm": 3.8580470085144043, + "learning_rate": 8.114043819774041e-05, + "loss": 1.952, + "step": 4092 + }, + { + "epoch": 0.30809763074201624, + "grad_norm": 4.869329452514648, + "learning_rate": 8.113089943463834e-05, + "loss": 2.0638, + "step": 4093 + }, + { + "epoch": 0.30817290502267636, + "grad_norm": 5.133235454559326, + "learning_rate": 8.112135882089362e-05, + "loss": 1.8439, + "step": 4094 + }, + { + "epoch": 0.30824817930333653, + "grad_norm": 4.648326873779297, + "learning_rate": 8.111181635707343e-05, + "loss": 1.982, + "step": 4095 + }, + { + "epoch": 0.3083234535839967, + "grad_norm": 5.285988807678223, + "learning_rate": 8.1102272043745e-05, + "loss": 2.0807, + "step": 4096 + }, + { + "epoch": 0.30839872786465683, + "grad_norm": 11.501988410949707, + "learning_rate": 8.109272588147578e-05, + "loss": 2.2605, + "step": 4097 + }, + { + "epoch": 0.308474002145317, + "grad_norm": 5.610404014587402, + "learning_rate": 8.10831778708332e-05, + "loss": 2.1319, + "step": 4098 + }, + { + "epoch": 0.3085492764259772, + "grad_norm": 3.634601593017578, + "learning_rate": 8.107362801238487e-05, + "loss": 1.9752, + "step": 4099 + }, + { + "epoch": 0.3086245507066373, + "grad_norm": 5.04384708404541, + "learning_rate": 8.106407630669852e-05, + "loss": 1.9854, + "step": 4100 + }, + { + "epoch": 0.3086998249872975, + "grad_norm": 4.8440165519714355, + "learning_rate": 8.105452275434197e-05, + "loss": 2.0274, + "step": 4101 + }, + { + "epoch": 0.3087750992679576, + "grad_norm": 5.094618797302246, + "learning_rate": 8.104496735588316e-05, + "loss": 1.8071, + "step": 4102 + }, + { + "epoch": 0.3088503735486178, + "grad_norm": 5.863461017608643, + "learning_rate": 8.103541011189011e-05, + "loss": 1.9727, + "step": 4103 + }, + { + "epoch": 0.30892564782927795, + "grad_norm": 5.042271137237549, + "learning_rate": 8.102585102293098e-05, + "loss": 1.9689, + "step": 4104 + }, + { + "epoch": 0.30900092210993807, + "grad_norm": 4.370942115783691, + "learning_rate": 8.101629008957402e-05, + "loss": 1.776, + "step": 4105 + }, + { + "epoch": 0.30907619639059825, + "grad_norm": 3.9975955486297607, + "learning_rate": 8.100672731238762e-05, + "loss": 1.9453, + "step": 4106 + }, + { + "epoch": 0.3091514706712584, + "grad_norm": 4.6842827796936035, + "learning_rate": 8.099716269194025e-05, + "loss": 1.9941, + "step": 4107 + }, + { + "epoch": 0.30922674495191854, + "grad_norm": 5.521533489227295, + "learning_rate": 8.09875962288005e-05, + "loss": 2.2095, + "step": 4108 + }, + { + "epoch": 0.3093020192325787, + "grad_norm": 5.3865532875061035, + "learning_rate": 8.097802792353705e-05, + "loss": 1.6124, + "step": 4109 + }, + { + "epoch": 0.30937729351323884, + "grad_norm": 5.3589959144592285, + "learning_rate": 8.096845777671874e-05, + "loss": 2.208, + "step": 4110 + }, + { + "epoch": 0.309452567793899, + "grad_norm": 5.392484664916992, + "learning_rate": 8.095888578891449e-05, + "loss": 1.8472, + "step": 4111 + }, + { + "epoch": 0.3095278420745592, + "grad_norm": 3.5484778881073, + "learning_rate": 8.094931196069328e-05, + "loss": 1.8095, + "step": 4112 + }, + { + "epoch": 0.3096031163552193, + "grad_norm": 4.021432876586914, + "learning_rate": 8.09397362926243e-05, + "loss": 2.2879, + "step": 4113 + }, + { + "epoch": 0.3096783906358795, + "grad_norm": 4.4004740715026855, + "learning_rate": 8.093015878527676e-05, + "loss": 2.1758, + "step": 4114 + }, + { + "epoch": 0.30975366491653966, + "grad_norm": 5.061720848083496, + "learning_rate": 8.092057943922004e-05, + "loss": 1.8712, + "step": 4115 + }, + { + "epoch": 0.3098289391971998, + "grad_norm": 5.847341537475586, + "learning_rate": 8.091099825502359e-05, + "loss": 1.6718, + "step": 4116 + }, + { + "epoch": 0.30990421347785996, + "grad_norm": 3.808504104614258, + "learning_rate": 8.090141523325699e-05, + "loss": 2.0764, + "step": 4117 + }, + { + "epoch": 0.30997948775852013, + "grad_norm": 3.72623348236084, + "learning_rate": 8.089183037448991e-05, + "loss": 1.9831, + "step": 4118 + }, + { + "epoch": 0.31005476203918025, + "grad_norm": 4.513221740722656, + "learning_rate": 8.088224367929216e-05, + "loss": 2.1058, + "step": 4119 + }, + { + "epoch": 0.31013003631984043, + "grad_norm": 6.3050537109375, + "learning_rate": 8.087265514823362e-05, + "loss": 1.9353, + "step": 4120 + }, + { + "epoch": 0.31020531060050055, + "grad_norm": 3.6257219314575195, + "learning_rate": 8.086306478188434e-05, + "loss": 1.8618, + "step": 4121 + }, + { + "epoch": 0.3102805848811607, + "grad_norm": 4.7520318031311035, + "learning_rate": 8.08534725808144e-05, + "loss": 2.5045, + "step": 4122 + }, + { + "epoch": 0.3103558591618209, + "grad_norm": 5.053914546966553, + "learning_rate": 8.084387854559403e-05, + "loss": 2.275, + "step": 4123 + }, + { + "epoch": 0.310431133442481, + "grad_norm": 6.50734806060791, + "learning_rate": 8.08342826767936e-05, + "loss": 2.1574, + "step": 4124 + }, + { + "epoch": 0.3105064077231412, + "grad_norm": 5.083341121673584, + "learning_rate": 8.082468497498355e-05, + "loss": 2.0956, + "step": 4125 + }, + { + "epoch": 0.3105816820038014, + "grad_norm": 5.281416416168213, + "learning_rate": 8.08150854407344e-05, + "loss": 2.0642, + "step": 4126 + }, + { + "epoch": 0.3106569562844615, + "grad_norm": 5.110672950744629, + "learning_rate": 8.080548407461684e-05, + "loss": 2.1145, + "step": 4127 + }, + { + "epoch": 0.31073223056512167, + "grad_norm": 4.969967842102051, + "learning_rate": 8.079588087720165e-05, + "loss": 1.9353, + "step": 4128 + }, + { + "epoch": 0.3108075048457818, + "grad_norm": 4.673458576202393, + "learning_rate": 8.078627584905971e-05, + "loss": 1.7939, + "step": 4129 + }, + { + "epoch": 0.31088277912644197, + "grad_norm": 3.638911724090576, + "learning_rate": 8.077666899076201e-05, + "loss": 1.7281, + "step": 4130 + }, + { + "epoch": 0.31095805340710214, + "grad_norm": 3.6680829524993896, + "learning_rate": 8.076706030287964e-05, + "loss": 2.1524, + "step": 4131 + }, + { + "epoch": 0.31103332768776226, + "grad_norm": 5.044182300567627, + "learning_rate": 8.075744978598381e-05, + "loss": 2.6038, + "step": 4132 + }, + { + "epoch": 0.31110860196842244, + "grad_norm": 7.607592582702637, + "learning_rate": 8.074783744064585e-05, + "loss": 2.198, + "step": 4133 + }, + { + "epoch": 0.3111838762490826, + "grad_norm": 5.536702632904053, + "learning_rate": 8.073822326743718e-05, + "loss": 2.6092, + "step": 4134 + }, + { + "epoch": 0.31125915052974273, + "grad_norm": 4.153436183929443, + "learning_rate": 8.072860726692933e-05, + "loss": 2.2134, + "step": 4135 + }, + { + "epoch": 0.3113344248104029, + "grad_norm": 4.976628303527832, + "learning_rate": 8.071898943969396e-05, + "loss": 2.0248, + "step": 4136 + }, + { + "epoch": 0.3114096990910631, + "grad_norm": 4.188265323638916, + "learning_rate": 8.070936978630279e-05, + "loss": 1.7563, + "step": 4137 + }, + { + "epoch": 0.3114849733717232, + "grad_norm": 3.808957815170288, + "learning_rate": 8.069974830732772e-05, + "loss": 1.7866, + "step": 4138 + }, + { + "epoch": 0.3115602476523834, + "grad_norm": 5.6623663902282715, + "learning_rate": 8.069012500334069e-05, + "loss": 1.7498, + "step": 4139 + }, + { + "epoch": 0.3116355219330435, + "grad_norm": 6.884983539581299, + "learning_rate": 8.06804998749138e-05, + "loss": 2.1193, + "step": 4140 + }, + { + "epoch": 0.3117107962137037, + "grad_norm": 5.9430670738220215, + "learning_rate": 8.067087292261921e-05, + "loss": 2.1311, + "step": 4141 + }, + { + "epoch": 0.31178607049436385, + "grad_norm": 4.839221954345703, + "learning_rate": 8.066124414702922e-05, + "loss": 2.2644, + "step": 4142 + }, + { + "epoch": 0.311861344775024, + "grad_norm": 6.93126106262207, + "learning_rate": 8.065161354871627e-05, + "loss": 1.9878, + "step": 4143 + }, + { + "epoch": 0.31193661905568415, + "grad_norm": 5.780520915985107, + "learning_rate": 8.064198112825284e-05, + "loss": 1.7907, + "step": 4144 + }, + { + "epoch": 0.3120118933363443, + "grad_norm": 4.934045314788818, + "learning_rate": 8.063234688621152e-05, + "loss": 2.1539, + "step": 4145 + }, + { + "epoch": 0.31208716761700445, + "grad_norm": 4.8335418701171875, + "learning_rate": 8.062271082316511e-05, + "loss": 1.994, + "step": 4146 + }, + { + "epoch": 0.3121624418976646, + "grad_norm": 6.246216297149658, + "learning_rate": 8.06130729396864e-05, + "loss": 1.7197, + "step": 4147 + }, + { + "epoch": 0.3122377161783248, + "grad_norm": 4.342348098754883, + "learning_rate": 8.060343323634834e-05, + "loss": 2.2097, + "step": 4148 + }, + { + "epoch": 0.3123129904589849, + "grad_norm": 4.31889009475708, + "learning_rate": 8.059379171372399e-05, + "loss": 1.8451, + "step": 4149 + }, + { + "epoch": 0.3123882647396451, + "grad_norm": 4.057023525238037, + "learning_rate": 8.058414837238648e-05, + "loss": 2.3338, + "step": 4150 + }, + { + "epoch": 0.3124635390203052, + "grad_norm": 5.637147903442383, + "learning_rate": 8.057450321290913e-05, + "loss": 1.9821, + "step": 4151 + }, + { + "epoch": 0.3125388133009654, + "grad_norm": 4.549032211303711, + "learning_rate": 8.056485623586529e-05, + "loss": 1.602, + "step": 4152 + }, + { + "epoch": 0.31261408758162557, + "grad_norm": 3.76389479637146, + "learning_rate": 8.055520744182845e-05, + "loss": 1.8877, + "step": 4153 + }, + { + "epoch": 0.3126893618622857, + "grad_norm": 4.166214466094971, + "learning_rate": 8.05455568313722e-05, + "loss": 1.9226, + "step": 4154 + }, + { + "epoch": 0.31276463614294586, + "grad_norm": 4.176894664764404, + "learning_rate": 8.053590440507022e-05, + "loss": 1.8303, + "step": 4155 + }, + { + "epoch": 0.31283991042360604, + "grad_norm": 3.733010768890381, + "learning_rate": 8.052625016349636e-05, + "loss": 2.0695, + "step": 4156 + }, + { + "epoch": 0.31291518470426616, + "grad_norm": 4.667023658752441, + "learning_rate": 8.05165941072245e-05, + "loss": 2.3617, + "step": 4157 + }, + { + "epoch": 0.31299045898492633, + "grad_norm": 4.891637802124023, + "learning_rate": 8.05069362368287e-05, + "loss": 2.1035, + "step": 4158 + }, + { + "epoch": 0.31306573326558645, + "grad_norm": 4.5017571449279785, + "learning_rate": 8.049727655288308e-05, + "loss": 1.8228, + "step": 4159 + }, + { + "epoch": 0.31314100754624663, + "grad_norm": 4.297059059143066, + "learning_rate": 8.048761505596185e-05, + "loss": 1.7543, + "step": 4160 + }, + { + "epoch": 0.3132162818269068, + "grad_norm": 5.62193489074707, + "learning_rate": 8.047795174663941e-05, + "loss": 1.8116, + "step": 4161 + }, + { + "epoch": 0.3132915561075669, + "grad_norm": 5.109541416168213, + "learning_rate": 8.046828662549018e-05, + "loss": 2.2991, + "step": 4162 + }, + { + "epoch": 0.3133668303882271, + "grad_norm": 4.738384246826172, + "learning_rate": 8.045861969308872e-05, + "loss": 1.753, + "step": 4163 + }, + { + "epoch": 0.3134421046688873, + "grad_norm": 4.469361782073975, + "learning_rate": 8.044895095000972e-05, + "loss": 2.0392, + "step": 4164 + }, + { + "epoch": 0.3135173789495474, + "grad_norm": 3.790707588195801, + "learning_rate": 8.043928039682798e-05, + "loss": 1.7623, + "step": 4165 + }, + { + "epoch": 0.3135926532302076, + "grad_norm": 5.312924861907959, + "learning_rate": 8.042960803411835e-05, + "loss": 1.7638, + "step": 4166 + }, + { + "epoch": 0.31366792751086775, + "grad_norm": 5.138605117797852, + "learning_rate": 8.041993386245581e-05, + "loss": 2.1812, + "step": 4167 + }, + { + "epoch": 0.31374320179152787, + "grad_norm": 4.705393314361572, + "learning_rate": 8.04102578824155e-05, + "loss": 2.1203, + "step": 4168 + }, + { + "epoch": 0.31381847607218805, + "grad_norm": 4.142406940460205, + "learning_rate": 8.040058009457263e-05, + "loss": 2.0457, + "step": 4169 + }, + { + "epoch": 0.31389375035284817, + "grad_norm": 4.072697162628174, + "learning_rate": 8.039090049950249e-05, + "loss": 2.3336, + "step": 4170 + }, + { + "epoch": 0.31396902463350834, + "grad_norm": 3.9310832023620605, + "learning_rate": 8.038121909778052e-05, + "loss": 2.0128, + "step": 4171 + }, + { + "epoch": 0.3140442989141685, + "grad_norm": 4.50548791885376, + "learning_rate": 8.037153588998224e-05, + "loss": 2.7424, + "step": 4172 + }, + { + "epoch": 0.31411957319482864, + "grad_norm": 4.231843948364258, + "learning_rate": 8.036185087668329e-05, + "loss": 2.1257, + "step": 4173 + }, + { + "epoch": 0.3141948474754888, + "grad_norm": 7.13316535949707, + "learning_rate": 8.035216405845945e-05, + "loss": 2.4721, + "step": 4174 + }, + { + "epoch": 0.314270121756149, + "grad_norm": 5.095510005950928, + "learning_rate": 8.034247543588653e-05, + "loss": 2.2306, + "step": 4175 + }, + { + "epoch": 0.3143453960368091, + "grad_norm": 3.729904890060425, + "learning_rate": 8.033278500954051e-05, + "loss": 2.1525, + "step": 4176 + }, + { + "epoch": 0.3144206703174693, + "grad_norm": 5.1464762687683105, + "learning_rate": 8.032309277999745e-05, + "loss": 2.035, + "step": 4177 + }, + { + "epoch": 0.3144959445981294, + "grad_norm": 6.489736080169678, + "learning_rate": 8.031339874783354e-05, + "loss": 2.0451, + "step": 4178 + }, + { + "epoch": 0.3145712188787896, + "grad_norm": 5.9339375495910645, + "learning_rate": 8.030370291362506e-05, + "loss": 2.6718, + "step": 4179 + }, + { + "epoch": 0.31464649315944976, + "grad_norm": 4.8780107498168945, + "learning_rate": 8.029400527794837e-05, + "loss": 2.0106, + "step": 4180 + }, + { + "epoch": 0.3147217674401099, + "grad_norm": 4.1365485191345215, + "learning_rate": 8.028430584138e-05, + "loss": 2.0318, + "step": 4181 + }, + { + "epoch": 0.31479704172077005, + "grad_norm": 4.1941752433776855, + "learning_rate": 8.027460460449657e-05, + "loss": 2.0775, + "step": 4182 + }, + { + "epoch": 0.31487231600143023, + "grad_norm": 3.978912353515625, + "learning_rate": 8.026490156787473e-05, + "loss": 2.0506, + "step": 4183 + }, + { + "epoch": 0.31494759028209035, + "grad_norm": 3.028184175491333, + "learning_rate": 8.025519673209135e-05, + "loss": 1.9099, + "step": 4184 + }, + { + "epoch": 0.3150228645627505, + "grad_norm": 3.51009202003479, + "learning_rate": 8.024549009772331e-05, + "loss": 2.3928, + "step": 4185 + }, + { + "epoch": 0.3150981388434107, + "grad_norm": 5.449923038482666, + "learning_rate": 8.023578166534769e-05, + "loss": 1.7385, + "step": 4186 + }, + { + "epoch": 0.3151734131240708, + "grad_norm": 5.712917327880859, + "learning_rate": 8.022607143554159e-05, + "loss": 1.642, + "step": 4187 + }, + { + "epoch": 0.315248687404731, + "grad_norm": 5.557191848754883, + "learning_rate": 8.02163594088823e-05, + "loss": 2.0553, + "step": 4188 + }, + { + "epoch": 0.3153239616853911, + "grad_norm": 4.718916416168213, + "learning_rate": 8.020664558594711e-05, + "loss": 1.8814, + "step": 4189 + }, + { + "epoch": 0.3153992359660513, + "grad_norm": 4.241898536682129, + "learning_rate": 8.019692996731354e-05, + "loss": 2.1256, + "step": 4190 + }, + { + "epoch": 0.31547451024671147, + "grad_norm": 3.9251842498779297, + "learning_rate": 8.018721255355911e-05, + "loss": 2.1138, + "step": 4191 + }, + { + "epoch": 0.3155497845273716, + "grad_norm": 4.508667469024658, + "learning_rate": 8.017749334526152e-05, + "loss": 1.8397, + "step": 4192 + }, + { + "epoch": 0.31562505880803177, + "grad_norm": 3.842512607574463, + "learning_rate": 8.016777234299854e-05, + "loss": 2.0503, + "step": 4193 + }, + { + "epoch": 0.31570033308869194, + "grad_norm": 5.04013204574585, + "learning_rate": 8.015804954734805e-05, + "loss": 2.1784, + "step": 4194 + }, + { + "epoch": 0.31577560736935206, + "grad_norm": 4.8822150230407715, + "learning_rate": 8.014832495888804e-05, + "loss": 2.0893, + "step": 4195 + }, + { + "epoch": 0.31585088165001224, + "grad_norm": 6.418948173522949, + "learning_rate": 8.013859857819664e-05, + "loss": 2.2752, + "step": 4196 + }, + { + "epoch": 0.3159261559306724, + "grad_norm": 6.8863420486450195, + "learning_rate": 8.012887040585202e-05, + "loss": 2.0813, + "step": 4197 + }, + { + "epoch": 0.31600143021133253, + "grad_norm": 4.413151741027832, + "learning_rate": 8.01191404424325e-05, + "loss": 1.9331, + "step": 4198 + }, + { + "epoch": 0.3160767044919927, + "grad_norm": 4.351831912994385, + "learning_rate": 8.01094086885165e-05, + "loss": 2.0356, + "step": 4199 + }, + { + "epoch": 0.31615197877265283, + "grad_norm": 5.103581428527832, + "learning_rate": 8.009967514468254e-05, + "loss": 1.8951, + "step": 4200 + }, + { + "epoch": 0.316227253053313, + "grad_norm": 7.617289066314697, + "learning_rate": 8.008993981150928e-05, + "loss": 2.0731, + "step": 4201 + }, + { + "epoch": 0.3163025273339732, + "grad_norm": 3.9347362518310547, + "learning_rate": 8.008020268957543e-05, + "loss": 1.8671, + "step": 4202 + }, + { + "epoch": 0.3163778016146333, + "grad_norm": 4.56450080871582, + "learning_rate": 8.007046377945983e-05, + "loss": 1.8944, + "step": 4203 + }, + { + "epoch": 0.3164530758952935, + "grad_norm": 4.860941410064697, + "learning_rate": 8.006072308174144e-05, + "loss": 2.3441, + "step": 4204 + }, + { + "epoch": 0.31652835017595365, + "grad_norm": 6.801948547363281, + "learning_rate": 8.005098059699932e-05, + "loss": 2.2835, + "step": 4205 + }, + { + "epoch": 0.3166036244566138, + "grad_norm": 4.3067626953125, + "learning_rate": 8.004123632581264e-05, + "loss": 1.7514, + "step": 4206 + }, + { + "epoch": 0.31667889873727395, + "grad_norm": 5.525414943695068, + "learning_rate": 8.003149026876064e-05, + "loss": 1.8895, + "step": 4207 + }, + { + "epoch": 0.31675417301793407, + "grad_norm": 4.888801574707031, + "learning_rate": 8.002174242642273e-05, + "loss": 1.8697, + "step": 4208 + }, + { + "epoch": 0.31682944729859425, + "grad_norm": 5.053991794586182, + "learning_rate": 8.001199279937837e-05, + "loss": 1.9335, + "step": 4209 + }, + { + "epoch": 0.3169047215792544, + "grad_norm": 5.848512649536133, + "learning_rate": 8.000224138820715e-05, + "loss": 2.3787, + "step": 4210 + }, + { + "epoch": 0.31697999585991454, + "grad_norm": 4.121793746948242, + "learning_rate": 7.999248819348877e-05, + "loss": 1.9153, + "step": 4211 + }, + { + "epoch": 0.3170552701405747, + "grad_norm": 6.225831031799316, + "learning_rate": 7.998273321580302e-05, + "loss": 2.1583, + "step": 4212 + }, + { + "epoch": 0.3171305444212349, + "grad_norm": 4.883932113647461, + "learning_rate": 7.99729764557298e-05, + "loss": 2.1148, + "step": 4213 + }, + { + "epoch": 0.317205818701895, + "grad_norm": 5.402492523193359, + "learning_rate": 7.996321791384915e-05, + "loss": 2.1135, + "step": 4214 + }, + { + "epoch": 0.3172810929825552, + "grad_norm": 5.1864213943481445, + "learning_rate": 7.995345759074116e-05, + "loss": 1.8842, + "step": 4215 + }, + { + "epoch": 0.31735636726321537, + "grad_norm": 4.874420166015625, + "learning_rate": 7.994369548698608e-05, + "loss": 2.3403, + "step": 4216 + }, + { + "epoch": 0.3174316415438755, + "grad_norm": 5.139329433441162, + "learning_rate": 7.993393160316421e-05, + "loss": 2.1243, + "step": 4217 + }, + { + "epoch": 0.31750691582453566, + "grad_norm": 4.445467948913574, + "learning_rate": 7.992416593985599e-05, + "loss": 2.1472, + "step": 4218 + }, + { + "epoch": 0.3175821901051958, + "grad_norm": 5.442739486694336, + "learning_rate": 7.991439849764196e-05, + "loss": 1.9282, + "step": 4219 + }, + { + "epoch": 0.31765746438585596, + "grad_norm": 5.267314910888672, + "learning_rate": 7.990462927710278e-05, + "loss": 2.1721, + "step": 4220 + }, + { + "epoch": 0.31773273866651613, + "grad_norm": 5.10986852645874, + "learning_rate": 7.98948582788192e-05, + "loss": 1.9953, + "step": 4221 + }, + { + "epoch": 0.31780801294717625, + "grad_norm": 4.468688011169434, + "learning_rate": 7.988508550337207e-05, + "loss": 2.006, + "step": 4222 + }, + { + "epoch": 0.31788328722783643, + "grad_norm": 6.800841808319092, + "learning_rate": 7.987531095134236e-05, + "loss": 2.2421, + "step": 4223 + }, + { + "epoch": 0.3179585615084966, + "grad_norm": 3.708024740219116, + "learning_rate": 7.986553462331114e-05, + "loss": 1.7357, + "step": 4224 + }, + { + "epoch": 0.3180338357891567, + "grad_norm": 3.7180466651916504, + "learning_rate": 7.985575651985957e-05, + "loss": 1.7467, + "step": 4225 + }, + { + "epoch": 0.3181091100698169, + "grad_norm": 4.065893173217773, + "learning_rate": 7.984597664156895e-05, + "loss": 1.9881, + "step": 4226 + }, + { + "epoch": 0.318184384350477, + "grad_norm": 4.427385330200195, + "learning_rate": 7.983619498902064e-05, + "loss": 1.8517, + "step": 4227 + }, + { + "epoch": 0.3182596586311372, + "grad_norm": 3.9506874084472656, + "learning_rate": 7.982641156279615e-05, + "loss": 1.8064, + "step": 4228 + }, + { + "epoch": 0.3183349329117974, + "grad_norm": 4.5521440505981445, + "learning_rate": 7.98166263634771e-05, + "loss": 2.0856, + "step": 4229 + }, + { + "epoch": 0.3184102071924575, + "grad_norm": 5.417686939239502, + "learning_rate": 7.980683939164515e-05, + "loss": 2.1261, + "step": 4230 + }, + { + "epoch": 0.31848548147311767, + "grad_norm": 5.204405784606934, + "learning_rate": 7.979705064788212e-05, + "loss": 2.4627, + "step": 4231 + }, + { + "epoch": 0.31856075575377785, + "grad_norm": 6.09988260269165, + "learning_rate": 7.978726013276994e-05, + "loss": 2.1523, + "step": 4232 + }, + { + "epoch": 0.31863603003443797, + "grad_norm": 4.191976547241211, + "learning_rate": 7.97774678468906e-05, + "loss": 2.2153, + "step": 4233 + }, + { + "epoch": 0.31871130431509814, + "grad_norm": 4.253741264343262, + "learning_rate": 7.976767379082623e-05, + "loss": 2.1283, + "step": 4234 + }, + { + "epoch": 0.3187865785957583, + "grad_norm": 4.9622039794921875, + "learning_rate": 7.975787796515907e-05, + "loss": 2.0825, + "step": 4235 + }, + { + "epoch": 0.31886185287641844, + "grad_norm": 4.996169567108154, + "learning_rate": 7.974808037047146e-05, + "loss": 1.9932, + "step": 4236 + }, + { + "epoch": 0.3189371271570786, + "grad_norm": 4.502724647521973, + "learning_rate": 7.973828100734584e-05, + "loss": 2.1003, + "step": 4237 + }, + { + "epoch": 0.31901240143773874, + "grad_norm": 4.587053298950195, + "learning_rate": 7.972847987636471e-05, + "loss": 1.662, + "step": 4238 + }, + { + "epoch": 0.3190876757183989, + "grad_norm": 3.4886562824249268, + "learning_rate": 7.971867697811079e-05, + "loss": 2.2299, + "step": 4239 + }, + { + "epoch": 0.3191629499990591, + "grad_norm": 5.665261268615723, + "learning_rate": 7.970887231316677e-05, + "loss": 2.0695, + "step": 4240 + }, + { + "epoch": 0.3192382242797192, + "grad_norm": 4.8837127685546875, + "learning_rate": 7.969906588211554e-05, + "loss": 1.8784, + "step": 4241 + }, + { + "epoch": 0.3193134985603794, + "grad_norm": 4.704890251159668, + "learning_rate": 7.968925768554005e-05, + "loss": 1.7844, + "step": 4242 + }, + { + "epoch": 0.31938877284103956, + "grad_norm": 5.348260402679443, + "learning_rate": 7.96794477240234e-05, + "loss": 2.0128, + "step": 4243 + }, + { + "epoch": 0.3194640471216997, + "grad_norm": 5.229158401489258, + "learning_rate": 7.966963599814874e-05, + "loss": 2.2811, + "step": 4244 + }, + { + "epoch": 0.31953932140235985, + "grad_norm": 4.3444013595581055, + "learning_rate": 7.965982250849934e-05, + "loss": 1.7147, + "step": 4245 + }, + { + "epoch": 0.31961459568302003, + "grad_norm": 4.520373344421387, + "learning_rate": 7.965000725565861e-05, + "loss": 2.0335, + "step": 4246 + }, + { + "epoch": 0.31968986996368015, + "grad_norm": 6.000203609466553, + "learning_rate": 7.964019024021001e-05, + "loss": 2.2267, + "step": 4247 + }, + { + "epoch": 0.3197651442443403, + "grad_norm": 3.972568988800049, + "learning_rate": 7.963037146273715e-05, + "loss": 2.0934, + "step": 4248 + }, + { + "epoch": 0.31984041852500045, + "grad_norm": 5.49121618270874, + "learning_rate": 7.962055092382372e-05, + "loss": 1.9397, + "step": 4249 + }, + { + "epoch": 0.3199156928056606, + "grad_norm": 4.182012557983398, + "learning_rate": 7.961072862405354e-05, + "loss": 2.0562, + "step": 4250 + }, + { + "epoch": 0.3199909670863208, + "grad_norm": 4.054929256439209, + "learning_rate": 7.96009045640105e-05, + "loss": 1.7626, + "step": 4251 + }, + { + "epoch": 0.3200662413669809, + "grad_norm": 4.9852399826049805, + "learning_rate": 7.959107874427863e-05, + "loss": 1.8231, + "step": 4252 + }, + { + "epoch": 0.3201415156476411, + "grad_norm": 4.02760124206543, + "learning_rate": 7.958125116544201e-05, + "loss": 2.1763, + "step": 4253 + }, + { + "epoch": 0.32021678992830127, + "grad_norm": 3.0537211894989014, + "learning_rate": 7.95714218280849e-05, + "loss": 1.9088, + "step": 4254 + }, + { + "epoch": 0.3202920642089614, + "grad_norm": 5.0345354080200195, + "learning_rate": 7.956159073279161e-05, + "loss": 2.0425, + "step": 4255 + }, + { + "epoch": 0.32036733848962157, + "grad_norm": 4.502742290496826, + "learning_rate": 7.955175788014658e-05, + "loss": 2.0991, + "step": 4256 + }, + { + "epoch": 0.3204426127702817, + "grad_norm": 3.492851972579956, + "learning_rate": 7.954192327073434e-05, + "loss": 1.821, + "step": 4257 + }, + { + "epoch": 0.32051788705094186, + "grad_norm": 4.437711715698242, + "learning_rate": 7.953208690513952e-05, + "loss": 1.8556, + "step": 4258 + }, + { + "epoch": 0.32059316133160204, + "grad_norm": 5.508961200714111, + "learning_rate": 7.952224878394687e-05, + "loss": 1.6512, + "step": 4259 + }, + { + "epoch": 0.32066843561226216, + "grad_norm": 4.101222991943359, + "learning_rate": 7.951240890774124e-05, + "loss": 1.6915, + "step": 4260 + }, + { + "epoch": 0.32074370989292234, + "grad_norm": 3.9297451972961426, + "learning_rate": 7.950256727710758e-05, + "loss": 2.0412, + "step": 4261 + }, + { + "epoch": 0.3208189841735825, + "grad_norm": 4.377941131591797, + "learning_rate": 7.949272389263096e-05, + "loss": 2.2407, + "step": 4262 + }, + { + "epoch": 0.32089425845424263, + "grad_norm": 4.96002721786499, + "learning_rate": 7.948287875489651e-05, + "loss": 1.4974, + "step": 4263 + }, + { + "epoch": 0.3209695327349028, + "grad_norm": 4.096199989318848, + "learning_rate": 7.947303186448952e-05, + "loss": 2.0694, + "step": 4264 + }, + { + "epoch": 0.321044807015563, + "grad_norm": 7.235177040100098, + "learning_rate": 7.946318322199535e-05, + "loss": 2.0124, + "step": 4265 + }, + { + "epoch": 0.3211200812962231, + "grad_norm": 5.543209552764893, + "learning_rate": 7.94533328279995e-05, + "loss": 2.1203, + "step": 4266 + }, + { + "epoch": 0.3211953555768833, + "grad_norm": 4.147661209106445, + "learning_rate": 7.94434806830875e-05, + "loss": 2.2026, + "step": 4267 + }, + { + "epoch": 0.3212706298575434, + "grad_norm": 6.708639621734619, + "learning_rate": 7.943362678784505e-05, + "loss": 2.0444, + "step": 4268 + }, + { + "epoch": 0.3213459041382036, + "grad_norm": 7.2309370040893555, + "learning_rate": 7.942377114285796e-05, + "loss": 2.2447, + "step": 4269 + }, + { + "epoch": 0.32142117841886375, + "grad_norm": 4.966527462005615, + "learning_rate": 7.941391374871209e-05, + "loss": 1.8573, + "step": 4270 + }, + { + "epoch": 0.32149645269952387, + "grad_norm": 5.686848163604736, + "learning_rate": 7.940405460599345e-05, + "loss": 1.696, + "step": 4271 + }, + { + "epoch": 0.32157172698018405, + "grad_norm": 4.885373592376709, + "learning_rate": 7.939419371528813e-05, + "loss": 1.7538, + "step": 4272 + }, + { + "epoch": 0.3216470012608442, + "grad_norm": 6.414916038513184, + "learning_rate": 7.938433107718235e-05, + "loss": 2.0618, + "step": 4273 + }, + { + "epoch": 0.32172227554150434, + "grad_norm": 4.447811603546143, + "learning_rate": 7.937446669226238e-05, + "loss": 2.0703, + "step": 4274 + }, + { + "epoch": 0.3217975498221645, + "grad_norm": 5.1187744140625, + "learning_rate": 7.936460056111468e-05, + "loss": 1.9478, + "step": 4275 + }, + { + "epoch": 0.3218728241028247, + "grad_norm": 3.978109359741211, + "learning_rate": 7.93547326843257e-05, + "loss": 1.7732, + "step": 4276 + }, + { + "epoch": 0.3219480983834848, + "grad_norm": 6.079015731811523, + "learning_rate": 7.93448630624821e-05, + "loss": 2.0366, + "step": 4277 + }, + { + "epoch": 0.322023372664145, + "grad_norm": 5.9308881759643555, + "learning_rate": 7.93349916961706e-05, + "loss": 1.8863, + "step": 4278 + }, + { + "epoch": 0.3220986469448051, + "grad_norm": 4.380959510803223, + "learning_rate": 7.932511858597801e-05, + "loss": 1.8405, + "step": 4279 + }, + { + "epoch": 0.3221739212254653, + "grad_norm": 5.676402568817139, + "learning_rate": 7.931524373249126e-05, + "loss": 1.9023, + "step": 4280 + }, + { + "epoch": 0.32224919550612546, + "grad_norm": 5.0600433349609375, + "learning_rate": 7.93053671362974e-05, + "loss": 2.3296, + "step": 4281 + }, + { + "epoch": 0.3223244697867856, + "grad_norm": 4.345888614654541, + "learning_rate": 7.929548879798353e-05, + "loss": 2.2065, + "step": 4282 + }, + { + "epoch": 0.32239974406744576, + "grad_norm": 4.994972229003906, + "learning_rate": 7.928560871813692e-05, + "loss": 1.7915, + "step": 4283 + }, + { + "epoch": 0.32247501834810594, + "grad_norm": 4.461777687072754, + "learning_rate": 7.927572689734489e-05, + "loss": 2.0374, + "step": 4284 + }, + { + "epoch": 0.32255029262876606, + "grad_norm": 5.133300304412842, + "learning_rate": 7.926584333619491e-05, + "loss": 1.9424, + "step": 4285 + }, + { + "epoch": 0.32262556690942623, + "grad_norm": 5.922780513763428, + "learning_rate": 7.92559580352745e-05, + "loss": 2.175, + "step": 4286 + }, + { + "epoch": 0.32270084119008635, + "grad_norm": 7.765385150909424, + "learning_rate": 7.924607099517135e-05, + "loss": 2.184, + "step": 4287 + }, + { + "epoch": 0.3227761154707465, + "grad_norm": 4.702193737030029, + "learning_rate": 7.92361822164732e-05, + "loss": 2.1254, + "step": 4288 + }, + { + "epoch": 0.3228513897514067, + "grad_norm": 3.975552558898926, + "learning_rate": 7.922629169976789e-05, + "loss": 2.0114, + "step": 4289 + }, + { + "epoch": 0.3229266640320668, + "grad_norm": 6.022935390472412, + "learning_rate": 7.92163994456434e-05, + "loss": 2.011, + "step": 4290 + }, + { + "epoch": 0.323001938312727, + "grad_norm": 4.55320930480957, + "learning_rate": 7.92065054546878e-05, + "loss": 2.1454, + "step": 4291 + }, + { + "epoch": 0.3230772125933872, + "grad_norm": 5.387508869171143, + "learning_rate": 7.919660972748923e-05, + "loss": 2.2658, + "step": 4292 + }, + { + "epoch": 0.3231524868740473, + "grad_norm": 4.650260925292969, + "learning_rate": 7.918671226463601e-05, + "loss": 1.8261, + "step": 4293 + }, + { + "epoch": 0.32322776115470747, + "grad_norm": 6.136397838592529, + "learning_rate": 7.91768130667165e-05, + "loss": 1.9409, + "step": 4294 + }, + { + "epoch": 0.32330303543536765, + "grad_norm": 5.892971515655518, + "learning_rate": 7.916691213431915e-05, + "loss": 2.0142, + "step": 4295 + }, + { + "epoch": 0.32337830971602777, + "grad_norm": 7.019687175750732, + "learning_rate": 7.915700946803257e-05, + "loss": 1.9436, + "step": 4296 + }, + { + "epoch": 0.32345358399668794, + "grad_norm": 3.8222222328186035, + "learning_rate": 7.914710506844545e-05, + "loss": 2.2068, + "step": 4297 + }, + { + "epoch": 0.32352885827734806, + "grad_norm": 5.505558013916016, + "learning_rate": 7.913719893614655e-05, + "loss": 2.126, + "step": 4298 + }, + { + "epoch": 0.32360413255800824, + "grad_norm": 3.8058364391326904, + "learning_rate": 7.912729107172479e-05, + "loss": 1.6943, + "step": 4299 + }, + { + "epoch": 0.3236794068386684, + "grad_norm": 3.583813190460205, + "learning_rate": 7.911738147576914e-05, + "loss": 2.1404, + "step": 4300 + }, + { + "epoch": 0.32375468111932854, + "grad_norm": 3.3333823680877686, + "learning_rate": 7.91074701488687e-05, + "loss": 1.9622, + "step": 4301 + }, + { + "epoch": 0.3238299553999887, + "grad_norm": 7.203909873962402, + "learning_rate": 7.90975570916127e-05, + "loss": 2.0116, + "step": 4302 + }, + { + "epoch": 0.3239052296806489, + "grad_norm": 4.942460060119629, + "learning_rate": 7.90876423045904e-05, + "loss": 1.9917, + "step": 4303 + }, + { + "epoch": 0.323980503961309, + "grad_norm": 5.483014106750488, + "learning_rate": 7.907772578839125e-05, + "loss": 1.832, + "step": 4304 + }, + { + "epoch": 0.3240557782419692, + "grad_norm": 4.343141555786133, + "learning_rate": 7.906780754360472e-05, + "loss": 1.7992, + "step": 4305 + }, + { + "epoch": 0.3241310525226293, + "grad_norm": 4.399363994598389, + "learning_rate": 7.905788757082044e-05, + "loss": 1.9644, + "step": 4306 + }, + { + "epoch": 0.3242063268032895, + "grad_norm": 4.299865245819092, + "learning_rate": 7.904796587062811e-05, + "loss": 2.067, + "step": 4307 + }, + { + "epoch": 0.32428160108394966, + "grad_norm": 4.725642204284668, + "learning_rate": 7.903804244361757e-05, + "loss": 1.9544, + "step": 4308 + }, + { + "epoch": 0.3243568753646098, + "grad_norm": 4.476677417755127, + "learning_rate": 7.902811729037873e-05, + "loss": 2.3473, + "step": 4309 + }, + { + "epoch": 0.32443214964526995, + "grad_norm": 3.550751209259033, + "learning_rate": 7.90181904115016e-05, + "loss": 1.7212, + "step": 4310 + }, + { + "epoch": 0.3245074239259301, + "grad_norm": 6.090052127838135, + "learning_rate": 7.900826180757632e-05, + "loss": 1.9578, + "step": 4311 + }, + { + "epoch": 0.32458269820659025, + "grad_norm": 4.012701511383057, + "learning_rate": 7.89983314791931e-05, + "loss": 1.9202, + "step": 4312 + }, + { + "epoch": 0.3246579724872504, + "grad_norm": 4.01508903503418, + "learning_rate": 7.898839942694228e-05, + "loss": 2.2097, + "step": 4313 + }, + { + "epoch": 0.3247332467679106, + "grad_norm": 3.751957654953003, + "learning_rate": 7.89784656514143e-05, + "loss": 1.7587, + "step": 4314 + }, + { + "epoch": 0.3248085210485707, + "grad_norm": 4.292129993438721, + "learning_rate": 7.896853015319967e-05, + "loss": 1.8522, + "step": 4315 + }, + { + "epoch": 0.3248837953292309, + "grad_norm": 6.028865814208984, + "learning_rate": 7.895859293288907e-05, + "loss": 2.0651, + "step": 4316 + }, + { + "epoch": 0.324959069609891, + "grad_norm": 5.337474346160889, + "learning_rate": 7.894865399107319e-05, + "loss": 2.1627, + "step": 4317 + }, + { + "epoch": 0.3250343438905512, + "grad_norm": 3.9330873489379883, + "learning_rate": 7.893871332834288e-05, + "loss": 1.8096, + "step": 4318 + }, + { + "epoch": 0.32510961817121137, + "grad_norm": 4.409187316894531, + "learning_rate": 7.892877094528912e-05, + "loss": 1.871, + "step": 4319 + }, + { + "epoch": 0.3251848924518715, + "grad_norm": 3.8573362827301025, + "learning_rate": 7.891882684250292e-05, + "loss": 2.2731, + "step": 4320 + }, + { + "epoch": 0.32526016673253166, + "grad_norm": 4.413033962249756, + "learning_rate": 7.890888102057545e-05, + "loss": 1.9311, + "step": 4321 + }, + { + "epoch": 0.32533544101319184, + "grad_norm": 5.094873428344727, + "learning_rate": 7.889893348009796e-05, + "loss": 1.8511, + "step": 4322 + }, + { + "epoch": 0.32541071529385196, + "grad_norm": 7.153104782104492, + "learning_rate": 7.888898422166178e-05, + "loss": 1.7481, + "step": 4323 + }, + { + "epoch": 0.32548598957451214, + "grad_norm": 11.435986518859863, + "learning_rate": 7.887903324585839e-05, + "loss": 2.1428, + "step": 4324 + }, + { + "epoch": 0.3255612638551723, + "grad_norm": 3.9848570823669434, + "learning_rate": 7.886908055327933e-05, + "loss": 1.8565, + "step": 4325 + }, + { + "epoch": 0.32563653813583243, + "grad_norm": 4.865833282470703, + "learning_rate": 7.885912614451627e-05, + "loss": 1.708, + "step": 4326 + }, + { + "epoch": 0.3257118124164926, + "grad_norm": 5.470982551574707, + "learning_rate": 7.884917002016097e-05, + "loss": 2.0026, + "step": 4327 + }, + { + "epoch": 0.32578708669715273, + "grad_norm": 3.812516927719116, + "learning_rate": 7.883921218080528e-05, + "loss": 2.2151, + "step": 4328 + }, + { + "epoch": 0.3258623609778129, + "grad_norm": 4.403374671936035, + "learning_rate": 7.882925262704118e-05, + "loss": 1.8582, + "step": 4329 + }, + { + "epoch": 0.3259376352584731, + "grad_norm": 5.941940784454346, + "learning_rate": 7.881929135946073e-05, + "loss": 2.3364, + "step": 4330 + }, + { + "epoch": 0.3260129095391332, + "grad_norm": 4.640542984008789, + "learning_rate": 7.88093283786561e-05, + "loss": 2.1626, + "step": 4331 + }, + { + "epoch": 0.3260881838197934, + "grad_norm": 4.178682804107666, + "learning_rate": 7.879936368521957e-05, + "loss": 2.0245, + "step": 4332 + }, + { + "epoch": 0.32616345810045355, + "grad_norm": 4.092398643493652, + "learning_rate": 7.87893972797435e-05, + "loss": 2.0029, + "step": 4333 + }, + { + "epoch": 0.32623873238111367, + "grad_norm": 4.372684955596924, + "learning_rate": 7.877942916282037e-05, + "loss": 2.2685, + "step": 4334 + }, + { + "epoch": 0.32631400666177385, + "grad_norm": 4.049747467041016, + "learning_rate": 7.876945933504274e-05, + "loss": 2.002, + "step": 4335 + }, + { + "epoch": 0.32638928094243397, + "grad_norm": 4.097887992858887, + "learning_rate": 7.875948779700331e-05, + "loss": 1.6293, + "step": 4336 + }, + { + "epoch": 0.32646455522309414, + "grad_norm": 4.2467851638793945, + "learning_rate": 7.874951454929484e-05, + "loss": 1.7134, + "step": 4337 + }, + { + "epoch": 0.3265398295037543, + "grad_norm": 5.490288734436035, + "learning_rate": 7.873953959251023e-05, + "loss": 1.8483, + "step": 4338 + }, + { + "epoch": 0.32661510378441444, + "grad_norm": 3.672646999359131, + "learning_rate": 7.872956292724247e-05, + "loss": 2.2833, + "step": 4339 + }, + { + "epoch": 0.3266903780650746, + "grad_norm": 5.37131404876709, + "learning_rate": 7.871958455408461e-05, + "loss": 1.9189, + "step": 4340 + }, + { + "epoch": 0.3267656523457348, + "grad_norm": 4.298294544219971, + "learning_rate": 7.870960447362986e-05, + "loss": 1.9717, + "step": 4341 + }, + { + "epoch": 0.3268409266263949, + "grad_norm": 4.142689228057861, + "learning_rate": 7.869962268647149e-05, + "loss": 1.9898, + "step": 4342 + }, + { + "epoch": 0.3269162009070551, + "grad_norm": 3.9931344985961914, + "learning_rate": 7.868963919320292e-05, + "loss": 2.1516, + "step": 4343 + }, + { + "epoch": 0.32699147518771526, + "grad_norm": 4.967349529266357, + "learning_rate": 7.867965399441762e-05, + "loss": 1.7839, + "step": 4344 + }, + { + "epoch": 0.3270667494683754, + "grad_norm": 6.360227584838867, + "learning_rate": 7.866966709070916e-05, + "loss": 1.8235, + "step": 4345 + }, + { + "epoch": 0.32714202374903556, + "grad_norm": 4.461812496185303, + "learning_rate": 7.865967848267126e-05, + "loss": 2.0104, + "step": 4346 + }, + { + "epoch": 0.3272172980296957, + "grad_norm": 4.522796154022217, + "learning_rate": 7.864968817089772e-05, + "loss": 2.2676, + "step": 4347 + }, + { + "epoch": 0.32729257231035586, + "grad_norm": 4.2580342292785645, + "learning_rate": 7.86396961559824e-05, + "loss": 2.2547, + "step": 4348 + }, + { + "epoch": 0.32736784659101603, + "grad_norm": 6.083259105682373, + "learning_rate": 7.862970243851935e-05, + "loss": 2.2956, + "step": 4349 + }, + { + "epoch": 0.32744312087167615, + "grad_norm": 4.294934272766113, + "learning_rate": 7.861970701910262e-05, + "loss": 1.9972, + "step": 4350 + }, + { + "epoch": 0.32751839515233633, + "grad_norm": 6.339199066162109, + "learning_rate": 7.860970989832644e-05, + "loss": 2.2726, + "step": 4351 + }, + { + "epoch": 0.3275936694329965, + "grad_norm": 4.928619861602783, + "learning_rate": 7.859971107678507e-05, + "loss": 1.7872, + "step": 4352 + }, + { + "epoch": 0.3276689437136566, + "grad_norm": 4.054974555969238, + "learning_rate": 7.858971055507295e-05, + "loss": 1.8862, + "step": 4353 + }, + { + "epoch": 0.3277442179943168, + "grad_norm": 6.087123870849609, + "learning_rate": 7.857970833378458e-05, + "loss": 2.017, + "step": 4354 + }, + { + "epoch": 0.3278194922749769, + "grad_norm": 5.132475852966309, + "learning_rate": 7.856970441351456e-05, + "loss": 1.9775, + "step": 4355 + }, + { + "epoch": 0.3278947665556371, + "grad_norm": 3.0807714462280273, + "learning_rate": 7.855969879485758e-05, + "loss": 1.9778, + "step": 4356 + }, + { + "epoch": 0.32797004083629727, + "grad_norm": 5.419703483581543, + "learning_rate": 7.854969147840845e-05, + "loss": 1.8723, + "step": 4357 + }, + { + "epoch": 0.3280453151169574, + "grad_norm": 7.973278522491455, + "learning_rate": 7.853968246476209e-05, + "loss": 2.0772, + "step": 4358 + }, + { + "epoch": 0.32812058939761757, + "grad_norm": 7.781949520111084, + "learning_rate": 7.852967175451349e-05, + "loss": 2.3391, + "step": 4359 + }, + { + "epoch": 0.32819586367827774, + "grad_norm": 5.441353797912598, + "learning_rate": 7.851965934825775e-05, + "loss": 1.9057, + "step": 4360 + }, + { + "epoch": 0.32827113795893786, + "grad_norm": 4.854582786560059, + "learning_rate": 7.850964524659013e-05, + "loss": 1.6827, + "step": 4361 + }, + { + "epoch": 0.32834641223959804, + "grad_norm": 5.159985542297363, + "learning_rate": 7.849962945010587e-05, + "loss": 1.8685, + "step": 4362 + }, + { + "epoch": 0.3284216865202582, + "grad_norm": 5.116022109985352, + "learning_rate": 7.848961195940043e-05, + "loss": 1.951, + "step": 4363 + }, + { + "epoch": 0.32849696080091834, + "grad_norm": 4.6052961349487305, + "learning_rate": 7.847959277506928e-05, + "loss": 1.9344, + "step": 4364 + }, + { + "epoch": 0.3285722350815785, + "grad_norm": 7.996417999267578, + "learning_rate": 7.846957189770809e-05, + "loss": 1.9771, + "step": 4365 + }, + { + "epoch": 0.32864750936223863, + "grad_norm": 6.167883396148682, + "learning_rate": 7.845954932791253e-05, + "loss": 1.7924, + "step": 4366 + }, + { + "epoch": 0.3287227836428988, + "grad_norm": 5.015079021453857, + "learning_rate": 7.84495250662784e-05, + "loss": 1.9587, + "step": 4367 + }, + { + "epoch": 0.328798057923559, + "grad_norm": 4.868743419647217, + "learning_rate": 7.843949911340168e-05, + "loss": 2.056, + "step": 4368 + }, + { + "epoch": 0.3288733322042191, + "grad_norm": 4.857298374176025, + "learning_rate": 7.842947146987829e-05, + "loss": 2.1993, + "step": 4369 + }, + { + "epoch": 0.3289486064848793, + "grad_norm": 5.768919467926025, + "learning_rate": 7.841944213630442e-05, + "loss": 1.8897, + "step": 4370 + }, + { + "epoch": 0.32902388076553946, + "grad_norm": 5.21314811706543, + "learning_rate": 7.840941111327626e-05, + "loss": 1.7522, + "step": 4371 + }, + { + "epoch": 0.3290991550461996, + "grad_norm": 4.84205436706543, + "learning_rate": 7.839937840139013e-05, + "loss": 2.7705, + "step": 4372 + }, + { + "epoch": 0.32917442932685975, + "grad_norm": 4.099056720733643, + "learning_rate": 7.838934400124242e-05, + "loss": 1.4868, + "step": 4373 + }, + { + "epoch": 0.32924970360751993, + "grad_norm": 5.3566575050354, + "learning_rate": 7.837930791342969e-05, + "loss": 2.1252, + "step": 4374 + }, + { + "epoch": 0.32932497788818005, + "grad_norm": 7.518743515014648, + "learning_rate": 7.836927013854853e-05, + "loss": 1.8127, + "step": 4375 + }, + { + "epoch": 0.3294002521688402, + "grad_norm": 6.118736743927002, + "learning_rate": 7.835923067719565e-05, + "loss": 1.6639, + "step": 4376 + }, + { + "epoch": 0.32947552644950034, + "grad_norm": 4.957034587860107, + "learning_rate": 7.834918952996788e-05, + "loss": 1.8331, + "step": 4377 + }, + { + "epoch": 0.3295508007301605, + "grad_norm": 4.866137981414795, + "learning_rate": 7.833914669746214e-05, + "loss": 2.2401, + "step": 4378 + }, + { + "epoch": 0.3296260750108207, + "grad_norm": 7.278197765350342, + "learning_rate": 7.832910218027545e-05, + "loss": 2.2516, + "step": 4379 + }, + { + "epoch": 0.3297013492914808, + "grad_norm": 3.633559465408325, + "learning_rate": 7.831905597900491e-05, + "loss": 2.0158, + "step": 4380 + }, + { + "epoch": 0.329776623572141, + "grad_norm": 4.701472282409668, + "learning_rate": 7.830900809424775e-05, + "loss": 2.0728, + "step": 4381 + }, + { + "epoch": 0.32985189785280117, + "grad_norm": 4.455605983734131, + "learning_rate": 7.82989585266013e-05, + "loss": 2.0341, + "step": 4382 + }, + { + "epoch": 0.3299271721334613, + "grad_norm": 4.972047805786133, + "learning_rate": 7.828890727666294e-05, + "loss": 1.9944, + "step": 4383 + }, + { + "epoch": 0.33000244641412146, + "grad_norm": 6.96929931640625, + "learning_rate": 7.827885434503021e-05, + "loss": 2.2406, + "step": 4384 + }, + { + "epoch": 0.3300777206947816, + "grad_norm": 4.37180757522583, + "learning_rate": 7.826879973230074e-05, + "loss": 1.949, + "step": 4385 + }, + { + "epoch": 0.33015299497544176, + "grad_norm": 4.380893230438232, + "learning_rate": 7.825874343907226e-05, + "loss": 2.113, + "step": 4386 + }, + { + "epoch": 0.33022826925610194, + "grad_norm": 5.326410293579102, + "learning_rate": 7.824868546594255e-05, + "loss": 1.7335, + "step": 4387 + }, + { + "epoch": 0.33030354353676206, + "grad_norm": 4.2704572677612305, + "learning_rate": 7.823862581350954e-05, + "loss": 2.1107, + "step": 4388 + }, + { + "epoch": 0.33037881781742223, + "grad_norm": 5.810431480407715, + "learning_rate": 7.822856448237126e-05, + "loss": 2.1861, + "step": 4389 + }, + { + "epoch": 0.3304540920980824, + "grad_norm": 6.7572526931762695, + "learning_rate": 7.821850147312579e-05, + "loss": 2.476, + "step": 4390 + }, + { + "epoch": 0.33052936637874253, + "grad_norm": 5.573050498962402, + "learning_rate": 7.82084367863714e-05, + "loss": 2.1025, + "step": 4391 + }, + { + "epoch": 0.3306046406594027, + "grad_norm": 4.264028549194336, + "learning_rate": 7.81983704227064e-05, + "loss": 1.7665, + "step": 4392 + }, + { + "epoch": 0.3306799149400629, + "grad_norm": 5.386504650115967, + "learning_rate": 7.818830238272916e-05, + "loss": 2.5614, + "step": 4393 + }, + { + "epoch": 0.330755189220723, + "grad_norm": 4.34447717666626, + "learning_rate": 7.817823266703823e-05, + "loss": 2.0942, + "step": 4394 + }, + { + "epoch": 0.3308304635013832, + "grad_norm": 4.112555027008057, + "learning_rate": 7.816816127623222e-05, + "loss": 2.0292, + "step": 4395 + }, + { + "epoch": 0.3309057377820433, + "grad_norm": 4.329293727874756, + "learning_rate": 7.815808821090986e-05, + "loss": 1.6239, + "step": 4396 + }, + { + "epoch": 0.3309810120627035, + "grad_norm": 4.057832717895508, + "learning_rate": 7.814801347166993e-05, + "loss": 2.0869, + "step": 4397 + }, + { + "epoch": 0.33105628634336365, + "grad_norm": 5.131999492645264, + "learning_rate": 7.813793705911138e-05, + "loss": 1.981, + "step": 4398 + }, + { + "epoch": 0.33113156062402377, + "grad_norm": 7.1158905029296875, + "learning_rate": 7.812785897383319e-05, + "loss": 2.1861, + "step": 4399 + }, + { + "epoch": 0.33120683490468394, + "grad_norm": 4.448178291320801, + "learning_rate": 7.811777921643451e-05, + "loss": 1.7959, + "step": 4400 + }, + { + "epoch": 0.3312821091853441, + "grad_norm": 5.340634822845459, + "learning_rate": 7.810769778751453e-05, + "loss": 2.0081, + "step": 4401 + }, + { + "epoch": 0.33135738346600424, + "grad_norm": 4.587032318115234, + "learning_rate": 7.809761468767256e-05, + "loss": 1.9775, + "step": 4402 + }, + { + "epoch": 0.3314326577466644, + "grad_norm": 7.372905731201172, + "learning_rate": 7.808752991750803e-05, + "loss": 2.443, + "step": 4403 + }, + { + "epoch": 0.33150793202732454, + "grad_norm": 5.082029342651367, + "learning_rate": 7.807744347762042e-05, + "loss": 1.8237, + "step": 4404 + }, + { + "epoch": 0.3315832063079847, + "grad_norm": 4.510185718536377, + "learning_rate": 7.806735536860939e-05, + "loss": 1.5911, + "step": 4405 + }, + { + "epoch": 0.3316584805886449, + "grad_norm": 4.006820201873779, + "learning_rate": 7.805726559107458e-05, + "loss": 1.7089, + "step": 4406 + }, + { + "epoch": 0.331733754869305, + "grad_norm": 4.016329765319824, + "learning_rate": 7.804717414561586e-05, + "loss": 1.7514, + "step": 4407 + }, + { + "epoch": 0.3318090291499652, + "grad_norm": 4.960211277008057, + "learning_rate": 7.803708103283311e-05, + "loss": 2.1126, + "step": 4408 + }, + { + "epoch": 0.33188430343062536, + "grad_norm": 5.330507278442383, + "learning_rate": 7.802698625332634e-05, + "loss": 2.1649, + "step": 4409 + }, + { + "epoch": 0.3319595777112855, + "grad_norm": 3.614654302597046, + "learning_rate": 7.801688980769564e-05, + "loss": 1.6986, + "step": 4410 + }, + { + "epoch": 0.33203485199194566, + "grad_norm": 3.831610679626465, + "learning_rate": 7.800679169654125e-05, + "loss": 2.0079, + "step": 4411 + }, + { + "epoch": 0.33211012627260583, + "grad_norm": 5.059773921966553, + "learning_rate": 7.799669192046345e-05, + "loss": 2.1007, + "step": 4412 + }, + { + "epoch": 0.33218540055326595, + "grad_norm": 3.5472607612609863, + "learning_rate": 7.798659048006265e-05, + "loss": 2.063, + "step": 4413 + }, + { + "epoch": 0.33226067483392613, + "grad_norm": 4.235814094543457, + "learning_rate": 7.797648737593934e-05, + "loss": 1.8019, + "step": 4414 + }, + { + "epoch": 0.33233594911458625, + "grad_norm": 4.2445268630981445, + "learning_rate": 7.796638260869412e-05, + "loss": 2.2093, + "step": 4415 + }, + { + "epoch": 0.3324112233952464, + "grad_norm": 3.7013628482818604, + "learning_rate": 7.79562761789277e-05, + "loss": 1.7717, + "step": 4416 + }, + { + "epoch": 0.3324864976759066, + "grad_norm": 4.648346424102783, + "learning_rate": 7.79461680872409e-05, + "loss": 1.9894, + "step": 4417 + }, + { + "epoch": 0.3325617719565667, + "grad_norm": 4.188632488250732, + "learning_rate": 7.793605833423459e-05, + "loss": 1.6559, + "step": 4418 + }, + { + "epoch": 0.3326370462372269, + "grad_norm": 3.7285044193267822, + "learning_rate": 7.792594692050975e-05, + "loss": 2.1474, + "step": 4419 + }, + { + "epoch": 0.3327123205178871, + "grad_norm": 5.108846664428711, + "learning_rate": 7.791583384666751e-05, + "loss": 2.1861, + "step": 4420 + }, + { + "epoch": 0.3327875947985472, + "grad_norm": 5.324336528778076, + "learning_rate": 7.790571911330904e-05, + "loss": 2.0839, + "step": 4421 + }, + { + "epoch": 0.33286286907920737, + "grad_norm": 4.901613235473633, + "learning_rate": 7.789560272103564e-05, + "loss": 1.6584, + "step": 4422 + }, + { + "epoch": 0.33293814335986754, + "grad_norm": 4.398468971252441, + "learning_rate": 7.788548467044869e-05, + "loss": 1.9581, + "step": 4423 + }, + { + "epoch": 0.33301341764052766, + "grad_norm": 5.463700294494629, + "learning_rate": 7.787536496214972e-05, + "loss": 2.1776, + "step": 4424 + }, + { + "epoch": 0.33308869192118784, + "grad_norm": 7.23293399810791, + "learning_rate": 7.786524359674026e-05, + "loss": 1.9, + "step": 4425 + }, + { + "epoch": 0.33316396620184796, + "grad_norm": 5.656863212585449, + "learning_rate": 7.785512057482202e-05, + "loss": 2.1969, + "step": 4426 + }, + { + "epoch": 0.33323924048250814, + "grad_norm": 5.028671741485596, + "learning_rate": 7.784499589699681e-05, + "loss": 2.4658, + "step": 4427 + }, + { + "epoch": 0.3333145147631683, + "grad_norm": 4.938046932220459, + "learning_rate": 7.783486956386647e-05, + "loss": 2.026, + "step": 4428 + }, + { + "epoch": 0.33338978904382843, + "grad_norm": 5.654271602630615, + "learning_rate": 7.782474157603301e-05, + "loss": 2.2582, + "step": 4429 + }, + { + "epoch": 0.3334650633244886, + "grad_norm": 5.342366695404053, + "learning_rate": 7.78146119340985e-05, + "loss": 2.1101, + "step": 4430 + }, + { + "epoch": 0.3335403376051488, + "grad_norm": 6.084756374359131, + "learning_rate": 7.780448063866511e-05, + "loss": 2.4105, + "step": 4431 + }, + { + "epoch": 0.3336156118858089, + "grad_norm": 3.4337925910949707, + "learning_rate": 7.779434769033514e-05, + "loss": 1.8763, + "step": 4432 + }, + { + "epoch": 0.3336908861664691, + "grad_norm": 5.285945892333984, + "learning_rate": 7.778421308971093e-05, + "loss": 1.9216, + "step": 4433 + }, + { + "epoch": 0.3337661604471292, + "grad_norm": 4.035688400268555, + "learning_rate": 7.7774076837395e-05, + "loss": 1.9087, + "step": 4434 + }, + { + "epoch": 0.3338414347277894, + "grad_norm": 4.876992225646973, + "learning_rate": 7.776393893398988e-05, + "loss": 2.0467, + "step": 4435 + }, + { + "epoch": 0.33391670900844955, + "grad_norm": 4.6616058349609375, + "learning_rate": 7.775379938009826e-05, + "loss": 2.2517, + "step": 4436 + }, + { + "epoch": 0.3339919832891097, + "grad_norm": 4.198055267333984, + "learning_rate": 7.77436581763229e-05, + "loss": 1.5766, + "step": 4437 + }, + { + "epoch": 0.33406725756976985, + "grad_norm": 4.089532852172852, + "learning_rate": 7.773351532326667e-05, + "loss": 2.0709, + "step": 4438 + }, + { + "epoch": 0.33414253185043, + "grad_norm": 4.617081642150879, + "learning_rate": 7.772337082153253e-05, + "loss": 1.5901, + "step": 4439 + }, + { + "epoch": 0.33421780613109014, + "grad_norm": 4.185454368591309, + "learning_rate": 7.771322467172353e-05, + "loss": 1.9904, + "step": 4440 + }, + { + "epoch": 0.3342930804117503, + "grad_norm": 5.368082046508789, + "learning_rate": 7.770307687444284e-05, + "loss": 1.6603, + "step": 4441 + }, + { + "epoch": 0.3343683546924105, + "grad_norm": 4.917660236358643, + "learning_rate": 7.769292743029374e-05, + "loss": 1.9464, + "step": 4442 + }, + { + "epoch": 0.3344436289730706, + "grad_norm": 5.311621189117432, + "learning_rate": 7.768277633987955e-05, + "loss": 2.1194, + "step": 4443 + }, + { + "epoch": 0.3345189032537308, + "grad_norm": 4.397782325744629, + "learning_rate": 7.767262360380374e-05, + "loss": 2.0917, + "step": 4444 + }, + { + "epoch": 0.3345941775343909, + "grad_norm": 4.391879081726074, + "learning_rate": 7.766246922266986e-05, + "loss": 2.024, + "step": 4445 + }, + { + "epoch": 0.3346694518150511, + "grad_norm": 4.531358242034912, + "learning_rate": 7.765231319708156e-05, + "loss": 2.1964, + "step": 4446 + }, + { + "epoch": 0.33474472609571126, + "grad_norm": 4.115760326385498, + "learning_rate": 7.764215552764258e-05, + "loss": 2.0223, + "step": 4447 + }, + { + "epoch": 0.3348200003763714, + "grad_norm": 4.948193073272705, + "learning_rate": 7.763199621495676e-05, + "loss": 2.297, + "step": 4448 + }, + { + "epoch": 0.33489527465703156, + "grad_norm": 5.160480499267578, + "learning_rate": 7.762183525962804e-05, + "loss": 1.8223, + "step": 4449 + }, + { + "epoch": 0.33497054893769174, + "grad_norm": 4.777421951293945, + "learning_rate": 7.76116726622605e-05, + "loss": 1.8727, + "step": 4450 + }, + { + "epoch": 0.33504582321835186, + "grad_norm": 5.0138983726501465, + "learning_rate": 7.760150842345823e-05, + "loss": 2.3869, + "step": 4451 + }, + { + "epoch": 0.33512109749901203, + "grad_norm": 5.5878071784973145, + "learning_rate": 7.75913425438255e-05, + "loss": 2.0247, + "step": 4452 + }, + { + "epoch": 0.33519637177967215, + "grad_norm": 4.397315502166748, + "learning_rate": 7.758117502396661e-05, + "loss": 2.0419, + "step": 4453 + }, + { + "epoch": 0.33527164606033233, + "grad_norm": 7.074615478515625, + "learning_rate": 7.7571005864486e-05, + "loss": 2.1746, + "step": 4454 + }, + { + "epoch": 0.3353469203409925, + "grad_norm": 4.192893981933594, + "learning_rate": 7.756083506598821e-05, + "loss": 1.9827, + "step": 4455 + }, + { + "epoch": 0.3354221946216526, + "grad_norm": 5.783407688140869, + "learning_rate": 7.755066262907786e-05, + "loss": 2.0694, + "step": 4456 + }, + { + "epoch": 0.3354974689023128, + "grad_norm": 4.581982135772705, + "learning_rate": 7.754048855435965e-05, + "loss": 2.1816, + "step": 4457 + }, + { + "epoch": 0.335572743182973, + "grad_norm": 3.9422690868377686, + "learning_rate": 7.753031284243844e-05, + "loss": 2.0383, + "step": 4458 + }, + { + "epoch": 0.3356480174636331, + "grad_norm": 5.631012439727783, + "learning_rate": 7.75201354939191e-05, + "loss": 1.934, + "step": 4459 + }, + { + "epoch": 0.3357232917442933, + "grad_norm": 5.207779884338379, + "learning_rate": 7.750995650940669e-05, + "loss": 1.761, + "step": 4460 + }, + { + "epoch": 0.33579856602495345, + "grad_norm": 5.82303524017334, + "learning_rate": 7.74997758895063e-05, + "loss": 2.1199, + "step": 4461 + }, + { + "epoch": 0.33587384030561357, + "grad_norm": 4.269281387329102, + "learning_rate": 7.748959363482313e-05, + "loss": 2.0459, + "step": 4462 + }, + { + "epoch": 0.33594911458627374, + "grad_norm": 5.640504360198975, + "learning_rate": 7.747940974596248e-05, + "loss": 2.4645, + "step": 4463 + }, + { + "epoch": 0.33602438886693387, + "grad_norm": 6.266437530517578, + "learning_rate": 7.746922422352978e-05, + "loss": 2.3986, + "step": 4464 + }, + { + "epoch": 0.33609966314759404, + "grad_norm": 4.637019634246826, + "learning_rate": 7.745903706813054e-05, + "loss": 1.8858, + "step": 4465 + }, + { + "epoch": 0.3361749374282542, + "grad_norm": 3.3513193130493164, + "learning_rate": 7.744884828037031e-05, + "loss": 2.1596, + "step": 4466 + }, + { + "epoch": 0.33625021170891434, + "grad_norm": 4.358898162841797, + "learning_rate": 7.743865786085482e-05, + "loss": 1.942, + "step": 4467 + }, + { + "epoch": 0.3363254859895745, + "grad_norm": 4.427700996398926, + "learning_rate": 7.742846581018985e-05, + "loss": 1.6143, + "step": 4468 + }, + { + "epoch": 0.3364007602702347, + "grad_norm": 4.402661323547363, + "learning_rate": 7.741827212898128e-05, + "loss": 2.195, + "step": 4469 + }, + { + "epoch": 0.3364760345508948, + "grad_norm": 4.933595180511475, + "learning_rate": 7.740807681783511e-05, + "loss": 1.712, + "step": 4470 + }, + { + "epoch": 0.336551308831555, + "grad_norm": 3.9877562522888184, + "learning_rate": 7.739787987735742e-05, + "loss": 1.8227, + "step": 4471 + }, + { + "epoch": 0.33662658311221516, + "grad_norm": 4.576951026916504, + "learning_rate": 7.73876813081544e-05, + "loss": 2.0839, + "step": 4472 + }, + { + "epoch": 0.3367018573928753, + "grad_norm": 4.622626781463623, + "learning_rate": 7.737748111083227e-05, + "loss": 1.891, + "step": 4473 + }, + { + "epoch": 0.33677713167353546, + "grad_norm": 5.091668605804443, + "learning_rate": 7.736727928599748e-05, + "loss": 1.788, + "step": 4474 + }, + { + "epoch": 0.3368524059541956, + "grad_norm": 8.14338207244873, + "learning_rate": 7.735707583425643e-05, + "loss": 2.1729, + "step": 4475 + }, + { + "epoch": 0.33692768023485575, + "grad_norm": 6.775550365447998, + "learning_rate": 7.734687075621574e-05, + "loss": 2.3048, + "step": 4476 + }, + { + "epoch": 0.33700295451551593, + "grad_norm": 5.820411205291748, + "learning_rate": 7.733666405248205e-05, + "loss": 1.712, + "step": 4477 + }, + { + "epoch": 0.33707822879617605, + "grad_norm": 5.631104946136475, + "learning_rate": 7.73264557236621e-05, + "loss": 1.9128, + "step": 4478 + }, + { + "epoch": 0.3371535030768362, + "grad_norm": 4.196645736694336, + "learning_rate": 7.731624577036278e-05, + "loss": 2.6356, + "step": 4479 + }, + { + "epoch": 0.3372287773574964, + "grad_norm": 4.481573581695557, + "learning_rate": 7.730603419319103e-05, + "loss": 2.7323, + "step": 4480 + }, + { + "epoch": 0.3373040516381565, + "grad_norm": 5.715769290924072, + "learning_rate": 7.72958209927539e-05, + "loss": 2.0297, + "step": 4481 + }, + { + "epoch": 0.3373793259188167, + "grad_norm": 4.2328715324401855, + "learning_rate": 7.728560616965852e-05, + "loss": 2.0633, + "step": 4482 + }, + { + "epoch": 0.3374546001994768, + "grad_norm": 7.679381847381592, + "learning_rate": 7.727538972451216e-05, + "loss": 2.1336, + "step": 4483 + }, + { + "epoch": 0.337529874480137, + "grad_norm": 4.048801422119141, + "learning_rate": 7.726517165792213e-05, + "loss": 1.7317, + "step": 4484 + }, + { + "epoch": 0.33760514876079717, + "grad_norm": 4.568599700927734, + "learning_rate": 7.725495197049588e-05, + "loss": 1.8173, + "step": 4485 + }, + { + "epoch": 0.3376804230414573, + "grad_norm": 4.221613883972168, + "learning_rate": 7.724473066284095e-05, + "loss": 2.1084, + "step": 4486 + }, + { + "epoch": 0.33775569732211747, + "grad_norm": 4.320460796356201, + "learning_rate": 7.723450773556494e-05, + "loss": 2.2091, + "step": 4487 + }, + { + "epoch": 0.33783097160277764, + "grad_norm": 4.902789115905762, + "learning_rate": 7.72242831892756e-05, + "loss": 1.7225, + "step": 4488 + }, + { + "epoch": 0.33790624588343776, + "grad_norm": 5.744263648986816, + "learning_rate": 7.721405702458073e-05, + "loss": 2.0202, + "step": 4489 + }, + { + "epoch": 0.33798152016409794, + "grad_norm": 3.8968923091888428, + "learning_rate": 7.720382924208827e-05, + "loss": 1.7443, + "step": 4490 + }, + { + "epoch": 0.3380567944447581, + "grad_norm": 7.999432563781738, + "learning_rate": 7.719359984240619e-05, + "loss": 2.011, + "step": 4491 + }, + { + "epoch": 0.33813206872541823, + "grad_norm": 6.596442699432373, + "learning_rate": 7.718336882614265e-05, + "loss": 1.8791, + "step": 4492 + }, + { + "epoch": 0.3382073430060784, + "grad_norm": 4.972760200500488, + "learning_rate": 7.717313619390583e-05, + "loss": 1.9773, + "step": 4493 + }, + { + "epoch": 0.33828261728673853, + "grad_norm": 3.9295780658721924, + "learning_rate": 7.716290194630404e-05, + "loss": 1.7177, + "step": 4494 + }, + { + "epoch": 0.3383578915673987, + "grad_norm": 3.8098621368408203, + "learning_rate": 7.715266608394565e-05, + "loss": 2.3969, + "step": 4495 + }, + { + "epoch": 0.3384331658480589, + "grad_norm": 3.7683727741241455, + "learning_rate": 7.714242860743916e-05, + "loss": 2.2652, + "step": 4496 + }, + { + "epoch": 0.338508440128719, + "grad_norm": 3.973531484603882, + "learning_rate": 7.71321895173932e-05, + "loss": 2.1398, + "step": 4497 + }, + { + "epoch": 0.3385837144093792, + "grad_norm": 5.437339782714844, + "learning_rate": 7.712194881441641e-05, + "loss": 1.7838, + "step": 4498 + }, + { + "epoch": 0.33865898869003935, + "grad_norm": 4.454009056091309, + "learning_rate": 7.711170649911759e-05, + "loss": 1.8643, + "step": 4499 + }, + { + "epoch": 0.3387342629706995, + "grad_norm": 5.164426803588867, + "learning_rate": 7.710146257210561e-05, + "loss": 2.2005, + "step": 4500 + }, + { + "epoch": 0.33880953725135965, + "grad_norm": 4.7237677574157715, + "learning_rate": 7.709121703398944e-05, + "loss": 2.0712, + "step": 4501 + }, + { + "epoch": 0.33888481153201977, + "grad_norm": 4.242769241333008, + "learning_rate": 7.708096988537815e-05, + "loss": 2.41, + "step": 4502 + }, + { + "epoch": 0.33896008581267995, + "grad_norm": 4.900112152099609, + "learning_rate": 7.707072112688091e-05, + "loss": 2.1935, + "step": 4503 + }, + { + "epoch": 0.3390353600933401, + "grad_norm": 3.766782760620117, + "learning_rate": 7.706047075910696e-05, + "loss": 2.0112, + "step": 4504 + }, + { + "epoch": 0.33911063437400024, + "grad_norm": 5.16675329208374, + "learning_rate": 7.705021878266568e-05, + "loss": 2.171, + "step": 4505 + }, + { + "epoch": 0.3391859086546604, + "grad_norm": 5.00575065612793, + "learning_rate": 7.703996519816651e-05, + "loss": 2.4108, + "step": 4506 + }, + { + "epoch": 0.3392611829353206, + "grad_norm": 4.930896759033203, + "learning_rate": 7.702971000621899e-05, + "loss": 1.863, + "step": 4507 + }, + { + "epoch": 0.3393364572159807, + "grad_norm": 4.462894439697266, + "learning_rate": 7.701945320743278e-05, + "loss": 1.5271, + "step": 4508 + }, + { + "epoch": 0.3394117314966409, + "grad_norm": 5.848621368408203, + "learning_rate": 7.70091948024176e-05, + "loss": 2.0097, + "step": 4509 + }, + { + "epoch": 0.33948700577730107, + "grad_norm": 4.751779556274414, + "learning_rate": 7.699893479178326e-05, + "loss": 1.9456, + "step": 4510 + }, + { + "epoch": 0.3395622800579612, + "grad_norm": 4.161238670349121, + "learning_rate": 7.698867317613974e-05, + "loss": 2.0393, + "step": 4511 + }, + { + "epoch": 0.33963755433862136, + "grad_norm": 4.925955295562744, + "learning_rate": 7.697840995609703e-05, + "loss": 1.5402, + "step": 4512 + }, + { + "epoch": 0.3397128286192815, + "grad_norm": 3.698184013366699, + "learning_rate": 7.696814513226527e-05, + "loss": 2.2706, + "step": 4513 + }, + { + "epoch": 0.33978810289994166, + "grad_norm": 3.8912553787231445, + "learning_rate": 7.695787870525465e-05, + "loss": 1.8541, + "step": 4514 + }, + { + "epoch": 0.33986337718060183, + "grad_norm": 5.740266799926758, + "learning_rate": 7.694761067567551e-05, + "loss": 1.9408, + "step": 4515 + }, + { + "epoch": 0.33993865146126195, + "grad_norm": 4.267884731292725, + "learning_rate": 7.693734104413821e-05, + "loss": 2.411, + "step": 4516 + }, + { + "epoch": 0.34001392574192213, + "grad_norm": 3.908961534500122, + "learning_rate": 7.692706981125329e-05, + "loss": 2.0016, + "step": 4517 + }, + { + "epoch": 0.3400892000225823, + "grad_norm": 7.236872673034668, + "learning_rate": 7.691679697763133e-05, + "loss": 2.0333, + "step": 4518 + }, + { + "epoch": 0.3401644743032424, + "grad_norm": 4.250300884246826, + "learning_rate": 7.690652254388303e-05, + "loss": 1.7263, + "step": 4519 + }, + { + "epoch": 0.3402397485839026, + "grad_norm": 3.9014570713043213, + "learning_rate": 7.689624651061916e-05, + "loss": 2.0795, + "step": 4520 + }, + { + "epoch": 0.3403150228645628, + "grad_norm": 4.842888355255127, + "learning_rate": 7.688596887845062e-05, + "loss": 1.895, + "step": 4521 + }, + { + "epoch": 0.3403902971452229, + "grad_norm": 4.448963165283203, + "learning_rate": 7.687568964798836e-05, + "loss": 2.1164, + "step": 4522 + }, + { + "epoch": 0.3404655714258831, + "grad_norm": 5.541933536529541, + "learning_rate": 7.686540881984347e-05, + "loss": 1.9672, + "step": 4523 + }, + { + "epoch": 0.3405408457065432, + "grad_norm": 3.8231594562530518, + "learning_rate": 7.685512639462711e-05, + "loss": 1.9295, + "step": 4524 + }, + { + "epoch": 0.34061611998720337, + "grad_norm": 4.114373683929443, + "learning_rate": 7.684484237295055e-05, + "loss": 1.8997, + "step": 4525 + }, + { + "epoch": 0.34069139426786355, + "grad_norm": 3.881075143814087, + "learning_rate": 7.683455675542515e-05, + "loss": 2.0707, + "step": 4526 + }, + { + "epoch": 0.34076666854852367, + "grad_norm": 3.963350772857666, + "learning_rate": 7.682426954266231e-05, + "loss": 1.9434, + "step": 4527 + }, + { + "epoch": 0.34084194282918384, + "grad_norm": 5.266693592071533, + "learning_rate": 7.681398073527364e-05, + "loss": 1.7345, + "step": 4528 + }, + { + "epoch": 0.340917217109844, + "grad_norm": 4.409034252166748, + "learning_rate": 7.680369033387073e-05, + "loss": 1.6433, + "step": 4529 + }, + { + "epoch": 0.34099249139050414, + "grad_norm": 4.036330223083496, + "learning_rate": 7.679339833906537e-05, + "loss": 2.1143, + "step": 4530 + }, + { + "epoch": 0.3410677656711643, + "grad_norm": 4.3933916091918945, + "learning_rate": 7.678310475146935e-05, + "loss": 2.0351, + "step": 4531 + }, + { + "epoch": 0.34114303995182443, + "grad_norm": 4.535825252532959, + "learning_rate": 7.67728095716946e-05, + "loss": 1.6391, + "step": 4532 + }, + { + "epoch": 0.3412183142324846, + "grad_norm": 4.306824207305908, + "learning_rate": 7.676251280035312e-05, + "loss": 2.0512, + "step": 4533 + }, + { + "epoch": 0.3412935885131448, + "grad_norm": 4.810033321380615, + "learning_rate": 7.675221443805706e-05, + "loss": 2.2103, + "step": 4534 + }, + { + "epoch": 0.3413688627938049, + "grad_norm": 5.2636799812316895, + "learning_rate": 7.674191448541861e-05, + "loss": 2.0217, + "step": 4535 + }, + { + "epoch": 0.3414441370744651, + "grad_norm": 5.04922342300415, + "learning_rate": 7.673161294305008e-05, + "loss": 1.7624, + "step": 4536 + }, + { + "epoch": 0.34151941135512526, + "grad_norm": 3.822277069091797, + "learning_rate": 7.672130981156387e-05, + "loss": 1.8217, + "step": 4537 + }, + { + "epoch": 0.3415946856357854, + "grad_norm": 4.553699970245361, + "learning_rate": 7.671100509157243e-05, + "loss": 2.2154, + "step": 4538 + }, + { + "epoch": 0.34166995991644555, + "grad_norm": 5.175565719604492, + "learning_rate": 7.670069878368842e-05, + "loss": 1.622, + "step": 4539 + }, + { + "epoch": 0.34174523419710573, + "grad_norm": 4.373694896697998, + "learning_rate": 7.669039088852446e-05, + "loss": 1.7503, + "step": 4540 + }, + { + "epoch": 0.34182050847776585, + "grad_norm": 5.188920974731445, + "learning_rate": 7.668008140669335e-05, + "loss": 2.0285, + "step": 4541 + }, + { + "epoch": 0.341895782758426, + "grad_norm": 4.475624084472656, + "learning_rate": 7.666977033880795e-05, + "loss": 2.1663, + "step": 4542 + }, + { + "epoch": 0.34197105703908615, + "grad_norm": 4.608626842498779, + "learning_rate": 7.665945768548122e-05, + "loss": 1.8493, + "step": 4543 + }, + { + "epoch": 0.3420463313197463, + "grad_norm": 4.155726432800293, + "learning_rate": 7.664914344732623e-05, + "loss": 1.9544, + "step": 4544 + }, + { + "epoch": 0.3421216056004065, + "grad_norm": 6.454883575439453, + "learning_rate": 7.663882762495614e-05, + "loss": 1.9329, + "step": 4545 + }, + { + "epoch": 0.3421968798810666, + "grad_norm": 5.580347061157227, + "learning_rate": 7.662851021898417e-05, + "loss": 1.8874, + "step": 4546 + }, + { + "epoch": 0.3422721541617268, + "grad_norm": 5.288273334503174, + "learning_rate": 7.661819123002367e-05, + "loss": 1.9504, + "step": 4547 + }, + { + "epoch": 0.34234742844238697, + "grad_norm": 3.8523223400115967, + "learning_rate": 7.66078706586881e-05, + "loss": 2.0412, + "step": 4548 + }, + { + "epoch": 0.3424227027230471, + "grad_norm": 4.138383388519287, + "learning_rate": 7.659754850559095e-05, + "loss": 1.9187, + "step": 4549 + }, + { + "epoch": 0.34249797700370727, + "grad_norm": 4.0213446617126465, + "learning_rate": 7.658722477134585e-05, + "loss": 1.887, + "step": 4550 + }, + { + "epoch": 0.3425732512843674, + "grad_norm": 4.166807651519775, + "learning_rate": 7.657689945656654e-05, + "loss": 1.8533, + "step": 4551 + }, + { + "epoch": 0.34264852556502756, + "grad_norm": 3.826023578643799, + "learning_rate": 7.656657256186681e-05, + "loss": 2.1984, + "step": 4552 + }, + { + "epoch": 0.34272379984568774, + "grad_norm": 6.021687984466553, + "learning_rate": 7.655624408786058e-05, + "loss": 2.2327, + "step": 4553 + }, + { + "epoch": 0.34279907412634786, + "grad_norm": 6.18969202041626, + "learning_rate": 7.654591403516183e-05, + "loss": 2.341, + "step": 4554 + }, + { + "epoch": 0.34287434840700803, + "grad_norm": 4.900983810424805, + "learning_rate": 7.653558240438463e-05, + "loss": 2.1054, + "step": 4555 + }, + { + "epoch": 0.3429496226876682, + "grad_norm": 3.8235034942626953, + "learning_rate": 7.652524919614323e-05, + "loss": 1.8667, + "step": 4556 + }, + { + "epoch": 0.34302489696832833, + "grad_norm": 4.521095275878906, + "learning_rate": 7.651491441105188e-05, + "loss": 1.9447, + "step": 4557 + }, + { + "epoch": 0.3431001712489885, + "grad_norm": 4.181612014770508, + "learning_rate": 7.650457804972493e-05, + "loss": 1.8305, + "step": 4558 + }, + { + "epoch": 0.3431754455296487, + "grad_norm": 3.963656425476074, + "learning_rate": 7.649424011277686e-05, + "loss": 1.9965, + "step": 4559 + }, + { + "epoch": 0.3432507198103088, + "grad_norm": 4.6136345863342285, + "learning_rate": 7.648390060082225e-05, + "loss": 2.0132, + "step": 4560 + }, + { + "epoch": 0.343325994090969, + "grad_norm": 5.049828052520752, + "learning_rate": 7.647355951447572e-05, + "loss": 2.218, + "step": 4561 + }, + { + "epoch": 0.3434012683716291, + "grad_norm": 4.716050624847412, + "learning_rate": 7.646321685435205e-05, + "loss": 1.9644, + "step": 4562 + }, + { + "epoch": 0.3434765426522893, + "grad_norm": 3.6872105598449707, + "learning_rate": 7.645287262106607e-05, + "loss": 1.9582, + "step": 4563 + }, + { + "epoch": 0.34355181693294945, + "grad_norm": 4.714653491973877, + "learning_rate": 7.644252681523271e-05, + "loss": 2.0582, + "step": 4564 + }, + { + "epoch": 0.34362709121360957, + "grad_norm": 4.092118263244629, + "learning_rate": 7.643217943746703e-05, + "loss": 1.9243, + "step": 4565 + }, + { + "epoch": 0.34370236549426975, + "grad_norm": 5.0258612632751465, + "learning_rate": 7.64218304883841e-05, + "loss": 2.0855, + "step": 4566 + }, + { + "epoch": 0.3437776397749299, + "grad_norm": 3.448729991912842, + "learning_rate": 7.641147996859916e-05, + "loss": 2.1726, + "step": 4567 + }, + { + "epoch": 0.34385291405559004, + "grad_norm": 5.362695217132568, + "learning_rate": 7.640112787872753e-05, + "loss": 1.9428, + "step": 4568 + }, + { + "epoch": 0.3439281883362502, + "grad_norm": 3.757882595062256, + "learning_rate": 7.639077421938459e-05, + "loss": 2.0626, + "step": 4569 + }, + { + "epoch": 0.3440034626169104, + "grad_norm": 3.0293984413146973, + "learning_rate": 7.638041899118586e-05, + "loss": 1.999, + "step": 4570 + }, + { + "epoch": 0.3440787368975705, + "grad_norm": 4.279021739959717, + "learning_rate": 7.63700621947469e-05, + "loss": 2.1412, + "step": 4571 + }, + { + "epoch": 0.3441540111782307, + "grad_norm": 3.9731411933898926, + "learning_rate": 7.635970383068341e-05, + "loss": 1.8495, + "step": 4572 + }, + { + "epoch": 0.3442292854588908, + "grad_norm": 5.1099467277526855, + "learning_rate": 7.634934389961116e-05, + "loss": 2.0807, + "step": 4573 + }, + { + "epoch": 0.344304559739551, + "grad_norm": 5.556121349334717, + "learning_rate": 7.633898240214603e-05, + "loss": 2.0429, + "step": 4574 + }, + { + "epoch": 0.34437983402021116, + "grad_norm": 4.677093982696533, + "learning_rate": 7.632861933890397e-05, + "loss": 2.24, + "step": 4575 + }, + { + "epoch": 0.3444551083008713, + "grad_norm": 3.9654626846313477, + "learning_rate": 7.631825471050102e-05, + "loss": 1.8898, + "step": 4576 + }, + { + "epoch": 0.34453038258153146, + "grad_norm": 4.97780704498291, + "learning_rate": 7.630788851755336e-05, + "loss": 1.795, + "step": 4577 + }, + { + "epoch": 0.34460565686219163, + "grad_norm": 5.8871941566467285, + "learning_rate": 7.62975207606772e-05, + "loss": 2.2431, + "step": 4578 + }, + { + "epoch": 0.34468093114285175, + "grad_norm": 5.159177303314209, + "learning_rate": 7.628715144048888e-05, + "loss": 1.9894, + "step": 4579 + }, + { + "epoch": 0.34475620542351193, + "grad_norm": 5.730270862579346, + "learning_rate": 7.627678055760485e-05, + "loss": 1.8643, + "step": 4580 + }, + { + "epoch": 0.34483147970417205, + "grad_norm": 4.280642032623291, + "learning_rate": 7.626640811264161e-05, + "loss": 1.9696, + "step": 4581 + }, + { + "epoch": 0.3449067539848322, + "grad_norm": 6.338525295257568, + "learning_rate": 7.625603410621577e-05, + "loss": 1.7777, + "step": 4582 + }, + { + "epoch": 0.3449820282654924, + "grad_norm": 5.291580677032471, + "learning_rate": 7.624565853894405e-05, + "loss": 2.5501, + "step": 4583 + }, + { + "epoch": 0.3450573025461525, + "grad_norm": 4.285087585449219, + "learning_rate": 7.623528141144322e-05, + "loss": 1.9375, + "step": 4584 + }, + { + "epoch": 0.3451325768268127, + "grad_norm": 4.013237476348877, + "learning_rate": 7.62249027243302e-05, + "loss": 1.7449, + "step": 4585 + }, + { + "epoch": 0.3452078511074729, + "grad_norm": 4.898282051086426, + "learning_rate": 7.621452247822197e-05, + "loss": 1.9462, + "step": 4586 + }, + { + "epoch": 0.345283125388133, + "grad_norm": 5.8522047996521, + "learning_rate": 7.620414067373558e-05, + "loss": 2.1401, + "step": 4587 + }, + { + "epoch": 0.34535839966879317, + "grad_norm": 4.723256587982178, + "learning_rate": 7.619375731148823e-05, + "loss": 1.6346, + "step": 4588 + }, + { + "epoch": 0.34543367394945335, + "grad_norm": 4.570127010345459, + "learning_rate": 7.618337239209715e-05, + "loss": 2.1056, + "step": 4589 + }, + { + "epoch": 0.34550894823011347, + "grad_norm": 6.95283842086792, + "learning_rate": 7.617298591617971e-05, + "loss": 2.4376, + "step": 4590 + }, + { + "epoch": 0.34558422251077364, + "grad_norm": 4.839108467102051, + "learning_rate": 7.616259788435337e-05, + "loss": 1.8667, + "step": 4591 + }, + { + "epoch": 0.34565949679143376, + "grad_norm": 4.085836887359619, + "learning_rate": 7.615220829723563e-05, + "loss": 1.8209, + "step": 4592 + }, + { + "epoch": 0.34573477107209394, + "grad_norm": 5.696686267852783, + "learning_rate": 7.614181715544417e-05, + "loss": 2.0996, + "step": 4593 + }, + { + "epoch": 0.3458100453527541, + "grad_norm": 6.192338943481445, + "learning_rate": 7.613142445959668e-05, + "loss": 2.3157, + "step": 4594 + }, + { + "epoch": 0.34588531963341423, + "grad_norm": 5.837879180908203, + "learning_rate": 7.612103021031099e-05, + "loss": 1.8426, + "step": 4595 + }, + { + "epoch": 0.3459605939140744, + "grad_norm": 4.4337477684021, + "learning_rate": 7.611063440820501e-05, + "loss": 2.2347, + "step": 4596 + }, + { + "epoch": 0.3460358681947346, + "grad_norm": 5.813669204711914, + "learning_rate": 7.610023705389673e-05, + "loss": 1.9623, + "step": 4597 + }, + { + "epoch": 0.3461111424753947, + "grad_norm": 4.502193450927734, + "learning_rate": 7.608983814800423e-05, + "loss": 1.943, + "step": 4598 + }, + { + "epoch": 0.3461864167560549, + "grad_norm": 5.070720195770264, + "learning_rate": 7.607943769114574e-05, + "loss": 2.0809, + "step": 4599 + }, + { + "epoch": 0.34626169103671506, + "grad_norm": 4.427411079406738, + "learning_rate": 7.606903568393948e-05, + "loss": 2.1432, + "step": 4600 + }, + { + "epoch": 0.3463369653173752, + "grad_norm": 4.621142387390137, + "learning_rate": 7.605863212700385e-05, + "loss": 1.5111, + "step": 4601 + }, + { + "epoch": 0.34641223959803535, + "grad_norm": 3.4134957790374756, + "learning_rate": 7.604822702095735e-05, + "loss": 1.7918, + "step": 4602 + }, + { + "epoch": 0.3464875138786955, + "grad_norm": 4.904789924621582, + "learning_rate": 7.603782036641846e-05, + "loss": 2.0124, + "step": 4603 + }, + { + "epoch": 0.34656278815935565, + "grad_norm": 4.430422306060791, + "learning_rate": 7.602741216400586e-05, + "loss": 1.7916, + "step": 4604 + }, + { + "epoch": 0.3466380624400158, + "grad_norm": 4.909929275512695, + "learning_rate": 7.60170024143383e-05, + "loss": 2.4277, + "step": 4605 + }, + { + "epoch": 0.34671333672067595, + "grad_norm": 4.747127532958984, + "learning_rate": 7.600659111803458e-05, + "loss": 1.8553, + "step": 4606 + }, + { + "epoch": 0.3467886110013361, + "grad_norm": 6.263697624206543, + "learning_rate": 7.599617827571367e-05, + "loss": 1.7871, + "step": 4607 + }, + { + "epoch": 0.3468638852819963, + "grad_norm": 4.54171085357666, + "learning_rate": 7.598576388799452e-05, + "loss": 1.9016, + "step": 4608 + }, + { + "epoch": 0.3469391595626564, + "grad_norm": 4.513587951660156, + "learning_rate": 7.59753479554963e-05, + "loss": 2.1541, + "step": 4609 + }, + { + "epoch": 0.3470144338433166, + "grad_norm": 5.317783832550049, + "learning_rate": 7.596493047883816e-05, + "loss": 2.3906, + "step": 4610 + }, + { + "epoch": 0.3470897081239767, + "grad_norm": 4.195907115936279, + "learning_rate": 7.595451145863938e-05, + "loss": 1.8432, + "step": 4611 + }, + { + "epoch": 0.3471649824046369, + "grad_norm": 5.053755760192871, + "learning_rate": 7.594409089551941e-05, + "loss": 2.0609, + "step": 4612 + }, + { + "epoch": 0.34724025668529707, + "grad_norm": 3.9988138675689697, + "learning_rate": 7.593366879009766e-05, + "loss": 2.1742, + "step": 4613 + }, + { + "epoch": 0.3473155309659572, + "grad_norm": 5.1297502517700195, + "learning_rate": 7.592324514299371e-05, + "loss": 2.0278, + "step": 4614 + }, + { + "epoch": 0.34739080524661736, + "grad_norm": 3.50924015045166, + "learning_rate": 7.591281995482722e-05, + "loss": 2.0818, + "step": 4615 + }, + { + "epoch": 0.34746607952727754, + "grad_norm": 4.464993953704834, + "learning_rate": 7.590239322621792e-05, + "loss": 1.7159, + "step": 4616 + }, + { + "epoch": 0.34754135380793766, + "grad_norm": 3.403712749481201, + "learning_rate": 7.58919649577857e-05, + "loss": 2.1461, + "step": 4617 + }, + { + "epoch": 0.34761662808859783, + "grad_norm": 3.7006256580352783, + "learning_rate": 7.588153515015043e-05, + "loss": 1.7536, + "step": 4618 + }, + { + "epoch": 0.347691902369258, + "grad_norm": 4.032113552093506, + "learning_rate": 7.587110380393216e-05, + "loss": 2.2485, + "step": 4619 + }, + { + "epoch": 0.34776717664991813, + "grad_norm": 4.458015441894531, + "learning_rate": 7.586067091975103e-05, + "loss": 2.2192, + "step": 4620 + }, + { + "epoch": 0.3478424509305783, + "grad_norm": 5.377789497375488, + "learning_rate": 7.585023649822717e-05, + "loss": 2.3237, + "step": 4621 + }, + { + "epoch": 0.3479177252112384, + "grad_norm": 6.268248558044434, + "learning_rate": 7.583980053998095e-05, + "loss": 2.5396, + "step": 4622 + }, + { + "epoch": 0.3479929994918986, + "grad_norm": 5.029088497161865, + "learning_rate": 7.582936304563272e-05, + "loss": 2.1357, + "step": 4623 + }, + { + "epoch": 0.3480682737725588, + "grad_norm": 4.130476474761963, + "learning_rate": 7.581892401580297e-05, + "loss": 1.9216, + "step": 4624 + }, + { + "epoch": 0.3481435480532189, + "grad_norm": 4.483092308044434, + "learning_rate": 7.580848345111228e-05, + "loss": 2.3111, + "step": 4625 + }, + { + "epoch": 0.3482188223338791, + "grad_norm": 4.400412082672119, + "learning_rate": 7.579804135218129e-05, + "loss": 2.0789, + "step": 4626 + }, + { + "epoch": 0.34829409661453925, + "grad_norm": 5.589075565338135, + "learning_rate": 7.578759771963077e-05, + "loss": 1.8212, + "step": 4627 + }, + { + "epoch": 0.34836937089519937, + "grad_norm": 4.75601863861084, + "learning_rate": 7.577715255408155e-05, + "loss": 2.0269, + "step": 4628 + }, + { + "epoch": 0.34844464517585955, + "grad_norm": 3.8390045166015625, + "learning_rate": 7.576670585615459e-05, + "loss": 1.9359, + "step": 4629 + }, + { + "epoch": 0.34851991945651967, + "grad_norm": 6.69706916809082, + "learning_rate": 7.57562576264709e-05, + "loss": 2.0578, + "step": 4630 + }, + { + "epoch": 0.34859519373717984, + "grad_norm": 7.2338361740112305, + "learning_rate": 7.57458078656516e-05, + "loss": 2.6355, + "step": 4631 + }, + { + "epoch": 0.34867046801784, + "grad_norm": 3.920349359512329, + "learning_rate": 7.573535657431788e-05, + "loss": 1.5752, + "step": 4632 + }, + { + "epoch": 0.34874574229850014, + "grad_norm": 4.407501220703125, + "learning_rate": 7.572490375309105e-05, + "loss": 2.514, + "step": 4633 + }, + { + "epoch": 0.3488210165791603, + "grad_norm": 4.966390132904053, + "learning_rate": 7.571444940259252e-05, + "loss": 1.7593, + "step": 4634 + }, + { + "epoch": 0.3488962908598205, + "grad_norm": 4.687565803527832, + "learning_rate": 7.570399352344375e-05, + "loss": 1.9425, + "step": 4635 + }, + { + "epoch": 0.3489715651404806, + "grad_norm": 4.8182549476623535, + "learning_rate": 7.569353611626633e-05, + "loss": 2.1645, + "step": 4636 + }, + { + "epoch": 0.3490468394211408, + "grad_norm": 3.68569016456604, + "learning_rate": 7.568307718168189e-05, + "loss": 2.2552, + "step": 4637 + }, + { + "epoch": 0.34912211370180096, + "grad_norm": 5.435297012329102, + "learning_rate": 7.56726167203122e-05, + "loss": 1.8918, + "step": 4638 + }, + { + "epoch": 0.3491973879824611, + "grad_norm": 4.0888495445251465, + "learning_rate": 7.566215473277913e-05, + "loss": 2.0708, + "step": 4639 + }, + { + "epoch": 0.34927266226312126, + "grad_norm": 5.4462761878967285, + "learning_rate": 7.565169121970459e-05, + "loss": 2.2294, + "step": 4640 + }, + { + "epoch": 0.3493479365437814, + "grad_norm": 3.784348964691162, + "learning_rate": 7.564122618171061e-05, + "loss": 1.8248, + "step": 4641 + }, + { + "epoch": 0.34942321082444155, + "grad_norm": 6.087522029876709, + "learning_rate": 7.56307596194193e-05, + "loss": 2.1466, + "step": 4642 + }, + { + "epoch": 0.34949848510510173, + "grad_norm": 4.043786525726318, + "learning_rate": 7.56202915334529e-05, + "loss": 2.1563, + "step": 4643 + }, + { + "epoch": 0.34957375938576185, + "grad_norm": 4.114598274230957, + "learning_rate": 7.560982192443364e-05, + "loss": 1.7694, + "step": 4644 + }, + { + "epoch": 0.349649033666422, + "grad_norm": 4.336655139923096, + "learning_rate": 7.559935079298397e-05, + "loss": 1.8078, + "step": 4645 + }, + { + "epoch": 0.3497243079470822, + "grad_norm": 6.600823879241943, + "learning_rate": 7.558887813972635e-05, + "loss": 2.3505, + "step": 4646 + }, + { + "epoch": 0.3497995822277423, + "grad_norm": 5.6425461769104, + "learning_rate": 7.557840396528334e-05, + "loss": 2.1108, + "step": 4647 + }, + { + "epoch": 0.3498748565084025, + "grad_norm": 6.038580417633057, + "learning_rate": 7.556792827027761e-05, + "loss": 2.3126, + "step": 4648 + }, + { + "epoch": 0.3499501307890627, + "grad_norm": 5.721938133239746, + "learning_rate": 7.555745105533191e-05, + "loss": 2.0378, + "step": 4649 + }, + { + "epoch": 0.3500254050697228, + "grad_norm": 5.774184703826904, + "learning_rate": 7.554697232106906e-05, + "loss": 1.8985, + "step": 4650 + }, + { + "epoch": 0.35010067935038297, + "grad_norm": 4.349372386932373, + "learning_rate": 7.553649206811204e-05, + "loss": 1.8721, + "step": 4651 + }, + { + "epoch": 0.3501759536310431, + "grad_norm": 5.158112049102783, + "learning_rate": 7.552601029708382e-05, + "loss": 2.0118, + "step": 4652 + }, + { + "epoch": 0.35025122791170327, + "grad_norm": 4.468225002288818, + "learning_rate": 7.551552700860753e-05, + "loss": 2.2721, + "step": 4653 + }, + { + "epoch": 0.35032650219236344, + "grad_norm": 7.925318717956543, + "learning_rate": 7.550504220330638e-05, + "loss": 2.1947, + "step": 4654 + }, + { + "epoch": 0.35040177647302356, + "grad_norm": 7.497496604919434, + "learning_rate": 7.549455588180363e-05, + "loss": 2.2687, + "step": 4655 + }, + { + "epoch": 0.35047705075368374, + "grad_norm": 4.677943706512451, + "learning_rate": 7.548406804472271e-05, + "loss": 2.0925, + "step": 4656 + }, + { + "epoch": 0.3505523250343439, + "grad_norm": 4.890117645263672, + "learning_rate": 7.547357869268705e-05, + "loss": 1.9201, + "step": 4657 + }, + { + "epoch": 0.35062759931500403, + "grad_norm": 3.9217960834503174, + "learning_rate": 7.546308782632024e-05, + "loss": 2.0213, + "step": 4658 + }, + { + "epoch": 0.3507028735956642, + "grad_norm": 4.779502868652344, + "learning_rate": 7.545259544624592e-05, + "loss": 1.7539, + "step": 4659 + }, + { + "epoch": 0.35077814787632433, + "grad_norm": 5.4057841300964355, + "learning_rate": 7.544210155308783e-05, + "loss": 1.9425, + "step": 4660 + }, + { + "epoch": 0.3508534221569845, + "grad_norm": 4.38415002822876, + "learning_rate": 7.54316061474698e-05, + "loss": 2.433, + "step": 4661 + }, + { + "epoch": 0.3509286964376447, + "grad_norm": 4.677315711975098, + "learning_rate": 7.542110923001576e-05, + "loss": 1.7816, + "step": 4662 + }, + { + "epoch": 0.3510039707183048, + "grad_norm": 4.090447902679443, + "learning_rate": 7.541061080134972e-05, + "loss": 2.3677, + "step": 4663 + }, + { + "epoch": 0.351079244998965, + "grad_norm": 5.142765998840332, + "learning_rate": 7.540011086209578e-05, + "loss": 1.8085, + "step": 4664 + }, + { + "epoch": 0.35115451927962515, + "grad_norm": 4.226830005645752, + "learning_rate": 7.538960941287813e-05, + "loss": 2.0493, + "step": 4665 + }, + { + "epoch": 0.3512297935602853, + "grad_norm": 4.075433731079102, + "learning_rate": 7.537910645432105e-05, + "loss": 1.8381, + "step": 4666 + }, + { + "epoch": 0.35130506784094545, + "grad_norm": 4.133872985839844, + "learning_rate": 7.53686019870489e-05, + "loss": 1.8037, + "step": 4667 + }, + { + "epoch": 0.3513803421216056, + "grad_norm": 4.133872985839844, + "learning_rate": 7.53686019870489e-05, + "loss": 2.5777, + "step": 4668 + }, + { + "epoch": 0.35145561640226575, + "grad_norm": 4.6818528175354, + "learning_rate": 7.535809601168617e-05, + "loss": 2.0334, + "step": 4669 + }, + { + "epoch": 0.3515308906829259, + "grad_norm": 3.3950798511505127, + "learning_rate": 7.53475885288574e-05, + "loss": 1.8075, + "step": 4670 + }, + { + "epoch": 0.35160616496358604, + "grad_norm": 4.6079912185668945, + "learning_rate": 7.53370795391872e-05, + "loss": 1.8669, + "step": 4671 + }, + { + "epoch": 0.3516814392442462, + "grad_norm": 4.347660541534424, + "learning_rate": 7.53265690433003e-05, + "loss": 1.5897, + "step": 4672 + }, + { + "epoch": 0.3517567135249064, + "grad_norm": 3.6169469356536865, + "learning_rate": 7.531605704182159e-05, + "loss": 1.7861, + "step": 4673 + }, + { + "epoch": 0.3518319878055665, + "grad_norm": 4.170974254608154, + "learning_rate": 7.53055435353759e-05, + "loss": 1.9276, + "step": 4674 + }, + { + "epoch": 0.3519072620862267, + "grad_norm": 4.170974254608154, + "learning_rate": 7.53055435353759e-05, + "loss": 2.5965, + "step": 4675 + }, + { + "epoch": 0.35198253636688687, + "grad_norm": 4.087090492248535, + "learning_rate": 7.529502852458828e-05, + "loss": 1.922, + "step": 4676 + }, + { + "epoch": 0.352057810647547, + "grad_norm": 3.9385740756988525, + "learning_rate": 7.528451201008378e-05, + "loss": 2.094, + "step": 4677 + }, + { + "epoch": 0.35213308492820716, + "grad_norm": 4.231057643890381, + "learning_rate": 7.527399399248759e-05, + "loss": 2.018, + "step": 4678 + }, + { + "epoch": 0.3522083592088673, + "grad_norm": 4.711199760437012, + "learning_rate": 7.526347447242496e-05, + "loss": 2.2746, + "step": 4679 + }, + { + "epoch": 0.35228363348952746, + "grad_norm": 4.021269798278809, + "learning_rate": 7.525295345052128e-05, + "loss": 2.2216, + "step": 4680 + }, + { + "epoch": 0.35235890777018763, + "grad_norm": 5.026397228240967, + "learning_rate": 7.524243092740198e-05, + "loss": 1.7627, + "step": 4681 + }, + { + "epoch": 0.35243418205084776, + "grad_norm": 6.576305866241455, + "learning_rate": 7.523190690369258e-05, + "loss": 2.0708, + "step": 4682 + }, + { + "epoch": 0.35250945633150793, + "grad_norm": 5.01113748550415, + "learning_rate": 7.522138138001873e-05, + "loss": 2.2132, + "step": 4683 + }, + { + "epoch": 0.3525847306121681, + "grad_norm": 6.437198162078857, + "learning_rate": 7.52108543570061e-05, + "loss": 2.2058, + "step": 4684 + }, + { + "epoch": 0.3526600048928282, + "grad_norm": 4.341702938079834, + "learning_rate": 7.520032583528052e-05, + "loss": 1.8943, + "step": 4685 + }, + { + "epoch": 0.3527352791734884, + "grad_norm": 4.824337482452393, + "learning_rate": 7.51897958154679e-05, + "loss": 2.1713, + "step": 4686 + }, + { + "epoch": 0.3528105534541486, + "grad_norm": 4.546299934387207, + "learning_rate": 7.517926429819418e-05, + "loss": 2.385, + "step": 4687 + }, + { + "epoch": 0.3528858277348087, + "grad_norm": 4.972386360168457, + "learning_rate": 7.516873128408546e-05, + "loss": 1.7746, + "step": 4688 + }, + { + "epoch": 0.3529611020154689, + "grad_norm": 3.514876127243042, + "learning_rate": 7.515819677376787e-05, + "loss": 2.0456, + "step": 4689 + }, + { + "epoch": 0.353036376296129, + "grad_norm": 4.511981964111328, + "learning_rate": 7.514766076786766e-05, + "loss": 2.4228, + "step": 4690 + }, + { + "epoch": 0.35311165057678917, + "grad_norm": 6.4157233238220215, + "learning_rate": 7.513712326701116e-05, + "loss": 2.1866, + "step": 4691 + }, + { + "epoch": 0.35318692485744935, + "grad_norm": 5.201791286468506, + "learning_rate": 7.512658427182484e-05, + "loss": 2.088, + "step": 4692 + }, + { + "epoch": 0.35326219913810947, + "grad_norm": 4.7506103515625, + "learning_rate": 7.511604378293518e-05, + "loss": 1.6858, + "step": 4693 + }, + { + "epoch": 0.35333747341876964, + "grad_norm": 4.93814754486084, + "learning_rate": 7.510550180096877e-05, + "loss": 2.5642, + "step": 4694 + }, + { + "epoch": 0.3534127476994298, + "grad_norm": 4.639529705047607, + "learning_rate": 7.509495832655232e-05, + "loss": 1.907, + "step": 4695 + }, + { + "epoch": 0.35348802198008994, + "grad_norm": 4.353016376495361, + "learning_rate": 7.50844133603126e-05, + "loss": 2.1002, + "step": 4696 + }, + { + "epoch": 0.3535632962607501, + "grad_norm": 5.383945465087891, + "learning_rate": 7.507386690287647e-05, + "loss": 2.0776, + "step": 4697 + }, + { + "epoch": 0.3536385705414103, + "grad_norm": 5.4766035079956055, + "learning_rate": 7.506331895487092e-05, + "loss": 1.9973, + "step": 4698 + }, + { + "epoch": 0.3537138448220704, + "grad_norm": 4.778066635131836, + "learning_rate": 7.505276951692297e-05, + "loss": 2.0725, + "step": 4699 + }, + { + "epoch": 0.3537891191027306, + "grad_norm": 4.152775287628174, + "learning_rate": 7.504221858965975e-05, + "loss": 1.8939, + "step": 4700 + }, + { + "epoch": 0.3538643933833907, + "grad_norm": 5.066797733306885, + "learning_rate": 7.503166617370849e-05, + "loss": 1.5798, + "step": 4701 + }, + { + "epoch": 0.3539396676640509, + "grad_norm": 4.326906204223633, + "learning_rate": 7.50211122696965e-05, + "loss": 2.0667, + "step": 4702 + }, + { + "epoch": 0.35401494194471106, + "grad_norm": 4.983119010925293, + "learning_rate": 7.501055687825117e-05, + "loss": 2.1523, + "step": 4703 + }, + { + "epoch": 0.3540902162253712, + "grad_norm": 3.7467000484466553, + "learning_rate": 7.500000000000001e-05, + "loss": 2.1015, + "step": 4704 + }, + { + "epoch": 0.35416549050603136, + "grad_norm": 5.698918342590332, + "learning_rate": 7.498944163557059e-05, + "loss": 1.9026, + "step": 4705 + }, + { + "epoch": 0.35424076478669153, + "grad_norm": 5.043235778808594, + "learning_rate": 7.497888178559055e-05, + "loss": 2.2288, + "step": 4706 + }, + { + "epoch": 0.35431603906735165, + "grad_norm": 4.1288347244262695, + "learning_rate": 7.496832045068765e-05, + "loss": 2.1109, + "step": 4707 + }, + { + "epoch": 0.3543913133480118, + "grad_norm": 4.3924241065979, + "learning_rate": 7.495775763148975e-05, + "loss": 2.0131, + "step": 4708 + }, + { + "epoch": 0.35446658762867195, + "grad_norm": 4.646059036254883, + "learning_rate": 7.494719332862478e-05, + "loss": 2.1389, + "step": 4709 + }, + { + "epoch": 0.3545418619093321, + "grad_norm": 4.2902655601501465, + "learning_rate": 7.493662754272075e-05, + "loss": 2.2265, + "step": 4710 + }, + { + "epoch": 0.3546171361899923, + "grad_norm": 5.3461408615112305, + "learning_rate": 7.492606027440577e-05, + "loss": 1.7097, + "step": 4711 + }, + { + "epoch": 0.3546924104706524, + "grad_norm": 4.69003963470459, + "learning_rate": 7.491549152430801e-05, + "loss": 2.0261, + "step": 4712 + }, + { + "epoch": 0.3547676847513126, + "grad_norm": 3.4113070964813232, + "learning_rate": 7.490492129305577e-05, + "loss": 1.7582, + "step": 4713 + }, + { + "epoch": 0.35484295903197277, + "grad_norm": 7.446574687957764, + "learning_rate": 7.489434958127744e-05, + "loss": 1.665, + "step": 4714 + }, + { + "epoch": 0.3549182333126329, + "grad_norm": 3.97598934173584, + "learning_rate": 7.488377638960143e-05, + "loss": 1.671, + "step": 4715 + }, + { + "epoch": 0.35499350759329307, + "grad_norm": 4.233238220214844, + "learning_rate": 7.487320171865633e-05, + "loss": 1.8296, + "step": 4716 + }, + { + "epoch": 0.35506878187395324, + "grad_norm": 6.832796096801758, + "learning_rate": 7.486262556907075e-05, + "loss": 2.3745, + "step": 4717 + }, + { + "epoch": 0.35514405615461336, + "grad_norm": 7.490714073181152, + "learning_rate": 7.485204794147343e-05, + "loss": 1.9779, + "step": 4718 + }, + { + "epoch": 0.35521933043527354, + "grad_norm": 4.480311393737793, + "learning_rate": 7.484146883649316e-05, + "loss": 1.93, + "step": 4719 + }, + { + "epoch": 0.35529460471593366, + "grad_norm": 5.3867082595825195, + "learning_rate": 7.483088825475885e-05, + "loss": 2.1377, + "step": 4720 + }, + { + "epoch": 0.35536987899659384, + "grad_norm": 4.348886966705322, + "learning_rate": 7.482030619689947e-05, + "loss": 1.7696, + "step": 4721 + }, + { + "epoch": 0.355445153277254, + "grad_norm": 3.7030577659606934, + "learning_rate": 7.480972266354411e-05, + "loss": 2.0611, + "step": 4722 + }, + { + "epoch": 0.35552042755791413, + "grad_norm": 6.327174663543701, + "learning_rate": 7.479913765532193e-05, + "loss": 2.1051, + "step": 4723 + }, + { + "epoch": 0.3555957018385743, + "grad_norm": 5.268999099731445, + "learning_rate": 7.478855117286217e-05, + "loss": 1.9332, + "step": 4724 + }, + { + "epoch": 0.3556709761192345, + "grad_norm": 6.376285076141357, + "learning_rate": 7.477796321679416e-05, + "loss": 2.3561, + "step": 4725 + }, + { + "epoch": 0.3557462503998946, + "grad_norm": 4.8138203620910645, + "learning_rate": 7.476737378774735e-05, + "loss": 1.8648, + "step": 4726 + }, + { + "epoch": 0.3558215246805548, + "grad_norm": 5.368795394897461, + "learning_rate": 7.475678288635122e-05, + "loss": 1.8362, + "step": 4727 + }, + { + "epoch": 0.3558967989612149, + "grad_norm": 4.2373127937316895, + "learning_rate": 7.474619051323539e-05, + "loss": 2.0507, + "step": 4728 + }, + { + "epoch": 0.3559720732418751, + "grad_norm": 4.380899906158447, + "learning_rate": 7.473559666902954e-05, + "loss": 2.0412, + "step": 4729 + }, + { + "epoch": 0.35604734752253525, + "grad_norm": 4.2086181640625, + "learning_rate": 7.472500135436344e-05, + "loss": 1.8879, + "step": 4730 + }, + { + "epoch": 0.35612262180319537, + "grad_norm": 3.9845449924468994, + "learning_rate": 7.471440456986695e-05, + "loss": 2.2651, + "step": 4731 + }, + { + "epoch": 0.35619789608385555, + "grad_norm": 3.9845449924468994, + "learning_rate": 7.471440456986695e-05, + "loss": 2.1018, + "step": 4732 + }, + { + "epoch": 0.3562731703645157, + "grad_norm": 6.091592311859131, + "learning_rate": 7.470380631617002e-05, + "loss": 2.0122, + "step": 4733 + }, + { + "epoch": 0.35634844464517584, + "grad_norm": 6.080043315887451, + "learning_rate": 7.46932065939027e-05, + "loss": 1.7726, + "step": 4734 + }, + { + "epoch": 0.356423718925836, + "grad_norm": 4.906828880310059, + "learning_rate": 7.46826054036951e-05, + "loss": 1.9076, + "step": 4735 + }, + { + "epoch": 0.3564989932064962, + "grad_norm": 5.630390167236328, + "learning_rate": 7.467200274617741e-05, + "loss": 2.5238, + "step": 4736 + }, + { + "epoch": 0.3565742674871563, + "grad_norm": 4.263101577758789, + "learning_rate": 7.466139862197996e-05, + "loss": 1.9743, + "step": 4737 + }, + { + "epoch": 0.3566495417678165, + "grad_norm": 4.454501628875732, + "learning_rate": 7.465079303173312e-05, + "loss": 1.8774, + "step": 4738 + }, + { + "epoch": 0.3567248160484766, + "grad_norm": 3.741698741912842, + "learning_rate": 7.464018597606738e-05, + "loss": 1.7884, + "step": 4739 + }, + { + "epoch": 0.3568000903291368, + "grad_norm": 7.079321384429932, + "learning_rate": 7.462957745561328e-05, + "loss": 1.8561, + "step": 4740 + }, + { + "epoch": 0.35687536460979696, + "grad_norm": 3.574429988861084, + "learning_rate": 7.461896747100146e-05, + "loss": 1.8334, + "step": 4741 + }, + { + "epoch": 0.3569506388904571, + "grad_norm": 5.020144939422607, + "learning_rate": 7.460835602286265e-05, + "loss": 2.4262, + "step": 4742 + }, + { + "epoch": 0.35702591317111726, + "grad_norm": 4.757741451263428, + "learning_rate": 7.45977431118277e-05, + "loss": 2.2663, + "step": 4743 + }, + { + "epoch": 0.35710118745177744, + "grad_norm": 4.493041038513184, + "learning_rate": 7.458712873852752e-05, + "loss": 1.9607, + "step": 4744 + }, + { + "epoch": 0.35717646173243756, + "grad_norm": 5.57503604888916, + "learning_rate": 7.457651290359306e-05, + "loss": 2.0441, + "step": 4745 + }, + { + "epoch": 0.35725173601309773, + "grad_norm": 5.385438919067383, + "learning_rate": 7.456589560765545e-05, + "loss": 2.1624, + "step": 4746 + }, + { + "epoch": 0.3573270102937579, + "grad_norm": 6.012648105621338, + "learning_rate": 7.455527685134581e-05, + "loss": 1.9506, + "step": 4747 + }, + { + "epoch": 0.357402284574418, + "grad_norm": 4.124917507171631, + "learning_rate": 7.454465663529542e-05, + "loss": 1.8411, + "step": 4748 + }, + { + "epoch": 0.3574775588550782, + "grad_norm": 3.7410566806793213, + "learning_rate": 7.453403496013563e-05, + "loss": 2.1715, + "step": 4749 + }, + { + "epoch": 0.3575528331357383, + "grad_norm": 4.363630294799805, + "learning_rate": 7.452341182649786e-05, + "loss": 1.9888, + "step": 4750 + }, + { + "epoch": 0.3576281074163985, + "grad_norm": 7.150066375732422, + "learning_rate": 7.451278723501362e-05, + "loss": 1.6745, + "step": 4751 + }, + { + "epoch": 0.3577033816970587, + "grad_norm": 5.0994977951049805, + "learning_rate": 7.45021611863145e-05, + "loss": 1.9023, + "step": 4752 + }, + { + "epoch": 0.3577786559777188, + "grad_norm": 4.080338001251221, + "learning_rate": 7.449153368103222e-05, + "loss": 2.0436, + "step": 4753 + }, + { + "epoch": 0.35785393025837897, + "grad_norm": 4.2494378089904785, + "learning_rate": 7.448090471979853e-05, + "loss": 2.1868, + "step": 4754 + }, + { + "epoch": 0.35792920453903915, + "grad_norm": 4.448575496673584, + "learning_rate": 7.44702743032453e-05, + "loss": 1.9114, + "step": 4755 + }, + { + "epoch": 0.35800447881969927, + "grad_norm": 4.7296857833862305, + "learning_rate": 7.445964243200447e-05, + "loss": 2.2521, + "step": 4756 + }, + { + "epoch": 0.35807975310035944, + "grad_norm": 4.6908650398254395, + "learning_rate": 7.444900910670808e-05, + "loss": 2.5617, + "step": 4757 + }, + { + "epoch": 0.35815502738101956, + "grad_norm": 7.209279537200928, + "learning_rate": 7.443837432798826e-05, + "loss": 1.9502, + "step": 4758 + }, + { + "epoch": 0.35823030166167974, + "grad_norm": 6.864109992980957, + "learning_rate": 7.442773809647718e-05, + "loss": 2.1742, + "step": 4759 + }, + { + "epoch": 0.3583055759423399, + "grad_norm": 4.517666816711426, + "learning_rate": 7.44171004128072e-05, + "loss": 1.9867, + "step": 4760 + }, + { + "epoch": 0.35838085022300004, + "grad_norm": 5.149838924407959, + "learning_rate": 7.440646127761064e-05, + "loss": 1.944, + "step": 4761 + }, + { + "epoch": 0.3584561245036602, + "grad_norm": 4.458034038543701, + "learning_rate": 7.439582069151999e-05, + "loss": 2.2805, + "step": 4762 + }, + { + "epoch": 0.3585313987843204, + "grad_norm": 4.561699867248535, + "learning_rate": 7.43851786551678e-05, + "loss": 1.8461, + "step": 4763 + }, + { + "epoch": 0.3586066730649805, + "grad_norm": 3.4224936962127686, + "learning_rate": 7.437453516918669e-05, + "loss": 1.9924, + "step": 4764 + }, + { + "epoch": 0.3586819473456407, + "grad_norm": 4.27512788772583, + "learning_rate": 7.436389023420944e-05, + "loss": 2.0595, + "step": 4765 + }, + { + "epoch": 0.35875722162630086, + "grad_norm": 4.82543420791626, + "learning_rate": 7.43532438508688e-05, + "loss": 1.9961, + "step": 4766 + }, + { + "epoch": 0.358832495906961, + "grad_norm": 4.925774097442627, + "learning_rate": 7.43425960197977e-05, + "loss": 2.0342, + "step": 4767 + }, + { + "epoch": 0.35890777018762116, + "grad_norm": 4.324056625366211, + "learning_rate": 7.433194674162911e-05, + "loss": 1.6868, + "step": 4768 + }, + { + "epoch": 0.3589830444682813, + "grad_norm": 4.3465986251831055, + "learning_rate": 7.43212960169961e-05, + "loss": 1.7335, + "step": 4769 + }, + { + "epoch": 0.35905831874894145, + "grad_norm": 3.428480863571167, + "learning_rate": 7.431064384653182e-05, + "loss": 2.0071, + "step": 4770 + }, + { + "epoch": 0.3591335930296016, + "grad_norm": 4.085512638092041, + "learning_rate": 7.429999023086953e-05, + "loss": 2.1411, + "step": 4771 + }, + { + "epoch": 0.35920886731026175, + "grad_norm": 4.5069732666015625, + "learning_rate": 7.428933517064254e-05, + "loss": 2.1567, + "step": 4772 + }, + { + "epoch": 0.3592841415909219, + "grad_norm": 3.8591883182525635, + "learning_rate": 7.427867866648425e-05, + "loss": 1.9823, + "step": 4773 + }, + { + "epoch": 0.3593594158715821, + "grad_norm": 5.22625732421875, + "learning_rate": 7.42680207190282e-05, + "loss": 1.8327, + "step": 4774 + }, + { + "epoch": 0.3594346901522422, + "grad_norm": 4.900472164154053, + "learning_rate": 7.425736132890795e-05, + "loss": 1.9832, + "step": 4775 + }, + { + "epoch": 0.3595099644329024, + "grad_norm": 9.567375183105469, + "learning_rate": 7.424670049675716e-05, + "loss": 2.1592, + "step": 4776 + }, + { + "epoch": 0.3595852387135625, + "grad_norm": 5.523016452789307, + "learning_rate": 7.423603822320962e-05, + "loss": 2.0211, + "step": 4777 + }, + { + "epoch": 0.3596605129942227, + "grad_norm": 3.923966407775879, + "learning_rate": 7.422537450889913e-05, + "loss": 2.0995, + "step": 4778 + }, + { + "epoch": 0.35973578727488287, + "grad_norm": 5.271427631378174, + "learning_rate": 7.421470935445964e-05, + "loss": 2.1583, + "step": 4779 + }, + { + "epoch": 0.359811061555543, + "grad_norm": 5.400697708129883, + "learning_rate": 7.420404276052517e-05, + "loss": 1.9825, + "step": 4780 + }, + { + "epoch": 0.35988633583620316, + "grad_norm": 5.789221286773682, + "learning_rate": 7.419337472772978e-05, + "loss": 2.4105, + "step": 4781 + }, + { + "epoch": 0.35996161011686334, + "grad_norm": 13.104486465454102, + "learning_rate": 7.418270525670769e-05, + "loss": 2.3911, + "step": 4782 + }, + { + "epoch": 0.36003688439752346, + "grad_norm": 3.88413143157959, + "learning_rate": 7.417203434809317e-05, + "loss": 1.6194, + "step": 4783 + }, + { + "epoch": 0.36011215867818364, + "grad_norm": 6.00800085067749, + "learning_rate": 7.416136200252057e-05, + "loss": 1.7539, + "step": 4784 + }, + { + "epoch": 0.3601874329588438, + "grad_norm": 4.076695442199707, + "learning_rate": 7.415068822062432e-05, + "loss": 1.8827, + "step": 4785 + }, + { + "epoch": 0.36026270723950393, + "grad_norm": 5.769009113311768, + "learning_rate": 7.414001300303894e-05, + "loss": 2.3875, + "step": 4786 + }, + { + "epoch": 0.3603379815201641, + "grad_norm": 3.7072229385375977, + "learning_rate": 7.412933635039907e-05, + "loss": 1.8795, + "step": 4787 + }, + { + "epoch": 0.36041325580082423, + "grad_norm": 5.090272903442383, + "learning_rate": 7.411865826333937e-05, + "loss": 2.3674, + "step": 4788 + }, + { + "epoch": 0.3604885300814844, + "grad_norm": 5.507866382598877, + "learning_rate": 7.410797874249464e-05, + "loss": 2.1187, + "step": 4789 + }, + { + "epoch": 0.3605638043621446, + "grad_norm": 5.066542625427246, + "learning_rate": 7.409729778849977e-05, + "loss": 1.8951, + "step": 4790 + }, + { + "epoch": 0.3606390786428047, + "grad_norm": 4.192557334899902, + "learning_rate": 7.408661540198968e-05, + "loss": 2.0987, + "step": 4791 + }, + { + "epoch": 0.3607143529234649, + "grad_norm": 4.773987293243408, + "learning_rate": 7.40759315835994e-05, + "loss": 1.8078, + "step": 4792 + }, + { + "epoch": 0.36078962720412505, + "grad_norm": 6.124429702758789, + "learning_rate": 7.40652463339641e-05, + "loss": 2.1519, + "step": 4793 + }, + { + "epoch": 0.36086490148478517, + "grad_norm": 5.819246768951416, + "learning_rate": 7.405455965371892e-05, + "loss": 1.7901, + "step": 4794 + }, + { + "epoch": 0.36094017576544535, + "grad_norm": 5.987430095672607, + "learning_rate": 7.40438715434992e-05, + "loss": 2.6114, + "step": 4795 + }, + { + "epoch": 0.3610154500461055, + "grad_norm": 5.041569709777832, + "learning_rate": 7.403318200394033e-05, + "loss": 2.4454, + "step": 4796 + }, + { + "epoch": 0.36109072432676564, + "grad_norm": 7.414455413818359, + "learning_rate": 7.402249103567773e-05, + "loss": 2.1914, + "step": 4797 + }, + { + "epoch": 0.3611659986074258, + "grad_norm": 4.2949724197387695, + "learning_rate": 7.401179863934698e-05, + "loss": 1.9993, + "step": 4798 + }, + { + "epoch": 0.36124127288808594, + "grad_norm": 7.248776435852051, + "learning_rate": 7.400110481558367e-05, + "loss": 2.0786, + "step": 4799 + }, + { + "epoch": 0.3613165471687461, + "grad_norm": 8.821529388427734, + "learning_rate": 7.399040956502358e-05, + "loss": 2.57, + "step": 4800 + }, + { + "epoch": 0.3613918214494063, + "grad_norm": 4.257993221282959, + "learning_rate": 7.397971288830246e-05, + "loss": 1.808, + "step": 4801 + }, + { + "epoch": 0.3614670957300664, + "grad_norm": 4.006655216217041, + "learning_rate": 7.39690147860562e-05, + "loss": 1.9007, + "step": 4802 + }, + { + "epoch": 0.3615423700107266, + "grad_norm": 3.936253309249878, + "learning_rate": 7.39583152589208e-05, + "loss": 1.9579, + "step": 4803 + }, + { + "epoch": 0.36161764429138676, + "grad_norm": 5.082552909851074, + "learning_rate": 7.394761430753232e-05, + "loss": 2.0267, + "step": 4804 + }, + { + "epoch": 0.3616929185720469, + "grad_norm": 4.073530197143555, + "learning_rate": 7.393691193252685e-05, + "loss": 2.0529, + "step": 4805 + }, + { + "epoch": 0.36176819285270706, + "grad_norm": 4.454007148742676, + "learning_rate": 7.392620813454066e-05, + "loss": 1.7266, + "step": 4806 + }, + { + "epoch": 0.3618434671333672, + "grad_norm": 6.2713303565979, + "learning_rate": 7.391550291421007e-05, + "loss": 1.9563, + "step": 4807 + }, + { + "epoch": 0.36191874141402736, + "grad_norm": 3.697896718978882, + "learning_rate": 7.390479627217143e-05, + "loss": 2.1485, + "step": 4808 + }, + { + "epoch": 0.36199401569468753, + "grad_norm": 5.898225784301758, + "learning_rate": 7.389408820906124e-05, + "loss": 2.2685, + "step": 4809 + }, + { + "epoch": 0.36206928997534765, + "grad_norm": 2.9236931800842285, + "learning_rate": 7.388337872551607e-05, + "loss": 2.0541, + "step": 4810 + }, + { + "epoch": 0.36214456425600783, + "grad_norm": 4.811728477478027, + "learning_rate": 7.387266782217258e-05, + "loss": 2.3104, + "step": 4811 + }, + { + "epoch": 0.362219838536668, + "grad_norm": 5.527030944824219, + "learning_rate": 7.386195549966747e-05, + "loss": 1.7537, + "step": 4812 + }, + { + "epoch": 0.3622951128173281, + "grad_norm": 5.188048362731934, + "learning_rate": 7.385124175863759e-05, + "loss": 1.8926, + "step": 4813 + }, + { + "epoch": 0.3623703870979883, + "grad_norm": 4.454374313354492, + "learning_rate": 7.38405265997198e-05, + "loss": 2.1972, + "step": 4814 + }, + { + "epoch": 0.3624456613786485, + "grad_norm": 3.2439000606536865, + "learning_rate": 7.382981002355111e-05, + "loss": 1.8493, + "step": 4815 + }, + { + "epoch": 0.3625209356593086, + "grad_norm": 4.143496990203857, + "learning_rate": 7.38190920307686e-05, + "loss": 2.1858, + "step": 4816 + }, + { + "epoch": 0.36259620993996877, + "grad_norm": 3.6470460891723633, + "learning_rate": 7.380837262200943e-05, + "loss": 2.0283, + "step": 4817 + }, + { + "epoch": 0.3626714842206289, + "grad_norm": 4.135168552398682, + "learning_rate": 7.379765179791081e-05, + "loss": 1.8709, + "step": 4818 + }, + { + "epoch": 0.36274675850128907, + "grad_norm": 5.41266393661499, + "learning_rate": 7.378692955911005e-05, + "loss": 1.7939, + "step": 4819 + }, + { + "epoch": 0.36282203278194924, + "grad_norm": 3.172822952270508, + "learning_rate": 7.377620590624462e-05, + "loss": 2.0273, + "step": 4820 + }, + { + "epoch": 0.36289730706260936, + "grad_norm": 4.549415588378906, + "learning_rate": 7.376548083995193e-05, + "loss": 1.5833, + "step": 4821 + }, + { + "epoch": 0.36297258134326954, + "grad_norm": 4.41801118850708, + "learning_rate": 7.375475436086961e-05, + "loss": 1.758, + "step": 4822 + }, + { + "epoch": 0.3630478556239297, + "grad_norm": 4.3426194190979, + "learning_rate": 7.374402646963533e-05, + "loss": 1.9815, + "step": 4823 + }, + { + "epoch": 0.36312312990458984, + "grad_norm": 6.006479740142822, + "learning_rate": 7.373329716688677e-05, + "loss": 1.9615, + "step": 4824 + }, + { + "epoch": 0.36319840418525, + "grad_norm": 4.682624340057373, + "learning_rate": 7.37225664532618e-05, + "loss": 2.045, + "step": 4825 + }, + { + "epoch": 0.36327367846591013, + "grad_norm": 3.200746774673462, + "learning_rate": 7.371183432939832e-05, + "loss": 2.1439, + "step": 4826 + }, + { + "epoch": 0.3633489527465703, + "grad_norm": 4.339603900909424, + "learning_rate": 7.370110079593435e-05, + "loss": 2.1029, + "step": 4827 + }, + { + "epoch": 0.3634242270272305, + "grad_norm": 3.5030617713928223, + "learning_rate": 7.369036585350792e-05, + "loss": 1.6845, + "step": 4828 + }, + { + "epoch": 0.3634995013078906, + "grad_norm": 4.315023422241211, + "learning_rate": 7.367962950275724e-05, + "loss": 1.8469, + "step": 4829 + }, + { + "epoch": 0.3635747755885508, + "grad_norm": 4.554084300994873, + "learning_rate": 7.36688917443205e-05, + "loss": 2.2038, + "step": 4830 + }, + { + "epoch": 0.36365004986921096, + "grad_norm": 5.289467811584473, + "learning_rate": 7.365815257883607e-05, + "loss": 2.2925, + "step": 4831 + }, + { + "epoch": 0.3637253241498711, + "grad_norm": 4.193687438964844, + "learning_rate": 7.364741200694235e-05, + "loss": 1.7365, + "step": 4832 + }, + { + "epoch": 0.36380059843053125, + "grad_norm": 4.348536968231201, + "learning_rate": 7.363667002927785e-05, + "loss": 1.8362, + "step": 4833 + }, + { + "epoch": 0.36387587271119143, + "grad_norm": 6.698772430419922, + "learning_rate": 7.362592664648113e-05, + "loss": 1.9214, + "step": 4834 + }, + { + "epoch": 0.36395114699185155, + "grad_norm": 5.859483242034912, + "learning_rate": 7.361518185919087e-05, + "loss": 1.7184, + "step": 4835 + }, + { + "epoch": 0.3640264212725117, + "grad_norm": 4.6047797203063965, + "learning_rate": 7.360443566804578e-05, + "loss": 2.2897, + "step": 4836 + }, + { + "epoch": 0.36410169555317184, + "grad_norm": 4.616860389709473, + "learning_rate": 7.359368807368474e-05, + "loss": 1.8805, + "step": 4837 + }, + { + "epoch": 0.364176969833832, + "grad_norm": 4.212845802307129, + "learning_rate": 7.358293907674664e-05, + "loss": 1.9099, + "step": 4838 + }, + { + "epoch": 0.3642522441144922, + "grad_norm": 5.893839359283447, + "learning_rate": 7.357218867787048e-05, + "loss": 2.2213, + "step": 4839 + }, + { + "epoch": 0.3643275183951523, + "grad_norm": 5.683198928833008, + "learning_rate": 7.356143687769534e-05, + "loss": 1.9078, + "step": 4840 + }, + { + "epoch": 0.3644027926758125, + "grad_norm": 5.062298774719238, + "learning_rate": 7.355068367686039e-05, + "loss": 1.7167, + "step": 4841 + }, + { + "epoch": 0.36447806695647267, + "grad_norm": 4.512243747711182, + "learning_rate": 7.353992907600484e-05, + "loss": 1.9748, + "step": 4842 + }, + { + "epoch": 0.3645533412371328, + "grad_norm": 4.125036716461182, + "learning_rate": 7.352917307576808e-05, + "loss": 1.7362, + "step": 4843 + }, + { + "epoch": 0.36462861551779296, + "grad_norm": 4.952609539031982, + "learning_rate": 7.351841567678948e-05, + "loss": 1.9814, + "step": 4844 + }, + { + "epoch": 0.36470388979845314, + "grad_norm": 3.766040086746216, + "learning_rate": 7.350765687970856e-05, + "loss": 1.6756, + "step": 4845 + }, + { + "epoch": 0.36477916407911326, + "grad_norm": 4.0204057693481445, + "learning_rate": 7.349689668516488e-05, + "loss": 2.078, + "step": 4846 + }, + { + "epoch": 0.36485443835977344, + "grad_norm": 6.277435302734375, + "learning_rate": 7.348613509379811e-05, + "loss": 1.9333, + "step": 4847 + }, + { + "epoch": 0.36492971264043356, + "grad_norm": 5.5256147384643555, + "learning_rate": 7.347537210624802e-05, + "loss": 2.0258, + "step": 4848 + }, + { + "epoch": 0.36500498692109373, + "grad_norm": 6.066572666168213, + "learning_rate": 7.346460772315439e-05, + "loss": 2.072, + "step": 4849 + }, + { + "epoch": 0.3650802612017539, + "grad_norm": 6.329031944274902, + "learning_rate": 7.345384194515719e-05, + "loss": 2.0047, + "step": 4850 + }, + { + "epoch": 0.36515553548241403, + "grad_norm": 4.246005058288574, + "learning_rate": 7.344307477289637e-05, + "loss": 1.9237, + "step": 4851 + }, + { + "epoch": 0.3652308097630742, + "grad_norm": 5.608550548553467, + "learning_rate": 7.343230620701199e-05, + "loss": 1.9446, + "step": 4852 + }, + { + "epoch": 0.3653060840437344, + "grad_norm": 4.107571601867676, + "learning_rate": 7.342153624814427e-05, + "loss": 2.1045, + "step": 4853 + }, + { + "epoch": 0.3653813583243945, + "grad_norm": 4.538722991943359, + "learning_rate": 7.34107648969334e-05, + "loss": 2.1323, + "step": 4854 + }, + { + "epoch": 0.3654566326050547, + "grad_norm": 4.807298183441162, + "learning_rate": 7.339999215401975e-05, + "loss": 1.8293, + "step": 4855 + }, + { + "epoch": 0.3655319068857148, + "grad_norm": 4.328993320465088, + "learning_rate": 7.338921802004372e-05, + "loss": 2.0214, + "step": 4856 + }, + { + "epoch": 0.365607181166375, + "grad_norm": 5.195735454559326, + "learning_rate": 7.337844249564577e-05, + "loss": 2.1436, + "step": 4857 + }, + { + "epoch": 0.36568245544703515, + "grad_norm": 5.643425941467285, + "learning_rate": 7.33676655814665e-05, + "loss": 1.7839, + "step": 4858 + }, + { + "epoch": 0.36575772972769527, + "grad_norm": 4.731564998626709, + "learning_rate": 7.335688727814655e-05, + "loss": 1.998, + "step": 4859 + }, + { + "epoch": 0.36583300400835544, + "grad_norm": 3.9718198776245117, + "learning_rate": 7.33461075863267e-05, + "loss": 2.0454, + "step": 4860 + }, + { + "epoch": 0.3659082782890156, + "grad_norm": 6.705219268798828, + "learning_rate": 7.333532650664772e-05, + "loss": 1.8717, + "step": 4861 + }, + { + "epoch": 0.36598355256967574, + "grad_norm": 4.640828609466553, + "learning_rate": 7.332454403975054e-05, + "loss": 1.9267, + "step": 4862 + }, + { + "epoch": 0.3660588268503359, + "grad_norm": 4.463264465332031, + "learning_rate": 7.331376018627613e-05, + "loss": 1.8131, + "step": 4863 + }, + { + "epoch": 0.3661341011309961, + "grad_norm": 5.110785007476807, + "learning_rate": 7.330297494686557e-05, + "loss": 1.8752, + "step": 4864 + }, + { + "epoch": 0.3662093754116562, + "grad_norm": 5.874404430389404, + "learning_rate": 7.329218832216003e-05, + "loss": 1.7848, + "step": 4865 + }, + { + "epoch": 0.3662846496923164, + "grad_norm": 4.12038516998291, + "learning_rate": 7.328140031280074e-05, + "loss": 1.8644, + "step": 4866 + }, + { + "epoch": 0.3663599239729765, + "grad_norm": 4.457777500152588, + "learning_rate": 7.327061091942897e-05, + "loss": 1.969, + "step": 4867 + }, + { + "epoch": 0.3664351982536367, + "grad_norm": 5.2491655349731445, + "learning_rate": 7.325982014268617e-05, + "loss": 1.7904, + "step": 4868 + }, + { + "epoch": 0.36651047253429686, + "grad_norm": 4.683348655700684, + "learning_rate": 7.324902798321379e-05, + "loss": 1.7999, + "step": 4869 + }, + { + "epoch": 0.366585746814957, + "grad_norm": 4.114802837371826, + "learning_rate": 7.32382344416534e-05, + "loss": 2.0892, + "step": 4870 + }, + { + "epoch": 0.36666102109561716, + "grad_norm": 7.05025053024292, + "learning_rate": 7.322743951864668e-05, + "loss": 2.295, + "step": 4871 + }, + { + "epoch": 0.36673629537627733, + "grad_norm": 5.158146381378174, + "learning_rate": 7.321664321483531e-05, + "loss": 2.2242, + "step": 4872 + }, + { + "epoch": 0.36681156965693745, + "grad_norm": 5.335732460021973, + "learning_rate": 7.320584553086113e-05, + "loss": 1.8312, + "step": 4873 + }, + { + "epoch": 0.36688684393759763, + "grad_norm": 4.165846824645996, + "learning_rate": 7.3195046467366e-05, + "loss": 2.162, + "step": 4874 + }, + { + "epoch": 0.3669621182182578, + "grad_norm": 4.397825717926025, + "learning_rate": 7.318424602499192e-05, + "loss": 1.9227, + "step": 4875 + }, + { + "epoch": 0.3670373924989179, + "grad_norm": 4.312918663024902, + "learning_rate": 7.317344420438093e-05, + "loss": 1.7857, + "step": 4876 + }, + { + "epoch": 0.3671126667795781, + "grad_norm": 6.843772888183594, + "learning_rate": 7.316264100617518e-05, + "loss": 2.1434, + "step": 4877 + }, + { + "epoch": 0.3671879410602382, + "grad_norm": 5.819020748138428, + "learning_rate": 7.315183643101689e-05, + "loss": 1.9415, + "step": 4878 + }, + { + "epoch": 0.3672632153408984, + "grad_norm": 4.090846061706543, + "learning_rate": 7.314103047954834e-05, + "loss": 1.468, + "step": 4879 + }, + { + "epoch": 0.3673384896215586, + "grad_norm": 4.501558780670166, + "learning_rate": 7.313022315241195e-05, + "loss": 1.9053, + "step": 4880 + }, + { + "epoch": 0.3674137639022187, + "grad_norm": 4.205174446105957, + "learning_rate": 7.311941445025014e-05, + "loss": 2.0861, + "step": 4881 + }, + { + "epoch": 0.36748903818287887, + "grad_norm": 6.205946445465088, + "learning_rate": 7.310860437370548e-05, + "loss": 2.1394, + "step": 4882 + }, + { + "epoch": 0.36756431246353904, + "grad_norm": 4.712464332580566, + "learning_rate": 7.30977929234206e-05, + "loss": 1.8987, + "step": 4883 + }, + { + "epoch": 0.36763958674419916, + "grad_norm": 5.140176773071289, + "learning_rate": 7.308698010003822e-05, + "loss": 1.8801, + "step": 4884 + }, + { + "epoch": 0.36771486102485934, + "grad_norm": 5.020617485046387, + "learning_rate": 7.30761659042011e-05, + "loss": 1.82, + "step": 4885 + }, + { + "epoch": 0.36779013530551946, + "grad_norm": 6.558064937591553, + "learning_rate": 7.306535033655212e-05, + "loss": 2.1496, + "step": 4886 + }, + { + "epoch": 0.36786540958617964, + "grad_norm": 5.348294258117676, + "learning_rate": 7.305453339773425e-05, + "loss": 1.9108, + "step": 4887 + }, + { + "epoch": 0.3679406838668398, + "grad_norm": 4.863654136657715, + "learning_rate": 7.304371508839053e-05, + "loss": 1.825, + "step": 4888 + }, + { + "epoch": 0.36801595814749993, + "grad_norm": 4.5959978103637695, + "learning_rate": 7.303289540916407e-05, + "loss": 2.1451, + "step": 4889 + }, + { + "epoch": 0.3680912324281601, + "grad_norm": 5.211095333099365, + "learning_rate": 7.302207436069807e-05, + "loss": 1.8705, + "step": 4890 + }, + { + "epoch": 0.3681665067088203, + "grad_norm": 5.763791561126709, + "learning_rate": 7.301125194363579e-05, + "loss": 2.188, + "step": 4891 + }, + { + "epoch": 0.3682417809894804, + "grad_norm": 5.282546520233154, + "learning_rate": 7.300042815862062e-05, + "loss": 2.1692, + "step": 4892 + }, + { + "epoch": 0.3683170552701406, + "grad_norm": 5.3406171798706055, + "learning_rate": 7.298960300629598e-05, + "loss": 1.9179, + "step": 4893 + }, + { + "epoch": 0.36839232955080076, + "grad_norm": 4.139369010925293, + "learning_rate": 7.297877648730542e-05, + "loss": 2.382, + "step": 4894 + }, + { + "epoch": 0.3684676038314609, + "grad_norm": 4.6781463623046875, + "learning_rate": 7.296794860229254e-05, + "loss": 2.3396, + "step": 4895 + }, + { + "epoch": 0.36854287811212105, + "grad_norm": 6.445566177368164, + "learning_rate": 7.295711935190099e-05, + "loss": 2.2569, + "step": 4896 + }, + { + "epoch": 0.3686181523927812, + "grad_norm": 3.5880117416381836, + "learning_rate": 7.294628873677458e-05, + "loss": 1.8082, + "step": 4897 + }, + { + "epoch": 0.36869342667344135, + "grad_norm": 4.24821138381958, + "learning_rate": 7.293545675755716e-05, + "loss": 1.7728, + "step": 4898 + }, + { + "epoch": 0.3687687009541015, + "grad_norm": 5.272346496582031, + "learning_rate": 7.292462341489262e-05, + "loss": 1.9196, + "step": 4899 + }, + { + "epoch": 0.36884397523476165, + "grad_norm": 5.66727352142334, + "learning_rate": 7.291378870942501e-05, + "loss": 1.9229, + "step": 4900 + }, + { + "epoch": 0.3689192495154218, + "grad_norm": 5.6897969245910645, + "learning_rate": 7.29029526417984e-05, + "loss": 1.8551, + "step": 4901 + }, + { + "epoch": 0.368994523796082, + "grad_norm": 5.403607368469238, + "learning_rate": 7.289211521265698e-05, + "loss": 1.6613, + "step": 4902 + }, + { + "epoch": 0.3690697980767421, + "grad_norm": 4.262795925140381, + "learning_rate": 7.288127642264497e-05, + "loss": 2.139, + "step": 4903 + }, + { + "epoch": 0.3691450723574023, + "grad_norm": 4.469027519226074, + "learning_rate": 7.287043627240676e-05, + "loss": 2.1908, + "step": 4904 + }, + { + "epoch": 0.3692203466380624, + "grad_norm": 4.615016460418701, + "learning_rate": 7.285959476258673e-05, + "loss": 1.8332, + "step": 4905 + }, + { + "epoch": 0.3692956209187226, + "grad_norm": 6.115482807159424, + "learning_rate": 7.284875189382938e-05, + "loss": 2.1107, + "step": 4906 + }, + { + "epoch": 0.36937089519938276, + "grad_norm": 4.915789604187012, + "learning_rate": 7.283790766677931e-05, + "loss": 2.0013, + "step": 4907 + }, + { + "epoch": 0.3694461694800429, + "grad_norm": 9.588419914245605, + "learning_rate": 7.282706208208113e-05, + "loss": 2.1525, + "step": 4908 + }, + { + "epoch": 0.36952144376070306, + "grad_norm": 4.847698211669922, + "learning_rate": 7.281621514037962e-05, + "loss": 1.915, + "step": 4909 + }, + { + "epoch": 0.36959671804136324, + "grad_norm": 4.7174153327941895, + "learning_rate": 7.280536684231957e-05, + "loss": 1.8076, + "step": 4910 + }, + { + "epoch": 0.36967199232202336, + "grad_norm": 4.227524280548096, + "learning_rate": 7.279451718854592e-05, + "loss": 1.8091, + "step": 4911 + }, + { + "epoch": 0.36974726660268353, + "grad_norm": 3.4831960201263428, + "learning_rate": 7.278366617970363e-05, + "loss": 1.7343, + "step": 4912 + }, + { + "epoch": 0.3698225408833437, + "grad_norm": 4.319570541381836, + "learning_rate": 7.277281381643774e-05, + "loss": 1.9639, + "step": 4913 + }, + { + "epoch": 0.36989781516400383, + "grad_norm": 4.9830241203308105, + "learning_rate": 7.276196009939342e-05, + "loss": 2.3083, + "step": 4914 + }, + { + "epoch": 0.369973089444664, + "grad_norm": 5.01857328414917, + "learning_rate": 7.275110502921588e-05, + "loss": 1.7981, + "step": 4915 + }, + { + "epoch": 0.3700483637253241, + "grad_norm": 4.583183288574219, + "learning_rate": 7.274024860655044e-05, + "loss": 2.1572, + "step": 4916 + }, + { + "epoch": 0.3701236380059843, + "grad_norm": 4.341670036315918, + "learning_rate": 7.272939083204246e-05, + "loss": 2.2375, + "step": 4917 + }, + { + "epoch": 0.3701989122866445, + "grad_norm": 5.428600311279297, + "learning_rate": 7.27185317063374e-05, + "loss": 2.0468, + "step": 4918 + }, + { + "epoch": 0.3702741865673046, + "grad_norm": 5.212686538696289, + "learning_rate": 7.270767123008083e-05, + "loss": 1.6357, + "step": 4919 + }, + { + "epoch": 0.3703494608479648, + "grad_norm": 4.537408828735352, + "learning_rate": 7.269680940391836e-05, + "loss": 1.9553, + "step": 4920 + }, + { + "epoch": 0.37042473512862495, + "grad_norm": 3.0378222465515137, + "learning_rate": 7.268594622849569e-05, + "loss": 1.8631, + "step": 4921 + }, + { + "epoch": 0.37050000940928507, + "grad_norm": 3.8827826976776123, + "learning_rate": 7.267508170445862e-05, + "loss": 2.206, + "step": 4922 + }, + { + "epoch": 0.37057528368994525, + "grad_norm": 4.087047100067139, + "learning_rate": 7.266421583245299e-05, + "loss": 1.6895, + "step": 4923 + }, + { + "epoch": 0.3706505579706054, + "grad_norm": 4.350297927856445, + "learning_rate": 7.265334861312479e-05, + "loss": 1.7875, + "step": 4924 + }, + { + "epoch": 0.37072583225126554, + "grad_norm": 5.5706963539123535, + "learning_rate": 7.264248004711998e-05, + "loss": 2.1399, + "step": 4925 + }, + { + "epoch": 0.3708011065319257, + "grad_norm": 6.357446193695068, + "learning_rate": 7.263161013508471e-05, + "loss": 1.9069, + "step": 4926 + }, + { + "epoch": 0.37087638081258584, + "grad_norm": 6.167360782623291, + "learning_rate": 7.262073887766517e-05, + "loss": 2.0014, + "step": 4927 + }, + { + "epoch": 0.370951655093246, + "grad_norm": 4.467043399810791, + "learning_rate": 7.26098662755076e-05, + "loss": 1.9529, + "step": 4928 + }, + { + "epoch": 0.3710269293739062, + "grad_norm": 4.538745403289795, + "learning_rate": 7.259899232925835e-05, + "loss": 1.9351, + "step": 4929 + }, + { + "epoch": 0.3711022036545663, + "grad_norm": 5.637248516082764, + "learning_rate": 7.258811703956385e-05, + "loss": 2.1126, + "step": 4930 + }, + { + "epoch": 0.3711774779352265, + "grad_norm": 5.919388771057129, + "learning_rate": 7.257724040707061e-05, + "loss": 1.7308, + "step": 4931 + }, + { + "epoch": 0.37125275221588666, + "grad_norm": 4.361879348754883, + "learning_rate": 7.25663624324252e-05, + "loss": 1.9224, + "step": 4932 + }, + { + "epoch": 0.3713280264965468, + "grad_norm": 7.6469502449035645, + "learning_rate": 7.255548311627432e-05, + "loss": 1.7658, + "step": 4933 + }, + { + "epoch": 0.37140330077720696, + "grad_norm": 7.319008827209473, + "learning_rate": 7.254460245926466e-05, + "loss": 1.9378, + "step": 4934 + }, + { + "epoch": 0.3714785750578671, + "grad_norm": 5.513526916503906, + "learning_rate": 7.253372046204307e-05, + "loss": 2.2914, + "step": 4935 + }, + { + "epoch": 0.37155384933852725, + "grad_norm": 4.749660015106201, + "learning_rate": 7.252283712525646e-05, + "loss": 1.8299, + "step": 4936 + }, + { + "epoch": 0.37162912361918743, + "grad_norm": 4.704840660095215, + "learning_rate": 7.251195244955183e-05, + "loss": 1.9976, + "step": 4937 + }, + { + "epoch": 0.37170439789984755, + "grad_norm": 5.760531425476074, + "learning_rate": 7.250106643557619e-05, + "loss": 2.0084, + "step": 4938 + }, + { + "epoch": 0.3717796721805077, + "grad_norm": 4.350567817687988, + "learning_rate": 7.249017908397673e-05, + "loss": 1.8266, + "step": 4939 + }, + { + "epoch": 0.3718549464611679, + "grad_norm": 4.014101028442383, + "learning_rate": 7.247929039540066e-05, + "loss": 1.7896, + "step": 4940 + }, + { + "epoch": 0.371930220741828, + "grad_norm": 3.99737286567688, + "learning_rate": 7.246840037049527e-05, + "loss": 2.1688, + "step": 4941 + }, + { + "epoch": 0.3720054950224882, + "grad_norm": 3.815800666809082, + "learning_rate": 7.245750900990793e-05, + "loss": 2.1723, + "step": 4942 + }, + { + "epoch": 0.3720807693031484, + "grad_norm": 3.4326460361480713, + "learning_rate": 7.244661631428614e-05, + "loss": 2.0385, + "step": 4943 + }, + { + "epoch": 0.3721560435838085, + "grad_norm": 4.556941986083984, + "learning_rate": 7.243572228427743e-05, + "loss": 2.0194, + "step": 4944 + }, + { + "epoch": 0.37223131786446867, + "grad_norm": 6.778387069702148, + "learning_rate": 7.242482692052936e-05, + "loss": 1.8463, + "step": 4945 + }, + { + "epoch": 0.3723065921451288, + "grad_norm": 5.627053737640381, + "learning_rate": 7.24139302236897e-05, + "loss": 2.1858, + "step": 4946 + }, + { + "epoch": 0.37238186642578897, + "grad_norm": 5.661882400512695, + "learning_rate": 7.24030321944062e-05, + "loss": 1.9663, + "step": 4947 + }, + { + "epoch": 0.37245714070644914, + "grad_norm": 4.480603218078613, + "learning_rate": 7.239213283332672e-05, + "loss": 2.0683, + "step": 4948 + }, + { + "epoch": 0.37253241498710926, + "grad_norm": 4.722218036651611, + "learning_rate": 7.23812321410992e-05, + "loss": 1.6676, + "step": 4949 + }, + { + "epoch": 0.37260768926776944, + "grad_norm": 5.633545875549316, + "learning_rate": 7.237033011837162e-05, + "loss": 1.8249, + "step": 4950 + }, + { + "epoch": 0.3726829635484296, + "grad_norm": 4.879673957824707, + "learning_rate": 7.235942676579213e-05, + "loss": 2.2057, + "step": 4951 + }, + { + "epoch": 0.37275823782908973, + "grad_norm": 4.61749267578125, + "learning_rate": 7.234852208400886e-05, + "loss": 2.0422, + "step": 4952 + }, + { + "epoch": 0.3728335121097499, + "grad_norm": 4.231074810028076, + "learning_rate": 7.233761607367009e-05, + "loss": 1.8792, + "step": 4953 + }, + { + "epoch": 0.37290878639041003, + "grad_norm": 4.710064888000488, + "learning_rate": 7.232670873542413e-05, + "loss": 1.9134, + "step": 4954 + }, + { + "epoch": 0.3729840606710702, + "grad_norm": 4.5765461921691895, + "learning_rate": 7.23158000699194e-05, + "loss": 1.7344, + "step": 4955 + }, + { + "epoch": 0.3730593349517304, + "grad_norm": 4.981764316558838, + "learning_rate": 7.230489007780439e-05, + "loss": 1.8601, + "step": 4956 + }, + { + "epoch": 0.3731346092323905, + "grad_norm": 7.854146480560303, + "learning_rate": 7.229397875972765e-05, + "loss": 2.4389, + "step": 4957 + }, + { + "epoch": 0.3732098835130507, + "grad_norm": 6.466365814208984, + "learning_rate": 7.228306611633787e-05, + "loss": 2.0474, + "step": 4958 + }, + { + "epoch": 0.37328515779371085, + "grad_norm": 5.906894207000732, + "learning_rate": 7.22721521482837e-05, + "loss": 2.0779, + "step": 4959 + }, + { + "epoch": 0.373360432074371, + "grad_norm": 5.88653039932251, + "learning_rate": 7.226123685621405e-05, + "loss": 2.1126, + "step": 4960 + }, + { + "epoch": 0.37343570635503115, + "grad_norm": 4.718963146209717, + "learning_rate": 7.22503202407777e-05, + "loss": 2.0624, + "step": 4961 + }, + { + "epoch": 0.3735109806356913, + "grad_norm": 5.176740646362305, + "learning_rate": 7.223940230262367e-05, + "loss": 1.7592, + "step": 4962 + }, + { + "epoch": 0.37358625491635145, + "grad_norm": 5.126104354858398, + "learning_rate": 7.2228483042401e-05, + "loss": 1.7519, + "step": 4963 + }, + { + "epoch": 0.3736615291970116, + "grad_norm": 5.948572158813477, + "learning_rate": 7.221756246075877e-05, + "loss": 2.3341, + "step": 4964 + }, + { + "epoch": 0.37373680347767174, + "grad_norm": 4.3009467124938965, + "learning_rate": 7.22066405583462e-05, + "loss": 1.8655, + "step": 4965 + }, + { + "epoch": 0.3738120777583319, + "grad_norm": 5.706455707550049, + "learning_rate": 7.219571733581257e-05, + "loss": 1.9716, + "step": 4966 + }, + { + "epoch": 0.3738873520389921, + "grad_norm": 6.1628642082214355, + "learning_rate": 7.218479279380725e-05, + "loss": 1.9441, + "step": 4967 + }, + { + "epoch": 0.3739626263196522, + "grad_norm": 4.59248161315918, + "learning_rate": 7.217386693297963e-05, + "loss": 2.3536, + "step": 4968 + }, + { + "epoch": 0.3740379006003124, + "grad_norm": 6.029731273651123, + "learning_rate": 7.216293975397927e-05, + "loss": 2.0032, + "step": 4969 + }, + { + "epoch": 0.37411317488097257, + "grad_norm": 5.294279098510742, + "learning_rate": 7.21520112574557e-05, + "loss": 2.138, + "step": 4970 + }, + { + "epoch": 0.3741884491616327, + "grad_norm": 5.968223571777344, + "learning_rate": 7.214108144405865e-05, + "loss": 2.0692, + "step": 4971 + }, + { + "epoch": 0.37426372344229286, + "grad_norm": 4.473196506500244, + "learning_rate": 7.213015031443783e-05, + "loss": 2.1575, + "step": 4972 + }, + { + "epoch": 0.37433899772295304, + "grad_norm": 4.75431489944458, + "learning_rate": 7.211921786924308e-05, + "loss": 1.882, + "step": 4973 + }, + { + "epoch": 0.37441427200361316, + "grad_norm": 4.734861850738525, + "learning_rate": 7.210828410912428e-05, + "loss": 2.0258, + "step": 4974 + }, + { + "epoch": 0.37448954628427333, + "grad_norm": 3.701101064682007, + "learning_rate": 7.209734903473143e-05, + "loss": 1.7323, + "step": 4975 + }, + { + "epoch": 0.37456482056493345, + "grad_norm": 4.051206111907959, + "learning_rate": 7.208641264671458e-05, + "loss": 1.8825, + "step": 4976 + }, + { + "epoch": 0.37464009484559363, + "grad_norm": 3.509989023208618, + "learning_rate": 7.207547494572388e-05, + "loss": 1.7323, + "step": 4977 + }, + { + "epoch": 0.3747153691262538, + "grad_norm": 4.238576412200928, + "learning_rate": 7.206453593240954e-05, + "loss": 2.0529, + "step": 4978 + }, + { + "epoch": 0.3747906434069139, + "grad_norm": 3.843618631362915, + "learning_rate": 7.205359560742185e-05, + "loss": 2.1025, + "step": 4979 + }, + { + "epoch": 0.3748659176875741, + "grad_norm": 5.472354888916016, + "learning_rate": 7.204265397141116e-05, + "loss": 1.9685, + "step": 4980 + }, + { + "epoch": 0.3749411919682343, + "grad_norm": 4.376461505889893, + "learning_rate": 7.203171102502795e-05, + "loss": 2.2941, + "step": 4981 + }, + { + "epoch": 0.3750164662488944, + "grad_norm": 4.474477291107178, + "learning_rate": 7.202076676892273e-05, + "loss": 1.9242, + "step": 4982 + }, + { + "epoch": 0.3750917405295546, + "grad_norm": 6.944291591644287, + "learning_rate": 7.200982120374613e-05, + "loss": 1.8588, + "step": 4983 + }, + { + "epoch": 0.3751670148102147, + "grad_norm": 4.876765251159668, + "learning_rate": 7.199887433014879e-05, + "loss": 1.9065, + "step": 4984 + }, + { + "epoch": 0.37524228909087487, + "grad_norm": 4.21694278717041, + "learning_rate": 7.19879261487815e-05, + "loss": 2.1048, + "step": 4985 + }, + { + "epoch": 0.37531756337153505, + "grad_norm": 4.962738990783691, + "learning_rate": 7.197697666029511e-05, + "loss": 1.9615, + "step": 4986 + }, + { + "epoch": 0.37539283765219517, + "grad_norm": 4.720092296600342, + "learning_rate": 7.196602586534049e-05, + "loss": 2.2484, + "step": 4987 + }, + { + "epoch": 0.37546811193285534, + "grad_norm": 3.698617696762085, + "learning_rate": 7.195507376456867e-05, + "loss": 2.1332, + "step": 4988 + }, + { + "epoch": 0.3755433862135155, + "grad_norm": 4.769763469696045, + "learning_rate": 7.194412035863072e-05, + "loss": 1.8165, + "step": 4989 + }, + { + "epoch": 0.37561866049417564, + "grad_norm": 4.7708845138549805, + "learning_rate": 7.193316564817776e-05, + "loss": 1.7809, + "step": 4990 + }, + { + "epoch": 0.3756939347748358, + "grad_norm": 5.258187294006348, + "learning_rate": 7.192220963386104e-05, + "loss": 2.0299, + "step": 4991 + }, + { + "epoch": 0.375769209055496, + "grad_norm": 4.5305047035217285, + "learning_rate": 7.191125231633187e-05, + "loss": 1.7322, + "step": 4992 + }, + { + "epoch": 0.3758444833361561, + "grad_norm": 5.102001190185547, + "learning_rate": 7.190029369624162e-05, + "loss": 1.6057, + "step": 4993 + }, + { + "epoch": 0.3759197576168163, + "grad_norm": 4.977925777435303, + "learning_rate": 7.188933377424174e-05, + "loss": 2.2238, + "step": 4994 + }, + { + "epoch": 0.3759950318974764, + "grad_norm": 3.880150079727173, + "learning_rate": 7.187837255098379e-05, + "loss": 1.8831, + "step": 4995 + }, + { + "epoch": 0.3760703061781366, + "grad_norm": 5.856607437133789, + "learning_rate": 7.186741002711935e-05, + "loss": 1.826, + "step": 4996 + }, + { + "epoch": 0.37614558045879676, + "grad_norm": 6.347630977630615, + "learning_rate": 7.185644620330014e-05, + "loss": 2.368, + "step": 4997 + }, + { + "epoch": 0.3762208547394569, + "grad_norm": 4.962389945983887, + "learning_rate": 7.18454810801779e-05, + "loss": 2.0538, + "step": 4998 + }, + { + "epoch": 0.37629612902011705, + "grad_norm": 5.2876715660095215, + "learning_rate": 7.18345146584045e-05, + "loss": 2.0556, + "step": 4999 + }, + { + "epoch": 0.37637140330077723, + "grad_norm": 4.463347911834717, + "learning_rate": 7.182354693863186e-05, + "loss": 1.8828, + "step": 5000 + }, + { + "epoch": 0.37644667758143735, + "grad_norm": 5.426022529602051, + "learning_rate": 7.181257792151197e-05, + "loss": 1.7753, + "step": 5001 + }, + { + "epoch": 0.3765219518620975, + "grad_norm": 5.392134189605713, + "learning_rate": 7.180160760769692e-05, + "loss": 1.9247, + "step": 5002 + }, + { + "epoch": 0.37659722614275765, + "grad_norm": 5.8048577308654785, + "learning_rate": 7.179063599783884e-05, + "loss": 2.0037, + "step": 5003 + }, + { + "epoch": 0.3766725004234178, + "grad_norm": 4.083925247192383, + "learning_rate": 7.177966309258998e-05, + "loss": 1.9497, + "step": 5004 + }, + { + "epoch": 0.376747774704078, + "grad_norm": 3.546855926513672, + "learning_rate": 7.176868889260264e-05, + "loss": 1.7008, + "step": 5005 + }, + { + "epoch": 0.3768230489847381, + "grad_norm": 3.7905898094177246, + "learning_rate": 7.17577133985292e-05, + "loss": 1.8258, + "step": 5006 + }, + { + "epoch": 0.3768983232653983, + "grad_norm": 3.6993086338043213, + "learning_rate": 7.174673661102214e-05, + "loss": 1.7559, + "step": 5007 + }, + { + "epoch": 0.37697359754605847, + "grad_norm": 4.094613552093506, + "learning_rate": 7.173575853073398e-05, + "loss": 1.541, + "step": 5008 + }, + { + "epoch": 0.3770488718267186, + "grad_norm": 5.412510871887207, + "learning_rate": 7.172477915831734e-05, + "loss": 1.9054, + "step": 5009 + }, + { + "epoch": 0.37712414610737877, + "grad_norm": 4.654134273529053, + "learning_rate": 7.171379849442493e-05, + "loss": 1.817, + "step": 5010 + }, + { + "epoch": 0.37719942038803894, + "grad_norm": 5.500486850738525, + "learning_rate": 7.170281653970949e-05, + "loss": 2.0739, + "step": 5011 + }, + { + "epoch": 0.37727469466869906, + "grad_norm": 4.02164363861084, + "learning_rate": 7.169183329482388e-05, + "loss": 2.0592, + "step": 5012 + }, + { + "epoch": 0.37734996894935924, + "grad_norm": 6.750339984893799, + "learning_rate": 7.168084876042103e-05, + "loss": 2.1483, + "step": 5013 + }, + { + "epoch": 0.37742524323001936, + "grad_norm": 3.388923406600952, + "learning_rate": 7.166986293715391e-05, + "loss": 1.8918, + "step": 5014 + }, + { + "epoch": 0.37750051751067953, + "grad_norm": 3.9066715240478516, + "learning_rate": 7.165887582567563e-05, + "loss": 1.8342, + "step": 5015 + }, + { + "epoch": 0.3775757917913397, + "grad_norm": 5.191504001617432, + "learning_rate": 7.164788742663932e-05, + "loss": 2.0172, + "step": 5016 + }, + { + "epoch": 0.37765106607199983, + "grad_norm": 4.917282581329346, + "learning_rate": 7.163689774069823e-05, + "loss": 1.8206, + "step": 5017 + }, + { + "epoch": 0.37772634035266, + "grad_norm": 5.534020900726318, + "learning_rate": 7.162590676850565e-05, + "loss": 1.9287, + "step": 5018 + }, + { + "epoch": 0.3778016146333202, + "grad_norm": 4.5132575035095215, + "learning_rate": 7.161491451071495e-05, + "loss": 1.9113, + "step": 5019 + }, + { + "epoch": 0.3778768889139803, + "grad_norm": 4.004202365875244, + "learning_rate": 7.160392096797963e-05, + "loss": 1.9418, + "step": 5020 + }, + { + "epoch": 0.3779521631946405, + "grad_norm": 4.021640777587891, + "learning_rate": 7.159292614095318e-05, + "loss": 1.8938, + "step": 5021 + }, + { + "epoch": 0.37802743747530065, + "grad_norm": 4.470360279083252, + "learning_rate": 7.158193003028922e-05, + "loss": 1.9444, + "step": 5022 + }, + { + "epoch": 0.3781027117559608, + "grad_norm": 4.077413082122803, + "learning_rate": 7.157093263664147e-05, + "loss": 2.1638, + "step": 5023 + }, + { + "epoch": 0.37817798603662095, + "grad_norm": 4.954759120941162, + "learning_rate": 7.155993396066365e-05, + "loss": 2.5408, + "step": 5024 + }, + { + "epoch": 0.37825326031728107, + "grad_norm": 4.298381805419922, + "learning_rate": 7.154893400300962e-05, + "loss": 1.7462, + "step": 5025 + }, + { + "epoch": 0.37832853459794125, + "grad_norm": 5.803854942321777, + "learning_rate": 7.153793276433329e-05, + "loss": 1.8743, + "step": 5026 + }, + { + "epoch": 0.3784038088786014, + "grad_norm": 4.230853080749512, + "learning_rate": 7.152693024528867e-05, + "loss": 2.1784, + "step": 5027 + }, + { + "epoch": 0.37847908315926154, + "grad_norm": 4.332756519317627, + "learning_rate": 7.15159264465298e-05, + "loss": 2.0783, + "step": 5028 + }, + { + "epoch": 0.3785543574399217, + "grad_norm": 3.7402727603912354, + "learning_rate": 7.150492136871086e-05, + "loss": 1.9596, + "step": 5029 + }, + { + "epoch": 0.3786296317205819, + "grad_norm": 4.28030252456665, + "learning_rate": 7.149391501248602e-05, + "loss": 1.8501, + "step": 5030 + }, + { + "epoch": 0.378704906001242, + "grad_norm": 4.2571563720703125, + "learning_rate": 7.148290737850963e-05, + "loss": 1.6562, + "step": 5031 + }, + { + "epoch": 0.3787801802819022, + "grad_norm": 4.48539400100708, + "learning_rate": 7.147189846743601e-05, + "loss": 1.9153, + "step": 5032 + }, + { + "epoch": 0.3788554545625623, + "grad_norm": 4.535348892211914, + "learning_rate": 7.146088827991966e-05, + "loss": 1.8151, + "step": 5033 + }, + { + "epoch": 0.3789307288432225, + "grad_norm": 4.449469089508057, + "learning_rate": 7.144987681661508e-05, + "loss": 1.9655, + "step": 5034 + }, + { + "epoch": 0.37900600312388266, + "grad_norm": 4.397324085235596, + "learning_rate": 7.143886407817686e-05, + "loss": 1.7285, + "step": 5035 + }, + { + "epoch": 0.3790812774045428, + "grad_norm": 4.513822555541992, + "learning_rate": 7.14278500652597e-05, + "loss": 2.2549, + "step": 5036 + }, + { + "epoch": 0.37915655168520296, + "grad_norm": 3.738454818725586, + "learning_rate": 7.141683477851832e-05, + "loss": 2.0126, + "step": 5037 + }, + { + "epoch": 0.37923182596586313, + "grad_norm": 6.871589183807373, + "learning_rate": 7.140581821860757e-05, + "loss": 1.7037, + "step": 5038 + }, + { + "epoch": 0.37930710024652325, + "grad_norm": 5.357760429382324, + "learning_rate": 7.139480038618235e-05, + "loss": 1.7613, + "step": 5039 + }, + { + "epoch": 0.37938237452718343, + "grad_norm": 5.217741012573242, + "learning_rate": 7.138378128189763e-05, + "loss": 1.7709, + "step": 5040 + }, + { + "epoch": 0.3794576488078436, + "grad_norm": 5.146366119384766, + "learning_rate": 7.137276090640847e-05, + "loss": 2.0119, + "step": 5041 + }, + { + "epoch": 0.3795329230885037, + "grad_norm": 4.0107879638671875, + "learning_rate": 7.136173926037e-05, + "loss": 1.8021, + "step": 5042 + }, + { + "epoch": 0.3796081973691639, + "grad_norm": 4.555127143859863, + "learning_rate": 7.135071634443744e-05, + "loss": 1.8569, + "step": 5043 + }, + { + "epoch": 0.379683471649824, + "grad_norm": 4.552152633666992, + "learning_rate": 7.133969215926604e-05, + "loss": 1.6807, + "step": 5044 + }, + { + "epoch": 0.3797587459304842, + "grad_norm": 4.354506969451904, + "learning_rate": 7.132866670551117e-05, + "loss": 1.9583, + "step": 5045 + }, + { + "epoch": 0.3798340202111444, + "grad_norm": 5.571901321411133, + "learning_rate": 7.131763998382828e-05, + "loss": 1.8813, + "step": 5046 + }, + { + "epoch": 0.3799092944918045, + "grad_norm": 4.542109966278076, + "learning_rate": 7.130661199487283e-05, + "loss": 2.0044, + "step": 5047 + }, + { + "epoch": 0.37998456877246467, + "grad_norm": 4.0436296463012695, + "learning_rate": 7.129558273930043e-05, + "loss": 1.9924, + "step": 5048 + }, + { + "epoch": 0.38005984305312485, + "grad_norm": 4.427436828613281, + "learning_rate": 7.128455221776677e-05, + "loss": 2.292, + "step": 5049 + }, + { + "epoch": 0.38013511733378497, + "grad_norm": 4.2799506187438965, + "learning_rate": 7.127352043092755e-05, + "loss": 1.8546, + "step": 5050 + }, + { + "epoch": 0.38021039161444514, + "grad_norm": 4.1891679763793945, + "learning_rate": 7.126248737943858e-05, + "loss": 1.6323, + "step": 5051 + }, + { + "epoch": 0.38028566589510526, + "grad_norm": 3.9905247688293457, + "learning_rate": 7.125145306395574e-05, + "loss": 2.0276, + "step": 5052 + }, + { + "epoch": 0.38036094017576544, + "grad_norm": 4.146697044372559, + "learning_rate": 7.124041748513498e-05, + "loss": 1.8364, + "step": 5053 + }, + { + "epoch": 0.3804362144564256, + "grad_norm": 4.037591457366943, + "learning_rate": 7.122938064363236e-05, + "loss": 1.7057, + "step": 5054 + }, + { + "epoch": 0.38051148873708573, + "grad_norm": 5.675656318664551, + "learning_rate": 7.121834254010398e-05, + "loss": 1.8862, + "step": 5055 + }, + { + "epoch": 0.3805867630177459, + "grad_norm": 4.468969821929932, + "learning_rate": 7.120730317520601e-05, + "loss": 1.8913, + "step": 5056 + }, + { + "epoch": 0.3806620372984061, + "grad_norm": 5.267385482788086, + "learning_rate": 7.119626254959472e-05, + "loss": 2.018, + "step": 5057 + }, + { + "epoch": 0.3807373115790662, + "grad_norm": 4.253632545471191, + "learning_rate": 7.118522066392644e-05, + "loss": 1.9346, + "step": 5058 + }, + { + "epoch": 0.3808125858597264, + "grad_norm": 5.13553524017334, + "learning_rate": 7.117417751885756e-05, + "loss": 2.0081, + "step": 5059 + }, + { + "epoch": 0.38088786014038656, + "grad_norm": 5.386470317840576, + "learning_rate": 7.11631331150446e-05, + "loss": 1.7915, + "step": 5060 + }, + { + "epoch": 0.3809631344210467, + "grad_norm": 4.69920015335083, + "learning_rate": 7.11520874531441e-05, + "loss": 2.1036, + "step": 5061 + }, + { + "epoch": 0.38103840870170685, + "grad_norm": 5.338794708251953, + "learning_rate": 7.114104053381269e-05, + "loss": 1.6762, + "step": 5062 + }, + { + "epoch": 0.381113682982367, + "grad_norm": 4.164871692657471, + "learning_rate": 7.112999235770708e-05, + "loss": 2.3391, + "step": 5063 + }, + { + "epoch": 0.38118895726302715, + "grad_norm": 6.454789161682129, + "learning_rate": 7.111894292548405e-05, + "loss": 2.1076, + "step": 5064 + }, + { + "epoch": 0.3812642315436873, + "grad_norm": 5.738858699798584, + "learning_rate": 7.110789223780046e-05, + "loss": 2.1032, + "step": 5065 + }, + { + "epoch": 0.38133950582434745, + "grad_norm": 4.800664901733398, + "learning_rate": 7.109684029531326e-05, + "loss": 1.7477, + "step": 5066 + }, + { + "epoch": 0.3814147801050076, + "grad_norm": 4.2151994705200195, + "learning_rate": 7.108578709867941e-05, + "loss": 1.8912, + "step": 5067 + }, + { + "epoch": 0.3814900543856678, + "grad_norm": 3.88224720954895, + "learning_rate": 7.107473264855604e-05, + "loss": 1.9579, + "step": 5068 + }, + { + "epoch": 0.3815653286663279, + "grad_norm": 4.584081172943115, + "learning_rate": 7.106367694560028e-05, + "loss": 1.923, + "step": 5069 + }, + { + "epoch": 0.3816406029469881, + "grad_norm": 5.365553379058838, + "learning_rate": 7.105261999046935e-05, + "loss": 2.3511, + "step": 5070 + }, + { + "epoch": 0.38171587722764827, + "grad_norm": 3.8131959438323975, + "learning_rate": 7.10415617838206e-05, + "loss": 1.9687, + "step": 5071 + }, + { + "epoch": 0.3817911515083084, + "grad_norm": 3.665224552154541, + "learning_rate": 7.103050232631134e-05, + "loss": 1.8544, + "step": 5072 + }, + { + "epoch": 0.38186642578896857, + "grad_norm": 4.011844158172607, + "learning_rate": 7.101944161859908e-05, + "loss": 2.2753, + "step": 5073 + }, + { + "epoch": 0.3819417000696287, + "grad_norm": 4.224832534790039, + "learning_rate": 7.100837966134133e-05, + "loss": 1.8325, + "step": 5074 + }, + { + "epoch": 0.38201697435028886, + "grad_norm": 6.464770317077637, + "learning_rate": 7.099731645519568e-05, + "loss": 2.0772, + "step": 5075 + }, + { + "epoch": 0.38209224863094904, + "grad_norm": 6.836861610412598, + "learning_rate": 7.098625200081982e-05, + "loss": 2.2939, + "step": 5076 + }, + { + "epoch": 0.38216752291160916, + "grad_norm": 3.214604377746582, + "learning_rate": 7.09751862988715e-05, + "loss": 1.9385, + "step": 5077 + }, + { + "epoch": 0.38224279719226933, + "grad_norm": 5.163912773132324, + "learning_rate": 7.096411935000853e-05, + "loss": 1.881, + "step": 5078 + }, + { + "epoch": 0.3823180714729295, + "grad_norm": 4.539270401000977, + "learning_rate": 7.09530511548888e-05, + "loss": 2.0638, + "step": 5079 + }, + { + "epoch": 0.38239334575358963, + "grad_norm": 3.8244714736938477, + "learning_rate": 7.094198171417032e-05, + "loss": 1.9458, + "step": 5080 + }, + { + "epoch": 0.3824686200342498, + "grad_norm": 3.7809698581695557, + "learning_rate": 7.093091102851112e-05, + "loss": 2.0929, + "step": 5081 + }, + { + "epoch": 0.3825438943149099, + "grad_norm": 5.210256099700928, + "learning_rate": 7.091983909856932e-05, + "loss": 1.9799, + "step": 5082 + }, + { + "epoch": 0.3826191685955701, + "grad_norm": 6.476984024047852, + "learning_rate": 7.09087659250031e-05, + "loss": 2.1878, + "step": 5083 + }, + { + "epoch": 0.3826944428762303, + "grad_norm": 3.7477054595947266, + "learning_rate": 7.089769150847075e-05, + "loss": 2.2522, + "step": 5084 + }, + { + "epoch": 0.3827697171568904, + "grad_norm": 4.948090553283691, + "learning_rate": 7.08866158496306e-05, + "loss": 2.1235, + "step": 5085 + }, + { + "epoch": 0.3828449914375506, + "grad_norm": 4.929243087768555, + "learning_rate": 7.087553894914107e-05, + "loss": 2.1283, + "step": 5086 + }, + { + "epoch": 0.38292026571821075, + "grad_norm": 6.278349876403809, + "learning_rate": 7.086446080766063e-05, + "loss": 2.1767, + "step": 5087 + }, + { + "epoch": 0.38299553999887087, + "grad_norm": 5.265294075012207, + "learning_rate": 7.085338142584788e-05, + "loss": 2.2872, + "step": 5088 + }, + { + "epoch": 0.38307081427953105, + "grad_norm": 4.189830303192139, + "learning_rate": 7.084230080436142e-05, + "loss": 1.9563, + "step": 5089 + }, + { + "epoch": 0.3831460885601912, + "grad_norm": 3.9800798892974854, + "learning_rate": 7.083121894386e-05, + "loss": 1.8569, + "step": 5090 + }, + { + "epoch": 0.38322136284085134, + "grad_norm": 5.580798625946045, + "learning_rate": 7.082013584500238e-05, + "loss": 1.9092, + "step": 5091 + }, + { + "epoch": 0.3832966371215115, + "grad_norm": 4.221299648284912, + "learning_rate": 7.080905150844742e-05, + "loss": 1.7303, + "step": 5092 + }, + { + "epoch": 0.38337191140217164, + "grad_norm": 4.8832011222839355, + "learning_rate": 7.079796593485405e-05, + "loss": 2.0758, + "step": 5093 + }, + { + "epoch": 0.3834471856828318, + "grad_norm": 7.229682445526123, + "learning_rate": 7.07868791248813e-05, + "loss": 2.0081, + "step": 5094 + }, + { + "epoch": 0.383522459963492, + "grad_norm": 6.102149486541748, + "learning_rate": 7.077579107918821e-05, + "loss": 2.0254, + "step": 5095 + }, + { + "epoch": 0.3835977342441521, + "grad_norm": 5.362575054168701, + "learning_rate": 7.076470179843396e-05, + "loss": 2.1112, + "step": 5096 + }, + { + "epoch": 0.3836730085248123, + "grad_norm": 4.909099102020264, + "learning_rate": 7.075361128327777e-05, + "loss": 2.067, + "step": 5097 + }, + { + "epoch": 0.38374828280547246, + "grad_norm": 5.938938140869141, + "learning_rate": 7.074251953437893e-05, + "loss": 2.3223, + "step": 5098 + }, + { + "epoch": 0.3838235570861326, + "grad_norm": 5.339906215667725, + "learning_rate": 7.073142655239684e-05, + "loss": 1.8494, + "step": 5099 + }, + { + "epoch": 0.38389883136679276, + "grad_norm": 6.356232166290283, + "learning_rate": 7.072033233799091e-05, + "loss": 1.827, + "step": 5100 + }, + { + "epoch": 0.3839741056474529, + "grad_norm": 4.398833751678467, + "learning_rate": 7.07092368918207e-05, + "loss": 2.1328, + "step": 5101 + }, + { + "epoch": 0.38404937992811305, + "grad_norm": 7.725823879241943, + "learning_rate": 7.069814021454576e-05, + "loss": 1.9501, + "step": 5102 + }, + { + "epoch": 0.38412465420877323, + "grad_norm": 4.908069133758545, + "learning_rate": 7.068704230682576e-05, + "loss": 2.18, + "step": 5103 + }, + { + "epoch": 0.38419992848943335, + "grad_norm": 4.4095306396484375, + "learning_rate": 7.067594316932047e-05, + "loss": 1.8836, + "step": 5104 + }, + { + "epoch": 0.3842752027700935, + "grad_norm": 5.494690895080566, + "learning_rate": 7.066484280268968e-05, + "loss": 1.8885, + "step": 5105 + }, + { + "epoch": 0.3843504770507537, + "grad_norm": 4.715152263641357, + "learning_rate": 7.06537412075933e-05, + "loss": 2.1896, + "step": 5106 + }, + { + "epoch": 0.3844257513314138, + "grad_norm": 3.8597936630249023, + "learning_rate": 7.064263838469124e-05, + "loss": 1.6586, + "step": 5107 + }, + { + "epoch": 0.384501025612074, + "grad_norm": 5.016448020935059, + "learning_rate": 7.063153433464356e-05, + "loss": 1.9668, + "step": 5108 + }, + { + "epoch": 0.3845762998927342, + "grad_norm": 6.7499566078186035, + "learning_rate": 7.062042905811037e-05, + "loss": 2.1402, + "step": 5109 + }, + { + "epoch": 0.3846515741733943, + "grad_norm": 3.981473445892334, + "learning_rate": 7.060932255575183e-05, + "loss": 1.9862, + "step": 5110 + }, + { + "epoch": 0.38472684845405447, + "grad_norm": 4.7544732093811035, + "learning_rate": 7.059821482822821e-05, + "loss": 1.7379, + "step": 5111 + }, + { + "epoch": 0.3848021227347146, + "grad_norm": 4.64710807800293, + "learning_rate": 7.058710587619982e-05, + "loss": 1.9903, + "step": 5112 + }, + { + "epoch": 0.38487739701537477, + "grad_norm": 4.240914344787598, + "learning_rate": 7.057599570032706e-05, + "loss": 1.9765, + "step": 5113 + }, + { + "epoch": 0.38495267129603494, + "grad_norm": 4.636510848999023, + "learning_rate": 7.056488430127039e-05, + "loss": 1.9417, + "step": 5114 + }, + { + "epoch": 0.38502794557669506, + "grad_norm": 3.8164968490600586, + "learning_rate": 7.055377167969035e-05, + "loss": 2.0084, + "step": 5115 + }, + { + "epoch": 0.38510321985735524, + "grad_norm": 4.97038459777832, + "learning_rate": 7.054265783624756e-05, + "loss": 2.5179, + "step": 5116 + }, + { + "epoch": 0.3851784941380154, + "grad_norm": 6.12393045425415, + "learning_rate": 7.053154277160272e-05, + "loss": 2.1607, + "step": 5117 + }, + { + "epoch": 0.38525376841867554, + "grad_norm": 6.2277421951293945, + "learning_rate": 7.052042648641656e-05, + "loss": 2.1662, + "step": 5118 + }, + { + "epoch": 0.3853290426993357, + "grad_norm": 4.501318454742432, + "learning_rate": 7.050930898134993e-05, + "loss": 1.7508, + "step": 5119 + }, + { + "epoch": 0.3854043169799959, + "grad_norm": 4.644231796264648, + "learning_rate": 7.049819025706373e-05, + "loss": 2.449, + "step": 5120 + }, + { + "epoch": 0.385479591260656, + "grad_norm": 4.109147548675537, + "learning_rate": 7.048707031421893e-05, + "loss": 1.8091, + "step": 5121 + }, + { + "epoch": 0.3855548655413162, + "grad_norm": 4.13007926940918, + "learning_rate": 7.047594915347658e-05, + "loss": 1.6222, + "step": 5122 + }, + { + "epoch": 0.3856301398219763, + "grad_norm": 8.340609550476074, + "learning_rate": 7.046482677549782e-05, + "loss": 1.9621, + "step": 5123 + }, + { + "epoch": 0.3857054141026365, + "grad_norm": 3.7131779193878174, + "learning_rate": 7.045370318094382e-05, + "loss": 1.9834, + "step": 5124 + }, + { + "epoch": 0.38578068838329665, + "grad_norm": 3.881448745727539, + "learning_rate": 7.044257837047585e-05, + "loss": 1.7421, + "step": 5125 + }, + { + "epoch": 0.3858559626639568, + "grad_norm": 3.990791082382202, + "learning_rate": 7.043145234475526e-05, + "loss": 2.2056, + "step": 5126 + }, + { + "epoch": 0.38593123694461695, + "grad_norm": 5.629392623901367, + "learning_rate": 7.042032510444343e-05, + "loss": 2.1236, + "step": 5127 + }, + { + "epoch": 0.3860065112252771, + "grad_norm": 4.396368503570557, + "learning_rate": 7.04091966502019e-05, + "loss": 1.8319, + "step": 5128 + }, + { + "epoch": 0.38608178550593725, + "grad_norm": 4.149318218231201, + "learning_rate": 7.039806698269216e-05, + "loss": 1.922, + "step": 5129 + }, + { + "epoch": 0.3861570597865974, + "grad_norm": 3.678591251373291, + "learning_rate": 7.038693610257588e-05, + "loss": 2.5824, + "step": 5130 + }, + { + "epoch": 0.38623233406725754, + "grad_norm": 3.483721971511841, + "learning_rate": 7.037580401051474e-05, + "loss": 1.9086, + "step": 5131 + }, + { + "epoch": 0.3863076083479177, + "grad_norm": 4.043239593505859, + "learning_rate": 7.036467070717053e-05, + "loss": 2.0304, + "step": 5132 + }, + { + "epoch": 0.3863828826285779, + "grad_norm": 5.411766529083252, + "learning_rate": 7.035353619320507e-05, + "loss": 2.0291, + "step": 5133 + }, + { + "epoch": 0.386458156909238, + "grad_norm": 6.395378589630127, + "learning_rate": 7.03424004692803e-05, + "loss": 1.912, + "step": 5134 + }, + { + "epoch": 0.3865334311898982, + "grad_norm": 5.284376621246338, + "learning_rate": 7.033126353605817e-05, + "loss": 2.0315, + "step": 5135 + }, + { + "epoch": 0.38660870547055837, + "grad_norm": 11.093127250671387, + "learning_rate": 7.032012539420076e-05, + "loss": 1.8797, + "step": 5136 + }, + { + "epoch": 0.3866839797512185, + "grad_norm": 4.60587739944458, + "learning_rate": 7.03089860443702e-05, + "loss": 2.2172, + "step": 5137 + }, + { + "epoch": 0.38675925403187866, + "grad_norm": 4.815913200378418, + "learning_rate": 7.029784548722871e-05, + "loss": 2.1422, + "step": 5138 + }, + { + "epoch": 0.38683452831253884, + "grad_norm": 3.66540789604187, + "learning_rate": 7.028670372343853e-05, + "loss": 1.8488, + "step": 5139 + }, + { + "epoch": 0.38690980259319896, + "grad_norm": 5.9024248123168945, + "learning_rate": 7.027556075366202e-05, + "loss": 2.1104, + "step": 5140 + }, + { + "epoch": 0.38698507687385914, + "grad_norm": 3.7674996852874756, + "learning_rate": 7.026441657856162e-05, + "loss": 1.8183, + "step": 5141 + }, + { + "epoch": 0.38706035115451926, + "grad_norm": 6.17453670501709, + "learning_rate": 7.025327119879979e-05, + "loss": 2.0484, + "step": 5142 + }, + { + "epoch": 0.38713562543517943, + "grad_norm": 3.8538362979888916, + "learning_rate": 7.02421246150391e-05, + "loss": 1.8584, + "step": 5143 + }, + { + "epoch": 0.3872108997158396, + "grad_norm": 5.662238597869873, + "learning_rate": 7.023097682794217e-05, + "loss": 2.1748, + "step": 5144 + }, + { + "epoch": 0.3872861739964997, + "grad_norm": 3.8990836143493652, + "learning_rate": 7.021982783817172e-05, + "loss": 1.8932, + "step": 5145 + }, + { + "epoch": 0.3873614482771599, + "grad_norm": 3.45202374458313, + "learning_rate": 7.020867764639054e-05, + "loss": 2.1976, + "step": 5146 + }, + { + "epoch": 0.3874367225578201, + "grad_norm": 7.609501838684082, + "learning_rate": 7.019752625326145e-05, + "loss": 2.2901, + "step": 5147 + }, + { + "epoch": 0.3875119968384802, + "grad_norm": 4.57600212097168, + "learning_rate": 7.018637365944737e-05, + "loss": 1.9127, + "step": 5148 + }, + { + "epoch": 0.3875872711191404, + "grad_norm": 5.195226192474365, + "learning_rate": 7.017521986561131e-05, + "loss": 1.7377, + "step": 5149 + }, + { + "epoch": 0.3876625453998005, + "grad_norm": 4.835358619689941, + "learning_rate": 7.016406487241632e-05, + "loss": 1.7477, + "step": 5150 + }, + { + "epoch": 0.38773781968046067, + "grad_norm": 5.3273539543151855, + "learning_rate": 7.01529086805255e-05, + "loss": 1.6292, + "step": 5151 + }, + { + "epoch": 0.38781309396112085, + "grad_norm": 4.562398910522461, + "learning_rate": 7.014175129060212e-05, + "loss": 1.8722, + "step": 5152 + }, + { + "epoch": 0.38788836824178097, + "grad_norm": 3.4114813804626465, + "learning_rate": 7.01305927033094e-05, + "loss": 1.9809, + "step": 5153 + }, + { + "epoch": 0.38796364252244114, + "grad_norm": 5.234886646270752, + "learning_rate": 7.011943291931071e-05, + "loss": 2.1683, + "step": 5154 + }, + { + "epoch": 0.3880389168031013, + "grad_norm": 4.3607707023620605, + "learning_rate": 7.010827193926947e-05, + "loss": 1.7392, + "step": 5155 + }, + { + "epoch": 0.38811419108376144, + "grad_norm": 4.793964862823486, + "learning_rate": 7.009710976384915e-05, + "loss": 1.9888, + "step": 5156 + }, + { + "epoch": 0.3881894653644216, + "grad_norm": 3.9515368938446045, + "learning_rate": 7.008594639371333e-05, + "loss": 2.1223, + "step": 5157 + }, + { + "epoch": 0.3882647396450818, + "grad_norm": 4.231078147888184, + "learning_rate": 7.00747818295256e-05, + "loss": 2.0826, + "step": 5158 + }, + { + "epoch": 0.3883400139257419, + "grad_norm": 3.7947447299957275, + "learning_rate": 7.006361607194972e-05, + "loss": 1.8089, + "step": 5159 + }, + { + "epoch": 0.3884152882064021, + "grad_norm": 5.510124206542969, + "learning_rate": 7.005244912164942e-05, + "loss": 2.1654, + "step": 5160 + }, + { + "epoch": 0.3884905624870622, + "grad_norm": 3.541808605194092, + "learning_rate": 7.004128097928857e-05, + "loss": 1.8131, + "step": 5161 + }, + { + "epoch": 0.3885658367677224, + "grad_norm": 5.766353607177734, + "learning_rate": 7.003011164553107e-05, + "loss": 1.8511, + "step": 5162 + }, + { + "epoch": 0.38864111104838256, + "grad_norm": 4.433131217956543, + "learning_rate": 7.00189411210409e-05, + "loss": 1.8669, + "step": 5163 + }, + { + "epoch": 0.3887163853290427, + "grad_norm": 4.663097858428955, + "learning_rate": 7.000776940648213e-05, + "loss": 2.4336, + "step": 5164 + }, + { + "epoch": 0.38879165960970286, + "grad_norm": 5.722248077392578, + "learning_rate": 6.999659650251885e-05, + "loss": 2.1351, + "step": 5165 + }, + { + "epoch": 0.38886693389036303, + "grad_norm": 3.8639774322509766, + "learning_rate": 6.998542240981531e-05, + "loss": 1.7186, + "step": 5166 + }, + { + "epoch": 0.38894220817102315, + "grad_norm": 3.6881942749023438, + "learning_rate": 6.997424712903576e-05, + "loss": 1.6883, + "step": 5167 + }, + { + "epoch": 0.3890174824516833, + "grad_norm": 3.686176061630249, + "learning_rate": 6.99630706608445e-05, + "loss": 2.6687, + "step": 5168 + }, + { + "epoch": 0.3890927567323435, + "grad_norm": 3.8122544288635254, + "learning_rate": 6.995189300590599e-05, + "loss": 2.1757, + "step": 5169 + }, + { + "epoch": 0.3891680310130036, + "grad_norm": 3.9678571224212646, + "learning_rate": 6.994071416488468e-05, + "loss": 1.5493, + "step": 5170 + }, + { + "epoch": 0.3892433052936638, + "grad_norm": 4.851013660430908, + "learning_rate": 6.992953413844514e-05, + "loss": 2.1696, + "step": 5171 + }, + { + "epoch": 0.3893185795743239, + "grad_norm": 4.309285640716553, + "learning_rate": 6.991835292725197e-05, + "loss": 1.8684, + "step": 5172 + }, + { + "epoch": 0.3893938538549841, + "grad_norm": 5.811774253845215, + "learning_rate": 6.990717053196987e-05, + "loss": 1.7465, + "step": 5173 + }, + { + "epoch": 0.38946912813564427, + "grad_norm": 5.433722019195557, + "learning_rate": 6.989598695326363e-05, + "loss": 2.09, + "step": 5174 + }, + { + "epoch": 0.3895444024163044, + "grad_norm": 4.857303142547607, + "learning_rate": 6.988480219179801e-05, + "loss": 2.0593, + "step": 5175 + }, + { + "epoch": 0.38961967669696457, + "grad_norm": 4.678370475769043, + "learning_rate": 6.987361624823797e-05, + "loss": 2.125, + "step": 5176 + }, + { + "epoch": 0.38969495097762474, + "grad_norm": 4.956774711608887, + "learning_rate": 6.986242912324847e-05, + "loss": 1.8494, + "step": 5177 + }, + { + "epoch": 0.38977022525828486, + "grad_norm": 5.266158103942871, + "learning_rate": 6.985124081749457e-05, + "loss": 2.9335, + "step": 5178 + }, + { + "epoch": 0.38984549953894504, + "grad_norm": 4.795161724090576, + "learning_rate": 6.984005133164134e-05, + "loss": 1.8209, + "step": 5179 + }, + { + "epoch": 0.38992077381960516, + "grad_norm": 4.719874382019043, + "learning_rate": 6.982886066635399e-05, + "loss": 2.1454, + "step": 5180 + }, + { + "epoch": 0.38999604810026534, + "grad_norm": 5.314960956573486, + "learning_rate": 6.981766882229777e-05, + "loss": 2.0495, + "step": 5181 + }, + { + "epoch": 0.3900713223809255, + "grad_norm": 5.239182949066162, + "learning_rate": 6.9806475800138e-05, + "loss": 2.215, + "step": 5182 + }, + { + "epoch": 0.39014659666158563, + "grad_norm": 6.457382678985596, + "learning_rate": 6.979528160054009e-05, + "loss": 1.8731, + "step": 5183 + }, + { + "epoch": 0.3902218709422458, + "grad_norm": 4.513756275177002, + "learning_rate": 6.978408622416948e-05, + "loss": 2.2722, + "step": 5184 + }, + { + "epoch": 0.390297145222906, + "grad_norm": 4.637709617614746, + "learning_rate": 6.977288967169171e-05, + "loss": 1.953, + "step": 5185 + }, + { + "epoch": 0.3903724195035661, + "grad_norm": 4.369754791259766, + "learning_rate": 6.976169194377238e-05, + "loss": 2.1536, + "step": 5186 + }, + { + "epoch": 0.3904476937842263, + "grad_norm": 5.093334197998047, + "learning_rate": 6.975049304107716e-05, + "loss": 2.1538, + "step": 5187 + }, + { + "epoch": 0.39052296806488646, + "grad_norm": 3.9879164695739746, + "learning_rate": 6.973929296427181e-05, + "loss": 1.7484, + "step": 5188 + }, + { + "epoch": 0.3905982423455466, + "grad_norm": 6.970964431762695, + "learning_rate": 6.972809171402213e-05, + "loss": 2.1684, + "step": 5189 + }, + { + "epoch": 0.39067351662620675, + "grad_norm": 6.39085054397583, + "learning_rate": 6.9716889290994e-05, + "loss": 1.8039, + "step": 5190 + }, + { + "epoch": 0.39074879090686687, + "grad_norm": 4.7798638343811035, + "learning_rate": 6.970568569585338e-05, + "loss": 1.8142, + "step": 5191 + }, + { + "epoch": 0.39082406518752705, + "grad_norm": 4.437145709991455, + "learning_rate": 6.969448092926629e-05, + "loss": 1.9917, + "step": 5192 + }, + { + "epoch": 0.3908993394681872, + "grad_norm": 4.2501630783081055, + "learning_rate": 6.968327499189879e-05, + "loss": 1.9556, + "step": 5193 + }, + { + "epoch": 0.39097461374884734, + "grad_norm": 4.457305431365967, + "learning_rate": 6.96720678844171e-05, + "loss": 1.986, + "step": 5194 + }, + { + "epoch": 0.3910498880295075, + "grad_norm": 4.49616813659668, + "learning_rate": 6.966085960748741e-05, + "loss": 2.2419, + "step": 5195 + }, + { + "epoch": 0.3911251623101677, + "grad_norm": 4.975657939910889, + "learning_rate": 6.964965016177603e-05, + "loss": 1.7282, + "step": 5196 + }, + { + "epoch": 0.3912004365908278, + "grad_norm": 4.2967987060546875, + "learning_rate": 6.963843954794935e-05, + "loss": 2.0065, + "step": 5197 + }, + { + "epoch": 0.391275710871488, + "grad_norm": 4.789773941040039, + "learning_rate": 6.962722776667376e-05, + "loss": 2.0844, + "step": 5198 + }, + { + "epoch": 0.39135098515214817, + "grad_norm": 6.244070529937744, + "learning_rate": 6.961601481861581e-05, + "loss": 1.9259, + "step": 5199 + }, + { + "epoch": 0.3914262594328083, + "grad_norm": 9.858591079711914, + "learning_rate": 6.960480070444205e-05, + "loss": 1.8169, + "step": 5200 + }, + { + "epoch": 0.39150153371346846, + "grad_norm": 4.541567802429199, + "learning_rate": 6.959358542481915e-05, + "loss": 2.2345, + "step": 5201 + }, + { + "epoch": 0.3915768079941286, + "grad_norm": 3.4116322994232178, + "learning_rate": 6.958236898041382e-05, + "loss": 1.9152, + "step": 5202 + }, + { + "epoch": 0.39165208227478876, + "grad_norm": 6.525388717651367, + "learning_rate": 6.957115137189285e-05, + "loss": 2.0367, + "step": 5203 + }, + { + "epoch": 0.39172735655544894, + "grad_norm": 4.803116798400879, + "learning_rate": 6.955993259992306e-05, + "loss": 2.0156, + "step": 5204 + }, + { + "epoch": 0.39180263083610906, + "grad_norm": 3.4431774616241455, + "learning_rate": 6.954871266517143e-05, + "loss": 1.9005, + "step": 5205 + }, + { + "epoch": 0.39187790511676923, + "grad_norm": 4.246635437011719, + "learning_rate": 6.95374915683049e-05, + "loss": 2.4181, + "step": 5206 + }, + { + "epoch": 0.3919531793974294, + "grad_norm": 5.497817039489746, + "learning_rate": 6.952626930999058e-05, + "loss": 1.8896, + "step": 5207 + }, + { + "epoch": 0.39202845367808953, + "grad_norm": 3.530252695083618, + "learning_rate": 6.951504589089555e-05, + "loss": 2.2551, + "step": 5208 + }, + { + "epoch": 0.3921037279587497, + "grad_norm": 5.155354976654053, + "learning_rate": 6.950382131168705e-05, + "loss": 1.8505, + "step": 5209 + }, + { + "epoch": 0.3921790022394098, + "grad_norm": 5.639703750610352, + "learning_rate": 6.949259557303233e-05, + "loss": 2.0545, + "step": 5210 + }, + { + "epoch": 0.39225427652007, + "grad_norm": 3.528696060180664, + "learning_rate": 6.948136867559874e-05, + "loss": 1.894, + "step": 5211 + }, + { + "epoch": 0.3923295508007302, + "grad_norm": 4.529226779937744, + "learning_rate": 6.947014062005368e-05, + "loss": 2.0473, + "step": 5212 + }, + { + "epoch": 0.3924048250813903, + "grad_norm": 3.6226611137390137, + "learning_rate": 6.945891140706462e-05, + "loss": 1.957, + "step": 5213 + }, + { + "epoch": 0.39248009936205047, + "grad_norm": 4.451925754547119, + "learning_rate": 6.944768103729913e-05, + "loss": 2.2745, + "step": 5214 + }, + { + "epoch": 0.39255537364271065, + "grad_norm": 4.6399054527282715, + "learning_rate": 6.943644951142478e-05, + "loss": 2.1126, + "step": 5215 + }, + { + "epoch": 0.39263064792337077, + "grad_norm": 4.229841709136963, + "learning_rate": 6.942521683010928e-05, + "loss": 2.0751, + "step": 5216 + }, + { + "epoch": 0.39270592220403094, + "grad_norm": 4.860999584197998, + "learning_rate": 6.941398299402039e-05, + "loss": 1.9959, + "step": 5217 + }, + { + "epoch": 0.3927811964846911, + "grad_norm": 6.332437515258789, + "learning_rate": 6.940274800382591e-05, + "loss": 1.9549, + "step": 5218 + }, + { + "epoch": 0.39285647076535124, + "grad_norm": 4.228542327880859, + "learning_rate": 6.939151186019373e-05, + "loss": 1.6997, + "step": 5219 + }, + { + "epoch": 0.3929317450460114, + "grad_norm": 6.465989589691162, + "learning_rate": 6.938027456379182e-05, + "loss": 1.8414, + "step": 5220 + }, + { + "epoch": 0.39300701932667154, + "grad_norm": 3.890904188156128, + "learning_rate": 6.936903611528818e-05, + "loss": 2.1604, + "step": 5221 + }, + { + "epoch": 0.3930822936073317, + "grad_norm": 4.503025054931641, + "learning_rate": 6.935779651535093e-05, + "loss": 2.1503, + "step": 5222 + }, + { + "epoch": 0.3931575678879919, + "grad_norm": 4.028709888458252, + "learning_rate": 6.934655576464822e-05, + "loss": 1.9473, + "step": 5223 + }, + { + "epoch": 0.393232842168652, + "grad_norm": 4.7539591789245605, + "learning_rate": 6.933531386384827e-05, + "loss": 1.6832, + "step": 5224 + }, + { + "epoch": 0.3933081164493122, + "grad_norm": 4.165890216827393, + "learning_rate": 6.93240708136194e-05, + "loss": 1.7192, + "step": 5225 + }, + { + "epoch": 0.39338339072997236, + "grad_norm": 4.968687534332275, + "learning_rate": 6.931282661462997e-05, + "loss": 1.9188, + "step": 5226 + }, + { + "epoch": 0.3934586650106325, + "grad_norm": 4.237826824188232, + "learning_rate": 6.93015812675484e-05, + "loss": 2.2298, + "step": 5227 + }, + { + "epoch": 0.39353393929129266, + "grad_norm": 5.553686141967773, + "learning_rate": 6.929033477304323e-05, + "loss": 2.0493, + "step": 5228 + }, + { + "epoch": 0.3936092135719528, + "grad_norm": 7.572820663452148, + "learning_rate": 6.927908713178299e-05, + "loss": 2.5351, + "step": 5229 + }, + { + "epoch": 0.39368448785261295, + "grad_norm": 3.9996278285980225, + "learning_rate": 6.926783834443634e-05, + "loss": 1.9888, + "step": 5230 + }, + { + "epoch": 0.39375976213327313, + "grad_norm": 5.510247707366943, + "learning_rate": 6.925658841167197e-05, + "loss": 2.1663, + "step": 5231 + }, + { + "epoch": 0.39383503641393325, + "grad_norm": 6.237199306488037, + "learning_rate": 6.92453373341587e-05, + "loss": 1.8505, + "step": 5232 + }, + { + "epoch": 0.3939103106945934, + "grad_norm": 4.817835807800293, + "learning_rate": 6.923408511256533e-05, + "loss": 1.8843, + "step": 5233 + }, + { + "epoch": 0.3939855849752536, + "grad_norm": 6.970422267913818, + "learning_rate": 6.922283174756081e-05, + "loss": 1.7876, + "step": 5234 + }, + { + "epoch": 0.3940608592559137, + "grad_norm": 4.317952632904053, + "learning_rate": 6.921157723981408e-05, + "loss": 1.9553, + "step": 5235 + }, + { + "epoch": 0.3941361335365739, + "grad_norm": 5.635189533233643, + "learning_rate": 6.920032158999422e-05, + "loss": 1.6901, + "step": 5236 + }, + { + "epoch": 0.39421140781723407, + "grad_norm": 5.372305870056152, + "learning_rate": 6.918906479877032e-05, + "loss": 1.9993, + "step": 5237 + }, + { + "epoch": 0.3942866820978942, + "grad_norm": 4.717647075653076, + "learning_rate": 6.917780686681158e-05, + "loss": 2.064, + "step": 5238 + }, + { + "epoch": 0.39436195637855437, + "grad_norm": 5.763543128967285, + "learning_rate": 6.916654779478725e-05, + "loss": 1.942, + "step": 5239 + }, + { + "epoch": 0.3944372306592145, + "grad_norm": 4.852224349975586, + "learning_rate": 6.915528758336665e-05, + "loss": 2.0205, + "step": 5240 + }, + { + "epoch": 0.39451250493987466, + "grad_norm": 6.8463358879089355, + "learning_rate": 6.914402623321916e-05, + "loss": 2.2348, + "step": 5241 + }, + { + "epoch": 0.39458777922053484, + "grad_norm": 6.190590858459473, + "learning_rate": 6.913276374501425e-05, + "loss": 2.3121, + "step": 5242 + }, + { + "epoch": 0.39466305350119496, + "grad_norm": 5.485639572143555, + "learning_rate": 6.912150011942143e-05, + "loss": 1.8485, + "step": 5243 + }, + { + "epoch": 0.39473832778185514, + "grad_norm": 5.057278156280518, + "learning_rate": 6.911023535711029e-05, + "loss": 2.0078, + "step": 5244 + }, + { + "epoch": 0.3948136020625153, + "grad_norm": 4.9163665771484375, + "learning_rate": 6.90989694587505e-05, + "loss": 1.8428, + "step": 5245 + }, + { + "epoch": 0.39488887634317543, + "grad_norm": 5.255848407745361, + "learning_rate": 6.908770242501176e-05, + "loss": 1.9317, + "step": 5246 + }, + { + "epoch": 0.3949641506238356, + "grad_norm": 4.284176826477051, + "learning_rate": 6.90764342565639e-05, + "loss": 2.1024, + "step": 5247 + }, + { + "epoch": 0.3950394249044958, + "grad_norm": 5.583375453948975, + "learning_rate": 6.906516495407674e-05, + "loss": 1.703, + "step": 5248 + }, + { + "epoch": 0.3951146991851559, + "grad_norm": 5.231213569641113, + "learning_rate": 6.905389451822024e-05, + "loss": 2.2031, + "step": 5249 + }, + { + "epoch": 0.3951899734658161, + "grad_norm": 5.470889091491699, + "learning_rate": 6.90426229496644e-05, + "loss": 2.0737, + "step": 5250 + }, + { + "epoch": 0.3952652477464762, + "grad_norm": 6.145623683929443, + "learning_rate": 6.903135024907923e-05, + "loss": 2.2188, + "step": 5251 + }, + { + "epoch": 0.3953405220271364, + "grad_norm": 3.9044687747955322, + "learning_rate": 6.902007641713492e-05, + "loss": 2.0435, + "step": 5252 + }, + { + "epoch": 0.39541579630779655, + "grad_norm": 3.5779004096984863, + "learning_rate": 6.900880145450163e-05, + "loss": 1.8906, + "step": 5253 + }, + { + "epoch": 0.3954910705884567, + "grad_norm": 3.3963522911071777, + "learning_rate": 6.899752536184964e-05, + "loss": 1.9346, + "step": 5254 + }, + { + "epoch": 0.39556634486911685, + "grad_norm": 4.952561855316162, + "learning_rate": 6.898624813984928e-05, + "loss": 2.2904, + "step": 5255 + }, + { + "epoch": 0.395641619149777, + "grad_norm": 3.7818069458007812, + "learning_rate": 6.897496978917093e-05, + "loss": 2.1326, + "step": 5256 + }, + { + "epoch": 0.39571689343043714, + "grad_norm": 5.169959545135498, + "learning_rate": 6.896369031048508e-05, + "loss": 2.3314, + "step": 5257 + }, + { + "epoch": 0.3957921677110973, + "grad_norm": 5.590060234069824, + "learning_rate": 6.895240970446226e-05, + "loss": 2.2577, + "step": 5258 + }, + { + "epoch": 0.39586744199175744, + "grad_norm": 5.482585906982422, + "learning_rate": 6.894112797177306e-05, + "loss": 2.1709, + "step": 5259 + }, + { + "epoch": 0.3959427162724176, + "grad_norm": 5.6133036613464355, + "learning_rate": 6.892984511308814e-05, + "loss": 1.9704, + "step": 5260 + }, + { + "epoch": 0.3960179905530778, + "grad_norm": 4.960308074951172, + "learning_rate": 6.891856112907827e-05, + "loss": 1.8028, + "step": 5261 + }, + { + "epoch": 0.3960932648337379, + "grad_norm": 5.726239204406738, + "learning_rate": 6.89072760204142e-05, + "loss": 1.9112, + "step": 5262 + }, + { + "epoch": 0.3961685391143981, + "grad_norm": 5.848134994506836, + "learning_rate": 6.889598978776684e-05, + "loss": 1.7987, + "step": 5263 + }, + { + "epoch": 0.39624381339505826, + "grad_norm": 3.87086820602417, + "learning_rate": 6.88847024318071e-05, + "loss": 1.9066, + "step": 5264 + }, + { + "epoch": 0.3963190876757184, + "grad_norm": 4.969845294952393, + "learning_rate": 6.887341395320597e-05, + "loss": 1.9808, + "step": 5265 + }, + { + "epoch": 0.39639436195637856, + "grad_norm": 3.9948718547821045, + "learning_rate": 6.886212435263455e-05, + "loss": 2.1369, + "step": 5266 + }, + { + "epoch": 0.39646963623703874, + "grad_norm": 4.366805553436279, + "learning_rate": 6.885083363076395e-05, + "loss": 1.9568, + "step": 5267 + }, + { + "epoch": 0.39654491051769886, + "grad_norm": 4.778541564941406, + "learning_rate": 6.88395417882654e-05, + "loss": 1.6151, + "step": 5268 + }, + { + "epoch": 0.39662018479835903, + "grad_norm": 4.081904888153076, + "learning_rate": 6.882824882581013e-05, + "loss": 2.0432, + "step": 5269 + }, + { + "epoch": 0.39669545907901915, + "grad_norm": 6.065483570098877, + "learning_rate": 6.881695474406949e-05, + "loss": 1.6471, + "step": 5270 + }, + { + "epoch": 0.39677073335967933, + "grad_norm": 3.967003345489502, + "learning_rate": 6.880565954371489e-05, + "loss": 1.7362, + "step": 5271 + }, + { + "epoch": 0.3968460076403395, + "grad_norm": 4.717532157897949, + "learning_rate": 6.87943632254178e-05, + "loss": 2.0467, + "step": 5272 + }, + { + "epoch": 0.3969212819209996, + "grad_norm": 4.1001176834106445, + "learning_rate": 6.87830657898497e-05, + "loss": 1.6473, + "step": 5273 + }, + { + "epoch": 0.3969965562016598, + "grad_norm": 4.454771995544434, + "learning_rate": 6.877176723768228e-05, + "loss": 1.6812, + "step": 5274 + }, + { + "epoch": 0.39707183048232, + "grad_norm": 7.681920051574707, + "learning_rate": 6.876046756958715e-05, + "loss": 2.2034, + "step": 5275 + }, + { + "epoch": 0.3971471047629801, + "grad_norm": 5.013684272766113, + "learning_rate": 6.874916678623603e-05, + "loss": 1.9127, + "step": 5276 + }, + { + "epoch": 0.3972223790436403, + "grad_norm": 6.265958786010742, + "learning_rate": 6.873786488830076e-05, + "loss": 2.0918, + "step": 5277 + }, + { + "epoch": 0.3972976533243004, + "grad_norm": 4.134488105773926, + "learning_rate": 6.87265618764532e-05, + "loss": 1.9597, + "step": 5278 + }, + { + "epoch": 0.39737292760496057, + "grad_norm": 5.5006103515625, + "learning_rate": 6.871525775136525e-05, + "loss": 1.9306, + "step": 5279 + }, + { + "epoch": 0.39744820188562074, + "grad_norm": 6.579580783843994, + "learning_rate": 6.870395251370894e-05, + "loss": 2.1004, + "step": 5280 + }, + { + "epoch": 0.39752347616628086, + "grad_norm": 3.9398036003112793, + "learning_rate": 6.869264616415629e-05, + "loss": 1.7839, + "step": 5281 + }, + { + "epoch": 0.39759875044694104, + "grad_norm": 5.8119401931762695, + "learning_rate": 6.86813387033795e-05, + "loss": 2.0025, + "step": 5282 + }, + { + "epoch": 0.3976740247276012, + "grad_norm": 4.4750494956970215, + "learning_rate": 6.86700301320507e-05, + "loss": 2.4783, + "step": 5283 + }, + { + "epoch": 0.39774929900826134, + "grad_norm": 5.226348876953125, + "learning_rate": 6.865872045084219e-05, + "loss": 2.0954, + "step": 5284 + }, + { + "epoch": 0.3978245732889215, + "grad_norm": 4.446264266967773, + "learning_rate": 6.864740966042629e-05, + "loss": 1.9985, + "step": 5285 + }, + { + "epoch": 0.3978998475695817, + "grad_norm": 5.967496395111084, + "learning_rate": 6.863609776147539e-05, + "loss": 1.678, + "step": 5286 + }, + { + "epoch": 0.3979751218502418, + "grad_norm": 3.8630638122558594, + "learning_rate": 6.862478475466194e-05, + "loss": 1.8381, + "step": 5287 + }, + { + "epoch": 0.398050396130902, + "grad_norm": 4.604032516479492, + "learning_rate": 6.861347064065849e-05, + "loss": 2.0495, + "step": 5288 + }, + { + "epoch": 0.3981256704115621, + "grad_norm": 7.21726131439209, + "learning_rate": 6.860215542013761e-05, + "loss": 1.9833, + "step": 5289 + }, + { + "epoch": 0.3982009446922223, + "grad_norm": 9.447072982788086, + "learning_rate": 6.859083909377197e-05, + "loss": 2.147, + "step": 5290 + }, + { + "epoch": 0.39827621897288246, + "grad_norm": 5.315767765045166, + "learning_rate": 6.857952166223429e-05, + "loss": 2.1883, + "step": 5291 + }, + { + "epoch": 0.3983514932535426, + "grad_norm": 7.571106910705566, + "learning_rate": 6.856820312619735e-05, + "loss": 1.9042, + "step": 5292 + }, + { + "epoch": 0.39842676753420275, + "grad_norm": 3.8130648136138916, + "learning_rate": 6.855688348633402e-05, + "loss": 1.8368, + "step": 5293 + }, + { + "epoch": 0.39850204181486293, + "grad_norm": 4.890517711639404, + "learning_rate": 6.85455627433172e-05, + "loss": 1.7831, + "step": 5294 + }, + { + "epoch": 0.39857731609552305, + "grad_norm": 5.495864391326904, + "learning_rate": 6.85342408978199e-05, + "loss": 1.6298, + "step": 5295 + }, + { + "epoch": 0.3986525903761832, + "grad_norm": 5.847315788269043, + "learning_rate": 6.852291795051518e-05, + "loss": 2.0748, + "step": 5296 + }, + { + "epoch": 0.3987278646568434, + "grad_norm": 4.205320835113525, + "learning_rate": 6.851159390207611e-05, + "loss": 2.2681, + "step": 5297 + }, + { + "epoch": 0.3988031389375035, + "grad_norm": 4.053877830505371, + "learning_rate": 6.85002687531759e-05, + "loss": 1.7144, + "step": 5298 + }, + { + "epoch": 0.3988784132181637, + "grad_norm": 5.00118350982666, + "learning_rate": 6.848894250448778e-05, + "loss": 1.7858, + "step": 5299 + }, + { + "epoch": 0.3989536874988238, + "grad_norm": 4.14704704284668, + "learning_rate": 6.847761515668511e-05, + "loss": 1.6874, + "step": 5300 + }, + { + "epoch": 0.399028961779484, + "grad_norm": 3.2100651264190674, + "learning_rate": 6.846628671044121e-05, + "loss": 1.7546, + "step": 5301 + }, + { + "epoch": 0.39910423606014417, + "grad_norm": 5.690414905548096, + "learning_rate": 6.845495716642958e-05, + "loss": 2.5446, + "step": 5302 + }, + { + "epoch": 0.3991795103408043, + "grad_norm": 7.436687469482422, + "learning_rate": 6.844362652532368e-05, + "loss": 1.8027, + "step": 5303 + }, + { + "epoch": 0.39925478462146446, + "grad_norm": 4.503210067749023, + "learning_rate": 6.843229478779712e-05, + "loss": 2.0429, + "step": 5304 + }, + { + "epoch": 0.39933005890212464, + "grad_norm": 4.755377769470215, + "learning_rate": 6.84209619545235e-05, + "loss": 2.2132, + "step": 5305 + }, + { + "epoch": 0.39940533318278476, + "grad_norm": 3.7669692039489746, + "learning_rate": 6.840962802617656e-05, + "loss": 1.5255, + "step": 5306 + }, + { + "epoch": 0.39948060746344494, + "grad_norm": 4.223198413848877, + "learning_rate": 6.839829300343006e-05, + "loss": 2.0126, + "step": 5307 + }, + { + "epoch": 0.39955588174410506, + "grad_norm": 4.116264820098877, + "learning_rate": 6.838695688695782e-05, + "loss": 1.6213, + "step": 5308 + }, + { + "epoch": 0.39963115602476523, + "grad_norm": 8.226816177368164, + "learning_rate": 6.837561967743375e-05, + "loss": 2.1123, + "step": 5309 + }, + { + "epoch": 0.3997064303054254, + "grad_norm": 4.63575553894043, + "learning_rate": 6.836428137553184e-05, + "loss": 1.7832, + "step": 5310 + }, + { + "epoch": 0.39978170458608553, + "grad_norm": 5.650008678436279, + "learning_rate": 6.835294198192608e-05, + "loss": 1.6598, + "step": 5311 + }, + { + "epoch": 0.3998569788667457, + "grad_norm": 6.670443534851074, + "learning_rate": 6.834160149729058e-05, + "loss": 1.846, + "step": 5312 + }, + { + "epoch": 0.3999322531474059, + "grad_norm": 4.246932029724121, + "learning_rate": 6.83302599222995e-05, + "loss": 2.2101, + "step": 5313 + }, + { + "epoch": 0.400007527428066, + "grad_norm": 5.359315872192383, + "learning_rate": 6.831891725762706e-05, + "loss": 2.1341, + "step": 5314 + }, + { + "epoch": 0.4000828017087262, + "grad_norm": 4.920767307281494, + "learning_rate": 6.830757350394755e-05, + "loss": 2.1021, + "step": 5315 + }, + { + "epoch": 0.40015807598938635, + "grad_norm": 5.670407772064209, + "learning_rate": 6.829622866193532e-05, + "loss": 1.9613, + "step": 5316 + }, + { + "epoch": 0.4002333502700465, + "grad_norm": 4.997689247131348, + "learning_rate": 6.82848827322648e-05, + "loss": 2.076, + "step": 5317 + }, + { + "epoch": 0.40030862455070665, + "grad_norm": 3.870497703552246, + "learning_rate": 6.827353571561048e-05, + "loss": 1.705, + "step": 5318 + }, + { + "epoch": 0.40038389883136677, + "grad_norm": 3.82179594039917, + "learning_rate": 6.826218761264689e-05, + "loss": 1.7637, + "step": 5319 + }, + { + "epoch": 0.40045917311202694, + "grad_norm": 3.977510690689087, + "learning_rate": 6.825083842404862e-05, + "loss": 2.0026, + "step": 5320 + }, + { + "epoch": 0.4005344473926871, + "grad_norm": 3.6797702312469482, + "learning_rate": 6.823948815049039e-05, + "loss": 1.9066, + "step": 5321 + }, + { + "epoch": 0.40060972167334724, + "grad_norm": 4.414495944976807, + "learning_rate": 6.822813679264694e-05, + "loss": 1.7956, + "step": 5322 + }, + { + "epoch": 0.4006849959540074, + "grad_norm": 3.931663990020752, + "learning_rate": 6.821678435119304e-05, + "loss": 1.7816, + "step": 5323 + }, + { + "epoch": 0.4007602702346676, + "grad_norm": 5.739896297454834, + "learning_rate": 6.82054308268036e-05, + "loss": 2.4555, + "step": 5324 + }, + { + "epoch": 0.4008355445153277, + "grad_norm": 4.897427558898926, + "learning_rate": 6.819407622015353e-05, + "loss": 2.11, + "step": 5325 + }, + { + "epoch": 0.4009108187959879, + "grad_norm": 4.48984432220459, + "learning_rate": 6.818272053191781e-05, + "loss": 1.7368, + "step": 5326 + }, + { + "epoch": 0.400986093076648, + "grad_norm": 4.326822757720947, + "learning_rate": 6.817136376277154e-05, + "loss": 1.8739, + "step": 5327 + }, + { + "epoch": 0.4010613673573082, + "grad_norm": 4.570828437805176, + "learning_rate": 6.816000591338985e-05, + "loss": 1.8174, + "step": 5328 + }, + { + "epoch": 0.40113664163796836, + "grad_norm": 4.024326801300049, + "learning_rate": 6.81486469844479e-05, + "loss": 1.655, + "step": 5329 + }, + { + "epoch": 0.4012119159186285, + "grad_norm": 5.480904579162598, + "learning_rate": 6.813728697662096e-05, + "loss": 1.9133, + "step": 5330 + }, + { + "epoch": 0.40128719019928866, + "grad_norm": 4.703399181365967, + "learning_rate": 6.812592589058435e-05, + "loss": 1.7196, + "step": 5331 + }, + { + "epoch": 0.40136246447994883, + "grad_norm": 5.78317928314209, + "learning_rate": 6.811456372701345e-05, + "loss": 2.415, + "step": 5332 + }, + { + "epoch": 0.40143773876060895, + "grad_norm": 4.7734270095825195, + "learning_rate": 6.810320048658372e-05, + "loss": 2.0948, + "step": 5333 + }, + { + "epoch": 0.40151301304126913, + "grad_norm": 5.654014587402344, + "learning_rate": 6.809183616997068e-05, + "loss": 2.3321, + "step": 5334 + }, + { + "epoch": 0.4015882873219293, + "grad_norm": 7.924801349639893, + "learning_rate": 6.808047077784988e-05, + "loss": 1.8561, + "step": 5335 + }, + { + "epoch": 0.4016635616025894, + "grad_norm": 5.06160306930542, + "learning_rate": 6.806910431089696e-05, + "loss": 1.6722, + "step": 5336 + }, + { + "epoch": 0.4017388358832496, + "grad_norm": 4.080567359924316, + "learning_rate": 6.805773676978764e-05, + "loss": 1.8174, + "step": 5337 + }, + { + "epoch": 0.4018141101639097, + "grad_norm": 4.801535129547119, + "learning_rate": 6.804636815519768e-05, + "loss": 1.9026, + "step": 5338 + }, + { + "epoch": 0.4018893844445699, + "grad_norm": 4.634988784790039, + "learning_rate": 6.803499846780292e-05, + "loss": 2.2378, + "step": 5339 + }, + { + "epoch": 0.4019646587252301, + "grad_norm": 4.98685359954834, + "learning_rate": 6.802362770827926e-05, + "loss": 2.1058, + "step": 5340 + }, + { + "epoch": 0.4020399330058902, + "grad_norm": 4.1788153648376465, + "learning_rate": 6.801225587730263e-05, + "loss": 1.6515, + "step": 5341 + }, + { + "epoch": 0.40211520728655037, + "grad_norm": 3.9120664596557617, + "learning_rate": 6.800088297554908e-05, + "loss": 1.9521, + "step": 5342 + }, + { + "epoch": 0.40219048156721054, + "grad_norm": 5.457016944885254, + "learning_rate": 6.798950900369469e-05, + "loss": 1.9686, + "step": 5343 + }, + { + "epoch": 0.40226575584787067, + "grad_norm": 3.8646645545959473, + "learning_rate": 6.79781339624156e-05, + "loss": 2.0856, + "step": 5344 + }, + { + "epoch": 0.40234103012853084, + "grad_norm": 4.47550630569458, + "learning_rate": 6.796675785238804e-05, + "loss": 2.0627, + "step": 5345 + }, + { + "epoch": 0.402416304409191, + "grad_norm": 3.247621536254883, + "learning_rate": 6.795538067428827e-05, + "loss": 1.7987, + "step": 5346 + }, + { + "epoch": 0.40249157868985114, + "grad_norm": 4.706051826477051, + "learning_rate": 6.794400242879264e-05, + "loss": 1.7926, + "step": 5347 + }, + { + "epoch": 0.4025668529705113, + "grad_norm": 4.113757133483887, + "learning_rate": 6.793262311657755e-05, + "loss": 1.6333, + "step": 5348 + }, + { + "epoch": 0.40264212725117143, + "grad_norm": 4.506556987762451, + "learning_rate": 6.792124273831948e-05, + "loss": 2.2044, + "step": 5349 + }, + { + "epoch": 0.4027174015318316, + "grad_norm": 4.144906520843506, + "learning_rate": 6.790986129469494e-05, + "loss": 2.0494, + "step": 5350 + }, + { + "epoch": 0.4027926758124918, + "grad_norm": 5.415107250213623, + "learning_rate": 6.789847878638054e-05, + "loss": 2.0532, + "step": 5351 + }, + { + "epoch": 0.4028679500931519, + "grad_norm": 4.5909342765808105, + "learning_rate": 6.788709521405295e-05, + "loss": 2.2343, + "step": 5352 + }, + { + "epoch": 0.4029432243738121, + "grad_norm": 3.308436870574951, + "learning_rate": 6.787571057838884e-05, + "loss": 2.1883, + "step": 5353 + }, + { + "epoch": 0.40301849865447226, + "grad_norm": 5.77985143661499, + "learning_rate": 6.786432488006503e-05, + "loss": 2.0106, + "step": 5354 + }, + { + "epoch": 0.4030937729351324, + "grad_norm": 5.094431400299072, + "learning_rate": 6.785293811975838e-05, + "loss": 1.9518, + "step": 5355 + }, + { + "epoch": 0.40316904721579255, + "grad_norm": 3.918471574783325, + "learning_rate": 6.784155029814579e-05, + "loss": 1.6179, + "step": 5356 + }, + { + "epoch": 0.4032443214964527, + "grad_norm": 3.814473867416382, + "learning_rate": 6.783016141590422e-05, + "loss": 2.183, + "step": 5357 + }, + { + "epoch": 0.40331959577711285, + "grad_norm": 3.6349940299987793, + "learning_rate": 6.781877147371071e-05, + "loss": 1.9747, + "step": 5358 + }, + { + "epoch": 0.403394870057773, + "grad_norm": 3.963618516921997, + "learning_rate": 6.780738047224237e-05, + "loss": 2.094, + "step": 5359 + }, + { + "epoch": 0.40347014433843315, + "grad_norm": 5.607470989227295, + "learning_rate": 6.779598841217636e-05, + "loss": 2.0079, + "step": 5360 + }, + { + "epoch": 0.4035454186190933, + "grad_norm": 4.573211193084717, + "learning_rate": 6.778459529418989e-05, + "loss": 1.7159, + "step": 5361 + }, + { + "epoch": 0.4036206928997535, + "grad_norm": 4.286177635192871, + "learning_rate": 6.777320111896027e-05, + "loss": 1.9202, + "step": 5362 + }, + { + "epoch": 0.4036959671804136, + "grad_norm": 5.121272563934326, + "learning_rate": 6.776180588716484e-05, + "loss": 1.9964, + "step": 5363 + }, + { + "epoch": 0.4037712414610738, + "grad_norm": 4.205358982086182, + "learning_rate": 6.775040959948101e-05, + "loss": 1.9531, + "step": 5364 + }, + { + "epoch": 0.40384651574173397, + "grad_norm": 4.61204195022583, + "learning_rate": 6.773901225658626e-05, + "loss": 2.2701, + "step": 5365 + }, + { + "epoch": 0.4039217900223941, + "grad_norm": 5.428807258605957, + "learning_rate": 6.772761385915812e-05, + "loss": 1.9785, + "step": 5366 + }, + { + "epoch": 0.40399706430305427, + "grad_norm": 4.795749664306641, + "learning_rate": 6.771621440787423e-05, + "loss": 2.2442, + "step": 5367 + }, + { + "epoch": 0.4040723385837144, + "grad_norm": 4.88709831237793, + "learning_rate": 6.770481390341221e-05, + "loss": 2.1068, + "step": 5368 + }, + { + "epoch": 0.40414761286437456, + "grad_norm": 4.479779243469238, + "learning_rate": 6.76934123464498e-05, + "loss": 2.0085, + "step": 5369 + }, + { + "epoch": 0.40422288714503474, + "grad_norm": 4.843325138092041, + "learning_rate": 6.768200973766478e-05, + "loss": 2.0468, + "step": 5370 + }, + { + "epoch": 0.40429816142569486, + "grad_norm": 7.453130722045898, + "learning_rate": 6.767060607773504e-05, + "loss": 1.9619, + "step": 5371 + }, + { + "epoch": 0.40437343570635503, + "grad_norm": 5.16350793838501, + "learning_rate": 6.765920136733848e-05, + "loss": 2.3995, + "step": 5372 + }, + { + "epoch": 0.4044487099870152, + "grad_norm": 7.323816299438477, + "learning_rate": 6.764779560715305e-05, + "loss": 2.0284, + "step": 5373 + }, + { + "epoch": 0.40452398426767533, + "grad_norm": 5.113553524017334, + "learning_rate": 6.763638879785681e-05, + "loss": 2.0464, + "step": 5374 + }, + { + "epoch": 0.4045992585483355, + "grad_norm": 5.947402477264404, + "learning_rate": 6.762498094012787e-05, + "loss": 1.8261, + "step": 5375 + }, + { + "epoch": 0.4046745328289956, + "grad_norm": 3.4446866512298584, + "learning_rate": 6.761357203464436e-05, + "loss": 1.7482, + "step": 5376 + }, + { + "epoch": 0.4047498071096558, + "grad_norm": 4.794046878814697, + "learning_rate": 6.760216208208456e-05, + "loss": 1.9207, + "step": 5377 + }, + { + "epoch": 0.404825081390316, + "grad_norm": 4.4714765548706055, + "learning_rate": 6.759075108312672e-05, + "loss": 1.7401, + "step": 5378 + }, + { + "epoch": 0.4049003556709761, + "grad_norm": 4.635371208190918, + "learning_rate": 6.757933903844921e-05, + "loss": 1.9601, + "step": 5379 + }, + { + "epoch": 0.4049756299516363, + "grad_norm": 5.2532877922058105, + "learning_rate": 6.756792594873042e-05, + "loss": 2.1482, + "step": 5380 + }, + { + "epoch": 0.40505090423229645, + "grad_norm": 4.8074727058410645, + "learning_rate": 6.755651181464885e-05, + "loss": 2.0965, + "step": 5381 + }, + { + "epoch": 0.40512617851295657, + "grad_norm": 6.030498027801514, + "learning_rate": 6.754509663688303e-05, + "loss": 2.0231, + "step": 5382 + }, + { + "epoch": 0.40520145279361675, + "grad_norm": 4.379759788513184, + "learning_rate": 6.753368041611156e-05, + "loss": 1.7873, + "step": 5383 + }, + { + "epoch": 0.4052767270742769, + "grad_norm": 5.034201622009277, + "learning_rate": 6.752226315301309e-05, + "loss": 1.7356, + "step": 5384 + }, + { + "epoch": 0.40535200135493704, + "grad_norm": 5.684075355529785, + "learning_rate": 6.751084484826635e-05, + "loss": 2.012, + "step": 5385 + }, + { + "epoch": 0.4054272756355972, + "grad_norm": 6.668386936187744, + "learning_rate": 6.749942550255013e-05, + "loss": 2.3243, + "step": 5386 + }, + { + "epoch": 0.40550254991625734, + "grad_norm": 5.3215436935424805, + "learning_rate": 6.748800511654327e-05, + "loss": 2.0491, + "step": 5387 + }, + { + "epoch": 0.4055778241969175, + "grad_norm": 6.198952674865723, + "learning_rate": 6.74765836909247e-05, + "loss": 1.9957, + "step": 5388 + }, + { + "epoch": 0.4056530984775777, + "grad_norm": 4.485803127288818, + "learning_rate": 6.746516122637336e-05, + "loss": 1.8012, + "step": 5389 + }, + { + "epoch": 0.4057283727582378, + "grad_norm": 4.713202476501465, + "learning_rate": 6.745373772356832e-05, + "loss": 1.761, + "step": 5390 + }, + { + "epoch": 0.405803647038898, + "grad_norm": 4.559616565704346, + "learning_rate": 6.744231318318863e-05, + "loss": 1.78, + "step": 5391 + }, + { + "epoch": 0.40587892131955816, + "grad_norm": 4.271080493927002, + "learning_rate": 6.743088760591349e-05, + "loss": 1.7002, + "step": 5392 + }, + { + "epoch": 0.4059541956002183, + "grad_norm": 5.435100078582764, + "learning_rate": 6.741946099242206e-05, + "loss": 1.8004, + "step": 5393 + }, + { + "epoch": 0.40602946988087846, + "grad_norm": 4.465577125549316, + "learning_rate": 6.74080333433937e-05, + "loss": 1.6774, + "step": 5394 + }, + { + "epoch": 0.40610474416153863, + "grad_norm": 7.253365516662598, + "learning_rate": 6.739660465950769e-05, + "loss": 1.8281, + "step": 5395 + }, + { + "epoch": 0.40618001844219875, + "grad_norm": 4.638010025024414, + "learning_rate": 6.738517494144346e-05, + "loss": 2.1896, + "step": 5396 + }, + { + "epoch": 0.40625529272285893, + "grad_norm": 5.373666286468506, + "learning_rate": 6.737374418988044e-05, + "loss": 1.8781, + "step": 5397 + }, + { + "epoch": 0.40633056700351905, + "grad_norm": 5.579836368560791, + "learning_rate": 6.736231240549818e-05, + "loss": 1.9463, + "step": 5398 + }, + { + "epoch": 0.4064058412841792, + "grad_norm": 5.846822261810303, + "learning_rate": 6.735087958897628e-05, + "loss": 1.6214, + "step": 5399 + }, + { + "epoch": 0.4064811155648394, + "grad_norm": 3.9557971954345703, + "learning_rate": 6.733944574099437e-05, + "loss": 1.7163, + "step": 5400 + }, + { + "epoch": 0.4065563898454995, + "grad_norm": 3.311464309692383, + "learning_rate": 6.732801086223216e-05, + "loss": 2.0677, + "step": 5401 + }, + { + "epoch": 0.4066316641261597, + "grad_norm": 7.025592803955078, + "learning_rate": 6.731657495336942e-05, + "loss": 1.6905, + "step": 5402 + }, + { + "epoch": 0.4067069384068199, + "grad_norm": 5.563939094543457, + "learning_rate": 6.730513801508601e-05, + "loss": 1.8414, + "step": 5403 + }, + { + "epoch": 0.40678221268748, + "grad_norm": 4.507243633270264, + "learning_rate": 6.729370004806176e-05, + "loss": 2.0683, + "step": 5404 + }, + { + "epoch": 0.40685748696814017, + "grad_norm": 4.501284122467041, + "learning_rate": 6.728226105297668e-05, + "loss": 1.7875, + "step": 5405 + }, + { + "epoch": 0.4069327612488003, + "grad_norm": 3.7367732524871826, + "learning_rate": 6.727082103051077e-05, + "loss": 2.0878, + "step": 5406 + }, + { + "epoch": 0.40700803552946047, + "grad_norm": 3.810563802719116, + "learning_rate": 6.725937998134412e-05, + "loss": 1.8462, + "step": 5407 + }, + { + "epoch": 0.40708330981012064, + "grad_norm": 4.526647567749023, + "learning_rate": 6.724793790615685e-05, + "loss": 2.098, + "step": 5408 + }, + { + "epoch": 0.40715858409078076, + "grad_norm": 4.322384357452393, + "learning_rate": 6.723649480562913e-05, + "loss": 1.7134, + "step": 5409 + }, + { + "epoch": 0.40723385837144094, + "grad_norm": 4.435588359832764, + "learning_rate": 6.722505068044128e-05, + "loss": 2.3888, + "step": 5410 + }, + { + "epoch": 0.4073091326521011, + "grad_norm": 5.700343608856201, + "learning_rate": 6.721360553127358e-05, + "loss": 2.4076, + "step": 5411 + }, + { + "epoch": 0.40738440693276123, + "grad_norm": 4.957685947418213, + "learning_rate": 6.720215935880644e-05, + "loss": 2.135, + "step": 5412 + }, + { + "epoch": 0.4074596812134214, + "grad_norm": 5.051072597503662, + "learning_rate": 6.719071216372027e-05, + "loss": 1.8736, + "step": 5413 + }, + { + "epoch": 0.4075349554940816, + "grad_norm": 4.596951961517334, + "learning_rate": 6.717926394669558e-05, + "loss": 1.9796, + "step": 5414 + }, + { + "epoch": 0.4076102297747417, + "grad_norm": 7.163468837738037, + "learning_rate": 6.716781470841296e-05, + "loss": 1.901, + "step": 5415 + }, + { + "epoch": 0.4076855040554019, + "grad_norm": 3.8344624042510986, + "learning_rate": 6.7156364449553e-05, + "loss": 1.7953, + "step": 5416 + }, + { + "epoch": 0.407760778336062, + "grad_norm": 4.731624603271484, + "learning_rate": 6.71449131707964e-05, + "loss": 2.1101, + "step": 5417 + }, + { + "epoch": 0.4078360526167222, + "grad_norm": 4.400200366973877, + "learning_rate": 6.713346087282392e-05, + "loss": 2.167, + "step": 5418 + }, + { + "epoch": 0.40791132689738235, + "grad_norm": 4.1537017822265625, + "learning_rate": 6.712200755631634e-05, + "loss": 1.9945, + "step": 5419 + }, + { + "epoch": 0.4079866011780425, + "grad_norm": 3.854161500930786, + "learning_rate": 6.711055322195455e-05, + "loss": 2.0093, + "step": 5420 + }, + { + "epoch": 0.40806187545870265, + "grad_norm": 4.663625240325928, + "learning_rate": 6.709909787041945e-05, + "loss": 2.1156, + "step": 5421 + }, + { + "epoch": 0.4081371497393628, + "grad_norm": 5.383629322052002, + "learning_rate": 6.708764150239206e-05, + "loss": 1.7476, + "step": 5422 + }, + { + "epoch": 0.40821242402002295, + "grad_norm": 5.7428507804870605, + "learning_rate": 6.70761841185534e-05, + "loss": 1.704, + "step": 5423 + }, + { + "epoch": 0.4082876983006831, + "grad_norm": 3.8466684818267822, + "learning_rate": 6.706472571958461e-05, + "loss": 2.1686, + "step": 5424 + }, + { + "epoch": 0.40836297258134324, + "grad_norm": 5.016278266906738, + "learning_rate": 6.705326630616684e-05, + "loss": 2.2948, + "step": 5425 + }, + { + "epoch": 0.4084382468620034, + "grad_norm": 7.761519432067871, + "learning_rate": 6.70418058789813e-05, + "loss": 2.3454, + "step": 5426 + }, + { + "epoch": 0.4085135211426636, + "grad_norm": 4.111473560333252, + "learning_rate": 6.70303444387093e-05, + "loss": 1.964, + "step": 5427 + }, + { + "epoch": 0.4085887954233237, + "grad_norm": 5.230279922485352, + "learning_rate": 6.701888198603221e-05, + "loss": 2.3346, + "step": 5428 + }, + { + "epoch": 0.4086640697039839, + "grad_norm": 4.104366302490234, + "learning_rate": 6.70074185216314e-05, + "loss": 2.526, + "step": 5429 + }, + { + "epoch": 0.40873934398464407, + "grad_norm": 7.066196918487549, + "learning_rate": 6.699595404618837e-05, + "loss": 1.907, + "step": 5430 + }, + { + "epoch": 0.4088146182653042, + "grad_norm": 5.222086429595947, + "learning_rate": 6.698448856038462e-05, + "loss": 1.8991, + "step": 5431 + }, + { + "epoch": 0.40888989254596436, + "grad_norm": 6.374770641326904, + "learning_rate": 6.697302206490178e-05, + "loss": 2.2013, + "step": 5432 + }, + { + "epoch": 0.40896516682662454, + "grad_norm": 5.201332092285156, + "learning_rate": 6.696155456042147e-05, + "loss": 2.0184, + "step": 5433 + }, + { + "epoch": 0.40904044110728466, + "grad_norm": 4.482089042663574, + "learning_rate": 6.69500860476254e-05, + "loss": 1.9787, + "step": 5434 + }, + { + "epoch": 0.40911571538794483, + "grad_norm": 5.201145648956299, + "learning_rate": 6.693861652719536e-05, + "loss": 2.0073, + "step": 5435 + }, + { + "epoch": 0.40919098966860495, + "grad_norm": 4.535206317901611, + "learning_rate": 6.692714599981318e-05, + "loss": 2.0493, + "step": 5436 + }, + { + "epoch": 0.40926626394926513, + "grad_norm": 7.1508660316467285, + "learning_rate": 6.691567446616072e-05, + "loss": 1.7305, + "step": 5437 + }, + { + "epoch": 0.4093415382299253, + "grad_norm": 3.420806646347046, + "learning_rate": 6.690420192691996e-05, + "loss": 2.0333, + "step": 5438 + }, + { + "epoch": 0.4094168125105854, + "grad_norm": 3.878455638885498, + "learning_rate": 6.68927283827729e-05, + "loss": 2.2041, + "step": 5439 + }, + { + "epoch": 0.4094920867912456, + "grad_norm": 4.764273166656494, + "learning_rate": 6.68812538344016e-05, + "loss": 1.9954, + "step": 5440 + }, + { + "epoch": 0.4095673610719058, + "grad_norm": 3.9040868282318115, + "learning_rate": 6.686977828248822e-05, + "loss": 2.0542, + "step": 5441 + }, + { + "epoch": 0.4096426353525659, + "grad_norm": 3.967054843902588, + "learning_rate": 6.68583017277149e-05, + "loss": 1.9223, + "step": 5442 + }, + { + "epoch": 0.4097179096332261, + "grad_norm": 5.52067756652832, + "learning_rate": 6.684682417076392e-05, + "loss": 1.9067, + "step": 5443 + }, + { + "epoch": 0.40979318391388625, + "grad_norm": 5.799335956573486, + "learning_rate": 6.68353456123176e-05, + "loss": 1.7956, + "step": 5444 + }, + { + "epoch": 0.40986845819454637, + "grad_norm": 3.870598793029785, + "learning_rate": 6.682386605305827e-05, + "loss": 1.5659, + "step": 5445 + }, + { + "epoch": 0.40994373247520655, + "grad_norm": 4.401954650878906, + "learning_rate": 6.681238549366838e-05, + "loss": 2.2076, + "step": 5446 + }, + { + "epoch": 0.41001900675586667, + "grad_norm": 4.360254287719727, + "learning_rate": 6.680090393483041e-05, + "loss": 2.4369, + "step": 5447 + }, + { + "epoch": 0.41009428103652684, + "grad_norm": 4.853362560272217, + "learning_rate": 6.678942137722691e-05, + "loss": 1.8962, + "step": 5448 + }, + { + "epoch": 0.410169555317187, + "grad_norm": 5.478942394256592, + "learning_rate": 6.677793782154049e-05, + "loss": 2.0748, + "step": 5449 + }, + { + "epoch": 0.41024482959784714, + "grad_norm": 5.751876354217529, + "learning_rate": 6.67664532684538e-05, + "loss": 2.1473, + "step": 5450 + }, + { + "epoch": 0.4103201038785073, + "grad_norm": 4.607638359069824, + "learning_rate": 6.675496771864958e-05, + "loss": 1.713, + "step": 5451 + }, + { + "epoch": 0.4103953781591675, + "grad_norm": 5.937441349029541, + "learning_rate": 6.67434811728106e-05, + "loss": 1.8276, + "step": 5452 + }, + { + "epoch": 0.4104706524398276, + "grad_norm": 5.579580783843994, + "learning_rate": 6.67319936316197e-05, + "loss": 1.6076, + "step": 5453 + }, + { + "epoch": 0.4105459267204878, + "grad_norm": 5.940950393676758, + "learning_rate": 6.672050509575978e-05, + "loss": 2.0264, + "step": 5454 + }, + { + "epoch": 0.4106212010011479, + "grad_norm": 3.596885919570923, + "learning_rate": 6.670901556591383e-05, + "loss": 1.8774, + "step": 5455 + }, + { + "epoch": 0.4106964752818081, + "grad_norm": 3.9531962871551514, + "learning_rate": 6.669752504276486e-05, + "loss": 1.9779, + "step": 5456 + }, + { + "epoch": 0.41077174956246826, + "grad_norm": 3.6535210609436035, + "learning_rate": 6.668603352699592e-05, + "loss": 2.1141, + "step": 5457 + }, + { + "epoch": 0.4108470238431284, + "grad_norm": 5.708169460296631, + "learning_rate": 6.667454101929016e-05, + "loss": 1.7303, + "step": 5458 + }, + { + "epoch": 0.41092229812378855, + "grad_norm": 3.641714572906494, + "learning_rate": 6.666304752033076e-05, + "loss": 1.8841, + "step": 5459 + }, + { + "epoch": 0.41099757240444873, + "grad_norm": 4.162882328033447, + "learning_rate": 6.665155303080104e-05, + "loss": 1.9474, + "step": 5460 + }, + { + "epoch": 0.41107284668510885, + "grad_norm": 3.7056288719177246, + "learning_rate": 6.664005755138424e-05, + "loss": 1.9426, + "step": 5461 + }, + { + "epoch": 0.411148120965769, + "grad_norm": 5.81548547744751, + "learning_rate": 6.662856108276379e-05, + "loss": 1.7837, + "step": 5462 + }, + { + "epoch": 0.4112233952464292, + "grad_norm": 4.699812412261963, + "learning_rate": 6.661706362562308e-05, + "loss": 2.0458, + "step": 5463 + }, + { + "epoch": 0.4112986695270893, + "grad_norm": 3.592576742172241, + "learning_rate": 6.66055651806456e-05, + "loss": 1.8828, + "step": 5464 + }, + { + "epoch": 0.4113739438077495, + "grad_norm": 4.497145652770996, + "learning_rate": 6.659406574851494e-05, + "loss": 1.7356, + "step": 5465 + }, + { + "epoch": 0.4114492180884096, + "grad_norm": 5.0229573249816895, + "learning_rate": 6.658256532991468e-05, + "loss": 1.785, + "step": 5466 + }, + { + "epoch": 0.4115244923690698, + "grad_norm": 4.335805892944336, + "learning_rate": 6.65710639255285e-05, + "loss": 2.0807, + "step": 5467 + }, + { + "epoch": 0.41159976664972997, + "grad_norm": 4.9765625, + "learning_rate": 6.65595615360401e-05, + "loss": 1.9448, + "step": 5468 + }, + { + "epoch": 0.4116750409303901, + "grad_norm": 3.9427287578582764, + "learning_rate": 6.65480581621333e-05, + "loss": 1.8073, + "step": 5469 + }, + { + "epoch": 0.41175031521105027, + "grad_norm": 4.376291275024414, + "learning_rate": 6.653655380449192e-05, + "loss": 2.293, + "step": 5470 + }, + { + "epoch": 0.41182558949171044, + "grad_norm": 6.065493583679199, + "learning_rate": 6.652504846379984e-05, + "loss": 2.3089, + "step": 5471 + }, + { + "epoch": 0.41190086377237056, + "grad_norm": 5.656959056854248, + "learning_rate": 6.651354214074106e-05, + "loss": 2.1937, + "step": 5472 + }, + { + "epoch": 0.41197613805303074, + "grad_norm": 6.063931465148926, + "learning_rate": 6.65020348359996e-05, + "loss": 2.1979, + "step": 5473 + }, + { + "epoch": 0.41205141233369086, + "grad_norm": 6.589503288269043, + "learning_rate": 6.649052655025951e-05, + "loss": 1.6999, + "step": 5474 + }, + { + "epoch": 0.41212668661435103, + "grad_norm": 5.17265510559082, + "learning_rate": 6.647901728420494e-05, + "loss": 2.288, + "step": 5475 + }, + { + "epoch": 0.4122019608950112, + "grad_norm": 4.724446773529053, + "learning_rate": 6.646750703852006e-05, + "loss": 1.8786, + "step": 5476 + }, + { + "epoch": 0.41227723517567133, + "grad_norm": 3.911710023880005, + "learning_rate": 6.645599581388913e-05, + "loss": 1.9273, + "step": 5477 + }, + { + "epoch": 0.4123525094563315, + "grad_norm": 3.653254270553589, + "learning_rate": 6.64444836109965e-05, + "loss": 2.2875, + "step": 5478 + }, + { + "epoch": 0.4124277837369917, + "grad_norm": 4.853933811187744, + "learning_rate": 6.643297043052647e-05, + "loss": 1.8094, + "step": 5479 + }, + { + "epoch": 0.4125030580176518, + "grad_norm": 4.0237932205200195, + "learning_rate": 6.64214562731635e-05, + "loss": 1.8404, + "step": 5480 + }, + { + "epoch": 0.412578332298312, + "grad_norm": 4.912440299987793, + "learning_rate": 6.640994113959209e-05, + "loss": 2.2973, + "step": 5481 + }, + { + "epoch": 0.41265360657897215, + "grad_norm": 5.190946578979492, + "learning_rate": 6.639842503049674e-05, + "loss": 1.8979, + "step": 5482 + }, + { + "epoch": 0.4127288808596323, + "grad_norm": 3.769709587097168, + "learning_rate": 6.63869079465621e-05, + "loss": 1.7073, + "step": 5483 + }, + { + "epoch": 0.41280415514029245, + "grad_norm": 4.815976142883301, + "learning_rate": 6.63753898884728e-05, + "loss": 2.104, + "step": 5484 + }, + { + "epoch": 0.41287942942095257, + "grad_norm": 4.878068923950195, + "learning_rate": 6.636387085691356e-05, + "loss": 1.7059, + "step": 5485 + }, + { + "epoch": 0.41295470370161275, + "grad_norm": 3.944702386856079, + "learning_rate": 6.635235085256914e-05, + "loss": 1.7494, + "step": 5486 + }, + { + "epoch": 0.4130299779822729, + "grad_norm": 5.319370269775391, + "learning_rate": 6.634082987612438e-05, + "loss": 1.8966, + "step": 5487 + }, + { + "epoch": 0.41310525226293304, + "grad_norm": 4.177558898925781, + "learning_rate": 6.632930792826417e-05, + "loss": 1.9216, + "step": 5488 + }, + { + "epoch": 0.4131805265435932, + "grad_norm": 7.6635847091674805, + "learning_rate": 6.631778500967347e-05, + "loss": 2.0967, + "step": 5489 + }, + { + "epoch": 0.4132558008242534, + "grad_norm": 4.461691856384277, + "learning_rate": 6.630626112103727e-05, + "loss": 2.1364, + "step": 5490 + }, + { + "epoch": 0.4133310751049135, + "grad_norm": 5.968453407287598, + "learning_rate": 6.629473626304063e-05, + "loss": 2.1202, + "step": 5491 + }, + { + "epoch": 0.4134063493855737, + "grad_norm": 3.196467638015747, + "learning_rate": 6.628321043636868e-05, + "loss": 1.8225, + "step": 5492 + }, + { + "epoch": 0.41348162366623387, + "grad_norm": 5.531734943389893, + "learning_rate": 6.627168364170659e-05, + "loss": 1.9503, + "step": 5493 + }, + { + "epoch": 0.413556897946894, + "grad_norm": 4.877641201019287, + "learning_rate": 6.626015587973958e-05, + "loss": 1.7222, + "step": 5494 + }, + { + "epoch": 0.41363217222755416, + "grad_norm": 4.493724822998047, + "learning_rate": 6.624862715115297e-05, + "loss": 2.0498, + "step": 5495 + }, + { + "epoch": 0.4137074465082143, + "grad_norm": 4.2190046310424805, + "learning_rate": 6.623709745663212e-05, + "loss": 1.7869, + "step": 5496 + }, + { + "epoch": 0.41378272078887446, + "grad_norm": 3.5987708568573, + "learning_rate": 6.622556679686238e-05, + "loss": 2.1812, + "step": 5497 + }, + { + "epoch": 0.41385799506953463, + "grad_norm": 5.731755256652832, + "learning_rate": 6.621403517252926e-05, + "loss": 1.7986, + "step": 5498 + }, + { + "epoch": 0.41393326935019475, + "grad_norm": 6.7161641120910645, + "learning_rate": 6.620250258431827e-05, + "loss": 1.9385, + "step": 5499 + }, + { + "epoch": 0.41400854363085493, + "grad_norm": 5.86796760559082, + "learning_rate": 6.6190969032915e-05, + "loss": 2.2876, + "step": 5500 + }, + { + "epoch": 0.4140838179115151, + "grad_norm": 4.715075969696045, + "learning_rate": 6.617943451900508e-05, + "loss": 1.8518, + "step": 5501 + }, + { + "epoch": 0.4141590921921752, + "grad_norm": 4.128543376922607, + "learning_rate": 6.616789904327421e-05, + "loss": 2.2018, + "step": 5502 + }, + { + "epoch": 0.4142343664728354, + "grad_norm": 3.8076722621917725, + "learning_rate": 6.615636260640812e-05, + "loss": 1.7915, + "step": 5503 + }, + { + "epoch": 0.4143096407534955, + "grad_norm": 4.099318981170654, + "learning_rate": 6.614482520909262e-05, + "loss": 2.0997, + "step": 5504 + }, + { + "epoch": 0.4143849150341557, + "grad_norm": 3.803635835647583, + "learning_rate": 6.61332868520136e-05, + "loss": 1.7286, + "step": 5505 + }, + { + "epoch": 0.4144601893148159, + "grad_norm": 4.256179332733154, + "learning_rate": 6.612174753585697e-05, + "loss": 1.7436, + "step": 5506 + }, + { + "epoch": 0.414535463595476, + "grad_norm": 5.945194244384766, + "learning_rate": 6.61102072613087e-05, + "loss": 1.9926, + "step": 5507 + }, + { + "epoch": 0.41461073787613617, + "grad_norm": 3.957068681716919, + "learning_rate": 6.609866602905484e-05, + "loss": 1.6294, + "step": 5508 + }, + { + "epoch": 0.41468601215679635, + "grad_norm": 6.181549072265625, + "learning_rate": 6.608712383978147e-05, + "loss": 1.9012, + "step": 5509 + }, + { + "epoch": 0.41476128643745647, + "grad_norm": 5.627074718475342, + "learning_rate": 6.607558069417476e-05, + "loss": 1.8964, + "step": 5510 + }, + { + "epoch": 0.41483656071811664, + "grad_norm": 4.87725305557251, + "learning_rate": 6.60640365929209e-05, + "loss": 1.6232, + "step": 5511 + }, + { + "epoch": 0.4149118349987768, + "grad_norm": 4.347396373748779, + "learning_rate": 6.605249153670615e-05, + "loss": 2.1033, + "step": 5512 + }, + { + "epoch": 0.41498710927943694, + "grad_norm": 5.015224456787109, + "learning_rate": 6.604094552621685e-05, + "loss": 1.8466, + "step": 5513 + }, + { + "epoch": 0.4150623835600971, + "grad_norm": 3.6444180011749268, + "learning_rate": 6.602939856213935e-05, + "loss": 1.7917, + "step": 5514 + }, + { + "epoch": 0.41513765784075723, + "grad_norm": 5.13500452041626, + "learning_rate": 6.601785064516011e-05, + "loss": 1.7059, + "step": 5515 + }, + { + "epoch": 0.4152129321214174, + "grad_norm": 5.212123394012451, + "learning_rate": 6.600630177596559e-05, + "loss": 1.681, + "step": 5516 + }, + { + "epoch": 0.4152882064020776, + "grad_norm": 4.870180130004883, + "learning_rate": 6.599475195524237e-05, + "loss": 1.7537, + "step": 5517 + }, + { + "epoch": 0.4153634806827377, + "grad_norm": 4.179619312286377, + "learning_rate": 6.598320118367704e-05, + "loss": 1.7974, + "step": 5518 + }, + { + "epoch": 0.4154387549633979, + "grad_norm": 4.074046611785889, + "learning_rate": 6.597164946195627e-05, + "loss": 1.6882, + "step": 5519 + }, + { + "epoch": 0.41551402924405806, + "grad_norm": 5.8371052742004395, + "learning_rate": 6.596009679076675e-05, + "loss": 2.0133, + "step": 5520 + }, + { + "epoch": 0.4155893035247182, + "grad_norm": 4.035010814666748, + "learning_rate": 6.594854317079529e-05, + "loss": 1.8934, + "step": 5521 + }, + { + "epoch": 0.41566457780537835, + "grad_norm": 4.570525646209717, + "learning_rate": 6.593698860272868e-05, + "loss": 1.711, + "step": 5522 + }, + { + "epoch": 0.41573985208603853, + "grad_norm": 4.247208595275879, + "learning_rate": 6.592543308725383e-05, + "loss": 1.8048, + "step": 5523 + }, + { + "epoch": 0.41581512636669865, + "grad_norm": 4.1996636390686035, + "learning_rate": 6.591387662505769e-05, + "loss": 1.9503, + "step": 5524 + }, + { + "epoch": 0.4158904006473588, + "grad_norm": 4.474452972412109, + "learning_rate": 6.590231921682724e-05, + "loss": 1.7582, + "step": 5525 + }, + { + "epoch": 0.41596567492801895, + "grad_norm": 4.767648696899414, + "learning_rate": 6.589076086324954e-05, + "loss": 2.1755, + "step": 5526 + }, + { + "epoch": 0.4160409492086791, + "grad_norm": 5.134219646453857, + "learning_rate": 6.587920156501171e-05, + "loss": 1.7309, + "step": 5527 + }, + { + "epoch": 0.4161162234893393, + "grad_norm": 5.276822566986084, + "learning_rate": 6.58676413228009e-05, + "loss": 1.8096, + "step": 5528 + }, + { + "epoch": 0.4161914977699994, + "grad_norm": 7.404260158538818, + "learning_rate": 6.585608013730436e-05, + "loss": 2.0437, + "step": 5529 + }, + { + "epoch": 0.4162667720506596, + "grad_norm": 5.470710754394531, + "learning_rate": 6.584451800920933e-05, + "loss": 2.108, + "step": 5530 + }, + { + "epoch": 0.41634204633131977, + "grad_norm": 4.577542781829834, + "learning_rate": 6.583295493920317e-05, + "loss": 2.018, + "step": 5531 + }, + { + "epoch": 0.4164173206119799, + "grad_norm": 4.170786380767822, + "learning_rate": 6.582139092797327e-05, + "loss": 1.9929, + "step": 5532 + }, + { + "epoch": 0.41649259489264007, + "grad_norm": 5.146010398864746, + "learning_rate": 6.580982597620709e-05, + "loss": 1.8045, + "step": 5533 + }, + { + "epoch": 0.4165678691733002, + "grad_norm": 5.699809551239014, + "learning_rate": 6.57982600845921e-05, + "loss": 2.1068, + "step": 5534 + }, + { + "epoch": 0.41664314345396036, + "grad_norm": 4.2040815353393555, + "learning_rate": 6.57866932538159e-05, + "loss": 2.1131, + "step": 5535 + }, + { + "epoch": 0.41671841773462054, + "grad_norm": 4.423560619354248, + "learning_rate": 6.577512548456606e-05, + "loss": 1.8453, + "step": 5536 + }, + { + "epoch": 0.41679369201528066, + "grad_norm": 4.110985279083252, + "learning_rate": 6.576355677753028e-05, + "loss": 1.8922, + "step": 5537 + }, + { + "epoch": 0.41686896629594083, + "grad_norm": 3.8957650661468506, + "learning_rate": 6.575198713339629e-05, + "loss": 1.8041, + "step": 5538 + }, + { + "epoch": 0.416944240576601, + "grad_norm": 7.501094818115234, + "learning_rate": 6.574041655285185e-05, + "loss": 2.0819, + "step": 5539 + }, + { + "epoch": 0.41701951485726113, + "grad_norm": 4.668057918548584, + "learning_rate": 6.572884503658483e-05, + "loss": 1.857, + "step": 5540 + }, + { + "epoch": 0.4170947891379213, + "grad_norm": 4.338768005371094, + "learning_rate": 6.57172725852831e-05, + "loss": 1.9319, + "step": 5541 + }, + { + "epoch": 0.4171700634185815, + "grad_norm": 4.211892604827881, + "learning_rate": 6.57056991996346e-05, + "loss": 1.9953, + "step": 5542 + }, + { + "epoch": 0.4172453376992416, + "grad_norm": 4.700490474700928, + "learning_rate": 6.569412488032735e-05, + "loss": 2.1271, + "step": 5543 + }, + { + "epoch": 0.4173206119799018, + "grad_norm": 4.889296531677246, + "learning_rate": 6.568254962804941e-05, + "loss": 1.8898, + "step": 5544 + }, + { + "epoch": 0.4173958862605619, + "grad_norm": 4.8866400718688965, + "learning_rate": 6.567097344348889e-05, + "loss": 2.0086, + "step": 5545 + }, + { + "epoch": 0.4174711605412221, + "grad_norm": 6.036957263946533, + "learning_rate": 6.565939632733396e-05, + "loss": 2.1814, + "step": 5546 + }, + { + "epoch": 0.41754643482188225, + "grad_norm": 4.945574760437012, + "learning_rate": 6.564781828027285e-05, + "loss": 2.1964, + "step": 5547 + }, + { + "epoch": 0.41762170910254237, + "grad_norm": 4.205514907836914, + "learning_rate": 6.563623930299385e-05, + "loss": 1.8114, + "step": 5548 + }, + { + "epoch": 0.41769698338320255, + "grad_norm": 3.874279737472534, + "learning_rate": 6.562465939618529e-05, + "loss": 1.7542, + "step": 5549 + }, + { + "epoch": 0.4177722576638627, + "grad_norm": 4.054240703582764, + "learning_rate": 6.561307856053556e-05, + "loss": 1.8724, + "step": 5550 + }, + { + "epoch": 0.41784753194452284, + "grad_norm": 4.651938438415527, + "learning_rate": 6.56014967967331e-05, + "loss": 1.8783, + "step": 5551 + }, + { + "epoch": 0.417922806225183, + "grad_norm": 4.218622207641602, + "learning_rate": 6.558991410546644e-05, + "loss": 2.1694, + "step": 5552 + }, + { + "epoch": 0.41799808050584314, + "grad_norm": 6.124258041381836, + "learning_rate": 6.55783304874241e-05, + "loss": 2.0684, + "step": 5553 + }, + { + "epoch": 0.4180733547865033, + "grad_norm": 5.30171012878418, + "learning_rate": 6.556674594329471e-05, + "loss": 1.9376, + "step": 5554 + }, + { + "epoch": 0.4181486290671635, + "grad_norm": 4.841015815734863, + "learning_rate": 6.555516047376696e-05, + "loss": 2.0306, + "step": 5555 + }, + { + "epoch": 0.4182239033478236, + "grad_norm": 4.988308906555176, + "learning_rate": 6.554357407952954e-05, + "loss": 1.9401, + "step": 5556 + }, + { + "epoch": 0.4182991776284838, + "grad_norm": 5.21719217300415, + "learning_rate": 6.553198676127124e-05, + "loss": 2.0798, + "step": 5557 + }, + { + "epoch": 0.41837445190914396, + "grad_norm": 3.97567081451416, + "learning_rate": 6.552039851968089e-05, + "loss": 1.6506, + "step": 5558 + }, + { + "epoch": 0.4184497261898041, + "grad_norm": 4.648266315460205, + "learning_rate": 6.550880935544738e-05, + "loss": 1.9546, + "step": 5559 + }, + { + "epoch": 0.41852500047046426, + "grad_norm": 4.081138610839844, + "learning_rate": 6.549721926925966e-05, + "loss": 1.5423, + "step": 5560 + }, + { + "epoch": 0.41860027475112443, + "grad_norm": 4.243156433105469, + "learning_rate": 6.548562826180672e-05, + "loss": 1.6255, + "step": 5561 + }, + { + "epoch": 0.41867554903178456, + "grad_norm": 4.178682804107666, + "learning_rate": 6.547403633377762e-05, + "loss": 1.7335, + "step": 5562 + }, + { + "epoch": 0.41875082331244473, + "grad_norm": 4.467832088470459, + "learning_rate": 6.546244348586144e-05, + "loss": 1.656, + "step": 5563 + }, + { + "epoch": 0.41882609759310485, + "grad_norm": 3.900017023086548, + "learning_rate": 6.545084971874738e-05, + "loss": 1.5719, + "step": 5564 + }, + { + "epoch": 0.418901371873765, + "grad_norm": 6.569604873657227, + "learning_rate": 6.543925503312462e-05, + "loss": 2.226, + "step": 5565 + }, + { + "epoch": 0.4189766461544252, + "grad_norm": 5.198246002197266, + "learning_rate": 6.542765942968246e-05, + "loss": 1.9406, + "step": 5566 + }, + { + "epoch": 0.4190519204350853, + "grad_norm": 3.909407138824463, + "learning_rate": 6.541606290911022e-05, + "loss": 1.9182, + "step": 5567 + }, + { + "epoch": 0.4191271947157455, + "grad_norm": 4.8404974937438965, + "learning_rate": 6.540446547209727e-05, + "loss": 1.9712, + "step": 5568 + }, + { + "epoch": 0.4192024689964057, + "grad_norm": 4.304462909698486, + "learning_rate": 6.539286711933304e-05, + "loss": 2.1316, + "step": 5569 + }, + { + "epoch": 0.4192777432770658, + "grad_norm": 6.251839637756348, + "learning_rate": 6.538126785150705e-05, + "loss": 1.7148, + "step": 5570 + }, + { + "epoch": 0.41935301755772597, + "grad_norm": 4.521939754486084, + "learning_rate": 6.536966766930879e-05, + "loss": 2.0965, + "step": 5571 + }, + { + "epoch": 0.41942829183838615, + "grad_norm": 5.470311164855957, + "learning_rate": 6.53580665734279e-05, + "loss": 2.0385, + "step": 5572 + }, + { + "epoch": 0.41950356611904627, + "grad_norm": 4.349471092224121, + "learning_rate": 6.534646456455402e-05, + "loss": 2.0269, + "step": 5573 + }, + { + "epoch": 0.41957884039970644, + "grad_norm": 4.1794233322143555, + "learning_rate": 6.533486164337686e-05, + "loss": 2.4095, + "step": 5574 + }, + { + "epoch": 0.41965411468036656, + "grad_norm": 6.031955718994141, + "learning_rate": 6.532325781058616e-05, + "loss": 2.1703, + "step": 5575 + }, + { + "epoch": 0.41972938896102674, + "grad_norm": 3.9421780109405518, + "learning_rate": 6.531165306687177e-05, + "loss": 1.9262, + "step": 5576 + }, + { + "epoch": 0.4198046632416869, + "grad_norm": 4.936164855957031, + "learning_rate": 6.530004741292352e-05, + "loss": 2.2354, + "step": 5577 + }, + { + "epoch": 0.41987993752234704, + "grad_norm": 4.212682247161865, + "learning_rate": 6.528844084943137e-05, + "loss": 1.7489, + "step": 5578 + }, + { + "epoch": 0.4199552118030072, + "grad_norm": 7.0396647453308105, + "learning_rate": 6.527683337708527e-05, + "loss": 1.8702, + "step": 5579 + }, + { + "epoch": 0.4200304860836674, + "grad_norm": 4.923828125, + "learning_rate": 6.526522499657526e-05, + "loss": 1.9955, + "step": 5580 + }, + { + "epoch": 0.4201057603643275, + "grad_norm": 4.858852863311768, + "learning_rate": 6.525361570859143e-05, + "loss": 2.0155, + "step": 5581 + }, + { + "epoch": 0.4201810346449877, + "grad_norm": 4.04617166519165, + "learning_rate": 6.524200551382392e-05, + "loss": 2.0444, + "step": 5582 + }, + { + "epoch": 0.4202563089256478, + "grad_norm": 4.236696720123291, + "learning_rate": 6.523039441296289e-05, + "loss": 1.9147, + "step": 5583 + }, + { + "epoch": 0.420331583206308, + "grad_norm": 5.749179840087891, + "learning_rate": 6.521878240669863e-05, + "loss": 2.1391, + "step": 5584 + }, + { + "epoch": 0.42040685748696816, + "grad_norm": 4.734997749328613, + "learning_rate": 6.520716949572143e-05, + "loss": 1.8811, + "step": 5585 + }, + { + "epoch": 0.4204821317676283, + "grad_norm": 3.8527164459228516, + "learning_rate": 6.519555568072163e-05, + "loss": 2.2059, + "step": 5586 + }, + { + "epoch": 0.42055740604828845, + "grad_norm": 4.343130588531494, + "learning_rate": 6.518394096238965e-05, + "loss": 1.864, + "step": 5587 + }, + { + "epoch": 0.4206326803289486, + "grad_norm": 4.452051639556885, + "learning_rate": 6.517232534141594e-05, + "loss": 1.9093, + "step": 5588 + }, + { + "epoch": 0.42070795460960875, + "grad_norm": 4.398669242858887, + "learning_rate": 6.516070881849104e-05, + "loss": 1.9407, + "step": 5589 + }, + { + "epoch": 0.4207832288902689, + "grad_norm": 4.123104095458984, + "learning_rate": 6.514909139430549e-05, + "loss": 1.7576, + "step": 5590 + }, + { + "epoch": 0.4208585031709291, + "grad_norm": 4.239006996154785, + "learning_rate": 6.513747306954992e-05, + "loss": 2.2412, + "step": 5591 + }, + { + "epoch": 0.4209337774515892, + "grad_norm": 4.470141410827637, + "learning_rate": 6.5125853844915e-05, + "loss": 2.0184, + "step": 5592 + }, + { + "epoch": 0.4210090517322494, + "grad_norm": 5.281346321105957, + "learning_rate": 6.511423372109149e-05, + "loss": 1.9499, + "step": 5593 + }, + { + "epoch": 0.4210843260129095, + "grad_norm": 4.19333028793335, + "learning_rate": 6.510261269877015e-05, + "loss": 2.1242, + "step": 5594 + }, + { + "epoch": 0.4211596002935697, + "grad_norm": 4.250405788421631, + "learning_rate": 6.50909907786418e-05, + "loss": 2.035, + "step": 5595 + }, + { + "epoch": 0.42123487457422987, + "grad_norm": 4.133971691131592, + "learning_rate": 6.507936796139737e-05, + "loss": 2.3415, + "step": 5596 + }, + { + "epoch": 0.42131014885489, + "grad_norm": 3.4514198303222656, + "learning_rate": 6.506774424772778e-05, + "loss": 2.1725, + "step": 5597 + }, + { + "epoch": 0.42138542313555016, + "grad_norm": 4.451467514038086, + "learning_rate": 6.505611963832404e-05, + "loss": 1.9895, + "step": 5598 + }, + { + "epoch": 0.42146069741621034, + "grad_norm": 4.091064453125, + "learning_rate": 6.504449413387715e-05, + "loss": 1.903, + "step": 5599 + }, + { + "epoch": 0.42153597169687046, + "grad_norm": 3.049105644226074, + "learning_rate": 6.503286773507829e-05, + "loss": 2.2702, + "step": 5600 + }, + { + "epoch": 0.42161124597753064, + "grad_norm": 3.6067404747009277, + "learning_rate": 6.502124044261855e-05, + "loss": 1.7548, + "step": 5601 + }, + { + "epoch": 0.42168652025819076, + "grad_norm": 4.892045974731445, + "learning_rate": 6.500961225718918e-05, + "loss": 2.3685, + "step": 5602 + }, + { + "epoch": 0.42176179453885093, + "grad_norm": 3.874382972717285, + "learning_rate": 6.499798317948143e-05, + "loss": 2.0288, + "step": 5603 + }, + { + "epoch": 0.4218370688195111, + "grad_norm": 5.166650772094727, + "learning_rate": 6.498635321018661e-05, + "loss": 2.0425, + "step": 5604 + }, + { + "epoch": 0.4219123431001712, + "grad_norm": 3.9357619285583496, + "learning_rate": 6.497472234999608e-05, + "loss": 1.6051, + "step": 5605 + }, + { + "epoch": 0.4219876173808314, + "grad_norm": 3.3423304557800293, + "learning_rate": 6.49630905996013e-05, + "loss": 1.8607, + "step": 5606 + }, + { + "epoch": 0.4220628916614916, + "grad_norm": 8.150017738342285, + "learning_rate": 6.495145795969371e-05, + "loss": 2.034, + "step": 5607 + }, + { + "epoch": 0.4221381659421517, + "grad_norm": 4.082434177398682, + "learning_rate": 6.493982443096484e-05, + "loss": 2.2919, + "step": 5608 + }, + { + "epoch": 0.4222134402228119, + "grad_norm": 4.554947376251221, + "learning_rate": 6.492819001410627e-05, + "loss": 2.069, + "step": 5609 + }, + { + "epoch": 0.42228871450347205, + "grad_norm": 6.27967643737793, + "learning_rate": 6.491655470980963e-05, + "loss": 2.251, + "step": 5610 + }, + { + "epoch": 0.42236398878413217, + "grad_norm": 5.9056291580200195, + "learning_rate": 6.490491851876663e-05, + "loss": 1.6081, + "step": 5611 + }, + { + "epoch": 0.42243926306479235, + "grad_norm": 4.23393440246582, + "learning_rate": 6.489328144166899e-05, + "loss": 2.3094, + "step": 5612 + }, + { + "epoch": 0.42251453734545247, + "grad_norm": 4.29860782623291, + "learning_rate": 6.48816434792085e-05, + "loss": 2.0796, + "step": 5613 + }, + { + "epoch": 0.42258981162611264, + "grad_norm": 5.503111362457275, + "learning_rate": 6.4870004632077e-05, + "loss": 2.029, + "step": 5614 + }, + { + "epoch": 0.4226650859067728, + "grad_norm": 6.667332172393799, + "learning_rate": 6.48583649009664e-05, + "loss": 2.3769, + "step": 5615 + }, + { + "epoch": 0.42274036018743294, + "grad_norm": 6.4203877449035645, + "learning_rate": 6.484672428656862e-05, + "loss": 1.9105, + "step": 5616 + }, + { + "epoch": 0.4228156344680931, + "grad_norm": 4.276432514190674, + "learning_rate": 6.48350827895757e-05, + "loss": 1.8086, + "step": 5617 + }, + { + "epoch": 0.4228909087487533, + "grad_norm": 5.440550327301025, + "learning_rate": 6.482344041067969e-05, + "loss": 1.9897, + "step": 5618 + }, + { + "epoch": 0.4229661830294134, + "grad_norm": 4.325236797332764, + "learning_rate": 6.481179715057266e-05, + "loss": 1.9733, + "step": 5619 + }, + { + "epoch": 0.4230414573100736, + "grad_norm": 3.871230363845825, + "learning_rate": 6.480015300994678e-05, + "loss": 2.2546, + "step": 5620 + }, + { + "epoch": 0.42311673159073376, + "grad_norm": 6.921532154083252, + "learning_rate": 6.478850798949428e-05, + "loss": 2.3421, + "step": 5621 + }, + { + "epoch": 0.4231920058713939, + "grad_norm": 4.719679355621338, + "learning_rate": 6.477686208990741e-05, + "loss": 2.1701, + "step": 5622 + }, + { + "epoch": 0.42326728015205406, + "grad_norm": 6.527713298797607, + "learning_rate": 6.47652153118785e-05, + "loss": 1.9278, + "step": 5623 + }, + { + "epoch": 0.4233425544327142, + "grad_norm": 3.849130868911743, + "learning_rate": 6.47535676560999e-05, + "loss": 2.3324, + "step": 5624 + }, + { + "epoch": 0.42341782871337436, + "grad_norm": 4.888229846954346, + "learning_rate": 6.474191912326404e-05, + "loss": 2.3075, + "step": 5625 + }, + { + "epoch": 0.42349310299403453, + "grad_norm": 3.5545620918273926, + "learning_rate": 6.473026971406338e-05, + "loss": 2.0089, + "step": 5626 + }, + { + "epoch": 0.42356837727469465, + "grad_norm": 5.497159957885742, + "learning_rate": 6.471861942919043e-05, + "loss": 1.8244, + "step": 5627 + }, + { + "epoch": 0.4236436515553548, + "grad_norm": 4.458886623382568, + "learning_rate": 6.470696826933782e-05, + "loss": 2.1825, + "step": 5628 + }, + { + "epoch": 0.423718925836015, + "grad_norm": 5.496108055114746, + "learning_rate": 6.469531623519813e-05, + "loss": 2.0918, + "step": 5629 + }, + { + "epoch": 0.4237942001166751, + "grad_norm": 3.3734006881713867, + "learning_rate": 6.468366332746406e-05, + "loss": 1.8253, + "step": 5630 + }, + { + "epoch": 0.4238694743973353, + "grad_norm": 6.460005283355713, + "learning_rate": 6.467200954682835e-05, + "loss": 2.2932, + "step": 5631 + }, + { + "epoch": 0.4239447486779954, + "grad_norm": 3.8254501819610596, + "learning_rate": 6.466035489398374e-05, + "loss": 2.2456, + "step": 5632 + }, + { + "epoch": 0.4240200229586556, + "grad_norm": 5.881473064422607, + "learning_rate": 6.464869936962312e-05, + "loss": 1.6875, + "step": 5633 + }, + { + "epoch": 0.42409529723931577, + "grad_norm": 4.08540153503418, + "learning_rate": 6.463704297443935e-05, + "loss": 1.9969, + "step": 5634 + }, + { + "epoch": 0.4241705715199759, + "grad_norm": 7.7894287109375, + "learning_rate": 6.462538570912539e-05, + "loss": 2.0714, + "step": 5635 + }, + { + "epoch": 0.42424584580063607, + "grad_norm": 5.379004001617432, + "learning_rate": 6.461372757437419e-05, + "loss": 1.8022, + "step": 5636 + }, + { + "epoch": 0.42432112008129624, + "grad_norm": 4.519621849060059, + "learning_rate": 6.460206857087882e-05, + "loss": 1.95, + "step": 5637 + }, + { + "epoch": 0.42439639436195636, + "grad_norm": 4.3377685546875, + "learning_rate": 6.459040869933238e-05, + "loss": 2.0346, + "step": 5638 + }, + { + "epoch": 0.42447166864261654, + "grad_norm": 5.817694664001465, + "learning_rate": 6.457874796042801e-05, + "loss": 1.8139, + "step": 5639 + }, + { + "epoch": 0.4245469429232767, + "grad_norm": 4.451927661895752, + "learning_rate": 6.45670863548589e-05, + "loss": 1.9106, + "step": 5640 + }, + { + "epoch": 0.42462221720393684, + "grad_norm": 5.235522747039795, + "learning_rate": 6.45554238833183e-05, + "loss": 2.1191, + "step": 5641 + }, + { + "epoch": 0.424697491484597, + "grad_norm": 6.057497978210449, + "learning_rate": 6.454376054649951e-05, + "loss": 1.8064, + "step": 5642 + }, + { + "epoch": 0.42477276576525713, + "grad_norm": 8.908404350280762, + "learning_rate": 6.45320963450959e-05, + "loss": 2.2445, + "step": 5643 + }, + { + "epoch": 0.4248480400459173, + "grad_norm": 5.425161361694336, + "learning_rate": 6.452043127980084e-05, + "loss": 1.7459, + "step": 5644 + }, + { + "epoch": 0.4249233143265775, + "grad_norm": 4.097648620605469, + "learning_rate": 6.450876535130782e-05, + "loss": 2.039, + "step": 5645 + }, + { + "epoch": 0.4249985886072376, + "grad_norm": 5.247454643249512, + "learning_rate": 6.449709856031033e-05, + "loss": 2.1966, + "step": 5646 + }, + { + "epoch": 0.4250738628878978, + "grad_norm": 5.577682971954346, + "learning_rate": 6.448543090750193e-05, + "loss": 1.9621, + "step": 5647 + }, + { + "epoch": 0.42514913716855796, + "grad_norm": 6.4181623458862305, + "learning_rate": 6.447376239357623e-05, + "loss": 2.3809, + "step": 5648 + }, + { + "epoch": 0.4252244114492181, + "grad_norm": 4.736383438110352, + "learning_rate": 6.446209301922686e-05, + "loss": 2.3393, + "step": 5649 + }, + { + "epoch": 0.42529968572987825, + "grad_norm": 4.835147380828857, + "learning_rate": 6.445042278514758e-05, + "loss": 1.722, + "step": 5650 + }, + { + "epoch": 0.42537496001053837, + "grad_norm": 4.267189979553223, + "learning_rate": 6.443875169203213e-05, + "loss": 1.8631, + "step": 5651 + }, + { + "epoch": 0.42545023429119855, + "grad_norm": 6.311431407928467, + "learning_rate": 6.442707974057432e-05, + "loss": 2.0236, + "step": 5652 + }, + { + "epoch": 0.4255255085718587, + "grad_norm": 4.5755109786987305, + "learning_rate": 6.441540693146799e-05, + "loss": 2.3902, + "step": 5653 + }, + { + "epoch": 0.42560078285251884, + "grad_norm": 5.2585129737854, + "learning_rate": 6.440373326540709e-05, + "loss": 1.6589, + "step": 5654 + }, + { + "epoch": 0.425676057133179, + "grad_norm": 5.402616500854492, + "learning_rate": 6.43920587430856e-05, + "loss": 2.0014, + "step": 5655 + }, + { + "epoch": 0.4257513314138392, + "grad_norm": 4.718621253967285, + "learning_rate": 6.438038336519749e-05, + "loss": 1.9912, + "step": 5656 + }, + { + "epoch": 0.4258266056944993, + "grad_norm": 4.067650318145752, + "learning_rate": 6.436870713243687e-05, + "loss": 1.9912, + "step": 5657 + }, + { + "epoch": 0.4259018799751595, + "grad_norm": 4.3337483406066895, + "learning_rate": 6.435703004549782e-05, + "loss": 2.0985, + "step": 5658 + }, + { + "epoch": 0.42597715425581967, + "grad_norm": 5.808797836303711, + "learning_rate": 6.434535210507453e-05, + "loss": 1.8555, + "step": 5659 + }, + { + "epoch": 0.4260524285364798, + "grad_norm": 5.587106704711914, + "learning_rate": 6.433367331186122e-05, + "loss": 1.887, + "step": 5660 + }, + { + "epoch": 0.42612770281713996, + "grad_norm": 3.134948492050171, + "learning_rate": 6.432199366655217e-05, + "loss": 1.9426, + "step": 5661 + }, + { + "epoch": 0.4262029770978001, + "grad_norm": 4.005614280700684, + "learning_rate": 6.431031316984169e-05, + "loss": 1.9145, + "step": 5662 + }, + { + "epoch": 0.42627825137846026, + "grad_norm": 4.251115798950195, + "learning_rate": 6.429863182242415e-05, + "loss": 2.0053, + "step": 5663 + }, + { + "epoch": 0.42635352565912044, + "grad_norm": 6.862904071807861, + "learning_rate": 6.428694962499397e-05, + "loss": 1.892, + "step": 5664 + }, + { + "epoch": 0.42642879993978056, + "grad_norm": 4.527406692504883, + "learning_rate": 6.427526657824563e-05, + "loss": 1.8634, + "step": 5665 + }, + { + "epoch": 0.42650407422044073, + "grad_norm": 5.64104700088501, + "learning_rate": 6.426358268287366e-05, + "loss": 2.2718, + "step": 5666 + }, + { + "epoch": 0.4265793485011009, + "grad_norm": 5.171625137329102, + "learning_rate": 6.425189793957262e-05, + "loss": 2.0148, + "step": 5667 + }, + { + "epoch": 0.42665462278176103, + "grad_norm": 4.813802242279053, + "learning_rate": 6.424021234903714e-05, + "loss": 1.7304, + "step": 5668 + }, + { + "epoch": 0.4267298970624212, + "grad_norm": 4.335046291351318, + "learning_rate": 6.422852591196191e-05, + "loss": 2.1238, + "step": 5669 + }, + { + "epoch": 0.4268051713430814, + "grad_norm": 4.876555442810059, + "learning_rate": 6.421683862904163e-05, + "loss": 1.9286, + "step": 5670 + }, + { + "epoch": 0.4268804456237415, + "grad_norm": 3.706444025039673, + "learning_rate": 6.420515050097107e-05, + "loss": 1.7301, + "step": 5671 + }, + { + "epoch": 0.4269557199044017, + "grad_norm": 3.9172468185424805, + "learning_rate": 6.41934615284451e-05, + "loss": 1.6058, + "step": 5672 + }, + { + "epoch": 0.4270309941850618, + "grad_norm": 4.290459156036377, + "learning_rate": 6.418177171215856e-05, + "loss": 1.9359, + "step": 5673 + }, + { + "epoch": 0.42710626846572197, + "grad_norm": 6.446939945220947, + "learning_rate": 6.417008105280637e-05, + "loss": 2.2751, + "step": 5674 + }, + { + "epoch": 0.42718154274638215, + "grad_norm": 4.605138301849365, + "learning_rate": 6.415838955108353e-05, + "loss": 2.0881, + "step": 5675 + }, + { + "epoch": 0.42725681702704227, + "grad_norm": 4.872567176818848, + "learning_rate": 6.414669720768505e-05, + "loss": 2.2949, + "step": 5676 + }, + { + "epoch": 0.42733209130770244, + "grad_norm": 4.5091962814331055, + "learning_rate": 6.413500402330602e-05, + "loss": 1.7376, + "step": 5677 + }, + { + "epoch": 0.4274073655883626, + "grad_norm": 6.263108730316162, + "learning_rate": 6.412330999864155e-05, + "loss": 1.6687, + "step": 5678 + }, + { + "epoch": 0.42748263986902274, + "grad_norm": 5.759294033050537, + "learning_rate": 6.411161513438684e-05, + "loss": 2.134, + "step": 5679 + }, + { + "epoch": 0.4275579141496829, + "grad_norm": 4.382905960083008, + "learning_rate": 6.409991943123709e-05, + "loss": 1.8283, + "step": 5680 + }, + { + "epoch": 0.42763318843034304, + "grad_norm": 5.303624153137207, + "learning_rate": 6.408822288988758e-05, + "loss": 2.2724, + "step": 5681 + }, + { + "epoch": 0.4277084627110032, + "grad_norm": 3.873399496078491, + "learning_rate": 6.407652551103364e-05, + "loss": 2.0191, + "step": 5682 + }, + { + "epoch": 0.4277837369916634, + "grad_norm": 3.8777012825012207, + "learning_rate": 6.406482729537068e-05, + "loss": 1.7661, + "step": 5683 + }, + { + "epoch": 0.4278590112723235, + "grad_norm": 4.47244119644165, + "learning_rate": 6.405312824359408e-05, + "loss": 1.9484, + "step": 5684 + }, + { + "epoch": 0.4279342855529837, + "grad_norm": 5.212845802307129, + "learning_rate": 6.404142835639931e-05, + "loss": 1.8307, + "step": 5685 + }, + { + "epoch": 0.42800955983364386, + "grad_norm": 5.502110481262207, + "learning_rate": 6.402972763448194e-05, + "loss": 1.9913, + "step": 5686 + }, + { + "epoch": 0.428084834114304, + "grad_norm": 3.6294803619384766, + "learning_rate": 6.401802607853749e-05, + "loss": 1.8928, + "step": 5687 + }, + { + "epoch": 0.42816010839496416, + "grad_norm": 5.221503734588623, + "learning_rate": 6.400632368926163e-05, + "loss": 1.8757, + "step": 5688 + }, + { + "epoch": 0.42823538267562433, + "grad_norm": 4.934842109680176, + "learning_rate": 6.399462046735001e-05, + "loss": 1.9658, + "step": 5689 + }, + { + "epoch": 0.42831065695628445, + "grad_norm": 4.679507255554199, + "learning_rate": 6.398291641349836e-05, + "loss": 1.9626, + "step": 5690 + }, + { + "epoch": 0.42838593123694463, + "grad_norm": 5.309213638305664, + "learning_rate": 6.397121152840245e-05, + "loss": 2.0364, + "step": 5691 + }, + { + "epoch": 0.42846120551760475, + "grad_norm": 3.6243607997894287, + "learning_rate": 6.395950581275811e-05, + "loss": 2.0319, + "step": 5692 + }, + { + "epoch": 0.4285364797982649, + "grad_norm": 4.856205940246582, + "learning_rate": 6.394779926726117e-05, + "loss": 1.9477, + "step": 5693 + }, + { + "epoch": 0.4286117540789251, + "grad_norm": 5.259153842926025, + "learning_rate": 6.393609189260762e-05, + "loss": 1.788, + "step": 5694 + }, + { + "epoch": 0.4286870283595852, + "grad_norm": 4.2632365226745605, + "learning_rate": 6.392438368949338e-05, + "loss": 2.3005, + "step": 5695 + }, + { + "epoch": 0.4287623026402454, + "grad_norm": 5.557430267333984, + "learning_rate": 6.39126746586145e-05, + "loss": 1.9711, + "step": 5696 + }, + { + "epoch": 0.42883757692090557, + "grad_norm": 4.379390716552734, + "learning_rate": 6.390096480066703e-05, + "loss": 2.1913, + "step": 5697 + }, + { + "epoch": 0.4289128512015657, + "grad_norm": 3.9160053730010986, + "learning_rate": 6.388925411634708e-05, + "loss": 1.5613, + "step": 5698 + }, + { + "epoch": 0.42898812548222587, + "grad_norm": 4.172816276550293, + "learning_rate": 6.387754260635081e-05, + "loss": 2.2435, + "step": 5699 + }, + { + "epoch": 0.429063399762886, + "grad_norm": 4.686192035675049, + "learning_rate": 6.386583027137447e-05, + "loss": 1.7772, + "step": 5700 + }, + { + "epoch": 0.42913867404354616, + "grad_norm": 5.868608474731445, + "learning_rate": 6.385411711211429e-05, + "loss": 2.0232, + "step": 5701 + }, + { + "epoch": 0.42921394832420634, + "grad_norm": 7.952432632446289, + "learning_rate": 6.384240312926663e-05, + "loss": 2.1553, + "step": 5702 + }, + { + "epoch": 0.42928922260486646, + "grad_norm": 5.211426258087158, + "learning_rate": 6.38306883235278e-05, + "loss": 2.0059, + "step": 5703 + }, + { + "epoch": 0.42936449688552664, + "grad_norm": 4.585365295410156, + "learning_rate": 6.381897269559423e-05, + "loss": 1.8291, + "step": 5704 + }, + { + "epoch": 0.4294397711661868, + "grad_norm": 3.78871750831604, + "learning_rate": 6.38072562461624e-05, + "loss": 1.8944, + "step": 5705 + }, + { + "epoch": 0.42951504544684693, + "grad_norm": 3.7727975845336914, + "learning_rate": 6.37955389759288e-05, + "loss": 1.7849, + "step": 5706 + }, + { + "epoch": 0.4295903197275071, + "grad_norm": 4.853158950805664, + "learning_rate": 6.378382088558998e-05, + "loss": 1.9812, + "step": 5707 + }, + { + "epoch": 0.4296655940081673, + "grad_norm": 6.993547439575195, + "learning_rate": 6.377210197584256e-05, + "loss": 2.1997, + "step": 5708 + }, + { + "epoch": 0.4297408682888274, + "grad_norm": 4.194733619689941, + "learning_rate": 6.37603822473832e-05, + "loss": 1.7033, + "step": 5709 + }, + { + "epoch": 0.4298161425694876, + "grad_norm": 5.313787460327148, + "learning_rate": 6.374866170090858e-05, + "loss": 2.2826, + "step": 5710 + }, + { + "epoch": 0.4298914168501477, + "grad_norm": 3.630655527114868, + "learning_rate": 6.373694033711551e-05, + "loss": 1.9933, + "step": 5711 + }, + { + "epoch": 0.4299666911308079, + "grad_norm": 4.226672172546387, + "learning_rate": 6.372521815670072e-05, + "loss": 2.0093, + "step": 5712 + }, + { + "epoch": 0.43004196541146805, + "grad_norm": 5.1546630859375, + "learning_rate": 6.371349516036111e-05, + "loss": 2.1626, + "step": 5713 + }, + { + "epoch": 0.4301172396921282, + "grad_norm": 4.447945594787598, + "learning_rate": 6.370177134879356e-05, + "loss": 1.7292, + "step": 5714 + }, + { + "epoch": 0.43019251397278835, + "grad_norm": 4.924492359161377, + "learning_rate": 6.3690046722695e-05, + "loss": 1.8213, + "step": 5715 + }, + { + "epoch": 0.4302677882534485, + "grad_norm": 5.612069606781006, + "learning_rate": 6.367832128276245e-05, + "loss": 1.8508, + "step": 5716 + }, + { + "epoch": 0.43034306253410864, + "grad_norm": 3.8491220474243164, + "learning_rate": 6.366659502969297e-05, + "loss": 1.8841, + "step": 5717 + }, + { + "epoch": 0.4304183368147688, + "grad_norm": 5.035004138946533, + "learning_rate": 6.365486796418361e-05, + "loss": 1.8389, + "step": 5718 + }, + { + "epoch": 0.430493611095429, + "grad_norm": 5.173888206481934, + "learning_rate": 6.364314008693154e-05, + "loss": 1.9747, + "step": 5719 + }, + { + "epoch": 0.4305688853760891, + "grad_norm": 4.266820907592773, + "learning_rate": 6.363141139863394e-05, + "loss": 1.5425, + "step": 5720 + }, + { + "epoch": 0.4306441596567493, + "grad_norm": 4.174493312835693, + "learning_rate": 6.361968189998802e-05, + "loss": 1.9229, + "step": 5721 + }, + { + "epoch": 0.4307194339374094, + "grad_norm": 4.937036991119385, + "learning_rate": 6.360795159169111e-05, + "loss": 2.0161, + "step": 5722 + }, + { + "epoch": 0.4307947082180696, + "grad_norm": 4.678271293640137, + "learning_rate": 6.359622047444052e-05, + "loss": 2.3053, + "step": 5723 + }, + { + "epoch": 0.43086998249872976, + "grad_norm": 8.525506019592285, + "learning_rate": 6.358448854893363e-05, + "loss": 1.8797, + "step": 5724 + }, + { + "epoch": 0.4309452567793899, + "grad_norm": 4.631918430328369, + "learning_rate": 6.357275581586788e-05, + "loss": 2.0413, + "step": 5725 + }, + { + "epoch": 0.43102053106005006, + "grad_norm": 5.002639293670654, + "learning_rate": 6.356102227594075e-05, + "loss": 1.7297, + "step": 5726 + }, + { + "epoch": 0.43109580534071024, + "grad_norm": 4.278130054473877, + "learning_rate": 6.354928792984973e-05, + "loss": 1.7146, + "step": 5727 + }, + { + "epoch": 0.43117107962137036, + "grad_norm": 4.839090347290039, + "learning_rate": 6.353755277829244e-05, + "loss": 1.907, + "step": 5728 + }, + { + "epoch": 0.43124635390203053, + "grad_norm": 4.6313252449035645, + "learning_rate": 6.352581682196648e-05, + "loss": 1.7001, + "step": 5729 + }, + { + "epoch": 0.43132162818269065, + "grad_norm": 4.1141557693481445, + "learning_rate": 6.351408006156953e-05, + "loss": 2.1519, + "step": 5730 + }, + { + "epoch": 0.43139690246335083, + "grad_norm": 4.779444694519043, + "learning_rate": 6.350234249779929e-05, + "loss": 1.9672, + "step": 5731 + }, + { + "epoch": 0.431472176744011, + "grad_norm": 5.1121907234191895, + "learning_rate": 6.349060413135352e-05, + "loss": 2.4931, + "step": 5732 + }, + { + "epoch": 0.4315474510246711, + "grad_norm": 4.61134147644043, + "learning_rate": 6.347886496293003e-05, + "loss": 2.0464, + "step": 5733 + }, + { + "epoch": 0.4316227253053313, + "grad_norm": 4.426197528839111, + "learning_rate": 6.346712499322673e-05, + "loss": 2.0627, + "step": 5734 + }, + { + "epoch": 0.4316979995859915, + "grad_norm": 7.25959587097168, + "learning_rate": 6.345538422294148e-05, + "loss": 2.3694, + "step": 5735 + }, + { + "epoch": 0.4317732738666516, + "grad_norm": 3.6505913734436035, + "learning_rate": 6.344364265277223e-05, + "loss": 1.8285, + "step": 5736 + }, + { + "epoch": 0.4318485481473118, + "grad_norm": 5.894350051879883, + "learning_rate": 6.3431900283417e-05, + "loss": 1.9006, + "step": 5737 + }, + { + "epoch": 0.43192382242797195, + "grad_norm": 4.518353462219238, + "learning_rate": 6.342015711557385e-05, + "loss": 2.4241, + "step": 5738 + }, + { + "epoch": 0.43199909670863207, + "grad_norm": 6.546237945556641, + "learning_rate": 6.340841314994085e-05, + "loss": 1.8049, + "step": 5739 + }, + { + "epoch": 0.43207437098929224, + "grad_norm": 4.4490065574646, + "learning_rate": 6.339666838721618e-05, + "loss": 1.6927, + "step": 5740 + }, + { + "epoch": 0.43214964526995236, + "grad_norm": 4.030456066131592, + "learning_rate": 6.3384922828098e-05, + "loss": 2.1084, + "step": 5741 + }, + { + "epoch": 0.43222491955061254, + "grad_norm": 4.70572566986084, + "learning_rate": 6.337317647328458e-05, + "loss": 1.8181, + "step": 5742 + }, + { + "epoch": 0.4323001938312727, + "grad_norm": 3.6510467529296875, + "learning_rate": 6.336142932347417e-05, + "loss": 2.2258, + "step": 5743 + }, + { + "epoch": 0.43237546811193284, + "grad_norm": 6.072367191314697, + "learning_rate": 6.334968137936514e-05, + "loss": 1.5756, + "step": 5744 + }, + { + "epoch": 0.432450742392593, + "grad_norm": 4.301730155944824, + "learning_rate": 6.333793264165586e-05, + "loss": 1.8066, + "step": 5745 + }, + { + "epoch": 0.4325260166732532, + "grad_norm": 3.0722105503082275, + "learning_rate": 6.332618311104474e-05, + "loss": 1.8871, + "step": 5746 + }, + { + "epoch": 0.4326012909539133, + "grad_norm": 5.575454235076904, + "learning_rate": 6.331443278823029e-05, + "loss": 2.3544, + "step": 5747 + }, + { + "epoch": 0.4326765652345735, + "grad_norm": 5.277907371520996, + "learning_rate": 6.3302681673911e-05, + "loss": 1.6439, + "step": 5748 + }, + { + "epoch": 0.4327518395152336, + "grad_norm": 6.179056167602539, + "learning_rate": 6.329092976878545e-05, + "loss": 2.3607, + "step": 5749 + }, + { + "epoch": 0.4328271137958938, + "grad_norm": 4.6801605224609375, + "learning_rate": 6.327917707355227e-05, + "loss": 1.8758, + "step": 5750 + }, + { + "epoch": 0.43290238807655396, + "grad_norm": 6.048764705657959, + "learning_rate": 6.326742358891011e-05, + "loss": 1.9761, + "step": 5751 + }, + { + "epoch": 0.4329776623572141, + "grad_norm": 4.253023624420166, + "learning_rate": 6.32556693155577e-05, + "loss": 1.9671, + "step": 5752 + }, + { + "epoch": 0.43305293663787425, + "grad_norm": 7.505892276763916, + "learning_rate": 6.324391425419379e-05, + "loss": 2.0121, + "step": 5753 + }, + { + "epoch": 0.43312821091853443, + "grad_norm": 6.723575592041016, + "learning_rate": 6.323215840551714e-05, + "loss": 2.3984, + "step": 5754 + }, + { + "epoch": 0.43320348519919455, + "grad_norm": 5.20761251449585, + "learning_rate": 6.322040177022666e-05, + "loss": 1.7984, + "step": 5755 + }, + { + "epoch": 0.4332787594798547, + "grad_norm": 5.013870716094971, + "learning_rate": 6.320864434902122e-05, + "loss": 2.2126, + "step": 5756 + }, + { + "epoch": 0.4333540337605149, + "grad_norm": 4.628096103668213, + "learning_rate": 6.319688614259979e-05, + "loss": 2.038, + "step": 5757 + }, + { + "epoch": 0.433429308041175, + "grad_norm": 7.383403778076172, + "learning_rate": 6.318512715166135e-05, + "loss": 2.0267, + "step": 5758 + }, + { + "epoch": 0.4335045823218352, + "grad_norm": 4.653458595275879, + "learning_rate": 6.317336737690493e-05, + "loss": 1.9686, + "step": 5759 + }, + { + "epoch": 0.4335798566024953, + "grad_norm": 3.618833303451538, + "learning_rate": 6.31616068190296e-05, + "loss": 2.2715, + "step": 5760 + }, + { + "epoch": 0.4336551308831555, + "grad_norm": 5.47083044052124, + "learning_rate": 6.314984547873451e-05, + "loss": 1.7802, + "step": 5761 + }, + { + "epoch": 0.43373040516381567, + "grad_norm": 5.446893692016602, + "learning_rate": 6.313808335671886e-05, + "loss": 1.773, + "step": 5762 + }, + { + "epoch": 0.4338056794444758, + "grad_norm": 8.371232032775879, + "learning_rate": 6.312632045368184e-05, + "loss": 1.9115, + "step": 5763 + }, + { + "epoch": 0.43388095372513596, + "grad_norm": 7.154722213745117, + "learning_rate": 6.311455677032274e-05, + "loss": 2.0646, + "step": 5764 + }, + { + "epoch": 0.43395622800579614, + "grad_norm": 5.043438911437988, + "learning_rate": 6.310279230734084e-05, + "loss": 1.9507, + "step": 5765 + }, + { + "epoch": 0.43403150228645626, + "grad_norm": 3.613232135772705, + "learning_rate": 6.309102706543556e-05, + "loss": 1.9675, + "step": 5766 + }, + { + "epoch": 0.43410677656711644, + "grad_norm": 3.601789712905884, + "learning_rate": 6.307926104530629e-05, + "loss": 1.9699, + "step": 5767 + }, + { + "epoch": 0.4341820508477766, + "grad_norm": 6.2960944175720215, + "learning_rate": 6.306749424765246e-05, + "loss": 2.1078, + "step": 5768 + }, + { + "epoch": 0.43425732512843673, + "grad_norm": 4.588296413421631, + "learning_rate": 6.30557266731736e-05, + "loss": 1.8613, + "step": 5769 + }, + { + "epoch": 0.4343325994090969, + "grad_norm": 5.749566555023193, + "learning_rate": 6.304395832256925e-05, + "loss": 1.8505, + "step": 5770 + }, + { + "epoch": 0.43440787368975703, + "grad_norm": 3.626641273498535, + "learning_rate": 6.3032189196539e-05, + "loss": 2.1019, + "step": 5771 + }, + { + "epoch": 0.4344831479704172, + "grad_norm": 4.830503940582275, + "learning_rate": 6.30204192957825e-05, + "loss": 2.1722, + "step": 5772 + }, + { + "epoch": 0.4345584222510774, + "grad_norm": 5.286192893981934, + "learning_rate": 6.300864862099943e-05, + "loss": 2.0046, + "step": 5773 + }, + { + "epoch": 0.4346336965317375, + "grad_norm": 4.790497779846191, + "learning_rate": 6.299687717288953e-05, + "loss": 1.8867, + "step": 5774 + }, + { + "epoch": 0.4347089708123977, + "grad_norm": 3.465799331665039, + "learning_rate": 6.298510495215258e-05, + "loss": 2.2847, + "step": 5775 + }, + { + "epoch": 0.43478424509305785, + "grad_norm": 4.555606365203857, + "learning_rate": 6.29733319594884e-05, + "loss": 1.8833, + "step": 5776 + }, + { + "epoch": 0.434859519373718, + "grad_norm": 6.483745574951172, + "learning_rate": 6.296155819559684e-05, + "loss": 1.7383, + "step": 5777 + }, + { + "epoch": 0.43493479365437815, + "grad_norm": 3.956658363342285, + "learning_rate": 6.294978366117786e-05, + "loss": 1.7581, + "step": 5778 + }, + { + "epoch": 0.43501006793503827, + "grad_norm": 4.499672889709473, + "learning_rate": 6.293800835693137e-05, + "loss": 2.0297, + "step": 5779 + }, + { + "epoch": 0.43508534221569845, + "grad_norm": 4.4785637855529785, + "learning_rate": 6.292623228355743e-05, + "loss": 1.8827, + "step": 5780 + }, + { + "epoch": 0.4351606164963586, + "grad_norm": 5.152351379394531, + "learning_rate": 6.291445544175607e-05, + "loss": 2.0522, + "step": 5781 + }, + { + "epoch": 0.43523589077701874, + "grad_norm": 5.675931453704834, + "learning_rate": 6.290267783222737e-05, + "loss": 1.8767, + "step": 5782 + }, + { + "epoch": 0.4353111650576789, + "grad_norm": 4.8720855712890625, + "learning_rate": 6.289089945567151e-05, + "loss": 1.8225, + "step": 5783 + }, + { + "epoch": 0.4353864393383391, + "grad_norm": 8.385793685913086, + "learning_rate": 6.287912031278868e-05, + "loss": 2.1011, + "step": 5784 + }, + { + "epoch": 0.4354617136189992, + "grad_norm": 3.90242075920105, + "learning_rate": 6.286734040427908e-05, + "loss": 1.5936, + "step": 5785 + }, + { + "epoch": 0.4355369878996594, + "grad_norm": 4.243035316467285, + "learning_rate": 6.285555973084302e-05, + "loss": 1.7424, + "step": 5786 + }, + { + "epoch": 0.43561226218031956, + "grad_norm": 6.727688312530518, + "learning_rate": 6.28437782931808e-05, + "loss": 1.6863, + "step": 5787 + }, + { + "epoch": 0.4356875364609797, + "grad_norm": 4.153524398803711, + "learning_rate": 6.283199609199285e-05, + "loss": 2.2002, + "step": 5788 + }, + { + "epoch": 0.43576281074163986, + "grad_norm": 4.874256610870361, + "learning_rate": 6.282021312797952e-05, + "loss": 1.8772, + "step": 5789 + }, + { + "epoch": 0.4358380850223, + "grad_norm": 4.528047561645508, + "learning_rate": 6.280842940184133e-05, + "loss": 1.8688, + "step": 5790 + }, + { + "epoch": 0.43591335930296016, + "grad_norm": 4.553868293762207, + "learning_rate": 6.279664491427875e-05, + "loss": 2.0458, + "step": 5791 + }, + { + "epoch": 0.43598863358362033, + "grad_norm": 6.397035121917725, + "learning_rate": 6.278485966599235e-05, + "loss": 1.7075, + "step": 5792 + }, + { + "epoch": 0.43606390786428045, + "grad_norm": 4.026989936828613, + "learning_rate": 6.277307365768273e-05, + "loss": 1.9292, + "step": 5793 + }, + { + "epoch": 0.43613918214494063, + "grad_norm": 6.154115200042725, + "learning_rate": 6.276128689005053e-05, + "loss": 2.0618, + "step": 5794 + }, + { + "epoch": 0.4362144564256008, + "grad_norm": 4.665353775024414, + "learning_rate": 6.274949936379644e-05, + "loss": 2.0919, + "step": 5795 + }, + { + "epoch": 0.4362897307062609, + "grad_norm": 5.1261115074157715, + "learning_rate": 6.273771107962119e-05, + "loss": 1.8395, + "step": 5796 + }, + { + "epoch": 0.4363650049869211, + "grad_norm": 4.472558498382568, + "learning_rate": 6.272592203822558e-05, + "loss": 1.7939, + "step": 5797 + }, + { + "epoch": 0.4364402792675812, + "grad_norm": 6.943745136260986, + "learning_rate": 6.27141322403104e-05, + "loss": 2.2446, + "step": 5798 + }, + { + "epoch": 0.4365155535482414, + "grad_norm": 5.025240898132324, + "learning_rate": 6.270234168657653e-05, + "loss": 1.8791, + "step": 5799 + }, + { + "epoch": 0.4365908278289016, + "grad_norm": 4.878539562225342, + "learning_rate": 6.269055037772492e-05, + "loss": 2.0237, + "step": 5800 + }, + { + "epoch": 0.4366661021095617, + "grad_norm": 3.9647951126098633, + "learning_rate": 6.26787583144565e-05, + "loss": 1.9357, + "step": 5801 + }, + { + "epoch": 0.43674137639022187, + "grad_norm": 3.8735458850860596, + "learning_rate": 6.266696549747227e-05, + "loss": 2.0257, + "step": 5802 + }, + { + "epoch": 0.43681665067088205, + "grad_norm": 4.7664690017700195, + "learning_rate": 6.26551719274733e-05, + "loss": 2.404, + "step": 5803 + }, + { + "epoch": 0.43689192495154217, + "grad_norm": 3.4478893280029297, + "learning_rate": 6.264337760516064e-05, + "loss": 2.0116, + "step": 5804 + }, + { + "epoch": 0.43696719923220234, + "grad_norm": 4.413941383361816, + "learning_rate": 6.263158253123548e-05, + "loss": 1.8766, + "step": 5805 + }, + { + "epoch": 0.4370424735128625, + "grad_norm": 3.876295804977417, + "learning_rate": 6.261978670639899e-05, + "loss": 1.8091, + "step": 5806 + }, + { + "epoch": 0.43711774779352264, + "grad_norm": 4.756508827209473, + "learning_rate": 6.260799013135237e-05, + "loss": 1.7618, + "step": 5807 + }, + { + "epoch": 0.4371930220741828, + "grad_norm": 4.375035762786865, + "learning_rate": 6.259619280679694e-05, + "loss": 1.8209, + "step": 5808 + }, + { + "epoch": 0.43726829635484293, + "grad_norm": 4.6030497550964355, + "learning_rate": 6.258439473343398e-05, + "loss": 1.9201, + "step": 5809 + }, + { + "epoch": 0.4373435706355031, + "grad_norm": 4.600318431854248, + "learning_rate": 6.257259591196484e-05, + "loss": 2.2298, + "step": 5810 + }, + { + "epoch": 0.4374188449161633, + "grad_norm": 4.446491241455078, + "learning_rate": 6.256079634309098e-05, + "loss": 2.1536, + "step": 5811 + }, + { + "epoch": 0.4374941191968234, + "grad_norm": 3.885686159133911, + "learning_rate": 6.25489960275138e-05, + "loss": 1.8866, + "step": 5812 + }, + { + "epoch": 0.4375693934774836, + "grad_norm": 3.783769130706787, + "learning_rate": 6.253719496593482e-05, + "loss": 1.7782, + "step": 5813 + }, + { + "epoch": 0.43764466775814376, + "grad_norm": 4.285191535949707, + "learning_rate": 6.252539315905557e-05, + "loss": 1.9723, + "step": 5814 + }, + { + "epoch": 0.4377199420388039, + "grad_norm": 4.648037910461426, + "learning_rate": 6.251359060757762e-05, + "loss": 2.0891, + "step": 5815 + }, + { + "epoch": 0.43779521631946405, + "grad_norm": 4.315683364868164, + "learning_rate": 6.250178731220265e-05, + "loss": 1.5708, + "step": 5816 + }, + { + "epoch": 0.43787049060012423, + "grad_norm": 4.218448162078857, + "learning_rate": 6.248998327363227e-05, + "loss": 1.8053, + "step": 5817 + }, + { + "epoch": 0.43794576488078435, + "grad_norm": 3.898834466934204, + "learning_rate": 6.247817849256823e-05, + "loss": 1.8033, + "step": 5818 + }, + { + "epoch": 0.4380210391614445, + "grad_norm": 3.9393954277038574, + "learning_rate": 6.246637296971228e-05, + "loss": 1.6337, + "step": 5819 + }, + { + "epoch": 0.43809631344210465, + "grad_norm": 5.663894176483154, + "learning_rate": 6.245456670576621e-05, + "loss": 1.7471, + "step": 5820 + }, + { + "epoch": 0.4381715877227648, + "grad_norm": 3.4593889713287354, + "learning_rate": 6.244275970143191e-05, + "loss": 1.6809, + "step": 5821 + }, + { + "epoch": 0.438246862003425, + "grad_norm": 4.155667304992676, + "learning_rate": 6.243095195741124e-05, + "loss": 2.1139, + "step": 5822 + }, + { + "epoch": 0.4383221362840851, + "grad_norm": 4.147579669952393, + "learning_rate": 6.241914347440614e-05, + "loss": 1.862, + "step": 5823 + }, + { + "epoch": 0.4383974105647453, + "grad_norm": 4.642208099365234, + "learning_rate": 6.240733425311859e-05, + "loss": 1.9903, + "step": 5824 + }, + { + "epoch": 0.43847268484540547, + "grad_norm": 4.561496257781982, + "learning_rate": 6.239552429425063e-05, + "loss": 2.0583, + "step": 5825 + }, + { + "epoch": 0.4385479591260656, + "grad_norm": 4.853338241577148, + "learning_rate": 6.238371359850433e-05, + "loss": 1.8325, + "step": 5826 + }, + { + "epoch": 0.43862323340672577, + "grad_norm": 4.065107345581055, + "learning_rate": 6.237190216658177e-05, + "loss": 2.3571, + "step": 5827 + }, + { + "epoch": 0.4386985076873859, + "grad_norm": 4.894888401031494, + "learning_rate": 6.236008999918514e-05, + "loss": 2.1457, + "step": 5828 + }, + { + "epoch": 0.43877378196804606, + "grad_norm": 5.1090569496154785, + "learning_rate": 6.234827709701662e-05, + "loss": 1.7546, + "step": 5829 + }, + { + "epoch": 0.43884905624870624, + "grad_norm": 4.411669731140137, + "learning_rate": 6.233646346077844e-05, + "loss": 1.8638, + "step": 5830 + }, + { + "epoch": 0.43892433052936636, + "grad_norm": 6.119626998901367, + "learning_rate": 6.232464909117292e-05, + "loss": 2.1074, + "step": 5831 + }, + { + "epoch": 0.43899960481002653, + "grad_norm": 5.595834255218506, + "learning_rate": 6.231283398890237e-05, + "loss": 1.8592, + "step": 5832 + }, + { + "epoch": 0.4390748790906867, + "grad_norm": 4.224753379821777, + "learning_rate": 6.230101815466917e-05, + "loss": 1.7581, + "step": 5833 + }, + { + "epoch": 0.43915015337134683, + "grad_norm": 6.337724685668945, + "learning_rate": 6.228920158917574e-05, + "loss": 1.6936, + "step": 5834 + }, + { + "epoch": 0.439225427652007, + "grad_norm": 4.504908084869385, + "learning_rate": 6.227738429312456e-05, + "loss": 2.0062, + "step": 5835 + }, + { + "epoch": 0.4393007019326672, + "grad_norm": 4.386195659637451, + "learning_rate": 6.226556626721808e-05, + "loss": 2.2207, + "step": 5836 + }, + { + "epoch": 0.4393759762133273, + "grad_norm": 3.7572054862976074, + "learning_rate": 6.225374751215888e-05, + "loss": 2.2299, + "step": 5837 + }, + { + "epoch": 0.4394512504939875, + "grad_norm": 5.244462490081787, + "learning_rate": 6.224192802864955e-05, + "loss": 2.0849, + "step": 5838 + }, + { + "epoch": 0.4395265247746476, + "grad_norm": 5.741331577301025, + "learning_rate": 6.223010781739273e-05, + "loss": 2.0038, + "step": 5839 + }, + { + "epoch": 0.4396017990553078, + "grad_norm": 3.723790168762207, + "learning_rate": 6.221828687909109e-05, + "loss": 1.7946, + "step": 5840 + }, + { + "epoch": 0.43967707333596795, + "grad_norm": 7.204499244689941, + "learning_rate": 6.220646521444736e-05, + "loss": 2.2006, + "step": 5841 + }, + { + "epoch": 0.43975234761662807, + "grad_norm": 3.997116804122925, + "learning_rate": 6.21946428241643e-05, + "loss": 2.2271, + "step": 5842 + }, + { + "epoch": 0.43982762189728825, + "grad_norm": 4.753002643585205, + "learning_rate": 6.21828197089447e-05, + "loss": 1.6895, + "step": 5843 + }, + { + "epoch": 0.4399028961779484, + "grad_norm": 5.885253429412842, + "learning_rate": 6.217099586949143e-05, + "loss": 2.0463, + "step": 5844 + }, + { + "epoch": 0.43997817045860854, + "grad_norm": 3.3102731704711914, + "learning_rate": 6.215917130650739e-05, + "loss": 1.7357, + "step": 5845 + }, + { + "epoch": 0.4400534447392687, + "grad_norm": 5.026159286499023, + "learning_rate": 6.21473460206955e-05, + "loss": 1.8191, + "step": 5846 + }, + { + "epoch": 0.4401287190199289, + "grad_norm": 5.543838977813721, + "learning_rate": 6.213552001275874e-05, + "loss": 2.1985, + "step": 5847 + }, + { + "epoch": 0.440203993300589, + "grad_norm": 5.000566005706787, + "learning_rate": 6.212369328340014e-05, + "loss": 1.8587, + "step": 5848 + }, + { + "epoch": 0.4402792675812492, + "grad_norm": 5.314465522766113, + "learning_rate": 6.211186583332275e-05, + "loss": 2.0911, + "step": 5849 + }, + { + "epoch": 0.4403545418619093, + "grad_norm": 6.697752475738525, + "learning_rate": 6.210003766322969e-05, + "loss": 2.1173, + "step": 5850 + }, + { + "epoch": 0.4404298161425695, + "grad_norm": 5.667004108428955, + "learning_rate": 6.208820877382414e-05, + "loss": 1.9221, + "step": 5851 + }, + { + "epoch": 0.44050509042322966, + "grad_norm": 4.934892177581787, + "learning_rate": 6.207637916580923e-05, + "loss": 1.5931, + "step": 5852 + }, + { + "epoch": 0.4405803647038898, + "grad_norm": 3.7798006534576416, + "learning_rate": 6.206454883988823e-05, + "loss": 1.716, + "step": 5853 + }, + { + "epoch": 0.44065563898454996, + "grad_norm": 6.391505241394043, + "learning_rate": 6.205271779676443e-05, + "loss": 1.6619, + "step": 5854 + }, + { + "epoch": 0.44073091326521013, + "grad_norm": 5.354294776916504, + "learning_rate": 6.204088603714113e-05, + "loss": 1.9408, + "step": 5855 + }, + { + "epoch": 0.44080618754587025, + "grad_norm": 6.182084083557129, + "learning_rate": 6.202905356172171e-05, + "loss": 2.0821, + "step": 5856 + }, + { + "epoch": 0.44088146182653043, + "grad_norm": 4.672182083129883, + "learning_rate": 6.201722037120957e-05, + "loss": 1.8795, + "step": 5857 + }, + { + "epoch": 0.44095673610719055, + "grad_norm": 5.455212593078613, + "learning_rate": 6.200538646630817e-05, + "loss": 2.393, + "step": 5858 + }, + { + "epoch": 0.4410320103878507, + "grad_norm": 3.989269971847534, + "learning_rate": 6.199355184772099e-05, + "loss": 1.9019, + "step": 5859 + }, + { + "epoch": 0.4411072846685109, + "grad_norm": 4.534832000732422, + "learning_rate": 6.198171651615155e-05, + "loss": 2.1006, + "step": 5860 + }, + { + "epoch": 0.441182558949171, + "grad_norm": 5.176645755767822, + "learning_rate": 6.196988047230345e-05, + "loss": 1.8378, + "step": 5861 + }, + { + "epoch": 0.4412578332298312, + "grad_norm": 3.412224769592285, + "learning_rate": 6.195804371688031e-05, + "loss": 1.6773, + "step": 5862 + }, + { + "epoch": 0.4413331075104914, + "grad_norm": 5.141180038452148, + "learning_rate": 6.194620625058578e-05, + "loss": 1.7391, + "step": 5863 + }, + { + "epoch": 0.4414083817911515, + "grad_norm": 3.93776798248291, + "learning_rate": 6.193436807412356e-05, + "loss": 1.9573, + "step": 5864 + }, + { + "epoch": 0.44148365607181167, + "grad_norm": 6.58004903793335, + "learning_rate": 6.19225291881974e-05, + "loss": 1.9819, + "step": 5865 + }, + { + "epoch": 0.44155893035247185, + "grad_norm": 4.9218525886535645, + "learning_rate": 6.19106895935111e-05, + "loss": 2.1121, + "step": 5866 + }, + { + "epoch": 0.44163420463313197, + "grad_norm": 4.046970844268799, + "learning_rate": 6.189884929076847e-05, + "loss": 1.7778, + "step": 5867 + }, + { + "epoch": 0.44170947891379214, + "grad_norm": 3.4083220958709717, + "learning_rate": 6.18870082806734e-05, + "loss": 1.7752, + "step": 5868 + }, + { + "epoch": 0.44178475319445226, + "grad_norm": 6.671065807342529, + "learning_rate": 6.187516656392978e-05, + "loss": 2.0058, + "step": 5869 + }, + { + "epoch": 0.44186002747511244, + "grad_norm": 6.113986015319824, + "learning_rate": 6.18633241412416e-05, + "loss": 1.6301, + "step": 5870 + }, + { + "epoch": 0.4419353017557726, + "grad_norm": 5.218296051025391, + "learning_rate": 6.185148101331281e-05, + "loss": 1.9825, + "step": 5871 + }, + { + "epoch": 0.44201057603643273, + "grad_norm": 4.4001688957214355, + "learning_rate": 6.183963718084752e-05, + "loss": 1.9731, + "step": 5872 + }, + { + "epoch": 0.4420858503170929, + "grad_norm": 3.642742395401001, + "learning_rate": 6.182779264454974e-05, + "loss": 1.9946, + "step": 5873 + }, + { + "epoch": 0.4421611245977531, + "grad_norm": 6.567331314086914, + "learning_rate": 6.181594740512365e-05, + "loss": 1.9168, + "step": 5874 + }, + { + "epoch": 0.4422363988784132, + "grad_norm": 4.609991550445557, + "learning_rate": 6.180410146327336e-05, + "loss": 2.1172, + "step": 5875 + }, + { + "epoch": 0.4423116731590734, + "grad_norm": 6.285871505737305, + "learning_rate": 6.179225481970313e-05, + "loss": 1.5599, + "step": 5876 + }, + { + "epoch": 0.4423869474397335, + "grad_norm": 5.3340983390808105, + "learning_rate": 6.178040747511717e-05, + "loss": 2.3341, + "step": 5877 + }, + { + "epoch": 0.4424622217203937, + "grad_norm": 4.116152763366699, + "learning_rate": 6.176855943021981e-05, + "loss": 1.8963, + "step": 5878 + }, + { + "epoch": 0.44253749600105385, + "grad_norm": 3.549401044845581, + "learning_rate": 6.175671068571535e-05, + "loss": 2.0334, + "step": 5879 + }, + { + "epoch": 0.442612770281714, + "grad_norm": 4.7267022132873535, + "learning_rate": 6.174486124230817e-05, + "loss": 1.877, + "step": 5880 + }, + { + "epoch": 0.44268804456237415, + "grad_norm": 5.716840744018555, + "learning_rate": 6.17330111007027e-05, + "loss": 2.0898, + "step": 5881 + }, + { + "epoch": 0.4427633188430343, + "grad_norm": 4.019646644592285, + "learning_rate": 6.172116026160337e-05, + "loss": 1.8261, + "step": 5882 + }, + { + "epoch": 0.44283859312369445, + "grad_norm": 5.86329460144043, + "learning_rate": 6.17093087257147e-05, + "loss": 1.7064, + "step": 5883 + }, + { + "epoch": 0.4429138674043546, + "grad_norm": 4.133573532104492, + "learning_rate": 6.169745649374122e-05, + "loss": 2.0934, + "step": 5884 + }, + { + "epoch": 0.4429891416850148, + "grad_norm": 5.172225475311279, + "learning_rate": 6.168560356638752e-05, + "loss": 1.8559, + "step": 5885 + }, + { + "epoch": 0.4430644159656749, + "grad_norm": 6.817885875701904, + "learning_rate": 6.167374994435822e-05, + "loss": 2.1575, + "step": 5886 + }, + { + "epoch": 0.4431396902463351, + "grad_norm": 7.958605766296387, + "learning_rate": 6.166189562835798e-05, + "loss": 1.9088, + "step": 5887 + }, + { + "epoch": 0.4432149645269952, + "grad_norm": 4.35015869140625, + "learning_rate": 6.16500406190915e-05, + "loss": 1.6406, + "step": 5888 + }, + { + "epoch": 0.4432902388076554, + "grad_norm": 4.303564071655273, + "learning_rate": 6.163818491726354e-05, + "loss": 1.8652, + "step": 5889 + }, + { + "epoch": 0.44336551308831557, + "grad_norm": 3.940829277038574, + "learning_rate": 6.162632852357887e-05, + "loss": 1.9498, + "step": 5890 + }, + { + "epoch": 0.4434407873689757, + "grad_norm": 5.269733428955078, + "learning_rate": 6.161447143874234e-05, + "loss": 2.1676, + "step": 5891 + }, + { + "epoch": 0.44351606164963586, + "grad_norm": 5.32184362411499, + "learning_rate": 6.160261366345882e-05, + "loss": 1.7529, + "step": 5892 + }, + { + "epoch": 0.44359133593029604, + "grad_norm": 6.089979648590088, + "learning_rate": 6.159075519843319e-05, + "loss": 2.0035, + "step": 5893 + }, + { + "epoch": 0.44366661021095616, + "grad_norm": 4.3718414306640625, + "learning_rate": 6.157889604437043e-05, + "loss": 1.7562, + "step": 5894 + }, + { + "epoch": 0.44374188449161633, + "grad_norm": 4.9573516845703125, + "learning_rate": 6.156703620197553e-05, + "loss": 2.0056, + "step": 5895 + }, + { + "epoch": 0.4438171587722765, + "grad_norm": 3.8163676261901855, + "learning_rate": 6.155517567195351e-05, + "loss": 1.7949, + "step": 5896 + }, + { + "epoch": 0.44389243305293663, + "grad_norm": 4.765012264251709, + "learning_rate": 6.154331445500945e-05, + "loss": 1.9379, + "step": 5897 + }, + { + "epoch": 0.4439677073335968, + "grad_norm": 5.006673336029053, + "learning_rate": 6.153145255184846e-05, + "loss": 1.6598, + "step": 5898 + }, + { + "epoch": 0.4440429816142569, + "grad_norm": 4.2472734451293945, + "learning_rate": 6.15195899631757e-05, + "loss": 1.9413, + "step": 5899 + }, + { + "epoch": 0.4441182558949171, + "grad_norm": 6.310937404632568, + "learning_rate": 6.150772668969639e-05, + "loss": 1.816, + "step": 5900 + }, + { + "epoch": 0.4441935301755773, + "grad_norm": 5.364302158355713, + "learning_rate": 6.149586273211573e-05, + "loss": 1.8705, + "step": 5901 + }, + { + "epoch": 0.4442688044562374, + "grad_norm": 3.4378249645233154, + "learning_rate": 6.148399809113903e-05, + "loss": 1.8854, + "step": 5902 + }, + { + "epoch": 0.4443440787368976, + "grad_norm": 5.181258201599121, + "learning_rate": 6.147213276747159e-05, + "loss": 1.9383, + "step": 5903 + }, + { + "epoch": 0.44441935301755775, + "grad_norm": 5.779449939727783, + "learning_rate": 6.146026676181877e-05, + "loss": 1.945, + "step": 5904 + }, + { + "epoch": 0.44449462729821787, + "grad_norm": 4.231904983520508, + "learning_rate": 6.144840007488598e-05, + "loss": 1.9648, + "step": 5905 + }, + { + "epoch": 0.44456990157887805, + "grad_norm": 4.831796646118164, + "learning_rate": 6.143653270737866e-05, + "loss": 2.0543, + "step": 5906 + }, + { + "epoch": 0.44464517585953817, + "grad_norm": 5.520266532897949, + "learning_rate": 6.142466466000228e-05, + "loss": 1.5273, + "step": 5907 + }, + { + "epoch": 0.44472045014019834, + "grad_norm": 4.027518272399902, + "learning_rate": 6.141279593346237e-05, + "loss": 1.9979, + "step": 5908 + }, + { + "epoch": 0.4447957244208585, + "grad_norm": 3.4035582542419434, + "learning_rate": 6.140092652846448e-05, + "loss": 1.9474, + "step": 5909 + }, + { + "epoch": 0.44487099870151864, + "grad_norm": 4.433760643005371, + "learning_rate": 6.138905644571421e-05, + "loss": 1.9754, + "step": 5910 + }, + { + "epoch": 0.4449462729821788, + "grad_norm": 7.222991943359375, + "learning_rate": 6.137718568591722e-05, + "loss": 2.4565, + "step": 5911 + }, + { + "epoch": 0.445021547262839, + "grad_norm": 5.692912578582764, + "learning_rate": 6.13653142497792e-05, + "loss": 2.0414, + "step": 5912 + }, + { + "epoch": 0.4450968215434991, + "grad_norm": 4.777101993560791, + "learning_rate": 6.135344213800586e-05, + "loss": 2.2095, + "step": 5913 + }, + { + "epoch": 0.4451720958241593, + "grad_norm": 4.028824806213379, + "learning_rate": 6.134156935130296e-05, + "loss": 2.0912, + "step": 5914 + }, + { + "epoch": 0.44524737010481946, + "grad_norm": 3.3961150646209717, + "learning_rate": 6.13296958903763e-05, + "loss": 1.7184, + "step": 5915 + }, + { + "epoch": 0.4453226443854796, + "grad_norm": 5.075456142425537, + "learning_rate": 6.131782175593172e-05, + "loss": 2.1433, + "step": 5916 + }, + { + "epoch": 0.44539791866613976, + "grad_norm": 5.175343990325928, + "learning_rate": 6.130594694867512e-05, + "loss": 1.9034, + "step": 5917 + }, + { + "epoch": 0.4454731929467999, + "grad_norm": 3.562257766723633, + "learning_rate": 6.12940714693124e-05, + "loss": 2.0215, + "step": 5918 + }, + { + "epoch": 0.44554846722746005, + "grad_norm": 4.450735092163086, + "learning_rate": 6.128219531854956e-05, + "loss": 1.9351, + "step": 5919 + }, + { + "epoch": 0.44562374150812023, + "grad_norm": 6.513960361480713, + "learning_rate": 6.127031849709257e-05, + "loss": 1.8493, + "step": 5920 + }, + { + "epoch": 0.44569901578878035, + "grad_norm": 3.5960655212402344, + "learning_rate": 6.125844100564746e-05, + "loss": 1.9367, + "step": 5921 + }, + { + "epoch": 0.4457742900694405, + "grad_norm": 5.0706095695495605, + "learning_rate": 6.124656284492036e-05, + "loss": 1.8621, + "step": 5922 + }, + { + "epoch": 0.4458495643501007, + "grad_norm": 3.510718584060669, + "learning_rate": 6.123468401561738e-05, + "loss": 1.8233, + "step": 5923 + }, + { + "epoch": 0.4459248386307608, + "grad_norm": 3.0832386016845703, + "learning_rate": 6.122280451844467e-05, + "loss": 1.9186, + "step": 5924 + }, + { + "epoch": 0.446000112911421, + "grad_norm": 3.89540958404541, + "learning_rate": 6.121092435410841e-05, + "loss": 1.8799, + "step": 5925 + }, + { + "epoch": 0.4460753871920811, + "grad_norm": 6.350008964538574, + "learning_rate": 6.119904352331488e-05, + "loss": 1.951, + "step": 5926 + }, + { + "epoch": 0.4461506614727413, + "grad_norm": 3.2523128986358643, + "learning_rate": 6.118716202677033e-05, + "loss": 1.9655, + "step": 5927 + }, + { + "epoch": 0.44622593575340147, + "grad_norm": 6.013514518737793, + "learning_rate": 6.117527986518113e-05, + "loss": 1.8137, + "step": 5928 + }, + { + "epoch": 0.4463012100340616, + "grad_norm": 5.659713268280029, + "learning_rate": 6.116339703925358e-05, + "loss": 1.9177, + "step": 5929 + }, + { + "epoch": 0.44637648431472177, + "grad_norm": 5.024892330169678, + "learning_rate": 6.115151354969413e-05, + "loss": 2.0692, + "step": 5930 + }, + { + "epoch": 0.44645175859538194, + "grad_norm": 4.851288318634033, + "learning_rate": 6.113962939720918e-05, + "loss": 1.9722, + "step": 5931 + }, + { + "epoch": 0.44652703287604206, + "grad_norm": 4.694975852966309, + "learning_rate": 6.112774458250522e-05, + "loss": 2.0585, + "step": 5932 + }, + { + "epoch": 0.44660230715670224, + "grad_norm": 5.526699066162109, + "learning_rate": 6.111585910628878e-05, + "loss": 2.1207, + "step": 5933 + }, + { + "epoch": 0.4466775814373624, + "grad_norm": 4.565197467803955, + "learning_rate": 6.110397296926641e-05, + "loss": 2.1849, + "step": 5934 + }, + { + "epoch": 0.44675285571802253, + "grad_norm": 3.6555278301239014, + "learning_rate": 6.10920861721447e-05, + "loss": 1.9686, + "step": 5935 + }, + { + "epoch": 0.4468281299986827, + "grad_norm": 5.101633548736572, + "learning_rate": 6.108019871563031e-05, + "loss": 1.7502, + "step": 5936 + }, + { + "epoch": 0.44690340427934283, + "grad_norm": 4.757491111755371, + "learning_rate": 6.10683106004299e-05, + "loss": 1.8732, + "step": 5937 + }, + { + "epoch": 0.446978678560003, + "grad_norm": 6.349032402038574, + "learning_rate": 6.105642182725017e-05, + "loss": 2.1068, + "step": 5938 + }, + { + "epoch": 0.4470539528406632, + "grad_norm": 4.216952800750732, + "learning_rate": 6.104453239679789e-05, + "loss": 1.8794, + "step": 5939 + }, + { + "epoch": 0.4471292271213233, + "grad_norm": 5.036924839019775, + "learning_rate": 6.103264230977986e-05, + "loss": 1.7914, + "step": 5940 + }, + { + "epoch": 0.4472045014019835, + "grad_norm": 3.6164581775665283, + "learning_rate": 6.10207515669029e-05, + "loss": 2.0867, + "step": 5941 + }, + { + "epoch": 0.44727977568264365, + "grad_norm": 5.532937526702881, + "learning_rate": 6.100886016887387e-05, + "loss": 1.8802, + "step": 5942 + }, + { + "epoch": 0.4473550499633038, + "grad_norm": 6.9371562004089355, + "learning_rate": 6.0996968116399686e-05, + "loss": 1.9058, + "step": 5943 + }, + { + "epoch": 0.44743032424396395, + "grad_norm": 5.944525241851807, + "learning_rate": 6.098507541018732e-05, + "loss": 2.2233, + "step": 5944 + }, + { + "epoch": 0.4475055985246241, + "grad_norm": 4.571796894073486, + "learning_rate": 6.097318205094373e-05, + "loss": 1.6698, + "step": 5945 + }, + { + "epoch": 0.44758087280528425, + "grad_norm": 5.640833377838135, + "learning_rate": 6.096128803937596e-05, + "loss": 1.9164, + "step": 5946 + }, + { + "epoch": 0.4476561470859444, + "grad_norm": 4.008815765380859, + "learning_rate": 6.0949393376191066e-05, + "loss": 1.6408, + "step": 5947 + }, + { + "epoch": 0.44773142136660454, + "grad_norm": 4.3384246826171875, + "learning_rate": 6.0937498062096146e-05, + "loss": 1.8556, + "step": 5948 + }, + { + "epoch": 0.4478066956472647, + "grad_norm": 6.0752153396606445, + "learning_rate": 6.092560209779835e-05, + "loss": 2.1825, + "step": 5949 + }, + { + "epoch": 0.4478819699279249, + "grad_norm": 4.470860481262207, + "learning_rate": 6.091370548400486e-05, + "loss": 2.0284, + "step": 5950 + }, + { + "epoch": 0.447957244208585, + "grad_norm": 4.851217269897461, + "learning_rate": 6.0901808221422885e-05, + "loss": 2.0249, + "step": 5951 + }, + { + "epoch": 0.4480325184892452, + "grad_norm": 4.3529438972473145, + "learning_rate": 6.0889910310759714e-05, + "loss": 2.2993, + "step": 5952 + }, + { + "epoch": 0.44810779276990537, + "grad_norm": 5.031883716583252, + "learning_rate": 6.087801175272261e-05, + "loss": 1.8768, + "step": 5953 + }, + { + "epoch": 0.4481830670505655, + "grad_norm": 4.142302989959717, + "learning_rate": 6.0866112548018915e-05, + "loss": 1.9138, + "step": 5954 + }, + { + "epoch": 0.44825834133122566, + "grad_norm": 6.355103015899658, + "learning_rate": 6.085421269735601e-05, + "loss": 2.5333, + "step": 5955 + }, + { + "epoch": 0.4483336156118858, + "grad_norm": 3.5790059566497803, + "learning_rate": 6.084231220144131e-05, + "loss": 1.5943, + "step": 5956 + }, + { + "epoch": 0.44840888989254596, + "grad_norm": 4.150872230529785, + "learning_rate": 6.083041106098225e-05, + "loss": 2.2456, + "step": 5957 + }, + { + "epoch": 0.44848416417320613, + "grad_norm": 4.3456010818481445, + "learning_rate": 6.081850927668634e-05, + "loss": 1.7906, + "step": 5958 + }, + { + "epoch": 0.44855943845386625, + "grad_norm": 4.23631477355957, + "learning_rate": 6.0806606849261096e-05, + "loss": 1.6512, + "step": 5959 + }, + { + "epoch": 0.44863471273452643, + "grad_norm": 5.442815780639648, + "learning_rate": 6.079470377941407e-05, + "loss": 1.9065, + "step": 5960 + }, + { + "epoch": 0.4487099870151866, + "grad_norm": 6.5241498947143555, + "learning_rate": 6.0782800067852886e-05, + "loss": 1.8874, + "step": 5961 + }, + { + "epoch": 0.4487852612958467, + "grad_norm": 4.726539611816406, + "learning_rate": 6.077089571528517e-05, + "loss": 2.1969, + "step": 5962 + }, + { + "epoch": 0.4488605355765069, + "grad_norm": 5.546689033508301, + "learning_rate": 6.075899072241862e-05, + "loss": 1.8063, + "step": 5963 + }, + { + "epoch": 0.4489358098571671, + "grad_norm": 4.864353656768799, + "learning_rate": 6.074708508996094e-05, + "loss": 1.8378, + "step": 5964 + }, + { + "epoch": 0.4490110841378272, + "grad_norm": 7.372176170349121, + "learning_rate": 6.073517881861988e-05, + "loss": 1.8837, + "step": 5965 + }, + { + "epoch": 0.4490863584184874, + "grad_norm": 4.812704563140869, + "learning_rate": 6.072327190910323e-05, + "loss": 1.9821, + "step": 5966 + }, + { + "epoch": 0.4491616326991475, + "grad_norm": 4.812704563140869, + "learning_rate": 6.072327190910323e-05, + "loss": 2.0037, + "step": 5967 + }, + { + "epoch": 0.44923690697980767, + "grad_norm": 5.514007568359375, + "learning_rate": 6.071136436211886e-05, + "loss": 1.7521, + "step": 5968 + }, + { + "epoch": 0.44931218126046785, + "grad_norm": 5.489030838012695, + "learning_rate": 6.0699456178374605e-05, + "loss": 2.1959, + "step": 5969 + }, + { + "epoch": 0.44938745554112797, + "grad_norm": 4.788032054901123, + "learning_rate": 6.068754735857838e-05, + "loss": 1.7582, + "step": 5970 + }, + { + "epoch": 0.44946272982178814, + "grad_norm": 5.289285182952881, + "learning_rate": 6.067563790343813e-05, + "loss": 2.2174, + "step": 5971 + }, + { + "epoch": 0.4495380041024483, + "grad_norm": 3.956015110015869, + "learning_rate": 6.066372781366183e-05, + "loss": 1.9942, + "step": 5972 + }, + { + "epoch": 0.44961327838310844, + "grad_norm": 5.779730319976807, + "learning_rate": 6.065181708995752e-05, + "loss": 2.014, + "step": 5973 + }, + { + "epoch": 0.4496885526637686, + "grad_norm": 4.122438430786133, + "learning_rate": 6.0639905733033245e-05, + "loss": 1.7382, + "step": 5974 + }, + { + "epoch": 0.44976382694442874, + "grad_norm": 10.876434326171875, + "learning_rate": 6.06279937435971e-05, + "loss": 2.2528, + "step": 5975 + }, + { + "epoch": 0.4498391012250889, + "grad_norm": 6.262092590332031, + "learning_rate": 6.0616081122357235e-05, + "loss": 2.422, + "step": 5976 + }, + { + "epoch": 0.4499143755057491, + "grad_norm": 4.21637487411499, + "learning_rate": 6.06041678700218e-05, + "loss": 2.0436, + "step": 5977 + }, + { + "epoch": 0.4499896497864092, + "grad_norm": 5.686376571655273, + "learning_rate": 6.0592253987299e-05, + "loss": 2.0256, + "step": 5978 + }, + { + "epoch": 0.4500649240670694, + "grad_norm": 4.940842628479004, + "learning_rate": 6.0580339474897106e-05, + "loss": 1.7117, + "step": 5979 + }, + { + "epoch": 0.45014019834772956, + "grad_norm": 4.913683891296387, + "learning_rate": 6.0568424333524386e-05, + "loss": 1.9737, + "step": 5980 + }, + { + "epoch": 0.4502154726283897, + "grad_norm": 5.5284833908081055, + "learning_rate": 6.055650856388917e-05, + "loss": 1.7662, + "step": 5981 + }, + { + "epoch": 0.45029074690904985, + "grad_norm": 3.949586868286133, + "learning_rate": 6.05445921666998e-05, + "loss": 1.8073, + "step": 5982 + }, + { + "epoch": 0.45036602118971003, + "grad_norm": 4.705341339111328, + "learning_rate": 6.053267514266468e-05, + "loss": 1.7362, + "step": 5983 + }, + { + "epoch": 0.45044129547037015, + "grad_norm": 3.7629899978637695, + "learning_rate": 6.0520757492492244e-05, + "loss": 1.7402, + "step": 5984 + }, + { + "epoch": 0.4505165697510303, + "grad_norm": 6.499493598937988, + "learning_rate": 6.0508839216890964e-05, + "loss": 2.4095, + "step": 5985 + }, + { + "epoch": 0.45059184403169045, + "grad_norm": 4.297918319702148, + "learning_rate": 6.049692031656935e-05, + "loss": 2.0019, + "step": 5986 + }, + { + "epoch": 0.4506671183123506, + "grad_norm": 4.276834487915039, + "learning_rate": 6.048500079223595e-05, + "loss": 1.8343, + "step": 5987 + }, + { + "epoch": 0.4507423925930108, + "grad_norm": 4.483490943908691, + "learning_rate": 6.047308064459933e-05, + "loss": 1.9587, + "step": 5988 + }, + { + "epoch": 0.4508176668736709, + "grad_norm": 5.314219951629639, + "learning_rate": 6.04611598743681e-05, + "loss": 1.8696, + "step": 5989 + }, + { + "epoch": 0.4508929411543311, + "grad_norm": 3.4307024478912354, + "learning_rate": 6.044923848225096e-05, + "loss": 1.5495, + "step": 5990 + }, + { + "epoch": 0.45096821543499127, + "grad_norm": 4.445413112640381, + "learning_rate": 6.043731646895656e-05, + "loss": 2.0814, + "step": 5991 + }, + { + "epoch": 0.4510434897156514, + "grad_norm": 4.226023197174072, + "learning_rate": 6.042539383519364e-05, + "loss": 1.8908, + "step": 5992 + }, + { + "epoch": 0.45111876399631157, + "grad_norm": 4.241024494171143, + "learning_rate": 6.041347058167098e-05, + "loss": 2.0917, + "step": 5993 + }, + { + "epoch": 0.45119403827697174, + "grad_norm": 6.026360034942627, + "learning_rate": 6.0401546709097366e-05, + "loss": 1.5562, + "step": 5994 + }, + { + "epoch": 0.45126931255763186, + "grad_norm": 3.9674556255340576, + "learning_rate": 6.038962221818165e-05, + "loss": 1.9904, + "step": 5995 + }, + { + "epoch": 0.45134458683829204, + "grad_norm": 4.2482733726501465, + "learning_rate": 6.0377697109632716e-05, + "loss": 1.7127, + "step": 5996 + }, + { + "epoch": 0.45141986111895216, + "grad_norm": 4.999752521514893, + "learning_rate": 6.036577138415945e-05, + "loss": 1.921, + "step": 5997 + }, + { + "epoch": 0.45149513539961234, + "grad_norm": 5.2310309410095215, + "learning_rate": 6.035384504247082e-05, + "loss": 2.2203, + "step": 5998 + }, + { + "epoch": 0.4515704096802725, + "grad_norm": 6.787916660308838, + "learning_rate": 6.03419180852758e-05, + "loss": 1.9591, + "step": 5999 + }, + { + "epoch": 0.45164568396093263, + "grad_norm": 4.101888179779053, + "learning_rate": 6.032999051328345e-05, + "loss": 2.7233, + "step": 6000 + }, + { + "epoch": 0.4517209582415928, + "grad_norm": 5.638253211975098, + "learning_rate": 6.031806232720277e-05, + "loss": 2.1475, + "step": 6001 + }, + { + "epoch": 0.451796232522253, + "grad_norm": 6.141194820404053, + "learning_rate": 6.030613352774293e-05, + "loss": 1.7782, + "step": 6002 + }, + { + "epoch": 0.4518715068029131, + "grad_norm": 4.367072105407715, + "learning_rate": 6.0294204115613e-05, + "loss": 1.7448, + "step": 6003 + }, + { + "epoch": 0.4519467810835733, + "grad_norm": 3.6829652786254883, + "learning_rate": 6.0282274091522174e-05, + "loss": 1.8882, + "step": 6004 + }, + { + "epoch": 0.4520220553642334, + "grad_norm": 4.12482213973999, + "learning_rate": 6.0270343456179666e-05, + "loss": 2.2419, + "step": 6005 + }, + { + "epoch": 0.4520973296448936, + "grad_norm": 4.992110252380371, + "learning_rate": 6.025841221029469e-05, + "loss": 2.1173, + "step": 6006 + }, + { + "epoch": 0.45217260392555375, + "grad_norm": 3.1126821041107178, + "learning_rate": 6.024648035457656e-05, + "loss": 1.9173, + "step": 6007 + }, + { + "epoch": 0.45224787820621387, + "grad_norm": 4.627983570098877, + "learning_rate": 6.023454788973457e-05, + "loss": 2.1659, + "step": 6008 + }, + { + "epoch": 0.45232315248687405, + "grad_norm": 4.1611528396606445, + "learning_rate": 6.022261481647808e-05, + "loss": 2.0813, + "step": 6009 + }, + { + "epoch": 0.4523984267675342, + "grad_norm": 5.859460830688477, + "learning_rate": 6.021068113551645e-05, + "loss": 1.8312, + "step": 6010 + }, + { + "epoch": 0.45247370104819434, + "grad_norm": 3.900682210922241, + "learning_rate": 6.019874684755914e-05, + "loss": 1.9834, + "step": 6011 + }, + { + "epoch": 0.4525489753288545, + "grad_norm": 4.332180023193359, + "learning_rate": 6.0186811953315593e-05, + "loss": 1.8246, + "step": 6012 + }, + { + "epoch": 0.4526242496095147, + "grad_norm": 4.874317169189453, + "learning_rate": 6.01748764534953e-05, + "loss": 2.292, + "step": 6013 + }, + { + "epoch": 0.4526995238901748, + "grad_norm": 4.688967227935791, + "learning_rate": 6.0162940348807804e-05, + "loss": 1.6333, + "step": 6014 + }, + { + "epoch": 0.452774798170835, + "grad_norm": 6.165711402893066, + "learning_rate": 6.0151003639962654e-05, + "loss": 2.1033, + "step": 6015 + }, + { + "epoch": 0.4528500724514951, + "grad_norm": 5.114849090576172, + "learning_rate": 6.0139066327669466e-05, + "loss": 1.937, + "step": 6016 + }, + { + "epoch": 0.4529253467321553, + "grad_norm": 3.925725221633911, + "learning_rate": 6.012712841263788e-05, + "loss": 1.916, + "step": 6017 + }, + { + "epoch": 0.45300062101281546, + "grad_norm": 3.900815725326538, + "learning_rate": 6.011518989557757e-05, + "loss": 1.6135, + "step": 6018 + }, + { + "epoch": 0.4530758952934756, + "grad_norm": 6.237217903137207, + "learning_rate": 6.010325077719825e-05, + "loss": 1.9457, + "step": 6019 + }, + { + "epoch": 0.45315116957413576, + "grad_norm": 5.853795051574707, + "learning_rate": 6.009131105820965e-05, + "loss": 1.953, + "step": 6020 + }, + { + "epoch": 0.45322644385479594, + "grad_norm": 6.7400898933410645, + "learning_rate": 6.007937073932157e-05, + "loss": 2.1115, + "step": 6021 + }, + { + "epoch": 0.45330171813545606, + "grad_norm": 5.347073554992676, + "learning_rate": 6.0067429821243816e-05, + "loss": 1.7168, + "step": 6022 + }, + { + "epoch": 0.45337699241611623, + "grad_norm": 5.739337921142578, + "learning_rate": 6.005548830468625e-05, + "loss": 1.7385, + "step": 6023 + }, + { + "epoch": 0.45345226669677635, + "grad_norm": 4.295375823974609, + "learning_rate": 6.004354619035876e-05, + "loss": 1.9291, + "step": 6024 + }, + { + "epoch": 0.4535275409774365, + "grad_norm": 5.407988548278809, + "learning_rate": 6.003160347897126e-05, + "loss": 1.9602, + "step": 6025 + }, + { + "epoch": 0.4536028152580967, + "grad_norm": 5.012875556945801, + "learning_rate": 6.0019660171233715e-05, + "loss": 1.658, + "step": 6026 + }, + { + "epoch": 0.4536780895387568, + "grad_norm": 5.996007442474365, + "learning_rate": 6.0007716267856126e-05, + "loss": 1.988, + "step": 6027 + }, + { + "epoch": 0.453753363819417, + "grad_norm": 5.223214149475098, + "learning_rate": 5.9995771769548514e-05, + "loss": 1.6316, + "step": 6028 + }, + { + "epoch": 0.4538286381000772, + "grad_norm": 3.7236433029174805, + "learning_rate": 5.998382667702096e-05, + "loss": 1.8148, + "step": 6029 + }, + { + "epoch": 0.4539039123807373, + "grad_norm": 6.124312877655029, + "learning_rate": 5.997188099098357e-05, + "loss": 2.2687, + "step": 6030 + }, + { + "epoch": 0.45397918666139747, + "grad_norm": 5.337610721588135, + "learning_rate": 5.995993471214645e-05, + "loss": 1.7451, + "step": 6031 + }, + { + "epoch": 0.45405446094205765, + "grad_norm": 6.2527337074279785, + "learning_rate": 5.9947987841219774e-05, + "loss": 1.7022, + "step": 6032 + }, + { + "epoch": 0.45412973522271777, + "grad_norm": 4.850403308868408, + "learning_rate": 5.9936040378913785e-05, + "loss": 1.7393, + "step": 6033 + }, + { + "epoch": 0.45420500950337794, + "grad_norm": 6.706861972808838, + "learning_rate": 5.9924092325938694e-05, + "loss": 1.7573, + "step": 6034 + }, + { + "epoch": 0.45428028378403806, + "grad_norm": 4.820898532867432, + "learning_rate": 5.991214368300481e-05, + "loss": 2.0673, + "step": 6035 + }, + { + "epoch": 0.45435555806469824, + "grad_norm": 4.394272327423096, + "learning_rate": 5.9900194450822413e-05, + "loss": 1.928, + "step": 6036 + }, + { + "epoch": 0.4544308323453584, + "grad_norm": 4.091886043548584, + "learning_rate": 5.988824463010187e-05, + "loss": 1.8936, + "step": 6037 + }, + { + "epoch": 0.45450610662601854, + "grad_norm": 5.050229072570801, + "learning_rate": 5.987629422155354e-05, + "loss": 1.6381, + "step": 6038 + }, + { + "epoch": 0.4545813809066787, + "grad_norm": 5.246290683746338, + "learning_rate": 5.9864343225887864e-05, + "loss": 1.7704, + "step": 6039 + }, + { + "epoch": 0.4546566551873389, + "grad_norm": 5.636992931365967, + "learning_rate": 5.985239164381529e-05, + "loss": 1.9712, + "step": 6040 + }, + { + "epoch": 0.454731929467999, + "grad_norm": 4.305088043212891, + "learning_rate": 5.98404394760463e-05, + "loss": 1.6054, + "step": 6041 + }, + { + "epoch": 0.4548072037486592, + "grad_norm": 4.476842403411865, + "learning_rate": 5.9828486723291435e-05, + "loss": 2.4206, + "step": 6042 + }, + { + "epoch": 0.45488247802931936, + "grad_norm": 4.255067348480225, + "learning_rate": 5.981653338626122e-05, + "loss": 1.6144, + "step": 6043 + }, + { + "epoch": 0.4549577523099795, + "grad_norm": 5.431327819824219, + "learning_rate": 5.980457946566625e-05, + "loss": 2.2985, + "step": 6044 + }, + { + "epoch": 0.45503302659063966, + "grad_norm": 5.836861610412598, + "learning_rate": 5.979262496221718e-05, + "loss": 1.68, + "step": 6045 + }, + { + "epoch": 0.4551083008712998, + "grad_norm": 4.298095226287842, + "learning_rate": 5.978066987662465e-05, + "loss": 1.7517, + "step": 6046 + }, + { + "epoch": 0.45518357515195995, + "grad_norm": 4.132864475250244, + "learning_rate": 5.976871420959936e-05, + "loss": 1.9019, + "step": 6047 + }, + { + "epoch": 0.4552588494326201, + "grad_norm": 5.600347518920898, + "learning_rate": 5.975675796185204e-05, + "loss": 1.9065, + "step": 6048 + }, + { + "epoch": 0.45533412371328025, + "grad_norm": 3.901496648788452, + "learning_rate": 5.9744801134093454e-05, + "loss": 1.9271, + "step": 6049 + }, + { + "epoch": 0.4554093979939404, + "grad_norm": 5.38663387298584, + "learning_rate": 5.9732843727034395e-05, + "loss": 1.8771, + "step": 6050 + }, + { + "epoch": 0.4554846722746006, + "grad_norm": 5.455505847930908, + "learning_rate": 5.972088574138571e-05, + "loss": 1.9057, + "step": 6051 + }, + { + "epoch": 0.4555599465552607, + "grad_norm": 5.193195343017578, + "learning_rate": 5.9708927177858265e-05, + "loss": 1.687, + "step": 6052 + }, + { + "epoch": 0.4556352208359209, + "grad_norm": 5.373270034790039, + "learning_rate": 5.969696803716295e-05, + "loss": 1.9376, + "step": 6053 + }, + { + "epoch": 0.455710495116581, + "grad_norm": 5.75775671005249, + "learning_rate": 5.968500832001072e-05, + "loss": 2.2669, + "step": 6054 + }, + { + "epoch": 0.4557857693972412, + "grad_norm": 4.701210975646973, + "learning_rate": 5.967304802711252e-05, + "loss": 1.9175, + "step": 6055 + }, + { + "epoch": 0.45586104367790137, + "grad_norm": 5.253629684448242, + "learning_rate": 5.966108715917937e-05, + "loss": 2.2749, + "step": 6056 + }, + { + "epoch": 0.4559363179585615, + "grad_norm": 5.26934289932251, + "learning_rate": 5.9649125716922316e-05, + "loss": 2.4595, + "step": 6057 + }, + { + "epoch": 0.45601159223922166, + "grad_norm": 6.326803684234619, + "learning_rate": 5.9637163701052434e-05, + "loss": 2.0817, + "step": 6058 + }, + { + "epoch": 0.45608686651988184, + "grad_norm": 5.832178592681885, + "learning_rate": 5.9625201112280806e-05, + "loss": 2.15, + "step": 6059 + }, + { + "epoch": 0.45616214080054196, + "grad_norm": 5.275348663330078, + "learning_rate": 5.96132379513186e-05, + "loss": 1.5737, + "step": 6060 + }, + { + "epoch": 0.45623741508120214, + "grad_norm": 4.472059726715088, + "learning_rate": 5.960127421887697e-05, + "loss": 1.9204, + "step": 6061 + }, + { + "epoch": 0.4563126893618623, + "grad_norm": 4.421325206756592, + "learning_rate": 5.9589309915667146e-05, + "loss": 1.7761, + "step": 6062 + }, + { + "epoch": 0.45638796364252243, + "grad_norm": 6.40798807144165, + "learning_rate": 5.9577345042400355e-05, + "loss": 2.2179, + "step": 6063 + }, + { + "epoch": 0.4564632379231826, + "grad_norm": 4.162315368652344, + "learning_rate": 5.9565379599787885e-05, + "loss": 1.9176, + "step": 6064 + }, + { + "epoch": 0.45653851220384273, + "grad_norm": 5.723100185394287, + "learning_rate": 5.955341358854105e-05, + "loss": 2.1868, + "step": 6065 + }, + { + "epoch": 0.4566137864845029, + "grad_norm": 4.859919548034668, + "learning_rate": 5.954144700937119e-05, + "loss": 1.9833, + "step": 6066 + }, + { + "epoch": 0.4566890607651631, + "grad_norm": 5.13480281829834, + "learning_rate": 5.952947986298967e-05, + "loss": 1.7, + "step": 6067 + }, + { + "epoch": 0.4567643350458232, + "grad_norm": 4.001134872436523, + "learning_rate": 5.9517512150107926e-05, + "loss": 2.0627, + "step": 6068 + }, + { + "epoch": 0.4568396093264834, + "grad_norm": 3.800814151763916, + "learning_rate": 5.950554387143739e-05, + "loss": 1.6681, + "step": 6069 + }, + { + "epoch": 0.45691488360714355, + "grad_norm": 8.66020393371582, + "learning_rate": 5.949357502768954e-05, + "loss": 2.0261, + "step": 6070 + }, + { + "epoch": 0.45699015788780367, + "grad_norm": 4.2896623611450195, + "learning_rate": 5.948160561957591e-05, + "loss": 1.866, + "step": 6071 + }, + { + "epoch": 0.45706543216846385, + "grad_norm": 3.8667845726013184, + "learning_rate": 5.9469635647808006e-05, + "loss": 1.7372, + "step": 6072 + }, + { + "epoch": 0.45714070644912397, + "grad_norm": 3.50824236869812, + "learning_rate": 5.945766511309745e-05, + "loss": 1.9068, + "step": 6073 + }, + { + "epoch": 0.45721598072978414, + "grad_norm": 4.218340873718262, + "learning_rate": 5.944569401615585e-05, + "loss": 1.573, + "step": 6074 + }, + { + "epoch": 0.4572912550104443, + "grad_norm": 4.466118335723877, + "learning_rate": 5.943372235769483e-05, + "loss": 1.8682, + "step": 6075 + }, + { + "epoch": 0.45736652929110444, + "grad_norm": 4.892060279846191, + "learning_rate": 5.942175013842609e-05, + "loss": 1.973, + "step": 6076 + }, + { + "epoch": 0.4574418035717646, + "grad_norm": 5.726229190826416, + "learning_rate": 5.9409777359061324e-05, + "loss": 2.0796, + "step": 6077 + }, + { + "epoch": 0.4575170778524248, + "grad_norm": 4.99564266204834, + "learning_rate": 5.939780402031232e-05, + "loss": 2.0617, + "step": 6078 + }, + { + "epoch": 0.4575923521330849, + "grad_norm": 6.545931339263916, + "learning_rate": 5.938583012289082e-05, + "loss": 1.7876, + "step": 6079 + }, + { + "epoch": 0.4576676264137451, + "grad_norm": 4.696487903594971, + "learning_rate": 5.937385566750864e-05, + "loss": 1.7632, + "step": 6080 + }, + { + "epoch": 0.45774290069440526, + "grad_norm": 3.8622817993164062, + "learning_rate": 5.9361880654877645e-05, + "loss": 1.9871, + "step": 6081 + }, + { + "epoch": 0.4578181749750654, + "grad_norm": 3.7468810081481934, + "learning_rate": 5.934990508570971e-05, + "loss": 2.0188, + "step": 6082 + }, + { + "epoch": 0.45789344925572556, + "grad_norm": 4.950771331787109, + "learning_rate": 5.9337928960716746e-05, + "loss": 2.0899, + "step": 6083 + }, + { + "epoch": 0.4579687235363857, + "grad_norm": 5.0970611572265625, + "learning_rate": 5.932595228061069e-05, + "loss": 1.9046, + "step": 6084 + }, + { + "epoch": 0.45804399781704586, + "grad_norm": 4.2430291175842285, + "learning_rate": 5.931397504610353e-05, + "loss": 1.8966, + "step": 6085 + }, + { + "epoch": 0.45811927209770603, + "grad_norm": 4.811651706695557, + "learning_rate": 5.9301997257907296e-05, + "loss": 1.7643, + "step": 6086 + }, + { + "epoch": 0.45819454637836615, + "grad_norm": 5.101222991943359, + "learning_rate": 5.9290018916734e-05, + "loss": 1.9049, + "step": 6087 + }, + { + "epoch": 0.45826982065902633, + "grad_norm": 6.524330139160156, + "learning_rate": 5.9278040023295744e-05, + "loss": 1.7444, + "step": 6088 + }, + { + "epoch": 0.4583450949396865, + "grad_norm": 4.4134955406188965, + "learning_rate": 5.926606057830462e-05, + "loss": 1.9636, + "step": 6089 + }, + { + "epoch": 0.4584203692203466, + "grad_norm": 4.550789833068848, + "learning_rate": 5.925408058247278e-05, + "loss": 2.3889, + "step": 6090 + }, + { + "epoch": 0.4584956435010068, + "grad_norm": 4.4516072273254395, + "learning_rate": 5.924210003651242e-05, + "loss": 1.7583, + "step": 6091 + }, + { + "epoch": 0.458570917781667, + "grad_norm": 4.464069366455078, + "learning_rate": 5.923011894113573e-05, + "loss": 2.019, + "step": 6092 + }, + { + "epoch": 0.4586461920623271, + "grad_norm": 4.778139114379883, + "learning_rate": 5.9218137297054945e-05, + "loss": 1.8796, + "step": 6093 + }, + { + "epoch": 0.45872146634298727, + "grad_norm": 5.728743553161621, + "learning_rate": 5.9206155104982333e-05, + "loss": 1.9561, + "step": 6094 + }, + { + "epoch": 0.4587967406236474, + "grad_norm": 3.7842822074890137, + "learning_rate": 5.919417236563023e-05, + "loss": 1.648, + "step": 6095 + }, + { + "epoch": 0.45887201490430757, + "grad_norm": 4.400025367736816, + "learning_rate": 5.918218907971095e-05, + "loss": 1.8278, + "step": 6096 + }, + { + "epoch": 0.45894728918496774, + "grad_norm": 4.153076171875, + "learning_rate": 5.9170205247936896e-05, + "loss": 1.7074, + "step": 6097 + }, + { + "epoch": 0.45902256346562786, + "grad_norm": 5.94003963470459, + "learning_rate": 5.915822087102044e-05, + "loss": 1.9478, + "step": 6098 + }, + { + "epoch": 0.45909783774628804, + "grad_norm": 4.797269344329834, + "learning_rate": 5.914623594967404e-05, + "loss": 1.6932, + "step": 6099 + }, + { + "epoch": 0.4591731120269482, + "grad_norm": 4.070412635803223, + "learning_rate": 5.913425048461013e-05, + "loss": 1.9724, + "step": 6100 + }, + { + "epoch": 0.45924838630760834, + "grad_norm": 4.645787239074707, + "learning_rate": 5.912226447654127e-05, + "loss": 1.77, + "step": 6101 + }, + { + "epoch": 0.4593236605882685, + "grad_norm": 3.9777133464813232, + "learning_rate": 5.911027792617995e-05, + "loss": 1.8891, + "step": 6102 + }, + { + "epoch": 0.45939893486892863, + "grad_norm": 7.653037071228027, + "learning_rate": 5.909829083423875e-05, + "loss": 1.8425, + "step": 6103 + }, + { + "epoch": 0.4594742091495888, + "grad_norm": 5.192277431488037, + "learning_rate": 5.908630320143026e-05, + "loss": 2.0377, + "step": 6104 + }, + { + "epoch": 0.459549483430249, + "grad_norm": 4.545719623565674, + "learning_rate": 5.907431502846712e-05, + "loss": 1.8998, + "step": 6105 + }, + { + "epoch": 0.4596247577109091, + "grad_norm": 5.5499653816223145, + "learning_rate": 5.906232631606198e-05, + "loss": 1.742, + "step": 6106 + }, + { + "epoch": 0.4597000319915693, + "grad_norm": 4.814578056335449, + "learning_rate": 5.905033706492755e-05, + "loss": 2.0466, + "step": 6107 + }, + { + "epoch": 0.45977530627222946, + "grad_norm": 4.70163631439209, + "learning_rate": 5.903834727577656e-05, + "loss": 2.0124, + "step": 6108 + }, + { + "epoch": 0.4598505805528896, + "grad_norm": 5.952963352203369, + "learning_rate": 5.902635694932175e-05, + "loss": 2.2362, + "step": 6109 + }, + { + "epoch": 0.45992585483354975, + "grad_norm": 6.062039375305176, + "learning_rate": 5.901436608627592e-05, + "loss": 1.8526, + "step": 6110 + }, + { + "epoch": 0.46000112911420993, + "grad_norm": 6.062039375305176, + "learning_rate": 5.901436608627592e-05, + "loss": 1.855, + "step": 6111 + }, + { + "epoch": 0.46007640339487005, + "grad_norm": 3.8001315593719482, + "learning_rate": 5.900237468735188e-05, + "loss": 1.5048, + "step": 6112 + }, + { + "epoch": 0.4601516776755302, + "grad_norm": 4.128273963928223, + "learning_rate": 5.89903827532625e-05, + "loss": 2.0092, + "step": 6113 + }, + { + "epoch": 0.46022695195619034, + "grad_norm": 8.076136589050293, + "learning_rate": 5.897839028472066e-05, + "loss": 2.4223, + "step": 6114 + }, + { + "epoch": 0.4603022262368505, + "grad_norm": 4.980038166046143, + "learning_rate": 5.896639728243929e-05, + "loss": 1.9077, + "step": 6115 + }, + { + "epoch": 0.4603775005175107, + "grad_norm": 4.359116554260254, + "learning_rate": 5.895440374713133e-05, + "loss": 2.0745, + "step": 6116 + }, + { + "epoch": 0.4604527747981708, + "grad_norm": 4.29932165145874, + "learning_rate": 5.894240967950976e-05, + "loss": 1.9347, + "step": 6117 + }, + { + "epoch": 0.460528049078831, + "grad_norm": 5.578684329986572, + "learning_rate": 5.8930415080287585e-05, + "loss": 2.1286, + "step": 6118 + }, + { + "epoch": 0.46060332335949117, + "grad_norm": 5.237558364868164, + "learning_rate": 5.8918419950177864e-05, + "loss": 2.2142, + "step": 6119 + }, + { + "epoch": 0.4606785976401513, + "grad_norm": 3.7731804847717285, + "learning_rate": 5.890642428989366e-05, + "loss": 2.0002, + "step": 6120 + }, + { + "epoch": 0.46075387192081146, + "grad_norm": 6.02181339263916, + "learning_rate": 5.889442810014809e-05, + "loss": 1.8548, + "step": 6121 + }, + { + "epoch": 0.4608291462014716, + "grad_norm": 5.883323669433594, + "learning_rate": 5.888243138165429e-05, + "loss": 1.9086, + "step": 6122 + }, + { + "epoch": 0.46090442048213176, + "grad_norm": 5.164095401763916, + "learning_rate": 5.887043413512543e-05, + "loss": 1.7454, + "step": 6123 + }, + { + "epoch": 0.46097969476279194, + "grad_norm": 4.250068664550781, + "learning_rate": 5.8858436361274714e-05, + "loss": 2.0833, + "step": 6124 + }, + { + "epoch": 0.46105496904345206, + "grad_norm": 5.0383782386779785, + "learning_rate": 5.884643806081538e-05, + "loss": 1.8903, + "step": 6125 + }, + { + "epoch": 0.46113024332411223, + "grad_norm": 4.567081451416016, + "learning_rate": 5.883443923446068e-05, + "loss": 1.9542, + "step": 6126 + }, + { + "epoch": 0.4612055176047724, + "grad_norm": 5.049877643585205, + "learning_rate": 5.882243988292393e-05, + "loss": 1.8925, + "step": 6127 + }, + { + "epoch": 0.46128079188543253, + "grad_norm": 4.685807228088379, + "learning_rate": 5.881044000691842e-05, + "loss": 1.9993, + "step": 6128 + }, + { + "epoch": 0.4613560661660927, + "grad_norm": 6.258084774017334, + "learning_rate": 5.879843960715754e-05, + "loss": 2.3033, + "step": 6129 + }, + { + "epoch": 0.4614313404467529, + "grad_norm": 3.606372356414795, + "learning_rate": 5.878643868435467e-05, + "loss": 2.0483, + "step": 6130 + }, + { + "epoch": 0.461506614727413, + "grad_norm": 3.7735278606414795, + "learning_rate": 5.877443723922323e-05, + "loss": 1.8318, + "step": 6131 + }, + { + "epoch": 0.4615818890080732, + "grad_norm": 4.213362216949463, + "learning_rate": 5.876243527247669e-05, + "loss": 1.9625, + "step": 6132 + }, + { + "epoch": 0.4616571632887333, + "grad_norm": 4.581994533538818, + "learning_rate": 5.87504327848285e-05, + "loss": 1.988, + "step": 6133 + }, + { + "epoch": 0.4617324375693935, + "grad_norm": 5.330654621124268, + "learning_rate": 5.87384297769922e-05, + "loss": 2.2455, + "step": 6134 + }, + { + "epoch": 0.46180771185005365, + "grad_norm": 4.982420444488525, + "learning_rate": 5.8726426249681296e-05, + "loss": 2.0131, + "step": 6135 + }, + { + "epoch": 0.46188298613071377, + "grad_norm": 5.3704633712768555, + "learning_rate": 5.871442220360942e-05, + "loss": 1.5961, + "step": 6136 + }, + { + "epoch": 0.46195826041137394, + "grad_norm": 5.449668884277344, + "learning_rate": 5.870241763949014e-05, + "loss": 1.6291, + "step": 6137 + }, + { + "epoch": 0.4620335346920341, + "grad_norm": 4.630037307739258, + "learning_rate": 5.86904125580371e-05, + "loss": 1.9306, + "step": 6138 + }, + { + "epoch": 0.46210880897269424, + "grad_norm": 7.90316915512085, + "learning_rate": 5.8678406959963974e-05, + "loss": 1.6066, + "step": 6139 + }, + { + "epoch": 0.4621840832533544, + "grad_norm": 3.5883285999298096, + "learning_rate": 5.866640084598444e-05, + "loss": 1.9163, + "step": 6140 + }, + { + "epoch": 0.4622593575340146, + "grad_norm": 6.554765701293945, + "learning_rate": 5.865439421681226e-05, + "loss": 2.0131, + "step": 6141 + }, + { + "epoch": 0.4623346318146747, + "grad_norm": 3.749727725982666, + "learning_rate": 5.864238707316117e-05, + "loss": 1.7178, + "step": 6142 + }, + { + "epoch": 0.4624099060953349, + "grad_norm": 4.062201023101807, + "learning_rate": 5.863037941574496e-05, + "loss": 1.7238, + "step": 6143 + }, + { + "epoch": 0.462485180375995, + "grad_norm": 5.0798540115356445, + "learning_rate": 5.8618371245277474e-05, + "loss": 1.9235, + "step": 6144 + }, + { + "epoch": 0.4625604546566552, + "grad_norm": 4.518141746520996, + "learning_rate": 5.860636256247252e-05, + "loss": 1.9074, + "step": 6145 + }, + { + "epoch": 0.46263572893731536, + "grad_norm": 4.606646537780762, + "learning_rate": 5.859435336804401e-05, + "loss": 2.0495, + "step": 6146 + }, + { + "epoch": 0.4627110032179755, + "grad_norm": 4.6152262687683105, + "learning_rate": 5.858234366270586e-05, + "loss": 2.0432, + "step": 6147 + }, + { + "epoch": 0.46278627749863566, + "grad_norm": 4.4273176193237305, + "learning_rate": 5.8570333447172024e-05, + "loss": 2.1614, + "step": 6148 + }, + { + "epoch": 0.46286155177929583, + "grad_norm": 4.393697738647461, + "learning_rate": 5.8558322722156437e-05, + "loss": 2.3018, + "step": 6149 + }, + { + "epoch": 0.46293682605995595, + "grad_norm": 4.865461349487305, + "learning_rate": 5.854631148837312e-05, + "loss": 1.99, + "step": 6150 + }, + { + "epoch": 0.46301210034061613, + "grad_norm": 4.272097110748291, + "learning_rate": 5.8534299746536103e-05, + "loss": 1.833, + "step": 6151 + }, + { + "epoch": 0.46308737462127625, + "grad_norm": 5.799352645874023, + "learning_rate": 5.852228749735946e-05, + "loss": 1.9095, + "step": 6152 + }, + { + "epoch": 0.4631626489019364, + "grad_norm": 4.514222145080566, + "learning_rate": 5.851027474155728e-05, + "loss": 1.8374, + "step": 6153 + }, + { + "epoch": 0.4632379231825966, + "grad_norm": 4.221158981323242, + "learning_rate": 5.8498261479843685e-05, + "loss": 1.8974, + "step": 6154 + }, + { + "epoch": 0.4633131974632567, + "grad_norm": 4.292566776275635, + "learning_rate": 5.848624771293284e-05, + "loss": 1.9298, + "step": 6155 + }, + { + "epoch": 0.4633884717439169, + "grad_norm": 5.188507556915283, + "learning_rate": 5.847423344153891e-05, + "loss": 2.0096, + "step": 6156 + }, + { + "epoch": 0.4634637460245771, + "grad_norm": 6.87600040435791, + "learning_rate": 5.84622186663761e-05, + "loss": 1.9994, + "step": 6157 + }, + { + "epoch": 0.4635390203052372, + "grad_norm": 5.687765121459961, + "learning_rate": 5.845020338815869e-05, + "loss": 1.8411, + "step": 6158 + }, + { + "epoch": 0.46361429458589737, + "grad_norm": 5.334669589996338, + "learning_rate": 5.8438187607600935e-05, + "loss": 1.8098, + "step": 6159 + }, + { + "epoch": 0.46368956886655754, + "grad_norm": 5.574984073638916, + "learning_rate": 5.8426171325417136e-05, + "loss": 1.9769, + "step": 6160 + }, + { + "epoch": 0.46376484314721766, + "grad_norm": 5.236361980438232, + "learning_rate": 5.841415454232162e-05, + "loss": 1.8007, + "step": 6161 + }, + { + "epoch": 0.46384011742787784, + "grad_norm": 5.731012344360352, + "learning_rate": 5.840213725902877e-05, + "loss": 1.9784, + "step": 6162 + }, + { + "epoch": 0.46391539170853796, + "grad_norm": 5.692600727081299, + "learning_rate": 5.839011947625295e-05, + "loss": 2.1517, + "step": 6163 + }, + { + "epoch": 0.46399066598919814, + "grad_norm": 6.384438514709473, + "learning_rate": 5.8378101194708614e-05, + "loss": 1.7702, + "step": 6164 + }, + { + "epoch": 0.4640659402698583, + "grad_norm": 5.911719799041748, + "learning_rate": 5.8366082415110215e-05, + "loss": 1.9245, + "step": 6165 + }, + { + "epoch": 0.46414121455051843, + "grad_norm": 6.263427734375, + "learning_rate": 5.83540631381722e-05, + "loss": 1.8174, + "step": 6166 + }, + { + "epoch": 0.4642164888311786, + "grad_norm": 4.851933479309082, + "learning_rate": 5.834204336460911e-05, + "loss": 2.0516, + "step": 6167 + }, + { + "epoch": 0.4642917631118388, + "grad_norm": 4.586934566497803, + "learning_rate": 5.8330023095135476e-05, + "loss": 1.8159, + "step": 6168 + }, + { + "epoch": 0.4643670373924989, + "grad_norm": 6.346707344055176, + "learning_rate": 5.8318002330465884e-05, + "loss": 1.6566, + "step": 6169 + }, + { + "epoch": 0.4644423116731591, + "grad_norm": 4.117097854614258, + "learning_rate": 5.830598107131491e-05, + "loss": 2.1882, + "step": 6170 + }, + { + "epoch": 0.46451758595381926, + "grad_norm": 5.091454029083252, + "learning_rate": 5.8293959318397194e-05, + "loss": 1.5757, + "step": 6171 + }, + { + "epoch": 0.4645928602344794, + "grad_norm": 6.693051815032959, + "learning_rate": 5.82819370724274e-05, + "loss": 2.215, + "step": 6172 + }, + { + "epoch": 0.46466813451513955, + "grad_norm": 4.601351737976074, + "learning_rate": 5.826991433412019e-05, + "loss": 2.255, + "step": 6173 + }, + { + "epoch": 0.4647434087957997, + "grad_norm": 4.238350868225098, + "learning_rate": 5.825789110419032e-05, + "loss": 1.7408, + "step": 6174 + }, + { + "epoch": 0.46481868307645985, + "grad_norm": 3.776761054992676, + "learning_rate": 5.8245867383352516e-05, + "loss": 1.9469, + "step": 6175 + }, + { + "epoch": 0.46489395735712, + "grad_norm": 5.935523986816406, + "learning_rate": 5.8233843172321564e-05, + "loss": 2.1649, + "step": 6176 + }, + { + "epoch": 0.46496923163778014, + "grad_norm": 4.455471992492676, + "learning_rate": 5.822181847181225e-05, + "loss": 2.0635, + "step": 6177 + }, + { + "epoch": 0.4650445059184403, + "grad_norm": 3.9656732082366943, + "learning_rate": 5.8209793282539414e-05, + "loss": 1.8731, + "step": 6178 + }, + { + "epoch": 0.4651197801991005, + "grad_norm": 3.774287462234497, + "learning_rate": 5.819776760521793e-05, + "loss": 1.9017, + "step": 6179 + }, + { + "epoch": 0.4651950544797606, + "grad_norm": 6.014800071716309, + "learning_rate": 5.818574144056268e-05, + "loss": 2.4556, + "step": 6180 + }, + { + "epoch": 0.4652703287604208, + "grad_norm": 5.37847375869751, + "learning_rate": 5.8173714789288604e-05, + "loss": 2.0758, + "step": 6181 + }, + { + "epoch": 0.4653456030410809, + "grad_norm": 5.380580902099609, + "learning_rate": 5.816168765211063e-05, + "loss": 1.7509, + "step": 6182 + }, + { + "epoch": 0.4654208773217411, + "grad_norm": 3.3990745544433594, + "learning_rate": 5.814966002974374e-05, + "loss": 1.9663, + "step": 6183 + }, + { + "epoch": 0.46549615160240126, + "grad_norm": 4.516787528991699, + "learning_rate": 5.8137631922902944e-05, + "loss": 2.1708, + "step": 6184 + }, + { + "epoch": 0.4655714258830614, + "grad_norm": 3.905266761779785, + "learning_rate": 5.812560333230328e-05, + "loss": 1.8058, + "step": 6185 + }, + { + "epoch": 0.46564670016372156, + "grad_norm": 4.200650691986084, + "learning_rate": 5.811357425865981e-05, + "loss": 2.3434, + "step": 6186 + }, + { + "epoch": 0.46572197444438174, + "grad_norm": 4.848728179931641, + "learning_rate": 5.8101544702687636e-05, + "loss": 1.6202, + "step": 6187 + }, + { + "epoch": 0.46579724872504186, + "grad_norm": 5.740512371063232, + "learning_rate": 5.808951466510188e-05, + "loss": 1.8352, + "step": 6188 + }, + { + "epoch": 0.46587252300570203, + "grad_norm": 5.244520664215088, + "learning_rate": 5.807748414661769e-05, + "loss": 1.9215, + "step": 6189 + }, + { + "epoch": 0.4659477972863622, + "grad_norm": 5.365092754364014, + "learning_rate": 5.806545314795022e-05, + "loss": 1.836, + "step": 6190 + }, + { + "epoch": 0.46602307156702233, + "grad_norm": 3.605998992919922, + "learning_rate": 5.805342166981472e-05, + "loss": 2.051, + "step": 6191 + }, + { + "epoch": 0.4660983458476825, + "grad_norm": 4.080414295196533, + "learning_rate": 5.804138971292642e-05, + "loss": 1.7786, + "step": 6192 + }, + { + "epoch": 0.4661736201283426, + "grad_norm": 3.924818515777588, + "learning_rate": 5.802935727800056e-05, + "loss": 2.01, + "step": 6193 + }, + { + "epoch": 0.4662488944090028, + "grad_norm": 4.2257466316223145, + "learning_rate": 5.801732436575245e-05, + "loss": 1.8988, + "step": 6194 + }, + { + "epoch": 0.466324168689663, + "grad_norm": 4.413915157318115, + "learning_rate": 5.800529097689742e-05, + "loss": 1.8617, + "step": 6195 + }, + { + "epoch": 0.4663994429703231, + "grad_norm": 3.800804376602173, + "learning_rate": 5.799325711215079e-05, + "loss": 2.1775, + "step": 6196 + }, + { + "epoch": 0.4664747172509833, + "grad_norm": 4.679453372955322, + "learning_rate": 5.7981222772227985e-05, + "loss": 2.2822, + "step": 6197 + }, + { + "epoch": 0.46654999153164345, + "grad_norm": 3.8881962299346924, + "learning_rate": 5.796918795784437e-05, + "loss": 1.424, + "step": 6198 + }, + { + "epoch": 0.46662526581230357, + "grad_norm": 4.5422773361206055, + "learning_rate": 5.7957152669715406e-05, + "loss": 2.0155, + "step": 6199 + }, + { + "epoch": 0.46670054009296374, + "grad_norm": 4.647782802581787, + "learning_rate": 5.7945116908556554e-05, + "loss": 1.7315, + "step": 6200 + }, + { + "epoch": 0.46677581437362387, + "grad_norm": 5.930076599121094, + "learning_rate": 5.793308067508328e-05, + "loss": 1.4385, + "step": 6201 + }, + { + "epoch": 0.46685108865428404, + "grad_norm": 4.503535747528076, + "learning_rate": 5.7921043970011134e-05, + "loss": 1.7247, + "step": 6202 + }, + { + "epoch": 0.4669263629349442, + "grad_norm": 4.112096309661865, + "learning_rate": 5.790900679405565e-05, + "loss": 2.0921, + "step": 6203 + }, + { + "epoch": 0.46700163721560434, + "grad_norm": 4.92310094833374, + "learning_rate": 5.7896969147932414e-05, + "loss": 1.7441, + "step": 6204 + }, + { + "epoch": 0.4670769114962645, + "grad_norm": 5.805795192718506, + "learning_rate": 5.7884931032357015e-05, + "loss": 2.5231, + "step": 6205 + }, + { + "epoch": 0.4671521857769247, + "grad_norm": 4.6085309982299805, + "learning_rate": 5.787289244804509e-05, + "loss": 2.0675, + "step": 6206 + }, + { + "epoch": 0.4672274600575848, + "grad_norm": 5.6390061378479, + "learning_rate": 5.786085339571229e-05, + "loss": 2.2054, + "step": 6207 + }, + { + "epoch": 0.467302734338245, + "grad_norm": 4.3372087478637695, + "learning_rate": 5.784881387607433e-05, + "loss": 1.7363, + "step": 6208 + }, + { + "epoch": 0.46737800861890516, + "grad_norm": 4.134713172912598, + "learning_rate": 5.783677388984689e-05, + "loss": 1.6687, + "step": 6209 + }, + { + "epoch": 0.4674532828995653, + "grad_norm": 5.727628707885742, + "learning_rate": 5.7824733437745725e-05, + "loss": 1.895, + "step": 6210 + }, + { + "epoch": 0.46752855718022546, + "grad_norm": 5.829822063446045, + "learning_rate": 5.781269252048662e-05, + "loss": 1.9313, + "step": 6211 + }, + { + "epoch": 0.4676038314608856, + "grad_norm": 4.493118762969971, + "learning_rate": 5.780065113878537e-05, + "loss": 2.1191, + "step": 6212 + }, + { + "epoch": 0.46767910574154575, + "grad_norm": 5.89603328704834, + "learning_rate": 5.778860929335777e-05, + "loss": 1.8107, + "step": 6213 + }, + { + "epoch": 0.46775438002220593, + "grad_norm": 4.582896709442139, + "learning_rate": 5.77765669849197e-05, + "loss": 2.0457, + "step": 6214 + }, + { + "epoch": 0.46782965430286605, + "grad_norm": 4.682589054107666, + "learning_rate": 5.7764524214187044e-05, + "loss": 2.1012, + "step": 6215 + }, + { + "epoch": 0.4679049285835262, + "grad_norm": 4.900998592376709, + "learning_rate": 5.775248098187571e-05, + "loss": 2.292, + "step": 6216 + }, + { + "epoch": 0.4679802028641864, + "grad_norm": 5.489734649658203, + "learning_rate": 5.774043728870162e-05, + "loss": 1.7491, + "step": 6217 + }, + { + "epoch": 0.4680554771448465, + "grad_norm": 5.939738750457764, + "learning_rate": 5.772839313538073e-05, + "loss": 1.9004, + "step": 6218 + }, + { + "epoch": 0.4681307514255067, + "grad_norm": 6.664865970611572, + "learning_rate": 5.771634852262906e-05, + "loss": 2.3472, + "step": 6219 + }, + { + "epoch": 0.4682060257061669, + "grad_norm": 4.616393566131592, + "learning_rate": 5.77043034511626e-05, + "loss": 2.1801, + "step": 6220 + }, + { + "epoch": 0.468281299986827, + "grad_norm": 4.235937595367432, + "learning_rate": 5.769225792169741e-05, + "loss": 2.2261, + "step": 6221 + }, + { + "epoch": 0.46835657426748717, + "grad_norm": 5.158927917480469, + "learning_rate": 5.768021193494957e-05, + "loss": 1.873, + "step": 6222 + }, + { + "epoch": 0.4684318485481473, + "grad_norm": 5.125085353851318, + "learning_rate": 5.766816549163514e-05, + "loss": 1.9652, + "step": 6223 + }, + { + "epoch": 0.46850712282880747, + "grad_norm": 4.54403829574585, + "learning_rate": 5.76561185924703e-05, + "loss": 1.8689, + "step": 6224 + }, + { + "epoch": 0.46858239710946764, + "grad_norm": 3.592175006866455, + "learning_rate": 5.764407123817116e-05, + "loss": 1.8026, + "step": 6225 + }, + { + "epoch": 0.46865767139012776, + "grad_norm": 4.588351249694824, + "learning_rate": 5.763202342945392e-05, + "loss": 1.8453, + "step": 6226 + }, + { + "epoch": 0.46873294567078794, + "grad_norm": 6.254735469818115, + "learning_rate": 5.761997516703479e-05, + "loss": 2.043, + "step": 6227 + }, + { + "epoch": 0.4688082199514481, + "grad_norm": 5.188050270080566, + "learning_rate": 5.760792645163001e-05, + "loss": 1.9984, + "step": 6228 + }, + { + "epoch": 0.46888349423210823, + "grad_norm": 4.085492134094238, + "learning_rate": 5.7595877283955814e-05, + "loss": 1.8735, + "step": 6229 + }, + { + "epoch": 0.4689587685127684, + "grad_norm": 4.205752372741699, + "learning_rate": 5.7583827664728516e-05, + "loss": 1.8845, + "step": 6230 + }, + { + "epoch": 0.46903404279342853, + "grad_norm": 4.196042060852051, + "learning_rate": 5.7571777594664434e-05, + "loss": 1.8089, + "step": 6231 + }, + { + "epoch": 0.4691093170740887, + "grad_norm": 4.628505229949951, + "learning_rate": 5.7559727074479896e-05, + "loss": 2.0956, + "step": 6232 + }, + { + "epoch": 0.4691845913547489, + "grad_norm": 4.327082633972168, + "learning_rate": 5.754767610489127e-05, + "loss": 2.1062, + "step": 6233 + }, + { + "epoch": 0.469259865635409, + "grad_norm": 4.564215183258057, + "learning_rate": 5.753562468661497e-05, + "loss": 1.8069, + "step": 6234 + }, + { + "epoch": 0.4693351399160692, + "grad_norm": 4.412684440612793, + "learning_rate": 5.752357282036738e-05, + "loss": 1.8424, + "step": 6235 + }, + { + "epoch": 0.46941041419672935, + "grad_norm": 4.835409164428711, + "learning_rate": 5.751152050686498e-05, + "loss": 1.6814, + "step": 6236 + }, + { + "epoch": 0.4694856884773895, + "grad_norm": 3.582608938217163, + "learning_rate": 5.749946774682425e-05, + "loss": 1.7111, + "step": 6237 + }, + { + "epoch": 0.46956096275804965, + "grad_norm": 4.621428966522217, + "learning_rate": 5.7487414540961684e-05, + "loss": 2.1499, + "step": 6238 + }, + { + "epoch": 0.4696362370387098, + "grad_norm": 4.225468158721924, + "learning_rate": 5.7475360889993793e-05, + "loss": 1.5202, + "step": 6239 + }, + { + "epoch": 0.46971151131936995, + "grad_norm": 4.9156413078308105, + "learning_rate": 5.746330679463714e-05, + "loss": 2.107, + "step": 6240 + }, + { + "epoch": 0.4697867856000301, + "grad_norm": 4.5124192237854, + "learning_rate": 5.745125225560833e-05, + "loss": 1.8903, + "step": 6241 + }, + { + "epoch": 0.46986205988069024, + "grad_norm": 4.60844087600708, + "learning_rate": 5.743919727362395e-05, + "loss": 1.9222, + "step": 6242 + }, + { + "epoch": 0.4699373341613504, + "grad_norm": 6.4271979331970215, + "learning_rate": 5.7427141849400625e-05, + "loss": 1.9923, + "step": 6243 + }, + { + "epoch": 0.4700126084420106, + "grad_norm": 3.941171407699585, + "learning_rate": 5.7415085983655036e-05, + "loss": 2.0671, + "step": 6244 + }, + { + "epoch": 0.4700878827226707, + "grad_norm": 5.526283264160156, + "learning_rate": 5.740302967710387e-05, + "loss": 1.8658, + "step": 6245 + }, + { + "epoch": 0.4701631570033309, + "grad_norm": 6.0859222412109375, + "learning_rate": 5.739097293046382e-05, + "loss": 2.2689, + "step": 6246 + }, + { + "epoch": 0.47023843128399107, + "grad_norm": 4.590087890625, + "learning_rate": 5.737891574445164e-05, + "loss": 1.9471, + "step": 6247 + }, + { + "epoch": 0.4703137055646512, + "grad_norm": 4.172879219055176, + "learning_rate": 5.736685811978409e-05, + "loss": 1.9712, + "step": 6248 + }, + { + "epoch": 0.47038897984531136, + "grad_norm": 4.682403087615967, + "learning_rate": 5.735480005717797e-05, + "loss": 1.8274, + "step": 6249 + }, + { + "epoch": 0.4704642541259715, + "grad_norm": 4.299895286560059, + "learning_rate": 5.7342741557350086e-05, + "loss": 1.5215, + "step": 6250 + }, + { + "epoch": 0.47053952840663166, + "grad_norm": 6.324371337890625, + "learning_rate": 5.733068262101728e-05, + "loss": 2.0383, + "step": 6251 + }, + { + "epoch": 0.47061480268729183, + "grad_norm": 5.781114101409912, + "learning_rate": 5.7318623248896444e-05, + "loss": 1.642, + "step": 6252 + }, + { + "epoch": 0.47069007696795195, + "grad_norm": 5.2984938621521, + "learning_rate": 5.730656344170444e-05, + "loss": 1.6114, + "step": 6253 + }, + { + "epoch": 0.47076535124861213, + "grad_norm": 4.689993858337402, + "learning_rate": 5.729450320015821e-05, + "loss": 1.8478, + "step": 6254 + }, + { + "epoch": 0.4708406255292723, + "grad_norm": 10.7761812210083, + "learning_rate": 5.728244252497469e-05, + "loss": 2.1104, + "step": 6255 + }, + { + "epoch": 0.4709158998099324, + "grad_norm": 4.138166427612305, + "learning_rate": 5.7270381416870844e-05, + "loss": 2.1262, + "step": 6256 + }, + { + "epoch": 0.4709911740905926, + "grad_norm": 3.983053207397461, + "learning_rate": 5.7258319876563706e-05, + "loss": 1.7846, + "step": 6257 + }, + { + "epoch": 0.4710664483712528, + "grad_norm": 9.24421501159668, + "learning_rate": 5.724625790477026e-05, + "loss": 1.6986, + "step": 6258 + }, + { + "epoch": 0.4711417226519129, + "grad_norm": 7.4755425453186035, + "learning_rate": 5.723419550220759e-05, + "loss": 1.7405, + "step": 6259 + }, + { + "epoch": 0.4712169969325731, + "grad_norm": 3.8950953483581543, + "learning_rate": 5.7222132669592755e-05, + "loss": 1.8926, + "step": 6260 + }, + { + "epoch": 0.4712922712132332, + "grad_norm": 4.3589301109313965, + "learning_rate": 5.7210069407642844e-05, + "loss": 1.8608, + "step": 6261 + }, + { + "epoch": 0.47136754549389337, + "grad_norm": 5.551499843597412, + "learning_rate": 5.719800571707501e-05, + "loss": 1.6912, + "step": 6262 + }, + { + "epoch": 0.47144281977455355, + "grad_norm": 6.671307563781738, + "learning_rate": 5.7185941598606366e-05, + "loss": 2.0654, + "step": 6263 + }, + { + "epoch": 0.47151809405521367, + "grad_norm": 4.551986217498779, + "learning_rate": 5.717387705295413e-05, + "loss": 1.7901, + "step": 6264 + }, + { + "epoch": 0.47159336833587384, + "grad_norm": 3.8453047275543213, + "learning_rate": 5.716181208083548e-05, + "loss": 1.8323, + "step": 6265 + }, + { + "epoch": 0.471668642616534, + "grad_norm": 3.31730580329895, + "learning_rate": 5.7149746682967665e-05, + "loss": 1.8142, + "step": 6266 + }, + { + "epoch": 0.47174391689719414, + "grad_norm": 3.5658116340637207, + "learning_rate": 5.713768086006791e-05, + "loss": 1.736, + "step": 6267 + }, + { + "epoch": 0.4718191911778543, + "grad_norm": 5.811598777770996, + "learning_rate": 5.71256146128535e-05, + "loss": 2.1401, + "step": 6268 + }, + { + "epoch": 0.4718944654585145, + "grad_norm": 3.8438498973846436, + "learning_rate": 5.711354794204177e-05, + "loss": 1.9435, + "step": 6269 + }, + { + "epoch": 0.4719697397391746, + "grad_norm": 3.2192771434783936, + "learning_rate": 5.7101480848350016e-05, + "loss": 1.8742, + "step": 6270 + }, + { + "epoch": 0.4720450140198348, + "grad_norm": 4.943243503570557, + "learning_rate": 5.70894133324956e-05, + "loss": 1.9867, + "step": 6271 + }, + { + "epoch": 0.4721202883004949, + "grad_norm": 4.282954692840576, + "learning_rate": 5.707734539519591e-05, + "loss": 1.8354, + "step": 6272 + }, + { + "epoch": 0.4721955625811551, + "grad_norm": 3.7576797008514404, + "learning_rate": 5.706527703716833e-05, + "loss": 1.5095, + "step": 6273 + }, + { + "epoch": 0.47227083686181526, + "grad_norm": 5.438213348388672, + "learning_rate": 5.7053208259130296e-05, + "loss": 1.9158, + "step": 6274 + }, + { + "epoch": 0.4723461111424754, + "grad_norm": 4.024550914764404, + "learning_rate": 5.7041139061799285e-05, + "loss": 2.0162, + "step": 6275 + }, + { + "epoch": 0.47242138542313555, + "grad_norm": 4.87020206451416, + "learning_rate": 5.702906944589277e-05, + "loss": 2.2639, + "step": 6276 + }, + { + "epoch": 0.47249665970379573, + "grad_norm": 6.648222923278809, + "learning_rate": 5.7016999412128235e-05, + "loss": 1.4645, + "step": 6277 + }, + { + "epoch": 0.47257193398445585, + "grad_norm": 6.833711624145508, + "learning_rate": 5.7004928961223224e-05, + "loss": 1.869, + "step": 6278 + }, + { + "epoch": 0.472647208265116, + "grad_norm": 4.809195518493652, + "learning_rate": 5.699285809389526e-05, + "loss": 2.0548, + "step": 6279 + }, + { + "epoch": 0.47272248254577615, + "grad_norm": 5.604750633239746, + "learning_rate": 5.6980786810861974e-05, + "loss": 1.9458, + "step": 6280 + }, + { + "epoch": 0.4727977568264363, + "grad_norm": 6.335312366485596, + "learning_rate": 5.696871511284094e-05, + "loss": 1.7803, + "step": 6281 + }, + { + "epoch": 0.4728730311070965, + "grad_norm": 3.6210274696350098, + "learning_rate": 5.695664300054978e-05, + "loss": 1.894, + "step": 6282 + }, + { + "epoch": 0.4729483053877566, + "grad_norm": 3.923715114593506, + "learning_rate": 5.694457047470616e-05, + "loss": 1.7115, + "step": 6283 + }, + { + "epoch": 0.4730235796684168, + "grad_norm": 3.4237923622131348, + "learning_rate": 5.6932497536027754e-05, + "loss": 1.7166, + "step": 6284 + }, + { + "epoch": 0.47309885394907697, + "grad_norm": 5.746707916259766, + "learning_rate": 5.6920424185232245e-05, + "loss": 2.5159, + "step": 6285 + }, + { + "epoch": 0.4731741282297371, + "grad_norm": 3.881298303604126, + "learning_rate": 5.6908350423037394e-05, + "loss": 2.151, + "step": 6286 + }, + { + "epoch": 0.47324940251039727, + "grad_norm": 4.77166748046875, + "learning_rate": 5.6896276250160915e-05, + "loss": 1.7471, + "step": 6287 + }, + { + "epoch": 0.47332467679105744, + "grad_norm": 4.470195770263672, + "learning_rate": 5.6884201667320626e-05, + "loss": 1.9611, + "step": 6288 + }, + { + "epoch": 0.47339995107171756, + "grad_norm": 5.8305487632751465, + "learning_rate": 5.687212667523428e-05, + "loss": 1.7984, + "step": 6289 + }, + { + "epoch": 0.47347522535237774, + "grad_norm": 3.7198565006256104, + "learning_rate": 5.686005127461972e-05, + "loss": 1.9287, + "step": 6290 + }, + { + "epoch": 0.47355049963303786, + "grad_norm": 5.26173734664917, + "learning_rate": 5.684797546619479e-05, + "loss": 1.8081, + "step": 6291 + }, + { + "epoch": 0.47362577391369803, + "grad_norm": 4.555299282073975, + "learning_rate": 5.683589925067738e-05, + "loss": 1.984, + "step": 6292 + }, + { + "epoch": 0.4737010481943582, + "grad_norm": 3.355376720428467, + "learning_rate": 5.6823822628785384e-05, + "loss": 1.9806, + "step": 6293 + }, + { + "epoch": 0.47377632247501833, + "grad_norm": 4.6258225440979, + "learning_rate": 5.68117456012367e-05, + "loss": 1.9366, + "step": 6294 + }, + { + "epoch": 0.4738515967556785, + "grad_norm": 5.893681049346924, + "learning_rate": 5.679966816874929e-05, + "loss": 1.8333, + "step": 6295 + }, + { + "epoch": 0.4739268710363387, + "grad_norm": 4.524265289306641, + "learning_rate": 5.678759033204111e-05, + "loss": 1.9951, + "step": 6296 + }, + { + "epoch": 0.4740021453169988, + "grad_norm": 4.757098197937012, + "learning_rate": 5.677551209183016e-05, + "loss": 1.9888, + "step": 6297 + }, + { + "epoch": 0.474077419597659, + "grad_norm": 3.6930558681488037, + "learning_rate": 5.676343344883447e-05, + "loss": 2.1562, + "step": 6298 + }, + { + "epoch": 0.4741526938783191, + "grad_norm": 4.2077507972717285, + "learning_rate": 5.675135440377206e-05, + "loss": 1.8127, + "step": 6299 + }, + { + "epoch": 0.4742279681589793, + "grad_norm": 4.4843573570251465, + "learning_rate": 5.673927495736101e-05, + "loss": 1.9962, + "step": 6300 + }, + { + "epoch": 0.47430324243963945, + "grad_norm": 6.508464813232422, + "learning_rate": 5.672719511031939e-05, + "loss": 2.1499, + "step": 6301 + }, + { + "epoch": 0.47437851672029957, + "grad_norm": 5.006059169769287, + "learning_rate": 5.6715114863365315e-05, + "loss": 1.9331, + "step": 6302 + }, + { + "epoch": 0.47445379100095975, + "grad_norm": 4.76358699798584, + "learning_rate": 5.6703034217216946e-05, + "loss": 1.772, + "step": 6303 + }, + { + "epoch": 0.4745290652816199, + "grad_norm": 4.853478908538818, + "learning_rate": 5.669095317259243e-05, + "loss": 1.7636, + "step": 6304 + }, + { + "epoch": 0.47460433956228004, + "grad_norm": 5.0832390785217285, + "learning_rate": 5.667887173020994e-05, + "loss": 1.894, + "step": 6305 + }, + { + "epoch": 0.4746796138429402, + "grad_norm": 4.127617359161377, + "learning_rate": 5.6666789890787675e-05, + "loss": 1.9766, + "step": 6306 + }, + { + "epoch": 0.4747548881236004, + "grad_norm": 4.2291131019592285, + "learning_rate": 5.6654707655043894e-05, + "loss": 2.1901, + "step": 6307 + }, + { + "epoch": 0.4748301624042605, + "grad_norm": 4.272363662719727, + "learning_rate": 5.6642625023696825e-05, + "loss": 1.8097, + "step": 6308 + }, + { + "epoch": 0.4749054366849207, + "grad_norm": 4.337517738342285, + "learning_rate": 5.663054199746477e-05, + "loss": 1.9706, + "step": 6309 + }, + { + "epoch": 0.4749807109655808, + "grad_norm": 3.9452738761901855, + "learning_rate": 5.661845857706602e-05, + "loss": 2.1088, + "step": 6310 + }, + { + "epoch": 0.475055985246241, + "grad_norm": 4.407073974609375, + "learning_rate": 5.6606374763218905e-05, + "loss": 2.1809, + "step": 6311 + }, + { + "epoch": 0.47513125952690116, + "grad_norm": 4.012044429779053, + "learning_rate": 5.659429055664176e-05, + "loss": 2.234, + "step": 6312 + }, + { + "epoch": 0.4752065338075613, + "grad_norm": 3.9452366828918457, + "learning_rate": 5.658220595805295e-05, + "loss": 1.9897, + "step": 6313 + }, + { + "epoch": 0.47528180808822146, + "grad_norm": 4.416503429412842, + "learning_rate": 5.6570120968170894e-05, + "loss": 2.2219, + "step": 6314 + }, + { + "epoch": 0.47535708236888163, + "grad_norm": 4.730295658111572, + "learning_rate": 5.6558035587714e-05, + "loss": 1.7912, + "step": 6315 + }, + { + "epoch": 0.47543235664954175, + "grad_norm": 4.5806193351745605, + "learning_rate": 5.65459498174007e-05, + "loss": 1.8763, + "step": 6316 + }, + { + "epoch": 0.47550763093020193, + "grad_norm": 4.9890570640563965, + "learning_rate": 5.6533863657949474e-05, + "loss": 1.6085, + "step": 6317 + }, + { + "epoch": 0.4755829052108621, + "grad_norm": 3.957623243331909, + "learning_rate": 5.652177711007878e-05, + "loss": 1.7229, + "step": 6318 + }, + { + "epoch": 0.4756581794915222, + "grad_norm": 4.691608905792236, + "learning_rate": 5.650969017450717e-05, + "loss": 1.7539, + "step": 6319 + }, + { + "epoch": 0.4757334537721824, + "grad_norm": 4.535962104797363, + "learning_rate": 5.6497602851953156e-05, + "loss": 1.7558, + "step": 6320 + }, + { + "epoch": 0.4758087280528425, + "grad_norm": 4.202637195587158, + "learning_rate": 5.6485515143135294e-05, + "loss": 1.5503, + "step": 6321 + }, + { + "epoch": 0.4758840023335027, + "grad_norm": 5.434566020965576, + "learning_rate": 5.6473427048772165e-05, + "loss": 1.9693, + "step": 6322 + }, + { + "epoch": 0.4759592766141629, + "grad_norm": 3.815725326538086, + "learning_rate": 5.646133856958237e-05, + "loss": 2.7614, + "step": 6323 + }, + { + "epoch": 0.476034550894823, + "grad_norm": 3.9051058292388916, + "learning_rate": 5.6449249706284534e-05, + "loss": 2.1292, + "step": 6324 + }, + { + "epoch": 0.47610982517548317, + "grad_norm": 4.619932651519775, + "learning_rate": 5.643716045959732e-05, + "loss": 1.9181, + "step": 6325 + }, + { + "epoch": 0.47618509945614335, + "grad_norm": 5.109991073608398, + "learning_rate": 5.642507083023938e-05, + "loss": 1.6782, + "step": 6326 + }, + { + "epoch": 0.47626037373680347, + "grad_norm": 5.29757833480835, + "learning_rate": 5.6412980818929406e-05, + "loss": 2.1398, + "step": 6327 + }, + { + "epoch": 0.47633564801746364, + "grad_norm": 4.539900302886963, + "learning_rate": 5.640089042638614e-05, + "loss": 1.9293, + "step": 6328 + }, + { + "epoch": 0.47641092229812376, + "grad_norm": 5.332345008850098, + "learning_rate": 5.6388799653328284e-05, + "loss": 1.8492, + "step": 6329 + }, + { + "epoch": 0.47648619657878394, + "grad_norm": 6.914534091949463, + "learning_rate": 5.637670850047464e-05, + "loss": 1.849, + "step": 6330 + }, + { + "epoch": 0.4765614708594441, + "grad_norm": 4.831961154937744, + "learning_rate": 5.6364616968543973e-05, + "loss": 1.7322, + "step": 6331 + }, + { + "epoch": 0.47663674514010423, + "grad_norm": 5.263299942016602, + "learning_rate": 5.635252505825508e-05, + "loss": 2.0602, + "step": 6332 + }, + { + "epoch": 0.4767120194207644, + "grad_norm": 8.768918991088867, + "learning_rate": 5.63404327703268e-05, + "loss": 1.8796, + "step": 6333 + }, + { + "epoch": 0.4767872937014246, + "grad_norm": 5.573263168334961, + "learning_rate": 5.632834010547798e-05, + "loss": 1.8019, + "step": 6334 + }, + { + "epoch": 0.4768625679820847, + "grad_norm": 4.626332759857178, + "learning_rate": 5.6316247064427504e-05, + "loss": 2.0785, + "step": 6335 + }, + { + "epoch": 0.4769378422627449, + "grad_norm": 13.972732543945312, + "learning_rate": 5.630415364789426e-05, + "loss": 1.8535, + "step": 6336 + }, + { + "epoch": 0.47701311654340506, + "grad_norm": 4.4583563804626465, + "learning_rate": 5.629205985659718e-05, + "loss": 1.6426, + "step": 6337 + }, + { + "epoch": 0.4770883908240652, + "grad_norm": 4.307314872741699, + "learning_rate": 5.62799656912552e-05, + "loss": 1.6567, + "step": 6338 + }, + { + "epoch": 0.47716366510472535, + "grad_norm": 4.567723751068115, + "learning_rate": 5.626787115258726e-05, + "loss": 1.9421, + "step": 6339 + }, + { + "epoch": 0.4772389393853855, + "grad_norm": 4.745924949645996, + "learning_rate": 5.6255776241312374e-05, + "loss": 2.2892, + "step": 6340 + }, + { + "epoch": 0.47731421366604565, + "grad_norm": 4.9133100509643555, + "learning_rate": 5.6243680958149525e-05, + "loss": 1.8242, + "step": 6341 + }, + { + "epoch": 0.4773894879467058, + "grad_norm": 5.110033988952637, + "learning_rate": 5.623158530381778e-05, + "loss": 1.6635, + "step": 6342 + }, + { + "epoch": 0.47746476222736595, + "grad_norm": 4.4033613204956055, + "learning_rate": 5.621948927903616e-05, + "loss": 1.9696, + "step": 6343 + }, + { + "epoch": 0.4775400365080261, + "grad_norm": 5.070498943328857, + "learning_rate": 5.6207392884523755e-05, + "loss": 2.2543, + "step": 6344 + }, + { + "epoch": 0.4776153107886863, + "grad_norm": 7.706250190734863, + "learning_rate": 5.619529612099965e-05, + "loss": 1.7195, + "step": 6345 + }, + { + "epoch": 0.4776905850693464, + "grad_norm": 3.8137874603271484, + "learning_rate": 5.618319898918296e-05, + "loss": 1.8165, + "step": 6346 + }, + { + "epoch": 0.4777658593500066, + "grad_norm": 5.7412800788879395, + "learning_rate": 5.6171101489792835e-05, + "loss": 2.1338, + "step": 6347 + }, + { + "epoch": 0.4778411336306667, + "grad_norm": 5.855752944946289, + "learning_rate": 5.6159003623548444e-05, + "loss": 1.9841, + "step": 6348 + }, + { + "epoch": 0.4779164079113269, + "grad_norm": 4.267614841461182, + "learning_rate": 5.6146905391168946e-05, + "loss": 1.9286, + "step": 6349 + }, + { + "epoch": 0.47799168219198707, + "grad_norm": 5.545926094055176, + "learning_rate": 5.613480679337357e-05, + "loss": 2.0125, + "step": 6350 + }, + { + "epoch": 0.4780669564726472, + "grad_norm": 4.504691123962402, + "learning_rate": 5.6122707830881516e-05, + "loss": 1.7489, + "step": 6351 + }, + { + "epoch": 0.47814223075330736, + "grad_norm": 4.732955455780029, + "learning_rate": 5.6110608504412064e-05, + "loss": 1.8037, + "step": 6352 + }, + { + "epoch": 0.47821750503396754, + "grad_norm": 5.685470104217529, + "learning_rate": 5.609850881468447e-05, + "loss": 1.6545, + "step": 6353 + }, + { + "epoch": 0.47829277931462766, + "grad_norm": 5.293968200683594, + "learning_rate": 5.608640876241803e-05, + "loss": 1.8031, + "step": 6354 + }, + { + "epoch": 0.47836805359528783, + "grad_norm": 3.5953304767608643, + "learning_rate": 5.607430834833204e-05, + "loss": 1.7303, + "step": 6355 + }, + { + "epoch": 0.478443327875948, + "grad_norm": 5.320721626281738, + "learning_rate": 5.606220757314585e-05, + "loss": 1.9182, + "step": 6356 + }, + { + "epoch": 0.47851860215660813, + "grad_norm": 4.980002403259277, + "learning_rate": 5.605010643757881e-05, + "loss": 1.9659, + "step": 6357 + }, + { + "epoch": 0.4785938764372683, + "grad_norm": 4.784584045410156, + "learning_rate": 5.603800494235032e-05, + "loss": 1.6457, + "step": 6358 + }, + { + "epoch": 0.4786691507179284, + "grad_norm": 3.8784241676330566, + "learning_rate": 5.6025903088179745e-05, + "loss": 1.5963, + "step": 6359 + }, + { + "epoch": 0.4787444249985886, + "grad_norm": 4.006401538848877, + "learning_rate": 5.601380087578654e-05, + "loss": 1.791, + "step": 6360 + }, + { + "epoch": 0.4788196992792488, + "grad_norm": 5.516632080078125, + "learning_rate": 5.600169830589012e-05, + "loss": 1.7746, + "step": 6361 + }, + { + "epoch": 0.4788949735599089, + "grad_norm": 3.885056495666504, + "learning_rate": 5.5989595379209945e-05, + "loss": 1.9317, + "step": 6362 + }, + { + "epoch": 0.4789702478405691, + "grad_norm": 3.2825815677642822, + "learning_rate": 5.597749209646551e-05, + "loss": 1.8626, + "step": 6363 + }, + { + "epoch": 0.47904552212122925, + "grad_norm": 4.296835422515869, + "learning_rate": 5.5965388458376337e-05, + "loss": 1.6603, + "step": 6364 + }, + { + "epoch": 0.47912079640188937, + "grad_norm": 6.031205654144287, + "learning_rate": 5.595328446566193e-05, + "loss": 2.3506, + "step": 6365 + }, + { + "epoch": 0.47919607068254955, + "grad_norm": 3.74759578704834, + "learning_rate": 5.5941180119041845e-05, + "loss": 1.8234, + "step": 6366 + }, + { + "epoch": 0.4792713449632097, + "grad_norm": 3.4017677307128906, + "learning_rate": 5.592907541923564e-05, + "loss": 1.8133, + "step": 6367 + }, + { + "epoch": 0.47934661924386984, + "grad_norm": 3.7446372509002686, + "learning_rate": 5.5916970366962914e-05, + "loss": 1.8927, + "step": 6368 + }, + { + "epoch": 0.47942189352453, + "grad_norm": 4.254096508026123, + "learning_rate": 5.5904864962943296e-05, + "loss": 1.8294, + "step": 6369 + }, + { + "epoch": 0.47949716780519014, + "grad_norm": 3.59273099899292, + "learning_rate": 5.5892759207896386e-05, + "loss": 1.8149, + "step": 6370 + }, + { + "epoch": 0.4795724420858503, + "grad_norm": 4.840024471282959, + "learning_rate": 5.588065310254185e-05, + "loss": 1.8987, + "step": 6371 + }, + { + "epoch": 0.4796477163665105, + "grad_norm": 6.468723297119141, + "learning_rate": 5.586854664759935e-05, + "loss": 1.8035, + "step": 6372 + }, + { + "epoch": 0.4797229906471706, + "grad_norm": 6.230376720428467, + "learning_rate": 5.585643984378861e-05, + "loss": 1.7975, + "step": 6373 + }, + { + "epoch": 0.4797982649278308, + "grad_norm": 4.697869300842285, + "learning_rate": 5.58443326918293e-05, + "loss": 2.0766, + "step": 6374 + }, + { + "epoch": 0.47987353920849096, + "grad_norm": 4.413533687591553, + "learning_rate": 5.58322251924412e-05, + "loss": 1.5893, + "step": 6375 + }, + { + "epoch": 0.4799488134891511, + "grad_norm": 4.307180404663086, + "learning_rate": 5.5820117346344045e-05, + "loss": 2.1065, + "step": 6376 + }, + { + "epoch": 0.48002408776981126, + "grad_norm": 4.2669196128845215, + "learning_rate": 5.5808009154257625e-05, + "loss": 1.6462, + "step": 6377 + }, + { + "epoch": 0.4800993620504714, + "grad_norm": 4.337838172912598, + "learning_rate": 5.579590061690171e-05, + "loss": 1.7609, + "step": 6378 + }, + { + "epoch": 0.48017463633113155, + "grad_norm": 4.283106803894043, + "learning_rate": 5.5783791734996124e-05, + "loss": 1.79, + "step": 6379 + }, + { + "epoch": 0.48024991061179173, + "grad_norm": 4.739707946777344, + "learning_rate": 5.5771682509260735e-05, + "loss": 2.1653, + "step": 6380 + }, + { + "epoch": 0.48032518489245185, + "grad_norm": 5.706414699554443, + "learning_rate": 5.5759572940415386e-05, + "loss": 1.5917, + "step": 6381 + }, + { + "epoch": 0.480400459173112, + "grad_norm": 6.01108980178833, + "learning_rate": 5.574746302917994e-05, + "loss": 1.9168, + "step": 6382 + }, + { + "epoch": 0.4804757334537722, + "grad_norm": 5.496085166931152, + "learning_rate": 5.5735352776274306e-05, + "loss": 2.017, + "step": 6383 + }, + { + "epoch": 0.4805510077344323, + "grad_norm": 4.263923168182373, + "learning_rate": 5.57232421824184e-05, + "loss": 1.9053, + "step": 6384 + }, + { + "epoch": 0.4806262820150925, + "grad_norm": 3.3274738788604736, + "learning_rate": 5.5711131248332185e-05, + "loss": 1.925, + "step": 6385 + }, + { + "epoch": 0.4807015562957527, + "grad_norm": 4.82871675491333, + "learning_rate": 5.56990199747356e-05, + "loss": 1.8865, + "step": 6386 + }, + { + "epoch": 0.4807768305764128, + "grad_norm": 5.63754415512085, + "learning_rate": 5.568690836234861e-05, + "loss": 1.773, + "step": 6387 + }, + { + "epoch": 0.48085210485707297, + "grad_norm": 4.828920364379883, + "learning_rate": 5.567479641189126e-05, + "loss": 1.6184, + "step": 6388 + }, + { + "epoch": 0.4809273791377331, + "grad_norm": 4.03684663772583, + "learning_rate": 5.5662684124083545e-05, + "loss": 1.7707, + "step": 6389 + }, + { + "epoch": 0.48100265341839327, + "grad_norm": 7.016988754272461, + "learning_rate": 5.56505714996455e-05, + "loss": 2.0252, + "step": 6390 + }, + { + "epoch": 0.48107792769905344, + "grad_norm": 5.318288326263428, + "learning_rate": 5.563845853929718e-05, + "loss": 2.0716, + "step": 6391 + }, + { + "epoch": 0.48115320197971356, + "grad_norm": 3.760000705718994, + "learning_rate": 5.562634524375869e-05, + "loss": 1.7625, + "step": 6392 + }, + { + "epoch": 0.48122847626037374, + "grad_norm": 5.033058166503906, + "learning_rate": 5.5614231613750124e-05, + "loss": 1.6887, + "step": 6393 + }, + { + "epoch": 0.4813037505410339, + "grad_norm": 3.9787211418151855, + "learning_rate": 5.560211764999159e-05, + "loss": 1.8863, + "step": 6394 + }, + { + "epoch": 0.48137902482169403, + "grad_norm": 4.904836177825928, + "learning_rate": 5.559000335320325e-05, + "loss": 1.9658, + "step": 6395 + }, + { + "epoch": 0.4814542991023542, + "grad_norm": 5.078438758850098, + "learning_rate": 5.557788872410523e-05, + "loss": 1.9511, + "step": 6396 + }, + { + "epoch": 0.48152957338301433, + "grad_norm": 7.313134670257568, + "learning_rate": 5.556577376341774e-05, + "loss": 2.0546, + "step": 6397 + }, + { + "epoch": 0.4816048476636745, + "grad_norm": 5.61083984375, + "learning_rate": 5.555365847186097e-05, + "loss": 1.9182, + "step": 6398 + }, + { + "epoch": 0.4816801219443347, + "grad_norm": 6.040586948394775, + "learning_rate": 5.5541542850155136e-05, + "loss": 2.0561, + "step": 6399 + }, + { + "epoch": 0.4817553962249948, + "grad_norm": 4.592431545257568, + "learning_rate": 5.5529426899020484e-05, + "loss": 2.0644, + "step": 6400 + }, + { + "epoch": 0.481830670505655, + "grad_norm": 4.533272743225098, + "learning_rate": 5.551731061917727e-05, + "loss": 1.8183, + "step": 6401 + }, + { + "epoch": 0.48190594478631515, + "grad_norm": 4.679579257965088, + "learning_rate": 5.5505194011345764e-05, + "loss": 2.0861, + "step": 6402 + }, + { + "epoch": 0.4819812190669753, + "grad_norm": 4.739593505859375, + "learning_rate": 5.549307707624627e-05, + "loss": 1.6189, + "step": 6403 + }, + { + "epoch": 0.48205649334763545, + "grad_norm": 3.4774935245513916, + "learning_rate": 5.548095981459911e-05, + "loss": 1.9006, + "step": 6404 + }, + { + "epoch": 0.4821317676282956, + "grad_norm": 3.710263729095459, + "learning_rate": 5.5468842227124615e-05, + "loss": 1.6716, + "step": 6405 + }, + { + "epoch": 0.48220704190895575, + "grad_norm": 5.572620868682861, + "learning_rate": 5.545672431454314e-05, + "loss": 1.686, + "step": 6406 + }, + { + "epoch": 0.4822823161896159, + "grad_norm": 5.486653804779053, + "learning_rate": 5.5444606077575056e-05, + "loss": 2.2022, + "step": 6407 + }, + { + "epoch": 0.48235759047027604, + "grad_norm": 3.2374887466430664, + "learning_rate": 5.5432487516940765e-05, + "loss": 1.7563, + "step": 6408 + }, + { + "epoch": 0.4824328647509362, + "grad_norm": 5.346471786499023, + "learning_rate": 5.5420368633360695e-05, + "loss": 1.8299, + "step": 6409 + }, + { + "epoch": 0.4825081390315964, + "grad_norm": 4.188678741455078, + "learning_rate": 5.5408249427555245e-05, + "loss": 1.9282, + "step": 6410 + }, + { + "epoch": 0.4825834133122565, + "grad_norm": 4.597207546234131, + "learning_rate": 5.5396129900244894e-05, + "loss": 1.9242, + "step": 6411 + }, + { + "epoch": 0.4826586875929167, + "grad_norm": 4.855897426605225, + "learning_rate": 5.53840100521501e-05, + "loss": 1.906, + "step": 6412 + }, + { + "epoch": 0.48273396187357687, + "grad_norm": 5.525973796844482, + "learning_rate": 5.5371889883991354e-05, + "loss": 2.0479, + "step": 6413 + }, + { + "epoch": 0.482809236154237, + "grad_norm": 7.5665283203125, + "learning_rate": 5.535976939648917e-05, + "loss": 1.9169, + "step": 6414 + }, + { + "epoch": 0.48288451043489716, + "grad_norm": 3.9134891033172607, + "learning_rate": 5.5347648590364085e-05, + "loss": 1.938, + "step": 6415 + }, + { + "epoch": 0.48295978471555734, + "grad_norm": 4.094476699829102, + "learning_rate": 5.5335527466336625e-05, + "loss": 1.6765, + "step": 6416 + }, + { + "epoch": 0.48303505899621746, + "grad_norm": 4.275125503540039, + "learning_rate": 5.5323406025127364e-05, + "loss": 1.945, + "step": 6417 + }, + { + "epoch": 0.48311033327687763, + "grad_norm": 4.317477703094482, + "learning_rate": 5.5311284267456884e-05, + "loss": 1.8148, + "step": 6418 + }, + { + "epoch": 0.48318560755753776, + "grad_norm": 4.612224578857422, + "learning_rate": 5.5299162194045806e-05, + "loss": 2.2421, + "step": 6419 + }, + { + "epoch": 0.48326088183819793, + "grad_norm": 5.999127388000488, + "learning_rate": 5.528703980561474e-05, + "loss": 1.9438, + "step": 6420 + }, + { + "epoch": 0.4833361561188581, + "grad_norm": 4.804381847381592, + "learning_rate": 5.527491710288433e-05, + "loss": 2.0369, + "step": 6421 + }, + { + "epoch": 0.4834114303995182, + "grad_norm": 16.27371597290039, + "learning_rate": 5.526279408657522e-05, + "loss": 2.0063, + "step": 6422 + }, + { + "epoch": 0.4834867046801784, + "grad_norm": 4.779478073120117, + "learning_rate": 5.525067075740812e-05, + "loss": 1.4375, + "step": 6423 + }, + { + "epoch": 0.4835619789608386, + "grad_norm": 6.568668842315674, + "learning_rate": 5.523854711610369e-05, + "loss": 2.0574, + "step": 6424 + }, + { + "epoch": 0.4836372532414987, + "grad_norm": 5.429203510284424, + "learning_rate": 5.522642316338268e-05, + "loss": 1.8831, + "step": 6425 + }, + { + "epoch": 0.4837125275221589, + "grad_norm": 5.603474140167236, + "learning_rate": 5.52142988999658e-05, + "loss": 1.6548, + "step": 6426 + }, + { + "epoch": 0.483787801802819, + "grad_norm": 4.25176477432251, + "learning_rate": 5.520217432657382e-05, + "loss": 2.0371, + "step": 6427 + }, + { + "epoch": 0.48386307608347917, + "grad_norm": 5.165562152862549, + "learning_rate": 5.519004944392751e-05, + "loss": 1.7478, + "step": 6428 + }, + { + "epoch": 0.48393835036413935, + "grad_norm": 4.938843727111816, + "learning_rate": 5.517792425274762e-05, + "loss": 1.985, + "step": 6429 + }, + { + "epoch": 0.48401362464479947, + "grad_norm": 4.424111366271973, + "learning_rate": 5.516579875375502e-05, + "loss": 2.0982, + "step": 6430 + }, + { + "epoch": 0.48408889892545964, + "grad_norm": 4.951056957244873, + "learning_rate": 5.51536729476705e-05, + "loss": 1.7929, + "step": 6431 + }, + { + "epoch": 0.4841641732061198, + "grad_norm": 5.1057281494140625, + "learning_rate": 5.5141546835214906e-05, + "loss": 2.2513, + "step": 6432 + }, + { + "epoch": 0.48423944748677994, + "grad_norm": 3.885099411010742, + "learning_rate": 5.5129420417109126e-05, + "loss": 1.6526, + "step": 6433 + }, + { + "epoch": 0.4843147217674401, + "grad_norm": 4.9807257652282715, + "learning_rate": 5.511729369407399e-05, + "loss": 2.0737, + "step": 6434 + }, + { + "epoch": 0.4843899960481003, + "grad_norm": 4.173788547515869, + "learning_rate": 5.5105166666830444e-05, + "loss": 2.1203, + "step": 6435 + }, + { + "epoch": 0.4844652703287604, + "grad_norm": 4.617697715759277, + "learning_rate": 5.509303933609937e-05, + "loss": 1.8019, + "step": 6436 + }, + { + "epoch": 0.4845405446094206, + "grad_norm": 4.278045177459717, + "learning_rate": 5.5080911702601734e-05, + "loss": 2.2662, + "step": 6437 + }, + { + "epoch": 0.4846158188900807, + "grad_norm": 4.496089935302734, + "learning_rate": 5.5068783767058486e-05, + "loss": 1.7241, + "step": 6438 + }, + { + "epoch": 0.4846910931707409, + "grad_norm": 5.214039325714111, + "learning_rate": 5.5056655530190584e-05, + "loss": 1.8078, + "step": 6439 + }, + { + "epoch": 0.48476636745140106, + "grad_norm": 4.256460666656494, + "learning_rate": 5.504452699271901e-05, + "loss": 2.028, + "step": 6440 + }, + { + "epoch": 0.4848416417320612, + "grad_norm": 4.191312789916992, + "learning_rate": 5.5032398155364775e-05, + "loss": 1.737, + "step": 6441 + }, + { + "epoch": 0.48491691601272136, + "grad_norm": 3.783006191253662, + "learning_rate": 5.5020269018848923e-05, + "loss": 2.128, + "step": 6442 + }, + { + "epoch": 0.48499219029338153, + "grad_norm": 3.8177261352539062, + "learning_rate": 5.5008139583892485e-05, + "loss": 2.0486, + "step": 6443 + }, + { + "epoch": 0.48506746457404165, + "grad_norm": 6.276378631591797, + "learning_rate": 5.4996009851216515e-05, + "loss": 1.689, + "step": 6444 + }, + { + "epoch": 0.4851427388547018, + "grad_norm": 5.2423248291015625, + "learning_rate": 5.49838798215421e-05, + "loss": 1.7339, + "step": 6445 + }, + { + "epoch": 0.485218013135362, + "grad_norm": 4.056105136871338, + "learning_rate": 5.49717494955903e-05, + "loss": 2.0997, + "step": 6446 + }, + { + "epoch": 0.4852932874160221, + "grad_norm": 3.3604941368103027, + "learning_rate": 5.4959618874082284e-05, + "loss": 1.6656, + "step": 6447 + }, + { + "epoch": 0.4853685616966823, + "grad_norm": 3.8618831634521484, + "learning_rate": 5.494748795773915e-05, + "loss": 1.9945, + "step": 6448 + }, + { + "epoch": 0.4854438359773424, + "grad_norm": 4.705929279327393, + "learning_rate": 5.493535674728206e-05, + "loss": 1.7755, + "step": 6449 + }, + { + "epoch": 0.4855191102580026, + "grad_norm": 3.9789557456970215, + "learning_rate": 5.492322524343218e-05, + "loss": 1.8262, + "step": 6450 + }, + { + "epoch": 0.48559438453866277, + "grad_norm": 4.300180435180664, + "learning_rate": 5.4911093446910654e-05, + "loss": 1.9044, + "step": 6451 + }, + { + "epoch": 0.4856696588193229, + "grad_norm": 5.275288105010986, + "learning_rate": 5.4898961358438726e-05, + "loss": 1.8408, + "step": 6452 + }, + { + "epoch": 0.48574493309998307, + "grad_norm": 3.6018564701080322, + "learning_rate": 5.488682897873761e-05, + "loss": 1.8243, + "step": 6453 + }, + { + "epoch": 0.48582020738064324, + "grad_norm": 4.977598667144775, + "learning_rate": 5.487469630852854e-05, + "loss": 1.8119, + "step": 6454 + }, + { + "epoch": 0.48589548166130336, + "grad_norm": 5.883234977722168, + "learning_rate": 5.486256334853276e-05, + "loss": 1.9167, + "step": 6455 + }, + { + "epoch": 0.48597075594196354, + "grad_norm": 6.700308322906494, + "learning_rate": 5.4850430099471526e-05, + "loss": 1.9208, + "step": 6456 + }, + { + "epoch": 0.48604603022262366, + "grad_norm": 6.252125263214111, + "learning_rate": 5.483829656206615e-05, + "loss": 2.1003, + "step": 6457 + }, + { + "epoch": 0.48612130450328384, + "grad_norm": 7.259039878845215, + "learning_rate": 5.482616273703792e-05, + "loss": 2.0475, + "step": 6458 + }, + { + "epoch": 0.486196578783944, + "grad_norm": 4.768459320068359, + "learning_rate": 5.481402862510817e-05, + "loss": 2.1664, + "step": 6459 + }, + { + "epoch": 0.48627185306460413, + "grad_norm": 4.7697529792785645, + "learning_rate": 5.480189422699824e-05, + "loss": 1.9889, + "step": 6460 + }, + { + "epoch": 0.4863471273452643, + "grad_norm": 4.36065149307251, + "learning_rate": 5.478975954342947e-05, + "loss": 1.9103, + "step": 6461 + }, + { + "epoch": 0.4864224016259245, + "grad_norm": 4.5343217849731445, + "learning_rate": 5.477762457512323e-05, + "loss": 2.1913, + "step": 6462 + }, + { + "epoch": 0.4864976759065846, + "grad_norm": 4.51072359085083, + "learning_rate": 5.476548932280092e-05, + "loss": 1.9727, + "step": 6463 + }, + { + "epoch": 0.4865729501872448, + "grad_norm": 4.391369819641113, + "learning_rate": 5.4753353787183945e-05, + "loss": 1.9416, + "step": 6464 + }, + { + "epoch": 0.48664822446790496, + "grad_norm": 3.696805000305176, + "learning_rate": 5.474121796899373e-05, + "loss": 1.8544, + "step": 6465 + }, + { + "epoch": 0.4867234987485651, + "grad_norm": 5.127026557922363, + "learning_rate": 5.4729081868951706e-05, + "loss": 1.6701, + "step": 6466 + }, + { + "epoch": 0.48679877302922525, + "grad_norm": 3.75046443939209, + "learning_rate": 5.471694548777934e-05, + "loss": 2.1559, + "step": 6467 + }, + { + "epoch": 0.48687404730988537, + "grad_norm": 6.920037746429443, + "learning_rate": 5.4704808826198085e-05, + "loss": 1.8228, + "step": 6468 + }, + { + "epoch": 0.48694932159054555, + "grad_norm": 3.4855964183807373, + "learning_rate": 5.4692671884929445e-05, + "loss": 1.7654, + "step": 6469 + }, + { + "epoch": 0.4870245958712057, + "grad_norm": 4.799417495727539, + "learning_rate": 5.4680534664694935e-05, + "loss": 1.6938, + "step": 6470 + }, + { + "epoch": 0.48709987015186584, + "grad_norm": 4.094606399536133, + "learning_rate": 5.466839716621608e-05, + "loss": 1.899, + "step": 6471 + }, + { + "epoch": 0.487175144432526, + "grad_norm": 4.776040554046631, + "learning_rate": 5.4656259390214406e-05, + "loss": 1.708, + "step": 6472 + }, + { + "epoch": 0.4872504187131862, + "grad_norm": 3.5086989402770996, + "learning_rate": 5.464412133741146e-05, + "loss": 2.1431, + "step": 6473 + }, + { + "epoch": 0.4873256929938463, + "grad_norm": 5.574459075927734, + "learning_rate": 5.463198300852882e-05, + "loss": 2.0567, + "step": 6474 + }, + { + "epoch": 0.4874009672745065, + "grad_norm": 5.2659759521484375, + "learning_rate": 5.46198444042881e-05, + "loss": 1.8074, + "step": 6475 + }, + { + "epoch": 0.4874762415551666, + "grad_norm": 4.93739652633667, + "learning_rate": 5.460770552541088e-05, + "loss": 2.391, + "step": 6476 + }, + { + "epoch": 0.4875515158358268, + "grad_norm": 4.549590587615967, + "learning_rate": 5.4595566372618775e-05, + "loss": 1.9116, + "step": 6477 + }, + { + "epoch": 0.48762679011648696, + "grad_norm": 5.146921634674072, + "learning_rate": 5.458342694663345e-05, + "loss": 2.0942, + "step": 6478 + }, + { + "epoch": 0.4877020643971471, + "grad_norm": 6.532907009124756, + "learning_rate": 5.457128724817654e-05, + "loss": 2.1415, + "step": 6479 + }, + { + "epoch": 0.48777733867780726, + "grad_norm": 4.148580074310303, + "learning_rate": 5.455914727796971e-05, + "loss": 1.9203, + "step": 6480 + }, + { + "epoch": 0.48785261295846744, + "grad_norm": 4.636167049407959, + "learning_rate": 5.454700703673468e-05, + "loss": 1.7817, + "step": 6481 + }, + { + "epoch": 0.48792788723912756, + "grad_norm": 4.693921089172363, + "learning_rate": 5.453486652519311e-05, + "loss": 1.9467, + "step": 6482 + }, + { + "epoch": 0.48800316151978773, + "grad_norm": 5.099684715270996, + "learning_rate": 5.4522725744066735e-05, + "loss": 1.7751, + "step": 6483 + }, + { + "epoch": 0.4880784358004479, + "grad_norm": 4.242069721221924, + "learning_rate": 5.451058469407729e-05, + "loss": 1.9235, + "step": 6484 + }, + { + "epoch": 0.488153710081108, + "grad_norm": 4.0216064453125, + "learning_rate": 5.449844337594654e-05, + "loss": 1.7325, + "step": 6485 + }, + { + "epoch": 0.4882289843617682, + "grad_norm": 5.807652473449707, + "learning_rate": 5.448630179039622e-05, + "loss": 1.9754, + "step": 6486 + }, + { + "epoch": 0.4883042586424283, + "grad_norm": 4.336977005004883, + "learning_rate": 5.447415993814815e-05, + "loss": 1.3755, + "step": 6487 + }, + { + "epoch": 0.4883795329230885, + "grad_norm": 6.303496837615967, + "learning_rate": 5.4462017819924106e-05, + "loss": 2.0581, + "step": 6488 + }, + { + "epoch": 0.4884548072037487, + "grad_norm": 5.578193664550781, + "learning_rate": 5.444987543644591e-05, + "loss": 1.8661, + "step": 6489 + }, + { + "epoch": 0.4885300814844088, + "grad_norm": 5.070801734924316, + "learning_rate": 5.443773278843538e-05, + "loss": 1.7357, + "step": 6490 + }, + { + "epoch": 0.48860535576506897, + "grad_norm": 4.3117356300354, + "learning_rate": 5.4425589876614366e-05, + "loss": 2.3736, + "step": 6491 + }, + { + "epoch": 0.48868063004572915, + "grad_norm": 4.435965061187744, + "learning_rate": 5.441344670170474e-05, + "loss": 1.7422, + "step": 6492 + }, + { + "epoch": 0.48875590432638927, + "grad_norm": 3.736522674560547, + "learning_rate": 5.4401303264428364e-05, + "loss": 2.2251, + "step": 6493 + }, + { + "epoch": 0.48883117860704944, + "grad_norm": 5.066519737243652, + "learning_rate": 5.438915956550714e-05, + "loss": 2.1216, + "step": 6494 + }, + { + "epoch": 0.4889064528877096, + "grad_norm": 5.010735511779785, + "learning_rate": 5.4377015605662986e-05, + "loss": 1.8648, + "step": 6495 + }, + { + "epoch": 0.48898172716836974, + "grad_norm": 4.502397537231445, + "learning_rate": 5.4364871385617786e-05, + "loss": 2.0692, + "step": 6496 + }, + { + "epoch": 0.4890570014490299, + "grad_norm": 3.5062954425811768, + "learning_rate": 5.435272690609353e-05, + "loss": 2.0433, + "step": 6497 + }, + { + "epoch": 0.48913227572969004, + "grad_norm": 3.7083046436309814, + "learning_rate": 5.434058216781214e-05, + "loss": 1.66, + "step": 6498 + }, + { + "epoch": 0.4892075500103502, + "grad_norm": 9.759012222290039, + "learning_rate": 5.4328437171495604e-05, + "loss": 2.0309, + "step": 6499 + }, + { + "epoch": 0.4892828242910104, + "grad_norm": 4.209195613861084, + "learning_rate": 5.4316291917865883e-05, + "loss": 2.0649, + "step": 6500 + }, + { + "epoch": 0.4893580985716705, + "grad_norm": 5.8448333740234375, + "learning_rate": 5.4304146407644994e-05, + "loss": 1.5482, + "step": 6501 + }, + { + "epoch": 0.4894333728523307, + "grad_norm": 4.206943511962891, + "learning_rate": 5.429200064155495e-05, + "loss": 2.0601, + "step": 6502 + }, + { + "epoch": 0.48950864713299086, + "grad_norm": 8.297439575195312, + "learning_rate": 5.427985462031779e-05, + "loss": 2.5263, + "step": 6503 + }, + { + "epoch": 0.489583921413651, + "grad_norm": 5.022284507751465, + "learning_rate": 5.4267708344655556e-05, + "loss": 1.8905, + "step": 6504 + }, + { + "epoch": 0.48965919569431116, + "grad_norm": 6.0491228103637695, + "learning_rate": 5.42555618152903e-05, + "loss": 1.9399, + "step": 6505 + }, + { + "epoch": 0.4897344699749713, + "grad_norm": 6.512802600860596, + "learning_rate": 5.424341503294411e-05, + "loss": 1.8161, + "step": 6506 + }, + { + "epoch": 0.48980974425563145, + "grad_norm": 5.927082538604736, + "learning_rate": 5.4231267998339066e-05, + "loss": 2.0209, + "step": 6507 + }, + { + "epoch": 0.4898850185362916, + "grad_norm": 5.356221675872803, + "learning_rate": 5.421912071219728e-05, + "loss": 1.9805, + "step": 6508 + }, + { + "epoch": 0.48996029281695175, + "grad_norm": 4.81496000289917, + "learning_rate": 5.420697317524088e-05, + "loss": 1.8737, + "step": 6509 + }, + { + "epoch": 0.4900355670976119, + "grad_norm": 5.58132266998291, + "learning_rate": 5.4194825388191995e-05, + "loss": 2.1025, + "step": 6510 + }, + { + "epoch": 0.4901108413782721, + "grad_norm": 4.656373500823975, + "learning_rate": 5.418267735177278e-05, + "loss": 1.9189, + "step": 6511 + }, + { + "epoch": 0.4901861156589322, + "grad_norm": 5.391656875610352, + "learning_rate": 5.41705290667054e-05, + "loss": 2.1958, + "step": 6512 + }, + { + "epoch": 0.4902613899395924, + "grad_norm": 5.249373435974121, + "learning_rate": 5.4158380533712016e-05, + "loss": 1.7043, + "step": 6513 + }, + { + "epoch": 0.49033666422025257, + "grad_norm": 4.356224536895752, + "learning_rate": 5.414623175351486e-05, + "loss": 2.0932, + "step": 6514 + }, + { + "epoch": 0.4904119385009127, + "grad_norm": 7.173582077026367, + "learning_rate": 5.413408272683611e-05, + "loss": 1.5527, + "step": 6515 + }, + { + "epoch": 0.49048721278157287, + "grad_norm": 6.840654373168945, + "learning_rate": 5.4121933454398e-05, + "loss": 1.938, + "step": 6516 + }, + { + "epoch": 0.490562487062233, + "grad_norm": 4.42674446105957, + "learning_rate": 5.410978393692279e-05, + "loss": 2.1111, + "step": 6517 + }, + { + "epoch": 0.49063776134289316, + "grad_norm": 4.461450576782227, + "learning_rate": 5.40976341751327e-05, + "loss": 2.0442, + "step": 6518 + }, + { + "epoch": 0.49071303562355334, + "grad_norm": 5.244041442871094, + "learning_rate": 5.408548416975002e-05, + "loss": 1.4828, + "step": 6519 + }, + { + "epoch": 0.49078830990421346, + "grad_norm": 4.033167839050293, + "learning_rate": 5.4073333921497036e-05, + "loss": 2.0941, + "step": 6520 + }, + { + "epoch": 0.49086358418487364, + "grad_norm": 5.228991508483887, + "learning_rate": 5.406118343109604e-05, + "loss": 1.7965, + "step": 6521 + }, + { + "epoch": 0.4909388584655338, + "grad_norm": 6.748352527618408, + "learning_rate": 5.404903269926933e-05, + "loss": 2.2177, + "step": 6522 + }, + { + "epoch": 0.49101413274619393, + "grad_norm": 4.209940433502197, + "learning_rate": 5.4036881726739265e-05, + "loss": 1.7173, + "step": 6523 + }, + { + "epoch": 0.4910894070268541, + "grad_norm": 4.503101348876953, + "learning_rate": 5.402473051422814e-05, + "loss": 1.9533, + "step": 6524 + }, + { + "epoch": 0.49116468130751423, + "grad_norm": 5.839128494262695, + "learning_rate": 5.401257906245835e-05, + "loss": 2.0055, + "step": 6525 + }, + { + "epoch": 0.4912399555881744, + "grad_norm": 4.642331600189209, + "learning_rate": 5.400042737215224e-05, + "loss": 2.099, + "step": 6526 + }, + { + "epoch": 0.4913152298688346, + "grad_norm": 7.155869960784912, + "learning_rate": 5.398827544403221e-05, + "loss": 2.2093, + "step": 6527 + }, + { + "epoch": 0.4913905041494947, + "grad_norm": 5.854203224182129, + "learning_rate": 5.397612327882064e-05, + "loss": 1.9402, + "step": 6528 + }, + { + "epoch": 0.4914657784301549, + "grad_norm": 5.820188045501709, + "learning_rate": 5.396397087723994e-05, + "loss": 1.9695, + "step": 6529 + }, + { + "epoch": 0.49154105271081505, + "grad_norm": 3.5693118572235107, + "learning_rate": 5.395181824001256e-05, + "loss": 1.8955, + "step": 6530 + }, + { + "epoch": 0.49161632699147517, + "grad_norm": 7.379233360290527, + "learning_rate": 5.3939665367860926e-05, + "loss": 2.2634, + "step": 6531 + }, + { + "epoch": 0.49169160127213535, + "grad_norm": 5.738043308258057, + "learning_rate": 5.3927512261507485e-05, + "loss": 2.2121, + "step": 6532 + }, + { + "epoch": 0.4917668755527955, + "grad_norm": 4.206314563751221, + "learning_rate": 5.391535892167471e-05, + "loss": 1.8922, + "step": 6533 + }, + { + "epoch": 0.49184214983345564, + "grad_norm": 3.834730863571167, + "learning_rate": 5.390320534908508e-05, + "loss": 1.8218, + "step": 6534 + }, + { + "epoch": 0.4919174241141158, + "grad_norm": 4.031350135803223, + "learning_rate": 5.389105154446109e-05, + "loss": 2.0442, + "step": 6535 + }, + { + "epoch": 0.49199269839477594, + "grad_norm": 5.927657604217529, + "learning_rate": 5.3878897508525265e-05, + "loss": 1.8528, + "step": 6536 + }, + { + "epoch": 0.4920679726754361, + "grad_norm": 5.304276466369629, + "learning_rate": 5.386674324200009e-05, + "loss": 1.9195, + "step": 6537 + }, + { + "epoch": 0.4921432469560963, + "grad_norm": 6.380029201507568, + "learning_rate": 5.385458874560815e-05, + "loss": 1.7714, + "step": 6538 + }, + { + "epoch": 0.4922185212367564, + "grad_norm": 4.784676551818848, + "learning_rate": 5.3842434020071954e-05, + "loss": 2.2081, + "step": 6539 + }, + { + "epoch": 0.4922937955174166, + "grad_norm": 4.669191837310791, + "learning_rate": 5.383027906611409e-05, + "loss": 1.7475, + "step": 6540 + }, + { + "epoch": 0.49236906979807676, + "grad_norm": 4.375756740570068, + "learning_rate": 5.381812388445711e-05, + "loss": 1.765, + "step": 6541 + }, + { + "epoch": 0.4924443440787369, + "grad_norm": 6.5041961669921875, + "learning_rate": 5.380596847582363e-05, + "loss": 1.7643, + "step": 6542 + }, + { + "epoch": 0.49251961835939706, + "grad_norm": 4.717128276824951, + "learning_rate": 5.379381284093624e-05, + "loss": 1.8469, + "step": 6543 + }, + { + "epoch": 0.49259489264005724, + "grad_norm": 5.612990856170654, + "learning_rate": 5.378165698051757e-05, + "loss": 2.2545, + "step": 6544 + }, + { + "epoch": 0.49267016692071736, + "grad_norm": 4.011388301849365, + "learning_rate": 5.3769500895290245e-05, + "loss": 1.8134, + "step": 6545 + }, + { + "epoch": 0.49274544120137753, + "grad_norm": 5.489065170288086, + "learning_rate": 5.3757344585976886e-05, + "loss": 1.8106, + "step": 6546 + }, + { + "epoch": 0.49282071548203765, + "grad_norm": 4.551653861999512, + "learning_rate": 5.37451880533002e-05, + "loss": 1.659, + "step": 6547 + }, + { + "epoch": 0.49289598976269783, + "grad_norm": 4.0693464279174805, + "learning_rate": 5.3733031297982816e-05, + "loss": 2.0894, + "step": 6548 + }, + { + "epoch": 0.492971264043358, + "grad_norm": 6.810680866241455, + "learning_rate": 5.372087432074743e-05, + "loss": 1.9829, + "step": 6549 + }, + { + "epoch": 0.4930465383240181, + "grad_norm": 4.803364276885986, + "learning_rate": 5.3708717122316744e-05, + "loss": 2.0914, + "step": 6550 + }, + { + "epoch": 0.4931218126046783, + "grad_norm": 4.585589408874512, + "learning_rate": 5.3696559703413474e-05, + "loss": 1.63, + "step": 6551 + }, + { + "epoch": 0.4931970868853385, + "grad_norm": 3.918064594268799, + "learning_rate": 5.368440206476032e-05, + "loss": 1.62, + "step": 6552 + }, + { + "epoch": 0.4932723611659986, + "grad_norm": 4.058220863342285, + "learning_rate": 5.3672244207080045e-05, + "loss": 1.8729, + "step": 6553 + }, + { + "epoch": 0.49334763544665877, + "grad_norm": 4.199698448181152, + "learning_rate": 5.36600861310954e-05, + "loss": 1.8036, + "step": 6554 + }, + { + "epoch": 0.4934229097273189, + "grad_norm": 4.538209915161133, + "learning_rate": 5.3647927837529135e-05, + "loss": 2.0558, + "step": 6555 + }, + { + "epoch": 0.49349818400797907, + "grad_norm": 4.436923503875732, + "learning_rate": 5.363576932710401e-05, + "loss": 1.8746, + "step": 6556 + }, + { + "epoch": 0.49357345828863924, + "grad_norm": 4.124301433563232, + "learning_rate": 5.362361060054284e-05, + "loss": 2.0124, + "step": 6557 + }, + { + "epoch": 0.49364873256929936, + "grad_norm": 4.70476770401001, + "learning_rate": 5.3611451658568414e-05, + "loss": 1.9247, + "step": 6558 + }, + { + "epoch": 0.49372400684995954, + "grad_norm": 4.670519828796387, + "learning_rate": 5.359929250190355e-05, + "loss": 2.0932, + "step": 6559 + }, + { + "epoch": 0.4937992811306197, + "grad_norm": 3.702254295349121, + "learning_rate": 5.358713313127108e-05, + "loss": 1.6535, + "step": 6560 + }, + { + "epoch": 0.49387455541127984, + "grad_norm": 4.913912296295166, + "learning_rate": 5.357497354739385e-05, + "loss": 2.0881, + "step": 6561 + }, + { + "epoch": 0.49394982969194, + "grad_norm": 4.65692663192749, + "learning_rate": 5.356281375099467e-05, + "loss": 2.0455, + "step": 6562 + }, + { + "epoch": 0.4940251039726002, + "grad_norm": 5.847254276275635, + "learning_rate": 5.355065374279645e-05, + "loss": 1.8759, + "step": 6563 + }, + { + "epoch": 0.4941003782532603, + "grad_norm": 4.444882869720459, + "learning_rate": 5.353849352352206e-05, + "loss": 1.791, + "step": 6564 + }, + { + "epoch": 0.4941756525339205, + "grad_norm": 5.502654075622559, + "learning_rate": 5.352633309389437e-05, + "loss": 1.5627, + "step": 6565 + }, + { + "epoch": 0.4942509268145806, + "grad_norm": 5.259897708892822, + "learning_rate": 5.35141724546363e-05, + "loss": 1.7208, + "step": 6566 + }, + { + "epoch": 0.4943262010952408, + "grad_norm": 4.840779781341553, + "learning_rate": 5.350201160647077e-05, + "loss": 1.8647, + "step": 6567 + }, + { + "epoch": 0.49440147537590096, + "grad_norm": 4.3065571784973145, + "learning_rate": 5.348985055012069e-05, + "loss": 2.0042, + "step": 6568 + }, + { + "epoch": 0.4944767496565611, + "grad_norm": 3.9905309677124023, + "learning_rate": 5.3477689286308996e-05, + "loss": 1.9621, + "step": 6569 + }, + { + "epoch": 0.49455202393722125, + "grad_norm": 5.944657325744629, + "learning_rate": 5.3465527815758674e-05, + "loss": 1.8088, + "step": 6570 + }, + { + "epoch": 0.49462729821788143, + "grad_norm": 8.757949829101562, + "learning_rate": 5.345336613919266e-05, + "loss": 1.9353, + "step": 6571 + }, + { + "epoch": 0.49470257249854155, + "grad_norm": 7.129432201385498, + "learning_rate": 5.3441204257333946e-05, + "loss": 1.7618, + "step": 6572 + }, + { + "epoch": 0.4947778467792017, + "grad_norm": 4.390137195587158, + "learning_rate": 5.342904217090549e-05, + "loss": 1.9872, + "step": 6573 + }, + { + "epoch": 0.49485312105986184, + "grad_norm": 6.598457336425781, + "learning_rate": 5.3416879880630336e-05, + "loss": 1.9089, + "step": 6574 + }, + { + "epoch": 0.494928395340522, + "grad_norm": 8.359405517578125, + "learning_rate": 5.340471738723147e-05, + "loss": 2.3524, + "step": 6575 + }, + { + "epoch": 0.4950036696211822, + "grad_norm": 4.667253494262695, + "learning_rate": 5.3392554691431926e-05, + "loss": 1.9262, + "step": 6576 + }, + { + "epoch": 0.4950789439018423, + "grad_norm": 4.346924781799316, + "learning_rate": 5.338039179395474e-05, + "loss": 2.0853, + "step": 6577 + }, + { + "epoch": 0.4951542181825025, + "grad_norm": 5.357377052307129, + "learning_rate": 5.336822869552297e-05, + "loss": 1.8427, + "step": 6578 + }, + { + "epoch": 0.49522949246316267, + "grad_norm": 4.795270919799805, + "learning_rate": 5.335606539685964e-05, + "loss": 1.9301, + "step": 6579 + }, + { + "epoch": 0.4953047667438228, + "grad_norm": 4.3588786125183105, + "learning_rate": 5.3343901898687875e-05, + "loss": 2.5415, + "step": 6580 + }, + { + "epoch": 0.49538004102448296, + "grad_norm": 6.663821697235107, + "learning_rate": 5.333173820173073e-05, + "loss": 1.9014, + "step": 6581 + }, + { + "epoch": 0.49545531530514314, + "grad_norm": 4.672849655151367, + "learning_rate": 5.331957430671132e-05, + "loss": 1.5636, + "step": 6582 + }, + { + "epoch": 0.49553058958580326, + "grad_norm": 3.2184488773345947, + "learning_rate": 5.3307410214352724e-05, + "loss": 1.9791, + "step": 6583 + }, + { + "epoch": 0.49560586386646344, + "grad_norm": 3.9048497676849365, + "learning_rate": 5.32952459253781e-05, + "loss": 2.0377, + "step": 6584 + }, + { + "epoch": 0.49568113814712356, + "grad_norm": 6.407617568969727, + "learning_rate": 5.328308144051054e-05, + "loss": 1.8617, + "step": 6585 + }, + { + "epoch": 0.49575641242778373, + "grad_norm": 4.093458652496338, + "learning_rate": 5.327091676047322e-05, + "loss": 1.7339, + "step": 6586 + }, + { + "epoch": 0.4958316867084439, + "grad_norm": 6.431451797485352, + "learning_rate": 5.325875188598929e-05, + "loss": 2.0334, + "step": 6587 + }, + { + "epoch": 0.49590696098910403, + "grad_norm": 4.165318012237549, + "learning_rate": 5.3246586817781906e-05, + "loss": 1.9062, + "step": 6588 + }, + { + "epoch": 0.4959822352697642, + "grad_norm": 8.978361129760742, + "learning_rate": 5.3234421556574254e-05, + "loss": 2.1827, + "step": 6589 + }, + { + "epoch": 0.4960575095504244, + "grad_norm": 4.872499465942383, + "learning_rate": 5.322225610308952e-05, + "loss": 2.0648, + "step": 6590 + }, + { + "epoch": 0.4961327838310845, + "grad_norm": 5.128283500671387, + "learning_rate": 5.32100904580509e-05, + "loss": 1.6407, + "step": 6591 + }, + { + "epoch": 0.4962080581117447, + "grad_norm": 3.9446115493774414, + "learning_rate": 5.3197924622181614e-05, + "loss": 1.5445, + "step": 6592 + }, + { + "epoch": 0.49628333239240485, + "grad_norm": 5.773181915283203, + "learning_rate": 5.3185758596204896e-05, + "loss": 1.8557, + "step": 6593 + }, + { + "epoch": 0.496358606673065, + "grad_norm": 4.181520938873291, + "learning_rate": 5.3173592380843963e-05, + "loss": 1.6195, + "step": 6594 + }, + { + "epoch": 0.49643388095372515, + "grad_norm": 4.615962505340576, + "learning_rate": 5.316142597682207e-05, + "loss": 1.8266, + "step": 6595 + }, + { + "epoch": 0.49650915523438527, + "grad_norm": 4.229737281799316, + "learning_rate": 5.314925938486246e-05, + "loss": 1.9448, + "step": 6596 + }, + { + "epoch": 0.49658442951504544, + "grad_norm": 3.4112026691436768, + "learning_rate": 5.313709260568842e-05, + "loss": 1.8504, + "step": 6597 + }, + { + "epoch": 0.4966597037957056, + "grad_norm": 4.6149444580078125, + "learning_rate": 5.312492564002324e-05, + "loss": 2.0293, + "step": 6598 + }, + { + "epoch": 0.49673497807636574, + "grad_norm": 3.720318078994751, + "learning_rate": 5.311275848859019e-05, + "loss": 2.1347, + "step": 6599 + }, + { + "epoch": 0.4968102523570259, + "grad_norm": 3.8474161624908447, + "learning_rate": 5.310059115211259e-05, + "loss": 1.914, + "step": 6600 + }, + { + "epoch": 0.4968855266376861, + "grad_norm": 3.9680564403533936, + "learning_rate": 5.3088423631313735e-05, + "loss": 1.8852, + "step": 6601 + }, + { + "epoch": 0.4969608009183462, + "grad_norm": 4.404036521911621, + "learning_rate": 5.307625592691694e-05, + "loss": 1.9378, + "step": 6602 + }, + { + "epoch": 0.4970360751990064, + "grad_norm": 4.159010410308838, + "learning_rate": 5.306408803964557e-05, + "loss": 2.2767, + "step": 6603 + }, + { + "epoch": 0.4971113494796665, + "grad_norm": 4.397821426391602, + "learning_rate": 5.3051919970222964e-05, + "loss": 2.1887, + "step": 6604 + }, + { + "epoch": 0.4971866237603267, + "grad_norm": 5.168182849884033, + "learning_rate": 5.303975171937248e-05, + "loss": 1.8381, + "step": 6605 + }, + { + "epoch": 0.49726189804098686, + "grad_norm": 3.5057852268218994, + "learning_rate": 5.302758328781746e-05, + "loss": 1.9219, + "step": 6606 + }, + { + "epoch": 0.497337172321647, + "grad_norm": 6.528621196746826, + "learning_rate": 5.30154146762813e-05, + "loss": 2.0501, + "step": 6607 + }, + { + "epoch": 0.49741244660230716, + "grad_norm": 4.499448776245117, + "learning_rate": 5.3003245885487393e-05, + "loss": 1.7813, + "step": 6608 + }, + { + "epoch": 0.49748772088296733, + "grad_norm": 4.891198635101318, + "learning_rate": 5.2991076916159146e-05, + "loss": 1.8765, + "step": 6609 + }, + { + "epoch": 0.49756299516362745, + "grad_norm": 4.469904899597168, + "learning_rate": 5.297890776901996e-05, + "loss": 2.5131, + "step": 6610 + }, + { + "epoch": 0.49763826944428763, + "grad_norm": 4.918172836303711, + "learning_rate": 5.296673844479325e-05, + "loss": 1.5167, + "step": 6611 + }, + { + "epoch": 0.4977135437249478, + "grad_norm": 6.013672828674316, + "learning_rate": 5.2954568944202434e-05, + "loss": 1.9267, + "step": 6612 + }, + { + "epoch": 0.4977888180056079, + "grad_norm": 4.5740275382995605, + "learning_rate": 5.294239926797099e-05, + "loss": 1.6924, + "step": 6613 + }, + { + "epoch": 0.4978640922862681, + "grad_norm": 5.910006046295166, + "learning_rate": 5.293022941682234e-05, + "loss": 1.9462, + "step": 6614 + }, + { + "epoch": 0.4979393665669282, + "grad_norm": 4.6266984939575195, + "learning_rate": 5.2918059391479965e-05, + "loss": 1.9748, + "step": 6615 + }, + { + "epoch": 0.4980146408475884, + "grad_norm": 4.044189453125, + "learning_rate": 5.290588919266735e-05, + "loss": 1.7018, + "step": 6616 + }, + { + "epoch": 0.4980899151282486, + "grad_norm": 4.312623977661133, + "learning_rate": 5.289371882110794e-05, + "loss": 1.82, + "step": 6617 + }, + { + "epoch": 0.4981651894089087, + "grad_norm": 4.063395977020264, + "learning_rate": 5.288154827752526e-05, + "loss": 1.9681, + "step": 6618 + }, + { + "epoch": 0.49824046368956887, + "grad_norm": 3.1786062717437744, + "learning_rate": 5.286937756264278e-05, + "loss": 1.6618, + "step": 6619 + }, + { + "epoch": 0.49831573797022904, + "grad_norm": 3.902327060699463, + "learning_rate": 5.285720667718406e-05, + "loss": 1.9643, + "step": 6620 + }, + { + "epoch": 0.49839101225088916, + "grad_norm": 5.7253546714782715, + "learning_rate": 5.28450356218726e-05, + "loss": 1.9578, + "step": 6621 + }, + { + "epoch": 0.49846628653154934, + "grad_norm": 8.799309730529785, + "learning_rate": 5.283286439743193e-05, + "loss": 2.0379, + "step": 6622 + }, + { + "epoch": 0.49854156081220946, + "grad_norm": 4.470696926116943, + "learning_rate": 5.282069300458561e-05, + "loss": 2.1186, + "step": 6623 + }, + { + "epoch": 0.49861683509286964, + "grad_norm": 4.90806245803833, + "learning_rate": 5.280852144405717e-05, + "loss": 1.9687, + "step": 6624 + }, + { + "epoch": 0.4986921093735298, + "grad_norm": 4.017029285430908, + "learning_rate": 5.279634971657019e-05, + "loss": 1.6861, + "step": 6625 + }, + { + "epoch": 0.49876738365418993, + "grad_norm": 5.66743278503418, + "learning_rate": 5.2784177822848245e-05, + "loss": 2.3473, + "step": 6626 + }, + { + "epoch": 0.4988426579348501, + "grad_norm": 3.9530794620513916, + "learning_rate": 5.277200576361493e-05, + "loss": 1.8846, + "step": 6627 + }, + { + "epoch": 0.4989179322155103, + "grad_norm": 4.128320693969727, + "learning_rate": 5.2759833539593814e-05, + "loss": 1.905, + "step": 6628 + }, + { + "epoch": 0.4989932064961704, + "grad_norm": 4.087277412414551, + "learning_rate": 5.274766115150851e-05, + "loss": 1.7715, + "step": 6629 + }, + { + "epoch": 0.4990684807768306, + "grad_norm": 4.750850200653076, + "learning_rate": 5.273548860008263e-05, + "loss": 2.3477, + "step": 6630 + }, + { + "epoch": 0.49914375505749076, + "grad_norm": 4.339632987976074, + "learning_rate": 5.27233158860398e-05, + "loss": 1.842, + "step": 6631 + }, + { + "epoch": 0.4992190293381509, + "grad_norm": 6.536107540130615, + "learning_rate": 5.271114301010368e-05, + "loss": 2.0784, + "step": 6632 + }, + { + "epoch": 0.49929430361881105, + "grad_norm": 4.373968601226807, + "learning_rate": 5.2698969972997866e-05, + "loss": 1.7521, + "step": 6633 + }, + { + "epoch": 0.4993695778994712, + "grad_norm": 3.7168049812316895, + "learning_rate": 5.268679677544605e-05, + "loss": 1.9182, + "step": 6634 + }, + { + "epoch": 0.49944485218013135, + "grad_norm": 4.963070869445801, + "learning_rate": 5.267462341817185e-05, + "loss": 1.8125, + "step": 6635 + }, + { + "epoch": 0.4995201264607915, + "grad_norm": 6.004863739013672, + "learning_rate": 5.266244990189898e-05, + "loss": 1.5558, + "step": 6636 + }, + { + "epoch": 0.49959540074145165, + "grad_norm": 3.28002667427063, + "learning_rate": 5.265027622735109e-05, + "loss": 1.8543, + "step": 6637 + }, + { + "epoch": 0.4996706750221118, + "grad_norm": 4.824798107147217, + "learning_rate": 5.26381023952519e-05, + "loss": 2.1974, + "step": 6638 + }, + { + "epoch": 0.499745949302772, + "grad_norm": 5.915852069854736, + "learning_rate": 5.262592840632509e-05, + "loss": 1.9609, + "step": 6639 + }, + { + "epoch": 0.4998212235834321, + "grad_norm": 5.207124710083008, + "learning_rate": 5.261375426129436e-05, + "loss": 1.6356, + "step": 6640 + }, + { + "epoch": 0.4998964978640923, + "grad_norm": 4.946179389953613, + "learning_rate": 5.260157996088343e-05, + "loss": 1.7205, + "step": 6641 + }, + { + "epoch": 0.49997177214475247, + "grad_norm": 4.143404960632324, + "learning_rate": 5.258940550581606e-05, + "loss": 1.8587, + "step": 6642 + }, + { + "epoch": 0.5000470464254126, + "grad_norm": 4.522736072540283, + "learning_rate": 5.2577230896815944e-05, + "loss": 1.6212, + "step": 6643 + }, + { + "epoch": 0.5001223207060728, + "grad_norm": 7.085289478302002, + "learning_rate": 5.256505613460686e-05, + "loss": 1.8814, + "step": 6644 + }, + { + "epoch": 0.5001975949867329, + "grad_norm": 4.942041397094727, + "learning_rate": 5.255288121991253e-05, + "loss": 2.0122, + "step": 6645 + }, + { + "epoch": 0.5002728692673931, + "grad_norm": 5.063052654266357, + "learning_rate": 5.2540706153456755e-05, + "loss": 1.7972, + "step": 6646 + }, + { + "epoch": 0.5003481435480532, + "grad_norm": 6.212124347686768, + "learning_rate": 5.2528530935963274e-05, + "loss": 2.0467, + "step": 6647 + }, + { + "epoch": 0.5004234178287134, + "grad_norm": 4.333958148956299, + "learning_rate": 5.2516355568155895e-05, + "loss": 1.9459, + "step": 6648 + }, + { + "epoch": 0.5004986921093735, + "grad_norm": 4.473553657531738, + "learning_rate": 5.250418005075839e-05, + "loss": 1.6653, + "step": 6649 + }, + { + "epoch": 0.5005739663900337, + "grad_norm": 5.196820259094238, + "learning_rate": 5.2492004384494576e-05, + "loss": 1.9713, + "step": 6650 + }, + { + "epoch": 0.5006492406706938, + "grad_norm": 4.646674156188965, + "learning_rate": 5.247982857008825e-05, + "loss": 2.4776, + "step": 6651 + }, + { + "epoch": 0.500724514951354, + "grad_norm": 4.333740234375, + "learning_rate": 5.246765260826322e-05, + "loss": 2.2692, + "step": 6652 + }, + { + "epoch": 0.5007997892320142, + "grad_norm": 5.411248207092285, + "learning_rate": 5.245547649974334e-05, + "loss": 1.8884, + "step": 6653 + }, + { + "epoch": 0.5008750635126743, + "grad_norm": 4.009570121765137, + "learning_rate": 5.244330024525241e-05, + "loss": 1.9895, + "step": 6654 + }, + { + "epoch": 0.5009503377933344, + "grad_norm": 6.674574851989746, + "learning_rate": 5.243112384551431e-05, + "loss": 2.1379, + "step": 6655 + }, + { + "epoch": 0.5010256120739947, + "grad_norm": 4.459656238555908, + "learning_rate": 5.241894730125286e-05, + "loss": 1.7061, + "step": 6656 + }, + { + "epoch": 0.5011008863546548, + "grad_norm": 4.510525226593018, + "learning_rate": 5.240677061319194e-05, + "loss": 1.7712, + "step": 6657 + }, + { + "epoch": 0.5011761606353149, + "grad_norm": 4.68900203704834, + "learning_rate": 5.23945937820554e-05, + "loss": 1.9551, + "step": 6658 + }, + { + "epoch": 0.5012514349159751, + "grad_norm": 5.634761810302734, + "learning_rate": 5.238241680856715e-05, + "loss": 1.9508, + "step": 6659 + }, + { + "epoch": 0.5013267091966352, + "grad_norm": 4.2384352684021, + "learning_rate": 5.237023969345104e-05, + "loss": 2.0671, + "step": 6660 + }, + { + "epoch": 0.5014019834772954, + "grad_norm": 4.337565898895264, + "learning_rate": 5.2358062437430974e-05, + "loss": 1.6626, + "step": 6661 + }, + { + "epoch": 0.5014772577579556, + "grad_norm": 6.642696380615234, + "learning_rate": 5.234588504123089e-05, + "loss": 2.0451, + "step": 6662 + }, + { + "epoch": 0.5015525320386157, + "grad_norm": 5.136529922485352, + "learning_rate": 5.2333707505574656e-05, + "loss": 1.8311, + "step": 6663 + }, + { + "epoch": 0.5016278063192758, + "grad_norm": 3.1710305213928223, + "learning_rate": 5.23215298311862e-05, + "loss": 2.0421, + "step": 6664 + }, + { + "epoch": 0.5017030805999361, + "grad_norm": 4.964192867279053, + "learning_rate": 5.230935201878947e-05, + "loss": 1.6524, + "step": 6665 + }, + { + "epoch": 0.5017783548805962, + "grad_norm": 4.646667003631592, + "learning_rate": 5.229717406910838e-05, + "loss": 1.9214, + "step": 6666 + }, + { + "epoch": 0.5018536291612563, + "grad_norm": 5.1704421043396, + "learning_rate": 5.228499598286689e-05, + "loss": 1.9825, + "step": 6667 + }, + { + "epoch": 0.5019289034419164, + "grad_norm": 3.866488218307495, + "learning_rate": 5.227281776078895e-05, + "loss": 1.9435, + "step": 6668 + }, + { + "epoch": 0.5020041777225767, + "grad_norm": 4.703546524047852, + "learning_rate": 5.22606394035985e-05, + "loss": 1.8869, + "step": 6669 + }, + { + "epoch": 0.5020794520032368, + "grad_norm": 5.812427997589111, + "learning_rate": 5.224846091201953e-05, + "loss": 2.0668, + "step": 6670 + }, + { + "epoch": 0.5021547262838969, + "grad_norm": 4.198016166687012, + "learning_rate": 5.223628228677602e-05, + "loss": 2.3489, + "step": 6671 + }, + { + "epoch": 0.5022300005645571, + "grad_norm": 5.551123142242432, + "learning_rate": 5.222410352859193e-05, + "loss": 1.9847, + "step": 6672 + }, + { + "epoch": 0.5023052748452173, + "grad_norm": 4.253283977508545, + "learning_rate": 5.221192463819128e-05, + "loss": 1.7987, + "step": 6673 + }, + { + "epoch": 0.5023805491258774, + "grad_norm": 3.8291642665863037, + "learning_rate": 5.219974561629803e-05, + "loss": 1.7841, + "step": 6674 + }, + { + "epoch": 0.5024558234065376, + "grad_norm": 4.810837745666504, + "learning_rate": 5.218756646363625e-05, + "loss": 2.2023, + "step": 6675 + }, + { + "epoch": 0.5025310976871977, + "grad_norm": 4.038191318511963, + "learning_rate": 5.21753871809299e-05, + "loss": 1.5822, + "step": 6676 + }, + { + "epoch": 0.5026063719678578, + "grad_norm": 4.373762130737305, + "learning_rate": 5.2163207768903036e-05, + "loss": 1.8033, + "step": 6677 + }, + { + "epoch": 0.5026816462485181, + "grad_norm": 4.48949670791626, + "learning_rate": 5.215102822827966e-05, + "loss": 1.9618, + "step": 6678 + }, + { + "epoch": 0.5027569205291782, + "grad_norm": 3.321491003036499, + "learning_rate": 5.2138848559783835e-05, + "loss": 1.9513, + "step": 6679 + }, + { + "epoch": 0.5028321948098383, + "grad_norm": 5.063469886779785, + "learning_rate": 5.212666876413961e-05, + "loss": 2.0714, + "step": 6680 + }, + { + "epoch": 0.5029074690904985, + "grad_norm": 5.096776962280273, + "learning_rate": 5.211448884207102e-05, + "loss": 1.7062, + "step": 6681 + }, + { + "epoch": 0.5029827433711587, + "grad_norm": 4.336129188537598, + "learning_rate": 5.210230879430214e-05, + "loss": 1.9647, + "step": 6682 + }, + { + "epoch": 0.5030580176518188, + "grad_norm": 4.75596809387207, + "learning_rate": 5.209012862155703e-05, + "loss": 1.6804, + "step": 6683 + }, + { + "epoch": 0.503133291932479, + "grad_norm": 4.212070465087891, + "learning_rate": 5.207794832455978e-05, + "loss": 1.9132, + "step": 6684 + }, + { + "epoch": 0.5032085662131391, + "grad_norm": 4.937553882598877, + "learning_rate": 5.2065767904034445e-05, + "loss": 2.1129, + "step": 6685 + }, + { + "epoch": 0.5032838404937993, + "grad_norm": 10.074333190917969, + "learning_rate": 5.2053587360705156e-05, + "loss": 2.1045, + "step": 6686 + }, + { + "epoch": 0.5033591147744594, + "grad_norm": 4.156828880310059, + "learning_rate": 5.204140669529598e-05, + "loss": 1.9989, + "step": 6687 + }, + { + "epoch": 0.5034343890551196, + "grad_norm": 5.474024772644043, + "learning_rate": 5.2029225908531044e-05, + "loss": 2.0191, + "step": 6688 + }, + { + "epoch": 0.5035096633357797, + "grad_norm": 14.56948184967041, + "learning_rate": 5.2017045001134445e-05, + "loss": 2.0714, + "step": 6689 + }, + { + "epoch": 0.5035849376164399, + "grad_norm": 4.782439231872559, + "learning_rate": 5.200486397383031e-05, + "loss": 2.0435, + "step": 6690 + }, + { + "epoch": 0.5036602118971001, + "grad_norm": 4.0484747886657715, + "learning_rate": 5.199268282734276e-05, + "loss": 2.0064, + "step": 6691 + }, + { + "epoch": 0.5037354861777602, + "grad_norm": 5.236764430999756, + "learning_rate": 5.1980501562395925e-05, + "loss": 1.5891, + "step": 6692 + }, + { + "epoch": 0.5038107604584203, + "grad_norm": 3.6485483646392822, + "learning_rate": 5.196832017971397e-05, + "loss": 2.065, + "step": 6693 + }, + { + "epoch": 0.5038860347390806, + "grad_norm": 4.8559041023254395, + "learning_rate": 5.195613868002101e-05, + "loss": 1.8319, + "step": 6694 + }, + { + "epoch": 0.5039613090197407, + "grad_norm": 3.490993022918701, + "learning_rate": 5.1943957064041225e-05, + "loss": 1.6896, + "step": 6695 + }, + { + "epoch": 0.5040365833004008, + "grad_norm": 4.025605201721191, + "learning_rate": 5.193177533249879e-05, + "loss": 1.9785, + "step": 6696 + }, + { + "epoch": 0.504111857581061, + "grad_norm": 5.432565212249756, + "learning_rate": 5.1919593486117824e-05, + "loss": 2.1705, + "step": 6697 + }, + { + "epoch": 0.5041871318617211, + "grad_norm": 4.28887414932251, + "learning_rate": 5.190741152562255e-05, + "loss": 1.8371, + "step": 6698 + }, + { + "epoch": 0.5042624061423813, + "grad_norm": 3.529099941253662, + "learning_rate": 5.1895229451737135e-05, + "loss": 1.9488, + "step": 6699 + }, + { + "epoch": 0.5043376804230415, + "grad_norm": 5.892711639404297, + "learning_rate": 5.188304726518577e-05, + "loss": 2.1959, + "step": 6700 + }, + { + "epoch": 0.5044129547037016, + "grad_norm": 4.754748821258545, + "learning_rate": 5.187086496669265e-05, + "loss": 1.8788, + "step": 6701 + }, + { + "epoch": 0.5044882289843617, + "grad_norm": 6.027359485626221, + "learning_rate": 5.1858682556981954e-05, + "loss": 1.9679, + "step": 6702 + }, + { + "epoch": 0.504563503265022, + "grad_norm": 3.580472469329834, + "learning_rate": 5.184650003677793e-05, + "loss": 1.7225, + "step": 6703 + }, + { + "epoch": 0.5046387775456821, + "grad_norm": 4.181191921234131, + "learning_rate": 5.183431740680478e-05, + "loss": 1.8376, + "step": 6704 + }, + { + "epoch": 0.5047140518263422, + "grad_norm": 3.86775541305542, + "learning_rate": 5.1822134667786714e-05, + "loss": 1.6344, + "step": 6705 + }, + { + "epoch": 0.5047893261070024, + "grad_norm": 5.569202423095703, + "learning_rate": 5.180995182044798e-05, + "loss": 1.9528, + "step": 6706 + }, + { + "epoch": 0.5048646003876626, + "grad_norm": 5.448028087615967, + "learning_rate": 5.1797768865512794e-05, + "loss": 1.9753, + "step": 6707 + }, + { + "epoch": 0.5049398746683227, + "grad_norm": 4.283004283905029, + "learning_rate": 5.1785585803705416e-05, + "loss": 1.9926, + "step": 6708 + }, + { + "epoch": 0.5050151489489828, + "grad_norm": 6.169532299041748, + "learning_rate": 5.177340263575008e-05, + "loss": 1.7316, + "step": 6709 + }, + { + "epoch": 0.505090423229643, + "grad_norm": 4.4532623291015625, + "learning_rate": 5.1761219362371036e-05, + "loss": 1.7977, + "step": 6710 + }, + { + "epoch": 0.5051656975103032, + "grad_norm": 4.912318229675293, + "learning_rate": 5.1749035984292574e-05, + "loss": 1.8289, + "step": 6711 + }, + { + "epoch": 0.5052409717909633, + "grad_norm": 5.287909030914307, + "learning_rate": 5.173685250223893e-05, + "loss": 2.0992, + "step": 6712 + }, + { + "epoch": 0.5053162460716235, + "grad_norm": 4.788411617279053, + "learning_rate": 5.1724668916934384e-05, + "loss": 1.8494, + "step": 6713 + }, + { + "epoch": 0.5053915203522836, + "grad_norm": 4.089344501495361, + "learning_rate": 5.1712485229103224e-05, + "loss": 2.2271, + "step": 6714 + }, + { + "epoch": 0.5054667946329438, + "grad_norm": 3.6223223209381104, + "learning_rate": 5.1700301439469736e-05, + "loss": 2.0639, + "step": 6715 + }, + { + "epoch": 0.505542068913604, + "grad_norm": 4.406446933746338, + "learning_rate": 5.168811754875821e-05, + "loss": 1.6676, + "step": 6716 + }, + { + "epoch": 0.5056173431942641, + "grad_norm": 5.190280914306641, + "learning_rate": 5.167593355769294e-05, + "loss": 1.9448, + "step": 6717 + }, + { + "epoch": 0.5056926174749242, + "grad_norm": 4.703592777252197, + "learning_rate": 5.166374946699821e-05, + "loss": 1.8969, + "step": 6718 + }, + { + "epoch": 0.5057678917555845, + "grad_norm": 3.4521965980529785, + "learning_rate": 5.165156527739836e-05, + "loss": 1.6392, + "step": 6719 + }, + { + "epoch": 0.5058431660362446, + "grad_norm": 6.324052333831787, + "learning_rate": 5.1639380989617694e-05, + "loss": 2.009, + "step": 6720 + }, + { + "epoch": 0.5059184403169047, + "grad_norm": 4.220641613006592, + "learning_rate": 5.1627196604380534e-05, + "loss": 1.9461, + "step": 6721 + }, + { + "epoch": 0.5059937145975649, + "grad_norm": 5.34996223449707, + "learning_rate": 5.16150121224112e-05, + "loss": 1.9681, + "step": 6722 + }, + { + "epoch": 0.506068988878225, + "grad_norm": 3.2762959003448486, + "learning_rate": 5.160282754443404e-05, + "loss": 1.9002, + "step": 6723 + }, + { + "epoch": 0.5061442631588852, + "grad_norm": 5.706763744354248, + "learning_rate": 5.1590642871173354e-05, + "loss": 1.8134, + "step": 6724 + }, + { + "epoch": 0.5062195374395454, + "grad_norm": 5.4844136238098145, + "learning_rate": 5.157845810335353e-05, + "loss": 1.952, + "step": 6725 + }, + { + "epoch": 0.5062948117202055, + "grad_norm": 5.797453880310059, + "learning_rate": 5.15662732416989e-05, + "loss": 1.9008, + "step": 6726 + }, + { + "epoch": 0.5063700860008656, + "grad_norm": 3.9168620109558105, + "learning_rate": 5.155408828693383e-05, + "loss": 1.7457, + "step": 6727 + }, + { + "epoch": 0.5064453602815258, + "grad_norm": 4.206057071685791, + "learning_rate": 5.1541903239782664e-05, + "loss": 1.5096, + "step": 6728 + }, + { + "epoch": 0.506520634562186, + "grad_norm": 9.40599536895752, + "learning_rate": 5.152971810096977e-05, + "loss": 2.0214, + "step": 6729 + }, + { + "epoch": 0.5065959088428461, + "grad_norm": 5.5322394371032715, + "learning_rate": 5.151753287121952e-05, + "loss": 2.4696, + "step": 6730 + }, + { + "epoch": 0.5066711831235062, + "grad_norm": 5.431405067443848, + "learning_rate": 5.150534755125631e-05, + "loss": 1.8869, + "step": 6731 + }, + { + "epoch": 0.5067464574041665, + "grad_norm": 7.7712321281433105, + "learning_rate": 5.149316214180451e-05, + "loss": 1.6708, + "step": 6732 + }, + { + "epoch": 0.5068217316848266, + "grad_norm": 4.853963375091553, + "learning_rate": 5.1480976643588486e-05, + "loss": 2.1094, + "step": 6733 + }, + { + "epoch": 0.5068970059654867, + "grad_norm": 4.799790382385254, + "learning_rate": 5.1468791057332665e-05, + "loss": 1.9773, + "step": 6734 + }, + { + "epoch": 0.5069722802461469, + "grad_norm": 7.293521404266357, + "learning_rate": 5.1456605383761414e-05, + "loss": 2.3182, + "step": 6735 + }, + { + "epoch": 0.507047554526807, + "grad_norm": 4.357425689697266, + "learning_rate": 5.144441962359917e-05, + "loss": 2.0324, + "step": 6736 + }, + { + "epoch": 0.5071228288074672, + "grad_norm": 4.715337753295898, + "learning_rate": 5.143223377757032e-05, + "loss": 1.9506, + "step": 6737 + }, + { + "epoch": 0.5071981030881274, + "grad_norm": 4.954517364501953, + "learning_rate": 5.142004784639928e-05, + "loss": 2.068, + "step": 6738 + }, + { + "epoch": 0.5072733773687875, + "grad_norm": 8.095152854919434, + "learning_rate": 5.140786183081046e-05, + "loss": 1.8576, + "step": 6739 + }, + { + "epoch": 0.5073486516494476, + "grad_norm": 3.657353401184082, + "learning_rate": 5.13956757315283e-05, + "loss": 1.7307, + "step": 6740 + }, + { + "epoch": 0.5074239259301079, + "grad_norm": 3.8049044609069824, + "learning_rate": 5.138348954927723e-05, + "loss": 1.5787, + "step": 6741 + }, + { + "epoch": 0.507499200210768, + "grad_norm": 6.047858715057373, + "learning_rate": 5.137130328478166e-05, + "loss": 1.9857, + "step": 6742 + }, + { + "epoch": 0.5075744744914281, + "grad_norm": 5.1330461502075195, + "learning_rate": 5.135911693876606e-05, + "loss": 2.1349, + "step": 6743 + }, + { + "epoch": 0.5076497487720883, + "grad_norm": 5.729600429534912, + "learning_rate": 5.134693051195485e-05, + "loss": 1.7973, + "step": 6744 + }, + { + "epoch": 0.5077250230527485, + "grad_norm": 5.038491249084473, + "learning_rate": 5.133474400507249e-05, + "loss": 2.0774, + "step": 6745 + }, + { + "epoch": 0.5078002973334086, + "grad_norm": 5.444791316986084, + "learning_rate": 5.132255741884343e-05, + "loss": 2.0905, + "step": 6746 + }, + { + "epoch": 0.5078755716140687, + "grad_norm": 4.389624118804932, + "learning_rate": 5.131037075399212e-05, + "loss": 1.7255, + "step": 6747 + }, + { + "epoch": 0.5079508458947289, + "grad_norm": 4.421477317810059, + "learning_rate": 5.129818401124303e-05, + "loss": 1.7822, + "step": 6748 + }, + { + "epoch": 0.5080261201753891, + "grad_norm": 5.592228412628174, + "learning_rate": 5.1285997191320635e-05, + "loss": 2.1499, + "step": 6749 + }, + { + "epoch": 0.5081013944560492, + "grad_norm": 5.381822109222412, + "learning_rate": 5.1273810294949386e-05, + "loss": 1.7784, + "step": 6750 + }, + { + "epoch": 0.5081766687367094, + "grad_norm": 5.821904182434082, + "learning_rate": 5.126162332285378e-05, + "loss": 1.9367, + "step": 6751 + }, + { + "epoch": 0.5082519430173695, + "grad_norm": 3.8950958251953125, + "learning_rate": 5.124943627575827e-05, + "loss": 1.7842, + "step": 6752 + }, + { + "epoch": 0.5083272172980297, + "grad_norm": 5.104915142059326, + "learning_rate": 5.1237249154387366e-05, + "loss": 2.0075, + "step": 6753 + }, + { + "epoch": 0.5084024915786899, + "grad_norm": 7.373931407928467, + "learning_rate": 5.122506195946556e-05, + "loss": 1.9125, + "step": 6754 + }, + { + "epoch": 0.50847776585935, + "grad_norm": 5.867264270782471, + "learning_rate": 5.121287469171733e-05, + "loss": 2.1131, + "step": 6755 + }, + { + "epoch": 0.5085530401400101, + "grad_norm": 4.396439552307129, + "learning_rate": 5.120068735186718e-05, + "loss": 1.9448, + "step": 6756 + }, + { + "epoch": 0.5086283144206704, + "grad_norm": 5.604939937591553, + "learning_rate": 5.11884999406396e-05, + "loss": 1.9606, + "step": 6757 + }, + { + "epoch": 0.5087035887013305, + "grad_norm": 4.8259358406066895, + "learning_rate": 5.1176312458759115e-05, + "loss": 1.9132, + "step": 6758 + }, + { + "epoch": 0.5087788629819906, + "grad_norm": 5.529745101928711, + "learning_rate": 5.116412490695023e-05, + "loss": 1.8852, + "step": 6759 + }, + { + "epoch": 0.5088541372626508, + "grad_norm": 5.993441581726074, + "learning_rate": 5.115193728593747e-05, + "loss": 1.9044, + "step": 6760 + }, + { + "epoch": 0.508929411543311, + "grad_norm": 5.119747161865234, + "learning_rate": 5.113974959644534e-05, + "loss": 1.973, + "step": 6761 + }, + { + "epoch": 0.5090046858239711, + "grad_norm": 8.281730651855469, + "learning_rate": 5.112756183919837e-05, + "loss": 1.9255, + "step": 6762 + }, + { + "epoch": 0.5090799601046313, + "grad_norm": 4.355381965637207, + "learning_rate": 5.111537401492106e-05, + "loss": 2.0383, + "step": 6763 + }, + { + "epoch": 0.5091552343852914, + "grad_norm": 6.217527866363525, + "learning_rate": 5.110318612433799e-05, + "loss": 1.9155, + "step": 6764 + }, + { + "epoch": 0.5092305086659515, + "grad_norm": 5.128861904144287, + "learning_rate": 5.1090998168173666e-05, + "loss": 1.939, + "step": 6765 + }, + { + "epoch": 0.5093057829466117, + "grad_norm": 4.530824661254883, + "learning_rate": 5.107881014715263e-05, + "loss": 1.8618, + "step": 6766 + }, + { + "epoch": 0.5093810572272719, + "grad_norm": 3.5501315593719482, + "learning_rate": 5.106662206199943e-05, + "loss": 1.8874, + "step": 6767 + }, + { + "epoch": 0.509456331507932, + "grad_norm": 3.821485757827759, + "learning_rate": 5.10544339134386e-05, + "loss": 1.6803, + "step": 6768 + }, + { + "epoch": 0.5095316057885921, + "grad_norm": 4.466263294219971, + "learning_rate": 5.104224570219469e-05, + "loss": 1.7033, + "step": 6769 + }, + { + "epoch": 0.5096068800692524, + "grad_norm": 6.206378936767578, + "learning_rate": 5.1030057428992284e-05, + "loss": 1.8901, + "step": 6770 + }, + { + "epoch": 0.5096821543499125, + "grad_norm": 3.7993228435516357, + "learning_rate": 5.1017869094555914e-05, + "loss": 1.6731, + "step": 6771 + }, + { + "epoch": 0.5097574286305726, + "grad_norm": 4.2592620849609375, + "learning_rate": 5.1005680699610146e-05, + "loss": 1.8052, + "step": 6772 + }, + { + "epoch": 0.5098327029112328, + "grad_norm": 6.020939826965332, + "learning_rate": 5.0993492244879546e-05, + "loss": 1.9472, + "step": 6773 + }, + { + "epoch": 0.509907977191893, + "grad_norm": 5.051983833312988, + "learning_rate": 5.098130373108867e-05, + "loss": 1.7637, + "step": 6774 + }, + { + "epoch": 0.5099832514725531, + "grad_norm": 4.431304454803467, + "learning_rate": 5.096911515896211e-05, + "loss": 2.0273, + "step": 6775 + }, + { + "epoch": 0.5100585257532133, + "grad_norm": 4.491030693054199, + "learning_rate": 5.095692652922445e-05, + "loss": 1.6398, + "step": 6776 + }, + { + "epoch": 0.5101338000338734, + "grad_norm": 5.151197910308838, + "learning_rate": 5.094473784260023e-05, + "loss": 2.0918, + "step": 6777 + }, + { + "epoch": 0.5102090743145336, + "grad_norm": 5.7026214599609375, + "learning_rate": 5.093254909981407e-05, + "loss": 2.0468, + "step": 6778 + }, + { + "epoch": 0.5102843485951938, + "grad_norm": 5.698533535003662, + "learning_rate": 5.092036030159055e-05, + "loss": 1.7777, + "step": 6779 + }, + { + "epoch": 0.5103596228758539, + "grad_norm": 5.243957042694092, + "learning_rate": 5.090817144865423e-05, + "loss": 2.1034, + "step": 6780 + }, + { + "epoch": 0.510434897156514, + "grad_norm": 4.988044738769531, + "learning_rate": 5.089598254172974e-05, + "loss": 1.76, + "step": 6781 + }, + { + "epoch": 0.5105101714371743, + "grad_norm": 4.061254024505615, + "learning_rate": 5.088379358154165e-05, + "loss": 1.7152, + "step": 6782 + }, + { + "epoch": 0.5105854457178344, + "grad_norm": 5.8456807136535645, + "learning_rate": 5.087160456881458e-05, + "loss": 1.7742, + "step": 6783 + }, + { + "epoch": 0.5106607199984945, + "grad_norm": 6.585233211517334, + "learning_rate": 5.0859415504273114e-05, + "loss": 1.895, + "step": 6784 + }, + { + "epoch": 0.5107359942791546, + "grad_norm": 6.776304721832275, + "learning_rate": 5.0847226388641845e-05, + "loss": 1.8219, + "step": 6785 + }, + { + "epoch": 0.5108112685598148, + "grad_norm": 4.197993755340576, + "learning_rate": 5.083503722264541e-05, + "loss": 1.6516, + "step": 6786 + }, + { + "epoch": 0.510886542840475, + "grad_norm": 5.710675239562988, + "learning_rate": 5.0822848007008414e-05, + "loss": 1.8449, + "step": 6787 + }, + { + "epoch": 0.5109618171211351, + "grad_norm": 5.319664001464844, + "learning_rate": 5.0810658742455464e-05, + "loss": 1.9438, + "step": 6788 + }, + { + "epoch": 0.5110370914017953, + "grad_norm": 4.425714015960693, + "learning_rate": 5.0798469429711185e-05, + "loss": 1.7941, + "step": 6789 + }, + { + "epoch": 0.5111123656824554, + "grad_norm": 5.851222038269043, + "learning_rate": 5.0786280069500155e-05, + "loss": 1.7404, + "step": 6790 + }, + { + "epoch": 0.5111876399631156, + "grad_norm": 3.832235336303711, + "learning_rate": 5.0774090662547055e-05, + "loss": 1.991, + "step": 6791 + }, + { + "epoch": 0.5112629142437758, + "grad_norm": 5.484068393707275, + "learning_rate": 5.076190120957649e-05, + "loss": 2.1917, + "step": 6792 + }, + { + "epoch": 0.5113381885244359, + "grad_norm": 4.618852138519287, + "learning_rate": 5.074971171131308e-05, + "loss": 2.2317, + "step": 6793 + }, + { + "epoch": 0.511413462805096, + "grad_norm": 10.821094512939453, + "learning_rate": 5.0737522168481453e-05, + "loss": 2.0404, + "step": 6794 + }, + { + "epoch": 0.5114887370857563, + "grad_norm": 4.949869632720947, + "learning_rate": 5.0725332581806265e-05, + "loss": 1.6138, + "step": 6795 + }, + { + "epoch": 0.5115640113664164, + "grad_norm": 7.573105335235596, + "learning_rate": 5.0713142952012126e-05, + "loss": 2.0455, + "step": 6796 + }, + { + "epoch": 0.5116392856470765, + "grad_norm": 4.421236991882324, + "learning_rate": 5.070095327982368e-05, + "loss": 2.1107, + "step": 6797 + }, + { + "epoch": 0.5117145599277367, + "grad_norm": 5.214579105377197, + "learning_rate": 5.068876356596558e-05, + "loss": 2.0186, + "step": 6798 + }, + { + "epoch": 0.5117898342083969, + "grad_norm": 4.235634803771973, + "learning_rate": 5.0676573811162465e-05, + "loss": 2.0007, + "step": 6799 + }, + { + "epoch": 0.511865108489057, + "grad_norm": 7.372382164001465, + "learning_rate": 5.066438401613898e-05, + "loss": 2.1085, + "step": 6800 + }, + { + "epoch": 0.5119403827697172, + "grad_norm": 4.532595634460449, + "learning_rate": 5.0652194181619774e-05, + "loss": 1.754, + "step": 6801 + }, + { + "epoch": 0.5120156570503773, + "grad_norm": 4.227382183074951, + "learning_rate": 5.064000430832947e-05, + "loss": 1.8598, + "step": 6802 + }, + { + "epoch": 0.5120909313310374, + "grad_norm": 4.752003192901611, + "learning_rate": 5.062781439699277e-05, + "loss": 2.131, + "step": 6803 + }, + { + "epoch": 0.5121662056116977, + "grad_norm": 4.954397201538086, + "learning_rate": 5.06156244483343e-05, + "loss": 2.0002, + "step": 6804 + }, + { + "epoch": 0.5122414798923578, + "grad_norm": 5.293426036834717, + "learning_rate": 5.060343446307872e-05, + "loss": 1.963, + "step": 6805 + }, + { + "epoch": 0.5123167541730179, + "grad_norm": 8.222238540649414, + "learning_rate": 5.05912444419507e-05, + "loss": 1.632, + "step": 6806 + }, + { + "epoch": 0.512392028453678, + "grad_norm": 4.669669151306152, + "learning_rate": 5.057905438567488e-05, + "loss": 1.8256, + "step": 6807 + }, + { + "epoch": 0.5124673027343383, + "grad_norm": 4.59066104888916, + "learning_rate": 5.056686429497594e-05, + "loss": 2.3087, + "step": 6808 + }, + { + "epoch": 0.5125425770149984, + "grad_norm": 4.068904876708984, + "learning_rate": 5.055467417057855e-05, + "loss": 1.6905, + "step": 6809 + }, + { + "epoch": 0.5126178512956585, + "grad_norm": 5.9013261795043945, + "learning_rate": 5.054248401320738e-05, + "loss": 2.3371, + "step": 6810 + }, + { + "epoch": 0.5126931255763187, + "grad_norm": 8.629036903381348, + "learning_rate": 5.053029382358708e-05, + "loss": 1.807, + "step": 6811 + }, + { + "epoch": 0.5127683998569789, + "grad_norm": 4.809014320373535, + "learning_rate": 5.0518103602442346e-05, + "loss": 1.7238, + "step": 6812 + }, + { + "epoch": 0.512843674137639, + "grad_norm": 5.800009250640869, + "learning_rate": 5.0505913350497825e-05, + "loss": 2.4917, + "step": 6813 + }, + { + "epoch": 0.5129189484182992, + "grad_norm": 4.991604328155518, + "learning_rate": 5.049372306847821e-05, + "loss": 2.08, + "step": 6814 + }, + { + "epoch": 0.5129942226989593, + "grad_norm": 4.171029567718506, + "learning_rate": 5.0481532757108184e-05, + "loss": 1.9725, + "step": 6815 + }, + { + "epoch": 0.5130694969796195, + "grad_norm": 9.149380683898926, + "learning_rate": 5.046934241711242e-05, + "loss": 2.2616, + "step": 6816 + }, + { + "epoch": 0.5131447712602797, + "grad_norm": 4.736367225646973, + "learning_rate": 5.045715204921559e-05, + "loss": 1.7606, + "step": 6817 + }, + { + "epoch": 0.5132200455409398, + "grad_norm": 4.694098949432373, + "learning_rate": 5.0444961654142385e-05, + "loss": 2.3341, + "step": 6818 + }, + { + "epoch": 0.5132953198215999, + "grad_norm": 5.636812686920166, + "learning_rate": 5.0432771232617484e-05, + "loss": 2.4845, + "step": 6819 + }, + { + "epoch": 0.5133705941022602, + "grad_norm": 4.193817138671875, + "learning_rate": 5.042058078536557e-05, + "loss": 1.7284, + "step": 6820 + }, + { + "epoch": 0.5134458683829203, + "grad_norm": 6.854394912719727, + "learning_rate": 5.040839031311134e-05, + "loss": 2.0648, + "step": 6821 + }, + { + "epoch": 0.5135211426635804, + "grad_norm": 4.402436256408691, + "learning_rate": 5.0396199816579485e-05, + "loss": 2.1523, + "step": 6822 + }, + { + "epoch": 0.5135964169442406, + "grad_norm": 3.992069721221924, + "learning_rate": 5.038400929649466e-05, + "loss": 1.8521, + "step": 6823 + }, + { + "epoch": 0.5136716912249008, + "grad_norm": 3.866669178009033, + "learning_rate": 5.037181875358161e-05, + "loss": 1.664, + "step": 6824 + }, + { + "epoch": 0.5137469655055609, + "grad_norm": 3.647256374359131, + "learning_rate": 5.035962818856499e-05, + "loss": 2.0171, + "step": 6825 + }, + { + "epoch": 0.513822239786221, + "grad_norm": 4.605841636657715, + "learning_rate": 5.034743760216951e-05, + "loss": 1.9501, + "step": 6826 + }, + { + "epoch": 0.5138975140668812, + "grad_norm": 3.749753475189209, + "learning_rate": 5.0335246995119864e-05, + "loss": 1.6774, + "step": 6827 + }, + { + "epoch": 0.5139727883475413, + "grad_norm": 4.9727678298950195, + "learning_rate": 5.032305636814075e-05, + "loss": 2.1272, + "step": 6828 + }, + { + "epoch": 0.5140480626282015, + "grad_norm": 4.057631492614746, + "learning_rate": 5.0310865721956854e-05, + "loss": 2.6007, + "step": 6829 + }, + { + "epoch": 0.5141233369088617, + "grad_norm": 3.980250120162964, + "learning_rate": 5.029867505729289e-05, + "loss": 2.0065, + "step": 6830 + }, + { + "epoch": 0.5141986111895218, + "grad_norm": 5.497494220733643, + "learning_rate": 5.028648437487354e-05, + "loss": 2.1502, + "step": 6831 + }, + { + "epoch": 0.5142738854701819, + "grad_norm": 3.7058589458465576, + "learning_rate": 5.0274293675423525e-05, + "loss": 1.8049, + "step": 6832 + }, + { + "epoch": 0.5143491597508422, + "grad_norm": 7.040330410003662, + "learning_rate": 5.0262102959667546e-05, + "loss": 2.063, + "step": 6833 + }, + { + "epoch": 0.5144244340315023, + "grad_norm": 5.201131343841553, + "learning_rate": 5.024991222833029e-05, + "loss": 2.1388, + "step": 6834 + }, + { + "epoch": 0.5144997083121624, + "grad_norm": 5.2422356605529785, + "learning_rate": 5.023772148213646e-05, + "loss": 1.9717, + "step": 6835 + }, + { + "epoch": 0.5145749825928226, + "grad_norm": 4.488335609436035, + "learning_rate": 5.022553072181078e-05, + "loss": 2.0539, + "step": 6836 + }, + { + "epoch": 0.5146502568734828, + "grad_norm": 4.804295063018799, + "learning_rate": 5.021333994807794e-05, + "loss": 1.6963, + "step": 6837 + }, + { + "epoch": 0.5147255311541429, + "grad_norm": 3.918212413787842, + "learning_rate": 5.020114916166265e-05, + "loss": 1.8088, + "step": 6838 + }, + { + "epoch": 0.5148008054348031, + "grad_norm": 4.215106010437012, + "learning_rate": 5.018895836328962e-05, + "loss": 1.8461, + "step": 6839 + }, + { + "epoch": 0.5148760797154632, + "grad_norm": 4.371639728546143, + "learning_rate": 5.017676755368357e-05, + "loss": 1.7672, + "step": 6840 + }, + { + "epoch": 0.5149513539961234, + "grad_norm": 3.7421443462371826, + "learning_rate": 5.016457673356917e-05, + "loss": 2.0301, + "step": 6841 + }, + { + "epoch": 0.5150266282767836, + "grad_norm": 5.293332099914551, + "learning_rate": 5.015238590367117e-05, + "loss": 2.3953, + "step": 6842 + }, + { + "epoch": 0.5151019025574437, + "grad_norm": 5.041553020477295, + "learning_rate": 5.014019506471428e-05, + "loss": 1.9352, + "step": 6843 + }, + { + "epoch": 0.5151771768381038, + "grad_norm": 4.102403163909912, + "learning_rate": 5.012800421742318e-05, + "loss": 1.9296, + "step": 6844 + }, + { + "epoch": 0.5152524511187639, + "grad_norm": 4.40903377532959, + "learning_rate": 5.0115813362522615e-05, + "loss": 1.6571, + "step": 6845 + }, + { + "epoch": 0.5153277253994242, + "grad_norm": 4.813185214996338, + "learning_rate": 5.0103622500737255e-05, + "loss": 2.4134, + "step": 6846 + }, + { + "epoch": 0.5154029996800843, + "grad_norm": 3.8157384395599365, + "learning_rate": 5.009143163279184e-05, + "loss": 1.6864, + "step": 6847 + }, + { + "epoch": 0.5154782739607444, + "grad_norm": 3.538339376449585, + "learning_rate": 5.0079240759411085e-05, + "loss": 2.0502, + "step": 6848 + }, + { + "epoch": 0.5155535482414046, + "grad_norm": 5.106586933135986, + "learning_rate": 5.00670498813197e-05, + "loss": 1.7201, + "step": 6849 + }, + { + "epoch": 0.5156288225220648, + "grad_norm": 4.649513244628906, + "learning_rate": 5.005485899924237e-05, + "loss": 2.0033, + "step": 6850 + }, + { + "epoch": 0.5157040968027249, + "grad_norm": 5.803775787353516, + "learning_rate": 5.004266811390385e-05, + "loss": 1.8993, + "step": 6851 + }, + { + "epoch": 0.5157793710833851, + "grad_norm": 4.632706165313721, + "learning_rate": 5.0030477226028816e-05, + "loss": 2.2742, + "step": 6852 + }, + { + "epoch": 0.5158546453640452, + "grad_norm": 5.387406826019287, + "learning_rate": 5.0018286336342e-05, + "loss": 1.8267, + "step": 6853 + }, + { + "epoch": 0.5159299196447054, + "grad_norm": 5.561295509338379, + "learning_rate": 5.0006095445568124e-05, + "loss": 2.2549, + "step": 6854 + }, + { + "epoch": 0.5160051939253656, + "grad_norm": 7.257455348968506, + "learning_rate": 4.9993904554431894e-05, + "loss": 2.1567, + "step": 6855 + }, + { + "epoch": 0.5160804682060257, + "grad_norm": 5.317276954650879, + "learning_rate": 4.9981713663658004e-05, + "loss": 1.6771, + "step": 6856 + }, + { + "epoch": 0.5161557424866858, + "grad_norm": 4.524988651275635, + "learning_rate": 4.99695227739712e-05, + "loss": 2.0057, + "step": 6857 + }, + { + "epoch": 0.5162310167673461, + "grad_norm": 5.059763431549072, + "learning_rate": 4.9957331886096164e-05, + "loss": 2.1157, + "step": 6858 + }, + { + "epoch": 0.5163062910480062, + "grad_norm": 7.32643985748291, + "learning_rate": 4.994514100075765e-05, + "loss": 2.3038, + "step": 6859 + }, + { + "epoch": 0.5163815653286663, + "grad_norm": 5.789794445037842, + "learning_rate": 4.993295011868032e-05, + "loss": 1.7921, + "step": 6860 + }, + { + "epoch": 0.5164568396093265, + "grad_norm": 3.712477684020996, + "learning_rate": 4.992075924058894e-05, + "loss": 1.9185, + "step": 6861 + }, + { + "epoch": 0.5165321138899867, + "grad_norm": 4.081736087799072, + "learning_rate": 4.990856836720817e-05, + "loss": 1.7324, + "step": 6862 + }, + { + "epoch": 0.5166073881706468, + "grad_norm": 4.747015953063965, + "learning_rate": 4.989637749926275e-05, + "loss": 1.9919, + "step": 6863 + }, + { + "epoch": 0.5166826624513069, + "grad_norm": 3.6823556423187256, + "learning_rate": 4.9884186637477404e-05, + "loss": 2.012, + "step": 6864 + }, + { + "epoch": 0.5167579367319671, + "grad_norm": 4.525242328643799, + "learning_rate": 4.9871995782576816e-05, + "loss": 1.7546, + "step": 6865 + }, + { + "epoch": 0.5168332110126272, + "grad_norm": 5.906013488769531, + "learning_rate": 4.9859804935285734e-05, + "loss": 2.0468, + "step": 6866 + }, + { + "epoch": 0.5169084852932874, + "grad_norm": 4.098196983337402, + "learning_rate": 4.984761409632882e-05, + "loss": 1.8925, + "step": 6867 + }, + { + "epoch": 0.5169837595739476, + "grad_norm": 6.528939723968506, + "learning_rate": 4.9835423266430836e-05, + "loss": 2.3215, + "step": 6868 + }, + { + "epoch": 0.5170590338546077, + "grad_norm": 5.712769508361816, + "learning_rate": 4.982323244631645e-05, + "loss": 2.2422, + "step": 6869 + }, + { + "epoch": 0.5171343081352678, + "grad_norm": 4.830761909484863, + "learning_rate": 4.981104163671039e-05, + "loss": 1.6947, + "step": 6870 + }, + { + "epoch": 0.5172095824159281, + "grad_norm": 4.076691627502441, + "learning_rate": 4.979885083833736e-05, + "loss": 2.0192, + "step": 6871 + }, + { + "epoch": 0.5172848566965882, + "grad_norm": 4.893392086029053, + "learning_rate": 4.978666005192208e-05, + "loss": 2.1991, + "step": 6872 + }, + { + "epoch": 0.5173601309772483, + "grad_norm": 5.401125907897949, + "learning_rate": 4.977446927818924e-05, + "loss": 2.272, + "step": 6873 + }, + { + "epoch": 0.5174354052579085, + "grad_norm": 7.253882884979248, + "learning_rate": 4.976227851786356e-05, + "loss": 2.2889, + "step": 6874 + }, + { + "epoch": 0.5175106795385687, + "grad_norm": 5.098233699798584, + "learning_rate": 4.975008777166973e-05, + "loss": 1.8926, + "step": 6875 + }, + { + "epoch": 0.5175859538192288, + "grad_norm": 5.662689685821533, + "learning_rate": 4.973789704033248e-05, + "loss": 1.8065, + "step": 6876 + }, + { + "epoch": 0.517661228099889, + "grad_norm": 3.8207900524139404, + "learning_rate": 4.972570632457648e-05, + "loss": 1.9377, + "step": 6877 + }, + { + "epoch": 0.5177365023805491, + "grad_norm": 6.19397497177124, + "learning_rate": 4.9713515625126475e-05, + "loss": 2.7659, + "step": 6878 + }, + { + "epoch": 0.5178117766612093, + "grad_norm": 5.682821750640869, + "learning_rate": 4.9701324942707125e-05, + "loss": 2.0679, + "step": 6879 + }, + { + "epoch": 0.5178870509418695, + "grad_norm": 5.77880859375, + "learning_rate": 4.968913427804315e-05, + "loss": 1.8812, + "step": 6880 + }, + { + "epoch": 0.5179623252225296, + "grad_norm": 4.24069881439209, + "learning_rate": 4.967694363185926e-05, + "loss": 1.7689, + "step": 6881 + }, + { + "epoch": 0.5180375995031897, + "grad_norm": 5.851943492889404, + "learning_rate": 4.9664753004880134e-05, + "loss": 1.8395, + "step": 6882 + }, + { + "epoch": 0.5181128737838498, + "grad_norm": 5.177786350250244, + "learning_rate": 4.96525623978305e-05, + "loss": 1.813, + "step": 6883 + }, + { + "epoch": 0.5181881480645101, + "grad_norm": 4.929501056671143, + "learning_rate": 4.964037181143501e-05, + "loss": 1.7638, + "step": 6884 + }, + { + "epoch": 0.5182634223451702, + "grad_norm": 4.608745574951172, + "learning_rate": 4.96281812464184e-05, + "loss": 2.28, + "step": 6885 + }, + { + "epoch": 0.5183386966258303, + "grad_norm": 5.799046993255615, + "learning_rate": 4.9615990703505335e-05, + "loss": 2.0126, + "step": 6886 + }, + { + "epoch": 0.5184139709064906, + "grad_norm": 4.481860160827637, + "learning_rate": 4.960380018342055e-05, + "loss": 1.6807, + "step": 6887 + }, + { + "epoch": 0.5184892451871507, + "grad_norm": 4.460608005523682, + "learning_rate": 4.9591609686888676e-05, + "loss": 1.9347, + "step": 6888 + }, + { + "epoch": 0.5185645194678108, + "grad_norm": 4.363182067871094, + "learning_rate": 4.9579419214634456e-05, + "loss": 1.8492, + "step": 6889 + }, + { + "epoch": 0.518639793748471, + "grad_norm": 4.726400375366211, + "learning_rate": 4.9567228767382535e-05, + "loss": 2.1296, + "step": 6890 + }, + { + "epoch": 0.5187150680291311, + "grad_norm": 5.13202428817749, + "learning_rate": 4.955503834585763e-05, + "loss": 1.5846, + "step": 6891 + }, + { + "epoch": 0.5187903423097913, + "grad_norm": 4.812490940093994, + "learning_rate": 4.954284795078442e-05, + "loss": 2.0728, + "step": 6892 + }, + { + "epoch": 0.5188656165904515, + "grad_norm": 3.7938036918640137, + "learning_rate": 4.95306575828876e-05, + "loss": 1.7599, + "step": 6893 + }, + { + "epoch": 0.5189408908711116, + "grad_norm": 3.8220901489257812, + "learning_rate": 4.951846724289182e-05, + "loss": 1.7535, + "step": 6894 + }, + { + "epoch": 0.5190161651517717, + "grad_norm": 5.791460037231445, + "learning_rate": 4.9506276931521785e-05, + "loss": 2.2117, + "step": 6895 + }, + { + "epoch": 0.519091439432432, + "grad_norm": 5.122953414916992, + "learning_rate": 4.949408664950219e-05, + "loss": 1.7235, + "step": 6896 + }, + { + "epoch": 0.5191667137130921, + "grad_norm": 6.580044269561768, + "learning_rate": 4.9481896397557665e-05, + "loss": 1.9738, + "step": 6897 + }, + { + "epoch": 0.5192419879937522, + "grad_norm": 3.792405843734741, + "learning_rate": 4.946970617641292e-05, + "loss": 1.9672, + "step": 6898 + }, + { + "epoch": 0.5193172622744124, + "grad_norm": 4.758399963378906, + "learning_rate": 4.9457515986792624e-05, + "loss": 1.5851, + "step": 6899 + }, + { + "epoch": 0.5193925365550726, + "grad_norm": 4.688673973083496, + "learning_rate": 4.944532582942146e-05, + "loss": 1.7311, + "step": 6900 + }, + { + "epoch": 0.5194678108357327, + "grad_norm": 7.336785793304443, + "learning_rate": 4.943313570502407e-05, + "loss": 2.415, + "step": 6901 + }, + { + "epoch": 0.5195430851163929, + "grad_norm": 5.997488975524902, + "learning_rate": 4.942094561432513e-05, + "loss": 2.4754, + "step": 6902 + }, + { + "epoch": 0.519618359397053, + "grad_norm": 6.137256145477295, + "learning_rate": 4.940875555804931e-05, + "loss": 2.0807, + "step": 6903 + }, + { + "epoch": 0.5196936336777132, + "grad_norm": 4.39750862121582, + "learning_rate": 4.93965655369213e-05, + "loss": 2.1995, + "step": 6904 + }, + { + "epoch": 0.5197689079583733, + "grad_norm": 3.6037113666534424, + "learning_rate": 4.9384375551665714e-05, + "loss": 1.7112, + "step": 6905 + }, + { + "epoch": 0.5198441822390335, + "grad_norm": 4.31144905090332, + "learning_rate": 4.9372185603007255e-05, + "loss": 1.9347, + "step": 6906 + }, + { + "epoch": 0.5199194565196936, + "grad_norm": 4.638736248016357, + "learning_rate": 4.935999569167054e-05, + "loss": 1.838, + "step": 6907 + }, + { + "epoch": 0.5199947308003537, + "grad_norm": 4.404640197753906, + "learning_rate": 4.934780581838026e-05, + "loss": 1.9759, + "step": 6908 + }, + { + "epoch": 0.520070005081014, + "grad_norm": 4.184910774230957, + "learning_rate": 4.9335615983861034e-05, + "loss": 2.0044, + "step": 6909 + }, + { + "epoch": 0.5201452793616741, + "grad_norm": 4.163177967071533, + "learning_rate": 4.932342618883756e-05, + "loss": 1.8945, + "step": 6910 + }, + { + "epoch": 0.5202205536423342, + "grad_norm": 4.8875274658203125, + "learning_rate": 4.931123643403443e-05, + "loss": 1.8009, + "step": 6911 + }, + { + "epoch": 0.5202958279229944, + "grad_norm": 4.165733337402344, + "learning_rate": 4.9299046720176315e-05, + "loss": 1.812, + "step": 6912 + }, + { + "epoch": 0.5203711022036546, + "grad_norm": 4.415853023529053, + "learning_rate": 4.9286857047987886e-05, + "loss": 1.884, + "step": 6913 + }, + { + "epoch": 0.5204463764843147, + "grad_norm": 5.037762641906738, + "learning_rate": 4.927466741819373e-05, + "loss": 1.9259, + "step": 6914 + }, + { + "epoch": 0.5205216507649749, + "grad_norm": 4.049993515014648, + "learning_rate": 4.926247783151855e-05, + "loss": 1.8196, + "step": 6915 + }, + { + "epoch": 0.520596925045635, + "grad_norm": 7.728363990783691, + "learning_rate": 4.925028828868693e-05, + "loss": 1.8621, + "step": 6916 + }, + { + "epoch": 0.5206721993262952, + "grad_norm": 4.262650489807129, + "learning_rate": 4.9238098790423526e-05, + "loss": 1.9102, + "step": 6917 + }, + { + "epoch": 0.5207474736069554, + "grad_norm": 5.607816219329834, + "learning_rate": 4.922590933745295e-05, + "loss": 2.0538, + "step": 6918 + }, + { + "epoch": 0.5208227478876155, + "grad_norm": 4.166906356811523, + "learning_rate": 4.921371993049985e-05, + "loss": 1.9765, + "step": 6919 + }, + { + "epoch": 0.5208980221682756, + "grad_norm": 4.375038146972656, + "learning_rate": 4.9201530570288834e-05, + "loss": 2.2206, + "step": 6920 + }, + { + "epoch": 0.5209732964489359, + "grad_norm": 3.7139694690704346, + "learning_rate": 4.918934125754456e-05, + "loss": 1.5156, + "step": 6921 + }, + { + "epoch": 0.521048570729596, + "grad_norm": 4.020748615264893, + "learning_rate": 4.917715199299159e-05, + "loss": 1.7546, + "step": 6922 + }, + { + "epoch": 0.5211238450102561, + "grad_norm": 4.18602180480957, + "learning_rate": 4.9164962777354605e-05, + "loss": 1.8005, + "step": 6923 + }, + { + "epoch": 0.5211991192909162, + "grad_norm": 3.9706244468688965, + "learning_rate": 4.915277361135817e-05, + "loss": 1.7741, + "step": 6924 + }, + { + "epoch": 0.5212743935715765, + "grad_norm": 3.938563346862793, + "learning_rate": 4.914058449572691e-05, + "loss": 1.8682, + "step": 6925 + }, + { + "epoch": 0.5213496678522366, + "grad_norm": 4.620153427124023, + "learning_rate": 4.912839543118543e-05, + "loss": 2.4686, + "step": 6926 + }, + { + "epoch": 0.5214249421328967, + "grad_norm": 5.794540882110596, + "learning_rate": 4.9116206418458364e-05, + "loss": 1.8652, + "step": 6927 + }, + { + "epoch": 0.5215002164135569, + "grad_norm": 4.430908679962158, + "learning_rate": 4.910401745827027e-05, + "loss": 1.9695, + "step": 6928 + }, + { + "epoch": 0.521575490694217, + "grad_norm": 5.343656539916992, + "learning_rate": 4.909182855134577e-05, + "loss": 2.2022, + "step": 6929 + }, + { + "epoch": 0.5216507649748772, + "grad_norm": 4.821630477905273, + "learning_rate": 4.907963969840946e-05, + "loss": 2.2725, + "step": 6930 + }, + { + "epoch": 0.5217260392555374, + "grad_norm": 4.798400402069092, + "learning_rate": 4.9067450900185926e-05, + "loss": 1.7424, + "step": 6931 + }, + { + "epoch": 0.5218013135361975, + "grad_norm": 5.2089667320251465, + "learning_rate": 4.9055262157399776e-05, + "loss": 1.7361, + "step": 6932 + }, + { + "epoch": 0.5218765878168576, + "grad_norm": 5.804737567901611, + "learning_rate": 4.9043073470775556e-05, + "loss": 1.9865, + "step": 6933 + }, + { + "epoch": 0.5219518620975179, + "grad_norm": 5.477221965789795, + "learning_rate": 4.90308848410379e-05, + "loss": 1.9161, + "step": 6934 + }, + { + "epoch": 0.522027136378178, + "grad_norm": 4.658236026763916, + "learning_rate": 4.901869626891133e-05, + "loss": 1.8658, + "step": 6935 + }, + { + "epoch": 0.5221024106588381, + "grad_norm": 4.555919647216797, + "learning_rate": 4.900650775512047e-05, + "loss": 2.1852, + "step": 6936 + }, + { + "epoch": 0.5221776849394983, + "grad_norm": 4.156063556671143, + "learning_rate": 4.8994319300389865e-05, + "loss": 2.008, + "step": 6937 + }, + { + "epoch": 0.5222529592201585, + "grad_norm": 3.579723358154297, + "learning_rate": 4.8982130905444104e-05, + "loss": 1.888, + "step": 6938 + }, + { + "epoch": 0.5223282335008186, + "grad_norm": 5.1500420570373535, + "learning_rate": 4.896994257100773e-05, + "loss": 1.9456, + "step": 6939 + }, + { + "epoch": 0.5224035077814788, + "grad_norm": 4.530655384063721, + "learning_rate": 4.8957754297805314e-05, + "loss": 2.0376, + "step": 6940 + }, + { + "epoch": 0.5224787820621389, + "grad_norm": 5.194767951965332, + "learning_rate": 4.894556608656141e-05, + "loss": 2.1867, + "step": 6941 + }, + { + "epoch": 0.5225540563427991, + "grad_norm": 4.082827091217041, + "learning_rate": 4.89333779380006e-05, + "loss": 2.2868, + "step": 6942 + }, + { + "epoch": 0.5226293306234592, + "grad_norm": 5.288009166717529, + "learning_rate": 4.892118985284738e-05, + "loss": 1.65, + "step": 6943 + }, + { + "epoch": 0.5227046049041194, + "grad_norm": 3.6602885723114014, + "learning_rate": 4.890900183182633e-05, + "loss": 1.9536, + "step": 6944 + }, + { + "epoch": 0.5227798791847795, + "grad_norm": 5.31079626083374, + "learning_rate": 4.889681387566202e-05, + "loss": 1.7908, + "step": 6945 + }, + { + "epoch": 0.5228551534654396, + "grad_norm": 5.516058921813965, + "learning_rate": 4.888462598507893e-05, + "loss": 1.7786, + "step": 6946 + }, + { + "epoch": 0.5229304277460999, + "grad_norm": 5.667055130004883, + "learning_rate": 4.887243816080165e-05, + "loss": 2.0693, + "step": 6947 + }, + { + "epoch": 0.52300570202676, + "grad_norm": 3.468761444091797, + "learning_rate": 4.8860250403554665e-05, + "loss": 1.7268, + "step": 6948 + }, + { + "epoch": 0.5230809763074201, + "grad_norm": 3.6433684825897217, + "learning_rate": 4.8848062714062544e-05, + "loss": 1.8313, + "step": 6949 + }, + { + "epoch": 0.5231562505880804, + "grad_norm": 4.384222030639648, + "learning_rate": 4.8835875093049764e-05, + "loss": 2.0591, + "step": 6950 + }, + { + "epoch": 0.5232315248687405, + "grad_norm": 4.685346603393555, + "learning_rate": 4.8823687541240896e-05, + "loss": 1.9477, + "step": 6951 + }, + { + "epoch": 0.5233067991494006, + "grad_norm": 3.928412437438965, + "learning_rate": 4.881150005936041e-05, + "loss": 1.8924, + "step": 6952 + }, + { + "epoch": 0.5233820734300608, + "grad_norm": 4.876494884490967, + "learning_rate": 4.879931264813284e-05, + "loss": 1.82, + "step": 6953 + }, + { + "epoch": 0.5234573477107209, + "grad_norm": 4.825035095214844, + "learning_rate": 4.8787125308282684e-05, + "loss": 1.8536, + "step": 6954 + }, + { + "epoch": 0.5235326219913811, + "grad_norm": 4.440576076507568, + "learning_rate": 4.877493804053446e-05, + "loss": 1.8652, + "step": 6955 + }, + { + "epoch": 0.5236078962720413, + "grad_norm": 3.434795618057251, + "learning_rate": 4.8762750845612646e-05, + "loss": 1.9216, + "step": 6956 + }, + { + "epoch": 0.5236831705527014, + "grad_norm": 3.4492878913879395, + "learning_rate": 4.8750563724241745e-05, + "loss": 1.9036, + "step": 6957 + }, + { + "epoch": 0.5237584448333615, + "grad_norm": 3.8548552989959717, + "learning_rate": 4.8738376677146234e-05, + "loss": 1.57, + "step": 6958 + }, + { + "epoch": 0.5238337191140218, + "grad_norm": 3.8144750595092773, + "learning_rate": 4.872618970505063e-05, + "loss": 1.8975, + "step": 6959 + }, + { + "epoch": 0.5239089933946819, + "grad_norm": 4.973082542419434, + "learning_rate": 4.8714002808679384e-05, + "loss": 1.7708, + "step": 6960 + }, + { + "epoch": 0.523984267675342, + "grad_norm": 4.822205066680908, + "learning_rate": 4.870181598875697e-05, + "loss": 1.9087, + "step": 6961 + }, + { + "epoch": 0.5240595419560021, + "grad_norm": 3.8845839500427246, + "learning_rate": 4.8689629246007894e-05, + "loss": 1.9338, + "step": 6962 + }, + { + "epoch": 0.5241348162366624, + "grad_norm": 4.4362053871154785, + "learning_rate": 4.867744258115658e-05, + "loss": 1.9343, + "step": 6963 + }, + { + "epoch": 0.5242100905173225, + "grad_norm": 4.53852653503418, + "learning_rate": 4.8665255994927515e-05, + "loss": 1.8991, + "step": 6964 + }, + { + "epoch": 0.5242853647979826, + "grad_norm": 4.749608039855957, + "learning_rate": 4.865306948804515e-05, + "loss": 1.8591, + "step": 6965 + }, + { + "epoch": 0.5243606390786428, + "grad_norm": 3.6270833015441895, + "learning_rate": 4.8640883061233954e-05, + "loss": 1.7171, + "step": 6966 + }, + { + "epoch": 0.524435913359303, + "grad_norm": 5.376784801483154, + "learning_rate": 4.8628696715218344e-05, + "loss": 1.8227, + "step": 6967 + }, + { + "epoch": 0.5245111876399631, + "grad_norm": 5.203309059143066, + "learning_rate": 4.8616510450722784e-05, + "loss": 1.6976, + "step": 6968 + }, + { + "epoch": 0.5245864619206233, + "grad_norm": 4.017502307891846, + "learning_rate": 4.860432426847171e-05, + "loss": 2.0636, + "step": 6969 + }, + { + "epoch": 0.5246617362012834, + "grad_norm": 4.973633766174316, + "learning_rate": 4.8592138169189555e-05, + "loss": 1.957, + "step": 6970 + }, + { + "epoch": 0.5247370104819435, + "grad_norm": 3.9245617389678955, + "learning_rate": 4.857995215360073e-05, + "loss": 1.8552, + "step": 6971 + }, + { + "epoch": 0.5248122847626038, + "grad_norm": 4.092756271362305, + "learning_rate": 4.8567766222429706e-05, + "loss": 1.7892, + "step": 6972 + }, + { + "epoch": 0.5248875590432639, + "grad_norm": 5.6921892166137695, + "learning_rate": 4.855558037640085e-05, + "loss": 2.5466, + "step": 6973 + }, + { + "epoch": 0.524962833323924, + "grad_norm": 4.092710018157959, + "learning_rate": 4.85433946162386e-05, + "loss": 2.1661, + "step": 6974 + }, + { + "epoch": 0.5250381076045842, + "grad_norm": 4.116332530975342, + "learning_rate": 4.853120894266735e-05, + "loss": 1.848, + "step": 6975 + }, + { + "epoch": 0.5251133818852444, + "grad_norm": 5.287973880767822, + "learning_rate": 4.8519023356411505e-05, + "loss": 1.8983, + "step": 6976 + }, + { + "epoch": 0.5251886561659045, + "grad_norm": 3.953367233276367, + "learning_rate": 4.850683785819551e-05, + "loss": 1.7517, + "step": 6977 + }, + { + "epoch": 0.5252639304465647, + "grad_norm": 3.8583693504333496, + "learning_rate": 4.849465244874369e-05, + "loss": 1.8783, + "step": 6978 + }, + { + "epoch": 0.5253392047272248, + "grad_norm": 5.788880825042725, + "learning_rate": 4.848246712878048e-05, + "loss": 1.9744, + "step": 6979 + }, + { + "epoch": 0.525414479007885, + "grad_norm": 5.482563018798828, + "learning_rate": 4.847028189903024e-05, + "loss": 2.1148, + "step": 6980 + }, + { + "epoch": 0.5254897532885452, + "grad_norm": 7.347797393798828, + "learning_rate": 4.845809676021735e-05, + "loss": 2.3018, + "step": 6981 + }, + { + "epoch": 0.5255650275692053, + "grad_norm": 4.234721660614014, + "learning_rate": 4.8445911713066176e-05, + "loss": 1.7296, + "step": 6982 + }, + { + "epoch": 0.5256403018498654, + "grad_norm": 4.58505916595459, + "learning_rate": 4.8433726758301104e-05, + "loss": 1.9592, + "step": 6983 + }, + { + "epoch": 0.5257155761305256, + "grad_norm": 4.317677974700928, + "learning_rate": 4.8421541896646475e-05, + "loss": 1.8073, + "step": 6984 + }, + { + "epoch": 0.5257908504111858, + "grad_norm": 5.643073081970215, + "learning_rate": 4.840935712882666e-05, + "loss": 1.9062, + "step": 6985 + }, + { + "epoch": 0.5258661246918459, + "grad_norm": 5.233477592468262, + "learning_rate": 4.839717245556597e-05, + "loss": 1.9583, + "step": 6986 + }, + { + "epoch": 0.525941398972506, + "grad_norm": 4.624270439147949, + "learning_rate": 4.8384987877588823e-05, + "loss": 1.8511, + "step": 6987 + }, + { + "epoch": 0.5260166732531663, + "grad_norm": 4.989600658416748, + "learning_rate": 4.8372803395619484e-05, + "loss": 1.9322, + "step": 6988 + }, + { + "epoch": 0.5260919475338264, + "grad_norm": 6.288695812225342, + "learning_rate": 4.836061901038233e-05, + "loss": 2.4129, + "step": 6989 + }, + { + "epoch": 0.5261672218144865, + "grad_norm": 4.117554187774658, + "learning_rate": 4.834843472260165e-05, + "loss": 1.9719, + "step": 6990 + }, + { + "epoch": 0.5262424960951467, + "grad_norm": 4.179305553436279, + "learning_rate": 4.83362505330018e-05, + "loss": 2.1038, + "step": 6991 + }, + { + "epoch": 0.5263177703758068, + "grad_norm": 4.1908793449401855, + "learning_rate": 4.832406644230708e-05, + "loss": 2.2383, + "step": 6992 + }, + { + "epoch": 0.526393044656467, + "grad_norm": 4.188826560974121, + "learning_rate": 4.831188245124179e-05, + "loss": 1.8981, + "step": 6993 + }, + { + "epoch": 0.5264683189371272, + "grad_norm": 4.018468379974365, + "learning_rate": 4.829969856053027e-05, + "loss": 2.1641, + "step": 6994 + }, + { + "epoch": 0.5265435932177873, + "grad_norm": 3.9451262950897217, + "learning_rate": 4.8287514770896774e-05, + "loss": 1.8916, + "step": 6995 + }, + { + "epoch": 0.5266188674984474, + "grad_norm": 4.334110260009766, + "learning_rate": 4.827533108306563e-05, + "loss": 1.8285, + "step": 6996 + }, + { + "epoch": 0.5266941417791077, + "grad_norm": 3.5793375968933105, + "learning_rate": 4.826314749776108e-05, + "loss": 1.9938, + "step": 6997 + }, + { + "epoch": 0.5267694160597678, + "grad_norm": 9.41827392578125, + "learning_rate": 4.825096401570744e-05, + "loss": 1.732, + "step": 6998 + }, + { + "epoch": 0.5268446903404279, + "grad_norm": 4.204832077026367, + "learning_rate": 4.823878063762896e-05, + "loss": 2.0474, + "step": 6999 + }, + { + "epoch": 0.5269199646210881, + "grad_norm": 4.062893390655518, + "learning_rate": 4.822659736424994e-05, + "loss": 1.8633, + "step": 7000 + }, + { + "epoch": 0.5269952389017483, + "grad_norm": 4.513487815856934, + "learning_rate": 4.82144141962946e-05, + "loss": 1.5433, + "step": 7001 + }, + { + "epoch": 0.5270705131824084, + "grad_norm": 3.875760555267334, + "learning_rate": 4.820223113448722e-05, + "loss": 1.7221, + "step": 7002 + }, + { + "epoch": 0.5271457874630685, + "grad_norm": 3.7595367431640625, + "learning_rate": 4.819004817955203e-05, + "loss": 1.8335, + "step": 7003 + }, + { + "epoch": 0.5272210617437287, + "grad_norm": 5.465179443359375, + "learning_rate": 4.8177865332213304e-05, + "loss": 1.7907, + "step": 7004 + }, + { + "epoch": 0.5272963360243889, + "grad_norm": 6.533691883087158, + "learning_rate": 4.8165682593195234e-05, + "loss": 1.7674, + "step": 7005 + }, + { + "epoch": 0.527371610305049, + "grad_norm": 5.1197381019592285, + "learning_rate": 4.815349996322209e-05, + "loss": 1.5915, + "step": 7006 + }, + { + "epoch": 0.5274468845857092, + "grad_norm": 4.613940715789795, + "learning_rate": 4.814131744301806e-05, + "loss": 2.181, + "step": 7007 + }, + { + "epoch": 0.5275221588663693, + "grad_norm": 3.5230231285095215, + "learning_rate": 4.8129135033307364e-05, + "loss": 1.5449, + "step": 7008 + }, + { + "epoch": 0.5275974331470294, + "grad_norm": 3.9887583255767822, + "learning_rate": 4.8116952734814244e-05, + "loss": 2.1319, + "step": 7009 + }, + { + "epoch": 0.5276727074276897, + "grad_norm": 5.512673377990723, + "learning_rate": 4.810477054826286e-05, + "loss": 2.1013, + "step": 7010 + }, + { + "epoch": 0.5277479817083498, + "grad_norm": 5.702235221862793, + "learning_rate": 4.8092588474377456e-05, + "loss": 1.9598, + "step": 7011 + }, + { + "epoch": 0.5278232559890099, + "grad_norm": 5.409027099609375, + "learning_rate": 4.8080406513882175e-05, + "loss": 2.2694, + "step": 7012 + }, + { + "epoch": 0.5278985302696702, + "grad_norm": 5.838965892791748, + "learning_rate": 4.8068224667501225e-05, + "loss": 1.84, + "step": 7013 + }, + { + "epoch": 0.5279738045503303, + "grad_norm": 3.913105010986328, + "learning_rate": 4.8056042935958766e-05, + "loss": 1.8403, + "step": 7014 + }, + { + "epoch": 0.5280490788309904, + "grad_norm": 4.155385494232178, + "learning_rate": 4.8043861319979e-05, + "loss": 1.6537, + "step": 7015 + }, + { + "epoch": 0.5281243531116506, + "grad_norm": 3.395749092102051, + "learning_rate": 4.8031679820286044e-05, + "loss": 1.6948, + "step": 7016 + }, + { + "epoch": 0.5281996273923107, + "grad_norm": 5.627654552459717, + "learning_rate": 4.80194984376041e-05, + "loss": 1.7634, + "step": 7017 + }, + { + "epoch": 0.5282749016729709, + "grad_norm": 4.569192886352539, + "learning_rate": 4.800731717265726e-05, + "loss": 1.7557, + "step": 7018 + }, + { + "epoch": 0.5283501759536311, + "grad_norm": 6.035717010498047, + "learning_rate": 4.7995136026169716e-05, + "loss": 2.404, + "step": 7019 + }, + { + "epoch": 0.5284254502342912, + "grad_norm": 5.31645393371582, + "learning_rate": 4.798295499886557e-05, + "loss": 1.7557, + "step": 7020 + }, + { + "epoch": 0.5285007245149513, + "grad_norm": 4.1376142501831055, + "learning_rate": 4.797077409146898e-05, + "loss": 1.7917, + "step": 7021 + }, + { + "epoch": 0.5285759987956115, + "grad_norm": 7.303235054016113, + "learning_rate": 4.795859330470402e-05, + "loss": 2.0473, + "step": 7022 + }, + { + "epoch": 0.5286512730762717, + "grad_norm": 4.449717044830322, + "learning_rate": 4.794641263929487e-05, + "loss": 1.9801, + "step": 7023 + }, + { + "epoch": 0.5287265473569318, + "grad_norm": 3.7318694591522217, + "learning_rate": 4.793423209596557e-05, + "loss": 1.8471, + "step": 7024 + }, + { + "epoch": 0.5288018216375919, + "grad_norm": 4.434027194976807, + "learning_rate": 4.792205167544023e-05, + "loss": 1.9011, + "step": 7025 + }, + { + "epoch": 0.5288770959182522, + "grad_norm": 4.1048431396484375, + "learning_rate": 4.790987137844298e-05, + "loss": 1.7617, + "step": 7026 + }, + { + "epoch": 0.5289523701989123, + "grad_norm": 4.809249401092529, + "learning_rate": 4.789769120569786e-05, + "loss": 1.6215, + "step": 7027 + }, + { + "epoch": 0.5290276444795724, + "grad_norm": 3.929060697555542, + "learning_rate": 4.788551115792899e-05, + "loss": 1.5432, + "step": 7028 + }, + { + "epoch": 0.5291029187602326, + "grad_norm": 4.522894382476807, + "learning_rate": 4.7873331235860396e-05, + "loss": 1.9355, + "step": 7029 + }, + { + "epoch": 0.5291781930408928, + "grad_norm": 3.6166481971740723, + "learning_rate": 4.786115144021617e-05, + "loss": 1.71, + "step": 7030 + }, + { + "epoch": 0.5292534673215529, + "grad_norm": 3.611379623413086, + "learning_rate": 4.784897177172034e-05, + "loss": 1.973, + "step": 7031 + }, + { + "epoch": 0.5293287416022131, + "grad_norm": 7.510868072509766, + "learning_rate": 4.783679223109699e-05, + "loss": 2.3842, + "step": 7032 + }, + { + "epoch": 0.5294040158828732, + "grad_norm": 4.0979485511779785, + "learning_rate": 4.782461281907011e-05, + "loss": 1.7509, + "step": 7033 + }, + { + "epoch": 0.5294792901635333, + "grad_norm": 4.53986120223999, + "learning_rate": 4.781243353636378e-05, + "loss": 1.5495, + "step": 7034 + }, + { + "epoch": 0.5295545644441936, + "grad_norm": 3.8875980377197266, + "learning_rate": 4.780025438370197e-05, + "loss": 2.1569, + "step": 7035 + }, + { + "epoch": 0.5296298387248537, + "grad_norm": 6.213791847229004, + "learning_rate": 4.7788075361808743e-05, + "loss": 2.0291, + "step": 7036 + }, + { + "epoch": 0.5297051130055138, + "grad_norm": 4.641781806945801, + "learning_rate": 4.7775896471408076e-05, + "loss": 2.1041, + "step": 7037 + }, + { + "epoch": 0.529780387286174, + "grad_norm": 3.900177478790283, + "learning_rate": 4.776371771322401e-05, + "loss": 1.7114, + "step": 7038 + }, + { + "epoch": 0.5298556615668342, + "grad_norm": 4.5729146003723145, + "learning_rate": 4.7751539087980484e-05, + "loss": 1.9908, + "step": 7039 + }, + { + "epoch": 0.5299309358474943, + "grad_norm": 7.400148391723633, + "learning_rate": 4.7739360596401505e-05, + "loss": 1.7709, + "step": 7040 + }, + { + "epoch": 0.5300062101281544, + "grad_norm": 4.948382377624512, + "learning_rate": 4.7727182239211066e-05, + "loss": 2.2318, + "step": 7041 + }, + { + "epoch": 0.5300814844088146, + "grad_norm": 4.408289432525635, + "learning_rate": 4.771500401713311e-05, + "loss": 1.8992, + "step": 7042 + }, + { + "epoch": 0.5301567586894748, + "grad_norm": 5.357378959655762, + "learning_rate": 4.770282593089162e-05, + "loss": 1.9719, + "step": 7043 + }, + { + "epoch": 0.5302320329701349, + "grad_norm": 5.367403030395508, + "learning_rate": 4.769064798121054e-05, + "loss": 1.9242, + "step": 7044 + }, + { + "epoch": 0.5303073072507951, + "grad_norm": 5.554671287536621, + "learning_rate": 4.767847016881381e-05, + "loss": 1.9939, + "step": 7045 + }, + { + "epoch": 0.5303825815314552, + "grad_norm": 6.019415855407715, + "learning_rate": 4.7666292494425355e-05, + "loss": 1.8112, + "step": 7046 + }, + { + "epoch": 0.5304578558121154, + "grad_norm": 10.823569297790527, + "learning_rate": 4.765411495876912e-05, + "loss": 2.0026, + "step": 7047 + }, + { + "epoch": 0.5305331300927756, + "grad_norm": 4.234469890594482, + "learning_rate": 4.764193756256902e-05, + "loss": 1.8089, + "step": 7048 + }, + { + "epoch": 0.5306084043734357, + "grad_norm": 4.066115856170654, + "learning_rate": 4.7629760306548975e-05, + "loss": 1.7608, + "step": 7049 + }, + { + "epoch": 0.5306836786540958, + "grad_norm": 3.949735403060913, + "learning_rate": 4.761758319143287e-05, + "loss": 1.8622, + "step": 7050 + }, + { + "epoch": 0.5307589529347561, + "grad_norm": 2.902693033218384, + "learning_rate": 4.760540621794462e-05, + "loss": 1.6411, + "step": 7051 + }, + { + "epoch": 0.5308342272154162, + "grad_norm": 4.166128635406494, + "learning_rate": 4.759322938680808e-05, + "loss": 1.8505, + "step": 7052 + }, + { + "epoch": 0.5309095014960763, + "grad_norm": 4.4644927978515625, + "learning_rate": 4.758105269874716e-05, + "loss": 1.788, + "step": 7053 + }, + { + "epoch": 0.5309847757767365, + "grad_norm": 3.843764305114746, + "learning_rate": 4.75688761544857e-05, + "loss": 1.7032, + "step": 7054 + }, + { + "epoch": 0.5310600500573966, + "grad_norm": 3.5982511043548584, + "learning_rate": 4.7556699754747605e-05, + "loss": 1.9141, + "step": 7055 + }, + { + "epoch": 0.5311353243380568, + "grad_norm": 4.304314613342285, + "learning_rate": 4.754452350025668e-05, + "loss": 1.9731, + "step": 7056 + }, + { + "epoch": 0.531210598618717, + "grad_norm": 5.979513168334961, + "learning_rate": 4.753234739173678e-05, + "loss": 1.7079, + "step": 7057 + }, + { + "epoch": 0.5312858728993771, + "grad_norm": 5.045577049255371, + "learning_rate": 4.7520171429911755e-05, + "loss": 2.1946, + "step": 7058 + }, + { + "epoch": 0.5313611471800372, + "grad_norm": 3.9108498096466064, + "learning_rate": 4.750799561550542e-05, + "loss": 1.944, + "step": 7059 + }, + { + "epoch": 0.5314364214606974, + "grad_norm": 5.862430095672607, + "learning_rate": 4.749581994924161e-05, + "loss": 2.0753, + "step": 7060 + }, + { + "epoch": 0.5315116957413576, + "grad_norm": 4.302278518676758, + "learning_rate": 4.748364443184411e-05, + "loss": 2.3075, + "step": 7061 + }, + { + "epoch": 0.5315869700220177, + "grad_norm": 5.241310119628906, + "learning_rate": 4.747146906403674e-05, + "loss": 1.8098, + "step": 7062 + }, + { + "epoch": 0.5316622443026778, + "grad_norm": 4.788744926452637, + "learning_rate": 4.7459293846543256e-05, + "loss": 2.1982, + "step": 7063 + }, + { + "epoch": 0.5317375185833381, + "grad_norm": 6.442987442016602, + "learning_rate": 4.744711878008748e-05, + "loss": 2.2314, + "step": 7064 + }, + { + "epoch": 0.5318127928639982, + "grad_norm": 3.9667046070098877, + "learning_rate": 4.743494386539316e-05, + "loss": 1.7363, + "step": 7065 + }, + { + "epoch": 0.5318880671446583, + "grad_norm": 3.9151365756988525, + "learning_rate": 4.7422769103184074e-05, + "loss": 2.1363, + "step": 7066 + }, + { + "epoch": 0.5319633414253185, + "grad_norm": 5.04141092300415, + "learning_rate": 4.741059449418396e-05, + "loss": 2.2775, + "step": 7067 + }, + { + "epoch": 0.5320386157059787, + "grad_norm": 4.230179786682129, + "learning_rate": 4.7398420039116576e-05, + "loss": 2.3637, + "step": 7068 + }, + { + "epoch": 0.5321138899866388, + "grad_norm": 3.9334120750427246, + "learning_rate": 4.738624573870565e-05, + "loss": 1.5845, + "step": 7069 + }, + { + "epoch": 0.532189164267299, + "grad_norm": 4.179913520812988, + "learning_rate": 4.737407159367494e-05, + "loss": 1.6291, + "step": 7070 + }, + { + "epoch": 0.5322644385479591, + "grad_norm": 3.8906409740448, + "learning_rate": 4.7361897604748114e-05, + "loss": 1.8825, + "step": 7071 + }, + { + "epoch": 0.5323397128286192, + "grad_norm": 4.138206958770752, + "learning_rate": 4.734972377264892e-05, + "loss": 2.0115, + "step": 7072 + }, + { + "epoch": 0.5324149871092795, + "grad_norm": 4.013099670410156, + "learning_rate": 4.7337550098101034e-05, + "loss": 1.8908, + "step": 7073 + }, + { + "epoch": 0.5324902613899396, + "grad_norm": 7.243107318878174, + "learning_rate": 4.732537658182815e-05, + "loss": 1.7049, + "step": 7074 + }, + { + "epoch": 0.5325655356705997, + "grad_norm": 7.560256481170654, + "learning_rate": 4.7313203224553965e-05, + "loss": 2.1741, + "step": 7075 + }, + { + "epoch": 0.53264080995126, + "grad_norm": 4.1712799072265625, + "learning_rate": 4.730103002700213e-05, + "loss": 1.7437, + "step": 7076 + }, + { + "epoch": 0.5327160842319201, + "grad_norm": 4.552664756774902, + "learning_rate": 4.7288856989896336e-05, + "loss": 2.0069, + "step": 7077 + }, + { + "epoch": 0.5327913585125802, + "grad_norm": 3.638277292251587, + "learning_rate": 4.727668411396019e-05, + "loss": 1.4366, + "step": 7078 + }, + { + "epoch": 0.5328666327932404, + "grad_norm": 3.880326509475708, + "learning_rate": 4.726451139991738e-05, + "loss": 1.7208, + "step": 7079 + }, + { + "epoch": 0.5329419070739005, + "grad_norm": 4.694003105163574, + "learning_rate": 4.725233884849151e-05, + "loss": 1.7411, + "step": 7080 + }, + { + "epoch": 0.5330171813545607, + "grad_norm": 4.782301902770996, + "learning_rate": 4.724016646040621e-05, + "loss": 1.9971, + "step": 7081 + }, + { + "epoch": 0.5330924556352208, + "grad_norm": 5.097995281219482, + "learning_rate": 4.722799423638509e-05, + "loss": 1.8428, + "step": 7082 + }, + { + "epoch": 0.533167729915881, + "grad_norm": 4.44391393661499, + "learning_rate": 4.721582217715177e-05, + "loss": 1.8378, + "step": 7083 + }, + { + "epoch": 0.5332430041965411, + "grad_norm": 4.077167510986328, + "learning_rate": 4.720365028342982e-05, + "loss": 1.937, + "step": 7084 + }, + { + "epoch": 0.5333182784772013, + "grad_norm": 4.596120834350586, + "learning_rate": 4.719147855594285e-05, + "loss": 2.088, + "step": 7085 + }, + { + "epoch": 0.5333935527578615, + "grad_norm": 10.432188987731934, + "learning_rate": 4.7179306995414404e-05, + "loss": 1.938, + "step": 7086 + }, + { + "epoch": 0.5334688270385216, + "grad_norm": 7.165554046630859, + "learning_rate": 4.716713560256809e-05, + "loss": 1.8936, + "step": 7087 + }, + { + "epoch": 0.5335441013191817, + "grad_norm": 5.582726001739502, + "learning_rate": 4.715496437812741e-05, + "loss": 1.9756, + "step": 7088 + }, + { + "epoch": 0.533619375599842, + "grad_norm": 7.4014177322387695, + "learning_rate": 4.714279332281594e-05, + "loss": 1.8441, + "step": 7089 + }, + { + "epoch": 0.5336946498805021, + "grad_norm": 4.264235496520996, + "learning_rate": 4.713062243735722e-05, + "loss": 2.0099, + "step": 7090 + }, + { + "epoch": 0.5337699241611622, + "grad_norm": 5.126283168792725, + "learning_rate": 4.711845172247475e-05, + "loss": 1.7007, + "step": 7091 + }, + { + "epoch": 0.5338451984418224, + "grad_norm": 5.324197769165039, + "learning_rate": 4.710628117889207e-05, + "loss": 1.8389, + "step": 7092 + }, + { + "epoch": 0.5339204727224826, + "grad_norm": 5.139522552490234, + "learning_rate": 4.7094110807332656e-05, + "loss": 1.8308, + "step": 7093 + }, + { + "epoch": 0.5339957470031427, + "grad_norm": 4.007652282714844, + "learning_rate": 4.708194060852004e-05, + "loss": 2.0468, + "step": 7094 + }, + { + "epoch": 0.5340710212838029, + "grad_norm": 3.9862287044525146, + "learning_rate": 4.706977058317766e-05, + "loss": 1.9759, + "step": 7095 + }, + { + "epoch": 0.534146295564463, + "grad_norm": 6.065804958343506, + "learning_rate": 4.7057600732029016e-05, + "loss": 1.84, + "step": 7096 + }, + { + "epoch": 0.5342215698451231, + "grad_norm": 4.315358638763428, + "learning_rate": 4.704543105579757e-05, + "loss": 1.979, + "step": 7097 + }, + { + "epoch": 0.5342968441257834, + "grad_norm": 7.23746919631958, + "learning_rate": 4.7033261555206776e-05, + "loss": 2.1668, + "step": 7098 + }, + { + "epoch": 0.5343721184064435, + "grad_norm": 6.216946125030518, + "learning_rate": 4.702109223098005e-05, + "loss": 2.0449, + "step": 7099 + }, + { + "epoch": 0.5344473926871036, + "grad_norm": 5.135339260101318, + "learning_rate": 4.700892308384087e-05, + "loss": 1.714, + "step": 7100 + }, + { + "epoch": 0.5345226669677637, + "grad_norm": 5.694299221038818, + "learning_rate": 4.699675411451261e-05, + "loss": 2.1415, + "step": 7101 + }, + { + "epoch": 0.534597941248424, + "grad_norm": 3.72849440574646, + "learning_rate": 4.698458532371871e-05, + "loss": 1.7641, + "step": 7102 + }, + { + "epoch": 0.5346732155290841, + "grad_norm": 4.403263568878174, + "learning_rate": 4.6972416712182546e-05, + "loss": 1.9095, + "step": 7103 + }, + { + "epoch": 0.5347484898097442, + "grad_norm": 6.242741584777832, + "learning_rate": 4.696024828062755e-05, + "loss": 2.0923, + "step": 7104 + }, + { + "epoch": 0.5348237640904044, + "grad_norm": 5.412423610687256, + "learning_rate": 4.694808002977704e-05, + "loss": 2.0866, + "step": 7105 + }, + { + "epoch": 0.5348990383710646, + "grad_norm": 4.419037342071533, + "learning_rate": 4.6935911960354425e-05, + "loss": 1.7658, + "step": 7106 + }, + { + "epoch": 0.5349743126517247, + "grad_norm": 6.027182102203369, + "learning_rate": 4.692374407308307e-05, + "loss": 1.8826, + "step": 7107 + }, + { + "epoch": 0.5350495869323849, + "grad_norm": 5.010058403015137, + "learning_rate": 4.6911576368686284e-05, + "loss": 1.4985, + "step": 7108 + }, + { + "epoch": 0.535124861213045, + "grad_norm": 5.639893054962158, + "learning_rate": 4.689940884788743e-05, + "loss": 1.9841, + "step": 7109 + }, + { + "epoch": 0.5352001354937052, + "grad_norm": 5.386854648590088, + "learning_rate": 4.688724151140981e-05, + "loss": 1.6238, + "step": 7110 + }, + { + "epoch": 0.5352754097743654, + "grad_norm": 5.632213115692139, + "learning_rate": 4.6875074359976775e-05, + "loss": 2.0899, + "step": 7111 + }, + { + "epoch": 0.5353506840550255, + "grad_norm": 3.9364993572235107, + "learning_rate": 4.686290739431158e-05, + "loss": 1.9543, + "step": 7112 + }, + { + "epoch": 0.5354259583356856, + "grad_norm": 5.6435546875, + "learning_rate": 4.685074061513755e-05, + "loss": 2.5083, + "step": 7113 + }, + { + "epoch": 0.5355012326163459, + "grad_norm": 5.331766128540039, + "learning_rate": 4.6838574023177936e-05, + "loss": 1.8142, + "step": 7114 + }, + { + "epoch": 0.535576506897006, + "grad_norm": 5.100854873657227, + "learning_rate": 4.682640761915606e-05, + "loss": 2.1522, + "step": 7115 + }, + { + "epoch": 0.5356517811776661, + "grad_norm": 6.256421089172363, + "learning_rate": 4.6814241403795116e-05, + "loss": 2.0506, + "step": 7116 + }, + { + "epoch": 0.5357270554583263, + "grad_norm": 6.039792537689209, + "learning_rate": 4.6802075377818404e-05, + "loss": 2.0938, + "step": 7117 + }, + { + "epoch": 0.5358023297389864, + "grad_norm": 3.491872549057007, + "learning_rate": 4.678990954194911e-05, + "loss": 1.7932, + "step": 7118 + }, + { + "epoch": 0.5358776040196466, + "grad_norm": 4.274329662322998, + "learning_rate": 4.67777438969105e-05, + "loss": 1.9438, + "step": 7119 + }, + { + "epoch": 0.5359528783003067, + "grad_norm": 5.088064670562744, + "learning_rate": 4.676557844342576e-05, + "loss": 2.0418, + "step": 7120 + }, + { + "epoch": 0.5360281525809669, + "grad_norm": 5.919598579406738, + "learning_rate": 4.675341318221809e-05, + "loss": 1.9826, + "step": 7121 + }, + { + "epoch": 0.536103426861627, + "grad_norm": 5.451521396636963, + "learning_rate": 4.674124811401071e-05, + "loss": 1.7512, + "step": 7122 + }, + { + "epoch": 0.5361787011422872, + "grad_norm": 4.607489585876465, + "learning_rate": 4.6729083239526775e-05, + "loss": 1.8516, + "step": 7123 + }, + { + "epoch": 0.5362539754229474, + "grad_norm": 5.585968494415283, + "learning_rate": 4.671691855948947e-05, + "loss": 1.6594, + "step": 7124 + }, + { + "epoch": 0.5363292497036075, + "grad_norm": 3.7467923164367676, + "learning_rate": 4.670475407462191e-05, + "loss": 1.9381, + "step": 7125 + }, + { + "epoch": 0.5364045239842676, + "grad_norm": 4.839333534240723, + "learning_rate": 4.669258978564728e-05, + "loss": 2.0935, + "step": 7126 + }, + { + "epoch": 0.5364797982649279, + "grad_norm": 4.495025634765625, + "learning_rate": 4.6680425693288684e-05, + "loss": 2.2325, + "step": 7127 + }, + { + "epoch": 0.536555072545588, + "grad_norm": 5.829689025878906, + "learning_rate": 4.666826179826928e-05, + "loss": 1.8106, + "step": 7128 + }, + { + "epoch": 0.5366303468262481, + "grad_norm": 5.903088092803955, + "learning_rate": 4.665609810131213e-05, + "loss": 1.7813, + "step": 7129 + }, + { + "epoch": 0.5367056211069083, + "grad_norm": 4.563745021820068, + "learning_rate": 4.664393460314036e-05, + "loss": 1.8227, + "step": 7130 + }, + { + "epoch": 0.5367808953875685, + "grad_norm": 6.159015655517578, + "learning_rate": 4.663177130447705e-05, + "loss": 1.98, + "step": 7131 + }, + { + "epoch": 0.5368561696682286, + "grad_norm": 5.188133716583252, + "learning_rate": 4.6619608206045276e-05, + "loss": 2.3185, + "step": 7132 + }, + { + "epoch": 0.5369314439488888, + "grad_norm": 5.6095662117004395, + "learning_rate": 4.6607445308568085e-05, + "loss": 2.4495, + "step": 7133 + }, + { + "epoch": 0.5370067182295489, + "grad_norm": 4.729706287384033, + "learning_rate": 4.659528261276855e-05, + "loss": 2.1361, + "step": 7134 + }, + { + "epoch": 0.537081992510209, + "grad_norm": 3.586517810821533, + "learning_rate": 4.658312011936968e-05, + "loss": 1.9664, + "step": 7135 + }, + { + "epoch": 0.5371572667908693, + "grad_norm": 4.189297199249268, + "learning_rate": 4.6570957829094525e-05, + "loss": 2.0263, + "step": 7136 + }, + { + "epoch": 0.5372325410715294, + "grad_norm": 5.3675537109375, + "learning_rate": 4.655879574266607e-05, + "loss": 1.8416, + "step": 7137 + }, + { + "epoch": 0.5373078153521895, + "grad_norm": 3.8867509365081787, + "learning_rate": 4.6546633860807346e-05, + "loss": 1.8297, + "step": 7138 + }, + { + "epoch": 0.5373830896328496, + "grad_norm": 3.9739983081817627, + "learning_rate": 4.653447218424134e-05, + "loss": 1.9554, + "step": 7139 + }, + { + "epoch": 0.5374583639135099, + "grad_norm": 3.485842704772949, + "learning_rate": 4.6522310713690995e-05, + "loss": 1.8739, + "step": 7140 + }, + { + "epoch": 0.53753363819417, + "grad_norm": 4.629156112670898, + "learning_rate": 4.6510149449879325e-05, + "loss": 1.9451, + "step": 7141 + }, + { + "epoch": 0.5376089124748301, + "grad_norm": 4.9842634201049805, + "learning_rate": 4.649798839352923e-05, + "loss": 2.164, + "step": 7142 + }, + { + "epoch": 0.5376841867554903, + "grad_norm": 4.058955669403076, + "learning_rate": 4.648582754536372e-05, + "loss": 2.0392, + "step": 7143 + }, + { + "epoch": 0.5377594610361505, + "grad_norm": 4.076719760894775, + "learning_rate": 4.647366690610564e-05, + "loss": 1.7612, + "step": 7144 + }, + { + "epoch": 0.5378347353168106, + "grad_norm": 6.8890275955200195, + "learning_rate": 4.6461506476477966e-05, + "loss": 2.3213, + "step": 7145 + }, + { + "epoch": 0.5379100095974708, + "grad_norm": 5.026981830596924, + "learning_rate": 4.6449346257203555e-05, + "loss": 1.5731, + "step": 7146 + }, + { + "epoch": 0.5379852838781309, + "grad_norm": 4.281398296356201, + "learning_rate": 4.643718624900534e-05, + "loss": 1.8817, + "step": 7147 + }, + { + "epoch": 0.5380605581587911, + "grad_norm": 5.409975528717041, + "learning_rate": 4.642502645260617e-05, + "loss": 1.8751, + "step": 7148 + }, + { + "epoch": 0.5381358324394513, + "grad_norm": 4.666240692138672, + "learning_rate": 4.6412866868728935e-05, + "loss": 1.6206, + "step": 7149 + }, + { + "epoch": 0.5382111067201114, + "grad_norm": 4.69838285446167, + "learning_rate": 4.6400707498096454e-05, + "loss": 2.2947, + "step": 7150 + }, + { + "epoch": 0.5382863810007715, + "grad_norm": 4.841192245483398, + "learning_rate": 4.6388548341431604e-05, + "loss": 1.8125, + "step": 7151 + }, + { + "epoch": 0.5383616552814318, + "grad_norm": 3.94899320602417, + "learning_rate": 4.6376389399457176e-05, + "loss": 1.7008, + "step": 7152 + }, + { + "epoch": 0.5384369295620919, + "grad_norm": 6.47050666809082, + "learning_rate": 4.6364230672896e-05, + "loss": 2.2182, + "step": 7153 + }, + { + "epoch": 0.538512203842752, + "grad_norm": 4.819343090057373, + "learning_rate": 4.6352072162470883e-05, + "loss": 1.7317, + "step": 7154 + }, + { + "epoch": 0.5385874781234122, + "grad_norm": 5.318504810333252, + "learning_rate": 4.63399138689046e-05, + "loss": 1.8923, + "step": 7155 + }, + { + "epoch": 0.5386627524040724, + "grad_norm": 6.286028861999512, + "learning_rate": 4.632775579291996e-05, + "loss": 1.8324, + "step": 7156 + }, + { + "epoch": 0.5387380266847325, + "grad_norm": 4.735677242279053, + "learning_rate": 4.631559793523968e-05, + "loss": 2.0792, + "step": 7157 + }, + { + "epoch": 0.5388133009653926, + "grad_norm": 4.113296985626221, + "learning_rate": 4.630344029658654e-05, + "loss": 2.0041, + "step": 7158 + }, + { + "epoch": 0.5388885752460528, + "grad_norm": 5.619882583618164, + "learning_rate": 4.6291282877683254e-05, + "loss": 1.7686, + "step": 7159 + }, + { + "epoch": 0.538963849526713, + "grad_norm": 3.4414563179016113, + "learning_rate": 4.627912567925259e-05, + "loss": 2.162, + "step": 7160 + }, + { + "epoch": 0.5390391238073731, + "grad_norm": 5.1061110496521, + "learning_rate": 4.6266968702017195e-05, + "loss": 2.0211, + "step": 7161 + }, + { + "epoch": 0.5391143980880333, + "grad_norm": 4.122971534729004, + "learning_rate": 4.6254811946699824e-05, + "loss": 2.3782, + "step": 7162 + }, + { + "epoch": 0.5391896723686934, + "grad_norm": 5.279617786407471, + "learning_rate": 4.6242655414023125e-05, + "loss": 2.0438, + "step": 7163 + }, + { + "epoch": 0.5392649466493535, + "grad_norm": 6.048137664794922, + "learning_rate": 4.623049910470978e-05, + "loss": 1.844, + "step": 7164 + }, + { + "epoch": 0.5393402209300138, + "grad_norm": 3.968564987182617, + "learning_rate": 4.621834301948244e-05, + "loss": 1.7759, + "step": 7165 + }, + { + "epoch": 0.5394154952106739, + "grad_norm": 6.077966690063477, + "learning_rate": 4.620618715906378e-05, + "loss": 1.7141, + "step": 7166 + }, + { + "epoch": 0.539490769491334, + "grad_norm": 4.009443759918213, + "learning_rate": 4.619403152417638e-05, + "loss": 1.7804, + "step": 7167 + }, + { + "epoch": 0.5395660437719942, + "grad_norm": 6.092879295349121, + "learning_rate": 4.6181876115542904e-05, + "loss": 1.8143, + "step": 7168 + }, + { + "epoch": 0.5396413180526544, + "grad_norm": 5.466367721557617, + "learning_rate": 4.6169720933885924e-05, + "loss": 1.6723, + "step": 7169 + }, + { + "epoch": 0.5397165923333145, + "grad_norm": 4.8065900802612305, + "learning_rate": 4.6157565979928044e-05, + "loss": 2.0034, + "step": 7170 + }, + { + "epoch": 0.5397918666139747, + "grad_norm": 6.0215373039245605, + "learning_rate": 4.614541125439186e-05, + "loss": 1.9522, + "step": 7171 + }, + { + "epoch": 0.5398671408946348, + "grad_norm": 5.7492475509643555, + "learning_rate": 4.61332567579999e-05, + "loss": 2.1315, + "step": 7172 + }, + { + "epoch": 0.539942415175295, + "grad_norm": 5.747673511505127, + "learning_rate": 4.6121102491474754e-05, + "loss": 1.8017, + "step": 7173 + }, + { + "epoch": 0.5400176894559552, + "grad_norm": 4.093325138092041, + "learning_rate": 4.6108948455538906e-05, + "loss": 2.1799, + "step": 7174 + }, + { + "epoch": 0.5400929637366153, + "grad_norm": 6.3927531242370605, + "learning_rate": 4.609679465091493e-05, + "loss": 1.735, + "step": 7175 + }, + { + "epoch": 0.5401682380172754, + "grad_norm": 3.98288631439209, + "learning_rate": 4.608464107832529e-05, + "loss": 2.0449, + "step": 7176 + }, + { + "epoch": 0.5402435122979357, + "grad_norm": 5.237273693084717, + "learning_rate": 4.607248773849253e-05, + "loss": 2.0829, + "step": 7177 + }, + { + "epoch": 0.5403187865785958, + "grad_norm": 4.548027038574219, + "learning_rate": 4.606033463213908e-05, + "loss": 1.8221, + "step": 7178 + }, + { + "epoch": 0.5403940608592559, + "grad_norm": 6.00238561630249, + "learning_rate": 4.6048181759987456e-05, + "loss": 2.0311, + "step": 7179 + }, + { + "epoch": 0.540469335139916, + "grad_norm": 4.499464988708496, + "learning_rate": 4.603602912276007e-05, + "loss": 1.7508, + "step": 7180 + }, + { + "epoch": 0.5405446094205762, + "grad_norm": 4.544561386108398, + "learning_rate": 4.602387672117938e-05, + "loss": 1.911, + "step": 7181 + }, + { + "epoch": 0.5406198837012364, + "grad_norm": 4.576714515686035, + "learning_rate": 4.6011724555967806e-05, + "loss": 2.0198, + "step": 7182 + }, + { + "epoch": 0.5406951579818965, + "grad_norm": 5.893620014190674, + "learning_rate": 4.599957262784778e-05, + "loss": 1.744, + "step": 7183 + }, + { + "epoch": 0.5407704322625567, + "grad_norm": 4.704677104949951, + "learning_rate": 4.5987420937541664e-05, + "loss": 1.9483, + "step": 7184 + }, + { + "epoch": 0.5408457065432168, + "grad_norm": 6.924588680267334, + "learning_rate": 4.597526948577188e-05, + "loss": 1.8612, + "step": 7185 + }, + { + "epoch": 0.540920980823877, + "grad_norm": 4.043465614318848, + "learning_rate": 4.596311827326075e-05, + "loss": 1.8183, + "step": 7186 + }, + { + "epoch": 0.5409962551045372, + "grad_norm": 4.647791862487793, + "learning_rate": 4.595096730073066e-05, + "loss": 1.8423, + "step": 7187 + }, + { + "epoch": 0.5410715293851973, + "grad_norm": 4.878299713134766, + "learning_rate": 4.593881656890397e-05, + "loss": 1.8761, + "step": 7188 + }, + { + "epoch": 0.5411468036658574, + "grad_norm": 4.8372111320495605, + "learning_rate": 4.592666607850297e-05, + "loss": 1.9255, + "step": 7189 + }, + { + "epoch": 0.5412220779465177, + "grad_norm": 4.870363235473633, + "learning_rate": 4.5914515830249996e-05, + "loss": 2.0947, + "step": 7190 + }, + { + "epoch": 0.5412973522271778, + "grad_norm": 4.007397651672363, + "learning_rate": 4.590236582486731e-05, + "loss": 1.6737, + "step": 7191 + }, + { + "epoch": 0.5413726265078379, + "grad_norm": 3.873143434524536, + "learning_rate": 4.589021606307723e-05, + "loss": 1.9809, + "step": 7192 + }, + { + "epoch": 0.5414479007884981, + "grad_norm": 5.2229695320129395, + "learning_rate": 4.5878066545602006e-05, + "loss": 1.9063, + "step": 7193 + }, + { + "epoch": 0.5415231750691583, + "grad_norm": 5.185843467712402, + "learning_rate": 4.5865917273163915e-05, + "loss": 1.7828, + "step": 7194 + }, + { + "epoch": 0.5415984493498184, + "grad_norm": 7.470333576202393, + "learning_rate": 4.585376824648516e-05, + "loss": 1.927, + "step": 7195 + }, + { + "epoch": 0.5416737236304786, + "grad_norm": 5.343120574951172, + "learning_rate": 4.5841619466288e-05, + "loss": 1.975, + "step": 7196 + }, + { + "epoch": 0.5417489979111387, + "grad_norm": 3.1645259857177734, + "learning_rate": 4.582947093329462e-05, + "loss": 1.8065, + "step": 7197 + }, + { + "epoch": 0.5418242721917989, + "grad_norm": 4.749367713928223, + "learning_rate": 4.581732264822724e-05, + "loss": 1.8178, + "step": 7198 + }, + { + "epoch": 0.541899546472459, + "grad_norm": 4.746345043182373, + "learning_rate": 4.580517461180801e-05, + "loss": 2.0404, + "step": 7199 + }, + { + "epoch": 0.5419748207531192, + "grad_norm": 5.490805625915527, + "learning_rate": 4.5793026824759136e-05, + "loss": 1.9331, + "step": 7200 + }, + { + "epoch": 0.5420500950337793, + "grad_norm": 5.888421535491943, + "learning_rate": 4.578087928780273e-05, + "loss": 2.1156, + "step": 7201 + }, + { + "epoch": 0.5421253693144394, + "grad_norm": 4.284915924072266, + "learning_rate": 4.576873200166094e-05, + "loss": 1.7174, + "step": 7202 + }, + { + "epoch": 0.5422006435950997, + "grad_norm": 4.496925354003906, + "learning_rate": 4.57565849670559e-05, + "loss": 2.1483, + "step": 7203 + }, + { + "epoch": 0.5422759178757598, + "grad_norm": 5.7125091552734375, + "learning_rate": 4.5744438184709696e-05, + "loss": 2.073, + "step": 7204 + }, + { + "epoch": 0.5423511921564199, + "grad_norm": 5.5248003005981445, + "learning_rate": 4.5732291655344456e-05, + "loss": 1.9524, + "step": 7205 + }, + { + "epoch": 0.5424264664370801, + "grad_norm": 4.666708469390869, + "learning_rate": 4.572014537968221e-05, + "loss": 1.7433, + "step": 7206 + }, + { + "epoch": 0.5425017407177403, + "grad_norm": 4.9575347900390625, + "learning_rate": 4.5707999358445066e-05, + "loss": 1.757, + "step": 7207 + }, + { + "epoch": 0.5425770149984004, + "grad_norm": 4.72661018371582, + "learning_rate": 4.569585359235502e-05, + "loss": 1.6633, + "step": 7208 + }, + { + "epoch": 0.5426522892790606, + "grad_norm": 6.025607585906982, + "learning_rate": 4.5683708082134135e-05, + "loss": 2.2094, + "step": 7209 + }, + { + "epoch": 0.5427275635597207, + "grad_norm": 5.6639227867126465, + "learning_rate": 4.567156282850441e-05, + "loss": 1.8059, + "step": 7210 + }, + { + "epoch": 0.5428028378403809, + "grad_norm": 5.5550537109375, + "learning_rate": 4.565941783218788e-05, + "loss": 2.0967, + "step": 7211 + }, + { + "epoch": 0.5428781121210411, + "grad_norm": 5.448683261871338, + "learning_rate": 4.564727309390648e-05, + "loss": 2.1925, + "step": 7212 + }, + { + "epoch": 0.5429533864017012, + "grad_norm": 4.301255702972412, + "learning_rate": 4.563512861438222e-05, + "loss": 2.1201, + "step": 7213 + }, + { + "epoch": 0.5430286606823613, + "grad_norm": 5.220934867858887, + "learning_rate": 4.562298439433703e-05, + "loss": 1.4957, + "step": 7214 + }, + { + "epoch": 0.5431039349630216, + "grad_norm": 5.2877116203308105, + "learning_rate": 4.561084043449287e-05, + "loss": 1.5886, + "step": 7215 + }, + { + "epoch": 0.5431792092436817, + "grad_norm": 4.303807735443115, + "learning_rate": 4.559869673557164e-05, + "loss": 1.8627, + "step": 7216 + }, + { + "epoch": 0.5432544835243418, + "grad_norm": 4.707512378692627, + "learning_rate": 4.558655329829529e-05, + "loss": 1.7436, + "step": 7217 + }, + { + "epoch": 0.5433297578050019, + "grad_norm": 4.219814777374268, + "learning_rate": 4.5574410123385645e-05, + "loss": 1.9253, + "step": 7218 + }, + { + "epoch": 0.5434050320856622, + "grad_norm": 4.085941791534424, + "learning_rate": 4.556226721156463e-05, + "loss": 2.1195, + "step": 7219 + }, + { + "epoch": 0.5434803063663223, + "grad_norm": 5.37359619140625, + "learning_rate": 4.55501245635541e-05, + "loss": 2.395, + "step": 7220 + }, + { + "epoch": 0.5435555806469824, + "grad_norm": 5.066382884979248, + "learning_rate": 4.553798218007589e-05, + "loss": 1.8986, + "step": 7221 + }, + { + "epoch": 0.5436308549276426, + "grad_norm": 6.823185920715332, + "learning_rate": 4.552584006185186e-05, + "loss": 1.9875, + "step": 7222 + }, + { + "epoch": 0.5437061292083027, + "grad_norm": 3.806490898132324, + "learning_rate": 4.551369820960377e-05, + "loss": 1.9114, + "step": 7223 + }, + { + "epoch": 0.5437814034889629, + "grad_norm": 5.039447784423828, + "learning_rate": 4.550155662405347e-05, + "loss": 1.8261, + "step": 7224 + }, + { + "epoch": 0.5438566777696231, + "grad_norm": 4.0304436683654785, + "learning_rate": 4.548941530592272e-05, + "loss": 2.0245, + "step": 7225 + }, + { + "epoch": 0.5439319520502832, + "grad_norm": 5.348670959472656, + "learning_rate": 4.5477274255933276e-05, + "loss": 2.1405, + "step": 7226 + }, + { + "epoch": 0.5440072263309433, + "grad_norm": 6.283318996429443, + "learning_rate": 4.5465133474806903e-05, + "loss": 1.9118, + "step": 7227 + }, + { + "epoch": 0.5440825006116036, + "grad_norm": 4.4190545082092285, + "learning_rate": 4.545299296326535e-05, + "loss": 2.1445, + "step": 7228 + }, + { + "epoch": 0.5441577748922637, + "grad_norm": 6.2103166580200195, + "learning_rate": 4.5440852722030294e-05, + "loss": 1.8545, + "step": 7229 + }, + { + "epoch": 0.5442330491729238, + "grad_norm": 5.552028656005859, + "learning_rate": 4.5428712751823476e-05, + "loss": 1.8581, + "step": 7230 + }, + { + "epoch": 0.544308323453584, + "grad_norm": 4.799374103546143, + "learning_rate": 4.541657305336656e-05, + "loss": 2.0975, + "step": 7231 + }, + { + "epoch": 0.5443835977342442, + "grad_norm": 3.859109878540039, + "learning_rate": 4.5404433627381236e-05, + "loss": 2.1311, + "step": 7232 + }, + { + "epoch": 0.5444588720149043, + "grad_norm": 4.820240497589111, + "learning_rate": 4.539229447458914e-05, + "loss": 1.7116, + "step": 7233 + }, + { + "epoch": 0.5445341462955645, + "grad_norm": 3.7648513317108154, + "learning_rate": 4.53801555957119e-05, + "loss": 1.7167, + "step": 7234 + }, + { + "epoch": 0.5446094205762246, + "grad_norm": 4.766754150390625, + "learning_rate": 4.536801699147119e-05, + "loss": 1.7716, + "step": 7235 + }, + { + "epoch": 0.5446846948568848, + "grad_norm": 4.234167575836182, + "learning_rate": 4.535587866258855e-05, + "loss": 2.2069, + "step": 7236 + }, + { + "epoch": 0.5447599691375449, + "grad_norm": 4.321406364440918, + "learning_rate": 4.534374060978561e-05, + "loss": 1.8936, + "step": 7237 + }, + { + "epoch": 0.5448352434182051, + "grad_norm": 5.81303596496582, + "learning_rate": 4.533160283378392e-05, + "loss": 1.728, + "step": 7238 + }, + { + "epoch": 0.5449105176988652, + "grad_norm": 4.741263389587402, + "learning_rate": 4.531946533530507e-05, + "loss": 2.1517, + "step": 7239 + }, + { + "epoch": 0.5449857919795253, + "grad_norm": 3.782055616378784, + "learning_rate": 4.530732811507055e-05, + "loss": 1.9274, + "step": 7240 + }, + { + "epoch": 0.5450610662601856, + "grad_norm": 4.311224937438965, + "learning_rate": 4.529519117380192e-05, + "loss": 1.7882, + "step": 7241 + }, + { + "epoch": 0.5451363405408457, + "grad_norm": 4.395197868347168, + "learning_rate": 4.5283054512220665e-05, + "loss": 1.899, + "step": 7242 + }, + { + "epoch": 0.5452116148215058, + "grad_norm": 5.094570636749268, + "learning_rate": 4.527091813104831e-05, + "loss": 2.1774, + "step": 7243 + }, + { + "epoch": 0.545286889102166, + "grad_norm": 5.369093894958496, + "learning_rate": 4.525878203100628e-05, + "loss": 2.0271, + "step": 7244 + }, + { + "epoch": 0.5453621633828262, + "grad_norm": 3.9408247470855713, + "learning_rate": 4.524664621281608e-05, + "loss": 1.9604, + "step": 7245 + }, + { + "epoch": 0.5454374376634863, + "grad_norm": 4.695934295654297, + "learning_rate": 4.52345106771991e-05, + "loss": 1.849, + "step": 7246 + }, + { + "epoch": 0.5455127119441465, + "grad_norm": 4.815086841583252, + "learning_rate": 4.522237542487679e-05, + "loss": 1.8387, + "step": 7247 + }, + { + "epoch": 0.5455879862248066, + "grad_norm": 4.516716480255127, + "learning_rate": 4.5210240456570544e-05, + "loss": 1.7391, + "step": 7248 + }, + { + "epoch": 0.5456632605054668, + "grad_norm": 5.474329471588135, + "learning_rate": 4.5198105773001784e-05, + "loss": 1.9555, + "step": 7249 + }, + { + "epoch": 0.545738534786127, + "grad_norm": 5.571545600891113, + "learning_rate": 4.5185971374891834e-05, + "loss": 1.7115, + "step": 7250 + }, + { + "epoch": 0.5458138090667871, + "grad_norm": 5.538477420806885, + "learning_rate": 4.5173837262962075e-05, + "loss": 1.8419, + "step": 7251 + }, + { + "epoch": 0.5458890833474472, + "grad_norm": 5.463476181030273, + "learning_rate": 4.516170343793386e-05, + "loss": 1.845, + "step": 7252 + }, + { + "epoch": 0.5459643576281075, + "grad_norm": 3.232405424118042, + "learning_rate": 4.514956990052848e-05, + "loss": 1.7819, + "step": 7253 + }, + { + "epoch": 0.5460396319087676, + "grad_norm": 4.944972515106201, + "learning_rate": 4.513743665146726e-05, + "loss": 1.9253, + "step": 7254 + }, + { + "epoch": 0.5461149061894277, + "grad_norm": 3.788090229034424, + "learning_rate": 4.512530369147146e-05, + "loss": 1.6777, + "step": 7255 + }, + { + "epoch": 0.5461901804700879, + "grad_norm": 5.750962734222412, + "learning_rate": 4.51131710212624e-05, + "loss": 2.0733, + "step": 7256 + }, + { + "epoch": 0.5462654547507481, + "grad_norm": 4.356586456298828, + "learning_rate": 4.510103864156127e-05, + "loss": 1.9877, + "step": 7257 + }, + { + "epoch": 0.5463407290314082, + "grad_norm": 7.100419521331787, + "learning_rate": 4.508890655308936e-05, + "loss": 2.0544, + "step": 7258 + }, + { + "epoch": 0.5464160033120683, + "grad_norm": 3.6300253868103027, + "learning_rate": 4.5076774756567834e-05, + "loss": 1.5025, + "step": 7259 + }, + { + "epoch": 0.5464912775927285, + "grad_norm": 5.504705905914307, + "learning_rate": 4.506464325271796e-05, + "loss": 1.8322, + "step": 7260 + }, + { + "epoch": 0.5465665518733887, + "grad_norm": 3.5102968215942383, + "learning_rate": 4.5052512042260854e-05, + "loss": 1.6375, + "step": 7261 + }, + { + "epoch": 0.5466418261540488, + "grad_norm": 4.714266777038574, + "learning_rate": 4.5040381125917734e-05, + "loss": 1.7604, + "step": 7262 + }, + { + "epoch": 0.546717100434709, + "grad_norm": 5.035887718200684, + "learning_rate": 4.5028250504409706e-05, + "loss": 1.467, + "step": 7263 + }, + { + "epoch": 0.5467923747153691, + "grad_norm": 4.72441291809082, + "learning_rate": 4.5016120178457935e-05, + "loss": 1.7012, + "step": 7264 + }, + { + "epoch": 0.5468676489960292, + "grad_norm": 6.256455898284912, + "learning_rate": 4.50039901487835e-05, + "loss": 2.05, + "step": 7265 + }, + { + "epoch": 0.5469429232766895, + "grad_norm": 4.261584281921387, + "learning_rate": 4.499186041610752e-05, + "loss": 2.2197, + "step": 7266 + }, + { + "epoch": 0.5470181975573496, + "grad_norm": 4.091939926147461, + "learning_rate": 4.497973098115109e-05, + "loss": 1.7024, + "step": 7267 + }, + { + "epoch": 0.5470934718380097, + "grad_norm": 4.211394786834717, + "learning_rate": 4.496760184463522e-05, + "loss": 2.1085, + "step": 7268 + }, + { + "epoch": 0.54716874611867, + "grad_norm": 4.209076881408691, + "learning_rate": 4.4955473007281e-05, + "loss": 1.7858, + "step": 7269 + }, + { + "epoch": 0.5472440203993301, + "grad_norm": 5.287043571472168, + "learning_rate": 4.494334446980942e-05, + "loss": 1.9093, + "step": 7270 + }, + { + "epoch": 0.5473192946799902, + "grad_norm": 4.956048011779785, + "learning_rate": 4.4931216232941526e-05, + "loss": 2.3107, + "step": 7271 + }, + { + "epoch": 0.5473945689606504, + "grad_norm": 4.21409273147583, + "learning_rate": 4.491908829739826e-05, + "loss": 1.6429, + "step": 7272 + }, + { + "epoch": 0.5474698432413105, + "grad_norm": 4.417195796966553, + "learning_rate": 4.4906960663900636e-05, + "loss": 2.132, + "step": 7273 + }, + { + "epoch": 0.5475451175219707, + "grad_norm": 4.306947708129883, + "learning_rate": 4.4894833333169574e-05, + "loss": 1.8505, + "step": 7274 + }, + { + "epoch": 0.5476203918026309, + "grad_norm": 4.216700077056885, + "learning_rate": 4.4882706305926026e-05, + "loss": 1.4864, + "step": 7275 + }, + { + "epoch": 0.547695666083291, + "grad_norm": 4.046915054321289, + "learning_rate": 4.487057958289089e-05, + "loss": 1.9534, + "step": 7276 + }, + { + "epoch": 0.5477709403639511, + "grad_norm": 5.374661922454834, + "learning_rate": 4.4858453164785105e-05, + "loss": 2.2955, + "step": 7277 + }, + { + "epoch": 0.5478462146446113, + "grad_norm": 5.344202995300293, + "learning_rate": 4.4846327052329504e-05, + "loss": 2.2217, + "step": 7278 + }, + { + "epoch": 0.5479214889252715, + "grad_norm": 4.85532283782959, + "learning_rate": 4.4834201246245e-05, + "loss": 2.167, + "step": 7279 + }, + { + "epoch": 0.5479967632059316, + "grad_norm": 3.7367851734161377, + "learning_rate": 4.482207574725238e-05, + "loss": 2.1791, + "step": 7280 + }, + { + "epoch": 0.5480720374865917, + "grad_norm": 5.266561985015869, + "learning_rate": 4.4809950556072525e-05, + "loss": 2.2391, + "step": 7281 + }, + { + "epoch": 0.548147311767252, + "grad_norm": 5.5713725090026855, + "learning_rate": 4.4797825673426194e-05, + "loss": 1.8552, + "step": 7282 + }, + { + "epoch": 0.5482225860479121, + "grad_norm": 5.478013038635254, + "learning_rate": 4.47857011000342e-05, + "loss": 1.9048, + "step": 7283 + }, + { + "epoch": 0.5482978603285722, + "grad_norm": 4.999684810638428, + "learning_rate": 4.477357683661734e-05, + "loss": 1.9032, + "step": 7284 + }, + { + "epoch": 0.5483731346092324, + "grad_norm": 4.312249183654785, + "learning_rate": 4.476145288389631e-05, + "loss": 1.7808, + "step": 7285 + }, + { + "epoch": 0.5484484088898925, + "grad_norm": 3.2888519763946533, + "learning_rate": 4.47493292425919e-05, + "loss": 1.8086, + "step": 7286 + }, + { + "epoch": 0.5485236831705527, + "grad_norm": 4.756214141845703, + "learning_rate": 4.473720591342478e-05, + "loss": 2.1057, + "step": 7287 + }, + { + "epoch": 0.5485989574512129, + "grad_norm": 4.740874290466309, + "learning_rate": 4.472508289711569e-05, + "loss": 1.6812, + "step": 7288 + }, + { + "epoch": 0.548674231731873, + "grad_norm": 3.677210569381714, + "learning_rate": 4.471296019438527e-05, + "loss": 2.0035, + "step": 7289 + }, + { + "epoch": 0.5487495060125331, + "grad_norm": 3.867335081100464, + "learning_rate": 4.470083780595421e-05, + "loss": 1.8038, + "step": 7290 + }, + { + "epoch": 0.5488247802931934, + "grad_norm": 4.989095687866211, + "learning_rate": 4.468871573254312e-05, + "loss": 1.5651, + "step": 7291 + }, + { + "epoch": 0.5489000545738535, + "grad_norm": 4.610777378082275, + "learning_rate": 4.467659397487265e-05, + "loss": 2.0209, + "step": 7292 + }, + { + "epoch": 0.5489753288545136, + "grad_norm": 2.9480783939361572, + "learning_rate": 4.466447253366338e-05, + "loss": 1.7167, + "step": 7293 + }, + { + "epoch": 0.5490506031351738, + "grad_norm": 4.002351760864258, + "learning_rate": 4.465235140963594e-05, + "loss": 1.6173, + "step": 7294 + }, + { + "epoch": 0.549125877415834, + "grad_norm": 3.871690273284912, + "learning_rate": 4.4640230603510835e-05, + "loss": 1.6102, + "step": 7295 + }, + { + "epoch": 0.5492011516964941, + "grad_norm": 5.054386138916016, + "learning_rate": 4.462811011600865e-05, + "loss": 1.8483, + "step": 7296 + }, + { + "epoch": 0.5492764259771542, + "grad_norm": 5.16343879699707, + "learning_rate": 4.46159899478499e-05, + "loss": 1.612, + "step": 7297 + }, + { + "epoch": 0.5493517002578144, + "grad_norm": 4.424676418304443, + "learning_rate": 4.4603870099755104e-05, + "loss": 1.8288, + "step": 7298 + }, + { + "epoch": 0.5494269745384746, + "grad_norm": 5.338099956512451, + "learning_rate": 4.459175057244476e-05, + "loss": 1.7232, + "step": 7299 + }, + { + "epoch": 0.5495022488191347, + "grad_norm": 3.822824001312256, + "learning_rate": 4.457963136663931e-05, + "loss": 1.8117, + "step": 7300 + }, + { + "epoch": 0.5495775230997949, + "grad_norm": 4.4067769050598145, + "learning_rate": 4.456751248305924e-05, + "loss": 1.8385, + "step": 7301 + }, + { + "epoch": 0.549652797380455, + "grad_norm": 5.020514965057373, + "learning_rate": 4.455539392242494e-05, + "loss": 1.9673, + "step": 7302 + }, + { + "epoch": 0.5497280716611151, + "grad_norm": 6.6952619552612305, + "learning_rate": 4.454327568545687e-05, + "loss": 1.5423, + "step": 7303 + }, + { + "epoch": 0.5498033459417754, + "grad_norm": 4.522897720336914, + "learning_rate": 4.453115777287539e-05, + "loss": 1.7981, + "step": 7304 + }, + { + "epoch": 0.5498786202224355, + "grad_norm": 6.122636795043945, + "learning_rate": 4.4519040185400904e-05, + "loss": 2.1514, + "step": 7305 + }, + { + "epoch": 0.5499538945030956, + "grad_norm": 4.2663469314575195, + "learning_rate": 4.4506922923753735e-05, + "loss": 1.7761, + "step": 7306 + }, + { + "epoch": 0.5500291687837559, + "grad_norm": 4.534322738647461, + "learning_rate": 4.449480598865426e-05, + "loss": 1.704, + "step": 7307 + }, + { + "epoch": 0.550104443064416, + "grad_norm": 3.895892381668091, + "learning_rate": 4.4482689380822744e-05, + "loss": 1.9618, + "step": 7308 + }, + { + "epoch": 0.5501797173450761, + "grad_norm": 3.3728456497192383, + "learning_rate": 4.447057310097953e-05, + "loss": 1.662, + "step": 7309 + }, + { + "epoch": 0.5502549916257363, + "grad_norm": 3.4772403240203857, + "learning_rate": 4.445845714984487e-05, + "loss": 2.2187, + "step": 7310 + }, + { + "epoch": 0.5503302659063964, + "grad_norm": 3.8210296630859375, + "learning_rate": 4.444634152813905e-05, + "loss": 1.7237, + "step": 7311 + }, + { + "epoch": 0.5504055401870566, + "grad_norm": 4.806382656097412, + "learning_rate": 4.443422623658227e-05, + "loss": 2.2813, + "step": 7312 + }, + { + "epoch": 0.5504808144677168, + "grad_norm": 5.3934855461120605, + "learning_rate": 4.4422111275894785e-05, + "loss": 1.9531, + "step": 7313 + }, + { + "epoch": 0.5505560887483769, + "grad_norm": 6.14397668838501, + "learning_rate": 4.440999664679676e-05, + "loss": 1.9077, + "step": 7314 + }, + { + "epoch": 0.550631363029037, + "grad_norm": 3.7839744091033936, + "learning_rate": 4.439788235000841e-05, + "loss": 1.7434, + "step": 7315 + }, + { + "epoch": 0.5507066373096972, + "grad_norm": 5.898906707763672, + "learning_rate": 4.438576838624989e-05, + "loss": 1.7614, + "step": 7316 + }, + { + "epoch": 0.5507819115903574, + "grad_norm": 4.722009181976318, + "learning_rate": 4.4373654756241314e-05, + "loss": 1.928, + "step": 7317 + }, + { + "epoch": 0.5508571858710175, + "grad_norm": 7.242898941040039, + "learning_rate": 4.436154146070283e-05, + "loss": 1.7158, + "step": 7318 + }, + { + "epoch": 0.5509324601516776, + "grad_norm": 8.452837944030762, + "learning_rate": 4.4349428500354514e-05, + "loss": 2.184, + "step": 7319 + }, + { + "epoch": 0.5510077344323379, + "grad_norm": 4.863820552825928, + "learning_rate": 4.4337315875916473e-05, + "loss": 1.5358, + "step": 7320 + }, + { + "epoch": 0.551083008712998, + "grad_norm": 4.039882659912109, + "learning_rate": 4.432520358810875e-05, + "loss": 1.673, + "step": 7321 + }, + { + "epoch": 0.5511582829936581, + "grad_norm": 5.298859596252441, + "learning_rate": 4.43130916376514e-05, + "loss": 1.9258, + "step": 7322 + }, + { + "epoch": 0.5512335572743183, + "grad_norm": 4.4065937995910645, + "learning_rate": 4.430098002526442e-05, + "loss": 1.9056, + "step": 7323 + }, + { + "epoch": 0.5513088315549785, + "grad_norm": 4.024372100830078, + "learning_rate": 4.4288868751667826e-05, + "loss": 2.0418, + "step": 7324 + }, + { + "epoch": 0.5513841058356386, + "grad_norm": 4.446809768676758, + "learning_rate": 4.427675781758161e-05, + "loss": 2.0881, + "step": 7325 + }, + { + "epoch": 0.5514593801162988, + "grad_norm": 4.457399845123291, + "learning_rate": 4.426464722372571e-05, + "loss": 1.6843, + "step": 7326 + }, + { + "epoch": 0.5515346543969589, + "grad_norm": 4.253214359283447, + "learning_rate": 4.425253697082007e-05, + "loss": 1.8723, + "step": 7327 + }, + { + "epoch": 0.551609928677619, + "grad_norm": 4.1736226081848145, + "learning_rate": 4.4240427059584646e-05, + "loss": 1.5328, + "step": 7328 + }, + { + "epoch": 0.5516852029582793, + "grad_norm": 5.437619686126709, + "learning_rate": 4.4228317490739276e-05, + "loss": 2.0168, + "step": 7329 + }, + { + "epoch": 0.5517604772389394, + "grad_norm": 4.763359546661377, + "learning_rate": 4.421620826500388e-05, + "loss": 1.8036, + "step": 7330 + }, + { + "epoch": 0.5518357515195995, + "grad_norm": 4.337121963500977, + "learning_rate": 4.4204099383098294e-05, + "loss": 1.5906, + "step": 7331 + }, + { + "epoch": 0.5519110258002597, + "grad_norm": 5.2828779220581055, + "learning_rate": 4.419199084574237e-05, + "loss": 2.0052, + "step": 7332 + }, + { + "epoch": 0.5519863000809199, + "grad_norm": 5.544689178466797, + "learning_rate": 4.417988265365596e-05, + "loss": 1.824, + "step": 7333 + }, + { + "epoch": 0.55206157436158, + "grad_norm": 5.500155925750732, + "learning_rate": 4.4167774807558796e-05, + "loss": 1.9976, + "step": 7334 + }, + { + "epoch": 0.5521368486422401, + "grad_norm": 4.5070390701293945, + "learning_rate": 4.415566730817071e-05, + "loss": 2.0174, + "step": 7335 + }, + { + "epoch": 0.5522121229229003, + "grad_norm": 4.999845504760742, + "learning_rate": 4.414356015621141e-05, + "loss": 2.0856, + "step": 7336 + }, + { + "epoch": 0.5522873972035605, + "grad_norm": 6.309022903442383, + "learning_rate": 4.413145335240066e-05, + "loss": 2.0811, + "step": 7337 + }, + { + "epoch": 0.5523626714842206, + "grad_norm": 3.884275197982788, + "learning_rate": 4.4119346897458156e-05, + "loss": 1.5132, + "step": 7338 + }, + { + "epoch": 0.5524379457648808, + "grad_norm": 5.557037830352783, + "learning_rate": 4.410724079210364e-05, + "loss": 1.7826, + "step": 7339 + }, + { + "epoch": 0.5525132200455409, + "grad_norm": 5.318655014038086, + "learning_rate": 4.4095135037056715e-05, + "loss": 2.13, + "step": 7340 + }, + { + "epoch": 0.552588494326201, + "grad_norm": 3.735858678817749, + "learning_rate": 4.408302963303709e-05, + "loss": 1.7768, + "step": 7341 + }, + { + "epoch": 0.5526637686068613, + "grad_norm": 4.463040351867676, + "learning_rate": 4.407092458076436e-05, + "loss": 2.0833, + "step": 7342 + }, + { + "epoch": 0.5527390428875214, + "grad_norm": 3.703172445297241, + "learning_rate": 4.405881988095818e-05, + "loss": 1.7127, + "step": 7343 + }, + { + "epoch": 0.5528143171681815, + "grad_norm": 4.204286098480225, + "learning_rate": 4.4046715534338084e-05, + "loss": 1.9628, + "step": 7344 + }, + { + "epoch": 0.5528895914488418, + "grad_norm": 3.9676642417907715, + "learning_rate": 4.403461154162369e-05, + "loss": 1.8234, + "step": 7345 + }, + { + "epoch": 0.5529648657295019, + "grad_norm": 4.691980838775635, + "learning_rate": 4.40225079035345e-05, + "loss": 2.6842, + "step": 7346 + }, + { + "epoch": 0.553040140010162, + "grad_norm": 4.52195405960083, + "learning_rate": 4.4010404620790066e-05, + "loss": 1.9568, + "step": 7347 + }, + { + "epoch": 0.5531154142908222, + "grad_norm": 4.678031921386719, + "learning_rate": 4.39983016941099e-05, + "loss": 2.0218, + "step": 7348 + }, + { + "epoch": 0.5531906885714823, + "grad_norm": 3.9991066455841064, + "learning_rate": 4.3986199124213465e-05, + "loss": 1.6209, + "step": 7349 + }, + { + "epoch": 0.5532659628521425, + "grad_norm": 4.050024509429932, + "learning_rate": 4.397409691182026e-05, + "loss": 2.0356, + "step": 7350 + }, + { + "epoch": 0.5533412371328027, + "grad_norm": 5.740078449249268, + "learning_rate": 4.396199505764968e-05, + "loss": 2.0958, + "step": 7351 + }, + { + "epoch": 0.5534165114134628, + "grad_norm": 6.486820220947266, + "learning_rate": 4.3949893562421195e-05, + "loss": 2.1881, + "step": 7352 + }, + { + "epoch": 0.5534917856941229, + "grad_norm": 5.230270862579346, + "learning_rate": 4.393779242685416e-05, + "loss": 2.105, + "step": 7353 + }, + { + "epoch": 0.5535670599747832, + "grad_norm": 3.867591381072998, + "learning_rate": 4.392569165166798e-05, + "loss": 1.9645, + "step": 7354 + }, + { + "epoch": 0.5536423342554433, + "grad_norm": 4.531659126281738, + "learning_rate": 4.391359123758198e-05, + "loss": 1.7865, + "step": 7355 + }, + { + "epoch": 0.5537176085361034, + "grad_norm": 3.9760096073150635, + "learning_rate": 4.3901491185315544e-05, + "loss": 1.9326, + "step": 7356 + }, + { + "epoch": 0.5537928828167635, + "grad_norm": 5.3282999992370605, + "learning_rate": 4.388939149558795e-05, + "loss": 1.9297, + "step": 7357 + }, + { + "epoch": 0.5538681570974238, + "grad_norm": 5.008666515350342, + "learning_rate": 4.387729216911849e-05, + "loss": 1.952, + "step": 7358 + }, + { + "epoch": 0.5539434313780839, + "grad_norm": 5.533703327178955, + "learning_rate": 4.3865193206626446e-05, + "loss": 1.8613, + "step": 7359 + }, + { + "epoch": 0.554018705658744, + "grad_norm": 5.5756707191467285, + "learning_rate": 4.385309460883107e-05, + "loss": 2.0622, + "step": 7360 + }, + { + "epoch": 0.5540939799394042, + "grad_norm": 7.701865196228027, + "learning_rate": 4.3840996376451574e-05, + "loss": 2.3025, + "step": 7361 + }, + { + "epoch": 0.5541692542200644, + "grad_norm": 5.8811774253845215, + "learning_rate": 4.382889851020718e-05, + "loss": 2.211, + "step": 7362 + }, + { + "epoch": 0.5542445285007245, + "grad_norm": 4.434207916259766, + "learning_rate": 4.381680101081706e-05, + "loss": 2.0364, + "step": 7363 + }, + { + "epoch": 0.5543198027813847, + "grad_norm": 4.133795738220215, + "learning_rate": 4.380470387900036e-05, + "loss": 1.7435, + "step": 7364 + }, + { + "epoch": 0.5543950770620448, + "grad_norm": 6.588438034057617, + "learning_rate": 4.3792607115476256e-05, + "loss": 1.7612, + "step": 7365 + }, + { + "epoch": 0.554470351342705, + "grad_norm": 5.552674770355225, + "learning_rate": 4.378051072096384e-05, + "loss": 2.0138, + "step": 7366 + }, + { + "epoch": 0.5545456256233652, + "grad_norm": 4.626672744750977, + "learning_rate": 4.376841469618224e-05, + "loss": 1.7241, + "step": 7367 + }, + { + "epoch": 0.5546208999040253, + "grad_norm": 3.9006006717681885, + "learning_rate": 4.375631904185047e-05, + "loss": 1.8154, + "step": 7368 + }, + { + "epoch": 0.5546961741846854, + "grad_norm": 4.490985870361328, + "learning_rate": 4.3744223758687645e-05, + "loss": 1.8341, + "step": 7369 + }, + { + "epoch": 0.5547714484653457, + "grad_norm": 3.63482403755188, + "learning_rate": 4.373212884741275e-05, + "loss": 1.8988, + "step": 7370 + }, + { + "epoch": 0.5548467227460058, + "grad_norm": 5.313016891479492, + "learning_rate": 4.3720034308744825e-05, + "loss": 2.177, + "step": 7371 + }, + { + "epoch": 0.5549219970266659, + "grad_norm": 5.0663652420043945, + "learning_rate": 4.3707940143402825e-05, + "loss": 1.8069, + "step": 7372 + }, + { + "epoch": 0.5549972713073261, + "grad_norm": 3.6593358516693115, + "learning_rate": 4.369584635210575e-05, + "loss": 2.1928, + "step": 7373 + }, + { + "epoch": 0.5550725455879862, + "grad_norm": 3.938316583633423, + "learning_rate": 4.368375293557251e-05, + "loss": 2.1227, + "step": 7374 + }, + { + "epoch": 0.5551478198686464, + "grad_norm": 5.642133712768555, + "learning_rate": 4.367165989452203e-05, + "loss": 1.9792, + "step": 7375 + }, + { + "epoch": 0.5552230941493065, + "grad_norm": 7.100983619689941, + "learning_rate": 4.365956722967321e-05, + "loss": 1.845, + "step": 7376 + }, + { + "epoch": 0.5552983684299667, + "grad_norm": 5.176304340362549, + "learning_rate": 4.3647474941744947e-05, + "loss": 1.9888, + "step": 7377 + }, + { + "epoch": 0.5553736427106268, + "grad_norm": 4.718135356903076, + "learning_rate": 4.3635383031456045e-05, + "loss": 1.5989, + "step": 7378 + }, + { + "epoch": 0.555448916991287, + "grad_norm": 4.617133617401123, + "learning_rate": 4.3623291499525356e-05, + "loss": 2.3071, + "step": 7379 + }, + { + "epoch": 0.5555241912719472, + "grad_norm": 4.759497165679932, + "learning_rate": 4.361120034667172e-05, + "loss": 2.1348, + "step": 7380 + }, + { + "epoch": 0.5555994655526073, + "grad_norm": 4.604881763458252, + "learning_rate": 4.3599109573613875e-05, + "loss": 1.8121, + "step": 7381 + }, + { + "epoch": 0.5556747398332674, + "grad_norm": 5.332067966461182, + "learning_rate": 4.35870191810706e-05, + "loss": 2.3038, + "step": 7382 + }, + { + "epoch": 0.5557500141139277, + "grad_norm": 4.925795078277588, + "learning_rate": 4.357492916976062e-05, + "loss": 2.0304, + "step": 7383 + }, + { + "epoch": 0.5558252883945878, + "grad_norm": 3.9847230911254883, + "learning_rate": 4.35628395404027e-05, + "loss": 1.8119, + "step": 7384 + }, + { + "epoch": 0.5559005626752479, + "grad_norm": 3.7008583545684814, + "learning_rate": 4.355075029371547e-05, + "loss": 1.7616, + "step": 7385 + }, + { + "epoch": 0.5559758369559081, + "grad_norm": 3.713547945022583, + "learning_rate": 4.353866143041764e-05, + "loss": 2.1348, + "step": 7386 + }, + { + "epoch": 0.5560511112365683, + "grad_norm": 6.205094814300537, + "learning_rate": 4.352657295122784e-05, + "loss": 1.9995, + "step": 7387 + }, + { + "epoch": 0.5561263855172284, + "grad_norm": 4.85087251663208, + "learning_rate": 4.3514484856864724e-05, + "loss": 1.6254, + "step": 7388 + }, + { + "epoch": 0.5562016597978886, + "grad_norm": 4.379775524139404, + "learning_rate": 4.3502397148046855e-05, + "loss": 1.6096, + "step": 7389 + }, + { + "epoch": 0.5562769340785487, + "grad_norm": 5.867446422576904, + "learning_rate": 4.349030982549285e-05, + "loss": 1.6307, + "step": 7390 + }, + { + "epoch": 0.5563522083592088, + "grad_norm": 4.021681308746338, + "learning_rate": 4.347822288992123e-05, + "loss": 1.6925, + "step": 7391 + }, + { + "epoch": 0.5564274826398691, + "grad_norm": 4.031123638153076, + "learning_rate": 4.346613634205055e-05, + "loss": 1.6419, + "step": 7392 + }, + { + "epoch": 0.5565027569205292, + "grad_norm": 5.8570146560668945, + "learning_rate": 4.3454050182599315e-05, + "loss": 1.7624, + "step": 7393 + }, + { + "epoch": 0.5565780312011893, + "grad_norm": 4.970510959625244, + "learning_rate": 4.3441964412286026e-05, + "loss": 1.8939, + "step": 7394 + }, + { + "epoch": 0.5566533054818494, + "grad_norm": 5.431758403778076, + "learning_rate": 4.3429879031829125e-05, + "loss": 1.6955, + "step": 7395 + }, + { + "epoch": 0.5567285797625097, + "grad_norm": 4.710243225097656, + "learning_rate": 4.341779404194706e-05, + "loss": 1.7647, + "step": 7396 + }, + { + "epoch": 0.5568038540431698, + "grad_norm": 3.986436128616333, + "learning_rate": 4.3405709443358256e-05, + "loss": 1.5752, + "step": 7397 + }, + { + "epoch": 0.5568791283238299, + "grad_norm": 6.038854122161865, + "learning_rate": 4.33936252367811e-05, + "loss": 1.776, + "step": 7398 + }, + { + "epoch": 0.5569544026044901, + "grad_norm": 6.0635600090026855, + "learning_rate": 4.338154142293398e-05, + "loss": 1.9351, + "step": 7399 + }, + { + "epoch": 0.5570296768851503, + "grad_norm": 5.238595962524414, + "learning_rate": 4.336945800253522e-05, + "loss": 1.8521, + "step": 7400 + }, + { + "epoch": 0.5571049511658104, + "grad_norm": 5.034926414489746, + "learning_rate": 4.335737497630318e-05, + "loss": 1.9239, + "step": 7401 + }, + { + "epoch": 0.5571802254464706, + "grad_norm": 6.994501113891602, + "learning_rate": 4.334529234495612e-05, + "loss": 2.0602, + "step": 7402 + }, + { + "epoch": 0.5572554997271307, + "grad_norm": 4.744661808013916, + "learning_rate": 4.3333210109212336e-05, + "loss": 2.0983, + "step": 7403 + }, + { + "epoch": 0.5573307740077909, + "grad_norm": 4.382345199584961, + "learning_rate": 4.3321128269790074e-05, + "loss": 1.9319, + "step": 7404 + }, + { + "epoch": 0.5574060482884511, + "grad_norm": 6.412303924560547, + "learning_rate": 4.3309046827407594e-05, + "loss": 1.7208, + "step": 7405 + }, + { + "epoch": 0.5574813225691112, + "grad_norm": 13.301950454711914, + "learning_rate": 4.329696578278306e-05, + "loss": 2.0987, + "step": 7406 + }, + { + "epoch": 0.5575565968497713, + "grad_norm": 5.290268898010254, + "learning_rate": 4.3284885136634704e-05, + "loss": 1.5533, + "step": 7407 + }, + { + "epoch": 0.5576318711304316, + "grad_norm": 6.362166881561279, + "learning_rate": 4.327280488968063e-05, + "loss": 1.948, + "step": 7408 + }, + { + "epoch": 0.5577071454110917, + "grad_norm": 3.867560863494873, + "learning_rate": 4.3260725042639014e-05, + "loss": 1.7492, + "step": 7409 + }, + { + "epoch": 0.5577824196917518, + "grad_norm": 4.308571815490723, + "learning_rate": 4.324864559622795e-05, + "loss": 2.0932, + "step": 7410 + }, + { + "epoch": 0.557857693972412, + "grad_norm": 4.236301898956299, + "learning_rate": 4.323656655116553e-05, + "loss": 1.8278, + "step": 7411 + }, + { + "epoch": 0.5579329682530721, + "grad_norm": 5.600011825561523, + "learning_rate": 4.3224487908169844e-05, + "loss": 1.7568, + "step": 7412 + }, + { + "epoch": 0.5580082425337323, + "grad_norm": 5.102320671081543, + "learning_rate": 4.32124096679589e-05, + "loss": 2.0575, + "step": 7413 + }, + { + "epoch": 0.5580835168143924, + "grad_norm": 5.7027130126953125, + "learning_rate": 4.320033183125072e-05, + "loss": 2.0576, + "step": 7414 + }, + { + "epoch": 0.5581587910950526, + "grad_norm": 4.68280029296875, + "learning_rate": 4.31882543987633e-05, + "loss": 1.926, + "step": 7415 + }, + { + "epoch": 0.5582340653757127, + "grad_norm": 4.541062355041504, + "learning_rate": 4.3176177371214634e-05, + "loss": 1.9934, + "step": 7416 + }, + { + "epoch": 0.5583093396563729, + "grad_norm": 5.059689998626709, + "learning_rate": 4.316410074932262e-05, + "loss": 1.7551, + "step": 7417 + }, + { + "epoch": 0.5583846139370331, + "grad_norm": 4.464911460876465, + "learning_rate": 4.315202453380522e-05, + "loss": 1.9189, + "step": 7418 + }, + { + "epoch": 0.5584598882176932, + "grad_norm": 3.9713943004608154, + "learning_rate": 4.3139948725380295e-05, + "loss": 1.8338, + "step": 7419 + }, + { + "epoch": 0.5585351624983533, + "grad_norm": 5.557081699371338, + "learning_rate": 4.312787332476574e-05, + "loss": 1.8421, + "step": 7420 + }, + { + "epoch": 0.5586104367790136, + "grad_norm": 6.325413703918457, + "learning_rate": 4.311579833267939e-05, + "loss": 2.0226, + "step": 7421 + }, + { + "epoch": 0.5586857110596737, + "grad_norm": 4.482569217681885, + "learning_rate": 4.3103723749839096e-05, + "loss": 1.982, + "step": 7422 + }, + { + "epoch": 0.5587609853403338, + "grad_norm": 4.55171537399292, + "learning_rate": 4.3091649576962624e-05, + "loss": 1.794, + "step": 7423 + }, + { + "epoch": 0.558836259620994, + "grad_norm": 5.786789894104004, + "learning_rate": 4.3079575814767766e-05, + "loss": 1.7255, + "step": 7424 + }, + { + "epoch": 0.5589115339016542, + "grad_norm": 4.895748615264893, + "learning_rate": 4.306750246397225e-05, + "loss": 1.6539, + "step": 7425 + }, + { + "epoch": 0.5589868081823143, + "grad_norm": 6.730261325836182, + "learning_rate": 4.305542952529386e-05, + "loss": 1.9069, + "step": 7426 + }, + { + "epoch": 0.5590620824629745, + "grad_norm": 4.3582000732421875, + "learning_rate": 4.304335699945023e-05, + "loss": 1.8138, + "step": 7427 + }, + { + "epoch": 0.5591373567436346, + "grad_norm": 5.143578052520752, + "learning_rate": 4.3031284887159065e-05, + "loss": 1.7364, + "step": 7428 + }, + { + "epoch": 0.5592126310242947, + "grad_norm": 6.419661521911621, + "learning_rate": 4.301921318913804e-05, + "loss": 1.8883, + "step": 7429 + }, + { + "epoch": 0.559287905304955, + "grad_norm": 4.226903438568115, + "learning_rate": 4.300714190610473e-05, + "loss": 1.6471, + "step": 7430 + }, + { + "epoch": 0.5593631795856151, + "grad_norm": 3.828778028488159, + "learning_rate": 4.2995071038776794e-05, + "loss": 1.8076, + "step": 7431 + }, + { + "epoch": 0.5594384538662752, + "grad_norm": 3.457158327102661, + "learning_rate": 4.2983000587871763e-05, + "loss": 1.7967, + "step": 7432 + }, + { + "epoch": 0.5595137281469353, + "grad_norm": 4.551523685455322, + "learning_rate": 4.2970930554107244e-05, + "loss": 1.9437, + "step": 7433 + }, + { + "epoch": 0.5595890024275956, + "grad_norm": 7.061385631561279, + "learning_rate": 4.2958860938200706e-05, + "loss": 2.1281, + "step": 7434 + }, + { + "epoch": 0.5596642767082557, + "grad_norm": 5.252857685089111, + "learning_rate": 4.294679174086971e-05, + "loss": 1.768, + "step": 7435 + }, + { + "epoch": 0.5597395509889158, + "grad_norm": 5.978367805480957, + "learning_rate": 4.293472296283168e-05, + "loss": 2.1951, + "step": 7436 + }, + { + "epoch": 0.559814825269576, + "grad_norm": 5.210687637329102, + "learning_rate": 4.292265460480411e-05, + "loss": 1.8667, + "step": 7437 + }, + { + "epoch": 0.5598900995502362, + "grad_norm": 4.041530132293701, + "learning_rate": 4.2910586667504406e-05, + "loss": 1.7806, + "step": 7438 + }, + { + "epoch": 0.5599653738308963, + "grad_norm": 3.877411365509033, + "learning_rate": 4.289851915165001e-05, + "loss": 1.7278, + "step": 7439 + }, + { + "epoch": 0.5600406481115565, + "grad_norm": 3.7501790523529053, + "learning_rate": 4.288645205795824e-05, + "loss": 1.7401, + "step": 7440 + }, + { + "epoch": 0.5601159223922166, + "grad_norm": 3.820499897003174, + "learning_rate": 4.28743853871465e-05, + "loss": 1.5775, + "step": 7441 + }, + { + "epoch": 0.5601911966728768, + "grad_norm": 4.196033000946045, + "learning_rate": 4.28623191399321e-05, + "loss": 1.8763, + "step": 7442 + }, + { + "epoch": 0.560266470953537, + "grad_norm": 4.171761512756348, + "learning_rate": 4.285025331703236e-05, + "loss": 1.8668, + "step": 7443 + }, + { + "epoch": 0.5603417452341971, + "grad_norm": 4.1810503005981445, + "learning_rate": 4.283818791916453e-05, + "loss": 1.7125, + "step": 7444 + }, + { + "epoch": 0.5604170195148572, + "grad_norm": 5.239692687988281, + "learning_rate": 4.2826122947045875e-05, + "loss": 1.8159, + "step": 7445 + }, + { + "epoch": 0.5604922937955175, + "grad_norm": 5.308150768280029, + "learning_rate": 4.281405840139364e-05, + "loss": 2.1461, + "step": 7446 + }, + { + "epoch": 0.5605675680761776, + "grad_norm": 4.282695293426514, + "learning_rate": 4.280199428292501e-05, + "loss": 1.757, + "step": 7447 + }, + { + "epoch": 0.5606428423568377, + "grad_norm": 3.744215726852417, + "learning_rate": 4.278993059235716e-05, + "loss": 2.2698, + "step": 7448 + }, + { + "epoch": 0.5607181166374979, + "grad_norm": 4.231766223907471, + "learning_rate": 4.277786733040725e-05, + "loss": 1.8222, + "step": 7449 + }, + { + "epoch": 0.560793390918158, + "grad_norm": 4.536243915557861, + "learning_rate": 4.276580449779242e-05, + "loss": 1.5572, + "step": 7450 + }, + { + "epoch": 0.5608686651988182, + "grad_norm": 3.9466090202331543, + "learning_rate": 4.275374209522974e-05, + "loss": 1.9276, + "step": 7451 + }, + { + "epoch": 0.5609439394794784, + "grad_norm": 4.0187458992004395, + "learning_rate": 4.27416801234363e-05, + "loss": 1.6828, + "step": 7452 + }, + { + "epoch": 0.5610192137601385, + "grad_norm": 4.821317195892334, + "learning_rate": 4.272961858312916e-05, + "loss": 1.7848, + "step": 7453 + }, + { + "epoch": 0.5610944880407986, + "grad_norm": 4.9564900398254395, + "learning_rate": 4.2717557475025335e-05, + "loss": 2.0061, + "step": 7454 + }, + { + "epoch": 0.5611697623214588, + "grad_norm": 5.768749237060547, + "learning_rate": 4.27054967998418e-05, + "loss": 2.3863, + "step": 7455 + }, + { + "epoch": 0.561245036602119, + "grad_norm": 5.145970344543457, + "learning_rate": 4.269343655829558e-05, + "loss": 1.9027, + "step": 7456 + }, + { + "epoch": 0.5613203108827791, + "grad_norm": 4.501526355743408, + "learning_rate": 4.2681376751103575e-05, + "loss": 1.9796, + "step": 7457 + }, + { + "epoch": 0.5613955851634392, + "grad_norm": 4.454838752746582, + "learning_rate": 4.266931737898273e-05, + "loss": 2.2908, + "step": 7458 + }, + { + "epoch": 0.5614708594440995, + "grad_norm": 5.044410705566406, + "learning_rate": 4.265725844264992e-05, + "loss": 1.5678, + "step": 7459 + }, + { + "epoch": 0.5615461337247596, + "grad_norm": 6.122454643249512, + "learning_rate": 4.264519994282202e-05, + "loss": 1.6593, + "step": 7460 + }, + { + "epoch": 0.5616214080054197, + "grad_norm": 4.768460750579834, + "learning_rate": 4.263314188021591e-05, + "loss": 1.905, + "step": 7461 + }, + { + "epoch": 0.5616966822860799, + "grad_norm": 4.453472137451172, + "learning_rate": 4.2621084255548363e-05, + "loss": 1.9826, + "step": 7462 + }, + { + "epoch": 0.5617719565667401, + "grad_norm": 5.3138108253479, + "learning_rate": 4.2609027069536186e-05, + "loss": 2.1403, + "step": 7463 + }, + { + "epoch": 0.5618472308474002, + "grad_norm": 4.939953327178955, + "learning_rate": 4.259697032289613e-05, + "loss": 2.0381, + "step": 7464 + }, + { + "epoch": 0.5619225051280604, + "grad_norm": 4.13845157623291, + "learning_rate": 4.258491401634497e-05, + "loss": 1.751, + "step": 7465 + }, + { + "epoch": 0.5619977794087205, + "grad_norm": 4.92451810836792, + "learning_rate": 4.2572858150599374e-05, + "loss": 1.6275, + "step": 7466 + }, + { + "epoch": 0.5620730536893807, + "grad_norm": 5.270460605621338, + "learning_rate": 4.256080272637607e-05, + "loss": 2.5168, + "step": 7467 + }, + { + "epoch": 0.5621483279700409, + "grad_norm": 4.0166096687316895, + "learning_rate": 4.254874774439168e-05, + "loss": 1.4184, + "step": 7468 + }, + { + "epoch": 0.562223602250701, + "grad_norm": 6.811859130859375, + "learning_rate": 4.253669320536286e-05, + "loss": 1.862, + "step": 7469 + }, + { + "epoch": 0.5622988765313611, + "grad_norm": 4.710254669189453, + "learning_rate": 4.252463911000621e-05, + "loss": 1.8795, + "step": 7470 + }, + { + "epoch": 0.5623741508120214, + "grad_norm": 4.976755619049072, + "learning_rate": 4.251258545903835e-05, + "loss": 1.8689, + "step": 7471 + }, + { + "epoch": 0.5624494250926815, + "grad_norm": 4.888972759246826, + "learning_rate": 4.250053225317576e-05, + "loss": 1.9504, + "step": 7472 + }, + { + "epoch": 0.5625246993733416, + "grad_norm": 4.428262233734131, + "learning_rate": 4.2488479493135034e-05, + "loss": 2.0106, + "step": 7473 + }, + { + "epoch": 0.5625999736540017, + "grad_norm": 4.419419765472412, + "learning_rate": 4.2476427179632635e-05, + "loss": 2.2447, + "step": 7474 + }, + { + "epoch": 0.562675247934662, + "grad_norm": 3.3958523273468018, + "learning_rate": 4.246437531338506e-05, + "loss": 1.9714, + "step": 7475 + }, + { + "epoch": 0.5627505222153221, + "grad_norm": 4.874242782592773, + "learning_rate": 4.245232389510874e-05, + "loss": 2.0492, + "step": 7476 + }, + { + "epoch": 0.5628257964959822, + "grad_norm": 4.655029296875, + "learning_rate": 4.24402729255201e-05, + "loss": 2.0589, + "step": 7477 + }, + { + "epoch": 0.5629010707766424, + "grad_norm": 4.235930442810059, + "learning_rate": 4.242822240533558e-05, + "loss": 1.9571, + "step": 7478 + }, + { + "epoch": 0.5629763450573025, + "grad_norm": 5.331794738769531, + "learning_rate": 4.2416172335271476e-05, + "loss": 2.2019, + "step": 7479 + }, + { + "epoch": 0.5630516193379627, + "grad_norm": 6.205273151397705, + "learning_rate": 4.240412271604419e-05, + "loss": 2.4965, + "step": 7480 + }, + { + "epoch": 0.5631268936186229, + "grad_norm": 3.5802690982818604, + "learning_rate": 4.239207354837e-05, + "loss": 1.4024, + "step": 7481 + }, + { + "epoch": 0.563202167899283, + "grad_norm": 5.780887603759766, + "learning_rate": 4.238002483296522e-05, + "loss": 1.9583, + "step": 7482 + }, + { + "epoch": 0.5632774421799431, + "grad_norm": 4.809574127197266, + "learning_rate": 4.236797657054608e-05, + "loss": 1.9486, + "step": 7483 + }, + { + "epoch": 0.5633527164606034, + "grad_norm": 5.021664619445801, + "learning_rate": 4.2355928761828855e-05, + "loss": 1.9926, + "step": 7484 + }, + { + "epoch": 0.5634279907412635, + "grad_norm": 3.7073211669921875, + "learning_rate": 4.234388140752972e-05, + "loss": 1.6559, + "step": 7485 + }, + { + "epoch": 0.5635032650219236, + "grad_norm": 4.195054531097412, + "learning_rate": 4.233183450836487e-05, + "loss": 1.7289, + "step": 7486 + }, + { + "epoch": 0.5635785393025838, + "grad_norm": 3.6992850303649902, + "learning_rate": 4.2319788065050445e-05, + "loss": 1.7606, + "step": 7487 + }, + { + "epoch": 0.563653813583244, + "grad_norm": 4.9785661697387695, + "learning_rate": 4.230774207830261e-05, + "loss": 1.8724, + "step": 7488 + }, + { + "epoch": 0.5637290878639041, + "grad_norm": 5.745755672454834, + "learning_rate": 4.229569654883741e-05, + "loss": 1.9408, + "step": 7489 + }, + { + "epoch": 0.5638043621445643, + "grad_norm": 4.568248271942139, + "learning_rate": 4.2283651477370966e-05, + "loss": 2.0735, + "step": 7490 + }, + { + "epoch": 0.5638796364252244, + "grad_norm": 7.15548038482666, + "learning_rate": 4.227160686461928e-05, + "loss": 1.901, + "step": 7491 + }, + { + "epoch": 0.5639549107058845, + "grad_norm": 4.740743637084961, + "learning_rate": 4.22595627112984e-05, + "loss": 1.8834, + "step": 7492 + }, + { + "epoch": 0.5640301849865447, + "grad_norm": 5.510941505432129, + "learning_rate": 4.2247519018124306e-05, + "loss": 1.689, + "step": 7493 + }, + { + "epoch": 0.5641054592672049, + "grad_norm": 3.954754590988159, + "learning_rate": 4.223547578581295e-05, + "loss": 1.9628, + "step": 7494 + }, + { + "epoch": 0.564180733547865, + "grad_norm": 5.239189147949219, + "learning_rate": 4.2223433015080314e-05, + "loss": 1.9016, + "step": 7495 + }, + { + "epoch": 0.5642560078285251, + "grad_norm": 3.6925413608551025, + "learning_rate": 4.221139070664224e-05, + "loss": 1.5892, + "step": 7496 + }, + { + "epoch": 0.5643312821091854, + "grad_norm": 3.8457372188568115, + "learning_rate": 4.219934886121465e-05, + "loss": 1.8128, + "step": 7497 + }, + { + "epoch": 0.5644065563898455, + "grad_norm": 4.166738510131836, + "learning_rate": 4.218730747951338e-05, + "loss": 1.8704, + "step": 7498 + }, + { + "epoch": 0.5644818306705056, + "grad_norm": 4.019085884094238, + "learning_rate": 4.2175266562254287e-05, + "loss": 1.9747, + "step": 7499 + }, + { + "epoch": 0.5645571049511658, + "grad_norm": 4.934507369995117, + "learning_rate": 4.216322611015312e-05, + "loss": 2.0902, + "step": 7500 + }, + { + "epoch": 0.564632379231826, + "grad_norm": 5.739882469177246, + "learning_rate": 4.21511861239257e-05, + "loss": 1.8918, + "step": 7501 + }, + { + "epoch": 0.5647076535124861, + "grad_norm": 6.980597496032715, + "learning_rate": 4.2139146604287715e-05, + "loss": 1.8077, + "step": 7502 + }, + { + "epoch": 0.5647829277931463, + "grad_norm": 5.41663932800293, + "learning_rate": 4.212710755195493e-05, + "loss": 1.6561, + "step": 7503 + }, + { + "epoch": 0.5648582020738064, + "grad_norm": 4.110705852508545, + "learning_rate": 4.2115068967643e-05, + "loss": 1.781, + "step": 7504 + }, + { + "epoch": 0.5649334763544666, + "grad_norm": 4.848496437072754, + "learning_rate": 4.2103030852067605e-05, + "loss": 1.812, + "step": 7505 + }, + { + "epoch": 0.5650087506351268, + "grad_norm": 5.051675796508789, + "learning_rate": 4.209099320594436e-05, + "loss": 1.8409, + "step": 7506 + }, + { + "epoch": 0.5650840249157869, + "grad_norm": 4.491971015930176, + "learning_rate": 4.2078956029988884e-05, + "loss": 1.6414, + "step": 7507 + }, + { + "epoch": 0.565159299196447, + "grad_norm": 3.921613931655884, + "learning_rate": 4.206691932491673e-05, + "loss": 2.1208, + "step": 7508 + }, + { + "epoch": 0.5652345734771073, + "grad_norm": 4.366812705993652, + "learning_rate": 4.205488309144346e-05, + "loss": 1.9683, + "step": 7509 + }, + { + "epoch": 0.5653098477577674, + "grad_norm": 4.005619049072266, + "learning_rate": 4.20428473302846e-05, + "loss": 1.5771, + "step": 7510 + }, + { + "epoch": 0.5653851220384275, + "grad_norm": 5.860362529754639, + "learning_rate": 4.203081204215563e-05, + "loss": 1.9189, + "step": 7511 + }, + { + "epoch": 0.5654603963190876, + "grad_norm": 3.9213802814483643, + "learning_rate": 4.2018777227772034e-05, + "loss": 2.0196, + "step": 7512 + }, + { + "epoch": 0.5655356705997479, + "grad_norm": 4.284903049468994, + "learning_rate": 4.20067428878492e-05, + "loss": 1.6371, + "step": 7513 + }, + { + "epoch": 0.565610944880408, + "grad_norm": 7.116332530975342, + "learning_rate": 4.19947090231026e-05, + "loss": 2.1986, + "step": 7514 + }, + { + "epoch": 0.5656862191610681, + "grad_norm": 6.714890956878662, + "learning_rate": 4.198267563424755e-05, + "loss": 1.9509, + "step": 7515 + }, + { + "epoch": 0.5657614934417283, + "grad_norm": 5.1159257888793945, + "learning_rate": 4.1970642721999455e-05, + "loss": 2.0176, + "step": 7516 + }, + { + "epoch": 0.5658367677223884, + "grad_norm": 5.18733549118042, + "learning_rate": 4.195861028707359e-05, + "loss": 2.2061, + "step": 7517 + }, + { + "epoch": 0.5659120420030486, + "grad_norm": 6.285945892333984, + "learning_rate": 4.194657833018529e-05, + "loss": 1.9538, + "step": 7518 + }, + { + "epoch": 0.5659873162837088, + "grad_norm": 5.772747039794922, + "learning_rate": 4.193454685204979e-05, + "loss": 1.8703, + "step": 7519 + }, + { + "epoch": 0.5660625905643689, + "grad_norm": 4.274621486663818, + "learning_rate": 4.192251585338234e-05, + "loss": 1.9077, + "step": 7520 + }, + { + "epoch": 0.566137864845029, + "grad_norm": 6.213925361633301, + "learning_rate": 4.191048533489813e-05, + "loss": 2.0577, + "step": 7521 + }, + { + "epoch": 0.5662131391256893, + "grad_norm": 6.930326461791992, + "learning_rate": 4.189845529731238e-05, + "loss": 1.9268, + "step": 7522 + }, + { + "epoch": 0.5662884134063494, + "grad_norm": 4.048588752746582, + "learning_rate": 4.18864257413402e-05, + "loss": 2.0413, + "step": 7523 + }, + { + "epoch": 0.5663636876870095, + "grad_norm": 4.316848278045654, + "learning_rate": 4.187439666769672e-05, + "loss": 1.7213, + "step": 7524 + }, + { + "epoch": 0.5664389619676697, + "grad_norm": 5.2397589683532715, + "learning_rate": 4.186236807709706e-05, + "loss": 2.2517, + "step": 7525 + }, + { + "epoch": 0.5665142362483299, + "grad_norm": 5.127415657043457, + "learning_rate": 4.1850339970256266e-05, + "loss": 1.9301, + "step": 7526 + }, + { + "epoch": 0.56658951052899, + "grad_norm": 5.182537078857422, + "learning_rate": 4.183831234788938e-05, + "loss": 2.2056, + "step": 7527 + }, + { + "epoch": 0.5666647848096502, + "grad_norm": 7.227933883666992, + "learning_rate": 4.18262852107114e-05, + "loss": 1.5408, + "step": 7528 + }, + { + "epoch": 0.5667400590903103, + "grad_norm": 4.186732769012451, + "learning_rate": 4.1814258559437325e-05, + "loss": 1.7282, + "step": 7529 + }, + { + "epoch": 0.5668153333709705, + "grad_norm": 4.639697551727295, + "learning_rate": 4.180223239478208e-05, + "loss": 1.9161, + "step": 7530 + }, + { + "epoch": 0.5668906076516306, + "grad_norm": 5.565513610839844, + "learning_rate": 4.17902067174606e-05, + "loss": 1.8947, + "step": 7531 + }, + { + "epoch": 0.5669658819322908, + "grad_norm": 4.691824913024902, + "learning_rate": 4.177818152818776e-05, + "loss": 2.1495, + "step": 7532 + }, + { + "epoch": 0.5670411562129509, + "grad_norm": 6.342762470245361, + "learning_rate": 4.176615682767846e-05, + "loss": 2.3951, + "step": 7533 + }, + { + "epoch": 0.567116430493611, + "grad_norm": 4.102329730987549, + "learning_rate": 4.175413261664749e-05, + "loss": 1.8957, + "step": 7534 + }, + { + "epoch": 0.5671917047742713, + "grad_norm": 3.8884243965148926, + "learning_rate": 4.17421088958097e-05, + "loss": 1.8524, + "step": 7535 + }, + { + "epoch": 0.5672669790549314, + "grad_norm": 5.006742477416992, + "learning_rate": 4.1730085665879814e-05, + "loss": 1.8079, + "step": 7536 + }, + { + "epoch": 0.5673422533355915, + "grad_norm": 3.7115249633789062, + "learning_rate": 4.171806292757263e-05, + "loss": 1.8584, + "step": 7537 + }, + { + "epoch": 0.5674175276162517, + "grad_norm": 4.174636363983154, + "learning_rate": 4.170604068160282e-05, + "loss": 1.8813, + "step": 7538 + }, + { + "epoch": 0.5674928018969119, + "grad_norm": 5.886826515197754, + "learning_rate": 4.1694018928685114e-05, + "loss": 2.1428, + "step": 7539 + }, + { + "epoch": 0.567568076177572, + "grad_norm": 4.3213605880737305, + "learning_rate": 4.1681997669534134e-05, + "loss": 1.9808, + "step": 7540 + }, + { + "epoch": 0.5676433504582322, + "grad_norm": 3.2923669815063477, + "learning_rate": 4.166997690486452e-05, + "loss": 2.3043, + "step": 7541 + }, + { + "epoch": 0.5677186247388923, + "grad_norm": 4.019612789154053, + "learning_rate": 4.1657956635390896e-05, + "loss": 1.938, + "step": 7542 + }, + { + "epoch": 0.5677938990195525, + "grad_norm": 6.246671676635742, + "learning_rate": 4.164593686182779e-05, + "loss": 2.0247, + "step": 7543 + }, + { + "epoch": 0.5678691733002127, + "grad_norm": 4.0431623458862305, + "learning_rate": 4.1633917584889804e-05, + "loss": 1.9226, + "step": 7544 + }, + { + "epoch": 0.5679444475808728, + "grad_norm": 4.333072662353516, + "learning_rate": 4.1621898805291384e-05, + "loss": 1.9595, + "step": 7545 + }, + { + "epoch": 0.5680197218615329, + "grad_norm": 4.088685035705566, + "learning_rate": 4.160988052374706e-05, + "loss": 1.9298, + "step": 7546 + }, + { + "epoch": 0.5680949961421932, + "grad_norm": 4.085568904876709, + "learning_rate": 4.159786274097125e-05, + "loss": 2.0882, + "step": 7547 + }, + { + "epoch": 0.5681702704228533, + "grad_norm": 3.7184553146362305, + "learning_rate": 4.15858454576784e-05, + "loss": 1.8381, + "step": 7548 + }, + { + "epoch": 0.5682455447035134, + "grad_norm": 8.849818229675293, + "learning_rate": 4.157382867458288e-05, + "loss": 1.8773, + "step": 7549 + }, + { + "epoch": 0.5683208189841736, + "grad_norm": 4.118654251098633, + "learning_rate": 4.156181239239909e-05, + "loss": 1.776, + "step": 7550 + }, + { + "epoch": 0.5683960932648338, + "grad_norm": 4.431726932525635, + "learning_rate": 4.154979661184132e-05, + "loss": 1.9969, + "step": 7551 + }, + { + "epoch": 0.5684713675454939, + "grad_norm": 4.57051944732666, + "learning_rate": 4.153778133362391e-05, + "loss": 1.8497, + "step": 7552 + }, + { + "epoch": 0.568546641826154, + "grad_norm": 5.797927379608154, + "learning_rate": 4.15257665584611e-05, + "loss": 1.9165, + "step": 7553 + }, + { + "epoch": 0.5686219161068142, + "grad_norm": 4.613893985748291, + "learning_rate": 4.151375228706719e-05, + "loss": 2.1549, + "step": 7554 + }, + { + "epoch": 0.5686971903874743, + "grad_norm": 4.822774887084961, + "learning_rate": 4.150173852015632e-05, + "loss": 1.9217, + "step": 7555 + }, + { + "epoch": 0.5687724646681345, + "grad_norm": 3.97965145111084, + "learning_rate": 4.148972525844272e-05, + "loss": 2.1573, + "step": 7556 + }, + { + "epoch": 0.5688477389487947, + "grad_norm": 4.471109390258789, + "learning_rate": 4.147771250264055e-05, + "loss": 2.0096, + "step": 7557 + }, + { + "epoch": 0.5689230132294548, + "grad_norm": 3.559664726257324, + "learning_rate": 4.1465700253463895e-05, + "loss": 1.8905, + "step": 7558 + }, + { + "epoch": 0.5689982875101149, + "grad_norm": 5.800527095794678, + "learning_rate": 4.145368851162689e-05, + "loss": 2.2562, + "step": 7559 + }, + { + "epoch": 0.5690735617907752, + "grad_norm": 5.9950714111328125, + "learning_rate": 4.144167727784357e-05, + "loss": 2.1329, + "step": 7560 + }, + { + "epoch": 0.5691488360714353, + "grad_norm": 4.242916584014893, + "learning_rate": 4.1429666552827994e-05, + "loss": 2.0566, + "step": 7561 + }, + { + "epoch": 0.5692241103520954, + "grad_norm": 6.299023628234863, + "learning_rate": 4.141765633729413e-05, + "loss": 2.287, + "step": 7562 + }, + { + "epoch": 0.5692993846327556, + "grad_norm": 5.601983547210693, + "learning_rate": 4.1405646631955996e-05, + "loss": 1.8472, + "step": 7563 + }, + { + "epoch": 0.5693746589134158, + "grad_norm": 5.557967662811279, + "learning_rate": 4.139363743752749e-05, + "loss": 2.0713, + "step": 7564 + }, + { + "epoch": 0.5694499331940759, + "grad_norm": 3.4045512676239014, + "learning_rate": 4.138162875472256e-05, + "loss": 1.8601, + "step": 7565 + }, + { + "epoch": 0.5695252074747361, + "grad_norm": 5.910848140716553, + "learning_rate": 4.136962058425505e-05, + "loss": 1.712, + "step": 7566 + }, + { + "epoch": 0.5696004817553962, + "grad_norm": 5.968437671661377, + "learning_rate": 4.135761292683886e-05, + "loss": 2.1861, + "step": 7567 + }, + { + "epoch": 0.5696757560360564, + "grad_norm": 5.811207294464111, + "learning_rate": 4.134560578318776e-05, + "loss": 1.5196, + "step": 7568 + }, + { + "epoch": 0.5697510303167166, + "grad_norm": 4.529589653015137, + "learning_rate": 4.1333599154015564e-05, + "loss": 1.8235, + "step": 7569 + }, + { + "epoch": 0.5698263045973767, + "grad_norm": 3.7079923152923584, + "learning_rate": 4.132159304003604e-05, + "loss": 1.8984, + "step": 7570 + }, + { + "epoch": 0.5699015788780368, + "grad_norm": 4.084473133087158, + "learning_rate": 4.130958744196291e-05, + "loss": 2.0156, + "step": 7571 + }, + { + "epoch": 0.569976853158697, + "grad_norm": 4.172169208526611, + "learning_rate": 4.129758236050987e-05, + "loss": 1.9223, + "step": 7572 + }, + { + "epoch": 0.5700521274393572, + "grad_norm": 3.107506036758423, + "learning_rate": 4.1285577796390576e-05, + "loss": 1.4488, + "step": 7573 + }, + { + "epoch": 0.5701274017200173, + "grad_norm": 4.356510162353516, + "learning_rate": 4.127357375031871e-05, + "loss": 1.9654, + "step": 7574 + }, + { + "epoch": 0.5702026760006774, + "grad_norm": 5.4899187088012695, + "learning_rate": 4.126157022300781e-05, + "loss": 2.0589, + "step": 7575 + }, + { + "epoch": 0.5702779502813377, + "grad_norm": 4.5292792320251465, + "learning_rate": 4.124956721517151e-05, + "loss": 2.0968, + "step": 7576 + }, + { + "epoch": 0.5703532245619978, + "grad_norm": 4.7366042137146, + "learning_rate": 4.1237564727523315e-05, + "loss": 1.8485, + "step": 7577 + }, + { + "epoch": 0.5704284988426579, + "grad_norm": 5.013432025909424, + "learning_rate": 4.122556276077677e-05, + "loss": 1.416, + "step": 7578 + }, + { + "epoch": 0.5705037731233181, + "grad_norm": 5.654892921447754, + "learning_rate": 4.121356131564533e-05, + "loss": 1.777, + "step": 7579 + }, + { + "epoch": 0.5705790474039782, + "grad_norm": 6.319755554199219, + "learning_rate": 4.120156039284246e-05, + "loss": 1.8112, + "step": 7580 + }, + { + "epoch": 0.5706543216846384, + "grad_norm": 4.170255661010742, + "learning_rate": 4.11895599930816e-05, + "loss": 1.8495, + "step": 7581 + }, + { + "epoch": 0.5707295959652986, + "grad_norm": 4.765605926513672, + "learning_rate": 4.1177560117076095e-05, + "loss": 1.5139, + "step": 7582 + }, + { + "epoch": 0.5708048702459587, + "grad_norm": 4.976629257202148, + "learning_rate": 4.1165560765539326e-05, + "loss": 2.1208, + "step": 7583 + }, + { + "epoch": 0.5708801445266188, + "grad_norm": 3.914757013320923, + "learning_rate": 4.115356193918464e-05, + "loss": 1.5957, + "step": 7584 + }, + { + "epoch": 0.5709554188072791, + "grad_norm": 4.362957000732422, + "learning_rate": 4.11415636387253e-05, + "loss": 2.5513, + "step": 7585 + }, + { + "epoch": 0.5710306930879392, + "grad_norm": 7.167521953582764, + "learning_rate": 4.112956586487458e-05, + "loss": 2.1259, + "step": 7586 + }, + { + "epoch": 0.5711059673685993, + "grad_norm": 6.460814476013184, + "learning_rate": 4.1117568618345713e-05, + "loss": 2.0566, + "step": 7587 + }, + { + "epoch": 0.5711812416492595, + "grad_norm": 5.285572052001953, + "learning_rate": 4.110557189985193e-05, + "loss": 1.562, + "step": 7588 + }, + { + "epoch": 0.5712565159299197, + "grad_norm": 4.9157867431640625, + "learning_rate": 4.109357571010635e-05, + "loss": 1.9378, + "step": 7589 + }, + { + "epoch": 0.5713317902105798, + "grad_norm": 5.586403846740723, + "learning_rate": 4.108158004982214e-05, + "loss": 1.9796, + "step": 7590 + }, + { + "epoch": 0.5714070644912399, + "grad_norm": 4.578375816345215, + "learning_rate": 4.106958491971243e-05, + "loss": 1.7201, + "step": 7591 + }, + { + "epoch": 0.5714823387719001, + "grad_norm": 5.858940601348877, + "learning_rate": 4.1057590320490254e-05, + "loss": 2.0072, + "step": 7592 + }, + { + "epoch": 0.5715576130525603, + "grad_norm": 4.025996685028076, + "learning_rate": 4.104559625286868e-05, + "loss": 1.927, + "step": 7593 + }, + { + "epoch": 0.5716328873332204, + "grad_norm": 5.558964252471924, + "learning_rate": 4.103360271756071e-05, + "loss": 1.7311, + "step": 7594 + }, + { + "epoch": 0.5717081616138806, + "grad_norm": 4.400188446044922, + "learning_rate": 4.102160971527935e-05, + "loss": 2.0643, + "step": 7595 + }, + { + "epoch": 0.5717834358945407, + "grad_norm": 4.661321640014648, + "learning_rate": 4.10096172467375e-05, + "loss": 1.6388, + "step": 7596 + }, + { + "epoch": 0.5718587101752008, + "grad_norm": 5.56693172454834, + "learning_rate": 4.099762531264813e-05, + "loss": 1.9227, + "step": 7597 + }, + { + "epoch": 0.5719339844558611, + "grad_norm": 5.320420265197754, + "learning_rate": 4.098563391372409e-05, + "loss": 2.1256, + "step": 7598 + }, + { + "epoch": 0.5720092587365212, + "grad_norm": 4.389521598815918, + "learning_rate": 4.0973643050678274e-05, + "loss": 2.0456, + "step": 7599 + }, + { + "epoch": 0.5720845330171813, + "grad_norm": 5.7201828956604, + "learning_rate": 4.0961652724223456e-05, + "loss": 1.8459, + "step": 7600 + }, + { + "epoch": 0.5721598072978415, + "grad_norm": 6.614492416381836, + "learning_rate": 4.094966293507246e-05, + "loss": 1.701, + "step": 7601 + }, + { + "epoch": 0.5722350815785017, + "grad_norm": 5.0776190757751465, + "learning_rate": 4.0937673683938026e-05, + "loss": 1.8044, + "step": 7602 + }, + { + "epoch": 0.5723103558591618, + "grad_norm": 4.892368793487549, + "learning_rate": 4.0925684971532896e-05, + "loss": 1.9371, + "step": 7603 + }, + { + "epoch": 0.572385630139822, + "grad_norm": 4.289630889892578, + "learning_rate": 4.091369679856975e-05, + "loss": 1.8379, + "step": 7604 + }, + { + "epoch": 0.5724609044204821, + "grad_norm": 5.6774773597717285, + "learning_rate": 4.090170916576125e-05, + "loss": 1.6623, + "step": 7605 + }, + { + "epoch": 0.5725361787011423, + "grad_norm": 6.10878849029541, + "learning_rate": 4.088972207382006e-05, + "loss": 2.0404, + "step": 7606 + }, + { + "epoch": 0.5726114529818025, + "grad_norm": 3.3511202335357666, + "learning_rate": 4.087773552345873e-05, + "loss": 1.8754, + "step": 7607 + }, + { + "epoch": 0.5726867272624626, + "grad_norm": 4.990893840789795, + "learning_rate": 4.086574951538987e-05, + "loss": 1.7503, + "step": 7608 + }, + { + "epoch": 0.5727620015431227, + "grad_norm": 4.608138561248779, + "learning_rate": 4.0853764050325975e-05, + "loss": 1.956, + "step": 7609 + }, + { + "epoch": 0.5728372758237829, + "grad_norm": 5.314117431640625, + "learning_rate": 4.084177912897957e-05, + "loss": 2.2445, + "step": 7610 + }, + { + "epoch": 0.5729125501044431, + "grad_norm": 3.822608709335327, + "learning_rate": 4.082979475206311e-05, + "loss": 1.7419, + "step": 7611 + }, + { + "epoch": 0.5729878243851032, + "grad_norm": 4.511881351470947, + "learning_rate": 4.0817810920289054e-05, + "loss": 1.7492, + "step": 7612 + }, + { + "epoch": 0.5730630986657633, + "grad_norm": 12.96013355255127, + "learning_rate": 4.080582763436978e-05, + "loss": 2.2388, + "step": 7613 + }, + { + "epoch": 0.5731383729464236, + "grad_norm": 6.898222923278809, + "learning_rate": 4.079384489501768e-05, + "loss": 2.2605, + "step": 7614 + }, + { + "epoch": 0.5732136472270837, + "grad_norm": 6.851225852966309, + "learning_rate": 4.078186270294507e-05, + "loss": 1.922, + "step": 7615 + }, + { + "epoch": 0.5732889215077438, + "grad_norm": 4.838808536529541, + "learning_rate": 4.07698810588643e-05, + "loss": 1.9222, + "step": 7616 + }, + { + "epoch": 0.573364195788404, + "grad_norm": 3.6850783824920654, + "learning_rate": 4.075789996348759e-05, + "loss": 1.9528, + "step": 7617 + }, + { + "epoch": 0.5734394700690641, + "grad_norm": 4.440414905548096, + "learning_rate": 4.074591941752724e-05, + "loss": 1.6703, + "step": 7618 + }, + { + "epoch": 0.5735147443497243, + "grad_norm": 5.036422252655029, + "learning_rate": 4.073393942169539e-05, + "loss": 1.9079, + "step": 7619 + }, + { + "epoch": 0.5735900186303845, + "grad_norm": 4.969206809997559, + "learning_rate": 4.0721959976704274e-05, + "loss": 1.9456, + "step": 7620 + }, + { + "epoch": 0.5736652929110446, + "grad_norm": 4.848805904388428, + "learning_rate": 4.0709981083266005e-05, + "loss": 2.345, + "step": 7621 + }, + { + "epoch": 0.5737405671917047, + "grad_norm": 3.982607126235962, + "learning_rate": 4.069800274209271e-05, + "loss": 1.835, + "step": 7622 + }, + { + "epoch": 0.573815841472365, + "grad_norm": 5.371452808380127, + "learning_rate": 4.0686024953896476e-05, + "loss": 1.6242, + "step": 7623 + }, + { + "epoch": 0.5738911157530251, + "grad_norm": 3.659156322479248, + "learning_rate": 4.0674047719389316e-05, + "loss": 2.0293, + "step": 7624 + }, + { + "epoch": 0.5739663900336852, + "grad_norm": 4.546854496002197, + "learning_rate": 4.0662071039283266e-05, + "loss": 1.6729, + "step": 7625 + }, + { + "epoch": 0.5740416643143454, + "grad_norm": 5.95877742767334, + "learning_rate": 4.065009491429029e-05, + "loss": 2.0728, + "step": 7626 + }, + { + "epoch": 0.5741169385950056, + "grad_norm": 5.082108020782471, + "learning_rate": 4.0638119345122367e-05, + "loss": 1.9028, + "step": 7627 + }, + { + "epoch": 0.5741922128756657, + "grad_norm": 4.440121173858643, + "learning_rate": 4.0626144332491366e-05, + "loss": 1.6467, + "step": 7628 + }, + { + "epoch": 0.5742674871563259, + "grad_norm": 3.6257505416870117, + "learning_rate": 4.061416987710921e-05, + "loss": 1.6215, + "step": 7629 + }, + { + "epoch": 0.574342761436986, + "grad_norm": 4.495459079742432, + "learning_rate": 4.0602195979687694e-05, + "loss": 1.7879, + "step": 7630 + }, + { + "epoch": 0.5744180357176462, + "grad_norm": 4.656881332397461, + "learning_rate": 4.059022264093868e-05, + "loss": 1.9973, + "step": 7631 + }, + { + "epoch": 0.5744933099983063, + "grad_norm": 6.694538593292236, + "learning_rate": 4.057824986157392e-05, + "loss": 2.1799, + "step": 7632 + }, + { + "epoch": 0.5745685842789665, + "grad_norm": 4.634024620056152, + "learning_rate": 4.056627764230519e-05, + "loss": 1.6009, + "step": 7633 + }, + { + "epoch": 0.5746438585596266, + "grad_norm": 3.561347484588623, + "learning_rate": 4.055430598384416e-05, + "loss": 1.7629, + "step": 7634 + }, + { + "epoch": 0.5747191328402868, + "grad_norm": 5.816922664642334, + "learning_rate": 4.0542334886902564e-05, + "loss": 2.0131, + "step": 7635 + }, + { + "epoch": 0.574794407120947, + "grad_norm": 5.384393692016602, + "learning_rate": 4.0530364352192e-05, + "loss": 1.8412, + "step": 7636 + }, + { + "epoch": 0.5748696814016071, + "grad_norm": 5.188614368438721, + "learning_rate": 4.0518394380424105e-05, + "loss": 1.9094, + "step": 7637 + }, + { + "epoch": 0.5749449556822672, + "grad_norm": 5.6943182945251465, + "learning_rate": 4.050642497231047e-05, + "loss": 1.7961, + "step": 7638 + }, + { + "epoch": 0.5750202299629275, + "grad_norm": 4.254000663757324, + "learning_rate": 4.0494456128562616e-05, + "loss": 1.8426, + "step": 7639 + }, + { + "epoch": 0.5750955042435876, + "grad_norm": 3.4602999687194824, + "learning_rate": 4.048248784989209e-05, + "loss": 1.9444, + "step": 7640 + }, + { + "epoch": 0.5751707785242477, + "grad_norm": 3.9344232082366943, + "learning_rate": 4.047052013701034e-05, + "loss": 1.6866, + "step": 7641 + }, + { + "epoch": 0.5752460528049079, + "grad_norm": 4.342310905456543, + "learning_rate": 4.045855299062883e-05, + "loss": 1.7588, + "step": 7642 + }, + { + "epoch": 0.575321327085568, + "grad_norm": 4.8720622062683105, + "learning_rate": 4.0446586411458955e-05, + "loss": 1.6449, + "step": 7643 + }, + { + "epoch": 0.5753966013662282, + "grad_norm": 5.865235805511475, + "learning_rate": 4.0434620400212126e-05, + "loss": 1.6086, + "step": 7644 + }, + { + "epoch": 0.5754718756468884, + "grad_norm": 4.478507995605469, + "learning_rate": 4.042265495759965e-05, + "loss": 1.4689, + "step": 7645 + }, + { + "epoch": 0.5755471499275485, + "grad_norm": 5.448455810546875, + "learning_rate": 4.041069008433287e-05, + "loss": 1.9208, + "step": 7646 + }, + { + "epoch": 0.5756224242082086, + "grad_norm": 5.161365985870361, + "learning_rate": 4.039872578112304e-05, + "loss": 1.9861, + "step": 7647 + }, + { + "epoch": 0.5756976984888689, + "grad_norm": 5.40657901763916, + "learning_rate": 4.038676204868142e-05, + "loss": 1.7774, + "step": 7648 + }, + { + "epoch": 0.575772972769529, + "grad_norm": 4.4866557121276855, + "learning_rate": 4.03747988877192e-05, + "loss": 1.7399, + "step": 7649 + }, + { + "epoch": 0.5758482470501891, + "grad_norm": 4.824374198913574, + "learning_rate": 4.036283629894759e-05, + "loss": 1.9157, + "step": 7650 + }, + { + "epoch": 0.5759235213308492, + "grad_norm": 5.27728271484375, + "learning_rate": 4.035087428307769e-05, + "loss": 1.5463, + "step": 7651 + }, + { + "epoch": 0.5759987956115095, + "grad_norm": 6.984309673309326, + "learning_rate": 4.033891284082064e-05, + "loss": 1.8198, + "step": 7652 + }, + { + "epoch": 0.5760740698921696, + "grad_norm": 4.725070953369141, + "learning_rate": 4.032695197288748e-05, + "loss": 1.8195, + "step": 7653 + }, + { + "epoch": 0.5761493441728297, + "grad_norm": 4.154239177703857, + "learning_rate": 4.0314991679989286e-05, + "loss": 1.8958, + "step": 7654 + }, + { + "epoch": 0.5762246184534899, + "grad_norm": 4.55750846862793, + "learning_rate": 4.030303196283706e-05, + "loss": 1.9958, + "step": 7655 + }, + { + "epoch": 0.57629989273415, + "grad_norm": 5.894731044769287, + "learning_rate": 4.029107282214174e-05, + "loss": 2.026, + "step": 7656 + }, + { + "epoch": 0.5763751670148102, + "grad_norm": 5.585499286651611, + "learning_rate": 4.02791142586143e-05, + "loss": 1.973, + "step": 7657 + }, + { + "epoch": 0.5764504412954704, + "grad_norm": 4.196101188659668, + "learning_rate": 4.026715627296561e-05, + "loss": 1.8333, + "step": 7658 + }, + { + "epoch": 0.5765257155761305, + "grad_norm": 5.871015548706055, + "learning_rate": 4.025519886590656e-05, + "loss": 2.0052, + "step": 7659 + }, + { + "epoch": 0.5766009898567906, + "grad_norm": 5.82728910446167, + "learning_rate": 4.024324203814797e-05, + "loss": 1.9099, + "step": 7660 + }, + { + "epoch": 0.5766762641374509, + "grad_norm": 4.0447821617126465, + "learning_rate": 4.023128579040066e-05, + "loss": 2.0021, + "step": 7661 + }, + { + "epoch": 0.576751538418111, + "grad_norm": 3.868428945541382, + "learning_rate": 4.021933012337536e-05, + "loss": 1.8261, + "step": 7662 + }, + { + "epoch": 0.5768268126987711, + "grad_norm": 5.3470354080200195, + "learning_rate": 4.020737503778284e-05, + "loss": 2.1472, + "step": 7663 + }, + { + "epoch": 0.5769020869794313, + "grad_norm": 4.169758319854736, + "learning_rate": 4.019542053433376e-05, + "loss": 1.6341, + "step": 7664 + }, + { + "epoch": 0.5769773612600915, + "grad_norm": 5.550473690032959, + "learning_rate": 4.018346661373881e-05, + "loss": 2.0297, + "step": 7665 + }, + { + "epoch": 0.5770526355407516, + "grad_norm": 4.343144416809082, + "learning_rate": 4.017151327670858e-05, + "loss": 1.7207, + "step": 7666 + }, + { + "epoch": 0.5771279098214118, + "grad_norm": 3.7713727951049805, + "learning_rate": 4.0159560523953715e-05, + "loss": 1.7291, + "step": 7667 + }, + { + "epoch": 0.5772031841020719, + "grad_norm": 5.782696723937988, + "learning_rate": 4.014760835618472e-05, + "loss": 1.635, + "step": 7668 + }, + { + "epoch": 0.5772784583827321, + "grad_norm": 5.870705604553223, + "learning_rate": 4.013565677411214e-05, + "loss": 1.7758, + "step": 7669 + }, + { + "epoch": 0.5773537326633922, + "grad_norm": 4.923253536224365, + "learning_rate": 4.0123705778446464e-05, + "loss": 1.8138, + "step": 7670 + }, + { + "epoch": 0.5774290069440524, + "grad_norm": 5.282963752746582, + "learning_rate": 4.011175536989814e-05, + "loss": 1.9698, + "step": 7671 + }, + { + "epoch": 0.5775042812247125, + "grad_norm": 5.4413933753967285, + "learning_rate": 4.00998055491776e-05, + "loss": 1.7254, + "step": 7672 + }, + { + "epoch": 0.5775795555053727, + "grad_norm": 5.781665802001953, + "learning_rate": 4.0087856316995195e-05, + "loss": 2.0015, + "step": 7673 + }, + { + "epoch": 0.5776548297860329, + "grad_norm": 4.89331579208374, + "learning_rate": 4.007590767406131e-05, + "loss": 2.1218, + "step": 7674 + }, + { + "epoch": 0.577730104066693, + "grad_norm": 4.19381856918335, + "learning_rate": 4.006395962108622e-05, + "loss": 1.7952, + "step": 7675 + }, + { + "epoch": 0.5778053783473531, + "grad_norm": 4.383309841156006, + "learning_rate": 4.005201215878023e-05, + "loss": 1.5965, + "step": 7676 + }, + { + "epoch": 0.5778806526280134, + "grad_norm": 4.509468078613281, + "learning_rate": 4.004006528785357e-05, + "loss": 1.8957, + "step": 7677 + }, + { + "epoch": 0.5779559269086735, + "grad_norm": 5.223258972167969, + "learning_rate": 4.0028119009016465e-05, + "loss": 1.6935, + "step": 7678 + }, + { + "epoch": 0.5780312011893336, + "grad_norm": 4.945888996124268, + "learning_rate": 4.001617332297905e-05, + "loss": 1.6292, + "step": 7679 + }, + { + "epoch": 0.5781064754699938, + "grad_norm": 5.944561958312988, + "learning_rate": 4.00042282304515e-05, + "loss": 2.3109, + "step": 7680 + }, + { + "epoch": 0.578181749750654, + "grad_norm": 5.593074321746826, + "learning_rate": 3.999228373214388e-05, + "loss": 1.9073, + "step": 7681 + }, + { + "epoch": 0.5782570240313141, + "grad_norm": 6.466760635375977, + "learning_rate": 3.99803398287663e-05, + "loss": 1.9025, + "step": 7682 + }, + { + "epoch": 0.5783322983119743, + "grad_norm": 3.901648998260498, + "learning_rate": 3.9968396521028754e-05, + "loss": 1.5699, + "step": 7683 + }, + { + "epoch": 0.5784075725926344, + "grad_norm": 4.755966663360596, + "learning_rate": 3.995645380964127e-05, + "loss": 1.9468, + "step": 7684 + }, + { + "epoch": 0.5784828468732945, + "grad_norm": 4.47395658493042, + "learning_rate": 3.994451169531376e-05, + "loss": 1.7155, + "step": 7685 + }, + { + "epoch": 0.5785581211539548, + "grad_norm": 5.608683109283447, + "learning_rate": 3.993257017875619e-05, + "loss": 1.8306, + "step": 7686 + }, + { + "epoch": 0.5786333954346149, + "grad_norm": 4.782827854156494, + "learning_rate": 3.992062926067844e-05, + "loss": 2.1272, + "step": 7687 + }, + { + "epoch": 0.578708669715275, + "grad_norm": 6.436601161956787, + "learning_rate": 3.990868894179035e-05, + "loss": 2.0505, + "step": 7688 + }, + { + "epoch": 0.5787839439959351, + "grad_norm": 4.658920764923096, + "learning_rate": 3.9896749222801765e-05, + "loss": 2.0737, + "step": 7689 + }, + { + "epoch": 0.5788592182765954, + "grad_norm": 3.5421383380889893, + "learning_rate": 3.9884810104422434e-05, + "loss": 1.906, + "step": 7690 + }, + { + "epoch": 0.5789344925572555, + "grad_norm": 4.739721775054932, + "learning_rate": 3.987287158736213e-05, + "loss": 1.9066, + "step": 7691 + }, + { + "epoch": 0.5790097668379156, + "grad_norm": 4.567431449890137, + "learning_rate": 3.986093367233054e-05, + "loss": 2.7803, + "step": 7692 + }, + { + "epoch": 0.5790850411185758, + "grad_norm": 6.311440944671631, + "learning_rate": 3.984899636003736e-05, + "loss": 2.198, + "step": 7693 + }, + { + "epoch": 0.579160315399236, + "grad_norm": 3.944124698638916, + "learning_rate": 3.983705965119221e-05, + "loss": 2.3358, + "step": 7694 + }, + { + "epoch": 0.5792355896798961, + "grad_norm": 5.065308570861816, + "learning_rate": 3.982512354650472e-05, + "loss": 1.7071, + "step": 7695 + }, + { + "epoch": 0.5793108639605563, + "grad_norm": 8.038674354553223, + "learning_rate": 3.981318804668442e-05, + "loss": 2.0069, + "step": 7696 + }, + { + "epoch": 0.5793861382412164, + "grad_norm": 4.614602088928223, + "learning_rate": 3.980125315244088e-05, + "loss": 1.814, + "step": 7697 + }, + { + "epoch": 0.5794614125218766, + "grad_norm": 6.746983528137207, + "learning_rate": 3.978931886448355e-05, + "loss": 1.82, + "step": 7698 + }, + { + "epoch": 0.5795366868025368, + "grad_norm": 3.966038227081299, + "learning_rate": 3.977738518352195e-05, + "loss": 1.4564, + "step": 7699 + }, + { + "epoch": 0.5796119610831969, + "grad_norm": 5.068717002868652, + "learning_rate": 3.976545211026545e-05, + "loss": 1.5265, + "step": 7700 + }, + { + "epoch": 0.579687235363857, + "grad_norm": 5.492524147033691, + "learning_rate": 3.975351964542346e-05, + "loss": 2.0166, + "step": 7701 + }, + { + "epoch": 0.5797625096445173, + "grad_norm": 4.053000450134277, + "learning_rate": 3.974158778970532e-05, + "loss": 2.267, + "step": 7702 + }, + { + "epoch": 0.5798377839251774, + "grad_norm": 4.663575649261475, + "learning_rate": 3.9729656543820346e-05, + "loss": 1.7, + "step": 7703 + }, + { + "epoch": 0.5799130582058375, + "grad_norm": 5.478631019592285, + "learning_rate": 3.971772590847783e-05, + "loss": 1.8409, + "step": 7704 + }, + { + "epoch": 0.5799883324864977, + "grad_norm": 3.9644811153411865, + "learning_rate": 3.970579588438701e-05, + "loss": 1.8415, + "step": 7705 + }, + { + "epoch": 0.5800636067671578, + "grad_norm": 3.0490872859954834, + "learning_rate": 3.969386647225709e-05, + "loss": 2.3869, + "step": 7706 + }, + { + "epoch": 0.580138881047818, + "grad_norm": 3.567423105239868, + "learning_rate": 3.968193767279722e-05, + "loss": 2.0545, + "step": 7707 + }, + { + "epoch": 0.5802141553284781, + "grad_norm": 6.265186309814453, + "learning_rate": 3.9670009486716566e-05, + "loss": 1.9483, + "step": 7708 + }, + { + "epoch": 0.5802894296091383, + "grad_norm": 4.558940410614014, + "learning_rate": 3.96580819147242e-05, + "loss": 1.8082, + "step": 7709 + }, + { + "epoch": 0.5803647038897984, + "grad_norm": 4.768979549407959, + "learning_rate": 3.9646154957529196e-05, + "loss": 2.0926, + "step": 7710 + }, + { + "epoch": 0.5804399781704586, + "grad_norm": 5.609062671661377, + "learning_rate": 3.963422861584056e-05, + "loss": 1.5971, + "step": 7711 + }, + { + "epoch": 0.5805152524511188, + "grad_norm": 6.203240394592285, + "learning_rate": 3.9622302890367316e-05, + "loss": 1.811, + "step": 7712 + }, + { + "epoch": 0.5805905267317789, + "grad_norm": 5.027291774749756, + "learning_rate": 3.9610377781818356e-05, + "loss": 2.0047, + "step": 7713 + }, + { + "epoch": 0.580665801012439, + "grad_norm": 5.699218273162842, + "learning_rate": 3.9598453290902646e-05, + "loss": 1.6443, + "step": 7714 + }, + { + "epoch": 0.5807410752930993, + "grad_norm": 4.884763717651367, + "learning_rate": 3.9586529418329024e-05, + "loss": 1.8848, + "step": 7715 + }, + { + "epoch": 0.5808163495737594, + "grad_norm": 4.573393821716309, + "learning_rate": 3.957460616480637e-05, + "loss": 1.9472, + "step": 7716 + }, + { + "epoch": 0.5808916238544195, + "grad_norm": 4.217409133911133, + "learning_rate": 3.956268353104346e-05, + "loss": 1.9957, + "step": 7717 + }, + { + "epoch": 0.5809668981350797, + "grad_norm": 4.545864582061768, + "learning_rate": 3.9550761517749046e-05, + "loss": 1.8175, + "step": 7718 + }, + { + "epoch": 0.5810421724157399, + "grad_norm": 4.692615032196045, + "learning_rate": 3.9538840125631904e-05, + "loss": 1.9325, + "step": 7719 + }, + { + "epoch": 0.5811174466964, + "grad_norm": 7.264098167419434, + "learning_rate": 3.952691935540068e-05, + "loss": 2.0231, + "step": 7720 + }, + { + "epoch": 0.5811927209770602, + "grad_norm": 5.382420063018799, + "learning_rate": 3.9514999207764064e-05, + "loss": 2.099, + "step": 7721 + }, + { + "epoch": 0.5812679952577203, + "grad_norm": 8.596435546875, + "learning_rate": 3.950307968343065e-05, + "loss": 2.2097, + "step": 7722 + }, + { + "epoch": 0.5813432695383804, + "grad_norm": 6.394593238830566, + "learning_rate": 3.949116078310905e-05, + "loss": 2.124, + "step": 7723 + }, + { + "epoch": 0.5814185438190407, + "grad_norm": 5.426838397979736, + "learning_rate": 3.947924250750776e-05, + "loss": 2.133, + "step": 7724 + }, + { + "epoch": 0.5814938180997008, + "grad_norm": 5.8588151931762695, + "learning_rate": 3.946732485733533e-05, + "loss": 1.772, + "step": 7725 + }, + { + "epoch": 0.5815690923803609, + "grad_norm": 4.996241092681885, + "learning_rate": 3.9455407833300206e-05, + "loss": 1.612, + "step": 7726 + }, + { + "epoch": 0.5816443666610212, + "grad_norm": 4.909712791442871, + "learning_rate": 3.9443491436110856e-05, + "loss": 2.071, + "step": 7727 + }, + { + "epoch": 0.5817196409416813, + "grad_norm": 4.151638507843018, + "learning_rate": 3.9431575666475625e-05, + "loss": 1.8044, + "step": 7728 + }, + { + "epoch": 0.5817949152223414, + "grad_norm": 4.3523406982421875, + "learning_rate": 3.941966052510292e-05, + "loss": 2.0515, + "step": 7729 + }, + { + "epoch": 0.5818701895030015, + "grad_norm": 6.174666404724121, + "learning_rate": 3.940774601270101e-05, + "loss": 1.9962, + "step": 7730 + }, + { + "epoch": 0.5819454637836617, + "grad_norm": 5.057032108306885, + "learning_rate": 3.939583212997823e-05, + "loss": 1.85, + "step": 7731 + }, + { + "epoch": 0.5820207380643219, + "grad_norm": 4.086638450622559, + "learning_rate": 3.938391887764278e-05, + "loss": 1.7365, + "step": 7732 + }, + { + "epoch": 0.582096012344982, + "grad_norm": 4.138251781463623, + "learning_rate": 3.9372006256402916e-05, + "loss": 1.6448, + "step": 7733 + }, + { + "epoch": 0.5821712866256422, + "grad_norm": 4.54074239730835, + "learning_rate": 3.9360094266966767e-05, + "loss": 1.9168, + "step": 7734 + }, + { + "epoch": 0.5822465609063023, + "grad_norm": 4.207004070281982, + "learning_rate": 3.934818291004248e-05, + "loss": 1.7292, + "step": 7735 + }, + { + "epoch": 0.5823218351869625, + "grad_norm": 4.732818603515625, + "learning_rate": 3.933627218633818e-05, + "loss": 2.0884, + "step": 7736 + }, + { + "epoch": 0.5823971094676227, + "grad_norm": 5.5070085525512695, + "learning_rate": 3.9324362096561887e-05, + "loss": 1.8961, + "step": 7737 + }, + { + "epoch": 0.5824723837482828, + "grad_norm": 4.441492557525635, + "learning_rate": 3.931245264142163e-05, + "loss": 2.054, + "step": 7738 + }, + { + "epoch": 0.5825476580289429, + "grad_norm": 5.317041397094727, + "learning_rate": 3.93005438216254e-05, + "loss": 1.9607, + "step": 7739 + }, + { + "epoch": 0.5826229323096032, + "grad_norm": 4.862093925476074, + "learning_rate": 3.928863563788116e-05, + "loss": 1.7507, + "step": 7740 + }, + { + "epoch": 0.5826982065902633, + "grad_norm": 4.7248029708862305, + "learning_rate": 3.9276728090896765e-05, + "loss": 1.7736, + "step": 7741 + }, + { + "epoch": 0.5827734808709234, + "grad_norm": 4.251784324645996, + "learning_rate": 3.926482118138013e-05, + "loss": 1.8955, + "step": 7742 + }, + { + "epoch": 0.5828487551515836, + "grad_norm": 6.493978977203369, + "learning_rate": 3.9252914910039076e-05, + "loss": 1.9545, + "step": 7743 + }, + { + "epoch": 0.5829240294322438, + "grad_norm": 4.953982830047607, + "learning_rate": 3.92410092775814e-05, + "loss": 1.5539, + "step": 7744 + }, + { + "epoch": 0.5829993037129039, + "grad_norm": 3.9922866821289062, + "learning_rate": 3.9229104284714836e-05, + "loss": 2.0885, + "step": 7745 + }, + { + "epoch": 0.5830745779935641, + "grad_norm": 4.021296501159668, + "learning_rate": 3.921719993214714e-05, + "loss": 1.78, + "step": 7746 + }, + { + "epoch": 0.5831498522742242, + "grad_norm": 3.900331735610962, + "learning_rate": 3.9205296220585944e-05, + "loss": 1.907, + "step": 7747 + }, + { + "epoch": 0.5832251265548843, + "grad_norm": 6.634807586669922, + "learning_rate": 3.919339315073893e-05, + "loss": 1.8327, + "step": 7748 + }, + { + "epoch": 0.5833004008355445, + "grad_norm": 9.16468620300293, + "learning_rate": 3.918149072331367e-05, + "loss": 2.2514, + "step": 7749 + }, + { + "epoch": 0.5833756751162047, + "grad_norm": 6.211683750152588, + "learning_rate": 3.916958893901774e-05, + "loss": 1.7381, + "step": 7750 + }, + { + "epoch": 0.5834509493968648, + "grad_norm": 4.504829406738281, + "learning_rate": 3.9157687798558706e-05, + "loss": 1.8975, + "step": 7751 + }, + { + "epoch": 0.5835262236775249, + "grad_norm": 4.639247894287109, + "learning_rate": 3.9145787302643996e-05, + "loss": 1.814, + "step": 7752 + }, + { + "epoch": 0.5836014979581852, + "grad_norm": 3.7421114444732666, + "learning_rate": 3.9133887451981096e-05, + "loss": 1.8407, + "step": 7753 + }, + { + "epoch": 0.5836767722388453, + "grad_norm": 4.946784496307373, + "learning_rate": 3.9121988247277394e-05, + "loss": 1.8168, + "step": 7754 + }, + { + "epoch": 0.5837520465195054, + "grad_norm": 5.382870197296143, + "learning_rate": 3.9110089689240304e-05, + "loss": 1.776, + "step": 7755 + }, + { + "epoch": 0.5838273208001656, + "grad_norm": 5.082200050354004, + "learning_rate": 3.9098191778577106e-05, + "loss": 1.6497, + "step": 7756 + }, + { + "epoch": 0.5839025950808258, + "grad_norm": 3.401374340057373, + "learning_rate": 3.908629451599516e-05, + "loss": 2.0001, + "step": 7757 + }, + { + "epoch": 0.5839778693614859, + "grad_norm": 4.333251953125, + "learning_rate": 3.907439790220166e-05, + "loss": 1.8119, + "step": 7758 + }, + { + "epoch": 0.5840531436421461, + "grad_norm": 4.623253345489502, + "learning_rate": 3.906250193790387e-05, + "loss": 1.9465, + "step": 7759 + }, + { + "epoch": 0.5841284179228062, + "grad_norm": 5.91024112701416, + "learning_rate": 3.9050606623808945e-05, + "loss": 1.8, + "step": 7760 + }, + { + "epoch": 0.5842036922034664, + "grad_norm": 5.61050271987915, + "learning_rate": 3.903871196062406e-05, + "loss": 1.6253, + "step": 7761 + }, + { + "epoch": 0.5842789664841266, + "grad_norm": 5.143840312957764, + "learning_rate": 3.9026817949056274e-05, + "loss": 1.6732, + "step": 7762 + }, + { + "epoch": 0.5843542407647867, + "grad_norm": 4.222826957702637, + "learning_rate": 3.90149245898127e-05, + "loss": 2.1959, + "step": 7763 + }, + { + "epoch": 0.5844295150454468, + "grad_norm": 3.9595866203308105, + "learning_rate": 3.900303188360032e-05, + "loss": 1.7966, + "step": 7764 + }, + { + "epoch": 0.584504789326107, + "grad_norm": 5.239078521728516, + "learning_rate": 3.899113983112614e-05, + "loss": 2.0547, + "step": 7765 + }, + { + "epoch": 0.5845800636067672, + "grad_norm": 5.827862739562988, + "learning_rate": 3.897924843309711e-05, + "loss": 2.3592, + "step": 7766 + }, + { + "epoch": 0.5846553378874273, + "grad_norm": 6.119013786315918, + "learning_rate": 3.896735769022014e-05, + "loss": 1.9863, + "step": 7767 + }, + { + "epoch": 0.5847306121680874, + "grad_norm": 5.658279895782471, + "learning_rate": 3.895546760320212e-05, + "loss": 1.6501, + "step": 7768 + }, + { + "epoch": 0.5848058864487476, + "grad_norm": 5.072042465209961, + "learning_rate": 3.8943578172749826e-05, + "loss": 1.6072, + "step": 7769 + }, + { + "epoch": 0.5848811607294078, + "grad_norm": 6.512777328491211, + "learning_rate": 3.893168939957011e-05, + "loss": 1.8617, + "step": 7770 + }, + { + "epoch": 0.5849564350100679, + "grad_norm": 4.064201354980469, + "learning_rate": 3.8919801284369685e-05, + "loss": 1.631, + "step": 7771 + }, + { + "epoch": 0.5850317092907281, + "grad_norm": 4.377372741699219, + "learning_rate": 3.8907913827855304e-05, + "loss": 2.0552, + "step": 7772 + }, + { + "epoch": 0.5851069835713882, + "grad_norm": 7.75787353515625, + "learning_rate": 3.8896027030733597e-05, + "loss": 1.8891, + "step": 7773 + }, + { + "epoch": 0.5851822578520484, + "grad_norm": 6.155920505523682, + "learning_rate": 3.888414089371124e-05, + "loss": 1.9693, + "step": 7774 + }, + { + "epoch": 0.5852575321327086, + "grad_norm": 4.713784217834473, + "learning_rate": 3.8872255417494795e-05, + "loss": 1.3946, + "step": 7775 + }, + { + "epoch": 0.5853328064133687, + "grad_norm": 6.909249782562256, + "learning_rate": 3.8860370602790844e-05, + "loss": 1.8722, + "step": 7776 + }, + { + "epoch": 0.5854080806940288, + "grad_norm": 6.590963840484619, + "learning_rate": 3.884848645030589e-05, + "loss": 1.9261, + "step": 7777 + }, + { + "epoch": 0.5854833549746891, + "grad_norm": 6.702365875244141, + "learning_rate": 3.8836602960746435e-05, + "loss": 1.9424, + "step": 7778 + }, + { + "epoch": 0.5855586292553492, + "grad_norm": 4.4740729331970215, + "learning_rate": 3.882472013481889e-05, + "loss": 1.7071, + "step": 7779 + }, + { + "epoch": 0.5856339035360093, + "grad_norm": 3.27465558052063, + "learning_rate": 3.881283797322967e-05, + "loss": 1.7021, + "step": 7780 + }, + { + "epoch": 0.5857091778166695, + "grad_norm": 4.187403202056885, + "learning_rate": 3.880095647668512e-05, + "loss": 1.8652, + "step": 7781 + }, + { + "epoch": 0.5857844520973297, + "grad_norm": 4.300467014312744, + "learning_rate": 3.878907564589158e-05, + "loss": 1.8324, + "step": 7782 + }, + { + "epoch": 0.5858597263779898, + "grad_norm": 5.975734710693359, + "learning_rate": 3.877719548155535e-05, + "loss": 2.1406, + "step": 7783 + }, + { + "epoch": 0.58593500065865, + "grad_norm": 5.079812526702881, + "learning_rate": 3.876531598438262e-05, + "loss": 1.6022, + "step": 7784 + }, + { + "epoch": 0.5860102749393101, + "grad_norm": 5.078145980834961, + "learning_rate": 3.875343715507964e-05, + "loss": 2.1089, + "step": 7785 + }, + { + "epoch": 0.5860855492199702, + "grad_norm": 4.834609508514404, + "learning_rate": 3.8741558994352536e-05, + "loss": 1.9356, + "step": 7786 + }, + { + "epoch": 0.5861608235006304, + "grad_norm": 4.924367904663086, + "learning_rate": 3.872968150290745e-05, + "loss": 2.1639, + "step": 7787 + }, + { + "epoch": 0.5862360977812906, + "grad_norm": 5.167743682861328, + "learning_rate": 3.8717804681450456e-05, + "loss": 1.7953, + "step": 7788 + }, + { + "epoch": 0.5863113720619507, + "grad_norm": 6.30468225479126, + "learning_rate": 3.870592853068761e-05, + "loss": 2.0109, + "step": 7789 + }, + { + "epoch": 0.5863866463426108, + "grad_norm": 5.480498790740967, + "learning_rate": 3.8694053051324895e-05, + "loss": 1.9012, + "step": 7790 + }, + { + "epoch": 0.5864619206232711, + "grad_norm": 5.345117568969727, + "learning_rate": 3.868217824406831e-05, + "loss": 1.9743, + "step": 7791 + }, + { + "epoch": 0.5865371949039312, + "grad_norm": 4.603163719177246, + "learning_rate": 3.867030410962372e-05, + "loss": 1.5296, + "step": 7792 + }, + { + "epoch": 0.5866124691845913, + "grad_norm": 5.377043724060059, + "learning_rate": 3.865843064869706e-05, + "loss": 1.8706, + "step": 7793 + }, + { + "epoch": 0.5866877434652515, + "grad_norm": 4.131460189819336, + "learning_rate": 3.864655786199415e-05, + "loss": 1.9028, + "step": 7794 + }, + { + "epoch": 0.5867630177459117, + "grad_norm": 4.421370506286621, + "learning_rate": 3.863468575022081e-05, + "loss": 1.5959, + "step": 7795 + }, + { + "epoch": 0.5868382920265718, + "grad_norm": 4.667974948883057, + "learning_rate": 3.8622814314082775e-05, + "loss": 1.8764, + "step": 7796 + }, + { + "epoch": 0.586913566307232, + "grad_norm": 4.5249457359313965, + "learning_rate": 3.86109435542858e-05, + "loss": 1.6356, + "step": 7797 + }, + { + "epoch": 0.5869888405878921, + "grad_norm": 6.392907619476318, + "learning_rate": 3.859907347153554e-05, + "loss": 2.0216, + "step": 7798 + }, + { + "epoch": 0.5870641148685523, + "grad_norm": 4.534290313720703, + "learning_rate": 3.858720406653764e-05, + "loss": 1.9153, + "step": 7799 + }, + { + "epoch": 0.5871393891492125, + "grad_norm": 4.514787197113037, + "learning_rate": 3.8575335339997736e-05, + "loss": 1.891, + "step": 7800 + }, + { + "epoch": 0.5872146634298726, + "grad_norm": 5.686667442321777, + "learning_rate": 3.856346729262135e-05, + "loss": 1.8746, + "step": 7801 + }, + { + "epoch": 0.5872899377105327, + "grad_norm": 6.384697437286377, + "learning_rate": 3.855159992511403e-05, + "loss": 1.6886, + "step": 7802 + }, + { + "epoch": 0.587365211991193, + "grad_norm": 4.876718044281006, + "learning_rate": 3.853973323818123e-05, + "loss": 2.2076, + "step": 7803 + }, + { + "epoch": 0.5874404862718531, + "grad_norm": 6.946642875671387, + "learning_rate": 3.852786723252842e-05, + "loss": 2.4625, + "step": 7804 + }, + { + "epoch": 0.5875157605525132, + "grad_norm": 3.707252264022827, + "learning_rate": 3.851600190886096e-05, + "loss": 1.5427, + "step": 7805 + }, + { + "epoch": 0.5875910348331733, + "grad_norm": 4.963434219360352, + "learning_rate": 3.850413726788428e-05, + "loss": 1.717, + "step": 7806 + }, + { + "epoch": 0.5876663091138336, + "grad_norm": 4.309384822845459, + "learning_rate": 3.849227331030362e-05, + "loss": 1.7033, + "step": 7807 + }, + { + "epoch": 0.5877415833944937, + "grad_norm": 3.8997962474823, + "learning_rate": 3.84804100368243e-05, + "loss": 2.0988, + "step": 7808 + }, + { + "epoch": 0.5878168576751538, + "grad_norm": 4.916024684906006, + "learning_rate": 3.8468547448151554e-05, + "loss": 1.7303, + "step": 7809 + }, + { + "epoch": 0.587892131955814, + "grad_norm": 4.8505988121032715, + "learning_rate": 3.845668554499057e-05, + "loss": 2.0689, + "step": 7810 + }, + { + "epoch": 0.5879674062364741, + "grad_norm": 4.44301700592041, + "learning_rate": 3.84448243280465e-05, + "loss": 1.4431, + "step": 7811 + }, + { + "epoch": 0.5880426805171343, + "grad_norm": 4.837834358215332, + "learning_rate": 3.84329637980245e-05, + "loss": 1.896, + "step": 7812 + }, + { + "epoch": 0.5881179547977945, + "grad_norm": 4.032279968261719, + "learning_rate": 3.8421103955629576e-05, + "loss": 1.9865, + "step": 7813 + }, + { + "epoch": 0.5881932290784546, + "grad_norm": 5.82047176361084, + "learning_rate": 3.840924480156682e-05, + "loss": 2.1286, + "step": 7814 + }, + { + "epoch": 0.5882685033591147, + "grad_norm": 5.075139045715332, + "learning_rate": 3.839738633654119e-05, + "loss": 1.9871, + "step": 7815 + }, + { + "epoch": 0.588343777639775, + "grad_norm": 4.9096856117248535, + "learning_rate": 3.838552856125764e-05, + "loss": 1.7693, + "step": 7816 + }, + { + "epoch": 0.5884190519204351, + "grad_norm": 4.260968208312988, + "learning_rate": 3.8373671476421124e-05, + "loss": 1.7235, + "step": 7817 + }, + { + "epoch": 0.5884943262010952, + "grad_norm": 7.050853729248047, + "learning_rate": 3.836181508273646e-05, + "loss": 2.1274, + "step": 7818 + }, + { + "epoch": 0.5885696004817554, + "grad_norm": 11.854440689086914, + "learning_rate": 3.834995938090851e-05, + "loss": 2.3924, + "step": 7819 + }, + { + "epoch": 0.5886448747624156, + "grad_norm": 5.542457580566406, + "learning_rate": 3.8338104371642023e-05, + "loss": 1.9113, + "step": 7820 + }, + { + "epoch": 0.5887201490430757, + "grad_norm": 4.369715213775635, + "learning_rate": 3.8326250055641787e-05, + "loss": 1.7374, + "step": 7821 + }, + { + "epoch": 0.5887954233237359, + "grad_norm": 3.8842902183532715, + "learning_rate": 3.831439643361248e-05, + "loss": 1.6356, + "step": 7822 + }, + { + "epoch": 0.588870697604396, + "grad_norm": 4.105565071105957, + "learning_rate": 3.830254350625879e-05, + "loss": 1.7974, + "step": 7823 + }, + { + "epoch": 0.5889459718850562, + "grad_norm": 4.933945655822754, + "learning_rate": 3.829069127428531e-05, + "loss": 1.9895, + "step": 7824 + }, + { + "epoch": 0.5890212461657164, + "grad_norm": 5.1834001541137695, + "learning_rate": 3.8278839738396645e-05, + "loss": 2.1483, + "step": 7825 + }, + { + "epoch": 0.5890965204463765, + "grad_norm": 3.823296308517456, + "learning_rate": 3.826698889929731e-05, + "loss": 1.8366, + "step": 7826 + }, + { + "epoch": 0.5891717947270366, + "grad_norm": 4.330671310424805, + "learning_rate": 3.8255138757691845e-05, + "loss": 1.955, + "step": 7827 + }, + { + "epoch": 0.5892470690076967, + "grad_norm": 5.462123870849609, + "learning_rate": 3.824328931428466e-05, + "loss": 2.1855, + "step": 7828 + }, + { + "epoch": 0.589322343288357, + "grad_norm": 5.613857269287109, + "learning_rate": 3.823144056978021e-05, + "loss": 1.9632, + "step": 7829 + }, + { + "epoch": 0.5893976175690171, + "grad_norm": 6.444812297821045, + "learning_rate": 3.821959252488283e-05, + "loss": 1.8918, + "step": 7830 + }, + { + "epoch": 0.5894728918496772, + "grad_norm": 5.26508092880249, + "learning_rate": 3.820774518029687e-05, + "loss": 1.9225, + "step": 7831 + }, + { + "epoch": 0.5895481661303374, + "grad_norm": 7.023900508880615, + "learning_rate": 3.819589853672664e-05, + "loss": 2.4601, + "step": 7832 + }, + { + "epoch": 0.5896234404109976, + "grad_norm": 5.7562432289123535, + "learning_rate": 3.818405259487636e-05, + "loss": 1.7443, + "step": 7833 + }, + { + "epoch": 0.5896987146916577, + "grad_norm": 6.4446821212768555, + "learning_rate": 3.817220735545027e-05, + "loss": 2.5144, + "step": 7834 + }, + { + "epoch": 0.5897739889723179, + "grad_norm": 13.31273078918457, + "learning_rate": 3.8160362819152496e-05, + "loss": 2.0518, + "step": 7835 + }, + { + "epoch": 0.589849263252978, + "grad_norm": 4.9524455070495605, + "learning_rate": 3.81485189866872e-05, + "loss": 1.9837, + "step": 7836 + }, + { + "epoch": 0.5899245375336382, + "grad_norm": 5.590907096862793, + "learning_rate": 3.813667585875842e-05, + "loss": 1.7155, + "step": 7837 + }, + { + "epoch": 0.5899998118142984, + "grad_norm": 5.180372714996338, + "learning_rate": 3.812483343607023e-05, + "loss": 1.8015, + "step": 7838 + }, + { + "epoch": 0.5900750860949585, + "grad_norm": 6.319912433624268, + "learning_rate": 3.811299171932661e-05, + "loss": 2.0784, + "step": 7839 + }, + { + "epoch": 0.5901503603756186, + "grad_norm": 4.384718418121338, + "learning_rate": 3.8101150709231546e-05, + "loss": 1.7121, + "step": 7840 + }, + { + "epoch": 0.5902256346562789, + "grad_norm": 4.356826305389404, + "learning_rate": 3.808931040648892e-05, + "loss": 1.9065, + "step": 7841 + }, + { + "epoch": 0.590300908936939, + "grad_norm": 6.731471538543701, + "learning_rate": 3.807747081180261e-05, + "loss": 1.6769, + "step": 7842 + }, + { + "epoch": 0.5903761832175991, + "grad_norm": 6.328938007354736, + "learning_rate": 3.806563192587645e-05, + "loss": 2.0173, + "step": 7843 + }, + { + "epoch": 0.5904514574982593, + "grad_norm": 4.392867088317871, + "learning_rate": 3.805379374941425e-05, + "loss": 1.7314, + "step": 7844 + }, + { + "epoch": 0.5905267317789195, + "grad_norm": 3.661583662033081, + "learning_rate": 3.80419562831197e-05, + "loss": 1.7148, + "step": 7845 + }, + { + "epoch": 0.5906020060595796, + "grad_norm": 5.592376232147217, + "learning_rate": 3.803011952769657e-05, + "loss": 2.2591, + "step": 7846 + }, + { + "epoch": 0.5906772803402397, + "grad_norm": 6.500184059143066, + "learning_rate": 3.8018283483848464e-05, + "loss": 1.897, + "step": 7847 + }, + { + "epoch": 0.5907525546208999, + "grad_norm": 5.537911415100098, + "learning_rate": 3.800644815227903e-05, + "loss": 1.8472, + "step": 7848 + }, + { + "epoch": 0.59082782890156, + "grad_norm": 4.041790962219238, + "learning_rate": 3.7994613533691844e-05, + "loss": 1.925, + "step": 7849 + }, + { + "epoch": 0.5909031031822202, + "grad_norm": 4.398980617523193, + "learning_rate": 3.798277962879043e-05, + "loss": 1.9459, + "step": 7850 + }, + { + "epoch": 0.5909783774628804, + "grad_norm": 4.110594272613525, + "learning_rate": 3.7970946438278305e-05, + "loss": 1.8032, + "step": 7851 + }, + { + "epoch": 0.5910536517435405, + "grad_norm": 6.38279390335083, + "learning_rate": 3.795911396285888e-05, + "loss": 1.9273, + "step": 7852 + }, + { + "epoch": 0.5911289260242006, + "grad_norm": 4.685363292694092, + "learning_rate": 3.794728220323559e-05, + "loss": 1.6711, + "step": 7853 + }, + { + "epoch": 0.5912042003048609, + "grad_norm": 4.361425399780273, + "learning_rate": 3.7935451160111774e-05, + "loss": 1.9887, + "step": 7854 + }, + { + "epoch": 0.591279474585521, + "grad_norm": 3.8905081748962402, + "learning_rate": 3.79236208341908e-05, + "loss": 1.9184, + "step": 7855 + }, + { + "epoch": 0.5913547488661811, + "grad_norm": 5.164790630340576, + "learning_rate": 3.791179122617588e-05, + "loss": 2.1106, + "step": 7856 + }, + { + "epoch": 0.5914300231468413, + "grad_norm": 5.259014129638672, + "learning_rate": 3.789996233677032e-05, + "loss": 2.2265, + "step": 7857 + }, + { + "epoch": 0.5915052974275015, + "grad_norm": 4.083639621734619, + "learning_rate": 3.788813416667727e-05, + "loss": 1.9156, + "step": 7858 + }, + { + "epoch": 0.5915805717081616, + "grad_norm": 5.092536449432373, + "learning_rate": 3.787630671659989e-05, + "loss": 1.9323, + "step": 7859 + }, + { + "epoch": 0.5916558459888218, + "grad_norm": 5.620119094848633, + "learning_rate": 3.7864479987241266e-05, + "loss": 1.6787, + "step": 7860 + }, + { + "epoch": 0.5917311202694819, + "grad_norm": 5.034956932067871, + "learning_rate": 3.785265397930452e-05, + "loss": 1.7319, + "step": 7861 + }, + { + "epoch": 0.5918063945501421, + "grad_norm": 4.173698425292969, + "learning_rate": 3.784082869349262e-05, + "loss": 1.845, + "step": 7862 + }, + { + "epoch": 0.5918816688308023, + "grad_norm": 5.740963459014893, + "learning_rate": 3.7829004130508564e-05, + "loss": 2.0568, + "step": 7863 + }, + { + "epoch": 0.5919569431114624, + "grad_norm": 8.077258110046387, + "learning_rate": 3.781718029105531e-05, + "loss": 2.085, + "step": 7864 + }, + { + "epoch": 0.5920322173921225, + "grad_norm": 4.321019649505615, + "learning_rate": 3.780535717583571e-05, + "loss": 2.1147, + "step": 7865 + }, + { + "epoch": 0.5921074916727826, + "grad_norm": 4.551716327667236, + "learning_rate": 3.779353478555265e-05, + "loss": 1.7932, + "step": 7866 + }, + { + "epoch": 0.5921827659534429, + "grad_norm": 5.301575660705566, + "learning_rate": 3.7781713120908905e-05, + "loss": 1.842, + "step": 7867 + }, + { + "epoch": 0.592258040234103, + "grad_norm": 4.060632228851318, + "learning_rate": 3.776989218260728e-05, + "loss": 1.7008, + "step": 7868 + }, + { + "epoch": 0.5923333145147631, + "grad_norm": 4.6447014808654785, + "learning_rate": 3.7758071971350456e-05, + "loss": 1.7512, + "step": 7869 + }, + { + "epoch": 0.5924085887954234, + "grad_norm": 4.044189929962158, + "learning_rate": 3.7746252487841135e-05, + "loss": 1.7419, + "step": 7870 + }, + { + "epoch": 0.5924838630760835, + "grad_norm": 5.584356784820557, + "learning_rate": 3.773443373278193e-05, + "loss": 1.987, + "step": 7871 + }, + { + "epoch": 0.5925591373567436, + "grad_norm": 6.960014343261719, + "learning_rate": 3.7722615706875474e-05, + "loss": 1.8299, + "step": 7872 + }, + { + "epoch": 0.5926344116374038, + "grad_norm": 4.694604396820068, + "learning_rate": 3.771079841082426e-05, + "loss": 1.5138, + "step": 7873 + }, + { + "epoch": 0.5927096859180639, + "grad_norm": 5.120667457580566, + "learning_rate": 3.769898184533084e-05, + "loss": 2.014, + "step": 7874 + }, + { + "epoch": 0.5927849601987241, + "grad_norm": 3.970510244369507, + "learning_rate": 3.768716601109764e-05, + "loss": 1.9238, + "step": 7875 + }, + { + "epoch": 0.5928602344793843, + "grad_norm": 4.8945488929748535, + "learning_rate": 3.767535090882709e-05, + "loss": 1.9816, + "step": 7876 + }, + { + "epoch": 0.5929355087600444, + "grad_norm": 4.261218070983887, + "learning_rate": 3.766353653922156e-05, + "loss": 1.647, + "step": 7877 + }, + { + "epoch": 0.5930107830407045, + "grad_norm": 5.660770416259766, + "learning_rate": 3.765172290298341e-05, + "loss": 2.1478, + "step": 7878 + }, + { + "epoch": 0.5930860573213648, + "grad_norm": 8.129709243774414, + "learning_rate": 3.763991000081488e-05, + "loss": 1.9031, + "step": 7879 + }, + { + "epoch": 0.5931613316020249, + "grad_norm": 5.439948558807373, + "learning_rate": 3.762809783341823e-05, + "loss": 1.8595, + "step": 7880 + }, + { + "epoch": 0.593236605882685, + "grad_norm": 4.5891432762146, + "learning_rate": 3.761628640149567e-05, + "loss": 1.3423, + "step": 7881 + }, + { + "epoch": 0.5933118801633452, + "grad_norm": 5.952754497528076, + "learning_rate": 3.7604475705749356e-05, + "loss": 1.9639, + "step": 7882 + }, + { + "epoch": 0.5933871544440054, + "grad_norm": 6.931262493133545, + "learning_rate": 3.7592665746881406e-05, + "loss": 1.8112, + "step": 7883 + }, + { + "epoch": 0.5934624287246655, + "grad_norm": 4.159379005432129, + "learning_rate": 3.758085652559386e-05, + "loss": 1.7857, + "step": 7884 + }, + { + "epoch": 0.5935377030053256, + "grad_norm": 4.909072399139404, + "learning_rate": 3.756904804258878e-05, + "loss": 1.8284, + "step": 7885 + }, + { + "epoch": 0.5936129772859858, + "grad_norm": 4.211834907531738, + "learning_rate": 3.75572402985681e-05, + "loss": 1.9987, + "step": 7886 + }, + { + "epoch": 0.593688251566646, + "grad_norm": 7.541345596313477, + "learning_rate": 3.754543329423379e-05, + "loss": 2.0844, + "step": 7887 + }, + { + "epoch": 0.5937635258473061, + "grad_norm": 7.541345596313477, + "learning_rate": 3.754543329423379e-05, + "loss": 1.6423, + "step": 7888 + }, + { + "epoch": 0.5938388001279663, + "grad_norm": 4.836280822753906, + "learning_rate": 3.753362703028773e-05, + "loss": 1.9787, + "step": 7889 + }, + { + "epoch": 0.5939140744086264, + "grad_norm": 4.463475704193115, + "learning_rate": 3.75218215074318e-05, + "loss": 2.1009, + "step": 7890 + }, + { + "epoch": 0.5939893486892865, + "grad_norm": 6.547650337219238, + "learning_rate": 3.7510016726367746e-05, + "loss": 1.7507, + "step": 7891 + }, + { + "epoch": 0.5940646229699468, + "grad_norm": 4.9173431396484375, + "learning_rate": 3.7498212687797386e-05, + "loss": 1.693, + "step": 7892 + }, + { + "epoch": 0.5941398972506069, + "grad_norm": 4.701329708099365, + "learning_rate": 3.748640939242238e-05, + "loss": 2.0079, + "step": 7893 + }, + { + "epoch": 0.594215171531267, + "grad_norm": 5.148616790771484, + "learning_rate": 3.7474606840944446e-05, + "loss": 1.9929, + "step": 7894 + }, + { + "epoch": 0.5942904458119272, + "grad_norm": 4.35709285736084, + "learning_rate": 3.7462805034065195e-05, + "loss": 1.7157, + "step": 7895 + }, + { + "epoch": 0.5943657200925874, + "grad_norm": 3.8924877643585205, + "learning_rate": 3.74510039724862e-05, + "loss": 1.7595, + "step": 7896 + }, + { + "epoch": 0.5944409943732475, + "grad_norm": 7.674504280090332, + "learning_rate": 3.743920365690904e-05, + "loss": 2.0583, + "step": 7897 + }, + { + "epoch": 0.5945162686539077, + "grad_norm": 4.4485015869140625, + "learning_rate": 3.7427404088035154e-05, + "loss": 2.1203, + "step": 7898 + }, + { + "epoch": 0.5945915429345678, + "grad_norm": 4.843740463256836, + "learning_rate": 3.7415605266566034e-05, + "loss": 1.8284, + "step": 7899 + }, + { + "epoch": 0.594666817215228, + "grad_norm": 4.741055011749268, + "learning_rate": 3.740380719320307e-05, + "loss": 1.988, + "step": 7900 + }, + { + "epoch": 0.5947420914958882, + "grad_norm": 5.653155326843262, + "learning_rate": 3.739200986864763e-05, + "loss": 1.902, + "step": 7901 + }, + { + "epoch": 0.5948173657765483, + "grad_norm": 4.838129043579102, + "learning_rate": 3.738021329360102e-05, + "loss": 2.1271, + "step": 7902 + }, + { + "epoch": 0.5948926400572084, + "grad_norm": 6.923396587371826, + "learning_rate": 3.7368417468764526e-05, + "loss": 2.2673, + "step": 7903 + }, + { + "epoch": 0.5949679143378687, + "grad_norm": 3.68367075920105, + "learning_rate": 3.735662239483936e-05, + "loss": 1.8763, + "step": 7904 + }, + { + "epoch": 0.5950431886185288, + "grad_norm": 4.275118827819824, + "learning_rate": 3.734482807252673e-05, + "loss": 2.0478, + "step": 7905 + }, + { + "epoch": 0.5951184628991889, + "grad_norm": 7.5992231369018555, + "learning_rate": 3.733303450252773e-05, + "loss": 1.819, + "step": 7906 + }, + { + "epoch": 0.595193737179849, + "grad_norm": 4.1472320556640625, + "learning_rate": 3.732124168554352e-05, + "loss": 1.7684, + "step": 7907 + }, + { + "epoch": 0.5952690114605093, + "grad_norm": 5.666646480560303, + "learning_rate": 3.7309449622275085e-05, + "loss": 1.9225, + "step": 7908 + }, + { + "epoch": 0.5953442857411694, + "grad_norm": 5.599004745483398, + "learning_rate": 3.729765831342347e-05, + "loss": 2.6208, + "step": 7909 + }, + { + "epoch": 0.5954195600218295, + "grad_norm": 6.332678318023682, + "learning_rate": 3.7285867759689605e-05, + "loss": 2.0193, + "step": 7910 + }, + { + "epoch": 0.5954948343024897, + "grad_norm": 5.501683235168457, + "learning_rate": 3.727407796177445e-05, + "loss": 1.8501, + "step": 7911 + }, + { + "epoch": 0.5955701085831498, + "grad_norm": 4.173152923583984, + "learning_rate": 3.7262288920378815e-05, + "loss": 1.8828, + "step": 7912 + }, + { + "epoch": 0.59564538286381, + "grad_norm": 5.3801751136779785, + "learning_rate": 3.7250500636203565e-05, + "loss": 1.6013, + "step": 7913 + }, + { + "epoch": 0.5957206571444702, + "grad_norm": 4.022792816162109, + "learning_rate": 3.7238713109949486e-05, + "loss": 1.6574, + "step": 7914 + }, + { + "epoch": 0.5957959314251303, + "grad_norm": 3.3938214778900146, + "learning_rate": 3.722692634231728e-05, + "loss": 1.6375, + "step": 7915 + }, + { + "epoch": 0.5958712057057904, + "grad_norm": 5.620737552642822, + "learning_rate": 3.721514033400766e-05, + "loss": 2.1646, + "step": 7916 + }, + { + "epoch": 0.5959464799864507, + "grad_norm": 5.351798057556152, + "learning_rate": 3.7203355085721246e-05, + "loss": 1.8322, + "step": 7917 + }, + { + "epoch": 0.5960217542671108, + "grad_norm": 3.5777359008789062, + "learning_rate": 3.7191570598158685e-05, + "loss": 1.8492, + "step": 7918 + }, + { + "epoch": 0.5960970285477709, + "grad_norm": 4.743188858032227, + "learning_rate": 3.7179786872020474e-05, + "loss": 1.5006, + "step": 7919 + }, + { + "epoch": 0.5961723028284311, + "grad_norm": 4.990192890167236, + "learning_rate": 3.716800390800718e-05, + "loss": 1.8846, + "step": 7920 + }, + { + "epoch": 0.5962475771090913, + "grad_norm": 3.918168306350708, + "learning_rate": 3.71562217068192e-05, + "loss": 1.6861, + "step": 7921 + }, + { + "epoch": 0.5963228513897514, + "grad_norm": 3.9214861392974854, + "learning_rate": 3.7144440269157e-05, + "loss": 1.7838, + "step": 7922 + }, + { + "epoch": 0.5963981256704116, + "grad_norm": 5.140080451965332, + "learning_rate": 3.7132659595720934e-05, + "loss": 2.297, + "step": 7923 + }, + { + "epoch": 0.5964733999510717, + "grad_norm": 5.152413368225098, + "learning_rate": 3.712087968721135e-05, + "loss": 2.1238, + "step": 7924 + }, + { + "epoch": 0.5965486742317319, + "grad_norm": 5.307943820953369, + "learning_rate": 3.71091005443285e-05, + "loss": 1.9978, + "step": 7925 + }, + { + "epoch": 0.596623948512392, + "grad_norm": 6.113337993621826, + "learning_rate": 3.709732216777264e-05, + "loss": 1.4336, + "step": 7926 + }, + { + "epoch": 0.5966992227930522, + "grad_norm": 3.746710777282715, + "learning_rate": 3.708554455824394e-05, + "loss": 1.8175, + "step": 7927 + }, + { + "epoch": 0.5967744970737123, + "grad_norm": 5.175866603851318, + "learning_rate": 3.7073767716442565e-05, + "loss": 2.1938, + "step": 7928 + }, + { + "epoch": 0.5968497713543724, + "grad_norm": 5.046782493591309, + "learning_rate": 3.706199164306863e-05, + "loss": 1.6358, + "step": 7929 + }, + { + "epoch": 0.5969250456350327, + "grad_norm": 4.727492332458496, + "learning_rate": 3.7050216338822154e-05, + "loss": 1.9445, + "step": 7930 + }, + { + "epoch": 0.5970003199156928, + "grad_norm": 4.22304105758667, + "learning_rate": 3.703844180440317e-05, + "loss": 2.1292, + "step": 7931 + }, + { + "epoch": 0.5970755941963529, + "grad_norm": 5.28605842590332, + "learning_rate": 3.7026668040511616e-05, + "loss": 1.686, + "step": 7932 + }, + { + "epoch": 0.5971508684770132, + "grad_norm": 7.246971130371094, + "learning_rate": 3.701489504784743e-05, + "loss": 1.8671, + "step": 7933 + }, + { + "epoch": 0.5972261427576733, + "grad_norm": 4.841611385345459, + "learning_rate": 3.7003122827110466e-05, + "loss": 1.8963, + "step": 7934 + }, + { + "epoch": 0.5973014170383334, + "grad_norm": 5.4831461906433105, + "learning_rate": 3.6991351379000575e-05, + "loss": 1.7834, + "step": 7935 + }, + { + "epoch": 0.5973766913189936, + "grad_norm": 4.378210544586182, + "learning_rate": 3.69795807042175e-05, + "loss": 1.7506, + "step": 7936 + }, + { + "epoch": 0.5974519655996537, + "grad_norm": 3.409184217453003, + "learning_rate": 3.6967810803461006e-05, + "loss": 1.5844, + "step": 7937 + }, + { + "epoch": 0.5975272398803139, + "grad_norm": 4.411425590515137, + "learning_rate": 3.695604167743076e-05, + "loss": 2.0305, + "step": 7938 + }, + { + "epoch": 0.5976025141609741, + "grad_norm": 4.402707099914551, + "learning_rate": 3.694427332682642e-05, + "loss": 2.0007, + "step": 7939 + }, + { + "epoch": 0.5976777884416342, + "grad_norm": 4.217192649841309, + "learning_rate": 3.6932505752347545e-05, + "loss": 1.871, + "step": 7940 + }, + { + "epoch": 0.5977530627222943, + "grad_norm": 4.3588409423828125, + "learning_rate": 3.692073895469373e-05, + "loss": 2.0733, + "step": 7941 + }, + { + "epoch": 0.5978283370029546, + "grad_norm": 4.314188480377197, + "learning_rate": 3.690897293456444e-05, + "loss": 1.8234, + "step": 7942 + }, + { + "epoch": 0.5979036112836147, + "grad_norm": 3.796677350997925, + "learning_rate": 3.689720769265916e-05, + "loss": 1.9309, + "step": 7943 + }, + { + "epoch": 0.5979788855642748, + "grad_norm": 4.907580852508545, + "learning_rate": 3.688544322967728e-05, + "loss": 1.8871, + "step": 7944 + }, + { + "epoch": 0.5980541598449349, + "grad_norm": 4.831220626831055, + "learning_rate": 3.687367954631816e-05, + "loss": 1.7927, + "step": 7945 + }, + { + "epoch": 0.5981294341255952, + "grad_norm": 6.768815517425537, + "learning_rate": 3.686191664328116e-05, + "loss": 2.2726, + "step": 7946 + }, + { + "epoch": 0.5982047084062553, + "grad_norm": 5.013249397277832, + "learning_rate": 3.685015452126548e-05, + "loss": 1.8583, + "step": 7947 + }, + { + "epoch": 0.5982799826869154, + "grad_norm": 3.6031606197357178, + "learning_rate": 3.6838393180970416e-05, + "loss": 1.5103, + "step": 7948 + }, + { + "epoch": 0.5983552569675756, + "grad_norm": 5.221858024597168, + "learning_rate": 3.682663262309509e-05, + "loss": 1.6994, + "step": 7949 + }, + { + "epoch": 0.5984305312482358, + "grad_norm": 4.897763729095459, + "learning_rate": 3.681487284833867e-05, + "loss": 1.9034, + "step": 7950 + }, + { + "epoch": 0.5985058055288959, + "grad_norm": 6.664005756378174, + "learning_rate": 3.6803113857400206e-05, + "loss": 1.4828, + "step": 7951 + }, + { + "epoch": 0.5985810798095561, + "grad_norm": 5.071001052856445, + "learning_rate": 3.679135565097878e-05, + "loss": 1.7262, + "step": 7952 + }, + { + "epoch": 0.5986563540902162, + "grad_norm": 4.3958659172058105, + "learning_rate": 3.6779598229773344e-05, + "loss": 1.7595, + "step": 7953 + }, + { + "epoch": 0.5987316283708763, + "grad_norm": 4.544886112213135, + "learning_rate": 3.676784159448287e-05, + "loss": 2.2463, + "step": 7954 + }, + { + "epoch": 0.5988069026515366, + "grad_norm": 3.928467273712158, + "learning_rate": 3.675608574580623e-05, + "loss": 1.7742, + "step": 7955 + }, + { + "epoch": 0.5988821769321967, + "grad_norm": 4.448353290557861, + "learning_rate": 3.674433068444232e-05, + "loss": 2.0488, + "step": 7956 + }, + { + "epoch": 0.5989574512128568, + "grad_norm": 4.744180202484131, + "learning_rate": 3.673257641108989e-05, + "loss": 1.8836, + "step": 7957 + }, + { + "epoch": 0.599032725493517, + "grad_norm": 3.891849994659424, + "learning_rate": 3.672082292644775e-05, + "loss": 1.6183, + "step": 7958 + }, + { + "epoch": 0.5991079997741772, + "grad_norm": 4.785205364227295, + "learning_rate": 3.670907023121456e-05, + "loss": 1.6399, + "step": 7959 + }, + { + "epoch": 0.5991832740548373, + "grad_norm": 5.018258094787598, + "learning_rate": 3.669731832608902e-05, + "loss": 1.9479, + "step": 7960 + }, + { + "epoch": 0.5992585483354975, + "grad_norm": 3.85152006149292, + "learning_rate": 3.6685567211769726e-05, + "loss": 1.7777, + "step": 7961 + }, + { + "epoch": 0.5993338226161576, + "grad_norm": 4.894052505493164, + "learning_rate": 3.6673816888955256e-05, + "loss": 2.3965, + "step": 7962 + }, + { + "epoch": 0.5994090968968178, + "grad_norm": 5.2292585372924805, + "learning_rate": 3.666206735834415e-05, + "loss": 1.9201, + "step": 7963 + }, + { + "epoch": 0.5994843711774779, + "grad_norm": 5.393561840057373, + "learning_rate": 3.6650318620634864e-05, + "loss": 2.1616, + "step": 7964 + }, + { + "epoch": 0.5995596454581381, + "grad_norm": 3.465280055999756, + "learning_rate": 3.663857067652584e-05, + "loss": 1.8788, + "step": 7965 + }, + { + "epoch": 0.5996349197387982, + "grad_norm": 4.5058465003967285, + "learning_rate": 3.662682352671543e-05, + "loss": 1.9492, + "step": 7966 + }, + { + "epoch": 0.5997101940194584, + "grad_norm": 4.201843738555908, + "learning_rate": 3.6615077171902e-05, + "loss": 1.5962, + "step": 7967 + }, + { + "epoch": 0.5997854683001186, + "grad_norm": 3.7791907787323, + "learning_rate": 3.660333161278382e-05, + "loss": 1.9455, + "step": 7968 + }, + { + "epoch": 0.5998607425807787, + "grad_norm": 4.722830772399902, + "learning_rate": 3.659158685005916e-05, + "loss": 1.8205, + "step": 7969 + }, + { + "epoch": 0.5999360168614388, + "grad_norm": 5.043122291564941, + "learning_rate": 3.6579842884426165e-05, + "loss": 1.6884, + "step": 7970 + }, + { + "epoch": 0.6000112911420991, + "grad_norm": 5.420186996459961, + "learning_rate": 3.656809971658301e-05, + "loss": 2.2662, + "step": 7971 + }, + { + "epoch": 0.6000865654227592, + "grad_norm": 4.284111499786377, + "learning_rate": 3.655635734722778e-05, + "loss": 1.9558, + "step": 7972 + }, + { + "epoch": 0.6001618397034193, + "grad_norm": 3.5491456985473633, + "learning_rate": 3.654461577705855e-05, + "loss": 1.5448, + "step": 7973 + }, + { + "epoch": 0.6002371139840795, + "grad_norm": 3.5895063877105713, + "learning_rate": 3.6532875006773284e-05, + "loss": 1.8839, + "step": 7974 + }, + { + "epoch": 0.6003123882647396, + "grad_norm": 3.2783491611480713, + "learning_rate": 3.652113503706998e-05, + "loss": 1.7279, + "step": 7975 + }, + { + "epoch": 0.6003876625453998, + "grad_norm": 5.6505961418151855, + "learning_rate": 3.6509395868646503e-05, + "loss": 2.1068, + "step": 7976 + }, + { + "epoch": 0.60046293682606, + "grad_norm": 5.302483081817627, + "learning_rate": 3.649765750220073e-05, + "loss": 1.8227, + "step": 7977 + }, + { + "epoch": 0.6005382111067201, + "grad_norm": 4.162251949310303, + "learning_rate": 3.648591993843049e-05, + "loss": 2.0226, + "step": 7978 + }, + { + "epoch": 0.6006134853873802, + "grad_norm": 5.978146076202393, + "learning_rate": 3.6474183178033516e-05, + "loss": 1.9416, + "step": 7979 + }, + { + "epoch": 0.6006887596680405, + "grad_norm": 4.252506256103516, + "learning_rate": 3.646244722170756e-05, + "loss": 1.8379, + "step": 7980 + }, + { + "epoch": 0.6007640339487006, + "grad_norm": 4.377779006958008, + "learning_rate": 3.645071207015026e-05, + "loss": 1.6979, + "step": 7981 + }, + { + "epoch": 0.6008393082293607, + "grad_norm": 5.107065677642822, + "learning_rate": 3.6438977724059266e-05, + "loss": 2.3744, + "step": 7982 + }, + { + "epoch": 0.6009145825100208, + "grad_norm": 7.6258649826049805, + "learning_rate": 3.642724418413211e-05, + "loss": 1.8971, + "step": 7983 + }, + { + "epoch": 0.6009898567906811, + "grad_norm": 4.194841384887695, + "learning_rate": 3.641551145106638e-05, + "loss": 1.4903, + "step": 7984 + }, + { + "epoch": 0.6010651310713412, + "grad_norm": 4.250508785247803, + "learning_rate": 3.640377952555949e-05, + "loss": 1.6269, + "step": 7985 + }, + { + "epoch": 0.6011404053520013, + "grad_norm": 4.383350849151611, + "learning_rate": 3.639204840830891e-05, + "loss": 2.1613, + "step": 7986 + }, + { + "epoch": 0.6012156796326615, + "grad_norm": 5.240368366241455, + "learning_rate": 3.638031810001199e-05, + "loss": 1.8303, + "step": 7987 + }, + { + "epoch": 0.6012909539133217, + "grad_norm": 3.93595814704895, + "learning_rate": 3.636858860136609e-05, + "loss": 1.9607, + "step": 7988 + }, + { + "epoch": 0.6013662281939818, + "grad_norm": 3.9893717765808105, + "learning_rate": 3.6356859913068475e-05, + "loss": 1.8748, + "step": 7989 + }, + { + "epoch": 0.601441502474642, + "grad_norm": 5.265326976776123, + "learning_rate": 3.634513203581641e-05, + "loss": 1.8292, + "step": 7990 + }, + { + "epoch": 0.6015167767553021, + "grad_norm": 3.9016449451446533, + "learning_rate": 3.6333404970307044e-05, + "loss": 1.5806, + "step": 7991 + }, + { + "epoch": 0.6015920510359622, + "grad_norm": 5.3055315017700195, + "learning_rate": 3.6321678717237564e-05, + "loss": 1.8323, + "step": 7992 + }, + { + "epoch": 0.6016673253166225, + "grad_norm": 4.487079620361328, + "learning_rate": 3.630995327730501e-05, + "loss": 1.9372, + "step": 7993 + }, + { + "epoch": 0.6017425995972826, + "grad_norm": 8.976163864135742, + "learning_rate": 3.629822865120646e-05, + "loss": 1.9777, + "step": 7994 + }, + { + "epoch": 0.6018178738779427, + "grad_norm": 4.627541542053223, + "learning_rate": 3.62865048396389e-05, + "loss": 1.8289, + "step": 7995 + }, + { + "epoch": 0.601893148158603, + "grad_norm": 5.566967010498047, + "learning_rate": 3.6274781843299275e-05, + "loss": 1.8739, + "step": 7996 + }, + { + "epoch": 0.6019684224392631, + "grad_norm": 5.938688278198242, + "learning_rate": 3.626305966288451e-05, + "loss": 2.0693, + "step": 7997 + }, + { + "epoch": 0.6020436967199232, + "grad_norm": 4.522730827331543, + "learning_rate": 3.625133829909141e-05, + "loss": 2.0087, + "step": 7998 + }, + { + "epoch": 0.6021189710005834, + "grad_norm": 5.317759037017822, + "learning_rate": 3.623961775261682e-05, + "loss": 2.1023, + "step": 7999 + }, + { + "epoch": 0.6021942452812435, + "grad_norm": 5.180668354034424, + "learning_rate": 3.622789802415745e-05, + "loss": 1.5706, + "step": 8000 + }, + { + "epoch": 0.6022695195619037, + "grad_norm": 4.770196437835693, + "learning_rate": 3.6216179114410044e-05, + "loss": 1.8813, + "step": 8001 + }, + { + "epoch": 0.6023447938425639, + "grad_norm": 4.593331336975098, + "learning_rate": 3.620446102407122e-05, + "loss": 1.5814, + "step": 8002 + }, + { + "epoch": 0.602420068123224, + "grad_norm": 6.350017547607422, + "learning_rate": 3.619274375383763e-05, + "loss": 1.7553, + "step": 8003 + }, + { + "epoch": 0.6024953424038841, + "grad_norm": 4.056393146514893, + "learning_rate": 3.618102730440578e-05, + "loss": 1.7419, + "step": 8004 + }, + { + "epoch": 0.6025706166845443, + "grad_norm": 4.596986293792725, + "learning_rate": 3.616931167647222e-05, + "loss": 1.7508, + "step": 8005 + }, + { + "epoch": 0.6026458909652045, + "grad_norm": 5.526656150817871, + "learning_rate": 3.615759687073339e-05, + "loss": 1.7621, + "step": 8006 + }, + { + "epoch": 0.6027211652458646, + "grad_norm": 4.0575761795043945, + "learning_rate": 3.614588288788572e-05, + "loss": 1.9033, + "step": 8007 + }, + { + "epoch": 0.6027964395265247, + "grad_norm": 3.8082070350646973, + "learning_rate": 3.6134169728625546e-05, + "loss": 1.5919, + "step": 8008 + }, + { + "epoch": 0.602871713807185, + "grad_norm": 4.748533725738525, + "learning_rate": 3.612245739364919e-05, + "loss": 1.3692, + "step": 8009 + }, + { + "epoch": 0.6029469880878451, + "grad_norm": 6.354117393493652, + "learning_rate": 3.611074588365294e-05, + "loss": 2.0135, + "step": 8010 + }, + { + "epoch": 0.6030222623685052, + "grad_norm": 6.241334438323975, + "learning_rate": 3.609903519933298e-05, + "loss": 2.3166, + "step": 8011 + }, + { + "epoch": 0.6030975366491654, + "grad_norm": 6.611080646514893, + "learning_rate": 3.6087325341385514e-05, + "loss": 1.6199, + "step": 8012 + }, + { + "epoch": 0.6031728109298256, + "grad_norm": 6.188549995422363, + "learning_rate": 3.607561631050661e-05, + "loss": 1.7267, + "step": 8013 + }, + { + "epoch": 0.6032480852104857, + "grad_norm": 4.7459211349487305, + "learning_rate": 3.606390810739239e-05, + "loss": 1.546, + "step": 8014 + }, + { + "epoch": 0.6033233594911459, + "grad_norm": 4.252176761627197, + "learning_rate": 3.605220073273882e-05, + "loss": 2.0117, + "step": 8015 + }, + { + "epoch": 0.603398633771806, + "grad_norm": 4.85923957824707, + "learning_rate": 3.604049418724191e-05, + "loss": 1.7327, + "step": 8016 + }, + { + "epoch": 0.6034739080524661, + "grad_norm": 6.795665740966797, + "learning_rate": 3.602878847159755e-05, + "loss": 1.8034, + "step": 8017 + }, + { + "epoch": 0.6035491823331264, + "grad_norm": 5.557191848754883, + "learning_rate": 3.601708358650166e-05, + "loss": 1.8023, + "step": 8018 + }, + { + "epoch": 0.6036244566137865, + "grad_norm": 4.3204731941223145, + "learning_rate": 3.6005379532649996e-05, + "loss": 1.6702, + "step": 8019 + }, + { + "epoch": 0.6036997308944466, + "grad_norm": 4.874846458435059, + "learning_rate": 3.5993676310738396e-05, + "loss": 2.0416, + "step": 8020 + }, + { + "epoch": 0.6037750051751068, + "grad_norm": 4.5694169998168945, + "learning_rate": 3.598197392146252e-05, + "loss": 1.7856, + "step": 8021 + }, + { + "epoch": 0.603850279455767, + "grad_norm": 5.44784688949585, + "learning_rate": 3.597027236551809e-05, + "loss": 1.9995, + "step": 8022 + }, + { + "epoch": 0.6039255537364271, + "grad_norm": 4.517114639282227, + "learning_rate": 3.59585716436007e-05, + "loss": 1.9514, + "step": 8023 + }, + { + "epoch": 0.6040008280170872, + "grad_norm": 5.466026782989502, + "learning_rate": 3.594687175640595e-05, + "loss": 2.0542, + "step": 8024 + }, + { + "epoch": 0.6040761022977474, + "grad_norm": 4.703134536743164, + "learning_rate": 3.593517270462934e-05, + "loss": 1.6897, + "step": 8025 + }, + { + "epoch": 0.6041513765784076, + "grad_norm": 4.974737644195557, + "learning_rate": 3.592347448896635e-05, + "loss": 1.9729, + "step": 8026 + }, + { + "epoch": 0.6042266508590677, + "grad_norm": 5.321857929229736, + "learning_rate": 3.591177711011242e-05, + "loss": 1.9015, + "step": 8027 + }, + { + "epoch": 0.6043019251397279, + "grad_norm": 4.586242198944092, + "learning_rate": 3.5900080568762916e-05, + "loss": 2.0655, + "step": 8028 + }, + { + "epoch": 0.604377199420388, + "grad_norm": 4.832167625427246, + "learning_rate": 3.588838486561318e-05, + "loss": 2.0172, + "step": 8029 + }, + { + "epoch": 0.6044524737010482, + "grad_norm": 6.086312770843506, + "learning_rate": 3.5876690001358445e-05, + "loss": 1.8601, + "step": 8030 + }, + { + "epoch": 0.6045277479817084, + "grad_norm": 4.7074408531188965, + "learning_rate": 3.5864995976694e-05, + "loss": 1.553, + "step": 8031 + }, + { + "epoch": 0.6046030222623685, + "grad_norm": 5.799994468688965, + "learning_rate": 3.585330279231496e-05, + "loss": 1.6822, + "step": 8032 + }, + { + "epoch": 0.6046782965430286, + "grad_norm": 4.663863658905029, + "learning_rate": 3.5841610448916484e-05, + "loss": 1.7558, + "step": 8033 + }, + { + "epoch": 0.6047535708236889, + "grad_norm": 5.484861373901367, + "learning_rate": 3.5829918947193634e-05, + "loss": 2.0045, + "step": 8034 + }, + { + "epoch": 0.604828845104349, + "grad_norm": 5.244642734527588, + "learning_rate": 3.5818228287841466e-05, + "loss": 1.656, + "step": 8035 + }, + { + "epoch": 0.6049041193850091, + "grad_norm": 8.319445610046387, + "learning_rate": 3.580653847155491e-05, + "loss": 2.0709, + "step": 8036 + }, + { + "epoch": 0.6049793936656693, + "grad_norm": 4.302628517150879, + "learning_rate": 3.579484949902893e-05, + "loss": 1.8349, + "step": 8037 + }, + { + "epoch": 0.6050546679463294, + "grad_norm": 5.783748149871826, + "learning_rate": 3.578316137095838e-05, + "loss": 2.0228, + "step": 8038 + }, + { + "epoch": 0.6051299422269896, + "grad_norm": 7.605402946472168, + "learning_rate": 3.5771474088038114e-05, + "loss": 1.821, + "step": 8039 + }, + { + "epoch": 0.6052052165076498, + "grad_norm": 4.048700332641602, + "learning_rate": 3.575978765096286e-05, + "loss": 1.8033, + "step": 8040 + }, + { + "epoch": 0.6052804907883099, + "grad_norm": 5.959883213043213, + "learning_rate": 3.5748102060427374e-05, + "loss": 1.7772, + "step": 8041 + }, + { + "epoch": 0.60535576506897, + "grad_norm": 5.1179375648498535, + "learning_rate": 3.573641731712635e-05, + "loss": 1.8498, + "step": 8042 + }, + { + "epoch": 0.6054310393496302, + "grad_norm": 4.0493574142456055, + "learning_rate": 3.5724733421754365e-05, + "loss": 1.7278, + "step": 8043 + }, + { + "epoch": 0.6055063136302904, + "grad_norm": 4.4777350425720215, + "learning_rate": 3.571305037500603e-05, + "loss": 1.8631, + "step": 8044 + }, + { + "epoch": 0.6055815879109505, + "grad_norm": 5.109423637390137, + "learning_rate": 3.570136817757585e-05, + "loss": 1.8148, + "step": 8045 + }, + { + "epoch": 0.6056568621916106, + "grad_norm": 13.446748733520508, + "learning_rate": 3.568968683015832e-05, + "loss": 2.0569, + "step": 8046 + }, + { + "epoch": 0.6057321364722709, + "grad_norm": 4.003000736236572, + "learning_rate": 3.567800633344783e-05, + "loss": 1.7951, + "step": 8047 + }, + { + "epoch": 0.605807410752931, + "grad_norm": 3.602207660675049, + "learning_rate": 3.566632668813879e-05, + "loss": 1.8865, + "step": 8048 + }, + { + "epoch": 0.6058826850335911, + "grad_norm": 4.107050895690918, + "learning_rate": 3.5654647894925475e-05, + "loss": 1.6907, + "step": 8049 + }, + { + "epoch": 0.6059579593142513, + "grad_norm": 4.525403022766113, + "learning_rate": 3.564296995450219e-05, + "loss": 1.6926, + "step": 8050 + }, + { + "epoch": 0.6060332335949115, + "grad_norm": 5.758842468261719, + "learning_rate": 3.5631292867563146e-05, + "loss": 2.5907, + "step": 8051 + }, + { + "epoch": 0.6061085078755716, + "grad_norm": 5.020792484283447, + "learning_rate": 3.5619616634802524e-05, + "loss": 1.9564, + "step": 8052 + }, + { + "epoch": 0.6061837821562318, + "grad_norm": 4.926012992858887, + "learning_rate": 3.560794125691441e-05, + "loss": 2.07, + "step": 8053 + }, + { + "epoch": 0.6062590564368919, + "grad_norm": 6.003063678741455, + "learning_rate": 3.559626673459291e-05, + "loss": 2.3415, + "step": 8054 + }, + { + "epoch": 0.606334330717552, + "grad_norm": 4.733049392700195, + "learning_rate": 3.558459306853201e-05, + "loss": 1.7742, + "step": 8055 + }, + { + "epoch": 0.6064096049982123, + "grad_norm": 4.311572551727295, + "learning_rate": 3.5572920259425715e-05, + "loss": 1.8239, + "step": 8056 + }, + { + "epoch": 0.6064848792788724, + "grad_norm": 4.7451300621032715, + "learning_rate": 3.556124830796789e-05, + "loss": 1.6207, + "step": 8057 + }, + { + "epoch": 0.6065601535595325, + "grad_norm": 4.346027374267578, + "learning_rate": 3.554957721485242e-05, + "loss": 1.8382, + "step": 8058 + }, + { + "epoch": 0.6066354278401928, + "grad_norm": 3.985694646835327, + "learning_rate": 3.553790698077315e-05, + "loss": 1.5765, + "step": 8059 + }, + { + "epoch": 0.6067107021208529, + "grad_norm": 5.615527153015137, + "learning_rate": 3.5526237606423786e-05, + "loss": 1.6242, + "step": 8060 + }, + { + "epoch": 0.606785976401513, + "grad_norm": 4.927435398101807, + "learning_rate": 3.551456909249808e-05, + "loss": 1.8059, + "step": 8061 + }, + { + "epoch": 0.6068612506821731, + "grad_norm": 9.183197021484375, + "learning_rate": 3.5502901439689664e-05, + "loss": 2.0579, + "step": 8062 + }, + { + "epoch": 0.6069365249628333, + "grad_norm": 5.377397537231445, + "learning_rate": 3.549123464869218e-05, + "loss": 1.6157, + "step": 8063 + }, + { + "epoch": 0.6070117992434935, + "grad_norm": 5.270592212677002, + "learning_rate": 3.547956872019915e-05, + "loss": 1.7501, + "step": 8064 + }, + { + "epoch": 0.6070870735241536, + "grad_norm": 5.024423122406006, + "learning_rate": 3.546790365490411e-05, + "loss": 1.6681, + "step": 8065 + }, + { + "epoch": 0.6071623478048138, + "grad_norm": 5.829306125640869, + "learning_rate": 3.545623945350049e-05, + "loss": 1.9247, + "step": 8066 + }, + { + "epoch": 0.6072376220854739, + "grad_norm": 4.136631488800049, + "learning_rate": 3.544457611668171e-05, + "loss": 1.9709, + "step": 8067 + }, + { + "epoch": 0.6073128963661341, + "grad_norm": 6.158609867095947, + "learning_rate": 3.5432913645141106e-05, + "loss": 2.1045, + "step": 8068 + }, + { + "epoch": 0.6073881706467943, + "grad_norm": 4.047645092010498, + "learning_rate": 3.542125203957202e-05, + "loss": 1.6893, + "step": 8069 + }, + { + "epoch": 0.6074634449274544, + "grad_norm": 4.429532051086426, + "learning_rate": 3.5409591300667636e-05, + "loss": 1.5283, + "step": 8070 + }, + { + "epoch": 0.6075387192081145, + "grad_norm": 4.327272415161133, + "learning_rate": 3.539793142912119e-05, + "loss": 1.8778, + "step": 8071 + }, + { + "epoch": 0.6076139934887748, + "grad_norm": 4.6361494064331055, + "learning_rate": 3.5386272425625816e-05, + "loss": 1.78, + "step": 8072 + }, + { + "epoch": 0.6076892677694349, + "grad_norm": 4.448143482208252, + "learning_rate": 3.537461429087464e-05, + "loss": 1.7166, + "step": 8073 + }, + { + "epoch": 0.607764542050095, + "grad_norm": 5.978185653686523, + "learning_rate": 3.536295702556065e-05, + "loss": 1.6491, + "step": 8074 + }, + { + "epoch": 0.6078398163307552, + "grad_norm": 4.677586555480957, + "learning_rate": 3.5351300630376876e-05, + "loss": 1.8593, + "step": 8075 + }, + { + "epoch": 0.6079150906114154, + "grad_norm": 5.9051690101623535, + "learning_rate": 3.533964510601626e-05, + "loss": 1.773, + "step": 8076 + }, + { + "epoch": 0.6079903648920755, + "grad_norm": 4.932521820068359, + "learning_rate": 3.5327990453171663e-05, + "loss": 1.8313, + "step": 8077 + }, + { + "epoch": 0.6080656391727357, + "grad_norm": 5.28539514541626, + "learning_rate": 3.5316336672535945e-05, + "loss": 2.1272, + "step": 8078 + }, + { + "epoch": 0.6081409134533958, + "grad_norm": 4.727498531341553, + "learning_rate": 3.530468376480187e-05, + "loss": 1.4553, + "step": 8079 + }, + { + "epoch": 0.608216187734056, + "grad_norm": 3.7702107429504395, + "learning_rate": 3.5293031730662195e-05, + "loss": 1.7154, + "step": 8080 + }, + { + "epoch": 0.6082914620147161, + "grad_norm": 4.206916809082031, + "learning_rate": 3.528138057080957e-05, + "loss": 2.0161, + "step": 8081 + }, + { + "epoch": 0.6083667362953763, + "grad_norm": 3.46120548248291, + "learning_rate": 3.526973028593664e-05, + "loss": 1.824, + "step": 8082 + }, + { + "epoch": 0.6084420105760364, + "grad_norm": 5.708986759185791, + "learning_rate": 3.525808087673598e-05, + "loss": 1.9866, + "step": 8083 + }, + { + "epoch": 0.6085172848566965, + "grad_norm": 4.997643947601318, + "learning_rate": 3.524643234390012e-05, + "loss": 1.8727, + "step": 8084 + }, + { + "epoch": 0.6085925591373568, + "grad_norm": 5.269734859466553, + "learning_rate": 3.5234784688121516e-05, + "loss": 1.9493, + "step": 8085 + }, + { + "epoch": 0.6086678334180169, + "grad_norm": 4.212966442108154, + "learning_rate": 3.5223137910092605e-05, + "loss": 1.7936, + "step": 8086 + }, + { + "epoch": 0.608743107698677, + "grad_norm": 7.552111625671387, + "learning_rate": 3.5211492010505734e-05, + "loss": 2.2614, + "step": 8087 + }, + { + "epoch": 0.6088183819793372, + "grad_norm": 4.56868314743042, + "learning_rate": 3.5199846990053234e-05, + "loss": 2.0106, + "step": 8088 + }, + { + "epoch": 0.6088936562599974, + "grad_norm": 4.035215377807617, + "learning_rate": 3.5188202849427355e-05, + "loss": 1.9778, + "step": 8089 + }, + { + "epoch": 0.6089689305406575, + "grad_norm": 7.589376449584961, + "learning_rate": 3.5176559589320326e-05, + "loss": 2.3722, + "step": 8090 + }, + { + "epoch": 0.6090442048213177, + "grad_norm": 5.87318754196167, + "learning_rate": 3.516491721042431e-05, + "loss": 1.9171, + "step": 8091 + }, + { + "epoch": 0.6091194791019778, + "grad_norm": 4.617512226104736, + "learning_rate": 3.5153275713431375e-05, + "loss": 2.631, + "step": 8092 + }, + { + "epoch": 0.609194753382638, + "grad_norm": 5.0993757247924805, + "learning_rate": 3.514163509903362e-05, + "loss": 2.4103, + "step": 8093 + }, + { + "epoch": 0.6092700276632982, + "grad_norm": 7.570215225219727, + "learning_rate": 3.512999536792301e-05, + "loss": 1.8675, + "step": 8094 + }, + { + "epoch": 0.6093453019439583, + "grad_norm": 4.643409729003906, + "learning_rate": 3.511835652079151e-05, + "loss": 2.036, + "step": 8095 + }, + { + "epoch": 0.6094205762246184, + "grad_norm": 4.436516761779785, + "learning_rate": 3.5106718558331006e-05, + "loss": 1.7069, + "step": 8096 + }, + { + "epoch": 0.6094958505052787, + "grad_norm": 4.737794876098633, + "learning_rate": 3.5095081481233384e-05, + "loss": 1.8507, + "step": 8097 + }, + { + "epoch": 0.6095711247859388, + "grad_norm": 4.942888259887695, + "learning_rate": 3.5083445290190364e-05, + "loss": 2.2302, + "step": 8098 + }, + { + "epoch": 0.6096463990665989, + "grad_norm": 5.764395713806152, + "learning_rate": 3.507180998589374e-05, + "loss": 1.9121, + "step": 8099 + }, + { + "epoch": 0.6097216733472591, + "grad_norm": 4.899541854858398, + "learning_rate": 3.506017556903517e-05, + "loss": 1.7828, + "step": 8100 + }, + { + "epoch": 0.6097969476279193, + "grad_norm": 5.933323860168457, + "learning_rate": 3.504854204030631e-05, + "loss": 1.9129, + "step": 8101 + }, + { + "epoch": 0.6098722219085794, + "grad_norm": 5.2042107582092285, + "learning_rate": 3.503690940039871e-05, + "loss": 1.7762, + "step": 8102 + }, + { + "epoch": 0.6099474961892395, + "grad_norm": 3.7044179439544678, + "learning_rate": 3.502527765000393e-05, + "loss": 1.7365, + "step": 8103 + }, + { + "epoch": 0.6100227704698997, + "grad_norm": 6.34365701675415, + "learning_rate": 3.5013646789813404e-05, + "loss": 1.6649, + "step": 8104 + }, + { + "epoch": 0.6100980447505598, + "grad_norm": 4.74472188949585, + "learning_rate": 3.500201682051859e-05, + "loss": 2.0668, + "step": 8105 + }, + { + "epoch": 0.61017331903122, + "grad_norm": 5.129726409912109, + "learning_rate": 3.499038774281083e-05, + "loss": 2.1201, + "step": 8106 + }, + { + "epoch": 0.6102485933118802, + "grad_norm": 4.57461404800415, + "learning_rate": 3.4978759557381444e-05, + "loss": 1.5843, + "step": 8107 + }, + { + "epoch": 0.6103238675925403, + "grad_norm": 5.204603672027588, + "learning_rate": 3.496713226492173e-05, + "loss": 1.7654, + "step": 8108 + }, + { + "epoch": 0.6103991418732004, + "grad_norm": 7.3619704246521, + "learning_rate": 3.495550586612285e-05, + "loss": 2.0269, + "step": 8109 + }, + { + "epoch": 0.6104744161538607, + "grad_norm": 6.065268516540527, + "learning_rate": 3.494388036167598e-05, + "loss": 1.9622, + "step": 8110 + }, + { + "epoch": 0.6105496904345208, + "grad_norm": 4.100382328033447, + "learning_rate": 3.493225575227222e-05, + "loss": 2.0366, + "step": 8111 + }, + { + "epoch": 0.6106249647151809, + "grad_norm": 4.180423259735107, + "learning_rate": 3.492063203860264e-05, + "loss": 1.6262, + "step": 8112 + }, + { + "epoch": 0.6107002389958411, + "grad_norm": 5.717609405517578, + "learning_rate": 3.49090092213582e-05, + "loss": 1.9743, + "step": 8113 + }, + { + "epoch": 0.6107755132765013, + "grad_norm": 9.112792015075684, + "learning_rate": 3.489738730122988e-05, + "loss": 1.8602, + "step": 8114 + }, + { + "epoch": 0.6108507875571614, + "grad_norm": 5.108053684234619, + "learning_rate": 3.4885766278908526e-05, + "loss": 1.7395, + "step": 8115 + }, + { + "epoch": 0.6109260618378216, + "grad_norm": 5.141839981079102, + "learning_rate": 3.487414615508501e-05, + "loss": 1.9182, + "step": 8116 + }, + { + "epoch": 0.6110013361184817, + "grad_norm": 5.8351263999938965, + "learning_rate": 3.48625269304501e-05, + "loss": 1.2865, + "step": 8117 + }, + { + "epoch": 0.6110766103991419, + "grad_norm": 5.516819477081299, + "learning_rate": 3.485090860569454e-05, + "loss": 2.2089, + "step": 8118 + }, + { + "epoch": 0.6111518846798021, + "grad_norm": 4.534638404846191, + "learning_rate": 3.483929118150898e-05, + "loss": 1.7027, + "step": 8119 + }, + { + "epoch": 0.6112271589604622, + "grad_norm": 9.957633018493652, + "learning_rate": 3.482767465858408e-05, + "loss": 1.9719, + "step": 8120 + }, + { + "epoch": 0.6113024332411223, + "grad_norm": 5.063474178314209, + "learning_rate": 3.4816059037610365e-05, + "loss": 1.8365, + "step": 8121 + }, + { + "epoch": 0.6113777075217824, + "grad_norm": 4.992016315460205, + "learning_rate": 3.480444431927837e-05, + "loss": 1.6186, + "step": 8122 + }, + { + "epoch": 0.6114529818024427, + "grad_norm": 5.252773761749268, + "learning_rate": 3.479283050427858e-05, + "loss": 1.9097, + "step": 8123 + }, + { + "epoch": 0.6115282560831028, + "grad_norm": 4.58005428314209, + "learning_rate": 3.478121759330136e-05, + "loss": 1.7574, + "step": 8124 + }, + { + "epoch": 0.6116035303637629, + "grad_norm": 5.458553314208984, + "learning_rate": 3.476960558703711e-05, + "loss": 1.7179, + "step": 8125 + }, + { + "epoch": 0.6116788046444231, + "grad_norm": 4.62752103805542, + "learning_rate": 3.4757994486176094e-05, + "loss": 2.1633, + "step": 8126 + }, + { + "epoch": 0.6117540789250833, + "grad_norm": 4.543347358703613, + "learning_rate": 3.474638429140858e-05, + "loss": 1.5635, + "step": 8127 + }, + { + "epoch": 0.6118293532057434, + "grad_norm": 4.643796920776367, + "learning_rate": 3.4734775003424734e-05, + "loss": 1.624, + "step": 8128 + }, + { + "epoch": 0.6119046274864036, + "grad_norm": 3.7508130073547363, + "learning_rate": 3.472316662291474e-05, + "loss": 1.927, + "step": 8129 + }, + { + "epoch": 0.6119799017670637, + "grad_norm": 5.244353771209717, + "learning_rate": 3.4711559150568634e-05, + "loss": 1.9094, + "step": 8130 + }, + { + "epoch": 0.6120551760477239, + "grad_norm": 4.895063877105713, + "learning_rate": 3.469995258707649e-05, + "loss": 1.7094, + "step": 8131 + }, + { + "epoch": 0.6121304503283841, + "grad_norm": 4.976377010345459, + "learning_rate": 3.4688346933128246e-05, + "loss": 1.7931, + "step": 8132 + }, + { + "epoch": 0.6122057246090442, + "grad_norm": 4.292976379394531, + "learning_rate": 3.467674218941385e-05, + "loss": 1.8127, + "step": 8133 + }, + { + "epoch": 0.6122809988897043, + "grad_norm": 4.518517017364502, + "learning_rate": 3.466513835662315e-05, + "loss": 1.7762, + "step": 8134 + }, + { + "epoch": 0.6123562731703646, + "grad_norm": 4.300276279449463, + "learning_rate": 3.4653535435446e-05, + "loss": 1.7317, + "step": 8135 + }, + { + "epoch": 0.6124315474510247, + "grad_norm": 4.825788974761963, + "learning_rate": 3.464193342657211e-05, + "loss": 2.2165, + "step": 8136 + }, + { + "epoch": 0.6125068217316848, + "grad_norm": 4.445923328399658, + "learning_rate": 3.4630332330691224e-05, + "loss": 1.944, + "step": 8137 + }, + { + "epoch": 0.612582096012345, + "grad_norm": 5.350835800170898, + "learning_rate": 3.461873214849297e-05, + "loss": 2.3772, + "step": 8138 + }, + { + "epoch": 0.6126573702930052, + "grad_norm": 5.620702743530273, + "learning_rate": 3.4607132880666956e-05, + "loss": 1.9225, + "step": 8139 + }, + { + "epoch": 0.6127326445736653, + "grad_norm": 4.805440902709961, + "learning_rate": 3.459553452790274e-05, + "loss": 1.6635, + "step": 8140 + }, + { + "epoch": 0.6128079188543254, + "grad_norm": 5.023445129394531, + "learning_rate": 3.458393709088979e-05, + "loss": 1.6587, + "step": 8141 + }, + { + "epoch": 0.6128831931349856, + "grad_norm": 4.494292259216309, + "learning_rate": 3.457234057031755e-05, + "loss": 1.7722, + "step": 8142 + }, + { + "epoch": 0.6129584674156457, + "grad_norm": 4.2279815673828125, + "learning_rate": 3.456074496687538e-05, + "loss": 2.3912, + "step": 8143 + }, + { + "epoch": 0.6130337416963059, + "grad_norm": 5.196355819702148, + "learning_rate": 3.4549150281252636e-05, + "loss": 1.5514, + "step": 8144 + }, + { + "epoch": 0.6131090159769661, + "grad_norm": 4.484930992126465, + "learning_rate": 3.4537556514138556e-05, + "loss": 1.7105, + "step": 8145 + }, + { + "epoch": 0.6131842902576262, + "grad_norm": 6.017373085021973, + "learning_rate": 3.452596366622241e-05, + "loss": 1.7412, + "step": 8146 + }, + { + "epoch": 0.6132595645382863, + "grad_norm": 4.953855514526367, + "learning_rate": 3.451437173819329e-05, + "loss": 1.9372, + "step": 8147 + }, + { + "epoch": 0.6133348388189466, + "grad_norm": 4.7046709060668945, + "learning_rate": 3.450278073074036e-05, + "loss": 1.6947, + "step": 8148 + }, + { + "epoch": 0.6134101130996067, + "grad_norm": 4.654233455657959, + "learning_rate": 3.449119064455263e-05, + "loss": 1.9575, + "step": 8149 + }, + { + "epoch": 0.6134853873802668, + "grad_norm": 4.065881729125977, + "learning_rate": 3.447960148031913e-05, + "loss": 1.3527, + "step": 8150 + }, + { + "epoch": 0.613560661660927, + "grad_norm": 3.889578104019165, + "learning_rate": 3.4468013238728774e-05, + "loss": 1.8721, + "step": 8151 + }, + { + "epoch": 0.6136359359415872, + "grad_norm": 5.912107944488525, + "learning_rate": 3.445642592047049e-05, + "loss": 2.0551, + "step": 8152 + }, + { + "epoch": 0.6137112102222473, + "grad_norm": 4.926999568939209, + "learning_rate": 3.444483952623306e-05, + "loss": 2.3235, + "step": 8153 + }, + { + "epoch": 0.6137864845029075, + "grad_norm": 5.576359748840332, + "learning_rate": 3.443325405670529e-05, + "loss": 1.7611, + "step": 8154 + }, + { + "epoch": 0.6138617587835676, + "grad_norm": 7.416301250457764, + "learning_rate": 3.442166951257591e-05, + "loss": 1.8567, + "step": 8155 + }, + { + "epoch": 0.6139370330642278, + "grad_norm": 5.036648750305176, + "learning_rate": 3.4410085894533576e-05, + "loss": 2.157, + "step": 8156 + }, + { + "epoch": 0.614012307344888, + "grad_norm": 3.795140266418457, + "learning_rate": 3.439850320326691e-05, + "loss": 1.6346, + "step": 8157 + }, + { + "epoch": 0.6140875816255481, + "grad_norm": 3.8690414428710938, + "learning_rate": 3.438692143946445e-05, + "loss": 1.8093, + "step": 8158 + }, + { + "epoch": 0.6141628559062082, + "grad_norm": 5.439431190490723, + "learning_rate": 3.437534060381473e-05, + "loss": 1.8431, + "step": 8159 + }, + { + "epoch": 0.6142381301868683, + "grad_norm": 3.9665729999542236, + "learning_rate": 3.4363760697006155e-05, + "loss": 1.963, + "step": 8160 + }, + { + "epoch": 0.6143134044675286, + "grad_norm": 4.183095455169678, + "learning_rate": 3.4352181719727156e-05, + "loss": 1.9875, + "step": 8161 + }, + { + "epoch": 0.6143886787481887, + "grad_norm": 4.535946846008301, + "learning_rate": 3.434060367266604e-05, + "loss": 1.634, + "step": 8162 + }, + { + "epoch": 0.6144639530288488, + "grad_norm": 5.891911506652832, + "learning_rate": 3.4329026556511137e-05, + "loss": 1.8804, + "step": 8163 + }, + { + "epoch": 0.614539227309509, + "grad_norm": 6.393861770629883, + "learning_rate": 3.431745037195061e-05, + "loss": 1.7716, + "step": 8164 + }, + { + "epoch": 0.6146145015901692, + "grad_norm": 5.3104248046875, + "learning_rate": 3.430587511967267e-05, + "loss": 1.8541, + "step": 8165 + }, + { + "epoch": 0.6146897758708293, + "grad_norm": 4.353861331939697, + "learning_rate": 3.429430080036541e-05, + "loss": 2.017, + "step": 8166 + }, + { + "epoch": 0.6147650501514895, + "grad_norm": 4.105453968048096, + "learning_rate": 3.428272741471693e-05, + "loss": 1.8968, + "step": 8167 + }, + { + "epoch": 0.6148403244321496, + "grad_norm": 3.8583414554595947, + "learning_rate": 3.427115496341518e-05, + "loss": 1.9697, + "step": 8168 + }, + { + "epoch": 0.6149155987128098, + "grad_norm": 4.821552753448486, + "learning_rate": 3.425958344714816e-05, + "loss": 1.6207, + "step": 8169 + }, + { + "epoch": 0.61499087299347, + "grad_norm": 4.335643768310547, + "learning_rate": 3.424801286660372e-05, + "loss": 1.6681, + "step": 8170 + }, + { + "epoch": 0.6150661472741301, + "grad_norm": 5.145238399505615, + "learning_rate": 3.4236443222469715e-05, + "loss": 1.8731, + "step": 8171 + }, + { + "epoch": 0.6151414215547902, + "grad_norm": 4.591930389404297, + "learning_rate": 3.422487451543395e-05, + "loss": 2.0362, + "step": 8172 + }, + { + "epoch": 0.6152166958354505, + "grad_norm": 4.320077896118164, + "learning_rate": 3.421330674618411e-05, + "loss": 1.7148, + "step": 8173 + }, + { + "epoch": 0.6152919701161106, + "grad_norm": 8.470335960388184, + "learning_rate": 3.420173991540791e-05, + "loss": 1.5307, + "step": 8174 + }, + { + "epoch": 0.6153672443967707, + "grad_norm": 4.425691604614258, + "learning_rate": 3.419017402379292e-05, + "loss": 1.7212, + "step": 8175 + }, + { + "epoch": 0.6154425186774309, + "grad_norm": 4.004261016845703, + "learning_rate": 3.4178609072026745e-05, + "loss": 1.9242, + "step": 8176 + }, + { + "epoch": 0.6155177929580911, + "grad_norm": 5.919533729553223, + "learning_rate": 3.416704506079684e-05, + "loss": 2.2793, + "step": 8177 + }, + { + "epoch": 0.6155930672387512, + "grad_norm": 5.656198024749756, + "learning_rate": 3.415548199079069e-05, + "loss": 2.0532, + "step": 8178 + }, + { + "epoch": 0.6156683415194114, + "grad_norm": 4.965191841125488, + "learning_rate": 3.414391986269566e-05, + "loss": 1.9619, + "step": 8179 + }, + { + "epoch": 0.6157436158000715, + "grad_norm": 3.8956573009490967, + "learning_rate": 3.413235867719912e-05, + "loss": 1.894, + "step": 8180 + }, + { + "epoch": 0.6158188900807317, + "grad_norm": 4.109010219573975, + "learning_rate": 3.4120798434988296e-05, + "loss": 1.7619, + "step": 8181 + }, + { + "epoch": 0.6158941643613918, + "grad_norm": 5.274187088012695, + "learning_rate": 3.410923913675047e-05, + "loss": 1.5277, + "step": 8182 + }, + { + "epoch": 0.615969438642052, + "grad_norm": 4.784882068634033, + "learning_rate": 3.4097680783172764e-05, + "loss": 1.7618, + "step": 8183 + }, + { + "epoch": 0.6160447129227121, + "grad_norm": 3.6030797958374023, + "learning_rate": 3.408612337494233e-05, + "loss": 1.9912, + "step": 8184 + }, + { + "epoch": 0.6161199872033722, + "grad_norm": 4.4571919441223145, + "learning_rate": 3.407456691274618e-05, + "loss": 1.9, + "step": 8185 + }, + { + "epoch": 0.6161952614840325, + "grad_norm": 3.6722207069396973, + "learning_rate": 3.406301139727132e-05, + "loss": 1.7955, + "step": 8186 + }, + { + "epoch": 0.6162705357646926, + "grad_norm": 5.215631008148193, + "learning_rate": 3.405145682920473e-05, + "loss": 1.5753, + "step": 8187 + }, + { + "epoch": 0.6163458100453527, + "grad_norm": 5.818994045257568, + "learning_rate": 3.403990320923325e-05, + "loss": 2.4141, + "step": 8188 + }, + { + "epoch": 0.616421084326013, + "grad_norm": 4.393082141876221, + "learning_rate": 3.402835053804374e-05, + "loss": 1.9337, + "step": 8189 + }, + { + "epoch": 0.6164963586066731, + "grad_norm": 4.876507759094238, + "learning_rate": 3.401679881632296e-05, + "loss": 1.9883, + "step": 8190 + }, + { + "epoch": 0.6165716328873332, + "grad_norm": 6.674361228942871, + "learning_rate": 3.400524804475764e-05, + "loss": 1.7115, + "step": 8191 + }, + { + "epoch": 0.6166469071679934, + "grad_norm": 3.4941883087158203, + "learning_rate": 3.3993698224034415e-05, + "loss": 1.7839, + "step": 8192 + }, + { + "epoch": 0.6167221814486535, + "grad_norm": 4.829101085662842, + "learning_rate": 3.398214935483991e-05, + "loss": 1.9081, + "step": 8193 + }, + { + "epoch": 0.6167974557293137, + "grad_norm": 6.579063415527344, + "learning_rate": 3.397060143786067e-05, + "loss": 1.8436, + "step": 8194 + }, + { + "epoch": 0.6168727300099739, + "grad_norm": 5.214309215545654, + "learning_rate": 3.3959054473783175e-05, + "loss": 1.8023, + "step": 8195 + }, + { + "epoch": 0.616948004290634, + "grad_norm": 6.181789875030518, + "learning_rate": 3.394750846329386e-05, + "loss": 1.9845, + "step": 8196 + }, + { + "epoch": 0.6170232785712941, + "grad_norm": 6.282076358795166, + "learning_rate": 3.393596340707912e-05, + "loss": 2.4618, + "step": 8197 + }, + { + "epoch": 0.6170985528519544, + "grad_norm": 4.5025787353515625, + "learning_rate": 3.392441930582525e-05, + "loss": 1.9083, + "step": 8198 + }, + { + "epoch": 0.6171738271326145, + "grad_norm": 3.552966356277466, + "learning_rate": 3.391287616021853e-05, + "loss": 1.9448, + "step": 8199 + }, + { + "epoch": 0.6172491014132746, + "grad_norm": 4.1839447021484375, + "learning_rate": 3.390133397094516e-05, + "loss": 1.7578, + "step": 8200 + }, + { + "epoch": 0.6173243756939347, + "grad_norm": 4.193100929260254, + "learning_rate": 3.3889792738691314e-05, + "loss": 1.6644, + "step": 8201 + }, + { + "epoch": 0.617399649974595, + "grad_norm": 3.751049518585205, + "learning_rate": 3.387825246414303e-05, + "loss": 1.6328, + "step": 8202 + }, + { + "epoch": 0.6174749242552551, + "grad_norm": 5.038300037384033, + "learning_rate": 3.3866713147986394e-05, + "loss": 2.0151, + "step": 8203 + }, + { + "epoch": 0.6175501985359152, + "grad_norm": 8.461604118347168, + "learning_rate": 3.385517479090738e-05, + "loss": 2.0889, + "step": 8204 + }, + { + "epoch": 0.6176254728165754, + "grad_norm": 5.9980950355529785, + "learning_rate": 3.3843637393591894e-05, + "loss": 1.9154, + "step": 8205 + }, + { + "epoch": 0.6177007470972355, + "grad_norm": 4.784667015075684, + "learning_rate": 3.383210095672581e-05, + "loss": 2.1451, + "step": 8206 + }, + { + "epoch": 0.6177760213778957, + "grad_norm": 4.672134876251221, + "learning_rate": 3.3820565480994913e-05, + "loss": 1.3894, + "step": 8207 + }, + { + "epoch": 0.6178512956585559, + "grad_norm": 5.144842624664307, + "learning_rate": 3.380903096708501e-05, + "loss": 2.0122, + "step": 8208 + }, + { + "epoch": 0.617926569939216, + "grad_norm": 4.93029260635376, + "learning_rate": 3.3797497415681725e-05, + "loss": 2.104, + "step": 8209 + }, + { + "epoch": 0.6180018442198761, + "grad_norm": 3.9838309288024902, + "learning_rate": 3.378596482747075e-05, + "loss": 1.8608, + "step": 8210 + }, + { + "epoch": 0.6180771185005364, + "grad_norm": 4.014377117156982, + "learning_rate": 3.377443320313762e-05, + "loss": 1.6941, + "step": 8211 + }, + { + "epoch": 0.6181523927811965, + "grad_norm": 6.950359344482422, + "learning_rate": 3.3762902543367916e-05, + "loss": 1.7919, + "step": 8212 + }, + { + "epoch": 0.6182276670618566, + "grad_norm": 5.2452545166015625, + "learning_rate": 3.375137284884703e-05, + "loss": 2.0207, + "step": 8213 + }, + { + "epoch": 0.6183029413425168, + "grad_norm": 5.115932941436768, + "learning_rate": 3.3739844120260436e-05, + "loss": 1.6745, + "step": 8214 + }, + { + "epoch": 0.618378215623177, + "grad_norm": 7.520609378814697, + "learning_rate": 3.372831635829343e-05, + "loss": 2.2043, + "step": 8215 + }, + { + "epoch": 0.6184534899038371, + "grad_norm": 3.8072352409362793, + "learning_rate": 3.3716789563631334e-05, + "loss": 1.7571, + "step": 8216 + }, + { + "epoch": 0.6185287641844973, + "grad_norm": 4.510412693023682, + "learning_rate": 3.370526373695937e-05, + "loss": 1.634, + "step": 8217 + }, + { + "epoch": 0.6186040384651574, + "grad_norm": 4.100464344024658, + "learning_rate": 3.3693738878962745e-05, + "loss": 1.6751, + "step": 8218 + }, + { + "epoch": 0.6186793127458176, + "grad_norm": 5.649510860443115, + "learning_rate": 3.368221499032653e-05, + "loss": 1.631, + "step": 8219 + }, + { + "epoch": 0.6187545870264777, + "grad_norm": 4.242654800415039, + "learning_rate": 3.3670692071735824e-05, + "loss": 1.9873, + "step": 8220 + }, + { + "epoch": 0.6188298613071379, + "grad_norm": 5.126852989196777, + "learning_rate": 3.365917012387563e-05, + "loss": 1.9031, + "step": 8221 + }, + { + "epoch": 0.618905135587798, + "grad_norm": 6.010108470916748, + "learning_rate": 3.364764914743087e-05, + "loss": 2.0317, + "step": 8222 + }, + { + "epoch": 0.6189804098684581, + "grad_norm": 4.9840569496154785, + "learning_rate": 3.363612914308645e-05, + "loss": 1.8559, + "step": 8223 + }, + { + "epoch": 0.6190556841491184, + "grad_norm": 5.7103142738342285, + "learning_rate": 3.36246101115272e-05, + "loss": 2.093, + "step": 8224 + }, + { + "epoch": 0.6191309584297785, + "grad_norm": 4.976246356964111, + "learning_rate": 3.36130920534379e-05, + "loss": 1.5783, + "step": 8225 + }, + { + "epoch": 0.6192062327104386, + "grad_norm": 4.657334327697754, + "learning_rate": 3.3601574969503255e-05, + "loss": 1.9017, + "step": 8226 + }, + { + "epoch": 0.6192815069910989, + "grad_norm": 4.421818256378174, + "learning_rate": 3.359005886040792e-05, + "loss": 1.7282, + "step": 8227 + }, + { + "epoch": 0.619356781271759, + "grad_norm": 3.9061472415924072, + "learning_rate": 3.35785437268365e-05, + "loss": 1.7746, + "step": 8228 + }, + { + "epoch": 0.6194320555524191, + "grad_norm": 5.525216579437256, + "learning_rate": 3.356702956947355e-05, + "loss": 1.7409, + "step": 8229 + }, + { + "epoch": 0.6195073298330793, + "grad_norm": 3.159904956817627, + "learning_rate": 3.3555516389003525e-05, + "loss": 1.7272, + "step": 8230 + }, + { + "epoch": 0.6195826041137394, + "grad_norm": 5.123450756072998, + "learning_rate": 3.3544004186110886e-05, + "loss": 2.1347, + "step": 8231 + }, + { + "epoch": 0.6196578783943996, + "grad_norm": 5.510494232177734, + "learning_rate": 3.353249296147997e-05, + "loss": 1.9331, + "step": 8232 + }, + { + "epoch": 0.6197331526750598, + "grad_norm": 4.655089378356934, + "learning_rate": 3.352098271579509e-05, + "loss": 2.3762, + "step": 8233 + }, + { + "epoch": 0.6198084269557199, + "grad_norm": 4.618927001953125, + "learning_rate": 3.35094734497405e-05, + "loss": 2.1618, + "step": 8234 + }, + { + "epoch": 0.61988370123638, + "grad_norm": 6.0057477951049805, + "learning_rate": 3.34979651640004e-05, + "loss": 1.8834, + "step": 8235 + }, + { + "epoch": 0.6199589755170403, + "grad_norm": 8.42966079711914, + "learning_rate": 3.3486457859258934e-05, + "loss": 1.8661, + "step": 8236 + }, + { + "epoch": 0.6200342497977004, + "grad_norm": 4.626075267791748, + "learning_rate": 3.347495153620015e-05, + "loss": 2.2126, + "step": 8237 + }, + { + "epoch": 0.6201095240783605, + "grad_norm": 5.205615520477295, + "learning_rate": 3.3463446195508094e-05, + "loss": 1.9239, + "step": 8238 + }, + { + "epoch": 0.6201847983590206, + "grad_norm": 5.049977779388428, + "learning_rate": 3.34519418378667e-05, + "loss": 2.08, + "step": 8239 + }, + { + "epoch": 0.6202600726396809, + "grad_norm": 5.072951316833496, + "learning_rate": 3.344043846395991e-05, + "loss": 1.7105, + "step": 8240 + }, + { + "epoch": 0.620335346920341, + "grad_norm": 4.272696495056152, + "learning_rate": 3.342893607447151e-05, + "loss": 2.5105, + "step": 8241 + }, + { + "epoch": 0.6204106212010011, + "grad_norm": 4.273044109344482, + "learning_rate": 3.3417434670085335e-05, + "loss": 2.1126, + "step": 8242 + }, + { + "epoch": 0.6204858954816613, + "grad_norm": 3.3404288291931152, + "learning_rate": 3.340593425148507e-05, + "loss": 1.7529, + "step": 8243 + }, + { + "epoch": 0.6205611697623215, + "grad_norm": 4.337212085723877, + "learning_rate": 3.3394434819354404e-05, + "loss": 1.7534, + "step": 8244 + }, + { + "epoch": 0.6206364440429816, + "grad_norm": 4.941805362701416, + "learning_rate": 3.3382936374376935e-05, + "loss": 1.5829, + "step": 8245 + }, + { + "epoch": 0.6207117183236418, + "grad_norm": 5.1706976890563965, + "learning_rate": 3.337143891723624e-05, + "loss": 2.3988, + "step": 8246 + }, + { + "epoch": 0.6207869926043019, + "grad_norm": 4.825283050537109, + "learning_rate": 3.335994244861577e-05, + "loss": 1.911, + "step": 8247 + }, + { + "epoch": 0.620862266884962, + "grad_norm": 3.969494342803955, + "learning_rate": 3.3348446969198984e-05, + "loss": 1.7953, + "step": 8248 + }, + { + "epoch": 0.6209375411656223, + "grad_norm": 4.922825336456299, + "learning_rate": 3.3336952479669234e-05, + "loss": 1.9632, + "step": 8249 + }, + { + "epoch": 0.6210128154462824, + "grad_norm": 4.728207111358643, + "learning_rate": 3.332545898070987e-05, + "loss": 1.7429, + "step": 8250 + }, + { + "epoch": 0.6210880897269425, + "grad_norm": 4.985260009765625, + "learning_rate": 3.3313966473004096e-05, + "loss": 1.6192, + "step": 8251 + }, + { + "epoch": 0.6211633640076027, + "grad_norm": 4.827419757843018, + "learning_rate": 3.330247495723515e-05, + "loss": 1.8011, + "step": 8252 + }, + { + "epoch": 0.6212386382882629, + "grad_norm": 4.1476826667785645, + "learning_rate": 3.329098443408617e-05, + "loss": 1.8888, + "step": 8253 + }, + { + "epoch": 0.621313912568923, + "grad_norm": 4.269753932952881, + "learning_rate": 3.3279494904240206e-05, + "loss": 2.1614, + "step": 8254 + }, + { + "epoch": 0.6213891868495832, + "grad_norm": 4.465358734130859, + "learning_rate": 3.3268006368380304e-05, + "loss": 1.6303, + "step": 8255 + }, + { + "epoch": 0.6214644611302433, + "grad_norm": 3.966667652130127, + "learning_rate": 3.32565188271894e-05, + "loss": 1.7639, + "step": 8256 + }, + { + "epoch": 0.6215397354109035, + "grad_norm": 3.632319211959839, + "learning_rate": 3.3245032281350435e-05, + "loss": 1.8971, + "step": 8257 + }, + { + "epoch": 0.6216150096915636, + "grad_norm": 4.717795372009277, + "learning_rate": 3.3233546731546204e-05, + "loss": 2.2254, + "step": 8258 + }, + { + "epoch": 0.6216902839722238, + "grad_norm": 3.8923418521881104, + "learning_rate": 3.3222062178459526e-05, + "loss": 2.115, + "step": 8259 + }, + { + "epoch": 0.6217655582528839, + "grad_norm": 4.7175092697143555, + "learning_rate": 3.32105786227731e-05, + "loss": 1.946, + "step": 8260 + }, + { + "epoch": 0.621840832533544, + "grad_norm": 4.205885887145996, + "learning_rate": 3.31990960651696e-05, + "loss": 1.6029, + "step": 8261 + }, + { + "epoch": 0.6219161068142043, + "grad_norm": 6.618117809295654, + "learning_rate": 3.318761450633163e-05, + "loss": 2.1752, + "step": 8262 + }, + { + "epoch": 0.6219913810948644, + "grad_norm": 5.104174613952637, + "learning_rate": 3.317613394694175e-05, + "loss": 2.1454, + "step": 8263 + }, + { + "epoch": 0.6220666553755245, + "grad_norm": 7.77531623840332, + "learning_rate": 3.316465438768242e-05, + "loss": 1.325, + "step": 8264 + }, + { + "epoch": 0.6221419296561848, + "grad_norm": 5.644231796264648, + "learning_rate": 3.3153175829236085e-05, + "loss": 2.0972, + "step": 8265 + }, + { + "epoch": 0.6222172039368449, + "grad_norm": 4.986363410949707, + "learning_rate": 3.3141698272285107e-05, + "loss": 2.1437, + "step": 8266 + }, + { + "epoch": 0.622292478217505, + "grad_norm": 3.750397205352783, + "learning_rate": 3.313022171751179e-05, + "loss": 1.8346, + "step": 8267 + }, + { + "epoch": 0.6223677524981652, + "grad_norm": 5.335587024688721, + "learning_rate": 3.31187461655984e-05, + "loss": 1.5425, + "step": 8268 + }, + { + "epoch": 0.6224430267788253, + "grad_norm": 3.871005058288574, + "learning_rate": 3.3107271617227096e-05, + "loss": 1.8154, + "step": 8269 + }, + { + "epoch": 0.6225183010594855, + "grad_norm": 4.961533546447754, + "learning_rate": 3.3095798073080055e-05, + "loss": 1.8041, + "step": 8270 + }, + { + "epoch": 0.6225935753401457, + "grad_norm": 3.8872344493865967, + "learning_rate": 3.3084325533839275e-05, + "loss": 1.5672, + "step": 8271 + }, + { + "epoch": 0.6226688496208058, + "grad_norm": 3.621957778930664, + "learning_rate": 3.3072854000186836e-05, + "loss": 1.8466, + "step": 8272 + }, + { + "epoch": 0.6227441239014659, + "grad_norm": 3.66719913482666, + "learning_rate": 3.306138347280464e-05, + "loss": 1.8955, + "step": 8273 + }, + { + "epoch": 0.6228193981821262, + "grad_norm": 4.060410499572754, + "learning_rate": 3.304991395237461e-05, + "loss": 1.9046, + "step": 8274 + }, + { + "epoch": 0.6228946724627863, + "grad_norm": 4.263533115386963, + "learning_rate": 3.3038445439578544e-05, + "loss": 2.0354, + "step": 8275 + }, + { + "epoch": 0.6229699467434464, + "grad_norm": 4.144182205200195, + "learning_rate": 3.302697793509825e-05, + "loss": 1.8561, + "step": 8276 + }, + { + "epoch": 0.6230452210241066, + "grad_norm": 4.593705177307129, + "learning_rate": 3.3015511439615384e-05, + "loss": 1.7105, + "step": 8277 + }, + { + "epoch": 0.6231204953047668, + "grad_norm": 4.700996398925781, + "learning_rate": 3.300404595381165e-05, + "loss": 1.9159, + "step": 8278 + }, + { + "epoch": 0.6231957695854269, + "grad_norm": 7.516633033752441, + "learning_rate": 3.2992581478368614e-05, + "loss": 2.63, + "step": 8279 + }, + { + "epoch": 0.623271043866087, + "grad_norm": 4.484390735626221, + "learning_rate": 3.2981118013967815e-05, + "loss": 2.0504, + "step": 8280 + }, + { + "epoch": 0.6233463181467472, + "grad_norm": 4.061007022857666, + "learning_rate": 3.29696555612907e-05, + "loss": 1.6844, + "step": 8281 + }, + { + "epoch": 0.6234215924274074, + "grad_norm": 4.173439025878906, + "learning_rate": 3.295819412101872e-05, + "loss": 1.586, + "step": 8282 + }, + { + "epoch": 0.6234968667080675, + "grad_norm": 4.952429294586182, + "learning_rate": 3.294673369383318e-05, + "loss": 2.1062, + "step": 8283 + }, + { + "epoch": 0.6235721409887277, + "grad_norm": 5.733234405517578, + "learning_rate": 3.2935274280415385e-05, + "loss": 2.3618, + "step": 8284 + }, + { + "epoch": 0.6236474152693878, + "grad_norm": 3.6483817100524902, + "learning_rate": 3.2923815881446594e-05, + "loss": 1.8382, + "step": 8285 + }, + { + "epoch": 0.623722689550048, + "grad_norm": 4.784677028656006, + "learning_rate": 3.291235849760794e-05, + "loss": 1.767, + "step": 8286 + }, + { + "epoch": 0.6237979638307082, + "grad_norm": 5.480867385864258, + "learning_rate": 3.290090212958056e-05, + "loss": 2.1439, + "step": 8287 + }, + { + "epoch": 0.6238732381113683, + "grad_norm": 5.025787353515625, + "learning_rate": 3.2889446778045464e-05, + "loss": 1.8821, + "step": 8288 + }, + { + "epoch": 0.6239485123920284, + "grad_norm": 4.936732769012451, + "learning_rate": 3.287799244368367e-05, + "loss": 1.8836, + "step": 8289 + }, + { + "epoch": 0.6240237866726887, + "grad_norm": 3.6807878017425537, + "learning_rate": 3.286653912717609e-05, + "loss": 1.7756, + "step": 8290 + }, + { + "epoch": 0.6240990609533488, + "grad_norm": 4.392080307006836, + "learning_rate": 3.285508682920361e-05, + "loss": 1.8883, + "step": 8291 + }, + { + "epoch": 0.6241743352340089, + "grad_norm": 4.467647075653076, + "learning_rate": 3.2843635550447004e-05, + "loss": 1.6686, + "step": 8292 + }, + { + "epoch": 0.6242496095146691, + "grad_norm": 3.7380247116088867, + "learning_rate": 3.283218529158706e-05, + "loss": 2.0058, + "step": 8293 + }, + { + "epoch": 0.6243248837953292, + "grad_norm": 7.30566930770874, + "learning_rate": 3.282073605330443e-05, + "loss": 1.808, + "step": 8294 + }, + { + "epoch": 0.6244001580759894, + "grad_norm": 4.505008220672607, + "learning_rate": 3.280928783627975e-05, + "loss": 1.8222, + "step": 8295 + }, + { + "epoch": 0.6244754323566496, + "grad_norm": 4.26721715927124, + "learning_rate": 3.279784064119357e-05, + "loss": 2.1332, + "step": 8296 + }, + { + "epoch": 0.6245507066373097, + "grad_norm": 5.7599382400512695, + "learning_rate": 3.2786394468726436e-05, + "loss": 1.5231, + "step": 8297 + }, + { + "epoch": 0.6246259809179698, + "grad_norm": 4.933229923248291, + "learning_rate": 3.277494931955873e-05, + "loss": 1.7742, + "step": 8298 + }, + { + "epoch": 0.62470125519863, + "grad_norm": 4.214962005615234, + "learning_rate": 3.2763505194370866e-05, + "loss": 2.0699, + "step": 8299 + }, + { + "epoch": 0.6247765294792902, + "grad_norm": 5.859561920166016, + "learning_rate": 3.275206209384317e-05, + "loss": 2.0165, + "step": 8300 + }, + { + "epoch": 0.6248518037599503, + "grad_norm": 5.256788730621338, + "learning_rate": 3.2740620018655885e-05, + "loss": 1.9123, + "step": 8301 + }, + { + "epoch": 0.6249270780406104, + "grad_norm": 7.470607280731201, + "learning_rate": 3.272917896948923e-05, + "loss": 1.8502, + "step": 8302 + }, + { + "epoch": 0.6250023523212707, + "grad_norm": 4.81989860534668, + "learning_rate": 3.2717738947023314e-05, + "loss": 2.2194, + "step": 8303 + }, + { + "epoch": 0.6250776266019308, + "grad_norm": 6.651296138763428, + "learning_rate": 3.2706299951938255e-05, + "loss": 1.789, + "step": 8304 + }, + { + "epoch": 0.6251529008825909, + "grad_norm": 4.168320655822754, + "learning_rate": 3.269486198491402e-05, + "loss": 1.7555, + "step": 8305 + }, + { + "epoch": 0.6252281751632511, + "grad_norm": 4.957070827484131, + "learning_rate": 3.268342504663059e-05, + "loss": 2.1281, + "step": 8306 + }, + { + "epoch": 0.6253034494439113, + "grad_norm": 5.425527572631836, + "learning_rate": 3.267198913776785e-05, + "loss": 1.7913, + "step": 8307 + }, + { + "epoch": 0.6253787237245714, + "grad_norm": 4.317502975463867, + "learning_rate": 3.266055425900565e-05, + "loss": 1.8442, + "step": 8308 + }, + { + "epoch": 0.6254539980052316, + "grad_norm": 5.058265209197998, + "learning_rate": 3.2649120411023734e-05, + "loss": 1.7928, + "step": 8309 + }, + { + "epoch": 0.6255292722858917, + "grad_norm": 7.60575532913208, + "learning_rate": 3.263768759450183e-05, + "loss": 2.0965, + "step": 8310 + }, + { + "epoch": 0.6256045465665518, + "grad_norm": 5.555531978607178, + "learning_rate": 3.262625581011957e-05, + "loss": 2.057, + "step": 8311 + }, + { + "epoch": 0.6256798208472121, + "grad_norm": 4.312882900238037, + "learning_rate": 3.2614825058556573e-05, + "loss": 1.5621, + "step": 8312 + }, + { + "epoch": 0.6257550951278722, + "grad_norm": 4.7362871170043945, + "learning_rate": 3.260339534049232e-05, + "loss": 1.9293, + "step": 8313 + }, + { + "epoch": 0.6258303694085323, + "grad_norm": 4.182237148284912, + "learning_rate": 3.259196665660632e-05, + "loss": 1.8929, + "step": 8314 + }, + { + "epoch": 0.6259056436891925, + "grad_norm": 4.8938374519348145, + "learning_rate": 3.2580539007577934e-05, + "loss": 2.2774, + "step": 8315 + }, + { + "epoch": 0.6259809179698527, + "grad_norm": 4.275259017944336, + "learning_rate": 3.256911239408652e-05, + "loss": 1.8442, + "step": 8316 + }, + { + "epoch": 0.6260561922505128, + "grad_norm": 5.5678486824035645, + "learning_rate": 3.255768681681137e-05, + "loss": 1.799, + "step": 8317 + }, + { + "epoch": 0.6261314665311729, + "grad_norm": 5.898873329162598, + "learning_rate": 3.254626227643168e-05, + "loss": 1.6108, + "step": 8318 + }, + { + "epoch": 0.6262067408118331, + "grad_norm": 5.793529510498047, + "learning_rate": 3.253483877362664e-05, + "loss": 1.7145, + "step": 8319 + }, + { + "epoch": 0.6262820150924933, + "grad_norm": 5.791046142578125, + "learning_rate": 3.252341630907531e-05, + "loss": 1.6596, + "step": 8320 + }, + { + "epoch": 0.6263572893731534, + "grad_norm": 3.4461171627044678, + "learning_rate": 3.251199488345674e-05, + "loss": 1.5891, + "step": 8321 + }, + { + "epoch": 0.6264325636538136, + "grad_norm": 4.364080905914307, + "learning_rate": 3.2500574497449874e-05, + "loss": 2.0532, + "step": 8322 + }, + { + "epoch": 0.6265078379344737, + "grad_norm": 4.662264823913574, + "learning_rate": 3.2489155151733665e-05, + "loss": 2.3847, + "step": 8323 + }, + { + "epoch": 0.6265831122151339, + "grad_norm": 5.796299934387207, + "learning_rate": 3.2477736846986924e-05, + "loss": 2.0372, + "step": 8324 + }, + { + "epoch": 0.6266583864957941, + "grad_norm": 5.66961669921875, + "learning_rate": 3.2466319583888464e-05, + "loss": 1.543, + "step": 8325 + }, + { + "epoch": 0.6267336607764542, + "grad_norm": 4.356776714324951, + "learning_rate": 3.245490336311698e-05, + "loss": 1.6971, + "step": 8326 + }, + { + "epoch": 0.6268089350571143, + "grad_norm": 4.802499294281006, + "learning_rate": 3.2443488185351154e-05, + "loss": 2.1443, + "step": 8327 + }, + { + "epoch": 0.6268842093377746, + "grad_norm": 3.842556953430176, + "learning_rate": 3.2432074051269576e-05, + "loss": 1.7824, + "step": 8328 + }, + { + "epoch": 0.6269594836184347, + "grad_norm": 4.327637195587158, + "learning_rate": 3.242066096155081e-05, + "loss": 1.8276, + "step": 8329 + }, + { + "epoch": 0.6270347578990948, + "grad_norm": 4.579751014709473, + "learning_rate": 3.240924891687328e-05, + "loss": 1.7164, + "step": 8330 + }, + { + "epoch": 0.627110032179755, + "grad_norm": 5.32450008392334, + "learning_rate": 3.239783791791546e-05, + "loss": 1.871, + "step": 8331 + }, + { + "epoch": 0.6271853064604151, + "grad_norm": 4.403231143951416, + "learning_rate": 3.2386427965355636e-05, + "loss": 1.924, + "step": 8332 + }, + { + "epoch": 0.6272605807410753, + "grad_norm": 7.266706466674805, + "learning_rate": 3.2375019059872144e-05, + "loss": 2.0374, + "step": 8333 + }, + { + "epoch": 0.6273358550217355, + "grad_norm": 5.705352783203125, + "learning_rate": 3.236361120214319e-05, + "loss": 1.929, + "step": 8334 + }, + { + "epoch": 0.6274111293023956, + "grad_norm": 4.518496513366699, + "learning_rate": 3.235220439284695e-05, + "loss": 1.7801, + "step": 8335 + }, + { + "epoch": 0.6274864035830557, + "grad_norm": 5.234706401824951, + "learning_rate": 3.234079863266154e-05, + "loss": 1.5562, + "step": 8336 + }, + { + "epoch": 0.6275616778637159, + "grad_norm": 4.721508979797363, + "learning_rate": 3.232939392226496e-05, + "loss": 1.6654, + "step": 8337 + }, + { + "epoch": 0.6276369521443761, + "grad_norm": 5.178009510040283, + "learning_rate": 3.2317990262335215e-05, + "loss": 1.9676, + "step": 8338 + }, + { + "epoch": 0.6277122264250362, + "grad_norm": 4.755590438842773, + "learning_rate": 3.230658765355021e-05, + "loss": 1.9271, + "step": 8339 + }, + { + "epoch": 0.6277875007056963, + "grad_norm": 4.829550266265869, + "learning_rate": 3.2295186096587815e-05, + "loss": 2.1921, + "step": 8340 + }, + { + "epoch": 0.6278627749863566, + "grad_norm": 4.162178039550781, + "learning_rate": 3.2283785592125785e-05, + "loss": 1.689, + "step": 8341 + }, + { + "epoch": 0.6279380492670167, + "grad_norm": 6.179942607879639, + "learning_rate": 3.2272386140841895e-05, + "loss": 1.8758, + "step": 8342 + }, + { + "epoch": 0.6280133235476768, + "grad_norm": 4.728642463684082, + "learning_rate": 3.226098774341376e-05, + "loss": 1.5701, + "step": 8343 + }, + { + "epoch": 0.628088597828337, + "grad_norm": 5.58257532119751, + "learning_rate": 3.224959040051902e-05, + "loss": 1.6969, + "step": 8344 + }, + { + "epoch": 0.6281638721089972, + "grad_norm": 4.4962897300720215, + "learning_rate": 3.223819411283517e-05, + "loss": 1.7774, + "step": 8345 + }, + { + "epoch": 0.6282391463896573, + "grad_norm": 4.467648029327393, + "learning_rate": 3.222679888103975e-05, + "loss": 1.9221, + "step": 8346 + }, + { + "epoch": 0.6283144206703175, + "grad_norm": 4.631173133850098, + "learning_rate": 3.221540470581011e-05, + "loss": 1.8135, + "step": 8347 + }, + { + "epoch": 0.6283896949509776, + "grad_norm": 7.60060453414917, + "learning_rate": 3.220401158782365e-05, + "loss": 2.0137, + "step": 8348 + }, + { + "epoch": 0.6284649692316377, + "grad_norm": 6.273983955383301, + "learning_rate": 3.219261952775764e-05, + "loss": 1.7037, + "step": 8349 + }, + { + "epoch": 0.628540243512298, + "grad_norm": 6.07139253616333, + "learning_rate": 3.218122852628929e-05, + "loss": 2.0992, + "step": 8350 + }, + { + "epoch": 0.6286155177929581, + "grad_norm": 4.815303802490234, + "learning_rate": 3.216983858409579e-05, + "loss": 1.4361, + "step": 8351 + }, + { + "epoch": 0.6286907920736182, + "grad_norm": 6.025692939758301, + "learning_rate": 3.215844970185421e-05, + "loss": 1.7152, + "step": 8352 + }, + { + "epoch": 0.6287660663542785, + "grad_norm": 7.418488502502441, + "learning_rate": 3.214706188024162e-05, + "loss": 1.9619, + "step": 8353 + }, + { + "epoch": 0.6288413406349386, + "grad_norm": 3.55694842338562, + "learning_rate": 3.213567511993497e-05, + "loss": 1.9477, + "step": 8354 + }, + { + "epoch": 0.6289166149155987, + "grad_norm": 3.8427464962005615, + "learning_rate": 3.2124289421611175e-05, + "loss": 1.7306, + "step": 8355 + }, + { + "epoch": 0.6289918891962588, + "grad_norm": 4.430111885070801, + "learning_rate": 3.211290478594707e-05, + "loss": 1.8016, + "step": 8356 + }, + { + "epoch": 0.629067163476919, + "grad_norm": 4.9731764793396, + "learning_rate": 3.210152121361948e-05, + "loss": 1.6231, + "step": 8357 + }, + { + "epoch": 0.6291424377575792, + "grad_norm": 4.950997829437256, + "learning_rate": 3.209013870530507e-05, + "loss": 1.6704, + "step": 8358 + }, + { + "epoch": 0.6292177120382393, + "grad_norm": 4.167163372039795, + "learning_rate": 3.207875726168055e-05, + "loss": 1.9497, + "step": 8359 + }, + { + "epoch": 0.6292929863188995, + "grad_norm": 3.9677810668945312, + "learning_rate": 3.2067376883422464e-05, + "loss": 1.9058, + "step": 8360 + }, + { + "epoch": 0.6293682605995596, + "grad_norm": 4.717792510986328, + "learning_rate": 3.205599757120737e-05, + "loss": 1.7056, + "step": 8361 + }, + { + "epoch": 0.6294435348802198, + "grad_norm": 5.162610054016113, + "learning_rate": 3.204461932571174e-05, + "loss": 1.8545, + "step": 8362 + }, + { + "epoch": 0.62951880916088, + "grad_norm": 5.355527877807617, + "learning_rate": 3.203324214761198e-05, + "loss": 1.9754, + "step": 8363 + }, + { + "epoch": 0.6295940834415401, + "grad_norm": 4.509415149688721, + "learning_rate": 3.202186603758442e-05, + "loss": 1.9866, + "step": 8364 + }, + { + "epoch": 0.6296693577222002, + "grad_norm": 4.520984649658203, + "learning_rate": 3.201049099630532e-05, + "loss": 1.8496, + "step": 8365 + }, + { + "epoch": 0.6297446320028605, + "grad_norm": 4.6440205574035645, + "learning_rate": 3.1999117024450924e-05, + "loss": 1.9134, + "step": 8366 + }, + { + "epoch": 0.6298199062835206, + "grad_norm": 3.9407575130462646, + "learning_rate": 3.198774412269736e-05, + "loss": 1.7924, + "step": 8367 + }, + { + "epoch": 0.6298951805641807, + "grad_norm": 3.958294630050659, + "learning_rate": 3.197637229172076e-05, + "loss": 2.0424, + "step": 8368 + }, + { + "epoch": 0.6299704548448409, + "grad_norm": 6.674445629119873, + "learning_rate": 3.1965001532197075e-05, + "loss": 2.0018, + "step": 8369 + }, + { + "epoch": 0.630045729125501, + "grad_norm": 3.707188367843628, + "learning_rate": 3.1953631844802327e-05, + "loss": 1.8018, + "step": 8370 + }, + { + "epoch": 0.6301210034061612, + "grad_norm": 5.819868087768555, + "learning_rate": 3.194226323021237e-05, + "loss": 1.6668, + "step": 8371 + }, + { + "epoch": 0.6301962776868214, + "grad_norm": 4.373745918273926, + "learning_rate": 3.1930895689103054e-05, + "loss": 1.6442, + "step": 8372 + }, + { + "epoch": 0.6302715519674815, + "grad_norm": 3.326568603515625, + "learning_rate": 3.1919529222150136e-05, + "loss": 1.7995, + "step": 8373 + }, + { + "epoch": 0.6303468262481416, + "grad_norm": 6.456649303436279, + "learning_rate": 3.190816383002934e-05, + "loss": 1.7915, + "step": 8374 + }, + { + "epoch": 0.6304221005288019, + "grad_norm": 4.06173849105835, + "learning_rate": 3.189679951341628e-05, + "loss": 1.7129, + "step": 8375 + }, + { + "epoch": 0.630497374809462, + "grad_norm": 6.115271091461182, + "learning_rate": 3.1885436272986566e-05, + "loss": 1.6972, + "step": 8376 + }, + { + "epoch": 0.6305726490901221, + "grad_norm": 4.842566967010498, + "learning_rate": 3.1874074109415666e-05, + "loss": 1.8554, + "step": 8377 + }, + { + "epoch": 0.6306479233707822, + "grad_norm": 3.90193772315979, + "learning_rate": 3.186271302337906e-05, + "loss": 1.9724, + "step": 8378 + }, + { + "epoch": 0.6307231976514425, + "grad_norm": 4.660115718841553, + "learning_rate": 3.185135301555212e-05, + "loss": 1.7914, + "step": 8379 + }, + { + "epoch": 0.6307984719321026, + "grad_norm": 5.187869548797607, + "learning_rate": 3.183999408661016e-05, + "loss": 1.9176, + "step": 8380 + }, + { + "epoch": 0.6308737462127627, + "grad_norm": 3.9580442905426025, + "learning_rate": 3.182863623722847e-05, + "loss": 2.0853, + "step": 8381 + }, + { + "epoch": 0.6309490204934229, + "grad_norm": 3.8213765621185303, + "learning_rate": 3.18172794680822e-05, + "loss": 2.0184, + "step": 8382 + }, + { + "epoch": 0.6310242947740831, + "grad_norm": 4.1299662590026855, + "learning_rate": 3.180592377984649e-05, + "loss": 1.8766, + "step": 8383 + }, + { + "epoch": 0.6310995690547432, + "grad_norm": 4.048853874206543, + "learning_rate": 3.179456917319641e-05, + "loss": 1.7417, + "step": 8384 + }, + { + "epoch": 0.6311748433354034, + "grad_norm": 4.777251720428467, + "learning_rate": 3.178321564880696e-05, + "loss": 1.7942, + "step": 8385 + }, + { + "epoch": 0.6312501176160635, + "grad_norm": 3.9705843925476074, + "learning_rate": 3.177186320735306e-05, + "loss": 1.6337, + "step": 8386 + }, + { + "epoch": 0.6313253918967237, + "grad_norm": 3.4834516048431396, + "learning_rate": 3.1760511849509616e-05, + "loss": 1.9068, + "step": 8387 + }, + { + "epoch": 0.6314006661773839, + "grad_norm": 4.639775276184082, + "learning_rate": 3.174916157595138e-05, + "loss": 1.9823, + "step": 8388 + }, + { + "epoch": 0.631475940458044, + "grad_norm": 4.269222259521484, + "learning_rate": 3.173781238735313e-05, + "loss": 1.6385, + "step": 8389 + }, + { + "epoch": 0.6315512147387041, + "grad_norm": 4.1870622634887695, + "learning_rate": 3.1726464284389527e-05, + "loss": 1.7807, + "step": 8390 + }, + { + "epoch": 0.6316264890193644, + "grad_norm": 3.809448480606079, + "learning_rate": 3.171511726773521e-05, + "loss": 1.9132, + "step": 8391 + }, + { + "epoch": 0.6317017633000245, + "grad_norm": 4.513881683349609, + "learning_rate": 3.170377133806469e-05, + "loss": 2.1446, + "step": 8392 + }, + { + "epoch": 0.6317770375806846, + "grad_norm": 4.269097805023193, + "learning_rate": 3.1692426496052465e-05, + "loss": 1.945, + "step": 8393 + }, + { + "epoch": 0.6318523118613448, + "grad_norm": 4.71463680267334, + "learning_rate": 3.1681082742372955e-05, + "loss": 1.6865, + "step": 8394 + }, + { + "epoch": 0.631927586142005, + "grad_norm": 5.84751033782959, + "learning_rate": 3.166974007770053e-05, + "loss": 1.8674, + "step": 8395 + }, + { + "epoch": 0.6320028604226651, + "grad_norm": 5.969925403594971, + "learning_rate": 3.1658398502709436e-05, + "loss": 1.8413, + "step": 8396 + }, + { + "epoch": 0.6320781347033252, + "grad_norm": 3.561568021774292, + "learning_rate": 3.164705801807393e-05, + "loss": 1.8048, + "step": 8397 + }, + { + "epoch": 0.6321534089839854, + "grad_norm": 5.3204345703125, + "learning_rate": 3.163571862446818e-05, + "loss": 1.6532, + "step": 8398 + }, + { + "epoch": 0.6322286832646455, + "grad_norm": 6.0735931396484375, + "learning_rate": 3.162438032256625e-05, + "loss": 1.8224, + "step": 8399 + }, + { + "epoch": 0.6323039575453057, + "grad_norm": 4.502913475036621, + "learning_rate": 3.1613043113042186e-05, + "loss": 1.6187, + "step": 8400 + }, + { + "epoch": 0.6323792318259659, + "grad_norm": 4.421908855438232, + "learning_rate": 3.160170699656995e-05, + "loss": 1.8675, + "step": 8401 + }, + { + "epoch": 0.632454506106626, + "grad_norm": 6.09163236618042, + "learning_rate": 3.1590371973823455e-05, + "loss": 1.9831, + "step": 8402 + }, + { + "epoch": 0.6325297803872861, + "grad_norm": 4.58880090713501, + "learning_rate": 3.157903804547651e-05, + "loss": 2.0097, + "step": 8403 + }, + { + "epoch": 0.6326050546679464, + "grad_norm": 4.020610809326172, + "learning_rate": 3.1567705212202915e-05, + "loss": 1.88, + "step": 8404 + }, + { + "epoch": 0.6326803289486065, + "grad_norm": 5.396985054016113, + "learning_rate": 3.155637347467633e-05, + "loss": 2.2428, + "step": 8405 + }, + { + "epoch": 0.6327556032292666, + "grad_norm": 4.30937385559082, + "learning_rate": 3.1545042833570435e-05, + "loss": 1.9898, + "step": 8406 + }, + { + "epoch": 0.6328308775099268, + "grad_norm": 4.43129825592041, + "learning_rate": 3.1533713289558784e-05, + "loss": 1.9185, + "step": 8407 + }, + { + "epoch": 0.632906151790587, + "grad_norm": 4.572925090789795, + "learning_rate": 3.152238484331491e-05, + "loss": 1.9966, + "step": 8408 + }, + { + "epoch": 0.6329814260712471, + "grad_norm": 4.606265068054199, + "learning_rate": 3.151105749551222e-05, + "loss": 1.4964, + "step": 8409 + }, + { + "epoch": 0.6330567003519073, + "grad_norm": 4.03704309463501, + "learning_rate": 3.1499731246824124e-05, + "loss": 1.9266, + "step": 8410 + }, + { + "epoch": 0.6331319746325674, + "grad_norm": 3.9234256744384766, + "learning_rate": 3.1488406097923905e-05, + "loss": 2.1381, + "step": 8411 + }, + { + "epoch": 0.6332072489132275, + "grad_norm": 3.9247612953186035, + "learning_rate": 3.147708204948483e-05, + "loss": 2.0783, + "step": 8412 + }, + { + "epoch": 0.6332825231938878, + "grad_norm": 5.843477725982666, + "learning_rate": 3.1465759102180106e-05, + "loss": 1.7687, + "step": 8413 + }, + { + "epoch": 0.6333577974745479, + "grad_norm": 5.497551441192627, + "learning_rate": 3.145443725668279e-05, + "loss": 1.8308, + "step": 8414 + }, + { + "epoch": 0.633433071755208, + "grad_norm": 3.995081663131714, + "learning_rate": 3.1443116513665995e-05, + "loss": 2.0809, + "step": 8415 + }, + { + "epoch": 0.6335083460358681, + "grad_norm": 5.792292594909668, + "learning_rate": 3.143179687380265e-05, + "loss": 1.5218, + "step": 8416 + }, + { + "epoch": 0.6335836203165284, + "grad_norm": 5.019339561462402, + "learning_rate": 3.1420478337765726e-05, + "loss": 2.0032, + "step": 8417 + }, + { + "epoch": 0.6336588945971885, + "grad_norm": 5.25856876373291, + "learning_rate": 3.1409160906228036e-05, + "loss": 1.8416, + "step": 8418 + }, + { + "epoch": 0.6337341688778486, + "grad_norm": 3.918635368347168, + "learning_rate": 3.1397844579862404e-05, + "loss": 1.658, + "step": 8419 + }, + { + "epoch": 0.6338094431585088, + "grad_norm": 5.040243148803711, + "learning_rate": 3.138652935934152e-05, + "loss": 1.622, + "step": 8420 + }, + { + "epoch": 0.633884717439169, + "grad_norm": 4.454802513122559, + "learning_rate": 3.137521524533807e-05, + "loss": 1.7687, + "step": 8421 + }, + { + "epoch": 0.6339599917198291, + "grad_norm": 3.7485463619232178, + "learning_rate": 3.1363902238524625e-05, + "loss": 1.6373, + "step": 8422 + }, + { + "epoch": 0.6340352660004893, + "grad_norm": 6.195174217224121, + "learning_rate": 3.135259033957373e-05, + "loss": 2.1407, + "step": 8423 + }, + { + "epoch": 0.6341105402811494, + "grad_norm": 3.882117509841919, + "learning_rate": 3.134127954915782e-05, + "loss": 1.7567, + "step": 8424 + }, + { + "epoch": 0.6341858145618096, + "grad_norm": 5.763416290283203, + "learning_rate": 3.1329969867949316e-05, + "loss": 1.5771, + "step": 8425 + }, + { + "epoch": 0.6342610888424698, + "grad_norm": 4.590452671051025, + "learning_rate": 3.131866129662052e-05, + "loss": 2.0444, + "step": 8426 + }, + { + "epoch": 0.6343363631231299, + "grad_norm": 4.413145542144775, + "learning_rate": 3.130735383584372e-05, + "loss": 2.1113, + "step": 8427 + }, + { + "epoch": 0.63441163740379, + "grad_norm": 4.523130416870117, + "learning_rate": 3.129604748629108e-05, + "loss": 1.8543, + "step": 8428 + }, + { + "epoch": 0.6344869116844503, + "grad_norm": 10.310506820678711, + "learning_rate": 3.1284742248634755e-05, + "loss": 2.0874, + "step": 8429 + }, + { + "epoch": 0.6345621859651104, + "grad_norm": 4.193033218383789, + "learning_rate": 3.127343812354682e-05, + "loss": 1.7977, + "step": 8430 + }, + { + "epoch": 0.6346374602457705, + "grad_norm": 6.753688812255859, + "learning_rate": 3.126213511169924e-05, + "loss": 1.9621, + "step": 8431 + }, + { + "epoch": 0.6347127345264307, + "grad_norm": 5.896317481994629, + "learning_rate": 3.125083321376398e-05, + "loss": 2.3063, + "step": 8432 + }, + { + "epoch": 0.6347880088070909, + "grad_norm": 4.846764087677002, + "learning_rate": 3.1239532430412866e-05, + "loss": 1.7856, + "step": 8433 + }, + { + "epoch": 0.634863283087751, + "grad_norm": 4.583929061889648, + "learning_rate": 3.1228232762317734e-05, + "loss": 2.1295, + "step": 8434 + }, + { + "epoch": 0.6349385573684111, + "grad_norm": 4.837149620056152, + "learning_rate": 3.1216934210150286e-05, + "loss": 1.7093, + "step": 8435 + }, + { + "epoch": 0.6350138316490713, + "grad_norm": 4.8804144859313965, + "learning_rate": 3.120563677458224e-05, + "loss": 2.0385, + "step": 8436 + }, + { + "epoch": 0.6350891059297314, + "grad_norm": 4.434991836547852, + "learning_rate": 3.119434045628512e-05, + "loss": 1.8697, + "step": 8437 + }, + { + "epoch": 0.6351643802103916, + "grad_norm": 6.151554584503174, + "learning_rate": 3.118304525593052e-05, + "loss": 1.9632, + "step": 8438 + }, + { + "epoch": 0.6352396544910518, + "grad_norm": 5.7576904296875, + "learning_rate": 3.117175117418988e-05, + "loss": 1.942, + "step": 8439 + }, + { + "epoch": 0.6353149287717119, + "grad_norm": 5.128333568572998, + "learning_rate": 3.116045821173462e-05, + "loss": 1.904, + "step": 8440 + }, + { + "epoch": 0.635390203052372, + "grad_norm": 5.1462931632995605, + "learning_rate": 3.114916636923605e-05, + "loss": 2.0165, + "step": 8441 + }, + { + "epoch": 0.6354654773330323, + "grad_norm": 5.546964645385742, + "learning_rate": 3.1137875647365466e-05, + "loss": 1.8632, + "step": 8442 + }, + { + "epoch": 0.6355407516136924, + "grad_norm": 4.864556312561035, + "learning_rate": 3.112658604679404e-05, + "loss": 1.8112, + "step": 8443 + }, + { + "epoch": 0.6356160258943525, + "grad_norm": 4.2855634689331055, + "learning_rate": 3.1115297568192915e-05, + "loss": 2.104, + "step": 8444 + }, + { + "epoch": 0.6356913001750127, + "grad_norm": 3.92118239402771, + "learning_rate": 3.110401021223317e-05, + "loss": 1.8094, + "step": 8445 + }, + { + "epoch": 0.6357665744556729, + "grad_norm": 5.776573181152344, + "learning_rate": 3.1092723979585795e-05, + "loss": 1.5659, + "step": 8446 + }, + { + "epoch": 0.635841848736333, + "grad_norm": 5.143870830535889, + "learning_rate": 3.108143887092175e-05, + "loss": 1.8178, + "step": 8447 + }, + { + "epoch": 0.6359171230169932, + "grad_norm": 4.2534308433532715, + "learning_rate": 3.107015488691185e-05, + "loss": 1.919, + "step": 8448 + }, + { + "epoch": 0.6359923972976533, + "grad_norm": 5.454699993133545, + "learning_rate": 3.105887202822696e-05, + "loss": 1.9768, + "step": 8449 + }, + { + "epoch": 0.6360676715783135, + "grad_norm": 4.441861629486084, + "learning_rate": 3.1047590295537754e-05, + "loss": 1.7199, + "step": 8450 + }, + { + "epoch": 0.6361429458589737, + "grad_norm": 3.8113224506378174, + "learning_rate": 3.103630968951493e-05, + "loss": 1.6083, + "step": 8451 + }, + { + "epoch": 0.6362182201396338, + "grad_norm": 6.349819660186768, + "learning_rate": 3.102503021082907e-05, + "loss": 1.761, + "step": 8452 + }, + { + "epoch": 0.6362934944202939, + "grad_norm": 3.772465705871582, + "learning_rate": 3.101375186015075e-05, + "loss": 1.9929, + "step": 8453 + }, + { + "epoch": 0.636368768700954, + "grad_norm": 4.067782878875732, + "learning_rate": 3.1002474638150376e-05, + "loss": 1.946, + "step": 8454 + }, + { + "epoch": 0.6364440429816143, + "grad_norm": 5.269015312194824, + "learning_rate": 3.099119854549839e-05, + "loss": 1.9848, + "step": 8455 + }, + { + "epoch": 0.6365193172622744, + "grad_norm": 5.123625755310059, + "learning_rate": 3.097992358286509e-05, + "loss": 2.191, + "step": 8456 + }, + { + "epoch": 0.6365945915429345, + "grad_norm": 4.4515838623046875, + "learning_rate": 3.0968649750920784e-05, + "loss": 1.8606, + "step": 8457 + }, + { + "epoch": 0.6366698658235947, + "grad_norm": 5.074526786804199, + "learning_rate": 3.0957377050335624e-05, + "loss": 2.0305, + "step": 8458 + }, + { + "epoch": 0.6367451401042549, + "grad_norm": 4.495479583740234, + "learning_rate": 3.0946105481779776e-05, + "loss": 1.9195, + "step": 8459 + }, + { + "epoch": 0.636820414384915, + "grad_norm": 3.6234099864959717, + "learning_rate": 3.093483504592326e-05, + "loss": 2.0614, + "step": 8460 + }, + { + "epoch": 0.6368956886655752, + "grad_norm": 5.036413192749023, + "learning_rate": 3.092356574343611e-05, + "loss": 2.0836, + "step": 8461 + }, + { + "epoch": 0.6369709629462353, + "grad_norm": 5.620336532592773, + "learning_rate": 3.091229757498825e-05, + "loss": 2.3991, + "step": 8462 + }, + { + "epoch": 0.6370462372268955, + "grad_norm": 4.834925174713135, + "learning_rate": 3.090103054124951e-05, + "loss": 1.6901, + "step": 8463 + }, + { + "epoch": 0.6371215115075557, + "grad_norm": 7.271121501922607, + "learning_rate": 3.0889764642889725e-05, + "loss": 1.3124, + "step": 8464 + }, + { + "epoch": 0.6371967857882158, + "grad_norm": 4.830792427062988, + "learning_rate": 3.087849988057858e-05, + "loss": 1.9993, + "step": 8465 + }, + { + "epoch": 0.6372720600688759, + "grad_norm": 4.118501663208008, + "learning_rate": 3.0867236254985764e-05, + "loss": 1.7187, + "step": 8466 + }, + { + "epoch": 0.6373473343495362, + "grad_norm": 5.906317710876465, + "learning_rate": 3.085597376678084e-05, + "loss": 1.9745, + "step": 8467 + }, + { + "epoch": 0.6374226086301963, + "grad_norm": 4.795383930206299, + "learning_rate": 3.084471241663337e-05, + "loss": 2.0147, + "step": 8468 + }, + { + "epoch": 0.6374978829108564, + "grad_norm": 5.179449081420898, + "learning_rate": 3.083345220521276e-05, + "loss": 1.986, + "step": 8469 + }, + { + "epoch": 0.6375731571915166, + "grad_norm": 6.1793341636657715, + "learning_rate": 3.082219313318844e-05, + "loss": 1.9492, + "step": 8470 + }, + { + "epoch": 0.6376484314721768, + "grad_norm": 4.8410420417785645, + "learning_rate": 3.0810935201229694e-05, + "loss": 1.9789, + "step": 8471 + }, + { + "epoch": 0.6377237057528369, + "grad_norm": 4.241498947143555, + "learning_rate": 3.0799678410005806e-05, + "loss": 1.6805, + "step": 8472 + }, + { + "epoch": 0.6377989800334971, + "grad_norm": 4.631001949310303, + "learning_rate": 3.0788422760185926e-05, + "loss": 2.0209, + "step": 8473 + }, + { + "epoch": 0.6378742543141572, + "grad_norm": 4.758254051208496, + "learning_rate": 3.0777168252439217e-05, + "loss": 2.103, + "step": 8474 + }, + { + "epoch": 0.6379495285948173, + "grad_norm": 5.047450065612793, + "learning_rate": 3.0765914887434665e-05, + "loss": 2.0047, + "step": 8475 + }, + { + "epoch": 0.6380248028754775, + "grad_norm": 5.941407680511475, + "learning_rate": 3.075466266584132e-05, + "loss": 1.8381, + "step": 8476 + }, + { + "epoch": 0.6381000771561377, + "grad_norm": 3.9722232818603516, + "learning_rate": 3.074341158832803e-05, + "loss": 1.9066, + "step": 8477 + }, + { + "epoch": 0.6381753514367978, + "grad_norm": 7.776758193969727, + "learning_rate": 3.073216165556367e-05, + "loss": 1.8474, + "step": 8478 + }, + { + "epoch": 0.6382506257174579, + "grad_norm": 3.987119436264038, + "learning_rate": 3.072091286821702e-05, + "loss": 2.0514, + "step": 8479 + }, + { + "epoch": 0.6383258999981182, + "grad_norm": 4.9225664138793945, + "learning_rate": 3.070966522695677e-05, + "loss": 1.7064, + "step": 8480 + }, + { + "epoch": 0.6384011742787783, + "grad_norm": 3.492079019546509, + "learning_rate": 3.069841873245161e-05, + "loss": 1.7445, + "step": 8481 + }, + { + "epoch": 0.6384764485594384, + "grad_norm": 7.026560306549072, + "learning_rate": 3.068717338537004e-05, + "loss": 2.3793, + "step": 8482 + }, + { + "epoch": 0.6385517228400986, + "grad_norm": 6.624011993408203, + "learning_rate": 3.06759291863806e-05, + "loss": 2.1891, + "step": 8483 + }, + { + "epoch": 0.6386269971207588, + "grad_norm": 5.279020309448242, + "learning_rate": 3.066468613615173e-05, + "loss": 1.8185, + "step": 8484 + }, + { + "epoch": 0.6387022714014189, + "grad_norm": 5.62544059753418, + "learning_rate": 3.0653444235351805e-05, + "loss": 1.7669, + "step": 8485 + }, + { + "epoch": 0.6387775456820791, + "grad_norm": 5.458271026611328, + "learning_rate": 3.064220348464908e-05, + "loss": 1.5607, + "step": 8486 + }, + { + "epoch": 0.6388528199627392, + "grad_norm": 4.97369384765625, + "learning_rate": 3.0630963884711836e-05, + "loss": 1.7731, + "step": 8487 + }, + { + "epoch": 0.6389280942433994, + "grad_norm": 4.403059959411621, + "learning_rate": 3.06197254362082e-05, + "loss": 1.8589, + "step": 8488 + }, + { + "epoch": 0.6390033685240596, + "grad_norm": 4.2187628746032715, + "learning_rate": 3.060848813980628e-05, + "loss": 1.6185, + "step": 8489 + }, + { + "epoch": 0.6390786428047197, + "grad_norm": 4.045538902282715, + "learning_rate": 3.05972519961741e-05, + "loss": 2.0817, + "step": 8490 + }, + { + "epoch": 0.6391539170853798, + "grad_norm": 6.184861183166504, + "learning_rate": 3.058601700597963e-05, + "loss": 1.9642, + "step": 8491 + }, + { + "epoch": 0.6392291913660401, + "grad_norm": 8.042470932006836, + "learning_rate": 3.0574783169890724e-05, + "loss": 2.1537, + "step": 8492 + }, + { + "epoch": 0.6393044656467002, + "grad_norm": 5.143957614898682, + "learning_rate": 3.056355048857522e-05, + "loss": 1.6169, + "step": 8493 + }, + { + "epoch": 0.6393797399273603, + "grad_norm": 9.582860946655273, + "learning_rate": 3.0552318962700885e-05, + "loss": 1.9245, + "step": 8494 + }, + { + "epoch": 0.6394550142080204, + "grad_norm": 3.644885540008545, + "learning_rate": 3.0541088592935374e-05, + "loss": 1.7739, + "step": 8495 + }, + { + "epoch": 0.6395302884886807, + "grad_norm": 3.9607558250427246, + "learning_rate": 3.0529859379946334e-05, + "loss": 1.7368, + "step": 8496 + }, + { + "epoch": 0.6396055627693408, + "grad_norm": 5.704013347625732, + "learning_rate": 3.051863132440126e-05, + "loss": 1.6579, + "step": 8497 + }, + { + "epoch": 0.6396808370500009, + "grad_norm": 4.952240943908691, + "learning_rate": 3.0507404426967685e-05, + "loss": 1.6261, + "step": 8498 + }, + { + "epoch": 0.6397561113306611, + "grad_norm": 4.565150260925293, + "learning_rate": 3.049617868831296e-05, + "loss": 1.739, + "step": 8499 + }, + { + "epoch": 0.6398313856113212, + "grad_norm": 3.678384780883789, + "learning_rate": 3.0484954109104462e-05, + "loss": 1.8142, + "step": 8500 + }, + { + "epoch": 0.6399066598919814, + "grad_norm": 4.516523838043213, + "learning_rate": 3.047373069000944e-05, + "loss": 1.8722, + "step": 8501 + }, + { + "epoch": 0.6399819341726416, + "grad_norm": 4.493857383728027, + "learning_rate": 3.0462508431695112e-05, + "loss": 1.9746, + "step": 8502 + }, + { + "epoch": 0.6400572084533017, + "grad_norm": 3.938154697418213, + "learning_rate": 3.045128733482859e-05, + "loss": 1.7696, + "step": 8503 + }, + { + "epoch": 0.6401324827339618, + "grad_norm": 4.596213340759277, + "learning_rate": 3.0440067400076954e-05, + "loss": 1.6798, + "step": 8504 + }, + { + "epoch": 0.6402077570146221, + "grad_norm": 7.9469146728515625, + "learning_rate": 3.0428848628107176e-05, + "loss": 1.7819, + "step": 8505 + }, + { + "epoch": 0.6402830312952822, + "grad_norm": 5.652976036071777, + "learning_rate": 3.0417631019586197e-05, + "loss": 1.9123, + "step": 8506 + }, + { + "epoch": 0.6403583055759423, + "grad_norm": 6.151829242706299, + "learning_rate": 3.0406414575180853e-05, + "loss": 1.551, + "step": 8507 + }, + { + "epoch": 0.6404335798566025, + "grad_norm": 5.076486587524414, + "learning_rate": 3.0395199295557965e-05, + "loss": 1.9937, + "step": 8508 + }, + { + "epoch": 0.6405088541372627, + "grad_norm": 5.037563323974609, + "learning_rate": 3.0383985181384213e-05, + "loss": 1.4109, + "step": 8509 + }, + { + "epoch": 0.6405841284179228, + "grad_norm": 5.346786975860596, + "learning_rate": 3.0372772233326242e-05, + "loss": 1.751, + "step": 8510 + }, + { + "epoch": 0.640659402698583, + "grad_norm": 5.752713203430176, + "learning_rate": 3.0361560452050664e-05, + "loss": 2.059, + "step": 8511 + }, + { + "epoch": 0.6407346769792431, + "grad_norm": 5.699567794799805, + "learning_rate": 3.0350349838223958e-05, + "loss": 1.913, + "step": 8512 + }, + { + "epoch": 0.6408099512599033, + "grad_norm": 3.9459757804870605, + "learning_rate": 3.03391403925126e-05, + "loss": 2.1368, + "step": 8513 + }, + { + "epoch": 0.6408852255405634, + "grad_norm": 4.238221168518066, + "learning_rate": 3.0327932115582903e-05, + "loss": 2.125, + "step": 8514 + }, + { + "epoch": 0.6409604998212236, + "grad_norm": 5.608497619628906, + "learning_rate": 3.0316725008101215e-05, + "loss": 1.8591, + "step": 8515 + }, + { + "epoch": 0.6410357741018837, + "grad_norm": 3.9737725257873535, + "learning_rate": 3.0305519070733734e-05, + "loss": 1.7274, + "step": 8516 + }, + { + "epoch": 0.6411110483825438, + "grad_norm": 3.95164155960083, + "learning_rate": 3.0294314304146642e-05, + "loss": 1.6302, + "step": 8517 + }, + { + "epoch": 0.6411863226632041, + "grad_norm": 4.32230281829834, + "learning_rate": 3.0283110709006014e-05, + "loss": 2.0586, + "step": 8518 + }, + { + "epoch": 0.6412615969438642, + "grad_norm": 4.404703140258789, + "learning_rate": 3.027190828597789e-05, + "loss": 1.7466, + "step": 8519 + }, + { + "epoch": 0.6413368712245243, + "grad_norm": 4.992621898651123, + "learning_rate": 3.0260707035728208e-05, + "loss": 1.5498, + "step": 8520 + }, + { + "epoch": 0.6414121455051845, + "grad_norm": 4.494771480560303, + "learning_rate": 3.0249506958922857e-05, + "loss": 2.0752, + "step": 8521 + }, + { + "epoch": 0.6414874197858447, + "grad_norm": 6.320178031921387, + "learning_rate": 3.0238308056227633e-05, + "loss": 1.8353, + "step": 8522 + }, + { + "epoch": 0.6415626940665048, + "grad_norm": 5.9774250984191895, + "learning_rate": 3.022711032830832e-05, + "loss": 1.7564, + "step": 8523 + }, + { + "epoch": 0.641637968347165, + "grad_norm": 4.570126056671143, + "learning_rate": 3.0215913775830533e-05, + "loss": 1.7231, + "step": 8524 + }, + { + "epoch": 0.6417132426278251, + "grad_norm": 5.786969184875488, + "learning_rate": 3.0204718399459913e-05, + "loss": 1.9839, + "step": 8525 + }, + { + "epoch": 0.6417885169084853, + "grad_norm": 3.862994909286499, + "learning_rate": 3.0193524199862e-05, + "loss": 1.6574, + "step": 8526 + }, + { + "epoch": 0.6418637911891455, + "grad_norm": 5.468550205230713, + "learning_rate": 3.018233117770223e-05, + "loss": 1.9569, + "step": 8527 + }, + { + "epoch": 0.6419390654698056, + "grad_norm": 5.428247928619385, + "learning_rate": 3.0171139333646014e-05, + "loss": 1.7367, + "step": 8528 + }, + { + "epoch": 0.6420143397504657, + "grad_norm": 4.250402450561523, + "learning_rate": 3.0159948668358662e-05, + "loss": 1.6917, + "step": 8529 + }, + { + "epoch": 0.642089614031126, + "grad_norm": 4.328627586364746, + "learning_rate": 3.014875918250545e-05, + "loss": 1.5335, + "step": 8530 + }, + { + "epoch": 0.6421648883117861, + "grad_norm": 6.313094615936279, + "learning_rate": 3.0137570876751524e-05, + "loss": 1.9671, + "step": 8531 + }, + { + "epoch": 0.6422401625924462, + "grad_norm": 3.6403791904449463, + "learning_rate": 3.0126383751762043e-05, + "loss": 1.6429, + "step": 8532 + }, + { + "epoch": 0.6423154368731063, + "grad_norm": 4.782705307006836, + "learning_rate": 3.0115197808202e-05, + "loss": 1.7788, + "step": 8533 + }, + { + "epoch": 0.6423907111537666, + "grad_norm": 7.334542274475098, + "learning_rate": 3.0104013046736402e-05, + "loss": 1.7528, + "step": 8534 + }, + { + "epoch": 0.6424659854344267, + "grad_norm": 4.299736499786377, + "learning_rate": 3.0092829468030125e-05, + "loss": 1.7937, + "step": 8535 + }, + { + "epoch": 0.6425412597150868, + "grad_norm": 5.448459625244141, + "learning_rate": 3.0081647072748043e-05, + "loss": 1.7141, + "step": 8536 + }, + { + "epoch": 0.642616533995747, + "grad_norm": 4.656899452209473, + "learning_rate": 3.0070465861554874e-05, + "loss": 1.9306, + "step": 8537 + }, + { + "epoch": 0.6426918082764072, + "grad_norm": 4.9766998291015625, + "learning_rate": 3.005928583511533e-05, + "loss": 1.8518, + "step": 8538 + }, + { + "epoch": 0.6427670825570673, + "grad_norm": 7.527698516845703, + "learning_rate": 3.0048106994094016e-05, + "loss": 1.9763, + "step": 8539 + }, + { + "epoch": 0.6428423568377275, + "grad_norm": 4.758029937744141, + "learning_rate": 3.003692933915552e-05, + "loss": 1.859, + "step": 8540 + }, + { + "epoch": 0.6429176311183876, + "grad_norm": 4.55019998550415, + "learning_rate": 3.0025752870964262e-05, + "loss": 1.7336, + "step": 8541 + }, + { + "epoch": 0.6429929053990477, + "grad_norm": 5.973742485046387, + "learning_rate": 3.001457759018469e-05, + "loss": 2.1553, + "step": 8542 + }, + { + "epoch": 0.643068179679708, + "grad_norm": 5.923421859741211, + "learning_rate": 3.0003403497481152e-05, + "loss": 1.7233, + "step": 8543 + }, + { + "epoch": 0.6431434539603681, + "grad_norm": 4.701004981994629, + "learning_rate": 2.999223059351789e-05, + "loss": 1.8867, + "step": 8544 + }, + { + "epoch": 0.6432187282410282, + "grad_norm": 5.493651390075684, + "learning_rate": 2.998105887895911e-05, + "loss": 2.036, + "step": 8545 + }, + { + "epoch": 0.6432940025216884, + "grad_norm": 5.680747985839844, + "learning_rate": 2.9969888354468933e-05, + "loss": 1.808, + "step": 8546 + }, + { + "epoch": 0.6433692768023486, + "grad_norm": 4.604275226593018, + "learning_rate": 2.995871902071144e-05, + "loss": 1.9923, + "step": 8547 + }, + { + "epoch": 0.6434445510830087, + "grad_norm": 7.673112869262695, + "learning_rate": 2.9947550878350572e-05, + "loss": 1.5103, + "step": 8548 + }, + { + "epoch": 0.6435198253636689, + "grad_norm": 4.728858947753906, + "learning_rate": 2.9936383928050284e-05, + "loss": 1.5903, + "step": 8549 + }, + { + "epoch": 0.643595099644329, + "grad_norm": 4.685051918029785, + "learning_rate": 2.9925218170474402e-05, + "loss": 1.532, + "step": 8550 + }, + { + "epoch": 0.6436703739249892, + "grad_norm": 4.883116245269775, + "learning_rate": 2.9914053606286695e-05, + "loss": 1.9962, + "step": 8551 + }, + { + "epoch": 0.6437456482056494, + "grad_norm": 5.989914894104004, + "learning_rate": 2.990289023615086e-05, + "loss": 2.3358, + "step": 8552 + }, + { + "epoch": 0.6438209224863095, + "grad_norm": 5.500239372253418, + "learning_rate": 2.9891728060730555e-05, + "loss": 2.0928, + "step": 8553 + }, + { + "epoch": 0.6438961967669696, + "grad_norm": 4.805877685546875, + "learning_rate": 2.9880567080689303e-05, + "loss": 2.2692, + "step": 8554 + }, + { + "epoch": 0.6439714710476298, + "grad_norm": 6.41030216217041, + "learning_rate": 2.986940729669061e-05, + "loss": 1.9259, + "step": 8555 + }, + { + "epoch": 0.64404674532829, + "grad_norm": 4.49720573425293, + "learning_rate": 2.985824870939789e-05, + "loss": 1.8036, + "step": 8556 + }, + { + "epoch": 0.6441220196089501, + "grad_norm": 4.38686466217041, + "learning_rate": 2.9847091319474485e-05, + "loss": 1.7433, + "step": 8557 + }, + { + "epoch": 0.6441972938896102, + "grad_norm": 3.8548953533172607, + "learning_rate": 2.9835935127583703e-05, + "loss": 2.0497, + "step": 8558 + }, + { + "epoch": 0.6442725681702705, + "grad_norm": 6.085220813751221, + "learning_rate": 2.9824780134388697e-05, + "loss": 2.3237, + "step": 8559 + }, + { + "epoch": 0.6443478424509306, + "grad_norm": 3.575270175933838, + "learning_rate": 2.981362634055265e-05, + "loss": 1.6333, + "step": 8560 + }, + { + "epoch": 0.6444231167315907, + "grad_norm": 6.429887294769287, + "learning_rate": 2.9802473746738562e-05, + "loss": 2.0933, + "step": 8561 + }, + { + "epoch": 0.6444983910122509, + "grad_norm": 7.841139793395996, + "learning_rate": 2.9791322353609473e-05, + "loss": 1.752, + "step": 8562 + }, + { + "epoch": 0.644573665292911, + "grad_norm": 3.8940541744232178, + "learning_rate": 2.978017216182828e-05, + "loss": 1.7116, + "step": 8563 + }, + { + "epoch": 0.6446489395735712, + "grad_norm": 5.283813953399658, + "learning_rate": 2.976902317205785e-05, + "loss": 2.3602, + "step": 8564 + }, + { + "epoch": 0.6447242138542314, + "grad_norm": 4.955506324768066, + "learning_rate": 2.975787538496092e-05, + "loss": 1.6618, + "step": 8565 + }, + { + "epoch": 0.6447994881348915, + "grad_norm": 5.2576375007629395, + "learning_rate": 2.974672880120023e-05, + "loss": 1.8863, + "step": 8566 + }, + { + "epoch": 0.6448747624155516, + "grad_norm": 4.595310688018799, + "learning_rate": 2.973558342143839e-05, + "loss": 1.8035, + "step": 8567 + }, + { + "epoch": 0.6449500366962119, + "grad_norm": 5.2111077308654785, + "learning_rate": 2.9724439246337987e-05, + "loss": 2.2897, + "step": 8568 + }, + { + "epoch": 0.645025310976872, + "grad_norm": 5.0141096115112305, + "learning_rate": 2.971329627656148e-05, + "loss": 1.5543, + "step": 8569 + }, + { + "epoch": 0.6451005852575321, + "grad_norm": 3.944983720779419, + "learning_rate": 2.970215451277132e-05, + "loss": 1.8509, + "step": 8570 + }, + { + "epoch": 0.6451758595381923, + "grad_norm": 7.747410297393799, + "learning_rate": 2.9691013955629808e-05, + "loss": 2.1258, + "step": 8571 + }, + { + "epoch": 0.6452511338188525, + "grad_norm": 4.766421318054199, + "learning_rate": 2.9679874605799257e-05, + "loss": 1.8184, + "step": 8572 + }, + { + "epoch": 0.6453264080995126, + "grad_norm": 5.19478178024292, + "learning_rate": 2.9668736463941844e-05, + "loss": 1.8652, + "step": 8573 + }, + { + "epoch": 0.6454016823801727, + "grad_norm": 4.1410722732543945, + "learning_rate": 2.9657599530719714e-05, + "loss": 2.2062, + "step": 8574 + }, + { + "epoch": 0.6454769566608329, + "grad_norm": 6.578781604766846, + "learning_rate": 2.964646380679494e-05, + "loss": 2.0871, + "step": 8575 + }, + { + "epoch": 0.645552230941493, + "grad_norm": 5.385827541351318, + "learning_rate": 2.963532929282947e-05, + "loss": 1.4649, + "step": 8576 + }, + { + "epoch": 0.6456275052221532, + "grad_norm": 4.8669562339782715, + "learning_rate": 2.9624195989485264e-05, + "loss": 1.8195, + "step": 8577 + }, + { + "epoch": 0.6457027795028134, + "grad_norm": 6.243131160736084, + "learning_rate": 2.961306389742412e-05, + "loss": 2.0895, + "step": 8578 + }, + { + "epoch": 0.6457780537834735, + "grad_norm": 7.147341251373291, + "learning_rate": 2.9601933017307847e-05, + "loss": 1.78, + "step": 8579 + }, + { + "epoch": 0.6458533280641336, + "grad_norm": 5.092959403991699, + "learning_rate": 2.959080334979811e-05, + "loss": 1.897, + "step": 8580 + }, + { + "epoch": 0.6459286023447939, + "grad_norm": 4.118281364440918, + "learning_rate": 2.9579674895556574e-05, + "loss": 1.9852, + "step": 8581 + }, + { + "epoch": 0.646003876625454, + "grad_norm": 4.885115146636963, + "learning_rate": 2.956854765524476e-05, + "loss": 2.0077, + "step": 8582 + }, + { + "epoch": 0.6460791509061141, + "grad_norm": 5.322532653808594, + "learning_rate": 2.9557421629524163e-05, + "loss": 2.0309, + "step": 8583 + }, + { + "epoch": 0.6461544251867744, + "grad_norm": 4.8626556396484375, + "learning_rate": 2.9546296819056192e-05, + "loss": 1.5482, + "step": 8584 + }, + { + "epoch": 0.6462296994674345, + "grad_norm": 5.2390875816345215, + "learning_rate": 2.95351732245022e-05, + "loss": 1.9367, + "step": 8585 + }, + { + "epoch": 0.6463049737480946, + "grad_norm": 5.341590881347656, + "learning_rate": 2.9524050846523427e-05, + "loss": 1.8399, + "step": 8586 + }, + { + "epoch": 0.6463802480287548, + "grad_norm": 6.855342388153076, + "learning_rate": 2.9512929685781092e-05, + "loss": 2.0165, + "step": 8587 + }, + { + "epoch": 0.6464555223094149, + "grad_norm": 5.312610149383545, + "learning_rate": 2.9501809742936292e-05, + "loss": 2.1165, + "step": 8588 + }, + { + "epoch": 0.6465307965900751, + "grad_norm": 4.534675598144531, + "learning_rate": 2.9490691018650084e-05, + "loss": 1.5757, + "step": 8589 + }, + { + "epoch": 0.6466060708707353, + "grad_norm": 5.893584251403809, + "learning_rate": 2.9479573513583446e-05, + "loss": 1.7665, + "step": 8590 + }, + { + "epoch": 0.6466813451513954, + "grad_norm": 3.467353105545044, + "learning_rate": 2.9468457228397284e-05, + "loss": 1.9285, + "step": 8591 + }, + { + "epoch": 0.6467566194320555, + "grad_norm": 4.947606086730957, + "learning_rate": 2.9457342163752443e-05, + "loss": 1.7428, + "step": 8592 + }, + { + "epoch": 0.6468318937127157, + "grad_norm": 5.851820945739746, + "learning_rate": 2.9446228320309655e-05, + "loss": 1.9027, + "step": 8593 + }, + { + "epoch": 0.6469071679933759, + "grad_norm": 5.959867477416992, + "learning_rate": 2.943511569872962e-05, + "loss": 1.684, + "step": 8594 + }, + { + "epoch": 0.646982442274036, + "grad_norm": 4.563910484313965, + "learning_rate": 2.9424004299672948e-05, + "loss": 2.0268, + "step": 8595 + }, + { + "epoch": 0.6470577165546961, + "grad_norm": 7.226961135864258, + "learning_rate": 2.9412894123800195e-05, + "loss": 1.8213, + "step": 8596 + }, + { + "epoch": 0.6471329908353564, + "grad_norm": 5.474841594696045, + "learning_rate": 2.940178517177179e-05, + "loss": 1.6239, + "step": 8597 + }, + { + "epoch": 0.6472082651160165, + "grad_norm": 4.743842124938965, + "learning_rate": 2.939067744424818e-05, + "loss": 2.0848, + "step": 8598 + }, + { + "epoch": 0.6472835393966766, + "grad_norm": 4.671804904937744, + "learning_rate": 2.9379570941889643e-05, + "loss": 1.757, + "step": 8599 + }, + { + "epoch": 0.6473588136773368, + "grad_norm": 4.820899963378906, + "learning_rate": 2.9368465665356448e-05, + "loss": 1.7968, + "step": 8600 + }, + { + "epoch": 0.647434087957997, + "grad_norm": 3.182859420776367, + "learning_rate": 2.935736161530877e-05, + "loss": 1.7045, + "step": 8601 + }, + { + "epoch": 0.6475093622386571, + "grad_norm": 5.053806304931641, + "learning_rate": 2.9346258792406734e-05, + "loss": 1.7039, + "step": 8602 + }, + { + "epoch": 0.6475846365193173, + "grad_norm": 4.7595295906066895, + "learning_rate": 2.9335157197310327e-05, + "loss": 1.7799, + "step": 8603 + }, + { + "epoch": 0.6476599107999774, + "grad_norm": 4.2374043464660645, + "learning_rate": 2.932405683067955e-05, + "loss": 1.76, + "step": 8604 + }, + { + "epoch": 0.6477351850806375, + "grad_norm": 3.7517526149749756, + "learning_rate": 2.931295769317425e-05, + "loss": 1.9355, + "step": 8605 + }, + { + "epoch": 0.6478104593612978, + "grad_norm": 6.759345054626465, + "learning_rate": 2.9301859785454254e-05, + "loss": 2.0159, + "step": 8606 + }, + { + "epoch": 0.6478857336419579, + "grad_norm": 6.61454439163208, + "learning_rate": 2.929076310817932e-05, + "loss": 2.0018, + "step": 8607 + }, + { + "epoch": 0.647961007922618, + "grad_norm": 6.61454439163208, + "learning_rate": 2.929076310817932e-05, + "loss": 2.1288, + "step": 8608 + }, + { + "epoch": 0.6480362822032782, + "grad_norm": 5.1581315994262695, + "learning_rate": 2.927966766200908e-05, + "loss": 1.883, + "step": 8609 + }, + { + "epoch": 0.6481115564839384, + "grad_norm": 5.625715732574463, + "learning_rate": 2.926857344760317e-05, + "loss": 1.6717, + "step": 8610 + }, + { + "epoch": 0.6481868307645985, + "grad_norm": 6.846342086791992, + "learning_rate": 2.9257480465621063e-05, + "loss": 2.254, + "step": 8611 + }, + { + "epoch": 0.6482621050452586, + "grad_norm": 6.118210792541504, + "learning_rate": 2.9246388716722233e-05, + "loss": 1.9584, + "step": 8612 + }, + { + "epoch": 0.6483373793259188, + "grad_norm": 4.919810771942139, + "learning_rate": 2.9235298201566037e-05, + "loss": 2.1906, + "step": 8613 + }, + { + "epoch": 0.648412653606579, + "grad_norm": 3.967045307159424, + "learning_rate": 2.9224208920811803e-05, + "loss": 1.722, + "step": 8614 + }, + { + "epoch": 0.6484879278872391, + "grad_norm": 4.292625904083252, + "learning_rate": 2.9213120875118715e-05, + "loss": 1.8844, + "step": 8615 + }, + { + "epoch": 0.6485632021678993, + "grad_norm": 4.200898170471191, + "learning_rate": 2.9202034065145967e-05, + "loss": 2.1818, + "step": 8616 + }, + { + "epoch": 0.6486384764485594, + "grad_norm": 5.403683185577393, + "learning_rate": 2.9190948491552596e-05, + "loss": 1.7544, + "step": 8617 + }, + { + "epoch": 0.6487137507292196, + "grad_norm": 10.114962577819824, + "learning_rate": 2.917986415499765e-05, + "loss": 1.8368, + "step": 8618 + }, + { + "epoch": 0.6487890250098798, + "grad_norm": 3.80903959274292, + "learning_rate": 2.916878105614002e-05, + "loss": 1.8435, + "step": 8619 + }, + { + "epoch": 0.6488642992905399, + "grad_norm": 4.539679050445557, + "learning_rate": 2.9157699195638588e-05, + "loss": 1.8655, + "step": 8620 + }, + { + "epoch": 0.6489395735712, + "grad_norm": 5.5913238525390625, + "learning_rate": 2.9146618574152128e-05, + "loss": 1.8801, + "step": 8621 + }, + { + "epoch": 0.6490148478518603, + "grad_norm": 3.4713363647460938, + "learning_rate": 2.913553919233939e-05, + "loss": 1.9798, + "step": 8622 + }, + { + "epoch": 0.6490901221325204, + "grad_norm": 5.722151756286621, + "learning_rate": 2.9124461050858954e-05, + "loss": 2.0308, + "step": 8623 + }, + { + "epoch": 0.6491653964131805, + "grad_norm": 5.274685382843018, + "learning_rate": 2.9113384150369406e-05, + "loss": 2.0087, + "step": 8624 + }, + { + "epoch": 0.6492406706938407, + "grad_norm": 4.676621913909912, + "learning_rate": 2.910230849152926e-05, + "loss": 1.7031, + "step": 8625 + }, + { + "epoch": 0.6493159449745008, + "grad_norm": 5.577304840087891, + "learning_rate": 2.9091234074996897e-05, + "loss": 1.8762, + "step": 8626 + }, + { + "epoch": 0.649391219255161, + "grad_norm": 4.7884931564331055, + "learning_rate": 2.9080160901430698e-05, + "loss": 2.0286, + "step": 8627 + }, + { + "epoch": 0.6494664935358212, + "grad_norm": 5.585773468017578, + "learning_rate": 2.9069088971488877e-05, + "loss": 1.4661, + "step": 8628 + }, + { + "epoch": 0.6495417678164813, + "grad_norm": 4.676112174987793, + "learning_rate": 2.905801828582969e-05, + "loss": 1.8685, + "step": 8629 + }, + { + "epoch": 0.6496170420971414, + "grad_norm": 5.2993550300598145, + "learning_rate": 2.9046948845111198e-05, + "loss": 2.2756, + "step": 8630 + }, + { + "epoch": 0.6496923163778016, + "grad_norm": 5.965620517730713, + "learning_rate": 2.9035880649991487e-05, + "loss": 1.9021, + "step": 8631 + }, + { + "epoch": 0.6497675906584618, + "grad_norm": 4.728835105895996, + "learning_rate": 2.9024813701128506e-05, + "loss": 1.8307, + "step": 8632 + }, + { + "epoch": 0.6498428649391219, + "grad_norm": 3.898773670196533, + "learning_rate": 2.9013747999180195e-05, + "loss": 1.9384, + "step": 8633 + }, + { + "epoch": 0.649918139219782, + "grad_norm": 4.24934196472168, + "learning_rate": 2.900268354480432e-05, + "loss": 1.8787, + "step": 8634 + }, + { + "epoch": 0.6499934135004423, + "grad_norm": 3.7674505710601807, + "learning_rate": 2.8991620338658687e-05, + "loss": 1.7757, + "step": 8635 + }, + { + "epoch": 0.6500686877811024, + "grad_norm": 5.886000156402588, + "learning_rate": 2.8980558381400925e-05, + "loss": 2.2199, + "step": 8636 + }, + { + "epoch": 0.6501439620617625, + "grad_norm": 4.2713165283203125, + "learning_rate": 2.8969497673688674e-05, + "loss": 2.0352, + "step": 8637 + }, + { + "epoch": 0.6502192363424227, + "grad_norm": 4.730532169342041, + "learning_rate": 2.8958438216179425e-05, + "loss": 1.9243, + "step": 8638 + }, + { + "epoch": 0.6502945106230829, + "grad_norm": 4.13792610168457, + "learning_rate": 2.8947380009530644e-05, + "loss": 1.5656, + "step": 8639 + }, + { + "epoch": 0.650369784903743, + "grad_norm": 4.677649974822998, + "learning_rate": 2.8936323054399738e-05, + "loss": 1.8078, + "step": 8640 + }, + { + "epoch": 0.6504450591844032, + "grad_norm": 4.231987953186035, + "learning_rate": 2.892526735144397e-05, + "loss": 2.1614, + "step": 8641 + }, + { + "epoch": 0.6505203334650633, + "grad_norm": 4.96175479888916, + "learning_rate": 2.8914212901320605e-05, + "loss": 1.6936, + "step": 8642 + }, + { + "epoch": 0.6505956077457234, + "grad_norm": 4.54158353805542, + "learning_rate": 2.8903159704686762e-05, + "loss": 1.7023, + "step": 8643 + }, + { + "epoch": 0.6506708820263837, + "grad_norm": 4.517205238342285, + "learning_rate": 2.8892107762199538e-05, + "loss": 1.8842, + "step": 8644 + }, + { + "epoch": 0.6507461563070438, + "grad_norm": 3.989027261734009, + "learning_rate": 2.888105707451595e-05, + "loss": 1.4609, + "step": 8645 + }, + { + "epoch": 0.6508214305877039, + "grad_norm": 4.304959774017334, + "learning_rate": 2.8870007642292933e-05, + "loss": 2.1232, + "step": 8646 + }, + { + "epoch": 0.6508967048683642, + "grad_norm": 3.9756412506103516, + "learning_rate": 2.8858959466187317e-05, + "loss": 1.9693, + "step": 8647 + }, + { + "epoch": 0.6509719791490243, + "grad_norm": 3.8650543689727783, + "learning_rate": 2.884791254685592e-05, + "loss": 2.1174, + "step": 8648 + }, + { + "epoch": 0.6510472534296844, + "grad_norm": 3.9971768856048584, + "learning_rate": 2.883686688495541e-05, + "loss": 1.9875, + "step": 8649 + }, + { + "epoch": 0.6511225277103446, + "grad_norm": 4.313498497009277, + "learning_rate": 2.882582248114246e-05, + "loss": 1.81, + "step": 8650 + }, + { + "epoch": 0.6511978019910047, + "grad_norm": 3.4087369441986084, + "learning_rate": 2.881477933607359e-05, + "loss": 1.907, + "step": 8651 + }, + { + "epoch": 0.6512730762716649, + "grad_norm": 5.429731845855713, + "learning_rate": 2.8803737450405322e-05, + "loss": 2.0832, + "step": 8652 + }, + { + "epoch": 0.651348350552325, + "grad_norm": 3.744896173477173, + "learning_rate": 2.8792696824794018e-05, + "loss": 1.8025, + "step": 8653 + }, + { + "epoch": 0.6514236248329852, + "grad_norm": 5.274442672729492, + "learning_rate": 2.878165745989604e-05, + "loss": 1.8795, + "step": 8654 + }, + { + "epoch": 0.6514988991136453, + "grad_norm": 4.412415981292725, + "learning_rate": 2.8770619356367646e-05, + "loss": 1.9839, + "step": 8655 + }, + { + "epoch": 0.6515741733943055, + "grad_norm": 5.6461687088012695, + "learning_rate": 2.8759582514865012e-05, + "loss": 2.0527, + "step": 8656 + }, + { + "epoch": 0.6516494476749657, + "grad_norm": 4.013443946838379, + "learning_rate": 2.8748546936044275e-05, + "loss": 1.9805, + "step": 8657 + }, + { + "epoch": 0.6517247219556258, + "grad_norm": 5.20570707321167, + "learning_rate": 2.8737512620561423e-05, + "loss": 1.6596, + "step": 8658 + }, + { + "epoch": 0.6517999962362859, + "grad_norm": 3.515523910522461, + "learning_rate": 2.872647956907246e-05, + "loss": 1.5992, + "step": 8659 + }, + { + "epoch": 0.6518752705169462, + "grad_norm": 5.263373851776123, + "learning_rate": 2.8715447782233227e-05, + "loss": 1.8315, + "step": 8660 + }, + { + "epoch": 0.6519505447976063, + "grad_norm": 4.3307881355285645, + "learning_rate": 2.870441726069957e-05, + "loss": 2.0157, + "step": 8661 + }, + { + "epoch": 0.6520258190782664, + "grad_norm": 4.524033069610596, + "learning_rate": 2.869338800512718e-05, + "loss": 1.6649, + "step": 8662 + }, + { + "epoch": 0.6521010933589266, + "grad_norm": 4.244225978851318, + "learning_rate": 2.8682360016171762e-05, + "loss": 1.8696, + "step": 8663 + }, + { + "epoch": 0.6521763676395868, + "grad_norm": 5.265883922576904, + "learning_rate": 2.867133329448885e-05, + "loss": 2.4691, + "step": 8664 + }, + { + "epoch": 0.6522516419202469, + "grad_norm": 4.064724445343018, + "learning_rate": 2.8660307840733973e-05, + "loss": 1.9237, + "step": 8665 + }, + { + "epoch": 0.6523269162009071, + "grad_norm": 6.914492607116699, + "learning_rate": 2.864928365556257e-05, + "loss": 2.7163, + "step": 8666 + }, + { + "epoch": 0.6524021904815672, + "grad_norm": 3.861659526824951, + "learning_rate": 2.8638260739630007e-05, + "loss": 1.6831, + "step": 8667 + }, + { + "epoch": 0.6524774647622273, + "grad_norm": 6.618344306945801, + "learning_rate": 2.8627239093591536e-05, + "loss": 2.0614, + "step": 8668 + }, + { + "epoch": 0.6525527390428876, + "grad_norm": 4.913801670074463, + "learning_rate": 2.8616218718102388e-05, + "loss": 1.8187, + "step": 8669 + }, + { + "epoch": 0.6526280133235477, + "grad_norm": 4.293828964233398, + "learning_rate": 2.8605199613817668e-05, + "loss": 1.7561, + "step": 8670 + }, + { + "epoch": 0.6527032876042078, + "grad_norm": 4.456260681152344, + "learning_rate": 2.8594181781392437e-05, + "loss": 2.0805, + "step": 8671 + }, + { + "epoch": 0.6527785618848679, + "grad_norm": 7.194111347198486, + "learning_rate": 2.85831652214817e-05, + "loss": 1.4987, + "step": 8672 + }, + { + "epoch": 0.6528538361655282, + "grad_norm": 4.761721134185791, + "learning_rate": 2.8572149934740317e-05, + "loss": 1.526, + "step": 8673 + }, + { + "epoch": 0.6529291104461883, + "grad_norm": 4.444544792175293, + "learning_rate": 2.8561135921823157e-05, + "loss": 1.7775, + "step": 8674 + }, + { + "epoch": 0.6530043847268484, + "grad_norm": 4.1387248039245605, + "learning_rate": 2.8550123183384936e-05, + "loss": 1.6179, + "step": 8675 + }, + { + "epoch": 0.6530796590075086, + "grad_norm": 5.734201431274414, + "learning_rate": 2.8539111720080343e-05, + "loss": 2.1411, + "step": 8676 + }, + { + "epoch": 0.6531549332881688, + "grad_norm": 5.701095104217529, + "learning_rate": 2.8528101532563978e-05, + "loss": 2.362, + "step": 8677 + }, + { + "epoch": 0.6532302075688289, + "grad_norm": 4.324429512023926, + "learning_rate": 2.8517092621490388e-05, + "loss": 2.0575, + "step": 8678 + }, + { + "epoch": 0.6533054818494891, + "grad_norm": 4.849112510681152, + "learning_rate": 2.8506084987513983e-05, + "loss": 1.5267, + "step": 8679 + }, + { + "epoch": 0.6533807561301492, + "grad_norm": 5.130321502685547, + "learning_rate": 2.8495078631289167e-05, + "loss": 1.7454, + "step": 8680 + }, + { + "epoch": 0.6534560304108094, + "grad_norm": 4.246479034423828, + "learning_rate": 2.8484073553470203e-05, + "loss": 1.9407, + "step": 8681 + }, + { + "epoch": 0.6535313046914696, + "grad_norm": 5.759989261627197, + "learning_rate": 2.8473069754711356e-05, + "loss": 1.9653, + "step": 8682 + }, + { + "epoch": 0.6536065789721297, + "grad_norm": 5.1931257247924805, + "learning_rate": 2.8462067235666724e-05, + "loss": 1.9794, + "step": 8683 + }, + { + "epoch": 0.6536818532527898, + "grad_norm": 3.2792458534240723, + "learning_rate": 2.845106599699041e-05, + "loss": 1.9016, + "step": 8684 + }, + { + "epoch": 0.65375712753345, + "grad_norm": 3.82450795173645, + "learning_rate": 2.8440066039336372e-05, + "loss": 1.6829, + "step": 8685 + }, + { + "epoch": 0.6538324018141102, + "grad_norm": 4.776988983154297, + "learning_rate": 2.8429067363358545e-05, + "loss": 1.4054, + "step": 8686 + }, + { + "epoch": 0.6539076760947703, + "grad_norm": 4.953002452850342, + "learning_rate": 2.8418069969710776e-05, + "loss": 2.0174, + "step": 8687 + }, + { + "epoch": 0.6539829503754305, + "grad_norm": 5.949739933013916, + "learning_rate": 2.840707385904682e-05, + "loss": 2.0863, + "step": 8688 + }, + { + "epoch": 0.6540582246560906, + "grad_norm": 5.9535298347473145, + "learning_rate": 2.8396079032020383e-05, + "loss": 2.0541, + "step": 8689 + }, + { + "epoch": 0.6541334989367508, + "grad_norm": 3.6686513423919678, + "learning_rate": 2.8385085489285034e-05, + "loss": 1.6368, + "step": 8690 + }, + { + "epoch": 0.6542087732174109, + "grad_norm": 6.072544097900391, + "learning_rate": 2.837409323149436e-05, + "loss": 1.8019, + "step": 8691 + }, + { + "epoch": 0.6542840474980711, + "grad_norm": 5.601493835449219, + "learning_rate": 2.836310225930177e-05, + "loss": 2.1077, + "step": 8692 + }, + { + "epoch": 0.6543593217787312, + "grad_norm": 4.370206356048584, + "learning_rate": 2.835211257336069e-05, + "loss": 1.9673, + "step": 8693 + }, + { + "epoch": 0.6544345960593914, + "grad_norm": 4.355002403259277, + "learning_rate": 2.8341124174324372e-05, + "loss": 1.7738, + "step": 8694 + }, + { + "epoch": 0.6545098703400516, + "grad_norm": 5.089869022369385, + "learning_rate": 2.8330137062846107e-05, + "loss": 1.8935, + "step": 8695 + }, + { + "epoch": 0.6545851446207117, + "grad_norm": 6.398248195648193, + "learning_rate": 2.8319151239578994e-05, + "loss": 2.0295, + "step": 8696 + }, + { + "epoch": 0.6546604189013718, + "grad_norm": 4.059794902801514, + "learning_rate": 2.8308166705176147e-05, + "loss": 1.8573, + "step": 8697 + }, + { + "epoch": 0.6547356931820321, + "grad_norm": 5.221839427947998, + "learning_rate": 2.8297183460290532e-05, + "loss": 1.8594, + "step": 8698 + }, + { + "epoch": 0.6548109674626922, + "grad_norm": 4.219726085662842, + "learning_rate": 2.8286201505575084e-05, + "loss": 1.6764, + "step": 8699 + }, + { + "epoch": 0.6548862417433523, + "grad_norm": 3.922380208969116, + "learning_rate": 2.8275220841682658e-05, + "loss": 1.7608, + "step": 8700 + }, + { + "epoch": 0.6549615160240125, + "grad_norm": 4.685485363006592, + "learning_rate": 2.8264241469266033e-05, + "loss": 1.5478, + "step": 8701 + }, + { + "epoch": 0.6550367903046727, + "grad_norm": 8.281332969665527, + "learning_rate": 2.825326338897787e-05, + "loss": 1.666, + "step": 8702 + }, + { + "epoch": 0.6551120645853328, + "grad_norm": 5.294943809509277, + "learning_rate": 2.8242286601470795e-05, + "loss": 1.9232, + "step": 8703 + }, + { + "epoch": 0.655187338865993, + "grad_norm": 5.194580078125, + "learning_rate": 2.8231311107397373e-05, + "loss": 2.0269, + "step": 8704 + }, + { + "epoch": 0.6552626131466531, + "grad_norm": 11.073525428771973, + "learning_rate": 2.8220336907410028e-05, + "loss": 2.0984, + "step": 8705 + }, + { + "epoch": 0.6553378874273132, + "grad_norm": 6.163061618804932, + "learning_rate": 2.8209364002161177e-05, + "loss": 1.8944, + "step": 8706 + }, + { + "epoch": 0.6554131617079735, + "grad_norm": 4.358154773712158, + "learning_rate": 2.81983923923031e-05, + "loss": 1.7566, + "step": 8707 + }, + { + "epoch": 0.6554884359886336, + "grad_norm": 4.6281938552856445, + "learning_rate": 2.8187422078488045e-05, + "loss": 1.7854, + "step": 8708 + }, + { + "epoch": 0.6555637102692937, + "grad_norm": 5.033481121063232, + "learning_rate": 2.8176453061368145e-05, + "loss": 1.9167, + "step": 8709 + }, + { + "epoch": 0.6556389845499538, + "grad_norm": 7.444966793060303, + "learning_rate": 2.8165485341595498e-05, + "loss": 2.2107, + "step": 8710 + }, + { + "epoch": 0.6557142588306141, + "grad_norm": 5.250480651855469, + "learning_rate": 2.8154518919822092e-05, + "loss": 2.078, + "step": 8711 + }, + { + "epoch": 0.6557895331112742, + "grad_norm": 4.205929279327393, + "learning_rate": 2.8143553796699872e-05, + "loss": 1.9267, + "step": 8712 + }, + { + "epoch": 0.6558648073919343, + "grad_norm": 6.08516788482666, + "learning_rate": 2.8132589972880653e-05, + "loss": 1.8515, + "step": 8713 + }, + { + "epoch": 0.6559400816725945, + "grad_norm": 5.553742408752441, + "learning_rate": 2.812162744901623e-05, + "loss": 1.7718, + "step": 8714 + }, + { + "epoch": 0.6560153559532547, + "grad_norm": 6.910336971282959, + "learning_rate": 2.8110666225758264e-05, + "loss": 2.3409, + "step": 8715 + }, + { + "epoch": 0.6560906302339148, + "grad_norm": 3.601201057434082, + "learning_rate": 2.80997063037584e-05, + "loss": 1.6426, + "step": 8716 + }, + { + "epoch": 0.656165904514575, + "grad_norm": 3.970806360244751, + "learning_rate": 2.8088747683668138e-05, + "loss": 2.0209, + "step": 8717 + }, + { + "epoch": 0.6562411787952351, + "grad_norm": 4.646604537963867, + "learning_rate": 2.8077790366138977e-05, + "loss": 2.0318, + "step": 8718 + }, + { + "epoch": 0.6563164530758953, + "grad_norm": 4.490777492523193, + "learning_rate": 2.8066834351822257e-05, + "loss": 1.6372, + "step": 8719 + }, + { + "epoch": 0.6563917273565555, + "grad_norm": 3.6624560356140137, + "learning_rate": 2.8055879641369298e-05, + "loss": 1.6147, + "step": 8720 + }, + { + "epoch": 0.6564670016372156, + "grad_norm": 5.967345714569092, + "learning_rate": 2.8044926235431335e-05, + "loss": 1.6397, + "step": 8721 + }, + { + "epoch": 0.6565422759178757, + "grad_norm": 4.960357666015625, + "learning_rate": 2.80339741346595e-05, + "loss": 1.8527, + "step": 8722 + }, + { + "epoch": 0.656617550198536, + "grad_norm": 5.28615140914917, + "learning_rate": 2.8023023339704907e-05, + "loss": 1.7492, + "step": 8723 + }, + { + "epoch": 0.6566928244791961, + "grad_norm": 5.680187225341797, + "learning_rate": 2.801207385121849e-05, + "loss": 1.8242, + "step": 8724 + }, + { + "epoch": 0.6567680987598562, + "grad_norm": 4.5485968589782715, + "learning_rate": 2.800112566985122e-05, + "loss": 1.8231, + "step": 8725 + }, + { + "epoch": 0.6568433730405164, + "grad_norm": 6.216640472412109, + "learning_rate": 2.799017879625388e-05, + "loss": 1.776, + "step": 8726 + }, + { + "epoch": 0.6569186473211766, + "grad_norm": 5.127228260040283, + "learning_rate": 2.7979233231077278e-05, + "loss": 1.6569, + "step": 8727 + }, + { + "epoch": 0.6569939216018367, + "grad_norm": 3.3271596431732178, + "learning_rate": 2.7968288974972058e-05, + "loss": 1.8206, + "step": 8728 + }, + { + "epoch": 0.6570691958824968, + "grad_norm": 4.320346355438232, + "learning_rate": 2.795734602858886e-05, + "loss": 1.8484, + "step": 8729 + }, + { + "epoch": 0.657144470163157, + "grad_norm": 5.000116348266602, + "learning_rate": 2.794640439257818e-05, + "loss": 1.7838, + "step": 8730 + }, + { + "epoch": 0.6572197444438171, + "grad_norm": 5.192089557647705, + "learning_rate": 2.7935464067590473e-05, + "loss": 2.2165, + "step": 8731 + }, + { + "epoch": 0.6572950187244773, + "grad_norm": 4.940530300140381, + "learning_rate": 2.7924525054276118e-05, + "loss": 1.7177, + "step": 8732 + }, + { + "epoch": 0.6573702930051375, + "grad_norm": 4.420196533203125, + "learning_rate": 2.7913587353285432e-05, + "loss": 1.7902, + "step": 8733 + }, + { + "epoch": 0.6574455672857976, + "grad_norm": 5.060540676116943, + "learning_rate": 2.7902650965268572e-05, + "loss": 2.1291, + "step": 8734 + }, + { + "epoch": 0.6575208415664577, + "grad_norm": 4.391613006591797, + "learning_rate": 2.7891715890875742e-05, + "loss": 1.3926, + "step": 8735 + }, + { + "epoch": 0.657596115847118, + "grad_norm": 7.196225166320801, + "learning_rate": 2.7880782130756937e-05, + "loss": 1.8463, + "step": 8736 + }, + { + "epoch": 0.6576713901277781, + "grad_norm": 4.369712829589844, + "learning_rate": 2.786984968556218e-05, + "loss": 1.8656, + "step": 8737 + }, + { + "epoch": 0.6577466644084382, + "grad_norm": 4.403004169464111, + "learning_rate": 2.7858918555941365e-05, + "loss": 1.738, + "step": 8738 + }, + { + "epoch": 0.6578219386890984, + "grad_norm": 4.340316295623779, + "learning_rate": 2.7847988742544297e-05, + "loss": 1.7306, + "step": 8739 + }, + { + "epoch": 0.6578972129697586, + "grad_norm": 4.489402770996094, + "learning_rate": 2.7837060246020762e-05, + "loss": 1.6377, + "step": 8740 + }, + { + "epoch": 0.6579724872504187, + "grad_norm": 4.764037609100342, + "learning_rate": 2.7826133067020375e-05, + "loss": 2.0081, + "step": 8741 + }, + { + "epoch": 0.6580477615310789, + "grad_norm": 4.301174640655518, + "learning_rate": 2.7815207206192783e-05, + "loss": 1.8452, + "step": 8742 + }, + { + "epoch": 0.658123035811739, + "grad_norm": 5.14947509765625, + "learning_rate": 2.7804282664187443e-05, + "loss": 1.9155, + "step": 8743 + }, + { + "epoch": 0.6581983100923992, + "grad_norm": 7.12629508972168, + "learning_rate": 2.779335944165381e-05, + "loss": 2.0534, + "step": 8744 + }, + { + "epoch": 0.6582735843730594, + "grad_norm": 4.720340251922607, + "learning_rate": 2.7782437539241233e-05, + "loss": 2.0739, + "step": 8745 + }, + { + "epoch": 0.6583488586537195, + "grad_norm": 4.3159661293029785, + "learning_rate": 2.777151695759903e-05, + "loss": 1.7583, + "step": 8746 + }, + { + "epoch": 0.6584241329343796, + "grad_norm": 5.066626071929932, + "learning_rate": 2.7760597697376335e-05, + "loss": 2.2023, + "step": 8747 + }, + { + "epoch": 0.6584994072150399, + "grad_norm": 4.680880546569824, + "learning_rate": 2.7749679759222314e-05, + "loss": 1.9215, + "step": 8748 + }, + { + "epoch": 0.6585746814957, + "grad_norm": 3.7674427032470703, + "learning_rate": 2.7738763143785972e-05, + "loss": 1.8614, + "step": 8749 + }, + { + "epoch": 0.6586499557763601, + "grad_norm": 3.798656940460205, + "learning_rate": 2.7727847851716305e-05, + "loss": 1.7624, + "step": 8750 + }, + { + "epoch": 0.6587252300570202, + "grad_norm": 5.341363906860352, + "learning_rate": 2.7716933883662154e-05, + "loss": 2.0625, + "step": 8751 + }, + { + "epoch": 0.6588005043376804, + "grad_norm": 4.5162272453308105, + "learning_rate": 2.770602124027235e-05, + "loss": 2.0344, + "step": 8752 + }, + { + "epoch": 0.6588757786183406, + "grad_norm": 3.985649824142456, + "learning_rate": 2.7695109922195632e-05, + "loss": 1.7292, + "step": 8753 + }, + { + "epoch": 0.6589510528990007, + "grad_norm": 5.624774932861328, + "learning_rate": 2.7684199930080613e-05, + "loss": 1.6561, + "step": 8754 + }, + { + "epoch": 0.6590263271796609, + "grad_norm": 4.4445109367370605, + "learning_rate": 2.7673291264575874e-05, + "loss": 1.8889, + "step": 8755 + }, + { + "epoch": 0.659101601460321, + "grad_norm": 4.026817798614502, + "learning_rate": 2.7662383926329903e-05, + "loss": 1.7208, + "step": 8756 + }, + { + "epoch": 0.6591768757409812, + "grad_norm": 5.325575351715088, + "learning_rate": 2.765147791599114e-05, + "loss": 1.6658, + "step": 8757 + }, + { + "epoch": 0.6592521500216414, + "grad_norm": 6.273247241973877, + "learning_rate": 2.7640573234207866e-05, + "loss": 2.0033, + "step": 8758 + }, + { + "epoch": 0.6593274243023015, + "grad_norm": 3.3490426540374756, + "learning_rate": 2.7629669881628384e-05, + "loss": 1.9294, + "step": 8759 + }, + { + "epoch": 0.6594026985829616, + "grad_norm": 5.980242729187012, + "learning_rate": 2.7618767858900818e-05, + "loss": 2.1203, + "step": 8760 + }, + { + "epoch": 0.6594779728636219, + "grad_norm": 5.140210151672363, + "learning_rate": 2.7607867166673296e-05, + "loss": 1.9771, + "step": 8761 + }, + { + "epoch": 0.659553247144282, + "grad_norm": 4.407763481140137, + "learning_rate": 2.759696780559381e-05, + "loss": 1.9322, + "step": 8762 + }, + { + "epoch": 0.6596285214249421, + "grad_norm": 4.496096134185791, + "learning_rate": 2.758606977631032e-05, + "loss": 1.9758, + "step": 8763 + }, + { + "epoch": 0.6597037957056023, + "grad_norm": 5.305761337280273, + "learning_rate": 2.757517307947065e-05, + "loss": 2.0047, + "step": 8764 + }, + { + "epoch": 0.6597790699862625, + "grad_norm": 4.556825637817383, + "learning_rate": 2.7564277715722596e-05, + "loss": 2.0209, + "step": 8765 + }, + { + "epoch": 0.6598543442669226, + "grad_norm": 5.236590385437012, + "learning_rate": 2.7553383685713862e-05, + "loss": 1.674, + "step": 8766 + }, + { + "epoch": 0.6599296185475828, + "grad_norm": 4.662038803100586, + "learning_rate": 2.7542490990092074e-05, + "loss": 2.0531, + "step": 8767 + }, + { + "epoch": 0.6600048928282429, + "grad_norm": 4.337377071380615, + "learning_rate": 2.7531599629504745e-05, + "loss": 2.1333, + "step": 8768 + }, + { + "epoch": 0.660080167108903, + "grad_norm": 4.375157833099365, + "learning_rate": 2.752070960459934e-05, + "loss": 1.7405, + "step": 8769 + }, + { + "epoch": 0.6601554413895632, + "grad_norm": 4.503413200378418, + "learning_rate": 2.7509820916023276e-05, + "loss": 1.9461, + "step": 8770 + }, + { + "epoch": 0.6602307156702234, + "grad_norm": 4.720217227935791, + "learning_rate": 2.749893356442381e-05, + "loss": 2.0645, + "step": 8771 + }, + { + "epoch": 0.6603059899508835, + "grad_norm": 5.786413669586182, + "learning_rate": 2.7488047550448193e-05, + "loss": 1.7386, + "step": 8772 + }, + { + "epoch": 0.6603812642315436, + "grad_norm": 3.9931697845458984, + "learning_rate": 2.747716287474354e-05, + "loss": 2.0254, + "step": 8773 + }, + { + "epoch": 0.6604565385122039, + "grad_norm": 5.328513145446777, + "learning_rate": 2.7466279537956944e-05, + "loss": 1.8, + "step": 8774 + }, + { + "epoch": 0.660531812792864, + "grad_norm": 3.8266711235046387, + "learning_rate": 2.745539754073536e-05, + "loss": 2.1057, + "step": 8775 + }, + { + "epoch": 0.6606070870735241, + "grad_norm": 4.392454147338867, + "learning_rate": 2.74445168837257e-05, + "loss": 2.1317, + "step": 8776 + }, + { + "epoch": 0.6606823613541843, + "grad_norm": 4.543263912200928, + "learning_rate": 2.7433637567574788e-05, + "loss": 1.8057, + "step": 8777 + }, + { + "epoch": 0.6607576356348445, + "grad_norm": 3.929015636444092, + "learning_rate": 2.7422759592929405e-05, + "loss": 1.7747, + "step": 8778 + }, + { + "epoch": 0.6608329099155046, + "grad_norm": 4.441057205200195, + "learning_rate": 2.7411882960436153e-05, + "loss": 1.5652, + "step": 8779 + }, + { + "epoch": 0.6609081841961648, + "grad_norm": 7.1514129638671875, + "learning_rate": 2.7401007670741667e-05, + "loss": 1.6769, + "step": 8780 + }, + { + "epoch": 0.6609834584768249, + "grad_norm": 4.549609661102295, + "learning_rate": 2.7390133724492417e-05, + "loss": 1.5836, + "step": 8781 + }, + { + "epoch": 0.6610587327574851, + "grad_norm": 4.804782390594482, + "learning_rate": 2.7379261122334855e-05, + "loss": 2.2311, + "step": 8782 + }, + { + "epoch": 0.6611340070381453, + "grad_norm": 4.868736267089844, + "learning_rate": 2.7368389864915294e-05, + "loss": 1.8926, + "step": 8783 + }, + { + "epoch": 0.6612092813188054, + "grad_norm": 6.154513359069824, + "learning_rate": 2.7357519952880023e-05, + "loss": 1.7282, + "step": 8784 + }, + { + "epoch": 0.6612845555994655, + "grad_norm": 5.211771011352539, + "learning_rate": 2.7346651386875237e-05, + "loss": 1.7508, + "step": 8785 + }, + { + "epoch": 0.6613598298801258, + "grad_norm": 4.101115703582764, + "learning_rate": 2.7335784167547006e-05, + "loss": 1.8346, + "step": 8786 + }, + { + "epoch": 0.6614351041607859, + "grad_norm": 4.862835884094238, + "learning_rate": 2.732491829554138e-05, + "loss": 1.835, + "step": 8787 + }, + { + "epoch": 0.661510378441446, + "grad_norm": 5.2062907218933105, + "learning_rate": 2.73140537715043e-05, + "loss": 2.0171, + "step": 8788 + }, + { + "epoch": 0.6615856527221061, + "grad_norm": 5.2599029541015625, + "learning_rate": 2.7303190596081645e-05, + "loss": 2.2126, + "step": 8789 + }, + { + "epoch": 0.6616609270027664, + "grad_norm": 4.830210208892822, + "learning_rate": 2.7292328769919172e-05, + "loss": 1.6777, + "step": 8790 + }, + { + "epoch": 0.6617362012834265, + "grad_norm": 4.170035362243652, + "learning_rate": 2.7281468293662608e-05, + "loss": 1.6983, + "step": 8791 + }, + { + "epoch": 0.6618114755640866, + "grad_norm": 4.532258033752441, + "learning_rate": 2.727060916795755e-05, + "loss": 1.8308, + "step": 8792 + }, + { + "epoch": 0.6618867498447468, + "grad_norm": 5.162574291229248, + "learning_rate": 2.7259751393449584e-05, + "loss": 1.9941, + "step": 8793 + }, + { + "epoch": 0.661962024125407, + "grad_norm": 5.646231174468994, + "learning_rate": 2.724889497078413e-05, + "loss": 2.2627, + "step": 8794 + }, + { + "epoch": 0.6620372984060671, + "grad_norm": 4.764942169189453, + "learning_rate": 2.7238039900606605e-05, + "loss": 2.3024, + "step": 8795 + }, + { + "epoch": 0.6621125726867273, + "grad_norm": 4.774278163909912, + "learning_rate": 2.7227186183562276e-05, + "loss": 1.5764, + "step": 8796 + }, + { + "epoch": 0.6621878469673874, + "grad_norm": 4.349574089050293, + "learning_rate": 2.7216333820296402e-05, + "loss": 1.8824, + "step": 8797 + }, + { + "epoch": 0.6622631212480475, + "grad_norm": 5.4706292152404785, + "learning_rate": 2.7205482811454097e-05, + "loss": 1.884, + "step": 8798 + }, + { + "epoch": 0.6623383955287078, + "grad_norm": 3.846074104309082, + "learning_rate": 2.7194633157680434e-05, + "loss": 2.0543, + "step": 8799 + }, + { + "epoch": 0.6624136698093679, + "grad_norm": 5.873631000518799, + "learning_rate": 2.7183784859620387e-05, + "loss": 1.7716, + "step": 8800 + }, + { + "epoch": 0.662488944090028, + "grad_norm": 5.629774570465088, + "learning_rate": 2.7172937917918868e-05, + "loss": 1.9605, + "step": 8801 + }, + { + "epoch": 0.6625642183706882, + "grad_norm": 4.951746940612793, + "learning_rate": 2.7162092333220712e-05, + "loss": 1.7837, + "step": 8802 + }, + { + "epoch": 0.6626394926513484, + "grad_norm": 5.148329734802246, + "learning_rate": 2.7151248106170613e-05, + "loss": 2.1406, + "step": 8803 + }, + { + "epoch": 0.6627147669320085, + "grad_norm": 5.117009162902832, + "learning_rate": 2.714040523741328e-05, + "loss": 1.8966, + "step": 8804 + }, + { + "epoch": 0.6627900412126687, + "grad_norm": 4.9454216957092285, + "learning_rate": 2.712956372759324e-05, + "loss": 1.851, + "step": 8805 + }, + { + "epoch": 0.6628653154933288, + "grad_norm": 3.680377244949341, + "learning_rate": 2.7118723577355033e-05, + "loss": 1.7048, + "step": 8806 + }, + { + "epoch": 0.662940589773989, + "grad_norm": 6.020511627197266, + "learning_rate": 2.7107884787343035e-05, + "loss": 1.777, + "step": 8807 + }, + { + "epoch": 0.6630158640546491, + "grad_norm": 4.53709077835083, + "learning_rate": 2.709704735820162e-05, + "loss": 1.9225, + "step": 8808 + }, + { + "epoch": 0.6630911383353093, + "grad_norm": 4.87180233001709, + "learning_rate": 2.708621129057501e-05, + "loss": 1.4963, + "step": 8809 + }, + { + "epoch": 0.6631664126159694, + "grad_norm": 4.665671348571777, + "learning_rate": 2.7075376585107388e-05, + "loss": 1.7045, + "step": 8810 + }, + { + "epoch": 0.6632416868966295, + "grad_norm": 4.7149763107299805, + "learning_rate": 2.706454324244285e-05, + "loss": 1.7171, + "step": 8811 + }, + { + "epoch": 0.6633169611772898, + "grad_norm": 4.419112682342529, + "learning_rate": 2.7053711263225427e-05, + "loss": 2.0527, + "step": 8812 + }, + { + "epoch": 0.6633922354579499, + "grad_norm": 5.024907112121582, + "learning_rate": 2.7042880648099013e-05, + "loss": 1.5522, + "step": 8813 + }, + { + "epoch": 0.66346750973861, + "grad_norm": 5.341494560241699, + "learning_rate": 2.7032051397707492e-05, + "loss": 1.8569, + "step": 8814 + }, + { + "epoch": 0.6635427840192702, + "grad_norm": 4.947847366333008, + "learning_rate": 2.7021223512694587e-05, + "loss": 2.0411, + "step": 8815 + }, + { + "epoch": 0.6636180582999304, + "grad_norm": 5.178067207336426, + "learning_rate": 2.701039699370401e-05, + "loss": 1.8223, + "step": 8816 + }, + { + "epoch": 0.6636933325805905, + "grad_norm": 4.4944047927856445, + "learning_rate": 2.6999571841379394e-05, + "loss": 1.8239, + "step": 8817 + }, + { + "epoch": 0.6637686068612507, + "grad_norm": 4.428443431854248, + "learning_rate": 2.6988748056364214e-05, + "loss": 1.6589, + "step": 8818 + }, + { + "epoch": 0.6638438811419108, + "grad_norm": 4.5360517501831055, + "learning_rate": 2.697792563930196e-05, + "loss": 1.8336, + "step": 8819 + }, + { + "epoch": 0.663919155422571, + "grad_norm": 6.5190629959106445, + "learning_rate": 2.696710459083594e-05, + "loss": 1.5053, + "step": 8820 + }, + { + "epoch": 0.6639944297032312, + "grad_norm": 3.7468550205230713, + "learning_rate": 2.695628491160947e-05, + "loss": 1.4678, + "step": 8821 + }, + { + "epoch": 0.6640697039838913, + "grad_norm": 4.403375148773193, + "learning_rate": 2.694546660226574e-05, + "loss": 1.847, + "step": 8822 + }, + { + "epoch": 0.6641449782645514, + "grad_norm": 4.647982597351074, + "learning_rate": 2.6934649663447885e-05, + "loss": 1.4818, + "step": 8823 + }, + { + "epoch": 0.6642202525452117, + "grad_norm": 3.9248507022857666, + "learning_rate": 2.6923834095798916e-05, + "loss": 1.9991, + "step": 8824 + }, + { + "epoch": 0.6642955268258718, + "grad_norm": 4.492358207702637, + "learning_rate": 2.691301989996181e-05, + "loss": 1.7987, + "step": 8825 + }, + { + "epoch": 0.6643708011065319, + "grad_norm": 3.765650749206543, + "learning_rate": 2.6902207076579406e-05, + "loss": 1.3297, + "step": 8826 + }, + { + "epoch": 0.6644460753871921, + "grad_norm": 4.731827259063721, + "learning_rate": 2.6891395626294536e-05, + "loss": 1.6934, + "step": 8827 + }, + { + "epoch": 0.6645213496678523, + "grad_norm": 5.9216628074646, + "learning_rate": 2.6880585549749875e-05, + "loss": 1.9936, + "step": 8828 + }, + { + "epoch": 0.6645966239485124, + "grad_norm": 5.2762885093688965, + "learning_rate": 2.6869776847588078e-05, + "loss": 1.8152, + "step": 8829 + }, + { + "epoch": 0.6646718982291725, + "grad_norm": 5.487278938293457, + "learning_rate": 2.685896952045167e-05, + "loss": 1.637, + "step": 8830 + }, + { + "epoch": 0.6647471725098327, + "grad_norm": 6.06055212020874, + "learning_rate": 2.684816356898312e-05, + "loss": 2.1396, + "step": 8831 + }, + { + "epoch": 0.6648224467904928, + "grad_norm": 4.602733612060547, + "learning_rate": 2.6837358993824814e-05, + "loss": 1.9866, + "step": 8832 + }, + { + "epoch": 0.664897721071153, + "grad_norm": 4.820003509521484, + "learning_rate": 2.6826555795619056e-05, + "loss": 2.5, + "step": 8833 + }, + { + "epoch": 0.6649729953518132, + "grad_norm": 5.898910999298096, + "learning_rate": 2.6815753975008085e-05, + "loss": 1.7225, + "step": 8834 + }, + { + "epoch": 0.6650482696324733, + "grad_norm": 5.85598087310791, + "learning_rate": 2.6804953532634e-05, + "loss": 1.7758, + "step": 8835 + }, + { + "epoch": 0.6651235439131334, + "grad_norm": 5.7575154304504395, + "learning_rate": 2.6794154469138887e-05, + "loss": 2.1179, + "step": 8836 + }, + { + "epoch": 0.6651988181937937, + "grad_norm": 5.69008207321167, + "learning_rate": 2.6783356785164686e-05, + "loss": 1.7158, + "step": 8837 + }, + { + "epoch": 0.6652740924744538, + "grad_norm": 3.8738021850585938, + "learning_rate": 2.6772560481353332e-05, + "loss": 1.5365, + "step": 8838 + }, + { + "epoch": 0.6653493667551139, + "grad_norm": 5.95831298828125, + "learning_rate": 2.6761765558346585e-05, + "loss": 1.8869, + "step": 8839 + }, + { + "epoch": 0.6654246410357741, + "grad_norm": 4.69302225112915, + "learning_rate": 2.6750972016786223e-05, + "loss": 2.0503, + "step": 8840 + }, + { + "epoch": 0.6654999153164343, + "grad_norm": 4.687337875366211, + "learning_rate": 2.6740179857313845e-05, + "loss": 1.9947, + "step": 8841 + }, + { + "epoch": 0.6655751895970944, + "grad_norm": 4.024134159088135, + "learning_rate": 2.6729389080571033e-05, + "loss": 1.7627, + "step": 8842 + }, + { + "epoch": 0.6656504638777546, + "grad_norm": 6.774590969085693, + "learning_rate": 2.6718599687199293e-05, + "loss": 2.0674, + "step": 8843 + }, + { + "epoch": 0.6657257381584147, + "grad_norm": 5.074631690979004, + "learning_rate": 2.6707811677839978e-05, + "loss": 2.0338, + "step": 8844 + }, + { + "epoch": 0.6658010124390749, + "grad_norm": 6.617143154144287, + "learning_rate": 2.669702505313442e-05, + "loss": 2.0158, + "step": 8845 + }, + { + "epoch": 0.6658762867197351, + "grad_norm": 6.428169250488281, + "learning_rate": 2.668623981372389e-05, + "loss": 1.8422, + "step": 8846 + }, + { + "epoch": 0.6659515610003952, + "grad_norm": 4.327280044555664, + "learning_rate": 2.6675455960249478e-05, + "loss": 1.6413, + "step": 8847 + }, + { + "epoch": 0.6660268352810553, + "grad_norm": 4.7735090255737305, + "learning_rate": 2.666467349335231e-05, + "loss": 1.7619, + "step": 8848 + }, + { + "epoch": 0.6661021095617154, + "grad_norm": 4.143545150756836, + "learning_rate": 2.665389241367332e-05, + "loss": 1.7187, + "step": 8849 + }, + { + "epoch": 0.6661773838423757, + "grad_norm": 6.336202621459961, + "learning_rate": 2.6643112721853446e-05, + "loss": 1.9036, + "step": 8850 + }, + { + "epoch": 0.6662526581230358, + "grad_norm": 4.243865966796875, + "learning_rate": 2.6632334418533516e-05, + "loss": 1.6973, + "step": 8851 + }, + { + "epoch": 0.6663279324036959, + "grad_norm": 5.223363399505615, + "learning_rate": 2.6621557504354237e-05, + "loss": 1.561, + "step": 8852 + }, + { + "epoch": 0.6664032066843562, + "grad_norm": 5.2146172523498535, + "learning_rate": 2.6610781979956307e-05, + "loss": 1.7123, + "step": 8853 + }, + { + "epoch": 0.6664784809650163, + "grad_norm": 4.6776018142700195, + "learning_rate": 2.6600007845980257e-05, + "loss": 1.8581, + "step": 8854 + }, + { + "epoch": 0.6665537552456764, + "grad_norm": 5.126739025115967, + "learning_rate": 2.6589235103066595e-05, + "loss": 1.7989, + "step": 8855 + }, + { + "epoch": 0.6666290295263366, + "grad_norm": 4.947964668273926, + "learning_rate": 2.657846375185573e-05, + "loss": 1.859, + "step": 8856 + }, + { + "epoch": 0.6667043038069967, + "grad_norm": 4.283720016479492, + "learning_rate": 2.6567693792988017e-05, + "loss": 1.8351, + "step": 8857 + }, + { + "epoch": 0.6667795780876569, + "grad_norm": 5.613223075866699, + "learning_rate": 2.6556925227103656e-05, + "loss": 1.8202, + "step": 8858 + }, + { + "epoch": 0.6668548523683171, + "grad_norm": 4.260611057281494, + "learning_rate": 2.654615805484284e-05, + "loss": 1.6868, + "step": 8859 + }, + { + "epoch": 0.6669301266489772, + "grad_norm": 4.571383476257324, + "learning_rate": 2.6535392276845612e-05, + "loss": 1.8439, + "step": 8860 + }, + { + "epoch": 0.6670054009296373, + "grad_norm": 4.924087047576904, + "learning_rate": 2.652462789375201e-05, + "loss": 1.8539, + "step": 8861 + }, + { + "epoch": 0.6670806752102976, + "grad_norm": 4.206077575683594, + "learning_rate": 2.651386490620189e-05, + "loss": 1.9324, + "step": 8862 + }, + { + "epoch": 0.6671559494909577, + "grad_norm": 4.6520676612854, + "learning_rate": 2.6503103314835142e-05, + "loss": 1.5271, + "step": 8863 + }, + { + "epoch": 0.6672312237716178, + "grad_norm": 4.078134059906006, + "learning_rate": 2.6492343120291453e-05, + "loss": 1.6754, + "step": 8864 + }, + { + "epoch": 0.667306498052278, + "grad_norm": 3.992279529571533, + "learning_rate": 2.6481584323210524e-05, + "loss": 1.8784, + "step": 8865 + }, + { + "epoch": 0.6673817723329382, + "grad_norm": 5.328444004058838, + "learning_rate": 2.6470826924231918e-05, + "loss": 1.8818, + "step": 8866 + }, + { + "epoch": 0.6674570466135983, + "grad_norm": 5.049443244934082, + "learning_rate": 2.646007092399514e-05, + "loss": 1.5722, + "step": 8867 + }, + { + "epoch": 0.6675323208942584, + "grad_norm": 4.65287971496582, + "learning_rate": 2.644931632313963e-05, + "loss": 2.2848, + "step": 8868 + }, + { + "epoch": 0.6676075951749186, + "grad_norm": 4.520260810852051, + "learning_rate": 2.643856312230466e-05, + "loss": 2.0799, + "step": 8869 + }, + { + "epoch": 0.6676828694555788, + "grad_norm": 6.330007553100586, + "learning_rate": 2.6427811322129525e-05, + "loss": 2.1194, + "step": 8870 + }, + { + "epoch": 0.6677581437362389, + "grad_norm": 4.228724002838135, + "learning_rate": 2.641706092325336e-05, + "loss": 1.9272, + "step": 8871 + }, + { + "epoch": 0.6678334180168991, + "grad_norm": 4.327436923980713, + "learning_rate": 2.6406311926315273e-05, + "loss": 1.7915, + "step": 8872 + }, + { + "epoch": 0.6679086922975592, + "grad_norm": 5.304169178009033, + "learning_rate": 2.6395564331954225e-05, + "loss": 1.7595, + "step": 8873 + }, + { + "epoch": 0.6679839665782193, + "grad_norm": 5.184169769287109, + "learning_rate": 2.6384818140809166e-05, + "loss": 2.2099, + "step": 8874 + }, + { + "epoch": 0.6680592408588796, + "grad_norm": 6.44237756729126, + "learning_rate": 2.6374073353518892e-05, + "loss": 1.6844, + "step": 8875 + }, + { + "epoch": 0.6681345151395397, + "grad_norm": 4.902488708496094, + "learning_rate": 2.636332997072216e-05, + "loss": 2.0372, + "step": 8876 + }, + { + "epoch": 0.6682097894201998, + "grad_norm": 8.373483657836914, + "learning_rate": 2.6352587993057646e-05, + "loss": 1.7565, + "step": 8877 + }, + { + "epoch": 0.66828506370086, + "grad_norm": 5.195601463317871, + "learning_rate": 2.6341847421163946e-05, + "loss": 1.8044, + "step": 8878 + }, + { + "epoch": 0.6683603379815202, + "grad_norm": 5.000908851623535, + "learning_rate": 2.633110825567951e-05, + "loss": 1.6557, + "step": 8879 + }, + { + "epoch": 0.6684356122621803, + "grad_norm": 5.616551399230957, + "learning_rate": 2.6320370497242798e-05, + "loss": 1.8628, + "step": 8880 + }, + { + "epoch": 0.6685108865428405, + "grad_norm": 4.4209513664245605, + "learning_rate": 2.630963414649209e-05, + "loss": 1.7255, + "step": 8881 + }, + { + "epoch": 0.6685861608235006, + "grad_norm": 6.026299953460693, + "learning_rate": 2.6298899204065657e-05, + "loss": 2.2945, + "step": 8882 + }, + { + "epoch": 0.6686614351041608, + "grad_norm": 3.0953924655914307, + "learning_rate": 2.6288165670601684e-05, + "loss": 1.9448, + "step": 8883 + }, + { + "epoch": 0.668736709384821, + "grad_norm": 6.213339328765869, + "learning_rate": 2.6277433546738206e-05, + "loss": 1.6914, + "step": 8884 + }, + { + "epoch": 0.6688119836654811, + "grad_norm": 3.756370782852173, + "learning_rate": 2.626670283311325e-05, + "loss": 1.9009, + "step": 8885 + }, + { + "epoch": 0.6688872579461412, + "grad_norm": 6.405398368835449, + "learning_rate": 2.6255973530364696e-05, + "loss": 1.7966, + "step": 8886 + }, + { + "epoch": 0.6689625322268014, + "grad_norm": 4.880269527435303, + "learning_rate": 2.6245245639130388e-05, + "loss": 1.5388, + "step": 8887 + }, + { + "epoch": 0.6690378065074616, + "grad_norm": 6.3944220542907715, + "learning_rate": 2.6234519160048064e-05, + "loss": 1.845, + "step": 8888 + }, + { + "epoch": 0.6691130807881217, + "grad_norm": 5.3372063636779785, + "learning_rate": 2.6223794093755405e-05, + "loss": 1.9204, + "step": 8889 + }, + { + "epoch": 0.6691883550687818, + "grad_norm": 4.326444625854492, + "learning_rate": 2.6213070440889942e-05, + "loss": 1.8462, + "step": 8890 + }, + { + "epoch": 0.6692636293494421, + "grad_norm": 5.463204383850098, + "learning_rate": 2.6202348202089216e-05, + "loss": 2.1027, + "step": 8891 + }, + { + "epoch": 0.6693389036301022, + "grad_norm": 5.259471416473389, + "learning_rate": 2.6191627377990586e-05, + "loss": 1.9241, + "step": 8892 + }, + { + "epoch": 0.6694141779107623, + "grad_norm": 6.563348770141602, + "learning_rate": 2.618090796923141e-05, + "loss": 1.9239, + "step": 8893 + }, + { + "epoch": 0.6694894521914225, + "grad_norm": 3.9169232845306396, + "learning_rate": 2.6170189976448894e-05, + "loss": 2.1645, + "step": 8894 + }, + { + "epoch": 0.6695647264720826, + "grad_norm": 4.303414821624756, + "learning_rate": 2.6159473400280227e-05, + "loss": 2.0235, + "step": 8895 + }, + { + "epoch": 0.6696400007527428, + "grad_norm": 4.6599273681640625, + "learning_rate": 2.6148758241362436e-05, + "loss": 1.9235, + "step": 8896 + }, + { + "epoch": 0.669715275033403, + "grad_norm": 4.29166841506958, + "learning_rate": 2.6138044500332536e-05, + "loss": 1.9216, + "step": 8897 + }, + { + "epoch": 0.6697905493140631, + "grad_norm": 5.0810017585754395, + "learning_rate": 2.6127332177827446e-05, + "loss": 1.479, + "step": 8898 + }, + { + "epoch": 0.6698658235947232, + "grad_norm": 5.189507484436035, + "learning_rate": 2.611662127448393e-05, + "loss": 1.7415, + "step": 8899 + }, + { + "epoch": 0.6699410978753835, + "grad_norm": 5.042854309082031, + "learning_rate": 2.6105911790938763e-05, + "loss": 2.018, + "step": 8900 + }, + { + "epoch": 0.6700163721560436, + "grad_norm": 5.700060844421387, + "learning_rate": 2.609520372782857e-05, + "loss": 1.7725, + "step": 8901 + }, + { + "epoch": 0.6700916464367037, + "grad_norm": 4.652759552001953, + "learning_rate": 2.608449708578995e-05, + "loss": 1.6746, + "step": 8902 + }, + { + "epoch": 0.670166920717364, + "grad_norm": 5.318171501159668, + "learning_rate": 2.607379186545933e-05, + "loss": 1.8465, + "step": 8903 + }, + { + "epoch": 0.6702421949980241, + "grad_norm": 3.932227373123169, + "learning_rate": 2.6063088067473156e-05, + "loss": 1.5701, + "step": 8904 + }, + { + "epoch": 0.6703174692786842, + "grad_norm": 4.138515949249268, + "learning_rate": 2.6052385692467697e-05, + "loss": 1.5684, + "step": 8905 + }, + { + "epoch": 0.6703927435593443, + "grad_norm": 4.297887802124023, + "learning_rate": 2.604168474107921e-05, + "loss": 1.8615, + "step": 8906 + }, + { + "epoch": 0.6704680178400045, + "grad_norm": 4.569864273071289, + "learning_rate": 2.6030985213943803e-05, + "loss": 1.7772, + "step": 8907 + }, + { + "epoch": 0.6705432921206647, + "grad_norm": 4.180438995361328, + "learning_rate": 2.602028711169757e-05, + "loss": 1.4586, + "step": 8908 + }, + { + "epoch": 0.6706185664013248, + "grad_norm": 4.958812236785889, + "learning_rate": 2.6009590434976445e-05, + "loss": 1.7733, + "step": 8909 + }, + { + "epoch": 0.670693840681985, + "grad_norm": 8.969746589660645, + "learning_rate": 2.5998895184416327e-05, + "loss": 1.9241, + "step": 8910 + }, + { + "epoch": 0.6707691149626451, + "grad_norm": 4.2592878341674805, + "learning_rate": 2.598820136065303e-05, + "loss": 1.4884, + "step": 8911 + }, + { + "epoch": 0.6708443892433053, + "grad_norm": 4.074267864227295, + "learning_rate": 2.5977508964322282e-05, + "loss": 1.9795, + "step": 8912 + }, + { + "epoch": 0.6709196635239655, + "grad_norm": 5.606818675994873, + "learning_rate": 2.5966817996059677e-05, + "loss": 2.001, + "step": 8913 + }, + { + "epoch": 0.6709949378046256, + "grad_norm": 5.230132579803467, + "learning_rate": 2.5956128456500784e-05, + "loss": 2.1983, + "step": 8914 + }, + { + "epoch": 0.6710702120852857, + "grad_norm": 5.076301574707031, + "learning_rate": 2.594544034628108e-05, + "loss": 2.0636, + "step": 8915 + }, + { + "epoch": 0.671145486365946, + "grad_norm": 5.236661434173584, + "learning_rate": 2.5934753666035917e-05, + "loss": 1.9933, + "step": 8916 + }, + { + "epoch": 0.6712207606466061, + "grad_norm": 7.820615291595459, + "learning_rate": 2.592406841640061e-05, + "loss": 1.692, + "step": 8917 + }, + { + "epoch": 0.6712960349272662, + "grad_norm": 5.645308494567871, + "learning_rate": 2.5913384598010337e-05, + "loss": 2.2752, + "step": 8918 + }, + { + "epoch": 0.6713713092079264, + "grad_norm": 13.594305992126465, + "learning_rate": 2.590270221150025e-05, + "loss": 1.9373, + "step": 8919 + }, + { + "epoch": 0.6714465834885865, + "grad_norm": 4.773320198059082, + "learning_rate": 2.589202125750536e-05, + "loss": 1.7681, + "step": 8920 + }, + { + "epoch": 0.6715218577692467, + "grad_norm": 5.516262531280518, + "learning_rate": 2.5881341736660635e-05, + "loss": 1.7638, + "step": 8921 + }, + { + "epoch": 0.6715971320499069, + "grad_norm": 6.392311096191406, + "learning_rate": 2.587066364960094e-05, + "loss": 1.7556, + "step": 8922 + }, + { + "epoch": 0.671672406330567, + "grad_norm": 5.861118793487549, + "learning_rate": 2.5859986996961074e-05, + "loss": 1.4801, + "step": 8923 + }, + { + "epoch": 0.6717476806112271, + "grad_norm": 3.715571641921997, + "learning_rate": 2.5849311779375696e-05, + "loss": 1.796, + "step": 8924 + }, + { + "epoch": 0.6718229548918874, + "grad_norm": 5.83088493347168, + "learning_rate": 2.5838637997479454e-05, + "loss": 2.1076, + "step": 8925 + }, + { + "epoch": 0.6718982291725475, + "grad_norm": 5.278836727142334, + "learning_rate": 2.5827965651906838e-05, + "loss": 1.7612, + "step": 8926 + }, + { + "epoch": 0.6719735034532076, + "grad_norm": 4.42335844039917, + "learning_rate": 2.5817294743292324e-05, + "loss": 1.7263, + "step": 8927 + }, + { + "epoch": 0.6720487777338677, + "grad_norm": 6.038539409637451, + "learning_rate": 2.5806625272270236e-05, + "loss": 2.0524, + "step": 8928 + }, + { + "epoch": 0.672124052014528, + "grad_norm": 5.974808692932129, + "learning_rate": 2.5795957239474856e-05, + "loss": 1.7796, + "step": 8929 + }, + { + "epoch": 0.6721993262951881, + "grad_norm": 5.594521999359131, + "learning_rate": 2.5785290645540382e-05, + "loss": 1.7868, + "step": 8930 + }, + { + "epoch": 0.6722746005758482, + "grad_norm": 5.21370792388916, + "learning_rate": 2.5774625491100877e-05, + "loss": 1.9736, + "step": 8931 + }, + { + "epoch": 0.6723498748565084, + "grad_norm": 5.262139797210693, + "learning_rate": 2.576396177679039e-05, + "loss": 2.0341, + "step": 8932 + }, + { + "epoch": 0.6724251491371686, + "grad_norm": 5.050817489624023, + "learning_rate": 2.5753299503242823e-05, + "loss": 1.9789, + "step": 8933 + }, + { + "epoch": 0.6725004234178287, + "grad_norm": 4.763389587402344, + "learning_rate": 2.5742638671092058e-05, + "loss": 2.1324, + "step": 8934 + }, + { + "epoch": 0.6725756976984889, + "grad_norm": 4.985348224639893, + "learning_rate": 2.5731979280971795e-05, + "loss": 1.8983, + "step": 8935 + }, + { + "epoch": 0.672650971979149, + "grad_norm": 3.7745063304901123, + "learning_rate": 2.572132133351575e-05, + "loss": 1.7179, + "step": 8936 + }, + { + "epoch": 0.6727262462598091, + "grad_norm": 4.594761371612549, + "learning_rate": 2.5710664829357473e-05, + "loss": 2.0431, + "step": 8937 + }, + { + "epoch": 0.6728015205404694, + "grad_norm": 4.333402156829834, + "learning_rate": 2.5700009769130496e-05, + "loss": 1.8918, + "step": 8938 + }, + { + "epoch": 0.6728767948211295, + "grad_norm": 5.764066219329834, + "learning_rate": 2.5689356153468192e-05, + "loss": 1.9849, + "step": 8939 + }, + { + "epoch": 0.6729520691017896, + "grad_norm": 5.158320426940918, + "learning_rate": 2.567870398300393e-05, + "loss": 1.8153, + "step": 8940 + }, + { + "epoch": 0.6730273433824498, + "grad_norm": 6.412191390991211, + "learning_rate": 2.5668053258370918e-05, + "loss": 1.8805, + "step": 8941 + }, + { + "epoch": 0.67310261766311, + "grad_norm": 4.714682102203369, + "learning_rate": 2.565740398020231e-05, + "loss": 1.4891, + "step": 8942 + }, + { + "epoch": 0.6731778919437701, + "grad_norm": 5.484572410583496, + "learning_rate": 2.5646756149131202e-05, + "loss": 1.6467, + "step": 8943 + }, + { + "epoch": 0.6732531662244303, + "grad_norm": 6.749970436096191, + "learning_rate": 2.5636109765790573e-05, + "loss": 1.9803, + "step": 8944 + }, + { + "epoch": 0.6733284405050904, + "grad_norm": 4.412948131561279, + "learning_rate": 2.5625464830813295e-05, + "loss": 1.6374, + "step": 8945 + }, + { + "epoch": 0.6734037147857506, + "grad_norm": 4.890275001525879, + "learning_rate": 2.5614821344832197e-05, + "loss": 2.0223, + "step": 8946 + }, + { + "epoch": 0.6734789890664107, + "grad_norm": 5.711452484130859, + "learning_rate": 2.5604179308480018e-05, + "loss": 1.8884, + "step": 8947 + }, + { + "epoch": 0.6735542633470709, + "grad_norm": 4.520747184753418, + "learning_rate": 2.5593538722389364e-05, + "loss": 1.9254, + "step": 8948 + }, + { + "epoch": 0.673629537627731, + "grad_norm": 3.953178644180298, + "learning_rate": 2.558289958719282e-05, + "loss": 1.7587, + "step": 8949 + }, + { + "epoch": 0.6737048119083912, + "grad_norm": 5.083440780639648, + "learning_rate": 2.557226190352281e-05, + "loss": 1.7061, + "step": 8950 + }, + { + "epoch": 0.6737800861890514, + "grad_norm": 3.774132251739502, + "learning_rate": 2.5561625672011767e-05, + "loss": 1.8521, + "step": 8951 + }, + { + "epoch": 0.6738553604697115, + "grad_norm": 5.798524379730225, + "learning_rate": 2.5550990893291933e-05, + "loss": 2.1594, + "step": 8952 + }, + { + "epoch": 0.6739306347503716, + "grad_norm": 4.3504719734191895, + "learning_rate": 2.5540357567995554e-05, + "loss": 1.8972, + "step": 8953 + }, + { + "epoch": 0.6740059090310319, + "grad_norm": 4.402191162109375, + "learning_rate": 2.5529725696754726e-05, + "loss": 1.6157, + "step": 8954 + }, + { + "epoch": 0.674081183311692, + "grad_norm": 4.855814456939697, + "learning_rate": 2.551909528020148e-05, + "loss": 1.6673, + "step": 8955 + }, + { + "epoch": 0.6741564575923521, + "grad_norm": 3.893885612487793, + "learning_rate": 2.550846631896778e-05, + "loss": 1.8216, + "step": 8956 + }, + { + "epoch": 0.6742317318730123, + "grad_norm": 4.49495267868042, + "learning_rate": 2.5497838813685503e-05, + "loss": 1.8598, + "step": 8957 + }, + { + "epoch": 0.6743070061536725, + "grad_norm": 4.231896877288818, + "learning_rate": 2.548721276498639e-05, + "loss": 1.7879, + "step": 8958 + }, + { + "epoch": 0.6743822804343326, + "grad_norm": 7.277490139007568, + "learning_rate": 2.547658817350216e-05, + "loss": 1.909, + "step": 8959 + }, + { + "epoch": 0.6744575547149928, + "grad_norm": 10.944930076599121, + "learning_rate": 2.546596503986437e-05, + "loss": 2.3362, + "step": 8960 + }, + { + "epoch": 0.6745328289956529, + "grad_norm": 4.4685378074646, + "learning_rate": 2.5455343364704575e-05, + "loss": 1.723, + "step": 8961 + }, + { + "epoch": 0.674608103276313, + "grad_norm": 4.842438697814941, + "learning_rate": 2.5444723148654204e-05, + "loss": 1.8035, + "step": 8962 + }, + { + "epoch": 0.6746833775569733, + "grad_norm": 4.29227876663208, + "learning_rate": 2.543410439234456e-05, + "loss": 1.6039, + "step": 8963 + }, + { + "epoch": 0.6747586518376334, + "grad_norm": 4.410990238189697, + "learning_rate": 2.542348709640695e-05, + "loss": 1.8499, + "step": 8964 + }, + { + "epoch": 0.6748339261182935, + "grad_norm": 4.420595169067383, + "learning_rate": 2.5412871261472494e-05, + "loss": 1.8169, + "step": 8965 + }, + { + "epoch": 0.6749092003989536, + "grad_norm": 3.357102394104004, + "learning_rate": 2.5402256888172282e-05, + "loss": 1.6254, + "step": 8966 + }, + { + "epoch": 0.6749844746796139, + "grad_norm": 4.532436370849609, + "learning_rate": 2.539164397713733e-05, + "loss": 1.5907, + "step": 8967 + }, + { + "epoch": 0.675059748960274, + "grad_norm": 3.719102621078491, + "learning_rate": 2.538103252899855e-05, + "loss": 1.8033, + "step": 8968 + }, + { + "epoch": 0.6751350232409341, + "grad_norm": 4.57113790512085, + "learning_rate": 2.537042254438673e-05, + "loss": 1.7598, + "step": 8969 + }, + { + "epoch": 0.6752102975215943, + "grad_norm": 3.956610918045044, + "learning_rate": 2.5359814023932637e-05, + "loss": 1.7427, + "step": 8970 + }, + { + "epoch": 0.6752855718022545, + "grad_norm": 4.250957489013672, + "learning_rate": 2.534920696826688e-05, + "loss": 1.8295, + "step": 8971 + }, + { + "epoch": 0.6753608460829146, + "grad_norm": 7.939421653747559, + "learning_rate": 2.533860137802006e-05, + "loss": 2.3515, + "step": 8972 + }, + { + "epoch": 0.6754361203635748, + "grad_norm": 6.079216957092285, + "learning_rate": 2.5327997253822605e-05, + "loss": 1.7206, + "step": 8973 + }, + { + "epoch": 0.6755113946442349, + "grad_norm": 4.681916236877441, + "learning_rate": 2.5317394596304943e-05, + "loss": 2.1824, + "step": 8974 + }, + { + "epoch": 0.675586668924895, + "grad_norm": 5.979300498962402, + "learning_rate": 2.530679340609733e-05, + "loss": 1.9406, + "step": 8975 + }, + { + "epoch": 0.6756619432055553, + "grad_norm": 4.574625015258789, + "learning_rate": 2.5296193683829987e-05, + "loss": 1.9338, + "step": 8976 + }, + { + "epoch": 0.6757372174862154, + "grad_norm": 5.333022594451904, + "learning_rate": 2.5285595430133058e-05, + "loss": 2.0642, + "step": 8977 + }, + { + "epoch": 0.6758124917668755, + "grad_norm": 4.997891902923584, + "learning_rate": 2.527499864563656e-05, + "loss": 1.9122, + "step": 8978 + }, + { + "epoch": 0.6758877660475358, + "grad_norm": 5.70123291015625, + "learning_rate": 2.526440333097047e-05, + "loss": 1.9367, + "step": 8979 + }, + { + "epoch": 0.6759630403281959, + "grad_norm": 4.904244899749756, + "learning_rate": 2.5253809486764614e-05, + "loss": 1.991, + "step": 8980 + }, + { + "epoch": 0.676038314608856, + "grad_norm": 4.200491428375244, + "learning_rate": 2.5243217113648788e-05, + "loss": 1.8222, + "step": 8981 + }, + { + "epoch": 0.6761135888895162, + "grad_norm": 5.340821266174316, + "learning_rate": 2.5232626212252657e-05, + "loss": 1.925, + "step": 8982 + }, + { + "epoch": 0.6761888631701763, + "grad_norm": 4.627140045166016, + "learning_rate": 2.522203678320585e-05, + "loss": 1.8809, + "step": 8983 + }, + { + "epoch": 0.6762641374508365, + "grad_norm": 4.27971076965332, + "learning_rate": 2.521144882713784e-05, + "loss": 2.2423, + "step": 8984 + }, + { + "epoch": 0.6763394117314966, + "grad_norm": 4.922454833984375, + "learning_rate": 2.520086234467809e-05, + "loss": 2.032, + "step": 8985 + }, + { + "epoch": 0.6764146860121568, + "grad_norm": 5.3509039878845215, + "learning_rate": 2.5190277336455902e-05, + "loss": 2.0768, + "step": 8986 + }, + { + "epoch": 0.6764899602928169, + "grad_norm": 10.495585441589355, + "learning_rate": 2.5179693803100536e-05, + "loss": 1.988, + "step": 8987 + }, + { + "epoch": 0.6765652345734771, + "grad_norm": 4.338545322418213, + "learning_rate": 2.516911174524116e-05, + "loss": 1.6176, + "step": 8988 + }, + { + "epoch": 0.6766405088541373, + "grad_norm": 5.272329330444336, + "learning_rate": 2.5158531163506854e-05, + "loss": 1.5887, + "step": 8989 + }, + { + "epoch": 0.6767157831347974, + "grad_norm": 4.774064540863037, + "learning_rate": 2.514795205852658e-05, + "loss": 1.5859, + "step": 8990 + }, + { + "epoch": 0.6767910574154575, + "grad_norm": 4.001836776733398, + "learning_rate": 2.5137374430929262e-05, + "loss": 1.9374, + "step": 8991 + }, + { + "epoch": 0.6768663316961178, + "grad_norm": 5.248881816864014, + "learning_rate": 2.5126798281343677e-05, + "loss": 2.1088, + "step": 8992 + }, + { + "epoch": 0.6769416059767779, + "grad_norm": 4.291639804840088, + "learning_rate": 2.511622361039858e-05, + "loss": 1.8804, + "step": 8993 + }, + { + "epoch": 0.677016880257438, + "grad_norm": 5.402246475219727, + "learning_rate": 2.510565041872258e-05, + "loss": 1.766, + "step": 8994 + }, + { + "epoch": 0.6770921545380982, + "grad_norm": 5.217708110809326, + "learning_rate": 2.509507870694423e-05, + "loss": 2.2748, + "step": 8995 + }, + { + "epoch": 0.6771674288187584, + "grad_norm": 4.564388751983643, + "learning_rate": 2.5084508475692008e-05, + "loss": 1.9711, + "step": 8996 + }, + { + "epoch": 0.6772427030994185, + "grad_norm": 4.961460590362549, + "learning_rate": 2.5073939725594248e-05, + "loss": 1.9904, + "step": 8997 + }, + { + "epoch": 0.6773179773800787, + "grad_norm": 3.4901316165924072, + "learning_rate": 2.5063372457279266e-05, + "loss": 1.8561, + "step": 8998 + }, + { + "epoch": 0.6773932516607388, + "grad_norm": 6.30404806137085, + "learning_rate": 2.505280667137523e-05, + "loss": 1.6779, + "step": 8999 + }, + { + "epoch": 0.677468525941399, + "grad_norm": 5.243917942047119, + "learning_rate": 2.504224236851025e-05, + "loss": 1.767, + "step": 9000 + }, + { + "epoch": 0.6775438002220592, + "grad_norm": 5.209652423858643, + "learning_rate": 2.5031679549312347e-05, + "loss": 2.1176, + "step": 9001 + }, + { + "epoch": 0.6776190745027193, + "grad_norm": 6.367055416107178, + "learning_rate": 2.5021118214409473e-05, + "loss": 1.8135, + "step": 9002 + }, + { + "epoch": 0.6776943487833794, + "grad_norm": 4.160006046295166, + "learning_rate": 2.5010558364429433e-05, + "loss": 1.9674, + "step": 9003 + }, + { + "epoch": 0.6777696230640395, + "grad_norm": 5.699466705322266, + "learning_rate": 2.500000000000001e-05, + "loss": 1.9511, + "step": 9004 + }, + { + "epoch": 0.6778448973446998, + "grad_norm": 4.277342319488525, + "learning_rate": 2.498944312174884e-05, + "loss": 1.6726, + "step": 9005 + }, + { + "epoch": 0.6779201716253599, + "grad_norm": 4.5288543701171875, + "learning_rate": 2.497888773030353e-05, + "loss": 1.5794, + "step": 9006 + }, + { + "epoch": 0.67799544590602, + "grad_norm": 4.378624439239502, + "learning_rate": 2.496833382629153e-05, + "loss": 1.7962, + "step": 9007 + }, + { + "epoch": 0.6780707201866802, + "grad_norm": 6.767026901245117, + "learning_rate": 2.495778141034028e-05, + "loss": 2.0421, + "step": 9008 + }, + { + "epoch": 0.6781459944673404, + "grad_norm": 3.869217872619629, + "learning_rate": 2.4947230483077048e-05, + "loss": 1.7959, + "step": 9009 + }, + { + "epoch": 0.6782212687480005, + "grad_norm": 4.102424621582031, + "learning_rate": 2.493668104512909e-05, + "loss": 1.8528, + "step": 9010 + }, + { + "epoch": 0.6782965430286607, + "grad_norm": 5.372541904449463, + "learning_rate": 2.492613309712352e-05, + "loss": 1.9733, + "step": 9011 + }, + { + "epoch": 0.6783718173093208, + "grad_norm": 4.331643581390381, + "learning_rate": 2.4915586639687394e-05, + "loss": 1.497, + "step": 9012 + }, + { + "epoch": 0.678447091589981, + "grad_norm": 5.577416896820068, + "learning_rate": 2.490504167344769e-05, + "loss": 1.3954, + "step": 9013 + }, + { + "epoch": 0.6785223658706412, + "grad_norm": 6.015734672546387, + "learning_rate": 2.489449819903123e-05, + "loss": 2.0559, + "step": 9014 + }, + { + "epoch": 0.6785976401513013, + "grad_norm": 4.008953094482422, + "learning_rate": 2.4883956217064834e-05, + "loss": 1.6231, + "step": 9015 + }, + { + "epoch": 0.6786729144319614, + "grad_norm": 6.033753395080566, + "learning_rate": 2.487341572817516e-05, + "loss": 2.1297, + "step": 9016 + }, + { + "epoch": 0.6787481887126217, + "grad_norm": 4.549692630767822, + "learning_rate": 2.4862876732988844e-05, + "loss": 1.8499, + "step": 9017 + }, + { + "epoch": 0.6788234629932818, + "grad_norm": 4.264279365539551, + "learning_rate": 2.485233923213236e-05, + "loss": 1.7449, + "step": 9018 + }, + { + "epoch": 0.6788987372739419, + "grad_norm": 4.737195014953613, + "learning_rate": 2.4841803226232168e-05, + "loss": 1.9426, + "step": 9019 + }, + { + "epoch": 0.6789740115546021, + "grad_norm": 6.565869331359863, + "learning_rate": 2.4831268715914568e-05, + "loss": 1.3662, + "step": 9020 + }, + { + "epoch": 0.6790492858352623, + "grad_norm": 5.90390682220459, + "learning_rate": 2.482073570180583e-05, + "loss": 2.0404, + "step": 9021 + }, + { + "epoch": 0.6791245601159224, + "grad_norm": 3.933093547821045, + "learning_rate": 2.4810204184532104e-05, + "loss": 2.0474, + "step": 9022 + }, + { + "epoch": 0.6791998343965826, + "grad_norm": 4.74329948425293, + "learning_rate": 2.4799674164719483e-05, + "loss": 1.4304, + "step": 9023 + }, + { + "epoch": 0.6792751086772427, + "grad_norm": 6.14803409576416, + "learning_rate": 2.4789145642993904e-05, + "loss": 2.17, + "step": 9024 + }, + { + "epoch": 0.6793503829579028, + "grad_norm": 5.256182670593262, + "learning_rate": 2.4778618619981296e-05, + "loss": 1.7758, + "step": 9025 + }, + { + "epoch": 0.679425657238563, + "grad_norm": 8.053637504577637, + "learning_rate": 2.4768093096307428e-05, + "loss": 2.4638, + "step": 9026 + }, + { + "epoch": 0.6795009315192232, + "grad_norm": 5.0397820472717285, + "learning_rate": 2.4757569072598025e-05, + "loss": 2.1023, + "step": 9027 + }, + { + "epoch": 0.6795762057998833, + "grad_norm": 4.719660758972168, + "learning_rate": 2.474704654947873e-05, + "loss": 1.6651, + "step": 9028 + }, + { + "epoch": 0.6796514800805434, + "grad_norm": 5.490492343902588, + "learning_rate": 2.4736525527575038e-05, + "loss": 2.0088, + "step": 9029 + }, + { + "epoch": 0.6797267543612037, + "grad_norm": 3.843472719192505, + "learning_rate": 2.4726006007512437e-05, + "loss": 2.576, + "step": 9030 + }, + { + "epoch": 0.6798020286418638, + "grad_norm": 5.109609127044678, + "learning_rate": 2.4715487989916237e-05, + "loss": 1.6954, + "step": 9031 + }, + { + "epoch": 0.6798773029225239, + "grad_norm": 4.13579797744751, + "learning_rate": 2.470497147541173e-05, + "loss": 1.6793, + "step": 9032 + }, + { + "epoch": 0.6799525772031841, + "grad_norm": 5.528165817260742, + "learning_rate": 2.469445646462409e-05, + "loss": 1.9381, + "step": 9033 + }, + { + "epoch": 0.6800278514838443, + "grad_norm": 4.903003692626953, + "learning_rate": 2.468394295817842e-05, + "loss": 1.7115, + "step": 9034 + }, + { + "epoch": 0.6801031257645044, + "grad_norm": 4.704830169677734, + "learning_rate": 2.4673430956699684e-05, + "loss": 1.7333, + "step": 9035 + }, + { + "epoch": 0.6801784000451646, + "grad_norm": 4.738926410675049, + "learning_rate": 2.466292046081282e-05, + "loss": 1.8686, + "step": 9036 + }, + { + "epoch": 0.6802536743258247, + "grad_norm": 5.204471111297607, + "learning_rate": 2.4652411471142624e-05, + "loss": 1.7435, + "step": 9037 + }, + { + "epoch": 0.6803289486064849, + "grad_norm": 4.219439506530762, + "learning_rate": 2.4641903988313852e-05, + "loss": 2.0567, + "step": 9038 + }, + { + "epoch": 0.6804042228871451, + "grad_norm": 3.5801472663879395, + "learning_rate": 2.4631398012951107e-05, + "loss": 1.8702, + "step": 9039 + }, + { + "epoch": 0.6804794971678052, + "grad_norm": 5.407886981964111, + "learning_rate": 2.462089354567898e-05, + "loss": 2.1262, + "step": 9040 + }, + { + "epoch": 0.6805547714484653, + "grad_norm": 4.085020542144775, + "learning_rate": 2.461039058712189e-05, + "loss": 1.5482, + "step": 9041 + }, + { + "epoch": 0.6806300457291256, + "grad_norm": 3.4622628688812256, + "learning_rate": 2.459988913790423e-05, + "loss": 2.2664, + "step": 9042 + }, + { + "epoch": 0.6807053200097857, + "grad_norm": 4.15674352645874, + "learning_rate": 2.4589389198650282e-05, + "loss": 1.7201, + "step": 9043 + }, + { + "epoch": 0.6807805942904458, + "grad_norm": 4.034127712249756, + "learning_rate": 2.457889076998423e-05, + "loss": 1.8085, + "step": 9044 + }, + { + "epoch": 0.6808558685711059, + "grad_norm": 5.499331474304199, + "learning_rate": 2.4568393852530202e-05, + "loss": 1.5786, + "step": 9045 + }, + { + "epoch": 0.6809311428517661, + "grad_norm": 4.728934288024902, + "learning_rate": 2.455789844691217e-05, + "loss": 1.9485, + "step": 9046 + }, + { + "epoch": 0.6810064171324263, + "grad_norm": 3.9879322052001953, + "learning_rate": 2.454740455375409e-05, + "loss": 2.0854, + "step": 9047 + }, + { + "epoch": 0.6810816914130864, + "grad_norm": 4.168044090270996, + "learning_rate": 2.453691217367976e-05, + "loss": 1.6686, + "step": 9048 + }, + { + "epoch": 0.6811569656937466, + "grad_norm": 3.6258819103240967, + "learning_rate": 2.452642130731296e-05, + "loss": 1.8926, + "step": 9049 + }, + { + "epoch": 0.6812322399744067, + "grad_norm": 5.833902835845947, + "learning_rate": 2.45159319552773e-05, + "loss": 1.8476, + "step": 9050 + }, + { + "epoch": 0.6813075142550669, + "grad_norm": 4.166037082672119, + "learning_rate": 2.450544411819638e-05, + "loss": 1.6482, + "step": 9051 + }, + { + "epoch": 0.6813827885357271, + "grad_norm": 4.751802444458008, + "learning_rate": 2.449495779669364e-05, + "loss": 1.791, + "step": 9052 + }, + { + "epoch": 0.6814580628163872, + "grad_norm": 4.893528461456299, + "learning_rate": 2.4484472991392497e-05, + "loss": 1.6024, + "step": 9053 + }, + { + "epoch": 0.6815333370970473, + "grad_norm": 4.66733455657959, + "learning_rate": 2.44739897029162e-05, + "loss": 1.5003, + "step": 9054 + }, + { + "epoch": 0.6816086113777076, + "grad_norm": 6.2677106857299805, + "learning_rate": 2.4463507931887976e-05, + "loss": 1.7657, + "step": 9055 + }, + { + "epoch": 0.6816838856583677, + "grad_norm": 8.477002143859863, + "learning_rate": 2.445302767893093e-05, + "loss": 1.86, + "step": 9056 + }, + { + "epoch": 0.6817591599390278, + "grad_norm": 6.425327777862549, + "learning_rate": 2.4442548944668107e-05, + "loss": 2.0036, + "step": 9057 + }, + { + "epoch": 0.681834434219688, + "grad_norm": 5.1553473472595215, + "learning_rate": 2.44320717297224e-05, + "loss": 1.9077, + "step": 9058 + }, + { + "epoch": 0.6819097085003482, + "grad_norm": 5.682281970977783, + "learning_rate": 2.442159603471666e-05, + "loss": 1.9357, + "step": 9059 + }, + { + "epoch": 0.6819849827810083, + "grad_norm": 4.62346887588501, + "learning_rate": 2.4411121860273666e-05, + "loss": 2.0843, + "step": 9060 + }, + { + "epoch": 0.6820602570616685, + "grad_norm": 5.829575538635254, + "learning_rate": 2.4400649207016037e-05, + "loss": 1.5117, + "step": 9061 + }, + { + "epoch": 0.6821355313423286, + "grad_norm": 4.86073637008667, + "learning_rate": 2.4390178075566373e-05, + "loss": 1.8396, + "step": 9062 + }, + { + "epoch": 0.6822108056229887, + "grad_norm": 4.742579936981201, + "learning_rate": 2.4379708466547125e-05, + "loss": 1.8558, + "step": 9063 + }, + { + "epoch": 0.6822860799036489, + "grad_norm": 4.661652565002441, + "learning_rate": 2.4369240380580716e-05, + "loss": 1.6086, + "step": 9064 + }, + { + "epoch": 0.6823613541843091, + "grad_norm": 4.343211650848389, + "learning_rate": 2.43587738182894e-05, + "loss": 1.6114, + "step": 9065 + }, + { + "epoch": 0.6824366284649692, + "grad_norm": 5.326225757598877, + "learning_rate": 2.4348308780295408e-05, + "loss": 1.8733, + "step": 9066 + }, + { + "epoch": 0.6825119027456293, + "grad_norm": 5.697606086730957, + "learning_rate": 2.433784526722086e-05, + "loss": 1.827, + "step": 9067 + }, + { + "epoch": 0.6825871770262896, + "grad_norm": 5.193353652954102, + "learning_rate": 2.43273832796878e-05, + "loss": 1.7111, + "step": 9068 + }, + { + "epoch": 0.6826624513069497, + "grad_norm": 3.550020456314087, + "learning_rate": 2.4316922818318115e-05, + "loss": 1.9625, + "step": 9069 + }, + { + "epoch": 0.6827377255876098, + "grad_norm": 4.3767852783203125, + "learning_rate": 2.4306463883733693e-05, + "loss": 1.7519, + "step": 9070 + }, + { + "epoch": 0.68281299986827, + "grad_norm": 4.287188529968262, + "learning_rate": 2.4296006476556256e-05, + "loss": 2.1424, + "step": 9071 + }, + { + "epoch": 0.6828882741489302, + "grad_norm": 4.752734661102295, + "learning_rate": 2.4285550597407503e-05, + "loss": 2.0816, + "step": 9072 + }, + { + "epoch": 0.6829635484295903, + "grad_norm": 4.153080940246582, + "learning_rate": 2.427509624690896e-05, + "loss": 1.7652, + "step": 9073 + }, + { + "epoch": 0.6830388227102505, + "grad_norm": 4.861576080322266, + "learning_rate": 2.4264643425682133e-05, + "loss": 1.8001, + "step": 9074 + }, + { + "epoch": 0.6831140969909106, + "grad_norm": 5.290843486785889, + "learning_rate": 2.4254192134348425e-05, + "loss": 1.5115, + "step": 9075 + }, + { + "epoch": 0.6831893712715708, + "grad_norm": 7.709638595581055, + "learning_rate": 2.424374237352911e-05, + "loss": 2.0455, + "step": 9076 + }, + { + "epoch": 0.683264645552231, + "grad_norm": 4.527714729309082, + "learning_rate": 2.4233294143845403e-05, + "loss": 1.9473, + "step": 9077 + }, + { + "epoch": 0.6833399198328911, + "grad_norm": 4.879895210266113, + "learning_rate": 2.4222847445918428e-05, + "loss": 1.5161, + "step": 9078 + }, + { + "epoch": 0.6834151941135512, + "grad_norm": 4.958390712738037, + "learning_rate": 2.4212402280369235e-05, + "loss": 1.7933, + "step": 9079 + }, + { + "epoch": 0.6834904683942115, + "grad_norm": 4.639610290527344, + "learning_rate": 2.4201958647818707e-05, + "loss": 1.9909, + "step": 9080 + }, + { + "epoch": 0.6835657426748716, + "grad_norm": 5.387855052947998, + "learning_rate": 2.4191516548887732e-05, + "loss": 1.7857, + "step": 9081 + }, + { + "epoch": 0.6836410169555317, + "grad_norm": 5.309325695037842, + "learning_rate": 2.4181075984197034e-05, + "loss": 1.8503, + "step": 9082 + }, + { + "epoch": 0.6837162912361918, + "grad_norm": 4.217898368835449, + "learning_rate": 2.41706369543673e-05, + "loss": 1.7031, + "step": 9083 + }, + { + "epoch": 0.683791565516852, + "grad_norm": 4.389471530914307, + "learning_rate": 2.4160199460019067e-05, + "loss": 1.8445, + "step": 9084 + }, + { + "epoch": 0.6838668397975122, + "grad_norm": 4.237769603729248, + "learning_rate": 2.4149763501772847e-05, + "loss": 1.5626, + "step": 9085 + }, + { + "epoch": 0.6839421140781723, + "grad_norm": 4.457744598388672, + "learning_rate": 2.4139329080249006e-05, + "loss": 1.4935, + "step": 9086 + }, + { + "epoch": 0.6840173883588325, + "grad_norm": 5.8319902420043945, + "learning_rate": 2.412889619606784e-05, + "loss": 1.6587, + "step": 9087 + }, + { + "epoch": 0.6840926626394926, + "grad_norm": 5.9280476570129395, + "learning_rate": 2.411846484984957e-05, + "loss": 1.734, + "step": 9088 + }, + { + "epoch": 0.6841679369201528, + "grad_norm": 4.0684332847595215, + "learning_rate": 2.4108035042214316e-05, + "loss": 1.9654, + "step": 9089 + }, + { + "epoch": 0.684243211200813, + "grad_norm": 4.031683921813965, + "learning_rate": 2.409760677378207e-05, + "loss": 1.8936, + "step": 9090 + }, + { + "epoch": 0.6843184854814731, + "grad_norm": 6.697076320648193, + "learning_rate": 2.408718004517278e-05, + "loss": 2.219, + "step": 9091 + }, + { + "epoch": 0.6843937597621332, + "grad_norm": 4.2221174240112305, + "learning_rate": 2.40767548570063e-05, + "loss": 1.5519, + "step": 9092 + }, + { + "epoch": 0.6844690340427935, + "grad_norm": 5.143556118011475, + "learning_rate": 2.4066331209902342e-05, + "loss": 1.7326, + "step": 9093 + }, + { + "epoch": 0.6845443083234536, + "grad_norm": 4.825675964355469, + "learning_rate": 2.4055909104480602e-05, + "loss": 1.8874, + "step": 9094 + }, + { + "epoch": 0.6846195826041137, + "grad_norm": 5.928195953369141, + "learning_rate": 2.4045488541360606e-05, + "loss": 1.8395, + "step": 9095 + }, + { + "epoch": 0.6846948568847739, + "grad_norm": 4.8983659744262695, + "learning_rate": 2.4035069521161864e-05, + "loss": 1.8492, + "step": 9096 + }, + { + "epoch": 0.6847701311654341, + "grad_norm": 3.9793169498443604, + "learning_rate": 2.4024652044503713e-05, + "loss": 1.4956, + "step": 9097 + }, + { + "epoch": 0.6848454054460942, + "grad_norm": 6.323598384857178, + "learning_rate": 2.4014236112005474e-05, + "loss": 1.7211, + "step": 9098 + }, + { + "epoch": 0.6849206797267544, + "grad_norm": 4.206787586212158, + "learning_rate": 2.4003821724286353e-05, + "loss": 1.7448, + "step": 9099 + }, + { + "epoch": 0.6849959540074145, + "grad_norm": 5.844894886016846, + "learning_rate": 2.3993408881965418e-05, + "loss": 1.6057, + "step": 9100 + }, + { + "epoch": 0.6850712282880747, + "grad_norm": 4.612281322479248, + "learning_rate": 2.3982997585661703e-05, + "loss": 1.7822, + "step": 9101 + }, + { + "epoch": 0.6851465025687348, + "grad_norm": 4.586778163909912, + "learning_rate": 2.397258783599415e-05, + "loss": 1.6179, + "step": 9102 + }, + { + "epoch": 0.685221776849395, + "grad_norm": 4.259588718414307, + "learning_rate": 2.396217963358155e-05, + "loss": 1.6766, + "step": 9103 + }, + { + "epoch": 0.6852970511300551, + "grad_norm": 5.42435359954834, + "learning_rate": 2.395177297904268e-05, + "loss": 1.8473, + "step": 9104 + }, + { + "epoch": 0.6853723254107152, + "grad_norm": 5.430781841278076, + "learning_rate": 2.3941367872996144e-05, + "loss": 1.7342, + "step": 9105 + }, + { + "epoch": 0.6854475996913755, + "grad_norm": 8.5025053024292, + "learning_rate": 2.393096431606054e-05, + "loss": 2.1931, + "step": 9106 + }, + { + "epoch": 0.6855228739720356, + "grad_norm": 4.162214279174805, + "learning_rate": 2.392056230885429e-05, + "loss": 1.9266, + "step": 9107 + }, + { + "epoch": 0.6855981482526957, + "grad_norm": 5.234429836273193, + "learning_rate": 2.3910161851995778e-05, + "loss": 1.9606, + "step": 9108 + }, + { + "epoch": 0.685673422533356, + "grad_norm": 4.853522300720215, + "learning_rate": 2.38997629461033e-05, + "loss": 1.7711, + "step": 9109 + }, + { + "epoch": 0.6857486968140161, + "grad_norm": 4.913171768188477, + "learning_rate": 2.388936559179501e-05, + "loss": 1.9255, + "step": 9110 + }, + { + "epoch": 0.6858239710946762, + "grad_norm": 4.298588752746582, + "learning_rate": 2.387896978968901e-05, + "loss": 1.9373, + "step": 9111 + }, + { + "epoch": 0.6858992453753364, + "grad_norm": 8.830284118652344, + "learning_rate": 2.386857554040331e-05, + "loss": 1.7534, + "step": 9112 + }, + { + "epoch": 0.6859745196559965, + "grad_norm": 5.750880718231201, + "learning_rate": 2.3858182844555832e-05, + "loss": 1.7645, + "step": 9113 + }, + { + "epoch": 0.6860497939366567, + "grad_norm": 4.981136798858643, + "learning_rate": 2.3847791702764362e-05, + "loss": 1.6493, + "step": 9114 + }, + { + "epoch": 0.6861250682173169, + "grad_norm": 4.677420616149902, + "learning_rate": 2.3837402115646647e-05, + "loss": 2.0298, + "step": 9115 + }, + { + "epoch": 0.686200342497977, + "grad_norm": 4.720395565032959, + "learning_rate": 2.3827014083820292e-05, + "loss": 1.4519, + "step": 9116 + }, + { + "epoch": 0.6862756167786371, + "grad_norm": 4.0699968338012695, + "learning_rate": 2.381662760790287e-05, + "loss": 1.4495, + "step": 9117 + }, + { + "epoch": 0.6863508910592974, + "grad_norm": 3.9409024715423584, + "learning_rate": 2.3806242688511786e-05, + "loss": 1.8021, + "step": 9118 + }, + { + "epoch": 0.6864261653399575, + "grad_norm": 8.845688819885254, + "learning_rate": 2.379585932626444e-05, + "loss": 2.0402, + "step": 9119 + }, + { + "epoch": 0.6865014396206176, + "grad_norm": 5.33734655380249, + "learning_rate": 2.378547752177805e-05, + "loss": 1.6132, + "step": 9120 + }, + { + "epoch": 0.6865767139012778, + "grad_norm": 5.080447673797607, + "learning_rate": 2.37750972756698e-05, + "loss": 1.7566, + "step": 9121 + }, + { + "epoch": 0.686651988181938, + "grad_norm": 6.031424045562744, + "learning_rate": 2.3764718588556773e-05, + "loss": 1.9316, + "step": 9122 + }, + { + "epoch": 0.6867272624625981, + "grad_norm": 8.825675010681152, + "learning_rate": 2.3754341461055947e-05, + "loss": 1.4095, + "step": 9123 + }, + { + "epoch": 0.6868025367432582, + "grad_norm": 5.342495441436768, + "learning_rate": 2.3743965893784225e-05, + "loss": 1.7483, + "step": 9124 + }, + { + "epoch": 0.6868778110239184, + "grad_norm": 4.1590142250061035, + "learning_rate": 2.3733591887358385e-05, + "loss": 1.4803, + "step": 9125 + }, + { + "epoch": 0.6869530853045785, + "grad_norm": 4.0848846435546875, + "learning_rate": 2.3723219442395156e-05, + "loss": 1.9417, + "step": 9126 + }, + { + "epoch": 0.6870283595852387, + "grad_norm": 6.331859111785889, + "learning_rate": 2.3712848559511112e-05, + "loss": 2.1291, + "step": 9127 + }, + { + "epoch": 0.6871036338658989, + "grad_norm": 3.932668924331665, + "learning_rate": 2.3702479239322818e-05, + "loss": 1.4551, + "step": 9128 + }, + { + "epoch": 0.687178908146559, + "grad_norm": 7.706721305847168, + "learning_rate": 2.369211148244666e-05, + "loss": 1.7745, + "step": 9129 + }, + { + "epoch": 0.6872541824272191, + "grad_norm": 5.1186676025390625, + "learning_rate": 2.3681745289499002e-05, + "loss": 2.1512, + "step": 9130 + }, + { + "epoch": 0.6873294567078794, + "grad_norm": 3.926673173904419, + "learning_rate": 2.3671380661096053e-05, + "loss": 1.8025, + "step": 9131 + }, + { + "epoch": 0.6874047309885395, + "grad_norm": 5.5438971519470215, + "learning_rate": 2.3661017597853987e-05, + "loss": 1.5459, + "step": 9132 + }, + { + "epoch": 0.6874800052691996, + "grad_norm": 6.186820983886719, + "learning_rate": 2.365065610038884e-05, + "loss": 1.9546, + "step": 9133 + }, + { + "epoch": 0.6875552795498598, + "grad_norm": 4.476471900939941, + "learning_rate": 2.3640296169316604e-05, + "loss": 1.78, + "step": 9134 + }, + { + "epoch": 0.68763055383052, + "grad_norm": 4.045302867889404, + "learning_rate": 2.362993780525311e-05, + "loss": 1.7067, + "step": 9135 + }, + { + "epoch": 0.6877058281111801, + "grad_norm": 6.452706813812256, + "learning_rate": 2.361958100881417e-05, + "loss": 1.5105, + "step": 9136 + }, + { + "epoch": 0.6877811023918403, + "grad_norm": 6.263516902923584, + "learning_rate": 2.360922578061542e-05, + "loss": 2.1166, + "step": 9137 + }, + { + "epoch": 0.6878563766725004, + "grad_norm": 4.467607021331787, + "learning_rate": 2.3598872121272498e-05, + "loss": 1.7217, + "step": 9138 + }, + { + "epoch": 0.6879316509531606, + "grad_norm": 4.40142297744751, + "learning_rate": 2.358852003140085e-05, + "loss": 1.8081, + "step": 9139 + }, + { + "epoch": 0.6880069252338208, + "grad_norm": 4.108103275299072, + "learning_rate": 2.3578169511615912e-05, + "loss": 1.8588, + "step": 9140 + }, + { + "epoch": 0.6880821995144809, + "grad_norm": 4.90578556060791, + "learning_rate": 2.3567820562532994e-05, + "loss": 1.6132, + "step": 9141 + }, + { + "epoch": 0.688157473795141, + "grad_norm": 4.553472518920898, + "learning_rate": 2.3557473184767287e-05, + "loss": 1.5817, + "step": 9142 + }, + { + "epoch": 0.6882327480758011, + "grad_norm": 4.37747049331665, + "learning_rate": 2.3547127378933925e-05, + "loss": 1.803, + "step": 9143 + }, + { + "epoch": 0.6883080223564614, + "grad_norm": 4.457742691040039, + "learning_rate": 2.353678314564794e-05, + "loss": 1.6272, + "step": 9144 + }, + { + "epoch": 0.6883832966371215, + "grad_norm": 4.232684135437012, + "learning_rate": 2.352644048552428e-05, + "loss": 1.9434, + "step": 9145 + }, + { + "epoch": 0.6884585709177816, + "grad_norm": 4.716200828552246, + "learning_rate": 2.351609939917776e-05, + "loss": 2.2549, + "step": 9146 + }, + { + "epoch": 0.6885338451984419, + "grad_norm": 4.657998561859131, + "learning_rate": 2.3505759887223155e-05, + "loss": 1.828, + "step": 9147 + }, + { + "epoch": 0.688609119479102, + "grad_norm": 5.67838716506958, + "learning_rate": 2.3495421950275086e-05, + "loss": 1.9363, + "step": 9148 + }, + { + "epoch": 0.6886843937597621, + "grad_norm": 8.302669525146484, + "learning_rate": 2.3485085588948153e-05, + "loss": 2.1315, + "step": 9149 + }, + { + "epoch": 0.6887596680404223, + "grad_norm": 5.238595962524414, + "learning_rate": 2.3474750803856782e-05, + "loss": 2.1672, + "step": 9150 + }, + { + "epoch": 0.6888349423210824, + "grad_norm": 4.266178131103516, + "learning_rate": 2.346441759561538e-05, + "loss": 1.8642, + "step": 9151 + }, + { + "epoch": 0.6889102166017426, + "grad_norm": 6.604078769683838, + "learning_rate": 2.34540859648382e-05, + "loss": 1.9481, + "step": 9152 + }, + { + "epoch": 0.6889854908824028, + "grad_norm": 5.05017614364624, + "learning_rate": 2.3443755912139458e-05, + "loss": 1.8284, + "step": 9153 + }, + { + "epoch": 0.6890607651630629, + "grad_norm": 4.7816162109375, + "learning_rate": 2.343342743813321e-05, + "loss": 2.0192, + "step": 9154 + }, + { + "epoch": 0.689136039443723, + "grad_norm": 4.639046669006348, + "learning_rate": 2.342310054343347e-05, + "loss": 1.9001, + "step": 9155 + }, + { + "epoch": 0.6892113137243833, + "grad_norm": 5.766395568847656, + "learning_rate": 2.3412775228654147e-05, + "loss": 2.4821, + "step": 9156 + }, + { + "epoch": 0.6892865880050434, + "grad_norm": 4.912259578704834, + "learning_rate": 2.340245149440905e-05, + "loss": 1.6332, + "step": 9157 + }, + { + "epoch": 0.6893618622857035, + "grad_norm": 4.39493465423584, + "learning_rate": 2.3392129341311914e-05, + "loss": 1.7889, + "step": 9158 + }, + { + "epoch": 0.6894371365663637, + "grad_norm": 3.6425859928131104, + "learning_rate": 2.3381808769976323e-05, + "loss": 1.9316, + "step": 9159 + }, + { + "epoch": 0.6895124108470239, + "grad_norm": 5.146086692810059, + "learning_rate": 2.3371489781015838e-05, + "loss": 2.1955, + "step": 9160 + }, + { + "epoch": 0.689587685127684, + "grad_norm": 3.5690157413482666, + "learning_rate": 2.3361172375043867e-05, + "loss": 1.7187, + "step": 9161 + }, + { + "epoch": 0.6896629594083441, + "grad_norm": 6.691940784454346, + "learning_rate": 2.335085655267378e-05, + "loss": 1.8649, + "step": 9162 + }, + { + "epoch": 0.6897382336890043, + "grad_norm": 4.657131671905518, + "learning_rate": 2.3340542314518786e-05, + "loss": 1.7588, + "step": 9163 + }, + { + "epoch": 0.6898135079696645, + "grad_norm": 5.437439441680908, + "learning_rate": 2.3330229661192077e-05, + "loss": 1.406, + "step": 9164 + }, + { + "epoch": 0.6898887822503246, + "grad_norm": 4.345561981201172, + "learning_rate": 2.3319918593306673e-05, + "loss": 1.7277, + "step": 9165 + }, + { + "epoch": 0.6899640565309848, + "grad_norm": 5.805276393890381, + "learning_rate": 2.330960911147555e-05, + "loss": 1.8672, + "step": 9166 + }, + { + "epoch": 0.6900393308116449, + "grad_norm": 4.972979545593262, + "learning_rate": 2.3299301216311586e-05, + "loss": 1.5813, + "step": 9167 + }, + { + "epoch": 0.690114605092305, + "grad_norm": 6.759117126464844, + "learning_rate": 2.328899490842757e-05, + "loss": 1.8435, + "step": 9168 + }, + { + "epoch": 0.6901898793729653, + "grad_norm": 6.600821495056152, + "learning_rate": 2.3278690188436147e-05, + "loss": 2.086, + "step": 9169 + }, + { + "epoch": 0.6902651536536254, + "grad_norm": 4.582935810089111, + "learning_rate": 2.3268387056949935e-05, + "loss": 1.7489, + "step": 9170 + }, + { + "epoch": 0.6903404279342855, + "grad_norm": 4.0333991050720215, + "learning_rate": 2.3258085514581395e-05, + "loss": 1.7854, + "step": 9171 + }, + { + "epoch": 0.6904157022149457, + "grad_norm": 5.5755085945129395, + "learning_rate": 2.324778556194294e-05, + "loss": 1.3656, + "step": 9172 + }, + { + "epoch": 0.6904909764956059, + "grad_norm": 3.8935749530792236, + "learning_rate": 2.3237487199646896e-05, + "loss": 1.7583, + "step": 9173 + }, + { + "epoch": 0.690566250776266, + "grad_norm": 4.003864288330078, + "learning_rate": 2.3227190428305425e-05, + "loss": 1.7924, + "step": 9174 + }, + { + "epoch": 0.6906415250569262, + "grad_norm": 6.712199687957764, + "learning_rate": 2.321689524853068e-05, + "loss": 1.9053, + "step": 9175 + }, + { + "epoch": 0.6907167993375863, + "grad_norm": 4.886847019195557, + "learning_rate": 2.320660166093464e-05, + "loss": 1.9904, + "step": 9176 + }, + { + "epoch": 0.6907920736182465, + "grad_norm": 6.222190856933594, + "learning_rate": 2.3196309666129263e-05, + "loss": 1.5595, + "step": 9177 + }, + { + "epoch": 0.6908673478989067, + "grad_norm": 4.808961868286133, + "learning_rate": 2.318601926472636e-05, + "loss": 2.0759, + "step": 9178 + }, + { + "epoch": 0.6909426221795668, + "grad_norm": 9.17426872253418, + "learning_rate": 2.3175730457337697e-05, + "loss": 1.9261, + "step": 9179 + }, + { + "epoch": 0.6910178964602269, + "grad_norm": 5.523231029510498, + "learning_rate": 2.316544324457487e-05, + "loss": 1.4542, + "step": 9180 + }, + { + "epoch": 0.691093170740887, + "grad_norm": 4.6755781173706055, + "learning_rate": 2.3155157627049462e-05, + "loss": 1.5624, + "step": 9181 + }, + { + "epoch": 0.6911684450215473, + "grad_norm": 4.937200546264648, + "learning_rate": 2.314487360537289e-05, + "loss": 1.9299, + "step": 9182 + }, + { + "epoch": 0.6912437193022074, + "grad_norm": 5.608829975128174, + "learning_rate": 2.3134591180156545e-05, + "loss": 1.5029, + "step": 9183 + }, + { + "epoch": 0.6913189935828675, + "grad_norm": 3.9532153606414795, + "learning_rate": 2.3124310352011652e-05, + "loss": 1.4616, + "step": 9184 + }, + { + "epoch": 0.6913942678635278, + "grad_norm": 6.401953220367432, + "learning_rate": 2.3114031121549405e-05, + "loss": 1.7897, + "step": 9185 + }, + { + "epoch": 0.6914695421441879, + "grad_norm": 5.974150657653809, + "learning_rate": 2.3103753489380853e-05, + "loss": 2.3324, + "step": 9186 + }, + { + "epoch": 0.691544816424848, + "grad_norm": 5.477693557739258, + "learning_rate": 2.3093477456116978e-05, + "loss": 1.5144, + "step": 9187 + }, + { + "epoch": 0.6916200907055082, + "grad_norm": 5.690422058105469, + "learning_rate": 2.3083203022368667e-05, + "loss": 1.7013, + "step": 9188 + }, + { + "epoch": 0.6916953649861683, + "grad_norm": 6.343783855438232, + "learning_rate": 2.30729301887467e-05, + "loss": 1.5204, + "step": 9189 + }, + { + "epoch": 0.6917706392668285, + "grad_norm": 4.771541595458984, + "learning_rate": 2.3062658955861794e-05, + "loss": 1.9007, + "step": 9190 + }, + { + "epoch": 0.6918459135474887, + "grad_norm": 5.155393600463867, + "learning_rate": 2.3052389324324504e-05, + "loss": 1.7802, + "step": 9191 + }, + { + "epoch": 0.6919211878281488, + "grad_norm": 5.258251190185547, + "learning_rate": 2.3042121294745356e-05, + "loss": 2.188, + "step": 9192 + }, + { + "epoch": 0.6919964621088089, + "grad_norm": 5.419024467468262, + "learning_rate": 2.3031854867734737e-05, + "loss": 1.7274, + "step": 9193 + }, + { + "epoch": 0.6920717363894692, + "grad_norm": 4.655803203582764, + "learning_rate": 2.302159004390298e-05, + "loss": 1.8137, + "step": 9194 + }, + { + "epoch": 0.6921470106701293, + "grad_norm": 4.632664680480957, + "learning_rate": 2.301132682386027e-05, + "loss": 1.7877, + "step": 9195 + }, + { + "epoch": 0.6922222849507894, + "grad_norm": 4.912025451660156, + "learning_rate": 2.3001065208216755e-05, + "loss": 1.7493, + "step": 9196 + }, + { + "epoch": 0.6922975592314496, + "grad_norm": 8.843585014343262, + "learning_rate": 2.2990805197582426e-05, + "loss": 1.8767, + "step": 9197 + }, + { + "epoch": 0.6923728335121098, + "grad_norm": 5.6237897872924805, + "learning_rate": 2.298054679256724e-05, + "loss": 1.9113, + "step": 9198 + }, + { + "epoch": 0.6924481077927699, + "grad_norm": 10.498770713806152, + "learning_rate": 2.2970289993781025e-05, + "loss": 2.0073, + "step": 9199 + }, + { + "epoch": 0.6925233820734301, + "grad_norm": 4.830541610717773, + "learning_rate": 2.29600348018335e-05, + "loss": 1.9106, + "step": 9200 + }, + { + "epoch": 0.6925986563540902, + "grad_norm": 3.9322240352630615, + "learning_rate": 2.2949781217334322e-05, + "loss": 1.6409, + "step": 9201 + }, + { + "epoch": 0.6926739306347504, + "grad_norm": 5.151636123657227, + "learning_rate": 2.293952924089305e-05, + "loss": 1.8522, + "step": 9202 + }, + { + "epoch": 0.6927492049154105, + "grad_norm": 4.6234130859375, + "learning_rate": 2.2929278873119103e-05, + "loss": 1.7615, + "step": 9203 + }, + { + "epoch": 0.6928244791960707, + "grad_norm": 4.71243143081665, + "learning_rate": 2.2919030114621848e-05, + "loss": 1.9931, + "step": 9204 + }, + { + "epoch": 0.6928997534767308, + "grad_norm": 5.4978346824646, + "learning_rate": 2.2908782966010574e-05, + "loss": 2.0309, + "step": 9205 + }, + { + "epoch": 0.692975027757391, + "grad_norm": 4.959962844848633, + "learning_rate": 2.2898537427894396e-05, + "loss": 1.8914, + "step": 9206 + }, + { + "epoch": 0.6930503020380512, + "grad_norm": 6.575296878814697, + "learning_rate": 2.2888293500882423e-05, + "loss": 2.2001, + "step": 9207 + }, + { + "epoch": 0.6931255763187113, + "grad_norm": 4.736110687255859, + "learning_rate": 2.2878051185583594e-05, + "loss": 1.6847, + "step": 9208 + }, + { + "epoch": 0.6932008505993714, + "grad_norm": 5.7614030838012695, + "learning_rate": 2.2867810482606816e-05, + "loss": 1.6137, + "step": 9209 + }, + { + "epoch": 0.6932761248800317, + "grad_norm": 4.224489688873291, + "learning_rate": 2.2857571392560838e-05, + "loss": 1.5964, + "step": 9210 + }, + { + "epoch": 0.6933513991606918, + "grad_norm": 5.394630432128906, + "learning_rate": 2.284733391605436e-05, + "loss": 1.9322, + "step": 9211 + }, + { + "epoch": 0.6934266734413519, + "grad_norm": 4.493192672729492, + "learning_rate": 2.283709805369597e-05, + "loss": 1.96, + "step": 9212 + }, + { + "epoch": 0.6935019477220121, + "grad_norm": 5.20558500289917, + "learning_rate": 2.282686380609418e-05, + "loss": 1.625, + "step": 9213 + }, + { + "epoch": 0.6935772220026722, + "grad_norm": 10.132219314575195, + "learning_rate": 2.281663117385735e-05, + "loss": 1.9751, + "step": 9214 + }, + { + "epoch": 0.6936524962833324, + "grad_norm": 5.027317523956299, + "learning_rate": 2.2806400157593816e-05, + "loss": 1.9312, + "step": 9215 + }, + { + "epoch": 0.6937277705639926, + "grad_norm": 7.338446140289307, + "learning_rate": 2.279617075791175e-05, + "loss": 2.1066, + "step": 9216 + }, + { + "epoch": 0.6938030448446527, + "grad_norm": 6.198676109313965, + "learning_rate": 2.2785942975419293e-05, + "loss": 1.7719, + "step": 9217 + }, + { + "epoch": 0.6938783191253128, + "grad_norm": 5.513648509979248, + "learning_rate": 2.2775716810724422e-05, + "loss": 2.3094, + "step": 9218 + }, + { + "epoch": 0.6939535934059731, + "grad_norm": 4.731198310852051, + "learning_rate": 2.276549226443509e-05, + "loss": 1.6878, + "step": 9219 + }, + { + "epoch": 0.6940288676866332, + "grad_norm": 4.2922186851501465, + "learning_rate": 2.275526933715908e-05, + "loss": 1.9455, + "step": 9220 + }, + { + "epoch": 0.6941041419672933, + "grad_norm": 9.820531845092773, + "learning_rate": 2.274504802950413e-05, + "loss": 1.9895, + "step": 9221 + }, + { + "epoch": 0.6941794162479534, + "grad_norm": 5.64205265045166, + "learning_rate": 2.2734828342077875e-05, + "loss": 1.9654, + "step": 9222 + }, + { + "epoch": 0.6942546905286137, + "grad_norm": 4.4885687828063965, + "learning_rate": 2.2724610275487844e-05, + "loss": 1.9458, + "step": 9223 + }, + { + "epoch": 0.6943299648092738, + "grad_norm": 5.2593770027160645, + "learning_rate": 2.2714393830341483e-05, + "loss": 1.9413, + "step": 9224 + }, + { + "epoch": 0.6944052390899339, + "grad_norm": 5.17440938949585, + "learning_rate": 2.2704179007246107e-05, + "loss": 1.6935, + "step": 9225 + }, + { + "epoch": 0.6944805133705941, + "grad_norm": 4.944150447845459, + "learning_rate": 2.269396580680898e-05, + "loss": 1.5861, + "step": 9226 + }, + { + "epoch": 0.6945557876512543, + "grad_norm": 4.647194862365723, + "learning_rate": 2.268375422963722e-05, + "loss": 1.7095, + "step": 9227 + }, + { + "epoch": 0.6946310619319144, + "grad_norm": 6.682551383972168, + "learning_rate": 2.2673544276337915e-05, + "loss": 1.8163, + "step": 9228 + }, + { + "epoch": 0.6947063362125746, + "grad_norm": 5.43378210067749, + "learning_rate": 2.2663335947517973e-05, + "loss": 1.8934, + "step": 9229 + }, + { + "epoch": 0.6947816104932347, + "grad_norm": 4.208450794219971, + "learning_rate": 2.265312924378429e-05, + "loss": 1.7279, + "step": 9230 + }, + { + "epoch": 0.6948568847738948, + "grad_norm": 11.26501750946045, + "learning_rate": 2.2642924165743584e-05, + "loss": 1.81, + "step": 9231 + }, + { + "epoch": 0.6949321590545551, + "grad_norm": 4.731900215148926, + "learning_rate": 2.2632720714002542e-05, + "loss": 1.949, + "step": 9232 + }, + { + "epoch": 0.6950074333352152, + "grad_norm": 7.432902812957764, + "learning_rate": 2.2622518889167725e-05, + "loss": 1.8894, + "step": 9233 + }, + { + "epoch": 0.6950827076158753, + "grad_norm": 7.676168441772461, + "learning_rate": 2.2612318691845628e-05, + "loss": 1.9798, + "step": 9234 + }, + { + "epoch": 0.6951579818965355, + "grad_norm": 4.9224090576171875, + "learning_rate": 2.2602120122642585e-05, + "loss": 1.4561, + "step": 9235 + }, + { + "epoch": 0.6952332561771957, + "grad_norm": 3.893045663833618, + "learning_rate": 2.259192318216488e-05, + "loss": 1.9676, + "step": 9236 + }, + { + "epoch": 0.6953085304578558, + "grad_norm": 6.134203910827637, + "learning_rate": 2.2581727871018726e-05, + "loss": 2.0661, + "step": 9237 + }, + { + "epoch": 0.695383804738516, + "grad_norm": 3.6348259449005127, + "learning_rate": 2.257153418981015e-05, + "loss": 1.9444, + "step": 9238 + }, + { + "epoch": 0.6954590790191761, + "grad_norm": 4.642723083496094, + "learning_rate": 2.256134213914519e-05, + "loss": 2.0677, + "step": 9239 + }, + { + "epoch": 0.6955343532998363, + "grad_norm": 5.865072727203369, + "learning_rate": 2.2551151719629693e-05, + "loss": 1.9967, + "step": 9240 + }, + { + "epoch": 0.6956096275804964, + "grad_norm": 5.17803430557251, + "learning_rate": 2.254096293186948e-05, + "loss": 2.0331, + "step": 9241 + }, + { + "epoch": 0.6956849018611566, + "grad_norm": 3.7839152812957764, + "learning_rate": 2.253077577647022e-05, + "loss": 1.9727, + "step": 9242 + }, + { + "epoch": 0.6957601761418167, + "grad_norm": 5.9798054695129395, + "learning_rate": 2.2520590254037516e-05, + "loss": 2.0086, + "step": 9243 + }, + { + "epoch": 0.6958354504224769, + "grad_norm": 5.171562671661377, + "learning_rate": 2.251040636517688e-05, + "loss": 1.9071, + "step": 9244 + }, + { + "epoch": 0.6959107247031371, + "grad_norm": 5.818791389465332, + "learning_rate": 2.2500224110493717e-05, + "loss": 1.8327, + "step": 9245 + }, + { + "epoch": 0.6959859989837972, + "grad_norm": 4.878377914428711, + "learning_rate": 2.2490043490593315e-05, + "loss": 1.7789, + "step": 9246 + }, + { + "epoch": 0.6960612732644573, + "grad_norm": 6.003511905670166, + "learning_rate": 2.247986450608091e-05, + "loss": 2.2119, + "step": 9247 + }, + { + "epoch": 0.6961365475451176, + "grad_norm": 4.520191669464111, + "learning_rate": 2.2469687157561576e-05, + "loss": 1.7008, + "step": 9248 + }, + { + "epoch": 0.6962118218257777, + "grad_norm": 7.781517505645752, + "learning_rate": 2.2459511445640362e-05, + "loss": 2.1897, + "step": 9249 + }, + { + "epoch": 0.6962870961064378, + "grad_norm": 4.689202785491943, + "learning_rate": 2.2449337370922158e-05, + "loss": 2.0805, + "step": 9250 + }, + { + "epoch": 0.696362370387098, + "grad_norm": 6.558923721313477, + "learning_rate": 2.2439164934011815e-05, + "loss": 2.0677, + "step": 9251 + }, + { + "epoch": 0.6964376446677581, + "grad_norm": 8.834991455078125, + "learning_rate": 2.2428994135514015e-05, + "loss": 1.8102, + "step": 9252 + }, + { + "epoch": 0.6965129189484183, + "grad_norm": 5.984163761138916, + "learning_rate": 2.2418824976033397e-05, + "loss": 1.8725, + "step": 9253 + }, + { + "epoch": 0.6965881932290785, + "grad_norm": 4.9911346435546875, + "learning_rate": 2.240865745617452e-05, + "loss": 1.7862, + "step": 9254 + }, + { + "epoch": 0.6966634675097386, + "grad_norm": 5.888492584228516, + "learning_rate": 2.239849157654177e-05, + "loss": 1.7328, + "step": 9255 + }, + { + "epoch": 0.6967387417903987, + "grad_norm": 4.192131042480469, + "learning_rate": 2.23883273377395e-05, + "loss": 1.4821, + "step": 9256 + }, + { + "epoch": 0.696814016071059, + "grad_norm": 3.871842384338379, + "learning_rate": 2.2378164740371936e-05, + "loss": 1.7818, + "step": 9257 + }, + { + "epoch": 0.6968892903517191, + "grad_norm": 4.933742046356201, + "learning_rate": 2.2368003785043253e-05, + "loss": 1.9669, + "step": 9258 + }, + { + "epoch": 0.6969645646323792, + "grad_norm": 4.600714683532715, + "learning_rate": 2.2357844472357432e-05, + "loss": 1.4692, + "step": 9259 + }, + { + "epoch": 0.6970398389130393, + "grad_norm": 5.010395050048828, + "learning_rate": 2.234768680291846e-05, + "loss": 2.1681, + "step": 9260 + }, + { + "epoch": 0.6971151131936996, + "grad_norm": 4.872340202331543, + "learning_rate": 2.233753077733015e-05, + "loss": 1.9431, + "step": 9261 + }, + { + "epoch": 0.6971903874743597, + "grad_norm": 4.049929618835449, + "learning_rate": 2.232737639619628e-05, + "loss": 1.7836, + "step": 9262 + }, + { + "epoch": 0.6972656617550198, + "grad_norm": 6.61251974105835, + "learning_rate": 2.2317223660120464e-05, + "loss": 1.7196, + "step": 9263 + }, + { + "epoch": 0.69734093603568, + "grad_norm": 4.973830699920654, + "learning_rate": 2.230707256970629e-05, + "loss": 1.9496, + "step": 9264 + }, + { + "epoch": 0.6974162103163402, + "grad_norm": 5.179134845733643, + "learning_rate": 2.2296923125557167e-05, + "loss": 1.4152, + "step": 9265 + }, + { + "epoch": 0.6974914845970003, + "grad_norm": 5.262170791625977, + "learning_rate": 2.2286775328276482e-05, + "loss": 1.526, + "step": 9266 + }, + { + "epoch": 0.6975667588776605, + "grad_norm": 5.391235828399658, + "learning_rate": 2.2276629178467483e-05, + "loss": 1.7981, + "step": 9267 + }, + { + "epoch": 0.6976420331583206, + "grad_norm": 4.867656707763672, + "learning_rate": 2.2266484676733328e-05, + "loss": 1.6827, + "step": 9268 + }, + { + "epoch": 0.6977173074389807, + "grad_norm": 4.321715354919434, + "learning_rate": 2.2256341823677106e-05, + "loss": 2.0111, + "step": 9269 + }, + { + "epoch": 0.697792581719641, + "grad_norm": 5.91670560836792, + "learning_rate": 2.224620061990174e-05, + "loss": 2.2746, + "step": 9270 + }, + { + "epoch": 0.6978678560003011, + "grad_norm": 3.856194019317627, + "learning_rate": 2.2236061066010127e-05, + "loss": 1.3904, + "step": 9271 + }, + { + "epoch": 0.6979431302809612, + "grad_norm": 4.685153007507324, + "learning_rate": 2.2225923162605006e-05, + "loss": 1.8802, + "step": 9272 + }, + { + "epoch": 0.6980184045616215, + "grad_norm": 7.412583827972412, + "learning_rate": 2.2215786910289076e-05, + "loss": 1.8379, + "step": 9273 + }, + { + "epoch": 0.6980936788422816, + "grad_norm": 4.549882411956787, + "learning_rate": 2.2205652309664876e-05, + "loss": 1.7944, + "step": 9274 + }, + { + "epoch": 0.6981689531229417, + "grad_norm": 6.999424934387207, + "learning_rate": 2.219551936133491e-05, + "loss": 1.6265, + "step": 9275 + }, + { + "epoch": 0.6982442274036019, + "grad_norm": 5.160253524780273, + "learning_rate": 2.2185388065901518e-05, + "loss": 1.922, + "step": 9276 + }, + { + "epoch": 0.698319501684262, + "grad_norm": 4.559370517730713, + "learning_rate": 2.2175258423967003e-05, + "loss": 1.5671, + "step": 9277 + }, + { + "epoch": 0.6983947759649222, + "grad_norm": 3.975020408630371, + "learning_rate": 2.2165130436133535e-05, + "loss": 1.9614, + "step": 9278 + }, + { + "epoch": 0.6984700502455823, + "grad_norm": 4.767719745635986, + "learning_rate": 2.2155004103003206e-05, + "loss": 1.7707, + "step": 9279 + }, + { + "epoch": 0.6985453245262425, + "grad_norm": 4.912820339202881, + "learning_rate": 2.2144879425177977e-05, + "loss": 2.1508, + "step": 9280 + }, + { + "epoch": 0.6986205988069026, + "grad_norm": 6.164126873016357, + "learning_rate": 2.213475640325976e-05, + "loss": 2.0283, + "step": 9281 + }, + { + "epoch": 0.6986958730875628, + "grad_norm": 5.1750712394714355, + "learning_rate": 2.21246350378503e-05, + "loss": 1.8956, + "step": 9282 + }, + { + "epoch": 0.698771147368223, + "grad_norm": 4.71627950668335, + "learning_rate": 2.2114515329551315e-05, + "loss": 1.6123, + "step": 9283 + }, + { + "epoch": 0.6988464216488831, + "grad_norm": 6.4594244956970215, + "learning_rate": 2.2104397278964372e-05, + "loss": 2.0762, + "step": 9284 + }, + { + "epoch": 0.6989216959295432, + "grad_norm": 4.881511211395264, + "learning_rate": 2.2094280886690967e-05, + "loss": 1.858, + "step": 9285 + }, + { + "epoch": 0.6989969702102035, + "grad_norm": 6.458160877227783, + "learning_rate": 2.2084166153332508e-05, + "loss": 1.8266, + "step": 9286 + }, + { + "epoch": 0.6990722444908636, + "grad_norm": 5.805734157562256, + "learning_rate": 2.2074053079490255e-05, + "loss": 1.6864, + "step": 9287 + }, + { + "epoch": 0.6991475187715237, + "grad_norm": 5.414831161499023, + "learning_rate": 2.206394166576542e-05, + "loss": 1.6567, + "step": 9288 + }, + { + "epoch": 0.6992227930521839, + "grad_norm": 4.865509033203125, + "learning_rate": 2.2053831912759093e-05, + "loss": 1.95, + "step": 9289 + }, + { + "epoch": 0.699298067332844, + "grad_norm": 3.63382887840271, + "learning_rate": 2.2043723821072292e-05, + "loss": 1.7686, + "step": 9290 + }, + { + "epoch": 0.6993733416135042, + "grad_norm": 6.146778583526611, + "learning_rate": 2.2033617391305876e-05, + "loss": 2.2629, + "step": 9291 + }, + { + "epoch": 0.6994486158941644, + "grad_norm": 4.015007495880127, + "learning_rate": 2.2023512624060678e-05, + "loss": 1.4133, + "step": 9292 + }, + { + "epoch": 0.6995238901748245, + "grad_norm": 7.483290672302246, + "learning_rate": 2.201340951993736e-05, + "loss": 1.7149, + "step": 9293 + }, + { + "epoch": 0.6995991644554846, + "grad_norm": 6.540182590484619, + "learning_rate": 2.2003308079536572e-05, + "loss": 1.8213, + "step": 9294 + }, + { + "epoch": 0.6996744387361449, + "grad_norm": 4.4751057624816895, + "learning_rate": 2.1993208303458762e-05, + "loss": 2.0089, + "step": 9295 + }, + { + "epoch": 0.699749713016805, + "grad_norm": 4.6467084884643555, + "learning_rate": 2.1983110192304374e-05, + "loss": 2.1168, + "step": 9296 + }, + { + "epoch": 0.6998249872974651, + "grad_norm": 4.232470512390137, + "learning_rate": 2.1973013746673682e-05, + "loss": 1.7915, + "step": 9297 + }, + { + "epoch": 0.6999002615781253, + "grad_norm": 5.053588390350342, + "learning_rate": 2.19629189671669e-05, + "loss": 2.2114, + "step": 9298 + }, + { + "epoch": 0.6999755358587855, + "grad_norm": 5.083146095275879, + "learning_rate": 2.195282585438414e-05, + "loss": 1.6179, + "step": 9299 + }, + { + "epoch": 0.7000508101394456, + "grad_norm": 5.449784278869629, + "learning_rate": 2.1942734408925425e-05, + "loss": 1.6886, + "step": 9300 + }, + { + "epoch": 0.7001260844201057, + "grad_norm": 5.43645715713501, + "learning_rate": 2.1932644631390625e-05, + "loss": 1.641, + "step": 9301 + }, + { + "epoch": 0.7002013587007659, + "grad_norm": 10.92508602142334, + "learning_rate": 2.1922556522379573e-05, + "loss": 2.171, + "step": 9302 + }, + { + "epoch": 0.7002766329814261, + "grad_norm": 6.381450176239014, + "learning_rate": 2.191247008249198e-05, + "loss": 1.9996, + "step": 9303 + }, + { + "epoch": 0.7003519072620862, + "grad_norm": 4.0064473152160645, + "learning_rate": 2.190238531232744e-05, + "loss": 1.758, + "step": 9304 + }, + { + "epoch": 0.7004271815427464, + "grad_norm": 5.375153064727783, + "learning_rate": 2.189230221248549e-05, + "loss": 1.6912, + "step": 9305 + }, + { + "epoch": 0.7005024558234065, + "grad_norm": 5.514552116394043, + "learning_rate": 2.18822207835655e-05, + "loss": 1.669, + "step": 9306 + }, + { + "epoch": 0.7005777301040667, + "grad_norm": 4.428057670593262, + "learning_rate": 2.1872141026166825e-05, + "loss": 2.0135, + "step": 9307 + }, + { + "epoch": 0.7006530043847269, + "grad_norm": 4.594017505645752, + "learning_rate": 2.1862062940888638e-05, + "loss": 1.6488, + "step": 9308 + }, + { + "epoch": 0.700728278665387, + "grad_norm": 5.2304277420043945, + "learning_rate": 2.1851986528330094e-05, + "loss": 1.5904, + "step": 9309 + }, + { + "epoch": 0.7008035529460471, + "grad_norm": 4.830564022064209, + "learning_rate": 2.1841911789090163e-05, + "loss": 2.1304, + "step": 9310 + }, + { + "epoch": 0.7008788272267074, + "grad_norm": 4.99942684173584, + "learning_rate": 2.183183872376779e-05, + "loss": 1.5426, + "step": 9311 + }, + { + "epoch": 0.7009541015073675, + "grad_norm": 4.5725908279418945, + "learning_rate": 2.1821767332961772e-05, + "loss": 2.0446, + "step": 9312 + }, + { + "epoch": 0.7010293757880276, + "grad_norm": 5.205332279205322, + "learning_rate": 2.1811697617270854e-05, + "loss": 2.5718, + "step": 9313 + }, + { + "epoch": 0.7011046500686878, + "grad_norm": 4.680220603942871, + "learning_rate": 2.180162957729362e-05, + "loss": 1.7823, + "step": 9314 + }, + { + "epoch": 0.701179924349348, + "grad_norm": 4.48225212097168, + "learning_rate": 2.1791563213628606e-05, + "loss": 1.9265, + "step": 9315 + }, + { + "epoch": 0.7012551986300081, + "grad_norm": 5.262147903442383, + "learning_rate": 2.1781498526874205e-05, + "loss": 1.9798, + "step": 9316 + }, + { + "epoch": 0.7013304729106683, + "grad_norm": 5.336678981781006, + "learning_rate": 2.177143551762875e-05, + "loss": 1.5556, + "step": 9317 + }, + { + "epoch": 0.7014057471913284, + "grad_norm": 5.078948020935059, + "learning_rate": 2.176137418649048e-05, + "loss": 2.076, + "step": 9318 + }, + { + "epoch": 0.7014810214719885, + "grad_norm": 5.765548229217529, + "learning_rate": 2.175131453405746e-05, + "loss": 1.9074, + "step": 9319 + }, + { + "epoch": 0.7015562957526487, + "grad_norm": 3.9967434406280518, + "learning_rate": 2.1741256560927763e-05, + "loss": 1.6182, + "step": 9320 + }, + { + "epoch": 0.7016315700333089, + "grad_norm": 4.858323574066162, + "learning_rate": 2.173120026769926e-05, + "loss": 1.9258, + "step": 9321 + }, + { + "epoch": 0.701706844313969, + "grad_norm": 4.378138065338135, + "learning_rate": 2.172114565496979e-05, + "loss": 1.7107, + "step": 9322 + }, + { + "epoch": 0.7017821185946291, + "grad_norm": 4.164498805999756, + "learning_rate": 2.171109272333706e-05, + "loss": 1.818, + "step": 9323 + }, + { + "epoch": 0.7018573928752894, + "grad_norm": 4.1086320877075195, + "learning_rate": 2.1701041473398725e-05, + "loss": 2.1962, + "step": 9324 + }, + { + "epoch": 0.7019326671559495, + "grad_norm": 4.772989273071289, + "learning_rate": 2.1690991905752254e-05, + "loss": 1.9398, + "step": 9325 + }, + { + "epoch": 0.7020079414366096, + "grad_norm": 4.492087364196777, + "learning_rate": 2.168094402099511e-05, + "loss": 1.8252, + "step": 9326 + }, + { + "epoch": 0.7020832157172698, + "grad_norm": 4.383864879608154, + "learning_rate": 2.1670897819724562e-05, + "loss": 2.2872, + "step": 9327 + }, + { + "epoch": 0.70215848999793, + "grad_norm": 4.97402811050415, + "learning_rate": 2.1660853302537874e-05, + "loss": 1.8907, + "step": 9328 + }, + { + "epoch": 0.7022337642785901, + "grad_norm": 4.69892692565918, + "learning_rate": 2.165081047003213e-05, + "loss": 1.8133, + "step": 9329 + }, + { + "epoch": 0.7023090385592503, + "grad_norm": 3.776805877685547, + "learning_rate": 2.164076932280437e-05, + "loss": 1.8718, + "step": 9330 + }, + { + "epoch": 0.7023843128399104, + "grad_norm": 7.233206272125244, + "learning_rate": 2.1630729861451493e-05, + "loss": 1.7294, + "step": 9331 + }, + { + "epoch": 0.7024595871205705, + "grad_norm": 4.274624824523926, + "learning_rate": 2.162069208657032e-05, + "loss": 1.4394, + "step": 9332 + }, + { + "epoch": 0.7025348614012308, + "grad_norm": 4.72705602645874, + "learning_rate": 2.161065599875757e-05, + "loss": 1.7681, + "step": 9333 + }, + { + "epoch": 0.7026101356818909, + "grad_norm": 4.526385307312012, + "learning_rate": 2.1600621598609865e-05, + "loss": 1.8165, + "step": 9334 + }, + { + "epoch": 0.702685409962551, + "grad_norm": 5.886971950531006, + "learning_rate": 2.159058888672374e-05, + "loss": 1.7325, + "step": 9335 + }, + { + "epoch": 0.7027606842432113, + "grad_norm": 4.353018283843994, + "learning_rate": 2.1580557863695573e-05, + "loss": 1.5334, + "step": 9336 + }, + { + "epoch": 0.7028359585238714, + "grad_norm": 4.670185089111328, + "learning_rate": 2.157052853012171e-05, + "loss": 1.8523, + "step": 9337 + }, + { + "epoch": 0.7029112328045315, + "grad_norm": 8.775164604187012, + "learning_rate": 2.156050088659834e-05, + "loss": 2.1419, + "step": 9338 + }, + { + "epoch": 0.7029865070851916, + "grad_norm": 4.2399396896362305, + "learning_rate": 2.1550474933721605e-05, + "loss": 1.9868, + "step": 9339 + }, + { + "epoch": 0.7030617813658518, + "grad_norm": 4.301163673400879, + "learning_rate": 2.1540450672087482e-05, + "loss": 2.0163, + "step": 9340 + }, + { + "epoch": 0.703137055646512, + "grad_norm": 5.607899188995361, + "learning_rate": 2.1530428102291927e-05, + "loss": 2.165, + "step": 9341 + }, + { + "epoch": 0.7032123299271721, + "grad_norm": 5.423983097076416, + "learning_rate": 2.152040722493072e-05, + "loss": 1.805, + "step": 9342 + }, + { + "epoch": 0.7032876042078323, + "grad_norm": 6.00930118560791, + "learning_rate": 2.1510388040599584e-05, + "loss": 1.9848, + "step": 9343 + }, + { + "epoch": 0.7033628784884924, + "grad_norm": 6.215510845184326, + "learning_rate": 2.1500370549894135e-05, + "loss": 1.8873, + "step": 9344 + }, + { + "epoch": 0.7034381527691526, + "grad_norm": 5.830587387084961, + "learning_rate": 2.1490354753409892e-05, + "loss": 1.6592, + "step": 9345 + }, + { + "epoch": 0.7035134270498128, + "grad_norm": 6.64855432510376, + "learning_rate": 2.148034065174224e-05, + "loss": 1.7833, + "step": 9346 + }, + { + "epoch": 0.7035887013304729, + "grad_norm": 4.266633987426758, + "learning_rate": 2.1470328245486533e-05, + "loss": 1.7095, + "step": 9347 + }, + { + "epoch": 0.703663975611133, + "grad_norm": 4.774324893951416, + "learning_rate": 2.1460317535237922e-05, + "loss": 1.9698, + "step": 9348 + }, + { + "epoch": 0.7037392498917933, + "grad_norm": 6.444777011871338, + "learning_rate": 2.145030852159155e-05, + "loss": 1.5569, + "step": 9349 + }, + { + "epoch": 0.7038145241724534, + "grad_norm": 4.9345784187316895, + "learning_rate": 2.1440301205142433e-05, + "loss": 1.8729, + "step": 9350 + }, + { + "epoch": 0.7038897984531135, + "grad_norm": 5.914407730102539, + "learning_rate": 2.1430295586485443e-05, + "loss": 1.873, + "step": 9351 + }, + { + "epoch": 0.7039650727337737, + "grad_norm": 4.735976696014404, + "learning_rate": 2.1420291666215426e-05, + "loss": 1.6529, + "step": 9352 + }, + { + "epoch": 0.7040403470144339, + "grad_norm": 3.983957052230835, + "learning_rate": 2.1410289444927045e-05, + "loss": 1.4232, + "step": 9353 + }, + { + "epoch": 0.704115621295094, + "grad_norm": 4.668962478637695, + "learning_rate": 2.140028892321494e-05, + "loss": 1.5796, + "step": 9354 + }, + { + "epoch": 0.7041908955757542, + "grad_norm": 6.854010105133057, + "learning_rate": 2.1390290101673583e-05, + "loss": 1.8398, + "step": 9355 + }, + { + "epoch": 0.7042661698564143, + "grad_norm": 3.9459760189056396, + "learning_rate": 2.1380292980897388e-05, + "loss": 1.787, + "step": 9356 + }, + { + "epoch": 0.7043414441370744, + "grad_norm": 5.830298900604248, + "learning_rate": 2.137029756148065e-05, + "loss": 1.6778, + "step": 9357 + }, + { + "epoch": 0.7044167184177346, + "grad_norm": 5.088885307312012, + "learning_rate": 2.13603038440176e-05, + "loss": 1.6525, + "step": 9358 + }, + { + "epoch": 0.7044919926983948, + "grad_norm": 4.401206970214844, + "learning_rate": 2.1350311829102288e-05, + "loss": 1.9909, + "step": 9359 + }, + { + "epoch": 0.7045672669790549, + "grad_norm": 5.848589897155762, + "learning_rate": 2.1340321517328754e-05, + "loss": 1.584, + "step": 9360 + }, + { + "epoch": 0.704642541259715, + "grad_norm": 5.143894672393799, + "learning_rate": 2.133033290929085e-05, + "loss": 1.6961, + "step": 9361 + }, + { + "epoch": 0.7047178155403753, + "grad_norm": 7.149900913238525, + "learning_rate": 2.132034600558241e-05, + "loss": 1.8017, + "step": 9362 + }, + { + "epoch": 0.7047930898210354, + "grad_norm": 6.186385631561279, + "learning_rate": 2.1310360806797092e-05, + "loss": 2.0581, + "step": 9363 + }, + { + "epoch": 0.7048683641016955, + "grad_norm": 3.7667362689971924, + "learning_rate": 2.1300377313528524e-05, + "loss": 2.2862, + "step": 9364 + }, + { + "epoch": 0.7049436383823557, + "grad_norm": 5.4394683837890625, + "learning_rate": 2.1290395526370155e-05, + "loss": 1.9493, + "step": 9365 + }, + { + "epoch": 0.7050189126630159, + "grad_norm": 3.6797165870666504, + "learning_rate": 2.1280415445915397e-05, + "loss": 1.7459, + "step": 9366 + }, + { + "epoch": 0.705094186943676, + "grad_norm": 4.405654430389404, + "learning_rate": 2.1270437072757538e-05, + "loss": 2.0053, + "step": 9367 + }, + { + "epoch": 0.7051694612243362, + "grad_norm": 3.4335784912109375, + "learning_rate": 2.1260460407489758e-05, + "loss": 1.8559, + "step": 9368 + }, + { + "epoch": 0.7052447355049963, + "grad_norm": 4.21601676940918, + "learning_rate": 2.125048545070516e-05, + "loss": 2.3365, + "step": 9369 + }, + { + "epoch": 0.7053200097856565, + "grad_norm": 4.850498676300049, + "learning_rate": 2.1240512202996695e-05, + "loss": 1.9836, + "step": 9370 + }, + { + "epoch": 0.7053952840663167, + "grad_norm": 4.358738422393799, + "learning_rate": 2.123054066495727e-05, + "loss": 1.7281, + "step": 9371 + }, + { + "epoch": 0.7054705583469768, + "grad_norm": 5.4170732498168945, + "learning_rate": 2.1220570837179643e-05, + "loss": 1.8855, + "step": 9372 + }, + { + "epoch": 0.7055458326276369, + "grad_norm": 4.894082546234131, + "learning_rate": 2.1210602720256522e-05, + "loss": 1.6757, + "step": 9373 + }, + { + "epoch": 0.7056211069082972, + "grad_norm": 4.747068881988525, + "learning_rate": 2.1200636314780442e-05, + "loss": 1.604, + "step": 9374 + }, + { + "epoch": 0.7056963811889573, + "grad_norm": 5.248499393463135, + "learning_rate": 2.1190671621343916e-05, + "loss": 1.642, + "step": 9375 + }, + { + "epoch": 0.7057716554696174, + "grad_norm": 4.000081539154053, + "learning_rate": 2.118070864053928e-05, + "loss": 1.8255, + "step": 9376 + }, + { + "epoch": 0.7058469297502775, + "grad_norm": 4.539329528808594, + "learning_rate": 2.1170747372958828e-05, + "loss": 2.0555, + "step": 9377 + }, + { + "epoch": 0.7059222040309377, + "grad_norm": 4.026722431182861, + "learning_rate": 2.1160787819194715e-05, + "loss": 1.7773, + "step": 9378 + }, + { + "epoch": 0.7059974783115979, + "grad_norm": 6.095086574554443, + "learning_rate": 2.1150829979839043e-05, + "loss": 2.4515, + "step": 9379 + }, + { + "epoch": 0.706072752592258, + "grad_norm": 7.773655414581299, + "learning_rate": 2.1140873855483733e-05, + "loss": 1.6775, + "step": 9380 + }, + { + "epoch": 0.7061480268729182, + "grad_norm": 4.406905174255371, + "learning_rate": 2.1130919446720667e-05, + "loss": 1.6362, + "step": 9381 + }, + { + "epoch": 0.7062233011535783, + "grad_norm": 10.06230640411377, + "learning_rate": 2.112096675414162e-05, + "loss": 1.8967, + "step": 9382 + }, + { + "epoch": 0.7062985754342385, + "grad_norm": 4.2922797203063965, + "learning_rate": 2.1111015778338215e-05, + "loss": 1.7575, + "step": 9383 + }, + { + "epoch": 0.7063738497148987, + "grad_norm": 4.28108549118042, + "learning_rate": 2.110106651990205e-05, + "loss": 1.5583, + "step": 9384 + }, + { + "epoch": 0.7064491239955588, + "grad_norm": 4.165553092956543, + "learning_rate": 2.109111897942455e-05, + "loss": 1.8619, + "step": 9385 + }, + { + "epoch": 0.7065243982762189, + "grad_norm": 4.458963394165039, + "learning_rate": 2.1081173157497086e-05, + "loss": 1.7861, + "step": 9386 + }, + { + "epoch": 0.7065996725568792, + "grad_norm": 5.161516189575195, + "learning_rate": 2.107122905471089e-05, + "loss": 2.3224, + "step": 9387 + }, + { + "epoch": 0.7066749468375393, + "grad_norm": 6.569104194641113, + "learning_rate": 2.1061286671657116e-05, + "loss": 1.8312, + "step": 9388 + }, + { + "epoch": 0.7067502211181994, + "grad_norm": 5.383288383483887, + "learning_rate": 2.1051346008926815e-05, + "loss": 1.7546, + "step": 9389 + }, + { + "epoch": 0.7068254953988596, + "grad_norm": 4.633711338043213, + "learning_rate": 2.1041407067110953e-05, + "loss": 1.7025, + "step": 9390 + }, + { + "epoch": 0.7069007696795198, + "grad_norm": 5.346320152282715, + "learning_rate": 2.1031469846800327e-05, + "loss": 2.0629, + "step": 9391 + }, + { + "epoch": 0.7069760439601799, + "grad_norm": 6.171438217163086, + "learning_rate": 2.1021534348585718e-05, + "loss": 1.841, + "step": 9392 + }, + { + "epoch": 0.7070513182408401, + "grad_norm": 7.632879257202148, + "learning_rate": 2.1011600573057722e-05, + "loss": 1.7749, + "step": 9393 + }, + { + "epoch": 0.7071265925215002, + "grad_norm": 4.841483116149902, + "learning_rate": 2.1001668520806917e-05, + "loss": 1.7153, + "step": 9394 + }, + { + "epoch": 0.7072018668021604, + "grad_norm": 4.584401607513428, + "learning_rate": 2.0991738192423695e-05, + "loss": 1.6034, + "step": 9395 + }, + { + "epoch": 0.7072771410828206, + "grad_norm": 4.426340579986572, + "learning_rate": 2.0981809588498418e-05, + "loss": 1.6639, + "step": 9396 + }, + { + "epoch": 0.7073524153634807, + "grad_norm": 4.835805416107178, + "learning_rate": 2.0971882709621288e-05, + "loss": 2.2606, + "step": 9397 + }, + { + "epoch": 0.7074276896441408, + "grad_norm": 6.390164852142334, + "learning_rate": 2.0961957556382434e-05, + "loss": 1.9499, + "step": 9398 + }, + { + "epoch": 0.7075029639248009, + "grad_norm": 3.7949137687683105, + "learning_rate": 2.0952034129371884e-05, + "loss": 1.6472, + "step": 9399 + }, + { + "epoch": 0.7075782382054612, + "grad_norm": 4.826058864593506, + "learning_rate": 2.0942112429179556e-05, + "loss": 2.2777, + "step": 9400 + }, + { + "epoch": 0.7076535124861213, + "grad_norm": 5.310537815093994, + "learning_rate": 2.0932192456395288e-05, + "loss": 1.84, + "step": 9401 + }, + { + "epoch": 0.7077287867667814, + "grad_norm": 5.666449069976807, + "learning_rate": 2.0922274211608756e-05, + "loss": 1.8859, + "step": 9402 + }, + { + "epoch": 0.7078040610474416, + "grad_norm": 4.422976016998291, + "learning_rate": 2.09123576954096e-05, + "loss": 1.4833, + "step": 9403 + }, + { + "epoch": 0.7078793353281018, + "grad_norm": 5.033746719360352, + "learning_rate": 2.090244290838731e-05, + "loss": 1.602, + "step": 9404 + }, + { + "epoch": 0.7079546096087619, + "grad_norm": 4.90593147277832, + "learning_rate": 2.089252985113131e-05, + "loss": 1.6487, + "step": 9405 + }, + { + "epoch": 0.7080298838894221, + "grad_norm": 4.035327434539795, + "learning_rate": 2.0882618524230875e-05, + "loss": 1.6887, + "step": 9406 + }, + { + "epoch": 0.7081051581700822, + "grad_norm": 4.4100661277771, + "learning_rate": 2.087270892827524e-05, + "loss": 1.9973, + "step": 9407 + }, + { + "epoch": 0.7081804324507424, + "grad_norm": 9.177873611450195, + "learning_rate": 2.0862801063853466e-05, + "loss": 2.2082, + "step": 9408 + }, + { + "epoch": 0.7082557067314026, + "grad_norm": 5.6072211265563965, + "learning_rate": 2.0852894931554586e-05, + "loss": 1.7898, + "step": 9409 + }, + { + "epoch": 0.7083309810120627, + "grad_norm": 4.769901275634766, + "learning_rate": 2.0842990531967444e-05, + "loss": 1.7207, + "step": 9410 + }, + { + "epoch": 0.7084062552927228, + "grad_norm": 5.831104278564453, + "learning_rate": 2.0833087865680855e-05, + "loss": 2.3435, + "step": 9411 + }, + { + "epoch": 0.7084815295733831, + "grad_norm": 5.289443492889404, + "learning_rate": 2.0823186933283513e-05, + "loss": 1.6374, + "step": 9412 + }, + { + "epoch": 0.7085568038540432, + "grad_norm": 5.817784309387207, + "learning_rate": 2.081328773536398e-05, + "loss": 2.1398, + "step": 9413 + }, + { + "epoch": 0.7086320781347033, + "grad_norm": 4.402369976043701, + "learning_rate": 2.0803390272510764e-05, + "loss": 1.6205, + "step": 9414 + }, + { + "epoch": 0.7087073524153635, + "grad_norm": 5.286246299743652, + "learning_rate": 2.079349454531221e-05, + "loss": 1.8302, + "step": 9415 + }, + { + "epoch": 0.7087826266960237, + "grad_norm": 4.683670997619629, + "learning_rate": 2.0783600554356615e-05, + "loss": 1.5501, + "step": 9416 + }, + { + "epoch": 0.7088579009766838, + "grad_norm": 4.5598063468933105, + "learning_rate": 2.0773708300232118e-05, + "loss": 1.9481, + "step": 9417 + }, + { + "epoch": 0.7089331752573439, + "grad_norm": 5.628935813903809, + "learning_rate": 2.0763817783526824e-05, + "loss": 1.6645, + "step": 9418 + }, + { + "epoch": 0.7090084495380041, + "grad_norm": 5.669116020202637, + "learning_rate": 2.0753929004828654e-05, + "loss": 1.6413, + "step": 9419 + }, + { + "epoch": 0.7090837238186642, + "grad_norm": 5.136711120605469, + "learning_rate": 2.0744041964725508e-05, + "loss": 1.4798, + "step": 9420 + }, + { + "epoch": 0.7091589980993244, + "grad_norm": 6.001125812530518, + "learning_rate": 2.0734156663805103e-05, + "loss": 1.8518, + "step": 9421 + }, + { + "epoch": 0.7092342723799846, + "grad_norm": 6.930325508117676, + "learning_rate": 2.072427310265511e-05, + "loss": 1.9453, + "step": 9422 + }, + { + "epoch": 0.7093095466606447, + "grad_norm": 4.790477275848389, + "learning_rate": 2.071439128186309e-05, + "loss": 1.9081, + "step": 9423 + }, + { + "epoch": 0.7093848209413048, + "grad_norm": 6.618725299835205, + "learning_rate": 2.0704511202016485e-05, + "loss": 2.0128, + "step": 9424 + }, + { + "epoch": 0.7094600952219651, + "grad_norm": 5.4649271965026855, + "learning_rate": 2.0694632863702618e-05, + "loss": 1.6879, + "step": 9425 + }, + { + "epoch": 0.7095353695026252, + "grad_norm": 5.447333335876465, + "learning_rate": 2.0684756267508755e-05, + "loss": 1.9284, + "step": 9426 + }, + { + "epoch": 0.7096106437832853, + "grad_norm": 4.140979766845703, + "learning_rate": 2.0674881414022002e-05, + "loss": 1.5889, + "step": 9427 + }, + { + "epoch": 0.7096859180639455, + "grad_norm": 4.824054718017578, + "learning_rate": 2.066500830382942e-05, + "loss": 1.7807, + "step": 9428 + }, + { + "epoch": 0.7097611923446057, + "grad_norm": 4.64283561706543, + "learning_rate": 2.0655136937517906e-05, + "loss": 1.951, + "step": 9429 + }, + { + "epoch": 0.7098364666252658, + "grad_norm": 4.86910343170166, + "learning_rate": 2.0645267315674306e-05, + "loss": 1.773, + "step": 9430 + }, + { + "epoch": 0.709911740905926, + "grad_norm": 5.271092891693115, + "learning_rate": 2.063539943888535e-05, + "loss": 2.2061, + "step": 9431 + }, + { + "epoch": 0.7099870151865861, + "grad_norm": 4.815905570983887, + "learning_rate": 2.062553330773762e-05, + "loss": 2.1344, + "step": 9432 + }, + { + "epoch": 0.7100622894672463, + "grad_norm": 4.7830491065979, + "learning_rate": 2.0615668922817655e-05, + "loss": 1.9825, + "step": 9433 + }, + { + "epoch": 0.7101375637479065, + "grad_norm": 6.408608436584473, + "learning_rate": 2.0605806284711858e-05, + "loss": 1.6837, + "step": 9434 + }, + { + "epoch": 0.7102128380285666, + "grad_norm": 4.737805366516113, + "learning_rate": 2.0595945394006554e-05, + "loss": 1.6237, + "step": 9435 + }, + { + "epoch": 0.7102881123092267, + "grad_norm": 4.678584575653076, + "learning_rate": 2.0586086251287907e-05, + "loss": 2.0914, + "step": 9436 + }, + { + "epoch": 0.7103633865898868, + "grad_norm": 5.823984146118164, + "learning_rate": 2.0576228857142056e-05, + "loss": 1.4322, + "step": 9437 + }, + { + "epoch": 0.7104386608705471, + "grad_norm": 5.274572372436523, + "learning_rate": 2.0566373212154956e-05, + "loss": 1.9612, + "step": 9438 + }, + { + "epoch": 0.7105139351512072, + "grad_norm": 4.736893653869629, + "learning_rate": 2.0556519316912526e-05, + "loss": 1.9395, + "step": 9439 + }, + { + "epoch": 0.7105892094318673, + "grad_norm": 4.674492835998535, + "learning_rate": 2.0546667172000524e-05, + "loss": 1.6518, + "step": 9440 + }, + { + "epoch": 0.7106644837125276, + "grad_norm": 4.059900760650635, + "learning_rate": 2.0536816778004665e-05, + "loss": 1.452, + "step": 9441 + }, + { + "epoch": 0.7107397579931877, + "grad_norm": 5.007218360900879, + "learning_rate": 2.0526968135510493e-05, + "loss": 1.7674, + "step": 9442 + }, + { + "epoch": 0.7108150322738478, + "grad_norm": 4.747188568115234, + "learning_rate": 2.05171212451035e-05, + "loss": 1.6817, + "step": 9443 + }, + { + "epoch": 0.710890306554508, + "grad_norm": 5.4470744132995605, + "learning_rate": 2.050727610736905e-05, + "loss": 2.077, + "step": 9444 + }, + { + "epoch": 0.7109655808351681, + "grad_norm": 6.768156051635742, + "learning_rate": 2.0497432722892414e-05, + "loss": 1.9952, + "step": 9445 + }, + { + "epoch": 0.7110408551158283, + "grad_norm": 3.845045804977417, + "learning_rate": 2.0487591092258763e-05, + "loss": 1.8572, + "step": 9446 + }, + { + "epoch": 0.7111161293964885, + "grad_norm": 5.5883917808532715, + "learning_rate": 2.0477751216053126e-05, + "loss": 1.7998, + "step": 9447 + }, + { + "epoch": 0.7111914036771486, + "grad_norm": 4.677051544189453, + "learning_rate": 2.0467913094860487e-05, + "loss": 1.9686, + "step": 9448 + }, + { + "epoch": 0.7112666779578087, + "grad_norm": 6.796509265899658, + "learning_rate": 2.045807672926566e-05, + "loss": 1.8988, + "step": 9449 + }, + { + "epoch": 0.711341952238469, + "grad_norm": 3.9641034603118896, + "learning_rate": 2.0448242119853427e-05, + "loss": 1.7176, + "step": 9450 + }, + { + "epoch": 0.7114172265191291, + "grad_norm": 5.551223278045654, + "learning_rate": 2.0438409267208387e-05, + "loss": 2.0897, + "step": 9451 + }, + { + "epoch": 0.7114925007997892, + "grad_norm": 5.59065580368042, + "learning_rate": 2.0428578171915107e-05, + "loss": 1.8998, + "step": 9452 + }, + { + "epoch": 0.7115677750804494, + "grad_norm": 5.542457103729248, + "learning_rate": 2.0418748834557995e-05, + "loss": 1.7374, + "step": 9453 + }, + { + "epoch": 0.7116430493611096, + "grad_norm": 4.429009914398193, + "learning_rate": 2.040892125572138e-05, + "loss": 1.8505, + "step": 9454 + }, + { + "epoch": 0.7117183236417697, + "grad_norm": 4.183013439178467, + "learning_rate": 2.0399095435989514e-05, + "loss": 1.7087, + "step": 9455 + }, + { + "epoch": 0.7117935979224298, + "grad_norm": 6.364041805267334, + "learning_rate": 2.038927137594647e-05, + "loss": 1.8004, + "step": 9456 + }, + { + "epoch": 0.71186887220309, + "grad_norm": 4.549670219421387, + "learning_rate": 2.0379449076176276e-05, + "loss": 1.877, + "step": 9457 + }, + { + "epoch": 0.7119441464837502, + "grad_norm": 6.37774658203125, + "learning_rate": 2.0369628537262863e-05, + "loss": 1.7572, + "step": 9458 + }, + { + "epoch": 0.7120194207644103, + "grad_norm": 5.045804023742676, + "learning_rate": 2.035980975979e-05, + "loss": 1.9443, + "step": 9459 + }, + { + "epoch": 0.7120946950450705, + "grad_norm": 4.541904449462891, + "learning_rate": 2.034999274434142e-05, + "loss": 1.7683, + "step": 9460 + }, + { + "epoch": 0.7121699693257306, + "grad_norm": 5.265013694763184, + "learning_rate": 2.034017749150067e-05, + "loss": 2.1479, + "step": 9461 + }, + { + "epoch": 0.7122452436063907, + "grad_norm": 4.115753173828125, + "learning_rate": 2.0330364001851264e-05, + "loss": 1.6876, + "step": 9462 + }, + { + "epoch": 0.712320517887051, + "grad_norm": 4.402643203735352, + "learning_rate": 2.032055227597661e-05, + "loss": 2.11, + "step": 9463 + }, + { + "epoch": 0.7123957921677111, + "grad_norm": 4.408322811126709, + "learning_rate": 2.0310742314459945e-05, + "loss": 1.9864, + "step": 9464 + }, + { + "epoch": 0.7124710664483712, + "grad_norm": 3.5569489002227783, + "learning_rate": 2.030093411788448e-05, + "loss": 1.6481, + "step": 9465 + }, + { + "epoch": 0.7125463407290314, + "grad_norm": 5.080889701843262, + "learning_rate": 2.0291127686833244e-05, + "loss": 1.6673, + "step": 9466 + }, + { + "epoch": 0.7126216150096916, + "grad_norm": 3.850985527038574, + "learning_rate": 2.028132302188922e-05, + "loss": 1.6713, + "step": 9467 + }, + { + "epoch": 0.7126968892903517, + "grad_norm": 4.193704605102539, + "learning_rate": 2.027152012363528e-05, + "loss": 1.9128, + "step": 9468 + }, + { + "epoch": 0.7127721635710119, + "grad_norm": 5.77872896194458, + "learning_rate": 2.0261718992654176e-05, + "loss": 1.7688, + "step": 9469 + }, + { + "epoch": 0.712847437851672, + "grad_norm": 4.845407009124756, + "learning_rate": 2.0251919629528538e-05, + "loss": 1.7781, + "step": 9470 + }, + { + "epoch": 0.7129227121323322, + "grad_norm": 3.637472629547119, + "learning_rate": 2.0242122034840932e-05, + "loss": 1.4049, + "step": 9471 + }, + { + "epoch": 0.7129979864129924, + "grad_norm": 4.914551258087158, + "learning_rate": 2.0232326209173774e-05, + "loss": 1.8648, + "step": 9472 + }, + { + "epoch": 0.7130732606936525, + "grad_norm": 4.670671463012695, + "learning_rate": 2.022253215310943e-05, + "loss": 1.8214, + "step": 9473 + }, + { + "epoch": 0.7131485349743126, + "grad_norm": 5.561976432800293, + "learning_rate": 2.0212739867230086e-05, + "loss": 1.8266, + "step": 9474 + }, + { + "epoch": 0.7132238092549729, + "grad_norm": 4.460993766784668, + "learning_rate": 2.0202949352117905e-05, + "loss": 1.6355, + "step": 9475 + }, + { + "epoch": 0.713299083535633, + "grad_norm": 8.429607391357422, + "learning_rate": 2.019316060835487e-05, + "loss": 1.7074, + "step": 9476 + }, + { + "epoch": 0.7133743578162931, + "grad_norm": 4.11983585357666, + "learning_rate": 2.0183373636522912e-05, + "loss": 1.5695, + "step": 9477 + }, + { + "epoch": 0.7134496320969532, + "grad_norm": 5.1690263748168945, + "learning_rate": 2.0173588437203838e-05, + "loss": 1.9743, + "step": 9478 + }, + { + "epoch": 0.7135249063776135, + "grad_norm": 5.531281471252441, + "learning_rate": 2.016380501097935e-05, + "loss": 1.9025, + "step": 9479 + }, + { + "epoch": 0.7136001806582736, + "grad_norm": 4.883945941925049, + "learning_rate": 2.015402335843107e-05, + "loss": 1.7413, + "step": 9480 + }, + { + "epoch": 0.7136754549389337, + "grad_norm": 4.979677200317383, + "learning_rate": 2.0144243480140433e-05, + "loss": 1.7923, + "step": 9481 + }, + { + "epoch": 0.7137507292195939, + "grad_norm": 4.138659477233887, + "learning_rate": 2.0134465376688877e-05, + "loss": 1.5244, + "step": 9482 + }, + { + "epoch": 0.713826003500254, + "grad_norm": 11.317208290100098, + "learning_rate": 2.0124689048657646e-05, + "loss": 2.1525, + "step": 9483 + }, + { + "epoch": 0.7139012777809142, + "grad_norm": 5.241421222686768, + "learning_rate": 2.0114914496627944e-05, + "loss": 1.9196, + "step": 9484 + }, + { + "epoch": 0.7139765520615744, + "grad_norm": 5.762665271759033, + "learning_rate": 2.010514172118081e-05, + "loss": 1.9304, + "step": 9485 + }, + { + "epoch": 0.7140518263422345, + "grad_norm": 4.502680778503418, + "learning_rate": 2.009537072289724e-05, + "loss": 2.0734, + "step": 9486 + }, + { + "epoch": 0.7141271006228946, + "grad_norm": 5.363126277923584, + "learning_rate": 2.0085601502358052e-05, + "loss": 1.6732, + "step": 9487 + }, + { + "epoch": 0.7142023749035549, + "grad_norm": 4.757355690002441, + "learning_rate": 2.0075834060144027e-05, + "loss": 1.7796, + "step": 9488 + }, + { + "epoch": 0.714277649184215, + "grad_norm": 4.702545642852783, + "learning_rate": 2.0066068396835803e-05, + "loss": 1.9702, + "step": 9489 + }, + { + "epoch": 0.7143529234648751, + "grad_norm": 6.604905605316162, + "learning_rate": 2.005630451301394e-05, + "loss": 1.8952, + "step": 9490 + }, + { + "epoch": 0.7144281977455353, + "grad_norm": 4.455034255981445, + "learning_rate": 2.004654240925884e-05, + "loss": 1.6805, + "step": 9491 + }, + { + "epoch": 0.7145034720261955, + "grad_norm": 4.227853775024414, + "learning_rate": 2.0036782086150862e-05, + "loss": 1.5846, + "step": 9492 + }, + { + "epoch": 0.7145787463068556, + "grad_norm": 6.459262847900391, + "learning_rate": 2.0027023544270195e-05, + "loss": 1.7413, + "step": 9493 + }, + { + "epoch": 0.7146540205875158, + "grad_norm": 4.390810966491699, + "learning_rate": 2.001726678419698e-05, + "loss": 1.9777, + "step": 9494 + }, + { + "epoch": 0.7147292948681759, + "grad_norm": 5.537712097167969, + "learning_rate": 2.000751180651124e-05, + "loss": 1.9674, + "step": 9495 + }, + { + "epoch": 0.714804569148836, + "grad_norm": 5.226698398590088, + "learning_rate": 1.9997758611792855e-05, + "loss": 1.6521, + "step": 9496 + }, + { + "epoch": 0.7148798434294962, + "grad_norm": 5.052948474884033, + "learning_rate": 1.9988007200621646e-05, + "loss": 1.7016, + "step": 9497 + }, + { + "epoch": 0.7149551177101564, + "grad_norm": 4.696854591369629, + "learning_rate": 1.9978257573577275e-05, + "loss": 1.376, + "step": 9498 + }, + { + "epoch": 0.7150303919908165, + "grad_norm": 5.07398796081543, + "learning_rate": 1.9968509731239354e-05, + "loss": 2.3853, + "step": 9499 + }, + { + "epoch": 0.7151056662714766, + "grad_norm": 3.914128065109253, + "learning_rate": 1.9958763674187358e-05, + "loss": 1.883, + "step": 9500 + }, + { + "epoch": 0.7151809405521369, + "grad_norm": 6.058406829833984, + "learning_rate": 1.994901940300068e-05, + "loss": 1.6567, + "step": 9501 + }, + { + "epoch": 0.715256214832797, + "grad_norm": 5.501729488372803, + "learning_rate": 1.9939276918258554e-05, + "loss": 1.6225, + "step": 9502 + }, + { + "epoch": 0.7153314891134571, + "grad_norm": 4.761053562164307, + "learning_rate": 1.992953622054018e-05, + "loss": 2.359, + "step": 9503 + }, + { + "epoch": 0.7154067633941174, + "grad_norm": 4.187493324279785, + "learning_rate": 1.9919797310424582e-05, + "loss": 1.7836, + "step": 9504 + }, + { + "epoch": 0.7154820376747775, + "grad_norm": 6.203763008117676, + "learning_rate": 1.9910060188490738e-05, + "loss": 2.0934, + "step": 9505 + }, + { + "epoch": 0.7155573119554376, + "grad_norm": 4.951480388641357, + "learning_rate": 1.990032485531746e-05, + "loss": 1.8963, + "step": 9506 + }, + { + "epoch": 0.7156325862360978, + "grad_norm": 4.882652282714844, + "learning_rate": 1.989059131148352e-05, + "loss": 1.7381, + "step": 9507 + }, + { + "epoch": 0.7157078605167579, + "grad_norm": 5.824948787689209, + "learning_rate": 1.9880859557567517e-05, + "loss": 2.1631, + "step": 9508 + }, + { + "epoch": 0.7157831347974181, + "grad_norm": 5.714610576629639, + "learning_rate": 1.9871129594148015e-05, + "loss": 2.5054, + "step": 9509 + }, + { + "epoch": 0.7158584090780783, + "grad_norm": 4.927806854248047, + "learning_rate": 1.9861401421803382e-05, + "loss": 1.8776, + "step": 9510 + }, + { + "epoch": 0.7159336833587384, + "grad_norm": 4.6382527351379395, + "learning_rate": 1.985167504111196e-05, + "loss": 1.735, + "step": 9511 + }, + { + "epoch": 0.7160089576393985, + "grad_norm": 6.377290725708008, + "learning_rate": 1.984195045265195e-05, + "loss": 1.6773, + "step": 9512 + }, + { + "epoch": 0.7160842319200588, + "grad_norm": 4.6818108558654785, + "learning_rate": 1.983222765700146e-05, + "loss": 1.809, + "step": 9513 + }, + { + "epoch": 0.7161595062007189, + "grad_norm": 4.784926414489746, + "learning_rate": 1.9822506654738486e-05, + "loss": 1.5798, + "step": 9514 + }, + { + "epoch": 0.716234780481379, + "grad_norm": 5.976828575134277, + "learning_rate": 1.981278744644089e-05, + "loss": 1.9079, + "step": 9515 + }, + { + "epoch": 0.7163100547620391, + "grad_norm": 6.271202564239502, + "learning_rate": 1.9803070032686478e-05, + "loss": 1.4672, + "step": 9516 + }, + { + "epoch": 0.7163853290426994, + "grad_norm": 5.964905261993408, + "learning_rate": 1.979335441405289e-05, + "loss": 1.8177, + "step": 9517 + }, + { + "epoch": 0.7164606033233595, + "grad_norm": 4.530181407928467, + "learning_rate": 1.9783640591117725e-05, + "loss": 2.0666, + "step": 9518 + }, + { + "epoch": 0.7165358776040196, + "grad_norm": 5.328892230987549, + "learning_rate": 1.977392856445841e-05, + "loss": 1.9104, + "step": 9519 + }, + { + "epoch": 0.7166111518846798, + "grad_norm": 5.724018573760986, + "learning_rate": 1.9764218334652335e-05, + "loss": 1.8381, + "step": 9520 + }, + { + "epoch": 0.71668642616534, + "grad_norm": 4.3129448890686035, + "learning_rate": 1.9754509902276704e-05, + "loss": 1.7737, + "step": 9521 + }, + { + "epoch": 0.7167617004460001, + "grad_norm": 4.701634883880615, + "learning_rate": 1.9744803267908674e-05, + "loss": 1.704, + "step": 9522 + }, + { + "epoch": 0.7168369747266603, + "grad_norm": 5.569914817810059, + "learning_rate": 1.9735098432125277e-05, + "loss": 1.79, + "step": 9523 + }, + { + "epoch": 0.7169122490073204, + "grad_norm": 4.8248677253723145, + "learning_rate": 1.972539539550346e-05, + "loss": 1.7106, + "step": 9524 + }, + { + "epoch": 0.7169875232879805, + "grad_norm": 3.9295737743377686, + "learning_rate": 1.9715694158619996e-05, + "loss": 1.844, + "step": 9525 + }, + { + "epoch": 0.7170627975686408, + "grad_norm": 4.6467132568359375, + "learning_rate": 1.9705994722051624e-05, + "loss": 1.793, + "step": 9526 + }, + { + "epoch": 0.7171380718493009, + "grad_norm": 4.346904754638672, + "learning_rate": 1.969629708637496e-05, + "loss": 1.5909, + "step": 9527 + }, + { + "epoch": 0.717213346129961, + "grad_norm": 4.667377471923828, + "learning_rate": 1.968660125216646e-05, + "loss": 1.7459, + "step": 9528 + }, + { + "epoch": 0.7172886204106212, + "grad_norm": 5.513657093048096, + "learning_rate": 1.967690722000256e-05, + "loss": 1.9629, + "step": 9529 + }, + { + "epoch": 0.7173638946912814, + "grad_norm": 4.792205810546875, + "learning_rate": 1.9667214990459497e-05, + "loss": 1.8842, + "step": 9530 + }, + { + "epoch": 0.7174391689719415, + "grad_norm": 3.7832374572753906, + "learning_rate": 1.965752456411349e-05, + "loss": 1.6079, + "step": 9531 + }, + { + "epoch": 0.7175144432526017, + "grad_norm": 5.25220251083374, + "learning_rate": 1.9647835941540565e-05, + "loss": 2.1688, + "step": 9532 + }, + { + "epoch": 0.7175897175332618, + "grad_norm": 4.870365619659424, + "learning_rate": 1.9638149123316706e-05, + "loss": 1.7067, + "step": 9533 + }, + { + "epoch": 0.717664991813922, + "grad_norm": 3.4470694065093994, + "learning_rate": 1.9628464110017757e-05, + "loss": 1.9056, + "step": 9534 + }, + { + "epoch": 0.7177402660945821, + "grad_norm": 5.867374897003174, + "learning_rate": 1.9618780902219496e-05, + "loss": 2.0299, + "step": 9535 + }, + { + "epoch": 0.7178155403752423, + "grad_norm": 6.339066505432129, + "learning_rate": 1.9609099500497517e-05, + "loss": 2.0115, + "step": 9536 + }, + { + "epoch": 0.7178908146559024, + "grad_norm": 4.336844444274902, + "learning_rate": 1.959941990542739e-05, + "loss": 1.8723, + "step": 9537 + }, + { + "epoch": 0.7179660889365626, + "grad_norm": 4.932903289794922, + "learning_rate": 1.9589742117584502e-05, + "loss": 2.3154, + "step": 9538 + }, + { + "epoch": 0.7180413632172228, + "grad_norm": 4.143261432647705, + "learning_rate": 1.9580066137544208e-05, + "loss": 2.0187, + "step": 9539 + }, + { + "epoch": 0.7181166374978829, + "grad_norm": 4.2491302490234375, + "learning_rate": 1.9570391965881675e-05, + "loss": 1.5832, + "step": 9540 + }, + { + "epoch": 0.718191911778543, + "grad_norm": 5.4467949867248535, + "learning_rate": 1.9560719603172046e-05, + "loss": 1.771, + "step": 9541 + }, + { + "epoch": 0.7182671860592033, + "grad_norm": 4.698729515075684, + "learning_rate": 1.955104904999028e-05, + "loss": 1.6989, + "step": 9542 + }, + { + "epoch": 0.7183424603398634, + "grad_norm": 4.595734119415283, + "learning_rate": 1.9541380306911276e-05, + "loss": 2.1554, + "step": 9543 + }, + { + "epoch": 0.7184177346205235, + "grad_norm": 6.739954471588135, + "learning_rate": 1.9531713374509824e-05, + "loss": 1.6398, + "step": 9544 + }, + { + "epoch": 0.7184930089011837, + "grad_norm": 4.812310218811035, + "learning_rate": 1.9522048253360587e-05, + "loss": 1.9355, + "step": 9545 + }, + { + "epoch": 0.7185682831818438, + "grad_norm": 6.03534460067749, + "learning_rate": 1.9512384944038148e-05, + "loss": 2.0226, + "step": 9546 + }, + { + "epoch": 0.718643557462504, + "grad_norm": 4.772028923034668, + "learning_rate": 1.9502723447116927e-05, + "loss": 2.3704, + "step": 9547 + }, + { + "epoch": 0.7187188317431642, + "grad_norm": 6.009061813354492, + "learning_rate": 1.949306376317131e-05, + "loss": 1.5869, + "step": 9548 + }, + { + "epoch": 0.7187941060238243, + "grad_norm": 5.287210464477539, + "learning_rate": 1.9483405892775495e-05, + "loss": 1.9085, + "step": 9549 + }, + { + "epoch": 0.7188693803044844, + "grad_norm": 4.43877649307251, + "learning_rate": 1.9473749836503657e-05, + "loss": 1.271, + "step": 9550 + }, + { + "epoch": 0.7189446545851447, + "grad_norm": 4.632543087005615, + "learning_rate": 1.9464095594929788e-05, + "loss": 2.108, + "step": 9551 + }, + { + "epoch": 0.7190199288658048, + "grad_norm": 4.28782844543457, + "learning_rate": 1.9454443168627833e-05, + "loss": 1.6824, + "step": 9552 + }, + { + "epoch": 0.7190952031464649, + "grad_norm": 4.903390407562256, + "learning_rate": 1.944479255817157e-05, + "loss": 1.7923, + "step": 9553 + }, + { + "epoch": 0.719170477427125, + "grad_norm": 7.645961761474609, + "learning_rate": 1.9435143764134718e-05, + "loss": 2.0172, + "step": 9554 + }, + { + "epoch": 0.7192457517077853, + "grad_norm": 5.691678524017334, + "learning_rate": 1.942549678709087e-05, + "loss": 1.8204, + "step": 9555 + }, + { + "epoch": 0.7193210259884454, + "grad_norm": 4.54675817489624, + "learning_rate": 1.9415851627613525e-05, + "loss": 1.9056, + "step": 9556 + }, + { + "epoch": 0.7193963002691055, + "grad_norm": 4.945567607879639, + "learning_rate": 1.9406208286276024e-05, + "loss": 1.7552, + "step": 9557 + }, + { + "epoch": 0.7194715745497657, + "grad_norm": 4.792803764343262, + "learning_rate": 1.9396566763651658e-05, + "loss": 1.6443, + "step": 9558 + }, + { + "epoch": 0.7195468488304259, + "grad_norm": 4.259121417999268, + "learning_rate": 1.9386927060313607e-05, + "loss": 1.5547, + "step": 9559 + }, + { + "epoch": 0.719622123111086, + "grad_norm": 4.082737445831299, + "learning_rate": 1.9377289176834884e-05, + "loss": 2.0998, + "step": 9560 + }, + { + "epoch": 0.7196973973917462, + "grad_norm": 6.311650276184082, + "learning_rate": 1.9367653113788474e-05, + "loss": 1.7414, + "step": 9561 + }, + { + "epoch": 0.7197726716724063, + "grad_norm": 5.185740947723389, + "learning_rate": 1.9358018871747176e-05, + "loss": 1.7923, + "step": 9562 + }, + { + "epoch": 0.7198479459530664, + "grad_norm": 4.590246677398682, + "learning_rate": 1.9348386451283746e-05, + "loss": 1.6235, + "step": 9563 + }, + { + "epoch": 0.7199232202337267, + "grad_norm": 5.305851936340332, + "learning_rate": 1.933875585297078e-05, + "loss": 1.6704, + "step": 9564 + }, + { + "epoch": 0.7199984945143868, + "grad_norm": 4.344723701477051, + "learning_rate": 1.9329127077380815e-05, + "loss": 1.9337, + "step": 9565 + }, + { + "epoch": 0.7200737687950469, + "grad_norm": 6.741992950439453, + "learning_rate": 1.9319500125086228e-05, + "loss": 2.4713, + "step": 9566 + }, + { + "epoch": 0.7201490430757072, + "grad_norm": 5.562639236450195, + "learning_rate": 1.9309874996659317e-05, + "loss": 1.7452, + "step": 9567 + }, + { + "epoch": 0.7202243173563673, + "grad_norm": 8.103625297546387, + "learning_rate": 1.9300251692672283e-05, + "loss": 2.4275, + "step": 9568 + }, + { + "epoch": 0.7202995916370274, + "grad_norm": 4.430222034454346, + "learning_rate": 1.929063021369722e-05, + "loss": 1.8206, + "step": 9569 + }, + { + "epoch": 0.7203748659176876, + "grad_norm": 5.527270793914795, + "learning_rate": 1.928101056030605e-05, + "loss": 1.8373, + "step": 9570 + }, + { + "epoch": 0.7204501401983477, + "grad_norm": 4.450514793395996, + "learning_rate": 1.9271392733070686e-05, + "loss": 1.7859, + "step": 9571 + }, + { + "epoch": 0.7205254144790079, + "grad_norm": 4.106017112731934, + "learning_rate": 1.926177673256283e-05, + "loss": 1.785, + "step": 9572 + }, + { + "epoch": 0.7206006887596681, + "grad_norm": 4.553153038024902, + "learning_rate": 1.925216255935417e-05, + "loss": 1.7082, + "step": 9573 + }, + { + "epoch": 0.7206759630403282, + "grad_norm": 4.588836193084717, + "learning_rate": 1.9242550214016204e-05, + "loss": 2.0324, + "step": 9574 + }, + { + "epoch": 0.7207512373209883, + "grad_norm": 6.4269328117370605, + "learning_rate": 1.9232939697120374e-05, + "loss": 1.5279, + "step": 9575 + }, + { + "epoch": 0.7208265116016485, + "grad_norm": 4.636991500854492, + "learning_rate": 1.922333100923801e-05, + "loss": 1.8487, + "step": 9576 + }, + { + "epoch": 0.7209017858823087, + "grad_norm": 4.073963165283203, + "learning_rate": 1.92137241509403e-05, + "loss": 2.1126, + "step": 9577 + }, + { + "epoch": 0.7209770601629688, + "grad_norm": 5.726802825927734, + "learning_rate": 1.9204119122798347e-05, + "loss": 1.8359, + "step": 9578 + }, + { + "epoch": 0.7210523344436289, + "grad_norm": 6.7845377922058105, + "learning_rate": 1.919451592538315e-05, + "loss": 1.7283, + "step": 9579 + }, + { + "epoch": 0.7211276087242892, + "grad_norm": 5.296535015106201, + "learning_rate": 1.918491455926561e-05, + "loss": 1.7735, + "step": 9580 + }, + { + "epoch": 0.7212028830049493, + "grad_norm": 5.237893581390381, + "learning_rate": 1.9175315025016464e-05, + "loss": 2.0306, + "step": 9581 + }, + { + "epoch": 0.7212781572856094, + "grad_norm": 4.582086086273193, + "learning_rate": 1.9165717323206412e-05, + "loss": 1.8575, + "step": 9582 + }, + { + "epoch": 0.7213534315662696, + "grad_norm": 4.404659271240234, + "learning_rate": 1.9156121454405968e-05, + "loss": 2.1049, + "step": 9583 + }, + { + "epoch": 0.7214287058469298, + "grad_norm": 4.021386623382568, + "learning_rate": 1.914652741918563e-05, + "loss": 1.4943, + "step": 9584 + }, + { + "epoch": 0.7215039801275899, + "grad_norm": 3.68868088722229, + "learning_rate": 1.9136935218115682e-05, + "loss": 1.7914, + "step": 9585 + }, + { + "epoch": 0.7215792544082501, + "grad_norm": 6.577298164367676, + "learning_rate": 1.91273448517664e-05, + "loss": 1.7602, + "step": 9586 + }, + { + "epoch": 0.7216545286889102, + "grad_norm": 6.713287830352783, + "learning_rate": 1.9117756320707865e-05, + "loss": 1.9236, + "step": 9587 + }, + { + "epoch": 0.7217298029695703, + "grad_norm": 4.912992000579834, + "learning_rate": 1.9108169625510107e-05, + "loss": 1.7037, + "step": 9588 + }, + { + "epoch": 0.7218050772502306, + "grad_norm": 4.865211009979248, + "learning_rate": 1.9098584766743026e-05, + "loss": 1.9806, + "step": 9589 + }, + { + "epoch": 0.7218803515308907, + "grad_norm": 5.134067535400391, + "learning_rate": 1.9089001744976413e-05, + "loss": 1.8654, + "step": 9590 + }, + { + "epoch": 0.7219556258115508, + "grad_norm": 5.546294689178467, + "learning_rate": 1.907942056077997e-05, + "loss": 1.8915, + "step": 9591 + }, + { + "epoch": 0.722030900092211, + "grad_norm": 4.9889349937438965, + "learning_rate": 1.906984121472324e-05, + "loss": 1.8894, + "step": 9592 + }, + { + "epoch": 0.7221061743728712, + "grad_norm": 5.335707187652588, + "learning_rate": 1.9060263707375713e-05, + "loss": 2.0343, + "step": 9593 + }, + { + "epoch": 0.7221814486535313, + "grad_norm": 7.060311794281006, + "learning_rate": 1.9050688039306717e-05, + "loss": 1.5834, + "step": 9594 + }, + { + "epoch": 0.7222567229341914, + "grad_norm": 4.028903484344482, + "learning_rate": 1.9041114211085538e-05, + "loss": 1.6511, + "step": 9595 + }, + { + "epoch": 0.7223319972148516, + "grad_norm": 6.402505874633789, + "learning_rate": 1.9031542223281263e-05, + "loss": 1.6092, + "step": 9596 + }, + { + "epoch": 0.7224072714955118, + "grad_norm": 6.058710098266602, + "learning_rate": 1.9021972076462964e-05, + "loss": 1.8923, + "step": 9597 + }, + { + "epoch": 0.7224825457761719, + "grad_norm": 5.561830520629883, + "learning_rate": 1.9012403771199517e-05, + "loss": 1.7782, + "step": 9598 + }, + { + "epoch": 0.7225578200568321, + "grad_norm": 4.070313930511475, + "learning_rate": 1.9002837308059755e-05, + "loss": 2.0121, + "step": 9599 + }, + { + "epoch": 0.7226330943374922, + "grad_norm": 3.963165760040283, + "learning_rate": 1.8993272687612378e-05, + "loss": 1.7335, + "step": 9600 + }, + { + "epoch": 0.7227083686181524, + "grad_norm": 3.5139055252075195, + "learning_rate": 1.898370991042599e-05, + "loss": 2.0153, + "step": 9601 + }, + { + "epoch": 0.7227836428988126, + "grad_norm": 4.647072792053223, + "learning_rate": 1.897414897706903e-05, + "loss": 1.628, + "step": 9602 + }, + { + "epoch": 0.7228589171794727, + "grad_norm": 3.811833381652832, + "learning_rate": 1.8964589888109906e-05, + "loss": 1.8338, + "step": 9603 + }, + { + "epoch": 0.7229341914601328, + "grad_norm": 4.302700519561768, + "learning_rate": 1.8955032644116848e-05, + "loss": 1.5848, + "step": 9604 + }, + { + "epoch": 0.7230094657407931, + "grad_norm": 4.486805438995361, + "learning_rate": 1.8945477245658038e-05, + "loss": 1.6467, + "step": 9605 + }, + { + "epoch": 0.7230847400214532, + "grad_norm": 4.820157527923584, + "learning_rate": 1.8935923693301488e-05, + "loss": 2.0539, + "step": 9606 + }, + { + "epoch": 0.7231600143021133, + "grad_norm": 4.296202182769775, + "learning_rate": 1.8926371987615137e-05, + "loss": 1.9226, + "step": 9607 + }, + { + "epoch": 0.7232352885827735, + "grad_norm": 3.899815082550049, + "learning_rate": 1.8916822129166827e-05, + "loss": 2.0188, + "step": 9608 + }, + { + "epoch": 0.7233105628634336, + "grad_norm": 4.494897842407227, + "learning_rate": 1.8907274118524236e-05, + "loss": 1.8445, + "step": 9609 + }, + { + "epoch": 0.7233858371440938, + "grad_norm": 3.7157018184661865, + "learning_rate": 1.8897727956255007e-05, + "loss": 1.6226, + "step": 9610 + }, + { + "epoch": 0.723461111424754, + "grad_norm": 5.906121253967285, + "learning_rate": 1.888818364292659e-05, + "loss": 1.8541, + "step": 9611 + }, + { + "epoch": 0.7235363857054141, + "grad_norm": 5.056562900543213, + "learning_rate": 1.8878641179106375e-05, + "loss": 1.7235, + "step": 9612 + }, + { + "epoch": 0.7236116599860742, + "grad_norm": 5.222574234008789, + "learning_rate": 1.8869100565361657e-05, + "loss": 1.3679, + "step": 9613 + }, + { + "epoch": 0.7236869342667344, + "grad_norm": 4.466900825500488, + "learning_rate": 1.8859561802259597e-05, + "loss": 1.7114, + "step": 9614 + }, + { + "epoch": 0.7237622085473946, + "grad_norm": 5.444098472595215, + "learning_rate": 1.8850024890367224e-05, + "loss": 1.954, + "step": 9615 + }, + { + "epoch": 0.7238374828280547, + "grad_norm": 4.00093936920166, + "learning_rate": 1.88404898302515e-05, + "loss": 1.9861, + "step": 9616 + }, + { + "epoch": 0.7239127571087148, + "grad_norm": 5.224597454071045, + "learning_rate": 1.883095662247924e-05, + "loss": 1.8943, + "step": 9617 + }, + { + "epoch": 0.7239880313893751, + "grad_norm": 4.640960216522217, + "learning_rate": 1.8821425267617187e-05, + "loss": 1.9859, + "step": 9618 + }, + { + "epoch": 0.7240633056700352, + "grad_norm": 3.755033016204834, + "learning_rate": 1.8811895766231928e-05, + "loss": 1.6073, + "step": 9619 + }, + { + "epoch": 0.7241385799506953, + "grad_norm": 4.830538272857666, + "learning_rate": 1.880236811888999e-05, + "loss": 2.0587, + "step": 9620 + }, + { + "epoch": 0.7242138542313555, + "grad_norm": 4.16685676574707, + "learning_rate": 1.879284232615774e-05, + "loss": 2.0139, + "step": 9621 + }, + { + "epoch": 0.7242891285120157, + "grad_norm": 5.567697048187256, + "learning_rate": 1.8783318388601463e-05, + "loss": 1.554, + "step": 9622 + }, + { + "epoch": 0.7243644027926758, + "grad_norm": 4.302929401397705, + "learning_rate": 1.8773796306787343e-05, + "loss": 1.6313, + "step": 9623 + }, + { + "epoch": 0.724439677073336, + "grad_norm": 5.352593421936035, + "learning_rate": 1.8764276081281428e-05, + "loss": 1.971, + "step": 9624 + }, + { + "epoch": 0.7245149513539961, + "grad_norm": 3.3538074493408203, + "learning_rate": 1.87547577126497e-05, + "loss": 1.9039, + "step": 9625 + }, + { + "epoch": 0.7245902256346562, + "grad_norm": 6.26210355758667, + "learning_rate": 1.8745241201457954e-05, + "loss": 1.8066, + "step": 9626 + }, + { + "epoch": 0.7246654999153165, + "grad_norm": 3.9136078357696533, + "learning_rate": 1.8735726548271947e-05, + "loss": 1.6886, + "step": 9627 + }, + { + "epoch": 0.7247407741959766, + "grad_norm": 5.439061641693115, + "learning_rate": 1.8726213753657278e-05, + "loss": 1.982, + "step": 9628 + }, + { + "epoch": 0.7248160484766367, + "grad_norm": 5.446580410003662, + "learning_rate": 1.8716702818179487e-05, + "loss": 1.8119, + "step": 9629 + }, + { + "epoch": 0.724891322757297, + "grad_norm": 4.451972007751465, + "learning_rate": 1.8707193742403935e-05, + "loss": 1.602, + "step": 9630 + }, + { + "epoch": 0.7249665970379571, + "grad_norm": 5.40526008605957, + "learning_rate": 1.869768652689594e-05, + "loss": 2.2619, + "step": 9631 + }, + { + "epoch": 0.7250418713186172, + "grad_norm": 4.169073581695557, + "learning_rate": 1.868818117222065e-05, + "loss": 1.8202, + "step": 9632 + }, + { + "epoch": 0.7251171455992773, + "grad_norm": 7.562443733215332, + "learning_rate": 1.867867767894314e-05, + "loss": 2.2829, + "step": 9633 + }, + { + "epoch": 0.7251924198799375, + "grad_norm": 4.879371643066406, + "learning_rate": 1.8669176047628373e-05, + "loss": 1.7564, + "step": 9634 + }, + { + "epoch": 0.7252676941605977, + "grad_norm": 5.123662948608398, + "learning_rate": 1.8659676278841215e-05, + "loss": 1.8249, + "step": 9635 + }, + { + "epoch": 0.7253429684412578, + "grad_norm": 4.29550313949585, + "learning_rate": 1.8650178373146355e-05, + "loss": 2.0688, + "step": 9636 + }, + { + "epoch": 0.725418242721918, + "grad_norm": 5.675379276275635, + "learning_rate": 1.864068233110845e-05, + "loss": 1.7642, + "step": 9637 + }, + { + "epoch": 0.7254935170025781, + "grad_norm": 5.180209159851074, + "learning_rate": 1.863118815329199e-05, + "loss": 1.7766, + "step": 9638 + }, + { + "epoch": 0.7255687912832383, + "grad_norm": 4.255703926086426, + "learning_rate": 1.8621695840261388e-05, + "loss": 1.683, + "step": 9639 + }, + { + "epoch": 0.7256440655638985, + "grad_norm": 6.847949028015137, + "learning_rate": 1.8612205392580945e-05, + "loss": 1.9474, + "step": 9640 + }, + { + "epoch": 0.7257193398445586, + "grad_norm": 4.82269811630249, + "learning_rate": 1.860271681081482e-05, + "loss": 1.7569, + "step": 9641 + }, + { + "epoch": 0.7257946141252187, + "grad_norm": 4.468365669250488, + "learning_rate": 1.8593230095527104e-05, + "loss": 1.5133, + "step": 9642 + }, + { + "epoch": 0.725869888405879, + "grad_norm": 5.581293106079102, + "learning_rate": 1.8583745247281724e-05, + "loss": 2.1109, + "step": 9643 + }, + { + "epoch": 0.7259451626865391, + "grad_norm": 4.230678081512451, + "learning_rate": 1.8574262266642552e-05, + "loss": 1.8581, + "step": 9644 + }, + { + "epoch": 0.7260204369671992, + "grad_norm": 3.937662363052368, + "learning_rate": 1.8564781154173312e-05, + "loss": 1.6738, + "step": 9645 + }, + { + "epoch": 0.7260957112478594, + "grad_norm": 4.213057518005371, + "learning_rate": 1.855530191043765e-05, + "loss": 1.7502, + "step": 9646 + }, + { + "epoch": 0.7261709855285196, + "grad_norm": 4.31326150894165, + "learning_rate": 1.854582453599905e-05, + "loss": 1.363, + "step": 9647 + }, + { + "epoch": 0.7262462598091797, + "grad_norm": 5.665443420410156, + "learning_rate": 1.8536349031420953e-05, + "loss": 1.7954, + "step": 9648 + }, + { + "epoch": 0.7263215340898399, + "grad_norm": 8.148431777954102, + "learning_rate": 1.8526875397266603e-05, + "loss": 2.0201, + "step": 9649 + }, + { + "epoch": 0.7263968083705, + "grad_norm": 5.401271343231201, + "learning_rate": 1.8517403634099217e-05, + "loss": 1.5663, + "step": 9650 + }, + { + "epoch": 0.7264720826511601, + "grad_norm": 5.100400924682617, + "learning_rate": 1.850793374248184e-05, + "loss": 1.6419, + "step": 9651 + }, + { + "epoch": 0.7265473569318203, + "grad_norm": 5.196130752563477, + "learning_rate": 1.849846572297746e-05, + "loss": 2.0133, + "step": 9652 + }, + { + "epoch": 0.7266226312124805, + "grad_norm": 4.777595043182373, + "learning_rate": 1.8488999576148886e-05, + "loss": 1.9229, + "step": 9653 + }, + { + "epoch": 0.7266979054931406, + "grad_norm": 6.994591236114502, + "learning_rate": 1.8479535302558872e-05, + "loss": 2.1819, + "step": 9654 + }, + { + "epoch": 0.7267731797738007, + "grad_norm": 6.8679938316345215, + "learning_rate": 1.8470072902770037e-05, + "loss": 2.2083, + "step": 9655 + }, + { + "epoch": 0.726848454054461, + "grad_norm": 5.348995208740234, + "learning_rate": 1.846061237734492e-05, + "loss": 1.9756, + "step": 9656 + }, + { + "epoch": 0.7269237283351211, + "grad_norm": 3.789236068725586, + "learning_rate": 1.8451153726845882e-05, + "loss": 1.7119, + "step": 9657 + }, + { + "epoch": 0.7269990026157812, + "grad_norm": 6.561370849609375, + "learning_rate": 1.8441696951835235e-05, + "loss": 2.1884, + "step": 9658 + }, + { + "epoch": 0.7270742768964414, + "grad_norm": 6.422976493835449, + "learning_rate": 1.8432242052875165e-05, + "loss": 2.0826, + "step": 9659 + }, + { + "epoch": 0.7271495511771016, + "grad_norm": 5.728564739227295, + "learning_rate": 1.8422789030527714e-05, + "loss": 2.246, + "step": 9660 + }, + { + "epoch": 0.7272248254577617, + "grad_norm": 5.061489105224609, + "learning_rate": 1.8413337885354866e-05, + "loss": 1.924, + "step": 9661 + }, + { + "epoch": 0.7273000997384219, + "grad_norm": 6.333934783935547, + "learning_rate": 1.840388861791843e-05, + "loss": 1.4921, + "step": 9662 + }, + { + "epoch": 0.727375374019082, + "grad_norm": 4.267570495605469, + "learning_rate": 1.8394441228780174e-05, + "loss": 1.4773, + "step": 9663 + }, + { + "epoch": 0.7274506482997422, + "grad_norm": 5.911327838897705, + "learning_rate": 1.8384995718501685e-05, + "loss": 1.7595, + "step": 9664 + }, + { + "epoch": 0.7275259225804024, + "grad_norm": 6.126576900482178, + "learning_rate": 1.8375552087644497e-05, + "loss": 2.3626, + "step": 9665 + }, + { + "epoch": 0.7276011968610625, + "grad_norm": 4.967687606811523, + "learning_rate": 1.8366110336769987e-05, + "loss": 1.9229, + "step": 9666 + }, + { + "epoch": 0.7276764711417226, + "grad_norm": 4.9898271560668945, + "learning_rate": 1.8356670466439446e-05, + "loss": 1.6429, + "step": 9667 + }, + { + "epoch": 0.7277517454223829, + "grad_norm": 5.991692066192627, + "learning_rate": 1.8347232477214053e-05, + "loss": 1.9946, + "step": 9668 + }, + { + "epoch": 0.727827019703043, + "grad_norm": 5.030762672424316, + "learning_rate": 1.8337796369654887e-05, + "loss": 1.8292, + "step": 9669 + }, + { + "epoch": 0.7279022939837031, + "grad_norm": 4.66921329498291, + "learning_rate": 1.832836214432286e-05, + "loss": 1.7523, + "step": 9670 + }, + { + "epoch": 0.7279775682643633, + "grad_norm": 4.7627410888671875, + "learning_rate": 1.8318929801778824e-05, + "loss": 1.7108, + "step": 9671 + }, + { + "epoch": 0.7280528425450234, + "grad_norm": 4.6498308181762695, + "learning_rate": 1.8309499342583525e-05, + "loss": 1.4522, + "step": 9672 + }, + { + "epoch": 0.7281281168256836, + "grad_norm": 4.665681838989258, + "learning_rate": 1.830007076729754e-05, + "loss": 1.7993, + "step": 9673 + }, + { + "epoch": 0.7282033911063437, + "grad_norm": 4.798788547515869, + "learning_rate": 1.829064407648141e-05, + "loss": 2.2603, + "step": 9674 + }, + { + "epoch": 0.7282786653870039, + "grad_norm": 5.3134026527404785, + "learning_rate": 1.8281219270695493e-05, + "loss": 1.9267, + "step": 9675 + }, + { + "epoch": 0.728353939667664, + "grad_norm": 4.871895790100098, + "learning_rate": 1.827179635050009e-05, + "loss": 1.816, + "step": 9676 + }, + { + "epoch": 0.7284292139483242, + "grad_norm": 4.945578098297119, + "learning_rate": 1.8262375316455343e-05, + "loss": 1.8274, + "step": 9677 + }, + { + "epoch": 0.7285044882289844, + "grad_norm": 4.161609172821045, + "learning_rate": 1.8252956169121315e-05, + "loss": 1.6447, + "step": 9678 + }, + { + "epoch": 0.7285797625096445, + "grad_norm": 4.553129196166992, + "learning_rate": 1.8243538909057954e-05, + "loss": 1.8793, + "step": 9679 + }, + { + "epoch": 0.7286550367903046, + "grad_norm": 4.220051288604736, + "learning_rate": 1.82341235368251e-05, + "loss": 1.6566, + "step": 9680 + }, + { + "epoch": 0.7287303110709649, + "grad_norm": 6.925411701202393, + "learning_rate": 1.8224710052982435e-05, + "loss": 1.7613, + "step": 9681 + }, + { + "epoch": 0.728805585351625, + "grad_norm": 4.924813270568848, + "learning_rate": 1.8215298458089607e-05, + "loss": 1.8584, + "step": 9682 + }, + { + "epoch": 0.7288808596322851, + "grad_norm": 5.307071685791016, + "learning_rate": 1.8205888752706062e-05, + "loss": 2.2315, + "step": 9683 + }, + { + "epoch": 0.7289561339129453, + "grad_norm": 7.626509189605713, + "learning_rate": 1.8196480937391223e-05, + "loss": 2.2469, + "step": 9684 + }, + { + "epoch": 0.7290314081936055, + "grad_norm": 4.821575164794922, + "learning_rate": 1.8187075012704324e-05, + "loss": 1.5392, + "step": 9685 + }, + { + "epoch": 0.7291066824742656, + "grad_norm": 4.358816623687744, + "learning_rate": 1.8177670979204546e-05, + "loss": 2.1884, + "step": 9686 + }, + { + "epoch": 0.7291819567549258, + "grad_norm": 6.0131635665893555, + "learning_rate": 1.8168268837450907e-05, + "loss": 2.1701, + "step": 9687 + }, + { + "epoch": 0.7292572310355859, + "grad_norm": 4.657010555267334, + "learning_rate": 1.815886858800235e-05, + "loss": 2.1627, + "step": 9688 + }, + { + "epoch": 0.729332505316246, + "grad_norm": 5.907837390899658, + "learning_rate": 1.8149470231417686e-05, + "loss": 1.6824, + "step": 9689 + }, + { + "epoch": 0.7294077795969063, + "grad_norm": 4.379014015197754, + "learning_rate": 1.8140073768255633e-05, + "loss": 1.753, + "step": 9690 + }, + { + "epoch": 0.7294830538775664, + "grad_norm": 4.511151313781738, + "learning_rate": 1.8130679199074798e-05, + "loss": 2.2083, + "step": 9691 + }, + { + "epoch": 0.7295583281582265, + "grad_norm": 4.889375686645508, + "learning_rate": 1.8121286524433616e-05, + "loss": 1.4597, + "step": 9692 + }, + { + "epoch": 0.7296336024388866, + "grad_norm": 4.540733814239502, + "learning_rate": 1.81118957448905e-05, + "loss": 1.8036, + "step": 9693 + }, + { + "epoch": 0.7297088767195469, + "grad_norm": 4.11193323135376, + "learning_rate": 1.810250686100367e-05, + "loss": 1.7872, + "step": 9694 + }, + { + "epoch": 0.729784151000207, + "grad_norm": 5.5439629554748535, + "learning_rate": 1.8093119873331297e-05, + "loss": 1.9098, + "step": 9695 + }, + { + "epoch": 0.7298594252808671, + "grad_norm": 6.257260322570801, + "learning_rate": 1.808373478243138e-05, + "loss": 2.3189, + "step": 9696 + }, + { + "epoch": 0.7299346995615273, + "grad_norm": 4.942529201507568, + "learning_rate": 1.8074351588861876e-05, + "loss": 2.1724, + "step": 9697 + }, + { + "epoch": 0.7300099738421875, + "grad_norm": 4.4367804527282715, + "learning_rate": 1.8064970293180545e-05, + "loss": 2.2091, + "step": 9698 + }, + { + "epoch": 0.7300852481228476, + "grad_norm": 4.344898223876953, + "learning_rate": 1.805559089594509e-05, + "loss": 1.8996, + "step": 9699 + }, + { + "epoch": 0.7301605224035078, + "grad_norm": 4.409204006195068, + "learning_rate": 1.8046213397713108e-05, + "loss": 2.0279, + "step": 9700 + }, + { + "epoch": 0.7302357966841679, + "grad_norm": 8.010478019714355, + "learning_rate": 1.803683779904206e-05, + "loss": 1.8559, + "step": 9701 + }, + { + "epoch": 0.7303110709648281, + "grad_norm": 7.777335166931152, + "learning_rate": 1.8027464100489283e-05, + "loss": 1.7544, + "step": 9702 + }, + { + "epoch": 0.7303863452454883, + "grad_norm": 7.196082592010498, + "learning_rate": 1.801809230261203e-05, + "loss": 1.9297, + "step": 9703 + }, + { + "epoch": 0.7304616195261484, + "grad_norm": 4.2694878578186035, + "learning_rate": 1.800872240596743e-05, + "loss": 1.9816, + "step": 9704 + }, + { + "epoch": 0.7305368938068085, + "grad_norm": 3.8258280754089355, + "learning_rate": 1.799935441111248e-05, + "loss": 1.7919, + "step": 9705 + }, + { + "epoch": 0.7306121680874688, + "grad_norm": 6.191116809844971, + "learning_rate": 1.79899883186041e-05, + "loss": 1.7088, + "step": 9706 + }, + { + "epoch": 0.7306874423681289, + "grad_norm": 3.6788432598114014, + "learning_rate": 1.7980624128999056e-05, + "loss": 1.8622, + "step": 9707 + }, + { + "epoch": 0.730762716648789, + "grad_norm": 5.6116414070129395, + "learning_rate": 1.7971261842854048e-05, + "loss": 1.8853, + "step": 9708 + }, + { + "epoch": 0.7308379909294492, + "grad_norm": 5.4416704177856445, + "learning_rate": 1.7961901460725605e-05, + "loss": 1.7398, + "step": 9709 + }, + { + "epoch": 0.7309132652101094, + "grad_norm": 3.781179666519165, + "learning_rate": 1.7952542983170213e-05, + "loss": 1.7704, + "step": 9710 + }, + { + "epoch": 0.7309885394907695, + "grad_norm": 3.867990016937256, + "learning_rate": 1.794318641074417e-05, + "loss": 2.0546, + "step": 9711 + }, + { + "epoch": 0.7310638137714296, + "grad_norm": 5.500983715057373, + "learning_rate": 1.793383174400371e-05, + "loss": 1.7079, + "step": 9712 + }, + { + "epoch": 0.7311390880520898, + "grad_norm": 4.387624740600586, + "learning_rate": 1.7924478983504945e-05, + "loss": 1.8676, + "step": 9713 + }, + { + "epoch": 0.73121436233275, + "grad_norm": 5.4258904457092285, + "learning_rate": 1.7915128129803886e-05, + "loss": 1.71, + "step": 9714 + }, + { + "epoch": 0.7312896366134101, + "grad_norm": 4.6748199462890625, + "learning_rate": 1.7905779183456382e-05, + "loss": 1.9719, + "step": 9715 + }, + { + "epoch": 0.7313649108940703, + "grad_norm": 3.597334623336792, + "learning_rate": 1.789643214501824e-05, + "loss": 1.5988, + "step": 9716 + }, + { + "epoch": 0.7314401851747304, + "grad_norm": 5.096696376800537, + "learning_rate": 1.788708701504506e-05, + "loss": 1.83, + "step": 9717 + }, + { + "epoch": 0.7315154594553905, + "grad_norm": 4.292730331420898, + "learning_rate": 1.7877743794092444e-05, + "loss": 1.6436, + "step": 9718 + }, + { + "epoch": 0.7315907337360508, + "grad_norm": 5.707505226135254, + "learning_rate": 1.7868402482715768e-05, + "loss": 1.7773, + "step": 9719 + }, + { + "epoch": 0.7316660080167109, + "grad_norm": 4.0598978996276855, + "learning_rate": 1.7859063081470372e-05, + "loss": 1.9463, + "step": 9720 + }, + { + "epoch": 0.731741282297371, + "grad_norm": 4.274533748626709, + "learning_rate": 1.784972559091147e-05, + "loss": 1.5552, + "step": 9721 + }, + { + "epoch": 0.7318165565780312, + "grad_norm": 4.773648262023926, + "learning_rate": 1.784039001159411e-05, + "loss": 1.8461, + "step": 9722 + }, + { + "epoch": 0.7318918308586914, + "grad_norm": 4.201850891113281, + "learning_rate": 1.7831056344073293e-05, + "loss": 1.5214, + "step": 9723 + }, + { + "epoch": 0.7319671051393515, + "grad_norm": 4.602932929992676, + "learning_rate": 1.782172458890387e-05, + "loss": 1.5414, + "step": 9724 + }, + { + "epoch": 0.7320423794200117, + "grad_norm": 4.259728908538818, + "learning_rate": 1.781239474664061e-05, + "loss": 1.784, + "step": 9725 + }, + { + "epoch": 0.7321176537006718, + "grad_norm": 5.31164026260376, + "learning_rate": 1.7803066817838103e-05, + "loss": 1.758, + "step": 9726 + }, + { + "epoch": 0.732192927981332, + "grad_norm": 6.805521488189697, + "learning_rate": 1.779374080305092e-05, + "loss": 1.6763, + "step": 9727 + }, + { + "epoch": 0.7322682022619922, + "grad_norm": 6.2445597648620605, + "learning_rate": 1.7784416702833407e-05, + "loss": 1.6983, + "step": 9728 + }, + { + "epoch": 0.7323434765426523, + "grad_norm": 6.488143444061279, + "learning_rate": 1.7775094517739903e-05, + "loss": 1.7871, + "step": 9729 + }, + { + "epoch": 0.7324187508233124, + "grad_norm": 7.087213039398193, + "learning_rate": 1.776577424832455e-05, + "loss": 1.7209, + "step": 9730 + }, + { + "epoch": 0.7324940251039725, + "grad_norm": 5.352892875671387, + "learning_rate": 1.775645589514145e-05, + "loss": 1.7292, + "step": 9731 + }, + { + "epoch": 0.7325692993846328, + "grad_norm": 4.766453266143799, + "learning_rate": 1.7747139458744504e-05, + "loss": 1.8335, + "step": 9732 + }, + { + "epoch": 0.7326445736652929, + "grad_norm": 5.376926422119141, + "learning_rate": 1.773782493968758e-05, + "loss": 1.9848, + "step": 9733 + }, + { + "epoch": 0.732719847945953, + "grad_norm": 5.157107353210449, + "learning_rate": 1.772851233852439e-05, + "loss": 1.7593, + "step": 9734 + }, + { + "epoch": 0.7327951222266132, + "grad_norm": 4.221771717071533, + "learning_rate": 1.7719201655808565e-05, + "loss": 1.8166, + "step": 9735 + }, + { + "epoch": 0.7328703965072734, + "grad_norm": 5.821368217468262, + "learning_rate": 1.7709892892093554e-05, + "loss": 2.0513, + "step": 9736 + }, + { + "epoch": 0.7329456707879335, + "grad_norm": 6.510801315307617, + "learning_rate": 1.7700586047932764e-05, + "loss": 2.1425, + "step": 9737 + }, + { + "epoch": 0.7330209450685937, + "grad_norm": 5.537794589996338, + "learning_rate": 1.7691281123879472e-05, + "loss": 1.7634, + "step": 9738 + }, + { + "epoch": 0.7330962193492538, + "grad_norm": 4.1630425453186035, + "learning_rate": 1.76819781204868e-05, + "loss": 2.039, + "step": 9739 + }, + { + "epoch": 0.733171493629914, + "grad_norm": 4.676449298858643, + "learning_rate": 1.7672677038307812e-05, + "loss": 1.7387, + "step": 9740 + }, + { + "epoch": 0.7332467679105742, + "grad_norm": 4.5910234451293945, + "learning_rate": 1.7663377877895397e-05, + "loss": 1.4709, + "step": 9741 + }, + { + "epoch": 0.7333220421912343, + "grad_norm": 4.184149265289307, + "learning_rate": 1.76540806398024e-05, + "loss": 1.5164, + "step": 9742 + }, + { + "epoch": 0.7333973164718944, + "grad_norm": 4.522687911987305, + "learning_rate": 1.7644785324581487e-05, + "loss": 1.8831, + "step": 9743 + }, + { + "epoch": 0.7334725907525547, + "grad_norm": 4.9783525466918945, + "learning_rate": 1.763549193278524e-05, + "loss": 2.0299, + "step": 9744 + }, + { + "epoch": 0.7335478650332148, + "grad_norm": 5.061306953430176, + "learning_rate": 1.762620046496614e-05, + "loss": 2.1781, + "step": 9745 + }, + { + "epoch": 0.7336231393138749, + "grad_norm": 4.065035820007324, + "learning_rate": 1.7616910921676548e-05, + "loss": 2.1733, + "step": 9746 + }, + { + "epoch": 0.7336984135945351, + "grad_norm": 4.660484313964844, + "learning_rate": 1.760762330346867e-05, + "loss": 1.7157, + "step": 9747 + }, + { + "epoch": 0.7337736878751953, + "grad_norm": 5.202456951141357, + "learning_rate": 1.759833761089465e-05, + "loss": 1.5034, + "step": 9748 + }, + { + "epoch": 0.7338489621558554, + "grad_norm": 6.001447677612305, + "learning_rate": 1.7589053844506477e-05, + "loss": 1.4525, + "step": 9749 + }, + { + "epoch": 0.7339242364365156, + "grad_norm": 3.924381971359253, + "learning_rate": 1.757977200485607e-05, + "loss": 1.9319, + "step": 9750 + }, + { + "epoch": 0.7339995107171757, + "grad_norm": 3.994356870651245, + "learning_rate": 1.7570492092495183e-05, + "loss": 2.0428, + "step": 9751 + }, + { + "epoch": 0.7340747849978358, + "grad_norm": 4.514043807983398, + "learning_rate": 1.7561214107975483e-05, + "loss": 1.7472, + "step": 9752 + }, + { + "epoch": 0.734150059278496, + "grad_norm": 4.36171817779541, + "learning_rate": 1.7551938051848548e-05, + "loss": 1.7168, + "step": 9753 + }, + { + "epoch": 0.7342253335591562, + "grad_norm": 5.348937511444092, + "learning_rate": 1.7542663924665776e-05, + "loss": 1.7727, + "step": 9754 + }, + { + "epoch": 0.7343006078398163, + "grad_norm": 4.585260391235352, + "learning_rate": 1.7533391726978498e-05, + "loss": 1.4679, + "step": 9755 + }, + { + "epoch": 0.7343758821204764, + "grad_norm": 4.222022533416748, + "learning_rate": 1.752412145933793e-05, + "loss": 1.7319, + "step": 9756 + }, + { + "epoch": 0.7344511564011367, + "grad_norm": 4.450934410095215, + "learning_rate": 1.751485312229517e-05, + "loss": 1.6003, + "step": 9757 + }, + { + "epoch": 0.7345264306817968, + "grad_norm": 5.085152626037598, + "learning_rate": 1.7505586716401167e-05, + "loss": 1.7326, + "step": 9758 + }, + { + "epoch": 0.7346017049624569, + "grad_norm": 4.06351900100708, + "learning_rate": 1.7496322242206815e-05, + "loss": 1.9483, + "step": 9759 + }, + { + "epoch": 0.7346769792431171, + "grad_norm": 4.651455879211426, + "learning_rate": 1.7487059700262824e-05, + "loss": 1.6702, + "step": 9760 + }, + { + "epoch": 0.7347522535237773, + "grad_norm": 5.352548122406006, + "learning_rate": 1.7477799091119862e-05, + "loss": 2.3562, + "step": 9761 + }, + { + "epoch": 0.7348275278044374, + "grad_norm": 4.365647315979004, + "learning_rate": 1.7468540415328406e-05, + "loss": 2.031, + "step": 9762 + }, + { + "epoch": 0.7349028020850976, + "grad_norm": 5.19638204574585, + "learning_rate": 1.74592836734389e-05, + "loss": 1.5893, + "step": 9763 + }, + { + "epoch": 0.7349780763657577, + "grad_norm": 4.678807735443115, + "learning_rate": 1.745002886600159e-05, + "loss": 1.376, + "step": 9764 + }, + { + "epoch": 0.7350533506464179, + "grad_norm": 5.133519172668457, + "learning_rate": 1.744077599356669e-05, + "loss": 1.8811, + "step": 9765 + }, + { + "epoch": 0.7351286249270781, + "grad_norm": 5.054455280303955, + "learning_rate": 1.7431525056684213e-05, + "loss": 1.7761, + "step": 9766 + }, + { + "epoch": 0.7352038992077382, + "grad_norm": 4.485023021697998, + "learning_rate": 1.742227605590412e-05, + "loss": 2.095, + "step": 9767 + }, + { + "epoch": 0.7352791734883983, + "grad_norm": 4.016222953796387, + "learning_rate": 1.7413028991776242e-05, + "loss": 1.985, + "step": 9768 + }, + { + "epoch": 0.7353544477690586, + "grad_norm": 6.328843116760254, + "learning_rate": 1.7403783864850286e-05, + "loss": 1.4483, + "step": 9769 + }, + { + "epoch": 0.7354297220497187, + "grad_norm": 5.10483980178833, + "learning_rate": 1.7394540675675868e-05, + "loss": 1.6889, + "step": 9770 + }, + { + "epoch": 0.7355049963303788, + "grad_norm": 4.845269203186035, + "learning_rate": 1.7385299424802437e-05, + "loss": 1.6641, + "step": 9771 + }, + { + "epoch": 0.7355802706110389, + "grad_norm": 5.702883720397949, + "learning_rate": 1.737606011277939e-05, + "loss": 1.9045, + "step": 9772 + }, + { + "epoch": 0.7356555448916992, + "grad_norm": 4.234776020050049, + "learning_rate": 1.736682274015594e-05, + "loss": 1.524, + "step": 9773 + }, + { + "epoch": 0.7357308191723593, + "grad_norm": 6.878330230712891, + "learning_rate": 1.7357587307481267e-05, + "loss": 1.9828, + "step": 9774 + }, + { + "epoch": 0.7358060934530194, + "grad_norm": 4.601244926452637, + "learning_rate": 1.7348353815304343e-05, + "loss": 2.043, + "step": 9775 + }, + { + "epoch": 0.7358813677336796, + "grad_norm": 5.564022541046143, + "learning_rate": 1.7339122264174118e-05, + "loss": 1.8573, + "step": 9776 + }, + { + "epoch": 0.7359566420143397, + "grad_norm": 3.8240888118743896, + "learning_rate": 1.732989265463934e-05, + "loss": 1.6579, + "step": 9777 + }, + { + "epoch": 0.7360319162949999, + "grad_norm": 3.6997196674346924, + "learning_rate": 1.73206649872487e-05, + "loss": 1.6569, + "step": 9778 + }, + { + "epoch": 0.7361071905756601, + "grad_norm": 4.304880619049072, + "learning_rate": 1.7311439262550766e-05, + "loss": 1.9989, + "step": 9779 + }, + { + "epoch": 0.7361824648563202, + "grad_norm": 4.956548690795898, + "learning_rate": 1.7302215481093985e-05, + "loss": 2.0161, + "step": 9780 + }, + { + "epoch": 0.7362577391369803, + "grad_norm": 4.1241350173950195, + "learning_rate": 1.7292993643426657e-05, + "loss": 1.8653, + "step": 9781 + }, + { + "epoch": 0.7363330134176406, + "grad_norm": 4.550619602203369, + "learning_rate": 1.728377375009703e-05, + "loss": 2.3279, + "step": 9782 + }, + { + "epoch": 0.7364082876983007, + "grad_norm": 4.133872985839844, + "learning_rate": 1.7274555801653163e-05, + "loss": 1.6952, + "step": 9783 + }, + { + "epoch": 0.7364835619789608, + "grad_norm": 3.9653403759002686, + "learning_rate": 1.7265339798643048e-05, + "loss": 1.8654, + "step": 9784 + }, + { + "epoch": 0.736558836259621, + "grad_norm": 5.002777576446533, + "learning_rate": 1.725612574161458e-05, + "loss": 1.8781, + "step": 9785 + }, + { + "epoch": 0.7366341105402812, + "grad_norm": 3.9798200130462646, + "learning_rate": 1.7246913631115465e-05, + "loss": 1.5089, + "step": 9786 + }, + { + "epoch": 0.7367093848209413, + "grad_norm": 7.075782775878906, + "learning_rate": 1.7237703467693366e-05, + "loss": 1.9177, + "step": 9787 + }, + { + "epoch": 0.7367846591016015, + "grad_norm": 5.200432300567627, + "learning_rate": 1.7228495251895783e-05, + "loss": 2.0511, + "step": 9788 + }, + { + "epoch": 0.7368599333822616, + "grad_norm": 6.1699676513671875, + "learning_rate": 1.721928898427012e-05, + "loss": 1.432, + "step": 9789 + }, + { + "epoch": 0.7369352076629218, + "grad_norm": 4.753431797027588, + "learning_rate": 1.7210084665363668e-05, + "loss": 1.91, + "step": 9790 + }, + { + "epoch": 0.7370104819435819, + "grad_norm": 4.626429080963135, + "learning_rate": 1.720088229572362e-05, + "loss": 1.4399, + "step": 9791 + }, + { + "epoch": 0.7370857562242421, + "grad_norm": 6.596329212188721, + "learning_rate": 1.7191681875896986e-05, + "loss": 2.0601, + "step": 9792 + }, + { + "epoch": 0.7371610305049022, + "grad_norm": 3.7581191062927246, + "learning_rate": 1.7182483406430745e-05, + "loss": 2.0018, + "step": 9793 + }, + { + "epoch": 0.7372363047855623, + "grad_norm": 3.9835758209228516, + "learning_rate": 1.7173286887871687e-05, + "loss": 1.7088, + "step": 9794 + }, + { + "epoch": 0.7373115790662226, + "grad_norm": 4.646708965301514, + "learning_rate": 1.7164092320766544e-05, + "loss": 2.053, + "step": 9795 + }, + { + "epoch": 0.7373868533468827, + "grad_norm": 4.294607162475586, + "learning_rate": 1.7154899705661886e-05, + "loss": 1.8916, + "step": 9796 + }, + { + "epoch": 0.7374621276275428, + "grad_norm": 4.639212131500244, + "learning_rate": 1.7145709043104208e-05, + "loss": 2.0017, + "step": 9797 + }, + { + "epoch": 0.737537401908203, + "grad_norm": 4.923370838165283, + "learning_rate": 1.7136520333639843e-05, + "loss": 1.8616, + "step": 9798 + }, + { + "epoch": 0.7376126761888632, + "grad_norm": 4.515198230743408, + "learning_rate": 1.7127333577815053e-05, + "loss": 1.6887, + "step": 9799 + }, + { + "epoch": 0.7376879504695233, + "grad_norm": 6.550492286682129, + "learning_rate": 1.711814877617595e-05, + "loss": 1.637, + "step": 9800 + }, + { + "epoch": 0.7377632247501835, + "grad_norm": 4.596217632293701, + "learning_rate": 1.710896592926856e-05, + "loss": 1.8319, + "step": 9801 + }, + { + "epoch": 0.7378384990308436, + "grad_norm": 5.986010551452637, + "learning_rate": 1.7099785037638787e-05, + "loss": 1.8664, + "step": 9802 + }, + { + "epoch": 0.7379137733115038, + "grad_norm": 3.6857333183288574, + "learning_rate": 1.709060610183238e-05, + "loss": 1.8447, + "step": 9803 + }, + { + "epoch": 0.737989047592164, + "grad_norm": 6.030239582061768, + "learning_rate": 1.7081429122395022e-05, + "loss": 1.8531, + "step": 9804 + }, + { + "epoch": 0.7380643218728241, + "grad_norm": 5.257287979125977, + "learning_rate": 1.707225409987224e-05, + "loss": 1.6938, + "step": 9805 + }, + { + "epoch": 0.7381395961534842, + "grad_norm": 4.69661283493042, + "learning_rate": 1.706308103480948e-05, + "loss": 1.7411, + "step": 9806 + }, + { + "epoch": 0.7382148704341445, + "grad_norm": 5.9631028175354, + "learning_rate": 1.7053909927752034e-05, + "loss": 1.5424, + "step": 9807 + }, + { + "epoch": 0.7382901447148046, + "grad_norm": 4.716734886169434, + "learning_rate": 1.7044740779245123e-05, + "loss": 2.1388, + "step": 9808 + }, + { + "epoch": 0.7383654189954647, + "grad_norm": 6.152210235595703, + "learning_rate": 1.7035573589833798e-05, + "loss": 1.8094, + "step": 9809 + }, + { + "epoch": 0.7384406932761248, + "grad_norm": 4.928935527801514, + "learning_rate": 1.7026408360063035e-05, + "loss": 1.7463, + "step": 9810 + }, + { + "epoch": 0.7385159675567851, + "grad_norm": 5.057109355926514, + "learning_rate": 1.70172450904777e-05, + "loss": 1.9769, + "step": 9811 + }, + { + "epoch": 0.7385912418374452, + "grad_norm": 4.73891019821167, + "learning_rate": 1.7008083781622496e-05, + "loss": 1.936, + "step": 9812 + }, + { + "epoch": 0.7386665161181053, + "grad_norm": 5.9416584968566895, + "learning_rate": 1.699892443404204e-05, + "loss": 1.8201, + "step": 9813 + }, + { + "epoch": 0.7387417903987655, + "grad_norm": 5.524312496185303, + "learning_rate": 1.698976704828086e-05, + "loss": 2.0471, + "step": 9814 + }, + { + "epoch": 0.7388170646794257, + "grad_norm": 6.035019397735596, + "learning_rate": 1.6980611624883285e-05, + "loss": 1.5005, + "step": 9815 + }, + { + "epoch": 0.7388923389600858, + "grad_norm": 5.996881484985352, + "learning_rate": 1.6971458164393618e-05, + "loss": 1.8045, + "step": 9816 + }, + { + "epoch": 0.738967613240746, + "grad_norm": 4.0374274253845215, + "learning_rate": 1.6962306667355997e-05, + "loss": 1.6018, + "step": 9817 + }, + { + "epoch": 0.7390428875214061, + "grad_norm": 5.851326942443848, + "learning_rate": 1.695315713431444e-05, + "loss": 2.1033, + "step": 9818 + }, + { + "epoch": 0.7391181618020662, + "grad_norm": 4.750574111938477, + "learning_rate": 1.6944009565812886e-05, + "loss": 2.0676, + "step": 9819 + }, + { + "epoch": 0.7391934360827265, + "grad_norm": 4.734914302825928, + "learning_rate": 1.69348639623951e-05, + "loss": 1.9219, + "step": 9820 + }, + { + "epoch": 0.7392687103633866, + "grad_norm": 6.682633876800537, + "learning_rate": 1.692572032460479e-05, + "loss": 1.6547, + "step": 9821 + }, + { + "epoch": 0.7393439846440467, + "grad_norm": 4.324955940246582, + "learning_rate": 1.6916578652985488e-05, + "loss": 2.0777, + "step": 9822 + }, + { + "epoch": 0.739419258924707, + "grad_norm": 5.25320291519165, + "learning_rate": 1.690743894808066e-05, + "loss": 1.7514, + "step": 9823 + }, + { + "epoch": 0.7394945332053671, + "grad_norm": 4.350616931915283, + "learning_rate": 1.6898301210433633e-05, + "loss": 1.7402, + "step": 9824 + }, + { + "epoch": 0.7395698074860272, + "grad_norm": 7.803241729736328, + "learning_rate": 1.6889165440587635e-05, + "loss": 1.9206, + "step": 9825 + }, + { + "epoch": 0.7396450817666874, + "grad_norm": 3.8721065521240234, + "learning_rate": 1.688003163908573e-05, + "loss": 1.613, + "step": 9826 + }, + { + "epoch": 0.7397203560473475, + "grad_norm": 8.960091590881348, + "learning_rate": 1.687089980647093e-05, + "loss": 1.8164, + "step": 9827 + }, + { + "epoch": 0.7397956303280077, + "grad_norm": 5.168846130371094, + "learning_rate": 1.6861769943286065e-05, + "loss": 1.5543, + "step": 9828 + }, + { + "epoch": 0.7398709046086678, + "grad_norm": 4.666390895843506, + "learning_rate": 1.68526420500739e-05, + "loss": 1.8044, + "step": 9829 + }, + { + "epoch": 0.739946178889328, + "grad_norm": 5.713004112243652, + "learning_rate": 1.6843516127377044e-05, + "loss": 1.8832, + "step": 9830 + }, + { + "epoch": 0.7400214531699881, + "grad_norm": 3.6045401096343994, + "learning_rate": 1.683439217573804e-05, + "loss": 1.8955, + "step": 9831 + }, + { + "epoch": 0.7400967274506483, + "grad_norm": 4.702240467071533, + "learning_rate": 1.682527019569923e-05, + "loss": 2.1439, + "step": 9832 + }, + { + "epoch": 0.7401720017313085, + "grad_norm": 3.8796756267547607, + "learning_rate": 1.6816150187802925e-05, + "loss": 1.9255, + "step": 9833 + }, + { + "epoch": 0.7402472760119686, + "grad_norm": 4.2519073486328125, + "learning_rate": 1.680703215259128e-05, + "loss": 1.6422, + "step": 9834 + }, + { + "epoch": 0.7403225502926287, + "grad_norm": 4.791501045227051, + "learning_rate": 1.6797916090606326e-05, + "loss": 2.0549, + "step": 9835 + }, + { + "epoch": 0.740397824573289, + "grad_norm": 5.626745700836182, + "learning_rate": 1.6788802002390008e-05, + "loss": 2.0549, + "step": 9836 + }, + { + "epoch": 0.7404730988539491, + "grad_norm": 4.230198383331299, + "learning_rate": 1.677968988848411e-05, + "loss": 1.8099, + "step": 9837 + }, + { + "epoch": 0.7405483731346092, + "grad_norm": 4.774646282196045, + "learning_rate": 1.677057974943033e-05, + "loss": 1.9989, + "step": 9838 + }, + { + "epoch": 0.7406236474152694, + "grad_norm": 4.388845920562744, + "learning_rate": 1.6761471585770232e-05, + "loss": 1.8398, + "step": 9839 + }, + { + "epoch": 0.7406989216959295, + "grad_norm": 3.661468267440796, + "learning_rate": 1.6752365398045295e-05, + "loss": 2.0451, + "step": 9840 + }, + { + "epoch": 0.7407741959765897, + "grad_norm": 4.158090114593506, + "learning_rate": 1.674326118679681e-05, + "loss": 1.8393, + "step": 9841 + }, + { + "epoch": 0.7408494702572499, + "grad_norm": 4.40130090713501, + "learning_rate": 1.6734158952566048e-05, + "loss": 1.593, + "step": 9842 + }, + { + "epoch": 0.74092474453791, + "grad_norm": 4.243234157562256, + "learning_rate": 1.672505869589407e-05, + "loss": 1.8338, + "step": 9843 + }, + { + "epoch": 0.7410000188185701, + "grad_norm": 4.289233207702637, + "learning_rate": 1.671596041732187e-05, + "loss": 2.0266, + "step": 9844 + }, + { + "epoch": 0.7410752930992304, + "grad_norm": 7.550484657287598, + "learning_rate": 1.6706864117390326e-05, + "loss": 1.896, + "step": 9845 + }, + { + "epoch": 0.7411505673798905, + "grad_norm": 3.981187105178833, + "learning_rate": 1.6697769796640196e-05, + "loss": 1.1863, + "step": 9846 + }, + { + "epoch": 0.7412258416605506, + "grad_norm": 5.1406097412109375, + "learning_rate": 1.668867745561208e-05, + "loss": 1.6656, + "step": 9847 + }, + { + "epoch": 0.7413011159412108, + "grad_norm": 6.1259307861328125, + "learning_rate": 1.6679587094846504e-05, + "loss": 2.0484, + "step": 9848 + }, + { + "epoch": 0.741376390221871, + "grad_norm": 3.8600540161132812, + "learning_rate": 1.667049871488389e-05, + "loss": 2.0193, + "step": 9849 + }, + { + "epoch": 0.7414516645025311, + "grad_norm": 4.152215957641602, + "learning_rate": 1.666141231626448e-05, + "loss": 1.6299, + "step": 9850 + }, + { + "epoch": 0.7415269387831912, + "grad_norm": 5.832450866699219, + "learning_rate": 1.6652327899528458e-05, + "loss": 1.6373, + "step": 9851 + }, + { + "epoch": 0.7416022130638514, + "grad_norm": 4.742039680480957, + "learning_rate": 1.6643245465215845e-05, + "loss": 1.8856, + "step": 9852 + }, + { + "epoch": 0.7416774873445116, + "grad_norm": 4.237407207489014, + "learning_rate": 1.6634165013866592e-05, + "loss": 1.8743, + "step": 9853 + }, + { + "epoch": 0.7417527616251717, + "grad_norm": 7.049525737762451, + "learning_rate": 1.662508654602048e-05, + "loss": 1.8939, + "step": 9854 + }, + { + "epoch": 0.7418280359058319, + "grad_norm": 5.102390766143799, + "learning_rate": 1.6616010062217208e-05, + "loss": 1.8585, + "step": 9855 + }, + { + "epoch": 0.741903310186492, + "grad_norm": 6.493330001831055, + "learning_rate": 1.6606935562996346e-05, + "loss": 1.9909, + "step": 9856 + }, + { + "epoch": 0.7419785844671521, + "grad_norm": 5.144820213317871, + "learning_rate": 1.6597863048897367e-05, + "loss": 1.8987, + "step": 9857 + }, + { + "epoch": 0.7420538587478124, + "grad_norm": 4.827053546905518, + "learning_rate": 1.658879252045957e-05, + "loss": 1.7674, + "step": 9858 + }, + { + "epoch": 0.7421291330284725, + "grad_norm": 4.613465785980225, + "learning_rate": 1.6579723978222213e-05, + "loss": 1.9365, + "step": 9859 + }, + { + "epoch": 0.7422044073091326, + "grad_norm": 6.175017833709717, + "learning_rate": 1.6570657422724345e-05, + "loss": 1.8502, + "step": 9860 + }, + { + "epoch": 0.7422796815897929, + "grad_norm": 4.701926231384277, + "learning_rate": 1.6561592854504996e-05, + "loss": 1.3468, + "step": 9861 + }, + { + "epoch": 0.742354955870453, + "grad_norm": 4.537257671356201, + "learning_rate": 1.655253027410299e-05, + "loss": 2.1463, + "step": 9862 + }, + { + "epoch": 0.7424302301511131, + "grad_norm": 3.98941707611084, + "learning_rate": 1.6543469682057106e-05, + "loss": 2.0973, + "step": 9863 + }, + { + "epoch": 0.7425055044317733, + "grad_norm": 6.910943984985352, + "learning_rate": 1.6534411078905932e-05, + "loss": 2.0741, + "step": 9864 + }, + { + "epoch": 0.7425807787124334, + "grad_norm": 4.984281063079834, + "learning_rate": 1.6525354465187998e-05, + "loss": 1.9611, + "step": 9865 + }, + { + "epoch": 0.7426560529930936, + "grad_norm": 3.760453701019287, + "learning_rate": 1.6516299841441707e-05, + "loss": 1.7254, + "step": 9866 + }, + { + "epoch": 0.7427313272737538, + "grad_norm": 4.243823051452637, + "learning_rate": 1.6507247208205295e-05, + "loss": 1.5871, + "step": 9867 + }, + { + "epoch": 0.7428066015544139, + "grad_norm": 4.406725883483887, + "learning_rate": 1.649819656601694e-05, + "loss": 2.0743, + "step": 9868 + }, + { + "epoch": 0.742881875835074, + "grad_norm": 5.711879253387451, + "learning_rate": 1.6489147915414672e-05, + "loss": 1.5666, + "step": 9869 + }, + { + "epoch": 0.7429571501157342, + "grad_norm": 6.708004474639893, + "learning_rate": 1.648010125693642e-05, + "loss": 1.7104, + "step": 9870 + }, + { + "epoch": 0.7430324243963944, + "grad_norm": 6.33036470413208, + "learning_rate": 1.6471056591119955e-05, + "loss": 1.7219, + "step": 9871 + }, + { + "epoch": 0.7431076986770545, + "grad_norm": 4.447892665863037, + "learning_rate": 1.6462013918502983e-05, + "loss": 2.6098, + "step": 9872 + }, + { + "epoch": 0.7431829729577146, + "grad_norm": 7.324742317199707, + "learning_rate": 1.645297323962304e-05, + "loss": 2.0948, + "step": 9873 + }, + { + "epoch": 0.7432582472383749, + "grad_norm": 5.258800029754639, + "learning_rate": 1.6443934555017593e-05, + "loss": 2.0331, + "step": 9874 + }, + { + "epoch": 0.743333521519035, + "grad_norm": 5.579085826873779, + "learning_rate": 1.643489786522394e-05, + "loss": 1.8244, + "step": 9875 + }, + { + "epoch": 0.7434087957996951, + "grad_norm": 4.505311965942383, + "learning_rate": 1.6425863170779315e-05, + "loss": 1.9142, + "step": 9876 + }, + { + "epoch": 0.7434840700803553, + "grad_norm": 6.410801410675049, + "learning_rate": 1.641683047222078e-05, + "loss": 1.7909, + "step": 9877 + }, + { + "epoch": 0.7435593443610155, + "grad_norm": 4.629251480102539, + "learning_rate": 1.6407799770085303e-05, + "loss": 2.0321, + "step": 9878 + }, + { + "epoch": 0.7436346186416756, + "grad_norm": 5.248072624206543, + "learning_rate": 1.6398771064909744e-05, + "loss": 1.7495, + "step": 9879 + }, + { + "epoch": 0.7437098929223358, + "grad_norm": 4.81912088394165, + "learning_rate": 1.6389744357230857e-05, + "loss": 1.9599, + "step": 9880 + }, + { + "epoch": 0.7437851672029959, + "grad_norm": 4.297432899475098, + "learning_rate": 1.6380719647585202e-05, + "loss": 1.7311, + "step": 9881 + }, + { + "epoch": 0.743860441483656, + "grad_norm": 4.145941734313965, + "learning_rate": 1.6371696936509307e-05, + "loss": 1.8869, + "step": 9882 + }, + { + "epoch": 0.7439357157643163, + "grad_norm": 3.571012020111084, + "learning_rate": 1.6362676224539553e-05, + "loss": 1.65, + "step": 9883 + }, + { + "epoch": 0.7440109900449764, + "grad_norm": 3.5506057739257812, + "learning_rate": 1.6353657512212168e-05, + "loss": 1.8912, + "step": 9884 + }, + { + "epoch": 0.7440862643256365, + "grad_norm": 5.5435686111450195, + "learning_rate": 1.634464080006331e-05, + "loss": 1.9438, + "step": 9885 + }, + { + "epoch": 0.7441615386062967, + "grad_norm": 4.103124141693115, + "learning_rate": 1.6335626088628982e-05, + "loss": 1.4913, + "step": 9886 + }, + { + "epoch": 0.7442368128869569, + "grad_norm": 4.7531914710998535, + "learning_rate": 1.6326613378445094e-05, + "loss": 1.8622, + "step": 9887 + }, + { + "epoch": 0.744312087167617, + "grad_norm": 4.540632724761963, + "learning_rate": 1.6317602670047412e-05, + "loss": 1.7091, + "step": 9888 + }, + { + "epoch": 0.7443873614482771, + "grad_norm": 3.917654037475586, + "learning_rate": 1.63085939639716e-05, + "loss": 1.7902, + "step": 9889 + }, + { + "epoch": 0.7444626357289373, + "grad_norm": 4.325102806091309, + "learning_rate": 1.6299587260753214e-05, + "loss": 1.7013, + "step": 9890 + }, + { + "epoch": 0.7445379100095975, + "grad_norm": 4.560401916503906, + "learning_rate": 1.629058256092768e-05, + "loss": 1.8116, + "step": 9891 + }, + { + "epoch": 0.7446131842902576, + "grad_norm": 6.286881446838379, + "learning_rate": 1.6281579865030272e-05, + "loss": 1.635, + "step": 9892 + }, + { + "epoch": 0.7446884585709178, + "grad_norm": 5.761804103851318, + "learning_rate": 1.6272579173596207e-05, + "loss": 1.866, + "step": 9893 + }, + { + "epoch": 0.7447637328515779, + "grad_norm": 6.114563941955566, + "learning_rate": 1.6263580487160523e-05, + "loss": 1.892, + "step": 9894 + }, + { + "epoch": 0.744839007132238, + "grad_norm": 4.459379196166992, + "learning_rate": 1.6254583806258196e-05, + "loss": 2.255, + "step": 9895 + }, + { + "epoch": 0.7449142814128983, + "grad_norm": 3.4857254028320312, + "learning_rate": 1.6245589131424015e-05, + "loss": 1.5535, + "step": 9896 + }, + { + "epoch": 0.7449895556935584, + "grad_norm": 5.383429050445557, + "learning_rate": 1.6236596463192712e-05, + "loss": 1.9829, + "step": 9897 + }, + { + "epoch": 0.7450648299742185, + "grad_norm": 4.698866844177246, + "learning_rate": 1.6227605802098883e-05, + "loss": 2.0038, + "step": 9898 + }, + { + "epoch": 0.7451401042548788, + "grad_norm": 5.783909320831299, + "learning_rate": 1.6218617148676966e-05, + "loss": 2.0986, + "step": 9899 + }, + { + "epoch": 0.7452153785355389, + "grad_norm": 5.782463550567627, + "learning_rate": 1.6209630503461337e-05, + "loss": 1.537, + "step": 9900 + }, + { + "epoch": 0.745290652816199, + "grad_norm": 5.257208347320557, + "learning_rate": 1.620064586698622e-05, + "loss": 2.3814, + "step": 9901 + }, + { + "epoch": 0.7453659270968592, + "grad_norm": 5.572715759277344, + "learning_rate": 1.619166323978574e-05, + "loss": 1.9776, + "step": 9902 + }, + { + "epoch": 0.7454412013775193, + "grad_norm": 6.007562160491943, + "learning_rate": 1.618268262239385e-05, + "loss": 2.3918, + "step": 9903 + }, + { + "epoch": 0.7455164756581795, + "grad_norm": 5.239382743835449, + "learning_rate": 1.6173704015344464e-05, + "loss": 1.8424, + "step": 9904 + }, + { + "epoch": 0.7455917499388397, + "grad_norm": 6.2795023918151855, + "learning_rate": 1.6164727419171305e-05, + "loss": 2.275, + "step": 9905 + }, + { + "epoch": 0.7456670242194998, + "grad_norm": 4.568172931671143, + "learning_rate": 1.6155752834408027e-05, + "loss": 2.0041, + "step": 9906 + }, + { + "epoch": 0.7457422985001599, + "grad_norm": 4.719252586364746, + "learning_rate": 1.6146780261588123e-05, + "loss": 1.9264, + "step": 9907 + }, + { + "epoch": 0.7458175727808201, + "grad_norm": 5.133445739746094, + "learning_rate": 1.613780970124501e-05, + "loss": 1.6788, + "step": 9908 + }, + { + "epoch": 0.7458928470614803, + "grad_norm": 5.094578742980957, + "learning_rate": 1.6128841153911934e-05, + "loss": 1.9862, + "step": 9909 + }, + { + "epoch": 0.7459681213421404, + "grad_norm": 3.811624526977539, + "learning_rate": 1.611987462012206e-05, + "loss": 1.6791, + "step": 9910 + }, + { + "epoch": 0.7460433956228005, + "grad_norm": 4.280270576477051, + "learning_rate": 1.6110910100408427e-05, + "loss": 2.0025, + "step": 9911 + }, + { + "epoch": 0.7461186699034608, + "grad_norm": 4.275609493255615, + "learning_rate": 1.610194759530397e-05, + "loss": 1.5968, + "step": 9912 + }, + { + "epoch": 0.7461939441841209, + "grad_norm": 4.728058815002441, + "learning_rate": 1.6092987105341443e-05, + "loss": 1.7052, + "step": 9913 + }, + { + "epoch": 0.746269218464781, + "grad_norm": 5.2638373374938965, + "learning_rate": 1.608402863105355e-05, + "loss": 1.6152, + "step": 9914 + }, + { + "epoch": 0.7463444927454412, + "grad_norm": 4.976005554199219, + "learning_rate": 1.607507217297285e-05, + "loss": 1.6614, + "step": 9915 + }, + { + "epoch": 0.7464197670261014, + "grad_norm": 5.303394794464111, + "learning_rate": 1.6066117731631754e-05, + "loss": 2.3206, + "step": 9916 + }, + { + "epoch": 0.7464950413067615, + "grad_norm": 5.240142822265625, + "learning_rate": 1.60571653075626e-05, + "loss": 1.8974, + "step": 9917 + }, + { + "epoch": 0.7465703155874217, + "grad_norm": 5.302753925323486, + "learning_rate": 1.6048214901297566e-05, + "loss": 1.973, + "step": 9918 + }, + { + "epoch": 0.7466455898680818, + "grad_norm": 4.430035591125488, + "learning_rate": 1.6039266513368757e-05, + "loss": 2.1102, + "step": 9919 + }, + { + "epoch": 0.746720864148742, + "grad_norm": 5.703297138214111, + "learning_rate": 1.6030320144308088e-05, + "loss": 1.9649, + "step": 9920 + }, + { + "epoch": 0.7467961384294022, + "grad_norm": 4.121432781219482, + "learning_rate": 1.602137579464743e-05, + "loss": 1.6882, + "step": 9921 + }, + { + "epoch": 0.7468714127100623, + "grad_norm": 5.7158074378967285, + "learning_rate": 1.6012433464918475e-05, + "loss": 2.111, + "step": 9922 + }, + { + "epoch": 0.7469466869907224, + "grad_norm": 3.5301871299743652, + "learning_rate": 1.6003493155652825e-05, + "loss": 1.8535, + "step": 9923 + }, + { + "epoch": 0.7470219612713827, + "grad_norm": 5.5596489906311035, + "learning_rate": 1.5994554867381967e-05, + "loss": 2.0635, + "step": 9924 + }, + { + "epoch": 0.7470972355520428, + "grad_norm": 4.481655120849609, + "learning_rate": 1.5985618600637257e-05, + "loss": 1.6222, + "step": 9925 + }, + { + "epoch": 0.7471725098327029, + "grad_norm": 4.996507167816162, + "learning_rate": 1.597668435594991e-05, + "loss": 1.9297, + "step": 9926 + }, + { + "epoch": 0.747247784113363, + "grad_norm": 4.631219387054443, + "learning_rate": 1.5967752133851076e-05, + "loss": 2.0041, + "step": 9927 + }, + { + "epoch": 0.7473230583940232, + "grad_norm": 4.944079875946045, + "learning_rate": 1.595882193487171e-05, + "loss": 1.7581, + "step": 9928 + }, + { + "epoch": 0.7473983326746834, + "grad_norm": 4.562366962432861, + "learning_rate": 1.59498937595427e-05, + "loss": 2.0134, + "step": 9929 + }, + { + "epoch": 0.7474736069553435, + "grad_norm": 3.90766978263855, + "learning_rate": 1.5940967608394823e-05, + "loss": 1.7661, + "step": 9930 + }, + { + "epoch": 0.7475488812360037, + "grad_norm": 4.513801097869873, + "learning_rate": 1.593204348195868e-05, + "loss": 2.156, + "step": 9931 + }, + { + "epoch": 0.7476241555166638, + "grad_norm": 5.679186820983887, + "learning_rate": 1.5923121380764818e-05, + "loss": 1.7024, + "step": 9932 + }, + { + "epoch": 0.747699429797324, + "grad_norm": 4.746894836425781, + "learning_rate": 1.591420130534359e-05, + "loss": 1.9402, + "step": 9933 + }, + { + "epoch": 0.7477747040779842, + "grad_norm": 7.584643840789795, + "learning_rate": 1.59052832562253e-05, + "loss": 2.0297, + "step": 9934 + }, + { + "epoch": 0.7478499783586443, + "grad_norm": 4.102123260498047, + "learning_rate": 1.5896367233940084e-05, + "loss": 1.7823, + "step": 9935 + }, + { + "epoch": 0.7479252526393044, + "grad_norm": 4.667409896850586, + "learning_rate": 1.5887453239018e-05, + "loss": 2.0671, + "step": 9936 + }, + { + "epoch": 0.7480005269199647, + "grad_norm": 4.708219051361084, + "learning_rate": 1.587854127198893e-05, + "loss": 1.9376, + "step": 9937 + }, + { + "epoch": 0.7480758012006248, + "grad_norm": 6.039755821228027, + "learning_rate": 1.5869631333382684e-05, + "loss": 2.0222, + "step": 9938 + }, + { + "epoch": 0.7481510754812849, + "grad_norm": 5.210358619689941, + "learning_rate": 1.586072342372891e-05, + "loss": 1.7184, + "step": 9939 + }, + { + "epoch": 0.7482263497619451, + "grad_norm": 6.337641716003418, + "learning_rate": 1.5851817543557196e-05, + "loss": 2.1556, + "step": 9940 + }, + { + "epoch": 0.7483016240426053, + "grad_norm": 4.71444845199585, + "learning_rate": 1.5842913693396928e-05, + "loss": 1.3112, + "step": 9941 + }, + { + "epoch": 0.7483768983232654, + "grad_norm": 3.6353938579559326, + "learning_rate": 1.5834011873777445e-05, + "loss": 1.8482, + "step": 9942 + }, + { + "epoch": 0.7484521726039256, + "grad_norm": 6.729156494140625, + "learning_rate": 1.5825112085227904e-05, + "loss": 1.8334, + "step": 9943 + }, + { + "epoch": 0.7485274468845857, + "grad_norm": 5.832686901092529, + "learning_rate": 1.5816214328277395e-05, + "loss": 1.86, + "step": 9944 + }, + { + "epoch": 0.7486027211652458, + "grad_norm": 4.744770526885986, + "learning_rate": 1.5807318603454864e-05, + "loss": 2.0968, + "step": 9945 + }, + { + "epoch": 0.7486779954459061, + "grad_norm": 5.062871932983398, + "learning_rate": 1.5798424911289128e-05, + "loss": 1.6747, + "step": 9946 + }, + { + "epoch": 0.7487532697265662, + "grad_norm": 4.214980602264404, + "learning_rate": 1.5789533252308908e-05, + "loss": 1.7126, + "step": 9947 + }, + { + "epoch": 0.7488285440072263, + "grad_norm": 6.616235256195068, + "learning_rate": 1.5780643627042763e-05, + "loss": 1.7633, + "step": 9948 + }, + { + "epoch": 0.7489038182878864, + "grad_norm": 4.031871795654297, + "learning_rate": 1.5771756036019185e-05, + "loss": 1.8728, + "step": 9949 + }, + { + "epoch": 0.7489790925685467, + "grad_norm": 4.752917766571045, + "learning_rate": 1.5762870479766484e-05, + "loss": 1.7278, + "step": 9950 + }, + { + "epoch": 0.7490543668492068, + "grad_norm": 3.939545154571533, + "learning_rate": 1.575398695881291e-05, + "loss": 1.8063, + "step": 9951 + }, + { + "epoch": 0.7491296411298669, + "grad_norm": 4.217740535736084, + "learning_rate": 1.5745105473686533e-05, + "loss": 2.0681, + "step": 9952 + }, + { + "epoch": 0.7492049154105271, + "grad_norm": 4.378330230712891, + "learning_rate": 1.5736226024915364e-05, + "loss": 1.6649, + "step": 9953 + }, + { + "epoch": 0.7492801896911873, + "grad_norm": 3.8178975582122803, + "learning_rate": 1.572734861302722e-05, + "loss": 1.7891, + "step": 9954 + }, + { + "epoch": 0.7493554639718474, + "grad_norm": 4.729191303253174, + "learning_rate": 1.571847323854987e-05, + "loss": 2.1144, + "step": 9955 + }, + { + "epoch": 0.7494307382525076, + "grad_norm": 4.716762065887451, + "learning_rate": 1.570959990201092e-05, + "loss": 1.9679, + "step": 9956 + }, + { + "epoch": 0.7495060125331677, + "grad_norm": 4.810666561126709, + "learning_rate": 1.570072860393788e-05, + "loss": 1.773, + "step": 9957 + }, + { + "epoch": 0.7495812868138279, + "grad_norm": 4.066127300262451, + "learning_rate": 1.5691859344858096e-05, + "loss": 1.9217, + "step": 9958 + }, + { + "epoch": 0.7496565610944881, + "grad_norm": 5.161827087402344, + "learning_rate": 1.5682992125298847e-05, + "loss": 2.239, + "step": 9959 + }, + { + "epoch": 0.7497318353751482, + "grad_norm": 4.892882347106934, + "learning_rate": 1.5674126945787232e-05, + "loss": 2.1145, + "step": 9960 + }, + { + "epoch": 0.7498071096558083, + "grad_norm": 4.724384307861328, + "learning_rate": 1.5665263806850276e-05, + "loss": 1.9097, + "step": 9961 + }, + { + "epoch": 0.7498823839364686, + "grad_norm": 5.255415916442871, + "learning_rate": 1.565640270901489e-05, + "loss": 1.837, + "step": 9962 + }, + { + "epoch": 0.7499576582171287, + "grad_norm": 6.142913818359375, + "learning_rate": 1.5647543652807804e-05, + "loss": 1.6247, + "step": 9963 + }, + { + "epoch": 0.7500329324977888, + "grad_norm": 6.005850791931152, + "learning_rate": 1.5638686638755696e-05, + "loss": 1.6901, + "step": 9964 + }, + { + "epoch": 0.750108206778449, + "grad_norm": 5.859500885009766, + "learning_rate": 1.5629831667385054e-05, + "loss": 1.7682, + "step": 9965 + }, + { + "epoch": 0.7501834810591091, + "grad_norm": 6.2174577713012695, + "learning_rate": 1.562097873922232e-05, + "loss": 1.8863, + "step": 9966 + }, + { + "epoch": 0.7502587553397693, + "grad_norm": 5.0533366203308105, + "learning_rate": 1.5612127854793734e-05, + "loss": 2.0477, + "step": 9967 + }, + { + "epoch": 0.7503340296204294, + "grad_norm": 4.6626200675964355, + "learning_rate": 1.5603279014625484e-05, + "loss": 1.7795, + "step": 9968 + }, + { + "epoch": 0.7504093039010896, + "grad_norm": 5.1897406578063965, + "learning_rate": 1.5594432219243598e-05, + "loss": 1.7437, + "step": 9969 + }, + { + "epoch": 0.7504845781817497, + "grad_norm": 4.275720119476318, + "learning_rate": 1.5585587469174012e-05, + "loss": 1.5333, + "step": 9970 + }, + { + "epoch": 0.7505598524624099, + "grad_norm": 4.640401363372803, + "learning_rate": 1.557674476494249e-05, + "loss": 1.6171, + "step": 9971 + }, + { + "epoch": 0.7506351267430701, + "grad_norm": 5.083990573883057, + "learning_rate": 1.556790410707473e-05, + "loss": 1.7867, + "step": 9972 + }, + { + "epoch": 0.7507104010237302, + "grad_norm": 4.3151373863220215, + "learning_rate": 1.555906549609627e-05, + "loss": 1.7202, + "step": 9973 + }, + { + "epoch": 0.7507856753043903, + "grad_norm": 5.8796820640563965, + "learning_rate": 1.555022893253255e-05, + "loss": 1.7207, + "step": 9974 + }, + { + "epoch": 0.7508609495850506, + "grad_norm": 4.458544731140137, + "learning_rate": 1.554139441690886e-05, + "loss": 1.7106, + "step": 9975 + }, + { + "epoch": 0.7509362238657107, + "grad_norm": 4.028659343719482, + "learning_rate": 1.5532561949750418e-05, + "loss": 1.6165, + "step": 9976 + }, + { + "epoch": 0.7510114981463708, + "grad_norm": 3.9202828407287598, + "learning_rate": 1.552373153158225e-05, + "loss": 1.9387, + "step": 9977 + }, + { + "epoch": 0.751086772427031, + "grad_norm": 5.001904487609863, + "learning_rate": 1.5514903162929323e-05, + "loss": 2.3045, + "step": 9978 + }, + { + "epoch": 0.7511620467076912, + "grad_norm": 5.902695655822754, + "learning_rate": 1.5506076844316446e-05, + "loss": 1.6151, + "step": 9979 + }, + { + "epoch": 0.7512373209883513, + "grad_norm": 6.135552883148193, + "learning_rate": 1.5497252576268336e-05, + "loss": 1.7529, + "step": 9980 + }, + { + "epoch": 0.7513125952690115, + "grad_norm": 3.423927068710327, + "learning_rate": 1.5488430359309568e-05, + "loss": 1.8646, + "step": 9981 + }, + { + "epoch": 0.7513878695496716, + "grad_norm": 4.520468235015869, + "learning_rate": 1.5479610193964582e-05, + "loss": 1.5353, + "step": 9982 + }, + { + "epoch": 0.7514631438303317, + "grad_norm": 4.159463405609131, + "learning_rate": 1.5470792080757733e-05, + "loss": 1.5023, + "step": 9983 + }, + { + "epoch": 0.751538418110992, + "grad_norm": 4.193337917327881, + "learning_rate": 1.5461976020213204e-05, + "loss": 2.0218, + "step": 9984 + }, + { + "epoch": 0.7516136923916521, + "grad_norm": 4.588779449462891, + "learning_rate": 1.5453162012855117e-05, + "loss": 1.5502, + "step": 9985 + }, + { + "epoch": 0.7516889666723122, + "grad_norm": 6.364390850067139, + "learning_rate": 1.54443500592074e-05, + "loss": 1.7093, + "step": 9986 + }, + { + "epoch": 0.7517642409529723, + "grad_norm": 5.398507595062256, + "learning_rate": 1.5435540159793942e-05, + "loss": 1.6269, + "step": 9987 + }, + { + "epoch": 0.7518395152336326, + "grad_norm": 5.735795021057129, + "learning_rate": 1.5426732315138425e-05, + "loss": 1.9006, + "step": 9988 + }, + { + "epoch": 0.7519147895142927, + "grad_norm": 5.758598327636719, + "learning_rate": 1.5417926525764475e-05, + "loss": 2.0928, + "step": 9989 + }, + { + "epoch": 0.7519900637949528, + "grad_norm": 5.055922031402588, + "learning_rate": 1.540912279219556e-05, + "loss": 1.6823, + "step": 9990 + }, + { + "epoch": 0.752065338075613, + "grad_norm": 6.7720818519592285, + "learning_rate": 1.540032111495507e-05, + "loss": 1.9203, + "step": 9991 + }, + { + "epoch": 0.7521406123562732, + "grad_norm": 5.812138557434082, + "learning_rate": 1.5391521494566184e-05, + "loss": 1.7873, + "step": 9992 + }, + { + "epoch": 0.7522158866369333, + "grad_norm": 5.028049945831299, + "learning_rate": 1.5382723931552067e-05, + "loss": 1.8459, + "step": 9993 + }, + { + "epoch": 0.7522911609175935, + "grad_norm": 5.53501033782959, + "learning_rate": 1.5373928426435663e-05, + "loss": 1.7893, + "step": 9994 + }, + { + "epoch": 0.7523664351982536, + "grad_norm": 3.7731287479400635, + "learning_rate": 1.5365134979739858e-05, + "loss": 1.9724, + "step": 9995 + }, + { + "epoch": 0.7524417094789138, + "grad_norm": 5.036252498626709, + "learning_rate": 1.5356343591987422e-05, + "loss": 1.832, + "step": 9996 + }, + { + "epoch": 0.752516983759574, + "grad_norm": 5.82282829284668, + "learning_rate": 1.5347554263700936e-05, + "loss": 2.0169, + "step": 9997 + }, + { + "epoch": 0.7525922580402341, + "grad_norm": 6.087488651275635, + "learning_rate": 1.533876699540294e-05, + "loss": 1.9611, + "step": 9998 + }, + { + "epoch": 0.7526675323208942, + "grad_norm": 4.167436122894287, + "learning_rate": 1.532998178761577e-05, + "loss": 1.8921, + "step": 9999 + }, + { + "epoch": 0.7527428066015545, + "grad_norm": 4.24816370010376, + "learning_rate": 1.5321198640861712e-05, + "loss": 1.7124, + "step": 10000 + }, + { + "epoch": 0.7528180808822146, + "grad_norm": 4.933469295501709, + "learning_rate": 1.5312417555662882e-05, + "loss": 1.65, + "step": 10001 + }, + { + "epoch": 0.7528933551628747, + "grad_norm": 4.974427223205566, + "learning_rate": 1.530363853254132e-05, + "loss": 1.7783, + "step": 10002 + }, + { + "epoch": 0.7529686294435349, + "grad_norm": 6.356522083282471, + "learning_rate": 1.529486157201887e-05, + "loss": 2.1736, + "step": 10003 + }, + { + "epoch": 0.753043903724195, + "grad_norm": 6.629919528961182, + "learning_rate": 1.528608667461734e-05, + "loss": 2.1371, + "step": 10004 + }, + { + "epoch": 0.7531191780048552, + "grad_norm": 5.3332343101501465, + "learning_rate": 1.5277313840858337e-05, + "loss": 1.5657, + "step": 10005 + }, + { + "epoch": 0.7531944522855153, + "grad_norm": 7.4482808113098145, + "learning_rate": 1.5268543071263408e-05, + "loss": 1.9898, + "step": 10006 + }, + { + "epoch": 0.7532697265661755, + "grad_norm": 6.14054012298584, + "learning_rate": 1.5259774366353925e-05, + "loss": 1.7172, + "step": 10007 + }, + { + "epoch": 0.7533450008468356, + "grad_norm": 4.487518787384033, + "learning_rate": 1.5251007726651184e-05, + "loss": 1.7558, + "step": 10008 + }, + { + "epoch": 0.7534202751274958, + "grad_norm": 4.30145263671875, + "learning_rate": 1.5242243152676317e-05, + "loss": 1.69, + "step": 10009 + }, + { + "epoch": 0.753495549408156, + "grad_norm": 4.363803386688232, + "learning_rate": 1.5233480644950365e-05, + "loss": 1.669, + "step": 10010 + }, + { + "epoch": 0.7535708236888161, + "grad_norm": 4.698695659637451, + "learning_rate": 1.5224720203994231e-05, + "loss": 1.905, + "step": 10011 + }, + { + "epoch": 0.7536460979694762, + "grad_norm": 4.231746196746826, + "learning_rate": 1.5215961830328695e-05, + "loss": 1.9179, + "step": 10012 + }, + { + "epoch": 0.7537213722501365, + "grad_norm": 5.356712341308594, + "learning_rate": 1.5207205524474443e-05, + "loss": 2.1216, + "step": 10013 + }, + { + "epoch": 0.7537966465307966, + "grad_norm": 3.891530752182007, + "learning_rate": 1.5198451286951975e-05, + "loss": 1.9005, + "step": 10014 + }, + { + "epoch": 0.7538719208114567, + "grad_norm": 4.404372215270996, + "learning_rate": 1.5189699118281736e-05, + "loss": 1.5977, + "step": 10015 + }, + { + "epoch": 0.7539471950921169, + "grad_norm": 5.879538059234619, + "learning_rate": 1.5180949018983987e-05, + "loss": 1.8772, + "step": 10016 + }, + { + "epoch": 0.7540224693727771, + "grad_norm": 4.035519599914551, + "learning_rate": 1.5172200989578921e-05, + "loss": 1.6174, + "step": 10017 + }, + { + "epoch": 0.7540977436534372, + "grad_norm": 3.8961410522460938, + "learning_rate": 1.5163455030586565e-05, + "loss": 1.5893, + "step": 10018 + }, + { + "epoch": 0.7541730179340974, + "grad_norm": 5.440550804138184, + "learning_rate": 1.5154711142526867e-05, + "loss": 1.4628, + "step": 10019 + }, + { + "epoch": 0.7542482922147575, + "grad_norm": 6.2042436599731445, + "learning_rate": 1.514596932591959e-05, + "loss": 1.6468, + "step": 10020 + }, + { + "epoch": 0.7543235664954177, + "grad_norm": 6.319267749786377, + "learning_rate": 1.5137229581284445e-05, + "loss": 1.4374, + "step": 10021 + }, + { + "epoch": 0.7543988407760779, + "grad_norm": 4.681739807128906, + "learning_rate": 1.5128491909140952e-05, + "loss": 1.9577, + "step": 10022 + }, + { + "epoch": 0.754474115056738, + "grad_norm": 5.44571590423584, + "learning_rate": 1.511975631000856e-05, + "loss": 1.7032, + "step": 10023 + }, + { + "epoch": 0.7545493893373981, + "grad_norm": 6.700099468231201, + "learning_rate": 1.5111022784406575e-05, + "loss": 1.6449, + "step": 10024 + }, + { + "epoch": 0.7546246636180582, + "grad_norm": 5.16166877746582, + "learning_rate": 1.510229133285419e-05, + "loss": 1.6617, + "step": 10025 + }, + { + "epoch": 0.7546999378987185, + "grad_norm": 4.2618632316589355, + "learning_rate": 1.5093561955870439e-05, + "loss": 1.698, + "step": 10026 + }, + { + "epoch": 0.7547752121793786, + "grad_norm": 5.441774845123291, + "learning_rate": 1.5084834653974267e-05, + "loss": 1.8413, + "step": 10027 + }, + { + "epoch": 0.7548504864600387, + "grad_norm": 5.109415054321289, + "learning_rate": 1.5076109427684515e-05, + "loss": 1.7759, + "step": 10028 + }, + { + "epoch": 0.754925760740699, + "grad_norm": 7.502496719360352, + "learning_rate": 1.506738627751983e-05, + "loss": 2.0623, + "step": 10029 + }, + { + "epoch": 0.7550010350213591, + "grad_norm": 5.332769393920898, + "learning_rate": 1.5058665203998812e-05, + "loss": 2.0068, + "step": 10030 + }, + { + "epoch": 0.7550763093020192, + "grad_norm": 4.8725504875183105, + "learning_rate": 1.5049946207639875e-05, + "loss": 1.9227, + "step": 10031 + }, + { + "epoch": 0.7551515835826794, + "grad_norm": 4.100222587585449, + "learning_rate": 1.5041229288961367e-05, + "loss": 2.0567, + "step": 10032 + }, + { + "epoch": 0.7552268578633395, + "grad_norm": 4.386989116668701, + "learning_rate": 1.503251444848146e-05, + "loss": 1.7172, + "step": 10033 + }, + { + "epoch": 0.7553021321439997, + "grad_norm": 3.874253988265991, + "learning_rate": 1.5023801686718231e-05, + "loss": 1.6771, + "step": 10034 + }, + { + "epoch": 0.7553774064246599, + "grad_norm": 5.213954925537109, + "learning_rate": 1.5015091004189635e-05, + "loss": 1.7787, + "step": 10035 + }, + { + "epoch": 0.75545268070532, + "grad_norm": 5.724802017211914, + "learning_rate": 1.500638240141351e-05, + "loss": 1.9594, + "step": 10036 + }, + { + "epoch": 0.7555279549859801, + "grad_norm": 5.363083362579346, + "learning_rate": 1.499767587890753e-05, + "loss": 2.1877, + "step": 10037 + }, + { + "epoch": 0.7556032292666404, + "grad_norm": 4.180286407470703, + "learning_rate": 1.4988971437189304e-05, + "loss": 2.0091, + "step": 10038 + }, + { + "epoch": 0.7556785035473005, + "grad_norm": 5.6729044914245605, + "learning_rate": 1.4980269076776249e-05, + "loss": 1.7128, + "step": 10039 + }, + { + "epoch": 0.7557537778279606, + "grad_norm": 4.191105365753174, + "learning_rate": 1.4971568798185731e-05, + "loss": 1.7467, + "step": 10040 + }, + { + "epoch": 0.7558290521086208, + "grad_norm": 8.155600547790527, + "learning_rate": 1.4962870601934924e-05, + "loss": 2.5319, + "step": 10041 + }, + { + "epoch": 0.755904326389281, + "grad_norm": 4.792415142059326, + "learning_rate": 1.4954174488540923e-05, + "loss": 1.6589, + "step": 10042 + }, + { + "epoch": 0.7559796006699411, + "grad_norm": 4.4061126708984375, + "learning_rate": 1.4945480458520711e-05, + "loss": 1.762, + "step": 10043 + }, + { + "epoch": 0.7560548749506013, + "grad_norm": 5.4761857986450195, + "learning_rate": 1.4936788512391087e-05, + "loss": 1.7334, + "step": 10044 + }, + { + "epoch": 0.7561301492312614, + "grad_norm": 5.121610164642334, + "learning_rate": 1.4928098650668782e-05, + "loss": 1.6016, + "step": 10045 + }, + { + "epoch": 0.7562054235119215, + "grad_norm": 4.5385050773620605, + "learning_rate": 1.4919410873870377e-05, + "loss": 1.9712, + "step": 10046 + }, + { + "epoch": 0.7562806977925817, + "grad_norm": 6.4989166259765625, + "learning_rate": 1.4910725182512354e-05, + "loss": 1.9882, + "step": 10047 + }, + { + "epoch": 0.7563559720732419, + "grad_norm": 5.446986675262451, + "learning_rate": 1.4902041577111014e-05, + "loss": 1.7309, + "step": 10048 + }, + { + "epoch": 0.756431246353902, + "grad_norm": 4.288657188415527, + "learning_rate": 1.4893360058182621e-05, + "loss": 1.8747, + "step": 10049 + }, + { + "epoch": 0.7565065206345621, + "grad_norm": 4.878511428833008, + "learning_rate": 1.4884680626243219e-05, + "loss": 1.6474, + "step": 10050 + }, + { + "epoch": 0.7565817949152224, + "grad_norm": 4.053687572479248, + "learning_rate": 1.487600328180881e-05, + "loss": 1.6522, + "step": 10051 + }, + { + "epoch": 0.7566570691958825, + "grad_norm": 5.102494239807129, + "learning_rate": 1.486732802539521e-05, + "loss": 1.6672, + "step": 10052 + }, + { + "epoch": 0.7567323434765426, + "grad_norm": 4.51108455657959, + "learning_rate": 1.4858654857518173e-05, + "loss": 1.7443, + "step": 10053 + }, + { + "epoch": 0.7568076177572028, + "grad_norm": 4.4213337898254395, + "learning_rate": 1.484998377869325e-05, + "loss": 1.6428, + "step": 10054 + }, + { + "epoch": 0.756882892037863, + "grad_norm": 4.611510753631592, + "learning_rate": 1.484131478943594e-05, + "loss": 1.8679, + "step": 10055 + }, + { + "epoch": 0.7569581663185231, + "grad_norm": 5.248319149017334, + "learning_rate": 1.483264789026158e-05, + "loss": 1.7786, + "step": 10056 + }, + { + "epoch": 0.7570334405991833, + "grad_norm": 4.49108362197876, + "learning_rate": 1.4823983081685417e-05, + "loss": 2.1219, + "step": 10057 + }, + { + "epoch": 0.7571087148798434, + "grad_norm": 4.832563400268555, + "learning_rate": 1.4815320364222507e-05, + "loss": 1.6951, + "step": 10058 + }, + { + "epoch": 0.7571839891605036, + "grad_norm": 8.878467559814453, + "learning_rate": 1.4806659738387846e-05, + "loss": 2.3644, + "step": 10059 + }, + { + "epoch": 0.7572592634411638, + "grad_norm": 5.238609313964844, + "learning_rate": 1.4798001204696305e-05, + "loss": 2.0396, + "step": 10060 + }, + { + "epoch": 0.7573345377218239, + "grad_norm": 4.765303611755371, + "learning_rate": 1.4789344763662565e-05, + "loss": 1.6369, + "step": 10061 + }, + { + "epoch": 0.757409812002484, + "grad_norm": 5.658217430114746, + "learning_rate": 1.4780690415801262e-05, + "loss": 1.8445, + "step": 10062 + }, + { + "epoch": 0.7574850862831443, + "grad_norm": 7.026732444763184, + "learning_rate": 1.4772038161626845e-05, + "loss": 1.5348, + "step": 10063 + }, + { + "epoch": 0.7575603605638044, + "grad_norm": 5.15501070022583, + "learning_rate": 1.47633880016537e-05, + "loss": 1.8285, + "step": 10064 + }, + { + "epoch": 0.7576356348444645, + "grad_norm": 5.152629375457764, + "learning_rate": 1.4754739936396012e-05, + "loss": 2.4846, + "step": 10065 + }, + { + "epoch": 0.7577109091251246, + "grad_norm": 4.669248104095459, + "learning_rate": 1.474609396636792e-05, + "loss": 1.9223, + "step": 10066 + }, + { + "epoch": 0.7577861834057849, + "grad_norm": 5.185349941253662, + "learning_rate": 1.4737450092083371e-05, + "loss": 1.8226, + "step": 10067 + }, + { + "epoch": 0.757861457686445, + "grad_norm": 3.7794644832611084, + "learning_rate": 1.4728808314056236e-05, + "loss": 1.7302, + "step": 10068 + }, + { + "epoch": 0.7579367319671051, + "grad_norm": 4.549095153808594, + "learning_rate": 1.472016863280024e-05, + "loss": 1.6542, + "step": 10069 + }, + { + "epoch": 0.7580120062477653, + "grad_norm": 4.128381729125977, + "learning_rate": 1.4711531048829008e-05, + "loss": 1.851, + "step": 10070 + }, + { + "epoch": 0.7580872805284254, + "grad_norm": 4.7417826652526855, + "learning_rate": 1.4702895562655983e-05, + "loss": 1.4187, + "step": 10071 + }, + { + "epoch": 0.7581625548090856, + "grad_norm": 6.507777214050293, + "learning_rate": 1.469426217479456e-05, + "loss": 1.6092, + "step": 10072 + }, + { + "epoch": 0.7582378290897458, + "grad_norm": 4.4352803230285645, + "learning_rate": 1.4685630885757922e-05, + "loss": 1.6687, + "step": 10073 + }, + { + "epoch": 0.7583131033704059, + "grad_norm": 4.384361267089844, + "learning_rate": 1.4677001696059206e-05, + "loss": 1.8644, + "step": 10074 + }, + { + "epoch": 0.758388377651066, + "grad_norm": 6.202516078948975, + "learning_rate": 1.46683746062114e-05, + "loss": 1.825, + "step": 10075 + }, + { + "epoch": 0.7584636519317263, + "grad_norm": 4.29433012008667, + "learning_rate": 1.465974961672733e-05, + "loss": 1.9568, + "step": 10076 + }, + { + "epoch": 0.7585389262123864, + "grad_norm": 7.799400329589844, + "learning_rate": 1.4651126728119763e-05, + "loss": 1.8897, + "step": 10077 + }, + { + "epoch": 0.7586142004930465, + "grad_norm": 5.073342323303223, + "learning_rate": 1.4642505940901263e-05, + "loss": 2.2903, + "step": 10078 + }, + { + "epoch": 0.7586894747737067, + "grad_norm": 6.123051166534424, + "learning_rate": 1.4633887255584333e-05, + "loss": 1.4637, + "step": 10079 + }, + { + "epoch": 0.7587647490543669, + "grad_norm": 4.969133377075195, + "learning_rate": 1.4625270672681329e-05, + "loss": 1.7672, + "step": 10080 + }, + { + "epoch": 0.758840023335027, + "grad_norm": 4.791050434112549, + "learning_rate": 1.4616656192704503e-05, + "loss": 2.0636, + "step": 10081 + }, + { + "epoch": 0.7589152976156872, + "grad_norm": 4.908662796020508, + "learning_rate": 1.4608043816165922e-05, + "loss": 2.1467, + "step": 10082 + }, + { + "epoch": 0.7589905718963473, + "grad_norm": 4.78682279586792, + "learning_rate": 1.4599433543577595e-05, + "loss": 1.7868, + "step": 10083 + }, + { + "epoch": 0.7590658461770075, + "grad_norm": 4.780208110809326, + "learning_rate": 1.4590825375451361e-05, + "loss": 1.7034, + "step": 10084 + }, + { + "epoch": 0.7591411204576676, + "grad_norm": 4.949016094207764, + "learning_rate": 1.4582219312298967e-05, + "loss": 1.9641, + "step": 10085 + }, + { + "epoch": 0.7592163947383278, + "grad_norm": 4.622855186462402, + "learning_rate": 1.4573615354631997e-05, + "loss": 2.0881, + "step": 10086 + }, + { + "epoch": 0.7592916690189879, + "grad_norm": 7.076879024505615, + "learning_rate": 1.4565013502961961e-05, + "loss": 1.7142, + "step": 10087 + }, + { + "epoch": 0.759366943299648, + "grad_norm": 4.772395610809326, + "learning_rate": 1.4556413757800185e-05, + "loss": 1.7488, + "step": 10088 + }, + { + "epoch": 0.7594422175803083, + "grad_norm": 4.258056640625, + "learning_rate": 1.4547816119657909e-05, + "loss": 2.0981, + "step": 10089 + }, + { + "epoch": 0.7595174918609684, + "grad_norm": 4.140929222106934, + "learning_rate": 1.4539220589046237e-05, + "loss": 1.6252, + "step": 10090 + }, + { + "epoch": 0.7595927661416285, + "grad_norm": 5.462441444396973, + "learning_rate": 1.4530627166476157e-05, + "loss": 1.6531, + "step": 10091 + }, + { + "epoch": 0.7596680404222887, + "grad_norm": 6.058199405670166, + "learning_rate": 1.4522035852458532e-05, + "loss": 2.0438, + "step": 10092 + }, + { + "epoch": 0.7597433147029489, + "grad_norm": 3.7801451683044434, + "learning_rate": 1.4513446647504064e-05, + "loss": 2.0075, + "step": 10093 + }, + { + "epoch": 0.759818588983609, + "grad_norm": 4.6017231941223145, + "learning_rate": 1.450485955212339e-05, + "loss": 1.7479, + "step": 10094 + }, + { + "epoch": 0.7598938632642692, + "grad_norm": 5.249778747558594, + "learning_rate": 1.4496274566826945e-05, + "loss": 1.9271, + "step": 10095 + }, + { + "epoch": 0.7599691375449293, + "grad_norm": 3.9409539699554443, + "learning_rate": 1.4487691692125126e-05, + "loss": 1.5207, + "step": 10096 + }, + { + "epoch": 0.7600444118255895, + "grad_norm": 6.422845363616943, + "learning_rate": 1.4479110928528122e-05, + "loss": 2.0724, + "step": 10097 + }, + { + "epoch": 0.7601196861062497, + "grad_norm": 3.645026445388794, + "learning_rate": 1.4470532276546073e-05, + "loss": 1.8209, + "step": 10098 + }, + { + "epoch": 0.7601949603869098, + "grad_norm": 5.043045520782471, + "learning_rate": 1.4461955736688915e-05, + "loss": 2.0898, + "step": 10099 + }, + { + "epoch": 0.7602702346675699, + "grad_norm": 4.188684463500977, + "learning_rate": 1.4453381309466519e-05, + "loss": 1.6992, + "step": 10100 + }, + { + "epoch": 0.7603455089482302, + "grad_norm": 4.888040065765381, + "learning_rate": 1.444480899538861e-05, + "loss": 2.0255, + "step": 10101 + }, + { + "epoch": 0.7604207832288903, + "grad_norm": 4.585510730743408, + "learning_rate": 1.4436238794964802e-05, + "loss": 1.6398, + "step": 10102 + }, + { + "epoch": 0.7604960575095504, + "grad_norm": 4.1049652099609375, + "learning_rate": 1.4427670708704532e-05, + "loss": 1.7256, + "step": 10103 + }, + { + "epoch": 0.7605713317902105, + "grad_norm": 5.418903827667236, + "learning_rate": 1.4419104737117195e-05, + "loss": 1.933, + "step": 10104 + }, + { + "epoch": 0.7606466060708708, + "grad_norm": 4.789968013763428, + "learning_rate": 1.441054088071197e-05, + "loss": 1.5497, + "step": 10105 + }, + { + "epoch": 0.7607218803515309, + "grad_norm": 5.453304767608643, + "learning_rate": 1.4401979139997968e-05, + "loss": 1.9004, + "step": 10106 + }, + { + "epoch": 0.760797154632191, + "grad_norm": 4.689150333404541, + "learning_rate": 1.4393419515484186e-05, + "loss": 1.8116, + "step": 10107 + }, + { + "epoch": 0.7608724289128512, + "grad_norm": 4.344583511352539, + "learning_rate": 1.4384862007679429e-05, + "loss": 1.8466, + "step": 10108 + }, + { + "epoch": 0.7609477031935113, + "grad_norm": 4.653151512145996, + "learning_rate": 1.4376306617092445e-05, + "loss": 2.0953, + "step": 10109 + }, + { + "epoch": 0.7610229774741715, + "grad_norm": 5.969259738922119, + "learning_rate": 1.4367753344231804e-05, + "loss": 1.857, + "step": 10110 + }, + { + "epoch": 0.7610982517548317, + "grad_norm": 3.999704599380493, + "learning_rate": 1.4359202189605991e-05, + "loss": 1.6641, + "step": 10111 + }, + { + "epoch": 0.7611735260354918, + "grad_norm": 4.767504692077637, + "learning_rate": 1.4350653153723337e-05, + "loss": 1.9794, + "step": 10112 + }, + { + "epoch": 0.7612488003161519, + "grad_norm": 8.0087251663208, + "learning_rate": 1.4342106237092085e-05, + "loss": 2.0896, + "step": 10113 + }, + { + "epoch": 0.7613240745968122, + "grad_norm": 5.4577860832214355, + "learning_rate": 1.4333561440220283e-05, + "loss": 1.83, + "step": 10114 + }, + { + "epoch": 0.7613993488774723, + "grad_norm": 4.4710211753845215, + "learning_rate": 1.4325018763615933e-05, + "loss": 2.0372, + "step": 10115 + }, + { + "epoch": 0.7614746231581324, + "grad_norm": 7.323471546173096, + "learning_rate": 1.4316478207786837e-05, + "loss": 1.814, + "step": 10116 + }, + { + "epoch": 0.7615498974387926, + "grad_norm": 4.940953254699707, + "learning_rate": 1.4307939773240735e-05, + "loss": 1.5845, + "step": 10117 + }, + { + "epoch": 0.7616251717194528, + "grad_norm": 6.005489349365234, + "learning_rate": 1.4299403460485195e-05, + "loss": 1.705, + "step": 10118 + }, + { + "epoch": 0.7617004460001129, + "grad_norm": 7.866281986236572, + "learning_rate": 1.4290869270027696e-05, + "loss": 2.193, + "step": 10119 + }, + { + "epoch": 0.7617757202807731, + "grad_norm": 5.98714542388916, + "learning_rate": 1.428233720237554e-05, + "loss": 1.8539, + "step": 10120 + }, + { + "epoch": 0.7618509945614332, + "grad_norm": 5.611191272735596, + "learning_rate": 1.4273807258035964e-05, + "loss": 1.9976, + "step": 10121 + }, + { + "epoch": 0.7619262688420934, + "grad_norm": 6.04744291305542, + "learning_rate": 1.4265279437516027e-05, + "loss": 2.0185, + "step": 10122 + }, + { + "epoch": 0.7620015431227536, + "grad_norm": 4.674132823944092, + "learning_rate": 1.4256753741322692e-05, + "loss": 1.8721, + "step": 10123 + }, + { + "epoch": 0.7620768174034137, + "grad_norm": 3.6567792892456055, + "learning_rate": 1.424823016996279e-05, + "loss": 1.9501, + "step": 10124 + }, + { + "epoch": 0.7621520916840738, + "grad_norm": 5.554230690002441, + "learning_rate": 1.4239708723943018e-05, + "loss": 1.7415, + "step": 10125 + }, + { + "epoch": 0.762227365964734, + "grad_norm": 7.334959983825684, + "learning_rate": 1.4231189403769967e-05, + "loss": 1.6385, + "step": 10126 + }, + { + "epoch": 0.7623026402453942, + "grad_norm": 6.000788688659668, + "learning_rate": 1.4222672209950066e-05, + "loss": 1.4593, + "step": 10127 + }, + { + "epoch": 0.7623779145260543, + "grad_norm": 9.119935035705566, + "learning_rate": 1.4214157142989654e-05, + "loss": 1.7431, + "step": 10128 + }, + { + "epoch": 0.7624531888067144, + "grad_norm": 4.112301349639893, + "learning_rate": 1.4205644203394908e-05, + "loss": 1.7218, + "step": 10129 + }, + { + "epoch": 0.7625284630873747, + "grad_norm": 4.8725080490112305, + "learning_rate": 1.4197133391671925e-05, + "loss": 2.0033, + "step": 10130 + }, + { + "epoch": 0.7626037373680348, + "grad_norm": 4.427544116973877, + "learning_rate": 1.4188624708326609e-05, + "loss": 1.7914, + "step": 10131 + }, + { + "epoch": 0.7626790116486949, + "grad_norm": 3.9803240299224854, + "learning_rate": 1.4180118153864818e-05, + "loss": 2.1275, + "step": 10132 + }, + { + "epoch": 0.7627542859293551, + "grad_norm": 4.8816375732421875, + "learning_rate": 1.4171613728792215e-05, + "loss": 1.7891, + "step": 10133 + }, + { + "epoch": 0.7628295602100152, + "grad_norm": 4.174935817718506, + "learning_rate": 1.4163111433614369e-05, + "loss": 1.513, + "step": 10134 + }, + { + "epoch": 0.7629048344906754, + "grad_norm": 4.618239879608154, + "learning_rate": 1.4154611268836714e-05, + "loss": 2.1072, + "step": 10135 + }, + { + "epoch": 0.7629801087713356, + "grad_norm": 4.536387920379639, + "learning_rate": 1.4146113234964592e-05, + "loss": 2.0055, + "step": 10136 + }, + { + "epoch": 0.7630553830519957, + "grad_norm": 5.468199253082275, + "learning_rate": 1.4137617332503144e-05, + "loss": 1.9225, + "step": 10137 + }, + { + "epoch": 0.7631306573326558, + "grad_norm": 5.075319290161133, + "learning_rate": 1.4129123561957457e-05, + "loss": 1.94, + "step": 10138 + }, + { + "epoch": 0.7632059316133161, + "grad_norm": 4.103004455566406, + "learning_rate": 1.4120631923832433e-05, + "loss": 1.9509, + "step": 10139 + }, + { + "epoch": 0.7632812058939762, + "grad_norm": 5.256185054779053, + "learning_rate": 1.4112142418632895e-05, + "loss": 1.8984, + "step": 10140 + }, + { + "epoch": 0.7633564801746363, + "grad_norm": 4.984541893005371, + "learning_rate": 1.4103655046863535e-05, + "loss": 1.729, + "step": 10141 + }, + { + "epoch": 0.7634317544552965, + "grad_norm": 4.6674041748046875, + "learning_rate": 1.4095169809028863e-05, + "loss": 1.7435, + "step": 10142 + }, + { + "epoch": 0.7635070287359567, + "grad_norm": 4.191567897796631, + "learning_rate": 1.408668670563334e-05, + "loss": 2.0525, + "step": 10143 + }, + { + "epoch": 0.7635823030166168, + "grad_norm": 5.630195617675781, + "learning_rate": 1.4078205737181233e-05, + "loss": 1.486, + "step": 10144 + }, + { + "epoch": 0.7636575772972769, + "grad_norm": 5.826077938079834, + "learning_rate": 1.4069726904176722e-05, + "loss": 1.9778, + "step": 10145 + }, + { + "epoch": 0.7637328515779371, + "grad_norm": 4.659607410430908, + "learning_rate": 1.4061250207123849e-05, + "loss": 1.4408, + "step": 10146 + }, + { + "epoch": 0.7638081258585973, + "grad_norm": 5.411288738250732, + "learning_rate": 1.405277564652655e-05, + "loss": 1.6467, + "step": 10147 + }, + { + "epoch": 0.7638834001392574, + "grad_norm": 5.05604362487793, + "learning_rate": 1.4044303222888577e-05, + "loss": 1.9486, + "step": 10148 + }, + { + "epoch": 0.7639586744199176, + "grad_norm": 4.694107532501221, + "learning_rate": 1.4035832936713622e-05, + "loss": 1.9845, + "step": 10149 + }, + { + "epoch": 0.7640339487005777, + "grad_norm": 4.309806823730469, + "learning_rate": 1.4027364788505199e-05, + "loss": 1.354, + "step": 10150 + }, + { + "epoch": 0.7641092229812378, + "grad_norm": 5.2016377449035645, + "learning_rate": 1.4018898778766732e-05, + "loss": 1.9064, + "step": 10151 + }, + { + "epoch": 0.7641844972618981, + "grad_norm": 6.1968674659729, + "learning_rate": 1.4010434908001474e-05, + "loss": 1.6488, + "step": 10152 + }, + { + "epoch": 0.7642597715425582, + "grad_norm": 4.028750419616699, + "learning_rate": 1.4001973176712613e-05, + "loss": 1.7321, + "step": 10153 + }, + { + "epoch": 0.7643350458232183, + "grad_norm": 5.01830530166626, + "learning_rate": 1.399351358540314e-05, + "loss": 2.1989, + "step": 10154 + }, + { + "epoch": 0.7644103201038785, + "grad_norm": 5.428408145904541, + "learning_rate": 1.3985056134575975e-05, + "loss": 1.9105, + "step": 10155 + }, + { + "epoch": 0.7644855943845387, + "grad_norm": 5.1468400955200195, + "learning_rate": 1.3976600824733876e-05, + "loss": 1.8674, + "step": 10156 + }, + { + "epoch": 0.7645608686651988, + "grad_norm": 7.597207069396973, + "learning_rate": 1.3968147656379498e-05, + "loss": 1.9232, + "step": 10157 + }, + { + "epoch": 0.764636142945859, + "grad_norm": 4.367791175842285, + "learning_rate": 1.3959696630015373e-05, + "loss": 1.8208, + "step": 10158 + }, + { + "epoch": 0.7647114172265191, + "grad_norm": 4.914565563201904, + "learning_rate": 1.3951247746143859e-05, + "loss": 1.9465, + "step": 10159 + }, + { + "epoch": 0.7647866915071793, + "grad_norm": 4.326786041259766, + "learning_rate": 1.3942801005267241e-05, + "loss": 1.8901, + "step": 10160 + }, + { + "epoch": 0.7648619657878395, + "grad_norm": 6.845241546630859, + "learning_rate": 1.3934356407887633e-05, + "loss": 1.753, + "step": 10161 + }, + { + "epoch": 0.7649372400684996, + "grad_norm": 6.695394039154053, + "learning_rate": 1.3925913954507063e-05, + "loss": 1.9097, + "step": 10162 + }, + { + "epoch": 0.7650125143491597, + "grad_norm": 4.475179672241211, + "learning_rate": 1.3917473645627387e-05, + "loss": 1.7474, + "step": 10163 + }, + { + "epoch": 0.7650877886298199, + "grad_norm": 4.231069087982178, + "learning_rate": 1.3909035481750388e-05, + "loss": 2.0426, + "step": 10164 + }, + { + "epoch": 0.7651630629104801, + "grad_norm": 4.919641017913818, + "learning_rate": 1.3900599463377655e-05, + "loss": 1.9045, + "step": 10165 + }, + { + "epoch": 0.7652383371911402, + "grad_norm": 3.9251348972320557, + "learning_rate": 1.3892165591010703e-05, + "loss": 2.0674, + "step": 10166 + }, + { + "epoch": 0.7653136114718003, + "grad_norm": 6.994894981384277, + "learning_rate": 1.3883733865150917e-05, + "loss": 1.73, + "step": 10167 + }, + { + "epoch": 0.7653888857524606, + "grad_norm": 4.049494743347168, + "learning_rate": 1.3875304286299511e-05, + "loss": 1.903, + "step": 10168 + }, + { + "epoch": 0.7654641600331207, + "grad_norm": 6.0920796394348145, + "learning_rate": 1.3866876854957612e-05, + "loss": 1.9131, + "step": 10169 + }, + { + "epoch": 0.7655394343137808, + "grad_norm": 4.842472553253174, + "learning_rate": 1.385845157162622e-05, + "loss": 2.3039, + "step": 10170 + }, + { + "epoch": 0.765614708594441, + "grad_norm": 4.513106822967529, + "learning_rate": 1.3850028436806163e-05, + "loss": 2.2758, + "step": 10171 + }, + { + "epoch": 0.7656899828751011, + "grad_norm": 4.150333881378174, + "learning_rate": 1.384160745099819e-05, + "loss": 1.8097, + "step": 10172 + }, + { + "epoch": 0.7657652571557613, + "grad_norm": 4.194115161895752, + "learning_rate": 1.3833188614702918e-05, + "loss": 1.8377, + "step": 10173 + }, + { + "epoch": 0.7658405314364215, + "grad_norm": 4.844198226928711, + "learning_rate": 1.3824771928420798e-05, + "loss": 1.8957, + "step": 10174 + }, + { + "epoch": 0.7659158057170816, + "grad_norm": 5.60052490234375, + "learning_rate": 1.3816357392652202e-05, + "loss": 2.0665, + "step": 10175 + }, + { + "epoch": 0.7659910799977417, + "grad_norm": 4.106018543243408, + "learning_rate": 1.3807945007897315e-05, + "loss": 1.663, + "step": 10176 + }, + { + "epoch": 0.766066354278402, + "grad_norm": 5.395410537719727, + "learning_rate": 1.3799534774656276e-05, + "loss": 2.1665, + "step": 10177 + }, + { + "epoch": 0.7661416285590621, + "grad_norm": 4.695087909698486, + "learning_rate": 1.3791126693429001e-05, + "loss": 1.9174, + "step": 10178 + }, + { + "epoch": 0.7662169028397222, + "grad_norm": 5.752013683319092, + "learning_rate": 1.3782720764715357e-05, + "loss": 1.7325, + "step": 10179 + }, + { + "epoch": 0.7662921771203824, + "grad_norm": 4.553603172302246, + "learning_rate": 1.3774316989015034e-05, + "loss": 1.9477, + "step": 10180 + }, + { + "epoch": 0.7663674514010426, + "grad_norm": 4.580021858215332, + "learning_rate": 1.3765915366827648e-05, + "loss": 1.6122, + "step": 10181 + }, + { + "epoch": 0.7664427256817027, + "grad_norm": 5.4661712646484375, + "learning_rate": 1.3757515898652607e-05, + "loss": 1.7131, + "step": 10182 + }, + { + "epoch": 0.7665179999623628, + "grad_norm": 4.642796516418457, + "learning_rate": 1.3749118584989267e-05, + "loss": 1.7412, + "step": 10183 + }, + { + "epoch": 0.766593274243023, + "grad_norm": 4.853013515472412, + "learning_rate": 1.3740723426336799e-05, + "loss": 1.6331, + "step": 10184 + }, + { + "epoch": 0.7666685485236832, + "grad_norm": 4.144401550292969, + "learning_rate": 1.37323304231943e-05, + "loss": 1.8196, + "step": 10185 + }, + { + "epoch": 0.7667438228043433, + "grad_norm": 5.098297119140625, + "learning_rate": 1.372393957606068e-05, + "loss": 1.8074, + "step": 10186 + }, + { + "epoch": 0.7668190970850035, + "grad_norm": 4.6307220458984375, + "learning_rate": 1.3715550885434758e-05, + "loss": 1.5264, + "step": 10187 + }, + { + "epoch": 0.7668943713656636, + "grad_norm": 4.551353931427002, + "learning_rate": 1.3707164351815244e-05, + "loss": 1.6491, + "step": 10188 + }, + { + "epoch": 0.7669696456463237, + "grad_norm": 5.007851600646973, + "learning_rate": 1.369877997570066e-05, + "loss": 1.6406, + "step": 10189 + }, + { + "epoch": 0.767044919926984, + "grad_norm": 3.551795482635498, + "learning_rate": 1.369039775758944e-05, + "loss": 1.7617, + "step": 10190 + }, + { + "epoch": 0.7671201942076441, + "grad_norm": 4.31390905380249, + "learning_rate": 1.3682017697979899e-05, + "loss": 1.7804, + "step": 10191 + }, + { + "epoch": 0.7671954684883042, + "grad_norm": 4.623659610748291, + "learning_rate": 1.3673639797370202e-05, + "loss": 1.5581, + "step": 10192 + }, + { + "epoch": 0.7672707427689645, + "grad_norm": 5.457238674163818, + "learning_rate": 1.3665264056258381e-05, + "loss": 1.5742, + "step": 10193 + }, + { + "epoch": 0.7673460170496246, + "grad_norm": 4.827406883239746, + "learning_rate": 1.3656890475142365e-05, + "loss": 1.8533, + "step": 10194 + }, + { + "epoch": 0.7674212913302847, + "grad_norm": 3.542780876159668, + "learning_rate": 1.3648519054519921e-05, + "loss": 1.7031, + "step": 10195 + }, + { + "epoch": 0.7674965656109449, + "grad_norm": 5.623861789703369, + "learning_rate": 1.3640149794888724e-05, + "loss": 1.5998, + "step": 10196 + }, + { + "epoch": 0.767571839891605, + "grad_norm": 5.4970879554748535, + "learning_rate": 1.3631782696746281e-05, + "loss": 1.8656, + "step": 10197 + }, + { + "epoch": 0.7676471141722652, + "grad_norm": 6.528900623321533, + "learning_rate": 1.3623417760590019e-05, + "loss": 1.859, + "step": 10198 + }, + { + "epoch": 0.7677223884529254, + "grad_norm": 5.136155128479004, + "learning_rate": 1.3615054986917181e-05, + "loss": 1.8869, + "step": 10199 + }, + { + "epoch": 0.7677976627335855, + "grad_norm": 4.395875453948975, + "learning_rate": 1.3606694376224927e-05, + "loss": 1.4866, + "step": 10200 + }, + { + "epoch": 0.7678729370142456, + "grad_norm": 5.447878360748291, + "learning_rate": 1.3598335929010264e-05, + "loss": 1.9389, + "step": 10201 + }, + { + "epoch": 0.7679482112949058, + "grad_norm": 4.584583282470703, + "learning_rate": 1.3589979645770095e-05, + "loss": 1.672, + "step": 10202 + }, + { + "epoch": 0.768023485575566, + "grad_norm": 3.8034114837646484, + "learning_rate": 1.3581625527001152e-05, + "loss": 1.7425, + "step": 10203 + }, + { + "epoch": 0.7680987598562261, + "grad_norm": 5.853212356567383, + "learning_rate": 1.3573273573200074e-05, + "loss": 2.0961, + "step": 10204 + }, + { + "epoch": 0.7681740341368862, + "grad_norm": 4.55974817276001, + "learning_rate": 1.3564923784863381e-05, + "loss": 1.535, + "step": 10205 + }, + { + "epoch": 0.7682493084175465, + "grad_norm": 4.903714656829834, + "learning_rate": 1.3556576162487405e-05, + "loss": 1.8728, + "step": 10206 + }, + { + "epoch": 0.7683245826982066, + "grad_norm": 4.199012279510498, + "learning_rate": 1.3548230706568427e-05, + "loss": 1.7993, + "step": 10207 + }, + { + "epoch": 0.7683998569788667, + "grad_norm": 5.503488063812256, + "learning_rate": 1.3539887417602525e-05, + "loss": 1.962, + "step": 10208 + }, + { + "epoch": 0.7684751312595269, + "grad_norm": 4.729506492614746, + "learning_rate": 1.3531546296085718e-05, + "loss": 1.5379, + "step": 10209 + }, + { + "epoch": 0.768550405540187, + "grad_norm": 4.008634567260742, + "learning_rate": 1.3523207342513827e-05, + "loss": 1.68, + "step": 10210 + }, + { + "epoch": 0.7686256798208472, + "grad_norm": 5.311148166656494, + "learning_rate": 1.3514870557382592e-05, + "loss": 1.9655, + "step": 10211 + }, + { + "epoch": 0.7687009541015074, + "grad_norm": 5.845650672912598, + "learning_rate": 1.350653594118762e-05, + "loss": 1.8099, + "step": 10212 + }, + { + "epoch": 0.7687762283821675, + "grad_norm": 4.755159378051758, + "learning_rate": 1.3498203494424389e-05, + "loss": 1.6694, + "step": 10213 + }, + { + "epoch": 0.7688515026628276, + "grad_norm": 4.569772243499756, + "learning_rate": 1.3489873217588206e-05, + "loss": 1.4385, + "step": 10214 + }, + { + "epoch": 0.7689267769434879, + "grad_norm": 6.986860275268555, + "learning_rate": 1.348154511117432e-05, + "loss": 1.6177, + "step": 10215 + }, + { + "epoch": 0.769002051224148, + "grad_norm": 4.347231864929199, + "learning_rate": 1.3473219175677776e-05, + "loss": 1.4043, + "step": 10216 + }, + { + "epoch": 0.7690773255048081, + "grad_norm": 6.113827705383301, + "learning_rate": 1.3464895411593565e-05, + "loss": 2.1064, + "step": 10217 + }, + { + "epoch": 0.7691525997854683, + "grad_norm": 6.7090911865234375, + "learning_rate": 1.3456573819416468e-05, + "loss": 2.2535, + "step": 10218 + }, + { + "epoch": 0.7692278740661285, + "grad_norm": 4.872086048126221, + "learning_rate": 1.3448254399641207e-05, + "loss": 1.6807, + "step": 10219 + }, + { + "epoch": 0.7693031483467886, + "grad_norm": 5.076318740844727, + "learning_rate": 1.3439937152762361e-05, + "loss": 2.1116, + "step": 10220 + }, + { + "epoch": 0.7693784226274488, + "grad_norm": 6.067685127258301, + "learning_rate": 1.343162207927433e-05, + "loss": 1.7678, + "step": 10221 + }, + { + "epoch": 0.7694536969081089, + "grad_norm": 6.3673787117004395, + "learning_rate": 1.3423309179671457e-05, + "loss": 1.6488, + "step": 10222 + }, + { + "epoch": 0.7695289711887691, + "grad_norm": 5.070117473602295, + "learning_rate": 1.3414998454447886e-05, + "loss": 2.1144, + "step": 10223 + }, + { + "epoch": 0.7696042454694292, + "grad_norm": 4.137439250946045, + "learning_rate": 1.340668990409768e-05, + "loss": 1.8164, + "step": 10224 + }, + { + "epoch": 0.7696795197500894, + "grad_norm": 4.388188362121582, + "learning_rate": 1.3398383529114766e-05, + "loss": 1.7565, + "step": 10225 + }, + { + "epoch": 0.7697547940307495, + "grad_norm": 7.138183116912842, + "learning_rate": 1.3390079329992943e-05, + "loss": 2.1584, + "step": 10226 + }, + { + "epoch": 0.7698300683114097, + "grad_norm": 5.42036247253418, + "learning_rate": 1.338177730722584e-05, + "loss": 1.7831, + "step": 10227 + }, + { + "epoch": 0.7699053425920699, + "grad_norm": 6.899005889892578, + "learning_rate": 1.3373477461307027e-05, + "loss": 1.511, + "step": 10228 + }, + { + "epoch": 0.76998061687273, + "grad_norm": 4.203088760375977, + "learning_rate": 1.3365179792729871e-05, + "loss": 1.5412, + "step": 10229 + }, + { + "epoch": 0.7700558911533901, + "grad_norm": 5.174337863922119, + "learning_rate": 1.3356884301987671e-05, + "loss": 2.1092, + "step": 10230 + }, + { + "epoch": 0.7701311654340504, + "grad_norm": 4.662639141082764, + "learning_rate": 1.3348590989573544e-05, + "loss": 1.5012, + "step": 10231 + }, + { + "epoch": 0.7702064397147105, + "grad_norm": 5.123978614807129, + "learning_rate": 1.3340299855980531e-05, + "loss": 1.9079, + "step": 10232 + }, + { + "epoch": 0.7702817139953706, + "grad_norm": 3.7078208923339844, + "learning_rate": 1.3332010901701492e-05, + "loss": 1.8383, + "step": 10233 + }, + { + "epoch": 0.7703569882760308, + "grad_norm": 5.786520957946777, + "learning_rate": 1.3323724127229192e-05, + "loss": 1.8397, + "step": 10234 + }, + { + "epoch": 0.770432262556691, + "grad_norm": 5.370747089385986, + "learning_rate": 1.3315439533056251e-05, + "loss": 1.9631, + "step": 10235 + }, + { + "epoch": 0.7705075368373511, + "grad_norm": 5.924793720245361, + "learning_rate": 1.3307157119675179e-05, + "loss": 2.1299, + "step": 10236 + }, + { + "epoch": 0.7705828111180113, + "grad_norm": 4.776413917541504, + "learning_rate": 1.3298876887578338e-05, + "loss": 1.9125, + "step": 10237 + }, + { + "epoch": 0.7706580853986714, + "grad_norm": 5.023410797119141, + "learning_rate": 1.329059883725795e-05, + "loss": 1.8723, + "step": 10238 + }, + { + "epoch": 0.7707333596793315, + "grad_norm": 4.753942966461182, + "learning_rate": 1.3282322969206145e-05, + "loss": 1.9613, + "step": 10239 + }, + { + "epoch": 0.7708086339599918, + "grad_norm": 4.498771667480469, + "learning_rate": 1.3274049283914869e-05, + "loss": 2.2984, + "step": 10240 + }, + { + "epoch": 0.7708839082406519, + "grad_norm": 7.124361038208008, + "learning_rate": 1.3265777781875998e-05, + "loss": 2.0021, + "step": 10241 + }, + { + "epoch": 0.770959182521312, + "grad_norm": 3.7062511444091797, + "learning_rate": 1.325750846358122e-05, + "loss": 1.8647, + "step": 10242 + }, + { + "epoch": 0.7710344568019721, + "grad_norm": 4.498233795166016, + "learning_rate": 1.3249241329522155e-05, + "loss": 1.6721, + "step": 10243 + }, + { + "epoch": 0.7711097310826324, + "grad_norm": 6.511932373046875, + "learning_rate": 1.3240976380190229e-05, + "loss": 2.442, + "step": 10244 + }, + { + "epoch": 0.7711850053632925, + "grad_norm": 6.102153778076172, + "learning_rate": 1.323271361607678e-05, + "loss": 1.7448, + "step": 10245 + }, + { + "epoch": 0.7712602796439526, + "grad_norm": 4.479430198669434, + "learning_rate": 1.3224453037673012e-05, + "loss": 2.1887, + "step": 10246 + }, + { + "epoch": 0.7713355539246128, + "grad_norm": 5.22567081451416, + "learning_rate": 1.3216194645470003e-05, + "loss": 1.6235, + "step": 10247 + }, + { + "epoch": 0.771410828205273, + "grad_norm": 6.178092002868652, + "learning_rate": 1.3207938439958667e-05, + "loss": 1.5667, + "step": 10248 + }, + { + "epoch": 0.7714861024859331, + "grad_norm": 6.74956750869751, + "learning_rate": 1.3199684421629833e-05, + "loss": 1.6232, + "step": 10249 + }, + { + "epoch": 0.7715613767665933, + "grad_norm": 4.291245937347412, + "learning_rate": 1.3191432590974157e-05, + "loss": 1.8429, + "step": 10250 + }, + { + "epoch": 0.7716366510472534, + "grad_norm": 4.1864495277404785, + "learning_rate": 1.3183182948482209e-05, + "loss": 1.7356, + "step": 10251 + }, + { + "epoch": 0.7717119253279136, + "grad_norm": 8.126055717468262, + "learning_rate": 1.3174935494644385e-05, + "loss": 2.3598, + "step": 10252 + }, + { + "epoch": 0.7717871996085738, + "grad_norm": 4.174500942230225, + "learning_rate": 1.3166690229950978e-05, + "loss": 2.4416, + "step": 10253 + }, + { + "epoch": 0.7718624738892339, + "grad_norm": 5.007815837860107, + "learning_rate": 1.3158447154892168e-05, + "loss": 1.9911, + "step": 10254 + }, + { + "epoch": 0.771937748169894, + "grad_norm": 4.2170023918151855, + "learning_rate": 1.3150206269957948e-05, + "loss": 1.938, + "step": 10255 + }, + { + "epoch": 0.7720130224505543, + "grad_norm": 4.838845252990723, + "learning_rate": 1.3141967575638231e-05, + "loss": 2.2108, + "step": 10256 + }, + { + "epoch": 0.7720882967312144, + "grad_norm": 5.322053909301758, + "learning_rate": 1.313373107242279e-05, + "loss": 1.744, + "step": 10257 + }, + { + "epoch": 0.7721635710118745, + "grad_norm": 4.062689781188965, + "learning_rate": 1.3125496760801265e-05, + "loss": 1.8338, + "step": 10258 + }, + { + "epoch": 0.7722388452925347, + "grad_norm": 6.558047294616699, + "learning_rate": 1.3117264641263139e-05, + "loss": 1.6632, + "step": 10259 + }, + { + "epoch": 0.7723141195731948, + "grad_norm": 5.126962661743164, + "learning_rate": 1.3109034714297813e-05, + "loss": 1.578, + "step": 10260 + }, + { + "epoch": 0.772389393853855, + "grad_norm": 4.497790336608887, + "learning_rate": 1.3100806980394508e-05, + "loss": 1.838, + "step": 10261 + }, + { + "epoch": 0.7724646681345151, + "grad_norm": 4.56712532043457, + "learning_rate": 1.3092581440042368e-05, + "loss": 1.5657, + "step": 10262 + }, + { + "epoch": 0.7725399424151753, + "grad_norm": 5.281686782836914, + "learning_rate": 1.3084358093730348e-05, + "loss": 1.8436, + "step": 10263 + }, + { + "epoch": 0.7726152166958354, + "grad_norm": 4.230472564697266, + "learning_rate": 1.3076136941947331e-05, + "loss": 1.5697, + "step": 10264 + }, + { + "epoch": 0.7726904909764956, + "grad_norm": 4.151181697845459, + "learning_rate": 1.3067917985182016e-05, + "loss": 1.9169, + "step": 10265 + }, + { + "epoch": 0.7727657652571558, + "grad_norm": 4.191083908081055, + "learning_rate": 1.3059701223923e-05, + "loss": 2.2028, + "step": 10266 + }, + { + "epoch": 0.7728410395378159, + "grad_norm": 5.646610736846924, + "learning_rate": 1.3051486658658756e-05, + "loss": 2.0996, + "step": 10267 + }, + { + "epoch": 0.772916313818476, + "grad_norm": 5.301667213439941, + "learning_rate": 1.3043274289877627e-05, + "loss": 1.7049, + "step": 10268 + }, + { + "epoch": 0.7729915880991363, + "grad_norm": 6.5089240074157715, + "learning_rate": 1.3035064118067785e-05, + "loss": 1.537, + "step": 10269 + }, + { + "epoch": 0.7730668623797964, + "grad_norm": 4.733782768249512, + "learning_rate": 1.3026856143717314e-05, + "loss": 1.9638, + "step": 10270 + }, + { + "epoch": 0.7731421366604565, + "grad_norm": 4.956386089324951, + "learning_rate": 1.3018650367314179e-05, + "loss": 1.643, + "step": 10271 + }, + { + "epoch": 0.7732174109411167, + "grad_norm": 4.07127046585083, + "learning_rate": 1.3010446789346149e-05, + "loss": 2.0078, + "step": 10272 + }, + { + "epoch": 0.7732926852217769, + "grad_norm": 4.3423919677734375, + "learning_rate": 1.3002245410300939e-05, + "loss": 1.675, + "step": 10273 + }, + { + "epoch": 0.773367959502437, + "grad_norm": 3.9609344005584717, + "learning_rate": 1.2994046230666068e-05, + "loss": 2.0238, + "step": 10274 + }, + { + "epoch": 0.7734432337830972, + "grad_norm": 4.841676235198975, + "learning_rate": 1.2985849250928978e-05, + "loss": 1.8763, + "step": 10275 + }, + { + "epoch": 0.7735185080637573, + "grad_norm": 7.237201690673828, + "learning_rate": 1.2977654471576929e-05, + "loss": 1.8126, + "step": 10276 + }, + { + "epoch": 0.7735937823444174, + "grad_norm": 5.282929420471191, + "learning_rate": 1.2969461893097113e-05, + "loss": 2.0821, + "step": 10277 + }, + { + "epoch": 0.7736690566250777, + "grad_norm": 4.099306583404541, + "learning_rate": 1.2961271515976515e-05, + "loss": 1.6924, + "step": 10278 + }, + { + "epoch": 0.7737443309057378, + "grad_norm": 4.538743019104004, + "learning_rate": 1.2953083340702049e-05, + "loss": 2.0157, + "step": 10279 + }, + { + "epoch": 0.7738196051863979, + "grad_norm": 4.927515506744385, + "learning_rate": 1.2944897367760478e-05, + "loss": 1.6174, + "step": 10280 + }, + { + "epoch": 0.773894879467058, + "grad_norm": 6.47377347946167, + "learning_rate": 1.2936713597638456e-05, + "loss": 2.0051, + "step": 10281 + }, + { + "epoch": 0.7739701537477183, + "grad_norm": 4.01516580581665, + "learning_rate": 1.2928532030822444e-05, + "loss": 1.7607, + "step": 10282 + }, + { + "epoch": 0.7740454280283784, + "grad_norm": 4.944225311279297, + "learning_rate": 1.2920352667798852e-05, + "loss": 1.7969, + "step": 10283 + }, + { + "epoch": 0.7741207023090385, + "grad_norm": 4.958349227905273, + "learning_rate": 1.291217550905388e-05, + "loss": 1.919, + "step": 10284 + }, + { + "epoch": 0.7741959765896987, + "grad_norm": 6.7434186935424805, + "learning_rate": 1.2904000555073664e-05, + "loss": 1.7989, + "step": 10285 + }, + { + "epoch": 0.7742712508703589, + "grad_norm": 4.951135635375977, + "learning_rate": 1.2895827806344185e-05, + "loss": 1.8776, + "step": 10286 + }, + { + "epoch": 0.774346525151019, + "grad_norm": 4.431406497955322, + "learning_rate": 1.2887657263351265e-05, + "loss": 1.8321, + "step": 10287 + }, + { + "epoch": 0.7744217994316792, + "grad_norm": 5.4372334480285645, + "learning_rate": 1.2879488926580647e-05, + "loss": 2.1528, + "step": 10288 + }, + { + "epoch": 0.7744970737123393, + "grad_norm": 3.8832361698150635, + "learning_rate": 1.2871322796517888e-05, + "loss": 1.6191, + "step": 10289 + }, + { + "epoch": 0.7745723479929995, + "grad_norm": 4.155890464782715, + "learning_rate": 1.2863158873648456e-05, + "loss": 1.6831, + "step": 10290 + }, + { + "epoch": 0.7746476222736597, + "grad_norm": 3.9918711185455322, + "learning_rate": 1.285499715845767e-05, + "loss": 1.2899, + "step": 10291 + }, + { + "epoch": 0.7747228965543198, + "grad_norm": 4.175673484802246, + "learning_rate": 1.2846837651430737e-05, + "loss": 2.0, + "step": 10292 + }, + { + "epoch": 0.7747981708349799, + "grad_norm": 5.092034816741943, + "learning_rate": 1.283868035305269e-05, + "loss": 1.568, + "step": 10293 + }, + { + "epoch": 0.7748734451156402, + "grad_norm": 4.143556118011475, + "learning_rate": 1.283052526380848e-05, + "loss": 2.292, + "step": 10294 + }, + { + "epoch": 0.7749487193963003, + "grad_norm": 6.161681175231934, + "learning_rate": 1.2822372384182874e-05, + "loss": 1.6774, + "step": 10295 + }, + { + "epoch": 0.7750239936769604, + "grad_norm": 5.420450210571289, + "learning_rate": 1.2814221714660574e-05, + "loss": 1.7932, + "step": 10296 + }, + { + "epoch": 0.7750992679576206, + "grad_norm": 5.200966835021973, + "learning_rate": 1.2806073255726076e-05, + "loss": 1.9951, + "step": 10297 + }, + { + "epoch": 0.7751745422382808, + "grad_norm": 4.236879348754883, + "learning_rate": 1.2797927007863819e-05, + "loss": 1.6228, + "step": 10298 + }, + { + "epoch": 0.7752498165189409, + "grad_norm": 4.418315887451172, + "learning_rate": 1.2789782971558046e-05, + "loss": 1.8796, + "step": 10299 + }, + { + "epoch": 0.775325090799601, + "grad_norm": 4.553370475769043, + "learning_rate": 1.27816411472929e-05, + "loss": 1.8835, + "step": 10300 + }, + { + "epoch": 0.7754003650802612, + "grad_norm": 5.340518951416016, + "learning_rate": 1.2773501535552402e-05, + "loss": 1.6591, + "step": 10301 + }, + { + "epoch": 0.7754756393609213, + "grad_norm": 4.0605974197387695, + "learning_rate": 1.2765364136820418e-05, + "loss": 1.7191, + "step": 10302 + }, + { + "epoch": 0.7755509136415815, + "grad_norm": 5.216078758239746, + "learning_rate": 1.2757228951580718e-05, + "loss": 1.322, + "step": 10303 + }, + { + "epoch": 0.7756261879222417, + "grad_norm": 4.079835414886475, + "learning_rate": 1.2749095980316877e-05, + "loss": 1.7638, + "step": 10304 + }, + { + "epoch": 0.7757014622029018, + "grad_norm": 4.159818172454834, + "learning_rate": 1.2740965223512413e-05, + "loss": 1.7083, + "step": 10305 + }, + { + "epoch": 0.7757767364835619, + "grad_norm": 4.0179338455200195, + "learning_rate": 1.273283668165064e-05, + "loss": 1.9264, + "step": 10306 + }, + { + "epoch": 0.7758520107642222, + "grad_norm": 5.417019367218018, + "learning_rate": 1.2724710355214809e-05, + "loss": 1.6962, + "step": 10307 + }, + { + "epoch": 0.7759272850448823, + "grad_norm": 4.357333660125732, + "learning_rate": 1.2716586244687984e-05, + "loss": 1.6524, + "step": 10308 + }, + { + "epoch": 0.7760025593255424, + "grad_norm": 6.675785541534424, + "learning_rate": 1.2708464350553134e-05, + "loss": 2.1753, + "step": 10309 + }, + { + "epoch": 0.7760778336062026, + "grad_norm": 6.394741058349609, + "learning_rate": 1.2700344673293069e-05, + "loss": 1.9177, + "step": 10310 + }, + { + "epoch": 0.7761531078868628, + "grad_norm": 3.9632022380828857, + "learning_rate": 1.269222721339049e-05, + "loss": 1.4868, + "step": 10311 + }, + { + "epoch": 0.7762283821675229, + "grad_norm": 4.983537673950195, + "learning_rate": 1.268411197132795e-05, + "loss": 1.9336, + "step": 10312 + }, + { + "epoch": 0.7763036564481831, + "grad_norm": 4.459327697753906, + "learning_rate": 1.2675998947587898e-05, + "loss": 2.0253, + "step": 10313 + }, + { + "epoch": 0.7763789307288432, + "grad_norm": 7.048152446746826, + "learning_rate": 1.2667888142652601e-05, + "loss": 1.9392, + "step": 10314 + }, + { + "epoch": 0.7764542050095034, + "grad_norm": 5.153658866882324, + "learning_rate": 1.2659779557004248e-05, + "loss": 1.6958, + "step": 10315 + }, + { + "epoch": 0.7765294792901636, + "grad_norm": 4.004358768463135, + "learning_rate": 1.2651673191124847e-05, + "loss": 2.0261, + "step": 10316 + }, + { + "epoch": 0.7766047535708237, + "grad_norm": 5.73649787902832, + "learning_rate": 1.2643569045496312e-05, + "loss": 2.1094, + "step": 10317 + }, + { + "epoch": 0.7766800278514838, + "grad_norm": 6.659619331359863, + "learning_rate": 1.263546712060042e-05, + "loss": 1.8192, + "step": 10318 + }, + { + "epoch": 0.776755302132144, + "grad_norm": 5.289032936096191, + "learning_rate": 1.2627367416918783e-05, + "loss": 1.7693, + "step": 10319 + }, + { + "epoch": 0.7768305764128042, + "grad_norm": 5.239470958709717, + "learning_rate": 1.2619269934932937e-05, + "loss": 1.8994, + "step": 10320 + }, + { + "epoch": 0.7769058506934643, + "grad_norm": 4.476828575134277, + "learning_rate": 1.2611174675124222e-05, + "loss": 1.6465, + "step": 10321 + }, + { + "epoch": 0.7769811249741244, + "grad_norm": 4.582423210144043, + "learning_rate": 1.26030816379739e-05, + "loss": 1.8044, + "step": 10322 + }, + { + "epoch": 0.7770563992547846, + "grad_norm": 4.477261543273926, + "learning_rate": 1.259499082396306e-05, + "loss": 1.5441, + "step": 10323 + }, + { + "epoch": 0.7771316735354448, + "grad_norm": 4.7444071769714355, + "learning_rate": 1.258690223357269e-05, + "loss": 1.3621, + "step": 10324 + }, + { + "epoch": 0.7772069478161049, + "grad_norm": 4.55178689956665, + "learning_rate": 1.2578815867283628e-05, + "loss": 2.0456, + "step": 10325 + }, + { + "epoch": 0.7772822220967651, + "grad_norm": 4.64877986907959, + "learning_rate": 1.2570731725576606e-05, + "loss": 1.779, + "step": 10326 + }, + { + "epoch": 0.7773574963774252, + "grad_norm": 5.220911502838135, + "learning_rate": 1.2562649808932175e-05, + "loss": 1.5407, + "step": 10327 + }, + { + "epoch": 0.7774327706580854, + "grad_norm": 4.980185031890869, + "learning_rate": 1.2554570117830805e-05, + "loss": 1.9126, + "step": 10328 + }, + { + "epoch": 0.7775080449387456, + "grad_norm": 5.095905780792236, + "learning_rate": 1.2546492652752784e-05, + "loss": 1.8974, + "step": 10329 + }, + { + "epoch": 0.7775833192194057, + "grad_norm": 5.476496696472168, + "learning_rate": 1.2538417414178322e-05, + "loss": 1.8037, + "step": 10330 + }, + { + "epoch": 0.7776585935000658, + "grad_norm": 4.375895977020264, + "learning_rate": 1.2530344402587441e-05, + "loss": 1.8401, + "step": 10331 + }, + { + "epoch": 0.7777338677807261, + "grad_norm": 4.5472893714904785, + "learning_rate": 1.2522273618460073e-05, + "loss": 1.8458, + "step": 10332 + }, + { + "epoch": 0.7778091420613862, + "grad_norm": 7.180160999298096, + "learning_rate": 1.2514205062276019e-05, + "loss": 2.0163, + "step": 10333 + }, + { + "epoch": 0.7778844163420463, + "grad_norm": 4.065024375915527, + "learning_rate": 1.25061387345149e-05, + "loss": 1.9184, + "step": 10334 + }, + { + "epoch": 0.7779596906227065, + "grad_norm": 7.553528785705566, + "learning_rate": 1.2498074635656248e-05, + "loss": 2.3549, + "step": 10335 + }, + { + "epoch": 0.7780349649033667, + "grad_norm": 4.331940174102783, + "learning_rate": 1.249001276617946e-05, + "loss": 1.7035, + "step": 10336 + }, + { + "epoch": 0.7781102391840268, + "grad_norm": 4.262084007263184, + "learning_rate": 1.2481953126563795e-05, + "loss": 1.6999, + "step": 10337 + }, + { + "epoch": 0.778185513464687, + "grad_norm": 4.418400287628174, + "learning_rate": 1.2473895717288353e-05, + "loss": 1.3919, + "step": 10338 + }, + { + "epoch": 0.7782607877453471, + "grad_norm": 4.201472759246826, + "learning_rate": 1.2465840538832147e-05, + "loss": 1.6041, + "step": 10339 + }, + { + "epoch": 0.7783360620260072, + "grad_norm": 4.977296829223633, + "learning_rate": 1.2457787591674014e-05, + "loss": 2.0329, + "step": 10340 + }, + { + "epoch": 0.7784113363066674, + "grad_norm": 5.484498023986816, + "learning_rate": 1.2449736876292695e-05, + "loss": 2.1246, + "step": 10341 + }, + { + "epoch": 0.7784866105873276, + "grad_norm": 9.492502212524414, + "learning_rate": 1.2441688393166767e-05, + "loss": 1.8732, + "step": 10342 + }, + { + "epoch": 0.7785618848679877, + "grad_norm": 4.51957893371582, + "learning_rate": 1.243364214277471e-05, + "loss": 1.5185, + "step": 10343 + }, + { + "epoch": 0.7786371591486478, + "grad_norm": 3.8268020153045654, + "learning_rate": 1.2425598125594823e-05, + "loss": 1.6918, + "step": 10344 + }, + { + "epoch": 0.7787124334293081, + "grad_norm": 4.446866512298584, + "learning_rate": 1.2417556342105314e-05, + "loss": 1.7997, + "step": 10345 + }, + { + "epoch": 0.7787877077099682, + "grad_norm": 4.687377452850342, + "learning_rate": 1.2409516792784242e-05, + "loss": 1.6313, + "step": 10346 + }, + { + "epoch": 0.7788629819906283, + "grad_norm": 3.6155290603637695, + "learning_rate": 1.2401479478109557e-05, + "loss": 1.8244, + "step": 10347 + }, + { + "epoch": 0.7789382562712885, + "grad_norm": 4.625990390777588, + "learning_rate": 1.2393444398559018e-05, + "loss": 2.1789, + "step": 10348 + }, + { + "epoch": 0.7790135305519487, + "grad_norm": 7.053936004638672, + "learning_rate": 1.2385411554610304e-05, + "loss": 1.8304, + "step": 10349 + }, + { + "epoch": 0.7790888048326088, + "grad_norm": 6.356447696685791, + "learning_rate": 1.2377380946740958e-05, + "loss": 2.0246, + "step": 10350 + }, + { + "epoch": 0.779164079113269, + "grad_norm": 4.5922698974609375, + "learning_rate": 1.236935257542835e-05, + "loss": 1.85, + "step": 10351 + }, + { + "epoch": 0.7792393533939291, + "grad_norm": 4.202937126159668, + "learning_rate": 1.2361326441149773e-05, + "loss": 1.8956, + "step": 10352 + }, + { + "epoch": 0.7793146276745893, + "grad_norm": 6.506840705871582, + "learning_rate": 1.2353302544382329e-05, + "loss": 1.9281, + "step": 10353 + }, + { + "epoch": 0.7793899019552495, + "grad_norm": 4.456070423126221, + "learning_rate": 1.2345280885603039e-05, + "loss": 1.8459, + "step": 10354 + }, + { + "epoch": 0.7794651762359096, + "grad_norm": 5.902978897094727, + "learning_rate": 1.2337261465288742e-05, + "loss": 1.9373, + "step": 10355 + }, + { + "epoch": 0.7795404505165697, + "grad_norm": 6.452949047088623, + "learning_rate": 1.2329244283916187e-05, + "loss": 1.959, + "step": 10356 + }, + { + "epoch": 0.77961572479723, + "grad_norm": 5.0876784324646, + "learning_rate": 1.2321229341961965e-05, + "loss": 2.0362, + "step": 10357 + }, + { + "epoch": 0.7796909990778901, + "grad_norm": 6.603688716888428, + "learning_rate": 1.2313216639902564e-05, + "loss": 1.4617, + "step": 10358 + }, + { + "epoch": 0.7797662733585502, + "grad_norm": 6.878844261169434, + "learning_rate": 1.2305206178214279e-05, + "loss": 2.064, + "step": 10359 + }, + { + "epoch": 0.7798415476392103, + "grad_norm": 4.643365859985352, + "learning_rate": 1.2297197957373353e-05, + "loss": 1.7397, + "step": 10360 + }, + { + "epoch": 0.7799168219198706, + "grad_norm": 5.111471652984619, + "learning_rate": 1.2289191977855801e-05, + "loss": 1.8848, + "step": 10361 + }, + { + "epoch": 0.7799920962005307, + "grad_norm": 4.444356918334961, + "learning_rate": 1.2281188240137604e-05, + "loss": 1.6391, + "step": 10362 + }, + { + "epoch": 0.7800673704811908, + "grad_norm": 4.751591205596924, + "learning_rate": 1.2273186744694521e-05, + "loss": 1.8169, + "step": 10363 + }, + { + "epoch": 0.780142644761851, + "grad_norm": 4.49291467666626, + "learning_rate": 1.2265187492002239e-05, + "loss": 1.7499, + "step": 10364 + }, + { + "epoch": 0.7802179190425111, + "grad_norm": 7.507765293121338, + "learning_rate": 1.2257190482536296e-05, + "loss": 1.9189, + "step": 10365 + }, + { + "epoch": 0.7802931933231713, + "grad_norm": 5.165387153625488, + "learning_rate": 1.2249195716772071e-05, + "loss": 1.5551, + "step": 10366 + }, + { + "epoch": 0.7803684676038315, + "grad_norm": 7.934799671173096, + "learning_rate": 1.2241203195184842e-05, + "loss": 1.7629, + "step": 10367 + }, + { + "epoch": 0.7804437418844916, + "grad_norm": 4.580837249755859, + "learning_rate": 1.2233212918249742e-05, + "loss": 1.9449, + "step": 10368 + }, + { + "epoch": 0.7805190161651517, + "grad_norm": 4.414484977722168, + "learning_rate": 1.2225224886441784e-05, + "loss": 2.0188, + "step": 10369 + }, + { + "epoch": 0.780594290445812, + "grad_norm": 6.710933685302734, + "learning_rate": 1.22172391002358e-05, + "loss": 1.935, + "step": 10370 + }, + { + "epoch": 0.7806695647264721, + "grad_norm": 6.412884712219238, + "learning_rate": 1.220925556010656e-05, + "loss": 1.9024, + "step": 10371 + }, + { + "epoch": 0.7807448390071322, + "grad_norm": 4.9875874519348145, + "learning_rate": 1.2201274266528622e-05, + "loss": 1.5656, + "step": 10372 + }, + { + "epoch": 0.7808201132877924, + "grad_norm": 8.993953704833984, + "learning_rate": 1.2193295219976491e-05, + "loss": 2.255, + "step": 10373 + }, + { + "epoch": 0.7808953875684526, + "grad_norm": 4.583989143371582, + "learning_rate": 1.2185318420924462e-05, + "loss": 1.6245, + "step": 10374 + }, + { + "epoch": 0.7809706618491127, + "grad_norm": 5.050039291381836, + "learning_rate": 1.2177343869846769e-05, + "loss": 1.8833, + "step": 10375 + }, + { + "epoch": 0.7810459361297729, + "grad_norm": 5.000247955322266, + "learning_rate": 1.216937156721744e-05, + "loss": 1.9465, + "step": 10376 + }, + { + "epoch": 0.781121210410433, + "grad_norm": 4.74833869934082, + "learning_rate": 1.2161401513510439e-05, + "loss": 1.8926, + "step": 10377 + }, + { + "epoch": 0.7811964846910932, + "grad_norm": 4.246920108795166, + "learning_rate": 1.2153433709199536e-05, + "loss": 1.7957, + "step": 10378 + }, + { + "epoch": 0.7812717589717533, + "grad_norm": 6.072246551513672, + "learning_rate": 1.21454681547584e-05, + "loss": 1.8872, + "step": 10379 + }, + { + "epoch": 0.7813470332524135, + "grad_norm": 4.116084098815918, + "learning_rate": 1.2137504850660564e-05, + "loss": 1.8675, + "step": 10380 + }, + { + "epoch": 0.7814223075330736, + "grad_norm": 5.24302339553833, + "learning_rate": 1.2129543797379428e-05, + "loss": 1.6275, + "step": 10381 + }, + { + "epoch": 0.7814975818137337, + "grad_norm": 4.412202835083008, + "learning_rate": 1.2121584995388268e-05, + "loss": 1.6991, + "step": 10382 + }, + { + "epoch": 0.781572856094394, + "grad_norm": 4.712615489959717, + "learning_rate": 1.2113628445160175e-05, + "loss": 1.6041, + "step": 10383 + }, + { + "epoch": 0.7816481303750541, + "grad_norm": 4.713449001312256, + "learning_rate": 1.2105674147168177e-05, + "loss": 1.9424, + "step": 10384 + }, + { + "epoch": 0.7817234046557142, + "grad_norm": 5.1208014488220215, + "learning_rate": 1.2097722101885112e-05, + "loss": 1.757, + "step": 10385 + }, + { + "epoch": 0.7817986789363744, + "grad_norm": 4.6546630859375, + "learning_rate": 1.208977230978372e-05, + "loss": 1.7464, + "step": 10386 + }, + { + "epoch": 0.7818739532170346, + "grad_norm": 5.556705474853516, + "learning_rate": 1.208182477133658e-05, + "loss": 2.2877, + "step": 10387 + }, + { + "epoch": 0.7819492274976947, + "grad_norm": 4.568760871887207, + "learning_rate": 1.2073879487016171e-05, + "loss": 1.7213, + "step": 10388 + }, + { + "epoch": 0.7820245017783549, + "grad_norm": 4.479852676391602, + "learning_rate": 1.2065936457294785e-05, + "loss": 1.6587, + "step": 10389 + }, + { + "epoch": 0.782099776059015, + "grad_norm": 4.923946857452393, + "learning_rate": 1.205799568264464e-05, + "loss": 1.9684, + "step": 10390 + }, + { + "epoch": 0.7821750503396752, + "grad_norm": 3.9960007667541504, + "learning_rate": 1.2050057163537775e-05, + "loss": 1.746, + "step": 10391 + }, + { + "epoch": 0.7822503246203354, + "grad_norm": 4.612290382385254, + "learning_rate": 1.2042120900446141e-05, + "loss": 1.8248, + "step": 10392 + }, + { + "epoch": 0.7823255989009955, + "grad_norm": 6.955273151397705, + "learning_rate": 1.2034186893841493e-05, + "loss": 1.9221, + "step": 10393 + }, + { + "epoch": 0.7824008731816556, + "grad_norm": 4.870214939117432, + "learning_rate": 1.2026255144195508e-05, + "loss": 1.6902, + "step": 10394 + }, + { + "epoch": 0.7824761474623159, + "grad_norm": 4.886004447937012, + "learning_rate": 1.2018325651979684e-05, + "loss": 1.9584, + "step": 10395 + }, + { + "epoch": 0.782551421742976, + "grad_norm": 4.791713237762451, + "learning_rate": 1.201039841766543e-05, + "loss": 2.0985, + "step": 10396 + }, + { + "epoch": 0.7826266960236361, + "grad_norm": 4.8301920890808105, + "learning_rate": 1.2002473441723972e-05, + "loss": 1.5059, + "step": 10397 + }, + { + "epoch": 0.7827019703042963, + "grad_norm": 5.076519966125488, + "learning_rate": 1.1994550724626441e-05, + "loss": 1.7431, + "step": 10398 + }, + { + "epoch": 0.7827772445849565, + "grad_norm": 4.058077812194824, + "learning_rate": 1.1986630266843835e-05, + "loss": 1.6869, + "step": 10399 + }, + { + "epoch": 0.7828525188656166, + "grad_norm": 4.515242099761963, + "learning_rate": 1.1978712068846976e-05, + "loss": 1.1763, + "step": 10400 + }, + { + "epoch": 0.7829277931462767, + "grad_norm": 10.128247261047363, + "learning_rate": 1.1970796131106582e-05, + "loss": 2.315, + "step": 10401 + }, + { + "epoch": 0.7830030674269369, + "grad_norm": 4.249345779418945, + "learning_rate": 1.1962882454093244e-05, + "loss": 2.0062, + "step": 10402 + }, + { + "epoch": 0.783078341707597, + "grad_norm": 7.606321334838867, + "learning_rate": 1.1954971038277418e-05, + "loss": 1.9329, + "step": 10403 + }, + { + "epoch": 0.7831536159882572, + "grad_norm": 4.880064487457275, + "learning_rate": 1.1947061884129385e-05, + "loss": 2.4716, + "step": 10404 + }, + { + "epoch": 0.7832288902689174, + "grad_norm": 4.390942096710205, + "learning_rate": 1.1939154992119356e-05, + "loss": 1.6747, + "step": 10405 + }, + { + "epoch": 0.7833041645495775, + "grad_norm": 4.513258934020996, + "learning_rate": 1.1931250362717333e-05, + "loss": 2.0628, + "step": 10406 + }, + { + "epoch": 0.7833794388302376, + "grad_norm": 6.1467742919921875, + "learning_rate": 1.1923347996393259e-05, + "loss": 1.569, + "step": 10407 + }, + { + "epoch": 0.7834547131108979, + "grad_norm": 6.095576763153076, + "learning_rate": 1.1915447893616883e-05, + "loss": 1.8377, + "step": 10408 + }, + { + "epoch": 0.783529987391558, + "grad_norm": 6.952891826629639, + "learning_rate": 1.1907550054857863e-05, + "loss": 2.0903, + "step": 10409 + }, + { + "epoch": 0.7836052616722181, + "grad_norm": 5.326104640960693, + "learning_rate": 1.189965448058568e-05, + "loss": 1.6673, + "step": 10410 + }, + { + "epoch": 0.7836805359528783, + "grad_norm": 4.444530963897705, + "learning_rate": 1.1891761171269717e-05, + "loss": 1.6171, + "step": 10411 + }, + { + "epoch": 0.7837558102335385, + "grad_norm": 5.441180229187012, + "learning_rate": 1.1883870127379204e-05, + "loss": 1.7582, + "step": 10412 + }, + { + "epoch": 0.7838310845141986, + "grad_norm": 5.535923957824707, + "learning_rate": 1.1875981349383248e-05, + "loss": 1.5503, + "step": 10413 + }, + { + "epoch": 0.7839063587948588, + "grad_norm": 3.8517143726348877, + "learning_rate": 1.186809483775082e-05, + "loss": 1.8827, + "step": 10414 + }, + { + "epoch": 0.7839816330755189, + "grad_norm": 5.057762622833252, + "learning_rate": 1.1860210592950732e-05, + "loss": 1.6214, + "step": 10415 + }, + { + "epoch": 0.7840569073561791, + "grad_norm": 6.8897318840026855, + "learning_rate": 1.1852328615451702e-05, + "loss": 2.1726, + "step": 10416 + }, + { + "epoch": 0.7841321816368393, + "grad_norm": 5.734986782073975, + "learning_rate": 1.1844448905722267e-05, + "loss": 1.6404, + "step": 10417 + }, + { + "epoch": 0.7842074559174994, + "grad_norm": 5.420649528503418, + "learning_rate": 1.1836571464230873e-05, + "loss": 1.9012, + "step": 10418 + }, + { + "epoch": 0.7842827301981595, + "grad_norm": 6.2140069007873535, + "learning_rate": 1.182869629144579e-05, + "loss": 2.1754, + "step": 10419 + }, + { + "epoch": 0.7843580044788196, + "grad_norm": 4.264763832092285, + "learning_rate": 1.18208233878352e-05, + "loss": 2.1013, + "step": 10420 + }, + { + "epoch": 0.7844332787594799, + "grad_norm": 4.4977312088012695, + "learning_rate": 1.1812952753867096e-05, + "loss": 1.7753, + "step": 10421 + }, + { + "epoch": 0.78450855304014, + "grad_norm": 5.05929708480835, + "learning_rate": 1.180508439000938e-05, + "loss": 2.11, + "step": 10422 + }, + { + "epoch": 0.7845838273208001, + "grad_norm": 6.788955211639404, + "learning_rate": 1.1797218296729823e-05, + "loss": 1.8552, + "step": 10423 + }, + { + "epoch": 0.7846591016014604, + "grad_norm": 4.922468185424805, + "learning_rate": 1.1789354474496e-05, + "loss": 1.8283, + "step": 10424 + }, + { + "epoch": 0.7847343758821205, + "grad_norm": 4.120331287384033, + "learning_rate": 1.1781492923775422e-05, + "loss": 1.8723, + "step": 10425 + }, + { + "epoch": 0.7848096501627806, + "grad_norm": 5.124722957611084, + "learning_rate": 1.1773633645035443e-05, + "loss": 1.9874, + "step": 10426 + }, + { + "epoch": 0.7848849244434408, + "grad_norm": 5.40078592300415, + "learning_rate": 1.1765776638743242e-05, + "loss": 1.8642, + "step": 10427 + }, + { + "epoch": 0.7849601987241009, + "grad_norm": 4.8090925216674805, + "learning_rate": 1.1757921905365927e-05, + "loss": 1.6028, + "step": 10428 + }, + { + "epoch": 0.7850354730047611, + "grad_norm": 5.225217342376709, + "learning_rate": 1.175006944537041e-05, + "loss": 1.9427, + "step": 10429 + }, + { + "epoch": 0.7851107472854213, + "grad_norm": 4.434376239776611, + "learning_rate": 1.174221925922352e-05, + "loss": 1.912, + "step": 10430 + }, + { + "epoch": 0.7851860215660814, + "grad_norm": 5.809786796569824, + "learning_rate": 1.1734371347391932e-05, + "loss": 1.7531, + "step": 10431 + }, + { + "epoch": 0.7852612958467415, + "grad_norm": 4.614532470703125, + "learning_rate": 1.1726525710342157e-05, + "loss": 1.6316, + "step": 10432 + }, + { + "epoch": 0.7853365701274018, + "grad_norm": 4.726720333099365, + "learning_rate": 1.1718682348540621e-05, + "loss": 2.2078, + "step": 10433 + }, + { + "epoch": 0.7854118444080619, + "grad_norm": 3.8608624935150146, + "learning_rate": 1.1710841262453564e-05, + "loss": 1.9004, + "step": 10434 + }, + { + "epoch": 0.785487118688722, + "grad_norm": 4.820503234863281, + "learning_rate": 1.1703002452547135e-05, + "loss": 1.6875, + "step": 10435 + }, + { + "epoch": 0.7855623929693822, + "grad_norm": 5.726373195648193, + "learning_rate": 1.1695165919287321e-05, + "loss": 2.0559, + "step": 10436 + }, + { + "epoch": 0.7856376672500424, + "grad_norm": 7.237523555755615, + "learning_rate": 1.1687331663139994e-05, + "loss": 1.9674, + "step": 10437 + }, + { + "epoch": 0.7857129415307025, + "grad_norm": 4.502978801727295, + "learning_rate": 1.167949968457086e-05, + "loss": 1.8373, + "step": 10438 + }, + { + "epoch": 0.7857882158113626, + "grad_norm": 5.338217735290527, + "learning_rate": 1.167166998404553e-05, + "loss": 1.8209, + "step": 10439 + }, + { + "epoch": 0.7858634900920228, + "grad_norm": 5.1488423347473145, + "learning_rate": 1.1663842562029425e-05, + "loss": 2.2347, + "step": 10440 + }, + { + "epoch": 0.785938764372683, + "grad_norm": 4.778382301330566, + "learning_rate": 1.1656017418987896e-05, + "loss": 1.5278, + "step": 10441 + }, + { + "epoch": 0.7860140386533431, + "grad_norm": 4.175078392028809, + "learning_rate": 1.16481945553861e-05, + "loss": 1.7219, + "step": 10442 + }, + { + "epoch": 0.7860893129340033, + "grad_norm": 4.212646007537842, + "learning_rate": 1.1640373971689106e-05, + "loss": 1.7759, + "step": 10443 + }, + { + "epoch": 0.7861645872146634, + "grad_norm": 4.006810188293457, + "learning_rate": 1.1632555668361799e-05, + "loss": 1.7194, + "step": 10444 + }, + { + "epoch": 0.7862398614953235, + "grad_norm": 4.763530254364014, + "learning_rate": 1.1624739645868966e-05, + "loss": 2.0957, + "step": 10445 + }, + { + "epoch": 0.7863151357759838, + "grad_norm": 5.659134864807129, + "learning_rate": 1.1616925904675253e-05, + "loss": 1.6673, + "step": 10446 + }, + { + "epoch": 0.7863904100566439, + "grad_norm": 5.201047897338867, + "learning_rate": 1.1609114445245161e-05, + "loss": 1.722, + "step": 10447 + }, + { + "epoch": 0.786465684337304, + "grad_norm": 5.603242874145508, + "learning_rate": 1.1601305268043079e-05, + "loss": 1.9005, + "step": 10448 + }, + { + "epoch": 0.7865409586179642, + "grad_norm": 6.72194766998291, + "learning_rate": 1.1593498373533196e-05, + "loss": 2.188, + "step": 10449 + }, + { + "epoch": 0.7866162328986244, + "grad_norm": 4.8227152824401855, + "learning_rate": 1.1585693762179656e-05, + "loss": 2.0663, + "step": 10450 + }, + { + "epoch": 0.7866915071792845, + "grad_norm": 3.574622392654419, + "learning_rate": 1.157789143444638e-05, + "loss": 1.7861, + "step": 10451 + }, + { + "epoch": 0.7867667814599447, + "grad_norm": 4.818136692047119, + "learning_rate": 1.1570091390797232e-05, + "loss": 1.7715, + "step": 10452 + }, + { + "epoch": 0.7868420557406048, + "grad_norm": 5.031071662902832, + "learning_rate": 1.1562293631695864e-05, + "loss": 1.6933, + "step": 10453 + }, + { + "epoch": 0.786917330021265, + "grad_norm": 7.095625877380371, + "learning_rate": 1.1554498157605864e-05, + "loss": 2.1685, + "step": 10454 + }, + { + "epoch": 0.7869926043019252, + "grad_norm": 4.141712665557861, + "learning_rate": 1.1546704968990623e-05, + "loss": 1.9408, + "step": 10455 + }, + { + "epoch": 0.7870678785825853, + "grad_norm": 4.2152862548828125, + "learning_rate": 1.153891406631344e-05, + "loss": 1.9961, + "step": 10456 + }, + { + "epoch": 0.7871431528632454, + "grad_norm": 4.70906925201416, + "learning_rate": 1.1531125450037455e-05, + "loss": 1.8631, + "step": 10457 + }, + { + "epoch": 0.7872184271439056, + "grad_norm": 4.998145580291748, + "learning_rate": 1.1523339120625698e-05, + "loss": 2.0323, + "step": 10458 + }, + { + "epoch": 0.7872937014245658, + "grad_norm": 5.832976818084717, + "learning_rate": 1.1515555078541012e-05, + "loss": 1.8167, + "step": 10459 + }, + { + "epoch": 0.7873689757052259, + "grad_norm": 5.334981441497803, + "learning_rate": 1.1507773324246168e-05, + "loss": 1.8554, + "step": 10460 + }, + { + "epoch": 0.787444249985886, + "grad_norm": 4.192061901092529, + "learning_rate": 1.1499993858203739e-05, + "loss": 1.7941, + "step": 10461 + }, + { + "epoch": 0.7875195242665463, + "grad_norm": 4.096248149871826, + "learning_rate": 1.1492216680876212e-05, + "loss": 1.5605, + "step": 10462 + }, + { + "epoch": 0.7875947985472064, + "grad_norm": 4.156639575958252, + "learning_rate": 1.148444179272592e-05, + "loss": 1.8762, + "step": 10463 + }, + { + "epoch": 0.7876700728278665, + "grad_norm": 8.860349655151367, + "learning_rate": 1.1476669194215034e-05, + "loss": 1.9457, + "step": 10464 + }, + { + "epoch": 0.7877453471085267, + "grad_norm": 4.781268119812012, + "learning_rate": 1.1468898885805646e-05, + "loss": 1.8275, + "step": 10465 + }, + { + "epoch": 0.7878206213891868, + "grad_norm": 5.10231876373291, + "learning_rate": 1.1461130867959647e-05, + "loss": 1.9615, + "step": 10466 + }, + { + "epoch": 0.787895895669847, + "grad_norm": 5.899445056915283, + "learning_rate": 1.1453365141138834e-05, + "loss": 1.5259, + "step": 10467 + }, + { + "epoch": 0.7879711699505072, + "grad_norm": 6.294816970825195, + "learning_rate": 1.1445601705804864e-05, + "loss": 2.1219, + "step": 10468 + }, + { + "epoch": 0.7880464442311673, + "grad_norm": 4.7341532707214355, + "learning_rate": 1.143784056241926e-05, + "loss": 1.7959, + "step": 10469 + }, + { + "epoch": 0.7881217185118274, + "grad_norm": 4.954004287719727, + "learning_rate": 1.1430081711443375e-05, + "loss": 1.6121, + "step": 10470 + }, + { + "epoch": 0.7881969927924877, + "grad_norm": 4.701150894165039, + "learning_rate": 1.1422325153338475e-05, + "loss": 1.8551, + "step": 10471 + }, + { + "epoch": 0.7882722670731478, + "grad_norm": 4.160506725311279, + "learning_rate": 1.141457088856564e-05, + "loss": 1.8352, + "step": 10472 + }, + { + "epoch": 0.7883475413538079, + "grad_norm": 4.863999843597412, + "learning_rate": 1.1406818917585865e-05, + "loss": 1.8032, + "step": 10473 + }, + { + "epoch": 0.7884228156344681, + "grad_norm": 6.536059379577637, + "learning_rate": 1.1399069240859955e-05, + "loss": 1.9012, + "step": 10474 + }, + { + "epoch": 0.7884980899151283, + "grad_norm": 5.541990756988525, + "learning_rate": 1.1391321858848636e-05, + "loss": 1.453, + "step": 10475 + }, + { + "epoch": 0.7885733641957884, + "grad_norm": 5.1296186447143555, + "learning_rate": 1.1383576772012438e-05, + "loss": 1.9429, + "step": 10476 + }, + { + "epoch": 0.7886486384764485, + "grad_norm": 3.924842357635498, + "learning_rate": 1.1375833980811795e-05, + "loss": 1.4478, + "step": 10477 + }, + { + "epoch": 0.7887239127571087, + "grad_norm": 4.032488822937012, + "learning_rate": 1.1368093485707015e-05, + "loss": 1.6431, + "step": 10478 + }, + { + "epoch": 0.7887991870377689, + "grad_norm": 4.2381134033203125, + "learning_rate": 1.1360355287158214e-05, + "loss": 1.8313, + "step": 10479 + }, + { + "epoch": 0.788874461318429, + "grad_norm": 4.998696327209473, + "learning_rate": 1.1352619385625418e-05, + "loss": 1.6417, + "step": 10480 + }, + { + "epoch": 0.7889497355990892, + "grad_norm": 4.638940334320068, + "learning_rate": 1.134488578156851e-05, + "loss": 2.2269, + "step": 10481 + }, + { + "epoch": 0.7890250098797493, + "grad_norm": 4.457995414733887, + "learning_rate": 1.1337154475447243e-05, + "loss": 1.831, + "step": 10482 + }, + { + "epoch": 0.7891002841604094, + "grad_norm": 6.432929039001465, + "learning_rate": 1.1329425467721194e-05, + "loss": 2.1138, + "step": 10483 + }, + { + "epoch": 0.7891755584410697, + "grad_norm": 5.354211330413818, + "learning_rate": 1.1321698758849853e-05, + "loss": 1.4494, + "step": 10484 + }, + { + "epoch": 0.7892508327217298, + "grad_norm": 4.447618007659912, + "learning_rate": 1.1313974349292528e-05, + "loss": 1.8217, + "step": 10485 + }, + { + "epoch": 0.7893261070023899, + "grad_norm": 4.113140106201172, + "learning_rate": 1.1306252239508441e-05, + "loss": 1.9076, + "step": 10486 + }, + { + "epoch": 0.7894013812830502, + "grad_norm": 5.005544662475586, + "learning_rate": 1.1298532429956615e-05, + "loss": 1.9262, + "step": 10487 + }, + { + "epoch": 0.7894766555637103, + "grad_norm": 6.521586894989014, + "learning_rate": 1.1290814921096011e-05, + "loss": 1.8046, + "step": 10488 + }, + { + "epoch": 0.7895519298443704, + "grad_norm": 5.025541305541992, + "learning_rate": 1.128309971338537e-05, + "loss": 1.7758, + "step": 10489 + }, + { + "epoch": 0.7896272041250306, + "grad_norm": 4.610903739929199, + "learning_rate": 1.1275386807283362e-05, + "loss": 1.8613, + "step": 10490 + }, + { + "epoch": 0.7897024784056907, + "grad_norm": 5.4119553565979, + "learning_rate": 1.126767620324849e-05, + "loss": 2.0248, + "step": 10491 + }, + { + "epoch": 0.7897777526863509, + "grad_norm": 5.321690082550049, + "learning_rate": 1.1259967901739155e-05, + "loss": 1.5541, + "step": 10492 + }, + { + "epoch": 0.7898530269670111, + "grad_norm": 4.882321357727051, + "learning_rate": 1.1252261903213552e-05, + "loss": 1.8298, + "step": 10493 + }, + { + "epoch": 0.7899283012476712, + "grad_norm": 4.279666423797607, + "learning_rate": 1.1244558208129796e-05, + "loss": 2.0404, + "step": 10494 + }, + { + "epoch": 0.7900035755283313, + "grad_norm": 4.211197376251221, + "learning_rate": 1.1236856816945873e-05, + "loss": 1.7322, + "step": 10495 + }, + { + "epoch": 0.7900788498089916, + "grad_norm": 5.199526309967041, + "learning_rate": 1.1229157730119571e-05, + "loss": 2.3412, + "step": 10496 + }, + { + "epoch": 0.7901541240896517, + "grad_norm": 5.16616678237915, + "learning_rate": 1.122146094810861e-05, + "loss": 1.8211, + "step": 10497 + }, + { + "epoch": 0.7902293983703118, + "grad_norm": 5.986471176147461, + "learning_rate": 1.1213766471370513e-05, + "loss": 1.8823, + "step": 10498 + }, + { + "epoch": 0.7903046726509719, + "grad_norm": 4.395902156829834, + "learning_rate": 1.1206074300362723e-05, + "loss": 1.893, + "step": 10499 + }, + { + "epoch": 0.7903799469316322, + "grad_norm": 6.345595836639404, + "learning_rate": 1.1198384435542492e-05, + "loss": 2.0792, + "step": 10500 + }, + { + "epoch": 0.7904552212122923, + "grad_norm": 4.27605676651001, + "learning_rate": 1.119069687736697e-05, + "loss": 1.7656, + "step": 10501 + }, + { + "epoch": 0.7905304954929524, + "grad_norm": 4.311114311218262, + "learning_rate": 1.118301162629316e-05, + "loss": 1.9608, + "step": 10502 + }, + { + "epoch": 0.7906057697736126, + "grad_norm": 5.204464435577393, + "learning_rate": 1.1175328682777952e-05, + "loss": 1.7361, + "step": 10503 + }, + { + "epoch": 0.7906810440542728, + "grad_norm": 4.836484909057617, + "learning_rate": 1.1167648047278034e-05, + "loss": 1.7384, + "step": 10504 + }, + { + "epoch": 0.7907563183349329, + "grad_norm": 4.623745441436768, + "learning_rate": 1.115996972025003e-05, + "loss": 2.1583, + "step": 10505 + }, + { + "epoch": 0.7908315926155931, + "grad_norm": 5.195585250854492, + "learning_rate": 1.1152293702150368e-05, + "loss": 1.7657, + "step": 10506 + }, + { + "epoch": 0.7909068668962532, + "grad_norm": 4.8195624351501465, + "learning_rate": 1.1144619993435396e-05, + "loss": 2.0471, + "step": 10507 + }, + { + "epoch": 0.7909821411769133, + "grad_norm": 4.018946647644043, + "learning_rate": 1.1136948594561258e-05, + "loss": 1.8078, + "step": 10508 + }, + { + "epoch": 0.7910574154575736, + "grad_norm": 4.304199695587158, + "learning_rate": 1.1129279505984036e-05, + "loss": 1.854, + "step": 10509 + }, + { + "epoch": 0.7911326897382337, + "grad_norm": 4.6675705909729, + "learning_rate": 1.1121612728159598e-05, + "loss": 1.6067, + "step": 10510 + }, + { + "epoch": 0.7912079640188938, + "grad_norm": 6.016103267669678, + "learning_rate": 1.1113948261543727e-05, + "loss": 1.7509, + "step": 10511 + }, + { + "epoch": 0.791283238299554, + "grad_norm": 6.147856712341309, + "learning_rate": 1.1106286106592057e-05, + "loss": 1.978, + "step": 10512 + }, + { + "epoch": 0.7913585125802142, + "grad_norm": 4.859924793243408, + "learning_rate": 1.1098626263760077e-05, + "loss": 1.8824, + "step": 10513 + }, + { + "epoch": 0.7914337868608743, + "grad_norm": 4.324882507324219, + "learning_rate": 1.109096873350316e-05, + "loss": 1.7933, + "step": 10514 + }, + { + "epoch": 0.7915090611415345, + "grad_norm": 5.222978115081787, + "learning_rate": 1.1083313516276495e-05, + "loss": 2.0191, + "step": 10515 + }, + { + "epoch": 0.7915843354221946, + "grad_norm": 4.9351420402526855, + "learning_rate": 1.1075660612535193e-05, + "loss": 1.7355, + "step": 10516 + }, + { + "epoch": 0.7916596097028548, + "grad_norm": 5.0382280349731445, + "learning_rate": 1.1068010022734165e-05, + "loss": 1.8204, + "step": 10517 + }, + { + "epoch": 0.7917348839835149, + "grad_norm": 4.4992995262146, + "learning_rate": 1.1060361747328247e-05, + "loss": 1.8748, + "step": 10518 + }, + { + "epoch": 0.7918101582641751, + "grad_norm": 8.730916023254395, + "learning_rate": 1.1052715786772077e-05, + "loss": 1.6461, + "step": 10519 + }, + { + "epoch": 0.7918854325448352, + "grad_norm": 6.294488906860352, + "learning_rate": 1.1045072141520219e-05, + "loss": 1.7078, + "step": 10520 + }, + { + "epoch": 0.7919607068254954, + "grad_norm": 5.546000003814697, + "learning_rate": 1.1037430812027033e-05, + "loss": 1.878, + "step": 10521 + }, + { + "epoch": 0.7920359811061556, + "grad_norm": 5.042482852935791, + "learning_rate": 1.1029791798746792e-05, + "loss": 1.6006, + "step": 10522 + }, + { + "epoch": 0.7921112553868157, + "grad_norm": 4.686640739440918, + "learning_rate": 1.1022155102133603e-05, + "loss": 1.7746, + "step": 10523 + }, + { + "epoch": 0.7921865296674758, + "grad_norm": 5.204477310180664, + "learning_rate": 1.1014520722641476e-05, + "loss": 2.017, + "step": 10524 + }, + { + "epoch": 0.7922618039481361, + "grad_norm": 4.673209190368652, + "learning_rate": 1.1006888660724207e-05, + "loss": 1.4741, + "step": 10525 + }, + { + "epoch": 0.7923370782287962, + "grad_norm": 5.0145769119262695, + "learning_rate": 1.0999258916835531e-05, + "loss": 2.0601, + "step": 10526 + }, + { + "epoch": 0.7924123525094563, + "grad_norm": 4.081797122955322, + "learning_rate": 1.0991631491429017e-05, + "loss": 1.6026, + "step": 10527 + }, + { + "epoch": 0.7924876267901165, + "grad_norm": 4.916884422302246, + "learning_rate": 1.0984006384958068e-05, + "loss": 1.8145, + "step": 10528 + }, + { + "epoch": 0.7925629010707766, + "grad_norm": 4.483363628387451, + "learning_rate": 1.0976383597876006e-05, + "loss": 1.9069, + "step": 10529 + }, + { + "epoch": 0.7926381753514368, + "grad_norm": 5.311499118804932, + "learning_rate": 1.0968763130635951e-05, + "loss": 1.8131, + "step": 10530 + }, + { + "epoch": 0.792713449632097, + "grad_norm": 4.320664405822754, + "learning_rate": 1.0961144983690952e-05, + "loss": 2.151, + "step": 10531 + }, + { + "epoch": 0.7927887239127571, + "grad_norm": 4.578085422515869, + "learning_rate": 1.0953529157493852e-05, + "loss": 1.9826, + "step": 10532 + }, + { + "epoch": 0.7928639981934172, + "grad_norm": 3.807785987854004, + "learning_rate": 1.0945915652497424e-05, + "loss": 1.8156, + "step": 10533 + }, + { + "epoch": 0.7929392724740775, + "grad_norm": 6.542794227600098, + "learning_rate": 1.0938304469154237e-05, + "loss": 1.7456, + "step": 10534 + }, + { + "epoch": 0.7930145467547376, + "grad_norm": 5.614415168762207, + "learning_rate": 1.093069560791677e-05, + "loss": 2.1634, + "step": 10535 + }, + { + "epoch": 0.7930898210353977, + "grad_norm": 4.348136901855469, + "learning_rate": 1.0923089069237341e-05, + "loss": 1.6034, + "step": 10536 + }, + { + "epoch": 0.7931650953160578, + "grad_norm": 5.081855297088623, + "learning_rate": 1.0915484853568164e-05, + "loss": 1.786, + "step": 10537 + }, + { + "epoch": 0.7932403695967181, + "grad_norm": 4.064887046813965, + "learning_rate": 1.0907882961361248e-05, + "loss": 1.5823, + "step": 10538 + }, + { + "epoch": 0.7933156438773782, + "grad_norm": 5.364055633544922, + "learning_rate": 1.090028339306854e-05, + "loss": 1.8118, + "step": 10539 + }, + { + "epoch": 0.7933909181580383, + "grad_norm": 5.807515621185303, + "learning_rate": 1.0892686149141773e-05, + "loss": 1.8948, + "step": 10540 + }, + { + "epoch": 0.7934661924386985, + "grad_norm": 5.550662994384766, + "learning_rate": 1.0885091230032624e-05, + "loss": 1.9071, + "step": 10541 + }, + { + "epoch": 0.7935414667193587, + "grad_norm": 5.765417098999023, + "learning_rate": 1.0877498636192552e-05, + "loss": 1.8936, + "step": 10542 + }, + { + "epoch": 0.7936167410000188, + "grad_norm": 5.395507335662842, + "learning_rate": 1.0869908368072928e-05, + "loss": 1.8087, + "step": 10543 + }, + { + "epoch": 0.793692015280679, + "grad_norm": 5.470346927642822, + "learning_rate": 1.0862320426124989e-05, + "loss": 2.0436, + "step": 10544 + }, + { + "epoch": 0.7937672895613391, + "grad_norm": 4.59730339050293, + "learning_rate": 1.0854734810799794e-05, + "loss": 1.9371, + "step": 10545 + }, + { + "epoch": 0.7938425638419992, + "grad_norm": 6.259925842285156, + "learning_rate": 1.0847151522548287e-05, + "loss": 1.8583, + "step": 10546 + }, + { + "epoch": 0.7939178381226595, + "grad_norm": 5.511729717254639, + "learning_rate": 1.0839570561821282e-05, + "loss": 2.0519, + "step": 10547 + }, + { + "epoch": 0.7939931124033196, + "grad_norm": 4.435070514678955, + "learning_rate": 1.0831991929069463e-05, + "loss": 2.1082, + "step": 10548 + }, + { + "epoch": 0.7940683866839797, + "grad_norm": 4.86382532119751, + "learning_rate": 1.0824415624743316e-05, + "loss": 1.7053, + "step": 10549 + }, + { + "epoch": 0.79414366096464, + "grad_norm": 6.5566630363464355, + "learning_rate": 1.0816841649293275e-05, + "loss": 1.915, + "step": 10550 + }, + { + "epoch": 0.7942189352453001, + "grad_norm": 5.961004257202148, + "learning_rate": 1.0809270003169548e-05, + "loss": 1.5648, + "step": 10551 + }, + { + "epoch": 0.7942942095259602, + "grad_norm": 5.56597375869751, + "learning_rate": 1.0801700686822286e-05, + "loss": 1.941, + "step": 10552 + }, + { + "epoch": 0.7943694838066204, + "grad_norm": 4.410744667053223, + "learning_rate": 1.0794133700701432e-05, + "loss": 1.8975, + "step": 10553 + }, + { + "epoch": 0.7944447580872805, + "grad_norm": 4.48292875289917, + "learning_rate": 1.0786569045256845e-05, + "loss": 1.7614, + "step": 10554 + }, + { + "epoch": 0.7945200323679407, + "grad_norm": 4.816855430603027, + "learning_rate": 1.0779006720938201e-05, + "loss": 1.6495, + "step": 10555 + }, + { + "epoch": 0.7945953066486008, + "grad_norm": 4.901763439178467, + "learning_rate": 1.0771446728195073e-05, + "loss": 2.1345, + "step": 10556 + }, + { + "epoch": 0.794670580929261, + "grad_norm": 4.171257972717285, + "learning_rate": 1.0763889067476873e-05, + "loss": 1.6871, + "step": 10557 + }, + { + "epoch": 0.7947458552099211, + "grad_norm": 5.240748405456543, + "learning_rate": 1.0756333739232888e-05, + "loss": 1.7006, + "step": 10558 + }, + { + "epoch": 0.7948211294905813, + "grad_norm": 4.132002830505371, + "learning_rate": 1.0748780743912273e-05, + "loss": 1.9255, + "step": 10559 + }, + { + "epoch": 0.7948964037712415, + "grad_norm": 5.472195625305176, + "learning_rate": 1.0741230081964004e-05, + "loss": 1.8117, + "step": 10560 + }, + { + "epoch": 0.7949716780519016, + "grad_norm": 5.17474365234375, + "learning_rate": 1.0733681753836977e-05, + "loss": 1.805, + "step": 10561 + }, + { + "epoch": 0.7950469523325617, + "grad_norm": 5.187584400177002, + "learning_rate": 1.0726135759979888e-05, + "loss": 1.7891, + "step": 10562 + }, + { + "epoch": 0.795122226613222, + "grad_norm": 4.606949806213379, + "learning_rate": 1.0718592100841352e-05, + "loss": 1.9286, + "step": 10563 + }, + { + "epoch": 0.7951975008938821, + "grad_norm": 5.152125835418701, + "learning_rate": 1.0711050776869791e-05, + "loss": 1.8753, + "step": 10564 + }, + { + "epoch": 0.7952727751745422, + "grad_norm": 4.856875419616699, + "learning_rate": 1.0703511788513543e-05, + "loss": 1.8397, + "step": 10565 + }, + { + "epoch": 0.7953480494552024, + "grad_norm": 6.894735813140869, + "learning_rate": 1.0695975136220749e-05, + "loss": 2.0339, + "step": 10566 + }, + { + "epoch": 0.7954233237358626, + "grad_norm": 4.199718952178955, + "learning_rate": 1.0688440820439455e-05, + "loss": 1.9094, + "step": 10567 + }, + { + "epoch": 0.7954985980165227, + "grad_norm": 4.852376461029053, + "learning_rate": 1.0680908841617559e-05, + "loss": 1.8294, + "step": 10568 + }, + { + "epoch": 0.7955738722971829, + "grad_norm": 5.831381797790527, + "learning_rate": 1.0673379200202831e-05, + "loss": 1.5391, + "step": 10569 + }, + { + "epoch": 0.795649146577843, + "grad_norm": 4.5340423583984375, + "learning_rate": 1.0665851896642854e-05, + "loss": 2.0763, + "step": 10570 + }, + { + "epoch": 0.7957244208585031, + "grad_norm": 4.982690334320068, + "learning_rate": 1.0658326931385127e-05, + "loss": 1.7669, + "step": 10571 + }, + { + "epoch": 0.7957996951391634, + "grad_norm": 4.154668807983398, + "learning_rate": 1.065080430487697e-05, + "loss": 1.952, + "step": 10572 + }, + { + "epoch": 0.7958749694198235, + "grad_norm": 4.445095062255859, + "learning_rate": 1.0643284017565608e-05, + "loss": 2.0379, + "step": 10573 + }, + { + "epoch": 0.7959502437004836, + "grad_norm": 5.1501240730285645, + "learning_rate": 1.0635766069898068e-05, + "loss": 1.8313, + "step": 10574 + }, + { + "epoch": 0.7960255179811437, + "grad_norm": 3.782294988632202, + "learning_rate": 1.0628250462321287e-05, + "loss": 1.7792, + "step": 10575 + }, + { + "epoch": 0.796100792261804, + "grad_norm": 4.5195794105529785, + "learning_rate": 1.062073719528206e-05, + "loss": 1.8392, + "step": 10576 + }, + { + "epoch": 0.7961760665424641, + "grad_norm": 5.385557174682617, + "learning_rate": 1.0613226269227e-05, + "loss": 2.0307, + "step": 10577 + }, + { + "epoch": 0.7962513408231242, + "grad_norm": 4.283694267272949, + "learning_rate": 1.060571768460264e-05, + "loss": 1.4571, + "step": 10578 + }, + { + "epoch": 0.7963266151037844, + "grad_norm": 6.535726070404053, + "learning_rate": 1.0598211441855311e-05, + "loss": 1.6028, + "step": 10579 + }, + { + "epoch": 0.7964018893844446, + "grad_norm": 4.400607109069824, + "learning_rate": 1.059070754143126e-05, + "loss": 1.6233, + "step": 10580 + }, + { + "epoch": 0.7964771636651047, + "grad_norm": 4.905205726623535, + "learning_rate": 1.0583205983776567e-05, + "loss": 1.9233, + "step": 10581 + }, + { + "epoch": 0.7965524379457649, + "grad_norm": 4.403603553771973, + "learning_rate": 1.057570676933719e-05, + "loss": 1.9596, + "step": 10582 + }, + { + "epoch": 0.796627712226425, + "grad_norm": 4.7504448890686035, + "learning_rate": 1.0568209898558911e-05, + "loss": 1.5788, + "step": 10583 + }, + { + "epoch": 0.7967029865070852, + "grad_norm": 7.158041000366211, + "learning_rate": 1.0560715371887425e-05, + "loss": 2.176, + "step": 10584 + }, + { + "epoch": 0.7967782607877454, + "grad_norm": 4.335253715515137, + "learning_rate": 1.0553223189768235e-05, + "loss": 1.6456, + "step": 10585 + }, + { + "epoch": 0.7968535350684055, + "grad_norm": 4.165205955505371, + "learning_rate": 1.0545733352646753e-05, + "loss": 1.7837, + "step": 10586 + }, + { + "epoch": 0.7969288093490656, + "grad_norm": 5.156903266906738, + "learning_rate": 1.0538245860968204e-05, + "loss": 1.7138, + "step": 10587 + }, + { + "epoch": 0.7970040836297259, + "grad_norm": 5.625891208648682, + "learning_rate": 1.0530760715177724e-05, + "loss": 1.8802, + "step": 10588 + }, + { + "epoch": 0.797079357910386, + "grad_norm": 4.726641654968262, + "learning_rate": 1.0523277915720253e-05, + "loss": 1.7543, + "step": 10589 + }, + { + "epoch": 0.7971546321910461, + "grad_norm": 4.422852516174316, + "learning_rate": 1.0515797463040639e-05, + "loss": 1.9818, + "step": 10590 + }, + { + "epoch": 0.7972299064717063, + "grad_norm": 4.1001105308532715, + "learning_rate": 1.0508319357583574e-05, + "loss": 2.0495, + "step": 10591 + }, + { + "epoch": 0.7973051807523664, + "grad_norm": 4.357824325561523, + "learning_rate": 1.0500843599793609e-05, + "loss": 1.4581, + "step": 10592 + }, + { + "epoch": 0.7973804550330266, + "grad_norm": 6.033605098724365, + "learning_rate": 1.0493370190115171e-05, + "loss": 1.5925, + "step": 10593 + }, + { + "epoch": 0.7974557293136868, + "grad_norm": 5.0049285888671875, + "learning_rate": 1.0485899128992499e-05, + "loss": 1.7696, + "step": 10594 + }, + { + "epoch": 0.7975310035943469, + "grad_norm": 6.114724159240723, + "learning_rate": 1.0478430416869767e-05, + "loss": 1.8543, + "step": 10595 + }, + { + "epoch": 0.797606277875007, + "grad_norm": 4.8095383644104, + "learning_rate": 1.0470964054190928e-05, + "loss": 1.8121, + "step": 10596 + }, + { + "epoch": 0.7976815521556672, + "grad_norm": 4.194774150848389, + "learning_rate": 1.0463500041399866e-05, + "loss": 2.0228, + "step": 10597 + }, + { + "epoch": 0.7977568264363274, + "grad_norm": 6.96627140045166, + "learning_rate": 1.0456038378940276e-05, + "loss": 2.0407, + "step": 10598 + }, + { + "epoch": 0.7978321007169875, + "grad_norm": 4.906794548034668, + "learning_rate": 1.0448579067255749e-05, + "loss": 1.6905, + "step": 10599 + }, + { + "epoch": 0.7979073749976476, + "grad_norm": 6.082169055938721, + "learning_rate": 1.0441122106789697e-05, + "loss": 1.9021, + "step": 10600 + }, + { + "epoch": 0.7979826492783079, + "grad_norm": 3.8410072326660156, + "learning_rate": 1.0433667497985433e-05, + "loss": 1.5913, + "step": 10601 + }, + { + "epoch": 0.798057923558968, + "grad_norm": 5.016761302947998, + "learning_rate": 1.0426215241286103e-05, + "loss": 1.9341, + "step": 10602 + }, + { + "epoch": 0.7981331978396281, + "grad_norm": 7.987148761749268, + "learning_rate": 1.041876533713474e-05, + "loss": 1.6058, + "step": 10603 + }, + { + "epoch": 0.7982084721202883, + "grad_norm": 5.406179428100586, + "learning_rate": 1.0411317785974196e-05, + "loss": 1.8621, + "step": 10604 + }, + { + "epoch": 0.7982837464009485, + "grad_norm": 9.935291290283203, + "learning_rate": 1.0403872588247232e-05, + "loss": 1.6798, + "step": 10605 + }, + { + "epoch": 0.7983590206816086, + "grad_norm": 4.945474624633789, + "learning_rate": 1.0396429744396414e-05, + "loss": 1.4806, + "step": 10606 + }, + { + "epoch": 0.7984342949622688, + "grad_norm": 6.583197593688965, + "learning_rate": 1.0388989254864206e-05, + "loss": 2.2207, + "step": 10607 + }, + { + "epoch": 0.7985095692429289, + "grad_norm": 4.793309211730957, + "learning_rate": 1.0381551120092953e-05, + "loss": 2.1315, + "step": 10608 + }, + { + "epoch": 0.798584843523589, + "grad_norm": 5.367116928100586, + "learning_rate": 1.0374115340524787e-05, + "loss": 1.8995, + "step": 10609 + }, + { + "epoch": 0.7986601178042493, + "grad_norm": 5.316595554351807, + "learning_rate": 1.0366681916601784e-05, + "loss": 1.6572, + "step": 10610 + }, + { + "epoch": 0.7987353920849094, + "grad_norm": 6.079329490661621, + "learning_rate": 1.03592508487658e-05, + "loss": 1.783, + "step": 10611 + }, + { + "epoch": 0.7988106663655695, + "grad_norm": 6.35397481918335, + "learning_rate": 1.0351822137458617e-05, + "loss": 1.7655, + "step": 10612 + }, + { + "epoch": 0.7988859406462298, + "grad_norm": 4.46872091293335, + "learning_rate": 1.0344395783121842e-05, + "loss": 1.6872, + "step": 10613 + }, + { + "epoch": 0.7989612149268899, + "grad_norm": 4.64361572265625, + "learning_rate": 1.0336971786196964e-05, + "loss": 1.5952, + "step": 10614 + }, + { + "epoch": 0.79903648920755, + "grad_norm": 4.38369083404541, + "learning_rate": 1.03295501471253e-05, + "loss": 1.9921, + "step": 10615 + }, + { + "epoch": 0.7991117634882101, + "grad_norm": 4.6624908447265625, + "learning_rate": 1.0322130866348063e-05, + "loss": 1.7404, + "step": 10616 + }, + { + "epoch": 0.7991870377688703, + "grad_norm": 6.090721130371094, + "learning_rate": 1.031471394430628e-05, + "loss": 2.0913, + "step": 10617 + }, + { + "epoch": 0.7992623120495305, + "grad_norm": 7.265249252319336, + "learning_rate": 1.0307299381440904e-05, + "loss": 2.1835, + "step": 10618 + }, + { + "epoch": 0.7993375863301906, + "grad_norm": 4.3720383644104, + "learning_rate": 1.0299887178192668e-05, + "loss": 2.2123, + "step": 10619 + }, + { + "epoch": 0.7994128606108508, + "grad_norm": 5.423922538757324, + "learning_rate": 1.0292477335002243e-05, + "loss": 1.7212, + "step": 10620 + }, + { + "epoch": 0.7994881348915109, + "grad_norm": 4.756758213043213, + "learning_rate": 1.0285069852310097e-05, + "loss": 1.6806, + "step": 10621 + }, + { + "epoch": 0.7995634091721711, + "grad_norm": 4.531630992889404, + "learning_rate": 1.0277664730556591e-05, + "loss": 2.0157, + "step": 10622 + }, + { + "epoch": 0.7996386834528313, + "grad_norm": 5.678662300109863, + "learning_rate": 1.0270261970181938e-05, + "loss": 1.8567, + "step": 10623 + }, + { + "epoch": 0.7997139577334914, + "grad_norm": 5.319336414337158, + "learning_rate": 1.0262861571626232e-05, + "loss": 2.177, + "step": 10624 + }, + { + "epoch": 0.7997892320141515, + "grad_norm": 5.703810691833496, + "learning_rate": 1.0255463535329369e-05, + "loss": 2.0952, + "step": 10625 + }, + { + "epoch": 0.7998645062948118, + "grad_norm": 5.950246334075928, + "learning_rate": 1.0248067861731158e-05, + "loss": 1.6458, + "step": 10626 + }, + { + "epoch": 0.7999397805754719, + "grad_norm": 4.7629523277282715, + "learning_rate": 1.0240674551271267e-05, + "loss": 1.9041, + "step": 10627 + }, + { + "epoch": 0.800015054856132, + "grad_norm": 5.508734226226807, + "learning_rate": 1.0233283604389183e-05, + "loss": 1.8321, + "step": 10628 + }, + { + "epoch": 0.8000903291367922, + "grad_norm": 5.226073265075684, + "learning_rate": 1.0225895021524289e-05, + "loss": 1.4688, + "step": 10629 + }, + { + "epoch": 0.8001656034174524, + "grad_norm": 5.285341262817383, + "learning_rate": 1.0218508803115806e-05, + "loss": 2.0605, + "step": 10630 + }, + { + "epoch": 0.8002408776981125, + "grad_norm": 5.178981781005859, + "learning_rate": 1.021112494960284e-05, + "loss": 1.6965, + "step": 10631 + }, + { + "epoch": 0.8003161519787727, + "grad_norm": 5.68692684173584, + "learning_rate": 1.0203743461424315e-05, + "loss": 1.7158, + "step": 10632 + }, + { + "epoch": 0.8003914262594328, + "grad_norm": 4.835432052612305, + "learning_rate": 1.0196364339019065e-05, + "loss": 1.9185, + "step": 10633 + }, + { + "epoch": 0.800466700540093, + "grad_norm": 5.050309658050537, + "learning_rate": 1.0188987582825732e-05, + "loss": 1.7044, + "step": 10634 + }, + { + "epoch": 0.8005419748207531, + "grad_norm": 5.335615634918213, + "learning_rate": 1.0181613193282857e-05, + "loss": 1.892, + "step": 10635 + }, + { + "epoch": 0.8006172491014133, + "grad_norm": 5.324244976043701, + "learning_rate": 1.0174241170828824e-05, + "loss": 1.4616, + "step": 10636 + }, + { + "epoch": 0.8006925233820734, + "grad_norm": 4.925962448120117, + "learning_rate": 1.01668715159019e-05, + "loss": 1.5167, + "step": 10637 + }, + { + "epoch": 0.8007677976627335, + "grad_norm": 5.607387065887451, + "learning_rate": 1.0159504228940154e-05, + "loss": 1.6402, + "step": 10638 + }, + { + "epoch": 0.8008430719433938, + "grad_norm": 4.65199089050293, + "learning_rate": 1.0152139310381565e-05, + "loss": 2.0347, + "step": 10639 + }, + { + "epoch": 0.8009183462240539, + "grad_norm": 8.321224212646484, + "learning_rate": 1.0144776760663972e-05, + "loss": 1.8323, + "step": 10640 + }, + { + "epoch": 0.800993620504714, + "grad_norm": 4.292596340179443, + "learning_rate": 1.0137416580225029e-05, + "loss": 1.7162, + "step": 10641 + }, + { + "epoch": 0.8010688947853742, + "grad_norm": 3.7240381240844727, + "learning_rate": 1.0130058769502304e-05, + "loss": 1.519, + "step": 10642 + }, + { + "epoch": 0.8011441690660344, + "grad_norm": 5.344223499298096, + "learning_rate": 1.0122703328933175e-05, + "loss": 2.2044, + "step": 10643 + }, + { + "epoch": 0.8012194433466945, + "grad_norm": 6.579819202423096, + "learning_rate": 1.0115350258954926e-05, + "loss": 1.9823, + "step": 10644 + }, + { + "epoch": 0.8012947176273547, + "grad_norm": 4.223052501678467, + "learning_rate": 1.0107999560004643e-05, + "loss": 1.7825, + "step": 10645 + }, + { + "epoch": 0.8013699919080148, + "grad_norm": 6.044129848480225, + "learning_rate": 1.0100651232519325e-05, + "loss": 1.8163, + "step": 10646 + }, + { + "epoch": 0.801445266188675, + "grad_norm": 4.213158130645752, + "learning_rate": 1.009330527693581e-05, + "loss": 1.7388, + "step": 10647 + }, + { + "epoch": 0.8015205404693352, + "grad_norm": 5.015080451965332, + "learning_rate": 1.008596169369081e-05, + "loss": 1.7258, + "step": 10648 + }, + { + "epoch": 0.8015958147499953, + "grad_norm": 5.202444076538086, + "learning_rate": 1.0078620483220841e-05, + "loss": 1.6329, + "step": 10649 + }, + { + "epoch": 0.8016710890306554, + "grad_norm": 4.469363212585449, + "learning_rate": 1.007128164596235e-05, + "loss": 1.7105, + "step": 10650 + }, + { + "epoch": 0.8017463633113157, + "grad_norm": 5.312971115112305, + "learning_rate": 1.006394518235158e-05, + "loss": 1.6272, + "step": 10651 + }, + { + "epoch": 0.8018216375919758, + "grad_norm": 4.956375598907471, + "learning_rate": 1.0056611092824702e-05, + "loss": 1.7256, + "step": 10652 + }, + { + "epoch": 0.8018969118726359, + "grad_norm": 4.4215898513793945, + "learning_rate": 1.0049279377817667e-05, + "loss": 1.7335, + "step": 10653 + }, + { + "epoch": 0.801972186153296, + "grad_norm": 4.372335433959961, + "learning_rate": 1.0041950037766351e-05, + "loss": 1.8173, + "step": 10654 + }, + { + "epoch": 0.8020474604339562, + "grad_norm": 3.806447744369507, + "learning_rate": 1.0034623073106441e-05, + "loss": 1.8572, + "step": 10655 + }, + { + "epoch": 0.8021227347146164, + "grad_norm": 8.996606826782227, + "learning_rate": 1.0027298484273518e-05, + "loss": 2.282, + "step": 10656 + }, + { + "epoch": 0.8021980089952765, + "grad_norm": 5.268815994262695, + "learning_rate": 1.0019976271703007e-05, + "loss": 1.7133, + "step": 10657 + }, + { + "epoch": 0.8022732832759367, + "grad_norm": 4.3197550773620605, + "learning_rate": 1.0012656435830186e-05, + "loss": 1.9446, + "step": 10658 + }, + { + "epoch": 0.8023485575565968, + "grad_norm": 6.176807880401611, + "learning_rate": 1.0005338977090223e-05, + "loss": 1.9034, + "step": 10659 + }, + { + "epoch": 0.802423831837257, + "grad_norm": 5.208347320556641, + "learning_rate": 9.99802389591808e-06, + "loss": 1.8544, + "step": 10660 + }, + { + "epoch": 0.8024991061179172, + "grad_norm": 3.647977590560913, + "learning_rate": 9.990711192748658e-06, + "loss": 1.7485, + "step": 10661 + }, + { + "epoch": 0.8025743803985773, + "grad_norm": 5.0032429695129395, + "learning_rate": 9.98340086801664e-06, + "loss": 1.9411, + "step": 10662 + }, + { + "epoch": 0.8026496546792374, + "grad_norm": 4.905118942260742, + "learning_rate": 9.976092922156632e-06, + "loss": 1.772, + "step": 10663 + }, + { + "epoch": 0.8027249289598977, + "grad_norm": 5.962714672088623, + "learning_rate": 9.968787355603044e-06, + "loss": 1.7948, + "step": 10664 + }, + { + "epoch": 0.8028002032405578, + "grad_norm": 6.093608379364014, + "learning_rate": 9.961484168790197e-06, + "loss": 1.6562, + "step": 10665 + }, + { + "epoch": 0.8028754775212179, + "grad_norm": 4.597022533416748, + "learning_rate": 9.954183362152225e-06, + "loss": 1.6775, + "step": 10666 + }, + { + "epoch": 0.8029507518018781, + "grad_norm": 6.098922252655029, + "learning_rate": 9.946884936123147e-06, + "loss": 1.8179, + "step": 10667 + }, + { + "epoch": 0.8030260260825383, + "grad_norm": 6.503742694854736, + "learning_rate": 9.93958889113683e-06, + "loss": 1.9091, + "step": 10668 + }, + { + "epoch": 0.8031013003631984, + "grad_norm": 3.827249050140381, + "learning_rate": 9.932295227627026e-06, + "loss": 1.7099, + "step": 10669 + }, + { + "epoch": 0.8031765746438586, + "grad_norm": 4.976996898651123, + "learning_rate": 9.925003946027284e-06, + "loss": 1.6231, + "step": 10670 + }, + { + "epoch": 0.8032518489245187, + "grad_norm": 5.31277322769165, + "learning_rate": 9.917715046771075e-06, + "loss": 1.8341, + "step": 10671 + }, + { + "epoch": 0.8033271232051789, + "grad_norm": 5.836062908172607, + "learning_rate": 9.91042853029171e-06, + "loss": 1.9529, + "step": 10672 + }, + { + "epoch": 0.803402397485839, + "grad_norm": 4.353870868682861, + "learning_rate": 9.903144397022324e-06, + "loss": 1.6743, + "step": 10673 + }, + { + "epoch": 0.8034776717664992, + "grad_norm": 5.54949426651001, + "learning_rate": 9.895862647395964e-06, + "loss": 1.9927, + "step": 10674 + }, + { + "epoch": 0.8035529460471593, + "grad_norm": 4.469899654388428, + "learning_rate": 9.888583281845487e-06, + "loss": 2.1199, + "step": 10675 + }, + { + "epoch": 0.8036282203278194, + "grad_norm": 6.5621466636657715, + "learning_rate": 9.881306300803656e-06, + "loss": 2.1115, + "step": 10676 + }, + { + "epoch": 0.8037034946084797, + "grad_norm": 7.258991241455078, + "learning_rate": 9.874031704703035e-06, + "loss": 2.4716, + "step": 10677 + }, + { + "epoch": 0.8037787688891398, + "grad_norm": 3.9647786617279053, + "learning_rate": 9.866759493976113e-06, + "loss": 1.5881, + "step": 10678 + }, + { + "epoch": 0.8038540431697999, + "grad_norm": 5.832951068878174, + "learning_rate": 9.859489669055166e-06, + "loss": 1.6782, + "step": 10679 + }, + { + "epoch": 0.8039293174504601, + "grad_norm": 5.832951068878174, + "learning_rate": 9.859489669055166e-06, + "loss": 1.7834, + "step": 10680 + }, + { + "epoch": 0.8040045917311203, + "grad_norm": 6.232776641845703, + "learning_rate": 9.852222230372388e-06, + "loss": 1.996, + "step": 10681 + }, + { + "epoch": 0.8040798660117804, + "grad_norm": 4.047651290893555, + "learning_rate": 9.844957178359798e-06, + "loss": 1.6895, + "step": 10682 + }, + { + "epoch": 0.8041551402924406, + "grad_norm": 4.456789016723633, + "learning_rate": 9.837694513449297e-06, + "loss": 1.4301, + "step": 10683 + }, + { + "epoch": 0.8042304145731007, + "grad_norm": 6.298038959503174, + "learning_rate": 9.830434236072605e-06, + "loss": 2.3635, + "step": 10684 + }, + { + "epoch": 0.8043056888537609, + "grad_norm": 6.315861701965332, + "learning_rate": 9.823176346661356e-06, + "loss": 1.709, + "step": 10685 + }, + { + "epoch": 0.8043809631344211, + "grad_norm": 5.794083595275879, + "learning_rate": 9.815920845646976e-06, + "loss": 1.6315, + "step": 10686 + }, + { + "epoch": 0.8044562374150812, + "grad_norm": 4.282148361206055, + "learning_rate": 9.80866773346082e-06, + "loss": 1.4934, + "step": 10687 + }, + { + "epoch": 0.8045315116957413, + "grad_norm": 6.939359188079834, + "learning_rate": 9.801417010534026e-06, + "loss": 1.9112, + "step": 10688 + }, + { + "epoch": 0.8046067859764016, + "grad_norm": 5.723571300506592, + "learning_rate": 9.794168677297649e-06, + "loss": 1.9267, + "step": 10689 + }, + { + "epoch": 0.8046820602570617, + "grad_norm": 6.424620628356934, + "learning_rate": 9.786922734182597e-06, + "loss": 2.1242, + "step": 10690 + }, + { + "epoch": 0.8047573345377218, + "grad_norm": 3.8335604667663574, + "learning_rate": 9.779679181619583e-06, + "loss": 1.7609, + "step": 10691 + }, + { + "epoch": 0.804832608818382, + "grad_norm": 6.078704357147217, + "learning_rate": 9.772438020039243e-06, + "loss": 1.5793, + "step": 10692 + }, + { + "epoch": 0.8049078830990422, + "grad_norm": 4.111569404602051, + "learning_rate": 9.765199249872038e-06, + "loss": 1.4538, + "step": 10693 + }, + { + "epoch": 0.8049831573797023, + "grad_norm": 3.8913235664367676, + "learning_rate": 9.757962871548305e-06, + "loss": 1.6399, + "step": 10694 + }, + { + "epoch": 0.8050584316603624, + "grad_norm": 4.875214576721191, + "learning_rate": 9.750728885498195e-06, + "loss": 1.4047, + "step": 10695 + }, + { + "epoch": 0.8051337059410226, + "grad_norm": 4.7166643142700195, + "learning_rate": 9.743497292151776e-06, + "loss": 1.6971, + "step": 10696 + }, + { + "epoch": 0.8052089802216827, + "grad_norm": 6.289992809295654, + "learning_rate": 9.736268091938927e-06, + "loss": 1.6479, + "step": 10697 + }, + { + "epoch": 0.8052842545023429, + "grad_norm": 4.575614929199219, + "learning_rate": 9.72904128528942e-06, + "loss": 1.7579, + "step": 10698 + }, + { + "epoch": 0.8053595287830031, + "grad_norm": 4.374817371368408, + "learning_rate": 9.721816872632844e-06, + "loss": 1.6652, + "step": 10699 + }, + { + "epoch": 0.8054348030636632, + "grad_norm": 5.086137771606445, + "learning_rate": 9.714594854398695e-06, + "loss": 1.8821, + "step": 10700 + }, + { + "epoch": 0.8055100773443233, + "grad_norm": 4.670071125030518, + "learning_rate": 9.707375231016286e-06, + "loss": 1.8576, + "step": 10701 + }, + { + "epoch": 0.8055853516249836, + "grad_norm": 5.064945697784424, + "learning_rate": 9.7001580029148e-06, + "loss": 1.5716, + "step": 10702 + }, + { + "epoch": 0.8056606259056437, + "grad_norm": 6.030887126922607, + "learning_rate": 9.692943170523288e-06, + "loss": 2.3776, + "step": 10703 + }, + { + "epoch": 0.8057359001863038, + "grad_norm": 5.851210594177246, + "learning_rate": 9.685730734270654e-06, + "loss": 1.666, + "step": 10704 + }, + { + "epoch": 0.805811174466964, + "grad_norm": 4.9819536209106445, + "learning_rate": 9.678520694585664e-06, + "loss": 1.7411, + "step": 10705 + }, + { + "epoch": 0.8058864487476242, + "grad_norm": 5.127562999725342, + "learning_rate": 9.671313051896908e-06, + "loss": 1.7239, + "step": 10706 + }, + { + "epoch": 0.8059617230282843, + "grad_norm": 5.233977794647217, + "learning_rate": 9.664107806632888e-06, + "loss": 2.1205, + "step": 10707 + }, + { + "epoch": 0.8060369973089445, + "grad_norm": 3.824859857559204, + "learning_rate": 9.656904959221912e-06, + "loss": 1.8875, + "step": 10708 + }, + { + "epoch": 0.8061122715896046, + "grad_norm": 5.744001388549805, + "learning_rate": 9.649704510092195e-06, + "loss": 1.7138, + "step": 10709 + }, + { + "epoch": 0.8061875458702648, + "grad_norm": 7.650867462158203, + "learning_rate": 9.642506459671746e-06, + "loss": 2.4303, + "step": 10710 + }, + { + "epoch": 0.806262820150925, + "grad_norm": 5.673069477081299, + "learning_rate": 9.635310808388509e-06, + "loss": 1.4306, + "step": 10711 + }, + { + "epoch": 0.8063380944315851, + "grad_norm": 4.3479905128479, + "learning_rate": 9.628117556670207e-06, + "loss": 1.4309, + "step": 10712 + }, + { + "epoch": 0.8064133687122452, + "grad_norm": 4.550969123840332, + "learning_rate": 9.620926704944482e-06, + "loss": 2.0859, + "step": 10713 + }, + { + "epoch": 0.8064886429929053, + "grad_norm": 4.660947322845459, + "learning_rate": 9.613738253638798e-06, + "loss": 1.8436, + "step": 10714 + }, + { + "epoch": 0.8065639172735656, + "grad_norm": 4.935039043426514, + "learning_rate": 9.606552203180513e-06, + "loss": 1.8763, + "step": 10715 + }, + { + "epoch": 0.8066391915542257, + "grad_norm": 4.6597819328308105, + "learning_rate": 9.599368553996779e-06, + "loss": 1.6187, + "step": 10716 + }, + { + "epoch": 0.8067144658348858, + "grad_norm": 4.312115669250488, + "learning_rate": 9.592187306514676e-06, + "loss": 1.7801, + "step": 10717 + }, + { + "epoch": 0.806789740115546, + "grad_norm": 5.013323783874512, + "learning_rate": 9.585008461161082e-06, + "loss": 1.8543, + "step": 10718 + }, + { + "epoch": 0.8068650143962062, + "grad_norm": 4.603612422943115, + "learning_rate": 9.577832018362788e-06, + "loss": 1.5446, + "step": 10719 + }, + { + "epoch": 0.8069402886768663, + "grad_norm": 5.909267425537109, + "learning_rate": 9.570657978546383e-06, + "loss": 1.5844, + "step": 10720 + }, + { + "epoch": 0.8070155629575265, + "grad_norm": 6.481149673461914, + "learning_rate": 9.56348634213835e-06, + "loss": 1.4137, + "step": 10721 + }, + { + "epoch": 0.8070908372381866, + "grad_norm": 4.195736885070801, + "learning_rate": 9.55631710956505e-06, + "loss": 1.6917, + "step": 10722 + }, + { + "epoch": 0.8071661115188468, + "grad_norm": 5.30164098739624, + "learning_rate": 9.549150281252633e-06, + "loss": 1.6404, + "step": 10723 + }, + { + "epoch": 0.807241385799507, + "grad_norm": 5.554624557495117, + "learning_rate": 9.541985857627166e-06, + "loss": 1.9411, + "step": 10724 + }, + { + "epoch": 0.8073166600801671, + "grad_norm": 4.075313568115234, + "learning_rate": 9.534823839114553e-06, + "loss": 1.9542, + "step": 10725 + }, + { + "epoch": 0.8073919343608272, + "grad_norm": 4.310482025146484, + "learning_rate": 9.52766422614057e-06, + "loss": 1.665, + "step": 10726 + }, + { + "epoch": 0.8074672086414875, + "grad_norm": 4.176849365234375, + "learning_rate": 9.520507019130804e-06, + "loss": 1.6664, + "step": 10727 + }, + { + "epoch": 0.8075424829221476, + "grad_norm": 6.520838260650635, + "learning_rate": 9.513352218510763e-06, + "loss": 1.8622, + "step": 10728 + }, + { + "epoch": 0.8076177572028077, + "grad_norm": 4.1897687911987305, + "learning_rate": 9.506199824705753e-06, + "loss": 1.8259, + "step": 10729 + }, + { + "epoch": 0.8076930314834679, + "grad_norm": 4.515954494476318, + "learning_rate": 9.499049838140982e-06, + "loss": 1.94, + "step": 10730 + }, + { + "epoch": 0.8077683057641281, + "grad_norm": 4.747716903686523, + "learning_rate": 9.491902259241476e-06, + "loss": 1.348, + "step": 10731 + }, + { + "epoch": 0.8078435800447882, + "grad_norm": 6.552875995635986, + "learning_rate": 9.484757088432161e-06, + "loss": 1.7899, + "step": 10732 + }, + { + "epoch": 0.8079188543254483, + "grad_norm": 4.528985977172852, + "learning_rate": 9.477614326137773e-06, + "loss": 1.8451, + "step": 10733 + }, + { + "epoch": 0.8079941286061085, + "grad_norm": 5.588685035705566, + "learning_rate": 9.470473972782962e-06, + "loss": 1.5428, + "step": 10734 + }, + { + "epoch": 0.8080694028867687, + "grad_norm": 5.34883975982666, + "learning_rate": 9.463336028792157e-06, + "loss": 2.0991, + "step": 10735 + }, + { + "epoch": 0.8081446771674288, + "grad_norm": 4.229326248168945, + "learning_rate": 9.456200494589723e-06, + "loss": 1.9816, + "step": 10736 + }, + { + "epoch": 0.808219951448089, + "grad_norm": 3.9271128177642822, + "learning_rate": 9.449067370599829e-06, + "loss": 1.6893, + "step": 10737 + }, + { + "epoch": 0.8082952257287491, + "grad_norm": 5.129074573516846, + "learning_rate": 9.44193665724653e-06, + "loss": 1.8029, + "step": 10738 + }, + { + "epoch": 0.8083705000094092, + "grad_norm": 5.309793472290039, + "learning_rate": 9.434808354953734e-06, + "loss": 1.6701, + "step": 10739 + }, + { + "epoch": 0.8084457742900695, + "grad_norm": 4.950886249542236, + "learning_rate": 9.427682464145172e-06, + "loss": 2.2871, + "step": 10740 + }, + { + "epoch": 0.8085210485707296, + "grad_norm": 3.967445135116577, + "learning_rate": 9.420558985244488e-06, + "loss": 1.8083, + "step": 10741 + }, + { + "epoch": 0.8085963228513897, + "grad_norm": 6.281248569488525, + "learning_rate": 9.413437918675123e-06, + "loss": 1.7827, + "step": 10742 + }, + { + "epoch": 0.80867159713205, + "grad_norm": 4.986514568328857, + "learning_rate": 9.40631926486043e-06, + "loss": 2.0824, + "step": 10743 + }, + { + "epoch": 0.8087468714127101, + "grad_norm": 4.760549068450928, + "learning_rate": 9.399203024223568e-06, + "loss": 1.7938, + "step": 10744 + }, + { + "epoch": 0.8088221456933702, + "grad_norm": 4.574357986450195, + "learning_rate": 9.392089197187604e-06, + "loss": 1.4696, + "step": 10745 + }, + { + "epoch": 0.8088974199740304, + "grad_norm": 3.9092679023742676, + "learning_rate": 9.384977784175403e-06, + "loss": 1.6746, + "step": 10746 + }, + { + "epoch": 0.8089726942546905, + "grad_norm": 5.171268463134766, + "learning_rate": 9.37786878560974e-06, + "loss": 1.8227, + "step": 10747 + }, + { + "epoch": 0.8090479685353507, + "grad_norm": 5.368162155151367, + "learning_rate": 9.37076220191322e-06, + "loss": 1.383, + "step": 10748 + }, + { + "epoch": 0.8091232428160109, + "grad_norm": 4.458542346954346, + "learning_rate": 9.363658033508315e-06, + "loss": 1.6161, + "step": 10749 + }, + { + "epoch": 0.809198517096671, + "grad_norm": 4.535194396972656, + "learning_rate": 9.356556280817335e-06, + "loss": 1.7852, + "step": 10750 + }, + { + "epoch": 0.8092737913773311, + "grad_norm": 7.015781402587891, + "learning_rate": 9.349456944262474e-06, + "loss": 1.4985, + "step": 10751 + }, + { + "epoch": 0.8093490656579913, + "grad_norm": 3.718797206878662, + "learning_rate": 9.34236002426575e-06, + "loss": 1.5249, + "step": 10752 + }, + { + "epoch": 0.8094243399386515, + "grad_norm": 6.286694049835205, + "learning_rate": 9.335265521249059e-06, + "loss": 1.7973, + "step": 10753 + }, + { + "epoch": 0.8094996142193116, + "grad_norm": 6.147034168243408, + "learning_rate": 9.328173435634164e-06, + "loss": 2.2572, + "step": 10754 + }, + { + "epoch": 0.8095748884999717, + "grad_norm": 7.531181812286377, + "learning_rate": 9.321083767842648e-06, + "loss": 1.9205, + "step": 10755 + }, + { + "epoch": 0.809650162780632, + "grad_norm": 5.010212421417236, + "learning_rate": 9.313996518295993e-06, + "loss": 1.6824, + "step": 10756 + }, + { + "epoch": 0.8097254370612921, + "grad_norm": 3.934354066848755, + "learning_rate": 9.306911687415488e-06, + "loss": 1.7618, + "step": 10757 + }, + { + "epoch": 0.8098007113419522, + "grad_norm": 4.612907409667969, + "learning_rate": 9.299829275622323e-06, + "loss": 1.6604, + "step": 10758 + }, + { + "epoch": 0.8098759856226124, + "grad_norm": 4.187878131866455, + "learning_rate": 9.292749283337533e-06, + "loss": 1.5036, + "step": 10759 + }, + { + "epoch": 0.8099512599032725, + "grad_norm": 5.0052103996276855, + "learning_rate": 9.285671710981997e-06, + "loss": 1.971, + "step": 10760 + }, + { + "epoch": 0.8100265341839327, + "grad_norm": 7.626448631286621, + "learning_rate": 9.27859655897645e-06, + "loss": 1.8097, + "step": 10761 + }, + { + "epoch": 0.8101018084645929, + "grad_norm": 5.556909084320068, + "learning_rate": 9.271523827741508e-06, + "loss": 1.7399, + "step": 10762 + }, + { + "epoch": 0.810177082745253, + "grad_norm": 4.775475978851318, + "learning_rate": 9.264453517697597e-06, + "loss": 1.6214, + "step": 10763 + }, + { + "epoch": 0.8102523570259131, + "grad_norm": 5.027905464172363, + "learning_rate": 9.257385629265053e-06, + "loss": 1.8219, + "step": 10764 + }, + { + "epoch": 0.8103276313065734, + "grad_norm": 5.68484354019165, + "learning_rate": 9.250320162864012e-06, + "loss": 1.777, + "step": 10765 + }, + { + "epoch": 0.8104029055872335, + "grad_norm": 5.306346416473389, + "learning_rate": 9.243257118914533e-06, + "loss": 1.7954, + "step": 10766 + }, + { + "epoch": 0.8104781798678936, + "grad_norm": 4.2520647048950195, + "learning_rate": 9.236196497836458e-06, + "loss": 1.5632, + "step": 10767 + }, + { + "epoch": 0.8105534541485538, + "grad_norm": 5.100350856781006, + "learning_rate": 9.22913830004954e-06, + "loss": 1.6379, + "step": 10768 + }, + { + "epoch": 0.810628728429214, + "grad_norm": 5.0618672370910645, + "learning_rate": 9.222082525973364e-06, + "loss": 1.491, + "step": 10769 + }, + { + "epoch": 0.8107040027098741, + "grad_norm": 5.813129425048828, + "learning_rate": 9.215029176027373e-06, + "loss": 1.7683, + "step": 10770 + }, + { + "epoch": 0.8107792769905343, + "grad_norm": 4.636019229888916, + "learning_rate": 9.207978250630888e-06, + "loss": 1.6041, + "step": 10771 + }, + { + "epoch": 0.8108545512711944, + "grad_norm": 4.913620948791504, + "learning_rate": 9.20092975020304e-06, + "loss": 1.8379, + "step": 10772 + }, + { + "epoch": 0.8109298255518546, + "grad_norm": 5.632382869720459, + "learning_rate": 9.193883675162863e-06, + "loss": 1.6162, + "step": 10773 + }, + { + "epoch": 0.8110050998325147, + "grad_norm": 5.632382869720459, + "learning_rate": 9.193883675162863e-06, + "loss": 1.7046, + "step": 10774 + }, + { + "epoch": 0.8110803741131749, + "grad_norm": 5.2676682472229, + "learning_rate": 9.186840025929206e-06, + "loss": 1.8647, + "step": 10775 + }, + { + "epoch": 0.811155648393835, + "grad_norm": 5.111133098602295, + "learning_rate": 9.179798802920814e-06, + "loss": 1.4395, + "step": 10776 + }, + { + "epoch": 0.8112309226744951, + "grad_norm": 4.482577323913574, + "learning_rate": 9.172760006556242e-06, + "loss": 1.8926, + "step": 10777 + }, + { + "epoch": 0.8113061969551554, + "grad_norm": 5.009120941162109, + "learning_rate": 9.165723637253953e-06, + "loss": 2.032, + "step": 10778 + }, + { + "epoch": 0.8113814712358155, + "grad_norm": 5.149282455444336, + "learning_rate": 9.158689695432221e-06, + "loss": 1.7103, + "step": 10779 + }, + { + "epoch": 0.8114567455164756, + "grad_norm": 8.16108512878418, + "learning_rate": 9.151658181509194e-06, + "loss": 2.3891, + "step": 10780 + }, + { + "epoch": 0.8115320197971359, + "grad_norm": 6.173976898193359, + "learning_rate": 9.144629095902896e-06, + "loss": 1.8225, + "step": 10781 + }, + { + "epoch": 0.811607294077796, + "grad_norm": 4.532297611236572, + "learning_rate": 9.137602439031157e-06, + "loss": 1.7306, + "step": 10782 + }, + { + "epoch": 0.8116825683584561, + "grad_norm": 6.302655220031738, + "learning_rate": 9.130578211311708e-06, + "loss": 1.751, + "step": 10783 + }, + { + "epoch": 0.8117578426391163, + "grad_norm": 4.427190780639648, + "learning_rate": 9.123556413162126e-06, + "loss": 1.8865, + "step": 10784 + }, + { + "epoch": 0.8118331169197764, + "grad_norm": 4.001774311065674, + "learning_rate": 9.116537044999817e-06, + "loss": 2.0738, + "step": 10785 + }, + { + "epoch": 0.8119083912004366, + "grad_norm": 5.022767066955566, + "learning_rate": 9.10952010724207e-06, + "loss": 1.5779, + "step": 10786 + }, + { + "epoch": 0.8119836654810968, + "grad_norm": 5.475062847137451, + "learning_rate": 9.102505600306045e-06, + "loss": 2.3591, + "step": 10787 + }, + { + "epoch": 0.8120589397617569, + "grad_norm": 4.143284320831299, + "learning_rate": 9.095493524608694e-06, + "loss": 1.6952, + "step": 10788 + }, + { + "epoch": 0.812134214042417, + "grad_norm": 5.463651180267334, + "learning_rate": 9.0884838805669e-06, + "loss": 1.8657, + "step": 10789 + }, + { + "epoch": 0.8122094883230773, + "grad_norm": 4.535449504852295, + "learning_rate": 9.081476668597338e-06, + "loss": 1.8759, + "step": 10790 + }, + { + "epoch": 0.8122847626037374, + "grad_norm": 5.332362651824951, + "learning_rate": 9.074471889116593e-06, + "loss": 1.491, + "step": 10791 + }, + { + "epoch": 0.8123600368843975, + "grad_norm": 6.7469282150268555, + "learning_rate": 9.067469542541051e-06, + "loss": 2.2716, + "step": 10792 + }, + { + "epoch": 0.8124353111650576, + "grad_norm": 4.326296806335449, + "learning_rate": 9.060469629287e-06, + "loss": 1.6534, + "step": 10793 + }, + { + "epoch": 0.8125105854457179, + "grad_norm": 6.551939010620117, + "learning_rate": 9.053472149770559e-06, + "loss": 1.8713, + "step": 10794 + }, + { + "epoch": 0.812585859726378, + "grad_norm": 4.524019241333008, + "learning_rate": 9.04647710440772e-06, + "loss": 1.7803, + "step": 10795 + }, + { + "epoch": 0.8126611340070381, + "grad_norm": 4.482858180999756, + "learning_rate": 9.039484493614298e-06, + "loss": 1.8058, + "step": 10796 + }, + { + "epoch": 0.8127364082876983, + "grad_norm": 10.275532722473145, + "learning_rate": 9.032494317806007e-06, + "loss": 2.0376, + "step": 10797 + }, + { + "epoch": 0.8128116825683585, + "grad_norm": 4.812849998474121, + "learning_rate": 9.025506577398362e-06, + "loss": 1.6337, + "step": 10798 + }, + { + "epoch": 0.8128869568490186, + "grad_norm": 4.211179733276367, + "learning_rate": 9.0185212728068e-06, + "loss": 2.0419, + "step": 10799 + }, + { + "epoch": 0.8129622311296788, + "grad_norm": 8.443922996520996, + "learning_rate": 9.011538404446546e-06, + "loss": 1.8586, + "step": 10800 + }, + { + "epoch": 0.8130375054103389, + "grad_norm": 3.891097068786621, + "learning_rate": 9.00455797273274e-06, + "loss": 1.891, + "step": 10801 + }, + { + "epoch": 0.813112779690999, + "grad_norm": 6.209191799163818, + "learning_rate": 8.997579978080317e-06, + "loss": 1.7102, + "step": 10802 + }, + { + "epoch": 0.8131880539716593, + "grad_norm": 5.509005069732666, + "learning_rate": 8.990604420904115e-06, + "loss": 2.0006, + "step": 10803 + }, + { + "epoch": 0.8132633282523194, + "grad_norm": 4.314136981964111, + "learning_rate": 8.983631301618812e-06, + "loss": 1.7638, + "step": 10804 + }, + { + "epoch": 0.8133386025329795, + "grad_norm": 4.226730823516846, + "learning_rate": 8.976660620638939e-06, + "loss": 2.1153, + "step": 10805 + }, + { + "epoch": 0.8134138768136397, + "grad_norm": 6.691463947296143, + "learning_rate": 8.969692378378897e-06, + "loss": 1.8806, + "step": 10806 + }, + { + "epoch": 0.8134891510942999, + "grad_norm": 6.5203537940979, + "learning_rate": 8.9627265752529e-06, + "loss": 1.7904, + "step": 10807 + }, + { + "epoch": 0.81356442537496, + "grad_norm": 4.176247596740723, + "learning_rate": 8.955763211675072e-06, + "loss": 1.8313, + "step": 10808 + }, + { + "epoch": 0.8136396996556202, + "grad_norm": 4.838740348815918, + "learning_rate": 8.948802288059338e-06, + "loss": 1.9172, + "step": 10809 + }, + { + "epoch": 0.8137149739362803, + "grad_norm": 4.36600399017334, + "learning_rate": 8.941843804819533e-06, + "loss": 2.5324, + "step": 10810 + }, + { + "epoch": 0.8137902482169405, + "grad_norm": 6.494029998779297, + "learning_rate": 8.934887762369293e-06, + "loss": 1.4995, + "step": 10811 + }, + { + "epoch": 0.8138655224976006, + "grad_norm": 4.998374938964844, + "learning_rate": 8.927934161122154e-06, + "loss": 1.9609, + "step": 10812 + }, + { + "epoch": 0.8139407967782608, + "grad_norm": 6.45481538772583, + "learning_rate": 8.920983001491473e-06, + "loss": 1.7951, + "step": 10813 + }, + { + "epoch": 0.8140160710589209, + "grad_norm": 7.360757350921631, + "learning_rate": 8.914034283890483e-06, + "loss": 1.776, + "step": 10814 + }, + { + "epoch": 0.814091345339581, + "grad_norm": 5.082930564880371, + "learning_rate": 8.907088008732261e-06, + "loss": 1.6566, + "step": 10815 + }, + { + "epoch": 0.8141666196202413, + "grad_norm": 4.800956726074219, + "learning_rate": 8.900144176429765e-06, + "loss": 1.7358, + "step": 10816 + }, + { + "epoch": 0.8142418939009014, + "grad_norm": 5.695190906524658, + "learning_rate": 8.893202787395755e-06, + "loss": 1.981, + "step": 10817 + }, + { + "epoch": 0.8143171681815615, + "grad_norm": 5.245087623596191, + "learning_rate": 8.886263842042892e-06, + "loss": 2.0929, + "step": 10818 + }, + { + "epoch": 0.8143924424622218, + "grad_norm": 6.453237056732178, + "learning_rate": 8.879327340783688e-06, + "loss": 2.072, + "step": 10819 + }, + { + "epoch": 0.8144677167428819, + "grad_norm": 4.42209005355835, + "learning_rate": 8.872393284030472e-06, + "loss": 1.8223, + "step": 10820 + }, + { + "epoch": 0.814542991023542, + "grad_norm": 4.721053600311279, + "learning_rate": 8.865461672195479e-06, + "loss": 1.8154, + "step": 10821 + }, + { + "epoch": 0.8146182653042022, + "grad_norm": 5.299997806549072, + "learning_rate": 8.858532505690747e-06, + "loss": 1.9668, + "step": 10822 + }, + { + "epoch": 0.8146935395848623, + "grad_norm": 6.3773627281188965, + "learning_rate": 8.851605784928224e-06, + "loss": 1.8669, + "step": 10823 + }, + { + "epoch": 0.8147688138655225, + "grad_norm": 4.670791149139404, + "learning_rate": 8.844681510319658e-06, + "loss": 2.0647, + "step": 10824 + }, + { + "epoch": 0.8148440881461827, + "grad_norm": 4.377533435821533, + "learning_rate": 8.837759682276685e-06, + "loss": 1.9389, + "step": 10825 + }, + { + "epoch": 0.8149193624268428, + "grad_norm": 4.788690090179443, + "learning_rate": 8.830840301210796e-06, + "loss": 2.0852, + "step": 10826 + }, + { + "epoch": 0.8149946367075029, + "grad_norm": 5.35303258895874, + "learning_rate": 8.823923367533332e-06, + "loss": 2.2674, + "step": 10827 + }, + { + "epoch": 0.8150699109881632, + "grad_norm": 4.1622772216796875, + "learning_rate": 8.817008881655464e-06, + "loss": 1.6945, + "step": 10828 + }, + { + "epoch": 0.8151451852688233, + "grad_norm": 5.137579441070557, + "learning_rate": 8.810096843988263e-06, + "loss": 1.576, + "step": 10829 + }, + { + "epoch": 0.8152204595494834, + "grad_norm": 5.846545696258545, + "learning_rate": 8.803187254942607e-06, + "loss": 1.9946, + "step": 10830 + }, + { + "epoch": 0.8152957338301435, + "grad_norm": 4.926131248474121, + "learning_rate": 8.796280114929274e-06, + "loss": 1.5647, + "step": 10831 + }, + { + "epoch": 0.8153710081108038, + "grad_norm": 5.013854503631592, + "learning_rate": 8.789375424358847e-06, + "loss": 1.696, + "step": 10832 + }, + { + "epoch": 0.8154462823914639, + "grad_norm": 3.7120392322540283, + "learning_rate": 8.78247318364182e-06, + "loss": 1.6129, + "step": 10833 + }, + { + "epoch": 0.815521556672124, + "grad_norm": 4.516750335693359, + "learning_rate": 8.775573393188485e-06, + "loss": 1.8204, + "step": 10834 + }, + { + "epoch": 0.8155968309527842, + "grad_norm": 4.984897136688232, + "learning_rate": 8.76867605340902e-06, + "loss": 1.8868, + "step": 10835 + }, + { + "epoch": 0.8156721052334444, + "grad_norm": 5.796410083770752, + "learning_rate": 8.761781164713472e-06, + "loss": 2.0085, + "step": 10836 + }, + { + "epoch": 0.8157473795141045, + "grad_norm": 4.42592191696167, + "learning_rate": 8.754888727511696e-06, + "loss": 1.5354, + "step": 10837 + }, + { + "epoch": 0.8158226537947647, + "grad_norm": 4.458138942718506, + "learning_rate": 8.747998742213443e-06, + "loss": 1.5543, + "step": 10838 + }, + { + "epoch": 0.8158979280754248, + "grad_norm": 5.161789894104004, + "learning_rate": 8.741111209228292e-06, + "loss": 1.7793, + "step": 10839 + }, + { + "epoch": 0.815973202356085, + "grad_norm": 3.833183765411377, + "learning_rate": 8.73422612896571e-06, + "loss": 1.7617, + "step": 10840 + }, + { + "epoch": 0.8160484766367452, + "grad_norm": 7.159696102142334, + "learning_rate": 8.727343501834972e-06, + "loss": 2.2675, + "step": 10841 + }, + { + "epoch": 0.8161237509174053, + "grad_norm": 4.582847595214844, + "learning_rate": 8.720463328245248e-06, + "loss": 1.5086, + "step": 10842 + }, + { + "epoch": 0.8161990251980654, + "grad_norm": 4.902654647827148, + "learning_rate": 8.713585608605524e-06, + "loss": 1.7983, + "step": 10843 + }, + { + "epoch": 0.8162742994787257, + "grad_norm": 4.875204086303711, + "learning_rate": 8.706710343324681e-06, + "loss": 1.7053, + "step": 10844 + }, + { + "epoch": 0.8163495737593858, + "grad_norm": 3.8512213230133057, + "learning_rate": 8.69983753281141e-06, + "loss": 1.7641, + "step": 10845 + }, + { + "epoch": 0.8164248480400459, + "grad_norm": 5.193028450012207, + "learning_rate": 8.69296717747431e-06, + "loss": 1.9469, + "step": 10846 + }, + { + "epoch": 0.8165001223207061, + "grad_norm": 5.994436740875244, + "learning_rate": 8.686099277721777e-06, + "loss": 1.5639, + "step": 10847 + }, + { + "epoch": 0.8165753966013662, + "grad_norm": 4.695322513580322, + "learning_rate": 8.679233833962097e-06, + "loss": 1.7832, + "step": 10848 + }, + { + "epoch": 0.8166506708820264, + "grad_norm": 5.546252250671387, + "learning_rate": 8.672370846603406e-06, + "loss": 1.9258, + "step": 10849 + }, + { + "epoch": 0.8167259451626865, + "grad_norm": 4.912105083465576, + "learning_rate": 8.665510316053682e-06, + "loss": 1.4253, + "step": 10850 + }, + { + "epoch": 0.8168012194433467, + "grad_norm": 5.021442413330078, + "learning_rate": 8.658652242720777e-06, + "loss": 1.9913, + "step": 10851 + }, + { + "epoch": 0.8168764937240068, + "grad_norm": 3.657658815383911, + "learning_rate": 8.651796627012371e-06, + "loss": 1.9411, + "step": 10852 + }, + { + "epoch": 0.816951768004667, + "grad_norm": 6.262230396270752, + "learning_rate": 8.64494346933602e-06, + "loss": 2.0501, + "step": 10853 + }, + { + "epoch": 0.8170270422853272, + "grad_norm": 6.539548397064209, + "learning_rate": 8.63809277009911e-06, + "loss": 1.681, + "step": 10854 + }, + { + "epoch": 0.8171023165659873, + "grad_norm": 4.937403678894043, + "learning_rate": 8.631244529708915e-06, + "loss": 1.9723, + "step": 10855 + }, + { + "epoch": 0.8171775908466474, + "grad_norm": 3.7029614448547363, + "learning_rate": 8.62439874857252e-06, + "loss": 1.9043, + "step": 10856 + }, + { + "epoch": 0.8172528651273077, + "grad_norm": 6.8704304695129395, + "learning_rate": 8.617555427096913e-06, + "loss": 1.932, + "step": 10857 + }, + { + "epoch": 0.8173281394079678, + "grad_norm": 4.510538578033447, + "learning_rate": 8.610714565688882e-06, + "loss": 2.1058, + "step": 10858 + }, + { + "epoch": 0.8174034136886279, + "grad_norm": 4.989439487457275, + "learning_rate": 8.60387616475511e-06, + "loss": 1.9232, + "step": 10859 + }, + { + "epoch": 0.8174786879692881, + "grad_norm": 6.048931121826172, + "learning_rate": 8.597040224702124e-06, + "loss": 2.1272, + "step": 10860 + }, + { + "epoch": 0.8175539622499483, + "grad_norm": 4.535512447357178, + "learning_rate": 8.590206745936308e-06, + "loss": 1.551, + "step": 10861 + }, + { + "epoch": 0.8176292365306084, + "grad_norm": 6.280004978179932, + "learning_rate": 8.583375728863874e-06, + "loss": 1.6735, + "step": 10862 + }, + { + "epoch": 0.8177045108112686, + "grad_norm": 4.35219144821167, + "learning_rate": 8.576547173890926e-06, + "loss": 1.7076, + "step": 10863 + }, + { + "epoch": 0.8177797850919287, + "grad_norm": 5.373569488525391, + "learning_rate": 8.569721081423376e-06, + "loss": 1.9191, + "step": 10864 + }, + { + "epoch": 0.8178550593725888, + "grad_norm": 4.578735828399658, + "learning_rate": 8.562897451867047e-06, + "loss": 1.798, + "step": 10865 + }, + { + "epoch": 0.8179303336532491, + "grad_norm": 4.907925128936768, + "learning_rate": 8.556076285627557e-06, + "loss": 2.1186, + "step": 10866 + }, + { + "epoch": 0.8180056079339092, + "grad_norm": 5.499814987182617, + "learning_rate": 8.549257583110415e-06, + "loss": 1.5638, + "step": 10867 + }, + { + "epoch": 0.8180808822145693, + "grad_norm": 4.844296455383301, + "learning_rate": 8.54244134472098e-06, + "loss": 1.7051, + "step": 10868 + }, + { + "epoch": 0.8181561564952295, + "grad_norm": 6.254519462585449, + "learning_rate": 8.535627570864447e-06, + "loss": 1.6878, + "step": 10869 + }, + { + "epoch": 0.8182314307758897, + "grad_norm": 4.311774730682373, + "learning_rate": 8.528816261945877e-06, + "loss": 1.7822, + "step": 10870 + }, + { + "epoch": 0.8183067050565498, + "grad_norm": 3.9855504035949707, + "learning_rate": 8.522007418370188e-06, + "loss": 1.665, + "step": 10871 + }, + { + "epoch": 0.8183819793372099, + "grad_norm": 6.331838607788086, + "learning_rate": 8.515201040542158e-06, + "loss": 1.7336, + "step": 10872 + }, + { + "epoch": 0.8184572536178701, + "grad_norm": 5.406843662261963, + "learning_rate": 8.50839712886638e-06, + "loss": 1.7519, + "step": 10873 + }, + { + "epoch": 0.8185325278985303, + "grad_norm": 4.678377628326416, + "learning_rate": 8.501595683747348e-06, + "loss": 1.7087, + "step": 10874 + }, + { + "epoch": 0.8186078021791904, + "grad_norm": 5.033703804016113, + "learning_rate": 8.49479670558937e-06, + "loss": 1.3437, + "step": 10875 + }, + { + "epoch": 0.8186830764598506, + "grad_norm": 5.009019374847412, + "learning_rate": 8.488000194796647e-06, + "loss": 1.6715, + "step": 10876 + }, + { + "epoch": 0.8187583507405107, + "grad_norm": 5.265163898468018, + "learning_rate": 8.48120615177319e-06, + "loss": 1.8161, + "step": 10877 + }, + { + "epoch": 0.8188336250211709, + "grad_norm": 4.090880393981934, + "learning_rate": 8.474414576922912e-06, + "loss": 1.8737, + "step": 10878 + }, + { + "epoch": 0.8189088993018311, + "grad_norm": 3.992671012878418, + "learning_rate": 8.467625470649526e-06, + "loss": 1.7233, + "step": 10879 + }, + { + "epoch": 0.8189841735824912, + "grad_norm": 5.976937294006348, + "learning_rate": 8.460838833356632e-06, + "loss": 2.058, + "step": 10880 + }, + { + "epoch": 0.8190594478631513, + "grad_norm": 3.9277961254119873, + "learning_rate": 8.454054665447681e-06, + "loss": 1.6229, + "step": 10881 + }, + { + "epoch": 0.8191347221438116, + "grad_norm": 7.06601095199585, + "learning_rate": 8.447272967325981e-06, + "loss": 1.8344, + "step": 10882 + }, + { + "epoch": 0.8192099964244717, + "grad_norm": 6.076743125915527, + "learning_rate": 8.440493739394668e-06, + "loss": 2.1974, + "step": 10883 + }, + { + "epoch": 0.8192852707051318, + "grad_norm": 4.61710262298584, + "learning_rate": 8.433716982056749e-06, + "loss": 1.6597, + "step": 10884 + }, + { + "epoch": 0.819360544985792, + "grad_norm": 5.550755500793457, + "learning_rate": 8.426942695715101e-06, + "loss": 1.6254, + "step": 10885 + }, + { + "epoch": 0.8194358192664521, + "grad_norm": 6.744055271148682, + "learning_rate": 8.420170880772415e-06, + "loss": 1.6014, + "step": 10886 + }, + { + "epoch": 0.8195110935471123, + "grad_norm": 4.458874225616455, + "learning_rate": 8.413401537631277e-06, + "loss": 1.9159, + "step": 10887 + }, + { + "epoch": 0.8195863678277725, + "grad_norm": 4.733869552612305, + "learning_rate": 8.406634666694074e-06, + "loss": 1.7131, + "step": 10888 + }, + { + "epoch": 0.8196616421084326, + "grad_norm": 4.1965227127075195, + "learning_rate": 8.399870268363113e-06, + "loss": 1.5551, + "step": 10889 + }, + { + "epoch": 0.8197369163890927, + "grad_norm": 6.501962661743164, + "learning_rate": 8.393108343040484e-06, + "loss": 1.7853, + "step": 10890 + }, + { + "epoch": 0.8198121906697529, + "grad_norm": 5.464043617248535, + "learning_rate": 8.386348891128198e-06, + "loss": 2.0379, + "step": 10891 + }, + { + "epoch": 0.8198874649504131, + "grad_norm": 6.3169074058532715, + "learning_rate": 8.379591913028051e-06, + "loss": 1.7009, + "step": 10892 + }, + { + "epoch": 0.8199627392310732, + "grad_norm": 5.8659539222717285, + "learning_rate": 8.372837409141744e-06, + "loss": 1.9881, + "step": 10893 + }, + { + "epoch": 0.8200380135117333, + "grad_norm": 4.222545623779297, + "learning_rate": 8.366085379870814e-06, + "loss": 1.3388, + "step": 10894 + }, + { + "epoch": 0.8201132877923936, + "grad_norm": 4.818695068359375, + "learning_rate": 8.359335825616655e-06, + "loss": 1.7907, + "step": 10895 + }, + { + "epoch": 0.8201885620730537, + "grad_norm": 3.8797008991241455, + "learning_rate": 8.352588746780488e-06, + "loss": 1.8336, + "step": 10896 + }, + { + "epoch": 0.8202638363537138, + "grad_norm": 3.721681833267212, + "learning_rate": 8.345844143763437e-06, + "loss": 1.7255, + "step": 10897 + }, + { + "epoch": 0.820339110634374, + "grad_norm": 5.012353897094727, + "learning_rate": 8.339102016966415e-06, + "loss": 1.3947, + "step": 10898 + }, + { + "epoch": 0.8204143849150342, + "grad_norm": 5.829016208648682, + "learning_rate": 8.332362366790241e-06, + "loss": 1.7644, + "step": 10899 + }, + { + "epoch": 0.8204896591956943, + "grad_norm": 5.4228386878967285, + "learning_rate": 8.325625193635583e-06, + "loss": 1.6821, + "step": 10900 + }, + { + "epoch": 0.8205649334763545, + "grad_norm": 6.020086765289307, + "learning_rate": 8.318890497902914e-06, + "loss": 1.695, + "step": 10901 + }, + { + "epoch": 0.8206402077570146, + "grad_norm": 6.0146684646606445, + "learning_rate": 8.31215827999262e-06, + "loss": 1.7345, + "step": 10902 + }, + { + "epoch": 0.8207154820376747, + "grad_norm": 4.474878787994385, + "learning_rate": 8.305428540304889e-06, + "loss": 1.6151, + "step": 10903 + }, + { + "epoch": 0.820790756318335, + "grad_norm": 3.7589683532714844, + "learning_rate": 8.298701279239795e-06, + "loss": 1.858, + "step": 10904 + }, + { + "epoch": 0.8208660305989951, + "grad_norm": 3.725609540939331, + "learning_rate": 8.291976497197263e-06, + "loss": 1.7318, + "step": 10905 + }, + { + "epoch": 0.8209413048796552, + "grad_norm": 4.537235260009766, + "learning_rate": 8.285254194577057e-06, + "loss": 1.9291, + "step": 10906 + }, + { + "epoch": 0.8210165791603155, + "grad_norm": 6.215237617492676, + "learning_rate": 8.27853437177879e-06, + "loss": 1.6112, + "step": 10907 + }, + { + "epoch": 0.8210918534409756, + "grad_norm": 4.277639865875244, + "learning_rate": 8.27181702920195e-06, + "loss": 2.1441, + "step": 10908 + }, + { + "epoch": 0.8211671277216357, + "grad_norm": 4.801058769226074, + "learning_rate": 8.265102167245848e-06, + "loss": 1.8406, + "step": 10909 + }, + { + "epoch": 0.8212424020022958, + "grad_norm": 5.4379730224609375, + "learning_rate": 8.258389786309677e-06, + "loss": 1.6644, + "step": 10910 + }, + { + "epoch": 0.821317676282956, + "grad_norm": 6.305517196655273, + "learning_rate": 8.251679886792457e-06, + "loss": 1.7889, + "step": 10911 + }, + { + "epoch": 0.8213929505636162, + "grad_norm": 5.066317081451416, + "learning_rate": 8.244972469093092e-06, + "loss": 1.8237, + "step": 10912 + }, + { + "epoch": 0.8214682248442763, + "grad_norm": 5.019262790679932, + "learning_rate": 8.23826753361029e-06, + "loss": 1.7514, + "step": 10913 + }, + { + "epoch": 0.8215434991249365, + "grad_norm": 4.752792835235596, + "learning_rate": 8.231565080742654e-06, + "loss": 1.728, + "step": 10914 + }, + { + "epoch": 0.8216187734055966, + "grad_norm": 3.877133846282959, + "learning_rate": 8.22486511088863e-06, + "loss": 1.7324, + "step": 10915 + }, + { + "epoch": 0.8216940476862568, + "grad_norm": 4.577157974243164, + "learning_rate": 8.218167624446504e-06, + "loss": 2.0657, + "step": 10916 + }, + { + "epoch": 0.821769321966917, + "grad_norm": 4.082424163818359, + "learning_rate": 8.211472621814441e-06, + "loss": 1.5422, + "step": 10917 + }, + { + "epoch": 0.8218445962475771, + "grad_norm": 5.90043306350708, + "learning_rate": 8.204780103390414e-06, + "loss": 2.0231, + "step": 10918 + }, + { + "epoch": 0.8219198705282372, + "grad_norm": 5.470618724822998, + "learning_rate": 8.198090069572301e-06, + "loss": 2.0521, + "step": 10919 + }, + { + "epoch": 0.8219951448088975, + "grad_norm": 4.029650688171387, + "learning_rate": 8.191402520757773e-06, + "loss": 1.7807, + "step": 10920 + }, + { + "epoch": 0.8220704190895576, + "grad_norm": 6.506187915802002, + "learning_rate": 8.184717457344421e-06, + "loss": 1.631, + "step": 10921 + }, + { + "epoch": 0.8221456933702177, + "grad_norm": 4.254798412322998, + "learning_rate": 8.178034879729618e-06, + "loss": 1.3766, + "step": 10922 + }, + { + "epoch": 0.8222209676508779, + "grad_norm": 5.289538383483887, + "learning_rate": 8.171354788310659e-06, + "loss": 2.1839, + "step": 10923 + }, + { + "epoch": 0.822296241931538, + "grad_norm": 4.185739040374756, + "learning_rate": 8.16467718348462e-06, + "loss": 2.0434, + "step": 10924 + }, + { + "epoch": 0.8223715162121982, + "grad_norm": 4.849399089813232, + "learning_rate": 8.158002065648485e-06, + "loss": 1.956, + "step": 10925 + }, + { + "epoch": 0.8224467904928584, + "grad_norm": 7.248867034912109, + "learning_rate": 8.151329435199068e-06, + "loss": 2.0818, + "step": 10926 + }, + { + "epoch": 0.8225220647735185, + "grad_norm": 4.065545082092285, + "learning_rate": 8.144659292533052e-06, + "loss": 1.5531, + "step": 10927 + }, + { + "epoch": 0.8225973390541786, + "grad_norm": 4.054635047912598, + "learning_rate": 8.137991638046932e-06, + "loss": 1.9219, + "step": 10928 + }, + { + "epoch": 0.8226726133348388, + "grad_norm": 5.3163676261901855, + "learning_rate": 8.131326472137107e-06, + "loss": 1.6865, + "step": 10929 + }, + { + "epoch": 0.822747887615499, + "grad_norm": 4.921208381652832, + "learning_rate": 8.124663795199771e-06, + "loss": 2.0788, + "step": 10930 + }, + { + "epoch": 0.8228231618961591, + "grad_norm": 5.136446475982666, + "learning_rate": 8.118003607631025e-06, + "loss": 1.5267, + "step": 10931 + }, + { + "epoch": 0.8228984361768192, + "grad_norm": 5.207831382751465, + "learning_rate": 8.111345909826801e-06, + "loss": 1.9983, + "step": 10932 + }, + { + "epoch": 0.8229737104574795, + "grad_norm": 4.303314685821533, + "learning_rate": 8.10469070218286e-06, + "loss": 1.9486, + "step": 10933 + }, + { + "epoch": 0.8230489847381396, + "grad_norm": 4.740650653839111, + "learning_rate": 8.098037985094858e-06, + "loss": 2.025, + "step": 10934 + }, + { + "epoch": 0.8231242590187997, + "grad_norm": 4.641570091247559, + "learning_rate": 8.091387758958253e-06, + "loss": 1.952, + "step": 10935 + }, + { + "epoch": 0.8231995332994599, + "grad_norm": 4.878793239593506, + "learning_rate": 8.084740024168408e-06, + "loss": 1.987, + "step": 10936 + }, + { + "epoch": 0.8232748075801201, + "grad_norm": 5.910962104797363, + "learning_rate": 8.078094781120494e-06, + "loss": 1.8295, + "step": 10937 + }, + { + "epoch": 0.8233500818607802, + "grad_norm": 4.991444110870361, + "learning_rate": 8.071452030209553e-06, + "loss": 1.8612, + "step": 10938 + }, + { + "epoch": 0.8234253561414404, + "grad_norm": 4.691476821899414, + "learning_rate": 8.064811771830483e-06, + "loss": 2.094, + "step": 10939 + }, + { + "epoch": 0.8235006304221005, + "grad_norm": 7.601766586303711, + "learning_rate": 8.058174006378039e-06, + "loss": 1.766, + "step": 10940 + }, + { + "epoch": 0.8235759047027607, + "grad_norm": 5.229360580444336, + "learning_rate": 8.051538734246793e-06, + "loss": 2.0167, + "step": 10941 + }, + { + "epoch": 0.8236511789834209, + "grad_norm": 6.295788764953613, + "learning_rate": 8.044905955831223e-06, + "loss": 1.6874, + "step": 10942 + }, + { + "epoch": 0.823726453264081, + "grad_norm": 4.330489158630371, + "learning_rate": 8.038275671525592e-06, + "loss": 1.8229, + "step": 10943 + }, + { + "epoch": 0.8238017275447411, + "grad_norm": 5.644754409790039, + "learning_rate": 8.031647881724086e-06, + "loss": 1.9557, + "step": 10944 + }, + { + "epoch": 0.8238770018254014, + "grad_norm": 5.599817276000977, + "learning_rate": 8.025022586820679e-06, + "loss": 1.8217, + "step": 10945 + }, + { + "epoch": 0.8239522761060615, + "grad_norm": 4.577640533447266, + "learning_rate": 8.018399787209258e-06, + "loss": 1.6692, + "step": 10946 + }, + { + "epoch": 0.8240275503867216, + "grad_norm": 7.905070781707764, + "learning_rate": 8.011779483283494e-06, + "loss": 2.8015, + "step": 10947 + }, + { + "epoch": 0.8241028246673817, + "grad_norm": 6.149569034576416, + "learning_rate": 8.005161675436961e-06, + "loss": 1.7827, + "step": 10948 + }, + { + "epoch": 0.824178098948042, + "grad_norm": 5.115812301635742, + "learning_rate": 7.998546364063069e-06, + "loss": 1.6647, + "step": 10949 + }, + { + "epoch": 0.8242533732287021, + "grad_norm": 4.170004844665527, + "learning_rate": 7.99193354955508e-06, + "loss": 1.7517, + "step": 10950 + }, + { + "epoch": 0.8243286475093622, + "grad_norm": 4.287388324737549, + "learning_rate": 7.985323232306124e-06, + "loss": 2.1085, + "step": 10951 + }, + { + "epoch": 0.8244039217900224, + "grad_norm": 5.317685127258301, + "learning_rate": 7.978715412709131e-06, + "loss": 1.9699, + "step": 10952 + }, + { + "epoch": 0.8244791960706825, + "grad_norm": 5.4601731300354, + "learning_rate": 7.972110091156947e-06, + "loss": 2.0166, + "step": 10953 + }, + { + "epoch": 0.8245544703513427, + "grad_norm": 6.129608154296875, + "learning_rate": 7.965507268042217e-06, + "loss": 1.7415, + "step": 10954 + }, + { + "epoch": 0.8246297446320029, + "grad_norm": 4.224732875823975, + "learning_rate": 7.958906943757483e-06, + "loss": 1.8813, + "step": 10955 + }, + { + "epoch": 0.824705018912663, + "grad_norm": 3.9492104053497314, + "learning_rate": 7.952309118695084e-06, + "loss": 1.677, + "step": 10956 + }, + { + "epoch": 0.8247802931933231, + "grad_norm": 4.593045234680176, + "learning_rate": 7.945713793247273e-06, + "loss": 1.5629, + "step": 10957 + }, + { + "epoch": 0.8248555674739834, + "grad_norm": 5.2660064697265625, + "learning_rate": 7.939120967806101e-06, + "loss": 1.6015, + "step": 10958 + }, + { + "epoch": 0.8249308417546435, + "grad_norm": 5.169642925262451, + "learning_rate": 7.932530642763498e-06, + "loss": 2.0367, + "step": 10959 + }, + { + "epoch": 0.8250061160353036, + "grad_norm": 8.278311729431152, + "learning_rate": 7.925942818511245e-06, + "loss": 2.0329, + "step": 10960 + }, + { + "epoch": 0.8250813903159638, + "grad_norm": 4.970489501953125, + "learning_rate": 7.919357495440977e-06, + "loss": 1.6282, + "step": 10961 + }, + { + "epoch": 0.825156664596624, + "grad_norm": 3.97762131690979, + "learning_rate": 7.912774673944157e-06, + "loss": 1.7111, + "step": 10962 + }, + { + "epoch": 0.8252319388772841, + "grad_norm": 4.463409423828125, + "learning_rate": 7.906194354412117e-06, + "loss": 2.1348, + "step": 10963 + }, + { + "epoch": 0.8253072131579443, + "grad_norm": 7.818986415863037, + "learning_rate": 7.89961653723605e-06, + "loss": 1.9446, + "step": 10964 + }, + { + "epoch": 0.8253824874386044, + "grad_norm": 5.592797756195068, + "learning_rate": 7.893041222806975e-06, + "loss": 1.8107, + "step": 10965 + }, + { + "epoch": 0.8254577617192645, + "grad_norm": 6.234675884246826, + "learning_rate": 7.886468411515784e-06, + "loss": 1.8526, + "step": 10966 + }, + { + "epoch": 0.8255330359999248, + "grad_norm": 5.361891269683838, + "learning_rate": 7.879898103753208e-06, + "loss": 1.5978, + "step": 10967 + }, + { + "epoch": 0.8256083102805849, + "grad_norm": 3.8418986797332764, + "learning_rate": 7.873330299909837e-06, + "loss": 1.909, + "step": 10968 + }, + { + "epoch": 0.825683584561245, + "grad_norm": 5.313836097717285, + "learning_rate": 7.866765000376098e-06, + "loss": 1.7203, + "step": 10969 + }, + { + "epoch": 0.8257588588419051, + "grad_norm": 5.107827663421631, + "learning_rate": 7.860202205542282e-06, + "loss": 1.6603, + "step": 10970 + }, + { + "epoch": 0.8258341331225654, + "grad_norm": 5.811594009399414, + "learning_rate": 7.853641915798532e-06, + "loss": 2.0315, + "step": 10971 + }, + { + "epoch": 0.8259094074032255, + "grad_norm": 3.613593101501465, + "learning_rate": 7.847084131534855e-06, + "loss": 1.917, + "step": 10972 + }, + { + "epoch": 0.8259846816838856, + "grad_norm": 5.179539203643799, + "learning_rate": 7.84052885314106e-06, + "loss": 1.7584, + "step": 10973 + }, + { + "epoch": 0.8260599559645458, + "grad_norm": 4.686038970947266, + "learning_rate": 7.833976081006873e-06, + "loss": 2.0446, + "step": 10974 + }, + { + "epoch": 0.826135230245206, + "grad_norm": 4.419909477233887, + "learning_rate": 7.827425815521804e-06, + "loss": 1.9806, + "step": 10975 + }, + { + "epoch": 0.8262105045258661, + "grad_norm": 5.647611141204834, + "learning_rate": 7.820878057075275e-06, + "loss": 1.8023, + "step": 10976 + }, + { + "epoch": 0.8262857788065263, + "grad_norm": 6.002238750457764, + "learning_rate": 7.814332806056507e-06, + "loss": 1.6235, + "step": 10977 + }, + { + "epoch": 0.8263610530871864, + "grad_norm": 4.448552131652832, + "learning_rate": 7.807790062854625e-06, + "loss": 1.6161, + "step": 10978 + }, + { + "epoch": 0.8264363273678466, + "grad_norm": 6.940880298614502, + "learning_rate": 7.801249827858547e-06, + "loss": 2.0792, + "step": 10979 + }, + { + "epoch": 0.8265116016485068, + "grad_norm": 5.002687454223633, + "learning_rate": 7.794712101457086e-06, + "loss": 1.522, + "step": 10980 + }, + { + "epoch": 0.8265868759291669, + "grad_norm": 4.470086097717285, + "learning_rate": 7.788176884038889e-06, + "loss": 1.8557, + "step": 10981 + }, + { + "epoch": 0.826662150209827, + "grad_norm": 4.128772735595703, + "learning_rate": 7.78164417599246e-06, + "loss": 1.8809, + "step": 10982 + }, + { + "epoch": 0.8267374244904873, + "grad_norm": 5.70338249206543, + "learning_rate": 7.77511397770615e-06, + "loss": 2.0448, + "step": 10983 + }, + { + "epoch": 0.8268126987711474, + "grad_norm": 4.384184837341309, + "learning_rate": 7.768586289568153e-06, + "loss": 2.0023, + "step": 10984 + }, + { + "epoch": 0.8268879730518075, + "grad_norm": 6.2365875244140625, + "learning_rate": 7.762061111966534e-06, + "loss": 1.9373, + "step": 10985 + }, + { + "epoch": 0.8269632473324677, + "grad_norm": 4.188620567321777, + "learning_rate": 7.755538445289179e-06, + "loss": 1.9007, + "step": 10986 + }, + { + "epoch": 0.8270385216131279, + "grad_norm": 4.218332767486572, + "learning_rate": 7.74901828992386e-06, + "loss": 1.8203, + "step": 10987 + }, + { + "epoch": 0.827113795893788, + "grad_norm": 5.843628883361816, + "learning_rate": 7.74250064625816e-06, + "loss": 1.6626, + "step": 10988 + }, + { + "epoch": 0.8271890701744481, + "grad_norm": 4.487967014312744, + "learning_rate": 7.735985514679561e-06, + "loss": 2.19, + "step": 10989 + }, + { + "epoch": 0.8272643444551083, + "grad_norm": 4.762287139892578, + "learning_rate": 7.729472895575341e-06, + "loss": 1.7855, + "step": 10990 + }, + { + "epoch": 0.8273396187357684, + "grad_norm": 5.629855155944824, + "learning_rate": 7.722962789332676e-06, + "loss": 1.6636, + "step": 10991 + }, + { + "epoch": 0.8274148930164286, + "grad_norm": 4.606715679168701, + "learning_rate": 7.716455196338563e-06, + "loss": 2.0147, + "step": 10992 + }, + { + "epoch": 0.8274901672970888, + "grad_norm": 5.693932056427002, + "learning_rate": 7.709950116979858e-06, + "loss": 1.7612, + "step": 10993 + }, + { + "epoch": 0.8275654415777489, + "grad_norm": 5.193629264831543, + "learning_rate": 7.703447551643278e-06, + "loss": 1.734, + "step": 10994 + }, + { + "epoch": 0.827640715858409, + "grad_norm": 5.53890323638916, + "learning_rate": 7.69694750071538e-06, + "loss": 1.971, + "step": 10995 + }, + { + "epoch": 0.8277159901390693, + "grad_norm": 5.0209503173828125, + "learning_rate": 7.69044996458258e-06, + "loss": 1.7613, + "step": 10996 + }, + { + "epoch": 0.8277912644197294, + "grad_norm": 5.361001491546631, + "learning_rate": 7.683954943631116e-06, + "loss": 1.7346, + "step": 10997 + }, + { + "epoch": 0.8278665387003895, + "grad_norm": 4.7079081535339355, + "learning_rate": 7.677462438247129e-06, + "loss": 1.5784, + "step": 10998 + }, + { + "epoch": 0.8279418129810497, + "grad_norm": 4.381944179534912, + "learning_rate": 7.67097244881655e-06, + "loss": 1.6478, + "step": 10999 + }, + { + "epoch": 0.8280170872617099, + "grad_norm": 4.929294109344482, + "learning_rate": 7.664484975725211e-06, + "loss": 1.6248, + "step": 11000 + }, + { + "epoch": 0.82809236154237, + "grad_norm": 6.1471028327941895, + "learning_rate": 7.658000019358764e-06, + "loss": 2.0462, + "step": 11001 + }, + { + "epoch": 0.8281676358230302, + "grad_norm": 4.5387983322143555, + "learning_rate": 7.651517580102724e-06, + "loss": 1.6195, + "step": 11002 + }, + { + "epoch": 0.8282429101036903, + "grad_norm": 4.5794572830200195, + "learning_rate": 7.645037658342447e-06, + "loss": 1.6948, + "step": 11003 + }, + { + "epoch": 0.8283181843843505, + "grad_norm": 4.946218490600586, + "learning_rate": 7.63856025446315e-06, + "loss": 1.6512, + "step": 11004 + }, + { + "epoch": 0.8283934586650107, + "grad_norm": 6.110923767089844, + "learning_rate": 7.632085368849896e-06, + "loss": 1.8117, + "step": 11005 + }, + { + "epoch": 0.8284687329456708, + "grad_norm": 4.481554985046387, + "learning_rate": 7.62561300188761e-06, + "loss": 1.6948, + "step": 11006 + }, + { + "epoch": 0.8285440072263309, + "grad_norm": 4.663935661315918, + "learning_rate": 7.619143153961039e-06, + "loss": 1.9378, + "step": 11007 + }, + { + "epoch": 0.828619281506991, + "grad_norm": 6.7290730476379395, + "learning_rate": 7.6126758254548125e-06, + "loss": 1.6098, + "step": 11008 + }, + { + "epoch": 0.8286945557876513, + "grad_norm": 4.337218284606934, + "learning_rate": 7.6062110167533726e-06, + "loss": 1.7352, + "step": 11009 + }, + { + "epoch": 0.8287698300683114, + "grad_norm": 6.8893866539001465, + "learning_rate": 7.599748728241057e-06, + "loss": 1.385, + "step": 11010 + }, + { + "epoch": 0.8288451043489715, + "grad_norm": 4.733375549316406, + "learning_rate": 7.593288960302008e-06, + "loss": 1.3784, + "step": 11011 + }, + { + "epoch": 0.8289203786296317, + "grad_norm": 5.756259441375732, + "learning_rate": 7.586831713320253e-06, + "loss": 1.6168, + "step": 11012 + }, + { + "epoch": 0.8289956529102919, + "grad_norm": 7.07257604598999, + "learning_rate": 7.580376987679666e-06, + "loss": 1.8568, + "step": 11013 + }, + { + "epoch": 0.829070927190952, + "grad_norm": 5.0352253913879395, + "learning_rate": 7.573924783763942e-06, + "loss": 2.01, + "step": 11014 + }, + { + "epoch": 0.8291462014716122, + "grad_norm": 4.676278591156006, + "learning_rate": 7.567475101956656e-06, + "loss": 1.771, + "step": 11015 + }, + { + "epoch": 0.8292214757522723, + "grad_norm": 4.422346591949463, + "learning_rate": 7.561027942641219e-06, + "loss": 1.8488, + "step": 11016 + }, + { + "epoch": 0.8292967500329325, + "grad_norm": 4.946375846862793, + "learning_rate": 7.554583306200913e-06, + "loss": 1.6746, + "step": 11017 + }, + { + "epoch": 0.8293720243135927, + "grad_norm": 4.293478012084961, + "learning_rate": 7.548141193018832e-06, + "loss": 1.7332, + "step": 11018 + }, + { + "epoch": 0.8294472985942528, + "grad_norm": 5.111875534057617, + "learning_rate": 7.541701603477957e-06, + "loss": 1.8061, + "step": 11019 + }, + { + "epoch": 0.8295225728749129, + "grad_norm": 6.037639617919922, + "learning_rate": 7.535264537961084e-06, + "loss": 1.7114, + "step": 11020 + }, + { + "epoch": 0.8295978471555732, + "grad_norm": 5.5868964195251465, + "learning_rate": 7.5288299968509e-06, + "loss": 1.9138, + "step": 11021 + }, + { + "epoch": 0.8296731214362333, + "grad_norm": 4.353113651275635, + "learning_rate": 7.5223979805299e-06, + "loss": 1.993, + "step": 11022 + }, + { + "epoch": 0.8297483957168934, + "grad_norm": 5.670248985290527, + "learning_rate": 7.515968489380465e-06, + "loss": 2.2826, + "step": 11023 + }, + { + "epoch": 0.8298236699975536, + "grad_norm": 5.503589153289795, + "learning_rate": 7.509541523784797e-06, + "loss": 1.7227, + "step": 11024 + }, + { + "epoch": 0.8298989442782138, + "grad_norm": 3.9664077758789062, + "learning_rate": 7.503117084124961e-06, + "loss": 1.675, + "step": 11025 + }, + { + "epoch": 0.8299742185588739, + "grad_norm": 5.680034637451172, + "learning_rate": 7.49669517078288e-06, + "loss": 2.2477, + "step": 11026 + }, + { + "epoch": 0.830049492839534, + "grad_norm": 4.639273166656494, + "learning_rate": 7.4902757841403325e-06, + "loss": 1.8315, + "step": 11027 + }, + { + "epoch": 0.8301247671201942, + "grad_norm": 6.767391204833984, + "learning_rate": 7.483858924578896e-06, + "loss": 2.0925, + "step": 11028 + }, + { + "epoch": 0.8302000414008543, + "grad_norm": 4.622908115386963, + "learning_rate": 7.47744459248006e-06, + "loss": 1.661, + "step": 11029 + }, + { + "epoch": 0.8302753156815145, + "grad_norm": 5.5543131828308105, + "learning_rate": 7.4710327882251355e-06, + "loss": 1.7081, + "step": 11030 + }, + { + "epoch": 0.8303505899621747, + "grad_norm": 5.449149131774902, + "learning_rate": 7.464623512195279e-06, + "loss": 2.1125, + "step": 11031 + }, + { + "epoch": 0.8304258642428348, + "grad_norm": 5.358203411102295, + "learning_rate": 7.458216764771514e-06, + "loss": 1.7113, + "step": 11032 + }, + { + "epoch": 0.8305011385234949, + "grad_norm": 5.866293907165527, + "learning_rate": 7.4518125463346855e-06, + "loss": 1.6806, + "step": 11033 + }, + { + "epoch": 0.8305764128041552, + "grad_norm": 4.008992671966553, + "learning_rate": 7.445410857265528e-06, + "loss": 1.8011, + "step": 11034 + }, + { + "epoch": 0.8306516870848153, + "grad_norm": 4.065059661865234, + "learning_rate": 7.439011697944581e-06, + "loss": 1.6445, + "step": 11035 + }, + { + "epoch": 0.8307269613654754, + "grad_norm": 5.211691856384277, + "learning_rate": 7.43261506875228e-06, + "loss": 1.6852, + "step": 11036 + }, + { + "epoch": 0.8308022356461356, + "grad_norm": 5.035432815551758, + "learning_rate": 7.426220970068864e-06, + "loss": 1.615, + "step": 11037 + }, + { + "epoch": 0.8308775099267958, + "grad_norm": 4.664009094238281, + "learning_rate": 7.419829402274453e-06, + "loss": 1.9492, + "step": 11038 + }, + { + "epoch": 0.8309527842074559, + "grad_norm": 3.64105224609375, + "learning_rate": 7.413440365749002e-06, + "loss": 1.8129, + "step": 11039 + }, + { + "epoch": 0.8310280584881161, + "grad_norm": 6.603893280029297, + "learning_rate": 7.407053860872343e-06, + "loss": 1.7318, + "step": 11040 + }, + { + "epoch": 0.8311033327687762, + "grad_norm": 6.286447525024414, + "learning_rate": 7.4006698880241084e-06, + "loss": 2.0356, + "step": 11041 + }, + { + "epoch": 0.8311786070494364, + "grad_norm": 5.453951835632324, + "learning_rate": 7.394288447583825e-06, + "loss": 1.9739, + "step": 11042 + }, + { + "epoch": 0.8312538813300966, + "grad_norm": 5.093417644500732, + "learning_rate": 7.3879095399308375e-06, + "loss": 1.6465, + "step": 11043 + }, + { + "epoch": 0.8313291556107567, + "grad_norm": 5.27060604095459, + "learning_rate": 7.381533165444355e-06, + "loss": 2.0461, + "step": 11044 + }, + { + "epoch": 0.8314044298914168, + "grad_norm": 6.195003986358643, + "learning_rate": 7.375159324503456e-06, + "loss": 1.876, + "step": 11045 + }, + { + "epoch": 0.8314797041720771, + "grad_norm": 4.605326175689697, + "learning_rate": 7.368788017487016e-06, + "loss": 2.0004, + "step": 11046 + }, + { + "epoch": 0.8315549784527372, + "grad_norm": 4.836153984069824, + "learning_rate": 7.362419244773816e-06, + "loss": 1.77, + "step": 11047 + }, + { + "epoch": 0.8316302527333973, + "grad_norm": 11.919204711914062, + "learning_rate": 7.356053006742442e-06, + "loss": 1.4523, + "step": 11048 + }, + { + "epoch": 0.8317055270140574, + "grad_norm": 4.372076034545898, + "learning_rate": 7.3496893037713564e-06, + "loss": 1.6074, + "step": 11049 + }, + { + "epoch": 0.8317808012947177, + "grad_norm": 5.7756757736206055, + "learning_rate": 7.343328136238869e-06, + "loss": 1.1829, + "step": 11050 + }, + { + "epoch": 0.8318560755753778, + "grad_norm": 6.489713668823242, + "learning_rate": 7.336969504523133e-06, + "loss": 2.4567, + "step": 11051 + }, + { + "epoch": 0.8319313498560379, + "grad_norm": 3.8584771156311035, + "learning_rate": 7.330613409002135e-06, + "loss": 2.0335, + "step": 11052 + }, + { + "epoch": 0.8320066241366981, + "grad_norm": 4.212832450866699, + "learning_rate": 7.324259850053755e-06, + "loss": 1.555, + "step": 11053 + }, + { + "epoch": 0.8320818984173582, + "grad_norm": 4.950713634490967, + "learning_rate": 7.317908828055659e-06, + "loss": 1.8308, + "step": 11054 + }, + { + "epoch": 0.8321571726980184, + "grad_norm": 5.218753814697266, + "learning_rate": 7.311560343385432e-06, + "loss": 1.705, + "step": 11055 + }, + { + "epoch": 0.8322324469786786, + "grad_norm": 6.191812038421631, + "learning_rate": 7.30521439642044e-06, + "loss": 1.8691, + "step": 11056 + }, + { + "epoch": 0.8323077212593387, + "grad_norm": 6.975505828857422, + "learning_rate": 7.298870987537959e-06, + "loss": 1.8965, + "step": 11057 + }, + { + "epoch": 0.8323829955399988, + "grad_norm": 5.136867046356201, + "learning_rate": 7.292530117115059e-06, + "loss": 1.5467, + "step": 11058 + }, + { + "epoch": 0.8324582698206591, + "grad_norm": 4.7672529220581055, + "learning_rate": 7.286191785528707e-06, + "loss": 1.9761, + "step": 11059 + }, + { + "epoch": 0.8325335441013192, + "grad_norm": 5.822338104248047, + "learning_rate": 7.2798559931556934e-06, + "loss": 1.9192, + "step": 11060 + }, + { + "epoch": 0.8326088183819793, + "grad_norm": 4.806952476501465, + "learning_rate": 7.273522740372662e-06, + "loss": 1.9103, + "step": 11061 + }, + { + "epoch": 0.8326840926626395, + "grad_norm": 4.035099506378174, + "learning_rate": 7.267192027556119e-06, + "loss": 1.9447, + "step": 11062 + }, + { + "epoch": 0.8327593669432997, + "grad_norm": 5.279050350189209, + "learning_rate": 7.260863855082389e-06, + "loss": 1.8112, + "step": 11063 + }, + { + "epoch": 0.8328346412239598, + "grad_norm": 5.123414039611816, + "learning_rate": 7.254538223327678e-06, + "loss": 1.9104, + "step": 11064 + }, + { + "epoch": 0.83290991550462, + "grad_norm": 3.678065538406372, + "learning_rate": 7.248215132668007e-06, + "loss": 2.034, + "step": 11065 + }, + { + "epoch": 0.8329851897852801, + "grad_norm": 5.231009483337402, + "learning_rate": 7.241894583479286e-06, + "loss": 1.6622, + "step": 11066 + }, + { + "epoch": 0.8330604640659403, + "grad_norm": 5.201895713806152, + "learning_rate": 7.235576576137243e-06, + "loss": 1.8051, + "step": 11067 + }, + { + "epoch": 0.8331357383466004, + "grad_norm": 5.084416389465332, + "learning_rate": 7.229261111017471e-06, + "loss": 1.8061, + "step": 11068 + }, + { + "epoch": 0.8332110126272606, + "grad_norm": 4.602647304534912, + "learning_rate": 7.222948188495393e-06, + "loss": 1.8797, + "step": 11069 + }, + { + "epoch": 0.8332862869079207, + "grad_norm": 5.758812427520752, + "learning_rate": 7.216637808946308e-06, + "loss": 1.8662, + "step": 11070 + }, + { + "epoch": 0.8333615611885808, + "grad_norm": 4.816676139831543, + "learning_rate": 7.210329972745344e-06, + "loss": 1.7012, + "step": 11071 + }, + { + "epoch": 0.8334368354692411, + "grad_norm": 4.679792404174805, + "learning_rate": 7.204024680267496e-06, + "loss": 1.7483, + "step": 11072 + }, + { + "epoch": 0.8335121097499012, + "grad_norm": 4.840357303619385, + "learning_rate": 7.197721931887574e-06, + "loss": 1.7046, + "step": 11073 + }, + { + "epoch": 0.8335873840305613, + "grad_norm": 5.905903339385986, + "learning_rate": 7.19142172798028e-06, + "loss": 2.0841, + "step": 11074 + }, + { + "epoch": 0.8336626583112215, + "grad_norm": 7.352158546447754, + "learning_rate": 7.185124068920124e-06, + "loss": 1.7296, + "step": 11075 + }, + { + "epoch": 0.8337379325918817, + "grad_norm": 6.462066650390625, + "learning_rate": 7.17882895508149e-06, + "loss": 1.4067, + "step": 11076 + }, + { + "epoch": 0.8338132068725418, + "grad_norm": 6.375122547149658, + "learning_rate": 7.1725363868386185e-06, + "loss": 2.0245, + "step": 11077 + }, + { + "epoch": 0.833888481153202, + "grad_norm": 6.159372806549072, + "learning_rate": 7.166246364565565e-06, + "loss": 1.8356, + "step": 11078 + }, + { + "epoch": 0.8339637554338621, + "grad_norm": 5.991606712341309, + "learning_rate": 7.15995888863627e-06, + "loss": 1.7666, + "step": 11079 + }, + { + "epoch": 0.8340390297145223, + "grad_norm": 5.612191200256348, + "learning_rate": 7.153673959424484e-06, + "loss": 2.0731, + "step": 11080 + }, + { + "epoch": 0.8341143039951825, + "grad_norm": 5.853646755218506, + "learning_rate": 7.147391577303847e-06, + "loss": 1.7331, + "step": 11081 + }, + { + "epoch": 0.8341895782758426, + "grad_norm": 4.453214168548584, + "learning_rate": 7.141111742647816e-06, + "loss": 1.5959, + "step": 11082 + }, + { + "epoch": 0.8342648525565027, + "grad_norm": 4.861266136169434, + "learning_rate": 7.134834455829731e-06, + "loss": 1.8731, + "step": 11083 + }, + { + "epoch": 0.834340126837163, + "grad_norm": 4.7663893699646, + "learning_rate": 7.1285597172227325e-06, + "loss": 1.6521, + "step": 11084 + }, + { + "epoch": 0.8344154011178231, + "grad_norm": 5.383310317993164, + "learning_rate": 7.122287527199861e-06, + "loss": 1.8253, + "step": 11085 + }, + { + "epoch": 0.8344906753984832, + "grad_norm": 6.459989070892334, + "learning_rate": 7.116017886133946e-06, + "loss": 1.7457, + "step": 11086 + }, + { + "epoch": 0.8345659496791433, + "grad_norm": 5.328629016876221, + "learning_rate": 7.1097507943977405e-06, + "loss": 1.7066, + "step": 11087 + }, + { + "epoch": 0.8346412239598036, + "grad_norm": 3.785832405090332, + "learning_rate": 7.103486252363767e-06, + "loss": 1.8998, + "step": 11088 + }, + { + "epoch": 0.8347164982404637, + "grad_norm": 6.3931450843811035, + "learning_rate": 7.097224260404467e-06, + "loss": 1.6242, + "step": 11089 + }, + { + "epoch": 0.8347917725211238, + "grad_norm": 4.314263820648193, + "learning_rate": 7.090964818892071e-06, + "loss": 1.5718, + "step": 11090 + }, + { + "epoch": 0.834867046801784, + "grad_norm": 6.2679762840271, + "learning_rate": 7.084707928198703e-06, + "loss": 1.7489, + "step": 11091 + }, + { + "epoch": 0.8349423210824441, + "grad_norm": 5.2999491691589355, + "learning_rate": 7.078453588696304e-06, + "loss": 1.9958, + "step": 11092 + }, + { + "epoch": 0.8350175953631043, + "grad_norm": 5.195150852203369, + "learning_rate": 7.072201800756684e-06, + "loss": 1.8579, + "step": 11093 + }, + { + "epoch": 0.8350928696437645, + "grad_norm": 5.689398288726807, + "learning_rate": 7.065952564751488e-06, + "loss": 1.9673, + "step": 11094 + }, + { + "epoch": 0.8351681439244246, + "grad_norm": 6.044033527374268, + "learning_rate": 7.0597058810522255e-06, + "loss": 2.3433, + "step": 11095 + }, + { + "epoch": 0.8352434182050847, + "grad_norm": 4.774587154388428, + "learning_rate": 7.053461750030249e-06, + "loss": 2.0049, + "step": 11096 + }, + { + "epoch": 0.835318692485745, + "grad_norm": 6.9197211265563965, + "learning_rate": 7.047220172056734e-06, + "loss": 2.149, + "step": 11097 + }, + { + "epoch": 0.8353939667664051, + "grad_norm": 4.784257888793945, + "learning_rate": 7.040981147502746e-06, + "loss": 1.7605, + "step": 11098 + }, + { + "epoch": 0.8354692410470652, + "grad_norm": 5.215122222900391, + "learning_rate": 7.034744676739152e-06, + "loss": 1.7546, + "step": 11099 + }, + { + "epoch": 0.8355445153277254, + "grad_norm": 6.411661148071289, + "learning_rate": 7.028510760136719e-06, + "loss": 1.5137, + "step": 11100 + }, + { + "epoch": 0.8356197896083856, + "grad_norm": 5.081946849822998, + "learning_rate": 7.022279398066006e-06, + "loss": 1.7903, + "step": 11101 + }, + { + "epoch": 0.8356950638890457, + "grad_norm": 5.067341327667236, + "learning_rate": 7.016050590897482e-06, + "loss": 2.0615, + "step": 11102 + }, + { + "epoch": 0.8357703381697059, + "grad_norm": 5.401487350463867, + "learning_rate": 7.009824339001403e-06, + "loss": 1.7912, + "step": 11103 + }, + { + "epoch": 0.835845612450366, + "grad_norm": 4.524129867553711, + "learning_rate": 7.0036006427479154e-06, + "loss": 1.8348, + "step": 11104 + }, + { + "epoch": 0.8359208867310262, + "grad_norm": 7.708333969116211, + "learning_rate": 6.997379502507001e-06, + "loss": 2.102, + "step": 11105 + }, + { + "epoch": 0.8359961610116863, + "grad_norm": 4.23085355758667, + "learning_rate": 6.991160918648493e-06, + "loss": 1.7265, + "step": 11106 + }, + { + "epoch": 0.8360714352923465, + "grad_norm": 4.6369218826293945, + "learning_rate": 6.9849448915420555e-06, + "loss": 1.9469, + "step": 11107 + }, + { + "epoch": 0.8361467095730066, + "grad_norm": 4.357454776763916, + "learning_rate": 6.9787314215572165e-06, + "loss": 1.6868, + "step": 11108 + }, + { + "epoch": 0.8362219838536668, + "grad_norm": 4.764465808868408, + "learning_rate": 6.972520509063363e-06, + "loss": 1.4145, + "step": 11109 + }, + { + "epoch": 0.836297258134327, + "grad_norm": 5.528480529785156, + "learning_rate": 6.966312154429699e-06, + "loss": 1.5341, + "step": 11110 + }, + { + "epoch": 0.8363725324149871, + "grad_norm": 4.671770095825195, + "learning_rate": 6.9601063580253086e-06, + "loss": 1.8078, + "step": 11111 + }, + { + "epoch": 0.8364478066956472, + "grad_norm": 3.9453747272491455, + "learning_rate": 6.953903120219091e-06, + "loss": 1.7232, + "step": 11112 + }, + { + "epoch": 0.8365230809763075, + "grad_norm": 6.011135101318359, + "learning_rate": 6.947702441379828e-06, + "loss": 2.4406, + "step": 11113 + }, + { + "epoch": 0.8365983552569676, + "grad_norm": 5.105716705322266, + "learning_rate": 6.9415043218761136e-06, + "loss": 1.5811, + "step": 11114 + }, + { + "epoch": 0.8366736295376277, + "grad_norm": 5.13422966003418, + "learning_rate": 6.935308762076415e-06, + "loss": 1.6372, + "step": 11115 + }, + { + "epoch": 0.8367489038182879, + "grad_norm": 4.162962436676025, + "learning_rate": 6.9291157623490475e-06, + "loss": 1.6416, + "step": 11116 + }, + { + "epoch": 0.836824178098948, + "grad_norm": 4.233205795288086, + "learning_rate": 6.92292532306218e-06, + "loss": 1.915, + "step": 11117 + }, + { + "epoch": 0.8368994523796082, + "grad_norm": 4.262418746948242, + "learning_rate": 6.916737444583782e-06, + "loss": 1.5722, + "step": 11118 + }, + { + "epoch": 0.8369747266602684, + "grad_norm": 3.917956590652466, + "learning_rate": 6.9105521272817384e-06, + "loss": 2.0511, + "step": 11119 + }, + { + "epoch": 0.8370500009409285, + "grad_norm": 5.590387344360352, + "learning_rate": 6.9043693715237165e-06, + "loss": 1.7002, + "step": 11120 + }, + { + "epoch": 0.8371252752215886, + "grad_norm": 4.6854705810546875, + "learning_rate": 6.898189177677295e-06, + "loss": 1.7841, + "step": 11121 + }, + { + "epoch": 0.8372005495022489, + "grad_norm": 5.919164657592773, + "learning_rate": 6.8920115461098445e-06, + "loss": 1.6234, + "step": 11122 + }, + { + "epoch": 0.837275823782909, + "grad_norm": 4.469350814819336, + "learning_rate": 6.885836477188629e-06, + "loss": 1.7272, + "step": 11123 + }, + { + "epoch": 0.8373510980635691, + "grad_norm": 5.236168384552002, + "learning_rate": 6.879663971280709e-06, + "loss": 1.6134, + "step": 11124 + }, + { + "epoch": 0.8374263723442292, + "grad_norm": 4.39715576171875, + "learning_rate": 6.873494028753041e-06, + "loss": 1.7281, + "step": 11125 + }, + { + "epoch": 0.8375016466248895, + "grad_norm": 5.225582599639893, + "learning_rate": 6.8673266499724166e-06, + "loss": 1.7771, + "step": 11126 + }, + { + "epoch": 0.8375769209055496, + "grad_norm": 4.675426006317139, + "learning_rate": 6.861161835305452e-06, + "loss": 1.6815, + "step": 11127 + }, + { + "epoch": 0.8376521951862097, + "grad_norm": 4.589982032775879, + "learning_rate": 6.854999585118654e-06, + "loss": 1.4972, + "step": 11128 + }, + { + "epoch": 0.8377274694668699, + "grad_norm": 4.061214447021484, + "learning_rate": 6.8488398997783195e-06, + "loss": 1.3671, + "step": 11129 + }, + { + "epoch": 0.83780274374753, + "grad_norm": 4.747130870819092, + "learning_rate": 6.842682779650655e-06, + "loss": 1.83, + "step": 11130 + }, + { + "epoch": 0.8378780180281902, + "grad_norm": 4.630528450012207, + "learning_rate": 6.8365282251016515e-06, + "loss": 1.6915, + "step": 11131 + }, + { + "epoch": 0.8379532923088504, + "grad_norm": 3.776576280593872, + "learning_rate": 6.830376236497205e-06, + "loss": 1.7709, + "step": 11132 + }, + { + "epoch": 0.8380285665895105, + "grad_norm": 4.731005668640137, + "learning_rate": 6.824226814203017e-06, + "loss": 1.6608, + "step": 11133 + }, + { + "epoch": 0.8381038408701706, + "grad_norm": 4.95127010345459, + "learning_rate": 6.8180799585846665e-06, + "loss": 2.0121, + "step": 11134 + }, + { + "epoch": 0.8381791151508309, + "grad_norm": 4.485501766204834, + "learning_rate": 6.811935670007552e-06, + "loss": 1.7871, + "step": 11135 + }, + { + "epoch": 0.838254389431491, + "grad_norm": 4.852588653564453, + "learning_rate": 6.805793948836941e-06, + "loss": 1.819, + "step": 11136 + }, + { + "epoch": 0.8383296637121511, + "grad_norm": 5.005697250366211, + "learning_rate": 6.799654795437949e-06, + "loss": 1.5672, + "step": 11137 + }, + { + "epoch": 0.8384049379928113, + "grad_norm": 8.769268035888672, + "learning_rate": 6.793518210175515e-06, + "loss": 2.1549, + "step": 11138 + }, + { + "epoch": 0.8384802122734715, + "grad_norm": 4.214446544647217, + "learning_rate": 6.787384193414453e-06, + "loss": 1.782, + "step": 11139 + }, + { + "epoch": 0.8385554865541316, + "grad_norm": 4.881605625152588, + "learning_rate": 6.781252745519417e-06, + "loss": 1.8778, + "step": 11140 + }, + { + "epoch": 0.8386307608347918, + "grad_norm": 5.593880653381348, + "learning_rate": 6.775123866854888e-06, + "loss": 2.1665, + "step": 11141 + }, + { + "epoch": 0.8387060351154519, + "grad_norm": 5.857632637023926, + "learning_rate": 6.768997557785217e-06, + "loss": 1.9774, + "step": 11142 + }, + { + "epoch": 0.8387813093961121, + "grad_norm": 6.197014808654785, + "learning_rate": 6.762873818674609e-06, + "loss": 2.3342, + "step": 11143 + }, + { + "epoch": 0.8388565836767723, + "grad_norm": 4.749516487121582, + "learning_rate": 6.756752649887082e-06, + "loss": 1.6864, + "step": 11144 + }, + { + "epoch": 0.8389318579574324, + "grad_norm": 4.441079139709473, + "learning_rate": 6.750634051786542e-06, + "loss": 1.7264, + "step": 11145 + }, + { + "epoch": 0.8390071322380925, + "grad_norm": 5.646838665008545, + "learning_rate": 6.744518024736696e-06, + "loss": 2.0181, + "step": 11146 + }, + { + "epoch": 0.8390824065187527, + "grad_norm": 4.400927543640137, + "learning_rate": 6.738404569101153e-06, + "loss": 1.5453, + "step": 11147 + }, + { + "epoch": 0.8391576807994129, + "grad_norm": 4.460588455200195, + "learning_rate": 6.7322936852433184e-06, + "loss": 1.4863, + "step": 11148 + }, + { + "epoch": 0.839232955080073, + "grad_norm": 5.296993732452393, + "learning_rate": 6.726185373526473e-06, + "loss": 2.2783, + "step": 11149 + }, + { + "epoch": 0.8393082293607331, + "grad_norm": 5.389023303985596, + "learning_rate": 6.720079634313742e-06, + "loss": 1.4713, + "step": 11150 + }, + { + "epoch": 0.8393835036413934, + "grad_norm": 4.712506294250488, + "learning_rate": 6.713976467968103e-06, + "loss": 1.7241, + "step": 11151 + }, + { + "epoch": 0.8394587779220535, + "grad_norm": 5.460819244384766, + "learning_rate": 6.707875874852348e-06, + "loss": 1.6132, + "step": 11152 + }, + { + "epoch": 0.8395340522027136, + "grad_norm": 4.264761447906494, + "learning_rate": 6.701777855329167e-06, + "loss": 1.9214, + "step": 11153 + }, + { + "epoch": 0.8396093264833738, + "grad_norm": 4.11516809463501, + "learning_rate": 6.695682409761045e-06, + "loss": 2.101, + "step": 11154 + }, + { + "epoch": 0.839684600764034, + "grad_norm": 5.3792009353637695, + "learning_rate": 6.6895895385103605e-06, + "loss": 1.5466, + "step": 11155 + }, + { + "epoch": 0.8397598750446941, + "grad_norm": 7.123032569885254, + "learning_rate": 6.683499241939295e-06, + "loss": 1.7773, + "step": 11156 + }, + { + "epoch": 0.8398351493253543, + "grad_norm": 5.164830207824707, + "learning_rate": 6.67741152040991e-06, + "loss": 1.8228, + "step": 11157 + }, + { + "epoch": 0.8399104236060144, + "grad_norm": 4.144986629486084, + "learning_rate": 6.6713263742841145e-06, + "loss": 1.6297, + "step": 11158 + }, + { + "epoch": 0.8399856978866745, + "grad_norm": 4.575332164764404, + "learning_rate": 6.665243803923632e-06, + "loss": 1.7586, + "step": 11159 + }, + { + "epoch": 0.8400609721673348, + "grad_norm": 5.089232444763184, + "learning_rate": 6.659163809690067e-06, + "loss": 1.6265, + "step": 11160 + }, + { + "epoch": 0.8401362464479949, + "grad_norm": 4.514784812927246, + "learning_rate": 6.653086391944852e-06, + "loss": 1.7338, + "step": 11161 + }, + { + "epoch": 0.840211520728655, + "grad_norm": 4.259333610534668, + "learning_rate": 6.647011551049287e-06, + "loss": 1.4617, + "step": 11162 + }, + { + "epoch": 0.8402867950093152, + "grad_norm": 5.080699443817139, + "learning_rate": 6.640939287364478e-06, + "loss": 1.6577, + "step": 11163 + }, + { + "epoch": 0.8403620692899754, + "grad_norm": 5.670527458190918, + "learning_rate": 6.634869601251426e-06, + "loss": 2.3144, + "step": 11164 + }, + { + "epoch": 0.8404373435706355, + "grad_norm": 3.8688299655914307, + "learning_rate": 6.62880249307094e-06, + "loss": 1.7761, + "step": 11165 + }, + { + "epoch": 0.8405126178512956, + "grad_norm": 10.417957305908203, + "learning_rate": 6.622737963183712e-06, + "loss": 2.1792, + "step": 11166 + }, + { + "epoch": 0.8405878921319558, + "grad_norm": 4.5511393547058105, + "learning_rate": 6.6166760119502405e-06, + "loss": 1.6349, + "step": 11167 + }, + { + "epoch": 0.840663166412616, + "grad_norm": 4.164097309112549, + "learning_rate": 6.6106166397309045e-06, + "loss": 1.9282, + "step": 11168 + }, + { + "epoch": 0.8407384406932761, + "grad_norm": 3.953265428543091, + "learning_rate": 6.6045598468858996e-06, + "loss": 1.6833, + "step": 11169 + }, + { + "epoch": 0.8408137149739363, + "grad_norm": 3.1790060997009277, + "learning_rate": 6.598505633775304e-06, + "loss": 1.8434, + "step": 11170 + }, + { + "epoch": 0.8408889892545964, + "grad_norm": 4.558568000793457, + "learning_rate": 6.5924540007590076e-06, + "loss": 1.6772, + "step": 11171 + }, + { + "epoch": 0.8409642635352566, + "grad_norm": 5.258319854736328, + "learning_rate": 6.5864049481967834e-06, + "loss": 1.9443, + "step": 11172 + }, + { + "epoch": 0.8410395378159168, + "grad_norm": 4.417283058166504, + "learning_rate": 6.580358476448212e-06, + "loss": 1.6938, + "step": 11173 + }, + { + "epoch": 0.8411148120965769, + "grad_norm": 4.552297115325928, + "learning_rate": 6.574314585872738e-06, + "loss": 1.8327, + "step": 11174 + }, + { + "epoch": 0.841190086377237, + "grad_norm": 4.643150329589844, + "learning_rate": 6.5682732768296725e-06, + "loss": 1.7858, + "step": 11175 + }, + { + "epoch": 0.8412653606578973, + "grad_norm": 6.868575096130371, + "learning_rate": 6.5622345496781315e-06, + "loss": 1.7422, + "step": 11176 + }, + { + "epoch": 0.8413406349385574, + "grad_norm": 4.626029968261719, + "learning_rate": 6.556198404777119e-06, + "loss": 2.2154, + "step": 11177 + }, + { + "epoch": 0.8414159092192175, + "grad_norm": 6.260887622833252, + "learning_rate": 6.550164842485445e-06, + "loss": 1.6859, + "step": 11178 + }, + { + "epoch": 0.8414911834998777, + "grad_norm": 4.105422496795654, + "learning_rate": 6.544133863161811e-06, + "loss": 1.4085, + "step": 11179 + }, + { + "epoch": 0.8415664577805378, + "grad_norm": 4.183300495147705, + "learning_rate": 6.538105467164718e-06, + "loss": 1.9785, + "step": 11180 + }, + { + "epoch": 0.841641732061198, + "grad_norm": 4.764847755432129, + "learning_rate": 6.532079654852552e-06, + "loss": 1.8358, + "step": 11181 + }, + { + "epoch": 0.8417170063418582, + "grad_norm": 6.050087928771973, + "learning_rate": 6.526056426583526e-06, + "loss": 1.9587, + "step": 11182 + }, + { + "epoch": 0.8417922806225183, + "grad_norm": 4.971504211425781, + "learning_rate": 6.520035782715716e-06, + "loss": 1.8499, + "step": 11183 + }, + { + "epoch": 0.8418675549031784, + "grad_norm": 5.099761486053467, + "learning_rate": 6.514017723607007e-06, + "loss": 1.7452, + "step": 11184 + }, + { + "epoch": 0.8419428291838386, + "grad_norm": 4.524901390075684, + "learning_rate": 6.508002249615186e-06, + "loss": 1.9179, + "step": 11185 + }, + { + "epoch": 0.8420181034644988, + "grad_norm": 5.432777404785156, + "learning_rate": 6.501989361097821e-06, + "loss": 1.9629, + "step": 11186 + }, + { + "epoch": 0.8420933777451589, + "grad_norm": 4.498457431793213, + "learning_rate": 6.4959790584123934e-06, + "loss": 1.8279, + "step": 11187 + }, + { + "epoch": 0.842168652025819, + "grad_norm": 4.5996294021606445, + "learning_rate": 6.4899713419161735e-06, + "loss": 1.8645, + "step": 11188 + }, + { + "epoch": 0.8422439263064793, + "grad_norm": 5.522895812988281, + "learning_rate": 6.483966211966308e-06, + "loss": 2.0191, + "step": 11189 + }, + { + "epoch": 0.8423192005871394, + "grad_norm": 3.5606210231781006, + "learning_rate": 6.477963668919806e-06, + "loss": 2.0935, + "step": 11190 + }, + { + "epoch": 0.8423944748677995, + "grad_norm": 5.094019889831543, + "learning_rate": 6.471963713133472e-06, + "loss": 1.7191, + "step": 11191 + }, + { + "epoch": 0.8424697491484597, + "grad_norm": 4.548551559448242, + "learning_rate": 6.465966344964014e-06, + "loss": 1.4239, + "step": 11192 + }, + { + "epoch": 0.8425450234291199, + "grad_norm": 4.219467639923096, + "learning_rate": 6.459971564767925e-06, + "loss": 1.7072, + "step": 11193 + }, + { + "epoch": 0.84262029770978, + "grad_norm": 4.513721942901611, + "learning_rate": 6.453979372901603e-06, + "loss": 1.9566, + "step": 11194 + }, + { + "epoch": 0.8426955719904402, + "grad_norm": 4.069916248321533, + "learning_rate": 6.4479897697212574e-06, + "loss": 1.8304, + "step": 11195 + }, + { + "epoch": 0.8427708462711003, + "grad_norm": 4.504793167114258, + "learning_rate": 6.442002755582966e-06, + "loss": 1.9497, + "step": 11196 + }, + { + "epoch": 0.8428461205517604, + "grad_norm": 3.9722812175750732, + "learning_rate": 6.436018330842619e-06, + "loss": 1.8806, + "step": 11197 + }, + { + "epoch": 0.8429213948324207, + "grad_norm": 7.850261211395264, + "learning_rate": 6.43003649585599e-06, + "loss": 1.773, + "step": 11198 + }, + { + "epoch": 0.8429966691130808, + "grad_norm": 5.102206707000732, + "learning_rate": 6.424057250978671e-06, + "loss": 1.7999, + "step": 11199 + }, + { + "epoch": 0.8430719433937409, + "grad_norm": 6.055898189544678, + "learning_rate": 6.418080596566123e-06, + "loss": 1.8146, + "step": 11200 + }, + { + "epoch": 0.8431472176744012, + "grad_norm": 4.280010223388672, + "learning_rate": 6.4121065329736165e-06, + "loss": 1.8588, + "step": 11201 + }, + { + "epoch": 0.8432224919550613, + "grad_norm": 4.572033882141113, + "learning_rate": 6.406135060556329e-06, + "loss": 1.7214, + "step": 11202 + }, + { + "epoch": 0.8432977662357214, + "grad_norm": 4.822880268096924, + "learning_rate": 6.400166179669209e-06, + "loss": 1.9036, + "step": 11203 + }, + { + "epoch": 0.8433730405163815, + "grad_norm": 4.275100231170654, + "learning_rate": 6.394199890667113e-06, + "loss": 1.7458, + "step": 11204 + }, + { + "epoch": 0.8434483147970417, + "grad_norm": 5.10707950592041, + "learning_rate": 6.388236193904712e-06, + "loss": 2.3163, + "step": 11205 + }, + { + "epoch": 0.8435235890777019, + "grad_norm": 4.97098970413208, + "learning_rate": 6.382275089736534e-06, + "loss": 1.8306, + "step": 11206 + }, + { + "epoch": 0.843598863358362, + "grad_norm": 4.013286590576172, + "learning_rate": 6.376316578516955e-06, + "loss": 1.76, + "step": 11207 + }, + { + "epoch": 0.8436741376390222, + "grad_norm": 6.344820022583008, + "learning_rate": 6.370360660600178e-06, + "loss": 1.7801, + "step": 11208 + }, + { + "epoch": 0.8437494119196823, + "grad_norm": 4.477084636688232, + "learning_rate": 6.3644073363402836e-06, + "loss": 1.8177, + "step": 11209 + }, + { + "epoch": 0.8438246862003425, + "grad_norm": 5.514729976654053, + "learning_rate": 6.3584566060911565e-06, + "loss": 1.9845, + "step": 11210 + }, + { + "epoch": 0.8438999604810027, + "grad_norm": 4.488874435424805, + "learning_rate": 6.3525084702065754e-06, + "loss": 1.9079, + "step": 11211 + }, + { + "epoch": 0.8439752347616628, + "grad_norm": 5.180163383483887, + "learning_rate": 6.346562929040112e-06, + "loss": 2.2314, + "step": 11212 + }, + { + "epoch": 0.8440505090423229, + "grad_norm": 4.341010570526123, + "learning_rate": 6.340619982945239e-06, + "loss": 1.7962, + "step": 11213 + }, + { + "epoch": 0.8441257833229832, + "grad_norm": 6.545727252960205, + "learning_rate": 6.334679632275226e-06, + "loss": 1.8127, + "step": 11214 + }, + { + "epoch": 0.8442010576036433, + "grad_norm": 4.947411060333252, + "learning_rate": 6.328741877383221e-06, + "loss": 1.8007, + "step": 11215 + }, + { + "epoch": 0.8442763318843034, + "grad_norm": 5.583044052124023, + "learning_rate": 6.322806718622204e-06, + "loss": 1.8532, + "step": 11216 + }, + { + "epoch": 0.8443516061649636, + "grad_norm": 5.236429691314697, + "learning_rate": 6.3168741563450095e-06, + "loss": 1.5682, + "step": 11217 + }, + { + "epoch": 0.8444268804456238, + "grad_norm": 5.19541597366333, + "learning_rate": 6.3109441909043e-06, + "loss": 1.7171, + "step": 11218 + }, + { + "epoch": 0.8445021547262839, + "grad_norm": 6.942011833190918, + "learning_rate": 6.305016822652609e-06, + "loss": 1.7679, + "step": 11219 + }, + { + "epoch": 0.8445774290069441, + "grad_norm": 6.323000907897949, + "learning_rate": 6.299092051942279e-06, + "loss": 2.1287, + "step": 11220 + }, + { + "epoch": 0.8446527032876042, + "grad_norm": 6.352884292602539, + "learning_rate": 6.293169879125538e-06, + "loss": 2.1497, + "step": 11221 + }, + { + "epoch": 0.8447279775682643, + "grad_norm": 4.266534805297852, + "learning_rate": 6.287250304554448e-06, + "loss": 1.6091, + "step": 11222 + }, + { + "epoch": 0.8448032518489245, + "grad_norm": 3.6926400661468506, + "learning_rate": 6.281333328580896e-06, + "loss": 1.6438, + "step": 11223 + }, + { + "epoch": 0.8448785261295847, + "grad_norm": 6.845376491546631, + "learning_rate": 6.275418951556639e-06, + "loss": 1.5882, + "step": 11224 + }, + { + "epoch": 0.8449538004102448, + "grad_norm": 5.468923568725586, + "learning_rate": 6.2695071738332555e-06, + "loss": 1.8688, + "step": 11225 + }, + { + "epoch": 0.8450290746909049, + "grad_norm": 5.483686923980713, + "learning_rate": 6.263597995762199e-06, + "loss": 1.845, + "step": 11226 + }, + { + "epoch": 0.8451043489715652, + "grad_norm": 5.1428608894348145, + "learning_rate": 6.257691417694739e-06, + "loss": 1.86, + "step": 11227 + }, + { + "epoch": 0.8451796232522253, + "grad_norm": 4.612880229949951, + "learning_rate": 6.251787439982032e-06, + "loss": 1.8806, + "step": 11228 + }, + { + "epoch": 0.8452548975328854, + "grad_norm": 4.612621307373047, + "learning_rate": 6.245886062975021e-06, + "loss": 1.5457, + "step": 11229 + }, + { + "epoch": 0.8453301718135456, + "grad_norm": 3.6237878799438477, + "learning_rate": 6.239987287024546e-06, + "loss": 1.7081, + "step": 11230 + }, + { + "epoch": 0.8454054460942058, + "grad_norm": 5.440907955169678, + "learning_rate": 6.2340911124812605e-06, + "loss": 1.894, + "step": 11231 + }, + { + "epoch": 0.8454807203748659, + "grad_norm": 5.359150409698486, + "learning_rate": 6.228197539695685e-06, + "loss": 1.8042, + "step": 11232 + }, + { + "epoch": 0.8455559946555261, + "grad_norm": 3.9303054809570312, + "learning_rate": 6.222306569018166e-06, + "loss": 1.6954, + "step": 11233 + }, + { + "epoch": 0.8456312689361862, + "grad_norm": 4.7124223709106445, + "learning_rate": 6.2164182007989124e-06, + "loss": 1.9149, + "step": 11234 + }, + { + "epoch": 0.8457065432168464, + "grad_norm": 4.2395243644714355, + "learning_rate": 6.2105324353879615e-06, + "loss": 1.5313, + "step": 11235 + }, + { + "epoch": 0.8457818174975066, + "grad_norm": 4.236502647399902, + "learning_rate": 6.20464927313521e-06, + "loss": 1.8376, + "step": 11236 + }, + { + "epoch": 0.8458570917781667, + "grad_norm": 4.608453273773193, + "learning_rate": 6.198768714390396e-06, + "loss": 2.225, + "step": 11237 + }, + { + "epoch": 0.8459323660588268, + "grad_norm": 4.667847156524658, + "learning_rate": 6.192890759503117e-06, + "loss": 1.719, + "step": 11238 + }, + { + "epoch": 0.846007640339487, + "grad_norm": 4.957977771759033, + "learning_rate": 6.187015408822772e-06, + "loss": 1.6268, + "step": 11239 + }, + { + "epoch": 0.8460829146201472, + "grad_norm": 4.79469633102417, + "learning_rate": 6.181142662698647e-06, + "loss": 2.0776, + "step": 11240 + }, + { + "epoch": 0.8461581889008073, + "grad_norm": 5.145556449890137, + "learning_rate": 6.175272521479869e-06, + "loss": 1.946, + "step": 11241 + }, + { + "epoch": 0.8462334631814675, + "grad_norm": 5.155332088470459, + "learning_rate": 6.169404985515381e-06, + "loss": 1.7049, + "step": 11242 + }, + { + "epoch": 0.8463087374621276, + "grad_norm": 5.052038192749023, + "learning_rate": 6.163540055154016e-06, + "loss": 1.7565, + "step": 11243 + }, + { + "epoch": 0.8463840117427878, + "grad_norm": 4.200429916381836, + "learning_rate": 6.1576777307444046e-06, + "loss": 1.804, + "step": 11244 + }, + { + "epoch": 0.8464592860234479, + "grad_norm": 4.319472789764404, + "learning_rate": 6.151818012635064e-06, + "loss": 1.4788, + "step": 11245 + }, + { + "epoch": 0.8465345603041081, + "grad_norm": 6.487365245819092, + "learning_rate": 6.145960901174314e-06, + "loss": 1.7951, + "step": 11246 + }, + { + "epoch": 0.8466098345847682, + "grad_norm": 4.510237216949463, + "learning_rate": 6.140106396710371e-06, + "loss": 2.2786, + "step": 11247 + }, + { + "epoch": 0.8466851088654284, + "grad_norm": 6.785840034484863, + "learning_rate": 6.134254499591247e-06, + "loss": 2.1406, + "step": 11248 + }, + { + "epoch": 0.8467603831460886, + "grad_norm": 5.295591354370117, + "learning_rate": 6.128405210164823e-06, + "loss": 2.1391, + "step": 11249 + }, + { + "epoch": 0.8468356574267487, + "grad_norm": 4.7377238273620605, + "learning_rate": 6.122558528778826e-06, + "loss": 1.8891, + "step": 11250 + }, + { + "epoch": 0.8469109317074088, + "grad_norm": 5.227242469787598, + "learning_rate": 6.116714455780842e-06, + "loss": 1.6686, + "step": 11251 + }, + { + "epoch": 0.8469862059880691, + "grad_norm": 5.858652114868164, + "learning_rate": 6.110872991518251e-06, + "loss": 1.8096, + "step": 11252 + }, + { + "epoch": 0.8470614802687292, + "grad_norm": 4.820150852203369, + "learning_rate": 6.105034136338333e-06, + "loss": 1.7638, + "step": 11253 + }, + { + "epoch": 0.8471367545493893, + "grad_norm": 5.854217529296875, + "learning_rate": 6.099197890588199e-06, + "loss": 1.991, + "step": 11254 + }, + { + "epoch": 0.8472120288300495, + "grad_norm": 6.970722198486328, + "learning_rate": 6.093364254614775e-06, + "loss": 1.8455, + "step": 11255 + }, + { + "epoch": 0.8472873031107097, + "grad_norm": 5.369820594787598, + "learning_rate": 6.087533228764869e-06, + "loss": 1.7705, + "step": 11256 + }, + { + "epoch": 0.8473625773913698, + "grad_norm": 5.172910213470459, + "learning_rate": 6.081704813385109e-06, + "loss": 1.6951, + "step": 11257 + }, + { + "epoch": 0.84743785167203, + "grad_norm": 5.604563236236572, + "learning_rate": 6.0758790088219885e-06, + "loss": 2.1248, + "step": 11258 + }, + { + "epoch": 0.8475131259526901, + "grad_norm": 4.990391731262207, + "learning_rate": 6.070055815421816e-06, + "loss": 1.9498, + "step": 11259 + }, + { + "epoch": 0.8475884002333502, + "grad_norm": 5.144252777099609, + "learning_rate": 6.064235233530779e-06, + "loss": 1.5854, + "step": 11260 + }, + { + "epoch": 0.8476636745140105, + "grad_norm": 3.9531991481781006, + "learning_rate": 6.0584172634948925e-06, + "loss": 1.6825, + "step": 11261 + }, + { + "epoch": 0.8477389487946706, + "grad_norm": 7.152756690979004, + "learning_rate": 6.052601905660027e-06, + "loss": 1.9642, + "step": 11262 + }, + { + "epoch": 0.8478142230753307, + "grad_norm": 6.138336181640625, + "learning_rate": 6.0467891603718686e-06, + "loss": 1.5943, + "step": 11263 + }, + { + "epoch": 0.8478894973559908, + "grad_norm": 4.9320878982543945, + "learning_rate": 6.04097902797599e-06, + "loss": 1.5358, + "step": 11264 + }, + { + "epoch": 0.8479647716366511, + "grad_norm": 4.430438995361328, + "learning_rate": 6.035171508817766e-06, + "loss": 2.0484, + "step": 11265 + }, + { + "epoch": 0.8480400459173112, + "grad_norm": 4.702756881713867, + "learning_rate": 6.029366603242453e-06, + "loss": 1.8195, + "step": 11266 + }, + { + "epoch": 0.8481153201979713, + "grad_norm": 4.260861873626709, + "learning_rate": 6.02356431159512e-06, + "loss": 1.5999, + "step": 11267 + }, + { + "epoch": 0.8481905944786315, + "grad_norm": 4.201164722442627, + "learning_rate": 6.017764634220719e-06, + "loss": 1.9959, + "step": 11268 + }, + { + "epoch": 0.8482658687592917, + "grad_norm": 6.723023414611816, + "learning_rate": 6.011967571464e-06, + "loss": 2.0883, + "step": 11269 + }, + { + "epoch": 0.8483411430399518, + "grad_norm": 6.232792854309082, + "learning_rate": 6.00617312366959e-06, + "loss": 1.9358, + "step": 11270 + }, + { + "epoch": 0.848416417320612, + "grad_norm": 3.7897911071777344, + "learning_rate": 6.0003812911819615e-06, + "loss": 1.8006, + "step": 11271 + }, + { + "epoch": 0.8484916916012721, + "grad_norm": 6.220593452453613, + "learning_rate": 5.994592074345412e-06, + "loss": 2.0034, + "step": 11272 + }, + { + "epoch": 0.8485669658819323, + "grad_norm": 5.0158538818359375, + "learning_rate": 5.988805473504106e-06, + "loss": 2.0759, + "step": 11273 + }, + { + "epoch": 0.8486422401625925, + "grad_norm": 3.8611676692962646, + "learning_rate": 5.9830214890020274e-06, + "loss": 1.7533, + "step": 11274 + }, + { + "epoch": 0.8487175144432526, + "grad_norm": 4.1529860496521, + "learning_rate": 5.9772401211830306e-06, + "loss": 1.8748, + "step": 11275 + }, + { + "epoch": 0.8487927887239127, + "grad_norm": 5.566751956939697, + "learning_rate": 5.971461370390779e-06, + "loss": 1.7035, + "step": 11276 + }, + { + "epoch": 0.848868063004573, + "grad_norm": 5.445955753326416, + "learning_rate": 5.965685236968832e-06, + "loss": 2.0433, + "step": 11277 + }, + { + "epoch": 0.8489433372852331, + "grad_norm": 4.100853443145752, + "learning_rate": 5.959911721260541e-06, + "loss": 1.6811, + "step": 11278 + }, + { + "epoch": 0.8490186115658932, + "grad_norm": 4.144077301025391, + "learning_rate": 5.954140823609139e-06, + "loss": 1.5215, + "step": 11279 + }, + { + "epoch": 0.8490938858465534, + "grad_norm": 6.304539203643799, + "learning_rate": 5.948372544357672e-06, + "loss": 1.8958, + "step": 11280 + }, + { + "epoch": 0.8491691601272136, + "grad_norm": 4.566933631896973, + "learning_rate": 5.942606883849061e-06, + "loss": 1.8817, + "step": 11281 + }, + { + "epoch": 0.8492444344078737, + "grad_norm": 5.39998722076416, + "learning_rate": 5.936843842426054e-06, + "loss": 1.9203, + "step": 11282 + }, + { + "epoch": 0.8493197086885338, + "grad_norm": 5.680767059326172, + "learning_rate": 5.93108342043126e-06, + "loss": 1.9378, + "step": 11283 + }, + { + "epoch": 0.849394982969194, + "grad_norm": 4.6040730476379395, + "learning_rate": 5.925325618207101e-06, + "loss": 1.681, + "step": 11284 + }, + { + "epoch": 0.8494702572498541, + "grad_norm": 6.809340000152588, + "learning_rate": 5.919570436095878e-06, + "loss": 2.0843, + "step": 11285 + }, + { + "epoch": 0.8495455315305143, + "grad_norm": 8.677302360534668, + "learning_rate": 5.9138178744397036e-06, + "loss": 1.7938, + "step": 11286 + }, + { + "epoch": 0.8496208058111745, + "grad_norm": 4.486725807189941, + "learning_rate": 5.908067933580558e-06, + "loss": 1.645, + "step": 11287 + }, + { + "epoch": 0.8496960800918346, + "grad_norm": 5.2654266357421875, + "learning_rate": 5.902320613860268e-06, + "loss": 1.6082, + "step": 11288 + }, + { + "epoch": 0.8497713543724947, + "grad_norm": 5.1449198722839355, + "learning_rate": 5.896575915620478e-06, + "loss": 1.4999, + "step": 11289 + }, + { + "epoch": 0.849846628653155, + "grad_norm": 7.149578094482422, + "learning_rate": 5.890833839202714e-06, + "loss": 1.6823, + "step": 11290 + }, + { + "epoch": 0.8499219029338151, + "grad_norm": 4.404765605926514, + "learning_rate": 5.885094384948303e-06, + "loss": 2.0962, + "step": 11291 + }, + { + "epoch": 0.8499971772144752, + "grad_norm": 5.636532306671143, + "learning_rate": 5.879357553198461e-06, + "loss": 1.9787, + "step": 11292 + }, + { + "epoch": 0.8500724514951354, + "grad_norm": 4.277993679046631, + "learning_rate": 5.87362334429421e-06, + "loss": 1.9387, + "step": 11293 + }, + { + "epoch": 0.8501477257757956, + "grad_norm": 5.32755184173584, + "learning_rate": 5.867891758576432e-06, + "loss": 1.5929, + "step": 11294 + }, + { + "epoch": 0.8502230000564557, + "grad_norm": 8.300172805786133, + "learning_rate": 5.862162796385867e-06, + "loss": 1.6336, + "step": 11295 + }, + { + "epoch": 0.8502982743371159, + "grad_norm": 4.200565814971924, + "learning_rate": 5.856436458063086e-06, + "loss": 1.9825, + "step": 11296 + }, + { + "epoch": 0.850373548617776, + "grad_norm": 6.907848358154297, + "learning_rate": 5.850712743948483e-06, + "loss": 1.7795, + "step": 11297 + }, + { + "epoch": 0.8504488228984362, + "grad_norm": 5.160358905792236, + "learning_rate": 5.844991654382337e-06, + "loss": 1.7414, + "step": 11298 + }, + { + "epoch": 0.8505240971790964, + "grad_norm": 4.649304389953613, + "learning_rate": 5.839273189704736e-06, + "loss": 2.4212, + "step": 11299 + }, + { + "epoch": 0.8505993714597565, + "grad_norm": 6.801667213439941, + "learning_rate": 5.833557350255642e-06, + "loss": 1.4508, + "step": 11300 + }, + { + "epoch": 0.8506746457404166, + "grad_norm": 4.885019779205322, + "learning_rate": 5.827844136374827e-06, + "loss": 1.8312, + "step": 11301 + }, + { + "epoch": 0.8507499200210767, + "grad_norm": 4.672741413116455, + "learning_rate": 5.822133548401931e-06, + "loss": 2.0063, + "step": 11302 + }, + { + "epoch": 0.850825194301737, + "grad_norm": 6.103072643280029, + "learning_rate": 5.816425586676444e-06, + "loss": 1.8144, + "step": 11303 + }, + { + "epoch": 0.8509004685823971, + "grad_norm": 3.7380785942077637, + "learning_rate": 5.810720251537671e-06, + "loss": 1.5957, + "step": 11304 + }, + { + "epoch": 0.8509757428630572, + "grad_norm": 5.434122562408447, + "learning_rate": 5.8050175433247864e-06, + "loss": 1.7337, + "step": 11305 + }, + { + "epoch": 0.8510510171437174, + "grad_norm": 5.055976867675781, + "learning_rate": 5.799317462376802e-06, + "loss": 1.5506, + "step": 11306 + }, + { + "epoch": 0.8511262914243776, + "grad_norm": 5.650340557098389, + "learning_rate": 5.793620009032574e-06, + "loss": 2.1537, + "step": 11307 + }, + { + "epoch": 0.8512015657050377, + "grad_norm": 4.495242118835449, + "learning_rate": 5.787925183630788e-06, + "loss": 1.9838, + "step": 11308 + }, + { + "epoch": 0.8512768399856979, + "grad_norm": 5.4922990798950195, + "learning_rate": 5.782232986509995e-06, + "loss": 1.8259, + "step": 11309 + }, + { + "epoch": 0.851352114266358, + "grad_norm": 4.520008087158203, + "learning_rate": 5.776543418008573e-06, + "loss": 1.9088, + "step": 11310 + }, + { + "epoch": 0.8514273885470182, + "grad_norm": 5.4600629806518555, + "learning_rate": 5.770856478464759e-06, + "loss": 1.7309, + "step": 11311 + }, + { + "epoch": 0.8515026628276784, + "grad_norm": 6.564732551574707, + "learning_rate": 5.765172168216609e-06, + "loss": 1.6091, + "step": 11312 + }, + { + "epoch": 0.8515779371083385, + "grad_norm": 3.8732523918151855, + "learning_rate": 5.759490487602065e-06, + "loss": 2.0157, + "step": 11313 + }, + { + "epoch": 0.8516532113889986, + "grad_norm": 4.641762733459473, + "learning_rate": 5.753811436958856e-06, + "loss": 2.1006, + "step": 11314 + }, + { + "epoch": 0.8517284856696589, + "grad_norm": 4.557394504547119, + "learning_rate": 5.7481350166246004e-06, + "loss": 1.7528, + "step": 11315 + }, + { + "epoch": 0.851803759950319, + "grad_norm": 5.444219589233398, + "learning_rate": 5.742461226936746e-06, + "loss": 1.8976, + "step": 11316 + }, + { + "epoch": 0.8518790342309791, + "grad_norm": 6.359411716461182, + "learning_rate": 5.736790068232595e-06, + "loss": 1.7496, + "step": 11317 + }, + { + "epoch": 0.8519543085116393, + "grad_norm": 4.254822731018066, + "learning_rate": 5.7311215408492605e-06, + "loss": 2.0206, + "step": 11318 + }, + { + "epoch": 0.8520295827922995, + "grad_norm": 4.232534885406494, + "learning_rate": 5.725455645123723e-06, + "loss": 1.8079, + "step": 11319 + }, + { + "epoch": 0.8521048570729596, + "grad_norm": 5.990346908569336, + "learning_rate": 5.719792381392824e-06, + "loss": 1.6694, + "step": 11320 + }, + { + "epoch": 0.8521801313536198, + "grad_norm": 4.427980422973633, + "learning_rate": 5.714131749993201e-06, + "loss": 2.0511, + "step": 11321 + }, + { + "epoch": 0.8522554056342799, + "grad_norm": 3.8947792053222656, + "learning_rate": 5.708473751261384e-06, + "loss": 1.7736, + "step": 11322 + }, + { + "epoch": 0.85233067991494, + "grad_norm": 4.89677619934082, + "learning_rate": 5.7028183855337095e-06, + "loss": 1.6667, + "step": 11323 + }, + { + "epoch": 0.8524059541956002, + "grad_norm": 4.721692085266113, + "learning_rate": 5.697165653146386e-06, + "loss": 1.4774, + "step": 11324 + }, + { + "epoch": 0.8524812284762604, + "grad_norm": 4.659456253051758, + "learning_rate": 5.691515554435439e-06, + "loss": 2.1724, + "step": 11325 + }, + { + "epoch": 0.8525565027569205, + "grad_norm": 4.3916425704956055, + "learning_rate": 5.685868089736757e-06, + "loss": 1.8265, + "step": 11326 + }, + { + "epoch": 0.8526317770375806, + "grad_norm": 4.282839298248291, + "learning_rate": 5.680223259386064e-06, + "loss": 1.5446, + "step": 11327 + }, + { + "epoch": 0.8527070513182409, + "grad_norm": 5.252005577087402, + "learning_rate": 5.6745810637189364e-06, + "loss": 1.9379, + "step": 11328 + }, + { + "epoch": 0.852782325598901, + "grad_norm": 6.562023162841797, + "learning_rate": 5.668941503070773e-06, + "loss": 1.5385, + "step": 11329 + }, + { + "epoch": 0.8528575998795611, + "grad_norm": 5.468602657318115, + "learning_rate": 5.663304577776851e-06, + "loss": 1.9455, + "step": 11330 + }, + { + "epoch": 0.8529328741602213, + "grad_norm": 4.0843825340271, + "learning_rate": 5.657670288172246e-06, + "loss": 1.9929, + "step": 11331 + }, + { + "epoch": 0.8530081484408815, + "grad_norm": 5.78109884262085, + "learning_rate": 5.652038634591916e-06, + "loss": 2.1174, + "step": 11332 + }, + { + "epoch": 0.8530834227215416, + "grad_norm": 4.657322406768799, + "learning_rate": 5.64640961737063e-06, + "loss": 1.6437, + "step": 11333 + }, + { + "epoch": 0.8531586970022018, + "grad_norm": 4.304448127746582, + "learning_rate": 5.640783236843028e-06, + "loss": 1.8518, + "step": 11334 + }, + { + "epoch": 0.8532339712828619, + "grad_norm": 4.677581787109375, + "learning_rate": 5.635159493343595e-06, + "loss": 1.711, + "step": 11335 + }, + { + "epoch": 0.8533092455635221, + "grad_norm": 4.7286882400512695, + "learning_rate": 5.629538387206617e-06, + "loss": 2.1629, + "step": 11336 + }, + { + "epoch": 0.8533845198441823, + "grad_norm": 5.954164505004883, + "learning_rate": 5.6239199187662735e-06, + "loss": 1.8109, + "step": 11337 + }, + { + "epoch": 0.8534597941248424, + "grad_norm": 6.394717216491699, + "learning_rate": 5.618304088356563e-06, + "loss": 1.7403, + "step": 11338 + }, + { + "epoch": 0.8535350684055025, + "grad_norm": 5.252198219299316, + "learning_rate": 5.612690896311334e-06, + "loss": 1.7397, + "step": 11339 + }, + { + "epoch": 0.8536103426861628, + "grad_norm": 4.424313068389893, + "learning_rate": 5.607080342964266e-06, + "loss": 1.6129, + "step": 11340 + }, + { + "epoch": 0.8536856169668229, + "grad_norm": 6.685618877410889, + "learning_rate": 5.601472428648902e-06, + "loss": 1.8551, + "step": 11341 + }, + { + "epoch": 0.853760891247483, + "grad_norm": 4.491438865661621, + "learning_rate": 5.595867153698597e-06, + "loss": 1.9619, + "step": 11342 + }, + { + "epoch": 0.8538361655281431, + "grad_norm": 4.7786407470703125, + "learning_rate": 5.590264518446586e-06, + "loss": 2.4309, + "step": 11343 + }, + { + "epoch": 0.8539114398088034, + "grad_norm": 5.806628704071045, + "learning_rate": 5.584664523225918e-06, + "loss": 2.0155, + "step": 11344 + }, + { + "epoch": 0.8539867140894635, + "grad_norm": 4.507737159729004, + "learning_rate": 5.579067168369511e-06, + "loss": 1.7316, + "step": 11345 + }, + { + "epoch": 0.8540619883701236, + "grad_norm": 4.34588623046875, + "learning_rate": 5.573472454210099e-06, + "loss": 1.8259, + "step": 11346 + }, + { + "epoch": 0.8541372626507838, + "grad_norm": 4.247511863708496, + "learning_rate": 5.567880381080276e-06, + "loss": 1.7818, + "step": 11347 + }, + { + "epoch": 0.8542125369314439, + "grad_norm": 4.825319766998291, + "learning_rate": 5.562290949312471e-06, + "loss": 1.6259, + "step": 11348 + }, + { + "epoch": 0.8542878112121041, + "grad_norm": 5.244551181793213, + "learning_rate": 5.5567041592389624e-06, + "loss": 1.9471, + "step": 11349 + }, + { + "epoch": 0.8543630854927643, + "grad_norm": 5.681702613830566, + "learning_rate": 5.551120011191868e-06, + "loss": 2.111, + "step": 11350 + }, + { + "epoch": 0.8544383597734244, + "grad_norm": 5.542675495147705, + "learning_rate": 5.54553850550315e-06, + "loss": 1.5149, + "step": 11351 + }, + { + "epoch": 0.8545136340540845, + "grad_norm": 8.099947929382324, + "learning_rate": 5.539959642504622e-06, + "loss": 2.1872, + "step": 11352 + }, + { + "epoch": 0.8545889083347448, + "grad_norm": 4.147285461425781, + "learning_rate": 5.534383422527917e-06, + "loss": 1.7073, + "step": 11353 + }, + { + "epoch": 0.8546641826154049, + "grad_norm": 4.144649982452393, + "learning_rate": 5.528809845904537e-06, + "loss": 1.6317, + "step": 11354 + }, + { + "epoch": 0.854739456896065, + "grad_norm": 4.827475070953369, + "learning_rate": 5.523238912965806e-06, + "loss": 1.8281, + "step": 11355 + }, + { + "epoch": 0.8548147311767252, + "grad_norm": 4.3208723068237305, + "learning_rate": 5.517670624042909e-06, + "loss": 1.7259, + "step": 11356 + }, + { + "epoch": 0.8548900054573854, + "grad_norm": 4.612565994262695, + "learning_rate": 5.512104979466848e-06, + "loss": 1.516, + "step": 11357 + }, + { + "epoch": 0.8549652797380455, + "grad_norm": 4.594831466674805, + "learning_rate": 5.506541979568508e-06, + "loss": 1.946, + "step": 11358 + }, + { + "epoch": 0.8550405540187057, + "grad_norm": 4.143504619598389, + "learning_rate": 5.50098162467857e-06, + "loss": 1.9511, + "step": 11359 + }, + { + "epoch": 0.8551158282993658, + "grad_norm": 4.555916786193848, + "learning_rate": 5.495423915127596e-06, + "loss": 2.1596, + "step": 11360 + }, + { + "epoch": 0.855191102580026, + "grad_norm": 4.5442023277282715, + "learning_rate": 5.489868851245966e-06, + "loss": 1.8068, + "step": 11361 + }, + { + "epoch": 0.8552663768606861, + "grad_norm": 5.009327411651611, + "learning_rate": 5.4843164333639344e-06, + "loss": 1.8077, + "step": 11362 + }, + { + "epoch": 0.8553416511413463, + "grad_norm": 4.815648555755615, + "learning_rate": 5.478766661811552e-06, + "loss": 1.7917, + "step": 11363 + }, + { + "epoch": 0.8554169254220064, + "grad_norm": 5.050643444061279, + "learning_rate": 5.47321953691875e-06, + "loss": 1.7534, + "step": 11364 + }, + { + "epoch": 0.8554921997026665, + "grad_norm": 6.382200241088867, + "learning_rate": 5.467675059015276e-06, + "loss": 1.7799, + "step": 11365 + }, + { + "epoch": 0.8555674739833268, + "grad_norm": 3.927781343460083, + "learning_rate": 5.462133228430749e-06, + "loss": 1.6054, + "step": 11366 + }, + { + "epoch": 0.8556427482639869, + "grad_norm": 3.700148344039917, + "learning_rate": 5.456594045494612e-06, + "loss": 1.6804, + "step": 11367 + }, + { + "epoch": 0.855718022544647, + "grad_norm": 4.513267517089844, + "learning_rate": 5.451057510536139e-06, + "loss": 1.8871, + "step": 11368 + }, + { + "epoch": 0.8557932968253072, + "grad_norm": 4.847709655761719, + "learning_rate": 5.445523623884485e-06, + "loss": 2.0879, + "step": 11369 + }, + { + "epoch": 0.8558685711059674, + "grad_norm": 5.425947189331055, + "learning_rate": 5.439992385868598e-06, + "loss": 1.7721, + "step": 11370 + }, + { + "epoch": 0.8559438453866275, + "grad_norm": 4.6961565017700195, + "learning_rate": 5.434463796817307e-06, + "loss": 1.4921, + "step": 11371 + }, + { + "epoch": 0.8560191196672877, + "grad_norm": 6.164254188537598, + "learning_rate": 5.428937857059274e-06, + "loss": 1.6734, + "step": 11372 + }, + { + "epoch": 0.8560943939479478, + "grad_norm": 4.206858158111572, + "learning_rate": 5.423414566923002e-06, + "loss": 1.8606, + "step": 11373 + }, + { + "epoch": 0.856169668228608, + "grad_norm": 4.629970550537109, + "learning_rate": 5.4178939267368226e-06, + "loss": 1.6114, + "step": 11374 + }, + { + "epoch": 0.8562449425092682, + "grad_norm": 4.035036087036133, + "learning_rate": 5.4123759368289336e-06, + "loss": 2.0318, + "step": 11375 + }, + { + "epoch": 0.8563202167899283, + "grad_norm": 5.874871730804443, + "learning_rate": 5.4068605975273525e-06, + "loss": 1.8604, + "step": 11376 + }, + { + "epoch": 0.8563954910705884, + "grad_norm": 5.744148254394531, + "learning_rate": 5.401347909159971e-06, + "loss": 1.9988, + "step": 11377 + }, + { + "epoch": 0.8564707653512487, + "grad_norm": 4.31630802154541, + "learning_rate": 5.395837872054471e-06, + "loss": 1.7285, + "step": 11378 + }, + { + "epoch": 0.8565460396319088, + "grad_norm": 5.325786590576172, + "learning_rate": 5.390330486538442e-06, + "loss": 1.5601, + "step": 11379 + }, + { + "epoch": 0.8566213139125689, + "grad_norm": 6.213504791259766, + "learning_rate": 5.384825752939254e-06, + "loss": 2.3901, + "step": 11380 + }, + { + "epoch": 0.856696588193229, + "grad_norm": 4.757960796356201, + "learning_rate": 5.3793236715841574e-06, + "loss": 1.8566, + "step": 11381 + }, + { + "epoch": 0.8567718624738893, + "grad_norm": 3.7822248935699463, + "learning_rate": 5.373824242800241e-06, + "loss": 1.8802, + "step": 11382 + }, + { + "epoch": 0.8568471367545494, + "grad_norm": 4.352264404296875, + "learning_rate": 5.368327466914425e-06, + "loss": 1.7107, + "step": 11383 + }, + { + "epoch": 0.8569224110352095, + "grad_norm": 4.954226016998291, + "learning_rate": 5.362833344253487e-06, + "loss": 1.6309, + "step": 11384 + }, + { + "epoch": 0.8569976853158697, + "grad_norm": 5.501974105834961, + "learning_rate": 5.3573418751440175e-06, + "loss": 1.8756, + "step": 11385 + }, + { + "epoch": 0.8570729595965298, + "grad_norm": 5.184263229370117, + "learning_rate": 5.351853059912493e-06, + "loss": 2.1411, + "step": 11386 + }, + { + "epoch": 0.85714823387719, + "grad_norm": 5.575963973999023, + "learning_rate": 5.34636689888518e-06, + "loss": 1.8215, + "step": 11387 + }, + { + "epoch": 0.8572235081578502, + "grad_norm": 5.021780014038086, + "learning_rate": 5.340883392388246e-06, + "loss": 1.6331, + "step": 11388 + }, + { + "epoch": 0.8572987824385103, + "grad_norm": 4.778122425079346, + "learning_rate": 5.335402540747636e-06, + "loss": 1.928, + "step": 11389 + }, + { + "epoch": 0.8573740567191704, + "grad_norm": 5.680552959442139, + "learning_rate": 5.3299243442892e-06, + "loss": 1.6547, + "step": 11390 + }, + { + "epoch": 0.8574493309998307, + "grad_norm": 5.697072982788086, + "learning_rate": 5.324448803338577e-06, + "loss": 2.1053, + "step": 11391 + }, + { + "epoch": 0.8575246052804908, + "grad_norm": 4.456849575042725, + "learning_rate": 5.318975918221292e-06, + "loss": 1.4087, + "step": 11392 + }, + { + "epoch": 0.8575998795611509, + "grad_norm": 4.067120552062988, + "learning_rate": 5.313505689262688e-06, + "loss": 1.7939, + "step": 11393 + }, + { + "epoch": 0.8576751538418111, + "grad_norm": 5.4717912673950195, + "learning_rate": 5.308038116787939e-06, + "loss": 1.3898, + "step": 11394 + }, + { + "epoch": 0.8577504281224713, + "grad_norm": 5.063018321990967, + "learning_rate": 5.302573201122091e-06, + "loss": 1.7387, + "step": 11395 + }, + { + "epoch": 0.8578257024031314, + "grad_norm": 5.424300193786621, + "learning_rate": 5.297110942590028e-06, + "loss": 1.8743, + "step": 11396 + }, + { + "epoch": 0.8579009766837916, + "grad_norm": 4.340017795562744, + "learning_rate": 5.29165134151644e-06, + "loss": 2.3701, + "step": 11397 + }, + { + "epoch": 0.8579762509644517, + "grad_norm": 4.655369758605957, + "learning_rate": 5.2861943982259025e-06, + "loss": 1.802, + "step": 11398 + }, + { + "epoch": 0.8580515252451119, + "grad_norm": 7.578344821929932, + "learning_rate": 5.280740113042803e-06, + "loss": 1.8941, + "step": 11399 + }, + { + "epoch": 0.858126799525772, + "grad_norm": 7.623256683349609, + "learning_rate": 5.275288486291391e-06, + "loss": 2.1034, + "step": 11400 + }, + { + "epoch": 0.8582020738064322, + "grad_norm": 5.581150054931641, + "learning_rate": 5.2698395182957605e-06, + "loss": 2.0165, + "step": 11401 + }, + { + "epoch": 0.8582773480870923, + "grad_norm": 5.090108394622803, + "learning_rate": 5.264393209379814e-06, + "loss": 1.62, + "step": 11402 + }, + { + "epoch": 0.8583526223677524, + "grad_norm": 5.204704761505127, + "learning_rate": 5.258949559867338e-06, + "loss": 1.8108, + "step": 11403 + }, + { + "epoch": 0.8584278966484127, + "grad_norm": 4.71949577331543, + "learning_rate": 5.253508570081928e-06, + "loss": 1.6139, + "step": 11404 + }, + { + "epoch": 0.8585031709290728, + "grad_norm": 4.059711456298828, + "learning_rate": 5.2480702403470415e-06, + "loss": 1.5545, + "step": 11405 + }, + { + "epoch": 0.8585784452097329, + "grad_norm": 6.447992324829102, + "learning_rate": 5.242634570985966e-06, + "loss": 1.8447, + "step": 11406 + }, + { + "epoch": 0.8586537194903932, + "grad_norm": 3.8485782146453857, + "learning_rate": 5.237201562321858e-06, + "loss": 1.9373, + "step": 11407 + }, + { + "epoch": 0.8587289937710533, + "grad_norm": 4.185934066772461, + "learning_rate": 5.231771214677667e-06, + "loss": 1.8701, + "step": 11408 + }, + { + "epoch": 0.8588042680517134, + "grad_norm": 4.902507305145264, + "learning_rate": 5.226343528376226e-06, + "loss": 1.6618, + "step": 11409 + }, + { + "epoch": 0.8588795423323736, + "grad_norm": 5.240971565246582, + "learning_rate": 5.220918503740191e-06, + "loss": 1.8494, + "step": 11410 + }, + { + "epoch": 0.8589548166130337, + "grad_norm": 5.346987724304199, + "learning_rate": 5.215496141092075e-06, + "loss": 1.7486, + "step": 11411 + }, + { + "epoch": 0.8590300908936939, + "grad_norm": 3.6412837505340576, + "learning_rate": 5.210076440754197e-06, + "loss": 1.8034, + "step": 11412 + }, + { + "epoch": 0.8591053651743541, + "grad_norm": 4.350774765014648, + "learning_rate": 5.204659403048767e-06, + "loss": 2.0478, + "step": 11413 + }, + { + "epoch": 0.8591806394550142, + "grad_norm": 4.902470111846924, + "learning_rate": 5.1992450282978e-06, + "loss": 1.5889, + "step": 11414 + }, + { + "epoch": 0.8592559137356743, + "grad_norm": 6.072124481201172, + "learning_rate": 5.1938333168231656e-06, + "loss": 2.1175, + "step": 11415 + }, + { + "epoch": 0.8593311880163346, + "grad_norm": 6.366473197937012, + "learning_rate": 5.188424268946573e-06, + "loss": 1.83, + "step": 11416 + }, + { + "epoch": 0.8594064622969947, + "grad_norm": 5.194305896759033, + "learning_rate": 5.183017884989583e-06, + "loss": 1.8335, + "step": 11417 + }, + { + "epoch": 0.8594817365776548, + "grad_norm": 5.40255069732666, + "learning_rate": 5.177614165273597e-06, + "loss": 1.6381, + "step": 11418 + }, + { + "epoch": 0.859557010858315, + "grad_norm": 4.915066719055176, + "learning_rate": 5.172213110119823e-06, + "loss": 1.8786, + "step": 11419 + }, + { + "epoch": 0.8596322851389752, + "grad_norm": 5.29539680480957, + "learning_rate": 5.166814719849372e-06, + "loss": 1.9521, + "step": 11420 + }, + { + "epoch": 0.8597075594196353, + "grad_norm": 5.285453796386719, + "learning_rate": 5.161418994783129e-06, + "loss": 1.9565, + "step": 11421 + }, + { + "epoch": 0.8597828337002954, + "grad_norm": 3.980003595352173, + "learning_rate": 5.156025935241881e-06, + "loss": 1.9911, + "step": 11422 + }, + { + "epoch": 0.8598581079809556, + "grad_norm": 4.498735427856445, + "learning_rate": 5.150635541546211e-06, + "loss": 1.8753, + "step": 11423 + }, + { + "epoch": 0.8599333822616158, + "grad_norm": 7.84881067276001, + "learning_rate": 5.145247814016579e-06, + "loss": 1.8622, + "step": 11424 + }, + { + "epoch": 0.8600086565422759, + "grad_norm": 3.8739640712738037, + "learning_rate": 5.139862752973257e-06, + "loss": 1.755, + "step": 11425 + }, + { + "epoch": 0.8600839308229361, + "grad_norm": 5.518463134765625, + "learning_rate": 5.134480358736371e-06, + "loss": 1.9369, + "step": 11426 + }, + { + "epoch": 0.8601592051035962, + "grad_norm": 5.194349765777588, + "learning_rate": 5.129100631625894e-06, + "loss": 1.683, + "step": 11427 + }, + { + "epoch": 0.8602344793842563, + "grad_norm": 4.4260454177856445, + "learning_rate": 5.123723571961647e-06, + "loss": 2.4653, + "step": 11428 + }, + { + "epoch": 0.8603097536649166, + "grad_norm": 4.79770040512085, + "learning_rate": 5.118349180063259e-06, + "loss": 2.081, + "step": 11429 + }, + { + "epoch": 0.8603850279455767, + "grad_norm": 7.826600551605225, + "learning_rate": 5.112977456250245e-06, + "loss": 1.9509, + "step": 11430 + }, + { + "epoch": 0.8604603022262368, + "grad_norm": 7.9065704345703125, + "learning_rate": 5.107608400841913e-06, + "loss": 1.742, + "step": 11431 + }, + { + "epoch": 0.860535576506897, + "grad_norm": 5.645403861999512, + "learning_rate": 5.102242014157449e-06, + "loss": 1.8479, + "step": 11432 + }, + { + "epoch": 0.8606108507875572, + "grad_norm": 5.59318208694458, + "learning_rate": 5.096878296515883e-06, + "loss": 1.784, + "step": 11433 + }, + { + "epoch": 0.8606861250682173, + "grad_norm": 7.79551887512207, + "learning_rate": 5.0915172482360556e-06, + "loss": 1.5384, + "step": 11434 + }, + { + "epoch": 0.8607613993488775, + "grad_norm": 4.59033727645874, + "learning_rate": 5.086158869636676e-06, + "loss": 1.9118, + "step": 11435 + }, + { + "epoch": 0.8608366736295376, + "grad_norm": 5.209449291229248, + "learning_rate": 5.080803161036268e-06, + "loss": 1.8927, + "step": 11436 + }, + { + "epoch": 0.8609119479101978, + "grad_norm": 5.686946868896484, + "learning_rate": 5.075450122753228e-06, + "loss": 1.931, + "step": 11437 + }, + { + "epoch": 0.860987222190858, + "grad_norm": 4.581290245056152, + "learning_rate": 5.070099755105772e-06, + "loss": 1.5945, + "step": 11438 + }, + { + "epoch": 0.8610624964715181, + "grad_norm": 7.103540420532227, + "learning_rate": 5.064752058411975e-06, + "loss": 1.6487, + "step": 11439 + }, + { + "epoch": 0.8611377707521782, + "grad_norm": 5.881924152374268, + "learning_rate": 5.059407032989733e-06, + "loss": 1.9384, + "step": 11440 + }, + { + "epoch": 0.8612130450328384, + "grad_norm": 3.439589500427246, + "learning_rate": 5.054064679156795e-06, + "loss": 1.9732, + "step": 11441 + }, + { + "epoch": 0.8612883193134986, + "grad_norm": 5.381748199462891, + "learning_rate": 5.0487249972307415e-06, + "loss": 1.6726, + "step": 11442 + }, + { + "epoch": 0.8613635935941587, + "grad_norm": 4.571863651275635, + "learning_rate": 5.043387987529019e-06, + "loss": 2.0467, + "step": 11443 + }, + { + "epoch": 0.8614388678748188, + "grad_norm": 4.915619373321533, + "learning_rate": 5.038053650368874e-06, + "loss": 1.9863, + "step": 11444 + }, + { + "epoch": 0.8615141421554791, + "grad_norm": 6.579681396484375, + "learning_rate": 5.03272198606744e-06, + "loss": 2.2532, + "step": 11445 + }, + { + "epoch": 0.8615894164361392, + "grad_norm": 4.242094039916992, + "learning_rate": 5.027392994941643e-06, + "loss": 1.7283, + "step": 11446 + }, + { + "epoch": 0.8616646907167993, + "grad_norm": 6.660259246826172, + "learning_rate": 5.022066677308296e-06, + "loss": 2.2605, + "step": 11447 + }, + { + "epoch": 0.8617399649974595, + "grad_norm": 5.98316764831543, + "learning_rate": 5.016743033484039e-06, + "loss": 1.6845, + "step": 11448 + }, + { + "epoch": 0.8618152392781196, + "grad_norm": 4.509366512298584, + "learning_rate": 5.011422063785332e-06, + "loss": 1.8656, + "step": 11449 + }, + { + "epoch": 0.8618905135587798, + "grad_norm": 4.465482234954834, + "learning_rate": 5.006103768528486e-06, + "loss": 1.4003, + "step": 11450 + }, + { + "epoch": 0.86196578783944, + "grad_norm": 5.4790778160095215, + "learning_rate": 5.000788148029678e-06, + "loss": 2.0433, + "step": 11451 + }, + { + "epoch": 0.8620410621201001, + "grad_norm": 5.217287063598633, + "learning_rate": 4.995475202604905e-06, + "loss": 1.7413, + "step": 11452 + }, + { + "epoch": 0.8621163364007602, + "grad_norm": 5.025666236877441, + "learning_rate": 4.990164932569985e-06, + "loss": 1.8915, + "step": 11453 + }, + { + "epoch": 0.8621916106814205, + "grad_norm": 4.624127388000488, + "learning_rate": 4.984857338240623e-06, + "loss": 1.9549, + "step": 11454 + }, + { + "epoch": 0.8622668849620806, + "grad_norm": 4.594157695770264, + "learning_rate": 4.979552419932327e-06, + "loss": 2.1406, + "step": 11455 + }, + { + "epoch": 0.8623421592427407, + "grad_norm": 4.060484409332275, + "learning_rate": 4.97425017796046e-06, + "loss": 1.7687, + "step": 11456 + }, + { + "epoch": 0.862417433523401, + "grad_norm": 6.634838104248047, + "learning_rate": 4.9689506126402216e-06, + "loss": 1.8449, + "step": 11457 + }, + { + "epoch": 0.8624927078040611, + "grad_norm": 3.821725368499756, + "learning_rate": 4.963653724286671e-06, + "loss": 1.5767, + "step": 11458 + }, + { + "epoch": 0.8625679820847212, + "grad_norm": 5.238875865936279, + "learning_rate": 4.958359513214678e-06, + "loss": 2.2654, + "step": 11459 + }, + { + "epoch": 0.8626432563653813, + "grad_norm": 5.4520792961120605, + "learning_rate": 4.953067979738968e-06, + "loss": 1.9012, + "step": 11460 + }, + { + "epoch": 0.8627185306460415, + "grad_norm": 4.650437831878662, + "learning_rate": 4.947779124174112e-06, + "loss": 1.7756, + "step": 11461 + }, + { + "epoch": 0.8627938049267017, + "grad_norm": 7.86212158203125, + "learning_rate": 4.942492946834526e-06, + "loss": 2.2318, + "step": 11462 + }, + { + "epoch": 0.8628690792073618, + "grad_norm": 5.373051166534424, + "learning_rate": 4.937209448034441e-06, + "loss": 1.4676, + "step": 11463 + }, + { + "epoch": 0.862944353488022, + "grad_norm": 4.336888790130615, + "learning_rate": 4.931928628087956e-06, + "loss": 1.8026, + "step": 11464 + }, + { + "epoch": 0.8630196277686821, + "grad_norm": 5.209778308868408, + "learning_rate": 4.926650487309009e-06, + "loss": 1.9573, + "step": 11465 + }, + { + "epoch": 0.8630949020493422, + "grad_norm": 4.62621545791626, + "learning_rate": 4.921375026011349e-06, + "loss": 1.7024, + "step": 11466 + }, + { + "epoch": 0.8631701763300025, + "grad_norm": 4.174376010894775, + "learning_rate": 4.916102244508614e-06, + "loss": 1.6201, + "step": 11467 + }, + { + "epoch": 0.8632454506106626, + "grad_norm": 4.613985061645508, + "learning_rate": 4.910832143114225e-06, + "loss": 1.6349, + "step": 11468 + }, + { + "epoch": 0.8633207248913227, + "grad_norm": 3.961477518081665, + "learning_rate": 4.905564722141498e-06, + "loss": 1.5436, + "step": 11469 + }, + { + "epoch": 0.863395999171983, + "grad_norm": 4.334711074829102, + "learning_rate": 4.900299981903556e-06, + "loss": 1.7122, + "step": 11470 + }, + { + "epoch": 0.8634712734526431, + "grad_norm": 4.807438373565674, + "learning_rate": 4.895037922713369e-06, + "loss": 1.7362, + "step": 11471 + }, + { + "epoch": 0.8635465477333032, + "grad_norm": 4.9058637619018555, + "learning_rate": 4.8897785448837555e-06, + "loss": 1.7389, + "step": 11472 + }, + { + "epoch": 0.8636218220139634, + "grad_norm": 4.043872356414795, + "learning_rate": 4.884521848727386e-06, + "loss": 1.8973, + "step": 11473 + }, + { + "epoch": 0.8636970962946235, + "grad_norm": 4.52606725692749, + "learning_rate": 4.879267834556728e-06, + "loss": 1.734, + "step": 11474 + }, + { + "epoch": 0.8637723705752837, + "grad_norm": 3.9975106716156006, + "learning_rate": 4.874016502684142e-06, + "loss": 1.7962, + "step": 11475 + }, + { + "epoch": 0.8638476448559439, + "grad_norm": 5.700189590454102, + "learning_rate": 4.868767853421785e-06, + "loss": 1.9558, + "step": 11476 + }, + { + "epoch": 0.863922919136604, + "grad_norm": 4.748110294342041, + "learning_rate": 4.86352188708169e-06, + "loss": 1.8304, + "step": 11477 + }, + { + "epoch": 0.8639981934172641, + "grad_norm": 4.018179416656494, + "learning_rate": 4.8582786039757025e-06, + "loss": 1.8149, + "step": 11478 + }, + { + "epoch": 0.8640734676979243, + "grad_norm": 4.373999118804932, + "learning_rate": 4.8530380044155274e-06, + "loss": 1.7972, + "step": 11479 + }, + { + "epoch": 0.8641487419785845, + "grad_norm": 4.066504001617432, + "learning_rate": 4.847800088712706e-06, + "loss": 1.8754, + "step": 11480 + }, + { + "epoch": 0.8642240162592446, + "grad_norm": 5.190664291381836, + "learning_rate": 4.842564857178605e-06, + "loss": 1.9336, + "step": 11481 + }, + { + "epoch": 0.8642992905399047, + "grad_norm": 5.248288631439209, + "learning_rate": 4.837332310124454e-06, + "loss": 1.9199, + "step": 11482 + }, + { + "epoch": 0.864374564820565, + "grad_norm": 4.19376277923584, + "learning_rate": 4.832102447861309e-06, + "loss": 2.1524, + "step": 11483 + }, + { + "epoch": 0.8644498391012251, + "grad_norm": 4.536162376403809, + "learning_rate": 4.826875270700082e-06, + "loss": 1.7688, + "step": 11484 + }, + { + "epoch": 0.8645251133818852, + "grad_norm": 3.9947783946990967, + "learning_rate": 4.821650778951492e-06, + "loss": 1.6149, + "step": 11485 + }, + { + "epoch": 0.8646003876625454, + "grad_norm": 4.225605010986328, + "learning_rate": 4.816428972926146e-06, + "loss": 1.6091, + "step": 11486 + }, + { + "epoch": 0.8646756619432056, + "grad_norm": 4.963068008422852, + "learning_rate": 4.8112098529344395e-06, + "loss": 1.8834, + "step": 11487 + }, + { + "epoch": 0.8647509362238657, + "grad_norm": 4.862916946411133, + "learning_rate": 4.805993419286658e-06, + "loss": 1.7848, + "step": 11488 + }, + { + "epoch": 0.8648262105045259, + "grad_norm": 4.686556816101074, + "learning_rate": 4.800779672292882e-06, + "loss": 1.7064, + "step": 11489 + }, + { + "epoch": 0.864901484785186, + "grad_norm": 5.078429222106934, + "learning_rate": 4.7955686122630685e-06, + "loss": 1.9309, + "step": 11490 + }, + { + "epoch": 0.8649767590658461, + "grad_norm": 5.973289966583252, + "learning_rate": 4.790360239506986e-06, + "loss": 1.6972, + "step": 11491 + }, + { + "epoch": 0.8650520333465064, + "grad_norm": 4.8482441902160645, + "learning_rate": 4.78515455433427e-06, + "loss": 1.8212, + "step": 11492 + }, + { + "epoch": 0.8651273076271665, + "grad_norm": 4.316984176635742, + "learning_rate": 4.779951557054391e-06, + "loss": 2.2551, + "step": 11493 + }, + { + "epoch": 0.8652025819078266, + "grad_norm": 4.604041576385498, + "learning_rate": 4.774751247976628e-06, + "loss": 1.6312, + "step": 11494 + }, + { + "epoch": 0.8652778561884868, + "grad_norm": 5.055167198181152, + "learning_rate": 4.769553627410134e-06, + "loss": 1.6159, + "step": 11495 + }, + { + "epoch": 0.865353130469147, + "grad_norm": 4.917102813720703, + "learning_rate": 4.7643586956639005e-06, + "loss": 2.2331, + "step": 11496 + }, + { + "epoch": 0.8654284047498071, + "grad_norm": 5.21144437789917, + "learning_rate": 4.759166453046754e-06, + "loss": 1.794, + "step": 11497 + }, + { + "epoch": 0.8655036790304672, + "grad_norm": 4.193215847015381, + "learning_rate": 4.753976899867346e-06, + "loss": 1.9602, + "step": 11498 + }, + { + "epoch": 0.8655789533111274, + "grad_norm": 5.544768333435059, + "learning_rate": 4.748790036434192e-06, + "loss": 1.9221, + "step": 11499 + }, + { + "epoch": 0.8656542275917876, + "grad_norm": 4.593832969665527, + "learning_rate": 4.7436058630556225e-06, + "loss": 1.8846, + "step": 11500 + }, + { + "epoch": 0.8657295018724477, + "grad_norm": 5.250650405883789, + "learning_rate": 4.738424380039835e-06, + "loss": 1.6778, + "step": 11501 + }, + { + "epoch": 0.8658047761531079, + "grad_norm": 3.97278094291687, + "learning_rate": 4.733245587694846e-06, + "loss": 2.081, + "step": 11502 + }, + { + "epoch": 0.865880050433768, + "grad_norm": 6.192994594573975, + "learning_rate": 4.728069486328524e-06, + "loss": 2.0425, + "step": 11503 + }, + { + "epoch": 0.8659553247144282, + "grad_norm": 6.482472896575928, + "learning_rate": 4.722896076248568e-06, + "loss": 1.8384, + "step": 11504 + }, + { + "epoch": 0.8660305989950884, + "grad_norm": 5.451891899108887, + "learning_rate": 4.71772535776252e-06, + "loss": 1.6352, + "step": 11505 + }, + { + "epoch": 0.8661058732757485, + "grad_norm": 6.063283920288086, + "learning_rate": 4.712557331177775e-06, + "loss": 1.8351, + "step": 11506 + }, + { + "epoch": 0.8661811475564086, + "grad_norm": 4.665142059326172, + "learning_rate": 4.707391996801558e-06, + "loss": 1.5667, + "step": 11507 + }, + { + "epoch": 0.8662564218370689, + "grad_norm": 5.57370662689209, + "learning_rate": 4.702229354940918e-06, + "loss": 1.8064, + "step": 11508 + }, + { + "epoch": 0.866331696117729, + "grad_norm": 4.212967872619629, + "learning_rate": 4.697069405902782e-06, + "loss": 1.4364, + "step": 11509 + }, + { + "epoch": 0.8664069703983891, + "grad_norm": 4.478463172912598, + "learning_rate": 4.691912149993866e-06, + "loss": 1.4597, + "step": 11510 + }, + { + "epoch": 0.8664822446790493, + "grad_norm": 4.530065059661865, + "learning_rate": 4.686757587520774e-06, + "loss": 2.0677, + "step": 11511 + }, + { + "epoch": 0.8665575189597094, + "grad_norm": 4.536741733551025, + "learning_rate": 4.681605718789933e-06, + "loss": 1.4195, + "step": 11512 + }, + { + "epoch": 0.8666327932403696, + "grad_norm": 3.8651647567749023, + "learning_rate": 4.6764565441075935e-06, + "loss": 1.8939, + "step": 11513 + }, + { + "epoch": 0.8667080675210298, + "grad_norm": 6.274515628814697, + "learning_rate": 4.671310063779871e-06, + "loss": 1.7656, + "step": 11514 + }, + { + "epoch": 0.8667833418016899, + "grad_norm": 4.5438232421875, + "learning_rate": 4.666166278112688e-06, + "loss": 1.6627, + "step": 11515 + }, + { + "epoch": 0.86685861608235, + "grad_norm": 4.222365379333496, + "learning_rate": 4.661025187411849e-06, + "loss": 1.6147, + "step": 11516 + }, + { + "epoch": 0.8669338903630103, + "grad_norm": 5.207612037658691, + "learning_rate": 4.655886791982972e-06, + "loss": 1.8663, + "step": 11517 + }, + { + "epoch": 0.8670091646436704, + "grad_norm": 4.801716327667236, + "learning_rate": 4.650751092131523e-06, + "loss": 2.2603, + "step": 11518 + }, + { + "epoch": 0.8670844389243305, + "grad_norm": 6.084791660308838, + "learning_rate": 4.64561808816279e-06, + "loss": 1.9834, + "step": 11519 + }, + { + "epoch": 0.8671597132049906, + "grad_norm": 6.908560752868652, + "learning_rate": 4.640487780381936e-06, + "loss": 1.5051, + "step": 11520 + }, + { + "epoch": 0.8672349874856509, + "grad_norm": 4.244696140289307, + "learning_rate": 4.635360169093928e-06, + "loss": 1.7514, + "step": 11521 + }, + { + "epoch": 0.867310261766311, + "grad_norm": 3.900775194168091, + "learning_rate": 4.6302352546035935e-06, + "loss": 1.7779, + "step": 11522 + }, + { + "epoch": 0.8673855360469711, + "grad_norm": 4.440598487854004, + "learning_rate": 4.625113037215589e-06, + "loss": 1.4278, + "step": 11523 + }, + { + "epoch": 0.8674608103276313, + "grad_norm": 4.032225608825684, + "learning_rate": 4.619993517234428e-06, + "loss": 1.8447, + "step": 11524 + }, + { + "epoch": 0.8675360846082915, + "grad_norm": 4.032618045806885, + "learning_rate": 4.614876694964432e-06, + "loss": 1.4955, + "step": 11525 + }, + { + "epoch": 0.8676113588889516, + "grad_norm": 4.271254539489746, + "learning_rate": 4.6097625707097914e-06, + "loss": 1.787, + "step": 11526 + }, + { + "epoch": 0.8676866331696118, + "grad_norm": 4.169753551483154, + "learning_rate": 4.604651144774524e-06, + "loss": 1.6766, + "step": 11527 + }, + { + "epoch": 0.8677619074502719, + "grad_norm": 5.307674407958984, + "learning_rate": 4.599542417462499e-06, + "loss": 1.5426, + "step": 11528 + }, + { + "epoch": 0.867837181730932, + "grad_norm": 4.8163981437683105, + "learning_rate": 4.594436389077411e-06, + "loss": 2.1584, + "step": 11529 + }, + { + "epoch": 0.8679124560115923, + "grad_norm": 4.162590026855469, + "learning_rate": 4.58933305992279e-06, + "loss": 1.7731, + "step": 11530 + }, + { + "epoch": 0.8679877302922524, + "grad_norm": 4.718050003051758, + "learning_rate": 4.584232430302033e-06, + "loss": 1.6502, + "step": 11531 + }, + { + "epoch": 0.8680630045729125, + "grad_norm": 5.588866233825684, + "learning_rate": 4.579134500518334e-06, + "loss": 2.1532, + "step": 11532 + }, + { + "epoch": 0.8681382788535728, + "grad_norm": 4.87261438369751, + "learning_rate": 4.574039270874775e-06, + "loss": 1.6802, + "step": 11533 + }, + { + "epoch": 0.8682135531342329, + "grad_norm": 5.608423709869385, + "learning_rate": 4.5689467416742234e-06, + "loss": 1.72, + "step": 11534 + }, + { + "epoch": 0.868288827414893, + "grad_norm": 6.913978576660156, + "learning_rate": 4.5638569132194446e-06, + "loss": 1.6945, + "step": 11535 + }, + { + "epoch": 0.8683641016955532, + "grad_norm": 4.584637641906738, + "learning_rate": 4.558769785812994e-06, + "loss": 1.554, + "step": 11536 + }, + { + "epoch": 0.8684393759762133, + "grad_norm": 4.676369667053223, + "learning_rate": 4.553685359757287e-06, + "loss": 1.8062, + "step": 11537 + }, + { + "epoch": 0.8685146502568735, + "grad_norm": 4.393977165222168, + "learning_rate": 4.548603635354592e-06, + "loss": 1.9029, + "step": 11538 + }, + { + "epoch": 0.8685899245375336, + "grad_norm": 5.103185653686523, + "learning_rate": 4.543524612907002e-06, + "loss": 1.9495, + "step": 11539 + }, + { + "epoch": 0.8686651988181938, + "grad_norm": 4.820971488952637, + "learning_rate": 4.538448292716441e-06, + "loss": 1.6858, + "step": 11540 + }, + { + "epoch": 0.8687404730988539, + "grad_norm": 5.37093448638916, + "learning_rate": 4.533374675084689e-06, + "loss": 1.9335, + "step": 11541 + }, + { + "epoch": 0.8688157473795141, + "grad_norm": 5.1994123458862305, + "learning_rate": 4.528303760313346e-06, + "loss": 1.8861, + "step": 11542 + }, + { + "epoch": 0.8688910216601743, + "grad_norm": 5.579268932342529, + "learning_rate": 4.52323554870388e-06, + "loss": 1.8159, + "step": 11543 + }, + { + "epoch": 0.8689662959408344, + "grad_norm": 3.9115986824035645, + "learning_rate": 4.518170040557568e-06, + "loss": 1.7044, + "step": 11544 + }, + { + "epoch": 0.8690415702214945, + "grad_norm": 5.134445667266846, + "learning_rate": 4.51310723617554e-06, + "loss": 1.8796, + "step": 11545 + }, + { + "epoch": 0.8691168445021548, + "grad_norm": 4.854567527770996, + "learning_rate": 4.508047135858778e-06, + "loss": 2.0243, + "step": 11546 + }, + { + "epoch": 0.8691921187828149, + "grad_norm": 5.1354217529296875, + "learning_rate": 4.502989739908081e-06, + "loss": 1.894, + "step": 11547 + }, + { + "epoch": 0.869267393063475, + "grad_norm": 5.95161247253418, + "learning_rate": 4.497935048624102e-06, + "loss": 1.7065, + "step": 11548 + }, + { + "epoch": 0.8693426673441352, + "grad_norm": 4.187385559082031, + "learning_rate": 4.492883062307318e-06, + "loss": 1.7254, + "step": 11549 + }, + { + "epoch": 0.8694179416247954, + "grad_norm": 3.7051124572753906, + "learning_rate": 4.4878337812580605e-06, + "loss": 1.6645, + "step": 11550 + }, + { + "epoch": 0.8694932159054555, + "grad_norm": 5.29319953918457, + "learning_rate": 4.482787205776495e-06, + "loss": 1.9425, + "step": 11551 + }, + { + "epoch": 0.8695684901861157, + "grad_norm": 4.27440881729126, + "learning_rate": 4.477743336162638e-06, + "loss": 1.914, + "step": 11552 + }, + { + "epoch": 0.8696437644667758, + "grad_norm": 4.368507385253906, + "learning_rate": 4.472702172716309e-06, + "loss": 1.684, + "step": 11553 + }, + { + "epoch": 0.869719038747436, + "grad_norm": 6.391887187957764, + "learning_rate": 4.467663715737214e-06, + "loss": 1.9123, + "step": 11554 + }, + { + "epoch": 0.8697943130280962, + "grad_norm": 4.533965110778809, + "learning_rate": 4.462627965524851e-06, + "loss": 2.1336, + "step": 11555 + }, + { + "epoch": 0.8698695873087563, + "grad_norm": 3.8347065448760986, + "learning_rate": 4.4575949223786105e-06, + "loss": 2.0396, + "step": 11556 + }, + { + "epoch": 0.8699448615894164, + "grad_norm": 4.205840587615967, + "learning_rate": 4.452564586597663e-06, + "loss": 1.5999, + "step": 11557 + }, + { + "epoch": 0.8700201358700765, + "grad_norm": 5.164328098297119, + "learning_rate": 4.447536958481069e-06, + "loss": 1.7612, + "step": 11558 + }, + { + "epoch": 0.8700954101507368, + "grad_norm": 3.815864324569702, + "learning_rate": 4.44251203832769e-06, + "loss": 2.0034, + "step": 11559 + }, + { + "epoch": 0.8701706844313969, + "grad_norm": 6.352051734924316, + "learning_rate": 4.437489826436253e-06, + "loss": 2.1426, + "step": 11560 + }, + { + "epoch": 0.870245958712057, + "grad_norm": 6.125393390655518, + "learning_rate": 4.432470323105309e-06, + "loss": 1.9815, + "step": 11561 + }, + { + "epoch": 0.8703212329927172, + "grad_norm": 5.443470478057861, + "learning_rate": 4.427453528633263e-06, + "loss": 2.008, + "step": 11562 + }, + { + "epoch": 0.8703965072733774, + "grad_norm": 3.6203298568725586, + "learning_rate": 4.422439443318343e-06, + "loss": 1.4455, + "step": 11563 + }, + { + "epoch": 0.8704717815540375, + "grad_norm": 5.032252311706543, + "learning_rate": 4.417428067458618e-06, + "loss": 2.0595, + "step": 11564 + }, + { + "epoch": 0.8705470558346977, + "grad_norm": 4.501786708831787, + "learning_rate": 4.412419401352008e-06, + "loss": 1.8541, + "step": 11565 + }, + { + "epoch": 0.8706223301153578, + "grad_norm": 5.044890880584717, + "learning_rate": 4.407413445296255e-06, + "loss": 2.0446, + "step": 11566 + }, + { + "epoch": 0.870697604396018, + "grad_norm": 4.555783271789551, + "learning_rate": 4.402410199588958e-06, + "loss": 1.5987, + "step": 11567 + }, + { + "epoch": 0.8707728786766782, + "grad_norm": 7.961402893066406, + "learning_rate": 4.3974096645275354e-06, + "loss": 1.6441, + "step": 11568 + }, + { + "epoch": 0.8708481529573383, + "grad_norm": 5.344577312469482, + "learning_rate": 4.392411840409266e-06, + "loss": 1.7703, + "step": 11569 + }, + { + "epoch": 0.8709234272379984, + "grad_norm": 4.754013538360596, + "learning_rate": 4.38741672753124e-06, + "loss": 1.8351, + "step": 11570 + }, + { + "epoch": 0.8709987015186587, + "grad_norm": 4.343361854553223, + "learning_rate": 4.382424326190415e-06, + "loss": 1.6238, + "step": 11571 + }, + { + "epoch": 0.8710739757993188, + "grad_norm": 5.468206405639648, + "learning_rate": 4.37743463668357e-06, + "loss": 1.3445, + "step": 11572 + }, + { + "epoch": 0.8711492500799789, + "grad_norm": 4.7761969566345215, + "learning_rate": 4.372447659307338e-06, + "loss": 2.1126, + "step": 11573 + }, + { + "epoch": 0.8712245243606391, + "grad_norm": 4.152520179748535, + "learning_rate": 4.367463394358168e-06, + "loss": 1.3699, + "step": 11574 + }, + { + "epoch": 0.8712997986412993, + "grad_norm": 4.990994453430176, + "learning_rate": 4.362481842132371e-06, + "loss": 1.9946, + "step": 11575 + }, + { + "epoch": 0.8713750729219594, + "grad_norm": 4.660702705383301, + "learning_rate": 4.357503002926072e-06, + "loss": 1.7523, + "step": 11576 + }, + { + "epoch": 0.8714503472026195, + "grad_norm": 5.045409202575684, + "learning_rate": 4.352526877035257e-06, + "loss": 1.6273, + "step": 11577 + }, + { + "epoch": 0.8715256214832797, + "grad_norm": 5.219690322875977, + "learning_rate": 4.347553464755749e-06, + "loss": 1.8787, + "step": 11578 + }, + { + "epoch": 0.8716008957639398, + "grad_norm": 5.692169666290283, + "learning_rate": 4.342582766383185e-06, + "loss": 1.8663, + "step": 11579 + }, + { + "epoch": 0.8716761700446, + "grad_norm": 4.974844932556152, + "learning_rate": 4.337614782213079e-06, + "loss": 1.8248, + "step": 11580 + }, + { + "epoch": 0.8717514443252602, + "grad_norm": 4.979865074157715, + "learning_rate": 4.332649512540748e-06, + "loss": 1.5178, + "step": 11581 + }, + { + "epoch": 0.8718267186059203, + "grad_norm": 5.0891618728637695, + "learning_rate": 4.32768695766137e-06, + "loss": 2.0021, + "step": 11582 + }, + { + "epoch": 0.8719019928865804, + "grad_norm": 3.6198441982269287, + "learning_rate": 4.322727117869951e-06, + "loss": 1.9911, + "step": 11583 + }, + { + "epoch": 0.8719772671672407, + "grad_norm": 4.341318130493164, + "learning_rate": 4.317769993461351e-06, + "loss": 1.866, + "step": 11584 + }, + { + "epoch": 0.8720525414479008, + "grad_norm": 4.057556629180908, + "learning_rate": 4.31281558473024e-06, + "loss": 1.9603, + "step": 11585 + }, + { + "epoch": 0.8721278157285609, + "grad_norm": 6.607398986816406, + "learning_rate": 4.307863891971164e-06, + "loss": 2.0442, + "step": 11586 + }, + { + "epoch": 0.8722030900092211, + "grad_norm": 4.788666725158691, + "learning_rate": 4.302914915478462e-06, + "loss": 1.7816, + "step": 11587 + }, + { + "epoch": 0.8722783642898813, + "grad_norm": 6.239824295043945, + "learning_rate": 4.297968655546353e-06, + "loss": 1.7223, + "step": 11588 + }, + { + "epoch": 0.8723536385705414, + "grad_norm": 3.6766207218170166, + "learning_rate": 4.293025112468868e-06, + "loss": 1.7178, + "step": 11589 + }, + { + "epoch": 0.8724289128512016, + "grad_norm": 4.75899600982666, + "learning_rate": 4.288084286539906e-06, + "loss": 1.6665, + "step": 11590 + }, + { + "epoch": 0.8725041871318617, + "grad_norm": 6.134271144866943, + "learning_rate": 4.283146178053155e-06, + "loss": 1.7782, + "step": 11591 + }, + { + "epoch": 0.8725794614125219, + "grad_norm": 6.320107936859131, + "learning_rate": 4.278210787302189e-06, + "loss": 1.3949, + "step": 11592 + }, + { + "epoch": 0.8726547356931821, + "grad_norm": 4.9732666015625, + "learning_rate": 4.273278114580403e-06, + "loss": 1.6789, + "step": 11593 + }, + { + "epoch": 0.8727300099738422, + "grad_norm": 6.660370349884033, + "learning_rate": 4.268348160181035e-06, + "loss": 1.9689, + "step": 11594 + }, + { + "epoch": 0.8728052842545023, + "grad_norm": 5.151939392089844, + "learning_rate": 4.263420924397143e-06, + "loss": 1.7288, + "step": 11595 + }, + { + "epoch": 0.8728805585351624, + "grad_norm": 5.486466884613037, + "learning_rate": 4.2584964075216446e-06, + "loss": 1.9685, + "step": 11596 + }, + { + "epoch": 0.8729558328158227, + "grad_norm": 4.641103744506836, + "learning_rate": 4.2535746098473e-06, + "loss": 1.6891, + "step": 11597 + }, + { + "epoch": 0.8730311070964828, + "grad_norm": 4.811590194702148, + "learning_rate": 4.248655531666668e-06, + "loss": 1.8336, + "step": 11598 + }, + { + "epoch": 0.8731063813771429, + "grad_norm": 6.013032913208008, + "learning_rate": 4.2437391732721985e-06, + "loss": 1.8566, + "step": 11599 + }, + { + "epoch": 0.8731816556578031, + "grad_norm": 4.535085201263428, + "learning_rate": 4.238825534956142e-06, + "loss": 1.8832, + "step": 11600 + }, + { + "epoch": 0.8732569299384633, + "grad_norm": 4.858318328857422, + "learning_rate": 4.233914617010609e-06, + "loss": 1.9169, + "step": 11601 + }, + { + "epoch": 0.8733322042191234, + "grad_norm": 5.1384196281433105, + "learning_rate": 4.229006419727527e-06, + "loss": 2.1884, + "step": 11602 + }, + { + "epoch": 0.8734074784997836, + "grad_norm": 4.979616641998291, + "learning_rate": 4.22410094339869e-06, + "loss": 1.7776, + "step": 11603 + }, + { + "epoch": 0.8734827527804437, + "grad_norm": 5.56610107421875, + "learning_rate": 4.2191981883156985e-06, + "loss": 1.7658, + "step": 11604 + }, + { + "epoch": 0.8735580270611039, + "grad_norm": 5.433693885803223, + "learning_rate": 4.214298154770013e-06, + "loss": 1.7263, + "step": 11605 + }, + { + "epoch": 0.8736333013417641, + "grad_norm": 5.809007167816162, + "learning_rate": 4.209400843052924e-06, + "loss": 1.6306, + "step": 11606 + }, + { + "epoch": 0.8737085756224242, + "grad_norm": 5.423648834228516, + "learning_rate": 4.204506253455582e-06, + "loss": 1.7637, + "step": 11607 + }, + { + "epoch": 0.8737838499030843, + "grad_norm": 5.1037116050720215, + "learning_rate": 4.1996143862689294e-06, + "loss": 1.7978, + "step": 11608 + }, + { + "epoch": 0.8738591241837446, + "grad_norm": 4.331618785858154, + "learning_rate": 4.194725241783792e-06, + "loss": 1.4007, + "step": 11609 + }, + { + "epoch": 0.8739343984644047, + "grad_norm": 3.9037506580352783, + "learning_rate": 4.189838820290809e-06, + "loss": 1.6225, + "step": 11610 + }, + { + "epoch": 0.8740096727450648, + "grad_norm": 5.359700679779053, + "learning_rate": 4.184955122080458e-06, + "loss": 1.9444, + "step": 11611 + }, + { + "epoch": 0.874084947025725, + "grad_norm": 5.916167259216309, + "learning_rate": 4.180074147443081e-06, + "loss": 1.8808, + "step": 11612 + }, + { + "epoch": 0.8741602213063852, + "grad_norm": 6.919978141784668, + "learning_rate": 4.17519589666881e-06, + "loss": 1.8984, + "step": 11613 + }, + { + "epoch": 0.8742354955870453, + "grad_norm": 3.9990856647491455, + "learning_rate": 4.170320370047665e-06, + "loss": 1.5025, + "step": 11614 + }, + { + "epoch": 0.8743107698677055, + "grad_norm": 6.4516825675964355, + "learning_rate": 4.1654475678694686e-06, + "loss": 2.0728, + "step": 11615 + }, + { + "epoch": 0.8743860441483656, + "grad_norm": 5.90434455871582, + "learning_rate": 4.160577490423895e-06, + "loss": 1.8552, + "step": 11616 + }, + { + "epoch": 0.8744613184290257, + "grad_norm": 4.695637226104736, + "learning_rate": 4.155710138000468e-06, + "loss": 1.7793, + "step": 11617 + }, + { + "epoch": 0.8745365927096859, + "grad_norm": 3.9613237380981445, + "learning_rate": 4.1508455108885344e-06, + "loss": 1.7337, + "step": 11618 + }, + { + "epoch": 0.8746118669903461, + "grad_norm": 5.0093302726745605, + "learning_rate": 4.145983609377274e-06, + "loss": 1.5641, + "step": 11619 + }, + { + "epoch": 0.8746871412710062, + "grad_norm": 4.1677703857421875, + "learning_rate": 4.141124433755722e-06, + "loss": 1.8011, + "step": 11620 + }, + { + "epoch": 0.8747624155516663, + "grad_norm": 5.50277853012085, + "learning_rate": 4.136267984312736e-06, + "loss": 1.8674, + "step": 11621 + }, + { + "epoch": 0.8748376898323266, + "grad_norm": 7.7771806716918945, + "learning_rate": 4.131414261337024e-06, + "loss": 1.9557, + "step": 11622 + }, + { + "epoch": 0.8749129641129867, + "grad_norm": 4.732153415679932, + "learning_rate": 4.126563265117117e-06, + "loss": 1.8455, + "step": 11623 + }, + { + "epoch": 0.8749882383936468, + "grad_norm": 6.435847759246826, + "learning_rate": 4.1217149959414e-06, + "loss": 1.7414, + "step": 11624 + }, + { + "epoch": 0.875063512674307, + "grad_norm": 6.931395530700684, + "learning_rate": 4.116869454098093e-06, + "loss": 2.16, + "step": 11625 + }, + { + "epoch": 0.8751387869549672, + "grad_norm": 5.370662689208984, + "learning_rate": 4.112026639875233e-06, + "loss": 1.6863, + "step": 11626 + }, + { + "epoch": 0.8752140612356273, + "grad_norm": 4.384397983551025, + "learning_rate": 4.1071865535607266e-06, + "loss": 1.4211, + "step": 11627 + }, + { + "epoch": 0.8752893355162875, + "grad_norm": 6.004904747009277, + "learning_rate": 4.1023491954422955e-06, + "loss": 1.6357, + "step": 11628 + }, + { + "epoch": 0.8753646097969476, + "grad_norm": 4.047789096832275, + "learning_rate": 4.097514565807514e-06, + "loss": 1.6432, + "step": 11629 + }, + { + "epoch": 0.8754398840776078, + "grad_norm": 4.4809160232543945, + "learning_rate": 4.092682664943781e-06, + "loss": 1.8955, + "step": 11630 + }, + { + "epoch": 0.875515158358268, + "grad_norm": 4.3151068687438965, + "learning_rate": 4.087853493138344e-06, + "loss": 1.6515, + "step": 11631 + }, + { + "epoch": 0.8755904326389281, + "grad_norm": 7.531031608581543, + "learning_rate": 4.083027050678279e-06, + "loss": 1.5924, + "step": 11632 + }, + { + "epoch": 0.8756657069195882, + "grad_norm": 6.302493572235107, + "learning_rate": 4.078203337850506e-06, + "loss": 1.8163, + "step": 11633 + }, + { + "epoch": 0.8757409812002485, + "grad_norm": 4.595805644989014, + "learning_rate": 4.0733823549417736e-06, + "loss": 1.6336, + "step": 11634 + }, + { + "epoch": 0.8758162554809086, + "grad_norm": 5.001889705657959, + "learning_rate": 4.068564102238692e-06, + "loss": 2.0329, + "step": 11635 + }, + { + "epoch": 0.8758915297615687, + "grad_norm": 5.550800323486328, + "learning_rate": 4.063748580027676e-06, + "loss": 2.2104, + "step": 11636 + }, + { + "epoch": 0.8759668040422288, + "grad_norm": 5.672146320343018, + "learning_rate": 4.0589357885949975e-06, + "loss": 1.8105, + "step": 11637 + }, + { + "epoch": 0.876042078322889, + "grad_norm": 3.606351852416992, + "learning_rate": 4.054125728226765e-06, + "loss": 1.7368, + "step": 11638 + }, + { + "epoch": 0.8761173526035492, + "grad_norm": 7.833452224731445, + "learning_rate": 4.049318399208935e-06, + "loss": 1.764, + "step": 11639 + }, + { + "epoch": 0.8761926268842093, + "grad_norm": 5.139779567718506, + "learning_rate": 4.044513801827271e-06, + "loss": 2.6393, + "step": 11640 + }, + { + "epoch": 0.8762679011648695, + "grad_norm": 4.088352680206299, + "learning_rate": 4.039711936367402e-06, + "loss": 1.6665, + "step": 11641 + }, + { + "epoch": 0.8763431754455296, + "grad_norm": 5.271029472351074, + "learning_rate": 4.034912803114793e-06, + "loss": 1.7512, + "step": 11642 + }, + { + "epoch": 0.8764184497261898, + "grad_norm": 4.135016441345215, + "learning_rate": 4.030116402354717e-06, + "loss": 2.3249, + "step": 11643 + }, + { + "epoch": 0.87649372400685, + "grad_norm": 5.018148422241211, + "learning_rate": 4.025322734372328e-06, + "loss": 1.6255, + "step": 11644 + }, + { + "epoch": 0.8765689982875101, + "grad_norm": 5.522651195526123, + "learning_rate": 4.020531799452576e-06, + "loss": 1.7589, + "step": 11645 + }, + { + "epoch": 0.8766442725681702, + "grad_norm": 5.385191440582275, + "learning_rate": 4.015743597880289e-06, + "loss": 1.9545, + "step": 11646 + }, + { + "epoch": 0.8767195468488305, + "grad_norm": 4.508188724517822, + "learning_rate": 4.010958129940096e-06, + "loss": 1.5308, + "step": 11647 + }, + { + "epoch": 0.8767948211294906, + "grad_norm": 5.951359272003174, + "learning_rate": 4.006175395916489e-06, + "loss": 1.5051, + "step": 11648 + }, + { + "epoch": 0.8768700954101507, + "grad_norm": 4.846992015838623, + "learning_rate": 4.001395396093777e-06, + "loss": 1.6764, + "step": 11649 + }, + { + "epoch": 0.8769453696908109, + "grad_norm": 5.062412738800049, + "learning_rate": 3.996618130756119e-06, + "loss": 1.6578, + "step": 11650 + }, + { + "epoch": 0.8770206439714711, + "grad_norm": 5.050487041473389, + "learning_rate": 3.991843600187522e-06, + "loss": 1.8284, + "step": 11651 + }, + { + "epoch": 0.8770959182521312, + "grad_norm": 4.7077436447143555, + "learning_rate": 3.987071804671816e-06, + "loss": 1.6228, + "step": 11652 + }, + { + "epoch": 0.8771711925327914, + "grad_norm": 5.870832920074463, + "learning_rate": 3.982302744492655e-06, + "loss": 1.8833, + "step": 11653 + }, + { + "epoch": 0.8772464668134515, + "grad_norm": 4.387657165527344, + "learning_rate": 3.97753641993357e-06, + "loss": 1.7787, + "step": 11654 + }, + { + "epoch": 0.8773217410941117, + "grad_norm": 4.99583625793457, + "learning_rate": 3.9727728312778765e-06, + "loss": 1.8209, + "step": 11655 + }, + { + "epoch": 0.8773970153747718, + "grad_norm": 4.308382034301758, + "learning_rate": 3.9680119788087835e-06, + "loss": 1.4889, + "step": 11656 + }, + { + "epoch": 0.877472289655432, + "grad_norm": 5.0629496574401855, + "learning_rate": 3.96325386280929e-06, + "loss": 1.7658, + "step": 11657 + }, + { + "epoch": 0.8775475639360921, + "grad_norm": 4.795827388763428, + "learning_rate": 3.958498483562262e-06, + "loss": 1.7653, + "step": 11658 + }, + { + "epoch": 0.8776228382167522, + "grad_norm": 4.843021392822266, + "learning_rate": 3.953745841350393e-06, + "loss": 1.812, + "step": 11659 + }, + { + "epoch": 0.8776981124974125, + "grad_norm": 5.62350606918335, + "learning_rate": 3.948995936456207e-06, + "loss": 1.726, + "step": 11660 + }, + { + "epoch": 0.8777733867780726, + "grad_norm": 5.8269171714782715, + "learning_rate": 3.9442487691620785e-06, + "loss": 1.9534, + "step": 11661 + }, + { + "epoch": 0.8778486610587327, + "grad_norm": 4.477621555328369, + "learning_rate": 3.939504339750211e-06, + "loss": 1.6203, + "step": 11662 + }, + { + "epoch": 0.877923935339393, + "grad_norm": 6.591576099395752, + "learning_rate": 3.93476264850266e-06, + "loss": 1.8539, + "step": 11663 + }, + { + "epoch": 0.8779992096200531, + "grad_norm": 4.527975082397461, + "learning_rate": 3.9300236957012795e-06, + "loss": 1.697, + "step": 11664 + }, + { + "epoch": 0.8780744839007132, + "grad_norm": 7.748128890991211, + "learning_rate": 3.92528748162781e-06, + "loss": 1.6799, + "step": 11665 + }, + { + "epoch": 0.8781497581813734, + "grad_norm": 4.162700176239014, + "learning_rate": 3.920554006563792e-06, + "loss": 1.7182, + "step": 11666 + }, + { + "epoch": 0.8782250324620335, + "grad_norm": 4.6374831199646, + "learning_rate": 3.915823270790625e-06, + "loss": 2.174, + "step": 11667 + }, + { + "epoch": 0.8783003067426937, + "grad_norm": 4.238149166107178, + "learning_rate": 3.9110952745895325e-06, + "loss": 1.6535, + "step": 11668 + }, + { + "epoch": 0.8783755810233539, + "grad_norm": 4.5881147384643555, + "learning_rate": 3.906370018241584e-06, + "loss": 1.6171, + "step": 11669 + }, + { + "epoch": 0.878450855304014, + "grad_norm": 5.682853698730469, + "learning_rate": 3.9016475020276774e-06, + "loss": 1.8192, + "step": 11670 + }, + { + "epoch": 0.8785261295846741, + "grad_norm": 6.177950859069824, + "learning_rate": 3.89692772622855e-06, + "loss": 1.5586, + "step": 11671 + }, + { + "epoch": 0.8786014038653344, + "grad_norm": 5.545617580413818, + "learning_rate": 3.892210691124787e-06, + "loss": 1.7595, + "step": 11672 + }, + { + "epoch": 0.8786766781459945, + "grad_norm": 4.5383992195129395, + "learning_rate": 3.887496396996803e-06, + "loss": 1.6476, + "step": 11673 + }, + { + "epoch": 0.8787519524266546, + "grad_norm": 4.272154808044434, + "learning_rate": 3.8827848441248535e-06, + "loss": 1.5321, + "step": 11674 + }, + { + "epoch": 0.8788272267073147, + "grad_norm": 4.761125564575195, + "learning_rate": 3.8780760327890074e-06, + "loss": 2.1016, + "step": 11675 + }, + { + "epoch": 0.878902500987975, + "grad_norm": 5.4325337409973145, + "learning_rate": 3.873369963269219e-06, + "loss": 1.7483, + "step": 11676 + }, + { + "epoch": 0.8789777752686351, + "grad_norm": 5.6443939208984375, + "learning_rate": 3.8686666358452196e-06, + "loss": 1.5513, + "step": 11677 + }, + { + "epoch": 0.8790530495492952, + "grad_norm": 4.852678298950195, + "learning_rate": 3.863966050796635e-06, + "loss": 1.9776, + "step": 11678 + }, + { + "epoch": 0.8791283238299554, + "grad_norm": 4.189968109130859, + "learning_rate": 3.859268208402877e-06, + "loss": 1.8548, + "step": 11679 + }, + { + "epoch": 0.8792035981106155, + "grad_norm": 4.60287618637085, + "learning_rate": 3.854573108943244e-06, + "loss": 1.9396, + "step": 11680 + }, + { + "epoch": 0.8792788723912757, + "grad_norm": 5.255151748657227, + "learning_rate": 3.849880752696822e-06, + "loss": 2.1499, + "step": 11681 + }, + { + "epoch": 0.8793541466719359, + "grad_norm": 5.558388710021973, + "learning_rate": 3.8451911399425725e-06, + "loss": 1.7875, + "step": 11682 + }, + { + "epoch": 0.879429420952596, + "grad_norm": 4.829524517059326, + "learning_rate": 3.840504270959272e-06, + "loss": 1.8612, + "step": 11683 + }, + { + "epoch": 0.8795046952332561, + "grad_norm": 4.546968936920166, + "learning_rate": 3.835820146025559e-06, + "loss": 1.5953, + "step": 11684 + }, + { + "epoch": 0.8795799695139164, + "grad_norm": 6.303708553314209, + "learning_rate": 3.831138765419867e-06, + "loss": 2.1797, + "step": 11685 + }, + { + "epoch": 0.8796552437945765, + "grad_norm": 4.823291301727295, + "learning_rate": 3.8264601294205114e-06, + "loss": 2.0228, + "step": 11686 + }, + { + "epoch": 0.8797305180752366, + "grad_norm": 4.09678316116333, + "learning_rate": 3.821784238305609e-06, + "loss": 1.8344, + "step": 11687 + }, + { + "epoch": 0.8798057923558968, + "grad_norm": 5.928163051605225, + "learning_rate": 3.817111092353138e-06, + "loss": 1.7816, + "step": 11688 + }, + { + "epoch": 0.879881066636557, + "grad_norm": 4.956056118011475, + "learning_rate": 3.8124406918408962e-06, + "loss": 1.7789, + "step": 11689 + }, + { + "epoch": 0.8799563409172171, + "grad_norm": 4.112998962402344, + "learning_rate": 3.8077730370465247e-06, + "loss": 1.7091, + "step": 11690 + }, + { + "epoch": 0.8800316151978773, + "grad_norm": 4.732588291168213, + "learning_rate": 3.8031081282475122e-06, + "loss": 1.6471, + "step": 11691 + }, + { + "epoch": 0.8801068894785374, + "grad_norm": 4.895622730255127, + "learning_rate": 3.798445965721159e-06, + "loss": 1.6966, + "step": 11692 + }, + { + "epoch": 0.8801821637591976, + "grad_norm": 4.909831523895264, + "learning_rate": 3.793786549744627e-06, + "loss": 1.7482, + "step": 11693 + }, + { + "epoch": 0.8802574380398578, + "grad_norm": 5.542243480682373, + "learning_rate": 3.789129880594905e-06, + "loss": 1.4186, + "step": 11694 + }, + { + "epoch": 0.8803327123205179, + "grad_norm": 5.330420017242432, + "learning_rate": 3.7844759585488276e-06, + "loss": 1.8367, + "step": 11695 + }, + { + "epoch": 0.880407986601178, + "grad_norm": 7.158842086791992, + "learning_rate": 3.7798247838830403e-06, + "loss": 1.9479, + "step": 11696 + }, + { + "epoch": 0.8804832608818381, + "grad_norm": 5.65027379989624, + "learning_rate": 3.775176356874055e-06, + "loss": 2.0471, + "step": 11697 + }, + { + "epoch": 0.8805585351624984, + "grad_norm": 5.438329696655273, + "learning_rate": 3.7705306777981896e-06, + "loss": 1.7276, + "step": 11698 + }, + { + "epoch": 0.8806338094431585, + "grad_norm": 4.852111339569092, + "learning_rate": 3.76588774693164e-06, + "loss": 1.6597, + "step": 11699 + }, + { + "epoch": 0.8807090837238186, + "grad_norm": 3.996556520462036, + "learning_rate": 3.7612475645503963e-06, + "loss": 1.657, + "step": 11700 + }, + { + "epoch": 0.8807843580044789, + "grad_norm": 4.998070240020752, + "learning_rate": 3.756610130930316e-06, + "loss": 1.405, + "step": 11701 + }, + { + "epoch": 0.880859632285139, + "grad_norm": 4.979634761810303, + "learning_rate": 3.7519754463470725e-06, + "loss": 2.0878, + "step": 11702 + }, + { + "epoch": 0.8809349065657991, + "grad_norm": 4.137125015258789, + "learning_rate": 3.7473435110761903e-06, + "loss": 1.8052, + "step": 11703 + }, + { + "epoch": 0.8810101808464593, + "grad_norm": 4.739142417907715, + "learning_rate": 3.7427143253930207e-06, + "loss": 1.9087, + "step": 11704 + }, + { + "epoch": 0.8810854551271194, + "grad_norm": 6.339193344116211, + "learning_rate": 3.73808788957275e-06, + "loss": 1.9109, + "step": 11705 + }, + { + "epoch": 0.8811607294077796, + "grad_norm": 5.92103385925293, + "learning_rate": 3.733464203890419e-06, + "loss": 1.5833, + "step": 11706 + }, + { + "epoch": 0.8812360036884398, + "grad_norm": 4.376158237457275, + "learning_rate": 3.7288432686208853e-06, + "loss": 1.8536, + "step": 11707 + }, + { + "epoch": 0.8813112779690999, + "grad_norm": 5.535974025726318, + "learning_rate": 3.7242250840388625e-06, + "loss": 1.7207, + "step": 11708 + }, + { + "epoch": 0.88138655224976, + "grad_norm": 4.286526203155518, + "learning_rate": 3.7196096504188705e-06, + "loss": 1.824, + "step": 11709 + }, + { + "epoch": 0.8814618265304203, + "grad_norm": 5.494035243988037, + "learning_rate": 3.7149969680353e-06, + "loss": 1.713, + "step": 11710 + }, + { + "epoch": 0.8815371008110804, + "grad_norm": 3.8491909503936768, + "learning_rate": 3.710387037162344e-06, + "loss": 1.5778, + "step": 11711 + }, + { + "epoch": 0.8816123750917405, + "grad_norm": 4.229644775390625, + "learning_rate": 3.7057798580740656e-06, + "loss": 2.1494, + "step": 11712 + }, + { + "epoch": 0.8816876493724007, + "grad_norm": 5.165879249572754, + "learning_rate": 3.7011754310443347e-06, + "loss": 1.9515, + "step": 11713 + }, + { + "epoch": 0.8817629236530609, + "grad_norm": 5.129971027374268, + "learning_rate": 3.6965737563468884e-06, + "loss": 1.7509, + "step": 11714 + }, + { + "epoch": 0.881838197933721, + "grad_norm": 4.066309928894043, + "learning_rate": 3.691974834255263e-06, + "loss": 1.7692, + "step": 11715 + }, + { + "epoch": 0.8819134722143811, + "grad_norm": 4.80290412902832, + "learning_rate": 3.687378665042862e-06, + "loss": 1.8186, + "step": 11716 + }, + { + "epoch": 0.8819887464950413, + "grad_norm": 4.2350568771362305, + "learning_rate": 3.682785248982912e-06, + "loss": 1.8628, + "step": 11717 + }, + { + "epoch": 0.8820640207757015, + "grad_norm": 5.003052711486816, + "learning_rate": 3.678194586348488e-06, + "loss": 1.6765, + "step": 11718 + }, + { + "epoch": 0.8821392950563616, + "grad_norm": 4.9283766746521, + "learning_rate": 3.6736066774124778e-06, + "loss": 1.8051, + "step": 11719 + }, + { + "epoch": 0.8822145693370218, + "grad_norm": 3.984339475631714, + "learning_rate": 3.6690215224476297e-06, + "loss": 1.7729, + "step": 11720 + }, + { + "epoch": 0.8822898436176819, + "grad_norm": 5.933766841888428, + "learning_rate": 3.6644391217265094e-06, + "loss": 2.1696, + "step": 11721 + }, + { + "epoch": 0.882365117898342, + "grad_norm": 4.05521297454834, + "learning_rate": 3.6598594755215266e-06, + "loss": 2.1176, + "step": 11722 + }, + { + "epoch": 0.8824403921790023, + "grad_norm": 4.379974365234375, + "learning_rate": 3.6552825841049464e-06, + "loss": 1.6567, + "step": 11723 + }, + { + "epoch": 0.8825156664596624, + "grad_norm": 4.739940166473389, + "learning_rate": 3.6507084477488295e-06, + "loss": 1.5901, + "step": 11724 + }, + { + "epoch": 0.8825909407403225, + "grad_norm": 4.386242389678955, + "learning_rate": 3.646137066725108e-06, + "loss": 1.7343, + "step": 11725 + }, + { + "epoch": 0.8826662150209827, + "grad_norm": 4.930104732513428, + "learning_rate": 3.6415684413055317e-06, + "loss": 2.2461, + "step": 11726 + }, + { + "epoch": 0.8827414893016429, + "grad_norm": 5.442676067352295, + "learning_rate": 3.6370025717616886e-06, + "loss": 1.6076, + "step": 11727 + }, + { + "epoch": 0.882816763582303, + "grad_norm": 4.562854290008545, + "learning_rate": 3.632439458365017e-06, + "loss": 2.0476, + "step": 11728 + }, + { + "epoch": 0.8828920378629632, + "grad_norm": 4.101523399353027, + "learning_rate": 3.627879101386783e-06, + "loss": 1.7887, + "step": 11729 + }, + { + "epoch": 0.8829673121436233, + "grad_norm": 5.210818290710449, + "learning_rate": 3.6233215010980704e-06, + "loss": 2.0049, + "step": 11730 + }, + { + "epoch": 0.8830425864242835, + "grad_norm": 3.6276750564575195, + "learning_rate": 3.6187666577698344e-06, + "loss": 1.7847, + "step": 11731 + }, + { + "epoch": 0.8831178607049437, + "grad_norm": 6.829989910125732, + "learning_rate": 3.6142145716728305e-06, + "loss": 1.7474, + "step": 11732 + }, + { + "epoch": 0.8831931349856038, + "grad_norm": 6.399127960205078, + "learning_rate": 3.609665243077681e-06, + "loss": 1.9325, + "step": 11733 + }, + { + "epoch": 0.8832684092662639, + "grad_norm": 4.941920757293701, + "learning_rate": 3.60511867225482e-06, + "loss": 1.8576, + "step": 11734 + }, + { + "epoch": 0.883343683546924, + "grad_norm": 4.378361701965332, + "learning_rate": 3.600574859474537e-06, + "loss": 2.0939, + "step": 11735 + }, + { + "epoch": 0.8834189578275843, + "grad_norm": 4.887096881866455, + "learning_rate": 3.5960338050069318e-06, + "loss": 1.5146, + "step": 11736 + }, + { + "epoch": 0.8834942321082444, + "grad_norm": 3.7326459884643555, + "learning_rate": 3.5914955091219727e-06, + "loss": 1.5082, + "step": 11737 + }, + { + "epoch": 0.8835695063889045, + "grad_norm": 5.693533420562744, + "learning_rate": 3.5869599720894486e-06, + "loss": 2.0089, + "step": 11738 + }, + { + "epoch": 0.8836447806695648, + "grad_norm": 5.627947807312012, + "learning_rate": 3.582427194178972e-06, + "loss": 1.8153, + "step": 11739 + }, + { + "epoch": 0.8837200549502249, + "grad_norm": 6.34576416015625, + "learning_rate": 3.5778971756600277e-06, + "loss": 2.0942, + "step": 11740 + }, + { + "epoch": 0.883795329230885, + "grad_norm": 4.331240177154541, + "learning_rate": 3.5733699168018832e-06, + "loss": 1.8588, + "step": 11741 + }, + { + "epoch": 0.8838706035115452, + "grad_norm": 4.81483268737793, + "learning_rate": 3.5688454178736954e-06, + "loss": 1.7601, + "step": 11742 + }, + { + "epoch": 0.8839458777922053, + "grad_norm": 4.712187767028809, + "learning_rate": 3.56432367914441e-06, + "loss": 1.5383, + "step": 11743 + }, + { + "epoch": 0.8840211520728655, + "grad_norm": 4.57413911819458, + "learning_rate": 3.5598047008828517e-06, + "loss": 1.5656, + "step": 11744 + }, + { + "epoch": 0.8840964263535257, + "grad_norm": 5.560389518737793, + "learning_rate": 3.555288483357644e-06, + "loss": 1.9668, + "step": 11745 + }, + { + "epoch": 0.8841717006341858, + "grad_norm": 4.135054111480713, + "learning_rate": 3.550775026837283e-06, + "loss": 1.9964, + "step": 11746 + }, + { + "epoch": 0.8842469749148459, + "grad_norm": 8.078460693359375, + "learning_rate": 3.5462643315900602e-06, + "loss": 1.9363, + "step": 11747 + }, + { + "epoch": 0.8843222491955062, + "grad_norm": 5.840259075164795, + "learning_rate": 3.5417563978841273e-06, + "loss": 1.7716, + "step": 11748 + }, + { + "epoch": 0.8843975234761663, + "grad_norm": 4.86867618560791, + "learning_rate": 3.5372512259874867e-06, + "loss": 1.8895, + "step": 11749 + }, + { + "epoch": 0.8844727977568264, + "grad_norm": 4.765399932861328, + "learning_rate": 3.53274881616793e-06, + "loss": 1.7925, + "step": 11750 + }, + { + "epoch": 0.8845480720374866, + "grad_norm": 4.583217144012451, + "learning_rate": 3.5282491686931317e-06, + "loss": 1.9386, + "step": 11751 + }, + { + "epoch": 0.8846233463181468, + "grad_norm": 5.901762008666992, + "learning_rate": 3.523752283830584e-06, + "loss": 1.7171, + "step": 11752 + }, + { + "epoch": 0.8846986205988069, + "grad_norm": 5.61619234085083, + "learning_rate": 3.5192581618475997e-06, + "loss": 1.7941, + "step": 11753 + }, + { + "epoch": 0.884773894879467, + "grad_norm": 6.162242889404297, + "learning_rate": 3.514766803011349e-06, + "loss": 2.0147, + "step": 11754 + }, + { + "epoch": 0.8848491691601272, + "grad_norm": 5.188413143157959, + "learning_rate": 3.5102782075888407e-06, + "loss": 2.1021, + "step": 11755 + }, + { + "epoch": 0.8849244434407874, + "grad_norm": 5.4029011726379395, + "learning_rate": 3.5057923758468893e-06, + "loss": 1.6433, + "step": 11756 + }, + { + "epoch": 0.8849997177214475, + "grad_norm": 4.076094627380371, + "learning_rate": 3.5013093080521808e-06, + "loss": 1.8113, + "step": 11757 + }, + { + "epoch": 0.8850749920021077, + "grad_norm": 7.070972919464111, + "learning_rate": 3.496829004471208e-06, + "loss": 2.0368, + "step": 11758 + }, + { + "epoch": 0.8851502662827678, + "grad_norm": 4.31694221496582, + "learning_rate": 3.4923514653703247e-06, + "loss": 1.7314, + "step": 11759 + }, + { + "epoch": 0.885225540563428, + "grad_norm": 5.058100700378418, + "learning_rate": 3.487876691015696e-06, + "loss": 1.7466, + "step": 11760 + }, + { + "epoch": 0.8853008148440882, + "grad_norm": 4.5564117431640625, + "learning_rate": 3.483404681673341e-06, + "loss": 1.6735, + "step": 11761 + }, + { + "epoch": 0.8853760891247483, + "grad_norm": 4.933740615844727, + "learning_rate": 3.4789354376091043e-06, + "loss": 1.674, + "step": 11762 + }, + { + "epoch": 0.8854513634054084, + "grad_norm": 5.162326812744141, + "learning_rate": 3.474468959088678e-06, + "loss": 1.7968, + "step": 11763 + }, + { + "epoch": 0.8855266376860687, + "grad_norm": 5.507837772369385, + "learning_rate": 3.470005246377572e-06, + "loss": 1.7393, + "step": 11764 + }, + { + "epoch": 0.8856019119667288, + "grad_norm": 4.234638214111328, + "learning_rate": 3.465544299741147e-06, + "loss": 1.5094, + "step": 11765 + }, + { + "epoch": 0.8856771862473889, + "grad_norm": 3.986448049545288, + "learning_rate": 3.4610861194445897e-06, + "loss": 1.7611, + "step": 11766 + }, + { + "epoch": 0.8857524605280491, + "grad_norm": 6.9107184410095215, + "learning_rate": 3.456630705752928e-06, + "loss": 1.9551, + "step": 11767 + }, + { + "epoch": 0.8858277348087092, + "grad_norm": 5.278472900390625, + "learning_rate": 3.4521780589310213e-06, + "loss": 1.5861, + "step": 11768 + }, + { + "epoch": 0.8859030090893694, + "grad_norm": 3.7348806858062744, + "learning_rate": 3.4477281792435646e-06, + "loss": 1.8812, + "step": 11769 + }, + { + "epoch": 0.8859782833700296, + "grad_norm": 5.235109806060791, + "learning_rate": 3.443281066955106e-06, + "loss": 1.667, + "step": 11770 + }, + { + "epoch": 0.8860535576506897, + "grad_norm": 5.706743240356445, + "learning_rate": 3.438836722329991e-06, + "loss": 1.7574, + "step": 11771 + }, + { + "epoch": 0.8861288319313498, + "grad_norm": 4.530086994171143, + "learning_rate": 3.434395145632435e-06, + "loss": 1.5736, + "step": 11772 + }, + { + "epoch": 0.88620410621201, + "grad_norm": 4.790240287780762, + "learning_rate": 3.429956337126472e-06, + "loss": 1.8924, + "step": 11773 + }, + { + "epoch": 0.8862793804926702, + "grad_norm": 8.091974258422852, + "learning_rate": 3.42552029707599e-06, + "loss": 1.6761, + "step": 11774 + }, + { + "epoch": 0.8863546547733303, + "grad_norm": 4.775570869445801, + "learning_rate": 3.4210870257446847e-06, + "loss": 1.8182, + "step": 11775 + }, + { + "epoch": 0.8864299290539904, + "grad_norm": 4.188730239868164, + "learning_rate": 3.4166565233961113e-06, + "loss": 1.5956, + "step": 11776 + }, + { + "epoch": 0.8865052033346507, + "grad_norm": 4.084385871887207, + "learning_rate": 3.412228790293637e-06, + "loss": 1.9459, + "step": 11777 + }, + { + "epoch": 0.8865804776153108, + "grad_norm": 4.2184553146362305, + "learning_rate": 3.4078038267004953e-06, + "loss": 1.8293, + "step": 11778 + }, + { + "epoch": 0.8866557518959709, + "grad_norm": 4.290620803833008, + "learning_rate": 3.403381632879721e-06, + "loss": 1.6568, + "step": 11779 + }, + { + "epoch": 0.8867310261766311, + "grad_norm": 4.861558437347412, + "learning_rate": 3.3989622090942087e-06, + "loss": 2.1661, + "step": 11780 + }, + { + "epoch": 0.8868063004572913, + "grad_norm": 4.432613849639893, + "learning_rate": 3.394545555606682e-06, + "loss": 2.1038, + "step": 11781 + }, + { + "epoch": 0.8868815747379514, + "grad_norm": 4.658480167388916, + "learning_rate": 3.390131672679686e-06, + "loss": 1.4717, + "step": 11782 + }, + { + "epoch": 0.8869568490186116, + "grad_norm": 4.808432579040527, + "learning_rate": 3.385720560575628e-06, + "loss": 1.8406, + "step": 11783 + }, + { + "epoch": 0.8870321232992717, + "grad_norm": 6.0383734703063965, + "learning_rate": 3.3813122195567426e-06, + "loss": 1.7959, + "step": 11784 + }, + { + "epoch": 0.8871073975799318, + "grad_norm": 5.115785598754883, + "learning_rate": 3.37690664988507e-06, + "loss": 1.6087, + "step": 11785 + }, + { + "epoch": 0.8871826718605921, + "grad_norm": 5.181887626647949, + "learning_rate": 3.3725038518225228e-06, + "loss": 1.5711, + "step": 11786 + }, + { + "epoch": 0.8872579461412522, + "grad_norm": 6.70319128036499, + "learning_rate": 3.368103825630836e-06, + "loss": 1.5276, + "step": 11787 + }, + { + "epoch": 0.8873332204219123, + "grad_norm": 5.32488489151001, + "learning_rate": 3.3637065715715676e-06, + "loss": 1.6134, + "step": 11788 + }, + { + "epoch": 0.8874084947025725, + "grad_norm": 3.7461555004119873, + "learning_rate": 3.3593120899061413e-06, + "loss": 1.6014, + "step": 11789 + }, + { + "epoch": 0.8874837689832327, + "grad_norm": 3.944082260131836, + "learning_rate": 3.354920380895771e-06, + "loss": 1.7107, + "step": 11790 + }, + { + "epoch": 0.8875590432638928, + "grad_norm": 4.842274188995361, + "learning_rate": 3.350531444801558e-06, + "loss": 2.0498, + "step": 11791 + }, + { + "epoch": 0.887634317544553, + "grad_norm": 4.317521095275879, + "learning_rate": 3.3461452818843887e-06, + "loss": 1.7203, + "step": 11792 + }, + { + "epoch": 0.8877095918252131, + "grad_norm": 6.183497428894043, + "learning_rate": 3.3417618924050153e-06, + "loss": 1.9386, + "step": 11793 + }, + { + "epoch": 0.8877848661058733, + "grad_norm": 3.985163450241089, + "learning_rate": 3.3373812766240188e-06, + "loss": 1.8251, + "step": 11794 + }, + { + "epoch": 0.8878601403865334, + "grad_norm": 4.228626728057861, + "learning_rate": 3.3330034348018235e-06, + "loss": 2.1026, + "step": 11795 + }, + { + "epoch": 0.8879354146671936, + "grad_norm": 7.4498090744018555, + "learning_rate": 3.3286283671986664e-06, + "loss": 1.6393, + "step": 11796 + }, + { + "epoch": 0.8880106889478537, + "grad_norm": 4.097780227661133, + "learning_rate": 3.324256074074644e-06, + "loss": 1.5087, + "step": 11797 + }, + { + "epoch": 0.8880859632285139, + "grad_norm": 5.756229400634766, + "learning_rate": 3.319886555689661e-06, + "loss": 2.0575, + "step": 11798 + }, + { + "epoch": 0.8881612375091741, + "grad_norm": 3.740021228790283, + "learning_rate": 3.315519812303491e-06, + "loss": 1.8686, + "step": 11799 + }, + { + "epoch": 0.8882365117898342, + "grad_norm": 5.576794147491455, + "learning_rate": 3.311155844175712e-06, + "loss": 1.6125, + "step": 11800 + }, + { + "epoch": 0.8883117860704943, + "grad_norm": 5.809225559234619, + "learning_rate": 3.3067946515657532e-06, + "loss": 1.7876, + "step": 11801 + }, + { + "epoch": 0.8883870603511546, + "grad_norm": 3.8192005157470703, + "learning_rate": 3.3024362347328697e-06, + "loss": 1.5767, + "step": 11802 + }, + { + "epoch": 0.8884623346318147, + "grad_norm": 5.374025344848633, + "learning_rate": 3.2980805939361646e-06, + "loss": 1.7612, + "step": 11803 + }, + { + "epoch": 0.8885376089124748, + "grad_norm": 5.49501895904541, + "learning_rate": 3.2937277294345703e-06, + "loss": 1.6202, + "step": 11804 + }, + { + "epoch": 0.888612883193135, + "grad_norm": 5.229307174682617, + "learning_rate": 3.2893776414868406e-06, + "loss": 2.0495, + "step": 11805 + }, + { + "epoch": 0.8886881574737951, + "grad_norm": 5.162867069244385, + "learning_rate": 3.2850303303515797e-06, + "loss": 1.7861, + "step": 11806 + }, + { + "epoch": 0.8887634317544553, + "grad_norm": 5.295732498168945, + "learning_rate": 3.280685796287225e-06, + "loss": 1.8394, + "step": 11807 + }, + { + "epoch": 0.8888387060351155, + "grad_norm": 6.164709091186523, + "learning_rate": 3.2763440395520593e-06, + "loss": 1.7819, + "step": 11808 + }, + { + "epoch": 0.8889139803157756, + "grad_norm": 4.862895965576172, + "learning_rate": 3.272005060404165e-06, + "loss": 1.8755, + "step": 11809 + }, + { + "epoch": 0.8889892545964357, + "grad_norm": 5.777836799621582, + "learning_rate": 3.267668859101497e-06, + "loss": 2.0372, + "step": 11810 + }, + { + "epoch": 0.889064528877096, + "grad_norm": 4.18715763092041, + "learning_rate": 3.263335435901821e-06, + "loss": 1.803, + "step": 11811 + }, + { + "epoch": 0.8891398031577561, + "grad_norm": 4.8020734786987305, + "learning_rate": 3.2590047910627585e-06, + "loss": 1.5089, + "step": 11812 + }, + { + "epoch": 0.8892150774384162, + "grad_norm": 6.14511251449585, + "learning_rate": 3.2546769248417374e-06, + "loss": 2.6288, + "step": 11813 + }, + { + "epoch": 0.8892903517190763, + "grad_norm": 4.045788288116455, + "learning_rate": 3.250351837496057e-06, + "loss": 1.8121, + "step": 11814 + }, + { + "epoch": 0.8893656259997366, + "grad_norm": 5.799196243286133, + "learning_rate": 3.2460295292828067e-06, + "loss": 1.7446, + "step": 11815 + }, + { + "epoch": 0.8894409002803967, + "grad_norm": 4.424937725067139, + "learning_rate": 3.241710000458953e-06, + "loss": 1.8113, + "step": 11816 + }, + { + "epoch": 0.8895161745610568, + "grad_norm": 7.677599906921387, + "learning_rate": 3.2373932512812733e-06, + "loss": 1.9577, + "step": 11817 + }, + { + "epoch": 0.889591448841717, + "grad_norm": 4.3186750411987305, + "learning_rate": 3.2330792820063905e-06, + "loss": 1.5709, + "step": 11818 + }, + { + "epoch": 0.8896667231223772, + "grad_norm": 4.3186750411987305, + "learning_rate": 3.2330792820063905e-06, + "loss": 2.0441, + "step": 11819 + }, + { + "epoch": 0.8897419974030373, + "grad_norm": 4.3186750411987305, + "learning_rate": 3.2330792820063905e-06, + "loss": 1.826, + "step": 11820 + }, + { + "epoch": 0.8898172716836975, + "grad_norm": 5.141098976135254, + "learning_rate": 3.228768092890755e-06, + "loss": 1.7428, + "step": 11821 + }, + { + "epoch": 0.8898925459643576, + "grad_norm": 7.1366753578186035, + "learning_rate": 3.224459684190656e-06, + "loss": 2.0392, + "step": 11822 + }, + { + "epoch": 0.8899678202450177, + "grad_norm": 3.4166417121887207, + "learning_rate": 3.220154056162217e-06, + "loss": 1.7084, + "step": 11823 + }, + { + "epoch": 0.890043094525678, + "grad_norm": 4.907994747161865, + "learning_rate": 3.215851209061388e-06, + "loss": 1.8153, + "step": 11824 + }, + { + "epoch": 0.8901183688063381, + "grad_norm": 3.8626811504364014, + "learning_rate": 3.2115511431439704e-06, + "loss": 1.3738, + "step": 11825 + }, + { + "epoch": 0.8901936430869982, + "grad_norm": 6.657654285430908, + "learning_rate": 3.207253858665582e-06, + "loss": 1.7999, + "step": 11826 + }, + { + "epoch": 0.8902689173676585, + "grad_norm": 5.0645270347595215, + "learning_rate": 3.202959355881696e-06, + "loss": 1.9608, + "step": 11827 + }, + { + "epoch": 0.8903441916483186, + "grad_norm": 4.2301154136657715, + "learning_rate": 3.198667635047592e-06, + "loss": 1.7449, + "step": 11828 + }, + { + "epoch": 0.8904194659289787, + "grad_norm": 4.87670373916626, + "learning_rate": 3.1943786964184043e-06, + "loss": 1.9404, + "step": 11829 + }, + { + "epoch": 0.8904947402096389, + "grad_norm": 3.7896482944488525, + "learning_rate": 3.1900925402491067e-06, + "loss": 2.0846, + "step": 11830 + }, + { + "epoch": 0.890570014490299, + "grad_norm": 5.157933712005615, + "learning_rate": 3.185809166794507e-06, + "loss": 1.7528, + "step": 11831 + }, + { + "epoch": 0.8906452887709592, + "grad_norm": 3.8069164752960205, + "learning_rate": 3.1815285763092118e-06, + "loss": 1.9564, + "step": 11832 + }, + { + "epoch": 0.8907205630516193, + "grad_norm": 3.3343169689178467, + "learning_rate": 3.177250769047718e-06, + "loss": 1.7407, + "step": 11833 + }, + { + "epoch": 0.8907958373322795, + "grad_norm": 3.9978039264678955, + "learning_rate": 3.172975745264306e-06, + "loss": 1.7015, + "step": 11834 + }, + { + "epoch": 0.8908711116129396, + "grad_norm": 4.346434116363525, + "learning_rate": 3.168703505213133e-06, + "loss": 1.9475, + "step": 11835 + }, + { + "epoch": 0.8909463858935998, + "grad_norm": 4.142141342163086, + "learning_rate": 3.1644340491481574e-06, + "loss": 1.7142, + "step": 11836 + }, + { + "epoch": 0.89102166017426, + "grad_norm": 4.459104537963867, + "learning_rate": 3.1601673773231867e-06, + "loss": 1.8307, + "step": 11837 + }, + { + "epoch": 0.8910969344549201, + "grad_norm": 3.816215991973877, + "learning_rate": 3.1559034899918794e-06, + "loss": 1.7033, + "step": 11838 + }, + { + "epoch": 0.8911722087355802, + "grad_norm": 5.893944263458252, + "learning_rate": 3.1516423874076886e-06, + "loss": 1.8463, + "step": 11839 + }, + { + "epoch": 0.8912474830162405, + "grad_norm": 5.152439117431641, + "learning_rate": 3.1473840698239334e-06, + "loss": 1.7638, + "step": 11840 + }, + { + "epoch": 0.8913227572969006, + "grad_norm": 4.627366065979004, + "learning_rate": 3.1431285374937614e-06, + "loss": 1.5939, + "step": 11841 + }, + { + "epoch": 0.8913980315775607, + "grad_norm": 4.759149551391602, + "learning_rate": 3.1388757906701595e-06, + "loss": 2.0037, + "step": 11842 + }, + { + "epoch": 0.8914733058582209, + "grad_norm": 4.542873382568359, + "learning_rate": 3.134625829605925e-06, + "loss": 1.803, + "step": 11843 + }, + { + "epoch": 0.891548580138881, + "grad_norm": 4.590996742248535, + "learning_rate": 3.1303786545537173e-06, + "loss": 1.6505, + "step": 11844 + }, + { + "epoch": 0.8916238544195412, + "grad_norm": 4.327089786529541, + "learning_rate": 3.1261342657660063e-06, + "loss": 1.7829, + "step": 11845 + }, + { + "epoch": 0.8916991287002014, + "grad_norm": 5.363874912261963, + "learning_rate": 3.1218926634951296e-06, + "loss": 1.6933, + "step": 11846 + }, + { + "epoch": 0.8917744029808615, + "grad_norm": 6.655374050140381, + "learning_rate": 3.117653847993213e-06, + "loss": 2.0365, + "step": 11847 + }, + { + "epoch": 0.8918496772615216, + "grad_norm": 4.274673938751221, + "learning_rate": 3.1134178195122653e-06, + "loss": 1.4952, + "step": 11848 + }, + { + "epoch": 0.8919249515421819, + "grad_norm": 4.389845371246338, + "learning_rate": 3.109184578304086e-06, + "loss": 1.841, + "step": 11849 + }, + { + "epoch": 0.892000225822842, + "grad_norm": 4.642415523529053, + "learning_rate": 3.1049541246203406e-06, + "loss": 1.7275, + "step": 11850 + }, + { + "epoch": 0.8920755001035021, + "grad_norm": 5.653261661529541, + "learning_rate": 3.1007264587125163e-06, + "loss": 1.4653, + "step": 11851 + }, + { + "epoch": 0.8921507743841622, + "grad_norm": 5.971973896026611, + "learning_rate": 3.09650158083194e-06, + "loss": 1.4789, + "step": 11852 + }, + { + "epoch": 0.8922260486648225, + "grad_norm": 4.13915491104126, + "learning_rate": 3.0922794912297605e-06, + "loss": 1.6234, + "step": 11853 + }, + { + "epoch": 0.8923013229454826, + "grad_norm": 5.602231979370117, + "learning_rate": 3.088060190156966e-06, + "loss": 1.6539, + "step": 11854 + }, + { + "epoch": 0.8923765972261427, + "grad_norm": 5.522675514221191, + "learning_rate": 3.0838436778643954e-06, + "loss": 1.5568, + "step": 11855 + }, + { + "epoch": 0.8924518715068029, + "grad_norm": 4.610293865203857, + "learning_rate": 3.0796299546026976e-06, + "loss": 1.7234, + "step": 11856 + }, + { + "epoch": 0.8925271457874631, + "grad_norm": 6.0704216957092285, + "learning_rate": 3.0754190206223776e-06, + "loss": 1.9192, + "step": 11857 + }, + { + "epoch": 0.8926024200681232, + "grad_norm": 6.7875494956970215, + "learning_rate": 3.0712108761737413e-06, + "loss": 1.7332, + "step": 11858 + }, + { + "epoch": 0.8926776943487834, + "grad_norm": 6.020510673522949, + "learning_rate": 3.0670055215069772e-06, + "loss": 2.4567, + "step": 11859 + }, + { + "epoch": 0.8927529686294435, + "grad_norm": 4.295178413391113, + "learning_rate": 3.062802956872057e-06, + "loss": 2.0064, + "step": 11860 + }, + { + "epoch": 0.8928282429101037, + "grad_norm": 5.418540954589844, + "learning_rate": 3.058603182518832e-06, + "loss": 1.89, + "step": 11861 + }, + { + "epoch": 0.8929035171907639, + "grad_norm": 4.2862677574157715, + "learning_rate": 3.054406198696952e-06, + "loss": 2.0019, + "step": 11862 + }, + { + "epoch": 0.892978791471424, + "grad_norm": 4.884140968322754, + "learning_rate": 3.0502120056559223e-06, + "loss": 1.8629, + "step": 11863 + }, + { + "epoch": 0.8930540657520841, + "grad_norm": 5.246596813201904, + "learning_rate": 3.0460206036450724e-06, + "loss": 1.8386, + "step": 11864 + }, + { + "epoch": 0.8931293400327444, + "grad_norm": 5.6330695152282715, + "learning_rate": 3.0418319929135854e-06, + "loss": 1.8794, + "step": 11865 + }, + { + "epoch": 0.8932046143134045, + "grad_norm": 5.535592555999756, + "learning_rate": 3.0376461737104345e-06, + "loss": 1.7446, + "step": 11866 + }, + { + "epoch": 0.8932798885940646, + "grad_norm": 7.210932731628418, + "learning_rate": 3.0334631462844766e-06, + "loss": 1.8556, + "step": 11867 + }, + { + "epoch": 0.8933551628747248, + "grad_norm": 5.280955791473389, + "learning_rate": 3.0292829108843625e-06, + "loss": 1.8282, + "step": 11868 + }, + { + "epoch": 0.893430437155385, + "grad_norm": 4.655411243438721, + "learning_rate": 3.02510546775861e-06, + "loss": 1.9196, + "step": 11869 + }, + { + "epoch": 0.8935057114360451, + "grad_norm": 4.468110084533691, + "learning_rate": 3.0209308171555593e-06, + "loss": 1.9959, + "step": 11870 + }, + { + "epoch": 0.8935809857167052, + "grad_norm": 5.1010332107543945, + "learning_rate": 3.0167589593233624e-06, + "loss": 1.9308, + "step": 11871 + }, + { + "epoch": 0.8936562599973654, + "grad_norm": 3.653269052505493, + "learning_rate": 3.0125898945100427e-06, + "loss": 1.913, + "step": 11872 + }, + { + "epoch": 0.8937315342780255, + "grad_norm": 5.934589385986328, + "learning_rate": 3.0084236229634244e-06, + "loss": 1.7819, + "step": 11873 + }, + { + "epoch": 0.8938068085586857, + "grad_norm": 4.32749605178833, + "learning_rate": 3.0042601449311925e-06, + "loss": 2.0242, + "step": 11874 + }, + { + "epoch": 0.8938820828393459, + "grad_norm": 4.6355977058410645, + "learning_rate": 3.0000994606608433e-06, + "loss": 1.6809, + "step": 11875 + }, + { + "epoch": 0.893957357120006, + "grad_norm": 4.898728370666504, + "learning_rate": 2.995941570399735e-06, + "loss": 1.9374, + "step": 11876 + }, + { + "epoch": 0.8940326314006661, + "grad_norm": 4.161886692047119, + "learning_rate": 2.9917864743950195e-06, + "loss": 1.6023, + "step": 11877 + }, + { + "epoch": 0.8941079056813264, + "grad_norm": 5.221473693847656, + "learning_rate": 2.987634172893727e-06, + "loss": 1.9183, + "step": 11878 + }, + { + "epoch": 0.8941831799619865, + "grad_norm": 4.498020172119141, + "learning_rate": 2.983484666142683e-06, + "loss": 1.8458, + "step": 11879 + }, + { + "epoch": 0.8942584542426466, + "grad_norm": 5.728368282318115, + "learning_rate": 2.9793379543885724e-06, + "loss": 2.0314, + "step": 11880 + }, + { + "epoch": 0.8943337285233068, + "grad_norm": 5.019113063812256, + "learning_rate": 2.9751940378778986e-06, + "loss": 1.5129, + "step": 11881 + }, + { + "epoch": 0.894409002803967, + "grad_norm": 4.794315338134766, + "learning_rate": 2.97105291685702e-06, + "loss": 1.6153, + "step": 11882 + }, + { + "epoch": 0.8944842770846271, + "grad_norm": 4.009993076324463, + "learning_rate": 2.96691459157209e-06, + "loss": 1.6454, + "step": 11883 + }, + { + "epoch": 0.8945595513652873, + "grad_norm": 5.24515438079834, + "learning_rate": 2.9627790622691387e-06, + "loss": 1.7421, + "step": 11884 + }, + { + "epoch": 0.8946348256459474, + "grad_norm": 6.472077369689941, + "learning_rate": 2.9586463291940093e-06, + "loss": 1.705, + "step": 11885 + }, + { + "epoch": 0.8947100999266075, + "grad_norm": 4.370456218719482, + "learning_rate": 2.9545163925923767e-06, + "loss": 2.1343, + "step": 11886 + }, + { + "epoch": 0.8947853742072678, + "grad_norm": 7.8226141929626465, + "learning_rate": 2.950389252709762e-06, + "loss": 2.1813, + "step": 11887 + }, + { + "epoch": 0.8948606484879279, + "grad_norm": 4.593178749084473, + "learning_rate": 2.946264909791502e-06, + "loss": 1.6944, + "step": 11888 + }, + { + "epoch": 0.894935922768588, + "grad_norm": 4.995981693267822, + "learning_rate": 2.9421433640827834e-06, + "loss": 1.4935, + "step": 11889 + }, + { + "epoch": 0.8950111970492483, + "grad_norm": 6.428556442260742, + "learning_rate": 2.9380246158286105e-06, + "loss": 1.6151, + "step": 11890 + }, + { + "epoch": 0.8950864713299084, + "grad_norm": 4.174126148223877, + "learning_rate": 2.933908665273849e-06, + "loss": 1.7006, + "step": 11891 + }, + { + "epoch": 0.8951617456105685, + "grad_norm": 4.930836200714111, + "learning_rate": 2.929795512663164e-06, + "loss": 2.0235, + "step": 11892 + }, + { + "epoch": 0.8952370198912286, + "grad_norm": 4.087159633636475, + "learning_rate": 2.925685158241076e-06, + "loss": 2.0748, + "step": 11893 + }, + { + "epoch": 0.8953122941718888, + "grad_norm": 4.530377388000488, + "learning_rate": 2.9215776022519347e-06, + "loss": 1.7235, + "step": 11894 + }, + { + "epoch": 0.895387568452549, + "grad_norm": 6.479909896850586, + "learning_rate": 2.917472844939917e-06, + "loss": 1.657, + "step": 11895 + }, + { + "epoch": 0.8954628427332091, + "grad_norm": 4.67534065246582, + "learning_rate": 2.9133708865490438e-06, + "loss": 1.759, + "step": 11896 + }, + { + "epoch": 0.8955381170138693, + "grad_norm": 3.5963919162750244, + "learning_rate": 2.9092717273231764e-06, + "loss": 1.7908, + "step": 11897 + }, + { + "epoch": 0.8956133912945294, + "grad_norm": 5.5777106285095215, + "learning_rate": 2.9051753675059745e-06, + "loss": 1.886, + "step": 11898 + }, + { + "epoch": 0.8956886655751896, + "grad_norm": 3.829454183578491, + "learning_rate": 2.901081807340977e-06, + "loss": 1.6295, + "step": 11899 + }, + { + "epoch": 0.8957639398558498, + "grad_norm": 6.346753120422363, + "learning_rate": 2.8969910470715122e-06, + "loss": 1.6911, + "step": 11900 + }, + { + "epoch": 0.8958392141365099, + "grad_norm": 4.225761890411377, + "learning_rate": 2.892903086940779e-06, + "loss": 1.686, + "step": 11901 + }, + { + "epoch": 0.89591448841717, + "grad_norm": 5.266272068023682, + "learning_rate": 2.8888179271918e-06, + "loss": 1.3056, + "step": 11902 + }, + { + "epoch": 0.8959897626978303, + "grad_norm": 6.298349857330322, + "learning_rate": 2.8847355680674094e-06, + "loss": 1.7288, + "step": 11903 + }, + { + "epoch": 0.8960650369784904, + "grad_norm": 4.389970779418945, + "learning_rate": 2.8806560098103117e-06, + "loss": 1.755, + "step": 11904 + }, + { + "epoch": 0.8961403112591505, + "grad_norm": 7.884381294250488, + "learning_rate": 2.876579252663003e-06, + "loss": 1.5247, + "step": 11905 + }, + { + "epoch": 0.8962155855398107, + "grad_norm": 6.21115779876709, + "learning_rate": 2.8725052968678502e-06, + "loss": 1.9159, + "step": 11906 + }, + { + "epoch": 0.8962908598204709, + "grad_norm": 7.484222888946533, + "learning_rate": 2.8684341426670316e-06, + "loss": 1.481, + "step": 11907 + }, + { + "epoch": 0.896366134101131, + "grad_norm": 4.33501672744751, + "learning_rate": 2.8643657903025647e-06, + "loss": 1.8524, + "step": 11908 + }, + { + "epoch": 0.8964414083817912, + "grad_norm": 3.8369853496551514, + "learning_rate": 2.8603002400163003e-06, + "loss": 1.6306, + "step": 11909 + }, + { + "epoch": 0.8965166826624513, + "grad_norm": 4.877063751220703, + "learning_rate": 2.85623749204994e-06, + "loss": 1.9345, + "step": 11910 + }, + { + "epoch": 0.8965919569431114, + "grad_norm": 5.807718753814697, + "learning_rate": 2.852177546644985e-06, + "loss": 1.8559, + "step": 11911 + }, + { + "epoch": 0.8966672312237716, + "grad_norm": 4.768845558166504, + "learning_rate": 2.8481204040427923e-06, + "loss": 1.6579, + "step": 11912 + }, + { + "epoch": 0.8967425055044318, + "grad_norm": 5.8899827003479, + "learning_rate": 2.844066064484546e-06, + "loss": 2.2638, + "step": 11913 + }, + { + "epoch": 0.8968177797850919, + "grad_norm": 4.5548810958862305, + "learning_rate": 2.8400145282112713e-06, + "loss": 1.548, + "step": 11914 + }, + { + "epoch": 0.896893054065752, + "grad_norm": 4.679693222045898, + "learning_rate": 2.835965795463813e-06, + "loss": 1.6485, + "step": 11915 + }, + { + "epoch": 0.8969683283464123, + "grad_norm": 5.313938140869141, + "learning_rate": 2.8319198664828628e-06, + "loss": 2.1556, + "step": 11916 + }, + { + "epoch": 0.8970436026270724, + "grad_norm": 5.710687160491943, + "learning_rate": 2.827876741508928e-06, + "loss": 1.8302, + "step": 11917 + }, + { + "epoch": 0.8971188769077325, + "grad_norm": 7.161043167114258, + "learning_rate": 2.823836420782372e-06, + "loss": 2.0583, + "step": 11918 + }, + { + "epoch": 0.8971941511883927, + "grad_norm": 4.390212535858154, + "learning_rate": 2.819798904543375e-06, + "loss": 2.0067, + "step": 11919 + }, + { + "epoch": 0.8972694254690529, + "grad_norm": 6.0757269859313965, + "learning_rate": 2.8157641930319613e-06, + "loss": 1.7676, + "step": 11920 + }, + { + "epoch": 0.897344699749713, + "grad_norm": 6.575694561004639, + "learning_rate": 2.8117322864879834e-06, + "loss": 1.6393, + "step": 11921 + }, + { + "epoch": 0.8974199740303732, + "grad_norm": 3.7963900566101074, + "learning_rate": 2.8077031851511173e-06, + "loss": 1.8968, + "step": 11922 + }, + { + "epoch": 0.8974952483110333, + "grad_norm": 5.335954189300537, + "learning_rate": 2.8036768892608924e-06, + "loss": 1.8378, + "step": 11923 + }, + { + "epoch": 0.8975705225916935, + "grad_norm": 6.314944744110107, + "learning_rate": 2.7996533990566454e-06, + "loss": 2.0751, + "step": 11924 + }, + { + "epoch": 0.8976457968723537, + "grad_norm": 4.7898850440979, + "learning_rate": 2.795632714777585e-06, + "loss": 1.5771, + "step": 11925 + }, + { + "epoch": 0.8977210711530138, + "grad_norm": 9.502776145935059, + "learning_rate": 2.791614836662704e-06, + "loss": 2.3836, + "step": 11926 + }, + { + "epoch": 0.8977963454336739, + "grad_norm": 5.735191822052002, + "learning_rate": 2.7875997649508713e-06, + "loss": 1.8338, + "step": 11927 + }, + { + "epoch": 0.8978716197143342, + "grad_norm": 3.998290777206421, + "learning_rate": 2.7835874998807576e-06, + "loss": 2.0137, + "step": 11928 + }, + { + "epoch": 0.8979468939949943, + "grad_norm": 4.785402297973633, + "learning_rate": 2.779578041690889e-06, + "loss": 1.5864, + "step": 11929 + }, + { + "epoch": 0.8980221682756544, + "grad_norm": 4.234623432159424, + "learning_rate": 2.7755713906196134e-06, + "loss": 1.6581, + "step": 11930 + }, + { + "epoch": 0.8980974425563145, + "grad_norm": 4.966156005859375, + "learning_rate": 2.7715675469051293e-06, + "loss": 1.6874, + "step": 11931 + }, + { + "epoch": 0.8981727168369747, + "grad_norm": 5.304784774780273, + "learning_rate": 2.7675665107854298e-06, + "loss": 2.6467, + "step": 11932 + }, + { + "epoch": 0.8982479911176349, + "grad_norm": 4.455078601837158, + "learning_rate": 2.7635682824983743e-06, + "loss": 1.4891, + "step": 11933 + }, + { + "epoch": 0.898323265398295, + "grad_norm": 6.032569885253906, + "learning_rate": 2.759572862281662e-06, + "loss": 1.3525, + "step": 11934 + }, + { + "epoch": 0.8983985396789552, + "grad_norm": 7.020341873168945, + "learning_rate": 2.75558025037278e-06, + "loss": 1.7543, + "step": 11935 + }, + { + "epoch": 0.8984738139596153, + "grad_norm": 3.9947221279144287, + "learning_rate": 2.751590447009106e-06, + "loss": 1.8887, + "step": 11936 + }, + { + "epoch": 0.8985490882402755, + "grad_norm": 4.335216045379639, + "learning_rate": 2.747603452427794e-06, + "loss": 1.5954, + "step": 11937 + }, + { + "epoch": 0.8986243625209357, + "grad_norm": 6.364669322967529, + "learning_rate": 2.7436192668658876e-06, + "loss": 1.7033, + "step": 11938 + }, + { + "epoch": 0.8986996368015958, + "grad_norm": 4.1729888916015625, + "learning_rate": 2.739637890560215e-06, + "loss": 2.0867, + "step": 11939 + }, + { + "epoch": 0.8987749110822559, + "grad_norm": 3.890747547149658, + "learning_rate": 2.7356593237474636e-06, + "loss": 1.8854, + "step": 11940 + }, + { + "epoch": 0.8988501853629162, + "grad_norm": 6.1546406745910645, + "learning_rate": 2.7316835666641504e-06, + "loss": 1.7402, + "step": 11941 + }, + { + "epoch": 0.8989254596435763, + "grad_norm": 4.240593433380127, + "learning_rate": 2.727710619546625e-06, + "loss": 1.5398, + "step": 11942 + }, + { + "epoch": 0.8990007339242364, + "grad_norm": 5.984617233276367, + "learning_rate": 2.72374048263106e-06, + "loss": 1.9002, + "step": 11943 + }, + { + "epoch": 0.8990760082048966, + "grad_norm": 4.601238250732422, + "learning_rate": 2.7197731561534767e-06, + "loss": 1.7549, + "step": 11944 + }, + { + "epoch": 0.8991512824855568, + "grad_norm": 4.546549320220947, + "learning_rate": 2.7158086403497097e-06, + "loss": 2.1923, + "step": 11945 + }, + { + "epoch": 0.8992265567662169, + "grad_norm": 7.224729537963867, + "learning_rate": 2.711846935455453e-06, + "loss": 1.619, + "step": 11946 + }, + { + "epoch": 0.8993018310468771, + "grad_norm": 4.989003658294678, + "learning_rate": 2.707888041706208e-06, + "loss": 1.9677, + "step": 11947 + }, + { + "epoch": 0.8993771053275372, + "grad_norm": 4.853872776031494, + "learning_rate": 2.7039319593373246e-06, + "loss": 1.7004, + "step": 11948 + }, + { + "epoch": 0.8994523796081973, + "grad_norm": 7.603703022003174, + "learning_rate": 2.699978688583976e-06, + "loss": 2.2507, + "step": 11949 + }, + { + "epoch": 0.8995276538888575, + "grad_norm": 7.670825004577637, + "learning_rate": 2.696028229681169e-06, + "loss": 1.9575, + "step": 11950 + }, + { + "epoch": 0.8996029281695177, + "grad_norm": 5.305761337280273, + "learning_rate": 2.69208058286376e-06, + "loss": 2.1033, + "step": 11951 + }, + { + "epoch": 0.8996782024501778, + "grad_norm": 5.928761005401611, + "learning_rate": 2.6881357483664216e-06, + "loss": 1.7812, + "step": 11952 + }, + { + "epoch": 0.8997534767308379, + "grad_norm": 4.458364009857178, + "learning_rate": 2.684193726423656e-06, + "loss": 1.6437, + "step": 11953 + }, + { + "epoch": 0.8998287510114982, + "grad_norm": 7.592646598815918, + "learning_rate": 2.680254517269809e-06, + "loss": 1.8663, + "step": 11954 + }, + { + "epoch": 0.8999040252921583, + "grad_norm": 3.8419203758239746, + "learning_rate": 2.6763181211390596e-06, + "loss": 1.8588, + "step": 11955 + }, + { + "epoch": 0.8999792995728184, + "grad_norm": 5.933088302612305, + "learning_rate": 2.6723845382654043e-06, + "loss": 2.005, + "step": 11956 + }, + { + "epoch": 0.9000545738534786, + "grad_norm": 5.551792621612549, + "learning_rate": 2.6684537688827006e-06, + "loss": 1.7234, + "step": 11957 + }, + { + "epoch": 0.9001298481341388, + "grad_norm": 4.715183258056641, + "learning_rate": 2.6645258132245997e-06, + "loss": 1.5114, + "step": 11958 + }, + { + "epoch": 0.9002051224147989, + "grad_norm": 5.215699195861816, + "learning_rate": 2.6606006715246268e-06, + "loss": 1.8891, + "step": 11959 + }, + { + "epoch": 0.9002803966954591, + "grad_norm": 4.994283676147461, + "learning_rate": 2.656678344016106e-06, + "loss": 1.608, + "step": 11960 + }, + { + "epoch": 0.9003556709761192, + "grad_norm": 5.674286842346191, + "learning_rate": 2.6527588309322226e-06, + "loss": 1.976, + "step": 11961 + }, + { + "epoch": 0.9004309452567794, + "grad_norm": 5.190013408660889, + "learning_rate": 2.648842132505963e-06, + "loss": 1.5415, + "step": 11962 + }, + { + "epoch": 0.9005062195374396, + "grad_norm": 4.652156829833984, + "learning_rate": 2.6449282489701744e-06, + "loss": 1.9476, + "step": 11963 + }, + { + "epoch": 0.9005814938180997, + "grad_norm": 5.031260013580322, + "learning_rate": 2.6410171805575203e-06, + "loss": 1.5048, + "step": 11964 + }, + { + "epoch": 0.9006567680987598, + "grad_norm": 4.505012035369873, + "learning_rate": 2.637108927500509e-06, + "loss": 1.73, + "step": 11965 + }, + { + "epoch": 0.9007320423794201, + "grad_norm": 5.676845550537109, + "learning_rate": 2.6332034900314827e-06, + "loss": 1.7342, + "step": 11966 + }, + { + "epoch": 0.9008073166600802, + "grad_norm": 4.564072608947754, + "learning_rate": 2.6293008683825893e-06, + "loss": 2.0508, + "step": 11967 + }, + { + "epoch": 0.9008825909407403, + "grad_norm": 4.7408366203308105, + "learning_rate": 2.6254010627858426e-06, + "loss": 1.7701, + "step": 11968 + }, + { + "epoch": 0.9009578652214005, + "grad_norm": 4.5672221183776855, + "learning_rate": 2.6215040734730633e-06, + "loss": 2.12, + "step": 11969 + }, + { + "epoch": 0.9010331395020607, + "grad_norm": 4.8732781410217285, + "learning_rate": 2.6176099006759324e-06, + "loss": 1.7931, + "step": 11970 + }, + { + "epoch": 0.9011084137827208, + "grad_norm": 5.545689105987549, + "learning_rate": 2.6137185446259315e-06, + "loss": 1.6724, + "step": 11971 + }, + { + "epoch": 0.9011836880633809, + "grad_norm": 5.86617374420166, + "learning_rate": 2.6098300055543978e-06, + "loss": 1.9748, + "step": 11972 + }, + { + "epoch": 0.9012589623440411, + "grad_norm": 5.060419082641602, + "learning_rate": 2.6059442836924906e-06, + "loss": 1.7987, + "step": 11973 + }, + { + "epoch": 0.9013342366247012, + "grad_norm": 5.573587417602539, + "learning_rate": 2.6020613792712032e-06, + "loss": 1.3434, + "step": 11974 + }, + { + "epoch": 0.9014095109053614, + "grad_norm": 3.9352023601531982, + "learning_rate": 2.598181292521373e-06, + "loss": 1.577, + "step": 11975 + }, + { + "epoch": 0.9014847851860216, + "grad_norm": 4.468507766723633, + "learning_rate": 2.594304023673655e-06, + "loss": 1.8203, + "step": 11976 + }, + { + "epoch": 0.9015600594666817, + "grad_norm": 4.273713111877441, + "learning_rate": 2.590429572958536e-06, + "loss": 1.6355, + "step": 11977 + }, + { + "epoch": 0.9016353337473418, + "grad_norm": 4.349310874938965, + "learning_rate": 2.5865579406063488e-06, + "loss": 1.5269, + "step": 11978 + }, + { + "epoch": 0.9017106080280021, + "grad_norm": 4.628576278686523, + "learning_rate": 2.5826891268472485e-06, + "loss": 1.8985, + "step": 11979 + }, + { + "epoch": 0.9017858823086622, + "grad_norm": 5.092843532562256, + "learning_rate": 2.578823131911223e-06, + "loss": 1.934, + "step": 11980 + }, + { + "epoch": 0.9018611565893223, + "grad_norm": 5.034144878387451, + "learning_rate": 2.5749599560280947e-06, + "loss": 1.7996, + "step": 11981 + }, + { + "epoch": 0.9019364308699825, + "grad_norm": 4.841254234313965, + "learning_rate": 2.5710995994275176e-06, + "loss": 1.7984, + "step": 11982 + }, + { + "epoch": 0.9020117051506427, + "grad_norm": 4.680467128753662, + "learning_rate": 2.567242062338987e-06, + "loss": 1.9285, + "step": 11983 + }, + { + "epoch": 0.9020869794313028, + "grad_norm": 4.606339931488037, + "learning_rate": 2.563387344991808e-06, + "loss": 1.668, + "step": 11984 + }, + { + "epoch": 0.902162253711963, + "grad_norm": 4.296433448791504, + "learning_rate": 2.5595354476151413e-06, + "loss": 1.4844, + "step": 11985 + }, + { + "epoch": 0.9022375279926231, + "grad_norm": 6.967336177825928, + "learning_rate": 2.555686370437971e-06, + "loss": 1.5619, + "step": 11986 + }, + { + "epoch": 0.9023128022732833, + "grad_norm": 4.634474277496338, + "learning_rate": 2.5518401136891134e-06, + "loss": 1.5101, + "step": 11987 + }, + { + "epoch": 0.9023880765539435, + "grad_norm": 5.470510959625244, + "learning_rate": 2.5479966775972198e-06, + "loss": 2.1155, + "step": 11988 + }, + { + "epoch": 0.9024633508346036, + "grad_norm": 4.887975692749023, + "learning_rate": 2.544156062390768e-06, + "loss": 1.6347, + "step": 11989 + }, + { + "epoch": 0.9025386251152637, + "grad_norm": 7.093113899230957, + "learning_rate": 2.54031826829807e-06, + "loss": 1.7189, + "step": 11990 + }, + { + "epoch": 0.9026138993959238, + "grad_norm": 5.453039646148682, + "learning_rate": 2.536483295547282e-06, + "loss": 2.0303, + "step": 11991 + }, + { + "epoch": 0.9026891736765841, + "grad_norm": 3.6220996379852295, + "learning_rate": 2.53265114436636e-06, + "loss": 1.7392, + "step": 11992 + }, + { + "epoch": 0.9027644479572442, + "grad_norm": 5.754844665527344, + "learning_rate": 2.52882181498314e-06, + "loss": 1.8435, + "step": 11993 + }, + { + "epoch": 0.9028397222379043, + "grad_norm": 6.359145641326904, + "learning_rate": 2.5249953076252496e-06, + "loss": 1.6707, + "step": 11994 + }, + { + "epoch": 0.9029149965185645, + "grad_norm": 5.185604095458984, + "learning_rate": 2.5211716225201632e-06, + "loss": 1.8284, + "step": 11995 + }, + { + "epoch": 0.9029902707992247, + "grad_norm": 4.033519744873047, + "learning_rate": 2.5173507598951927e-06, + "loss": 1.6419, + "step": 11996 + }, + { + "epoch": 0.9030655450798848, + "grad_norm": 4.735548496246338, + "learning_rate": 2.513532719977474e-06, + "loss": 1.7643, + "step": 11997 + }, + { + "epoch": 0.903140819360545, + "grad_norm": 3.8570384979248047, + "learning_rate": 2.509717502993991e-06, + "loss": 2.0708, + "step": 11998 + }, + { + "epoch": 0.9032160936412051, + "grad_norm": 4.196596145629883, + "learning_rate": 2.5059051091715303e-06, + "loss": 1.9194, + "step": 11999 + }, + { + "epoch": 0.9032913679218653, + "grad_norm": 6.576998233795166, + "learning_rate": 2.5020955387367372e-06, + "loss": 1.9091, + "step": 12000 + }, + { + "epoch": 0.9033666422025255, + "grad_norm": 5.059309959411621, + "learning_rate": 2.498288791916076e-06, + "loss": 1.7275, + "step": 12001 + }, + { + "epoch": 0.9034419164831856, + "grad_norm": 4.960028648376465, + "learning_rate": 2.4944848689358534e-06, + "loss": 1.6355, + "step": 12002 + }, + { + "epoch": 0.9035171907638457, + "grad_norm": 5.26534366607666, + "learning_rate": 2.4906837700221896e-06, + "loss": 1.6269, + "step": 12003 + }, + { + "epoch": 0.903592465044506, + "grad_norm": 4.87063455581665, + "learning_rate": 2.4868854954010645e-06, + "loss": 1.6982, + "step": 12004 + }, + { + "epoch": 0.9036677393251661, + "grad_norm": 6.078065395355225, + "learning_rate": 2.4830900452982587e-06, + "loss": 1.9179, + "step": 12005 + }, + { + "epoch": 0.9037430136058262, + "grad_norm": 4.551740646362305, + "learning_rate": 2.479297419939419e-06, + "loss": 1.9464, + "step": 12006 + }, + { + "epoch": 0.9038182878864864, + "grad_norm": 5.093257904052734, + "learning_rate": 2.475507619549983e-06, + "loss": 1.7702, + "step": 12007 + }, + { + "epoch": 0.9038935621671466, + "grad_norm": 6.582335472106934, + "learning_rate": 2.471720644355258e-06, + "loss": 1.4875, + "step": 12008 + }, + { + "epoch": 0.9039688364478067, + "grad_norm": 5.120762348175049, + "learning_rate": 2.467936494580364e-06, + "loss": 1.7898, + "step": 12009 + }, + { + "epoch": 0.9040441107284668, + "grad_norm": 4.245059013366699, + "learning_rate": 2.464155170450272e-06, + "loss": 1.7692, + "step": 12010 + }, + { + "epoch": 0.904119385009127, + "grad_norm": 5.217390537261963, + "learning_rate": 2.4603766721897513e-06, + "loss": 1.7489, + "step": 12011 + }, + { + "epoch": 0.9041946592897872, + "grad_norm": 4.219017028808594, + "learning_rate": 2.4566010000234397e-06, + "loss": 2.0146, + "step": 12012 + }, + { + "epoch": 0.9042699335704473, + "grad_norm": 5.43870210647583, + "learning_rate": 2.452828154175768e-06, + "loss": 1.8117, + "step": 12013 + }, + { + "epoch": 0.9043452078511075, + "grad_norm": 4.560093879699707, + "learning_rate": 2.4490581348710408e-06, + "loss": 2.2599, + "step": 12014 + }, + { + "epoch": 0.9044204821317676, + "grad_norm": 4.2792181968688965, + "learning_rate": 2.4452909423333723e-06, + "loss": 1.6955, + "step": 12015 + }, + { + "epoch": 0.9044957564124277, + "grad_norm": 5.564821243286133, + "learning_rate": 2.4415265767867014e-06, + "loss": 1.7263, + "step": 12016 + }, + { + "epoch": 0.904571030693088, + "grad_norm": 5.642936706542969, + "learning_rate": 2.43776503845482e-06, + "loss": 1.6286, + "step": 12017 + }, + { + "epoch": 0.9046463049737481, + "grad_norm": 5.0309882164001465, + "learning_rate": 2.4340063275613335e-06, + "loss": 1.7298, + "step": 12018 + }, + { + "epoch": 0.9047215792544082, + "grad_norm": 4.734522819519043, + "learning_rate": 2.4302504443296848e-06, + "loss": 2.1225, + "step": 12019 + }, + { + "epoch": 0.9047968535350684, + "grad_norm": 7.406623363494873, + "learning_rate": 2.4264973889831565e-06, + "loss": 1.9675, + "step": 12020 + }, + { + "epoch": 0.9048721278157286, + "grad_norm": 5.505327224731445, + "learning_rate": 2.4227471617448596e-06, + "loss": 1.994, + "step": 12021 + }, + { + "epoch": 0.9049474020963887, + "grad_norm": 5.354662895202637, + "learning_rate": 2.4189997628377205e-06, + "loss": 1.6612, + "step": 12022 + }, + { + "epoch": 0.9050226763770489, + "grad_norm": 3.991269826889038, + "learning_rate": 2.4152551924845334e-06, + "loss": 1.8677, + "step": 12023 + }, + { + "epoch": 0.905097950657709, + "grad_norm": 4.828816890716553, + "learning_rate": 2.411513450907876e-06, + "loss": 1.5795, + "step": 12024 + }, + { + "epoch": 0.9051732249383692, + "grad_norm": 5.001769065856934, + "learning_rate": 2.4077745383302097e-06, + "loss": 2.1248, + "step": 12025 + }, + { + "epoch": 0.9052484992190294, + "grad_norm": 4.290104389190674, + "learning_rate": 2.404038454973778e-06, + "loss": 2.1903, + "step": 12026 + }, + { + "epoch": 0.9053237734996895, + "grad_norm": 5.534820079803467, + "learning_rate": 2.400305201060704e-06, + "loss": 1.5882, + "step": 12027 + }, + { + "epoch": 0.9053990477803496, + "grad_norm": 4.812999248504639, + "learning_rate": 2.3965747768128933e-06, + "loss": 1.7623, + "step": 12028 + }, + { + "epoch": 0.9054743220610098, + "grad_norm": 4.874666213989258, + "learning_rate": 2.3928471824521293e-06, + "loss": 1.9715, + "step": 12029 + }, + { + "epoch": 0.90554959634167, + "grad_norm": 4.669535160064697, + "learning_rate": 2.389122418199996e-06, + "loss": 1.5057, + "step": 12030 + }, + { + "epoch": 0.9056248706223301, + "grad_norm": 5.131518840789795, + "learning_rate": 2.3854004842779277e-06, + "loss": 1.5649, + "step": 12031 + }, + { + "epoch": 0.9057001449029902, + "grad_norm": 4.108508586883545, + "learning_rate": 2.38168138090718e-06, + "loss": 1.6892, + "step": 12032 + }, + { + "epoch": 0.9057754191836505, + "grad_norm": 5.186368465423584, + "learning_rate": 2.3779651083088384e-06, + "loss": 1.7513, + "step": 12033 + }, + { + "epoch": 0.9058506934643106, + "grad_norm": 4.305336952209473, + "learning_rate": 2.374251666703836e-06, + "loss": 1.9857, + "step": 12034 + }, + { + "epoch": 0.9059259677449707, + "grad_norm": 4.379505634307861, + "learning_rate": 2.3705410563129137e-06, + "loss": 1.8207, + "step": 12035 + }, + { + "epoch": 0.9060012420256309, + "grad_norm": 3.9230740070343018, + "learning_rate": 2.3668332773566617e-06, + "loss": 1.8812, + "step": 12036 + }, + { + "epoch": 0.906076516306291, + "grad_norm": 4.204113006591797, + "learning_rate": 2.363128330055492e-06, + "loss": 2.0519, + "step": 12037 + }, + { + "epoch": 0.9061517905869512, + "grad_norm": 6.357110023498535, + "learning_rate": 2.3594262146296676e-06, + "loss": 1.7123, + "step": 12038 + }, + { + "epoch": 0.9062270648676114, + "grad_norm": 6.072261810302734, + "learning_rate": 2.3557269312992514e-06, + "loss": 2.0566, + "step": 12039 + }, + { + "epoch": 0.9063023391482715, + "grad_norm": 3.6129419803619385, + "learning_rate": 2.352030480284162e-06, + "loss": 1.8136, + "step": 12040 + }, + { + "epoch": 0.9063776134289316, + "grad_norm": 4.159516334533691, + "learning_rate": 2.3483368618041403e-06, + "loss": 1.9119, + "step": 12041 + }, + { + "epoch": 0.9064528877095919, + "grad_norm": 7.715047836303711, + "learning_rate": 2.3446460760787713e-06, + "loss": 2.0005, + "step": 12042 + }, + { + "epoch": 0.906528161990252, + "grad_norm": 5.2658162117004395, + "learning_rate": 2.3409581233274468e-06, + "loss": 1.6645, + "step": 12043 + }, + { + "epoch": 0.9066034362709121, + "grad_norm": 4.586082458496094, + "learning_rate": 2.3372730037694245e-06, + "loss": 1.8197, + "step": 12044 + }, + { + "epoch": 0.9066787105515723, + "grad_norm": 3.306954860687256, + "learning_rate": 2.3335907176237513e-06, + "loss": 1.5572, + "step": 12045 + }, + { + "epoch": 0.9067539848322325, + "grad_norm": 5.0185627937316895, + "learning_rate": 2.3299112651093414e-06, + "loss": 1.7469, + "step": 12046 + }, + { + "epoch": 0.9068292591128926, + "grad_norm": 5.887462139129639, + "learning_rate": 2.3262346464449358e-06, + "loss": 2.3686, + "step": 12047 + }, + { + "epoch": 0.9069045333935527, + "grad_norm": 5.155585289001465, + "learning_rate": 2.3225608618490766e-06, + "loss": 1.917, + "step": 12048 + }, + { + "epoch": 0.9069798076742129, + "grad_norm": 4.50060510635376, + "learning_rate": 2.318889911540184e-06, + "loss": 2.0471, + "step": 12049 + }, + { + "epoch": 0.907055081954873, + "grad_norm": 5.397789001464844, + "learning_rate": 2.3152217957364607e-06, + "loss": 1.9728, + "step": 12050 + }, + { + "epoch": 0.9071303562355332, + "grad_norm": 4.0448079109191895, + "learning_rate": 2.3115565146559826e-06, + "loss": 1.861, + "step": 12051 + }, + { + "epoch": 0.9072056305161934, + "grad_norm": 4.343610763549805, + "learning_rate": 2.3078940685166364e-06, + "loss": 1.7512, + "step": 12052 + }, + { + "epoch": 0.9072809047968535, + "grad_norm": 4.3381428718566895, + "learning_rate": 2.304234457536153e-06, + "loss": 1.9521, + "step": 12053 + }, + { + "epoch": 0.9073561790775136, + "grad_norm": 4.190424919128418, + "learning_rate": 2.3005776819320648e-06, + "loss": 1.7461, + "step": 12054 + }, + { + "epoch": 0.9074314533581739, + "grad_norm": 4.159946918487549, + "learning_rate": 2.2969237419217802e-06, + "loss": 1.6084, + "step": 12055 + }, + { + "epoch": 0.907506727638834, + "grad_norm": 6.220204830169678, + "learning_rate": 2.293272637722493e-06, + "loss": 2.0885, + "step": 12056 + }, + { + "epoch": 0.9075820019194941, + "grad_norm": 4.030333042144775, + "learning_rate": 2.289624369551269e-06, + "loss": 1.7672, + "step": 12057 + }, + { + "epoch": 0.9076572762001544, + "grad_norm": 5.191592216491699, + "learning_rate": 2.2859789376249718e-06, + "loss": 1.8069, + "step": 12058 + }, + { + "epoch": 0.9077325504808145, + "grad_norm": 6.664621829986572, + "learning_rate": 2.2823363421603294e-06, + "loss": 2.1352, + "step": 12059 + }, + { + "epoch": 0.9078078247614746, + "grad_norm": 6.185181140899658, + "learning_rate": 2.2786965833738684e-06, + "loss": 1.6447, + "step": 12060 + }, + { + "epoch": 0.9078830990421348, + "grad_norm": 4.342257022857666, + "learning_rate": 2.2750596614819708e-06, + "loss": 1.7512, + "step": 12061 + }, + { + "epoch": 0.9079583733227949, + "grad_norm": 5.421034336090088, + "learning_rate": 2.271425576700831e-06, + "loss": 1.653, + "step": 12062 + }, + { + "epoch": 0.9080336476034551, + "grad_norm": 5.593506813049316, + "learning_rate": 2.267794329246498e-06, + "loss": 1.6534, + "step": 12063 + }, + { + "epoch": 0.9081089218841153, + "grad_norm": 4.65397310256958, + "learning_rate": 2.264165919334826e-06, + "loss": 1.7472, + "step": 12064 + }, + { + "epoch": 0.9081841961647754, + "grad_norm": 4.712983131408691, + "learning_rate": 2.2605403471815278e-06, + "loss": 1.9471, + "step": 12065 + }, + { + "epoch": 0.9082594704454355, + "grad_norm": 4.293585300445557, + "learning_rate": 2.256917613002124e-06, + "loss": 1.7198, + "step": 12066 + }, + { + "epoch": 0.9083347447260958, + "grad_norm": 4.533767223358154, + "learning_rate": 2.2532977170119764e-06, + "loss": 1.4754, + "step": 12067 + }, + { + "epoch": 0.9084100190067559, + "grad_norm": 6.342230319976807, + "learning_rate": 2.2496806594262842e-06, + "loss": 1.8504, + "step": 12068 + }, + { + "epoch": 0.908485293287416, + "grad_norm": 5.326993465423584, + "learning_rate": 2.246066440460065e-06, + "loss": 1.7505, + "step": 12069 + }, + { + "epoch": 0.9085605675680761, + "grad_norm": 6.085519313812256, + "learning_rate": 2.2424550603281746e-06, + "loss": 1.6443, + "step": 12070 + }, + { + "epoch": 0.9086358418487364, + "grad_norm": 5.617304801940918, + "learning_rate": 2.2388465192452914e-06, + "loss": 1.8948, + "step": 12071 + }, + { + "epoch": 0.9087111161293965, + "grad_norm": 4.76896858215332, + "learning_rate": 2.2352408174259487e-06, + "loss": 1.6311, + "step": 12072 + }, + { + "epoch": 0.9087863904100566, + "grad_norm": 5.786907196044922, + "learning_rate": 2.2316379550844815e-06, + "loss": 1.8867, + "step": 12073 + }, + { + "epoch": 0.9088616646907168, + "grad_norm": 4.617349147796631, + "learning_rate": 2.2280379324350785e-06, + "loss": 1.6429, + "step": 12074 + }, + { + "epoch": 0.908936938971377, + "grad_norm": 4.733928203582764, + "learning_rate": 2.224440749691742e-06, + "loss": 1.5572, + "step": 12075 + }, + { + "epoch": 0.9090122132520371, + "grad_norm": 4.012433052062988, + "learning_rate": 2.220846407068328e-06, + "loss": 1.7184, + "step": 12076 + }, + { + "epoch": 0.9090874875326973, + "grad_norm": 5.539886474609375, + "learning_rate": 2.217254904778493e-06, + "loss": 2.4661, + "step": 12077 + }, + { + "epoch": 0.9091627618133574, + "grad_norm": 5.59161376953125, + "learning_rate": 2.21366624303575e-06, + "loss": 1.7802, + "step": 12078 + }, + { + "epoch": 0.9092380360940175, + "grad_norm": 5.362966060638428, + "learning_rate": 2.2100804220534454e-06, + "loss": 1.7354, + "step": 12079 + }, + { + "epoch": 0.9093133103746778, + "grad_norm": 4.5530781745910645, + "learning_rate": 2.206497442044725e-06, + "loss": 1.848, + "step": 12080 + }, + { + "epoch": 0.9093885846553379, + "grad_norm": 5.087313175201416, + "learning_rate": 2.202917303222607e-06, + "loss": 1.7929, + "step": 12081 + }, + { + "epoch": 0.909463858935998, + "grad_norm": 4.621016502380371, + "learning_rate": 2.1993400057998994e-06, + "loss": 1.8301, + "step": 12082 + }, + { + "epoch": 0.9095391332166582, + "grad_norm": 9.632458686828613, + "learning_rate": 2.195765549989276e-06, + "loss": 1.837, + "step": 12083 + }, + { + "epoch": 0.9096144074973184, + "grad_norm": 6.701595306396484, + "learning_rate": 2.192193936003223e-06, + "loss": 2.0151, + "step": 12084 + }, + { + "epoch": 0.9096896817779785, + "grad_norm": 5.789704322814941, + "learning_rate": 2.1886251640540643e-06, + "loss": 2.1068, + "step": 12085 + }, + { + "epoch": 0.9097649560586387, + "grad_norm": 6.658529281616211, + "learning_rate": 2.1850592343539532e-06, + "loss": 1.6723, + "step": 12086 + }, + { + "epoch": 0.9098402303392988, + "grad_norm": 4.172266483306885, + "learning_rate": 2.181496147114881e-06, + "loss": 1.5237, + "step": 12087 + }, + { + "epoch": 0.909915504619959, + "grad_norm": 3.8307430744171143, + "learning_rate": 2.1779359025486503e-06, + "loss": 1.3783, + "step": 12088 + }, + { + "epoch": 0.9099907789006191, + "grad_norm": 4.280561447143555, + "learning_rate": 2.1743785008669194e-06, + "loss": 1.4928, + "step": 12089 + }, + { + "epoch": 0.9100660531812793, + "grad_norm": 5.1558451652526855, + "learning_rate": 2.1708239422811525e-06, + "loss": 1.6024, + "step": 12090 + }, + { + "epoch": 0.9101413274619394, + "grad_norm": 4.757821083068848, + "learning_rate": 2.167272227002676e-06, + "loss": 1.3839, + "step": 12091 + }, + { + "epoch": 0.9102166017425996, + "grad_norm": 4.467638969421387, + "learning_rate": 2.1637233552426028e-06, + "loss": 1.9276, + "step": 12092 + }, + { + "epoch": 0.9102918760232598, + "grad_norm": 5.651608467102051, + "learning_rate": 2.160177327211932e-06, + "loss": 1.7023, + "step": 12093 + }, + { + "epoch": 0.9103671503039199, + "grad_norm": 4.382534503936768, + "learning_rate": 2.1566341431214443e-06, + "loss": 2.0227, + "step": 12094 + }, + { + "epoch": 0.91044242458458, + "grad_norm": 5.981085300445557, + "learning_rate": 2.1530938031817714e-06, + "loss": 1.8377, + "step": 12095 + }, + { + "epoch": 0.9105176988652403, + "grad_norm": 3.827946901321411, + "learning_rate": 2.1495563076033896e-06, + "loss": 1.7057, + "step": 12096 + }, + { + "epoch": 0.9105929731459004, + "grad_norm": 5.965029716491699, + "learning_rate": 2.146021656596586e-06, + "loss": 1.7208, + "step": 12097 + }, + { + "epoch": 0.9106682474265605, + "grad_norm": 5.827772617340088, + "learning_rate": 2.1424898503714985e-06, + "loss": 1.8398, + "step": 12098 + }, + { + "epoch": 0.9107435217072207, + "grad_norm": 4.450931072235107, + "learning_rate": 2.1389608891380584e-06, + "loss": 1.6344, + "step": 12099 + }, + { + "epoch": 0.9108187959878808, + "grad_norm": 4.463832378387451, + "learning_rate": 2.1354347731060707e-06, + "loss": 1.4284, + "step": 12100 + }, + { + "epoch": 0.910894070268541, + "grad_norm": 4.7808966636657715, + "learning_rate": 2.13191150248514e-06, + "loss": 2.0314, + "step": 12101 + }, + { + "epoch": 0.9109693445492012, + "grad_norm": 4.422584056854248, + "learning_rate": 2.1283910774847315e-06, + "loss": 1.6688, + "step": 12102 + }, + { + "epoch": 0.9110446188298613, + "grad_norm": 5.107571601867676, + "learning_rate": 2.124873498314106e-06, + "loss": 1.8038, + "step": 12103 + }, + { + "epoch": 0.9111198931105214, + "grad_norm": 4.270766258239746, + "learning_rate": 2.121358765182385e-06, + "loss": 1.79, + "step": 12104 + }, + { + "epoch": 0.9111951673911817, + "grad_norm": 5.984953880310059, + "learning_rate": 2.1178468782985074e-06, + "loss": 1.8127, + "step": 12105 + }, + { + "epoch": 0.9112704416718418, + "grad_norm": 5.883664131164551, + "learning_rate": 2.1143378378712388e-06, + "loss": 2.1094, + "step": 12106 + }, + { + "epoch": 0.9113457159525019, + "grad_norm": 3.6939966678619385, + "learning_rate": 2.1108316441091904e-06, + "loss": 1.4716, + "step": 12107 + }, + { + "epoch": 0.911420990233162, + "grad_norm": 4.318075656890869, + "learning_rate": 2.10732829722079e-06, + "loss": 1.742, + "step": 12108 + }, + { + "epoch": 0.9114962645138223, + "grad_norm": 3.9309732913970947, + "learning_rate": 2.1038277974142982e-06, + "loss": 1.4777, + "step": 12109 + }, + { + "epoch": 0.9115715387944824, + "grad_norm": 4.194386005401611, + "learning_rate": 2.100330144897822e-06, + "loss": 1.6525, + "step": 12110 + }, + { + "epoch": 0.9116468130751425, + "grad_norm": 5.224255084991455, + "learning_rate": 2.0968353398792773e-06, + "loss": 1.5894, + "step": 12111 + }, + { + "epoch": 0.9117220873558027, + "grad_norm": 4.281154632568359, + "learning_rate": 2.0933433825664206e-06, + "loss": 1.7495, + "step": 12112 + }, + { + "epoch": 0.9117973616364629, + "grad_norm": 4.29382848739624, + "learning_rate": 2.0898542731668415e-06, + "loss": 1.9486, + "step": 12113 + }, + { + "epoch": 0.911872635917123, + "grad_norm": 4.170835971832275, + "learning_rate": 2.086368011887957e-06, + "loss": 2.0966, + "step": 12114 + }, + { + "epoch": 0.9119479101977832, + "grad_norm": 5.726946830749512, + "learning_rate": 2.0828845989370127e-06, + "loss": 1.8433, + "step": 12115 + }, + { + "epoch": 0.9120231844784433, + "grad_norm": 4.424744129180908, + "learning_rate": 2.0794040345210874e-06, + "loss": 1.8361, + "step": 12116 + }, + { + "epoch": 0.9120984587591034, + "grad_norm": 4.886198043823242, + "learning_rate": 2.075926318847099e-06, + "loss": 1.5234, + "step": 12117 + }, + { + "epoch": 0.9121737330397637, + "grad_norm": 4.902031421661377, + "learning_rate": 2.0724514521217764e-06, + "loss": 1.7774, + "step": 12118 + }, + { + "epoch": 0.9122490073204238, + "grad_norm": 5.974635601043701, + "learning_rate": 2.0689794345516935e-06, + "loss": 2.0676, + "step": 12119 + }, + { + "epoch": 0.9123242816010839, + "grad_norm": 4.213400363922119, + "learning_rate": 2.065510266343257e-06, + "loss": 2.2021, + "step": 12120 + }, + { + "epoch": 0.9123995558817442, + "grad_norm": 4.379502773284912, + "learning_rate": 2.0620439477026966e-06, + "loss": 1.7332, + "step": 12121 + }, + { + "epoch": 0.9124748301624043, + "grad_norm": 5.108627796173096, + "learning_rate": 2.0585804788360696e-06, + "loss": 1.5354, + "step": 12122 + }, + { + "epoch": 0.9125501044430644, + "grad_norm": 4.5947651863098145, + "learning_rate": 2.055119859949284e-06, + "loss": 1.6025, + "step": 12123 + }, + { + "epoch": 0.9126253787237246, + "grad_norm": 3.9317147731781006, + "learning_rate": 2.051662091248041e-06, + "loss": 1.9726, + "step": 12124 + }, + { + "epoch": 0.9127006530043847, + "grad_norm": 4.857736110687256, + "learning_rate": 2.048207172937916e-06, + "loss": 1.688, + "step": 12125 + }, + { + "epoch": 0.9127759272850449, + "grad_norm": 4.268885135650635, + "learning_rate": 2.044755105224283e-06, + "loss": 1.5768, + "step": 12126 + }, + { + "epoch": 0.912851201565705, + "grad_norm": 7.116901874542236, + "learning_rate": 2.041305888312356e-06, + "loss": 1.6902, + "step": 12127 + }, + { + "epoch": 0.9129264758463652, + "grad_norm": 5.009929180145264, + "learning_rate": 2.037859522407193e-06, + "loss": 1.6505, + "step": 12128 + }, + { + "epoch": 0.9130017501270253, + "grad_norm": 6.537803649902344, + "learning_rate": 2.034416007713652e-06, + "loss": 1.505, + "step": 12129 + }, + { + "epoch": 0.9130770244076855, + "grad_norm": 5.335648536682129, + "learning_rate": 2.0309753444364533e-06, + "loss": 1.9193, + "step": 12130 + }, + { + "epoch": 0.9131522986883457, + "grad_norm": 5.426054954528809, + "learning_rate": 2.027537532780133e-06, + "loss": 2.2388, + "step": 12131 + }, + { + "epoch": 0.9132275729690058, + "grad_norm": 4.194322109222412, + "learning_rate": 2.024102572949066e-06, + "loss": 1.5754, + "step": 12132 + }, + { + "epoch": 0.9133028472496659, + "grad_norm": 6.474128723144531, + "learning_rate": 2.020670465147434e-06, + "loss": 2.2222, + "step": 12133 + }, + { + "epoch": 0.9133781215303262, + "grad_norm": 4.180809020996094, + "learning_rate": 2.0172412095792793e-06, + "loss": 1.5092, + "step": 12134 + }, + { + "epoch": 0.9134533958109863, + "grad_norm": 5.883711814880371, + "learning_rate": 2.013814806448455e-06, + "loss": 1.5964, + "step": 12135 + }, + { + "epoch": 0.9135286700916464, + "grad_norm": 7.152510643005371, + "learning_rate": 2.0103912559586545e-06, + "loss": 2.1642, + "step": 12136 + }, + { + "epoch": 0.9136039443723066, + "grad_norm": 4.77456521987915, + "learning_rate": 2.0069705583133926e-06, + "loss": 1.7848, + "step": 12137 + }, + { + "epoch": 0.9136792186529668, + "grad_norm": 4.454054832458496, + "learning_rate": 2.0035527137160284e-06, + "loss": 1.8845, + "step": 12138 + }, + { + "epoch": 0.9137544929336269, + "grad_norm": 5.078956604003906, + "learning_rate": 2.000137722369733e-06, + "loss": 1.7626, + "step": 12139 + }, + { + "epoch": 0.9138297672142871, + "grad_norm": 4.826076030731201, + "learning_rate": 1.996725584477521e-06, + "loss": 2.1638, + "step": 12140 + }, + { + "epoch": 0.9139050414949472, + "grad_norm": 4.578750133514404, + "learning_rate": 1.9933163002422373e-06, + "loss": 1.6142, + "step": 12141 + }, + { + "epoch": 0.9139803157756073, + "grad_norm": 5.217748641967773, + "learning_rate": 1.9899098698665574e-06, + "loss": 1.9073, + "step": 12142 + }, + { + "epoch": 0.9140555900562676, + "grad_norm": 5.196359634399414, + "learning_rate": 1.986506293552981e-06, + "loss": 2.0435, + "step": 12143 + }, + { + "epoch": 0.9141308643369277, + "grad_norm": 5.367886066436768, + "learning_rate": 1.9831055715038293e-06, + "loss": 1.959, + "step": 12144 + }, + { + "epoch": 0.9142061386175878, + "grad_norm": 4.844086647033691, + "learning_rate": 1.979707703921285e-06, + "loss": 1.7157, + "step": 12145 + }, + { + "epoch": 0.9142814128982479, + "grad_norm": 4.099212646484375, + "learning_rate": 1.9763126910073315e-06, + "loss": 1.7612, + "step": 12146 + }, + { + "epoch": 0.9143566871789082, + "grad_norm": 4.701624870300293, + "learning_rate": 1.9729205329637902e-06, + "loss": 1.5493, + "step": 12147 + }, + { + "epoch": 0.9144319614595683, + "grad_norm": 4.403903961181641, + "learning_rate": 1.9695312299923165e-06, + "loss": 1.8948, + "step": 12148 + }, + { + "epoch": 0.9145072357402284, + "grad_norm": 4.937520503997803, + "learning_rate": 1.9661447822944047e-06, + "loss": 1.6898, + "step": 12149 + }, + { + "epoch": 0.9145825100208886, + "grad_norm": 3.9335262775421143, + "learning_rate": 1.9627611900713495e-06, + "loss": 1.7621, + "step": 12150 + }, + { + "epoch": 0.9146577843015488, + "grad_norm": 4.922421455383301, + "learning_rate": 1.959380453524312e-06, + "loss": 2.0592, + "step": 12151 + }, + { + "epoch": 0.9147330585822089, + "grad_norm": 4.673279285430908, + "learning_rate": 1.95600257285426e-06, + "loss": 1.506, + "step": 12152 + }, + { + "epoch": 0.9148083328628691, + "grad_norm": 5.225360870361328, + "learning_rate": 1.9526275482620095e-06, + "loss": 2.0268, + "step": 12153 + }, + { + "epoch": 0.9148836071435292, + "grad_norm": 5.667407512664795, + "learning_rate": 1.949255379948178e-06, + "loss": 1.7856, + "step": 12154 + }, + { + "epoch": 0.9149588814241894, + "grad_norm": 4.161797046661377, + "learning_rate": 1.9458860681132506e-06, + "loss": 1.8559, + "step": 12155 + }, + { + "epoch": 0.9150341557048496, + "grad_norm": 4.751323699951172, + "learning_rate": 1.942519612957505e-06, + "loss": 1.615, + "step": 12156 + }, + { + "epoch": 0.9151094299855097, + "grad_norm": 4.532934665679932, + "learning_rate": 1.9391560146810872e-06, + "loss": 1.8495, + "step": 12157 + }, + { + "epoch": 0.9151847042661698, + "grad_norm": 5.302066802978516, + "learning_rate": 1.9357952734839314e-06, + "loss": 1.8125, + "step": 12158 + }, + { + "epoch": 0.91525997854683, + "grad_norm": 5.823980808258057, + "learning_rate": 1.932437389565833e-06, + "loss": 1.6756, + "step": 12159 + }, + { + "epoch": 0.9153352528274902, + "grad_norm": 4.314770698547363, + "learning_rate": 1.929082363126422e-06, + "loss": 1.6791, + "step": 12160 + }, + { + "epoch": 0.9154105271081503, + "grad_norm": 5.326642990112305, + "learning_rate": 1.925730194365127e-06, + "loss": 1.9199, + "step": 12161 + }, + { + "epoch": 0.9154858013888105, + "grad_norm": 5.333718299865723, + "learning_rate": 1.922380883481234e-06, + "loss": 2.0131, + "step": 12162 + }, + { + "epoch": 0.9155610756694706, + "grad_norm": 4.871474266052246, + "learning_rate": 1.919034430673844e-06, + "loss": 1.6673, + "step": 12163 + }, + { + "epoch": 0.9156363499501308, + "grad_norm": 4.754137992858887, + "learning_rate": 1.9156908361418924e-06, + "loss": 1.9942, + "step": 12164 + }, + { + "epoch": 0.915711624230791, + "grad_norm": 5.771067142486572, + "learning_rate": 1.9123501000841594e-06, + "loss": 1.519, + "step": 12165 + }, + { + "epoch": 0.9157868985114511, + "grad_norm": 4.481830596923828, + "learning_rate": 1.909012222699236e-06, + "loss": 1.5606, + "step": 12166 + }, + { + "epoch": 0.9158621727921112, + "grad_norm": 5.182826042175293, + "learning_rate": 1.9056772041855408e-06, + "loss": 1.8298, + "step": 12167 + }, + { + "epoch": 0.9159374470727714, + "grad_norm": 7.040618896484375, + "learning_rate": 1.9023450447413439e-06, + "loss": 2.2798, + "step": 12168 + }, + { + "epoch": 0.9160127213534316, + "grad_norm": 4.410822868347168, + "learning_rate": 1.8990157445647195e-06, + "loss": 1.6769, + "step": 12169 + }, + { + "epoch": 0.9160879956340917, + "grad_norm": 7.301709175109863, + "learning_rate": 1.8956893038535983e-06, + "loss": 1.7509, + "step": 12170 + }, + { + "epoch": 0.9161632699147518, + "grad_norm": 4.752766132354736, + "learning_rate": 1.8923657228057168e-06, + "loss": 1.5635, + "step": 12171 + }, + { + "epoch": 0.9162385441954121, + "grad_norm": 5.100418567657471, + "learning_rate": 1.8890450016186611e-06, + "loss": 2.1537, + "step": 12172 + }, + { + "epoch": 0.9163138184760722, + "grad_norm": 4.798778057098389, + "learning_rate": 1.8857271404898292e-06, + "loss": 1.7792, + "step": 12173 + }, + { + "epoch": 0.9163890927567323, + "grad_norm": 4.980538845062256, + "learning_rate": 1.8824121396164628e-06, + "loss": 1.6818, + "step": 12174 + }, + { + "epoch": 0.9164643670373925, + "grad_norm": 4.662041187286377, + "learning_rate": 1.8790999991956327e-06, + "loss": 1.8831, + "step": 12175 + }, + { + "epoch": 0.9165396413180527, + "grad_norm": 4.635166645050049, + "learning_rate": 1.8757907194242307e-06, + "loss": 1.6141, + "step": 12176 + }, + { + "epoch": 0.9166149155987128, + "grad_norm": 5.585263252258301, + "learning_rate": 1.8724843004989944e-06, + "loss": 1.8305, + "step": 12177 + }, + { + "epoch": 0.916690189879373, + "grad_norm": 5.630923271179199, + "learning_rate": 1.869180742616461e-06, + "loss": 1.8609, + "step": 12178 + }, + { + "epoch": 0.9167654641600331, + "grad_norm": 7.096811771392822, + "learning_rate": 1.8658800459730451e-06, + "loss": 1.5213, + "step": 12179 + }, + { + "epoch": 0.9168407384406932, + "grad_norm": 6.673388957977295, + "learning_rate": 1.862582210764935e-06, + "loss": 1.6549, + "step": 12180 + }, + { + "epoch": 0.9169160127213535, + "grad_norm": 5.677751064300537, + "learning_rate": 1.8592872371882009e-06, + "loss": 1.7664, + "step": 12181 + }, + { + "epoch": 0.9169912870020136, + "grad_norm": 6.032618045806885, + "learning_rate": 1.8559951254386975e-06, + "loss": 1.6967, + "step": 12182 + }, + { + "epoch": 0.9170665612826737, + "grad_norm": 6.274374961853027, + "learning_rate": 1.8527058757121574e-06, + "loss": 2.2358, + "step": 12183 + }, + { + "epoch": 0.917141835563334, + "grad_norm": 4.8216400146484375, + "learning_rate": 1.8494194882040905e-06, + "loss": 1.6153, + "step": 12184 + }, + { + "epoch": 0.9172171098439941, + "grad_norm": 4.21168851852417, + "learning_rate": 1.8461359631098796e-06, + "loss": 1.7474, + "step": 12185 + }, + { + "epoch": 0.9172923841246542, + "grad_norm": 5.958002090454102, + "learning_rate": 1.8428553006247129e-06, + "loss": 1.9142, + "step": 12186 + }, + { + "epoch": 0.9173676584053143, + "grad_norm": 5.667016983032227, + "learning_rate": 1.8395775009436288e-06, + "loss": 2.098, + "step": 12187 + }, + { + "epoch": 0.9174429326859745, + "grad_norm": 5.322368144989014, + "learning_rate": 1.8363025642614662e-06, + "loss": 1.63, + "step": 12188 + }, + { + "epoch": 0.9175182069666347, + "grad_norm": 4.8616414070129395, + "learning_rate": 1.8330304907729302e-06, + "loss": 2.1501, + "step": 12189 + }, + { + "epoch": 0.9175934812472948, + "grad_norm": 5.068863391876221, + "learning_rate": 1.8297612806725151e-06, + "loss": 1.8887, + "step": 12190 + }, + { + "epoch": 0.917668755527955, + "grad_norm": 4.703795433044434, + "learning_rate": 1.8264949341545768e-06, + "loss": 1.7179, + "step": 12191 + }, + { + "epoch": 0.9177440298086151, + "grad_norm": 4.514343738555908, + "learning_rate": 1.823231451413293e-06, + "loss": 1.9094, + "step": 12192 + }, + { + "epoch": 0.9178193040892753, + "grad_norm": 5.487853527069092, + "learning_rate": 1.8199708326426644e-06, + "loss": 1.5176, + "step": 12193 + }, + { + "epoch": 0.9178945783699355, + "grad_norm": 3.800943374633789, + "learning_rate": 1.8167130780365248e-06, + "loss": 1.8835, + "step": 12194 + }, + { + "epoch": 0.9179698526505956, + "grad_norm": 5.158065319061279, + "learning_rate": 1.813458187788536e-06, + "loss": 2.0343, + "step": 12195 + }, + { + "epoch": 0.9180451269312557, + "grad_norm": 5.792636394500732, + "learning_rate": 1.8102061620921984e-06, + "loss": 1.6276, + "step": 12196 + }, + { + "epoch": 0.918120401211916, + "grad_norm": 5.902648448944092, + "learning_rate": 1.8069570011408298e-06, + "loss": 2.0031, + "step": 12197 + }, + { + "epoch": 0.9181956754925761, + "grad_norm": 4.943217754364014, + "learning_rate": 1.8037107051275926e-06, + "loss": 1.6429, + "step": 12198 + }, + { + "epoch": 0.9182709497732362, + "grad_norm": 4.20914363861084, + "learning_rate": 1.8004672742454598e-06, + "loss": 1.5463, + "step": 12199 + }, + { + "epoch": 0.9183462240538964, + "grad_norm": 6.348606109619141, + "learning_rate": 1.7972267086872495e-06, + "loss": 1.8233, + "step": 12200 + }, + { + "epoch": 0.9184214983345566, + "grad_norm": 5.3262529373168945, + "learning_rate": 1.793989008645597e-06, + "loss": 2.2016, + "step": 12201 + }, + { + "epoch": 0.9184967726152167, + "grad_norm": 6.567859649658203, + "learning_rate": 1.7907541743129864e-06, + "loss": 1.5946, + "step": 12202 + }, + { + "epoch": 0.9185720468958769, + "grad_norm": 4.43633508682251, + "learning_rate": 1.787522205881703e-06, + "loss": 2.0763, + "step": 12203 + }, + { + "epoch": 0.918647321176537, + "grad_norm": 4.103714942932129, + "learning_rate": 1.7842931035438992e-06, + "loss": 1.9192, + "step": 12204 + }, + { + "epoch": 0.9187225954571971, + "grad_norm": 5.5819993019104, + "learning_rate": 1.7810668674915154e-06, + "loss": 1.9005, + "step": 12205 + }, + { + "epoch": 0.9187978697378573, + "grad_norm": 4.494290351867676, + "learning_rate": 1.7778434979163483e-06, + "loss": 2.0268, + "step": 12206 + }, + { + "epoch": 0.9188731440185175, + "grad_norm": 4.2096357345581055, + "learning_rate": 1.7746229950100223e-06, + "loss": 1.8851, + "step": 12207 + }, + { + "epoch": 0.9189484182991776, + "grad_norm": 6.377787113189697, + "learning_rate": 1.77140535896399e-06, + "loss": 1.9062, + "step": 12208 + }, + { + "epoch": 0.9190236925798377, + "grad_norm": 4.032090187072754, + "learning_rate": 1.7681905899695206e-06, + "loss": 1.854, + "step": 12209 + }, + { + "epoch": 0.919098966860498, + "grad_norm": 4.690585613250732, + "learning_rate": 1.7649786882177277e-06, + "loss": 1.8613, + "step": 12210 + }, + { + "epoch": 0.9191742411411581, + "grad_norm": 4.723318099975586, + "learning_rate": 1.7617696538995587e-06, + "loss": 1.7904, + "step": 12211 + }, + { + "epoch": 0.9192495154218182, + "grad_norm": 4.920557975769043, + "learning_rate": 1.7585634872057665e-06, + "loss": 2.0168, + "step": 12212 + }, + { + "epoch": 0.9193247897024784, + "grad_norm": 4.147205352783203, + "learning_rate": 1.75536018832696e-06, + "loss": 1.8378, + "step": 12213 + }, + { + "epoch": 0.9194000639831386, + "grad_norm": 4.625912189483643, + "learning_rate": 1.752159757453553e-06, + "loss": 1.9746, + "step": 12214 + }, + { + "epoch": 0.9194753382637987, + "grad_norm": 4.77776575088501, + "learning_rate": 1.748962194775816e-06, + "loss": 1.8596, + "step": 12215 + }, + { + "epoch": 0.9195506125444589, + "grad_norm": 5.684907913208008, + "learning_rate": 1.7457675004838247e-06, + "loss": 1.8789, + "step": 12216 + }, + { + "epoch": 0.919625886825119, + "grad_norm": 6.896183490753174, + "learning_rate": 1.7425756747674992e-06, + "loss": 1.9244, + "step": 12217 + }, + { + "epoch": 0.9197011611057792, + "grad_norm": 3.7616982460021973, + "learning_rate": 1.7393867178165823e-06, + "loss": 1.6131, + "step": 12218 + }, + { + "epoch": 0.9197764353864394, + "grad_norm": 5.706488609313965, + "learning_rate": 1.73620062982065e-06, + "loss": 1.6967, + "step": 12219 + }, + { + "epoch": 0.9198517096670995, + "grad_norm": 5.436760425567627, + "learning_rate": 1.7330174109691067e-06, + "loss": 1.8944, + "step": 12220 + }, + { + "epoch": 0.9199269839477596, + "grad_norm": 4.784337997436523, + "learning_rate": 1.7298370614511895e-06, + "loss": 1.8491, + "step": 12221 + }, + { + "epoch": 0.9200022582284199, + "grad_norm": 5.08333683013916, + "learning_rate": 1.7266595814559472e-06, + "loss": 1.7878, + "step": 12222 + }, + { + "epoch": 0.92007753250908, + "grad_norm": 4.940536022186279, + "learning_rate": 1.7234849711722845e-06, + "loss": 1.9315, + "step": 12223 + }, + { + "epoch": 0.9201528067897401, + "grad_norm": 6.431300640106201, + "learning_rate": 1.720313230788928e-06, + "loss": 2.0447, + "step": 12224 + }, + { + "epoch": 0.9202280810704002, + "grad_norm": 6.289635181427002, + "learning_rate": 1.7171443604944104e-06, + "loss": 1.8862, + "step": 12225 + }, + { + "epoch": 0.9203033553510604, + "grad_norm": 6.209627151489258, + "learning_rate": 1.7139783604771253e-06, + "loss": 1.3237, + "step": 12226 + }, + { + "epoch": 0.9203786296317206, + "grad_norm": 4.827346324920654, + "learning_rate": 1.7108152309252778e-06, + "loss": 1.7259, + "step": 12227 + }, + { + "epoch": 0.9204539039123807, + "grad_norm": 5.893888473510742, + "learning_rate": 1.707654972026912e-06, + "loss": 2.0293, + "step": 12228 + }, + { + "epoch": 0.9205291781930409, + "grad_norm": 5.251547813415527, + "learning_rate": 1.704497583969883e-06, + "loss": 1.5231, + "step": 12229 + }, + { + "epoch": 0.920604452473701, + "grad_norm": 4.561532974243164, + "learning_rate": 1.7013430669419018e-06, + "loss": 1.7705, + "step": 12230 + }, + { + "epoch": 0.9206797267543612, + "grad_norm": 6.910198211669922, + "learning_rate": 1.698191421130485e-06, + "loss": 1.7174, + "step": 12231 + }, + { + "epoch": 0.9207550010350214, + "grad_norm": 5.117297649383545, + "learning_rate": 1.695042646723005e-06, + "loss": 1.8971, + "step": 12232 + }, + { + "epoch": 0.9208302753156815, + "grad_norm": 5.046881198883057, + "learning_rate": 1.6918967439066346e-06, + "loss": 1.4779, + "step": 12233 + }, + { + "epoch": 0.9209055495963416, + "grad_norm": 5.789036750793457, + "learning_rate": 1.6887537128683905e-06, + "loss": 1.5449, + "step": 12234 + }, + { + "epoch": 0.9209808238770019, + "grad_norm": 4.644052982330322, + "learning_rate": 1.6856135537951123e-06, + "loss": 1.3028, + "step": 12235 + }, + { + "epoch": 0.921056098157662, + "grad_norm": 4.927212238311768, + "learning_rate": 1.6824762668734895e-06, + "loss": 1.6329, + "step": 12236 + }, + { + "epoch": 0.9211313724383221, + "grad_norm": 4.547092437744141, + "learning_rate": 1.6793418522900062e-06, + "loss": 1.5825, + "step": 12237 + }, + { + "epoch": 0.9212066467189823, + "grad_norm": 4.7649922370910645, + "learning_rate": 1.6762103102310078e-06, + "loss": 1.6437, + "step": 12238 + }, + { + "epoch": 0.9212819209996425, + "grad_norm": 4.804971694946289, + "learning_rate": 1.67308164088264e-06, + "loss": 1.4323, + "step": 12239 + }, + { + "epoch": 0.9213571952803026, + "grad_norm": 9.616639137268066, + "learning_rate": 1.6699558444309094e-06, + "loss": 1.9122, + "step": 12240 + }, + { + "epoch": 0.9214324695609628, + "grad_norm": 5.791726589202881, + "learning_rate": 1.6668329210616229e-06, + "loss": 1.5747, + "step": 12241 + }, + { + "epoch": 0.9215077438416229, + "grad_norm": 5.349061012268066, + "learning_rate": 1.6637128709604432e-06, + "loss": 2.0135, + "step": 12242 + }, + { + "epoch": 0.921583018122283, + "grad_norm": 5.285614013671875, + "learning_rate": 1.6605956943128443e-06, + "loss": 1.6782, + "step": 12243 + }, + { + "epoch": 0.9216582924029432, + "grad_norm": 4.931285381317139, + "learning_rate": 1.657481391304122e-06, + "loss": 1.907, + "step": 12244 + }, + { + "epoch": 0.9217335666836034, + "grad_norm": 4.677455902099609, + "learning_rate": 1.6543699621194286e-06, + "loss": 1.7745, + "step": 12245 + }, + { + "epoch": 0.9218088409642635, + "grad_norm": 7.338361740112305, + "learning_rate": 1.651261406943716e-06, + "loss": 1.7279, + "step": 12246 + }, + { + "epoch": 0.9218841152449236, + "grad_norm": 7.404848575592041, + "learning_rate": 1.6481557259617864e-06, + "loss": 2.2013, + "step": 12247 + }, + { + "epoch": 0.9219593895255839, + "grad_norm": 4.182814121246338, + "learning_rate": 1.6450529193582587e-06, + "loss": 2.0324, + "step": 12248 + }, + { + "epoch": 0.922034663806244, + "grad_norm": 4.7084059715271, + "learning_rate": 1.6419529873175964e-06, + "loss": 1.8228, + "step": 12249 + }, + { + "epoch": 0.9221099380869041, + "grad_norm": 4.797237396240234, + "learning_rate": 1.6388559300240692e-06, + "loss": 1.7065, + "step": 12250 + }, + { + "epoch": 0.9221852123675643, + "grad_norm": 5.081038475036621, + "learning_rate": 1.6357617476617904e-06, + "loss": 2.0336, + "step": 12251 + }, + { + "epoch": 0.9222604866482245, + "grad_norm": 4.941250801086426, + "learning_rate": 1.632670440414702e-06, + "loss": 1.8849, + "step": 12252 + }, + { + "epoch": 0.9223357609288846, + "grad_norm": 5.9697675704956055, + "learning_rate": 1.6295820084665847e-06, + "loss": 2.1494, + "step": 12253 + }, + { + "epoch": 0.9224110352095448, + "grad_norm": 6.487722873687744, + "learning_rate": 1.6264964520010195e-06, + "loss": 2.1117, + "step": 12254 + }, + { + "epoch": 0.9224863094902049, + "grad_norm": 6.078864574432373, + "learning_rate": 1.623413771201443e-06, + "loss": 1.5579, + "step": 12255 + }, + { + "epoch": 0.9225615837708651, + "grad_norm": 4.4235405921936035, + "learning_rate": 1.6203339662511086e-06, + "loss": 1.9028, + "step": 12256 + }, + { + "epoch": 0.9226368580515253, + "grad_norm": 4.426524639129639, + "learning_rate": 1.617257037333103e-06, + "loss": 1.7325, + "step": 12257 + }, + { + "epoch": 0.9227121323321854, + "grad_norm": 5.306687831878662, + "learning_rate": 1.614182984630347e-06, + "loss": 1.895, + "step": 12258 + }, + { + "epoch": 0.9227874066128455, + "grad_norm": 5.716831684112549, + "learning_rate": 1.6111118083255717e-06, + "loss": 2.1086, + "step": 12259 + }, + { + "epoch": 0.9228626808935058, + "grad_norm": 5.054379940032959, + "learning_rate": 1.608043508601359e-06, + "loss": 1.8939, + "step": 12260 + }, + { + "epoch": 0.9229379551741659, + "grad_norm": 3.8481345176696777, + "learning_rate": 1.604978085640102e-06, + "loss": 2.072, + "step": 12261 + }, + { + "epoch": 0.923013229454826, + "grad_norm": 5.156086444854736, + "learning_rate": 1.6019155396240437e-06, + "loss": 2.0356, + "step": 12262 + }, + { + "epoch": 0.9230885037354862, + "grad_norm": 4.235472202301025, + "learning_rate": 1.598855870735233e-06, + "loss": 1.7186, + "step": 12263 + }, + { + "epoch": 0.9231637780161464, + "grad_norm": 6.332386493682861, + "learning_rate": 1.5957990791555633e-06, + "loss": 1.9756, + "step": 12264 + }, + { + "epoch": 0.9232390522968065, + "grad_norm": 5.3916239738464355, + "learning_rate": 1.5927451650667446e-06, + "loss": 1.86, + "step": 12265 + }, + { + "epoch": 0.9233143265774666, + "grad_norm": 6.285261154174805, + "learning_rate": 1.589694128650343e-06, + "loss": 2.4192, + "step": 12266 + }, + { + "epoch": 0.9233896008581268, + "grad_norm": 5.584387302398682, + "learning_rate": 1.5866459700877135e-06, + "loss": 1.7704, + "step": 12267 + }, + { + "epoch": 0.923464875138787, + "grad_norm": 4.821388244628906, + "learning_rate": 1.583600689560072e-06, + "loss": 1.8477, + "step": 12268 + }, + { + "epoch": 0.9235401494194471, + "grad_norm": 4.976274490356445, + "learning_rate": 1.5805582872484404e-06, + "loss": 1.7078, + "step": 12269 + }, + { + "epoch": 0.9236154237001073, + "grad_norm": 3.925536870956421, + "learning_rate": 1.5775187633336907e-06, + "loss": 1.8722, + "step": 12270 + }, + { + "epoch": 0.9236906979807674, + "grad_norm": 6.487788677215576, + "learning_rate": 1.574482117996512e-06, + "loss": 1.8876, + "step": 12271 + }, + { + "epoch": 0.9237659722614275, + "grad_norm": 5.969163417816162, + "learning_rate": 1.5714483514174204e-06, + "loss": 1.6099, + "step": 12272 + }, + { + "epoch": 0.9238412465420878, + "grad_norm": 4.744826316833496, + "learning_rate": 1.5684174637767724e-06, + "loss": 1.7138, + "step": 12273 + }, + { + "epoch": 0.9239165208227479, + "grad_norm": 8.98989200592041, + "learning_rate": 1.5653894552547344e-06, + "loss": 1.7619, + "step": 12274 + }, + { + "epoch": 0.923991795103408, + "grad_norm": 3.9739437103271484, + "learning_rate": 1.562364326031318e-06, + "loss": 2.0059, + "step": 12275 + }, + { + "epoch": 0.9240670693840682, + "grad_norm": 4.609249591827393, + "learning_rate": 1.5593420762863575e-06, + "loss": 1.5311, + "step": 12276 + }, + { + "epoch": 0.9241423436647284, + "grad_norm": 5.350503444671631, + "learning_rate": 1.5563227061995256e-06, + "loss": 1.5442, + "step": 12277 + }, + { + "epoch": 0.9242176179453885, + "grad_norm": 4.771426200866699, + "learning_rate": 1.5533062159503064e-06, + "loss": 1.6395, + "step": 12278 + }, + { + "epoch": 0.9242928922260487, + "grad_norm": 4.568175315856934, + "learning_rate": 1.550292605718029e-06, + "loss": 1.6099, + "step": 12279 + }, + { + "epoch": 0.9243681665067088, + "grad_norm": 4.588810443878174, + "learning_rate": 1.5472818756818274e-06, + "loss": 1.6252, + "step": 12280 + }, + { + "epoch": 0.924443440787369, + "grad_norm": 4.478748798370361, + "learning_rate": 1.5442740260207034e-06, + "loss": 1.9399, + "step": 12281 + }, + { + "epoch": 0.9245187150680292, + "grad_norm": 4.047736167907715, + "learning_rate": 1.5412690569134413e-06, + "loss": 2.0048, + "step": 12282 + }, + { + "epoch": 0.9245939893486893, + "grad_norm": 4.284279823303223, + "learning_rate": 1.538266968538704e-06, + "loss": 1.7946, + "step": 12283 + }, + { + "epoch": 0.9246692636293494, + "grad_norm": 4.401327133178711, + "learning_rate": 1.5352677610749321e-06, + "loss": 2.0512, + "step": 12284 + }, + { + "epoch": 0.9247445379100095, + "grad_norm": 5.214788913726807, + "learning_rate": 1.532271434700433e-06, + "loss": 2.213, + "step": 12285 + }, + { + "epoch": 0.9248198121906698, + "grad_norm": 5.28977632522583, + "learning_rate": 1.5292779895933252e-06, + "loss": 1.8059, + "step": 12286 + }, + { + "epoch": 0.9248950864713299, + "grad_norm": 7.314880847930908, + "learning_rate": 1.5262874259315663e-06, + "loss": 1.7721, + "step": 12287 + }, + { + "epoch": 0.92497036075199, + "grad_norm": 5.627440929412842, + "learning_rate": 1.5232997438929308e-06, + "loss": 2.1521, + "step": 12288 + }, + { + "epoch": 0.9250456350326502, + "grad_norm": 6.540442943572998, + "learning_rate": 1.5203149436550324e-06, + "loss": 1.8406, + "step": 12289 + }, + { + "epoch": 0.9251209093133104, + "grad_norm": 5.5621232986450195, + "learning_rate": 1.517333025395312e-06, + "loss": 1.9183, + "step": 12290 + }, + { + "epoch": 0.9251961835939705, + "grad_norm": 7.005645275115967, + "learning_rate": 1.514353989291023e-06, + "loss": 1.8921, + "step": 12291 + }, + { + "epoch": 0.9252714578746307, + "grad_norm": 4.031583786010742, + "learning_rate": 1.5113778355192731e-06, + "loss": 1.8887, + "step": 12292 + }, + { + "epoch": 0.9253467321552908, + "grad_norm": 5.792757987976074, + "learning_rate": 1.5084045642569765e-06, + "loss": 2.0874, + "step": 12293 + }, + { + "epoch": 0.925422006435951, + "grad_norm": 4.797659873962402, + "learning_rate": 1.5054341756808976e-06, + "loss": 1.8219, + "step": 12294 + }, + { + "epoch": 0.9254972807166112, + "grad_norm": 4.369216442108154, + "learning_rate": 1.5024666699676059e-06, + "loss": 1.8302, + "step": 12295 + }, + { + "epoch": 0.9255725549972713, + "grad_norm": 4.578887462615967, + "learning_rate": 1.499502047293516e-06, + "loss": 1.8271, + "step": 12296 + }, + { + "epoch": 0.9256478292779314, + "grad_norm": 4.997894763946533, + "learning_rate": 1.496540307834865e-06, + "loss": 1.8104, + "step": 12297 + }, + { + "epoch": 0.9257231035585917, + "grad_norm": 4.8700032234191895, + "learning_rate": 1.4935814517677226e-06, + "loss": 1.7245, + "step": 12298 + }, + { + "epoch": 0.9257983778392518, + "grad_norm": 4.54118013381958, + "learning_rate": 1.490625479267982e-06, + "loss": 1.7417, + "step": 12299 + }, + { + "epoch": 0.9258736521199119, + "grad_norm": 4.5466628074646, + "learning_rate": 1.4876723905113688e-06, + "loss": 1.8084, + "step": 12300 + }, + { + "epoch": 0.9259489264005721, + "grad_norm": 4.774020195007324, + "learning_rate": 1.4847221856734317e-06, + "loss": 1.8558, + "step": 12301 + }, + { + "epoch": 0.9260242006812323, + "grad_norm": 5.079516887664795, + "learning_rate": 1.4817748649295581e-06, + "loss": 1.8675, + "step": 12302 + }, + { + "epoch": 0.9260994749618924, + "grad_norm": 4.72567081451416, + "learning_rate": 1.478830428454947e-06, + "loss": 1.6151, + "step": 12303 + }, + { + "epoch": 0.9261747492425525, + "grad_norm": 3.883065938949585, + "learning_rate": 1.475888876424647e-06, + "loss": 1.5284, + "step": 12304 + }, + { + "epoch": 0.9262500235232127, + "grad_norm": 6.428706169128418, + "learning_rate": 1.4729502090135295e-06, + "loss": 1.7748, + "step": 12305 + }, + { + "epoch": 0.9263252978038728, + "grad_norm": 3.676638126373291, + "learning_rate": 1.4700144263962767e-06, + "loss": 1.8677, + "step": 12306 + }, + { + "epoch": 0.926400572084533, + "grad_norm": 5.967966556549072, + "learning_rate": 1.467081528747416e-06, + "loss": 1.7996, + "step": 12307 + }, + { + "epoch": 0.9264758463651932, + "grad_norm": 4.967217445373535, + "learning_rate": 1.464151516241302e-06, + "loss": 1.9483, + "step": 12308 + }, + { + "epoch": 0.9265511206458533, + "grad_norm": 4.532412528991699, + "learning_rate": 1.4612243890521238e-06, + "loss": 1.9035, + "step": 12309 + }, + { + "epoch": 0.9266263949265134, + "grad_norm": 4.576842784881592, + "learning_rate": 1.4583001473538693e-06, + "loss": 2.3958, + "step": 12310 + }, + { + "epoch": 0.9267016692071737, + "grad_norm": 6.185211658477783, + "learning_rate": 1.4553787913203999e-06, + "loss": 1.9932, + "step": 12311 + }, + { + "epoch": 0.9267769434878338, + "grad_norm": 5.435148239135742, + "learning_rate": 1.4524603211253651e-06, + "loss": 1.9148, + "step": 12312 + }, + { + "epoch": 0.9268522177684939, + "grad_norm": 5.024242877960205, + "learning_rate": 1.4495447369422766e-06, + "loss": 1.8926, + "step": 12313 + }, + { + "epoch": 0.9269274920491541, + "grad_norm": 4.752802848815918, + "learning_rate": 1.4466320389444343e-06, + "loss": 1.7478, + "step": 12314 + }, + { + "epoch": 0.9270027663298143, + "grad_norm": 4.4405694007873535, + "learning_rate": 1.4437222273050112e-06, + "loss": 2.0847, + "step": 12315 + }, + { + "epoch": 0.9270780406104744, + "grad_norm": 5.675045490264893, + "learning_rate": 1.4408153021969683e-06, + "loss": 1.4183, + "step": 12316 + }, + { + "epoch": 0.9271533148911346, + "grad_norm": 4.446272850036621, + "learning_rate": 1.4379112637931292e-06, + "loss": 1.6795, + "step": 12317 + }, + { + "epoch": 0.9272285891717947, + "grad_norm": 5.909229278564453, + "learning_rate": 1.435010112266122e-06, + "loss": 1.7283, + "step": 12318 + }, + { + "epoch": 0.9273038634524549, + "grad_norm": 4.777585983276367, + "learning_rate": 1.4321118477884199e-06, + "loss": 1.8068, + "step": 12319 + }, + { + "epoch": 0.9273791377331151, + "grad_norm": 5.509268760681152, + "learning_rate": 1.4292164705323075e-06, + "loss": 1.7152, + "step": 12320 + }, + { + "epoch": 0.9274544120137752, + "grad_norm": 4.406026840209961, + "learning_rate": 1.426323980669908e-06, + "loss": 1.5419, + "step": 12321 + }, + { + "epoch": 0.9275296862944353, + "grad_norm": 4.915528774261475, + "learning_rate": 1.423434378373184e-06, + "loss": 1.9114, + "step": 12322 + }, + { + "epoch": 0.9276049605750954, + "grad_norm": 4.654607772827148, + "learning_rate": 1.4205476638138926e-06, + "loss": 1.5583, + "step": 12323 + }, + { + "epoch": 0.9276802348557557, + "grad_norm": 5.048783779144287, + "learning_rate": 1.417663837163663e-06, + "loss": 1.8921, + "step": 12324 + }, + { + "epoch": 0.9277555091364158, + "grad_norm": 5.955204486846924, + "learning_rate": 1.4147828985939139e-06, + "loss": 1.485, + "step": 12325 + }, + { + "epoch": 0.9278307834170759, + "grad_norm": 5.14150857925415, + "learning_rate": 1.4119048482759189e-06, + "loss": 1.5399, + "step": 12326 + }, + { + "epoch": 0.9279060576977362, + "grad_norm": 5.628507614135742, + "learning_rate": 1.4090296863807638e-06, + "loss": 1.5679, + "step": 12327 + }, + { + "epoch": 0.9279813319783963, + "grad_norm": 6.916476249694824, + "learning_rate": 1.406157413079373e-06, + "loss": 1.9774, + "step": 12328 + }, + { + "epoch": 0.9280566062590564, + "grad_norm": 4.837411880493164, + "learning_rate": 1.4032880285424877e-06, + "loss": 1.9572, + "step": 12329 + }, + { + "epoch": 0.9281318805397166, + "grad_norm": 5.051713943481445, + "learning_rate": 1.4004215329406933e-06, + "loss": 1.7368, + "step": 12330 + }, + { + "epoch": 0.9282071548203767, + "grad_norm": 4.86163330078125, + "learning_rate": 1.3975579264443872e-06, + "loss": 1.9365, + "step": 12331 + }, + { + "epoch": 0.9282824291010369, + "grad_norm": 4.9362921714782715, + "learning_rate": 1.3946972092238108e-06, + "loss": 1.9422, + "step": 12332 + }, + { + "epoch": 0.9283577033816971, + "grad_norm": 4.248833179473877, + "learning_rate": 1.3918393814490172e-06, + "loss": 1.639, + "step": 12333 + }, + { + "epoch": 0.9284329776623572, + "grad_norm": 4.783376693725586, + "learning_rate": 1.3889844432899035e-06, + "loss": 2.2542, + "step": 12334 + }, + { + "epoch": 0.9285082519430173, + "grad_norm": 4.6017680168151855, + "learning_rate": 1.3861323949161787e-06, + "loss": 1.9184, + "step": 12335 + }, + { + "epoch": 0.9285835262236776, + "grad_norm": 4.360620498657227, + "learning_rate": 1.3832832364973959e-06, + "loss": 1.8149, + "step": 12336 + }, + { + "epoch": 0.9286588005043377, + "grad_norm": 4.381224155426025, + "learning_rate": 1.3804369682029362e-06, + "loss": 2.1166, + "step": 12337 + }, + { + "epoch": 0.9287340747849978, + "grad_norm": 4.172545909881592, + "learning_rate": 1.377593590201981e-06, + "loss": 1.7075, + "step": 12338 + }, + { + "epoch": 0.928809349065658, + "grad_norm": 5.277780532836914, + "learning_rate": 1.3747531026635841e-06, + "loss": 1.8819, + "step": 12339 + }, + { + "epoch": 0.9288846233463182, + "grad_norm": 5.23681640625, + "learning_rate": 1.371915505756588e-06, + "loss": 1.9712, + "step": 12340 + }, + { + "epoch": 0.9289598976269783, + "grad_norm": 5.093683242797852, + "learning_rate": 1.3690807996496858e-06, + "loss": 1.9324, + "step": 12341 + }, + { + "epoch": 0.9290351719076385, + "grad_norm": 4.756047248840332, + "learning_rate": 1.3662489845113923e-06, + "loss": 1.7555, + "step": 12342 + }, + { + "epoch": 0.9291104461882986, + "grad_norm": 6.883366107940674, + "learning_rate": 1.3634200605100511e-06, + "loss": 1.8412, + "step": 12343 + }, + { + "epoch": 0.9291857204689588, + "grad_norm": 4.924742698669434, + "learning_rate": 1.3605940278138328e-06, + "loss": 2.1506, + "step": 12344 + }, + { + "epoch": 0.9292609947496189, + "grad_norm": 4.9706292152404785, + "learning_rate": 1.3577708865907424e-06, + "loss": 1.8266, + "step": 12345 + }, + { + "epoch": 0.9293362690302791, + "grad_norm": 5.175417900085449, + "learning_rate": 1.3549506370085952e-06, + "loss": 2.2428, + "step": 12346 + }, + { + "epoch": 0.9294115433109392, + "grad_norm": 4.169736862182617, + "learning_rate": 1.3521332792350627e-06, + "loss": 1.7014, + "step": 12347 + }, + { + "epoch": 0.9294868175915993, + "grad_norm": 4.357666969299316, + "learning_rate": 1.3493188134376112e-06, + "loss": 1.7887, + "step": 12348 + }, + { + "epoch": 0.9295620918722596, + "grad_norm": 5.783558368682861, + "learning_rate": 1.3465072397835733e-06, + "loss": 1.9423, + "step": 12349 + }, + { + "epoch": 0.9296373661529197, + "grad_norm": 4.634674072265625, + "learning_rate": 1.3436985584400652e-06, + "loss": 1.6322, + "step": 12350 + }, + { + "epoch": 0.9297126404335798, + "grad_norm": 5.010001182556152, + "learning_rate": 1.3408927695740703e-06, + "loss": 1.8918, + "step": 12351 + }, + { + "epoch": 0.92978791471424, + "grad_norm": 5.452240467071533, + "learning_rate": 1.3380898733523828e-06, + "loss": 1.6482, + "step": 12352 + }, + { + "epoch": 0.9298631889949002, + "grad_norm": 5.481477737426758, + "learning_rate": 1.335289869941625e-06, + "loss": 1.6214, + "step": 12353 + }, + { + "epoch": 0.9299384632755603, + "grad_norm": 4.558218955993652, + "learning_rate": 1.3324927595082526e-06, + "loss": 1.4869, + "step": 12354 + }, + { + "epoch": 0.9300137375562205, + "grad_norm": 5.196902275085449, + "learning_rate": 1.3296985422185383e-06, + "loss": 1.5518, + "step": 12355 + }, + { + "epoch": 0.9300890118368806, + "grad_norm": 5.532809734344482, + "learning_rate": 1.3269072182386044e-06, + "loss": 1.9082, + "step": 12356 + }, + { + "epoch": 0.9301642861175408, + "grad_norm": 4.873679161071777, + "learning_rate": 1.3241187877343687e-06, + "loss": 1.3061, + "step": 12357 + }, + { + "epoch": 0.930239560398201, + "grad_norm": 4.478184700012207, + "learning_rate": 1.3213332508716093e-06, + "loss": 1.8496, + "step": 12358 + }, + { + "epoch": 0.9303148346788611, + "grad_norm": 4.214957237243652, + "learning_rate": 1.318550607815905e-06, + "loss": 2.0773, + "step": 12359 + }, + { + "epoch": 0.9303901089595212, + "grad_norm": 4.268503665924072, + "learning_rate": 1.3157708587326955e-06, + "loss": 1.8211, + "step": 12360 + }, + { + "epoch": 0.9304653832401815, + "grad_norm": 4.271566867828369, + "learning_rate": 1.31299400378721e-06, + "loss": 1.7975, + "step": 12361 + }, + { + "epoch": 0.9305406575208416, + "grad_norm": 4.710085391998291, + "learning_rate": 1.3102200431445332e-06, + "loss": 1.9764, + "step": 12362 + }, + { + "epoch": 0.9306159318015017, + "grad_norm": 3.784324884414673, + "learning_rate": 1.307448976969572e-06, + "loss": 1.7899, + "step": 12363 + }, + { + "epoch": 0.9306912060821618, + "grad_norm": 6.107129096984863, + "learning_rate": 1.3046808054270498e-06, + "loss": 2.2463, + "step": 12364 + }, + { + "epoch": 0.9307664803628221, + "grad_norm": 5.740386009216309, + "learning_rate": 1.3019155286815354e-06, + "loss": 1.9128, + "step": 12365 + }, + { + "epoch": 0.9308417546434822, + "grad_norm": 5.818799018859863, + "learning_rate": 1.299153146897414e-06, + "loss": 1.543, + "step": 12366 + }, + { + "epoch": 0.9309170289241423, + "grad_norm": 6.177310943603516, + "learning_rate": 1.2963936602388926e-06, + "loss": 1.5635, + "step": 12367 + }, + { + "epoch": 0.9309923032048025, + "grad_norm": 4.254126071929932, + "learning_rate": 1.293637068870024e-06, + "loss": 1.6423, + "step": 12368 + }, + { + "epoch": 0.9310675774854626, + "grad_norm": 4.227095127105713, + "learning_rate": 1.2908833729546822e-06, + "loss": 1.7641, + "step": 12369 + }, + { + "epoch": 0.9311428517661228, + "grad_norm": 4.562884330749512, + "learning_rate": 1.288132572656553e-06, + "loss": 1.5985, + "step": 12370 + }, + { + "epoch": 0.931218126046783, + "grad_norm": 5.227180480957031, + "learning_rate": 1.2853846681391835e-06, + "loss": 2.0847, + "step": 12371 + }, + { + "epoch": 0.9312934003274431, + "grad_norm": 3.9266233444213867, + "learning_rate": 1.2826396595659097e-06, + "loss": 1.879, + "step": 12372 + }, + { + "epoch": 0.9313686746081032, + "grad_norm": 5.308877944946289, + "learning_rate": 1.2798975470999286e-06, + "loss": 1.6638, + "step": 12373 + }, + { + "epoch": 0.9314439488887635, + "grad_norm": 6.199954509735107, + "learning_rate": 1.2771583309042378e-06, + "loss": 1.7566, + "step": 12374 + }, + { + "epoch": 0.9315192231694236, + "grad_norm": 4.5094523429870605, + "learning_rate": 1.2744220111416793e-06, + "loss": 1.8317, + "step": 12375 + }, + { + "epoch": 0.9315944974500837, + "grad_norm": 4.7906293869018555, + "learning_rate": 1.2716885879749286e-06, + "loss": 1.8685, + "step": 12376 + }, + { + "epoch": 0.931669771730744, + "grad_norm": 4.097645282745361, + "learning_rate": 1.2689580615664775e-06, + "loss": 1.6988, + "step": 12377 + }, + { + "epoch": 0.9317450460114041, + "grad_norm": 3.911189079284668, + "learning_rate": 1.2662304320786412e-06, + "loss": 1.844, + "step": 12378 + }, + { + "epoch": 0.9318203202920642, + "grad_norm": 3.939239501953125, + "learning_rate": 1.2635056996735783e-06, + "loss": 1.7582, + "step": 12379 + }, + { + "epoch": 0.9318955945727244, + "grad_norm": 5.424371719360352, + "learning_rate": 1.2607838645132486e-06, + "loss": 1.7457, + "step": 12380 + }, + { + "epoch": 0.9319708688533845, + "grad_norm": 4.451361656188965, + "learning_rate": 1.2580649267594835e-06, + "loss": 1.3664, + "step": 12381 + }, + { + "epoch": 0.9320461431340447, + "grad_norm": 5.039329528808594, + "learning_rate": 1.2553488865738927e-06, + "loss": 2.0343, + "step": 12382 + }, + { + "epoch": 0.9321214174147048, + "grad_norm": 6.252554893493652, + "learning_rate": 1.2526357441179527e-06, + "loss": 1.78, + "step": 12383 + }, + { + "epoch": 0.932196691695365, + "grad_norm": 5.426802635192871, + "learning_rate": 1.2499254995529397e-06, + "loss": 1.4672, + "step": 12384 + }, + { + "epoch": 0.9322719659760251, + "grad_norm": 5.803450107574463, + "learning_rate": 1.247218153039975e-06, + "loss": 1.8442, + "step": 12385 + }, + { + "epoch": 0.9323472402566853, + "grad_norm": 4.301825046539307, + "learning_rate": 1.2445137047400079e-06, + "loss": 2.0996, + "step": 12386 + }, + { + "epoch": 0.9324225145373455, + "grad_norm": 4.769915580749512, + "learning_rate": 1.241812154813804e-06, + "loss": 2.3025, + "step": 12387 + }, + { + "epoch": 0.9324977888180056, + "grad_norm": 5.4182353019714355, + "learning_rate": 1.239113503421968e-06, + "loss": 1.82, + "step": 12388 + }, + { + "epoch": 0.9325730630986657, + "grad_norm": 5.053954601287842, + "learning_rate": 1.2364177507249219e-06, + "loss": 1.7181, + "step": 12389 + }, + { + "epoch": 0.932648337379326, + "grad_norm": 5.407834529876709, + "learning_rate": 1.2337248968829262e-06, + "loss": 2.1727, + "step": 12390 + }, + { + "epoch": 0.9327236116599861, + "grad_norm": 3.902431011199951, + "learning_rate": 1.231034942056053e-06, + "loss": 1.7494, + "step": 12391 + }, + { + "epoch": 0.9327988859406462, + "grad_norm": 4.74291467666626, + "learning_rate": 1.2283478864042243e-06, + "loss": 2.0203, + "step": 12392 + }, + { + "epoch": 0.9328741602213064, + "grad_norm": 4.747903347015381, + "learning_rate": 1.2256637300871621e-06, + "loss": 1.5169, + "step": 12393 + }, + { + "epoch": 0.9329494345019665, + "grad_norm": 4.57589054107666, + "learning_rate": 1.22298247326445e-06, + "loss": 2.0302, + "step": 12394 + }, + { + "epoch": 0.9330247087826267, + "grad_norm": 5.166066646575928, + "learning_rate": 1.220304116095472e-06, + "loss": 1.7973, + "step": 12395 + }, + { + "epoch": 0.9330999830632869, + "grad_norm": 3.6988258361816406, + "learning_rate": 1.2176286587394447e-06, + "loss": 1.4378, + "step": 12396 + }, + { + "epoch": 0.933175257343947, + "grad_norm": 4.444631576538086, + "learning_rate": 1.2149561013554245e-06, + "loss": 1.6537, + "step": 12397 + }, + { + "epoch": 0.9332505316246071, + "grad_norm": 4.651283264160156, + "learning_rate": 1.2122864441022896e-06, + "loss": 1.459, + "step": 12398 + }, + { + "epoch": 0.9333258059052674, + "grad_norm": 4.353204727172852, + "learning_rate": 1.20961968713873e-06, + "loss": 1.6164, + "step": 12399 + }, + { + "epoch": 0.9334010801859275, + "grad_norm": 5.270798206329346, + "learning_rate": 1.2069558306232853e-06, + "loss": 1.9878, + "step": 12400 + }, + { + "epoch": 0.9334763544665876, + "grad_norm": 6.042629718780518, + "learning_rate": 1.2042948747143235e-06, + "loss": 1.659, + "step": 12401 + }, + { + "epoch": 0.9335516287472477, + "grad_norm": 4.069930553436279, + "learning_rate": 1.2016368195700122e-06, + "loss": 1.5272, + "step": 12402 + }, + { + "epoch": 0.933626903027908, + "grad_norm": 4.620057106018066, + "learning_rate": 1.1989816653483755e-06, + "loss": 1.9253, + "step": 12403 + }, + { + "epoch": 0.9337021773085681, + "grad_norm": 5.32535982131958, + "learning_rate": 1.1963294122072533e-06, + "loss": 1.8514, + "step": 12404 + }, + { + "epoch": 0.9337774515892282, + "grad_norm": 4.083754539489746, + "learning_rate": 1.1936800603043141e-06, + "loss": 1.8252, + "step": 12405 + }, + { + "epoch": 0.9338527258698884, + "grad_norm": 5.511929988861084, + "learning_rate": 1.1910336097970541e-06, + "loss": 1.7242, + "step": 12406 + }, + { + "epoch": 0.9339280001505486, + "grad_norm": 4.205560684204102, + "learning_rate": 1.1883900608427924e-06, + "loss": 2.1703, + "step": 12407 + }, + { + "epoch": 0.9340032744312087, + "grad_norm": 5.5000996589660645, + "learning_rate": 1.1857494135986858e-06, + "loss": 1.522, + "step": 12408 + }, + { + "epoch": 0.9340785487118689, + "grad_norm": 5.138461589813232, + "learning_rate": 1.1831116682217203e-06, + "loss": 2.0386, + "step": 12409 + }, + { + "epoch": 0.934153822992529, + "grad_norm": 4.082030773162842, + "learning_rate": 1.1804768248686926e-06, + "loss": 1.6248, + "step": 12410 + }, + { + "epoch": 0.9342290972731891, + "grad_norm": 6.3812408447265625, + "learning_rate": 1.1778448836962385e-06, + "loss": 2.1083, + "step": 12411 + }, + { + "epoch": 0.9343043715538494, + "grad_norm": 4.852584362030029, + "learning_rate": 1.1752158448608164e-06, + "loss": 1.7771, + "step": 12412 + }, + { + "epoch": 0.9343796458345095, + "grad_norm": 5.251467704772949, + "learning_rate": 1.1725897085187231e-06, + "loss": 2.014, + "step": 12413 + }, + { + "epoch": 0.9344549201151696, + "grad_norm": 5.6337995529174805, + "learning_rate": 1.1699664748260676e-06, + "loss": 1.8374, + "step": 12414 + }, + { + "epoch": 0.9345301943958298, + "grad_norm": 5.751669883728027, + "learning_rate": 1.1673461439387967e-06, + "loss": 2.0189, + "step": 12415 + }, + { + "epoch": 0.93460546867649, + "grad_norm": 4.690957069396973, + "learning_rate": 1.1647287160126807e-06, + "loss": 1.8297, + "step": 12416 + }, + { + "epoch": 0.9346807429571501, + "grad_norm": 4.864229679107666, + "learning_rate": 1.1621141912033173e-06, + "loss": 1.7293, + "step": 12417 + }, + { + "epoch": 0.9347560172378103, + "grad_norm": 4.473308086395264, + "learning_rate": 1.1595025696661376e-06, + "loss": 2.0501, + "step": 12418 + }, + { + "epoch": 0.9348312915184704, + "grad_norm": 5.315026760101318, + "learning_rate": 1.1568938515563843e-06, + "loss": 2.2269, + "step": 12419 + }, + { + "epoch": 0.9349065657991306, + "grad_norm": 4.450246334075928, + "learning_rate": 1.1542880370291443e-06, + "loss": 1.5225, + "step": 12420 + }, + { + "epoch": 0.9349818400797907, + "grad_norm": 4.711861610412598, + "learning_rate": 1.1516851262393325e-06, + "loss": 1.8737, + "step": 12421 + }, + { + "epoch": 0.9350571143604509, + "grad_norm": 4.109829902648926, + "learning_rate": 1.1490851193416751e-06, + "loss": 1.8027, + "step": 12422 + }, + { + "epoch": 0.935132388641111, + "grad_norm": 5.545371055603027, + "learning_rate": 1.1464880164907377e-06, + "loss": 1.7376, + "step": 12423 + }, + { + "epoch": 0.9352076629217712, + "grad_norm": 4.459930419921875, + "learning_rate": 1.1438938178409187e-06, + "loss": 1.7518, + "step": 12424 + }, + { + "epoch": 0.9352829372024314, + "grad_norm": 5.083422660827637, + "learning_rate": 1.1413025235464226e-06, + "loss": 1.8885, + "step": 12425 + }, + { + "epoch": 0.9353582114830915, + "grad_norm": 5.105082988739014, + "learning_rate": 1.1387141337612983e-06, + "loss": 1.8601, + "step": 12426 + }, + { + "epoch": 0.9354334857637516, + "grad_norm": 5.203488349914551, + "learning_rate": 1.136128648639423e-06, + "loss": 1.7542, + "step": 12427 + }, + { + "epoch": 0.9355087600444119, + "grad_norm": 4.26222038269043, + "learning_rate": 1.1335460683344957e-06, + "loss": 1.7276, + "step": 12428 + }, + { + "epoch": 0.935584034325072, + "grad_norm": 6.196890830993652, + "learning_rate": 1.1309663930000324e-06, + "loss": 2.0614, + "step": 12429 + }, + { + "epoch": 0.9356593086057321, + "grad_norm": 4.4999518394470215, + "learning_rate": 1.1283896227893997e-06, + "loss": 1.6308, + "step": 12430 + }, + { + "epoch": 0.9357345828863923, + "grad_norm": 4.548105239868164, + "learning_rate": 1.1258157578557804e-06, + "loss": 2.0237, + "step": 12431 + }, + { + "epoch": 0.9358098571670525, + "grad_norm": 5.871966361999512, + "learning_rate": 1.1232447983521743e-06, + "loss": 1.6179, + "step": 12432 + }, + { + "epoch": 0.9358851314477126, + "grad_norm": 10.403579711914062, + "learning_rate": 1.1206767444314204e-06, + "loss": 1.8935, + "step": 12433 + }, + { + "epoch": 0.9359604057283728, + "grad_norm": 5.130834579467773, + "learning_rate": 1.1181115962461853e-06, + "loss": 1.8067, + "step": 12434 + }, + { + "epoch": 0.9360356800090329, + "grad_norm": 6.020187854766846, + "learning_rate": 1.1155493539489636e-06, + "loss": 1.8979, + "step": 12435 + }, + { + "epoch": 0.936110954289693, + "grad_norm": 5.908157825469971, + "learning_rate": 1.1129900176920616e-06, + "loss": 1.7154, + "step": 12436 + }, + { + "epoch": 0.9361862285703533, + "grad_norm": 6.477232933044434, + "learning_rate": 1.110433587627635e-06, + "loss": 1.7109, + "step": 12437 + }, + { + "epoch": 0.9362615028510134, + "grad_norm": 5.100631237030029, + "learning_rate": 1.1078800639076458e-06, + "loss": 1.6685, + "step": 12438 + }, + { + "epoch": 0.9363367771316735, + "grad_norm": 4.069405555725098, + "learning_rate": 1.105329446683906e-06, + "loss": 1.4602, + "step": 12439 + }, + { + "epoch": 0.9364120514123337, + "grad_norm": 4.262848854064941, + "learning_rate": 1.1027817361080273e-06, + "loss": 2.116, + "step": 12440 + }, + { + "epoch": 0.9364873256929939, + "grad_norm": 5.0627923011779785, + "learning_rate": 1.1002369323314777e-06, + "loss": 2.0571, + "step": 12441 + }, + { + "epoch": 0.936562599973654, + "grad_norm": 5.0627923011779785, + "learning_rate": 1.1002369323314777e-06, + "loss": 2.1289, + "step": 12442 + }, + { + "epoch": 0.9366378742543141, + "grad_norm": 6.708596706390381, + "learning_rate": 1.0976950355055304e-06, + "loss": 1.6543, + "step": 12443 + }, + { + "epoch": 0.9367131485349743, + "grad_norm": 5.497631072998047, + "learning_rate": 1.0951560457813036e-06, + "loss": 1.9162, + "step": 12444 + }, + { + "epoch": 0.9367884228156345, + "grad_norm": 5.114773750305176, + "learning_rate": 1.0926199633097157e-06, + "loss": 1.6112, + "step": 12445 + }, + { + "epoch": 0.9368636970962946, + "grad_norm": 4.5852131843566895, + "learning_rate": 1.090086788241551e-06, + "loss": 1.7776, + "step": 12446 + }, + { + "epoch": 0.9369389713769548, + "grad_norm": 5.624839782714844, + "learning_rate": 1.087556520727384e-06, + "loss": 1.7819, + "step": 12447 + }, + { + "epoch": 0.9370142456576149, + "grad_norm": 6.2389044761657715, + "learning_rate": 1.0850291609176387e-06, + "loss": 2.2336, + "step": 12448 + }, + { + "epoch": 0.937089519938275, + "grad_norm": 4.557095050811768, + "learning_rate": 1.0825047089625506e-06, + "loss": 2.0348, + "step": 12449 + }, + { + "epoch": 0.9371647942189353, + "grad_norm": 4.758711814880371, + "learning_rate": 1.0799831650121995e-06, + "loss": 1.8398, + "step": 12450 + }, + { + "epoch": 0.9372400684995954, + "grad_norm": 4.8402485847473145, + "learning_rate": 1.0774645292164875e-06, + "loss": 1.5905, + "step": 12451 + }, + { + "epoch": 0.9373153427802555, + "grad_norm": 4.371322154998779, + "learning_rate": 1.0749488017251286e-06, + "loss": 1.6842, + "step": 12452 + }, + { + "epoch": 0.9373906170609158, + "grad_norm": 4.830621242523193, + "learning_rate": 1.0724359826876806e-06, + "loss": 1.9591, + "step": 12453 + }, + { + "epoch": 0.9374658913415759, + "grad_norm": 6.202361583709717, + "learning_rate": 1.0699260722535242e-06, + "loss": 2.0032, + "step": 12454 + }, + { + "epoch": 0.937541165622236, + "grad_norm": 5.951354026794434, + "learning_rate": 1.0674190705718733e-06, + "loss": 1.8076, + "step": 12455 + }, + { + "epoch": 0.9376164399028962, + "grad_norm": 3.8482415676116943, + "learning_rate": 1.0649149777917532e-06, + "loss": 1.6773, + "step": 12456 + }, + { + "epoch": 0.9376917141835563, + "grad_norm": 4.696192264556885, + "learning_rate": 1.0624137940620283e-06, + "loss": 1.5832, + "step": 12457 + }, + { + "epoch": 0.9377669884642165, + "grad_norm": 5.443212032318115, + "learning_rate": 1.0599155195313847e-06, + "loss": 1.7271, + "step": 12458 + }, + { + "epoch": 0.9378422627448767, + "grad_norm": 4.137064456939697, + "learning_rate": 1.0574201543483375e-06, + "loss": 1.4819, + "step": 12459 + }, + { + "epoch": 0.9379175370255368, + "grad_norm": 7.3501691818237305, + "learning_rate": 1.0549276986612288e-06, + "loss": 2.0562, + "step": 12460 + }, + { + "epoch": 0.9379928113061969, + "grad_norm": 4.535752296447754, + "learning_rate": 1.0524381526182347e-06, + "loss": 2.1746, + "step": 12461 + }, + { + "epoch": 0.9380680855868571, + "grad_norm": 4.882367134094238, + "learning_rate": 1.0499515163673423e-06, + "loss": 1.98, + "step": 12462 + }, + { + "epoch": 0.9381433598675173, + "grad_norm": 3.5462915897369385, + "learning_rate": 1.0474677900563723e-06, + "loss": 1.5841, + "step": 12463 + }, + { + "epoch": 0.9382186341481774, + "grad_norm": 4.408980369567871, + "learning_rate": 1.0449869738329953e-06, + "loss": 1.7213, + "step": 12464 + }, + { + "epoch": 0.9382939084288375, + "grad_norm": 4.364137172698975, + "learning_rate": 1.0425090678446658e-06, + "loss": 1.7734, + "step": 12465 + }, + { + "epoch": 0.9383691827094978, + "grad_norm": 5.5675530433654785, + "learning_rate": 1.0400340722386936e-06, + "loss": 1.3926, + "step": 12466 + }, + { + "epoch": 0.9384444569901579, + "grad_norm": 5.3386664390563965, + "learning_rate": 1.0375619871622222e-06, + "loss": 1.753, + "step": 12467 + }, + { + "epoch": 0.938519731270818, + "grad_norm": 6.269412517547607, + "learning_rate": 1.0350928127622005e-06, + "loss": 1.9518, + "step": 12468 + }, + { + "epoch": 0.9385950055514782, + "grad_norm": 5.968063831329346, + "learning_rate": 1.0326265491854114e-06, + "loss": 1.8388, + "step": 12469 + }, + { + "epoch": 0.9386702798321384, + "grad_norm": 5.513698101043701, + "learning_rate": 1.0301631965784763e-06, + "loss": 1.7869, + "step": 12470 + }, + { + "epoch": 0.9387455541127985, + "grad_norm": 6.845789432525635, + "learning_rate": 1.0277027550878226e-06, + "loss": 1.8155, + "step": 12471 + }, + { + "epoch": 0.9388208283934587, + "grad_norm": 4.660444259643555, + "learning_rate": 1.0252452248597277e-06, + "loss": 2.068, + "step": 12472 + }, + { + "epoch": 0.9388961026741188, + "grad_norm": 5.427291393280029, + "learning_rate": 1.0227906060402747e-06, + "loss": 1.8706, + "step": 12473 + }, + { + "epoch": 0.938971376954779, + "grad_norm": 5.784024238586426, + "learning_rate": 1.0203388987753971e-06, + "loss": 1.9764, + "step": 12474 + }, + { + "epoch": 0.9390466512354392, + "grad_norm": 6.6440534591674805, + "learning_rate": 1.0178901032108278e-06, + "loss": 2.0299, + "step": 12475 + }, + { + "epoch": 0.9391219255160993, + "grad_norm": 5.347610950469971, + "learning_rate": 1.0154442194921455e-06, + "loss": 1.8663, + "step": 12476 + }, + { + "epoch": 0.9391971997967594, + "grad_norm": 4.229396820068359, + "learning_rate": 1.0130012477647499e-06, + "loss": 1.7663, + "step": 12477 + }, + { + "epoch": 0.9392724740774197, + "grad_norm": 4.909701347351074, + "learning_rate": 1.0105611881738808e-06, + "loss": 2.1304, + "step": 12478 + }, + { + "epoch": 0.9393477483580798, + "grad_norm": 5.29572057723999, + "learning_rate": 1.008124040864572e-06, + "loss": 1.5572, + "step": 12479 + }, + { + "epoch": 0.9394230226387399, + "grad_norm": 7.212335109710693, + "learning_rate": 1.0056898059817245e-06, + "loss": 2.0612, + "step": 12480 + }, + { + "epoch": 0.9394982969194, + "grad_norm": 6.81786584854126, + "learning_rate": 1.0032584836700277e-06, + "loss": 2.0209, + "step": 12481 + }, + { + "epoch": 0.9395735712000602, + "grad_norm": 6.089557647705078, + "learning_rate": 1.0008300740740328e-06, + "loss": 2.3121, + "step": 12482 + }, + { + "epoch": 0.9396488454807204, + "grad_norm": 6.162079811096191, + "learning_rate": 9.984045773380968e-07, + "loss": 2.0044, + "step": 12483 + }, + { + "epoch": 0.9397241197613805, + "grad_norm": 4.843301296234131, + "learning_rate": 9.95981993606404e-07, + "loss": 1.9771, + "step": 12484 + }, + { + "epoch": 0.9397993940420407, + "grad_norm": 4.402397632598877, + "learning_rate": 9.93562323022973e-07, + "loss": 1.8666, + "step": 12485 + }, + { + "epoch": 0.9398746683227008, + "grad_norm": 6.124284744262695, + "learning_rate": 9.911455657316492e-07, + "loss": 2.1549, + "step": 12486 + }, + { + "epoch": 0.939949942603361, + "grad_norm": 4.449352264404297, + "learning_rate": 9.88731721876096e-07, + "loss": 1.7591, + "step": 12487 + }, + { + "epoch": 0.9400252168840212, + "grad_norm": 5.108132839202881, + "learning_rate": 9.86320791599815e-07, + "loss": 1.3912, + "step": 12488 + }, + { + "epoch": 0.9401004911646813, + "grad_norm": 5.851089000701904, + "learning_rate": 9.839127750461308e-07, + "loss": 1.5195, + "step": 12489 + }, + { + "epoch": 0.9401757654453414, + "grad_norm": 3.6791093349456787, + "learning_rate": 9.815076723581784e-07, + "loss": 1.5425, + "step": 12490 + }, + { + "epoch": 0.9402510397260017, + "grad_norm": 10.067129135131836, + "learning_rate": 9.791054836789549e-07, + "loss": 1.6841, + "step": 12491 + }, + { + "epoch": 0.9403263140066618, + "grad_norm": 5.731894493103027, + "learning_rate": 9.767062091512514e-07, + "loss": 1.7032, + "step": 12492 + }, + { + "epoch": 0.9404015882873219, + "grad_norm": 4.008547782897949, + "learning_rate": 9.743098489177037e-07, + "loss": 1.7873, + "step": 12493 + }, + { + "epoch": 0.9404768625679821, + "grad_norm": 5.378289699554443, + "learning_rate": 9.719164031207594e-07, + "loss": 1.8939, + "step": 12494 + }, + { + "epoch": 0.9405521368486423, + "grad_norm": 4.494997978210449, + "learning_rate": 9.6952587190271e-07, + "loss": 1.6112, + "step": 12495 + }, + { + "epoch": 0.9406274111293024, + "grad_norm": 5.443055152893066, + "learning_rate": 9.671382554056585e-07, + "loss": 1.743, + "step": 12496 + }, + { + "epoch": 0.9407026854099626, + "grad_norm": 5.768131732940674, + "learning_rate": 9.647535537715524e-07, + "loss": 1.9428, + "step": 12497 + }, + { + "epoch": 0.9407779596906227, + "grad_norm": 5.022356033325195, + "learning_rate": 9.623717671421451e-07, + "loss": 1.7744, + "step": 12498 + }, + { + "epoch": 0.9408532339712828, + "grad_norm": 5.572665691375732, + "learning_rate": 9.599928956590343e-07, + "loss": 2.1578, + "step": 12499 + }, + { + "epoch": 0.940928508251943, + "grad_norm": 5.415976047515869, + "learning_rate": 9.576169394636348e-07, + "loss": 1.7101, + "step": 12500 + }, + { + "epoch": 0.9410037825326032, + "grad_norm": 4.1262969970703125, + "learning_rate": 9.55243898697189e-07, + "loss": 1.9986, + "step": 12501 + }, + { + "epoch": 0.9410790568132633, + "grad_norm": 4.997300624847412, + "learning_rate": 9.528737735007675e-07, + "loss": 1.6543, + "step": 12502 + }, + { + "epoch": 0.9411543310939234, + "grad_norm": 5.764066219329834, + "learning_rate": 9.505065640152689e-07, + "loss": 1.5804, + "step": 12503 + }, + { + "epoch": 0.9412296053745837, + "grad_norm": 4.818778038024902, + "learning_rate": 9.481422703814191e-07, + "loss": 1.9397, + "step": 12504 + }, + { + "epoch": 0.9413048796552438, + "grad_norm": 5.200023174285889, + "learning_rate": 9.457808927397616e-07, + "loss": 1.9037, + "step": 12505 + }, + { + "epoch": 0.9413801539359039, + "grad_norm": 7.462325572967529, + "learning_rate": 9.434224312306839e-07, + "loss": 1.6622, + "step": 12506 + }, + { + "epoch": 0.9414554282165641, + "grad_norm": 4.497607231140137, + "learning_rate": 9.410668859943738e-07, + "loss": 1.6973, + "step": 12507 + }, + { + "epoch": 0.9415307024972243, + "grad_norm": 4.548285484313965, + "learning_rate": 9.387142571708751e-07, + "loss": 1.7979, + "step": 12508 + }, + { + "epoch": 0.9416059767778844, + "grad_norm": 6.1044816970825195, + "learning_rate": 9.363645449000425e-07, + "loss": 1.7162, + "step": 12509 + }, + { + "epoch": 0.9416812510585446, + "grad_norm": 6.579189777374268, + "learning_rate": 9.340177493215641e-07, + "loss": 2.1044, + "step": 12510 + }, + { + "epoch": 0.9417565253392047, + "grad_norm": 5.274760723114014, + "learning_rate": 9.31673870574934e-07, + "loss": 1.8558, + "step": 12511 + }, + { + "epoch": 0.9418317996198649, + "grad_norm": 8.2307710647583, + "learning_rate": 9.293329087995073e-07, + "loss": 2.1402, + "step": 12512 + }, + { + "epoch": 0.9419070739005251, + "grad_norm": 5.09917688369751, + "learning_rate": 9.26994864134434e-07, + "loss": 1.5689, + "step": 12513 + }, + { + "epoch": 0.9419823481811852, + "grad_norm": 4.9971137046813965, + "learning_rate": 9.246597367187138e-07, + "loss": 1.9743, + "step": 12514 + }, + { + "epoch": 0.9420576224618453, + "grad_norm": 5.610584735870361, + "learning_rate": 9.22327526691158e-07, + "loss": 1.8484, + "step": 12515 + }, + { + "epoch": 0.9421328967425056, + "grad_norm": 4.722575664520264, + "learning_rate": 9.199982341904112e-07, + "loss": 1.7099, + "step": 12516 + }, + { + "epoch": 0.9422081710231657, + "grad_norm": 5.282405853271484, + "learning_rate": 9.176718593549461e-07, + "loss": 1.8279, + "step": 12517 + }, + { + "epoch": 0.9422834453038258, + "grad_norm": 4.823583602905273, + "learning_rate": 9.153484023230519e-07, + "loss": 1.7001, + "step": 12518 + }, + { + "epoch": 0.9423587195844859, + "grad_norm": 6.802163124084473, + "learning_rate": 9.13027863232857e-07, + "loss": 1.7165, + "step": 12519 + }, + { + "epoch": 0.9424339938651461, + "grad_norm": 5.163522243499756, + "learning_rate": 9.107102422223069e-07, + "loss": 1.6006, + "step": 12520 + }, + { + "epoch": 0.9425092681458063, + "grad_norm": 5.192678928375244, + "learning_rate": 9.083955394291799e-07, + "loss": 1.8156, + "step": 12521 + }, + { + "epoch": 0.9425845424264664, + "grad_norm": 4.587754249572754, + "learning_rate": 9.060837549910772e-07, + "loss": 1.5643, + "step": 12522 + }, + { + "epoch": 0.9426598167071266, + "grad_norm": 5.609636306762695, + "learning_rate": 9.03774889045439e-07, + "loss": 2.2141, + "step": 12523 + }, + { + "epoch": 0.9427350909877867, + "grad_norm": 4.699139595031738, + "learning_rate": 9.014689417294997e-07, + "loss": 1.9678, + "step": 12524 + }, + { + "epoch": 0.9428103652684469, + "grad_norm": 4.728618144989014, + "learning_rate": 8.99165913180361e-07, + "loss": 1.6611, + "step": 12525 + }, + { + "epoch": 0.9428856395491071, + "grad_norm": 6.487859725952148, + "learning_rate": 8.968658035349131e-07, + "loss": 1.9741, + "step": 12526 + }, + { + "epoch": 0.9429609138297672, + "grad_norm": 6.679712772369385, + "learning_rate": 8.94568612929908e-07, + "loss": 1.6433, + "step": 12527 + }, + { + "epoch": 0.9430361881104273, + "grad_norm": 5.824100971221924, + "learning_rate": 8.922743415018975e-07, + "loss": 2.2038, + "step": 12528 + }, + { + "epoch": 0.9431114623910876, + "grad_norm": 4.816535949707031, + "learning_rate": 8.899829893872724e-07, + "loss": 1.5266, + "step": 12529 + }, + { + "epoch": 0.9431867366717477, + "grad_norm": 4.589961051940918, + "learning_rate": 8.876945567222461e-07, + "loss": 1.6233, + "step": 12530 + }, + { + "epoch": 0.9432620109524078, + "grad_norm": 4.767477989196777, + "learning_rate": 8.854090436428542e-07, + "loss": 1.9435, + "step": 12531 + }, + { + "epoch": 0.943337285233068, + "grad_norm": 3.601264476776123, + "learning_rate": 8.831264502849712e-07, + "loss": 1.5987, + "step": 12532 + }, + { + "epoch": 0.9434125595137282, + "grad_norm": 4.306880950927734, + "learning_rate": 8.808467767842887e-07, + "loss": 1.5511, + "step": 12533 + }, + { + "epoch": 0.9434878337943883, + "grad_norm": 4.400055408477783, + "learning_rate": 8.785700232763316e-07, + "loss": 2.0112, + "step": 12534 + }, + { + "epoch": 0.9435631080750485, + "grad_norm": 5.374870300292969, + "learning_rate": 8.762961898964362e-07, + "loss": 1.935, + "step": 12535 + }, + { + "epoch": 0.9436383823557086, + "grad_norm": 5.5986175537109375, + "learning_rate": 8.740252767797885e-07, + "loss": 1.8258, + "step": 12536 + }, + { + "epoch": 0.9437136566363687, + "grad_norm": 5.453594207763672, + "learning_rate": 8.717572840613697e-07, + "loss": 1.5967, + "step": 12537 + }, + { + "epoch": 0.943788930917029, + "grad_norm": 4.611512660980225, + "learning_rate": 8.694922118760273e-07, + "loss": 1.7153, + "step": 12538 + }, + { + "epoch": 0.9438642051976891, + "grad_norm": 5.214768886566162, + "learning_rate": 8.672300603583927e-07, + "loss": 1.7282, + "step": 12539 + }, + { + "epoch": 0.9439394794783492, + "grad_norm": 4.61006498336792, + "learning_rate": 8.649708296429526e-07, + "loss": 1.6251, + "step": 12540 + }, + { + "epoch": 0.9440147537590093, + "grad_norm": 3.7257626056671143, + "learning_rate": 8.627145198640163e-07, + "loss": 1.4965, + "step": 12541 + }, + { + "epoch": 0.9440900280396696, + "grad_norm": 4.95784330368042, + "learning_rate": 8.604611311557098e-07, + "loss": 1.5681, + "step": 12542 + }, + { + "epoch": 0.9441653023203297, + "grad_norm": 5.176828861236572, + "learning_rate": 8.582106636519871e-07, + "loss": 1.8184, + "step": 12543 + }, + { + "epoch": 0.9442405766009898, + "grad_norm": 4.349226951599121, + "learning_rate": 8.559631174866467e-07, + "loss": 1.6727, + "step": 12544 + }, + { + "epoch": 0.94431585088165, + "grad_norm": 6.516700267791748, + "learning_rate": 8.537184927932818e-07, + "loss": 1.6811, + "step": 12545 + }, + { + "epoch": 0.9443911251623102, + "grad_norm": 5.723232269287109, + "learning_rate": 8.514767897053411e-07, + "loss": 1.9636, + "step": 12546 + }, + { + "epoch": 0.9444663994429703, + "grad_norm": 4.648702621459961, + "learning_rate": 8.492380083560736e-07, + "loss": 1.9871, + "step": 12547 + }, + { + "epoch": 0.9445416737236305, + "grad_norm": 4.369608402252197, + "learning_rate": 8.470021488785839e-07, + "loss": 1.9944, + "step": 12548 + }, + { + "epoch": 0.9446169480042906, + "grad_norm": 4.019176483154297, + "learning_rate": 8.447692114057826e-07, + "loss": 2.082, + "step": 12549 + }, + { + "epoch": 0.9446922222849508, + "grad_norm": 4.5343337059021, + "learning_rate": 8.425391960704021e-07, + "loss": 1.9889, + "step": 12550 + }, + { + "epoch": 0.944767496565611, + "grad_norm": 6.490811347961426, + "learning_rate": 8.403121030050254e-07, + "loss": 1.9175, + "step": 12551 + }, + { + "epoch": 0.9448427708462711, + "grad_norm": 6.407595157623291, + "learning_rate": 8.380879323420354e-07, + "loss": 1.9267, + "step": 12552 + }, + { + "epoch": 0.9449180451269312, + "grad_norm": 3.8672337532043457, + "learning_rate": 8.358666842136542e-07, + "loss": 1.5596, + "step": 12553 + }, + { + "epoch": 0.9449933194075915, + "grad_norm": 5.161031246185303, + "learning_rate": 8.336483587519317e-07, + "loss": 1.9057, + "step": 12554 + }, + { + "epoch": 0.9450685936882516, + "grad_norm": 4.153258323669434, + "learning_rate": 8.314329560887458e-07, + "loss": 1.5539, + "step": 12555 + }, + { + "epoch": 0.9451438679689117, + "grad_norm": 4.513288974761963, + "learning_rate": 8.292204763557854e-07, + "loss": 1.6304, + "step": 12556 + }, + { + "epoch": 0.9452191422495719, + "grad_norm": 5.3652873039245605, + "learning_rate": 8.270109196845843e-07, + "loss": 1.8327, + "step": 12557 + }, + { + "epoch": 0.945294416530232, + "grad_norm": 6.923835754394531, + "learning_rate": 8.248042862064875e-07, + "loss": 1.9938, + "step": 12558 + }, + { + "epoch": 0.9453696908108922, + "grad_norm": 6.175401210784912, + "learning_rate": 8.226005760526789e-07, + "loss": 1.8228, + "step": 12559 + }, + { + "epoch": 0.9454449650915523, + "grad_norm": 4.121747970581055, + "learning_rate": 8.203997893541593e-07, + "loss": 2.1208, + "step": 12560 + }, + { + "epoch": 0.9455202393722125, + "grad_norm": 4.972888469696045, + "learning_rate": 8.182019262417628e-07, + "loss": 1.8324, + "step": 12561 + }, + { + "epoch": 0.9455955136528726, + "grad_norm": 6.506134033203125, + "learning_rate": 8.160069868461406e-07, + "loss": 1.5009, + "step": 12562 + }, + { + "epoch": 0.9456707879335328, + "grad_norm": 6.048686504364014, + "learning_rate": 8.138149712977716e-07, + "loss": 1.6118, + "step": 12563 + }, + { + "epoch": 0.945746062214193, + "grad_norm": 4.6864237785339355, + "learning_rate": 8.116258797269793e-07, + "loss": 1.6212, + "step": 12564 + }, + { + "epoch": 0.9458213364948531, + "grad_norm": 4.848001003265381, + "learning_rate": 8.09439712263893e-07, + "loss": 1.7511, + "step": 12565 + }, + { + "epoch": 0.9458966107755132, + "grad_norm": 6.427381992340088, + "learning_rate": 8.072564690384643e-07, + "loss": 1.7889, + "step": 12566 + }, + { + "epoch": 0.9459718850561735, + "grad_norm": 6.031688213348389, + "learning_rate": 8.050761501804949e-07, + "loss": 2.2765, + "step": 12567 + }, + { + "epoch": 0.9460471593368336, + "grad_norm": 5.7715840339660645, + "learning_rate": 8.028987558195922e-07, + "loss": 2.0169, + "step": 12568 + }, + { + "epoch": 0.9461224336174937, + "grad_norm": 6.054158687591553, + "learning_rate": 8.007242860851971e-07, + "loss": 1.7385, + "step": 12569 + }, + { + "epoch": 0.9461977078981539, + "grad_norm": 5.718628883361816, + "learning_rate": 7.98552741106573e-07, + "loss": 1.6291, + "step": 12570 + }, + { + "epoch": 0.9462729821788141, + "grad_norm": 4.007753849029541, + "learning_rate": 7.963841210128109e-07, + "loss": 1.6528, + "step": 12571 + }, + { + "epoch": 0.9463482564594742, + "grad_norm": 5.296968460083008, + "learning_rate": 7.942184259328356e-07, + "loss": 2.1921, + "step": 12572 + }, + { + "epoch": 0.9464235307401344, + "grad_norm": 4.816549301147461, + "learning_rate": 7.920556559953884e-07, + "loss": 1.7772, + "step": 12573 + }, + { + "epoch": 0.9464988050207945, + "grad_norm": 4.9603095054626465, + "learning_rate": 7.898958113290389e-07, + "loss": 1.7354, + "step": 12574 + }, + { + "epoch": 0.9465740793014547, + "grad_norm": 4.698737621307373, + "learning_rate": 7.877388920621786e-07, + "loss": 1.918, + "step": 12575 + }, + { + "epoch": 0.9466493535821149, + "grad_norm": 6.573275566101074, + "learning_rate": 7.855848983230385e-07, + "loss": 1.7081, + "step": 12576 + }, + { + "epoch": 0.946724627862775, + "grad_norm": 5.029136657714844, + "learning_rate": 7.834338302396604e-07, + "loss": 1.5041, + "step": 12577 + }, + { + "epoch": 0.9467999021434351, + "grad_norm": 5.33416748046875, + "learning_rate": 7.812856879399311e-07, + "loss": 1.5731, + "step": 12578 + }, + { + "epoch": 0.9468751764240952, + "grad_norm": 4.616922855377197, + "learning_rate": 7.791404715515372e-07, + "loss": 2.302, + "step": 12579 + }, + { + "epoch": 0.9469504507047555, + "grad_norm": 4.990634918212891, + "learning_rate": 7.7699818120201e-07, + "loss": 1.7495, + "step": 12580 + }, + { + "epoch": 0.9470257249854156, + "grad_norm": 4.4894866943359375, + "learning_rate": 7.748588170187087e-07, + "loss": 1.8907, + "step": 12581 + }, + { + "epoch": 0.9471009992660757, + "grad_norm": 4.430594444274902, + "learning_rate": 7.727223791288097e-07, + "loss": 2.0104, + "step": 12582 + }, + { + "epoch": 0.947176273546736, + "grad_norm": 4.7303619384765625, + "learning_rate": 7.705888676593165e-07, + "loss": 1.7304, + "step": 12583 + }, + { + "epoch": 0.9472515478273961, + "grad_norm": 4.863260269165039, + "learning_rate": 7.684582827370557e-07, + "loss": 2.0688, + "step": 12584 + }, + { + "epoch": 0.9473268221080562, + "grad_norm": 5.808121204376221, + "learning_rate": 7.663306244886925e-07, + "loss": 1.7594, + "step": 12585 + }, + { + "epoch": 0.9474020963887164, + "grad_norm": 4.721111297607422, + "learning_rate": 7.642058930406981e-07, + "loss": 1.5769, + "step": 12586 + }, + { + "epoch": 0.9474773706693765, + "grad_norm": 3.897254705429077, + "learning_rate": 7.620840885193992e-07, + "loss": 1.6152, + "step": 12587 + }, + { + "epoch": 0.9475526449500367, + "grad_norm": 5.926482677459717, + "learning_rate": 7.59965211050917e-07, + "loss": 1.8351, + "step": 12588 + }, + { + "epoch": 0.9476279192306969, + "grad_norm": 6.151461124420166, + "learning_rate": 7.57849260761223e-07, + "loss": 1.8595, + "step": 12589 + }, + { + "epoch": 0.947703193511357, + "grad_norm": 9.226770401000977, + "learning_rate": 7.55736237776089e-07, + "loss": 1.828, + "step": 12590 + }, + { + "epoch": 0.9477784677920171, + "grad_norm": 4.510571479797363, + "learning_rate": 7.536261422211477e-07, + "loss": 2.0181, + "step": 12591 + }, + { + "epoch": 0.9478537420726774, + "grad_norm": 5.621356010437012, + "learning_rate": 7.515189742218265e-07, + "loss": 1.7265, + "step": 12592 + }, + { + "epoch": 0.9479290163533375, + "grad_norm": 4.869679927825928, + "learning_rate": 7.494147339033919e-07, + "loss": 1.6226, + "step": 12593 + }, + { + "epoch": 0.9480042906339976, + "grad_norm": 5.842344284057617, + "learning_rate": 7.473134213909327e-07, + "loss": 2.1404, + "step": 12594 + }, + { + "epoch": 0.9480795649146578, + "grad_norm": 3.7453935146331787, + "learning_rate": 7.452150368093713e-07, + "loss": 1.3438, + "step": 12595 + }, + { + "epoch": 0.948154839195318, + "grad_norm": 4.883222579956055, + "learning_rate": 7.431195802834468e-07, + "loss": 2.0852, + "step": 12596 + }, + { + "epoch": 0.9482301134759781, + "grad_norm": 4.403371810913086, + "learning_rate": 7.410270519377316e-07, + "loss": 1.6844, + "step": 12597 + }, + { + "epoch": 0.9483053877566382, + "grad_norm": 5.2157111167907715, + "learning_rate": 7.389374518966152e-07, + "loss": 1.5479, + "step": 12598 + }, + { + "epoch": 0.9483806620372984, + "grad_norm": 4.309238433837891, + "learning_rate": 7.368507802843205e-07, + "loss": 2.2691, + "step": 12599 + }, + { + "epoch": 0.9484559363179585, + "grad_norm": 5.366034984588623, + "learning_rate": 7.347670372249038e-07, + "loss": 1.8283, + "step": 12600 + }, + { + "epoch": 0.9485312105986187, + "grad_norm": 6.914779186248779, + "learning_rate": 7.326862228422216e-07, + "loss": 1.6681, + "step": 12601 + }, + { + "epoch": 0.9486064848792789, + "grad_norm": 4.132501125335693, + "learning_rate": 7.306083372599804e-07, + "loss": 1.9501, + "step": 12602 + }, + { + "epoch": 0.948681759159939, + "grad_norm": 4.667428016662598, + "learning_rate": 7.285333806017036e-07, + "loss": 1.6994, + "step": 12603 + }, + { + "epoch": 0.9487570334405991, + "grad_norm": 5.719582557678223, + "learning_rate": 7.264613529907482e-07, + "loss": 1.8735, + "step": 12604 + }, + { + "epoch": 0.9488323077212594, + "grad_norm": 4.037753105163574, + "learning_rate": 7.243922545502768e-07, + "loss": 1.731, + "step": 12605 + }, + { + "epoch": 0.9489075820019195, + "grad_norm": 6.50754451751709, + "learning_rate": 7.223260854033021e-07, + "loss": 1.7722, + "step": 12606 + }, + { + "epoch": 0.9489828562825796, + "grad_norm": 4.7723917961120605, + "learning_rate": 7.202628456726423e-07, + "loss": 1.632, + "step": 12607 + }, + { + "epoch": 0.9490581305632398, + "grad_norm": 4.3687334060668945, + "learning_rate": 7.182025354809607e-07, + "loss": 1.979, + "step": 12608 + }, + { + "epoch": 0.9491334048439, + "grad_norm": 5.293466091156006, + "learning_rate": 7.161451549507314e-07, + "loss": 1.916, + "step": 12609 + }, + { + "epoch": 0.9492086791245601, + "grad_norm": 5.199947834014893, + "learning_rate": 7.140907042042621e-07, + "loss": 1.5453, + "step": 12610 + }, + { + "epoch": 0.9492839534052203, + "grad_norm": 5.563015460968018, + "learning_rate": 7.12039183363683e-07, + "loss": 1.8874, + "step": 12611 + }, + { + "epoch": 0.9493592276858804, + "grad_norm": 5.206107139587402, + "learning_rate": 7.09990592550952e-07, + "loss": 1.9164, + "step": 12612 + }, + { + "epoch": 0.9494345019665406, + "grad_norm": 5.016112327575684, + "learning_rate": 7.07944931887855e-07, + "loss": 1.7887, + "step": 12613 + }, + { + "epoch": 0.9495097762472008, + "grad_norm": 3.691054344177246, + "learning_rate": 7.059022014959893e-07, + "loss": 1.7715, + "step": 12614 + }, + { + "epoch": 0.9495850505278609, + "grad_norm": 6.079201698303223, + "learning_rate": 7.03862401496802e-07, + "loss": 1.7812, + "step": 12615 + }, + { + "epoch": 0.949660324808521, + "grad_norm": 4.421444892883301, + "learning_rate": 7.018255320115519e-07, + "loss": 1.7483, + "step": 12616 + }, + { + "epoch": 0.9497355990891813, + "grad_norm": 5.439821720123291, + "learning_rate": 6.997915931613197e-07, + "loss": 1.6121, + "step": 12617 + }, + { + "epoch": 0.9498108733698414, + "grad_norm": 4.601471424102783, + "learning_rate": 6.977605850670144e-07, + "loss": 1.4847, + "step": 12618 + }, + { + "epoch": 0.9498861476505015, + "grad_norm": 5.457115650177002, + "learning_rate": 6.957325078493837e-07, + "loss": 1.8872, + "step": 12619 + }, + { + "epoch": 0.9499614219311616, + "grad_norm": 6.230499267578125, + "learning_rate": 6.937073616289813e-07, + "loss": 1.9586, + "step": 12620 + }, + { + "epoch": 0.9500366962118219, + "grad_norm": 4.610445499420166, + "learning_rate": 6.916851465261997e-07, + "loss": 1.812, + "step": 12621 + }, + { + "epoch": 0.950111970492482, + "grad_norm": 4.16820764541626, + "learning_rate": 6.89665862661254e-07, + "loss": 2.0936, + "step": 12622 + }, + { + "epoch": 0.9501872447731421, + "grad_norm": 4.270386695861816, + "learning_rate": 6.876495101541924e-07, + "loss": 1.9136, + "step": 12623 + }, + { + "epoch": 0.9502625190538023, + "grad_norm": 4.340514659881592, + "learning_rate": 6.856360891248692e-07, + "loss": 1.9056, + "step": 12624 + }, + { + "epoch": 0.9503377933344624, + "grad_norm": 5.450936794281006, + "learning_rate": 6.836255996929886e-07, + "loss": 1.7858, + "step": 12625 + }, + { + "epoch": 0.9504130676151226, + "grad_norm": 5.421398639678955, + "learning_rate": 6.816180419780493e-07, + "loss": 1.8694, + "step": 12626 + }, + { + "epoch": 0.9504883418957828, + "grad_norm": 4.780355930328369, + "learning_rate": 6.796134160994172e-07, + "loss": 1.6679, + "step": 12627 + }, + { + "epoch": 0.9505636161764429, + "grad_norm": 4.167706489562988, + "learning_rate": 6.776117221762468e-07, + "loss": 1.5848, + "step": 12628 + }, + { + "epoch": 0.950638890457103, + "grad_norm": 4.704986095428467, + "learning_rate": 6.756129603275374e-07, + "loss": 1.478, + "step": 12629 + }, + { + "epoch": 0.9507141647377633, + "grad_norm": 6.16221284866333, + "learning_rate": 6.736171306721162e-07, + "loss": 1.8761, + "step": 12630 + }, + { + "epoch": 0.9507894390184234, + "grad_norm": 5.12618350982666, + "learning_rate": 6.71624233328616e-07, + "loss": 2.1902, + "step": 12631 + }, + { + "epoch": 0.9508647132990835, + "grad_norm": 5.246209144592285, + "learning_rate": 6.696342684155199e-07, + "loss": 1.9131, + "step": 12632 + }, + { + "epoch": 0.9509399875797437, + "grad_norm": 3.4327824115753174, + "learning_rate": 6.676472360511165e-07, + "loss": 1.6988, + "step": 12633 + }, + { + "epoch": 0.9510152618604039, + "grad_norm": 4.417757511138916, + "learning_rate": 6.656631363535448e-07, + "loss": 1.4728, + "step": 12634 + }, + { + "epoch": 0.951090536141064, + "grad_norm": 6.891496658325195, + "learning_rate": 6.636819694407382e-07, + "loss": 1.6197, + "step": 12635 + }, + { + "epoch": 0.9511658104217242, + "grad_norm": 5.932455062866211, + "learning_rate": 6.617037354304745e-07, + "loss": 1.8473, + "step": 12636 + }, + { + "epoch": 0.9512410847023843, + "grad_norm": 4.860350131988525, + "learning_rate": 6.597284344403599e-07, + "loss": 1.4481, + "step": 12637 + }, + { + "epoch": 0.9513163589830445, + "grad_norm": 4.502140522003174, + "learning_rate": 6.577560665878169e-07, + "loss": 1.8193, + "step": 12638 + }, + { + "epoch": 0.9513916332637046, + "grad_norm": 5.460235118865967, + "learning_rate": 6.557866319900907e-07, + "loss": 1.9064, + "step": 12639 + }, + { + "epoch": 0.9514669075443648, + "grad_norm": 4.415683269500732, + "learning_rate": 6.538201307642711e-07, + "loss": 1.6909, + "step": 12640 + }, + { + "epoch": 0.9515421818250249, + "grad_norm": 4.433505535125732, + "learning_rate": 6.518565630272533e-07, + "loss": 1.9376, + "step": 12641 + }, + { + "epoch": 0.951617456105685, + "grad_norm": 5.381619453430176, + "learning_rate": 6.498959288957662e-07, + "loss": 1.3601, + "step": 12642 + }, + { + "epoch": 0.9516927303863453, + "grad_norm": 6.529528617858887, + "learning_rate": 6.479382284863611e-07, + "loss": 2.0831, + "step": 12643 + }, + { + "epoch": 0.9517680046670054, + "grad_norm": 4.280087947845459, + "learning_rate": 6.459834619154282e-07, + "loss": 1.6036, + "step": 12644 + }, + { + "epoch": 0.9518432789476655, + "grad_norm": 4.9408278465271, + "learning_rate": 6.440316292991633e-07, + "loss": 2.0676, + "step": 12645 + }, + { + "epoch": 0.9519185532283257, + "grad_norm": 4.629851341247559, + "learning_rate": 6.420827307535959e-07, + "loss": 1.767, + "step": 12646 + }, + { + "epoch": 0.9519938275089859, + "grad_norm": 4.1766276359558105, + "learning_rate": 6.401367663945945e-07, + "loss": 1.4504, + "step": 12647 + }, + { + "epoch": 0.952069101789646, + "grad_norm": 4.166589736938477, + "learning_rate": 6.381937363378277e-07, + "loss": 1.5146, + "step": 12648 + }, + { + "epoch": 0.9521443760703062, + "grad_norm": 5.057321071624756, + "learning_rate": 6.362536406988084e-07, + "loss": 1.7973, + "step": 12649 + }, + { + "epoch": 0.9522196503509663, + "grad_norm": 4.526610851287842, + "learning_rate": 6.343164795928724e-07, + "loss": 1.5239, + "step": 12650 + }, + { + "epoch": 0.9522949246316265, + "grad_norm": 5.0684661865234375, + "learning_rate": 6.323822531351775e-07, + "loss": 1.8554, + "step": 12651 + }, + { + "epoch": 0.9523701989122867, + "grad_norm": 4.20596170425415, + "learning_rate": 6.304509614406984e-07, + "loss": 1.714, + "step": 12652 + }, + { + "epoch": 0.9524454731929468, + "grad_norm": 4.975878715515137, + "learning_rate": 6.285226046242543e-07, + "loss": 1.701, + "step": 12653 + }, + { + "epoch": 0.9525207474736069, + "grad_norm": 5.866959095001221, + "learning_rate": 6.265971828004813e-07, + "loss": 2.022, + "step": 12654 + }, + { + "epoch": 0.9525960217542672, + "grad_norm": 5.904901027679443, + "learning_rate": 6.246746960838379e-07, + "loss": 2.2642, + "step": 12655 + }, + { + "epoch": 0.9526712960349273, + "grad_norm": 4.925893783569336, + "learning_rate": 6.227551445886104e-07, + "loss": 1.6275, + "step": 12656 + }, + { + "epoch": 0.9527465703155874, + "grad_norm": 6.75882625579834, + "learning_rate": 6.208385284289131e-07, + "loss": 2.1533, + "step": 12657 + }, + { + "epoch": 0.9528218445962475, + "grad_norm": 4.597896575927734, + "learning_rate": 6.189248477186715e-07, + "loss": 1.5977, + "step": 12658 + }, + { + "epoch": 0.9528971188769078, + "grad_norm": 5.635746002197266, + "learning_rate": 6.170141025716669e-07, + "loss": 2.1705, + "step": 12659 + }, + { + "epoch": 0.9529723931575679, + "grad_norm": 5.3538641929626465, + "learning_rate": 6.151062931014751e-07, + "loss": 1.7818, + "step": 12660 + }, + { + "epoch": 0.953047667438228, + "grad_norm": 4.036955833435059, + "learning_rate": 6.132014194215108e-07, + "loss": 1.8244, + "step": 12661 + }, + { + "epoch": 0.9531229417188882, + "grad_norm": 4.892913818359375, + "learning_rate": 6.112994816450169e-07, + "loss": 2.0131, + "step": 12662 + }, + { + "epoch": 0.9531982159995483, + "grad_norm": 3.978306770324707, + "learning_rate": 6.094004798850584e-07, + "loss": 1.7382, + "step": 12663 + }, + { + "epoch": 0.9532734902802085, + "grad_norm": 4.9204020500183105, + "learning_rate": 6.075044142545172e-07, + "loss": 1.8224, + "step": 12664 + }, + { + "epoch": 0.9533487645608687, + "grad_norm": 5.848854064941406, + "learning_rate": 6.056112848661199e-07, + "loss": 2.0312, + "step": 12665 + }, + { + "epoch": 0.9534240388415288, + "grad_norm": 5.254955291748047, + "learning_rate": 6.037210918324099e-07, + "loss": 1.8957, + "step": 12666 + }, + { + "epoch": 0.9534993131221889, + "grad_norm": 5.435256481170654, + "learning_rate": 6.01833835265736e-07, + "loss": 1.9473, + "step": 12667 + }, + { + "epoch": 0.9535745874028492, + "grad_norm": 4.806950092315674, + "learning_rate": 5.999495152783086e-07, + "loss": 2.1219, + "step": 12668 + }, + { + "epoch": 0.9536498616835093, + "grad_norm": 4.676013946533203, + "learning_rate": 5.980681319821379e-07, + "loss": 1.9462, + "step": 12669 + }, + { + "epoch": 0.9537251359641694, + "grad_norm": 5.973019599914551, + "learning_rate": 5.961896854890681e-07, + "loss": 1.6982, + "step": 12670 + }, + { + "epoch": 0.9538004102448296, + "grad_norm": 4.057026386260986, + "learning_rate": 5.943141759107595e-07, + "loss": 1.6977, + "step": 12671 + }, + { + "epoch": 0.9538756845254898, + "grad_norm": 5.139476299285889, + "learning_rate": 5.924416033587177e-07, + "loss": 1.8449, + "step": 12672 + }, + { + "epoch": 0.9539509588061499, + "grad_norm": 4.590887069702148, + "learning_rate": 5.905719679442534e-07, + "loss": 1.9326, + "step": 12673 + }, + { + "epoch": 0.9540262330868101, + "grad_norm": 4.804156303405762, + "learning_rate": 5.887052697785167e-07, + "loss": 1.5958, + "step": 12674 + }, + { + "epoch": 0.9541015073674702, + "grad_norm": 4.616923809051514, + "learning_rate": 5.868415089724688e-07, + "loss": 1.7097, + "step": 12675 + }, + { + "epoch": 0.9541767816481304, + "grad_norm": 4.854588031768799, + "learning_rate": 5.849806856369156e-07, + "loss": 1.6195, + "step": 12676 + }, + { + "epoch": 0.9542520559287905, + "grad_norm": 4.186409950256348, + "learning_rate": 5.83122799882474e-07, + "loss": 1.8288, + "step": 12677 + }, + { + "epoch": 0.9543273302094507, + "grad_norm": 4.645761013031006, + "learning_rate": 5.812678518195836e-07, + "loss": 1.9328, + "step": 12678 + }, + { + "epoch": 0.9544026044901108, + "grad_norm": 8.028436660766602, + "learning_rate": 5.794158415585282e-07, + "loss": 1.7745, + "step": 12679 + }, + { + "epoch": 0.954477878770771, + "grad_norm": 4.998651027679443, + "learning_rate": 5.775667692093922e-07, + "loss": 1.9679, + "step": 12680 + }, + { + "epoch": 0.9545531530514312, + "grad_norm": 6.120117664337158, + "learning_rate": 5.757206348821098e-07, + "loss": 1.738, + "step": 12681 + }, + { + "epoch": 0.9546284273320913, + "grad_norm": 5.068941116333008, + "learning_rate": 5.738774386864154e-07, + "loss": 1.7005, + "step": 12682 + }, + { + "epoch": 0.9547037016127514, + "grad_norm": 5.972866535186768, + "learning_rate": 5.720371807318881e-07, + "loss": 1.7792, + "step": 12683 + }, + { + "epoch": 0.9547789758934117, + "grad_norm": 4.95200777053833, + "learning_rate": 5.701998611279291e-07, + "loss": 1.9954, + "step": 12684 + }, + { + "epoch": 0.9548542501740718, + "grad_norm": 5.003471851348877, + "learning_rate": 5.683654799837567e-07, + "loss": 1.7011, + "step": 12685 + }, + { + "epoch": 0.9549295244547319, + "grad_norm": 5.468829154968262, + "learning_rate": 5.665340374084172e-07, + "loss": 2.0048, + "step": 12686 + }, + { + "epoch": 0.9550047987353921, + "grad_norm": 5.987268447875977, + "learning_rate": 5.647055335107954e-07, + "loss": 1.9431, + "step": 12687 + }, + { + "epoch": 0.9550800730160522, + "grad_norm": 6.3180975914001465, + "learning_rate": 5.628799683995767e-07, + "loss": 1.9867, + "step": 12688 + }, + { + "epoch": 0.9551553472967124, + "grad_norm": 5.810031890869141, + "learning_rate": 5.610573421832965e-07, + "loss": 1.8616, + "step": 12689 + }, + { + "epoch": 0.9552306215773726, + "grad_norm": 4.753474235534668, + "learning_rate": 5.592376549703015e-07, + "loss": 1.6691, + "step": 12690 + }, + { + "epoch": 0.9553058958580327, + "grad_norm": 3.8705475330352783, + "learning_rate": 5.574209068687719e-07, + "loss": 1.4919, + "step": 12691 + }, + { + "epoch": 0.9553811701386928, + "grad_norm": 4.97821044921875, + "learning_rate": 5.556070979866934e-07, + "loss": 1.7721, + "step": 12692 + }, + { + "epoch": 0.9554564444193531, + "grad_norm": 5.074934005737305, + "learning_rate": 5.537962284319021e-07, + "loss": 2.0019, + "step": 12693 + }, + { + "epoch": 0.9555317187000132, + "grad_norm": 4.354022026062012, + "learning_rate": 5.519882983120561e-07, + "loss": 1.6431, + "step": 12694 + }, + { + "epoch": 0.9556069929806733, + "grad_norm": 10.771299362182617, + "learning_rate": 5.501833077346142e-07, + "loss": 2.4806, + "step": 12695 + }, + { + "epoch": 0.9556822672613334, + "grad_norm": 4.1471662521362305, + "learning_rate": 5.483812568068958e-07, + "loss": 2.0176, + "step": 12696 + }, + { + "epoch": 0.9557575415419937, + "grad_norm": 4.193211078643799, + "learning_rate": 5.465821456360098e-07, + "loss": 1.9431, + "step": 12697 + }, + { + "epoch": 0.9558328158226538, + "grad_norm": 3.8361949920654297, + "learning_rate": 5.447859743289208e-07, + "loss": 1.4868, + "step": 12698 + }, + { + "epoch": 0.9559080901033139, + "grad_norm": 5.013564586639404, + "learning_rate": 5.429927429924042e-07, + "loss": 1.7217, + "step": 12699 + }, + { + "epoch": 0.9559833643839741, + "grad_norm": 5.728600978851318, + "learning_rate": 5.412024517330638e-07, + "loss": 1.868, + "step": 12700 + }, + { + "epoch": 0.9560586386646343, + "grad_norm": 4.661545753479004, + "learning_rate": 5.3941510065732e-07, + "loss": 1.8948, + "step": 12701 + }, + { + "epoch": 0.9561339129452944, + "grad_norm": 5.729572772979736, + "learning_rate": 5.376306898714322e-07, + "loss": 1.7481, + "step": 12702 + }, + { + "epoch": 0.9562091872259546, + "grad_norm": 5.605362415313721, + "learning_rate": 5.358492194814768e-07, + "loss": 1.5649, + "step": 12703 + }, + { + "epoch": 0.9562844615066147, + "grad_norm": 3.890798807144165, + "learning_rate": 5.340706895933578e-07, + "loss": 1.845, + "step": 12704 + }, + { + "epoch": 0.9563597357872748, + "grad_norm": 4.749081134796143, + "learning_rate": 5.322951003127963e-07, + "loss": 2.0707, + "step": 12705 + }, + { + "epoch": 0.9564350100679351, + "grad_norm": 3.9745826721191406, + "learning_rate": 5.305224517453577e-07, + "loss": 1.701, + "step": 12706 + }, + { + "epoch": 0.9565102843485952, + "grad_norm": 5.145061016082764, + "learning_rate": 5.287527439964135e-07, + "loss": 1.6462, + "step": 12707 + }, + { + "epoch": 0.9565855586292553, + "grad_norm": 6.6787543296813965, + "learning_rate": 5.269859771711683e-07, + "loss": 1.545, + "step": 12708 + }, + { + "epoch": 0.9566608329099155, + "grad_norm": 4.938272953033447, + "learning_rate": 5.252221513746547e-07, + "loss": 2.0984, + "step": 12709 + }, + { + "epoch": 0.9567361071905757, + "grad_norm": 4.221813201904297, + "learning_rate": 5.234612667117278e-07, + "loss": 1.7497, + "step": 12710 + }, + { + "epoch": 0.9568113814712358, + "grad_norm": 5.071768283843994, + "learning_rate": 5.217033232870594e-07, + "loss": 1.9402, + "step": 12711 + }, + { + "epoch": 0.956886655751896, + "grad_norm": 4.225630283355713, + "learning_rate": 5.199483212051659e-07, + "loss": 1.5626, + "step": 12712 + }, + { + "epoch": 0.9569619300325561, + "grad_norm": 5.390611171722412, + "learning_rate": 5.181962605703639e-07, + "loss": 1.7544, + "step": 12713 + }, + { + "epoch": 0.9570372043132163, + "grad_norm": 7.954266548156738, + "learning_rate": 5.1644714148682e-07, + "loss": 1.7385, + "step": 12714 + }, + { + "epoch": 0.9571124785938765, + "grad_norm": 4.858428001403809, + "learning_rate": 5.147009640585065e-07, + "loss": 1.8414, + "step": 12715 + }, + { + "epoch": 0.9571877528745366, + "grad_norm": 5.260420799255371, + "learning_rate": 5.129577283892295e-07, + "loss": 1.5452, + "step": 12716 + }, + { + "epoch": 0.9572630271551967, + "grad_norm": 3.7520792484283447, + "learning_rate": 5.112174345826281e-07, + "loss": 1.6871, + "step": 12717 + }, + { + "epoch": 0.9573383014358569, + "grad_norm": 4.311631679534912, + "learning_rate": 5.094800827421475e-07, + "loss": 1.9012, + "step": 12718 + }, + { + "epoch": 0.9574135757165171, + "grad_norm": 6.373862266540527, + "learning_rate": 5.077456729710661e-07, + "loss": 1.9115, + "step": 12719 + }, + { + "epoch": 0.9574888499971772, + "grad_norm": 4.494345188140869, + "learning_rate": 5.060142053725014e-07, + "loss": 1.8008, + "step": 12720 + }, + { + "epoch": 0.9575641242778373, + "grad_norm": 4.176302909851074, + "learning_rate": 5.042856800493768e-07, + "loss": 1.8724, + "step": 12721 + }, + { + "epoch": 0.9576393985584976, + "grad_norm": 4.273013591766357, + "learning_rate": 5.025600971044486e-07, + "loss": 1.5219, + "step": 12722 + }, + { + "epoch": 0.9577146728391577, + "grad_norm": 4.693612575531006, + "learning_rate": 5.008374566403018e-07, + "loss": 1.6532, + "step": 12723 + }, + { + "epoch": 0.9577899471198178, + "grad_norm": 4.368964195251465, + "learning_rate": 4.991177587593321e-07, + "loss": 1.6518, + "step": 12724 + }, + { + "epoch": 0.957865221400478, + "grad_norm": 4.226772785186768, + "learning_rate": 4.974010035637855e-07, + "loss": 2.0488, + "step": 12725 + }, + { + "epoch": 0.9579404956811381, + "grad_norm": 4.707857608795166, + "learning_rate": 4.956871911557081e-07, + "loss": 1.5817, + "step": 12726 + }, + { + "epoch": 0.9580157699617983, + "grad_norm": 5.16405725479126, + "learning_rate": 4.939763216369797e-07, + "loss": 1.7049, + "step": 12727 + }, + { + "epoch": 0.9580910442424585, + "grad_norm": 4.712795734405518, + "learning_rate": 4.922683951093133e-07, + "loss": 1.8878, + "step": 12728 + }, + { + "epoch": 0.9581663185231186, + "grad_norm": 4.737600326538086, + "learning_rate": 4.905634116742386e-07, + "loss": 1.823, + "step": 12729 + }, + { + "epoch": 0.9582415928037787, + "grad_norm": 3.9811925888061523, + "learning_rate": 4.888613714331081e-07, + "loss": 1.6856, + "step": 12730 + }, + { + "epoch": 0.958316867084439, + "grad_norm": 3.939588785171509, + "learning_rate": 4.871622744871074e-07, + "loss": 1.8198, + "step": 12731 + }, + { + "epoch": 0.9583921413650991, + "grad_norm": 4.764878749847412, + "learning_rate": 4.854661209372391e-07, + "loss": 1.9043, + "step": 12732 + }, + { + "epoch": 0.9584674156457592, + "grad_norm": 4.294692516326904, + "learning_rate": 4.837729108843336e-07, + "loss": 1.9113, + "step": 12733 + }, + { + "epoch": 0.9585426899264194, + "grad_norm": 4.665185451507568, + "learning_rate": 4.820826444290605e-07, + "loss": 1.8684, + "step": 12734 + }, + { + "epoch": 0.9586179642070796, + "grad_norm": 5.439671516418457, + "learning_rate": 4.803953216718837e-07, + "loss": 1.9328, + "step": 12735 + }, + { + "epoch": 0.9586932384877397, + "grad_norm": 5.113662242889404, + "learning_rate": 4.787109427131175e-07, + "loss": 1.8658, + "step": 12736 + }, + { + "epoch": 0.9587685127683998, + "grad_norm": 4.699664115905762, + "learning_rate": 4.770295076528985e-07, + "loss": 1.5247, + "step": 12737 + }, + { + "epoch": 0.95884378704906, + "grad_norm": 4.78753662109375, + "learning_rate": 4.7535101659117453e-07, + "loss": 1.9261, + "step": 12738 + }, + { + "epoch": 0.9589190613297202, + "grad_norm": 5.380410194396973, + "learning_rate": 4.736754696277268e-07, + "loss": 2.0376, + "step": 12739 + }, + { + "epoch": 0.9589943356103803, + "grad_norm": 5.370891094207764, + "learning_rate": 4.720028668621701e-07, + "loss": 1.7869, + "step": 12740 + }, + { + "epoch": 0.9590696098910405, + "grad_norm": 6.113171577453613, + "learning_rate": 4.7033320839393045e-07, + "loss": 2.1247, + "step": 12741 + }, + { + "epoch": 0.9591448841717006, + "grad_norm": 7.672886371612549, + "learning_rate": 4.6866649432226185e-07, + "loss": 1.629, + "step": 12742 + }, + { + "epoch": 0.9592201584523607, + "grad_norm": 4.6829071044921875, + "learning_rate": 4.670027247462516e-07, + "loss": 1.8026, + "step": 12743 + }, + { + "epoch": 0.959295432733021, + "grad_norm": 5.635254859924316, + "learning_rate": 4.65341899764804e-07, + "loss": 1.6993, + "step": 12744 + }, + { + "epoch": 0.9593707070136811, + "grad_norm": 4.119978427886963, + "learning_rate": 4.6368401947665117e-07, + "loss": 1.4614, + "step": 12745 + }, + { + "epoch": 0.9594459812943412, + "grad_norm": 4.866243839263916, + "learning_rate": 4.6202908398034203e-07, + "loss": 1.5539, + "step": 12746 + }, + { + "epoch": 0.9595212555750015, + "grad_norm": 4.3326029777526855, + "learning_rate": 4.603770933742702e-07, + "loss": 1.8544, + "step": 12747 + }, + { + "epoch": 0.9595965298556616, + "grad_norm": 4.368873596191406, + "learning_rate": 4.5872804775662916e-07, + "loss": 2.2047, + "step": 12748 + }, + { + "epoch": 0.9596718041363217, + "grad_norm": 4.2882466316223145, + "learning_rate": 4.5708194722546284e-07, + "loss": 1.6478, + "step": 12749 + }, + { + "epoch": 0.9597470784169819, + "grad_norm": 7.5147624015808105, + "learning_rate": 4.5543879187861517e-07, + "loss": 2.2439, + "step": 12750 + }, + { + "epoch": 0.959822352697642, + "grad_norm": 4.197399616241455, + "learning_rate": 4.5379858181377466e-07, + "loss": 1.9333, + "step": 12751 + }, + { + "epoch": 0.9598976269783022, + "grad_norm": 5.4983696937561035, + "learning_rate": 4.521613171284411e-07, + "loss": 1.6143, + "step": 12752 + }, + { + "epoch": 0.9599729012589624, + "grad_norm": 4.430149078369141, + "learning_rate": 4.505269979199478e-07, + "loss": 1.8785, + "step": 12753 + }, + { + "epoch": 0.9600481755396225, + "grad_norm": 4.548789978027344, + "learning_rate": 4.4889562428545027e-07, + "loss": 1.6135, + "step": 12754 + }, + { + "epoch": 0.9601234498202826, + "grad_norm": 5.231551647186279, + "learning_rate": 4.472671963219377e-07, + "loss": 1.5345, + "step": 12755 + }, + { + "epoch": 0.9601987241009428, + "grad_norm": 5.463254451751709, + "learning_rate": 4.4564171412619926e-07, + "loss": 1.6017, + "step": 12756 + }, + { + "epoch": 0.960273998381603, + "grad_norm": 4.883767604827881, + "learning_rate": 4.4401917779487436e-07, + "loss": 1.9248, + "step": 12757 + }, + { + "epoch": 0.9603492726622631, + "grad_norm": 5.04429292678833, + "learning_rate": 4.423995874244247e-07, + "loss": 1.8472, + "step": 12758 + }, + { + "epoch": 0.9604245469429232, + "grad_norm": 5.0260796546936035, + "learning_rate": 4.407829431111177e-07, + "loss": 1.728, + "step": 12759 + }, + { + "epoch": 0.9604998212235835, + "grad_norm": 5.662580966949463, + "learning_rate": 4.3916924495106537e-07, + "loss": 1.7103, + "step": 12760 + }, + { + "epoch": 0.9605750955042436, + "grad_norm": 5.893969535827637, + "learning_rate": 4.37558493040191e-07, + "loss": 1.8945, + "step": 12761 + }, + { + "epoch": 0.9606503697849037, + "grad_norm": 5.4086713790893555, + "learning_rate": 4.359506874742569e-07, + "loss": 1.8159, + "step": 12762 + }, + { + "epoch": 0.9607256440655639, + "grad_norm": 5.280806064605713, + "learning_rate": 4.343458283488366e-07, + "loss": 1.7341, + "step": 12763 + }, + { + "epoch": 0.960800918346224, + "grad_norm": 4.8617024421691895, + "learning_rate": 4.3274391575933715e-07, + "loss": 1.4529, + "step": 12764 + }, + { + "epoch": 0.9608761926268842, + "grad_norm": 3.8887698650360107, + "learning_rate": 4.3114494980098785e-07, + "loss": 1.8785, + "step": 12765 + }, + { + "epoch": 0.9609514669075444, + "grad_norm": 4.244451999664307, + "learning_rate": 4.2954893056884606e-07, + "loss": 1.6456, + "step": 12766 + }, + { + "epoch": 0.9610267411882045, + "grad_norm": 4.5337982177734375, + "learning_rate": 4.279558581577858e-07, + "loss": 1.8425, + "step": 12767 + }, + { + "epoch": 0.9611020154688646, + "grad_norm": 5.106928825378418, + "learning_rate": 4.2636573266251476e-07, + "loss": 1.2153, + "step": 12768 + }, + { + "epoch": 0.9611772897495249, + "grad_norm": 5.047542572021484, + "learning_rate": 4.2477855417755173e-07, + "loss": 1.7941, + "step": 12769 + }, + { + "epoch": 0.961252564030185, + "grad_norm": 4.429896831512451, + "learning_rate": 4.231943227972601e-07, + "loss": 2.0056, + "step": 12770 + }, + { + "epoch": 0.9613278383108451, + "grad_norm": 4.621360778808594, + "learning_rate": 4.2161303861580903e-07, + "loss": 1.8047, + "step": 12771 + }, + { + "epoch": 0.9614031125915053, + "grad_norm": 5.414617538452148, + "learning_rate": 4.2003470172721213e-07, + "loss": 1.9059, + "step": 12772 + }, + { + "epoch": 0.9614783868721655, + "grad_norm": 5.47637939453125, + "learning_rate": 4.184593122252889e-07, + "loss": 1.6954, + "step": 12773 + }, + { + "epoch": 0.9615536611528256, + "grad_norm": 5.423379421234131, + "learning_rate": 4.168868702036921e-07, + "loss": 1.4541, + "step": 12774 + }, + { + "epoch": 0.9616289354334857, + "grad_norm": 5.788435459136963, + "learning_rate": 4.1531737575590813e-07, + "loss": 1.9944, + "step": 12775 + }, + { + "epoch": 0.9617042097141459, + "grad_norm": 4.989753723144531, + "learning_rate": 4.1375082897522344e-07, + "loss": 2.0697, + "step": 12776 + }, + { + "epoch": 0.9617794839948061, + "grad_norm": 4.762087345123291, + "learning_rate": 4.1218722995477463e-07, + "loss": 1.6014, + "step": 12777 + }, + { + "epoch": 0.9618547582754662, + "grad_norm": 4.377968788146973, + "learning_rate": 4.106265787875152e-07, + "loss": 1.7274, + "step": 12778 + }, + { + "epoch": 0.9619300325561264, + "grad_norm": 5.829367637634277, + "learning_rate": 4.090688755662153e-07, + "loss": 1.8093, + "step": 12779 + }, + { + "epoch": 0.9620053068367865, + "grad_norm": 4.503036022186279, + "learning_rate": 4.075141203834787e-07, + "loss": 1.8075, + "step": 12780 + }, + { + "epoch": 0.9620805811174467, + "grad_norm": 5.020206928253174, + "learning_rate": 4.059623133317314e-07, + "loss": 1.8916, + "step": 12781 + }, + { + "epoch": 0.9621558553981069, + "grad_norm": 5.009835720062256, + "learning_rate": 4.0441345450322745e-07, + "loss": 1.5915, + "step": 12782 + }, + { + "epoch": 0.962231129678767, + "grad_norm": 5.395751476287842, + "learning_rate": 4.02867543990032e-07, + "loss": 1.6678, + "step": 12783 + }, + { + "epoch": 0.9623064039594271, + "grad_norm": 5.530890464782715, + "learning_rate": 4.0132458188405495e-07, + "loss": 1.7713, + "step": 12784 + }, + { + "epoch": 0.9623816782400874, + "grad_norm": 4.732662200927734, + "learning_rate": 3.9978456827701185e-07, + "loss": 1.6821, + "step": 12785 + }, + { + "epoch": 0.9624569525207475, + "grad_norm": 6.709630012512207, + "learning_rate": 3.982475032604627e-07, + "loss": 1.8672, + "step": 12786 + }, + { + "epoch": 0.9625322268014076, + "grad_norm": 3.9430272579193115, + "learning_rate": 3.967133869257733e-07, + "loss": 1.7442, + "step": 12787 + }, + { + "epoch": 0.9626075010820678, + "grad_norm": 4.8096723556518555, + "learning_rate": 3.9518221936414304e-07, + "loss": 2.0516, + "step": 12788 + }, + { + "epoch": 0.962682775362728, + "grad_norm": 4.9371113777160645, + "learning_rate": 3.93654000666599e-07, + "loss": 1.868, + "step": 12789 + }, + { + "epoch": 0.9627580496433881, + "grad_norm": 5.705292224884033, + "learning_rate": 3.921287309239907e-07, + "loss": 1.8838, + "step": 12790 + }, + { + "epoch": 0.9628333239240483, + "grad_norm": 7.7011213302612305, + "learning_rate": 3.9060641022698464e-07, + "loss": 1.8731, + "step": 12791 + }, + { + "epoch": 0.9629085982047084, + "grad_norm": 6.318780899047852, + "learning_rate": 3.8908703866608606e-07, + "loss": 1.9872, + "step": 12792 + }, + { + "epoch": 0.9629838724853685, + "grad_norm": 4.973534107208252, + "learning_rate": 3.8757061633161174e-07, + "loss": 1.6144, + "step": 12793 + }, + { + "epoch": 0.9630591467660287, + "grad_norm": 4.02522611618042, + "learning_rate": 3.860571433137117e-07, + "loss": 1.746, + "step": 12794 + }, + { + "epoch": 0.9631344210466889, + "grad_norm": 5.6097092628479, + "learning_rate": 3.8454661970235284e-07, + "loss": 1.8325, + "step": 12795 + }, + { + "epoch": 0.963209695327349, + "grad_norm": 4.617707252502441, + "learning_rate": 3.8303904558734117e-07, + "loss": 1.6652, + "step": 12796 + }, + { + "epoch": 0.9632849696080091, + "grad_norm": 4.881204128265381, + "learning_rate": 3.8153442105828275e-07, + "loss": 1.4728, + "step": 12797 + }, + { + "epoch": 0.9633602438886694, + "grad_norm": 5.9089202880859375, + "learning_rate": 3.800327462046338e-07, + "loss": 2.2214, + "step": 12798 + }, + { + "epoch": 0.9634355181693295, + "grad_norm": 5.717357635498047, + "learning_rate": 3.7853402111566184e-07, + "loss": 1.9396, + "step": 12799 + }, + { + "epoch": 0.9635107924499896, + "grad_norm": 4.606716632843018, + "learning_rate": 3.7703824588046775e-07, + "loss": 1.7728, + "step": 12800 + }, + { + "epoch": 0.9635860667306498, + "grad_norm": 7.8199639320373535, + "learning_rate": 3.7554542058796385e-07, + "loss": 1.958, + "step": 12801 + }, + { + "epoch": 0.96366134101131, + "grad_norm": 5.51934289932251, + "learning_rate": 3.740555453268957e-07, + "loss": 1.4191, + "step": 12802 + }, + { + "epoch": 0.9637366152919701, + "grad_norm": 4.515603542327881, + "learning_rate": 3.725686201858314e-07, + "loss": 2.1266, + "step": 12803 + }, + { + "epoch": 0.9638118895726303, + "grad_norm": 6.134711265563965, + "learning_rate": 3.710846452531669e-07, + "loss": 1.5383, + "step": 12804 + }, + { + "epoch": 0.9638871638532904, + "grad_norm": 7.720675468444824, + "learning_rate": 3.6960362061711494e-07, + "loss": 1.658, + "step": 12805 + }, + { + "epoch": 0.9639624381339505, + "grad_norm": 3.837897539138794, + "learning_rate": 3.681255463657274e-07, + "loss": 1.545, + "step": 12806 + }, + { + "epoch": 0.9640377124146108, + "grad_norm": 4.478659629821777, + "learning_rate": 3.6665042258686166e-07, + "loss": 1.7612, + "step": 12807 + }, + { + "epoch": 0.9641129866952709, + "grad_norm": 5.460193157196045, + "learning_rate": 3.651782493682199e-07, + "loss": 1.9355, + "step": 12808 + }, + { + "epoch": 0.964188260975931, + "grad_norm": 4.338287830352783, + "learning_rate": 3.637090267973042e-07, + "loss": 1.5636, + "step": 12809 + }, + { + "epoch": 0.9642635352565913, + "grad_norm": 4.387434482574463, + "learning_rate": 3.6224275496147263e-07, + "loss": 1.3001, + "step": 12810 + }, + { + "epoch": 0.9643388095372514, + "grad_norm": 4.487557411193848, + "learning_rate": 3.607794339478776e-07, + "loss": 1.7105, + "step": 12811 + }, + { + "epoch": 0.9644140838179115, + "grad_norm": 5.497860431671143, + "learning_rate": 3.5931906384351623e-07, + "loss": 1.429, + "step": 12812 + }, + { + "epoch": 0.9644893580985717, + "grad_norm": 7.424019813537598, + "learning_rate": 3.5786164473520235e-07, + "loss": 1.9997, + "step": 12813 + }, + { + "epoch": 0.9645646323792318, + "grad_norm": 4.381494045257568, + "learning_rate": 3.5640717670957223e-07, + "loss": 1.6431, + "step": 12814 + }, + { + "epoch": 0.964639906659892, + "grad_norm": 5.804770469665527, + "learning_rate": 3.5495565985309566e-07, + "loss": 1.8807, + "step": 12815 + }, + { + "epoch": 0.9647151809405521, + "grad_norm": 4.882290840148926, + "learning_rate": 3.535070942520591e-07, + "loss": 1.9285, + "step": 12816 + }, + { + "epoch": 0.9647904552212123, + "grad_norm": 6.2644243240356445, + "learning_rate": 3.5206147999257143e-07, + "loss": 1.8068, + "step": 12817 + }, + { + "epoch": 0.9648657295018724, + "grad_norm": 5.2754669189453125, + "learning_rate": 3.5061881716056954e-07, + "loss": 1.7369, + "step": 12818 + }, + { + "epoch": 0.9649410037825326, + "grad_norm": 5.636056423187256, + "learning_rate": 3.491791058418181e-07, + "loss": 1.5516, + "step": 12819 + }, + { + "epoch": 0.9650162780631928, + "grad_norm": 5.301386833190918, + "learning_rate": 3.477423461219098e-07, + "loss": 1.661, + "step": 12820 + }, + { + "epoch": 0.9650915523438529, + "grad_norm": 4.2584614753723145, + "learning_rate": 3.463085380862485e-07, + "loss": 2.0417, + "step": 12821 + }, + { + "epoch": 0.965166826624513, + "grad_norm": 4.7916340827941895, + "learning_rate": 3.4487768182007163e-07, + "loss": 1.8691, + "step": 12822 + }, + { + "epoch": 0.9652421009051733, + "grad_norm": 5.548417091369629, + "learning_rate": 3.4344977740843887e-07, + "loss": 2.1189, + "step": 12823 + }, + { + "epoch": 0.9653173751858334, + "grad_norm": 5.059255599975586, + "learning_rate": 3.4202482493623787e-07, + "loss": 2.2455, + "step": 12824 + }, + { + "epoch": 0.9653926494664935, + "grad_norm": 5.060774326324463, + "learning_rate": 3.406028244881787e-07, + "loss": 1.8513, + "step": 12825 + }, + { + "epoch": 0.9654679237471537, + "grad_norm": 5.0774641036987305, + "learning_rate": 3.391837761487937e-07, + "loss": 1.4563, + "step": 12826 + }, + { + "epoch": 0.9655431980278139, + "grad_norm": 4.104584217071533, + "learning_rate": 3.377676800024321e-07, + "loss": 1.8978, + "step": 12827 + }, + { + "epoch": 0.965618472308474, + "grad_norm": 4.80279016494751, + "learning_rate": 3.3635453613329314e-07, + "loss": 1.8048, + "step": 12828 + }, + { + "epoch": 0.9656937465891342, + "grad_norm": 4.9074273109436035, + "learning_rate": 3.349443446253708e-07, + "loss": 1.9201, + "step": 12829 + }, + { + "epoch": 0.9657690208697943, + "grad_norm": 4.464571475982666, + "learning_rate": 3.3353710556250917e-07, + "loss": 1.8604, + "step": 12830 + }, + { + "epoch": 0.9658442951504544, + "grad_norm": 5.140709400177002, + "learning_rate": 3.3213281902835237e-07, + "loss": 1.8309, + "step": 12831 + }, + { + "epoch": 0.9659195694311147, + "grad_norm": 5.117081165313721, + "learning_rate": 3.307314851063836e-07, + "loss": 2.113, + "step": 12832 + }, + { + "epoch": 0.9659948437117748, + "grad_norm": 7.604523658752441, + "learning_rate": 3.293331038799141e-07, + "loss": 1.6905, + "step": 12833 + }, + { + "epoch": 0.9660701179924349, + "grad_norm": 5.077304840087891, + "learning_rate": 3.2793767543207176e-07, + "loss": 1.7712, + "step": 12834 + }, + { + "epoch": 0.966145392273095, + "grad_norm": 5.309237480163574, + "learning_rate": 3.265451998458069e-07, + "loss": 1.6441, + "step": 12835 + }, + { + "epoch": 0.9662206665537553, + "grad_norm": 4.8570075035095215, + "learning_rate": 3.251556772039033e-07, + "loss": 1.7237, + "step": 12836 + }, + { + "epoch": 0.9662959408344154, + "grad_norm": 4.21960973739624, + "learning_rate": 3.237691075889615e-07, + "loss": 1.6225, + "step": 12837 + }, + { + "epoch": 0.9663712151150755, + "grad_norm": 4.946710586547852, + "learning_rate": 3.2238549108340455e-07, + "loss": 1.8562, + "step": 12838 + }, + { + "epoch": 0.9664464893957357, + "grad_norm": 6.158081531524658, + "learning_rate": 3.2100482776949435e-07, + "loss": 1.802, + "step": 12839 + }, + { + "epoch": 0.9665217636763959, + "grad_norm": 5.038981914520264, + "learning_rate": 3.1962711772929863e-07, + "loss": 1.7337, + "step": 12840 + }, + { + "epoch": 0.966597037957056, + "grad_norm": 5.092843055725098, + "learning_rate": 3.182523610447241e-07, + "loss": 1.9241, + "step": 12841 + }, + { + "epoch": 0.9666723122377162, + "grad_norm": 5.793262958526611, + "learning_rate": 3.1688055779749425e-07, + "loss": 1.7307, + "step": 12842 + }, + { + "epoch": 0.9667475865183763, + "grad_norm": 6.396280765533447, + "learning_rate": 3.15511708069155e-07, + "loss": 1.9269, + "step": 12843 + }, + { + "epoch": 0.9668228607990365, + "grad_norm": 5.814577579498291, + "learning_rate": 3.141458119410856e-07, + "loss": 1.8902, + "step": 12844 + }, + { + "epoch": 0.9668981350796967, + "grad_norm": 4.52952766418457, + "learning_rate": 3.127828694944879e-07, + "loss": 1.8889, + "step": 12845 + }, + { + "epoch": 0.9669734093603568, + "grad_norm": 5.8890180587768555, + "learning_rate": 3.114228808103747e-07, + "loss": 1.9654, + "step": 12846 + }, + { + "epoch": 0.9670486836410169, + "grad_norm": 4.779413223266602, + "learning_rate": 3.100658459695982e-07, + "loss": 1.6647, + "step": 12847 + }, + { + "epoch": 0.9671239579216772, + "grad_norm": 5.064691066741943, + "learning_rate": 3.0871176505283263e-07, + "loss": 2.1677, + "step": 12848 + }, + { + "epoch": 0.9671992322023373, + "grad_norm": 4.5716705322265625, + "learning_rate": 3.073606381405747e-07, + "loss": 1.5954, + "step": 12849 + }, + { + "epoch": 0.9672745064829974, + "grad_norm": 4.712538719177246, + "learning_rate": 3.06012465313138e-07, + "loss": 1.8211, + "step": 12850 + }, + { + "epoch": 0.9673497807636576, + "grad_norm": 6.174914360046387, + "learning_rate": 3.0466724665067504e-07, + "loss": 2.2294, + "step": 12851 + }, + { + "epoch": 0.9674250550443177, + "grad_norm": 4.360692501068115, + "learning_rate": 3.0332498223314964e-07, + "loss": 1.6953, + "step": 12852 + }, + { + "epoch": 0.9675003293249779, + "grad_norm": 6.030859470367432, + "learning_rate": 3.019856721403591e-07, + "loss": 1.568, + "step": 12853 + }, + { + "epoch": 0.967575603605638, + "grad_norm": 4.935276985168457, + "learning_rate": 3.006493164519231e-07, + "loss": 1.6172, + "step": 12854 + }, + { + "epoch": 0.9676508778862982, + "grad_norm": 4.2524590492248535, + "learning_rate": 2.99315915247278e-07, + "loss": 1.7122, + "step": 12855 + }, + { + "epoch": 0.9677261521669583, + "grad_norm": 5.106043815612793, + "learning_rate": 2.979854686056993e-07, + "loss": 1.5194, + "step": 12856 + }, + { + "epoch": 0.9678014264476185, + "grad_norm": 6.843307018280029, + "learning_rate": 2.9665797660626825e-07, + "loss": 2.0082, + "step": 12857 + }, + { + "epoch": 0.9678767007282787, + "grad_norm": 5.756411075592041, + "learning_rate": 2.9533343932791056e-07, + "loss": 1.8969, + "step": 12858 + }, + { + "epoch": 0.9679519750089388, + "grad_norm": 4.695230007171631, + "learning_rate": 2.940118568493577e-07, + "loss": 1.7083, + "step": 12859 + }, + { + "epoch": 0.9680272492895989, + "grad_norm": 4.5687737464904785, + "learning_rate": 2.9269322924918573e-07, + "loss": 1.6967, + "step": 12860 + }, + { + "epoch": 0.9681025235702592, + "grad_norm": 4.319023609161377, + "learning_rate": 2.913775566057653e-07, + "loss": 1.4636, + "step": 12861 + }, + { + "epoch": 0.9681777978509193, + "grad_norm": 6.475977897644043, + "learning_rate": 2.900648389973226e-07, + "loss": 1.5725, + "step": 12862 + }, + { + "epoch": 0.9682530721315794, + "grad_norm": 5.457620620727539, + "learning_rate": 2.887550765018898e-07, + "loss": 1.7592, + "step": 12863 + }, + { + "epoch": 0.9683283464122396, + "grad_norm": 4.101484298706055, + "learning_rate": 2.874482691973324e-07, + "loss": 1.8931, + "step": 12864 + }, + { + "epoch": 0.9684036206928998, + "grad_norm": 3.980106830596924, + "learning_rate": 2.8614441716133254e-07, + "loss": 1.85, + "step": 12865 + }, + { + "epoch": 0.9684788949735599, + "grad_norm": 4.795080661773682, + "learning_rate": 2.8484352047140063e-07, + "loss": 1.4535, + "step": 12866 + }, + { + "epoch": 0.9685541692542201, + "grad_norm": 8.221390724182129, + "learning_rate": 2.835455792048747e-07, + "loss": 1.8154, + "step": 12867 + }, + { + "epoch": 0.9686294435348802, + "grad_norm": 4.251120567321777, + "learning_rate": 2.822505934389097e-07, + "loss": 1.8558, + "step": 12868 + }, + { + "epoch": 0.9687047178155404, + "grad_norm": 4.5144362449646, + "learning_rate": 2.809585632504941e-07, + "loss": 2.1037, + "step": 12869 + }, + { + "epoch": 0.9687799920962006, + "grad_norm": 5.695852279663086, + "learning_rate": 2.796694887164275e-07, + "loss": 1.7386, + "step": 12870 + }, + { + "epoch": 0.9688552663768607, + "grad_norm": 4.657914638519287, + "learning_rate": 2.783833699133542e-07, + "loss": 2.0847, + "step": 12871 + }, + { + "epoch": 0.9689305406575208, + "grad_norm": 3.799586296081543, + "learning_rate": 2.771002069177131e-07, + "loss": 1.7108, + "step": 12872 + }, + { + "epoch": 0.9690058149381809, + "grad_norm": 4.43876838684082, + "learning_rate": 2.7581999980579863e-07, + "loss": 2.1726, + "step": 12873 + }, + { + "epoch": 0.9690810892188412, + "grad_norm": 4.370305061340332, + "learning_rate": 2.745427486537111e-07, + "loss": 1.839, + "step": 12874 + }, + { + "epoch": 0.9691563634995013, + "grad_norm": 5.2520270347595215, + "learning_rate": 2.732684535373786e-07, + "loss": 1.6447, + "step": 12875 + }, + { + "epoch": 0.9692316377801614, + "grad_norm": 4.5966997146606445, + "learning_rate": 2.719971145325517e-07, + "loss": 1.676, + "step": 12876 + }, + { + "epoch": 0.9693069120608216, + "grad_norm": 5.579090595245361, + "learning_rate": 2.707287317148088e-07, + "loss": 1.8823, + "step": 12877 + }, + { + "epoch": 0.9693821863414818, + "grad_norm": 7.404440402984619, + "learning_rate": 2.694633051595563e-07, + "loss": 1.9279, + "step": 12878 + }, + { + "epoch": 0.9694574606221419, + "grad_norm": 3.836246967315674, + "learning_rate": 2.682008349420173e-07, + "loss": 1.6043, + "step": 12879 + }, + { + "epoch": 0.9695327349028021, + "grad_norm": 5.04543924331665, + "learning_rate": 2.669413211372429e-07, + "loss": 2.0549, + "step": 12880 + }, + { + "epoch": 0.9696080091834622, + "grad_norm": 5.575264930725098, + "learning_rate": 2.6568476382010656e-07, + "loss": 1.6289, + "step": 12881 + }, + { + "epoch": 0.9696832834641224, + "grad_norm": 4.055024147033691, + "learning_rate": 2.6443116306530955e-07, + "loss": 1.8076, + "step": 12882 + }, + { + "epoch": 0.9697585577447826, + "grad_norm": 5.3261613845825195, + "learning_rate": 2.6318051894737017e-07, + "loss": 1.593, + "step": 12883 + }, + { + "epoch": 0.9698338320254427, + "grad_norm": 4.643156051635742, + "learning_rate": 2.6193283154063997e-07, + "loss": 1.6863, + "step": 12884 + }, + { + "epoch": 0.9699091063061028, + "grad_norm": 4.882751941680908, + "learning_rate": 2.6068810091928187e-07, + "loss": 1.5136, + "step": 12885 + }, + { + "epoch": 0.9699843805867631, + "grad_norm": 5.036129474639893, + "learning_rate": 2.594463271573033e-07, + "loss": 1.8994, + "step": 12886 + }, + { + "epoch": 0.9700596548674232, + "grad_norm": 4.722573280334473, + "learning_rate": 2.5820751032851755e-07, + "loss": 1.6588, + "step": 12887 + }, + { + "epoch": 0.9701349291480833, + "grad_norm": 6.217869758605957, + "learning_rate": 2.5697165050657113e-07, + "loss": 2.0225, + "step": 12888 + }, + { + "epoch": 0.9702102034287435, + "grad_norm": 5.278621673583984, + "learning_rate": 2.557387477649331e-07, + "loss": 1.7792, + "step": 12889 + }, + { + "epoch": 0.9702854777094037, + "grad_norm": 5.1986083984375, + "learning_rate": 2.5450880217688935e-07, + "loss": 1.7527, + "step": 12890 + }, + { + "epoch": 0.9703607519900638, + "grad_norm": 5.341720104217529, + "learning_rate": 2.532818138155646e-07, + "loss": 1.5141, + "step": 12891 + }, + { + "epoch": 0.970436026270724, + "grad_norm": 4.697822570800781, + "learning_rate": 2.5205778275390056e-07, + "loss": 1.7573, + "step": 12892 + }, + { + "epoch": 0.9705113005513841, + "grad_norm": 4.658453464508057, + "learning_rate": 2.508367090646502e-07, + "loss": 1.6563, + "step": 12893 + }, + { + "epoch": 0.9705865748320442, + "grad_norm": 4.803254127502441, + "learning_rate": 2.4961859282041643e-07, + "loss": 1.9874, + "step": 12894 + }, + { + "epoch": 0.9706618491127044, + "grad_norm": 4.559871196746826, + "learning_rate": 2.484034340936081e-07, + "loss": 1.926, + "step": 12895 + }, + { + "epoch": 0.9707371233933646, + "grad_norm": 4.435745716094971, + "learning_rate": 2.471912329564618e-07, + "loss": 1.6567, + "step": 12896 + }, + { + "epoch": 0.9708123976740247, + "grad_norm": 4.745160102844238, + "learning_rate": 2.459819894810367e-07, + "loss": 1.8007, + "step": 12897 + }, + { + "epoch": 0.9708876719546848, + "grad_norm": 5.0204997062683105, + "learning_rate": 2.447757037392251e-07, + "loss": 2.0072, + "step": 12898 + }, + { + "epoch": 0.9709629462353451, + "grad_norm": 5.456808090209961, + "learning_rate": 2.4357237580273084e-07, + "loss": 1.8933, + "step": 12899 + }, + { + "epoch": 0.9710382205160052, + "grad_norm": 5.426011562347412, + "learning_rate": 2.423720057430967e-07, + "loss": 1.873, + "step": 12900 + }, + { + "epoch": 0.9711134947966653, + "grad_norm": 4.370813846588135, + "learning_rate": 2.411745936316712e-07, + "loss": 1.6608, + "step": 12901 + }, + { + "epoch": 0.9711887690773255, + "grad_norm": 3.82869815826416, + "learning_rate": 2.399801395396417e-07, + "loss": 1.6807, + "step": 12902 + }, + { + "epoch": 0.9712640433579857, + "grad_norm": 5.512392520904541, + "learning_rate": 2.387886435380182e-07, + "loss": 1.6068, + "step": 12903 + }, + { + "epoch": 0.9713393176386458, + "grad_norm": 5.068700313568115, + "learning_rate": 2.3760010569762737e-07, + "loss": 1.7812, + "step": 12904 + }, + { + "epoch": 0.971414591919306, + "grad_norm": 6.03415584564209, + "learning_rate": 2.364145260891293e-07, + "loss": 1.9484, + "step": 12905 + }, + { + "epoch": 0.9714898661999661, + "grad_norm": 6.718302249908447, + "learning_rate": 2.352319047829954e-07, + "loss": 1.5456, + "step": 12906 + }, + { + "epoch": 0.9715651404806263, + "grad_norm": 4.510940074920654, + "learning_rate": 2.3405224184954166e-07, + "loss": 1.8412, + "step": 12907 + }, + { + "epoch": 0.9716404147612865, + "grad_norm": 4.695608139038086, + "learning_rate": 2.328755373588787e-07, + "loss": 1.3734, + "step": 12908 + }, + { + "epoch": 0.9717156890419466, + "grad_norm": 5.0936689376831055, + "learning_rate": 2.317017913809727e-07, + "loss": 2.0167, + "step": 12909 + }, + { + "epoch": 0.9717909633226067, + "grad_norm": 5.769432544708252, + "learning_rate": 2.3053100398559015e-07, + "loss": 2.051, + "step": 12910 + }, + { + "epoch": 0.971866237603267, + "grad_norm": 3.9330129623413086, + "learning_rate": 2.293631752423364e-07, + "loss": 1.9386, + "step": 12911 + }, + { + "epoch": 0.9719415118839271, + "grad_norm": 5.308951377868652, + "learning_rate": 2.281983052206338e-07, + "loss": 1.8434, + "step": 12912 + }, + { + "epoch": 0.9720167861645872, + "grad_norm": 4.602841377258301, + "learning_rate": 2.2703639398973242e-07, + "loss": 2.1663, + "step": 12913 + }, + { + "epoch": 0.9720920604452473, + "grad_norm": 5.422677040100098, + "learning_rate": 2.2587744161869927e-07, + "loss": 1.6994, + "step": 12914 + }, + { + "epoch": 0.9721673347259076, + "grad_norm": 4.921991348266602, + "learning_rate": 2.2472144817643482e-07, + "loss": 1.4174, + "step": 12915 + }, + { + "epoch": 0.9722426090065677, + "grad_norm": 7.114505767822266, + "learning_rate": 2.2356841373166182e-07, + "loss": 1.8411, + "step": 12916 + }, + { + "epoch": 0.9723178832872278, + "grad_norm": 4.967130184173584, + "learning_rate": 2.224183383529199e-07, + "loss": 1.7277, + "step": 12917 + }, + { + "epoch": 0.972393157567888, + "grad_norm": 5.603796482086182, + "learning_rate": 2.2127122210857663e-07, + "loss": 1.7817, + "step": 12918 + }, + { + "epoch": 0.9724684318485481, + "grad_norm": 5.356256484985352, + "learning_rate": 2.20127065066833e-07, + "loss": 1.5158, + "step": 12919 + }, + { + "epoch": 0.9725437061292083, + "grad_norm": 4.639211177825928, + "learning_rate": 2.1898586729570126e-07, + "loss": 1.6356, + "step": 12920 + }, + { + "epoch": 0.9726189804098685, + "grad_norm": 4.742597579956055, + "learning_rate": 2.1784762886302157e-07, + "loss": 2.0759, + "step": 12921 + }, + { + "epoch": 0.9726942546905286, + "grad_norm": 4.5918097496032715, + "learning_rate": 2.167123498364565e-07, + "loss": 1.4234, + "step": 12922 + }, + { + "epoch": 0.9727695289711887, + "grad_norm": 4.690528392791748, + "learning_rate": 2.1558003028349648e-07, + "loss": 1.5495, + "step": 12923 + }, + { + "epoch": 0.972844803251849, + "grad_norm": 4.799502372741699, + "learning_rate": 2.1445067027145994e-07, + "loss": 1.6469, + "step": 12924 + }, + { + "epoch": 0.9729200775325091, + "grad_norm": 5.731314182281494, + "learning_rate": 2.1332426986747644e-07, + "loss": 1.9086, + "step": 12925 + }, + { + "epoch": 0.9729953518131692, + "grad_norm": 4.05239200592041, + "learning_rate": 2.1220082913852024e-07, + "loss": 1.9317, + "step": 12926 + }, + { + "epoch": 0.9730706260938294, + "grad_norm": 5.346749305725098, + "learning_rate": 2.1108034815136013e-07, + "loss": 1.4937, + "step": 12927 + }, + { + "epoch": 0.9731459003744896, + "grad_norm": 5.038517951965332, + "learning_rate": 2.0996282697261505e-07, + "loss": 1.7887, + "step": 12928 + }, + { + "epoch": 0.9732211746551497, + "grad_norm": 5.209228038787842, + "learning_rate": 2.0884826566871517e-07, + "loss": 1.9106, + "step": 12929 + }, + { + "epoch": 0.9732964489358099, + "grad_norm": 4.027022361755371, + "learning_rate": 2.0773666430591866e-07, + "loss": 1.8402, + "step": 12930 + }, + { + "epoch": 0.97337172321647, + "grad_norm": 4.628996849060059, + "learning_rate": 2.0662802295030592e-07, + "loss": 1.9488, + "step": 12931 + }, + { + "epoch": 0.9734469974971302, + "grad_norm": 5.011528968811035, + "learning_rate": 2.055223416677854e-07, + "loss": 2.1942, + "step": 12932 + }, + { + "epoch": 0.9735222717777903, + "grad_norm": 4.232499122619629, + "learning_rate": 2.0441962052408782e-07, + "loss": 1.7977, + "step": 12933 + }, + { + "epoch": 0.9735975460584505, + "grad_norm": 6.438435077667236, + "learning_rate": 2.0331985958476075e-07, + "loss": 2.1995, + "step": 12934 + }, + { + "epoch": 0.9736728203391106, + "grad_norm": 7.225154399871826, + "learning_rate": 2.0222305891518521e-07, + "loss": 2.0804, + "step": 12935 + }, + { + "epoch": 0.9737480946197707, + "grad_norm": 4.778552055358887, + "learning_rate": 2.0112921858056466e-07, + "loss": 1.8135, + "step": 12936 + }, + { + "epoch": 0.973823368900431, + "grad_norm": 5.350754261016846, + "learning_rate": 2.0003833864592482e-07, + "loss": 1.6589, + "step": 12937 + }, + { + "epoch": 0.9738986431810911, + "grad_norm": 4.418392658233643, + "learning_rate": 1.989504191761138e-07, + "loss": 1.9128, + "step": 12938 + }, + { + "epoch": 0.9739739174617512, + "grad_norm": 4.4751482009887695, + "learning_rate": 1.978654602358021e-07, + "loss": 1.7483, + "step": 12939 + }, + { + "epoch": 0.9740491917424114, + "grad_norm": 5.576937198638916, + "learning_rate": 1.967834618894937e-07, + "loss": 1.751, + "step": 12940 + }, + { + "epoch": 0.9741244660230716, + "grad_norm": 5.267410755157471, + "learning_rate": 1.957044242015038e-07, + "loss": 2.1266, + "step": 12941 + }, + { + "epoch": 0.9741997403037317, + "grad_norm": 4.853013038635254, + "learning_rate": 1.946283472359811e-07, + "loss": 1.9482, + "step": 12942 + }, + { + "epoch": 0.9742750145843919, + "grad_norm": 5.489080429077148, + "learning_rate": 1.9355523105689666e-07, + "loss": 2.2334, + "step": 12943 + }, + { + "epoch": 0.974350288865052, + "grad_norm": 5.7867960929870605, + "learning_rate": 1.9248507572804385e-07, + "loss": 1.8813, + "step": 12944 + }, + { + "epoch": 0.9744255631457122, + "grad_norm": 5.590057849884033, + "learning_rate": 1.9141788131303851e-07, + "loss": 1.4356, + "step": 12945 + }, + { + "epoch": 0.9745008374263724, + "grad_norm": 3.8147857189178467, + "learning_rate": 1.9035364787532427e-07, + "loss": 1.8349, + "step": 12946 + }, + { + "epoch": 0.9745761117070325, + "grad_norm": 5.534485340118408, + "learning_rate": 1.8929237547816726e-07, + "loss": 2.3447, + "step": 12947 + }, + { + "epoch": 0.9746513859876926, + "grad_norm": 5.00258207321167, + "learning_rate": 1.8823406418465028e-07, + "loss": 1.778, + "step": 12948 + }, + { + "epoch": 0.9747266602683529, + "grad_norm": 5.148022651672363, + "learning_rate": 1.8717871405769526e-07, + "loss": 1.7496, + "step": 12949 + }, + { + "epoch": 0.974801934549013, + "grad_norm": 5.67739725112915, + "learning_rate": 1.8612632516004092e-07, + "loss": 1.613, + "step": 12950 + }, + { + "epoch": 0.9748772088296731, + "grad_norm": 4.235254764556885, + "learning_rate": 1.850768975542372e-07, + "loss": 2.0982, + "step": 12951 + }, + { + "epoch": 0.9749524831103332, + "grad_norm": 4.133965969085693, + "learning_rate": 1.840304313026786e-07, + "loss": 1.7344, + "step": 12952 + }, + { + "epoch": 0.9750277573909935, + "grad_norm": 4.7754998207092285, + "learning_rate": 1.8298692646757653e-07, + "loss": 1.8371, + "step": 12953 + }, + { + "epoch": 0.9751030316716536, + "grad_norm": 6.86676549911499, + "learning_rate": 1.8194638311095914e-07, + "loss": 1.7308, + "step": 12954 + }, + { + "epoch": 0.9751783059523137, + "grad_norm": 4.804413318634033, + "learning_rate": 1.8090880129468246e-07, + "loss": 1.7924, + "step": 12955 + }, + { + "epoch": 0.9752535802329739, + "grad_norm": 5.413994789123535, + "learning_rate": 1.7987418108043608e-07, + "loss": 1.9852, + "step": 12956 + }, + { + "epoch": 0.975328854513634, + "grad_norm": 4.407379150390625, + "learning_rate": 1.7884252252971524e-07, + "loss": 1.9656, + "step": 12957 + }, + { + "epoch": 0.9754041287942942, + "grad_norm": 4.6286797523498535, + "learning_rate": 1.7781382570385418e-07, + "loss": 1.4735, + "step": 12958 + }, + { + "epoch": 0.9754794030749544, + "grad_norm": 4.9368743896484375, + "learning_rate": 1.7678809066400405e-07, + "loss": 1.6398, + "step": 12959 + }, + { + "epoch": 0.9755546773556145, + "grad_norm": 5.2719197273254395, + "learning_rate": 1.7576531747114378e-07, + "loss": 1.8267, + "step": 12960 + }, + { + "epoch": 0.9756299516362746, + "grad_norm": 5.103287220001221, + "learning_rate": 1.7474550618607477e-07, + "loss": 1.9093, + "step": 12961 + }, + { + "epoch": 0.9757052259169349, + "grad_norm": 5.483565330505371, + "learning_rate": 1.7372865686942075e-07, + "loss": 1.6723, + "step": 12962 + }, + { + "epoch": 0.975780500197595, + "grad_norm": 5.80971622467041, + "learning_rate": 1.7271476958163336e-07, + "loss": 1.8963, + "step": 12963 + }, + { + "epoch": 0.9758557744782551, + "grad_norm": 6.123488426208496, + "learning_rate": 1.717038443829755e-07, + "loss": 2.1776, + "step": 12964 + }, + { + "epoch": 0.9759310487589153, + "grad_norm": 6.915956497192383, + "learning_rate": 1.7069588133355464e-07, + "loss": 2.0053, + "step": 12965 + }, + { + "epoch": 0.9760063230395755, + "grad_norm": 5.975083351135254, + "learning_rate": 1.6969088049328398e-07, + "loss": 1.675, + "step": 12966 + }, + { + "epoch": 0.9760815973202356, + "grad_norm": 6.502622604370117, + "learning_rate": 1.686888419219157e-07, + "loss": 1.8324, + "step": 12967 + }, + { + "epoch": 0.9761568716008958, + "grad_norm": 5.546345233917236, + "learning_rate": 1.6768976567901328e-07, + "loss": 1.6187, + "step": 12968 + }, + { + "epoch": 0.9762321458815559, + "grad_norm": 4.889604091644287, + "learning_rate": 1.666936518239681e-07, + "loss": 1.9573, + "step": 12969 + }, + { + "epoch": 0.976307420162216, + "grad_norm": 8.208456993103027, + "learning_rate": 1.657005004159995e-07, + "loss": 1.9067, + "step": 12970 + }, + { + "epoch": 0.9763826944428762, + "grad_norm": 5.595680236816406, + "learning_rate": 1.647103115141435e-07, + "loss": 1.7742, + "step": 12971 + }, + { + "epoch": 0.9764579687235364, + "grad_norm": 5.283383846282959, + "learning_rate": 1.6372308517726976e-07, + "loss": 1.9905, + "step": 12972 + }, + { + "epoch": 0.9765332430041965, + "grad_norm": 4.882968902587891, + "learning_rate": 1.6273882146405904e-07, + "loss": 1.7688, + "step": 12973 + }, + { + "epoch": 0.9766085172848566, + "grad_norm": 4.411565780639648, + "learning_rate": 1.6175752043302573e-07, + "loss": 1.5687, + "step": 12974 + }, + { + "epoch": 0.9766837915655169, + "grad_norm": 4.612016677856445, + "learning_rate": 1.6077918214250642e-07, + "loss": 1.9334, + "step": 12975 + }, + { + "epoch": 0.976759065846177, + "grad_norm": 7.30670690536499, + "learning_rate": 1.598038066506602e-07, + "loss": 2.1734, + "step": 12976 + }, + { + "epoch": 0.9768343401268371, + "grad_norm": 8.496973037719727, + "learning_rate": 1.5883139401546844e-07, + "loss": 1.9697, + "step": 12977 + }, + { + "epoch": 0.9769096144074974, + "grad_norm": 4.6441497802734375, + "learning_rate": 1.5786194429474044e-07, + "loss": 1.9611, + "step": 12978 + }, + { + "epoch": 0.9769848886881575, + "grad_norm": 3.7294232845306396, + "learning_rate": 1.5689545754610792e-07, + "loss": 2.0013, + "step": 12979 + }, + { + "epoch": 0.9770601629688176, + "grad_norm": 5.306214332580566, + "learning_rate": 1.5593193382702486e-07, + "loss": 1.9306, + "step": 12980 + }, + { + "epoch": 0.9771354372494778, + "grad_norm": 5.148443222045898, + "learning_rate": 1.5497137319476773e-07, + "loss": 1.894, + "step": 12981 + }, + { + "epoch": 0.9772107115301379, + "grad_norm": 6.160584926605225, + "learning_rate": 1.5401377570644083e-07, + "loss": 1.5534, + "step": 12982 + }, + { + "epoch": 0.9772859858107981, + "grad_norm": 5.997537136077881, + "learning_rate": 1.5305914141897082e-07, + "loss": 1.6788, + "step": 12983 + }, + { + "epoch": 0.9773612600914583, + "grad_norm": 7.141962051391602, + "learning_rate": 1.5210747038911234e-07, + "loss": 1.7355, + "step": 12984 + }, + { + "epoch": 0.9774365343721184, + "grad_norm": 5.960862636566162, + "learning_rate": 1.511587626734312e-07, + "loss": 1.6497, + "step": 12985 + }, + { + "epoch": 0.9775118086527785, + "grad_norm": 4.968209266662598, + "learning_rate": 1.5021301832833235e-07, + "loss": 1.7074, + "step": 12986 + }, + { + "epoch": 0.9775870829334388, + "grad_norm": 5.550293922424316, + "learning_rate": 1.4927023741003187e-07, + "loss": 2.1063, + "step": 12987 + }, + { + "epoch": 0.9776623572140989, + "grad_norm": 4.9811835289001465, + "learning_rate": 1.4833041997457942e-07, + "loss": 1.7594, + "step": 12988 + }, + { + "epoch": 0.977737631494759, + "grad_norm": 6.835657119750977, + "learning_rate": 1.473935660778414e-07, + "loss": 1.6718, + "step": 12989 + }, + { + "epoch": 0.9778129057754192, + "grad_norm": 4.086740493774414, + "learning_rate": 1.4645967577551212e-07, + "loss": 1.3885, + "step": 12990 + }, + { + "epoch": 0.9778881800560794, + "grad_norm": 5.872897624969482, + "learning_rate": 1.4552874912310832e-07, + "loss": 1.7537, + "step": 12991 + }, + { + "epoch": 0.9779634543367395, + "grad_norm": 5.258413791656494, + "learning_rate": 1.4460078617597462e-07, + "loss": 1.7176, + "step": 12992 + }, + { + "epoch": 0.9780387286173996, + "grad_norm": 3.5322299003601074, + "learning_rate": 1.4367578698927242e-07, + "loss": 1.8511, + "step": 12993 + }, + { + "epoch": 0.9781140028980598, + "grad_norm": 4.950979232788086, + "learning_rate": 1.427537516179911e-07, + "loss": 1.6264, + "step": 12994 + }, + { + "epoch": 0.97818927717872, + "grad_norm": 4.316150188446045, + "learning_rate": 1.4183468011694235e-07, + "loss": 1.6371, + "step": 12995 + }, + { + "epoch": 0.9782645514593801, + "grad_norm": 6.338021755218506, + "learning_rate": 1.409185725407658e-07, + "loss": 2.0498, + "step": 12996 + }, + { + "epoch": 0.9783398257400403, + "grad_norm": 7.142224311828613, + "learning_rate": 1.4000542894391232e-07, + "loss": 1.8443, + "step": 12997 + }, + { + "epoch": 0.9784151000207004, + "grad_norm": 4.912839412689209, + "learning_rate": 1.390952493806774e-07, + "loss": 1.5096, + "step": 12998 + }, + { + "epoch": 0.9784903743013605, + "grad_norm": 5.672285079956055, + "learning_rate": 1.3818803390515667e-07, + "loss": 1.9076, + "step": 12999 + }, + { + "epoch": 0.9785656485820208, + "grad_norm": 4.521214008331299, + "learning_rate": 1.3728378257129027e-07, + "loss": 1.6593, + "step": 13000 + }, + { + "epoch": 0.9786409228626809, + "grad_norm": 4.792366027832031, + "learning_rate": 1.3638249543283522e-07, + "loss": 2.3685, + "step": 13001 + }, + { + "epoch": 0.978716197143341, + "grad_norm": 8.043989181518555, + "learning_rate": 1.3548417254336532e-07, + "loss": 2.1338, + "step": 13002 + }, + { + "epoch": 0.9787914714240012, + "grad_norm": 4.693892478942871, + "learning_rate": 1.345888139562823e-07, + "loss": 2.0123, + "step": 13003 + }, + { + "epoch": 0.9788667457046614, + "grad_norm": 4.125730037689209, + "learning_rate": 1.336964197248103e-07, + "loss": 2.1044, + "step": 13004 + }, + { + "epoch": 0.9789420199853215, + "grad_norm": 4.785754680633545, + "learning_rate": 1.328069899020068e-07, + "loss": 1.724, + "step": 13005 + }, + { + "epoch": 0.9790172942659817, + "grad_norm": 4.683404445648193, + "learning_rate": 1.319205245407462e-07, + "loss": 1.9796, + "step": 13006 + }, + { + "epoch": 0.9790925685466418, + "grad_norm": 4.532181739807129, + "learning_rate": 1.3103702369371973e-07, + "loss": 1.8935, + "step": 13007 + }, + { + "epoch": 0.979167842827302, + "grad_norm": 5.674066543579102, + "learning_rate": 1.3015648741345199e-07, + "loss": 1.7685, + "step": 13008 + }, + { + "epoch": 0.9792431171079622, + "grad_norm": 5.136986255645752, + "learning_rate": 1.2927891575228446e-07, + "loss": 1.5399, + "step": 13009 + }, + { + "epoch": 0.9793183913886223, + "grad_norm": 4.792853355407715, + "learning_rate": 1.2840430876239206e-07, + "loss": 2.0651, + "step": 13010 + }, + { + "epoch": 0.9793936656692824, + "grad_norm": 4.621176242828369, + "learning_rate": 1.275326664957721e-07, + "loss": 1.8561, + "step": 13011 + }, + { + "epoch": 0.9794689399499426, + "grad_norm": 4.739818572998047, + "learning_rate": 1.266639890042276e-07, + "loss": 1.7558, + "step": 13012 + }, + { + "epoch": 0.9795442142306028, + "grad_norm": 8.046173095703125, + "learning_rate": 1.2579827633941166e-07, + "loss": 1.9338, + "step": 13013 + }, + { + "epoch": 0.9796194885112629, + "grad_norm": 5.873472690582275, + "learning_rate": 1.2493552855278313e-07, + "loss": 1.7114, + "step": 13014 + }, + { + "epoch": 0.979694762791923, + "grad_norm": 4.823851585388184, + "learning_rate": 1.240757456956232e-07, + "loss": 1.4271, + "step": 13015 + }, + { + "epoch": 0.9797700370725833, + "grad_norm": 6.730445384979248, + "learning_rate": 1.2321892781905763e-07, + "loss": 1.7472, + "step": 13016 + }, + { + "epoch": 0.9798453113532434, + "grad_norm": 5.2308454513549805, + "learning_rate": 1.223650749740124e-07, + "loss": 1.986, + "step": 13017 + }, + { + "epoch": 0.9799205856339035, + "grad_norm": 6.757125377655029, + "learning_rate": 1.2151418721124684e-07, + "loss": 2.0117, + "step": 13018 + }, + { + "epoch": 0.9799958599145637, + "grad_norm": 3.811600685119629, + "learning_rate": 1.2066626458134278e-07, + "loss": 1.8664, + "step": 13019 + }, + { + "epoch": 0.9800711341952238, + "grad_norm": 4.637233257293701, + "learning_rate": 1.198213071347154e-07, + "loss": 1.7718, + "step": 13020 + }, + { + "epoch": 0.980146408475884, + "grad_norm": 4.9097795486450195, + "learning_rate": 1.1897931492158565e-07, + "loss": 2.0681, + "step": 13021 + }, + { + "epoch": 0.9802216827565442, + "grad_norm": 4.5483078956604, + "learning_rate": 1.1814028799201348e-07, + "loss": 2.0928, + "step": 13022 + }, + { + "epoch": 0.9802969570372043, + "grad_norm": 4.831428050994873, + "learning_rate": 1.1730422639587013e-07, + "loss": 1.893, + "step": 13023 + }, + { + "epoch": 0.9803722313178644, + "grad_norm": 8.606104850769043, + "learning_rate": 1.1647113018286027e-07, + "loss": 2.0254, + "step": 13024 + }, + { + "epoch": 0.9804475055985247, + "grad_norm": 5.141611576080322, + "learning_rate": 1.156409994025165e-07, + "loss": 2.1332, + "step": 13025 + }, + { + "epoch": 0.9805227798791848, + "grad_norm": 4.279614448547363, + "learning_rate": 1.1481383410417712e-07, + "loss": 1.8194, + "step": 13026 + }, + { + "epoch": 0.9805980541598449, + "grad_norm": 6.046935081481934, + "learning_rate": 1.1398963433701393e-07, + "loss": 1.5399, + "step": 13027 + }, + { + "epoch": 0.9806733284405051, + "grad_norm": 4.307223796844482, + "learning_rate": 1.1316840015003772e-07, + "loss": 1.9866, + "step": 13028 + }, + { + "epoch": 0.9807486027211653, + "grad_norm": 5.5199737548828125, + "learning_rate": 1.1235013159205387e-07, + "loss": 1.8194, + "step": 13029 + }, + { + "epoch": 0.9808238770018254, + "grad_norm": 12.91816520690918, + "learning_rate": 1.1153482871170684e-07, + "loss": 2.1331, + "step": 13030 + }, + { + "epoch": 0.9808991512824855, + "grad_norm": 5.013204097747803, + "learning_rate": 1.107224915574745e-07, + "loss": 2.1744, + "step": 13031 + }, + { + "epoch": 0.9809744255631457, + "grad_norm": 5.964956283569336, + "learning_rate": 1.0991312017764044e-07, + "loss": 2.1114, + "step": 13032 + }, + { + "epoch": 0.9810496998438059, + "grad_norm": 5.463572025299072, + "learning_rate": 1.0910671462031618e-07, + "loss": 1.708, + "step": 13033 + }, + { + "epoch": 0.981124974124466, + "grad_norm": 4.063724994659424, + "learning_rate": 1.0830327493344672e-07, + "loss": 1.6479, + "step": 13034 + }, + { + "epoch": 0.9812002484051262, + "grad_norm": 5.764823913574219, + "learning_rate": 1.0750280116479383e-07, + "loss": 1.8031, + "step": 13035 + }, + { + "epoch": 0.9812755226857863, + "grad_norm": 4.788595199584961, + "learning_rate": 1.0670529336194168e-07, + "loss": 1.7181, + "step": 13036 + }, + { + "epoch": 0.9813507969664464, + "grad_norm": 4.862103462219238, + "learning_rate": 1.0591075157230235e-07, + "loss": 1.9211, + "step": 13037 + }, + { + "epoch": 0.9814260712471067, + "grad_norm": 5.095187664031982, + "learning_rate": 1.0511917584310472e-07, + "loss": 1.7008, + "step": 13038 + }, + { + "epoch": 0.9815013455277668, + "grad_norm": 4.9878830909729, + "learning_rate": 1.0433056622140558e-07, + "loss": 1.9138, + "step": 13039 + }, + { + "epoch": 0.9815766198084269, + "grad_norm": 4.880194664001465, + "learning_rate": 1.0354492275408966e-07, + "loss": 1.9044, + "step": 13040 + }, + { + "epoch": 0.9816518940890872, + "grad_norm": 4.527126789093018, + "learning_rate": 1.027622454878585e-07, + "loss": 1.5647, + "step": 13041 + }, + { + "epoch": 0.9817271683697473, + "grad_norm": 5.153310298919678, + "learning_rate": 1.01982534469236e-07, + "loss": 2.1593, + "step": 13042 + }, + { + "epoch": 0.9818024426504074, + "grad_norm": 4.031414985656738, + "learning_rate": 1.0120578974458506e-07, + "loss": 1.7779, + "step": 13043 + }, + { + "epoch": 0.9818777169310676, + "grad_norm": 5.343712329864502, + "learning_rate": 1.0043201136006874e-07, + "loss": 1.8917, + "step": 13044 + }, + { + "epoch": 0.9819529912117277, + "grad_norm": 5.463987350463867, + "learning_rate": 9.966119936170026e-08, + "loss": 1.7369, + "step": 13045 + }, + { + "epoch": 0.9820282654923879, + "grad_norm": 5.339509963989258, + "learning_rate": 9.889335379528186e-08, + "loss": 1.7345, + "step": 13046 + }, + { + "epoch": 0.9821035397730481, + "grad_norm": 4.834533214569092, + "learning_rate": 9.812847470647702e-08, + "loss": 1.8672, + "step": 13047 + }, + { + "epoch": 0.9821788140537082, + "grad_norm": 7.370434761047363, + "learning_rate": 9.73665621407549e-08, + "loss": 2.0389, + "step": 13048 + }, + { + "epoch": 0.9822540883343683, + "grad_norm": 5.545389175415039, + "learning_rate": 9.660761614339598e-08, + "loss": 1.9579, + "step": 13049 + }, + { + "epoch": 0.9823293626150285, + "grad_norm": 5.122701644897461, + "learning_rate": 9.58516367595308e-08, + "loss": 1.7374, + "step": 13050 + }, + { + "epoch": 0.9824046368956887, + "grad_norm": 6.175267696380615, + "learning_rate": 9.50986240340901e-08, + "loss": 1.5349, + "step": 13051 + }, + { + "epoch": 0.9824799111763488, + "grad_norm": 5.517742156982422, + "learning_rate": 9.434857801184915e-08, + "loss": 1.9744, + "step": 13052 + }, + { + "epoch": 0.9825551854570089, + "grad_norm": 3.5727803707122803, + "learning_rate": 9.360149873738345e-08, + "loss": 1.6103, + "step": 13053 + }, + { + "epoch": 0.9826304597376692, + "grad_norm": 4.6505584716796875, + "learning_rate": 9.285738625511298e-08, + "loss": 1.9874, + "step": 13054 + }, + { + "epoch": 0.9827057340183293, + "grad_norm": 4.945466995239258, + "learning_rate": 9.211624060926904e-08, + "loss": 1.9691, + "step": 13055 + }, + { + "epoch": 0.9827810082989894, + "grad_norm": 4.510948657989502, + "learning_rate": 9.137806184391639e-08, + "loss": 1.5888, + "step": 13056 + }, + { + "epoch": 0.9828562825796496, + "grad_norm": 4.9971137046813965, + "learning_rate": 9.06428500029366e-08, + "loss": 1.7101, + "step": 13057 + }, + { + "epoch": 0.9829315568603098, + "grad_norm": 4.093054294586182, + "learning_rate": 8.991060513002803e-08, + "loss": 1.7063, + "step": 13058 + }, + { + "epoch": 0.9830068311409699, + "grad_norm": 3.8774163722991943, + "learning_rate": 8.918132726872808e-08, + "loss": 1.7303, + "step": 13059 + }, + { + "epoch": 0.9830821054216301, + "grad_norm": 8.15949535369873, + "learning_rate": 8.845501646239096e-08, + "loss": 2.1858, + "step": 13060 + }, + { + "epoch": 0.9831573797022902, + "grad_norm": 4.288280010223389, + "learning_rate": 8.773167275418214e-08, + "loss": 1.8548, + "step": 13061 + }, + { + "epoch": 0.9832326539829503, + "grad_norm": 5.814545154571533, + "learning_rate": 8.701129618712279e-08, + "loss": 1.6416, + "step": 13062 + }, + { + "epoch": 0.9833079282636106, + "grad_norm": 5.314117908477783, + "learning_rate": 8.62938868040175e-08, + "loss": 1.8706, + "step": 13063 + }, + { + "epoch": 0.9833832025442707, + "grad_norm": 4.418406009674072, + "learning_rate": 8.55794446475211e-08, + "loss": 1.7888, + "step": 13064 + }, + { + "epoch": 0.9834584768249308, + "grad_norm": 4.633051872253418, + "learning_rate": 8.486796976011069e-08, + "loss": 1.537, + "step": 13065 + }, + { + "epoch": 0.983533751105591, + "grad_norm": 6.053751468658447, + "learning_rate": 8.415946218407466e-08, + "loss": 1.848, + "step": 13066 + }, + { + "epoch": 0.9836090253862512, + "grad_norm": 4.960136890411377, + "learning_rate": 8.345392196153489e-08, + "loss": 1.7763, + "step": 13067 + }, + { + "epoch": 0.9836842996669113, + "grad_norm": 5.135873317718506, + "learning_rate": 8.275134913443005e-08, + "loss": 1.9022, + "step": 13068 + }, + { + "epoch": 0.9837595739475714, + "grad_norm": 4.405609607696533, + "learning_rate": 8.205174374453228e-08, + "loss": 1.7956, + "step": 13069 + }, + { + "epoch": 0.9838348482282316, + "grad_norm": 5.821405410766602, + "learning_rate": 8.135510583342498e-08, + "loss": 1.9378, + "step": 13070 + }, + { + "epoch": 0.9839101225088918, + "grad_norm": 4.579228401184082, + "learning_rate": 8.066143544253058e-08, + "loss": 1.9251, + "step": 13071 + }, + { + "epoch": 0.9839853967895519, + "grad_norm": 5.583433151245117, + "learning_rate": 7.997073261307164e-08, + "loss": 2.0254, + "step": 13072 + }, + { + "epoch": 0.9840606710702121, + "grad_norm": 5.75313663482666, + "learning_rate": 7.928299738612088e-08, + "loss": 1.7007, + "step": 13073 + }, + { + "epoch": 0.9841359453508722, + "grad_norm": 4.439338684082031, + "learning_rate": 7.859822980255115e-08, + "loss": 1.5112, + "step": 13074 + }, + { + "epoch": 0.9842112196315324, + "grad_norm": 6.740578651428223, + "learning_rate": 7.79164299030799e-08, + "loss": 2.1429, + "step": 13075 + }, + { + "epoch": 0.9842864939121926, + "grad_norm": 4.406201362609863, + "learning_rate": 7.72375977282358e-08, + "loss": 1.6211, + "step": 13076 + }, + { + "epoch": 0.9843617681928527, + "grad_norm": 4.113922595977783, + "learning_rate": 7.656173331836991e-08, + "loss": 1.7484, + "step": 13077 + }, + { + "epoch": 0.9844370424735128, + "grad_norm": 4.590651035308838, + "learning_rate": 7.588883671366675e-08, + "loss": 1.725, + "step": 13078 + }, + { + "epoch": 0.9845123167541731, + "grad_norm": 5.183358669281006, + "learning_rate": 7.521890795411657e-08, + "loss": 1.6976, + "step": 13079 + }, + { + "epoch": 0.9845875910348332, + "grad_norm": 5.077670574188232, + "learning_rate": 7.45519470795597e-08, + "loss": 1.6645, + "step": 13080 + }, + { + "epoch": 0.9846628653154933, + "grad_norm": 4.6408796310424805, + "learning_rate": 7.388795412963112e-08, + "loss": 1.5964, + "step": 13081 + }, + { + "epoch": 0.9847381395961535, + "grad_norm": 4.676085948944092, + "learning_rate": 7.32269291438159e-08, + "loss": 1.876, + "step": 13082 + }, + { + "epoch": 0.9848134138768136, + "grad_norm": 7.34224271774292, + "learning_rate": 7.256887216139929e-08, + "loss": 2.0622, + "step": 13083 + }, + { + "epoch": 0.9848886881574738, + "grad_norm": 5.3807373046875, + "learning_rate": 7.191378322150555e-08, + "loss": 1.6785, + "step": 13084 + }, + { + "epoch": 0.984963962438134, + "grad_norm": 4.592549800872803, + "learning_rate": 7.126166236307575e-08, + "loss": 1.8521, + "step": 13085 + }, + { + "epoch": 0.9850392367187941, + "grad_norm": 4.799075126647949, + "learning_rate": 7.061250962488441e-08, + "loss": 1.763, + "step": 13086 + }, + { + "epoch": 0.9851145109994542, + "grad_norm": 4.773825168609619, + "learning_rate": 6.996632504551182e-08, + "loss": 1.8987, + "step": 13087 + }, + { + "epoch": 0.9851897852801145, + "grad_norm": 5.173593044281006, + "learning_rate": 6.932310866337721e-08, + "loss": 1.5234, + "step": 13088 + }, + { + "epoch": 0.9852650595607746, + "grad_norm": 4.833860874176025, + "learning_rate": 6.868286051671669e-08, + "loss": 1.5358, + "step": 13089 + }, + { + "epoch": 0.9853403338414347, + "grad_norm": 4.161336898803711, + "learning_rate": 6.804558064358868e-08, + "loss": 1.5907, + "step": 13090 + }, + { + "epoch": 0.9854156081220948, + "grad_norm": 4.897314548492432, + "learning_rate": 6.741126908187956e-08, + "loss": 1.4992, + "step": 13091 + }, + { + "epoch": 0.9854908824027551, + "grad_norm": 4.734277725219727, + "learning_rate": 6.67799258693036e-08, + "loss": 1.6127, + "step": 13092 + }, + { + "epoch": 0.9855661566834152, + "grad_norm": 4.174915313720703, + "learning_rate": 6.615155104338077e-08, + "loss": 1.5206, + "step": 13093 + }, + { + "epoch": 0.9856414309640753, + "grad_norm": 5.145100116729736, + "learning_rate": 6.552614464147566e-08, + "loss": 2.0955, + "step": 13094 + }, + { + "epoch": 0.9857167052447355, + "grad_norm": 5.781323432922363, + "learning_rate": 6.490370670076407e-08, + "loss": 2.1061, + "step": 13095 + }, + { + "epoch": 0.9857919795253957, + "grad_norm": 5.626537799835205, + "learning_rate": 6.428423725824417e-08, + "loss": 1.9053, + "step": 13096 + }, + { + "epoch": 0.9858672538060558, + "grad_norm": 5.205971717834473, + "learning_rate": 6.366773635074208e-08, + "loss": 1.6103, + "step": 13097 + }, + { + "epoch": 0.985942528086716, + "grad_norm": 5.263709545135498, + "learning_rate": 6.30542040149118e-08, + "loss": 2.2085, + "step": 13098 + }, + { + "epoch": 0.9860178023673761, + "grad_norm": 5.402046203613281, + "learning_rate": 6.244364028722971e-08, + "loss": 1.9088, + "step": 13099 + }, + { + "epoch": 0.9860930766480362, + "grad_norm": 4.687032222747803, + "learning_rate": 6.18360452039779e-08, + "loss": 1.7946, + "step": 13100 + }, + { + "epoch": 0.9861683509286965, + "grad_norm": 3.61903715133667, + "learning_rate": 6.123141880128857e-08, + "loss": 1.8414, + "step": 13101 + }, + { + "epoch": 0.9862436252093566, + "grad_norm": 4.242746353149414, + "learning_rate": 6.062976111509966e-08, + "loss": 2.0542, + "step": 13102 + }, + { + "epoch": 0.9863188994900167, + "grad_norm": 7.258364200592041, + "learning_rate": 6.003107218118253e-08, + "loss": 1.6877, + "step": 13103 + }, + { + "epoch": 0.986394173770677, + "grad_norm": 5.150749683380127, + "learning_rate": 5.94353520351254e-08, + "loss": 1.9453, + "step": 13104 + }, + { + "epoch": 0.9864694480513371, + "grad_norm": 5.658138275146484, + "learning_rate": 5.8842600712338825e-08, + "loss": 1.845, + "step": 13105 + }, + { + "epoch": 0.9865447223319972, + "grad_norm": 5.427069187164307, + "learning_rate": 5.825281824805573e-08, + "loss": 1.7285, + "step": 13106 + }, + { + "epoch": 0.9866199966126574, + "grad_norm": 4.351526737213135, + "learning_rate": 5.7666004677353616e-08, + "loss": 1.9474, + "step": 13107 + }, + { + "epoch": 0.9866952708933175, + "grad_norm": 7.731470584869385, + "learning_rate": 5.708216003509903e-08, + "loss": 1.341, + "step": 13108 + }, + { + "epoch": 0.9867705451739777, + "grad_norm": 5.416434288024902, + "learning_rate": 5.65012843560142e-08, + "loss": 1.9776, + "step": 13109 + }, + { + "epoch": 0.9868458194546378, + "grad_norm": 4.176894664764404, + "learning_rate": 5.592337767462152e-08, + "loss": 2.0167, + "step": 13110 + }, + { + "epoch": 0.986921093735298, + "grad_norm": 6.282223224639893, + "learning_rate": 5.5348440025276835e-08, + "loss": 1.9449, + "step": 13111 + }, + { + "epoch": 0.9869963680159581, + "grad_norm": 5.001054286956787, + "learning_rate": 5.477647144216391e-08, + "loss": 1.5154, + "step": 13112 + }, + { + "epoch": 0.9870716422966183, + "grad_norm": 6.220345497131348, + "learning_rate": 5.420747195927778e-08, + "loss": 1.773, + "step": 13113 + }, + { + "epoch": 0.9871469165772785, + "grad_norm": 5.983572959899902, + "learning_rate": 5.364144161044693e-08, + "loss": 1.893, + "step": 13114 + }, + { + "epoch": 0.9872221908579386, + "grad_norm": 4.614436626434326, + "learning_rate": 5.307838042931667e-08, + "loss": 1.8526, + "step": 13115 + }, + { + "epoch": 0.9872974651385987, + "grad_norm": 4.497433185577393, + "learning_rate": 5.251828844937135e-08, + "loss": 1.8405, + "step": 13116 + }, + { + "epoch": 0.987372739419259, + "grad_norm": 4.064354419708252, + "learning_rate": 5.1961165703889866e-08, + "loss": 1.7328, + "step": 13117 + }, + { + "epoch": 0.9874480136999191, + "grad_norm": 4.049808502197266, + "learning_rate": 5.1407012226006855e-08, + "loss": 1.4904, + "step": 13118 + }, + { + "epoch": 0.9875232879805792, + "grad_norm": 4.754990100860596, + "learning_rate": 5.0855828048657073e-08, + "loss": 1.6312, + "step": 13119 + }, + { + "epoch": 0.9875985622612394, + "grad_norm": 6.001760959625244, + "learning_rate": 5.03076132046032e-08, + "loss": 1.8852, + "step": 13120 + }, + { + "epoch": 0.9876738365418996, + "grad_norm": 4.823469638824463, + "learning_rate": 4.9762367726446935e-08, + "loss": 1.6977, + "step": 13121 + }, + { + "epoch": 0.9877491108225597, + "grad_norm": 5.914470672607422, + "learning_rate": 4.922009164659014e-08, + "loss": 2.2929, + "step": 13122 + }, + { + "epoch": 0.9878243851032199, + "grad_norm": 5.593662738800049, + "learning_rate": 4.8680784997273684e-08, + "loss": 1.8996, + "step": 13123 + }, + { + "epoch": 0.98789965938388, + "grad_norm": 6.188007831573486, + "learning_rate": 4.8144447810560825e-08, + "loss": 1.7584, + "step": 13124 + }, + { + "epoch": 0.9879749336645401, + "grad_norm": 7.864127159118652, + "learning_rate": 4.7611080118331596e-08, + "loss": 1.9415, + "step": 13125 + }, + { + "epoch": 0.9880502079452004, + "grad_norm": 5.918152332305908, + "learning_rate": 4.708068195229398e-08, + "loss": 1.8846, + "step": 13126 + }, + { + "epoch": 0.9881254822258605, + "grad_norm": 4.015069007873535, + "learning_rate": 4.655325334397831e-08, + "loss": 1.6841, + "step": 13127 + }, + { + "epoch": 0.9882007565065206, + "grad_norm": 6.2730302810668945, + "learning_rate": 4.602879432473728e-08, + "loss": 1.9123, + "step": 13128 + }, + { + "epoch": 0.9882760307871807, + "grad_norm": 4.26728630065918, + "learning_rate": 4.550730492575705e-08, + "loss": 1.6457, + "step": 13129 + }, + { + "epoch": 0.988351305067841, + "grad_norm": 4.477145195007324, + "learning_rate": 4.498878517802396e-08, + "loss": 1.5855, + "step": 13130 + }, + { + "epoch": 0.9884265793485011, + "grad_norm": 4.947357654571533, + "learning_rate": 4.447323511237999e-08, + "loss": 1.8056, + "step": 13131 + }, + { + "epoch": 0.9885018536291612, + "grad_norm": 4.1914286613464355, + "learning_rate": 4.396065475945621e-08, + "loss": 1.7953, + "step": 13132 + }, + { + "epoch": 0.9885771279098214, + "grad_norm": 5.948415756225586, + "learning_rate": 4.3451044149733777e-08, + "loss": 1.6858, + "step": 13133 + }, + { + "epoch": 0.9886524021904816, + "grad_norm": 3.814768075942993, + "learning_rate": 4.2944403313505135e-08, + "loss": 1.8172, + "step": 13134 + }, + { + "epoch": 0.9887276764711417, + "grad_norm": 5.156869411468506, + "learning_rate": 4.2440732280885075e-08, + "loss": 1.5084, + "step": 13135 + }, + { + "epoch": 0.9888029507518019, + "grad_norm": 4.52300500869751, + "learning_rate": 4.194003108182742e-08, + "loss": 1.8068, + "step": 13136 + }, + { + "epoch": 0.988878225032462, + "grad_norm": 4.190985679626465, + "learning_rate": 4.1442299746080603e-08, + "loss": 1.8246, + "step": 13137 + }, + { + "epoch": 0.9889534993131222, + "grad_norm": 5.313257694244385, + "learning_rate": 4.094753830324871e-08, + "loss": 1.792, + "step": 13138 + }, + { + "epoch": 0.9890287735937824, + "grad_norm": 5.041929244995117, + "learning_rate": 4.0455746782736007e-08, + "loss": 1.8836, + "step": 13139 + }, + { + "epoch": 0.9891040478744425, + "grad_norm": 4.644732475280762, + "learning_rate": 3.9966925213774654e-08, + "loss": 1.5816, + "step": 13140 + }, + { + "epoch": 0.9891793221551026, + "grad_norm": 6.309013366699219, + "learning_rate": 3.9481073625430296e-08, + "loss": 1.9388, + "step": 13141 + }, + { + "epoch": 0.9892545964357629, + "grad_norm": 5.337410926818848, + "learning_rate": 3.8998192046585394e-08, + "loss": 2.1094, + "step": 13142 + }, + { + "epoch": 0.989329870716423, + "grad_norm": 8.428162574768066, + "learning_rate": 3.851828050594475e-08, + "loss": 2.0227, + "step": 13143 + }, + { + "epoch": 0.9894051449970831, + "grad_norm": 5.643184661865234, + "learning_rate": 3.804133903203e-08, + "loss": 1.776, + "step": 13144 + }, + { + "epoch": 0.9894804192777433, + "grad_norm": 5.555056095123291, + "learning_rate": 3.756736765320734e-08, + "loss": 1.6066, + "step": 13145 + }, + { + "epoch": 0.9895556935584034, + "grad_norm": 5.052450180053711, + "learning_rate": 3.709636639764313e-08, + "loss": 1.8886, + "step": 13146 + }, + { + "epoch": 0.9896309678390636, + "grad_norm": 5.18364143371582, + "learning_rate": 3.662833529334275e-08, + "loss": 1.3593, + "step": 13147 + }, + { + "epoch": 0.9897062421197237, + "grad_norm": 4.390613555908203, + "learning_rate": 3.6163274368122834e-08, + "loss": 2.2483, + "step": 13148 + }, + { + "epoch": 0.9897815164003839, + "grad_norm": 3.8033335208892822, + "learning_rate": 3.570118364963904e-08, + "loss": 1.9823, + "step": 13149 + }, + { + "epoch": 0.989856790681044, + "grad_norm": 7.855715751647949, + "learning_rate": 3.5242063165352725e-08, + "loss": 1.7753, + "step": 13150 + }, + { + "epoch": 0.9899320649617042, + "grad_norm": 4.596234321594238, + "learning_rate": 3.478591294256428e-08, + "loss": 2.1249, + "step": 13151 + }, + { + "epoch": 0.9900073392423644, + "grad_norm": 4.507048606872559, + "learning_rate": 3.43327330083798e-08, + "loss": 1.61, + "step": 13152 + }, + { + "epoch": 0.9900826135230245, + "grad_norm": 4.837597370147705, + "learning_rate": 3.3882523389755504e-08, + "loss": 1.4973, + "step": 13153 + }, + { + "epoch": 0.9901578878036846, + "grad_norm": 4.154238224029541, + "learning_rate": 3.343528411344221e-08, + "loss": 1.7266, + "step": 13154 + }, + { + "epoch": 0.9902331620843449, + "grad_norm": 4.838868141174316, + "learning_rate": 3.299101520602976e-08, + "loss": 1.582, + "step": 13155 + }, + { + "epoch": 0.990308436365005, + "grad_norm": 3.949821710586548, + "learning_rate": 3.2549716693930365e-08, + "loss": 1.5112, + "step": 13156 + }, + { + "epoch": 0.9903837106456651, + "grad_norm": 4.174725532531738, + "learning_rate": 3.2111388603378586e-08, + "loss": 2.0756, + "step": 13157 + }, + { + "epoch": 0.9904589849263253, + "grad_norm": 4.587085247039795, + "learning_rate": 3.167603096043137e-08, + "loss": 2.0162, + "step": 13158 + }, + { + "epoch": 0.9905342592069855, + "grad_norm": 3.627316474914551, + "learning_rate": 3.1243643790968e-08, + "loss": 1.8721, + "step": 13159 + }, + { + "epoch": 0.9906095334876456, + "grad_norm": 5.89481782913208, + "learning_rate": 3.0814227120695707e-08, + "loss": 1.9895, + "step": 13160 + }, + { + "epoch": 0.9906848077683058, + "grad_norm": 6.094086170196533, + "learning_rate": 3.0387780975138505e-08, + "loss": 1.8393, + "step": 13161 + }, + { + "epoch": 0.9907600820489659, + "grad_norm": 5.741156578063965, + "learning_rate": 2.996430537964834e-08, + "loss": 1.8518, + "step": 13162 + }, + { + "epoch": 0.990835356329626, + "grad_norm": 5.1079936027526855, + "learning_rate": 2.954380035939952e-08, + "loss": 2.0655, + "step": 13163 + }, + { + "epoch": 0.9909106306102863, + "grad_norm": 4.007481575012207, + "learning_rate": 2.912626593938872e-08, + "loss": 1.5509, + "step": 13164 + }, + { + "epoch": 0.9909859048909464, + "grad_norm": 5.811824798583984, + "learning_rate": 2.8711702144446074e-08, + "loss": 1.8812, + "step": 13165 + }, + { + "epoch": 0.9910611791716065, + "grad_norm": 4.512807846069336, + "learning_rate": 2.830010899920188e-08, + "loss": 1.7058, + "step": 13166 + }, + { + "epoch": 0.9911364534522666, + "grad_norm": 4.473937511444092, + "learning_rate": 2.7891486528136558e-08, + "loss": 1.5665, + "step": 13167 + }, + { + "epoch": 0.9912117277329269, + "grad_norm": 4.295932292938232, + "learning_rate": 2.7485834755530683e-08, + "loss": 1.6102, + "step": 13168 + }, + { + "epoch": 0.991287002013587, + "grad_norm": 5.402720928192139, + "learning_rate": 2.70831537055094e-08, + "loss": 1.7475, + "step": 13169 + }, + { + "epoch": 0.9913622762942471, + "grad_norm": 4.579638481140137, + "learning_rate": 2.668344340200357e-08, + "loss": 1.649, + "step": 13170 + }, + { + "epoch": 0.9914375505749073, + "grad_norm": 4.465633392333984, + "learning_rate": 2.6286703868777518e-08, + "loss": 1.6987, + "step": 13171 + }, + { + "epoch": 0.9915128248555675, + "grad_norm": 4.027198314666748, + "learning_rate": 2.5892935129417927e-08, + "loss": 1.6258, + "step": 13172 + }, + { + "epoch": 0.9915880991362276, + "grad_norm": 6.073844909667969, + "learning_rate": 2.5502137207333853e-08, + "loss": 1.7642, + "step": 13173 + }, + { + "epoch": 0.9916633734168878, + "grad_norm": 4.4179911613464355, + "learning_rate": 2.5114310125751162e-08, + "loss": 2.1268, + "step": 13174 + }, + { + "epoch": 0.9917386476975479, + "grad_norm": 8.082771301269531, + "learning_rate": 2.4729453907729182e-08, + "loss": 2.4059, + "step": 13175 + }, + { + "epoch": 0.9918139219782081, + "grad_norm": 4.373549938201904, + "learning_rate": 2.4347568576144064e-08, + "loss": 1.7302, + "step": 13176 + }, + { + "epoch": 0.9918891962588683, + "grad_norm": 4.33629035949707, + "learning_rate": 2.3968654153699866e-08, + "loss": 1.9269, + "step": 13177 + }, + { + "epoch": 0.9919644705395284, + "grad_norm": 5.392022609710693, + "learning_rate": 2.3592710662923012e-08, + "loss": 1.9855, + "step": 13178 + }, + { + "epoch": 0.9920397448201885, + "grad_norm": 4.506936073303223, + "learning_rate": 2.3219738126162293e-08, + "loss": 1.6856, + "step": 13179 + }, + { + "epoch": 0.9921150191008488, + "grad_norm": 6.104228973388672, + "learning_rate": 2.2849736565588865e-08, + "loss": 1.5727, + "step": 13180 + }, + { + "epoch": 0.9921902933815089, + "grad_norm": 4.354866981506348, + "learning_rate": 2.2482706003201794e-08, + "loss": 1.884, + "step": 13181 + }, + { + "epoch": 0.992265567662169, + "grad_norm": 4.863189697265625, + "learning_rate": 2.211864646081141e-08, + "loss": 1.5875, + "step": 13182 + }, + { + "epoch": 0.9923408419428292, + "grad_norm": 7.900731563568115, + "learning_rate": 2.1757557960061513e-08, + "loss": 2.2157, + "step": 13183 + }, + { + "epoch": 0.9924161162234894, + "grad_norm": 5.169208526611328, + "learning_rate": 2.139944052242937e-08, + "loss": 1.7159, + "step": 13184 + }, + { + "epoch": 0.9924913905041495, + "grad_norm": 4.612951755523682, + "learning_rate": 2.104429416918685e-08, + "loss": 2.0171, + "step": 13185 + }, + { + "epoch": 0.9925666647848097, + "grad_norm": 5.994222164154053, + "learning_rate": 2.06921189214615e-08, + "loss": 1.592, + "step": 13186 + }, + { + "epoch": 0.9926419390654698, + "grad_norm": 6.375983715057373, + "learning_rate": 2.0342914800181022e-08, + "loss": 1.7444, + "step": 13187 + }, + { + "epoch": 0.99271721334613, + "grad_norm": 4.1232171058654785, + "learning_rate": 1.999668182610659e-08, + "loss": 1.4631, + "step": 13188 + }, + { + "epoch": 0.9927924876267901, + "grad_norm": 4.350636959075928, + "learning_rate": 1.9653420019821735e-08, + "loss": 1.5617, + "step": 13189 + }, + { + "epoch": 0.9928677619074503, + "grad_norm": 4.18228816986084, + "learning_rate": 1.9313129401732355e-08, + "loss": 1.7492, + "step": 13190 + }, + { + "epoch": 0.9929430361881104, + "grad_norm": 6.83117151260376, + "learning_rate": 1.897580999206672e-08, + "loss": 1.4845, + "step": 13191 + }, + { + "epoch": 0.9930183104687705, + "grad_norm": 4.776318550109863, + "learning_rate": 1.864146181087545e-08, + "loss": 1.7254, + "step": 13192 + }, + { + "epoch": 0.9930935847494308, + "grad_norm": 5.618453025817871, + "learning_rate": 1.8310084878037093e-08, + "loss": 1.442, + "step": 13193 + }, + { + "epoch": 0.9931688590300909, + "grad_norm": 5.079207897186279, + "learning_rate": 1.7981679213247005e-08, + "loss": 1.5268, + "step": 13194 + }, + { + "epoch": 0.993244133310751, + "grad_norm": 7.378453254699707, + "learning_rate": 1.765624483603956e-08, + "loss": 1.7862, + "step": 13195 + }, + { + "epoch": 0.9933194075914112, + "grad_norm": 4.970518112182617, + "learning_rate": 1.733378176574929e-08, + "loss": 1.8096, + "step": 13196 + }, + { + "epoch": 0.9933946818720714, + "grad_norm": 5.081238746643066, + "learning_rate": 1.70142900215442e-08, + "loss": 1.8428, + "step": 13197 + }, + { + "epoch": 0.9934699561527315, + "grad_norm": 6.832697868347168, + "learning_rate": 1.6697769622431303e-08, + "loss": 2.2327, + "step": 13198 + }, + { + "epoch": 0.9935452304333917, + "grad_norm": 4.939356803894043, + "learning_rate": 1.6384220587212228e-08, + "loss": 1.5723, + "step": 13199 + }, + { + "epoch": 0.9936205047140518, + "grad_norm": 5.165452480316162, + "learning_rate": 1.6073642934533172e-08, + "loss": 1.8158, + "step": 13200 + }, + { + "epoch": 0.993695778994712, + "grad_norm": 5.340014934539795, + "learning_rate": 1.5766036682857142e-08, + "loss": 1.7935, + "step": 13201 + }, + { + "epoch": 0.9937710532753722, + "grad_norm": 3.8375563621520996, + "learning_rate": 1.5461401850463964e-08, + "loss": 2.3243, + "step": 13202 + }, + { + "epoch": 0.9938463275560323, + "grad_norm": 7.1038103103637695, + "learning_rate": 1.5159738455478022e-08, + "loss": 1.7332, + "step": 13203 + }, + { + "epoch": 0.9939216018366924, + "grad_norm": 5.9426422119140625, + "learning_rate": 1.4861046515818322e-08, + "loss": 1.667, + "step": 13204 + }, + { + "epoch": 0.9939968761173527, + "grad_norm": 4.481041431427002, + "learning_rate": 1.4565326049242877e-08, + "loss": 1.8162, + "step": 13205 + }, + { + "epoch": 0.9940721503980128, + "grad_norm": 6.005204677581787, + "learning_rate": 1.4272577073337624e-08, + "loss": 2.037, + "step": 13206 + }, + { + "epoch": 0.9941474246786729, + "grad_norm": 5.24858283996582, + "learning_rate": 1.3982799605505303e-08, + "loss": 1.6847, + "step": 13207 + }, + { + "epoch": 0.994222698959333, + "grad_norm": 5.851583957672119, + "learning_rate": 1.3695993662965478e-08, + "loss": 1.9851, + "step": 13208 + }, + { + "epoch": 0.9942979732399932, + "grad_norm": 5.044328212738037, + "learning_rate": 1.3412159262771174e-08, + "loss": 1.8975, + "step": 13209 + }, + { + "epoch": 0.9943732475206534, + "grad_norm": 5.559646129608154, + "learning_rate": 1.3131296421797778e-08, + "loss": 1.7824, + "step": 13210 + }, + { + "epoch": 0.9944485218013135, + "grad_norm": 6.9781904220581055, + "learning_rate": 1.2853405156743048e-08, + "loss": 1.8259, + "step": 13211 + }, + { + "epoch": 0.9945237960819737, + "grad_norm": 5.184886455535889, + "learning_rate": 1.257848548412155e-08, + "loss": 1.7331, + "step": 13212 + }, + { + "epoch": 0.9945990703626338, + "grad_norm": 4.714271068572998, + "learning_rate": 1.2306537420281316e-08, + "loss": 1.7014, + "step": 13213 + }, + { + "epoch": 0.994674344643294, + "grad_norm": 4.957435607910156, + "learning_rate": 1.2037560981381646e-08, + "loss": 1.821, + "step": 13214 + }, + { + "epoch": 0.9947496189239542, + "grad_norm": 6.1166090965271, + "learning_rate": 1.177155618342085e-08, + "loss": 1.7948, + "step": 13215 + }, + { + "epoch": 0.9948248932046143, + "grad_norm": 5.278020858764648, + "learning_rate": 1.1508523042214058e-08, + "loss": 1.8824, + "step": 13216 + }, + { + "epoch": 0.9949001674852744, + "grad_norm": 5.908717632293701, + "learning_rate": 1.1248461573382108e-08, + "loss": 1.3429, + "step": 13217 + }, + { + "epoch": 0.9949754417659347, + "grad_norm": 5.376378536224365, + "learning_rate": 1.0991371792401506e-08, + "loss": 1.9484, + "step": 13218 + }, + { + "epoch": 0.9950507160465948, + "grad_norm": 4.909424304962158, + "learning_rate": 1.0737253714548923e-08, + "loss": 1.9071, + "step": 13219 + }, + { + "epoch": 0.9951259903272549, + "grad_norm": 5.313304901123047, + "learning_rate": 1.0486107354928942e-08, + "loss": 1.7426, + "step": 13220 + }, + { + "epoch": 0.9952012646079151, + "grad_norm": 4.2952775955200195, + "learning_rate": 1.0237932728479616e-08, + "loss": 1.7724, + "step": 13221 + }, + { + "epoch": 0.9952765388885753, + "grad_norm": 4.796300888061523, + "learning_rate": 9.992729849944704e-09, + "loss": 2.1162, + "step": 13222 + }, + { + "epoch": 0.9953518131692354, + "grad_norm": 4.420007228851318, + "learning_rate": 9.750498733901436e-09, + "loss": 1.6545, + "step": 13223 + }, + { + "epoch": 0.9954270874498956, + "grad_norm": 6.675930023193359, + "learning_rate": 9.511239394754955e-09, + "loss": 2.1507, + "step": 13224 + }, + { + "epoch": 0.9955023617305557, + "grad_norm": 4.241837978363037, + "learning_rate": 9.27495184672722e-09, + "loss": 1.5117, + "step": 13225 + }, + { + "epoch": 0.9955776360112158, + "grad_norm": 5.348954677581787, + "learning_rate": 9.04163610386255e-09, + "loss": 1.4654, + "step": 13226 + }, + { + "epoch": 0.995652910291876, + "grad_norm": 4.765196323394775, + "learning_rate": 8.811292180038733e-09, + "loss": 1.742, + "step": 13227 + }, + { + "epoch": 0.9957281845725362, + "grad_norm": 5.047995567321777, + "learning_rate": 8.583920088933717e-09, + "loss": 1.7765, + "step": 13228 + }, + { + "epoch": 0.9958034588531963, + "grad_norm": 4.878895282745361, + "learning_rate": 8.35951984408112e-09, + "loss": 2.0499, + "step": 13229 + }, + { + "epoch": 0.9958787331338564, + "grad_norm": 6.066239833831787, + "learning_rate": 8.138091458809172e-09, + "loss": 2.1483, + "step": 13230 + }, + { + "epoch": 0.9959540074145167, + "grad_norm": 4.200047492980957, + "learning_rate": 7.919634946285115e-09, + "loss": 1.3955, + "step": 13231 + }, + { + "epoch": 0.9960292816951768, + "grad_norm": 5.785281658172607, + "learning_rate": 7.704150319493008e-09, + "loss": 1.9654, + "step": 13232 + }, + { + "epoch": 0.9961045559758369, + "grad_norm": 5.729581832885742, + "learning_rate": 7.491637591250378e-09, + "loss": 1.9358, + "step": 13233 + }, + { + "epoch": 0.9961798302564971, + "grad_norm": 3.953002691268921, + "learning_rate": 7.282096774180458e-09, + "loss": 1.8323, + "step": 13234 + }, + { + "epoch": 0.9962551045371573, + "grad_norm": 4.2614874839782715, + "learning_rate": 7.075527880751054e-09, + "loss": 1.5823, + "step": 13235 + }, + { + "epoch": 0.9963303788178174, + "grad_norm": 6.689180374145508, + "learning_rate": 6.87193092323013e-09, + "loss": 2.2737, + "step": 13236 + }, + { + "epoch": 0.9964056530984776, + "grad_norm": 4.424046516418457, + "learning_rate": 6.671305913724668e-09, + "loss": 1.4145, + "step": 13237 + }, + { + "epoch": 0.9964809273791377, + "grad_norm": 6.797179698944092, + "learning_rate": 6.473652864169566e-09, + "loss": 1.6863, + "step": 13238 + }, + { + "epoch": 0.9965562016597979, + "grad_norm": 5.1269073486328125, + "learning_rate": 6.278971786305432e-09, + "loss": 1.5198, + "step": 13239 + }, + { + "epoch": 0.9966314759404581, + "grad_norm": 4.8358235359191895, + "learning_rate": 6.087262691706341e-09, + "loss": 1.8027, + "step": 13240 + }, + { + "epoch": 0.9967067502211182, + "grad_norm": 7.849664688110352, + "learning_rate": 5.898525591774284e-09, + "loss": 2.0837, + "step": 13241 + }, + { + "epoch": 0.9967820245017783, + "grad_norm": 5.3083271980285645, + "learning_rate": 5.712760497722514e-09, + "loss": 2.1192, + "step": 13242 + }, + { + "epoch": 0.9968572987824386, + "grad_norm": 3.818995714187622, + "learning_rate": 5.529967420597748e-09, + "loss": 2.0152, + "step": 13243 + }, + { + "epoch": 0.9969325730630987, + "grad_norm": 5.439818382263184, + "learning_rate": 5.350146371269071e-09, + "loss": 1.7853, + "step": 13244 + }, + { + "epoch": 0.9970078473437588, + "grad_norm": 5.298344135284424, + "learning_rate": 5.17329736042238e-09, + "loss": 2.2856, + "step": 13245 + }, + { + "epoch": 0.9970831216244189, + "grad_norm": 5.614660263061523, + "learning_rate": 4.999420398571486e-09, + "loss": 1.7458, + "step": 13246 + }, + { + "epoch": 0.9971583959050792, + "grad_norm": 6.490902423858643, + "learning_rate": 4.828515496052566e-09, + "loss": 1.7767, + "step": 13247 + }, + { + "epoch": 0.9972336701857393, + "grad_norm": 5.269622325897217, + "learning_rate": 4.66058266302416e-09, + "loss": 1.6769, + "step": 13248 + }, + { + "epoch": 0.9973089444663994, + "grad_norm": 6.375268459320068, + "learning_rate": 4.495621909478276e-09, + "loss": 2.1209, + "step": 13249 + }, + { + "epoch": 0.9973842187470596, + "grad_norm": 4.250128746032715, + "learning_rate": 4.3336332452070805e-09, + "loss": 1.8069, + "step": 13250 + }, + { + "epoch": 0.9974594930277197, + "grad_norm": 5.310784339904785, + "learning_rate": 4.174616679852861e-09, + "loss": 1.7188, + "step": 13251 + }, + { + "epoch": 0.9975347673083799, + "grad_norm": 4.131718158721924, + "learning_rate": 4.018572222858064e-09, + "loss": 1.585, + "step": 13252 + }, + { + "epoch": 0.9976100415890401, + "grad_norm": 5.407041549682617, + "learning_rate": 3.865499883509704e-09, + "loss": 1.4805, + "step": 13253 + }, + { + "epoch": 0.9976853158697002, + "grad_norm": 4.429222583770752, + "learning_rate": 3.71539967090051e-09, + "loss": 1.8644, + "step": 13254 + }, + { + "epoch": 0.9977605901503603, + "grad_norm": 4.958990097045898, + "learning_rate": 3.5682715939566737e-09, + "loss": 1.7753, + "step": 13255 + }, + { + "epoch": 0.9978358644310206, + "grad_norm": 4.494583606719971, + "learning_rate": 3.424115661421201e-09, + "loss": 1.56, + "step": 13256 + }, + { + "epoch": 0.9979111387116807, + "grad_norm": 5.914072036743164, + "learning_rate": 3.2829318818650144e-09, + "loss": 1.9059, + "step": 13257 + }, + { + "epoch": 0.9979864129923408, + "grad_norm": 4.745224475860596, + "learning_rate": 3.144720263686951e-09, + "loss": 1.6141, + "step": 13258 + }, + { + "epoch": 0.998061687273001, + "grad_norm": 13.653491973876953, + "learning_rate": 3.009480815091559e-09, + "loss": 1.898, + "step": 13259 + }, + { + "epoch": 0.9981369615536612, + "grad_norm": 4.860389709472656, + "learning_rate": 2.8772135441279546e-09, + "loss": 1.4189, + "step": 13260 + }, + { + "epoch": 0.9982122358343213, + "grad_norm": 4.856529235839844, + "learning_rate": 2.7479184586620688e-09, + "loss": 1.6266, + "step": 13261 + }, + { + "epoch": 0.9982875101149815, + "grad_norm": 4.535051345825195, + "learning_rate": 2.621595566365542e-09, + "loss": 1.6652, + "step": 13262 + }, + { + "epoch": 0.9983627843956416, + "grad_norm": 4.056579113006592, + "learning_rate": 2.498244874760136e-09, + "loss": 1.8611, + "step": 13263 + }, + { + "epoch": 0.9984380586763018, + "grad_norm": 9.294820785522461, + "learning_rate": 2.3778663911733222e-09, + "loss": 2.0419, + "step": 13264 + }, + { + "epoch": 0.998513332956962, + "grad_norm": 5.269387245178223, + "learning_rate": 2.2604601227660394e-09, + "loss": 1.7935, + "step": 13265 + }, + { + "epoch": 0.9985886072376221, + "grad_norm": 4.996938705444336, + "learning_rate": 2.146026076510488e-09, + "loss": 1.5363, + "step": 13266 + }, + { + "epoch": 0.9986638815182822, + "grad_norm": 7.4594526290893555, + "learning_rate": 2.034564259217886e-09, + "loss": 1.7581, + "step": 13267 + }, + { + "epoch": 0.9987391557989423, + "grad_norm": 7.098395347595215, + "learning_rate": 1.926074677510714e-09, + "loss": 1.9823, + "step": 13268 + }, + { + "epoch": 0.9988144300796026, + "grad_norm": 4.721424102783203, + "learning_rate": 1.820557337833817e-09, + "loss": 1.4479, + "step": 13269 + }, + { + "epoch": 0.9988897043602627, + "grad_norm": 4.402496337890625, + "learning_rate": 1.7180122464655057e-09, + "loss": 1.9812, + "step": 13270 + }, + { + "epoch": 0.9989649786409228, + "grad_norm": 5.043260097503662, + "learning_rate": 1.6184394095009048e-09, + "loss": 2.0613, + "step": 13271 + }, + { + "epoch": 0.999040252921583, + "grad_norm": 4.3816704750061035, + "learning_rate": 1.521838832863054e-09, + "loss": 1.8314, + "step": 13272 + }, + { + "epoch": 0.9991155272022432, + "grad_norm": 4.691875457763672, + "learning_rate": 1.4282105222862553e-09, + "loss": 1.829, + "step": 13273 + }, + { + "epoch": 0.9991908014829033, + "grad_norm": 4.684067726135254, + "learning_rate": 1.3375544833382769e-09, + "loss": 2.0828, + "step": 13274 + }, + { + "epoch": 0.9992660757635635, + "grad_norm": 5.453580856323242, + "learning_rate": 1.249870721409252e-09, + "loss": 2.0721, + "step": 13275 + }, + { + "epoch": 0.9993413500442236, + "grad_norm": 3.6280970573425293, + "learning_rate": 1.1651592417172285e-09, + "loss": 1.7303, + "step": 13276 + }, + { + "epoch": 0.9994166243248838, + "grad_norm": 5.092569828033447, + "learning_rate": 1.0834200492915169e-09, + "loss": 1.7451, + "step": 13277 + }, + { + "epoch": 0.999491898605544, + "grad_norm": 4.0405778884887695, + "learning_rate": 1.004653148994894e-09, + "loss": 1.7597, + "step": 13278 + }, + { + "epoch": 0.9995671728862041, + "grad_norm": 4.947397708892822, + "learning_rate": 9.288585455069498e-10, + "loss": 1.7801, + "step": 13279 + }, + { + "epoch": 0.9996424471668642, + "grad_norm": 6.2149882316589355, + "learning_rate": 8.560362433351898e-10, + "loss": 1.7064, + "step": 13280 + }, + { + "epoch": 0.9997177214475245, + "grad_norm": 4.5937628746032715, + "learning_rate": 7.861862468094838e-10, + "loss": 2.1554, + "step": 13281 + }, + { + "epoch": 0.9997929957281846, + "grad_norm": 4.7368388175964355, + "learning_rate": 7.193085600820659e-10, + "loss": 1.733, + "step": 13282 + }, + { + "epoch": 0.9998682700088447, + "grad_norm": 5.895082473754883, + "learning_rate": 6.554031871275345e-10, + "loss": 1.9067, + "step": 13283 + }, + { + "epoch": 0.9999435442895049, + "grad_norm": 4.6465888023376465, + "learning_rate": 5.944701317428525e-10, + "loss": 1.9208, + "step": 13284 + }, + { + "epoch": 0.9999435442895049, + "step": 13284, + "total_flos": 8.115413063038927e+18, + "train_loss": 2.0213059956806316, + "train_runtime": 88296.7738, + "train_samples_per_second": 2.407, + "train_steps_per_second": 0.15 + } + ], + "logging_steps": 1.0, + "max_steps": 13284, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.115413063038927e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}