{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004266211604095563, "grad_norm": 182.41513061523438, "learning_rate": 0.0, "loss": 7.0137, "step": 1 }, { "epoch": 0.0008532423208191126, "grad_norm": 168.21009826660156, "learning_rate": 8.474576271186442e-08, "loss": 6.4453, "step": 2 }, { "epoch": 0.001279863481228669, "grad_norm": 191.7275848388672, "learning_rate": 1.6949152542372883e-07, "loss": 6.5684, "step": 3 }, { "epoch": 0.0017064846416382253, "grad_norm": 188.32925415039062, "learning_rate": 2.5423728813559323e-07, "loss": 6.9531, "step": 4 }, { "epoch": 0.0021331058020477816, "grad_norm": 165.11737060546875, "learning_rate": 3.3898305084745766e-07, "loss": 5.9609, "step": 5 }, { "epoch": 0.002559726962457338, "grad_norm": 103.73497009277344, "learning_rate": 4.2372881355932204e-07, "loss": 5.7617, "step": 6 }, { "epoch": 0.0029863481228668944, "grad_norm": 139.20375061035156, "learning_rate": 5.084745762711865e-07, "loss": 5.5273, "step": 7 }, { "epoch": 0.0034129692832764505, "grad_norm": 96.38396453857422, "learning_rate": 5.93220338983051e-07, "loss": 5.5898, "step": 8 }, { "epoch": 0.0038395904436860067, "grad_norm": 100.02017211914062, "learning_rate": 6.779661016949153e-07, "loss": 5.0391, "step": 9 }, { "epoch": 0.004266211604095563, "grad_norm": 131.98098754882812, "learning_rate": 7.627118644067798e-07, "loss": 5.375, "step": 10 }, { "epoch": 0.00469283276450512, "grad_norm": 155.806396484375, "learning_rate": 8.474576271186441e-07, "loss": 5.7734, "step": 11 }, { "epoch": 0.005119453924914676, "grad_norm": 151.8001251220703, "learning_rate": 9.322033898305086e-07, "loss": 6.3633, "step": 12 }, { "epoch": 0.005546075085324232, "grad_norm": 148.3273468017578, "learning_rate": 1.016949152542373e-06, "loss": 6.2266, "step": 13 }, { "epoch": 0.005972696245733789, "grad_norm": 178.5255126953125, "learning_rate": 1.1016949152542374e-06, "loss": 6.4297, "step": 14 }, { "epoch": 0.0063993174061433445, "grad_norm": 167.9723663330078, "learning_rate": 1.186440677966102e-06, "loss": 6.6602, "step": 15 }, { "epoch": 0.006825938566552901, "grad_norm": 132.07110595703125, "learning_rate": 1.2711864406779662e-06, "loss": 6.0898, "step": 16 }, { "epoch": 0.007252559726962458, "grad_norm": 85.35553741455078, "learning_rate": 1.3559322033898307e-06, "loss": 5.9453, "step": 17 }, { "epoch": 0.007679180887372013, "grad_norm": 59.41227340698242, "learning_rate": 1.4406779661016951e-06, "loss": 5.4102, "step": 18 }, { "epoch": 0.008105802047781569, "grad_norm": 126.45568084716797, "learning_rate": 1.5254237288135596e-06, "loss": 5.4043, "step": 19 }, { "epoch": 0.008532423208191127, "grad_norm": 48.57197570800781, "learning_rate": 1.6101694915254237e-06, "loss": 5.7871, "step": 20 }, { "epoch": 0.008959044368600682, "grad_norm": 102.81019592285156, "learning_rate": 1.6949152542372882e-06, "loss": 5.459, "step": 21 }, { "epoch": 0.00938566552901024, "grad_norm": 130.22621154785156, "learning_rate": 1.7796610169491526e-06, "loss": 5.207, "step": 22 }, { "epoch": 0.009812286689419795, "grad_norm": 159.87655639648438, "learning_rate": 1.8644067796610171e-06, "loss": 5.0664, "step": 23 }, { "epoch": 0.010238907849829351, "grad_norm": 136.3029327392578, "learning_rate": 1.9491525423728816e-06, "loss": 5.4824, "step": 24 }, { "epoch": 0.010665529010238909, "grad_norm": 92.69465637207031, "learning_rate": 2.033898305084746e-06, "loss": 5.918, "step": 25 }, { "epoch": 0.011092150170648464, "grad_norm": 154.13182067871094, "learning_rate": 2.11864406779661e-06, "loss": 5.3008, "step": 26 }, { "epoch": 0.01151877133105802, "grad_norm": 107.52462005615234, "learning_rate": 2.203389830508475e-06, "loss": 5.2422, "step": 27 }, { "epoch": 0.011945392491467578, "grad_norm": 103.60791778564453, "learning_rate": 2.288135593220339e-06, "loss": 4.7988, "step": 28 }, { "epoch": 0.012372013651877133, "grad_norm": 128.28968811035156, "learning_rate": 2.372881355932204e-06, "loss": 5.123, "step": 29 }, { "epoch": 0.012798634812286689, "grad_norm": 28.77732276916504, "learning_rate": 2.457627118644068e-06, "loss": 5.0469, "step": 30 }, { "epoch": 0.013225255972696246, "grad_norm": 12.902440071105957, "learning_rate": 2.5423728813559323e-06, "loss": 4.8359, "step": 31 }, { "epoch": 0.013651877133105802, "grad_norm": 76.21299743652344, "learning_rate": 2.627118644067797e-06, "loss": 4.418, "step": 32 }, { "epoch": 0.014078498293515358, "grad_norm": 25.660552978515625, "learning_rate": 2.7118644067796613e-06, "loss": 4.6562, "step": 33 }, { "epoch": 0.014505119453924915, "grad_norm": 12.741079330444336, "learning_rate": 2.7966101694915256e-06, "loss": 4.6113, "step": 34 }, { "epoch": 0.014931740614334471, "grad_norm": 62.22819137573242, "learning_rate": 2.8813559322033903e-06, "loss": 4.5723, "step": 35 }, { "epoch": 0.015358361774744027, "grad_norm": 18.65302276611328, "learning_rate": 2.9661016949152545e-06, "loss": 4.4434, "step": 36 }, { "epoch": 0.015784982935153583, "grad_norm": 33.47079849243164, "learning_rate": 3.0508474576271192e-06, "loss": 4.6621, "step": 37 }, { "epoch": 0.016211604095563138, "grad_norm": 65.28570556640625, "learning_rate": 3.135593220338983e-06, "loss": 4.4492, "step": 38 }, { "epoch": 0.016638225255972697, "grad_norm": 18.592966079711914, "learning_rate": 3.2203389830508473e-06, "loss": 4.2324, "step": 39 }, { "epoch": 0.017064846416382253, "grad_norm": 44.71397018432617, "learning_rate": 3.305084745762712e-06, "loss": 4.2344, "step": 40 }, { "epoch": 0.01749146757679181, "grad_norm": 28.17648696899414, "learning_rate": 3.3898305084745763e-06, "loss": 4.127, "step": 41 }, { "epoch": 0.017918088737201365, "grad_norm": 19.224647521972656, "learning_rate": 3.474576271186441e-06, "loss": 4.2148, "step": 42 }, { "epoch": 0.01834470989761092, "grad_norm": 22.937255859375, "learning_rate": 3.5593220338983053e-06, "loss": 4.3301, "step": 43 }, { "epoch": 0.01877133105802048, "grad_norm": 29.138164520263672, "learning_rate": 3.6440677966101695e-06, "loss": 4.3203, "step": 44 }, { "epoch": 0.019197952218430035, "grad_norm": 51.77314758300781, "learning_rate": 3.7288135593220342e-06, "loss": 3.8086, "step": 45 }, { "epoch": 0.01962457337883959, "grad_norm": 11.840182304382324, "learning_rate": 3.8135593220338985e-06, "loss": 3.9746, "step": 46 }, { "epoch": 0.020051194539249147, "grad_norm": 47.615848541259766, "learning_rate": 3.898305084745763e-06, "loss": 4.2988, "step": 47 }, { "epoch": 0.020477815699658702, "grad_norm": 42.8083381652832, "learning_rate": 3.9830508474576275e-06, "loss": 3.7168, "step": 48 }, { "epoch": 0.020904436860068258, "grad_norm": 40.28496551513672, "learning_rate": 4.067796610169492e-06, "loss": 4.0176, "step": 49 }, { "epoch": 0.021331058020477817, "grad_norm": 13.85673713684082, "learning_rate": 4.152542372881356e-06, "loss": 3.8164, "step": 50 }, { "epoch": 0.021757679180887373, "grad_norm": 14.762373924255371, "learning_rate": 4.23728813559322e-06, "loss": 4.1719, "step": 51 }, { "epoch": 0.02218430034129693, "grad_norm": 27.187488555908203, "learning_rate": 4.322033898305085e-06, "loss": 4.416, "step": 52 }, { "epoch": 0.022610921501706484, "grad_norm": 38.772220611572266, "learning_rate": 4.40677966101695e-06, "loss": 3.9219, "step": 53 }, { "epoch": 0.02303754266211604, "grad_norm": 59.488548278808594, "learning_rate": 4.491525423728814e-06, "loss": 3.4355, "step": 54 }, { "epoch": 0.023464163822525596, "grad_norm": 59.54182434082031, "learning_rate": 4.576271186440678e-06, "loss": 3.7969, "step": 55 }, { "epoch": 0.023890784982935155, "grad_norm": 28.035480499267578, "learning_rate": 4.6610169491525425e-06, "loss": 3.75, "step": 56 }, { "epoch": 0.02431740614334471, "grad_norm": 61.01053237915039, "learning_rate": 4.745762711864408e-06, "loss": 4.3125, "step": 57 }, { "epoch": 0.024744027303754267, "grad_norm": 48.68558883666992, "learning_rate": 4.830508474576272e-06, "loss": 4.2266, "step": 58 }, { "epoch": 0.025170648464163822, "grad_norm": 26.09473419189453, "learning_rate": 4.915254237288136e-06, "loss": 3.9473, "step": 59 }, { "epoch": 0.025597269624573378, "grad_norm": 31.998085021972656, "learning_rate": 5e-06, "loss": 3.5244, "step": 60 }, { "epoch": 0.026023890784982934, "grad_norm": 64.63945007324219, "learning_rate": 5.084745762711865e-06, "loss": 3.9707, "step": 61 }, { "epoch": 0.026450511945392493, "grad_norm": 61.44773864746094, "learning_rate": 5.169491525423729e-06, "loss": 3.7266, "step": 62 }, { "epoch": 0.02687713310580205, "grad_norm": 96.3592529296875, "learning_rate": 5.254237288135594e-06, "loss": 4.2656, "step": 63 }, { "epoch": 0.027303754266211604, "grad_norm": 88.27396392822266, "learning_rate": 5.338983050847458e-06, "loss": 3.9473, "step": 64 }, { "epoch": 0.02773037542662116, "grad_norm": 49.265892028808594, "learning_rate": 5.423728813559323e-06, "loss": 3.6973, "step": 65 }, { "epoch": 0.028156996587030716, "grad_norm": 20.13355255126953, "learning_rate": 5.508474576271187e-06, "loss": 3.9805, "step": 66 }, { "epoch": 0.02858361774744027, "grad_norm": 21.27371597290039, "learning_rate": 5.593220338983051e-06, "loss": 3.6777, "step": 67 }, { "epoch": 0.02901023890784983, "grad_norm": 18.588361740112305, "learning_rate": 5.677966101694916e-06, "loss": 3.6465, "step": 68 }, { "epoch": 0.029436860068259386, "grad_norm": 46.04341506958008, "learning_rate": 5.7627118644067805e-06, "loss": 3.627, "step": 69 }, { "epoch": 0.029863481228668942, "grad_norm": 89.78927612304688, "learning_rate": 5.847457627118645e-06, "loss": 3.7227, "step": 70 }, { "epoch": 0.030290102389078498, "grad_norm": 85.39374542236328, "learning_rate": 5.932203389830509e-06, "loss": 3.4609, "step": 71 }, { "epoch": 0.030716723549488054, "grad_norm": 100.72692108154297, "learning_rate": 6.0169491525423725e-06, "loss": 3.6016, "step": 72 }, { "epoch": 0.03114334470989761, "grad_norm": 23.946613311767578, "learning_rate": 6.1016949152542385e-06, "loss": 4.0117, "step": 73 }, { "epoch": 0.031569965870307165, "grad_norm": 63.32298278808594, "learning_rate": 6.186440677966103e-06, "loss": 3.9414, "step": 74 }, { "epoch": 0.03199658703071672, "grad_norm": 63.94538116455078, "learning_rate": 6.271186440677966e-06, "loss": 4.1016, "step": 75 }, { "epoch": 0.032423208191126277, "grad_norm": 85.71823120117188, "learning_rate": 6.3559322033898304e-06, "loss": 4.1504, "step": 76 }, { "epoch": 0.03284982935153584, "grad_norm": 43.66586685180664, "learning_rate": 6.440677966101695e-06, "loss": 3.5293, "step": 77 }, { "epoch": 0.033276450511945395, "grad_norm": 31.57004165649414, "learning_rate": 6.52542372881356e-06, "loss": 3.9141, "step": 78 }, { "epoch": 0.03370307167235495, "grad_norm": 17.844924926757812, "learning_rate": 6.610169491525424e-06, "loss": 3.5469, "step": 79 }, { "epoch": 0.034129692832764506, "grad_norm": 60.84952163696289, "learning_rate": 6.694915254237288e-06, "loss": 3.6504, "step": 80 }, { "epoch": 0.03455631399317406, "grad_norm": 54.73509979248047, "learning_rate": 6.779661016949153e-06, "loss": 3.5234, "step": 81 }, { "epoch": 0.03498293515358362, "grad_norm": 15.640512466430664, "learning_rate": 6.864406779661017e-06, "loss": 3.3867, "step": 82 }, { "epoch": 0.035409556313993173, "grad_norm": 25.58538246154785, "learning_rate": 6.949152542372882e-06, "loss": 3.5215, "step": 83 }, { "epoch": 0.03583617747440273, "grad_norm": 47.705474853515625, "learning_rate": 7.033898305084746e-06, "loss": 3.3828, "step": 84 }, { "epoch": 0.036262798634812285, "grad_norm": 19.92372703552246, "learning_rate": 7.1186440677966106e-06, "loss": 3.959, "step": 85 }, { "epoch": 0.03668941979522184, "grad_norm": 58.42499542236328, "learning_rate": 7.203389830508475e-06, "loss": 3.6934, "step": 86 }, { "epoch": 0.037116040955631396, "grad_norm": 16.288785934448242, "learning_rate": 7.288135593220339e-06, "loss": 3.0273, "step": 87 }, { "epoch": 0.03754266211604096, "grad_norm": 38.42738342285156, "learning_rate": 7.372881355932204e-06, "loss": 3.0938, "step": 88 }, { "epoch": 0.037969283276450515, "grad_norm": 60.21091842651367, "learning_rate": 7.4576271186440685e-06, "loss": 3.3691, "step": 89 }, { "epoch": 0.03839590443686007, "grad_norm": 30.50881004333496, "learning_rate": 7.542372881355933e-06, "loss": 3.4746, "step": 90 }, { "epoch": 0.038822525597269626, "grad_norm": 21.170913696289062, "learning_rate": 7.627118644067797e-06, "loss": 3.25, "step": 91 }, { "epoch": 0.03924914675767918, "grad_norm": 22.86617088317871, "learning_rate": 7.711864406779663e-06, "loss": 3.1152, "step": 92 }, { "epoch": 0.03967576791808874, "grad_norm": 73.49543762207031, "learning_rate": 7.796610169491526e-06, "loss": 3.1328, "step": 93 }, { "epoch": 0.04010238907849829, "grad_norm": 40.927406311035156, "learning_rate": 7.88135593220339e-06, "loss": 3.0215, "step": 94 }, { "epoch": 0.04052901023890785, "grad_norm": 51.40994644165039, "learning_rate": 7.966101694915255e-06, "loss": 3.8906, "step": 95 }, { "epoch": 0.040955631399317405, "grad_norm": 59.001495361328125, "learning_rate": 8.050847457627118e-06, "loss": 3.9209, "step": 96 }, { "epoch": 0.04138225255972696, "grad_norm": 26.802316665649414, "learning_rate": 8.135593220338983e-06, "loss": 3.5762, "step": 97 }, { "epoch": 0.041808873720136516, "grad_norm": 32.98997116088867, "learning_rate": 8.220338983050849e-06, "loss": 3.7031, "step": 98 }, { "epoch": 0.04223549488054607, "grad_norm": 23.042098999023438, "learning_rate": 8.305084745762712e-06, "loss": 4.0488, "step": 99 }, { "epoch": 0.042662116040955635, "grad_norm": 66.93525695800781, "learning_rate": 8.389830508474577e-06, "loss": 3.8965, "step": 100 }, { "epoch": 0.04308873720136519, "grad_norm": 39.358070373535156, "learning_rate": 8.47457627118644e-06, "loss": 3.2852, "step": 101 }, { "epoch": 0.043515358361774746, "grad_norm": 14.342375755310059, "learning_rate": 8.559322033898306e-06, "loss": 3.3594, "step": 102 }, { "epoch": 0.0439419795221843, "grad_norm": 36.743690490722656, "learning_rate": 8.64406779661017e-06, "loss": 3.3359, "step": 103 }, { "epoch": 0.04436860068259386, "grad_norm": 32.253173828125, "learning_rate": 8.728813559322034e-06, "loss": 3.3418, "step": 104 }, { "epoch": 0.04479522184300341, "grad_norm": 34.88021469116211, "learning_rate": 8.8135593220339e-06, "loss": 3.623, "step": 105 }, { "epoch": 0.04522184300341297, "grad_norm": 62.94032287597656, "learning_rate": 8.898305084745763e-06, "loss": 3.6289, "step": 106 }, { "epoch": 0.045648464163822525, "grad_norm": 37.47450637817383, "learning_rate": 8.983050847457628e-06, "loss": 2.9238, "step": 107 }, { "epoch": 0.04607508532423208, "grad_norm": 21.631309509277344, "learning_rate": 9.067796610169493e-06, "loss": 3.4141, "step": 108 }, { "epoch": 0.046501706484641636, "grad_norm": 42.39487075805664, "learning_rate": 9.152542372881356e-06, "loss": 4.3613, "step": 109 }, { "epoch": 0.04692832764505119, "grad_norm": 29.449064254760742, "learning_rate": 9.237288135593222e-06, "loss": 3.7773, "step": 110 }, { "epoch": 0.04735494880546075, "grad_norm": 35.71257019042969, "learning_rate": 9.322033898305085e-06, "loss": 3.0, "step": 111 }, { "epoch": 0.04778156996587031, "grad_norm": 45.36302947998047, "learning_rate": 9.40677966101695e-06, "loss": 3.7363, "step": 112 }, { "epoch": 0.048208191126279866, "grad_norm": 21.860015869140625, "learning_rate": 9.491525423728815e-06, "loss": 3.6289, "step": 113 }, { "epoch": 0.04863481228668942, "grad_norm": 46.48929977416992, "learning_rate": 9.576271186440679e-06, "loss": 3.3066, "step": 114 }, { "epoch": 0.04906143344709898, "grad_norm": 16.482284545898438, "learning_rate": 9.661016949152544e-06, "loss": 3.1934, "step": 115 }, { "epoch": 0.04948805460750853, "grad_norm": 29.34090232849121, "learning_rate": 9.745762711864407e-06, "loss": 3.7871, "step": 116 }, { "epoch": 0.04991467576791809, "grad_norm": 26.02210235595703, "learning_rate": 9.830508474576272e-06, "loss": 3.2969, "step": 117 }, { "epoch": 0.050341296928327645, "grad_norm": 33.3517951965332, "learning_rate": 9.915254237288137e-06, "loss": 3.1006, "step": 118 }, { "epoch": 0.0507679180887372, "grad_norm": 40.15692138671875, "learning_rate": 1e-05, "loss": 3.6816, "step": 119 }, { "epoch": 0.051194539249146756, "grad_norm": 31.148941040039062, "learning_rate": 9.995507637017073e-06, "loss": 3.6113, "step": 120 }, { "epoch": 0.05162116040955631, "grad_norm": 44.07515335083008, "learning_rate": 9.991015274034143e-06, "loss": 3.5859, "step": 121 }, { "epoch": 0.05204778156996587, "grad_norm": 73.39000701904297, "learning_rate": 9.986522911051215e-06, "loss": 3.2852, "step": 122 }, { "epoch": 0.05247440273037542, "grad_norm": 47.39154815673828, "learning_rate": 9.982030548068285e-06, "loss": 2.7686, "step": 123 }, { "epoch": 0.052901023890784986, "grad_norm": 18.518726348876953, "learning_rate": 9.977538185085355e-06, "loss": 3.5391, "step": 124 }, { "epoch": 0.05332764505119454, "grad_norm": 19.222064971923828, "learning_rate": 9.973045822102425e-06, "loss": 3.8418, "step": 125 }, { "epoch": 0.0537542662116041, "grad_norm": 93.03326416015625, "learning_rate": 9.968553459119497e-06, "loss": 3.4277, "step": 126 }, { "epoch": 0.05418088737201365, "grad_norm": 25.00471305847168, "learning_rate": 9.96406109613657e-06, "loss": 3.4277, "step": 127 }, { "epoch": 0.05460750853242321, "grad_norm": 20.304758071899414, "learning_rate": 9.95956873315364e-06, "loss": 3.5527, "step": 128 }, { "epoch": 0.055034129692832764, "grad_norm": 49.463077545166016, "learning_rate": 9.955076370170711e-06, "loss": 3.3809, "step": 129 }, { "epoch": 0.05546075085324232, "grad_norm": 19.390296936035156, "learning_rate": 9.950584007187781e-06, "loss": 3.7402, "step": 130 }, { "epoch": 0.055887372013651876, "grad_norm": 70.4116439819336, "learning_rate": 9.946091644204853e-06, "loss": 3.6895, "step": 131 }, { "epoch": 0.05631399317406143, "grad_norm": 32.00130844116211, "learning_rate": 9.941599281221924e-06, "loss": 2.8701, "step": 132 }, { "epoch": 0.05674061433447099, "grad_norm": 15.688907623291016, "learning_rate": 9.937106918238994e-06, "loss": 3.5215, "step": 133 }, { "epoch": 0.05716723549488054, "grad_norm": 28.683942794799805, "learning_rate": 9.932614555256066e-06, "loss": 3.791, "step": 134 }, { "epoch": 0.057593856655290106, "grad_norm": 20.80558204650879, "learning_rate": 9.928122192273136e-06, "loss": 3.0703, "step": 135 }, { "epoch": 0.05802047781569966, "grad_norm": 59.96001434326172, "learning_rate": 9.923629829290208e-06, "loss": 3.6309, "step": 136 }, { "epoch": 0.05844709897610922, "grad_norm": 62.419368743896484, "learning_rate": 9.919137466307278e-06, "loss": 3.5293, "step": 137 }, { "epoch": 0.05887372013651877, "grad_norm": 65.35993957519531, "learning_rate": 9.91464510332435e-06, "loss": 3.4355, "step": 138 }, { "epoch": 0.05930034129692833, "grad_norm": 47.3123779296875, "learning_rate": 9.91015274034142e-06, "loss": 3.3809, "step": 139 }, { "epoch": 0.059726962457337884, "grad_norm": 47.363914489746094, "learning_rate": 9.905660377358492e-06, "loss": 3.998, "step": 140 }, { "epoch": 0.06015358361774744, "grad_norm": 37.10547637939453, "learning_rate": 9.901168014375562e-06, "loss": 3.0352, "step": 141 }, { "epoch": 0.060580204778156996, "grad_norm": 46.55497360229492, "learning_rate": 9.896675651392634e-06, "loss": 3.0293, "step": 142 }, { "epoch": 0.06100682593856655, "grad_norm": 25.803016662597656, "learning_rate": 9.892183288409704e-06, "loss": 3.3047, "step": 143 }, { "epoch": 0.06143344709897611, "grad_norm": 33.59292984008789, "learning_rate": 9.887690925426774e-06, "loss": 3.6113, "step": 144 }, { "epoch": 0.06186006825938566, "grad_norm": 36.967140197753906, "learning_rate": 9.883198562443846e-06, "loss": 2.8086, "step": 145 }, { "epoch": 0.06228668941979522, "grad_norm": 80.75099182128906, "learning_rate": 9.878706199460916e-06, "loss": 3.4902, "step": 146 }, { "epoch": 0.06271331058020478, "grad_norm": 60.304359436035156, "learning_rate": 9.874213836477988e-06, "loss": 3.2695, "step": 147 }, { "epoch": 0.06313993174061433, "grad_norm": 73.07403564453125, "learning_rate": 9.86972147349506e-06, "loss": 3.5273, "step": 148 }, { "epoch": 0.06356655290102389, "grad_norm": 24.91359519958496, "learning_rate": 9.86522911051213e-06, "loss": 3.4062, "step": 149 }, { "epoch": 0.06399317406143344, "grad_norm": 41.76774978637695, "learning_rate": 9.860736747529202e-06, "loss": 3.4316, "step": 150 }, { "epoch": 0.064419795221843, "grad_norm": 39.29990005493164, "learning_rate": 9.856244384546273e-06, "loss": 3.3379, "step": 151 }, { "epoch": 0.06484641638225255, "grad_norm": 66.60665130615234, "learning_rate": 9.851752021563343e-06, "loss": 3.5, "step": 152 }, { "epoch": 0.06527303754266212, "grad_norm": 32.428009033203125, "learning_rate": 9.847259658580413e-06, "loss": 3.1816, "step": 153 }, { "epoch": 0.06569965870307168, "grad_norm": 20.9417781829834, "learning_rate": 9.842767295597485e-06, "loss": 3.3105, "step": 154 }, { "epoch": 0.06612627986348123, "grad_norm": 59.17805862426758, "learning_rate": 9.838274932614557e-06, "loss": 3.1074, "step": 155 }, { "epoch": 0.06655290102389079, "grad_norm": 58.085693359375, "learning_rate": 9.833782569631627e-06, "loss": 3.1348, "step": 156 }, { "epoch": 0.06697952218430034, "grad_norm": 26.633859634399414, "learning_rate": 9.829290206648699e-06, "loss": 3.3379, "step": 157 }, { "epoch": 0.0674061433447099, "grad_norm": 27.633708953857422, "learning_rate": 9.824797843665769e-06, "loss": 3.7852, "step": 158 }, { "epoch": 0.06783276450511945, "grad_norm": 103.55244445800781, "learning_rate": 9.820305480682841e-06, "loss": 3.1504, "step": 159 }, { "epoch": 0.06825938566552901, "grad_norm": 118.57666778564453, "learning_rate": 9.815813117699911e-06, "loss": 3.5752, "step": 160 }, { "epoch": 0.06868600682593856, "grad_norm": 46.34486389160156, "learning_rate": 9.811320754716981e-06, "loss": 3.4902, "step": 161 }, { "epoch": 0.06911262798634812, "grad_norm": 74.79405975341797, "learning_rate": 9.806828391734053e-06, "loss": 2.9424, "step": 162 }, { "epoch": 0.06953924914675767, "grad_norm": 57.469566345214844, "learning_rate": 9.802336028751123e-06, "loss": 3.4219, "step": 163 }, { "epoch": 0.06996587030716724, "grad_norm": 20.774744033813477, "learning_rate": 9.797843665768195e-06, "loss": 3.7871, "step": 164 }, { "epoch": 0.0703924914675768, "grad_norm": 58.800113677978516, "learning_rate": 9.793351302785265e-06, "loss": 3.4141, "step": 165 }, { "epoch": 0.07081911262798635, "grad_norm": 65.20160675048828, "learning_rate": 9.788858939802337e-06, "loss": 3.3359, "step": 166 }, { "epoch": 0.07124573378839591, "grad_norm": 71.90331268310547, "learning_rate": 9.784366576819408e-06, "loss": 3.2422, "step": 167 }, { "epoch": 0.07167235494880546, "grad_norm": 69.28284454345703, "learning_rate": 9.77987421383648e-06, "loss": 3.6152, "step": 168 }, { "epoch": 0.07209897610921502, "grad_norm": 34.195621490478516, "learning_rate": 9.77538185085355e-06, "loss": 3.7012, "step": 169 }, { "epoch": 0.07252559726962457, "grad_norm": 15.721536636352539, "learning_rate": 9.77088948787062e-06, "loss": 3.5898, "step": 170 }, { "epoch": 0.07295221843003413, "grad_norm": 18.50776481628418, "learning_rate": 9.766397124887692e-06, "loss": 3.3477, "step": 171 }, { "epoch": 0.07337883959044368, "grad_norm": 149.7288055419922, "learning_rate": 9.761904761904762e-06, "loss": 3.7578, "step": 172 }, { "epoch": 0.07380546075085324, "grad_norm": 84.52557373046875, "learning_rate": 9.757412398921834e-06, "loss": 3.4316, "step": 173 }, { "epoch": 0.07423208191126279, "grad_norm": 83.47309112548828, "learning_rate": 9.752920035938904e-06, "loss": 3.3984, "step": 174 }, { "epoch": 0.07465870307167236, "grad_norm": 27.937088012695312, "learning_rate": 9.748427672955976e-06, "loss": 3.7148, "step": 175 }, { "epoch": 0.07508532423208192, "grad_norm": 23.611757278442383, "learning_rate": 9.743935309973048e-06, "loss": 3.208, "step": 176 }, { "epoch": 0.07551194539249147, "grad_norm": 30.196529388427734, "learning_rate": 9.739442946990118e-06, "loss": 3.1592, "step": 177 }, { "epoch": 0.07593856655290103, "grad_norm": 42.56078338623047, "learning_rate": 9.734950584007188e-06, "loss": 3.6641, "step": 178 }, { "epoch": 0.07636518771331058, "grad_norm": 51.98075866699219, "learning_rate": 9.73045822102426e-06, "loss": 3.834, "step": 179 }, { "epoch": 0.07679180887372014, "grad_norm": 54.121307373046875, "learning_rate": 9.72596585804133e-06, "loss": 3.457, "step": 180 }, { "epoch": 0.07721843003412969, "grad_norm": 39.24628829956055, "learning_rate": 9.7214734950584e-06, "loss": 3.2236, "step": 181 }, { "epoch": 0.07764505119453925, "grad_norm": 20.651260375976562, "learning_rate": 9.716981132075472e-06, "loss": 3.8125, "step": 182 }, { "epoch": 0.0780716723549488, "grad_norm": 68.77881622314453, "learning_rate": 9.712488769092544e-06, "loss": 3.3652, "step": 183 }, { "epoch": 0.07849829351535836, "grad_norm": 21.397153854370117, "learning_rate": 9.707996406109614e-06, "loss": 3.3125, "step": 184 }, { "epoch": 0.07892491467576791, "grad_norm": 73.60026550292969, "learning_rate": 9.703504043126686e-06, "loss": 3.3945, "step": 185 }, { "epoch": 0.07935153583617748, "grad_norm": 129.18272399902344, "learning_rate": 9.699011680143757e-06, "loss": 3.4141, "step": 186 }, { "epoch": 0.07977815699658702, "grad_norm": 27.627702713012695, "learning_rate": 9.694519317160828e-06, "loss": 3.3594, "step": 187 }, { "epoch": 0.08020477815699659, "grad_norm": 19.12004852294922, "learning_rate": 9.690026954177899e-06, "loss": 3.1738, "step": 188 }, { "epoch": 0.08063139931740615, "grad_norm": 54.20294189453125, "learning_rate": 9.685534591194969e-06, "loss": 3.3184, "step": 189 }, { "epoch": 0.0810580204778157, "grad_norm": 79.17913818359375, "learning_rate": 9.68104222821204e-06, "loss": 3.8184, "step": 190 }, { "epoch": 0.08148464163822526, "grad_norm": 93.08058166503906, "learning_rate": 9.676549865229111e-06, "loss": 3.7852, "step": 191 }, { "epoch": 0.08191126279863481, "grad_norm": 94.66240692138672, "learning_rate": 9.672057502246183e-06, "loss": 4.0957, "step": 192 }, { "epoch": 0.08233788395904437, "grad_norm": 74.10448455810547, "learning_rate": 9.667565139263253e-06, "loss": 3.3672, "step": 193 }, { "epoch": 0.08276450511945392, "grad_norm": 15.366697311401367, "learning_rate": 9.663072776280325e-06, "loss": 2.8145, "step": 194 }, { "epoch": 0.08319112627986348, "grad_norm": 28.673463821411133, "learning_rate": 9.658580413297395e-06, "loss": 3.334, "step": 195 }, { "epoch": 0.08361774744027303, "grad_norm": 67.39178466796875, "learning_rate": 9.654088050314467e-06, "loss": 3.5195, "step": 196 }, { "epoch": 0.0840443686006826, "grad_norm": 50.03989791870117, "learning_rate": 9.649595687331537e-06, "loss": 3.6289, "step": 197 }, { "epoch": 0.08447098976109214, "grad_norm": 60.73853302001953, "learning_rate": 9.645103324348607e-06, "loss": 3.5918, "step": 198 }, { "epoch": 0.0848976109215017, "grad_norm": 62.10542297363281, "learning_rate": 9.64061096136568e-06, "loss": 3.0469, "step": 199 }, { "epoch": 0.08532423208191127, "grad_norm": 46.920196533203125, "learning_rate": 9.63611859838275e-06, "loss": 3.1494, "step": 200 }, { "epoch": 0.08575085324232082, "grad_norm": 33.74738693237305, "learning_rate": 9.631626235399821e-06, "loss": 3.6973, "step": 201 }, { "epoch": 0.08617747440273038, "grad_norm": 41.40378189086914, "learning_rate": 9.627133872416892e-06, "loss": 3.373, "step": 202 }, { "epoch": 0.08660409556313993, "grad_norm": 83.42320251464844, "learning_rate": 9.622641509433963e-06, "loss": 3.4805, "step": 203 }, { "epoch": 0.08703071672354949, "grad_norm": 60.74094772338867, "learning_rate": 9.618149146451034e-06, "loss": 3.2285, "step": 204 }, { "epoch": 0.08745733788395904, "grad_norm": 80.0930404663086, "learning_rate": 9.613656783468106e-06, "loss": 3.5938, "step": 205 }, { "epoch": 0.0878839590443686, "grad_norm": 45.48680877685547, "learning_rate": 9.609164420485176e-06, "loss": 3.3945, "step": 206 }, { "epoch": 0.08831058020477815, "grad_norm": 37.88972473144531, "learning_rate": 9.604672057502246e-06, "loss": 2.7588, "step": 207 }, { "epoch": 0.08873720136518772, "grad_norm": 25.568714141845703, "learning_rate": 9.600179694519318e-06, "loss": 3.2305, "step": 208 }, { "epoch": 0.08916382252559726, "grad_norm": 56.041358947753906, "learning_rate": 9.595687331536388e-06, "loss": 3.123, "step": 209 }, { "epoch": 0.08959044368600683, "grad_norm": 34.62221908569336, "learning_rate": 9.59119496855346e-06, "loss": 3.166, "step": 210 }, { "epoch": 0.09001706484641639, "grad_norm": 79.33340454101562, "learning_rate": 9.58670260557053e-06, "loss": 3.1953, "step": 211 }, { "epoch": 0.09044368600682594, "grad_norm": 45.325496673583984, "learning_rate": 9.582210242587602e-06, "loss": 3.4336, "step": 212 }, { "epoch": 0.0908703071672355, "grad_norm": 20.415624618530273, "learning_rate": 9.577717879604674e-06, "loss": 3.1016, "step": 213 }, { "epoch": 0.09129692832764505, "grad_norm": 80.71441650390625, "learning_rate": 9.573225516621744e-06, "loss": 3.8184, "step": 214 }, { "epoch": 0.09172354948805461, "grad_norm": 44.34187698364258, "learning_rate": 9.568733153638814e-06, "loss": 3.4629, "step": 215 }, { "epoch": 0.09215017064846416, "grad_norm": 23.23796272277832, "learning_rate": 9.564240790655886e-06, "loss": 3.5723, "step": 216 }, { "epoch": 0.09257679180887372, "grad_norm": 24.016454696655273, "learning_rate": 9.559748427672956e-06, "loss": 3.0312, "step": 217 }, { "epoch": 0.09300341296928327, "grad_norm": 22.58914566040039, "learning_rate": 9.555256064690027e-06, "loss": 3.3926, "step": 218 }, { "epoch": 0.09343003412969283, "grad_norm": 43.52719497680664, "learning_rate": 9.550763701707098e-06, "loss": 3.0215, "step": 219 }, { "epoch": 0.09385665529010238, "grad_norm": 19.073322296142578, "learning_rate": 9.54627133872417e-06, "loss": 3.0996, "step": 220 }, { "epoch": 0.09428327645051195, "grad_norm": 35.90131759643555, "learning_rate": 9.54177897574124e-06, "loss": 3.0117, "step": 221 }, { "epoch": 0.0947098976109215, "grad_norm": 41.415557861328125, "learning_rate": 9.537286612758312e-06, "loss": 2.7695, "step": 222 }, { "epoch": 0.09513651877133106, "grad_norm": 37.814273834228516, "learning_rate": 9.532794249775383e-06, "loss": 3.3242, "step": 223 }, { "epoch": 0.09556313993174062, "grad_norm": 85.59257507324219, "learning_rate": 9.528301886792455e-06, "loss": 3.6572, "step": 224 }, { "epoch": 0.09598976109215017, "grad_norm": 60.9831428527832, "learning_rate": 9.523809523809525e-06, "loss": 3.0859, "step": 225 }, { "epoch": 0.09641638225255973, "grad_norm": 66.6321029663086, "learning_rate": 9.519317160826595e-06, "loss": 3.2715, "step": 226 }, { "epoch": 0.09684300341296928, "grad_norm": 30.54461097717285, "learning_rate": 9.514824797843667e-06, "loss": 2.5059, "step": 227 }, { "epoch": 0.09726962457337884, "grad_norm": 31.020240783691406, "learning_rate": 9.510332434860737e-06, "loss": 3.0801, "step": 228 }, { "epoch": 0.09769624573378839, "grad_norm": 38.5662841796875, "learning_rate": 9.505840071877809e-06, "loss": 3.4961, "step": 229 }, { "epoch": 0.09812286689419795, "grad_norm": 32.846153259277344, "learning_rate": 9.501347708894879e-06, "loss": 3.2148, "step": 230 }, { "epoch": 0.0985494880546075, "grad_norm": 24.943174362182617, "learning_rate": 9.496855345911951e-06, "loss": 3.1523, "step": 231 }, { "epoch": 0.09897610921501707, "grad_norm": 36.72368240356445, "learning_rate": 9.492362982929021e-06, "loss": 3.0732, "step": 232 }, { "epoch": 0.09940273037542662, "grad_norm": 18.37833023071289, "learning_rate": 9.487870619946093e-06, "loss": 2.8652, "step": 233 }, { "epoch": 0.09982935153583618, "grad_norm": 23.98996353149414, "learning_rate": 9.483378256963163e-06, "loss": 3.1426, "step": 234 }, { "epoch": 0.10025597269624574, "grad_norm": 43.43721008300781, "learning_rate": 9.478885893980234e-06, "loss": 3.6602, "step": 235 }, { "epoch": 0.10068259385665529, "grad_norm": 59.83910369873047, "learning_rate": 9.474393530997305e-06, "loss": 3.2461, "step": 236 }, { "epoch": 0.10110921501706485, "grad_norm": 26.23409652709961, "learning_rate": 9.469901168014376e-06, "loss": 2.832, "step": 237 }, { "epoch": 0.1015358361774744, "grad_norm": 20.064634323120117, "learning_rate": 9.465408805031447e-06, "loss": 2.9238, "step": 238 }, { "epoch": 0.10196245733788396, "grad_norm": 17.101430892944336, "learning_rate": 9.460916442048518e-06, "loss": 2.8154, "step": 239 }, { "epoch": 0.10238907849829351, "grad_norm": 23.318517684936523, "learning_rate": 9.45642407906559e-06, "loss": 3.3594, "step": 240 }, { "epoch": 0.10281569965870307, "grad_norm": 35.31196212768555, "learning_rate": 9.451931716082661e-06, "loss": 2.875, "step": 241 }, { "epoch": 0.10324232081911262, "grad_norm": 25.542692184448242, "learning_rate": 9.447439353099732e-06, "loss": 2.8672, "step": 242 }, { "epoch": 0.10366894197952219, "grad_norm": 28.5914249420166, "learning_rate": 9.442946990116802e-06, "loss": 3.252, "step": 243 }, { "epoch": 0.10409556313993173, "grad_norm": 35.261146545410156, "learning_rate": 9.438454627133872e-06, "loss": 2.9238, "step": 244 }, { "epoch": 0.1045221843003413, "grad_norm": 31.006851196289062, "learning_rate": 9.433962264150944e-06, "loss": 3.543, "step": 245 }, { "epoch": 0.10494880546075085, "grad_norm": 28.32399559020996, "learning_rate": 9.429469901168014e-06, "loss": 3.1533, "step": 246 }, { "epoch": 0.10537542662116041, "grad_norm": 22.59617805480957, "learning_rate": 9.424977538185086e-06, "loss": 2.9961, "step": 247 }, { "epoch": 0.10580204778156997, "grad_norm": 31.118349075317383, "learning_rate": 9.420485175202158e-06, "loss": 3.8945, "step": 248 }, { "epoch": 0.10622866894197952, "grad_norm": 42.014286041259766, "learning_rate": 9.415992812219228e-06, "loss": 2.7529, "step": 249 }, { "epoch": 0.10665529010238908, "grad_norm": 30.373682022094727, "learning_rate": 9.4115004492363e-06, "loss": 3.2578, "step": 250 }, { "epoch": 0.10708191126279863, "grad_norm": 18.01287078857422, "learning_rate": 9.40700808625337e-06, "loss": 3.1133, "step": 251 }, { "epoch": 0.1075085324232082, "grad_norm": 34.93136978149414, "learning_rate": 9.40251572327044e-06, "loss": 2.9883, "step": 252 }, { "epoch": 0.10793515358361774, "grad_norm": 19.136699676513672, "learning_rate": 9.398023360287512e-06, "loss": 3.2773, "step": 253 }, { "epoch": 0.1083617747440273, "grad_norm": 67.24413299560547, "learning_rate": 9.393530997304582e-06, "loss": 3.0068, "step": 254 }, { "epoch": 0.10878839590443685, "grad_norm": 65.19268035888672, "learning_rate": 9.389038634321654e-06, "loss": 3.1406, "step": 255 }, { "epoch": 0.10921501706484642, "grad_norm": 78.92341613769531, "learning_rate": 9.384546271338725e-06, "loss": 3.709, "step": 256 }, { "epoch": 0.10964163822525597, "grad_norm": 58.49309539794922, "learning_rate": 9.380053908355796e-06, "loss": 3.1504, "step": 257 }, { "epoch": 0.11006825938566553, "grad_norm": 29.386493682861328, "learning_rate": 9.375561545372867e-06, "loss": 3.25, "step": 258 }, { "epoch": 0.11049488054607509, "grad_norm": 36.24472427368164, "learning_rate": 9.371069182389939e-06, "loss": 3.084, "step": 259 }, { "epoch": 0.11092150170648464, "grad_norm": 91.86468505859375, "learning_rate": 9.366576819407009e-06, "loss": 3.084, "step": 260 }, { "epoch": 0.1113481228668942, "grad_norm": 105.02268981933594, "learning_rate": 9.36208445642408e-06, "loss": 4.002, "step": 261 }, { "epoch": 0.11177474402730375, "grad_norm": 66.99861907958984, "learning_rate": 9.357592093441151e-06, "loss": 3.1582, "step": 262 }, { "epoch": 0.11220136518771331, "grad_norm": 19.5322208404541, "learning_rate": 9.353099730458221e-06, "loss": 3.25, "step": 263 }, { "epoch": 0.11262798634812286, "grad_norm": 21.23870849609375, "learning_rate": 9.348607367475293e-06, "loss": 3.4746, "step": 264 }, { "epoch": 0.11305460750853243, "grad_norm": 21.280193328857422, "learning_rate": 9.344115004492363e-06, "loss": 3.0547, "step": 265 }, { "epoch": 0.11348122866894197, "grad_norm": 23.46653938293457, "learning_rate": 9.339622641509435e-06, "loss": 2.9883, "step": 266 }, { "epoch": 0.11390784982935154, "grad_norm": 24.50129508972168, "learning_rate": 9.335130278526505e-06, "loss": 3.2188, "step": 267 }, { "epoch": 0.11433447098976109, "grad_norm": 20.978782653808594, "learning_rate": 9.330637915543577e-06, "loss": 3.1113, "step": 268 }, { "epoch": 0.11476109215017065, "grad_norm": 23.665151596069336, "learning_rate": 9.326145552560647e-06, "loss": 2.9453, "step": 269 }, { "epoch": 0.11518771331058021, "grad_norm": 19.293272018432617, "learning_rate": 9.32165318957772e-06, "loss": 2.8809, "step": 270 }, { "epoch": 0.11561433447098976, "grad_norm": 17.699464797973633, "learning_rate": 9.31716082659479e-06, "loss": 2.9258, "step": 271 }, { "epoch": 0.11604095563139932, "grad_norm": 44.36705780029297, "learning_rate": 9.31266846361186e-06, "loss": 2.7197, "step": 272 }, { "epoch": 0.11646757679180887, "grad_norm": 23.318252563476562, "learning_rate": 9.308176100628931e-06, "loss": 3.8906, "step": 273 }, { "epoch": 0.11689419795221843, "grad_norm": 49.59714889526367, "learning_rate": 9.303683737646002e-06, "loss": 3.3008, "step": 274 }, { "epoch": 0.11732081911262798, "grad_norm": 33.267974853515625, "learning_rate": 9.299191374663074e-06, "loss": 3.498, "step": 275 }, { "epoch": 0.11774744027303755, "grad_norm": 27.456832885742188, "learning_rate": 9.294699011680145e-06, "loss": 3.5273, "step": 276 }, { "epoch": 0.1181740614334471, "grad_norm": 38.26292037963867, "learning_rate": 9.290206648697216e-06, "loss": 3.0762, "step": 277 }, { "epoch": 0.11860068259385666, "grad_norm": 41.656185150146484, "learning_rate": 9.285714285714288e-06, "loss": 3.4746, "step": 278 }, { "epoch": 0.1190273037542662, "grad_norm": 32.80821228027344, "learning_rate": 9.281221922731358e-06, "loss": 3.7451, "step": 279 }, { "epoch": 0.11945392491467577, "grad_norm": 20.763341903686523, "learning_rate": 9.276729559748428e-06, "loss": 3.3711, "step": 280 }, { "epoch": 0.11988054607508532, "grad_norm": 62.02823257446289, "learning_rate": 9.272237196765498e-06, "loss": 3.2432, "step": 281 }, { "epoch": 0.12030716723549488, "grad_norm": 17.502212524414062, "learning_rate": 9.26774483378257e-06, "loss": 3.1055, "step": 282 }, { "epoch": 0.12073378839590444, "grad_norm": 19.061084747314453, "learning_rate": 9.263252470799642e-06, "loss": 3.1777, "step": 283 }, { "epoch": 0.12116040955631399, "grad_norm": 18.355392456054688, "learning_rate": 9.258760107816712e-06, "loss": 2.4844, "step": 284 }, { "epoch": 0.12158703071672355, "grad_norm": 53.59149932861328, "learning_rate": 9.254267744833784e-06, "loss": 3.3242, "step": 285 }, { "epoch": 0.1220136518771331, "grad_norm": 64.19649505615234, "learning_rate": 9.249775381850854e-06, "loss": 3.4668, "step": 286 }, { "epoch": 0.12244027303754267, "grad_norm": 24.470962524414062, "learning_rate": 9.245283018867926e-06, "loss": 3.1016, "step": 287 }, { "epoch": 0.12286689419795221, "grad_norm": 35.040287017822266, "learning_rate": 9.240790655884996e-06, "loss": 3.0752, "step": 288 }, { "epoch": 0.12329351535836178, "grad_norm": 49.83961486816406, "learning_rate": 9.236298292902067e-06, "loss": 3.7383, "step": 289 }, { "epoch": 0.12372013651877133, "grad_norm": 71.65055084228516, "learning_rate": 9.231805929919138e-06, "loss": 3.7207, "step": 290 }, { "epoch": 0.12414675767918089, "grad_norm": 35.319000244140625, "learning_rate": 9.227313566936209e-06, "loss": 3.6309, "step": 291 }, { "epoch": 0.12457337883959044, "grad_norm": 29.98273468017578, "learning_rate": 9.22282120395328e-06, "loss": 2.9551, "step": 292 }, { "epoch": 0.125, "grad_norm": 26.370370864868164, "learning_rate": 9.21832884097035e-06, "loss": 3.2305, "step": 293 }, { "epoch": 0.12542662116040956, "grad_norm": 61.236785888671875, "learning_rate": 9.213836477987423e-06, "loss": 3.7461, "step": 294 }, { "epoch": 0.12585324232081913, "grad_norm": 23.307022094726562, "learning_rate": 9.209344115004493e-06, "loss": 2.8672, "step": 295 }, { "epoch": 0.12627986348122866, "grad_norm": 42.118377685546875, "learning_rate": 9.204851752021565e-06, "loss": 2.8496, "step": 296 }, { "epoch": 0.12670648464163822, "grad_norm": 62.872283935546875, "learning_rate": 9.200359389038635e-06, "loss": 3.3105, "step": 297 }, { "epoch": 0.12713310580204779, "grad_norm": 27.943336486816406, "learning_rate": 9.195867026055707e-06, "loss": 3.3574, "step": 298 }, { "epoch": 0.12755972696245735, "grad_norm": 32.46928787231445, "learning_rate": 9.191374663072777e-06, "loss": 3.5156, "step": 299 }, { "epoch": 0.12798634812286688, "grad_norm": 61.87351989746094, "learning_rate": 9.186882300089847e-06, "loss": 3.2422, "step": 300 }, { "epoch": 0.12841296928327645, "grad_norm": 108.4822998046875, "learning_rate": 9.182389937106919e-06, "loss": 3.3574, "step": 301 }, { "epoch": 0.128839590443686, "grad_norm": 95.23423767089844, "learning_rate": 9.17789757412399e-06, "loss": 3.5137, "step": 302 }, { "epoch": 0.12926621160409557, "grad_norm": 75.93583679199219, "learning_rate": 9.173405211141061e-06, "loss": 3.2715, "step": 303 }, { "epoch": 0.1296928327645051, "grad_norm": 48.323036193847656, "learning_rate": 9.168912848158133e-06, "loss": 3.0869, "step": 304 }, { "epoch": 0.13011945392491467, "grad_norm": 33.861515045166016, "learning_rate": 9.164420485175203e-06, "loss": 3.332, "step": 305 }, { "epoch": 0.13054607508532423, "grad_norm": 48.168663024902344, "learning_rate": 9.159928122192273e-06, "loss": 3.4336, "step": 306 }, { "epoch": 0.1309726962457338, "grad_norm": 43.305660247802734, "learning_rate": 9.155435759209345e-06, "loss": 2.998, "step": 307 }, { "epoch": 0.13139931740614336, "grad_norm": 32.60908889770508, "learning_rate": 9.150943396226416e-06, "loss": 2.9258, "step": 308 }, { "epoch": 0.1318259385665529, "grad_norm": 68.1447525024414, "learning_rate": 9.146451033243486e-06, "loss": 3.3203, "step": 309 }, { "epoch": 0.13225255972696245, "grad_norm": 66.20540618896484, "learning_rate": 9.141958670260558e-06, "loss": 3.2773, "step": 310 }, { "epoch": 0.13267918088737202, "grad_norm": 39.17079544067383, "learning_rate": 9.13746630727763e-06, "loss": 3.168, "step": 311 }, { "epoch": 0.13310580204778158, "grad_norm": 48.80988311767578, "learning_rate": 9.1329739442947e-06, "loss": 3.7939, "step": 312 }, { "epoch": 0.13353242320819111, "grad_norm": 49.57157897949219, "learning_rate": 9.128481581311772e-06, "loss": 3.5527, "step": 313 }, { "epoch": 0.13395904436860068, "grad_norm": 29.27692222595215, "learning_rate": 9.123989218328842e-06, "loss": 3.3145, "step": 314 }, { "epoch": 0.13438566552901024, "grad_norm": 79.36219024658203, "learning_rate": 9.119496855345914e-06, "loss": 3.9492, "step": 315 }, { "epoch": 0.1348122866894198, "grad_norm": 67.88323974609375, "learning_rate": 9.115004492362984e-06, "loss": 3.292, "step": 316 }, { "epoch": 0.13523890784982937, "grad_norm": 30.693199157714844, "learning_rate": 9.110512129380054e-06, "loss": 3.4082, "step": 317 }, { "epoch": 0.1356655290102389, "grad_norm": 49.18144989013672, "learning_rate": 9.106019766397126e-06, "loss": 4.2949, "step": 318 }, { "epoch": 0.13609215017064846, "grad_norm": 22.70636749267578, "learning_rate": 9.101527403414196e-06, "loss": 3.124, "step": 319 }, { "epoch": 0.13651877133105803, "grad_norm": 17.64731788635254, "learning_rate": 9.097035040431268e-06, "loss": 3.2451, "step": 320 }, { "epoch": 0.1369453924914676, "grad_norm": 35.68535232543945, "learning_rate": 9.092542677448338e-06, "loss": 3.2607, "step": 321 }, { "epoch": 0.13737201365187712, "grad_norm": 25.95724868774414, "learning_rate": 9.08805031446541e-06, "loss": 3.1426, "step": 322 }, { "epoch": 0.13779863481228669, "grad_norm": 20.903661727905273, "learning_rate": 9.08355795148248e-06, "loss": 2.7314, "step": 323 }, { "epoch": 0.13822525597269625, "grad_norm": 35.06086730957031, "learning_rate": 9.079065588499552e-06, "loss": 3.2363, "step": 324 }, { "epoch": 0.1386518771331058, "grad_norm": 14.262763023376465, "learning_rate": 9.074573225516622e-06, "loss": 3.0547, "step": 325 }, { "epoch": 0.13907849829351535, "grad_norm": 14.322052001953125, "learning_rate": 9.070080862533693e-06, "loss": 2.9902, "step": 326 }, { "epoch": 0.1395051194539249, "grad_norm": 27.241592407226562, "learning_rate": 9.065588499550765e-06, "loss": 2.542, "step": 327 }, { "epoch": 0.13993174061433447, "grad_norm": 23.563817977905273, "learning_rate": 9.061096136567835e-06, "loss": 3.4922, "step": 328 }, { "epoch": 0.14035836177474403, "grad_norm": 26.41652488708496, "learning_rate": 9.056603773584907e-06, "loss": 3.1562, "step": 329 }, { "epoch": 0.1407849829351536, "grad_norm": 41.39048385620117, "learning_rate": 9.052111410601977e-06, "loss": 3.0781, "step": 330 }, { "epoch": 0.14121160409556313, "grad_norm": 48.801815032958984, "learning_rate": 9.047619047619049e-06, "loss": 2.877, "step": 331 }, { "epoch": 0.1416382252559727, "grad_norm": 27.74787139892578, "learning_rate": 9.04312668463612e-06, "loss": 3.5898, "step": 332 }, { "epoch": 0.14206484641638226, "grad_norm": 27.950883865356445, "learning_rate": 9.03863432165319e-06, "loss": 3.0215, "step": 333 }, { "epoch": 0.14249146757679182, "grad_norm": 27.625429153442383, "learning_rate": 9.034141958670261e-06, "loss": 3.377, "step": 334 }, { "epoch": 0.14291808873720135, "grad_norm": 56.00491714477539, "learning_rate": 9.029649595687333e-06, "loss": 3.6309, "step": 335 }, { "epoch": 0.14334470989761092, "grad_norm": 21.263246536254883, "learning_rate": 9.025157232704403e-06, "loss": 3.0566, "step": 336 }, { "epoch": 0.14377133105802048, "grad_norm": 19.661069869995117, "learning_rate": 9.020664869721473e-06, "loss": 3.0547, "step": 337 }, { "epoch": 0.14419795221843004, "grad_norm": 22.829408645629883, "learning_rate": 9.016172506738545e-06, "loss": 2.9902, "step": 338 }, { "epoch": 0.14462457337883958, "grad_norm": 26.44325065612793, "learning_rate": 9.011680143755617e-06, "loss": 3.0586, "step": 339 }, { "epoch": 0.14505119453924914, "grad_norm": 23.391923904418945, "learning_rate": 9.007187780772687e-06, "loss": 3.3105, "step": 340 }, { "epoch": 0.1454778156996587, "grad_norm": 18.95509910583496, "learning_rate": 9.002695417789759e-06, "loss": 2.7871, "step": 341 }, { "epoch": 0.14590443686006827, "grad_norm": 18.797534942626953, "learning_rate": 8.99820305480683e-06, "loss": 3.4238, "step": 342 }, { "epoch": 0.14633105802047783, "grad_norm": 12.933985710144043, "learning_rate": 8.9937106918239e-06, "loss": 2.957, "step": 343 }, { "epoch": 0.14675767918088736, "grad_norm": 74.13087463378906, "learning_rate": 8.989218328840971e-06, "loss": 3.709, "step": 344 }, { "epoch": 0.14718430034129693, "grad_norm": 27.274499893188477, "learning_rate": 8.984725965858042e-06, "loss": 3.1973, "step": 345 }, { "epoch": 0.1476109215017065, "grad_norm": 21.707658767700195, "learning_rate": 8.980233602875114e-06, "loss": 3.1875, "step": 346 }, { "epoch": 0.14803754266211605, "grad_norm": 22.60236930847168, "learning_rate": 8.975741239892184e-06, "loss": 2.7441, "step": 347 }, { "epoch": 0.14846416382252559, "grad_norm": 22.136030197143555, "learning_rate": 8.971248876909256e-06, "loss": 3.1777, "step": 348 }, { "epoch": 0.14889078498293515, "grad_norm": 41.246482849121094, "learning_rate": 8.966756513926326e-06, "loss": 3.207, "step": 349 }, { "epoch": 0.1493174061433447, "grad_norm": 20.428062438964844, "learning_rate": 8.962264150943398e-06, "loss": 3.2832, "step": 350 }, { "epoch": 0.14974402730375427, "grad_norm": 18.96843147277832, "learning_rate": 8.957771787960468e-06, "loss": 2.7578, "step": 351 }, { "epoch": 0.15017064846416384, "grad_norm": 51.70307540893555, "learning_rate": 8.95327942497754e-06, "loss": 3.1045, "step": 352 }, { "epoch": 0.15059726962457337, "grad_norm": 24.539751052856445, "learning_rate": 8.94878706199461e-06, "loss": 3.3164, "step": 353 }, { "epoch": 0.15102389078498293, "grad_norm": 28.489370346069336, "learning_rate": 8.94429469901168e-06, "loss": 2.8105, "step": 354 }, { "epoch": 0.1514505119453925, "grad_norm": 67.5721664428711, "learning_rate": 8.939802336028752e-06, "loss": 3.4707, "step": 355 }, { "epoch": 0.15187713310580206, "grad_norm": 19.821903228759766, "learning_rate": 8.935309973045822e-06, "loss": 3.2227, "step": 356 }, { "epoch": 0.1523037542662116, "grad_norm": 27.4295711517334, "learning_rate": 8.930817610062894e-06, "loss": 2.8145, "step": 357 }, { "epoch": 0.15273037542662116, "grad_norm": 42.77101516723633, "learning_rate": 8.926325247079964e-06, "loss": 2.791, "step": 358 }, { "epoch": 0.15315699658703072, "grad_norm": 15.855666160583496, "learning_rate": 8.921832884097036e-06, "loss": 3.0332, "step": 359 }, { "epoch": 0.15358361774744028, "grad_norm": 21.724477767944336, "learning_rate": 8.917340521114106e-06, "loss": 3.0322, "step": 360 }, { "epoch": 0.15401023890784982, "grad_norm": 44.551021575927734, "learning_rate": 8.912848158131178e-06, "loss": 3.3057, "step": 361 }, { "epoch": 0.15443686006825938, "grad_norm": 16.5596981048584, "learning_rate": 8.908355795148249e-06, "loss": 3.0547, "step": 362 }, { "epoch": 0.15486348122866894, "grad_norm": 17.56165313720703, "learning_rate": 8.903863432165319e-06, "loss": 3.0039, "step": 363 }, { "epoch": 0.1552901023890785, "grad_norm": 21.125158309936523, "learning_rate": 8.89937106918239e-06, "loss": 3.0469, "step": 364 }, { "epoch": 0.15571672354948807, "grad_norm": 39.21709442138672, "learning_rate": 8.89487870619946e-06, "loss": 3.1875, "step": 365 }, { "epoch": 0.1561433447098976, "grad_norm": 52.8671875, "learning_rate": 8.890386343216533e-06, "loss": 3.5781, "step": 366 }, { "epoch": 0.15656996587030717, "grad_norm": 33.692012786865234, "learning_rate": 8.885893980233603e-06, "loss": 3.3535, "step": 367 }, { "epoch": 0.15699658703071673, "grad_norm": 75.68614196777344, "learning_rate": 8.881401617250675e-06, "loss": 2.6094, "step": 368 }, { "epoch": 0.1574232081911263, "grad_norm": 65.00382232666016, "learning_rate": 8.876909254267747e-06, "loss": 2.8457, "step": 369 }, { "epoch": 0.15784982935153583, "grad_norm": 78.48975372314453, "learning_rate": 8.872416891284817e-06, "loss": 2.9648, "step": 370 }, { "epoch": 0.1582764505119454, "grad_norm": 43.618648529052734, "learning_rate": 8.867924528301887e-06, "loss": 3.1523, "step": 371 }, { "epoch": 0.15870307167235495, "grad_norm": 20.660226821899414, "learning_rate": 8.863432165318957e-06, "loss": 2.9531, "step": 372 }, { "epoch": 0.1591296928327645, "grad_norm": 17.3546199798584, "learning_rate": 8.85893980233603e-06, "loss": 2.6953, "step": 373 }, { "epoch": 0.15955631399317405, "grad_norm": 82.11502838134766, "learning_rate": 8.8544474393531e-06, "loss": 3.3164, "step": 374 }, { "epoch": 0.1599829351535836, "grad_norm": 56.06876754760742, "learning_rate": 8.849955076370171e-06, "loss": 2.9805, "step": 375 }, { "epoch": 0.16040955631399317, "grad_norm": 63.79014587402344, "learning_rate": 8.845462713387243e-06, "loss": 2.9902, "step": 376 }, { "epoch": 0.16083617747440274, "grad_norm": 62.75901412963867, "learning_rate": 8.840970350404313e-06, "loss": 2.8008, "step": 377 }, { "epoch": 0.1612627986348123, "grad_norm": 30.096982955932617, "learning_rate": 8.836477987421385e-06, "loss": 3.5918, "step": 378 }, { "epoch": 0.16168941979522183, "grad_norm": 38.117801666259766, "learning_rate": 8.831985624438455e-06, "loss": 3.0928, "step": 379 }, { "epoch": 0.1621160409556314, "grad_norm": 23.72600746154785, "learning_rate": 8.827493261455526e-06, "loss": 3.0742, "step": 380 }, { "epoch": 0.16254266211604096, "grad_norm": 34.349876403808594, "learning_rate": 8.823000898472598e-06, "loss": 3.6162, "step": 381 }, { "epoch": 0.16296928327645052, "grad_norm": 41.78531265258789, "learning_rate": 8.818508535489668e-06, "loss": 2.623, "step": 382 }, { "epoch": 0.16339590443686006, "grad_norm": 16.38304328918457, "learning_rate": 8.81401617250674e-06, "loss": 2.7578, "step": 383 }, { "epoch": 0.16382252559726962, "grad_norm": 26.40755844116211, "learning_rate": 8.80952380952381e-06, "loss": 2.9482, "step": 384 }, { "epoch": 0.16424914675767918, "grad_norm": 64.92070007324219, "learning_rate": 8.805031446540882e-06, "loss": 3.1387, "step": 385 }, { "epoch": 0.16467576791808874, "grad_norm": 50.635353088378906, "learning_rate": 8.800539083557952e-06, "loss": 3.2109, "step": 386 }, { "epoch": 0.1651023890784983, "grad_norm": 20.982133865356445, "learning_rate": 8.796046720575024e-06, "loss": 2.9834, "step": 387 }, { "epoch": 0.16552901023890784, "grad_norm": 70.14364624023438, "learning_rate": 8.791554357592094e-06, "loss": 3.8008, "step": 388 }, { "epoch": 0.1659556313993174, "grad_norm": 19.195613861083984, "learning_rate": 8.787061994609166e-06, "loss": 3.3164, "step": 389 }, { "epoch": 0.16638225255972697, "grad_norm": 22.438356399536133, "learning_rate": 8.782569631626236e-06, "loss": 3.0254, "step": 390 }, { "epoch": 0.16680887372013653, "grad_norm": 18.336254119873047, "learning_rate": 8.778077268643306e-06, "loss": 2.877, "step": 391 }, { "epoch": 0.16723549488054607, "grad_norm": 19.402578353881836, "learning_rate": 8.773584905660378e-06, "loss": 3.1582, "step": 392 }, { "epoch": 0.16766211604095563, "grad_norm": 18.75274085998535, "learning_rate": 8.769092542677448e-06, "loss": 3.0508, "step": 393 }, { "epoch": 0.1680887372013652, "grad_norm": 29.596078872680664, "learning_rate": 8.76460017969452e-06, "loss": 3.3203, "step": 394 }, { "epoch": 0.16851535836177475, "grad_norm": 54.33580780029297, "learning_rate": 8.76010781671159e-06, "loss": 2.8652, "step": 395 }, { "epoch": 0.1689419795221843, "grad_norm": 19.791545867919922, "learning_rate": 8.755615453728662e-06, "loss": 2.7363, "step": 396 }, { "epoch": 0.16936860068259385, "grad_norm": 13.774062156677246, "learning_rate": 8.751123090745734e-06, "loss": 2.623, "step": 397 }, { "epoch": 0.1697952218430034, "grad_norm": 16.633455276489258, "learning_rate": 8.746630727762804e-06, "loss": 3.0918, "step": 398 }, { "epoch": 0.17022184300341298, "grad_norm": 19.359846115112305, "learning_rate": 8.742138364779875e-06, "loss": 2.9824, "step": 399 }, { "epoch": 0.17064846416382254, "grad_norm": 20.94213104248047, "learning_rate": 8.737646001796945e-06, "loss": 2.7168, "step": 400 }, { "epoch": 0.17107508532423207, "grad_norm": 36.21412658691406, "learning_rate": 8.733153638814017e-06, "loss": 3.0322, "step": 401 }, { "epoch": 0.17150170648464164, "grad_norm": 20.138742446899414, "learning_rate": 8.728661275831087e-06, "loss": 2.9668, "step": 402 }, { "epoch": 0.1719283276450512, "grad_norm": 23.036239624023438, "learning_rate": 8.724168912848159e-06, "loss": 2.6787, "step": 403 }, { "epoch": 0.17235494880546076, "grad_norm": 23.585355758666992, "learning_rate": 8.71967654986523e-06, "loss": 2.5127, "step": 404 }, { "epoch": 0.1727815699658703, "grad_norm": 39.485984802246094, "learning_rate": 8.715184186882301e-06, "loss": 3.0957, "step": 405 }, { "epoch": 0.17320819112627986, "grad_norm": 24.583091735839844, "learning_rate": 8.710691823899373e-06, "loss": 2.998, "step": 406 }, { "epoch": 0.17363481228668942, "grad_norm": 25.577939987182617, "learning_rate": 8.706199460916443e-06, "loss": 3.1807, "step": 407 }, { "epoch": 0.17406143344709898, "grad_norm": 20.640886306762695, "learning_rate": 8.701707097933513e-06, "loss": 2.8564, "step": 408 }, { "epoch": 0.17448805460750852, "grad_norm": 27.32965087890625, "learning_rate": 8.697214734950583e-06, "loss": 3.043, "step": 409 }, { "epoch": 0.17491467576791808, "grad_norm": 23.118106842041016, "learning_rate": 8.692722371967655e-06, "loss": 3.002, "step": 410 }, { "epoch": 0.17534129692832764, "grad_norm": 36.54021453857422, "learning_rate": 8.688230008984727e-06, "loss": 2.8516, "step": 411 }, { "epoch": 0.1757679180887372, "grad_norm": 33.472747802734375, "learning_rate": 8.683737646001797e-06, "loss": 2.8984, "step": 412 }, { "epoch": 0.17619453924914677, "grad_norm": 16.954639434814453, "learning_rate": 8.67924528301887e-06, "loss": 2.8779, "step": 413 }, { "epoch": 0.1766211604095563, "grad_norm": 15.781172752380371, "learning_rate": 8.67475292003594e-06, "loss": 2.8623, "step": 414 }, { "epoch": 0.17704778156996587, "grad_norm": 30.408344268798828, "learning_rate": 8.670260557053011e-06, "loss": 2.918, "step": 415 }, { "epoch": 0.17747440273037543, "grad_norm": 20.8376522064209, "learning_rate": 8.665768194070082e-06, "loss": 2.8662, "step": 416 }, { "epoch": 0.177901023890785, "grad_norm": 19.008398056030273, "learning_rate": 8.661275831087152e-06, "loss": 3.1709, "step": 417 }, { "epoch": 0.17832764505119453, "grad_norm": 21.907400131225586, "learning_rate": 8.656783468104224e-06, "loss": 2.9385, "step": 418 }, { "epoch": 0.1787542662116041, "grad_norm": 18.403654098510742, "learning_rate": 8.652291105121294e-06, "loss": 2.8965, "step": 419 }, { "epoch": 0.17918088737201365, "grad_norm": 45.25904083251953, "learning_rate": 8.647798742138366e-06, "loss": 2.4941, "step": 420 }, { "epoch": 0.17960750853242322, "grad_norm": 21.15122413635254, "learning_rate": 8.643306379155436e-06, "loss": 2.9277, "step": 421 }, { "epoch": 0.18003412969283278, "grad_norm": 20.08800506591797, "learning_rate": 8.638814016172508e-06, "loss": 2.8164, "step": 422 }, { "epoch": 0.1804607508532423, "grad_norm": 41.06690979003906, "learning_rate": 8.634321653189578e-06, "loss": 3.2324, "step": 423 }, { "epoch": 0.18088737201365188, "grad_norm": 37.022769927978516, "learning_rate": 8.62982929020665e-06, "loss": 3.0547, "step": 424 }, { "epoch": 0.18131399317406144, "grad_norm": 30.134794235229492, "learning_rate": 8.62533692722372e-06, "loss": 3.1455, "step": 425 }, { "epoch": 0.181740614334471, "grad_norm": 29.732011795043945, "learning_rate": 8.620844564240792e-06, "loss": 3.1504, "step": 426 }, { "epoch": 0.18216723549488054, "grad_norm": 27.267990112304688, "learning_rate": 8.616352201257862e-06, "loss": 3.0957, "step": 427 }, { "epoch": 0.1825938566552901, "grad_norm": 20.294174194335938, "learning_rate": 8.611859838274932e-06, "loss": 3.2441, "step": 428 }, { "epoch": 0.18302047781569966, "grad_norm": 59.82269287109375, "learning_rate": 8.607367475292004e-06, "loss": 2.7686, "step": 429 }, { "epoch": 0.18344709897610922, "grad_norm": 13.050138473510742, "learning_rate": 8.602875112309074e-06, "loss": 3.1133, "step": 430 }, { "epoch": 0.18387372013651876, "grad_norm": 23.325040817260742, "learning_rate": 8.598382749326146e-06, "loss": 3.1797, "step": 431 }, { "epoch": 0.18430034129692832, "grad_norm": 42.300777435302734, "learning_rate": 8.593890386343218e-06, "loss": 3.1504, "step": 432 }, { "epoch": 0.18472696245733788, "grad_norm": 17.584157943725586, "learning_rate": 8.589398023360288e-06, "loss": 2.8652, "step": 433 }, { "epoch": 0.18515358361774745, "grad_norm": 25.537080764770508, "learning_rate": 8.58490566037736e-06, "loss": 3.1504, "step": 434 }, { "epoch": 0.185580204778157, "grad_norm": 33.94193649291992, "learning_rate": 8.58041329739443e-06, "loss": 3.4141, "step": 435 }, { "epoch": 0.18600682593856654, "grad_norm": 36.545509338378906, "learning_rate": 8.5759209344115e-06, "loss": 3.4863, "step": 436 }, { "epoch": 0.1864334470989761, "grad_norm": 22.413768768310547, "learning_rate": 8.571428571428571e-06, "loss": 3.123, "step": 437 }, { "epoch": 0.18686006825938567, "grad_norm": 20.436925888061523, "learning_rate": 8.566936208445643e-06, "loss": 3.0488, "step": 438 }, { "epoch": 0.18728668941979523, "grad_norm": 82.86627960205078, "learning_rate": 8.562443845462715e-06, "loss": 3.2324, "step": 439 }, { "epoch": 0.18771331058020477, "grad_norm": 85.6285629272461, "learning_rate": 8.557951482479785e-06, "loss": 3.5078, "step": 440 }, { "epoch": 0.18813993174061433, "grad_norm": 90.78428649902344, "learning_rate": 8.553459119496857e-06, "loss": 2.9971, "step": 441 }, { "epoch": 0.1885665529010239, "grad_norm": 64.85440063476562, "learning_rate": 8.548966756513927e-06, "loss": 3.4453, "step": 442 }, { "epoch": 0.18899317406143346, "grad_norm": 21.0238094329834, "learning_rate": 8.544474393530999e-06, "loss": 2.6582, "step": 443 }, { "epoch": 0.189419795221843, "grad_norm": 28.667882919311523, "learning_rate": 8.539982030548069e-06, "loss": 3.623, "step": 444 }, { "epoch": 0.18984641638225255, "grad_norm": 55.42424392700195, "learning_rate": 8.53548966756514e-06, "loss": 3.0801, "step": 445 }, { "epoch": 0.19027303754266212, "grad_norm": 20.951005935668945, "learning_rate": 8.530997304582211e-06, "loss": 2.6289, "step": 446 }, { "epoch": 0.19069965870307168, "grad_norm": 92.44082641601562, "learning_rate": 8.526504941599281e-06, "loss": 3.8604, "step": 447 }, { "epoch": 0.19112627986348124, "grad_norm": 49.035438537597656, "learning_rate": 8.522012578616353e-06, "loss": 3.3428, "step": 448 }, { "epoch": 0.19155290102389078, "grad_norm": 34.45978546142578, "learning_rate": 8.517520215633423e-06, "loss": 3.3242, "step": 449 }, { "epoch": 0.19197952218430034, "grad_norm": 20.125938415527344, "learning_rate": 8.513027852650495e-06, "loss": 2.8877, "step": 450 }, { "epoch": 0.1924061433447099, "grad_norm": 27.46782684326172, "learning_rate": 8.508535489667566e-06, "loss": 3.1895, "step": 451 }, { "epoch": 0.19283276450511946, "grad_norm": 59.36712646484375, "learning_rate": 8.504043126684637e-06, "loss": 3.3203, "step": 452 }, { "epoch": 0.193259385665529, "grad_norm": 33.552188873291016, "learning_rate": 8.499550763701708e-06, "loss": 3.2256, "step": 453 }, { "epoch": 0.19368600682593856, "grad_norm": 17.913942337036133, "learning_rate": 8.495058400718778e-06, "loss": 2.7002, "step": 454 }, { "epoch": 0.19411262798634812, "grad_norm": 28.015735626220703, "learning_rate": 8.49056603773585e-06, "loss": 2.7158, "step": 455 }, { "epoch": 0.1945392491467577, "grad_norm": 14.068643569946289, "learning_rate": 8.48607367475292e-06, "loss": 2.916, "step": 456 }, { "epoch": 0.19496587030716722, "grad_norm": 19.000486373901367, "learning_rate": 8.481581311769992e-06, "loss": 2.8457, "step": 457 }, { "epoch": 0.19539249146757678, "grad_norm": 19.684755325317383, "learning_rate": 8.477088948787062e-06, "loss": 3.1113, "step": 458 }, { "epoch": 0.19581911262798635, "grad_norm": 25.133827209472656, "learning_rate": 8.472596585804134e-06, "loss": 2.2959, "step": 459 }, { "epoch": 0.1962457337883959, "grad_norm": 35.57670593261719, "learning_rate": 8.468104222821206e-06, "loss": 3.082, "step": 460 }, { "epoch": 0.19667235494880547, "grad_norm": 58.25027847290039, "learning_rate": 8.463611859838276e-06, "loss": 2.9502, "step": 461 }, { "epoch": 0.197098976109215, "grad_norm": 50.10265350341797, "learning_rate": 8.459119496855346e-06, "loss": 3.1689, "step": 462 }, { "epoch": 0.19752559726962457, "grad_norm": 22.792301177978516, "learning_rate": 8.454627133872418e-06, "loss": 2.7754, "step": 463 }, { "epoch": 0.19795221843003413, "grad_norm": 24.666540145874023, "learning_rate": 8.450134770889488e-06, "loss": 2.7695, "step": 464 }, { "epoch": 0.1983788395904437, "grad_norm": 22.577451705932617, "learning_rate": 8.445642407906558e-06, "loss": 2.8379, "step": 465 }, { "epoch": 0.19880546075085323, "grad_norm": 71.5382308959961, "learning_rate": 8.44115004492363e-06, "loss": 3.3887, "step": 466 }, { "epoch": 0.1992320819112628, "grad_norm": 55.62384033203125, "learning_rate": 8.436657681940702e-06, "loss": 3.293, "step": 467 }, { "epoch": 0.19965870307167236, "grad_norm": 33.4422492980957, "learning_rate": 8.432165318957772e-06, "loss": 3.4922, "step": 468 }, { "epoch": 0.20008532423208192, "grad_norm": 65.28173828125, "learning_rate": 8.427672955974844e-06, "loss": 3.0293, "step": 469 }, { "epoch": 0.20051194539249148, "grad_norm": 22.559324264526367, "learning_rate": 8.423180592991915e-06, "loss": 2.7207, "step": 470 }, { "epoch": 0.20093856655290102, "grad_norm": 25.741518020629883, "learning_rate": 8.418688230008986e-06, "loss": 2.6143, "step": 471 }, { "epoch": 0.20136518771331058, "grad_norm": 77.9397201538086, "learning_rate": 8.414195867026057e-06, "loss": 3.1191, "step": 472 }, { "epoch": 0.20179180887372014, "grad_norm": 96.0541000366211, "learning_rate": 8.409703504043127e-06, "loss": 3.2148, "step": 473 }, { "epoch": 0.2022184300341297, "grad_norm": 84.57242584228516, "learning_rate": 8.405211141060199e-06, "loss": 3.1709, "step": 474 }, { "epoch": 0.20264505119453924, "grad_norm": 60.31303024291992, "learning_rate": 8.400718778077269e-06, "loss": 3.1729, "step": 475 }, { "epoch": 0.2030716723549488, "grad_norm": 44.18376159667969, "learning_rate": 8.39622641509434e-06, "loss": 3.0234, "step": 476 }, { "epoch": 0.20349829351535836, "grad_norm": 35.493770599365234, "learning_rate": 8.391734052111411e-06, "loss": 2.8408, "step": 477 }, { "epoch": 0.20392491467576793, "grad_norm": 22.417808532714844, "learning_rate": 8.387241689128483e-06, "loss": 2.8154, "step": 478 }, { "epoch": 0.20435153583617746, "grad_norm": 16.359333038330078, "learning_rate": 8.382749326145553e-06, "loss": 2.6514, "step": 479 }, { "epoch": 0.20477815699658702, "grad_norm": 80.31161499023438, "learning_rate": 8.378256963162625e-06, "loss": 3.1973, "step": 480 }, { "epoch": 0.2052047781569966, "grad_norm": 56.79182815551758, "learning_rate": 8.373764600179695e-06, "loss": 3.2148, "step": 481 }, { "epoch": 0.20563139931740615, "grad_norm": 79.41356658935547, "learning_rate": 8.369272237196765e-06, "loss": 3.1064, "step": 482 }, { "epoch": 0.2060580204778157, "grad_norm": 92.82357025146484, "learning_rate": 8.364779874213837e-06, "loss": 2.8896, "step": 483 }, { "epoch": 0.20648464163822525, "grad_norm": 63.247501373291016, "learning_rate": 8.360287511230907e-06, "loss": 3.0967, "step": 484 }, { "epoch": 0.2069112627986348, "grad_norm": 21.605344772338867, "learning_rate": 8.35579514824798e-06, "loss": 2.9824, "step": 485 }, { "epoch": 0.20733788395904437, "grad_norm": 20.009994506835938, "learning_rate": 8.35130278526505e-06, "loss": 2.9199, "step": 486 }, { "epoch": 0.20776450511945393, "grad_norm": 23.621641159057617, "learning_rate": 8.346810422282121e-06, "loss": 3.0059, "step": 487 }, { "epoch": 0.20819112627986347, "grad_norm": 16.526639938354492, "learning_rate": 8.342318059299193e-06, "loss": 2.3086, "step": 488 }, { "epoch": 0.20861774744027303, "grad_norm": 50.59242630004883, "learning_rate": 8.337825696316264e-06, "loss": 3.3828, "step": 489 }, { "epoch": 0.2090443686006826, "grad_norm": 21.509166717529297, "learning_rate": 8.333333333333334e-06, "loss": 2.293, "step": 490 }, { "epoch": 0.20947098976109216, "grad_norm": 60.66617965698242, "learning_rate": 8.328840970350404e-06, "loss": 2.9844, "step": 491 }, { "epoch": 0.2098976109215017, "grad_norm": 48.574520111083984, "learning_rate": 8.324348607367476e-06, "loss": 3.2734, "step": 492 }, { "epoch": 0.21032423208191126, "grad_norm": 33.63777160644531, "learning_rate": 8.319856244384546e-06, "loss": 2.8203, "step": 493 }, { "epoch": 0.21075085324232082, "grad_norm": 28.87843132019043, "learning_rate": 8.315363881401618e-06, "loss": 2.6367, "step": 494 }, { "epoch": 0.21117747440273038, "grad_norm": 27.04725456237793, "learning_rate": 8.31087151841869e-06, "loss": 2.9414, "step": 495 }, { "epoch": 0.21160409556313994, "grad_norm": 25.990129470825195, "learning_rate": 8.30637915543576e-06, "loss": 2.583, "step": 496 }, { "epoch": 0.21203071672354948, "grad_norm": 19.33521270751953, "learning_rate": 8.301886792452832e-06, "loss": 2.9385, "step": 497 }, { "epoch": 0.21245733788395904, "grad_norm": 55.301395416259766, "learning_rate": 8.297394429469902e-06, "loss": 2.6992, "step": 498 }, { "epoch": 0.2128839590443686, "grad_norm": 36.61276626586914, "learning_rate": 8.292902066486972e-06, "loss": 3.0576, "step": 499 }, { "epoch": 0.21331058020477817, "grad_norm": 68.78702545166016, "learning_rate": 8.288409703504044e-06, "loss": 3.4453, "step": 500 }, { "epoch": 0.2137372013651877, "grad_norm": 42.436580657958984, "learning_rate": 8.283917340521114e-06, "loss": 2.8301, "step": 501 }, { "epoch": 0.21416382252559726, "grad_norm": 26.847007751464844, "learning_rate": 8.279424977538186e-06, "loss": 3.5859, "step": 502 }, { "epoch": 0.21459044368600683, "grad_norm": 20.354631423950195, "learning_rate": 8.274932614555256e-06, "loss": 3.1436, "step": 503 }, { "epoch": 0.2150170648464164, "grad_norm": 21.232385635375977, "learning_rate": 8.270440251572328e-06, "loss": 3.1562, "step": 504 }, { "epoch": 0.21544368600682595, "grad_norm": 14.351090431213379, "learning_rate": 8.265947888589399e-06, "loss": 2.6172, "step": 505 }, { "epoch": 0.2158703071672355, "grad_norm": 39.42547607421875, "learning_rate": 8.26145552560647e-06, "loss": 2.7881, "step": 506 }, { "epoch": 0.21629692832764505, "grad_norm": 49.7579460144043, "learning_rate": 8.25696316262354e-06, "loss": 3.2529, "step": 507 }, { "epoch": 0.2167235494880546, "grad_norm": 45.9566535949707, "learning_rate": 8.252470799640613e-06, "loss": 2.3877, "step": 508 }, { "epoch": 0.21715017064846417, "grad_norm": 16.314781188964844, "learning_rate": 8.247978436657683e-06, "loss": 3.1973, "step": 509 }, { "epoch": 0.2175767918088737, "grad_norm": 17.71656608581543, "learning_rate": 8.243486073674753e-06, "loss": 2.8887, "step": 510 }, { "epoch": 0.21800341296928327, "grad_norm": 17.876148223876953, "learning_rate": 8.238993710691825e-06, "loss": 3.207, "step": 511 }, { "epoch": 0.21843003412969283, "grad_norm": 29.692302703857422, "learning_rate": 8.234501347708895e-06, "loss": 2.8604, "step": 512 }, { "epoch": 0.2188566552901024, "grad_norm": 28.06431770324707, "learning_rate": 8.230008984725967e-06, "loss": 2.8867, "step": 513 }, { "epoch": 0.21928327645051193, "grad_norm": 46.33169937133789, "learning_rate": 8.225516621743037e-06, "loss": 3.5039, "step": 514 }, { "epoch": 0.2197098976109215, "grad_norm": 25.718185424804688, "learning_rate": 8.221024258760109e-06, "loss": 3.1055, "step": 515 }, { "epoch": 0.22013651877133106, "grad_norm": 17.153024673461914, "learning_rate": 8.21653189577718e-06, "loss": 2.9961, "step": 516 }, { "epoch": 0.22056313993174062, "grad_norm": 18.76894760131836, "learning_rate": 8.212039532794251e-06, "loss": 2.6982, "step": 517 }, { "epoch": 0.22098976109215018, "grad_norm": 34.840484619140625, "learning_rate": 8.207547169811321e-06, "loss": 2.9414, "step": 518 }, { "epoch": 0.22141638225255972, "grad_norm": 44.31029510498047, "learning_rate": 8.203054806828391e-06, "loss": 3.1445, "step": 519 }, { "epoch": 0.22184300341296928, "grad_norm": 14.775177955627441, "learning_rate": 8.198562443845463e-06, "loss": 2.6436, "step": 520 }, { "epoch": 0.22226962457337884, "grad_norm": 45.092811584472656, "learning_rate": 8.194070080862534e-06, "loss": 3.0488, "step": 521 }, { "epoch": 0.2226962457337884, "grad_norm": 18.923681259155273, "learning_rate": 8.189577717879605e-06, "loss": 3.1758, "step": 522 }, { "epoch": 0.22312286689419794, "grad_norm": 26.43110466003418, "learning_rate": 8.185085354896676e-06, "loss": 2.4033, "step": 523 }, { "epoch": 0.2235494880546075, "grad_norm": 20.219383239746094, "learning_rate": 8.180592991913748e-06, "loss": 2.7607, "step": 524 }, { "epoch": 0.22397610921501707, "grad_norm": 17.233644485473633, "learning_rate": 8.17610062893082e-06, "loss": 3.0059, "step": 525 }, { "epoch": 0.22440273037542663, "grad_norm": 30.3442325592041, "learning_rate": 8.17160826594789e-06, "loss": 2.4814, "step": 526 }, { "epoch": 0.22482935153583616, "grad_norm": 26.53208351135254, "learning_rate": 8.16711590296496e-06, "loss": 2.7441, "step": 527 }, { "epoch": 0.22525597269624573, "grad_norm": 34.5225944519043, "learning_rate": 8.16262353998203e-06, "loss": 3.0029, "step": 528 }, { "epoch": 0.2256825938566553, "grad_norm": 24.5207576751709, "learning_rate": 8.158131176999102e-06, "loss": 2.7129, "step": 529 }, { "epoch": 0.22610921501706485, "grad_norm": 26.960975646972656, "learning_rate": 8.153638814016172e-06, "loss": 2.8838, "step": 530 }, { "epoch": 0.22653583617747441, "grad_norm": 27.879377365112305, "learning_rate": 8.149146451033244e-06, "loss": 3.125, "step": 531 }, { "epoch": 0.22696245733788395, "grad_norm": 33.46174621582031, "learning_rate": 8.144654088050316e-06, "loss": 2.877, "step": 532 }, { "epoch": 0.2273890784982935, "grad_norm": 17.529651641845703, "learning_rate": 8.140161725067386e-06, "loss": 2.6582, "step": 533 }, { "epoch": 0.22781569965870307, "grad_norm": 28.782405853271484, "learning_rate": 8.135669362084458e-06, "loss": 2.9971, "step": 534 }, { "epoch": 0.22824232081911264, "grad_norm": 19.45545768737793, "learning_rate": 8.131176999101528e-06, "loss": 2.6934, "step": 535 }, { "epoch": 0.22866894197952217, "grad_norm": 31.947803497314453, "learning_rate": 8.126684636118598e-06, "loss": 3.084, "step": 536 }, { "epoch": 0.22909556313993173, "grad_norm": 46.25978469848633, "learning_rate": 8.12219227313567e-06, "loss": 2.7988, "step": 537 }, { "epoch": 0.2295221843003413, "grad_norm": 37.96335220336914, "learning_rate": 8.11769991015274e-06, "loss": 3.2832, "step": 538 }, { "epoch": 0.22994880546075086, "grad_norm": 51.27415466308594, "learning_rate": 8.113207547169812e-06, "loss": 3.0996, "step": 539 }, { "epoch": 0.23037542662116042, "grad_norm": 27.133466720581055, "learning_rate": 8.108715184186883e-06, "loss": 2.8252, "step": 540 }, { "epoch": 0.23080204778156996, "grad_norm": 15.50619888305664, "learning_rate": 8.104222821203954e-06, "loss": 2.8223, "step": 541 }, { "epoch": 0.23122866894197952, "grad_norm": 27.35281753540039, "learning_rate": 8.099730458221025e-06, "loss": 2.7988, "step": 542 }, { "epoch": 0.23165529010238908, "grad_norm": 27.582168579101562, "learning_rate": 8.095238095238097e-06, "loss": 2.8779, "step": 543 }, { "epoch": 0.23208191126279865, "grad_norm": 46.383941650390625, "learning_rate": 8.090745732255167e-06, "loss": 2.6719, "step": 544 }, { "epoch": 0.23250853242320818, "grad_norm": 24.81082534790039, "learning_rate": 8.086253369272239e-06, "loss": 3.6602, "step": 545 }, { "epoch": 0.23293515358361774, "grad_norm": 19.902307510375977, "learning_rate": 8.081761006289309e-06, "loss": 2.6816, "step": 546 }, { "epoch": 0.2333617747440273, "grad_norm": 30.1766357421875, "learning_rate": 8.077268643306379e-06, "loss": 3.2646, "step": 547 }, { "epoch": 0.23378839590443687, "grad_norm": 18.741374969482422, "learning_rate": 8.072776280323451e-06, "loss": 3.4355, "step": 548 }, { "epoch": 0.2342150170648464, "grad_norm": 16.75374984741211, "learning_rate": 8.068283917340521e-06, "loss": 2.5967, "step": 549 }, { "epoch": 0.23464163822525597, "grad_norm": 17.990034103393555, "learning_rate": 8.063791554357593e-06, "loss": 2.6309, "step": 550 }, { "epoch": 0.23506825938566553, "grad_norm": 17.261295318603516, "learning_rate": 8.059299191374663e-06, "loss": 2.9277, "step": 551 }, { "epoch": 0.2354948805460751, "grad_norm": 29.741744995117188, "learning_rate": 8.054806828391735e-06, "loss": 3.4238, "step": 552 }, { "epoch": 0.23592150170648465, "grad_norm": 42.073055267333984, "learning_rate": 8.050314465408805e-06, "loss": 3.8125, "step": 553 }, { "epoch": 0.2363481228668942, "grad_norm": 15.395310401916504, "learning_rate": 8.045822102425877e-06, "loss": 3.0898, "step": 554 }, { "epoch": 0.23677474402730375, "grad_norm": 13.185260772705078, "learning_rate": 8.041329739442947e-06, "loss": 3.1797, "step": 555 }, { "epoch": 0.23720136518771331, "grad_norm": 57.49574661254883, "learning_rate": 8.036837376460018e-06, "loss": 3.0664, "step": 556 }, { "epoch": 0.23762798634812288, "grad_norm": 19.3037109375, "learning_rate": 8.03234501347709e-06, "loss": 2.8164, "step": 557 }, { "epoch": 0.2380546075085324, "grad_norm": 13.614612579345703, "learning_rate": 8.02785265049416e-06, "loss": 2.8848, "step": 558 }, { "epoch": 0.23848122866894197, "grad_norm": 36.09251403808594, "learning_rate": 8.023360287511232e-06, "loss": 2.8193, "step": 559 }, { "epoch": 0.23890784982935154, "grad_norm": 38.101280212402344, "learning_rate": 8.018867924528303e-06, "loss": 2.9141, "step": 560 }, { "epoch": 0.2393344709897611, "grad_norm": 30.57044792175293, "learning_rate": 8.014375561545374e-06, "loss": 2.9688, "step": 561 }, { "epoch": 0.23976109215017063, "grad_norm": 55.0594367980957, "learning_rate": 8.009883198562446e-06, "loss": 3.2461, "step": 562 }, { "epoch": 0.2401877133105802, "grad_norm": 20.185823440551758, "learning_rate": 8.005390835579516e-06, "loss": 2.8135, "step": 563 }, { "epoch": 0.24061433447098976, "grad_norm": 23.831144332885742, "learning_rate": 8.000898472596586e-06, "loss": 2.6895, "step": 564 }, { "epoch": 0.24104095563139932, "grad_norm": 65.2356948852539, "learning_rate": 7.996406109613656e-06, "loss": 2.8037, "step": 565 }, { "epoch": 0.24146757679180889, "grad_norm": 47.4893684387207, "learning_rate": 7.991913746630728e-06, "loss": 3.1465, "step": 566 }, { "epoch": 0.24189419795221842, "grad_norm": 37.859413146972656, "learning_rate": 7.9874213836478e-06, "loss": 3.0283, "step": 567 }, { "epoch": 0.24232081911262798, "grad_norm": 26.822053909301758, "learning_rate": 7.98292902066487e-06, "loss": 3.1855, "step": 568 }, { "epoch": 0.24274744027303755, "grad_norm": 17.800918579101562, "learning_rate": 7.978436657681942e-06, "loss": 2.7031, "step": 569 }, { "epoch": 0.2431740614334471, "grad_norm": 55.43043518066406, "learning_rate": 7.973944294699012e-06, "loss": 3.0273, "step": 570 }, { "epoch": 0.24360068259385664, "grad_norm": 24.088314056396484, "learning_rate": 7.969451931716084e-06, "loss": 3.166, "step": 571 }, { "epoch": 0.2440273037542662, "grad_norm": 57.95652389526367, "learning_rate": 7.964959568733154e-06, "loss": 2.7227, "step": 572 }, { "epoch": 0.24445392491467577, "grad_norm": 43.33866500854492, "learning_rate": 7.960467205750224e-06, "loss": 2.7988, "step": 573 }, { "epoch": 0.24488054607508533, "grad_norm": 46.34353256225586, "learning_rate": 7.955974842767296e-06, "loss": 3.0703, "step": 574 }, { "epoch": 0.2453071672354949, "grad_norm": 35.27037048339844, "learning_rate": 7.951482479784367e-06, "loss": 3.1719, "step": 575 }, { "epoch": 0.24573378839590443, "grad_norm": 21.704389572143555, "learning_rate": 7.946990116801438e-06, "loss": 2.4922, "step": 576 }, { "epoch": 0.246160409556314, "grad_norm": 23.15915298461914, "learning_rate": 7.942497753818509e-06, "loss": 2.7568, "step": 577 }, { "epoch": 0.24658703071672355, "grad_norm": 51.94176483154297, "learning_rate": 7.93800539083558e-06, "loss": 2.3887, "step": 578 }, { "epoch": 0.24701365187713312, "grad_norm": 24.061573028564453, "learning_rate": 7.93351302785265e-06, "loss": 2.8096, "step": 579 }, { "epoch": 0.24744027303754265, "grad_norm": 15.878485679626465, "learning_rate": 7.929020664869723e-06, "loss": 2.4375, "step": 580 }, { "epoch": 0.24786689419795221, "grad_norm": 19.481571197509766, "learning_rate": 7.924528301886793e-06, "loss": 2.9883, "step": 581 }, { "epoch": 0.24829351535836178, "grad_norm": 31.10467529296875, "learning_rate": 7.920035938903865e-06, "loss": 2.2686, "step": 582 }, { "epoch": 0.24872013651877134, "grad_norm": 19.816104888916016, "learning_rate": 7.915543575920935e-06, "loss": 2.6562, "step": 583 }, { "epoch": 0.24914675767918087, "grad_norm": 18.393310546875, "learning_rate": 7.911051212938005e-06, "loss": 2.8135, "step": 584 }, { "epoch": 0.24957337883959044, "grad_norm": 38.82542419433594, "learning_rate": 7.906558849955077e-06, "loss": 2.5684, "step": 585 }, { "epoch": 0.25, "grad_norm": 89.28064727783203, "learning_rate": 7.902066486972147e-06, "loss": 3.9434, "step": 586 }, { "epoch": 0.25042662116040953, "grad_norm": 66.42613983154297, "learning_rate": 7.897574123989219e-06, "loss": 3.1797, "step": 587 }, { "epoch": 0.2508532423208191, "grad_norm": 85.55559539794922, "learning_rate": 7.893081761006291e-06, "loss": 3.2402, "step": 588 }, { "epoch": 0.25127986348122866, "grad_norm": 64.09162902832031, "learning_rate": 7.888589398023361e-06, "loss": 3.3799, "step": 589 }, { "epoch": 0.25170648464163825, "grad_norm": 18.269323348999023, "learning_rate": 7.884097035040431e-06, "loss": 2.7402, "step": 590 }, { "epoch": 0.2521331058020478, "grad_norm": 27.09288787841797, "learning_rate": 7.879604672057503e-06, "loss": 2.7715, "step": 591 }, { "epoch": 0.2525597269624573, "grad_norm": 23.382566452026367, "learning_rate": 7.875112309074573e-06, "loss": 2.8662, "step": 592 }, { "epoch": 0.2529863481228669, "grad_norm": 34.532344818115234, "learning_rate": 7.870619946091644e-06, "loss": 2.6865, "step": 593 }, { "epoch": 0.25341296928327645, "grad_norm": 64.53623962402344, "learning_rate": 7.866127583108716e-06, "loss": 2.9072, "step": 594 }, { "epoch": 0.253839590443686, "grad_norm": 36.02356719970703, "learning_rate": 7.861635220125787e-06, "loss": 2.6055, "step": 595 }, { "epoch": 0.25426621160409557, "grad_norm": 77.93729400634766, "learning_rate": 7.857142857142858e-06, "loss": 3.1201, "step": 596 }, { "epoch": 0.2546928327645051, "grad_norm": 33.50895309448242, "learning_rate": 7.85265049415993e-06, "loss": 3.1641, "step": 597 }, { "epoch": 0.2551194539249147, "grad_norm": 21.22097396850586, "learning_rate": 7.848158131177e-06, "loss": 3.252, "step": 598 }, { "epoch": 0.25554607508532423, "grad_norm": 28.865007400512695, "learning_rate": 7.843665768194072e-06, "loss": 3.2734, "step": 599 }, { "epoch": 0.25597269624573377, "grad_norm": 24.108217239379883, "learning_rate": 7.839173405211142e-06, "loss": 2.8125, "step": 600 }, { "epoch": 0.25639931740614336, "grad_norm": 38.636558532714844, "learning_rate": 7.834681042228212e-06, "loss": 3.1211, "step": 601 }, { "epoch": 0.2568259385665529, "grad_norm": 20.850168228149414, "learning_rate": 7.830188679245284e-06, "loss": 3.4316, "step": 602 }, { "epoch": 0.2572525597269625, "grad_norm": 40.4669075012207, "learning_rate": 7.825696316262354e-06, "loss": 2.4248, "step": 603 }, { "epoch": 0.257679180887372, "grad_norm": 21.405136108398438, "learning_rate": 7.821203953279426e-06, "loss": 2.7734, "step": 604 }, { "epoch": 0.25810580204778155, "grad_norm": 16.794015884399414, "learning_rate": 7.816711590296496e-06, "loss": 2.4277, "step": 605 }, { "epoch": 0.25853242320819114, "grad_norm": 16.479524612426758, "learning_rate": 7.812219227313568e-06, "loss": 2.8281, "step": 606 }, { "epoch": 0.2589590443686007, "grad_norm": 38.69625473022461, "learning_rate": 7.807726864330638e-06, "loss": 3.0762, "step": 607 }, { "epoch": 0.2593856655290102, "grad_norm": 64.19967651367188, "learning_rate": 7.80323450134771e-06, "loss": 3.1436, "step": 608 }, { "epoch": 0.2598122866894198, "grad_norm": 31.575109481811523, "learning_rate": 7.79874213836478e-06, "loss": 2.5459, "step": 609 }, { "epoch": 0.26023890784982934, "grad_norm": 46.65521240234375, "learning_rate": 7.79424977538185e-06, "loss": 3.0957, "step": 610 }, { "epoch": 0.2606655290102389, "grad_norm": 15.696451187133789, "learning_rate": 7.789757412398922e-06, "loss": 3.0127, "step": 611 }, { "epoch": 0.26109215017064846, "grad_norm": 30.369564056396484, "learning_rate": 7.785265049415993e-06, "loss": 2.3926, "step": 612 }, { "epoch": 0.261518771331058, "grad_norm": 36.5367546081543, "learning_rate": 7.780772686433065e-06, "loss": 3.1211, "step": 613 }, { "epoch": 0.2619453924914676, "grad_norm": 52.71562957763672, "learning_rate": 7.776280323450135e-06, "loss": 2.4541, "step": 614 }, { "epoch": 0.2623720136518771, "grad_norm": 17.766803741455078, "learning_rate": 7.771787960467207e-06, "loss": 2.8086, "step": 615 }, { "epoch": 0.2627986348122867, "grad_norm": 22.81015968322754, "learning_rate": 7.767295597484279e-06, "loss": 2.5469, "step": 616 }, { "epoch": 0.26322525597269625, "grad_norm": 26.944904327392578, "learning_rate": 7.762803234501349e-06, "loss": 3.3516, "step": 617 }, { "epoch": 0.2636518771331058, "grad_norm": 30.520727157592773, "learning_rate": 7.758310871518419e-06, "loss": 2.5283, "step": 618 }, { "epoch": 0.2640784982935154, "grad_norm": 45.16160583496094, "learning_rate": 7.75381850853549e-06, "loss": 2.9727, "step": 619 }, { "epoch": 0.2645051194539249, "grad_norm": 48.80204391479492, "learning_rate": 7.749326145552561e-06, "loss": 3.4336, "step": 620 }, { "epoch": 0.26493174061433444, "grad_norm": 20.030519485473633, "learning_rate": 7.744833782569631e-06, "loss": 3.083, "step": 621 }, { "epoch": 0.26535836177474403, "grad_norm": 37.67460250854492, "learning_rate": 7.740341419586703e-06, "loss": 2.7139, "step": 622 }, { "epoch": 0.26578498293515357, "grad_norm": 27.934261322021484, "learning_rate": 7.735849056603775e-06, "loss": 2.9209, "step": 623 }, { "epoch": 0.26621160409556316, "grad_norm": 32.7962646484375, "learning_rate": 7.731356693620845e-06, "loss": 2.2637, "step": 624 }, { "epoch": 0.2666382252559727, "grad_norm": 21.026763916015625, "learning_rate": 7.726864330637917e-06, "loss": 3.0254, "step": 625 }, { "epoch": 0.26706484641638223, "grad_norm": 20.621740341186523, "learning_rate": 7.722371967654987e-06, "loss": 2.8926, "step": 626 }, { "epoch": 0.2674914675767918, "grad_norm": 20.117944717407227, "learning_rate": 7.717879604672058e-06, "loss": 3.3389, "step": 627 }, { "epoch": 0.26791808873720135, "grad_norm": 16.37653160095215, "learning_rate": 7.71338724168913e-06, "loss": 2.5674, "step": 628 }, { "epoch": 0.26834470989761094, "grad_norm": 17.15595817565918, "learning_rate": 7.7088948787062e-06, "loss": 2.8652, "step": 629 }, { "epoch": 0.2687713310580205, "grad_norm": 29.244970321655273, "learning_rate": 7.704402515723271e-06, "loss": 3.0859, "step": 630 }, { "epoch": 0.26919795221843, "grad_norm": 17.230205535888672, "learning_rate": 7.699910152740342e-06, "loss": 2.2197, "step": 631 }, { "epoch": 0.2696245733788396, "grad_norm": 18.00596046447754, "learning_rate": 7.695417789757414e-06, "loss": 2.498, "step": 632 }, { "epoch": 0.27005119453924914, "grad_norm": 27.735151290893555, "learning_rate": 7.690925426774484e-06, "loss": 2.8369, "step": 633 }, { "epoch": 0.27047781569965873, "grad_norm": 18.78544044494629, "learning_rate": 7.686433063791556e-06, "loss": 2.9736, "step": 634 }, { "epoch": 0.27090443686006827, "grad_norm": 44.827117919921875, "learning_rate": 7.681940700808626e-06, "loss": 2.9199, "step": 635 }, { "epoch": 0.2713310580204778, "grad_norm": 36.48609924316406, "learning_rate": 7.677448337825698e-06, "loss": 2.7637, "step": 636 }, { "epoch": 0.2717576791808874, "grad_norm": 18.71845245361328, "learning_rate": 7.672955974842768e-06, "loss": 2.8975, "step": 637 }, { "epoch": 0.2721843003412969, "grad_norm": 21.344680786132812, "learning_rate": 7.668463611859838e-06, "loss": 2.9189, "step": 638 }, { "epoch": 0.27261092150170646, "grad_norm": 21.78455924987793, "learning_rate": 7.66397124887691e-06, "loss": 2.7725, "step": 639 }, { "epoch": 0.27303754266211605, "grad_norm": 29.85883140563965, "learning_rate": 7.65947888589398e-06, "loss": 2.5283, "step": 640 }, { "epoch": 0.2734641638225256, "grad_norm": 20.216970443725586, "learning_rate": 7.654986522911052e-06, "loss": 3.0732, "step": 641 }, { "epoch": 0.2738907849829352, "grad_norm": 24.468124389648438, "learning_rate": 7.650494159928122e-06, "loss": 3.3447, "step": 642 }, { "epoch": 0.2743174061433447, "grad_norm": 23.012601852416992, "learning_rate": 7.646001796945194e-06, "loss": 2.9512, "step": 643 }, { "epoch": 0.27474402730375425, "grad_norm": 20.536497116088867, "learning_rate": 7.641509433962266e-06, "loss": 2.6768, "step": 644 }, { "epoch": 0.27517064846416384, "grad_norm": 29.968305587768555, "learning_rate": 7.637017070979336e-06, "loss": 2.3643, "step": 645 }, { "epoch": 0.27559726962457337, "grad_norm": 34.39928436279297, "learning_rate": 7.632524707996406e-06, "loss": 3.3252, "step": 646 }, { "epoch": 0.27602389078498296, "grad_norm": 23.07245635986328, "learning_rate": 7.6280323450134775e-06, "loss": 3.1465, "step": 647 }, { "epoch": 0.2764505119453925, "grad_norm": 52.81306457519531, "learning_rate": 7.623539982030549e-06, "loss": 2.874, "step": 648 }, { "epoch": 0.27687713310580203, "grad_norm": 13.939685821533203, "learning_rate": 7.61904761904762e-06, "loss": 2.7227, "step": 649 }, { "epoch": 0.2773037542662116, "grad_norm": 16.605741500854492, "learning_rate": 7.614555256064691e-06, "loss": 2.6914, "step": 650 }, { "epoch": 0.27773037542662116, "grad_norm": 18.800424575805664, "learning_rate": 7.6100628930817626e-06, "loss": 2.6963, "step": 651 }, { "epoch": 0.2781569965870307, "grad_norm": 51.422508239746094, "learning_rate": 7.605570530098833e-06, "loss": 2.7793, "step": 652 }, { "epoch": 0.2785836177474403, "grad_norm": 31.76155662536621, "learning_rate": 7.601078167115904e-06, "loss": 2.8135, "step": 653 }, { "epoch": 0.2790102389078498, "grad_norm": 27.845605850219727, "learning_rate": 7.596585804132974e-06, "loss": 2.3193, "step": 654 }, { "epoch": 0.2794368600682594, "grad_norm": 32.7022705078125, "learning_rate": 7.592093441150046e-06, "loss": 2.8477, "step": 655 }, { "epoch": 0.27986348122866894, "grad_norm": 32.736080169677734, "learning_rate": 7.587601078167116e-06, "loss": 2.9072, "step": 656 }, { "epoch": 0.2802901023890785, "grad_norm": 22.69315528869629, "learning_rate": 7.583108715184188e-06, "loss": 3.0986, "step": 657 }, { "epoch": 0.28071672354948807, "grad_norm": 41.13274383544922, "learning_rate": 7.578616352201259e-06, "loss": 2.5479, "step": 658 }, { "epoch": 0.2811433447098976, "grad_norm": 34.375823974609375, "learning_rate": 7.574123989218329e-06, "loss": 2.54, "step": 659 }, { "epoch": 0.2815699658703072, "grad_norm": 50.617103576660156, "learning_rate": 7.569631626235401e-06, "loss": 3.3945, "step": 660 }, { "epoch": 0.2819965870307167, "grad_norm": 19.791898727416992, "learning_rate": 7.565139263252471e-06, "loss": 2.6709, "step": 661 }, { "epoch": 0.28242320819112626, "grad_norm": 38.855125427246094, "learning_rate": 7.560646900269542e-06, "loss": 3.3887, "step": 662 }, { "epoch": 0.28284982935153585, "grad_norm": 19.478548049926758, "learning_rate": 7.5561545372866126e-06, "loss": 2.6631, "step": 663 }, { "epoch": 0.2832764505119454, "grad_norm": 33.800086975097656, "learning_rate": 7.5516621743036844e-06, "loss": 2.7578, "step": 664 }, { "epoch": 0.2837030716723549, "grad_norm": 23.76025390625, "learning_rate": 7.5471698113207555e-06, "loss": 3.123, "step": 665 }, { "epoch": 0.2841296928327645, "grad_norm": 15.858804702758789, "learning_rate": 7.5426774483378265e-06, "loss": 2.7354, "step": 666 }, { "epoch": 0.28455631399317405, "grad_norm": 23.25118064880371, "learning_rate": 7.538185085354898e-06, "loss": 2.332, "step": 667 }, { "epoch": 0.28498293515358364, "grad_norm": 13.508567810058594, "learning_rate": 7.533692722371968e-06, "loss": 2.6807, "step": 668 }, { "epoch": 0.2854095563139932, "grad_norm": 62.4262580871582, "learning_rate": 7.52920035938904e-06, "loss": 3.2617, "step": 669 }, { "epoch": 0.2858361774744027, "grad_norm": 21.98171615600586, "learning_rate": 7.52470799640611e-06, "loss": 2.9414, "step": 670 }, { "epoch": 0.2862627986348123, "grad_norm": 25.632265090942383, "learning_rate": 7.520215633423181e-06, "loss": 2.5762, "step": 671 }, { "epoch": 0.28668941979522183, "grad_norm": 14.146087646484375, "learning_rate": 7.515723270440253e-06, "loss": 2.7949, "step": 672 }, { "epoch": 0.2871160409556314, "grad_norm": 37.98383331298828, "learning_rate": 7.511230907457323e-06, "loss": 3.1572, "step": 673 }, { "epoch": 0.28754266211604096, "grad_norm": 44.23236083984375, "learning_rate": 7.506738544474395e-06, "loss": 2.5234, "step": 674 }, { "epoch": 0.2879692832764505, "grad_norm": 23.958847045898438, "learning_rate": 7.502246181491465e-06, "loss": 3.0811, "step": 675 }, { "epoch": 0.2883959044368601, "grad_norm": 19.74029541015625, "learning_rate": 7.497753818508536e-06, "loss": 2.8955, "step": 676 }, { "epoch": 0.2888225255972696, "grad_norm": 16.605003356933594, "learning_rate": 7.493261455525606e-06, "loss": 2.9785, "step": 677 }, { "epoch": 0.28924914675767915, "grad_norm": 17.542959213256836, "learning_rate": 7.488769092542678e-06, "loss": 2.9639, "step": 678 }, { "epoch": 0.28967576791808874, "grad_norm": 32.7658805847168, "learning_rate": 7.484276729559748e-06, "loss": 3.3887, "step": 679 }, { "epoch": 0.2901023890784983, "grad_norm": 23.959367752075195, "learning_rate": 7.47978436657682e-06, "loss": 2.6729, "step": 680 }, { "epoch": 0.29052901023890787, "grad_norm": 30.203109741210938, "learning_rate": 7.475292003593891e-06, "loss": 2.9775, "step": 681 }, { "epoch": 0.2909556313993174, "grad_norm": 48.85400390625, "learning_rate": 7.4707996406109616e-06, "loss": 2.6562, "step": 682 }, { "epoch": 0.29138225255972694, "grad_norm": 30.754945755004883, "learning_rate": 7.4663072776280334e-06, "loss": 3.0322, "step": 683 }, { "epoch": 0.29180887372013653, "grad_norm": 20.216299057006836, "learning_rate": 7.461814914645104e-06, "loss": 2.6914, "step": 684 }, { "epoch": 0.29223549488054607, "grad_norm": 25.015613555908203, "learning_rate": 7.457322551662175e-06, "loss": 2.8984, "step": 685 }, { "epoch": 0.29266211604095566, "grad_norm": 33.04557418823242, "learning_rate": 7.452830188679246e-06, "loss": 2.4697, "step": 686 }, { "epoch": 0.2930887372013652, "grad_norm": 45.48899459838867, "learning_rate": 7.448337825696317e-06, "loss": 2.751, "step": 687 }, { "epoch": 0.2935153583617747, "grad_norm": 14.871517181396484, "learning_rate": 7.443845462713389e-06, "loss": 2.75, "step": 688 }, { "epoch": 0.2939419795221843, "grad_norm": 42.968326568603516, "learning_rate": 7.439353099730459e-06, "loss": 2.9346, "step": 689 }, { "epoch": 0.29436860068259385, "grad_norm": 19.281057357788086, "learning_rate": 7.43486073674753e-06, "loss": 2.3662, "step": 690 }, { "epoch": 0.2947952218430034, "grad_norm": 33.68014907836914, "learning_rate": 7.4303683737646e-06, "loss": 2.8867, "step": 691 }, { "epoch": 0.295221843003413, "grad_norm": 22.495100021362305, "learning_rate": 7.425876010781672e-06, "loss": 2.6484, "step": 692 }, { "epoch": 0.2956484641638225, "grad_norm": 23.902799606323242, "learning_rate": 7.421383647798742e-06, "loss": 3.0869, "step": 693 }, { "epoch": 0.2960750853242321, "grad_norm": 29.26067352294922, "learning_rate": 7.416891284815813e-06, "loss": 2.6133, "step": 694 }, { "epoch": 0.29650170648464164, "grad_norm": 65.28421020507812, "learning_rate": 7.412398921832885e-06, "loss": 3.1631, "step": 695 }, { "epoch": 0.29692832764505117, "grad_norm": 19.710037231445312, "learning_rate": 7.407906558849955e-06, "loss": 2.4121, "step": 696 }, { "epoch": 0.29735494880546076, "grad_norm": 20.86551284790039, "learning_rate": 7.403414195867027e-06, "loss": 2.8691, "step": 697 }, { "epoch": 0.2977815699658703, "grad_norm": 26.411109924316406, "learning_rate": 7.398921832884097e-06, "loss": 3.0977, "step": 698 }, { "epoch": 0.2982081911262799, "grad_norm": 16.621917724609375, "learning_rate": 7.3944294699011685e-06, "loss": 2.4307, "step": 699 }, { "epoch": 0.2986348122866894, "grad_norm": 54.798316955566406, "learning_rate": 7.389937106918239e-06, "loss": 3.0176, "step": 700 }, { "epoch": 0.29906143344709896, "grad_norm": 16.24550437927246, "learning_rate": 7.3854447439353106e-06, "loss": 2.751, "step": 701 }, { "epoch": 0.29948805460750855, "grad_norm": 46.361629486083984, "learning_rate": 7.380952380952382e-06, "loss": 3.0439, "step": 702 }, { "epoch": 0.2999146757679181, "grad_norm": 24.557788848876953, "learning_rate": 7.376460017969453e-06, "loss": 3.0371, "step": 703 }, { "epoch": 0.3003412969283277, "grad_norm": 20.77698516845703, "learning_rate": 7.371967654986524e-06, "loss": 3.3896, "step": 704 }, { "epoch": 0.3007679180887372, "grad_norm": 30.620227813720703, "learning_rate": 7.367475292003594e-06, "loss": 2.8926, "step": 705 }, { "epoch": 0.30119453924914674, "grad_norm": 18.66374969482422, "learning_rate": 7.362982929020666e-06, "loss": 2.8789, "step": 706 }, { "epoch": 0.30162116040955633, "grad_norm": 46.04564666748047, "learning_rate": 7.358490566037736e-06, "loss": 2.4189, "step": 707 }, { "epoch": 0.30204778156996587, "grad_norm": 52.486331939697266, "learning_rate": 7.353998203054807e-06, "loss": 2.8389, "step": 708 }, { "epoch": 0.3024744027303754, "grad_norm": 14.944149017333984, "learning_rate": 7.349505840071879e-06, "loss": 2.7168, "step": 709 }, { "epoch": 0.302901023890785, "grad_norm": 14.924211502075195, "learning_rate": 7.345013477088949e-06, "loss": 2.877, "step": 710 }, { "epoch": 0.3033276450511945, "grad_norm": 21.726722717285156, "learning_rate": 7.340521114106021e-06, "loss": 2.7949, "step": 711 }, { "epoch": 0.3037542662116041, "grad_norm": 28.78704833984375, "learning_rate": 7.336028751123091e-06, "loss": 2.6279, "step": 712 }, { "epoch": 0.30418088737201365, "grad_norm": 41.87957763671875, "learning_rate": 7.331536388140162e-06, "loss": 3.2207, "step": 713 }, { "epoch": 0.3046075085324232, "grad_norm": 18.67123031616211, "learning_rate": 7.3270440251572324e-06, "loss": 2.8047, "step": 714 }, { "epoch": 0.3050341296928328, "grad_norm": 24.137971878051758, "learning_rate": 7.322551662174304e-06, "loss": 3.0098, "step": 715 }, { "epoch": 0.3054607508532423, "grad_norm": 31.285362243652344, "learning_rate": 7.318059299191375e-06, "loss": 2.501, "step": 716 }, { "epoch": 0.3058873720136519, "grad_norm": 15.62086296081543, "learning_rate": 7.313566936208446e-06, "loss": 2.5713, "step": 717 }, { "epoch": 0.30631399317406144, "grad_norm": 40.82994842529297, "learning_rate": 7.3090745732255175e-06, "loss": 2.5781, "step": 718 }, { "epoch": 0.306740614334471, "grad_norm": 48.110591888427734, "learning_rate": 7.304582210242588e-06, "loss": 2.5967, "step": 719 }, { "epoch": 0.30716723549488056, "grad_norm": 80.84425354003906, "learning_rate": 7.3000898472596595e-06, "loss": 3.1064, "step": 720 }, { "epoch": 0.3075938566552901, "grad_norm": 39.28225326538086, "learning_rate": 7.29559748427673e-06, "loss": 3.1758, "step": 721 }, { "epoch": 0.30802047781569963, "grad_norm": 28.14806365966797, "learning_rate": 7.291105121293801e-06, "loss": 2.9873, "step": 722 }, { "epoch": 0.3084470989761092, "grad_norm": 57.97172927856445, "learning_rate": 7.286612758310873e-06, "loss": 2.2529, "step": 723 }, { "epoch": 0.30887372013651876, "grad_norm": 19.667949676513672, "learning_rate": 7.282120395327943e-06, "loss": 2.749, "step": 724 }, { "epoch": 0.30930034129692835, "grad_norm": 41.68422317504883, "learning_rate": 7.277628032345015e-06, "loss": 2.6387, "step": 725 }, { "epoch": 0.3097269624573379, "grad_norm": 54.32461929321289, "learning_rate": 7.273135669362085e-06, "loss": 3.0742, "step": 726 }, { "epoch": 0.3101535836177474, "grad_norm": 39.13712692260742, "learning_rate": 7.268643306379156e-06, "loss": 2.5449, "step": 727 }, { "epoch": 0.310580204778157, "grad_norm": 85.2264404296875, "learning_rate": 7.264150943396226e-06, "loss": 3.1641, "step": 728 }, { "epoch": 0.31100682593856654, "grad_norm": 67.59739685058594, "learning_rate": 7.259658580413298e-06, "loss": 2.7236, "step": 729 }, { "epoch": 0.31143344709897613, "grad_norm": 51.05998229980469, "learning_rate": 7.255166217430369e-06, "loss": 2.7227, "step": 730 }, { "epoch": 0.31186006825938567, "grad_norm": 31.026859283447266, "learning_rate": 7.250673854447439e-06, "loss": 2.8906, "step": 731 }, { "epoch": 0.3122866894197952, "grad_norm": 15.103306770324707, "learning_rate": 7.246181491464511e-06, "loss": 2.4883, "step": 732 }, { "epoch": 0.3127133105802048, "grad_norm": 33.09571075439453, "learning_rate": 7.2416891284815814e-06, "loss": 2.6738, "step": 733 }, { "epoch": 0.31313993174061433, "grad_norm": 14.153240203857422, "learning_rate": 7.237196765498653e-06, "loss": 2.6084, "step": 734 }, { "epoch": 0.31356655290102387, "grad_norm": 27.6345157623291, "learning_rate": 7.2327044025157235e-06, "loss": 2.4736, "step": 735 }, { "epoch": 0.31399317406143346, "grad_norm": 38.9594612121582, "learning_rate": 7.2282120395327946e-06, "loss": 2.9785, "step": 736 }, { "epoch": 0.314419795221843, "grad_norm": 18.558412551879883, "learning_rate": 7.2237196765498665e-06, "loss": 2.6201, "step": 737 }, { "epoch": 0.3148464163822526, "grad_norm": 28.34615135192871, "learning_rate": 7.219227313566937e-06, "loss": 2.5713, "step": 738 }, { "epoch": 0.3152730375426621, "grad_norm": 20.91386604309082, "learning_rate": 7.214734950584008e-06, "loss": 2.7188, "step": 739 }, { "epoch": 0.31569965870307165, "grad_norm": 49.61574935913086, "learning_rate": 7.210242587601079e-06, "loss": 2.6348, "step": 740 }, { "epoch": 0.31612627986348124, "grad_norm": 19.535137176513672, "learning_rate": 7.20575022461815e-06, "loss": 3.002, "step": 741 }, { "epoch": 0.3165529010238908, "grad_norm": 16.409156799316406, "learning_rate": 7.20125786163522e-06, "loss": 2.54, "step": 742 }, { "epoch": 0.31697952218430037, "grad_norm": 25.113422393798828, "learning_rate": 7.196765498652292e-06, "loss": 2.7148, "step": 743 }, { "epoch": 0.3174061433447099, "grad_norm": 21.636598587036133, "learning_rate": 7.192273135669363e-06, "loss": 2.7275, "step": 744 }, { "epoch": 0.31783276450511944, "grad_norm": 17.268226623535156, "learning_rate": 7.187780772686433e-06, "loss": 2.9434, "step": 745 }, { "epoch": 0.318259385665529, "grad_norm": 25.657812118530273, "learning_rate": 7.183288409703505e-06, "loss": 2.3467, "step": 746 }, { "epoch": 0.31868600682593856, "grad_norm": 16.959508895874023, "learning_rate": 7.178796046720575e-06, "loss": 2.998, "step": 747 }, { "epoch": 0.3191126279863481, "grad_norm": 15.126802444458008, "learning_rate": 7.174303683737647e-06, "loss": 2.7227, "step": 748 }, { "epoch": 0.3195392491467577, "grad_norm": 35.737876892089844, "learning_rate": 7.169811320754717e-06, "loss": 2.8984, "step": 749 }, { "epoch": 0.3199658703071672, "grad_norm": 23.50027084350586, "learning_rate": 7.165318957771788e-06, "loss": 2.7188, "step": 750 }, { "epoch": 0.3203924914675768, "grad_norm": 46.277732849121094, "learning_rate": 7.16082659478886e-06, "loss": 2.8535, "step": 751 }, { "epoch": 0.32081911262798635, "grad_norm": 15.298663139343262, "learning_rate": 7.1563342318059304e-06, "loss": 2.5234, "step": 752 }, { "epoch": 0.3212457337883959, "grad_norm": 18.49799156188965, "learning_rate": 7.1518418688230015e-06, "loss": 2.627, "step": 753 }, { "epoch": 0.3216723549488055, "grad_norm": 24.766132354736328, "learning_rate": 7.1473495058400725e-06, "loss": 2.9336, "step": 754 }, { "epoch": 0.322098976109215, "grad_norm": 17.22234535217285, "learning_rate": 7.1428571428571436e-06, "loss": 3.1074, "step": 755 }, { "epoch": 0.3225255972696246, "grad_norm": 33.30344009399414, "learning_rate": 7.138364779874214e-06, "loss": 2.3975, "step": 756 }, { "epoch": 0.32295221843003413, "grad_norm": 21.910131454467773, "learning_rate": 7.133872416891286e-06, "loss": 2.3291, "step": 757 }, { "epoch": 0.32337883959044367, "grad_norm": 19.836538314819336, "learning_rate": 7.129380053908357e-06, "loss": 2.5684, "step": 758 }, { "epoch": 0.32380546075085326, "grad_norm": 25.19745445251465, "learning_rate": 7.124887690925427e-06, "loss": 2.8418, "step": 759 }, { "epoch": 0.3242320819112628, "grad_norm": 16.91659927368164, "learning_rate": 7.120395327942499e-06, "loss": 2.2397, "step": 760 }, { "epoch": 0.3246587030716723, "grad_norm": 50.67641067504883, "learning_rate": 7.115902964959569e-06, "loss": 3.2031, "step": 761 }, { "epoch": 0.3250853242320819, "grad_norm": 40.11088562011719, "learning_rate": 7.111410601976641e-06, "loss": 2.7275, "step": 762 }, { "epoch": 0.32551194539249145, "grad_norm": 43.53041458129883, "learning_rate": 7.106918238993711e-06, "loss": 3.5127, "step": 763 }, { "epoch": 0.32593856655290104, "grad_norm": 24.22418975830078, "learning_rate": 7.102425876010782e-06, "loss": 2.8223, "step": 764 }, { "epoch": 0.3263651877133106, "grad_norm": 20.259675979614258, "learning_rate": 7.097933513027854e-06, "loss": 2.7197, "step": 765 }, { "epoch": 0.3267918088737201, "grad_norm": 34.51850891113281, "learning_rate": 7.093441150044924e-06, "loss": 2.4951, "step": 766 }, { "epoch": 0.3272184300341297, "grad_norm": 41.48360824584961, "learning_rate": 7.088948787061995e-06, "loss": 2.4844, "step": 767 }, { "epoch": 0.32764505119453924, "grad_norm": 40.56487274169922, "learning_rate": 7.0844564240790654e-06, "loss": 3.0293, "step": 768 }, { "epoch": 0.32807167235494883, "grad_norm": 21.44087791442871, "learning_rate": 7.079964061096137e-06, "loss": 2.5938, "step": 769 }, { "epoch": 0.32849829351535836, "grad_norm": 17.163280487060547, "learning_rate": 7.0754716981132075e-06, "loss": 2.8359, "step": 770 }, { "epoch": 0.3289249146757679, "grad_norm": 37.45278549194336, "learning_rate": 7.0709793351302794e-06, "loss": 3.0166, "step": 771 }, { "epoch": 0.3293515358361775, "grad_norm": 24.047813415527344, "learning_rate": 7.0664869721473505e-06, "loss": 2.9121, "step": 772 }, { "epoch": 0.329778156996587, "grad_norm": 28.624671936035156, "learning_rate": 7.061994609164421e-06, "loss": 2.5586, "step": 773 }, { "epoch": 0.3302047781569966, "grad_norm": 16.891504287719727, "learning_rate": 7.0575022461814926e-06, "loss": 2.3682, "step": 774 }, { "epoch": 0.33063139931740615, "grad_norm": 18.889495849609375, "learning_rate": 7.053009883198563e-06, "loss": 2.9854, "step": 775 }, { "epoch": 0.3310580204778157, "grad_norm": 15.73511028289795, "learning_rate": 7.048517520215634e-06, "loss": 2.6162, "step": 776 }, { "epoch": 0.3314846416382253, "grad_norm": 28.1113338470459, "learning_rate": 7.044025157232705e-06, "loss": 3.1523, "step": 777 }, { "epoch": 0.3319112627986348, "grad_norm": 39.47050094604492, "learning_rate": 7.039532794249776e-06, "loss": 3.6523, "step": 778 }, { "epoch": 0.33233788395904434, "grad_norm": 28.715503692626953, "learning_rate": 7.035040431266848e-06, "loss": 2.7803, "step": 779 }, { "epoch": 0.33276450511945393, "grad_norm": 16.625925064086914, "learning_rate": 7.030548068283918e-06, "loss": 2.1416, "step": 780 }, { "epoch": 0.33319112627986347, "grad_norm": 22.609573364257812, "learning_rate": 7.026055705300989e-06, "loss": 3.0781, "step": 781 }, { "epoch": 0.33361774744027306, "grad_norm": 23.720556259155273, "learning_rate": 7.021563342318059e-06, "loss": 2.5449, "step": 782 }, { "epoch": 0.3340443686006826, "grad_norm": 21.275888442993164, "learning_rate": 7.017070979335131e-06, "loss": 2.9033, "step": 783 }, { "epoch": 0.33447098976109213, "grad_norm": 43.3743896484375, "learning_rate": 7.012578616352201e-06, "loss": 2.7246, "step": 784 }, { "epoch": 0.3348976109215017, "grad_norm": 44.226131439208984, "learning_rate": 7.008086253369273e-06, "loss": 2.7852, "step": 785 }, { "epoch": 0.33532423208191126, "grad_norm": 29.745895385742188, "learning_rate": 7.003593890386344e-06, "loss": 3.3535, "step": 786 }, { "epoch": 0.33575085324232085, "grad_norm": 17.165884017944336, "learning_rate": 6.9991015274034144e-06, "loss": 2.3242, "step": 787 }, { "epoch": 0.3361774744027304, "grad_norm": 17.3034725189209, "learning_rate": 6.994609164420486e-06, "loss": 2.4121, "step": 788 }, { "epoch": 0.3366040955631399, "grad_norm": 35.9029655456543, "learning_rate": 6.9901168014375565e-06, "loss": 3.1836, "step": 789 }, { "epoch": 0.3370307167235495, "grad_norm": 32.665138244628906, "learning_rate": 6.9856244384546276e-06, "loss": 2.7295, "step": 790 }, { "epoch": 0.33745733788395904, "grad_norm": 24.935747146606445, "learning_rate": 6.981132075471699e-06, "loss": 2.9609, "step": 791 }, { "epoch": 0.3378839590443686, "grad_norm": 17.43288230895996, "learning_rate": 6.97663971248877e-06, "loss": 2.5576, "step": 792 }, { "epoch": 0.33831058020477817, "grad_norm": 17.04522705078125, "learning_rate": 6.9721473495058416e-06, "loss": 2.4004, "step": 793 }, { "epoch": 0.3387372013651877, "grad_norm": 36.757266998291016, "learning_rate": 6.967654986522912e-06, "loss": 2.8086, "step": 794 }, { "epoch": 0.3391638225255973, "grad_norm": 27.1303768157959, "learning_rate": 6.963162623539983e-06, "loss": 3.0547, "step": 795 }, { "epoch": 0.3395904436860068, "grad_norm": 19.866275787353516, "learning_rate": 6.958670260557053e-06, "loss": 3.1318, "step": 796 }, { "epoch": 0.34001706484641636, "grad_norm": 33.106712341308594, "learning_rate": 6.954177897574125e-06, "loss": 3.5977, "step": 797 }, { "epoch": 0.34044368600682595, "grad_norm": 22.2625732421875, "learning_rate": 6.949685534591195e-06, "loss": 2.9775, "step": 798 }, { "epoch": 0.3408703071672355, "grad_norm": 39.98801040649414, "learning_rate": 6.945193171608267e-06, "loss": 2.5645, "step": 799 }, { "epoch": 0.3412969283276451, "grad_norm": 28.945308685302734, "learning_rate": 6.940700808625338e-06, "loss": 2.7559, "step": 800 }, { "epoch": 0.3417235494880546, "grad_norm": 32.561458587646484, "learning_rate": 6.936208445642408e-06, "loss": 3.3193, "step": 801 }, { "epoch": 0.34215017064846415, "grad_norm": 19.446264266967773, "learning_rate": 6.93171608265948e-06, "loss": 2.6797, "step": 802 }, { "epoch": 0.34257679180887374, "grad_norm": 14.95886516571045, "learning_rate": 6.92722371967655e-06, "loss": 2.8809, "step": 803 }, { "epoch": 0.3430034129692833, "grad_norm": 27.389297485351562, "learning_rate": 6.922731356693621e-06, "loss": 3.2207, "step": 804 }, { "epoch": 0.3434300341296928, "grad_norm": 25.899202346801758, "learning_rate": 6.9182389937106915e-06, "loss": 2.6055, "step": 805 }, { "epoch": 0.3438566552901024, "grad_norm": 29.216964721679688, "learning_rate": 6.9137466307277634e-06, "loss": 2.9365, "step": 806 }, { "epoch": 0.34428327645051193, "grad_norm": 14.480628967285156, "learning_rate": 6.9092542677448345e-06, "loss": 2.1611, "step": 807 }, { "epoch": 0.3447098976109215, "grad_norm": 21.613162994384766, "learning_rate": 6.9047619047619055e-06, "loss": 2.4648, "step": 808 }, { "epoch": 0.34513651877133106, "grad_norm": 19.232038497924805, "learning_rate": 6.9002695417789766e-06, "loss": 2.6406, "step": 809 }, { "epoch": 0.3455631399317406, "grad_norm": 26.753767013549805, "learning_rate": 6.895777178796047e-06, "loss": 2.8281, "step": 810 }, { "epoch": 0.3459897610921502, "grad_norm": 17.898767471313477, "learning_rate": 6.891284815813119e-06, "loss": 2.5, "step": 811 }, { "epoch": 0.3464163822525597, "grad_norm": 29.41793441772461, "learning_rate": 6.886792452830189e-06, "loss": 2.7646, "step": 812 }, { "epoch": 0.3468430034129693, "grad_norm": 25.555450439453125, "learning_rate": 6.88230008984726e-06, "loss": 3.3203, "step": 813 }, { "epoch": 0.34726962457337884, "grad_norm": 25.43630599975586, "learning_rate": 6.877807726864332e-06, "loss": 3.1045, "step": 814 }, { "epoch": 0.3476962457337884, "grad_norm": 24.84125518798828, "learning_rate": 6.873315363881402e-06, "loss": 2.4229, "step": 815 }, { "epoch": 0.34812286689419797, "grad_norm": 31.87086296081543, "learning_rate": 6.868823000898474e-06, "loss": 2.6289, "step": 816 }, { "epoch": 0.3485494880546075, "grad_norm": 20.92204475402832, "learning_rate": 6.864330637915544e-06, "loss": 2.4424, "step": 817 }, { "epoch": 0.34897610921501704, "grad_norm": 34.5516471862793, "learning_rate": 6.859838274932615e-06, "loss": 2.7666, "step": 818 }, { "epoch": 0.34940273037542663, "grad_norm": 24.447370529174805, "learning_rate": 6.855345911949685e-06, "loss": 2.3027, "step": 819 }, { "epoch": 0.34982935153583616, "grad_norm": 22.048521041870117, "learning_rate": 6.850853548966757e-06, "loss": 2.6621, "step": 820 }, { "epoch": 0.35025597269624575, "grad_norm": 31.585052490234375, "learning_rate": 6.846361185983828e-06, "loss": 2.5371, "step": 821 }, { "epoch": 0.3506825938566553, "grad_norm": 19.84841537475586, "learning_rate": 6.841868823000899e-06, "loss": 2.3789, "step": 822 }, { "epoch": 0.3511092150170648, "grad_norm": 14.99898624420166, "learning_rate": 6.83737646001797e-06, "loss": 2.373, "step": 823 }, { "epoch": 0.3515358361774744, "grad_norm": 18.301692962646484, "learning_rate": 6.8328840970350405e-06, "loss": 2.4795, "step": 824 }, { "epoch": 0.35196245733788395, "grad_norm": 38.46403884887695, "learning_rate": 6.8283917340521124e-06, "loss": 3.001, "step": 825 }, { "epoch": 0.35238907849829354, "grad_norm": 42.38199996948242, "learning_rate": 6.823899371069183e-06, "loss": 2.2314, "step": 826 }, { "epoch": 0.3528156996587031, "grad_norm": 17.83135223388672, "learning_rate": 6.819407008086254e-06, "loss": 2.6631, "step": 827 }, { "epoch": 0.3532423208191126, "grad_norm": 24.800996780395508, "learning_rate": 6.8149146451033256e-06, "loss": 3.1953, "step": 828 }, { "epoch": 0.3536689419795222, "grad_norm": 18.44524383544922, "learning_rate": 6.810422282120396e-06, "loss": 2.6289, "step": 829 }, { "epoch": 0.35409556313993173, "grad_norm": 24.03241539001465, "learning_rate": 6.805929919137468e-06, "loss": 3.0879, "step": 830 }, { "epoch": 0.35452218430034127, "grad_norm": 56.00475311279297, "learning_rate": 6.801437556154538e-06, "loss": 2.2793, "step": 831 }, { "epoch": 0.35494880546075086, "grad_norm": 56.98671340942383, "learning_rate": 6.796945193171609e-06, "loss": 3.3281, "step": 832 }, { "epoch": 0.3553754266211604, "grad_norm": 66.61808013916016, "learning_rate": 6.792452830188679e-06, "loss": 3.208, "step": 833 }, { "epoch": 0.35580204778157, "grad_norm": 47.69834518432617, "learning_rate": 6.787960467205751e-06, "loss": 2.4082, "step": 834 }, { "epoch": 0.3562286689419795, "grad_norm": 20.011539459228516, "learning_rate": 6.783468104222821e-06, "loss": 3.0039, "step": 835 }, { "epoch": 0.35665529010238906, "grad_norm": 22.645479202270508, "learning_rate": 6.778975741239893e-06, "loss": 2.6797, "step": 836 }, { "epoch": 0.35708191126279865, "grad_norm": 25.83390998840332, "learning_rate": 6.774483378256964e-06, "loss": 2.7969, "step": 837 }, { "epoch": 0.3575085324232082, "grad_norm": 25.184431076049805, "learning_rate": 6.769991015274034e-06, "loss": 2.7764, "step": 838 }, { "epoch": 0.35793515358361777, "grad_norm": 24.293928146362305, "learning_rate": 6.765498652291106e-06, "loss": 2.6016, "step": 839 }, { "epoch": 0.3583617747440273, "grad_norm": 17.71709442138672, "learning_rate": 6.761006289308176e-06, "loss": 2.3506, "step": 840 }, { "epoch": 0.35878839590443684, "grad_norm": 32.32875061035156, "learning_rate": 6.7565139263252475e-06, "loss": 2.8857, "step": 841 }, { "epoch": 0.35921501706484643, "grad_norm": 24.71383285522461, "learning_rate": 6.752021563342318e-06, "loss": 3.1406, "step": 842 }, { "epoch": 0.35964163822525597, "grad_norm": 47.8384895324707, "learning_rate": 6.7475292003593895e-06, "loss": 2.833, "step": 843 }, { "epoch": 0.36006825938566556, "grad_norm": 20.34820556640625, "learning_rate": 6.743036837376461e-06, "loss": 3.1543, "step": 844 }, { "epoch": 0.3604948805460751, "grad_norm": 36.99142837524414, "learning_rate": 6.738544474393532e-06, "loss": 2.4814, "step": 845 }, { "epoch": 0.3609215017064846, "grad_norm": 17.918649673461914, "learning_rate": 6.734052111410603e-06, "loss": 3.002, "step": 846 }, { "epoch": 0.3613481228668942, "grad_norm": 43.56641387939453, "learning_rate": 6.729559748427673e-06, "loss": 2.8711, "step": 847 }, { "epoch": 0.36177474402730375, "grad_norm": 19.88298988342285, "learning_rate": 6.725067385444745e-06, "loss": 3.1875, "step": 848 }, { "epoch": 0.3622013651877133, "grad_norm": 16.08139991760254, "learning_rate": 6.720575022461815e-06, "loss": 2.3691, "step": 849 }, { "epoch": 0.3626279863481229, "grad_norm": 27.99295997619629, "learning_rate": 6.716082659478886e-06, "loss": 2.5859, "step": 850 }, { "epoch": 0.3630546075085324, "grad_norm": 18.48909568786621, "learning_rate": 6.711590296495958e-06, "loss": 2.9287, "step": 851 }, { "epoch": 0.363481228668942, "grad_norm": 22.837125778198242, "learning_rate": 6.707097933513028e-06, "loss": 2.5029, "step": 852 }, { "epoch": 0.36390784982935154, "grad_norm": 16.875024795532227, "learning_rate": 6.7026055705301e-06, "loss": 2.5801, "step": 853 }, { "epoch": 0.3643344709897611, "grad_norm": 20.041526794433594, "learning_rate": 6.69811320754717e-06, "loss": 2.9414, "step": 854 }, { "epoch": 0.36476109215017066, "grad_norm": 33.22433853149414, "learning_rate": 6.693620844564241e-06, "loss": 2.9219, "step": 855 }, { "epoch": 0.3651877133105802, "grad_norm": 40.56968688964844, "learning_rate": 6.6891284815813114e-06, "loss": 2.7197, "step": 856 }, { "epoch": 0.3656143344709898, "grad_norm": 21.923696517944336, "learning_rate": 6.684636118598383e-06, "loss": 2.5166, "step": 857 }, { "epoch": 0.3660409556313993, "grad_norm": 34.44352340698242, "learning_rate": 6.680143755615454e-06, "loss": 2.5635, "step": 858 }, { "epoch": 0.36646757679180886, "grad_norm": 18.53885269165039, "learning_rate": 6.675651392632525e-06, "loss": 2.6416, "step": 859 }, { "epoch": 0.36689419795221845, "grad_norm": 18.8934268951416, "learning_rate": 6.6711590296495964e-06, "loss": 2.6191, "step": 860 }, { "epoch": 0.367320819112628, "grad_norm": 38.83974075317383, "learning_rate": 6.666666666666667e-06, "loss": 2.4268, "step": 861 }, { "epoch": 0.3677474402730375, "grad_norm": 20.042360305786133, "learning_rate": 6.6621743036837385e-06, "loss": 2.458, "step": 862 }, { "epoch": 0.3681740614334471, "grad_norm": 36.47145462036133, "learning_rate": 6.657681940700809e-06, "loss": 1.9346, "step": 863 }, { "epoch": 0.36860068259385664, "grad_norm": 28.04621696472168, "learning_rate": 6.65318957771788e-06, "loss": 3.3066, "step": 864 }, { "epoch": 0.36902730375426623, "grad_norm": 21.341005325317383, "learning_rate": 6.648697214734952e-06, "loss": 2.6602, "step": 865 }, { "epoch": 0.36945392491467577, "grad_norm": 31.291488647460938, "learning_rate": 6.644204851752022e-06, "loss": 2.8086, "step": 866 }, { "epoch": 0.3698805460750853, "grad_norm": 32.221710205078125, "learning_rate": 6.639712488769094e-06, "loss": 2.5039, "step": 867 }, { "epoch": 0.3703071672354949, "grad_norm": 29.85544776916504, "learning_rate": 6.635220125786164e-06, "loss": 2.6084, "step": 868 }, { "epoch": 0.37073378839590443, "grad_norm": 29.484418869018555, "learning_rate": 6.630727762803235e-06, "loss": 2.4824, "step": 869 }, { "epoch": 0.371160409556314, "grad_norm": 25.850082397460938, "learning_rate": 6.626235399820305e-06, "loss": 2.4316, "step": 870 }, { "epoch": 0.37158703071672355, "grad_norm": 21.31001091003418, "learning_rate": 6.621743036837377e-06, "loss": 2.5518, "step": 871 }, { "epoch": 0.3720136518771331, "grad_norm": 21.5012149810791, "learning_rate": 6.617250673854448e-06, "loss": 2.6621, "step": 872 }, { "epoch": 0.3724402730375427, "grad_norm": 24.898344039916992, "learning_rate": 6.612758310871519e-06, "loss": 2.2402, "step": 873 }, { "epoch": 0.3728668941979522, "grad_norm": 21.651897430419922, "learning_rate": 6.60826594788859e-06, "loss": 2.7373, "step": 874 }, { "epoch": 0.37329351535836175, "grad_norm": 18.67881202697754, "learning_rate": 6.60377358490566e-06, "loss": 2.4678, "step": 875 }, { "epoch": 0.37372013651877134, "grad_norm": 28.32158088684082, "learning_rate": 6.599281221922732e-06, "loss": 2.6621, "step": 876 }, { "epoch": 0.3741467576791809, "grad_norm": 27.963510513305664, "learning_rate": 6.5947888589398025e-06, "loss": 2.3184, "step": 877 }, { "epoch": 0.37457337883959047, "grad_norm": 18.716800689697266, "learning_rate": 6.5902964959568736e-06, "loss": 2.707, "step": 878 }, { "epoch": 0.375, "grad_norm": 26.4693660736084, "learning_rate": 6.5858041329739454e-06, "loss": 2.8682, "step": 879 }, { "epoch": 0.37542662116040953, "grad_norm": 17.989103317260742, "learning_rate": 6.581311769991016e-06, "loss": 2.5781, "step": 880 }, { "epoch": 0.3758532423208191, "grad_norm": 17.228116989135742, "learning_rate": 6.576819407008087e-06, "loss": 2.1328, "step": 881 }, { "epoch": 0.37627986348122866, "grad_norm": 42.95380401611328, "learning_rate": 6.572327044025158e-06, "loss": 2.8135, "step": 882 }, { "epoch": 0.37670648464163825, "grad_norm": 33.39653778076172, "learning_rate": 6.567834681042229e-06, "loss": 2.9727, "step": 883 }, { "epoch": 0.3771331058020478, "grad_norm": 39.138118743896484, "learning_rate": 6.563342318059299e-06, "loss": 2.7061, "step": 884 }, { "epoch": 0.3775597269624573, "grad_norm": 18.219497680664062, "learning_rate": 6.558849955076371e-06, "loss": 2.7354, "step": 885 }, { "epoch": 0.3779863481228669, "grad_norm": 37.8060302734375, "learning_rate": 6.554357592093442e-06, "loss": 2.9434, "step": 886 }, { "epoch": 0.37841296928327645, "grad_norm": 23.32965660095215, "learning_rate": 6.549865229110512e-06, "loss": 3.0215, "step": 887 }, { "epoch": 0.378839590443686, "grad_norm": 39.0524787902832, "learning_rate": 6.545372866127584e-06, "loss": 2.4316, "step": 888 }, { "epoch": 0.37926621160409557, "grad_norm": 22.534435272216797, "learning_rate": 6.540880503144654e-06, "loss": 2.5332, "step": 889 }, { "epoch": 0.3796928327645051, "grad_norm": 41.38739013671875, "learning_rate": 6.536388140161726e-06, "loss": 2.8721, "step": 890 }, { "epoch": 0.3801194539249147, "grad_norm": 21.326303482055664, "learning_rate": 6.531895777178796e-06, "loss": 2.4785, "step": 891 }, { "epoch": 0.38054607508532423, "grad_norm": 18.502744674682617, "learning_rate": 6.527403414195867e-06, "loss": 2.7476, "step": 892 }, { "epoch": 0.38097269624573377, "grad_norm": 20.264904022216797, "learning_rate": 6.522911051212939e-06, "loss": 2.4033, "step": 893 }, { "epoch": 0.38139931740614336, "grad_norm": 25.84261131286621, "learning_rate": 6.518418688230009e-06, "loss": 2.668, "step": 894 }, { "epoch": 0.3818259385665529, "grad_norm": 39.63558578491211, "learning_rate": 6.5139263252470805e-06, "loss": 2.2871, "step": 895 }, { "epoch": 0.3822525597269625, "grad_norm": 42.330448150634766, "learning_rate": 6.5094339622641515e-06, "loss": 2.7676, "step": 896 }, { "epoch": 0.382679180887372, "grad_norm": 19.066465377807617, "learning_rate": 6.5049415992812226e-06, "loss": 2.4248, "step": 897 }, { "epoch": 0.38310580204778155, "grad_norm": 20.37721061706543, "learning_rate": 6.500449236298293e-06, "loss": 2.2129, "step": 898 }, { "epoch": 0.38353242320819114, "grad_norm": 40.267765045166016, "learning_rate": 6.495956873315365e-06, "loss": 2.5332, "step": 899 }, { "epoch": 0.3839590443686007, "grad_norm": 18.17340850830078, "learning_rate": 6.491464510332436e-06, "loss": 2.3945, "step": 900 }, { "epoch": 0.3843856655290102, "grad_norm": 15.765764236450195, "learning_rate": 6.486972147349506e-06, "loss": 1.9697, "step": 901 }, { "epoch": 0.3848122866894198, "grad_norm": 47.31410598754883, "learning_rate": 6.482479784366578e-06, "loss": 3.4316, "step": 902 }, { "epoch": 0.38523890784982934, "grad_norm": 19.825407028198242, "learning_rate": 6.477987421383648e-06, "loss": 2.2998, "step": 903 }, { "epoch": 0.3856655290102389, "grad_norm": 23.80759048461914, "learning_rate": 6.47349505840072e-06, "loss": 2.8438, "step": 904 }, { "epoch": 0.38609215017064846, "grad_norm": 33.58242416381836, "learning_rate": 6.46900269541779e-06, "loss": 2.7188, "step": 905 }, { "epoch": 0.386518771331058, "grad_norm": 20.253826141357422, "learning_rate": 6.464510332434861e-06, "loss": 2.5635, "step": 906 }, { "epoch": 0.3869453924914676, "grad_norm": 31.54781723022461, "learning_rate": 6.460017969451933e-06, "loss": 2.9951, "step": 907 }, { "epoch": 0.3873720136518771, "grad_norm": 36.786502838134766, "learning_rate": 6.455525606469003e-06, "loss": 3.0371, "step": 908 }, { "epoch": 0.3877986348122867, "grad_norm": 17.36682891845703, "learning_rate": 6.451033243486074e-06, "loss": 2.167, "step": 909 }, { "epoch": 0.38822525597269625, "grad_norm": 21.992660522460938, "learning_rate": 6.446540880503145e-06, "loss": 2.0117, "step": 910 }, { "epoch": 0.3886518771331058, "grad_norm": 17.453601837158203, "learning_rate": 6.442048517520216e-06, "loss": 2.5195, "step": 911 }, { "epoch": 0.3890784982935154, "grad_norm": 17.015531539916992, "learning_rate": 6.4375561545372865e-06, "loss": 2.2314, "step": 912 }, { "epoch": 0.3895051194539249, "grad_norm": 22.681379318237305, "learning_rate": 6.433063791554358e-06, "loss": 3.1436, "step": 913 }, { "epoch": 0.38993174061433444, "grad_norm": 15.419713020324707, "learning_rate": 6.4285714285714295e-06, "loss": 2.5117, "step": 914 }, { "epoch": 0.39035836177474403, "grad_norm": 29.035032272338867, "learning_rate": 6.4240790655885e-06, "loss": 2.6631, "step": 915 }, { "epoch": 0.39078498293515357, "grad_norm": 25.45609474182129, "learning_rate": 6.4195867026055715e-06, "loss": 2.8594, "step": 916 }, { "epoch": 0.39121160409556316, "grad_norm": 26.522363662719727, "learning_rate": 6.415094339622642e-06, "loss": 2.2881, "step": 917 }, { "epoch": 0.3916382252559727, "grad_norm": 31.536636352539062, "learning_rate": 6.410601976639713e-06, "loss": 2.3486, "step": 918 }, { "epoch": 0.39206484641638223, "grad_norm": 25.44186019897461, "learning_rate": 6.406109613656784e-06, "loss": 2.6182, "step": 919 }, { "epoch": 0.3924914675767918, "grad_norm": 37.88819885253906, "learning_rate": 6.401617250673855e-06, "loss": 2.5488, "step": 920 }, { "epoch": 0.39291808873720135, "grad_norm": 17.342376708984375, "learning_rate": 6.397124887690927e-06, "loss": 2.5137, "step": 921 }, { "epoch": 0.39334470989761094, "grad_norm": 25.706722259521484, "learning_rate": 6.392632524707997e-06, "loss": 2.9775, "step": 922 }, { "epoch": 0.3937713310580205, "grad_norm": 18.36219596862793, "learning_rate": 6.388140161725068e-06, "loss": 2.5596, "step": 923 }, { "epoch": 0.39419795221843, "grad_norm": 16.040849685668945, "learning_rate": 6.383647798742138e-06, "loss": 1.9355, "step": 924 }, { "epoch": 0.3946245733788396, "grad_norm": 19.206146240234375, "learning_rate": 6.37915543575921e-06, "loss": 2.5811, "step": 925 }, { "epoch": 0.39505119453924914, "grad_norm": 20.79313087463379, "learning_rate": 6.37466307277628e-06, "loss": 2.4482, "step": 926 }, { "epoch": 0.39547781569965873, "grad_norm": 27.068574905395508, "learning_rate": 6.370170709793352e-06, "loss": 2.7402, "step": 927 }, { "epoch": 0.39590443686006827, "grad_norm": 23.966190338134766, "learning_rate": 6.365678346810423e-06, "loss": 2.9131, "step": 928 }, { "epoch": 0.3963310580204778, "grad_norm": 37.11189651489258, "learning_rate": 6.3611859838274934e-06, "loss": 2.7607, "step": 929 }, { "epoch": 0.3967576791808874, "grad_norm": 28.57893943786621, "learning_rate": 6.356693620844565e-06, "loss": 2.7305, "step": 930 }, { "epoch": 0.3971843003412969, "grad_norm": 21.24115753173828, "learning_rate": 6.3522012578616355e-06, "loss": 2.7012, "step": 931 }, { "epoch": 0.39761092150170646, "grad_norm": 17.273521423339844, "learning_rate": 6.3477088948787066e-06, "loss": 2.6914, "step": 932 }, { "epoch": 0.39803754266211605, "grad_norm": 42.97947692871094, "learning_rate": 6.343216531895778e-06, "loss": 2.5791, "step": 933 }, { "epoch": 0.3984641638225256, "grad_norm": 16.661331176757812, "learning_rate": 6.338724168912849e-06, "loss": 2.7388, "step": 934 }, { "epoch": 0.3988907849829352, "grad_norm": 51.49508285522461, "learning_rate": 6.3342318059299205e-06, "loss": 2.5723, "step": 935 }, { "epoch": 0.3993174061433447, "grad_norm": 50.0458869934082, "learning_rate": 6.329739442946991e-06, "loss": 2.751, "step": 936 }, { "epoch": 0.39974402730375425, "grad_norm": 39.40812683105469, "learning_rate": 6.325247079964062e-06, "loss": 2.7227, "step": 937 }, { "epoch": 0.40017064846416384, "grad_norm": 20.065385818481445, "learning_rate": 6.320754716981132e-06, "loss": 2.6074, "step": 938 }, { "epoch": 0.40059726962457337, "grad_norm": 29.637264251708984, "learning_rate": 6.316262353998204e-06, "loss": 2.4805, "step": 939 }, { "epoch": 0.40102389078498296, "grad_norm": 31.02073860168457, "learning_rate": 6.311769991015274e-06, "loss": 2.5801, "step": 940 }, { "epoch": 0.4014505119453925, "grad_norm": 23.977516174316406, "learning_rate": 6.307277628032346e-06, "loss": 2.4229, "step": 941 }, { "epoch": 0.40187713310580203, "grad_norm": 16.94057273864746, "learning_rate": 6.302785265049417e-06, "loss": 2.3359, "step": 942 }, { "epoch": 0.4023037542662116, "grad_norm": 47.786930084228516, "learning_rate": 6.298292902066487e-06, "loss": 2.2529, "step": 943 }, { "epoch": 0.40273037542662116, "grad_norm": 30.42767333984375, "learning_rate": 6.293800539083559e-06, "loss": 2.6318, "step": 944 }, { "epoch": 0.4031569965870307, "grad_norm": 68.6019515991211, "learning_rate": 6.289308176100629e-06, "loss": 3.3115, "step": 945 }, { "epoch": 0.4035836177474403, "grad_norm": 46.8463020324707, "learning_rate": 6.2848158131177e-06, "loss": 2.9385, "step": 946 }, { "epoch": 0.4040102389078498, "grad_norm": 16.7492618560791, "learning_rate": 6.2803234501347705e-06, "loss": 2.0059, "step": 947 }, { "epoch": 0.4044368600682594, "grad_norm": 19.94270896911621, "learning_rate": 6.2758310871518424e-06, "loss": 2.2812, "step": 948 }, { "epoch": 0.40486348122866894, "grad_norm": 24.3834285736084, "learning_rate": 6.2713387241689135e-06, "loss": 3.5176, "step": 949 }, { "epoch": 0.4052901023890785, "grad_norm": 25.361343383789062, "learning_rate": 6.2668463611859845e-06, "loss": 3.1465, "step": 950 }, { "epoch": 0.40571672354948807, "grad_norm": 33.129478454589844, "learning_rate": 6.2623539982030556e-06, "loss": 2.5137, "step": 951 }, { "epoch": 0.4061433447098976, "grad_norm": 36.047359466552734, "learning_rate": 6.257861635220126e-06, "loss": 2.5098, "step": 952 }, { "epoch": 0.4065699658703072, "grad_norm": 29.85443878173828, "learning_rate": 6.253369272237198e-06, "loss": 2.7441, "step": 953 }, { "epoch": 0.4069965870307167, "grad_norm": 20.3319149017334, "learning_rate": 6.248876909254268e-06, "loss": 2.7578, "step": 954 }, { "epoch": 0.40742320819112626, "grad_norm": 45.543827056884766, "learning_rate": 6.244384546271339e-06, "loss": 2.4893, "step": 955 }, { "epoch": 0.40784982935153585, "grad_norm": 17.98326301574707, "learning_rate": 6.239892183288411e-06, "loss": 2.7275, "step": 956 }, { "epoch": 0.4082764505119454, "grad_norm": 16.175251007080078, "learning_rate": 6.235399820305481e-06, "loss": 2.5791, "step": 957 }, { "epoch": 0.4087030716723549, "grad_norm": 23.90378189086914, "learning_rate": 6.230907457322553e-06, "loss": 2.7363, "step": 958 }, { "epoch": 0.4091296928327645, "grad_norm": 18.764850616455078, "learning_rate": 6.226415094339623e-06, "loss": 2.7949, "step": 959 }, { "epoch": 0.40955631399317405, "grad_norm": 24.244136810302734, "learning_rate": 6.221922731356694e-06, "loss": 2.8447, "step": 960 }, { "epoch": 0.40998293515358364, "grad_norm": 18.434280395507812, "learning_rate": 6.217430368373764e-06, "loss": 2.9453, "step": 961 }, { "epoch": 0.4104095563139932, "grad_norm": 29.534372329711914, "learning_rate": 6.212938005390836e-06, "loss": 2.998, "step": 962 }, { "epoch": 0.4108361774744027, "grad_norm": 16.15813446044922, "learning_rate": 6.208445642407907e-06, "loss": 2.6787, "step": 963 }, { "epoch": 0.4112627986348123, "grad_norm": 20.7001953125, "learning_rate": 6.203953279424978e-06, "loss": 2.666, "step": 964 }, { "epoch": 0.41168941979522183, "grad_norm": 25.749404907226562, "learning_rate": 6.199460916442049e-06, "loss": 3.0195, "step": 965 }, { "epoch": 0.4121160409556314, "grad_norm": 23.610687255859375, "learning_rate": 6.1949685534591195e-06, "loss": 2.418, "step": 966 }, { "epoch": 0.41254266211604096, "grad_norm": 20.643394470214844, "learning_rate": 6.1904761904761914e-06, "loss": 2.5879, "step": 967 }, { "epoch": 0.4129692832764505, "grad_norm": 39.67180252075195, "learning_rate": 6.185983827493262e-06, "loss": 3.2188, "step": 968 }, { "epoch": 0.4133959044368601, "grad_norm": 37.136566162109375, "learning_rate": 6.181491464510333e-06, "loss": 2.4941, "step": 969 }, { "epoch": 0.4138225255972696, "grad_norm": 23.125782012939453, "learning_rate": 6.1769991015274046e-06, "loss": 2.5205, "step": 970 }, { "epoch": 0.41424914675767915, "grad_norm": 17.800636291503906, "learning_rate": 6.172506738544475e-06, "loss": 2.4541, "step": 971 }, { "epoch": 0.41467576791808874, "grad_norm": 21.861169815063477, "learning_rate": 6.168014375561547e-06, "loss": 2.6475, "step": 972 }, { "epoch": 0.4151023890784983, "grad_norm": 14.438638687133789, "learning_rate": 6.163522012578617e-06, "loss": 2.4336, "step": 973 }, { "epoch": 0.41552901023890787, "grad_norm": 29.65276527404785, "learning_rate": 6.159029649595688e-06, "loss": 2.7949, "step": 974 }, { "epoch": 0.4159556313993174, "grad_norm": 24.857969284057617, "learning_rate": 6.154537286612758e-06, "loss": 2.6533, "step": 975 }, { "epoch": 0.41638225255972694, "grad_norm": 20.298656463623047, "learning_rate": 6.15004492362983e-06, "loss": 2.8877, "step": 976 }, { "epoch": 0.41680887372013653, "grad_norm": 20.43273162841797, "learning_rate": 6.145552560646901e-06, "loss": 3.293, "step": 977 }, { "epoch": 0.41723549488054607, "grad_norm": 15.368301391601562, "learning_rate": 6.141060197663972e-06, "loss": 2.7148, "step": 978 }, { "epoch": 0.41766211604095566, "grad_norm": 18.29572296142578, "learning_rate": 6.136567834681043e-06, "loss": 2.4434, "step": 979 }, { "epoch": 0.4180887372013652, "grad_norm": 16.30262565612793, "learning_rate": 6.132075471698113e-06, "loss": 2.5898, "step": 980 }, { "epoch": 0.4185153583617747, "grad_norm": 21.400785446166992, "learning_rate": 6.127583108715185e-06, "loss": 2.7266, "step": 981 }, { "epoch": 0.4189419795221843, "grad_norm": 29.911426544189453, "learning_rate": 6.123090745732255e-06, "loss": 2.6367, "step": 982 }, { "epoch": 0.41936860068259385, "grad_norm": 22.61137580871582, "learning_rate": 6.1185983827493264e-06, "loss": 3.0137, "step": 983 }, { "epoch": 0.4197952218430034, "grad_norm": 12.429327011108398, "learning_rate": 6.114106019766398e-06, "loss": 2.0137, "step": 984 }, { "epoch": 0.420221843003413, "grad_norm": 63.111473083496094, "learning_rate": 6.1096136567834685e-06, "loss": 2.8916, "step": 985 }, { "epoch": 0.4206484641638225, "grad_norm": 24.865880966186523, "learning_rate": 6.1051212938005396e-06, "loss": 2.5361, "step": 986 }, { "epoch": 0.4210750853242321, "grad_norm": 24.293546676635742, "learning_rate": 6.100628930817611e-06, "loss": 2.6357, "step": 987 }, { "epoch": 0.42150170648464164, "grad_norm": 16.23811149597168, "learning_rate": 6.096136567834682e-06, "loss": 2.1338, "step": 988 }, { "epoch": 0.42192832764505117, "grad_norm": 21.733821868896484, "learning_rate": 6.091644204851752e-06, "loss": 2.373, "step": 989 }, { "epoch": 0.42235494880546076, "grad_norm": 18.73717498779297, "learning_rate": 6.087151841868824e-06, "loss": 2.2764, "step": 990 }, { "epoch": 0.4227815699658703, "grad_norm": 26.71270751953125, "learning_rate": 6.082659478885895e-06, "loss": 2.4351, "step": 991 }, { "epoch": 0.4232081911262799, "grad_norm": 22.92618751525879, "learning_rate": 6.078167115902965e-06, "loss": 3.0625, "step": 992 }, { "epoch": 0.4236348122866894, "grad_norm": 38.981849670410156, "learning_rate": 6.073674752920037e-06, "loss": 2.9932, "step": 993 }, { "epoch": 0.42406143344709896, "grad_norm": 45.898590087890625, "learning_rate": 6.069182389937107e-06, "loss": 3.1201, "step": 994 }, { "epoch": 0.42448805460750855, "grad_norm": 47.92092514038086, "learning_rate": 6.064690026954179e-06, "loss": 2.6689, "step": 995 }, { "epoch": 0.4249146757679181, "grad_norm": 33.29727554321289, "learning_rate": 6.060197663971249e-06, "loss": 1.7715, "step": 996 }, { "epoch": 0.4253412969283277, "grad_norm": 19.22433853149414, "learning_rate": 6.05570530098832e-06, "loss": 2.8066, "step": 997 }, { "epoch": 0.4257679180887372, "grad_norm": 27.66653060913086, "learning_rate": 6.05121293800539e-06, "loss": 2.918, "step": 998 }, { "epoch": 0.42619453924914674, "grad_norm": 49.86943817138672, "learning_rate": 6.046720575022462e-06, "loss": 2.5547, "step": 999 }, { "epoch": 0.42662116040955633, "grad_norm": 75.04938507080078, "learning_rate": 6.042228212039533e-06, "loss": 2.75, "step": 1000 }, { "epoch": 0.42704778156996587, "grad_norm": 22.91497802734375, "learning_rate": 6.037735849056604e-06, "loss": 3.0762, "step": 1001 }, { "epoch": 0.4274744027303754, "grad_norm": 53.352027893066406, "learning_rate": 6.0332434860736754e-06, "loss": 2.4795, "step": 1002 }, { "epoch": 0.427901023890785, "grad_norm": 45.149879455566406, "learning_rate": 6.028751123090746e-06, "loss": 2.5918, "step": 1003 }, { "epoch": 0.4283276450511945, "grad_norm": 51.942405700683594, "learning_rate": 6.0242587601078175e-06, "loss": 2.8516, "step": 1004 }, { "epoch": 0.4287542662116041, "grad_norm": 31.42732048034668, "learning_rate": 6.019766397124888e-06, "loss": 2.7529, "step": 1005 }, { "epoch": 0.42918088737201365, "grad_norm": 22.029170989990234, "learning_rate": 6.015274034141959e-06, "loss": 2.2666, "step": 1006 }, { "epoch": 0.4296075085324232, "grad_norm": 32.57006072998047, "learning_rate": 6.010781671159031e-06, "loss": 2.4951, "step": 1007 }, { "epoch": 0.4300341296928328, "grad_norm": 21.854209899902344, "learning_rate": 6.006289308176101e-06, "loss": 2.7588, "step": 1008 }, { "epoch": 0.4304607508532423, "grad_norm": 24.148046493530273, "learning_rate": 6.001796945193173e-06, "loss": 2.6313, "step": 1009 }, { "epoch": 0.4308873720136519, "grad_norm": 25.809846878051758, "learning_rate": 5.997304582210243e-06, "loss": 2.6152, "step": 1010 }, { "epoch": 0.43131399317406144, "grad_norm": 19.045398712158203, "learning_rate": 5.992812219227314e-06, "loss": 2.416, "step": 1011 }, { "epoch": 0.431740614334471, "grad_norm": 21.391023635864258, "learning_rate": 5.988319856244384e-06, "loss": 2.3564, "step": 1012 }, { "epoch": 0.43216723549488056, "grad_norm": 44.32109451293945, "learning_rate": 5.983827493261456e-06, "loss": 2.8799, "step": 1013 }, { "epoch": 0.4325938566552901, "grad_norm": 16.042848587036133, "learning_rate": 5.979335130278527e-06, "loss": 2.3809, "step": 1014 }, { "epoch": 0.43302047781569963, "grad_norm": 40.12324523925781, "learning_rate": 5.974842767295598e-06, "loss": 2.1445, "step": 1015 }, { "epoch": 0.4334470989761092, "grad_norm": 16.52766227722168, "learning_rate": 5.970350404312669e-06, "loss": 2.4209, "step": 1016 }, { "epoch": 0.43387372013651876, "grad_norm": 30.28310775756836, "learning_rate": 5.965858041329739e-06, "loss": 2.2705, "step": 1017 }, { "epoch": 0.43430034129692835, "grad_norm": 63.12556457519531, "learning_rate": 5.961365678346811e-06, "loss": 2.4072, "step": 1018 }, { "epoch": 0.4347269624573379, "grad_norm": 30.52492904663086, "learning_rate": 5.9568733153638815e-06, "loss": 3.2734, "step": 1019 }, { "epoch": 0.4351535836177474, "grad_norm": 16.001874923706055, "learning_rate": 5.9523809523809525e-06, "loss": 2.207, "step": 1020 }, { "epoch": 0.435580204778157, "grad_norm": 27.28630828857422, "learning_rate": 5.9478885893980244e-06, "loss": 2.6582, "step": 1021 }, { "epoch": 0.43600682593856654, "grad_norm": 20.705059051513672, "learning_rate": 5.943396226415095e-06, "loss": 2.3154, "step": 1022 }, { "epoch": 0.43643344709897613, "grad_norm": 25.739479064941406, "learning_rate": 5.938903863432166e-06, "loss": 2.8486, "step": 1023 }, { "epoch": 0.43686006825938567, "grad_norm": 24.382295608520508, "learning_rate": 5.934411500449237e-06, "loss": 2.4648, "step": 1024 }, { "epoch": 0.4372866894197952, "grad_norm": 39.85039138793945, "learning_rate": 5.929919137466308e-06, "loss": 2.2539, "step": 1025 }, { "epoch": 0.4377133105802048, "grad_norm": 24.816200256347656, "learning_rate": 5.925426774483378e-06, "loss": 2.377, "step": 1026 }, { "epoch": 0.43813993174061433, "grad_norm": 38.931419372558594, "learning_rate": 5.92093441150045e-06, "loss": 2.5742, "step": 1027 }, { "epoch": 0.43856655290102387, "grad_norm": 19.670873641967773, "learning_rate": 5.916442048517521e-06, "loss": 2.4814, "step": 1028 }, { "epoch": 0.43899317406143346, "grad_norm": 18.52810287475586, "learning_rate": 5.911949685534591e-06, "loss": 2.8203, "step": 1029 }, { "epoch": 0.439419795221843, "grad_norm": 26.418214797973633, "learning_rate": 5.907457322551663e-06, "loss": 2.2998, "step": 1030 }, { "epoch": 0.4398464163822526, "grad_norm": 19.35175895690918, "learning_rate": 5.902964959568733e-06, "loss": 2.8086, "step": 1031 }, { "epoch": 0.4402730375426621, "grad_norm": 46.12089157104492, "learning_rate": 5.898472596585805e-06, "loss": 2.4053, "step": 1032 }, { "epoch": 0.44069965870307165, "grad_norm": 19.147022247314453, "learning_rate": 5.893980233602875e-06, "loss": 2.4072, "step": 1033 }, { "epoch": 0.44112627986348124, "grad_norm": 17.690813064575195, "learning_rate": 5.889487870619946e-06, "loss": 2.7412, "step": 1034 }, { "epoch": 0.4415529010238908, "grad_norm": 55.85981750488281, "learning_rate": 5.884995507637018e-06, "loss": 2.5459, "step": 1035 }, { "epoch": 0.44197952218430037, "grad_norm": 19.12326431274414, "learning_rate": 5.880503144654088e-06, "loss": 2.3145, "step": 1036 }, { "epoch": 0.4424061433447099, "grad_norm": 22.143829345703125, "learning_rate": 5.8760107816711595e-06, "loss": 2.1816, "step": 1037 }, { "epoch": 0.44283276450511944, "grad_norm": 16.12741470336914, "learning_rate": 5.8715184186882305e-06, "loss": 2.2637, "step": 1038 }, { "epoch": 0.443259385665529, "grad_norm": 24.46463966369629, "learning_rate": 5.8670260557053015e-06, "loss": 2.6064, "step": 1039 }, { "epoch": 0.44368600682593856, "grad_norm": 17.870634078979492, "learning_rate": 5.862533692722372e-06, "loss": 2.2676, "step": 1040 }, { "epoch": 0.4441126279863481, "grad_norm": 55.32482147216797, "learning_rate": 5.858041329739444e-06, "loss": 3.1143, "step": 1041 }, { "epoch": 0.4445392491467577, "grad_norm": 18.651668548583984, "learning_rate": 5.853548966756515e-06, "loss": 2.3745, "step": 1042 }, { "epoch": 0.4449658703071672, "grad_norm": 38.92100524902344, "learning_rate": 5.849056603773585e-06, "loss": 3.0059, "step": 1043 }, { "epoch": 0.4453924914675768, "grad_norm": 24.060253143310547, "learning_rate": 5.844564240790657e-06, "loss": 2.9922, "step": 1044 }, { "epoch": 0.44581911262798635, "grad_norm": 19.545761108398438, "learning_rate": 5.840071877807727e-06, "loss": 2.333, "step": 1045 }, { "epoch": 0.4462457337883959, "grad_norm": 31.55517578125, "learning_rate": 5.835579514824799e-06, "loss": 3.0957, "step": 1046 }, { "epoch": 0.4466723549488055, "grad_norm": 18.396970748901367, "learning_rate": 5.831087151841869e-06, "loss": 1.8457, "step": 1047 }, { "epoch": 0.447098976109215, "grad_norm": 27.257366180419922, "learning_rate": 5.82659478885894e-06, "loss": 2.8574, "step": 1048 }, { "epoch": 0.4475255972696246, "grad_norm": 21.9024658203125, "learning_rate": 5.822102425876012e-06, "loss": 2.2686, "step": 1049 }, { "epoch": 0.44795221843003413, "grad_norm": 22.36772918701172, "learning_rate": 5.817610062893082e-06, "loss": 2.4717, "step": 1050 }, { "epoch": 0.44837883959044367, "grad_norm": 24.611875534057617, "learning_rate": 5.813117699910153e-06, "loss": 2.5195, "step": 1051 }, { "epoch": 0.44880546075085326, "grad_norm": 18.70650863647461, "learning_rate": 5.808625336927224e-06, "loss": 2.3916, "step": 1052 }, { "epoch": 0.4492320819112628, "grad_norm": 15.388991355895996, "learning_rate": 5.804132973944295e-06, "loss": 2.4912, "step": 1053 }, { "epoch": 0.4496587030716723, "grad_norm": 21.341957092285156, "learning_rate": 5.7996406109613655e-06, "loss": 2.6201, "step": 1054 }, { "epoch": 0.4500853242320819, "grad_norm": 22.18140411376953, "learning_rate": 5.795148247978437e-06, "loss": 2.8838, "step": 1055 }, { "epoch": 0.45051194539249145, "grad_norm": 17.50536346435547, "learning_rate": 5.7906558849955085e-06, "loss": 2.6729, "step": 1056 }, { "epoch": 0.45093856655290104, "grad_norm": 21.7335205078125, "learning_rate": 5.786163522012579e-06, "loss": 2.666, "step": 1057 }, { "epoch": 0.4513651877133106, "grad_norm": 24.962146759033203, "learning_rate": 5.7816711590296505e-06, "loss": 2.1211, "step": 1058 }, { "epoch": 0.4517918088737201, "grad_norm": 20.209169387817383, "learning_rate": 5.777178796046721e-06, "loss": 2.5088, "step": 1059 }, { "epoch": 0.4522184300341297, "grad_norm": 18.95348358154297, "learning_rate": 5.772686433063792e-06, "loss": 2.6123, "step": 1060 }, { "epoch": 0.45264505119453924, "grad_norm": 16.449033737182617, "learning_rate": 5.768194070080863e-06, "loss": 2.29, "step": 1061 }, { "epoch": 0.45307167235494883, "grad_norm": 19.58086585998535, "learning_rate": 5.763701707097934e-06, "loss": 2.4629, "step": 1062 }, { "epoch": 0.45349829351535836, "grad_norm": 28.654550552368164, "learning_rate": 5.759209344115006e-06, "loss": 2.5645, "step": 1063 }, { "epoch": 0.4539249146757679, "grad_norm": 38.15443801879883, "learning_rate": 5.754716981132076e-06, "loss": 2.3252, "step": 1064 }, { "epoch": 0.4543515358361775, "grad_norm": 18.46957778930664, "learning_rate": 5.750224618149147e-06, "loss": 2.9434, "step": 1065 }, { "epoch": 0.454778156996587, "grad_norm": 30.695247650146484, "learning_rate": 5.745732255166217e-06, "loss": 3.1504, "step": 1066 }, { "epoch": 0.4552047781569966, "grad_norm": 20.049087524414062, "learning_rate": 5.741239892183289e-06, "loss": 2.0674, "step": 1067 }, { "epoch": 0.45563139931740615, "grad_norm": 17.473054885864258, "learning_rate": 5.736747529200359e-06, "loss": 2.3477, "step": 1068 }, { "epoch": 0.4560580204778157, "grad_norm": 20.370532989501953, "learning_rate": 5.732255166217431e-06, "loss": 2.1992, "step": 1069 }, { "epoch": 0.4564846416382253, "grad_norm": 24.673768997192383, "learning_rate": 5.727762803234502e-06, "loss": 2.333, "step": 1070 }, { "epoch": 0.4569112627986348, "grad_norm": 26.14463233947754, "learning_rate": 5.723270440251572e-06, "loss": 2.3564, "step": 1071 }, { "epoch": 0.45733788395904434, "grad_norm": 20.086162567138672, "learning_rate": 5.718778077268644e-06, "loss": 2.4512, "step": 1072 }, { "epoch": 0.45776450511945393, "grad_norm": 20.043794631958008, "learning_rate": 5.7142857142857145e-06, "loss": 2.8096, "step": 1073 }, { "epoch": 0.45819112627986347, "grad_norm": 40.026126861572266, "learning_rate": 5.7097933513027856e-06, "loss": 2.2119, "step": 1074 }, { "epoch": 0.45861774744027306, "grad_norm": 25.369979858398438, "learning_rate": 5.705300988319857e-06, "loss": 2.5371, "step": 1075 }, { "epoch": 0.4590443686006826, "grad_norm": 22.0860538482666, "learning_rate": 5.700808625336928e-06, "loss": 2.9932, "step": 1076 }, { "epoch": 0.45947098976109213, "grad_norm": 31.264373779296875, "learning_rate": 5.6963162623539995e-06, "loss": 2.9736, "step": 1077 }, { "epoch": 0.4598976109215017, "grad_norm": 59.6558723449707, "learning_rate": 5.69182389937107e-06, "loss": 2.7861, "step": 1078 }, { "epoch": 0.46032423208191126, "grad_norm": 20.534461975097656, "learning_rate": 5.687331536388141e-06, "loss": 3.2305, "step": 1079 }, { "epoch": 0.46075085324232085, "grad_norm": 20.675729751586914, "learning_rate": 5.682839173405211e-06, "loss": 2.6768, "step": 1080 }, { "epoch": 0.4611774744027304, "grad_norm": 36.32667922973633, "learning_rate": 5.678346810422283e-06, "loss": 2.7354, "step": 1081 }, { "epoch": 0.4616040955631399, "grad_norm": 18.81818199157715, "learning_rate": 5.673854447439353e-06, "loss": 2.4863, "step": 1082 }, { "epoch": 0.4620307167235495, "grad_norm": 22.678890228271484, "learning_rate": 5.669362084456425e-06, "loss": 2.2812, "step": 1083 }, { "epoch": 0.46245733788395904, "grad_norm": 52.0784912109375, "learning_rate": 5.664869721473496e-06, "loss": 2.6211, "step": 1084 }, { "epoch": 0.4628839590443686, "grad_norm": 16.65648078918457, "learning_rate": 5.660377358490566e-06, "loss": 2.6953, "step": 1085 }, { "epoch": 0.46331058020477817, "grad_norm": 15.564534187316895, "learning_rate": 5.655884995507638e-06, "loss": 2.5234, "step": 1086 }, { "epoch": 0.4637372013651877, "grad_norm": 14.748406410217285, "learning_rate": 5.651392632524708e-06, "loss": 2.1904, "step": 1087 }, { "epoch": 0.4641638225255973, "grad_norm": 36.08272933959961, "learning_rate": 5.646900269541779e-06, "loss": 2.499, "step": 1088 }, { "epoch": 0.4645904436860068, "grad_norm": 31.064861297607422, "learning_rate": 5.64240790655885e-06, "loss": 2.5146, "step": 1089 }, { "epoch": 0.46501706484641636, "grad_norm": 20.036354064941406, "learning_rate": 5.637915543575921e-06, "loss": 2.8281, "step": 1090 }, { "epoch": 0.46544368600682595, "grad_norm": 16.89986801147461, "learning_rate": 5.6334231805929925e-06, "loss": 2.3643, "step": 1091 }, { "epoch": 0.4658703071672355, "grad_norm": 21.826248168945312, "learning_rate": 5.6289308176100635e-06, "loss": 2.0293, "step": 1092 }, { "epoch": 0.4662969283276451, "grad_norm": 23.403091430664062, "learning_rate": 5.6244384546271346e-06, "loss": 2.0586, "step": 1093 }, { "epoch": 0.4667235494880546, "grad_norm": 19.01036834716797, "learning_rate": 5.619946091644205e-06, "loss": 2.2705, "step": 1094 }, { "epoch": 0.46715017064846415, "grad_norm": 40.55333709716797, "learning_rate": 5.615453728661277e-06, "loss": 2.6978, "step": 1095 }, { "epoch": 0.46757679180887374, "grad_norm": 29.737985610961914, "learning_rate": 5.610961365678347e-06, "loss": 2.5518, "step": 1096 }, { "epoch": 0.4680034129692833, "grad_norm": 23.35071563720703, "learning_rate": 5.606469002695418e-06, "loss": 2.7148, "step": 1097 }, { "epoch": 0.4684300341296928, "grad_norm": 23.68649673461914, "learning_rate": 5.60197663971249e-06, "loss": 2.626, "step": 1098 }, { "epoch": 0.4688566552901024, "grad_norm": 31.265790939331055, "learning_rate": 5.59748427672956e-06, "loss": 3.2021, "step": 1099 }, { "epoch": 0.46928327645051193, "grad_norm": 28.331497192382812, "learning_rate": 5.592991913746632e-06, "loss": 2.165, "step": 1100 }, { "epoch": 0.4697098976109215, "grad_norm": 21.12152862548828, "learning_rate": 5.588499550763702e-06, "loss": 2.186, "step": 1101 }, { "epoch": 0.47013651877133106, "grad_norm": 22.582916259765625, "learning_rate": 5.584007187780773e-06, "loss": 2.5117, "step": 1102 }, { "epoch": 0.4705631399317406, "grad_norm": 19.8083438873291, "learning_rate": 5.579514824797843e-06, "loss": 2.0576, "step": 1103 }, { "epoch": 0.4709897610921502, "grad_norm": 25.670671463012695, "learning_rate": 5.575022461814915e-06, "loss": 2.7871, "step": 1104 }, { "epoch": 0.4714163822525597, "grad_norm": 26.358726501464844, "learning_rate": 5.570530098831986e-06, "loss": 2.3057, "step": 1105 }, { "epoch": 0.4718430034129693, "grad_norm": 15.725049018859863, "learning_rate": 5.566037735849057e-06, "loss": 2.6221, "step": 1106 }, { "epoch": 0.47226962457337884, "grad_norm": 32.615047454833984, "learning_rate": 5.561545372866128e-06, "loss": 2.1543, "step": 1107 }, { "epoch": 0.4726962457337884, "grad_norm": 19.219602584838867, "learning_rate": 5.5570530098831985e-06, "loss": 2.4561, "step": 1108 }, { "epoch": 0.47312286689419797, "grad_norm": 18.56755828857422, "learning_rate": 5.55256064690027e-06, "loss": 2.2778, "step": 1109 }, { "epoch": 0.4735494880546075, "grad_norm": 15.861446380615234, "learning_rate": 5.548068283917341e-06, "loss": 2.4023, "step": 1110 }, { "epoch": 0.47397610921501704, "grad_norm": 34.29450225830078, "learning_rate": 5.543575920934412e-06, "loss": 2.4219, "step": 1111 }, { "epoch": 0.47440273037542663, "grad_norm": 28.189361572265625, "learning_rate": 5.5390835579514835e-06, "loss": 2.7207, "step": 1112 }, { "epoch": 0.47482935153583616, "grad_norm": 34.98194885253906, "learning_rate": 5.534591194968554e-06, "loss": 2.5635, "step": 1113 }, { "epoch": 0.47525597269624575, "grad_norm": 16.712553024291992, "learning_rate": 5.530098831985626e-06, "loss": 2.1338, "step": 1114 }, { "epoch": 0.4756825938566553, "grad_norm": 24.47882843017578, "learning_rate": 5.525606469002696e-06, "loss": 2.1953, "step": 1115 }, { "epoch": 0.4761092150170648, "grad_norm": 42.815486907958984, "learning_rate": 5.521114106019767e-06, "loss": 2.4131, "step": 1116 }, { "epoch": 0.4765358361774744, "grad_norm": 27.543291091918945, "learning_rate": 5.516621743036837e-06, "loss": 2.3418, "step": 1117 }, { "epoch": 0.47696245733788395, "grad_norm": 26.759994506835938, "learning_rate": 5.512129380053909e-06, "loss": 3.208, "step": 1118 }, { "epoch": 0.47738907849829354, "grad_norm": 27.660327911376953, "learning_rate": 5.50763701707098e-06, "loss": 2.2578, "step": 1119 }, { "epoch": 0.4778156996587031, "grad_norm": 20.498302459716797, "learning_rate": 5.503144654088051e-06, "loss": 2.8809, "step": 1120 }, { "epoch": 0.4782423208191126, "grad_norm": 33.10205841064453, "learning_rate": 5.498652291105122e-06, "loss": 2.8047, "step": 1121 }, { "epoch": 0.4786689419795222, "grad_norm": 18.99040985107422, "learning_rate": 5.494159928122192e-06, "loss": 2.4971, "step": 1122 }, { "epoch": 0.47909556313993173, "grad_norm": 20.12007713317871, "learning_rate": 5.489667565139264e-06, "loss": 2.4883, "step": 1123 }, { "epoch": 0.47952218430034127, "grad_norm": 34.227657318115234, "learning_rate": 5.485175202156334e-06, "loss": 2.0781, "step": 1124 }, { "epoch": 0.47994880546075086, "grad_norm": 18.170772552490234, "learning_rate": 5.4806828391734054e-06, "loss": 2.3389, "step": 1125 }, { "epoch": 0.4803754266211604, "grad_norm": 22.892549514770508, "learning_rate": 5.476190476190477e-06, "loss": 2.0605, "step": 1126 }, { "epoch": 0.48080204778157, "grad_norm": 14.68527889251709, "learning_rate": 5.4716981132075475e-06, "loss": 2.248, "step": 1127 }, { "epoch": 0.4812286689419795, "grad_norm": 23.0800838470459, "learning_rate": 5.4672057502246186e-06, "loss": 2.1816, "step": 1128 }, { "epoch": 0.48165529010238906, "grad_norm": 28.375272750854492, "learning_rate": 5.46271338724169e-06, "loss": 2.5811, "step": 1129 }, { "epoch": 0.48208191126279865, "grad_norm": 64.8277587890625, "learning_rate": 5.458221024258761e-06, "loss": 3.0557, "step": 1130 }, { "epoch": 0.4825085324232082, "grad_norm": 52.17132568359375, "learning_rate": 5.453728661275831e-06, "loss": 2.6113, "step": 1131 }, { "epoch": 0.48293515358361777, "grad_norm": 19.48263168334961, "learning_rate": 5.449236298292903e-06, "loss": 2.0498, "step": 1132 }, { "epoch": 0.4833617747440273, "grad_norm": 18.925079345703125, "learning_rate": 5.444743935309974e-06, "loss": 2.9893, "step": 1133 }, { "epoch": 0.48378839590443684, "grad_norm": 19.89716148376465, "learning_rate": 5.440251572327044e-06, "loss": 2.7705, "step": 1134 }, { "epoch": 0.48421501706484643, "grad_norm": 28.375167846679688, "learning_rate": 5.435759209344116e-06, "loss": 2.8643, "step": 1135 }, { "epoch": 0.48464163822525597, "grad_norm": 22.519615173339844, "learning_rate": 5.431266846361186e-06, "loss": 2.5166, "step": 1136 }, { "epoch": 0.48506825938566556, "grad_norm": 16.866195678710938, "learning_rate": 5.426774483378258e-06, "loss": 2.5, "step": 1137 }, { "epoch": 0.4854948805460751, "grad_norm": 36.9367790222168, "learning_rate": 5.422282120395328e-06, "loss": 2.8838, "step": 1138 }, { "epoch": 0.4859215017064846, "grad_norm": 51.1767463684082, "learning_rate": 5.417789757412399e-06, "loss": 2.6123, "step": 1139 }, { "epoch": 0.4863481228668942, "grad_norm": 18.42434310913086, "learning_rate": 5.413297394429471e-06, "loss": 2.2568, "step": 1140 }, { "epoch": 0.48677474402730375, "grad_norm": 20.477943420410156, "learning_rate": 5.408805031446541e-06, "loss": 2.4121, "step": 1141 }, { "epoch": 0.4872013651877133, "grad_norm": 23.394515991210938, "learning_rate": 5.404312668463612e-06, "loss": 2.3174, "step": 1142 }, { "epoch": 0.4876279863481229, "grad_norm": 15.655838966369629, "learning_rate": 5.399820305480683e-06, "loss": 2.4307, "step": 1143 }, { "epoch": 0.4880546075085324, "grad_norm": 17.30601692199707, "learning_rate": 5.3953279424977544e-06, "loss": 2.1406, "step": 1144 }, { "epoch": 0.488481228668942, "grad_norm": 17.408594131469727, "learning_rate": 5.390835579514825e-06, "loss": 2.4082, "step": 1145 }, { "epoch": 0.48890784982935154, "grad_norm": 20.133886337280273, "learning_rate": 5.3863432165318965e-06, "loss": 2.4775, "step": 1146 }, { "epoch": 0.4893344709897611, "grad_norm": 28.761140823364258, "learning_rate": 5.3818508535489676e-06, "loss": 2.376, "step": 1147 }, { "epoch": 0.48976109215017066, "grad_norm": 20.112117767333984, "learning_rate": 5.377358490566038e-06, "loss": 2.5723, "step": 1148 }, { "epoch": 0.4901877133105802, "grad_norm": 33.22997283935547, "learning_rate": 5.37286612758311e-06, "loss": 2.4688, "step": 1149 }, { "epoch": 0.4906143344709898, "grad_norm": 31.31877899169922, "learning_rate": 5.36837376460018e-06, "loss": 2.4941, "step": 1150 }, { "epoch": 0.4910409556313993, "grad_norm": 27.41322135925293, "learning_rate": 5.363881401617252e-06, "loss": 3.0059, "step": 1151 }, { "epoch": 0.49146757679180886, "grad_norm": 23.11745262145996, "learning_rate": 5.359389038634322e-06, "loss": 2.6611, "step": 1152 }, { "epoch": 0.49189419795221845, "grad_norm": 22.89065933227539, "learning_rate": 5.354896675651393e-06, "loss": 2.7139, "step": 1153 }, { "epoch": 0.492320819112628, "grad_norm": 17.799470901489258, "learning_rate": 5.350404312668463e-06, "loss": 2.0234, "step": 1154 }, { "epoch": 0.4927474402730375, "grad_norm": 24.692134857177734, "learning_rate": 5.345911949685535e-06, "loss": 2.5801, "step": 1155 }, { "epoch": 0.4931740614334471, "grad_norm": 23.89969825744629, "learning_rate": 5.341419586702606e-06, "loss": 2.4844, "step": 1156 }, { "epoch": 0.49360068259385664, "grad_norm": 18.586627960205078, "learning_rate": 5.336927223719677e-06, "loss": 2.125, "step": 1157 }, { "epoch": 0.49402730375426623, "grad_norm": 25.83800506591797, "learning_rate": 5.332434860736748e-06, "loss": 2.7432, "step": 1158 }, { "epoch": 0.49445392491467577, "grad_norm": 18.455707550048828, "learning_rate": 5.327942497753818e-06, "loss": 2.458, "step": 1159 }, { "epoch": 0.4948805460750853, "grad_norm": 29.522663116455078, "learning_rate": 5.32345013477089e-06, "loss": 2.7256, "step": 1160 }, { "epoch": 0.4953071672354949, "grad_norm": 33.990966796875, "learning_rate": 5.3189577717879605e-06, "loss": 2.5508, "step": 1161 }, { "epoch": 0.49573378839590443, "grad_norm": 37.19002914428711, "learning_rate": 5.3144654088050315e-06, "loss": 2.1768, "step": 1162 }, { "epoch": 0.496160409556314, "grad_norm": 17.45855712890625, "learning_rate": 5.3099730458221034e-06, "loss": 2.3623, "step": 1163 }, { "epoch": 0.49658703071672355, "grad_norm": 21.004287719726562, "learning_rate": 5.305480682839174e-06, "loss": 2.5498, "step": 1164 }, { "epoch": 0.4970136518771331, "grad_norm": 18.135629653930664, "learning_rate": 5.300988319856245e-06, "loss": 2.5381, "step": 1165 }, { "epoch": 0.4974402730375427, "grad_norm": 54.42517852783203, "learning_rate": 5.296495956873316e-06, "loss": 3.4648, "step": 1166 }, { "epoch": 0.4978668941979522, "grad_norm": 23.84809684753418, "learning_rate": 5.292003593890387e-06, "loss": 2.7002, "step": 1167 }, { "epoch": 0.49829351535836175, "grad_norm": 37.697021484375, "learning_rate": 5.287511230907457e-06, "loss": 2.8711, "step": 1168 }, { "epoch": 0.49872013651877134, "grad_norm": 50.69107437133789, "learning_rate": 5.283018867924529e-06, "loss": 2.2715, "step": 1169 }, { "epoch": 0.4991467576791809, "grad_norm": 32.984683990478516, "learning_rate": 5.2785265049416e-06, "loss": 2.6553, "step": 1170 }, { "epoch": 0.49957337883959047, "grad_norm": 19.88435173034668, "learning_rate": 5.27403414195867e-06, "loss": 2.3164, "step": 1171 }, { "epoch": 0.5, "grad_norm": 27.762235641479492, "learning_rate": 5.269541778975742e-06, "loss": 2.9717, "step": 1172 }, { "epoch": 0.5004266211604096, "grad_norm": 17.994325637817383, "learning_rate": 5.265049415992812e-06, "loss": 2.2549, "step": 1173 }, { "epoch": 0.5008532423208191, "grad_norm": 16.808128356933594, "learning_rate": 5.260557053009884e-06, "loss": 2.4434, "step": 1174 }, { "epoch": 0.5012798634812287, "grad_norm": 19.83344268798828, "learning_rate": 5.256064690026954e-06, "loss": 2.291, "step": 1175 }, { "epoch": 0.5017064846416383, "grad_norm": 18.270017623901367, "learning_rate": 5.251572327044025e-06, "loss": 2.6992, "step": 1176 }, { "epoch": 0.5021331058020477, "grad_norm": 23.939945220947266, "learning_rate": 5.247079964061097e-06, "loss": 2.1406, "step": 1177 }, { "epoch": 0.5025597269624573, "grad_norm": 15.736184120178223, "learning_rate": 5.242587601078167e-06, "loss": 2.1777, "step": 1178 }, { "epoch": 0.5029863481228669, "grad_norm": 26.24043083190918, "learning_rate": 5.2380952380952384e-06, "loss": 2.6523, "step": 1179 }, { "epoch": 0.5034129692832765, "grad_norm": 20.484760284423828, "learning_rate": 5.2336028751123095e-06, "loss": 2.6504, "step": 1180 }, { "epoch": 0.503839590443686, "grad_norm": 20.004854202270508, "learning_rate": 5.2291105121293805e-06, "loss": 2.0503, "step": 1181 }, { "epoch": 0.5042662116040956, "grad_norm": 21.367979049682617, "learning_rate": 5.224618149146451e-06, "loss": 2.0713, "step": 1182 }, { "epoch": 0.5046928327645052, "grad_norm": 22.656190872192383, "learning_rate": 5.220125786163523e-06, "loss": 2.3164, "step": 1183 }, { "epoch": 0.5051194539249146, "grad_norm": 19.639827728271484, "learning_rate": 5.215633423180594e-06, "loss": 2.1338, "step": 1184 }, { "epoch": 0.5055460750853242, "grad_norm": 36.95730209350586, "learning_rate": 5.211141060197664e-06, "loss": 2.1045, "step": 1185 }, { "epoch": 0.5059726962457338, "grad_norm": 25.0295467376709, "learning_rate": 5.206648697214736e-06, "loss": 2.5039, "step": 1186 }, { "epoch": 0.5063993174061433, "grad_norm": 31.372411727905273, "learning_rate": 5.202156334231806e-06, "loss": 2.5654, "step": 1187 }, { "epoch": 0.5068259385665529, "grad_norm": 35.015716552734375, "learning_rate": 5.197663971248878e-06, "loss": 2.2344, "step": 1188 }, { "epoch": 0.5072525597269625, "grad_norm": 34.40018844604492, "learning_rate": 5.193171608265948e-06, "loss": 2.6162, "step": 1189 }, { "epoch": 0.507679180887372, "grad_norm": 27.553220748901367, "learning_rate": 5.188679245283019e-06, "loss": 2.5781, "step": 1190 }, { "epoch": 0.5081058020477816, "grad_norm": 29.57081413269043, "learning_rate": 5.184186882300091e-06, "loss": 2.8047, "step": 1191 }, { "epoch": 0.5085324232081911, "grad_norm": 29.047510147094727, "learning_rate": 5.179694519317161e-06, "loss": 3.1738, "step": 1192 }, { "epoch": 0.5089590443686007, "grad_norm": 22.855804443359375, "learning_rate": 5.175202156334232e-06, "loss": 2.5459, "step": 1193 }, { "epoch": 0.5093856655290102, "grad_norm": 16.29975700378418, "learning_rate": 5.170709793351303e-06, "loss": 2.3008, "step": 1194 }, { "epoch": 0.5098122866894198, "grad_norm": 18.54262351989746, "learning_rate": 5.166217430368374e-06, "loss": 2.2949, "step": 1195 }, { "epoch": 0.5102389078498294, "grad_norm": 21.953824996948242, "learning_rate": 5.1617250673854445e-06, "loss": 2.6367, "step": 1196 }, { "epoch": 0.5106655290102389, "grad_norm": 16.686342239379883, "learning_rate": 5.157232704402516e-06, "loss": 2.2246, "step": 1197 }, { "epoch": 0.5110921501706485, "grad_norm": 20.373126983642578, "learning_rate": 5.1527403414195874e-06, "loss": 2.7695, "step": 1198 }, { "epoch": 0.511518771331058, "grad_norm": 19.9268798828125, "learning_rate": 5.148247978436658e-06, "loss": 2.6016, "step": 1199 }, { "epoch": 0.5119453924914675, "grad_norm": 16.246139526367188, "learning_rate": 5.1437556154537295e-06, "loss": 2.4639, "step": 1200 }, { "epoch": 0.5123720136518771, "grad_norm": 25.90472984313965, "learning_rate": 5.1392632524708e-06, "loss": 2.5928, "step": 1201 }, { "epoch": 0.5127986348122867, "grad_norm": 20.83331871032715, "learning_rate": 5.134770889487871e-06, "loss": 2.583, "step": 1202 }, { "epoch": 0.5132252559726962, "grad_norm": 28.974882125854492, "learning_rate": 5.130278526504942e-06, "loss": 2.6738, "step": 1203 }, { "epoch": 0.5136518771331058, "grad_norm": 16.286806106567383, "learning_rate": 5.125786163522013e-06, "loss": 2.3604, "step": 1204 }, { "epoch": 0.5140784982935154, "grad_norm": 27.790830612182617, "learning_rate": 5.121293800539085e-06, "loss": 2.1895, "step": 1205 }, { "epoch": 0.514505119453925, "grad_norm": 18.570655822753906, "learning_rate": 5.116801437556155e-06, "loss": 2.4883, "step": 1206 }, { "epoch": 0.5149317406143344, "grad_norm": 39.29001235961914, "learning_rate": 5.112309074573226e-06, "loss": 2.6777, "step": 1207 }, { "epoch": 0.515358361774744, "grad_norm": 18.890174865722656, "learning_rate": 5.107816711590296e-06, "loss": 2.5215, "step": 1208 }, { "epoch": 0.5157849829351536, "grad_norm": 24.600017547607422, "learning_rate": 5.103324348607368e-06, "loss": 2.3481, "step": 1209 }, { "epoch": 0.5162116040955631, "grad_norm": 22.012611389160156, "learning_rate": 5.098831985624438e-06, "loss": 2.374, "step": 1210 }, { "epoch": 0.5166382252559727, "grad_norm": 17.54711151123047, "learning_rate": 5.09433962264151e-06, "loss": 2.2744, "step": 1211 }, { "epoch": 0.5170648464163823, "grad_norm": 25.47920799255371, "learning_rate": 5.089847259658581e-06, "loss": 2.9141, "step": 1212 }, { "epoch": 0.5174914675767918, "grad_norm": 18.82390022277832, "learning_rate": 5.085354896675651e-06, "loss": 2.1562, "step": 1213 }, { "epoch": 0.5179180887372014, "grad_norm": 52.312286376953125, "learning_rate": 5.080862533692723e-06, "loss": 3.0713, "step": 1214 }, { "epoch": 0.518344709897611, "grad_norm": 27.089868545532227, "learning_rate": 5.0763701707097935e-06, "loss": 2.6895, "step": 1215 }, { "epoch": 0.5187713310580204, "grad_norm": 20.7680606842041, "learning_rate": 5.0718778077268645e-06, "loss": 2.6787, "step": 1216 }, { "epoch": 0.51919795221843, "grad_norm": 31.512439727783203, "learning_rate": 5.067385444743936e-06, "loss": 2.334, "step": 1217 }, { "epoch": 0.5196245733788396, "grad_norm": 29.377948760986328, "learning_rate": 5.062893081761007e-06, "loss": 2.4209, "step": 1218 }, { "epoch": 0.5200511945392492, "grad_norm": 53.89030456542969, "learning_rate": 5.0584007187780785e-06, "loss": 2.2803, "step": 1219 }, { "epoch": 0.5204778156996587, "grad_norm": 35.73152542114258, "learning_rate": 5.053908355795149e-06, "loss": 1.998, "step": 1220 }, { "epoch": 0.5209044368600683, "grad_norm": 20.638343811035156, "learning_rate": 5.04941599281222e-06, "loss": 2.7383, "step": 1221 }, { "epoch": 0.5213310580204779, "grad_norm": 29.040016174316406, "learning_rate": 5.04492362982929e-06, "loss": 2.7412, "step": 1222 }, { "epoch": 0.5217576791808873, "grad_norm": 17.859210968017578, "learning_rate": 5.040431266846362e-06, "loss": 2.0283, "step": 1223 }, { "epoch": 0.5221843003412969, "grad_norm": 24.831958770751953, "learning_rate": 5.035938903863432e-06, "loss": 2.3135, "step": 1224 }, { "epoch": 0.5226109215017065, "grad_norm": 16.252965927124023, "learning_rate": 5.031446540880504e-06, "loss": 2.1006, "step": 1225 }, { "epoch": 0.523037542662116, "grad_norm": 24.575307846069336, "learning_rate": 5.026954177897575e-06, "loss": 2.6299, "step": 1226 }, { "epoch": 0.5234641638225256, "grad_norm": 17.893375396728516, "learning_rate": 5.022461814914645e-06, "loss": 2.5127, "step": 1227 }, { "epoch": 0.5238907849829352, "grad_norm": 17.92720603942871, "learning_rate": 5.017969451931717e-06, "loss": 2.626, "step": 1228 }, { "epoch": 0.5243174061433447, "grad_norm": 39.802059173583984, "learning_rate": 5.013477088948787e-06, "loss": 2.6738, "step": 1229 }, { "epoch": 0.5247440273037542, "grad_norm": 30.07309341430664, "learning_rate": 5.008984725965858e-06, "loss": 2.6621, "step": 1230 }, { "epoch": 0.5251706484641638, "grad_norm": 25.614131927490234, "learning_rate": 5.004492362982929e-06, "loss": 2.4473, "step": 1231 }, { "epoch": 0.5255972696245734, "grad_norm": 16.77971839904785, "learning_rate": 5e-06, "loss": 2.0137, "step": 1232 }, { "epoch": 0.5260238907849829, "grad_norm": 23.12045669555664, "learning_rate": 4.9955076370170715e-06, "loss": 2.2012, "step": 1233 }, { "epoch": 0.5264505119453925, "grad_norm": 25.62958526611328, "learning_rate": 4.9910152740341425e-06, "loss": 2.2744, "step": 1234 }, { "epoch": 0.5268771331058021, "grad_norm": 22.395214080810547, "learning_rate": 4.986522911051213e-06, "loss": 2.4043, "step": 1235 }, { "epoch": 0.5273037542662116, "grad_norm": 19.13766098022461, "learning_rate": 4.982030548068285e-06, "loss": 2.334, "step": 1236 }, { "epoch": 0.5277303754266212, "grad_norm": 21.669918060302734, "learning_rate": 4.977538185085356e-06, "loss": 2.7744, "step": 1237 }, { "epoch": 0.5281569965870307, "grad_norm": 21.873449325561523, "learning_rate": 4.973045822102427e-06, "loss": 2.0127, "step": 1238 }, { "epoch": 0.5285836177474402, "grad_norm": 23.759654998779297, "learning_rate": 4.968553459119497e-06, "loss": 2.1641, "step": 1239 }, { "epoch": 0.5290102389078498, "grad_norm": 19.724685668945312, "learning_rate": 4.964061096136568e-06, "loss": 2.8232, "step": 1240 }, { "epoch": 0.5294368600682594, "grad_norm": 24.578834533691406, "learning_rate": 4.959568733153639e-06, "loss": 2.9199, "step": 1241 }, { "epoch": 0.5298634812286689, "grad_norm": 33.008914947509766, "learning_rate": 4.95507637017071e-06, "loss": 2.3926, "step": 1242 }, { "epoch": 0.5302901023890785, "grad_norm": 26.207422256469727, "learning_rate": 4.950584007187781e-06, "loss": 2.1758, "step": 1243 }, { "epoch": 0.5307167235494881, "grad_norm": 21.034513473510742, "learning_rate": 4.946091644204852e-06, "loss": 2.9932, "step": 1244 }, { "epoch": 0.5311433447098977, "grad_norm": 16.45236587524414, "learning_rate": 4.941599281221923e-06, "loss": 2.5645, "step": 1245 }, { "epoch": 0.5315699658703071, "grad_norm": 36.50604248046875, "learning_rate": 4.937106918238994e-06, "loss": 2.3691, "step": 1246 }, { "epoch": 0.5319965870307167, "grad_norm": 19.009384155273438, "learning_rate": 4.932614555256065e-06, "loss": 1.9756, "step": 1247 }, { "epoch": 0.5324232081911263, "grad_norm": 19.400297164916992, "learning_rate": 4.928122192273136e-06, "loss": 2.417, "step": 1248 }, { "epoch": 0.5328498293515358, "grad_norm": 33.59346389770508, "learning_rate": 4.9236298292902065e-06, "loss": 2.4419, "step": 1249 }, { "epoch": 0.5332764505119454, "grad_norm": 18.19139862060547, "learning_rate": 4.919137466307278e-06, "loss": 1.9072, "step": 1250 }, { "epoch": 0.533703071672355, "grad_norm": 23.012216567993164, "learning_rate": 4.914645103324349e-06, "loss": 2.5869, "step": 1251 }, { "epoch": 0.5341296928327645, "grad_norm": 38.36344528198242, "learning_rate": 4.9101527403414205e-06, "loss": 2.6416, "step": 1252 }, { "epoch": 0.534556313993174, "grad_norm": 21.004196166992188, "learning_rate": 4.905660377358491e-06, "loss": 2.5957, "step": 1253 }, { "epoch": 0.5349829351535836, "grad_norm": 19.322311401367188, "learning_rate": 4.901168014375562e-06, "loss": 1.7642, "step": 1254 }, { "epoch": 0.5354095563139932, "grad_norm": 32.670780181884766, "learning_rate": 4.896675651392633e-06, "loss": 2.7812, "step": 1255 }, { "epoch": 0.5358361774744027, "grad_norm": 27.457656860351562, "learning_rate": 4.892183288409704e-06, "loss": 2.1992, "step": 1256 }, { "epoch": 0.5362627986348123, "grad_norm": 30.712905883789062, "learning_rate": 4.887690925426775e-06, "loss": 2.4502, "step": 1257 }, { "epoch": 0.5366894197952219, "grad_norm": 16.117074966430664, "learning_rate": 4.883198562443846e-06, "loss": 2.1328, "step": 1258 }, { "epoch": 0.5371160409556314, "grad_norm": 25.477861404418945, "learning_rate": 4.878706199460917e-06, "loss": 2.5, "step": 1259 }, { "epoch": 0.537542662116041, "grad_norm": 28.105344772338867, "learning_rate": 4.874213836477988e-06, "loss": 2.5112, "step": 1260 }, { "epoch": 0.5379692832764505, "grad_norm": 24.64742660522461, "learning_rate": 4.869721473495059e-06, "loss": 2.0586, "step": 1261 }, { "epoch": 0.53839590443686, "grad_norm": 16.60985565185547, "learning_rate": 4.86522911051213e-06, "loss": 2.1172, "step": 1262 }, { "epoch": 0.5388225255972696, "grad_norm": 28.405723571777344, "learning_rate": 4.8607367475292e-06, "loss": 2.8301, "step": 1263 }, { "epoch": 0.5392491467576792, "grad_norm": 20.750675201416016, "learning_rate": 4.856244384546272e-06, "loss": 2.3027, "step": 1264 }, { "epoch": 0.5396757679180887, "grad_norm": 45.88642120361328, "learning_rate": 4.851752021563343e-06, "loss": 3.0342, "step": 1265 }, { "epoch": 0.5401023890784983, "grad_norm": 23.829801559448242, "learning_rate": 4.847259658580414e-06, "loss": 2.4111, "step": 1266 }, { "epoch": 0.5405290102389079, "grad_norm": 43.45284652709961, "learning_rate": 4.842767295597484e-06, "loss": 2.6387, "step": 1267 }, { "epoch": 0.5409556313993175, "grad_norm": 44.6607666015625, "learning_rate": 4.8382749326145555e-06, "loss": 2.3877, "step": 1268 }, { "epoch": 0.5413822525597269, "grad_norm": 23.956260681152344, "learning_rate": 4.8337825696316265e-06, "loss": 2.5483, "step": 1269 }, { "epoch": 0.5418088737201365, "grad_norm": 25.851301193237305, "learning_rate": 4.8292902066486976e-06, "loss": 2.3066, "step": 1270 }, { "epoch": 0.5422354948805461, "grad_norm": 17.28289794921875, "learning_rate": 4.824797843665769e-06, "loss": 2.1562, "step": 1271 }, { "epoch": 0.5426621160409556, "grad_norm": 22.85431671142578, "learning_rate": 4.82030548068284e-06, "loss": 1.9053, "step": 1272 }, { "epoch": 0.5430887372013652, "grad_norm": 24.000097274780273, "learning_rate": 4.815813117699911e-06, "loss": 2.6982, "step": 1273 }, { "epoch": 0.5435153583617748, "grad_norm": 21.238855361938477, "learning_rate": 4.811320754716982e-06, "loss": 2.3408, "step": 1274 }, { "epoch": 0.5439419795221843, "grad_norm": 27.686309814453125, "learning_rate": 4.806828391734053e-06, "loss": 2.4111, "step": 1275 }, { "epoch": 0.5443686006825939, "grad_norm": 27.691089630126953, "learning_rate": 4.802336028751123e-06, "loss": 2.4756, "step": 1276 }, { "epoch": 0.5447952218430034, "grad_norm": 46.49020767211914, "learning_rate": 4.797843665768194e-06, "loss": 2.4033, "step": 1277 }, { "epoch": 0.5452218430034129, "grad_norm": 42.342952728271484, "learning_rate": 4.793351302785265e-06, "loss": 2.8711, "step": 1278 }, { "epoch": 0.5456484641638225, "grad_norm": 17.928974151611328, "learning_rate": 4.788858939802337e-06, "loss": 2.3525, "step": 1279 }, { "epoch": 0.5460750853242321, "grad_norm": 17.892406463623047, "learning_rate": 4.784366576819407e-06, "loss": 2.2793, "step": 1280 }, { "epoch": 0.5465017064846417, "grad_norm": 23.351177215576172, "learning_rate": 4.779874213836478e-06, "loss": 2.4336, "step": 1281 }, { "epoch": 0.5469283276450512, "grad_norm": 20.19877052307129, "learning_rate": 4.775381850853549e-06, "loss": 2.4072, "step": 1282 }, { "epoch": 0.5473549488054608, "grad_norm": 24.09697723388672, "learning_rate": 4.77088948787062e-06, "loss": 2.3301, "step": 1283 }, { "epoch": 0.5477815699658704, "grad_norm": 22.438928604125977, "learning_rate": 4.766397124887691e-06, "loss": 2.2861, "step": 1284 }, { "epoch": 0.5482081911262798, "grad_norm": 25.924821853637695, "learning_rate": 4.761904761904762e-06, "loss": 2.168, "step": 1285 }, { "epoch": 0.5486348122866894, "grad_norm": 29.486230850219727, "learning_rate": 4.757412398921833e-06, "loss": 2.6016, "step": 1286 }, { "epoch": 0.549061433447099, "grad_norm": 27.782878875732422, "learning_rate": 4.7529200359389045e-06, "loss": 2.3057, "step": 1287 }, { "epoch": 0.5494880546075085, "grad_norm": 17.11376953125, "learning_rate": 4.7484276729559755e-06, "loss": 2.3311, "step": 1288 }, { "epoch": 0.5499146757679181, "grad_norm": 26.16791534423828, "learning_rate": 4.7439353099730466e-06, "loss": 2.4697, "step": 1289 }, { "epoch": 0.5503412969283277, "grad_norm": 17.218490600585938, "learning_rate": 4.739442946990117e-06, "loss": 2.4209, "step": 1290 }, { "epoch": 0.5507679180887372, "grad_norm": 25.795942306518555, "learning_rate": 4.734950584007188e-06, "loss": 2.541, "step": 1291 }, { "epoch": 0.5511945392491467, "grad_norm": 33.7730598449707, "learning_rate": 4.730458221024259e-06, "loss": 2.6445, "step": 1292 }, { "epoch": 0.5516211604095563, "grad_norm": 35.27183151245117, "learning_rate": 4.725965858041331e-06, "loss": 2.8086, "step": 1293 }, { "epoch": 0.5520477815699659, "grad_norm": 22.450122833251953, "learning_rate": 4.721473495058401e-06, "loss": 2.499, "step": 1294 }, { "epoch": 0.5524744027303754, "grad_norm": 20.96445655822754, "learning_rate": 4.716981132075472e-06, "loss": 2.1758, "step": 1295 }, { "epoch": 0.552901023890785, "grad_norm": 20.37903594970703, "learning_rate": 4.712488769092543e-06, "loss": 2.542, "step": 1296 }, { "epoch": 0.5533276450511946, "grad_norm": 21.708114624023438, "learning_rate": 4.707996406109614e-06, "loss": 2.4727, "step": 1297 }, { "epoch": 0.5537542662116041, "grad_norm": 35.709999084472656, "learning_rate": 4.703504043126685e-06, "loss": 2.3232, "step": 1298 }, { "epoch": 0.5541808873720137, "grad_norm": 17.35190773010254, "learning_rate": 4.699011680143756e-06, "loss": 2.6094, "step": 1299 }, { "epoch": 0.5546075085324232, "grad_norm": 16.749832153320312, "learning_rate": 4.694519317160827e-06, "loss": 2.252, "step": 1300 }, { "epoch": 0.5550341296928327, "grad_norm": 32.088626861572266, "learning_rate": 4.690026954177898e-06, "loss": 2.334, "step": 1301 }, { "epoch": 0.5554607508532423, "grad_norm": 17.354429244995117, "learning_rate": 4.685534591194969e-06, "loss": 2.0596, "step": 1302 }, { "epoch": 0.5558873720136519, "grad_norm": 20.02385139465332, "learning_rate": 4.68104222821204e-06, "loss": 2.3604, "step": 1303 }, { "epoch": 0.5563139931740614, "grad_norm": 17.277572631835938, "learning_rate": 4.6765498652291105e-06, "loss": 2.3037, "step": 1304 }, { "epoch": 0.556740614334471, "grad_norm": 21.47422218322754, "learning_rate": 4.6720575022461816e-06, "loss": 2.54, "step": 1305 }, { "epoch": 0.5571672354948806, "grad_norm": 27.697263717651367, "learning_rate": 4.667565139263253e-06, "loss": 2.1299, "step": 1306 }, { "epoch": 0.5575938566552902, "grad_norm": 26.158000946044922, "learning_rate": 4.663072776280324e-06, "loss": 2.5928, "step": 1307 }, { "epoch": 0.5580204778156996, "grad_norm": 29.957881927490234, "learning_rate": 4.658580413297395e-06, "loss": 2.4639, "step": 1308 }, { "epoch": 0.5584470989761092, "grad_norm": 20.76220703125, "learning_rate": 4.654088050314466e-06, "loss": 2.5986, "step": 1309 }, { "epoch": 0.5588737201365188, "grad_norm": 22.95929718017578, "learning_rate": 4.649595687331537e-06, "loss": 2.7158, "step": 1310 }, { "epoch": 0.5593003412969283, "grad_norm": 17.745424270629883, "learning_rate": 4.645103324348608e-06, "loss": 1.9844, "step": 1311 }, { "epoch": 0.5597269624573379, "grad_norm": 22.64072036743164, "learning_rate": 4.640610961365679e-06, "loss": 2.0156, "step": 1312 }, { "epoch": 0.5601535836177475, "grad_norm": 44.04773712158203, "learning_rate": 4.636118598382749e-06, "loss": 3.0254, "step": 1313 }, { "epoch": 0.560580204778157, "grad_norm": 37.88700866699219, "learning_rate": 4.631626235399821e-06, "loss": 2.3643, "step": 1314 }, { "epoch": 0.5610068259385665, "grad_norm": 17.318178176879883, "learning_rate": 4.627133872416892e-06, "loss": 2.2002, "step": 1315 }, { "epoch": 0.5614334470989761, "grad_norm": 22.529075622558594, "learning_rate": 4.622641509433963e-06, "loss": 2.625, "step": 1316 }, { "epoch": 0.5618600682593856, "grad_norm": 29.588624954223633, "learning_rate": 4.618149146451033e-06, "loss": 2.2354, "step": 1317 }, { "epoch": 0.5622866894197952, "grad_norm": 20.1776123046875, "learning_rate": 4.613656783468104e-06, "loss": 2.7715, "step": 1318 }, { "epoch": 0.5627133105802048, "grad_norm": 18.949871063232422, "learning_rate": 4.609164420485175e-06, "loss": 2.3174, "step": 1319 }, { "epoch": 0.5631399317406144, "grad_norm": 23.272371292114258, "learning_rate": 4.604672057502246e-06, "loss": 2.3906, "step": 1320 }, { "epoch": 0.5635665529010239, "grad_norm": 28.942964553833008, "learning_rate": 4.6001796945193174e-06, "loss": 2.332, "step": 1321 }, { "epoch": 0.5639931740614335, "grad_norm": 17.387943267822266, "learning_rate": 4.5956873315363885e-06, "loss": 1.8735, "step": 1322 }, { "epoch": 0.564419795221843, "grad_norm": 28.30118179321289, "learning_rate": 4.5911949685534595e-06, "loss": 2.8379, "step": 1323 }, { "epoch": 0.5648464163822525, "grad_norm": 20.047611236572266, "learning_rate": 4.5867026055705306e-06, "loss": 2.666, "step": 1324 }, { "epoch": 0.5652730375426621, "grad_norm": 24.70307159423828, "learning_rate": 4.582210242587602e-06, "loss": 2.0962, "step": 1325 }, { "epoch": 0.5656996587030717, "grad_norm": 16.165760040283203, "learning_rate": 4.577717879604673e-06, "loss": 2.3633, "step": 1326 }, { "epoch": 0.5661262798634812, "grad_norm": 24.163618087768555, "learning_rate": 4.573225516621743e-06, "loss": 3.1045, "step": 1327 }, { "epoch": 0.5665529010238908, "grad_norm": 20.49041748046875, "learning_rate": 4.568733153638815e-06, "loss": 2.8564, "step": 1328 }, { "epoch": 0.5669795221843004, "grad_norm": 28.167705535888672, "learning_rate": 4.564240790655886e-06, "loss": 3.1016, "step": 1329 }, { "epoch": 0.5674061433447098, "grad_norm": 17.828779220581055, "learning_rate": 4.559748427672957e-06, "loss": 2.5957, "step": 1330 }, { "epoch": 0.5678327645051194, "grad_norm": 44.40090560913086, "learning_rate": 4.555256064690027e-06, "loss": 2.4102, "step": 1331 }, { "epoch": 0.568259385665529, "grad_norm": 20.21436882019043, "learning_rate": 4.550763701707098e-06, "loss": 2.3369, "step": 1332 }, { "epoch": 0.5686860068259386, "grad_norm": 19.991886138916016, "learning_rate": 4.546271338724169e-06, "loss": 2.2256, "step": 1333 }, { "epoch": 0.5691126279863481, "grad_norm": 19.2490177154541, "learning_rate": 4.54177897574124e-06, "loss": 2.0166, "step": 1334 }, { "epoch": 0.5695392491467577, "grad_norm": 16.32038688659668, "learning_rate": 4.537286612758311e-06, "loss": 2.1035, "step": 1335 }, { "epoch": 0.5699658703071673, "grad_norm": 16.530920028686523, "learning_rate": 4.532794249775382e-06, "loss": 2.3135, "step": 1336 }, { "epoch": 0.5703924914675768, "grad_norm": 24.308748245239258, "learning_rate": 4.528301886792453e-06, "loss": 2.4453, "step": 1337 }, { "epoch": 0.5708191126279863, "grad_norm": 32.11763000488281, "learning_rate": 4.523809523809524e-06, "loss": 2.5371, "step": 1338 }, { "epoch": 0.5712457337883959, "grad_norm": 21.306795120239258, "learning_rate": 4.519317160826595e-06, "loss": 2.2373, "step": 1339 }, { "epoch": 0.5716723549488054, "grad_norm": 26.340106964111328, "learning_rate": 4.5148247978436664e-06, "loss": 1.749, "step": 1340 }, { "epoch": 0.572098976109215, "grad_norm": 21.620981216430664, "learning_rate": 4.510332434860737e-06, "loss": 2.1904, "step": 1341 }, { "epoch": 0.5725255972696246, "grad_norm": 27.624412536621094, "learning_rate": 4.5058400718778085e-06, "loss": 2.3442, "step": 1342 }, { "epoch": 0.5729522184300341, "grad_norm": 18.985275268554688, "learning_rate": 4.5013477088948796e-06, "loss": 2.3994, "step": 1343 }, { "epoch": 0.5733788395904437, "grad_norm": 42.2714729309082, "learning_rate": 4.49685534591195e-06, "loss": 2.8359, "step": 1344 }, { "epoch": 0.5738054607508533, "grad_norm": 32.8116455078125, "learning_rate": 4.492362982929021e-06, "loss": 1.9834, "step": 1345 }, { "epoch": 0.5742320819112628, "grad_norm": 31.643775939941406, "learning_rate": 4.487870619946092e-06, "loss": 2.1738, "step": 1346 }, { "epoch": 0.5746587030716723, "grad_norm": 25.283864974975586, "learning_rate": 4.483378256963163e-06, "loss": 1.9385, "step": 1347 }, { "epoch": 0.5750853242320819, "grad_norm": 23.998638153076172, "learning_rate": 4.478885893980234e-06, "loss": 2.5303, "step": 1348 }, { "epoch": 0.5755119453924915, "grad_norm": 17.42400360107422, "learning_rate": 4.474393530997305e-06, "loss": 2.0859, "step": 1349 }, { "epoch": 0.575938566552901, "grad_norm": 23.299835205078125, "learning_rate": 4.469901168014376e-06, "loss": 2.5273, "step": 1350 }, { "epoch": 0.5763651877133106, "grad_norm": 21.345651626586914, "learning_rate": 4.465408805031447e-06, "loss": 2.7656, "step": 1351 }, { "epoch": 0.5767918088737202, "grad_norm": 29.26232147216797, "learning_rate": 4.460916442048518e-06, "loss": 2.3965, "step": 1352 }, { "epoch": 0.5772184300341296, "grad_norm": 20.05579376220703, "learning_rate": 4.456424079065589e-06, "loss": 2.4482, "step": 1353 }, { "epoch": 0.5776450511945392, "grad_norm": 16.981386184692383, "learning_rate": 4.451931716082659e-06, "loss": 2.0449, "step": 1354 }, { "epoch": 0.5780716723549488, "grad_norm": 33.122074127197266, "learning_rate": 4.44743935309973e-06, "loss": 2.3916, "step": 1355 }, { "epoch": 0.5784982935153583, "grad_norm": 22.73659324645996, "learning_rate": 4.4429469901168014e-06, "loss": 2.5859, "step": 1356 }, { "epoch": 0.5789249146757679, "grad_norm": 18.3586368560791, "learning_rate": 4.438454627133873e-06, "loss": 2.5098, "step": 1357 }, { "epoch": 0.5793515358361775, "grad_norm": 16.827573776245117, "learning_rate": 4.4339622641509435e-06, "loss": 2.2197, "step": 1358 }, { "epoch": 0.5797781569965871, "grad_norm": 27.184112548828125, "learning_rate": 4.429469901168015e-06, "loss": 2.4912, "step": 1359 }, { "epoch": 0.5802047781569966, "grad_norm": 24.514375686645508, "learning_rate": 4.424977538185086e-06, "loss": 2.2236, "step": 1360 }, { "epoch": 0.5806313993174061, "grad_norm": 21.03577423095703, "learning_rate": 4.420485175202157e-06, "loss": 2.1973, "step": 1361 }, { "epoch": 0.5810580204778157, "grad_norm": 21.42534637451172, "learning_rate": 4.415992812219228e-06, "loss": 2.6367, "step": 1362 }, { "epoch": 0.5814846416382252, "grad_norm": 25.538663864135742, "learning_rate": 4.411500449236299e-06, "loss": 2.46, "step": 1363 }, { "epoch": 0.5819112627986348, "grad_norm": 20.800830841064453, "learning_rate": 4.40700808625337e-06, "loss": 2.1704, "step": 1364 }, { "epoch": 0.5823378839590444, "grad_norm": 29.959156036376953, "learning_rate": 4.402515723270441e-06, "loss": 2.5312, "step": 1365 }, { "epoch": 0.5827645051194539, "grad_norm": 23.058263778686523, "learning_rate": 4.398023360287512e-06, "loss": 2.5, "step": 1366 }, { "epoch": 0.5831911262798635, "grad_norm": 25.83173179626465, "learning_rate": 4.393530997304583e-06, "loss": 2.3066, "step": 1367 }, { "epoch": 0.5836177474402731, "grad_norm": 22.539011001586914, "learning_rate": 4.389038634321653e-06, "loss": 2.6162, "step": 1368 }, { "epoch": 0.5840443686006825, "grad_norm": 40.652252197265625, "learning_rate": 4.384546271338724e-06, "loss": 2.3555, "step": 1369 }, { "epoch": 0.5844709897610921, "grad_norm": 29.97538948059082, "learning_rate": 4.380053908355795e-06, "loss": 2.5176, "step": 1370 }, { "epoch": 0.5848976109215017, "grad_norm": 21.40877914428711, "learning_rate": 4.375561545372867e-06, "loss": 2.5283, "step": 1371 }, { "epoch": 0.5853242320819113, "grad_norm": 22.635761260986328, "learning_rate": 4.371069182389937e-06, "loss": 2.3252, "step": 1372 }, { "epoch": 0.5857508532423208, "grad_norm": 22.346757888793945, "learning_rate": 4.366576819407008e-06, "loss": 2.2568, "step": 1373 }, { "epoch": 0.5861774744027304, "grad_norm": 19.227197647094727, "learning_rate": 4.362084456424079e-06, "loss": 2.4033, "step": 1374 }, { "epoch": 0.58660409556314, "grad_norm": 18.52188491821289, "learning_rate": 4.3575920934411504e-06, "loss": 1.7754, "step": 1375 }, { "epoch": 0.5870307167235495, "grad_norm": 60.36956024169922, "learning_rate": 4.3530997304582215e-06, "loss": 2.9863, "step": 1376 }, { "epoch": 0.587457337883959, "grad_norm": 23.77344512939453, "learning_rate": 4.348607367475292e-06, "loss": 2.3872, "step": 1377 }, { "epoch": 0.5878839590443686, "grad_norm": 20.1966552734375, "learning_rate": 4.344115004492364e-06, "loss": 2.457, "step": 1378 }, { "epoch": 0.5883105802047781, "grad_norm": 37.82915496826172, "learning_rate": 4.339622641509435e-06, "loss": 2.2109, "step": 1379 }, { "epoch": 0.5887372013651877, "grad_norm": 15.02815055847168, "learning_rate": 4.335130278526506e-06, "loss": 1.8213, "step": 1380 }, { "epoch": 0.5891638225255973, "grad_norm": 24.90953826904297, "learning_rate": 4.330637915543576e-06, "loss": 2.4609, "step": 1381 }, { "epoch": 0.5895904436860068, "grad_norm": 25.334327697753906, "learning_rate": 4.326145552560647e-06, "loss": 2.9844, "step": 1382 }, { "epoch": 0.5900170648464164, "grad_norm": 16.76384925842285, "learning_rate": 4.321653189577718e-06, "loss": 2.1152, "step": 1383 }, { "epoch": 0.590443686006826, "grad_norm": 24.324865341186523, "learning_rate": 4.317160826594789e-06, "loss": 2.0791, "step": 1384 }, { "epoch": 0.5908703071672355, "grad_norm": 18.31525421142578, "learning_rate": 4.31266846361186e-06, "loss": 2.1934, "step": 1385 }, { "epoch": 0.591296928327645, "grad_norm": 29.65024185180664, "learning_rate": 4.308176100628931e-06, "loss": 3.0664, "step": 1386 }, { "epoch": 0.5917235494880546, "grad_norm": 66.52584075927734, "learning_rate": 4.303683737646002e-06, "loss": 2.7041, "step": 1387 }, { "epoch": 0.5921501706484642, "grad_norm": 43.482704162597656, "learning_rate": 4.299191374663073e-06, "loss": 2.3887, "step": 1388 }, { "epoch": 0.5925767918088737, "grad_norm": 44.79536819458008, "learning_rate": 4.294699011680144e-06, "loss": 2.5781, "step": 1389 }, { "epoch": 0.5930034129692833, "grad_norm": 35.73846435546875, "learning_rate": 4.290206648697215e-06, "loss": 2.2432, "step": 1390 }, { "epoch": 0.5934300341296929, "grad_norm": 36.16798400878906, "learning_rate": 4.2857142857142855e-06, "loss": 2.5605, "step": 1391 }, { "epoch": 0.5938566552901023, "grad_norm": 20.952844619750977, "learning_rate": 4.281221922731357e-06, "loss": 2.3232, "step": 1392 }, { "epoch": 0.5942832764505119, "grad_norm": 27.37010955810547, "learning_rate": 4.276729559748428e-06, "loss": 2.832, "step": 1393 }, { "epoch": 0.5947098976109215, "grad_norm": 15.844951629638672, "learning_rate": 4.2722371967654994e-06, "loss": 2.2432, "step": 1394 }, { "epoch": 0.5951365187713311, "grad_norm": 27.62236213684082, "learning_rate": 4.26774483378257e-06, "loss": 2.2822, "step": 1395 }, { "epoch": 0.5955631399317406, "grad_norm": 44.5030517578125, "learning_rate": 4.263252470799641e-06, "loss": 3.1357, "step": 1396 }, { "epoch": 0.5959897610921502, "grad_norm": 43.49186325073242, "learning_rate": 4.258760107816712e-06, "loss": 2.7344, "step": 1397 }, { "epoch": 0.5964163822525598, "grad_norm": 45.643653869628906, "learning_rate": 4.254267744833783e-06, "loss": 2.8877, "step": 1398 }, { "epoch": 0.5968430034129693, "grad_norm": 44.29466247558594, "learning_rate": 4.249775381850854e-06, "loss": 2.4814, "step": 1399 }, { "epoch": 0.5972696245733788, "grad_norm": 36.38447570800781, "learning_rate": 4.245283018867925e-06, "loss": 2.5371, "step": 1400 }, { "epoch": 0.5976962457337884, "grad_norm": 34.13217544555664, "learning_rate": 4.240790655884996e-06, "loss": 2.2461, "step": 1401 }, { "epoch": 0.5981228668941979, "grad_norm": 16.927581787109375, "learning_rate": 4.236298292902067e-06, "loss": 1.9912, "step": 1402 }, { "epoch": 0.5985494880546075, "grad_norm": 19.867677688598633, "learning_rate": 4.231805929919138e-06, "loss": 1.8281, "step": 1403 }, { "epoch": 0.5989761092150171, "grad_norm": 41.92014694213867, "learning_rate": 4.227313566936209e-06, "loss": 2.6885, "step": 1404 }, { "epoch": 0.5994027303754266, "grad_norm": 31.410295486450195, "learning_rate": 4.222821203953279e-06, "loss": 2.1963, "step": 1405 }, { "epoch": 0.5998293515358362, "grad_norm": 22.472976684570312, "learning_rate": 4.218328840970351e-06, "loss": 2.3354, "step": 1406 }, { "epoch": 0.6002559726962458, "grad_norm": 22.231199264526367, "learning_rate": 4.213836477987422e-06, "loss": 2.6934, "step": 1407 }, { "epoch": 0.6006825938566553, "grad_norm": 25.221227645874023, "learning_rate": 4.209344115004493e-06, "loss": 2.5557, "step": 1408 }, { "epoch": 0.6011092150170648, "grad_norm": 19.79888343811035, "learning_rate": 4.204851752021563e-06, "loss": 2.6304, "step": 1409 }, { "epoch": 0.6015358361774744, "grad_norm": 27.240476608276367, "learning_rate": 4.2003593890386345e-06, "loss": 2.042, "step": 1410 }, { "epoch": 0.601962457337884, "grad_norm": 23.848093032836914, "learning_rate": 4.1958670260557055e-06, "loss": 2.3359, "step": 1411 }, { "epoch": 0.6023890784982935, "grad_norm": 27.17098617553711, "learning_rate": 4.1913746630727765e-06, "loss": 2.7607, "step": 1412 }, { "epoch": 0.6028156996587031, "grad_norm": 35.22904968261719, "learning_rate": 4.186882300089848e-06, "loss": 2.5801, "step": 1413 }, { "epoch": 0.6032423208191127, "grad_norm": 35.9609260559082, "learning_rate": 4.182389937106919e-06, "loss": 2.4004, "step": 1414 }, { "epoch": 0.6036689419795221, "grad_norm": 19.513574600219727, "learning_rate": 4.17789757412399e-06, "loss": 2.5693, "step": 1415 }, { "epoch": 0.6040955631399317, "grad_norm": 20.018802642822266, "learning_rate": 4.173405211141061e-06, "loss": 2.0693, "step": 1416 }, { "epoch": 0.6045221843003413, "grad_norm": 19.38519287109375, "learning_rate": 4.168912848158132e-06, "loss": 1.853, "step": 1417 }, { "epoch": 0.6049488054607508, "grad_norm": 27.481489181518555, "learning_rate": 4.164420485175202e-06, "loss": 2.7305, "step": 1418 }, { "epoch": 0.6053754266211604, "grad_norm": 19.81107521057129, "learning_rate": 4.159928122192273e-06, "loss": 2.1338, "step": 1419 }, { "epoch": 0.60580204778157, "grad_norm": 44.185691833496094, "learning_rate": 4.155435759209345e-06, "loss": 2.8506, "step": 1420 }, { "epoch": 0.6062286689419796, "grad_norm": 29.988082885742188, "learning_rate": 4.150943396226416e-06, "loss": 2.2617, "step": 1421 }, { "epoch": 0.606655290102389, "grad_norm": 19.156795501708984, "learning_rate": 4.146451033243486e-06, "loss": 2.647, "step": 1422 }, { "epoch": 0.6070819112627986, "grad_norm": 25.175708770751953, "learning_rate": 4.141958670260557e-06, "loss": 2.2324, "step": 1423 }, { "epoch": 0.6075085324232082, "grad_norm": 26.030254364013672, "learning_rate": 4.137466307277628e-06, "loss": 2.2109, "step": 1424 }, { "epoch": 0.6079351535836177, "grad_norm": 15.288980484008789, "learning_rate": 4.132973944294699e-06, "loss": 2.0322, "step": 1425 }, { "epoch": 0.6083617747440273, "grad_norm": 15.24339771270752, "learning_rate": 4.12848158131177e-06, "loss": 2.0342, "step": 1426 }, { "epoch": 0.6087883959044369, "grad_norm": 28.109180450439453, "learning_rate": 4.123989218328841e-06, "loss": 2.2969, "step": 1427 }, { "epoch": 0.6092150170648464, "grad_norm": 38.712154388427734, "learning_rate": 4.119496855345912e-06, "loss": 2.4268, "step": 1428 }, { "epoch": 0.609641638225256, "grad_norm": 41.02090835571289, "learning_rate": 4.1150044923629835e-06, "loss": 2.208, "step": 1429 }, { "epoch": 0.6100682593856656, "grad_norm": 19.8539981842041, "learning_rate": 4.1105121293800545e-06, "loss": 2.5723, "step": 1430 }, { "epoch": 0.610494880546075, "grad_norm": 17.092798233032227, "learning_rate": 4.1060197663971255e-06, "loss": 1.7119, "step": 1431 }, { "epoch": 0.6109215017064846, "grad_norm": 22.7246036529541, "learning_rate": 4.101527403414196e-06, "loss": 2.2485, "step": 1432 }, { "epoch": 0.6113481228668942, "grad_norm": 21.90310287475586, "learning_rate": 4.097035040431267e-06, "loss": 2.0273, "step": 1433 }, { "epoch": 0.6117747440273038, "grad_norm": 28.43157958984375, "learning_rate": 4.092542677448338e-06, "loss": 2.8447, "step": 1434 }, { "epoch": 0.6122013651877133, "grad_norm": 18.50580406188965, "learning_rate": 4.08805031446541e-06, "loss": 2.1816, "step": 1435 }, { "epoch": 0.6126279863481229, "grad_norm": 22.50284767150879, "learning_rate": 4.08355795148248e-06, "loss": 2.0723, "step": 1436 }, { "epoch": 0.6130546075085325, "grad_norm": 28.248563766479492, "learning_rate": 4.079065588499551e-06, "loss": 2.1426, "step": 1437 }, { "epoch": 0.613481228668942, "grad_norm": 38.63645935058594, "learning_rate": 4.074573225516622e-06, "loss": 2.375, "step": 1438 }, { "epoch": 0.6139078498293515, "grad_norm": 27.894947052001953, "learning_rate": 4.070080862533693e-06, "loss": 2.0645, "step": 1439 }, { "epoch": 0.6143344709897611, "grad_norm": 37.0277099609375, "learning_rate": 4.065588499550764e-06, "loss": 2.3076, "step": 1440 }, { "epoch": 0.6147610921501706, "grad_norm": 15.98538589477539, "learning_rate": 4.061096136567835e-06, "loss": 1.8379, "step": 1441 }, { "epoch": 0.6151877133105802, "grad_norm": 16.836027145385742, "learning_rate": 4.056603773584906e-06, "loss": 2.3564, "step": 1442 }, { "epoch": 0.6156143344709898, "grad_norm": 31.81623649597168, "learning_rate": 4.052111410601977e-06, "loss": 2.6572, "step": 1443 }, { "epoch": 0.6160409556313993, "grad_norm": 38.87391662597656, "learning_rate": 4.047619047619048e-06, "loss": 2.4619, "step": 1444 }, { "epoch": 0.6164675767918089, "grad_norm": 24.75901222229004, "learning_rate": 4.043126684636119e-06, "loss": 2.6758, "step": 1445 }, { "epoch": 0.6168941979522184, "grad_norm": 26.73711395263672, "learning_rate": 4.0386343216531895e-06, "loss": 2.3291, "step": 1446 }, { "epoch": 0.617320819112628, "grad_norm": 15.336953163146973, "learning_rate": 4.0341419586702606e-06, "loss": 2.1064, "step": 1447 }, { "epoch": 0.6177474402730375, "grad_norm": 27.878210067749023, "learning_rate": 4.029649595687332e-06, "loss": 2.3994, "step": 1448 }, { "epoch": 0.6181740614334471, "grad_norm": 20.88320541381836, "learning_rate": 4.025157232704403e-06, "loss": 2.1064, "step": 1449 }, { "epoch": 0.6186006825938567, "grad_norm": 25.652420043945312, "learning_rate": 4.020664869721474e-06, "loss": 2.083, "step": 1450 }, { "epoch": 0.6190273037542662, "grad_norm": 20.050722122192383, "learning_rate": 4.016172506738545e-06, "loss": 2.2979, "step": 1451 }, { "epoch": 0.6194539249146758, "grad_norm": 15.306723594665527, "learning_rate": 4.011680143755616e-06, "loss": 2.1504, "step": 1452 }, { "epoch": 0.6198805460750854, "grad_norm": 19.287073135375977, "learning_rate": 4.007187780772687e-06, "loss": 2.8262, "step": 1453 }, { "epoch": 0.6203071672354948, "grad_norm": 14.914201736450195, "learning_rate": 4.002695417789758e-06, "loss": 1.9775, "step": 1454 }, { "epoch": 0.6207337883959044, "grad_norm": 18.950130462646484, "learning_rate": 3.998203054806828e-06, "loss": 2.4258, "step": 1455 }, { "epoch": 0.621160409556314, "grad_norm": 15.977020263671875, "learning_rate": 3.9937106918239e-06, "loss": 2.1719, "step": 1456 }, { "epoch": 0.6215870307167235, "grad_norm": 22.02812385559082, "learning_rate": 3.989218328840971e-06, "loss": 2.5156, "step": 1457 }, { "epoch": 0.6220136518771331, "grad_norm": 17.025178909301758, "learning_rate": 3.984725965858042e-06, "loss": 2.5469, "step": 1458 }, { "epoch": 0.6224402730375427, "grad_norm": 39.999542236328125, "learning_rate": 3.980233602875112e-06, "loss": 2.7939, "step": 1459 }, { "epoch": 0.6228668941979523, "grad_norm": 16.99581527709961, "learning_rate": 3.975741239892183e-06, "loss": 2.2998, "step": 1460 }, { "epoch": 0.6232935153583617, "grad_norm": 18.14431381225586, "learning_rate": 3.971248876909254e-06, "loss": 1.8926, "step": 1461 }, { "epoch": 0.6237201365187713, "grad_norm": 16.469980239868164, "learning_rate": 3.966756513926325e-06, "loss": 2.5195, "step": 1462 }, { "epoch": 0.6241467576791809, "grad_norm": 18.794111251831055, "learning_rate": 3.962264150943396e-06, "loss": 2.4385, "step": 1463 }, { "epoch": 0.6245733788395904, "grad_norm": 22.943275451660156, "learning_rate": 3.9577717879604675e-06, "loss": 2.3149, "step": 1464 }, { "epoch": 0.625, "grad_norm": 30.07889175415039, "learning_rate": 3.9532794249775385e-06, "loss": 2.1279, "step": 1465 }, { "epoch": 0.6254266211604096, "grad_norm": 36.66804885864258, "learning_rate": 3.9487870619946096e-06, "loss": 2.7715, "step": 1466 }, { "epoch": 0.6258532423208191, "grad_norm": 18.29961585998535, "learning_rate": 3.944294699011681e-06, "loss": 2.1836, "step": 1467 }, { "epoch": 0.6262798634812287, "grad_norm": 27.083736419677734, "learning_rate": 3.939802336028752e-06, "loss": 2.2803, "step": 1468 }, { "epoch": 0.6267064846416383, "grad_norm": 31.700674057006836, "learning_rate": 3.935309973045822e-06, "loss": 2.7314, "step": 1469 }, { "epoch": 0.6271331058020477, "grad_norm": 20.692028045654297, "learning_rate": 3.930817610062894e-06, "loss": 2.6025, "step": 1470 }, { "epoch": 0.6275597269624573, "grad_norm": 20.040754318237305, "learning_rate": 3.926325247079965e-06, "loss": 2.0977, "step": 1471 }, { "epoch": 0.6279863481228669, "grad_norm": 24.181251525878906, "learning_rate": 3.921832884097036e-06, "loss": 2.8213, "step": 1472 }, { "epoch": 0.6284129692832765, "grad_norm": 22.22473907470703, "learning_rate": 3.917340521114106e-06, "loss": 2.2432, "step": 1473 }, { "epoch": 0.628839590443686, "grad_norm": 21.3482666015625, "learning_rate": 3.912848158131177e-06, "loss": 2.5049, "step": 1474 }, { "epoch": 0.6292662116040956, "grad_norm": 22.529102325439453, "learning_rate": 3.908355795148248e-06, "loss": 2.0825, "step": 1475 }, { "epoch": 0.6296928327645052, "grad_norm": 29.344873428344727, "learning_rate": 3.903863432165319e-06, "loss": 2.3721, "step": 1476 }, { "epoch": 0.6301194539249146, "grad_norm": 23.979862213134766, "learning_rate": 3.89937106918239e-06, "loss": 2.4229, "step": 1477 }, { "epoch": 0.6305460750853242, "grad_norm": 20.146678924560547, "learning_rate": 3.894878706199461e-06, "loss": 2.2549, "step": 1478 }, { "epoch": 0.6309726962457338, "grad_norm": 32.02549743652344, "learning_rate": 3.890386343216532e-06, "loss": 2.1494, "step": 1479 }, { "epoch": 0.6313993174061433, "grad_norm": 15.963859558105469, "learning_rate": 3.885893980233603e-06, "loss": 2.3057, "step": 1480 }, { "epoch": 0.6318259385665529, "grad_norm": 26.127859115600586, "learning_rate": 3.881401617250674e-06, "loss": 2.8711, "step": 1481 }, { "epoch": 0.6322525597269625, "grad_norm": 16.796703338623047, "learning_rate": 3.876909254267745e-06, "loss": 2.0557, "step": 1482 }, { "epoch": 0.632679180887372, "grad_norm": 21.77611541748047, "learning_rate": 3.872416891284816e-06, "loss": 2.1187, "step": 1483 }, { "epoch": 0.6331058020477816, "grad_norm": 26.624557495117188, "learning_rate": 3.8679245283018875e-06, "loss": 2.5098, "step": 1484 }, { "epoch": 0.6335324232081911, "grad_norm": 27.21763801574707, "learning_rate": 3.8634321653189586e-06, "loss": 1.9473, "step": 1485 }, { "epoch": 0.6339590443686007, "grad_norm": 17.299362182617188, "learning_rate": 3.858939802336029e-06, "loss": 2.7949, "step": 1486 }, { "epoch": 0.6343856655290102, "grad_norm": 20.601594924926758, "learning_rate": 3.8544474393531e-06, "loss": 2.415, "step": 1487 }, { "epoch": 0.6348122866894198, "grad_norm": 16.806154251098633, "learning_rate": 3.849955076370171e-06, "loss": 2.1221, "step": 1488 }, { "epoch": 0.6352389078498294, "grad_norm": 23.250965118408203, "learning_rate": 3.845462713387242e-06, "loss": 2.2686, "step": 1489 }, { "epoch": 0.6356655290102389, "grad_norm": 24.61576271057129, "learning_rate": 3.840970350404313e-06, "loss": 2.1787, "step": 1490 }, { "epoch": 0.6360921501706485, "grad_norm": 16.336050033569336, "learning_rate": 3.836477987421384e-06, "loss": 1.9048, "step": 1491 }, { "epoch": 0.636518771331058, "grad_norm": 18.35066795349121, "learning_rate": 3.831985624438455e-06, "loss": 2.2012, "step": 1492 }, { "epoch": 0.6369453924914675, "grad_norm": 27.973064422607422, "learning_rate": 3.827493261455526e-06, "loss": 2.7266, "step": 1493 }, { "epoch": 0.6373720136518771, "grad_norm": 18.747730255126953, "learning_rate": 3.823000898472597e-06, "loss": 1.8916, "step": 1494 }, { "epoch": 0.6377986348122867, "grad_norm": 19.106430053710938, "learning_rate": 3.818508535489668e-06, "loss": 2.2539, "step": 1495 }, { "epoch": 0.6382252559726962, "grad_norm": 31.52516746520996, "learning_rate": 3.8140161725067388e-06, "loss": 1.9854, "step": 1496 }, { "epoch": 0.6386518771331058, "grad_norm": 22.284278869628906, "learning_rate": 3.80952380952381e-06, "loss": 2.2305, "step": 1497 }, { "epoch": 0.6390784982935154, "grad_norm": 32.087974548339844, "learning_rate": 3.8050314465408813e-06, "loss": 2.6914, "step": 1498 }, { "epoch": 0.639505119453925, "grad_norm": 23.919710159301758, "learning_rate": 3.800539083557952e-06, "loss": 2.2656, "step": 1499 }, { "epoch": 0.6399317406143344, "grad_norm": 34.100765228271484, "learning_rate": 3.796046720575023e-06, "loss": 2.2314, "step": 1500 }, { "epoch": 0.640358361774744, "grad_norm": 25.631540298461914, "learning_rate": 3.791554357592094e-06, "loss": 1.9922, "step": 1501 }, { "epoch": 0.6407849829351536, "grad_norm": 20.974817276000977, "learning_rate": 3.7870619946091646e-06, "loss": 2.8428, "step": 1502 }, { "epoch": 0.6412116040955631, "grad_norm": 20.296192169189453, "learning_rate": 3.7825696316262357e-06, "loss": 1.9897, "step": 1503 }, { "epoch": 0.6416382252559727, "grad_norm": 24.699539184570312, "learning_rate": 3.7780772686433063e-06, "loss": 2.4717, "step": 1504 }, { "epoch": 0.6420648464163823, "grad_norm": 18.056325912475586, "learning_rate": 3.7735849056603777e-06, "loss": 2.0938, "step": 1505 }, { "epoch": 0.6424914675767918, "grad_norm": 28.3708553314209, "learning_rate": 3.769092542677449e-06, "loss": 2.582, "step": 1506 }, { "epoch": 0.6429180887372014, "grad_norm": 23.238800048828125, "learning_rate": 3.76460017969452e-06, "loss": 2.3086, "step": 1507 }, { "epoch": 0.643344709897611, "grad_norm": 26.795642852783203, "learning_rate": 3.7601078167115905e-06, "loss": 2.4912, "step": 1508 }, { "epoch": 0.6437713310580204, "grad_norm": 29.788188934326172, "learning_rate": 3.7556154537286615e-06, "loss": 2.4717, "step": 1509 }, { "epoch": 0.64419795221843, "grad_norm": 22.72492218017578, "learning_rate": 3.7511230907457325e-06, "loss": 2.1494, "step": 1510 }, { "epoch": 0.6446245733788396, "grad_norm": 42.80537414550781, "learning_rate": 3.746630727762803e-06, "loss": 2.3721, "step": 1511 }, { "epoch": 0.6450511945392492, "grad_norm": 29.475852966308594, "learning_rate": 3.742138364779874e-06, "loss": 2.7007, "step": 1512 }, { "epoch": 0.6454778156996587, "grad_norm": 18.431743621826172, "learning_rate": 3.7376460017969457e-06, "loss": 2.1729, "step": 1513 }, { "epoch": 0.6459044368600683, "grad_norm": 25.59565544128418, "learning_rate": 3.7331536388140167e-06, "loss": 2.335, "step": 1514 }, { "epoch": 0.6463310580204779, "grad_norm": 17.255495071411133, "learning_rate": 3.7286612758310873e-06, "loss": 1.9688, "step": 1515 }, { "epoch": 0.6467576791808873, "grad_norm": 16.406015396118164, "learning_rate": 3.7241689128481584e-06, "loss": 2.2363, "step": 1516 }, { "epoch": 0.6471843003412969, "grad_norm": 16.080997467041016, "learning_rate": 3.7196765498652294e-06, "loss": 1.8213, "step": 1517 }, { "epoch": 0.6476109215017065, "grad_norm": 18.1623477935791, "learning_rate": 3.7151841868823e-06, "loss": 1.9053, "step": 1518 }, { "epoch": 0.648037542662116, "grad_norm": 29.810514450073242, "learning_rate": 3.710691823899371e-06, "loss": 2.0469, "step": 1519 }, { "epoch": 0.6484641638225256, "grad_norm": 18.05474090576172, "learning_rate": 3.7061994609164426e-06, "loss": 2.0127, "step": 1520 }, { "epoch": 0.6488907849829352, "grad_norm": 31.862010955810547, "learning_rate": 3.7017070979335136e-06, "loss": 2.2441, "step": 1521 }, { "epoch": 0.6493174061433447, "grad_norm": 40.58087921142578, "learning_rate": 3.6972147349505842e-06, "loss": 2.1729, "step": 1522 }, { "epoch": 0.6497440273037542, "grad_norm": 22.69669532775879, "learning_rate": 3.6927223719676553e-06, "loss": 2.2363, "step": 1523 }, { "epoch": 0.6501706484641638, "grad_norm": 25.206192016601562, "learning_rate": 3.6882300089847263e-06, "loss": 2.6113, "step": 1524 }, { "epoch": 0.6505972696245734, "grad_norm": 50.6636962890625, "learning_rate": 3.683737646001797e-06, "loss": 2.6367, "step": 1525 }, { "epoch": 0.6510238907849829, "grad_norm": 36.94301986694336, "learning_rate": 3.679245283018868e-06, "loss": 2.2275, "step": 1526 }, { "epoch": 0.6514505119453925, "grad_norm": 21.014408111572266, "learning_rate": 3.6747529200359395e-06, "loss": 2.4297, "step": 1527 }, { "epoch": 0.6518771331058021, "grad_norm": 30.594039916992188, "learning_rate": 3.6702605570530105e-06, "loss": 2.4385, "step": 1528 }, { "epoch": 0.6523037542662116, "grad_norm": 43.26289367675781, "learning_rate": 3.665768194070081e-06, "loss": 2.3418, "step": 1529 }, { "epoch": 0.6527303754266212, "grad_norm": 30.854293823242188, "learning_rate": 3.661275831087152e-06, "loss": 2.2285, "step": 1530 }, { "epoch": 0.6531569965870307, "grad_norm": 33.77555847167969, "learning_rate": 3.656783468104223e-06, "loss": 2.2373, "step": 1531 }, { "epoch": 0.6535836177474402, "grad_norm": 36.544769287109375, "learning_rate": 3.652291105121294e-06, "loss": 2.8721, "step": 1532 }, { "epoch": 0.6540102389078498, "grad_norm": 32.035823822021484, "learning_rate": 3.647798742138365e-06, "loss": 2.292, "step": 1533 }, { "epoch": 0.6544368600682594, "grad_norm": 21.970911026000977, "learning_rate": 3.6433063791554363e-06, "loss": 2.0488, "step": 1534 }, { "epoch": 0.6548634812286689, "grad_norm": 25.027658462524414, "learning_rate": 3.6388140161725074e-06, "loss": 2.6094, "step": 1535 }, { "epoch": 0.6552901023890785, "grad_norm": 21.497413635253906, "learning_rate": 3.634321653189578e-06, "loss": 2.5479, "step": 1536 }, { "epoch": 0.6557167235494881, "grad_norm": 26.59954261779785, "learning_rate": 3.629829290206649e-06, "loss": 2.3184, "step": 1537 }, { "epoch": 0.6561433447098977, "grad_norm": 20.841184616088867, "learning_rate": 3.6253369272237197e-06, "loss": 2.332, "step": 1538 }, { "epoch": 0.6565699658703071, "grad_norm": 25.912158966064453, "learning_rate": 3.6208445642407907e-06, "loss": 2.498, "step": 1539 }, { "epoch": 0.6569965870307167, "grad_norm": 15.258447647094727, "learning_rate": 3.6163522012578618e-06, "loss": 2.0029, "step": 1540 }, { "epoch": 0.6574232081911263, "grad_norm": 20.002315521240234, "learning_rate": 3.6118598382749332e-06, "loss": 2.2607, "step": 1541 }, { "epoch": 0.6578498293515358, "grad_norm": 19.588613510131836, "learning_rate": 3.607367475292004e-06, "loss": 2.5439, "step": 1542 }, { "epoch": 0.6582764505119454, "grad_norm": 24.894855499267578, "learning_rate": 3.602875112309075e-06, "loss": 2.5576, "step": 1543 }, { "epoch": 0.658703071672355, "grad_norm": 16.874536514282227, "learning_rate": 3.598382749326146e-06, "loss": 2.0537, "step": 1544 }, { "epoch": 0.6591296928327645, "grad_norm": 18.829151153564453, "learning_rate": 3.5938903863432166e-06, "loss": 2.1377, "step": 1545 }, { "epoch": 0.659556313993174, "grad_norm": 19.12894058227539, "learning_rate": 3.5893980233602876e-06, "loss": 2.3105, "step": 1546 }, { "epoch": 0.6599829351535836, "grad_norm": 16.351715087890625, "learning_rate": 3.5849056603773586e-06, "loss": 2.627, "step": 1547 }, { "epoch": 0.6604095563139932, "grad_norm": 27.592687606811523, "learning_rate": 3.58041329739443e-06, "loss": 2.6953, "step": 1548 }, { "epoch": 0.6608361774744027, "grad_norm": 39.048057556152344, "learning_rate": 3.5759209344115007e-06, "loss": 2.4189, "step": 1549 }, { "epoch": 0.6612627986348123, "grad_norm": 17.709571838378906, "learning_rate": 3.5714285714285718e-06, "loss": 2.3457, "step": 1550 }, { "epoch": 0.6616894197952219, "grad_norm": 34.29948806762695, "learning_rate": 3.566936208445643e-06, "loss": 2.085, "step": 1551 }, { "epoch": 0.6621160409556314, "grad_norm": 19.947063446044922, "learning_rate": 3.5624438454627134e-06, "loss": 2.7549, "step": 1552 }, { "epoch": 0.662542662116041, "grad_norm": 20.71929359436035, "learning_rate": 3.5579514824797845e-06, "loss": 1.9424, "step": 1553 }, { "epoch": 0.6629692832764505, "grad_norm": 25.915510177612305, "learning_rate": 3.5534591194968555e-06, "loss": 2.5254, "step": 1554 }, { "epoch": 0.66339590443686, "grad_norm": 18.495929718017578, "learning_rate": 3.548966756513927e-06, "loss": 2.0322, "step": 1555 }, { "epoch": 0.6638225255972696, "grad_norm": 33.1831169128418, "learning_rate": 3.5444743935309976e-06, "loss": 2.4834, "step": 1556 }, { "epoch": 0.6642491467576792, "grad_norm": 24.745229721069336, "learning_rate": 3.5399820305480687e-06, "loss": 2.4292, "step": 1557 }, { "epoch": 0.6646757679180887, "grad_norm": 20.10657501220703, "learning_rate": 3.5354896675651397e-06, "loss": 2.0835, "step": 1558 }, { "epoch": 0.6651023890784983, "grad_norm": 20.765737533569336, "learning_rate": 3.5309973045822103e-06, "loss": 1.7295, "step": 1559 }, { "epoch": 0.6655290102389079, "grad_norm": 38.23501205444336, "learning_rate": 3.5265049415992814e-06, "loss": 2.0771, "step": 1560 }, { "epoch": 0.6659556313993175, "grad_norm": 29.63912010192871, "learning_rate": 3.5220125786163524e-06, "loss": 2.3281, "step": 1561 }, { "epoch": 0.6663822525597269, "grad_norm": 32.683170318603516, "learning_rate": 3.517520215633424e-06, "loss": 2.3936, "step": 1562 }, { "epoch": 0.6668088737201365, "grad_norm": 23.847415924072266, "learning_rate": 3.5130278526504945e-06, "loss": 2.6152, "step": 1563 }, { "epoch": 0.6672354948805461, "grad_norm": 33.72971725463867, "learning_rate": 3.5085354896675656e-06, "loss": 2.4717, "step": 1564 }, { "epoch": 0.6676621160409556, "grad_norm": 29.628982543945312, "learning_rate": 3.5040431266846366e-06, "loss": 2.7285, "step": 1565 }, { "epoch": 0.6680887372013652, "grad_norm": 36.2376823425293, "learning_rate": 3.4995507637017072e-06, "loss": 2.5703, "step": 1566 }, { "epoch": 0.6685153583617748, "grad_norm": 26.659250259399414, "learning_rate": 3.4950584007187783e-06, "loss": 1.9951, "step": 1567 }, { "epoch": 0.6689419795221843, "grad_norm": 29.95134162902832, "learning_rate": 3.4905660377358493e-06, "loss": 2.7998, "step": 1568 }, { "epoch": 0.6693686006825939, "grad_norm": 31.50892448425293, "learning_rate": 3.4860736747529208e-06, "loss": 2.1104, "step": 1569 }, { "epoch": 0.6697952218430034, "grad_norm": 23.725902557373047, "learning_rate": 3.4815813117699914e-06, "loss": 2.6953, "step": 1570 }, { "epoch": 0.6702218430034129, "grad_norm": 24.071653366088867, "learning_rate": 3.4770889487870624e-06, "loss": 1.9536, "step": 1571 }, { "epoch": 0.6706484641638225, "grad_norm": 19.715450286865234, "learning_rate": 3.4725965858041335e-06, "loss": 1.8994, "step": 1572 }, { "epoch": 0.6710750853242321, "grad_norm": 19.194473266601562, "learning_rate": 3.468104222821204e-06, "loss": 2.0479, "step": 1573 }, { "epoch": 0.6715017064846417, "grad_norm": 19.590009689331055, "learning_rate": 3.463611859838275e-06, "loss": 2.2793, "step": 1574 }, { "epoch": 0.6719283276450512, "grad_norm": 27.58205795288086, "learning_rate": 3.4591194968553458e-06, "loss": 2.5371, "step": 1575 }, { "epoch": 0.6723549488054608, "grad_norm": 26.039403915405273, "learning_rate": 3.4546271338724172e-06, "loss": 2.1006, "step": 1576 }, { "epoch": 0.6727815699658704, "grad_norm": 33.50947570800781, "learning_rate": 3.4501347708894883e-06, "loss": 2.7207, "step": 1577 }, { "epoch": 0.6732081911262798, "grad_norm": 43.135765075683594, "learning_rate": 3.4456424079065593e-06, "loss": 2.3198, "step": 1578 }, { "epoch": 0.6736348122866894, "grad_norm": 26.70039939880371, "learning_rate": 3.44115004492363e-06, "loss": 2.9517, "step": 1579 }, { "epoch": 0.674061433447099, "grad_norm": 20.953989028930664, "learning_rate": 3.436657681940701e-06, "loss": 2.5205, "step": 1580 }, { "epoch": 0.6744880546075085, "grad_norm": 19.561216354370117, "learning_rate": 3.432165318957772e-06, "loss": 2.1191, "step": 1581 }, { "epoch": 0.6749146757679181, "grad_norm": 24.886316299438477, "learning_rate": 3.4276729559748427e-06, "loss": 2.126, "step": 1582 }, { "epoch": 0.6753412969283277, "grad_norm": 21.290424346923828, "learning_rate": 3.423180592991914e-06, "loss": 2.2529, "step": 1583 }, { "epoch": 0.6757679180887372, "grad_norm": 34.25997543334961, "learning_rate": 3.418688230008985e-06, "loss": 2.0249, "step": 1584 }, { "epoch": 0.6761945392491467, "grad_norm": 22.012805938720703, "learning_rate": 3.4141958670260562e-06, "loss": 2.7158, "step": 1585 }, { "epoch": 0.6766211604095563, "grad_norm": 24.38823699951172, "learning_rate": 3.409703504043127e-06, "loss": 1.8994, "step": 1586 }, { "epoch": 0.6770477815699659, "grad_norm": 22.14129066467285, "learning_rate": 3.405211141060198e-06, "loss": 2.041, "step": 1587 }, { "epoch": 0.6774744027303754, "grad_norm": 23.154438018798828, "learning_rate": 3.400718778077269e-06, "loss": 1.7139, "step": 1588 }, { "epoch": 0.677901023890785, "grad_norm": 17.61654281616211, "learning_rate": 3.3962264150943395e-06, "loss": 1.834, "step": 1589 }, { "epoch": 0.6783276450511946, "grad_norm": 18.777099609375, "learning_rate": 3.3917340521114106e-06, "loss": 2.3018, "step": 1590 }, { "epoch": 0.6787542662116041, "grad_norm": 22.44559097290039, "learning_rate": 3.387241689128482e-06, "loss": 1.6294, "step": 1591 }, { "epoch": 0.6791808873720137, "grad_norm": 21.086833953857422, "learning_rate": 3.382749326145553e-06, "loss": 2.1709, "step": 1592 }, { "epoch": 0.6796075085324232, "grad_norm": 25.184696197509766, "learning_rate": 3.3782569631626237e-06, "loss": 2.3477, "step": 1593 }, { "epoch": 0.6800341296928327, "grad_norm": 23.782960891723633, "learning_rate": 3.3737646001796948e-06, "loss": 2.8232, "step": 1594 }, { "epoch": 0.6804607508532423, "grad_norm": 41.1099739074707, "learning_rate": 3.369272237196766e-06, "loss": 2.3916, "step": 1595 }, { "epoch": 0.6808873720136519, "grad_norm": 20.48354721069336, "learning_rate": 3.3647798742138364e-06, "loss": 2.2881, "step": 1596 }, { "epoch": 0.6813139931740614, "grad_norm": 19.21649742126465, "learning_rate": 3.3602875112309075e-06, "loss": 2.042, "step": 1597 }, { "epoch": 0.681740614334471, "grad_norm": 20.342247009277344, "learning_rate": 3.355795148247979e-06, "loss": 2.1675, "step": 1598 }, { "epoch": 0.6821672354948806, "grad_norm": 26.08676528930664, "learning_rate": 3.35130278526505e-06, "loss": 2.0991, "step": 1599 }, { "epoch": 0.6825938566552902, "grad_norm": 20.84765625, "learning_rate": 3.3468104222821206e-06, "loss": 1.8174, "step": 1600 }, { "epoch": 0.6830204778156996, "grad_norm": 18.198646545410156, "learning_rate": 3.3423180592991917e-06, "loss": 2.1973, "step": 1601 }, { "epoch": 0.6834470989761092, "grad_norm": 20.363811492919922, "learning_rate": 3.3378256963162627e-06, "loss": 2.6855, "step": 1602 }, { "epoch": 0.6838737201365188, "grad_norm": 21.353164672851562, "learning_rate": 3.3333333333333333e-06, "loss": 2.6387, "step": 1603 }, { "epoch": 0.6843003412969283, "grad_norm": 33.03525161743164, "learning_rate": 3.3288409703504044e-06, "loss": 1.668, "step": 1604 }, { "epoch": 0.6847269624573379, "grad_norm": 22.72640609741211, "learning_rate": 3.324348607367476e-06, "loss": 1.9023, "step": 1605 }, { "epoch": 0.6851535836177475, "grad_norm": 22.232295989990234, "learning_rate": 3.319856244384547e-06, "loss": 2.5342, "step": 1606 }, { "epoch": 0.685580204778157, "grad_norm": 21.776939392089844, "learning_rate": 3.3153638814016175e-06, "loss": 2.4629, "step": 1607 }, { "epoch": 0.6860068259385665, "grad_norm": 32.66164016723633, "learning_rate": 3.3108715184186885e-06, "loss": 2.165, "step": 1608 }, { "epoch": 0.6864334470989761, "grad_norm": 32.1452522277832, "learning_rate": 3.3063791554357596e-06, "loss": 2.0732, "step": 1609 }, { "epoch": 0.6868600682593856, "grad_norm": 19.777278900146484, "learning_rate": 3.30188679245283e-06, "loss": 2.4092, "step": 1610 }, { "epoch": 0.6872866894197952, "grad_norm": 19.71466064453125, "learning_rate": 3.2973944294699013e-06, "loss": 2.1514, "step": 1611 }, { "epoch": 0.6877133105802048, "grad_norm": 17.147441864013672, "learning_rate": 3.2929020664869727e-06, "loss": 1.9487, "step": 1612 }, { "epoch": 0.6881399317406144, "grad_norm": 44.140228271484375, "learning_rate": 3.2884097035040433e-06, "loss": 2.7715, "step": 1613 }, { "epoch": 0.6885665529010239, "grad_norm": 33.57496643066406, "learning_rate": 3.2839173405211144e-06, "loss": 2.7432, "step": 1614 }, { "epoch": 0.6889931740614335, "grad_norm": 18.664264678955078, "learning_rate": 3.2794249775381854e-06, "loss": 2.5352, "step": 1615 }, { "epoch": 0.689419795221843, "grad_norm": 23.951438903808594, "learning_rate": 3.274932614555256e-06, "loss": 2.2256, "step": 1616 }, { "epoch": 0.6898464163822525, "grad_norm": 23.820545196533203, "learning_rate": 3.270440251572327e-06, "loss": 2.1924, "step": 1617 }, { "epoch": 0.6902730375426621, "grad_norm": 20.0013427734375, "learning_rate": 3.265947888589398e-06, "loss": 2.165, "step": 1618 }, { "epoch": 0.6906996587030717, "grad_norm": 20.567819595336914, "learning_rate": 3.2614555256064696e-06, "loss": 2.75, "step": 1619 }, { "epoch": 0.6911262798634812, "grad_norm": 16.655508041381836, "learning_rate": 3.2569631626235402e-06, "loss": 1.8428, "step": 1620 }, { "epoch": 0.6915529010238908, "grad_norm": 17.113733291625977, "learning_rate": 3.2524707996406113e-06, "loss": 2.2402, "step": 1621 }, { "epoch": 0.6919795221843004, "grad_norm": 27.453786849975586, "learning_rate": 3.2479784366576823e-06, "loss": 2.207, "step": 1622 }, { "epoch": 0.6924061433447098, "grad_norm": 32.38252258300781, "learning_rate": 3.243486073674753e-06, "loss": 2.2246, "step": 1623 }, { "epoch": 0.6928327645051194, "grad_norm": 18.718177795410156, "learning_rate": 3.238993710691824e-06, "loss": 2.3203, "step": 1624 }, { "epoch": 0.693259385665529, "grad_norm": 37.67975616455078, "learning_rate": 3.234501347708895e-06, "loss": 1.8008, "step": 1625 }, { "epoch": 0.6936860068259386, "grad_norm": 25.621017456054688, "learning_rate": 3.2300089847259665e-06, "loss": 2.2832, "step": 1626 }, { "epoch": 0.6941126279863481, "grad_norm": 18.550310134887695, "learning_rate": 3.225516621743037e-06, "loss": 2.1826, "step": 1627 }, { "epoch": 0.6945392491467577, "grad_norm": 31.40336799621582, "learning_rate": 3.221024258760108e-06, "loss": 2.0342, "step": 1628 }, { "epoch": 0.6949658703071673, "grad_norm": 21.5510196685791, "learning_rate": 3.216531895777179e-06, "loss": 2.5488, "step": 1629 }, { "epoch": 0.6953924914675768, "grad_norm": 16.434173583984375, "learning_rate": 3.21203953279425e-06, "loss": 1.8301, "step": 1630 }, { "epoch": 0.6958191126279863, "grad_norm": 23.86673355102539, "learning_rate": 3.207547169811321e-06, "loss": 1.6816, "step": 1631 }, { "epoch": 0.6962457337883959, "grad_norm": 22.883087158203125, "learning_rate": 3.203054806828392e-06, "loss": 2.374, "step": 1632 }, { "epoch": 0.6966723549488054, "grad_norm": 18.144968032836914, "learning_rate": 3.1985624438454634e-06, "loss": 2.3555, "step": 1633 }, { "epoch": 0.697098976109215, "grad_norm": 19.690826416015625, "learning_rate": 3.194070080862534e-06, "loss": 2.1045, "step": 1634 }, { "epoch": 0.6975255972696246, "grad_norm": 23.947898864746094, "learning_rate": 3.189577717879605e-06, "loss": 2.3008, "step": 1635 }, { "epoch": 0.6979522184300341, "grad_norm": 21.14999008178711, "learning_rate": 3.185085354896676e-06, "loss": 2.1064, "step": 1636 }, { "epoch": 0.6983788395904437, "grad_norm": 26.988557815551758, "learning_rate": 3.1805929919137467e-06, "loss": 1.9912, "step": 1637 }, { "epoch": 0.6988054607508533, "grad_norm": 27.08625602722168, "learning_rate": 3.1761006289308178e-06, "loss": 2.4316, "step": 1638 }, { "epoch": 0.6992320819112628, "grad_norm": 23.413265228271484, "learning_rate": 3.171608265947889e-06, "loss": 2.2451, "step": 1639 }, { "epoch": 0.6996587030716723, "grad_norm": 21.323143005371094, "learning_rate": 3.1671159029649603e-06, "loss": 1.9639, "step": 1640 }, { "epoch": 0.7000853242320819, "grad_norm": 23.912181854248047, "learning_rate": 3.162623539982031e-06, "loss": 2.1152, "step": 1641 }, { "epoch": 0.7005119453924915, "grad_norm": 23.545801162719727, "learning_rate": 3.158131176999102e-06, "loss": 2.4561, "step": 1642 }, { "epoch": 0.700938566552901, "grad_norm": 29.719528198242188, "learning_rate": 3.153638814016173e-06, "loss": 2.4512, "step": 1643 }, { "epoch": 0.7013651877133106, "grad_norm": 16.677494049072266, "learning_rate": 3.1491464510332436e-06, "loss": 2.0938, "step": 1644 }, { "epoch": 0.7017918088737202, "grad_norm": 29.851078033447266, "learning_rate": 3.1446540880503146e-06, "loss": 2.5488, "step": 1645 }, { "epoch": 0.7022184300341296, "grad_norm": 26.422513961791992, "learning_rate": 3.1401617250673853e-06, "loss": 2.4424, "step": 1646 }, { "epoch": 0.7026450511945392, "grad_norm": 22.49760627746582, "learning_rate": 3.1356693620844567e-06, "loss": 2.6123, "step": 1647 }, { "epoch": 0.7030716723549488, "grad_norm": 37.57074737548828, "learning_rate": 3.1311769991015278e-06, "loss": 2.4365, "step": 1648 }, { "epoch": 0.7034982935153583, "grad_norm": 19.417375564575195, "learning_rate": 3.126684636118599e-06, "loss": 2.084, "step": 1649 }, { "epoch": 0.7039249146757679, "grad_norm": 46.048404693603516, "learning_rate": 3.1221922731356694e-06, "loss": 2.7559, "step": 1650 }, { "epoch": 0.7043515358361775, "grad_norm": 24.032854080200195, "learning_rate": 3.1176999101527405e-06, "loss": 2.4941, "step": 1651 }, { "epoch": 0.7047781569965871, "grad_norm": 27.37352180480957, "learning_rate": 3.1132075471698115e-06, "loss": 2.5059, "step": 1652 }, { "epoch": 0.7052047781569966, "grad_norm": 18.343231201171875, "learning_rate": 3.108715184186882e-06, "loss": 1.9766, "step": 1653 }, { "epoch": 0.7056313993174061, "grad_norm": 32.41136169433594, "learning_rate": 3.1042228212039536e-06, "loss": 2.7559, "step": 1654 }, { "epoch": 0.7060580204778157, "grad_norm": 18.58074951171875, "learning_rate": 3.0997304582210247e-06, "loss": 2.3477, "step": 1655 }, { "epoch": 0.7064846416382252, "grad_norm": 17.456533432006836, "learning_rate": 3.0952380952380957e-06, "loss": 2.0459, "step": 1656 }, { "epoch": 0.7069112627986348, "grad_norm": 16.901012420654297, "learning_rate": 3.0907457322551663e-06, "loss": 2.0396, "step": 1657 }, { "epoch": 0.7073378839590444, "grad_norm": 15.16879653930664, "learning_rate": 3.0862533692722374e-06, "loss": 1.8633, "step": 1658 }, { "epoch": 0.7077645051194539, "grad_norm": 22.42111587524414, "learning_rate": 3.0817610062893084e-06, "loss": 2.4502, "step": 1659 }, { "epoch": 0.7081911262798635, "grad_norm": 27.441150665283203, "learning_rate": 3.077268643306379e-06, "loss": 2.5537, "step": 1660 }, { "epoch": 0.7086177474402731, "grad_norm": 24.77878761291504, "learning_rate": 3.0727762803234505e-06, "loss": 1.7925, "step": 1661 }, { "epoch": 0.7090443686006825, "grad_norm": 22.958045959472656, "learning_rate": 3.0682839173405216e-06, "loss": 2.5142, "step": 1662 }, { "epoch": 0.7094709897610921, "grad_norm": 28.620180130004883, "learning_rate": 3.0637915543575926e-06, "loss": 2.3408, "step": 1663 }, { "epoch": 0.7098976109215017, "grad_norm": 32.69829177856445, "learning_rate": 3.0592991913746632e-06, "loss": 2.2031, "step": 1664 }, { "epoch": 0.7103242320819113, "grad_norm": 45.391334533691406, "learning_rate": 3.0548068283917343e-06, "loss": 2.5068, "step": 1665 }, { "epoch": 0.7107508532423208, "grad_norm": 20.9996280670166, "learning_rate": 3.0503144654088053e-06, "loss": 2.5391, "step": 1666 }, { "epoch": 0.7111774744027304, "grad_norm": 23.97358512878418, "learning_rate": 3.045822102425876e-06, "loss": 2.5625, "step": 1667 }, { "epoch": 0.71160409556314, "grad_norm": 20.434316635131836, "learning_rate": 3.0413297394429474e-06, "loss": 2.1904, "step": 1668 }, { "epoch": 0.7120307167235495, "grad_norm": 28.51020622253418, "learning_rate": 3.0368373764600184e-06, "loss": 2.4521, "step": 1669 }, { "epoch": 0.712457337883959, "grad_norm": 16.132415771484375, "learning_rate": 3.0323450134770895e-06, "loss": 1.5845, "step": 1670 }, { "epoch": 0.7128839590443686, "grad_norm": 23.020078659057617, "learning_rate": 3.02785265049416e-06, "loss": 2.9268, "step": 1671 }, { "epoch": 0.7133105802047781, "grad_norm": 27.02225685119629, "learning_rate": 3.023360287511231e-06, "loss": 2.2266, "step": 1672 }, { "epoch": 0.7137372013651877, "grad_norm": 19.597509384155273, "learning_rate": 3.018867924528302e-06, "loss": 2.6143, "step": 1673 }, { "epoch": 0.7141638225255973, "grad_norm": 22.84983253479004, "learning_rate": 3.014375561545373e-06, "loss": 1.9414, "step": 1674 }, { "epoch": 0.7145904436860068, "grad_norm": 29.941665649414062, "learning_rate": 3.009883198562444e-06, "loss": 2.7354, "step": 1675 }, { "epoch": 0.7150170648464164, "grad_norm": 22.785655975341797, "learning_rate": 3.0053908355795153e-06, "loss": 2.1377, "step": 1676 }, { "epoch": 0.715443686006826, "grad_norm": 19.041711807250977, "learning_rate": 3.0008984725965864e-06, "loss": 2.1201, "step": 1677 }, { "epoch": 0.7158703071672355, "grad_norm": 19.834877014160156, "learning_rate": 2.996406109613657e-06, "loss": 2.1113, "step": 1678 }, { "epoch": 0.716296928327645, "grad_norm": 16.664960861206055, "learning_rate": 2.991913746630728e-06, "loss": 2.0352, "step": 1679 }, { "epoch": 0.7167235494880546, "grad_norm": 27.215831756591797, "learning_rate": 2.987421383647799e-06, "loss": 2.1787, "step": 1680 }, { "epoch": 0.7171501706484642, "grad_norm": 19.89889907836914, "learning_rate": 2.9829290206648697e-06, "loss": 2.4629, "step": 1681 }, { "epoch": 0.7175767918088737, "grad_norm": 23.80775260925293, "learning_rate": 2.9784366576819408e-06, "loss": 2.3076, "step": 1682 }, { "epoch": 0.7180034129692833, "grad_norm": 34.298248291015625, "learning_rate": 2.9739442946990122e-06, "loss": 1.8589, "step": 1683 }, { "epoch": 0.7184300341296929, "grad_norm": 29.29851722717285, "learning_rate": 2.969451931716083e-06, "loss": 2.1162, "step": 1684 }, { "epoch": 0.7188566552901023, "grad_norm": 23.561893463134766, "learning_rate": 2.964959568733154e-06, "loss": 2.3486, "step": 1685 }, { "epoch": 0.7192832764505119, "grad_norm": 19.6323299407959, "learning_rate": 2.960467205750225e-06, "loss": 1.9863, "step": 1686 }, { "epoch": 0.7197098976109215, "grad_norm": 31.969934463500977, "learning_rate": 2.9559748427672955e-06, "loss": 2.4453, "step": 1687 }, { "epoch": 0.7201365187713311, "grad_norm": 24.619426727294922, "learning_rate": 2.9514824797843666e-06, "loss": 2.8027, "step": 1688 }, { "epoch": 0.7205631399317406, "grad_norm": 20.186494827270508, "learning_rate": 2.9469901168014376e-06, "loss": 2.4297, "step": 1689 }, { "epoch": 0.7209897610921502, "grad_norm": 18.27107048034668, "learning_rate": 2.942497753818509e-06, "loss": 2.1455, "step": 1690 }, { "epoch": 0.7214163822525598, "grad_norm": 21.095502853393555, "learning_rate": 2.9380053908355797e-06, "loss": 2.6606, "step": 1691 }, { "epoch": 0.7218430034129693, "grad_norm": 20.978729248046875, "learning_rate": 2.9335130278526508e-06, "loss": 2.2842, "step": 1692 }, { "epoch": 0.7222696245733788, "grad_norm": 21.04619026184082, "learning_rate": 2.929020664869722e-06, "loss": 1.2656, "step": 1693 }, { "epoch": 0.7226962457337884, "grad_norm": 28.32137680053711, "learning_rate": 2.9245283018867924e-06, "loss": 2.3525, "step": 1694 }, { "epoch": 0.7231228668941979, "grad_norm": 22.19200325012207, "learning_rate": 2.9200359389038635e-06, "loss": 2.0947, "step": 1695 }, { "epoch": 0.7235494880546075, "grad_norm": 19.34542465209961, "learning_rate": 2.9155435759209345e-06, "loss": 2.1494, "step": 1696 }, { "epoch": 0.7239761092150171, "grad_norm": 23.154399871826172, "learning_rate": 2.911051212938006e-06, "loss": 2.0, "step": 1697 }, { "epoch": 0.7244027303754266, "grad_norm": 27.6765079498291, "learning_rate": 2.9065588499550766e-06, "loss": 2.1758, "step": 1698 }, { "epoch": 0.7248293515358362, "grad_norm": 30.37624740600586, "learning_rate": 2.9020664869721477e-06, "loss": 2.7207, "step": 1699 }, { "epoch": 0.7252559726962458, "grad_norm": 19.52189064025879, "learning_rate": 2.8975741239892187e-06, "loss": 1.8457, "step": 1700 }, { "epoch": 0.7256825938566553, "grad_norm": 31.081382751464844, "learning_rate": 2.8930817610062893e-06, "loss": 2.3071, "step": 1701 }, { "epoch": 0.7261092150170648, "grad_norm": 23.929893493652344, "learning_rate": 2.8885893980233604e-06, "loss": 2.3467, "step": 1702 }, { "epoch": 0.7265358361774744, "grad_norm": 19.086774826049805, "learning_rate": 2.8840970350404314e-06, "loss": 2.0127, "step": 1703 }, { "epoch": 0.726962457337884, "grad_norm": 30.935726165771484, "learning_rate": 2.879604672057503e-06, "loss": 2.4414, "step": 1704 }, { "epoch": 0.7273890784982935, "grad_norm": 33.78768539428711, "learning_rate": 2.8751123090745735e-06, "loss": 1.8154, "step": 1705 }, { "epoch": 0.7278156996587031, "grad_norm": 19.40523338317871, "learning_rate": 2.8706199460916445e-06, "loss": 1.8164, "step": 1706 }, { "epoch": 0.7282423208191127, "grad_norm": 38.8382682800293, "learning_rate": 2.8661275831087156e-06, "loss": 2.1211, "step": 1707 }, { "epoch": 0.7286689419795221, "grad_norm": 26.457233428955078, "learning_rate": 2.861635220125786e-06, "loss": 2.0469, "step": 1708 }, { "epoch": 0.7290955631399317, "grad_norm": 22.73707389831543, "learning_rate": 2.8571428571428573e-06, "loss": 1.8389, "step": 1709 }, { "epoch": 0.7295221843003413, "grad_norm": 49.22174072265625, "learning_rate": 2.8526504941599283e-06, "loss": 2.457, "step": 1710 }, { "epoch": 0.7299488054607508, "grad_norm": 26.232208251953125, "learning_rate": 2.8481581311769998e-06, "loss": 2.3633, "step": 1711 }, { "epoch": 0.7303754266211604, "grad_norm": 23.40569496154785, "learning_rate": 2.8436657681940704e-06, "loss": 2.4282, "step": 1712 }, { "epoch": 0.73080204778157, "grad_norm": 26.51075553894043, "learning_rate": 2.8391734052111414e-06, "loss": 2.5537, "step": 1713 }, { "epoch": 0.7312286689419796, "grad_norm": 20.640514373779297, "learning_rate": 2.8346810422282125e-06, "loss": 2.0527, "step": 1714 }, { "epoch": 0.731655290102389, "grad_norm": 22.921810150146484, "learning_rate": 2.830188679245283e-06, "loss": 2.1309, "step": 1715 }, { "epoch": 0.7320819112627986, "grad_norm": 29.728538513183594, "learning_rate": 2.825696316262354e-06, "loss": 2.7529, "step": 1716 }, { "epoch": 0.7325085324232082, "grad_norm": 24.323043823242188, "learning_rate": 2.821203953279425e-06, "loss": 2.083, "step": 1717 }, { "epoch": 0.7329351535836177, "grad_norm": 26.60840606689453, "learning_rate": 2.8167115902964962e-06, "loss": 2.4717, "step": 1718 }, { "epoch": 0.7333617747440273, "grad_norm": 21.946855545043945, "learning_rate": 2.8122192273135673e-06, "loss": 2.3496, "step": 1719 }, { "epoch": 0.7337883959044369, "grad_norm": 26.811389923095703, "learning_rate": 2.8077268643306383e-06, "loss": 2.1406, "step": 1720 }, { "epoch": 0.7342150170648464, "grad_norm": 18.540809631347656, "learning_rate": 2.803234501347709e-06, "loss": 2.0864, "step": 1721 }, { "epoch": 0.734641638225256, "grad_norm": 23.249303817749023, "learning_rate": 2.79874213836478e-06, "loss": 2.2393, "step": 1722 }, { "epoch": 0.7350682593856656, "grad_norm": 20.395145416259766, "learning_rate": 2.794249775381851e-06, "loss": 2.4502, "step": 1723 }, { "epoch": 0.735494880546075, "grad_norm": 30.138233184814453, "learning_rate": 2.7897574123989217e-06, "loss": 2.1201, "step": 1724 }, { "epoch": 0.7359215017064846, "grad_norm": 26.507150650024414, "learning_rate": 2.785265049415993e-06, "loss": 1.9185, "step": 1725 }, { "epoch": 0.7363481228668942, "grad_norm": 15.685582160949707, "learning_rate": 2.780772686433064e-06, "loss": 2.0293, "step": 1726 }, { "epoch": 0.7367747440273038, "grad_norm": 26.94832420349121, "learning_rate": 2.776280323450135e-06, "loss": 1.9082, "step": 1727 }, { "epoch": 0.7372013651877133, "grad_norm": 28.36393165588379, "learning_rate": 2.771787960467206e-06, "loss": 1.8506, "step": 1728 }, { "epoch": 0.7376279863481229, "grad_norm": 17.436843872070312, "learning_rate": 2.767295597484277e-06, "loss": 2.1221, "step": 1729 }, { "epoch": 0.7380546075085325, "grad_norm": 23.560590744018555, "learning_rate": 2.762803234501348e-06, "loss": 2.4292, "step": 1730 }, { "epoch": 0.738481228668942, "grad_norm": 17.839962005615234, "learning_rate": 2.7583108715184185e-06, "loss": 2.2773, "step": 1731 }, { "epoch": 0.7389078498293515, "grad_norm": 33.70856475830078, "learning_rate": 2.75381850853549e-06, "loss": 2.2441, "step": 1732 }, { "epoch": 0.7393344709897611, "grad_norm": 28.266454696655273, "learning_rate": 2.749326145552561e-06, "loss": 2.4609, "step": 1733 }, { "epoch": 0.7397610921501706, "grad_norm": 20.637826919555664, "learning_rate": 2.744833782569632e-06, "loss": 2.3477, "step": 1734 }, { "epoch": 0.7401877133105802, "grad_norm": 18.084033966064453, "learning_rate": 2.7403414195867027e-06, "loss": 2.168, "step": 1735 }, { "epoch": 0.7406143344709898, "grad_norm": 26.160219192504883, "learning_rate": 2.7358490566037738e-06, "loss": 2.1494, "step": 1736 }, { "epoch": 0.7410409556313993, "grad_norm": 25.901765823364258, "learning_rate": 2.731356693620845e-06, "loss": 2.126, "step": 1737 }, { "epoch": 0.7414675767918089, "grad_norm": 18.53731346130371, "learning_rate": 2.7268643306379154e-06, "loss": 1.915, "step": 1738 }, { "epoch": 0.7418941979522184, "grad_norm": 25.867483139038086, "learning_rate": 2.722371967654987e-06, "loss": 2.4033, "step": 1739 }, { "epoch": 0.742320819112628, "grad_norm": 29.431753158569336, "learning_rate": 2.717879604672058e-06, "loss": 1.8164, "step": 1740 }, { "epoch": 0.7427474402730375, "grad_norm": 21.479326248168945, "learning_rate": 2.713387241689129e-06, "loss": 1.9727, "step": 1741 }, { "epoch": 0.7431740614334471, "grad_norm": 17.52713966369629, "learning_rate": 2.7088948787061996e-06, "loss": 1.5908, "step": 1742 }, { "epoch": 0.7436006825938567, "grad_norm": 19.50530242919922, "learning_rate": 2.7044025157232706e-06, "loss": 1.6367, "step": 1743 }, { "epoch": 0.7440273037542662, "grad_norm": 35.69096755981445, "learning_rate": 2.6999101527403417e-06, "loss": 1.9648, "step": 1744 }, { "epoch": 0.7444539249146758, "grad_norm": 44.18523406982422, "learning_rate": 2.6954177897574123e-06, "loss": 2.3213, "step": 1745 }, { "epoch": 0.7448805460750854, "grad_norm": 22.357803344726562, "learning_rate": 2.6909254267744838e-06, "loss": 1.9614, "step": 1746 }, { "epoch": 0.7453071672354948, "grad_norm": 21.612682342529297, "learning_rate": 2.686433063791555e-06, "loss": 2.5068, "step": 1747 }, { "epoch": 0.7457337883959044, "grad_norm": 31.66046714782715, "learning_rate": 2.681940700808626e-06, "loss": 2.7129, "step": 1748 }, { "epoch": 0.746160409556314, "grad_norm": 23.753938674926758, "learning_rate": 2.6774483378256965e-06, "loss": 2.2041, "step": 1749 }, { "epoch": 0.7465870307167235, "grad_norm": 28.22380256652832, "learning_rate": 2.6729559748427675e-06, "loss": 2.2656, "step": 1750 }, { "epoch": 0.7470136518771331, "grad_norm": 22.26675796508789, "learning_rate": 2.6684636118598386e-06, "loss": 2.3828, "step": 1751 }, { "epoch": 0.7474402730375427, "grad_norm": 18.363147735595703, "learning_rate": 2.663971248876909e-06, "loss": 2.0376, "step": 1752 }, { "epoch": 0.7478668941979523, "grad_norm": 18.026317596435547, "learning_rate": 2.6594788858939802e-06, "loss": 2.0869, "step": 1753 }, { "epoch": 0.7482935153583617, "grad_norm": 23.124431610107422, "learning_rate": 2.6549865229110517e-06, "loss": 1.8809, "step": 1754 }, { "epoch": 0.7487201365187713, "grad_norm": 20.112951278686523, "learning_rate": 2.6504941599281223e-06, "loss": 1.9185, "step": 1755 }, { "epoch": 0.7491467576791809, "grad_norm": 19.65840721130371, "learning_rate": 2.6460017969451934e-06, "loss": 2.3105, "step": 1756 }, { "epoch": 0.7495733788395904, "grad_norm": 17.392486572265625, "learning_rate": 2.6415094339622644e-06, "loss": 2.0371, "step": 1757 }, { "epoch": 0.75, "grad_norm": 23.173776626586914, "learning_rate": 2.637017070979335e-06, "loss": 3.1699, "step": 1758 }, { "epoch": 0.7504266211604096, "grad_norm": 35.75828552246094, "learning_rate": 2.632524707996406e-06, "loss": 2.25, "step": 1759 }, { "epoch": 0.7508532423208191, "grad_norm": 19.388690948486328, "learning_rate": 2.628032345013477e-06, "loss": 2.043, "step": 1760 }, { "epoch": 0.7512798634812287, "grad_norm": 29.570627212524414, "learning_rate": 2.6235399820305486e-06, "loss": 2.1875, "step": 1761 }, { "epoch": 0.7517064846416383, "grad_norm": 24.603408813476562, "learning_rate": 2.6190476190476192e-06, "loss": 2.0171, "step": 1762 }, { "epoch": 0.7521331058020477, "grad_norm": 43.5400276184082, "learning_rate": 2.6145552560646903e-06, "loss": 2.4141, "step": 1763 }, { "epoch": 0.7525597269624573, "grad_norm": 20.218944549560547, "learning_rate": 2.6100628930817613e-06, "loss": 2.2295, "step": 1764 }, { "epoch": 0.7529863481228669, "grad_norm": 16.16379737854004, "learning_rate": 2.605570530098832e-06, "loss": 1.7656, "step": 1765 }, { "epoch": 0.7534129692832765, "grad_norm": 18.501811981201172, "learning_rate": 2.601078167115903e-06, "loss": 2.2217, "step": 1766 }, { "epoch": 0.753839590443686, "grad_norm": 24.61173439025879, "learning_rate": 2.596585804132974e-06, "loss": 2.2686, "step": 1767 }, { "epoch": 0.7542662116040956, "grad_norm": 25.14869499206543, "learning_rate": 2.5920934411500455e-06, "loss": 2.0107, "step": 1768 }, { "epoch": 0.7546928327645052, "grad_norm": 25.4429931640625, "learning_rate": 2.587601078167116e-06, "loss": 2.4429, "step": 1769 }, { "epoch": 0.7551194539249146, "grad_norm": 34.942222595214844, "learning_rate": 2.583108715184187e-06, "loss": 2.2051, "step": 1770 }, { "epoch": 0.7555460750853242, "grad_norm": 58.64693069458008, "learning_rate": 2.578616352201258e-06, "loss": 2.1289, "step": 1771 }, { "epoch": 0.7559726962457338, "grad_norm": 20.511198043823242, "learning_rate": 2.574123989218329e-06, "loss": 1.7251, "step": 1772 }, { "epoch": 0.7563993174061433, "grad_norm": 19.378904342651367, "learning_rate": 2.5696316262354e-06, "loss": 2.2139, "step": 1773 }, { "epoch": 0.7568259385665529, "grad_norm": 26.095415115356445, "learning_rate": 2.565139263252471e-06, "loss": 2.1777, "step": 1774 }, { "epoch": 0.7572525597269625, "grad_norm": 22.2766056060791, "learning_rate": 2.5606469002695424e-06, "loss": 2.3418, "step": 1775 }, { "epoch": 0.757679180887372, "grad_norm": 23.590354919433594, "learning_rate": 2.556154537286613e-06, "loss": 2.1255, "step": 1776 }, { "epoch": 0.7581058020477816, "grad_norm": 20.151622772216797, "learning_rate": 2.551662174303684e-06, "loss": 2.3418, "step": 1777 }, { "epoch": 0.7585324232081911, "grad_norm": 22.800518035888672, "learning_rate": 2.547169811320755e-06, "loss": 2.1279, "step": 1778 }, { "epoch": 0.7589590443686007, "grad_norm": 20.73748207092285, "learning_rate": 2.5426774483378257e-06, "loss": 2.291, "step": 1779 }, { "epoch": 0.7593856655290102, "grad_norm": 33.89189147949219, "learning_rate": 2.5381850853548968e-06, "loss": 2.3872, "step": 1780 }, { "epoch": 0.7598122866894198, "grad_norm": 28.626930236816406, "learning_rate": 2.533692722371968e-06, "loss": 2.3867, "step": 1781 }, { "epoch": 0.7602389078498294, "grad_norm": 18.091033935546875, "learning_rate": 2.5292003593890393e-06, "loss": 1.8428, "step": 1782 }, { "epoch": 0.7606655290102389, "grad_norm": 24.8004093170166, "learning_rate": 2.52470799640611e-06, "loss": 2.3076, "step": 1783 }, { "epoch": 0.7610921501706485, "grad_norm": 27.434419631958008, "learning_rate": 2.520215633423181e-06, "loss": 2.0391, "step": 1784 }, { "epoch": 0.761518771331058, "grad_norm": 21.95925521850586, "learning_rate": 2.515723270440252e-06, "loss": 2.0225, "step": 1785 }, { "epoch": 0.7619453924914675, "grad_norm": 20.50782585144043, "learning_rate": 2.5112309074573226e-06, "loss": 1.7798, "step": 1786 }, { "epoch": 0.7623720136518771, "grad_norm": 23.383893966674805, "learning_rate": 2.5067385444743936e-06, "loss": 1.7754, "step": 1787 }, { "epoch": 0.7627986348122867, "grad_norm": 30.2733154296875, "learning_rate": 2.5022461814914647e-06, "loss": 1.896, "step": 1788 }, { "epoch": 0.7632252559726962, "grad_norm": 50.34253692626953, "learning_rate": 2.4977538185085357e-06, "loss": 3.1006, "step": 1789 }, { "epoch": 0.7636518771331058, "grad_norm": 31.713327407836914, "learning_rate": 2.4932614555256063e-06, "loss": 2.1416, "step": 1790 }, { "epoch": 0.7640784982935154, "grad_norm": 24.32350730895996, "learning_rate": 2.488769092542678e-06, "loss": 2.6396, "step": 1791 }, { "epoch": 0.764505119453925, "grad_norm": 20.80146026611328, "learning_rate": 2.4842767295597484e-06, "loss": 2.4419, "step": 1792 }, { "epoch": 0.7649317406143344, "grad_norm": 17.979833602905273, "learning_rate": 2.4797843665768195e-06, "loss": 2.0811, "step": 1793 }, { "epoch": 0.765358361774744, "grad_norm": 33.28745651245117, "learning_rate": 2.4752920035938905e-06, "loss": 1.9746, "step": 1794 }, { "epoch": 0.7657849829351536, "grad_norm": 20.66737174987793, "learning_rate": 2.4707996406109616e-06, "loss": 2.0127, "step": 1795 }, { "epoch": 0.7662116040955631, "grad_norm": 24.25516700744629, "learning_rate": 2.4663072776280326e-06, "loss": 2.1421, "step": 1796 }, { "epoch": 0.7666382252559727, "grad_norm": 31.5616455078125, "learning_rate": 2.4618149146451032e-06, "loss": 2.6055, "step": 1797 }, { "epoch": 0.7670648464163823, "grad_norm": 25.29386329650879, "learning_rate": 2.4573225516621747e-06, "loss": 2.2949, "step": 1798 }, { "epoch": 0.7674914675767918, "grad_norm": 19.253997802734375, "learning_rate": 2.4528301886792453e-06, "loss": 2.3252, "step": 1799 }, { "epoch": 0.7679180887372014, "grad_norm": 32.48329162597656, "learning_rate": 2.4483378256963164e-06, "loss": 2.1895, "step": 1800 }, { "epoch": 0.768344709897611, "grad_norm": 28.606746673583984, "learning_rate": 2.4438454627133874e-06, "loss": 1.7744, "step": 1801 }, { "epoch": 0.7687713310580204, "grad_norm": 26.95733642578125, "learning_rate": 2.4393530997304585e-06, "loss": 2.6348, "step": 1802 }, { "epoch": 0.76919795221843, "grad_norm": 24.73307991027832, "learning_rate": 2.4348607367475295e-06, "loss": 2.4717, "step": 1803 }, { "epoch": 0.7696245733788396, "grad_norm": 33.115753173828125, "learning_rate": 2.4303683737646e-06, "loss": 2.4561, "step": 1804 }, { "epoch": 0.7700511945392492, "grad_norm": 24.048250198364258, "learning_rate": 2.4258760107816716e-06, "loss": 1.9028, "step": 1805 }, { "epoch": 0.7704778156996587, "grad_norm": 24.314634323120117, "learning_rate": 2.421383647798742e-06, "loss": 2.0, "step": 1806 }, { "epoch": 0.7709044368600683, "grad_norm": 29.49398422241211, "learning_rate": 2.4168912848158133e-06, "loss": 2.3843, "step": 1807 }, { "epoch": 0.7713310580204779, "grad_norm": 17.011449813842773, "learning_rate": 2.4123989218328843e-06, "loss": 1.7373, "step": 1808 }, { "epoch": 0.7717576791808873, "grad_norm": 31.276538848876953, "learning_rate": 2.4079065588499553e-06, "loss": 2.4697, "step": 1809 }, { "epoch": 0.7721843003412969, "grad_norm": 31.9920711517334, "learning_rate": 2.4034141958670264e-06, "loss": 2.0557, "step": 1810 }, { "epoch": 0.7726109215017065, "grad_norm": 23.170621871948242, "learning_rate": 2.398921832884097e-06, "loss": 2.3584, "step": 1811 }, { "epoch": 0.773037542662116, "grad_norm": 25.794071197509766, "learning_rate": 2.3944294699011685e-06, "loss": 2.5537, "step": 1812 }, { "epoch": 0.7734641638225256, "grad_norm": 22.407812118530273, "learning_rate": 2.389937106918239e-06, "loss": 2.1099, "step": 1813 }, { "epoch": 0.7738907849829352, "grad_norm": 20.0296630859375, "learning_rate": 2.38544474393531e-06, "loss": 1.8984, "step": 1814 }, { "epoch": 0.7743174061433447, "grad_norm": 18.59185218811035, "learning_rate": 2.380952380952381e-06, "loss": 2.2271, "step": 1815 }, { "epoch": 0.7747440273037542, "grad_norm": 23.263097763061523, "learning_rate": 2.3764600179694522e-06, "loss": 2.3281, "step": 1816 }, { "epoch": 0.7751706484641638, "grad_norm": 20.494384765625, "learning_rate": 2.3719676549865233e-06, "loss": 2.3438, "step": 1817 }, { "epoch": 0.7755972696245734, "grad_norm": 30.46561050415039, "learning_rate": 2.367475292003594e-06, "loss": 1.9331, "step": 1818 }, { "epoch": 0.7760238907849829, "grad_norm": 18.048294067382812, "learning_rate": 2.3629829290206654e-06, "loss": 2.1006, "step": 1819 }, { "epoch": 0.7764505119453925, "grad_norm": 22.645097732543945, "learning_rate": 2.358490566037736e-06, "loss": 2.4961, "step": 1820 }, { "epoch": 0.7768771331058021, "grad_norm": 36.97308349609375, "learning_rate": 2.353998203054807e-06, "loss": 2.7051, "step": 1821 }, { "epoch": 0.7773037542662116, "grad_norm": 20.397632598876953, "learning_rate": 2.349505840071878e-06, "loss": 2.3193, "step": 1822 }, { "epoch": 0.7777303754266212, "grad_norm": 18.061418533325195, "learning_rate": 2.345013477088949e-06, "loss": 1.8418, "step": 1823 }, { "epoch": 0.7781569965870307, "grad_norm": 26.37966537475586, "learning_rate": 2.34052111410602e-06, "loss": 2.2393, "step": 1824 }, { "epoch": 0.7785836177474402, "grad_norm": 18.162370681762695, "learning_rate": 2.3360287511230908e-06, "loss": 2.0254, "step": 1825 }, { "epoch": 0.7790102389078498, "grad_norm": 24.261823654174805, "learning_rate": 2.331536388140162e-06, "loss": 2.1113, "step": 1826 }, { "epoch": 0.7794368600682594, "grad_norm": 36.012611389160156, "learning_rate": 2.327044025157233e-06, "loss": 1.8486, "step": 1827 }, { "epoch": 0.7798634812286689, "grad_norm": 21.801942825317383, "learning_rate": 2.322551662174304e-06, "loss": 1.9272, "step": 1828 }, { "epoch": 0.7802901023890785, "grad_norm": 20.881542205810547, "learning_rate": 2.3180592991913745e-06, "loss": 2.333, "step": 1829 }, { "epoch": 0.7807167235494881, "grad_norm": 18.64068031311035, "learning_rate": 2.313566936208446e-06, "loss": 2.1831, "step": 1830 }, { "epoch": 0.7811433447098977, "grad_norm": 20.850994110107422, "learning_rate": 2.3090745732255166e-06, "loss": 2.0732, "step": 1831 }, { "epoch": 0.7815699658703071, "grad_norm": 22.28333282470703, "learning_rate": 2.3045822102425877e-06, "loss": 1.7607, "step": 1832 }, { "epoch": 0.7819965870307167, "grad_norm": 30.814220428466797, "learning_rate": 2.3000898472596587e-06, "loss": 2.5879, "step": 1833 }, { "epoch": 0.7824232081911263, "grad_norm": 31.240434646606445, "learning_rate": 2.2955974842767298e-06, "loss": 2.3564, "step": 1834 }, { "epoch": 0.7828498293515358, "grad_norm": 25.117435455322266, "learning_rate": 2.291105121293801e-06, "loss": 2.0645, "step": 1835 }, { "epoch": 0.7832764505119454, "grad_norm": 45.28719711303711, "learning_rate": 2.2866127583108714e-06, "loss": 1.9028, "step": 1836 }, { "epoch": 0.783703071672355, "grad_norm": 17.77985191345215, "learning_rate": 2.282120395327943e-06, "loss": 2.1016, "step": 1837 }, { "epoch": 0.7841296928327645, "grad_norm": 31.70467185974121, "learning_rate": 2.2776280323450135e-06, "loss": 2.3066, "step": 1838 }, { "epoch": 0.784556313993174, "grad_norm": 20.322813034057617, "learning_rate": 2.2731356693620846e-06, "loss": 2.3457, "step": 1839 }, { "epoch": 0.7849829351535836, "grad_norm": 22.35746955871582, "learning_rate": 2.2686433063791556e-06, "loss": 2.2314, "step": 1840 }, { "epoch": 0.7854095563139932, "grad_norm": 19.00843048095703, "learning_rate": 2.2641509433962266e-06, "loss": 2.0537, "step": 1841 }, { "epoch": 0.7858361774744027, "grad_norm": 23.502687454223633, "learning_rate": 2.2596585804132977e-06, "loss": 2.5869, "step": 1842 }, { "epoch": 0.7862627986348123, "grad_norm": 23.143768310546875, "learning_rate": 2.2551662174303683e-06, "loss": 1.8096, "step": 1843 }, { "epoch": 0.7866894197952219, "grad_norm": 16.373092651367188, "learning_rate": 2.2506738544474398e-06, "loss": 1.606, "step": 1844 }, { "epoch": 0.7871160409556314, "grad_norm": 18.551097869873047, "learning_rate": 2.2461814914645104e-06, "loss": 1.8193, "step": 1845 }, { "epoch": 0.787542662116041, "grad_norm": 22.2662410736084, "learning_rate": 2.2416891284815814e-06, "loss": 1.958, "step": 1846 }, { "epoch": 0.7879692832764505, "grad_norm": 49.5590705871582, "learning_rate": 2.2371967654986525e-06, "loss": 2.3447, "step": 1847 }, { "epoch": 0.78839590443686, "grad_norm": 42.248992919921875, "learning_rate": 2.2327044025157235e-06, "loss": 2.0059, "step": 1848 }, { "epoch": 0.7888225255972696, "grad_norm": 58.42856216430664, "learning_rate": 2.2282120395327946e-06, "loss": 2.5, "step": 1849 }, { "epoch": 0.7892491467576792, "grad_norm": 35.04775619506836, "learning_rate": 2.223719676549865e-06, "loss": 2.4609, "step": 1850 }, { "epoch": 0.7896757679180887, "grad_norm": 21.540834426879883, "learning_rate": 2.2192273135669367e-06, "loss": 2.0635, "step": 1851 }, { "epoch": 0.7901023890784983, "grad_norm": 46.99656295776367, "learning_rate": 2.2147349505840073e-06, "loss": 2.1025, "step": 1852 }, { "epoch": 0.7905290102389079, "grad_norm": 27.469890594482422, "learning_rate": 2.2102425876010783e-06, "loss": 2.3389, "step": 1853 }, { "epoch": 0.7909556313993175, "grad_norm": 23.75452995300293, "learning_rate": 2.2057502246181494e-06, "loss": 2.1201, "step": 1854 }, { "epoch": 0.7913822525597269, "grad_norm": 19.295225143432617, "learning_rate": 2.2012578616352204e-06, "loss": 1.9268, "step": 1855 }, { "epoch": 0.7918088737201365, "grad_norm": 24.537572860717773, "learning_rate": 2.1967654986522915e-06, "loss": 2.1846, "step": 1856 }, { "epoch": 0.7922354948805461, "grad_norm": 25.888763427734375, "learning_rate": 2.192273135669362e-06, "loss": 2.3018, "step": 1857 }, { "epoch": 0.7926621160409556, "grad_norm": 17.448314666748047, "learning_rate": 2.1877807726864336e-06, "loss": 1.6963, "step": 1858 }, { "epoch": 0.7930887372013652, "grad_norm": 31.14927101135254, "learning_rate": 2.183288409703504e-06, "loss": 2.1484, "step": 1859 }, { "epoch": 0.7935153583617748, "grad_norm": 28.934234619140625, "learning_rate": 2.1787960467205752e-06, "loss": 2.0508, "step": 1860 }, { "epoch": 0.7939419795221843, "grad_norm": 19.254159927368164, "learning_rate": 2.174303683737646e-06, "loss": 1.8345, "step": 1861 }, { "epoch": 0.7943686006825939, "grad_norm": 22.62273406982422, "learning_rate": 2.1698113207547173e-06, "loss": 2.6045, "step": 1862 }, { "epoch": 0.7947952218430034, "grad_norm": 26.880126953125, "learning_rate": 2.165318957771788e-06, "loss": 2.0889, "step": 1863 }, { "epoch": 0.7952218430034129, "grad_norm": 25.2117977142334, "learning_rate": 2.160826594788859e-06, "loss": 2.2568, "step": 1864 }, { "epoch": 0.7956484641638225, "grad_norm": 47.15989303588867, "learning_rate": 2.15633423180593e-06, "loss": 1.9131, "step": 1865 }, { "epoch": 0.7960750853242321, "grad_norm": 43.305965423583984, "learning_rate": 2.151841868823001e-06, "loss": 2.1338, "step": 1866 }, { "epoch": 0.7965017064846417, "grad_norm": 31.032060623168945, "learning_rate": 2.147349505840072e-06, "loss": 2.2998, "step": 1867 }, { "epoch": 0.7969283276450512, "grad_norm": 31.633068084716797, "learning_rate": 2.1428571428571427e-06, "loss": 2.02, "step": 1868 }, { "epoch": 0.7973549488054608, "grad_norm": 25.194854736328125, "learning_rate": 2.138364779874214e-06, "loss": 2.2529, "step": 1869 }, { "epoch": 0.7977815699658704, "grad_norm": 23.624929428100586, "learning_rate": 2.133872416891285e-06, "loss": 2.0996, "step": 1870 }, { "epoch": 0.7982081911262798, "grad_norm": 24.722183227539062, "learning_rate": 2.129380053908356e-06, "loss": 2.0752, "step": 1871 }, { "epoch": 0.7986348122866894, "grad_norm": 28.390928268432617, "learning_rate": 2.124887690925427e-06, "loss": 1.793, "step": 1872 }, { "epoch": 0.799061433447099, "grad_norm": 17.568288803100586, "learning_rate": 2.120395327942498e-06, "loss": 1.8057, "step": 1873 }, { "epoch": 0.7994880546075085, "grad_norm": 25.567970275878906, "learning_rate": 2.115902964959569e-06, "loss": 1.7949, "step": 1874 }, { "epoch": 0.7999146757679181, "grad_norm": 20.945209503173828, "learning_rate": 2.1114106019766396e-06, "loss": 2.3408, "step": 1875 }, { "epoch": 0.8003412969283277, "grad_norm": 23.48021697998047, "learning_rate": 2.106918238993711e-06, "loss": 2.0088, "step": 1876 }, { "epoch": 0.8007679180887372, "grad_norm": 31.483278274536133, "learning_rate": 2.1024258760107817e-06, "loss": 2.6606, "step": 1877 }, { "epoch": 0.8011945392491467, "grad_norm": 21.021120071411133, "learning_rate": 2.0979335130278528e-06, "loss": 1.9497, "step": 1878 }, { "epoch": 0.8016211604095563, "grad_norm": 20.318021774291992, "learning_rate": 2.093441150044924e-06, "loss": 2.3223, "step": 1879 }, { "epoch": 0.8020477815699659, "grad_norm": 33.12303924560547, "learning_rate": 2.088948787061995e-06, "loss": 1.3691, "step": 1880 }, { "epoch": 0.8024744027303754, "grad_norm": 20.866891860961914, "learning_rate": 2.084456424079066e-06, "loss": 1.8916, "step": 1881 }, { "epoch": 0.802901023890785, "grad_norm": 33.17778396606445, "learning_rate": 2.0799640610961365e-06, "loss": 2.2188, "step": 1882 }, { "epoch": 0.8033276450511946, "grad_norm": 27.898754119873047, "learning_rate": 2.075471698113208e-06, "loss": 2.0117, "step": 1883 }, { "epoch": 0.8037542662116041, "grad_norm": 25.94223976135254, "learning_rate": 2.0709793351302786e-06, "loss": 1.7227, "step": 1884 }, { "epoch": 0.8041808873720137, "grad_norm": 21.092864990234375, "learning_rate": 2.0664869721473496e-06, "loss": 1.9883, "step": 1885 }, { "epoch": 0.8046075085324232, "grad_norm": 28.999971389770508, "learning_rate": 2.0619946091644207e-06, "loss": 1.9463, "step": 1886 }, { "epoch": 0.8050341296928327, "grad_norm": 22.959543228149414, "learning_rate": 2.0575022461814917e-06, "loss": 1.6489, "step": 1887 }, { "epoch": 0.8054607508532423, "grad_norm": 26.268436431884766, "learning_rate": 2.0530098831985628e-06, "loss": 1.9351, "step": 1888 }, { "epoch": 0.8058873720136519, "grad_norm": 24.802099227905273, "learning_rate": 2.0485175202156334e-06, "loss": 2.6807, "step": 1889 }, { "epoch": 0.8063139931740614, "grad_norm": 24.925498962402344, "learning_rate": 2.044025157232705e-06, "loss": 2.2598, "step": 1890 }, { "epoch": 0.806740614334471, "grad_norm": 27.905561447143555, "learning_rate": 2.0395327942497755e-06, "loss": 1.9902, "step": 1891 }, { "epoch": 0.8071672354948806, "grad_norm": 29.847583770751953, "learning_rate": 2.0350404312668465e-06, "loss": 2.1709, "step": 1892 }, { "epoch": 0.8075938566552902, "grad_norm": 27.835908889770508, "learning_rate": 2.0305480682839176e-06, "loss": 2.6738, "step": 1893 }, { "epoch": 0.8080204778156996, "grad_norm": 31.87246322631836, "learning_rate": 2.0260557053009886e-06, "loss": 2.5508, "step": 1894 }, { "epoch": 0.8084470989761092, "grad_norm": 22.407638549804688, "learning_rate": 2.0215633423180597e-06, "loss": 2.5898, "step": 1895 }, { "epoch": 0.8088737201365188, "grad_norm": 23.444799423217773, "learning_rate": 2.0170709793351303e-06, "loss": 2.5332, "step": 1896 }, { "epoch": 0.8093003412969283, "grad_norm": 24.204666137695312, "learning_rate": 2.0125786163522013e-06, "loss": 1.9404, "step": 1897 }, { "epoch": 0.8097269624573379, "grad_norm": 22.92106056213379, "learning_rate": 2.0080862533692724e-06, "loss": 1.8535, "step": 1898 }, { "epoch": 0.8101535836177475, "grad_norm": 24.548683166503906, "learning_rate": 2.0035938903863434e-06, "loss": 1.8535, "step": 1899 }, { "epoch": 0.810580204778157, "grad_norm": 20.212099075317383, "learning_rate": 1.999101527403414e-06, "loss": 2.0874, "step": 1900 }, { "epoch": 0.8110068259385665, "grad_norm": 20.215652465820312, "learning_rate": 1.9946091644204855e-06, "loss": 1.9922, "step": 1901 }, { "epoch": 0.8114334470989761, "grad_norm": 23.670936584472656, "learning_rate": 1.990116801437556e-06, "loss": 2.3438, "step": 1902 }, { "epoch": 0.8118600682593856, "grad_norm": 29.262372970581055, "learning_rate": 1.985624438454627e-06, "loss": 2.1133, "step": 1903 }, { "epoch": 0.8122866894197952, "grad_norm": 38.6277961730957, "learning_rate": 1.981132075471698e-06, "loss": 2.124, "step": 1904 }, { "epoch": 0.8127133105802048, "grad_norm": 34.57408142089844, "learning_rate": 1.9766397124887693e-06, "loss": 2.1533, "step": 1905 }, { "epoch": 0.8131399317406144, "grad_norm": 22.756662368774414, "learning_rate": 1.9721473495058403e-06, "loss": 2.4146, "step": 1906 }, { "epoch": 0.8135665529010239, "grad_norm": 23.002607345581055, "learning_rate": 1.967654986522911e-06, "loss": 1.7754, "step": 1907 }, { "epoch": 0.8139931740614335, "grad_norm": 22.411535263061523, "learning_rate": 1.9631626235399824e-06, "loss": 2.0752, "step": 1908 }, { "epoch": 0.814419795221843, "grad_norm": 43.06706619262695, "learning_rate": 1.958670260557053e-06, "loss": 2.0039, "step": 1909 }, { "epoch": 0.8148464163822525, "grad_norm": 33.41255187988281, "learning_rate": 1.954177897574124e-06, "loss": 2.0869, "step": 1910 }, { "epoch": 0.8152730375426621, "grad_norm": 18.505645751953125, "learning_rate": 1.949685534591195e-06, "loss": 1.9316, "step": 1911 }, { "epoch": 0.8156996587030717, "grad_norm": 25.759122848510742, "learning_rate": 1.945193171608266e-06, "loss": 2.1436, "step": 1912 }, { "epoch": 0.8161262798634812, "grad_norm": 22.365253448486328, "learning_rate": 1.940700808625337e-06, "loss": 2.2036, "step": 1913 }, { "epoch": 0.8165529010238908, "grad_norm": 21.153892517089844, "learning_rate": 1.936208445642408e-06, "loss": 2.0137, "step": 1914 }, { "epoch": 0.8169795221843004, "grad_norm": 20.937232971191406, "learning_rate": 1.9317160826594793e-06, "loss": 2.2578, "step": 1915 }, { "epoch": 0.8174061433447098, "grad_norm": 19.788347244262695, "learning_rate": 1.92722371967655e-06, "loss": 1.9375, "step": 1916 }, { "epoch": 0.8178327645051194, "grad_norm": 35.839080810546875, "learning_rate": 1.922731356693621e-06, "loss": 2.4355, "step": 1917 }, { "epoch": 0.818259385665529, "grad_norm": 24.199609756469727, "learning_rate": 1.918238993710692e-06, "loss": 2.417, "step": 1918 }, { "epoch": 0.8186860068259386, "grad_norm": 17.443004608154297, "learning_rate": 1.913746630727763e-06, "loss": 1.7695, "step": 1919 }, { "epoch": 0.8191126279863481, "grad_norm": 42.2618293762207, "learning_rate": 1.909254267744834e-06, "loss": 1.9414, "step": 1920 }, { "epoch": 0.8195392491467577, "grad_norm": 20.51604652404785, "learning_rate": 1.904761904761905e-06, "loss": 2.1553, "step": 1921 }, { "epoch": 0.8199658703071673, "grad_norm": 18.936643600463867, "learning_rate": 1.900269541778976e-06, "loss": 2.4023, "step": 1922 }, { "epoch": 0.8203924914675768, "grad_norm": 28.36236000061035, "learning_rate": 1.895777178796047e-06, "loss": 1.9023, "step": 1923 }, { "epoch": 0.8208191126279863, "grad_norm": 17.966564178466797, "learning_rate": 1.8912848158131178e-06, "loss": 1.5234, "step": 1924 }, { "epoch": 0.8212457337883959, "grad_norm": 23.814815521240234, "learning_rate": 1.8867924528301889e-06, "loss": 2.0996, "step": 1925 }, { "epoch": 0.8216723549488054, "grad_norm": 22.847030639648438, "learning_rate": 1.88230008984726e-06, "loss": 2.1987, "step": 1926 }, { "epoch": 0.822098976109215, "grad_norm": 23.444608688354492, "learning_rate": 1.8778077268643308e-06, "loss": 1.8022, "step": 1927 }, { "epoch": 0.8225255972696246, "grad_norm": 16.979387283325195, "learning_rate": 1.8733153638814016e-06, "loss": 1.7539, "step": 1928 }, { "epoch": 0.8229522184300341, "grad_norm": 21.849397659301758, "learning_rate": 1.8688230008984728e-06, "loss": 2.2803, "step": 1929 }, { "epoch": 0.8233788395904437, "grad_norm": 27.620882034301758, "learning_rate": 1.8643306379155437e-06, "loss": 2.2627, "step": 1930 }, { "epoch": 0.8238054607508533, "grad_norm": 44.100711822509766, "learning_rate": 1.8598382749326147e-06, "loss": 1.9717, "step": 1931 }, { "epoch": 0.8242320819112628, "grad_norm": 25.661659240722656, "learning_rate": 1.8553459119496855e-06, "loss": 1.8379, "step": 1932 }, { "epoch": 0.8246587030716723, "grad_norm": 23.532838821411133, "learning_rate": 1.8508535489667568e-06, "loss": 2.4883, "step": 1933 }, { "epoch": 0.8250853242320819, "grad_norm": 33.77391815185547, "learning_rate": 1.8463611859838276e-06, "loss": 2.2764, "step": 1934 }, { "epoch": 0.8255119453924915, "grad_norm": 33.51873016357422, "learning_rate": 1.8418688230008985e-06, "loss": 1.7642, "step": 1935 }, { "epoch": 0.825938566552901, "grad_norm": 19.887741088867188, "learning_rate": 1.8373764600179697e-06, "loss": 2.0903, "step": 1936 }, { "epoch": 0.8263651877133106, "grad_norm": 33.04109573364258, "learning_rate": 1.8328840970350406e-06, "loss": 2.3115, "step": 1937 }, { "epoch": 0.8267918088737202, "grad_norm": 21.50177574157715, "learning_rate": 1.8283917340521116e-06, "loss": 2.127, "step": 1938 }, { "epoch": 0.8272184300341296, "grad_norm": 24.439558029174805, "learning_rate": 1.8238993710691824e-06, "loss": 2.4102, "step": 1939 }, { "epoch": 0.8276450511945392, "grad_norm": 31.59778594970703, "learning_rate": 1.8194070080862537e-06, "loss": 2.3262, "step": 1940 }, { "epoch": 0.8280716723549488, "grad_norm": 23.083101272583008, "learning_rate": 1.8149146451033245e-06, "loss": 2.3291, "step": 1941 }, { "epoch": 0.8284982935153583, "grad_norm": 17.35047149658203, "learning_rate": 1.8104222821203954e-06, "loss": 1.8423, "step": 1942 }, { "epoch": 0.8289249146757679, "grad_norm": 22.005908966064453, "learning_rate": 1.8059299191374666e-06, "loss": 1.9536, "step": 1943 }, { "epoch": 0.8293515358361775, "grad_norm": 22.35020637512207, "learning_rate": 1.8014375561545374e-06, "loss": 2.1006, "step": 1944 }, { "epoch": 0.8297781569965871, "grad_norm": 26.536985397338867, "learning_rate": 1.7969451931716083e-06, "loss": 2.0322, "step": 1945 }, { "epoch": 0.8302047781569966, "grad_norm": 20.950786590576172, "learning_rate": 1.7924528301886793e-06, "loss": 2.1045, "step": 1946 }, { "epoch": 0.8306313993174061, "grad_norm": 28.460447311401367, "learning_rate": 1.7879604672057504e-06, "loss": 2.0654, "step": 1947 }, { "epoch": 0.8310580204778157, "grad_norm": 24.567773818969727, "learning_rate": 1.7834681042228214e-06, "loss": 2.2036, "step": 1948 }, { "epoch": 0.8314846416382252, "grad_norm": 26.11224937438965, "learning_rate": 1.7789757412398922e-06, "loss": 1.8745, "step": 1949 }, { "epoch": 0.8319112627986348, "grad_norm": 22.54220962524414, "learning_rate": 1.7744833782569635e-06, "loss": 1.6982, "step": 1950 }, { "epoch": 0.8323378839590444, "grad_norm": 24.108007431030273, "learning_rate": 1.7699910152740343e-06, "loss": 2.042, "step": 1951 }, { "epoch": 0.8327645051194539, "grad_norm": 16.941404342651367, "learning_rate": 1.7654986522911052e-06, "loss": 1.5225, "step": 1952 }, { "epoch": 0.8331911262798635, "grad_norm": 31.694730758666992, "learning_rate": 1.7610062893081762e-06, "loss": 1.9097, "step": 1953 }, { "epoch": 0.8336177474402731, "grad_norm": 20.774200439453125, "learning_rate": 1.7565139263252473e-06, "loss": 1.9844, "step": 1954 }, { "epoch": 0.8340443686006825, "grad_norm": 27.799217224121094, "learning_rate": 1.7520215633423183e-06, "loss": 2.0264, "step": 1955 }, { "epoch": 0.8344709897610921, "grad_norm": 30.80106544494629, "learning_rate": 1.7475292003593891e-06, "loss": 2.6523, "step": 1956 }, { "epoch": 0.8348976109215017, "grad_norm": 20.44435691833496, "learning_rate": 1.7430368373764604e-06, "loss": 1.7144, "step": 1957 }, { "epoch": 0.8353242320819113, "grad_norm": 31.098176956176758, "learning_rate": 1.7385444743935312e-06, "loss": 2.502, "step": 1958 }, { "epoch": 0.8357508532423208, "grad_norm": 25.127107620239258, "learning_rate": 1.734052111410602e-06, "loss": 2.125, "step": 1959 }, { "epoch": 0.8361774744027304, "grad_norm": 27.822410583496094, "learning_rate": 1.7295597484276729e-06, "loss": 2.1826, "step": 1960 }, { "epoch": 0.83660409556314, "grad_norm": 22.251161575317383, "learning_rate": 1.7250673854447441e-06, "loss": 2.0444, "step": 1961 }, { "epoch": 0.8370307167235495, "grad_norm": 23.80228042602539, "learning_rate": 1.720575022461815e-06, "loss": 2.4683, "step": 1962 }, { "epoch": 0.837457337883959, "grad_norm": 22.37078857421875, "learning_rate": 1.716082659478886e-06, "loss": 2.1426, "step": 1963 }, { "epoch": 0.8378839590443686, "grad_norm": 19.09315299987793, "learning_rate": 1.711590296495957e-06, "loss": 1.8899, "step": 1964 }, { "epoch": 0.8383105802047781, "grad_norm": 23.49970817565918, "learning_rate": 1.7070979335130281e-06, "loss": 2.5645, "step": 1965 }, { "epoch": 0.8387372013651877, "grad_norm": 68.0501480102539, "learning_rate": 1.702605570530099e-06, "loss": 2.2539, "step": 1966 }, { "epoch": 0.8391638225255973, "grad_norm": 24.918535232543945, "learning_rate": 1.6981132075471698e-06, "loss": 2.1025, "step": 1967 }, { "epoch": 0.8395904436860068, "grad_norm": 30.718915939331055, "learning_rate": 1.693620844564241e-06, "loss": 1.6104, "step": 1968 }, { "epoch": 0.8400170648464164, "grad_norm": 24.890552520751953, "learning_rate": 1.6891284815813119e-06, "loss": 2.416, "step": 1969 }, { "epoch": 0.840443686006826, "grad_norm": 23.353469848632812, "learning_rate": 1.684636118598383e-06, "loss": 2.7842, "step": 1970 }, { "epoch": 0.8408703071672355, "grad_norm": 21.08226776123047, "learning_rate": 1.6801437556154537e-06, "loss": 1.8774, "step": 1971 }, { "epoch": 0.841296928327645, "grad_norm": 20.481369018554688, "learning_rate": 1.675651392632525e-06, "loss": 2.2529, "step": 1972 }, { "epoch": 0.8417235494880546, "grad_norm": 37.47206115722656, "learning_rate": 1.6711590296495958e-06, "loss": 2.1377, "step": 1973 }, { "epoch": 0.8421501706484642, "grad_norm": 21.767244338989258, "learning_rate": 1.6666666666666667e-06, "loss": 1.8887, "step": 1974 }, { "epoch": 0.8425767918088737, "grad_norm": 25.5743350982666, "learning_rate": 1.662174303683738e-06, "loss": 2.293, "step": 1975 }, { "epoch": 0.8430034129692833, "grad_norm": 21.288652420043945, "learning_rate": 1.6576819407008088e-06, "loss": 1.9062, "step": 1976 }, { "epoch": 0.8434300341296929, "grad_norm": 23.8049259185791, "learning_rate": 1.6531895777178798e-06, "loss": 2.0957, "step": 1977 }, { "epoch": 0.8438566552901023, "grad_norm": 24.071441650390625, "learning_rate": 1.6486972147349506e-06, "loss": 2.2852, "step": 1978 }, { "epoch": 0.8442832764505119, "grad_norm": 23.946063995361328, "learning_rate": 1.6442048517520217e-06, "loss": 1.9229, "step": 1979 }, { "epoch": 0.8447098976109215, "grad_norm": 18.481111526489258, "learning_rate": 1.6397124887690927e-06, "loss": 1.8364, "step": 1980 }, { "epoch": 0.8451365187713311, "grad_norm": 17.902070999145508, "learning_rate": 1.6352201257861635e-06, "loss": 1.9219, "step": 1981 }, { "epoch": 0.8455631399317406, "grad_norm": 23.107677459716797, "learning_rate": 1.6307277628032348e-06, "loss": 1.8792, "step": 1982 }, { "epoch": 0.8459897610921502, "grad_norm": 16.212055206298828, "learning_rate": 1.6262353998203056e-06, "loss": 1.9043, "step": 1983 }, { "epoch": 0.8464163822525598, "grad_norm": 29.245019912719727, "learning_rate": 1.6217430368373765e-06, "loss": 1.8955, "step": 1984 }, { "epoch": 0.8468430034129693, "grad_norm": 29.921899795532227, "learning_rate": 1.6172506738544475e-06, "loss": 2.2031, "step": 1985 }, { "epoch": 0.8472696245733788, "grad_norm": 29.58133316040039, "learning_rate": 1.6127583108715186e-06, "loss": 2.3086, "step": 1986 }, { "epoch": 0.8476962457337884, "grad_norm": 30.39230728149414, "learning_rate": 1.6082659478885896e-06, "loss": 2.1631, "step": 1987 }, { "epoch": 0.8481228668941979, "grad_norm": 24.831098556518555, "learning_rate": 1.6037735849056604e-06, "loss": 2.2432, "step": 1988 }, { "epoch": 0.8485494880546075, "grad_norm": 36.24596405029297, "learning_rate": 1.5992812219227317e-06, "loss": 2.2324, "step": 1989 }, { "epoch": 0.8489761092150171, "grad_norm": 25.26513671875, "learning_rate": 1.5947888589398025e-06, "loss": 2.6475, "step": 1990 }, { "epoch": 0.8494027303754266, "grad_norm": 22.898345947265625, "learning_rate": 1.5902964959568734e-06, "loss": 1.9658, "step": 1991 }, { "epoch": 0.8498293515358362, "grad_norm": 24.595247268676758, "learning_rate": 1.5858041329739444e-06, "loss": 2.373, "step": 1992 }, { "epoch": 0.8502559726962458, "grad_norm": 41.001983642578125, "learning_rate": 1.5813117699910154e-06, "loss": 2.5127, "step": 1993 }, { "epoch": 0.8506825938566553, "grad_norm": 29.098054885864258, "learning_rate": 1.5768194070080865e-06, "loss": 2.0771, "step": 1994 }, { "epoch": 0.8511092150170648, "grad_norm": 19.65403175354004, "learning_rate": 1.5723270440251573e-06, "loss": 2.1309, "step": 1995 }, { "epoch": 0.8515358361774744, "grad_norm": 36.26100158691406, "learning_rate": 1.5678346810422284e-06, "loss": 1.751, "step": 1996 }, { "epoch": 0.851962457337884, "grad_norm": 34.61759567260742, "learning_rate": 1.5633423180592994e-06, "loss": 1.8232, "step": 1997 }, { "epoch": 0.8523890784982935, "grad_norm": 29.321847915649414, "learning_rate": 1.5588499550763702e-06, "loss": 1.8682, "step": 1998 }, { "epoch": 0.8528156996587031, "grad_norm": 20.491769790649414, "learning_rate": 1.554357592093441e-06, "loss": 1.7466, "step": 1999 }, { "epoch": 0.8532423208191127, "grad_norm": 25.29309844970703, "learning_rate": 1.5498652291105123e-06, "loss": 1.8062, "step": 2000 }, { "epoch": 0.8536689419795221, "grad_norm": 21.117168426513672, "learning_rate": 1.5453728661275832e-06, "loss": 2.0132, "step": 2001 }, { "epoch": 0.8540955631399317, "grad_norm": 26.844043731689453, "learning_rate": 1.5408805031446542e-06, "loss": 1.8477, "step": 2002 }, { "epoch": 0.8545221843003413, "grad_norm": 28.656776428222656, "learning_rate": 1.5363881401617253e-06, "loss": 2.5469, "step": 2003 }, { "epoch": 0.8549488054607508, "grad_norm": 20.729461669921875, "learning_rate": 1.5318957771787963e-06, "loss": 1.8193, "step": 2004 }, { "epoch": 0.8553754266211604, "grad_norm": 24.91880226135254, "learning_rate": 1.5274034141958671e-06, "loss": 2.4961, "step": 2005 }, { "epoch": 0.85580204778157, "grad_norm": 19.17313575744629, "learning_rate": 1.522911051212938e-06, "loss": 1.9546, "step": 2006 }, { "epoch": 0.8562286689419796, "grad_norm": 17.52025604248047, "learning_rate": 1.5184186882300092e-06, "loss": 1.6777, "step": 2007 }, { "epoch": 0.856655290102389, "grad_norm": 20.038908004760742, "learning_rate": 1.51392632524708e-06, "loss": 1.9248, "step": 2008 }, { "epoch": 0.8570819112627986, "grad_norm": 41.33238220214844, "learning_rate": 1.509433962264151e-06, "loss": 2.4092, "step": 2009 }, { "epoch": 0.8575085324232082, "grad_norm": 30.137481689453125, "learning_rate": 1.504941599281222e-06, "loss": 2.084, "step": 2010 }, { "epoch": 0.8579351535836177, "grad_norm": 17.346654891967773, "learning_rate": 1.5004492362982932e-06, "loss": 1.4453, "step": 2011 }, { "epoch": 0.8583617747440273, "grad_norm": 26.113149642944336, "learning_rate": 1.495956873315364e-06, "loss": 2.5786, "step": 2012 }, { "epoch": 0.8587883959044369, "grad_norm": 26.139968872070312, "learning_rate": 1.4914645103324349e-06, "loss": 2.1934, "step": 2013 }, { "epoch": 0.8592150170648464, "grad_norm": 24.80109214782715, "learning_rate": 1.4869721473495061e-06, "loss": 2.3604, "step": 2014 }, { "epoch": 0.859641638225256, "grad_norm": 25.143037796020508, "learning_rate": 1.482479784366577e-06, "loss": 2.4048, "step": 2015 }, { "epoch": 0.8600682593856656, "grad_norm": 29.503969192504883, "learning_rate": 1.4779874213836478e-06, "loss": 1.9688, "step": 2016 }, { "epoch": 0.860494880546075, "grad_norm": 20.356889724731445, "learning_rate": 1.4734950584007188e-06, "loss": 2.2007, "step": 2017 }, { "epoch": 0.8609215017064846, "grad_norm": 26.912221908569336, "learning_rate": 1.4690026954177899e-06, "loss": 2.3154, "step": 2018 }, { "epoch": 0.8613481228668942, "grad_norm": 20.486494064331055, "learning_rate": 1.464510332434861e-06, "loss": 2.3149, "step": 2019 }, { "epoch": 0.8617747440273038, "grad_norm": 23.620391845703125, "learning_rate": 1.4600179694519317e-06, "loss": 2.1152, "step": 2020 }, { "epoch": 0.8622013651877133, "grad_norm": 19.62716293334961, "learning_rate": 1.455525606469003e-06, "loss": 1.7969, "step": 2021 }, { "epoch": 0.8626279863481229, "grad_norm": 32.85416030883789, "learning_rate": 1.4510332434860738e-06, "loss": 2.248, "step": 2022 }, { "epoch": 0.8630546075085325, "grad_norm": 18.510087966918945, "learning_rate": 1.4465408805031447e-06, "loss": 1.9766, "step": 2023 }, { "epoch": 0.863481228668942, "grad_norm": 23.601825714111328, "learning_rate": 1.4420485175202157e-06, "loss": 2.5127, "step": 2024 }, { "epoch": 0.8639078498293515, "grad_norm": 42.78569412231445, "learning_rate": 1.4375561545372868e-06, "loss": 2.3135, "step": 2025 }, { "epoch": 0.8643344709897611, "grad_norm": 22.61996841430664, "learning_rate": 1.4330637915543578e-06, "loss": 1.7754, "step": 2026 }, { "epoch": 0.8647610921501706, "grad_norm": 20.096586227416992, "learning_rate": 1.4285714285714286e-06, "loss": 2.1143, "step": 2027 }, { "epoch": 0.8651877133105802, "grad_norm": 32.09673309326172, "learning_rate": 1.4240790655884999e-06, "loss": 1.9922, "step": 2028 }, { "epoch": 0.8656143344709898, "grad_norm": 23.80088233947754, "learning_rate": 1.4195867026055707e-06, "loss": 2.1055, "step": 2029 }, { "epoch": 0.8660409556313993, "grad_norm": 31.186294555664062, "learning_rate": 1.4150943396226415e-06, "loss": 2.3418, "step": 2030 }, { "epoch": 0.8664675767918089, "grad_norm": 24.158933639526367, "learning_rate": 1.4106019766397126e-06, "loss": 2.0342, "step": 2031 }, { "epoch": 0.8668941979522184, "grad_norm": 23.584606170654297, "learning_rate": 1.4061096136567836e-06, "loss": 2.3428, "step": 2032 }, { "epoch": 0.867320819112628, "grad_norm": 19.908782958984375, "learning_rate": 1.4016172506738545e-06, "loss": 1.9004, "step": 2033 }, { "epoch": 0.8677474402730375, "grad_norm": 19.742876052856445, "learning_rate": 1.3971248876909255e-06, "loss": 1.5688, "step": 2034 }, { "epoch": 0.8681740614334471, "grad_norm": 22.75543975830078, "learning_rate": 1.3926325247079966e-06, "loss": 2.5547, "step": 2035 }, { "epoch": 0.8686006825938567, "grad_norm": 32.865848541259766, "learning_rate": 1.3881401617250676e-06, "loss": 2.0195, "step": 2036 }, { "epoch": 0.8690273037542662, "grad_norm": 25.184295654296875, "learning_rate": 1.3836477987421384e-06, "loss": 2.1816, "step": 2037 }, { "epoch": 0.8694539249146758, "grad_norm": 19.12177085876465, "learning_rate": 1.3791554357592093e-06, "loss": 2.3477, "step": 2038 }, { "epoch": 0.8698805460750854, "grad_norm": 21.0264949798584, "learning_rate": 1.3746630727762805e-06, "loss": 1.7646, "step": 2039 }, { "epoch": 0.8703071672354948, "grad_norm": 24.255212783813477, "learning_rate": 1.3701707097933514e-06, "loss": 2.2432, "step": 2040 }, { "epoch": 0.8707337883959044, "grad_norm": 18.39045524597168, "learning_rate": 1.3656783468104224e-06, "loss": 1.6699, "step": 2041 }, { "epoch": 0.871160409556314, "grad_norm": 18.828027725219727, "learning_rate": 1.3611859838274934e-06, "loss": 2.2451, "step": 2042 }, { "epoch": 0.8715870307167235, "grad_norm": 37.50020980834961, "learning_rate": 1.3566936208445645e-06, "loss": 1.7896, "step": 2043 }, { "epoch": 0.8720136518771331, "grad_norm": 19.664005279541016, "learning_rate": 1.3522012578616353e-06, "loss": 1.8555, "step": 2044 }, { "epoch": 0.8724402730375427, "grad_norm": 17.749042510986328, "learning_rate": 1.3477088948787062e-06, "loss": 1.6689, "step": 2045 }, { "epoch": 0.8728668941979523, "grad_norm": 21.117351531982422, "learning_rate": 1.3432165318957774e-06, "loss": 2.3311, "step": 2046 }, { "epoch": 0.8732935153583617, "grad_norm": 20.475221633911133, "learning_rate": 1.3387241689128482e-06, "loss": 1.9712, "step": 2047 }, { "epoch": 0.8737201365187713, "grad_norm": 22.14813804626465, "learning_rate": 1.3342318059299193e-06, "loss": 2.1699, "step": 2048 }, { "epoch": 0.8741467576791809, "grad_norm": 22.878765106201172, "learning_rate": 1.3297394429469901e-06, "loss": 2.1074, "step": 2049 }, { "epoch": 0.8745733788395904, "grad_norm": 20.329811096191406, "learning_rate": 1.3252470799640612e-06, "loss": 1.9375, "step": 2050 }, { "epoch": 0.875, "grad_norm": 23.280149459838867, "learning_rate": 1.3207547169811322e-06, "loss": 1.877, "step": 2051 }, { "epoch": 0.8754266211604096, "grad_norm": 19.771005630493164, "learning_rate": 1.316262353998203e-06, "loss": 1.9712, "step": 2052 }, { "epoch": 0.8758532423208191, "grad_norm": 18.957622528076172, "learning_rate": 1.3117699910152743e-06, "loss": 1.6733, "step": 2053 }, { "epoch": 0.8762798634812287, "grad_norm": 22.287721633911133, "learning_rate": 1.3072776280323451e-06, "loss": 2.3984, "step": 2054 }, { "epoch": 0.8767064846416383, "grad_norm": 28.450151443481445, "learning_rate": 1.302785265049416e-06, "loss": 2.6621, "step": 2055 }, { "epoch": 0.8771331058020477, "grad_norm": 23.43517303466797, "learning_rate": 1.298292902066487e-06, "loss": 1.9453, "step": 2056 }, { "epoch": 0.8775597269624573, "grad_norm": 24.089502334594727, "learning_rate": 1.293800539083558e-06, "loss": 2.1045, "step": 2057 }, { "epoch": 0.8779863481228669, "grad_norm": 26.210840225219727, "learning_rate": 1.289308176100629e-06, "loss": 2.2441, "step": 2058 }, { "epoch": 0.8784129692832765, "grad_norm": 19.64763832092285, "learning_rate": 1.2848158131177e-06, "loss": 2.0576, "step": 2059 }, { "epoch": 0.878839590443686, "grad_norm": 24.991710662841797, "learning_rate": 1.2803234501347712e-06, "loss": 2.0898, "step": 2060 }, { "epoch": 0.8792662116040956, "grad_norm": 19.32874298095703, "learning_rate": 1.275831087151842e-06, "loss": 2.3027, "step": 2061 }, { "epoch": 0.8796928327645052, "grad_norm": 17.936874389648438, "learning_rate": 1.2713387241689129e-06, "loss": 1.6777, "step": 2062 }, { "epoch": 0.8801194539249146, "grad_norm": 26.58176040649414, "learning_rate": 1.266846361185984e-06, "loss": 2.4883, "step": 2063 }, { "epoch": 0.8805460750853242, "grad_norm": 24.924253463745117, "learning_rate": 1.262353998203055e-06, "loss": 2.4746, "step": 2064 }, { "epoch": 0.8809726962457338, "grad_norm": 22.25920295715332, "learning_rate": 1.257861635220126e-06, "loss": 1.6597, "step": 2065 }, { "epoch": 0.8813993174061433, "grad_norm": 21.69879722595215, "learning_rate": 1.2533692722371968e-06, "loss": 2.0581, "step": 2066 }, { "epoch": 0.8818259385665529, "grad_norm": 20.946352005004883, "learning_rate": 1.2488769092542679e-06, "loss": 1.8125, "step": 2067 }, { "epoch": 0.8822525597269625, "grad_norm": 25.7633056640625, "learning_rate": 1.244384546271339e-06, "loss": 1.8984, "step": 2068 }, { "epoch": 0.882679180887372, "grad_norm": 22.418216705322266, "learning_rate": 1.2398921832884097e-06, "loss": 2.0986, "step": 2069 }, { "epoch": 0.8831058020477816, "grad_norm": 24.003862380981445, "learning_rate": 1.2353998203054808e-06, "loss": 2.0674, "step": 2070 }, { "epoch": 0.8835324232081911, "grad_norm": 34.044063568115234, "learning_rate": 1.2309074573225516e-06, "loss": 2.0288, "step": 2071 }, { "epoch": 0.8839590443686007, "grad_norm": 24.938892364501953, "learning_rate": 1.2264150943396227e-06, "loss": 2.3838, "step": 2072 }, { "epoch": 0.8843856655290102, "grad_norm": 33.812461853027344, "learning_rate": 1.2219227313566937e-06, "loss": 2.0166, "step": 2073 }, { "epoch": 0.8848122866894198, "grad_norm": 40.91825866699219, "learning_rate": 1.2174303683737648e-06, "loss": 2.5576, "step": 2074 }, { "epoch": 0.8852389078498294, "grad_norm": 25.86836814880371, "learning_rate": 1.2129380053908358e-06, "loss": 2.1416, "step": 2075 }, { "epoch": 0.8856655290102389, "grad_norm": 20.361167907714844, "learning_rate": 1.2084456424079066e-06, "loss": 2.2021, "step": 2076 }, { "epoch": 0.8860921501706485, "grad_norm": 35.148014068603516, "learning_rate": 1.2039532794249777e-06, "loss": 2.2529, "step": 2077 }, { "epoch": 0.886518771331058, "grad_norm": 19.91792869567871, "learning_rate": 1.1994609164420485e-06, "loss": 2.0381, "step": 2078 }, { "epoch": 0.8869453924914675, "grad_norm": 19.79247283935547, "learning_rate": 1.1949685534591195e-06, "loss": 1.8823, "step": 2079 }, { "epoch": 0.8873720136518771, "grad_norm": 21.4259033203125, "learning_rate": 1.1904761904761906e-06, "loss": 2.6367, "step": 2080 }, { "epoch": 0.8877986348122867, "grad_norm": 22.293596267700195, "learning_rate": 1.1859838274932616e-06, "loss": 2.4014, "step": 2081 }, { "epoch": 0.8882252559726962, "grad_norm": 30.97688102722168, "learning_rate": 1.1814914645103327e-06, "loss": 1.9829, "step": 2082 }, { "epoch": 0.8886518771331058, "grad_norm": 27.427173614501953, "learning_rate": 1.1769991015274035e-06, "loss": 2.7266, "step": 2083 }, { "epoch": 0.8890784982935154, "grad_norm": 23.10563087463379, "learning_rate": 1.1725067385444746e-06, "loss": 2.4023, "step": 2084 }, { "epoch": 0.889505119453925, "grad_norm": 24.243976593017578, "learning_rate": 1.1680143755615454e-06, "loss": 2.5977, "step": 2085 }, { "epoch": 0.8899317406143344, "grad_norm": 26.19727325439453, "learning_rate": 1.1635220125786164e-06, "loss": 2.165, "step": 2086 }, { "epoch": 0.890358361774744, "grad_norm": 21.771196365356445, "learning_rate": 1.1590296495956873e-06, "loss": 2.2637, "step": 2087 }, { "epoch": 0.8907849829351536, "grad_norm": 20.06059455871582, "learning_rate": 1.1545372866127583e-06, "loss": 1.9355, "step": 2088 }, { "epoch": 0.8912116040955631, "grad_norm": 17.9766902923584, "learning_rate": 1.1500449236298294e-06, "loss": 1.5938, "step": 2089 }, { "epoch": 0.8916382252559727, "grad_norm": 27.272201538085938, "learning_rate": 1.1455525606469004e-06, "loss": 1.9111, "step": 2090 }, { "epoch": 0.8920648464163823, "grad_norm": 19.23063850402832, "learning_rate": 1.1410601976639714e-06, "loss": 1.9775, "step": 2091 }, { "epoch": 0.8924914675767918, "grad_norm": 32.15581130981445, "learning_rate": 1.1365678346810423e-06, "loss": 2.1719, "step": 2092 }, { "epoch": 0.8929180887372014, "grad_norm": 17.017885208129883, "learning_rate": 1.1320754716981133e-06, "loss": 1.5283, "step": 2093 }, { "epoch": 0.893344709897611, "grad_norm": 28.969139099121094, "learning_rate": 1.1275831087151842e-06, "loss": 2.2002, "step": 2094 }, { "epoch": 0.8937713310580204, "grad_norm": 30.85367774963379, "learning_rate": 1.1230907457322552e-06, "loss": 2.4268, "step": 2095 }, { "epoch": 0.89419795221843, "grad_norm": 21.826454162597656, "learning_rate": 1.1185983827493262e-06, "loss": 2.123, "step": 2096 }, { "epoch": 0.8946245733788396, "grad_norm": 25.08101463317871, "learning_rate": 1.1141060197663973e-06, "loss": 1.9346, "step": 2097 }, { "epoch": 0.8950511945392492, "grad_norm": 42.20564270019531, "learning_rate": 1.1096136567834683e-06, "loss": 2.4873, "step": 2098 }, { "epoch": 0.8954778156996587, "grad_norm": 23.242401123046875, "learning_rate": 1.1051212938005392e-06, "loss": 2.5371, "step": 2099 }, { "epoch": 0.8959044368600683, "grad_norm": 24.964221954345703, "learning_rate": 1.1006289308176102e-06, "loss": 2.2559, "step": 2100 }, { "epoch": 0.8963310580204779, "grad_norm": 26.806692123413086, "learning_rate": 1.096136567834681e-06, "loss": 2.0908, "step": 2101 }, { "epoch": 0.8967576791808873, "grad_norm": 27.2700252532959, "learning_rate": 1.091644204851752e-06, "loss": 2.1367, "step": 2102 }, { "epoch": 0.8971843003412969, "grad_norm": 16.515722274780273, "learning_rate": 1.087151841868823e-06, "loss": 1.8257, "step": 2103 }, { "epoch": 0.8976109215017065, "grad_norm": 29.197978973388672, "learning_rate": 1.082659478885894e-06, "loss": 2.2861, "step": 2104 }, { "epoch": 0.898037542662116, "grad_norm": 32.033565521240234, "learning_rate": 1.078167115902965e-06, "loss": 2.1191, "step": 2105 }, { "epoch": 0.8984641638225256, "grad_norm": 24.29416847229004, "learning_rate": 1.073674752920036e-06, "loss": 1.9277, "step": 2106 }, { "epoch": 0.8988907849829352, "grad_norm": 21.460018157958984, "learning_rate": 1.069182389937107e-06, "loss": 2.2949, "step": 2107 }, { "epoch": 0.8993174061433447, "grad_norm": 23.199054718017578, "learning_rate": 1.064690026954178e-06, "loss": 2.0605, "step": 2108 }, { "epoch": 0.8997440273037542, "grad_norm": 19.786056518554688, "learning_rate": 1.060197663971249e-06, "loss": 1.981, "step": 2109 }, { "epoch": 0.9001706484641638, "grad_norm": 27.960031509399414, "learning_rate": 1.0557053009883198e-06, "loss": 2.126, "step": 2110 }, { "epoch": 0.9005972696245734, "grad_norm": 24.085905075073242, "learning_rate": 1.0512129380053909e-06, "loss": 2.0376, "step": 2111 }, { "epoch": 0.9010238907849829, "grad_norm": 20.048545837402344, "learning_rate": 1.046720575022462e-06, "loss": 1.6416, "step": 2112 }, { "epoch": 0.9014505119453925, "grad_norm": 21.789766311645508, "learning_rate": 1.042228212039533e-06, "loss": 2.3467, "step": 2113 }, { "epoch": 0.9018771331058021, "grad_norm": 19.590885162353516, "learning_rate": 1.037735849056604e-06, "loss": 1.7197, "step": 2114 }, { "epoch": 0.9023037542662116, "grad_norm": 24.716259002685547, "learning_rate": 1.0332434860736748e-06, "loss": 1.9697, "step": 2115 }, { "epoch": 0.9027303754266212, "grad_norm": 23.647457122802734, "learning_rate": 1.0287511230907459e-06, "loss": 2.0835, "step": 2116 }, { "epoch": 0.9031569965870307, "grad_norm": 24.428579330444336, "learning_rate": 1.0242587601078167e-06, "loss": 1.959, "step": 2117 }, { "epoch": 0.9035836177474402, "grad_norm": 20.997983932495117, "learning_rate": 1.0197663971248877e-06, "loss": 1.7334, "step": 2118 }, { "epoch": 0.9040102389078498, "grad_norm": 21.304641723632812, "learning_rate": 1.0152740341419588e-06, "loss": 1.4746, "step": 2119 }, { "epoch": 0.9044368600682594, "grad_norm": 32.69042205810547, "learning_rate": 1.0107816711590298e-06, "loss": 2.3721, "step": 2120 }, { "epoch": 0.9048634812286689, "grad_norm": 26.37349510192871, "learning_rate": 1.0062893081761007e-06, "loss": 2.2334, "step": 2121 }, { "epoch": 0.9052901023890785, "grad_norm": 37.03302001953125, "learning_rate": 1.0017969451931717e-06, "loss": 2.292, "step": 2122 }, { "epoch": 0.9057167235494881, "grad_norm": 22.1188907623291, "learning_rate": 9.973045822102428e-07, "loss": 2.0234, "step": 2123 }, { "epoch": 0.9061433447098977, "grad_norm": 27.15044593811035, "learning_rate": 9.928122192273136e-07, "loss": 2.0137, "step": 2124 }, { "epoch": 0.9065699658703071, "grad_norm": 24.961999893188477, "learning_rate": 9.883198562443846e-07, "loss": 2.5054, "step": 2125 }, { "epoch": 0.9069965870307167, "grad_norm": 30.560245513916016, "learning_rate": 9.838274932614555e-07, "loss": 1.8213, "step": 2126 }, { "epoch": 0.9074232081911263, "grad_norm": 22.483095169067383, "learning_rate": 9.793351302785265e-07, "loss": 1.5615, "step": 2127 }, { "epoch": 0.9078498293515358, "grad_norm": 28.84935760498047, "learning_rate": 9.748427672955975e-07, "loss": 2.5317, "step": 2128 }, { "epoch": 0.9082764505119454, "grad_norm": 22.43422508239746, "learning_rate": 9.703504043126686e-07, "loss": 1.897, "step": 2129 }, { "epoch": 0.908703071672355, "grad_norm": 23.057863235473633, "learning_rate": 9.658580413297396e-07, "loss": 2.1621, "step": 2130 }, { "epoch": 0.9091296928327645, "grad_norm": 23.351160049438477, "learning_rate": 9.613656783468105e-07, "loss": 1.6934, "step": 2131 }, { "epoch": 0.909556313993174, "grad_norm": 29.212770462036133, "learning_rate": 9.568733153638815e-07, "loss": 2.124, "step": 2132 }, { "epoch": 0.9099829351535836, "grad_norm": 26.707088470458984, "learning_rate": 9.523809523809525e-07, "loss": 2.3135, "step": 2133 }, { "epoch": 0.9104095563139932, "grad_norm": 23.58577537536621, "learning_rate": 9.478885893980235e-07, "loss": 2.2168, "step": 2134 }, { "epoch": 0.9108361774744027, "grad_norm": 21.69426155090332, "learning_rate": 9.433962264150944e-07, "loss": 2.4932, "step": 2135 }, { "epoch": 0.9112627986348123, "grad_norm": 34.198604583740234, "learning_rate": 9.389038634321654e-07, "loss": 2.1826, "step": 2136 }, { "epoch": 0.9116894197952219, "grad_norm": 20.484844207763672, "learning_rate": 9.344115004492364e-07, "loss": 1.6392, "step": 2137 }, { "epoch": 0.9121160409556314, "grad_norm": 48.64004135131836, "learning_rate": 9.299191374663074e-07, "loss": 1.8081, "step": 2138 }, { "epoch": 0.912542662116041, "grad_norm": 36.88442611694336, "learning_rate": 9.254267744833784e-07, "loss": 2.5791, "step": 2139 }, { "epoch": 0.9129692832764505, "grad_norm": 31.48349380493164, "learning_rate": 9.209344115004492e-07, "loss": 1.9502, "step": 2140 }, { "epoch": 0.91339590443686, "grad_norm": 21.401731491088867, "learning_rate": 9.164420485175203e-07, "loss": 2.2046, "step": 2141 }, { "epoch": 0.9138225255972696, "grad_norm": 22.95062255859375, "learning_rate": 9.119496855345912e-07, "loss": 1.7771, "step": 2142 }, { "epoch": 0.9142491467576792, "grad_norm": 17.692584991455078, "learning_rate": 9.074573225516623e-07, "loss": 1.7725, "step": 2143 }, { "epoch": 0.9146757679180887, "grad_norm": 25.14078140258789, "learning_rate": 9.029649595687333e-07, "loss": 2.1123, "step": 2144 }, { "epoch": 0.9151023890784983, "grad_norm": 23.51230239868164, "learning_rate": 8.984725965858041e-07, "loss": 2.0317, "step": 2145 }, { "epoch": 0.9155290102389079, "grad_norm": 40.93922805786133, "learning_rate": 8.939802336028752e-07, "loss": 2.1279, "step": 2146 }, { "epoch": 0.9159556313993175, "grad_norm": 31.59501075744629, "learning_rate": 8.894878706199461e-07, "loss": 2.0869, "step": 2147 }, { "epoch": 0.9163822525597269, "grad_norm": 43.97781753540039, "learning_rate": 8.849955076370172e-07, "loss": 2.0508, "step": 2148 }, { "epoch": 0.9168088737201365, "grad_norm": 29.16642951965332, "learning_rate": 8.805031446540881e-07, "loss": 1.8281, "step": 2149 }, { "epoch": 0.9172354948805461, "grad_norm": 19.41118812561035, "learning_rate": 8.760107816711592e-07, "loss": 1.7861, "step": 2150 }, { "epoch": 0.9176621160409556, "grad_norm": 24.558164596557617, "learning_rate": 8.715184186882302e-07, "loss": 2.6377, "step": 2151 }, { "epoch": 0.9180887372013652, "grad_norm": 16.41124153137207, "learning_rate": 8.67026055705301e-07, "loss": 1.6436, "step": 2152 }, { "epoch": 0.9185153583617748, "grad_norm": 19.3815860748291, "learning_rate": 8.625336927223721e-07, "loss": 1.6597, "step": 2153 }, { "epoch": 0.9189419795221843, "grad_norm": 35.49598693847656, "learning_rate": 8.58041329739443e-07, "loss": 2.2285, "step": 2154 }, { "epoch": 0.9193686006825939, "grad_norm": 28.167762756347656, "learning_rate": 8.535489667565141e-07, "loss": 2.2881, "step": 2155 }, { "epoch": 0.9197952218430034, "grad_norm": 27.260639190673828, "learning_rate": 8.490566037735849e-07, "loss": 2.5161, "step": 2156 }, { "epoch": 0.9202218430034129, "grad_norm": 21.667072296142578, "learning_rate": 8.445642407906559e-07, "loss": 2.2041, "step": 2157 }, { "epoch": 0.9206484641638225, "grad_norm": 19.246713638305664, "learning_rate": 8.400718778077269e-07, "loss": 1.7842, "step": 2158 }, { "epoch": 0.9210750853242321, "grad_norm": 20.60174560546875, "learning_rate": 8.355795148247979e-07, "loss": 1.9756, "step": 2159 }, { "epoch": 0.9215017064846417, "grad_norm": 20.229415893554688, "learning_rate": 8.31087151841869e-07, "loss": 1.6904, "step": 2160 }, { "epoch": 0.9219283276450512, "grad_norm": 19.543737411499023, "learning_rate": 8.265947888589399e-07, "loss": 2.252, "step": 2161 }, { "epoch": 0.9223549488054608, "grad_norm": 20.05359649658203, "learning_rate": 8.221024258760108e-07, "loss": 1.5576, "step": 2162 }, { "epoch": 0.9227815699658704, "grad_norm": 25.795568466186523, "learning_rate": 8.176100628930818e-07, "loss": 1.7412, "step": 2163 }, { "epoch": 0.9232081911262798, "grad_norm": 25.85951805114746, "learning_rate": 8.131176999101528e-07, "loss": 2.7568, "step": 2164 }, { "epoch": 0.9236348122866894, "grad_norm": 25.40492057800293, "learning_rate": 8.086253369272238e-07, "loss": 2.125, "step": 2165 }, { "epoch": 0.924061433447099, "grad_norm": 21.45697784423828, "learning_rate": 8.041329739442948e-07, "loss": 2.0469, "step": 2166 }, { "epoch": 0.9244880546075085, "grad_norm": 27.524532318115234, "learning_rate": 7.996406109613658e-07, "loss": 2.1284, "step": 2167 }, { "epoch": 0.9249146757679181, "grad_norm": 25.877559661865234, "learning_rate": 7.951482479784367e-07, "loss": 2.2461, "step": 2168 }, { "epoch": 0.9253412969283277, "grad_norm": 27.39493179321289, "learning_rate": 7.906558849955077e-07, "loss": 2.1006, "step": 2169 }, { "epoch": 0.9257679180887372, "grad_norm": 31.304426193237305, "learning_rate": 7.861635220125787e-07, "loss": 2.4561, "step": 2170 }, { "epoch": 0.9261945392491467, "grad_norm": 33.025875091552734, "learning_rate": 7.816711590296497e-07, "loss": 1.6978, "step": 2171 }, { "epoch": 0.9266211604095563, "grad_norm": 29.005495071411133, "learning_rate": 7.771787960467205e-07, "loss": 2.3926, "step": 2172 }, { "epoch": 0.9270477815699659, "grad_norm": 46.53617477416992, "learning_rate": 7.726864330637916e-07, "loss": 2.0522, "step": 2173 }, { "epoch": 0.9274744027303754, "grad_norm": 24.20000457763672, "learning_rate": 7.681940700808626e-07, "loss": 2.2446, "step": 2174 }, { "epoch": 0.927901023890785, "grad_norm": 25.25002670288086, "learning_rate": 7.637017070979336e-07, "loss": 2.0361, "step": 2175 }, { "epoch": 0.9283276450511946, "grad_norm": 21.676692962646484, "learning_rate": 7.592093441150046e-07, "loss": 1.8105, "step": 2176 }, { "epoch": 0.9287542662116041, "grad_norm": 29.155437469482422, "learning_rate": 7.547169811320755e-07, "loss": 2.2046, "step": 2177 }, { "epoch": 0.9291808873720137, "grad_norm": 29.175691604614258, "learning_rate": 7.502246181491466e-07, "loss": 2.5146, "step": 2178 }, { "epoch": 0.9296075085324232, "grad_norm": 25.129615783691406, "learning_rate": 7.457322551662174e-07, "loss": 1.9575, "step": 2179 }, { "epoch": 0.9300341296928327, "grad_norm": 24.428255081176758, "learning_rate": 7.412398921832885e-07, "loss": 1.7617, "step": 2180 }, { "epoch": 0.9304607508532423, "grad_norm": 25.9346866607666, "learning_rate": 7.367475292003594e-07, "loss": 1.9551, "step": 2181 }, { "epoch": 0.9308873720136519, "grad_norm": 23.452688217163086, "learning_rate": 7.322551662174305e-07, "loss": 2.2588, "step": 2182 }, { "epoch": 0.9313139931740614, "grad_norm": 33.63352584838867, "learning_rate": 7.277628032345015e-07, "loss": 2.2471, "step": 2183 }, { "epoch": 0.931740614334471, "grad_norm": 19.231609344482422, "learning_rate": 7.232704402515723e-07, "loss": 1.998, "step": 2184 }, { "epoch": 0.9321672354948806, "grad_norm": 18.095294952392578, "learning_rate": 7.187780772686434e-07, "loss": 1.6978, "step": 2185 }, { "epoch": 0.9325938566552902, "grad_norm": 20.472152709960938, "learning_rate": 7.142857142857143e-07, "loss": 1.9365, "step": 2186 }, { "epoch": 0.9330204778156996, "grad_norm": 19.606014251708984, "learning_rate": 7.097933513027854e-07, "loss": 2.1846, "step": 2187 }, { "epoch": 0.9334470989761092, "grad_norm": 19.07863998413086, "learning_rate": 7.053009883198563e-07, "loss": 2.166, "step": 2188 }, { "epoch": 0.9338737201365188, "grad_norm": 21.699003219604492, "learning_rate": 7.008086253369272e-07, "loss": 2.1631, "step": 2189 }, { "epoch": 0.9343003412969283, "grad_norm": 26.416015625, "learning_rate": 6.963162623539983e-07, "loss": 1.8184, "step": 2190 }, { "epoch": 0.9347269624573379, "grad_norm": 28.63286590576172, "learning_rate": 6.918238993710692e-07, "loss": 1.9531, "step": 2191 }, { "epoch": 0.9351535836177475, "grad_norm": 29.21709632873535, "learning_rate": 6.873315363881403e-07, "loss": 2.0801, "step": 2192 }, { "epoch": 0.935580204778157, "grad_norm": 45.255577087402344, "learning_rate": 6.828391734052112e-07, "loss": 2.0156, "step": 2193 }, { "epoch": 0.9360068259385665, "grad_norm": 17.908376693725586, "learning_rate": 6.783468104222822e-07, "loss": 1.876, "step": 2194 }, { "epoch": 0.9364334470989761, "grad_norm": 18.84600257873535, "learning_rate": 6.738544474393531e-07, "loss": 1.7788, "step": 2195 }, { "epoch": 0.9368600682593856, "grad_norm": 21.277381896972656, "learning_rate": 6.693620844564241e-07, "loss": 2.0059, "step": 2196 }, { "epoch": 0.9372866894197952, "grad_norm": 26.39069938659668, "learning_rate": 6.648697214734951e-07, "loss": 2.1885, "step": 2197 }, { "epoch": 0.9377133105802048, "grad_norm": 20.444717407226562, "learning_rate": 6.603773584905661e-07, "loss": 1.6426, "step": 2198 }, { "epoch": 0.9381399317406144, "grad_norm": 26.616222381591797, "learning_rate": 6.558849955076372e-07, "loss": 2.0957, "step": 2199 }, { "epoch": 0.9385665529010239, "grad_norm": 22.825220108032227, "learning_rate": 6.51392632524708e-07, "loss": 1.8066, "step": 2200 }, { "epoch": 0.9389931740614335, "grad_norm": 35.218353271484375, "learning_rate": 6.46900269541779e-07, "loss": 1.8926, "step": 2201 }, { "epoch": 0.939419795221843, "grad_norm": 18.171491622924805, "learning_rate": 6.4240790655885e-07, "loss": 1.9795, "step": 2202 }, { "epoch": 0.9398464163822525, "grad_norm": 22.466522216796875, "learning_rate": 6.37915543575921e-07, "loss": 2.0566, "step": 2203 }, { "epoch": 0.9402730375426621, "grad_norm": 29.288331985473633, "learning_rate": 6.33423180592992e-07, "loss": 2.625, "step": 2204 }, { "epoch": 0.9406996587030717, "grad_norm": 17.484268188476562, "learning_rate": 6.28930817610063e-07, "loss": 1.7607, "step": 2205 }, { "epoch": 0.9411262798634812, "grad_norm": 29.735485076904297, "learning_rate": 6.244384546271339e-07, "loss": 2.2158, "step": 2206 }, { "epoch": 0.9415529010238908, "grad_norm": 18.694398880004883, "learning_rate": 6.199460916442049e-07, "loss": 1.5278, "step": 2207 }, { "epoch": 0.9419795221843004, "grad_norm": 22.542926788330078, "learning_rate": 6.154537286612758e-07, "loss": 1.8477, "step": 2208 }, { "epoch": 0.9424061433447098, "grad_norm": 41.35770034790039, "learning_rate": 6.109613656783469e-07, "loss": 2.6221, "step": 2209 }, { "epoch": 0.9428327645051194, "grad_norm": 22.501956939697266, "learning_rate": 6.064690026954179e-07, "loss": 1.8408, "step": 2210 }, { "epoch": 0.943259385665529, "grad_norm": 21.826805114746094, "learning_rate": 6.019766397124888e-07, "loss": 2.208, "step": 2211 }, { "epoch": 0.9436860068259386, "grad_norm": 19.94651985168457, "learning_rate": 5.974842767295598e-07, "loss": 1.9448, "step": 2212 }, { "epoch": 0.9441126279863481, "grad_norm": 24.20819091796875, "learning_rate": 5.929919137466308e-07, "loss": 1.8105, "step": 2213 }, { "epoch": 0.9445392491467577, "grad_norm": 22.356653213500977, "learning_rate": 5.884995507637018e-07, "loss": 1.5435, "step": 2214 }, { "epoch": 0.9449658703071673, "grad_norm": 30.155969619750977, "learning_rate": 5.840071877807727e-07, "loss": 2.3643, "step": 2215 }, { "epoch": 0.9453924914675768, "grad_norm": 21.76837921142578, "learning_rate": 5.795148247978436e-07, "loss": 1.7051, "step": 2216 }, { "epoch": 0.9458191126279863, "grad_norm": 25.253297805786133, "learning_rate": 5.750224618149147e-07, "loss": 2.3926, "step": 2217 }, { "epoch": 0.9462457337883959, "grad_norm": 17.783184051513672, "learning_rate": 5.705300988319857e-07, "loss": 1.874, "step": 2218 }, { "epoch": 0.9466723549488054, "grad_norm": 26.631513595581055, "learning_rate": 5.660377358490567e-07, "loss": 1.7959, "step": 2219 }, { "epoch": 0.947098976109215, "grad_norm": 23.53889274597168, "learning_rate": 5.615453728661276e-07, "loss": 2.2017, "step": 2220 }, { "epoch": 0.9475255972696246, "grad_norm": 29.182485580444336, "learning_rate": 5.570530098831986e-07, "loss": 1.8428, "step": 2221 }, { "epoch": 0.9479522184300341, "grad_norm": 25.79732894897461, "learning_rate": 5.525606469002696e-07, "loss": 1.8906, "step": 2222 }, { "epoch": 0.9483788395904437, "grad_norm": 23.194297790527344, "learning_rate": 5.480682839173405e-07, "loss": 2.1357, "step": 2223 }, { "epoch": 0.9488054607508533, "grad_norm": 24.474523544311523, "learning_rate": 5.435759209344115e-07, "loss": 2.0537, "step": 2224 }, { "epoch": 0.9492320819112628, "grad_norm": 24.2773380279541, "learning_rate": 5.390835579514825e-07, "loss": 1.9922, "step": 2225 }, { "epoch": 0.9496587030716723, "grad_norm": 24.227388381958008, "learning_rate": 5.345911949685535e-07, "loss": 2.1426, "step": 2226 }, { "epoch": 0.9500853242320819, "grad_norm": 24.159053802490234, "learning_rate": 5.300988319856245e-07, "loss": 2.1465, "step": 2227 }, { "epoch": 0.9505119453924915, "grad_norm": 30.254880905151367, "learning_rate": 5.256064690026954e-07, "loss": 2.248, "step": 2228 }, { "epoch": 0.950938566552901, "grad_norm": 29.864953994750977, "learning_rate": 5.211141060197665e-07, "loss": 1.7378, "step": 2229 }, { "epoch": 0.9513651877133106, "grad_norm": 24.123804092407227, "learning_rate": 5.166217430368374e-07, "loss": 2.5049, "step": 2230 }, { "epoch": 0.9517918088737202, "grad_norm": 42.50309753417969, "learning_rate": 5.121293800539083e-07, "loss": 2.2256, "step": 2231 }, { "epoch": 0.9522184300341296, "grad_norm": 23.206518173217773, "learning_rate": 5.076370170709794e-07, "loss": 2.498, "step": 2232 }, { "epoch": 0.9526450511945392, "grad_norm": 23.798446655273438, "learning_rate": 5.031446540880503e-07, "loss": 1.8755, "step": 2233 }, { "epoch": 0.9530716723549488, "grad_norm": 20.48027229309082, "learning_rate": 4.986522911051214e-07, "loss": 1.7832, "step": 2234 }, { "epoch": 0.9534982935153583, "grad_norm": 24.295406341552734, "learning_rate": 4.941599281221923e-07, "loss": 1.8711, "step": 2235 }, { "epoch": 0.9539249146757679, "grad_norm": 25.365156173706055, "learning_rate": 4.896675651392633e-07, "loss": 2.3271, "step": 2236 }, { "epoch": 0.9543515358361775, "grad_norm": 29.6678466796875, "learning_rate": 4.851752021563343e-07, "loss": 1.7764, "step": 2237 }, { "epoch": 0.9547781569965871, "grad_norm": 23.474645614624023, "learning_rate": 4.806828391734052e-07, "loss": 2.3232, "step": 2238 }, { "epoch": 0.9552047781569966, "grad_norm": 24.556297302246094, "learning_rate": 4.7619047619047623e-07, "loss": 2.2383, "step": 2239 }, { "epoch": 0.9556313993174061, "grad_norm": 35.68256378173828, "learning_rate": 4.716981132075472e-07, "loss": 1.7432, "step": 2240 }, { "epoch": 0.9560580204778157, "grad_norm": 23.850263595581055, "learning_rate": 4.672057502246182e-07, "loss": 1.9561, "step": 2241 }, { "epoch": 0.9564846416382252, "grad_norm": 24.310489654541016, "learning_rate": 4.627133872416892e-07, "loss": 2.6641, "step": 2242 }, { "epoch": 0.9569112627986348, "grad_norm": 26.781076431274414, "learning_rate": 4.5822102425876014e-07, "loss": 2.415, "step": 2243 }, { "epoch": 0.9573378839590444, "grad_norm": 24.84231948852539, "learning_rate": 4.5372866127583113e-07, "loss": 2.3594, "step": 2244 }, { "epoch": 0.9577645051194539, "grad_norm": 22.890363693237305, "learning_rate": 4.4923629829290207e-07, "loss": 2.3408, "step": 2245 }, { "epoch": 0.9581911262798635, "grad_norm": 21.693756103515625, "learning_rate": 4.4474393530997306e-07, "loss": 2.0381, "step": 2246 }, { "epoch": 0.9586177474402731, "grad_norm": 26.951416015625, "learning_rate": 4.4025157232704405e-07, "loss": 2.0249, "step": 2247 }, { "epoch": 0.9590443686006825, "grad_norm": 30.898317337036133, "learning_rate": 4.357592093441151e-07, "loss": 1.9487, "step": 2248 }, { "epoch": 0.9594709897610921, "grad_norm": 18.807350158691406, "learning_rate": 4.3126684636118604e-07, "loss": 1.4497, "step": 2249 }, { "epoch": 0.9598976109215017, "grad_norm": 19.728879928588867, "learning_rate": 4.2677448337825703e-07, "loss": 2.0342, "step": 2250 }, { "epoch": 0.9603242320819113, "grad_norm": 19.783090591430664, "learning_rate": 4.2228212039532797e-07, "loss": 1.6553, "step": 2251 }, { "epoch": 0.9607508532423208, "grad_norm": 21.11236572265625, "learning_rate": 4.1778975741239896e-07, "loss": 1.6157, "step": 2252 }, { "epoch": 0.9611774744027304, "grad_norm": 24.241262435913086, "learning_rate": 4.1329739442946995e-07, "loss": 2.0508, "step": 2253 }, { "epoch": 0.96160409556314, "grad_norm": 22.40394401550293, "learning_rate": 4.088050314465409e-07, "loss": 2.0752, "step": 2254 }, { "epoch": 0.9620307167235495, "grad_norm": 21.405790328979492, "learning_rate": 4.043126684636119e-07, "loss": 2.0957, "step": 2255 }, { "epoch": 0.962457337883959, "grad_norm": 19.25254249572754, "learning_rate": 3.998203054806829e-07, "loss": 1.957, "step": 2256 }, { "epoch": 0.9628839590443686, "grad_norm": 27.212743759155273, "learning_rate": 3.9532794249775386e-07, "loss": 2.1802, "step": 2257 }, { "epoch": 0.9633105802047781, "grad_norm": 31.379535675048828, "learning_rate": 3.9083557951482485e-07, "loss": 1.7871, "step": 2258 }, { "epoch": 0.9637372013651877, "grad_norm": 20.31130027770996, "learning_rate": 3.863432165318958e-07, "loss": 2.0615, "step": 2259 }, { "epoch": 0.9641638225255973, "grad_norm": 26.714658737182617, "learning_rate": 3.818508535489668e-07, "loss": 2.6816, "step": 2260 }, { "epoch": 0.9645904436860068, "grad_norm": 30.048847198486328, "learning_rate": 3.773584905660378e-07, "loss": 2.9863, "step": 2261 }, { "epoch": 0.9650170648464164, "grad_norm": 19.18850326538086, "learning_rate": 3.728661275831087e-07, "loss": 1.9072, "step": 2262 }, { "epoch": 0.965443686006826, "grad_norm": 39.271331787109375, "learning_rate": 3.683737646001797e-07, "loss": 2.2754, "step": 2263 }, { "epoch": 0.9658703071672355, "grad_norm": 20.29155158996582, "learning_rate": 3.6388140161725075e-07, "loss": 1.7832, "step": 2264 }, { "epoch": 0.966296928327645, "grad_norm": 21.201148986816406, "learning_rate": 3.593890386343217e-07, "loss": 1.8188, "step": 2265 }, { "epoch": 0.9667235494880546, "grad_norm": 21.81801414489746, "learning_rate": 3.548966756513927e-07, "loss": 1.6567, "step": 2266 }, { "epoch": 0.9671501706484642, "grad_norm": 22.927953720092773, "learning_rate": 3.504043126684636e-07, "loss": 2.3242, "step": 2267 }, { "epoch": 0.9675767918088737, "grad_norm": 24.306936264038086, "learning_rate": 3.459119496855346e-07, "loss": 1.502, "step": 2268 }, { "epoch": 0.9680034129692833, "grad_norm": 22.41961669921875, "learning_rate": 3.414195867026056e-07, "loss": 2.335, "step": 2269 }, { "epoch": 0.9684300341296929, "grad_norm": 23.747060775756836, "learning_rate": 3.3692722371967654e-07, "loss": 1.9102, "step": 2270 }, { "epoch": 0.9688566552901023, "grad_norm": 22.018306732177734, "learning_rate": 3.3243486073674753e-07, "loss": 1.8613, "step": 2271 }, { "epoch": 0.9692832764505119, "grad_norm": 22.868671417236328, "learning_rate": 3.279424977538186e-07, "loss": 2.1055, "step": 2272 }, { "epoch": 0.9697098976109215, "grad_norm": 29.00540542602539, "learning_rate": 3.234501347708895e-07, "loss": 2.8096, "step": 2273 }, { "epoch": 0.9701365187713311, "grad_norm": 21.77559471130371, "learning_rate": 3.189577717879605e-07, "loss": 1.9175, "step": 2274 }, { "epoch": 0.9705631399317406, "grad_norm": 18.843427658081055, "learning_rate": 3.144654088050315e-07, "loss": 1.7012, "step": 2275 }, { "epoch": 0.9709897610921502, "grad_norm": 24.1773681640625, "learning_rate": 3.0997304582210244e-07, "loss": 1.5874, "step": 2276 }, { "epoch": 0.9714163822525598, "grad_norm": 19.41014862060547, "learning_rate": 3.0548068283917343e-07, "loss": 1.395, "step": 2277 }, { "epoch": 0.9718430034129693, "grad_norm": 25.66033363342285, "learning_rate": 3.009883198562444e-07, "loss": 3.2275, "step": 2278 }, { "epoch": 0.9722696245733788, "grad_norm": 30.046892166137695, "learning_rate": 2.964959568733154e-07, "loss": 1.9316, "step": 2279 }, { "epoch": 0.9726962457337884, "grad_norm": 20.475860595703125, "learning_rate": 2.9200359389038635e-07, "loss": 1.7725, "step": 2280 }, { "epoch": 0.9731228668941979, "grad_norm": 24.3115177154541, "learning_rate": 2.8751123090745734e-07, "loss": 1.875, "step": 2281 }, { "epoch": 0.9735494880546075, "grad_norm": 18.358522415161133, "learning_rate": 2.8301886792452833e-07, "loss": 1.6748, "step": 2282 }, { "epoch": 0.9739761092150171, "grad_norm": 20.390560150146484, "learning_rate": 2.785265049415993e-07, "loss": 1.7437, "step": 2283 }, { "epoch": 0.9744027303754266, "grad_norm": 36.942039489746094, "learning_rate": 2.7403414195867026e-07, "loss": 2.3301, "step": 2284 }, { "epoch": 0.9748293515358362, "grad_norm": 21.772939682006836, "learning_rate": 2.6954177897574125e-07, "loss": 2.1104, "step": 2285 }, { "epoch": 0.9752559726962458, "grad_norm": 22.524234771728516, "learning_rate": 2.6504941599281224e-07, "loss": 1.8799, "step": 2286 }, { "epoch": 0.9756825938566553, "grad_norm": 23.909032821655273, "learning_rate": 2.6055705300988324e-07, "loss": 1.793, "step": 2287 }, { "epoch": 0.9761092150170648, "grad_norm": 34.19670486450195, "learning_rate": 2.560646900269542e-07, "loss": 2.3564, "step": 2288 }, { "epoch": 0.9765358361774744, "grad_norm": 23.46267318725586, "learning_rate": 2.5157232704402517e-07, "loss": 1.8989, "step": 2289 }, { "epoch": 0.976962457337884, "grad_norm": 25.99132537841797, "learning_rate": 2.4707996406109616e-07, "loss": 1.6729, "step": 2290 }, { "epoch": 0.9773890784982935, "grad_norm": 19.899215698242188, "learning_rate": 2.4258760107816715e-07, "loss": 2.0142, "step": 2291 }, { "epoch": 0.9778156996587031, "grad_norm": 32.16071701049805, "learning_rate": 2.3809523809523811e-07, "loss": 2.2734, "step": 2292 }, { "epoch": 0.9782423208191127, "grad_norm": 21.155961990356445, "learning_rate": 2.336028751123091e-07, "loss": 1.6182, "step": 2293 }, { "epoch": 0.9786689419795221, "grad_norm": 19.982471466064453, "learning_rate": 2.2911051212938007e-07, "loss": 2.0479, "step": 2294 }, { "epoch": 0.9790955631399317, "grad_norm": 26.075380325317383, "learning_rate": 2.2461814914645103e-07, "loss": 2.0459, "step": 2295 }, { "epoch": 0.9795221843003413, "grad_norm": 21.82196617126465, "learning_rate": 2.2012578616352203e-07, "loss": 1.7998, "step": 2296 }, { "epoch": 0.9799488054607508, "grad_norm": 17.56633186340332, "learning_rate": 2.1563342318059302e-07, "loss": 1.6953, "step": 2297 }, { "epoch": 0.9803754266211604, "grad_norm": 20.110218048095703, "learning_rate": 2.1114106019766398e-07, "loss": 2.0469, "step": 2298 }, { "epoch": 0.98080204778157, "grad_norm": 20.577871322631836, "learning_rate": 2.0664869721473497e-07, "loss": 1.853, "step": 2299 }, { "epoch": 0.9812286689419796, "grad_norm": 26.926685333251953, "learning_rate": 2.0215633423180594e-07, "loss": 2.0679, "step": 2300 }, { "epoch": 0.981655290102389, "grad_norm": 26.033254623413086, "learning_rate": 1.9766397124887693e-07, "loss": 2.124, "step": 2301 }, { "epoch": 0.9820819112627986, "grad_norm": 36.75836944580078, "learning_rate": 1.931716082659479e-07, "loss": 1.7207, "step": 2302 }, { "epoch": 0.9825085324232082, "grad_norm": 17.078088760375977, "learning_rate": 1.886792452830189e-07, "loss": 1.6279, "step": 2303 }, { "epoch": 0.9829351535836177, "grad_norm": 23.182506561279297, "learning_rate": 1.8418688230008985e-07, "loss": 2.2988, "step": 2304 }, { "epoch": 0.9833617747440273, "grad_norm": 25.592905044555664, "learning_rate": 1.7969451931716084e-07, "loss": 2.0176, "step": 2305 }, { "epoch": 0.9837883959044369, "grad_norm": 21.9675350189209, "learning_rate": 1.752021563342318e-07, "loss": 1.9124, "step": 2306 }, { "epoch": 0.9842150170648464, "grad_norm": 26.200014114379883, "learning_rate": 1.707097933513028e-07, "loss": 1.8384, "step": 2307 }, { "epoch": 0.984641638225256, "grad_norm": 37.688297271728516, "learning_rate": 1.6621743036837377e-07, "loss": 1.9126, "step": 2308 }, { "epoch": 0.9850682593856656, "grad_norm": 33.95469284057617, "learning_rate": 1.6172506738544476e-07, "loss": 2.4209, "step": 2309 }, { "epoch": 0.985494880546075, "grad_norm": 32.37906265258789, "learning_rate": 1.5723270440251575e-07, "loss": 2.2676, "step": 2310 }, { "epoch": 0.9859215017064846, "grad_norm": 22.61172866821289, "learning_rate": 1.5274034141958671e-07, "loss": 2.1543, "step": 2311 }, { "epoch": 0.9863481228668942, "grad_norm": 23.850786209106445, "learning_rate": 1.482479784366577e-07, "loss": 2.0547, "step": 2312 }, { "epoch": 0.9867747440273038, "grad_norm": 26.80950164794922, "learning_rate": 1.4375561545372867e-07, "loss": 2.3789, "step": 2313 }, { "epoch": 0.9872013651877133, "grad_norm": 27.270933151245117, "learning_rate": 1.3926325247079966e-07, "loss": 2.1729, "step": 2314 }, { "epoch": 0.9876279863481229, "grad_norm": 20.2312068939209, "learning_rate": 1.3477088948787063e-07, "loss": 1.8989, "step": 2315 }, { "epoch": 0.9880546075085325, "grad_norm": 28.13861846923828, "learning_rate": 1.3027852650494162e-07, "loss": 2.0293, "step": 2316 }, { "epoch": 0.988481228668942, "grad_norm": 25.935266494750977, "learning_rate": 1.2578616352201258e-07, "loss": 1.7109, "step": 2317 }, { "epoch": 0.9889078498293515, "grad_norm": 36.28443145751953, "learning_rate": 1.2129380053908357e-07, "loss": 2.1348, "step": 2318 }, { "epoch": 0.9893344709897611, "grad_norm": 22.1302547454834, "learning_rate": 1.1680143755615455e-07, "loss": 1.7305, "step": 2319 }, { "epoch": 0.9897610921501706, "grad_norm": 34.13624954223633, "learning_rate": 1.1230907457322552e-07, "loss": 2.2808, "step": 2320 }, { "epoch": 0.9901877133105802, "grad_norm": 22.001792907714844, "learning_rate": 1.0781671159029651e-07, "loss": 2.0962, "step": 2321 }, { "epoch": 0.9906143344709898, "grad_norm": 23.512155532836914, "learning_rate": 1.0332434860736749e-07, "loss": 1.915, "step": 2322 }, { "epoch": 0.9910409556313993, "grad_norm": 20.13483238220215, "learning_rate": 9.883198562443847e-08, "loss": 1.8555, "step": 2323 }, { "epoch": 0.9914675767918089, "grad_norm": 23.738903045654297, "learning_rate": 9.433962264150944e-08, "loss": 2.0503, "step": 2324 }, { "epoch": 0.9918941979522184, "grad_norm": 21.183456420898438, "learning_rate": 8.984725965858042e-08, "loss": 2.0312, "step": 2325 }, { "epoch": 0.992320819112628, "grad_norm": 45.3771858215332, "learning_rate": 8.53548966756514e-08, "loss": 2.8213, "step": 2326 }, { "epoch": 0.9927474402730375, "grad_norm": 28.3593807220459, "learning_rate": 8.086253369272238e-08, "loss": 1.8647, "step": 2327 }, { "epoch": 0.9931740614334471, "grad_norm": 35.657875061035156, "learning_rate": 7.637017070979336e-08, "loss": 1.9424, "step": 2328 }, { "epoch": 0.9936006825938567, "grad_norm": 30.0842227935791, "learning_rate": 7.187780772686433e-08, "loss": 2.043, "step": 2329 }, { "epoch": 0.9940273037542662, "grad_norm": 23.042972564697266, "learning_rate": 6.738544474393531e-08, "loss": 2.0967, "step": 2330 }, { "epoch": 0.9944539249146758, "grad_norm": 21.70021629333496, "learning_rate": 6.289308176100629e-08, "loss": 1.8965, "step": 2331 }, { "epoch": 0.9948805460750854, "grad_norm": 27.594707489013672, "learning_rate": 5.8400718778077276e-08, "loss": 1.9629, "step": 2332 }, { "epoch": 0.9953071672354948, "grad_norm": 22.49936866760254, "learning_rate": 5.3908355795148254e-08, "loss": 2.335, "step": 2333 }, { "epoch": 0.9957337883959044, "grad_norm": 37.52388381958008, "learning_rate": 4.941599281221923e-08, "loss": 2.9873, "step": 2334 }, { "epoch": 0.996160409556314, "grad_norm": 36.102996826171875, "learning_rate": 4.492362982929021e-08, "loss": 2.1113, "step": 2335 }, { "epoch": 0.9965870307167235, "grad_norm": 23.30735969543457, "learning_rate": 4.043126684636119e-08, "loss": 2.0635, "step": 2336 }, { "epoch": 0.9970136518771331, "grad_norm": 26.220535278320312, "learning_rate": 3.593890386343217e-08, "loss": 1.7402, "step": 2337 }, { "epoch": 0.9974402730375427, "grad_norm": 22.964366912841797, "learning_rate": 3.1446540880503146e-08, "loss": 2.0645, "step": 2338 }, { "epoch": 0.9978668941979523, "grad_norm": 36.568817138671875, "learning_rate": 2.6954177897574127e-08, "loss": 1.9258, "step": 2339 }, { "epoch": 0.9982935153583617, "grad_norm": 23.925901412963867, "learning_rate": 2.2461814914645105e-08, "loss": 1.8818, "step": 2340 }, { "epoch": 0.9987201365187713, "grad_norm": 25.192325592041016, "learning_rate": 1.7969451931716084e-08, "loss": 2.4082, "step": 2341 }, { "epoch": 0.9991467576791809, "grad_norm": 46.00706100463867, "learning_rate": 1.3477088948787064e-08, "loss": 2.2656, "step": 2342 }, { "epoch": 0.9995733788395904, "grad_norm": 24.589725494384766, "learning_rate": 8.984725965858042e-09, "loss": 2.0283, "step": 2343 }, { "epoch": 1.0, "grad_norm": 20.995540618896484, "learning_rate": 4.492362982929021e-09, "loss": 2.2715, "step": 2344 }, { "epoch": 1.0, "step": 2344, "total_flos": 2.0877700871959347e+18, "train_loss": 2.621050004666169, "train_runtime": 1424.1008, "train_samples_per_second": 421.318, "train_steps_per_second": 1.646 } ], "logging_steps": 1, "max_steps": 2344, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0877700871959347e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }