{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987217724755006, "eval_steps": 500, "global_step": 586, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034086067319982955, "grad_norm": 7.117070422384071, "learning_rate": 3.3898305084745766e-07, "loss": 1.5934, "step": 2 }, { "epoch": 0.006817213463996591, "grad_norm": 7.58934528658197, "learning_rate": 6.779661016949153e-07, "loss": 1.6231, "step": 4 }, { "epoch": 0.010225820195994887, "grad_norm": 6.51402630995005, "learning_rate": 1.016949152542373e-06, "loss": 1.5692, "step": 6 }, { "epoch": 0.013634426927993182, "grad_norm": 5.957629945500037, "learning_rate": 1.3559322033898307e-06, "loss": 1.5249, "step": 8 }, { "epoch": 0.017043033659991477, "grad_norm": 4.594385046827305, "learning_rate": 1.6949152542372882e-06, "loss": 1.5132, "step": 10 }, { "epoch": 0.020451640391989774, "grad_norm": 3.4805968864154875, "learning_rate": 2.033898305084746e-06, "loss": 1.4339, "step": 12 }, { "epoch": 0.02386024712398807, "grad_norm": 3.672492727269049, "learning_rate": 2.372881355932204e-06, "loss": 1.3733, "step": 14 }, { "epoch": 0.027268853855986364, "grad_norm": 3.5696770668674653, "learning_rate": 2.7118644067796613e-06, "loss": 1.2981, "step": 16 }, { "epoch": 0.03067746058798466, "grad_norm": 3.721066217222145, "learning_rate": 3.0508474576271192e-06, "loss": 1.2152, "step": 18 }, { "epoch": 0.034086067319982954, "grad_norm": 3.3165399160593765, "learning_rate": 3.3898305084745763e-06, "loss": 1.1972, "step": 20 }, { "epoch": 0.03749467405198125, "grad_norm": 3.0704960315045087, "learning_rate": 3.7288135593220342e-06, "loss": 1.1098, "step": 22 }, { "epoch": 0.04090328078397955, "grad_norm": 3.0249632466321708, "learning_rate": 4.067796610169492e-06, "loss": 1.1101, "step": 24 }, { "epoch": 0.044311887515977845, "grad_norm": 2.51223844701648, "learning_rate": 4.40677966101695e-06, "loss": 1.0707, "step": 26 }, { "epoch": 0.04772049424797614, "grad_norm": 2.337756418129475, "learning_rate": 4.745762711864408e-06, "loss": 1.0054, "step": 28 }, { "epoch": 0.05112910097997444, "grad_norm": 2.507569726418487, "learning_rate": 5.084745762711865e-06, "loss": 0.9751, "step": 30 }, { "epoch": 0.05453770771197273, "grad_norm": 2.8133808280998664, "learning_rate": 5.423728813559323e-06, "loss": 1.0073, "step": 32 }, { "epoch": 0.057946314443971025, "grad_norm": 2.259104843714805, "learning_rate": 5.7627118644067805e-06, "loss": 0.9816, "step": 34 }, { "epoch": 0.06135492117596932, "grad_norm": 2.489954458392594, "learning_rate": 6.1016949152542385e-06, "loss": 0.9969, "step": 36 }, { "epoch": 0.06476352790796762, "grad_norm": 2.489196411046845, "learning_rate": 6.440677966101695e-06, "loss": 0.9751, "step": 38 }, { "epoch": 0.06817213463996591, "grad_norm": 2.328531532406521, "learning_rate": 6.779661016949153e-06, "loss": 0.9063, "step": 40 }, { "epoch": 0.07158074137196421, "grad_norm": 1.8777822207540218, "learning_rate": 7.1186440677966106e-06, "loss": 0.9076, "step": 42 }, { "epoch": 0.0749893481039625, "grad_norm": 2.3564822959993403, "learning_rate": 7.4576271186440685e-06, "loss": 0.9544, "step": 44 }, { "epoch": 0.0783979548359608, "grad_norm": 2.0258103744170604, "learning_rate": 7.796610169491526e-06, "loss": 0.9056, "step": 46 }, { "epoch": 0.0818065615679591, "grad_norm": 2.156862728985429, "learning_rate": 8.135593220338983e-06, "loss": 0.853, "step": 48 }, { "epoch": 0.08521516829995739, "grad_norm": 2.34893762869447, "learning_rate": 8.47457627118644e-06, "loss": 0.8894, "step": 50 }, { "epoch": 0.08862377503195569, "grad_norm": 2.315393254315105, "learning_rate": 8.8135593220339e-06, "loss": 0.9133, "step": 52 }, { "epoch": 0.09203238176395398, "grad_norm": 2.301164621363547, "learning_rate": 9.152542372881356e-06, "loss": 0.8895, "step": 54 }, { "epoch": 0.09544098849595228, "grad_norm": 2.2398380284062425, "learning_rate": 9.491525423728815e-06, "loss": 0.8563, "step": 56 }, { "epoch": 0.09884959522795057, "grad_norm": 2.029764049433947, "learning_rate": 9.830508474576272e-06, "loss": 0.8813, "step": 58 }, { "epoch": 0.10225820195994888, "grad_norm": 2.0880574238328387, "learning_rate": 9.999911158222826e-06, "loss": 0.9337, "step": 60 }, { "epoch": 0.10566680869194717, "grad_norm": 2.3366310826461993, "learning_rate": 9.999200442948178e-06, "loss": 0.924, "step": 62 }, { "epoch": 0.10907541542394546, "grad_norm": 2.344634528525901, "learning_rate": 9.997779113423916e-06, "loss": 0.9312, "step": 64 }, { "epoch": 0.11248402215594376, "grad_norm": 2.551521533584163, "learning_rate": 9.99564737168575e-06, "loss": 0.9062, "step": 66 }, { "epoch": 0.11589262888794205, "grad_norm": 2.2111244932074, "learning_rate": 9.992805520751352e-06, "loss": 0.9032, "step": 68 }, { "epoch": 0.11930123561994035, "grad_norm": 2.2310561621390463, "learning_rate": 9.989253964577267e-06, "loss": 0.8851, "step": 70 }, { "epoch": 0.12270984235193864, "grad_norm": 2.1278482556209153, "learning_rate": 9.984993208001513e-06, "loss": 0.8663, "step": 72 }, { "epoch": 0.12611844908393693, "grad_norm": 2.173728187499986, "learning_rate": 9.980023856671805e-06, "loss": 0.889, "step": 74 }, { "epoch": 0.12952705581593524, "grad_norm": 2.0928351193635026, "learning_rate": 9.974346616959476e-06, "loss": 0.8293, "step": 76 }, { "epoch": 0.13293566254793354, "grad_norm": 2.8128383896971902, "learning_rate": 9.96796229585906e-06, "loss": 0.8746, "step": 78 }, { "epoch": 0.13634426927993182, "grad_norm": 2.0208631286469254, "learning_rate": 9.960871800873586e-06, "loss": 0.9004, "step": 80 }, { "epoch": 0.13975287601193012, "grad_norm": 2.1974668712171157, "learning_rate": 9.953076139885581e-06, "loss": 0.9384, "step": 82 }, { "epoch": 0.14316148274392843, "grad_norm": 2.3881999042131516, "learning_rate": 9.944576421013804e-06, "loss": 0.8679, "step": 84 }, { "epoch": 0.1465700894759267, "grad_norm": 1.9809033021368312, "learning_rate": 9.935373852455724e-06, "loss": 0.855, "step": 86 }, { "epoch": 0.149978696207925, "grad_norm": 2.063145920288984, "learning_rate": 9.925469742315799e-06, "loss": 0.8509, "step": 88 }, { "epoch": 0.1533873029399233, "grad_norm": 2.1854349522591248, "learning_rate": 9.91486549841951e-06, "loss": 0.8774, "step": 90 }, { "epoch": 0.1567959096719216, "grad_norm": 1.9442802721360337, "learning_rate": 9.903562628113266e-06, "loss": 0.9293, "step": 92 }, { "epoch": 0.1602045164039199, "grad_norm": 1.9595403019093638, "learning_rate": 9.891562738050127e-06, "loss": 0.835, "step": 94 }, { "epoch": 0.1636131231359182, "grad_norm": 2.001941381433757, "learning_rate": 9.878867533961434e-06, "loss": 0.8594, "step": 96 }, { "epoch": 0.1670217298679165, "grad_norm": 2.24822151912714, "learning_rate": 9.865478820414344e-06, "loss": 0.85, "step": 98 }, { "epoch": 0.17043033659991477, "grad_norm": 1.8987704947701014, "learning_rate": 9.851398500555321e-06, "loss": 0.8647, "step": 100 }, { "epoch": 0.17383894333191308, "grad_norm": 1.9294894488605314, "learning_rate": 9.8366285758396e-06, "loss": 0.8374, "step": 102 }, { "epoch": 0.17724755006391138, "grad_norm": 2.0659769053985455, "learning_rate": 9.82117114574671e-06, "loss": 0.8625, "step": 104 }, { "epoch": 0.18065615679590968, "grad_norm": 2.1086334049039044, "learning_rate": 9.805028407482027e-06, "loss": 0.8418, "step": 106 }, { "epoch": 0.18406476352790796, "grad_norm": 1.9093180149494513, "learning_rate": 9.78820265566445e-06, "loss": 0.8378, "step": 108 }, { "epoch": 0.18747337025990626, "grad_norm": 2.108586166296398, "learning_rate": 9.770696282000245e-06, "loss": 0.8717, "step": 110 }, { "epoch": 0.19088197699190457, "grad_norm": 1.9600773842942703, "learning_rate": 9.75251177494306e-06, "loss": 0.8626, "step": 112 }, { "epoch": 0.19429058372390284, "grad_norm": 2.2720190152376425, "learning_rate": 9.733651719340207e-06, "loss": 0.8714, "step": 114 }, { "epoch": 0.19769919045590115, "grad_norm": 1.9850702888722942, "learning_rate": 9.714118796065243e-06, "loss": 0.8529, "step": 116 }, { "epoch": 0.20110779718789945, "grad_norm": 2.0995202528398234, "learning_rate": 9.693915781636887e-06, "loss": 0.8508, "step": 118 }, { "epoch": 0.20451640391989775, "grad_norm": 2.191390132974462, "learning_rate": 9.673045547824356e-06, "loss": 0.8699, "step": 120 }, { "epoch": 0.20792501065189603, "grad_norm": 1.92525268724109, "learning_rate": 9.651511061239151e-06, "loss": 0.8746, "step": 122 }, { "epoch": 0.21133361738389433, "grad_norm": 2.1743856429859765, "learning_rate": 9.62931538291337e-06, "loss": 0.8643, "step": 124 }, { "epoch": 0.21474222411589264, "grad_norm": 2.2545723733622642, "learning_rate": 9.606461667864603e-06, "loss": 0.8518, "step": 126 }, { "epoch": 0.21815083084789091, "grad_norm": 2.088328769098791, "learning_rate": 9.582953164647438e-06, "loss": 0.8036, "step": 128 }, { "epoch": 0.22155943757988922, "grad_norm": 1.7154521796104316, "learning_rate": 9.558793214891716e-06, "loss": 0.8749, "step": 130 }, { "epoch": 0.22496804431188752, "grad_norm": 1.9971816903951989, "learning_rate": 9.533985252827525e-06, "loss": 0.8442, "step": 132 }, { "epoch": 0.22837665104388583, "grad_norm": 2.0452202203810246, "learning_rate": 9.508532804797035e-06, "loss": 0.8331, "step": 134 }, { "epoch": 0.2317852577758841, "grad_norm": 1.8873007094774574, "learning_rate": 9.482439488753248e-06, "loss": 0.8404, "step": 136 }, { "epoch": 0.2351938645078824, "grad_norm": 1.8694505860373616, "learning_rate": 9.455709013745726e-06, "loss": 0.8518, "step": 138 }, { "epoch": 0.2386024712398807, "grad_norm": 1.8826067133760322, "learning_rate": 9.42834517939336e-06, "loss": 0.8224, "step": 140 }, { "epoch": 0.24201107797187899, "grad_norm": 1.783995366074571, "learning_rate": 9.400351875344268e-06, "loss": 0.8368, "step": 142 }, { "epoch": 0.2454196847038773, "grad_norm": 1.9541798533923145, "learning_rate": 9.371733080722911e-06, "loss": 0.8753, "step": 144 }, { "epoch": 0.2488282914358756, "grad_norm": 1.8725559407975727, "learning_rate": 9.342492863564468e-06, "loss": 0.8382, "step": 146 }, { "epoch": 0.25223689816787387, "grad_norm": 2.166287610640464, "learning_rate": 9.312635380236582e-06, "loss": 0.8311, "step": 148 }, { "epoch": 0.2556455048998722, "grad_norm": 2.100500640351814, "learning_rate": 9.282164874848554e-06, "loss": 0.8731, "step": 150 }, { "epoch": 0.2590541116318705, "grad_norm": 2.269255130053819, "learning_rate": 9.251085678648072e-06, "loss": 0.832, "step": 152 }, { "epoch": 0.26246271836386875, "grad_norm": 2.019471552415879, "learning_rate": 9.219402209405519e-06, "loss": 0.8715, "step": 154 }, { "epoch": 0.2658713250958671, "grad_norm": 2.057431118897924, "learning_rate": 9.18711897078603e-06, "loss": 0.8809, "step": 156 }, { "epoch": 0.26927993182786536, "grad_norm": 1.7964562596950788, "learning_rate": 9.154240551709298e-06, "loss": 0.8159, "step": 158 }, { "epoch": 0.27268853855986364, "grad_norm": 1.77492002243483, "learning_rate": 9.1207716256973e-06, "loss": 0.8443, "step": 160 }, { "epoch": 0.27609714529186197, "grad_norm": 2.042551909185088, "learning_rate": 9.08671695020995e-06, "loss": 0.8718, "step": 162 }, { "epoch": 0.27950575202386024, "grad_norm": 1.9983795965696414, "learning_rate": 9.052081365968873e-06, "loss": 0.827, "step": 164 }, { "epoch": 0.2829143587558585, "grad_norm": 2.052158753663356, "learning_rate": 9.016869796269299e-06, "loss": 0.8478, "step": 166 }, { "epoch": 0.28632296548785685, "grad_norm": 1.8439668797771172, "learning_rate": 8.981087246280246e-06, "loss": 0.8063, "step": 168 }, { "epoch": 0.2897315722198551, "grad_norm": 1.8391702615486196, "learning_rate": 8.944738802333062e-06, "loss": 0.8345, "step": 170 }, { "epoch": 0.2931401789518534, "grad_norm": 1.832576311204302, "learning_rate": 8.907829631198412e-06, "loss": 0.7921, "step": 172 }, { "epoch": 0.29654878568385173, "grad_norm": 1.9349714568402026, "learning_rate": 8.870364979351862e-06, "loss": 0.8122, "step": 174 }, { "epoch": 0.29995739241585, "grad_norm": 2.4339551873431104, "learning_rate": 8.832350172228099e-06, "loss": 0.82, "step": 176 }, { "epoch": 0.30336599914784834, "grad_norm": 1.5892853144502257, "learning_rate": 8.793790613463956e-06, "loss": 0.8362, "step": 178 }, { "epoch": 0.3067746058798466, "grad_norm": 1.7805951409311684, "learning_rate": 8.754691784130298e-06, "loss": 0.8283, "step": 180 }, { "epoch": 0.3101832126118449, "grad_norm": 1.6741127801982758, "learning_rate": 8.715059241952925e-06, "loss": 0.8055, "step": 182 }, { "epoch": 0.3135918193438432, "grad_norm": 1.7776583452428674, "learning_rate": 8.674898620522558e-06, "loss": 0.8238, "step": 184 }, { "epoch": 0.3170004260758415, "grad_norm": 1.7939673520888073, "learning_rate": 8.634215628494044e-06, "loss": 0.7946, "step": 186 }, { "epoch": 0.3204090328078398, "grad_norm": 1.9491996130141838, "learning_rate": 8.593016048774911e-06, "loss": 0.8269, "step": 188 }, { "epoch": 0.3238176395398381, "grad_norm": 2.0260518716817355, "learning_rate": 8.551305737703338e-06, "loss": 0.8205, "step": 190 }, { "epoch": 0.3272262462718364, "grad_norm": 2.0407011387236915, "learning_rate": 8.509090624215708e-06, "loss": 0.7965, "step": 192 }, { "epoch": 0.33063485300383466, "grad_norm": 1.885729706338136, "learning_rate": 8.466376709003841e-06, "loss": 0.8444, "step": 194 }, { "epoch": 0.334043459735833, "grad_norm": 1.8594029276174733, "learning_rate": 8.423170063662014e-06, "loss": 0.8301, "step": 196 }, { "epoch": 0.33745206646783127, "grad_norm": 2.0661661167934775, "learning_rate": 8.379476829823919e-06, "loss": 0.8194, "step": 198 }, { "epoch": 0.34086067319982954, "grad_norm": 1.843312820214377, "learning_rate": 8.335303218289645e-06, "loss": 0.8223, "step": 200 }, { "epoch": 0.3442692799318279, "grad_norm": 1.9237357576874556, "learning_rate": 8.290655508142855e-06, "loss": 0.8066, "step": 202 }, { "epoch": 0.34767788666382615, "grad_norm": 1.8680286378652424, "learning_rate": 8.245540045858229e-06, "loss": 0.8035, "step": 204 }, { "epoch": 0.3510864933958245, "grad_norm": 1.7375474557232102, "learning_rate": 8.199963244399351e-06, "loss": 0.8115, "step": 206 }, { "epoch": 0.35449510012782276, "grad_norm": 1.8317920240540273, "learning_rate": 8.153931582307124e-06, "loss": 0.8333, "step": 208 }, { "epoch": 0.35790370685982104, "grad_norm": 1.8807672342031, "learning_rate": 8.107451602778887e-06, "loss": 0.8114, "step": 210 }, { "epoch": 0.36131231359181937, "grad_norm": 1.8233277253885543, "learning_rate": 8.060529912738316e-06, "loss": 0.8084, "step": 212 }, { "epoch": 0.36472092032381764, "grad_norm": 1.8018328218174473, "learning_rate": 8.013173181896283e-06, "loss": 0.8036, "step": 214 }, { "epoch": 0.3681295270558159, "grad_norm": 1.7500277066768286, "learning_rate": 7.965388141802792e-06, "loss": 0.8493, "step": 216 }, { "epoch": 0.37153813378781425, "grad_norm": 1.8202749968468963, "learning_rate": 7.917181584890113e-06, "loss": 0.8666, "step": 218 }, { "epoch": 0.3749467405198125, "grad_norm": 1.7463930119282958, "learning_rate": 7.868560363507268e-06, "loss": 0.8104, "step": 220 }, { "epoch": 0.3783553472518108, "grad_norm": 1.687535685348095, "learning_rate": 7.819531388946e-06, "loss": 0.7423, "step": 222 }, { "epoch": 0.38176395398380913, "grad_norm": 1.8780292088928865, "learning_rate": 7.770101630458364e-06, "loss": 0.7817, "step": 224 }, { "epoch": 0.3851725607158074, "grad_norm": 1.7870199236678057, "learning_rate": 7.720278114266084e-06, "loss": 0.8055, "step": 226 }, { "epoch": 0.3885811674478057, "grad_norm": 2.1478936673535522, "learning_rate": 7.670067922561795e-06, "loss": 0.8181, "step": 228 }, { "epoch": 0.391989774179804, "grad_norm": 1.996195916952366, "learning_rate": 7.619478192502352e-06, "loss": 0.786, "step": 230 }, { "epoch": 0.3953983809118023, "grad_norm": 1.7607026492930253, "learning_rate": 7.5685161151943066e-06, "loss": 0.7861, "step": 232 }, { "epoch": 0.39880698764380057, "grad_norm": 1.982916095534787, "learning_rate": 7.5171889346717264e-06, "loss": 0.8314, "step": 234 }, { "epoch": 0.4022155943757989, "grad_norm": 1.8475361020735406, "learning_rate": 7.465503946866491e-06, "loss": 0.8045, "step": 236 }, { "epoch": 0.4056242011077972, "grad_norm": 1.6483972591254976, "learning_rate": 7.413468498571199e-06, "loss": 0.8149, "step": 238 }, { "epoch": 0.4090328078397955, "grad_norm": 1.9862409096042428, "learning_rate": 7.361089986394863e-06, "loss": 0.8038, "step": 240 }, { "epoch": 0.4124414145717938, "grad_norm": 1.9381151415852729, "learning_rate": 7.308375855711508e-06, "loss": 0.8203, "step": 242 }, { "epoch": 0.41585002130379206, "grad_norm": 1.8252397931133695, "learning_rate": 7.2553335996018485e-06, "loss": 0.8096, "step": 244 }, { "epoch": 0.4192586280357904, "grad_norm": 1.8096704344426044, "learning_rate": 7.201970757788172e-06, "loss": 0.7692, "step": 246 }, { "epoch": 0.42266723476778867, "grad_norm": 1.7220473220530976, "learning_rate": 7.148294915562617e-06, "loss": 0.7941, "step": 248 }, { "epoch": 0.42607584149978694, "grad_norm": 1.6560074887380722, "learning_rate": 7.09431370270894e-06, "loss": 0.795, "step": 250 }, { "epoch": 0.4294844482317853, "grad_norm": 1.6420030994162023, "learning_rate": 7.040034792417986e-06, "loss": 0.7854, "step": 252 }, { "epoch": 0.43289305496378355, "grad_norm": 1.8777722598215576, "learning_rate": 6.985465900196971e-06, "loss": 0.8445, "step": 254 }, { "epoch": 0.43630166169578183, "grad_norm": 1.8421388677475183, "learning_rate": 6.930614782772766e-06, "loss": 0.8372, "step": 256 }, { "epoch": 0.43971026842778016, "grad_norm": 1.9176127454855982, "learning_rate": 6.875489236989306e-06, "loss": 0.82, "step": 258 }, { "epoch": 0.44311887515977844, "grad_norm": 1.8186020984518791, "learning_rate": 6.8200970986993e-06, "loss": 0.8208, "step": 260 }, { "epoch": 0.4465274818917767, "grad_norm": 1.9882080130853592, "learning_rate": 6.764446241650411e-06, "loss": 0.8073, "step": 262 }, { "epoch": 0.44993608862377504, "grad_norm": 1.5626224821883135, "learning_rate": 6.708544576366024e-06, "loss": 0.8231, "step": 264 }, { "epoch": 0.4533446953557733, "grad_norm": 1.8993741801940567, "learning_rate": 6.65240004902081e-06, "loss": 0.8065, "step": 266 }, { "epoch": 0.45675330208777165, "grad_norm": 1.7550634910546117, "learning_rate": 6.5960206403112045e-06, "loss": 0.7485, "step": 268 }, { "epoch": 0.4601619088197699, "grad_norm": 1.9326203800829964, "learning_rate": 6.539414364320997e-06, "loss": 0.779, "step": 270 }, { "epoch": 0.4635705155517682, "grad_norm": 1.6516836712807916, "learning_rate": 6.4825892673821556e-06, "loss": 0.794, "step": 272 }, { "epoch": 0.46697912228376653, "grad_norm": 2.003145254209738, "learning_rate": 6.425553426931075e-06, "loss": 0.7618, "step": 274 }, { "epoch": 0.4703877290157648, "grad_norm": 1.6246913128967992, "learning_rate": 6.368314950360416e-06, "loss": 0.7785, "step": 276 }, { "epoch": 0.4737963357477631, "grad_norm": 1.8754262243075055, "learning_rate": 6.31088197386666e-06, "loss": 0.765, "step": 278 }, { "epoch": 0.4772049424797614, "grad_norm": 1.6448889983604882, "learning_rate": 6.2532626612936035e-06, "loss": 0.7977, "step": 280 }, { "epoch": 0.4806135492117597, "grad_norm": 1.8537195734950456, "learning_rate": 6.195465202971883e-06, "loss": 0.7637, "step": 282 }, { "epoch": 0.48402215594375797, "grad_norm": 1.7350572092463452, "learning_rate": 6.137497814554773e-06, "loss": 0.7974, "step": 284 }, { "epoch": 0.4874307626757563, "grad_norm": 1.659202469943535, "learning_rate": 6.079368735850355e-06, "loss": 0.78, "step": 286 }, { "epoch": 0.4908393694077546, "grad_norm": 2.0704522103071326, "learning_rate": 6.021086229650273e-06, "loss": 0.78, "step": 288 }, { "epoch": 0.49424797613975285, "grad_norm": 1.7654110381757835, "learning_rate": 5.96265858055521e-06, "loss": 0.8101, "step": 290 }, { "epoch": 0.4976565828717512, "grad_norm": 1.6744010674594727, "learning_rate": 5.904094093797274e-06, "loss": 0.7854, "step": 292 }, { "epoch": 0.5010651896037495, "grad_norm": 1.672935281461675, "learning_rate": 5.845401094059439e-06, "loss": 0.8233, "step": 294 }, { "epoch": 0.5044737963357477, "grad_norm": 1.7571827575826455, "learning_rate": 5.786587924292228e-06, "loss": 0.8072, "step": 296 }, { "epoch": 0.507882403067746, "grad_norm": 1.7879627913914475, "learning_rate": 5.727662944527808e-06, "loss": 0.8236, "step": 298 }, { "epoch": 0.5112910097997444, "grad_norm": 1.6693554243950373, "learning_rate": 5.668634530691639e-06, "loss": 0.7829, "step": 300 }, { "epoch": 0.5146996165317427, "grad_norm": 1.7885967539315661, "learning_rate": 5.609511073411881e-06, "loss": 0.8069, "step": 302 }, { "epoch": 0.518108223263741, "grad_norm": 1.8927285112086825, "learning_rate": 5.550300976826697e-06, "loss": 0.7978, "step": 304 }, { "epoch": 0.5215168299957392, "grad_norm": 1.9422631435450946, "learning_rate": 5.491012657389644e-06, "loss": 0.8143, "step": 306 }, { "epoch": 0.5249254367277375, "grad_norm": 1.8664920158914793, "learning_rate": 5.431654542673315e-06, "loss": 0.8184, "step": 308 }, { "epoch": 0.5283340434597358, "grad_norm": 1.7545208791954934, "learning_rate": 5.372235070171387e-06, "loss": 0.7976, "step": 310 }, { "epoch": 0.5317426501917342, "grad_norm": 1.8345783742469912, "learning_rate": 5.312762686099282e-06, "loss": 0.7726, "step": 312 }, { "epoch": 0.5351512569237324, "grad_norm": 1.777874135882686, "learning_rate": 5.253245844193564e-06, "loss": 0.791, "step": 314 }, { "epoch": 0.5385598636557307, "grad_norm": 1.7559490288145514, "learning_rate": 5.193693004510284e-06, "loss": 0.7654, "step": 316 }, { "epoch": 0.541968470387729, "grad_norm": 1.7582347301570869, "learning_rate": 5.1341126322224145e-06, "loss": 0.7509, "step": 318 }, { "epoch": 0.5453770771197273, "grad_norm": 1.6309306234524417, "learning_rate": 5.0745131964165686e-06, "loss": 0.7737, "step": 320 }, { "epoch": 0.5487856838517257, "grad_norm": 1.6033761601218945, "learning_rate": 5.014903168889155e-06, "loss": 0.7814, "step": 322 }, { "epoch": 0.5521942905837239, "grad_norm": 1.9401976205946008, "learning_rate": 4.955291022942146e-06, "loss": 0.7782, "step": 324 }, { "epoch": 0.5556028973157222, "grad_norm": 1.8683852058875654, "learning_rate": 4.8956852321786394e-06, "loss": 0.802, "step": 326 }, { "epoch": 0.5590115040477205, "grad_norm": 1.859200797460236, "learning_rate": 4.836094269298373e-06, "loss": 0.7798, "step": 328 }, { "epoch": 0.5624201107797188, "grad_norm": 1.803688727471679, "learning_rate": 4.7765266048933626e-06, "loss": 0.8098, "step": 330 }, { "epoch": 0.565828717511717, "grad_norm": 1.7851903640220046, "learning_rate": 4.7169907062438556e-06, "loss": 0.7728, "step": 332 }, { "epoch": 0.5692373242437154, "grad_norm": 1.6343651082668207, "learning_rate": 4.657495036114731e-06, "loss": 0.7721, "step": 334 }, { "epoch": 0.5726459309757137, "grad_norm": 1.6586804458742679, "learning_rate": 4.598048051552563e-06, "loss": 0.8186, "step": 336 }, { "epoch": 0.576054537707712, "grad_norm": 1.750348106992188, "learning_rate": 4.53865820268349e-06, "loss": 0.7503, "step": 338 }, { "epoch": 0.5794631444397103, "grad_norm": 1.7165597335606, "learning_rate": 4.479333931512064e-06, "loss": 0.7915, "step": 340 }, { "epoch": 0.5828717511717085, "grad_norm": 1.7582639909981337, "learning_rate": 4.4200836707212525e-06, "loss": 0.7754, "step": 342 }, { "epoch": 0.5862803579037068, "grad_norm": 1.6838798515151536, "learning_rate": 4.360915842473778e-06, "loss": 0.7543, "step": 344 }, { "epoch": 0.5896889646357052, "grad_norm": 1.4519358598788679, "learning_rate": 4.301838857214944e-06, "loss": 0.7629, "step": 346 }, { "epoch": 0.5930975713677035, "grad_norm": 1.6951153457921704, "learning_rate": 4.2428611124771184e-06, "loss": 0.7919, "step": 348 }, { "epoch": 0.5965061780997017, "grad_norm": 1.716470856323328, "learning_rate": 4.183990991686071e-06, "loss": 0.7853, "step": 350 }, { "epoch": 0.5999147848317, "grad_norm": 1.8973005312026041, "learning_rate": 4.125236862969304e-06, "loss": 0.7695, "step": 352 }, { "epoch": 0.6033233915636983, "grad_norm": 1.4862142786601558, "learning_rate": 4.066607077966559e-06, "loss": 0.7395, "step": 354 }, { "epoch": 0.6067319982956967, "grad_norm": 1.8815334843344234, "learning_rate": 4.008109970642665e-06, "loss": 0.787, "step": 356 }, { "epoch": 0.610140605027695, "grad_norm": 1.6378926667179599, "learning_rate": 3.949753856102909e-06, "loss": 0.7393, "step": 358 }, { "epoch": 0.6135492117596932, "grad_norm": 1.7084737118259692, "learning_rate": 3.891547029411074e-06, "loss": 0.8053, "step": 360 }, { "epoch": 0.6169578184916915, "grad_norm": 1.8544822094338145, "learning_rate": 3.833497764410336e-06, "loss": 0.7674, "step": 362 }, { "epoch": 0.6203664252236898, "grad_norm": 1.7409266320356052, "learning_rate": 3.7756143125471745e-06, "loss": 0.7489, "step": 364 }, { "epoch": 0.6237750319556881, "grad_norm": 1.8302377955954783, "learning_rate": 3.7179049016984616e-06, "loss": 0.7757, "step": 366 }, { "epoch": 0.6271836386876865, "grad_norm": 1.6552784195481047, "learning_rate": 3.660377735001911e-06, "loss": 0.7656, "step": 368 }, { "epoch": 0.6305922454196847, "grad_norm": 1.8034934811229115, "learning_rate": 3.6030409896900333e-06, "loss": 0.7731, "step": 370 }, { "epoch": 0.634000852151683, "grad_norm": 1.656497367167841, "learning_rate": 3.545902815927783e-06, "loss": 0.7564, "step": 372 }, { "epoch": 0.6374094588836813, "grad_norm": 1.6538858927486708, "learning_rate": 3.4889713356540435e-06, "loss": 0.7556, "step": 374 }, { "epoch": 0.6408180656156796, "grad_norm": 1.7721211559316303, "learning_rate": 3.432254641427136e-06, "loss": 0.7754, "step": 376 }, { "epoch": 0.6442266723476778, "grad_norm": 2.0169569259903946, "learning_rate": 3.375760795274493e-06, "loss": 0.7745, "step": 378 }, { "epoch": 0.6476352790796762, "grad_norm": 1.5174685690539855, "learning_rate": 3.3194978275466755e-06, "loss": 0.7649, "step": 380 }, { "epoch": 0.6510438858116745, "grad_norm": 1.5043926003321177, "learning_rate": 3.2634737357758994e-06, "loss": 0.74, "step": 382 }, { "epoch": 0.6544524925436728, "grad_norm": 1.6692309306036763, "learning_rate": 3.207696483539219e-06, "loss": 0.7611, "step": 384 }, { "epoch": 0.657861099275671, "grad_norm": 1.784881350588902, "learning_rate": 3.152173999326537e-06, "loss": 0.7675, "step": 386 }, { "epoch": 0.6612697060076693, "grad_norm": 1.7444177474948184, "learning_rate": 3.096914175413611e-06, "loss": 0.7685, "step": 388 }, { "epoch": 0.6646783127396677, "grad_norm": 1.999124406609706, "learning_rate": 3.041924866740197e-06, "loss": 0.7505, "step": 390 }, { "epoch": 0.668086919471666, "grad_norm": 1.5350179140801363, "learning_rate": 2.987213889793502e-06, "loss": 0.7792, "step": 392 }, { "epoch": 0.6714955262036643, "grad_norm": 1.7053442818016058, "learning_rate": 2.9327890214971135e-06, "loss": 0.7661, "step": 394 }, { "epoch": 0.6749041329356625, "grad_norm": 1.6957720054975949, "learning_rate": 2.878657998105533e-06, "loss": 0.7587, "step": 396 }, { "epoch": 0.6783127396676608, "grad_norm": 1.7615190253028947, "learning_rate": 2.824828514104512e-06, "loss": 0.8067, "step": 398 }, { "epoch": 0.6817213463996591, "grad_norm": 1.5571276091989836, "learning_rate": 2.771308221117309e-06, "loss": 0.7478, "step": 400 }, { "epoch": 0.6851299531316575, "grad_norm": 1.5280419708791328, "learning_rate": 2.7181047268170436e-06, "loss": 0.8071, "step": 402 }, { "epoch": 0.6885385598636558, "grad_norm": 1.6771310372158614, "learning_rate": 2.665225593845307e-06, "loss": 0.7799, "step": 404 }, { "epoch": 0.691947166595654, "grad_norm": 1.7562403180703012, "learning_rate": 2.6126783387371545e-06, "loss": 0.7471, "step": 406 }, { "epoch": 0.6953557733276523, "grad_norm": 1.796003576907647, "learning_rate": 2.5604704308526784e-06, "loss": 0.7716, "step": 408 }, { "epoch": 0.6987643800596506, "grad_norm": 1.6859622044955247, "learning_rate": 2.5086092913152533e-06, "loss": 0.8003, "step": 410 }, { "epoch": 0.702172986791649, "grad_norm": 1.6580555467710547, "learning_rate": 2.4571022919566747e-06, "loss": 0.7463, "step": 412 }, { "epoch": 0.7055815935236472, "grad_norm": 1.5775642264745176, "learning_rate": 2.4059567542692688e-06, "loss": 0.7329, "step": 414 }, { "epoch": 0.7089902002556455, "grad_norm": 1.6145511797148977, "learning_rate": 2.3551799483651894e-06, "loss": 0.7557, "step": 416 }, { "epoch": 0.7123988069876438, "grad_norm": 1.815641939119739, "learning_rate": 2.3047790919429895e-06, "loss": 0.7618, "step": 418 }, { "epoch": 0.7158074137196421, "grad_norm": 1.751085671764699, "learning_rate": 2.254761349261676e-06, "loss": 0.7307, "step": 420 }, { "epoch": 0.7192160204516403, "grad_norm": 1.722113083895161, "learning_rate": 2.2051338301223187e-06, "loss": 0.7536, "step": 422 }, { "epoch": 0.7226246271836387, "grad_norm": 1.5887620089800378, "learning_rate": 2.1559035888574433e-06, "loss": 0.7139, "step": 424 }, { "epoch": 0.726033233915637, "grad_norm": 1.792034913644711, "learning_rate": 2.107077623328275e-06, "loss": 0.7335, "step": 426 }, { "epoch": 0.7294418406476353, "grad_norm": 1.5385202560838132, "learning_rate": 2.0586628739300386e-06, "loss": 0.7752, "step": 428 }, { "epoch": 0.7328504473796336, "grad_norm": 1.554007093824859, "learning_rate": 2.0106662226053924e-06, "loss": 0.7814, "step": 430 }, { "epoch": 0.7362590541116318, "grad_norm": 1.60530576083176, "learning_rate": 1.9630944918662122e-06, "loss": 0.7552, "step": 432 }, { "epoch": 0.7396676608436301, "grad_norm": 1.5045851644291919, "learning_rate": 1.9159544438237797e-06, "loss": 0.7602, "step": 434 }, { "epoch": 0.7430762675756285, "grad_norm": 1.6227515256198517, "learning_rate": 1.8692527792276e-06, "loss": 0.7695, "step": 436 }, { "epoch": 0.7464848743076268, "grad_norm": 1.6302337283434762, "learning_rate": 1.8229961365128985e-06, "loss": 0.732, "step": 438 }, { "epoch": 0.749893481039625, "grad_norm": 1.5834938000524268, "learning_rate": 1.7771910908570156e-06, "loss": 0.7566, "step": 440 }, { "epoch": 0.7533020877716233, "grad_norm": 1.4483744882069087, "learning_rate": 1.7318441532447555e-06, "loss": 0.8073, "step": 442 }, { "epoch": 0.7567106945036216, "grad_norm": 1.6825397102814452, "learning_rate": 1.6869617695429024e-06, "loss": 0.7579, "step": 444 }, { "epoch": 0.76011930123562, "grad_norm": 1.5595787362408242, "learning_rate": 1.6425503195839416e-06, "loss": 0.8201, "step": 446 }, { "epoch": 0.7635279079676183, "grad_norm": 1.6860253511961054, "learning_rate": 1.5986161162592173e-06, "loss": 0.7515, "step": 448 }, { "epoch": 0.7669365146996165, "grad_norm": 1.6217349191290344, "learning_rate": 1.555165404621567e-06, "loss": 0.7684, "step": 450 }, { "epoch": 0.7703451214316148, "grad_norm": 1.7202733697764292, "learning_rate": 1.5122043609976282e-06, "loss": 0.7335, "step": 452 }, { "epoch": 0.7737537281636131, "grad_norm": 1.7746564761962131, "learning_rate": 1.4697390921098886e-06, "loss": 0.7848, "step": 454 }, { "epoch": 0.7771623348956114, "grad_norm": 1.5349803625176617, "learning_rate": 1.4277756342086524e-06, "loss": 0.7757, "step": 456 }, { "epoch": 0.7805709416276098, "grad_norm": 1.6124010704416651, "learning_rate": 1.3863199522140053e-06, "loss": 0.8031, "step": 458 }, { "epoch": 0.783979548359608, "grad_norm": 1.7539891598801893, "learning_rate": 1.3453779388679378e-06, "loss": 0.7826, "step": 460 }, { "epoch": 0.7873881550916063, "grad_norm": 1.6197274527555163, "learning_rate": 1.3049554138967052e-06, "loss": 0.7706, "step": 462 }, { "epoch": 0.7907967618236046, "grad_norm": 1.5679920515099621, "learning_rate": 1.2650581231835923e-06, "loss": 0.7943, "step": 464 }, { "epoch": 0.7942053685556029, "grad_norm": 1.6044331803289653, "learning_rate": 1.225691737952152e-06, "loss": 0.7558, "step": 466 }, { "epoch": 0.7976139752876011, "grad_norm": 1.5903299233431156, "learning_rate": 1.1868618539600718e-06, "loss": 0.767, "step": 468 }, { "epoch": 0.8010225820195995, "grad_norm": 1.5362855174389376, "learning_rate": 1.1485739907037546e-06, "loss": 0.7838, "step": 470 }, { "epoch": 0.8044311887515978, "grad_norm": 1.6430906694597753, "learning_rate": 1.1108335906337559e-06, "loss": 0.7519, "step": 472 }, { "epoch": 0.8078397954835961, "grad_norm": 1.7499785715417322, "learning_rate": 1.0736460183811547e-06, "loss": 0.7681, "step": 474 }, { "epoch": 0.8112484022155944, "grad_norm": 1.7219048713168092, "learning_rate": 1.0370165599950026e-06, "loss": 0.7552, "step": 476 }, { "epoch": 0.8146570089475926, "grad_norm": 1.797003265591149, "learning_rate": 1.0009504221909284e-06, "loss": 0.7336, "step": 478 }, { "epoch": 0.818065615679591, "grad_norm": 1.322267929113636, "learning_rate": 9.654527316110323e-07, "loss": 0.7478, "step": 480 }, { "epoch": 0.8214742224115893, "grad_norm": 1.6060348997953555, "learning_rate": 9.305285340951509e-07, "loss": 0.7693, "step": 482 }, { "epoch": 0.8248828291435876, "grad_norm": 1.589739100394995, "learning_rate": 8.961827939636198e-07, "loss": 0.762, "step": 484 }, { "epoch": 0.8282914358755858, "grad_norm": 1.5977203911776114, "learning_rate": 8.624203933116088e-07, "loss": 0.7517, "step": 486 }, { "epoch": 0.8317000426075841, "grad_norm": 1.4433282291047127, "learning_rate": 8.292461313151662e-07, "loss": 0.7814, "step": 488 }, { "epoch": 0.8351086493395824, "grad_norm": 1.5861673926723674, "learning_rate": 7.966647235490222e-07, "loss": 0.7988, "step": 490 }, { "epoch": 0.8385172560715808, "grad_norm": 1.7785118279929921, "learning_rate": 7.646808013163026e-07, "loss": 0.7871, "step": 492 }, { "epoch": 0.8419258628035791, "grad_norm": 1.4097703537190163, "learning_rate": 7.332989109902028e-07, "loss": 0.7471, "step": 494 }, { "epoch": 0.8453344695355773, "grad_norm": 1.6394050318483955, "learning_rate": 7.025235133677455e-07, "loss": 0.7793, "step": 496 }, { "epoch": 0.8487430762675756, "grad_norm": 1.4882317995688061, "learning_rate": 6.723589830356925e-07, "loss": 0.7587, "step": 498 }, { "epoch": 0.8521516829995739, "grad_norm": 1.5993598547748789, "learning_rate": 6.428096077487244e-07, "loss": 0.7256, "step": 500 }, { "epoch": 0.8555602897315722, "grad_norm": 1.8369103578526638, "learning_rate": 6.138795878199467e-07, "loss": 0.7346, "step": 502 }, { "epoch": 0.8589688964635706, "grad_norm": 1.3809387771101223, "learning_rate": 5.855730355238415e-07, "loss": 0.7389, "step": 504 }, { "epoch": 0.8623775031955688, "grad_norm": 1.606520808055538, "learning_rate": 5.578939745117173e-07, "loss": 0.7626, "step": 506 }, { "epoch": 0.8657861099275671, "grad_norm": 1.4252640716496985, "learning_rate": 5.308463392397756e-07, "loss": 0.7471, "step": 508 }, { "epoch": 0.8691947166595654, "grad_norm": 1.5620326120060357, "learning_rate": 5.044339744098348e-07, "loss": 0.7919, "step": 510 }, { "epoch": 0.8726033233915637, "grad_norm": 1.582860405051893, "learning_rate": 4.786606344228317e-07, "loss": 0.8001, "step": 512 }, { "epoch": 0.876011930123562, "grad_norm": 1.6390377107564265, "learning_rate": 4.5352998284514004e-07, "loss": 0.7693, "step": 514 }, { "epoch": 0.8794205368555603, "grad_norm": 1.5740989498771922, "learning_rate": 4.2904559188782334e-07, "loss": 0.743, "step": 516 }, { "epoch": 0.8828291435875586, "grad_norm": 1.6277724611514495, "learning_rate": 4.05210941898847e-07, "loss": 0.7204, "step": 518 }, { "epoch": 0.8862377503195569, "grad_norm": 1.4964610352537944, "learning_rate": 3.82029420868375e-07, "loss": 0.7572, "step": 520 }, { "epoch": 0.8896463570515551, "grad_norm": 1.5631623729163737, "learning_rate": 3.5950432394717296e-07, "loss": 0.7451, "step": 522 }, { "epoch": 0.8930549637835534, "grad_norm": 1.6364549935760584, "learning_rate": 3.3763885297822153e-07, "loss": 0.7748, "step": 524 }, { "epoch": 0.8964635705155518, "grad_norm": 1.901698775456035, "learning_rate": 3.1643611604158687e-07, "loss": 0.7467, "step": 526 }, { "epoch": 0.8998721772475501, "grad_norm": 1.6649529271504364, "learning_rate": 2.958991270126221e-07, "loss": 0.7316, "step": 528 }, { "epoch": 0.9032807839795484, "grad_norm": 1.7118286972837617, "learning_rate": 2.760308051335564e-07, "loss": 0.7546, "step": 530 }, { "epoch": 0.9066893907115466, "grad_norm": 1.5718606351870987, "learning_rate": 2.5683397459854045e-07, "loss": 0.7064, "step": 532 }, { "epoch": 0.9100979974435449, "grad_norm": 1.3894669570974365, "learning_rate": 2.383113641521956e-07, "loss": 0.747, "step": 534 }, { "epoch": 0.9135066041755433, "grad_norm": 1.4787818567245852, "learning_rate": 2.2046560670173912e-07, "loss": 0.7447, "step": 536 }, { "epoch": 0.9169152109075416, "grad_norm": 1.6746803954544742, "learning_rate": 2.0329923894272463e-07, "loss": 0.7329, "step": 538 }, { "epoch": 0.9203238176395399, "grad_norm": 1.6476225947328762, "learning_rate": 1.8681470099846554e-07, "loss": 0.7494, "step": 540 }, { "epoch": 0.9237324243715381, "grad_norm": 1.4591813987858449, "learning_rate": 1.710143360731781e-07, "loss": 0.7251, "step": 542 }, { "epoch": 0.9271410311035364, "grad_norm": 1.6247674197735347, "learning_rate": 1.559003901189099e-07, "loss": 0.7689, "step": 544 }, { "epoch": 0.9305496378355347, "grad_norm": 1.7245251671767248, "learning_rate": 1.4147501151628273e-07, "loss": 0.7592, "step": 546 }, { "epoch": 0.9339582445675331, "grad_norm": 1.4107246676973044, "learning_rate": 1.2774025076911445e-07, "loss": 0.7369, "step": 548 }, { "epoch": 0.9373668512995313, "grad_norm": 1.6265296555944986, "learning_rate": 1.146980602129466e-07, "loss": 0.735, "step": 550 }, { "epoch": 0.9407754580315296, "grad_norm": 1.7696409777572968, "learning_rate": 1.0235029373752758e-07, "loss": 0.7895, "step": 552 }, { "epoch": 0.9441840647635279, "grad_norm": 1.6245561044198507, "learning_rate": 9.069870652329283e-08, "loss": 0.8128, "step": 554 }, { "epoch": 0.9475926714955262, "grad_norm": 1.5989317111370667, "learning_rate": 7.974495479187271e-08, "loss": 0.7438, "step": 556 }, { "epoch": 0.9510012782275244, "grad_norm": 1.4760047355170607, "learning_rate": 6.949059557066596e-08, "loss": 0.7748, "step": 558 }, { "epoch": 0.9544098849595228, "grad_norm": 1.7501965543108127, "learning_rate": 5.993708647151874e-08, "loss": 0.775, "step": 560 }, { "epoch": 0.9578184916915211, "grad_norm": 1.4579885728200068, "learning_rate": 5.108578548352838e-08, "loss": 0.7588, "step": 562 }, { "epoch": 0.9612270984235194, "grad_norm": 1.5049680408512587, "learning_rate": 4.2937950780013174e-08, "loss": 0.7671, "step": 564 }, { "epoch": 0.9646357051555177, "grad_norm": 1.6438976356754593, "learning_rate": 3.549474053966828e-08, "loss": 0.7908, "step": 566 }, { "epoch": 0.9680443118875159, "grad_norm": 1.4470095982360724, "learning_rate": 2.8757212781935218e-08, "loss": 0.7466, "step": 568 }, { "epoch": 0.9714529186195143, "grad_norm": 1.8960810552290914, "learning_rate": 2.2726325216608224e-08, "loss": 0.7697, "step": 570 }, { "epoch": 0.9748615253515126, "grad_norm": 1.6811694769201933, "learning_rate": 1.7402935107702634e-08, "loss": 0.7753, "step": 572 }, { "epoch": 0.9782701320835109, "grad_norm": 1.487143334303855, "learning_rate": 1.2787799151596225e-08, "loss": 0.7234, "step": 574 }, { "epoch": 0.9816787388155092, "grad_norm": 1.3834640793692283, "learning_rate": 8.881573369469153e-09, "loss": 0.7403, "step": 576 }, { "epoch": 0.9850873455475074, "grad_norm": 1.6803671958080162, "learning_rate": 5.684813014052437e-09, "loss": 0.7659, "step": 578 }, { "epoch": 0.9884959522795057, "grad_norm": 1.556690011938104, "learning_rate": 3.197972490704415e-09, "loss": 0.7572, "step": 580 }, { "epoch": 0.9919045590115041, "grad_norm": 1.7044244176997279, "learning_rate": 1.421405292813538e-09, "loss": 0.7572, "step": 582 }, { "epoch": 0.9953131657435024, "grad_norm": 1.5088989927512526, "learning_rate": 3.553639515574414e-10, "loss": 0.752, "step": 584 }, { "epoch": 0.9987217724755006, "grad_norm": 1.4553290863162762, "learning_rate": 0.0, "loss": 0.7504, "step": 586 }, { "epoch": 0.9987217724755006, "step": 586, "total_flos": 6.719053758464e+16, "train_loss": 0.8281301904863871, "train_runtime": 9052.2037, "train_samples_per_second": 1.037, "train_steps_per_second": 0.065 } ], "logging_steps": 2, "max_steps": 586, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.719053758464e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }