{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9987217724755006,
  "eval_steps": 500,
  "global_step": 586,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0034086067319982955,
      "grad_norm": 7.117070422384071,
      "learning_rate": 3.3898305084745766e-07,
      "loss": 1.5934,
      "step": 2
    },
    {
      "epoch": 0.006817213463996591,
      "grad_norm": 7.58934528658197,
      "learning_rate": 6.779661016949153e-07,
      "loss": 1.6231,
      "step": 4
    },
    {
      "epoch": 0.010225820195994887,
      "grad_norm": 6.51402630995005,
      "learning_rate": 1.016949152542373e-06,
      "loss": 1.5692,
      "step": 6
    },
    {
      "epoch": 0.013634426927993182,
      "grad_norm": 5.957629945500037,
      "learning_rate": 1.3559322033898307e-06,
      "loss": 1.5249,
      "step": 8
    },
    {
      "epoch": 0.017043033659991477,
      "grad_norm": 4.594385046827305,
      "learning_rate": 1.6949152542372882e-06,
      "loss": 1.5132,
      "step": 10
    },
    {
      "epoch": 0.020451640391989774,
      "grad_norm": 3.4805968864154875,
      "learning_rate": 2.033898305084746e-06,
      "loss": 1.4339,
      "step": 12
    },
    {
      "epoch": 0.02386024712398807,
      "grad_norm": 3.672492727269049,
      "learning_rate": 2.372881355932204e-06,
      "loss": 1.3733,
      "step": 14
    },
    {
      "epoch": 0.027268853855986364,
      "grad_norm": 3.5696770668674653,
      "learning_rate": 2.7118644067796613e-06,
      "loss": 1.2981,
      "step": 16
    },
    {
      "epoch": 0.03067746058798466,
      "grad_norm": 3.721066217222145,
      "learning_rate": 3.0508474576271192e-06,
      "loss": 1.2152,
      "step": 18
    },
    {
      "epoch": 0.034086067319982954,
      "grad_norm": 3.3165399160593765,
      "learning_rate": 3.3898305084745763e-06,
      "loss": 1.1972,
      "step": 20
    },
    {
      "epoch": 0.03749467405198125,
      "grad_norm": 3.0704960315045087,
      "learning_rate": 3.7288135593220342e-06,
      "loss": 1.1098,
      "step": 22
    },
    {
      "epoch": 0.04090328078397955,
      "grad_norm": 3.0249632466321708,
      "learning_rate": 4.067796610169492e-06,
      "loss": 1.1101,
      "step": 24
    },
    {
      "epoch": 0.044311887515977845,
      "grad_norm": 2.51223844701648,
      "learning_rate": 4.40677966101695e-06,
      "loss": 1.0707,
      "step": 26
    },
    {
      "epoch": 0.04772049424797614,
      "grad_norm": 2.337756418129475,
      "learning_rate": 4.745762711864408e-06,
      "loss": 1.0054,
      "step": 28
    },
    {
      "epoch": 0.05112910097997444,
      "grad_norm": 2.507569726418487,
      "learning_rate": 5.084745762711865e-06,
      "loss": 0.9751,
      "step": 30
    },
    {
      "epoch": 0.05453770771197273,
      "grad_norm": 2.8133808280998664,
      "learning_rate": 5.423728813559323e-06,
      "loss": 1.0073,
      "step": 32
    },
    {
      "epoch": 0.057946314443971025,
      "grad_norm": 2.259104843714805,
      "learning_rate": 5.7627118644067805e-06,
      "loss": 0.9816,
      "step": 34
    },
    {
      "epoch": 0.06135492117596932,
      "grad_norm": 2.489954458392594,
      "learning_rate": 6.1016949152542385e-06,
      "loss": 0.9969,
      "step": 36
    },
    {
      "epoch": 0.06476352790796762,
      "grad_norm": 2.489196411046845,
      "learning_rate": 6.440677966101695e-06,
      "loss": 0.9751,
      "step": 38
    },
    {
      "epoch": 0.06817213463996591,
      "grad_norm": 2.328531532406521,
      "learning_rate": 6.779661016949153e-06,
      "loss": 0.9063,
      "step": 40
    },
    {
      "epoch": 0.07158074137196421,
      "grad_norm": 1.8777822207540218,
      "learning_rate": 7.1186440677966106e-06,
      "loss": 0.9076,
      "step": 42
    },
    {
      "epoch": 0.0749893481039625,
      "grad_norm": 2.3564822959993403,
      "learning_rate": 7.4576271186440685e-06,
      "loss": 0.9544,
      "step": 44
    },
    {
      "epoch": 0.0783979548359608,
      "grad_norm": 2.0258103744170604,
      "learning_rate": 7.796610169491526e-06,
      "loss": 0.9056,
      "step": 46
    },
    {
      "epoch": 0.0818065615679591,
      "grad_norm": 2.156862728985429,
      "learning_rate": 8.135593220338983e-06,
      "loss": 0.853,
      "step": 48
    },
    {
      "epoch": 0.08521516829995739,
      "grad_norm": 2.34893762869447,
      "learning_rate": 8.47457627118644e-06,
      "loss": 0.8894,
      "step": 50
    },
    {
      "epoch": 0.08862377503195569,
      "grad_norm": 2.315393254315105,
      "learning_rate": 8.8135593220339e-06,
      "loss": 0.9133,
      "step": 52
    },
    {
      "epoch": 0.09203238176395398,
      "grad_norm": 2.301164621363547,
      "learning_rate": 9.152542372881356e-06,
      "loss": 0.8895,
      "step": 54
    },
    {
      "epoch": 0.09544098849595228,
      "grad_norm": 2.2398380284062425,
      "learning_rate": 9.491525423728815e-06,
      "loss": 0.8563,
      "step": 56
    },
    {
      "epoch": 0.09884959522795057,
      "grad_norm": 2.029764049433947,
      "learning_rate": 9.830508474576272e-06,
      "loss": 0.8813,
      "step": 58
    },
    {
      "epoch": 0.10225820195994888,
      "grad_norm": 2.0880574238328387,
      "learning_rate": 9.999911158222826e-06,
      "loss": 0.9337,
      "step": 60
    },
    {
      "epoch": 0.10566680869194717,
      "grad_norm": 2.3366310826461993,
      "learning_rate": 9.999200442948178e-06,
      "loss": 0.924,
      "step": 62
    },
    {
      "epoch": 0.10907541542394546,
      "grad_norm": 2.344634528525901,
      "learning_rate": 9.997779113423916e-06,
      "loss": 0.9312,
      "step": 64
    },
    {
      "epoch": 0.11248402215594376,
      "grad_norm": 2.551521533584163,
      "learning_rate": 9.99564737168575e-06,
      "loss": 0.9062,
      "step": 66
    },
    {
      "epoch": 0.11589262888794205,
      "grad_norm": 2.2111244932074,
      "learning_rate": 9.992805520751352e-06,
      "loss": 0.9032,
      "step": 68
    },
    {
      "epoch": 0.11930123561994035,
      "grad_norm": 2.2310561621390463,
      "learning_rate": 9.989253964577267e-06,
      "loss": 0.8851,
      "step": 70
    },
    {
      "epoch": 0.12270984235193864,
      "grad_norm": 2.1278482556209153,
      "learning_rate": 9.984993208001513e-06,
      "loss": 0.8663,
      "step": 72
    },
    {
      "epoch": 0.12611844908393693,
      "grad_norm": 2.173728187499986,
      "learning_rate": 9.980023856671805e-06,
      "loss": 0.889,
      "step": 74
    },
    {
      "epoch": 0.12952705581593524,
      "grad_norm": 2.0928351193635026,
      "learning_rate": 9.974346616959476e-06,
      "loss": 0.8293,
      "step": 76
    },
    {
      "epoch": 0.13293566254793354,
      "grad_norm": 2.8128383896971902,
      "learning_rate": 9.96796229585906e-06,
      "loss": 0.8746,
      "step": 78
    },
    {
      "epoch": 0.13634426927993182,
      "grad_norm": 2.0208631286469254,
      "learning_rate": 9.960871800873586e-06,
      "loss": 0.9004,
      "step": 80
    },
    {
      "epoch": 0.13975287601193012,
      "grad_norm": 2.1974668712171157,
      "learning_rate": 9.953076139885581e-06,
      "loss": 0.9384,
      "step": 82
    },
    {
      "epoch": 0.14316148274392843,
      "grad_norm": 2.3881999042131516,
      "learning_rate": 9.944576421013804e-06,
      "loss": 0.8679,
      "step": 84
    },
    {
      "epoch": 0.1465700894759267,
      "grad_norm": 1.9809033021368312,
      "learning_rate": 9.935373852455724e-06,
      "loss": 0.855,
      "step": 86
    },
    {
      "epoch": 0.149978696207925,
      "grad_norm": 2.063145920288984,
      "learning_rate": 9.925469742315799e-06,
      "loss": 0.8509,
      "step": 88
    },
    {
      "epoch": 0.1533873029399233,
      "grad_norm": 2.1854349522591248,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.8774,
      "step": 90
    },
    {
      "epoch": 0.1567959096719216,
      "grad_norm": 1.9442802721360337,
      "learning_rate": 9.903562628113266e-06,
      "loss": 0.9293,
      "step": 92
    },
    {
      "epoch": 0.1602045164039199,
      "grad_norm": 1.9595403019093638,
      "learning_rate": 9.891562738050127e-06,
      "loss": 0.835,
      "step": 94
    },
    {
      "epoch": 0.1636131231359182,
      "grad_norm": 2.001941381433757,
      "learning_rate": 9.878867533961434e-06,
      "loss": 0.8594,
      "step": 96
    },
    {
      "epoch": 0.1670217298679165,
      "grad_norm": 2.24822151912714,
      "learning_rate": 9.865478820414344e-06,
      "loss": 0.85,
      "step": 98
    },
    {
      "epoch": 0.17043033659991477,
      "grad_norm": 1.8987704947701014,
      "learning_rate": 9.851398500555321e-06,
      "loss": 0.8647,
      "step": 100
    },
    {
      "epoch": 0.17383894333191308,
      "grad_norm": 1.9294894488605314,
      "learning_rate": 9.8366285758396e-06,
      "loss": 0.8374,
      "step": 102
    },
    {
      "epoch": 0.17724755006391138,
      "grad_norm": 2.0659769053985455,
      "learning_rate": 9.82117114574671e-06,
      "loss": 0.8625,
      "step": 104
    },
    {
      "epoch": 0.18065615679590968,
      "grad_norm": 2.1086334049039044,
      "learning_rate": 9.805028407482027e-06,
      "loss": 0.8418,
      "step": 106
    },
    {
      "epoch": 0.18406476352790796,
      "grad_norm": 1.9093180149494513,
      "learning_rate": 9.78820265566445e-06,
      "loss": 0.8378,
      "step": 108
    },
    {
      "epoch": 0.18747337025990626,
      "grad_norm": 2.108586166296398,
      "learning_rate": 9.770696282000245e-06,
      "loss": 0.8717,
      "step": 110
    },
    {
      "epoch": 0.19088197699190457,
      "grad_norm": 1.9600773842942703,
      "learning_rate": 9.75251177494306e-06,
      "loss": 0.8626,
      "step": 112
    },
    {
      "epoch": 0.19429058372390284,
      "grad_norm": 2.2720190152376425,
      "learning_rate": 9.733651719340207e-06,
      "loss": 0.8714,
      "step": 114
    },
    {
      "epoch": 0.19769919045590115,
      "grad_norm": 1.9850702888722942,
      "learning_rate": 9.714118796065243e-06,
      "loss": 0.8529,
      "step": 116
    },
    {
      "epoch": 0.20110779718789945,
      "grad_norm": 2.0995202528398234,
      "learning_rate": 9.693915781636887e-06,
      "loss": 0.8508,
      "step": 118
    },
    {
      "epoch": 0.20451640391989775,
      "grad_norm": 2.191390132974462,
      "learning_rate": 9.673045547824356e-06,
      "loss": 0.8699,
      "step": 120
    },
    {
      "epoch": 0.20792501065189603,
      "grad_norm": 1.92525268724109,
      "learning_rate": 9.651511061239151e-06,
      "loss": 0.8746,
      "step": 122
    },
    {
      "epoch": 0.21133361738389433,
      "grad_norm": 2.1743856429859765,
      "learning_rate": 9.62931538291337e-06,
      "loss": 0.8643,
      "step": 124
    },
    {
      "epoch": 0.21474222411589264,
      "grad_norm": 2.2545723733622642,
      "learning_rate": 9.606461667864603e-06,
      "loss": 0.8518,
      "step": 126
    },
    {
      "epoch": 0.21815083084789091,
      "grad_norm": 2.088328769098791,
      "learning_rate": 9.582953164647438e-06,
      "loss": 0.8036,
      "step": 128
    },
    {
      "epoch": 0.22155943757988922,
      "grad_norm": 1.7154521796104316,
      "learning_rate": 9.558793214891716e-06,
      "loss": 0.8749,
      "step": 130
    },
    {
      "epoch": 0.22496804431188752,
      "grad_norm": 1.9971816903951989,
      "learning_rate": 9.533985252827525e-06,
      "loss": 0.8442,
      "step": 132
    },
    {
      "epoch": 0.22837665104388583,
      "grad_norm": 2.0452202203810246,
      "learning_rate": 9.508532804797035e-06,
      "loss": 0.8331,
      "step": 134
    },
    {
      "epoch": 0.2317852577758841,
      "grad_norm": 1.8873007094774574,
      "learning_rate": 9.482439488753248e-06,
      "loss": 0.8404,
      "step": 136
    },
    {
      "epoch": 0.2351938645078824,
      "grad_norm": 1.8694505860373616,
      "learning_rate": 9.455709013745726e-06,
      "loss": 0.8518,
      "step": 138
    },
    {
      "epoch": 0.2386024712398807,
      "grad_norm": 1.8826067133760322,
      "learning_rate": 9.42834517939336e-06,
      "loss": 0.8224,
      "step": 140
    },
    {
      "epoch": 0.24201107797187899,
      "grad_norm": 1.783995366074571,
      "learning_rate": 9.400351875344268e-06,
      "loss": 0.8368,
      "step": 142
    },
    {
      "epoch": 0.2454196847038773,
      "grad_norm": 1.9541798533923145,
      "learning_rate": 9.371733080722911e-06,
      "loss": 0.8753,
      "step": 144
    },
    {
      "epoch": 0.2488282914358756,
      "grad_norm": 1.8725559407975727,
      "learning_rate": 9.342492863564468e-06,
      "loss": 0.8382,
      "step": 146
    },
    {
      "epoch": 0.25223689816787387,
      "grad_norm": 2.166287610640464,
      "learning_rate": 9.312635380236582e-06,
      "loss": 0.8311,
      "step": 148
    },
    {
      "epoch": 0.2556455048998722,
      "grad_norm": 2.100500640351814,
      "learning_rate": 9.282164874848554e-06,
      "loss": 0.8731,
      "step": 150
    },
    {
      "epoch": 0.2590541116318705,
      "grad_norm": 2.269255130053819,
      "learning_rate": 9.251085678648072e-06,
      "loss": 0.832,
      "step": 152
    },
    {
      "epoch": 0.26246271836386875,
      "grad_norm": 2.019471552415879,
      "learning_rate": 9.219402209405519e-06,
      "loss": 0.8715,
      "step": 154
    },
    {
      "epoch": 0.2658713250958671,
      "grad_norm": 2.057431118897924,
      "learning_rate": 9.18711897078603e-06,
      "loss": 0.8809,
      "step": 156
    },
    {
      "epoch": 0.26927993182786536,
      "grad_norm": 1.7964562596950788,
      "learning_rate": 9.154240551709298e-06,
      "loss": 0.8159,
      "step": 158
    },
    {
      "epoch": 0.27268853855986364,
      "grad_norm": 1.77492002243483,
      "learning_rate": 9.1207716256973e-06,
      "loss": 0.8443,
      "step": 160
    },
    {
      "epoch": 0.27609714529186197,
      "grad_norm": 2.042551909185088,
      "learning_rate": 9.08671695020995e-06,
      "loss": 0.8718,
      "step": 162
    },
    {
      "epoch": 0.27950575202386024,
      "grad_norm": 1.9983795965696414,
      "learning_rate": 9.052081365968873e-06,
      "loss": 0.827,
      "step": 164
    },
    {
      "epoch": 0.2829143587558585,
      "grad_norm": 2.052158753663356,
      "learning_rate": 9.016869796269299e-06,
      "loss": 0.8478,
      "step": 166
    },
    {
      "epoch": 0.28632296548785685,
      "grad_norm": 1.8439668797771172,
      "learning_rate": 8.981087246280246e-06,
      "loss": 0.8063,
      "step": 168
    },
    {
      "epoch": 0.2897315722198551,
      "grad_norm": 1.8391702615486196,
      "learning_rate": 8.944738802333062e-06,
      "loss": 0.8345,
      "step": 170
    },
    {
      "epoch": 0.2931401789518534,
      "grad_norm": 1.832576311204302,
      "learning_rate": 8.907829631198412e-06,
      "loss": 0.7921,
      "step": 172
    },
    {
      "epoch": 0.29654878568385173,
      "grad_norm": 1.9349714568402026,
      "learning_rate": 8.870364979351862e-06,
      "loss": 0.8122,
      "step": 174
    },
    {
      "epoch": 0.29995739241585,
      "grad_norm": 2.4339551873431104,
      "learning_rate": 8.832350172228099e-06,
      "loss": 0.82,
      "step": 176
    },
    {
      "epoch": 0.30336599914784834,
      "grad_norm": 1.5892853144502257,
      "learning_rate": 8.793790613463956e-06,
      "loss": 0.8362,
      "step": 178
    },
    {
      "epoch": 0.3067746058798466,
      "grad_norm": 1.7805951409311684,
      "learning_rate": 8.754691784130298e-06,
      "loss": 0.8283,
      "step": 180
    },
    {
      "epoch": 0.3101832126118449,
      "grad_norm": 1.6741127801982758,
      "learning_rate": 8.715059241952925e-06,
      "loss": 0.8055,
      "step": 182
    },
    {
      "epoch": 0.3135918193438432,
      "grad_norm": 1.7776583452428674,
      "learning_rate": 8.674898620522558e-06,
      "loss": 0.8238,
      "step": 184
    },
    {
      "epoch": 0.3170004260758415,
      "grad_norm": 1.7939673520888073,
      "learning_rate": 8.634215628494044e-06,
      "loss": 0.7946,
      "step": 186
    },
    {
      "epoch": 0.3204090328078398,
      "grad_norm": 1.9491996130141838,
      "learning_rate": 8.593016048774911e-06,
      "loss": 0.8269,
      "step": 188
    },
    {
      "epoch": 0.3238176395398381,
      "grad_norm": 2.0260518716817355,
      "learning_rate": 8.551305737703338e-06,
      "loss": 0.8205,
      "step": 190
    },
    {
      "epoch": 0.3272262462718364,
      "grad_norm": 2.0407011387236915,
      "learning_rate": 8.509090624215708e-06,
      "loss": 0.7965,
      "step": 192
    },
    {
      "epoch": 0.33063485300383466,
      "grad_norm": 1.885729706338136,
      "learning_rate": 8.466376709003841e-06,
      "loss": 0.8444,
      "step": 194
    },
    {
      "epoch": 0.334043459735833,
      "grad_norm": 1.8594029276174733,
      "learning_rate": 8.423170063662014e-06,
      "loss": 0.8301,
      "step": 196
    },
    {
      "epoch": 0.33745206646783127,
      "grad_norm": 2.0661661167934775,
      "learning_rate": 8.379476829823919e-06,
      "loss": 0.8194,
      "step": 198
    },
    {
      "epoch": 0.34086067319982954,
      "grad_norm": 1.843312820214377,
      "learning_rate": 8.335303218289645e-06,
      "loss": 0.8223,
      "step": 200
    },
    {
      "epoch": 0.3442692799318279,
      "grad_norm": 1.9237357576874556,
      "learning_rate": 8.290655508142855e-06,
      "loss": 0.8066,
      "step": 202
    },
    {
      "epoch": 0.34767788666382615,
      "grad_norm": 1.8680286378652424,
      "learning_rate": 8.245540045858229e-06,
      "loss": 0.8035,
      "step": 204
    },
    {
      "epoch": 0.3510864933958245,
      "grad_norm": 1.7375474557232102,
      "learning_rate": 8.199963244399351e-06,
      "loss": 0.8115,
      "step": 206
    },
    {
      "epoch": 0.35449510012782276,
      "grad_norm": 1.8317920240540273,
      "learning_rate": 8.153931582307124e-06,
      "loss": 0.8333,
      "step": 208
    },
    {
      "epoch": 0.35790370685982104,
      "grad_norm": 1.8807672342031,
      "learning_rate": 8.107451602778887e-06,
      "loss": 0.8114,
      "step": 210
    },
    {
      "epoch": 0.36131231359181937,
      "grad_norm": 1.8233277253885543,
      "learning_rate": 8.060529912738316e-06,
      "loss": 0.8084,
      "step": 212
    },
    {
      "epoch": 0.36472092032381764,
      "grad_norm": 1.8018328218174473,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.8036,
      "step": 214
    },
    {
      "epoch": 0.3681295270558159,
      "grad_norm": 1.7500277066768286,
      "learning_rate": 7.965388141802792e-06,
      "loss": 0.8493,
      "step": 216
    },
    {
      "epoch": 0.37153813378781425,
      "grad_norm": 1.8202749968468963,
      "learning_rate": 7.917181584890113e-06,
      "loss": 0.8666,
      "step": 218
    },
    {
      "epoch": 0.3749467405198125,
      "grad_norm": 1.7463930119282958,
      "learning_rate": 7.868560363507268e-06,
      "loss": 0.8104,
      "step": 220
    },
    {
      "epoch": 0.3783553472518108,
      "grad_norm": 1.687535685348095,
      "learning_rate": 7.819531388946e-06,
      "loss": 0.7423,
      "step": 222
    },
    {
      "epoch": 0.38176395398380913,
      "grad_norm": 1.8780292088928865,
      "learning_rate": 7.770101630458364e-06,
      "loss": 0.7817,
      "step": 224
    },
    {
      "epoch": 0.3851725607158074,
      "grad_norm": 1.7870199236678057,
      "learning_rate": 7.720278114266084e-06,
      "loss": 0.8055,
      "step": 226
    },
    {
      "epoch": 0.3885811674478057,
      "grad_norm": 2.1478936673535522,
      "learning_rate": 7.670067922561795e-06,
      "loss": 0.8181,
      "step": 228
    },
    {
      "epoch": 0.391989774179804,
      "grad_norm": 1.996195916952366,
      "learning_rate": 7.619478192502352e-06,
      "loss": 0.786,
      "step": 230
    },
    {
      "epoch": 0.3953983809118023,
      "grad_norm": 1.7607026492930253,
      "learning_rate": 7.5685161151943066e-06,
      "loss": 0.7861,
      "step": 232
    },
    {
      "epoch": 0.39880698764380057,
      "grad_norm": 1.982916095534787,
      "learning_rate": 7.5171889346717264e-06,
      "loss": 0.8314,
      "step": 234
    },
    {
      "epoch": 0.4022155943757989,
      "grad_norm": 1.8475361020735406,
      "learning_rate": 7.465503946866491e-06,
      "loss": 0.8045,
      "step": 236
    },
    {
      "epoch": 0.4056242011077972,
      "grad_norm": 1.6483972591254976,
      "learning_rate": 7.413468498571199e-06,
      "loss": 0.8149,
      "step": 238
    },
    {
      "epoch": 0.4090328078397955,
      "grad_norm": 1.9862409096042428,
      "learning_rate": 7.361089986394863e-06,
      "loss": 0.8038,
      "step": 240
    },
    {
      "epoch": 0.4124414145717938,
      "grad_norm": 1.9381151415852729,
      "learning_rate": 7.308375855711508e-06,
      "loss": 0.8203,
      "step": 242
    },
    {
      "epoch": 0.41585002130379206,
      "grad_norm": 1.8252397931133695,
      "learning_rate": 7.2553335996018485e-06,
      "loss": 0.8096,
      "step": 244
    },
    {
      "epoch": 0.4192586280357904,
      "grad_norm": 1.8096704344426044,
      "learning_rate": 7.201970757788172e-06,
      "loss": 0.7692,
      "step": 246
    },
    {
      "epoch": 0.42266723476778867,
      "grad_norm": 1.7220473220530976,
      "learning_rate": 7.148294915562617e-06,
      "loss": 0.7941,
      "step": 248
    },
    {
      "epoch": 0.42607584149978694,
      "grad_norm": 1.6560074887380722,
      "learning_rate": 7.09431370270894e-06,
      "loss": 0.795,
      "step": 250
    },
    {
      "epoch": 0.4294844482317853,
      "grad_norm": 1.6420030994162023,
      "learning_rate": 7.040034792417986e-06,
      "loss": 0.7854,
      "step": 252
    },
    {
      "epoch": 0.43289305496378355,
      "grad_norm": 1.8777722598215576,
      "learning_rate": 6.985465900196971e-06,
      "loss": 0.8445,
      "step": 254
    },
    {
      "epoch": 0.43630166169578183,
      "grad_norm": 1.8421388677475183,
      "learning_rate": 6.930614782772766e-06,
      "loss": 0.8372,
      "step": 256
    },
    {
      "epoch": 0.43971026842778016,
      "grad_norm": 1.9176127454855982,
      "learning_rate": 6.875489236989306e-06,
      "loss": 0.82,
      "step": 258
    },
    {
      "epoch": 0.44311887515977844,
      "grad_norm": 1.8186020984518791,
      "learning_rate": 6.8200970986993e-06,
      "loss": 0.8208,
      "step": 260
    },
    {
      "epoch": 0.4465274818917767,
      "grad_norm": 1.9882080130853592,
      "learning_rate": 6.764446241650411e-06,
      "loss": 0.8073,
      "step": 262
    },
    {
      "epoch": 0.44993608862377504,
      "grad_norm": 1.5626224821883135,
      "learning_rate": 6.708544576366024e-06,
      "loss": 0.8231,
      "step": 264
    },
    {
      "epoch": 0.4533446953557733,
      "grad_norm": 1.8993741801940567,
      "learning_rate": 6.65240004902081e-06,
      "loss": 0.8065,
      "step": 266
    },
    {
      "epoch": 0.45675330208777165,
      "grad_norm": 1.7550634910546117,
      "learning_rate": 6.5960206403112045e-06,
      "loss": 0.7485,
      "step": 268
    },
    {
      "epoch": 0.4601619088197699,
      "grad_norm": 1.9326203800829964,
      "learning_rate": 6.539414364320997e-06,
      "loss": 0.779,
      "step": 270
    },
    {
      "epoch": 0.4635705155517682,
      "grad_norm": 1.6516836712807916,
      "learning_rate": 6.4825892673821556e-06,
      "loss": 0.794,
      "step": 272
    },
    {
      "epoch": 0.46697912228376653,
      "grad_norm": 2.003145254209738,
      "learning_rate": 6.425553426931075e-06,
      "loss": 0.7618,
      "step": 274
    },
    {
      "epoch": 0.4703877290157648,
      "grad_norm": 1.6246913128967992,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.7785,
      "step": 276
    },
    {
      "epoch": 0.4737963357477631,
      "grad_norm": 1.8754262243075055,
      "learning_rate": 6.31088197386666e-06,
      "loss": 0.765,
      "step": 278
    },
    {
      "epoch": 0.4772049424797614,
      "grad_norm": 1.6448889983604882,
      "learning_rate": 6.2532626612936035e-06,
      "loss": 0.7977,
      "step": 280
    },
    {
      "epoch": 0.4806135492117597,
      "grad_norm": 1.8537195734950456,
      "learning_rate": 6.195465202971883e-06,
      "loss": 0.7637,
      "step": 282
    },
    {
      "epoch": 0.48402215594375797,
      "grad_norm": 1.7350572092463452,
      "learning_rate": 6.137497814554773e-06,
      "loss": 0.7974,
      "step": 284
    },
    {
      "epoch": 0.4874307626757563,
      "grad_norm": 1.659202469943535,
      "learning_rate": 6.079368735850355e-06,
      "loss": 0.78,
      "step": 286
    },
    {
      "epoch": 0.4908393694077546,
      "grad_norm": 2.0704522103071326,
      "learning_rate": 6.021086229650273e-06,
      "loss": 0.78,
      "step": 288
    },
    {
      "epoch": 0.49424797613975285,
      "grad_norm": 1.7654110381757835,
      "learning_rate": 5.96265858055521e-06,
      "loss": 0.8101,
      "step": 290
    },
    {
      "epoch": 0.4976565828717512,
      "grad_norm": 1.6744010674594727,
      "learning_rate": 5.904094093797274e-06,
      "loss": 0.7854,
      "step": 292
    },
    {
      "epoch": 0.5010651896037495,
      "grad_norm": 1.672935281461675,
      "learning_rate": 5.845401094059439e-06,
      "loss": 0.8233,
      "step": 294
    },
    {
      "epoch": 0.5044737963357477,
      "grad_norm": 1.7571827575826455,
      "learning_rate": 5.786587924292228e-06,
      "loss": 0.8072,
      "step": 296
    },
    {
      "epoch": 0.507882403067746,
      "grad_norm": 1.7879627913914475,
      "learning_rate": 5.727662944527808e-06,
      "loss": 0.8236,
      "step": 298
    },
    {
      "epoch": 0.5112910097997444,
      "grad_norm": 1.6693554243950373,
      "learning_rate": 5.668634530691639e-06,
      "loss": 0.7829,
      "step": 300
    },
    {
      "epoch": 0.5146996165317427,
      "grad_norm": 1.7885967539315661,
      "learning_rate": 5.609511073411881e-06,
      "loss": 0.8069,
      "step": 302
    },
    {
      "epoch": 0.518108223263741,
      "grad_norm": 1.8927285112086825,
      "learning_rate": 5.550300976826697e-06,
      "loss": 0.7978,
      "step": 304
    },
    {
      "epoch": 0.5215168299957392,
      "grad_norm": 1.9422631435450946,
      "learning_rate": 5.491012657389644e-06,
      "loss": 0.8143,
      "step": 306
    },
    {
      "epoch": 0.5249254367277375,
      "grad_norm": 1.8664920158914793,
      "learning_rate": 5.431654542673315e-06,
      "loss": 0.8184,
      "step": 308
    },
    {
      "epoch": 0.5283340434597358,
      "grad_norm": 1.7545208791954934,
      "learning_rate": 5.372235070171387e-06,
      "loss": 0.7976,
      "step": 310
    },
    {
      "epoch": 0.5317426501917342,
      "grad_norm": 1.8345783742469912,
      "learning_rate": 5.312762686099282e-06,
      "loss": 0.7726,
      "step": 312
    },
    {
      "epoch": 0.5351512569237324,
      "grad_norm": 1.777874135882686,
      "learning_rate": 5.253245844193564e-06,
      "loss": 0.791,
      "step": 314
    },
    {
      "epoch": 0.5385598636557307,
      "grad_norm": 1.7559490288145514,
      "learning_rate": 5.193693004510284e-06,
      "loss": 0.7654,
      "step": 316
    },
    {
      "epoch": 0.541968470387729,
      "grad_norm": 1.7582347301570869,
      "learning_rate": 5.1341126322224145e-06,
      "loss": 0.7509,
      "step": 318
    },
    {
      "epoch": 0.5453770771197273,
      "grad_norm": 1.6309306234524417,
      "learning_rate": 5.0745131964165686e-06,
      "loss": 0.7737,
      "step": 320
    },
    {
      "epoch": 0.5487856838517257,
      "grad_norm": 1.6033761601218945,
      "learning_rate": 5.014903168889155e-06,
      "loss": 0.7814,
      "step": 322
    },
    {
      "epoch": 0.5521942905837239,
      "grad_norm": 1.9401976205946008,
      "learning_rate": 4.955291022942146e-06,
      "loss": 0.7782,
      "step": 324
    },
    {
      "epoch": 0.5556028973157222,
      "grad_norm": 1.8683852058875654,
      "learning_rate": 4.8956852321786394e-06,
      "loss": 0.802,
      "step": 326
    },
    {
      "epoch": 0.5590115040477205,
      "grad_norm": 1.859200797460236,
      "learning_rate": 4.836094269298373e-06,
      "loss": 0.7798,
      "step": 328
    },
    {
      "epoch": 0.5624201107797188,
      "grad_norm": 1.803688727471679,
      "learning_rate": 4.7765266048933626e-06,
      "loss": 0.8098,
      "step": 330
    },
    {
      "epoch": 0.565828717511717,
      "grad_norm": 1.7851903640220046,
      "learning_rate": 4.7169907062438556e-06,
      "loss": 0.7728,
      "step": 332
    },
    {
      "epoch": 0.5692373242437154,
      "grad_norm": 1.6343651082668207,
      "learning_rate": 4.657495036114731e-06,
      "loss": 0.7721,
      "step": 334
    },
    {
      "epoch": 0.5726459309757137,
      "grad_norm": 1.6586804458742679,
      "learning_rate": 4.598048051552563e-06,
      "loss": 0.8186,
      "step": 336
    },
    {
      "epoch": 0.576054537707712,
      "grad_norm": 1.750348106992188,
      "learning_rate": 4.53865820268349e-06,
      "loss": 0.7503,
      "step": 338
    },
    {
      "epoch": 0.5794631444397103,
      "grad_norm": 1.7165597335606,
      "learning_rate": 4.479333931512064e-06,
      "loss": 0.7915,
      "step": 340
    },
    {
      "epoch": 0.5828717511717085,
      "grad_norm": 1.7582639909981337,
      "learning_rate": 4.4200836707212525e-06,
      "loss": 0.7754,
      "step": 342
    },
    {
      "epoch": 0.5862803579037068,
      "grad_norm": 1.6838798515151536,
      "learning_rate": 4.360915842473778e-06,
      "loss": 0.7543,
      "step": 344
    },
    {
      "epoch": 0.5896889646357052,
      "grad_norm": 1.4519358598788679,
      "learning_rate": 4.301838857214944e-06,
      "loss": 0.7629,
      "step": 346
    },
    {
      "epoch": 0.5930975713677035,
      "grad_norm": 1.6951153457921704,
      "learning_rate": 4.2428611124771184e-06,
      "loss": 0.7919,
      "step": 348
    },
    {
      "epoch": 0.5965061780997017,
      "grad_norm": 1.716470856323328,
      "learning_rate": 4.183990991686071e-06,
      "loss": 0.7853,
      "step": 350
    },
    {
      "epoch": 0.5999147848317,
      "grad_norm": 1.8973005312026041,
      "learning_rate": 4.125236862969304e-06,
      "loss": 0.7695,
      "step": 352
    },
    {
      "epoch": 0.6033233915636983,
      "grad_norm": 1.4862142786601558,
      "learning_rate": 4.066607077966559e-06,
      "loss": 0.7395,
      "step": 354
    },
    {
      "epoch": 0.6067319982956967,
      "grad_norm": 1.8815334843344234,
      "learning_rate": 4.008109970642665e-06,
      "loss": 0.787,
      "step": 356
    },
    {
      "epoch": 0.610140605027695,
      "grad_norm": 1.6378926667179599,
      "learning_rate": 3.949753856102909e-06,
      "loss": 0.7393,
      "step": 358
    },
    {
      "epoch": 0.6135492117596932,
      "grad_norm": 1.7084737118259692,
      "learning_rate": 3.891547029411074e-06,
      "loss": 0.8053,
      "step": 360
    },
    {
      "epoch": 0.6169578184916915,
      "grad_norm": 1.8544822094338145,
      "learning_rate": 3.833497764410336e-06,
      "loss": 0.7674,
      "step": 362
    },
    {
      "epoch": 0.6203664252236898,
      "grad_norm": 1.7409266320356052,
      "learning_rate": 3.7756143125471745e-06,
      "loss": 0.7489,
      "step": 364
    },
    {
      "epoch": 0.6237750319556881,
      "grad_norm": 1.8302377955954783,
      "learning_rate": 3.7179049016984616e-06,
      "loss": 0.7757,
      "step": 366
    },
    {
      "epoch": 0.6271836386876865,
      "grad_norm": 1.6552784195481047,
      "learning_rate": 3.660377735001911e-06,
      "loss": 0.7656,
      "step": 368
    },
    {
      "epoch": 0.6305922454196847,
      "grad_norm": 1.8034934811229115,
      "learning_rate": 3.6030409896900333e-06,
      "loss": 0.7731,
      "step": 370
    },
    {
      "epoch": 0.634000852151683,
      "grad_norm": 1.656497367167841,
      "learning_rate": 3.545902815927783e-06,
      "loss": 0.7564,
      "step": 372
    },
    {
      "epoch": 0.6374094588836813,
      "grad_norm": 1.6538858927486708,
      "learning_rate": 3.4889713356540435e-06,
      "loss": 0.7556,
      "step": 374
    },
    {
      "epoch": 0.6408180656156796,
      "grad_norm": 1.7721211559316303,
      "learning_rate": 3.432254641427136e-06,
      "loss": 0.7754,
      "step": 376
    },
    {
      "epoch": 0.6442266723476778,
      "grad_norm": 2.0169569259903946,
      "learning_rate": 3.375760795274493e-06,
      "loss": 0.7745,
      "step": 378
    },
    {
      "epoch": 0.6476352790796762,
      "grad_norm": 1.5174685690539855,
      "learning_rate": 3.3194978275466755e-06,
      "loss": 0.7649,
      "step": 380
    },
    {
      "epoch": 0.6510438858116745,
      "grad_norm": 1.5043926003321177,
      "learning_rate": 3.2634737357758994e-06,
      "loss": 0.74,
      "step": 382
    },
    {
      "epoch": 0.6544524925436728,
      "grad_norm": 1.6692309306036763,
      "learning_rate": 3.207696483539219e-06,
      "loss": 0.7611,
      "step": 384
    },
    {
      "epoch": 0.657861099275671,
      "grad_norm": 1.784881350588902,
      "learning_rate": 3.152173999326537e-06,
      "loss": 0.7675,
      "step": 386
    },
    {
      "epoch": 0.6612697060076693,
      "grad_norm": 1.7444177474948184,
      "learning_rate": 3.096914175413611e-06,
      "loss": 0.7685,
      "step": 388
    },
    {
      "epoch": 0.6646783127396677,
      "grad_norm": 1.999124406609706,
      "learning_rate": 3.041924866740197e-06,
      "loss": 0.7505,
      "step": 390
    },
    {
      "epoch": 0.668086919471666,
      "grad_norm": 1.5350179140801363,
      "learning_rate": 2.987213889793502e-06,
      "loss": 0.7792,
      "step": 392
    },
    {
      "epoch": 0.6714955262036643,
      "grad_norm": 1.7053442818016058,
      "learning_rate": 2.9327890214971135e-06,
      "loss": 0.7661,
      "step": 394
    },
    {
      "epoch": 0.6749041329356625,
      "grad_norm": 1.6957720054975949,
      "learning_rate": 2.878657998105533e-06,
      "loss": 0.7587,
      "step": 396
    },
    {
      "epoch": 0.6783127396676608,
      "grad_norm": 1.7615190253028947,
      "learning_rate": 2.824828514104512e-06,
      "loss": 0.8067,
      "step": 398
    },
    {
      "epoch": 0.6817213463996591,
      "grad_norm": 1.5571276091989836,
      "learning_rate": 2.771308221117309e-06,
      "loss": 0.7478,
      "step": 400
    },
    {
      "epoch": 0.6851299531316575,
      "grad_norm": 1.5280419708791328,
      "learning_rate": 2.7181047268170436e-06,
      "loss": 0.8071,
      "step": 402
    },
    {
      "epoch": 0.6885385598636558,
      "grad_norm": 1.6771310372158614,
      "learning_rate": 2.665225593845307e-06,
      "loss": 0.7799,
      "step": 404
    },
    {
      "epoch": 0.691947166595654,
      "grad_norm": 1.7562403180703012,
      "learning_rate": 2.6126783387371545e-06,
      "loss": 0.7471,
      "step": 406
    },
    {
      "epoch": 0.6953557733276523,
      "grad_norm": 1.796003576907647,
      "learning_rate": 2.5604704308526784e-06,
      "loss": 0.7716,
      "step": 408
    },
    {
      "epoch": 0.6987643800596506,
      "grad_norm": 1.6859622044955247,
      "learning_rate": 2.5086092913152533e-06,
      "loss": 0.8003,
      "step": 410
    },
    {
      "epoch": 0.702172986791649,
      "grad_norm": 1.6580555467710547,
      "learning_rate": 2.4571022919566747e-06,
      "loss": 0.7463,
      "step": 412
    },
    {
      "epoch": 0.7055815935236472,
      "grad_norm": 1.5775642264745176,
      "learning_rate": 2.4059567542692688e-06,
      "loss": 0.7329,
      "step": 414
    },
    {
      "epoch": 0.7089902002556455,
      "grad_norm": 1.6145511797148977,
      "learning_rate": 2.3551799483651894e-06,
      "loss": 0.7557,
      "step": 416
    },
    {
      "epoch": 0.7123988069876438,
      "grad_norm": 1.815641939119739,
      "learning_rate": 2.3047790919429895e-06,
      "loss": 0.7618,
      "step": 418
    },
    {
      "epoch": 0.7158074137196421,
      "grad_norm": 1.751085671764699,
      "learning_rate": 2.254761349261676e-06,
      "loss": 0.7307,
      "step": 420
    },
    {
      "epoch": 0.7192160204516403,
      "grad_norm": 1.722113083895161,
      "learning_rate": 2.2051338301223187e-06,
      "loss": 0.7536,
      "step": 422
    },
    {
      "epoch": 0.7226246271836387,
      "grad_norm": 1.5887620089800378,
      "learning_rate": 2.1559035888574433e-06,
      "loss": 0.7139,
      "step": 424
    },
    {
      "epoch": 0.726033233915637,
      "grad_norm": 1.792034913644711,
      "learning_rate": 2.107077623328275e-06,
      "loss": 0.7335,
      "step": 426
    },
    {
      "epoch": 0.7294418406476353,
      "grad_norm": 1.5385202560838132,
      "learning_rate": 2.0586628739300386e-06,
      "loss": 0.7752,
      "step": 428
    },
    {
      "epoch": 0.7328504473796336,
      "grad_norm": 1.554007093824859,
      "learning_rate": 2.0106662226053924e-06,
      "loss": 0.7814,
      "step": 430
    },
    {
      "epoch": 0.7362590541116318,
      "grad_norm": 1.60530576083176,
      "learning_rate": 1.9630944918662122e-06,
      "loss": 0.7552,
      "step": 432
    },
    {
      "epoch": 0.7396676608436301,
      "grad_norm": 1.5045851644291919,
      "learning_rate": 1.9159544438237797e-06,
      "loss": 0.7602,
      "step": 434
    },
    {
      "epoch": 0.7430762675756285,
      "grad_norm": 1.6227515256198517,
      "learning_rate": 1.8692527792276e-06,
      "loss": 0.7695,
      "step": 436
    },
    {
      "epoch": 0.7464848743076268,
      "grad_norm": 1.6302337283434762,
      "learning_rate": 1.8229961365128985e-06,
      "loss": 0.732,
      "step": 438
    },
    {
      "epoch": 0.749893481039625,
      "grad_norm": 1.5834938000524268,
      "learning_rate": 1.7771910908570156e-06,
      "loss": 0.7566,
      "step": 440
    },
    {
      "epoch": 0.7533020877716233,
      "grad_norm": 1.4483744882069087,
      "learning_rate": 1.7318441532447555e-06,
      "loss": 0.8073,
      "step": 442
    },
    {
      "epoch": 0.7567106945036216,
      "grad_norm": 1.6825397102814452,
      "learning_rate": 1.6869617695429024e-06,
      "loss": 0.7579,
      "step": 444
    },
    {
      "epoch": 0.76011930123562,
      "grad_norm": 1.5595787362408242,
      "learning_rate": 1.6425503195839416e-06,
      "loss": 0.8201,
      "step": 446
    },
    {
      "epoch": 0.7635279079676183,
      "grad_norm": 1.6860253511961054,
      "learning_rate": 1.5986161162592173e-06,
      "loss": 0.7515,
      "step": 448
    },
    {
      "epoch": 0.7669365146996165,
      "grad_norm": 1.6217349191290344,
      "learning_rate": 1.555165404621567e-06,
      "loss": 0.7684,
      "step": 450
    },
    {
      "epoch": 0.7703451214316148,
      "grad_norm": 1.7202733697764292,
      "learning_rate": 1.5122043609976282e-06,
      "loss": 0.7335,
      "step": 452
    },
    {
      "epoch": 0.7737537281636131,
      "grad_norm": 1.7746564761962131,
      "learning_rate": 1.4697390921098886e-06,
      "loss": 0.7848,
      "step": 454
    },
    {
      "epoch": 0.7771623348956114,
      "grad_norm": 1.5349803625176617,
      "learning_rate": 1.4277756342086524e-06,
      "loss": 0.7757,
      "step": 456
    },
    {
      "epoch": 0.7805709416276098,
      "grad_norm": 1.6124010704416651,
      "learning_rate": 1.3863199522140053e-06,
      "loss": 0.8031,
      "step": 458
    },
    {
      "epoch": 0.783979548359608,
      "grad_norm": 1.7539891598801893,
      "learning_rate": 1.3453779388679378e-06,
      "loss": 0.7826,
      "step": 460
    },
    {
      "epoch": 0.7873881550916063,
      "grad_norm": 1.6197274527555163,
      "learning_rate": 1.3049554138967052e-06,
      "loss": 0.7706,
      "step": 462
    },
    {
      "epoch": 0.7907967618236046,
      "grad_norm": 1.5679920515099621,
      "learning_rate": 1.2650581231835923e-06,
      "loss": 0.7943,
      "step": 464
    },
    {
      "epoch": 0.7942053685556029,
      "grad_norm": 1.6044331803289653,
      "learning_rate": 1.225691737952152e-06,
      "loss": 0.7558,
      "step": 466
    },
    {
      "epoch": 0.7976139752876011,
      "grad_norm": 1.5903299233431156,
      "learning_rate": 1.1868618539600718e-06,
      "loss": 0.767,
      "step": 468
    },
    {
      "epoch": 0.8010225820195995,
      "grad_norm": 1.5362855174389376,
      "learning_rate": 1.1485739907037546e-06,
      "loss": 0.7838,
      "step": 470
    },
    {
      "epoch": 0.8044311887515978,
      "grad_norm": 1.6430906694597753,
      "learning_rate": 1.1108335906337559e-06,
      "loss": 0.7519,
      "step": 472
    },
    {
      "epoch": 0.8078397954835961,
      "grad_norm": 1.7499785715417322,
      "learning_rate": 1.0736460183811547e-06,
      "loss": 0.7681,
      "step": 474
    },
    {
      "epoch": 0.8112484022155944,
      "grad_norm": 1.7219048713168092,
      "learning_rate": 1.0370165599950026e-06,
      "loss": 0.7552,
      "step": 476
    },
    {
      "epoch": 0.8146570089475926,
      "grad_norm": 1.797003265591149,
      "learning_rate": 1.0009504221909284e-06,
      "loss": 0.7336,
      "step": 478
    },
    {
      "epoch": 0.818065615679591,
      "grad_norm": 1.322267929113636,
      "learning_rate": 9.654527316110323e-07,
      "loss": 0.7478,
      "step": 480
    },
    {
      "epoch": 0.8214742224115893,
      "grad_norm": 1.6060348997953555,
      "learning_rate": 9.305285340951509e-07,
      "loss": 0.7693,
      "step": 482
    },
    {
      "epoch": 0.8248828291435876,
      "grad_norm": 1.589739100394995,
      "learning_rate": 8.961827939636198e-07,
      "loss": 0.762,
      "step": 484
    },
    {
      "epoch": 0.8282914358755858,
      "grad_norm": 1.5977203911776114,
      "learning_rate": 8.624203933116088e-07,
      "loss": 0.7517,
      "step": 486
    },
    {
      "epoch": 0.8317000426075841,
      "grad_norm": 1.4433282291047127,
      "learning_rate": 8.292461313151662e-07,
      "loss": 0.7814,
      "step": 488
    },
    {
      "epoch": 0.8351086493395824,
      "grad_norm": 1.5861673926723674,
      "learning_rate": 7.966647235490222e-07,
      "loss": 0.7988,
      "step": 490
    },
    {
      "epoch": 0.8385172560715808,
      "grad_norm": 1.7785118279929921,
      "learning_rate": 7.646808013163026e-07,
      "loss": 0.7871,
      "step": 492
    },
    {
      "epoch": 0.8419258628035791,
      "grad_norm": 1.4097703537190163,
      "learning_rate": 7.332989109902028e-07,
      "loss": 0.7471,
      "step": 494
    },
    {
      "epoch": 0.8453344695355773,
      "grad_norm": 1.6394050318483955,
      "learning_rate": 7.025235133677455e-07,
      "loss": 0.7793,
      "step": 496
    },
    {
      "epoch": 0.8487430762675756,
      "grad_norm": 1.4882317995688061,
      "learning_rate": 6.723589830356925e-07,
      "loss": 0.7587,
      "step": 498
    },
    {
      "epoch": 0.8521516829995739,
      "grad_norm": 1.5993598547748789,
      "learning_rate": 6.428096077487244e-07,
      "loss": 0.7256,
      "step": 500
    },
    {
      "epoch": 0.8555602897315722,
      "grad_norm": 1.8369103578526638,
      "learning_rate": 6.138795878199467e-07,
      "loss": 0.7346,
      "step": 502
    },
    {
      "epoch": 0.8589688964635706,
      "grad_norm": 1.3809387771101223,
      "learning_rate": 5.855730355238415e-07,
      "loss": 0.7389,
      "step": 504
    },
    {
      "epoch": 0.8623775031955688,
      "grad_norm": 1.606520808055538,
      "learning_rate": 5.578939745117173e-07,
      "loss": 0.7626,
      "step": 506
    },
    {
      "epoch": 0.8657861099275671,
      "grad_norm": 1.4252640716496985,
      "learning_rate": 5.308463392397756e-07,
      "loss": 0.7471,
      "step": 508
    },
    {
      "epoch": 0.8691947166595654,
      "grad_norm": 1.5620326120060357,
      "learning_rate": 5.044339744098348e-07,
      "loss": 0.7919,
      "step": 510
    },
    {
      "epoch": 0.8726033233915637,
      "grad_norm": 1.582860405051893,
      "learning_rate": 4.786606344228317e-07,
      "loss": 0.8001,
      "step": 512
    },
    {
      "epoch": 0.876011930123562,
      "grad_norm": 1.6390377107564265,
      "learning_rate": 4.5352998284514004e-07,
      "loss": 0.7693,
      "step": 514
    },
    {
      "epoch": 0.8794205368555603,
      "grad_norm": 1.5740989498771922,
      "learning_rate": 4.2904559188782334e-07,
      "loss": 0.743,
      "step": 516
    },
    {
      "epoch": 0.8828291435875586,
      "grad_norm": 1.6277724611514495,
      "learning_rate": 4.05210941898847e-07,
      "loss": 0.7204,
      "step": 518
    },
    {
      "epoch": 0.8862377503195569,
      "grad_norm": 1.4964610352537944,
      "learning_rate": 3.82029420868375e-07,
      "loss": 0.7572,
      "step": 520
    },
    {
      "epoch": 0.8896463570515551,
      "grad_norm": 1.5631623729163737,
      "learning_rate": 3.5950432394717296e-07,
      "loss": 0.7451,
      "step": 522
    },
    {
      "epoch": 0.8930549637835534,
      "grad_norm": 1.6364549935760584,
      "learning_rate": 3.3763885297822153e-07,
      "loss": 0.7748,
      "step": 524
    },
    {
      "epoch": 0.8964635705155518,
      "grad_norm": 1.901698775456035,
      "learning_rate": 3.1643611604158687e-07,
      "loss": 0.7467,
      "step": 526
    },
    {
      "epoch": 0.8998721772475501,
      "grad_norm": 1.6649529271504364,
      "learning_rate": 2.958991270126221e-07,
      "loss": 0.7316,
      "step": 528
    },
    {
      "epoch": 0.9032807839795484,
      "grad_norm": 1.7118286972837617,
      "learning_rate": 2.760308051335564e-07,
      "loss": 0.7546,
      "step": 530
    },
    {
      "epoch": 0.9066893907115466,
      "grad_norm": 1.5718606351870987,
      "learning_rate": 2.5683397459854045e-07,
      "loss": 0.7064,
      "step": 532
    },
    {
      "epoch": 0.9100979974435449,
      "grad_norm": 1.3894669570974365,
      "learning_rate": 2.383113641521956e-07,
      "loss": 0.747,
      "step": 534
    },
    {
      "epoch": 0.9135066041755433,
      "grad_norm": 1.4787818567245852,
      "learning_rate": 2.2046560670173912e-07,
      "loss": 0.7447,
      "step": 536
    },
    {
      "epoch": 0.9169152109075416,
      "grad_norm": 1.6746803954544742,
      "learning_rate": 2.0329923894272463e-07,
      "loss": 0.7329,
      "step": 538
    },
    {
      "epoch": 0.9203238176395399,
      "grad_norm": 1.6476225947328762,
      "learning_rate": 1.8681470099846554e-07,
      "loss": 0.7494,
      "step": 540
    },
    {
      "epoch": 0.9237324243715381,
      "grad_norm": 1.4591813987858449,
      "learning_rate": 1.710143360731781e-07,
      "loss": 0.7251,
      "step": 542
    },
    {
      "epoch": 0.9271410311035364,
      "grad_norm": 1.6247674197735347,
      "learning_rate": 1.559003901189099e-07,
      "loss": 0.7689,
      "step": 544
    },
    {
      "epoch": 0.9305496378355347,
      "grad_norm": 1.7245251671767248,
      "learning_rate": 1.4147501151628273e-07,
      "loss": 0.7592,
      "step": 546
    },
    {
      "epoch": 0.9339582445675331,
      "grad_norm": 1.4107246676973044,
      "learning_rate": 1.2774025076911445e-07,
      "loss": 0.7369,
      "step": 548
    },
    {
      "epoch": 0.9373668512995313,
      "grad_norm": 1.6265296555944986,
      "learning_rate": 1.146980602129466e-07,
      "loss": 0.735,
      "step": 550
    },
    {
      "epoch": 0.9407754580315296,
      "grad_norm": 1.7696409777572968,
      "learning_rate": 1.0235029373752758e-07,
      "loss": 0.7895,
      "step": 552
    },
    {
      "epoch": 0.9441840647635279,
      "grad_norm": 1.6245561044198507,
      "learning_rate": 9.069870652329283e-08,
      "loss": 0.8128,
      "step": 554
    },
    {
      "epoch": 0.9475926714955262,
      "grad_norm": 1.5989317111370667,
      "learning_rate": 7.974495479187271e-08,
      "loss": 0.7438,
      "step": 556
    },
    {
      "epoch": 0.9510012782275244,
      "grad_norm": 1.4760047355170607,
      "learning_rate": 6.949059557066596e-08,
      "loss": 0.7748,
      "step": 558
    },
    {
      "epoch": 0.9544098849595228,
      "grad_norm": 1.7501965543108127,
      "learning_rate": 5.993708647151874e-08,
      "loss": 0.775,
      "step": 560
    },
    {
      "epoch": 0.9578184916915211,
      "grad_norm": 1.4579885728200068,
      "learning_rate": 5.108578548352838e-08,
      "loss": 0.7588,
      "step": 562
    },
    {
      "epoch": 0.9612270984235194,
      "grad_norm": 1.5049680408512587,
      "learning_rate": 4.2937950780013174e-08,
      "loss": 0.7671,
      "step": 564
    },
    {
      "epoch": 0.9646357051555177,
      "grad_norm": 1.6438976356754593,
      "learning_rate": 3.549474053966828e-08,
      "loss": 0.7908,
      "step": 566
    },
    {
      "epoch": 0.9680443118875159,
      "grad_norm": 1.4470095982360724,
      "learning_rate": 2.8757212781935218e-08,
      "loss": 0.7466,
      "step": 568
    },
    {
      "epoch": 0.9714529186195143,
      "grad_norm": 1.8960810552290914,
      "learning_rate": 2.2726325216608224e-08,
      "loss": 0.7697,
      "step": 570
    },
    {
      "epoch": 0.9748615253515126,
      "grad_norm": 1.6811694769201933,
      "learning_rate": 1.7402935107702634e-08,
      "loss": 0.7753,
      "step": 572
    },
    {
      "epoch": 0.9782701320835109,
      "grad_norm": 1.487143334303855,
      "learning_rate": 1.2787799151596225e-08,
      "loss": 0.7234,
      "step": 574
    },
    {
      "epoch": 0.9816787388155092,
      "grad_norm": 1.3834640793692283,
      "learning_rate": 8.881573369469153e-09,
      "loss": 0.7403,
      "step": 576
    },
    {
      "epoch": 0.9850873455475074,
      "grad_norm": 1.6803671958080162,
      "learning_rate": 5.684813014052437e-09,
      "loss": 0.7659,
      "step": 578
    },
    {
      "epoch": 0.9884959522795057,
      "grad_norm": 1.556690011938104,
      "learning_rate": 3.197972490704415e-09,
      "loss": 0.7572,
      "step": 580
    },
    {
      "epoch": 0.9919045590115041,
      "grad_norm": 1.7044244176997279,
      "learning_rate": 1.421405292813538e-09,
      "loss": 0.7572,
      "step": 582
    },
    {
      "epoch": 0.9953131657435024,
      "grad_norm": 1.5088989927512526,
      "learning_rate": 3.553639515574414e-10,
      "loss": 0.752,
      "step": 584
    },
    {
      "epoch": 0.9987217724755006,
      "grad_norm": 1.4553290863162762,
      "learning_rate": 0.0,
      "loss": 0.7504,
      "step": 586
    },
    {
      "epoch": 0.9987217724755006,
      "step": 586,
      "total_flos": 6.719053758464e+16,
      "train_loss": 0.8281301904863871,
      "train_runtime": 9052.2037,
      "train_samples_per_second": 1.037,
      "train_steps_per_second": 0.065
    }
  ],
  "logging_steps": 2,
  "max_steps": 586,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.719053758464e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}