diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6910 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1962, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0030581039755351682, + "grad_norm": 0.6435028314590454, + "learning_rate": 1.0101010101010103e-07, + "loss": 1.8936554193496704, + "step": 2 + }, + { + "epoch": 0.0061162079510703364, + "grad_norm": 0.5548882484436035, + "learning_rate": 3.0303030303030305e-07, + "loss": 1.8550586700439453, + "step": 4 + }, + { + "epoch": 0.009174311926605505, + "grad_norm": 0.27108362317085266, + "learning_rate": 5.05050505050505e-07, + "loss": 1.890197992324829, + "step": 6 + }, + { + "epoch": 0.012232415902140673, + "grad_norm": 0.24754057824611664, + "learning_rate": 7.070707070707071e-07, + "loss": 1.8445472717285156, + "step": 8 + }, + { + "epoch": 0.01529051987767584, + "grad_norm": 0.39890649914741516, + "learning_rate": 9.090909090909091e-07, + "loss": 2.010572910308838, + "step": 10 + }, + { + "epoch": 0.01834862385321101, + "grad_norm": 0.23249551653862, + "learning_rate": 1.111111111111111e-06, + "loss": 1.8801705837249756, + "step": 12 + }, + { + "epoch": 0.021406727828746176, + "grad_norm": 0.4299562871456146, + "learning_rate": 1.3131313131313134e-06, + "loss": 1.8805203437805176, + "step": 14 + }, + { + "epoch": 0.024464831804281346, + "grad_norm": 0.5231528282165527, + "learning_rate": 1.5151515151515152e-06, + "loss": 1.9465537071228027, + "step": 16 + }, + { + "epoch": 0.027522935779816515, + "grad_norm": 0.3482355773448944, + "learning_rate": 1.7171717171717173e-06, + "loss": 1.8298053741455078, + "step": 18 + }, + { + "epoch": 0.03058103975535168, + "grad_norm": 0.3003389239311218, + "learning_rate": 1.9191919191919192e-06, + "loss": 1.853845238685608, + "step": 20 + }, + { + "epoch": 0.03363914373088685, + "grad_norm": 0.5087025165557861, + "learning_rate": 2.1212121212121216e-06, + "loss": 1.9923889636993408, + "step": 22 + }, + { + "epoch": 0.03669724770642202, + "grad_norm": 2.0046560764312744, + "learning_rate": 2.3232323232323234e-06, + "loss": 2.008021354675293, + "step": 24 + }, + { + "epoch": 0.039755351681957186, + "grad_norm": 0.2651369571685791, + "learning_rate": 2.5252525252525258e-06, + "loss": 1.7058303356170654, + "step": 26 + }, + { + "epoch": 0.04281345565749235, + "grad_norm": 0.5547925233840942, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.8821287155151367, + "step": 28 + }, + { + "epoch": 0.045871559633027525, + "grad_norm": 0.5607280731201172, + "learning_rate": 2.9292929292929295e-06, + "loss": 2.1788079738616943, + "step": 30 + }, + { + "epoch": 0.04892966360856269, + "grad_norm": 0.36416563391685486, + "learning_rate": 3.131313131313132e-06, + "loss": 1.8534326553344727, + "step": 32 + }, + { + "epoch": 0.05198776758409786, + "grad_norm": 0.4965146481990814, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.9557833671569824, + "step": 34 + }, + { + "epoch": 0.05504587155963303, + "grad_norm": 0.3163432776927948, + "learning_rate": 3.5353535353535356e-06, + "loss": 1.7984235286712646, + "step": 36 + }, + { + "epoch": 0.0581039755351682, + "grad_norm": 0.3063645362854004, + "learning_rate": 3.737373737373738e-06, + "loss": 1.8264985084533691, + "step": 38 + }, + { + "epoch": 0.06116207951070336, + "grad_norm": 0.30639225244522095, + "learning_rate": 3.93939393939394e-06, + "loss": 1.8241571187973022, + "step": 40 + }, + { + "epoch": 0.06422018348623854, + "grad_norm": 0.3971042335033417, + "learning_rate": 4.141414141414142e-06, + "loss": 1.874243974685669, + "step": 42 + }, + { + "epoch": 0.0672782874617737, + "grad_norm": 0.6156560182571411, + "learning_rate": 4.343434343434344e-06, + "loss": 1.965466022491455, + "step": 44 + }, + { + "epoch": 0.07033639143730887, + "grad_norm": 0.5533192753791809, + "learning_rate": 4.5454545454545455e-06, + "loss": 2.0693740844726562, + "step": 46 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 1.9126055240631104, + "learning_rate": 4.747474747474748e-06, + "loss": 2.060253143310547, + "step": 48 + }, + { + "epoch": 0.0764525993883792, + "grad_norm": 0.3860923647880554, + "learning_rate": 4.94949494949495e-06, + "loss": 1.8577625751495361, + "step": 50 + }, + { + "epoch": 0.07951070336391437, + "grad_norm": 0.4684409499168396, + "learning_rate": 5.151515151515152e-06, + "loss": 1.8510971069335938, + "step": 52 + }, + { + "epoch": 0.08256880733944955, + "grad_norm": 0.4307204484939575, + "learning_rate": 5.353535353535354e-06, + "loss": 1.9931628704071045, + "step": 54 + }, + { + "epoch": 0.0856269113149847, + "grad_norm": 0.3140373229980469, + "learning_rate": 5.555555555555557e-06, + "loss": 1.925836443901062, + "step": 56 + }, + { + "epoch": 0.08868501529051988, + "grad_norm": 0.36317509412765503, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.9616905450820923, + "step": 58 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 0.21478985249996185, + "learning_rate": 5.95959595959596e-06, + "loss": 1.895378589630127, + "step": 60 + }, + { + "epoch": 0.09480122324159021, + "grad_norm": 0.2936638593673706, + "learning_rate": 6.1616161616161615e-06, + "loss": 1.8279492855072021, + "step": 62 + }, + { + "epoch": 0.09785932721712538, + "grad_norm": 0.3114721179008484, + "learning_rate": 6.363636363636364e-06, + "loss": 1.715104103088379, + "step": 64 + }, + { + "epoch": 0.10091743119266056, + "grad_norm": 0.32813334465026855, + "learning_rate": 6.565656565656566e-06, + "loss": 1.852712631225586, + "step": 66 + }, + { + "epoch": 0.10397553516819572, + "grad_norm": 0.37994885444641113, + "learning_rate": 6.767676767676769e-06, + "loss": 1.9753448963165283, + "step": 68 + }, + { + "epoch": 0.10703363914373089, + "grad_norm": 0.5206537246704102, + "learning_rate": 6.969696969696971e-06, + "loss": 1.8388103246688843, + "step": 70 + }, + { + "epoch": 0.11009174311926606, + "grad_norm": 0.6430595517158508, + "learning_rate": 7.171717171717172e-06, + "loss": 2.0399489402770996, + "step": 72 + }, + { + "epoch": 0.11314984709480122, + "grad_norm": 0.5809399485588074, + "learning_rate": 7.373737373737374e-06, + "loss": 2.1389784812927246, + "step": 74 + }, + { + "epoch": 0.1162079510703364, + "grad_norm": 1.2094364166259766, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.9202568531036377, + "step": 76 + }, + { + "epoch": 0.11926605504587157, + "grad_norm": 0.7485645413398743, + "learning_rate": 7.77777777777778e-06, + "loss": 2.2573585510253906, + "step": 78 + }, + { + "epoch": 0.12232415902140673, + "grad_norm": 0.47476136684417725, + "learning_rate": 7.97979797979798e-06, + "loss": 1.8947498798370361, + "step": 80 + }, + { + "epoch": 0.12538226299694188, + "grad_norm": 0.24537041783332825, + "learning_rate": 8.181818181818183e-06, + "loss": 1.636450171470642, + "step": 82 + }, + { + "epoch": 0.12844036697247707, + "grad_norm": 0.4732670783996582, + "learning_rate": 8.383838383838384e-06, + "loss": 1.818341612815857, + "step": 84 + }, + { + "epoch": 0.13149847094801223, + "grad_norm": 0.37070026993751526, + "learning_rate": 8.585858585858587e-06, + "loss": 1.845613718032837, + "step": 86 + }, + { + "epoch": 0.1345565749235474, + "grad_norm": 0.3881911635398865, + "learning_rate": 8.787878787878788e-06, + "loss": 1.7559518814086914, + "step": 88 + }, + { + "epoch": 0.13761467889908258, + "grad_norm": 0.45207998156547546, + "learning_rate": 8.98989898989899e-06, + "loss": 1.7992792129516602, + "step": 90 + }, + { + "epoch": 0.14067278287461774, + "grad_norm": 0.1907433420419693, + "learning_rate": 9.191919191919193e-06, + "loss": 1.8380980491638184, + "step": 92 + }, + { + "epoch": 0.1437308868501529, + "grad_norm": 0.2265041321516037, + "learning_rate": 9.393939393939396e-06, + "loss": 1.9353697299957275, + "step": 94 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 0.5571039319038391, + "learning_rate": 9.595959595959597e-06, + "loss": 1.861445665359497, + "step": 96 + }, + { + "epoch": 0.14984709480122324, + "grad_norm": 0.318570613861084, + "learning_rate": 9.797979797979798e-06, + "loss": 1.7963485717773438, + "step": 98 + }, + { + "epoch": 0.1529051987767584, + "grad_norm": 0.35685858130455017, + "learning_rate": 1e-05, + "loss": 1.955026626586914, + "step": 100 + }, + { + "epoch": 0.1559633027522936, + "grad_norm": 0.7966809272766113, + "learning_rate": 9.99997440729838e-06, + "loss": 1.8856327533721924, + "step": 102 + }, + { + "epoch": 0.15902140672782875, + "grad_norm": 0.2650541663169861, + "learning_rate": 9.999897629484621e-06, + "loss": 1.814586877822876, + "step": 104 + }, + { + "epoch": 0.1620795107033639, + "grad_norm": 0.36088353395462036, + "learning_rate": 9.999769667432037e-06, + "loss": 1.8607715368270874, + "step": 106 + }, + { + "epoch": 0.1651376146788991, + "grad_norm": 0.6270299553871155, + "learning_rate": 9.999590522596136e-06, + "loss": 1.9078267812728882, + "step": 108 + }, + { + "epoch": 0.16819571865443425, + "grad_norm": 0.27504709362983704, + "learning_rate": 9.999360197014607e-06, + "loss": 1.9029535055160522, + "step": 110 + }, + { + "epoch": 0.1712538226299694, + "grad_norm": 0.5007109642028809, + "learning_rate": 9.999078693307296e-06, + "loss": 1.7704020738601685, + "step": 112 + }, + { + "epoch": 0.1743119266055046, + "grad_norm": 0.5426493883132935, + "learning_rate": 9.99874601467618e-06, + "loss": 1.8907287120819092, + "step": 114 + }, + { + "epoch": 0.17737003058103976, + "grad_norm": 0.26077231764793396, + "learning_rate": 9.998362164905318e-06, + "loss": 1.760542869567871, + "step": 116 + }, + { + "epoch": 0.18042813455657492, + "grad_norm": 0.37686067819595337, + "learning_rate": 9.997927148360824e-06, + "loss": 1.995668649673462, + "step": 118 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 0.4259154498577118, + "learning_rate": 9.99744096999081e-06, + "loss": 1.8606561422348022, + "step": 120 + }, + { + "epoch": 0.18654434250764526, + "grad_norm": 0.3365345299243927, + "learning_rate": 9.996903635325326e-06, + "loss": 1.909229040145874, + "step": 122 + }, + { + "epoch": 0.18960244648318042, + "grad_norm": 0.25919589400291443, + "learning_rate": 9.996315150476308e-06, + "loss": 1.9200305938720703, + "step": 124 + }, + { + "epoch": 0.1926605504587156, + "grad_norm": 0.2932458221912384, + "learning_rate": 9.995675522137492e-06, + "loss": 1.8696832656860352, + "step": 126 + }, + { + "epoch": 0.19571865443425077, + "grad_norm": 0.38474535942077637, + "learning_rate": 9.994984757584353e-06, + "loss": 1.828667402267456, + "step": 128 + }, + { + "epoch": 0.19877675840978593, + "grad_norm": 0.3214952349662781, + "learning_rate": 9.994242864674021e-06, + "loss": 1.8718284368515015, + "step": 130 + }, + { + "epoch": 0.2018348623853211, + "grad_norm": 0.33034268021583557, + "learning_rate": 9.993449851845176e-06, + "loss": 1.8226697444915771, + "step": 132 + }, + { + "epoch": 0.20489296636085627, + "grad_norm": 0.8973183631896973, + "learning_rate": 9.992605728117972e-06, + "loss": 1.9453703165054321, + "step": 134 + }, + { + "epoch": 0.20795107033639143, + "grad_norm": 0.6750196218490601, + "learning_rate": 9.991710503093923e-06, + "loss": 1.820605993270874, + "step": 136 + }, + { + "epoch": 0.21100917431192662, + "grad_norm": 0.2680327594280243, + "learning_rate": 9.990764186955797e-06, + "loss": 1.711888074874878, + "step": 138 + }, + { + "epoch": 0.21406727828746178, + "grad_norm": 0.3089163899421692, + "learning_rate": 9.989766790467498e-06, + "loss": 1.668878197669983, + "step": 140 + }, + { + "epoch": 0.21712538226299694, + "grad_norm": 0.5638787746429443, + "learning_rate": 9.988718324973947e-06, + "loss": 1.7612136602401733, + "step": 142 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 0.24349473416805267, + "learning_rate": 9.98761880240095e-06, + "loss": 1.6873559951782227, + "step": 144 + }, + { + "epoch": 0.22324159021406728, + "grad_norm": 0.3549518585205078, + "learning_rate": 9.986468235255065e-06, + "loss": 1.743373990058899, + "step": 146 + }, + { + "epoch": 0.22629969418960244, + "grad_norm": 0.44438421726226807, + "learning_rate": 9.985266636623457e-06, + "loss": 1.6509066820144653, + "step": 148 + }, + { + "epoch": 0.22935779816513763, + "grad_norm": 0.46152663230895996, + "learning_rate": 9.984014020173748e-06, + "loss": 1.8014967441558838, + "step": 150 + }, + { + "epoch": 0.2324159021406728, + "grad_norm": 0.278169184923172, + "learning_rate": 9.98271040015387e-06, + "loss": 1.8622685670852661, + "step": 152 + }, + { + "epoch": 0.23547400611620795, + "grad_norm": 0.3168479800224304, + "learning_rate": 9.981355791391891e-06, + "loss": 1.8940097093582153, + "step": 154 + }, + { + "epoch": 0.23853211009174313, + "grad_norm": 0.3639688491821289, + "learning_rate": 9.979950209295855e-06, + "loss": 1.7917258739471436, + "step": 156 + }, + { + "epoch": 0.2415902140672783, + "grad_norm": 0.40860888361930847, + "learning_rate": 9.978493669853606e-06, + "loss": 1.8766049146652222, + "step": 158 + }, + { + "epoch": 0.24464831804281345, + "grad_norm": 0.315494179725647, + "learning_rate": 9.976986189632597e-06, + "loss": 1.7932193279266357, + "step": 160 + }, + { + "epoch": 0.24770642201834864, + "grad_norm": 0.3525390923023224, + "learning_rate": 9.975427785779717e-06, + "loss": 1.9470767974853516, + "step": 162 + }, + { + "epoch": 0.25076452599388377, + "grad_norm": 0.33575552701950073, + "learning_rate": 9.97381847602108e-06, + "loss": 1.7163609266281128, + "step": 164 + }, + { + "epoch": 0.25382262996941896, + "grad_norm": 1.193529725074768, + "learning_rate": 9.972158278661838e-06, + "loss": 1.877960205078125, + "step": 166 + }, + { + "epoch": 0.25688073394495414, + "grad_norm": 0.348765105009079, + "learning_rate": 9.970447212585961e-06, + "loss": 1.6149842739105225, + "step": 168 + }, + { + "epoch": 0.2599388379204893, + "grad_norm": 0.5527969598770142, + "learning_rate": 9.968685297256027e-06, + "loss": 1.8597733974456787, + "step": 170 + }, + { + "epoch": 0.26299694189602446, + "grad_norm": 0.656193196773529, + "learning_rate": 9.966872552713006e-06, + "loss": 1.5253994464874268, + "step": 172 + }, + { + "epoch": 0.26605504587155965, + "grad_norm": 0.7701634764671326, + "learning_rate": 9.965008999576018e-06, + "loss": 1.5178442001342773, + "step": 174 + }, + { + "epoch": 0.2691131498470948, + "grad_norm": 0.3889455795288086, + "learning_rate": 9.963094659042113e-06, + "loss": 1.7432003021240234, + "step": 176 + }, + { + "epoch": 0.27217125382262997, + "grad_norm": 0.7660208344459534, + "learning_rate": 9.961129552886024e-06, + "loss": 1.655880331993103, + "step": 178 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 0.7760636210441589, + "learning_rate": 9.959113703459917e-06, + "loss": 1.9860963821411133, + "step": 180 + }, + { + "epoch": 0.2782874617737003, + "grad_norm": 1.5110101699829102, + "learning_rate": 9.957047133693141e-06, + "loss": 1.9139325618743896, + "step": 182 + }, + { + "epoch": 0.28134556574923547, + "grad_norm": 1.1153804063796997, + "learning_rate": 9.954929867091961e-06, + "loss": 1.7500460147857666, + "step": 184 + }, + { + "epoch": 0.28440366972477066, + "grad_norm": 0.3268054723739624, + "learning_rate": 9.952761927739303e-06, + "loss": 1.5284479856491089, + "step": 186 + }, + { + "epoch": 0.2874617737003058, + "grad_norm": 0.2701658308506012, + "learning_rate": 9.95054334029446e-06, + "loss": 1.5575287342071533, + "step": 188 + }, + { + "epoch": 0.290519877675841, + "grad_norm": 0.5897979140281677, + "learning_rate": 9.948274129992838e-06, + "loss": 1.5360642671585083, + "step": 190 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 3.0125443935394287, + "learning_rate": 9.945954322645643e-06, + "loss": 1.7250124216079712, + "step": 192 + }, + { + "epoch": 0.2966360856269113, + "grad_norm": 0.22849687933921814, + "learning_rate": 9.9435839446396e-06, + "loss": 1.7317864894866943, + "step": 194 + }, + { + "epoch": 0.2996941896024465, + "grad_norm": 0.41497474908828735, + "learning_rate": 9.941163022936659e-06, + "loss": 1.7118513584136963, + "step": 196 + }, + { + "epoch": 0.30275229357798167, + "grad_norm": 0.43153518438339233, + "learning_rate": 9.938691585073677e-06, + "loss": 1.4813673496246338, + "step": 198 + }, + { + "epoch": 0.3058103975535168, + "grad_norm": 0.2877158522605896, + "learning_rate": 9.936169659162105e-06, + "loss": 1.5152385234832764, + "step": 200 + }, + { + "epoch": 0.308868501529052, + "grad_norm": 0.319741427898407, + "learning_rate": 9.933597273887676e-06, + "loss": 1.657623291015625, + "step": 202 + }, + { + "epoch": 0.3119266055045872, + "grad_norm": 0.4885481894016266, + "learning_rate": 9.930974458510074e-06, + "loss": 1.8340609073638916, + "step": 204 + }, + { + "epoch": 0.3149847094801223, + "grad_norm": 0.3470771312713623, + "learning_rate": 9.9283012428626e-06, + "loss": 1.8779006004333496, + "step": 206 + }, + { + "epoch": 0.3180428134556575, + "grad_norm": 0.21095849573612213, + "learning_rate": 9.92557765735184e-06, + "loss": 1.946405053138733, + "step": 208 + }, + { + "epoch": 0.3211009174311927, + "grad_norm": 0.4015672504901886, + "learning_rate": 9.922803732957309e-06, + "loss": 1.5457347631454468, + "step": 210 + }, + { + "epoch": 0.3241590214067278, + "grad_norm": 0.2712498903274536, + "learning_rate": 9.919979501231102e-06, + "loss": 1.6519064903259277, + "step": 212 + }, + { + "epoch": 0.327217125382263, + "grad_norm": 0.24934278428554535, + "learning_rate": 9.917104994297543e-06, + "loss": 1.4617292881011963, + "step": 214 + }, + { + "epoch": 0.3302752293577982, + "grad_norm": 0.22483140230178833, + "learning_rate": 9.914180244852804e-06, + "loss": 1.3875129222869873, + "step": 216 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.6217460632324219, + "learning_rate": 9.911205286164553e-06, + "loss": 1.8669204711914062, + "step": 218 + }, + { + "epoch": 0.3363914373088685, + "grad_norm": 0.4357741177082062, + "learning_rate": 9.908180152071553e-06, + "loss": 1.666574239730835, + "step": 220 + }, + { + "epoch": 0.3394495412844037, + "grad_norm": 0.29025763273239136, + "learning_rate": 9.9051048769833e-06, + "loss": 1.810868263244629, + "step": 222 + }, + { + "epoch": 0.3425076452599388, + "grad_norm": 0.7838276624679565, + "learning_rate": 9.901979495879612e-06, + "loss": 1.3125014305114746, + "step": 224 + }, + { + "epoch": 0.345565749235474, + "grad_norm": 0.2543538212776184, + "learning_rate": 9.898804044310245e-06, + "loss": 1.6106175184249878, + "step": 226 + }, + { + "epoch": 0.3486238532110092, + "grad_norm": 0.4557286500930786, + "learning_rate": 9.89557855839448e-06, + "loss": 1.886078953742981, + "step": 228 + }, + { + "epoch": 0.3516819571865443, + "grad_norm": 0.2689090073108673, + "learning_rate": 9.892303074820712e-06, + "loss": 1.631593108177185, + "step": 230 + }, + { + "epoch": 0.3547400611620795, + "grad_norm": 0.25291207432746887, + "learning_rate": 9.888977630846048e-06, + "loss": 1.7156798839569092, + "step": 232 + }, + { + "epoch": 0.3577981651376147, + "grad_norm": 0.3357708752155304, + "learning_rate": 9.88560226429586e-06, + "loss": 1.6416988372802734, + "step": 234 + }, + { + "epoch": 0.36085626911314983, + "grad_norm": 0.3246925473213196, + "learning_rate": 9.88217701356337e-06, + "loss": 1.5658977031707764, + "step": 236 + }, + { + "epoch": 0.363914373088685, + "grad_norm": 0.2840614318847656, + "learning_rate": 9.878701917609208e-06, + "loss": 1.6534138917922974, + "step": 238 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 0.5397573709487915, + "learning_rate": 9.875177015960973e-06, + "loss": 1.7614964246749878, + "step": 240 + }, + { + "epoch": 0.37003058103975534, + "grad_norm": 0.28763291239738464, + "learning_rate": 9.871602348712777e-06, + "loss": 1.5937902927398682, + "step": 242 + }, + { + "epoch": 0.3730886850152905, + "grad_norm": 0.21111302077770233, + "learning_rate": 9.867977956524798e-06, + "loss": 1.6914631128311157, + "step": 244 + }, + { + "epoch": 0.3761467889908257, + "grad_norm": 0.5114771723747253, + "learning_rate": 9.864303880622806e-06, + "loss": 1.8919175863265991, + "step": 246 + }, + { + "epoch": 0.37920489296636084, + "grad_norm": 0.4698966145515442, + "learning_rate": 9.8605801627977e-06, + "loss": 2.395404815673828, + "step": 248 + }, + { + "epoch": 0.382262996941896, + "grad_norm": 0.604468047618866, + "learning_rate": 9.85680684540504e-06, + "loss": 1.523594617843628, + "step": 250 + }, + { + "epoch": 0.3853211009174312, + "grad_norm": 0.295039564371109, + "learning_rate": 9.852983971364549e-06, + "loss": 1.520268440246582, + "step": 252 + }, + { + "epoch": 0.38837920489296635, + "grad_norm": 0.2590586245059967, + "learning_rate": 9.84911158415964e-06, + "loss": 1.5712318420410156, + "step": 254 + }, + { + "epoch": 0.39143730886850153, + "grad_norm": 0.9178432822227478, + "learning_rate": 9.845189727836914e-06, + "loss": 1.7512378692626953, + "step": 256 + }, + { + "epoch": 0.3944954128440367, + "grad_norm": 0.512359619140625, + "learning_rate": 9.841218447005657e-06, + "loss": 1.677209496498108, + "step": 258 + }, + { + "epoch": 0.39755351681957185, + "grad_norm": 0.8242136240005493, + "learning_rate": 9.837197786837341e-06, + "loss": 1.52079439163208, + "step": 260 + }, + { + "epoch": 0.40061162079510704, + "grad_norm": 0.5057528614997864, + "learning_rate": 9.833127793065098e-06, + "loss": 1.3776154518127441, + "step": 262 + }, + { + "epoch": 0.4036697247706422, + "grad_norm": 0.287590891122818, + "learning_rate": 9.829008511983214e-06, + "loss": 1.313464879989624, + "step": 264 + }, + { + "epoch": 0.40672782874617736, + "grad_norm": 0.22291725873947144, + "learning_rate": 9.82483999044659e-06, + "loss": 1.4770923852920532, + "step": 266 + }, + { + "epoch": 0.40978593272171254, + "grad_norm": 0.4278978109359741, + "learning_rate": 9.820622275870219e-06, + "loss": 1.713256597518921, + "step": 268 + }, + { + "epoch": 0.41284403669724773, + "grad_norm": 0.7735996246337891, + "learning_rate": 9.816355416228636e-06, + "loss": 1.7301435470581055, + "step": 270 + }, + { + "epoch": 0.41590214067278286, + "grad_norm": 0.36943763494491577, + "learning_rate": 9.812039460055383e-06, + "loss": 1.746875286102295, + "step": 272 + }, + { + "epoch": 0.41896024464831805, + "grad_norm": 0.30427658557891846, + "learning_rate": 9.807674456442448e-06, + "loss": 1.7644126415252686, + "step": 274 + }, + { + "epoch": 0.42201834862385323, + "grad_norm": 0.2680354416370392, + "learning_rate": 9.80326045503972e-06, + "loss": 1.6075056791305542, + "step": 276 + }, + { + "epoch": 0.42507645259938837, + "grad_norm": 0.5165081024169922, + "learning_rate": 9.798797506054398e-06, + "loss": 1.7466685771942139, + "step": 278 + }, + { + "epoch": 0.42813455657492355, + "grad_norm": 0.46960580348968506, + "learning_rate": 9.794285660250457e-06, + "loss": 1.6852364540100098, + "step": 280 + }, + { + "epoch": 0.43119266055045874, + "grad_norm": 0.3378291130065918, + "learning_rate": 9.789724968948034e-06, + "loss": 1.5493333339691162, + "step": 282 + }, + { + "epoch": 0.43425076452599387, + "grad_norm": 0.2972247004508972, + "learning_rate": 9.78511548402287e-06, + "loss": 1.5161151885986328, + "step": 284 + }, + { + "epoch": 0.43730886850152906, + "grad_norm": 0.3610173165798187, + "learning_rate": 9.780457257905708e-06, + "loss": 1.698796272277832, + "step": 286 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 0.4165475070476532, + "learning_rate": 9.775750343581702e-06, + "loss": 1.4344041347503662, + "step": 288 + }, + { + "epoch": 0.4434250764525994, + "grad_norm": 0.565291702747345, + "learning_rate": 9.770994794589804e-06, + "loss": 1.6736053228378296, + "step": 290 + }, + { + "epoch": 0.44648318042813456, + "grad_norm": 0.22272102534770966, + "learning_rate": 9.766190665022173e-06, + "loss": 1.515446424484253, + "step": 292 + }, + { + "epoch": 0.44954128440366975, + "grad_norm": 0.292961061000824, + "learning_rate": 9.761338009523542e-06, + "loss": 1.5677558183670044, + "step": 294 + }, + { + "epoch": 0.4525993883792049, + "grad_norm": 0.22576913237571716, + "learning_rate": 9.756436883290608e-06, + "loss": 1.6895636320114136, + "step": 296 + }, + { + "epoch": 0.45565749235474007, + "grad_norm": 0.514447808265686, + "learning_rate": 9.751487342071394e-06, + "loss": 1.6961359977722168, + "step": 298 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 0.4707038402557373, + "learning_rate": 9.74648944216463e-06, + "loss": 1.5364969968795776, + "step": 300 + }, + { + "epoch": 0.4617737003058104, + "grad_norm": 0.3324492871761322, + "learning_rate": 9.741443240419096e-06, + "loss": 1.4445494413375854, + "step": 302 + }, + { + "epoch": 0.4648318042813456, + "grad_norm": 0.40139055252075195, + "learning_rate": 9.736348794232986e-06, + "loss": 1.631695032119751, + "step": 304 + }, + { + "epoch": 0.46788990825688076, + "grad_norm": 0.32826143503189087, + "learning_rate": 9.731206161553253e-06, + "loss": 1.5630545616149902, + "step": 306 + }, + { + "epoch": 0.4709480122324159, + "grad_norm": 0.7137564420700073, + "learning_rate": 9.726015400874945e-06, + "loss": 1.7077264785766602, + "step": 308 + }, + { + "epoch": 0.4740061162079511, + "grad_norm": 0.5834897756576538, + "learning_rate": 9.72077657124055e-06, + "loss": 1.541429877281189, + "step": 310 + }, + { + "epoch": 0.47706422018348627, + "grad_norm": 0.30517715215682983, + "learning_rate": 9.715489732239309e-06, + "loss": 1.486952781677246, + "step": 312 + }, + { + "epoch": 0.4801223241590214, + "grad_norm": 0.39915895462036133, + "learning_rate": 9.710154944006558e-06, + "loss": 1.4761033058166504, + "step": 314 + }, + { + "epoch": 0.4831804281345566, + "grad_norm": 0.24902665615081787, + "learning_rate": 9.70477226722302e-06, + "loss": 1.555905818939209, + "step": 316 + }, + { + "epoch": 0.48623853211009177, + "grad_norm": 0.27528202533721924, + "learning_rate": 9.699341763114142e-06, + "loss": 1.5418330430984497, + "step": 318 + }, + { + "epoch": 0.4892966360856269, + "grad_norm": 0.37373027205467224, + "learning_rate": 9.693863493449376e-06, + "loss": 1.5460388660430908, + "step": 320 + }, + { + "epoch": 0.4923547400611621, + "grad_norm": 0.3926723301410675, + "learning_rate": 9.688337520541487e-06, + "loss": 1.7003178596496582, + "step": 322 + }, + { + "epoch": 0.4954128440366973, + "grad_norm": 0.2708083987236023, + "learning_rate": 9.68276390724584e-06, + "loss": 1.8639323711395264, + "step": 324 + }, + { + "epoch": 0.4984709480122324, + "grad_norm": 0.3522673547267914, + "learning_rate": 9.67714271695969e-06, + "loss": 1.7603111267089844, + "step": 326 + }, + { + "epoch": 0.5015290519877675, + "grad_norm": 0.2736775279045105, + "learning_rate": 9.671474013621461e-06, + "loss": 1.7426960468292236, + "step": 328 + }, + { + "epoch": 0.5045871559633027, + "grad_norm": 0.34006989002227783, + "learning_rate": 9.665757861710008e-06, + "loss": 1.6802008152008057, + "step": 330 + }, + { + "epoch": 0.5076452599388379, + "grad_norm": 0.7181631922721863, + "learning_rate": 9.659994326243897e-06, + "loss": 1.3610038757324219, + "step": 332 + }, + { + "epoch": 0.5107033639143731, + "grad_norm": 0.3209435045719147, + "learning_rate": 9.654183472780655e-06, + "loss": 1.3310749530792236, + "step": 334 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 0.3394523561000824, + "learning_rate": 9.64832536741604e-06, + "loss": 1.7552449703216553, + "step": 336 + }, + { + "epoch": 0.5168195718654435, + "grad_norm": 0.26636433601379395, + "learning_rate": 9.642420076783266e-06, + "loss": 1.7648036479949951, + "step": 338 + }, + { + "epoch": 0.5198776758409785, + "grad_norm": 0.4860476553440094, + "learning_rate": 9.636467668052263e-06, + "loss": 1.8371148109436035, + "step": 340 + }, + { + "epoch": 0.5229357798165137, + "grad_norm": 0.3957999050617218, + "learning_rate": 9.630468208928906e-06, + "loss": 1.7691468000411987, + "step": 342 + }, + { + "epoch": 0.5259938837920489, + "grad_norm": 0.29553869366645813, + "learning_rate": 9.624421767654247e-06, + "loss": 1.8050150871276855, + "step": 344 + }, + { + "epoch": 0.5290519877675841, + "grad_norm": 0.8523488640785217, + "learning_rate": 9.618328413003742e-06, + "loss": 1.7548258304595947, + "step": 346 + }, + { + "epoch": 0.5321100917431193, + "grad_norm": 0.30288758873939514, + "learning_rate": 9.612188214286457e-06, + "loss": 1.652245044708252, + "step": 348 + }, + { + "epoch": 0.5351681957186545, + "grad_norm": 0.44331154227256775, + "learning_rate": 9.606001241344293e-06, + "loss": 1.5749201774597168, + "step": 350 + }, + { + "epoch": 0.5382262996941896, + "grad_norm": 0.3775594234466553, + "learning_rate": 9.599767564551185e-06, + "loss": 1.8136138916015625, + "step": 352 + }, + { + "epoch": 0.5412844036697247, + "grad_norm": 0.6260164976119995, + "learning_rate": 9.593487254812298e-06, + "loss": 1.753260850906372, + "step": 354 + }, + { + "epoch": 0.5443425076452599, + "grad_norm": 0.21940867602825165, + "learning_rate": 9.587160383563235e-06, + "loss": 1.2595834732055664, + "step": 356 + }, + { + "epoch": 0.5474006116207951, + "grad_norm": 0.45921286940574646, + "learning_rate": 9.580787022769205e-06, + "loss": 1.8687834739685059, + "step": 358 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 0.25323811173439026, + "learning_rate": 9.574367244924216e-06, + "loss": 1.87260901927948, + "step": 360 + }, + { + "epoch": 0.5535168195718655, + "grad_norm": 0.3825606405735016, + "learning_rate": 9.567901123050255e-06, + "loss": 1.9380344152450562, + "step": 362 + }, + { + "epoch": 0.5565749235474006, + "grad_norm": 0.8433843851089478, + "learning_rate": 9.56138873069644e-06, + "loss": 1.854411005973816, + "step": 364 + }, + { + "epoch": 0.5596330275229358, + "grad_norm": 0.5623306035995483, + "learning_rate": 9.554830141938201e-06, + "loss": 1.8307363986968994, + "step": 366 + }, + { + "epoch": 0.5626911314984709, + "grad_norm": 0.5833460688591003, + "learning_rate": 9.54822543137643e-06, + "loss": 1.691839575767517, + "step": 368 + }, + { + "epoch": 0.5657492354740061, + "grad_norm": 0.7582941651344299, + "learning_rate": 9.541574674136634e-06, + "loss": 1.5816738605499268, + "step": 370 + }, + { + "epoch": 0.5688073394495413, + "grad_norm": 0.5991274118423462, + "learning_rate": 9.534877945868075e-06, + "loss": 1.141850471496582, + "step": 372 + }, + { + "epoch": 0.5718654434250765, + "grad_norm": 0.27493157982826233, + "learning_rate": 9.528135322742916e-06, + "loss": 1.1190171241760254, + "step": 374 + }, + { + "epoch": 0.5749235474006116, + "grad_norm": 0.20014670491218567, + "learning_rate": 9.521346881455356e-06, + "loss": 1.4172542095184326, + "step": 376 + }, + { + "epoch": 0.5779816513761468, + "grad_norm": 0.45737189054489136, + "learning_rate": 9.514512699220751e-06, + "loss": 1.3267741203308105, + "step": 378 + }, + { + "epoch": 0.581039755351682, + "grad_norm": 0.342574805021286, + "learning_rate": 9.507632853774738e-06, + "loss": 1.2848198413848877, + "step": 380 + }, + { + "epoch": 0.5840978593272171, + "grad_norm": 0.2764483690261841, + "learning_rate": 9.500707423372354e-06, + "loss": 1.2696105241775513, + "step": 382 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 0.5538342595100403, + "learning_rate": 9.493736486787145e-06, + "loss": 1.5733320713043213, + "step": 384 + }, + { + "epoch": 0.5902140672782875, + "grad_norm": 0.5002435445785522, + "learning_rate": 9.486720123310264e-06, + "loss": 1.4811735153198242, + "step": 386 + }, + { + "epoch": 0.5932721712538226, + "grad_norm": 0.2729179561138153, + "learning_rate": 9.479658412749575e-06, + "loss": 1.2759473323822021, + "step": 388 + }, + { + "epoch": 0.5963302752293578, + "grad_norm": 0.422869473695755, + "learning_rate": 9.472551435428751e-06, + "loss": 1.6186537742614746, + "step": 390 + }, + { + "epoch": 0.599388379204893, + "grad_norm": 0.18889868259429932, + "learning_rate": 9.465399272186341e-06, + "loss": 1.5904256105422974, + "step": 392 + }, + { + "epoch": 0.6024464831804281, + "grad_norm": 0.4715130925178528, + "learning_rate": 9.458202004374875e-06, + "loss": 1.3664047718048096, + "step": 394 + }, + { + "epoch": 0.6055045871559633, + "grad_norm": 0.3192538321018219, + "learning_rate": 9.450959713859918e-06, + "loss": 1.5540097951889038, + "step": 396 + }, + { + "epoch": 0.6085626911314985, + "grad_norm": 0.48479557037353516, + "learning_rate": 9.443672483019146e-06, + "loss": 1.7298085689544678, + "step": 398 + }, + { + "epoch": 0.6116207951070336, + "grad_norm": 0.40212106704711914, + "learning_rate": 9.436340394741424e-06, + "loss": 1.2515219449996948, + "step": 400 + }, + { + "epoch": 0.6146788990825688, + "grad_norm": 0.31416311860084534, + "learning_rate": 9.428963532425832e-06, + "loss": 1.5272061824798584, + "step": 402 + }, + { + "epoch": 0.617737003058104, + "grad_norm": 0.39595550298690796, + "learning_rate": 9.421541979980743e-06, + "loss": 1.584099531173706, + "step": 404 + }, + { + "epoch": 0.6207951070336392, + "grad_norm": 0.3684428632259369, + "learning_rate": 9.414075821822862e-06, + "loss": 1.5516374111175537, + "step": 406 + }, + { + "epoch": 0.6238532110091743, + "grad_norm": 0.2936325669288635, + "learning_rate": 9.406565142876252e-06, + "loss": 1.3937046527862549, + "step": 408 + }, + { + "epoch": 0.6269113149847095, + "grad_norm": 0.8210769295692444, + "learning_rate": 9.399010028571394e-06, + "loss": 1.0384480953216553, + "step": 410 + }, + { + "epoch": 0.6299694189602446, + "grad_norm": 0.31836938858032227, + "learning_rate": 9.391410564844189e-06, + "loss": 1.6605589389801025, + "step": 412 + }, + { + "epoch": 0.6330275229357798, + "grad_norm": 0.4151877164840698, + "learning_rate": 9.383766838134997e-06, + "loss": 1.5902981758117676, + "step": 414 + }, + { + "epoch": 0.636085626911315, + "grad_norm": 0.29467517137527466, + "learning_rate": 9.376078935387647e-06, + "loss": 1.511544942855835, + "step": 416 + }, + { + "epoch": 0.6391437308868502, + "grad_norm": 0.4552344083786011, + "learning_rate": 9.36834694404845e-06, + "loss": 1.6092697381973267, + "step": 418 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 0.3086092174053192, + "learning_rate": 9.360570952065205e-06, + "loss": 1.5458872318267822, + "step": 420 + }, + { + "epoch": 0.6452599388379205, + "grad_norm": 0.29464077949523926, + "learning_rate": 9.3527510478862e-06, + "loss": 1.5201151371002197, + "step": 422 + }, + { + "epoch": 0.6483180428134556, + "grad_norm": 0.35874319076538086, + "learning_rate": 9.3448873204592e-06, + "loss": 1.7184113264083862, + "step": 424 + }, + { + "epoch": 0.6513761467889908, + "grad_norm": 0.6177545189857483, + "learning_rate": 9.336979859230438e-06, + "loss": 1.425230860710144, + "step": 426 + }, + { + "epoch": 0.654434250764526, + "grad_norm": 0.4207315742969513, + "learning_rate": 9.329028754143606e-06, + "loss": 1.1580491065979004, + "step": 428 + }, + { + "epoch": 0.6574923547400612, + "grad_norm": 0.40215086936950684, + "learning_rate": 9.321034095638816e-06, + "loss": 1.776092767715454, + "step": 430 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 0.48207205533981323, + "learning_rate": 9.312995974651581e-06, + "loss": 1.5432982444763184, + "step": 432 + }, + { + "epoch": 0.6636085626911316, + "grad_norm": 0.9188543558120728, + "learning_rate": 9.304914482611788e-06, + "loss": 1.6913204193115234, + "step": 434 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.0712273120880127, + "learning_rate": 9.296789711442641e-06, + "loss": 1.5286757946014404, + "step": 436 + }, + { + "epoch": 0.6697247706422018, + "grad_norm": 0.4487042725086212, + "learning_rate": 9.288621753559624e-06, + "loss": 1.7271997928619385, + "step": 438 + }, + { + "epoch": 0.672782874617737, + "grad_norm": 0.4550405442714691, + "learning_rate": 9.280410701869456e-06, + "loss": 1.5852614641189575, + "step": 440 + }, + { + "epoch": 0.6758409785932722, + "grad_norm": 0.8099808692932129, + "learning_rate": 9.27215664976902e-06, + "loss": 1.6332128047943115, + "step": 442 + }, + { + "epoch": 0.6788990825688074, + "grad_norm": 0.5566719174385071, + "learning_rate": 9.263859691144315e-06, + "loss": 1.5285072326660156, + "step": 444 + }, + { + "epoch": 0.6819571865443425, + "grad_norm": 0.3996361196041107, + "learning_rate": 9.25551992036938e-06, + "loss": 1.181262731552124, + "step": 446 + }, + { + "epoch": 0.6850152905198776, + "grad_norm": 0.7320879697799683, + "learning_rate": 9.247137432305221e-06, + "loss": 1.6381134986877441, + "step": 448 + }, + { + "epoch": 0.6880733944954128, + "grad_norm": 0.5473281741142273, + "learning_rate": 9.238712322298733e-06, + "loss": 1.623387098312378, + "step": 450 + }, + { + "epoch": 0.691131498470948, + "grad_norm": 0.2673215866088867, + "learning_rate": 9.230244686181616e-06, + "loss": 1.6147091388702393, + "step": 452 + }, + { + "epoch": 0.6941896024464832, + "grad_norm": 0.41044941544532776, + "learning_rate": 9.22173462026929e-06, + "loss": 1.6174466609954834, + "step": 454 + }, + { + "epoch": 0.6972477064220184, + "grad_norm": 0.3210803270339966, + "learning_rate": 9.213182221359785e-06, + "loss": 1.4634352922439575, + "step": 456 + }, + { + "epoch": 0.7003058103975535, + "grad_norm": 0.4366549551486969, + "learning_rate": 9.204587586732653e-06, + "loss": 1.6598728895187378, + "step": 458 + }, + { + "epoch": 0.7033639143730887, + "grad_norm": 0.6817240118980408, + "learning_rate": 9.195950814147862e-06, + "loss": 1.7457971572875977, + "step": 460 + }, + { + "epoch": 0.7064220183486238, + "grad_norm": 1.429196834564209, + "learning_rate": 9.187272001844673e-06, + "loss": 1.4895765781402588, + "step": 462 + }, + { + "epoch": 0.709480122324159, + "grad_norm": 0.33415424823760986, + "learning_rate": 9.178551248540534e-06, + "loss": 1.7249622344970703, + "step": 464 + }, + { + "epoch": 0.7125382262996942, + "grad_norm": 0.5185303092002869, + "learning_rate": 9.169788653429949e-06, + "loss": 1.5071038007736206, + "step": 466 + }, + { + "epoch": 0.7155963302752294, + "grad_norm": 0.703040599822998, + "learning_rate": 9.160984316183354e-06, + "loss": 1.6332056522369385, + "step": 468 + }, + { + "epoch": 0.7186544342507645, + "grad_norm": 0.2760729491710663, + "learning_rate": 9.152138336945985e-06, + "loss": 1.5567004680633545, + "step": 470 + }, + { + "epoch": 0.7217125382262997, + "grad_norm": 0.26987555623054504, + "learning_rate": 9.143250816336733e-06, + "loss": 1.6896016597747803, + "step": 472 + }, + { + "epoch": 0.7247706422018348, + "grad_norm": 0.4577353894710541, + "learning_rate": 9.134321855447004e-06, + "loss": 1.780794620513916, + "step": 474 + }, + { + "epoch": 0.72782874617737, + "grad_norm": 0.3506152629852295, + "learning_rate": 9.125351555839568e-06, + "loss": 1.676330327987671, + "step": 476 + }, + { + "epoch": 0.7308868501529052, + "grad_norm": 0.3420753479003906, + "learning_rate": 9.116340019547403e-06, + "loss": 1.53602933883667, + "step": 478 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 0.615734875202179, + "learning_rate": 9.107287349072535e-06, + "loss": 1.6315178871154785, + "step": 480 + }, + { + "epoch": 0.7370030581039755, + "grad_norm": 0.3383826017379761, + "learning_rate": 9.098193647384872e-06, + "loss": 1.646344542503357, + "step": 482 + }, + { + "epoch": 0.7400611620795107, + "grad_norm": 0.40700384974479675, + "learning_rate": 9.089059017921034e-06, + "loss": 1.6499868631362915, + "step": 484 + }, + { + "epoch": 0.7431192660550459, + "grad_norm": 0.4302765727043152, + "learning_rate": 9.079883564583176e-06, + "loss": 1.6223028898239136, + "step": 486 + }, + { + "epoch": 0.746177370030581, + "grad_norm": 0.2995837330818176, + "learning_rate": 9.070667391737804e-06, + "loss": 1.639768123626709, + "step": 488 + }, + { + "epoch": 0.7492354740061162, + "grad_norm": 0.3183751702308655, + "learning_rate": 9.061410604214588e-06, + "loss": 1.4172444343566895, + "step": 490 + }, + { + "epoch": 0.7522935779816514, + "grad_norm": 0.41883519291877747, + "learning_rate": 9.052113307305178e-06, + "loss": 1.5172092914581299, + "step": 492 + }, + { + "epoch": 0.7553516819571865, + "grad_norm": 0.4170067310333252, + "learning_rate": 9.04277560676199e-06, + "loss": 1.4581788778305054, + "step": 494 + }, + { + "epoch": 0.7584097859327217, + "grad_norm": 0.4589844346046448, + "learning_rate": 9.033397608797015e-06, + "loss": 1.5675625801086426, + "step": 496 + }, + { + "epoch": 0.7614678899082569, + "grad_norm": 0.4775915741920471, + "learning_rate": 9.023979420080614e-06, + "loss": 1.5760972499847412, + "step": 498 + }, + { + "epoch": 0.764525993883792, + "grad_norm": 0.4255703389644623, + "learning_rate": 9.014521147740295e-06, + "loss": 1.4211878776550293, + "step": 500 + }, + { + "epoch": 0.7675840978593272, + "grad_norm": 0.2350740283727646, + "learning_rate": 9.005022899359498e-06, + "loss": 1.0600173473358154, + "step": 502 + }, + { + "epoch": 0.7706422018348624, + "grad_norm": 0.25523892045021057, + "learning_rate": 8.995484782976372e-06, + "loss": 1.3498680591583252, + "step": 504 + }, + { + "epoch": 0.7737003058103975, + "grad_norm": 0.25793585181236267, + "learning_rate": 8.985906907082548e-06, + "loss": 1.4128957986831665, + "step": 506 + }, + { + "epoch": 0.7767584097859327, + "grad_norm": 0.2672351002693176, + "learning_rate": 8.9762893806219e-06, + "loss": 1.4579813480377197, + "step": 508 + }, + { + "epoch": 0.7798165137614679, + "grad_norm": 0.3467871844768524, + "learning_rate": 8.96663231298931e-06, + "loss": 1.469613790512085, + "step": 510 + }, + { + "epoch": 0.7828746177370031, + "grad_norm": 0.2631012797355652, + "learning_rate": 8.956935814029426e-06, + "loss": 1.5352952480316162, + "step": 512 + }, + { + "epoch": 0.7859327217125383, + "grad_norm": 0.42967817187309265, + "learning_rate": 8.947199994035402e-06, + "loss": 1.448859691619873, + "step": 514 + }, + { + "epoch": 0.7889908256880734, + "grad_norm": 0.18720397353172302, + "learning_rate": 8.937424963747656e-06, + "loss": 1.4682276248931885, + "step": 516 + }, + { + "epoch": 0.7920489296636085, + "grad_norm": 0.2571136951446533, + "learning_rate": 8.9276108343526e-06, + "loss": 1.430220365524292, + "step": 518 + }, + { + "epoch": 0.7951070336391437, + "grad_norm": 0.49666231870651245, + "learning_rate": 8.917757717481388e-06, + "loss": 1.4388704299926758, + "step": 520 + }, + { + "epoch": 0.7981651376146789, + "grad_norm": 0.18454308807849884, + "learning_rate": 8.90786572520863e-06, + "loss": 1.3887765407562256, + "step": 522 + }, + { + "epoch": 0.8012232415902141, + "grad_norm": 0.19775497913360596, + "learning_rate": 8.897934970051128e-06, + "loss": 1.4397857189178467, + "step": 524 + }, + { + "epoch": 0.8042813455657493, + "grad_norm": 0.24946311116218567, + "learning_rate": 8.8879655649666e-06, + "loss": 1.3772547245025635, + "step": 526 + }, + { + "epoch": 0.8073394495412844, + "grad_norm": 0.1347188949584961, + "learning_rate": 8.877957623352376e-06, + "loss": 1.2148081064224243, + "step": 528 + }, + { + "epoch": 0.8103975535168195, + "grad_norm": 0.17375752329826355, + "learning_rate": 8.867911259044134e-06, + "loss": 1.2351716756820679, + "step": 530 + }, + { + "epoch": 0.8134556574923547, + "grad_norm": 0.12528319656848907, + "learning_rate": 8.857826586314586e-06, + "loss": 1.0168347358703613, + "step": 532 + }, + { + "epoch": 0.8165137614678899, + "grad_norm": 0.22279202938079834, + "learning_rate": 8.847703719872184e-06, + "loss": 1.3256959915161133, + "step": 534 + }, + { + "epoch": 0.8195718654434251, + "grad_norm": 0.22974777221679688, + "learning_rate": 8.837542774859819e-06, + "loss": 1.3868855237960815, + "step": 536 + }, + { + "epoch": 0.8226299694189603, + "grad_norm": 0.2833384871482849, + "learning_rate": 8.827343866853505e-06, + "loss": 1.4037737846374512, + "step": 538 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 0.20462170243263245, + "learning_rate": 8.817107111861068e-06, + "loss": 1.3688358068466187, + "step": 540 + }, + { + "epoch": 0.8287461773700305, + "grad_norm": 0.21328498423099518, + "learning_rate": 8.806832626320828e-06, + "loss": 1.3812446594238281, + "step": 542 + }, + { + "epoch": 0.8318042813455657, + "grad_norm": 0.2749079465866089, + "learning_rate": 8.796520527100268e-06, + "loss": 1.3695695400238037, + "step": 544 + }, + { + "epoch": 0.8348623853211009, + "grad_norm": 0.17869983613491058, + "learning_rate": 8.786170931494714e-06, + "loss": 1.3381950855255127, + "step": 546 + }, + { + "epoch": 0.8379204892966361, + "grad_norm": 0.23981167376041412, + "learning_rate": 8.775783957225991e-06, + "loss": 1.409177541732788, + "step": 548 + }, + { + "epoch": 0.8409785932721713, + "grad_norm": 0.4634632170200348, + "learning_rate": 8.765359722441096e-06, + "loss": 1.3826044797897339, + "step": 550 + }, + { + "epoch": 0.8440366972477065, + "grad_norm": 0.19470739364624023, + "learning_rate": 8.754898345710839e-06, + "loss": 1.3529078960418701, + "step": 552 + }, + { + "epoch": 0.8470948012232415, + "grad_norm": 0.21753935515880585, + "learning_rate": 8.744399946028506e-06, + "loss": 1.3324353694915771, + "step": 554 + }, + { + "epoch": 0.8501529051987767, + "grad_norm": 0.24797090888023376, + "learning_rate": 8.733864642808505e-06, + "loss": 1.3469841480255127, + "step": 556 + }, + { + "epoch": 0.8532110091743119, + "grad_norm": 0.2123066782951355, + "learning_rate": 8.723292555884997e-06, + "loss": 1.343614101409912, + "step": 558 + }, + { + "epoch": 0.8562691131498471, + "grad_norm": 0.25072529911994934, + "learning_rate": 8.712683805510547e-06, + "loss": 1.305376648902893, + "step": 560 + }, + { + "epoch": 0.8593272171253823, + "grad_norm": 0.3219304382801056, + "learning_rate": 8.702038512354746e-06, + "loss": 1.3584821224212646, + "step": 562 + }, + { + "epoch": 0.8623853211009175, + "grad_norm": 0.3253892660140991, + "learning_rate": 8.691356797502846e-06, + "loss": 1.3929443359375, + "step": 564 + }, + { + "epoch": 0.8654434250764526, + "grad_norm": 0.22387385368347168, + "learning_rate": 8.680638782454373e-06, + "loss": 1.3898614645004272, + "step": 566 + }, + { + "epoch": 0.8685015290519877, + "grad_norm": 0.2767902612686157, + "learning_rate": 8.669884589121756e-06, + "loss": 1.3842121362686157, + "step": 568 + }, + { + "epoch": 0.8715596330275229, + "grad_norm": 0.2403760552406311, + "learning_rate": 8.659094339828934e-06, + "loss": 1.3873755931854248, + "step": 570 + }, + { + "epoch": 0.8746177370030581, + "grad_norm": 0.30079615116119385, + "learning_rate": 8.648268157309964e-06, + "loss": 1.3781442642211914, + "step": 572 + }, + { + "epoch": 0.8776758409785933, + "grad_norm": 0.24510778486728668, + "learning_rate": 8.637406164707628e-06, + "loss": 1.4003241062164307, + "step": 574 + }, + { + "epoch": 0.8807339449541285, + "grad_norm": 0.19053591787815094, + "learning_rate": 8.62650848557203e-06, + "loss": 1.318782091140747, + "step": 576 + }, + { + "epoch": 0.8837920489296636, + "grad_norm": 0.5118341445922852, + "learning_rate": 8.615575243859194e-06, + "loss": 1.3740344047546387, + "step": 578 + }, + { + "epoch": 0.8868501529051988, + "grad_norm": 0.2653733193874359, + "learning_rate": 8.604606563929649e-06, + "loss": 1.3240249156951904, + "step": 580 + }, + { + "epoch": 0.8899082568807339, + "grad_norm": 0.2646930515766144, + "learning_rate": 8.59360257054702e-06, + "loss": 1.3533198833465576, + "step": 582 + }, + { + "epoch": 0.8929663608562691, + "grad_norm": 0.21842285990715027, + "learning_rate": 8.582563388876602e-06, + "loss": 1.3596748113632202, + "step": 584 + }, + { + "epoch": 0.8960244648318043, + "grad_norm": 0.2090519517660141, + "learning_rate": 8.571489144483945e-06, + "loss": 1.3835537433624268, + "step": 586 + }, + { + "epoch": 0.8990825688073395, + "grad_norm": 0.2362383008003235, + "learning_rate": 8.560379963333416e-06, + "loss": 1.368111252784729, + "step": 588 + }, + { + "epoch": 0.9021406727828746, + "grad_norm": 0.4883694350719452, + "learning_rate": 8.549235971786777e-06, + "loss": 1.3067984580993652, + "step": 590 + }, + { + "epoch": 0.9051987767584098, + "grad_norm": 0.3407292366027832, + "learning_rate": 8.538057296601739e-06, + "loss": 1.3290581703186035, + "step": 592 + }, + { + "epoch": 0.908256880733945, + "grad_norm": 0.21036434173583984, + "learning_rate": 8.526844064930523e-06, + "loss": 1.3695251941680908, + "step": 594 + }, + { + "epoch": 0.9113149847094801, + "grad_norm": 0.22752052545547485, + "learning_rate": 8.515596404318415e-06, + "loss": 1.3922007083892822, + "step": 596 + }, + { + "epoch": 0.9143730886850153, + "grad_norm": 0.23141705989837646, + "learning_rate": 8.504314442702315e-06, + "loss": 1.371009111404419, + "step": 598 + }, + { + "epoch": 0.9174311926605505, + "grad_norm": 0.18458011746406555, + "learning_rate": 8.492998308409275e-06, + "loss": 1.3468807935714722, + "step": 600 + }, + { + "epoch": 0.9204892966360856, + "grad_norm": 0.2277638018131256, + "learning_rate": 8.481648130155054e-06, + "loss": 1.3067777156829834, + "step": 602 + }, + { + "epoch": 0.9235474006116208, + "grad_norm": 0.2761037051677704, + "learning_rate": 8.470264037042639e-06, + "loss": 1.3436920642852783, + "step": 604 + }, + { + "epoch": 0.926605504587156, + "grad_norm": 0.2718355059623718, + "learning_rate": 8.458846158560787e-06, + "loss": 1.368149995803833, + "step": 606 + }, + { + "epoch": 0.9296636085626911, + "grad_norm": 0.471161812543869, + "learning_rate": 8.447394624582544e-06, + "loss": 1.3190257549285889, + "step": 608 + }, + { + "epoch": 0.9327217125382263, + "grad_norm": 0.24170783162117004, + "learning_rate": 8.435909565363772e-06, + "loss": 1.3419578075408936, + "step": 610 + }, + { + "epoch": 0.9357798165137615, + "grad_norm": 0.26485109329223633, + "learning_rate": 8.424391111541673e-06, + "loss": 1.338409662246704, + "step": 612 + }, + { + "epoch": 0.9388379204892966, + "grad_norm": 0.23220610618591309, + "learning_rate": 8.412839394133285e-06, + "loss": 1.3877780437469482, + "step": 614 + }, + { + "epoch": 0.9418960244648318, + "grad_norm": 0.24310626089572906, + "learning_rate": 8.401254544534018e-06, + "loss": 1.4051454067230225, + "step": 616 + }, + { + "epoch": 0.944954128440367, + "grad_norm": 0.299958735704422, + "learning_rate": 8.389636694516134e-06, + "loss": 1.3702571392059326, + "step": 618 + }, + { + "epoch": 0.9480122324159022, + "grad_norm": 0.449929803609848, + "learning_rate": 8.377985976227265e-06, + "loss": 1.379606008529663, + "step": 620 + }, + { + "epoch": 0.9510703363914373, + "grad_norm": 0.24171197414398193, + "learning_rate": 8.366302522188902e-06, + "loss": 1.350182294845581, + "step": 622 + }, + { + "epoch": 0.9541284403669725, + "grad_norm": 0.2935427129268646, + "learning_rate": 8.354586465294894e-06, + "loss": 1.2931137084960938, + "step": 624 + }, + { + "epoch": 0.9571865443425076, + "grad_norm": 0.23755374550819397, + "learning_rate": 8.342837938809925e-06, + "loss": 1.3183162212371826, + "step": 626 + }, + { + "epoch": 0.9602446483180428, + "grad_norm": 0.3486945331096649, + "learning_rate": 8.331057076368012e-06, + "loss": 1.3358354568481445, + "step": 628 + }, + { + "epoch": 0.963302752293578, + "grad_norm": 0.3866771459579468, + "learning_rate": 8.319244011970975e-06, + "loss": 1.3079657554626465, + "step": 630 + }, + { + "epoch": 0.9663608562691132, + "grad_norm": 0.23048752546310425, + "learning_rate": 8.307398879986917e-06, + "loss": 1.323075294494629, + "step": 632 + }, + { + "epoch": 0.9694189602446484, + "grad_norm": 0.2808099687099457, + "learning_rate": 8.295521815148697e-06, + "loss": 1.376133918762207, + "step": 634 + }, + { + "epoch": 0.9724770642201835, + "grad_norm": 0.3424737751483917, + "learning_rate": 8.283612952552393e-06, + "loss": 1.363619327545166, + "step": 636 + }, + { + "epoch": 0.9755351681957186, + "grad_norm": 0.23272113502025604, + "learning_rate": 8.271672427655765e-06, + "loss": 1.3780806064605713, + "step": 638 + }, + { + "epoch": 0.9785932721712538, + "grad_norm": 0.33965811133384705, + "learning_rate": 8.259700376276724e-06, + "loss": 1.3397910594940186, + "step": 640 + }, + { + "epoch": 0.981651376146789, + "grad_norm": 0.25269240140914917, + "learning_rate": 8.247696934591774e-06, + "loss": 1.3255189657211304, + "step": 642 + }, + { + "epoch": 0.9847094801223242, + "grad_norm": 1.2317392826080322, + "learning_rate": 8.235662239134473e-06, + "loss": 1.347729206085205, + "step": 644 + }, + { + "epoch": 0.9877675840978594, + "grad_norm": 0.37982505559921265, + "learning_rate": 8.22359642679387e-06, + "loss": 1.3894901275634766, + "step": 646 + }, + { + "epoch": 0.9908256880733946, + "grad_norm": 0.2849336564540863, + "learning_rate": 8.211499634812966e-06, + "loss": 1.429058313369751, + "step": 648 + }, + { + "epoch": 0.9938837920489296, + "grad_norm": 0.6233349442481995, + "learning_rate": 8.199372000787126e-06, + "loss": 2.095426082611084, + "step": 650 + }, + { + "epoch": 0.9969418960244648, + "grad_norm": 0.6541375517845154, + "learning_rate": 8.187213662662539e-06, + "loss": 2.1073060035705566, + "step": 652 + }, + { + "epoch": 1.0, + "grad_norm": 11.037178039550781, + "learning_rate": 8.175024758734636e-06, + "loss": 2.095914840698242, + "step": 654 + }, + { + "epoch": 1.003058103975535, + "grad_norm": 0.3948424160480499, + "learning_rate": 8.16280542764652e-06, + "loss": 1.4957305192947388, + "step": 656 + }, + { + "epoch": 1.0061162079510704, + "grad_norm": 0.310005784034729, + "learning_rate": 8.150555808387389e-06, + "loss": 1.455479383468628, + "step": 658 + }, + { + "epoch": 1.0091743119266054, + "grad_norm": 0.26789844036102295, + "learning_rate": 8.138276040290952e-06, + "loss": 1.4779293537139893, + "step": 660 + }, + { + "epoch": 1.0122324159021407, + "grad_norm": 0.19781345129013062, + "learning_rate": 8.125966263033852e-06, + "loss": 1.4063279628753662, + "step": 662 + }, + { + "epoch": 1.0152905198776758, + "grad_norm": 0.21764519810676575, + "learning_rate": 8.11362661663407e-06, + "loss": 1.5875146389007568, + "step": 664 + }, + { + "epoch": 1.018348623853211, + "grad_norm": 0.25749847292900085, + "learning_rate": 8.101257241449332e-06, + "loss": 1.480888843536377, + "step": 666 + }, + { + "epoch": 1.0214067278287462, + "grad_norm": 0.26426374912261963, + "learning_rate": 8.08885827817552e-06, + "loss": 1.4235765933990479, + "step": 668 + }, + { + "epoch": 1.0244648318042813, + "grad_norm": 0.25188708305358887, + "learning_rate": 8.07642986784506e-06, + "loss": 1.5084459781646729, + "step": 670 + }, + { + "epoch": 1.0275229357798166, + "grad_norm": 0.6583337783813477, + "learning_rate": 8.063972151825332e-06, + "loss": 1.369026780128479, + "step": 672 + }, + { + "epoch": 1.0305810397553516, + "grad_norm": 0.21123117208480835, + "learning_rate": 8.05148527181705e-06, + "loss": 1.4445654153823853, + "step": 674 + }, + { + "epoch": 1.033639143730887, + "grad_norm": 0.293588787317276, + "learning_rate": 8.038969369852654e-06, + "loss": 1.555469274520874, + "step": 676 + }, + { + "epoch": 1.036697247706422, + "grad_norm": 0.27872779965400696, + "learning_rate": 8.026424588294701e-06, + "loss": 1.4869214296340942, + "step": 678 + }, + { + "epoch": 1.039755351681957, + "grad_norm": 0.23042356967926025, + "learning_rate": 8.013851069834233e-06, + "loss": 1.279091238975525, + "step": 680 + }, + { + "epoch": 1.0428134556574924, + "grad_norm": 0.289106547832489, + "learning_rate": 8.001248957489164e-06, + "loss": 1.4306490421295166, + "step": 682 + }, + { + "epoch": 1.0458715596330275, + "grad_norm": 0.5272045135498047, + "learning_rate": 7.988618394602653e-06, + "loss": 1.6781132221221924, + "step": 684 + }, + { + "epoch": 1.0489296636085628, + "grad_norm": 0.22576113045215607, + "learning_rate": 7.975959524841464e-06, + "loss": 1.3457372188568115, + "step": 686 + }, + { + "epoch": 1.0519877675840978, + "grad_norm": 0.5630601644515991, + "learning_rate": 7.963272492194344e-06, + "loss": 1.4807915687561035, + "step": 688 + }, + { + "epoch": 1.0550458715596331, + "grad_norm": 0.34389057755470276, + "learning_rate": 7.950557440970377e-06, + "loss": 1.368910789489746, + "step": 690 + }, + { + "epoch": 1.0581039755351682, + "grad_norm": 0.21063481271266937, + "learning_rate": 7.937814515797348e-06, + "loss": 1.360002040863037, + "step": 692 + }, + { + "epoch": 1.0611620795107033, + "grad_norm": 0.20320424437522888, + "learning_rate": 7.92504386162009e-06, + "loss": 1.3675504922866821, + "step": 694 + }, + { + "epoch": 1.0642201834862386, + "grad_norm": 0.2813395857810974, + "learning_rate": 7.912245623698846e-06, + "loss": 1.395061731338501, + "step": 696 + }, + { + "epoch": 1.0672782874617737, + "grad_norm": 0.4647752046585083, + "learning_rate": 7.899419947607611e-06, + "loss": 1.5662283897399902, + "step": 698 + }, + { + "epoch": 1.070336391437309, + "grad_norm": 0.3765999972820282, + "learning_rate": 7.886566979232471e-06, + "loss": 1.5935697555541992, + "step": 700 + }, + { + "epoch": 1.073394495412844, + "grad_norm": 0.29083383083343506, + "learning_rate": 7.873686864769955e-06, + "loss": 1.434537649154663, + "step": 702 + }, + { + "epoch": 1.0764525993883791, + "grad_norm": 0.4763205349445343, + "learning_rate": 7.860779750725362e-06, + "loss": 1.4121177196502686, + "step": 704 + }, + { + "epoch": 1.0795107033639144, + "grad_norm": 0.33439531922340393, + "learning_rate": 7.8478457839111e-06, + "loss": 1.3943579196929932, + "step": 706 + }, + { + "epoch": 1.0825688073394495, + "grad_norm": 0.342690110206604, + "learning_rate": 7.834885111445017e-06, + "loss": 1.4776759147644043, + "step": 708 + }, + { + "epoch": 1.0856269113149848, + "grad_norm": 0.29185494780540466, + "learning_rate": 7.82189788074872e-06, + "loss": 1.4435069561004639, + "step": 710 + }, + { + "epoch": 1.0886850152905199, + "grad_norm": 1.3288284540176392, + "learning_rate": 7.80888423954591e-06, + "loss": 1.4731531143188477, + "step": 712 + }, + { + "epoch": 1.091743119266055, + "grad_norm": 0.2119162380695343, + "learning_rate": 7.795844335860691e-06, + "loss": 1.4626476764678955, + "step": 714 + }, + { + "epoch": 1.0948012232415902, + "grad_norm": 0.20571930706501007, + "learning_rate": 7.782778318015892e-06, + "loss": 1.342850685119629, + "step": 716 + }, + { + "epoch": 1.0978593272171253, + "grad_norm": 0.22236645221710205, + "learning_rate": 7.769686334631375e-06, + "loss": 1.286208152770996, + "step": 718 + }, + { + "epoch": 1.1009174311926606, + "grad_norm": 0.18384046852588654, + "learning_rate": 7.756568534622355e-06, + "loss": 1.4446015357971191, + "step": 720 + }, + { + "epoch": 1.1039755351681957, + "grad_norm": 0.2486264407634735, + "learning_rate": 7.743425067197693e-06, + "loss": 1.5612818002700806, + "step": 722 + }, + { + "epoch": 1.107033639143731, + "grad_norm": 0.23211126029491425, + "learning_rate": 7.730256081858207e-06, + "loss": 1.3999545574188232, + "step": 724 + }, + { + "epoch": 1.110091743119266, + "grad_norm": 0.41483980417251587, + "learning_rate": 7.717061728394968e-06, + "loss": 1.591150164604187, + "step": 726 + }, + { + "epoch": 1.1131498470948011, + "grad_norm": 0.3113287091255188, + "learning_rate": 7.7038421568876e-06, + "loss": 1.620883584022522, + "step": 728 + }, + { + "epoch": 1.1162079510703364, + "grad_norm": 0.5611585378646851, + "learning_rate": 7.690597517702569e-06, + "loss": 1.3835599422454834, + "step": 730 + }, + { + "epoch": 1.1192660550458715, + "grad_norm": 0.5187618732452393, + "learning_rate": 7.677327961491475e-06, + "loss": 1.3614990711212158, + "step": 732 + }, + { + "epoch": 1.1223241590214068, + "grad_norm": 0.34465184807777405, + "learning_rate": 7.664033639189336e-06, + "loss": 1.467517614364624, + "step": 734 + }, + { + "epoch": 1.1253822629969419, + "grad_norm": 0.22211050987243652, + "learning_rate": 7.650714702012876e-06, + "loss": 1.287433385848999, + "step": 736 + }, + { + "epoch": 1.1284403669724772, + "grad_norm": 0.36259227991104126, + "learning_rate": 7.637371301458797e-06, + "loss": 1.367175817489624, + "step": 738 + }, + { + "epoch": 1.1314984709480123, + "grad_norm": 0.44571414589881897, + "learning_rate": 7.6240035893020625e-06, + "loss": 1.3308281898498535, + "step": 740 + }, + { + "epoch": 1.1345565749235473, + "grad_norm": 0.26124662160873413, + "learning_rate": 7.610611717594173e-06, + "loss": 1.3915913105010986, + "step": 742 + }, + { + "epoch": 1.1376146788990826, + "grad_norm": 0.3137398064136505, + "learning_rate": 7.597195838661426e-06, + "loss": 1.3188378810882568, + "step": 744 + }, + { + "epoch": 1.1406727828746177, + "grad_norm": 0.3484938144683838, + "learning_rate": 7.583756105103195e-06, + "loss": 1.3703608512878418, + "step": 746 + }, + { + "epoch": 1.143730886850153, + "grad_norm": 0.3699035942554474, + "learning_rate": 7.570292669790186e-06, + "loss": 1.5115067958831787, + "step": 748 + }, + { + "epoch": 1.146788990825688, + "grad_norm": 0.24170878529548645, + "learning_rate": 7.556805685862703e-06, + "loss": 1.3954684734344482, + "step": 750 + }, + { + "epoch": 1.1498470948012232, + "grad_norm": 0.20038793981075287, + "learning_rate": 7.543295306728904e-06, + "loss": 1.345947027206421, + "step": 752 + }, + { + "epoch": 1.1529051987767585, + "grad_norm": 0.38949868083000183, + "learning_rate": 7.529761686063056e-06, + "loss": 1.5590949058532715, + "step": 754 + }, + { + "epoch": 1.1559633027522935, + "grad_norm": 0.33645766973495483, + "learning_rate": 7.516204977803789e-06, + "loss": 1.446972370147705, + "step": 756 + }, + { + "epoch": 1.1590214067278288, + "grad_norm": 0.18463970720767975, + "learning_rate": 7.5026253361523435e-06, + "loss": 1.3630192279815674, + "step": 758 + }, + { + "epoch": 1.162079510703364, + "grad_norm": 0.33572879433631897, + "learning_rate": 7.489022915570813e-06, + "loss": 1.457106113433838, + "step": 760 + }, + { + "epoch": 1.165137614678899, + "grad_norm": 0.2753995954990387, + "learning_rate": 7.475397870780397e-06, + "loss": 1.4502360820770264, + "step": 762 + }, + { + "epoch": 1.1681957186544343, + "grad_norm": 0.35596194863319397, + "learning_rate": 7.4617503567596295e-06, + "loss": 1.4977834224700928, + "step": 764 + }, + { + "epoch": 1.1712538226299694, + "grad_norm": 0.4726940095424652, + "learning_rate": 7.448080528742624e-06, + "loss": 1.3764468431472778, + "step": 766 + }, + { + "epoch": 1.1743119266055047, + "grad_norm": 0.26225268840789795, + "learning_rate": 7.434388542217303e-06, + "loss": 1.4741466045379639, + "step": 768 + }, + { + "epoch": 1.1773700305810397, + "grad_norm": 0.27619338035583496, + "learning_rate": 7.420674552923638e-06, + "loss": 1.3593350648880005, + "step": 770 + }, + { + "epoch": 1.1804281345565748, + "grad_norm": 0.3182947635650635, + "learning_rate": 7.4069387168518615e-06, + "loss": 1.673621654510498, + "step": 772 + }, + { + "epoch": 1.18348623853211, + "grad_norm": 0.28721779584884644, + "learning_rate": 7.393181190240714e-06, + "loss": 1.4450278282165527, + "step": 774 + }, + { + "epoch": 1.1865443425076452, + "grad_norm": 0.2768658399581909, + "learning_rate": 7.379402129575645e-06, + "loss": 1.5032843351364136, + "step": 776 + }, + { + "epoch": 1.1896024464831805, + "grad_norm": 0.3218024969100952, + "learning_rate": 7.3656016915870545e-06, + "loss": 1.4965013265609741, + "step": 778 + }, + { + "epoch": 1.1926605504587156, + "grad_norm": 0.4919971227645874, + "learning_rate": 7.351780033248491e-06, + "loss": 1.4509224891662598, + "step": 780 + }, + { + "epoch": 1.1957186544342508, + "grad_norm": 0.3981909155845642, + "learning_rate": 7.33793731177488e-06, + "loss": 1.4464759826660156, + "step": 782 + }, + { + "epoch": 1.198776758409786, + "grad_norm": 0.3076995611190796, + "learning_rate": 7.324073684620726e-06, + "loss": 1.4577126502990723, + "step": 784 + }, + { + "epoch": 1.2018348623853212, + "grad_norm": 0.28227174282073975, + "learning_rate": 7.310189309478331e-06, + "loss": 1.439997911453247, + "step": 786 + }, + { + "epoch": 1.2048929663608563, + "grad_norm": 0.26599401235580444, + "learning_rate": 7.296284344275991e-06, + "loss": 1.531783103942871, + "step": 788 + }, + { + "epoch": 1.2079510703363914, + "grad_norm": 0.69685959815979, + "learning_rate": 7.282358947176207e-06, + "loss": 1.4577662944793701, + "step": 790 + }, + { + "epoch": 1.2110091743119267, + "grad_norm": 0.25103896856307983, + "learning_rate": 7.268413276573881e-06, + "loss": 1.3561824560165405, + "step": 792 + }, + { + "epoch": 1.2140672782874617, + "grad_norm": 0.21765579283237457, + "learning_rate": 7.25444749109452e-06, + "loss": 1.3165652751922607, + "step": 794 + }, + { + "epoch": 1.217125382262997, + "grad_norm": 0.2564055919647217, + "learning_rate": 7.2404617495924254e-06, + "loss": 1.383346676826477, + "step": 796 + }, + { + "epoch": 1.2201834862385321, + "grad_norm": 0.40797773003578186, + "learning_rate": 7.226456211148891e-06, + "loss": 1.3315465450286865, + "step": 798 + }, + { + "epoch": 1.2232415902140672, + "grad_norm": 0.31532490253448486, + "learning_rate": 7.212431035070391e-06, + "loss": 1.3896580934524536, + "step": 800 + }, + { + "epoch": 1.2262996941896025, + "grad_norm": 0.25705334544181824, + "learning_rate": 7.198386380886765e-06, + "loss": 1.3460421562194824, + "step": 802 + }, + { + "epoch": 1.2293577981651376, + "grad_norm": 0.31377753615379333, + "learning_rate": 7.1843224083494154e-06, + "loss": 1.595191240310669, + "step": 804 + }, + { + "epoch": 1.2324159021406729, + "grad_norm": 0.2853119969367981, + "learning_rate": 7.170239277429474e-06, + "loss": 1.6170880794525146, + "step": 806 + }, + { + "epoch": 1.235474006116208, + "grad_norm": 0.44243165850639343, + "learning_rate": 7.156137148315993e-06, + "loss": 1.6550755500793457, + "step": 808 + }, + { + "epoch": 1.238532110091743, + "grad_norm": 0.3517357110977173, + "learning_rate": 7.14201618141412e-06, + "loss": 1.566192865371704, + "step": 810 + }, + { + "epoch": 1.2415902140672783, + "grad_norm": 0.2986673414707184, + "learning_rate": 7.127876537343277e-06, + "loss": 1.63118314743042, + "step": 812 + }, + { + "epoch": 1.2446483180428134, + "grad_norm": 0.3479074537754059, + "learning_rate": 7.1137183769353225e-06, + "loss": 1.5168559551239014, + "step": 814 + }, + { + "epoch": 1.2477064220183487, + "grad_norm": 0.4152420461177826, + "learning_rate": 7.099541861232736e-06, + "loss": 1.6398264169692993, + "step": 816 + }, + { + "epoch": 1.2507645259938838, + "grad_norm": 0.384573370218277, + "learning_rate": 7.085347151486779e-06, + "loss": 1.4128949642181396, + "step": 818 + }, + { + "epoch": 1.2538226299694188, + "grad_norm": 0.3804616630077362, + "learning_rate": 7.071134409155659e-06, + "loss": 1.557448148727417, + "step": 820 + }, + { + "epoch": 1.2568807339449541, + "grad_norm": 0.6236130595207214, + "learning_rate": 7.056903795902701e-06, + "loss": 1.3184959888458252, + "step": 822 + }, + { + "epoch": 1.2599388379204892, + "grad_norm": 0.7443933486938477, + "learning_rate": 7.042655473594495e-06, + "loss": 1.537932276725769, + "step": 824 + }, + { + "epoch": 1.2629969418960245, + "grad_norm": 0.5472233891487122, + "learning_rate": 7.028389604299074e-06, + "loss": 1.1561626195907593, + "step": 826 + }, + { + "epoch": 1.2660550458715596, + "grad_norm": 0.847542941570282, + "learning_rate": 7.01410635028405e-06, + "loss": 1.1249284744262695, + "step": 828 + }, + { + "epoch": 1.2691131498470947, + "grad_norm": 0.3495579957962036, + "learning_rate": 6.9998058740147835e-06, + "loss": 1.3474421501159668, + "step": 830 + }, + { + "epoch": 1.27217125382263, + "grad_norm": 0.4069005846977234, + "learning_rate": 6.985488338152529e-06, + "loss": 1.3892837762832642, + "step": 832 + }, + { + "epoch": 1.2752293577981653, + "grad_norm": 0.6165335178375244, + "learning_rate": 6.971153905552587e-06, + "loss": 1.524814248085022, + "step": 834 + }, + { + "epoch": 1.2782874617737003, + "grad_norm": 0.6481596827507019, + "learning_rate": 6.956802739262446e-06, + "loss": 1.464059829711914, + "step": 836 + }, + { + "epoch": 1.2813455657492354, + "grad_norm": 0.3051135241985321, + "learning_rate": 6.942435002519938e-06, + "loss": 1.212691307067871, + "step": 838 + }, + { + "epoch": 1.2844036697247707, + "grad_norm": 0.31896138191223145, + "learning_rate": 6.9280508587513725e-06, + "loss": 1.179284691810608, + "step": 840 + }, + { + "epoch": 1.2874617737003058, + "grad_norm": 0.2261551022529602, + "learning_rate": 6.913650471569684e-06, + "loss": 1.38997220993042, + "step": 842 + }, + { + "epoch": 1.290519877675841, + "grad_norm": 0.3368714451789856, + "learning_rate": 6.899234004772566e-06, + "loss": 1.3169426918029785, + "step": 844 + }, + { + "epoch": 1.2935779816513762, + "grad_norm": 0.49499788880348206, + "learning_rate": 6.884801622340612e-06, + "loss": 1.293768048286438, + "step": 846 + }, + { + "epoch": 1.2966360856269112, + "grad_norm": 0.2904210686683655, + "learning_rate": 6.870353488435447e-06, + "loss": 1.5008976459503174, + "step": 848 + }, + { + "epoch": 1.2996941896024465, + "grad_norm": 0.4230108857154846, + "learning_rate": 6.855889767397863e-06, + "loss": 1.4707106351852417, + "step": 850 + }, + { + "epoch": 1.3027522935779816, + "grad_norm": 0.2836777865886688, + "learning_rate": 6.841410623745944e-06, + "loss": 1.182532548904419, + "step": 852 + }, + { + "epoch": 1.305810397553517, + "grad_norm": 0.3048684895038605, + "learning_rate": 6.826916222173205e-06, + "loss": 1.373314380645752, + "step": 854 + }, + { + "epoch": 1.308868501529052, + "grad_norm": 0.38874655961990356, + "learning_rate": 6.812406727546713e-06, + "loss": 1.5207183361053467, + "step": 856 + }, + { + "epoch": 1.311926605504587, + "grad_norm": 0.541847288608551, + "learning_rate": 6.7978823049052046e-06, + "loss": 1.6546745300292969, + "step": 858 + }, + { + "epoch": 1.3149847094801224, + "grad_norm": 0.3354927897453308, + "learning_rate": 6.783343119457221e-06, + "loss": 1.6852827072143555, + "step": 860 + }, + { + "epoch": 1.3180428134556574, + "grad_norm": 0.22799281775951385, + "learning_rate": 6.768789336579224e-06, + "loss": 1.7998615503311157, + "step": 862 + }, + { + "epoch": 1.3211009174311927, + "grad_norm": 0.2829393446445465, + "learning_rate": 6.754221121813707e-06, + "loss": 1.3555914163589478, + "step": 864 + }, + { + "epoch": 1.3241590214067278, + "grad_norm": 0.2552604377269745, + "learning_rate": 6.739638640867332e-06, + "loss": 1.44038724899292, + "step": 866 + }, + { + "epoch": 1.3272171253822629, + "grad_norm": 0.2328341007232666, + "learning_rate": 6.72504205960902e-06, + "loss": 1.2792387008666992, + "step": 868 + }, + { + "epoch": 1.3302752293577982, + "grad_norm": 0.19776956737041473, + "learning_rate": 6.710431544068085e-06, + "loss": 1.2014856338500977, + "step": 870 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2862965762615204, + "learning_rate": 6.695807260432332e-06, + "loss": 1.612195372581482, + "step": 872 + }, + { + "epoch": 1.3363914373088686, + "grad_norm": 0.2737024426460266, + "learning_rate": 6.681169375046173e-06, + "loss": 1.4856352806091309, + "step": 874 + }, + { + "epoch": 1.3394495412844036, + "grad_norm": 0.33617132902145386, + "learning_rate": 6.666518054408734e-06, + "loss": 1.6690922975540161, + "step": 876 + }, + { + "epoch": 1.3425076452599387, + "grad_norm": 0.33230748772621155, + "learning_rate": 6.65185346517196e-06, + "loss": 1.134220838546753, + "step": 878 + }, + { + "epoch": 1.345565749235474, + "grad_norm": 0.34520813822746277, + "learning_rate": 6.637175774138722e-06, + "loss": 1.3939542770385742, + "step": 880 + }, + { + "epoch": 1.3486238532110093, + "grad_norm": 0.3193676471710205, + "learning_rate": 6.622485148260916e-06, + "loss": 1.6689043045043945, + "step": 882 + }, + { + "epoch": 1.3516819571865444, + "grad_norm": 0.2586718499660492, + "learning_rate": 6.607781754637567e-06, + "loss": 1.3927881717681885, + "step": 884 + }, + { + "epoch": 1.3547400611620795, + "grad_norm": 0.36470475792884827, + "learning_rate": 6.593065760512924e-06, + "loss": 1.5524687767028809, + "step": 886 + }, + { + "epoch": 1.3577981651376148, + "grad_norm": 0.5333327054977417, + "learning_rate": 6.578337333274566e-06, + "loss": 1.4335553646087646, + "step": 888 + }, + { + "epoch": 1.3608562691131498, + "grad_norm": 0.24828922748565674, + "learning_rate": 6.563596640451489e-06, + "loss": 1.3478354215621948, + "step": 890 + }, + { + "epoch": 1.3639143730886851, + "grad_norm": 0.2684786021709442, + "learning_rate": 6.548843849712206e-06, + "loss": 1.4221248626708984, + "step": 892 + }, + { + "epoch": 1.3669724770642202, + "grad_norm": 0.2922813594341278, + "learning_rate": 6.534079128862835e-06, + "loss": 1.4792616367340088, + "step": 894 + }, + { + "epoch": 1.3700305810397553, + "grad_norm": 0.21960243582725525, + "learning_rate": 6.5193026458452006e-06, + "loss": 1.3363940715789795, + "step": 896 + }, + { + "epoch": 1.3730886850152906, + "grad_norm": 0.41456371545791626, + "learning_rate": 6.50451456873491e-06, + "loss": 1.4480544328689575, + "step": 898 + }, + { + "epoch": 1.3761467889908257, + "grad_norm": 0.6222192049026489, + "learning_rate": 6.489715065739448e-06, + "loss": 1.7465565204620361, + "step": 900 + }, + { + "epoch": 1.379204892966361, + "grad_norm": 0.5998108983039856, + "learning_rate": 6.474904305196268e-06, + "loss": 2.144679546356201, + "step": 902 + }, + { + "epoch": 1.382262996941896, + "grad_norm": 0.5612609386444092, + "learning_rate": 6.4600824555708695e-06, + "loss": 1.378048300743103, + "step": 904 + }, + { + "epoch": 1.385321100917431, + "grad_norm": 0.32021385431289673, + "learning_rate": 6.445249685454885e-06, + "loss": 1.361167073249817, + "step": 906 + }, + { + "epoch": 1.3883792048929664, + "grad_norm": 0.36393630504608154, + "learning_rate": 6.4304061635641645e-06, + "loss": 1.433903694152832, + "step": 908 + }, + { + "epoch": 1.3914373088685015, + "grad_norm": 0.7985405325889587, + "learning_rate": 6.415552058736854e-06, + "loss": 1.5466125011444092, + "step": 910 + }, + { + "epoch": 1.3944954128440368, + "grad_norm": 0.30912530422210693, + "learning_rate": 6.4006875399314705e-06, + "loss": 1.463235855102539, + "step": 912 + }, + { + "epoch": 1.3975535168195719, + "grad_norm": 0.2953026294708252, + "learning_rate": 6.3858127762249945e-06, + "loss": 1.3276557922363281, + "step": 914 + }, + { + "epoch": 1.400611620795107, + "grad_norm": 0.19828742742538452, + "learning_rate": 6.3709279368109264e-06, + "loss": 1.2300511598587036, + "step": 916 + }, + { + "epoch": 1.4036697247706422, + "grad_norm": 0.21878407895565033, + "learning_rate": 6.356033190997386e-06, + "loss": 1.1606783866882324, + "step": 918 + }, + { + "epoch": 1.4067278287461773, + "grad_norm": 0.19046013057231903, + "learning_rate": 6.341128708205162e-06, + "loss": 1.3056751489639282, + "step": 920 + }, + { + "epoch": 1.4097859327217126, + "grad_norm": 0.40108954906463623, + "learning_rate": 6.326214657965804e-06, + "loss": 1.5421757698059082, + "step": 922 + }, + { + "epoch": 1.4128440366972477, + "grad_norm": 0.46537211537361145, + "learning_rate": 6.311291209919682e-06, + "loss": 1.5684192180633545, + "step": 924 + }, + { + "epoch": 1.4159021406727827, + "grad_norm": 0.5733487606048584, + "learning_rate": 6.296358533814065e-06, + "loss": 1.5650339126586914, + "step": 926 + }, + { + "epoch": 1.418960244648318, + "grad_norm": 0.4306733310222626, + "learning_rate": 6.281416799501188e-06, + "loss": 1.5992372035980225, + "step": 928 + }, + { + "epoch": 1.4220183486238533, + "grad_norm": 0.407654732465744, + "learning_rate": 6.266466176936313e-06, + "loss": 1.4283607006072998, + "step": 930 + }, + { + "epoch": 1.4250764525993884, + "grad_norm": 4.419346332550049, + "learning_rate": 6.251506836175807e-06, + "loss": 1.5659562349319458, + "step": 932 + }, + { + "epoch": 1.4281345565749235, + "grad_norm": 0.7012003064155579, + "learning_rate": 6.236538947375203e-06, + "loss": 1.4677741527557373, + "step": 934 + }, + { + "epoch": 1.4311926605504588, + "grad_norm": 0.22764644026756287, + "learning_rate": 6.221562680787258e-06, + "loss": 1.374863624572754, + "step": 936 + }, + { + "epoch": 1.4342507645259939, + "grad_norm": 0.4946407973766327, + "learning_rate": 6.20657820676003e-06, + "loss": 1.3795430660247803, + "step": 938 + }, + { + "epoch": 1.4373088685015292, + "grad_norm": 1.4666649103164673, + "learning_rate": 6.191585695734925e-06, + "loss": 1.584106683731079, + "step": 940 + }, + { + "epoch": 1.4403669724770642, + "grad_norm": 0.9116813540458679, + "learning_rate": 6.176585318244775e-06, + "loss": 1.3207650184631348, + "step": 942 + }, + { + "epoch": 1.4434250764525993, + "grad_norm": 0.4549460709095001, + "learning_rate": 6.161577244911883e-06, + "loss": 1.5188086032867432, + "step": 944 + }, + { + "epoch": 1.4464831804281346, + "grad_norm": 0.6293279528617859, + "learning_rate": 6.146561646446088e-06, + "loss": 1.40483558177948, + "step": 946 + }, + { + "epoch": 1.4495412844036697, + "grad_norm": 0.5348030924797058, + "learning_rate": 6.131538693642828e-06, + "loss": 1.4180057048797607, + "step": 948 + }, + { + "epoch": 1.452599388379205, + "grad_norm": 0.7010774612426758, + "learning_rate": 6.116508557381191e-06, + "loss": 1.5555238723754883, + "step": 950 + }, + { + "epoch": 1.45565749235474, + "grad_norm": 0.3996182382106781, + "learning_rate": 6.1014714086219725e-06, + "loss": 1.5635944604873657, + "step": 952 + }, + { + "epoch": 1.4587155963302751, + "grad_norm": 0.3819827139377594, + "learning_rate": 6.086427418405735e-06, + "loss": 1.3868696689605713, + "step": 954 + }, + { + "epoch": 1.4617737003058104, + "grad_norm": 0.24838334321975708, + "learning_rate": 6.071376757850858e-06, + "loss": 1.3217381238937378, + "step": 956 + }, + { + "epoch": 1.4648318042813455, + "grad_norm": 0.5527139902114868, + "learning_rate": 6.0563195981515885e-06, + "loss": 1.456415057182312, + "step": 958 + }, + { + "epoch": 1.4678899082568808, + "grad_norm": 0.2822090983390808, + "learning_rate": 6.0412561105761055e-06, + "loss": 1.3990404605865479, + "step": 960 + }, + { + "epoch": 1.470948012232416, + "grad_norm": 0.370832234621048, + "learning_rate": 6.026186466464562e-06, + "loss": 1.5524400472640991, + "step": 962 + }, + { + "epoch": 1.474006116207951, + "grad_norm": 0.30970191955566406, + "learning_rate": 6.011110837227138e-06, + "loss": 1.4143943786621094, + "step": 964 + }, + { + "epoch": 1.4770642201834863, + "grad_norm": 0.3659932613372803, + "learning_rate": 5.996029394342089e-06, + "loss": 1.3726913928985596, + "step": 966 + }, + { + "epoch": 1.4801223241590213, + "grad_norm": 0.40378639101982117, + "learning_rate": 5.980942309353803e-06, + "loss": 1.3403112888336182, + "step": 968 + }, + { + "epoch": 1.4831804281345566, + "grad_norm": 0.2668818235397339, + "learning_rate": 5.965849753870841e-06, + "loss": 1.4581551551818848, + "step": 970 + }, + { + "epoch": 1.4862385321100917, + "grad_norm": 0.39147576689720154, + "learning_rate": 5.950751899563989e-06, + "loss": 1.4426075220108032, + "step": 972 + }, + { + "epoch": 1.4892966360856268, + "grad_norm": 0.4053312838077545, + "learning_rate": 5.935648918164308e-06, + "loss": 1.429807424545288, + "step": 974 + }, + { + "epoch": 1.492354740061162, + "grad_norm": 0.2912329435348511, + "learning_rate": 5.9205409814611694e-06, + "loss": 1.6015820503234863, + "step": 976 + }, + { + "epoch": 1.4954128440366974, + "grad_norm": 0.39581140875816345, + "learning_rate": 5.9054282613003165e-06, + "loss": 1.7901129722595215, + "step": 978 + }, + { + "epoch": 1.4984709480122325, + "grad_norm": 5.4772210121154785, + "learning_rate": 5.890310929581899e-06, + "loss": 1.665008544921875, + "step": 980 + }, + { + "epoch": 1.5015290519877675, + "grad_norm": 0.32753488421440125, + "learning_rate": 5.875189158258521e-06, + "loss": 1.658569574356079, + "step": 982 + }, + { + "epoch": 1.5045871559633026, + "grad_norm": 0.3322629928588867, + "learning_rate": 5.860063119333287e-06, + "loss": 1.568853735923767, + "step": 984 + }, + { + "epoch": 1.507645259938838, + "grad_norm": 0.3625146746635437, + "learning_rate": 5.844932984857841e-06, + "loss": 1.2555010318756104, + "step": 986 + }, + { + "epoch": 1.5107033639143732, + "grad_norm": 0.3967174291610718, + "learning_rate": 5.829798926930411e-06, + "loss": 1.2352030277252197, + "step": 988 + }, + { + "epoch": 1.5137614678899083, + "grad_norm": 0.92249995470047, + "learning_rate": 5.814661117693856e-06, + "loss": 1.6529834270477295, + "step": 990 + }, + { + "epoch": 1.5168195718654434, + "grad_norm": 0.43264713883399963, + "learning_rate": 5.799519729333702e-06, + "loss": 1.6510822772979736, + "step": 992 + }, + { + "epoch": 1.5198776758409784, + "grad_norm": 0.48226049542427063, + "learning_rate": 5.784374934076188e-06, + "loss": 1.7469120025634766, + "step": 994 + }, + { + "epoch": 1.5229357798165137, + "grad_norm": 0.6006577014923096, + "learning_rate": 5.769226904186301e-06, + "loss": 1.6751326322555542, + "step": 996 + }, + { + "epoch": 1.525993883792049, + "grad_norm": 0.417524129152298, + "learning_rate": 5.754075811965826e-06, + "loss": 1.7241541147232056, + "step": 998 + }, + { + "epoch": 1.529051987767584, + "grad_norm": 0.4846678674221039, + "learning_rate": 5.738921829751374e-06, + "loss": 1.5894498825073242, + "step": 1000 + }, + { + "epoch": 1.5321100917431192, + "grad_norm": 0.37620386481285095, + "learning_rate": 5.723765129912433e-06, + "loss": 1.5567536354064941, + "step": 1002 + }, + { + "epoch": 1.5351681957186545, + "grad_norm": 0.9559251070022583, + "learning_rate": 5.708605884849402e-06, + "loss": 1.444126844406128, + "step": 1004 + }, + { + "epoch": 1.5382262996941896, + "grad_norm": 0.4608314335346222, + "learning_rate": 5.6934442669916315e-06, + "loss": 1.7045128345489502, + "step": 1006 + }, + { + "epoch": 1.5412844036697249, + "grad_norm": 0.5580506920814514, + "learning_rate": 5.678280448795457e-06, + "loss": 1.576319932937622, + "step": 1008 + }, + { + "epoch": 1.54434250764526, + "grad_norm": 0.414983332157135, + "learning_rate": 5.663114602742247e-06, + "loss": 1.1866123676300049, + "step": 1010 + }, + { + "epoch": 1.547400611620795, + "grad_norm": 0.5494526624679565, + "learning_rate": 5.647946901336433e-06, + "loss": 1.7420477867126465, + "step": 1012 + }, + { + "epoch": 1.5504587155963303, + "grad_norm": 0.6842697262763977, + "learning_rate": 5.632777517103552e-06, + "loss": 1.7904109954833984, + "step": 1014 + }, + { + "epoch": 1.5535168195718656, + "grad_norm": 0.43980666995048523, + "learning_rate": 5.617606622588282e-06, + "loss": 1.862006425857544, + "step": 1016 + }, + { + "epoch": 1.5565749235474007, + "grad_norm": 0.3990402817726135, + "learning_rate": 5.602434390352476e-06, + "loss": 1.7830100059509277, + "step": 1018 + }, + { + "epoch": 1.5596330275229358, + "grad_norm": 0.4031524360179901, + "learning_rate": 5.58726099297321e-06, + "loss": 1.7594141960144043, + "step": 1020 + }, + { + "epoch": 1.5626911314984708, + "grad_norm": 0.6580591797828674, + "learning_rate": 5.572086603040809e-06, + "loss": 1.6219829320907593, + "step": 1022 + }, + { + "epoch": 1.5657492354740061, + "grad_norm": 0.36656439304351807, + "learning_rate": 5.556911393156885e-06, + "loss": 1.4893901348114014, + "step": 1024 + }, + { + "epoch": 1.5688073394495414, + "grad_norm": 0.6261524558067322, + "learning_rate": 5.541735535932383e-06, + "loss": 1.058058261871338, + "step": 1026 + }, + { + "epoch": 1.5718654434250765, + "grad_norm": 0.3441345691680908, + "learning_rate": 5.526559203985605e-06, + "loss": 1.0509142875671387, + "step": 1028 + }, + { + "epoch": 1.5749235474006116, + "grad_norm": 0.2408900260925293, + "learning_rate": 5.511382569940258e-06, + "loss": 1.2871123552322388, + "step": 1030 + }, + { + "epoch": 1.5779816513761467, + "grad_norm": 0.45723816752433777, + "learning_rate": 5.496205806423481e-06, + "loss": 1.2235673666000366, + "step": 1032 + }, + { + "epoch": 1.581039755351682, + "grad_norm": 0.3109905421733856, + "learning_rate": 5.481029086063887e-06, + "loss": 1.177577018737793, + "step": 1034 + }, + { + "epoch": 1.5840978593272173, + "grad_norm": 0.20282985270023346, + "learning_rate": 5.4658525814896014e-06, + "loss": 1.2040612697601318, + "step": 1036 + }, + { + "epoch": 1.5871559633027523, + "grad_norm": 0.43076759576797485, + "learning_rate": 5.45067646532629e-06, + "loss": 1.4584531784057617, + "step": 1038 + }, + { + "epoch": 1.5902140672782874, + "grad_norm": 0.472885400056839, + "learning_rate": 5.435500910195203e-06, + "loss": 1.387641429901123, + "step": 1040 + }, + { + "epoch": 1.5932721712538225, + "grad_norm": 3.1532437801361084, + "learning_rate": 5.420326088711209e-06, + "loss": 1.221092700958252, + "step": 1042 + }, + { + "epoch": 1.5963302752293578, + "grad_norm": 0.6743189692497253, + "learning_rate": 5.405152173480833e-06, + "loss": 1.4836219549179077, + "step": 1044 + }, + { + "epoch": 1.599388379204893, + "grad_norm": 0.20277228951454163, + "learning_rate": 5.389979337100289e-06, + "loss": 1.5031371116638184, + "step": 1046 + }, + { + "epoch": 1.6024464831804281, + "grad_norm": 0.5120447874069214, + "learning_rate": 5.374807752153522e-06, + "loss": 1.282975673675537, + "step": 1048 + }, + { + "epoch": 1.6055045871559632, + "grad_norm": 0.35753709077835083, + "learning_rate": 5.359637591210242e-06, + "loss": 1.4665361642837524, + "step": 1050 + }, + { + "epoch": 1.6085626911314985, + "grad_norm": 0.7353309988975525, + "learning_rate": 5.344469026823959e-06, + "loss": 1.6730611324310303, + "step": 1052 + }, + { + "epoch": 1.6116207951070336, + "grad_norm": 0.4338257610797882, + "learning_rate": 5.329302231530029e-06, + "loss": 1.186348795890808, + "step": 1054 + }, + { + "epoch": 1.614678899082569, + "grad_norm": 0.42416566610336304, + "learning_rate": 5.31413737784368e-06, + "loss": 1.4430310726165771, + "step": 1056 + }, + { + "epoch": 1.617737003058104, + "grad_norm": 0.2432592213153839, + "learning_rate": 5.298974638258055e-06, + "loss": 1.518967866897583, + "step": 1058 + }, + { + "epoch": 1.620795107033639, + "grad_norm": 0.408245712518692, + "learning_rate": 5.283814185242252e-06, + "loss": 1.426690697669983, + "step": 1060 + }, + { + "epoch": 1.6238532110091743, + "grad_norm": 0.2117079198360443, + "learning_rate": 5.2686561912393606e-06, + "loss": 1.2693121433258057, + "step": 1062 + }, + { + "epoch": 1.6269113149847096, + "grad_norm": 4.30716609954834, + "learning_rate": 5.253500828664501e-06, + "loss": 0.9013931155204773, + "step": 1064 + }, + { + "epoch": 1.6299694189602447, + "grad_norm": 0.38770049810409546, + "learning_rate": 5.23834826990286e-06, + "loss": 1.5694489479064941, + "step": 1066 + }, + { + "epoch": 1.6330275229357798, + "grad_norm": 0.6700468063354492, + "learning_rate": 5.223198687307733e-06, + "loss": 1.503030776977539, + "step": 1068 + }, + { + "epoch": 1.6360856269113149, + "grad_norm": 0.2767106294631958, + "learning_rate": 5.208052253198564e-06, + "loss": 1.3917062282562256, + "step": 1070 + }, + { + "epoch": 1.6391437308868502, + "grad_norm": 0.3463125228881836, + "learning_rate": 5.192909139858981e-06, + "loss": 1.5068938732147217, + "step": 1072 + }, + { + "epoch": 1.6422018348623855, + "grad_norm": 0.3212260603904724, + "learning_rate": 5.177769519534846e-06, + "loss": 1.4421181678771973, + "step": 1074 + }, + { + "epoch": 1.6452599388379205, + "grad_norm": 0.4484805762767792, + "learning_rate": 5.162633564432285e-06, + "loss": 1.408212661743164, + "step": 1076 + }, + { + "epoch": 1.6483180428134556, + "grad_norm": 0.4805358350276947, + "learning_rate": 5.1475014467157325e-06, + "loss": 1.6133791208267212, + "step": 1078 + }, + { + "epoch": 1.6513761467889907, + "grad_norm": 0.5775420665740967, + "learning_rate": 5.132373338505978e-06, + "loss": 1.2856450080871582, + "step": 1080 + }, + { + "epoch": 1.654434250764526, + "grad_norm": 0.32906994223594666, + "learning_rate": 5.117249411878204e-06, + "loss": 1.04205322265625, + "step": 1082 + }, + { + "epoch": 1.6574923547400613, + "grad_norm": 0.5074779987335205, + "learning_rate": 5.10212983886003e-06, + "loss": 1.6698901653289795, + "step": 1084 + }, + { + "epoch": 1.6605504587155964, + "grad_norm": 0.36449626088142395, + "learning_rate": 5.087014791429552e-06, + "loss": 1.449878215789795, + "step": 1086 + }, + { + "epoch": 1.6636085626911314, + "grad_norm": 1.0477646589279175, + "learning_rate": 5.071904441513393e-06, + "loss": 1.5865240097045898, + "step": 1088 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3797400891780853, + "learning_rate": 5.056798960984741e-06, + "loss": 1.4271771907806396, + "step": 1090 + }, + { + "epoch": 1.6697247706422018, + "grad_norm": 0.3018883466720581, + "learning_rate": 5.041698521661401e-06, + "loss": 1.6418373584747314, + "step": 1092 + }, + { + "epoch": 1.6727828746177371, + "grad_norm": 0.5908496379852295, + "learning_rate": 5.026603295303833e-06, + "loss": 1.5063586235046387, + "step": 1094 + }, + { + "epoch": 1.6758409785932722, + "grad_norm": 0.5799764394760132, + "learning_rate": 5.011513453613205e-06, + "loss": 1.5312390327453613, + "step": 1096 + }, + { + "epoch": 1.6788990825688073, + "grad_norm": 0.4648537337779999, + "learning_rate": 4.996429168229432e-06, + "loss": 1.4155495166778564, + "step": 1098 + }, + { + "epoch": 1.6819571865443423, + "grad_norm": 0.3357274830341339, + "learning_rate": 4.981350610729234e-06, + "loss": 1.07462477684021, + "step": 1100 + }, + { + "epoch": 1.6850152905198776, + "grad_norm": 0.8209952712059021, + "learning_rate": 4.966277952624179e-06, + "loss": 1.532288670539856, + "step": 1102 + }, + { + "epoch": 1.688073394495413, + "grad_norm": 0.6916195750236511, + "learning_rate": 4.951211365358723e-06, + "loss": 1.5015881061553955, + "step": 1104 + }, + { + "epoch": 1.691131498470948, + "grad_norm": 0.6677690148353577, + "learning_rate": 4.936151020308282e-06, + "loss": 1.5166327953338623, + "step": 1106 + }, + { + "epoch": 1.694189602446483, + "grad_norm": 0.7889437675476074, + "learning_rate": 4.921097088777261e-06, + "loss": 1.5232961177825928, + "step": 1108 + }, + { + "epoch": 1.6972477064220184, + "grad_norm": 0.5421835780143738, + "learning_rate": 4.906049741997119e-06, + "loss": 1.3370258808135986, + "step": 1110 + }, + { + "epoch": 1.7003058103975535, + "grad_norm": 0.28672778606414795, + "learning_rate": 4.8910091511244115e-06, + "loss": 1.5552886724472046, + "step": 1112 + }, + { + "epoch": 1.7033639143730888, + "grad_norm": 0.8609727025032043, + "learning_rate": 4.875975487238853e-06, + "loss": 1.6477062702178955, + "step": 1114 + }, + { + "epoch": 1.7064220183486238, + "grad_norm": 0.46577727794647217, + "learning_rate": 4.860948921341366e-06, + "loss": 1.3554713726043701, + "step": 1116 + }, + { + "epoch": 1.709480122324159, + "grad_norm": 0.4357546865940094, + "learning_rate": 4.845929624352136e-06, + "loss": 1.616469383239746, + "step": 1118 + }, + { + "epoch": 1.7125382262996942, + "grad_norm": 0.8016573786735535, + "learning_rate": 4.830917767108666e-06, + "loss": 1.4049677848815918, + "step": 1120 + }, + { + "epoch": 1.7155963302752295, + "grad_norm": 0.34570103883743286, + "learning_rate": 4.8159135203638394e-06, + "loss": 1.5350430011749268, + "step": 1122 + }, + { + "epoch": 1.7186544342507646, + "grad_norm": 0.6164813041687012, + "learning_rate": 4.800917054783971e-06, + "loss": 1.4737257957458496, + "step": 1124 + }, + { + "epoch": 1.7217125382262997, + "grad_norm": 0.30021098256111145, + "learning_rate": 4.785928540946869e-06, + "loss": 1.59697425365448, + "step": 1126 + }, + { + "epoch": 1.7247706422018347, + "grad_norm": 0.3294142782688141, + "learning_rate": 4.770948149339897e-06, + "loss": 1.6918811798095703, + "step": 1128 + }, + { + "epoch": 1.72782874617737, + "grad_norm": 0.33221927285194397, + "learning_rate": 4.755976050358026e-06, + "loss": 1.581977128982544, + "step": 1130 + }, + { + "epoch": 1.7308868501529053, + "grad_norm": 0.27995747327804565, + "learning_rate": 4.741012414301907e-06, + "loss": 1.42479407787323, + "step": 1132 + }, + { + "epoch": 1.7339449541284404, + "grad_norm": 0.4526294767856598, + "learning_rate": 4.726057411375927e-06, + "loss": 1.5270183086395264, + "step": 1134 + }, + { + "epoch": 1.7370030581039755, + "grad_norm": 0.6458525657653809, + "learning_rate": 4.711111211686279e-06, + "loss": 1.5350821018218994, + "step": 1136 + }, + { + "epoch": 1.7400611620795106, + "grad_norm": 0.40516841411590576, + "learning_rate": 4.6961739852390175e-06, + "loss": 1.5310497283935547, + "step": 1138 + }, + { + "epoch": 1.7431192660550459, + "grad_norm": 1.3104746341705322, + "learning_rate": 4.681245901938134e-06, + "loss": 1.5385562181472778, + "step": 1140 + }, + { + "epoch": 1.7461773700305812, + "grad_norm": 0.40381914377212524, + "learning_rate": 4.666327131583621e-06, + "loss": 1.5392662286758423, + "step": 1142 + }, + { + "epoch": 1.7492354740061162, + "grad_norm": 0.8844152688980103, + "learning_rate": 4.65141784386954e-06, + "loss": 1.333682894706726, + "step": 1144 + }, + { + "epoch": 1.7522935779816513, + "grad_norm": 0.423922061920166, + "learning_rate": 4.636518208382091e-06, + "loss": 1.4100391864776611, + "step": 1146 + }, + { + "epoch": 1.7553516819571864, + "grad_norm": 0.3589678406715393, + "learning_rate": 4.621628394597687e-06, + "loss": 1.341862440109253, + "step": 1148 + }, + { + "epoch": 1.7584097859327217, + "grad_norm": 0.6498292088508606, + "learning_rate": 4.606748571881018e-06, + "loss": 1.4297010898590088, + "step": 1150 + }, + { + "epoch": 1.761467889908257, + "grad_norm": 0.5506405234336853, + "learning_rate": 4.59187890948314e-06, + "loss": 1.4309487342834473, + "step": 1152 + }, + { + "epoch": 1.764525993883792, + "grad_norm": 0.45955854654312134, + "learning_rate": 4.577019576539527e-06, + "loss": 1.2851155996322632, + "step": 1154 + }, + { + "epoch": 1.7675840978593271, + "grad_norm": 0.28625011444091797, + "learning_rate": 4.562170742068175e-06, + "loss": 0.9397743940353394, + "step": 1156 + }, + { + "epoch": 1.7706422018348624, + "grad_norm": 0.22773736715316772, + "learning_rate": 4.547332574967653e-06, + "loss": 1.237460732460022, + "step": 1158 + }, + { + "epoch": 1.7737003058103975, + "grad_norm": 0.25427719950675964, + "learning_rate": 4.5325052440151985e-06, + "loss": 1.3028910160064697, + "step": 1160 + }, + { + "epoch": 1.7767584097859328, + "grad_norm": 0.2875189781188965, + "learning_rate": 4.517688917864794e-06, + "loss": 1.3547457456588745, + "step": 1162 + }, + { + "epoch": 1.7798165137614679, + "grad_norm": 0.21899199485778809, + "learning_rate": 4.502883765045244e-06, + "loss": 1.36411714553833, + "step": 1164 + }, + { + "epoch": 1.782874617737003, + "grad_norm": 0.21183030307292938, + "learning_rate": 4.488089953958264e-06, + "loss": 1.4323028326034546, + "step": 1166 + }, + { + "epoch": 1.7859327217125383, + "grad_norm": 0.22526955604553223, + "learning_rate": 4.473307652876563e-06, + "loss": 1.3429040908813477, + "step": 1168 + }, + { + "epoch": 1.7889908256880735, + "grad_norm": 0.266107439994812, + "learning_rate": 4.458537029941926e-06, + "loss": 1.3663442134857178, + "step": 1170 + }, + { + "epoch": 1.7920489296636086, + "grad_norm": 0.490496963262558, + "learning_rate": 4.4437782531633074e-06, + "loss": 1.3354597091674805, + "step": 1172 + }, + { + "epoch": 1.7951070336391437, + "grad_norm": 0.1854841560125351, + "learning_rate": 4.429031490414919e-06, + "loss": 1.3446393013000488, + "step": 1174 + }, + { + "epoch": 1.7981651376146788, + "grad_norm": 0.1960364729166031, + "learning_rate": 4.414296909434311e-06, + "loss": 1.3029416799545288, + "step": 1176 + }, + { + "epoch": 1.801223241590214, + "grad_norm": 0.35048866271972656, + "learning_rate": 4.399574677820481e-06, + "loss": 1.348449945449829, + "step": 1178 + }, + { + "epoch": 1.8042813455657494, + "grad_norm": 0.3793323040008545, + "learning_rate": 4.384864963031952e-06, + "loss": 1.297593593597412, + "step": 1180 + }, + { + "epoch": 1.8073394495412844, + "grad_norm": 0.14626124501228333, + "learning_rate": 4.370167932384873e-06, + "loss": 1.1695170402526855, + "step": 1182 + }, + { + "epoch": 1.8103975535168195, + "grad_norm": 0.16865181922912598, + "learning_rate": 4.355483753051125e-06, + "loss": 1.2123092412948608, + "step": 1184 + }, + { + "epoch": 1.8134556574923546, + "grad_norm": 0.1931789070367813, + "learning_rate": 4.340812592056401e-06, + "loss": 0.9932126998901367, + "step": 1186 + }, + { + "epoch": 1.81651376146789, + "grad_norm": 0.2547837793827057, + "learning_rate": 4.326154616278326e-06, + "loss": 1.2431546449661255, + "step": 1188 + }, + { + "epoch": 1.8195718654434252, + "grad_norm": 0.23825769126415253, + "learning_rate": 4.311509992444539e-06, + "loss": 1.286515712738037, + "step": 1190 + }, + { + "epoch": 1.8226299694189603, + "grad_norm": 0.25244706869125366, + "learning_rate": 4.296878887130819e-06, + "loss": 1.3000450134277344, + "step": 1192 + }, + { + "epoch": 1.8256880733944953, + "grad_norm": 0.23451480269432068, + "learning_rate": 4.282261466759165e-06, + "loss": 1.2664532661437988, + "step": 1194 + }, + { + "epoch": 1.8287461773700304, + "grad_norm": 0.2735919952392578, + "learning_rate": 4.267657897595929e-06, + "loss": 1.288360834121704, + "step": 1196 + }, + { + "epoch": 1.8318042813455657, + "grad_norm": 0.18107269704341888, + "learning_rate": 4.253068345749903e-06, + "loss": 1.2625651359558105, + "step": 1198 + }, + { + "epoch": 1.834862385321101, + "grad_norm": 0.2293253242969513, + "learning_rate": 4.238492977170439e-06, + "loss": 1.234043836593628, + "step": 1200 + }, + { + "epoch": 1.837920489296636, + "grad_norm": 0.27160146832466125, + "learning_rate": 4.223931957645566e-06, + "loss": 1.300539493560791, + "step": 1202 + }, + { + "epoch": 1.8409785932721712, + "grad_norm": 0.25112462043762207, + "learning_rate": 4.2093854528000955e-06, + "loss": 1.2719401121139526, + "step": 1204 + }, + { + "epoch": 1.8440366972477065, + "grad_norm": 0.33997592329978943, + "learning_rate": 4.194853628093742e-06, + "loss": 1.2453508377075195, + "step": 1206 + }, + { + "epoch": 1.8470948012232415, + "grad_norm": 0.6576793789863586, + "learning_rate": 4.180336648819242e-06, + "loss": 1.233917236328125, + "step": 1208 + }, + { + "epoch": 1.8501529051987768, + "grad_norm": 0.26551222801208496, + "learning_rate": 4.165834680100469e-06, + "loss": 1.2595276832580566, + "step": 1210 + }, + { + "epoch": 1.853211009174312, + "grad_norm": 0.2170596420764923, + "learning_rate": 4.151347886890562e-06, + "loss": 1.2505378723144531, + "step": 1212 + }, + { + "epoch": 1.856269113149847, + "grad_norm": 0.2974804937839508, + "learning_rate": 4.1368764339700404e-06, + "loss": 1.2092756032943726, + "step": 1214 + }, + { + "epoch": 1.8593272171253823, + "grad_norm": 0.2567199468612671, + "learning_rate": 4.1224204859449425e-06, + "loss": 1.2698951959609985, + "step": 1216 + }, + { + "epoch": 1.8623853211009176, + "grad_norm": 0.23152267932891846, + "learning_rate": 4.107980207244937e-06, + "loss": 1.3027379512786865, + "step": 1218 + }, + { + "epoch": 1.8654434250764527, + "grad_norm": 0.26830926537513733, + "learning_rate": 4.093555762121469e-06, + "loss": 1.308929443359375, + "step": 1220 + }, + { + "epoch": 1.8685015290519877, + "grad_norm": 0.2566030025482178, + "learning_rate": 4.07914731464588e-06, + "loss": 1.2964577674865723, + "step": 1222 + }, + { + "epoch": 1.8715596330275228, + "grad_norm": 0.4025701582431793, + "learning_rate": 4.064755028707546e-06, + "loss": 1.31220543384552, + "step": 1224 + }, + { + "epoch": 1.8746177370030581, + "grad_norm": 0.25386303663253784, + "learning_rate": 4.0503790680120136e-06, + "loss": 1.299830436706543, + "step": 1226 + }, + { + "epoch": 1.8776758409785934, + "grad_norm": 0.39947405457496643, + "learning_rate": 4.036019596079136e-06, + "loss": 1.3202039003372192, + "step": 1228 + }, + { + "epoch": 1.8807339449541285, + "grad_norm": 0.23179592192173004, + "learning_rate": 4.021676776241218e-06, + "loss": 1.2405881881713867, + "step": 1230 + }, + { + "epoch": 1.8837920489296636, + "grad_norm": 0.48796483874320984, + "learning_rate": 4.007350771641151e-06, + "loss": 1.288329005241394, + "step": 1232 + }, + { + "epoch": 1.8868501529051986, + "grad_norm": 0.26645490527153015, + "learning_rate": 3.993041745230562e-06, + "loss": 1.2443333864212036, + "step": 1234 + }, + { + "epoch": 1.889908256880734, + "grad_norm": 0.19715459644794464, + "learning_rate": 3.978749859767961e-06, + "loss": 1.2754254341125488, + "step": 1236 + }, + { + "epoch": 1.8929663608562692, + "grad_norm": 0.2424282431602478, + "learning_rate": 3.9644752778168836e-06, + "loss": 1.2853577136993408, + "step": 1238 + }, + { + "epoch": 1.8960244648318043, + "grad_norm": 0.22451399266719818, + "learning_rate": 3.950218161744049e-06, + "loss": 1.308832049369812, + "step": 1240 + }, + { + "epoch": 1.8990825688073394, + "grad_norm": 0.38970160484313965, + "learning_rate": 3.935978673717512e-06, + "loss": 1.2945680618286133, + "step": 1242 + }, + { + "epoch": 1.9021406727828745, + "grad_norm": 0.22287186980247498, + "learning_rate": 3.921756975704809e-06, + "loss": 1.2276027202606201, + "step": 1244 + }, + { + "epoch": 1.9051987767584098, + "grad_norm": 0.2538350820541382, + "learning_rate": 3.9075532294711326e-06, + "loss": 1.2546557188034058, + "step": 1246 + }, + { + "epoch": 1.908256880733945, + "grad_norm": 0.19810384511947632, + "learning_rate": 3.893367596577475e-06, + "loss": 1.2940235137939453, + "step": 1248 + }, + { + "epoch": 1.9113149847094801, + "grad_norm": 0.20586298406124115, + "learning_rate": 3.8792002383788044e-06, + "loss": 1.3136601448059082, + "step": 1250 + }, + { + "epoch": 1.9143730886850152, + "grad_norm": 0.2770041227340698, + "learning_rate": 3.865051316022215e-06, + "loss": 1.2952957153320312, + "step": 1252 + }, + { + "epoch": 1.9174311926605505, + "grad_norm": 0.22728121280670166, + "learning_rate": 3.85092099044511e-06, + "loss": 1.271630048751831, + "step": 1254 + }, + { + "epoch": 1.9204892966360856, + "grad_norm": 0.1984010934829712, + "learning_rate": 3.836809422373354e-06, + "loss": 1.2360022068023682, + "step": 1256 + }, + { + "epoch": 1.9235474006116209, + "grad_norm": 0.24555295705795288, + "learning_rate": 3.822716772319463e-06, + "loss": 1.271683692932129, + "step": 1258 + }, + { + "epoch": 1.926605504587156, + "grad_norm": 0.20771312713623047, + "learning_rate": 3.8086432005807616e-06, + "loss": 1.2962419986724854, + "step": 1260 + }, + { + "epoch": 1.929663608562691, + "grad_norm": 0.268265962600708, + "learning_rate": 3.794588867237574e-06, + "loss": 1.2458467483520508, + "step": 1262 + }, + { + "epoch": 1.9327217125382263, + "grad_norm": 0.3802253007888794, + "learning_rate": 3.780553932151392e-06, + "loss": 1.2733559608459473, + "step": 1264 + }, + { + "epoch": 1.9357798165137616, + "grad_norm": 0.6309070587158203, + "learning_rate": 3.766538554963062e-06, + "loss": 1.270596981048584, + "step": 1266 + }, + { + "epoch": 1.9388379204892967, + "grad_norm": 0.3053569793701172, + "learning_rate": 3.752542895090969e-06, + "loss": 1.3194211721420288, + "step": 1268 + }, + { + "epoch": 1.9418960244648318, + "grad_norm": 0.21923166513442993, + "learning_rate": 3.7385671117292245e-06, + "loss": 1.3323618173599243, + "step": 1270 + }, + { + "epoch": 1.9449541284403669, + "grad_norm": 0.2166883647441864, + "learning_rate": 3.72461136384585e-06, + "loss": 1.2965784072875977, + "step": 1272 + }, + { + "epoch": 1.9480122324159022, + "grad_norm": 0.2825508117675781, + "learning_rate": 3.710675810180977e-06, + "loss": 1.3159446716308594, + "step": 1274 + }, + { + "epoch": 1.9510703363914375, + "grad_norm": 0.299638956785202, + "learning_rate": 3.696760609245035e-06, + "loss": 1.2833199501037598, + "step": 1276 + }, + { + "epoch": 1.9541284403669725, + "grad_norm": 0.2223178744316101, + "learning_rate": 3.68286591931695e-06, + "loss": 1.22653329372406, + "step": 1278 + }, + { + "epoch": 1.9571865443425076, + "grad_norm": 0.2592408359050751, + "learning_rate": 3.668991898442347e-06, + "loss": 1.2542335987091064, + "step": 1280 + }, + { + "epoch": 1.9602446483180427, + "grad_norm": 0.2755810618400574, + "learning_rate": 3.6551387044317464e-06, + "loss": 1.2745262384414673, + "step": 1282 + }, + { + "epoch": 1.963302752293578, + "grad_norm": 0.21057268977165222, + "learning_rate": 3.6413064948587773e-06, + "loss": 1.2521765232086182, + "step": 1284 + }, + { + "epoch": 1.9663608562691133, + "grad_norm": 0.34427741169929504, + "learning_rate": 3.6274954270583797e-06, + "loss": 1.263521432876587, + "step": 1286 + }, + { + "epoch": 1.9694189602446484, + "grad_norm": 0.2196524441242218, + "learning_rate": 3.6137056581250142e-06, + "loss": 1.3154864311218262, + "step": 1288 + }, + { + "epoch": 1.9724770642201834, + "grad_norm": 0.3191309869289398, + "learning_rate": 3.599937344910872e-06, + "loss": 1.2999801635742188, + "step": 1290 + }, + { + "epoch": 1.9755351681957185, + "grad_norm": 0.22587168216705322, + "learning_rate": 3.5861906440241057e-06, + "loss": 1.3176116943359375, + "step": 1292 + }, + { + "epoch": 1.9785932721712538, + "grad_norm": 0.2769485414028168, + "learning_rate": 3.5724657118270344e-06, + "loss": 1.273116111755371, + "step": 1294 + }, + { + "epoch": 1.981651376146789, + "grad_norm": 0.3299882411956787, + "learning_rate": 3.558762704434361e-06, + "loss": 1.268465280532837, + "step": 1296 + }, + { + "epoch": 1.9847094801223242, + "grad_norm": 0.26859885454177856, + "learning_rate": 3.545081777711412e-06, + "loss": 1.2919847965240479, + "step": 1298 + }, + { + "epoch": 1.9877675840978593, + "grad_norm": 0.9502137899398804, + "learning_rate": 3.5314230872723564e-06, + "loss": 1.342604160308838, + "step": 1300 + }, + { + "epoch": 1.9908256880733946, + "grad_norm": 0.2677958011627197, + "learning_rate": 3.5177867884784334e-06, + "loss": 1.3786706924438477, + "step": 1302 + }, + { + "epoch": 1.9938837920489296, + "grad_norm": 0.40644171833992004, + "learning_rate": 3.504173036436186e-06, + "loss": 1.7326993942260742, + "step": 1304 + }, + { + "epoch": 1.996941896024465, + "grad_norm": 0.45419755578041077, + "learning_rate": 3.4905819859957002e-06, + "loss": 1.7214076519012451, + "step": 1306 + }, + { + "epoch": 2.0, + "grad_norm": 0.9430392980575562, + "learning_rate": 3.4770137917488454e-06, + "loss": 1.8467901945114136, + "step": 1308 + }, + { + "epoch": 2.003058103975535, + "grad_norm": 0.26824504137039185, + "learning_rate": 3.463468608027505e-06, + "loss": 1.4361066818237305, + "step": 1310 + }, + { + "epoch": 2.00611620795107, + "grad_norm": 0.22578075528144836, + "learning_rate": 3.4499465889018337e-06, + "loss": 1.394030213356018, + "step": 1312 + }, + { + "epoch": 2.0091743119266057, + "grad_norm": 0.26776137948036194, + "learning_rate": 3.4364478881785002e-06, + "loss": 1.4127156734466553, + "step": 1314 + }, + { + "epoch": 2.0122324159021407, + "grad_norm": 0.3707635998725891, + "learning_rate": 3.4229726593989353e-06, + "loss": 1.340601921081543, + "step": 1316 + }, + { + "epoch": 2.015290519877676, + "grad_norm": 0.23890726268291473, + "learning_rate": 3.409521055837586e-06, + "loss": 1.5300512313842773, + "step": 1318 + }, + { + "epoch": 2.018348623853211, + "grad_norm": 0.21163959801197052, + "learning_rate": 3.396093230500176e-06, + "loss": 1.4162603616714478, + "step": 1320 + }, + { + "epoch": 2.021406727828746, + "grad_norm": 0.3320009112358093, + "learning_rate": 3.3826893361219614e-06, + "loss": 1.3640984296798706, + "step": 1322 + }, + { + "epoch": 2.0244648318042815, + "grad_norm": 0.2645728886127472, + "learning_rate": 3.3693095251659975e-06, + "loss": 1.4446080923080444, + "step": 1324 + }, + { + "epoch": 2.0275229357798166, + "grad_norm": 0.2824868857860565, + "learning_rate": 3.3559539498213965e-06, + "loss": 1.3105710744857788, + "step": 1326 + }, + { + "epoch": 2.0305810397553516, + "grad_norm": 0.23126038908958435, + "learning_rate": 3.342622762001606e-06, + "loss": 1.3857829570770264, + "step": 1328 + }, + { + "epoch": 2.0336391437308867, + "grad_norm": 0.3670974671840668, + "learning_rate": 3.3293161133426777e-06, + "loss": 1.496924638748169, + "step": 1330 + }, + { + "epoch": 2.036697247706422, + "grad_norm": 0.3528394401073456, + "learning_rate": 3.3160341552015375e-06, + "loss": 1.4135003089904785, + "step": 1332 + }, + { + "epoch": 2.0397553516819573, + "grad_norm": 0.20478151738643646, + "learning_rate": 3.3027770386542706e-06, + "loss": 1.2156240940093994, + "step": 1334 + }, + { + "epoch": 2.0428134556574924, + "grad_norm": 0.46617865562438965, + "learning_rate": 3.289544914494403e-06, + "loss": 1.3763898611068726, + "step": 1336 + }, + { + "epoch": 2.0458715596330275, + "grad_norm": 0.3884037733078003, + "learning_rate": 3.276337933231179e-06, + "loss": 1.622403860092163, + "step": 1338 + }, + { + "epoch": 2.0489296636085625, + "grad_norm": 0.25180479884147644, + "learning_rate": 3.2631562450878597e-06, + "loss": 1.2860331535339355, + "step": 1340 + }, + { + "epoch": 2.051987767584098, + "grad_norm": 0.3756599426269531, + "learning_rate": 3.2500000000000015e-06, + "loss": 1.4189289808273315, + "step": 1342 + }, + { + "epoch": 2.055045871559633, + "grad_norm": 0.32630693912506104, + "learning_rate": 3.236869347613764e-06, + "loss": 1.308931827545166, + "step": 1344 + }, + { + "epoch": 2.058103975535168, + "grad_norm": 0.28512176871299744, + "learning_rate": 3.2237644372842016e-06, + "loss": 1.2988288402557373, + "step": 1346 + }, + { + "epoch": 2.0611620795107033, + "grad_norm": 0.19952069222927094, + "learning_rate": 3.2106854180735625e-06, + "loss": 1.3092859983444214, + "step": 1348 + }, + { + "epoch": 2.0642201834862384, + "grad_norm": 0.24031268060207367, + "learning_rate": 3.1976324387495948e-06, + "loss": 1.3389842510223389, + "step": 1350 + }, + { + "epoch": 2.067278287461774, + "grad_norm": 0.26569297909736633, + "learning_rate": 3.1846056477838572e-06, + "loss": 1.5241750478744507, + "step": 1352 + }, + { + "epoch": 2.070336391437309, + "grad_norm": 0.5251048803329468, + "learning_rate": 3.171605193350028e-06, + "loss": 1.542860507965088, + "step": 1354 + }, + { + "epoch": 2.073394495412844, + "grad_norm": 0.34643858671188354, + "learning_rate": 3.158631223322216e-06, + "loss": 1.3612843751907349, + "step": 1356 + }, + { + "epoch": 2.076452599388379, + "grad_norm": 0.2934923470020294, + "learning_rate": 3.145683885273288e-06, + "loss": 1.355604648590088, + "step": 1358 + }, + { + "epoch": 2.079510703363914, + "grad_norm": 0.743224024772644, + "learning_rate": 3.1327633264731806e-06, + "loss": 1.341210126876831, + "step": 1360 + }, + { + "epoch": 2.0825688073394497, + "grad_norm": 0.32269051671028137, + "learning_rate": 3.11986969388723e-06, + "loss": 1.4118154048919678, + "step": 1362 + }, + { + "epoch": 2.085626911314985, + "grad_norm": 0.29159843921661377, + "learning_rate": 3.1070031341744983e-06, + "loss": 1.389265775680542, + "step": 1364 + }, + { + "epoch": 2.08868501529052, + "grad_norm": 0.24911250174045563, + "learning_rate": 3.094163793686108e-06, + "loss": 1.422662377357483, + "step": 1366 + }, + { + "epoch": 2.091743119266055, + "grad_norm": 0.21826767921447754, + "learning_rate": 3.0813518184635737e-06, + "loss": 1.4053363800048828, + "step": 1368 + }, + { + "epoch": 2.09480122324159, + "grad_norm": 0.3076784610748291, + "learning_rate": 3.0685673542371465e-06, + "loss": 1.283433198928833, + "step": 1370 + }, + { + "epoch": 2.0978593272171255, + "grad_norm": 0.17591321468353271, + "learning_rate": 3.0558105464241466e-06, + "loss": 1.237450361251831, + "step": 1372 + }, + { + "epoch": 2.1009174311926606, + "grad_norm": 0.2663421332836151, + "learning_rate": 3.0430815401273206e-06, + "loss": 1.3944424390792847, + "step": 1374 + }, + { + "epoch": 2.1039755351681957, + "grad_norm": 0.26904943585395813, + "learning_rate": 3.030380480133186e-06, + "loss": 1.5187671184539795, + "step": 1376 + }, + { + "epoch": 2.1070336391437308, + "grad_norm": 0.6649749279022217, + "learning_rate": 3.017707510910378e-06, + "loss": 1.3504502773284912, + "step": 1378 + }, + { + "epoch": 2.1100917431192663, + "grad_norm": 0.37516942620277405, + "learning_rate": 3.0050627766080188e-06, + "loss": 1.5420799255371094, + "step": 1380 + }, + { + "epoch": 2.1131498470948014, + "grad_norm": 0.342439204454422, + "learning_rate": 2.9924464210540717e-06, + "loss": 1.5547534227371216, + "step": 1382 + }, + { + "epoch": 2.1162079510703364, + "grad_norm": 0.48497647047042847, + "learning_rate": 2.979858587753698e-06, + "loss": 1.3153679370880127, + "step": 1384 + }, + { + "epoch": 2.1192660550458715, + "grad_norm": 0.39512813091278076, + "learning_rate": 2.96729941988764e-06, + "loss": 1.2663487195968628, + "step": 1386 + }, + { + "epoch": 2.1223241590214066, + "grad_norm": 0.3283194899559021, + "learning_rate": 2.9547690603105774e-06, + "loss": 1.4247238636016846, + "step": 1388 + }, + { + "epoch": 2.1253822629969417, + "grad_norm": 0.3506661355495453, + "learning_rate": 2.942267651549513e-06, + "loss": 1.2393386363983154, + "step": 1390 + }, + { + "epoch": 2.128440366972477, + "grad_norm": 0.3594140112400055, + "learning_rate": 2.9297953358021487e-06, + "loss": 1.317380666732788, + "step": 1392 + }, + { + "epoch": 2.1314984709480123, + "grad_norm": 0.5971735715866089, + "learning_rate": 2.9173522549352608e-06, + "loss": 1.2773442268371582, + "step": 1394 + }, + { + "epoch": 2.1345565749235473, + "grad_norm": 0.3666265606880188, + "learning_rate": 2.9049385504830987e-06, + "loss": 1.34925377368927, + "step": 1396 + }, + { + "epoch": 2.1376146788990824, + "grad_norm": 0.31561410427093506, + "learning_rate": 2.892554363645766e-06, + "loss": 1.2674505710601807, + "step": 1398 + }, + { + "epoch": 2.140672782874618, + "grad_norm": 0.2038232684135437, + "learning_rate": 2.880199835287618e-06, + "loss": 1.3169916868209839, + "step": 1400 + }, + { + "epoch": 2.143730886850153, + "grad_norm": 0.25303685665130615, + "learning_rate": 2.867875105935658e-06, + "loss": 1.4587633609771729, + "step": 1402 + }, + { + "epoch": 2.146788990825688, + "grad_norm": 0.31143543124198914, + "learning_rate": 2.8555803157779384e-06, + "loss": 1.3396885395050049, + "step": 1404 + }, + { + "epoch": 2.149847094801223, + "grad_norm": 0.2281101942062378, + "learning_rate": 2.8433156046619705e-06, + "loss": 1.2936108112335205, + "step": 1406 + }, + { + "epoch": 2.1529051987767582, + "grad_norm": 0.3648523688316345, + "learning_rate": 2.831081112093129e-06, + "loss": 1.5100679397583008, + "step": 1408 + }, + { + "epoch": 2.1559633027522938, + "grad_norm": 0.278677374124527, + "learning_rate": 2.8188769772330637e-06, + "loss": 1.3869754076004028, + "step": 1410 + }, + { + "epoch": 2.159021406727829, + "grad_norm": 0.21437983214855194, + "learning_rate": 2.806703338898123e-06, + "loss": 1.3129749298095703, + "step": 1412 + }, + { + "epoch": 2.162079510703364, + "grad_norm": 0.24729043245315552, + "learning_rate": 2.794560335557771e-06, + "loss": 1.4099204540252686, + "step": 1414 + }, + { + "epoch": 2.165137614678899, + "grad_norm": 0.3120039701461792, + "learning_rate": 2.7824481053330154e-06, + "loss": 1.3897459506988525, + "step": 1416 + }, + { + "epoch": 2.168195718654434, + "grad_norm": 0.4525415897369385, + "learning_rate": 2.770366785994827e-06, + "loss": 1.445647954940796, + "step": 1418 + }, + { + "epoch": 2.1712538226299696, + "grad_norm": 0.4266716241836548, + "learning_rate": 2.758316514962585e-06, + "loss": 1.3233726024627686, + "step": 1420 + }, + { + "epoch": 2.1743119266055047, + "grad_norm": 0.28266647458076477, + "learning_rate": 2.7462974293025112e-06, + "loss": 1.4238274097442627, + "step": 1422 + }, + { + "epoch": 2.1773700305810397, + "grad_norm": 0.3248072564601898, + "learning_rate": 2.7343096657261e-06, + "loss": 1.3104677200317383, + "step": 1424 + }, + { + "epoch": 2.180428134556575, + "grad_norm": 0.3584449887275696, + "learning_rate": 2.7223533605885784e-06, + "loss": 1.6277508735656738, + "step": 1426 + }, + { + "epoch": 2.18348623853211, + "grad_norm": 0.35764527320861816, + "learning_rate": 2.710428649887348e-06, + "loss": 1.3882687091827393, + "step": 1428 + }, + { + "epoch": 2.1865443425076454, + "grad_norm": 0.24804551899433136, + "learning_rate": 2.6985356692604336e-06, + "loss": 1.4513651132583618, + "step": 1430 + }, + { + "epoch": 2.1896024464831805, + "grad_norm": 0.2202014923095703, + "learning_rate": 2.686674553984951e-06, + "loss": 1.4342420101165771, + "step": 1432 + }, + { + "epoch": 2.1926605504587156, + "grad_norm": 0.36250677704811096, + "learning_rate": 2.6748454389755576e-06, + "loss": 1.394620656967163, + "step": 1434 + }, + { + "epoch": 2.1957186544342506, + "grad_norm": 0.3232296109199524, + "learning_rate": 2.6630484587829265e-06, + "loss": 1.3978071212768555, + "step": 1436 + }, + { + "epoch": 2.198776758409786, + "grad_norm": 0.4420628547668457, + "learning_rate": 2.651283747592211e-06, + "loss": 1.4031468629837036, + "step": 1438 + }, + { + "epoch": 2.2018348623853212, + "grad_norm": 0.6229142546653748, + "learning_rate": 2.639551439221516e-06, + "loss": 1.3914484977722168, + "step": 1440 + }, + { + "epoch": 2.2048929663608563, + "grad_norm": 0.3233772814273834, + "learning_rate": 2.627851667120387e-06, + "loss": 1.476043701171875, + "step": 1442 + }, + { + "epoch": 2.2079510703363914, + "grad_norm": 0.35107681155204773, + "learning_rate": 2.6161845643682763e-06, + "loss": 1.407777190208435, + "step": 1444 + }, + { + "epoch": 2.2110091743119265, + "grad_norm": 0.3123028874397278, + "learning_rate": 2.6045502636730457e-06, + "loss": 1.3102259635925293, + "step": 1446 + }, + { + "epoch": 2.214067278287462, + "grad_norm": 0.2534146308898926, + "learning_rate": 2.5929488973694406e-06, + "loss": 1.2788276672363281, + "step": 1448 + }, + { + "epoch": 2.217125382262997, + "grad_norm": 0.24462664127349854, + "learning_rate": 2.581380597417599e-06, + "loss": 1.3362743854522705, + "step": 1450 + }, + { + "epoch": 2.220183486238532, + "grad_norm": 0.2978283166885376, + "learning_rate": 2.569845495401542e-06, + "loss": 1.2902576923370361, + "step": 1452 + }, + { + "epoch": 2.223241590214067, + "grad_norm": 0.299277126789093, + "learning_rate": 2.5583437225276818e-06, + "loss": 1.3449206352233887, + "step": 1454 + }, + { + "epoch": 2.2262996941896023, + "grad_norm": 0.36601486802101135, + "learning_rate": 2.546875409623324e-06, + "loss": 1.3038407564163208, + "step": 1456 + }, + { + "epoch": 2.229357798165138, + "grad_norm": 0.42299339175224304, + "learning_rate": 2.5354406871351833e-06, + "loss": 1.5554304122924805, + "step": 1458 + }, + { + "epoch": 2.232415902140673, + "grad_norm": 0.32388123869895935, + "learning_rate": 2.5240396851279043e-06, + "loss": 1.5746049880981445, + "step": 1460 + }, + { + "epoch": 2.235474006116208, + "grad_norm": 0.39095836877822876, + "learning_rate": 2.5126725332825675e-06, + "loss": 1.6094728708267212, + "step": 1462 + }, + { + "epoch": 2.238532110091743, + "grad_norm": 0.5842258930206299, + "learning_rate": 2.501339360895231e-06, + "loss": 1.5279463529586792, + "step": 1464 + }, + { + "epoch": 2.241590214067278, + "grad_norm": 0.3429890275001526, + "learning_rate": 2.4900402968754504e-06, + "loss": 1.5856099128723145, + "step": 1466 + }, + { + "epoch": 2.2446483180428136, + "grad_norm": 0.35519224405288696, + "learning_rate": 2.4787754697448153e-06, + "loss": 1.4757394790649414, + "step": 1468 + }, + { + "epoch": 2.2477064220183487, + "grad_norm": 0.46203580498695374, + "learning_rate": 2.4675450076354822e-06, + "loss": 1.584846019744873, + "step": 1470 + }, + { + "epoch": 2.2507645259938838, + "grad_norm": 0.8099899888038635, + "learning_rate": 2.4563490382887267e-06, + "loss": 1.367172360420227, + "step": 1472 + }, + { + "epoch": 2.253822629969419, + "grad_norm": 0.7287035584449768, + "learning_rate": 2.4451876890534847e-06, + "loss": 1.492293357849121, + "step": 1474 + }, + { + "epoch": 2.2568807339449544, + "grad_norm": 0.3203519284725189, + "learning_rate": 2.4340610868849e-06, + "loss": 1.2751667499542236, + "step": 1476 + }, + { + "epoch": 2.2599388379204894, + "grad_norm": 0.6493098139762878, + "learning_rate": 2.4229693583428916e-06, + "loss": 1.4823472499847412, + "step": 1478 + }, + { + "epoch": 2.2629969418960245, + "grad_norm": 0.4101910889148712, + "learning_rate": 2.4119126295906997e-06, + "loss": 1.09395170211792, + "step": 1480 + }, + { + "epoch": 2.2660550458715596, + "grad_norm": 0.4682796597480774, + "learning_rate": 2.400891026393464e-06, + "loss": 1.0601507425308228, + "step": 1482 + }, + { + "epoch": 2.2691131498470947, + "grad_norm": 0.5146844387054443, + "learning_rate": 2.3899046741167868e-06, + "loss": 1.2724342346191406, + "step": 1484 + }, + { + "epoch": 2.2721712538226297, + "grad_norm": 0.8610156178474426, + "learning_rate": 2.3789536977253034e-06, + "loss": 1.3352521657943726, + "step": 1486 + }, + { + "epoch": 2.2752293577981653, + "grad_norm": 1.053831696510315, + "learning_rate": 2.3680382217812685e-06, + "loss": 1.4391016960144043, + "step": 1488 + }, + { + "epoch": 2.2782874617737003, + "grad_norm": 0.6413374543190002, + "learning_rate": 2.3571583704431355e-06, + "loss": 1.3907897472381592, + "step": 1490 + }, + { + "epoch": 2.2813455657492354, + "grad_norm": 0.30044737458229065, + "learning_rate": 2.346314267464145e-06, + "loss": 1.1618599891662598, + "step": 1492 + }, + { + "epoch": 2.2844036697247705, + "grad_norm": 0.3427642285823822, + "learning_rate": 2.3355060361909134e-06, + "loss": 1.134230375289917, + "step": 1494 + }, + { + "epoch": 2.287461773700306, + "grad_norm": 0.28166523575782776, + "learning_rate": 2.3247337995620363e-06, + "loss": 1.357274055480957, + "step": 1496 + }, + { + "epoch": 2.290519877675841, + "grad_norm": 0.7598418593406677, + "learning_rate": 2.313997680106686e-06, + "loss": 1.2663555145263672, + "step": 1498 + }, + { + "epoch": 2.293577981651376, + "grad_norm": 1.0048569440841675, + "learning_rate": 2.3032977999432205e-06, + "loss": 1.2259790897369385, + "step": 1500 + }, + { + "epoch": 2.2966360856269112, + "grad_norm": 0.3067741096019745, + "learning_rate": 2.2926342807777886e-06, + "loss": 1.435164213180542, + "step": 1502 + }, + { + "epoch": 2.2996941896024463, + "grad_norm": 0.5623937249183655, + "learning_rate": 2.2820072439029524e-06, + "loss": 1.4023568630218506, + "step": 1504 + }, + { + "epoch": 2.302752293577982, + "grad_norm": 0.3359718918800354, + "learning_rate": 2.271416810196308e-06, + "loss": 1.1277801990509033, + "step": 1506 + }, + { + "epoch": 2.305810397553517, + "grad_norm": 0.3305533528327942, + "learning_rate": 2.2608631001190994e-06, + "loss": 1.3414134979248047, + "step": 1508 + }, + { + "epoch": 2.308868501529052, + "grad_norm": 0.28481531143188477, + "learning_rate": 2.2503462337148642e-06, + "loss": 1.4879052639007568, + "step": 1510 + }, + { + "epoch": 2.311926605504587, + "grad_norm": 0.28595951199531555, + "learning_rate": 2.239866330608057e-06, + "loss": 1.6209688186645508, + "step": 1512 + }, + { + "epoch": 2.314984709480122, + "grad_norm": 0.29558923840522766, + "learning_rate": 2.2294235100026933e-06, + "loss": 1.6481235027313232, + "step": 1514 + }, + { + "epoch": 2.3180428134556577, + "grad_norm": 0.5758782029151917, + "learning_rate": 2.21901789068099e-06, + "loss": 1.7679166793823242, + "step": 1516 + }, + { + "epoch": 2.3211009174311927, + "grad_norm": 0.3111439347267151, + "learning_rate": 2.2086495910020192e-06, + "loss": 1.3151183128356934, + "step": 1518 + }, + { + "epoch": 2.324159021406728, + "grad_norm": 0.44918501377105713, + "learning_rate": 2.1983187289003587e-06, + "loss": 1.3933916091918945, + "step": 1520 + }, + { + "epoch": 2.327217125382263, + "grad_norm": 0.3173042833805084, + "learning_rate": 2.188025421884754e-06, + "loss": 1.240437388420105, + "step": 1522 + }, + { + "epoch": 2.330275229357798, + "grad_norm": 0.2350539118051529, + "learning_rate": 2.1777697870367713e-06, + "loss": 1.1647779941558838, + "step": 1524 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.3137843906879425, + "learning_rate": 2.1675519410094803e-06, + "loss": 1.5445265769958496, + "step": 1526 + }, + { + "epoch": 2.3363914373088686, + "grad_norm": 0.5268841981887817, + "learning_rate": 2.157372000026119e-06, + "loss": 1.444595217704773, + "step": 1528 + }, + { + "epoch": 2.3394495412844036, + "grad_norm": 0.3506692349910736, + "learning_rate": 2.1472300798787746e-06, + "loss": 1.6354224681854248, + "step": 1530 + }, + { + "epoch": 2.3425076452599387, + "grad_norm": 0.3233583867549896, + "learning_rate": 2.1371262959270594e-06, + "loss": 1.1021732091903687, + "step": 1532 + }, + { + "epoch": 2.3455657492354742, + "grad_norm": 0.29296091198921204, + "learning_rate": 2.1270607630968104e-06, + "loss": 1.3453254699707031, + "step": 1534 + }, + { + "epoch": 2.3486238532110093, + "grad_norm": 0.3317727744579315, + "learning_rate": 2.1170335958787736e-06, + "loss": 1.607575535774231, + "step": 1536 + }, + { + "epoch": 2.3516819571865444, + "grad_norm": 0.2295382171869278, + "learning_rate": 2.1070449083273047e-06, + "loss": 1.3497262001037598, + "step": 1538 + }, + { + "epoch": 2.3547400611620795, + "grad_norm": 0.4568946957588196, + "learning_rate": 2.0970948140590672e-06, + "loss": 1.509822130203247, + "step": 1540 + }, + { + "epoch": 2.3577981651376145, + "grad_norm": 0.34416595101356506, + "learning_rate": 2.08718342625175e-06, + "loss": 1.385573148727417, + "step": 1542 + }, + { + "epoch": 2.3608562691131496, + "grad_norm": 0.33610644936561584, + "learning_rate": 2.077310857642772e-06, + "loss": 1.3133833408355713, + "step": 1544 + }, + { + "epoch": 2.363914373088685, + "grad_norm": 0.332163006067276, + "learning_rate": 2.067477220527998e-06, + "loss": 1.3794035911560059, + "step": 1546 + }, + { + "epoch": 2.36697247706422, + "grad_norm": 0.46091410517692566, + "learning_rate": 2.05768262676047e-06, + "loss": 1.4221172332763672, + "step": 1548 + }, + { + "epoch": 2.3700305810397553, + "grad_norm": 0.2670794427394867, + "learning_rate": 2.0479271877491278e-06, + "loss": 1.2908828258514404, + "step": 1550 + }, + { + "epoch": 2.3730886850152904, + "grad_norm": 0.31927385926246643, + "learning_rate": 2.038211014457546e-06, + "loss": 1.3988337516784668, + "step": 1552 + }, + { + "epoch": 2.376146788990826, + "grad_norm": 0.4126211404800415, + "learning_rate": 2.028534217402667e-06, + "loss": 1.7016716003417969, + "step": 1554 + }, + { + "epoch": 2.379204892966361, + "grad_norm": 0.6094360947608948, + "learning_rate": 2.0188969066535484e-06, + "loss": 2.0326876640319824, + "step": 1556 + }, + { + "epoch": 2.382262996941896, + "grad_norm": 0.40967652201652527, + "learning_rate": 2.0092991918301106e-06, + "loss": 1.3301377296447754, + "step": 1558 + }, + { + "epoch": 2.385321100917431, + "grad_norm": 0.6155174970626831, + "learning_rate": 1.9997411821018885e-06, + "loss": 1.319265604019165, + "step": 1560 + }, + { + "epoch": 2.388379204892966, + "grad_norm": 0.4441206455230713, + "learning_rate": 1.990222986186786e-06, + "loss": 1.3922169208526611, + "step": 1562 + }, + { + "epoch": 2.3914373088685017, + "grad_norm": 0.5924298167228699, + "learning_rate": 1.980744712349849e-06, + "loss": 1.4741730690002441, + "step": 1564 + }, + { + "epoch": 2.3944954128440368, + "grad_norm": 0.42252296209335327, + "learning_rate": 1.9713064684020262e-06, + "loss": 1.4076108932495117, + "step": 1566 + }, + { + "epoch": 2.397553516819572, + "grad_norm": 0.36031708121299744, + "learning_rate": 1.9619083616989457e-06, + "loss": 1.278861403465271, + "step": 1568 + }, + { + "epoch": 2.400611620795107, + "grad_norm": 0.24064381420612335, + "learning_rate": 1.952550499139689e-06, + "loss": 1.19804048538208, + "step": 1570 + }, + { + "epoch": 2.4036697247706424, + "grad_norm": 0.18197159469127655, + "learning_rate": 1.9432329871655837e-06, + "loss": 1.12447988986969, + "step": 1572 + }, + { + "epoch": 2.4067278287461775, + "grad_norm": 0.30438297986984253, + "learning_rate": 1.933955931758988e-06, + "loss": 1.2643486261367798, + "step": 1574 + }, + { + "epoch": 2.4097859327217126, + "grad_norm": 0.5426669120788574, + "learning_rate": 1.9247194384420855e-06, + "loss": 1.504340410232544, + "step": 1576 + }, + { + "epoch": 2.4128440366972477, + "grad_norm": 0.6118716597557068, + "learning_rate": 1.915523612275681e-06, + "loss": 1.5359920263290405, + "step": 1578 + }, + { + "epoch": 2.4159021406727827, + "grad_norm": 0.5290548801422119, + "learning_rate": 1.9063685578580137e-06, + "loss": 1.5219250917434692, + "step": 1580 + }, + { + "epoch": 2.418960244648318, + "grad_norm": 0.348886638879776, + "learning_rate": 1.8972543793235626e-06, + "loss": 1.5620722770690918, + "step": 1582 + }, + { + "epoch": 2.4220183486238533, + "grad_norm": 0.4480542838573456, + "learning_rate": 1.8881811803418624e-06, + "loss": 1.3870704174041748, + "step": 1584 + }, + { + "epoch": 2.4250764525993884, + "grad_norm": 0.6594481468200684, + "learning_rate": 1.8791490641163218e-06, + "loss": 1.5246330499649048, + "step": 1586 + }, + { + "epoch": 2.4281345565749235, + "grad_norm": 0.48964548110961914, + "learning_rate": 1.870158133383055e-06, + "loss": 1.4073295593261719, + "step": 1588 + }, + { + "epoch": 2.4311926605504586, + "grad_norm": 0.40440455079078674, + "learning_rate": 1.8612084904097117e-06, + "loss": 1.329315423965454, + "step": 1590 + }, + { + "epoch": 2.434250764525994, + "grad_norm": 0.3714819550514221, + "learning_rate": 1.852300236994308e-06, + "loss": 1.3444490432739258, + "step": 1592 + }, + { + "epoch": 2.437308868501529, + "grad_norm": 0.5145377516746521, + "learning_rate": 1.8434334744640763e-06, + "loss": 1.5467479228973389, + "step": 1594 + }, + { + "epoch": 2.4403669724770642, + "grad_norm": 0.46002912521362305, + "learning_rate": 1.8346083036743104e-06, + "loss": 1.289878249168396, + "step": 1596 + }, + { + "epoch": 2.4434250764525993, + "grad_norm": 0.793483555316925, + "learning_rate": 1.8258248250072158e-06, + "loss": 1.4660496711730957, + "step": 1598 + }, + { + "epoch": 2.4464831804281344, + "grad_norm": 0.44911351799964905, + "learning_rate": 1.8170831383707683e-06, + "loss": 1.3652875423431396, + "step": 1600 + }, + { + "epoch": 2.44954128440367, + "grad_norm": 0.38207677006721497, + "learning_rate": 1.8083833431975805e-06, + "loss": 1.3762791156768799, + "step": 1602 + }, + { + "epoch": 2.452599388379205, + "grad_norm": 0.4357513189315796, + "learning_rate": 1.7997255384437695e-06, + "loss": 1.5232503414154053, + "step": 1604 + }, + { + "epoch": 2.45565749235474, + "grad_norm": 0.3423779308795929, + "learning_rate": 1.7911098225878309e-06, + "loss": 1.5271486043930054, + "step": 1606 + }, + { + "epoch": 2.458715596330275, + "grad_norm": 5.960415363311768, + "learning_rate": 1.7825362936295171e-06, + "loss": 1.3485842943191528, + "step": 1608 + }, + { + "epoch": 2.46177370030581, + "grad_norm": 0.36111417412757874, + "learning_rate": 1.774005049088725e-06, + "loss": 1.2900433540344238, + "step": 1610 + }, + { + "epoch": 2.4648318042813457, + "grad_norm": 0.33147767186164856, + "learning_rate": 1.7655161860043873e-06, + "loss": 1.4210761785507202, + "step": 1612 + }, + { + "epoch": 2.467889908256881, + "grad_norm": 0.3786766231060028, + "learning_rate": 1.7570698009333664e-06, + "loss": 1.370017409324646, + "step": 1614 + }, + { + "epoch": 2.470948012232416, + "grad_norm": 1.8267617225646973, + "learning_rate": 1.7486659899493537e-06, + "loss": 1.5153461694717407, + "step": 1616 + }, + { + "epoch": 2.474006116207951, + "grad_norm": 0.3199278712272644, + "learning_rate": 1.740304848641787e-06, + "loss": 1.3838684558868408, + "step": 1618 + }, + { + "epoch": 2.477064220183486, + "grad_norm": 0.3670620322227478, + "learning_rate": 1.731986472114751e-06, + "loss": 1.33723783493042, + "step": 1620 + }, + { + "epoch": 2.4801223241590216, + "grad_norm": 0.36861374974250793, + "learning_rate": 1.7237109549859043e-06, + "loss": 1.2932226657867432, + "step": 1622 + }, + { + "epoch": 2.4831804281345566, + "grad_norm": 0.34438320994377136, + "learning_rate": 1.7154783913853968e-06, + "loss": 1.42689049243927, + "step": 1624 + }, + { + "epoch": 2.4862385321100917, + "grad_norm": 0.23838122189044952, + "learning_rate": 1.7072888749548033e-06, + "loss": 1.4100431203842163, + "step": 1626 + }, + { + "epoch": 2.489296636085627, + "grad_norm": 0.46484264731407166, + "learning_rate": 1.6991424988460592e-06, + "loss": 1.3829045295715332, + "step": 1628 + }, + { + "epoch": 2.4923547400611623, + "grad_norm": 0.3008574843406677, + "learning_rate": 1.6910393557203964e-06, + "loss": 1.5693084001541138, + "step": 1630 + }, + { + "epoch": 2.4954128440366974, + "grad_norm": 0.37115153670310974, + "learning_rate": 1.6829795377472908e-06, + "loss": 1.7590757608413696, + "step": 1632 + }, + { + "epoch": 2.4984709480122325, + "grad_norm": 0.616698682308197, + "learning_rate": 1.674963136603417e-06, + "loss": 1.6397650241851807, + "step": 1634 + }, + { + "epoch": 2.5015290519877675, + "grad_norm": 0.384959876537323, + "learning_rate": 1.6669902434716046e-06, + "loss": 1.6299896240234375, + "step": 1636 + }, + { + "epoch": 2.5045871559633026, + "grad_norm": 0.8294275403022766, + "learning_rate": 1.6590609490397958e-06, + "loss": 1.5394856929779053, + "step": 1638 + }, + { + "epoch": 2.5076452599388377, + "grad_norm": 0.40894415974617004, + "learning_rate": 1.6511753435000205e-06, + "loss": 1.2182371616363525, + "step": 1640 + }, + { + "epoch": 2.510703363914373, + "grad_norm": 0.45905759930610657, + "learning_rate": 1.6433335165473686e-06, + "loss": 1.2023439407348633, + "step": 1642 + }, + { + "epoch": 2.5137614678899083, + "grad_norm": 0.38532376289367676, + "learning_rate": 1.635535557378968e-06, + "loss": 1.6095008850097656, + "step": 1644 + }, + { + "epoch": 2.5168195718654434, + "grad_norm": 1.44415283203125, + "learning_rate": 1.6277815546929688e-06, + "loss": 1.6082322597503662, + "step": 1646 + }, + { + "epoch": 2.5198776758409784, + "grad_norm": 0.5093996524810791, + "learning_rate": 1.6200715966875394e-06, + "loss": 1.7141090631484985, + "step": 1648 + }, + { + "epoch": 2.522935779816514, + "grad_norm": 0.5241023898124695, + "learning_rate": 1.6124057710598603e-06, + "loss": 1.6450610160827637, + "step": 1650 + }, + { + "epoch": 2.525993883792049, + "grad_norm": 0.49204516410827637, + "learning_rate": 1.6047841650051272e-06, + "loss": 1.6974513530731201, + "step": 1652 + }, + { + "epoch": 2.529051987767584, + "grad_norm": 0.8506813049316406, + "learning_rate": 1.5972068652155554e-06, + "loss": 1.5313912630081177, + "step": 1654 + }, + { + "epoch": 2.532110091743119, + "grad_norm": 0.33754727244377136, + "learning_rate": 1.5896739578794e-06, + "loss": 1.5209699869155884, + "step": 1656 + }, + { + "epoch": 2.5351681957186543, + "grad_norm": 0.7774704694747925, + "learning_rate": 1.5821855286799742e-06, + "loss": 1.4035563468933105, + "step": 1658 + }, + { + "epoch": 2.5382262996941893, + "grad_norm": 0.6433319449424744, + "learning_rate": 1.5747416627946673e-06, + "loss": 1.665273666381836, + "step": 1660 + }, + { + "epoch": 2.541284403669725, + "grad_norm": 0.6971220970153809, + "learning_rate": 1.5673424448939887e-06, + "loss": 1.5019344091415405, + "step": 1662 + }, + { + "epoch": 2.54434250764526, + "grad_norm": 0.40314802527427673, + "learning_rate": 1.5599879591405917e-06, + "loss": 1.1620054244995117, + "step": 1664 + }, + { + "epoch": 2.547400611620795, + "grad_norm": 0.48018017411231995, + "learning_rate": 1.552678289188326e-06, + "loss": 1.6923828125, + "step": 1666 + }, + { + "epoch": 2.5504587155963305, + "grad_norm": 0.4809359312057495, + "learning_rate": 1.545413518181283e-06, + "loss": 1.7656713724136353, + "step": 1668 + }, + { + "epoch": 2.5535168195718656, + "grad_norm": 0.40401753783226013, + "learning_rate": 1.5381937287528449e-06, + "loss": 1.8313161134719849, + "step": 1670 + }, + { + "epoch": 2.5565749235474007, + "grad_norm": 0.4581202268600464, + "learning_rate": 1.5310190030247546e-06, + "loss": 1.7572789192199707, + "step": 1672 + }, + { + "epoch": 2.5596330275229358, + "grad_norm": 0.9305920600891113, + "learning_rate": 1.5238894226061737e-06, + "loss": 1.7307026386260986, + "step": 1674 + }, + { + "epoch": 2.562691131498471, + "grad_norm": 0.47380930185317993, + "learning_rate": 1.5168050685927566e-06, + "loss": 1.5947740077972412, + "step": 1676 + }, + { + "epoch": 2.565749235474006, + "grad_norm": 1.2263463735580444, + "learning_rate": 1.5097660215657306e-06, + "loss": 1.4555588960647583, + "step": 1678 + }, + { + "epoch": 2.5688073394495414, + "grad_norm": 0.43118909001350403, + "learning_rate": 1.5027723615909745e-06, + "loss": 1.0147868394851685, + "step": 1680 + }, + { + "epoch": 2.5718654434250765, + "grad_norm": 0.5391921401023865, + "learning_rate": 1.4958241682181137e-06, + "loss": 1.0223249197006226, + "step": 1682 + }, + { + "epoch": 2.5749235474006116, + "grad_norm": 0.2522028386592865, + "learning_rate": 1.4889215204796082e-06, + "loss": 1.250197172164917, + "step": 1684 + }, + { + "epoch": 2.5779816513761467, + "grad_norm": 0.29159918427467346, + "learning_rate": 1.4820644968898605e-06, + "loss": 1.1835776567459106, + "step": 1686 + }, + { + "epoch": 2.581039755351682, + "grad_norm": 0.2946909964084625, + "learning_rate": 1.47525317544432e-06, + "loss": 1.1374409198760986, + "step": 1688 + }, + { + "epoch": 2.5840978593272173, + "grad_norm": 0.19036340713500977, + "learning_rate": 1.468487633618594e-06, + "loss": 1.1817882061004639, + "step": 1690 + }, + { + "epoch": 2.5871559633027523, + "grad_norm": 1.4873279333114624, + "learning_rate": 1.4617679483675673e-06, + "loss": 1.4171775579452515, + "step": 1692 + }, + { + "epoch": 2.5902140672782874, + "grad_norm": 0.32151684165000916, + "learning_rate": 1.4550941961245288e-06, + "loss": 1.3625459671020508, + "step": 1694 + }, + { + "epoch": 2.5932721712538225, + "grad_norm": 0.26637983322143555, + "learning_rate": 1.4484664528003026e-06, + "loss": 1.2058180570602417, + "step": 1696 + }, + { + "epoch": 2.5963302752293576, + "grad_norm": 0.5087877511978149, + "learning_rate": 1.4418847937823784e-06, + "loss": 1.425114631652832, + "step": 1698 + }, + { + "epoch": 2.599388379204893, + "grad_norm": 0.9368872046470642, + "learning_rate": 1.4353492939340618e-06, + "loss": 1.4749643802642822, + "step": 1700 + }, + { + "epoch": 2.602446483180428, + "grad_norm": 0.48912081122398376, + "learning_rate": 1.4288600275936184e-06, + "loss": 1.245436668395996, + "step": 1702 + }, + { + "epoch": 2.6055045871559632, + "grad_norm": 0.4674423635005951, + "learning_rate": 1.4224170685734303e-06, + "loss": 1.4404422044754028, + "step": 1704 + }, + { + "epoch": 2.6085626911314987, + "grad_norm": 0.7305318117141724, + "learning_rate": 1.416020490159152e-06, + "loss": 1.6482999324798584, + "step": 1706 + }, + { + "epoch": 2.611620795107034, + "grad_norm": 0.5728065371513367, + "learning_rate": 1.4096703651088848e-06, + "loss": 1.1557910442352295, + "step": 1708 + }, + { + "epoch": 2.614678899082569, + "grad_norm": 0.6479355096817017, + "learning_rate": 1.4033667656523405e-06, + "loss": 1.4093899726867676, + "step": 1710 + }, + { + "epoch": 2.617737003058104, + "grad_norm": 1.1274484395980835, + "learning_rate": 1.3971097634900262e-06, + "loss": 1.4923943281173706, + "step": 1712 + }, + { + "epoch": 2.620795107033639, + "grad_norm": 0.5374640822410583, + "learning_rate": 1.3908994297924275e-06, + "loss": 1.3800336122512817, + "step": 1714 + }, + { + "epoch": 2.623853211009174, + "grad_norm": 0.6038364171981812, + "learning_rate": 1.3847358351991945e-06, + "loss": 1.2194199562072754, + "step": 1716 + }, + { + "epoch": 2.6269113149847096, + "grad_norm": 0.7064008712768555, + "learning_rate": 1.3786190498183446e-06, + "loss": 0.8604775667190552, + "step": 1718 + }, + { + "epoch": 2.6299694189602447, + "grad_norm": 0.3798482418060303, + "learning_rate": 1.3725491432254627e-06, + "loss": 1.5459158420562744, + "step": 1720 + }, + { + "epoch": 2.63302752293578, + "grad_norm": 0.47553232312202454, + "learning_rate": 1.3665261844629053e-06, + "loss": 1.466538429260254, + "step": 1722 + }, + { + "epoch": 2.636085626911315, + "grad_norm": 0.3397771716117859, + "learning_rate": 1.360550242039024e-06, + "loss": 1.3562582731246948, + "step": 1724 + }, + { + "epoch": 2.6391437308868504, + "grad_norm": 0.282279908657074, + "learning_rate": 1.354621383927379e-06, + "loss": 1.4752657413482666, + "step": 1726 + }, + { + "epoch": 2.6422018348623855, + "grad_norm": 0.3183048963546753, + "learning_rate": 1.3487396775659691e-06, + "loss": 1.4154858589172363, + "step": 1728 + }, + { + "epoch": 2.6452599388379205, + "grad_norm": 0.4210142493247986, + "learning_rate": 1.3429051898564623e-06, + "loss": 1.3750901222229004, + "step": 1730 + }, + { + "epoch": 2.6483180428134556, + "grad_norm": 0.6870266795158386, + "learning_rate": 1.337117987163439e-06, + "loss": 1.5814931392669678, + "step": 1732 + }, + { + "epoch": 2.6513761467889907, + "grad_norm": 0.4824894964694977, + "learning_rate": 1.3313781353136329e-06, + "loss": 1.2281584739685059, + "step": 1734 + }, + { + "epoch": 2.6544342507645258, + "grad_norm": 0.2543982267379761, + "learning_rate": 1.3256856995951852e-06, + "loss": 1.0042641162872314, + "step": 1736 + }, + { + "epoch": 2.6574923547400613, + "grad_norm": 0.39150846004486084, + "learning_rate": 1.3200407447568985e-06, + "loss": 1.6282243728637695, + "step": 1738 + }, + { + "epoch": 2.6605504587155964, + "grad_norm": 0.43744921684265137, + "learning_rate": 1.3144433350075045e-06, + "loss": 1.419670820236206, + "step": 1740 + }, + { + "epoch": 2.6636085626911314, + "grad_norm": 0.5169599652290344, + "learning_rate": 1.3088935340149312e-06, + "loss": 1.5492973327636719, + "step": 1742 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3686998188495636, + "learning_rate": 1.3033914049055776e-06, + "loss": 1.390296459197998, + "step": 1744 + }, + { + "epoch": 2.669724770642202, + "grad_norm": 0.3961811363697052, + "learning_rate": 1.2979370102636001e-06, + "loss": 1.6185352802276611, + "step": 1746 + }, + { + "epoch": 2.672782874617737, + "grad_norm": 0.4181622266769409, + "learning_rate": 1.2925304121301956e-06, + "loss": 1.47446608543396, + "step": 1748 + }, + { + "epoch": 2.675840978593272, + "grad_norm": 0.5175849199295044, + "learning_rate": 1.2871716720029001e-06, + "loss": 1.4941065311431885, + "step": 1750 + }, + { + "epoch": 2.6788990825688073, + "grad_norm": 0.4671924412250519, + "learning_rate": 1.2818608508348831e-06, + "loss": 1.3738720417022705, + "step": 1752 + }, + { + "epoch": 2.6819571865443423, + "grad_norm": 0.31229135394096375, + "learning_rate": 1.2765980090342638e-06, + "loss": 1.0343739986419678, + "step": 1754 + }, + { + "epoch": 2.6850152905198774, + "grad_norm": 0.5780667662620544, + "learning_rate": 1.2713832064634127e-06, + "loss": 1.4987692832946777, + "step": 1756 + }, + { + "epoch": 2.688073394495413, + "grad_norm": 0.29605942964553833, + "learning_rate": 1.2662165024382813e-06, + "loss": 1.4711230993270874, + "step": 1758 + }, + { + "epoch": 2.691131498470948, + "grad_norm": 0.4572795629501343, + "learning_rate": 1.2610979557277186e-06, + "loss": 1.4898228645324707, + "step": 1760 + }, + { + "epoch": 2.694189602446483, + "grad_norm": 0.5139583945274353, + "learning_rate": 1.2560276245528099e-06, + "loss": 1.4924449920654297, + "step": 1762 + }, + { + "epoch": 2.6972477064220186, + "grad_norm": 0.3455151319503784, + "learning_rate": 1.251005566586209e-06, + "loss": 1.3008229732513428, + "step": 1764 + }, + { + "epoch": 2.7003058103975537, + "grad_norm": 0.5034812092781067, + "learning_rate": 1.2460318389514868e-06, + "loss": 1.5259795188903809, + "step": 1766 + }, + { + "epoch": 2.7033639143730888, + "grad_norm": 0.55739825963974, + "learning_rate": 1.241106498222476e-06, + "loss": 1.610971212387085, + "step": 1768 + }, + { + "epoch": 2.706422018348624, + "grad_norm": 0.3922676146030426, + "learning_rate": 1.2362296004226327e-06, + "loss": 1.3188968896865845, + "step": 1770 + }, + { + "epoch": 2.709480122324159, + "grad_norm": 0.4953126311302185, + "learning_rate": 1.2314012010243973e-06, + "loss": 1.5828558206558228, + "step": 1772 + }, + { + "epoch": 2.712538226299694, + "grad_norm": 0.6791023015975952, + "learning_rate": 1.2266213549485638e-06, + "loss": 1.3703022003173828, + "step": 1774 + }, + { + "epoch": 2.7155963302752295, + "grad_norm": 0.37211811542510986, + "learning_rate": 1.2218901165636526e-06, + "loss": 1.504420280456543, + "step": 1776 + }, + { + "epoch": 2.7186544342507646, + "grad_norm": 0.2997111678123474, + "learning_rate": 1.2172075396852972e-06, + "loss": 1.442054271697998, + "step": 1778 + }, + { + "epoch": 2.7217125382262997, + "grad_norm": 0.3290131390094757, + "learning_rate": 1.212573677575627e-06, + "loss": 1.5728079080581665, + "step": 1780 + }, + { + "epoch": 2.7247706422018347, + "grad_norm": 0.3726375102996826, + "learning_rate": 1.2079885829426653e-06, + "loss": 1.6637623310089111, + "step": 1782 + }, + { + "epoch": 2.7278287461773703, + "grad_norm": 0.7502315640449524, + "learning_rate": 1.2034523079397264e-06, + "loss": 1.550297737121582, + "step": 1784 + }, + { + "epoch": 2.7308868501529053, + "grad_norm": 0.3677420914173126, + "learning_rate": 1.1989649041648244e-06, + "loss": 1.3913054466247559, + "step": 1786 + }, + { + "epoch": 2.7339449541284404, + "grad_norm": 0.6194299459457397, + "learning_rate": 1.1945264226600878e-06, + "loss": 1.49534010887146, + "step": 1788 + }, + { + "epoch": 2.7370030581039755, + "grad_norm": 0.42255425453186035, + "learning_rate": 1.1901369139111737e-06, + "loss": 1.5017262697219849, + "step": 1790 + }, + { + "epoch": 2.7400611620795106, + "grad_norm": 0.39475998282432556, + "learning_rate": 1.1857964278467003e-06, + "loss": 1.4985376596450806, + "step": 1792 + }, + { + "epoch": 2.7431192660550456, + "grad_norm": 0.4835125207901001, + "learning_rate": 1.1815050138376731e-06, + "loss": 1.513980746269226, + "step": 1794 + }, + { + "epoch": 2.746177370030581, + "grad_norm": 0.27400922775268555, + "learning_rate": 1.1772627206969286e-06, + "loss": 1.5117716789245605, + "step": 1796 + }, + { + "epoch": 2.7492354740061162, + "grad_norm": 0.35452115535736084, + "learning_rate": 1.1730695966785726e-06, + "loss": 1.3024158477783203, + "step": 1798 + }, + { + "epoch": 2.7522935779816513, + "grad_norm": 0.45254552364349365, + "learning_rate": 1.1689256894774384e-06, + "loss": 1.3760697841644287, + "step": 1800 + }, + { + "epoch": 2.7553516819571864, + "grad_norm": 0.6041072010993958, + "learning_rate": 1.1648310462285386e-06, + "loss": 1.298436164855957, + "step": 1802 + }, + { + "epoch": 2.758409785932722, + "grad_norm": 0.555728554725647, + "learning_rate": 1.1607857135065337e-06, + "loss": 1.3885629177093506, + "step": 1804 + }, + { + "epoch": 2.761467889908257, + "grad_norm": 0.5937597751617432, + "learning_rate": 1.1567897373251967e-06, + "loss": 1.3754394054412842, + "step": 1806 + }, + { + "epoch": 2.764525993883792, + "grad_norm": 0.35898932814598083, + "learning_rate": 1.1528431631368957e-06, + "loss": 1.2469127178192139, + "step": 1808 + }, + { + "epoch": 2.767584097859327, + "grad_norm": 0.24282048642635345, + "learning_rate": 1.1489460358320728e-06, + "loss": 0.9015558958053589, + "step": 1810 + }, + { + "epoch": 2.770642201834862, + "grad_norm": 0.27484798431396484, + "learning_rate": 1.1450983997387365e-06, + "loss": 1.2076148986816406, + "step": 1812 + }, + { + "epoch": 2.7737003058103973, + "grad_norm": 0.29970651865005493, + "learning_rate": 1.1413002986219528e-06, + "loss": 1.2744965553283691, + "step": 1814 + }, + { + "epoch": 2.776758409785933, + "grad_norm": 0.26047366857528687, + "learning_rate": 1.1375517756833534e-06, + "loss": 1.3271204233169556, + "step": 1816 + }, + { + "epoch": 2.779816513761468, + "grad_norm": 0.3544829785823822, + "learning_rate": 1.1338528735606391e-06, + "loss": 1.3407413959503174, + "step": 1818 + }, + { + "epoch": 2.782874617737003, + "grad_norm": 0.24868814647197723, + "learning_rate": 1.1302036343270996e-06, + "loss": 1.4030461311340332, + "step": 1820 + }, + { + "epoch": 2.7859327217125385, + "grad_norm": 0.30862292647361755, + "learning_rate": 1.12660409949113e-06, + "loss": 1.3144700527191162, + "step": 1822 + }, + { + "epoch": 2.7889908256880735, + "grad_norm": 0.9225071668624878, + "learning_rate": 1.1230543099957608e-06, + "loss": 1.338538646697998, + "step": 1824 + }, + { + "epoch": 2.7920489296636086, + "grad_norm": 0.32354745268821716, + "learning_rate": 1.1195543062181954e-06, + "loss": 1.310173749923706, + "step": 1826 + }, + { + "epoch": 2.7951070336391437, + "grad_norm": 0.24064457416534424, + "learning_rate": 1.1161041279693445e-06, + "loss": 1.3204376697540283, + "step": 1828 + }, + { + "epoch": 2.7981651376146788, + "grad_norm": 0.23651309311389923, + "learning_rate": 1.1127038144933787e-06, + "loss": 1.281717300415039, + "step": 1830 + }, + { + "epoch": 2.801223241590214, + "grad_norm": 0.21533581614494324, + "learning_rate": 1.1093534044672796e-06, + "loss": 1.3252437114715576, + "step": 1832 + }, + { + "epoch": 2.8042813455657494, + "grad_norm": 0.38182252645492554, + "learning_rate": 1.1060529360004003e-06, + "loss": 1.27931809425354, + "step": 1834 + }, + { + "epoch": 2.8073394495412844, + "grad_norm": 0.12391169369220734, + "learning_rate": 1.1028024466340305e-06, + "loss": 1.1552488803863525, + "step": 1836 + }, + { + "epoch": 2.8103975535168195, + "grad_norm": 0.17293956875801086, + "learning_rate": 1.0996019733409732e-06, + "loss": 1.2036254405975342, + "step": 1838 + }, + { + "epoch": 2.8134556574923546, + "grad_norm": 0.21059419214725494, + "learning_rate": 1.096451552525121e-06, + "loss": 0.9850409030914307, + "step": 1840 + }, + { + "epoch": 2.81651376146789, + "grad_norm": 0.2714180648326874, + "learning_rate": 1.093351220021043e-06, + "loss": 1.2215778827667236, + "step": 1842 + }, + { + "epoch": 2.819571865443425, + "grad_norm": 0.22156941890716553, + "learning_rate": 1.090301011093575e-06, + "loss": 1.2629544734954834, + "step": 1844 + }, + { + "epoch": 2.8226299694189603, + "grad_norm": 0.20625340938568115, + "learning_rate": 1.0873009604374246e-06, + "loss": 1.2778034210205078, + "step": 1846 + }, + { + "epoch": 2.8256880733944953, + "grad_norm": 0.29442811012268066, + "learning_rate": 1.084351102176769e-06, + "loss": 1.2413357496261597, + "step": 1848 + }, + { + "epoch": 2.8287461773700304, + "grad_norm": 0.18544712662696838, + "learning_rate": 1.081451469864872e-06, + "loss": 1.2637240886688232, + "step": 1850 + }, + { + "epoch": 2.8318042813455655, + "grad_norm": 0.22874392569065094, + "learning_rate": 1.0786020964836991e-06, + "loss": 1.2410205602645874, + "step": 1852 + }, + { + "epoch": 2.834862385321101, + "grad_norm": 0.2457342892885208, + "learning_rate": 1.075803014443546e-06, + "loss": 1.2094589471817017, + "step": 1854 + }, + { + "epoch": 2.837920489296636, + "grad_norm": 0.22759026288986206, + "learning_rate": 1.0730542555826654e-06, + "loss": 1.274350643157959, + "step": 1856 + }, + { + "epoch": 2.840978593272171, + "grad_norm": 0.206235870718956, + "learning_rate": 1.07035585116691e-06, + "loss": 1.245356559753418, + "step": 1858 + }, + { + "epoch": 2.8440366972477067, + "grad_norm": 0.49194467067718506, + "learning_rate": 1.0677078318893716e-06, + "loss": 1.2151732444763184, + "step": 1860 + }, + { + "epoch": 2.8470948012232418, + "grad_norm": 0.33920061588287354, + "learning_rate": 1.0651102278700364e-06, + "loss": 1.2073887586593628, + "step": 1862 + }, + { + "epoch": 2.850152905198777, + "grad_norm": 0.25718092918395996, + "learning_rate": 1.062563068655439e-06, + "loss": 1.2325494289398193, + "step": 1864 + }, + { + "epoch": 2.853211009174312, + "grad_norm": 0.24365228414535522, + "learning_rate": 1.0600663832183293e-06, + "loss": 1.2226455211639404, + "step": 1866 + }, + { + "epoch": 2.856269113149847, + "grad_norm": 0.19332216680049896, + "learning_rate": 1.0576201999573405e-06, + "loss": 1.1831451654434204, + "step": 1868 + }, + { + "epoch": 2.859327217125382, + "grad_norm": 0.25319862365722656, + "learning_rate": 1.0552245466966678e-06, + "loss": 1.2440452575683594, + "step": 1870 + }, + { + "epoch": 2.8623853211009176, + "grad_norm": 0.27022072672843933, + "learning_rate": 1.0528794506857508e-06, + "loss": 1.2725245952606201, + "step": 1872 + }, + { + "epoch": 2.8654434250764527, + "grad_norm": 0.3112826943397522, + "learning_rate": 1.050584938598963e-06, + "loss": 1.282654047012329, + "step": 1874 + }, + { + "epoch": 2.8685015290519877, + "grad_norm": 0.2421792596578598, + "learning_rate": 1.048341036535311e-06, + "loss": 1.273242712020874, + "step": 1876 + }, + { + "epoch": 2.871559633027523, + "grad_norm": 0.23541022837162018, + "learning_rate": 1.0461477700181355e-06, + "loss": 1.2899906635284424, + "step": 1878 + }, + { + "epoch": 2.8746177370030583, + "grad_norm": 0.2772025167942047, + "learning_rate": 1.044005163994821e-06, + "loss": 1.2756202220916748, + "step": 1880 + }, + { + "epoch": 2.8776758409785934, + "grad_norm": 0.47361937165260315, + "learning_rate": 1.0419132428365116e-06, + "loss": 1.2930552959442139, + "step": 1882 + }, + { + "epoch": 2.8807339449541285, + "grad_norm": 0.18241485953330994, + "learning_rate": 1.0398720303378374e-06, + "loss": 1.223031997680664, + "step": 1884 + }, + { + "epoch": 2.8837920489296636, + "grad_norm": 0.40437427163124084, + "learning_rate": 1.0378815497166385e-06, + "loss": 1.2670063972473145, + "step": 1886 + }, + { + "epoch": 2.8868501529051986, + "grad_norm": 0.22389701008796692, + "learning_rate": 1.0359418236137047e-06, + "loss": 1.2270456552505493, + "step": 1888 + }, + { + "epoch": 2.8899082568807337, + "grad_norm": 0.29309970140457153, + "learning_rate": 1.0340528740925169e-06, + "loss": 1.2563271522521973, + "step": 1890 + }, + { + "epoch": 2.8929663608562692, + "grad_norm": 0.24637004733085632, + "learning_rate": 1.0322147226389952e-06, + "loss": 1.2668583393096924, + "step": 1892 + }, + { + "epoch": 2.8960244648318043, + "grad_norm": 0.5765001177787781, + "learning_rate": 1.0304273901612566e-06, + "loss": 1.2873437404632568, + "step": 1894 + }, + { + "epoch": 2.8990825688073394, + "grad_norm": 0.3287610411643982, + "learning_rate": 1.028690896989375e-06, + "loss": 1.274024248123169, + "step": 1896 + }, + { + "epoch": 2.9021406727828745, + "grad_norm": 0.2688363492488861, + "learning_rate": 1.027005262875151e-06, + "loss": 1.20585036277771, + "step": 1898 + }, + { + "epoch": 2.90519877675841, + "grad_norm": 0.3984238803386688, + "learning_rate": 1.0253705069918865e-06, + "loss": 1.2360919713974, + "step": 1900 + }, + { + "epoch": 2.908256880733945, + "grad_norm": 0.27637046575546265, + "learning_rate": 1.0237866479341687e-06, + "loss": 1.2752952575683594, + "step": 1902 + }, + { + "epoch": 2.91131498470948, + "grad_norm": 0.5071486234664917, + "learning_rate": 1.0222537037176572e-06, + "loss": 1.2954089641571045, + "step": 1904 + }, + { + "epoch": 2.914373088685015, + "grad_norm": 0.22012606263160706, + "learning_rate": 1.0207716917788768e-06, + "loss": 1.2765629291534424, + "step": 1906 + }, + { + "epoch": 2.9174311926605503, + "grad_norm": 0.20149464905261993, + "learning_rate": 1.019340628975023e-06, + "loss": 1.2535219192504883, + "step": 1908 + }, + { + "epoch": 2.9204892966360854, + "grad_norm": 0.227265864610672, + "learning_rate": 1.0179605315837695e-06, + "loss": 1.2175259590148926, + "step": 1910 + }, + { + "epoch": 2.923547400611621, + "grad_norm": 0.2566111087799072, + "learning_rate": 1.0166314153030799e-06, + "loss": 1.255599856376648, + "step": 1912 + }, + { + "epoch": 2.926605504587156, + "grad_norm": 0.38341450691223145, + "learning_rate": 1.0153532952510328e-06, + "loss": 1.2794301509857178, + "step": 1914 + }, + { + "epoch": 2.929663608562691, + "grad_norm": 0.28000977635383606, + "learning_rate": 1.0141261859656484e-06, + "loss": 1.2272768020629883, + "step": 1916 + }, + { + "epoch": 2.9327217125382266, + "grad_norm": 0.2550158202648163, + "learning_rate": 1.0129501014047236e-06, + "loss": 1.2561171054840088, + "step": 1918 + }, + { + "epoch": 2.9357798165137616, + "grad_norm": 0.21566316485404968, + "learning_rate": 1.0118250549456717e-06, + "loss": 1.2545552253723145, + "step": 1920 + }, + { + "epoch": 2.9388379204892967, + "grad_norm": 0.36798691749572754, + "learning_rate": 1.0107510593853716e-06, + "loss": 1.3016841411590576, + "step": 1922 + }, + { + "epoch": 2.941896024464832, + "grad_norm": 0.29115161299705505, + "learning_rate": 1.0097281269400234e-06, + "loss": 1.3122904300689697, + "step": 1924 + }, + { + "epoch": 2.944954128440367, + "grad_norm": 0.42286819219589233, + "learning_rate": 1.0087562692450062e-06, + "loss": 1.2751294374465942, + "step": 1926 + }, + { + "epoch": 2.948012232415902, + "grad_norm": 0.29917454719543457, + "learning_rate": 1.0078354973547484e-06, + "loss": 1.2971951961517334, + "step": 1928 + }, + { + "epoch": 2.9510703363914375, + "grad_norm": 0.28312069177627563, + "learning_rate": 1.0069658217426017e-06, + "loss": 1.2662827968597412, + "step": 1930 + }, + { + "epoch": 2.9541284403669725, + "grad_norm": 0.2748239040374756, + "learning_rate": 1.0061472523007213e-06, + "loss": 1.209917664527893, + "step": 1932 + }, + { + "epoch": 2.9571865443425076, + "grad_norm": 0.36147835850715637, + "learning_rate": 1.0053797983399524e-06, + "loss": 1.2387361526489258, + "step": 1934 + }, + { + "epoch": 2.9602446483180427, + "grad_norm": 0.34865546226501465, + "learning_rate": 1.004663468589726e-06, + "loss": 1.2596259117126465, + "step": 1936 + }, + { + "epoch": 2.963302752293578, + "grad_norm": 0.23798368871212006, + "learning_rate": 1.0039982711979603e-06, + "loss": 1.239612340927124, + "step": 1938 + }, + { + "epoch": 2.9663608562691133, + "grad_norm": 0.31115320324897766, + "learning_rate": 1.0033842137309649e-06, + "loss": 1.2498747110366821, + "step": 1940 + }, + { + "epoch": 2.9694189602446484, + "grad_norm": 0.37815067172050476, + "learning_rate": 1.0028213031733578e-06, + "loss": 1.3014090061187744, + "step": 1942 + }, + { + "epoch": 2.9724770642201834, + "grad_norm": 0.26476937532424927, + "learning_rate": 1.0023095459279838e-06, + "loss": 1.2854735851287842, + "step": 1944 + }, + { + "epoch": 2.9755351681957185, + "grad_norm": 0.3802984952926636, + "learning_rate": 1.0018489478158434e-06, + "loss": 1.3032188415527344, + "step": 1946 + }, + { + "epoch": 2.9785932721712536, + "grad_norm": 0.3544924855232239, + "learning_rate": 1.0014395140760255e-06, + "loss": 1.2610487937927246, + "step": 1948 + }, + { + "epoch": 2.981651376146789, + "grad_norm": 0.30221831798553467, + "learning_rate": 1.0010812493656488e-06, + "loss": 1.2582671642303467, + "step": 1950 + }, + { + "epoch": 2.984709480122324, + "grad_norm": 0.2731051743030548, + "learning_rate": 1.000774157759806e-06, + "loss": 1.2794151306152344, + "step": 1952 + }, + { + "epoch": 2.9877675840978593, + "grad_norm": 0.3089560270309448, + "learning_rate": 1.0005182427515222e-06, + "loss": 1.334507703781128, + "step": 1954 + }, + { + "epoch": 2.9908256880733948, + "grad_norm": 0.31155917048454285, + "learning_rate": 1.0003135072517108e-06, + "loss": 1.3732435703277588, + "step": 1956 + }, + { + "epoch": 2.99388379204893, + "grad_norm": 0.3963629901409149, + "learning_rate": 1.000159953589143e-06, + "loss": 1.6014021635055542, + "step": 1958 + }, + { + "epoch": 2.996941896024465, + "grad_norm": 0.8739917278289795, + "learning_rate": 1.00005758351042e-06, + "loss": 1.5767264366149902, + "step": 1960 + }, + { + "epoch": 3.0, + "grad_norm": 1.2575660943984985, + "learning_rate": 1.0000063981799541e-06, + "loss": 1.7074545621871948, + "step": 1962 + }, + { + "epoch": 3.0, + "step": 1962, + "total_flos": 2.4882019125669396e+18, + "train_loss": 1.4736498374943825, + "train_runtime": 8380.6004, + "train_samples_per_second": 3.746, + "train_steps_per_second": 0.234 + } + ], + "logging_steps": 2, + "max_steps": 1962, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4882019125669396e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}