{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997955149614887, "eval_steps": 500, "global_step": 3667, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00027264671801513187, "grad_norm": 26.292772128393796, "learning_rate": 9.009009009009009e-09, "loss": 1.2377, "step": 1 }, { "epoch": 0.0005452934360302637, "grad_norm": 27.21296541936894, "learning_rate": 1.8018018018018017e-08, "loss": 1.2735, "step": 2 }, { "epoch": 0.0008179401540453957, "grad_norm": 26.412639182352972, "learning_rate": 2.7027027027027028e-08, "loss": 1.2612, "step": 3 }, { "epoch": 0.0010905868720605275, "grad_norm": 28.375076632781383, "learning_rate": 3.6036036036036035e-08, "loss": 1.1565, "step": 4 }, { "epoch": 0.0013632335900756595, "grad_norm": 27.211928950551044, "learning_rate": 4.504504504504504e-08, "loss": 1.1916, "step": 5 }, { "epoch": 0.0016358803080907913, "grad_norm": 22.68389744418117, "learning_rate": 5.4054054054054056e-08, "loss": 1.1924, "step": 6 }, { "epoch": 0.0019085270261059233, "grad_norm": 19.817909771049127, "learning_rate": 6.306306306306305e-08, "loss": 1.1779, "step": 7 }, { "epoch": 0.002181173744121055, "grad_norm": 22.602254814004485, "learning_rate": 7.207207207207207e-08, "loss": 1.1881, "step": 8 }, { "epoch": 0.002453820462136187, "grad_norm": 22.441888087675828, "learning_rate": 8.108108108108108e-08, "loss": 1.1784, "step": 9 }, { "epoch": 0.002726467180151319, "grad_norm": 25.295273443572363, "learning_rate": 9.009009009009008e-08, "loss": 1.18, "step": 10 }, { "epoch": 0.002999113898166451, "grad_norm": 22.34585322099101, "learning_rate": 9.909909909909909e-08, "loss": 1.1929, "step": 11 }, { "epoch": 0.0032717606161815826, "grad_norm": 20.113714032876448, "learning_rate": 1.0810810810810811e-07, "loss": 1.2158, "step": 12 }, { "epoch": 0.0035444073341967144, "grad_norm": 25.556988544884586, "learning_rate": 1.171171171171171e-07, "loss": 1.2535, "step": 13 }, { "epoch": 0.0038170540522118467, "grad_norm": 21.403979825776243, "learning_rate": 1.261261261261261e-07, "loss": 1.1882, "step": 14 }, { "epoch": 0.0040897007702269785, "grad_norm": 22.449955659057043, "learning_rate": 1.3513513513513515e-07, "loss": 1.1671, "step": 15 }, { "epoch": 0.00436234748824211, "grad_norm": 20.39444148102075, "learning_rate": 1.4414414414414414e-07, "loss": 1.1287, "step": 16 }, { "epoch": 0.004634994206257242, "grad_norm": 25.053394312977048, "learning_rate": 1.5315315315315313e-07, "loss": 1.236, "step": 17 }, { "epoch": 0.004907640924272374, "grad_norm": 25.66983660413123, "learning_rate": 1.6216216216216215e-07, "loss": 1.1425, "step": 18 }, { "epoch": 0.005180287642287506, "grad_norm": 43.7243093905169, "learning_rate": 1.7117117117117117e-07, "loss": 1.222, "step": 19 }, { "epoch": 0.005452934360302638, "grad_norm": 24.407505807714042, "learning_rate": 1.8018018018018017e-07, "loss": 1.1436, "step": 20 }, { "epoch": 0.005725581078317769, "grad_norm": 22.23163056070696, "learning_rate": 1.891891891891892e-07, "loss": 1.1963, "step": 21 }, { "epoch": 0.005998227796332902, "grad_norm": 19.953786418286597, "learning_rate": 1.9819819819819818e-07, "loss": 1.2132, "step": 22 }, { "epoch": 0.006270874514348034, "grad_norm": 14.607549456700841, "learning_rate": 2.072072072072072e-07, "loss": 1.1524, "step": 23 }, { "epoch": 0.006543521232363165, "grad_norm": 25.413672888929867, "learning_rate": 2.1621621621621622e-07, "loss": 1.1174, "step": 24 }, { "epoch": 0.0068161679503782975, "grad_norm": 21.289940488145227, "learning_rate": 2.2522522522522522e-07, "loss": 1.1459, "step": 25 }, { "epoch": 0.007088814668393429, "grad_norm": 19.001648687250327, "learning_rate": 2.342342342342342e-07, "loss": 1.0932, "step": 26 }, { "epoch": 0.007361461386408561, "grad_norm": 16.978732140766798, "learning_rate": 2.4324324324324326e-07, "loss": 1.1615, "step": 27 }, { "epoch": 0.007634108104423693, "grad_norm": 35.494969766437904, "learning_rate": 2.522522522522522e-07, "loss": 1.1323, "step": 28 }, { "epoch": 0.007906754822438826, "grad_norm": 18.56810076590373, "learning_rate": 2.6126126126126124e-07, "loss": 1.1154, "step": 29 }, { "epoch": 0.008179401540453957, "grad_norm": 15.99501809448612, "learning_rate": 2.702702702702703e-07, "loss": 1.1089, "step": 30 }, { "epoch": 0.008452048258469088, "grad_norm": 18.749495107166208, "learning_rate": 2.7927927927927923e-07, "loss": 1.0803, "step": 31 }, { "epoch": 0.00872469497648422, "grad_norm": 13.092010747156477, "learning_rate": 2.882882882882883e-07, "loss": 1.052, "step": 32 }, { "epoch": 0.008997341694499353, "grad_norm": 15.65936073464815, "learning_rate": 2.972972972972973e-07, "loss": 1.072, "step": 33 }, { "epoch": 0.009269988412514484, "grad_norm": 8.662937180860052, "learning_rate": 3.0630630630630627e-07, "loss": 1.0511, "step": 34 }, { "epoch": 0.009542635130529616, "grad_norm": 11.266871818931229, "learning_rate": 3.153153153153153e-07, "loss": 1.051, "step": 35 }, { "epoch": 0.009815281848544749, "grad_norm": 16.60648211029163, "learning_rate": 3.243243243243243e-07, "loss": 1.0156, "step": 36 }, { "epoch": 0.01008792856655988, "grad_norm": 12.897814907252357, "learning_rate": 3.333333333333333e-07, "loss": 1.0678, "step": 37 }, { "epoch": 0.010360575284575011, "grad_norm": 9.839524049277342, "learning_rate": 3.4234234234234235e-07, "loss": 0.9789, "step": 38 }, { "epoch": 0.010633222002590145, "grad_norm": 7.67193090843479, "learning_rate": 3.5135135135135134e-07, "loss": 1.0594, "step": 39 }, { "epoch": 0.010905868720605276, "grad_norm": 7.919486516813141, "learning_rate": 3.6036036036036033e-07, "loss": 1.0033, "step": 40 }, { "epoch": 0.011178515438620407, "grad_norm": 7.560514087552813, "learning_rate": 3.6936936936936933e-07, "loss": 1.0103, "step": 41 }, { "epoch": 0.011451162156635539, "grad_norm": 5.94084310369256, "learning_rate": 3.783783783783784e-07, "loss": 1.0289, "step": 42 }, { "epoch": 0.011723808874650672, "grad_norm": 8.69906462046374, "learning_rate": 3.8738738738738737e-07, "loss": 1.0711, "step": 43 }, { "epoch": 0.011996455592665803, "grad_norm": 6.361423967747099, "learning_rate": 3.9639639639639636e-07, "loss": 0.9818, "step": 44 }, { "epoch": 0.012269102310680935, "grad_norm": 5.185542178424501, "learning_rate": 4.054054054054054e-07, "loss": 0.975, "step": 45 }, { "epoch": 0.012541749028696068, "grad_norm": 7.503986732053337, "learning_rate": 4.144144144144144e-07, "loss": 0.9395, "step": 46 }, { "epoch": 0.012814395746711199, "grad_norm": 5.521735395122545, "learning_rate": 4.234234234234234e-07, "loss": 0.9182, "step": 47 }, { "epoch": 0.01308704246472633, "grad_norm": 5.153152911733834, "learning_rate": 4.3243243243243244e-07, "loss": 1.005, "step": 48 }, { "epoch": 0.013359689182741464, "grad_norm": 11.928154627753692, "learning_rate": 4.414414414414414e-07, "loss": 0.9263, "step": 49 }, { "epoch": 0.013632335900756595, "grad_norm": 4.912206634561692, "learning_rate": 4.5045045045045043e-07, "loss": 0.9254, "step": 50 }, { "epoch": 0.013904982618771726, "grad_norm": 5.456785279568662, "learning_rate": 4.594594594594595e-07, "loss": 0.9732, "step": 51 }, { "epoch": 0.014177629336786858, "grad_norm": 6.310440326191991, "learning_rate": 4.684684684684684e-07, "loss": 0.9646, "step": 52 }, { "epoch": 0.01445027605480199, "grad_norm": 3.8947048499811694, "learning_rate": 4.774774774774775e-07, "loss": 0.9014, "step": 53 }, { "epoch": 0.014722922772817122, "grad_norm": 5.020321975632386, "learning_rate": 4.864864864864865e-07, "loss": 0.8474, "step": 54 }, { "epoch": 0.014995569490832254, "grad_norm": 6.303940062016482, "learning_rate": 4.954954954954955e-07, "loss": 0.9562, "step": 55 }, { "epoch": 0.015268216208847387, "grad_norm": 4.658264895515695, "learning_rate": 5.045045045045044e-07, "loss": 0.8785, "step": 56 }, { "epoch": 0.015540862926862518, "grad_norm": 6.3003171732122905, "learning_rate": 5.135135135135134e-07, "loss": 0.9564, "step": 57 }, { "epoch": 0.01581350964487765, "grad_norm": 5.332364911950274, "learning_rate": 5.225225225225225e-07, "loss": 0.8588, "step": 58 }, { "epoch": 0.016086156362892783, "grad_norm": 4.061446152645851, "learning_rate": 5.315315315315315e-07, "loss": 0.9027, "step": 59 }, { "epoch": 0.016358803080907914, "grad_norm": 4.4863511414285755, "learning_rate": 5.405405405405406e-07, "loss": 0.9411, "step": 60 }, { "epoch": 0.016631449798923045, "grad_norm": 4.061736728296456, "learning_rate": 5.495495495495495e-07, "loss": 0.8679, "step": 61 }, { "epoch": 0.016904096516938177, "grad_norm": 14.144224719768323, "learning_rate": 5.585585585585585e-07, "loss": 0.8595, "step": 62 }, { "epoch": 0.017176743234953308, "grad_norm": 2.9713704904151896, "learning_rate": 5.675675675675675e-07, "loss": 0.8863, "step": 63 }, { "epoch": 0.01744938995296844, "grad_norm": 3.2787330865183355, "learning_rate": 5.765765765765766e-07, "loss": 0.8737, "step": 64 }, { "epoch": 0.017722036670983574, "grad_norm": 4.805326082687503, "learning_rate": 5.855855855855856e-07, "loss": 0.921, "step": 65 }, { "epoch": 0.017994683388998706, "grad_norm": 3.4389422238728296, "learning_rate": 5.945945945945947e-07, "loss": 0.9563, "step": 66 }, { "epoch": 0.018267330107013837, "grad_norm": 3.338566661460429, "learning_rate": 6.036036036036036e-07, "loss": 0.8688, "step": 67 }, { "epoch": 0.01853997682502897, "grad_norm": 3.8014602738875065, "learning_rate": 6.126126126126125e-07, "loss": 0.8324, "step": 68 }, { "epoch": 0.0188126235430441, "grad_norm": 3.420653156406546, "learning_rate": 6.216216216216216e-07, "loss": 0.85, "step": 69 }, { "epoch": 0.01908527026105923, "grad_norm": 4.200079302880338, "learning_rate": 6.306306306306306e-07, "loss": 0.8748, "step": 70 }, { "epoch": 0.019357916979074366, "grad_norm": 4.629076818126849, "learning_rate": 6.396396396396397e-07, "loss": 0.8907, "step": 71 }, { "epoch": 0.019630563697089497, "grad_norm": 3.477240771363601, "learning_rate": 6.486486486486486e-07, "loss": 0.9285, "step": 72 }, { "epoch": 0.01990321041510463, "grad_norm": 3.9518513358885574, "learning_rate": 6.576576576576577e-07, "loss": 0.8398, "step": 73 }, { "epoch": 0.02017585713311976, "grad_norm": 4.164383078527997, "learning_rate": 6.666666666666666e-07, "loss": 0.961, "step": 74 }, { "epoch": 0.02044850385113489, "grad_norm": 3.4693567464663997, "learning_rate": 6.756756756756756e-07, "loss": 0.8412, "step": 75 }, { "epoch": 0.020721150569150023, "grad_norm": 3.617547895084456, "learning_rate": 6.846846846846847e-07, "loss": 0.8354, "step": 76 }, { "epoch": 0.020993797287165154, "grad_norm": 3.5001758369353175, "learning_rate": 6.936936936936936e-07, "loss": 0.8904, "step": 77 }, { "epoch": 0.02126644400518029, "grad_norm": 5.630686037671838, "learning_rate": 7.027027027027027e-07, "loss": 0.8764, "step": 78 }, { "epoch": 0.02153909072319542, "grad_norm": 9.939358054507984, "learning_rate": 7.117117117117116e-07, "loss": 0.8706, "step": 79 }, { "epoch": 0.021811737441210552, "grad_norm": 4.418520706987715, "learning_rate": 7.207207207207207e-07, "loss": 0.8966, "step": 80 }, { "epoch": 0.022084384159225683, "grad_norm": 4.023155711590758, "learning_rate": 7.297297297297297e-07, "loss": 0.7931, "step": 81 }, { "epoch": 0.022357030877240815, "grad_norm": 4.1554891166415295, "learning_rate": 7.387387387387387e-07, "loss": 0.8032, "step": 82 }, { "epoch": 0.022629677595255946, "grad_norm": 4.065208278440677, "learning_rate": 7.477477477477477e-07, "loss": 0.8745, "step": 83 }, { "epoch": 0.022902324313271077, "grad_norm": 2.966304302571671, "learning_rate": 7.567567567567568e-07, "loss": 0.834, "step": 84 }, { "epoch": 0.023174971031286212, "grad_norm": 3.2302976910892727, "learning_rate": 7.657657657657657e-07, "loss": 0.8552, "step": 85 }, { "epoch": 0.023447617749301344, "grad_norm": 3.8323594184481493, "learning_rate": 7.747747747747747e-07, "loss": 0.7871, "step": 86 }, { "epoch": 0.023720264467316475, "grad_norm": 5.470411660383244, "learning_rate": 7.837837837837838e-07, "loss": 0.8745, "step": 87 }, { "epoch": 0.023992911185331606, "grad_norm": 3.0768325107968795, "learning_rate": 7.927927927927927e-07, "loss": 0.8042, "step": 88 }, { "epoch": 0.024265557903346738, "grad_norm": 183.76136635061602, "learning_rate": 8.018018018018018e-07, "loss": 0.8671, "step": 89 }, { "epoch": 0.02453820462136187, "grad_norm": 3.139098607068219, "learning_rate": 8.108108108108108e-07, "loss": 0.8654, "step": 90 }, { "epoch": 0.024810851339377, "grad_norm": 4.062783845806974, "learning_rate": 8.198198198198198e-07, "loss": 0.7868, "step": 91 }, { "epoch": 0.025083498057392135, "grad_norm": 8.538775433792054, "learning_rate": 8.288288288288288e-07, "loss": 0.7469, "step": 92 }, { "epoch": 0.025356144775407267, "grad_norm": 4.80451327798043, "learning_rate": 8.378378378378377e-07, "loss": 0.8301, "step": 93 }, { "epoch": 0.025628791493422398, "grad_norm": 19.452247037853216, "learning_rate": 8.468468468468468e-07, "loss": 0.8372, "step": 94 }, { "epoch": 0.02590143821143753, "grad_norm": 4.2834755134330775, "learning_rate": 8.558558558558558e-07, "loss": 0.8208, "step": 95 }, { "epoch": 0.02617408492945266, "grad_norm": 3.24061021320425, "learning_rate": 8.648648648648649e-07, "loss": 0.829, "step": 96 }, { "epoch": 0.026446731647467792, "grad_norm": 3.536655252262855, "learning_rate": 8.738738738738738e-07, "loss": 0.884, "step": 97 }, { "epoch": 0.026719378365482927, "grad_norm": 8.263123226009755, "learning_rate": 8.828828828828828e-07, "loss": 0.7936, "step": 98 }, { "epoch": 0.02699202508349806, "grad_norm": 3.6407197861342375, "learning_rate": 8.918918918918918e-07, "loss": 0.7881, "step": 99 }, { "epoch": 0.02726467180151319, "grad_norm": 3.5146531758022617, "learning_rate": 9.009009009009009e-07, "loss": 0.7691, "step": 100 }, { "epoch": 0.02753731851952832, "grad_norm": 6.533436212567394, "learning_rate": 9.099099099099099e-07, "loss": 0.8398, "step": 101 }, { "epoch": 0.027809965237543453, "grad_norm": 2.508997285041768, "learning_rate": 9.18918918918919e-07, "loss": 0.8394, "step": 102 }, { "epoch": 0.028082611955558584, "grad_norm": 2.771634669928697, "learning_rate": 9.279279279279278e-07, "loss": 0.8088, "step": 103 }, { "epoch": 0.028355258673573715, "grad_norm": 3.0202270043584885, "learning_rate": 9.369369369369368e-07, "loss": 0.8561, "step": 104 }, { "epoch": 0.02862790539158885, "grad_norm": 23.292770788240585, "learning_rate": 9.459459459459459e-07, "loss": 0.7834, "step": 105 }, { "epoch": 0.02890055210960398, "grad_norm": 2.707868586306906, "learning_rate": 9.54954954954955e-07, "loss": 0.7672, "step": 106 }, { "epoch": 0.029173198827619113, "grad_norm": 4.701168415311593, "learning_rate": 9.63963963963964e-07, "loss": 0.8379, "step": 107 }, { "epoch": 0.029445845545634244, "grad_norm": 2.8226038201589394, "learning_rate": 9.72972972972973e-07, "loss": 0.8042, "step": 108 }, { "epoch": 0.029718492263649376, "grad_norm": 3.696298331310396, "learning_rate": 9.819819819819819e-07, "loss": 0.7431, "step": 109 }, { "epoch": 0.029991138981664507, "grad_norm": 4.184343039495453, "learning_rate": 9.90990990990991e-07, "loss": 0.7848, "step": 110 }, { "epoch": 0.03026378569967964, "grad_norm": 3.442257848174583, "learning_rate": 1e-06, "loss": 0.8061, "step": 111 }, { "epoch": 0.030536432417694773, "grad_norm": 2.5737929191600224, "learning_rate": 9.999998048735149e-07, "loss": 0.8427, "step": 112 }, { "epoch": 0.030809079135709905, "grad_norm": 5.197242895683309, "learning_rate": 9.999992194942124e-07, "loss": 0.8451, "step": 113 }, { "epoch": 0.031081725853725036, "grad_norm": 3.972717977425227, "learning_rate": 9.99998243862549e-07, "loss": 0.7837, "step": 114 }, { "epoch": 0.03135437257174017, "grad_norm": 2.8100380301146073, "learning_rate": 9.999968779792863e-07, "loss": 0.8013, "step": 115 }, { "epoch": 0.0316270192897553, "grad_norm": 2.6111821552942307, "learning_rate": 9.999951218454902e-07, "loss": 0.7266, "step": 116 }, { "epoch": 0.03189966600777043, "grad_norm": 3.5093406235573834, "learning_rate": 9.999929754625319e-07, "loss": 0.812, "step": 117 }, { "epoch": 0.032172312725785565, "grad_norm": 4.174291933652808, "learning_rate": 9.999904388320862e-07, "loss": 0.7759, "step": 118 }, { "epoch": 0.03244495944380069, "grad_norm": 6.074820034449762, "learning_rate": 9.99987511956133e-07, "loss": 0.7583, "step": 119 }, { "epoch": 0.03271760616181583, "grad_norm": 3.312591838212692, "learning_rate": 9.999841948369569e-07, "loss": 0.7114, "step": 120 }, { "epoch": 0.032990252879830956, "grad_norm": 2.8758621239803444, "learning_rate": 9.99980487477147e-07, "loss": 0.7558, "step": 121 }, { "epoch": 0.03326289959784609, "grad_norm": 2.1668632116447775, "learning_rate": 9.999763898795965e-07, "loss": 0.7134, "step": 122 }, { "epoch": 0.033535546315861225, "grad_norm": 2.343425238261702, "learning_rate": 9.99971902047504e-07, "loss": 0.7993, "step": 123 }, { "epoch": 0.03380819303387635, "grad_norm": 4.22484365801235, "learning_rate": 9.999670239843722e-07, "loss": 0.7765, "step": 124 }, { "epoch": 0.03408083975189149, "grad_norm": 4.152439391582255, "learning_rate": 9.999617556940084e-07, "loss": 0.7616, "step": 125 }, { "epoch": 0.034353486469906616, "grad_norm": 53.06604342498906, "learning_rate": 9.999560971805246e-07, "loss": 0.755, "step": 126 }, { "epoch": 0.03462613318792175, "grad_norm": 2.9778489345061376, "learning_rate": 9.99950048448337e-07, "loss": 0.7437, "step": 127 }, { "epoch": 0.03489877990593688, "grad_norm": 2.7797076924255326, "learning_rate": 9.999436095021671e-07, "loss": 0.7701, "step": 128 }, { "epoch": 0.035171426623952014, "grad_norm": 2.6763619195010535, "learning_rate": 9.999367803470404e-07, "loss": 0.7721, "step": 129 }, { "epoch": 0.03544407334196715, "grad_norm": 3.3252306386114823, "learning_rate": 9.999295609882872e-07, "loss": 0.7638, "step": 130 }, { "epoch": 0.035716720059982277, "grad_norm": 3.9981263650616574, "learning_rate": 9.999219514315417e-07, "loss": 0.7732, "step": 131 }, { "epoch": 0.03598936677799741, "grad_norm": 4.020463362242287, "learning_rate": 9.99913951682744e-07, "loss": 0.7939, "step": 132 }, { "epoch": 0.03626201349601254, "grad_norm": 4.712502805475307, "learning_rate": 9.999055617481373e-07, "loss": 0.8348, "step": 133 }, { "epoch": 0.036534660214027674, "grad_norm": 3.161662034266747, "learning_rate": 9.998967816342704e-07, "loss": 0.7735, "step": 134 }, { "epoch": 0.03680730693204281, "grad_norm": 3.0322466492645557, "learning_rate": 9.99887611347996e-07, "loss": 0.8049, "step": 135 }, { "epoch": 0.03707995365005794, "grad_norm": 4.701139343197454, "learning_rate": 9.998780508964717e-07, "loss": 0.7456, "step": 136 }, { "epoch": 0.03735260036807307, "grad_norm": 3.3166854664678342, "learning_rate": 9.998681002871595e-07, "loss": 0.7828, "step": 137 }, { "epoch": 0.0376252470860882, "grad_norm": 3.654145565657327, "learning_rate": 9.998577595278258e-07, "loss": 0.7652, "step": 138 }, { "epoch": 0.037897893804103334, "grad_norm": 3.847280346724718, "learning_rate": 9.998470286265414e-07, "loss": 0.756, "step": 139 }, { "epoch": 0.03817054052211846, "grad_norm": 2.268369192329989, "learning_rate": 9.998359075916824e-07, "loss": 0.8341, "step": 140 }, { "epoch": 0.0384431872401336, "grad_norm": 2.871690467333706, "learning_rate": 9.998243964319285e-07, "loss": 0.788, "step": 141 }, { "epoch": 0.03871583395814873, "grad_norm": 3.6528954961133464, "learning_rate": 9.998124951562643e-07, "loss": 0.7504, "step": 142 }, { "epoch": 0.03898848067616386, "grad_norm": 2.532670468540119, "learning_rate": 9.998002037739788e-07, "loss": 0.7632, "step": 143 }, { "epoch": 0.039261127394178995, "grad_norm": 3.881959379821288, "learning_rate": 9.997875222946652e-07, "loss": 0.7394, "step": 144 }, { "epoch": 0.03953377411219412, "grad_norm": 2.198702875349348, "learning_rate": 9.99774450728222e-07, "loss": 0.7008, "step": 145 }, { "epoch": 0.03980642083020926, "grad_norm": 3.02967828243444, "learning_rate": 9.997609890848513e-07, "loss": 0.8115, "step": 146 }, { "epoch": 0.040079067548224385, "grad_norm": 2.092595894778474, "learning_rate": 9.997471373750602e-07, "loss": 0.7252, "step": 147 }, { "epoch": 0.04035171426623952, "grad_norm": 4.212218005119078, "learning_rate": 9.997328956096597e-07, "loss": 0.7993, "step": 148 }, { "epoch": 0.040624360984254655, "grad_norm": 2.254259506836924, "learning_rate": 9.997182637997662e-07, "loss": 0.7184, "step": 149 }, { "epoch": 0.04089700770226978, "grad_norm": 6.322343303108434, "learning_rate": 9.99703241956799e-07, "loss": 0.7498, "step": 150 }, { "epoch": 0.04116965442028492, "grad_norm": 2.3394402118272715, "learning_rate": 9.996878300924835e-07, "loss": 0.7947, "step": 151 }, { "epoch": 0.041442301138300046, "grad_norm": 13.172657944841186, "learning_rate": 9.996720282188486e-07, "loss": 0.7862, "step": 152 }, { "epoch": 0.04171494785631518, "grad_norm": 3.1792012374302856, "learning_rate": 9.996558363482276e-07, "loss": 0.8239, "step": 153 }, { "epoch": 0.04198759457433031, "grad_norm": 2.028442601209087, "learning_rate": 9.996392544932584e-07, "loss": 0.7831, "step": 154 }, { "epoch": 0.04226024129234544, "grad_norm": 3.7390316549705997, "learning_rate": 9.996222826668833e-07, "loss": 0.7649, "step": 155 }, { "epoch": 0.04253288801036058, "grad_norm": 2.71099232710514, "learning_rate": 9.996049208823487e-07, "loss": 0.7049, "step": 156 }, { "epoch": 0.042805534728375706, "grad_norm": 7.178389783953226, "learning_rate": 9.995871691532059e-07, "loss": 0.7393, "step": 157 }, { "epoch": 0.04307818144639084, "grad_norm": 2.60078640223263, "learning_rate": 9.9956902749331e-07, "loss": 0.7622, "step": 158 }, { "epoch": 0.04335082816440597, "grad_norm": 26.839721611200595, "learning_rate": 9.995504959168206e-07, "loss": 0.8223, "step": 159 }, { "epoch": 0.043623474882421104, "grad_norm": 3.699308945298128, "learning_rate": 9.99531574438202e-07, "loss": 0.8122, "step": 160 }, { "epoch": 0.04389612160043623, "grad_norm": 2.937295513857142, "learning_rate": 9.995122630722223e-07, "loss": 0.7307, "step": 161 }, { "epoch": 0.04416876831845137, "grad_norm": 2.7162379605080895, "learning_rate": 9.994925618339543e-07, "loss": 0.8018, "step": 162 }, { "epoch": 0.0444414150364665, "grad_norm": 2.7399239714958292, "learning_rate": 9.994724707387747e-07, "loss": 0.7589, "step": 163 }, { "epoch": 0.04471406175448163, "grad_norm": 155.25294889424922, "learning_rate": 9.99451989802365e-07, "loss": 0.8031, "step": 164 }, { "epoch": 0.044986708472496764, "grad_norm": 3.494633726278638, "learning_rate": 9.994311190407106e-07, "loss": 0.6711, "step": 165 }, { "epoch": 0.04525935519051189, "grad_norm": 2.2906934082173565, "learning_rate": 9.99409858470101e-07, "loss": 0.725, "step": 166 }, { "epoch": 0.04553200190852703, "grad_norm": 2.6741202962018282, "learning_rate": 9.993882081071305e-07, "loss": 0.8072, "step": 167 }, { "epoch": 0.045804648626542155, "grad_norm": 3.7780887384384063, "learning_rate": 9.993661679686973e-07, "loss": 0.7967, "step": 168 }, { "epoch": 0.04607729534455729, "grad_norm": 2.526294275530266, "learning_rate": 9.993437380720037e-07, "loss": 0.7158, "step": 169 }, { "epoch": 0.046349942062572425, "grad_norm": 2.379515328871547, "learning_rate": 9.993209184345567e-07, "loss": 0.7015, "step": 170 }, { "epoch": 0.04662258878058755, "grad_norm": 2.6471829570337158, "learning_rate": 9.992977090741668e-07, "loss": 0.7786, "step": 171 }, { "epoch": 0.04689523549860269, "grad_norm": 3.1771430642833716, "learning_rate": 9.99274110008949e-07, "loss": 0.7004, "step": 172 }, { "epoch": 0.047167882216617815, "grad_norm": 3.0649598422050577, "learning_rate": 9.99250121257323e-07, "loss": 0.8052, "step": 173 }, { "epoch": 0.04744052893463295, "grad_norm": 2.8244687931370236, "learning_rate": 9.992257428380115e-07, "loss": 0.797, "step": 174 }, { "epoch": 0.04771317565264808, "grad_norm": 4.022283227379177, "learning_rate": 9.992009747700428e-07, "loss": 0.6881, "step": 175 }, { "epoch": 0.04798582237066321, "grad_norm": 2.2745469032758154, "learning_rate": 9.991758170727476e-07, "loss": 0.7592, "step": 176 }, { "epoch": 0.04825846908867835, "grad_norm": 2.103719977891981, "learning_rate": 9.991502697657623e-07, "loss": 0.7367, "step": 177 }, { "epoch": 0.048531115806693476, "grad_norm": 2.668218781186664, "learning_rate": 9.991243328690266e-07, "loss": 0.8251, "step": 178 }, { "epoch": 0.04880376252470861, "grad_norm": 2.543333926692166, "learning_rate": 9.990980064027843e-07, "loss": 0.8206, "step": 179 }, { "epoch": 0.04907640924272374, "grad_norm": 3.3363124329036156, "learning_rate": 9.990712903875834e-07, "loss": 0.8026, "step": 180 }, { "epoch": 0.04934905596073887, "grad_norm": 19.99556486306425, "learning_rate": 9.99044184844276e-07, "loss": 0.7605, "step": 181 }, { "epoch": 0.049621702678754, "grad_norm": 4.883515078255179, "learning_rate": 9.99016689794018e-07, "loss": 0.7579, "step": 182 }, { "epoch": 0.049894349396769136, "grad_norm": 2.284440162703846, "learning_rate": 9.989888052582693e-07, "loss": 0.7176, "step": 183 }, { "epoch": 0.05016699611478427, "grad_norm": 4.545270829279444, "learning_rate": 9.989605312587944e-07, "loss": 0.6673, "step": 184 }, { "epoch": 0.0504396428327994, "grad_norm": 2.9120751561261424, "learning_rate": 9.989318678176608e-07, "loss": 0.7043, "step": 185 }, { "epoch": 0.050712289550814534, "grad_norm": 3.810046608072521, "learning_rate": 9.98902814957241e-07, "loss": 0.7338, "step": 186 }, { "epoch": 0.05098493626882966, "grad_norm": 4.624533360274892, "learning_rate": 9.988733727002106e-07, "loss": 0.7832, "step": 187 }, { "epoch": 0.051257582986844796, "grad_norm": 2.7726982537997205, "learning_rate": 9.988435410695494e-07, "loss": 0.7786, "step": 188 }, { "epoch": 0.05153022970485993, "grad_norm": 3.303277144222094, "learning_rate": 9.988133200885413e-07, "loss": 0.7432, "step": 189 }, { "epoch": 0.05180287642287506, "grad_norm": 8.265599668266956, "learning_rate": 9.987827097807741e-07, "loss": 0.7312, "step": 190 }, { "epoch": 0.052075523140890194, "grad_norm": 2.6346943461800314, "learning_rate": 9.987517101701393e-07, "loss": 0.732, "step": 191 }, { "epoch": 0.05234816985890532, "grad_norm": 3.737434237905701, "learning_rate": 9.987203212808319e-07, "loss": 0.7597, "step": 192 }, { "epoch": 0.05262081657692046, "grad_norm": 2.5254085270847013, "learning_rate": 9.986885431373516e-07, "loss": 0.7495, "step": 193 }, { "epoch": 0.052893463294935585, "grad_norm": 2.881476313819538, "learning_rate": 9.98656375764501e-07, "loss": 0.6799, "step": 194 }, { "epoch": 0.05316611001295072, "grad_norm": 2.9205450890479816, "learning_rate": 9.986238191873872e-07, "loss": 0.7646, "step": 195 }, { "epoch": 0.053438756730965854, "grad_norm": 2.4351288269567335, "learning_rate": 9.985908734314209e-07, "loss": 0.7483, "step": 196 }, { "epoch": 0.05371140344898098, "grad_norm": 2.828976251959388, "learning_rate": 9.98557538522316e-07, "loss": 0.6773, "step": 197 }, { "epoch": 0.05398405016699612, "grad_norm": 3.8067970768543318, "learning_rate": 9.985238144860913e-07, "loss": 0.7356, "step": 198 }, { "epoch": 0.054256696885011245, "grad_norm": 3.3997622869578024, "learning_rate": 9.984897013490679e-07, "loss": 0.7385, "step": 199 }, { "epoch": 0.05452934360302638, "grad_norm": 2.537003329255559, "learning_rate": 9.984551991378716e-07, "loss": 0.7214, "step": 200 }, { "epoch": 0.05480199032104151, "grad_norm": 3.0973896693759677, "learning_rate": 9.984203078794319e-07, "loss": 0.8004, "step": 201 }, { "epoch": 0.05507463703905664, "grad_norm": 2.2383809271230053, "learning_rate": 9.98385027600981e-07, "loss": 0.7514, "step": 202 }, { "epoch": 0.05534728375707178, "grad_norm": 2.939744150342932, "learning_rate": 9.983493583300556e-07, "loss": 0.7827, "step": 203 }, { "epoch": 0.055619930475086905, "grad_norm": 5.187776873383872, "learning_rate": 9.98313300094496e-07, "loss": 0.749, "step": 204 }, { "epoch": 0.05589257719310204, "grad_norm": 3.760372860611929, "learning_rate": 9.98276852922446e-07, "loss": 0.7094, "step": 205 }, { "epoch": 0.05616522391111717, "grad_norm": 6.1532233998844985, "learning_rate": 9.982400168423522e-07, "loss": 0.7778, "step": 206 }, { "epoch": 0.0564378706291323, "grad_norm": 2.270639298808816, "learning_rate": 9.982027918829659e-07, "loss": 0.7841, "step": 207 }, { "epoch": 0.05671051734714743, "grad_norm": 5.011139933672726, "learning_rate": 9.981651780733411e-07, "loss": 0.7373, "step": 208 }, { "epoch": 0.056983164065162566, "grad_norm": 12.992749029320922, "learning_rate": 9.98127175442836e-07, "loss": 0.738, "step": 209 }, { "epoch": 0.0572558107831777, "grad_norm": 2.9099440752190593, "learning_rate": 9.980887840211114e-07, "loss": 0.7628, "step": 210 }, { "epoch": 0.05752845750119283, "grad_norm": 3.6922410931960026, "learning_rate": 9.980500038381324e-07, "loss": 0.6789, "step": 211 }, { "epoch": 0.05780110421920796, "grad_norm": 3.33389703308959, "learning_rate": 9.98010834924167e-07, "loss": 0.7263, "step": 212 }, { "epoch": 0.05807375093722309, "grad_norm": 7.213538688971588, "learning_rate": 9.979712773097867e-07, "loss": 0.7995, "step": 213 }, { "epoch": 0.058346397655238226, "grad_norm": 1.8750264419809353, "learning_rate": 9.979313310258667e-07, "loss": 0.6717, "step": 214 }, { "epoch": 0.058619044373253354, "grad_norm": 4.038439249770481, "learning_rate": 9.97890996103585e-07, "loss": 0.7856, "step": 215 }, { "epoch": 0.05889169109126849, "grad_norm": 2.4569165826481028, "learning_rate": 9.978502725744237e-07, "loss": 0.775, "step": 216 }, { "epoch": 0.059164337809283624, "grad_norm": 7.729800586606809, "learning_rate": 9.978091604701672e-07, "loss": 0.7147, "step": 217 }, { "epoch": 0.05943698452729875, "grad_norm": 2.4658348251104867, "learning_rate": 9.97767659822904e-07, "loss": 0.7834, "step": 218 }, { "epoch": 0.059709631245313886, "grad_norm": 4.048500719878393, "learning_rate": 9.977257706650258e-07, "loss": 0.7292, "step": 219 }, { "epoch": 0.059982277963329014, "grad_norm": 2.4179773173574812, "learning_rate": 9.97683493029227e-07, "loss": 0.6942, "step": 220 }, { "epoch": 0.06025492468134415, "grad_norm": 7.71821678574368, "learning_rate": 9.976408269485057e-07, "loss": 0.8104, "step": 221 }, { "epoch": 0.06052757139935928, "grad_norm": 5.782033555331807, "learning_rate": 9.97597772456163e-07, "loss": 0.7531, "step": 222 }, { "epoch": 0.06080021811737441, "grad_norm": 2.885062612118746, "learning_rate": 9.975543295858033e-07, "loss": 0.7784, "step": 223 }, { "epoch": 0.06107286483538955, "grad_norm": 4.010588570850995, "learning_rate": 9.975104983713339e-07, "loss": 0.731, "step": 224 }, { "epoch": 0.061345511553404675, "grad_norm": 2.647022709585418, "learning_rate": 9.974662788469652e-07, "loss": 0.7051, "step": 225 }, { "epoch": 0.06161815827141981, "grad_norm": 3.781588342371108, "learning_rate": 9.97421671047211e-07, "loss": 0.7849, "step": 226 }, { "epoch": 0.06189080498943494, "grad_norm": 3.3130814717567123, "learning_rate": 9.97376675006888e-07, "loss": 0.7125, "step": 227 }, { "epoch": 0.06216345170745007, "grad_norm": 10.119766076890143, "learning_rate": 9.973312907611158e-07, "loss": 0.7499, "step": 228 }, { "epoch": 0.0624360984254652, "grad_norm": 3.8668780514163372, "learning_rate": 9.972855183453169e-07, "loss": 0.7972, "step": 229 }, { "epoch": 0.06270874514348034, "grad_norm": 2.6638666071239485, "learning_rate": 9.97239357795217e-07, "loss": 0.6834, "step": 230 }, { "epoch": 0.06298139186149547, "grad_norm": 4.333470701889349, "learning_rate": 9.97192809146845e-07, "loss": 0.7098, "step": 231 }, { "epoch": 0.0632540385795106, "grad_norm": 12.25493762202695, "learning_rate": 9.971458724365323e-07, "loss": 0.7145, "step": 232 }, { "epoch": 0.06352668529752573, "grad_norm": 7.390905286794014, "learning_rate": 9.970985477009129e-07, "loss": 0.778, "step": 233 }, { "epoch": 0.06379933201554086, "grad_norm": 2.34245340792967, "learning_rate": 9.970508349769245e-07, "loss": 0.7141, "step": 234 }, { "epoch": 0.064071978733556, "grad_norm": 8.227733566979822, "learning_rate": 9.970027343018066e-07, "loss": 0.7341, "step": 235 }, { "epoch": 0.06434462545157113, "grad_norm": 19.530930630823885, "learning_rate": 9.969542457131027e-07, "loss": 0.7482, "step": 236 }, { "epoch": 0.06461727216958627, "grad_norm": 2.4526886399967966, "learning_rate": 9.969053692486582e-07, "loss": 0.7959, "step": 237 }, { "epoch": 0.06488991888760139, "grad_norm": 1.8295210728670865, "learning_rate": 9.968561049466213e-07, "loss": 0.7649, "step": 238 }, { "epoch": 0.06516256560561652, "grad_norm": 2.7174326765813492, "learning_rate": 9.96806452845443e-07, "loss": 0.7645, "step": 239 }, { "epoch": 0.06543521232363166, "grad_norm": 4.777564599470371, "learning_rate": 9.967564129838777e-07, "loss": 0.7259, "step": 240 }, { "epoch": 0.06570785904164679, "grad_norm": 12.620355898892193, "learning_rate": 9.96705985400981e-07, "loss": 0.7975, "step": 241 }, { "epoch": 0.06598050575966191, "grad_norm": 2.3788129718781232, "learning_rate": 9.966551701361126e-07, "loss": 0.7893, "step": 242 }, { "epoch": 0.06625315247767705, "grad_norm": 6.845642626602085, "learning_rate": 9.966039672289335e-07, "loss": 0.7156, "step": 243 }, { "epoch": 0.06652579919569218, "grad_norm": 3.2048162507332125, "learning_rate": 9.965523767194083e-07, "loss": 0.8177, "step": 244 }, { "epoch": 0.06679844591370732, "grad_norm": 3.9170917812363473, "learning_rate": 9.965003986478037e-07, "loss": 0.73, "step": 245 }, { "epoch": 0.06707109263172245, "grad_norm": 2.0525155556272865, "learning_rate": 9.964480330546886e-07, "loss": 0.756, "step": 246 }, { "epoch": 0.06734373934973757, "grad_norm": 2.654068695650492, "learning_rate": 9.963952799809351e-07, "loss": 0.7256, "step": 247 }, { "epoch": 0.0676163860677527, "grad_norm": 2.2111353285920656, "learning_rate": 9.96342139467717e-07, "loss": 0.7405, "step": 248 }, { "epoch": 0.06788903278576784, "grad_norm": 1.7257434881755198, "learning_rate": 9.962886115565106e-07, "loss": 0.7925, "step": 249 }, { "epoch": 0.06816167950378298, "grad_norm": 2.205639208699663, "learning_rate": 9.96234696289095e-07, "loss": 0.7593, "step": 250 }, { "epoch": 0.06843432622179811, "grad_norm": 2.923416299817598, "learning_rate": 9.961803937075514e-07, "loss": 0.7201, "step": 251 }, { "epoch": 0.06870697293981323, "grad_norm": 2.4798959293186753, "learning_rate": 9.961257038542633e-07, "loss": 0.782, "step": 252 }, { "epoch": 0.06897961965782837, "grad_norm": 2.3855848969371567, "learning_rate": 9.960706267719164e-07, "loss": 0.7494, "step": 253 }, { "epoch": 0.0692522663758435, "grad_norm": 3.9536998001476746, "learning_rate": 9.960151625034984e-07, "loss": 0.7784, "step": 254 }, { "epoch": 0.06952491309385864, "grad_norm": 5.146545762704756, "learning_rate": 9.959593110923e-07, "loss": 0.7255, "step": 255 }, { "epoch": 0.06979755981187376, "grad_norm": 2.364426740857767, "learning_rate": 9.959030725819135e-07, "loss": 0.7703, "step": 256 }, { "epoch": 0.07007020652988889, "grad_norm": 2.6494308070207455, "learning_rate": 9.95846447016233e-07, "loss": 0.7228, "step": 257 }, { "epoch": 0.07034285324790403, "grad_norm": 3.312213073679546, "learning_rate": 9.957894344394554e-07, "loss": 0.698, "step": 258 }, { "epoch": 0.07061549996591916, "grad_norm": 2.1058944827140214, "learning_rate": 9.957320348960791e-07, "loss": 0.7099, "step": 259 }, { "epoch": 0.0708881466839343, "grad_norm": 3.401564171511839, "learning_rate": 9.956742484309052e-07, "loss": 0.6652, "step": 260 }, { "epoch": 0.07116079340194942, "grad_norm": 7.728122498834332, "learning_rate": 9.956160750890359e-07, "loss": 0.7745, "step": 261 }, { "epoch": 0.07143344011996455, "grad_norm": 2.4403527928214324, "learning_rate": 9.955575149158763e-07, "loss": 0.773, "step": 262 }, { "epoch": 0.07170608683797969, "grad_norm": 2.621868751318946, "learning_rate": 9.954985679571326e-07, "loss": 0.7146, "step": 263 }, { "epoch": 0.07197873355599482, "grad_norm": 2.3585824076248807, "learning_rate": 9.954392342588133e-07, "loss": 0.7325, "step": 264 }, { "epoch": 0.07225138027400996, "grad_norm": 2.318243203604457, "learning_rate": 9.95379513867229e-07, "loss": 0.7331, "step": 265 }, { "epoch": 0.07252402699202508, "grad_norm": 2.964444960042265, "learning_rate": 9.953194068289915e-07, "loss": 0.7052, "step": 266 }, { "epoch": 0.07279667371004021, "grad_norm": 2.795250454691837, "learning_rate": 9.952589131910148e-07, "loss": 0.7377, "step": 267 }, { "epoch": 0.07306932042805535, "grad_norm": 4.917360542785966, "learning_rate": 9.951980330005145e-07, "loss": 0.7169, "step": 268 }, { "epoch": 0.07334196714607048, "grad_norm": 3.4031978095786717, "learning_rate": 9.95136766305008e-07, "loss": 0.7737, "step": 269 }, { "epoch": 0.07361461386408562, "grad_norm": 3.868107698740403, "learning_rate": 9.950751131523146e-07, "loss": 0.7324, "step": 270 }, { "epoch": 0.07388726058210074, "grad_norm": 2.9055493494618565, "learning_rate": 9.950130735905544e-07, "loss": 0.8062, "step": 271 }, { "epoch": 0.07415990730011587, "grad_norm": 2.5281352920251123, "learning_rate": 9.9495064766815e-07, "loss": 0.8057, "step": 272 }, { "epoch": 0.07443255401813101, "grad_norm": 2.2998631440297768, "learning_rate": 9.94887835433825e-07, "loss": 0.7356, "step": 273 }, { "epoch": 0.07470520073614614, "grad_norm": 4.129819190256559, "learning_rate": 9.94824636936605e-07, "loss": 0.7636, "step": 274 }, { "epoch": 0.07497784745416126, "grad_norm": 4.671777848056066, "learning_rate": 9.947610522258166e-07, "loss": 0.7742, "step": 275 }, { "epoch": 0.0752504941721764, "grad_norm": 2.013765913312642, "learning_rate": 9.946970813510883e-07, "loss": 0.6864, "step": 276 }, { "epoch": 0.07552314089019153, "grad_norm": 3.1837327060619205, "learning_rate": 9.946327243623491e-07, "loss": 0.7147, "step": 277 }, { "epoch": 0.07579578760820667, "grad_norm": 3.5476569467573125, "learning_rate": 9.945679813098309e-07, "loss": 0.7765, "step": 278 }, { "epoch": 0.0760684343262218, "grad_norm": 1.9678448065546834, "learning_rate": 9.945028522440653e-07, "loss": 0.7346, "step": 279 }, { "epoch": 0.07634108104423692, "grad_norm": 2.1188628379610313, "learning_rate": 9.944373372158862e-07, "loss": 0.7697, "step": 280 }, { "epoch": 0.07661372776225206, "grad_norm": 5.089067986208042, "learning_rate": 9.943714362764287e-07, "loss": 0.6731, "step": 281 }, { "epoch": 0.0768863744802672, "grad_norm": 2.7631609054467003, "learning_rate": 9.943051494771287e-07, "loss": 0.7646, "step": 282 }, { "epoch": 0.07715902119828233, "grad_norm": 2.8122529407796235, "learning_rate": 9.942384768697232e-07, "loss": 0.7503, "step": 283 }, { "epoch": 0.07743166791629746, "grad_norm": 3.2561671381274535, "learning_rate": 9.941714185062507e-07, "loss": 0.7242, "step": 284 }, { "epoch": 0.07770431463431259, "grad_norm": 7.068952580607953, "learning_rate": 9.94103974439051e-07, "loss": 0.7195, "step": 285 }, { "epoch": 0.07797696135232772, "grad_norm": 2.5726924950983774, "learning_rate": 9.94036144720764e-07, "loss": 0.7209, "step": 286 }, { "epoch": 0.07824960807034285, "grad_norm": 2.5412404174425953, "learning_rate": 9.939679294043318e-07, "loss": 0.7277, "step": 287 }, { "epoch": 0.07852225478835799, "grad_norm": 2.5472124810910004, "learning_rate": 9.938993285429962e-07, "loss": 0.7063, "step": 288 }, { "epoch": 0.07879490150637311, "grad_norm": 6.338650265128365, "learning_rate": 9.93830342190301e-07, "loss": 0.7575, "step": 289 }, { "epoch": 0.07906754822438825, "grad_norm": 3.849143767007227, "learning_rate": 9.93760970400091e-07, "loss": 0.7262, "step": 290 }, { "epoch": 0.07934019494240338, "grad_norm": 3.015952589565951, "learning_rate": 9.9369121322651e-07, "loss": 0.7683, "step": 291 }, { "epoch": 0.07961284166041852, "grad_norm": 22.233378717440658, "learning_rate": 9.93621070724005e-07, "loss": 0.768, "step": 292 }, { "epoch": 0.07988548837843365, "grad_norm": 2.2245556905879007, "learning_rate": 9.93550542947322e-07, "loss": 0.7318, "step": 293 }, { "epoch": 0.08015813509644877, "grad_norm": 2.3212108006419734, "learning_rate": 9.934796299515087e-07, "loss": 0.7814, "step": 294 }, { "epoch": 0.0804307818144639, "grad_norm": 2.0341341247658433, "learning_rate": 9.934083317919128e-07, "loss": 0.7259, "step": 295 }, { "epoch": 0.08070342853247904, "grad_norm": 2.392772420433164, "learning_rate": 9.933366485241834e-07, "loss": 0.7493, "step": 296 }, { "epoch": 0.08097607525049418, "grad_norm": 3.196243748656725, "learning_rate": 9.932645802042693e-07, "loss": 0.7238, "step": 297 }, { "epoch": 0.08124872196850931, "grad_norm": 2.5317267511967945, "learning_rate": 9.931921268884206e-07, "loss": 0.7112, "step": 298 }, { "epoch": 0.08152136868652443, "grad_norm": 3.144454350077618, "learning_rate": 9.93119288633187e-07, "loss": 0.735, "step": 299 }, { "epoch": 0.08179401540453957, "grad_norm": 1.84973687046946, "learning_rate": 9.930460654954196e-07, "loss": 0.7399, "step": 300 }, { "epoch": 0.0820666621225547, "grad_norm": 3.1429246079933604, "learning_rate": 9.929724575322696e-07, "loss": 0.6968, "step": 301 }, { "epoch": 0.08233930884056984, "grad_norm": 6.248825441246589, "learning_rate": 9.928984648011881e-07, "loss": 0.6792, "step": 302 }, { "epoch": 0.08261195555858496, "grad_norm": 6.4783306223975625, "learning_rate": 9.928240873599272e-07, "loss": 0.7384, "step": 303 }, { "epoch": 0.08288460227660009, "grad_norm": 3.0560956616256107, "learning_rate": 9.927493252665387e-07, "loss": 0.7186, "step": 304 }, { "epoch": 0.08315724899461523, "grad_norm": 2.6380056484638494, "learning_rate": 9.926741785793752e-07, "loss": 0.8015, "step": 305 }, { "epoch": 0.08342989571263036, "grad_norm": 3.465762155248208, "learning_rate": 9.925986473570884e-07, "loss": 0.7697, "step": 306 }, { "epoch": 0.0837025424306455, "grad_norm": 2.431806708119997, "learning_rate": 9.925227316586314e-07, "loss": 0.6951, "step": 307 }, { "epoch": 0.08397518914866062, "grad_norm": 2.1441010920363497, "learning_rate": 9.924464315432573e-07, "loss": 0.7037, "step": 308 }, { "epoch": 0.08424783586667575, "grad_norm": 4.133612697690267, "learning_rate": 9.92369747070518e-07, "loss": 0.7486, "step": 309 }, { "epoch": 0.08452048258469089, "grad_norm": 1.9159498440196108, "learning_rate": 9.922926783002661e-07, "loss": 0.7002, "step": 310 }, { "epoch": 0.08479312930270602, "grad_norm": 3.26386961711345, "learning_rate": 9.92215225292655e-07, "loss": 0.7443, "step": 311 }, { "epoch": 0.08506577602072116, "grad_norm": 2.7079375184638796, "learning_rate": 9.921373881081368e-07, "loss": 0.7408, "step": 312 }, { "epoch": 0.08533842273873628, "grad_norm": 2.1845296882711978, "learning_rate": 9.92059166807464e-07, "loss": 0.7196, "step": 313 }, { "epoch": 0.08561106945675141, "grad_norm": 2.2835363793977073, "learning_rate": 9.919805614516885e-07, "loss": 0.679, "step": 314 }, { "epoch": 0.08588371617476655, "grad_norm": 1.9929949925429618, "learning_rate": 9.919015721021625e-07, "loss": 0.7668, "step": 315 }, { "epoch": 0.08615636289278168, "grad_norm": 4.352249727809832, "learning_rate": 9.918221988205378e-07, "loss": 0.7851, "step": 316 }, { "epoch": 0.0864290096107968, "grad_norm": 3.4943346782231886, "learning_rate": 9.917424416687652e-07, "loss": 0.6686, "step": 317 }, { "epoch": 0.08670165632881194, "grad_norm": 23.12938922176183, "learning_rate": 9.916623007090964e-07, "loss": 0.7662, "step": 318 }, { "epoch": 0.08697430304682707, "grad_norm": 2.8938467895021587, "learning_rate": 9.915817760040811e-07, "loss": 0.7113, "step": 319 }, { "epoch": 0.08724694976484221, "grad_norm": 3.1690461921849495, "learning_rate": 9.915008676165697e-07, "loss": 0.7182, "step": 320 }, { "epoch": 0.08751959648285734, "grad_norm": 2.3890741776340705, "learning_rate": 9.914195756097118e-07, "loss": 0.7613, "step": 321 }, { "epoch": 0.08779224320087246, "grad_norm": 2.525835142762597, "learning_rate": 9.91337900046956e-07, "loss": 0.723, "step": 322 }, { "epoch": 0.0880648899188876, "grad_norm": 2.241382568868164, "learning_rate": 9.912558409920508e-07, "loss": 0.7233, "step": 323 }, { "epoch": 0.08833753663690273, "grad_norm": 2.3371040547219875, "learning_rate": 9.911733985090437e-07, "loss": 0.6871, "step": 324 }, { "epoch": 0.08861018335491787, "grad_norm": 1.844653225622064, "learning_rate": 9.910905726622815e-07, "loss": 0.6815, "step": 325 }, { "epoch": 0.088882830072933, "grad_norm": 2.4020983569791516, "learning_rate": 9.910073635164104e-07, "loss": 0.712, "step": 326 }, { "epoch": 0.08915547679094812, "grad_norm": 4.90592391320997, "learning_rate": 9.909237711363753e-07, "loss": 0.7521, "step": 327 }, { "epoch": 0.08942812350896326, "grad_norm": 2.0583319080730416, "learning_rate": 9.90839795587421e-07, "loss": 0.734, "step": 328 }, { "epoch": 0.0897007702269784, "grad_norm": 4.0118159307233485, "learning_rate": 9.907554369350905e-07, "loss": 0.7614, "step": 329 }, { "epoch": 0.08997341694499353, "grad_norm": 10.963585527022955, "learning_rate": 9.906706952452266e-07, "loss": 0.7693, "step": 330 }, { "epoch": 0.09024606366300866, "grad_norm": 3.08829976869329, "learning_rate": 9.905855705839706e-07, "loss": 0.7041, "step": 331 }, { "epoch": 0.09051871038102378, "grad_norm": 3.0332334963947987, "learning_rate": 9.905000630177625e-07, "loss": 0.6774, "step": 332 }, { "epoch": 0.09079135709903892, "grad_norm": 2.1617650879926975, "learning_rate": 9.904141726133417e-07, "loss": 0.6818, "step": 333 }, { "epoch": 0.09106400381705405, "grad_norm": 2.7947397566980627, "learning_rate": 9.903278994377462e-07, "loss": 0.7341, "step": 334 }, { "epoch": 0.09133665053506919, "grad_norm": 3.360969409445067, "learning_rate": 9.902412435583125e-07, "loss": 0.7562, "step": 335 }, { "epoch": 0.09160929725308431, "grad_norm": 2.8507838968168713, "learning_rate": 9.901542050426766e-07, "loss": 0.7805, "step": 336 }, { "epoch": 0.09188194397109944, "grad_norm": 1.9158531611694356, "learning_rate": 9.900667839587718e-07, "loss": 0.7326, "step": 337 }, { "epoch": 0.09215459068911458, "grad_norm": 3.6862824282004967, "learning_rate": 9.899789803748314e-07, "loss": 0.7296, "step": 338 }, { "epoch": 0.09242723740712971, "grad_norm": 2.109261982717954, "learning_rate": 9.898907943593862e-07, "loss": 0.7237, "step": 339 }, { "epoch": 0.09269988412514485, "grad_norm": 6.587994108594432, "learning_rate": 9.898022259812662e-07, "loss": 0.7611, "step": 340 }, { "epoch": 0.09297253084315997, "grad_norm": 3.0975971059254532, "learning_rate": 9.897132753095994e-07, "loss": 0.7242, "step": 341 }, { "epoch": 0.0932451775611751, "grad_norm": 2.552202167506714, "learning_rate": 9.896239424138125e-07, "loss": 0.7333, "step": 342 }, { "epoch": 0.09351782427919024, "grad_norm": 2.7926985739415575, "learning_rate": 9.8953422736363e-07, "loss": 0.6815, "step": 343 }, { "epoch": 0.09379047099720537, "grad_norm": 2.6748491850120257, "learning_rate": 9.894441302290754e-07, "loss": 0.7418, "step": 344 }, { "epoch": 0.09406311771522051, "grad_norm": 12.093649276037752, "learning_rate": 9.8935365108047e-07, "loss": 0.7243, "step": 345 }, { "epoch": 0.09433576443323563, "grad_norm": 2.7072506521782973, "learning_rate": 9.89262789988433e-07, "loss": 0.7176, "step": 346 }, { "epoch": 0.09460841115125077, "grad_norm": 2.39370462210432, "learning_rate": 9.89171547023882e-07, "loss": 0.6831, "step": 347 }, { "epoch": 0.0948810578692659, "grad_norm": 2.437588895598248, "learning_rate": 9.890799222580333e-07, "loss": 0.677, "step": 348 }, { "epoch": 0.09515370458728103, "grad_norm": 1.943327928232813, "learning_rate": 9.889879157624002e-07, "loss": 0.715, "step": 349 }, { "epoch": 0.09542635130529616, "grad_norm": 3.7315388346293514, "learning_rate": 9.88895527608794e-07, "loss": 0.7057, "step": 350 }, { "epoch": 0.09569899802331129, "grad_norm": 2.28591899675916, "learning_rate": 9.888027578693249e-07, "loss": 0.6678, "step": 351 }, { "epoch": 0.09597164474132643, "grad_norm": 3.6051650029988407, "learning_rate": 9.887096066163995e-07, "loss": 0.8171, "step": 352 }, { "epoch": 0.09624429145934156, "grad_norm": 2.018758154105944, "learning_rate": 9.886160739227233e-07, "loss": 0.7253, "step": 353 }, { "epoch": 0.0965169381773567, "grad_norm": 3.1690715219474304, "learning_rate": 9.88522159861299e-07, "loss": 0.7114, "step": 354 }, { "epoch": 0.09678958489537182, "grad_norm": 2.525978097898892, "learning_rate": 9.884278645054273e-07, "loss": 0.7388, "step": 355 }, { "epoch": 0.09706223161338695, "grad_norm": 1.9896984686779233, "learning_rate": 9.883331879287061e-07, "loss": 0.7483, "step": 356 }, { "epoch": 0.09733487833140209, "grad_norm": 3.623999979616081, "learning_rate": 9.88238130205031e-07, "loss": 0.7338, "step": 357 }, { "epoch": 0.09760752504941722, "grad_norm": 2.5433796591588305, "learning_rate": 9.881426914085952e-07, "loss": 0.7093, "step": 358 }, { "epoch": 0.09788017176743236, "grad_norm": 2.8596695468141182, "learning_rate": 9.880468716138892e-07, "loss": 0.6623, "step": 359 }, { "epoch": 0.09815281848544748, "grad_norm": 2.2683846887201398, "learning_rate": 9.87950670895701e-07, "loss": 0.7154, "step": 360 }, { "epoch": 0.09842546520346261, "grad_norm": 3.077494120819024, "learning_rate": 9.878540893291158e-07, "loss": 0.8236, "step": 361 }, { "epoch": 0.09869811192147775, "grad_norm": 2.17573987972439, "learning_rate": 9.87757126989516e-07, "loss": 0.7337, "step": 362 }, { "epoch": 0.09897075863949288, "grad_norm": 2.638832439945767, "learning_rate": 9.876597839525813e-07, "loss": 0.7733, "step": 363 }, { "epoch": 0.099243405357508, "grad_norm": 1.879454134214771, "learning_rate": 9.875620602942886e-07, "loss": 0.7552, "step": 364 }, { "epoch": 0.09951605207552314, "grad_norm": 2.24030584676272, "learning_rate": 9.874639560909118e-07, "loss": 0.7708, "step": 365 }, { "epoch": 0.09978869879353827, "grad_norm": 2.772638391086895, "learning_rate": 9.873654714190218e-07, "loss": 0.8095, "step": 366 }, { "epoch": 0.1000613455115534, "grad_norm": 4.0806575589611835, "learning_rate": 9.872666063554862e-07, "loss": 0.7691, "step": 367 }, { "epoch": 0.10033399222956854, "grad_norm": 2.2171178311864073, "learning_rate": 9.871673609774701e-07, "loss": 0.7115, "step": 368 }, { "epoch": 0.10060663894758366, "grad_norm": 2.097374735352501, "learning_rate": 9.870677353624352e-07, "loss": 0.7017, "step": 369 }, { "epoch": 0.1008792856655988, "grad_norm": 3.04148836918815, "learning_rate": 9.869677295881398e-07, "loss": 0.7385, "step": 370 }, { "epoch": 0.10115193238361393, "grad_norm": 4.8574613864827905, "learning_rate": 9.868673437326387e-07, "loss": 0.7728, "step": 371 }, { "epoch": 0.10142457910162907, "grad_norm": 4.632476543349325, "learning_rate": 9.86766577874284e-07, "loss": 0.7335, "step": 372 }, { "epoch": 0.1016972258196442, "grad_norm": 2.684640764580972, "learning_rate": 9.866654320917237e-07, "loss": 0.6984, "step": 373 }, { "epoch": 0.10196987253765932, "grad_norm": 2.386094872576494, "learning_rate": 9.86563906463903e-07, "loss": 0.7175, "step": 374 }, { "epoch": 0.10224251925567446, "grad_norm": 2.27633986868299, "learning_rate": 9.86462001070063e-07, "loss": 0.8017, "step": 375 }, { "epoch": 0.10251516597368959, "grad_norm": 2.5827890548997727, "learning_rate": 9.86359715989742e-07, "loss": 0.823, "step": 376 }, { "epoch": 0.10278781269170473, "grad_norm": 5.848909922670165, "learning_rate": 9.862570513027736e-07, "loss": 0.7487, "step": 377 }, { "epoch": 0.10306045940971986, "grad_norm": 2.5100174923323686, "learning_rate": 9.86154007089288e-07, "loss": 0.7865, "step": 378 }, { "epoch": 0.10333310612773498, "grad_norm": 2.8151196464634975, "learning_rate": 9.860505834297123e-07, "loss": 0.6803, "step": 379 }, { "epoch": 0.10360575284575012, "grad_norm": 3.13561834740273, "learning_rate": 9.859467804047693e-07, "loss": 0.6874, "step": 380 }, { "epoch": 0.10387839956376525, "grad_norm": 3.0310196333477677, "learning_rate": 9.858425980954777e-07, "loss": 0.7329, "step": 381 }, { "epoch": 0.10415104628178039, "grad_norm": 4.327877263025173, "learning_rate": 9.857380365831522e-07, "loss": 0.7132, "step": 382 }, { "epoch": 0.10442369299979551, "grad_norm": 2.352929555018582, "learning_rate": 9.85633095949404e-07, "loss": 0.7518, "step": 383 }, { "epoch": 0.10469633971781064, "grad_norm": 3.0841538290990345, "learning_rate": 9.855277762761398e-07, "loss": 0.6793, "step": 384 }, { "epoch": 0.10496898643582578, "grad_norm": 3.1477755168028017, "learning_rate": 9.854220776455624e-07, "loss": 0.7866, "step": 385 }, { "epoch": 0.10524163315384091, "grad_norm": 2.616769974557188, "learning_rate": 9.8531600014017e-07, "loss": 0.7162, "step": 386 }, { "epoch": 0.10551427987185605, "grad_norm": 2.7777788035239532, "learning_rate": 9.852095438427564e-07, "loss": 0.7644, "step": 387 }, { "epoch": 0.10578692658987117, "grad_norm": 2.850226487807443, "learning_rate": 9.85102708836412e-07, "loss": 0.7873, "step": 388 }, { "epoch": 0.1060595733078863, "grad_norm": 3.126902082845002, "learning_rate": 9.849954952045218e-07, "loss": 0.7514, "step": 389 }, { "epoch": 0.10633222002590144, "grad_norm": 3.0436358590407924, "learning_rate": 9.84887903030767e-07, "loss": 0.7886, "step": 390 }, { "epoch": 0.10660486674391657, "grad_norm": 2.807948108676688, "learning_rate": 9.847799323991233e-07, "loss": 0.7475, "step": 391 }, { "epoch": 0.10687751346193171, "grad_norm": 2.3789197111995297, "learning_rate": 9.84671583393863e-07, "loss": 0.7377, "step": 392 }, { "epoch": 0.10715016017994683, "grad_norm": 2.889050098007009, "learning_rate": 9.845628560995528e-07, "loss": 0.6529, "step": 393 }, { "epoch": 0.10742280689796196, "grad_norm": 2.2881698678810314, "learning_rate": 9.844537506010552e-07, "loss": 0.6706, "step": 394 }, { "epoch": 0.1076954536159771, "grad_norm": 2.9475718086055345, "learning_rate": 9.843442669835278e-07, "loss": 0.7066, "step": 395 }, { "epoch": 0.10796810033399223, "grad_norm": 3.88309386722723, "learning_rate": 9.84234405332423e-07, "loss": 0.697, "step": 396 }, { "epoch": 0.10824074705200736, "grad_norm": 3.090589120502419, "learning_rate": 9.841241657334885e-07, "loss": 0.8347, "step": 397 }, { "epoch": 0.10851339377002249, "grad_norm": 2.020181604131662, "learning_rate": 9.840135482727669e-07, "loss": 0.6686, "step": 398 }, { "epoch": 0.10878604048803762, "grad_norm": 2.9882598112857846, "learning_rate": 9.83902553036596e-07, "loss": 0.6776, "step": 399 }, { "epoch": 0.10905868720605276, "grad_norm": 2.0137770636855516, "learning_rate": 9.837911801116078e-07, "loss": 0.7319, "step": 400 }, { "epoch": 0.1093313339240679, "grad_norm": 3.084862929466016, "learning_rate": 9.836794295847304e-07, "loss": 0.6774, "step": 401 }, { "epoch": 0.10960398064208302, "grad_norm": 2.5810563569119673, "learning_rate": 9.835673015431847e-07, "loss": 0.7449, "step": 402 }, { "epoch": 0.10987662736009815, "grad_norm": 4.930767600570214, "learning_rate": 9.83454796074488e-07, "loss": 0.6897, "step": 403 }, { "epoch": 0.11014927407811329, "grad_norm": 2.793825730507813, "learning_rate": 9.833419132664512e-07, "loss": 0.7448, "step": 404 }, { "epoch": 0.11042192079612842, "grad_norm": 1.947177234093249, "learning_rate": 9.8322865320718e-07, "loss": 0.7033, "step": 405 }, { "epoch": 0.11069456751414355, "grad_norm": 2.791643791569829, "learning_rate": 9.831150159850749e-07, "loss": 0.7037, "step": 406 }, { "epoch": 0.11096721423215868, "grad_norm": 2.580039620838536, "learning_rate": 9.8300100168883e-07, "loss": 0.701, "step": 407 }, { "epoch": 0.11123986095017381, "grad_norm": 2.913108686974083, "learning_rate": 9.828866104074344e-07, "loss": 0.7496, "step": 408 }, { "epoch": 0.11151250766818895, "grad_norm": 2.9845758225375008, "learning_rate": 9.82771842230171e-07, "loss": 0.7422, "step": 409 }, { "epoch": 0.11178515438620408, "grad_norm": 2.1334292887896544, "learning_rate": 9.82656697246617e-07, "loss": 0.7402, "step": 410 }, { "epoch": 0.1120578011042192, "grad_norm": 2.113512071828122, "learning_rate": 9.82541175546644e-07, "loss": 0.6825, "step": 411 }, { "epoch": 0.11233044782223434, "grad_norm": 5.511720988829896, "learning_rate": 9.82425277220417e-07, "loss": 0.7308, "step": 412 }, { "epoch": 0.11260309454024947, "grad_norm": 2.712761039359136, "learning_rate": 9.823090023583957e-07, "loss": 0.7062, "step": 413 }, { "epoch": 0.1128757412582646, "grad_norm": 2.751932205500275, "learning_rate": 9.821923510513331e-07, "loss": 0.7178, "step": 414 }, { "epoch": 0.11314838797627974, "grad_norm": 2.351001843854399, "learning_rate": 9.820753233902766e-07, "loss": 0.7681, "step": 415 }, { "epoch": 0.11342103469429486, "grad_norm": 2.586484397894137, "learning_rate": 9.819579194665665e-07, "loss": 0.731, "step": 416 }, { "epoch": 0.11369368141231, "grad_norm": 2.33094434270762, "learning_rate": 9.818401393718376e-07, "loss": 0.7068, "step": 417 }, { "epoch": 0.11396632813032513, "grad_norm": 2.6884458358924017, "learning_rate": 9.817219831980177e-07, "loss": 0.7676, "step": 418 }, { "epoch": 0.11423897484834027, "grad_norm": 4.201280456856337, "learning_rate": 9.816034510373285e-07, "loss": 0.7865, "step": 419 }, { "epoch": 0.1145116215663554, "grad_norm": 3.0187533694394126, "learning_rate": 9.814845429822851e-07, "loss": 0.7809, "step": 420 }, { "epoch": 0.11478426828437052, "grad_norm": 2.269071646542116, "learning_rate": 9.81365259125696e-07, "loss": 0.7389, "step": 421 }, { "epoch": 0.11505691500238566, "grad_norm": 2.364721838240693, "learning_rate": 9.812455995606627e-07, "loss": 0.7099, "step": 422 }, { "epoch": 0.11532956172040079, "grad_norm": 2.1111374974882993, "learning_rate": 9.811255643805804e-07, "loss": 0.7359, "step": 423 }, { "epoch": 0.11560220843841593, "grad_norm": 3.2931629543558314, "learning_rate": 9.810051536791374e-07, "loss": 0.768, "step": 424 }, { "epoch": 0.11587485515643106, "grad_norm": 2.8788155923343326, "learning_rate": 9.80884367550315e-07, "loss": 0.6949, "step": 425 }, { "epoch": 0.11614750187444618, "grad_norm": 3.202126065824179, "learning_rate": 9.80763206088387e-07, "loss": 0.783, "step": 426 }, { "epoch": 0.11642014859246132, "grad_norm": 3.074196868395862, "learning_rate": 9.80641669387921e-07, "loss": 0.7173, "step": 427 }, { "epoch": 0.11669279531047645, "grad_norm": 2.623978988005273, "learning_rate": 9.805197575437771e-07, "loss": 0.7689, "step": 428 }, { "epoch": 0.11696544202849159, "grad_norm": 2.329315546934162, "learning_rate": 9.803974706511082e-07, "loss": 0.7686, "step": 429 }, { "epoch": 0.11723808874650671, "grad_norm": 4.655034868096206, "learning_rate": 9.802748088053598e-07, "loss": 0.6754, "step": 430 }, { "epoch": 0.11751073546452184, "grad_norm": 2.9122365009215754, "learning_rate": 9.801517721022705e-07, "loss": 0.7808, "step": 431 }, { "epoch": 0.11778338218253698, "grad_norm": 6.614817904576154, "learning_rate": 9.80028360637871e-07, "loss": 0.7684, "step": 432 }, { "epoch": 0.11805602890055211, "grad_norm": 3.194990328395329, "learning_rate": 9.799045745084847e-07, "loss": 0.757, "step": 433 }, { "epoch": 0.11832867561856725, "grad_norm": 2.248947590551858, "learning_rate": 9.797804138107273e-07, "loss": 0.6957, "step": 434 }, { "epoch": 0.11860132233658237, "grad_norm": 2.8509559822340016, "learning_rate": 9.79655878641507e-07, "loss": 0.7281, "step": 435 }, { "epoch": 0.1188739690545975, "grad_norm": 19.444841439502312, "learning_rate": 9.795309690980245e-07, "loss": 0.6599, "step": 436 }, { "epoch": 0.11914661577261264, "grad_norm": 7.089095735386049, "learning_rate": 9.79405685277772e-07, "loss": 0.7845, "step": 437 }, { "epoch": 0.11941926249062777, "grad_norm": 2.295797102360237, "learning_rate": 9.792800272785347e-07, "loss": 0.6726, "step": 438 }, { "epoch": 0.11969190920864291, "grad_norm": 2.457445506618536, "learning_rate": 9.79153995198389e-07, "loss": 0.7256, "step": 439 }, { "epoch": 0.11996455592665803, "grad_norm": 2.4094005037100117, "learning_rate": 9.790275891357043e-07, "loss": 0.7176, "step": 440 }, { "epoch": 0.12023720264467316, "grad_norm": 2.7083702821362694, "learning_rate": 9.789008091891407e-07, "loss": 0.7407, "step": 441 }, { "epoch": 0.1205098493626883, "grad_norm": 2.8918196959048097, "learning_rate": 9.78773655457651e-07, "loss": 0.7337, "step": 442 }, { "epoch": 0.12078249608070343, "grad_norm": 4.642997296134165, "learning_rate": 9.786461280404792e-07, "loss": 0.7966, "step": 443 }, { "epoch": 0.12105514279871855, "grad_norm": 2.861403427541752, "learning_rate": 9.785182270371612e-07, "loss": 0.7589, "step": 444 }, { "epoch": 0.12132778951673369, "grad_norm": 2.6438786778619736, "learning_rate": 9.783899525475248e-07, "loss": 0.7284, "step": 445 }, { "epoch": 0.12160043623474882, "grad_norm": 1.7827864590316513, "learning_rate": 9.782613046716887e-07, "loss": 0.7157, "step": 446 }, { "epoch": 0.12187308295276396, "grad_norm": 1.9063049763314945, "learning_rate": 9.781322835100637e-07, "loss": 0.7146, "step": 447 }, { "epoch": 0.1221457296707791, "grad_norm": 2.2721399407139145, "learning_rate": 9.780028891633514e-07, "loss": 0.7556, "step": 448 }, { "epoch": 0.12241837638879421, "grad_norm": 2.6042543658275794, "learning_rate": 9.778731217325444e-07, "loss": 0.7238, "step": 449 }, { "epoch": 0.12269102310680935, "grad_norm": 3.7031449087617982, "learning_rate": 9.777429813189277e-07, "loss": 0.7479, "step": 450 }, { "epoch": 0.12296366982482448, "grad_norm": 2.822515664007574, "learning_rate": 9.776124680240764e-07, "loss": 0.7228, "step": 451 }, { "epoch": 0.12323631654283962, "grad_norm": 3.119399462844459, "learning_rate": 9.774815819498566e-07, "loss": 0.7728, "step": 452 }, { "epoch": 0.12350896326085475, "grad_norm": 1.944024591096134, "learning_rate": 9.773503231984258e-07, "loss": 0.7059, "step": 453 }, { "epoch": 0.12378160997886987, "grad_norm": 2.460800834282024, "learning_rate": 9.772186918722325e-07, "loss": 0.6965, "step": 454 }, { "epoch": 0.12405425669688501, "grad_norm": 2.270782966343311, "learning_rate": 9.770866880740156e-07, "loss": 0.7765, "step": 455 }, { "epoch": 0.12432690341490014, "grad_norm": 3.9285623472543922, "learning_rate": 9.769543119068046e-07, "loss": 0.7587, "step": 456 }, { "epoch": 0.12459955013291528, "grad_norm": 6.307631459619789, "learning_rate": 9.768215634739201e-07, "loss": 0.7194, "step": 457 }, { "epoch": 0.1248721968509304, "grad_norm": 2.833887789181686, "learning_rate": 9.766884428789732e-07, "loss": 0.6863, "step": 458 }, { "epoch": 0.12514484356894554, "grad_norm": 3.6026705922153544, "learning_rate": 9.765549502258649e-07, "loss": 0.7331, "step": 459 }, { "epoch": 0.12541749028696067, "grad_norm": 2.4685805031180243, "learning_rate": 9.764210856187877e-07, "loss": 0.6855, "step": 460 }, { "epoch": 0.1256901370049758, "grad_norm": 2.4688801125777826, "learning_rate": 9.762868491622228e-07, "loss": 0.7692, "step": 461 }, { "epoch": 0.12596278372299094, "grad_norm": 7.52996138948621, "learning_rate": 9.761522409609433e-07, "loss": 0.6637, "step": 462 }, { "epoch": 0.12623543044100607, "grad_norm": 4.16753682996507, "learning_rate": 9.760172611200113e-07, "loss": 0.7326, "step": 463 }, { "epoch": 0.1265080771590212, "grad_norm": 3.026845755127748, "learning_rate": 9.758819097447795e-07, "loss": 0.7435, "step": 464 }, { "epoch": 0.12678072387703632, "grad_norm": 2.694312618641871, "learning_rate": 9.757461869408904e-07, "loss": 0.7378, "step": 465 }, { "epoch": 0.12705337059505145, "grad_norm": 8.64082887410928, "learning_rate": 9.756100928142765e-07, "loss": 0.7336, "step": 466 }, { "epoch": 0.1273260173130666, "grad_norm": 2.8020253814719407, "learning_rate": 9.754736274711601e-07, "loss": 0.801, "step": 467 }, { "epoch": 0.12759866403108172, "grad_norm": 3.548036765200782, "learning_rate": 9.753367910180532e-07, "loss": 0.7445, "step": 468 }, { "epoch": 0.12787131074909686, "grad_norm": 2.425087360579484, "learning_rate": 9.751995835617574e-07, "loss": 0.631, "step": 469 }, { "epoch": 0.128143957467112, "grad_norm": 2.7360722522065815, "learning_rate": 9.75062005209364e-07, "loss": 0.7007, "step": 470 }, { "epoch": 0.12841660418512713, "grad_norm": 1.8395966741377474, "learning_rate": 9.74924056068254e-07, "loss": 0.7664, "step": 471 }, { "epoch": 0.12868925090314226, "grad_norm": 2.1192428842502204, "learning_rate": 9.747857362460968e-07, "loss": 0.6971, "step": 472 }, { "epoch": 0.1289618976211574, "grad_norm": 2.0324678352068273, "learning_rate": 9.746470458508525e-07, "loss": 0.6774, "step": 473 }, { "epoch": 0.12923454433917253, "grad_norm": 2.243874872301008, "learning_rate": 9.745079849907695e-07, "loss": 0.7221, "step": 474 }, { "epoch": 0.12950719105718764, "grad_norm": 2.358143190396531, "learning_rate": 9.743685537743856e-07, "loss": 0.7052, "step": 475 }, { "epoch": 0.12977983777520277, "grad_norm": 3.714414552476959, "learning_rate": 9.742287523105276e-07, "loss": 0.7077, "step": 476 }, { "epoch": 0.1300524844932179, "grad_norm": 2.4743274358718277, "learning_rate": 9.740885807083118e-07, "loss": 0.7189, "step": 477 }, { "epoch": 0.13032513121123304, "grad_norm": 9.894573235607359, "learning_rate": 9.739480390771427e-07, "loss": 0.8332, "step": 478 }, { "epoch": 0.13059777792924818, "grad_norm": 2.2379981752882263, "learning_rate": 9.73807127526714e-07, "loss": 0.6789, "step": 479 }, { "epoch": 0.1308704246472633, "grad_norm": 1.863812877718637, "learning_rate": 9.736658461670075e-07, "loss": 0.7556, "step": 480 }, { "epoch": 0.13114307136527845, "grad_norm": 2.439207562560368, "learning_rate": 9.73524195108295e-07, "loss": 0.6494, "step": 481 }, { "epoch": 0.13141571808329358, "grad_norm": 3.6310510752956957, "learning_rate": 9.733821744611351e-07, "loss": 0.6578, "step": 482 }, { "epoch": 0.13168836480130872, "grad_norm": 1.9250716900575646, "learning_rate": 9.732397843363762e-07, "loss": 0.7047, "step": 483 }, { "epoch": 0.13196101151932382, "grad_norm": 2.120183096626139, "learning_rate": 9.73097024845155e-07, "loss": 0.6889, "step": 484 }, { "epoch": 0.13223365823733896, "grad_norm": 2.5774720466279613, "learning_rate": 9.729538960988954e-07, "loss": 0.6956, "step": 485 }, { "epoch": 0.1325063049553541, "grad_norm": 2.525085524305542, "learning_rate": 9.728103982093108e-07, "loss": 0.7799, "step": 486 }, { "epoch": 0.13277895167336923, "grad_norm": 2.819438861354178, "learning_rate": 9.726665312884016e-07, "loss": 0.7455, "step": 487 }, { "epoch": 0.13305159839138436, "grad_norm": 2.768945107692899, "learning_rate": 9.725222954484575e-07, "loss": 0.7024, "step": 488 }, { "epoch": 0.1333242451093995, "grad_norm": 2.0474810888052244, "learning_rate": 9.723776908020548e-07, "loss": 0.6919, "step": 489 }, { "epoch": 0.13359689182741463, "grad_norm": 1.9440078369917624, "learning_rate": 9.722327174620588e-07, "loss": 0.7422, "step": 490 }, { "epoch": 0.13386953854542977, "grad_norm": 2.5948368626113614, "learning_rate": 9.720873755416217e-07, "loss": 0.7037, "step": 491 }, { "epoch": 0.1341421852634449, "grad_norm": 5.08959421386544, "learning_rate": 9.719416651541837e-07, "loss": 0.7438, "step": 492 }, { "epoch": 0.13441483198146004, "grad_norm": 2.3954721390881173, "learning_rate": 9.717955864134728e-07, "loss": 0.7218, "step": 493 }, { "epoch": 0.13468747869947514, "grad_norm": 1.724719565322906, "learning_rate": 9.716491394335043e-07, "loss": 0.7518, "step": 494 }, { "epoch": 0.13496012541749028, "grad_norm": 3.041557487219062, "learning_rate": 9.71502324328581e-07, "loss": 0.7417, "step": 495 }, { "epoch": 0.1352327721355054, "grad_norm": 1.9822788690864832, "learning_rate": 9.713551412132928e-07, "loss": 0.7232, "step": 496 }, { "epoch": 0.13550541885352055, "grad_norm": 2.4037516282030578, "learning_rate": 9.71207590202517e-07, "loss": 0.7499, "step": 497 }, { "epoch": 0.13577806557153568, "grad_norm": 2.335876057211659, "learning_rate": 9.710596714114181e-07, "loss": 0.7643, "step": 498 }, { "epoch": 0.13605071228955082, "grad_norm": 2.2798521092974378, "learning_rate": 9.709113849554478e-07, "loss": 0.7336, "step": 499 }, { "epoch": 0.13632335900756595, "grad_norm": 2.009533243389872, "learning_rate": 9.707627309503443e-07, "loss": 0.6942, "step": 500 }, { "epoch": 0.1365960057255811, "grad_norm": 2.3499350808503077, "learning_rate": 9.70613709512133e-07, "loss": 0.6941, "step": 501 }, { "epoch": 0.13686865244359622, "grad_norm": 2.728624025424667, "learning_rate": 9.704643207571257e-07, "loss": 0.7809, "step": 502 }, { "epoch": 0.13714129916161133, "grad_norm": 2.010048836875114, "learning_rate": 9.70314564801922e-07, "loss": 0.6726, "step": 503 }, { "epoch": 0.13741394587962646, "grad_norm": 2.7215405548744047, "learning_rate": 9.701644417634066e-07, "loss": 0.6837, "step": 504 }, { "epoch": 0.1376865925976416, "grad_norm": 2.001138930239582, "learning_rate": 9.700139517587515e-07, "loss": 0.6739, "step": 505 }, { "epoch": 0.13795923931565673, "grad_norm": 12.040758404163025, "learning_rate": 9.698630949054153e-07, "loss": 0.7195, "step": 506 }, { "epoch": 0.13823188603367187, "grad_norm": 2.674519813910211, "learning_rate": 9.697118713211428e-07, "loss": 0.7472, "step": 507 }, { "epoch": 0.138504532751687, "grad_norm": 3.5147561785060115, "learning_rate": 9.695602811239646e-07, "loss": 0.6855, "step": 508 }, { "epoch": 0.13877717946970214, "grad_norm": 1.999545856690547, "learning_rate": 9.694083244321978e-07, "loss": 0.7145, "step": 509 }, { "epoch": 0.13904982618771727, "grad_norm": 2.0524258187455433, "learning_rate": 9.692560013644456e-07, "loss": 0.6251, "step": 510 }, { "epoch": 0.1393224729057324, "grad_norm": 2.730336558278889, "learning_rate": 9.691033120395968e-07, "loss": 0.7207, "step": 511 }, { "epoch": 0.13959511962374752, "grad_norm": 3.480640635237784, "learning_rate": 9.689502565768266e-07, "loss": 0.7173, "step": 512 }, { "epoch": 0.13986776634176265, "grad_norm": 5.49219741195257, "learning_rate": 9.687968350955955e-07, "loss": 0.6884, "step": 513 }, { "epoch": 0.14014041305977779, "grad_norm": 2.4455208479572086, "learning_rate": 9.686430477156502e-07, "loss": 0.7152, "step": 514 }, { "epoch": 0.14041305977779292, "grad_norm": 5.301652530958417, "learning_rate": 9.684888945570225e-07, "loss": 0.7914, "step": 515 }, { "epoch": 0.14068570649580805, "grad_norm": 3.482976644824944, "learning_rate": 9.683343757400296e-07, "loss": 0.7601, "step": 516 }, { "epoch": 0.1409583532138232, "grad_norm": 2.5484756129989874, "learning_rate": 9.681794913852746e-07, "loss": 0.7733, "step": 517 }, { "epoch": 0.14123099993183832, "grad_norm": 3.2138557608143383, "learning_rate": 9.680242416136458e-07, "loss": 0.742, "step": 518 }, { "epoch": 0.14150364664985346, "grad_norm": 2.6923014835072023, "learning_rate": 9.678686265463161e-07, "loss": 0.7221, "step": 519 }, { "epoch": 0.1417762933678686, "grad_norm": 1.9463825512039463, "learning_rate": 9.677126463047444e-07, "loss": 0.6957, "step": 520 }, { "epoch": 0.14204894008588373, "grad_norm": 2.0775882857574794, "learning_rate": 9.675563010106741e-07, "loss": 0.6891, "step": 521 }, { "epoch": 0.14232158680389884, "grad_norm": 2.3553791554589676, "learning_rate": 9.673995907861336e-07, "loss": 0.7778, "step": 522 }, { "epoch": 0.14259423352191397, "grad_norm": 3.715815035631825, "learning_rate": 9.672425157534362e-07, "loss": 0.7368, "step": 523 }, { "epoch": 0.1428668802399291, "grad_norm": 1.977975805017582, "learning_rate": 9.6708507603518e-07, "loss": 0.7113, "step": 524 }, { "epoch": 0.14313952695794424, "grad_norm": 3.4552487174779927, "learning_rate": 9.669272717542474e-07, "loss": 0.6842, "step": 525 }, { "epoch": 0.14341217367595938, "grad_norm": 2.1001616552069446, "learning_rate": 9.667691030338056e-07, "loss": 0.7822, "step": 526 }, { "epoch": 0.1436848203939745, "grad_norm": 4.433262093658708, "learning_rate": 9.666105699973064e-07, "loss": 0.7799, "step": 527 }, { "epoch": 0.14395746711198965, "grad_norm": 2.4533211465609512, "learning_rate": 9.664516727684857e-07, "loss": 0.6659, "step": 528 }, { "epoch": 0.14423011383000478, "grad_norm": 4.329771210298486, "learning_rate": 9.662924114713635e-07, "loss": 0.661, "step": 529 }, { "epoch": 0.14450276054801992, "grad_norm": 4.056431405143964, "learning_rate": 9.661327862302446e-07, "loss": 0.7489, "step": 530 }, { "epoch": 0.14477540726603502, "grad_norm": 2.6881061023925596, "learning_rate": 9.659727971697173e-07, "loss": 0.7268, "step": 531 }, { "epoch": 0.14504805398405016, "grad_norm": 5.5801203597070925, "learning_rate": 9.658124444146537e-07, "loss": 0.7098, "step": 532 }, { "epoch": 0.1453207007020653, "grad_norm": 3.770594638226019, "learning_rate": 9.656517280902106e-07, "loss": 0.7479, "step": 533 }, { "epoch": 0.14559334742008043, "grad_norm": 2.102837272394866, "learning_rate": 9.654906483218277e-07, "loss": 0.7509, "step": 534 }, { "epoch": 0.14586599413809556, "grad_norm": 1.6683126021958432, "learning_rate": 9.65329205235229e-07, "loss": 0.7426, "step": 535 }, { "epoch": 0.1461386408561107, "grad_norm": 2.5021694923047435, "learning_rate": 9.651673989564213e-07, "loss": 0.7168, "step": 536 }, { "epoch": 0.14641128757412583, "grad_norm": 3.332835992283637, "learning_rate": 9.650052296116957e-07, "loss": 0.7162, "step": 537 }, { "epoch": 0.14668393429214097, "grad_norm": 2.3097498635945177, "learning_rate": 9.648426973276265e-07, "loss": 0.6906, "step": 538 }, { "epoch": 0.1469565810101561, "grad_norm": 4.064422352147432, "learning_rate": 9.646798022310706e-07, "loss": 0.7556, "step": 539 }, { "epoch": 0.14722922772817124, "grad_norm": 3.42174379464164, "learning_rate": 9.645165444491692e-07, "loss": 0.7304, "step": 540 }, { "epoch": 0.14750187444618634, "grad_norm": 2.039686539653648, "learning_rate": 9.643529241093455e-07, "loss": 0.7264, "step": 541 }, { "epoch": 0.14777452116420148, "grad_norm": 5.561059072215627, "learning_rate": 9.641889413393066e-07, "loss": 0.7533, "step": 542 }, { "epoch": 0.1480471678822166, "grad_norm": 2.794250608461097, "learning_rate": 9.640245962670414e-07, "loss": 0.6916, "step": 543 }, { "epoch": 0.14831981460023175, "grad_norm": 2.874319619652907, "learning_rate": 9.638598890208225e-07, "loss": 0.7777, "step": 544 }, { "epoch": 0.14859246131824688, "grad_norm": 2.4996359053071417, "learning_rate": 9.63694819729205e-07, "loss": 0.6937, "step": 545 }, { "epoch": 0.14886510803626202, "grad_norm": 2.4932957145814787, "learning_rate": 9.635293885210265e-07, "loss": 0.6439, "step": 546 }, { "epoch": 0.14913775475427715, "grad_norm": 3.3744534438346614, "learning_rate": 9.633635955254068e-07, "loss": 0.6772, "step": 547 }, { "epoch": 0.1494104014722923, "grad_norm": 2.6167783745069713, "learning_rate": 9.631974408717487e-07, "loss": 0.7152, "step": 548 }, { "epoch": 0.14968304819030742, "grad_norm": 3.07976484859319, "learning_rate": 9.630309246897363e-07, "loss": 0.758, "step": 549 }, { "epoch": 0.14995569490832253, "grad_norm": 2.7246382702721497, "learning_rate": 9.628640471093372e-07, "loss": 0.6637, "step": 550 }, { "epoch": 0.15022834162633766, "grad_norm": 2.3319437154660565, "learning_rate": 9.626968082607995e-07, "loss": 0.7164, "step": 551 }, { "epoch": 0.1505009883443528, "grad_norm": 2.3355303684935795, "learning_rate": 9.62529208274655e-07, "loss": 0.7512, "step": 552 }, { "epoch": 0.15077363506236793, "grad_norm": 3.0343895628763833, "learning_rate": 9.623612472817158e-07, "loss": 0.7021, "step": 553 }, { "epoch": 0.15104628178038307, "grad_norm": 2.5327226658695197, "learning_rate": 9.621929254130767e-07, "loss": 0.7056, "step": 554 }, { "epoch": 0.1513189284983982, "grad_norm": 2.9668414108931156, "learning_rate": 9.62024242800114e-07, "loss": 0.8323, "step": 555 }, { "epoch": 0.15159157521641334, "grad_norm": 1.764359529061429, "learning_rate": 9.618551995744856e-07, "loss": 0.7097, "step": 556 }, { "epoch": 0.15186422193442847, "grad_norm": 2.9619465013831006, "learning_rate": 9.6168579586813e-07, "loss": 0.6956, "step": 557 }, { "epoch": 0.1521368686524436, "grad_norm": 2.12890783620962, "learning_rate": 9.615160318132688e-07, "loss": 0.7327, "step": 558 }, { "epoch": 0.15240951537045871, "grad_norm": 3.8196640702188547, "learning_rate": 9.613459075424033e-07, "loss": 0.6831, "step": 559 }, { "epoch": 0.15268216208847385, "grad_norm": 4.084738789957526, "learning_rate": 9.611754231883165e-07, "loss": 0.6809, "step": 560 }, { "epoch": 0.15295480880648898, "grad_norm": 3.980621829167933, "learning_rate": 9.610045788840727e-07, "loss": 0.6956, "step": 561 }, { "epoch": 0.15322745552450412, "grad_norm": 3.0866680184053807, "learning_rate": 9.608333747630167e-07, "loss": 0.7449, "step": 562 }, { "epoch": 0.15350010224251925, "grad_norm": 2.920956843139316, "learning_rate": 9.606618109587743e-07, "loss": 0.7774, "step": 563 }, { "epoch": 0.1537727489605344, "grad_norm": 2.796597721775287, "learning_rate": 9.604898876052523e-07, "loss": 0.7247, "step": 564 }, { "epoch": 0.15404539567854952, "grad_norm": 8.101338406830095, "learning_rate": 9.603176048366378e-07, "loss": 0.7419, "step": 565 }, { "epoch": 0.15431804239656466, "grad_norm": 34.98161105945755, "learning_rate": 9.601449627873982e-07, "loss": 0.6743, "step": 566 }, { "epoch": 0.1545906891145798, "grad_norm": 2.014094390016758, "learning_rate": 9.59971961592282e-07, "loss": 0.6978, "step": 567 }, { "epoch": 0.15486333583259493, "grad_norm": 2.2421869243611785, "learning_rate": 9.597986013863178e-07, "loss": 0.7343, "step": 568 }, { "epoch": 0.15513598255061004, "grad_norm": 4.3519781929968655, "learning_rate": 9.596248823048138e-07, "loss": 0.6512, "step": 569 }, { "epoch": 0.15540862926862517, "grad_norm": 3.2749008154732238, "learning_rate": 9.59450804483359e-07, "loss": 0.7061, "step": 570 }, { "epoch": 0.1556812759866403, "grad_norm": 2.564852073956592, "learning_rate": 9.592763680578224e-07, "loss": 0.6773, "step": 571 }, { "epoch": 0.15595392270465544, "grad_norm": 4.226278451493533, "learning_rate": 9.591015731643523e-07, "loss": 0.7158, "step": 572 }, { "epoch": 0.15622656942267057, "grad_norm": 6.372170480714187, "learning_rate": 9.589264199393776e-07, "loss": 0.6878, "step": 573 }, { "epoch": 0.1564992161406857, "grad_norm": 3.6337883376705538, "learning_rate": 9.587509085196058e-07, "loss": 0.7233, "step": 574 }, { "epoch": 0.15677186285870084, "grad_norm": 3.0346917945276983, "learning_rate": 9.585750390420251e-07, "loss": 0.7139, "step": 575 }, { "epoch": 0.15704450957671598, "grad_norm": 3.6274584427301737, "learning_rate": 9.583988116439026e-07, "loss": 0.713, "step": 576 }, { "epoch": 0.15731715629473111, "grad_norm": 3.7714266817401585, "learning_rate": 9.582222264627846e-07, "loss": 0.6938, "step": 577 }, { "epoch": 0.15758980301274622, "grad_norm": 2.084624731668444, "learning_rate": 9.580452836364972e-07, "loss": 0.7491, "step": 578 }, { "epoch": 0.15786244973076136, "grad_norm": 2.7015697868230295, "learning_rate": 9.57867983303145e-07, "loss": 0.7351, "step": 579 }, { "epoch": 0.1581350964487765, "grad_norm": 2.4739653203762333, "learning_rate": 9.57690325601112e-07, "loss": 0.665, "step": 580 }, { "epoch": 0.15840774316679163, "grad_norm": 2.147510672199476, "learning_rate": 9.575123106690617e-07, "loss": 0.7197, "step": 581 }, { "epoch": 0.15868038988480676, "grad_norm": 1.8154706195686066, "learning_rate": 9.57333938645935e-07, "loss": 0.7019, "step": 582 }, { "epoch": 0.1589530366028219, "grad_norm": 2.681079218982788, "learning_rate": 9.571552096709528e-07, "loss": 0.7436, "step": 583 }, { "epoch": 0.15922568332083703, "grad_norm": 7.429487084578031, "learning_rate": 9.569761238836138e-07, "loss": 0.7259, "step": 584 }, { "epoch": 0.15949833003885217, "grad_norm": 10.380494292811743, "learning_rate": 9.567966814236959e-07, "loss": 0.7365, "step": 585 }, { "epoch": 0.1597709767568673, "grad_norm": 2.3327440118104925, "learning_rate": 9.566168824312549e-07, "loss": 0.7906, "step": 586 }, { "epoch": 0.16004362347488243, "grad_norm": 8.801197669166683, "learning_rate": 9.564367270466245e-07, "loss": 0.6345, "step": 587 }, { "epoch": 0.16031627019289754, "grad_norm": 3.5822199853498415, "learning_rate": 9.562562154104177e-07, "loss": 0.7932, "step": 588 }, { "epoch": 0.16058891691091268, "grad_norm": 2.358217565618254, "learning_rate": 9.560753476635246e-07, "loss": 0.6997, "step": 589 }, { "epoch": 0.1608615636289278, "grad_norm": 2.397174731839339, "learning_rate": 9.558941239471137e-07, "loss": 0.7222, "step": 590 }, { "epoch": 0.16113421034694295, "grad_norm": 3.261218981090591, "learning_rate": 9.55712544402631e-07, "loss": 0.6448, "step": 591 }, { "epoch": 0.16140685706495808, "grad_norm": 2.7224161186499076, "learning_rate": 9.555306091718005e-07, "loss": 0.7628, "step": 592 }, { "epoch": 0.16167950378297322, "grad_norm": 3.422277711635845, "learning_rate": 9.553483183966237e-07, "loss": 0.7755, "step": 593 }, { "epoch": 0.16195215050098835, "grad_norm": 4.915122983610233, "learning_rate": 9.551656722193796e-07, "loss": 0.6887, "step": 594 }, { "epoch": 0.16222479721900349, "grad_norm": 1.8373944071930068, "learning_rate": 9.549826707826248e-07, "loss": 0.74, "step": 595 }, { "epoch": 0.16249744393701862, "grad_norm": 13.3786282560353, "learning_rate": 9.547993142291927e-07, "loss": 0.7595, "step": 596 }, { "epoch": 0.16277009065503373, "grad_norm": 3.9818417686758423, "learning_rate": 9.546156027021945e-07, "loss": 0.7277, "step": 597 }, { "epoch": 0.16304273737304886, "grad_norm": 2.2586117698947272, "learning_rate": 9.54431536345018e-07, "loss": 0.7898, "step": 598 }, { "epoch": 0.163315384091064, "grad_norm": 2.2891165663385364, "learning_rate": 9.542471153013282e-07, "loss": 0.8227, "step": 599 }, { "epoch": 0.16358803080907913, "grad_norm": 14.83189707619587, "learning_rate": 9.540623397150667e-07, "loss": 0.7062, "step": 600 }, { "epoch": 0.16386067752709427, "grad_norm": 1.9812088155990226, "learning_rate": 9.538772097304519e-07, "loss": 0.7244, "step": 601 }, { "epoch": 0.1641333242451094, "grad_norm": 4.6503440528173146, "learning_rate": 9.53691725491979e-07, "loss": 0.7372, "step": 602 }, { "epoch": 0.16440597096312454, "grad_norm": 2.208869286594173, "learning_rate": 9.535058871444193e-07, "loss": 0.7093, "step": 603 }, { "epoch": 0.16467861768113967, "grad_norm": 4.792437572102221, "learning_rate": 9.533196948328208e-07, "loss": 0.734, "step": 604 }, { "epoch": 0.1649512643991548, "grad_norm": 2.3878414845069544, "learning_rate": 9.531331487025081e-07, "loss": 0.8011, "step": 605 }, { "epoch": 0.1652239111171699, "grad_norm": 2.638639270739112, "learning_rate": 9.529462488990811e-07, "loss": 0.7012, "step": 606 }, { "epoch": 0.16549655783518505, "grad_norm": 2.2228103370326324, "learning_rate": 9.527589955684163e-07, "loss": 0.649, "step": 607 }, { "epoch": 0.16576920455320018, "grad_norm": 2.0079420292222885, "learning_rate": 9.525713888566662e-07, "loss": 0.7001, "step": 608 }, { "epoch": 0.16604185127121532, "grad_norm": 2.360077709177052, "learning_rate": 9.523834289102588e-07, "loss": 0.74, "step": 609 }, { "epoch": 0.16631449798923045, "grad_norm": 1.961017387077767, "learning_rate": 9.521951158758981e-07, "loss": 0.6981, "step": 610 }, { "epoch": 0.1665871447072456, "grad_norm": 2.520508459942479, "learning_rate": 9.520064499005634e-07, "loss": 0.7671, "step": 611 }, { "epoch": 0.16685979142526072, "grad_norm": 4.0854042149157515, "learning_rate": 9.518174311315097e-07, "loss": 0.6658, "step": 612 }, { "epoch": 0.16713243814327586, "grad_norm": 2.911809289130455, "learning_rate": 9.516280597162671e-07, "loss": 0.7329, "step": 613 }, { "epoch": 0.167405084861291, "grad_norm": 3.1051709820473254, "learning_rate": 9.514383358026415e-07, "loss": 0.7249, "step": 614 }, { "epoch": 0.16767773157930613, "grad_norm": 2.5652829071426755, "learning_rate": 9.51248259538713e-07, "loss": 0.6347, "step": 615 }, { "epoch": 0.16795037829732123, "grad_norm": 14.279102072109819, "learning_rate": 9.510578310728379e-07, "loss": 0.6784, "step": 616 }, { "epoch": 0.16822302501533637, "grad_norm": 2.7365482078509342, "learning_rate": 9.508670505536461e-07, "loss": 0.7232, "step": 617 }, { "epoch": 0.1684956717333515, "grad_norm": 1.9778224686523982, "learning_rate": 9.506759181300434e-07, "loss": 0.7388, "step": 618 }, { "epoch": 0.16876831845136664, "grad_norm": 2.517484180141714, "learning_rate": 9.504844339512094e-07, "loss": 0.728, "step": 619 }, { "epoch": 0.16904096516938177, "grad_norm": 2.3014920913732846, "learning_rate": 9.502925981665992e-07, "loss": 0.6619, "step": 620 }, { "epoch": 0.1693136118873969, "grad_norm": 2.882593968565622, "learning_rate": 9.501004109259412e-07, "loss": 0.7133, "step": 621 }, { "epoch": 0.16958625860541204, "grad_norm": 3.583802387350086, "learning_rate": 9.499078723792389e-07, "loss": 0.7913, "step": 622 }, { "epoch": 0.16985890532342718, "grad_norm": 3.522342801356296, "learning_rate": 9.497149826767698e-07, "loss": 0.7235, "step": 623 }, { "epoch": 0.1701315520414423, "grad_norm": 1.9503393095902877, "learning_rate": 9.495217419690855e-07, "loss": 0.6766, "step": 624 }, { "epoch": 0.17040419875945742, "grad_norm": 2.329248516908381, "learning_rate": 9.493281504070115e-07, "loss": 0.6665, "step": 625 }, { "epoch": 0.17067684547747256, "grad_norm": 4.466088532432933, "learning_rate": 9.491342081416471e-07, "loss": 0.6993, "step": 626 }, { "epoch": 0.1709494921954877, "grad_norm": 1.9177697576287296, "learning_rate": 9.489399153243653e-07, "loss": 0.771, "step": 627 }, { "epoch": 0.17122213891350282, "grad_norm": 1.8661798900821338, "learning_rate": 9.487452721068132e-07, "loss": 0.7046, "step": 628 }, { "epoch": 0.17149478563151796, "grad_norm": 3.1128899894175976, "learning_rate": 9.485502786409105e-07, "loss": 0.7673, "step": 629 }, { "epoch": 0.1717674323495331, "grad_norm": 4.0353976233862365, "learning_rate": 9.48354935078851e-07, "loss": 0.7121, "step": 630 }, { "epoch": 0.17204007906754823, "grad_norm": 3.7304008595419518, "learning_rate": 9.481592415731015e-07, "loss": 0.7141, "step": 631 }, { "epoch": 0.17231272578556336, "grad_norm": 4.155818486575539, "learning_rate": 9.47963198276402e-07, "loss": 0.7766, "step": 632 }, { "epoch": 0.1725853725035785, "grad_norm": 2.0538846784714546, "learning_rate": 9.477668053417651e-07, "loss": 0.7911, "step": 633 }, { "epoch": 0.1728580192215936, "grad_norm": 1.8277838637459631, "learning_rate": 9.475700629224772e-07, "loss": 0.7745, "step": 634 }, { "epoch": 0.17313066593960874, "grad_norm": 2.0052477772549664, "learning_rate": 9.473729711720966e-07, "loss": 0.7554, "step": 635 }, { "epoch": 0.17340331265762388, "grad_norm": 3.4351144535918747, "learning_rate": 9.471755302444546e-07, "loss": 0.6895, "step": 636 }, { "epoch": 0.173675959375639, "grad_norm": 4.068749310844097, "learning_rate": 9.469777402936549e-07, "loss": 0.7316, "step": 637 }, { "epoch": 0.17394860609365415, "grad_norm": 2.516205548119409, "learning_rate": 9.467796014740741e-07, "loss": 0.6911, "step": 638 }, { "epoch": 0.17422125281166928, "grad_norm": 3.7936095213641643, "learning_rate": 9.465811139403606e-07, "loss": 0.7274, "step": 639 }, { "epoch": 0.17449389952968442, "grad_norm": 2.0868783065284298, "learning_rate": 9.463822778474349e-07, "loss": 0.7645, "step": 640 }, { "epoch": 0.17476654624769955, "grad_norm": 2.19201353505732, "learning_rate": 9.461830933504898e-07, "loss": 0.7196, "step": 641 }, { "epoch": 0.17503919296571469, "grad_norm": 7.476903451178536, "learning_rate": 9.4598356060499e-07, "loss": 0.7083, "step": 642 }, { "epoch": 0.17531183968372982, "grad_norm": 15.555081316702031, "learning_rate": 9.457836797666721e-07, "loss": 0.7567, "step": 643 }, { "epoch": 0.17558448640174493, "grad_norm": 2.0947915677477065, "learning_rate": 9.455834509915444e-07, "loss": 0.6787, "step": 644 }, { "epoch": 0.17585713311976006, "grad_norm": 2.043230000152344, "learning_rate": 9.453828744358862e-07, "loss": 0.7197, "step": 645 }, { "epoch": 0.1761297798377752, "grad_norm": 2.910434542221576, "learning_rate": 9.45181950256249e-07, "loss": 0.7016, "step": 646 }, { "epoch": 0.17640242655579033, "grad_norm": 6.145422657646292, "learning_rate": 9.449806786094554e-07, "loss": 0.7304, "step": 647 }, { "epoch": 0.17667507327380547, "grad_norm": 2.179145267804263, "learning_rate": 9.44779059652599e-07, "loss": 0.7517, "step": 648 }, { "epoch": 0.1769477199918206, "grad_norm": 2.1366563142505615, "learning_rate": 9.445770935430445e-07, "loss": 0.6434, "step": 649 }, { "epoch": 0.17722036670983574, "grad_norm": 3.053803365204597, "learning_rate": 9.443747804384278e-07, "loss": 0.7583, "step": 650 }, { "epoch": 0.17749301342785087, "grad_norm": 4.687712734615648, "learning_rate": 9.441721204966553e-07, "loss": 0.7584, "step": 651 }, { "epoch": 0.177765660145866, "grad_norm": 2.6365763582276096, "learning_rate": 9.439691138759044e-07, "loss": 0.7126, "step": 652 }, { "epoch": 0.1780383068638811, "grad_norm": 2.0017230996443374, "learning_rate": 9.437657607346233e-07, "loss": 0.7266, "step": 653 }, { "epoch": 0.17831095358189625, "grad_norm": 2.0387688448370547, "learning_rate": 9.435620612315297e-07, "loss": 0.7116, "step": 654 }, { "epoch": 0.17858360029991138, "grad_norm": 2.9019846794682085, "learning_rate": 9.433580155256126e-07, "loss": 0.6861, "step": 655 }, { "epoch": 0.17885624701792652, "grad_norm": 2.657149308523212, "learning_rate": 9.431536237761309e-07, "loss": 0.7579, "step": 656 }, { "epoch": 0.17912889373594165, "grad_norm": 4.361458574369521, "learning_rate": 9.429488861426136e-07, "loss": 0.7068, "step": 657 }, { "epoch": 0.1794015404539568, "grad_norm": 2.1313747171571698, "learning_rate": 9.427438027848595e-07, "loss": 0.6776, "step": 658 }, { "epoch": 0.17967418717197192, "grad_norm": 2.1939198560400324, "learning_rate": 9.425383738629375e-07, "loss": 0.7113, "step": 659 }, { "epoch": 0.17994683388998706, "grad_norm": 2.376801309321317, "learning_rate": 9.423325995371862e-07, "loss": 0.7773, "step": 660 }, { "epoch": 0.1802194806080022, "grad_norm": 2.0897542745529525, "learning_rate": 9.421264799682134e-07, "loss": 0.751, "step": 661 }, { "epoch": 0.18049212732601733, "grad_norm": 4.882483409061933, "learning_rate": 9.419200153168968e-07, "loss": 0.6615, "step": 662 }, { "epoch": 0.18076477404403243, "grad_norm": 4.456673080167893, "learning_rate": 9.417132057443833e-07, "loss": 0.6387, "step": 663 }, { "epoch": 0.18103742076204757, "grad_norm": 1.9961265355811904, "learning_rate": 9.41506051412089e-07, "loss": 0.7453, "step": 664 }, { "epoch": 0.1813100674800627, "grad_norm": 3.5251835414647514, "learning_rate": 9.41298552481699e-07, "loss": 0.7331, "step": 665 }, { "epoch": 0.18158271419807784, "grad_norm": 2.722939294566406, "learning_rate": 9.410907091151674e-07, "loss": 0.711, "step": 666 }, { "epoch": 0.18185536091609297, "grad_norm": 2.327301811644886, "learning_rate": 9.408825214747175e-07, "loss": 0.6824, "step": 667 }, { "epoch": 0.1821280076341081, "grad_norm": 2.595992552852226, "learning_rate": 9.406739897228406e-07, "loss": 0.7365, "step": 668 }, { "epoch": 0.18240065435212324, "grad_norm": 2.9716781903034373, "learning_rate": 9.404651140222972e-07, "loss": 0.6927, "step": 669 }, { "epoch": 0.18267330107013838, "grad_norm": 2.293594115716192, "learning_rate": 9.402558945361161e-07, "loss": 0.6793, "step": 670 }, { "epoch": 0.1829459477881535, "grad_norm": 1.9217071888818724, "learning_rate": 9.400463314275941e-07, "loss": 0.7985, "step": 671 }, { "epoch": 0.18321859450616862, "grad_norm": 5.471750243440078, "learning_rate": 9.398364248602967e-07, "loss": 0.7156, "step": 672 }, { "epoch": 0.18349124122418375, "grad_norm": 2.2153301944535935, "learning_rate": 9.39626174998057e-07, "loss": 0.7287, "step": 673 }, { "epoch": 0.1837638879421989, "grad_norm": 8.121742243322707, "learning_rate": 9.394155820049764e-07, "loss": 0.7404, "step": 674 }, { "epoch": 0.18403653466021402, "grad_norm": 3.0743814812490373, "learning_rate": 9.392046460454239e-07, "loss": 0.7024, "step": 675 }, { "epoch": 0.18430918137822916, "grad_norm": 3.8241202596122807, "learning_rate": 9.389933672840364e-07, "loss": 0.7164, "step": 676 }, { "epoch": 0.1845818280962443, "grad_norm": 3.2441150197731874, "learning_rate": 9.387817458857181e-07, "loss": 0.6772, "step": 677 }, { "epoch": 0.18485447481425943, "grad_norm": 3.3495015586615042, "learning_rate": 9.385697820156409e-07, "loss": 0.7466, "step": 678 }, { "epoch": 0.18512712153227456, "grad_norm": 1.8988222308943752, "learning_rate": 9.383574758392436e-07, "loss": 0.7062, "step": 679 }, { "epoch": 0.1853997682502897, "grad_norm": 3.7925499900041855, "learning_rate": 9.381448275222326e-07, "loss": 0.7523, "step": 680 }, { "epoch": 0.1856724149683048, "grad_norm": 8.530184102697921, "learning_rate": 9.379318372305813e-07, "loss": 0.7365, "step": 681 }, { "epoch": 0.18594506168631994, "grad_norm": 6.4774578276149795, "learning_rate": 9.377185051305296e-07, "loss": 0.7335, "step": 682 }, { "epoch": 0.18621770840433507, "grad_norm": 2.0975603509455834, "learning_rate": 9.375048313885848e-07, "loss": 0.7, "step": 683 }, { "epoch": 0.1864903551223502, "grad_norm": 2.673164402214995, "learning_rate": 9.372908161715203e-07, "loss": 0.7692, "step": 684 }, { "epoch": 0.18676300184036534, "grad_norm": 4.420917210085181, "learning_rate": 9.370764596463763e-07, "loss": 0.7581, "step": 685 }, { "epoch": 0.18703564855838048, "grad_norm": 1.92313918409248, "learning_rate": 9.368617619804593e-07, "loss": 0.726, "step": 686 }, { "epoch": 0.18730829527639561, "grad_norm": 2.1886085609377304, "learning_rate": 9.366467233413422e-07, "loss": 0.6607, "step": 687 }, { "epoch": 0.18758094199441075, "grad_norm": 2.0775354929811387, "learning_rate": 9.364313438968638e-07, "loss": 0.7322, "step": 688 }, { "epoch": 0.18785358871242588, "grad_norm": 4.044724801033246, "learning_rate": 9.362156238151293e-07, "loss": 0.7191, "step": 689 }, { "epoch": 0.18812623543044102, "grad_norm": 1.669038872545946, "learning_rate": 9.359995632645092e-07, "loss": 0.6846, "step": 690 }, { "epoch": 0.18839888214845613, "grad_norm": 2.376054704618913, "learning_rate": 9.357831624136403e-07, "loss": 0.6657, "step": 691 }, { "epoch": 0.18867152886647126, "grad_norm": 2.690105845078373, "learning_rate": 9.355664214314244e-07, "loss": 0.6834, "step": 692 }, { "epoch": 0.1889441755844864, "grad_norm": 1.9322463718957659, "learning_rate": 9.353493404870294e-07, "loss": 0.6924, "step": 693 }, { "epoch": 0.18921682230250153, "grad_norm": 8.80410432657082, "learning_rate": 9.351319197498883e-07, "loss": 0.7542, "step": 694 }, { "epoch": 0.18948946902051667, "grad_norm": 12.684843004930958, "learning_rate": 9.349141593896992e-07, "loss": 0.7724, "step": 695 }, { "epoch": 0.1897621157385318, "grad_norm": 2.643134646782283, "learning_rate": 9.346960595764253e-07, "loss": 0.6961, "step": 696 }, { "epoch": 0.19003476245654694, "grad_norm": 2.263753480747242, "learning_rate": 9.344776204802946e-07, "loss": 0.7971, "step": 697 }, { "epoch": 0.19030740917456207, "grad_norm": 2.164990027517432, "learning_rate": 9.342588422718008e-07, "loss": 0.7167, "step": 698 }, { "epoch": 0.1905800558925772, "grad_norm": 2.0440525480684064, "learning_rate": 9.340397251217008e-07, "loss": 0.7482, "step": 699 }, { "epoch": 0.1908527026105923, "grad_norm": 6.0636010200181225, "learning_rate": 9.338202692010173e-07, "loss": 0.7483, "step": 700 }, { "epoch": 0.19112534932860745, "grad_norm": 2.246220009483922, "learning_rate": 9.336004746810369e-07, "loss": 0.7045, "step": 701 }, { "epoch": 0.19139799604662258, "grad_norm": 2.316267094262517, "learning_rate": 9.333803417333104e-07, "loss": 0.7504, "step": 702 }, { "epoch": 0.19167064276463772, "grad_norm": 2.2101315904222365, "learning_rate": 9.331598705296528e-07, "loss": 0.7254, "step": 703 }, { "epoch": 0.19194328948265285, "grad_norm": 4.498431499337013, "learning_rate": 9.329390612421433e-07, "loss": 0.7417, "step": 704 }, { "epoch": 0.192215936200668, "grad_norm": 2.8989305491244606, "learning_rate": 9.327179140431252e-07, "loss": 0.7812, "step": 705 }, { "epoch": 0.19248858291868312, "grad_norm": 34.297959218162994, "learning_rate": 9.324964291052047e-07, "loss": 0.7255, "step": 706 }, { "epoch": 0.19276122963669826, "grad_norm": 3.4872518691586802, "learning_rate": 9.322746066012521e-07, "loss": 0.7232, "step": 707 }, { "epoch": 0.1930338763547134, "grad_norm": 6.144045556887775, "learning_rate": 9.320524467044015e-07, "loss": 0.6982, "step": 708 }, { "epoch": 0.19330652307272853, "grad_norm": 3.721449471746236, "learning_rate": 9.318299495880499e-07, "loss": 0.677, "step": 709 }, { "epoch": 0.19357916979074363, "grad_norm": 1.9328773971341724, "learning_rate": 9.316071154258578e-07, "loss": 0.704, "step": 710 }, { "epoch": 0.19385181650875877, "grad_norm": 2.1060987010883223, "learning_rate": 9.313839443917482e-07, "loss": 0.6996, "step": 711 }, { "epoch": 0.1941244632267739, "grad_norm": 1.9143785070579515, "learning_rate": 9.311604366599075e-07, "loss": 0.7007, "step": 712 }, { "epoch": 0.19439710994478904, "grad_norm": 1.936886991858108, "learning_rate": 9.309365924047852e-07, "loss": 0.7099, "step": 713 }, { "epoch": 0.19466975666280417, "grad_norm": 3.7012017642600976, "learning_rate": 9.307124118010926e-07, "loss": 0.6951, "step": 714 }, { "epoch": 0.1949424033808193, "grad_norm": 3.016238813730886, "learning_rate": 9.304878950238042e-07, "loss": 0.6556, "step": 715 }, { "epoch": 0.19521505009883444, "grad_norm": 2.44198665851863, "learning_rate": 9.302630422481568e-07, "loss": 0.699, "step": 716 }, { "epoch": 0.19548769681684958, "grad_norm": 4.5011572618347335, "learning_rate": 9.300378536496492e-07, "loss": 0.7035, "step": 717 }, { "epoch": 0.1957603435348647, "grad_norm": 3.1385134486735424, "learning_rate": 9.298123294040424e-07, "loss": 0.7091, "step": 718 }, { "epoch": 0.19603299025287982, "grad_norm": 2.785441568431047, "learning_rate": 9.295864696873592e-07, "loss": 0.7601, "step": 719 }, { "epoch": 0.19630563697089495, "grad_norm": 2.7168793775605535, "learning_rate": 9.29360274675885e-07, "loss": 0.7003, "step": 720 }, { "epoch": 0.1965782836889101, "grad_norm": 2.631236715675284, "learning_rate": 9.291337445461659e-07, "loss": 0.7065, "step": 721 }, { "epoch": 0.19685093040692522, "grad_norm": 2.6156377040676495, "learning_rate": 9.289068794750101e-07, "loss": 0.7318, "step": 722 }, { "epoch": 0.19712357712494036, "grad_norm": 3.1070468949562446, "learning_rate": 9.286796796394873e-07, "loss": 0.7231, "step": 723 }, { "epoch": 0.1973962238429555, "grad_norm": 1.92017572294559, "learning_rate": 9.284521452169282e-07, "loss": 0.6723, "step": 724 }, { "epoch": 0.19766887056097063, "grad_norm": 2.940705694294662, "learning_rate": 9.282242763849248e-07, "loss": 0.6712, "step": 725 }, { "epoch": 0.19794151727898576, "grad_norm": 2.503811892790525, "learning_rate": 9.279960733213301e-07, "loss": 0.6829, "step": 726 }, { "epoch": 0.1982141639970009, "grad_norm": 5.683533590097, "learning_rate": 9.27767536204258e-07, "loss": 0.6621, "step": 727 }, { "epoch": 0.198486810715016, "grad_norm": 2.2247889466598543, "learning_rate": 9.275386652120828e-07, "loss": 0.6991, "step": 728 }, { "epoch": 0.19875945743303114, "grad_norm": 4.782949426765943, "learning_rate": 9.273094605234399e-07, "loss": 0.7606, "step": 729 }, { "epoch": 0.19903210415104627, "grad_norm": 2.3741395677116075, "learning_rate": 9.27079922317225e-07, "loss": 0.7021, "step": 730 }, { "epoch": 0.1993047508690614, "grad_norm": 3.5822612573937036, "learning_rate": 9.268500507725937e-07, "loss": 0.7139, "step": 731 }, { "epoch": 0.19957739758707654, "grad_norm": 2.7906063254328304, "learning_rate": 9.266198460689624e-07, "loss": 0.6647, "step": 732 }, { "epoch": 0.19985004430509168, "grad_norm": 3.7227823011523307, "learning_rate": 9.263893083860071e-07, "loss": 0.7383, "step": 733 }, { "epoch": 0.2001226910231068, "grad_norm": 2.2755651729377995, "learning_rate": 9.26158437903664e-07, "loss": 0.6376, "step": 734 }, { "epoch": 0.20039533774112195, "grad_norm": 5.6823618624096905, "learning_rate": 9.259272348021288e-07, "loss": 0.7447, "step": 735 }, { "epoch": 0.20066798445913708, "grad_norm": 5.224068874203693, "learning_rate": 9.256956992618568e-07, "loss": 0.7432, "step": 736 }, { "epoch": 0.20094063117715222, "grad_norm": 2.0821292837323093, "learning_rate": 9.254638314635628e-07, "loss": 0.6879, "step": 737 }, { "epoch": 0.20121327789516733, "grad_norm": 8.155301768777635, "learning_rate": 9.252316315882213e-07, "loss": 0.6868, "step": 738 }, { "epoch": 0.20148592461318246, "grad_norm": 2.789240502716789, "learning_rate": 9.249990998170654e-07, "loss": 0.6756, "step": 739 }, { "epoch": 0.2017585713311976, "grad_norm": 5.328351552674522, "learning_rate": 9.247662363315876e-07, "loss": 0.6982, "step": 740 }, { "epoch": 0.20203121804921273, "grad_norm": 8.883831517177128, "learning_rate": 9.245330413135394e-07, "loss": 0.7183, "step": 741 }, { "epoch": 0.20230386476722786, "grad_norm": 5.473744777800431, "learning_rate": 9.242995149449306e-07, "loss": 0.7, "step": 742 }, { "epoch": 0.202576511485243, "grad_norm": 3.0296563024942724, "learning_rate": 9.240656574080301e-07, "loss": 0.6881, "step": 743 }, { "epoch": 0.20284915820325813, "grad_norm": 2.2558441646434857, "learning_rate": 9.238314688853653e-07, "loss": 0.6864, "step": 744 }, { "epoch": 0.20312180492127327, "grad_norm": 2.988160564644187, "learning_rate": 9.235969495597214e-07, "loss": 0.7524, "step": 745 }, { "epoch": 0.2033944516392884, "grad_norm": 2.422386435621177, "learning_rate": 9.233620996141421e-07, "loss": 0.6259, "step": 746 }, { "epoch": 0.2036670983573035, "grad_norm": 6.474225125855503, "learning_rate": 9.231269192319293e-07, "loss": 0.7372, "step": 747 }, { "epoch": 0.20393974507531865, "grad_norm": 6.964743051518204, "learning_rate": 9.228914085966429e-07, "loss": 0.6746, "step": 748 }, { "epoch": 0.20421239179333378, "grad_norm": 2.379847164034899, "learning_rate": 9.226555678921e-07, "loss": 0.631, "step": 749 }, { "epoch": 0.20448503851134892, "grad_norm": 1.804482568604722, "learning_rate": 9.224193973023757e-07, "loss": 0.727, "step": 750 }, { "epoch": 0.20475768522936405, "grad_norm": 2.527288771206224, "learning_rate": 9.221828970118029e-07, "loss": 0.714, "step": 751 }, { "epoch": 0.20503033194737919, "grad_norm": 2.0316018520494454, "learning_rate": 9.219460672049712e-07, "loss": 0.6669, "step": 752 }, { "epoch": 0.20530297866539432, "grad_norm": 3.705994076251483, "learning_rate": 9.217089080667277e-07, "loss": 0.7157, "step": 753 }, { "epoch": 0.20557562538340945, "grad_norm": 2.2443787470588674, "learning_rate": 9.214714197821764e-07, "loss": 0.7311, "step": 754 }, { "epoch": 0.2058482721014246, "grad_norm": 2.778485232253143, "learning_rate": 9.212336025366787e-07, "loss": 0.7758, "step": 755 }, { "epoch": 0.20612091881943972, "grad_norm": 2.4659878416847607, "learning_rate": 9.209954565158519e-07, "loss": 0.7127, "step": 756 }, { "epoch": 0.20639356553745483, "grad_norm": 2.339942936278723, "learning_rate": 9.207569819055708e-07, "loss": 0.7213, "step": 757 }, { "epoch": 0.20666621225546997, "grad_norm": 1.6692522779524883, "learning_rate": 9.205181788919659e-07, "loss": 0.7232, "step": 758 }, { "epoch": 0.2069388589734851, "grad_norm": 2.8692478846118514, "learning_rate": 9.202790476614246e-07, "loss": 0.6324, "step": 759 }, { "epoch": 0.20721150569150024, "grad_norm": 9.047303010738503, "learning_rate": 9.200395884005903e-07, "loss": 0.6814, "step": 760 }, { "epoch": 0.20748415240951537, "grad_norm": 1.9480647960924107, "learning_rate": 9.19799801296362e-07, "loss": 0.696, "step": 761 }, { "epoch": 0.2077567991275305, "grad_norm": 2.1744145368304375, "learning_rate": 9.195596865358953e-07, "loss": 0.6321, "step": 762 }, { "epoch": 0.20802944584554564, "grad_norm": 5.089792227786513, "learning_rate": 9.193192443066011e-07, "loss": 0.7927, "step": 763 }, { "epoch": 0.20830209256356078, "grad_norm": 1.9920680051031172, "learning_rate": 9.190784747961462e-07, "loss": 0.7072, "step": 764 }, { "epoch": 0.2085747392815759, "grad_norm": 3.479965917594852, "learning_rate": 9.188373781924521e-07, "loss": 0.7595, "step": 765 }, { "epoch": 0.20884738599959102, "grad_norm": 6.1108400331979515, "learning_rate": 9.185959546836967e-07, "loss": 0.7285, "step": 766 }, { "epoch": 0.20912003271760615, "grad_norm": 2.547858328977491, "learning_rate": 9.183542044583121e-07, "loss": 0.6721, "step": 767 }, { "epoch": 0.2093926794356213, "grad_norm": 2.5776430874296796, "learning_rate": 9.18112127704986e-07, "loss": 0.7763, "step": 768 }, { "epoch": 0.20966532615363642, "grad_norm": 2.010095905526122, "learning_rate": 9.178697246126606e-07, "loss": 0.7934, "step": 769 }, { "epoch": 0.20993797287165156, "grad_norm": 2.8733107996554543, "learning_rate": 9.176269953705331e-07, "loss": 0.7057, "step": 770 }, { "epoch": 0.2102106195896667, "grad_norm": 2.851828393987426, "learning_rate": 9.173839401680551e-07, "loss": 0.7033, "step": 771 }, { "epoch": 0.21048326630768183, "grad_norm": 2.781202748150913, "learning_rate": 9.171405591949324e-07, "loss": 0.7909, "step": 772 }, { "epoch": 0.21075591302569696, "grad_norm": 2.0484275643811425, "learning_rate": 9.168968526411256e-07, "loss": 0.7201, "step": 773 }, { "epoch": 0.2110285597437121, "grad_norm": 3.3680009067843177, "learning_rate": 9.166528206968489e-07, "loss": 0.7514, "step": 774 }, { "epoch": 0.2113012064617272, "grad_norm": 2.88602162032739, "learning_rate": 9.164084635525709e-07, "loss": 0.6855, "step": 775 }, { "epoch": 0.21157385317974234, "grad_norm": 3.0053592338486363, "learning_rate": 9.161637813990135e-07, "loss": 0.6907, "step": 776 }, { "epoch": 0.21184649989775747, "grad_norm": 3.8892114990139124, "learning_rate": 9.159187744271528e-07, "loss": 0.7491, "step": 777 }, { "epoch": 0.2121191466157726, "grad_norm": 2.7116580017828067, "learning_rate": 9.156734428282181e-07, "loss": 0.7569, "step": 778 }, { "epoch": 0.21239179333378774, "grad_norm": 4.192875464628646, "learning_rate": 9.154277867936922e-07, "loss": 0.693, "step": 779 }, { "epoch": 0.21266444005180288, "grad_norm": 6.063473588507559, "learning_rate": 9.151818065153111e-07, "loss": 0.6826, "step": 780 }, { "epoch": 0.212937086769818, "grad_norm": 3.7763419836248877, "learning_rate": 9.149355021850639e-07, "loss": 0.7872, "step": 781 }, { "epoch": 0.21320973348783315, "grad_norm": 2.2299268060424233, "learning_rate": 9.146888739951925e-07, "loss": 0.7257, "step": 782 }, { "epoch": 0.21348238020584828, "grad_norm": 5.671899321836684, "learning_rate": 9.144419221381918e-07, "loss": 0.722, "step": 783 }, { "epoch": 0.21375502692386342, "grad_norm": 2.483600954136612, "learning_rate": 9.141946468068092e-07, "loss": 0.6868, "step": 784 }, { "epoch": 0.21402767364187852, "grad_norm": 3.214754211740077, "learning_rate": 9.139470481940444e-07, "loss": 0.6702, "step": 785 }, { "epoch": 0.21430032035989366, "grad_norm": 2.47848493130806, "learning_rate": 9.136991264931496e-07, "loss": 0.7539, "step": 786 }, { "epoch": 0.2145729670779088, "grad_norm": 5.849646985614556, "learning_rate": 9.134508818976293e-07, "loss": 0.6883, "step": 787 }, { "epoch": 0.21484561379592393, "grad_norm": 2.335922237060724, "learning_rate": 9.132023146012398e-07, "loss": 0.7366, "step": 788 }, { "epoch": 0.21511826051393906, "grad_norm": 2.8605869073036367, "learning_rate": 9.129534247979893e-07, "loss": 0.7294, "step": 789 }, { "epoch": 0.2153909072319542, "grad_norm": 2.3120183492839437, "learning_rate": 9.12704212682138e-07, "loss": 0.666, "step": 790 }, { "epoch": 0.21566355394996933, "grad_norm": 4.144096764525521, "learning_rate": 9.12454678448197e-07, "loss": 0.729, "step": 791 }, { "epoch": 0.21593620066798447, "grad_norm": 1.9456053601340235, "learning_rate": 9.122048222909297e-07, "loss": 0.6405, "step": 792 }, { "epoch": 0.2162088473859996, "grad_norm": 1.9384809268140186, "learning_rate": 9.119546444053502e-07, "loss": 0.7155, "step": 793 }, { "epoch": 0.2164814941040147, "grad_norm": 2.7927760811464184, "learning_rate": 9.117041449867238e-07, "loss": 0.7026, "step": 794 }, { "epoch": 0.21675414082202984, "grad_norm": 2.3235068803908874, "learning_rate": 9.114533242305665e-07, "loss": 0.7508, "step": 795 }, { "epoch": 0.21702678754004498, "grad_norm": 2.6229628982394306, "learning_rate": 9.112021823326458e-07, "loss": 0.7006, "step": 796 }, { "epoch": 0.21729943425806011, "grad_norm": 3.2186401007460113, "learning_rate": 9.109507194889792e-07, "loss": 0.6945, "step": 797 }, { "epoch": 0.21757208097607525, "grad_norm": 4.826175195340622, "learning_rate": 9.10698935895835e-07, "loss": 0.6867, "step": 798 }, { "epoch": 0.21784472769409038, "grad_norm": 4.281646742150314, "learning_rate": 9.104468317497319e-07, "loss": 0.716, "step": 799 }, { "epoch": 0.21811737441210552, "grad_norm": 2.221986605035443, "learning_rate": 9.101944072474386e-07, "loss": 0.7547, "step": 800 }, { "epoch": 0.21839002113012065, "grad_norm": 4.012715385204946, "learning_rate": 9.099416625859739e-07, "loss": 0.6476, "step": 801 }, { "epoch": 0.2186626678481358, "grad_norm": 4.125494313372368, "learning_rate": 9.096885979626066e-07, "loss": 0.7008, "step": 802 }, { "epoch": 0.21893531456615092, "grad_norm": 3.561655138499091, "learning_rate": 9.094352135748549e-07, "loss": 0.7149, "step": 803 }, { "epoch": 0.21920796128416603, "grad_norm": 2.988483501783352, "learning_rate": 9.091815096204872e-07, "loss": 0.7105, "step": 804 }, { "epoch": 0.21948060800218117, "grad_norm": 1.9311233801402208, "learning_rate": 9.089274862975205e-07, "loss": 0.6258, "step": 805 }, { "epoch": 0.2197532547201963, "grad_norm": 2.2554748429997376, "learning_rate": 9.086731438042219e-07, "loss": 0.7593, "step": 806 }, { "epoch": 0.22002590143821144, "grad_norm": 2.50029033711971, "learning_rate": 9.08418482339107e-07, "loss": 0.6796, "step": 807 }, { "epoch": 0.22029854815622657, "grad_norm": 2.987128287975278, "learning_rate": 9.081635021009407e-07, "loss": 0.6865, "step": 808 }, { "epoch": 0.2205711948742417, "grad_norm": 3.0068059593261287, "learning_rate": 9.079082032887366e-07, "loss": 0.6521, "step": 809 }, { "epoch": 0.22084384159225684, "grad_norm": 1.7804758917980679, "learning_rate": 9.076525861017568e-07, "loss": 0.6621, "step": 810 }, { "epoch": 0.22111648831027197, "grad_norm": 2.613398222777793, "learning_rate": 9.073966507395121e-07, "loss": 0.7088, "step": 811 }, { "epoch": 0.2213891350282871, "grad_norm": 7.285522463588143, "learning_rate": 9.071403974017618e-07, "loss": 0.7332, "step": 812 }, { "epoch": 0.22166178174630222, "grad_norm": 2.014717337234504, "learning_rate": 9.068838262885128e-07, "loss": 0.6972, "step": 813 }, { "epoch": 0.22193442846431735, "grad_norm": 13.06870406077101, "learning_rate": 9.066269376000204e-07, "loss": 0.6842, "step": 814 }, { "epoch": 0.2222070751823325, "grad_norm": 2.319842387570376, "learning_rate": 9.06369731536788e-07, "loss": 0.705, "step": 815 }, { "epoch": 0.22247972190034762, "grad_norm": 2.5398777986539027, "learning_rate": 9.061122082995664e-07, "loss": 0.668, "step": 816 }, { "epoch": 0.22275236861836276, "grad_norm": 1.9403030217930028, "learning_rate": 9.058543680893538e-07, "loss": 0.7081, "step": 817 }, { "epoch": 0.2230250153363779, "grad_norm": 2.8369548671779965, "learning_rate": 9.055962111073962e-07, "loss": 0.6993, "step": 818 }, { "epoch": 0.22329766205439303, "grad_norm": 1.8838176009928713, "learning_rate": 9.053377375551867e-07, "loss": 0.7189, "step": 819 }, { "epoch": 0.22357030877240816, "grad_norm": 3.3016769971229833, "learning_rate": 9.050789476344654e-07, "loss": 0.6624, "step": 820 }, { "epoch": 0.2238429554904233, "grad_norm": 3.157625806078888, "learning_rate": 9.048198415472192e-07, "loss": 0.7088, "step": 821 }, { "epoch": 0.2241156022084384, "grad_norm": 5.428353447210289, "learning_rate": 9.045604194956821e-07, "loss": 0.6727, "step": 822 }, { "epoch": 0.22438824892645354, "grad_norm": 2.3161152801621436, "learning_rate": 9.043006816823345e-07, "loss": 0.6822, "step": 823 }, { "epoch": 0.22466089564446867, "grad_norm": 2.3918682485980924, "learning_rate": 9.040406283099034e-07, "loss": 0.7269, "step": 824 }, { "epoch": 0.2249335423624838, "grad_norm": 2.1233527464964, "learning_rate": 9.037802595813619e-07, "loss": 0.7188, "step": 825 }, { "epoch": 0.22520618908049894, "grad_norm": 2.207143584733204, "learning_rate": 9.035195756999295e-07, "loss": 0.6517, "step": 826 }, { "epoch": 0.22547883579851408, "grad_norm": 2.599681397002975, "learning_rate": 9.032585768690711e-07, "loss": 0.7776, "step": 827 }, { "epoch": 0.2257514825165292, "grad_norm": 2.020313305763361, "learning_rate": 9.029972632924983e-07, "loss": 0.6896, "step": 828 }, { "epoch": 0.22602412923454435, "grad_norm": 2.3086734874768386, "learning_rate": 9.027356351741677e-07, "loss": 0.6916, "step": 829 }, { "epoch": 0.22629677595255948, "grad_norm": 4.1472823779279695, "learning_rate": 9.024736927182816e-07, "loss": 0.7558, "step": 830 }, { "epoch": 0.22656942267057462, "grad_norm": 1.7609236477774899, "learning_rate": 9.022114361292878e-07, "loss": 0.669, "step": 831 }, { "epoch": 0.22684206938858972, "grad_norm": 2.713124438860595, "learning_rate": 9.01948865611879e-07, "loss": 0.7177, "step": 832 }, { "epoch": 0.22711471610660486, "grad_norm": 2.62076555381633, "learning_rate": 9.01685981370993e-07, "loss": 0.6968, "step": 833 }, { "epoch": 0.22738736282462, "grad_norm": 6.087624817067153, "learning_rate": 9.014227836118124e-07, "loss": 0.6911, "step": 834 }, { "epoch": 0.22766000954263513, "grad_norm": 2.0368092870531442, "learning_rate": 9.01159272539765e-07, "loss": 0.7249, "step": 835 }, { "epoch": 0.22793265626065026, "grad_norm": 4.7604779504038675, "learning_rate": 9.008954483605223e-07, "loss": 0.7355, "step": 836 }, { "epoch": 0.2282053029786654, "grad_norm": 2.4054537768866813, "learning_rate": 9.00631311280001e-07, "loss": 0.6687, "step": 837 }, { "epoch": 0.22847794969668053, "grad_norm": 9.499228410362226, "learning_rate": 9.003668615043616e-07, "loss": 0.7642, "step": 838 }, { "epoch": 0.22875059641469567, "grad_norm": 2.04293915656025, "learning_rate": 9.001020992400085e-07, "loss": 0.6622, "step": 839 }, { "epoch": 0.2290232431327108, "grad_norm": 3.229954560727948, "learning_rate": 8.998370246935905e-07, "loss": 0.7071, "step": 840 }, { "epoch": 0.2292958898507259, "grad_norm": 2.2463195001216976, "learning_rate": 8.995716380719996e-07, "loss": 0.78, "step": 841 }, { "epoch": 0.22956853656874104, "grad_norm": 2.698021440402077, "learning_rate": 8.993059395823717e-07, "loss": 0.7068, "step": 842 }, { "epoch": 0.22984118328675618, "grad_norm": 4.571806358173533, "learning_rate": 8.990399294320863e-07, "loss": 0.703, "step": 843 }, { "epoch": 0.2301138300047713, "grad_norm": 5.539944282644867, "learning_rate": 8.987736078287656e-07, "loss": 0.6442, "step": 844 }, { "epoch": 0.23038647672278645, "grad_norm": 2.4610964131347086, "learning_rate": 8.985069749802754e-07, "loss": 0.6442, "step": 845 }, { "epoch": 0.23065912344080158, "grad_norm": 3.247053006448736, "learning_rate": 8.98240031094724e-07, "loss": 0.711, "step": 846 }, { "epoch": 0.23093177015881672, "grad_norm": 2.8038823121364356, "learning_rate": 8.979727763804629e-07, "loss": 0.7009, "step": 847 }, { "epoch": 0.23120441687683185, "grad_norm": 2.5207855710954306, "learning_rate": 8.977052110460858e-07, "loss": 0.6795, "step": 848 }, { "epoch": 0.231477063594847, "grad_norm": 2.7031079662793682, "learning_rate": 8.974373353004293e-07, "loss": 0.7375, "step": 849 }, { "epoch": 0.23174971031286212, "grad_norm": 2.06336515210521, "learning_rate": 8.971691493525717e-07, "loss": 0.6381, "step": 850 }, { "epoch": 0.23202235703087723, "grad_norm": 2.0337114975233863, "learning_rate": 8.96900653411834e-07, "loss": 0.7176, "step": 851 }, { "epoch": 0.23229500374889236, "grad_norm": 2.560019988698607, "learning_rate": 8.966318476877788e-07, "loss": 0.7156, "step": 852 }, { "epoch": 0.2325676504669075, "grad_norm": 5.000581707003586, "learning_rate": 8.963627323902103e-07, "loss": 0.745, "step": 853 }, { "epoch": 0.23284029718492263, "grad_norm": 2.349876932101579, "learning_rate": 8.96093307729175e-07, "loss": 0.686, "step": 854 }, { "epoch": 0.23311294390293777, "grad_norm": 2.3862465035413534, "learning_rate": 8.958235739149601e-07, "loss": 0.7156, "step": 855 }, { "epoch": 0.2333855906209529, "grad_norm": 1.8131516699351269, "learning_rate": 8.955535311580946e-07, "loss": 0.7137, "step": 856 }, { "epoch": 0.23365823733896804, "grad_norm": 2.496378264704518, "learning_rate": 8.952831796693485e-07, "loss": 0.6921, "step": 857 }, { "epoch": 0.23393088405698317, "grad_norm": 3.3298191515325795, "learning_rate": 8.950125196597329e-07, "loss": 0.7148, "step": 858 }, { "epoch": 0.2342035307749983, "grad_norm": 3.5757395449009564, "learning_rate": 8.947415513404992e-07, "loss": 0.7299, "step": 859 }, { "epoch": 0.23447617749301342, "grad_norm": 3.9119248026908844, "learning_rate": 8.944702749231399e-07, "loss": 0.6873, "step": 860 }, { "epoch": 0.23474882421102855, "grad_norm": 2.78216505069334, "learning_rate": 8.94198690619388e-07, "loss": 0.747, "step": 861 }, { "epoch": 0.23502147092904369, "grad_norm": 14.041618287471534, "learning_rate": 8.939267986412166e-07, "loss": 0.6863, "step": 862 }, { "epoch": 0.23529411764705882, "grad_norm": 3.1553055431416377, "learning_rate": 8.936545992008389e-07, "loss": 0.7171, "step": 863 }, { "epoch": 0.23556676436507396, "grad_norm": 2.43596103561399, "learning_rate": 8.933820925107082e-07, "loss": 0.7064, "step": 864 }, { "epoch": 0.2358394110830891, "grad_norm": 2.1654997060844035, "learning_rate": 8.931092787835177e-07, "loss": 0.6609, "step": 865 }, { "epoch": 0.23611205780110422, "grad_norm": 4.043036961365399, "learning_rate": 8.928361582321999e-07, "loss": 0.7348, "step": 866 }, { "epoch": 0.23638470451911936, "grad_norm": 2.3549462707369058, "learning_rate": 8.925627310699274e-07, "loss": 0.6988, "step": 867 }, { "epoch": 0.2366573512371345, "grad_norm": 2.071760027377647, "learning_rate": 8.922889975101114e-07, "loss": 0.6322, "step": 868 }, { "epoch": 0.2369299979551496, "grad_norm": 1.874497464376992, "learning_rate": 8.920149577664028e-07, "loss": 0.6694, "step": 869 }, { "epoch": 0.23720264467316474, "grad_norm": 4.532492153804591, "learning_rate": 8.917406120526909e-07, "loss": 0.6832, "step": 870 }, { "epoch": 0.23747529139117987, "grad_norm": 2.408544162104381, "learning_rate": 8.914659605831045e-07, "loss": 0.6953, "step": 871 }, { "epoch": 0.237747938109195, "grad_norm": 2.356461677272514, "learning_rate": 8.911910035720107e-07, "loss": 0.7794, "step": 872 }, { "epoch": 0.23802058482721014, "grad_norm": 2.7676957556612614, "learning_rate": 8.909157412340149e-07, "loss": 0.6973, "step": 873 }, { "epoch": 0.23829323154522528, "grad_norm": 2.4901332353435817, "learning_rate": 8.906401737839611e-07, "loss": 0.6707, "step": 874 }, { "epoch": 0.2385658782632404, "grad_norm": 5.23494559524629, "learning_rate": 8.903643014369311e-07, "loss": 0.735, "step": 875 }, { "epoch": 0.23883852498125555, "grad_norm": 2.026208473831808, "learning_rate": 8.900881244082452e-07, "loss": 0.7112, "step": 876 }, { "epoch": 0.23911117169927068, "grad_norm": 3.646203290010336, "learning_rate": 8.898116429134611e-07, "loss": 0.6611, "step": 877 }, { "epoch": 0.23938381841728582, "grad_norm": 3.1953048731138267, "learning_rate": 8.895348571683743e-07, "loss": 0.7736, "step": 878 }, { "epoch": 0.23965646513530092, "grad_norm": 2.2975649466056676, "learning_rate": 8.892577673890176e-07, "loss": 0.6481, "step": 879 }, { "epoch": 0.23992911185331606, "grad_norm": 2.140187826405907, "learning_rate": 8.889803737916614e-07, "loss": 0.691, "step": 880 }, { "epoch": 0.2402017585713312, "grad_norm": 4.267249607248873, "learning_rate": 8.887026765928129e-07, "loss": 0.6543, "step": 881 }, { "epoch": 0.24047440528934633, "grad_norm": 2.2170330280587174, "learning_rate": 8.884246760092164e-07, "loss": 0.7425, "step": 882 }, { "epoch": 0.24074705200736146, "grad_norm": 3.3791924137003027, "learning_rate": 8.88146372257853e-07, "loss": 0.6678, "step": 883 }, { "epoch": 0.2410196987253766, "grad_norm": 2.090336329687449, "learning_rate": 8.878677655559406e-07, "loss": 0.7528, "step": 884 }, { "epoch": 0.24129234544339173, "grad_norm": 1.8536219948371695, "learning_rate": 8.875888561209332e-07, "loss": 0.6997, "step": 885 }, { "epoch": 0.24156499216140687, "grad_norm": 2.515569559088616, "learning_rate": 8.873096441705216e-07, "loss": 0.6841, "step": 886 }, { "epoch": 0.241837638879422, "grad_norm": 3.796859219099523, "learning_rate": 8.87030129922632e-07, "loss": 0.6652, "step": 887 }, { "epoch": 0.2421102855974371, "grad_norm": 15.164991548320666, "learning_rate": 8.867503135954269e-07, "loss": 0.688, "step": 888 }, { "epoch": 0.24238293231545224, "grad_norm": 4.083782169658832, "learning_rate": 8.864701954073048e-07, "loss": 0.6337, "step": 889 }, { "epoch": 0.24265557903346738, "grad_norm": 8.240842998115935, "learning_rate": 8.861897755768997e-07, "loss": 0.7137, "step": 890 }, { "epoch": 0.2429282257514825, "grad_norm": 4.031817456599184, "learning_rate": 8.859090543230808e-07, "loss": 0.6999, "step": 891 }, { "epoch": 0.24320087246949765, "grad_norm": 2.1669716323056503, "learning_rate": 8.856280318649528e-07, "loss": 0.7447, "step": 892 }, { "epoch": 0.24347351918751278, "grad_norm": 2.3062587555901737, "learning_rate": 8.853467084218551e-07, "loss": 0.7, "step": 893 }, { "epoch": 0.24374616590552792, "grad_norm": 2.4803472343268056, "learning_rate": 8.850650842133628e-07, "loss": 0.7072, "step": 894 }, { "epoch": 0.24401881262354305, "grad_norm": 6.507585488560434, "learning_rate": 8.84783159459285e-07, "loss": 0.7839, "step": 895 }, { "epoch": 0.2442914593415582, "grad_norm": 1.9722868740048225, "learning_rate": 8.845009343796654e-07, "loss": 0.7318, "step": 896 }, { "epoch": 0.2445641060595733, "grad_norm": 2.836144994226975, "learning_rate": 8.842184091947829e-07, "loss": 0.8013, "step": 897 }, { "epoch": 0.24483675277758843, "grad_norm": 1.8862604333100887, "learning_rate": 8.839355841251497e-07, "loss": 0.7228, "step": 898 }, { "epoch": 0.24510939949560356, "grad_norm": 2.2989130007298804, "learning_rate": 8.836524593915124e-07, "loss": 0.7189, "step": 899 }, { "epoch": 0.2453820462136187, "grad_norm": 4.615863057095102, "learning_rate": 8.833690352148518e-07, "loss": 0.7136, "step": 900 }, { "epoch": 0.24565469293163383, "grad_norm": 2.271726208356459, "learning_rate": 8.83085311816382e-07, "loss": 0.695, "step": 901 }, { "epoch": 0.24592733964964897, "grad_norm": 2.559621515232891, "learning_rate": 8.82801289417551e-07, "loss": 0.7009, "step": 902 }, { "epoch": 0.2461999863676641, "grad_norm": 2.985585649908531, "learning_rate": 8.825169682400396e-07, "loss": 0.6819, "step": 903 }, { "epoch": 0.24647263308567924, "grad_norm": 1.9319800123075124, "learning_rate": 8.822323485057623e-07, "loss": 0.6707, "step": 904 }, { "epoch": 0.24674527980369437, "grad_norm": 1.7785203165864352, "learning_rate": 8.819474304368669e-07, "loss": 0.7562, "step": 905 }, { "epoch": 0.2470179265217095, "grad_norm": 2.0099615817613588, "learning_rate": 8.816622142557329e-07, "loss": 0.7888, "step": 906 }, { "epoch": 0.24729057323972461, "grad_norm": 2.1302586150550478, "learning_rate": 8.813767001849739e-07, "loss": 0.7493, "step": 907 }, { "epoch": 0.24756321995773975, "grad_norm": 2.2421204326410056, "learning_rate": 8.81090888447435e-07, "loss": 0.6435, "step": 908 }, { "epoch": 0.24783586667575488, "grad_norm": 2.9501193215705475, "learning_rate": 8.808047792661939e-07, "loss": 0.6986, "step": 909 }, { "epoch": 0.24810851339377002, "grad_norm": 2.309653092481919, "learning_rate": 8.805183728645606e-07, "loss": 0.7596, "step": 910 }, { "epoch": 0.24838116011178515, "grad_norm": 1.9273567710739627, "learning_rate": 8.802316694660772e-07, "loss": 0.7037, "step": 911 }, { "epoch": 0.2486538068298003, "grad_norm": 2.2268185738178587, "learning_rate": 8.799446692945172e-07, "loss": 0.686, "step": 912 }, { "epoch": 0.24892645354781542, "grad_norm": 2.3310954749126767, "learning_rate": 8.796573725738861e-07, "loss": 0.6613, "step": 913 }, { "epoch": 0.24919910026583056, "grad_norm": 5.685654376438305, "learning_rate": 8.793697795284203e-07, "loss": 0.7143, "step": 914 }, { "epoch": 0.2494717469838457, "grad_norm": 4.816671505942241, "learning_rate": 8.790818903825883e-07, "loss": 0.7732, "step": 915 }, { "epoch": 0.2497443937018608, "grad_norm": 2.1662960676276715, "learning_rate": 8.787937053610892e-07, "loss": 0.6558, "step": 916 }, { "epoch": 0.25001704041987594, "grad_norm": 4.180001587183687, "learning_rate": 8.785052246888529e-07, "loss": 0.6385, "step": 917 }, { "epoch": 0.25028968713789107, "grad_norm": 2.028955654355518, "learning_rate": 8.782164485910405e-07, "loss": 0.7376, "step": 918 }, { "epoch": 0.2505623338559062, "grad_norm": 2.085044075275439, "learning_rate": 8.779273772930433e-07, "loss": 0.6708, "step": 919 }, { "epoch": 0.25083498057392134, "grad_norm": 2.514047831762966, "learning_rate": 8.776380110204835e-07, "loss": 0.7202, "step": 920 }, { "epoch": 0.2511076272919365, "grad_norm": 4.1208123837029325, "learning_rate": 8.773483499992128e-07, "loss": 0.7571, "step": 921 }, { "epoch": 0.2513802740099516, "grad_norm": 4.952874795226447, "learning_rate": 8.770583944553136e-07, "loss": 0.7575, "step": 922 }, { "epoch": 0.25165292072796674, "grad_norm": 2.5851375825057303, "learning_rate": 8.767681446150976e-07, "loss": 0.6532, "step": 923 }, { "epoch": 0.2519255674459819, "grad_norm": 25.556677751991188, "learning_rate": 8.76477600705107e-07, "loss": 0.6917, "step": 924 }, { "epoch": 0.252198214163997, "grad_norm": 2.99114550897273, "learning_rate": 8.761867629521127e-07, "loss": 0.7367, "step": 925 }, { "epoch": 0.25247086088201215, "grad_norm": 4.222809222372426, "learning_rate": 8.758956315831155e-07, "loss": 0.7253, "step": 926 }, { "epoch": 0.2527435076000273, "grad_norm": 1.9645394538856984, "learning_rate": 8.756042068253448e-07, "loss": 0.723, "step": 927 }, { "epoch": 0.2530161543180424, "grad_norm": 2.0801160220356243, "learning_rate": 8.753124889062599e-07, "loss": 0.7434, "step": 928 }, { "epoch": 0.25328880103605755, "grad_norm": 1.944586140878796, "learning_rate": 8.75020478053548e-07, "loss": 0.735, "step": 929 }, { "epoch": 0.25356144775407263, "grad_norm": 12.423794706181775, "learning_rate": 8.747281744951254e-07, "loss": 0.7309, "step": 930 }, { "epoch": 0.25383409447208777, "grad_norm": 2.642967052824267, "learning_rate": 8.744355784591366e-07, "loss": 0.7285, "step": 931 }, { "epoch": 0.2541067411901029, "grad_norm": 2.0476428088382597, "learning_rate": 8.741426901739548e-07, "loss": 0.7223, "step": 932 }, { "epoch": 0.25437938790811804, "grad_norm": 2.191066571091054, "learning_rate": 8.73849509868181e-07, "loss": 0.7374, "step": 933 }, { "epoch": 0.2546520346261332, "grad_norm": 2.1555757901724033, "learning_rate": 8.73556037770644e-07, "loss": 0.6078, "step": 934 }, { "epoch": 0.2549246813441483, "grad_norm": 2.9450808199152934, "learning_rate": 8.732622741104008e-07, "loss": 0.6546, "step": 935 }, { "epoch": 0.25519732806216344, "grad_norm": 4.171858449529322, "learning_rate": 8.729682191167353e-07, "loss": 0.713, "step": 936 }, { "epoch": 0.2554699747801786, "grad_norm": 2.043964878921221, "learning_rate": 8.726738730191594e-07, "loss": 0.7112, "step": 937 }, { "epoch": 0.2557426214981937, "grad_norm": 3.009235680313109, "learning_rate": 8.72379236047412e-07, "loss": 0.692, "step": 938 }, { "epoch": 0.25601526821620885, "grad_norm": 2.158355231097736, "learning_rate": 8.72084308431459e-07, "loss": 0.6688, "step": 939 }, { "epoch": 0.256287914934224, "grad_norm": 2.343657695971492, "learning_rate": 8.717890904014931e-07, "loss": 0.6748, "step": 940 }, { "epoch": 0.2565605616522391, "grad_norm": 2.6915252055411774, "learning_rate": 8.714935821879337e-07, "loss": 0.6926, "step": 941 }, { "epoch": 0.25683320837025425, "grad_norm": 2.6411417886544672, "learning_rate": 8.711977840214269e-07, "loss": 0.7113, "step": 942 }, { "epoch": 0.2571058550882694, "grad_norm": 8.497465202702632, "learning_rate": 8.709016961328446e-07, "loss": 0.7366, "step": 943 }, { "epoch": 0.2573785018062845, "grad_norm": 2.212628760110969, "learning_rate": 8.706053187532853e-07, "loss": 0.6703, "step": 944 }, { "epoch": 0.25765114852429966, "grad_norm": 2.3731500697889363, "learning_rate": 8.703086521140735e-07, "loss": 0.6476, "step": 945 }, { "epoch": 0.2579237952423148, "grad_norm": 3.026700385378759, "learning_rate": 8.700116964467591e-07, "loss": 0.6874, "step": 946 }, { "epoch": 0.2581964419603299, "grad_norm": 2.7349106353712345, "learning_rate": 8.697144519831178e-07, "loss": 0.7138, "step": 947 }, { "epoch": 0.25846908867834506, "grad_norm": 3.5523652807090396, "learning_rate": 8.694169189551506e-07, "loss": 0.6701, "step": 948 }, { "epoch": 0.25874173539636014, "grad_norm": 2.2593314891631446, "learning_rate": 8.691190975950837e-07, "loss": 0.7057, "step": 949 }, { "epoch": 0.2590143821143753, "grad_norm": 2.219624350582658, "learning_rate": 8.688209881353688e-07, "loss": 0.6513, "step": 950 }, { "epoch": 0.2592870288323904, "grad_norm": 2.599980056722357, "learning_rate": 8.68522590808682e-07, "loss": 0.6442, "step": 951 }, { "epoch": 0.25955967555040554, "grad_norm": 2.1354419273809797, "learning_rate": 8.682239058479238e-07, "loss": 0.6365, "step": 952 }, { "epoch": 0.2598323222684207, "grad_norm": 2.2444962173658487, "learning_rate": 8.6792493348622e-07, "loss": 0.6869, "step": 953 }, { "epoch": 0.2601049689864358, "grad_norm": 2.059602984113889, "learning_rate": 8.676256739569201e-07, "loss": 0.6843, "step": 954 }, { "epoch": 0.26037761570445095, "grad_norm": 2.615232432485742, "learning_rate": 8.673261274935981e-07, "loss": 0.718, "step": 955 }, { "epoch": 0.2606502624224661, "grad_norm": 1.9854468775638494, "learning_rate": 8.670262943300517e-07, "loss": 0.7194, "step": 956 }, { "epoch": 0.2609229091404812, "grad_norm": 2.5174737239539247, "learning_rate": 8.667261747003022e-07, "loss": 0.6475, "step": 957 }, { "epoch": 0.26119555585849635, "grad_norm": 2.638913619127141, "learning_rate": 8.664257688385954e-07, "loss": 0.7168, "step": 958 }, { "epoch": 0.2614682025765115, "grad_norm": 7.256225427690231, "learning_rate": 8.661250769793993e-07, "loss": 0.6743, "step": 959 }, { "epoch": 0.2617408492945266, "grad_norm": 2.4381017069677293, "learning_rate": 8.65824099357406e-07, "loss": 0.6749, "step": 960 }, { "epoch": 0.26201349601254176, "grad_norm": 1.9995075897455568, "learning_rate": 8.6552283620753e-07, "loss": 0.6747, "step": 961 }, { "epoch": 0.2622861427305569, "grad_norm": 2.8643841078144723, "learning_rate": 8.652212877649092e-07, "loss": 0.7195, "step": 962 }, { "epoch": 0.262558789448572, "grad_norm": 2.3218913004233364, "learning_rate": 8.649194542649039e-07, "loss": 0.6568, "step": 963 }, { "epoch": 0.26283143616658716, "grad_norm": 2.0740053217842855, "learning_rate": 8.64617335943097e-07, "loss": 0.7175, "step": 964 }, { "epoch": 0.2631040828846023, "grad_norm": 2.506324790102128, "learning_rate": 8.643149330352936e-07, "loss": 0.7017, "step": 965 }, { "epoch": 0.26337672960261743, "grad_norm": 3.939596277381441, "learning_rate": 8.640122457775209e-07, "loss": 0.738, "step": 966 }, { "epoch": 0.26364937632063257, "grad_norm": 2.8456837797543373, "learning_rate": 8.637092744060284e-07, "loss": 0.6948, "step": 967 }, { "epoch": 0.26392202303864765, "grad_norm": 2.898002978675539, "learning_rate": 8.634060191572865e-07, "loss": 0.7087, "step": 968 }, { "epoch": 0.2641946697566628, "grad_norm": 5.940664614934427, "learning_rate": 8.631024802679882e-07, "loss": 0.7114, "step": 969 }, { "epoch": 0.2644673164746779, "grad_norm": 2.0763536726590113, "learning_rate": 8.627986579750473e-07, "loss": 0.6347, "step": 970 }, { "epoch": 0.26473996319269305, "grad_norm": 1.909081794646712, "learning_rate": 8.624945525155987e-07, "loss": 0.691, "step": 971 }, { "epoch": 0.2650126099107082, "grad_norm": 3.3582132585056517, "learning_rate": 8.621901641269988e-07, "loss": 0.7389, "step": 972 }, { "epoch": 0.2652852566287233, "grad_norm": 2.301576385488036, "learning_rate": 8.618854930468245e-07, "loss": 0.7757, "step": 973 }, { "epoch": 0.26555790334673846, "grad_norm": 6.263053474281853, "learning_rate": 8.615805395128731e-07, "loss": 0.6827, "step": 974 }, { "epoch": 0.2658305500647536, "grad_norm": 2.1594579002159535, "learning_rate": 8.612753037631629e-07, "loss": 0.7053, "step": 975 }, { "epoch": 0.2661031967827687, "grad_norm": 2.396557507485505, "learning_rate": 8.609697860359322e-07, "loss": 0.6414, "step": 976 }, { "epoch": 0.26637584350078386, "grad_norm": 3.3468601582510935, "learning_rate": 8.606639865696394e-07, "loss": 0.7401, "step": 977 }, { "epoch": 0.266648490218799, "grad_norm": 3.4473698521397815, "learning_rate": 8.603579056029628e-07, "loss": 0.7529, "step": 978 }, { "epoch": 0.26692113693681413, "grad_norm": 2.3179118846871356, "learning_rate": 8.600515433748001e-07, "loss": 0.6756, "step": 979 }, { "epoch": 0.26719378365482926, "grad_norm": 1.8285759162951416, "learning_rate": 8.597449001242694e-07, "loss": 0.6743, "step": 980 }, { "epoch": 0.2674664303728444, "grad_norm": 2.8656617793453205, "learning_rate": 8.594379760907074e-07, "loss": 0.7402, "step": 981 }, { "epoch": 0.26773907709085953, "grad_norm": 2.4380432426753043, "learning_rate": 8.591307715136698e-07, "loss": 0.6301, "step": 982 }, { "epoch": 0.26801172380887467, "grad_norm": 2.7550047078708157, "learning_rate": 8.588232866329318e-07, "loss": 0.7388, "step": 983 }, { "epoch": 0.2682843705268898, "grad_norm": 4.416030559391878, "learning_rate": 8.585155216884873e-07, "loss": 0.7423, "step": 984 }, { "epoch": 0.26855701724490494, "grad_norm": 2.3175294352794977, "learning_rate": 8.582074769205485e-07, "loss": 0.7314, "step": 985 }, { "epoch": 0.2688296639629201, "grad_norm": 3.967741745641965, "learning_rate": 8.578991525695463e-07, "loss": 0.717, "step": 986 }, { "epoch": 0.26910231068093515, "grad_norm": 22.563313837989888, "learning_rate": 8.575905488761294e-07, "loss": 0.7996, "step": 987 }, { "epoch": 0.2693749573989503, "grad_norm": 1.9534896743259451, "learning_rate": 8.572816660811655e-07, "loss": 0.7122, "step": 988 }, { "epoch": 0.2696476041169654, "grad_norm": 2.091515826508265, "learning_rate": 8.569725044257386e-07, "loss": 0.7302, "step": 989 }, { "epoch": 0.26992025083498056, "grad_norm": 2.119855269900613, "learning_rate": 8.566630641511519e-07, "loss": 0.6967, "step": 990 }, { "epoch": 0.2701928975529957, "grad_norm": 2.2133376875278126, "learning_rate": 8.563533454989248e-07, "loss": 0.7081, "step": 991 }, { "epoch": 0.2704655442710108, "grad_norm": 8.740256866648046, "learning_rate": 8.56043348710795e-07, "loss": 0.7115, "step": 992 }, { "epoch": 0.27073819098902596, "grad_norm": 2.000349599055623, "learning_rate": 8.557330740287166e-07, "loss": 0.619, "step": 993 }, { "epoch": 0.2710108377070411, "grad_norm": 2.2076851558442407, "learning_rate": 8.55422521694861e-07, "loss": 0.7071, "step": 994 }, { "epoch": 0.27128348442505623, "grad_norm": 2.549368456199061, "learning_rate": 8.551116919516158e-07, "loss": 0.7045, "step": 995 }, { "epoch": 0.27155613114307137, "grad_norm": 1.8706695991491105, "learning_rate": 8.548005850415858e-07, "loss": 0.7319, "step": 996 }, { "epoch": 0.2718287778610865, "grad_norm": 2.3301753042508193, "learning_rate": 8.544892012075917e-07, "loss": 0.6968, "step": 997 }, { "epoch": 0.27210142457910164, "grad_norm": 1.97270162303938, "learning_rate": 8.541775406926706e-07, "loss": 0.7358, "step": 998 }, { "epoch": 0.27237407129711677, "grad_norm": 2.782922452279277, "learning_rate": 8.538656037400749e-07, "loss": 0.6438, "step": 999 }, { "epoch": 0.2726467180151319, "grad_norm": 1.9072592294941457, "learning_rate": 8.535533905932737e-07, "loss": 0.7176, "step": 1000 }, { "epoch": 0.27291936473314704, "grad_norm": 2.951903827036589, "learning_rate": 8.532409014959511e-07, "loss": 0.6907, "step": 1001 }, { "epoch": 0.2731920114511622, "grad_norm": 2.7252494241979, "learning_rate": 8.529281366920067e-07, "loss": 0.7406, "step": 1002 }, { "epoch": 0.2734646581691773, "grad_norm": 15.607155369012107, "learning_rate": 8.526150964255551e-07, "loss": 0.6629, "step": 1003 }, { "epoch": 0.27373730488719245, "grad_norm": 1.831876133941206, "learning_rate": 8.523017809409262e-07, "loss": 0.7117, "step": 1004 }, { "epoch": 0.2740099516052075, "grad_norm": 2.210124811941493, "learning_rate": 8.519881904826647e-07, "loss": 0.7471, "step": 1005 }, { "epoch": 0.27428259832322266, "grad_norm": 3.0961318087157763, "learning_rate": 8.516743252955299e-07, "loss": 0.6843, "step": 1006 }, { "epoch": 0.2745552450412378, "grad_norm": 6.365829395578147, "learning_rate": 8.51360185624495e-07, "loss": 0.7312, "step": 1007 }, { "epoch": 0.27482789175925293, "grad_norm": 2.5917598775473656, "learning_rate": 8.510457717147483e-07, "loss": 0.7444, "step": 1008 }, { "epoch": 0.27510053847726806, "grad_norm": 3.797532240215371, "learning_rate": 8.507310838116917e-07, "loss": 0.63, "step": 1009 }, { "epoch": 0.2753731851952832, "grad_norm": 2.4861882827010207, "learning_rate": 8.504161221609407e-07, "loss": 0.6931, "step": 1010 }, { "epoch": 0.27564583191329833, "grad_norm": 2.2632478584303106, "learning_rate": 8.50100887008325e-07, "loss": 0.6576, "step": 1011 }, { "epoch": 0.27591847863131347, "grad_norm": 2.2328560439809797, "learning_rate": 8.497853785998874e-07, "loss": 0.701, "step": 1012 }, { "epoch": 0.2761911253493286, "grad_norm": 2.1447532595582888, "learning_rate": 8.494695971818841e-07, "loss": 0.66, "step": 1013 }, { "epoch": 0.27646377206734374, "grad_norm": 2.6176676108494115, "learning_rate": 8.491535430007843e-07, "loss": 0.6681, "step": 1014 }, { "epoch": 0.2767364187853589, "grad_norm": 2.853800179668248, "learning_rate": 8.488372163032704e-07, "loss": 0.6636, "step": 1015 }, { "epoch": 0.277009065503374, "grad_norm": 1.8376634102756835, "learning_rate": 8.485206173362372e-07, "loss": 0.663, "step": 1016 }, { "epoch": 0.27728171222138914, "grad_norm": 3.7066466452928046, "learning_rate": 8.482037463467917e-07, "loss": 0.6743, "step": 1017 }, { "epoch": 0.2775543589394043, "grad_norm": 6.559617509907182, "learning_rate": 8.478866035822539e-07, "loss": 0.6826, "step": 1018 }, { "epoch": 0.2778270056574194, "grad_norm": 5.825149990974548, "learning_rate": 8.475691892901558e-07, "loss": 0.6798, "step": 1019 }, { "epoch": 0.27809965237543455, "grad_norm": 3.851070006350462, "learning_rate": 8.472515037182407e-07, "loss": 0.7013, "step": 1020 }, { "epoch": 0.2783722990934497, "grad_norm": 2.387837247368048, "learning_rate": 8.469335471144646e-07, "loss": 0.6759, "step": 1021 }, { "epoch": 0.2786449458114648, "grad_norm": 2.1619676140524553, "learning_rate": 8.466153197269938e-07, "loss": 0.71, "step": 1022 }, { "epoch": 0.27891759252947995, "grad_norm": 2.462150947971652, "learning_rate": 8.462968218042073e-07, "loss": 0.6377, "step": 1023 }, { "epoch": 0.27919023924749503, "grad_norm": 3.126862281034543, "learning_rate": 8.459780535946945e-07, "loss": 0.7389, "step": 1024 }, { "epoch": 0.27946288596551017, "grad_norm": 3.7729396942705633, "learning_rate": 8.456590153472555e-07, "loss": 0.7214, "step": 1025 }, { "epoch": 0.2797355326835253, "grad_norm": 2.0652873995463428, "learning_rate": 8.453397073109021e-07, "loss": 0.6853, "step": 1026 }, { "epoch": 0.28000817940154044, "grad_norm": 1.93359263925882, "learning_rate": 8.450201297348558e-07, "loss": 0.6624, "step": 1027 }, { "epoch": 0.28028082611955557, "grad_norm": 2.0988274336141473, "learning_rate": 8.447002828685489e-07, "loss": 0.7189, "step": 1028 }, { "epoch": 0.2805534728375707, "grad_norm": 4.320159972750844, "learning_rate": 8.443801669616238e-07, "loss": 0.7439, "step": 1029 }, { "epoch": 0.28082611955558584, "grad_norm": 3.477915729698169, "learning_rate": 8.440597822639324e-07, "loss": 0.7805, "step": 1030 }, { "epoch": 0.281098766273601, "grad_norm": 53.678902314693126, "learning_rate": 8.437391290255375e-07, "loss": 0.7417, "step": 1031 }, { "epoch": 0.2813714129916161, "grad_norm": 2.3694374718848343, "learning_rate": 8.434182074967106e-07, "loss": 0.7022, "step": 1032 }, { "epoch": 0.28164405970963124, "grad_norm": 5.008670373745134, "learning_rate": 8.43097017927933e-07, "loss": 0.7241, "step": 1033 }, { "epoch": 0.2819167064276464, "grad_norm": 1.8600220817206232, "learning_rate": 8.427755605698947e-07, "loss": 0.6333, "step": 1034 }, { "epoch": 0.2821893531456615, "grad_norm": 1.889573389972253, "learning_rate": 8.424538356734956e-07, "loss": 0.7, "step": 1035 }, { "epoch": 0.28246199986367665, "grad_norm": 2.5133544886003594, "learning_rate": 8.421318434898434e-07, "loss": 0.6251, "step": 1036 }, { "epoch": 0.2827346465816918, "grad_norm": 2.6690782913937467, "learning_rate": 8.41809584270255e-07, "loss": 0.6975, "step": 1037 }, { "epoch": 0.2830072932997069, "grad_norm": 2.4144687751926224, "learning_rate": 8.41487058266256e-07, "loss": 0.6657, "step": 1038 }, { "epoch": 0.28327994001772205, "grad_norm": 23.78629890574906, "learning_rate": 8.411642657295795e-07, "loss": 0.7487, "step": 1039 }, { "epoch": 0.2835525867357372, "grad_norm": 2.3299374609061583, "learning_rate": 8.408412069121672e-07, "loss": 0.7514, "step": 1040 }, { "epoch": 0.2838252334537523, "grad_norm": 3.018772574150368, "learning_rate": 8.405178820661682e-07, "loss": 0.6933, "step": 1041 }, { "epoch": 0.28409788017176746, "grad_norm": 3.0431955035805665, "learning_rate": 8.401942914439397e-07, "loss": 0.6948, "step": 1042 }, { "epoch": 0.28437052688978254, "grad_norm": 1.9834179755540926, "learning_rate": 8.398704352980459e-07, "loss": 0.7661, "step": 1043 }, { "epoch": 0.2846431736077977, "grad_norm": 4.848763176966332, "learning_rate": 8.395463138812586e-07, "loss": 0.6928, "step": 1044 }, { "epoch": 0.2849158203258128, "grad_norm": 2.966307527204037, "learning_rate": 8.392219274465566e-07, "loss": 0.6753, "step": 1045 }, { "epoch": 0.28518846704382794, "grad_norm": 7.682217359048256, "learning_rate": 8.388972762471251e-07, "loss": 0.6932, "step": 1046 }, { "epoch": 0.2854611137618431, "grad_norm": 2.7963095753346883, "learning_rate": 8.385723605363565e-07, "loss": 0.7235, "step": 1047 }, { "epoch": 0.2857337604798582, "grad_norm": 4.008533860830827, "learning_rate": 8.382471805678495e-07, "loss": 0.7114, "step": 1048 }, { "epoch": 0.28600640719787335, "grad_norm": 2.754184583597783, "learning_rate": 8.379217365954088e-07, "loss": 0.7225, "step": 1049 }, { "epoch": 0.2862790539158885, "grad_norm": 2.3157225282758973, "learning_rate": 8.375960288730453e-07, "loss": 0.7405, "step": 1050 }, { "epoch": 0.2865517006339036, "grad_norm": 2.0816858616932232, "learning_rate": 8.372700576549762e-07, "loss": 0.7061, "step": 1051 }, { "epoch": 0.28682434735191875, "grad_norm": 2.0940333351205895, "learning_rate": 8.369438231956237e-07, "loss": 0.6732, "step": 1052 }, { "epoch": 0.2870969940699339, "grad_norm": 1.9708792558829031, "learning_rate": 8.366173257496156e-07, "loss": 0.7048, "step": 1053 }, { "epoch": 0.287369640787949, "grad_norm": 2.614582267864537, "learning_rate": 8.362905655717856e-07, "loss": 0.7055, "step": 1054 }, { "epoch": 0.28764228750596416, "grad_norm": 2.3157496612747335, "learning_rate": 8.359635429171714e-07, "loss": 0.68, "step": 1055 }, { "epoch": 0.2879149342239793, "grad_norm": 2.123579393693026, "learning_rate": 8.356362580410163e-07, "loss": 0.7235, "step": 1056 }, { "epoch": 0.2881875809419944, "grad_norm": 2.557900876378316, "learning_rate": 8.353087111987683e-07, "loss": 0.7117, "step": 1057 }, { "epoch": 0.28846022766000956, "grad_norm": 2.8694048645355266, "learning_rate": 8.349809026460795e-07, "loss": 0.6212, "step": 1058 }, { "epoch": 0.2887328743780247, "grad_norm": 2.2228952737263312, "learning_rate": 8.346528326388062e-07, "loss": 0.7079, "step": 1059 }, { "epoch": 0.28900552109603983, "grad_norm": 2.5044198565192892, "learning_rate": 8.343245014330095e-07, "loss": 0.6791, "step": 1060 }, { "epoch": 0.28927816781405497, "grad_norm": 4.217235996065754, "learning_rate": 8.339959092849535e-07, "loss": 0.7266, "step": 1061 }, { "epoch": 0.28955081453207004, "grad_norm": 2.1668939836690275, "learning_rate": 8.336670564511063e-07, "loss": 0.7059, "step": 1062 }, { "epoch": 0.2898234612500852, "grad_norm": 5.878541681801647, "learning_rate": 8.333379431881397e-07, "loss": 0.7606, "step": 1063 }, { "epoch": 0.2900961079681003, "grad_norm": 2.854336933547971, "learning_rate": 8.330085697529283e-07, "loss": 0.6511, "step": 1064 }, { "epoch": 0.29036875468611545, "grad_norm": 3.122724463816616, "learning_rate": 8.326789364025502e-07, "loss": 0.6619, "step": 1065 }, { "epoch": 0.2906414014041306, "grad_norm": 10.868445526013076, "learning_rate": 8.323490433942862e-07, "loss": 0.7197, "step": 1066 }, { "epoch": 0.2909140481221457, "grad_norm": 4.509481140742328, "learning_rate": 8.320188909856198e-07, "loss": 0.7604, "step": 1067 }, { "epoch": 0.29118669484016085, "grad_norm": 2.5155243606102475, "learning_rate": 8.316884794342367e-07, "loss": 0.6355, "step": 1068 }, { "epoch": 0.291459341558176, "grad_norm": 2.4910273951897, "learning_rate": 8.313578089980251e-07, "loss": 0.7241, "step": 1069 }, { "epoch": 0.2917319882761911, "grad_norm": 6.10460786989703, "learning_rate": 8.310268799350755e-07, "loss": 0.714, "step": 1070 }, { "epoch": 0.29200463499420626, "grad_norm": 1.7754408811789042, "learning_rate": 8.306956925036797e-07, "loss": 0.7199, "step": 1071 }, { "epoch": 0.2922772817122214, "grad_norm": 1.9308207322617021, "learning_rate": 8.303642469623316e-07, "loss": 0.7107, "step": 1072 }, { "epoch": 0.2925499284302365, "grad_norm": 2.5342973125859025, "learning_rate": 8.300325435697266e-07, "loss": 0.7222, "step": 1073 }, { "epoch": 0.29282257514825166, "grad_norm": 2.8777038269464597, "learning_rate": 8.297005825847608e-07, "loss": 0.6949, "step": 1074 }, { "epoch": 0.2930952218662668, "grad_norm": 2.7163756405679456, "learning_rate": 8.29368364266532e-07, "loss": 0.7377, "step": 1075 }, { "epoch": 0.29336786858428193, "grad_norm": 3.368148515558149, "learning_rate": 8.290358888743383e-07, "loss": 0.734, "step": 1076 }, { "epoch": 0.29364051530229707, "grad_norm": 2.0442045533794864, "learning_rate": 8.28703156667679e-07, "loss": 0.6536, "step": 1077 }, { "epoch": 0.2939131620203122, "grad_norm": 1.9163425163653995, "learning_rate": 8.283701679062535e-07, "loss": 0.6566, "step": 1078 }, { "epoch": 0.29418580873832734, "grad_norm": 2.7120478891452944, "learning_rate": 8.280369228499615e-07, "loss": 0.6756, "step": 1079 }, { "epoch": 0.29445845545634247, "grad_norm": 5.986083335530315, "learning_rate": 8.277034217589025e-07, "loss": 0.7277, "step": 1080 }, { "epoch": 0.29473110217435755, "grad_norm": 6.369931853614676, "learning_rate": 8.273696648933764e-07, "loss": 0.7116, "step": 1081 }, { "epoch": 0.2950037488923727, "grad_norm": 5.763828677797375, "learning_rate": 8.270356525138824e-07, "loss": 0.6553, "step": 1082 }, { "epoch": 0.2952763956103878, "grad_norm": 2.027356915497659, "learning_rate": 8.267013848811188e-07, "loss": 0.6228, "step": 1083 }, { "epoch": 0.29554904232840296, "grad_norm": 2.561435694208411, "learning_rate": 8.263668622559838e-07, "loss": 0.6812, "step": 1084 }, { "epoch": 0.2958216890464181, "grad_norm": 5.682037047488708, "learning_rate": 8.260320848995743e-07, "loss": 0.6903, "step": 1085 }, { "epoch": 0.2960943357644332, "grad_norm": 3.5080986291660614, "learning_rate": 8.25697053073186e-07, "loss": 0.6811, "step": 1086 }, { "epoch": 0.29636698248244836, "grad_norm": 7.400046746248996, "learning_rate": 8.253617670383131e-07, "loss": 0.6752, "step": 1087 }, { "epoch": 0.2966396292004635, "grad_norm": 2.675975987819887, "learning_rate": 8.250262270566484e-07, "loss": 0.6977, "step": 1088 }, { "epoch": 0.29691227591847863, "grad_norm": 1.9470834272115132, "learning_rate": 8.246904333900826e-07, "loss": 0.727, "step": 1089 }, { "epoch": 0.29718492263649376, "grad_norm": 2.467830527022115, "learning_rate": 8.243543863007052e-07, "loss": 0.7496, "step": 1090 }, { "epoch": 0.2974575693545089, "grad_norm": 1.995281540924074, "learning_rate": 8.240180860508026e-07, "loss": 0.6716, "step": 1091 }, { "epoch": 0.29773021607252403, "grad_norm": 3.308816778229639, "learning_rate": 8.236815329028592e-07, "loss": 0.739, "step": 1092 }, { "epoch": 0.29800286279053917, "grad_norm": 2.3947009185897397, "learning_rate": 8.233447271195566e-07, "loss": 0.7029, "step": 1093 }, { "epoch": 0.2982755095085543, "grad_norm": 2.6878767782302133, "learning_rate": 8.23007668963774e-07, "loss": 0.7502, "step": 1094 }, { "epoch": 0.29854815622656944, "grad_norm": 2.1731448144987993, "learning_rate": 8.226703586985871e-07, "loss": 0.7993, "step": 1095 }, { "epoch": 0.2988208029445846, "grad_norm": 3.5082825308753596, "learning_rate": 8.223327965872684e-07, "loss": 0.7013, "step": 1096 }, { "epoch": 0.2990934496625997, "grad_norm": 2.1493345650171953, "learning_rate": 8.219949828932875e-07, "loss": 0.7287, "step": 1097 }, { "epoch": 0.29936609638061484, "grad_norm": 2.386050914087641, "learning_rate": 8.216569178803099e-07, "loss": 0.6279, "step": 1098 }, { "epoch": 0.2996387430986299, "grad_norm": 2.3413056885166705, "learning_rate": 8.213186018121971e-07, "loss": 0.6909, "step": 1099 }, { "epoch": 0.29991138981664506, "grad_norm": 2.801012810223906, "learning_rate": 8.209800349530072e-07, "loss": 0.6627, "step": 1100 }, { "epoch": 0.3001840365346602, "grad_norm": 3.1341919676708225, "learning_rate": 8.206412175669932e-07, "loss": 0.7368, "step": 1101 }, { "epoch": 0.3004566832526753, "grad_norm": 4.094925735033716, "learning_rate": 8.203021499186044e-07, "loss": 0.6764, "step": 1102 }, { "epoch": 0.30072932997069046, "grad_norm": 2.0284560475860935, "learning_rate": 8.199628322724849e-07, "loss": 0.665, "step": 1103 }, { "epoch": 0.3010019766887056, "grad_norm": 2.09149175941722, "learning_rate": 8.196232648934745e-07, "loss": 0.6834, "step": 1104 }, { "epoch": 0.30127462340672073, "grad_norm": 3.765522173402244, "learning_rate": 8.192834480466071e-07, "loss": 0.7255, "step": 1105 }, { "epoch": 0.30154727012473587, "grad_norm": 11.29428675502819, "learning_rate": 8.189433819971121e-07, "loss": 0.7272, "step": 1106 }, { "epoch": 0.301819916842751, "grad_norm": 2.4178506072222863, "learning_rate": 8.186030670104129e-07, "loss": 0.7043, "step": 1107 }, { "epoch": 0.30209256356076614, "grad_norm": 2.3837509462152076, "learning_rate": 8.182625033521276e-07, "loss": 0.7425, "step": 1108 }, { "epoch": 0.30236521027878127, "grad_norm": 1.821317562905148, "learning_rate": 8.179216912880677e-07, "loss": 0.746, "step": 1109 }, { "epoch": 0.3026378569967964, "grad_norm": 2.614825338447899, "learning_rate": 8.175806310842394e-07, "loss": 0.659, "step": 1110 }, { "epoch": 0.30291050371481154, "grad_norm": 4.715399057298287, "learning_rate": 8.172393230068421e-07, "loss": 0.7808, "step": 1111 }, { "epoch": 0.3031831504328267, "grad_norm": 2.3152737673689683, "learning_rate": 8.16897767322269e-07, "loss": 0.751, "step": 1112 }, { "epoch": 0.3034557971508418, "grad_norm": 2.1664904291958136, "learning_rate": 8.165559642971061e-07, "loss": 0.7116, "step": 1113 }, { "epoch": 0.30372844386885695, "grad_norm": 17.081626142632057, "learning_rate": 8.162139141981325e-07, "loss": 0.7276, "step": 1114 }, { "epoch": 0.3040010905868721, "grad_norm": 2.200614969255518, "learning_rate": 8.158716172923209e-07, "loss": 0.6873, "step": 1115 }, { "epoch": 0.3042737373048872, "grad_norm": 2.0319950352652474, "learning_rate": 8.155290738468356e-07, "loss": 0.7003, "step": 1116 }, { "epoch": 0.30454638402290235, "grad_norm": 2.9098833730162093, "learning_rate": 8.15186284129034e-07, "loss": 0.7298, "step": 1117 }, { "epoch": 0.30481903074091743, "grad_norm": 3.225018325925044, "learning_rate": 8.148432484064654e-07, "loss": 0.7123, "step": 1118 }, { "epoch": 0.30509167745893256, "grad_norm": 2.34138121656858, "learning_rate": 8.144999669468713e-07, "loss": 0.6711, "step": 1119 }, { "epoch": 0.3053643241769477, "grad_norm": 3.4381616521363156, "learning_rate": 8.141564400181849e-07, "loss": 0.7328, "step": 1120 }, { "epoch": 0.30563697089496283, "grad_norm": 3.5406003299029645, "learning_rate": 8.138126678885308e-07, "loss": 0.7032, "step": 1121 }, { "epoch": 0.30590961761297797, "grad_norm": 2.4244765013607226, "learning_rate": 8.134686508262256e-07, "loss": 0.7224, "step": 1122 }, { "epoch": 0.3061822643309931, "grad_norm": 2.849563777918214, "learning_rate": 8.131243890997765e-07, "loss": 0.717, "step": 1123 }, { "epoch": 0.30645491104900824, "grad_norm": 3.348211588949593, "learning_rate": 8.127798829778817e-07, "loss": 0.7457, "step": 1124 }, { "epoch": 0.3067275577670234, "grad_norm": 1.770545875013245, "learning_rate": 8.124351327294303e-07, "loss": 0.6833, "step": 1125 }, { "epoch": 0.3070002044850385, "grad_norm": 3.4150426344493097, "learning_rate": 8.12090138623502e-07, "loss": 0.6835, "step": 1126 }, { "epoch": 0.30727285120305364, "grad_norm": 3.3077864668168493, "learning_rate": 8.117449009293668e-07, "loss": 0.7551, "step": 1127 }, { "epoch": 0.3075454979210688, "grad_norm": 2.7991411246080555, "learning_rate": 8.113994199164847e-07, "loss": 0.6805, "step": 1128 }, { "epoch": 0.3078181446390839, "grad_norm": 10.059572696575739, "learning_rate": 8.110536958545054e-07, "loss": 0.7484, "step": 1129 }, { "epoch": 0.30809079135709905, "grad_norm": 3.230962789380112, "learning_rate": 8.10707729013269e-07, "loss": 0.6818, "step": 1130 }, { "epoch": 0.3083634380751142, "grad_norm": 5.224511195892849, "learning_rate": 8.103615196628045e-07, "loss": 0.758, "step": 1131 }, { "epoch": 0.3086360847931293, "grad_norm": 1.8771749325850833, "learning_rate": 8.100150680733304e-07, "loss": 0.6968, "step": 1132 }, { "epoch": 0.30890873151114445, "grad_norm": 2.5039763211958035, "learning_rate": 8.096683745152544e-07, "loss": 0.7415, "step": 1133 }, { "epoch": 0.3091813782291596, "grad_norm": 2.333626626791709, "learning_rate": 8.093214392591725e-07, "loss": 0.6318, "step": 1134 }, { "epoch": 0.3094540249471747, "grad_norm": 2.8168656119731543, "learning_rate": 8.089742625758698e-07, "loss": 0.7387, "step": 1135 }, { "epoch": 0.30972667166518986, "grad_norm": 2.566435543266975, "learning_rate": 8.086268447363201e-07, "loss": 0.6954, "step": 1136 }, { "epoch": 0.30999931838320494, "grad_norm": 2.2161981893471476, "learning_rate": 8.082791860116847e-07, "loss": 0.6922, "step": 1137 }, { "epoch": 0.31027196510122007, "grad_norm": 4.996369194860005, "learning_rate": 8.079312866733135e-07, "loss": 0.7236, "step": 1138 }, { "epoch": 0.3105446118192352, "grad_norm": 2.9922153446594044, "learning_rate": 8.07583146992744e-07, "loss": 0.719, "step": 1139 }, { "epoch": 0.31081725853725034, "grad_norm": 1.8848492943878956, "learning_rate": 8.072347672417013e-07, "loss": 0.6295, "step": 1140 }, { "epoch": 0.3110899052552655, "grad_norm": 3.5854425669989207, "learning_rate": 8.068861476920977e-07, "loss": 0.7129, "step": 1141 }, { "epoch": 0.3113625519732806, "grad_norm": 2.467592803894994, "learning_rate": 8.065372886160329e-07, "loss": 0.7302, "step": 1142 }, { "epoch": 0.31163519869129575, "grad_norm": 2.73513315162365, "learning_rate": 8.061881902857936e-07, "loss": 0.7066, "step": 1143 }, { "epoch": 0.3119078454093109, "grad_norm": 3.895593074413459, "learning_rate": 8.058388529738531e-07, "loss": 0.7079, "step": 1144 }, { "epoch": 0.312180492127326, "grad_norm": 1.6407670964151382, "learning_rate": 8.054892769528713e-07, "loss": 0.6912, "step": 1145 }, { "epoch": 0.31245313884534115, "grad_norm": 6.765803567977074, "learning_rate": 8.051394624956941e-07, "loss": 0.6806, "step": 1146 }, { "epoch": 0.3127257855633563, "grad_norm": 2.5887754892075834, "learning_rate": 8.047894098753539e-07, "loss": 0.7257, "step": 1147 }, { "epoch": 0.3129984322813714, "grad_norm": 11.409898754540254, "learning_rate": 8.04439119365069e-07, "loss": 0.6989, "step": 1148 }, { "epoch": 0.31327107899938655, "grad_norm": 1.994578372568323, "learning_rate": 8.040885912382429e-07, "loss": 0.6916, "step": 1149 }, { "epoch": 0.3135437257174017, "grad_norm": 4.289383135640034, "learning_rate": 8.037378257684652e-07, "loss": 0.7039, "step": 1150 }, { "epoch": 0.3138163724354168, "grad_norm": 2.402997509823705, "learning_rate": 8.033868232295102e-07, "loss": 0.6949, "step": 1151 }, { "epoch": 0.31408901915343196, "grad_norm": 2.62298729553075, "learning_rate": 8.030355838953377e-07, "loss": 0.6615, "step": 1152 }, { "epoch": 0.3143616658714471, "grad_norm": 1.955463760408973, "learning_rate": 8.026841080400919e-07, "loss": 0.7293, "step": 1153 }, { "epoch": 0.31463431258946223, "grad_norm": 1.9775446361690694, "learning_rate": 8.02332395938102e-07, "loss": 0.7021, "step": 1154 }, { "epoch": 0.31490695930747736, "grad_norm": 2.412138864964455, "learning_rate": 8.01980447863881e-07, "loss": 0.7719, "step": 1155 }, { "epoch": 0.31517960602549244, "grad_norm": 2.7319187839099404, "learning_rate": 8.016282640921268e-07, "loss": 0.7708, "step": 1156 }, { "epoch": 0.3154522527435076, "grad_norm": 2.447752105170249, "learning_rate": 8.012758448977208e-07, "loss": 0.695, "step": 1157 }, { "epoch": 0.3157248994615227, "grad_norm": 4.270816875399033, "learning_rate": 8.009231905557283e-07, "loss": 0.7247, "step": 1158 }, { "epoch": 0.31599754617953785, "grad_norm": 3.241634678257943, "learning_rate": 8.005703013413983e-07, "loss": 0.6847, "step": 1159 }, { "epoch": 0.316270192897553, "grad_norm": 3.9039964564054253, "learning_rate": 8.002171775301625e-07, "loss": 0.7732, "step": 1160 }, { "epoch": 0.3165428396155681, "grad_norm": 2.8394949739199298, "learning_rate": 7.998638193976365e-07, "loss": 0.7014, "step": 1161 }, { "epoch": 0.31681548633358325, "grad_norm": 2.361495375880293, "learning_rate": 7.995102272196181e-07, "loss": 0.7156, "step": 1162 }, { "epoch": 0.3170881330515984, "grad_norm": 2.137260801708163, "learning_rate": 7.991564012720886e-07, "loss": 0.6768, "step": 1163 }, { "epoch": 0.3173607797696135, "grad_norm": 5.932279552877067, "learning_rate": 7.988023418312108e-07, "loss": 0.7733, "step": 1164 }, { "epoch": 0.31763342648762866, "grad_norm": 2.837507665097167, "learning_rate": 7.984480491733303e-07, "loss": 0.6427, "step": 1165 }, { "epoch": 0.3179060732056438, "grad_norm": 1.9568946010363961, "learning_rate": 7.980935235749746e-07, "loss": 0.6573, "step": 1166 }, { "epoch": 0.3181787199236589, "grad_norm": 3.651996425101692, "learning_rate": 7.977387653128532e-07, "loss": 0.6866, "step": 1167 }, { "epoch": 0.31845136664167406, "grad_norm": 3.091994234951737, "learning_rate": 7.973837746638569e-07, "loss": 0.6627, "step": 1168 }, { "epoch": 0.3187240133596892, "grad_norm": 1.8751745840834493, "learning_rate": 7.970285519050581e-07, "loss": 0.704, "step": 1169 }, { "epoch": 0.31899666007770433, "grad_norm": 4.593287936993618, "learning_rate": 7.966730973137102e-07, "loss": 0.7149, "step": 1170 }, { "epoch": 0.31926930679571947, "grad_norm": 2.4152027716945015, "learning_rate": 7.963174111672475e-07, "loss": 0.7415, "step": 1171 }, { "epoch": 0.3195419535137346, "grad_norm": 4.369815119567433, "learning_rate": 7.959614937432853e-07, "loss": 0.6954, "step": 1172 }, { "epoch": 0.31981460023174973, "grad_norm": 2.696262090330289, "learning_rate": 7.956053453196194e-07, "loss": 0.6892, "step": 1173 }, { "epoch": 0.32008724694976487, "grad_norm": 2.1784034547008284, "learning_rate": 7.952489661742255e-07, "loss": 0.7134, "step": 1174 }, { "epoch": 0.32035989366777995, "grad_norm": 2.288211069606536, "learning_rate": 7.948923565852597e-07, "loss": 0.6861, "step": 1175 }, { "epoch": 0.3206325403857951, "grad_norm": 2.114246879700957, "learning_rate": 7.94535516831058e-07, "loss": 0.7636, "step": 1176 }, { "epoch": 0.3209051871038102, "grad_norm": 2.8442440561284537, "learning_rate": 7.941784471901358e-07, "loss": 0.6921, "step": 1177 }, { "epoch": 0.32117783382182535, "grad_norm": 2.3065214508221494, "learning_rate": 7.938211479411884e-07, "loss": 0.6764, "step": 1178 }, { "epoch": 0.3214504805398405, "grad_norm": 10.9090549031901, "learning_rate": 7.934636193630897e-07, "loss": 0.7489, "step": 1179 }, { "epoch": 0.3217231272578556, "grad_norm": 2.420670682812594, "learning_rate": 7.931058617348928e-07, "loss": 0.7167, "step": 1180 }, { "epoch": 0.32199577397587076, "grad_norm": 2.5870868137722485, "learning_rate": 7.927478753358299e-07, "loss": 0.7476, "step": 1181 }, { "epoch": 0.3222684206938859, "grad_norm": 2.0344030179571395, "learning_rate": 7.923896604453113e-07, "loss": 0.6245, "step": 1182 }, { "epoch": 0.32254106741190103, "grad_norm": 3.794720407396578, "learning_rate": 7.92031217342926e-07, "loss": 0.7306, "step": 1183 }, { "epoch": 0.32281371412991616, "grad_norm": 2.219816195416327, "learning_rate": 7.916725463084408e-07, "loss": 0.7229, "step": 1184 }, { "epoch": 0.3230863608479313, "grad_norm": 2.1789199708047535, "learning_rate": 7.913136476218009e-07, "loss": 0.7417, "step": 1185 }, { "epoch": 0.32335900756594643, "grad_norm": 2.127211474576336, "learning_rate": 7.909545215631286e-07, "loss": 0.6799, "step": 1186 }, { "epoch": 0.32363165428396157, "grad_norm": 2.446236847720144, "learning_rate": 7.905951684127238e-07, "loss": 0.7066, "step": 1187 }, { "epoch": 0.3239043010019767, "grad_norm": 2.395361591100281, "learning_rate": 7.902355884510641e-07, "loss": 0.6836, "step": 1188 }, { "epoch": 0.32417694771999184, "grad_norm": 2.120843360522879, "learning_rate": 7.898757819588037e-07, "loss": 0.6671, "step": 1189 }, { "epoch": 0.32444959443800697, "grad_norm": 2.186455677980774, "learning_rate": 7.895157492167734e-07, "loss": 0.7155, "step": 1190 }, { "epoch": 0.3247222411560221, "grad_norm": 4.439538573579997, "learning_rate": 7.891554905059814e-07, "loss": 0.6862, "step": 1191 }, { "epoch": 0.32499488787403724, "grad_norm": 2.600279692899243, "learning_rate": 7.887950061076114e-07, "loss": 0.7005, "step": 1192 }, { "epoch": 0.3252675345920523, "grad_norm": 2.2493875667253858, "learning_rate": 7.884342963030238e-07, "loss": 0.6825, "step": 1193 }, { "epoch": 0.32554018131006746, "grad_norm": 2.219188353765569, "learning_rate": 7.880733613737545e-07, "loss": 0.6789, "step": 1194 }, { "epoch": 0.3258128280280826, "grad_norm": 2.9120270454183097, "learning_rate": 7.877122016015156e-07, "loss": 0.699, "step": 1195 }, { "epoch": 0.3260854747460977, "grad_norm": 3.2213998005523226, "learning_rate": 7.873508172681942e-07, "loss": 0.7601, "step": 1196 }, { "epoch": 0.32635812146411286, "grad_norm": 2.9196176523822825, "learning_rate": 7.869892086558532e-07, "loss": 0.6539, "step": 1197 }, { "epoch": 0.326630768182128, "grad_norm": 8.289222197171712, "learning_rate": 7.866273760467303e-07, "loss": 0.6987, "step": 1198 }, { "epoch": 0.32690341490014313, "grad_norm": 1.9034320965691764, "learning_rate": 7.862653197232376e-07, "loss": 0.6744, "step": 1199 }, { "epoch": 0.32717606161815826, "grad_norm": 1.93794843243472, "learning_rate": 7.859030399679627e-07, "loss": 0.722, "step": 1200 }, { "epoch": 0.3274487083361734, "grad_norm": 34.56054018926358, "learning_rate": 7.855405370636666e-07, "loss": 0.5946, "step": 1201 }, { "epoch": 0.32772135505418853, "grad_norm": 2.1340098017762377, "learning_rate": 7.851778112932855e-07, "loss": 0.738, "step": 1202 }, { "epoch": 0.32799400177220367, "grad_norm": 2.942883640633845, "learning_rate": 7.848148629399285e-07, "loss": 0.7832, "step": 1203 }, { "epoch": 0.3282666484902188, "grad_norm": 2.439570744504257, "learning_rate": 7.844516922868793e-07, "loss": 0.6812, "step": 1204 }, { "epoch": 0.32853929520823394, "grad_norm": 3.8144971703583685, "learning_rate": 7.840882996175947e-07, "loss": 0.6954, "step": 1205 }, { "epoch": 0.3288119419262491, "grad_norm": 2.230872450074237, "learning_rate": 7.837246852157048e-07, "loss": 0.6631, "step": 1206 }, { "epoch": 0.3290845886442642, "grad_norm": 2.818089740826756, "learning_rate": 7.833608493650128e-07, "loss": 0.6416, "step": 1207 }, { "epoch": 0.32935723536227934, "grad_norm": 9.339720433268745, "learning_rate": 7.829967923494945e-07, "loss": 0.7287, "step": 1208 }, { "epoch": 0.3296298820802945, "grad_norm": 2.2191605618793147, "learning_rate": 7.82632514453299e-07, "loss": 0.7145, "step": 1209 }, { "epoch": 0.3299025287983096, "grad_norm": 3.1886265405678356, "learning_rate": 7.822680159607471e-07, "loss": 0.6604, "step": 1210 }, { "epoch": 0.33017517551632475, "grad_norm": 4.315158946982288, "learning_rate": 7.819032971563322e-07, "loss": 0.7373, "step": 1211 }, { "epoch": 0.3304478222343398, "grad_norm": 2.0045660424334146, "learning_rate": 7.815383583247193e-07, "loss": 0.7198, "step": 1212 }, { "epoch": 0.33072046895235496, "grad_norm": 3.1768393924151965, "learning_rate": 7.811731997507453e-07, "loss": 0.6969, "step": 1213 }, { "epoch": 0.3309931156703701, "grad_norm": 1.9687151556870903, "learning_rate": 7.80807821719419e-07, "loss": 0.6996, "step": 1214 }, { "epoch": 0.33126576238838523, "grad_norm": 2.3424203998336997, "learning_rate": 7.804422245159195e-07, "loss": 0.6648, "step": 1215 }, { "epoch": 0.33153840910640037, "grad_norm": 3.628523459103048, "learning_rate": 7.800764084255982e-07, "loss": 0.7144, "step": 1216 }, { "epoch": 0.3318110558244155, "grad_norm": 4.098896992461442, "learning_rate": 7.797103737339766e-07, "loss": 0.7041, "step": 1217 }, { "epoch": 0.33208370254243064, "grad_norm": 5.0579813083745595, "learning_rate": 7.793441207267466e-07, "loss": 0.7051, "step": 1218 }, { "epoch": 0.33235634926044577, "grad_norm": 1.7837541958702827, "learning_rate": 7.789776496897713e-07, "loss": 0.6392, "step": 1219 }, { "epoch": 0.3326289959784609, "grad_norm": 2.8708589942564604, "learning_rate": 7.786109609090832e-07, "loss": 0.6805, "step": 1220 }, { "epoch": 0.33290164269647604, "grad_norm": 1.8959393923918404, "learning_rate": 7.782440546708852e-07, "loss": 0.6468, "step": 1221 }, { "epoch": 0.3331742894144912, "grad_norm": 2.1338561178216455, "learning_rate": 7.778769312615498e-07, "loss": 0.7492, "step": 1222 }, { "epoch": 0.3334469361325063, "grad_norm": 3.830730952970738, "learning_rate": 7.775095909676188e-07, "loss": 0.674, "step": 1223 }, { "epoch": 0.33371958285052145, "grad_norm": 3.1177542838700036, "learning_rate": 7.771420340758039e-07, "loss": 0.677, "step": 1224 }, { "epoch": 0.3339922295685366, "grad_norm": 2.1743414897697377, "learning_rate": 7.767742608729851e-07, "loss": 0.6887, "step": 1225 }, { "epoch": 0.3342648762865517, "grad_norm": 2.24355088095329, "learning_rate": 7.764062716462118e-07, "loss": 0.7068, "step": 1226 }, { "epoch": 0.33453752300456685, "grad_norm": 2.6723428117642536, "learning_rate": 7.760380666827015e-07, "loss": 0.7293, "step": 1227 }, { "epoch": 0.334810169722582, "grad_norm": 3.141182704864245, "learning_rate": 7.756696462698404e-07, "loss": 0.6936, "step": 1228 }, { "epoch": 0.3350828164405971, "grad_norm": 2.1724575913180293, "learning_rate": 7.753010106951832e-07, "loss": 0.6878, "step": 1229 }, { "epoch": 0.33535546315861225, "grad_norm": 3.5090302983060004, "learning_rate": 7.749321602464518e-07, "loss": 0.7261, "step": 1230 }, { "epoch": 0.33562810987662733, "grad_norm": 6.3189167129084405, "learning_rate": 7.745630952115363e-07, "loss": 0.7193, "step": 1231 }, { "epoch": 0.33590075659464247, "grad_norm": 4.077835700439304, "learning_rate": 7.74193815878494e-07, "loss": 0.6605, "step": 1232 }, { "epoch": 0.3361734033126576, "grad_norm": 2.187416097862206, "learning_rate": 7.738243225355496e-07, "loss": 0.7127, "step": 1233 }, { "epoch": 0.33644605003067274, "grad_norm": 2.193937191873204, "learning_rate": 7.73454615471095e-07, "loss": 0.6165, "step": 1234 }, { "epoch": 0.3367186967486879, "grad_norm": 3.2708331803767687, "learning_rate": 7.730846949736888e-07, "loss": 0.7449, "step": 1235 }, { "epoch": 0.336991343466703, "grad_norm": 3.1968625399903985, "learning_rate": 7.727145613320563e-07, "loss": 0.7554, "step": 1236 }, { "epoch": 0.33726399018471814, "grad_norm": 1.9150296738119736, "learning_rate": 7.723442148350885e-07, "loss": 0.7138, "step": 1237 }, { "epoch": 0.3375366369027333, "grad_norm": 4.067079202887401, "learning_rate": 7.719736557718434e-07, "loss": 0.6587, "step": 1238 }, { "epoch": 0.3378092836207484, "grad_norm": 2.4289155103882085, "learning_rate": 7.716028844315446e-07, "loss": 0.7095, "step": 1239 }, { "epoch": 0.33808193033876355, "grad_norm": 2.0601503054862236, "learning_rate": 7.712319011035809e-07, "loss": 0.6997, "step": 1240 }, { "epoch": 0.3383545770567787, "grad_norm": 1.7215794344731798, "learning_rate": 7.708607060775074e-07, "loss": 0.6813, "step": 1241 }, { "epoch": 0.3386272237747938, "grad_norm": 2.440489930077423, "learning_rate": 7.704892996430441e-07, "loss": 0.7754, "step": 1242 }, { "epoch": 0.33889987049280895, "grad_norm": 2.318673281916767, "learning_rate": 7.701176820900755e-07, "loss": 0.7261, "step": 1243 }, { "epoch": 0.3391725172108241, "grad_norm": 2.248002583605671, "learning_rate": 7.697458537086517e-07, "loss": 0.7291, "step": 1244 }, { "epoch": 0.3394451639288392, "grad_norm": 2.417364793240655, "learning_rate": 7.693738147889868e-07, "loss": 0.6851, "step": 1245 }, { "epoch": 0.33971781064685436, "grad_norm": 2.39000827752512, "learning_rate": 7.690015656214594e-07, "loss": 0.7373, "step": 1246 }, { "epoch": 0.3399904573648695, "grad_norm": 2.403951465487042, "learning_rate": 7.686291064966121e-07, "loss": 0.671, "step": 1247 }, { "epoch": 0.3402631040828846, "grad_norm": 1.9599310903823588, "learning_rate": 7.682564377051516e-07, "loss": 0.7651, "step": 1248 }, { "epoch": 0.34053575080089976, "grad_norm": 2.057466851230842, "learning_rate": 7.67883559537948e-07, "loss": 0.7625, "step": 1249 }, { "epoch": 0.34080839751891484, "grad_norm": 2.857144406111625, "learning_rate": 7.67510472286035e-07, "loss": 0.6766, "step": 1250 }, { "epoch": 0.34108104423693, "grad_norm": 1.6743981974075588, "learning_rate": 7.671371762406094e-07, "loss": 0.677, "step": 1251 }, { "epoch": 0.3413536909549451, "grad_norm": 3.563386472927292, "learning_rate": 7.66763671693031e-07, "loss": 0.6976, "step": 1252 }, { "epoch": 0.34162633767296025, "grad_norm": 2.4765009351994203, "learning_rate": 7.663899589348224e-07, "loss": 0.7759, "step": 1253 }, { "epoch": 0.3418989843909754, "grad_norm": 2.425924956923561, "learning_rate": 7.660160382576683e-07, "loss": 0.7578, "step": 1254 }, { "epoch": 0.3421716311089905, "grad_norm": 3.7675586491529174, "learning_rate": 7.656419099534163e-07, "loss": 0.6693, "step": 1255 }, { "epoch": 0.34244427782700565, "grad_norm": 3.0378595528030408, "learning_rate": 7.652675743140755e-07, "loss": 0.7085, "step": 1256 }, { "epoch": 0.3427169245450208, "grad_norm": 2.324137397279641, "learning_rate": 7.648930316318177e-07, "loss": 0.6589, "step": 1257 }, { "epoch": 0.3429895712630359, "grad_norm": 1.969342023562692, "learning_rate": 7.645182821989749e-07, "loss": 0.7642, "step": 1258 }, { "epoch": 0.34326221798105105, "grad_norm": 6.0999957302249515, "learning_rate": 7.641433263080418e-07, "loss": 0.7073, "step": 1259 }, { "epoch": 0.3435348646990662, "grad_norm": 1.972144389225515, "learning_rate": 7.637681642516735e-07, "loss": 0.7217, "step": 1260 }, { "epoch": 0.3438075114170813, "grad_norm": 3.2850665267273507, "learning_rate": 7.633927963226862e-07, "loss": 0.6883, "step": 1261 }, { "epoch": 0.34408015813509646, "grad_norm": 2.1542753972203017, "learning_rate": 7.630172228140567e-07, "loss": 0.7448, "step": 1262 }, { "epoch": 0.3443528048531116, "grad_norm": 2.374092370865783, "learning_rate": 7.626414440189228e-07, "loss": 0.7609, "step": 1263 }, { "epoch": 0.34462545157112673, "grad_norm": 2.2920190118899355, "learning_rate": 7.622654602305816e-07, "loss": 0.7174, "step": 1264 }, { "epoch": 0.34489809828914186, "grad_norm": 2.731983474849317, "learning_rate": 7.61889271742491e-07, "loss": 0.6573, "step": 1265 }, { "epoch": 0.345170745007157, "grad_norm": 3.089571301602777, "learning_rate": 7.615128788482681e-07, "loss": 0.7243, "step": 1266 }, { "epoch": 0.34544339172517213, "grad_norm": 2.2699299619736877, "learning_rate": 7.611362818416898e-07, "loss": 0.7902, "step": 1267 }, { "epoch": 0.3457160384431872, "grad_norm": 2.171633751140346, "learning_rate": 7.607594810166924e-07, "loss": 0.6598, "step": 1268 }, { "epoch": 0.34598868516120235, "grad_norm": 2.8370892555084875, "learning_rate": 7.603824766673712e-07, "loss": 0.7306, "step": 1269 }, { "epoch": 0.3462613318792175, "grad_norm": 3.04115987927263, "learning_rate": 7.600052690879804e-07, "loss": 0.7578, "step": 1270 }, { "epoch": 0.3465339785972326, "grad_norm": 2.536084326184834, "learning_rate": 7.596278585729327e-07, "loss": 0.6975, "step": 1271 }, { "epoch": 0.34680662531524775, "grad_norm": 8.859708323823414, "learning_rate": 7.592502454167991e-07, "loss": 0.7792, "step": 1272 }, { "epoch": 0.3470792720332629, "grad_norm": 2.1441052855369094, "learning_rate": 7.58872429914309e-07, "loss": 0.6798, "step": 1273 }, { "epoch": 0.347351918751278, "grad_norm": 2.7185058919449356, "learning_rate": 7.584944123603497e-07, "loss": 0.6542, "step": 1274 }, { "epoch": 0.34762456546929316, "grad_norm": 3.262970899133585, "learning_rate": 7.581161930499662e-07, "loss": 0.6578, "step": 1275 }, { "epoch": 0.3478972121873083, "grad_norm": 3.4303689171547176, "learning_rate": 7.577377722783606e-07, "loss": 0.6973, "step": 1276 }, { "epoch": 0.3481698589053234, "grad_norm": 2.368498817155732, "learning_rate": 7.573591503408932e-07, "loss": 0.7043, "step": 1277 }, { "epoch": 0.34844250562333856, "grad_norm": 3.181694020596902, "learning_rate": 7.5698032753308e-07, "loss": 0.7028, "step": 1278 }, { "epoch": 0.3487151523413537, "grad_norm": 2.997794262941334, "learning_rate": 7.566013041505948e-07, "loss": 0.7002, "step": 1279 }, { "epoch": 0.34898779905936883, "grad_norm": 2.3430963543979804, "learning_rate": 7.562220804892673e-07, "loss": 0.7388, "step": 1280 }, { "epoch": 0.34926044577738397, "grad_norm": 1.9995002790759324, "learning_rate": 7.558426568450842e-07, "loss": 0.6989, "step": 1281 }, { "epoch": 0.3495330924953991, "grad_norm": 3.7205993563007036, "learning_rate": 7.554630335141878e-07, "loss": 0.6345, "step": 1282 }, { "epoch": 0.34980573921341424, "grad_norm": 128.82302757645283, "learning_rate": 7.550832107928762e-07, "loss": 0.6711, "step": 1283 }, { "epoch": 0.35007838593142937, "grad_norm": 6.238817398069892, "learning_rate": 7.547031889776036e-07, "loss": 0.7081, "step": 1284 }, { "epoch": 0.3503510326494445, "grad_norm": 7.520755240673071, "learning_rate": 7.543229683649789e-07, "loss": 0.6974, "step": 1285 }, { "epoch": 0.35062367936745964, "grad_norm": 2.685995034178567, "learning_rate": 7.539425492517669e-07, "loss": 0.6592, "step": 1286 }, { "epoch": 0.3508963260854747, "grad_norm": 2.0973230627997377, "learning_rate": 7.535619319348865e-07, "loss": 0.6611, "step": 1287 }, { "epoch": 0.35116897280348985, "grad_norm": 2.511825499434242, "learning_rate": 7.531811167114123e-07, "loss": 0.7013, "step": 1288 }, { "epoch": 0.351441619521505, "grad_norm": 2.1632343125261637, "learning_rate": 7.528001038785726e-07, "loss": 0.7358, "step": 1289 }, { "epoch": 0.3517142662395201, "grad_norm": 2.1321088903161654, "learning_rate": 7.524188937337504e-07, "loss": 0.7037, "step": 1290 }, { "epoch": 0.35198691295753526, "grad_norm": 2.552239950140482, "learning_rate": 7.52037486574482e-07, "loss": 0.7352, "step": 1291 }, { "epoch": 0.3522595596755504, "grad_norm": 1.962214871791568, "learning_rate": 7.516558826984583e-07, "loss": 0.721, "step": 1292 }, { "epoch": 0.35253220639356553, "grad_norm": 2.7596500664361994, "learning_rate": 7.512740824035233e-07, "loss": 0.6932, "step": 1293 }, { "epoch": 0.35280485311158066, "grad_norm": 2.040030294109265, "learning_rate": 7.508920859876746e-07, "loss": 0.6521, "step": 1294 }, { "epoch": 0.3530774998295958, "grad_norm": 2.3472780456073967, "learning_rate": 7.505098937490624e-07, "loss": 0.707, "step": 1295 }, { "epoch": 0.35335014654761093, "grad_norm": 2.210365903770209, "learning_rate": 7.501275059859901e-07, "loss": 0.6705, "step": 1296 }, { "epoch": 0.35362279326562607, "grad_norm": 2.6633034434628717, "learning_rate": 7.497449229969137e-07, "loss": 0.7312, "step": 1297 }, { "epoch": 0.3538954399836412, "grad_norm": 3.0620847632719146, "learning_rate": 7.493621450804415e-07, "loss": 0.6768, "step": 1298 }, { "epoch": 0.35416808670165634, "grad_norm": 3.0962108017706242, "learning_rate": 7.489791725353338e-07, "loss": 0.6886, "step": 1299 }, { "epoch": 0.35444073341967147, "grad_norm": 2.4178883558056348, "learning_rate": 7.48596005660503e-07, "loss": 0.6939, "step": 1300 }, { "epoch": 0.3547133801376866, "grad_norm": 2.6493293445091806, "learning_rate": 7.482126447550131e-07, "loss": 0.6998, "step": 1301 }, { "epoch": 0.35498602685570174, "grad_norm": 2.6604600062637584, "learning_rate": 7.478290901180797e-07, "loss": 0.6747, "step": 1302 }, { "epoch": 0.3552586735737169, "grad_norm": 5.0178676852650534, "learning_rate": 7.474453420490694e-07, "loss": 0.7547, "step": 1303 }, { "epoch": 0.355531320291732, "grad_norm": 1.8384071353649702, "learning_rate": 7.470614008474997e-07, "loss": 0.7394, "step": 1304 }, { "epoch": 0.35580396700974715, "grad_norm": 7.7584771412247635, "learning_rate": 7.466772668130392e-07, "loss": 0.5884, "step": 1305 }, { "epoch": 0.3560766137277622, "grad_norm": 3.054732837870358, "learning_rate": 7.462929402455067e-07, "loss": 0.613, "step": 1306 }, { "epoch": 0.35634926044577736, "grad_norm": 2.588602037288793, "learning_rate": 7.459084214448713e-07, "loss": 0.7201, "step": 1307 }, { "epoch": 0.3566219071637925, "grad_norm": 5.862003110888804, "learning_rate": 7.455237107112524e-07, "loss": 0.7109, "step": 1308 }, { "epoch": 0.35689455388180763, "grad_norm": 2.3316626149470774, "learning_rate": 7.451388083449188e-07, "loss": 0.7394, "step": 1309 }, { "epoch": 0.35716720059982277, "grad_norm": 3.795727625532326, "learning_rate": 7.447537146462892e-07, "loss": 0.7114, "step": 1310 }, { "epoch": 0.3574398473178379, "grad_norm": 1.863930126704741, "learning_rate": 7.443684299159316e-07, "loss": 0.6651, "step": 1311 }, { "epoch": 0.35771249403585303, "grad_norm": 2.2989163321402555, "learning_rate": 7.439829544545629e-07, "loss": 0.7385, "step": 1312 }, { "epoch": 0.35798514075386817, "grad_norm": 2.5108293244517146, "learning_rate": 7.435972885630488e-07, "loss": 0.7459, "step": 1313 }, { "epoch": 0.3582577874718833, "grad_norm": 2.7825217190963247, "learning_rate": 7.432114325424041e-07, "loss": 0.7239, "step": 1314 }, { "epoch": 0.35853043418989844, "grad_norm": 2.5401576492971265, "learning_rate": 7.428253866937918e-07, "loss": 0.794, "step": 1315 }, { "epoch": 0.3588030809079136, "grad_norm": 3.872455028126752, "learning_rate": 7.424391513185227e-07, "loss": 0.7072, "step": 1316 }, { "epoch": 0.3590757276259287, "grad_norm": 2.354436401886198, "learning_rate": 7.420527267180561e-07, "loss": 0.7422, "step": 1317 }, { "epoch": 0.35934837434394384, "grad_norm": 2.7478492946581254, "learning_rate": 7.416661131939983e-07, "loss": 0.6661, "step": 1318 }, { "epoch": 0.359621021061959, "grad_norm": 2.1267461887900323, "learning_rate": 7.412793110481039e-07, "loss": 0.6981, "step": 1319 }, { "epoch": 0.3598936677799741, "grad_norm": 2.102452163465651, "learning_rate": 7.408923205822737e-07, "loss": 0.7388, "step": 1320 }, { "epoch": 0.36016631449798925, "grad_norm": 4.5125547752766, "learning_rate": 7.405051420985566e-07, "loss": 0.7506, "step": 1321 }, { "epoch": 0.3604389612160044, "grad_norm": 2.178337920737195, "learning_rate": 7.401177758991477e-07, "loss": 0.728, "step": 1322 }, { "epoch": 0.3607116079340195, "grad_norm": 5.576866562788828, "learning_rate": 7.397302222863881e-07, "loss": 0.6578, "step": 1323 }, { "epoch": 0.36098425465203465, "grad_norm": 3.626393855859761, "learning_rate": 7.393424815627663e-07, "loss": 0.6933, "step": 1324 }, { "epoch": 0.36125690137004973, "grad_norm": 1.77829707964316, "learning_rate": 7.389545540309159e-07, "loss": 0.6855, "step": 1325 }, { "epoch": 0.36152954808806487, "grad_norm": 2.700769198453862, "learning_rate": 7.385664399936166e-07, "loss": 0.7085, "step": 1326 }, { "epoch": 0.36180219480608, "grad_norm": 3.621613389951744, "learning_rate": 7.381781397537939e-07, "loss": 0.7048, "step": 1327 }, { "epoch": 0.36207484152409514, "grad_norm": 4.406388960419418, "learning_rate": 7.377896536145183e-07, "loss": 0.6593, "step": 1328 }, { "epoch": 0.36234748824211027, "grad_norm": 2.317478874452263, "learning_rate": 7.374009818790057e-07, "loss": 0.6211, "step": 1329 }, { "epoch": 0.3626201349601254, "grad_norm": 2.2380315414444256, "learning_rate": 7.370121248506165e-07, "loss": 0.7585, "step": 1330 }, { "epoch": 0.36289278167814054, "grad_norm": 3.4070303730308407, "learning_rate": 7.366230828328559e-07, "loss": 0.7003, "step": 1331 }, { "epoch": 0.3631654283961557, "grad_norm": 2.7247822479795416, "learning_rate": 7.362338561293738e-07, "loss": 0.7191, "step": 1332 }, { "epoch": 0.3634380751141708, "grad_norm": 2.11065131043162, "learning_rate": 7.358444450439637e-07, "loss": 0.6967, "step": 1333 }, { "epoch": 0.36371072183218595, "grad_norm": 2.105028427496537, "learning_rate": 7.354548498805635e-07, "loss": 0.7, "step": 1334 }, { "epoch": 0.3639833685502011, "grad_norm": 3.18019449855647, "learning_rate": 7.350650709432542e-07, "loss": 0.6267, "step": 1335 }, { "epoch": 0.3642560152682162, "grad_norm": 1.9368158544021172, "learning_rate": 7.34675108536261e-07, "loss": 0.6991, "step": 1336 }, { "epoch": 0.36452866198623135, "grad_norm": 3.371149927151986, "learning_rate": 7.342849629639513e-07, "loss": 0.7269, "step": 1337 }, { "epoch": 0.3648013087042465, "grad_norm": 2.483437862544481, "learning_rate": 7.338946345308367e-07, "loss": 0.7485, "step": 1338 }, { "epoch": 0.3650739554222616, "grad_norm": 1.8121079111402498, "learning_rate": 7.335041235415704e-07, "loss": 0.7123, "step": 1339 }, { "epoch": 0.36534660214027675, "grad_norm": 3.114593875963402, "learning_rate": 7.331134303009485e-07, "loss": 0.6916, "step": 1340 }, { "epoch": 0.3656192488582919, "grad_norm": 24.713539019870183, "learning_rate": 7.327225551139098e-07, "loss": 0.7497, "step": 1341 }, { "epoch": 0.365891895576307, "grad_norm": 4.350931717285157, "learning_rate": 7.323314982855345e-07, "loss": 0.6777, "step": 1342 }, { "epoch": 0.36616454229432216, "grad_norm": 1.8204167261079893, "learning_rate": 7.319402601210447e-07, "loss": 0.7969, "step": 1343 }, { "epoch": 0.36643718901233724, "grad_norm": 2.8615374213183467, "learning_rate": 7.315488409258042e-07, "loss": 0.6863, "step": 1344 }, { "epoch": 0.3667098357303524, "grad_norm": 9.458153531876302, "learning_rate": 7.311572410053179e-07, "loss": 0.7355, "step": 1345 }, { "epoch": 0.3669824824483675, "grad_norm": 1.9663037247185446, "learning_rate": 7.307654606652319e-07, "loss": 0.7152, "step": 1346 }, { "epoch": 0.36725512916638264, "grad_norm": 6.053858521381498, "learning_rate": 7.303735002113332e-07, "loss": 0.6891, "step": 1347 }, { "epoch": 0.3675277758843978, "grad_norm": 2.681410373271466, "learning_rate": 7.299813599495492e-07, "loss": 0.7543, "step": 1348 }, { "epoch": 0.3678004226024129, "grad_norm": 4.549626331179066, "learning_rate": 7.295890401859476e-07, "loss": 0.6929, "step": 1349 }, { "epoch": 0.36807306932042805, "grad_norm": 1.9800767499736895, "learning_rate": 7.291965412267363e-07, "loss": 0.6691, "step": 1350 }, { "epoch": 0.3683457160384432, "grad_norm": 2.0103672345544217, "learning_rate": 7.288038633782634e-07, "loss": 0.7363, "step": 1351 }, { "epoch": 0.3686183627564583, "grad_norm": 2.2485657724288295, "learning_rate": 7.28411006947016e-07, "loss": 0.7345, "step": 1352 }, { "epoch": 0.36889100947447345, "grad_norm": 3.0867597077852147, "learning_rate": 7.280179722396208e-07, "loss": 0.7277, "step": 1353 }, { "epoch": 0.3691636561924886, "grad_norm": 2.0291068164303203, "learning_rate": 7.276247595628439e-07, "loss": 0.6863, "step": 1354 }, { "epoch": 0.3694363029105037, "grad_norm": 2.020459495656706, "learning_rate": 7.272313692235902e-07, "loss": 0.7428, "step": 1355 }, { "epoch": 0.36970894962851886, "grad_norm": 4.612088015981483, "learning_rate": 7.26837801528903e-07, "loss": 0.7078, "step": 1356 }, { "epoch": 0.369981596346534, "grad_norm": 4.357023989963209, "learning_rate": 7.264440567859644e-07, "loss": 0.6752, "step": 1357 }, { "epoch": 0.3702542430645491, "grad_norm": 3.7430160313711003, "learning_rate": 7.260501353020944e-07, "loss": 0.6129, "step": 1358 }, { "epoch": 0.37052688978256426, "grad_norm": 2.123382227770236, "learning_rate": 7.256560373847509e-07, "loss": 0.6834, "step": 1359 }, { "epoch": 0.3707995365005794, "grad_norm": 3.1190304305373515, "learning_rate": 7.2526176334153e-07, "loss": 0.6752, "step": 1360 }, { "epoch": 0.37107218321859453, "grad_norm": 2.410577079467074, "learning_rate": 7.248673134801647e-07, "loss": 0.7613, "step": 1361 }, { "epoch": 0.3713448299366096, "grad_norm": 2.118396337967617, "learning_rate": 7.244726881085257e-07, "loss": 0.7543, "step": 1362 }, { "epoch": 0.37161747665462475, "grad_norm": 4.2397010945473745, "learning_rate": 7.240778875346201e-07, "loss": 0.7404, "step": 1363 }, { "epoch": 0.3718901233726399, "grad_norm": 3.115693419822553, "learning_rate": 7.236829120665924e-07, "loss": 0.7246, "step": 1364 }, { "epoch": 0.372162770090655, "grad_norm": 3.118124406913913, "learning_rate": 7.232877620127232e-07, "loss": 0.6596, "step": 1365 }, { "epoch": 0.37243541680867015, "grad_norm": 2.050343167392117, "learning_rate": 7.228924376814292e-07, "loss": 0.7407, "step": 1366 }, { "epoch": 0.3727080635266853, "grad_norm": 1.7096648657021307, "learning_rate": 7.224969393812637e-07, "loss": 0.6798, "step": 1367 }, { "epoch": 0.3729807102447004, "grad_norm": 3.379993777039902, "learning_rate": 7.221012674209155e-07, "loss": 0.7801, "step": 1368 }, { "epoch": 0.37325335696271555, "grad_norm": 2.444078835592972, "learning_rate": 7.217054221092087e-07, "loss": 0.7286, "step": 1369 }, { "epoch": 0.3735260036807307, "grad_norm": 1.9798768030223612, "learning_rate": 7.213094037551031e-07, "loss": 0.6655, "step": 1370 }, { "epoch": 0.3737986503987458, "grad_norm": 7.833204429301889, "learning_rate": 7.209132126676933e-07, "loss": 0.7323, "step": 1371 }, { "epoch": 0.37407129711676096, "grad_norm": 3.425006757506246, "learning_rate": 7.205168491562089e-07, "loss": 0.6803, "step": 1372 }, { "epoch": 0.3743439438347761, "grad_norm": 7.894206291986132, "learning_rate": 7.201203135300137e-07, "loss": 0.7527, "step": 1373 }, { "epoch": 0.37461659055279123, "grad_norm": 2.2078448215342577, "learning_rate": 7.197236060986064e-07, "loss": 0.6757, "step": 1374 }, { "epoch": 0.37488923727080636, "grad_norm": 2.347873577795533, "learning_rate": 7.193267271716195e-07, "loss": 0.7497, "step": 1375 }, { "epoch": 0.3751618839888215, "grad_norm": 2.1718164047972266, "learning_rate": 7.189296770588191e-07, "loss": 0.7072, "step": 1376 }, { "epoch": 0.37543453070683663, "grad_norm": 2.035727161607417, "learning_rate": 7.185324560701054e-07, "loss": 0.6701, "step": 1377 }, { "epoch": 0.37570717742485177, "grad_norm": 2.318407582581617, "learning_rate": 7.181350645155116e-07, "loss": 0.6609, "step": 1378 }, { "epoch": 0.3759798241428669, "grad_norm": 3.29262604855436, "learning_rate": 7.177375027052043e-07, "loss": 0.6848, "step": 1379 }, { "epoch": 0.37625247086088204, "grad_norm": 1.9663139271647663, "learning_rate": 7.173397709494827e-07, "loss": 0.6379, "step": 1380 }, { "epoch": 0.3765251175788971, "grad_norm": 8.431507308215703, "learning_rate": 7.16941869558779e-07, "loss": 0.6737, "step": 1381 }, { "epoch": 0.37679776429691225, "grad_norm": 3.2662295049031727, "learning_rate": 7.165437988436575e-07, "loss": 0.7561, "step": 1382 }, { "epoch": 0.3770704110149274, "grad_norm": 2.858282346304002, "learning_rate": 7.161455591148148e-07, "loss": 0.7351, "step": 1383 }, { "epoch": 0.3773430577329425, "grad_norm": 3.7543337749572356, "learning_rate": 7.157471506830793e-07, "loss": 0.6964, "step": 1384 }, { "epoch": 0.37761570445095766, "grad_norm": 2.1164104203389797, "learning_rate": 7.15348573859411e-07, "loss": 0.7003, "step": 1385 }, { "epoch": 0.3778883511689728, "grad_norm": 2.22582585396, "learning_rate": 7.149498289549018e-07, "loss": 0.8107, "step": 1386 }, { "epoch": 0.3781609978869879, "grad_norm": 2.554653839409578, "learning_rate": 7.145509162807742e-07, "loss": 0.7347, "step": 1387 }, { "epoch": 0.37843364460500306, "grad_norm": 2.0352279810979343, "learning_rate": 7.141518361483821e-07, "loss": 0.7603, "step": 1388 }, { "epoch": 0.3787062913230182, "grad_norm": 2.9054567574852936, "learning_rate": 7.137525888692098e-07, "loss": 0.7692, "step": 1389 }, { "epoch": 0.37897893804103333, "grad_norm": 3.2072172314668626, "learning_rate": 7.133531747548723e-07, "loss": 0.6304, "step": 1390 }, { "epoch": 0.37925158475904847, "grad_norm": 1.9999980312811447, "learning_rate": 7.129535941171145e-07, "loss": 0.6648, "step": 1391 }, { "epoch": 0.3795242314770636, "grad_norm": 2.0149656921417907, "learning_rate": 7.125538472678115e-07, "loss": 0.6805, "step": 1392 }, { "epoch": 0.37979687819507874, "grad_norm": 2.291816019276859, "learning_rate": 7.121539345189683e-07, "loss": 0.7083, "step": 1393 }, { "epoch": 0.38006952491309387, "grad_norm": 1.8617423964218487, "learning_rate": 7.117538561827191e-07, "loss": 0.7052, "step": 1394 }, { "epoch": 0.380342171631109, "grad_norm": 6.226463932899705, "learning_rate": 7.113536125713272e-07, "loss": 0.7161, "step": 1395 }, { "epoch": 0.38061481834912414, "grad_norm": 2.128157892270293, "learning_rate": 7.109532039971854e-07, "loss": 0.7096, "step": 1396 }, { "epoch": 0.3808874650671393, "grad_norm": 2.4871119812105675, "learning_rate": 7.105526307728147e-07, "loss": 0.6597, "step": 1397 }, { "epoch": 0.3811601117851544, "grad_norm": 1.8299517415132691, "learning_rate": 7.101518932108651e-07, "loss": 0.7501, "step": 1398 }, { "epoch": 0.38143275850316954, "grad_norm": 2.3813833420514428, "learning_rate": 7.097509916241145e-07, "loss": 0.7434, "step": 1399 }, { "epoch": 0.3817054052211846, "grad_norm": 3.4770057481887084, "learning_rate": 7.09349926325469e-07, "loss": 0.6531, "step": 1400 }, { "epoch": 0.38197805193919976, "grad_norm": 2.7900048464528933, "learning_rate": 7.089486976279626e-07, "loss": 0.7471, "step": 1401 }, { "epoch": 0.3822506986572149, "grad_norm": 18.13683327554361, "learning_rate": 7.085473058447565e-07, "loss": 0.6731, "step": 1402 }, { "epoch": 0.38252334537523003, "grad_norm": 6.342158674355083, "learning_rate": 7.081457512891395e-07, "loss": 0.7145, "step": 1403 }, { "epoch": 0.38279599209324516, "grad_norm": 2.2056865737090776, "learning_rate": 7.077440342745271e-07, "loss": 0.7458, "step": 1404 }, { "epoch": 0.3830686388112603, "grad_norm": 2.506487827921892, "learning_rate": 7.07342155114462e-07, "loss": 0.7203, "step": 1405 }, { "epoch": 0.38334128552927543, "grad_norm": 2.3479221560832784, "learning_rate": 7.069401141226133e-07, "loss": 0.7237, "step": 1406 }, { "epoch": 0.38361393224729057, "grad_norm": 3.087913848033599, "learning_rate": 7.065379116127764e-07, "loss": 0.7677, "step": 1407 }, { "epoch": 0.3838865789653057, "grad_norm": 3.339369102114098, "learning_rate": 7.061355478988724e-07, "loss": 0.6636, "step": 1408 }, { "epoch": 0.38415922568332084, "grad_norm": 1.8417036793871464, "learning_rate": 7.05733023294949e-07, "loss": 0.7235, "step": 1409 }, { "epoch": 0.384431872401336, "grad_norm": 2.1059853761937557, "learning_rate": 7.05330338115179e-07, "loss": 0.6251, "step": 1410 }, { "epoch": 0.3847045191193511, "grad_norm": 2.5081859485685483, "learning_rate": 7.049274926738604e-07, "loss": 0.6329, "step": 1411 }, { "epoch": 0.38497716583736624, "grad_norm": 3.111797239071926, "learning_rate": 7.045244872854162e-07, "loss": 0.7434, "step": 1412 }, { "epoch": 0.3852498125553814, "grad_norm": 7.03488867557555, "learning_rate": 7.041213222643951e-07, "loss": 0.6833, "step": 1413 }, { "epoch": 0.3855224592733965, "grad_norm": 2.036866803090378, "learning_rate": 7.037179979254695e-07, "loss": 0.6961, "step": 1414 }, { "epoch": 0.38579510599141165, "grad_norm": 2.0754786831202527, "learning_rate": 7.033145145834365e-07, "loss": 0.7165, "step": 1415 }, { "epoch": 0.3860677527094268, "grad_norm": 2.0770810426199398, "learning_rate": 7.029108725532171e-07, "loss": 0.7046, "step": 1416 }, { "epoch": 0.3863403994274419, "grad_norm": 1.8460996388276305, "learning_rate": 7.025070721498565e-07, "loss": 0.6328, "step": 1417 }, { "epoch": 0.38661304614545705, "grad_norm": 3.8546864155797764, "learning_rate": 7.021031136885231e-07, "loss": 0.6291, "step": 1418 }, { "epoch": 0.38688569286347213, "grad_norm": 1.909763323611281, "learning_rate": 7.016989974845089e-07, "loss": 0.705, "step": 1419 }, { "epoch": 0.38715833958148727, "grad_norm": 1.9525348538148908, "learning_rate": 7.012947238532292e-07, "loss": 0.5869, "step": 1420 }, { "epoch": 0.3874309862995024, "grad_norm": 4.183912084119327, "learning_rate": 7.008902931102218e-07, "loss": 0.6916, "step": 1421 }, { "epoch": 0.38770363301751753, "grad_norm": 2.359518729778165, "learning_rate": 7.004857055711475e-07, "loss": 0.721, "step": 1422 }, { "epoch": 0.38797627973553267, "grad_norm": 1.8275707694642733, "learning_rate": 7.000809615517891e-07, "loss": 0.7181, "step": 1423 }, { "epoch": 0.3882489264535478, "grad_norm": 2.8626145292597527, "learning_rate": 6.996760613680517e-07, "loss": 0.7053, "step": 1424 }, { "epoch": 0.38852157317156294, "grad_norm": 2.2021031386176992, "learning_rate": 6.992710053359623e-07, "loss": 0.7295, "step": 1425 }, { "epoch": 0.3887942198895781, "grad_norm": 1.853693505245515, "learning_rate": 6.988657937716696e-07, "loss": 0.7229, "step": 1426 }, { "epoch": 0.3890668666075932, "grad_norm": 1.819671949754291, "learning_rate": 6.984604269914436e-07, "loss": 0.6557, "step": 1427 }, { "epoch": 0.38933951332560834, "grad_norm": 4.223434286410685, "learning_rate": 6.980549053116754e-07, "loss": 0.7073, "step": 1428 }, { "epoch": 0.3896121600436235, "grad_norm": 12.568523302092828, "learning_rate": 6.976492290488774e-07, "loss": 0.7345, "step": 1429 }, { "epoch": 0.3898848067616386, "grad_norm": 2.244957574036224, "learning_rate": 6.97243398519682e-07, "loss": 0.7331, "step": 1430 }, { "epoch": 0.39015745347965375, "grad_norm": 4.612716323232397, "learning_rate": 6.968374140408424e-07, "loss": 0.7008, "step": 1431 }, { "epoch": 0.3904301001976689, "grad_norm": 5.256333639943165, "learning_rate": 6.964312759292318e-07, "loss": 0.6417, "step": 1432 }, { "epoch": 0.390702746915684, "grad_norm": 2.6278015129140884, "learning_rate": 6.960249845018439e-07, "loss": 0.6898, "step": 1433 }, { "epoch": 0.39097539363369915, "grad_norm": 2.2882067756586357, "learning_rate": 6.95618540075791e-07, "loss": 0.6832, "step": 1434 }, { "epoch": 0.3912480403517143, "grad_norm": 2.5434542244124154, "learning_rate": 6.952119429683058e-07, "loss": 0.7389, "step": 1435 }, { "epoch": 0.3915206870697294, "grad_norm": 2.2742407093799146, "learning_rate": 6.948051934967394e-07, "loss": 0.6695, "step": 1436 }, { "epoch": 0.39179333378774456, "grad_norm": 2.6095531275212984, "learning_rate": 6.943982919785624e-07, "loss": 0.7649, "step": 1437 }, { "epoch": 0.39206598050575964, "grad_norm": 2.65048748883679, "learning_rate": 6.939912387313636e-07, "loss": 0.6923, "step": 1438 }, { "epoch": 0.39233862722377477, "grad_norm": 5.030457896788535, "learning_rate": 6.935840340728509e-07, "loss": 0.7257, "step": 1439 }, { "epoch": 0.3926112739417899, "grad_norm": 1.95095481567217, "learning_rate": 6.931766783208498e-07, "loss": 0.7191, "step": 1440 }, { "epoch": 0.39288392065980504, "grad_norm": 2.308008035841445, "learning_rate": 6.927691717933038e-07, "loss": 0.7551, "step": 1441 }, { "epoch": 0.3931565673778202, "grad_norm": 4.391158728191475, "learning_rate": 6.92361514808274e-07, "loss": 0.6756, "step": 1442 }, { "epoch": 0.3934292140958353, "grad_norm": 25.10698077398727, "learning_rate": 6.919537076839395e-07, "loss": 0.6728, "step": 1443 }, { "epoch": 0.39370186081385045, "grad_norm": 7.500149834749113, "learning_rate": 6.915457507385957e-07, "loss": 0.5877, "step": 1444 }, { "epoch": 0.3939745075318656, "grad_norm": 2.0169865648103213, "learning_rate": 6.911376442906558e-07, "loss": 0.7383, "step": 1445 }, { "epoch": 0.3942471542498807, "grad_norm": 2.155309461750178, "learning_rate": 6.90729388658649e-07, "loss": 0.6552, "step": 1446 }, { "epoch": 0.39451980096789585, "grad_norm": 2.3225313990847165, "learning_rate": 6.903209841612218e-07, "loss": 0.7331, "step": 1447 }, { "epoch": 0.394792447685911, "grad_norm": 2.1563855382346038, "learning_rate": 6.899124311171358e-07, "loss": 0.708, "step": 1448 }, { "epoch": 0.3950650944039261, "grad_norm": 2.2840386083024393, "learning_rate": 6.895037298452693e-07, "loss": 0.6947, "step": 1449 }, { "epoch": 0.39533774112194126, "grad_norm": 3.8061003312005988, "learning_rate": 6.89094880664616e-07, "loss": 0.7049, "step": 1450 }, { "epoch": 0.3956103878399564, "grad_norm": 1.972432589716315, "learning_rate": 6.886858838942851e-07, "loss": 0.6982, "step": 1451 }, { "epoch": 0.3958830345579715, "grad_norm": 6.176001591562267, "learning_rate": 6.882767398535008e-07, "loss": 0.7276, "step": 1452 }, { "epoch": 0.39615568127598666, "grad_norm": 2.5700126700225487, "learning_rate": 6.878674488616031e-07, "loss": 0.6992, "step": 1453 }, { "epoch": 0.3964283279940018, "grad_norm": 2.55473562253586, "learning_rate": 6.874580112380455e-07, "loss": 0.6906, "step": 1454 }, { "epoch": 0.39670097471201693, "grad_norm": 1.6971352519659948, "learning_rate": 6.870484273023967e-07, "loss": 0.6698, "step": 1455 }, { "epoch": 0.396973621430032, "grad_norm": 2.2819171975562758, "learning_rate": 6.866386973743392e-07, "loss": 0.7166, "step": 1456 }, { "epoch": 0.39724626814804714, "grad_norm": 2.12584862757763, "learning_rate": 6.862288217736698e-07, "loss": 0.6976, "step": 1457 }, { "epoch": 0.3975189148660623, "grad_norm": 2.839894492212317, "learning_rate": 6.858188008202988e-07, "loss": 0.6387, "step": 1458 }, { "epoch": 0.3977915615840774, "grad_norm": 2.8109430104179163, "learning_rate": 6.854086348342501e-07, "loss": 0.6441, "step": 1459 }, { "epoch": 0.39806420830209255, "grad_norm": 8.165575794158453, "learning_rate": 6.849983241356607e-07, "loss": 0.7703, "step": 1460 }, { "epoch": 0.3983368550201077, "grad_norm": 4.463634370374781, "learning_rate": 6.845878690447802e-07, "loss": 0.6333, "step": 1461 }, { "epoch": 0.3986095017381228, "grad_norm": 2.398807598006776, "learning_rate": 6.841772698819715e-07, "loss": 0.7284, "step": 1462 }, { "epoch": 0.39888214845613795, "grad_norm": 4.135120780785779, "learning_rate": 6.837665269677098e-07, "loss": 0.6924, "step": 1463 }, { "epoch": 0.3991547951741531, "grad_norm": 3.152279783674292, "learning_rate": 6.833556406225821e-07, "loss": 0.6079, "step": 1464 }, { "epoch": 0.3994274418921682, "grad_norm": 8.1190912857145, "learning_rate": 6.829446111672878e-07, "loss": 0.6961, "step": 1465 }, { "epoch": 0.39970008861018336, "grad_norm": 2.882466597599829, "learning_rate": 6.825334389226379e-07, "loss": 0.7718, "step": 1466 }, { "epoch": 0.3999727353281985, "grad_norm": 1.9952433762323711, "learning_rate": 6.821221242095546e-07, "loss": 0.7209, "step": 1467 }, { "epoch": 0.4002453820462136, "grad_norm": 2.4498774067830356, "learning_rate": 6.817106673490717e-07, "loss": 0.7981, "step": 1468 }, { "epoch": 0.40051802876422876, "grad_norm": 2.260995219893883, "learning_rate": 6.812990686623334e-07, "loss": 0.7068, "step": 1469 }, { "epoch": 0.4007906754822439, "grad_norm": 2.387821820514311, "learning_rate": 6.808873284705954e-07, "loss": 0.6451, "step": 1470 }, { "epoch": 0.40106332220025903, "grad_norm": 1.87623709137008, "learning_rate": 6.804754470952228e-07, "loss": 0.6366, "step": 1471 }, { "epoch": 0.40133596891827417, "grad_norm": 2.1905269738097846, "learning_rate": 6.800634248576917e-07, "loss": 0.7384, "step": 1472 }, { "epoch": 0.4016086156362893, "grad_norm": 2.0073563897436992, "learning_rate": 6.796512620795881e-07, "loss": 0.6633, "step": 1473 }, { "epoch": 0.40188126235430444, "grad_norm": 2.5686511366720066, "learning_rate": 6.792389590826073e-07, "loss": 0.6917, "step": 1474 }, { "epoch": 0.4021539090723195, "grad_norm": 2.01390110485099, "learning_rate": 6.788265161885542e-07, "loss": 0.6522, "step": 1475 }, { "epoch": 0.40242655579033465, "grad_norm": 5.453860428216372, "learning_rate": 6.784139337193431e-07, "loss": 0.6739, "step": 1476 }, { "epoch": 0.4026992025083498, "grad_norm": 1.67697500890517, "learning_rate": 6.780012119969969e-07, "loss": 0.7049, "step": 1477 }, { "epoch": 0.4029718492263649, "grad_norm": 2.9532719459609957, "learning_rate": 6.775883513436474e-07, "loss": 0.7261, "step": 1478 }, { "epoch": 0.40324449594438005, "grad_norm": 2.848984845903519, "learning_rate": 6.771753520815349e-07, "loss": 0.704, "step": 1479 }, { "epoch": 0.4035171426623952, "grad_norm": 4.740457973089268, "learning_rate": 6.767622145330076e-07, "loss": 0.7088, "step": 1480 }, { "epoch": 0.4037897893804103, "grad_norm": 8.251582881885264, "learning_rate": 6.76348939020522e-07, "loss": 0.6253, "step": 1481 }, { "epoch": 0.40406243609842546, "grad_norm": 5.163583969113851, "learning_rate": 6.759355258666418e-07, "loss": 0.7246, "step": 1482 }, { "epoch": 0.4043350828164406, "grad_norm": 4.705395900890555, "learning_rate": 6.755219753940388e-07, "loss": 0.7653, "step": 1483 }, { "epoch": 0.40460772953445573, "grad_norm": 2.0266859260009826, "learning_rate": 6.751082879254914e-07, "loss": 0.7681, "step": 1484 }, { "epoch": 0.40488037625247086, "grad_norm": 3.5025488966010974, "learning_rate": 6.74694463783885e-07, "loss": 0.6895, "step": 1485 }, { "epoch": 0.405153022970486, "grad_norm": 2.6620711329304476, "learning_rate": 6.742805032922121e-07, "loss": 0.725, "step": 1486 }, { "epoch": 0.40542566968850113, "grad_norm": 2.0983211774050328, "learning_rate": 6.738664067735711e-07, "loss": 0.6586, "step": 1487 }, { "epoch": 0.40569831640651627, "grad_norm": 1.8720325139193084, "learning_rate": 6.73452174551167e-07, "loss": 0.7101, "step": 1488 }, { "epoch": 0.4059709631245314, "grad_norm": 2.234453012940466, "learning_rate": 6.730378069483102e-07, "loss": 0.7128, "step": 1489 }, { "epoch": 0.40624360984254654, "grad_norm": 2.9317703674048965, "learning_rate": 6.726233042884173e-07, "loss": 0.7049, "step": 1490 }, { "epoch": 0.4065162565605617, "grad_norm": 2.3841131347157236, "learning_rate": 6.722086668950099e-07, "loss": 0.6958, "step": 1491 }, { "epoch": 0.4067889032785768, "grad_norm": 2.5122483780083074, "learning_rate": 6.717938950917154e-07, "loss": 0.6663, "step": 1492 }, { "epoch": 0.40706154999659194, "grad_norm": 2.414382940413212, "learning_rate": 6.713789892022651e-07, "loss": 0.7592, "step": 1493 }, { "epoch": 0.407334196714607, "grad_norm": 2.0076935916982954, "learning_rate": 6.709639495504958e-07, "loss": 0.7182, "step": 1494 }, { "epoch": 0.40760684343262216, "grad_norm": 1.993813757484559, "learning_rate": 6.705487764603484e-07, "loss": 0.6926, "step": 1495 }, { "epoch": 0.4078794901506373, "grad_norm": 2.0224249176141114, "learning_rate": 6.701334702558679e-07, "loss": 0.6468, "step": 1496 }, { "epoch": 0.4081521368686524, "grad_norm": 2.3697106209951175, "learning_rate": 6.697180312612033e-07, "loss": 0.6827, "step": 1497 }, { "epoch": 0.40842478358666756, "grad_norm": 4.907668234450365, "learning_rate": 6.693024598006072e-07, "loss": 0.6141, "step": 1498 }, { "epoch": 0.4086974303046827, "grad_norm": 2.389977120835026, "learning_rate": 6.688867561984355e-07, "loss": 0.6779, "step": 1499 }, { "epoch": 0.40897007702269783, "grad_norm": 2.1081908553734054, "learning_rate": 6.684709207791475e-07, "loss": 0.6546, "step": 1500 }, { "epoch": 0.40924272374071297, "grad_norm": 1.9439010616933194, "learning_rate": 6.680549538673051e-07, "loss": 0.6264, "step": 1501 }, { "epoch": 0.4095153704587281, "grad_norm": 2.582888451416799, "learning_rate": 6.676388557875728e-07, "loss": 0.7206, "step": 1502 }, { "epoch": 0.40978801717674324, "grad_norm": 2.480957448998525, "learning_rate": 6.672226268647181e-07, "loss": 0.7246, "step": 1503 }, { "epoch": 0.41006066389475837, "grad_norm": 6.636251953007022, "learning_rate": 6.668062674236095e-07, "loss": 0.6409, "step": 1504 }, { "epoch": 0.4103333106127735, "grad_norm": 1.9455073192281969, "learning_rate": 6.663897777892187e-07, "loss": 0.6709, "step": 1505 }, { "epoch": 0.41060595733078864, "grad_norm": 2.009525267190023, "learning_rate": 6.659731582866178e-07, "loss": 0.6569, "step": 1506 }, { "epoch": 0.4108786040488038, "grad_norm": 3.954064126041284, "learning_rate": 6.655564092409811e-07, "loss": 0.6961, "step": 1507 }, { "epoch": 0.4111512507668189, "grad_norm": 2.8633208807374886, "learning_rate": 6.651395309775836e-07, "loss": 0.6726, "step": 1508 }, { "epoch": 0.41142389748483404, "grad_norm": 4.4560323346462285, "learning_rate": 6.647225238218011e-07, "loss": 0.7175, "step": 1509 }, { "epoch": 0.4116965442028492, "grad_norm": 4.021503798991554, "learning_rate": 6.643053880991105e-07, "loss": 0.7189, "step": 1510 }, { "epoch": 0.4119691909208643, "grad_norm": 1.9895378935883339, "learning_rate": 6.638881241350883e-07, "loss": 0.6824, "step": 1511 }, { "epoch": 0.41224183763887945, "grad_norm": 2.3274994814168934, "learning_rate": 6.63470732255412e-07, "loss": 0.6552, "step": 1512 }, { "epoch": 0.41251448435689453, "grad_norm": 2.449645617300631, "learning_rate": 6.630532127858581e-07, "loss": 0.7009, "step": 1513 }, { "epoch": 0.41278713107490966, "grad_norm": 3.440598907313824, "learning_rate": 6.62635566052303e-07, "loss": 0.6783, "step": 1514 }, { "epoch": 0.4130597777929248, "grad_norm": 1.9268470498832022, "learning_rate": 6.622177923807225e-07, "loss": 0.6301, "step": 1515 }, { "epoch": 0.41333242451093993, "grad_norm": 11.145614479678237, "learning_rate": 6.617998920971916e-07, "loss": 0.756, "step": 1516 }, { "epoch": 0.41360507122895507, "grad_norm": 4.036492442044131, "learning_rate": 6.613818655278837e-07, "loss": 0.6331, "step": 1517 }, { "epoch": 0.4138777179469702, "grad_norm": 2.083090847785536, "learning_rate": 6.609637129990714e-07, "loss": 0.6848, "step": 1518 }, { "epoch": 0.41415036466498534, "grad_norm": 2.085842747152426, "learning_rate": 6.605454348371249e-07, "loss": 0.747, "step": 1519 }, { "epoch": 0.4144230113830005, "grad_norm": 2.568685287035319, "learning_rate": 6.601270313685127e-07, "loss": 0.7331, "step": 1520 }, { "epoch": 0.4146956581010156, "grad_norm": 5.410748374496081, "learning_rate": 6.597085029198015e-07, "loss": 0.7288, "step": 1521 }, { "epoch": 0.41496830481903074, "grad_norm": 2.110660967193516, "learning_rate": 6.592898498176552e-07, "loss": 0.6667, "step": 1522 }, { "epoch": 0.4152409515370459, "grad_norm": 3.6270072399856024, "learning_rate": 6.588710723888349e-07, "loss": 0.6823, "step": 1523 }, { "epoch": 0.415513598255061, "grad_norm": 2.8862683986949427, "learning_rate": 6.584521709601988e-07, "loss": 0.7129, "step": 1524 }, { "epoch": 0.41578624497307615, "grad_norm": 2.1671337290548163, "learning_rate": 6.580331458587021e-07, "loss": 0.6084, "step": 1525 }, { "epoch": 0.4160588916910913, "grad_norm": 2.511907021643587, "learning_rate": 6.576139974113965e-07, "loss": 0.7323, "step": 1526 }, { "epoch": 0.4163315384091064, "grad_norm": 1.7579526938390793, "learning_rate": 6.571947259454297e-07, "loss": 0.6822, "step": 1527 }, { "epoch": 0.41660418512712155, "grad_norm": 2.094486571612923, "learning_rate": 6.567753317880456e-07, "loss": 0.6768, "step": 1528 }, { "epoch": 0.4168768318451367, "grad_norm": 1.8933881796934522, "learning_rate": 6.563558152665837e-07, "loss": 0.634, "step": 1529 }, { "epoch": 0.4171494785631518, "grad_norm": 1.9262774143599954, "learning_rate": 6.559361767084792e-07, "loss": 0.734, "step": 1530 }, { "epoch": 0.4174221252811669, "grad_norm": 3.135113680359974, "learning_rate": 6.555164164412625e-07, "loss": 0.6996, "step": 1531 }, { "epoch": 0.41769477199918204, "grad_norm": 2.2520557227613596, "learning_rate": 6.550965347925592e-07, "loss": 0.7162, "step": 1532 }, { "epoch": 0.41796741871719717, "grad_norm": 1.851726004588649, "learning_rate": 6.546765320900891e-07, "loss": 0.6346, "step": 1533 }, { "epoch": 0.4182400654352123, "grad_norm": 4.099023367573205, "learning_rate": 6.54256408661667e-07, "loss": 0.7263, "step": 1534 }, { "epoch": 0.41851271215322744, "grad_norm": 6.867055278032597, "learning_rate": 6.538361648352017e-07, "loss": 0.6875, "step": 1535 }, { "epoch": 0.4187853588712426, "grad_norm": 3.386851985410983, "learning_rate": 6.534158009386959e-07, "loss": 0.7108, "step": 1536 }, { "epoch": 0.4190580055892577, "grad_norm": 2.0869532553348096, "learning_rate": 6.529953173002461e-07, "loss": 0.7501, "step": 1537 }, { "epoch": 0.41933065230727284, "grad_norm": 1.8275903266895759, "learning_rate": 6.525747142480425e-07, "loss": 0.6617, "step": 1538 }, { "epoch": 0.419603299025288, "grad_norm": 3.224958160745562, "learning_rate": 6.52153992110368e-07, "loss": 0.6711, "step": 1539 }, { "epoch": 0.4198759457433031, "grad_norm": 2.916491628158226, "learning_rate": 6.51733151215599e-07, "loss": 0.7372, "step": 1540 }, { "epoch": 0.42014859246131825, "grad_norm": 2.026772130516872, "learning_rate": 6.513121918922042e-07, "loss": 0.6536, "step": 1541 }, { "epoch": 0.4204212391793334, "grad_norm": 2.0165059433576267, "learning_rate": 6.508911144687448e-07, "loss": 0.7595, "step": 1542 }, { "epoch": 0.4206938858973485, "grad_norm": 4.497274765923403, "learning_rate": 6.504699192738743e-07, "loss": 0.6677, "step": 1543 }, { "epoch": 0.42096653261536365, "grad_norm": 2.5445003967329187, "learning_rate": 6.50048606636338e-07, "loss": 0.6725, "step": 1544 }, { "epoch": 0.4212391793333788, "grad_norm": 1.9269605021995562, "learning_rate": 6.49627176884973e-07, "loss": 0.716, "step": 1545 }, { "epoch": 0.4215118260513939, "grad_norm": 2.307345258501065, "learning_rate": 6.492056303487076e-07, "loss": 0.6738, "step": 1546 }, { "epoch": 0.42178447276940906, "grad_norm": 3.5599849743302627, "learning_rate": 6.487839673565615e-07, "loss": 0.6099, "step": 1547 }, { "epoch": 0.4220571194874242, "grad_norm": 1.7782318996905646, "learning_rate": 6.483621882376451e-07, "loss": 0.6573, "step": 1548 }, { "epoch": 0.4223297662054393, "grad_norm": 2.18165759416162, "learning_rate": 6.479402933211596e-07, "loss": 0.7446, "step": 1549 }, { "epoch": 0.4226024129234544, "grad_norm": 4.729533966483951, "learning_rate": 6.475182829363962e-07, "loss": 0.7084, "step": 1550 }, { "epoch": 0.42287505964146954, "grad_norm": 2.216770539416137, "learning_rate": 6.470961574127369e-07, "loss": 0.7134, "step": 1551 }, { "epoch": 0.4231477063594847, "grad_norm": 1.9824025173467352, "learning_rate": 6.466739170796529e-07, "loss": 0.7092, "step": 1552 }, { "epoch": 0.4234203530774998, "grad_norm": 2.18568427947801, "learning_rate": 6.462515622667055e-07, "loss": 0.668, "step": 1553 }, { "epoch": 0.42369299979551495, "grad_norm": 105.79292402502323, "learning_rate": 6.45829093303545e-07, "loss": 0.6076, "step": 1554 }, { "epoch": 0.4239656465135301, "grad_norm": 3.853459508751935, "learning_rate": 6.454065105199109e-07, "loss": 0.6529, "step": 1555 }, { "epoch": 0.4242382932315452, "grad_norm": 9.604091948344802, "learning_rate": 6.449838142456316e-07, "loss": 0.658, "step": 1556 }, { "epoch": 0.42451093994956035, "grad_norm": 2.270967367500766, "learning_rate": 6.44561004810624e-07, "loss": 0.6992, "step": 1557 }, { "epoch": 0.4247835866675755, "grad_norm": 18.63765032992045, "learning_rate": 6.441380825448936e-07, "loss": 0.6839, "step": 1558 }, { "epoch": 0.4250562333855906, "grad_norm": 2.2388043009949685, "learning_rate": 6.437150477785336e-07, "loss": 0.6686, "step": 1559 }, { "epoch": 0.42532888010360576, "grad_norm": 2.4062079652897532, "learning_rate": 6.432919008417252e-07, "loss": 0.6742, "step": 1560 }, { "epoch": 0.4256015268216209, "grad_norm": 3.151327462856479, "learning_rate": 6.42868642064737e-07, "loss": 0.6654, "step": 1561 }, { "epoch": 0.425874173539636, "grad_norm": 2.5246951554200856, "learning_rate": 6.424452717779252e-07, "loss": 0.737, "step": 1562 }, { "epoch": 0.42614682025765116, "grad_norm": 2.9335361754860085, "learning_rate": 6.420217903117324e-07, "loss": 0.6886, "step": 1563 }, { "epoch": 0.4264194669756663, "grad_norm": 1.9868929736318863, "learning_rate": 6.415981979966887e-07, "loss": 0.7036, "step": 1564 }, { "epoch": 0.42669211369368143, "grad_norm": 2.331932452159086, "learning_rate": 6.411744951634105e-07, "loss": 0.684, "step": 1565 }, { "epoch": 0.42696476041169656, "grad_norm": 6.5002656849401355, "learning_rate": 6.407506821426003e-07, "loss": 0.6391, "step": 1566 }, { "epoch": 0.4272374071297117, "grad_norm": 2.0188792195047087, "learning_rate": 6.403267592650466e-07, "loss": 0.6229, "step": 1567 }, { "epoch": 0.42751005384772683, "grad_norm": 1.9246373408891273, "learning_rate": 6.399027268616238e-07, "loss": 0.7027, "step": 1568 }, { "epoch": 0.4277827005657419, "grad_norm": 2.236649905193952, "learning_rate": 6.394785852632914e-07, "loss": 0.6917, "step": 1569 }, { "epoch": 0.42805534728375705, "grad_norm": 2.1318543846442175, "learning_rate": 6.390543348010948e-07, "loss": 0.7423, "step": 1570 }, { "epoch": 0.4283279940017722, "grad_norm": 28.615512255506626, "learning_rate": 6.386299758061641e-07, "loss": 0.6939, "step": 1571 }, { "epoch": 0.4286006407197873, "grad_norm": 1.9028144886913056, "learning_rate": 6.382055086097138e-07, "loss": 0.7242, "step": 1572 }, { "epoch": 0.42887328743780245, "grad_norm": 1.7867413640876615, "learning_rate": 6.377809335430431e-07, "loss": 0.7233, "step": 1573 }, { "epoch": 0.4291459341558176, "grad_norm": 2.433341730530862, "learning_rate": 6.373562509375353e-07, "loss": 0.707, "step": 1574 }, { "epoch": 0.4294185808738327, "grad_norm": 4.531649267230234, "learning_rate": 6.369314611246578e-07, "loss": 0.6371, "step": 1575 }, { "epoch": 0.42969122759184786, "grad_norm": 2.024091431244555, "learning_rate": 6.365065644359613e-07, "loss": 0.6863, "step": 1576 }, { "epoch": 0.429963874309863, "grad_norm": 1.9962838077305822, "learning_rate": 6.360815612030808e-07, "loss": 0.6574, "step": 1577 }, { "epoch": 0.4302365210278781, "grad_norm": 2.708681966261342, "learning_rate": 6.356564517577332e-07, "loss": 0.7177, "step": 1578 }, { "epoch": 0.43050916774589326, "grad_norm": 3.7818877889257925, "learning_rate": 6.352312364317192e-07, "loss": 0.7395, "step": 1579 }, { "epoch": 0.4307818144639084, "grad_norm": 3.856968139575147, "learning_rate": 6.348059155569221e-07, "loss": 0.6706, "step": 1580 }, { "epoch": 0.43105446118192353, "grad_norm": 2.5968488261978906, "learning_rate": 6.343804894653071e-07, "loss": 0.714, "step": 1581 }, { "epoch": 0.43132710789993867, "grad_norm": 2.4589329643462836, "learning_rate": 6.339549584889218e-07, "loss": 0.6139, "step": 1582 }, { "epoch": 0.4315997546179538, "grad_norm": 4.756878884553013, "learning_rate": 6.335293229598957e-07, "loss": 0.6471, "step": 1583 }, { "epoch": 0.43187240133596894, "grad_norm": 2.6426259214294836, "learning_rate": 6.331035832104399e-07, "loss": 0.7074, "step": 1584 }, { "epoch": 0.43214504805398407, "grad_norm": 2.2815351230249843, "learning_rate": 6.326777395728468e-07, "loss": 0.6794, "step": 1585 }, { "epoch": 0.4324176947719992, "grad_norm": 4.852409275713697, "learning_rate": 6.322517923794899e-07, "loss": 0.6465, "step": 1586 }, { "epoch": 0.43269034149001434, "grad_norm": 2.206929492768177, "learning_rate": 6.318257419628234e-07, "loss": 0.7022, "step": 1587 }, { "epoch": 0.4329629882080294, "grad_norm": 1.939734952409189, "learning_rate": 6.313995886553824e-07, "loss": 0.7382, "step": 1588 }, { "epoch": 0.43323563492604455, "grad_norm": 6.266430251749675, "learning_rate": 6.309733327897819e-07, "loss": 0.7012, "step": 1589 }, { "epoch": 0.4335082816440597, "grad_norm": 2.446423998723795, "learning_rate": 6.305469746987171e-07, "loss": 0.6955, "step": 1590 }, { "epoch": 0.4337809283620748, "grad_norm": 2.4153799657106076, "learning_rate": 6.301205147149631e-07, "loss": 0.7562, "step": 1591 }, { "epoch": 0.43405357508008996, "grad_norm": 2.687782905724345, "learning_rate": 6.296939531713746e-07, "loss": 0.7021, "step": 1592 }, { "epoch": 0.4343262217981051, "grad_norm": 3.1218688602652085, "learning_rate": 6.292672904008852e-07, "loss": 0.6617, "step": 1593 }, { "epoch": 0.43459886851612023, "grad_norm": 3.501431108781358, "learning_rate": 6.288405267365079e-07, "loss": 0.6232, "step": 1594 }, { "epoch": 0.43487151523413536, "grad_norm": 3.2097411498961517, "learning_rate": 6.28413662511334e-07, "loss": 0.6489, "step": 1595 }, { "epoch": 0.4351441619521505, "grad_norm": 2.1184830758438267, "learning_rate": 6.279866980585338e-07, "loss": 0.622, "step": 1596 }, { "epoch": 0.43541680867016563, "grad_norm": 2.1227444126293684, "learning_rate": 6.275596337113556e-07, "loss": 0.7587, "step": 1597 }, { "epoch": 0.43568945538818077, "grad_norm": 2.7435186596511363, "learning_rate": 6.271324698031256e-07, "loss": 0.6759, "step": 1598 }, { "epoch": 0.4359621021061959, "grad_norm": 3.413568911241621, "learning_rate": 6.267052066672476e-07, "loss": 0.6441, "step": 1599 }, { "epoch": 0.43623474882421104, "grad_norm": 4.10080383057368, "learning_rate": 6.262778446372033e-07, "loss": 0.6905, "step": 1600 }, { "epoch": 0.4365073955422262, "grad_norm": 2.293278470264202, "learning_rate": 6.258503840465513e-07, "loss": 0.698, "step": 1601 }, { "epoch": 0.4367800422602413, "grad_norm": 3.1489128822638754, "learning_rate": 6.254228252289268e-07, "loss": 0.7507, "step": 1602 }, { "epoch": 0.43705268897825644, "grad_norm": 2.1713136800028288, "learning_rate": 6.249951685180423e-07, "loss": 0.7227, "step": 1603 }, { "epoch": 0.4373253356962716, "grad_norm": 2.4831745398489953, "learning_rate": 6.245674142476862e-07, "loss": 0.7086, "step": 1604 }, { "epoch": 0.4375979824142867, "grad_norm": 3.634072788471092, "learning_rate": 6.241395627517236e-07, "loss": 0.7336, "step": 1605 }, { "epoch": 0.43787062913230185, "grad_norm": 2.4943117279017586, "learning_rate": 6.237116143640947e-07, "loss": 0.6485, "step": 1606 }, { "epoch": 0.4381432758503169, "grad_norm": 5.366198093771849, "learning_rate": 6.23283569418816e-07, "loss": 0.615, "step": 1607 }, { "epoch": 0.43841592256833206, "grad_norm": 2.3972460864759926, "learning_rate": 6.22855428249979e-07, "loss": 0.6224, "step": 1608 }, { "epoch": 0.4386885692863472, "grad_norm": 2.9122070306412344, "learning_rate": 6.224271911917507e-07, "loss": 0.6494, "step": 1609 }, { "epoch": 0.43896121600436233, "grad_norm": 3.069548433642841, "learning_rate": 6.219988585783722e-07, "loss": 0.6271, "step": 1610 }, { "epoch": 0.43923386272237747, "grad_norm": 2.9011294227415627, "learning_rate": 6.2157043074416e-07, "loss": 0.6723, "step": 1611 }, { "epoch": 0.4395065094403926, "grad_norm": 4.775287742199215, "learning_rate": 6.211419080235046e-07, "loss": 0.6853, "step": 1612 }, { "epoch": 0.43977915615840774, "grad_norm": 2.5771826454678544, "learning_rate": 6.207132907508704e-07, "loss": 0.7219, "step": 1613 }, { "epoch": 0.44005180287642287, "grad_norm": 3.2345839315501888, "learning_rate": 6.202845792607957e-07, "loss": 0.6468, "step": 1614 }, { "epoch": 0.440324449594438, "grad_norm": 2.501093614272012, "learning_rate": 6.198557738878923e-07, "loss": 0.7059, "step": 1615 }, { "epoch": 0.44059709631245314, "grad_norm": 2.3662067258948394, "learning_rate": 6.194268749668457e-07, "loss": 0.7308, "step": 1616 }, { "epoch": 0.4408697430304683, "grad_norm": 2.0462363205435223, "learning_rate": 6.189978828324135e-07, "loss": 0.7182, "step": 1617 }, { "epoch": 0.4411423897484834, "grad_norm": 2.570486107200012, "learning_rate": 6.185687978194271e-07, "loss": 0.7985, "step": 1618 }, { "epoch": 0.44141503646649854, "grad_norm": 3.360004244990933, "learning_rate": 6.181396202627896e-07, "loss": 0.7367, "step": 1619 }, { "epoch": 0.4416876831845137, "grad_norm": 2.3690803786702266, "learning_rate": 6.177103504974768e-07, "loss": 0.6585, "step": 1620 }, { "epoch": 0.4419603299025288, "grad_norm": 3.521892840542611, "learning_rate": 6.172809888585364e-07, "loss": 0.6838, "step": 1621 }, { "epoch": 0.44223297662054395, "grad_norm": 2.383543025695426, "learning_rate": 6.168515356810874e-07, "loss": 0.6046, "step": 1622 }, { "epoch": 0.4425056233385591, "grad_norm": 2.1396648246417174, "learning_rate": 6.164219913003207e-07, "loss": 0.7276, "step": 1623 }, { "epoch": 0.4427782700565742, "grad_norm": 4.4024800395853045, "learning_rate": 6.159923560514982e-07, "loss": 0.6522, "step": 1624 }, { "epoch": 0.4430509167745893, "grad_norm": 3.6946890423148635, "learning_rate": 6.15562630269953e-07, "loss": 0.6724, "step": 1625 }, { "epoch": 0.44332356349260443, "grad_norm": 2.3605632085107673, "learning_rate": 6.151328142910883e-07, "loss": 0.6565, "step": 1626 }, { "epoch": 0.44359621021061957, "grad_norm": 5.144014698802189, "learning_rate": 6.147029084503783e-07, "loss": 0.7194, "step": 1627 }, { "epoch": 0.4438688569286347, "grad_norm": 5.479230533384474, "learning_rate": 6.142729130833668e-07, "loss": 0.6533, "step": 1628 }, { "epoch": 0.44414150364664984, "grad_norm": 2.405480316046586, "learning_rate": 6.138428285256678e-07, "loss": 0.7405, "step": 1629 }, { "epoch": 0.444414150364665, "grad_norm": 3.8226779994643705, "learning_rate": 6.134126551129651e-07, "loss": 0.6796, "step": 1630 }, { "epoch": 0.4446867970826801, "grad_norm": 4.86882362128706, "learning_rate": 6.129823931810113e-07, "loss": 0.6911, "step": 1631 }, { "epoch": 0.44495944380069524, "grad_norm": 3.0714029873365662, "learning_rate": 6.125520430656286e-07, "loss": 0.7282, "step": 1632 }, { "epoch": 0.4452320905187104, "grad_norm": 2.9576532352557563, "learning_rate": 6.121216051027077e-07, "loss": 0.6967, "step": 1633 }, { "epoch": 0.4455047372367255, "grad_norm": 3.0379879949468447, "learning_rate": 6.11691079628208e-07, "loss": 0.7064, "step": 1634 }, { "epoch": 0.44577738395474065, "grad_norm": 8.396178594539926, "learning_rate": 6.112604669781572e-07, "loss": 0.6644, "step": 1635 }, { "epoch": 0.4460500306727558, "grad_norm": 2.690575591789236, "learning_rate": 6.10829767488651e-07, "loss": 0.6233, "step": 1636 }, { "epoch": 0.4463226773907709, "grad_norm": 7.999738533998336, "learning_rate": 6.10398981495853e-07, "loss": 0.6203, "step": 1637 }, { "epoch": 0.44659532410878605, "grad_norm": 5.807139518677621, "learning_rate": 6.099681093359943e-07, "loss": 0.6856, "step": 1638 }, { "epoch": 0.4468679708268012, "grad_norm": 1.899080809779053, "learning_rate": 6.09537151345373e-07, "loss": 0.7275, "step": 1639 }, { "epoch": 0.4471406175448163, "grad_norm": 2.331723076203189, "learning_rate": 6.091061078603543e-07, "loss": 0.742, "step": 1640 }, { "epoch": 0.44741326426283146, "grad_norm": 5.787164949207161, "learning_rate": 6.086749792173705e-07, "loss": 0.73, "step": 1641 }, { "epoch": 0.4476859109808466, "grad_norm": 3.047797164444414, "learning_rate": 6.0824376575292e-07, "loss": 0.7486, "step": 1642 }, { "epoch": 0.4479585576988617, "grad_norm": 7.064823039724488, "learning_rate": 6.07812467803567e-07, "loss": 0.6449, "step": 1643 }, { "epoch": 0.4482312044168768, "grad_norm": 1.8460190861926393, "learning_rate": 6.073810857059427e-07, "loss": 0.7139, "step": 1644 }, { "epoch": 0.44850385113489194, "grad_norm": 2.083095662970879, "learning_rate": 6.069496197967432e-07, "loss": 0.6717, "step": 1645 }, { "epoch": 0.4487764978529071, "grad_norm": 2.5474909466136797, "learning_rate": 6.065180704127302e-07, "loss": 0.6509, "step": 1646 }, { "epoch": 0.4490491445709222, "grad_norm": 2.5557052139742953, "learning_rate": 6.060864378907305e-07, "loss": 0.7482, "step": 1647 }, { "epoch": 0.44932179128893734, "grad_norm": 2.399737613166449, "learning_rate": 6.056547225676358e-07, "loss": 0.689, "step": 1648 }, { "epoch": 0.4495944380069525, "grad_norm": 3.3134638700623715, "learning_rate": 6.052229247804025e-07, "loss": 0.7346, "step": 1649 }, { "epoch": 0.4498670847249676, "grad_norm": 1.9573741848245156, "learning_rate": 6.047910448660515e-07, "loss": 0.6097, "step": 1650 }, { "epoch": 0.45013973144298275, "grad_norm": 4.830547156334766, "learning_rate": 6.043590831616676e-07, "loss": 0.7066, "step": 1651 }, { "epoch": 0.4504123781609979, "grad_norm": 9.215974025365513, "learning_rate": 6.039270400043993e-07, "loss": 0.7364, "step": 1652 }, { "epoch": 0.450685024879013, "grad_norm": 2.8932440410403286, "learning_rate": 6.034949157314589e-07, "loss": 0.6957, "step": 1653 }, { "epoch": 0.45095767159702815, "grad_norm": 3.076206133496519, "learning_rate": 6.030627106801222e-07, "loss": 0.7176, "step": 1654 }, { "epoch": 0.4512303183150433, "grad_norm": 4.220364201596043, "learning_rate": 6.026304251877276e-07, "loss": 0.7053, "step": 1655 }, { "epoch": 0.4515029650330584, "grad_norm": 2.448748485301807, "learning_rate": 6.021980595916762e-07, "loss": 0.6538, "step": 1656 }, { "epoch": 0.45177561175107356, "grad_norm": 3.0254337596421395, "learning_rate": 6.017656142294326e-07, "loss": 0.7459, "step": 1657 }, { "epoch": 0.4520482584690887, "grad_norm": 2.2913484116237206, "learning_rate": 6.013330894385225e-07, "loss": 0.7095, "step": 1658 }, { "epoch": 0.45232090518710383, "grad_norm": 1.8829044646762139, "learning_rate": 6.009004855565343e-07, "loss": 0.659, "step": 1659 }, { "epoch": 0.45259355190511896, "grad_norm": 2.9421905903230976, "learning_rate": 6.004678029211176e-07, "loss": 0.6869, "step": 1660 }, { "epoch": 0.4528661986231341, "grad_norm": 2.1848901075144966, "learning_rate": 6.000350418699841e-07, "loss": 0.7007, "step": 1661 }, { "epoch": 0.45313884534114923, "grad_norm": 4.460368910692839, "learning_rate": 5.996022027409062e-07, "loss": 0.7136, "step": 1662 }, { "epoch": 0.4534114920591643, "grad_norm": 2.8058104887331865, "learning_rate": 5.991692858717174e-07, "loss": 0.7038, "step": 1663 }, { "epoch": 0.45368413877717945, "grad_norm": 4.930255786442571, "learning_rate": 5.987362916003119e-07, "loss": 0.698, "step": 1664 }, { "epoch": 0.4539567854951946, "grad_norm": 2.036764622165154, "learning_rate": 5.983032202646443e-07, "loss": 0.7276, "step": 1665 }, { "epoch": 0.4542294322132097, "grad_norm": 2.378639776102847, "learning_rate": 5.978700722027295e-07, "loss": 0.6913, "step": 1666 }, { "epoch": 0.45450207893122485, "grad_norm": 2.3628100186399927, "learning_rate": 5.974368477526419e-07, "loss": 0.7341, "step": 1667 }, { "epoch": 0.45477472564924, "grad_norm": 2.612128552281264, "learning_rate": 5.970035472525158e-07, "loss": 0.7079, "step": 1668 }, { "epoch": 0.4550473723672551, "grad_norm": 2.091401906104537, "learning_rate": 5.965701710405449e-07, "loss": 0.7749, "step": 1669 }, { "epoch": 0.45532001908527026, "grad_norm": 1.8889011453894085, "learning_rate": 5.96136719454982e-07, "loss": 0.6699, "step": 1670 }, { "epoch": 0.4555926658032854, "grad_norm": 2.7804063925395424, "learning_rate": 5.957031928341383e-07, "loss": 0.709, "step": 1671 }, { "epoch": 0.4558653125213005, "grad_norm": 3.2521183103274884, "learning_rate": 5.952695915163843e-07, "loss": 0.7083, "step": 1672 }, { "epoch": 0.45613795923931566, "grad_norm": 4.418281280683081, "learning_rate": 5.948359158401481e-07, "loss": 0.7303, "step": 1673 }, { "epoch": 0.4564106059573308, "grad_norm": 1.9844795727560232, "learning_rate": 5.944021661439164e-07, "loss": 0.7232, "step": 1674 }, { "epoch": 0.45668325267534593, "grad_norm": 2.050824667285427, "learning_rate": 5.939683427662331e-07, "loss": 0.6631, "step": 1675 }, { "epoch": 0.45695589939336106, "grad_norm": 1.8069591249504018, "learning_rate": 5.935344460457003e-07, "loss": 0.6177, "step": 1676 }, { "epoch": 0.4572285461113762, "grad_norm": 2.366655032781222, "learning_rate": 5.931004763209767e-07, "loss": 0.7198, "step": 1677 }, { "epoch": 0.45750119282939133, "grad_norm": 2.1416315626605646, "learning_rate": 5.926664339307784e-07, "loss": 0.6673, "step": 1678 }, { "epoch": 0.45777383954740647, "grad_norm": 2.1067930299105995, "learning_rate": 5.92232319213878e-07, "loss": 0.7146, "step": 1679 }, { "epoch": 0.4580464862654216, "grad_norm": 79.680324024072, "learning_rate": 5.917981325091045e-07, "loss": 0.6705, "step": 1680 }, { "epoch": 0.45831913298343674, "grad_norm": 2.1749287408750186, "learning_rate": 5.913638741553434e-07, "loss": 0.6551, "step": 1681 }, { "epoch": 0.4585917797014518, "grad_norm": 2.3513931447796064, "learning_rate": 5.909295444915356e-07, "loss": 0.6889, "step": 1682 }, { "epoch": 0.45886442641946695, "grad_norm": 2.057667628628829, "learning_rate": 5.904951438566784e-07, "loss": 0.6668, "step": 1683 }, { "epoch": 0.4591370731374821, "grad_norm": 2.1688392166904755, "learning_rate": 5.900606725898238e-07, "loss": 0.6072, "step": 1684 }, { "epoch": 0.4594097198554972, "grad_norm": 2.4197097406916326, "learning_rate": 5.896261310300794e-07, "loss": 0.7244, "step": 1685 }, { "epoch": 0.45968236657351236, "grad_norm": 15.825505420297432, "learning_rate": 5.891915195166074e-07, "loss": 0.685, "step": 1686 }, { "epoch": 0.4599550132915275, "grad_norm": 8.435539845527405, "learning_rate": 5.887568383886245e-07, "loss": 0.7074, "step": 1687 }, { "epoch": 0.4602276600095426, "grad_norm": 2.5765667541901847, "learning_rate": 5.88322087985402e-07, "loss": 0.6909, "step": 1688 }, { "epoch": 0.46050030672755776, "grad_norm": 5.905121732847541, "learning_rate": 5.878872686462653e-07, "loss": 0.6869, "step": 1689 }, { "epoch": 0.4607729534455729, "grad_norm": 3.6524720646474185, "learning_rate": 5.874523807105933e-07, "loss": 0.6705, "step": 1690 }, { "epoch": 0.46104560016358803, "grad_norm": 2.5505885120916996, "learning_rate": 5.870174245178186e-07, "loss": 0.7525, "step": 1691 }, { "epoch": 0.46131824688160317, "grad_norm": 4.473608443172918, "learning_rate": 5.865824004074273e-07, "loss": 0.6607, "step": 1692 }, { "epoch": 0.4615908935996183, "grad_norm": 2.43716648181281, "learning_rate": 5.861473087189584e-07, "loss": 0.695, "step": 1693 }, { "epoch": 0.46186354031763344, "grad_norm": 2.269448796244228, "learning_rate": 5.857121497920031e-07, "loss": 0.7406, "step": 1694 }, { "epoch": 0.46213618703564857, "grad_norm": 3.230754166488089, "learning_rate": 5.852769239662057e-07, "loss": 0.6603, "step": 1695 }, { "epoch": 0.4624088337536637, "grad_norm": 2.575499622985877, "learning_rate": 5.848416315812627e-07, "loss": 0.6916, "step": 1696 }, { "epoch": 0.46268148047167884, "grad_norm": 3.313499188817813, "learning_rate": 5.844062729769223e-07, "loss": 0.7355, "step": 1697 }, { "epoch": 0.462954127189694, "grad_norm": 1.722868079166655, "learning_rate": 5.839708484929846e-07, "loss": 0.672, "step": 1698 }, { "epoch": 0.4632267739077091, "grad_norm": 2.5834647866168496, "learning_rate": 5.835353584693008e-07, "loss": 0.6281, "step": 1699 }, { "epoch": 0.46349942062572425, "grad_norm": 7.073457267026023, "learning_rate": 5.830998032457735e-07, "loss": 0.6789, "step": 1700 }, { "epoch": 0.4637720673437393, "grad_norm": 2.90437042948393, "learning_rate": 5.826641831623561e-07, "loss": 0.6458, "step": 1701 }, { "epoch": 0.46404471406175446, "grad_norm": 2.8405269778437514, "learning_rate": 5.822284985590527e-07, "loss": 0.7067, "step": 1702 }, { "epoch": 0.4643173607797696, "grad_norm": 2.343015680348452, "learning_rate": 5.817927497759179e-07, "loss": 0.7576, "step": 1703 }, { "epoch": 0.46459000749778473, "grad_norm": 3.4323321803753495, "learning_rate": 5.813569371530559e-07, "loss": 0.7024, "step": 1704 }, { "epoch": 0.46486265421579986, "grad_norm": 4.588367171885468, "learning_rate": 5.809210610306212e-07, "loss": 0.7725, "step": 1705 }, { "epoch": 0.465135300933815, "grad_norm": 2.3253295965648912, "learning_rate": 5.804851217488178e-07, "loss": 0.7002, "step": 1706 }, { "epoch": 0.46540794765183013, "grad_norm": 2.9138913344825426, "learning_rate": 5.800491196478988e-07, "loss": 0.663, "step": 1707 }, { "epoch": 0.46568059436984527, "grad_norm": 1.9183731169856948, "learning_rate": 5.796130550681664e-07, "loss": 0.7061, "step": 1708 }, { "epoch": 0.4659532410878604, "grad_norm": 2.6364286637303245, "learning_rate": 5.791769283499715e-07, "loss": 0.7117, "step": 1709 }, { "epoch": 0.46622588780587554, "grad_norm": 2.3975618038302016, "learning_rate": 5.787407398337138e-07, "loss": 0.6674, "step": 1710 }, { "epoch": 0.4664985345238907, "grad_norm": 2.5376719451418994, "learning_rate": 5.783044898598411e-07, "loss": 0.6566, "step": 1711 }, { "epoch": 0.4667711812419058, "grad_norm": 3.012028348154745, "learning_rate": 5.778681787688487e-07, "loss": 0.689, "step": 1712 }, { "epoch": 0.46704382795992094, "grad_norm": 3.653613156560198, "learning_rate": 5.774318069012805e-07, "loss": 0.6488, "step": 1713 }, { "epoch": 0.4673164746779361, "grad_norm": 1.9575240893357488, "learning_rate": 5.769953745977269e-07, "loss": 0.6823, "step": 1714 }, { "epoch": 0.4675891213959512, "grad_norm": 4.676535725824748, "learning_rate": 5.76558882198826e-07, "loss": 0.7307, "step": 1715 }, { "epoch": 0.46786176811396635, "grad_norm": 1.9173988613971675, "learning_rate": 5.761223300452629e-07, "loss": 0.6454, "step": 1716 }, { "epoch": 0.4681344148319815, "grad_norm": 2.8093229910641244, "learning_rate": 5.756857184777691e-07, "loss": 0.7505, "step": 1717 }, { "epoch": 0.4684070615499966, "grad_norm": 2.673052967705499, "learning_rate": 5.752490478371224e-07, "loss": 0.6723, "step": 1718 }, { "epoch": 0.4686797082680117, "grad_norm": 2.0939245962328443, "learning_rate": 5.748123184641469e-07, "loss": 0.7049, "step": 1719 }, { "epoch": 0.46895235498602683, "grad_norm": 3.5652810229822953, "learning_rate": 5.743755306997124e-07, "loss": 0.6899, "step": 1720 }, { "epoch": 0.46922500170404197, "grad_norm": 4.260458196884242, "learning_rate": 5.739386848847345e-07, "loss": 0.6785, "step": 1721 }, { "epoch": 0.4694976484220571, "grad_norm": 3.0743723606129816, "learning_rate": 5.735017813601738e-07, "loss": 0.6549, "step": 1722 }, { "epoch": 0.46977029514007224, "grad_norm": 2.339564868677797, "learning_rate": 5.730648204670362e-07, "loss": 0.6932, "step": 1723 }, { "epoch": 0.47004294185808737, "grad_norm": 2.6309317282483735, "learning_rate": 5.726278025463721e-07, "loss": 0.7227, "step": 1724 }, { "epoch": 0.4703155885761025, "grad_norm": 2.59378473486541, "learning_rate": 5.721907279392769e-07, "loss": 0.6094, "step": 1725 }, { "epoch": 0.47058823529411764, "grad_norm": 2.39167568151341, "learning_rate": 5.717535969868896e-07, "loss": 0.6588, "step": 1726 }, { "epoch": 0.4708608820121328, "grad_norm": 2.0118254563139155, "learning_rate": 5.713164100303936e-07, "loss": 0.776, "step": 1727 }, { "epoch": 0.4711335287301479, "grad_norm": 1.9070026867119747, "learning_rate": 5.70879167411016e-07, "loss": 0.6752, "step": 1728 }, { "epoch": 0.47140617544816305, "grad_norm": 3.8053466225499224, "learning_rate": 5.704418694700274e-07, "loss": 0.7295, "step": 1729 }, { "epoch": 0.4716788221661782, "grad_norm": 3.2735716201532616, "learning_rate": 5.700045165487409e-07, "loss": 0.6935, "step": 1730 }, { "epoch": 0.4719514688841933, "grad_norm": 3.047798725722004, "learning_rate": 5.695671089885136e-07, "loss": 0.6436, "step": 1731 }, { "epoch": 0.47222411560220845, "grad_norm": 11.336521618995727, "learning_rate": 5.691296471307447e-07, "loss": 0.7245, "step": 1732 }, { "epoch": 0.4724967623202236, "grad_norm": 4.005058943190169, "learning_rate": 5.686921313168754e-07, "loss": 0.7565, "step": 1733 }, { "epoch": 0.4727694090382387, "grad_norm": 3.6357880688603195, "learning_rate": 5.682545618883896e-07, "loss": 0.7207, "step": 1734 }, { "epoch": 0.47304205575625385, "grad_norm": 3.0230146464933374, "learning_rate": 5.678169391868127e-07, "loss": 0.7564, "step": 1735 }, { "epoch": 0.473314702474269, "grad_norm": 3.485594137435761, "learning_rate": 5.673792635537121e-07, "loss": 0.6716, "step": 1736 }, { "epoch": 0.4735873491922841, "grad_norm": 2.8222706571507596, "learning_rate": 5.669415353306961e-07, "loss": 0.6952, "step": 1737 }, { "epoch": 0.4738599959102992, "grad_norm": 2.618119272051101, "learning_rate": 5.665037548594141e-07, "loss": 0.6573, "step": 1738 }, { "epoch": 0.47413264262831434, "grad_norm": 2.0167103593408524, "learning_rate": 5.660659224815564e-07, "loss": 0.6684, "step": 1739 }, { "epoch": 0.4744052893463295, "grad_norm": 2.4144913271338146, "learning_rate": 5.656280385388538e-07, "loss": 0.7149, "step": 1740 }, { "epoch": 0.4746779360643446, "grad_norm": 2.1275323392555996, "learning_rate": 5.651901033730771e-07, "loss": 0.7334, "step": 1741 }, { "epoch": 0.47495058278235974, "grad_norm": 2.0821283278428586, "learning_rate": 5.647521173260376e-07, "loss": 0.7308, "step": 1742 }, { "epoch": 0.4752232295003749, "grad_norm": 1.895954998840716, "learning_rate": 5.643140807395861e-07, "loss": 0.6859, "step": 1743 }, { "epoch": 0.47549587621839, "grad_norm": 2.1816772867588328, "learning_rate": 5.638759939556124e-07, "loss": 0.6826, "step": 1744 }, { "epoch": 0.47576852293640515, "grad_norm": 2.206607755705215, "learning_rate": 5.634378573160461e-07, "loss": 0.7321, "step": 1745 }, { "epoch": 0.4760411696544203, "grad_norm": 2.6735608265051414, "learning_rate": 5.62999671162855e-07, "loss": 0.7481, "step": 1746 }, { "epoch": 0.4763138163724354, "grad_norm": 1.8054654754203285, "learning_rate": 5.625614358380468e-07, "loss": 0.6859, "step": 1747 }, { "epoch": 0.47658646309045055, "grad_norm": 2.775261710455524, "learning_rate": 5.621231516836661e-07, "loss": 0.6857, "step": 1748 }, { "epoch": 0.4768591098084657, "grad_norm": 2.1976683177405887, "learning_rate": 5.616848190417965e-07, "loss": 0.6454, "step": 1749 }, { "epoch": 0.4771317565264808, "grad_norm": 2.783035703941938, "learning_rate": 5.612464382545592e-07, "loss": 0.6417, "step": 1750 }, { "epoch": 0.47740440324449596, "grad_norm": 2.2930541438949072, "learning_rate": 5.608080096641131e-07, "loss": 0.7099, "step": 1751 }, { "epoch": 0.4776770499625111, "grad_norm": 2.140012596421041, "learning_rate": 5.603695336126543e-07, "loss": 0.6611, "step": 1752 }, { "epoch": 0.4779496966805262, "grad_norm": 2.0339767410771397, "learning_rate": 5.59931010442416e-07, "loss": 0.6699, "step": 1753 }, { "epoch": 0.47822234339854136, "grad_norm": 4.039255957140312, "learning_rate": 5.594924404956679e-07, "loss": 0.6806, "step": 1754 }, { "epoch": 0.4784949901165565, "grad_norm": 2.7993888025745464, "learning_rate": 5.590538241147166e-07, "loss": 0.7463, "step": 1755 }, { "epoch": 0.47876763683457163, "grad_norm": 2.6131879185967866, "learning_rate": 5.586151616419049e-07, "loss": 0.6768, "step": 1756 }, { "epoch": 0.4790402835525867, "grad_norm": 3.615980342608864, "learning_rate": 5.581764534196113e-07, "loss": 0.6445, "step": 1757 }, { "epoch": 0.47931293027060184, "grad_norm": 3.876059346095038, "learning_rate": 5.577376997902503e-07, "loss": 0.6789, "step": 1758 }, { "epoch": 0.479585576988617, "grad_norm": 3.25602568308337, "learning_rate": 5.572989010962716e-07, "loss": 0.657, "step": 1759 }, { "epoch": 0.4798582237066321, "grad_norm": 2.0247194801818633, "learning_rate": 5.568600576801604e-07, "loss": 0.6806, "step": 1760 }, { "epoch": 0.48013087042464725, "grad_norm": 2.8403344319733663, "learning_rate": 5.564211698844362e-07, "loss": 0.6749, "step": 1761 }, { "epoch": 0.4804035171426624, "grad_norm": 5.868524296589944, "learning_rate": 5.559822380516539e-07, "loss": 0.6273, "step": 1762 }, { "epoch": 0.4806761638606775, "grad_norm": 2.8651792136043026, "learning_rate": 5.555432625244023e-07, "loss": 0.6772, "step": 1763 }, { "epoch": 0.48094881057869265, "grad_norm": 1.9830474887319427, "learning_rate": 5.551042436453043e-07, "loss": 0.6882, "step": 1764 }, { "epoch": 0.4812214572967078, "grad_norm": 1.7945699311925494, "learning_rate": 5.546651817570169e-07, "loss": 0.6405, "step": 1765 }, { "epoch": 0.4814941040147229, "grad_norm": 2.60751440249677, "learning_rate": 5.542260772022304e-07, "loss": 0.7496, "step": 1766 }, { "epoch": 0.48176675073273806, "grad_norm": 2.7463557005921984, "learning_rate": 5.537869303236686e-07, "loss": 0.6255, "step": 1767 }, { "epoch": 0.4820393974507532, "grad_norm": 2.0898686434592646, "learning_rate": 5.533477414640879e-07, "loss": 0.711, "step": 1768 }, { "epoch": 0.48231204416876833, "grad_norm": 1.9976101171177063, "learning_rate": 5.529085109662783e-07, "loss": 0.7242, "step": 1769 }, { "epoch": 0.48258469088678346, "grad_norm": 2.6676132664175864, "learning_rate": 5.524692391730618e-07, "loss": 0.7215, "step": 1770 }, { "epoch": 0.4828573376047986, "grad_norm": 2.1099181887710334, "learning_rate": 5.520299264272923e-07, "loss": 0.6701, "step": 1771 }, { "epoch": 0.48312998432281373, "grad_norm": 3.0849640767707402, "learning_rate": 5.515905730718562e-07, "loss": 0.6303, "step": 1772 }, { "epoch": 0.48340263104082887, "grad_norm": 1.92700896313567, "learning_rate": 5.511511794496714e-07, "loss": 0.6867, "step": 1773 }, { "epoch": 0.483675277758844, "grad_norm": 2.639143351873302, "learning_rate": 5.50711745903687e-07, "loss": 0.7203, "step": 1774 }, { "epoch": 0.48394792447685914, "grad_norm": 2.5234949159422215, "learning_rate": 5.502722727768838e-07, "loss": 0.7373, "step": 1775 }, { "epoch": 0.4842205711948742, "grad_norm": 2.6724307371731153, "learning_rate": 5.498327604122732e-07, "loss": 0.6626, "step": 1776 }, { "epoch": 0.48449321791288935, "grad_norm": 2.574252024274836, "learning_rate": 5.493932091528971e-07, "loss": 0.6971, "step": 1777 }, { "epoch": 0.4847658646309045, "grad_norm": 4.012797910043176, "learning_rate": 5.489536193418278e-07, "loss": 0.6732, "step": 1778 }, { "epoch": 0.4850385113489196, "grad_norm": 2.6063025659836945, "learning_rate": 5.485139913221678e-07, "loss": 0.6923, "step": 1779 }, { "epoch": 0.48531115806693476, "grad_norm": 2.2110953467486185, "learning_rate": 5.480743254370494e-07, "loss": 0.624, "step": 1780 }, { "epoch": 0.4855838047849499, "grad_norm": 2.2844896202151412, "learning_rate": 5.476346220296345e-07, "loss": 0.6725, "step": 1781 }, { "epoch": 0.485856451502965, "grad_norm": 2.3091038402457147, "learning_rate": 5.47194881443114e-07, "loss": 0.7057, "step": 1782 }, { "epoch": 0.48612909822098016, "grad_norm": 2.032679572882436, "learning_rate": 5.467551040207085e-07, "loss": 0.6453, "step": 1783 }, { "epoch": 0.4864017449389953, "grad_norm": 3.1042346154249723, "learning_rate": 5.463152901056664e-07, "loss": 0.648, "step": 1784 }, { "epoch": 0.48667439165701043, "grad_norm": 2.2261485815970876, "learning_rate": 5.458754400412652e-07, "loss": 0.7138, "step": 1785 }, { "epoch": 0.48694703837502556, "grad_norm": 1.9908153674148161, "learning_rate": 5.454355541708107e-07, "loss": 0.6353, "step": 1786 }, { "epoch": 0.4872196850930407, "grad_norm": 7.104741852699178, "learning_rate": 5.449956328376361e-07, "loss": 0.6409, "step": 1787 }, { "epoch": 0.48749233181105583, "grad_norm": 2.8692003886283524, "learning_rate": 5.445556763851029e-07, "loss": 0.7094, "step": 1788 }, { "epoch": 0.48776497852907097, "grad_norm": 2.9921485425779526, "learning_rate": 5.441156851565997e-07, "loss": 0.6918, "step": 1789 }, { "epoch": 0.4880376252470861, "grad_norm": 2.041614290618279, "learning_rate": 5.436756594955421e-07, "loss": 0.6879, "step": 1790 }, { "epoch": 0.48831027196510124, "grad_norm": 4.729026275542453, "learning_rate": 5.432355997453728e-07, "loss": 0.7362, "step": 1791 }, { "epoch": 0.4885829186831164, "grad_norm": 3.762767513050278, "learning_rate": 5.427955062495612e-07, "loss": 0.6776, "step": 1792 }, { "epoch": 0.4888555654011315, "grad_norm": 3.577227699241446, "learning_rate": 5.423553793516028e-07, "loss": 0.7611, "step": 1793 }, { "epoch": 0.4891282121191466, "grad_norm": 2.7866796631127078, "learning_rate": 5.419152193950191e-07, "loss": 0.6889, "step": 1794 }, { "epoch": 0.4894008588371617, "grad_norm": 1.9716831598575717, "learning_rate": 5.414750267233576e-07, "loss": 0.7337, "step": 1795 }, { "epoch": 0.48967350555517686, "grad_norm": 2.1047924181745388, "learning_rate": 5.410348016801915e-07, "loss": 0.6414, "step": 1796 }, { "epoch": 0.489946152273192, "grad_norm": 2.016592179259306, "learning_rate": 5.405945446091191e-07, "loss": 0.6529, "step": 1797 }, { "epoch": 0.4902187989912071, "grad_norm": 1.9842442909751856, "learning_rate": 5.401542558537634e-07, "loss": 0.6668, "step": 1798 }, { "epoch": 0.49049144570922226, "grad_norm": 1.8787571438202848, "learning_rate": 5.397139357577724e-07, "loss": 0.6844, "step": 1799 }, { "epoch": 0.4907640924272374, "grad_norm": 2.6655944843348283, "learning_rate": 5.392735846648188e-07, "loss": 0.6858, "step": 1800 }, { "epoch": 0.49103673914525253, "grad_norm": 2.5341336915922494, "learning_rate": 5.38833202918599e-07, "loss": 0.7566, "step": 1801 }, { "epoch": 0.49130938586326767, "grad_norm": 2.85118822561444, "learning_rate": 5.383927908628335e-07, "loss": 0.6425, "step": 1802 }, { "epoch": 0.4915820325812828, "grad_norm": 2.261278146816137, "learning_rate": 5.379523488412668e-07, "loss": 0.6772, "step": 1803 }, { "epoch": 0.49185467929929794, "grad_norm": 2.3755944756704492, "learning_rate": 5.375118771976664e-07, "loss": 0.6832, "step": 1804 }, { "epoch": 0.49212732601731307, "grad_norm": 2.210141761927035, "learning_rate": 5.370713762758231e-07, "loss": 0.6798, "step": 1805 }, { "epoch": 0.4923999727353282, "grad_norm": 3.2234119447737823, "learning_rate": 5.366308464195504e-07, "loss": 0.6694, "step": 1806 }, { "epoch": 0.49267261945334334, "grad_norm": 2.3209482418639213, "learning_rate": 5.361902879726843e-07, "loss": 0.7123, "step": 1807 }, { "epoch": 0.4929452661713585, "grad_norm": 4.0593348995149725, "learning_rate": 5.357497012790837e-07, "loss": 0.6059, "step": 1808 }, { "epoch": 0.4932179128893736, "grad_norm": 2.4887461836070988, "learning_rate": 5.353090866826288e-07, "loss": 0.7152, "step": 1809 }, { "epoch": 0.49349055960738875, "grad_norm": 2.138552174880042, "learning_rate": 5.34868444527222e-07, "loss": 0.6614, "step": 1810 }, { "epoch": 0.4937632063254039, "grad_norm": 3.493068435010434, "learning_rate": 5.344277751567873e-07, "loss": 0.6657, "step": 1811 }, { "epoch": 0.494035853043419, "grad_norm": 2.183051501463853, "learning_rate": 5.339870789152694e-07, "loss": 0.6694, "step": 1812 }, { "epoch": 0.4943084997614341, "grad_norm": 2.4124836029296994, "learning_rate": 5.335463561466347e-07, "loss": 0.736, "step": 1813 }, { "epoch": 0.49458114647944923, "grad_norm": 2.3238936753223443, "learning_rate": 5.331056071948695e-07, "loss": 0.6163, "step": 1814 }, { "epoch": 0.49485379319746436, "grad_norm": 2.177027895463622, "learning_rate": 5.326648324039814e-07, "loss": 0.7137, "step": 1815 }, { "epoch": 0.4951264399154795, "grad_norm": 3.5994334942589, "learning_rate": 5.322240321179976e-07, "loss": 0.7629, "step": 1816 }, { "epoch": 0.49539908663349463, "grad_norm": 5.0713249007722, "learning_rate": 5.317832066809654e-07, "loss": 0.7016, "step": 1817 }, { "epoch": 0.49567173335150977, "grad_norm": 2.8646403789294683, "learning_rate": 5.313423564369514e-07, "loss": 0.7026, "step": 1818 }, { "epoch": 0.4959443800695249, "grad_norm": 2.8360175324045325, "learning_rate": 5.309014817300421e-07, "loss": 0.6734, "step": 1819 }, { "epoch": 0.49621702678754004, "grad_norm": 3.9730576830194217, "learning_rate": 5.304605829043427e-07, "loss": 0.7233, "step": 1820 }, { "epoch": 0.4964896735055552, "grad_norm": 5.0679304671233, "learning_rate": 5.300196603039775e-07, "loss": 0.6082, "step": 1821 }, { "epoch": 0.4967623202235703, "grad_norm": 1.7787687326011499, "learning_rate": 5.295787142730891e-07, "loss": 0.707, "step": 1822 }, { "epoch": 0.49703496694158544, "grad_norm": 2.161342418202776, "learning_rate": 5.291377451558385e-07, "loss": 0.7573, "step": 1823 }, { "epoch": 0.4973076136596006, "grad_norm": 2.4009234534451065, "learning_rate": 5.286967532964048e-07, "loss": 0.7105, "step": 1824 }, { "epoch": 0.4975802603776157, "grad_norm": 2.8674821179056265, "learning_rate": 5.282557390389845e-07, "loss": 0.69, "step": 1825 }, { "epoch": 0.49785290709563085, "grad_norm": 1.9438463466244176, "learning_rate": 5.278147027277922e-07, "loss": 0.7933, "step": 1826 }, { "epoch": 0.498125553813646, "grad_norm": 2.181032846702435, "learning_rate": 5.273736447070592e-07, "loss": 0.7439, "step": 1827 }, { "epoch": 0.4983982005316611, "grad_norm": 4.058185255019489, "learning_rate": 5.269325653210339e-07, "loss": 0.6803, "step": 1828 }, { "epoch": 0.49867084724967625, "grad_norm": 3.4621674519377197, "learning_rate": 5.264914649139815e-07, "loss": 0.6751, "step": 1829 }, { "epoch": 0.4989434939676914, "grad_norm": 2.1769893330826426, "learning_rate": 5.260503438301832e-07, "loss": 0.8129, "step": 1830 }, { "epoch": 0.4992161406857065, "grad_norm": 1.7979634316178525, "learning_rate": 5.256092024139369e-07, "loss": 0.6929, "step": 1831 }, { "epoch": 0.4994887874037216, "grad_norm": 2.5841349297050202, "learning_rate": 5.25168041009556e-07, "loss": 0.7256, "step": 1832 }, { "epoch": 0.49976143412173674, "grad_norm": 2.6877856329114653, "learning_rate": 5.247268599613695e-07, "loss": 0.6417, "step": 1833 }, { "epoch": 0.5000340808397519, "grad_norm": 2.819579604791406, "learning_rate": 5.242856596137219e-07, "loss": 0.6934, "step": 1834 }, { "epoch": 0.5003067275577671, "grad_norm": 1.9032517209046516, "learning_rate": 5.238444403109728e-07, "loss": 0.6893, "step": 1835 }, { "epoch": 0.5005793742757821, "grad_norm": 3.0023079816791363, "learning_rate": 5.234032023974965e-07, "loss": 0.7564, "step": 1836 }, { "epoch": 0.5008520209937973, "grad_norm": 2.9183567654867764, "learning_rate": 5.229619462176817e-07, "loss": 0.6852, "step": 1837 }, { "epoch": 0.5011246677118124, "grad_norm": 1.9652212569257843, "learning_rate": 5.225206721159314e-07, "loss": 0.6447, "step": 1838 }, { "epoch": 0.5013973144298276, "grad_norm": 1.929990852289511, "learning_rate": 5.220793804366627e-07, "loss": 0.6572, "step": 1839 }, { "epoch": 0.5016699611478427, "grad_norm": 2.769701896397862, "learning_rate": 5.216380715243063e-07, "loss": 0.6617, "step": 1840 }, { "epoch": 0.5019426078658578, "grad_norm": 2.537458115432489, "learning_rate": 5.211967457233069e-07, "loss": 0.7001, "step": 1841 }, { "epoch": 0.502215254583873, "grad_norm": 1.9080231103432947, "learning_rate": 5.207554033781214e-07, "loss": 0.7338, "step": 1842 }, { "epoch": 0.502487901301888, "grad_norm": 4.423783271132566, "learning_rate": 5.203140448332202e-07, "loss": 0.7138, "step": 1843 }, { "epoch": 0.5027605480199032, "grad_norm": 2.5916350415277853, "learning_rate": 5.198726704330863e-07, "loss": 0.7282, "step": 1844 }, { "epoch": 0.5030331947379183, "grad_norm": 3.343909511118555, "learning_rate": 5.194312805222152e-07, "loss": 0.7062, "step": 1845 }, { "epoch": 0.5033058414559335, "grad_norm": 3.574398001007083, "learning_rate": 5.189898754451143e-07, "loss": 0.7124, "step": 1846 }, { "epoch": 0.5035784881739486, "grad_norm": 2.545719563185916, "learning_rate": 5.185484555463026e-07, "loss": 0.6446, "step": 1847 }, { "epoch": 0.5038511348919638, "grad_norm": 2.528238549816992, "learning_rate": 5.181070211703113e-07, "loss": 0.7268, "step": 1848 }, { "epoch": 0.5041237816099788, "grad_norm": 1.9665464183224823, "learning_rate": 5.176655726616824e-07, "loss": 0.6931, "step": 1849 }, { "epoch": 0.504396428327994, "grad_norm": 2.1563915773689253, "learning_rate": 5.172241103649691e-07, "loss": 0.7071, "step": 1850 }, { "epoch": 0.5046690750460091, "grad_norm": 4.191096446896658, "learning_rate": 5.167826346247355e-07, "loss": 0.6991, "step": 1851 }, { "epoch": 0.5049417217640243, "grad_norm": 2.1731020457820764, "learning_rate": 5.163411457855557e-07, "loss": 0.7433, "step": 1852 }, { "epoch": 0.5052143684820394, "grad_norm": 2.100448993347334, "learning_rate": 5.158996441920146e-07, "loss": 0.63, "step": 1853 }, { "epoch": 0.5054870152000546, "grad_norm": 2.257324146780973, "learning_rate": 5.154581301887068e-07, "loss": 0.6165, "step": 1854 }, { "epoch": 0.5057596619180696, "grad_norm": 2.240301617036304, "learning_rate": 5.150166041202365e-07, "loss": 0.6175, "step": 1855 }, { "epoch": 0.5060323086360848, "grad_norm": 5.4876098320299915, "learning_rate": 5.145750663312176e-07, "loss": 0.7304, "step": 1856 }, { "epoch": 0.5063049553540999, "grad_norm": 2.134516430480249, "learning_rate": 5.141335171662729e-07, "loss": 0.7086, "step": 1857 }, { "epoch": 0.5065776020721151, "grad_norm": 5.325614713641671, "learning_rate": 5.136919569700339e-07, "loss": 0.741, "step": 1858 }, { "epoch": 0.5068502487901302, "grad_norm": 3.5342201260798634, "learning_rate": 5.132503860871412e-07, "loss": 0.7614, "step": 1859 }, { "epoch": 0.5071228955081453, "grad_norm": 2.607273596619329, "learning_rate": 5.128088048622434e-07, "loss": 0.7172, "step": 1860 }, { "epoch": 0.5073955422261605, "grad_norm": 2.5887388462830083, "learning_rate": 5.123672136399975e-07, "loss": 0.7315, "step": 1861 }, { "epoch": 0.5076681889441755, "grad_norm": 2.6250468042454282, "learning_rate": 5.119256127650678e-07, "loss": 0.7379, "step": 1862 }, { "epoch": 0.5079408356621907, "grad_norm": 3.021664580350237, "learning_rate": 5.114840025821264e-07, "loss": 0.723, "step": 1863 }, { "epoch": 0.5082134823802058, "grad_norm": 2.000172931342589, "learning_rate": 5.110423834358529e-07, "loss": 0.6855, "step": 1864 }, { "epoch": 0.508486129098221, "grad_norm": 1.5859738518681032, "learning_rate": 5.106007556709337e-07, "loss": 0.6727, "step": 1865 }, { "epoch": 0.5087587758162361, "grad_norm": 1.7526498571374507, "learning_rate": 5.101591196320613e-07, "loss": 0.6457, "step": 1866 }, { "epoch": 0.5090314225342513, "grad_norm": 2.0113154311622106, "learning_rate": 5.09717475663936e-07, "loss": 0.7646, "step": 1867 }, { "epoch": 0.5093040692522663, "grad_norm": 2.2313612664176037, "learning_rate": 5.092758241112631e-07, "loss": 0.7548, "step": 1868 }, { "epoch": 0.5095767159702815, "grad_norm": 3.9467781084825604, "learning_rate": 5.088341653187545e-07, "loss": 0.6989, "step": 1869 }, { "epoch": 0.5098493626882966, "grad_norm": 2.550905895069194, "learning_rate": 5.083924996311274e-07, "loss": 0.7466, "step": 1870 }, { "epoch": 0.5101220094063118, "grad_norm": 2.1857119645333047, "learning_rate": 5.079508273931045e-07, "loss": 0.7509, "step": 1871 }, { "epoch": 0.5103946561243269, "grad_norm": 1.8863676192699947, "learning_rate": 5.075091489494135e-07, "loss": 0.6666, "step": 1872 }, { "epoch": 0.5106673028423421, "grad_norm": 2.7788952244757934, "learning_rate": 5.07067464644787e-07, "loss": 0.6902, "step": 1873 }, { "epoch": 0.5109399495603572, "grad_norm": 14.402840141032602, "learning_rate": 5.066257748239625e-07, "loss": 0.7299, "step": 1874 }, { "epoch": 0.5112125962783723, "grad_norm": 2.3135474666050415, "learning_rate": 5.061840798316814e-07, "loss": 0.711, "step": 1875 }, { "epoch": 0.5114852429963874, "grad_norm": 3.229916673798276, "learning_rate": 5.057423800126893e-07, "loss": 0.6919, "step": 1876 }, { "epoch": 0.5117578897144026, "grad_norm": 2.4313022514118106, "learning_rate": 5.053006757117355e-07, "loss": 0.7111, "step": 1877 }, { "epoch": 0.5120305364324177, "grad_norm": 6.670256940441904, "learning_rate": 5.048589672735728e-07, "loss": 0.6704, "step": 1878 }, { "epoch": 0.5123031831504328, "grad_norm": 3.561346162679862, "learning_rate": 5.044172550429572e-07, "loss": 0.7518, "step": 1879 }, { "epoch": 0.512575829868448, "grad_norm": 3.53877336915216, "learning_rate": 5.039755393646479e-07, "loss": 0.7601, "step": 1880 }, { "epoch": 0.512848476586463, "grad_norm": 2.6498089024338705, "learning_rate": 5.035338205834064e-07, "loss": 0.6875, "step": 1881 }, { "epoch": 0.5131211233044782, "grad_norm": 3.724308964277005, "learning_rate": 5.030920990439969e-07, "loss": 0.6498, "step": 1882 }, { "epoch": 0.5133937700224933, "grad_norm": 2.52889405278082, "learning_rate": 5.02650375091186e-07, "loss": 0.7262, "step": 1883 }, { "epoch": 0.5136664167405085, "grad_norm": 2.2177785907328, "learning_rate": 5.022086490697415e-07, "loss": 0.6069, "step": 1884 }, { "epoch": 0.5139390634585236, "grad_norm": 4.006228322381084, "learning_rate": 5.017669213244332e-07, "loss": 0.7206, "step": 1885 }, { "epoch": 0.5142117101765388, "grad_norm": 5.135603425933029, "learning_rate": 5.013251922000322e-07, "loss": 0.6593, "step": 1886 }, { "epoch": 0.5144843568945539, "grad_norm": 2.035595117539374, "learning_rate": 5.00883462041311e-07, "loss": 0.6702, "step": 1887 }, { "epoch": 0.514757003612569, "grad_norm": 2.0730220232157057, "learning_rate": 5.004417311930425e-07, "loss": 0.7524, "step": 1888 }, { "epoch": 0.5150296503305841, "grad_norm": 2.731652568515541, "learning_rate": 5e-07, "loss": 0.6351, "step": 1889 }, { "epoch": 0.5153022970485993, "grad_norm": 4.259090949503816, "learning_rate": 4.995582688069575e-07, "loss": 0.7402, "step": 1890 }, { "epoch": 0.5155749437666144, "grad_norm": 2.2195947740260964, "learning_rate": 4.99116537958689e-07, "loss": 0.668, "step": 1891 }, { "epoch": 0.5158475904846296, "grad_norm": 2.052506058949953, "learning_rate": 4.986748077999676e-07, "loss": 0.6872, "step": 1892 }, { "epoch": 0.5161202372026447, "grad_norm": 3.4398209335571104, "learning_rate": 4.982330786755669e-07, "loss": 0.68, "step": 1893 }, { "epoch": 0.5163928839206599, "grad_norm": 3.185508052202769, "learning_rate": 4.977913509302586e-07, "loss": 0.6748, "step": 1894 }, { "epoch": 0.5166655306386749, "grad_norm": 2.214293534187428, "learning_rate": 4.97349624908814e-07, "loss": 0.6427, "step": 1895 }, { "epoch": 0.5169381773566901, "grad_norm": 1.9815154598263958, "learning_rate": 4.96907900956003e-07, "loss": 0.6691, "step": 1896 }, { "epoch": 0.5172108240747052, "grad_norm": 3.0600074988634933, "learning_rate": 4.964661794165935e-07, "loss": 0.7937, "step": 1897 }, { "epoch": 0.5174834707927203, "grad_norm": 2.0394910570634446, "learning_rate": 4.960244606353521e-07, "loss": 0.5863, "step": 1898 }, { "epoch": 0.5177561175107355, "grad_norm": 11.063183744198776, "learning_rate": 4.955827449570428e-07, "loss": 0.6843, "step": 1899 }, { "epoch": 0.5180287642287505, "grad_norm": 2.08228686674727, "learning_rate": 4.951410327264272e-07, "loss": 0.6746, "step": 1900 }, { "epoch": 0.5183014109467657, "grad_norm": 2.375710034125843, "learning_rate": 4.946993242882645e-07, "loss": 0.6216, "step": 1901 }, { "epoch": 0.5185740576647808, "grad_norm": 3.166542375802894, "learning_rate": 4.942576199873108e-07, "loss": 0.6457, "step": 1902 }, { "epoch": 0.518846704382796, "grad_norm": 2.66150707514843, "learning_rate": 4.938159201683186e-07, "loss": 0.7441, "step": 1903 }, { "epoch": 0.5191193511008111, "grad_norm": 4.338980259861379, "learning_rate": 4.933742251760374e-07, "loss": 0.6559, "step": 1904 }, { "epoch": 0.5193919978188263, "grad_norm": 2.798550186207583, "learning_rate": 4.92932535355213e-07, "loss": 0.7378, "step": 1905 }, { "epoch": 0.5196646445368414, "grad_norm": 1.8758259230634187, "learning_rate": 4.924908510505866e-07, "loss": 0.6564, "step": 1906 }, { "epoch": 0.5199372912548565, "grad_norm": 3.1967187851095216, "learning_rate": 4.920491726068957e-07, "loss": 0.6469, "step": 1907 }, { "epoch": 0.5202099379728716, "grad_norm": 4.286007438737428, "learning_rate": 4.916075003688726e-07, "loss": 0.664, "step": 1908 }, { "epoch": 0.5204825846908868, "grad_norm": 2.4298143144135445, "learning_rate": 4.911658346812454e-07, "loss": 0.7511, "step": 1909 }, { "epoch": 0.5207552314089019, "grad_norm": 2.887729940327334, "learning_rate": 4.907241758887369e-07, "loss": 0.6294, "step": 1910 }, { "epoch": 0.5210278781269171, "grad_norm": 3.1851274803242613, "learning_rate": 4.902825243360639e-07, "loss": 0.6865, "step": 1911 }, { "epoch": 0.5213005248449322, "grad_norm": 1.84079438279361, "learning_rate": 4.898408803679387e-07, "loss": 0.7014, "step": 1912 }, { "epoch": 0.5215731715629474, "grad_norm": 2.087337566845923, "learning_rate": 4.893992443290664e-07, "loss": 0.6664, "step": 1913 }, { "epoch": 0.5218458182809624, "grad_norm": 2.29248852781795, "learning_rate": 4.889576165641472e-07, "loss": 0.7326, "step": 1914 }, { "epoch": 0.5221184649989776, "grad_norm": 3.178499580627742, "learning_rate": 4.885159974178736e-07, "loss": 0.7298, "step": 1915 }, { "epoch": 0.5223911117169927, "grad_norm": 4.451346227419118, "learning_rate": 4.880743872349322e-07, "loss": 0.6825, "step": 1916 }, { "epoch": 0.5226637584350078, "grad_norm": 7.87068217737144, "learning_rate": 4.876327863600025e-07, "loss": 0.6697, "step": 1917 }, { "epoch": 0.522936405153023, "grad_norm": 2.4041883804477777, "learning_rate": 4.871911951377566e-07, "loss": 0.6725, "step": 1918 }, { "epoch": 0.5232090518710381, "grad_norm": 4.664728289538559, "learning_rate": 4.867496139128589e-07, "loss": 0.6848, "step": 1919 }, { "epoch": 0.5234816985890532, "grad_norm": 3.026386124825403, "learning_rate": 4.863080430299661e-07, "loss": 0.7148, "step": 1920 }, { "epoch": 0.5237543453070683, "grad_norm": 2.305510946642436, "learning_rate": 4.858664828337274e-07, "loss": 0.7209, "step": 1921 }, { "epoch": 0.5240269920250835, "grad_norm": 3.368603048409353, "learning_rate": 4.854249336687824e-07, "loss": 0.7027, "step": 1922 }, { "epoch": 0.5242996387430986, "grad_norm": 2.777863026781603, "learning_rate": 4.849833958797634e-07, "loss": 0.7035, "step": 1923 }, { "epoch": 0.5245722854611138, "grad_norm": 2.3617134108046223, "learning_rate": 4.845418698112933e-07, "loss": 0.6565, "step": 1924 }, { "epoch": 0.5248449321791289, "grad_norm": 3.7247986623670775, "learning_rate": 4.841003558079853e-07, "loss": 0.675, "step": 1925 }, { "epoch": 0.525117578897144, "grad_norm": 3.2712506242626103, "learning_rate": 4.836588542144443e-07, "loss": 0.682, "step": 1926 }, { "epoch": 0.5253902256151591, "grad_norm": 1.7717058186155235, "learning_rate": 4.832173653752645e-07, "loss": 0.6512, "step": 1927 }, { "epoch": 0.5256628723331743, "grad_norm": 2.0231541881750026, "learning_rate": 4.827758896350309e-07, "loss": 0.654, "step": 1928 }, { "epoch": 0.5259355190511894, "grad_norm": 2.0514302292028894, "learning_rate": 4.823344273383176e-07, "loss": 0.6244, "step": 1929 }, { "epoch": 0.5262081657692046, "grad_norm": 4.021424739015963, "learning_rate": 4.818929788296886e-07, "loss": 0.6015, "step": 1930 }, { "epoch": 0.5264808124872197, "grad_norm": 1.8799068519955506, "learning_rate": 4.814515444536974e-07, "loss": 0.6501, "step": 1931 }, { "epoch": 0.5267534592052349, "grad_norm": 2.2783008714167767, "learning_rate": 4.810101245548858e-07, "loss": 0.6453, "step": 1932 }, { "epoch": 0.5270261059232499, "grad_norm": 3.5145426392928147, "learning_rate": 4.805687194777848e-07, "loss": 0.6375, "step": 1933 }, { "epoch": 0.5272987526412651, "grad_norm": 2.616713024623749, "learning_rate": 4.801273295669137e-07, "loss": 0.7508, "step": 1934 }, { "epoch": 0.5275713993592802, "grad_norm": 2.4580216931754375, "learning_rate": 4.796859551667799e-07, "loss": 0.7737, "step": 1935 }, { "epoch": 0.5278440460772953, "grad_norm": 3.2486656014905892, "learning_rate": 4.792445966218787e-07, "loss": 0.7032, "step": 1936 }, { "epoch": 0.5281166927953105, "grad_norm": 4.31515684257752, "learning_rate": 4.788032542766932e-07, "loss": 0.7054, "step": 1937 }, { "epoch": 0.5283893395133256, "grad_norm": 2.9400538569556094, "learning_rate": 4.783619284756936e-07, "loss": 0.728, "step": 1938 }, { "epoch": 0.5286619862313408, "grad_norm": 3.14961071374804, "learning_rate": 4.779206195633373e-07, "loss": 0.6667, "step": 1939 }, { "epoch": 0.5289346329493558, "grad_norm": 2.876678497984836, "learning_rate": 4.774793278840688e-07, "loss": 0.6743, "step": 1940 }, { "epoch": 0.529207279667371, "grad_norm": 13.256839867049353, "learning_rate": 4.770380537823184e-07, "loss": 0.733, "step": 1941 }, { "epoch": 0.5294799263853861, "grad_norm": 4.6830830152573935, "learning_rate": 4.7659679760250337e-07, "loss": 0.6972, "step": 1942 }, { "epoch": 0.5297525731034013, "grad_norm": 2.3155628220271267, "learning_rate": 4.7615555968902714e-07, "loss": 0.6809, "step": 1943 }, { "epoch": 0.5300252198214164, "grad_norm": 2.4011876439862903, "learning_rate": 4.757143403862779e-07, "loss": 0.6991, "step": 1944 }, { "epoch": 0.5302978665394316, "grad_norm": 3.200010702731733, "learning_rate": 4.752731400386306e-07, "loss": 0.6221, "step": 1945 }, { "epoch": 0.5305705132574466, "grad_norm": 2.0949648514785846, "learning_rate": 4.74831958990444e-07, "loss": 0.6685, "step": 1946 }, { "epoch": 0.5308431599754618, "grad_norm": 3.3143748709952345, "learning_rate": 4.743907975860632e-07, "loss": 0.6807, "step": 1947 }, { "epoch": 0.5311158066934769, "grad_norm": 6.597833876351695, "learning_rate": 4.739496561698168e-07, "loss": 0.6571, "step": 1948 }, { "epoch": 0.5313884534114921, "grad_norm": 2.4056822509354703, "learning_rate": 4.735085350860185e-07, "loss": 0.7016, "step": 1949 }, { "epoch": 0.5316611001295072, "grad_norm": 2.0920084242850523, "learning_rate": 4.730674346789661e-07, "loss": 0.6752, "step": 1950 }, { "epoch": 0.5319337468475224, "grad_norm": 3.8260872356966154, "learning_rate": 4.726263552929407e-07, "loss": 0.6527, "step": 1951 }, { "epoch": 0.5322063935655375, "grad_norm": 3.3898522540338085, "learning_rate": 4.7218529727220784e-07, "loss": 0.7409, "step": 1952 }, { "epoch": 0.5324790402835526, "grad_norm": 2.254985235476269, "learning_rate": 4.7174426096101543e-07, "loss": 0.6988, "step": 1953 }, { "epoch": 0.5327516870015677, "grad_norm": 3.76972697712417, "learning_rate": 4.713032467035953e-07, "loss": 0.7312, "step": 1954 }, { "epoch": 0.5330243337195828, "grad_norm": 2.0243053657475456, "learning_rate": 4.7086225484416155e-07, "loss": 0.6541, "step": 1955 }, { "epoch": 0.533296980437598, "grad_norm": 3.0698429906841223, "learning_rate": 4.7042128572691087e-07, "loss": 0.6742, "step": 1956 }, { "epoch": 0.5335696271556131, "grad_norm": 2.2131198130682663, "learning_rate": 4.699803396960225e-07, "loss": 0.6337, "step": 1957 }, { "epoch": 0.5338422738736283, "grad_norm": 2.349925278053693, "learning_rate": 4.695394170956572e-07, "loss": 0.7176, "step": 1958 }, { "epoch": 0.5341149205916433, "grad_norm": 3.345622971184241, "learning_rate": 4.69098518269958e-07, "loss": 0.6971, "step": 1959 }, { "epoch": 0.5343875673096585, "grad_norm": 8.418865649419894, "learning_rate": 4.686576435630486e-07, "loss": 0.6624, "step": 1960 }, { "epoch": 0.5346602140276736, "grad_norm": 29.955862840259254, "learning_rate": 4.6821679331903486e-07, "loss": 0.6689, "step": 1961 }, { "epoch": 0.5349328607456888, "grad_norm": 1.833195876708276, "learning_rate": 4.6777596788200244e-07, "loss": 0.7201, "step": 1962 }, { "epoch": 0.5352055074637039, "grad_norm": 2.5750745002108, "learning_rate": 4.6733516759601854e-07, "loss": 0.6958, "step": 1963 }, { "epoch": 0.5354781541817191, "grad_norm": 1.978414767717926, "learning_rate": 4.6689439280513056e-07, "loss": 0.6517, "step": 1964 }, { "epoch": 0.5357508008997341, "grad_norm": 1.7186570583760017, "learning_rate": 4.664536438533654e-07, "loss": 0.7625, "step": 1965 }, { "epoch": 0.5360234476177493, "grad_norm": 2.949610109421259, "learning_rate": 4.660129210847307e-07, "loss": 0.6423, "step": 1966 }, { "epoch": 0.5362960943357644, "grad_norm": 3.5994398190992727, "learning_rate": 4.655722248432128e-07, "loss": 0.6959, "step": 1967 }, { "epoch": 0.5365687410537796, "grad_norm": 2.272777728853211, "learning_rate": 4.6513155547277807e-07, "loss": 0.7067, "step": 1968 }, { "epoch": 0.5368413877717947, "grad_norm": 2.0179848204499504, "learning_rate": 4.646909133173712e-07, "loss": 0.7507, "step": 1969 }, { "epoch": 0.5371140344898099, "grad_norm": 1.8363814242042205, "learning_rate": 4.6425029872091627e-07, "loss": 0.6967, "step": 1970 }, { "epoch": 0.537386681207825, "grad_norm": 9.890909258245584, "learning_rate": 4.638097120273157e-07, "loss": 0.6443, "step": 1971 }, { "epoch": 0.5376593279258401, "grad_norm": 2.7719433366313826, "learning_rate": 4.6336915358044965e-07, "loss": 0.7517, "step": 1972 }, { "epoch": 0.5379319746438552, "grad_norm": 2.2357160127958338, "learning_rate": 4.6292862372417697e-07, "loss": 0.6828, "step": 1973 }, { "epoch": 0.5382046213618703, "grad_norm": 3.3937303484659584, "learning_rate": 4.624881228023336e-07, "loss": 0.6759, "step": 1974 }, { "epoch": 0.5384772680798855, "grad_norm": 4.3425956045211, "learning_rate": 4.620476511587331e-07, "loss": 0.6286, "step": 1975 }, { "epoch": 0.5387499147979006, "grad_norm": 1.9725600389071762, "learning_rate": 4.616072091371665e-07, "loss": 0.6996, "step": 1976 }, { "epoch": 0.5390225615159158, "grad_norm": 2.1075440954640277, "learning_rate": 4.6116679708140105e-07, "loss": 0.7036, "step": 1977 }, { "epoch": 0.5392952082339308, "grad_norm": 2.5745025142589273, "learning_rate": 4.6072641533518136e-07, "loss": 0.727, "step": 1978 }, { "epoch": 0.539567854951946, "grad_norm": 1.7125706849146776, "learning_rate": 4.6028606424222756e-07, "loss": 0.6618, "step": 1979 }, { "epoch": 0.5398405016699611, "grad_norm": 1.8101765655428002, "learning_rate": 4.598457441462368e-07, "loss": 0.7356, "step": 1980 }, { "epoch": 0.5401131483879763, "grad_norm": 2.186582090945157, "learning_rate": 4.594054553908809e-07, "loss": 0.7378, "step": 1981 }, { "epoch": 0.5403857951059914, "grad_norm": 2.0686860057653007, "learning_rate": 4.589651983198083e-07, "loss": 0.7031, "step": 1982 }, { "epoch": 0.5406584418240066, "grad_norm": 2.110199894029622, "learning_rate": 4.585249732766424e-07, "loss": 0.6827, "step": 1983 }, { "epoch": 0.5409310885420217, "grad_norm": 2.2797700424704495, "learning_rate": 4.580847806049809e-07, "loss": 0.735, "step": 1984 }, { "epoch": 0.5412037352600368, "grad_norm": 2.271278880068364, "learning_rate": 4.5764462064839736e-07, "loss": 0.7364, "step": 1985 }, { "epoch": 0.5414763819780519, "grad_norm": 6.062666256141943, "learning_rate": 4.572044937504388e-07, "loss": 0.7241, "step": 1986 }, { "epoch": 0.5417490286960671, "grad_norm": 2.305938869889513, "learning_rate": 4.5676440025462726e-07, "loss": 0.6939, "step": 1987 }, { "epoch": 0.5420216754140822, "grad_norm": 2.9272530292933125, "learning_rate": 4.5632434050445796e-07, "loss": 0.6123, "step": 1988 }, { "epoch": 0.5422943221320974, "grad_norm": 2.6433262849318586, "learning_rate": 4.558843148434003e-07, "loss": 0.5848, "step": 1989 }, { "epoch": 0.5425669688501125, "grad_norm": 5.178631732917755, "learning_rate": 4.5544432361489713e-07, "loss": 0.6861, "step": 1990 }, { "epoch": 0.5428396155681277, "grad_norm": 2.4522366433931726, "learning_rate": 4.550043671623638e-07, "loss": 0.6722, "step": 1991 }, { "epoch": 0.5431122622861427, "grad_norm": 2.1168479894443215, "learning_rate": 4.545644458291894e-07, "loss": 0.7249, "step": 1992 }, { "epoch": 0.5433849090041578, "grad_norm": 2.191052982651671, "learning_rate": 4.5412455995873485e-07, "loss": 0.6672, "step": 1993 }, { "epoch": 0.543657555722173, "grad_norm": 2.045706215271225, "learning_rate": 4.536847098943337e-07, "loss": 0.7124, "step": 1994 }, { "epoch": 0.5439302024401881, "grad_norm": 2.7282642465352427, "learning_rate": 4.5324489597929155e-07, "loss": 0.7179, "step": 1995 }, { "epoch": 0.5442028491582033, "grad_norm": 2.3127657360238834, "learning_rate": 4.528051185568859e-07, "loss": 0.7063, "step": 1996 }, { "epoch": 0.5444754958762184, "grad_norm": 6.5165062022075775, "learning_rate": 4.523653779703656e-07, "loss": 0.6954, "step": 1997 }, { "epoch": 0.5447481425942335, "grad_norm": 2.0647108137828742, "learning_rate": 4.519256745629505e-07, "loss": 0.715, "step": 1998 }, { "epoch": 0.5450207893122486, "grad_norm": 3.792415876235006, "learning_rate": 4.5148600867783224e-07, "loss": 0.7219, "step": 1999 }, { "epoch": 0.5452934360302638, "grad_norm": 3.019248210068839, "learning_rate": 4.5104638065817224e-07, "loss": 0.6465, "step": 2000 }, { "epoch": 0.5455660827482789, "grad_norm": 2.3519676834468517, "learning_rate": 4.5060679084710287e-07, "loss": 0.6921, "step": 2001 }, { "epoch": 0.5458387294662941, "grad_norm": 25.755380366981896, "learning_rate": 4.501672395877268e-07, "loss": 0.6659, "step": 2002 }, { "epoch": 0.5461113761843092, "grad_norm": 2.360829347304586, "learning_rate": 4.4972772722311605e-07, "loss": 0.7266, "step": 2003 }, { "epoch": 0.5463840229023244, "grad_norm": 2.5033897500096725, "learning_rate": 4.4928825409631303e-07, "loss": 0.7338, "step": 2004 }, { "epoch": 0.5466566696203394, "grad_norm": 2.177698984147029, "learning_rate": 4.488488205503287e-07, "loss": 0.685, "step": 2005 }, { "epoch": 0.5469293163383546, "grad_norm": 2.8340406136210747, "learning_rate": 4.48409426928144e-07, "loss": 0.793, "step": 2006 }, { "epoch": 0.5472019630563697, "grad_norm": 3.726174327179516, "learning_rate": 4.479700735727078e-07, "loss": 0.7094, "step": 2007 }, { "epoch": 0.5474746097743849, "grad_norm": 2.428953133017409, "learning_rate": 4.475307608269382e-07, "loss": 0.5702, "step": 2008 }, { "epoch": 0.5477472564924, "grad_norm": 10.249706208408265, "learning_rate": 4.470914890337216e-07, "loss": 0.7644, "step": 2009 }, { "epoch": 0.548019903210415, "grad_norm": 2.0069032557087203, "learning_rate": 4.46652258535912e-07, "loss": 0.6676, "step": 2010 }, { "epoch": 0.5482925499284302, "grad_norm": 2.176196435568453, "learning_rate": 4.4621306967633155e-07, "loss": 0.7004, "step": 2011 }, { "epoch": 0.5485651966464453, "grad_norm": 6.155480719706793, "learning_rate": 4.457739227977697e-07, "loss": 0.6843, "step": 2012 }, { "epoch": 0.5488378433644605, "grad_norm": 1.9873550641620865, "learning_rate": 4.453348182429832e-07, "loss": 0.6784, "step": 2013 }, { "epoch": 0.5491104900824756, "grad_norm": 2.1837539504930197, "learning_rate": 4.4489575635469566e-07, "loss": 0.7579, "step": 2014 }, { "epoch": 0.5493831368004908, "grad_norm": 2.3513533246039584, "learning_rate": 4.444567374755977e-07, "loss": 0.7378, "step": 2015 }, { "epoch": 0.5496557835185059, "grad_norm": 1.867824958869608, "learning_rate": 4.4401776194834603e-07, "loss": 0.6369, "step": 2016 }, { "epoch": 0.549928430236521, "grad_norm": 2.1352261156799055, "learning_rate": 4.4357883011556367e-07, "loss": 0.7201, "step": 2017 }, { "epoch": 0.5502010769545361, "grad_norm": 3.0008250183086425, "learning_rate": 4.4313994231983973e-07, "loss": 0.6424, "step": 2018 }, { "epoch": 0.5504737236725513, "grad_norm": 2.4426160297685873, "learning_rate": 4.4270109890372833e-07, "loss": 0.6278, "step": 2019 }, { "epoch": 0.5507463703905664, "grad_norm": 6.592165483519424, "learning_rate": 4.422623002097498e-07, "loss": 0.7009, "step": 2020 }, { "epoch": 0.5510190171085816, "grad_norm": 2.92810297024941, "learning_rate": 4.418235465803887e-07, "loss": 0.6881, "step": 2021 }, { "epoch": 0.5512916638265967, "grad_norm": 2.0217613963986634, "learning_rate": 4.41384838358095e-07, "loss": 0.6851, "step": 2022 }, { "epoch": 0.5515643105446119, "grad_norm": 1.8730008392201323, "learning_rate": 4.4094617588528346e-07, "loss": 0.7317, "step": 2023 }, { "epoch": 0.5518369572626269, "grad_norm": 2.934931064249679, "learning_rate": 4.405075595043321e-07, "loss": 0.7207, "step": 2024 }, { "epoch": 0.5521096039806421, "grad_norm": 3.5138416359508198, "learning_rate": 4.400689895575842e-07, "loss": 0.7237, "step": 2025 }, { "epoch": 0.5523822506986572, "grad_norm": 2.0535425250352204, "learning_rate": 4.396304663873457e-07, "loss": 0.6651, "step": 2026 }, { "epoch": 0.5526548974166724, "grad_norm": 4.866078725070122, "learning_rate": 4.3919199033588686e-07, "loss": 0.6455, "step": 2027 }, { "epoch": 0.5529275441346875, "grad_norm": 3.2067829206257152, "learning_rate": 4.387535617454408e-07, "loss": 0.7369, "step": 2028 }, { "epoch": 0.5532001908527026, "grad_norm": 1.9711827642768658, "learning_rate": 4.383151809582035e-07, "loss": 0.7072, "step": 2029 }, { "epoch": 0.5534728375707177, "grad_norm": 2.5130993674690743, "learning_rate": 4.3787684831633393e-07, "loss": 0.6814, "step": 2030 }, { "epoch": 0.5537454842887328, "grad_norm": 2.1807657886884075, "learning_rate": 4.374385641619533e-07, "loss": 0.6968, "step": 2031 }, { "epoch": 0.554018131006748, "grad_norm": 2.1160113002814978, "learning_rate": 4.370003288371449e-07, "loss": 0.6624, "step": 2032 }, { "epoch": 0.5542907777247631, "grad_norm": 2.4906181792612734, "learning_rate": 4.36562142683954e-07, "loss": 0.6818, "step": 2033 }, { "epoch": 0.5545634244427783, "grad_norm": 2.490120771784728, "learning_rate": 4.361240060443876e-07, "loss": 0.6455, "step": 2034 }, { "epoch": 0.5548360711607934, "grad_norm": 2.532062179828567, "learning_rate": 4.3568591926041395e-07, "loss": 0.7689, "step": 2035 }, { "epoch": 0.5551087178788086, "grad_norm": 2.331583111891599, "learning_rate": 4.352478826739622e-07, "loss": 0.6849, "step": 2036 }, { "epoch": 0.5553813645968236, "grad_norm": 11.342920183282406, "learning_rate": 4.3480989662692287e-07, "loss": 0.728, "step": 2037 }, { "epoch": 0.5556540113148388, "grad_norm": 4.838425987421888, "learning_rate": 4.3437196146114624e-07, "loss": 0.6427, "step": 2038 }, { "epoch": 0.5559266580328539, "grad_norm": 3.664865224993883, "learning_rate": 4.3393407751844376e-07, "loss": 0.6829, "step": 2039 }, { "epoch": 0.5561993047508691, "grad_norm": 4.129459231302442, "learning_rate": 4.334962451405859e-07, "loss": 0.7036, "step": 2040 }, { "epoch": 0.5564719514688842, "grad_norm": 1.9526592978244977, "learning_rate": 4.330584646693038e-07, "loss": 0.6727, "step": 2041 }, { "epoch": 0.5567445981868994, "grad_norm": 2.1539952915686107, "learning_rate": 4.326207364462879e-07, "loss": 0.6833, "step": 2042 }, { "epoch": 0.5570172449049144, "grad_norm": 3.7610725109592296, "learning_rate": 4.3218306081318713e-07, "loss": 0.6428, "step": 2043 }, { "epoch": 0.5572898916229296, "grad_norm": 5.241198046505595, "learning_rate": 4.317454381116105e-07, "loss": 0.7024, "step": 2044 }, { "epoch": 0.5575625383409447, "grad_norm": 1.8423912771003248, "learning_rate": 4.313078686831246e-07, "loss": 0.6558, "step": 2045 }, { "epoch": 0.5578351850589599, "grad_norm": 2.6992899151375416, "learning_rate": 4.3087035286925544e-07, "loss": 0.704, "step": 2046 }, { "epoch": 0.558107831776975, "grad_norm": 1.8679655337672376, "learning_rate": 4.3043289101148627e-07, "loss": 0.706, "step": 2047 }, { "epoch": 0.5583804784949901, "grad_norm": 1.9126890725523908, "learning_rate": 4.299954834512589e-07, "loss": 0.6989, "step": 2048 }, { "epoch": 0.5586531252130053, "grad_norm": 1.948665175152273, "learning_rate": 4.295581305299727e-07, "loss": 0.6174, "step": 2049 }, { "epoch": 0.5589257719310203, "grad_norm": 4.479698986310488, "learning_rate": 4.2912083258898396e-07, "loss": 0.7191, "step": 2050 }, { "epoch": 0.5591984186490355, "grad_norm": 1.7379447452736811, "learning_rate": 4.286835899696064e-07, "loss": 0.6683, "step": 2051 }, { "epoch": 0.5594710653670506, "grad_norm": 1.7892072372967573, "learning_rate": 4.282464030131104e-07, "loss": 0.6675, "step": 2052 }, { "epoch": 0.5597437120850658, "grad_norm": 3.8221704468694835, "learning_rate": 4.2780927206072323e-07, "loss": 0.678, "step": 2053 }, { "epoch": 0.5600163588030809, "grad_norm": 2.0372921338659378, "learning_rate": 4.2737219745362783e-07, "loss": 0.6758, "step": 2054 }, { "epoch": 0.5602890055210961, "grad_norm": 2.463842804601116, "learning_rate": 4.2693517953296374e-07, "loss": 0.7459, "step": 2055 }, { "epoch": 0.5605616522391111, "grad_norm": 2.949117655392059, "learning_rate": 4.2649821863982625e-07, "loss": 0.6895, "step": 2056 }, { "epoch": 0.5608342989571263, "grad_norm": 2.7087437045308334, "learning_rate": 4.260613151152655e-07, "loss": 0.7191, "step": 2057 }, { "epoch": 0.5611069456751414, "grad_norm": 4.978314457938921, "learning_rate": 4.2562446930028767e-07, "loss": 0.66, "step": 2058 }, { "epoch": 0.5613795923931566, "grad_norm": 2.234872520719635, "learning_rate": 4.251876815358531e-07, "loss": 0.6963, "step": 2059 }, { "epoch": 0.5616522391111717, "grad_norm": 1.9687048264279738, "learning_rate": 4.247509521628777e-07, "loss": 0.6049, "step": 2060 }, { "epoch": 0.5619248858291869, "grad_norm": 2.8146428876974774, "learning_rate": 4.243142815222309e-07, "loss": 0.706, "step": 2061 }, { "epoch": 0.562197532547202, "grad_norm": 2.2243698049378025, "learning_rate": 4.23877669954737e-07, "loss": 0.6646, "step": 2062 }, { "epoch": 0.5624701792652171, "grad_norm": 1.9985132354808937, "learning_rate": 4.23441117801174e-07, "loss": 0.7025, "step": 2063 }, { "epoch": 0.5627428259832322, "grad_norm": 2.4454101288669294, "learning_rate": 4.2300462540227316e-07, "loss": 0.6592, "step": 2064 }, { "epoch": 0.5630154727012474, "grad_norm": 5.163399595463679, "learning_rate": 4.225681930987197e-07, "loss": 0.6883, "step": 2065 }, { "epoch": 0.5632881194192625, "grad_norm": 2.5117966606799613, "learning_rate": 4.2213182123115134e-07, "loss": 0.678, "step": 2066 }, { "epoch": 0.5635607661372776, "grad_norm": 5.332466100241553, "learning_rate": 4.216955101401589e-07, "loss": 0.6086, "step": 2067 }, { "epoch": 0.5638334128552928, "grad_norm": 1.870639318990625, "learning_rate": 4.212592601662861e-07, "loss": 0.6426, "step": 2068 }, { "epoch": 0.5641060595733078, "grad_norm": 2.6096771778921095, "learning_rate": 4.2082307165002844e-07, "loss": 0.6758, "step": 2069 }, { "epoch": 0.564378706291323, "grad_norm": 4.672796896700949, "learning_rate": 4.203869449318337e-07, "loss": 0.7328, "step": 2070 }, { "epoch": 0.5646513530093381, "grad_norm": 4.716063769482912, "learning_rate": 4.199508803521012e-07, "loss": 0.7102, "step": 2071 }, { "epoch": 0.5649239997273533, "grad_norm": 2.675068404613667, "learning_rate": 4.1951487825118226e-07, "loss": 0.6806, "step": 2072 }, { "epoch": 0.5651966464453684, "grad_norm": 2.3770203867140722, "learning_rate": 4.1907893896937873e-07, "loss": 0.6932, "step": 2073 }, { "epoch": 0.5654692931633836, "grad_norm": 3.6661616062258684, "learning_rate": 4.18643062846944e-07, "loss": 0.6612, "step": 2074 }, { "epoch": 0.5657419398813986, "grad_norm": 2.3229807459871497, "learning_rate": 4.182072502240822e-07, "loss": 0.6742, "step": 2075 }, { "epoch": 0.5660145865994138, "grad_norm": 3.087237742141626, "learning_rate": 4.177715014409472e-07, "loss": 0.6581, "step": 2076 }, { "epoch": 0.5662872333174289, "grad_norm": 1.9700037893563587, "learning_rate": 4.1733581683764403e-07, "loss": 0.71, "step": 2077 }, { "epoch": 0.5665598800354441, "grad_norm": 2.340278563964696, "learning_rate": 4.1690019675422657e-07, "loss": 0.6805, "step": 2078 }, { "epoch": 0.5668325267534592, "grad_norm": 3.612293128062339, "learning_rate": 4.164646415306994e-07, "loss": 0.7138, "step": 2079 }, { "epoch": 0.5671051734714744, "grad_norm": 2.017449928970898, "learning_rate": 4.1602915150701544e-07, "loss": 0.6549, "step": 2080 }, { "epoch": 0.5673778201894895, "grad_norm": 2.0668657536489645, "learning_rate": 4.1559372702307757e-07, "loss": 0.6282, "step": 2081 }, { "epoch": 0.5676504669075046, "grad_norm": 21.114188562585397, "learning_rate": 4.151583684187373e-07, "loss": 0.7592, "step": 2082 }, { "epoch": 0.5679231136255197, "grad_norm": 1.946791132539808, "learning_rate": 4.147230760337942e-07, "loss": 0.7232, "step": 2083 }, { "epoch": 0.5681957603435349, "grad_norm": 3.973301216382491, "learning_rate": 4.1428785020799706e-07, "loss": 0.6473, "step": 2084 }, { "epoch": 0.56846840706155, "grad_norm": 4.971211606321965, "learning_rate": 4.138526912810417e-07, "loss": 0.7355, "step": 2085 }, { "epoch": 0.5687410537795651, "grad_norm": 7.058229004706999, "learning_rate": 4.1341759959257263e-07, "loss": 0.6977, "step": 2086 }, { "epoch": 0.5690137004975803, "grad_norm": 2.1398159313296006, "learning_rate": 4.1298257548218135e-07, "loss": 0.7028, "step": 2087 }, { "epoch": 0.5692863472155953, "grad_norm": 5.097482021211462, "learning_rate": 4.1254761928940666e-07, "loss": 0.6862, "step": 2088 }, { "epoch": 0.5695589939336105, "grad_norm": 2.26708780113062, "learning_rate": 4.1211273135373473e-07, "loss": 0.7003, "step": 2089 }, { "epoch": 0.5698316406516256, "grad_norm": 1.9911431807066309, "learning_rate": 4.116779120145979e-07, "loss": 0.7127, "step": 2090 }, { "epoch": 0.5701042873696408, "grad_norm": 3.3175451389129105, "learning_rate": 4.112431616113756e-07, "loss": 0.6984, "step": 2091 }, { "epoch": 0.5703769340876559, "grad_norm": 2.8717462712640227, "learning_rate": 4.1080848048339265e-07, "loss": 0.6277, "step": 2092 }, { "epoch": 0.5706495808056711, "grad_norm": 3.675419030131612, "learning_rate": 4.1037386896992064e-07, "loss": 0.686, "step": 2093 }, { "epoch": 0.5709222275236862, "grad_norm": 2.2046894230474043, "learning_rate": 4.099393274101761e-07, "loss": 0.692, "step": 2094 }, { "epoch": 0.5711948742417013, "grad_norm": 3.5035645149446704, "learning_rate": 4.095048561433215e-07, "loss": 0.6605, "step": 2095 }, { "epoch": 0.5714675209597164, "grad_norm": 1.9414405153540826, "learning_rate": 4.090704555084644e-07, "loss": 0.7245, "step": 2096 }, { "epoch": 0.5717401676777316, "grad_norm": 2.7623975265250285, "learning_rate": 4.0863612584465666e-07, "loss": 0.7241, "step": 2097 }, { "epoch": 0.5720128143957467, "grad_norm": 4.039609829486876, "learning_rate": 4.0820186749089563e-07, "loss": 0.6612, "step": 2098 }, { "epoch": 0.5722854611137619, "grad_norm": 2.281178387876032, "learning_rate": 4.0776768078612207e-07, "loss": 0.6672, "step": 2099 }, { "epoch": 0.572558107831777, "grad_norm": 1.897571417629187, "learning_rate": 4.0733356606922154e-07, "loss": 0.692, "step": 2100 }, { "epoch": 0.5728307545497922, "grad_norm": 3.894594764956749, "learning_rate": 4.0689952367902326e-07, "loss": 0.6738, "step": 2101 }, { "epoch": 0.5731034012678072, "grad_norm": 2.720985741990644, "learning_rate": 4.064655539542996e-07, "loss": 0.6918, "step": 2102 }, { "epoch": 0.5733760479858224, "grad_norm": 1.869851228594142, "learning_rate": 4.0603165723376687e-07, "loss": 0.6413, "step": 2103 }, { "epoch": 0.5736486947038375, "grad_norm": 5.776031357710388, "learning_rate": 4.0559783385608366e-07, "loss": 0.7525, "step": 2104 }, { "epoch": 0.5739213414218526, "grad_norm": 2.1106717214632993, "learning_rate": 4.051640841598519e-07, "loss": 0.7205, "step": 2105 }, { "epoch": 0.5741939881398678, "grad_norm": 2.1584797171253216, "learning_rate": 4.0473040848361584e-07, "loss": 0.7223, "step": 2106 }, { "epoch": 0.5744666348578829, "grad_norm": 9.497976921857344, "learning_rate": 4.0429680716586165e-07, "loss": 0.6596, "step": 2107 }, { "epoch": 0.574739281575898, "grad_norm": 2.1192946292780013, "learning_rate": 4.0386328054501817e-07, "loss": 0.7719, "step": 2108 }, { "epoch": 0.5750119282939131, "grad_norm": 3.110105311447256, "learning_rate": 4.034298289594551e-07, "loss": 0.6989, "step": 2109 }, { "epoch": 0.5752845750119283, "grad_norm": 2.0948382356173703, "learning_rate": 4.0299645274748433e-07, "loss": 0.7183, "step": 2110 }, { "epoch": 0.5755572217299434, "grad_norm": 1.7599178093450074, "learning_rate": 4.025631522473582e-07, "loss": 0.6467, "step": 2111 }, { "epoch": 0.5758298684479586, "grad_norm": 2.09387630382223, "learning_rate": 4.0212992779727073e-07, "loss": 0.6687, "step": 2112 }, { "epoch": 0.5761025151659737, "grad_norm": 1.8202122579987086, "learning_rate": 4.0169677973535575e-07, "loss": 0.7092, "step": 2113 }, { "epoch": 0.5763751618839889, "grad_norm": 2.3500188380621183, "learning_rate": 4.012637083996881e-07, "loss": 0.6816, "step": 2114 }, { "epoch": 0.5766478086020039, "grad_norm": 2.492104598130377, "learning_rate": 4.0083071412828274e-07, "loss": 0.6858, "step": 2115 }, { "epoch": 0.5769204553200191, "grad_norm": 4.737354276876187, "learning_rate": 4.003977972590938e-07, "loss": 0.7274, "step": 2116 }, { "epoch": 0.5771931020380342, "grad_norm": 5.838254081822456, "learning_rate": 3.99964958130016e-07, "loss": 0.7212, "step": 2117 }, { "epoch": 0.5774657487560494, "grad_norm": 2.383935943345768, "learning_rate": 3.995321970788823e-07, "loss": 0.7058, "step": 2118 }, { "epoch": 0.5777383954740645, "grad_norm": 2.167698665100435, "learning_rate": 3.9909951444346585e-07, "loss": 0.6928, "step": 2119 }, { "epoch": 0.5780110421920797, "grad_norm": 2.0162919310449894, "learning_rate": 3.9866691056147746e-07, "loss": 0.6787, "step": 2120 }, { "epoch": 0.5782836889100947, "grad_norm": 2.2085948989337116, "learning_rate": 3.9823438577056727e-07, "loss": 0.7075, "step": 2121 }, { "epoch": 0.5785563356281099, "grad_norm": 2.798815783318958, "learning_rate": 3.978019404083237e-07, "loss": 0.6535, "step": 2122 }, { "epoch": 0.578828982346125, "grad_norm": 3.9364782178475584, "learning_rate": 3.973695748122725e-07, "loss": 0.7073, "step": 2123 }, { "epoch": 0.5791016290641401, "grad_norm": 2.0835764738802047, "learning_rate": 3.9693728931987783e-07, "loss": 0.6914, "step": 2124 }, { "epoch": 0.5793742757821553, "grad_norm": 2.2767409921459567, "learning_rate": 3.9650508426854105e-07, "loss": 0.7047, "step": 2125 }, { "epoch": 0.5796469225001704, "grad_norm": 2.7596702361829193, "learning_rate": 3.960729599956008e-07, "loss": 0.6873, "step": 2126 }, { "epoch": 0.5799195692181855, "grad_norm": 2.3114760592287236, "learning_rate": 3.9564091683833244e-07, "loss": 0.6868, "step": 2127 }, { "epoch": 0.5801922159362006, "grad_norm": 2.488256992297753, "learning_rate": 3.952089551339485e-07, "loss": 0.6543, "step": 2128 }, { "epoch": 0.5804648626542158, "grad_norm": 2.023483039213979, "learning_rate": 3.947770752195975e-07, "loss": 0.689, "step": 2129 }, { "epoch": 0.5807375093722309, "grad_norm": 2.174563204763667, "learning_rate": 3.943452774323642e-07, "loss": 0.729, "step": 2130 }, { "epoch": 0.5810101560902461, "grad_norm": 2.157658024175096, "learning_rate": 3.939135621092697e-07, "loss": 0.728, "step": 2131 }, { "epoch": 0.5812828028082612, "grad_norm": 2.9120257923576665, "learning_rate": 3.9348192958726987e-07, "loss": 0.7053, "step": 2132 }, { "epoch": 0.5815554495262764, "grad_norm": 2.2473336151187975, "learning_rate": 3.930503802032567e-07, "loss": 0.6939, "step": 2133 }, { "epoch": 0.5818280962442914, "grad_norm": 8.303730048959382, "learning_rate": 3.926189142940573e-07, "loss": 0.717, "step": 2134 }, { "epoch": 0.5821007429623066, "grad_norm": 3.4025657373371736, "learning_rate": 3.9218753219643294e-07, "loss": 0.734, "step": 2135 }, { "epoch": 0.5823733896803217, "grad_norm": 2.051252271771494, "learning_rate": 3.9175623424708026e-07, "loss": 0.547, "step": 2136 }, { "epoch": 0.5826460363983369, "grad_norm": 2.642186238166317, "learning_rate": 3.9132502078262947e-07, "loss": 0.6489, "step": 2137 }, { "epoch": 0.582918683116352, "grad_norm": 1.8280792831334394, "learning_rate": 3.9089389213964573e-07, "loss": 0.7008, "step": 2138 }, { "epoch": 0.5831913298343672, "grad_norm": 3.6402247021742022, "learning_rate": 3.904628486546271e-07, "loss": 0.6445, "step": 2139 }, { "epoch": 0.5834639765523822, "grad_norm": 2.376548581697671, "learning_rate": 3.900318906640057e-07, "loss": 0.6825, "step": 2140 }, { "epoch": 0.5837366232703974, "grad_norm": 3.048188462458489, "learning_rate": 3.8960101850414695e-07, "loss": 0.6899, "step": 2141 }, { "epoch": 0.5840092699884125, "grad_norm": 2.985749745382806, "learning_rate": 3.8917023251134895e-07, "loss": 0.7107, "step": 2142 }, { "epoch": 0.5842819167064276, "grad_norm": 2.660950074065415, "learning_rate": 3.8873953302184283e-07, "loss": 0.6106, "step": 2143 }, { "epoch": 0.5845545634244428, "grad_norm": 2.155333089295208, "learning_rate": 3.883089203717921e-07, "loss": 0.7304, "step": 2144 }, { "epoch": 0.5848272101424579, "grad_norm": 3.005417524522218, "learning_rate": 3.878783948972924e-07, "loss": 0.7133, "step": 2145 }, { "epoch": 0.585099856860473, "grad_norm": 4.6317833350826145, "learning_rate": 3.874479569343714e-07, "loss": 0.7321, "step": 2146 }, { "epoch": 0.5853725035784881, "grad_norm": 2.828748364575421, "learning_rate": 3.8701760681898867e-07, "loss": 0.6869, "step": 2147 }, { "epoch": 0.5856451502965033, "grad_norm": 1.6402284071216804, "learning_rate": 3.8658734488703495e-07, "loss": 0.6721, "step": 2148 }, { "epoch": 0.5859177970145184, "grad_norm": 3.0629413601434785, "learning_rate": 3.8615717147433206e-07, "loss": 0.5555, "step": 2149 }, { "epoch": 0.5861904437325336, "grad_norm": 1.8490048890703763, "learning_rate": 3.857270869166333e-07, "loss": 0.7117, "step": 2150 }, { "epoch": 0.5864630904505487, "grad_norm": 1.759843346379735, "learning_rate": 3.8529709154962176e-07, "loss": 0.6959, "step": 2151 }, { "epoch": 0.5867357371685639, "grad_norm": 2.1456210889220038, "learning_rate": 3.8486718570891177e-07, "loss": 0.6701, "step": 2152 }, { "epoch": 0.5870083838865789, "grad_norm": 2.1811058388694526, "learning_rate": 3.84437369730047e-07, "loss": 0.6246, "step": 2153 }, { "epoch": 0.5872810306045941, "grad_norm": 4.050928438662014, "learning_rate": 3.840076439485016e-07, "loss": 0.6579, "step": 2154 }, { "epoch": 0.5875536773226092, "grad_norm": 1.851318090644661, "learning_rate": 3.835780086996793e-07, "loss": 0.6963, "step": 2155 }, { "epoch": 0.5878263240406244, "grad_norm": 12.860100875646229, "learning_rate": 3.831484643189126e-07, "loss": 0.6966, "step": 2156 }, { "epoch": 0.5880989707586395, "grad_norm": 2.623714141595384, "learning_rate": 3.8271901114146375e-07, "loss": 0.7696, "step": 2157 }, { "epoch": 0.5883716174766547, "grad_norm": 2.0586719231143995, "learning_rate": 3.822896495025231e-07, "loss": 0.6521, "step": 2158 }, { "epoch": 0.5886442641946698, "grad_norm": 18.31982038570352, "learning_rate": 3.8186037973721026e-07, "loss": 0.6975, "step": 2159 }, { "epoch": 0.5889169109126849, "grad_norm": 1.8724624692887064, "learning_rate": 3.8143120218057293e-07, "loss": 0.7022, "step": 2160 }, { "epoch": 0.5891895576307, "grad_norm": 3.1358040576867676, "learning_rate": 3.810021171675864e-07, "loss": 0.6345, "step": 2161 }, { "epoch": 0.5894622043487151, "grad_norm": 7.062439289465738, "learning_rate": 3.805731250331544e-07, "loss": 0.707, "step": 2162 }, { "epoch": 0.5897348510667303, "grad_norm": 3.5057138662741485, "learning_rate": 3.801442261121076e-07, "loss": 0.755, "step": 2163 }, { "epoch": 0.5900074977847454, "grad_norm": 1.7171420272624363, "learning_rate": 3.7971542073920435e-07, "loss": 0.6583, "step": 2164 }, { "epoch": 0.5902801445027606, "grad_norm": 2.2939840056209877, "learning_rate": 3.7928670924912957e-07, "loss": 0.6478, "step": 2165 }, { "epoch": 0.5905527912207756, "grad_norm": 4.095472363428715, "learning_rate": 3.788580919764953e-07, "loss": 0.7498, "step": 2166 }, { "epoch": 0.5908254379387908, "grad_norm": 8.685300676877167, "learning_rate": 3.784295692558399e-07, "loss": 0.6422, "step": 2167 }, { "epoch": 0.5910980846568059, "grad_norm": 2.2147772763361813, "learning_rate": 3.780011414216276e-07, "loss": 0.7315, "step": 2168 }, { "epoch": 0.5913707313748211, "grad_norm": 1.9498041293683779, "learning_rate": 3.775728088082494e-07, "loss": 0.7069, "step": 2169 }, { "epoch": 0.5916433780928362, "grad_norm": 2.3166802193459763, "learning_rate": 3.771445717500209e-07, "loss": 0.6571, "step": 2170 }, { "epoch": 0.5919160248108514, "grad_norm": 2.727635780278045, "learning_rate": 3.767164305811841e-07, "loss": 0.6897, "step": 2171 }, { "epoch": 0.5921886715288665, "grad_norm": 4.549849925845328, "learning_rate": 3.762883856359054e-07, "loss": 0.7037, "step": 2172 }, { "epoch": 0.5924613182468816, "grad_norm": 2.7522357346809345, "learning_rate": 3.7586043724827645e-07, "loss": 0.6458, "step": 2173 }, { "epoch": 0.5927339649648967, "grad_norm": 2.9686976858087277, "learning_rate": 3.754325857523138e-07, "loss": 0.6828, "step": 2174 }, { "epoch": 0.5930066116829119, "grad_norm": 2.0109884195400887, "learning_rate": 3.7500483148195773e-07, "loss": 0.6996, "step": 2175 }, { "epoch": 0.593279258400927, "grad_norm": 2.1575911324892236, "learning_rate": 3.7457717477107334e-07, "loss": 0.6335, "step": 2176 }, { "epoch": 0.5935519051189422, "grad_norm": 3.368674728316165, "learning_rate": 3.7414961595344885e-07, "loss": 0.6224, "step": 2177 }, { "epoch": 0.5938245518369573, "grad_norm": 1.672745366239162, "learning_rate": 3.7372215536279683e-07, "loss": 0.7322, "step": 2178 }, { "epoch": 0.5940971985549724, "grad_norm": 8.312991742023032, "learning_rate": 3.732947933327524e-07, "loss": 0.6761, "step": 2179 }, { "epoch": 0.5943698452729875, "grad_norm": 2.073603542655857, "learning_rate": 3.7286753019687445e-07, "loss": 0.6776, "step": 2180 }, { "epoch": 0.5946424919910026, "grad_norm": 2.2767239096471408, "learning_rate": 3.724403662886444e-07, "loss": 0.6462, "step": 2181 }, { "epoch": 0.5949151387090178, "grad_norm": 3.580852239232228, "learning_rate": 3.720133019414662e-07, "loss": 0.7531, "step": 2182 }, { "epoch": 0.5951877854270329, "grad_norm": 1.797297767843112, "learning_rate": 3.7158633748866607e-07, "loss": 0.7207, "step": 2183 }, { "epoch": 0.5954604321450481, "grad_norm": 2.550271464231421, "learning_rate": 3.711594732634922e-07, "loss": 0.6863, "step": 2184 }, { "epoch": 0.5957330788630631, "grad_norm": 4.471887576303051, "learning_rate": 3.7073270959911484e-07, "loss": 0.6377, "step": 2185 }, { "epoch": 0.5960057255810783, "grad_norm": 2.907722718288664, "learning_rate": 3.703060468286254e-07, "loss": 0.6569, "step": 2186 }, { "epoch": 0.5962783722990934, "grad_norm": 1.843381733303619, "learning_rate": 3.698794852850367e-07, "loss": 0.6922, "step": 2187 }, { "epoch": 0.5965510190171086, "grad_norm": 2.4446945392946406, "learning_rate": 3.6945302530128293e-07, "loss": 0.5904, "step": 2188 }, { "epoch": 0.5968236657351237, "grad_norm": 1.9551752720883067, "learning_rate": 3.6902666721021806e-07, "loss": 0.7356, "step": 2189 }, { "epoch": 0.5970963124531389, "grad_norm": 3.144973959380659, "learning_rate": 3.6860041134461764e-07, "loss": 0.7068, "step": 2190 }, { "epoch": 0.597368959171154, "grad_norm": 7.229715560583648, "learning_rate": 3.681742580371765e-07, "loss": 0.6197, "step": 2191 }, { "epoch": 0.5976416058891691, "grad_norm": 2.079235716079856, "learning_rate": 3.6774820762051e-07, "loss": 0.7687, "step": 2192 }, { "epoch": 0.5979142526071842, "grad_norm": 2.1338052315005047, "learning_rate": 3.673222604271532e-07, "loss": 0.7271, "step": 2193 }, { "epoch": 0.5981868993251994, "grad_norm": 4.609207749821176, "learning_rate": 3.6689641678955997e-07, "loss": 0.5873, "step": 2194 }, { "epoch": 0.5984595460432145, "grad_norm": 4.607484965334727, "learning_rate": 3.6647067704010436e-07, "loss": 0.6959, "step": 2195 }, { "epoch": 0.5987321927612297, "grad_norm": 6.587663820202455, "learning_rate": 3.6604504151107817e-07, "loss": 0.6553, "step": 2196 }, { "epoch": 0.5990048394792448, "grad_norm": 4.9709349723209115, "learning_rate": 3.656195105346931e-07, "loss": 0.6888, "step": 2197 }, { "epoch": 0.5992774861972598, "grad_norm": 2.526277413961484, "learning_rate": 3.6519408444307795e-07, "loss": 0.6364, "step": 2198 }, { "epoch": 0.599550132915275, "grad_norm": 2.1798724278377106, "learning_rate": 3.6476876356828067e-07, "loss": 0.7365, "step": 2199 }, { "epoch": 0.5998227796332901, "grad_norm": 2.2972197683266664, "learning_rate": 3.643435482422669e-07, "loss": 0.7179, "step": 2200 }, { "epoch": 0.6000954263513053, "grad_norm": 3.3899659466294096, "learning_rate": 3.639184387969193e-07, "loss": 0.732, "step": 2201 }, { "epoch": 0.6003680730693204, "grad_norm": 1.812524686162816, "learning_rate": 3.634934355640386e-07, "loss": 0.7357, "step": 2202 }, { "epoch": 0.6006407197873356, "grad_norm": 2.8880393486270957, "learning_rate": 3.6306853887534224e-07, "loss": 0.7377, "step": 2203 }, { "epoch": 0.6009133665053507, "grad_norm": 2.0059391895000704, "learning_rate": 3.6264374906246483e-07, "loss": 0.7526, "step": 2204 }, { "epoch": 0.6011860132233658, "grad_norm": 1.976225756449888, "learning_rate": 3.6221906645695693e-07, "loss": 0.6508, "step": 2205 }, { "epoch": 0.6014586599413809, "grad_norm": 1.8802193483017617, "learning_rate": 3.617944913902861e-07, "loss": 0.6743, "step": 2206 }, { "epoch": 0.6017313066593961, "grad_norm": 2.337566580648404, "learning_rate": 3.613700241938359e-07, "loss": 0.645, "step": 2207 }, { "epoch": 0.6020039533774112, "grad_norm": 1.9352685017858227, "learning_rate": 3.60945665198905e-07, "loss": 0.6824, "step": 2208 }, { "epoch": 0.6022766000954264, "grad_norm": 4.932661343446757, "learning_rate": 3.605214147367086e-07, "loss": 0.7714, "step": 2209 }, { "epoch": 0.6025492468134415, "grad_norm": 1.8861929491096052, "learning_rate": 3.6009727313837634e-07, "loss": 0.6933, "step": 2210 }, { "epoch": 0.6028218935314567, "grad_norm": 2.0277855546416306, "learning_rate": 3.596732407349536e-07, "loss": 0.6668, "step": 2211 }, { "epoch": 0.6030945402494717, "grad_norm": 2.4707047070788053, "learning_rate": 3.5924931785739973e-07, "loss": 0.653, "step": 2212 }, { "epoch": 0.6033671869674869, "grad_norm": 2.1663641794214645, "learning_rate": 3.5882550483658934e-07, "loss": 0.6978, "step": 2213 }, { "epoch": 0.603639833685502, "grad_norm": 2.3470952025460914, "learning_rate": 3.5840180200331127e-07, "loss": 0.6532, "step": 2214 }, { "epoch": 0.6039124804035172, "grad_norm": 4.385894266785223, "learning_rate": 3.579782096882675e-07, "loss": 0.6873, "step": 2215 }, { "epoch": 0.6041851271215323, "grad_norm": 2.116345338358177, "learning_rate": 3.57554728222075e-07, "loss": 0.6786, "step": 2216 }, { "epoch": 0.6044577738395474, "grad_norm": 2.074197719605394, "learning_rate": 3.5713135793526294e-07, "loss": 0.7178, "step": 2217 }, { "epoch": 0.6047304205575625, "grad_norm": 3.8643098504267224, "learning_rate": 3.5670809915827484e-07, "loss": 0.6903, "step": 2218 }, { "epoch": 0.6050030672755776, "grad_norm": 2.184332237776229, "learning_rate": 3.562849522214664e-07, "loss": 0.6464, "step": 2219 }, { "epoch": 0.6052757139935928, "grad_norm": 22.29479744235854, "learning_rate": 3.558619174551063e-07, "loss": 0.7599, "step": 2220 }, { "epoch": 0.6055483607116079, "grad_norm": 1.925478550987654, "learning_rate": 3.5543899518937594e-07, "loss": 0.7094, "step": 2221 }, { "epoch": 0.6058210074296231, "grad_norm": 3.0226223060052355, "learning_rate": 3.550161857543684e-07, "loss": 0.6868, "step": 2222 }, { "epoch": 0.6060936541476382, "grad_norm": 3.394416059773266, "learning_rate": 3.545934894800893e-07, "loss": 0.6965, "step": 2223 }, { "epoch": 0.6063663008656534, "grad_norm": 1.8386797206183159, "learning_rate": 3.541709066964551e-07, "loss": 0.6667, "step": 2224 }, { "epoch": 0.6066389475836684, "grad_norm": 2.9417849872045037, "learning_rate": 3.537484377332944e-07, "loss": 0.6895, "step": 2225 }, { "epoch": 0.6069115943016836, "grad_norm": 2.2375930200095704, "learning_rate": 3.533260829203471e-07, "loss": 0.6042, "step": 2226 }, { "epoch": 0.6071842410196987, "grad_norm": 2.2320830673596728, "learning_rate": 3.5290384258726303e-07, "loss": 0.6738, "step": 2227 }, { "epoch": 0.6074568877377139, "grad_norm": 2.8735855888618587, "learning_rate": 3.5248171706360383e-07, "loss": 0.5948, "step": 2228 }, { "epoch": 0.607729534455729, "grad_norm": 3.3972555773023347, "learning_rate": 3.5205970667884046e-07, "loss": 0.6512, "step": 2229 }, { "epoch": 0.6080021811737442, "grad_norm": 1.9249943773395128, "learning_rate": 3.5163781176235495e-07, "loss": 0.6806, "step": 2230 }, { "epoch": 0.6082748278917592, "grad_norm": 2.114118707034049, "learning_rate": 3.5121603264343846e-07, "loss": 0.649, "step": 2231 }, { "epoch": 0.6085474746097744, "grad_norm": 1.8293215009231427, "learning_rate": 3.5079436965129226e-07, "loss": 0.6995, "step": 2232 }, { "epoch": 0.6088201213277895, "grad_norm": 2.2842490589485727, "learning_rate": 3.50372823115027e-07, "loss": 0.6418, "step": 2233 }, { "epoch": 0.6090927680458047, "grad_norm": 2.463887832901913, "learning_rate": 3.499513933636619e-07, "loss": 0.6789, "step": 2234 }, { "epoch": 0.6093654147638198, "grad_norm": 2.7003334181130545, "learning_rate": 3.495300807261258e-07, "loss": 0.7235, "step": 2235 }, { "epoch": 0.6096380614818349, "grad_norm": 4.3110010978820945, "learning_rate": 3.4910888553125516e-07, "loss": 0.7157, "step": 2236 }, { "epoch": 0.60991070819985, "grad_norm": 4.899618269137269, "learning_rate": 3.4868780810779587e-07, "loss": 0.6499, "step": 2237 }, { "epoch": 0.6101833549178651, "grad_norm": 2.3930181673312263, "learning_rate": 3.4826684878440104e-07, "loss": 0.6103, "step": 2238 }, { "epoch": 0.6104560016358803, "grad_norm": 4.483443367102672, "learning_rate": 3.4784600788963193e-07, "loss": 0.6476, "step": 2239 }, { "epoch": 0.6107286483538954, "grad_norm": 1.9440845325635483, "learning_rate": 3.474252857519575e-07, "loss": 0.6159, "step": 2240 }, { "epoch": 0.6110012950719106, "grad_norm": 2.6164302140027336, "learning_rate": 3.470046826997539e-07, "loss": 0.6515, "step": 2241 }, { "epoch": 0.6112739417899257, "grad_norm": 2.7166124988933085, "learning_rate": 3.4658419906130423e-07, "loss": 0.6574, "step": 2242 }, { "epoch": 0.6115465885079409, "grad_norm": 4.509365663981904, "learning_rate": 3.4616383516479837e-07, "loss": 0.6876, "step": 2243 }, { "epoch": 0.6118192352259559, "grad_norm": 4.046753819054784, "learning_rate": 3.457435913383331e-07, "loss": 0.7136, "step": 2244 }, { "epoch": 0.6120918819439711, "grad_norm": 3.6234875701005698, "learning_rate": 3.453234679099109e-07, "loss": 0.7365, "step": 2245 }, { "epoch": 0.6123645286619862, "grad_norm": 3.202930794841372, "learning_rate": 3.4490346520744075e-07, "loss": 0.6286, "step": 2246 }, { "epoch": 0.6126371753800014, "grad_norm": 2.1289131916993522, "learning_rate": 3.4448358355873745e-07, "loss": 0.695, "step": 2247 }, { "epoch": 0.6129098220980165, "grad_norm": 2.9750494344079463, "learning_rate": 3.440638232915207e-07, "loss": 0.6359, "step": 2248 }, { "epoch": 0.6131824688160317, "grad_norm": 5.855948823459587, "learning_rate": 3.4364418473341646e-07, "loss": 0.7172, "step": 2249 }, { "epoch": 0.6134551155340467, "grad_norm": 9.191111462832785, "learning_rate": 3.432246682119545e-07, "loss": 0.6979, "step": 2250 }, { "epoch": 0.6137277622520619, "grad_norm": 3.9527906255491034, "learning_rate": 3.4280527405457036e-07, "loss": 0.6982, "step": 2251 }, { "epoch": 0.614000408970077, "grad_norm": 2.7309242382816867, "learning_rate": 3.4238600258860345e-07, "loss": 0.6464, "step": 2252 }, { "epoch": 0.6142730556880922, "grad_norm": 1.9921006270368247, "learning_rate": 3.4196685414129765e-07, "loss": 0.6818, "step": 2253 }, { "epoch": 0.6145457024061073, "grad_norm": 2.4990898818124125, "learning_rate": 3.415478290398012e-07, "loss": 0.7402, "step": 2254 }, { "epoch": 0.6148183491241224, "grad_norm": 2.0524537093897317, "learning_rate": 3.411289276111651e-07, "loss": 0.7034, "step": 2255 }, { "epoch": 0.6150909958421376, "grad_norm": 2.2000220142701257, "learning_rate": 3.4071015018234486e-07, "loss": 0.729, "step": 2256 }, { "epoch": 0.6153636425601526, "grad_norm": 2.786279536096795, "learning_rate": 3.402914970801984e-07, "loss": 0.6582, "step": 2257 }, { "epoch": 0.6156362892781678, "grad_norm": 2.173439315605033, "learning_rate": 3.3987296863148715e-07, "loss": 0.6717, "step": 2258 }, { "epoch": 0.6159089359961829, "grad_norm": 1.9986562329106772, "learning_rate": 3.394545651628752e-07, "loss": 0.722, "step": 2259 }, { "epoch": 0.6161815827141981, "grad_norm": 3.005677410343462, "learning_rate": 3.390362870009287e-07, "loss": 0.7006, "step": 2260 }, { "epoch": 0.6164542294322132, "grad_norm": 21.19734966579522, "learning_rate": 3.386181344721162e-07, "loss": 0.6577, "step": 2261 }, { "epoch": 0.6167268761502284, "grad_norm": 2.5571697963444286, "learning_rate": 3.382001079028084e-07, "loss": 0.6681, "step": 2262 }, { "epoch": 0.6169995228682434, "grad_norm": 3.87004622011675, "learning_rate": 3.3778220761927755e-07, "loss": 0.6575, "step": 2263 }, { "epoch": 0.6172721695862586, "grad_norm": 2.5040018238854653, "learning_rate": 3.373644339476971e-07, "loss": 0.7077, "step": 2264 }, { "epoch": 0.6175448163042737, "grad_norm": 2.778399284264128, "learning_rate": 3.3694678721414195e-07, "loss": 0.7129, "step": 2265 }, { "epoch": 0.6178174630222889, "grad_norm": 2.5211251716371623, "learning_rate": 3.365292677445881e-07, "loss": 0.72, "step": 2266 }, { "epoch": 0.618090109740304, "grad_norm": 1.9547239365104394, "learning_rate": 3.3611187586491157e-07, "loss": 0.6905, "step": 2267 }, { "epoch": 0.6183627564583192, "grad_norm": 1.9211522372421332, "learning_rate": 3.3569461190088964e-07, "loss": 0.7115, "step": 2268 }, { "epoch": 0.6186354031763343, "grad_norm": 1.8925141802678764, "learning_rate": 3.352774761781989e-07, "loss": 0.6635, "step": 2269 }, { "epoch": 0.6189080498943494, "grad_norm": 3.9376263983264392, "learning_rate": 3.348604690224166e-07, "loss": 0.7068, "step": 2270 }, { "epoch": 0.6191806966123645, "grad_norm": 3.8625540639677145, "learning_rate": 3.3444359075901894e-07, "loss": 0.6836, "step": 2271 }, { "epoch": 0.6194533433303797, "grad_norm": 8.671316640213288, "learning_rate": 3.340268417133821e-07, "loss": 0.7053, "step": 2272 }, { "epoch": 0.6197259900483948, "grad_norm": 2.7020063104795993, "learning_rate": 3.336102222107814e-07, "loss": 0.6893, "step": 2273 }, { "epoch": 0.6199986367664099, "grad_norm": 2.666043256844216, "learning_rate": 3.331937325763904e-07, "loss": 0.684, "step": 2274 }, { "epoch": 0.6202712834844251, "grad_norm": 1.9728628790293798, "learning_rate": 3.3277737313528195e-07, "loss": 0.6748, "step": 2275 }, { "epoch": 0.6205439302024401, "grad_norm": 1.7974866815738901, "learning_rate": 3.323611442124271e-07, "loss": 0.665, "step": 2276 }, { "epoch": 0.6208165769204553, "grad_norm": 2.733406531695483, "learning_rate": 3.3194504613269505e-07, "loss": 0.6132, "step": 2277 }, { "epoch": 0.6210892236384704, "grad_norm": 2.3744200499092267, "learning_rate": 3.3152907922085256e-07, "loss": 0.7384, "step": 2278 }, { "epoch": 0.6213618703564856, "grad_norm": 2.4995178079100993, "learning_rate": 3.311132438015645e-07, "loss": 0.7218, "step": 2279 }, { "epoch": 0.6216345170745007, "grad_norm": 2.748126150022482, "learning_rate": 3.3069754019939287e-07, "loss": 0.6773, "step": 2280 }, { "epoch": 0.6219071637925159, "grad_norm": 2.9123918959858988, "learning_rate": 3.3028196873879665e-07, "loss": 0.6773, "step": 2281 }, { "epoch": 0.622179810510531, "grad_norm": 12.052406780984198, "learning_rate": 3.298665297441322e-07, "loss": 0.6829, "step": 2282 }, { "epoch": 0.6224524572285461, "grad_norm": 2.529657653332704, "learning_rate": 3.294512235396516e-07, "loss": 0.7219, "step": 2283 }, { "epoch": 0.6227251039465612, "grad_norm": 1.6816348260945762, "learning_rate": 3.290360504495043e-07, "loss": 0.6979, "step": 2284 }, { "epoch": 0.6229977506645764, "grad_norm": 2.1304735590089185, "learning_rate": 3.286210107977349e-07, "loss": 0.6505, "step": 2285 }, { "epoch": 0.6232703973825915, "grad_norm": 1.844671176455573, "learning_rate": 3.282061049082846e-07, "loss": 0.7331, "step": 2286 }, { "epoch": 0.6235430441006067, "grad_norm": 2.107720154945987, "learning_rate": 3.2779133310499e-07, "loss": 0.6821, "step": 2287 }, { "epoch": 0.6238156908186218, "grad_norm": 2.4190461875599505, "learning_rate": 3.273766957115827e-07, "loss": 0.6934, "step": 2288 }, { "epoch": 0.624088337536637, "grad_norm": 6.35596988542563, "learning_rate": 3.2696219305168995e-07, "loss": 0.6832, "step": 2289 }, { "epoch": 0.624360984254652, "grad_norm": 1.805903111556197, "learning_rate": 3.265478254488331e-07, "loss": 0.6499, "step": 2290 }, { "epoch": 0.6246336309726672, "grad_norm": 2.608367590464633, "learning_rate": 3.261335932264288e-07, "loss": 0.7198, "step": 2291 }, { "epoch": 0.6249062776906823, "grad_norm": 2.0621038158995257, "learning_rate": 3.2571949670778794e-07, "loss": 0.6826, "step": 2292 }, { "epoch": 0.6251789244086974, "grad_norm": 2.916817328356972, "learning_rate": 3.253055362161149e-07, "loss": 0.6779, "step": 2293 }, { "epoch": 0.6254515711267126, "grad_norm": 2.163469474100287, "learning_rate": 3.2489171207450863e-07, "loss": 0.7275, "step": 2294 }, { "epoch": 0.6257242178447276, "grad_norm": 4.777767289559144, "learning_rate": 3.244780246059612e-07, "loss": 0.713, "step": 2295 }, { "epoch": 0.6259968645627428, "grad_norm": 2.8149432833808437, "learning_rate": 3.2406447413335817e-07, "loss": 0.7504, "step": 2296 }, { "epoch": 0.6262695112807579, "grad_norm": 2.001072063560266, "learning_rate": 3.2365106097947803e-07, "loss": 0.617, "step": 2297 }, { "epoch": 0.6265421579987731, "grad_norm": 4.3492251740022505, "learning_rate": 3.2323778546699244e-07, "loss": 0.633, "step": 2298 }, { "epoch": 0.6268148047167882, "grad_norm": 4.041726566196769, "learning_rate": 3.228246479184652e-07, "loss": 0.7168, "step": 2299 }, { "epoch": 0.6270874514348034, "grad_norm": 2.061481163064145, "learning_rate": 3.224116486563525e-07, "loss": 0.6592, "step": 2300 }, { "epoch": 0.6273600981528185, "grad_norm": 9.066131786232656, "learning_rate": 3.2199878800300315e-07, "loss": 0.634, "step": 2301 }, { "epoch": 0.6276327448708336, "grad_norm": 1.902056586888983, "learning_rate": 3.215860662806569e-07, "loss": 0.659, "step": 2302 }, { "epoch": 0.6279053915888487, "grad_norm": 3.4142531110978687, "learning_rate": 3.2117348381144586e-07, "loss": 0.6908, "step": 2303 }, { "epoch": 0.6281780383068639, "grad_norm": 4.443505052435285, "learning_rate": 3.207610409173928e-07, "loss": 0.7217, "step": 2304 }, { "epoch": 0.628450685024879, "grad_norm": 6.4745430323088655, "learning_rate": 3.2034873792041184e-07, "loss": 0.6641, "step": 2305 }, { "epoch": 0.6287233317428942, "grad_norm": 3.1069718122288994, "learning_rate": 3.199365751423083e-07, "loss": 0.6718, "step": 2306 }, { "epoch": 0.6289959784609093, "grad_norm": 6.489595530977953, "learning_rate": 3.195245529047772e-07, "loss": 0.7005, "step": 2307 }, { "epoch": 0.6292686251789245, "grad_norm": 4.009606669401661, "learning_rate": 3.191126715294048e-07, "loss": 0.6515, "step": 2308 }, { "epoch": 0.6295412718969395, "grad_norm": 1.9706268174817285, "learning_rate": 3.187009313376665e-07, "loss": 0.6689, "step": 2309 }, { "epoch": 0.6298139186149547, "grad_norm": 2.2025315257450733, "learning_rate": 3.1828933265092845e-07, "loss": 0.7223, "step": 2310 }, { "epoch": 0.6300865653329698, "grad_norm": 1.9058395977031175, "learning_rate": 3.1787787579044533e-07, "loss": 0.6545, "step": 2311 }, { "epoch": 0.6303592120509849, "grad_norm": 1.6984804352251996, "learning_rate": 3.17466561077362e-07, "loss": 0.6974, "step": 2312 }, { "epoch": 0.6306318587690001, "grad_norm": 4.400446265383029, "learning_rate": 3.170553888327122e-07, "loss": 0.6613, "step": 2313 }, { "epoch": 0.6309045054870152, "grad_norm": 2.8677150569370355, "learning_rate": 3.1664435937741786e-07, "loss": 0.682, "step": 2314 }, { "epoch": 0.6311771522050303, "grad_norm": 2.0903235474819777, "learning_rate": 3.1623347303229024e-07, "loss": 0.6995, "step": 2315 }, { "epoch": 0.6314497989230454, "grad_norm": 3.112632151719792, "learning_rate": 3.158227301180284e-07, "loss": 0.6313, "step": 2316 }, { "epoch": 0.6317224456410606, "grad_norm": 2.4376045504995894, "learning_rate": 3.154121309552199e-07, "loss": 0.7176, "step": 2317 }, { "epoch": 0.6319950923590757, "grad_norm": 9.144267863297364, "learning_rate": 3.1500167586433946e-07, "loss": 0.6991, "step": 2318 }, { "epoch": 0.6322677390770909, "grad_norm": 2.3404341480952104, "learning_rate": 3.145913651657498e-07, "loss": 0.6778, "step": 2319 }, { "epoch": 0.632540385795106, "grad_norm": 2.5910624498536574, "learning_rate": 3.141811991797012e-07, "loss": 0.6637, "step": 2320 }, { "epoch": 0.6328130325131212, "grad_norm": 1.966814146212022, "learning_rate": 3.137711782263302e-07, "loss": 0.7045, "step": 2321 }, { "epoch": 0.6330856792311362, "grad_norm": 3.0864676131694604, "learning_rate": 3.1336130262566095e-07, "loss": 0.7738, "step": 2322 }, { "epoch": 0.6333583259491514, "grad_norm": 2.0726437343106783, "learning_rate": 3.129515726976034e-07, "loss": 0.6805, "step": 2323 }, { "epoch": 0.6336309726671665, "grad_norm": 3.7820516247438243, "learning_rate": 3.125419887619545e-07, "loss": 0.6765, "step": 2324 }, { "epoch": 0.6339036193851817, "grad_norm": 2.1821739009715047, "learning_rate": 3.12132551138397e-07, "loss": 0.6732, "step": 2325 }, { "epoch": 0.6341762661031968, "grad_norm": 3.9792220009396977, "learning_rate": 3.11723260146499e-07, "loss": 0.7013, "step": 2326 }, { "epoch": 0.634448912821212, "grad_norm": 2.5636709419761807, "learning_rate": 3.113141161057151e-07, "loss": 0.675, "step": 2327 }, { "epoch": 0.634721559539227, "grad_norm": 1.8817583004032163, "learning_rate": 3.109051193353841e-07, "loss": 0.7081, "step": 2328 }, { "epoch": 0.6349942062572422, "grad_norm": 4.862619376013159, "learning_rate": 3.104962701547309e-07, "loss": 0.6819, "step": 2329 }, { "epoch": 0.6352668529752573, "grad_norm": 1.9076856519346173, "learning_rate": 3.1008756888286425e-07, "loss": 0.6009, "step": 2330 }, { "epoch": 0.6355394996932724, "grad_norm": 4.8412464115839375, "learning_rate": 3.096790158387782e-07, "loss": 0.6979, "step": 2331 }, { "epoch": 0.6358121464112876, "grad_norm": 3.6333862173737166, "learning_rate": 3.092706113413509e-07, "loss": 0.6772, "step": 2332 }, { "epoch": 0.6360847931293027, "grad_norm": 2.3202647003727814, "learning_rate": 3.088623557093443e-07, "loss": 0.705, "step": 2333 }, { "epoch": 0.6363574398473179, "grad_norm": 1.834695749044464, "learning_rate": 3.084542492614044e-07, "loss": 0.7005, "step": 2334 }, { "epoch": 0.6366300865653329, "grad_norm": 1.9685432730380115, "learning_rate": 3.080462923160606e-07, "loss": 0.6895, "step": 2335 }, { "epoch": 0.6369027332833481, "grad_norm": 8.453818605741803, "learning_rate": 3.076384851917261e-07, "loss": 0.6309, "step": 2336 }, { "epoch": 0.6371753800013632, "grad_norm": 2.558326160986715, "learning_rate": 3.0723082820669633e-07, "loss": 0.6922, "step": 2337 }, { "epoch": 0.6374480267193784, "grad_norm": 2.1745300188129177, "learning_rate": 3.0682332167915014e-07, "loss": 0.6448, "step": 2338 }, { "epoch": 0.6377206734373935, "grad_norm": 1.9997141796310207, "learning_rate": 3.06415965927149e-07, "loss": 0.8101, "step": 2339 }, { "epoch": 0.6379933201554087, "grad_norm": 2.0996931058920434, "learning_rate": 3.0600876126863616e-07, "loss": 0.7036, "step": 2340 }, { "epoch": 0.6382659668734237, "grad_norm": 1.8036468647876605, "learning_rate": 3.056017080214377e-07, "loss": 0.6225, "step": 2341 }, { "epoch": 0.6385386135914389, "grad_norm": 3.307677709756305, "learning_rate": 3.051948065032606e-07, "loss": 0.7717, "step": 2342 }, { "epoch": 0.638811260309454, "grad_norm": 4.139875194578992, "learning_rate": 3.0478805703169434e-07, "loss": 0.7228, "step": 2343 }, { "epoch": 0.6390839070274692, "grad_norm": 2.8736892671810375, "learning_rate": 3.0438145992420895e-07, "loss": 0.7708, "step": 2344 }, { "epoch": 0.6393565537454843, "grad_norm": 3.825303894596337, "learning_rate": 3.0397501549815603e-07, "loss": 0.7138, "step": 2345 }, { "epoch": 0.6396292004634995, "grad_norm": 2.0934678695814566, "learning_rate": 3.0356872407076806e-07, "loss": 0.6926, "step": 2346 }, { "epoch": 0.6399018471815145, "grad_norm": 2.0139277305915666, "learning_rate": 3.0316258595915753e-07, "loss": 0.7912, "step": 2347 }, { "epoch": 0.6401744938995297, "grad_norm": 2.340499283943431, "learning_rate": 3.0275660148031813e-07, "loss": 0.7014, "step": 2348 }, { "epoch": 0.6404471406175448, "grad_norm": 4.224017399644827, "learning_rate": 3.023507709511226e-07, "loss": 0.7303, "step": 2349 }, { "epoch": 0.6407197873355599, "grad_norm": 3.3907630186309947, "learning_rate": 3.019450946883244e-07, "loss": 0.699, "step": 2350 }, { "epoch": 0.6409924340535751, "grad_norm": 2.6698955504710766, "learning_rate": 3.015395730085565e-07, "loss": 0.6763, "step": 2351 }, { "epoch": 0.6412650807715902, "grad_norm": 2.798364403595026, "learning_rate": 3.011342062283304e-07, "loss": 0.6856, "step": 2352 }, { "epoch": 0.6415377274896054, "grad_norm": 4.017217303089229, "learning_rate": 3.007289946640378e-07, "loss": 0.6857, "step": 2353 }, { "epoch": 0.6418103742076204, "grad_norm": 4.74260543782049, "learning_rate": 3.003239386319483e-07, "loss": 0.6385, "step": 2354 }, { "epoch": 0.6420830209256356, "grad_norm": 2.463920933082896, "learning_rate": 2.99919038448211e-07, "loss": 0.7111, "step": 2355 }, { "epoch": 0.6423556676436507, "grad_norm": 1.9592905330910295, "learning_rate": 2.9951429442885244e-07, "loss": 0.6136, "step": 2356 }, { "epoch": 0.6426283143616659, "grad_norm": 2.4168462554606647, "learning_rate": 2.99109706889778e-07, "loss": 0.7184, "step": 2357 }, { "epoch": 0.642900961079681, "grad_norm": 9.778704124364417, "learning_rate": 2.9870527614677077e-07, "loss": 0.7518, "step": 2358 }, { "epoch": 0.6431736077976962, "grad_norm": 2.127910432468523, "learning_rate": 2.9830100251549096e-07, "loss": 0.6652, "step": 2359 }, { "epoch": 0.6434462545157112, "grad_norm": 1.8040592275652927, "learning_rate": 2.97896886311477e-07, "loss": 0.6603, "step": 2360 }, { "epoch": 0.6437189012337264, "grad_norm": 2.3197643431911423, "learning_rate": 2.9749292785014355e-07, "loss": 0.7763, "step": 2361 }, { "epoch": 0.6439915479517415, "grad_norm": 2.2262238329283806, "learning_rate": 2.97089127446783e-07, "loss": 0.6838, "step": 2362 }, { "epoch": 0.6442641946697567, "grad_norm": 2.578947051042958, "learning_rate": 2.966854854165635e-07, "loss": 0.7188, "step": 2363 }, { "epoch": 0.6445368413877718, "grad_norm": 2.273396894166589, "learning_rate": 2.9628200207453034e-07, "loss": 0.7132, "step": 2364 }, { "epoch": 0.644809488105787, "grad_norm": 5.59771773294793, "learning_rate": 2.958786777356048e-07, "loss": 0.6541, "step": 2365 }, { "epoch": 0.6450821348238021, "grad_norm": 2.3074951707712663, "learning_rate": 2.9547551271458364e-07, "loss": 0.7378, "step": 2366 }, { "epoch": 0.6453547815418171, "grad_norm": 2.677393149828098, "learning_rate": 2.950725073261398e-07, "loss": 0.6338, "step": 2367 }, { "epoch": 0.6456274282598323, "grad_norm": 2.4491083016574406, "learning_rate": 2.9466966188482106e-07, "loss": 0.6332, "step": 2368 }, { "epoch": 0.6459000749778474, "grad_norm": 3.716367447865089, "learning_rate": 2.9426697670505094e-07, "loss": 0.7294, "step": 2369 }, { "epoch": 0.6461727216958626, "grad_norm": 2.047919925798589, "learning_rate": 2.938644521011276e-07, "loss": 0.7002, "step": 2370 }, { "epoch": 0.6464453684138777, "grad_norm": 7.463937532952542, "learning_rate": 2.9346208838722364e-07, "loss": 0.6942, "step": 2371 }, { "epoch": 0.6467180151318929, "grad_norm": 4.1830136570553185, "learning_rate": 2.930598858773867e-07, "loss": 0.6628, "step": 2372 }, { "epoch": 0.6469906618499079, "grad_norm": 2.14288532130205, "learning_rate": 2.92657844885538e-07, "loss": 0.6363, "step": 2373 }, { "epoch": 0.6472633085679231, "grad_norm": 1.956125214488601, "learning_rate": 2.9225596572547294e-07, "loss": 0.6982, "step": 2374 }, { "epoch": 0.6475359552859382, "grad_norm": 1.9594710152512798, "learning_rate": 2.9185424871086056e-07, "loss": 0.6323, "step": 2375 }, { "epoch": 0.6478086020039534, "grad_norm": 2.1856119167814723, "learning_rate": 2.914526941552437e-07, "loss": 0.7104, "step": 2376 }, { "epoch": 0.6480812487219685, "grad_norm": 1.7404787656688945, "learning_rate": 2.910513023720375e-07, "loss": 0.6627, "step": 2377 }, { "epoch": 0.6483538954399837, "grad_norm": 1.8906963482037136, "learning_rate": 2.90650073674531e-07, "loss": 0.6927, "step": 2378 }, { "epoch": 0.6486265421579988, "grad_norm": 2.262711551431875, "learning_rate": 2.902490083758856e-07, "loss": 0.7572, "step": 2379 }, { "epoch": 0.6488991888760139, "grad_norm": 2.4269412116380056, "learning_rate": 2.8984810678913494e-07, "loss": 0.7016, "step": 2380 }, { "epoch": 0.649171835594029, "grad_norm": 1.9104714815156, "learning_rate": 2.894473692271854e-07, "loss": 0.5882, "step": 2381 }, { "epoch": 0.6494444823120442, "grad_norm": 2.8093174719769016, "learning_rate": 2.8904679600281457e-07, "loss": 0.6424, "step": 2382 }, { "epoch": 0.6497171290300593, "grad_norm": 3.6238537019815404, "learning_rate": 2.8864638742867263e-07, "loss": 0.7014, "step": 2383 }, { "epoch": 0.6499897757480745, "grad_norm": 2.594812666325631, "learning_rate": 2.8824614381728085e-07, "loss": 0.7138, "step": 2384 }, { "epoch": 0.6502624224660896, "grad_norm": 5.4765080920067994, "learning_rate": 2.8784606548103154e-07, "loss": 0.638, "step": 2385 }, { "epoch": 0.6505350691841046, "grad_norm": 1.9276923243028543, "learning_rate": 2.874461527321883e-07, "loss": 0.6465, "step": 2386 }, { "epoch": 0.6508077159021198, "grad_norm": 2.712172733700257, "learning_rate": 2.8704640588288547e-07, "loss": 0.7472, "step": 2387 }, { "epoch": 0.6510803626201349, "grad_norm": 1.6260021198583, "learning_rate": 2.8664682524512775e-07, "loss": 0.7139, "step": 2388 }, { "epoch": 0.6513530093381501, "grad_norm": 2.9024293484965393, "learning_rate": 2.862474111307902e-07, "loss": 0.653, "step": 2389 }, { "epoch": 0.6516256560561652, "grad_norm": 1.9115813383186964, "learning_rate": 2.858481638516178e-07, "loss": 0.6345, "step": 2390 }, { "epoch": 0.6518983027741804, "grad_norm": 2.273301633526136, "learning_rate": 2.8544908371922596e-07, "loss": 0.7885, "step": 2391 }, { "epoch": 0.6521709494921955, "grad_norm": 1.7687457561952973, "learning_rate": 2.850501710450982e-07, "loss": 0.6675, "step": 2392 }, { "epoch": 0.6524435962102106, "grad_norm": 2.333693735616515, "learning_rate": 2.8465142614058913e-07, "loss": 0.7524, "step": 2393 }, { "epoch": 0.6527162429282257, "grad_norm": 14.120600571593014, "learning_rate": 2.842528493169208e-07, "loss": 0.6429, "step": 2394 }, { "epoch": 0.6529888896462409, "grad_norm": 3.3237167658955578, "learning_rate": 2.8385444088518543e-07, "loss": 0.7544, "step": 2395 }, { "epoch": 0.653261536364256, "grad_norm": 2.0011781856822206, "learning_rate": 2.834562011563425e-07, "loss": 0.7678, "step": 2396 }, { "epoch": 0.6535341830822712, "grad_norm": 3.371814393207093, "learning_rate": 2.8305813044122093e-07, "loss": 0.66, "step": 2397 }, { "epoch": 0.6538068298002863, "grad_norm": 3.266783552724682, "learning_rate": 2.826602290505172e-07, "loss": 0.7358, "step": 2398 }, { "epoch": 0.6540794765183015, "grad_norm": 2.6475316044025172, "learning_rate": 2.822624972947958e-07, "loss": 0.7404, "step": 2399 }, { "epoch": 0.6543521232363165, "grad_norm": 2.1380637691635416, "learning_rate": 2.8186493548448846e-07, "loss": 0.6269, "step": 2400 }, { "epoch": 0.6546247699543317, "grad_norm": 33.97619345825743, "learning_rate": 2.814675439298945e-07, "loss": 0.6271, "step": 2401 }, { "epoch": 0.6548974166723468, "grad_norm": 2.2795289641253333, "learning_rate": 2.8107032294118105e-07, "loss": 0.6903, "step": 2402 }, { "epoch": 0.655170063390362, "grad_norm": 2.0720556184909054, "learning_rate": 2.806732728283805e-07, "loss": 0.6946, "step": 2403 }, { "epoch": 0.6554427101083771, "grad_norm": 14.194410004258177, "learning_rate": 2.8027639390139347e-07, "loss": 0.6542, "step": 2404 }, { "epoch": 0.6557153568263921, "grad_norm": 6.014796378734357, "learning_rate": 2.798796864699862e-07, "loss": 0.6877, "step": 2405 }, { "epoch": 0.6559880035444073, "grad_norm": 2.2789169056764, "learning_rate": 2.7948315084379114e-07, "loss": 0.674, "step": 2406 }, { "epoch": 0.6562606502624224, "grad_norm": 6.261647239366034, "learning_rate": 2.790867873323067e-07, "loss": 0.6483, "step": 2407 }, { "epoch": 0.6565332969804376, "grad_norm": 3.4814223685741004, "learning_rate": 2.7869059624489687e-07, "loss": 0.6969, "step": 2408 }, { "epoch": 0.6568059436984527, "grad_norm": 1.9134556230416915, "learning_rate": 2.782945778907914e-07, "loss": 0.6188, "step": 2409 }, { "epoch": 0.6570785904164679, "grad_norm": 2.8638629115336633, "learning_rate": 2.778987325790846e-07, "loss": 0.6796, "step": 2410 }, { "epoch": 0.657351237134483, "grad_norm": 2.364323580886552, "learning_rate": 2.775030606187362e-07, "loss": 0.724, "step": 2411 }, { "epoch": 0.6576238838524981, "grad_norm": 2.3210973292552737, "learning_rate": 2.7710756231857097e-07, "loss": 0.6699, "step": 2412 }, { "epoch": 0.6578965305705132, "grad_norm": 2.249391973700231, "learning_rate": 2.7671223798727694e-07, "loss": 0.6777, "step": 2413 }, { "epoch": 0.6581691772885284, "grad_norm": 5.518734292293635, "learning_rate": 2.7631708793340786e-07, "loss": 0.7589, "step": 2414 }, { "epoch": 0.6584418240065435, "grad_norm": 3.06090121261726, "learning_rate": 2.759221124653799e-07, "loss": 0.7208, "step": 2415 }, { "epoch": 0.6587144707245587, "grad_norm": 2.1975835131390427, "learning_rate": 2.755273118914744e-07, "loss": 0.6715, "step": 2416 }, { "epoch": 0.6589871174425738, "grad_norm": 2.3396364202506468, "learning_rate": 2.7513268651983524e-07, "loss": 0.6783, "step": 2417 }, { "epoch": 0.659259764160589, "grad_norm": 1.8901671652416916, "learning_rate": 2.7473823665847006e-07, "loss": 0.6638, "step": 2418 }, { "epoch": 0.659532410878604, "grad_norm": 3.6022053528075624, "learning_rate": 2.7434396261524914e-07, "loss": 0.6908, "step": 2419 }, { "epoch": 0.6598050575966192, "grad_norm": 1.9251164433650176, "learning_rate": 2.739498646979058e-07, "loss": 0.7211, "step": 2420 }, { "epoch": 0.6600777043146343, "grad_norm": 4.536976585599993, "learning_rate": 2.735559432140357e-07, "loss": 0.6616, "step": 2421 }, { "epoch": 0.6603503510326495, "grad_norm": 2.567221444216643, "learning_rate": 2.731621984710969e-07, "loss": 0.7951, "step": 2422 }, { "epoch": 0.6606229977506646, "grad_norm": 2.417308536285467, "learning_rate": 2.727686307764097e-07, "loss": 0.7288, "step": 2423 }, { "epoch": 0.6608956444686797, "grad_norm": 3.134796229292027, "learning_rate": 2.72375240437156e-07, "loss": 0.7357, "step": 2424 }, { "epoch": 0.6611682911866948, "grad_norm": 2.185458651276028, "learning_rate": 2.7198202776037917e-07, "loss": 0.6665, "step": 2425 }, { "epoch": 0.6614409379047099, "grad_norm": 4.874680183768466, "learning_rate": 2.7158899305298404e-07, "loss": 0.6536, "step": 2426 }, { "epoch": 0.6617135846227251, "grad_norm": 2.6592384812426486, "learning_rate": 2.711961366217367e-07, "loss": 0.6175, "step": 2427 }, { "epoch": 0.6619862313407402, "grad_norm": 1.9464745942183952, "learning_rate": 2.7080345877326364e-07, "loss": 0.6583, "step": 2428 }, { "epoch": 0.6622588780587554, "grad_norm": 2.28754485612997, "learning_rate": 2.704109598140525e-07, "loss": 0.6393, "step": 2429 }, { "epoch": 0.6625315247767705, "grad_norm": 2.840713150797581, "learning_rate": 2.7001864005045084e-07, "loss": 0.7168, "step": 2430 }, { "epoch": 0.6628041714947857, "grad_norm": 3.0925402882346984, "learning_rate": 2.69626499788667e-07, "loss": 0.6467, "step": 2431 }, { "epoch": 0.6630768182128007, "grad_norm": 2.063233140942362, "learning_rate": 2.692345393347681e-07, "loss": 0.6775, "step": 2432 }, { "epoch": 0.6633494649308159, "grad_norm": 4.9301093472541275, "learning_rate": 2.6884275899468235e-07, "loss": 0.6655, "step": 2433 }, { "epoch": 0.663622111648831, "grad_norm": 2.0856992257421463, "learning_rate": 2.684511590741959e-07, "loss": 0.6986, "step": 2434 }, { "epoch": 0.6638947583668462, "grad_norm": 3.209342856567275, "learning_rate": 2.680597398789554e-07, "loss": 0.715, "step": 2435 }, { "epoch": 0.6641674050848613, "grad_norm": 2.8818831904139164, "learning_rate": 2.6766850171446554e-07, "loss": 0.7266, "step": 2436 }, { "epoch": 0.6644400518028765, "grad_norm": 2.788483523594952, "learning_rate": 2.6727744488609015e-07, "loss": 0.7583, "step": 2437 }, { "epoch": 0.6647126985208915, "grad_norm": 5.709865375504585, "learning_rate": 2.6688656969905145e-07, "loss": 0.6983, "step": 2438 }, { "epoch": 0.6649853452389067, "grad_norm": 2.510375779083769, "learning_rate": 2.664958764584297e-07, "loss": 0.6835, "step": 2439 }, { "epoch": 0.6652579919569218, "grad_norm": 2.050260561624387, "learning_rate": 2.661053654691634e-07, "loss": 0.7168, "step": 2440 }, { "epoch": 0.665530638674937, "grad_norm": 3.1569658195923025, "learning_rate": 2.657150370360485e-07, "loss": 0.609, "step": 2441 }, { "epoch": 0.6658032853929521, "grad_norm": 1.990910323978375, "learning_rate": 2.653248914637392e-07, "loss": 0.6839, "step": 2442 }, { "epoch": 0.6660759321109672, "grad_norm": 2.081396549497997, "learning_rate": 2.649349290567456e-07, "loss": 0.6341, "step": 2443 }, { "epoch": 0.6663485788289824, "grad_norm": 2.9315023961413584, "learning_rate": 2.6454515011943647e-07, "loss": 0.6768, "step": 2444 }, { "epoch": 0.6666212255469974, "grad_norm": 10.006345333615132, "learning_rate": 2.641555549560362e-07, "loss": 0.7329, "step": 2445 }, { "epoch": 0.6668938722650126, "grad_norm": 1.9232826238426495, "learning_rate": 2.6376614387062617e-07, "loss": 0.684, "step": 2446 }, { "epoch": 0.6671665189830277, "grad_norm": 2.8989166957720207, "learning_rate": 2.6337691716714407e-07, "loss": 0.7161, "step": 2447 }, { "epoch": 0.6674391657010429, "grad_norm": 1.904951360995564, "learning_rate": 2.6298787514938365e-07, "loss": 0.6385, "step": 2448 }, { "epoch": 0.667711812419058, "grad_norm": 2.69758326099349, "learning_rate": 2.625990181209943e-07, "loss": 0.687, "step": 2449 }, { "epoch": 0.6679844591370732, "grad_norm": 2.6475146151628266, "learning_rate": 2.6221034638548184e-07, "loss": 0.7011, "step": 2450 }, { "epoch": 0.6682571058550882, "grad_norm": 2.961167676924695, "learning_rate": 2.6182186024620614e-07, "loss": 0.6714, "step": 2451 }, { "epoch": 0.6685297525731034, "grad_norm": 2.149252657284598, "learning_rate": 2.6143356000638353e-07, "loss": 0.6681, "step": 2452 }, { "epoch": 0.6688023992911185, "grad_norm": 2.2675646079831306, "learning_rate": 2.6104544596908417e-07, "loss": 0.6545, "step": 2453 }, { "epoch": 0.6690750460091337, "grad_norm": 2.391435320592049, "learning_rate": 2.606575184372337e-07, "loss": 0.6169, "step": 2454 }, { "epoch": 0.6693476927271488, "grad_norm": 2.9632105554932364, "learning_rate": 2.602697777136118e-07, "loss": 0.6903, "step": 2455 }, { "epoch": 0.669620339445164, "grad_norm": 2.213644276119554, "learning_rate": 2.598822241008524e-07, "loss": 0.6684, "step": 2456 }, { "epoch": 0.669892986163179, "grad_norm": 3.882543074115565, "learning_rate": 2.594948579014433e-07, "loss": 0.7432, "step": 2457 }, { "epoch": 0.6701656328811942, "grad_norm": 2.059836933547217, "learning_rate": 2.5910767941772626e-07, "loss": 0.6876, "step": 2458 }, { "epoch": 0.6704382795992093, "grad_norm": 2.3929378468398315, "learning_rate": 2.5872068895189625e-07, "loss": 0.7693, "step": 2459 }, { "epoch": 0.6707109263172245, "grad_norm": 1.9057316285500898, "learning_rate": 2.5833388680600153e-07, "loss": 0.663, "step": 2460 }, { "epoch": 0.6709835730352396, "grad_norm": 2.453824069263366, "learning_rate": 2.579472732819441e-07, "loss": 0.6646, "step": 2461 }, { "epoch": 0.6712562197532547, "grad_norm": 3.00383148260151, "learning_rate": 2.575608486814772e-07, "loss": 0.6941, "step": 2462 }, { "epoch": 0.6715288664712699, "grad_norm": 6.052377308553439, "learning_rate": 2.5717461330620815e-07, "loss": 0.6562, "step": 2463 }, { "epoch": 0.6718015131892849, "grad_norm": 1.8895494934487853, "learning_rate": 2.567885674575958e-07, "loss": 0.6848, "step": 2464 }, { "epoch": 0.6720741599073001, "grad_norm": 3.7371106925361777, "learning_rate": 2.5640271143695114e-07, "loss": 0.6295, "step": 2465 }, { "epoch": 0.6723468066253152, "grad_norm": 2.918628316796347, "learning_rate": 2.5601704554543724e-07, "loss": 0.7986, "step": 2466 }, { "epoch": 0.6726194533433304, "grad_norm": 2.3752554629837883, "learning_rate": 2.556315700840685e-07, "loss": 0.6779, "step": 2467 }, { "epoch": 0.6728921000613455, "grad_norm": 2.6656109151719463, "learning_rate": 2.5524628535371083e-07, "loss": 0.6508, "step": 2468 }, { "epoch": 0.6731647467793607, "grad_norm": 2.709687243645247, "learning_rate": 2.548611916550812e-07, "loss": 0.7103, "step": 2469 }, { "epoch": 0.6734373934973757, "grad_norm": 2.2105418110668817, "learning_rate": 2.544762892887475e-07, "loss": 0.6942, "step": 2470 }, { "epoch": 0.6737100402153909, "grad_norm": 2.339794030687778, "learning_rate": 2.540915785551288e-07, "loss": 0.7179, "step": 2471 }, { "epoch": 0.673982686933406, "grad_norm": 3.277608177227363, "learning_rate": 2.5370705975449326e-07, "loss": 0.6979, "step": 2472 }, { "epoch": 0.6742553336514212, "grad_norm": 1.9714607758669158, "learning_rate": 2.533227331869608e-07, "loss": 0.7231, "step": 2473 }, { "epoch": 0.6745279803694363, "grad_norm": 2.0339967699688346, "learning_rate": 2.529385991525003e-07, "loss": 0.7208, "step": 2474 }, { "epoch": 0.6748006270874515, "grad_norm": 4.270937549883212, "learning_rate": 2.5255465795093067e-07, "loss": 0.6462, "step": 2475 }, { "epoch": 0.6750732738054666, "grad_norm": 4.547935990946529, "learning_rate": 2.5217090988192036e-07, "loss": 0.6352, "step": 2476 }, { "epoch": 0.6753459205234817, "grad_norm": 5.651471340457717, "learning_rate": 2.517873552449869e-07, "loss": 0.6747, "step": 2477 }, { "epoch": 0.6756185672414968, "grad_norm": 3.312030754526098, "learning_rate": 2.5140399433949713e-07, "loss": 0.7189, "step": 2478 }, { "epoch": 0.675891213959512, "grad_norm": 2.4958710365235124, "learning_rate": 2.5102082746466614e-07, "loss": 0.675, "step": 2479 }, { "epoch": 0.6761638606775271, "grad_norm": 2.146916879732432, "learning_rate": 2.5063785491955863e-07, "loss": 0.6439, "step": 2480 }, { "epoch": 0.6764365073955422, "grad_norm": 9.201836780203353, "learning_rate": 2.5025507700308617e-07, "loss": 0.683, "step": 2481 }, { "epoch": 0.6767091541135574, "grad_norm": 2.6073556792529957, "learning_rate": 2.4987249401400976e-07, "loss": 0.6609, "step": 2482 }, { "epoch": 0.6769818008315724, "grad_norm": 2.069505560501741, "learning_rate": 2.4949010625093754e-07, "loss": 0.6945, "step": 2483 }, { "epoch": 0.6772544475495876, "grad_norm": 3.0104341078968395, "learning_rate": 2.491079140123254e-07, "loss": 0.6916, "step": 2484 }, { "epoch": 0.6775270942676027, "grad_norm": 2.1927170751982255, "learning_rate": 2.487259175964767e-07, "loss": 0.7344, "step": 2485 }, { "epoch": 0.6777997409856179, "grad_norm": 4.125223436098748, "learning_rate": 2.4834411730154174e-07, "loss": 0.6794, "step": 2486 }, { "epoch": 0.678072387703633, "grad_norm": 1.8450521974978604, "learning_rate": 2.479625134255181e-07, "loss": 0.637, "step": 2487 }, { "epoch": 0.6783450344216482, "grad_norm": 2.255604290066339, "learning_rate": 2.4758110626624986e-07, "loss": 0.7177, "step": 2488 }, { "epoch": 0.6786176811396633, "grad_norm": 1.7603448349986015, "learning_rate": 2.471998961214273e-07, "loss": 0.6719, "step": 2489 }, { "epoch": 0.6788903278576784, "grad_norm": 2.0771554687228857, "learning_rate": 2.468188832885878e-07, "loss": 0.6064, "step": 2490 }, { "epoch": 0.6791629745756935, "grad_norm": 1.9456764558770487, "learning_rate": 2.464380680651134e-07, "loss": 0.6586, "step": 2491 }, { "epoch": 0.6794356212937087, "grad_norm": 5.364019174747038, "learning_rate": 2.460574507482332e-07, "loss": 0.6724, "step": 2492 }, { "epoch": 0.6797082680117238, "grad_norm": 2.1540026493084934, "learning_rate": 2.456770316350211e-07, "loss": 0.7447, "step": 2493 }, { "epoch": 0.679980914729739, "grad_norm": 2.0702452431450404, "learning_rate": 2.452968110223965e-07, "loss": 0.6634, "step": 2494 }, { "epoch": 0.6802535614477541, "grad_norm": 1.9890561875325024, "learning_rate": 2.449167892071238e-07, "loss": 0.7301, "step": 2495 }, { "epoch": 0.6805262081657693, "grad_norm": 2.4500943782062383, "learning_rate": 2.4453696648581225e-07, "loss": 0.6579, "step": 2496 }, { "epoch": 0.6807988548837843, "grad_norm": 2.102170503663351, "learning_rate": 2.441573431549158e-07, "loss": 0.6683, "step": 2497 }, { "epoch": 0.6810715016017995, "grad_norm": 4.271911903890273, "learning_rate": 2.4377791951073254e-07, "loss": 0.6848, "step": 2498 }, { "epoch": 0.6813441483198146, "grad_norm": 7.260622391812365, "learning_rate": 2.4339869584940543e-07, "loss": 0.7538, "step": 2499 }, { "epoch": 0.6816167950378297, "grad_norm": 2.3400623690276383, "learning_rate": 2.4301967246692e-07, "loss": 0.6617, "step": 2500 }, { "epoch": 0.6818894417558449, "grad_norm": 1.8867496424539154, "learning_rate": 2.42640849659107e-07, "loss": 0.6223, "step": 2501 }, { "epoch": 0.68216208847386, "grad_norm": 2.413522023927769, "learning_rate": 2.4226222772163923e-07, "loss": 0.6369, "step": 2502 }, { "epoch": 0.6824347351918751, "grad_norm": 2.4601518732869567, "learning_rate": 2.418838069500338e-07, "loss": 0.6235, "step": 2503 }, { "epoch": 0.6827073819098902, "grad_norm": 2.483460547831078, "learning_rate": 2.415055876396503e-07, "loss": 0.7228, "step": 2504 }, { "epoch": 0.6829800286279054, "grad_norm": 2.2692937470726555, "learning_rate": 2.41127570085691e-07, "loss": 0.6665, "step": 2505 }, { "epoch": 0.6832526753459205, "grad_norm": 3.7259509841740694, "learning_rate": 2.4074975458320097e-07, "loss": 0.7172, "step": 2506 }, { "epoch": 0.6835253220639357, "grad_norm": 2.2604951309360035, "learning_rate": 2.403721414270674e-07, "loss": 0.6355, "step": 2507 }, { "epoch": 0.6837979687819508, "grad_norm": 2.0450155472445815, "learning_rate": 2.3999473091201967e-07, "loss": 0.6716, "step": 2508 }, { "epoch": 0.684070615499966, "grad_norm": 2.178882869640085, "learning_rate": 2.396175233326288e-07, "loss": 0.6628, "step": 2509 }, { "epoch": 0.684343262217981, "grad_norm": 3.1136061746195787, "learning_rate": 2.392405189833075e-07, "loss": 0.7397, "step": 2510 }, { "epoch": 0.6846159089359962, "grad_norm": 2.39512426603821, "learning_rate": 2.3886371815831035e-07, "loss": 0.7137, "step": 2511 }, { "epoch": 0.6848885556540113, "grad_norm": 1.9890673068632625, "learning_rate": 2.38487121151732e-07, "loss": 0.7336, "step": 2512 }, { "epoch": 0.6851612023720265, "grad_norm": 2.546773935334768, "learning_rate": 2.3811072825750905e-07, "loss": 0.7331, "step": 2513 }, { "epoch": 0.6854338490900416, "grad_norm": 2.309545826834027, "learning_rate": 2.3773453976941836e-07, "loss": 0.7082, "step": 2514 }, { "epoch": 0.6857064958080568, "grad_norm": 4.829872423971266, "learning_rate": 2.3735855598107718e-07, "loss": 0.6971, "step": 2515 }, { "epoch": 0.6859791425260718, "grad_norm": 3.458173250093295, "learning_rate": 2.3698277718594317e-07, "loss": 0.6996, "step": 2516 }, { "epoch": 0.686251789244087, "grad_norm": 3.2423407626774385, "learning_rate": 2.366072036773137e-07, "loss": 0.6581, "step": 2517 }, { "epoch": 0.6865244359621021, "grad_norm": 2.2534056312440502, "learning_rate": 2.3623183574832661e-07, "loss": 0.734, "step": 2518 }, { "epoch": 0.6867970826801172, "grad_norm": 2.0835628558514134, "learning_rate": 2.358566736919581e-07, "loss": 0.6559, "step": 2519 }, { "epoch": 0.6870697293981324, "grad_norm": 1.9850034607842981, "learning_rate": 2.3548171780102516e-07, "loss": 0.7067, "step": 2520 }, { "epoch": 0.6873423761161475, "grad_norm": 2.0357928391138236, "learning_rate": 2.3510696836818233e-07, "loss": 0.6676, "step": 2521 }, { "epoch": 0.6876150228341626, "grad_norm": 2.1211779874718846, "learning_rate": 2.3473242568592434e-07, "loss": 0.6462, "step": 2522 }, { "epoch": 0.6878876695521777, "grad_norm": 5.1392579536535665, "learning_rate": 2.3435809004658374e-07, "loss": 0.6746, "step": 2523 }, { "epoch": 0.6881603162701929, "grad_norm": 1.925486659967437, "learning_rate": 2.3398396174233176e-07, "loss": 0.6519, "step": 2524 }, { "epoch": 0.688432962988208, "grad_norm": 8.96735844152989, "learning_rate": 2.3361004106517773e-07, "loss": 0.647, "step": 2525 }, { "epoch": 0.6887056097062232, "grad_norm": 1.6501301707711873, "learning_rate": 2.3323632830696899e-07, "loss": 0.7045, "step": 2526 }, { "epoch": 0.6889782564242383, "grad_norm": 1.8194123932810433, "learning_rate": 2.3286282375939059e-07, "loss": 0.6843, "step": 2527 }, { "epoch": 0.6892509031422535, "grad_norm": 2.164074850916039, "learning_rate": 2.3248952771396501e-07, "loss": 0.7018, "step": 2528 }, { "epoch": 0.6895235498602685, "grad_norm": 2.0440702333424903, "learning_rate": 2.3211644046205186e-07, "loss": 0.6371, "step": 2529 }, { "epoch": 0.6897961965782837, "grad_norm": 3.0810071033081154, "learning_rate": 2.3174356229484853e-07, "loss": 0.6489, "step": 2530 }, { "epoch": 0.6900688432962988, "grad_norm": 2.209865628171203, "learning_rate": 2.3137089350338784e-07, "loss": 0.7121, "step": 2531 }, { "epoch": 0.690341490014314, "grad_norm": 2.3655864501262327, "learning_rate": 2.3099843437854061e-07, "loss": 0.6569, "step": 2532 }, { "epoch": 0.6906141367323291, "grad_norm": 3.3836885786039788, "learning_rate": 2.306261852110132e-07, "loss": 0.704, "step": 2533 }, { "epoch": 0.6908867834503443, "grad_norm": 3.4889720875715025, "learning_rate": 2.3025414629134825e-07, "loss": 0.6766, "step": 2534 }, { "epoch": 0.6911594301683593, "grad_norm": 2.080743832406496, "learning_rate": 2.2988231790992446e-07, "loss": 0.7619, "step": 2535 }, { "epoch": 0.6914320768863744, "grad_norm": 2.1218298228172, "learning_rate": 2.295107003569558e-07, "loss": 0.6886, "step": 2536 }, { "epoch": 0.6917047236043896, "grad_norm": 1.7700245068433884, "learning_rate": 2.2913929392249255e-07, "loss": 0.6935, "step": 2537 }, { "epoch": 0.6919773703224047, "grad_norm": 2.2059093189166554, "learning_rate": 2.2876809889641895e-07, "loss": 0.7172, "step": 2538 }, { "epoch": 0.6922500170404199, "grad_norm": 11.232537010952884, "learning_rate": 2.283971155684556e-07, "loss": 0.6956, "step": 2539 }, { "epoch": 0.692522663758435, "grad_norm": 2.2694563411411894, "learning_rate": 2.280263442281565e-07, "loss": 0.6969, "step": 2540 }, { "epoch": 0.6927953104764502, "grad_norm": 3.4029542949213285, "learning_rate": 2.276557851649114e-07, "loss": 0.6675, "step": 2541 }, { "epoch": 0.6930679571944652, "grad_norm": 3.016237882564371, "learning_rate": 2.272854386679437e-07, "loss": 0.6655, "step": 2542 }, { "epoch": 0.6933406039124804, "grad_norm": 2.1727426228373767, "learning_rate": 2.26915305026311e-07, "loss": 0.6882, "step": 2543 }, { "epoch": 0.6936132506304955, "grad_norm": 4.157175643323486, "learning_rate": 2.2654538452890488e-07, "loss": 0.6996, "step": 2544 }, { "epoch": 0.6938858973485107, "grad_norm": 1.8572380536312458, "learning_rate": 2.2617567746445037e-07, "loss": 0.6794, "step": 2545 }, { "epoch": 0.6941585440665258, "grad_norm": 2.382611763479567, "learning_rate": 2.2580618412150614e-07, "loss": 0.7188, "step": 2546 }, { "epoch": 0.694431190784541, "grad_norm": 2.7314128134899622, "learning_rate": 2.2543690478846388e-07, "loss": 0.7342, "step": 2547 }, { "epoch": 0.694703837502556, "grad_norm": 2.246097936609028, "learning_rate": 2.2506783975354816e-07, "loss": 0.7255, "step": 2548 }, { "epoch": 0.6949764842205712, "grad_norm": 4.191869679440371, "learning_rate": 2.246989893048169e-07, "loss": 0.7218, "step": 2549 }, { "epoch": 0.6952491309385863, "grad_norm": 2.495018534327801, "learning_rate": 2.243303537301594e-07, "loss": 0.6757, "step": 2550 }, { "epoch": 0.6955217776566015, "grad_norm": 4.6244546715465455, "learning_rate": 2.2396193331729846e-07, "loss": 0.6943, "step": 2551 }, { "epoch": 0.6957944243746166, "grad_norm": 3.097477956463918, "learning_rate": 2.2359372835378825e-07, "loss": 0.6586, "step": 2552 }, { "epoch": 0.6960670710926318, "grad_norm": 2.4299500844901822, "learning_rate": 2.2322573912701486e-07, "loss": 0.6786, "step": 2553 }, { "epoch": 0.6963397178106469, "grad_norm": 2.3653361277196185, "learning_rate": 2.228579659241961e-07, "loss": 0.6636, "step": 2554 }, { "epoch": 0.6966123645286619, "grad_norm": 2.472214958538202, "learning_rate": 2.224904090323809e-07, "loss": 0.7093, "step": 2555 }, { "epoch": 0.6968850112466771, "grad_norm": 3.402181456483939, "learning_rate": 2.2212306873845033e-07, "loss": 0.7179, "step": 2556 }, { "epoch": 0.6971576579646922, "grad_norm": 2.5228158951831725, "learning_rate": 2.2175594532911473e-07, "loss": 0.7437, "step": 2557 }, { "epoch": 0.6974303046827074, "grad_norm": 3.663438825020154, "learning_rate": 2.213890390909169e-07, "loss": 0.6483, "step": 2558 }, { "epoch": 0.6977029514007225, "grad_norm": 2.4296110697043143, "learning_rate": 2.2102235031022865e-07, "loss": 0.6492, "step": 2559 }, { "epoch": 0.6979755981187377, "grad_norm": 4.768027941807432, "learning_rate": 2.2065587927325346e-07, "loss": 0.6935, "step": 2560 }, { "epoch": 0.6982482448367527, "grad_norm": 2.413180352021713, "learning_rate": 2.2028962626602344e-07, "loss": 0.7057, "step": 2561 }, { "epoch": 0.6985208915547679, "grad_norm": 2.4815236996707095, "learning_rate": 2.199235915744017e-07, "loss": 0.5789, "step": 2562 }, { "epoch": 0.698793538272783, "grad_norm": 1.944751498782329, "learning_rate": 2.1955777548408038e-07, "loss": 0.6196, "step": 2563 }, { "epoch": 0.6990661849907982, "grad_norm": 4.574594269671898, "learning_rate": 2.1919217828058113e-07, "loss": 0.7001, "step": 2564 }, { "epoch": 0.6993388317088133, "grad_norm": 2.7955589473450186, "learning_rate": 2.188268002492547e-07, "loss": 0.7152, "step": 2565 }, { "epoch": 0.6996114784268285, "grad_norm": 2.0381229074087317, "learning_rate": 2.184616416752808e-07, "loss": 0.6674, "step": 2566 }, { "epoch": 0.6998841251448435, "grad_norm": 2.940258175308434, "learning_rate": 2.1809670284366793e-07, "loss": 0.6776, "step": 2567 }, { "epoch": 0.7001567718628587, "grad_norm": 2.3026866063065303, "learning_rate": 2.177319840392529e-07, "loss": 0.7041, "step": 2568 }, { "epoch": 0.7004294185808738, "grad_norm": 2.288107701944701, "learning_rate": 2.1736748554670087e-07, "loss": 0.6389, "step": 2569 }, { "epoch": 0.700702065298889, "grad_norm": 3.8795219043521083, "learning_rate": 2.1700320765050533e-07, "loss": 0.6815, "step": 2570 }, { "epoch": 0.7009747120169041, "grad_norm": 3.3705771716722435, "learning_rate": 2.1663915063498722e-07, "loss": 0.6904, "step": 2571 }, { "epoch": 0.7012473587349193, "grad_norm": 4.0048417164067125, "learning_rate": 2.1627531478429523e-07, "loss": 0.6783, "step": 2572 }, { "epoch": 0.7015200054529344, "grad_norm": 2.241139613667544, "learning_rate": 2.1591170038240526e-07, "loss": 0.6956, "step": 2573 }, { "epoch": 0.7017926521709494, "grad_norm": 2.4154737870162513, "learning_rate": 2.155483077131205e-07, "loss": 0.6569, "step": 2574 }, { "epoch": 0.7020652988889646, "grad_norm": 3.8911886470385735, "learning_rate": 2.1518513706007152e-07, "loss": 0.6509, "step": 2575 }, { "epoch": 0.7023379456069797, "grad_norm": 2.2186039362069683, "learning_rate": 2.1482218870671443e-07, "loss": 0.6671, "step": 2576 }, { "epoch": 0.7026105923249949, "grad_norm": 2.079779803031158, "learning_rate": 2.1445946293633344e-07, "loss": 0.6218, "step": 2577 }, { "epoch": 0.70288323904301, "grad_norm": 2.223222027871271, "learning_rate": 2.140969600320373e-07, "loss": 0.6493, "step": 2578 }, { "epoch": 0.7031558857610252, "grad_norm": 2.6763214750764, "learning_rate": 2.1373468027676244e-07, "loss": 0.735, "step": 2579 }, { "epoch": 0.7034285324790402, "grad_norm": 2.1767781553295706, "learning_rate": 2.1337262395326972e-07, "loss": 0.6454, "step": 2580 }, { "epoch": 0.7037011791970554, "grad_norm": 3.8243821253860735, "learning_rate": 2.130107913441467e-07, "loss": 0.7093, "step": 2581 }, { "epoch": 0.7039738259150705, "grad_norm": 5.349084084779543, "learning_rate": 2.1264918273180572e-07, "loss": 0.6993, "step": 2582 }, { "epoch": 0.7042464726330857, "grad_norm": 5.046697780662661, "learning_rate": 2.122877983984845e-07, "loss": 0.7132, "step": 2583 }, { "epoch": 0.7045191193511008, "grad_norm": 6.399466756597191, "learning_rate": 2.1192663862624555e-07, "loss": 0.6155, "step": 2584 }, { "epoch": 0.704791766069116, "grad_norm": 3.8025533970996537, "learning_rate": 2.1156570369697636e-07, "loss": 0.7427, "step": 2585 }, { "epoch": 0.7050644127871311, "grad_norm": 3.4484916403198254, "learning_rate": 2.1120499389238867e-07, "loss": 0.7126, "step": 2586 }, { "epoch": 0.7053370595051462, "grad_norm": 2.515475006666486, "learning_rate": 2.1084450949401865e-07, "loss": 0.6516, "step": 2587 }, { "epoch": 0.7056097062231613, "grad_norm": 2.8251108855964713, "learning_rate": 2.104842507832264e-07, "loss": 0.6667, "step": 2588 }, { "epoch": 0.7058823529411765, "grad_norm": 3.38607086950451, "learning_rate": 2.101242180411963e-07, "loss": 0.7759, "step": 2589 }, { "epoch": 0.7061549996591916, "grad_norm": 2.3134031268796953, "learning_rate": 2.0976441154893583e-07, "loss": 0.6229, "step": 2590 }, { "epoch": 0.7064276463772068, "grad_norm": 2.9215982862498873, "learning_rate": 2.0940483158727606e-07, "loss": 0.6981, "step": 2591 }, { "epoch": 0.7067002930952219, "grad_norm": 2.687939506490619, "learning_rate": 2.0904547843687142e-07, "loss": 0.7465, "step": 2592 }, { "epoch": 0.706972939813237, "grad_norm": 1.785486490327185, "learning_rate": 2.0868635237819915e-07, "loss": 0.6992, "step": 2593 }, { "epoch": 0.7072455865312521, "grad_norm": 2.1699329486997807, "learning_rate": 2.0832745369155914e-07, "loss": 0.6678, "step": 2594 }, { "epoch": 0.7075182332492672, "grad_norm": 3.921169461471826, "learning_rate": 2.079687826570739e-07, "loss": 0.656, "step": 2595 }, { "epoch": 0.7077908799672824, "grad_norm": 2.0729497390119502, "learning_rate": 2.0761033955468882e-07, "loss": 0.6853, "step": 2596 }, { "epoch": 0.7080635266852975, "grad_norm": 2.1127036802477295, "learning_rate": 2.0725212466417008e-07, "loss": 0.7202, "step": 2597 }, { "epoch": 0.7083361734033127, "grad_norm": 3.064603238694963, "learning_rate": 2.0689413826510733e-07, "loss": 0.7088, "step": 2598 }, { "epoch": 0.7086088201213278, "grad_norm": 56.71300718502552, "learning_rate": 2.0653638063691032e-07, "loss": 0.6293, "step": 2599 }, { "epoch": 0.7088814668393429, "grad_norm": 1.78593913099, "learning_rate": 2.0617885205881175e-07, "loss": 0.7233, "step": 2600 }, { "epoch": 0.709154113557358, "grad_norm": 2.349073196807314, "learning_rate": 2.0582155280986407e-07, "loss": 0.6602, "step": 2601 }, { "epoch": 0.7094267602753732, "grad_norm": 2.2055495797604063, "learning_rate": 2.05464483168942e-07, "loss": 0.6156, "step": 2602 }, { "epoch": 0.7096994069933883, "grad_norm": 1.797696473957515, "learning_rate": 2.051076434147403e-07, "loss": 0.6549, "step": 2603 }, { "epoch": 0.7099720537114035, "grad_norm": 3.3276891609950385, "learning_rate": 2.0475103382577458e-07, "loss": 0.6786, "step": 2604 }, { "epoch": 0.7102447004294186, "grad_norm": 2.976607001194123, "learning_rate": 2.0439465468038068e-07, "loss": 0.6838, "step": 2605 }, { "epoch": 0.7105173471474338, "grad_norm": 2.3741614945734653, "learning_rate": 2.040385062567147e-07, "loss": 0.6482, "step": 2606 }, { "epoch": 0.7107899938654488, "grad_norm": 2.608138852074599, "learning_rate": 2.0368258883275242e-07, "loss": 0.6328, "step": 2607 }, { "epoch": 0.711062640583464, "grad_norm": 1.9667732899906107, "learning_rate": 2.0332690268628983e-07, "loss": 0.6324, "step": 2608 }, { "epoch": 0.7113352873014791, "grad_norm": 1.9812875848152394, "learning_rate": 2.0297144809494187e-07, "loss": 0.6893, "step": 2609 }, { "epoch": 0.7116079340194943, "grad_norm": 3.319951046872833, "learning_rate": 2.0261622533614303e-07, "loss": 0.6955, "step": 2610 }, { "epoch": 0.7118805807375094, "grad_norm": 2.8704481913265703, "learning_rate": 2.0226123468714679e-07, "loss": 0.7041, "step": 2611 }, { "epoch": 0.7121532274555245, "grad_norm": 2.6021215131176154, "learning_rate": 2.0190647642502534e-07, "loss": 0.6895, "step": 2612 }, { "epoch": 0.7124258741735396, "grad_norm": 2.2983788075692226, "learning_rate": 2.0155195082666975e-07, "loss": 0.6337, "step": 2613 }, { "epoch": 0.7126985208915547, "grad_norm": 1.8785670990548262, "learning_rate": 2.0119765816878908e-07, "loss": 0.6978, "step": 2614 }, { "epoch": 0.7129711676095699, "grad_norm": 2.2645129604457734, "learning_rate": 2.0084359872791147e-07, "loss": 0.6774, "step": 2615 }, { "epoch": 0.713243814327585, "grad_norm": 2.711715020648769, "learning_rate": 2.0048977278038166e-07, "loss": 0.739, "step": 2616 }, { "epoch": 0.7135164610456002, "grad_norm": 2.7783763376758945, "learning_rate": 2.001361806023636e-07, "loss": 0.6725, "step": 2617 }, { "epoch": 0.7137891077636153, "grad_norm": 3.6715105346575365, "learning_rate": 1.997828224698374e-07, "loss": 0.6587, "step": 2618 }, { "epoch": 0.7140617544816305, "grad_norm": 2.080434629394803, "learning_rate": 1.9942969865860186e-07, "loss": 0.6965, "step": 2619 }, { "epoch": 0.7143344011996455, "grad_norm": 2.262199364572189, "learning_rate": 1.9907680944427159e-07, "loss": 0.7177, "step": 2620 }, { "epoch": 0.7146070479176607, "grad_norm": 1.8663047053814563, "learning_rate": 1.9872415510227913e-07, "loss": 0.6637, "step": 2621 }, { "epoch": 0.7148796946356758, "grad_norm": 2.724750379330609, "learning_rate": 1.983717359078732e-07, "loss": 0.644, "step": 2622 }, { "epoch": 0.715152341353691, "grad_norm": 2.022865707172582, "learning_rate": 1.9801955213611897e-07, "loss": 0.6975, "step": 2623 }, { "epoch": 0.7154249880717061, "grad_norm": 4.717751404693212, "learning_rate": 1.9766760406189814e-07, "loss": 0.6874, "step": 2624 }, { "epoch": 0.7156976347897213, "grad_norm": 1.6916442264830789, "learning_rate": 1.973158919599081e-07, "loss": 0.7116, "step": 2625 }, { "epoch": 0.7159702815077363, "grad_norm": 7.464305915673656, "learning_rate": 1.9696441610466235e-07, "loss": 0.7055, "step": 2626 }, { "epoch": 0.7162429282257515, "grad_norm": 2.4085736020098225, "learning_rate": 1.9661317677048967e-07, "loss": 0.6702, "step": 2627 }, { "epoch": 0.7165155749437666, "grad_norm": 2.0762234549912173, "learning_rate": 1.9626217423153475e-07, "loss": 0.6929, "step": 2628 }, { "epoch": 0.7167882216617818, "grad_norm": 47.649187255236086, "learning_rate": 1.9591140876175704e-07, "loss": 0.7073, "step": 2629 }, { "epoch": 0.7170608683797969, "grad_norm": 2.4205203320500455, "learning_rate": 1.9556088063493103e-07, "loss": 0.7256, "step": 2630 }, { "epoch": 0.717333515097812, "grad_norm": 2.8815205612892734, "learning_rate": 1.9521059012464607e-07, "loss": 0.6421, "step": 2631 }, { "epoch": 0.7176061618158271, "grad_norm": 2.1278968972897716, "learning_rate": 1.9486053750430592e-07, "loss": 0.6066, "step": 2632 }, { "epoch": 0.7178788085338422, "grad_norm": 19.875065570693, "learning_rate": 1.945107230471288e-07, "loss": 0.7088, "step": 2633 }, { "epoch": 0.7181514552518574, "grad_norm": 3.4558865904526255, "learning_rate": 1.941611470261469e-07, "loss": 0.7629, "step": 2634 }, { "epoch": 0.7184241019698725, "grad_norm": 1.8518960101953712, "learning_rate": 1.9381180971420617e-07, "loss": 0.6835, "step": 2635 }, { "epoch": 0.7186967486878877, "grad_norm": 3.9980086852734376, "learning_rate": 1.9346271138396708e-07, "loss": 0.6611, "step": 2636 }, { "epoch": 0.7189693954059028, "grad_norm": 2.3198181442708843, "learning_rate": 1.9311385230790223e-07, "loss": 0.6284, "step": 2637 }, { "epoch": 0.719242042123918, "grad_norm": 1.9928865448759763, "learning_rate": 1.9276523275829887e-07, "loss": 0.6943, "step": 2638 }, { "epoch": 0.719514688841933, "grad_norm": 2.2093253331665474, "learning_rate": 1.9241685300725597e-07, "loss": 0.7372, "step": 2639 }, { "epoch": 0.7197873355599482, "grad_norm": 11.355793602506697, "learning_rate": 1.9206871332668646e-07, "loss": 0.7145, "step": 2640 }, { "epoch": 0.7200599822779633, "grad_norm": 2.262247876939364, "learning_rate": 1.917208139883153e-07, "loss": 0.6823, "step": 2641 }, { "epoch": 0.7203326289959785, "grad_norm": 2.7045433518602358, "learning_rate": 1.9137315526367996e-07, "loss": 0.6778, "step": 2642 }, { "epoch": 0.7206052757139936, "grad_norm": 2.3353828303830495, "learning_rate": 1.910257374241302e-07, "loss": 0.7278, "step": 2643 }, { "epoch": 0.7208779224320088, "grad_norm": 3.001560267087742, "learning_rate": 1.9067856074082762e-07, "loss": 0.6668, "step": 2644 }, { "epoch": 0.7211505691500238, "grad_norm": 2.082348899082839, "learning_rate": 1.9033162548474574e-07, "loss": 0.7317, "step": 2645 }, { "epoch": 0.721423215868039, "grad_norm": 4.82907101485024, "learning_rate": 1.8998493192666943e-07, "loss": 0.7264, "step": 2646 }, { "epoch": 0.7216958625860541, "grad_norm": 4.588091391784852, "learning_rate": 1.8963848033719536e-07, "loss": 0.6331, "step": 2647 }, { "epoch": 0.7219685093040693, "grad_norm": 4.021505620904213, "learning_rate": 1.8929227098673094e-07, "loss": 0.6718, "step": 2648 }, { "epoch": 0.7222411560220844, "grad_norm": 2.4249294393722667, "learning_rate": 1.8894630414549455e-07, "loss": 0.7124, "step": 2649 }, { "epoch": 0.7225138027400995, "grad_norm": 6.851075874746656, "learning_rate": 1.8860058008351543e-07, "loss": 0.6702, "step": 2650 }, { "epoch": 0.7227864494581147, "grad_norm": 4.176447379669318, "learning_rate": 1.8825509907063326e-07, "loss": 0.7111, "step": 2651 }, { "epoch": 0.7230590961761297, "grad_norm": 3.426121395708759, "learning_rate": 1.87909861376498e-07, "loss": 0.7308, "step": 2652 }, { "epoch": 0.7233317428941449, "grad_norm": 2.845247132862956, "learning_rate": 1.8756486727056969e-07, "loss": 0.6935, "step": 2653 }, { "epoch": 0.72360438961216, "grad_norm": 2.065810124380107, "learning_rate": 1.872201170221182e-07, "loss": 0.6949, "step": 2654 }, { "epoch": 0.7238770363301752, "grad_norm": 5.72294137668047, "learning_rate": 1.8687561090022357e-07, "loss": 0.645, "step": 2655 }, { "epoch": 0.7241496830481903, "grad_norm": 2.8025384101022532, "learning_rate": 1.8653134917377428e-07, "loss": 0.616, "step": 2656 }, { "epoch": 0.7244223297662055, "grad_norm": 2.2582580685932814, "learning_rate": 1.861873321114692e-07, "loss": 0.6729, "step": 2657 }, { "epoch": 0.7246949764842205, "grad_norm": 1.540407484440301, "learning_rate": 1.858435599818151e-07, "loss": 0.6517, "step": 2658 }, { "epoch": 0.7249676232022357, "grad_norm": 2.6731242709646894, "learning_rate": 1.855000330531289e-07, "loss": 0.7081, "step": 2659 }, { "epoch": 0.7252402699202508, "grad_norm": 2.630657012446173, "learning_rate": 1.8515675159353467e-07, "loss": 0.7305, "step": 2660 }, { "epoch": 0.725512916638266, "grad_norm": 2.138482010914321, "learning_rate": 1.8481371587096605e-07, "loss": 0.6148, "step": 2661 }, { "epoch": 0.7257855633562811, "grad_norm": 2.2231295381172584, "learning_rate": 1.8447092615316445e-07, "loss": 0.6845, "step": 2662 }, { "epoch": 0.7260582100742963, "grad_norm": 2.580068459356385, "learning_rate": 1.8412838270767917e-07, "loss": 0.7135, "step": 2663 }, { "epoch": 0.7263308567923114, "grad_norm": 1.9697799317812028, "learning_rate": 1.837860858018675e-07, "loss": 0.7697, "step": 2664 }, { "epoch": 0.7266035035103265, "grad_norm": 2.0398839879745294, "learning_rate": 1.8344403570289407e-07, "loss": 0.6942, "step": 2665 }, { "epoch": 0.7268761502283416, "grad_norm": 4.332760199050675, "learning_rate": 1.8310223267773111e-07, "loss": 0.6313, "step": 2666 }, { "epoch": 0.7271487969463568, "grad_norm": 12.228476384266907, "learning_rate": 1.8276067699315777e-07, "loss": 0.6326, "step": 2667 }, { "epoch": 0.7274214436643719, "grad_norm": 1.9031402010549152, "learning_rate": 1.8241936891576054e-07, "loss": 0.6864, "step": 2668 }, { "epoch": 0.727694090382387, "grad_norm": 2.2579426418701267, "learning_rate": 1.820783087119323e-07, "loss": 0.7015, "step": 2669 }, { "epoch": 0.7279667371004022, "grad_norm": 2.0821420949060014, "learning_rate": 1.8173749664787253e-07, "loss": 0.7054, "step": 2670 }, { "epoch": 0.7282393838184172, "grad_norm": 2.2605110142861724, "learning_rate": 1.813969329895871e-07, "loss": 0.7322, "step": 2671 }, { "epoch": 0.7285120305364324, "grad_norm": 3.2260645882212957, "learning_rate": 1.810566180028879e-07, "loss": 0.7699, "step": 2672 }, { "epoch": 0.7287846772544475, "grad_norm": 2.226303504099247, "learning_rate": 1.807165519533927e-07, "loss": 0.7133, "step": 2673 }, { "epoch": 0.7290573239724627, "grad_norm": 3.0228189636451113, "learning_rate": 1.8037673510652556e-07, "loss": 0.7601, "step": 2674 }, { "epoch": 0.7293299706904778, "grad_norm": 1.6372026878937709, "learning_rate": 1.8003716772751486e-07, "loss": 0.6597, "step": 2675 }, { "epoch": 0.729602617408493, "grad_norm": 7.619145706217559, "learning_rate": 1.796978500813957e-07, "loss": 0.6911, "step": 2676 }, { "epoch": 0.729875264126508, "grad_norm": 3.1137845704156004, "learning_rate": 1.7935878243300672e-07, "loss": 0.6781, "step": 2677 }, { "epoch": 0.7301479108445232, "grad_norm": 3.881285121232766, "learning_rate": 1.7901996504699302e-07, "loss": 0.6468, "step": 2678 }, { "epoch": 0.7304205575625383, "grad_norm": 1.960464254151021, "learning_rate": 1.7868139818780282e-07, "loss": 0.718, "step": 2679 }, { "epoch": 0.7306932042805535, "grad_norm": 2.4657204063353895, "learning_rate": 1.7834308211969013e-07, "loss": 0.7516, "step": 2680 }, { "epoch": 0.7309658509985686, "grad_norm": 4.2416822942125005, "learning_rate": 1.7800501710671246e-07, "loss": 0.6724, "step": 2681 }, { "epoch": 0.7312384977165838, "grad_norm": 2.071000234062977, "learning_rate": 1.7766720341273157e-07, "loss": 0.6997, "step": 2682 }, { "epoch": 0.7315111444345989, "grad_norm": 1.7534126099092517, "learning_rate": 1.7732964130141303e-07, "loss": 0.5859, "step": 2683 }, { "epoch": 0.731783791152614, "grad_norm": 2.724611005678987, "learning_rate": 1.7699233103622613e-07, "loss": 0.6865, "step": 2684 }, { "epoch": 0.7320564378706291, "grad_norm": 2.2831283675082266, "learning_rate": 1.7665527288044347e-07, "loss": 0.681, "step": 2685 }, { "epoch": 0.7323290845886443, "grad_norm": 2.4679384301744904, "learning_rate": 1.763184670971408e-07, "loss": 0.669, "step": 2686 }, { "epoch": 0.7326017313066594, "grad_norm": 2.2465373507248474, "learning_rate": 1.7598191394919737e-07, "loss": 0.6713, "step": 2687 }, { "epoch": 0.7328743780246745, "grad_norm": 2.9081476948392524, "learning_rate": 1.7564561369929477e-07, "loss": 0.7054, "step": 2688 }, { "epoch": 0.7331470247426897, "grad_norm": 2.776644672361417, "learning_rate": 1.753095666099173e-07, "loss": 0.7178, "step": 2689 }, { "epoch": 0.7334196714607047, "grad_norm": 2.1326966891222554, "learning_rate": 1.749737729433517e-07, "loss": 0.6912, "step": 2690 }, { "epoch": 0.7336923181787199, "grad_norm": 2.6088929710553583, "learning_rate": 1.74638232961687e-07, "loss": 0.6618, "step": 2691 }, { "epoch": 0.733964964896735, "grad_norm": 3.3580444960575595, "learning_rate": 1.7430294692681407e-07, "loss": 0.6311, "step": 2692 }, { "epoch": 0.7342376116147502, "grad_norm": 2.3750900194813744, "learning_rate": 1.7396791510042568e-07, "loss": 0.6899, "step": 2693 }, { "epoch": 0.7345102583327653, "grad_norm": 2.9925395382481477, "learning_rate": 1.73633137744016e-07, "loss": 0.7182, "step": 2694 }, { "epoch": 0.7347829050507805, "grad_norm": 3.798415363367152, "learning_rate": 1.732986151188812e-07, "loss": 0.7102, "step": 2695 }, { "epoch": 0.7350555517687956, "grad_norm": 2.9383458078243163, "learning_rate": 1.729643474861176e-07, "loss": 0.5975, "step": 2696 }, { "epoch": 0.7353281984868107, "grad_norm": 6.2835945942521825, "learning_rate": 1.7263033510662366e-07, "loss": 0.6672, "step": 2697 }, { "epoch": 0.7356008452048258, "grad_norm": 2.188275552788979, "learning_rate": 1.7229657824109745e-07, "loss": 0.6713, "step": 2698 }, { "epoch": 0.735873491922841, "grad_norm": 3.5104450477294975, "learning_rate": 1.7196307715003855e-07, "loss": 0.6066, "step": 2699 }, { "epoch": 0.7361461386408561, "grad_norm": 1.9579568583913103, "learning_rate": 1.716298320937465e-07, "loss": 0.6463, "step": 2700 }, { "epoch": 0.7364187853588713, "grad_norm": 4.398356006217219, "learning_rate": 1.7129684333232092e-07, "loss": 0.6248, "step": 2701 }, { "epoch": 0.7366914320768864, "grad_norm": 2.2987011623757403, "learning_rate": 1.7096411112566168e-07, "loss": 0.6862, "step": 2702 }, { "epoch": 0.7369640787949016, "grad_norm": 2.1806975798001575, "learning_rate": 1.7063163573346805e-07, "loss": 0.7318, "step": 2703 }, { "epoch": 0.7372367255129166, "grad_norm": 5.189975800516949, "learning_rate": 1.7029941741523923e-07, "loss": 0.661, "step": 2704 }, { "epoch": 0.7375093722309318, "grad_norm": 3.4381817772870336, "learning_rate": 1.6996745643027332e-07, "loss": 0.6845, "step": 2705 }, { "epoch": 0.7377820189489469, "grad_norm": 2.8653923598840807, "learning_rate": 1.6963575303766825e-07, "loss": 0.6284, "step": 2706 }, { "epoch": 0.738054665666962, "grad_norm": 4.748159591256162, "learning_rate": 1.6930430749632025e-07, "loss": 0.7499, "step": 2707 }, { "epoch": 0.7383273123849772, "grad_norm": 2.1791090359682204, "learning_rate": 1.6897312006492454e-07, "loss": 0.6849, "step": 2708 }, { "epoch": 0.7385999591029923, "grad_norm": 2.8769057907622257, "learning_rate": 1.6864219100197484e-07, "loss": 0.665, "step": 2709 }, { "epoch": 0.7388726058210074, "grad_norm": 3.6222549226689824, "learning_rate": 1.6831152056576336e-07, "loss": 0.6313, "step": 2710 }, { "epoch": 0.7391452525390225, "grad_norm": 3.4587356414686727, "learning_rate": 1.679811090143803e-07, "loss": 0.7165, "step": 2711 }, { "epoch": 0.7394178992570377, "grad_norm": 2.3358505195648283, "learning_rate": 1.676509566057138e-07, "loss": 0.6315, "step": 2712 }, { "epoch": 0.7396905459750528, "grad_norm": 3.4220409323971124, "learning_rate": 1.6732106359744962e-07, "loss": 0.6837, "step": 2713 }, { "epoch": 0.739963192693068, "grad_norm": 3.6272053364057752, "learning_rate": 1.6699143024707174e-07, "loss": 0.6732, "step": 2714 }, { "epoch": 0.7402358394110831, "grad_norm": 1.9356119711614073, "learning_rate": 1.666620568118603e-07, "loss": 0.6477, "step": 2715 }, { "epoch": 0.7405084861290983, "grad_norm": 4.438207358147968, "learning_rate": 1.6633294354889383e-07, "loss": 0.6296, "step": 2716 }, { "epoch": 0.7407811328471133, "grad_norm": 4.798596200371902, "learning_rate": 1.6600409071504652e-07, "loss": 0.6866, "step": 2717 }, { "epoch": 0.7410537795651285, "grad_norm": 2.641625162896288, "learning_rate": 1.6567549856699047e-07, "loss": 0.7022, "step": 2718 }, { "epoch": 0.7413264262831436, "grad_norm": 1.9750646642896514, "learning_rate": 1.653471673611937e-07, "loss": 0.6607, "step": 2719 }, { "epoch": 0.7415990730011588, "grad_norm": 3.694592257225065, "learning_rate": 1.6501909735392055e-07, "loss": 0.711, "step": 2720 }, { "epoch": 0.7418717197191739, "grad_norm": 1.9897625538354726, "learning_rate": 1.6469128880123172e-07, "loss": 0.6599, "step": 2721 }, { "epoch": 0.7421443664371891, "grad_norm": 3.389906152020769, "learning_rate": 1.6436374195898367e-07, "loss": 0.6944, "step": 2722 }, { "epoch": 0.7424170131552041, "grad_norm": 2.0230111862282554, "learning_rate": 1.640364570828287e-07, "loss": 0.7112, "step": 2723 }, { "epoch": 0.7426896598732192, "grad_norm": 11.995448983748414, "learning_rate": 1.637094344282144e-07, "loss": 0.6735, "step": 2724 }, { "epoch": 0.7429623065912344, "grad_norm": 2.2358950998177525, "learning_rate": 1.6338267425038443e-07, "loss": 0.6839, "step": 2725 }, { "epoch": 0.7432349533092495, "grad_norm": 2.3807522713823555, "learning_rate": 1.6305617680437635e-07, "loss": 0.7326, "step": 2726 }, { "epoch": 0.7435076000272647, "grad_norm": 3.6509825453107667, "learning_rate": 1.6272994234502379e-07, "loss": 0.6687, "step": 2727 }, { "epoch": 0.7437802467452798, "grad_norm": 2.136248589927064, "learning_rate": 1.6240397112695465e-07, "loss": 0.6601, "step": 2728 }, { "epoch": 0.744052893463295, "grad_norm": 2.225507681417115, "learning_rate": 1.620782634045913e-07, "loss": 0.6651, "step": 2729 }, { "epoch": 0.74432554018131, "grad_norm": 2.577514187409659, "learning_rate": 1.6175281943215064e-07, "loss": 0.687, "step": 2730 }, { "epoch": 0.7445981868993252, "grad_norm": 2.79902818297058, "learning_rate": 1.6142763946364357e-07, "loss": 0.6433, "step": 2731 }, { "epoch": 0.7448708336173403, "grad_norm": 2.6783949002125436, "learning_rate": 1.6110272375287482e-07, "loss": 0.7291, "step": 2732 }, { "epoch": 0.7451434803353555, "grad_norm": 2.115619394981251, "learning_rate": 1.607780725534435e-07, "loss": 0.6505, "step": 2733 }, { "epoch": 0.7454161270533706, "grad_norm": 3.168286482772017, "learning_rate": 1.6045368611874122e-07, "loss": 0.6801, "step": 2734 }, { "epoch": 0.7456887737713858, "grad_norm": 2.31278786068208, "learning_rate": 1.601295647019541e-07, "loss": 0.7308, "step": 2735 }, { "epoch": 0.7459614204894008, "grad_norm": 2.026184817445475, "learning_rate": 1.5980570855606024e-07, "loss": 0.7677, "step": 2736 }, { "epoch": 0.746234067207416, "grad_norm": 3.987621710439442, "learning_rate": 1.5948211793383188e-07, "loss": 0.7126, "step": 2737 }, { "epoch": 0.7465067139254311, "grad_norm": 2.490408826527614, "learning_rate": 1.591587930878328e-07, "loss": 0.7243, "step": 2738 }, { "epoch": 0.7467793606434463, "grad_norm": 2.6206269595906235, "learning_rate": 1.5883573427042045e-07, "loss": 0.6916, "step": 2739 }, { "epoch": 0.7470520073614614, "grad_norm": 2.1997287378236607, "learning_rate": 1.5851294173374397e-07, "loss": 0.6942, "step": 2740 }, { "epoch": 0.7473246540794766, "grad_norm": 1.8788903969807638, "learning_rate": 1.581904157297449e-07, "loss": 0.6405, "step": 2741 }, { "epoch": 0.7475973007974916, "grad_norm": 1.9720274815918115, "learning_rate": 1.5786815651015667e-07, "loss": 0.6946, "step": 2742 }, { "epoch": 0.7478699475155067, "grad_norm": 3.138623279266789, "learning_rate": 1.5754616432650443e-07, "loss": 0.7201, "step": 2743 }, { "epoch": 0.7481425942335219, "grad_norm": 5.946739512922376, "learning_rate": 1.5722443943010531e-07, "loss": 0.6367, "step": 2744 }, { "epoch": 0.748415240951537, "grad_norm": 1.8295061585178058, "learning_rate": 1.56902982072067e-07, "loss": 0.6218, "step": 2745 }, { "epoch": 0.7486878876695522, "grad_norm": 2.5750570399481747, "learning_rate": 1.565817925032893e-07, "loss": 0.7194, "step": 2746 }, { "epoch": 0.7489605343875673, "grad_norm": 4.45127465358972, "learning_rate": 1.562608709744624e-07, "loss": 0.6227, "step": 2747 }, { "epoch": 0.7492331811055825, "grad_norm": 2.4032225881353617, "learning_rate": 1.5594021773606753e-07, "loss": 0.8069, "step": 2748 }, { "epoch": 0.7495058278235975, "grad_norm": 2.3907783544562387, "learning_rate": 1.5561983303837638e-07, "loss": 0.6997, "step": 2749 }, { "epoch": 0.7497784745416127, "grad_norm": 2.9174679461920032, "learning_rate": 1.5529971713145112e-07, "loss": 0.7445, "step": 2750 }, { "epoch": 0.7500511212596278, "grad_norm": 4.0397475946642825, "learning_rate": 1.549798702651442e-07, "loss": 0.6813, "step": 2751 }, { "epoch": 0.750323767977643, "grad_norm": 2.213096162366433, "learning_rate": 1.5466029268909787e-07, "loss": 0.6249, "step": 2752 }, { "epoch": 0.7505964146956581, "grad_norm": 2.1495697692552813, "learning_rate": 1.5434098465274425e-07, "loss": 0.7566, "step": 2753 }, { "epoch": 0.7508690614136733, "grad_norm": 5.561949167345263, "learning_rate": 1.5402194640530563e-07, "loss": 0.7002, "step": 2754 }, { "epoch": 0.7511417081316883, "grad_norm": 2.5568364455973627, "learning_rate": 1.5370317819579259e-07, "loss": 0.706, "step": 2755 }, { "epoch": 0.7514143548497035, "grad_norm": 2.8291325484415295, "learning_rate": 1.5338468027300626e-07, "loss": 0.719, "step": 2756 }, { "epoch": 0.7516870015677186, "grad_norm": 2.957600451821884, "learning_rate": 1.5306645288553555e-07, "loss": 0.6584, "step": 2757 }, { "epoch": 0.7519596482857338, "grad_norm": 2.184049211801403, "learning_rate": 1.5274849628175922e-07, "loss": 0.7082, "step": 2758 }, { "epoch": 0.7522322950037489, "grad_norm": 3.8461040450318724, "learning_rate": 1.5243081070984426e-07, "loss": 0.7332, "step": 2759 }, { "epoch": 0.7525049417217641, "grad_norm": 1.89342777402012, "learning_rate": 1.5211339641774608e-07, "loss": 0.6401, "step": 2760 }, { "epoch": 0.7527775884397792, "grad_norm": 2.1895106863508187, "learning_rate": 1.5179625365320837e-07, "loss": 0.7402, "step": 2761 }, { "epoch": 0.7530502351577942, "grad_norm": 3.922899674078735, "learning_rate": 1.5147938266376282e-07, "loss": 0.6633, "step": 2762 }, { "epoch": 0.7533228818758094, "grad_norm": 1.7247546153599365, "learning_rate": 1.5116278369672965e-07, "loss": 0.6656, "step": 2763 }, { "epoch": 0.7535955285938245, "grad_norm": 3.19965419941126, "learning_rate": 1.508464569992155e-07, "loss": 0.6506, "step": 2764 }, { "epoch": 0.7538681753118397, "grad_norm": 4.484037799134723, "learning_rate": 1.5053040281811579e-07, "loss": 0.7043, "step": 2765 }, { "epoch": 0.7541408220298548, "grad_norm": 4.184263119932239, "learning_rate": 1.5021462140011253e-07, "loss": 0.6332, "step": 2766 }, { "epoch": 0.75441346874787, "grad_norm": 9.46655939023751, "learning_rate": 1.4989911299167496e-07, "loss": 0.6838, "step": 2767 }, { "epoch": 0.754686115465885, "grad_norm": 4.18521169686322, "learning_rate": 1.4958387783905925e-07, "loss": 0.6484, "step": 2768 }, { "epoch": 0.7549587621839002, "grad_norm": 16.142116320823725, "learning_rate": 1.4926891618830834e-07, "loss": 0.6478, "step": 2769 }, { "epoch": 0.7552314089019153, "grad_norm": 1.9380065619171494, "learning_rate": 1.4895422828525166e-07, "loss": 0.6638, "step": 2770 }, { "epoch": 0.7555040556199305, "grad_norm": 2.4236718096262084, "learning_rate": 1.4863981437550498e-07, "loss": 0.6967, "step": 2771 }, { "epoch": 0.7557767023379456, "grad_norm": 2.050271075461488, "learning_rate": 1.483256747044701e-07, "loss": 0.6346, "step": 2772 }, { "epoch": 0.7560493490559608, "grad_norm": 2.07679222519041, "learning_rate": 1.4801180951733534e-07, "loss": 0.6787, "step": 2773 }, { "epoch": 0.7563219957739759, "grad_norm": 2.3838967076483772, "learning_rate": 1.476982190590737e-07, "loss": 0.6892, "step": 2774 }, { "epoch": 0.756594642491991, "grad_norm": 2.0947326209897352, "learning_rate": 1.4738490357444505e-07, "loss": 0.6976, "step": 2775 }, { "epoch": 0.7568672892100061, "grad_norm": 3.225223986710221, "learning_rate": 1.4707186330799337e-07, "loss": 0.6752, "step": 2776 }, { "epoch": 0.7571399359280213, "grad_norm": 2.454947911321185, "learning_rate": 1.4675909850404888e-07, "loss": 0.6099, "step": 2777 }, { "epoch": 0.7574125826460364, "grad_norm": 2.2493985856974987, "learning_rate": 1.4644660940672627e-07, "loss": 0.7451, "step": 2778 }, { "epoch": 0.7576852293640516, "grad_norm": 1.7752450454205844, "learning_rate": 1.4613439625992507e-07, "loss": 0.6097, "step": 2779 }, { "epoch": 0.7579578760820667, "grad_norm": 2.9912355350793223, "learning_rate": 1.458224593073295e-07, "loss": 0.6669, "step": 2780 }, { "epoch": 0.7582305228000817, "grad_norm": 1.6996900569512319, "learning_rate": 1.4551079879240813e-07, "loss": 0.6485, "step": 2781 }, { "epoch": 0.7585031695180969, "grad_norm": 2.3960472434373643, "learning_rate": 1.451994149584142e-07, "loss": 0.7044, "step": 2782 }, { "epoch": 0.758775816236112, "grad_norm": 2.120299503482003, "learning_rate": 1.4488830804838408e-07, "loss": 0.6741, "step": 2783 }, { "epoch": 0.7590484629541272, "grad_norm": 2.2869770440644497, "learning_rate": 1.4457747830513922e-07, "loss": 0.599, "step": 2784 }, { "epoch": 0.7593211096721423, "grad_norm": 2.4627044973934034, "learning_rate": 1.4426692597128336e-07, "loss": 0.7291, "step": 2785 }, { "epoch": 0.7595937563901575, "grad_norm": 2.4214936441318717, "learning_rate": 1.4395665128920503e-07, "loss": 0.7499, "step": 2786 }, { "epoch": 0.7598664031081726, "grad_norm": 2.0739549444464154, "learning_rate": 1.4364665450107516e-07, "loss": 0.6831, "step": 2787 }, { "epoch": 0.7601390498261877, "grad_norm": 2.297956717553249, "learning_rate": 1.433369358488482e-07, "loss": 0.7085, "step": 2788 }, { "epoch": 0.7604116965442028, "grad_norm": 3.9708012875454934, "learning_rate": 1.4302749557426136e-07, "loss": 0.6257, "step": 2789 }, { "epoch": 0.760684343262218, "grad_norm": 2.6653710375206825, "learning_rate": 1.4271833391883464e-07, "loss": 0.6733, "step": 2790 }, { "epoch": 0.7609569899802331, "grad_norm": 2.126728203032045, "learning_rate": 1.4240945112387048e-07, "loss": 0.7207, "step": 2791 }, { "epoch": 0.7612296366982483, "grad_norm": 41.40803641489434, "learning_rate": 1.4210084743045382e-07, "loss": 0.6733, "step": 2792 }, { "epoch": 0.7615022834162634, "grad_norm": 2.627708293633484, "learning_rate": 1.4179252307945145e-07, "loss": 0.6754, "step": 2793 }, { "epoch": 0.7617749301342785, "grad_norm": 3.145830892232624, "learning_rate": 1.4148447831151284e-07, "loss": 0.6767, "step": 2794 }, { "epoch": 0.7620475768522936, "grad_norm": 1.5621538204059255, "learning_rate": 1.4117671336706815e-07, "loss": 0.5799, "step": 2795 }, { "epoch": 0.7623202235703088, "grad_norm": 2.557750469909881, "learning_rate": 1.4086922848633026e-07, "loss": 0.7032, "step": 2796 }, { "epoch": 0.7625928702883239, "grad_norm": 5.111881635826964, "learning_rate": 1.4056202390929272e-07, "loss": 0.7859, "step": 2797 }, { "epoch": 0.7628655170063391, "grad_norm": 2.3684870674820098, "learning_rate": 1.4025509987573058e-07, "loss": 0.6917, "step": 2798 }, { "epoch": 0.7631381637243542, "grad_norm": 2.3066301089609444, "learning_rate": 1.3994845662519983e-07, "loss": 0.6543, "step": 2799 }, { "epoch": 0.7634108104423692, "grad_norm": 3.1845611494816466, "learning_rate": 1.3964209439703716e-07, "loss": 0.6724, "step": 2800 }, { "epoch": 0.7636834571603844, "grad_norm": 2.1893809346910023, "learning_rate": 1.3933601343036068e-07, "loss": 0.6414, "step": 2801 }, { "epoch": 0.7639561038783995, "grad_norm": 2.7947497385662565, "learning_rate": 1.3903021396406767e-07, "loss": 0.7526, "step": 2802 }, { "epoch": 0.7642287505964147, "grad_norm": 8.430222738305996, "learning_rate": 1.3872469623683714e-07, "loss": 0.7167, "step": 2803 }, { "epoch": 0.7645013973144298, "grad_norm": 6.4788426083392885, "learning_rate": 1.3841946048712684e-07, "loss": 0.7103, "step": 2804 }, { "epoch": 0.764774044032445, "grad_norm": 2.668073074766922, "learning_rate": 1.3811450695317557e-07, "loss": 0.6709, "step": 2805 }, { "epoch": 0.7650466907504601, "grad_norm": 3.7374333984983985, "learning_rate": 1.3780983587300115e-07, "loss": 0.7142, "step": 2806 }, { "epoch": 0.7653193374684752, "grad_norm": 2.422298260860935, "learning_rate": 1.3750544748440125e-07, "loss": 0.7419, "step": 2807 }, { "epoch": 0.7655919841864903, "grad_norm": 3.3125180152762264, "learning_rate": 1.372013420249527e-07, "loss": 0.7047, "step": 2808 }, { "epoch": 0.7658646309045055, "grad_norm": 2.9924791134101674, "learning_rate": 1.368975197320118e-07, "loss": 0.7624, "step": 2809 }, { "epoch": 0.7661372776225206, "grad_norm": 5.781722280462671, "learning_rate": 1.3659398084271344e-07, "loss": 0.7304, "step": 2810 }, { "epoch": 0.7664099243405358, "grad_norm": 5.828819080086589, "learning_rate": 1.3629072559397176e-07, "loss": 0.6902, "step": 2811 }, { "epoch": 0.7666825710585509, "grad_norm": 2.422429709502902, "learning_rate": 1.3598775422247894e-07, "loss": 0.6853, "step": 2812 }, { "epoch": 0.7669552177765661, "grad_norm": 2.560552562368761, "learning_rate": 1.3568506696470643e-07, "loss": 0.6966, "step": 2813 }, { "epoch": 0.7672278644945811, "grad_norm": 5.921855999941797, "learning_rate": 1.3538266405690292e-07, "loss": 0.6422, "step": 2814 }, { "epoch": 0.7675005112125963, "grad_norm": 4.231942432966812, "learning_rate": 1.3508054573509604e-07, "loss": 0.6861, "step": 2815 }, { "epoch": 0.7677731579306114, "grad_norm": 3.594347496748205, "learning_rate": 1.347787122350908e-07, "loss": 0.6784, "step": 2816 }, { "epoch": 0.7680458046486266, "grad_norm": 1.8135757312859833, "learning_rate": 1.3447716379247003e-07, "loss": 0.6548, "step": 2817 }, { "epoch": 0.7683184513666417, "grad_norm": 3.9380674779366083, "learning_rate": 1.3417590064259409e-07, "loss": 0.6859, "step": 2818 }, { "epoch": 0.7685910980846568, "grad_norm": 12.335629036210676, "learning_rate": 1.3387492302060054e-07, "loss": 0.6757, "step": 2819 }, { "epoch": 0.768863744802672, "grad_norm": 2.7715190493122464, "learning_rate": 1.3357423116140466e-07, "loss": 0.6543, "step": 2820 }, { "epoch": 0.769136391520687, "grad_norm": 2.4151111841420247, "learning_rate": 1.3327382529969755e-07, "loss": 0.7393, "step": 2821 }, { "epoch": 0.7694090382387022, "grad_norm": 1.8620134981010497, "learning_rate": 1.3297370566994843e-07, "loss": 0.6946, "step": 2822 }, { "epoch": 0.7696816849567173, "grad_norm": 2.6400910276229337, "learning_rate": 1.3267387250640182e-07, "loss": 0.6936, "step": 2823 }, { "epoch": 0.7699543316747325, "grad_norm": 2.271089214099039, "learning_rate": 1.3237432604307997e-07, "loss": 0.6626, "step": 2824 }, { "epoch": 0.7702269783927476, "grad_norm": 2.55870789222881, "learning_rate": 1.3207506651377998e-07, "loss": 0.7242, "step": 2825 }, { "epoch": 0.7704996251107628, "grad_norm": 2.3562825569558408, "learning_rate": 1.317760941520762e-07, "loss": 0.6643, "step": 2826 }, { "epoch": 0.7707722718287778, "grad_norm": 3.4071813475258548, "learning_rate": 1.3147740919131812e-07, "loss": 0.7226, "step": 2827 }, { "epoch": 0.771044918546793, "grad_norm": 2.3357829092902245, "learning_rate": 1.3117901186463114e-07, "loss": 0.6603, "step": 2828 }, { "epoch": 0.7713175652648081, "grad_norm": 2.3919801978415927, "learning_rate": 1.3088090240491622e-07, "loss": 0.7024, "step": 2829 }, { "epoch": 0.7715902119828233, "grad_norm": 1.8958013674423537, "learning_rate": 1.305830810448495e-07, "loss": 0.7086, "step": 2830 }, { "epoch": 0.7718628587008384, "grad_norm": 2.326666830638141, "learning_rate": 1.302855480168822e-07, "loss": 0.661, "step": 2831 }, { "epoch": 0.7721355054188536, "grad_norm": 2.158673130658451, "learning_rate": 1.2998830355324097e-07, "loss": 0.7033, "step": 2832 }, { "epoch": 0.7724081521368686, "grad_norm": 2.5603287263196357, "learning_rate": 1.2969134788592646e-07, "loss": 0.6814, "step": 2833 }, { "epoch": 0.7726807988548838, "grad_norm": 5.5749737074909405, "learning_rate": 1.293946812467146e-07, "loss": 0.7368, "step": 2834 }, { "epoch": 0.7729534455728989, "grad_norm": 3.1284582187634387, "learning_rate": 1.2909830386715543e-07, "loss": 0.664, "step": 2835 }, { "epoch": 0.7732260922909141, "grad_norm": 2.5116277828976585, "learning_rate": 1.2880221597857322e-07, "loss": 0.656, "step": 2836 }, { "epoch": 0.7734987390089292, "grad_norm": 2.705502889338598, "learning_rate": 1.285064178120663e-07, "loss": 0.6499, "step": 2837 }, { "epoch": 0.7737713857269443, "grad_norm": 6.113995866059347, "learning_rate": 1.2821090959850694e-07, "loss": 0.6096, "step": 2838 }, { "epoch": 0.7740440324449595, "grad_norm": 2.772890904728369, "learning_rate": 1.2791569156854104e-07, "loss": 0.6978, "step": 2839 }, { "epoch": 0.7743166791629745, "grad_norm": 2.335150544707692, "learning_rate": 1.2762076395258785e-07, "loss": 0.6966, "step": 2840 }, { "epoch": 0.7745893258809897, "grad_norm": 1.79253652052609, "learning_rate": 1.2732612698084067e-07, "loss": 0.6793, "step": 2841 }, { "epoch": 0.7748619725990048, "grad_norm": 2.2741257470256944, "learning_rate": 1.2703178088326467e-07, "loss": 0.7167, "step": 2842 }, { "epoch": 0.77513461931702, "grad_norm": 1.8207483993353843, "learning_rate": 1.267377258895994e-07, "loss": 0.629, "step": 2843 }, { "epoch": 0.7754072660350351, "grad_norm": 3.6572043803004775, "learning_rate": 1.2644396222935594e-07, "loss": 0.6824, "step": 2844 }, { "epoch": 0.7756799127530503, "grad_norm": 2.7107607293991767, "learning_rate": 1.26150490131819e-07, "loss": 0.7089, "step": 2845 }, { "epoch": 0.7759525594710653, "grad_norm": 3.0174706142078422, "learning_rate": 1.2585730982604513e-07, "loss": 0.6346, "step": 2846 }, { "epoch": 0.7762252061890805, "grad_norm": 2.285539853309283, "learning_rate": 1.2556442154086338e-07, "loss": 0.6655, "step": 2847 }, { "epoch": 0.7764978529070956, "grad_norm": 3.1896055279834026, "learning_rate": 1.252718255048747e-07, "loss": 0.6518, "step": 2848 }, { "epoch": 0.7767704996251108, "grad_norm": 1.7369319443588802, "learning_rate": 1.2497952194645207e-07, "loss": 0.646, "step": 2849 }, { "epoch": 0.7770431463431259, "grad_norm": 2.145359354531418, "learning_rate": 1.2468751109374014e-07, "loss": 0.7157, "step": 2850 }, { "epoch": 0.7773157930611411, "grad_norm": 2.29145914944003, "learning_rate": 1.2439579317465514e-07, "loss": 0.7236, "step": 2851 }, { "epoch": 0.7775884397791561, "grad_norm": 6.082387483288597, "learning_rate": 1.241043684168845e-07, "loss": 0.6843, "step": 2852 }, { "epoch": 0.7778610864971713, "grad_norm": 1.9334068576774341, "learning_rate": 1.238132370478872e-07, "loss": 0.6701, "step": 2853 }, { "epoch": 0.7781337332151864, "grad_norm": 2.377120191403558, "learning_rate": 1.2352239929489294e-07, "loss": 0.6763, "step": 2854 }, { "epoch": 0.7784063799332016, "grad_norm": 2.465566810094678, "learning_rate": 1.2323185538490228e-07, "loss": 0.6923, "step": 2855 }, { "epoch": 0.7786790266512167, "grad_norm": 2.735768896378126, "learning_rate": 1.2294160554468646e-07, "loss": 0.7491, "step": 2856 }, { "epoch": 0.7789516733692318, "grad_norm": 1.9258493394013838, "learning_rate": 1.226516500007872e-07, "loss": 0.7405, "step": 2857 }, { "epoch": 0.779224320087247, "grad_norm": 1.9420142258307134, "learning_rate": 1.2236198897951655e-07, "loss": 0.654, "step": 2858 }, { "epoch": 0.779496966805262, "grad_norm": 4.675104179724607, "learning_rate": 1.220726227069565e-07, "loss": 0.6568, "step": 2859 }, { "epoch": 0.7797696135232772, "grad_norm": 5.1557296970127515, "learning_rate": 1.2178355140895956e-07, "loss": 0.6302, "step": 2860 }, { "epoch": 0.7800422602412923, "grad_norm": 2.1381002488420333, "learning_rate": 1.2149477531114704e-07, "loss": 0.6497, "step": 2861 }, { "epoch": 0.7803149069593075, "grad_norm": 3.02538736031551, "learning_rate": 1.2120629463891096e-07, "loss": 0.6827, "step": 2862 }, { "epoch": 0.7805875536773226, "grad_norm": 1.8266324131976486, "learning_rate": 1.2091810961741166e-07, "loss": 0.6165, "step": 2863 }, { "epoch": 0.7808602003953378, "grad_norm": 2.3273498775690133, "learning_rate": 1.206302204715796e-07, "loss": 0.6608, "step": 2864 }, { "epoch": 0.7811328471133528, "grad_norm": 3.890358266719084, "learning_rate": 1.2034262742611396e-07, "loss": 0.6699, "step": 2865 }, { "epoch": 0.781405493831368, "grad_norm": 2.1478155301714277, "learning_rate": 1.2005533070548273e-07, "loss": 0.7627, "step": 2866 }, { "epoch": 0.7816781405493831, "grad_norm": 1.844436949390213, "learning_rate": 1.1976833053392277e-07, "loss": 0.6944, "step": 2867 }, { "epoch": 0.7819507872673983, "grad_norm": 2.5085980728009187, "learning_rate": 1.194816271354393e-07, "loss": 0.6745, "step": 2868 }, { "epoch": 0.7822234339854134, "grad_norm": 1.9619427950579742, "learning_rate": 1.1919522073380612e-07, "loss": 0.6562, "step": 2869 }, { "epoch": 0.7824960807034286, "grad_norm": 3.3800933482002886, "learning_rate": 1.1890911155256511e-07, "loss": 0.6746, "step": 2870 }, { "epoch": 0.7827687274214437, "grad_norm": 1.8440775528002127, "learning_rate": 1.1862329981502606e-07, "loss": 0.6446, "step": 2871 }, { "epoch": 0.7830413741394588, "grad_norm": 2.0488296320761212, "learning_rate": 1.18337785744267e-07, "loss": 0.6785, "step": 2872 }, { "epoch": 0.7833140208574739, "grad_norm": 2.331187175344418, "learning_rate": 1.180525695631332e-07, "loss": 0.6769, "step": 2873 }, { "epoch": 0.7835866675754891, "grad_norm": 1.7893177918752012, "learning_rate": 1.1776765149423761e-07, "loss": 0.6401, "step": 2874 }, { "epoch": 0.7838593142935042, "grad_norm": 2.0954405999304426, "learning_rate": 1.1748303175996044e-07, "loss": 0.6902, "step": 2875 }, { "epoch": 0.7841319610115193, "grad_norm": 3.713321356209413, "learning_rate": 1.1719871058244913e-07, "loss": 0.6487, "step": 2876 }, { "epoch": 0.7844046077295345, "grad_norm": 2.978196609599795, "learning_rate": 1.16914688183618e-07, "loss": 0.6742, "step": 2877 }, { "epoch": 0.7846772544475495, "grad_norm": 2.1733782893459765, "learning_rate": 1.1663096478514805e-07, "loss": 0.7367, "step": 2878 }, { "epoch": 0.7849499011655647, "grad_norm": 3.131142691912309, "learning_rate": 1.1634754060848761e-07, "loss": 0.6241, "step": 2879 }, { "epoch": 0.7852225478835798, "grad_norm": 4.336761852515144, "learning_rate": 1.1606441587485033e-07, "loss": 0.6921, "step": 2880 }, { "epoch": 0.785495194601595, "grad_norm": 3.1725658789547144, "learning_rate": 1.1578159080521721e-07, "loss": 0.6063, "step": 2881 }, { "epoch": 0.7857678413196101, "grad_norm": 2.4479602430150376, "learning_rate": 1.1549906562033451e-07, "loss": 0.684, "step": 2882 }, { "epoch": 0.7860404880376253, "grad_norm": 2.2149293688581593, "learning_rate": 1.1521684054071523e-07, "loss": 0.6677, "step": 2883 }, { "epoch": 0.7863131347556404, "grad_norm": 3.358673533712446, "learning_rate": 1.1493491578663717e-07, "loss": 0.7336, "step": 2884 }, { "epoch": 0.7865857814736555, "grad_norm": 2.3920214239294153, "learning_rate": 1.146532915781448e-07, "loss": 0.7234, "step": 2885 }, { "epoch": 0.7868584281916706, "grad_norm": 2.542504082287346, "learning_rate": 1.1437196813504723e-07, "loss": 0.7347, "step": 2886 }, { "epoch": 0.7871310749096858, "grad_norm": 2.1034667114579886, "learning_rate": 1.1409094567691918e-07, "loss": 0.6391, "step": 2887 }, { "epoch": 0.7874037216277009, "grad_norm": 2.4835083713370394, "learning_rate": 1.1381022442310029e-07, "loss": 0.7675, "step": 2888 }, { "epoch": 0.7876763683457161, "grad_norm": 2.1504403259851035, "learning_rate": 1.1352980459269517e-07, "loss": 0.6369, "step": 2889 }, { "epoch": 0.7879490150637312, "grad_norm": 2.7913148619057178, "learning_rate": 1.1324968640457306e-07, "loss": 0.7233, "step": 2890 }, { "epoch": 0.7882216617817464, "grad_norm": 2.4979013729303214, "learning_rate": 1.1296987007736809e-07, "loss": 0.7164, "step": 2891 }, { "epoch": 0.7884943084997614, "grad_norm": 2.1962277899730895, "learning_rate": 1.1269035582947839e-07, "loss": 0.6982, "step": 2892 }, { "epoch": 0.7887669552177765, "grad_norm": 2.3888551081444427, "learning_rate": 1.1241114387906664e-07, "loss": 0.6573, "step": 2893 }, { "epoch": 0.7890396019357917, "grad_norm": 2.4347276380330825, "learning_rate": 1.1213223444405934e-07, "loss": 0.6929, "step": 2894 }, { "epoch": 0.7893122486538068, "grad_norm": 2.104615380670339, "learning_rate": 1.1185362774214691e-07, "loss": 0.7202, "step": 2895 }, { "epoch": 0.789584895371822, "grad_norm": 2.1599347448106387, "learning_rate": 1.1157532399078362e-07, "loss": 0.7277, "step": 2896 }, { "epoch": 0.789857542089837, "grad_norm": 2.398997985913063, "learning_rate": 1.11297323407187e-07, "loss": 0.7433, "step": 2897 }, { "epoch": 0.7901301888078522, "grad_norm": 3.335190394408619, "learning_rate": 1.1101962620833866e-07, "loss": 0.6958, "step": 2898 }, { "epoch": 0.7904028355258673, "grad_norm": 5.3452156263259045, "learning_rate": 1.1074223261098225e-07, "loss": 0.6582, "step": 2899 }, { "epoch": 0.7906754822438825, "grad_norm": 1.9317951304717846, "learning_rate": 1.1046514283162578e-07, "loss": 0.692, "step": 2900 }, { "epoch": 0.7909481289618976, "grad_norm": 2.26388720264565, "learning_rate": 1.1018835708653878e-07, "loss": 0.6138, "step": 2901 }, { "epoch": 0.7912207756799128, "grad_norm": 2.56949185517022, "learning_rate": 1.0991187559175485e-07, "loss": 0.6789, "step": 2902 }, { "epoch": 0.7914934223979279, "grad_norm": 2.0434663270209255, "learning_rate": 1.0963569856306886e-07, "loss": 0.6549, "step": 2903 }, { "epoch": 0.791766069115943, "grad_norm": 2.8172036927616366, "learning_rate": 1.0935982621603895e-07, "loss": 0.6553, "step": 2904 }, { "epoch": 0.7920387158339581, "grad_norm": 2.562683091486944, "learning_rate": 1.090842587659851e-07, "loss": 0.6464, "step": 2905 }, { "epoch": 0.7923113625519733, "grad_norm": 2.240331638811781, "learning_rate": 1.088089964279893e-07, "loss": 0.7207, "step": 2906 }, { "epoch": 0.7925840092699884, "grad_norm": 2.623659260387177, "learning_rate": 1.0853403941689542e-07, "loss": 0.6739, "step": 2907 }, { "epoch": 0.7928566559880036, "grad_norm": 7.137248103137001, "learning_rate": 1.0825938794730904e-07, "loss": 0.7129, "step": 2908 }, { "epoch": 0.7931293027060187, "grad_norm": 2.2958220319791462, "learning_rate": 1.0798504223359728e-07, "loss": 0.6146, "step": 2909 }, { "epoch": 0.7934019494240339, "grad_norm": 3.479304559788808, "learning_rate": 1.0771100248988862e-07, "loss": 0.7497, "step": 2910 }, { "epoch": 0.7936745961420489, "grad_norm": 1.9194973697742415, "learning_rate": 1.0743726893007254e-07, "loss": 0.7379, "step": 2911 }, { "epoch": 0.793947242860064, "grad_norm": 3.5780576452993786, "learning_rate": 1.0716384176779996e-07, "loss": 0.6505, "step": 2912 }, { "epoch": 0.7942198895780792, "grad_norm": 2.3523689729687036, "learning_rate": 1.0689072121648229e-07, "loss": 0.7494, "step": 2913 }, { "epoch": 0.7944925362960943, "grad_norm": 2.2081413629720488, "learning_rate": 1.0661790748929179e-07, "loss": 0.6379, "step": 2914 }, { "epoch": 0.7947651830141095, "grad_norm": 4.759412931649774, "learning_rate": 1.0634540079916116e-07, "loss": 0.6103, "step": 2915 }, { "epoch": 0.7950378297321246, "grad_norm": 2.141294931875405, "learning_rate": 1.0607320135878345e-07, "loss": 0.7076, "step": 2916 }, { "epoch": 0.7953104764501397, "grad_norm": 3.1261070492194394, "learning_rate": 1.05801309380612e-07, "loss": 0.6754, "step": 2917 }, { "epoch": 0.7955831231681548, "grad_norm": 1.9489275691596402, "learning_rate": 1.0552972507685992e-07, "loss": 0.6418, "step": 2918 }, { "epoch": 0.79585576988617, "grad_norm": 3.1659144387157765, "learning_rate": 1.0525844865950084e-07, "loss": 0.7653, "step": 2919 }, { "epoch": 0.7961284166041851, "grad_norm": 3.548455730003263, "learning_rate": 1.0498748034026705e-07, "loss": 0.7413, "step": 2920 }, { "epoch": 0.7964010633222003, "grad_norm": 2.213050189745575, "learning_rate": 1.0471682033065144e-07, "loss": 0.726, "step": 2921 }, { "epoch": 0.7966737100402154, "grad_norm": 2.6032948181760767, "learning_rate": 1.0444646884190527e-07, "loss": 0.656, "step": 2922 }, { "epoch": 0.7969463567582306, "grad_norm": 2.4169431634933436, "learning_rate": 1.0417642608503985e-07, "loss": 0.6485, "step": 2923 }, { "epoch": 0.7972190034762456, "grad_norm": 2.2293914809644586, "learning_rate": 1.0390669227082505e-07, "loss": 0.6564, "step": 2924 }, { "epoch": 0.7974916501942608, "grad_norm": 2.1671316824858438, "learning_rate": 1.0363726760978969e-07, "loss": 0.7509, "step": 2925 }, { "epoch": 0.7977642969122759, "grad_norm": 6.553976766242824, "learning_rate": 1.0336815231222129e-07, "loss": 0.6817, "step": 2926 }, { "epoch": 0.7980369436302911, "grad_norm": 2.167726999782385, "learning_rate": 1.0309934658816605e-07, "loss": 0.6399, "step": 2927 }, { "epoch": 0.7983095903483062, "grad_norm": 2.0792396532211543, "learning_rate": 1.028308506474283e-07, "loss": 0.6837, "step": 2928 }, { "epoch": 0.7985822370663214, "grad_norm": 2.104104313531672, "learning_rate": 1.0256266469957076e-07, "loss": 0.6083, "step": 2929 }, { "epoch": 0.7988548837843364, "grad_norm": 2.5474737013895403, "learning_rate": 1.0229478895391408e-07, "loss": 0.6141, "step": 2930 }, { "epoch": 0.7991275305023515, "grad_norm": 2.343736434752547, "learning_rate": 1.0202722361953708e-07, "loss": 0.6651, "step": 2931 }, { "epoch": 0.7994001772203667, "grad_norm": 2.2367505184547722, "learning_rate": 1.0175996890527593e-07, "loss": 0.7255, "step": 2932 }, { "epoch": 0.7996728239383818, "grad_norm": 1.7093276222404032, "learning_rate": 1.0149302501972462e-07, "loss": 0.6841, "step": 2933 }, { "epoch": 0.799945470656397, "grad_norm": 1.99886484815243, "learning_rate": 1.0122639217123436e-07, "loss": 0.6574, "step": 2934 }, { "epoch": 0.8002181173744121, "grad_norm": 3.60531972276643, "learning_rate": 1.0096007056791373e-07, "loss": 0.6531, "step": 2935 }, { "epoch": 0.8004907640924273, "grad_norm": 1.8277464481951822, "learning_rate": 1.0069406041762824e-07, "loss": 0.6504, "step": 2936 }, { "epoch": 0.8007634108104423, "grad_norm": 6.298525448165673, "learning_rate": 1.0042836192800035e-07, "loss": 0.7026, "step": 2937 }, { "epoch": 0.8010360575284575, "grad_norm": 2.852130898575966, "learning_rate": 1.001629753064096e-07, "loss": 0.7265, "step": 2938 }, { "epoch": 0.8013087042464726, "grad_norm": 2.9526451754073095, "learning_rate": 9.989790075999144e-08, "loss": 0.7401, "step": 2939 }, { "epoch": 0.8015813509644878, "grad_norm": 2.0713694786380343, "learning_rate": 9.963313849563854e-08, "loss": 0.6815, "step": 2940 }, { "epoch": 0.8018539976825029, "grad_norm": 2.2795228974487975, "learning_rate": 9.936868871999893e-08, "loss": 0.683, "step": 2941 }, { "epoch": 0.8021266444005181, "grad_norm": 2.59636787394025, "learning_rate": 9.910455163947773e-08, "loss": 0.7427, "step": 2942 }, { "epoch": 0.8023992911185331, "grad_norm": 3.257035265176593, "learning_rate": 9.884072746023503e-08, "loss": 0.7813, "step": 2943 }, { "epoch": 0.8026719378365483, "grad_norm": 2.2299729418362575, "learning_rate": 9.857721638818756e-08, "loss": 0.618, "step": 2944 }, { "epoch": 0.8029445845545634, "grad_norm": 2.972696262809775, "learning_rate": 9.831401862900707e-08, "loss": 0.6545, "step": 2945 }, { "epoch": 0.8032172312725786, "grad_norm": 2.3523122300307944, "learning_rate": 9.805113438812108e-08, "loss": 0.6283, "step": 2946 }, { "epoch": 0.8034898779905937, "grad_norm": 2.6206918762549907, "learning_rate": 9.778856387071221e-08, "loss": 0.6342, "step": 2947 }, { "epoch": 0.8037625247086089, "grad_norm": 1.946121114635359, "learning_rate": 9.752630728171834e-08, "loss": 0.7296, "step": 2948 }, { "epoch": 0.804035171426624, "grad_norm": 4.3531734605921875, "learning_rate": 9.726436482583228e-08, "loss": 0.6729, "step": 2949 }, { "epoch": 0.804307818144639, "grad_norm": 2.7786534078828913, "learning_rate": 9.700273670750159e-08, "loss": 0.6675, "step": 2950 }, { "epoch": 0.8045804648626542, "grad_norm": 3.142967348762505, "learning_rate": 9.674142313092881e-08, "loss": 0.6851, "step": 2951 }, { "epoch": 0.8048531115806693, "grad_norm": 1.9477247471996233, "learning_rate": 9.648042430007058e-08, "loss": 0.6429, "step": 2952 }, { "epoch": 0.8051257582986845, "grad_norm": 3.555399738129455, "learning_rate": 9.621974041863811e-08, "loss": 0.672, "step": 2953 }, { "epoch": 0.8053984050166996, "grad_norm": 3.0260062798404883, "learning_rate": 9.59593716900966e-08, "loss": 0.6415, "step": 2954 }, { "epoch": 0.8056710517347148, "grad_norm": 2.500397599767995, "learning_rate": 9.569931831766548e-08, "loss": 0.6687, "step": 2955 }, { "epoch": 0.8059436984527298, "grad_norm": 1.9409757679115631, "learning_rate": 9.543958050431783e-08, "loss": 0.6364, "step": 2956 }, { "epoch": 0.806216345170745, "grad_norm": 1.7588560296880056, "learning_rate": 9.518015845278088e-08, "loss": 0.658, "step": 2957 }, { "epoch": 0.8064889918887601, "grad_norm": 2.8648199330248705, "learning_rate": 9.49210523655346e-08, "loss": 0.709, "step": 2958 }, { "epoch": 0.8067616386067753, "grad_norm": 2.4470291304470653, "learning_rate": 9.466226244481335e-08, "loss": 0.6341, "step": 2959 }, { "epoch": 0.8070342853247904, "grad_norm": 2.343528288881909, "learning_rate": 9.440378889260369e-08, "loss": 0.6495, "step": 2960 }, { "epoch": 0.8073069320428056, "grad_norm": 2.0163344454180163, "learning_rate": 9.414563191064629e-08, "loss": 0.6594, "step": 2961 }, { "epoch": 0.8075795787608206, "grad_norm": 1.9501900581752827, "learning_rate": 9.388779170043359e-08, "loss": 0.6795, "step": 2962 }, { "epoch": 0.8078522254788358, "grad_norm": 2.105580072379974, "learning_rate": 9.363026846321192e-08, "loss": 0.6691, "step": 2963 }, { "epoch": 0.8081248721968509, "grad_norm": 2.484111955119076, "learning_rate": 9.337306239997956e-08, "loss": 0.6818, "step": 2964 }, { "epoch": 0.8083975189148661, "grad_norm": 2.203251692153179, "learning_rate": 9.311617371148728e-08, "loss": 0.7086, "step": 2965 }, { "epoch": 0.8086701656328812, "grad_norm": 7.228766162527902, "learning_rate": 9.285960259823827e-08, "loss": 0.6345, "step": 2966 }, { "epoch": 0.8089428123508964, "grad_norm": 17.03919250282462, "learning_rate": 9.260334926048785e-08, "loss": 0.756, "step": 2967 }, { "epoch": 0.8092154590689115, "grad_norm": 2.265105294093761, "learning_rate": 9.234741389824324e-08, "loss": 0.6645, "step": 2968 }, { "epoch": 0.8094881057869265, "grad_norm": 3.3857618121280164, "learning_rate": 9.209179671126332e-08, "loss": 0.6665, "step": 2969 }, { "epoch": 0.8097607525049417, "grad_norm": 3.8658243115020983, "learning_rate": 9.183649789905923e-08, "loss": 0.6339, "step": 2970 }, { "epoch": 0.8100333992229568, "grad_norm": 4.381982160671141, "learning_rate": 9.158151766089294e-08, "loss": 0.7101, "step": 2971 }, { "epoch": 0.810306045940972, "grad_norm": 5.2287005582425, "learning_rate": 9.132685619577806e-08, "loss": 0.6669, "step": 2972 }, { "epoch": 0.8105786926589871, "grad_norm": 2.164987136198834, "learning_rate": 9.107251370247942e-08, "loss": 0.6438, "step": 2973 }, { "epoch": 0.8108513393770023, "grad_norm": 2.0515777380932922, "learning_rate": 9.081849037951289e-08, "loss": 0.713, "step": 2974 }, { "epoch": 0.8111239860950173, "grad_norm": 2.6430492131002263, "learning_rate": 9.056478642514509e-08, "loss": 0.7691, "step": 2975 }, { "epoch": 0.8113966328130325, "grad_norm": 1.967642048004687, "learning_rate": 9.031140203739351e-08, "loss": 0.6899, "step": 2976 }, { "epoch": 0.8116692795310476, "grad_norm": 3.115256932820046, "learning_rate": 9.005833741402602e-08, "loss": 0.7066, "step": 2977 }, { "epoch": 0.8119419262490628, "grad_norm": 3.9546994610224595, "learning_rate": 8.980559275256145e-08, "loss": 0.7115, "step": 2978 }, { "epoch": 0.8122145729670779, "grad_norm": 2.2167503521644694, "learning_rate": 8.955316825026799e-08, "loss": 0.664, "step": 2979 }, { "epoch": 0.8124872196850931, "grad_norm": 2.0103962116918686, "learning_rate": 8.9301064104165e-08, "loss": 0.7474, "step": 2980 }, { "epoch": 0.8127598664031082, "grad_norm": 1.8869177120030247, "learning_rate": 8.904928051102074e-08, "loss": 0.7027, "step": 2981 }, { "epoch": 0.8130325131211233, "grad_norm": 2.8497108613723006, "learning_rate": 8.879781766735433e-08, "loss": 0.6662, "step": 2982 }, { "epoch": 0.8133051598391384, "grad_norm": 2.2827804433973173, "learning_rate": 8.854667576943347e-08, "loss": 0.721, "step": 2983 }, { "epoch": 0.8135778065571536, "grad_norm": 5.238456996973042, "learning_rate": 8.82958550132763e-08, "loss": 0.691, "step": 2984 }, { "epoch": 0.8138504532751687, "grad_norm": 2.2891538531255793, "learning_rate": 8.804535559464982e-08, "loss": 0.6892, "step": 2985 }, { "epoch": 0.8141230999931839, "grad_norm": 2.7955928711509896, "learning_rate": 8.779517770907024e-08, "loss": 0.7311, "step": 2986 }, { "epoch": 0.814395746711199, "grad_norm": 2.2968150445297746, "learning_rate": 8.754532155180294e-08, "loss": 0.6699, "step": 2987 }, { "epoch": 0.814668393429214, "grad_norm": 1.8350287842257706, "learning_rate": 8.729578731786196e-08, "loss": 0.6909, "step": 2988 }, { "epoch": 0.8149410401472292, "grad_norm": 2.5277886001943606, "learning_rate": 8.704657520201059e-08, "loss": 0.6748, "step": 2989 }, { "epoch": 0.8152136868652443, "grad_norm": 5.751499509047329, "learning_rate": 8.679768539876015e-08, "loss": 0.7502, "step": 2990 }, { "epoch": 0.8154863335832595, "grad_norm": 1.9827170988367369, "learning_rate": 8.654911810237065e-08, "loss": 0.6741, "step": 2991 }, { "epoch": 0.8157589803012746, "grad_norm": 1.9121301215287145, "learning_rate": 8.630087350685033e-08, "loss": 0.6613, "step": 2992 }, { "epoch": 0.8160316270192898, "grad_norm": 1.7094226206068208, "learning_rate": 8.605295180595563e-08, "loss": 0.6796, "step": 2993 }, { "epoch": 0.8163042737373049, "grad_norm": 4.733466630672681, "learning_rate": 8.580535319319083e-08, "loss": 0.7444, "step": 2994 }, { "epoch": 0.81657692045532, "grad_norm": 2.2728225089273923, "learning_rate": 8.555807786180813e-08, "loss": 0.6733, "step": 2995 }, { "epoch": 0.8168495671733351, "grad_norm": 3.028908967049923, "learning_rate": 8.531112600480728e-08, "loss": 0.632, "step": 2996 }, { "epoch": 0.8171222138913503, "grad_norm": 7.32226215306294, "learning_rate": 8.506449781493608e-08, "loss": 0.7123, "step": 2997 }, { "epoch": 0.8173948606093654, "grad_norm": 2.357721916047376, "learning_rate": 8.481819348468877e-08, "loss": 0.687, "step": 2998 }, { "epoch": 0.8176675073273806, "grad_norm": 2.7091321420022325, "learning_rate": 8.457221320630781e-08, "loss": 0.6891, "step": 2999 }, { "epoch": 0.8179401540453957, "grad_norm": 3.0213595894944514, "learning_rate": 8.432655717178183e-08, "loss": 0.6296, "step": 3000 }, { "epoch": 0.8182128007634109, "grad_norm": 4.600827441876477, "learning_rate": 8.40812255728473e-08, "loss": 0.6546, "step": 3001 }, { "epoch": 0.8184854474814259, "grad_norm": 1.9299504431487515, "learning_rate": 8.383621860098649e-08, "loss": 0.6217, "step": 3002 }, { "epoch": 0.8187580941994411, "grad_norm": 6.157945162881305, "learning_rate": 8.359153644742916e-08, "loss": 0.6843, "step": 3003 }, { "epoch": 0.8190307409174562, "grad_norm": 3.783486684840667, "learning_rate": 8.334717930315105e-08, "loss": 0.7164, "step": 3004 }, { "epoch": 0.8193033876354714, "grad_norm": 3.26285642187905, "learning_rate": 8.310314735887442e-08, "loss": 0.6059, "step": 3005 }, { "epoch": 0.8195760343534865, "grad_norm": 2.7286130821773114, "learning_rate": 8.285944080506757e-08, "loss": 0.664, "step": 3006 }, { "epoch": 0.8198486810715016, "grad_norm": 2.1272634520847813, "learning_rate": 8.261605983194486e-08, "loss": 0.7015, "step": 3007 }, { "epoch": 0.8201213277895167, "grad_norm": 2.5047181565301515, "learning_rate": 8.237300462946689e-08, "loss": 0.7084, "step": 3008 }, { "epoch": 0.8203939745075318, "grad_norm": 2.4912721414397136, "learning_rate": 8.21302753873393e-08, "loss": 0.7122, "step": 3009 }, { "epoch": 0.820666621225547, "grad_norm": 2.534835251613415, "learning_rate": 8.188787229501392e-08, "loss": 0.6944, "step": 3010 }, { "epoch": 0.8209392679435621, "grad_norm": 3.1127994124673966, "learning_rate": 8.164579554168783e-08, "loss": 0.742, "step": 3011 }, { "epoch": 0.8212119146615773, "grad_norm": 2.555828773096493, "learning_rate": 8.140404531630329e-08, "loss": 0.6571, "step": 3012 }, { "epoch": 0.8214845613795924, "grad_norm": 2.4580931918719426, "learning_rate": 8.116262180754784e-08, "loss": 0.6598, "step": 3013 }, { "epoch": 0.8217572080976075, "grad_norm": 3.046308064276284, "learning_rate": 8.092152520385387e-08, "loss": 0.6429, "step": 3014 }, { "epoch": 0.8220298548156226, "grad_norm": 2.5771870263705248, "learning_rate": 8.068075569339883e-08, "loss": 0.6602, "step": 3015 }, { "epoch": 0.8223025015336378, "grad_norm": 2.848635461261066, "learning_rate": 8.044031346410469e-08, "loss": 0.6614, "step": 3016 }, { "epoch": 0.8225751482516529, "grad_norm": 1.8068326973216147, "learning_rate": 8.020019870363787e-08, "loss": 0.6639, "step": 3017 }, { "epoch": 0.8228477949696681, "grad_norm": 2.4175225137571963, "learning_rate": 7.996041159940986e-08, "loss": 0.6644, "step": 3018 }, { "epoch": 0.8231204416876832, "grad_norm": 2.649441476517368, "learning_rate": 7.972095233857528e-08, "loss": 0.652, "step": 3019 }, { "epoch": 0.8233930884056984, "grad_norm": 7.259625541620654, "learning_rate": 7.948182110803414e-08, "loss": 0.6986, "step": 3020 }, { "epoch": 0.8236657351237134, "grad_norm": 2.4547525294790455, "learning_rate": 7.924301809442918e-08, "loss": 0.6708, "step": 3021 }, { "epoch": 0.8239383818417286, "grad_norm": 2.654034615261734, "learning_rate": 7.900454348414797e-08, "loss": 0.7278, "step": 3022 }, { "epoch": 0.8242110285597437, "grad_norm": 2.168107570164984, "learning_rate": 7.876639746332131e-08, "loss": 0.6221, "step": 3023 }, { "epoch": 0.8244836752777589, "grad_norm": 3.140480621417571, "learning_rate": 7.85285802178235e-08, "loss": 0.6446, "step": 3024 }, { "epoch": 0.824756321995774, "grad_norm": 2.0772184909588427, "learning_rate": 7.829109193327232e-08, "loss": 0.6974, "step": 3025 }, { "epoch": 0.8250289687137891, "grad_norm": 2.620052216459341, "learning_rate": 7.80539327950287e-08, "loss": 0.6327, "step": 3026 }, { "epoch": 0.8253016154318042, "grad_norm": 4.10488073856683, "learning_rate": 7.78171029881971e-08, "loss": 0.6839, "step": 3027 }, { "epoch": 0.8255742621498193, "grad_norm": 3.2805925477874993, "learning_rate": 7.758060269762411e-08, "loss": 0.6905, "step": 3028 }, { "epoch": 0.8258469088678345, "grad_norm": 2.3291943229346983, "learning_rate": 7.734443210789998e-08, "loss": 0.6854, "step": 3029 }, { "epoch": 0.8261195555858496, "grad_norm": 2.0350897103262273, "learning_rate": 7.710859140335712e-08, "loss": 0.6442, "step": 3030 }, { "epoch": 0.8263922023038648, "grad_norm": 2.0743653774513335, "learning_rate": 7.687308076807065e-08, "loss": 0.722, "step": 3031 }, { "epoch": 0.8266648490218799, "grad_norm": 2.3723470301849603, "learning_rate": 7.663790038585794e-08, "loss": 0.6692, "step": 3032 }, { "epoch": 0.8269374957398951, "grad_norm": 1.5224537069380115, "learning_rate": 7.640305044027872e-08, "loss": 0.6372, "step": 3033 }, { "epoch": 0.8272101424579101, "grad_norm": 2.6549213287677094, "learning_rate": 7.616853111463478e-08, "loss": 0.7006, "step": 3034 }, { "epoch": 0.8274827891759253, "grad_norm": 3.10348424867493, "learning_rate": 7.593434259196984e-08, "loss": 0.648, "step": 3035 }, { "epoch": 0.8277554358939404, "grad_norm": 3.5780154862955604, "learning_rate": 7.570048505506926e-08, "loss": 0.7039, "step": 3036 }, { "epoch": 0.8280280826119556, "grad_norm": 1.8230953497155902, "learning_rate": 7.546695868646069e-08, "loss": 0.7024, "step": 3037 }, { "epoch": 0.8283007293299707, "grad_norm": 2.4281348259740647, "learning_rate": 7.523376366841233e-08, "loss": 0.656, "step": 3038 }, { "epoch": 0.8285733760479859, "grad_norm": 2.204939521943451, "learning_rate": 7.50009001829347e-08, "loss": 0.676, "step": 3039 }, { "epoch": 0.828846022766001, "grad_norm": 2.0006359023209126, "learning_rate": 7.47683684117787e-08, "loss": 0.7359, "step": 3040 }, { "epoch": 0.8291186694840161, "grad_norm": 1.917052289680935, "learning_rate": 7.453616853643712e-08, "loss": 0.6524, "step": 3041 }, { "epoch": 0.8293913162020312, "grad_norm": 4.815495628192778, "learning_rate": 7.430430073814325e-08, "loss": 0.668, "step": 3042 }, { "epoch": 0.8296639629200464, "grad_norm": 10.003853122237077, "learning_rate": 7.407276519787126e-08, "loss": 0.7044, "step": 3043 }, { "epoch": 0.8299366096380615, "grad_norm": 2.296430274168681, "learning_rate": 7.384156209633596e-08, "loss": 0.6693, "step": 3044 }, { "epoch": 0.8302092563560766, "grad_norm": 2.4461152100118433, "learning_rate": 7.361069161399274e-08, "loss": 0.6722, "step": 3045 }, { "epoch": 0.8304819030740918, "grad_norm": 3.310916211830496, "learning_rate": 7.338015393103764e-08, "loss": 0.6339, "step": 3046 }, { "epoch": 0.8307545497921068, "grad_norm": 2.0483806810042293, "learning_rate": 7.314994922740625e-08, "loss": 0.7254, "step": 3047 }, { "epoch": 0.831027196510122, "grad_norm": 1.943661837306981, "learning_rate": 7.292007768277503e-08, "loss": 0.6339, "step": 3048 }, { "epoch": 0.8312998432281371, "grad_norm": 2.0381154834629136, "learning_rate": 7.269053947656007e-08, "loss": 0.7372, "step": 3049 }, { "epoch": 0.8315724899461523, "grad_norm": 5.132611169554612, "learning_rate": 7.246133478791716e-08, "loss": 0.7854, "step": 3050 }, { "epoch": 0.8318451366641674, "grad_norm": 2.250969893347272, "learning_rate": 7.223246379574205e-08, "loss": 0.6586, "step": 3051 }, { "epoch": 0.8321177833821826, "grad_norm": 3.3316398668757485, "learning_rate": 7.200392667866983e-08, "loss": 0.6142, "step": 3052 }, { "epoch": 0.8323904301001976, "grad_norm": 2.048791729572377, "learning_rate": 7.177572361507511e-08, "loss": 0.7089, "step": 3053 }, { "epoch": 0.8326630768182128, "grad_norm": 2.991245251946959, "learning_rate": 7.154785478307168e-08, "loss": 0.7087, "step": 3054 }, { "epoch": 0.8329357235362279, "grad_norm": 11.103579194158712, "learning_rate": 7.13203203605125e-08, "loss": 0.7259, "step": 3055 }, { "epoch": 0.8332083702542431, "grad_norm": 1.8379691192136478, "learning_rate": 7.109312052498984e-08, "loss": 0.6773, "step": 3056 }, { "epoch": 0.8334810169722582, "grad_norm": 3.1701103305547056, "learning_rate": 7.086625545383407e-08, "loss": 0.6562, "step": 3057 }, { "epoch": 0.8337536636902734, "grad_norm": 2.8441730527653704, "learning_rate": 7.06397253241151e-08, "loss": 0.7489, "step": 3058 }, { "epoch": 0.8340263104082885, "grad_norm": 2.266799305544257, "learning_rate": 7.04135303126407e-08, "loss": 0.7228, "step": 3059 }, { "epoch": 0.8342989571263036, "grad_norm": 2.9011759825420387, "learning_rate": 7.01876705959577e-08, "loss": 0.7016, "step": 3060 }, { "epoch": 0.8345716038443187, "grad_norm": 2.512988504284322, "learning_rate": 6.996214635035085e-08, "loss": 0.6743, "step": 3061 }, { "epoch": 0.8348442505623338, "grad_norm": 26.653798950993608, "learning_rate": 6.973695775184319e-08, "loss": 0.7132, "step": 3062 }, { "epoch": 0.835116897280349, "grad_norm": 2.2287980353314603, "learning_rate": 6.951210497619574e-08, "loss": 0.6727, "step": 3063 }, { "epoch": 0.8353895439983641, "grad_norm": 2.3065800615307315, "learning_rate": 6.92875881989074e-08, "loss": 0.682, "step": 3064 }, { "epoch": 0.8356621907163793, "grad_norm": 2.1239661161221153, "learning_rate": 6.906340759521489e-08, "loss": 0.6411, "step": 3065 }, { "epoch": 0.8359348374343943, "grad_norm": 2.1819807334435164, "learning_rate": 6.883956334009233e-08, "loss": 0.6798, "step": 3066 }, { "epoch": 0.8362074841524095, "grad_norm": 9.86834280933507, "learning_rate": 6.86160556082519e-08, "loss": 0.6452, "step": 3067 }, { "epoch": 0.8364801308704246, "grad_norm": 2.085764948650434, "learning_rate": 6.839288457414222e-08, "loss": 0.6998, "step": 3068 }, { "epoch": 0.8367527775884398, "grad_norm": 2.636912743999904, "learning_rate": 6.817005041194995e-08, "loss": 0.6967, "step": 3069 }, { "epoch": 0.8370254243064549, "grad_norm": 5.306950478564038, "learning_rate": 6.794755329559837e-08, "loss": 0.66, "step": 3070 }, { "epoch": 0.8372980710244701, "grad_norm": 2.3027927562905304, "learning_rate": 6.772539339874783e-08, "loss": 0.6045, "step": 3071 }, { "epoch": 0.8375707177424851, "grad_norm": 4.075561833361568, "learning_rate": 6.750357089479541e-08, "loss": 0.5908, "step": 3072 }, { "epoch": 0.8378433644605003, "grad_norm": 2.083754406182208, "learning_rate": 6.72820859568749e-08, "loss": 0.6865, "step": 3073 }, { "epoch": 0.8381160111785154, "grad_norm": 1.8825095398122813, "learning_rate": 6.706093875785662e-08, "loss": 0.617, "step": 3074 }, { "epoch": 0.8383886578965306, "grad_norm": 1.7131327760197859, "learning_rate": 6.684012947034723e-08, "loss": 0.6342, "step": 3075 }, { "epoch": 0.8386613046145457, "grad_norm": 2.1577101617464827, "learning_rate": 6.661965826668963e-08, "loss": 0.724, "step": 3076 }, { "epoch": 0.8389339513325609, "grad_norm": 3.2167378957731647, "learning_rate": 6.639952531896326e-08, "loss": 0.6658, "step": 3077 }, { "epoch": 0.839206598050576, "grad_norm": 1.8876182411015925, "learning_rate": 6.617973079898265e-08, "loss": 0.7015, "step": 3078 }, { "epoch": 0.8394792447685911, "grad_norm": 2.6489606615116923, "learning_rate": 6.596027487829913e-08, "loss": 0.7339, "step": 3079 }, { "epoch": 0.8397518914866062, "grad_norm": 2.0917229518630363, "learning_rate": 6.574115772819927e-08, "loss": 0.7369, "step": 3080 }, { "epoch": 0.8400245382046213, "grad_norm": 2.1872851869897607, "learning_rate": 6.552237951970524e-08, "loss": 0.7343, "step": 3081 }, { "epoch": 0.8402971849226365, "grad_norm": 12.040340871755982, "learning_rate": 6.53039404235748e-08, "loss": 0.7199, "step": 3082 }, { "epoch": 0.8405698316406516, "grad_norm": 2.133406426978239, "learning_rate": 6.508584061030086e-08, "loss": 0.6249, "step": 3083 }, { "epoch": 0.8408424783586668, "grad_norm": 2.2998092977825215, "learning_rate": 6.48680802501117e-08, "loss": 0.7031, "step": 3084 }, { "epoch": 0.8411151250766818, "grad_norm": 1.6310425078017752, "learning_rate": 6.465065951297044e-08, "loss": 0.6609, "step": 3085 }, { "epoch": 0.841387771794697, "grad_norm": 2.2185300436229487, "learning_rate": 6.443357856857561e-08, "loss": 0.7083, "step": 3086 }, { "epoch": 0.8416604185127121, "grad_norm": 4.1785219318984135, "learning_rate": 6.421683758635971e-08, "loss": 0.6622, "step": 3087 }, { "epoch": 0.8419330652307273, "grad_norm": 1.7857547938830327, "learning_rate": 6.400043673549071e-08, "loss": 0.6353, "step": 3088 }, { "epoch": 0.8422057119487424, "grad_norm": 3.727022600556721, "learning_rate": 6.378437618487064e-08, "loss": 0.7072, "step": 3089 }, { "epoch": 0.8424783586667576, "grad_norm": 1.8345087253722345, "learning_rate": 6.356865610313606e-08, "loss": 0.6211, "step": 3090 }, { "epoch": 0.8427510053847727, "grad_norm": 4.55600382425769, "learning_rate": 6.335327665865775e-08, "loss": 0.6795, "step": 3091 }, { "epoch": 0.8430236521027878, "grad_norm": 2.095797930333143, "learning_rate": 6.313823801954066e-08, "loss": 0.6478, "step": 3092 }, { "epoch": 0.8432962988208029, "grad_norm": 2.025131004327821, "learning_rate": 6.292354035362368e-08, "loss": 0.7611, "step": 3093 }, { "epoch": 0.8435689455388181, "grad_norm": 2.655627230084587, "learning_rate": 6.270918382847973e-08, "loss": 0.6528, "step": 3094 }, { "epoch": 0.8438415922568332, "grad_norm": 4.408314207393651, "learning_rate": 6.249516861141512e-08, "loss": 0.6641, "step": 3095 }, { "epoch": 0.8441142389748484, "grad_norm": 2.1086270183882236, "learning_rate": 6.228149486947038e-08, "loss": 0.6455, "step": 3096 }, { "epoch": 0.8443868856928635, "grad_norm": 3.055660046292587, "learning_rate": 6.206816276941867e-08, "loss": 0.7247, "step": 3097 }, { "epoch": 0.8446595324108787, "grad_norm": 2.4805824181716, "learning_rate": 6.185517247776734e-08, "loss": 0.7139, "step": 3098 }, { "epoch": 0.8449321791288937, "grad_norm": 3.353827733882338, "learning_rate": 6.164252416075644e-08, "loss": 0.7136, "step": 3099 }, { "epoch": 0.8452048258469088, "grad_norm": 2.9190665749750675, "learning_rate": 6.143021798435922e-08, "loss": 0.645, "step": 3100 }, { "epoch": 0.845477472564924, "grad_norm": 1.8540822777572659, "learning_rate": 6.121825411428194e-08, "loss": 0.6845, "step": 3101 }, { "epoch": 0.8457501192829391, "grad_norm": 7.663076460201903, "learning_rate": 6.100663271596362e-08, "loss": 0.6844, "step": 3102 }, { "epoch": 0.8460227660009543, "grad_norm": 2.865983613519447, "learning_rate": 6.079535395457613e-08, "loss": 0.6756, "step": 3103 }, { "epoch": 0.8462954127189694, "grad_norm": 2.56780518752718, "learning_rate": 6.058441799502356e-08, "loss": 0.6869, "step": 3104 }, { "epoch": 0.8465680594369845, "grad_norm": 2.131340247504034, "learning_rate": 6.037382500194305e-08, "loss": 0.6807, "step": 3105 }, { "epoch": 0.8468407061549996, "grad_norm": 5.343454864991463, "learning_rate": 6.016357513970328e-08, "loss": 0.7017, "step": 3106 }, { "epoch": 0.8471133528730148, "grad_norm": 1.8894824056370192, "learning_rate": 5.995366857240591e-08, "loss": 0.6652, "step": 3107 }, { "epoch": 0.8473859995910299, "grad_norm": 2.1512880486378894, "learning_rate": 5.974410546388386e-08, "loss": 0.6674, "step": 3108 }, { "epoch": 0.8476586463090451, "grad_norm": 4.669908089679383, "learning_rate": 5.953488597770268e-08, "loss": 0.6242, "step": 3109 }, { "epoch": 0.8479312930270602, "grad_norm": 2.934268523358923, "learning_rate": 5.932601027715933e-08, "loss": 0.7446, "step": 3110 }, { "epoch": 0.8482039397450754, "grad_norm": 3.6225088952831377, "learning_rate": 5.911747852528254e-08, "loss": 0.7144, "step": 3111 }, { "epoch": 0.8484765864630904, "grad_norm": 2.872642340846276, "learning_rate": 5.890929088483254e-08, "loss": 0.6619, "step": 3112 }, { "epoch": 0.8487492331811056, "grad_norm": 2.011769668989077, "learning_rate": 5.8701447518301086e-08, "loss": 0.6695, "step": 3113 }, { "epoch": 0.8490218798991207, "grad_norm": 2.6543483930878367, "learning_rate": 5.8493948587911014e-08, "loss": 0.6523, "step": 3114 }, { "epoch": 0.8492945266171359, "grad_norm": 2.316630920323246, "learning_rate": 5.828679425561678e-08, "loss": 0.6882, "step": 3115 }, { "epoch": 0.849567173335151, "grad_norm": 2.4208240170079067, "learning_rate": 5.8079984683103157e-08, "loss": 0.6518, "step": 3116 }, { "epoch": 0.8498398200531662, "grad_norm": 6.3970659419861935, "learning_rate": 5.7873520031786524e-08, "loss": 0.6657, "step": 3117 }, { "epoch": 0.8501124667711812, "grad_norm": 3.508770008929028, "learning_rate": 5.7667400462813764e-08, "loss": 0.7455, "step": 3118 }, { "epoch": 0.8503851134891963, "grad_norm": 2.8765228922518915, "learning_rate": 5.746162613706235e-08, "loss": 0.7009, "step": 3119 }, { "epoch": 0.8506577602072115, "grad_norm": 2.2749405116555885, "learning_rate": 5.725619721514035e-08, "loss": 0.6009, "step": 3120 }, { "epoch": 0.8509304069252266, "grad_norm": 4.652303695534819, "learning_rate": 5.705111385738637e-08, "loss": 0.6928, "step": 3121 }, { "epoch": 0.8512030536432418, "grad_norm": 4.104573239803845, "learning_rate": 5.684637622386901e-08, "loss": 0.703, "step": 3122 }, { "epoch": 0.8514757003612569, "grad_norm": 2.206441822643221, "learning_rate": 5.664198447438728e-08, "loss": 0.6689, "step": 3123 }, { "epoch": 0.851748347079272, "grad_norm": 2.68329909317714, "learning_rate": 5.643793876847036e-08, "loss": 0.6675, "step": 3124 }, { "epoch": 0.8520209937972871, "grad_norm": 1.861226294764045, "learning_rate": 5.623423926537674e-08, "loss": 0.6092, "step": 3125 }, { "epoch": 0.8522936405153023, "grad_norm": 3.1286667533009997, "learning_rate": 5.603088612409551e-08, "loss": 0.7206, "step": 3126 }, { "epoch": 0.8525662872333174, "grad_norm": 2.5110342425363985, "learning_rate": 5.582787950334461e-08, "loss": 0.7177, "step": 3127 }, { "epoch": 0.8528389339513326, "grad_norm": 3.1025128728443057, "learning_rate": 5.562521956157218e-08, "loss": 0.7438, "step": 3128 }, { "epoch": 0.8531115806693477, "grad_norm": 2.131464679297914, "learning_rate": 5.5422906456955464e-08, "loss": 0.75, "step": 3129 }, { "epoch": 0.8533842273873629, "grad_norm": 2.4507932488402657, "learning_rate": 5.522094034740099e-08, "loss": 0.6783, "step": 3130 }, { "epoch": 0.8536568741053779, "grad_norm": 2.0723073264544, "learning_rate": 5.501932139054455e-08, "loss": 0.6891, "step": 3131 }, { "epoch": 0.8539295208233931, "grad_norm": 3.0535759627539645, "learning_rate": 5.4818049743750925e-08, "loss": 0.6602, "step": 3132 }, { "epoch": 0.8542021675414082, "grad_norm": 4.37333534038985, "learning_rate": 5.46171255641138e-08, "loss": 0.749, "step": 3133 }, { "epoch": 0.8544748142594234, "grad_norm": 6.463523308718146, "learning_rate": 5.4416549008455714e-08, "loss": 0.7258, "step": 3134 }, { "epoch": 0.8547474609774385, "grad_norm": 10.677210761812773, "learning_rate": 5.421632023332778e-08, "loss": 0.6815, "step": 3135 }, { "epoch": 0.8550201076954537, "grad_norm": 2.1623987920040295, "learning_rate": 5.401643939500994e-08, "loss": 0.6285, "step": 3136 }, { "epoch": 0.8552927544134687, "grad_norm": 2.9489681047135647, "learning_rate": 5.381690664951022e-08, "loss": 0.734, "step": 3137 }, { "epoch": 0.8555654011314838, "grad_norm": 2.0592996094003078, "learning_rate": 5.361772215256516e-08, "loss": 0.7181, "step": 3138 }, { "epoch": 0.855838047849499, "grad_norm": 2.1008790632491836, "learning_rate": 5.341888605963946e-08, "loss": 0.703, "step": 3139 }, { "epoch": 0.8561106945675141, "grad_norm": 1.7898685281772375, "learning_rate": 5.3220398525925805e-08, "loss": 0.6805, "step": 3140 }, { "epoch": 0.8563833412855293, "grad_norm": 1.8960633108788032, "learning_rate": 5.302225970634499e-08, "loss": 0.6995, "step": 3141 }, { "epoch": 0.8566559880035444, "grad_norm": 3.238947535719049, "learning_rate": 5.282446975554533e-08, "loss": 0.6397, "step": 3142 }, { "epoch": 0.8569286347215596, "grad_norm": 2.045792890157783, "learning_rate": 5.2627028827903483e-08, "loss": 0.6915, "step": 3143 }, { "epoch": 0.8572012814395746, "grad_norm": 3.4556852212216946, "learning_rate": 5.242993707752275e-08, "loss": 0.7236, "step": 3144 }, { "epoch": 0.8574739281575898, "grad_norm": 3.197844122733064, "learning_rate": 5.223319465823489e-08, "loss": 0.7189, "step": 3145 }, { "epoch": 0.8577465748756049, "grad_norm": 8.616042659780392, "learning_rate": 5.2036801723598036e-08, "loss": 0.6399, "step": 3146 }, { "epoch": 0.8580192215936201, "grad_norm": 2.962162407661386, "learning_rate": 5.184075842689845e-08, "loss": 0.6908, "step": 3147 }, { "epoch": 0.8582918683116352, "grad_norm": 2.340619004546336, "learning_rate": 5.164506492114895e-08, "loss": 0.6318, "step": 3148 }, { "epoch": 0.8585645150296504, "grad_norm": 3.0748160094111747, "learning_rate": 5.144972135908948e-08, "loss": 0.651, "step": 3149 }, { "epoch": 0.8588371617476654, "grad_norm": 1.86355628826466, "learning_rate": 5.125472789318685e-08, "loss": 0.7318, "step": 3150 }, { "epoch": 0.8591098084656806, "grad_norm": 10.847159132302687, "learning_rate": 5.106008467563455e-08, "loss": 0.7139, "step": 3151 }, { "epoch": 0.8593824551836957, "grad_norm": 2.992337709750296, "learning_rate": 5.086579185835288e-08, "loss": 0.6791, "step": 3152 }, { "epoch": 0.8596551019017109, "grad_norm": 2.67020162437003, "learning_rate": 5.0671849592988483e-08, "loss": 0.6407, "step": 3153 }, { "epoch": 0.859927748619726, "grad_norm": 2.357932449034995, "learning_rate": 5.047825803091438e-08, "loss": 0.6942, "step": 3154 }, { "epoch": 0.8602003953377412, "grad_norm": 2.6199811693798147, "learning_rate": 5.028501732323015e-08, "loss": 0.6523, "step": 3155 }, { "epoch": 0.8604730420557563, "grad_norm": 1.7924902873743351, "learning_rate": 5.009212762076104e-08, "loss": 0.7127, "step": 3156 }, { "epoch": 0.8607456887737713, "grad_norm": 2.4295727673886702, "learning_rate": 4.989958907405878e-08, "loss": 0.7956, "step": 3157 }, { "epoch": 0.8610183354917865, "grad_norm": 2.2890613910419897, "learning_rate": 4.970740183340083e-08, "loss": 0.6482, "step": 3158 }, { "epoch": 0.8612909822098016, "grad_norm": 2.33413831896105, "learning_rate": 4.951556604879048e-08, "loss": 0.6694, "step": 3159 }, { "epoch": 0.8615636289278168, "grad_norm": 2.1441460758761566, "learning_rate": 4.9324081869956625e-08, "loss": 0.6621, "step": 3160 }, { "epoch": 0.8618362756458319, "grad_norm": 2.21504614552591, "learning_rate": 4.9132949446353767e-08, "loss": 0.7057, "step": 3161 }, { "epoch": 0.8621089223638471, "grad_norm": 1.8657921054771827, "learning_rate": 4.894216892716219e-08, "loss": 0.6576, "step": 3162 }, { "epoch": 0.8623815690818621, "grad_norm": 2.2448436554752007, "learning_rate": 4.8751740461286826e-08, "loss": 0.7216, "step": 3163 }, { "epoch": 0.8626542157998773, "grad_norm": 1.8267196255689295, "learning_rate": 4.856166419735858e-08, "loss": 0.6633, "step": 3164 }, { "epoch": 0.8629268625178924, "grad_norm": 1.9910893124381965, "learning_rate": 4.837194028373276e-08, "loss": 0.6787, "step": 3165 }, { "epoch": 0.8631995092359076, "grad_norm": 2.3149267584676134, "learning_rate": 4.818256886849037e-08, "loss": 0.6308, "step": 3166 }, { "epoch": 0.8634721559539227, "grad_norm": 2.5642574358883423, "learning_rate": 4.799355009943656e-08, "loss": 0.6944, "step": 3167 }, { "epoch": 0.8637448026719379, "grad_norm": 2.31965858022462, "learning_rate": 4.780488412410189e-08, "loss": 0.6646, "step": 3168 }, { "epoch": 0.864017449389953, "grad_norm": 2.8874935320426305, "learning_rate": 4.761657108974115e-08, "loss": 0.6497, "step": 3169 }, { "epoch": 0.8642900961079681, "grad_norm": 8.29024827552166, "learning_rate": 4.7428611143333796e-08, "loss": 0.6968, "step": 3170 }, { "epoch": 0.8645627428259832, "grad_norm": 2.2571274011353326, "learning_rate": 4.724100443158369e-08, "loss": 0.629, "step": 3171 }, { "epoch": 0.8648353895439984, "grad_norm": 2.334447168508371, "learning_rate": 4.705375110091897e-08, "loss": 0.7036, "step": 3172 }, { "epoch": 0.8651080362620135, "grad_norm": 1.7751120904168332, "learning_rate": 4.6866851297492014e-08, "loss": 0.706, "step": 3173 }, { "epoch": 0.8653806829800287, "grad_norm": 1.7980712030538368, "learning_rate": 4.668030516717914e-08, "loss": 0.6056, "step": 3174 }, { "epoch": 0.8656533296980438, "grad_norm": 1.9793891539784034, "learning_rate": 4.649411285558069e-08, "loss": 0.6905, "step": 3175 }, { "epoch": 0.8659259764160588, "grad_norm": 1.7422913769552664, "learning_rate": 4.630827450802105e-08, "loss": 0.626, "step": 3176 }, { "epoch": 0.866198623134074, "grad_norm": 3.366977149442288, "learning_rate": 4.6122790269548073e-08, "loss": 0.6327, "step": 3177 }, { "epoch": 0.8664712698520891, "grad_norm": 3.1139695439566566, "learning_rate": 4.593766028493329e-08, "loss": 0.641, "step": 3178 }, { "epoch": 0.8667439165701043, "grad_norm": 2.0590411296692026, "learning_rate": 4.575288469867172e-08, "loss": 0.7065, "step": 3179 }, { "epoch": 0.8670165632881194, "grad_norm": 2.138343588886178, "learning_rate": 4.556846365498174e-08, "loss": 0.6487, "step": 3180 }, { "epoch": 0.8672892100061346, "grad_norm": 1.908373226895552, "learning_rate": 4.538439729780535e-08, "loss": 0.6761, "step": 3181 }, { "epoch": 0.8675618567241496, "grad_norm": 5.64594913712023, "learning_rate": 4.52006857708071e-08, "loss": 0.7076, "step": 3182 }, { "epoch": 0.8678345034421648, "grad_norm": 1.8913937791667446, "learning_rate": 4.5017329217375224e-08, "loss": 0.7359, "step": 3183 }, { "epoch": 0.8681071501601799, "grad_norm": 3.335795714458173, "learning_rate": 4.4834327780620295e-08, "loss": 0.7133, "step": 3184 }, { "epoch": 0.8683797968781951, "grad_norm": 3.3489573484180233, "learning_rate": 4.46516816033764e-08, "loss": 0.6811, "step": 3185 }, { "epoch": 0.8686524435962102, "grad_norm": 2.09369314191602, "learning_rate": 4.4469390828199505e-08, "loss": 0.673, "step": 3186 }, { "epoch": 0.8689250903142254, "grad_norm": 2.262029225691063, "learning_rate": 4.4287455597369014e-08, "loss": 0.6636, "step": 3187 }, { "epoch": 0.8691977370322405, "grad_norm": 2.379344888297491, "learning_rate": 4.4105876052886294e-08, "loss": 0.658, "step": 3188 }, { "epoch": 0.8694703837502556, "grad_norm": 2.010093362058926, "learning_rate": 4.39246523364753e-08, "loss": 0.7625, "step": 3189 }, { "epoch": 0.8697430304682707, "grad_norm": 1.9448347687653766, "learning_rate": 4.374378458958222e-08, "loss": 0.6967, "step": 3190 }, { "epoch": 0.8700156771862859, "grad_norm": 2.210269710321831, "learning_rate": 4.356327295337542e-08, "loss": 0.6943, "step": 3191 }, { "epoch": 0.870288323904301, "grad_norm": 1.88231047206986, "learning_rate": 4.338311756874524e-08, "loss": 0.7171, "step": 3192 }, { "epoch": 0.8705609706223162, "grad_norm": 2.2904896782192545, "learning_rate": 4.3203318576304107e-08, "loss": 0.6823, "step": 3193 }, { "epoch": 0.8708336173403313, "grad_norm": 2.8648905254131316, "learning_rate": 4.302387611638603e-08, "loss": 0.7352, "step": 3194 }, { "epoch": 0.8711062640583463, "grad_norm": 2.9225134052711064, "learning_rate": 4.2844790329047144e-08, "loss": 0.7397, "step": 3195 }, { "epoch": 0.8713789107763615, "grad_norm": 2.050648540458632, "learning_rate": 4.2666061354064916e-08, "loss": 0.5987, "step": 3196 }, { "epoch": 0.8716515574943766, "grad_norm": 1.8461492191192739, "learning_rate": 4.248768933093827e-08, "loss": 0.6189, "step": 3197 }, { "epoch": 0.8719242042123918, "grad_norm": 2.2892179546688993, "learning_rate": 4.2309674398887715e-08, "loss": 0.675, "step": 3198 }, { "epoch": 0.8721968509304069, "grad_norm": 2.0078940639397223, "learning_rate": 4.2132016696854953e-08, "loss": 0.6937, "step": 3199 }, { "epoch": 0.8724694976484221, "grad_norm": 3.921524495177282, "learning_rate": 4.195471636350284e-08, "loss": 0.664, "step": 3200 }, { "epoch": 0.8727421443664372, "grad_norm": 2.606808139058804, "learning_rate": 4.177777353721529e-08, "loss": 0.7043, "step": 3201 }, { "epoch": 0.8730147910844523, "grad_norm": 2.3627376789584558, "learning_rate": 4.160118835609749e-08, "loss": 0.5941, "step": 3202 }, { "epoch": 0.8732874378024674, "grad_norm": 2.400620966771131, "learning_rate": 4.142496095797482e-08, "loss": 0.6677, "step": 3203 }, { "epoch": 0.8735600845204826, "grad_norm": 2.3375320464807348, "learning_rate": 4.124909148039424e-08, "loss": 0.7131, "step": 3204 }, { "epoch": 0.8738327312384977, "grad_norm": 3.0173533774499037, "learning_rate": 4.1073580060622445e-08, "loss": 0.7286, "step": 3205 }, { "epoch": 0.8741053779565129, "grad_norm": 2.638415075872455, "learning_rate": 4.089842683564765e-08, "loss": 0.6282, "step": 3206 }, { "epoch": 0.874378024674528, "grad_norm": 2.138908305158991, "learning_rate": 4.072363194217754e-08, "loss": 0.6927, "step": 3207 }, { "epoch": 0.8746506713925432, "grad_norm": 1.8456219175528574, "learning_rate": 4.054919551664088e-08, "loss": 0.6117, "step": 3208 }, { "epoch": 0.8749233181105582, "grad_norm": 2.3900800560010445, "learning_rate": 4.03751176951862e-08, "loss": 0.6547, "step": 3209 }, { "epoch": 0.8751959648285734, "grad_norm": 3.5367605389503103, "learning_rate": 4.020139861368227e-08, "loss": 0.7287, "step": 3210 }, { "epoch": 0.8754686115465885, "grad_norm": 2.661716943973331, "learning_rate": 4.002803840771796e-08, "loss": 0.706, "step": 3211 }, { "epoch": 0.8757412582646037, "grad_norm": 2.6545180274648446, "learning_rate": 3.9855037212601806e-08, "loss": 0.697, "step": 3212 }, { "epoch": 0.8760139049826188, "grad_norm": 4.332275692584316, "learning_rate": 3.968239516336225e-08, "loss": 0.6841, "step": 3213 }, { "epoch": 0.8762865517006339, "grad_norm": 3.7123016343530195, "learning_rate": 3.9510112394747665e-08, "loss": 0.689, "step": 3214 }, { "epoch": 0.876559198418649, "grad_norm": 2.0755088758894056, "learning_rate": 3.93381890412256e-08, "loss": 0.6492, "step": 3215 }, { "epoch": 0.8768318451366641, "grad_norm": 2.446884530308255, "learning_rate": 3.9166625236983274e-08, "loss": 0.639, "step": 3216 }, { "epoch": 0.8771044918546793, "grad_norm": 2.4794486481244125, "learning_rate": 3.899542111592724e-08, "loss": 0.6856, "step": 3217 }, { "epoch": 0.8773771385726944, "grad_norm": 2.178582016663354, "learning_rate": 3.8824576811683386e-08, "loss": 0.6609, "step": 3218 }, { "epoch": 0.8776497852907096, "grad_norm": 4.364874305188323, "learning_rate": 3.865409245759671e-08, "loss": 0.6719, "step": 3219 }, { "epoch": 0.8779224320087247, "grad_norm": 2.65668455474124, "learning_rate": 3.8483968186731105e-08, "loss": 0.722, "step": 3220 }, { "epoch": 0.8781950787267399, "grad_norm": 2.6457090426175296, "learning_rate": 3.831420413186987e-08, "loss": 0.669, "step": 3221 }, { "epoch": 0.8784677254447549, "grad_norm": 2.5198368854390036, "learning_rate": 3.814480042551454e-08, "loss": 0.7046, "step": 3222 }, { "epoch": 0.8787403721627701, "grad_norm": 2.793118425762473, "learning_rate": 3.797575719988599e-08, "loss": 0.663, "step": 3223 }, { "epoch": 0.8790130188807852, "grad_norm": 1.948052286647526, "learning_rate": 3.7807074586923224e-08, "loss": 0.622, "step": 3224 }, { "epoch": 0.8792856655988004, "grad_norm": 2.4433942179117025, "learning_rate": 3.763875271828426e-08, "loss": 0.7191, "step": 3225 }, { "epoch": 0.8795583123168155, "grad_norm": 2.3678410502744107, "learning_rate": 3.7470791725345065e-08, "loss": 0.6765, "step": 3226 }, { "epoch": 0.8798309590348307, "grad_norm": 2.579923731988964, "learning_rate": 3.7303191739200324e-08, "loss": 0.6136, "step": 3227 }, { "epoch": 0.8801036057528457, "grad_norm": 2.7497682944911066, "learning_rate": 3.71359528906629e-08, "loss": 0.6946, "step": 3228 }, { "epoch": 0.8803762524708609, "grad_norm": 7.034178598780373, "learning_rate": 3.6969075310263555e-08, "loss": 0.6766, "step": 3229 }, { "epoch": 0.880648899188876, "grad_norm": 2.8743868204362437, "learning_rate": 3.6802559128251367e-08, "loss": 0.6896, "step": 3230 }, { "epoch": 0.8809215459068912, "grad_norm": 2.6937163727447597, "learning_rate": 3.66364044745931e-08, "loss": 0.6872, "step": 3231 }, { "epoch": 0.8811941926249063, "grad_norm": 2.652016780078199, "learning_rate": 3.6470611478973456e-08, "loss": 0.6806, "step": 3232 }, { "epoch": 0.8814668393429214, "grad_norm": 3.2044047168378205, "learning_rate": 3.630518027079482e-08, "loss": 0.6532, "step": 3233 }, { "epoch": 0.8817394860609366, "grad_norm": 2.5804614223585163, "learning_rate": 3.6140110979177406e-08, "loss": 0.7475, "step": 3234 }, { "epoch": 0.8820121327789516, "grad_norm": 5.588051065587114, "learning_rate": 3.5975403732958655e-08, "loss": 0.6475, "step": 3235 }, { "epoch": 0.8822847794969668, "grad_norm": 7.528729920052775, "learning_rate": 3.581105866069351e-08, "loss": 0.7196, "step": 3236 }, { "epoch": 0.8825574262149819, "grad_norm": 22.74140925050926, "learning_rate": 3.564707589065441e-08, "loss": 0.7052, "step": 3237 }, { "epoch": 0.8828300729329971, "grad_norm": 3.3528527966838237, "learning_rate": 3.548345555083071e-08, "loss": 0.6887, "step": 3238 }, { "epoch": 0.8831027196510122, "grad_norm": 2.318379397341436, "learning_rate": 3.532019776892914e-08, "loss": 0.6985, "step": 3239 }, { "epoch": 0.8833753663690274, "grad_norm": 1.9649896146110035, "learning_rate": 3.515730267237349e-08, "loss": 0.6693, "step": 3240 }, { "epoch": 0.8836480130870424, "grad_norm": 2.101753683177873, "learning_rate": 3.499477038830412e-08, "loss": 0.6514, "step": 3241 }, { "epoch": 0.8839206598050576, "grad_norm": 4.788868957095333, "learning_rate": 3.483260104357866e-08, "loss": 0.6564, "step": 3242 }, { "epoch": 0.8841933065230727, "grad_norm": 2.190589787794036, "learning_rate": 3.467079476477103e-08, "loss": 0.7231, "step": 3243 }, { "epoch": 0.8844659532410879, "grad_norm": 2.0078205604364805, "learning_rate": 3.450935167817226e-08, "loss": 0.7526, "step": 3244 }, { "epoch": 0.884738599959103, "grad_norm": 2.162305039137536, "learning_rate": 3.434827190978928e-08, "loss": 0.6914, "step": 3245 }, { "epoch": 0.8850112466771182, "grad_norm": 1.9429353490575554, "learning_rate": 3.418755558534614e-08, "loss": 0.6786, "step": 3246 }, { "epoch": 0.8852838933951332, "grad_norm": 2.1742500648052765, "learning_rate": 3.402720283028277e-08, "loss": 0.6982, "step": 3247 }, { "epoch": 0.8855565401131484, "grad_norm": 2.7639696724042007, "learning_rate": 3.386721376975538e-08, "loss": 0.6869, "step": 3248 }, { "epoch": 0.8858291868311635, "grad_norm": 8.104757167462031, "learning_rate": 3.370758852863648e-08, "loss": 0.7001, "step": 3249 }, { "epoch": 0.8861018335491786, "grad_norm": 4.402234040425614, "learning_rate": 3.3548327231514406e-08, "loss": 0.7606, "step": 3250 }, { "epoch": 0.8863744802671938, "grad_norm": 3.079760231423675, "learning_rate": 3.338943000269368e-08, "loss": 0.6582, "step": 3251 }, { "epoch": 0.8866471269852089, "grad_norm": 3.0497514015490252, "learning_rate": 3.323089696619436e-08, "loss": 0.6528, "step": 3252 }, { "epoch": 0.8869197737032241, "grad_norm": 2.3955105562770065, "learning_rate": 3.3072728245752634e-08, "loss": 0.7144, "step": 3253 }, { "epoch": 0.8871924204212391, "grad_norm": 22.100767937876057, "learning_rate": 3.291492396482004e-08, "loss": 0.7117, "step": 3254 }, { "epoch": 0.8874650671392543, "grad_norm": 4.177183873993089, "learning_rate": 3.2757484246563716e-08, "loss": 0.7202, "step": 3255 }, { "epoch": 0.8877377138572694, "grad_norm": 1.8709994245174264, "learning_rate": 3.260040921386631e-08, "loss": 0.6456, "step": 3256 }, { "epoch": 0.8880103605752846, "grad_norm": 2.002309801027252, "learning_rate": 3.2443698989325865e-08, "loss": 0.6626, "step": 3257 }, { "epoch": 0.8882830072932997, "grad_norm": 3.051276297841773, "learning_rate": 3.2287353695255535e-08, "loss": 0.7375, "step": 3258 }, { "epoch": 0.8885556540113149, "grad_norm": 2.999744899307764, "learning_rate": 3.2131373453683884e-08, "loss": 0.6835, "step": 3259 }, { "epoch": 0.88882830072933, "grad_norm": 2.73642866879599, "learning_rate": 3.19757583863543e-08, "loss": 0.6565, "step": 3260 }, { "epoch": 0.8891009474473451, "grad_norm": 3.011200989968755, "learning_rate": 3.1820508614725406e-08, "loss": 0.6669, "step": 3261 }, { "epoch": 0.8893735941653602, "grad_norm": 8.872546723072013, "learning_rate": 3.166562425997038e-08, "loss": 0.6661, "step": 3262 }, { "epoch": 0.8896462408833754, "grad_norm": 2.7886457978724564, "learning_rate": 3.151110544297764e-08, "loss": 0.6939, "step": 3263 }, { "epoch": 0.8899188876013905, "grad_norm": 3.2522329408484247, "learning_rate": 3.135695228434976e-08, "loss": 0.7212, "step": 3264 }, { "epoch": 0.8901915343194057, "grad_norm": 2.2434199276702156, "learning_rate": 3.1203164904404445e-08, "loss": 0.7293, "step": 3265 }, { "epoch": 0.8904641810374208, "grad_norm": 2.370573339823656, "learning_rate": 3.10497434231734e-08, "loss": 0.7566, "step": 3266 }, { "epoch": 0.890736827755436, "grad_norm": 2.823337469113944, "learning_rate": 3.0896687960403236e-08, "loss": 0.6836, "step": 3267 }, { "epoch": 0.891009474473451, "grad_norm": 5.062743851125372, "learning_rate": 3.074399863555455e-08, "loss": 0.6722, "step": 3268 }, { "epoch": 0.8912821211914661, "grad_norm": 1.9559332862055265, "learning_rate": 3.0591675567802236e-08, "loss": 0.649, "step": 3269 }, { "epoch": 0.8915547679094813, "grad_norm": 1.930377284478676, "learning_rate": 3.043971887603541e-08, "loss": 0.7448, "step": 3270 }, { "epoch": 0.8918274146274964, "grad_norm": 2.7929652444148205, "learning_rate": 3.0288128678857104e-08, "loss": 0.698, "step": 3271 }, { "epoch": 0.8921000613455116, "grad_norm": 2.1517290753672156, "learning_rate": 3.0136905094584444e-08, "loss": 0.6872, "step": 3272 }, { "epoch": 0.8923727080635266, "grad_norm": 2.3990569759023113, "learning_rate": 2.998604824124834e-08, "loss": 0.7236, "step": 3273 }, { "epoch": 0.8926453547815418, "grad_norm": 2.1351869691191454, "learning_rate": 2.983555823659345e-08, "loss": 0.7281, "step": 3274 }, { "epoch": 0.8929180014995569, "grad_norm": 3.0427261463393345, "learning_rate": 2.968543519807809e-08, "loss": 0.652, "step": 3275 }, { "epoch": 0.8931906482175721, "grad_norm": 1.9630601406964032, "learning_rate": 2.9535679242874145e-08, "loss": 0.635, "step": 3276 }, { "epoch": 0.8934632949355872, "grad_norm": 9.72116261708821, "learning_rate": 2.9386290487867117e-08, "loss": 0.6913, "step": 3277 }, { "epoch": 0.8937359416536024, "grad_norm": 1.9658309088819355, "learning_rate": 2.923726904965579e-08, "loss": 0.7156, "step": 3278 }, { "epoch": 0.8940085883716175, "grad_norm": 3.3237463001476857, "learning_rate": 2.9088615044552123e-08, "loss": 0.6524, "step": 3279 }, { "epoch": 0.8942812350896326, "grad_norm": 2.103664979466575, "learning_rate": 2.894032858858181e-08, "loss": 0.6731, "step": 3280 }, { "epoch": 0.8945538818076477, "grad_norm": 2.4599734295223508, "learning_rate": 2.8792409797482875e-08, "loss": 0.6436, "step": 3281 }, { "epoch": 0.8948265285256629, "grad_norm": 3.075538162005126, "learning_rate": 2.8644858786707248e-08, "loss": 0.6643, "step": 3282 }, { "epoch": 0.895099175243678, "grad_norm": 2.3547276347842905, "learning_rate": 2.8497675671418973e-08, "loss": 0.6972, "step": 3283 }, { "epoch": 0.8953718219616932, "grad_norm": 27.040952209216172, "learning_rate": 2.835086056649566e-08, "loss": 0.778, "step": 3284 }, { "epoch": 0.8956444686797083, "grad_norm": 2.6062962105145275, "learning_rate": 2.8204413586527086e-08, "loss": 0.7154, "step": 3285 }, { "epoch": 0.8959171153977235, "grad_norm": 1.9064995149780246, "learning_rate": 2.805833484581621e-08, "loss": 0.7004, "step": 3286 }, { "epoch": 0.8961897621157385, "grad_norm": 2.5312915153953925, "learning_rate": 2.791262445837833e-08, "loss": 0.7344, "step": 3287 }, { "epoch": 0.8964624088337536, "grad_norm": 1.91766860034062, "learning_rate": 2.7767282537941195e-08, "loss": 0.6378, "step": 3288 }, { "epoch": 0.8967350555517688, "grad_norm": 2.341773446256189, "learning_rate": 2.762230919794506e-08, "loss": 0.6741, "step": 3289 }, { "epoch": 0.8970077022697839, "grad_norm": 3.2998839224438434, "learning_rate": 2.7477704551542414e-08, "loss": 0.6516, "step": 3290 }, { "epoch": 0.8972803489877991, "grad_norm": 4.042354297294251, "learning_rate": 2.733346871159825e-08, "loss": 0.6867, "step": 3291 }, { "epoch": 0.8975529957058141, "grad_norm": 2.702987222166714, "learning_rate": 2.718960179068924e-08, "loss": 0.7382, "step": 3292 }, { "epoch": 0.8978256424238293, "grad_norm": 6.669414044978653, "learning_rate": 2.7046103901104512e-08, "loss": 0.6543, "step": 3293 }, { "epoch": 0.8980982891418444, "grad_norm": 2.032139747387718, "learning_rate": 2.6902975154845075e-08, "loss": 0.6333, "step": 3294 }, { "epoch": 0.8983709358598596, "grad_norm": 3.0800345741975463, "learning_rate": 2.676021566362363e-08, "loss": 0.7341, "step": 3295 }, { "epoch": 0.8986435825778747, "grad_norm": 2.050425715194635, "learning_rate": 2.661782553886488e-08, "loss": 0.672, "step": 3296 }, { "epoch": 0.8989162292958899, "grad_norm": 2.060963799556105, "learning_rate": 2.6475804891705146e-08, "loss": 0.6321, "step": 3297 }, { "epoch": 0.899188876013905, "grad_norm": 2.121733123879263, "learning_rate": 2.6334153832992422e-08, "loss": 0.6463, "step": 3298 }, { "epoch": 0.8994615227319201, "grad_norm": 7.214892739955104, "learning_rate": 2.6192872473286165e-08, "loss": 0.673, "step": 3299 }, { "epoch": 0.8997341694499352, "grad_norm": 7.389745876689978, "learning_rate": 2.605196092285722e-08, "loss": 0.7155, "step": 3300 }, { "epoch": 0.9000068161679504, "grad_norm": 3.8151118434819766, "learning_rate": 2.5911419291688164e-08, "loss": 0.678, "step": 3301 }, { "epoch": 0.9002794628859655, "grad_norm": 4.171309797297884, "learning_rate": 2.57712476894722e-08, "loss": 0.6851, "step": 3302 }, { "epoch": 0.9005521096039807, "grad_norm": 1.852406402621903, "learning_rate": 2.5631446225614527e-08, "loss": 0.6287, "step": 3303 }, { "epoch": 0.9008247563219958, "grad_norm": 2.993509260731107, "learning_rate": 2.5492015009230594e-08, "loss": 0.6162, "step": 3304 }, { "epoch": 0.901097403040011, "grad_norm": 1.8578856918821018, "learning_rate": 2.5352954149147564e-08, "loss": 0.7165, "step": 3305 }, { "epoch": 0.901370049758026, "grad_norm": 2.470805831471982, "learning_rate": 2.5214263753903175e-08, "loss": 0.6906, "step": 3306 }, { "epoch": 0.9016426964760411, "grad_norm": 2.620118714498054, "learning_rate": 2.5075943931746113e-08, "loss": 0.6632, "step": 3307 }, { "epoch": 0.9019153431940563, "grad_norm": 2.1033577913751436, "learning_rate": 2.4937994790635852e-08, "loss": 0.6534, "step": 3308 }, { "epoch": 0.9021879899120714, "grad_norm": 2.6694261851300696, "learning_rate": 2.480041643824249e-08, "loss": 0.7081, "step": 3309 }, { "epoch": 0.9024606366300866, "grad_norm": 2.9131889859143296, "learning_rate": 2.4663208981946738e-08, "loss": 0.6334, "step": 3310 }, { "epoch": 0.9027332833481017, "grad_norm": 2.3297931077587806, "learning_rate": 2.4526372528839767e-08, "loss": 0.6063, "step": 3311 }, { "epoch": 0.9030059300661168, "grad_norm": 4.151875755411258, "learning_rate": 2.4389907185723424e-08, "loss": 0.679, "step": 3312 }, { "epoch": 0.9032785767841319, "grad_norm": 2.2047470509512146, "learning_rate": 2.4253813059109562e-08, "loss": 0.7006, "step": 3313 }, { "epoch": 0.9035512235021471, "grad_norm": 2.2742853316594833, "learning_rate": 2.411809025522049e-08, "loss": 0.6759, "step": 3314 }, { "epoch": 0.9038238702201622, "grad_norm": 4.304362999322489, "learning_rate": 2.3982738879988696e-08, "loss": 0.6427, "step": 3315 }, { "epoch": 0.9040965169381774, "grad_norm": 4.725773781680303, "learning_rate": 2.3847759039056724e-08, "loss": 0.6861, "step": 3316 }, { "epoch": 0.9043691636561925, "grad_norm": 2.323490397775118, "learning_rate": 2.371315083777714e-08, "loss": 0.6702, "step": 3317 }, { "epoch": 0.9046418103742077, "grad_norm": 2.158407180037938, "learning_rate": 2.35789143812124e-08, "loss": 0.7185, "step": 3318 }, { "epoch": 0.9049144570922227, "grad_norm": 1.7525079048072876, "learning_rate": 2.344504977413486e-08, "loss": 0.708, "step": 3319 }, { "epoch": 0.9051871038102379, "grad_norm": 2.039140351195164, "learning_rate": 2.3311557121026782e-08, "loss": 0.689, "step": 3320 }, { "epoch": 0.905459750528253, "grad_norm": 2.2425076990965973, "learning_rate": 2.317843652607976e-08, "loss": 0.672, "step": 3321 }, { "epoch": 0.9057323972462682, "grad_norm": 4.737260918428327, "learning_rate": 2.3045688093195402e-08, "loss": 0.7404, "step": 3322 }, { "epoch": 0.9060050439642833, "grad_norm": 3.314512127417591, "learning_rate": 2.2913311925984446e-08, "loss": 0.6784, "step": 3323 }, { "epoch": 0.9062776906822985, "grad_norm": 1.6884174618268015, "learning_rate": 2.2781308127767462e-08, "loss": 0.6637, "step": 3324 }, { "epoch": 0.9065503374003135, "grad_norm": 2.5812803445812027, "learning_rate": 2.264967680157415e-08, "loss": 0.6754, "step": 3325 }, { "epoch": 0.9068229841183286, "grad_norm": 2.633773182074836, "learning_rate": 2.2518418050143485e-08, "loss": 0.7043, "step": 3326 }, { "epoch": 0.9070956308363438, "grad_norm": 1.772421006920916, "learning_rate": 2.2387531975923745e-08, "loss": 0.6604, "step": 3327 }, { "epoch": 0.9073682775543589, "grad_norm": 2.1443583857111275, "learning_rate": 2.225701868107227e-08, "loss": 0.6679, "step": 3328 }, { "epoch": 0.9076409242723741, "grad_norm": 2.7867428615675838, "learning_rate": 2.2126878267455518e-08, "loss": 0.6612, "step": 3329 }, { "epoch": 0.9079135709903892, "grad_norm": 2.4107964363615446, "learning_rate": 2.199711083664868e-08, "loss": 0.6785, "step": 3330 }, { "epoch": 0.9081862177084044, "grad_norm": 2.083806712334319, "learning_rate": 2.1867716489936294e-08, "loss": 0.6927, "step": 3331 }, { "epoch": 0.9084588644264194, "grad_norm": 2.500154504890412, "learning_rate": 2.173869532831113e-08, "loss": 0.7258, "step": 3332 }, { "epoch": 0.9087315111444346, "grad_norm": 2.1221912792131885, "learning_rate": 2.1610047452475188e-08, "loss": 0.6919, "step": 3333 }, { "epoch": 0.9090041578624497, "grad_norm": 1.8481835099073138, "learning_rate": 2.1481772962838763e-08, "loss": 0.6752, "step": 3334 }, { "epoch": 0.9092768045804649, "grad_norm": 2.261090454100049, "learning_rate": 2.135387195952093e-08, "loss": 0.7028, "step": 3335 }, { "epoch": 0.90954945129848, "grad_norm": 2.3818595792903725, "learning_rate": 2.1226344542349116e-08, "loss": 0.7036, "step": 3336 }, { "epoch": 0.9098220980164952, "grad_norm": 2.808333403817017, "learning_rate": 2.1099190810859314e-08, "loss": 0.7045, "step": 3337 }, { "epoch": 0.9100947447345102, "grad_norm": 3.5392142539564926, "learning_rate": 2.097241086429563e-08, "loss": 0.6798, "step": 3338 }, { "epoch": 0.9103673914525254, "grad_norm": 2.1244080660760147, "learning_rate": 2.084600480161075e-08, "loss": 0.6237, "step": 3339 }, { "epoch": 0.9106400381705405, "grad_norm": 6.152968027727815, "learning_rate": 2.0719972721465194e-08, "loss": 0.6983, "step": 3340 }, { "epoch": 0.9109126848885557, "grad_norm": 2.1611170286201404, "learning_rate": 2.0594314722227946e-08, "loss": 0.6565, "step": 3341 }, { "epoch": 0.9111853316065708, "grad_norm": 2.2968527674805364, "learning_rate": 2.0469030901975494e-08, "loss": 0.6588, "step": 3342 }, { "epoch": 0.911457978324586, "grad_norm": 1.899272517568524, "learning_rate": 2.034412135849295e-08, "loss": 0.6422, "step": 3343 }, { "epoch": 0.911730625042601, "grad_norm": 3.07956638970789, "learning_rate": 2.021958618927272e-08, "loss": 0.6676, "step": 3344 }, { "epoch": 0.9120032717606161, "grad_norm": 1.899493983177865, "learning_rate": 2.0095425491515384e-08, "loss": 0.6492, "step": 3345 }, { "epoch": 0.9122759184786313, "grad_norm": 2.635142206705586, "learning_rate": 1.9971639362129034e-08, "loss": 0.6684, "step": 3346 }, { "epoch": 0.9125485651966464, "grad_norm": 2.6876384599902443, "learning_rate": 1.9848227897729497e-08, "loss": 0.6397, "step": 3347 }, { "epoch": 0.9128212119146616, "grad_norm": 3.201014974259298, "learning_rate": 1.972519119464017e-08, "loss": 0.7161, "step": 3348 }, { "epoch": 0.9130938586326767, "grad_norm": 1.93617439887797, "learning_rate": 1.9602529348891785e-08, "loss": 0.6392, "step": 3349 }, { "epoch": 0.9133665053506919, "grad_norm": 3.873611208136613, "learning_rate": 1.9480242456222928e-08, "loss": 0.6406, "step": 3350 }, { "epoch": 0.9136391520687069, "grad_norm": 14.564628627718319, "learning_rate": 1.9358330612078978e-08, "loss": 0.7139, "step": 3351 }, { "epoch": 0.9139117987867221, "grad_norm": 2.0194898028346, "learning_rate": 1.9236793911613038e-08, "loss": 0.6614, "step": 3352 }, { "epoch": 0.9141844455047372, "grad_norm": 2.6824274830339245, "learning_rate": 1.9115632449685116e-08, "loss": 0.7135, "step": 3353 }, { "epoch": 0.9144570922227524, "grad_norm": 3.9516963403540037, "learning_rate": 1.8994846320862456e-08, "loss": 0.6699, "step": 3354 }, { "epoch": 0.9147297389407675, "grad_norm": 3.0836881143457204, "learning_rate": 1.8874435619419425e-08, "loss": 0.6566, "step": 3355 }, { "epoch": 0.9150023856587827, "grad_norm": 2.265961511591734, "learning_rate": 1.8754400439337238e-08, "loss": 0.6811, "step": 3356 }, { "epoch": 0.9152750323767977, "grad_norm": 4.367503903137435, "learning_rate": 1.8634740874304057e-08, "loss": 0.6936, "step": 3357 }, { "epoch": 0.9155476790948129, "grad_norm": 2.493722813850096, "learning_rate": 1.8515457017714897e-08, "loss": 0.6885, "step": 3358 }, { "epoch": 0.915820325812828, "grad_norm": 3.489730725960847, "learning_rate": 1.8396548962671454e-08, "loss": 0.6757, "step": 3359 }, { "epoch": 0.9160929725308432, "grad_norm": 2.7018793457486754, "learning_rate": 1.8278016801982377e-08, "loss": 0.6606, "step": 3360 }, { "epoch": 0.9163656192488583, "grad_norm": 2.113325777023861, "learning_rate": 1.8159860628162436e-08, "loss": 0.7897, "step": 3361 }, { "epoch": 0.9166382659668735, "grad_norm": 3.3698438504976935, "learning_rate": 1.8042080533433424e-08, "loss": 0.7267, "step": 3362 }, { "epoch": 0.9169109126848886, "grad_norm": 2.71577168843931, "learning_rate": 1.7924676609723355e-08, "loss": 0.6624, "step": 3363 }, { "epoch": 0.9171835594029036, "grad_norm": 3.020140888948677, "learning_rate": 1.7807648948666654e-08, "loss": 0.6984, "step": 3364 }, { "epoch": 0.9174562061209188, "grad_norm": 5.14070938806487, "learning_rate": 1.76909976416042e-08, "loss": 0.6756, "step": 3365 }, { "epoch": 0.9177288528389339, "grad_norm": 2.285075172367938, "learning_rate": 1.757472277958294e-08, "loss": 0.6442, "step": 3366 }, { "epoch": 0.9180014995569491, "grad_norm": 2.0209011147556972, "learning_rate": 1.7458824453356112e-08, "loss": 0.6761, "step": 3367 }, { "epoch": 0.9182741462749642, "grad_norm": 2.5176310376882602, "learning_rate": 1.7343302753382972e-08, "loss": 0.7145, "step": 3368 }, { "epoch": 0.9185467929929794, "grad_norm": 12.164141698694651, "learning_rate": 1.722815776982911e-08, "loss": 0.7352, "step": 3369 }, { "epoch": 0.9188194397109944, "grad_norm": 2.710853119675265, "learning_rate": 1.7113389592565642e-08, "loss": 0.7052, "step": 3370 }, { "epoch": 0.9190920864290096, "grad_norm": 2.3002950389475654, "learning_rate": 1.6998998311169966e-08, "loss": 0.6591, "step": 3371 }, { "epoch": 0.9193647331470247, "grad_norm": 2.706748342873945, "learning_rate": 1.688498401492505e-08, "loss": 0.7038, "step": 3372 }, { "epoch": 0.9196373798650399, "grad_norm": 2.546165760473349, "learning_rate": 1.677134679281983e-08, "loss": 0.6975, "step": 3373 }, { "epoch": 0.919910026583055, "grad_norm": 3.5048113659429307, "learning_rate": 1.6658086733548737e-08, "loss": 0.7249, "step": 3374 }, { "epoch": 0.9201826733010702, "grad_norm": 4.595287984223941, "learning_rate": 1.6545203925512008e-08, "loss": 0.7261, "step": 3375 }, { "epoch": 0.9204553200190853, "grad_norm": 2.1277745949808184, "learning_rate": 1.6432698456815276e-08, "loss": 0.687, "step": 3376 }, { "epoch": 0.9207279667371004, "grad_norm": 1.8202409324759166, "learning_rate": 1.632057041526974e-08, "loss": 0.6609, "step": 3377 }, { "epoch": 0.9210006134551155, "grad_norm": 2.263832133855102, "learning_rate": 1.6208819888391956e-08, "loss": 0.7021, "step": 3378 }, { "epoch": 0.9212732601731307, "grad_norm": 5.444832523727752, "learning_rate": 1.6097446963404092e-08, "loss": 0.7238, "step": 3379 }, { "epoch": 0.9215459068911458, "grad_norm": 1.8448629270465635, "learning_rate": 1.5986451727233064e-08, "loss": 0.7154, "step": 3380 }, { "epoch": 0.921818553609161, "grad_norm": 2.245012622422925, "learning_rate": 1.587583426651151e-08, "loss": 0.6642, "step": 3381 }, { "epoch": 0.9220912003271761, "grad_norm": 2.9657409513964317, "learning_rate": 1.5765594667576987e-08, "loss": 0.6894, "step": 3382 }, { "epoch": 0.9223638470451911, "grad_norm": 2.2449228386229847, "learning_rate": 1.5655733016472163e-08, "loss": 0.6125, "step": 3383 }, { "epoch": 0.9226364937632063, "grad_norm": 1.8284596474049426, "learning_rate": 1.5546249398944666e-08, "loss": 0.7075, "step": 3384 }, { "epoch": 0.9229091404812214, "grad_norm": 1.968467885991345, "learning_rate": 1.5437143900447146e-08, "loss": 0.7313, "step": 3385 }, { "epoch": 0.9231817871992366, "grad_norm": 2.4513706161812063, "learning_rate": 1.5328416606137095e-08, "loss": 0.6976, "step": 3386 }, { "epoch": 0.9234544339172517, "grad_norm": 5.05841790288491, "learning_rate": 1.5220067600876684e-08, "loss": 0.6578, "step": 3387 }, { "epoch": 0.9237270806352669, "grad_norm": 4.010259944907989, "learning_rate": 1.5112096969233213e-08, "loss": 0.7495, "step": 3388 }, { "epoch": 0.923999727353282, "grad_norm": 5.468680478779558, "learning_rate": 1.5004504795478156e-08, "loss": 0.7446, "step": 3389 }, { "epoch": 0.9242723740712971, "grad_norm": 2.2220470748554932, "learning_rate": 1.489729116358801e-08, "loss": 0.7011, "step": 3390 }, { "epoch": 0.9245450207893122, "grad_norm": 2.850089202707344, "learning_rate": 1.4790456157243503e-08, "loss": 0.6856, "step": 3391 }, { "epoch": 0.9248176675073274, "grad_norm": 2.6516095049501285, "learning_rate": 1.4683999859830153e-08, "loss": 0.6674, "step": 3392 }, { "epoch": 0.9250903142253425, "grad_norm": 4.152006129957767, "learning_rate": 1.4577922354437611e-08, "loss": 0.7462, "step": 3393 }, { "epoch": 0.9253629609433577, "grad_norm": 4.082964145590875, "learning_rate": 1.4472223723860034e-08, "loss": 0.7071, "step": 3394 }, { "epoch": 0.9256356076613728, "grad_norm": 5.624216956500196, "learning_rate": 1.4366904050595873e-08, "loss": 0.732, "step": 3395 }, { "epoch": 0.925908254379388, "grad_norm": 2.207489259919568, "learning_rate": 1.426196341684771e-08, "loss": 0.6994, "step": 3396 }, { "epoch": 0.926180901097403, "grad_norm": 2.1243153594409514, "learning_rate": 1.415740190452236e-08, "loss": 0.629, "step": 3397 }, { "epoch": 0.9264535478154182, "grad_norm": 2.4264982766025067, "learning_rate": 1.4053219595230704e-08, "loss": 0.6697, "step": 3398 }, { "epoch": 0.9267261945334333, "grad_norm": 3.773831390109398, "learning_rate": 1.394941657028753e-08, "loss": 0.7021, "step": 3399 }, { "epoch": 0.9269988412514485, "grad_norm": 5.037364003637088, "learning_rate": 1.3845992910711979e-08, "loss": 0.6216, "step": 3400 }, { "epoch": 0.9272714879694636, "grad_norm": 2.293203529764376, "learning_rate": 1.374294869722653e-08, "loss": 0.676, "step": 3401 }, { "epoch": 0.9275441346874786, "grad_norm": 4.7318427762568955, "learning_rate": 1.3640284010258075e-08, "loss": 0.6262, "step": 3402 }, { "epoch": 0.9278167814054938, "grad_norm": 1.9083602524293921, "learning_rate": 1.3537998929936844e-08, "loss": 0.749, "step": 3403 }, { "epoch": 0.9280894281235089, "grad_norm": 7.216299583482297, "learning_rate": 1.3436093536096981e-08, "loss": 0.7525, "step": 3404 }, { "epoch": 0.9283620748415241, "grad_norm": 6.058505788289772, "learning_rate": 1.3334567908276306e-08, "loss": 0.6506, "step": 3405 }, { "epoch": 0.9286347215595392, "grad_norm": 5.880338308038983, "learning_rate": 1.3233422125716043e-08, "loss": 0.7242, "step": 3406 }, { "epoch": 0.9289073682775544, "grad_norm": 2.875632624138645, "learning_rate": 1.313265626736132e-08, "loss": 0.6939, "step": 3407 }, { "epoch": 0.9291800149955695, "grad_norm": 2.114284054983867, "learning_rate": 1.3032270411860224e-08, "loss": 0.8173, "step": 3408 }, { "epoch": 0.9294526617135846, "grad_norm": 3.059262675121559, "learning_rate": 1.2932264637564693e-08, "loss": 0.7492, "step": 3409 }, { "epoch": 0.9297253084315997, "grad_norm": 3.5815301810956606, "learning_rate": 1.2832639022529679e-08, "loss": 0.7081, "step": 3410 }, { "epoch": 0.9299979551496149, "grad_norm": 3.322755543664425, "learning_rate": 1.2733393644513646e-08, "loss": 0.6447, "step": 3411 }, { "epoch": 0.93027060186763, "grad_norm": 1.9573246393863952, "learning_rate": 1.2634528580978243e-08, "loss": 0.5526, "step": 3412 }, { "epoch": 0.9305432485856452, "grad_norm": 2.6382909822307825, "learning_rate": 1.253604390908819e-08, "loss": 0.7018, "step": 3413 }, { "epoch": 0.9308158953036603, "grad_norm": 2.0820366293135226, "learning_rate": 1.2437939705711387e-08, "loss": 0.6981, "step": 3414 }, { "epoch": 0.9310885420216755, "grad_norm": 2.277585157073143, "learning_rate": 1.2340216047418694e-08, "loss": 0.6202, "step": 3415 }, { "epoch": 0.9313611887396905, "grad_norm": 1.9839905285331512, "learning_rate": 1.2242873010484044e-08, "loss": 0.608, "step": 3416 }, { "epoch": 0.9316338354577057, "grad_norm": 1.8072819319443754, "learning_rate": 1.2145910670884274e-08, "loss": 0.65, "step": 3417 }, { "epoch": 0.9319064821757208, "grad_norm": 1.8299304293047836, "learning_rate": 1.2049329104298955e-08, "loss": 0.7002, "step": 3418 }, { "epoch": 0.9321791288937359, "grad_norm": 2.028449978447695, "learning_rate": 1.195312838611079e-08, "loss": 0.6899, "step": 3419 }, { "epoch": 0.9324517756117511, "grad_norm": 1.7840657121239165, "learning_rate": 1.1857308591404769e-08, "loss": 0.6658, "step": 3420 }, { "epoch": 0.9327244223297662, "grad_norm": 3.0089243946785182, "learning_rate": 1.176186979496896e-08, "loss": 0.7652, "step": 3421 }, { "epoch": 0.9329970690477813, "grad_norm": 2.983601336329085, "learning_rate": 1.1666812071293942e-08, "loss": 0.6662, "step": 3422 }, { "epoch": 0.9332697157657964, "grad_norm": 2.0716352056091107, "learning_rate": 1.1572135494572699e-08, "loss": 0.6766, "step": 3423 }, { "epoch": 0.9335423624838116, "grad_norm": 3.2745370758449126, "learning_rate": 1.1477840138700901e-08, "loss": 0.6734, "step": 3424 }, { "epoch": 0.9338150092018267, "grad_norm": 14.824769725361122, "learning_rate": 1.1383926077276673e-08, "loss": 0.6287, "step": 3425 }, { "epoch": 0.9340876559198419, "grad_norm": 4.709389844456015, "learning_rate": 1.1290393383600549e-08, "loss": 0.7203, "step": 3426 }, { "epoch": 0.934360302637857, "grad_norm": 1.806743851293482, "learning_rate": 1.1197242130675188e-08, "loss": 0.7057, "step": 3427 }, { "epoch": 0.9346329493558722, "grad_norm": 2.4285625923859686, "learning_rate": 1.1104472391205932e-08, "loss": 0.6807, "step": 3428 }, { "epoch": 0.9349055960738872, "grad_norm": 2.538064950545051, "learning_rate": 1.1012084237599806e-08, "loss": 0.7011, "step": 3429 }, { "epoch": 0.9351782427919024, "grad_norm": 1.9445804336932917, "learning_rate": 1.0920077741966627e-08, "loss": 0.7555, "step": 3430 }, { "epoch": 0.9354508895099175, "grad_norm": 6.945983827901352, "learning_rate": 1.0828452976117785e-08, "loss": 0.6772, "step": 3431 }, { "epoch": 0.9357235362279327, "grad_norm": 2.8522251777949226, "learning_rate": 1.0737210011567077e-08, "loss": 0.6553, "step": 3432 }, { "epoch": 0.9359961829459478, "grad_norm": 2.0968166392425753, "learning_rate": 1.0646348919530146e-08, "loss": 0.6678, "step": 3433 }, { "epoch": 0.936268829663963, "grad_norm": 1.907210354709158, "learning_rate": 1.0555869770924597e-08, "loss": 0.6878, "step": 3434 }, { "epoch": 0.936541476381978, "grad_norm": 3.528385457323916, "learning_rate": 1.046577263636994e-08, "loss": 0.6746, "step": 3435 }, { "epoch": 0.9368141230999932, "grad_norm": 3.529461505665411, "learning_rate": 1.0376057586187536e-08, "loss": 0.6978, "step": 3436 }, { "epoch": 0.9370867698180083, "grad_norm": 2.2834198285296607, "learning_rate": 1.0286724690400483e-08, "loss": 0.6963, "step": 3437 }, { "epoch": 0.9373594165360234, "grad_norm": 2.4856433480488183, "learning_rate": 1.0197774018733729e-08, "loss": 0.6747, "step": 3438 }, { "epoch": 0.9376320632540386, "grad_norm": 3.369604468840896, "learning_rate": 1.0109205640613682e-08, "loss": 0.7287, "step": 3439 }, { "epoch": 0.9379047099720537, "grad_norm": 3.357811262613737, "learning_rate": 1.0021019625168603e-08, "loss": 0.6454, "step": 3440 }, { "epoch": 0.9381773566900689, "grad_norm": 2.3739511588024813, "learning_rate": 9.9332160412281e-09, "loss": 0.6791, "step": 3441 }, { "epoch": 0.9384500034080839, "grad_norm": 2.777958807049457, "learning_rate": 9.845794957323461e-09, "loss": 0.65, "step": 3442 }, { "epoch": 0.9387226501260991, "grad_norm": 2.934927151598463, "learning_rate": 9.758756441687332e-09, "loss": 0.7735, "step": 3443 }, { "epoch": 0.9389952968441142, "grad_norm": 2.101172097419714, "learning_rate": 9.67210056225376e-09, "loss": 0.656, "step": 3444 }, { "epoch": 0.9392679435621294, "grad_norm": 3.0902327948457913, "learning_rate": 9.585827386658306e-09, "loss": 0.6409, "step": 3445 }, { "epoch": 0.9395405902801445, "grad_norm": 1.6164713207531647, "learning_rate": 9.4999369822375e-09, "loss": 0.6882, "step": 3446 }, { "epoch": 0.9398132369981597, "grad_norm": 2.483723059081407, "learning_rate": 9.41442941602949e-09, "loss": 0.6686, "step": 3447 }, { "epoch": 0.9400858837161747, "grad_norm": 2.3650412507144716, "learning_rate": 9.329304754773337e-09, "loss": 0.6931, "step": 3448 }, { "epoch": 0.9403585304341899, "grad_norm": 6.706570081080128, "learning_rate": 9.244563064909394e-09, "loss": 0.703, "step": 3449 }, { "epoch": 0.940631177152205, "grad_norm": 3.4142755532140727, "learning_rate": 9.16020441257892e-09, "loss": 0.7341, "step": 3450 }, { "epoch": 0.9409038238702202, "grad_norm": 2.76382363334279, "learning_rate": 9.07622886362458e-09, "loss": 0.7381, "step": 3451 }, { "epoch": 0.9411764705882353, "grad_norm": 6.209221770587843, "learning_rate": 8.99263648358961e-09, "loss": 0.6596, "step": 3452 }, { "epoch": 0.9414491173062505, "grad_norm": 2.4733623023365006, "learning_rate": 8.909427337718378e-09, "loss": 0.7274, "step": 3453 }, { "epoch": 0.9417217640242656, "grad_norm": 2.32376626639601, "learning_rate": 8.82660149095621e-09, "loss": 0.7716, "step": 3454 }, { "epoch": 0.9419944107422807, "grad_norm": 1.97717446284504, "learning_rate": 8.744159007949058e-09, "loss": 0.6896, "step": 3455 }, { "epoch": 0.9422670574602958, "grad_norm": 6.027022025846571, "learning_rate": 8.662099953043844e-09, "loss": 0.6824, "step": 3456 }, { "epoch": 0.9425397041783109, "grad_norm": 2.51529878974147, "learning_rate": 8.580424390288166e-09, "loss": 0.7107, "step": 3457 }, { "epoch": 0.9428123508963261, "grad_norm": 4.147695252480914, "learning_rate": 8.499132383430196e-09, "loss": 0.6587, "step": 3458 }, { "epoch": 0.9430849976143412, "grad_norm": 2.0414985853228904, "learning_rate": 8.41822399591885e-09, "loss": 0.6323, "step": 3459 }, { "epoch": 0.9433576443323564, "grad_norm": 4.644318754928248, "learning_rate": 8.337699290903722e-09, "loss": 0.6638, "step": 3460 }, { "epoch": 0.9436302910503714, "grad_norm": 2.6582705575397876, "learning_rate": 8.25755833123465e-09, "loss": 0.6617, "step": 3461 }, { "epoch": 0.9439029377683866, "grad_norm": 2.3015581943404384, "learning_rate": 8.177801179462262e-09, "loss": 0.715, "step": 3462 }, { "epoch": 0.9441755844864017, "grad_norm": 2.8948508121129723, "learning_rate": 8.098427897837434e-09, "loss": 0.5879, "step": 3463 }, { "epoch": 0.9444482312044169, "grad_norm": 2.023832857199675, "learning_rate": 8.019438548311497e-09, "loss": 0.6595, "step": 3464 }, { "epoch": 0.944720877922432, "grad_norm": 2.5387835182221354, "learning_rate": 7.940833192536078e-09, "loss": 0.7044, "step": 3465 }, { "epoch": 0.9449935246404472, "grad_norm": 2.223421584334876, "learning_rate": 7.862611891863214e-09, "loss": 0.5532, "step": 3466 }, { "epoch": 0.9452661713584622, "grad_norm": 3.4177508739506917, "learning_rate": 7.784774707344898e-09, "loss": 0.6516, "step": 3467 }, { "epoch": 0.9455388180764774, "grad_norm": 2.0905186084890652, "learning_rate": 7.70732169973376e-09, "loss": 0.6428, "step": 3468 }, { "epoch": 0.9458114647944925, "grad_norm": 2.156208305927487, "learning_rate": 7.630252929482162e-09, "loss": 0.6782, "step": 3469 }, { "epoch": 0.9460841115125077, "grad_norm": 3.7113144807909935, "learning_rate": 7.55356845674282e-09, "loss": 0.6813, "step": 3470 }, { "epoch": 0.9463567582305228, "grad_norm": 2.706116661139866, "learning_rate": 7.477268341368359e-09, "loss": 0.7412, "step": 3471 }, { "epoch": 0.946629404948538, "grad_norm": 1.9364068383270971, "learning_rate": 7.401352642911529e-09, "loss": 0.6999, "step": 3472 }, { "epoch": 0.9469020516665531, "grad_norm": 2.1284259790673263, "learning_rate": 7.325821420624934e-09, "loss": 0.6842, "step": 3473 }, { "epoch": 0.9471746983845682, "grad_norm": 2.370207425686007, "learning_rate": 7.2506747334611924e-09, "loss": 0.7393, "step": 3474 }, { "epoch": 0.9474473451025833, "grad_norm": 1.6823073498745829, "learning_rate": 7.175912640072723e-09, "loss": 0.5698, "step": 3475 }, { "epoch": 0.9477199918205984, "grad_norm": 1.9562159755756894, "learning_rate": 7.1015351988117364e-09, "loss": 0.7457, "step": 3476 }, { "epoch": 0.9479926385386136, "grad_norm": 1.7885914660248063, "learning_rate": 7.027542467730296e-09, "loss": 0.7565, "step": 3477 }, { "epoch": 0.9482652852566287, "grad_norm": 3.2182504014242466, "learning_rate": 6.953934504580261e-09, "loss": 0.7596, "step": 3478 }, { "epoch": 0.9485379319746439, "grad_norm": 3.8487126479070874, "learning_rate": 6.8807113668129525e-09, "loss": 0.7449, "step": 3479 }, { "epoch": 0.948810578692659, "grad_norm": 2.001167946337367, "learning_rate": 6.807873111579488e-09, "loss": 0.6764, "step": 3480 }, { "epoch": 0.9490832254106741, "grad_norm": 2.873875399039404, "learning_rate": 6.7354197957306126e-09, "loss": 0.6442, "step": 3481 }, { "epoch": 0.9493558721286892, "grad_norm": 3.338725248269754, "learning_rate": 6.663351475816536e-09, "loss": 0.7508, "step": 3482 }, { "epoch": 0.9496285188467044, "grad_norm": 8.941091864941841, "learning_rate": 6.591668208087043e-09, "loss": 0.7247, "step": 3483 }, { "epoch": 0.9499011655647195, "grad_norm": 2.2171973986051423, "learning_rate": 6.5203700484912634e-09, "loss": 0.6725, "step": 3484 }, { "epoch": 0.9501738122827347, "grad_norm": 2.3579213607475396, "learning_rate": 6.449457052677965e-09, "loss": 0.7583, "step": 3485 }, { "epoch": 0.9504464590007498, "grad_norm": 3.0269183591522677, "learning_rate": 6.378929275995038e-09, "loss": 0.7247, "step": 3486 }, { "epoch": 0.950719105718765, "grad_norm": 1.8580779910215979, "learning_rate": 6.308786773489894e-09, "loss": 0.7133, "step": 3487 }, { "epoch": 0.95099175243678, "grad_norm": 1.8643738144439552, "learning_rate": 6.239029599909129e-09, "loss": 0.6509, "step": 3488 }, { "epoch": 0.9512643991547952, "grad_norm": 1.9357630708308786, "learning_rate": 6.169657809698747e-09, "loss": 0.6573, "step": 3489 }, { "epoch": 0.9515370458728103, "grad_norm": 2.0366377395194433, "learning_rate": 6.100671457003714e-09, "loss": 0.6607, "step": 3490 }, { "epoch": 0.9518096925908255, "grad_norm": 6.965462854805458, "learning_rate": 6.0320705956682926e-09, "loss": 0.6702, "step": 3491 }, { "epoch": 0.9520823393088406, "grad_norm": 1.8285174053185818, "learning_rate": 5.9638552792359856e-09, "loss": 0.627, "step": 3492 }, { "epoch": 0.9523549860268558, "grad_norm": 1.8738805785028185, "learning_rate": 5.896025560949091e-09, "loss": 0.6848, "step": 3493 }, { "epoch": 0.9526276327448708, "grad_norm": 2.528186264284169, "learning_rate": 5.828581493749263e-09, "loss": 0.6535, "step": 3494 }, { "epoch": 0.9529002794628859, "grad_norm": 2.3839290793212213, "learning_rate": 5.761523130276835e-09, "loss": 0.6348, "step": 3495 }, { "epoch": 0.9531729261809011, "grad_norm": 2.1512270876365505, "learning_rate": 5.694850522871386e-09, "loss": 0.6592, "step": 3496 }, { "epoch": 0.9534455728989162, "grad_norm": 3.5082872794554856, "learning_rate": 5.628563723571234e-09, "loss": 0.7199, "step": 3497 }, { "epoch": 0.9537182196169314, "grad_norm": 1.9566569574399522, "learning_rate": 5.562662784113603e-09, "loss": 0.6616, "step": 3498 }, { "epoch": 0.9539908663349465, "grad_norm": 15.367582260210037, "learning_rate": 5.497147755934628e-09, "loss": 0.7286, "step": 3499 }, { "epoch": 0.9542635130529616, "grad_norm": 3.5082148401407682, "learning_rate": 5.432018690169127e-09, "loss": 0.7468, "step": 3500 }, { "epoch": 0.9545361597709767, "grad_norm": 3.566512815822275, "learning_rate": 5.36727563765077e-09, "loss": 0.6998, "step": 3501 }, { "epoch": 0.9548088064889919, "grad_norm": 2.0587786010795286, "learning_rate": 5.302918648911803e-09, "loss": 0.6293, "step": 3502 }, { "epoch": 0.955081453207007, "grad_norm": 12.331437738434325, "learning_rate": 5.238947774183267e-09, "loss": 0.6602, "step": 3503 }, { "epoch": 0.9553540999250222, "grad_norm": 2.3190496039830046, "learning_rate": 5.175363063394944e-09, "loss": 0.6731, "step": 3504 }, { "epoch": 0.9556267466430373, "grad_norm": 3.3637651148614607, "learning_rate": 5.112164566174859e-09, "loss": 0.6575, "step": 3505 }, { "epoch": 0.9558993933610525, "grad_norm": 4.562301626506411, "learning_rate": 5.049352331849999e-09, "loss": 0.6499, "step": 3506 }, { "epoch": 0.9561720400790675, "grad_norm": 2.0429540804022466, "learning_rate": 4.986926409445591e-09, "loss": 0.7013, "step": 3507 }, { "epoch": 0.9564446867970827, "grad_norm": 2.7022274584910613, "learning_rate": 4.924886847685495e-09, "loss": 0.6906, "step": 3508 }, { "epoch": 0.9567173335150978, "grad_norm": 2.142916363996325, "learning_rate": 4.863233694991864e-09, "loss": 0.6106, "step": 3509 }, { "epoch": 0.956989980233113, "grad_norm": 4.272401149600567, "learning_rate": 4.801966999485429e-09, "loss": 0.7155, "step": 3510 }, { "epoch": 0.9572626269511281, "grad_norm": 1.913236814706695, "learning_rate": 4.7410868089852176e-09, "loss": 0.6691, "step": 3511 }, { "epoch": 0.9575352736691433, "grad_norm": 3.1685966948159505, "learning_rate": 4.680593171008496e-09, "loss": 0.6773, "step": 3512 }, { "epoch": 0.9578079203871583, "grad_norm": 3.1361382563429596, "learning_rate": 4.6204861327709955e-09, "loss": 0.659, "step": 3513 }, { "epoch": 0.9580805671051734, "grad_norm": 2.7492704623717965, "learning_rate": 4.560765741186578e-09, "loss": 0.7093, "step": 3514 }, { "epoch": 0.9583532138231886, "grad_norm": 2.462423460883484, "learning_rate": 4.501432042867403e-09, "loss": 0.7151, "step": 3515 }, { "epoch": 0.9586258605412037, "grad_norm": 1.942981119998954, "learning_rate": 4.4424850841237034e-09, "loss": 0.7077, "step": 3516 }, { "epoch": 0.9588985072592189, "grad_norm": 6.2476038753155105, "learning_rate": 4.38392491096401e-09, "loss": 0.6192, "step": 3517 }, { "epoch": 0.959171153977234, "grad_norm": 2.6746868154490238, "learning_rate": 4.3257515690948155e-09, "loss": 0.6615, "step": 3518 }, { "epoch": 0.9594438006952491, "grad_norm": 4.070968677403721, "learning_rate": 4.267965103920801e-09, "loss": 0.6888, "step": 3519 }, { "epoch": 0.9597164474132642, "grad_norm": 2.120643941547746, "learning_rate": 4.21056556054461e-09, "loss": 0.6697, "step": 3520 }, { "epoch": 0.9599890941312794, "grad_norm": 3.456697848375468, "learning_rate": 4.153552983766961e-09, "loss": 0.6396, "step": 3521 }, { "epoch": 0.9602617408492945, "grad_norm": 2.6172104204049, "learning_rate": 4.096927418086482e-09, "loss": 0.6864, "step": 3522 }, { "epoch": 0.9605343875673097, "grad_norm": 1.9710285889552317, "learning_rate": 4.040688907699819e-09, "loss": 0.6776, "step": 3523 }, { "epoch": 0.9608070342853248, "grad_norm": 2.4553080473332325, "learning_rate": 3.984837496501414e-09, "loss": 0.6489, "step": 3524 }, { "epoch": 0.96107968100334, "grad_norm": 2.4906739191476444, "learning_rate": 3.929373228083621e-09, "loss": 0.6898, "step": 3525 }, { "epoch": 0.961352327721355, "grad_norm": 2.3419393376239546, "learning_rate": 3.874296145736644e-09, "loss": 0.6622, "step": 3526 }, { "epoch": 0.9616249744393702, "grad_norm": 2.243681656783788, "learning_rate": 3.819606292448541e-09, "loss": 0.6952, "step": 3527 }, { "epoch": 0.9618976211573853, "grad_norm": 2.1085081862254658, "learning_rate": 3.765303710904888e-09, "loss": 0.6888, "step": 3528 }, { "epoch": 0.9621702678754005, "grad_norm": 4.119618939859866, "learning_rate": 3.711388443489338e-09, "loss": 0.6615, "step": 3529 }, { "epoch": 0.9624429145934156, "grad_norm": 2.9679018557914754, "learning_rate": 3.6578605322830637e-09, "loss": 0.6486, "step": 3530 }, { "epoch": 0.9627155613114308, "grad_norm": 1.795383285790611, "learning_rate": 3.6047200190648107e-09, "loss": 0.6527, "step": 3531 }, { "epoch": 0.9629882080294458, "grad_norm": 3.0687245288610363, "learning_rate": 3.55196694531118e-09, "loss": 0.6787, "step": 3532 }, { "epoch": 0.9632608547474609, "grad_norm": 2.159580660808086, "learning_rate": 3.4996013521961796e-09, "loss": 0.6723, "step": 3533 }, { "epoch": 0.9635335014654761, "grad_norm": 2.428980526050742, "learning_rate": 3.447623280591505e-09, "loss": 0.7063, "step": 3534 }, { "epoch": 0.9638061481834912, "grad_norm": 4.206093146037071, "learning_rate": 3.3960327710663707e-09, "loss": 0.6194, "step": 3535 }, { "epoch": 0.9640787949015064, "grad_norm": 2.2367522596455403, "learning_rate": 3.3448298638873995e-09, "loss": 0.7185, "step": 3536 }, { "epoch": 0.9643514416195215, "grad_norm": 2.4799272279695157, "learning_rate": 3.294014599018846e-09, "loss": 0.6897, "step": 3537 }, { "epoch": 0.9646240883375367, "grad_norm": 13.042314911424105, "learning_rate": 3.243587016122318e-09, "loss": 0.6628, "step": 3538 }, { "epoch": 0.9648967350555517, "grad_norm": 3.7666763540790456, "learning_rate": 3.193547154556775e-09, "loss": 0.5792, "step": 3539 }, { "epoch": 0.9651693817735669, "grad_norm": 2.0891364090374385, "learning_rate": 3.1438950533786977e-09, "loss": 0.6779, "step": 3540 }, { "epoch": 0.965442028491582, "grad_norm": 2.2646744252839213, "learning_rate": 3.094630751341809e-09, "loss": 0.6343, "step": 3541 }, { "epoch": 0.9657146752095972, "grad_norm": 2.622072623878447, "learning_rate": 3.0457542868972398e-09, "loss": 0.762, "step": 3542 }, { "epoch": 0.9659873219276123, "grad_norm": 3.064639032150594, "learning_rate": 2.997265698193252e-09, "loss": 0.6674, "step": 3543 }, { "epoch": 0.9662599686456275, "grad_norm": 2.3463240012256055, "learning_rate": 2.9491650230755727e-09, "loss": 0.6394, "step": 3544 }, { "epoch": 0.9665326153636425, "grad_norm": 14.781589662791381, "learning_rate": 2.9014522990870594e-09, "loss": 0.6541, "step": 3545 }, { "epoch": 0.9668052620816577, "grad_norm": 5.101557249805997, "learning_rate": 2.8541275634677564e-09, "loss": 0.7006, "step": 3546 }, { "epoch": 0.9670779087996728, "grad_norm": 1.8281836505490303, "learning_rate": 2.8071908531548395e-09, "loss": 0.659, "step": 3547 }, { "epoch": 0.967350555517688, "grad_norm": 2.9308305763740825, "learning_rate": 2.760642204782837e-09, "loss": 0.7248, "step": 3548 }, { "epoch": 0.9676232022357031, "grad_norm": 5.381261201504673, "learning_rate": 2.714481654683076e-09, "loss": 0.7777, "step": 3549 }, { "epoch": 0.9678958489537183, "grad_norm": 2.826065539414054, "learning_rate": 2.668709238884237e-09, "loss": 0.6264, "step": 3550 }, { "epoch": 0.9681684956717334, "grad_norm": 2.793655020189492, "learning_rate": 2.623324993111964e-09, "loss": 0.8013, "step": 3551 }, { "epoch": 0.9684411423897484, "grad_norm": 2.0370587869533288, "learning_rate": 2.578328952788922e-09, "loss": 0.7434, "step": 3552 }, { "epoch": 0.9687137891077636, "grad_norm": 3.353825958373069, "learning_rate": 2.5337211530347956e-09, "loss": 0.6718, "step": 3553 }, { "epoch": 0.9689864358257787, "grad_norm": 2.662240871540511, "learning_rate": 2.4895016286661797e-09, "loss": 0.72, "step": 3554 }, { "epoch": 0.9692590825437939, "grad_norm": 2.3126172388333113, "learning_rate": 2.4456704141967433e-09, "loss": 0.7425, "step": 3555 }, { "epoch": 0.969531729261809, "grad_norm": 1.7661593421734985, "learning_rate": 2.4022275438369543e-09, "loss": 0.7154, "step": 3556 }, { "epoch": 0.9698043759798242, "grad_norm": 2.529152156222799, "learning_rate": 2.3591730514942455e-09, "loss": 0.6922, "step": 3557 }, { "epoch": 0.9700770226978392, "grad_norm": 2.2702755996872677, "learning_rate": 2.316506970772958e-09, "loss": 0.6399, "step": 3558 }, { "epoch": 0.9703496694158544, "grad_norm": 2.5494529644241624, "learning_rate": 2.2742293349741758e-09, "loss": 0.712, "step": 3559 }, { "epoch": 0.9706223161338695, "grad_norm": 2.0026090907225154, "learning_rate": 2.2323401770958927e-09, "loss": 0.7195, "step": 3560 }, { "epoch": 0.9708949628518847, "grad_norm": 2.3675341076121614, "learning_rate": 2.190839529832733e-09, "loss": 0.7055, "step": 3561 }, { "epoch": 0.9711676095698998, "grad_norm": 2.1709875003215893, "learning_rate": 2.1497274255762866e-09, "loss": 0.7104, "step": 3562 }, { "epoch": 0.971440256287915, "grad_norm": 4.795733732581907, "learning_rate": 2.1090038964148293e-09, "loss": 0.7063, "step": 3563 }, { "epoch": 0.97171290300593, "grad_norm": 2.5902338131815896, "learning_rate": 2.068668974133214e-09, "loss": 0.6611, "step": 3564 }, { "epoch": 0.9719855497239452, "grad_norm": 5.229301264450987, "learning_rate": 2.0287226902131473e-09, "loss": 0.5902, "step": 3565 }, { "epoch": 0.9722581964419603, "grad_norm": 1.794010109061358, "learning_rate": 1.9891650758329102e-09, "loss": 0.6497, "step": 3566 }, { "epoch": 0.9725308431599755, "grad_norm": 1.993009291225745, "learning_rate": 1.9499961618675285e-09, "loss": 0.6511, "step": 3567 }, { "epoch": 0.9728034898779906, "grad_norm": 1.5579719853818232, "learning_rate": 1.911215978888492e-09, "loss": 0.5927, "step": 3568 }, { "epoch": 0.9730761365960058, "grad_norm": 5.334409788510178, "learning_rate": 1.872824557164032e-09, "loss": 0.6561, "step": 3569 }, { "epoch": 0.9733487833140209, "grad_norm": 4.125243763323083, "learning_rate": 1.8348219266587917e-09, "loss": 0.6536, "step": 3570 }, { "epoch": 0.9736214300320359, "grad_norm": 3.034320263071214, "learning_rate": 1.7972081170340992e-09, "loss": 0.7311, "step": 3571 }, { "epoch": 0.9738940767500511, "grad_norm": 2.7916647778018855, "learning_rate": 1.7599831576478042e-09, "loss": 0.7197, "step": 3572 }, { "epoch": 0.9741667234680662, "grad_norm": 2.3544040800840316, "learning_rate": 1.72314707755411e-09, "loss": 0.6693, "step": 3573 }, { "epoch": 0.9744393701860814, "grad_norm": 9.606669623233577, "learning_rate": 1.6866999055039077e-09, "loss": 0.6767, "step": 3574 }, { "epoch": 0.9747120169040965, "grad_norm": 3.475443720071072, "learning_rate": 1.6506416699443303e-09, "loss": 0.7456, "step": 3575 }, { "epoch": 0.9749846636221117, "grad_norm": 1.6565250460224046, "learning_rate": 1.6149723990190879e-09, "loss": 0.6868, "step": 3576 }, { "epoch": 0.9752573103401267, "grad_norm": 2.3684608227198622, "learning_rate": 1.5796921205682434e-09, "loss": 0.6254, "step": 3577 }, { "epoch": 0.9755299570581419, "grad_norm": 1.8121277443049466, "learning_rate": 1.544800862128326e-09, "loss": 0.6759, "step": 3578 }, { "epoch": 0.975802603776157, "grad_norm": 2.3698113984071445, "learning_rate": 1.510298650932107e-09, "loss": 0.7207, "step": 3579 }, { "epoch": 0.9760752504941722, "grad_norm": 2.9016314468984197, "learning_rate": 1.4761855139087676e-09, "loss": 0.6172, "step": 3580 }, { "epoch": 0.9763478972121873, "grad_norm": 2.002841809181448, "learning_rate": 1.4424614776838428e-09, "loss": 0.6961, "step": 3581 }, { "epoch": 0.9766205439302025, "grad_norm": 5.0508198366234325, "learning_rate": 1.409126568579111e-09, "loss": 0.7277, "step": 3582 }, { "epoch": 0.9768931906482176, "grad_norm": 2.29563937637171, "learning_rate": 1.3761808126126483e-09, "loss": 0.7302, "step": 3583 }, { "epoch": 0.9771658373662327, "grad_norm": 1.6149178289540913, "learning_rate": 1.3436242354989414e-09, "loss": 0.63, "step": 3584 }, { "epoch": 0.9774384840842478, "grad_norm": 3.0881011009598787, "learning_rate": 1.3114568626483858e-09, "loss": 0.7434, "step": 3585 }, { "epoch": 0.977711130802263, "grad_norm": 2.831498071707373, "learning_rate": 1.279678719168009e-09, "loss": 0.7538, "step": 3586 }, { "epoch": 0.9779837775202781, "grad_norm": 3.3027838059562797, "learning_rate": 1.2482898298607492e-09, "loss": 0.7195, "step": 3587 }, { "epoch": 0.9782564242382932, "grad_norm": 3.170267093011807, "learning_rate": 1.217290219225786e-09, "loss": 0.6664, "step": 3588 }, { "epoch": 0.9785290709563084, "grad_norm": 3.296932113420451, "learning_rate": 1.1866799114585435e-09, "loss": 0.7342, "step": 3589 }, { "epoch": 0.9788017176743234, "grad_norm": 3.954623295137776, "learning_rate": 1.1564589304505768e-09, "loss": 0.6744, "step": 3590 }, { "epoch": 0.9790743643923386, "grad_norm": 1.8248493047626286, "learning_rate": 1.126627299789462e-09, "loss": 0.6644, "step": 3591 }, { "epoch": 0.9793470111103537, "grad_norm": 2.08477891007412, "learning_rate": 1.0971850427590746e-09, "loss": 0.632, "step": 3592 }, { "epoch": 0.9796196578283689, "grad_norm": 2.1353898821201565, "learning_rate": 1.0681321823391432e-09, "loss": 0.6903, "step": 3593 }, { "epoch": 0.979892304546384, "grad_norm": 2.5068246337076543, "learning_rate": 1.0394687412056957e-09, "loss": 0.7326, "step": 3594 }, { "epoch": 0.9801649512643992, "grad_norm": 1.7898141277924577, "learning_rate": 1.0111947417306698e-09, "loss": 0.6868, "step": 3595 }, { "epoch": 0.9804375979824143, "grad_norm": 2.658816051317194, "learning_rate": 9.833102059820797e-10, "loss": 0.6559, "step": 3596 }, { "epoch": 0.9807102447004294, "grad_norm": 3.2069047979299934, "learning_rate": 9.558151557240156e-10, "loss": 0.6701, "step": 3597 }, { "epoch": 0.9809828914184445, "grad_norm": 2.582897868209245, "learning_rate": 9.287096124165339e-10, "loss": 0.7281, "step": 3598 }, { "epoch": 0.9812555381364597, "grad_norm": 7.748326671857173, "learning_rate": 9.019935972156001e-10, "loss": 0.7083, "step": 3599 }, { "epoch": 0.9815281848544748, "grad_norm": 1.9989290538314937, "learning_rate": 8.756671309733122e-10, "loss": 0.652, "step": 3600 }, { "epoch": 0.98180083157249, "grad_norm": 2.639062548889636, "learning_rate": 8.497302342375668e-10, "loss": 0.7417, "step": 3601 }, { "epoch": 0.9820734782905051, "grad_norm": 2.9729751160126985, "learning_rate": 8.241829272522815e-10, "loss": 0.6824, "step": 3602 }, { "epoch": 0.9823461250085203, "grad_norm": 2.639953822352038, "learning_rate": 7.990252299573396e-10, "loss": 0.6856, "step": 3603 }, { "epoch": 0.9826187717265353, "grad_norm": 6.177715363918658, "learning_rate": 7.742571619883676e-10, "loss": 0.642, "step": 3604 }, { "epoch": 0.9828914184445505, "grad_norm": 4.939280114353845, "learning_rate": 7.498787426770681e-10, "loss": 0.6352, "step": 3605 }, { "epoch": 0.9831640651625656, "grad_norm": 2.750302469718923, "learning_rate": 7.258899910508876e-10, "loss": 0.6971, "step": 3606 }, { "epoch": 0.9834367118805807, "grad_norm": 2.262416424483204, "learning_rate": 7.022909258332377e-10, "loss": 0.6684, "step": 3607 }, { "epoch": 0.9837093585985959, "grad_norm": 2.5592188301629295, "learning_rate": 6.790815654432735e-10, "loss": 0.7047, "step": 3608 }, { "epoch": 0.983982005316611, "grad_norm": 2.642161917749914, "learning_rate": 6.562619279961157e-10, "loss": 0.7335, "step": 3609 }, { "epoch": 0.9842546520346261, "grad_norm": 2.174764542959981, "learning_rate": 6.338320313025725e-10, "loss": 0.6751, "step": 3610 }, { "epoch": 0.9845272987526412, "grad_norm": 2.502127265580326, "learning_rate": 6.117918928693622e-10, "loss": 0.6486, "step": 3611 }, { "epoch": 0.9847999454706564, "grad_norm": 2.370211774111031, "learning_rate": 5.90141529898891e-10, "loss": 0.6479, "step": 3612 }, { "epoch": 0.9850725921886715, "grad_norm": 2.2324312796960264, "learning_rate": 5.68880959289364e-10, "loss": 0.7823, "step": 3613 }, { "epoch": 0.9853452389066867, "grad_norm": 31.663233921397264, "learning_rate": 5.48010197634896e-10, "loss": 0.6559, "step": 3614 }, { "epoch": 0.9856178856247018, "grad_norm": 2.3872691137936504, "learning_rate": 5.275292612251236e-10, "loss": 0.6924, "step": 3615 }, { "epoch": 0.985890532342717, "grad_norm": 2.4262868877118304, "learning_rate": 5.074381660456484e-10, "loss": 0.6732, "step": 3616 }, { "epoch": 0.986163179060732, "grad_norm": 2.649497280212605, "learning_rate": 4.877369277775934e-10, "loss": 0.7501, "step": 3617 }, { "epoch": 0.9864358257787472, "grad_norm": 1.9052803619806438, "learning_rate": 4.68425561797936e-10, "loss": 0.6921, "step": 3618 }, { "epoch": 0.9867084724967623, "grad_norm": 2.1666419663987653, "learning_rate": 4.4950408317934176e-10, "loss": 0.7253, "step": 3619 }, { "epoch": 0.9869811192147775, "grad_norm": 1.788938726822749, "learning_rate": 4.309725066900527e-10, "loss": 0.6678, "step": 3620 }, { "epoch": 0.9872537659327926, "grad_norm": 1.8721013935408408, "learning_rate": 4.1283084679416545e-10, "loss": 0.68, "step": 3621 }, { "epoch": 0.9875264126508078, "grad_norm": 4.095831490530716, "learning_rate": 3.95079117651298e-10, "loss": 0.7232, "step": 3622 }, { "epoch": 0.9877990593688228, "grad_norm": 2.6636657628449187, "learning_rate": 3.77717333116756e-10, "loss": 0.7261, "step": 3623 }, { "epoch": 0.988071706086838, "grad_norm": 2.0725026620636284, "learning_rate": 3.6074550674158877e-10, "loss": 0.6672, "step": 3624 }, { "epoch": 0.9883443528048531, "grad_norm": 2.423318511148992, "learning_rate": 3.441636517723667e-10, "loss": 0.734, "step": 3625 }, { "epoch": 0.9886169995228682, "grad_norm": 3.0458093340292987, "learning_rate": 3.2797178115134827e-10, "loss": 0.7304, "step": 3626 }, { "epoch": 0.9888896462408834, "grad_norm": 5.307335111921316, "learning_rate": 3.1216990751636863e-10, "loss": 0.5822, "step": 3627 }, { "epoch": 0.9891622929588985, "grad_norm": 1.9490608231639748, "learning_rate": 2.9675804320083987e-10, "loss": 0.627, "step": 3628 }, { "epoch": 0.9894349396769136, "grad_norm": 2.45906309239511, "learning_rate": 2.817362002339174e-10, "loss": 0.6852, "step": 3629 }, { "epoch": 0.9897075863949287, "grad_norm": 2.126387790817943, "learning_rate": 2.6710439034011154e-10, "loss": 0.6579, "step": 3630 }, { "epoch": 0.9899802331129439, "grad_norm": 2.730949990939377, "learning_rate": 2.5286262493978694e-10, "loss": 0.7674, "step": 3631 }, { "epoch": 0.990252879830959, "grad_norm": 3.2586079322397805, "learning_rate": 2.39010915148552e-10, "loss": 0.6054, "step": 3632 }, { "epoch": 0.9905255265489742, "grad_norm": 5.390713092335652, "learning_rate": 2.2554927177792504e-10, "loss": 0.7317, "step": 3633 }, { "epoch": 0.9907981732669893, "grad_norm": 2.153434402646795, "learning_rate": 2.1247770533466824e-10, "loss": 0.696, "step": 3634 }, { "epoch": 0.9910708199850045, "grad_norm": 2.6441142418878756, "learning_rate": 1.9979622602123158e-10, "loss": 0.6857, "step": 3635 }, { "epoch": 0.9913434667030195, "grad_norm": 1.9363007254871525, "learning_rate": 1.8750484373569742e-10, "loss": 0.7298, "step": 3636 }, { "epoch": 0.9916161134210347, "grad_norm": 3.062497187631331, "learning_rate": 1.7560356807144737e-10, "loss": 0.7169, "step": 3637 }, { "epoch": 0.9918887601390498, "grad_norm": 2.262282244793249, "learning_rate": 1.6409240831749549e-10, "loss": 0.6711, "step": 3638 }, { "epoch": 0.992161406857065, "grad_norm": 4.814108234373376, "learning_rate": 1.529713734584326e-10, "loss": 0.7095, "step": 3639 }, { "epoch": 0.9924340535750801, "grad_norm": 2.087505412785881, "learning_rate": 1.4224047217425982e-10, "loss": 0.7314, "step": 3640 }, { "epoch": 0.9927067002930953, "grad_norm": 3.558949508431515, "learning_rate": 1.3189971284055524e-10, "loss": 0.6955, "step": 3641 }, { "epoch": 0.9929793470111103, "grad_norm": 2.3507381095008526, "learning_rate": 1.219491035282516e-10, "loss": 0.7038, "step": 3642 }, { "epoch": 0.9932519937291255, "grad_norm": 1.9050538619778656, "learning_rate": 1.1238865200391412e-10, "loss": 0.6711, "step": 3643 }, { "epoch": 0.9935246404471406, "grad_norm": 2.1955498625101604, "learning_rate": 1.0321836572957377e-10, "loss": 0.639, "step": 3644 }, { "epoch": 0.9937972871651557, "grad_norm": 2.8244195067874838, "learning_rate": 9.443825186261634e-11, "loss": 0.6121, "step": 3645 }, { "epoch": 0.9940699338831709, "grad_norm": 3.60988523628679, "learning_rate": 8.604831725600447e-11, "loss": 0.7223, "step": 3646 }, { "epoch": 0.994342580601186, "grad_norm": 2.0992091507188295, "learning_rate": 7.804856845811114e-11, "loss": 0.6272, "step": 3647 }, { "epoch": 0.9946152273192012, "grad_norm": 2.2861819795277736, "learning_rate": 7.043901171283062e-11, "loss": 0.753, "step": 3648 }, { "epoch": 0.9948878740372162, "grad_norm": 2.752935060887187, "learning_rate": 6.321965295946752e-11, "loss": 0.6318, "step": 3649 }, { "epoch": 0.9951605207552314, "grad_norm": 2.3916619086646955, "learning_rate": 5.639049783273675e-11, "loss": 0.6143, "step": 3650 }, { "epoch": 0.9954331674732465, "grad_norm": 2.1329699285414074, "learning_rate": 4.995155166281906e-11, "loss": 0.6703, "step": 3651 }, { "epoch": 0.9957058141912617, "grad_norm": 1.9163384009909015, "learning_rate": 4.390281947541652e-11, "loss": 0.7123, "step": 3652 }, { "epoch": 0.9959784609092768, "grad_norm": 2.5356773852540946, "learning_rate": 3.8244305991530504e-11, "loss": 0.6578, "step": 3653 }, { "epoch": 0.996251107627292, "grad_norm": 5.932951715993654, "learning_rate": 3.297601562773922e-11, "loss": 0.6648, "step": 3654 }, { "epoch": 0.996523754345307, "grad_norm": 3.807001567778944, "learning_rate": 2.8097952495920173e-11, "loss": 0.7198, "step": 3655 }, { "epoch": 0.9967964010633222, "grad_norm": 3.0824140698552935, "learning_rate": 2.36101204034167e-11, "loss": 0.708, "step": 3656 }, { "epoch": 0.9970690477813373, "grad_norm": 12.181921918172938, "learning_rate": 1.951252285309346e-11, "loss": 0.6481, "step": 3657 }, { "epoch": 0.9973416944993525, "grad_norm": 4.543850584306596, "learning_rate": 1.58051630430589e-11, "loss": 0.7663, "step": 3658 }, { "epoch": 0.9976143412173676, "grad_norm": 5.129924385250918, "learning_rate": 1.2488043866942798e-11, "loss": 0.6567, "step": 3659 }, { "epoch": 0.9978869879353828, "grad_norm": 2.5117533788401953, "learning_rate": 9.561167913840762e-12, "loss": 0.7439, "step": 3660 }, { "epoch": 0.9981596346533979, "grad_norm": 2.4801348650990476, "learning_rate": 7.0245374680921735e-12, "loss": 0.7725, "step": 3661 }, { "epoch": 0.998432281371413, "grad_norm": 1.9199177147937605, "learning_rate": 4.878154509668775e-12, "loss": 0.7585, "step": 3662 }, { "epoch": 0.9987049280894281, "grad_norm": 2.2657712612853254, "learning_rate": 3.1220207137305774e-12, "loss": 0.6497, "step": 3663 }, { "epoch": 0.9989775748074432, "grad_norm": 3.0108669749209027, "learning_rate": 1.75613745106995e-12, "loss": 0.74, "step": 3664 }, { "epoch": 0.9992502215254584, "grad_norm": 2.7868383259008263, "learning_rate": 7.80505787612018e-13, "loss": 0.7131, "step": 3665 }, { "epoch": 0.9995228682434735, "grad_norm": 1.9237207527190912, "learning_rate": 1.9512648496977647e-13, "loss": 0.6293, "step": 3666 }, { "epoch": 0.9997955149614887, "grad_norm": 2.225770424206677, "learning_rate": 0.0, "loss": 0.6969, "step": 3667 }, { "epoch": 0.9997955149614887, "step": 3667, "total_flos": 3386943519457280.0, "train_loss": 0.7061884974568619, "train_runtime": 27498.1956, "train_samples_per_second": 34.145, "train_steps_per_second": 0.133 } ], "logging_steps": 1.0, "max_steps": 3667, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3386943519457280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }