diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5923 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1680, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0035714285714285713, + "grad_norm": 0.3871030807495117, + "learning_rate": 1.1904761904761906e-07, + "loss": 1.9294867515563965, + "step": 2 + }, + { + "epoch": 0.007142857142857143, + "grad_norm": 0.34607651829719543, + "learning_rate": 3.5714285714285716e-07, + "loss": 1.931689739227295, + "step": 4 + }, + { + "epoch": 0.010714285714285714, + "grad_norm": 0.3077217638492584, + "learning_rate": 5.952380952380953e-07, + "loss": 1.859986662864685, + "step": 6 + }, + { + "epoch": 0.014285714285714285, + "grad_norm": 0.26033300161361694, + "learning_rate": 8.333333333333333e-07, + "loss": 1.8296231031417847, + "step": 8 + }, + { + "epoch": 0.017857142857142856, + "grad_norm": 0.3577536344528198, + "learning_rate": 1.0714285714285714e-06, + "loss": 1.840135097503662, + "step": 10 + }, + { + "epoch": 0.02142857142857143, + "grad_norm": 0.35149258375167847, + "learning_rate": 1.3095238095238096e-06, + "loss": 1.718151330947876, + "step": 12 + }, + { + "epoch": 0.025, + "grad_norm": 0.3105311691761017, + "learning_rate": 1.5476190476190479e-06, + "loss": 1.8123761415481567, + "step": 14 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 0.3541400134563446, + "learning_rate": 1.7857142857142859e-06, + "loss": 1.801349401473999, + "step": 16 + }, + { + "epoch": 0.03214285714285714, + "grad_norm": 0.32876938581466675, + "learning_rate": 2.023809523809524e-06, + "loss": 1.8854210376739502, + "step": 18 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 0.9392958283424377, + "learning_rate": 2.261904761904762e-06, + "loss": 1.7024314403533936, + "step": 20 + }, + { + "epoch": 0.039285714285714285, + "grad_norm": 0.6484195590019226, + "learning_rate": 2.5e-06, + "loss": 1.9459373950958252, + "step": 22 + }, + { + "epoch": 0.04285714285714286, + "grad_norm": 0.36433079838752747, + "learning_rate": 2.7380952380952387e-06, + "loss": 1.9512709379196167, + "step": 24 + }, + { + "epoch": 0.04642857142857143, + "grad_norm": 0.4358835220336914, + "learning_rate": 2.9761904761904763e-06, + "loss": 1.7940953969955444, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.506097137928009, + "learning_rate": 3.2142857142857147e-06, + "loss": 1.7999926805496216, + "step": 28 + }, + { + "epoch": 0.05357142857142857, + "grad_norm": 0.5315778255462646, + "learning_rate": 3.4523809523809528e-06, + "loss": 1.7870306968688965, + "step": 30 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.30482104420661926, + "learning_rate": 3.690476190476191e-06, + "loss": 1.8913555145263672, + "step": 32 + }, + { + "epoch": 0.060714285714285714, + "grad_norm": 0.8241702318191528, + "learning_rate": 3.928571428571429e-06, + "loss": 1.9280858039855957, + "step": 34 + }, + { + "epoch": 0.06428571428571428, + "grad_norm": 0.39840635657310486, + "learning_rate": 4.166666666666667e-06, + "loss": 1.9256908893585205, + "step": 36 + }, + { + "epoch": 0.06785714285714285, + "grad_norm": 0.33251017332077026, + "learning_rate": 4.404761904761905e-06, + "loss": 1.8829214572906494, + "step": 38 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.49388226866722107, + "learning_rate": 4.642857142857144e-06, + "loss": 1.8666248321533203, + "step": 40 + }, + { + "epoch": 0.075, + "grad_norm": 0.28926795721054077, + "learning_rate": 4.880952380952381e-06, + "loss": 1.8469940423965454, + "step": 42 + }, + { + "epoch": 0.07857142857142857, + "grad_norm": 0.317127525806427, + "learning_rate": 5.119047619047619e-06, + "loss": 1.892695426940918, + "step": 44 + }, + { + "epoch": 0.08214285714285714, + "grad_norm": 0.8169130682945251, + "learning_rate": 5.357142857142857e-06, + "loss": 1.893534541130066, + "step": 46 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.27684587240219116, + "learning_rate": 5.595238095238096e-06, + "loss": 1.5699528455734253, + "step": 48 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 0.5231921076774597, + "learning_rate": 5.833333333333334e-06, + "loss": 1.6496429443359375, + "step": 50 + }, + { + "epoch": 0.09285714285714286, + "grad_norm": 0.5755372643470764, + "learning_rate": 6.071428571428571e-06, + "loss": 1.6312464475631714, + "step": 52 + }, + { + "epoch": 0.09642857142857143, + "grad_norm": 0.40994322299957275, + "learning_rate": 6.30952380952381e-06, + "loss": 1.8703556060791016, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.6402392983436584, + "learning_rate": 6.547619047619048e-06, + "loss": 1.7122882604599, + "step": 56 + }, + { + "epoch": 0.10357142857142858, + "grad_norm": 0.4092760980129242, + "learning_rate": 6.785714285714287e-06, + "loss": 1.7604010105133057, + "step": 58 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 0.41417962312698364, + "learning_rate": 7.023809523809524e-06, + "loss": 2.1781420707702637, + "step": 60 + }, + { + "epoch": 0.11071428571428571, + "grad_norm": 0.6246824264526367, + "learning_rate": 7.261904761904762e-06, + "loss": 1.7982336282730103, + "step": 62 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.26309502124786377, + "learning_rate": 7.500000000000001e-06, + "loss": 1.696463704109192, + "step": 64 + }, + { + "epoch": 0.11785714285714285, + "grad_norm": 0.9458585381507874, + "learning_rate": 7.738095238095238e-06, + "loss": 1.7728084325790405, + "step": 66 + }, + { + "epoch": 0.12142857142857143, + "grad_norm": 0.22862379252910614, + "learning_rate": 7.976190476190477e-06, + "loss": 1.6821340322494507, + "step": 68 + }, + { + "epoch": 0.125, + "grad_norm": 0.236324280500412, + "learning_rate": 8.214285714285714e-06, + "loss": 1.7681533098220825, + "step": 70 + }, + { + "epoch": 0.12857142857142856, + "grad_norm": 0.2597522735595703, + "learning_rate": 8.452380952380953e-06, + "loss": 1.8034054040908813, + "step": 72 + }, + { + "epoch": 0.13214285714285715, + "grad_norm": 0.24487343430519104, + "learning_rate": 8.690476190476192e-06, + "loss": 1.7554086446762085, + "step": 74 + }, + { + "epoch": 0.1357142857142857, + "grad_norm": 0.22543826699256897, + "learning_rate": 8.92857142857143e-06, + "loss": 1.7456854581832886, + "step": 76 + }, + { + "epoch": 0.1392857142857143, + "grad_norm": 0.2380058914422989, + "learning_rate": 9.166666666666666e-06, + "loss": 1.7143663167953491, + "step": 78 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.26500657200813293, + "learning_rate": 9.404761904761905e-06, + "loss": 1.7059998512268066, + "step": 80 + }, + { + "epoch": 0.14642857142857144, + "grad_norm": 0.2978551387786865, + "learning_rate": 9.642857142857144e-06, + "loss": 1.7792344093322754, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.2930593490600586, + "learning_rate": 9.880952380952381e-06, + "loss": 1.6987429857254028, + "step": 84 + }, + { + "epoch": 0.15357142857142858, + "grad_norm": 0.4046596884727478, + "learning_rate": 9.999991282010348e-06, + "loss": 1.7894960641860962, + "step": 86 + }, + { + "epoch": 0.15714285714285714, + "grad_norm": 0.23502953350543976, + "learning_rate": 9.999921538295799e-06, + "loss": 1.749454379081726, + "step": 88 + }, + { + "epoch": 0.16071428571428573, + "grad_norm": 0.22283266484737396, + "learning_rate": 9.999782051947632e-06, + "loss": 1.686018943786621, + "step": 90 + }, + { + "epoch": 0.16428571428571428, + "grad_norm": 0.24027639627456665, + "learning_rate": 9.999572825127696e-06, + "loss": 1.480033040046692, + "step": 92 + }, + { + "epoch": 0.16785714285714284, + "grad_norm": 0.5684676766395569, + "learning_rate": 9.99929386107872e-06, + "loss": 1.675416350364685, + "step": 94 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 1.138840675354004, + "learning_rate": 9.998945164124268e-06, + "loss": 1.7155344486236572, + "step": 96 + }, + { + "epoch": 0.175, + "grad_norm": 0.2664114534854889, + "learning_rate": 9.998526739668664e-06, + "loss": 1.6043933629989624, + "step": 98 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 0.28691864013671875, + "learning_rate": 9.998038594196913e-06, + "loss": 1.6187028884887695, + "step": 100 + }, + { + "epoch": 0.18214285714285713, + "grad_norm": 0.31850922107696533, + "learning_rate": 9.997480735274608e-06, + "loss": 1.5820776224136353, + "step": 102 + }, + { + "epoch": 0.18571428571428572, + "grad_norm": 0.23401758074760437, + "learning_rate": 9.996853171547794e-06, + "loss": 1.5967426300048828, + "step": 104 + }, + { + "epoch": 0.18928571428571428, + "grad_norm": 0.23440219461917877, + "learning_rate": 9.996155912742856e-06, + "loss": 1.6334154605865479, + "step": 106 + }, + { + "epoch": 0.19285714285714287, + "grad_norm": 0.7341821193695068, + "learning_rate": 9.995388969666348e-06, + "loss": 1.598835825920105, + "step": 108 + }, + { + "epoch": 0.19642857142857142, + "grad_norm": 0.6320663094520569, + "learning_rate": 9.994552354204844e-06, + "loss": 1.6243830919265747, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.5404586791992188, + "learning_rate": 9.993646079324738e-06, + "loss": 1.566571831703186, + "step": 112 + }, + { + "epoch": 0.20357142857142857, + "grad_norm": 0.5022917985916138, + "learning_rate": 9.992670159072052e-06, + "loss": 1.6408634185791016, + "step": 114 + }, + { + "epoch": 0.20714285714285716, + "grad_norm": 1.1218639612197876, + "learning_rate": 9.991624608572215e-06, + "loss": 1.7009669542312622, + "step": 116 + }, + { + "epoch": 0.21071428571428572, + "grad_norm": 0.5181306004524231, + "learning_rate": 9.990509444029833e-06, + "loss": 1.7996366024017334, + "step": 118 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.43997907638549805, + "learning_rate": 9.98932468272843e-06, + "loss": 1.7554632425308228, + "step": 120 + }, + { + "epoch": 0.21785714285714286, + "grad_norm": 0.3227292597293854, + "learning_rate": 9.98807034303019e-06, + "loss": 1.473575472831726, + "step": 122 + }, + { + "epoch": 0.22142857142857142, + "grad_norm": 0.3611178398132324, + "learning_rate": 9.98674644437566e-06, + "loss": 1.594710350036621, + "step": 124 + }, + { + "epoch": 0.225, + "grad_norm": 0.9151387214660645, + "learning_rate": 9.985353007283464e-06, + "loss": 1.6291745901107788, + "step": 126 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.2581241726875305, + "learning_rate": 9.983890053349969e-06, + "loss": 1.2893997430801392, + "step": 128 + }, + { + "epoch": 0.23214285714285715, + "grad_norm": 0.5861591100692749, + "learning_rate": 9.982357605248963e-06, + "loss": 1.0495647192001343, + "step": 130 + }, + { + "epoch": 0.2357142857142857, + "grad_norm": 0.46270960569381714, + "learning_rate": 9.980755686731296e-06, + "loss": 1.3306972980499268, + "step": 132 + }, + { + "epoch": 0.2392857142857143, + "grad_norm": 0.36067521572113037, + "learning_rate": 9.979084322624518e-06, + "loss": 1.5336247682571411, + "step": 134 + }, + { + "epoch": 0.24285714285714285, + "grad_norm": 0.9336586594581604, + "learning_rate": 9.977343538832486e-06, + "loss": 1.7042999267578125, + "step": 136 + }, + { + "epoch": 0.24642857142857144, + "grad_norm": 0.387260377407074, + "learning_rate": 9.97553336233497e-06, + "loss": 1.396690011024475, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.4640398919582367, + "learning_rate": 9.973653821187233e-06, + "loss": 1.3623416423797607, + "step": 140 + }, + { + "epoch": 0.25357142857142856, + "grad_norm": 0.33568593859672546, + "learning_rate": 9.971704944519593e-06, + "loss": 1.3992865085601807, + "step": 142 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 0.24492622911930084, + "learning_rate": 9.969686762536973e-06, + "loss": 1.444324016571045, + "step": 144 + }, + { + "epoch": 0.26071428571428573, + "grad_norm": 0.8304792642593384, + "learning_rate": 9.967599306518438e-06, + "loss": 1.3990877866744995, + "step": 146 + }, + { + "epoch": 0.2642857142857143, + "grad_norm": 0.623303234577179, + "learning_rate": 9.965442608816704e-06, + "loss": 1.4563076496124268, + "step": 148 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 0.31338322162628174, + "learning_rate": 9.963216702857635e-06, + "loss": 1.6392706632614136, + "step": 150 + }, + { + "epoch": 0.2714285714285714, + "grad_norm": 0.3007861375808716, + "learning_rate": 9.96092162313973e-06, + "loss": 1.5057697296142578, + "step": 152 + }, + { + "epoch": 0.275, + "grad_norm": 0.15701599419116974, + "learning_rate": 9.958557405233593e-06, + "loss": 1.4597502946853638, + "step": 154 + }, + { + "epoch": 0.2785714285714286, + "grad_norm": 0.2882039248943329, + "learning_rate": 9.956124085781366e-06, + "loss": 1.3839119672775269, + "step": 156 + }, + { + "epoch": 0.28214285714285714, + "grad_norm": 0.3199823498725891, + "learning_rate": 9.953621702496178e-06, + "loss": 1.6068451404571533, + "step": 158 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.24365948140621185, + "learning_rate": 9.951050294161548e-06, + "loss": 1.7299036979675293, + "step": 160 + }, + { + "epoch": 0.2892857142857143, + "grad_norm": 0.46145617961883545, + "learning_rate": 9.948409900630787e-06, + "loss": 1.3489717245101929, + "step": 162 + }, + { + "epoch": 0.29285714285714287, + "grad_norm": 0.15912453830242157, + "learning_rate": 9.945700562826394e-06, + "loss": 1.5043880939483643, + "step": 164 + }, + { + "epoch": 0.29642857142857143, + "grad_norm": 0.1797444075345993, + "learning_rate": 9.942922322739395e-06, + "loss": 1.1060163974761963, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.2711468040943146, + "learning_rate": 9.940075223428718e-06, + "loss": 1.8668510913848877, + "step": 168 + }, + { + "epoch": 0.30357142857142855, + "grad_norm": 0.1544935554265976, + "learning_rate": 9.93715930902051e-06, + "loss": 1.2808830738067627, + "step": 170 + }, + { + "epoch": 0.30714285714285716, + "grad_norm": 0.37536972761154175, + "learning_rate": 9.934174624707459e-06, + "loss": 1.4175796508789062, + "step": 172 + }, + { + "epoch": 0.3107142857142857, + "grad_norm": 0.22495543956756592, + "learning_rate": 9.931121216748092e-06, + "loss": 1.6282312870025635, + "step": 174 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 0.3511154055595398, + "learning_rate": 9.927999132466059e-06, + "loss": 1.635170340538025, + "step": 176 + }, + { + "epoch": 0.31785714285714284, + "grad_norm": 0.47558754682540894, + "learning_rate": 9.924808420249404e-06, + "loss": 1.563542127609253, + "step": 178 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 0.6490837335586548, + "learning_rate": 9.921549129549799e-06, + "loss": 2.066225290298462, + "step": 180 + }, + { + "epoch": 0.325, + "grad_norm": 0.37414857745170593, + "learning_rate": 9.918221310881797e-06, + "loss": 1.2984635829925537, + "step": 182 + }, + { + "epoch": 0.32857142857142857, + "grad_norm": 0.24293118715286255, + "learning_rate": 9.91482501582204e-06, + "loss": 1.560595989227295, + "step": 184 + }, + { + "epoch": 0.33214285714285713, + "grad_norm": 1.0313069820404053, + "learning_rate": 9.91136029700846e-06, + "loss": 1.6615456342697144, + "step": 186 + }, + { + "epoch": 0.3357142857142857, + "grad_norm": 0.2365736961364746, + "learning_rate": 9.907827208139462e-06, + "loss": 1.4550660848617554, + "step": 188 + }, + { + "epoch": 0.3392857142857143, + "grad_norm": 0.48783159255981445, + "learning_rate": 9.904225803973095e-06, + "loss": 1.1695599555969238, + "step": 190 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.18840378522872925, + "learning_rate": 9.900556140326203e-06, + "loss": 1.175657033920288, + "step": 192 + }, + { + "epoch": 0.3464285714285714, + "grad_norm": 0.33193379640579224, + "learning_rate": 9.896818274073555e-06, + "loss": 1.5140769481658936, + "step": 194 + }, + { + "epoch": 0.35, + "grad_norm": 0.23811382055282593, + "learning_rate": 9.893012263146971e-06, + "loss": 1.5519834756851196, + "step": 196 + }, + { + "epoch": 0.3535714285714286, + "grad_norm": 0.3118128478527069, + "learning_rate": 9.889138166534416e-06, + "loss": 1.6215221881866455, + "step": 198 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.20358364284038544, + "learning_rate": 9.885196044279095e-06, + "loss": 1.4951940774917603, + "step": 200 + }, + { + "epoch": 0.3607142857142857, + "grad_norm": 0.33988499641418457, + "learning_rate": 9.881185957478514e-06, + "loss": 1.5101016759872437, + "step": 202 + }, + { + "epoch": 0.36428571428571427, + "grad_norm": 0.2685701251029968, + "learning_rate": 9.877107968283538e-06, + "loss": 1.3352025747299194, + "step": 204 + }, + { + "epoch": 0.3678571428571429, + "grad_norm": 0.24243606626987457, + "learning_rate": 9.872962139897426e-06, + "loss": 1.6032525300979614, + "step": 206 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.2574315369129181, + "learning_rate": 9.86874853657485e-06, + "loss": 1.4210426807403564, + "step": 208 + }, + { + "epoch": 0.375, + "grad_norm": 0.30428075790405273, + "learning_rate": 9.864467223620908e-06, + "loss": 1.4541680812835693, + "step": 210 + }, + { + "epoch": 0.37857142857142856, + "grad_norm": 0.27129194140434265, + "learning_rate": 9.860118267390092e-06, + "loss": 1.5869474411010742, + "step": 212 + }, + { + "epoch": 0.3821428571428571, + "grad_norm": 0.2757408022880554, + "learning_rate": 9.855701735285285e-06, + "loss": 1.4132391214370728, + "step": 214 + }, + { + "epoch": 0.38571428571428573, + "grad_norm": 0.8043122291564941, + "learning_rate": 9.851217695756694e-06, + "loss": 1.3423351049423218, + "step": 216 + }, + { + "epoch": 0.3892857142857143, + "grad_norm": 0.3778972327709198, + "learning_rate": 9.846666218300808e-06, + "loss": 1.4252076148986816, + "step": 218 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 0.30076470971107483, + "learning_rate": 9.842047373459305e-06, + "loss": 1.477191686630249, + "step": 220 + }, + { + "epoch": 0.3964285714285714, + "grad_norm": 0.2551064193248749, + "learning_rate": 9.837361232817964e-06, + "loss": 1.4160501956939697, + "step": 222 + }, + { + "epoch": 0.4, + "grad_norm": 0.3507143557071686, + "learning_rate": 9.832607869005565e-06, + "loss": 1.3853830099105835, + "step": 224 + }, + { + "epoch": 0.4035714285714286, + "grad_norm": 0.2576788663864136, + "learning_rate": 9.827787355692749e-06, + "loss": 1.5642895698547363, + "step": 226 + }, + { + "epoch": 0.40714285714285714, + "grad_norm": 0.6864569187164307, + "learning_rate": 9.822899767590884e-06, + "loss": 1.5939396619796753, + "step": 228 + }, + { + "epoch": 0.4107142857142857, + "grad_norm": 0.290575236082077, + "learning_rate": 9.817945180450902e-06, + "loss": 1.692050814628601, + "step": 230 + }, + { + "epoch": 0.4142857142857143, + "grad_norm": 0.2678094506263733, + "learning_rate": 9.812923671062139e-06, + "loss": 1.5078585147857666, + "step": 232 + }, + { + "epoch": 0.41785714285714287, + "grad_norm": 0.2580035924911499, + "learning_rate": 9.80783531725112e-06, + "loss": 1.5001014471054077, + "step": 234 + }, + { + "epoch": 0.42142857142857143, + "grad_norm": 0.48962509632110596, + "learning_rate": 9.80268019788038e-06, + "loss": 1.30159592628479, + "step": 236 + }, + { + "epoch": 0.425, + "grad_norm": 0.3317374587059021, + "learning_rate": 9.79745839284722e-06, + "loss": 1.3308159112930298, + "step": 238 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.40648195147514343, + "learning_rate": 9.792169983082484e-06, + "loss": 1.2273372411727905, + "step": 240 + }, + { + "epoch": 0.43214285714285716, + "grad_norm": 0.23908843100070953, + "learning_rate": 9.786815050549295e-06, + "loss": 1.3610113859176636, + "step": 242 + }, + { + "epoch": 0.4357142857142857, + "grad_norm": 0.30702945590019226, + "learning_rate": 9.781393678241787e-06, + "loss": 1.526265025138855, + "step": 244 + }, + { + "epoch": 0.4392857142857143, + "grad_norm": 0.3442508578300476, + "learning_rate": 9.775905950183821e-06, + "loss": 1.4831691980361938, + "step": 246 + }, + { + "epoch": 0.44285714285714284, + "grad_norm": 0.2286010980606079, + "learning_rate": 9.770351951427684e-06, + "loss": 1.5686728954315186, + "step": 248 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 0.42278486490249634, + "learning_rate": 9.764731768052762e-06, + "loss": 1.583655834197998, + "step": 250 + }, + { + "epoch": 0.45, + "grad_norm": 0.2032240778207779, + "learning_rate": 9.75904548716422e-06, + "loss": 1.7357392311096191, + "step": 252 + }, + { + "epoch": 0.45357142857142857, + "grad_norm": 0.6682279706001282, + "learning_rate": 9.753293196891639e-06, + "loss": 1.4943958520889282, + "step": 254 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.3326359689235687, + "learning_rate": 9.747474986387655e-06, + "loss": 1.4537054300308228, + "step": 256 + }, + { + "epoch": 0.4607142857142857, + "grad_norm": 0.31363770365715027, + "learning_rate": 9.74159094582658e-06, + "loss": 1.4956998825073242, + "step": 258 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 0.6148827075958252, + "learning_rate": 9.735641166402998e-06, + "loss": 1.3277488946914673, + "step": 260 + }, + { + "epoch": 0.46785714285714286, + "grad_norm": 0.31903398036956787, + "learning_rate": 9.729625740330363e-06, + "loss": 1.49782133102417, + "step": 262 + }, + { + "epoch": 0.4714285714285714, + "grad_norm": 0.8235952258110046, + "learning_rate": 9.723544760839555e-06, + "loss": 1.460282325744629, + "step": 264 + }, + { + "epoch": 0.475, + "grad_norm": 0.3165692090988159, + "learning_rate": 9.717398322177442e-06, + "loss": 1.5261378288269043, + "step": 266 + }, + { + "epoch": 0.4785714285714286, + "grad_norm": 1.6001724004745483, + "learning_rate": 9.71118651960543e-06, + "loss": 1.7769297361373901, + "step": 268 + }, + { + "epoch": 0.48214285714285715, + "grad_norm": 0.26702550053596497, + "learning_rate": 9.704909449397962e-06, + "loss": 1.3878670930862427, + "step": 270 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.18522176146507263, + "learning_rate": 9.69856720884105e-06, + "loss": 1.2690881490707397, + "step": 272 + }, + { + "epoch": 0.48928571428571427, + "grad_norm": 0.40137895941734314, + "learning_rate": 9.692159896230757e-06, + "loss": 1.3622859716415405, + "step": 274 + }, + { + "epoch": 0.4928571428571429, + "grad_norm": 0.34499719738960266, + "learning_rate": 9.685687610871666e-06, + "loss": 1.6427959203720093, + "step": 276 + }, + { + "epoch": 0.49642857142857144, + "grad_norm": 0.3400484621524811, + "learning_rate": 9.679150453075357e-06, + "loss": 1.3161296844482422, + "step": 278 + }, + { + "epoch": 0.5, + "grad_norm": 0.3539294898509979, + "learning_rate": 9.67254852415884e-06, + "loss": 1.2691534757614136, + "step": 280 + }, + { + "epoch": 0.5035714285714286, + "grad_norm": 0.2683607339859009, + "learning_rate": 9.665881926442994e-06, + "loss": 1.5461015701293945, + "step": 282 + }, + { + "epoch": 0.5071428571428571, + "grad_norm": 0.3605668246746063, + "learning_rate": 9.659150763250966e-06, + "loss": 1.6314688920974731, + "step": 284 + }, + { + "epoch": 0.5107142857142857, + "grad_norm": 0.3184402585029602, + "learning_rate": 9.652355138906591e-06, + "loss": 1.518629789352417, + "step": 286 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.870186984539032, + "learning_rate": 9.645495158732755e-06, + "loss": 1.143850564956665, + "step": 288 + }, + { + "epoch": 0.5178571428571429, + "grad_norm": 0.33421170711517334, + "learning_rate": 9.638570929049776e-06, + "loss": 1.0234707593917847, + "step": 290 + }, + { + "epoch": 0.5214285714285715, + "grad_norm": 0.1717844307422638, + "learning_rate": 9.631582557173751e-06, + "loss": 1.4672911167144775, + "step": 292 + }, + { + "epoch": 0.525, + "grad_norm": 0.9301527738571167, + "learning_rate": 9.624530151414894e-06, + "loss": 1.2730239629745483, + "step": 294 + }, + { + "epoch": 0.5285714285714286, + "grad_norm": 0.38581225275993347, + "learning_rate": 9.617413821075852e-06, + "loss": 1.2601397037506104, + "step": 296 + }, + { + "epoch": 0.5321428571428571, + "grad_norm": 0.23976172506809235, + "learning_rate": 9.61023367645002e-06, + "loss": 1.2101945877075195, + "step": 298 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 0.22152353823184967, + "learning_rate": 9.602989828819829e-06, + "loss": 1.542162537574768, + "step": 300 + }, + { + "epoch": 0.5392857142857143, + "grad_norm": 0.4408532381057739, + "learning_rate": 9.595682390455015e-06, + "loss": 1.3136895895004272, + "step": 302 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 0.4480395019054413, + "learning_rate": 9.588311474610888e-06, + "loss": 1.1242649555206299, + "step": 304 + }, + { + "epoch": 0.5464285714285714, + "grad_norm": 0.7074999213218689, + "learning_rate": 9.580877195526564e-06, + "loss": 1.6407079696655273, + "step": 306 + }, + { + "epoch": 0.55, + "grad_norm": 0.3410518765449524, + "learning_rate": 9.573379668423209e-06, + "loss": 1.3072420358657837, + "step": 308 + }, + { + "epoch": 0.5535714285714286, + "grad_norm": 0.47393250465393066, + "learning_rate": 9.56581900950225e-06, + "loss": 1.4017832279205322, + "step": 310 + }, + { + "epoch": 0.5571428571428572, + "grad_norm": 0.6089979410171509, + "learning_rate": 9.558195335943566e-06, + "loss": 1.5297354459762573, + "step": 312 + }, + { + "epoch": 0.5607142857142857, + "grad_norm": 0.2799089848995209, + "learning_rate": 9.550508765903672e-06, + "loss": 1.043546199798584, + "step": 314 + }, + { + "epoch": 0.5642857142857143, + "grad_norm": 0.2620464563369751, + "learning_rate": 9.542759418513906e-06, + "loss": 1.6538763046264648, + "step": 316 + }, + { + "epoch": 0.5678571428571428, + "grad_norm": 0.5144315958023071, + "learning_rate": 9.534947413878556e-06, + "loss": 1.5541188716888428, + "step": 318 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.29509297013282776, + "learning_rate": 9.52707287307302e-06, + "loss": 1.1971598863601685, + "step": 320 + }, + { + "epoch": 0.575, + "grad_norm": 0.1929909884929657, + "learning_rate": 9.519135918141913e-06, + "loss": 1.1823662519454956, + "step": 322 + }, + { + "epoch": 0.5785714285714286, + "grad_norm": 0.48544377088546753, + "learning_rate": 9.511136672097194e-06, + "loss": 1.3313523530960083, + "step": 324 + }, + { + "epoch": 0.5821428571428572, + "grad_norm": 0.3510501980781555, + "learning_rate": 9.503075258916241e-06, + "loss": 1.3195650577545166, + "step": 326 + }, + { + "epoch": 0.5857142857142857, + "grad_norm": 0.2727429270744324, + "learning_rate": 9.494951803539942e-06, + "loss": 1.2425987720489502, + "step": 328 + }, + { + "epoch": 0.5892857142857143, + "grad_norm": 15.424638748168945, + "learning_rate": 9.486766431870752e-06, + "loss": 1.2101187705993652, + "step": 330 + }, + { + "epoch": 0.5928571428571429, + "grad_norm": 0.2866066098213196, + "learning_rate": 9.478519270770746e-06, + "loss": 1.2784419059753418, + "step": 332 + }, + { + "epoch": 0.5964285714285714, + "grad_norm": 0.4156343638896942, + "learning_rate": 9.470210448059645e-06, + "loss": 1.583785057067871, + "step": 334 + }, + { + "epoch": 0.6, + "grad_norm": 0.45487159490585327, + "learning_rate": 9.46184009251285e-06, + "loss": 1.3652830123901367, + "step": 336 + }, + { + "epoch": 0.6035714285714285, + "grad_norm": 0.32525262236595154, + "learning_rate": 9.453408333859427e-06, + "loss": 1.4697949886322021, + "step": 338 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 0.5784197449684143, + "learning_rate": 9.444915302780117e-06, + "loss": 1.3824127912521362, + "step": 340 + }, + { + "epoch": 0.6107142857142858, + "grad_norm": 0.26421067118644714, + "learning_rate": 9.436361130905288e-06, + "loss": 1.42073655128479, + "step": 342 + }, + { + "epoch": 0.6142857142857143, + "grad_norm": 0.2380143105983734, + "learning_rate": 9.427745950812917e-06, + "loss": 1.3658424615859985, + "step": 344 + }, + { + "epoch": 0.6178571428571429, + "grad_norm": 0.43499693274497986, + "learning_rate": 9.41906989602652e-06, + "loss": 1.461742639541626, + "step": 346 + }, + { + "epoch": 0.6214285714285714, + "grad_norm": 0.400419145822525, + "learning_rate": 9.410333101013086e-06, + "loss": 1.3119421005249023, + "step": 348 + }, + { + "epoch": 0.625, + "grad_norm": 0.4901754856109619, + "learning_rate": 9.401535701180998e-06, + "loss": 1.2844195365905762, + "step": 350 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.48951858282089233, + "learning_rate": 9.392677832877932e-06, + "loss": 1.568238615989685, + "step": 352 + }, + { + "epoch": 0.6321428571428571, + "grad_norm": 0.2112666368484497, + "learning_rate": 9.383759633388737e-06, + "loss": 1.5015143156051636, + "step": 354 + }, + { + "epoch": 0.6357142857142857, + "grad_norm": 0.2615770101547241, + "learning_rate": 9.374781240933316e-06, + "loss": 1.5211448669433594, + "step": 356 + }, + { + "epoch": 0.6392857142857142, + "grad_norm": 0.2031329870223999, + "learning_rate": 9.365742794664484e-06, + "loss": 1.3461060523986816, + "step": 358 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.36589643359184265, + "learning_rate": 9.356644434665804e-06, + "loss": 1.3849568367004395, + "step": 360 + }, + { + "epoch": 0.6464285714285715, + "grad_norm": 0.3387724757194519, + "learning_rate": 9.347486301949417e-06, + "loss": 1.558565616607666, + "step": 362 + }, + { + "epoch": 0.65, + "grad_norm": 0.29686594009399414, + "learning_rate": 9.33826853845387e-06, + "loss": 1.3272876739501953, + "step": 364 + }, + { + "epoch": 0.6535714285714286, + "grad_norm": 0.8538780808448792, + "learning_rate": 9.328991287041892e-06, + "loss": 1.3049917221069336, + "step": 366 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 0.2655990719795227, + "learning_rate": 9.319654691498205e-06, + "loss": 1.2668689489364624, + "step": 368 + }, + { + "epoch": 0.6607142857142857, + "grad_norm": 0.28246378898620605, + "learning_rate": 9.31025889652728e-06, + "loss": 1.4307278394699097, + "step": 370 + }, + { + "epoch": 0.6642857142857143, + "grad_norm": 0.33245643973350525, + "learning_rate": 9.300804047751093e-06, + "loss": 1.3824753761291504, + "step": 372 + }, + { + "epoch": 0.6678571428571428, + "grad_norm": 0.21639856696128845, + "learning_rate": 9.291290291706881e-06, + "loss": 1.3373095989227295, + "step": 374 + }, + { + "epoch": 0.6714285714285714, + "grad_norm": 0.42471569776535034, + "learning_rate": 9.281717775844857e-06, + "loss": 1.2794650793075562, + "step": 376 + }, + { + "epoch": 0.675, + "grad_norm": 0.5068492293357849, + "learning_rate": 9.272086648525937e-06, + "loss": 1.3947125673294067, + "step": 378 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 0.42456403374671936, + "learning_rate": 9.26239705901943e-06, + "loss": 1.4322527647018433, + "step": 380 + }, + { + "epoch": 0.6821428571428572, + "grad_norm": 0.7322901487350464, + "learning_rate": 9.25264915750073e-06, + "loss": 1.427004337310791, + "step": 382 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.2247505635023117, + "learning_rate": 9.242843095048987e-06, + "loss": 1.2980873584747314, + "step": 384 + }, + { + "epoch": 0.6892857142857143, + "grad_norm": 0.28208163380622864, + "learning_rate": 9.232979023644768e-06, + "loss": 1.466817855834961, + "step": 386 + }, + { + "epoch": 0.6928571428571428, + "grad_norm": 0.4910048544406891, + "learning_rate": 9.223057096167696e-06, + "loss": 1.4608205556869507, + "step": 388 + }, + { + "epoch": 0.6964285714285714, + "grad_norm": 0.5288735032081604, + "learning_rate": 9.213077466394088e-06, + "loss": 1.3513166904449463, + "step": 390 + }, + { + "epoch": 0.7, + "grad_norm": 0.304855078458786, + "learning_rate": 9.203040288994566e-06, + "loss": 1.464281678199768, + "step": 392 + }, + { + "epoch": 0.7035714285714286, + "grad_norm": 0.28660398721694946, + "learning_rate": 9.192945719531662e-06, + "loss": 1.3084968328475952, + "step": 394 + }, + { + "epoch": 0.7071428571428572, + "grad_norm": 0.5636733770370483, + "learning_rate": 9.182793914457402e-06, + "loss": 1.2844712734222412, + "step": 396 + }, + { + "epoch": 0.7107142857142857, + "grad_norm": 0.5751602649688721, + "learning_rate": 9.172585031110895e-06, + "loss": 1.5046448707580566, + "step": 398 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.34700506925582886, + "learning_rate": 9.162319227715877e-06, + "loss": 1.3449612855911255, + "step": 400 + }, + { + "epoch": 0.7178571428571429, + "grad_norm": 0.34046903252601624, + "learning_rate": 9.151996663378271e-06, + "loss": 1.3594465255737305, + "step": 402 + }, + { + "epoch": 0.7214285714285714, + "grad_norm": 0.48511913418769836, + "learning_rate": 9.141617498083717e-06, + "loss": 1.5169265270233154, + "step": 404 + }, + { + "epoch": 0.725, + "grad_norm": 0.26317858695983887, + "learning_rate": 9.131181892695089e-06, + "loss": 1.4639661312103271, + "step": 406 + }, + { + "epoch": 0.7285714285714285, + "grad_norm": 0.4234665632247925, + "learning_rate": 9.120690008950008e-06, + "loss": 1.4238711595535278, + "step": 408 + }, + { + "epoch": 0.7321428571428571, + "grad_norm": 0.683773934841156, + "learning_rate": 9.110142009458333e-06, + "loss": 1.2991688251495361, + "step": 410 + }, + { + "epoch": 0.7357142857142858, + "grad_norm": 0.9090404510498047, + "learning_rate": 9.099538057699643e-06, + "loss": 1.4411964416503906, + "step": 412 + }, + { + "epoch": 0.7392857142857143, + "grad_norm": 0.42502301931381226, + "learning_rate": 9.08887831802069e-06, + "loss": 1.3963665962219238, + "step": 414 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.25458428263664246, + "learning_rate": 9.078162955632878e-06, + "loss": 1.3666608333587646, + "step": 416 + }, + { + "epoch": 0.7464285714285714, + "grad_norm": 0.2778110206127167, + "learning_rate": 9.067392136609672e-06, + "loss": 1.4295861721038818, + "step": 418 + }, + { + "epoch": 0.75, + "grad_norm": 0.3574320673942566, + "learning_rate": 9.056566027884051e-06, + "loss": 1.4124993085861206, + "step": 420 + }, + { + "epoch": 0.7535714285714286, + "grad_norm": 0.2570479214191437, + "learning_rate": 9.045684797245902e-06, + "loss": 1.3560070991516113, + "step": 422 + }, + { + "epoch": 0.7571428571428571, + "grad_norm": 0.3374227285385132, + "learning_rate": 9.034748613339427e-06, + "loss": 1.360439658164978, + "step": 424 + }, + { + "epoch": 0.7607142857142857, + "grad_norm": 0.25365766882896423, + "learning_rate": 9.023757645660531e-06, + "loss": 1.3708235025405884, + "step": 426 + }, + { + "epoch": 0.7642857142857142, + "grad_norm": 0.2227737158536911, + "learning_rate": 9.01271206455419e-06, + "loss": 1.3818211555480957, + "step": 428 + }, + { + "epoch": 0.7678571428571429, + "grad_norm": 0.21550701558589935, + "learning_rate": 9.001612041211817e-06, + "loss": 1.3254315853118896, + "step": 430 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 0.5434844493865967, + "learning_rate": 8.9904577476686e-06, + "loss": 1.3340120315551758, + "step": 432 + }, + { + "epoch": 0.775, + "grad_norm": 0.2289412021636963, + "learning_rate": 8.979249356800846e-06, + "loss": 1.2770015001296997, + "step": 434 + }, + { + "epoch": 0.7785714285714286, + "grad_norm": 0.25175049901008606, + "learning_rate": 8.967987042323293e-06, + "loss": 1.3385746479034424, + "step": 436 + }, + { + "epoch": 0.7821428571428571, + "grad_norm": 0.27297094464302063, + "learning_rate": 8.956670978786423e-06, + "loss": 1.2522022724151611, + "step": 438 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.2057066559791565, + "learning_rate": 8.945301341573757e-06, + "loss": 1.3175703287124634, + "step": 440 + }, + { + "epoch": 0.7892857142857143, + "grad_norm": 0.16934043169021606, + "learning_rate": 8.93387830689913e-06, + "loss": 1.2785143852233887, + "step": 442 + }, + { + "epoch": 0.7928571428571428, + "grad_norm": 0.17673851549625397, + "learning_rate": 8.922402051803968e-06, + "loss": 1.311404824256897, + "step": 444 + }, + { + "epoch": 0.7964285714285714, + "grad_norm": 0.36772605776786804, + "learning_rate": 8.91087275415454e-06, + "loss": 1.27708101272583, + "step": 446 + }, + { + "epoch": 0.8, + "grad_norm": 0.1414009928703308, + "learning_rate": 8.8992905926392e-06, + "loss": 1.247365117073059, + "step": 448 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 0.16844603419303894, + "learning_rate": 8.887655746765625e-06, + "loss": 1.3339194059371948, + "step": 450 + }, + { + "epoch": 0.8071428571428572, + "grad_norm": 0.4043944180011749, + "learning_rate": 8.875968396858023e-06, + "loss": 1.3012686967849731, + "step": 452 + }, + { + "epoch": 0.8107142857142857, + "grad_norm": 0.19886070489883423, + "learning_rate": 8.864228724054342e-06, + "loss": 1.2051547765731812, + "step": 454 + }, + { + "epoch": 0.8142857142857143, + "grad_norm": 0.18143871426582336, + "learning_rate": 8.852436910303466e-06, + "loss": 1.264425277709961, + "step": 456 + }, + { + "epoch": 0.8178571428571428, + "grad_norm": 0.30469146370887756, + "learning_rate": 8.840593138362395e-06, + "loss": 1.2156575918197632, + "step": 458 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 0.19490455090999603, + "learning_rate": 8.828697591793405e-06, + "loss": 1.2579315900802612, + "step": 460 + }, + { + "epoch": 0.825, + "grad_norm": 0.22966210544109344, + "learning_rate": 8.816750454961206e-06, + "loss": 1.2265636920928955, + "step": 462 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.4836777448654175, + "learning_rate": 8.804751913030095e-06, + "loss": 1.2515498399734497, + "step": 464 + }, + { + "epoch": 0.8321428571428572, + "grad_norm": 0.22509177029132843, + "learning_rate": 8.792702151961074e-06, + "loss": 1.2572628259658813, + "step": 466 + }, + { + "epoch": 0.8357142857142857, + "grad_norm": 0.4269544184207916, + "learning_rate": 8.780601358508966e-06, + "loss": 1.2433445453643799, + "step": 468 + }, + { + "epoch": 0.8392857142857143, + "grad_norm": 0.19438913464546204, + "learning_rate": 8.768449720219533e-06, + "loss": 1.2479232549667358, + "step": 470 + }, + { + "epoch": 0.8428571428571429, + "grad_norm": 0.695250391960144, + "learning_rate": 8.75624742542656e-06, + "loss": 1.300042748451233, + "step": 472 + }, + { + "epoch": 0.8464285714285714, + "grad_norm": 0.35800135135650635, + "learning_rate": 8.743994663248939e-06, + "loss": 1.2871143817901611, + "step": 474 + }, + { + "epoch": 0.85, + "grad_norm": 0.20253418385982513, + "learning_rate": 8.73169162358774e-06, + "loss": 1.2776912450790405, + "step": 476 + }, + { + "epoch": 0.8535714285714285, + "grad_norm": 0.203902930021286, + "learning_rate": 8.719338497123258e-06, + "loss": 1.3039164543151855, + "step": 478 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.24306446313858032, + "learning_rate": 8.706935475312073e-06, + "loss": 1.30210542678833, + "step": 480 + }, + { + "epoch": 0.8607142857142858, + "grad_norm": 0.2822311520576477, + "learning_rate": 8.694482750384069e-06, + "loss": 1.2630928754806519, + "step": 482 + }, + { + "epoch": 0.8642857142857143, + "grad_norm": 0.2177450954914093, + "learning_rate": 8.681980515339464e-06, + "loss": 1.2841533422470093, + "step": 484 + }, + { + "epoch": 0.8678571428571429, + "grad_norm": 0.19454443454742432, + "learning_rate": 8.669428963945815e-06, + "loss": 1.2446175813674927, + "step": 486 + }, + { + "epoch": 0.8714285714285714, + "grad_norm": 0.161905437707901, + "learning_rate": 8.656828290735013e-06, + "loss": 1.2695343494415283, + "step": 488 + }, + { + "epoch": 0.875, + "grad_norm": 0.19021154940128326, + "learning_rate": 8.644178691000272e-06, + "loss": 1.2780508995056152, + "step": 490 + }, + { + "epoch": 0.8785714285714286, + "grad_norm": 0.3725239038467407, + "learning_rate": 8.631480360793095e-06, + "loss": 1.2979791164398193, + "step": 492 + }, + { + "epoch": 0.8821428571428571, + "grad_norm": 0.5264632701873779, + "learning_rate": 8.61873349692025e-06, + "loss": 1.2810431718826294, + "step": 494 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.26536062359809875, + "learning_rate": 8.605938296940702e-06, + "loss": 1.2166625261306763, + "step": 496 + }, + { + "epoch": 0.8892857142857142, + "grad_norm": 0.4096132516860962, + "learning_rate": 8.593094959162565e-06, + "loss": 1.2420190572738647, + "step": 498 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 0.4396449327468872, + "learning_rate": 8.58020368264002e-06, + "loss": 1.2754027843475342, + "step": 500 + }, + { + "epoch": 0.8964285714285715, + "grad_norm": 0.15545235574245453, + "learning_rate": 8.567264667170232e-06, + "loss": 1.3059731721878052, + "step": 502 + }, + { + "epoch": 0.9, + "grad_norm": 0.25121352076530457, + "learning_rate": 8.554278113290262e-06, + "loss": 1.2766114473342896, + "step": 504 + }, + { + "epoch": 0.9035714285714286, + "grad_norm": 0.21137557923793793, + "learning_rate": 8.541244222273942e-06, + "loss": 1.258975863456726, + "step": 506 + }, + { + "epoch": 0.9071428571428571, + "grad_norm": 0.16647249460220337, + "learning_rate": 8.528163196128767e-06, + "loss": 1.2222638130187988, + "step": 508 + }, + { + "epoch": 0.9107142857142857, + "grad_norm": 0.3039259910583496, + "learning_rate": 8.51503523759277e-06, + "loss": 1.257559895515442, + "step": 510 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.32180115580558777, + "learning_rate": 8.501860550131361e-06, + "loss": 1.280539631843567, + "step": 512 + }, + { + "epoch": 0.9178571428571428, + "grad_norm": 0.2822877764701843, + "learning_rate": 8.488639337934188e-06, + "loss": 1.225077509880066, + "step": 514 + }, + { + "epoch": 0.9214285714285714, + "grad_norm": 0.22444438934326172, + "learning_rate": 8.475371805911975e-06, + "loss": 1.259244441986084, + "step": 516 + }, + { + "epoch": 0.925, + "grad_norm": 0.17102967202663422, + "learning_rate": 8.462058159693332e-06, + "loss": 1.2512003183364868, + "step": 518 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.9442085027694702, + "learning_rate": 8.44869860562158e-06, + "loss": 1.2956591844558716, + "step": 520 + }, + { + "epoch": 0.9321428571428572, + "grad_norm": 0.31264039874076843, + "learning_rate": 8.435293350751545e-06, + "loss": 1.3134222030639648, + "step": 522 + }, + { + "epoch": 0.9357142857142857, + "grad_norm": 0.20593850314617157, + "learning_rate": 8.421842602846362e-06, + "loss": 1.269896149635315, + "step": 524 + }, + { + "epoch": 0.9392857142857143, + "grad_norm": 0.24257254600524902, + "learning_rate": 8.408346570374234e-06, + "loss": 1.2887259721755981, + "step": 526 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.18374580144882202, + "learning_rate": 8.394805462505224e-06, + "loss": 1.2653754949569702, + "step": 528 + }, + { + "epoch": 0.9464285714285714, + "grad_norm": 0.7440497875213623, + "learning_rate": 8.381219489107992e-06, + "loss": 1.2136163711547852, + "step": 530 + }, + { + "epoch": 0.95, + "grad_norm": 0.3250195384025574, + "learning_rate": 8.36758886074656e-06, + "loss": 1.233951449394226, + "step": 532 + }, + { + "epoch": 0.9535714285714286, + "grad_norm": 0.2864832878112793, + "learning_rate": 8.353913788677036e-06, + "loss": 1.2546851634979248, + "step": 534 + }, + { + "epoch": 0.9571428571428572, + "grad_norm": 0.22155587375164032, + "learning_rate": 8.34019448484435e-06, + "loss": 1.2355575561523438, + "step": 536 + }, + { + "epoch": 0.9607142857142857, + "grad_norm": 0.19411461055278778, + "learning_rate": 8.326431161878957e-06, + "loss": 1.2437915802001953, + "step": 538 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 0.26431798934936523, + "learning_rate": 8.312624033093555e-06, + "loss": 1.2899754047393799, + "step": 540 + }, + { + "epoch": 0.9678571428571429, + "grad_norm": 0.3181489109992981, + "learning_rate": 8.298773312479767e-06, + "loss": 1.2769360542297363, + "step": 542 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.2669861912727356, + "learning_rate": 8.284879214704834e-06, + "loss": 1.2913857698440552, + "step": 544 + }, + { + "epoch": 0.975, + "grad_norm": 0.2932322919368744, + "learning_rate": 8.270941955108281e-06, + "loss": 1.2430675029754639, + "step": 546 + }, + { + "epoch": 0.9785714285714285, + "grad_norm": 0.3006272614002228, + "learning_rate": 8.256961749698583e-06, + "loss": 1.2453312873840332, + "step": 548 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 0.2196272611618042, + "learning_rate": 8.242938815149817e-06, + "loss": 1.2648967504501343, + "step": 550 + }, + { + "epoch": 0.9857142857142858, + "grad_norm": 0.2562142014503479, + "learning_rate": 8.228873368798304e-06, + "loss": 1.3159946203231812, + "step": 552 + }, + { + "epoch": 0.9892857142857143, + "grad_norm": 0.26237812638282776, + "learning_rate": 8.214765628639235e-06, + "loss": 1.3476945161819458, + "step": 554 + }, + { + "epoch": 0.9928571428571429, + "grad_norm": 0.38732582330703735, + "learning_rate": 8.200615813323306e-06, + "loss": 1.9057130813598633, + "step": 556 + }, + { + "epoch": 0.9964285714285714, + "grad_norm": 0.33351263403892517, + "learning_rate": 8.18642414215331e-06, + "loss": 1.8800382614135742, + "step": 558 + }, + { + "epoch": 1.0, + "grad_norm": 0.6058505773544312, + "learning_rate": 8.172190835080757e-06, + "loss": 1.8019236326217651, + "step": 560 + }, + { + "epoch": 1.0035714285714286, + "grad_norm": 0.31470683217048645, + "learning_rate": 8.157916112702452e-06, + "loss": 1.384263277053833, + "step": 562 + }, + { + "epoch": 1.0071428571428571, + "grad_norm": 0.310624897480011, + "learning_rate": 8.143600196257086e-06, + "loss": 1.3995013236999512, + "step": 564 + }, + { + "epoch": 1.0107142857142857, + "grad_norm": 0.20878104865550995, + "learning_rate": 8.129243307621791e-06, + "loss": 1.3525418043136597, + "step": 566 + }, + { + "epoch": 1.0142857142857142, + "grad_norm": 0.2683800160884857, + "learning_rate": 8.114845669308723e-06, + "loss": 1.3207361698150635, + "step": 568 + }, + { + "epoch": 1.0178571428571428, + "grad_norm": 0.27859288454055786, + "learning_rate": 8.100407504461595e-06, + "loss": 1.3501830101013184, + "step": 570 + }, + { + "epoch": 1.0214285714285714, + "grad_norm": 0.32225877046585083, + "learning_rate": 8.085929036852236e-06, + "loss": 1.1840941905975342, + "step": 572 + }, + { + "epoch": 1.025, + "grad_norm": 0.23283155262470245, + "learning_rate": 8.071410490877097e-06, + "loss": 1.2650562524795532, + "step": 574 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.1705978810787201, + "learning_rate": 8.0568520915538e-06, + "loss": 1.2940489053726196, + "step": 576 + }, + { + "epoch": 1.032142857142857, + "grad_norm": 0.23754863440990448, + "learning_rate": 8.042254064517642e-06, + "loss": 1.3267643451690674, + "step": 578 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 0.46769577264785767, + "learning_rate": 8.027616636018085e-06, + "loss": 1.2288154363632202, + "step": 580 + }, + { + "epoch": 1.0392857142857144, + "grad_norm": 0.233358234167099, + "learning_rate": 8.012940032915263e-06, + "loss": 1.3615669012069702, + "step": 582 + }, + { + "epoch": 1.042857142857143, + "grad_norm": 0.2691819369792938, + "learning_rate": 7.998224482676473e-06, + "loss": 1.3021140098571777, + "step": 584 + }, + { + "epoch": 1.0464285714285715, + "grad_norm": 0.24730414152145386, + "learning_rate": 7.983470213372624e-06, + "loss": 1.2602746486663818, + "step": 586 + }, + { + "epoch": 1.05, + "grad_norm": 0.2731882929801941, + "learning_rate": 7.96867745367473e-06, + "loss": 1.2430776357650757, + "step": 588 + }, + { + "epoch": 1.0535714285714286, + "grad_norm": 0.22160141170024872, + "learning_rate": 7.953846432850346e-06, + "loss": 1.2589969635009766, + "step": 590 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 0.2917991280555725, + "learning_rate": 7.938977380760024e-06, + "loss": 1.408372402191162, + "step": 592 + }, + { + "epoch": 1.0607142857142857, + "grad_norm": 0.23420438170433044, + "learning_rate": 7.92407052785375e-06, + "loss": 1.3381731510162354, + "step": 594 + }, + { + "epoch": 1.0642857142857143, + "grad_norm": 0.19835133850574493, + "learning_rate": 7.909126105167373e-06, + "loss": 1.3641246557235718, + "step": 596 + }, + { + "epoch": 1.0678571428571428, + "grad_norm": 0.21805885434150696, + "learning_rate": 7.894144344319015e-06, + "loss": 1.2766021490097046, + "step": 598 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.3379668593406677, + "learning_rate": 7.879125477505495e-06, + "loss": 1.2909208536148071, + "step": 600 + }, + { + "epoch": 1.075, + "grad_norm": 0.3864686191082001, + "learning_rate": 7.864069737498722e-06, + "loss": 1.259904146194458, + "step": 602 + }, + { + "epoch": 1.0785714285714285, + "grad_norm": 0.3104611933231354, + "learning_rate": 7.848977357642089e-06, + "loss": 1.3227314949035645, + "step": 604 + }, + { + "epoch": 1.082142857142857, + "grad_norm": 0.244283065199852, + "learning_rate": 7.833848571846855e-06, + "loss": 1.3027191162109375, + "step": 606 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 0.19385835528373718, + "learning_rate": 7.818683614588523e-06, + "loss": 1.0396664142608643, + "step": 608 + }, + { + "epoch": 1.0892857142857142, + "grad_norm": 0.2750968933105469, + "learning_rate": 7.803482720903206e-06, + "loss": 1.1102863550186157, + "step": 610 + }, + { + "epoch": 1.092857142857143, + "grad_norm": 0.3333893418312073, + "learning_rate": 7.788246126383977e-06, + "loss": 1.1634554862976074, + "step": 612 + }, + { + "epoch": 1.0964285714285715, + "grad_norm": 0.28989356756210327, + "learning_rate": 7.77297406717723e-06, + "loss": 1.3986788988113403, + "step": 614 + }, + { + "epoch": 1.1, + "grad_norm": 0.27835774421691895, + "learning_rate": 7.757666779979008e-06, + "loss": 1.2263062000274658, + "step": 616 + }, + { + "epoch": 1.1035714285714286, + "grad_norm": 0.2572242021560669, + "learning_rate": 7.74232450203134e-06, + "loss": 1.2180155515670776, + "step": 618 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 0.3894072473049164, + "learning_rate": 7.72694747111857e-06, + "loss": 1.478975534439087, + "step": 620 + }, + { + "epoch": 1.1107142857142858, + "grad_norm": 0.4212060868740082, + "learning_rate": 7.711535925563655e-06, + "loss": 1.3129830360412598, + "step": 622 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.23659296333789825, + "learning_rate": 7.696090104224492e-06, + "loss": 1.229081392288208, + "step": 624 + }, + { + "epoch": 1.1178571428571429, + "grad_norm": 0.254404217004776, + "learning_rate": 7.680610246490199e-06, + "loss": 1.2878901958465576, + "step": 626 + }, + { + "epoch": 1.1214285714285714, + "grad_norm": 0.3570263981819153, + "learning_rate": 7.665096592277415e-06, + "loss": 1.218833088874817, + "step": 628 + }, + { + "epoch": 1.125, + "grad_norm": 0.27803489565849304, + "learning_rate": 7.649549382026575e-06, + "loss": 1.274793028831482, + "step": 630 + }, + { + "epoch": 1.1285714285714286, + "grad_norm": 0.2562004327774048, + "learning_rate": 7.633968856698192e-06, + "loss": 1.3318731784820557, + "step": 632 + }, + { + "epoch": 1.1321428571428571, + "grad_norm": 0.19307534396648407, + "learning_rate": 7.618355257769111e-06, + "loss": 1.2363682985305786, + "step": 634 + }, + { + "epoch": 1.1357142857142857, + "grad_norm": 0.5484210848808289, + "learning_rate": 7.602708827228779e-06, + "loss": 1.259455680847168, + "step": 636 + }, + { + "epoch": 1.1392857142857142, + "grad_norm": 0.2351217418909073, + "learning_rate": 7.587029807575482e-06, + "loss": 1.2625541687011719, + "step": 638 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.5461699962615967, + "learning_rate": 7.571318441812599e-06, + "loss": 1.1984379291534424, + "step": 640 + }, + { + "epoch": 1.1464285714285714, + "grad_norm": 0.30940407514572144, + "learning_rate": 7.55557497344482e-06, + "loss": 1.3161015510559082, + "step": 642 + }, + { + "epoch": 1.15, + "grad_norm": 0.32747605443000793, + "learning_rate": 7.539799646474393e-06, + "loss": 1.234968900680542, + "step": 644 + }, + { + "epoch": 1.1535714285714285, + "grad_norm": 0.2250605821609497, + "learning_rate": 7.523992705397321e-06, + "loss": 1.3490346670150757, + "step": 646 + }, + { + "epoch": 1.157142857142857, + "grad_norm": 0.3528631925582886, + "learning_rate": 7.508154395199592e-06, + "loss": 1.350324034690857, + "step": 648 + }, + { + "epoch": 1.1607142857142858, + "grad_norm": 0.247028186917305, + "learning_rate": 7.492284961353361e-06, + "loss": 1.285825252532959, + "step": 650 + }, + { + "epoch": 1.1642857142857144, + "grad_norm": 0.26968345046043396, + "learning_rate": 7.4763846498131675e-06, + "loss": 1.123679518699646, + "step": 652 + }, + { + "epoch": 1.167857142857143, + "grad_norm": 0.23967714607715607, + "learning_rate": 7.460453707012107e-06, + "loss": 1.2702839374542236, + "step": 654 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 0.9412787556648254, + "learning_rate": 7.444492379858021e-06, + "loss": 1.3307619094848633, + "step": 656 + }, + { + "epoch": 1.175, + "grad_norm": 0.60057133436203, + "learning_rate": 7.428500915729663e-06, + "loss": 1.218625783920288, + "step": 658 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 0.2611408829689026, + "learning_rate": 7.412479562472873e-06, + "loss": 1.1818389892578125, + "step": 660 + }, + { + "epoch": 1.1821428571428572, + "grad_norm": 0.21901297569274902, + "learning_rate": 7.3964285683967285e-06, + "loss": 1.2105083465576172, + "step": 662 + }, + { + "epoch": 1.1857142857142857, + "grad_norm": 0.9242513179779053, + "learning_rate": 7.380348182269701e-06, + "loss": 1.2359505891799927, + "step": 664 + }, + { + "epoch": 1.1892857142857143, + "grad_norm": 0.24152880907058716, + "learning_rate": 7.364238653315795e-06, + "loss": 1.268753170967102, + "step": 666 + }, + { + "epoch": 1.1928571428571428, + "grad_norm": 2.834768533706665, + "learning_rate": 7.348100231210697e-06, + "loss": 1.2450233697891235, + "step": 668 + }, + { + "epoch": 1.1964285714285714, + "grad_norm": 0.7332023978233337, + "learning_rate": 7.331933166077886e-06, + "loss": 1.2236673831939697, + "step": 670 + }, + { + "epoch": 1.2, + "grad_norm": 0.3339300751686096, + "learning_rate": 7.31573770848478e-06, + "loss": 1.1605288982391357, + "step": 672 + }, + { + "epoch": 1.2035714285714285, + "grad_norm": 0.7548586130142212, + "learning_rate": 7.299514109438835e-06, + "loss": 1.276812195777893, + "step": 674 + }, + { + "epoch": 1.207142857142857, + "grad_norm": 0.32066163420677185, + "learning_rate": 7.283262620383664e-06, + "loss": 1.2277733087539673, + "step": 676 + }, + { + "epoch": 1.2107142857142856, + "grad_norm": 0.3439161777496338, + "learning_rate": 7.266983493195133e-06, + "loss": 1.443245768547058, + "step": 678 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.28881630301475525, + "learning_rate": 7.250676980177468e-06, + "loss": 1.3642569780349731, + "step": 680 + }, + { + "epoch": 1.217857142857143, + "grad_norm": 0.3376900553703308, + "learning_rate": 7.2343433340593315e-06, + "loss": 1.1232848167419434, + "step": 682 + }, + { + "epoch": 1.2214285714285715, + "grad_norm": 0.5144054293632507, + "learning_rate": 7.217982807989915e-06, + "loss": 1.2558438777923584, + "step": 684 + }, + { + "epoch": 1.225, + "grad_norm": 0.25952062010765076, + "learning_rate": 7.201595655535011e-06, + "loss": 1.3395494222640991, + "step": 686 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.3723627030849457, + "learning_rate": 7.1851821306730876e-06, + "loss": 0.9402600526809692, + "step": 688 + }, + { + "epoch": 1.2321428571428572, + "grad_norm": 0.3420025110244751, + "learning_rate": 7.168742487791345e-06, + "loss": 0.7468339204788208, + "step": 690 + }, + { + "epoch": 1.2357142857142858, + "grad_norm": 0.34970328211784363, + "learning_rate": 7.152276981681781e-06, + "loss": 1.0327891111373901, + "step": 692 + }, + { + "epoch": 1.2392857142857143, + "grad_norm": 0.3740408420562744, + "learning_rate": 7.135785867537235e-06, + "loss": 1.267980694770813, + "step": 694 + }, + { + "epoch": 1.2428571428571429, + "grad_norm": 0.5309215188026428, + "learning_rate": 7.119269400947437e-06, + "loss": 1.4097453355789185, + "step": 696 + }, + { + "epoch": 1.2464285714285714, + "grad_norm": 0.3004949390888214, + "learning_rate": 7.1027278378950486e-06, + "loss": 1.2045501470565796, + "step": 698 + }, + { + "epoch": 1.25, + "grad_norm": 0.6161743402481079, + "learning_rate": 7.086161434751684e-06, + "loss": 1.0838185548782349, + "step": 700 + }, + { + "epoch": 1.2535714285714286, + "grad_norm": 0.22701780498027802, + "learning_rate": 7.069570448273951e-06, + "loss": 1.1616631746292114, + "step": 702 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.3208640515804291, + "learning_rate": 7.0529551355994686e-06, + "loss": 1.2447824478149414, + "step": 704 + }, + { + "epoch": 1.2607142857142857, + "grad_norm": 0.7384056448936462, + "learning_rate": 7.03631575424287e-06, + "loss": 1.1175577640533447, + "step": 706 + }, + { + "epoch": 1.2642857142857142, + "grad_norm": 0.5497505068778992, + "learning_rate": 7.019652562091826e-06, + "loss": 1.141535758972168, + "step": 708 + }, + { + "epoch": 1.2678571428571428, + "grad_norm": 0.3330208361148834, + "learning_rate": 7.0029658174030425e-06, + "loss": 1.3164706230163574, + "step": 710 + }, + { + "epoch": 1.2714285714285714, + "grad_norm": 0.4105195701122284, + "learning_rate": 6.986255778798253e-06, + "loss": 1.234831690788269, + "step": 712 + }, + { + "epoch": 1.275, + "grad_norm": 0.28338423371315, + "learning_rate": 6.9695227052602174e-06, + "loss": 1.1415457725524902, + "step": 714 + }, + { + "epoch": 1.2785714285714285, + "grad_norm": 0.3706303536891937, + "learning_rate": 6.952766856128709e-06, + "loss": 1.199047565460205, + "step": 716 + }, + { + "epoch": 1.282142857142857, + "grad_norm": 0.3346574902534485, + "learning_rate": 6.9359884910964856e-06, + "loss": 1.4197050333023071, + "step": 718 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.3120553195476532, + "learning_rate": 6.919187870205275e-06, + "loss": 1.5487772226333618, + "step": 720 + }, + { + "epoch": 1.2892857142857144, + "grad_norm": 0.2753259837627411, + "learning_rate": 6.902365253841737e-06, + "loss": 1.177211880683899, + "step": 722 + }, + { + "epoch": 1.292857142857143, + "grad_norm": 0.2185521274805069, + "learning_rate": 6.885520902733435e-06, + "loss": 1.2806293964385986, + "step": 724 + }, + { + "epoch": 1.2964285714285715, + "grad_norm": 0.14865590631961823, + "learning_rate": 6.868655077944788e-06, + "loss": 0.9303812980651855, + "step": 726 + }, + { + "epoch": 1.3, + "grad_norm": 0.39503300189971924, + "learning_rate": 6.85176804087303e-06, + "loss": 1.5363171100616455, + "step": 728 + }, + { + "epoch": 1.3035714285714286, + "grad_norm": 0.510991632938385, + "learning_rate": 6.834860053244154e-06, + "loss": 1.1531927585601807, + "step": 730 + }, + { + "epoch": 1.3071428571428572, + "grad_norm": 0.28777721524238586, + "learning_rate": 6.8179313771088626e-06, + "loss": 1.2121974229812622, + "step": 732 + }, + { + "epoch": 1.3107142857142857, + "grad_norm": 0.30707836151123047, + "learning_rate": 6.800982274838495e-06, + "loss": 1.4065004587173462, + "step": 734 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.23764309287071228, + "learning_rate": 6.784013009120975e-06, + "loss": 1.4308959245681763, + "step": 736 + }, + { + "epoch": 1.3178571428571428, + "grad_norm": 0.6906368136405945, + "learning_rate": 6.767023842956725e-06, + "loss": 1.1925731897354126, + "step": 738 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 0.4775388538837433, + "learning_rate": 6.750015039654603e-06, + "loss": 1.6403999328613281, + "step": 740 + }, + { + "epoch": 1.325, + "grad_norm": 0.2565818727016449, + "learning_rate": 6.732986862827813e-06, + "loss": 1.0603913068771362, + "step": 742 + }, + { + "epoch": 1.3285714285714285, + "grad_norm": 0.47122514247894287, + "learning_rate": 6.7159395763898214e-06, + "loss": 1.3830267190933228, + "step": 744 + }, + { + "epoch": 1.332142857142857, + "grad_norm": 0.5306914448738098, + "learning_rate": 6.698873444550271e-06, + "loss": 1.2981680631637573, + "step": 746 + }, + { + "epoch": 1.3357142857142856, + "grad_norm": 0.408100426197052, + "learning_rate": 6.68178873181088e-06, + "loss": 1.2487084865570068, + "step": 748 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 0.33308205008506775, + "learning_rate": 6.664685702961344e-06, + "loss": 0.9980481266975403, + "step": 750 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.20474325120449066, + "learning_rate": 6.647564623075236e-06, + "loss": 0.9687408804893494, + "step": 752 + }, + { + "epoch": 1.3464285714285715, + "grad_norm": 0.8245405554771423, + "learning_rate": 6.630425757505894e-06, + "loss": 1.33769953250885, + "step": 754 + }, + { + "epoch": 1.35, + "grad_norm": 0.2982644736766815, + "learning_rate": 6.613269371882308e-06, + "loss": 1.3833491802215576, + "step": 756 + }, + { + "epoch": 1.3535714285714286, + "grad_norm": 0.45085495710372925, + "learning_rate": 6.596095732105011e-06, + "loss": 1.2755907773971558, + "step": 758 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.29945558309555054, + "learning_rate": 6.5789051043419435e-06, + "loss": 1.2956531047821045, + "step": 760 + }, + { + "epoch": 1.3607142857142858, + "grad_norm": 0.5544592142105103, + "learning_rate": 6.5616977550243435e-06, + "loss": 1.2718784809112549, + "step": 762 + }, + { + "epoch": 1.3642857142857143, + "grad_norm": 0.7638172507286072, + "learning_rate": 6.544473950842606e-06, + "loss": 1.126919150352478, + "step": 764 + }, + { + "epoch": 1.3678571428571429, + "grad_norm": 0.4192071557044983, + "learning_rate": 6.527233958742154e-06, + "loss": 1.4331161975860596, + "step": 766 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.2737813889980316, + "learning_rate": 6.509978045919307e-06, + "loss": 1.2379997968673706, + "step": 768 + }, + { + "epoch": 1.375, + "grad_norm": 0.7987821102142334, + "learning_rate": 6.492706479817125e-06, + "loss": 1.278856873512268, + "step": 770 + }, + { + "epoch": 1.3785714285714286, + "grad_norm": 0.30944374203681946, + "learning_rate": 6.475419528121279e-06, + "loss": 1.3922899961471558, + "step": 772 + }, + { + "epoch": 1.3821428571428571, + "grad_norm": 0.29533934593200684, + "learning_rate": 6.45811745875589e-06, + "loss": 1.235024094581604, + "step": 774 + }, + { + "epoch": 1.3857142857142857, + "grad_norm": 0.788487434387207, + "learning_rate": 6.440800539879392e-06, + "loss": 1.1024410724639893, + "step": 776 + }, + { + "epoch": 1.3892857142857142, + "grad_norm": 0.3519847095012665, + "learning_rate": 6.423469039880355e-06, + "loss": 1.233741283416748, + "step": 778 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 0.18675316870212555, + "learning_rate": 6.406123227373343e-06, + "loss": 1.3022193908691406, + "step": 780 + }, + { + "epoch": 1.3964285714285714, + "grad_norm": 0.263254314661026, + "learning_rate": 6.388763371194741e-06, + "loss": 1.2517147064208984, + "step": 782 + }, + { + "epoch": 1.4, + "grad_norm": 0.35091346502304077, + "learning_rate": 6.371389740398597e-06, + "loss": 1.1601366996765137, + "step": 784 + }, + { + "epoch": 1.4035714285714285, + "grad_norm": 0.34103208780288696, + "learning_rate": 6.35400260425244e-06, + "loss": 1.3991872072219849, + "step": 786 + }, + { + "epoch": 1.407142857142857, + "grad_norm": 1.0600661039352417, + "learning_rate": 6.336602232233116e-06, + "loss": 1.4128477573394775, + "step": 788 + }, + { + "epoch": 1.4107142857142856, + "grad_norm": 0.6274294257164001, + "learning_rate": 6.319188894022612e-06, + "loss": 1.5149511098861694, + "step": 790 + }, + { + "epoch": 1.4142857142857144, + "grad_norm": 0.25083670020103455, + "learning_rate": 6.301762859503869e-06, + "loss": 1.3468106985092163, + "step": 792 + }, + { + "epoch": 1.417857142857143, + "grad_norm": 0.4435229003429413, + "learning_rate": 6.284324398756606e-06, + "loss": 1.3005448579788208, + "step": 794 + }, + { + "epoch": 1.4214285714285715, + "grad_norm": 0.5059611201286316, + "learning_rate": 6.266873782053131e-06, + "loss": 1.0667213201522827, + "step": 796 + }, + { + "epoch": 1.425, + "grad_norm": 0.2751584053039551, + "learning_rate": 6.249411279854152e-06, + "loss": 1.1674690246582031, + "step": 798 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.2168678641319275, + "learning_rate": 6.231937162804584e-06, + "loss": 1.0654405355453491, + "step": 800 + }, + { + "epoch": 1.4321428571428572, + "grad_norm": 0.6201224327087402, + "learning_rate": 6.214451701729363e-06, + "loss": 1.1552761793136597, + "step": 802 + }, + { + "epoch": 1.4357142857142857, + "grad_norm": 0.4682956635951996, + "learning_rate": 6.196955167629236e-06, + "loss": 1.3353182077407837, + "step": 804 + }, + { + "epoch": 1.4392857142857143, + "grad_norm": 0.3534834384918213, + "learning_rate": 6.179447831676566e-06, + "loss": 1.3080209493637085, + "step": 806 + }, + { + "epoch": 1.4428571428571428, + "grad_norm": 0.4813729226589203, + "learning_rate": 6.161929965211135e-06, + "loss": 1.3717149496078491, + "step": 808 + }, + { + "epoch": 1.4464285714285714, + "grad_norm": 0.26942121982574463, + "learning_rate": 6.144401839735931e-06, + "loss": 1.4133044481277466, + "step": 810 + }, + { + "epoch": 1.45, + "grad_norm": 0.30204319953918457, + "learning_rate": 6.12686372691294e-06, + "loss": 1.581753134727478, + "step": 812 + }, + { + "epoch": 1.4535714285714285, + "grad_norm": 1.1933614015579224, + "learning_rate": 6.109315898558943e-06, + "loss": 1.1946600675582886, + "step": 814 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.651054322719574, + "learning_rate": 6.091758626641296e-06, + "loss": 1.2849314212799072, + "step": 816 + }, + { + "epoch": 1.4607142857142856, + "grad_norm": 0.41265299916267395, + "learning_rate": 6.074192183273714e-06, + "loss": 1.2870151996612549, + "step": 818 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 0.2880115807056427, + "learning_rate": 6.056616840712065e-06, + "loss": 1.156186580657959, + "step": 820 + }, + { + "epoch": 1.467857142857143, + "grad_norm": 0.31380829215049744, + "learning_rate": 6.039032871350136e-06, + "loss": 1.3075363636016846, + "step": 822 + }, + { + "epoch": 1.4714285714285715, + "grad_norm": 0.735464334487915, + "learning_rate": 6.021440547715418e-06, + "loss": 1.2372568845748901, + "step": 824 + }, + { + "epoch": 1.475, + "grad_norm": 0.3404405117034912, + "learning_rate": 6.0038401424648866e-06, + "loss": 1.3656535148620605, + "step": 826 + }, + { + "epoch": 1.4785714285714286, + "grad_norm": 1.0161242485046387, + "learning_rate": 5.986231928380764e-06, + "loss": 1.4575047492980957, + "step": 828 + }, + { + "epoch": 1.4821428571428572, + "grad_norm": 0.32120001316070557, + "learning_rate": 5.968616178366304e-06, + "loss": 1.1328424215316772, + "step": 830 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.24318258464336395, + "learning_rate": 5.95099316544156e-06, + "loss": 1.1171592473983765, + "step": 832 + }, + { + "epoch": 1.4892857142857143, + "grad_norm": 0.2471759170293808, + "learning_rate": 5.9333631627391385e-06, + "loss": 1.1361713409423828, + "step": 834 + }, + { + "epoch": 1.4928571428571429, + "grad_norm": 0.31643709540367126, + "learning_rate": 5.915726443499992e-06, + "loss": 1.4550275802612305, + "step": 836 + }, + { + "epoch": 1.4964285714285714, + "grad_norm": 0.2178327739238739, + "learning_rate": 5.89808328106916e-06, + "loss": 1.1423126459121704, + "step": 838 + }, + { + "epoch": 1.5, + "grad_norm": 0.38720911741256714, + "learning_rate": 5.880433948891548e-06, + "loss": 1.0535848140716553, + "step": 840 + }, + { + "epoch": 1.5035714285714286, + "grad_norm": 0.4030672013759613, + "learning_rate": 5.862778720507684e-06, + "loss": 1.3946490287780762, + "step": 842 + }, + { + "epoch": 1.5071428571428571, + "grad_norm": 0.35578665137290955, + "learning_rate": 5.845117869549477e-06, + "loss": 1.5173096656799316, + "step": 844 + }, + { + "epoch": 1.5107142857142857, + "grad_norm": 0.3867500126361847, + "learning_rate": 5.827451669735977e-06, + "loss": 1.352368712425232, + "step": 846 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.9219626188278198, + "learning_rate": 5.80978039486914e-06, + "loss": 0.9382961988449097, + "step": 848 + }, + { + "epoch": 1.5178571428571428, + "grad_norm": 0.21979399025440216, + "learning_rate": 5.79210431882957e-06, + "loss": 0.8432712554931641, + "step": 850 + }, + { + "epoch": 1.5214285714285714, + "grad_norm": 0.28859761357307434, + "learning_rate": 5.774423715572289e-06, + "loss": 1.296618938446045, + "step": 852 + }, + { + "epoch": 1.525, + "grad_norm": 0.4942507743835449, + "learning_rate": 5.756738859122483e-06, + "loss": 1.0648285150527954, + "step": 854 + }, + { + "epoch": 1.5285714285714285, + "grad_norm": 0.5750854015350342, + "learning_rate": 5.739050023571258e-06, + "loss": 1.0088112354278564, + "step": 856 + }, + { + "epoch": 1.532142857142857, + "grad_norm": 0.20957696437835693, + "learning_rate": 5.721357483071386e-06, + "loss": 1.0590897798538208, + "step": 858 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 0.8381152153015137, + "learning_rate": 5.703661511833064e-06, + "loss": 1.3163901567459106, + "step": 860 + }, + { + "epoch": 1.5392857142857141, + "grad_norm": 0.4364100992679596, + "learning_rate": 5.68596238411966e-06, + "loss": 1.1863445043563843, + "step": 862 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.3657117784023285, + "learning_rate": 5.668260374243467e-06, + "loss": 0.98140949010849, + "step": 864 + }, + { + "epoch": 1.5464285714285713, + "grad_norm": 0.6113946437835693, + "learning_rate": 5.650555756561439e-06, + "loss": 1.3584340810775757, + "step": 866 + }, + { + "epoch": 1.55, + "grad_norm": 0.7465829849243164, + "learning_rate": 5.6328488054709575e-06, + "loss": 1.149134874343872, + "step": 868 + }, + { + "epoch": 1.5535714285714286, + "grad_norm": 0.9023903608322144, + "learning_rate": 5.615139795405559e-06, + "loss": 1.2276476621627808, + "step": 870 + }, + { + "epoch": 1.5571428571428572, + "grad_norm": 0.5961250066757202, + "learning_rate": 5.5974290008307e-06, + "loss": 1.3803772926330566, + "step": 872 + }, + { + "epoch": 1.5607142857142857, + "grad_norm": 0.31303706765174866, + "learning_rate": 5.579716696239486e-06, + "loss": 0.8974480628967285, + "step": 874 + }, + { + "epoch": 1.5642857142857143, + "grad_norm": 0.49465271830558777, + "learning_rate": 5.562003156148434e-06, + "loss": 1.500373125076294, + "step": 876 + }, + { + "epoch": 1.5678571428571428, + "grad_norm": 0.4547047019004822, + "learning_rate": 5.544288655093203e-06, + "loss": 1.3437693119049072, + "step": 878 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.2680365741252899, + "learning_rate": 5.526573467624351e-06, + "loss": 1.0480762720108032, + "step": 880 + }, + { + "epoch": 1.575, + "grad_norm": 0.2553335130214691, + "learning_rate": 5.508857868303068e-06, + "loss": 1.078729271888733, + "step": 882 + }, + { + "epoch": 1.5785714285714287, + "grad_norm": 0.2632956802845001, + "learning_rate": 5.491142131696934e-06, + "loss": 1.16781485080719, + "step": 884 + }, + { + "epoch": 1.5821428571428573, + "grad_norm": 0.42439237236976624, + "learning_rate": 5.473426532375651e-06, + "loss": 1.0907145738601685, + "step": 886 + }, + { + "epoch": 1.5857142857142859, + "grad_norm": 0.4016067087650299, + "learning_rate": 5.455711344906797e-06, + "loss": 1.0479315519332886, + "step": 888 + }, + { + "epoch": 1.5892857142857144, + "grad_norm": 0.787295401096344, + "learning_rate": 5.437996843851567e-06, + "loss": 1.1056879758834839, + "step": 890 + }, + { + "epoch": 1.592857142857143, + "grad_norm": 0.24893441796302795, + "learning_rate": 5.420283303760515e-06, + "loss": 1.086808443069458, + "step": 892 + }, + { + "epoch": 1.5964285714285715, + "grad_norm": 1.0016993284225464, + "learning_rate": 5.402570999169303e-06, + "loss": 1.4259756803512573, + "step": 894 + }, + { + "epoch": 1.6, + "grad_norm": 0.5658416748046875, + "learning_rate": 5.384860204594442e-06, + "loss": 1.175308346748352, + "step": 896 + }, + { + "epoch": 1.6035714285714286, + "grad_norm": 0.32960644364356995, + "learning_rate": 5.367151194529045e-06, + "loss": 1.3044936656951904, + "step": 898 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 1.566615343093872, + "learning_rate": 5.349444243438563e-06, + "loss": 1.1787108182907104, + "step": 900 + }, + { + "epoch": 1.6107142857142858, + "grad_norm": 0.3008659780025482, + "learning_rate": 5.331739625756535e-06, + "loss": 1.2578707933425903, + "step": 902 + }, + { + "epoch": 1.6142857142857143, + "grad_norm": 0.3048568367958069, + "learning_rate": 5.314037615880341e-06, + "loss": 1.214415192604065, + "step": 904 + }, + { + "epoch": 1.6178571428571429, + "grad_norm": 0.30796509981155396, + "learning_rate": 5.296338488166939e-06, + "loss": 1.2612226009368896, + "step": 906 + }, + { + "epoch": 1.6214285714285714, + "grad_norm": 0.3856910467147827, + "learning_rate": 5.278642516928617e-06, + "loss": 1.1769757270812988, + "step": 908 + }, + { + "epoch": 1.625, + "grad_norm": 0.4512476921081543, + "learning_rate": 5.260949976428745e-06, + "loss": 1.058244228363037, + "step": 910 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 0.5113015174865723, + "learning_rate": 5.243261140877517e-06, + "loss": 1.3994414806365967, + "step": 912 + }, + { + "epoch": 1.6321428571428571, + "grad_norm": 0.24723981320858002, + "learning_rate": 5.225576284427712e-06, + "loss": 1.29803466796875, + "step": 914 + }, + { + "epoch": 1.6357142857142857, + "grad_norm": 0.2900439202785492, + "learning_rate": 5.207895681170432e-06, + "loss": 1.341897964477539, + "step": 916 + }, + { + "epoch": 1.6392857142857142, + "grad_norm": 0.2555374205112457, + "learning_rate": 5.190219605130863e-06, + "loss": 1.1864595413208008, + "step": 918 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.31760746240615845, + "learning_rate": 5.172548330264023e-06, + "loss": 1.2025091648101807, + "step": 920 + }, + { + "epoch": 1.6464285714285714, + "grad_norm": 0.28426891565322876, + "learning_rate": 5.154882130450525e-06, + "loss": 1.3937333822250366, + "step": 922 + }, + { + "epoch": 1.65, + "grad_norm": 0.26754945516586304, + "learning_rate": 5.137221279492317e-06, + "loss": 1.1592669486999512, + "step": 924 + }, + { + "epoch": 1.6535714285714285, + "grad_norm": 0.398725301027298, + "learning_rate": 5.119566051108453e-06, + "loss": 1.041808009147644, + "step": 926 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 0.24082130193710327, + "learning_rate": 5.10191671893084e-06, + "loss": 1.1113499402999878, + "step": 928 + }, + { + "epoch": 1.6607142857142856, + "grad_norm": 0.32985880970954895, + "learning_rate": 5.08427355650001e-06, + "loss": 1.243566632270813, + "step": 930 + }, + { + "epoch": 1.6642857142857141, + "grad_norm": 0.22729991376399994, + "learning_rate": 5.066636837260863e-06, + "loss": 1.218003511428833, + "step": 932 + }, + { + "epoch": 1.6678571428571427, + "grad_norm": 0.20701321959495544, + "learning_rate": 5.049006834558443e-06, + "loss": 1.1665146350860596, + "step": 934 + }, + { + "epoch": 1.6714285714285713, + "grad_norm": 0.7482126355171204, + "learning_rate": 5.031383821633695e-06, + "loss": 1.0261443853378296, + "step": 936 + }, + { + "epoch": 1.675, + "grad_norm": 0.6510646939277649, + "learning_rate": 5.013768071619237e-06, + "loss": 1.1913405656814575, + "step": 938 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 0.3893536329269409, + "learning_rate": 4.996159857535116e-06, + "loss": 1.2498658895492554, + "step": 940 + }, + { + "epoch": 1.6821428571428572, + "grad_norm": 0.33820265531539917, + "learning_rate": 4.9785594522845835e-06, + "loss": 1.2645461559295654, + "step": 942 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.2857305407524109, + "learning_rate": 4.9609671286498655e-06, + "loss": 1.1648997068405151, + "step": 944 + }, + { + "epoch": 1.6892857142857143, + "grad_norm": 0.35911425948143005, + "learning_rate": 4.943383159287936e-06, + "loss": 1.3200312852859497, + "step": 946 + }, + { + "epoch": 1.6928571428571428, + "grad_norm": 0.28003281354904175, + "learning_rate": 4.925807816726288e-06, + "loss": 1.2886927127838135, + "step": 948 + }, + { + "epoch": 1.6964285714285714, + "grad_norm": 0.3707423210144043, + "learning_rate": 4.908241373358707e-06, + "loss": 1.2256838083267212, + "step": 950 + }, + { + "epoch": 1.7, + "grad_norm": 0.37781476974487305, + "learning_rate": 4.890684101441059e-06, + "loss": 1.261880874633789, + "step": 952 + }, + { + "epoch": 1.7035714285714287, + "grad_norm": 0.455138623714447, + "learning_rate": 4.873136273087061e-06, + "loss": 1.1675777435302734, + "step": 954 + }, + { + "epoch": 1.7071428571428573, + "grad_norm": 0.3084830641746521, + "learning_rate": 4.855598160264071e-06, + "loss": 1.0751243829727173, + "step": 956 + }, + { + "epoch": 1.7107142857142859, + "grad_norm": 0.33484798669815063, + "learning_rate": 4.838070034788865e-06, + "loss": 1.2969300746917725, + "step": 958 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.45519745349884033, + "learning_rate": 4.820552168323434e-06, + "loss": 1.1682568788528442, + "step": 960 + }, + { + "epoch": 1.717857142857143, + "grad_norm": 0.3936917185783386, + "learning_rate": 4.803044832370765e-06, + "loss": 1.2029849290847778, + "step": 962 + }, + { + "epoch": 1.7214285714285715, + "grad_norm": 0.2847800850868225, + "learning_rate": 4.7855482982706396e-06, + "loss": 1.308813452720642, + "step": 964 + }, + { + "epoch": 1.725, + "grad_norm": 0.2914465069770813, + "learning_rate": 4.768062837195417e-06, + "loss": 1.2900055646896362, + "step": 966 + }, + { + "epoch": 1.7285714285714286, + "grad_norm": 0.5518858432769775, + "learning_rate": 4.7505887201458485e-06, + "loss": 1.2404606342315674, + "step": 968 + }, + { + "epoch": 1.7321428571428572, + "grad_norm": 0.34736767411231995, + "learning_rate": 4.73312621794687e-06, + "loss": 1.1192835569381714, + "step": 970 + }, + { + "epoch": 1.7357142857142858, + "grad_norm": 0.27989068627357483, + "learning_rate": 4.715675601243396e-06, + "loss": 1.2646175622940063, + "step": 972 + }, + { + "epoch": 1.7392857142857143, + "grad_norm": 0.2832848131656647, + "learning_rate": 4.698237140496132e-06, + "loss": 1.2004600763320923, + "step": 974 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 1.7877376079559326, + "learning_rate": 4.68081110597739e-06, + "loss": 1.2224751710891724, + "step": 976 + }, + { + "epoch": 1.7464285714285714, + "grad_norm": 0.2644546627998352, + "learning_rate": 4.663397767766885e-06, + "loss": 1.2846026420593262, + "step": 978 + }, + { + "epoch": 1.75, + "grad_norm": 0.23440435528755188, + "learning_rate": 4.6459973957475625e-06, + "loss": 1.2761108875274658, + "step": 980 + }, + { + "epoch": 1.7535714285714286, + "grad_norm": 0.29541414976119995, + "learning_rate": 4.628610259601406e-06, + "loss": 1.2253004312515259, + "step": 982 + }, + { + "epoch": 1.7571428571428571, + "grad_norm": 0.3721539378166199, + "learning_rate": 4.611236628805259e-06, + "loss": 1.217316746711731, + "step": 984 + }, + { + "epoch": 1.7607142857142857, + "grad_norm": 0.23486927151679993, + "learning_rate": 4.593876772626659e-06, + "loss": 1.238864779472351, + "step": 986 + }, + { + "epoch": 1.7642857142857142, + "grad_norm": 0.35403114557266235, + "learning_rate": 4.576530960119646e-06, + "loss": 1.2506440877914429, + "step": 988 + }, + { + "epoch": 1.7678571428571428, + "grad_norm": 0.24216312170028687, + "learning_rate": 4.55919946012061e-06, + "loss": 1.203848123550415, + "step": 990 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.5742025971412659, + "learning_rate": 4.54188254124411e-06, + "loss": 1.2036422491073608, + "step": 992 + }, + { + "epoch": 1.775, + "grad_norm": 0.4332943260669708, + "learning_rate": 4.524580471878724e-06, + "loss": 1.1484333276748657, + "step": 994 + }, + { + "epoch": 1.7785714285714285, + "grad_norm": 0.2262076586484909, + "learning_rate": 4.507293520182877e-06, + "loss": 1.2005127668380737, + "step": 996 + }, + { + "epoch": 1.782142857142857, + "grad_norm": 0.18153786659240723, + "learning_rate": 4.490021954080695e-06, + "loss": 1.1209759712219238, + "step": 998 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.2752821147441864, + "learning_rate": 4.472766041257846e-06, + "loss": 1.1912975311279297, + "step": 1000 + }, + { + "epoch": 1.7892857142857141, + "grad_norm": 0.37398797273635864, + "learning_rate": 4.4555260491573956e-06, + "loss": 1.1634106636047363, + "step": 1002 + }, + { + "epoch": 1.7928571428571427, + "grad_norm": 0.4885188341140747, + "learning_rate": 4.438302244975659e-06, + "loss": 1.19752037525177, + "step": 1004 + }, + { + "epoch": 1.7964285714285713, + "grad_norm": 0.20963414013385773, + "learning_rate": 4.421094895658058e-06, + "loss": 1.1573578119277954, + "step": 1006 + }, + { + "epoch": 1.8, + "grad_norm": 0.19768458604812622, + "learning_rate": 4.403904267894991e-06, + "loss": 1.1309683322906494, + "step": 1008 + }, + { + "epoch": 1.8035714285714286, + "grad_norm": 0.3265831470489502, + "learning_rate": 4.386730628117692e-06, + "loss": 1.210740566253662, + "step": 1010 + }, + { + "epoch": 1.8071428571428572, + "grad_norm": 0.24831008911132812, + "learning_rate": 4.369574242494108e-06, + "loss": 1.1857199668884277, + "step": 1012 + }, + { + "epoch": 1.8107142857142857, + "grad_norm": 0.24806837737560272, + "learning_rate": 4.3524353769247665e-06, + "loss": 1.0957400798797607, + "step": 1014 + }, + { + "epoch": 1.8142857142857143, + "grad_norm": 0.21978451311588287, + "learning_rate": 4.335314297038656e-06, + "loss": 1.1512374877929688, + "step": 1016 + }, + { + "epoch": 1.8178571428571428, + "grad_norm": 0.18174096941947937, + "learning_rate": 4.318211268189121e-06, + "loss": 1.1074084043502808, + "step": 1018 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 0.1807389110326767, + "learning_rate": 4.3011265554497305e-06, + "loss": 1.1385325193405151, + "step": 1020 + }, + { + "epoch": 1.825, + "grad_norm": 0.3229348063468933, + "learning_rate": 4.28406042361018e-06, + "loss": 1.119950771331787, + "step": 1022 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.21613694727420807, + "learning_rate": 4.267013137172189e-06, + "loss": 1.1364243030548096, + "step": 1024 + }, + { + "epoch": 1.8321428571428573, + "grad_norm": 0.5674333572387695, + "learning_rate": 4.249984960345399e-06, + "loss": 1.1446290016174316, + "step": 1026 + }, + { + "epoch": 1.8357142857142859, + "grad_norm": 0.19522684812545776, + "learning_rate": 4.232976157043277e-06, + "loss": 1.1350977420806885, + "step": 1028 + }, + { + "epoch": 1.8392857142857144, + "grad_norm": 0.22652848064899445, + "learning_rate": 4.2159869908790275e-06, + "loss": 1.1374115943908691, + "step": 1030 + }, + { + "epoch": 1.842857142857143, + "grad_norm": 0.20917841792106628, + "learning_rate": 4.199017725161505e-06, + "loss": 1.1824545860290527, + "step": 1032 + }, + { + "epoch": 1.8464285714285715, + "grad_norm": 0.2631721496582031, + "learning_rate": 4.182068622891139e-06, + "loss": 1.1770212650299072, + "step": 1034 + }, + { + "epoch": 1.85, + "grad_norm": 0.24983558058738708, + "learning_rate": 4.165139946755847e-06, + "loss": 1.161262035369873, + "step": 1036 + }, + { + "epoch": 1.8535714285714286, + "grad_norm": 0.31537604331970215, + "learning_rate": 4.148231959126973e-06, + "loss": 1.1958869695663452, + "step": 1038 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3142789900302887, + "learning_rate": 4.131344922055213e-06, + "loss": 1.1789402961730957, + "step": 1040 + }, + { + "epoch": 1.8607142857142858, + "grad_norm": 0.42967483401298523, + "learning_rate": 4.114479097266567e-06, + "loss": 1.1411830186843872, + "step": 1042 + }, + { + "epoch": 1.8642857142857143, + "grad_norm": 0.21074344217777252, + "learning_rate": 4.0976347461582656e-06, + "loss": 1.17338228225708, + "step": 1044 + }, + { + "epoch": 1.8678571428571429, + "grad_norm": 0.33415719866752625, + "learning_rate": 4.080812129794728e-06, + "loss": 1.1420398950576782, + "step": 1046 + }, + { + "epoch": 1.8714285714285714, + "grad_norm": 0.16336952149868011, + "learning_rate": 4.064011508903516e-06, + "loss": 1.1628490686416626, + "step": 1048 + }, + { + "epoch": 1.875, + "grad_norm": 0.2252008020877838, + "learning_rate": 4.047233143871292e-06, + "loss": 1.173589825630188, + "step": 1050 + }, + { + "epoch": 1.8785714285714286, + "grad_norm": 0.33176442980766296, + "learning_rate": 4.030477294739783e-06, + "loss": 1.194374918937683, + "step": 1052 + }, + { + "epoch": 1.8821428571428571, + "grad_norm": 0.29097726941108704, + "learning_rate": 4.013744221201749e-06, + "loss": 1.1737301349639893, + "step": 1054 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.1832679808139801, + "learning_rate": 3.997034182596958e-06, + "loss": 1.110135793685913, + "step": 1056 + }, + { + "epoch": 1.8892857142857142, + "grad_norm": 0.2953426241874695, + "learning_rate": 3.980347437908175e-06, + "loss": 1.1428486108779907, + "step": 1058 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 0.20754416286945343, + "learning_rate": 3.963684245757132e-06, + "loss": 1.17241632938385, + "step": 1060 + }, + { + "epoch": 1.8964285714285714, + "grad_norm": 0.29985517263412476, + "learning_rate": 3.9470448644005345e-06, + "loss": 1.2037956714630127, + "step": 1062 + }, + { + "epoch": 1.9, + "grad_norm": 0.24180017411708832, + "learning_rate": 3.930429551726049e-06, + "loss": 1.1744909286499023, + "step": 1064 + }, + { + "epoch": 1.9035714285714285, + "grad_norm": 0.1725412905216217, + "learning_rate": 3.913838565248318e-06, + "loss": 1.1504842042922974, + "step": 1066 + }, + { + "epoch": 1.907142857142857, + "grad_norm": 0.19483552873134613, + "learning_rate": 3.8972721621049545e-06, + "loss": 1.1242973804473877, + "step": 1068 + }, + { + "epoch": 1.9107142857142856, + "grad_norm": 0.2150045484304428, + "learning_rate": 3.880730599052565e-06, + "loss": 1.1571553945541382, + "step": 1070 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.26055601239204407, + "learning_rate": 3.864214132462766e-06, + "loss": 1.1744543313980103, + "step": 1072 + }, + { + "epoch": 1.9178571428571427, + "grad_norm": 0.20224107801914215, + "learning_rate": 3.84772301831822e-06, + "loss": 1.129955768585205, + "step": 1074 + }, + { + "epoch": 1.9214285714285713, + "grad_norm": 0.21899673342704773, + "learning_rate": 3.831257512208657e-06, + "loss": 1.1564751863479614, + "step": 1076 + }, + { + "epoch": 1.925, + "grad_norm": 0.24604743719100952, + "learning_rate": 3.814817869326915e-06, + "loss": 1.1490484476089478, + "step": 1078 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.1920636147260666, + "learning_rate": 3.7984043444649898e-06, + "loss": 1.1944819688796997, + "step": 1080 + }, + { + "epoch": 1.9321428571428572, + "grad_norm": 0.2951393723487854, + "learning_rate": 3.782017192010087e-06, + "loss": 1.2130813598632812, + "step": 1082 + }, + { + "epoch": 1.9357142857142857, + "grad_norm": 0.38370734453201294, + "learning_rate": 3.76565666594067e-06, + "loss": 1.1711630821228027, + "step": 1084 + }, + { + "epoch": 1.9392857142857143, + "grad_norm": 0.7297260165214539, + "learning_rate": 3.749323019822534e-06, + "loss": 1.1901503801345825, + "step": 1086 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.22041039168834686, + "learning_rate": 3.7330165068048673e-06, + "loss": 1.1663475036621094, + "step": 1088 + }, + { + "epoch": 1.9464285714285714, + "grad_norm": 0.2529982626438141, + "learning_rate": 3.7167373796163377e-06, + "loss": 1.1222208738327026, + "step": 1090 + }, + { + "epoch": 1.95, + "grad_norm": 0.22839988768100739, + "learning_rate": 3.700485890561167e-06, + "loss": 1.1396700143814087, + "step": 1092 + }, + { + "epoch": 1.9535714285714287, + "grad_norm": 0.32207345962524414, + "learning_rate": 3.6842622915152228e-06, + "loss": 1.1646703481674194, + "step": 1094 + }, + { + "epoch": 1.9571428571428573, + "grad_norm": 0.2876273989677429, + "learning_rate": 3.668066833922116e-06, + "loss": 1.148516058921814, + "step": 1096 + }, + { + "epoch": 1.9607142857142859, + "grad_norm": 0.2196146845817566, + "learning_rate": 3.6518997687893053e-06, + "loss": 1.1533443927764893, + "step": 1098 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 0.46365395188331604, + "learning_rate": 3.635761346684206e-06, + "loss": 1.1947966814041138, + "step": 1100 + }, + { + "epoch": 1.967857142857143, + "grad_norm": 0.2954294681549072, + "learning_rate": 3.619651817730302e-06, + "loss": 1.1832884550094604, + "step": 1102 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 0.2565920650959015, + "learning_rate": 3.603571431603272e-06, + "loss": 1.1965795755386353, + "step": 1104 + }, + { + "epoch": 1.975, + "grad_norm": 0.2640427350997925, + "learning_rate": 3.587520437527128e-06, + "loss": 1.140123963356018, + "step": 1106 + }, + { + "epoch": 1.9785714285714286, + "grad_norm": 0.26683422923088074, + "learning_rate": 3.571499084270338e-06, + "loss": 1.1581156253814697, + "step": 1108 + }, + { + "epoch": 1.9821428571428572, + "grad_norm": 0.2290692776441574, + "learning_rate": 3.5555076201419816e-06, + "loss": 1.174959421157837, + "step": 1110 + }, + { + "epoch": 1.9857142857142858, + "grad_norm": 0.2061983048915863, + "learning_rate": 3.5395462929878945e-06, + "loss": 1.220007061958313, + "step": 1112 + }, + { + "epoch": 1.9892857142857143, + "grad_norm": 0.20125523209571838, + "learning_rate": 3.5236153501868343e-06, + "loss": 1.2462403774261475, + "step": 1114 + }, + { + "epoch": 1.9928571428571429, + "grad_norm": 0.29600805044174194, + "learning_rate": 3.5077150386466406e-06, + "loss": 1.2024950981140137, + "step": 1116 + }, + { + "epoch": 1.9964285714285714, + "grad_norm": 0.2931258976459503, + "learning_rate": 3.4918456048004106e-06, + "loss": 1.1237006187438965, + "step": 1118 + }, + { + "epoch": 2.0, + "grad_norm": 0.4734819829463959, + "learning_rate": 3.4760072946026786e-06, + "loss": 1.1085011959075928, + "step": 1120 + }, + { + "epoch": 2.0035714285714286, + "grad_norm": 0.20331430435180664, + "learning_rate": 3.46020035352561e-06, + "loss": 1.2824596166610718, + "step": 1122 + }, + { + "epoch": 2.007142857142857, + "grad_norm": 0.46622058749198914, + "learning_rate": 3.444425026555182e-06, + "loss": 1.2747101783752441, + "step": 1124 + }, + { + "epoch": 2.0107142857142857, + "grad_norm": 0.19980192184448242, + "learning_rate": 3.4286815581874045e-06, + "loss": 1.2517393827438354, + "step": 1126 + }, + { + "epoch": 2.0142857142857142, + "grad_norm": 0.32897406816482544, + "learning_rate": 3.4129701924245173e-06, + "loss": 1.2301400899887085, + "step": 1128 + }, + { + "epoch": 2.017857142857143, + "grad_norm": 0.17299680411815643, + "learning_rate": 3.397291172771221e-06, + "loss": 1.2544574737548828, + "step": 1130 + }, + { + "epoch": 2.0214285714285714, + "grad_norm": 0.2090325653553009, + "learning_rate": 3.3816447422308883e-06, + "loss": 1.0791321992874146, + "step": 1132 + }, + { + "epoch": 2.025, + "grad_norm": 0.2806832790374756, + "learning_rate": 3.366031143301811e-06, + "loss": 1.1756961345672607, + "step": 1134 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 0.4019312858581543, + "learning_rate": 3.3504506179734254e-06, + "loss": 1.1622370481491089, + "step": 1136 + }, + { + "epoch": 2.032142857142857, + "grad_norm": 0.22266216576099396, + "learning_rate": 3.334903407722587e-06, + "loss": 1.234253168106079, + "step": 1138 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 0.29923903942108154, + "learning_rate": 3.319389753509803e-06, + "loss": 1.1241004467010498, + "step": 1140 + }, + { + "epoch": 2.039285714285714, + "grad_norm": 0.3284701704978943, + "learning_rate": 3.30390989577551e-06, + "loss": 1.260522723197937, + "step": 1142 + }, + { + "epoch": 2.0428571428571427, + "grad_norm": 0.4323379099369049, + "learning_rate": 3.288464074436346e-06, + "loss": 1.1753382682800293, + "step": 1144 + }, + { + "epoch": 2.0464285714285713, + "grad_norm": 0.3188895285129547, + "learning_rate": 3.273052528881433e-06, + "loss": 1.1759196519851685, + "step": 1146 + }, + { + "epoch": 2.05, + "grad_norm": 0.754629373550415, + "learning_rate": 3.257675497968661e-06, + "loss": 1.0839532613754272, + "step": 1148 + }, + { + "epoch": 2.0535714285714284, + "grad_norm": 0.261398047208786, + "learning_rate": 3.2423332200209946e-06, + "loss": 1.1668034791946411, + "step": 1150 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.3192571699619293, + "learning_rate": 3.2270259328227703e-06, + "loss": 1.312312364578247, + "step": 1152 + }, + { + "epoch": 2.0607142857142855, + "grad_norm": 0.3842572271823883, + "learning_rate": 3.2117538736160235e-06, + "loss": 1.241450548171997, + "step": 1154 + }, + { + "epoch": 2.064285714285714, + "grad_norm": 0.3109821677207947, + "learning_rate": 3.1965172790967967e-06, + "loss": 1.2660008668899536, + "step": 1156 + }, + { + "epoch": 2.067857142857143, + "grad_norm": 0.30365416407585144, + "learning_rate": 3.1813163854114793e-06, + "loss": 1.1892515420913696, + "step": 1158 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.26805219054222107, + "learning_rate": 3.1661514281531464e-06, + "loss": 1.2073129415512085, + "step": 1160 + }, + { + "epoch": 2.075, + "grad_norm": 0.26900723576545715, + "learning_rate": 3.1510226423579127e-06, + "loss": 1.1416363716125488, + "step": 1162 + }, + { + "epoch": 2.0785714285714287, + "grad_norm": 0.3996395468711853, + "learning_rate": 3.135930262501279e-06, + "loss": 1.2287384271621704, + "step": 1164 + }, + { + "epoch": 2.0821428571428573, + "grad_norm": 0.3018134832382202, + "learning_rate": 3.120874522494506e-06, + "loss": 1.2006416320800781, + "step": 1166 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.16339807212352753, + "learning_rate": 3.105855655680986e-06, + "loss": 0.9185248017311096, + "step": 1168 + }, + { + "epoch": 2.0892857142857144, + "grad_norm": 0.3090437352657318, + "learning_rate": 3.090873894832628e-06, + "loss": 0.9894356727600098, + "step": 1170 + }, + { + "epoch": 2.092857142857143, + "grad_norm": 0.30770227313041687, + "learning_rate": 3.07592947214625e-06, + "loss": 1.0587633848190308, + "step": 1172 + }, + { + "epoch": 2.0964285714285715, + "grad_norm": 0.32658347487449646, + "learning_rate": 3.0610226192399767e-06, + "loss": 1.2783530950546265, + "step": 1174 + }, + { + "epoch": 2.1, + "grad_norm": 0.3846922218799591, + "learning_rate": 3.0461535671496537e-06, + "loss": 1.0930966138839722, + "step": 1176 + }, + { + "epoch": 2.1035714285714286, + "grad_norm": 0.44550713896751404, + "learning_rate": 3.0313225463252716e-06, + "loss": 1.0916811227798462, + "step": 1178 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 0.9442609548568726, + "learning_rate": 3.0165297866273766e-06, + "loss": 1.2753980159759521, + "step": 1180 + }, + { + "epoch": 2.1107142857142858, + "grad_norm": 0.2832079529762268, + "learning_rate": 3.0017755173235295e-06, + "loss": 1.195408821105957, + "step": 1182 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.27624693512916565, + "learning_rate": 2.9870599670847366e-06, + "loss": 1.137044072151184, + "step": 1184 + }, + { + "epoch": 2.117857142857143, + "grad_norm": 0.5313391089439392, + "learning_rate": 2.972383363981917e-06, + "loss": 1.1940035820007324, + "step": 1186 + }, + { + "epoch": 2.1214285714285714, + "grad_norm": 1.0065633058547974, + "learning_rate": 2.9577459354823602e-06, + "loss": 1.1326301097869873, + "step": 1188 + }, + { + "epoch": 2.125, + "grad_norm": 0.19776014983654022, + "learning_rate": 2.9431479084462013e-06, + "loss": 1.18599534034729, + "step": 1190 + }, + { + "epoch": 2.1285714285714286, + "grad_norm": 0.2414723038673401, + "learning_rate": 2.9285895091229042e-06, + "loss": 1.2466977834701538, + "step": 1192 + }, + { + "epoch": 2.132142857142857, + "grad_norm": 0.2931707799434662, + "learning_rate": 2.9140709631477666e-06, + "loss": 1.155306339263916, + "step": 1194 + }, + { + "epoch": 2.1357142857142857, + "grad_norm": 0.26033467054367065, + "learning_rate": 2.8995924955384048e-06, + "loss": 1.1785553693771362, + "step": 1196 + }, + { + "epoch": 2.1392857142857142, + "grad_norm": 0.24594391882419586, + "learning_rate": 2.885154330691278e-06, + "loss": 1.1734336614608765, + "step": 1198 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.5041958093643188, + "learning_rate": 2.8707566923782105e-06, + "loss": 1.0410226583480835, + "step": 1200 + }, + { + "epoch": 2.1464285714285714, + "grad_norm": 0.25237134099006653, + "learning_rate": 2.856399803742916e-06, + "loss": 1.2042694091796875, + "step": 1202 + }, + { + "epoch": 2.15, + "grad_norm": 0.4853833019733429, + "learning_rate": 2.8420838872975482e-06, + "loss": 1.150026559829712, + "step": 1204 + }, + { + "epoch": 2.1535714285714285, + "grad_norm": 0.3172329366207123, + "learning_rate": 2.8278091649192443e-06, + "loss": 1.2379705905914307, + "step": 1206 + }, + { + "epoch": 2.157142857142857, + "grad_norm": 0.24962536990642548, + "learning_rate": 2.81357585784669e-06, + "loss": 1.2625255584716797, + "step": 1208 + }, + { + "epoch": 2.1607142857142856, + "grad_norm": 0.5905876755714417, + "learning_rate": 2.799384186676696e-06, + "loss": 1.1990773677825928, + "step": 1210 + }, + { + "epoch": 2.164285714285714, + "grad_norm": 0.2595714032649994, + "learning_rate": 2.785234371360766e-06, + "loss": 1.0102604627609253, + "step": 1212 + }, + { + "epoch": 2.1678571428571427, + "grad_norm": 0.2449759989976883, + "learning_rate": 2.7711266312016986e-06, + "loss": 1.1595333814620972, + "step": 1214 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.38237428665161133, + "learning_rate": 2.757061184850183e-06, + "loss": 1.2344083786010742, + "step": 1216 + }, + { + "epoch": 2.175, + "grad_norm": 0.24876584112644196, + "learning_rate": 2.743038250301418e-06, + "loss": 1.124006748199463, + "step": 1218 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 0.34139466285705566, + "learning_rate": 2.7290580448917204e-06, + "loss": 1.090733528137207, + "step": 1220 + }, + { + "epoch": 2.182142857142857, + "grad_norm": 0.22050592303276062, + "learning_rate": 2.7151207852951677e-06, + "loss": 1.1178282499313354, + "step": 1222 + }, + { + "epoch": 2.185714285714286, + "grad_norm": 0.26262110471725464, + "learning_rate": 2.701226687520235e-06, + "loss": 1.1468334197998047, + "step": 1224 + }, + { + "epoch": 2.189285714285714, + "grad_norm": 0.2389093041419983, + "learning_rate": 2.6873759669064474e-06, + "loss": 1.1655080318450928, + "step": 1226 + }, + { + "epoch": 2.192857142857143, + "grad_norm": 0.22899575531482697, + "learning_rate": 2.673568838121045e-06, + "loss": 1.169728398323059, + "step": 1228 + }, + { + "epoch": 2.1964285714285716, + "grad_norm": 0.7747792601585388, + "learning_rate": 2.659805515155653e-06, + "loss": 1.0896999835968018, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 0.35865241289138794, + "learning_rate": 2.6460862113229656e-06, + "loss": 1.0157350301742554, + "step": 1232 + }, + { + "epoch": 2.2035714285714287, + "grad_norm": 0.9577608108520508, + "learning_rate": 2.6324111392534423e-06, + "loss": 1.1235113143920898, + "step": 1234 + }, + { + "epoch": 2.2071428571428573, + "grad_norm": 0.3065534234046936, + "learning_rate": 2.6187805108920104e-06, + "loss": 1.071955680847168, + "step": 1236 + }, + { + "epoch": 2.210714285714286, + "grad_norm": 0.33233603835105896, + "learning_rate": 2.605194537494779e-06, + "loss": 1.3001371622085571, + "step": 1238 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.8232606649398804, + "learning_rate": 2.5916534296257655e-06, + "loss": 1.2073559761047363, + "step": 1240 + }, + { + "epoch": 2.217857142857143, + "grad_norm": 0.3004189431667328, + "learning_rate": 2.5781573971536387e-06, + "loss": 0.9778292179107666, + "step": 1242 + }, + { + "epoch": 2.2214285714285715, + "grad_norm": 0.5353025794029236, + "learning_rate": 2.5647066492484564e-06, + "loss": 1.106062889099121, + "step": 1244 + }, + { + "epoch": 2.225, + "grad_norm": 0.2562118172645569, + "learning_rate": 2.5513013943784236e-06, + "loss": 1.187153935432434, + "step": 1246 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 0.3913024067878723, + "learning_rate": 2.537941840306669e-06, + "loss": 0.8193651437759399, + "step": 1248 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 0.29852673411369324, + "learning_rate": 2.524628194088027e-06, + "loss": 0.5965661406517029, + "step": 1250 + }, + { + "epoch": 2.2357142857142858, + "grad_norm": 0.2190428078174591, + "learning_rate": 2.511360662065813e-06, + "loss": 0.9129496812820435, + "step": 1252 + }, + { + "epoch": 2.2392857142857143, + "grad_norm": 0.3540997803211212, + "learning_rate": 2.4981394498686413e-06, + "loss": 1.138474702835083, + "step": 1254 + }, + { + "epoch": 2.242857142857143, + "grad_norm": 0.9036802053451538, + "learning_rate": 2.484964762407232e-06, + "loss": 1.2528407573699951, + "step": 1256 + }, + { + "epoch": 2.2464285714285714, + "grad_norm": 0.4152211844921112, + "learning_rate": 2.471836803871233e-06, + "loss": 1.105533480644226, + "step": 1258 + }, + { + "epoch": 2.25, + "grad_norm": 0.48458918929100037, + "learning_rate": 2.45875577772606e-06, + "loss": 0.9600842595100403, + "step": 1260 + }, + { + "epoch": 2.2535714285714286, + "grad_norm": 0.3086172044277191, + "learning_rate": 2.4457218867097396e-06, + "loss": 1.0594391822814941, + "step": 1262 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.24558311700820923, + "learning_rate": 2.4327353328297673e-06, + "loss": 1.1570055484771729, + "step": 1264 + }, + { + "epoch": 2.2607142857142857, + "grad_norm": 1.6706045866012573, + "learning_rate": 2.419796317359983e-06, + "loss": 0.9727555513381958, + "step": 1266 + }, + { + "epoch": 2.2642857142857142, + "grad_norm": 0.37175774574279785, + "learning_rate": 2.4069050408374376e-06, + "loss": 1.0557781457901, + "step": 1268 + }, + { + "epoch": 2.267857142857143, + "grad_norm": 0.2886607050895691, + "learning_rate": 2.3940617030593e-06, + "loss": 1.1356130838394165, + "step": 1270 + }, + { + "epoch": 2.2714285714285714, + "grad_norm": 0.2709295451641083, + "learning_rate": 2.3812665030797512e-06, + "loss": 1.0775344371795654, + "step": 1272 + }, + { + "epoch": 2.275, + "grad_norm": 0.17263904213905334, + "learning_rate": 2.368519639206905e-06, + "loss": 0.9881319999694824, + "step": 1274 + }, + { + "epoch": 2.2785714285714285, + "grad_norm": 0.3276418149471283, + "learning_rate": 2.3558213089997303e-06, + "loss": 1.1184488534927368, + "step": 1276 + }, + { + "epoch": 2.282142857142857, + "grad_norm": 0.9172634482383728, + "learning_rate": 2.3431717092649892e-06, + "loss": 1.3341600894927979, + "step": 1278 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.5521453022956848, + "learning_rate": 2.3305710360541857e-06, + "loss": 1.4648536443710327, + "step": 1280 + }, + { + "epoch": 2.289285714285714, + "grad_norm": 0.27971673011779785, + "learning_rate": 2.3180194846605367e-06, + "loss": 1.0912892818450928, + "step": 1282 + }, + { + "epoch": 2.2928571428571427, + "grad_norm": 0.38737377524375916, + "learning_rate": 2.3055172496159327e-06, + "loss": 1.1721148490905762, + "step": 1284 + }, + { + "epoch": 2.2964285714285713, + "grad_norm": 0.32838499546051025, + "learning_rate": 2.2930645246879286e-06, + "loss": 0.8287088871002197, + "step": 1286 + }, + { + "epoch": 2.3, + "grad_norm": 0.9804138541221619, + "learning_rate": 2.2806615028767447e-06, + "loss": 1.3678312301635742, + "step": 1288 + }, + { + "epoch": 2.3035714285714284, + "grad_norm": 0.22584359347820282, + "learning_rate": 2.2683083764122626e-06, + "loss": 1.076238989830017, + "step": 1290 + }, + { + "epoch": 2.307142857142857, + "grad_norm": 0.3474865257740021, + "learning_rate": 2.2560053367510624e-06, + "loss": 1.0969926118850708, + "step": 1292 + }, + { + "epoch": 2.310714285714286, + "grad_norm": 0.5705395340919495, + "learning_rate": 2.24375257457344e-06, + "loss": 1.2809841632843018, + "step": 1294 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 0.4103868007659912, + "learning_rate": 2.2315502797804677e-06, + "loss": 1.329990029335022, + "step": 1296 + }, + { + "epoch": 2.317857142857143, + "grad_norm": 0.4318333566188812, + "learning_rate": 2.2193986414910347e-06, + "loss": 0.9738024473190308, + "step": 1298 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 0.6811454892158508, + "learning_rate": 2.2072978480389286e-06, + "loss": 1.3944941759109497, + "step": 1300 + }, + { + "epoch": 2.325, + "grad_norm": 0.2388792783021927, + "learning_rate": 2.195248086969904e-06, + "loss": 0.9329886436462402, + "step": 1302 + }, + { + "epoch": 2.3285714285714287, + "grad_norm": 0.316201388835907, + "learning_rate": 2.1832495450387934e-06, + "loss": 1.300463080406189, + "step": 1304 + }, + { + "epoch": 2.3321428571428573, + "grad_norm": 1.2377450466156006, + "learning_rate": 2.1713024082065965e-06, + "loss": 1.1311689615249634, + "step": 1306 + }, + { + "epoch": 2.335714285714286, + "grad_norm": 0.2693905234336853, + "learning_rate": 2.1594068616376056e-06, + "loss": 1.1664714813232422, + "step": 1308 + }, + { + "epoch": 2.3392857142857144, + "grad_norm": 0.2321355789899826, + "learning_rate": 2.1475630896965336e-06, + "loss": 0.9228266477584839, + "step": 1310 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.2707984149456024, + "learning_rate": 2.1357712759456594e-06, + "loss": 0.8861098289489746, + "step": 1312 + }, + { + "epoch": 2.3464285714285715, + "grad_norm": 0.41806021332740784, + "learning_rate": 2.1240316031419795e-06, + "loss": 1.2544275522232056, + "step": 1314 + }, + { + "epoch": 2.35, + "grad_norm": 0.232350692152977, + "learning_rate": 2.112344253234377e-06, + "loss": 1.2989314794540405, + "step": 1316 + }, + { + "epoch": 2.3535714285714286, + "grad_norm": 0.32811442017555237, + "learning_rate": 2.1007094073607996e-06, + "loss": 1.0640029907226562, + "step": 1318 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.4184323847293854, + "learning_rate": 2.0891272458454614e-06, + "loss": 1.1912882328033447, + "step": 1320 + }, + { + "epoch": 2.3607142857142858, + "grad_norm": 0.4829447865486145, + "learning_rate": 2.0775979481960343e-06, + "loss": 1.1298654079437256, + "step": 1322 + }, + { + "epoch": 2.3642857142857143, + "grad_norm": 0.24751894176006317, + "learning_rate": 2.0661216931008717e-06, + "loss": 1.0057674646377563, + "step": 1324 + }, + { + "epoch": 2.367857142857143, + "grad_norm": 0.3193625807762146, + "learning_rate": 2.054698658426244e-06, + "loss": 1.3430179357528687, + "step": 1326 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 1.9639981985092163, + "learning_rate": 2.043329021213577e-06, + "loss": 1.1388099193572998, + "step": 1328 + }, + { + "epoch": 2.375, + "grad_norm": 0.4315277636051178, + "learning_rate": 2.0320129576767083e-06, + "loss": 1.1900275945663452, + "step": 1330 + }, + { + "epoch": 2.3785714285714286, + "grad_norm": 0.34250909090042114, + "learning_rate": 2.0207506431991556e-06, + "loss": 1.29435396194458, + "step": 1332 + }, + { + "epoch": 2.382142857142857, + "grad_norm": 0.6402963399887085, + "learning_rate": 2.0095422523314016e-06, + "loss": 1.1449788808822632, + "step": 1334 + }, + { + "epoch": 2.3857142857142857, + "grad_norm": 0.24777400493621826, + "learning_rate": 1.998387958788185e-06, + "loss": 0.9889009594917297, + "step": 1336 + }, + { + "epoch": 2.3892857142857142, + "grad_norm": 0.27742165327072144, + "learning_rate": 1.987287935445811e-06, + "loss": 1.13013756275177, + "step": 1338 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 0.6482072472572327, + "learning_rate": 1.976242354339471e-06, + "loss": 1.2214878797531128, + "step": 1340 + }, + { + "epoch": 2.3964285714285714, + "grad_norm": 0.41687601804733276, + "learning_rate": 1.965251386660575e-06, + "loss": 1.180694818496704, + "step": 1342 + }, + { + "epoch": 2.4, + "grad_norm": 0.4969882071018219, + "learning_rate": 1.9543152027541003e-06, + "loss": 1.0276660919189453, + "step": 1344 + }, + { + "epoch": 2.4035714285714285, + "grad_norm": 0.25297048687934875, + "learning_rate": 1.9434339721159506e-06, + "loss": 1.3130789995193481, + "step": 1346 + }, + { + "epoch": 2.407142857142857, + "grad_norm": 0.3224523961544037, + "learning_rate": 1.932607863390329e-06, + "loss": 1.323912262916565, + "step": 1348 + }, + { + "epoch": 2.4107142857142856, + "grad_norm": 0.3630305528640747, + "learning_rate": 1.9218370443671232e-06, + "loss": 1.420185923576355, + "step": 1350 + }, + { + "epoch": 2.414285714285714, + "grad_norm": 0.33518993854522705, + "learning_rate": 1.91112168197931e-06, + "loss": 1.2631648778915405, + "step": 1352 + }, + { + "epoch": 2.4178571428571427, + "grad_norm": 0.2684813439846039, + "learning_rate": 1.900461942300359e-06, + "loss": 1.2116239070892334, + "step": 1354 + }, + { + "epoch": 2.4214285714285713, + "grad_norm": 1.0438412427902222, + "learning_rate": 1.8898579905416678e-06, + "loss": 0.9407988786697388, + "step": 1356 + }, + { + "epoch": 2.425, + "grad_norm": 0.2759835124015808, + "learning_rate": 1.8793099910499926e-06, + "loss": 1.089248776435852, + "step": 1358 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.2118200808763504, + "learning_rate": 1.8688181073049125e-06, + "loss": 0.9922888278961182, + "step": 1360 + }, + { + "epoch": 2.432142857142857, + "grad_norm": 0.29328909516334534, + "learning_rate": 1.8583825019162843e-06, + "loss": 1.0572453737258911, + "step": 1362 + }, + { + "epoch": 2.435714285714286, + "grad_norm": 0.34180185198783875, + "learning_rate": 1.848003336621729e-06, + "loss": 1.2007834911346436, + "step": 1364 + }, + { + "epoch": 2.439285714285714, + "grad_norm": 2.949885368347168, + "learning_rate": 1.8376807722841231e-06, + "loss": 1.2154308557510376, + "step": 1366 + }, + { + "epoch": 2.442857142857143, + "grad_norm": 0.29990777373313904, + "learning_rate": 1.8274149688891057e-06, + "loss": 1.2820924520492554, + "step": 1368 + }, + { + "epoch": 2.4464285714285716, + "grad_norm": 0.2850666046142578, + "learning_rate": 1.8172060855425986e-06, + "loss": 1.3318397998809814, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 0.4406229555606842, + "learning_rate": 1.8070542804683406e-06, + "loss": 1.490922212600708, + "step": 1372 + }, + { + "epoch": 2.4535714285714287, + "grad_norm": 0.5301911234855652, + "learning_rate": 1.7969597110054343e-06, + "loss": 1.04641854763031, + "step": 1374 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.4790363013744354, + "learning_rate": 1.7869225336059133e-06, + "loss": 1.2003765106201172, + "step": 1376 + }, + { + "epoch": 2.460714285714286, + "grad_norm": 0.3002559542655945, + "learning_rate": 1.7769429038323058e-06, + "loss": 1.1743593215942383, + "step": 1378 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 0.47378110885620117, + "learning_rate": 1.7670209763552342e-06, + "loss": 1.0753716230392456, + "step": 1380 + }, + { + "epoch": 2.467857142857143, + "grad_norm": 0.4303780198097229, + "learning_rate": 1.757156904951014e-06, + "loss": 1.195298194885254, + "step": 1382 + }, + { + "epoch": 2.4714285714285715, + "grad_norm": 0.40849828720092773, + "learning_rate": 1.747350842499271e-06, + "loss": 1.0725401639938354, + "step": 1384 + }, + { + "epoch": 2.475, + "grad_norm": 0.4191647469997406, + "learning_rate": 1.7376029409805708e-06, + "loss": 1.2902517318725586, + "step": 1386 + }, + { + "epoch": 2.4785714285714286, + "grad_norm": 0.5962879657745361, + "learning_rate": 1.7279133514740645e-06, + "loss": 1.2889909744262695, + "step": 1388 + }, + { + "epoch": 2.482142857142857, + "grad_norm": 0.2635829448699951, + "learning_rate": 1.7182822241551434e-06, + "loss": 0.9972074627876282, + "step": 1390 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 0.27476590871810913, + "learning_rate": 1.708709708293121e-06, + "loss": 1.0351589918136597, + "step": 1392 + }, + { + "epoch": 2.4892857142857143, + "grad_norm": 0.3098399341106415, + "learning_rate": 1.6991959522489082e-06, + "loss": 1.030190110206604, + "step": 1394 + }, + { + "epoch": 2.492857142857143, + "grad_norm": 0.37093329429626465, + "learning_rate": 1.6897411034727217e-06, + "loss": 1.3557082414627075, + "step": 1396 + }, + { + "epoch": 2.4964285714285714, + "grad_norm": 0.4083240032196045, + "learning_rate": 1.680345308501795e-06, + "loss": 1.0274466276168823, + "step": 1398 + }, + { + "epoch": 2.5, + "grad_norm": 0.34320634603500366, + "learning_rate": 1.6710087129581086e-06, + "loss": 0.9457365274429321, + "step": 1400 + }, + { + "epoch": 2.5035714285714286, + "grad_norm": 0.5619872808456421, + "learning_rate": 1.6617314615461325e-06, + "loss": 1.3013941049575806, + "step": 1402 + }, + { + "epoch": 2.507142857142857, + "grad_norm": 0.9764664769172668, + "learning_rate": 1.6525136980505835e-06, + "loss": 1.4310553073883057, + "step": 1404 + }, + { + "epoch": 2.5107142857142857, + "grad_norm": 0.548743724822998, + "learning_rate": 1.6433555653341976e-06, + "loss": 1.255396842956543, + "step": 1406 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.8014435172080994, + "learning_rate": 1.6342572053355166e-06, + "loss": 0.830237865447998, + "step": 1408 + }, + { + "epoch": 2.517857142857143, + "grad_norm": 0.21949461102485657, + "learning_rate": 1.625218759066685e-06, + "loss": 0.7343713641166687, + "step": 1410 + }, + { + "epoch": 2.5214285714285714, + "grad_norm": 0.6966763734817505, + "learning_rate": 1.6162403666112653e-06, + "loss": 1.1919779777526855, + "step": 1412 + }, + { + "epoch": 2.525, + "grad_norm": 0.30908581614494324, + "learning_rate": 1.6073221671220692e-06, + "loss": 0.9375178813934326, + "step": 1414 + }, + { + "epoch": 2.5285714285714285, + "grad_norm": 0.34836652874946594, + "learning_rate": 1.5984642988190022e-06, + "loss": 0.8665962219238281, + "step": 1416 + }, + { + "epoch": 2.532142857142857, + "grad_norm": 0.18187429010868073, + "learning_rate": 1.5896668989869151e-06, + "loss": 0.9749317765235901, + "step": 1418 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 0.2711097002029419, + "learning_rate": 1.5809301039734814e-06, + "loss": 1.1920053958892822, + "step": 1420 + }, + { + "epoch": 2.539285714285714, + "grad_norm": 0.35151663422584534, + "learning_rate": 1.5722540491870838e-06, + "loss": 1.1063796281814575, + "step": 1422 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.46157142519950867, + "learning_rate": 1.5636388690947125e-06, + "loss": 0.9042350649833679, + "step": 1424 + }, + { + "epoch": 2.5464285714285713, + "grad_norm": 0.44619572162628174, + "learning_rate": 1.5550846972198851e-06, + "loss": 1.1896483898162842, + "step": 1426 + }, + { + "epoch": 2.55, + "grad_norm": 0.5084243416786194, + "learning_rate": 1.5465916661405734e-06, + "loss": 1.0787028074264526, + "step": 1428 + }, + { + "epoch": 2.553571428571429, + "grad_norm": 0.2909405529499054, + "learning_rate": 1.5381599074871512e-06, + "loss": 1.1317380666732788, + "step": 1430 + }, + { + "epoch": 2.557142857142857, + "grad_norm": 0.7613154053688049, + "learning_rate": 1.5297895519403563e-06, + "loss": 1.3027656078338623, + "step": 1432 + }, + { + "epoch": 2.560714285714286, + "grad_norm": 0.38280853629112244, + "learning_rate": 1.5214807292292567e-06, + "loss": 0.8128288984298706, + "step": 1434 + }, + { + "epoch": 2.564285714285714, + "grad_norm": 0.33587777614593506, + "learning_rate": 1.5132335681292492e-06, + "loss": 1.4057202339172363, + "step": 1436 + }, + { + "epoch": 2.567857142857143, + "grad_norm": 0.4974580407142639, + "learning_rate": 1.5050481964600582e-06, + "loss": 1.2144535779953003, + "step": 1438 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.21717508137226105, + "learning_rate": 1.496924741083759e-06, + "loss": 0.9632461667060852, + "step": 1440 + }, + { + "epoch": 2.575, + "grad_norm": 0.18900008499622345, + "learning_rate": 1.4888633279028068e-06, + "loss": 1.021627426147461, + "step": 1442 + }, + { + "epoch": 2.5785714285714287, + "grad_norm": 0.41346102952957153, + "learning_rate": 1.4808640818580885e-06, + "loss": 1.0733561515808105, + "step": 1444 + }, + { + "epoch": 2.5821428571428573, + "grad_norm": 0.3450411558151245, + "learning_rate": 1.4729271269269823e-06, + "loss": 1.0130958557128906, + "step": 1446 + }, + { + "epoch": 2.585714285714286, + "grad_norm": 0.4527641832828522, + "learning_rate": 1.4650525861214454e-06, + "loss": 0.9112399220466614, + "step": 1448 + }, + { + "epoch": 2.5892857142857144, + "grad_norm": 0.43975669145584106, + "learning_rate": 1.4572405814860954e-06, + "loss": 1.0099694728851318, + "step": 1450 + }, + { + "epoch": 2.592857142857143, + "grad_norm": 2.6724021434783936, + "learning_rate": 1.4494912340963286e-06, + "loss": 0.9879626035690308, + "step": 1452 + }, + { + "epoch": 2.5964285714285715, + "grad_norm": 0.33726853132247925, + "learning_rate": 1.441804664056437e-06, + "loss": 1.3339985609054565, + "step": 1454 + }, + { + "epoch": 2.6, + "grad_norm": 0.5543254017829895, + "learning_rate": 1.4341809904977511e-06, + "loss": 1.0636701583862305, + "step": 1456 + }, + { + "epoch": 2.6035714285714286, + "grad_norm": 0.35016801953315735, + "learning_rate": 1.4266203315767917e-06, + "loss": 1.2073761224746704, + "step": 1458 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 0.37314754724502563, + "learning_rate": 1.4191228044734387e-06, + "loss": 1.067349910736084, + "step": 1460 + }, + { + "epoch": 2.6107142857142858, + "grad_norm": 0.27696406841278076, + "learning_rate": 1.4116885253891142e-06, + "loss": 1.1596084833145142, + "step": 1462 + }, + { + "epoch": 2.6142857142857143, + "grad_norm": 0.23734059929847717, + "learning_rate": 1.4043176095449843e-06, + "loss": 1.130849003791809, + "step": 1464 + }, + { + "epoch": 2.617857142857143, + "grad_norm": 0.451869934797287, + "learning_rate": 1.3970101711801712e-06, + "loss": 1.1519298553466797, + "step": 1466 + }, + { + "epoch": 2.6214285714285714, + "grad_norm": 0.367313027381897, + "learning_rate": 1.3897663235499797e-06, + "loss": 1.081532597541809, + "step": 1468 + }, + { + "epoch": 2.625, + "grad_norm": 1.2766571044921875, + "learning_rate": 1.382586178924149e-06, + "loss": 0.9227726459503174, + "step": 1470 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.6380170583724976, + "learning_rate": 1.3754698485851074e-06, + "loss": 1.3057407140731812, + "step": 1472 + }, + { + "epoch": 2.632142857142857, + "grad_norm": 0.3816126585006714, + "learning_rate": 1.368417442826249e-06, + "loss": 1.1892451047897339, + "step": 1474 + }, + { + "epoch": 2.6357142857142857, + "grad_norm": 0.3007228672504425, + "learning_rate": 1.3614290709502242e-06, + "loss": 1.2595423460006714, + "step": 1476 + }, + { + "epoch": 2.6392857142857142, + "grad_norm": 0.2307678908109665, + "learning_rate": 1.3545048412672459e-06, + "loss": 1.10439932346344, + "step": 1478 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.34183934330940247, + "learning_rate": 1.3476448610934104e-06, + "loss": 1.1247930526733398, + "step": 1480 + }, + { + "epoch": 2.6464285714285714, + "grad_norm": 0.50603187084198, + "learning_rate": 1.3408492367490344e-06, + "loss": 1.308542013168335, + "step": 1482 + }, + { + "epoch": 2.65, + "grad_norm": 0.5772185921669006, + "learning_rate": 1.3341180735570081e-06, + "loss": 1.086531639099121, + "step": 1484 + }, + { + "epoch": 2.6535714285714285, + "grad_norm": 0.2957296073436737, + "learning_rate": 1.3274514758411595e-06, + "loss": 0.9083548784255981, + "step": 1486 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.2126568704843521, + "learning_rate": 1.3208495469246445e-06, + "loss": 1.0338191986083984, + "step": 1488 + }, + { + "epoch": 2.6607142857142856, + "grad_norm": 0.23187443614006042, + "learning_rate": 1.3143123891283354e-06, + "loss": 1.1434146165847778, + "step": 1490 + }, + { + "epoch": 2.664285714285714, + "grad_norm": 0.2083001434803009, + "learning_rate": 1.3078401037692451e-06, + "loss": 1.148645281791687, + "step": 1492 + }, + { + "epoch": 2.6678571428571427, + "grad_norm": 0.24332857131958008, + "learning_rate": 1.3014327911589495e-06, + "loss": 1.0858982801437378, + "step": 1494 + }, + { + "epoch": 2.6714285714285713, + "grad_norm": 0.44840723276138306, + "learning_rate": 1.2950905506020383e-06, + "loss": 0.8910313844680786, + "step": 1496 + }, + { + "epoch": 2.675, + "grad_norm": 0.6759834885597229, + "learning_rate": 1.2888134803945713e-06, + "loss": 1.0723787546157837, + "step": 1498 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 0.3571532964706421, + "learning_rate": 1.2826016778225578e-06, + "loss": 1.1453263759613037, + "step": 1500 + }, + { + "epoch": 2.682142857142857, + "grad_norm": 0.3260257840156555, + "learning_rate": 1.2764552391604468e-06, + "loss": 1.1897282600402832, + "step": 1502 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.21461273729801178, + "learning_rate": 1.2703742596696383e-06, + "loss": 1.114097237586975, + "step": 1504 + }, + { + "epoch": 2.689285714285714, + "grad_norm": 0.39265140891075134, + "learning_rate": 1.2643588335970021e-06, + "loss": 1.2430890798568726, + "step": 1506 + }, + { + "epoch": 2.692857142857143, + "grad_norm": 0.26661592721939087, + "learning_rate": 1.2584090541734216e-06, + "loss": 1.2044790983200073, + "step": 1508 + }, + { + "epoch": 2.696428571428571, + "grad_norm": 0.4279651641845703, + "learning_rate": 1.252525013612346e-06, + "loss": 1.1148457527160645, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 0.26563382148742676, + "learning_rate": 1.2467068031083623e-06, + "loss": 1.151499629020691, + "step": 1512 + }, + { + "epoch": 2.7035714285714287, + "grad_norm": 0.28036361932754517, + "learning_rate": 1.2409545128357806e-06, + "loss": 1.112971544265747, + "step": 1514 + }, + { + "epoch": 2.7071428571428573, + "grad_norm": 0.3321837782859802, + "learning_rate": 1.235268231947238e-06, + "loss": 0.9679718613624573, + "step": 1516 + }, + { + "epoch": 2.710714285714286, + "grad_norm": 0.7659473419189453, + "learning_rate": 1.229648048572317e-06, + "loss": 1.18712317943573, + "step": 1518 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.5400887131690979, + "learning_rate": 1.2240940498161797e-06, + "loss": 1.0840147733688354, + "step": 1520 + }, + { + "epoch": 2.717857142857143, + "grad_norm": 0.3426344096660614, + "learning_rate": 1.2186063217582144e-06, + "loss": 1.1307204961776733, + "step": 1522 + }, + { + "epoch": 2.7214285714285715, + "grad_norm": 0.39970487356185913, + "learning_rate": 1.213184949450706e-06, + "loss": 1.1921186447143555, + "step": 1524 + }, + { + "epoch": 2.725, + "grad_norm": 0.31394848227500916, + "learning_rate": 1.2078300169175158e-06, + "loss": 1.1872678995132446, + "step": 1526 + }, + { + "epoch": 2.7285714285714286, + "grad_norm": 0.7688894271850586, + "learning_rate": 1.20254160715278e-06, + "loss": 1.1403369903564453, + "step": 1528 + }, + { + "epoch": 2.732142857142857, + "grad_norm": 0.3478771448135376, + "learning_rate": 1.1973198021196207e-06, + "loss": 1.0353933572769165, + "step": 1530 + }, + { + "epoch": 2.7357142857142858, + "grad_norm": 1.663916826248169, + "learning_rate": 1.1921646827488807e-06, + "loss": 1.1801190376281738, + "step": 1532 + }, + { + "epoch": 2.7392857142857143, + "grad_norm": 0.9486533999443054, + "learning_rate": 1.187076328937863e-06, + "loss": 1.118172287940979, + "step": 1534 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.3661729693412781, + "learning_rate": 1.182054819549098e-06, + "loss": 1.166612982749939, + "step": 1536 + }, + { + "epoch": 2.7464285714285714, + "grad_norm": 0.273942768573761, + "learning_rate": 1.1771002324091183e-06, + "loss": 1.219356656074524, + "step": 1538 + }, + { + "epoch": 2.75, + "grad_norm": 0.2943507730960846, + "learning_rate": 1.172212644307252e-06, + "loss": 1.2092581987380981, + "step": 1540 + }, + { + "epoch": 2.7535714285714286, + "grad_norm": 0.22103095054626465, + "learning_rate": 1.1673921309944356e-06, + "loss": 1.1635977029800415, + "step": 1542 + }, + { + "epoch": 2.757142857142857, + "grad_norm": 0.27992480993270874, + "learning_rate": 1.1626387671820363e-06, + "loss": 1.1578980684280396, + "step": 1544 + }, + { + "epoch": 2.7607142857142857, + "grad_norm": 0.1873656064271927, + "learning_rate": 1.1579526265406972e-06, + "loss": 1.1813486814498901, + "step": 1546 + }, + { + "epoch": 2.7642857142857142, + "grad_norm": 0.3528795838356018, + "learning_rate": 1.1533337816991932e-06, + "loss": 1.1933683156967163, + "step": 1548 + }, + { + "epoch": 2.767857142857143, + "grad_norm": 0.31167811155319214, + "learning_rate": 1.1487823042433063e-06, + "loss": 1.1475173234939575, + "step": 1550 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 1.7408783435821533, + "learning_rate": 1.1442982647147167e-06, + "loss": 1.148131251335144, + "step": 1552 + }, + { + "epoch": 2.775, + "grad_norm": 0.3031138777732849, + "learning_rate": 1.1398817326099094e-06, + "loss": 1.0997506380081177, + "step": 1554 + }, + { + "epoch": 2.7785714285714285, + "grad_norm": 0.21349631249904633, + "learning_rate": 1.1355327763790943e-06, + "loss": 1.1433438062667847, + "step": 1556 + }, + { + "epoch": 2.782142857142857, + "grad_norm": 0.16756878793239594, + "learning_rate": 1.1312514634251492e-06, + "loss": 1.0694825649261475, + "step": 1558 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.19285623729228973, + "learning_rate": 1.127037860102575e-06, + "loss": 1.1415499448776245, + "step": 1560 + }, + { + "epoch": 2.789285714285714, + "grad_norm": 0.3282257616519928, + "learning_rate": 1.1228920317164625e-06, + "loss": 1.1128462553024292, + "step": 1562 + }, + { + "epoch": 2.7928571428571427, + "grad_norm": 0.20754434168338776, + "learning_rate": 1.118814042521486e-06, + "loss": 1.1504778861999512, + "step": 1564 + }, + { + "epoch": 2.7964285714285713, + "grad_norm": 0.22546795010566711, + "learning_rate": 1.1148039557209057e-06, + "loss": 1.1107934713363647, + "step": 1566 + }, + { + "epoch": 2.8, + "grad_norm": 0.16394157707691193, + "learning_rate": 1.1108618334655843e-06, + "loss": 1.0830016136169434, + "step": 1568 + }, + { + "epoch": 2.803571428571429, + "grad_norm": 0.1953999102115631, + "learning_rate": 1.1069877368530303e-06, + "loss": 1.16024649143219, + "step": 1570 + }, + { + "epoch": 2.807142857142857, + "grad_norm": 0.211993008852005, + "learning_rate": 1.1031817259264454e-06, + "loss": 1.1383813619613647, + "step": 1572 + }, + { + "epoch": 2.810714285714286, + "grad_norm": 0.1844896823167801, + "learning_rate": 1.0994438596737971e-06, + "loss": 1.0519864559173584, + "step": 1574 + }, + { + "epoch": 2.814285714285714, + "grad_norm": 0.4553788900375366, + "learning_rate": 1.0957741960269049e-06, + "loss": 1.1024482250213623, + "step": 1576 + }, + { + "epoch": 2.817857142857143, + "grad_norm": 0.2758769989013672, + "learning_rate": 1.092172791860539e-06, + "loss": 1.0607486963272095, + "step": 1578 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 0.28464648127555847, + "learning_rate": 1.0886397029915415e-06, + "loss": 1.0878740549087524, + "step": 1580 + }, + { + "epoch": 2.825, + "grad_norm": 0.2519758641719818, + "learning_rate": 1.0851749841779609e-06, + "loss": 1.0692694187164307, + "step": 1582 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 0.20021863281726837, + "learning_rate": 1.0817786891182041e-06, + "loss": 1.0892566442489624, + "step": 1584 + }, + { + "epoch": 2.8321428571428573, + "grad_norm": 0.21085211634635925, + "learning_rate": 1.0784508704502029e-06, + "loss": 1.0911756753921509, + "step": 1586 + }, + { + "epoch": 2.835714285714286, + "grad_norm": 0.2599065899848938, + "learning_rate": 1.0751915797505986e-06, + "loss": 1.0842504501342773, + "step": 1588 + }, + { + "epoch": 2.8392857142857144, + "grad_norm": 0.23683688044548035, + "learning_rate": 1.0720008675339403e-06, + "loss": 1.0852082967758179, + "step": 1590 + }, + { + "epoch": 2.842857142857143, + "grad_norm": 0.19538818299770355, + "learning_rate": 1.0688787832519085e-06, + "loss": 1.1298590898513794, + "step": 1592 + }, + { + "epoch": 2.8464285714285715, + "grad_norm": 0.5865882039070129, + "learning_rate": 1.0658253752925417e-06, + "loss": 1.122971773147583, + "step": 1594 + }, + { + "epoch": 2.85, + "grad_norm": 0.3269581198692322, + "learning_rate": 1.062840690979491e-06, + "loss": 1.109829068183899, + "step": 1596 + }, + { + "epoch": 2.8535714285714286, + "grad_norm": 0.5810469388961792, + "learning_rate": 1.0599247765712832e-06, + "loss": 1.1492294073104858, + "step": 1598 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.2330639660358429, + "learning_rate": 1.0570776772606056e-06, + "loss": 1.123344898223877, + "step": 1600 + }, + { + "epoch": 2.8607142857142858, + "grad_norm": 0.2107606828212738, + "learning_rate": 1.0542994371736076e-06, + "loss": 1.0889390707015991, + "step": 1602 + }, + { + "epoch": 2.8642857142857143, + "grad_norm": 0.2753591239452362, + "learning_rate": 1.0515900993692128e-06, + "loss": 1.1300913095474243, + "step": 1604 + }, + { + "epoch": 2.867857142857143, + "grad_norm": 0.27015575766563416, + "learning_rate": 1.048949705838454e-06, + "loss": 1.0982666015625, + "step": 1606 + }, + { + "epoch": 2.8714285714285714, + "grad_norm": 0.1620846688747406, + "learning_rate": 1.0463782975038226e-06, + "loss": 1.1166629791259766, + "step": 1608 + }, + { + "epoch": 2.875, + "grad_norm": 0.21408753097057343, + "learning_rate": 1.0438759142186336e-06, + "loss": 1.127457857131958, + "step": 1610 + }, + { + "epoch": 2.8785714285714286, + "grad_norm": 0.26070085167884827, + "learning_rate": 1.0414425947664075e-06, + "loss": 1.1438779830932617, + "step": 1612 + }, + { + "epoch": 2.882142857142857, + "grad_norm": 0.1973988115787506, + "learning_rate": 1.0390783768602694e-06, + "loss": 1.1256788969039917, + "step": 1614 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 0.1865663081407547, + "learning_rate": 1.0367832971423664e-06, + "loss": 1.0647690296173096, + "step": 1616 + }, + { + "epoch": 2.8892857142857142, + "grad_norm": 0.40141281485557556, + "learning_rate": 1.0345573911832976e-06, + "loss": 1.0978182554244995, + "step": 1618 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 0.19470001757144928, + "learning_rate": 1.0324006934815623e-06, + "loss": 1.1264913082122803, + "step": 1620 + }, + { + "epoch": 2.8964285714285714, + "grad_norm": 0.1923714429140091, + "learning_rate": 1.0303132374630276e-06, + "loss": 1.1599576473236084, + "step": 1622 + }, + { + "epoch": 2.9, + "grad_norm": 0.2873956561088562, + "learning_rate": 1.0282950554804084e-06, + "loss": 1.1344720125198364, + "step": 1624 + }, + { + "epoch": 2.9035714285714285, + "grad_norm": 0.2792896330356598, + "learning_rate": 1.0263461788127682e-06, + "loss": 1.1077191829681396, + "step": 1626 + }, + { + "epoch": 2.907142857142857, + "grad_norm": 0.17874673008918762, + "learning_rate": 1.0244666376650307e-06, + "loss": 1.0769405364990234, + "step": 1628 + }, + { + "epoch": 2.9107142857142856, + "grad_norm": 0.23230457305908203, + "learning_rate": 1.0226564611675146e-06, + "loss": 1.1149848699569702, + "step": 1630 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.2538415789604187, + "learning_rate": 1.020915677375483e-06, + "loss": 1.1285921335220337, + "step": 1632 + }, + { + "epoch": 2.9178571428571427, + "grad_norm": 0.18281330168247223, + "learning_rate": 1.0192443132687039e-06, + "loss": 1.0885471105575562, + "step": 1634 + }, + { + "epoch": 2.9214285714285713, + "grad_norm": 0.27069422602653503, + "learning_rate": 1.0176423947510377e-06, + "loss": 1.1098750829696655, + "step": 1636 + }, + { + "epoch": 2.925, + "grad_norm": 0.24785873293876648, + "learning_rate": 1.016109946650032e-06, + "loss": 1.1053394079208374, + "step": 1638 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.2786495089530945, + "learning_rate": 1.014646992716537e-06, + "loss": 1.1500390768051147, + "step": 1640 + }, + { + "epoch": 2.932142857142857, + "grad_norm": 0.3538748621940613, + "learning_rate": 1.01325355562434e-06, + "loss": 1.1664944887161255, + "step": 1642 + }, + { + "epoch": 2.935714285714286, + "grad_norm": 0.3729296326637268, + "learning_rate": 1.0119296569698112e-06, + "loss": 1.1281384229660034, + "step": 1644 + }, + { + "epoch": 2.939285714285714, + "grad_norm": 0.21035878360271454, + "learning_rate": 1.01067531727157e-06, + "loss": 1.1451420783996582, + "step": 1646 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 0.3253045380115509, + "learning_rate": 1.0094905559701678e-06, + "loss": 1.1268796920776367, + "step": 1648 + }, + { + "epoch": 2.946428571428571, + "grad_norm": 0.20938168466091156, + "learning_rate": 1.0083753914277859e-06, + "loss": 1.0814552307128906, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 0.24861246347427368, + "learning_rate": 1.007329840927949e-06, + "loss": 1.1016547679901123, + "step": 1652 + }, + { + "epoch": 2.9535714285714287, + "grad_norm": 0.26715606451034546, + "learning_rate": 1.006353920675263e-06, + "loss": 1.1287412643432617, + "step": 1654 + }, + { + "epoch": 2.9571428571428573, + "grad_norm": 0.20948819816112518, + "learning_rate": 1.0054476457951567e-06, + "loss": 1.11174476146698, + "step": 1656 + }, + { + "epoch": 2.960714285714286, + "grad_norm": 0.5076990127563477, + "learning_rate": 1.0046110303336519e-06, + "loss": 1.112143874168396, + "step": 1658 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 0.5603309273719788, + "learning_rate": 1.0038440872571456e-06, + "loss": 1.1545910835266113, + "step": 1660 + }, + { + "epoch": 2.967857142857143, + "grad_norm": 0.23968827724456787, + "learning_rate": 1.0031468284522063e-06, + "loss": 1.1435242891311646, + "step": 1662 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.26473504304885864, + "learning_rate": 1.0025192647253939e-06, + "loss": 1.1580908298492432, + "step": 1664 + }, + { + "epoch": 2.975, + "grad_norm": 0.6800065636634827, + "learning_rate": 1.0019614058030874e-06, + "loss": 1.1012563705444336, + "step": 1666 + }, + { + "epoch": 2.9785714285714286, + "grad_norm": 0.23044763505458832, + "learning_rate": 1.0014732603313375e-06, + "loss": 1.1186460256576538, + "step": 1668 + }, + { + "epoch": 2.982142857142857, + "grad_norm": 0.21679583191871643, + "learning_rate": 1.0010548358757327e-06, + "loss": 1.1382079124450684, + "step": 1670 + }, + { + "epoch": 2.9857142857142858, + "grad_norm": 0.4521788954734802, + "learning_rate": 1.0007061389212794e-06, + "loss": 1.182320475578308, + "step": 1672 + }, + { + "epoch": 2.9892857142857143, + "grad_norm": 0.24779334664344788, + "learning_rate": 1.0004271748723043e-06, + "loss": 1.2086482048034668, + "step": 1674 + }, + { + "epoch": 2.992857142857143, + "grad_norm": 0.5126925706863403, + "learning_rate": 1.0002179480523687e-06, + "loss": 0.834091067314148, + "step": 1676 + }, + { + "epoch": 2.9964285714285714, + "grad_norm": 0.3477499783039093, + "learning_rate": 1.0000784617042023e-06, + "loss": 0.722780168056488, + "step": 1678 + }, + { + "epoch": 3.0, + "grad_norm": 0.47854718565940857, + "learning_rate": 1.0000087179896533e-06, + "loss": 0.7972838282585144, + "step": 1680 + }, + { + "epoch": 3.0, + "step": 1680, + "total_flos": 2.510120369642275e+18, + "train_loss": 1.2744095386493774, + "train_runtime": 14979.881, + "train_samples_per_second": 1.794, + "train_steps_per_second": 0.112 + } + ], + "logging_steps": 2, + "max_steps": 1680, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.510120369642275e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}